avtomatika-worker 1.0b2-py3-none-any.whl → 1.0b4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,4 @@
-"""A Python SDK for creating workers for the Py-Orchestrator."""
+"""A Python SDK for creating workers for the Avtomatika Orchestrator."""
 
 from importlib.metadata import PackageNotFoundError, version
 
@@ -4,13 +4,15 @@ from os import getenv
 from typing import Any
 from uuid import uuid4
 
+from rxon.validators import validate_identifier
+
 
 class WorkerConfig:
     """A class for centralized management of worker configuration.
     Reads parameters from environment variables and provides default values.
     """
 
-    def __init__(self):
+    def __init__(self) -> None:
         # --- Basic worker information ---
         self.WORKER_ID: str = getenv("WORKER_ID", f"worker-{uuid4()}")
         self.WORKER_TYPE: str = getenv("WORKER_TYPE", "generic-cpu-worker")
@@ -29,6 +31,9 @@ class WorkerConfig:
             "WORKER_INDIVIDUAL_TOKEN",
             getenv("WORKER_TOKEN", "your-secret-worker-token"),
         )
+        self.TLS_CA_PATH: str | None = getenv("TLS_CA_PATH")
+        self.TLS_CERT_PATH: str | None = getenv("TLS_CERT_PATH")
+        self.TLS_KEY_PATH: str | None = getenv("TLS_KEY_PATH")
 
         # --- Resources and performance ---
         self.COST_PER_SKILL: dict[str, float] = self._load_json_from_env("COST_PER_SKILL", default={})
@@ -54,6 +59,7 @@ class WorkerConfig:
         self.S3_ACCESS_KEY: str | None = getenv("S3_ACCESS_KEY")
         self.S3_SECRET_KEY: str | None = getenv("S3_SECRET_KEY")
         self.S3_DEFAULT_BUCKET: str = getenv("S3_DEFAULT_BUCKET", "avtomatika-payloads")
+        self.S3_REGION: str = getenv("S3_REGION", "us-east-1")
 
         # --- Tuning parameters ---
         self.HEARTBEAT_INTERVAL: float = float(getenv("HEARTBEAT_INTERVAL", "15"))
@@ -70,6 +76,19 @@ class WorkerConfig:
         self.ENABLE_WEBSOCKETS: bool = getenv("WORKER_ENABLE_WEBSOCKETS", "false").lower() == "true"
         self.MULTI_ORCHESTRATOR_MODE: str = getenv("MULTI_ORCHESTRATOR_MODE", "FAILOVER")
 
+    def validate(self) -> None:
+        """Validates critical configuration parameters."""
+        validate_identifier(self.WORKER_ID, "WORKER_ID")
+        if self.WORKER_TOKEN == "your-secret-worker-token":
+            print("Warning: WORKER_TOKEN is set to the default value. Tasks might fail authentication.")
+
+        if not self.ORCHESTRATORS:
+            raise ValueError("No orchestrators configured.")
+
+        for o in self.ORCHESTRATORS:
+            if not o.get("url"):
+                raise ValueError("Orchestrator configuration missing URL.")
+
     def _get_orchestrators_config(self) -> list[dict[str, Any]]:
         """
         Loads orchestrator configuration from the ORCHESTRATORS_CONFIG environment variable.
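The configuration diff above adds TLS material (TLS_CA_PATH, TLS_CERT_PATH, TLS_KEY_PATH), an S3_REGION setting, and a validate() method that warns on the default token and rejects missing or URL-less orchestrator entries. Below is a minimal sketch of how a deployment might exercise these, assuming the package is installed and that ORCHESTRATORS_CONFIG accepts a JSON list of objects with at least a "url" key (the full schema is not shown in this diff); the paths and identifiers are illustrative only.

```python
# Sketch: exercising the new configuration surface. All values are placeholders.
from json import dumps
from os import environ

from avtomatika_worker.config import WorkerConfig

environ["WORKER_ID"] = "gpu-worker-01"
environ["WORKER_INDIVIDUAL_TOKEN"] = "replace-me"  # avoids the default-token warning
environ["ORCHESTRATORS_CONFIG"] = dumps([{"url": "https://orchestrator.example.com"}])

# New in 1.0b4: mutual-TLS material and an explicit S3 region.
environ["TLS_CA_PATH"] = "/etc/worker/ca.pem"
environ["TLS_CERT_PATH"] = "/etc/worker/client.pem"
environ["TLS_KEY_PATH"] = "/etc/worker/client.key"
environ["S3_REGION"] = "eu-central-1"

config = WorkerConfig()
config.validate()  # raises ValueError if no orchestrator is configured or an entry lacks "url"
print(config.WORKER_ID, config.S3_REGION, config.TLS_CA_PATH)
```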
avtomatika_worker/s3.py CHANGED
@@ -1,141 +1,238 @@
-from asyncio import gather, to_thread
+from asyncio import Semaphore, gather, to_thread
+from logging import getLogger
 from os import walk
 from os.path import basename, dirname, join, relpath
 from shutil import rmtree
-from typing import Any
-from urllib.parse import urlparse
+from typing import Any, cast
 
-from aioboto3 import Session
+from aiofiles import open as aio_open
 from aiofiles.os import makedirs
-from aiofiles.ospath import exists, isdir
-from botocore.client import Config
+from aiofiles.ospath import exists, getsize, isdir
+from obstore import get as obstore_get
+from obstore import list as obstore_list
+from obstore import put as obstore_put
+from obstore.store import S3Store
+from rxon.blob import parse_uri
+from rxon.exceptions import IntegrityError
+from rxon.models import FileMetadata
 
 from .config import WorkerConfig
 
+logger = getLogger(__name__)
+
+# Limit concurrent S3 operations to avoid "Too many open files"
+MAX_S3_CONCURRENCY = 50
+
 
 class S3Manager:
-    """Handles S3 payload offloading."""
+    """Handles S3 payload offloading using obstore (high-performance async S3 client)."""
 
     def __init__(self, config: WorkerConfig):
         self._config = config
-        self._session = Session()
+        self._stores: dict[str, S3Store] = {}
+        self._semaphore = Semaphore(MAX_S3_CONCURRENCY)
+
+    def _get_store(self, bucket_name: str) -> S3Store:
+        """Creates or returns a cached S3Store for a specific bucket."""
+        if bucket_name in self._stores:
+            return self._stores[bucket_name]
 
-    def _get_client_args(self) -> dict[str, Any]:
-        """Returns standard arguments for S3 client creation."""
-        return {
-            "service_name": "s3",
-            "endpoint_url": self._config.S3_ENDPOINT_URL,
+        config_kwargs = {
             "aws_access_key_id": self._config.S3_ACCESS_KEY,
             "aws_secret_access_key": self._config.S3_SECRET_KEY,
-            "config": Config(signature_version="s3v4"),
+            "region": "us-east-1",  # Default region if not specified, required by some clients
         }
 
-    async def cleanup(self, task_id: str):
+        if self._config.S3_ENDPOINT_URL:
+            config_kwargs["endpoint"] = self._config.S3_ENDPOINT_URL
+            if self._config.S3_ENDPOINT_URL.startswith("http://"):
+                config_kwargs["allow_http"] = "true"
+
+        # Filter out None values
+        config_kwargs = {k: v for k, v in config_kwargs.items() if v is not None}
+
+        try:
+            store = S3Store(bucket_name, **config_kwargs)
+            self._stores[bucket_name] = store
+            return store
+        except Exception as e:
+            logger.error(f"Failed to create S3Store for bucket {bucket_name}: {e}")
+            raise
+
+    async def cleanup(self, task_id: str) -> None:
         """Removes the task-specific payload directory."""
         task_dir = join(self._config.TASK_FILES_DIR, task_id)
         if await exists(task_dir):
             await to_thread(lambda: rmtree(task_dir, ignore_errors=True))
 
-    async def _process_s3_uri(self, uri: str, task_id: str) -> str:
-        """Downloads a file or a folder (if uri ends with /) from S3 and returns the local path."""
-        parsed_url = urlparse(uri)
-        bucket_name = parsed_url.netloc
-        object_key = parsed_url.path.lstrip("/")
+    async def _process_s3_uri(self, uri: str, task_id: str, verify_meta: FileMetadata | None = None) -> str:
+        """Downloads a file or a folder from S3 and returns the local path.
+        If verify_meta is provided, performs integrity checks.
+        """
+        try:
+            bucket_name, object_key, is_directory = parse_uri(uri)
+            store = self._get_store(bucket_name)
+
+            # Use task-specific directory for isolation
+            local_dir_root = join(self._config.TASK_FILES_DIR, task_id)
+            await makedirs(local_dir_root, exist_ok=True)
 
-        # Use task-specific directory for isolation
-        local_dir_root = join(self._config.TASK_FILES_DIR, task_id)
-        await makedirs(local_dir_root, exist_ok=True)
+            logger.info(f"Starting download from S3: {uri}")
 
-        async with self._session.client(**self._get_client_args()) as s3:
             # Handle folder download (prefix)
-            if uri.endswith("/"):
+            if is_directory:
                 folder_name = object_key.rstrip("/").split("/")[-1]
                 local_folder_path = join(local_dir_root, folder_name)
-
-                paginator = s3.get_paginator("list_objects_v2")
-                tasks = []
-                async for page in paginator.paginate(Bucket=bucket_name, Prefix=object_key):
-                    for obj in page.get("Contents", []):
-                        key = obj["Key"]
-                        if key.endswith("/"):
-                            continue
-
-                        # Calculate relative path inside the folder
-                        rel_path = key[len(object_key) :]
-                        local_file_path = join(local_folder_path, rel_path)
-
-                        await makedirs(dirname(local_file_path), exist_ok=True)
-                        tasks.append(s3.download_file(bucket_name, key, local_file_path))
-
-                if tasks:
-                    await gather(*tasks)
+                files_to_download = []
+
+                async for obj in obstore_list(store, prefix=object_key):
+                    key = obj.key
+                    if key.endswith("/"):
+                        continue
+                    rel_path = key[len(object_key) :]
+                    local_file_path = join(local_folder_path, rel_path)
+                    await makedirs(dirname(local_file_path), exist_ok=True)
+                    files_to_download.append((key, local_file_path))
+
+                async def _download_file(key: str, path: str) -> None:
+                    async with self._semaphore:
+                        result = await obstore_get(store, key)
+                        async with aio_open(path, "wb") as f:
+                            async for chunk in result.stream():
+                                await f.write(chunk)
+
+                if files_to_download:
+                    await gather(*[_download_file(k, p) for k, p in files_to_download])
+
+                logger.info(f"Successfully downloaded folder from S3: {uri} ({len(files_to_download)} files)")
                 return local_folder_path
 
             # Handle single file download
             local_path = join(local_dir_root, basename(object_key))
-            await s3.download_file(bucket_name, object_key, local_path)
+
+            result = await obstore_get(store, object_key)
+
+            # Integrity check before download
+            if verify_meta:
+                if verify_meta.size != result.meta.size:
+                    raise IntegrityError(
+                        f"Size mismatch for {uri}: expected {verify_meta.size}, got {result.meta.size}"
+                    )
+                if verify_meta.etag and result.meta.e_tag:
+                    actual_etag = result.meta.e_tag.strip('"')
+                    expected_etag = verify_meta.etag.strip('"')
+                    if actual_etag != expected_etag:
+                        raise IntegrityError(f"ETag mismatch for {uri}: expected {expected_etag}, got {actual_etag}")
+
+            async with aio_open(local_path, "wb") as f:
+                async for chunk in result.stream():
+                    await f.write(chunk)
+
+            logger.info(f"Successfully downloaded file from S3: {uri} -> {local_path}")
             return local_path
 
-    async def _upload_to_s3(self, local_path: str) -> str:
-        """Uploads a file or a folder to S3 and returns the S3 URI."""
+        except Exception as e:
+            # Catching generic Exception because obstore might raise different errors.
+            logger.exception(f"Error during download of {uri}: {e}")
+            raise
+
+    async def _upload_to_s3(self, local_path: str) -> FileMetadata:
+        """Uploads a file or a folder to S3 and returns FileMetadata."""
         bucket_name = self._config.S3_DEFAULT_BUCKET
+        store = self._get_store(bucket_name)
 
-        async with self._session.client(**self._get_client_args()) as s3:
+        logger.info(f"Starting upload to S3 from local path: {local_path}")
+
+        try:
             # Handle folder upload
             if await isdir(local_path):
                 folder_name = basename(local_path.rstrip("/"))
                 s3_prefix = f"{folder_name}/"
-                tasks = []
 
-                # Use to_thread to avoid blocking event loop during file walk
                 def _get_files_to_upload():
+                    from os.path import getsize as std_getsize
+
                     files_to_upload = []
+                    total_size = 0
                     for root, _, files in walk(local_path):
                         for file in files:
                             f_path = join(root, file)
                             rel = relpath(f_path, local_path)
+                            total_size += std_getsize(f_path)
                             files_to_upload.append((f_path, f"{s3_prefix}{rel}"))
-                    return files_to_upload
+                    return files_to_upload, total_size
 
-                files_list = await to_thread(_get_files_to_upload)
+                files_list, total_size = await to_thread(_get_files_to_upload)
 
-                for full_path, key in files_list:
-                    tasks.append(s3.upload_file(full_path, bucket_name, key))
+                async def _upload_file(path: str, key: str) -> None:
+                    async with self._semaphore:
+                        with open(path, "rb") as f:
+                            await obstore_put(store, key, f)
 
-                if tasks:
-                    await gather(*tasks)
+                if files_list:
+                    await gather(*[_upload_file(f, k) for f, k in files_list])
 
-                return f"s3://{bucket_name}/{s3_prefix}"
+                s3_uri = f"s3://{bucket_name}/{s3_prefix}"
+                logger.info(f"Successfully uploaded folder to S3: {local_path} -> {s3_uri} ({len(files_list)} files)")
+                return FileMetadata(uri=s3_uri, size=total_size)
 
             # Handle single file upload
             object_key = basename(local_path)
-            await s3.upload_file(local_path, bucket_name, object_key)
-            return f"s3://{bucket_name}/{object_key}"
-
-    async def process_params(self, params: dict[str, Any], task_id: str) -> dict[str, Any]:
-        """Recursively searches for S3 URIs in params and downloads the files."""
+            file_size = await getsize(local_path)
+            with open(local_path, "rb") as f:
+                put_result = await obstore_put(store, object_key, f)
+
+            s3_uri = f"s3://{bucket_name}/{object_key}"
+            etag = put_result.e_tag.strip('"') if put_result.e_tag else None
+            logger.info(f"Successfully uploaded file to S3: {local_path} -> {s3_uri} (ETag: {etag})")
+            return FileMetadata(uri=s3_uri, size=file_size, etag=etag)
+
+        except Exception as e:
+            logger.exception(f"Error during upload of {local_path}: {e}")
+            raise
+
+    async def process_params(
+        self, params: dict[str, Any], task_id: str, metadata: dict[str, FileMetadata] | None = None
+    ) -> dict[str, Any]:
+        """Recursively searches for S3 URIs in params and downloads the files.
+        Uses metadata for integrity verification if available.
+        """
        if not self._config.S3_ENDPOINT_URL:
            return params
 
-        async def _process(item: Any) -> Any:
+        async def _process(item: Any, key_path: str = "") -> Any:
            if isinstance(item, str) and item.startswith("s3://"):
-                return await self._process_s3_uri(item, task_id)
+                verify_meta = metadata.get(key_path) if metadata else None
+                return await self._process_s3_uri(item, task_id, verify_meta=verify_meta)
            if isinstance(item, dict):
-                return {k: await _process(v) for k, v in item.items()}
-            return [await _process(i) for i in item] if isinstance(item, list) else item
+                return {k: await _process(v, f"{key_path}.{k}" if key_path else k) for k, v in item.items()}
+            if isinstance(item, list):
+                return [await _process(v, f"{key_path}[{i}]") for i, v in enumerate(item)]
+            return item
 
-        return await _process(params)
+        return cast(dict[str, Any], await _process(params))
 
-    async def process_result(self, result: dict[str, Any]) -> dict[str, Any]:
-        """Recursively searches for local file paths in the result and uploads them to S3."""
+    async def process_result(self, result: dict[str, Any]) -> tuple[dict[str, Any], dict[str, FileMetadata]]:
+        """Recursively searches for local file paths in the result and uploads them to S3.
+        Returns a tuple of (updated_result, metadata_map).
+        """
        if not self._config.S3_ENDPOINT_URL:
-            return result
+            return result, {}
+
+        metadata_map = {}
 
-        async def _process(item: Any) -> Any:
+        async def _process(item: Any, key_path: str = "") -> Any:
            if isinstance(item, str) and item.startswith(self._config.TASK_FILES_DIR):
-                return await self._upload_to_s3(item) if await exists(item) else item
+                if await exists(item):
+                    meta = await self._upload_to_s3(item)
+                    metadata_map[key_path] = meta
+                    return meta.uri
+                return item
            if isinstance(item, dict):
-                return {k: await _process(v) for k, v in item.items()}
-            return [await _process(i) for i in item] if isinstance(item, list) else item
+                return {k: await _process(v, f"{key_path}.{k}" if key_path else k) for k, v in item.items()}
+            if isinstance(item, list):
+                return [await _process(v, f"{key_path}[{i}]") for i, v in enumerate(item)]
+            return item
 
-        return await _process(result)
+        updated_result = cast(dict[str, Any], await _process(result))
+        return updated_result, metadata_map
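The s3.py rewrite swaps aioboto3 for obstore, caches one S3Store per bucket, throttles transfers with a semaphore, and threads FileMetadata (size and ETag) through process_result and process_params so downloads can be integrity-checked. Below is a rough usage sketch of that round trip, assuming a reachable S3-compatible endpoint; the credentials, bucket layout, and the S3_ENDPOINT_URL variable name (the diff only shows the matching config attribute) are assumptions.

```python
# Sketch: upload a task result, then pull it back with integrity metadata.
from asyncio import run
from json import dumps
from os import environ, makedirs
from os.path import join

from avtomatika_worker.config import WorkerConfig
from avtomatika_worker.s3 import S3Manager

# Placeholder local MinIO-style setup; the env var name mirrors the config attribute.
environ["S3_ENDPOINT_URL"] = "http://localhost:9000"  # http:// enables allow_http
environ["S3_ACCESS_KEY"] = "minioadmin"
environ["S3_SECRET_KEY"] = "minioadmin"


async def main() -> None:
    config = WorkerConfig()
    s3 = S3Manager(config)

    # Pretend a handler produced a file inside its task workspace.
    task_dir = join(config.TASK_FILES_DIR, "task-123")
    makedirs(task_dir, exist_ok=True)
    report_path = join(task_dir, "report.json")
    with open(report_path, "w") as f:
        f.write(dumps({"status": "ok"}))

    # Local paths under TASK_FILES_DIR are uploaded; the new return value also
    # carries a metadata map keyed by JSON path ("report"), with size and ETag.
    uploaded, metadata = await s3.process_result({"report": report_path})

    # The same metadata drives the size/ETag checks when the s3:// URIs are
    # downloaded again for another task.
    local_params = await s3.process_params(uploaded, "task-456", metadata=metadata)
    print(uploaded, metadata, local_params)

    await s3.cleanup("task-123")
    await s3.cleanup("task-456")


run(main())
```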
@@ -1,11 +1,17 @@
 from contextlib import asynccontextmanager
+from json import dumps, loads
 from os.path import dirname, join
-from typing import AsyncGenerator
+from typing import TYPE_CHECKING, Any, AsyncGenerator
 
 from aiofiles import open as aiopen
 from aiofiles.os import listdir, makedirs
 from aiofiles.ospath import exists as aio_exists
 
+if TYPE_CHECKING:
+    from rxon.models import FileMetadata
+
+    from .s3 import S3Manager
+
 
 class TaskFiles:
     """
@@ -14,12 +20,13 @@ class TaskFiles:
     within an isolated workspace for each task.
     """
 
-    def __init__(self, task_dir: str):
+    def __init__(self, task_dir: str, s3_manager: "S3Manager" = None):
         """
         Initializes TaskFiles with a specific task directory.
         The directory is not created until needed.
         """
         self._task_dir = task_dir
+        self._s3_manager = s3_manager
 
     async def get_root(self) -> str:
         """
@@ -37,6 +44,24 @@ class TaskFiles:
         root = await self.get_root()
         return join(root, filename)
 
+    def get_root_sync(self) -> str:
+        """
+        Synchronously returns the root directory for the task.
+        Creates the directory on disk if it doesn't exist.
+        """
+        import os
+
+        os.makedirs(self._task_dir, exist_ok=True)
+        return self._task_dir
+
+    def path_to_sync(self, filename: str) -> str:
+        """
+        Synchronously returns an absolute path for a file within the task directory.
+        Guarantees that the task root directory exists.
+        """
+        root = self.get_root_sync()
+        return join(root, filename)
+
     @asynccontextmanager
     async def open(self, filename: str, mode: str = "r") -> AsyncGenerator:
         """
@@ -79,6 +104,39 @@ class TaskFiles:
         async with self.open(filename, mode) as f:
             await f.write(data)
 
+    async def write_json(self, filename: str, data: Any) -> "FileMetadata | None":
+        """Writes data as JSON and optionally uploads to S3 if manager is available."""
+        content = dumps(data, indent=2)
+        await self.write(filename, content)
+        if self._s3_manager:
+            return await self.upload_file(filename)
+        return None
+
+    async def read_json(self, filename: str) -> Any:
+        """Reads a file and parses it as JSON."""
+        content = await self.read(filename)
+        return loads(content)
+
+    async def upload_file(self, filename: str) -> "FileMetadata":
+        """Uploads a specific file to S3 and returns its metadata."""
+        if not self._s3_manager:
+            raise RuntimeError("S3Manager not configured for this TaskFiles instance.")
+        path = await self.path_to(filename)
+        return await self._s3_manager._upload_to_s3(path)
+
+    async def upload_dir(self, dirname: str = "") -> "FileMetadata":
+        """Uploads the entire task directory or a subdirectory to S3."""
+        if not self._s3_manager:
+            raise RuntimeError("S3Manager not configured for this TaskFiles instance.")
+        path = join(self._task_dir, dirname) if dirname else self._task_dir
+        return await self._s3_manager._upload_to_s3(path)
+
+    async def download_file(self, uri: str, filename: str, verify_meta: "FileMetadata" = None) -> str:
+        """Downloads a file from S3 to the task directory with optional integrity check."""
+        if not self._s3_manager:
+            raise RuntimeError("S3Manager not configured for this TaskFiles instance.")
+        return await self._s3_manager._process_s3_uri(uri, self._task_dir.split("/")[-1], verify_meta=verify_meta)
+
     async def list(self) -> list[str]:
         """
         Asynchronously lists all file and directory names within the task root.
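TaskFiles now accepts an optional S3Manager and gains synchronous path helpers plus JSON and S3 convenience methods (write_json, read_json, upload_file, upload_dir, download_file). A sketch of the intended flow follows, assuming an S3 endpoint is configured via the environment as in the previous example; the module path used for the TaskFiles import is a guess, since this diff does not name the file.

```python
# Sketch: TaskFiles wired to an S3Manager. Module path for TaskFiles is assumed.
from asyncio import run
from os.path import join

from avtomatika_worker.config import WorkerConfig
from avtomatika_worker.files import TaskFiles  # module name not shown in this diff
from avtomatika_worker.s3 import S3Manager


async def main() -> None:
    config = WorkerConfig()
    files = TaskFiles(join(config.TASK_FILES_DIR, "task-123"), s3_manager=S3Manager(config))

    # New synchronous helpers, usable from non-async callbacks.
    weights_path = files.path_to_sync("weights.bin")

    # write_json() writes locally and, because an S3Manager is attached,
    # uploads the file and returns its FileMetadata (uri, size, etag).
    meta = await files.write_json("result.json", {"status": "ok", "weights": weights_path})

    if meta is not None:
        # Round-trip with an integrity check against the recorded metadata.
        local_copy = await files.download_file(meta.uri, "result.json", verify_meta=meta)
        print(local_copy, await files.read_json("result.json"))


run(main())
```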
@@ -1,8 +1,26 @@
-# Error codes for worker task results
-TRANSIENT_ERROR = "TRANSIENT_ERROR"
-PERMANENT_ERROR = "PERMANENT_ERROR"
-INVALID_INPUT_ERROR = "INVALID_INPUT_ERROR"
+from typing import Any, Awaitable, Callable, Dict
+
+from rxon.constants import (
+    ERROR_CODE_INVALID_INPUT as INVALID_INPUT_ERROR,
+)
+from rxon.constants import (
+    ERROR_CODE_PERMANENT as PERMANENT_ERROR,
+)
+from rxon.constants import (
+    ERROR_CODE_TRANSIENT as TRANSIENT_ERROR,
+)
+
+Middleware = Callable[[Dict[str, Any], Callable[[], Awaitable[Any]]], Awaitable[Any]]
+CapacityChecker = Callable[[str], bool]
 
 
 class ParamValidationError(Exception):
-    """Custom exception for parameter validation errors."""
+    pass
+
+
+__all__ = [
+    "INVALID_INPUT_ERROR",
+    "PERMANENT_ERROR",
+    "TRANSIENT_ERROR",
+    "ParamValidationError",
+]
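Beyond re-exporting the rxon error codes under their old names, this module introduces two callable aliases, Middleware and CapacityChecker. The diff does not show how a worker registers them, so the sketch below only demonstrates functions whose signatures satisfy the aliases; the "task_id" payload key and the skill-name interpretation of the string argument are assumptions.

```python
# Sketch: callables shaped to match the new type aliases.
from time import monotonic
from typing import Any, Awaitable, Callable, Dict


async def timing_middleware(task: Dict[str, Any], call_next: Callable[[], Awaitable[Any]]) -> Any:
    """Matches Middleware: a task payload plus a zero-argument continuation."""
    started = monotonic()
    try:
        return await call_next()
    finally:
        # "task_id" is an assumed payload key, used only for the log line.
        print(f"task {task.get('task_id', '?')} finished in {monotonic() - started:.3f}s")


def has_capacity(skill: str) -> bool:
    """Matches CapacityChecker; the string argument is assumed to be a skill name."""
    return skill != "gpu-render"  # placeholder policy


# Both callables type-check against the aliases exported by this module:
# Middleware = Callable[[Dict[str, Any], Callable[[], Awaitable[Any]]], Awaitable[Any]]
# CapacityChecker = Callable[[str], bool]
```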