avtomatika 1.0b2__tar.gz → 1.0b3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. {avtomatika-1.0b2/src/avtomatika.egg-info → avtomatika-1.0b3}/PKG-INFO +29 -11
  2. {avtomatika-1.0b2 → avtomatika-1.0b3}/README.md +19 -1
  3. {avtomatika-1.0b2 → avtomatika-1.0b3}/pyproject.toml +10 -10
  4. {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/__init__.py +2 -3
  5. {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/blueprint.py +5 -7
  6. {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/client_config_loader.py +18 -6
  7. {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/dispatcher.py +13 -19
  8. {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/engine.py +16 -16
  9. {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/executor.py +6 -3
  10. {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/ratelimit.py +3 -10
  11. {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/reputation.py +11 -2
  12. {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/storage/__init__.py +3 -3
  13. {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/storage/base.py +23 -0
  14. {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/storage/memory.py +34 -8
  15. {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/storage/redis.py +37 -20
  16. {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/telemetry.py +3 -3
  17. avtomatika-1.0b3/src/avtomatika/watcher.py +82 -0
  18. {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/worker_config_loader.py +7 -2
  19. {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/ws_manager.py +2 -1
  20. {avtomatika-1.0b2 → avtomatika-1.0b3/src/avtomatika.egg-info}/PKG-INFO +29 -11
  21. {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika.egg-info/SOURCES.txt +4 -0
  22. {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika.egg-info/requires.txt +8 -8
  23. {avtomatika-1.0b2 → avtomatika-1.0b3}/tests/test_client_config_loader.py +7 -6
  24. avtomatika-1.0b3/tests/test_config_validation.py +60 -0
  25. avtomatika-1.0b3/tests/test_dispatcher_extended.py +95 -0
  26. avtomatika-1.0b3/tests/test_memory_locking.py +44 -0
  27. avtomatika-1.0b3/tests/test_redis_locking.py +45 -0
  28. {avtomatika-1.0b2 → avtomatika-1.0b3}/tests/test_watcher.py +3 -0
  29. {avtomatika-1.0b2 → avtomatika-1.0b3}/tests/test_worker_config_loader.py +7 -4
  30. avtomatika-1.0b2/src/avtomatika/watcher.py +0 -68
  31. {avtomatika-1.0b2 → avtomatika-1.0b3}/LICENSE +0 -0
  32. {avtomatika-1.0b2 → avtomatika-1.0b3}/setup.cfg +0 -0
  33. {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/api.html +0 -0
  34. {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/compression.py +0 -0
  35. {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/config.py +0 -0
  36. {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/context.py +0 -0
  37. {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/data_types.py +0 -0
  38. {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/datastore.py +0 -0
  39. {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/health_checker.py +0 -0
  40. {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/history/base.py +0 -0
  41. {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/history/noop.py +0 -0
  42. {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/history/postgres.py +0 -0
  43. {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/history/sqlite.py +0 -0
  44. {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/logging_config.py +0 -0
  45. {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/metrics.py +0 -0
  46. {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/py.typed +0 -0
  47. {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/quota.py +0 -0
  48. {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/security.py +0 -0
  49. {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika.egg-info/dependency_links.txt +0 -0
  50. {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika.egg-info/top_level.txt +0 -0
  51. {avtomatika-1.0b2 → avtomatika-1.0b3}/tests/test_blueprint_conditions.py +0 -0
  52. {avtomatika-1.0b2 → avtomatika-1.0b3}/tests/test_blueprints.py +0 -0
  53. {avtomatika-1.0b2 → avtomatika-1.0b3}/tests/test_compression.py +0 -0
  54. {avtomatika-1.0b2 → avtomatika-1.0b3}/tests/test_context.py +0 -0
  55. {avtomatika-1.0b2 → avtomatika-1.0b3}/tests/test_dispatcher.py +0 -0
  56. {avtomatika-1.0b2 → avtomatika-1.0b3}/tests/test_engine.py +0 -0
  57. {avtomatika-1.0b2 → avtomatika-1.0b3}/tests/test_error_handling.py +0 -0
  58. {avtomatika-1.0b2 → avtomatika-1.0b3}/tests/test_executor.py +0 -0
  59. {avtomatika-1.0b2 → avtomatika-1.0b3}/tests/test_health_checker.py +0 -0
  60. {avtomatika-1.0b2 → avtomatika-1.0b3}/tests/test_history.py +0 -0
  61. {avtomatika-1.0b2 → avtomatika-1.0b3}/tests/test_integration.py +0 -0
  62. {avtomatika-1.0b2 → avtomatika-1.0b3}/tests/test_logging_config.py +0 -0
  63. {avtomatika-1.0b2 → avtomatika-1.0b3}/tests/test_memory_storage.py +0 -0
  64. {avtomatika-1.0b2 → avtomatika-1.0b3}/tests/test_metrics.py +0 -0
  65. {avtomatika-1.0b2 → avtomatika-1.0b3}/tests/test_noop_history.py +0 -0
  66. {avtomatika-1.0b2 → avtomatika-1.0b3}/tests/test_postgres_history.py +0 -0
  67. {avtomatika-1.0b2 → avtomatika-1.0b3}/tests/test_ratelimit.py +0 -0
  68. {avtomatika-1.0b2 → avtomatika-1.0b3}/tests/test_redis_storage.py +0 -0
  69. {avtomatika-1.0b2 → avtomatika-1.0b3}/tests/test_reputation.py +0 -0
  70. {avtomatika-1.0b2 → avtomatika-1.0b3}/tests/test_telemetry.py +0 -0
  71. {avtomatika-1.0b2 → avtomatika-1.0b3}/tests/test_ws_manager.py +0 -0
@@ -1,7 +1,7 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: avtomatika
3
- Version: 1.0b2
4
- Summary: A state-machine based orchestrator for long-running jobs.
3
+ Version: 1.0b3
4
+ Summary: A state-machine based orchestrator for long-running AI and other jobs.
5
5
  Project-URL: Homepage, https://github.com/avtomatika-ai/avtomatika
6
6
  Project-URL: Bug Tracker, https://github.com/avtomatika-ai/avtomatika/issues
7
7
  Classifier: Development Status :: 4 - Beta
@@ -18,25 +18,25 @@ Requires-Dist: graphviz~=0.21
18
18
  Requires-Dist: zstandard~=0.24
19
19
  Requires-Dist: aioprometheus~=23.12
20
20
  Provides-Extra: redis
21
- Requires-Dist: redis~=6.4; extra == "redis"
21
+ Requires-Dist: redis~=7.1; extra == "redis"
22
22
  Requires-Dist: orjson~=3.11; extra == "redis"
23
23
  Provides-Extra: history
24
- Requires-Dist: aiosqlite~=0.21; extra == "history"
24
+ Requires-Dist: aiosqlite~=0.22; extra == "history"
25
25
  Requires-Dist: asyncpg~=0.30; extra == "history"
26
26
  Requires-Dist: orjson~=3.11; extra == "history"
27
27
  Provides-Extra: telemetry
28
- Requires-Dist: opentelemetry-api~=1.38; extra == "telemetry"
29
- Requires-Dist: opentelemetry-sdk~=1.38; extra == "telemetry"
30
- Requires-Dist: opentelemetry-exporter-otlp~=1.36; extra == "telemetry"
28
+ Requires-Dist: opentelemetry-api~=1.39; extra == "telemetry"
29
+ Requires-Dist: opentelemetry-sdk~=1.39; extra == "telemetry"
30
+ Requires-Dist: opentelemetry-exporter-otlp~=1.39; extra == "telemetry"
31
31
  Requires-Dist: opentelemetry-instrumentation-aiohttp-client~=0.59b0; extra == "telemetry"
32
32
  Provides-Extra: test
33
- Requires-Dist: pytest~=8.4; extra == "test"
33
+ Requires-Dist: pytest~=9.0; extra == "test"
34
34
  Requires-Dist: pytest-asyncio~=1.1; extra == "test"
35
- Requires-Dist: fakeredis~=2.31; extra == "test"
35
+ Requires-Dist: fakeredis~=2.33; extra == "test"
36
36
  Requires-Dist: pytest-aiohttp~=1.1; extra == "test"
37
37
  Requires-Dist: pytest-mock~=3.14; extra == "test"
38
38
  Requires-Dist: aioresponses~=0.7; extra == "test"
39
- Requires-Dist: backports.zstd; extra == "test"
39
+ Requires-Dist: backports.zstd~=1.2; extra == "test"
40
40
  Requires-Dist: opentelemetry-instrumentation-aiohttp-client; extra == "test"
41
41
  Provides-Extra: all
42
42
  Requires-Dist: avtomatika[redis]; extra == "all"
@@ -285,7 +285,7 @@ Run multiple tasks simultaneously and gather their results.
285
285
  @my_blueprint.handler_for("process_files")
286
286
  async def fan_out_handler(initial_data, actions):
287
287
  tasks_to_dispatch = [
288
- {"task_type": "file_analysis", "params": {"file": file}}
288
+ {"task_type": "file_analysis", "params": {"file": file}})
289
289
  for file in initial_data.get("files", [])
290
290
  ]
291
291
  # Use dispatch_parallel to send all tasks at once.
@@ -332,6 +332,8 @@ async def cache_handler(data_stores):
332
332
 
333
333
  The orchestrator's behavior can be configured through environment variables. Additionally, any configuration parameter loaded from environment variables can be programmatically overridden in your application code after the `Config` object has been initialized. This provides flexibility for different deployment and testing scenarios.
334
334
 
335
+ **Important:** The system employs **strict validation** for configuration files (`clients.toml`, `workers.toml`) at startup. If a configuration file is invalid (e.g., malformed TOML, missing required fields), the application will **fail fast** and exit with an error, rather than starting in a partially broken state. This ensures the security and integrity of the deployment.
336
+
335
337
  ### Fault Tolerance
336
338
 
337
339
  The orchestrator has built-in mechanisms for handling failures based on the `error.code` field in a worker's response.
@@ -340,6 +342,13 @@ The orchestrator has built-in mechanisms for handling failures based on the `err
340
342
  * **PERMANENT_ERROR**: A permanent error (e.g., a corrupted file). The task will be immediately sent to quarantine for manual investigation.
341
343
  * **INVALID_INPUT_ERROR**: An error in the input data. The entire pipeline (Job) will be immediately moved to the failed state.
342
344
 
345
+ ### High Availability & Distributed Locking
346
+
347
+ The architecture supports horizontal scaling. Multiple Orchestrator instances can run behind a load balancer.
348
+
349
+ * **Stateless API:** The API is stateless; all state is persisted in Redis.
350
+ * **Distributed Locking:** Background processes (`Watcher`, `ReputationCalculator`) use distributed locks (via Redis `SET NX`) to coordinate and prevent race conditions when multiple instances are active.
351
+
343
352
  ### Storage Backend
344
353
 
345
354
  By default, the engine uses in-memory storage. For production, you must configure persistent storage via environment variables.
@@ -408,3 +417,12 @@ To run the `avtomatika` test suite:
408
417
  ```bash
409
418
  pytest avtomatika/tests/
410
419
  ```
420
+
421
+ ## Detailed Documentation
422
+
423
+ For a deeper dive into the system, please refer to the following documents in the `docs/` directory:
424
+
425
+ - [**Architecture Guide**](docs/architecture.md): A detailed overview of the system components and their interactions.
426
+ - [**API Reference**](docs/api_reference.md): Full specification of the HTTP API.
427
+ - [**Deployment Guide**](docs/deployment.md): Instructions for deploying with Gunicorn/Uvicorn and NGINX.
428
+ - [**Cookbook**](docs/cookbook/README.md): Examples and best practices for creating blueprints.
@@ -239,7 +239,7 @@ Run multiple tasks simultaneously and gather their results.
239
239
  @my_blueprint.handler_for("process_files")
240
240
  async def fan_out_handler(initial_data, actions):
241
241
  tasks_to_dispatch = [
242
- {"task_type": "file_analysis", "params": {"file": file}}
242
+ {"task_type": "file_analysis", "params": {"file": file}})
243
243
  for file in initial_data.get("files", [])
244
244
  ]
245
245
  # Use dispatch_parallel to send all tasks at once.
@@ -286,6 +286,8 @@ async def cache_handler(data_stores):
286
286
 
287
287
  The orchestrator's behavior can be configured through environment variables. Additionally, any configuration parameter loaded from environment variables can be programmatically overridden in your application code after the `Config` object has been initialized. This provides flexibility for different deployment and testing scenarios.
288
288
 
289
+ **Important:** The system employs **strict validation** for configuration files (`clients.toml`, `workers.toml`) at startup. If a configuration file is invalid (e.g., malformed TOML, missing required fields), the application will **fail fast** and exit with an error, rather than starting in a partially broken state. This ensures the security and integrity of the deployment.
290
+
289
291
  ### Fault Tolerance
290
292
 
291
293
  The orchestrator has built-in mechanisms for handling failures based on the `error.code` field in a worker's response.
@@ -294,6 +296,13 @@ The orchestrator has built-in mechanisms for handling failures based on the `err
294
296
  * **PERMANENT_ERROR**: A permanent error (e.g., a corrupted file). The task will be immediately sent to quarantine for manual investigation.
295
297
  * **INVALID_INPUT_ERROR**: An error in the input data. The entire pipeline (Job) will be immediately moved to the failed state.
296
298
 
299
+ ### High Availability & Distributed Locking
300
+
301
+ The architecture supports horizontal scaling. Multiple Orchestrator instances can run behind a load balancer.
302
+
303
+ * **Stateless API:** The API is stateless; all state is persisted in Redis.
304
+ * **Distributed Locking:** Background processes (`Watcher`, `ReputationCalculator`) use distributed locks (via Redis `SET NX`) to coordinate and prevent race conditions when multiple instances are active.
305
+
297
306
  ### Storage Backend
298
307
 
299
308
  By default, the engine uses in-memory storage. For production, you must configure persistent storage via environment variables.
@@ -362,3 +371,12 @@ To run the `avtomatika` test suite:
362
371
  ```bash
363
372
  pytest avtomatika/tests/
364
373
  ```
374
+
375
+ ## Detailed Documentation
376
+
377
+ For a deeper dive into the system, please refer to the following documents in the `docs/` directory:
378
+
379
+ - [**Architecture Guide**](docs/architecture.md): A detailed overview of the system components and their interactions.
380
+ - [**API Reference**](docs/api_reference.md): Full specification of the HTTP API.
381
+ - [**Deployment Guide**](docs/deployment.md): Instructions for deploying with Gunicorn/Uvicorn and NGINX.
382
+ - [**Cookbook**](docs/cookbook/README.md): Examples and best practices for creating blueprints.
@@ -4,8 +4,8 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "avtomatika"
7
- version = "1.0b2"
8
- description = "A state-machine based orchestrator for long-running jobs."
7
+ version = "1.0b3"
8
+ description = "A state-machine based orchestrator for long-running AI and other jobs."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.11"
11
11
  classifiers = [
@@ -24,22 +24,22 @@ dependencies = [
24
24
  ]
25
25
 
26
26
  [project.optional-dependencies]
27
- redis = ["redis~=6.4", "orjson~=3.11"]
28
- history = ["aiosqlite~=0.21", "asyncpg~=0.30", "orjson~=3.11"]
27
+ redis = ["redis~=7.1", "orjson~=3.11"]
28
+ history = ["aiosqlite~=0.22", "asyncpg~=0.30", "orjson~=3.11"]
29
29
  telemetry = [
30
- "opentelemetry-api~=1.38",
31
- "opentelemetry-sdk~=1.38",
32
- "opentelemetry-exporter-otlp~=1.36",
30
+ "opentelemetry-api~=1.39",
31
+ "opentelemetry-sdk~=1.39",
32
+ "opentelemetry-exporter-otlp~=1.39",
33
33
  "opentelemetry-instrumentation-aiohttp-client~=0.59b0",
34
34
  ]
35
35
  test = [
36
- "pytest~=8.4",
36
+ "pytest~=9.0",
37
37
  "pytest-asyncio~=1.1",
38
- "fakeredis~=2.31",
38
+ "fakeredis~=2.33",
39
39
  "pytest-aiohttp~=1.1",
40
40
  "pytest-mock~=3.14",
41
41
  "aioresponses~=0.7",
42
- "backports.zstd",
42
+ "backports.zstd~=1.2",
43
43
  "opentelemetry-instrumentation-aiohttp-client",
44
44
  ]
45
45
  all = [
@@ -4,6 +4,7 @@
4
4
  This module exposes the primary classes for building and running state-driven automations.
5
5
  """
6
6
 
7
+ import contextlib
7
8
  from importlib.metadata import version
8
9
 
9
10
  __version__ = version("avtomatika")
@@ -22,9 +23,7 @@ __all__ = [
22
23
  "StorageBackend",
23
24
  ]
24
25
 
25
- try:
26
+ with contextlib.suppress(ImportError):
26
27
  from .storage.redis import RedisStorage # noqa: F401
27
28
 
28
29
  __all__.append("RedisStorage")
29
- except ImportError:
30
- pass
@@ -168,8 +168,7 @@ class StateMachineBlueprint:
168
168
  for handler in self.conditional_handlers:
169
169
  if handler.state == state and handler.evaluate(context):
170
170
  return handler.func
171
- default_handler = self.handlers.get(state)
172
- if default_handler:
171
+ if default_handler := self.handlers.get(state):
173
172
  return default_handler
174
173
  raise ValueError(
175
174
  f"No suitable handler found for state '{state}' in blueprint '{self.name}' for the given context.",
@@ -230,12 +229,11 @@ class StateMachineBlueprint:
230
229
  f"Could not parse handler '{handler_func.__name__}' for state '{handler_state}'. "
231
230
  f"Graph may be incomplete. Error: {e}"
232
231
  )
233
- pass
234
232
  for state in states:
235
233
  dot.node(state, state)
236
234
 
237
- if output_filename:
238
- dot.render(output_filename, format=output_format, cleanup=True)
239
- print(f"Graph rendered to {output_filename}.{output_format}")
240
- else:
235
+ if not output_filename:
241
236
  return dot.source
237
+ dot.render(output_filename, format=output_format, cleanup=True)
238
+ print(f"Graph rendered to {output_filename}.{output_format}")
239
+ return None
@@ -26,25 +26,37 @@ async def load_client_configs_to_redis(
26
26
  config_path,
27
27
  )
28
28
  return
29
+ except Exception as e:
30
+ logger.error(f"Failed to parse client config file '{config_path}': {e}")
31
+ raise ValueError(f"Invalid client configuration file: {e}") from e
29
32
 
30
33
  loaded_count = 0
31
34
  for client_name, config in clients_data.items():
35
+ if not isinstance(config, dict):
36
+ logger.error(f"Client '{client_name}' configuration must be a table (dict).")
37
+ raise ValueError(f"Invalid configuration for client '{client_name}'")
38
+
32
39
  token = config.get("token")
33
40
  if not token:
34
- logger.warning(
35
- "Skipping client '%s' due to missing 'token' field.",
36
- client_name,
37
- )
38
- continue
41
+ logger.error(f"Client '{client_name}' is missing required 'token' field.")
42
+ raise ValueError(f"Missing token for client '{client_name}'")
43
+
44
+ if not isinstance(token, str):
45
+ logger.error(f"Token for client '{client_name}' must be a string.")
46
+ raise ValueError(f"Invalid token type for client '{client_name}'")
39
47
 
40
48
  # Separate static config from dynamic quota values
41
49
  static_config = {k: v for k, v in config.items() if k != "monthly_attempts"}
42
50
  quota = config.get("monthly_attempts")
43
51
 
52
+ if quota is not None and not isinstance(quota, int):
53
+ logger.error(f"Quota 'monthly_attempts' for client '{client_name}' must be an integer.")
54
+ raise ValueError(f"Invalid quota type for client '{client_name}'")
55
+
44
56
  try:
45
57
  # Assume these storage methods will be implemented
46
58
  await storage.save_client_config(token, static_config)
47
- if quota is not None and isinstance(quota, int):
59
+ if quota is not None:
48
60
  await storage.initialize_client_quota(token, quota)
49
61
 
50
62
  loaded_count += 1
@@ -28,15 +28,13 @@ class Dispatcher:
28
28
  self.config = config
29
29
  self._round_robin_indices: Dict[str, int] = defaultdict(int)
30
30
 
31
+ @staticmethod
31
32
  def _is_worker_compliant(
32
- self,
33
33
  worker: Dict[str, Any],
34
34
  requirements: Dict[str, Any],
35
35
  ) -> bool:
36
36
  """Checks if a worker meets the specified resource requirements."""
37
- # GPU check
38
- required_gpu = requirements.get("gpu_info")
39
- if required_gpu:
37
+ if required_gpu := requirements.get("gpu_info"):
40
38
  gpu_info = worker.get("resources", {}).get("gpu_info")
41
39
  if not gpu_info:
42
40
  return False
@@ -51,17 +49,15 @@ class Dispatcher:
51
49
  ):
52
50
  return False
53
51
 
54
- # Installed models check
55
- required_models = requirements.get("installed_models")
56
- if required_models:
52
+ if required_models := requirements.get("installed_models"):
57
53
  installed_models = {m["name"] for m in worker.get("installed_models", [])}
58
54
  if not set(required_models).issubset(installed_models):
59
55
  return False
60
56
 
61
57
  return True
62
58
 
59
+ @staticmethod
63
60
  def _select_default(
64
- self,
65
61
  workers: List[Dict[str, Any]],
66
62
  task_type: str,
67
63
  ) -> Dict[str, Any]:
@@ -74,7 +70,7 @@ class Dispatcher:
74
70
  """
75
71
  warm_workers = [w for w in workers if task_type in w.get("hot_cache", [])]
76
72
 
77
- target_pool = warm_workers if warm_workers else workers
73
+ target_pool = warm_workers or workers
78
74
 
79
75
  # The `cost` field is deprecated but maintained for backward compatibility.
80
76
  min_cost = min(w.get("cost", float("inf")) for w in target_pool)
@@ -95,8 +91,8 @@ class Dispatcher:
95
91
  self._round_robin_indices[task_type] = idx + 1
96
92
  return selected_worker
97
93
 
94
+ @staticmethod
98
95
  def _select_least_connections(
99
- self,
100
96
  workers: List[Dict[str, Any]],
101
97
  task_type: str,
102
98
  ) -> Dict[str, Any]:
@@ -105,15 +101,16 @@ class Dispatcher:
105
101
  """
106
102
  return min(workers, key=lambda w: w.get("load", 0.0))
107
103
 
104
+ @staticmethod
108
105
  def _select_cheapest(
109
- self,
110
106
  workers: List[Dict[str, Any]],
111
107
  task_type: str,
112
108
  ) -> Dict[str, Any]:
113
109
  """Selects the cheapest worker based on 'cost_per_second'."""
114
110
  return min(workers, key=lambda w: w.get("cost_per_second", float("inf")))
115
111
 
116
- def _get_best_value_score(self, worker: Dict[str, Any]) -> float:
112
+ @staticmethod
113
+ def _get_best_value_score(worker: Dict[str, Any]) -> float:
117
114
  """Calculates a "score" for a worker using the formula cost / reputation.
118
115
  The lower the score, the better.
119
116
  """
@@ -121,9 +118,7 @@ class Dispatcher:
121
118
  # Default reputation is 1.0 if absent
122
119
  reputation = worker.get("reputation", 1.0)
123
120
  # Avoid division by zero
124
- if reputation == 0:
125
- return float("inf")
126
- return cost / reputation
121
+ return float("inf") if reputation == 0 else cost / reputation
127
122
 
128
123
  def _select_best_value(
129
124
  self,
@@ -153,10 +148,9 @@ class Dispatcher:
153
148
  idle_workers = [w for w in all_workers if w.get("status", "idle") == "idle"]
154
149
  logger.debug(f"Idle workers: {[w['worker_id'] for w in idle_workers]}")
155
150
  if not idle_workers:
156
- # If there are no idle workers, check if there are any busy workers in multi-orchestrator mode.
157
- # This doesn't change the logic (an error will still occur), but it makes the logs more informative.
158
- busy_mo_workers = [w for w in all_workers if w.get("status") == "busy" and "multi_orchestrator_info" in w]
159
- if busy_mo_workers:
151
+ if busy_mo_workers := [
152
+ w for w in all_workers if w.get("status") == "busy" and "multi_orchestrator_info" in w
153
+ ]:
160
154
  logger.warning(
161
155
  f"No idle workers. Found {len(busy_mo_workers)} busy workers "
162
156
  f"in multi-orchestrator mode. They are likely performing tasks for other Orchestrators.",
@@ -485,8 +485,7 @@ class OrchestratorEngine:
485
485
  await self.storage.save_job_state(job_id, job_state)
486
486
  # Optionally, trigger a specific 'cancelled' transition if defined in the blueprint
487
487
  transitions = job_state.get("current_task_transitions", {})
488
- next_state = transitions.get("cancelled")
489
- if next_state:
488
+ if next_state := transitions.get("cancelled"):
490
489
  job_state["current_state"] = next_state
491
490
  job_state["status"] = "running" # It's running the cancellation handler now
492
491
  await self.storage.save_job_state(job_id, job_state)
@@ -494,9 +493,7 @@ class OrchestratorEngine:
494
493
  return web.json_response({"status": "result_accepted_cancelled"}, status=200)
495
494
 
496
495
  transitions = job_state.get("current_task_transitions", {})
497
- next_state = transitions.get(result_status)
498
-
499
- if next_state:
496
+ if next_state := transitions.get(result_status):
500
497
  logging.info(f"Job {job_id} transitioning based on worker status '{result_status}' to state '{next_state}'")
501
498
 
502
499
  worker_data = result.get("data")
@@ -602,7 +599,8 @@ class OrchestratorEngine:
602
599
  await load_client_configs_to_redis(self.storage)
603
600
  return web.json_response({"status": "db_flushed"}, status=200)
604
601
 
605
- async def _docs_handler(self, request: web.Request) -> web.Response:
602
+ @staticmethod
603
+ async def _docs_handler(request: web.Request) -> web.Response:
606
604
  from importlib import resources
607
605
 
608
606
  try:
@@ -647,16 +645,7 @@ class OrchestratorEngine:
647
645
  all_protected_apps.append(protected_app)
648
646
 
649
647
  for app in all_protected_apps:
650
- app.router.add_get("/jobs/{job_id}", self._get_job_status_handler)
651
- app.router.add_post("/jobs/{job_id}/cancel", self._cancel_job_handler)
652
- if not isinstance(self.history_storage, NoOpHistoryStorage):
653
- app.router.add_get("/jobs/{job_id}/history", self._get_job_history_handler)
654
- app.router.add_get("/blueprints/{blueprint_name}/graph", self._get_blueprint_graph_handler)
655
- app.router.add_get("/workers", self._get_workers_handler)
656
- app.router.add_get("/jobs", self._get_jobs_handler)
657
- app.router.add_get("/dashboard", self._get_dashboard_handler)
658
- app.router.add_post("/admin/reload-workers", self._reload_worker_configs_handler)
659
-
648
+ self._register_common_routes(app)
660
649
  if has_unversioned_routes:
661
650
  self.app.add_subapp("/api/", protected_app)
662
651
  for version, app in versioned_apps.items():
@@ -676,6 +665,17 @@ class OrchestratorEngine:
676
665
  worker_app.router.add_get("/ws/{worker_id}", self._websocket_handler)
677
666
  self.app.add_subapp("/_worker/", worker_app)
678
667
 
668
+ def _register_common_routes(self, app):
669
+ app.router.add_get("/jobs/{job_id}", self._get_job_status_handler)
670
+ app.router.add_post("/jobs/{job_id}/cancel", self._cancel_job_handler)
671
+ if not isinstance(self.history_storage, NoOpHistoryStorage):
672
+ app.router.add_get("/jobs/{job_id}/history", self._get_job_history_handler)
673
+ app.router.add_get("/blueprints/{blueprint_name}/graph", self._get_blueprint_graph_handler)
674
+ app.router.add_get("/workers", self._get_workers_handler)
675
+ app.router.add_get("/jobs", self._get_jobs_handler)
676
+ app.router.add_get("/dashboard", self._get_dashboard_handler)
677
+ app.router.add_post("/admin/reload-workers", self._reload_worker_configs_handler)
678
+
679
679
  async def _websocket_handler(self, request: web.Request) -> web.WebSocketResponse:
680
680
  worker_id = request.match_info.get("worker_id")
681
681
  if not worker_id:
@@ -35,11 +35,13 @@ except ImportError:
35
35
  def inject(self, *args, **kwargs):
36
36
  pass
37
37
 
38
- def extract(self, *args, **kwargs):
38
+ @staticmethod
39
+ def extract(*args, **kwargs):
39
40
  return None
40
41
 
41
42
  class NoOpTraceContextTextMapPropagator:
42
- def extract(self, *args, **kwargs):
43
+ @staticmethod
44
+ def extract(*args, **kwargs):
43
45
  return None
44
46
 
45
47
  trace = NoOpTracer()
@@ -485,7 +487,8 @@ class JobExecutor:
485
487
  await self.storage.save_job_state(parent_job_id, parent_job_state)
486
488
  await self.storage.enqueue_job(parent_job_id)
487
489
 
488
- def _handle_task_completion(self, task: Task):
490
+ @staticmethod
491
+ def _handle_task_completion(task: Task):
489
492
  """Callback to handle completion of a job processing task."""
490
493
  try:
491
494
  # This will re-raise any exception caught in the task
@@ -1,3 +1,4 @@
1
+ from contextlib import suppress
1
2
  from typing import Awaitable, Callable
2
3
 
3
4
  from aiohttp import web
@@ -23,23 +24,15 @@ def rate_limit_middleware_factory(
23
24
  """Rate-limiting middleware that uses the provided storage backend."""
24
25
  # Determine the key for rate limiting (e.g., by worker_id or IP)
25
26
  # For worker endpoints, we key by worker_id. For others, by IP.
26
- key_identifier = request.match_info.get("worker_id", request.remote)
27
- if not key_identifier:
28
- # Fallback for cases where remote IP might not be available
29
- key_identifier = "unknown"
27
+ key_identifier = request.match_info.get("worker_id", request.remote) or "unknown"
30
28
 
31
29
  # Key by identifier and path to have per-endpoint limits
32
30
  rate_limit_key = f"ratelimit:{key_identifier}:{request.path}"
33
31
 
34
- try:
32
+ with suppress(Exception):
35
33
  count = await storage.increment_key_with_ttl(rate_limit_key, period)
36
34
  if count > limit:
37
35
  return web.json_response({"error": "Too Many Requests"}, status=429)
38
- except Exception:
39
- # If the rate limiter fails for any reason (e.g., Redis down),
40
- # it's safer to let the request through than to block everything.
41
- pass
42
-
43
36
  return await handler(request)
44
37
 
45
38
  return rate_limit_middleware
@@ -1,6 +1,7 @@
1
1
  from asyncio import CancelledError, sleep
2
2
  from logging import getLogger
3
3
  from typing import TYPE_CHECKING
4
+ from uuid import uuid4
4
5
 
5
6
  if TYPE_CHECKING:
6
7
  from .engine import OrchestratorEngine
@@ -20,14 +21,22 @@ class ReputationCalculator:
20
21
  self.history_storage = engine.history_storage
21
22
  self.interval_seconds = interval_seconds
22
23
  self._running = False
24
+ self._instance_id = str(uuid4())
23
25
 
24
26
  async def run(self):
25
27
  """The main loop that periodically triggers reputation recalculation."""
26
- logger.info("ReputationCalculator started.")
28
+ logger.info(f"ReputationCalculator started (Instance ID: {self._instance_id}).")
27
29
  self._running = True
28
30
  while self._running:
29
31
  try:
30
- await self.calculate_all_reputations()
32
+ # Attempt to acquire lock
33
+ if await self.storage.acquire_lock("global_reputation_lock", self._instance_id, 300):
34
+ try:
35
+ await self.calculate_all_reputations()
36
+ finally:
37
+ await self.storage.release_lock("global_reputation_lock", self._instance_id)
38
+ else:
39
+ logger.debug("ReputationCalculator lock held by another instance. Skipping.")
31
40
  except CancelledError:
32
41
  break
33
42
  except Exception:
@@ -1,11 +1,11 @@
1
+ import contextlib
2
+
1
3
  from .base import StorageBackend
2
4
  from .memory import MemoryStorage
3
5
 
4
6
  __all__ = ["StorageBackend", "MemoryStorage"]
5
7
 
6
- try:
8
+ with contextlib.suppress(ImportError):
7
9
  from .redis import RedisStorage # noqa: F401
8
10
 
9
11
  __all__.append("RedisStorage")
10
- except ImportError:
11
- pass
@@ -264,3 +264,26 @@ class StorageBackend(ABC):
264
264
  Used for metrics.
265
265
  """
266
266
  raise NotImplementedError
267
+
268
+ @abstractmethod
269
+ async def acquire_lock(self, key: str, holder_id: str, ttl: int) -> bool:
270
+ """
271
+ Attempts to acquire a distributed lock.
272
+
273
+ :param key: The unique key of the lock (e.g., 'watcher_lock').
274
+ :param holder_id: A unique identifier for the caller (e.g., UUID).
275
+ :param ttl: Time-to-live for the lock in seconds.
276
+ :return: True if the lock was acquired, False otherwise.
277
+ """
278
+ raise NotImplementedError
279
+
280
+ @abstractmethod
281
+ async def release_lock(self, key: str, holder_id: str) -> bool:
282
+ """
283
+ Releases a distributed lock if it is held by the specified holder_id.
284
+
285
+ :param key: The unique key of the lock.
286
+ :param holder_id: The identifier of the caller who presumably holds the lock.
287
+ :return: True if the lock was successfully released, False otherwise.
288
+ """
289
+ raise NotImplementedError
@@ -25,6 +25,7 @@ class MemoryStorage(StorageBackend):
25
25
  self._worker_tokens: Dict[str, str] = {}
26
26
  self._generic_keys: Dict[str, Any] = {}
27
27
  self._generic_key_ttls: Dict[str, float] = {}
28
+ self._locks: Dict[str, tuple[str, float]] = {} # key -> (holder_id, expiry_time)
28
29
 
29
30
  self._lock = Lock()
30
31
 
@@ -128,9 +129,11 @@ class MemoryStorage(StorageBackend):
128
129
  async with self._lock:
129
130
  now = monotonic()
130
131
  active_workers = []
131
- for worker_id, worker_info in self._workers.items():
132
- if self._worker_ttls.get(worker_id, 0) > now:
133
- active_workers.append(worker_info)
132
+ active_workers.extend(
133
+ worker_info
134
+ for worker_id, worker_info in self._workers.items()
135
+ if self._worker_ttls.get(worker_id, 0) > now
136
+ )
134
137
  return active_workers
135
138
 
136
139
  async def add_job_to_watch(self, job_id: str, timeout_at: float) -> None:
@@ -226,6 +229,7 @@ class MemoryStorage(StorageBackend):
226
229
  self._quotas.clear()
227
230
  self._generic_keys.clear()
228
231
  self._generic_key_ttls.clear()
232
+ self._locks.clear()
229
233
 
230
234
  async def get_job_queue_length(self) -> int:
231
235
  # No lock needed for asyncio.Queue.qsize()
@@ -234,13 +238,9 @@ class MemoryStorage(StorageBackend):
234
238
  async def get_active_worker_count(self) -> int:
235
239
  async with self._lock:
236
240
  now = monotonic()
237
- count = 0
238
241
  # Create a copy of keys to avoid issues with concurrent modifications
239
242
  worker_ids = list(self._workers.keys())
240
- for worker_id in worker_ids:
241
- if self._worker_ttls.get(worker_id, 0) > now:
242
- count += 1
243
- return count
243
+ return sum(self._worker_ttls.get(worker_id, 0) > now for worker_id in worker_ids)
244
244
 
245
245
  async def get_worker_info(self, worker_id: str) -> Optional[Dict[str, Any]]:
246
246
  async with self._lock:
@@ -273,3 +273,29 @@ class MemoryStorage(StorageBackend):
273
273
  "average_bid": 0,
274
274
  "error": "Statistics are not supported for MemoryStorage backend.",
275
275
  }
276
+
277
+ async def acquire_lock(self, key: str, holder_id: str, ttl: int) -> bool:
278
+ async with self._lock:
279
+ now = monotonic()
280
+ current_lock = self._locks.get(key)
281
+
282
+ # If lock exists and hasn't expired
283
+ if current_lock and current_lock[1] > now:
284
+ # A live lock is not re-entrant: acquisition is refused even when the
285
+ # caller is the current holder, so it must release the lock or wait for expiry.
286
+ return False
287
+
288
+ # Acquire lock
289
+ self._locks[key] = (holder_id, now + ttl)
290
+ return True
291
+
292
+ async def release_lock(self, key: str, holder_id: str) -> bool:
293
+ async with self._lock:
294
+ current_lock = self._locks.get(key)
295
+ if current_lock:
296
+ owner, expiry = current_lock
297
+ # Only release if we are the owner
298
+ if owner == holder_id:
299
+ del self._locks[key]
300
+ return True
301
+ return False