avtomatika 1.0b7.tar.gz → 1.0b8.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. {avtomatika-1.0b7 → avtomatika-1.0b8}/PKG-INFO +50 -2
  2. avtomatika-1.0b7/src/avtomatika.egg-info/PKG-INFO → avtomatika-1.0b8/README.md +44 -47
  3. {avtomatika-1.0b7 → avtomatika-1.0b8}/pyproject.toml +5 -1
  4. {avtomatika-1.0b7 → avtomatika-1.0b8}/src/avtomatika/app_keys.py +1 -0
  5. {avtomatika-1.0b7 → avtomatika-1.0b8}/src/avtomatika/config.py +10 -0
  6. {avtomatika-1.0b7 → avtomatika-1.0b8}/src/avtomatika/data_types.py +2 -1
  7. {avtomatika-1.0b7 → avtomatika-1.0b8}/src/avtomatika/dispatcher.py +8 -26
  8. {avtomatika-1.0b7 → avtomatika-1.0b8}/src/avtomatika/engine.py +19 -1
  9. {avtomatika-1.0b7 → avtomatika-1.0b8}/src/avtomatika/executor.py +34 -6
  10. avtomatika-1.0b8/src/avtomatika/health_checker.py +57 -0
  11. avtomatika-1.0b8/src/avtomatika/history/base.py +105 -0
  12. {avtomatika-1.0b7 → avtomatika-1.0b8}/src/avtomatika/history/noop.py +18 -7
  13. {avtomatika-1.0b7 → avtomatika-1.0b8}/src/avtomatika/history/postgres.py +8 -6
  14. {avtomatika-1.0b7 → avtomatika-1.0b8}/src/avtomatika/history/sqlite.py +7 -5
  15. {avtomatika-1.0b7 → avtomatika-1.0b8}/src/avtomatika/metrics.py +1 -1
  16. {avtomatika-1.0b7 → avtomatika-1.0b8}/src/avtomatika/reputation.py +46 -40
  17. avtomatika-1.0b8/src/avtomatika/s3.py +323 -0
  18. {avtomatika-1.0b7 → avtomatika-1.0b8}/src/avtomatika/storage/base.py +45 -4
  19. {avtomatika-1.0b7 → avtomatika-1.0b8}/src/avtomatika/storage/memory.py +44 -6
  20. avtomatika-1.0b8/src/avtomatika/storage/redis.py +443 -0
  21. {avtomatika-1.0b7 → avtomatika-1.0b8}/src/avtomatika/utils/webhook_sender.py +44 -2
  22. avtomatika-1.0b8/src/avtomatika/watcher.py +78 -0
  23. {avtomatika-1.0b7 → avtomatika-1.0b8}/src/avtomatika/ws_manager.py +7 -6
  24. avtomatika-1.0b7/README.md → avtomatika-1.0b8/src/avtomatika.egg-info/PKG-INFO +95 -2
  25. {avtomatika-1.0b7 → avtomatika-1.0b8}/src/avtomatika.egg-info/SOURCES.txt +2 -0
  26. {avtomatika-1.0b7 → avtomatika-1.0b8}/src/avtomatika.egg-info/requires.txt +7 -0
  27. {avtomatika-1.0b7 → avtomatika-1.0b8}/tests/test_dispatcher.py +31 -44
  28. {avtomatika-1.0b7 → avtomatika-1.0b8}/tests/test_dispatcher_extended.py +7 -3
  29. {avtomatika-1.0b7 → avtomatika-1.0b8}/tests/test_engine.py +38 -4
  30. {avtomatika-1.0b7 → avtomatika-1.0b8}/tests/test_executor.py +3 -0
  31. {avtomatika-1.0b7 → avtomatika-1.0b8}/tests/test_handlers.py +8 -0
  32. {avtomatika-1.0b7 → avtomatika-1.0b8}/tests/test_history.py +11 -6
  33. avtomatika-1.0b8/tests/test_postgres_history.py +84 -0
  34. {avtomatika-1.0b7 → avtomatika-1.0b8}/tests/test_reputation.py +2 -2
  35. avtomatika-1.0b8/tests/test_s3.py +265 -0
  36. {avtomatika-1.0b7 → avtomatika-1.0b8}/tests/test_webhook_sender.py +23 -3
  37. avtomatika-1.0b7/src/avtomatika/health_checker.py +0 -39
  38. avtomatika-1.0b7/src/avtomatika/history/base.py +0 -51
  39. avtomatika-1.0b7/src/avtomatika/storage/redis.py +0 -510
  40. avtomatika-1.0b7/src/avtomatika/watcher.py +0 -80
  41. avtomatika-1.0b7/tests/test_postgres_history.py +0 -107
  42. {avtomatika-1.0b7 → avtomatika-1.0b8}/LICENSE +0 -0
  43. {avtomatika-1.0b7 → avtomatika-1.0b8}/setup.cfg +0 -0
  44. {avtomatika-1.0b7 → avtomatika-1.0b8}/src/avtomatika/__init__.py +0 -0
  45. {avtomatika-1.0b7 → avtomatika-1.0b8}/src/avtomatika/api/handlers.py +0 -0
  46. {avtomatika-1.0b7 → avtomatika-1.0b8}/src/avtomatika/api/routes.py +0 -0
  47. {avtomatika-1.0b7 → avtomatika-1.0b8}/src/avtomatika/api.html +0 -0
  48. {avtomatika-1.0b7 → avtomatika-1.0b8}/src/avtomatika/blueprint.py +0 -0
  49. {avtomatika-1.0b7 → avtomatika-1.0b8}/src/avtomatika/client_config_loader.py +0 -0
  50. {avtomatika-1.0b7 → avtomatika-1.0b8}/src/avtomatika/compression.py +0 -0
  51. {avtomatika-1.0b7 → avtomatika-1.0b8}/src/avtomatika/constants.py +0 -0
  52. {avtomatika-1.0b7 → avtomatika-1.0b8}/src/avtomatika/context.py +0 -0
  53. {avtomatika-1.0b7 → avtomatika-1.0b8}/src/avtomatika/datastore.py +0 -0
  54. {avtomatika-1.0b7 → avtomatika-1.0b8}/src/avtomatika/logging_config.py +0 -0
  55. {avtomatika-1.0b7 → avtomatika-1.0b8}/src/avtomatika/py.typed +0 -0
  56. {avtomatika-1.0b7 → avtomatika-1.0b8}/src/avtomatika/quota.py +0 -0
  57. {avtomatika-1.0b7 → avtomatika-1.0b8}/src/avtomatika/ratelimit.py +0 -0
  58. {avtomatika-1.0b7 → avtomatika-1.0b8}/src/avtomatika/scheduler.py +0 -0
  59. {avtomatika-1.0b7 → avtomatika-1.0b8}/src/avtomatika/scheduler_config_loader.py +0 -0
  60. {avtomatika-1.0b7 → avtomatika-1.0b8}/src/avtomatika/security.py +0 -0
  61. {avtomatika-1.0b7 → avtomatika-1.0b8}/src/avtomatika/storage/__init__.py +0 -0
  62. {avtomatika-1.0b7 → avtomatika-1.0b8}/src/avtomatika/telemetry.py +0 -0
  63. {avtomatika-1.0b7 → avtomatika-1.0b8}/src/avtomatika/utils/__init__.py +0 -0
  64. {avtomatika-1.0b7 → avtomatika-1.0b8}/src/avtomatika/worker_config_loader.py +0 -0
  65. {avtomatika-1.0b7 → avtomatika-1.0b8}/src/avtomatika.egg-info/dependency_links.txt +0 -0
  66. {avtomatika-1.0b7 → avtomatika-1.0b8}/src/avtomatika.egg-info/top_level.txt +0 -0
  67. {avtomatika-1.0b7 → avtomatika-1.0b8}/tests/test_blueprint_conditions.py +0 -0
  68. {avtomatika-1.0b7 → avtomatika-1.0b8}/tests/test_blueprint_integrity.py +0 -0
  69. {avtomatika-1.0b7 → avtomatika-1.0b8}/tests/test_blueprints.py +0 -0
  70. {avtomatika-1.0b7 → avtomatika-1.0b8}/tests/test_client_config_loader.py +0 -0
  71. {avtomatika-1.0b7 → avtomatika-1.0b8}/tests/test_compression.py +0 -0
  72. {avtomatika-1.0b7 → avtomatika-1.0b8}/tests/test_config_validation.py +0 -0
  73. {avtomatika-1.0b7 → avtomatika-1.0b8}/tests/test_context.py +0 -0
  74. {avtomatika-1.0b7 → avtomatika-1.0b8}/tests/test_error_handling.py +0 -0
  75. {avtomatika-1.0b7 → avtomatika-1.0b8}/tests/test_health_checker.py +0 -0
  76. {avtomatika-1.0b7 → avtomatika-1.0b8}/tests/test_integration.py +0 -0
  77. {avtomatika-1.0b7 → avtomatika-1.0b8}/tests/test_logging_config.py +0 -0
  78. {avtomatika-1.0b7 → avtomatika-1.0b8}/tests/test_memory_locking.py +0 -0
  79. {avtomatika-1.0b7 → avtomatika-1.0b8}/tests/test_memory_storage.py +0 -0
  80. {avtomatika-1.0b7 → avtomatika-1.0b8}/tests/test_metrics.py +0 -0
  81. {avtomatika-1.0b7 → avtomatika-1.0b8}/tests/test_noop_history.py +0 -0
  82. {avtomatika-1.0b7 → avtomatika-1.0b8}/tests/test_optimization.py +0 -0
  83. {avtomatika-1.0b7 → avtomatika-1.0b8}/tests/test_ratelimit.py +0 -0
  84. {avtomatika-1.0b7 → avtomatika-1.0b8}/tests/test_redis_locking.py +0 -0
  85. {avtomatika-1.0b7 → avtomatika-1.0b8}/tests/test_redis_storage.py +0 -0
  86. {avtomatika-1.0b7 → avtomatika-1.0b8}/tests/test_scheduler.py +0 -0
  87. {avtomatika-1.0b7 → avtomatika-1.0b8}/tests/test_telemetry.py +0 -0
  88. {avtomatika-1.0b7 → avtomatika-1.0b8}/tests/test_watcher.py +0 -0
  89. {avtomatika-1.0b7 → avtomatika-1.0b8}/tests/test_worker_config_loader.py +0 -0
  90. {avtomatika-1.0b7 → avtomatika-1.0b8}/tests/test_ws_manager.py +0 -0

{avtomatika-1.0b7 → avtomatika-1.0b8}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: avtomatika
- Version: 1.0b7
+ Version: 1.0b8
  Summary: A state-machine based orchestrator for long-running AI and other jobs.
  Project-URL: Homepage, https://github.com/avtomatika-ai/avtomatika
  Project-URL: Bug Tracker, https://github.com/avtomatika-ai/avtomatika/issues
@@ -20,6 +20,9 @@ Requires-Dist: msgpack~=1.1
  Requires-Dist: orjson~=3.11
  Provides-Extra: redis
  Requires-Dist: redis~=7.1; extra == "redis"
+ Provides-Extra: s3
+ Requires-Dist: obstore>=0.2; extra == "s3"
+ Requires-Dist: aiofiles~=23.2; extra == "s3"
  Provides-Extra: history
  Requires-Dist: aiosqlite~=0.22; extra == "history"
  Requires-Dist: asyncpg~=0.30; extra == "history"
@@ -37,10 +40,13 @@ Requires-Dist: pytest-mock~=3.14; extra == "test"
  Requires-Dist: aioresponses~=0.7; extra == "test"
  Requires-Dist: backports.zstd~=1.2; extra == "test"
  Requires-Dist: opentelemetry-instrumentation-aiohttp-client; extra == "test"
+ Requires-Dist: obstore>=0.2; extra == "test"
+ Requires-Dist: aiofiles~=23.2; extra == "test"
  Provides-Extra: all
  Requires-Dist: avtomatika[redis]; extra == "all"
  Requires-Dist: avtomatika[history]; extra == "all"
  Requires-Dist: avtomatika[telemetry]; extra == "all"
+ Requires-Dist: avtomatika[s3]; extra == "all"
  Dynamic: license-file

  # Avtomatika Orchestrator
@@ -60,6 +66,7 @@ This document serves as a comprehensive guide for developers looking to build pi
  - [Parallel Execution and Aggregation (Fan-out/Fan-in)](#parallel-execution-and-aggregation-fan-outfan-in)
  - [Dependency Injection (DataStore)](#dependency-injection-datastore)
  - [Native Scheduler](#native-scheduler)
+ - [S3 Payload Offloading](#s3-payload-offloading)
  - [Webhook Notifications](#webhook-notifications)
  - [Production Configuration](#production-configuration)
  - [Fault Tolerance](#fault-tolerance)
@@ -107,6 +114,11 @@ Avtomatika is part of a larger ecosystem:
  pip install "avtomatika[telemetry]"
  ```

+ * **Install with S3 support (Payload Offloading):**
+ ```bash
+ pip install "avtomatika[s3]"
+ ```
+
  * **Install all dependencies, including for testing:**
  ```bash
  pip install "avtomatika[all,test]"
@@ -250,6 +262,19 @@ async def publish_handler_old_style(context):
  print(f"Job {context.job_id}: Publishing video at {output_path} ({duration}s).")
  context.actions.transition_to("complete")
  ```
+ ## Key Concepts: JobContext and Actions
+
+ ### High Performance Architecture
+
+ Avtomatika is engineered for high-load environments with thousands of concurrent workers.
+
+ * **O(1) Dispatcher**: Uses advanced Redis Set intersections to find suitable workers instantly, regardless of the cluster size. No O(N) scanning.
+ * **Non-Blocking I/O**:
+   * **Webhooks**: Sent via a bounded background queue to prevent backpressure.
+   * **History Logging**: Writes to SQL databases are buffered and asynchronous, ensuring the main execution loop never blocks.
+   * **Redis Streams**: Uses blocking reads to eliminate busy-waiting and reduce CPU usage.
+ * **Memory Safety**: S3 file transfers use streaming to handle multi-gigabyte files with constant, low RAM usage.
+
  ## Blueprint Cookbook: Key Features

  ### 1. Conditional Transitions (`.when()`)
@@ -365,7 +390,30 @@ daily_at = "02:00"

  The orchestrator can send asynchronous notifications to an external system when a job completes, fails, or is quarantined. This eliminates the need for clients to constantly poll the API for status updates.

- * **Usage:** Pass a `webhook_url` in the request body when creating a job.
+ ### 7. S3 Payload Offloading
+
+ Orchestrator provides first-class support for handling large files via S3-compatible storage, powered by the high-performance `obstore` library (Rust bindings).
+
+ * **Memory Safe (Streaming)**: Uses streaming for uploads and downloads, allowing processing of files larger than available RAM without OOM errors.
+ * **Managed Mode**: The Orchestrator manages file lifecycle (automatic cleanup of S3 objects and local temporary files on job completion).
+ * **Dependency Injection**: Use the `task_files` argument in your handlers to easily read/write data.
+ * **Directory Support**: Supports recursive download and upload of entire directories.
+
+ ```python
+ @bp.handler_for("process_data")
+ async def process_data(task_files, actions):
+     # Streaming download of a large file
+     local_path = await task_files.download("large_dataset.csv")
+
+     # ... process data ...
+
+     # Upload results
+     await task_files.write_json("results.json", {"status": "done"})
+
+     actions.transition_to("finished")
+ ```
+
+ ## Production Configuration
  * **Events:**
    * `job_finished`: The job reached a final success state.
    * `job_failed`: The job failed (e.g., due to an error or invalid input).

avtomatika-1.0b7/src/avtomatika.egg-info/PKG-INFO → avtomatika-1.0b8/README.md

@@ -1,48 +1,3 @@
- Metadata-Version: 2.4
- Name: avtomatika
- Version: 1.0b7
- Summary: A state-machine based orchestrator for long-running AI and other jobs.
- Project-URL: Homepage, https://github.com/avtomatika-ai/avtomatika
- Project-URL: Bug Tracker, https://github.com/avtomatika-ai/avtomatika/issues
- Classifier: Development Status :: 4 - Beta
- Classifier: Programming Language :: Python :: 3
- Classifier: License :: OSI Approved :: MIT License
- Classifier: Operating System :: OS Independent
- Requires-Python: >=3.11
- Description-Content-Type: text/markdown
- License-File: LICENSE
- Requires-Dist: aiohttp~=3.12
- Requires-Dist: python-json-logger~=4.0
- Requires-Dist: graphviz~=0.21
- Requires-Dist: zstandard~=0.24
- Requires-Dist: aioprometheus~=23.12
- Requires-Dist: msgpack~=1.1
- Requires-Dist: orjson~=3.11
- Provides-Extra: redis
- Requires-Dist: redis~=7.1; extra == "redis"
- Provides-Extra: history
- Requires-Dist: aiosqlite~=0.22; extra == "history"
- Requires-Dist: asyncpg~=0.30; extra == "history"
- Provides-Extra: telemetry
- Requires-Dist: opentelemetry-api~=1.39; extra == "telemetry"
- Requires-Dist: opentelemetry-sdk~=1.39; extra == "telemetry"
- Requires-Dist: opentelemetry-exporter-otlp~=1.39; extra == "telemetry"
- Requires-Dist: opentelemetry-instrumentation-aiohttp-client~=0.59b0; extra == "telemetry"
- Provides-Extra: test
- Requires-Dist: pytest~=9.0; extra == "test"
- Requires-Dist: pytest-asyncio~=1.1; extra == "test"
- Requires-Dist: fakeredis~=2.33; extra == "test"
- Requires-Dist: pytest-aiohttp~=1.1; extra == "test"
- Requires-Dist: pytest-mock~=3.14; extra == "test"
- Requires-Dist: aioresponses~=0.7; extra == "test"
- Requires-Dist: backports.zstd~=1.2; extra == "test"
- Requires-Dist: opentelemetry-instrumentation-aiohttp-client; extra == "test"
- Provides-Extra: all
- Requires-Dist: avtomatika[redis]; extra == "all"
- Requires-Dist: avtomatika[history]; extra == "all"
- Requires-Dist: avtomatika[telemetry]; extra == "all"
- Dynamic: license-file
-
  # Avtomatika Orchestrator

  Avtomatika is a powerful, state-driven engine for managing complex asynchronous workflows in Python. It provides a robust framework for building scalable and resilient applications by separating process logic from execution logic.

The intermediate hunks of this file (@@ -60,6 +15,7 @@, @@ -107,6 +63,11 @@, @@ -250,6 +211,19 @@ and @@ -365,7 +339,30 @@) apply exactly the same README content changes already shown in the PKG-INFO diff above (the S3 Payload Offloading TOC entry, install instructions, High Performance Architecture section, and S3 section), only at shifted line offsets. The file then ends with:

@@ -533,4 +530,4 @@ For a deeper dive into the system, please refer to the following documents:
  - [**Architecture Guide**](https://github.com/avtomatika-ai/avtomatika/blob/main/docs/architecture.md): A detailed overview of the system components and their interactions.
  - [**API Reference**](https://github.com/avtomatika-ai/avtomatika/blob/main/docs/api_reference.md): Full specification of the HTTP API.
  - [**Deployment Guide**](https://github.com/avtomatika-ai/avtomatika/blob/main/docs/deployment.md): Instructions for deploying with Gunicorn/Uvicorn and NGINX.
- - [**Cookbook**](https://github.com/avtomatika-ai/avtomatika/blob/main/docs/cookbook/README.md): Examples and best practices for creating blueprints.
+ - [**Cookbook**](https://github.com/avtomatika-ai/avtomatika/blob/main/docs/cookbook/README.md): Examples and best practices for creating blueprints.

(The final line is removed and re-added unchanged, which in this viewer typically indicates a trailing-newline fix.)

{avtomatika-1.0b7 → avtomatika-1.0b8}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "avtomatika"
- version = "1.0b7"
+ version = "1.0b8"
  description = "A state-machine based orchestrator for long-running AI and other jobs."
  readme = "README.md"
  requires-python = ">=3.11"
@@ -26,6 +26,7 @@ dependencies = [

  [project.optional-dependencies]
  redis = ["redis~=7.1"]
+ s3 = ["obstore>=0.2", "aiofiles~=23.2"]
  history = ["aiosqlite~=0.22", "asyncpg~=0.30"]
  telemetry = [
      "opentelemetry-api~=1.39",
@@ -42,11 +43,14 @@ test = [
      "aioresponses~=0.7",
      "backports.zstd~=1.2",
      "opentelemetry-instrumentation-aiohttp-client",
+     "obstore>=0.2",
+     "aiofiles~=23.2",
  ]
  all = [
      "avtomatika[redis]",
      "avtomatika[history]",
      "avtomatika[telemetry]",
+     "avtomatika[s3]",
  ]

  [project.urls]

{avtomatika-1.0b7 → avtomatika-1.0b8}/src/avtomatika/app_keys.py

@@ -30,3 +30,4 @@ WATCHER_TASK_KEY = AppKey("watcher_task", Task)
  REPUTATION_CALCULATOR_TASK_KEY = AppKey("reputation_calculator_task", Task)
  HEALTH_CHECKER_TASK_KEY = AppKey("health_checker_task", Task)
  SCHEDULER_TASK_KEY = AppKey("scheduler_task", Task)
+ S3_SERVICE_KEY = AppKey("s3_service", "S3Service")

{avtomatika-1.0b7 → avtomatika-1.0b8}/src/avtomatika/config.py

@@ -39,6 +39,7 @@ class Config:

          # Worker settings
          self.WORKER_TIMEOUT_SECONDS: int = int(getenv("WORKER_TIMEOUT_SECONDS", 300))
+         self.TASK_FILES_DIR: str = getenv("TASK_FILES_DIR", "/tmp/avtomatika-payloads")
          self.WORKER_POLL_TIMEOUT_SECONDS: int = int(
              getenv("WORKER_POLL_TIMEOUT_SECONDS", 30),
          )
@@ -52,10 +53,19 @@ class Config:
          self.EXECUTOR_MAX_CONCURRENT_JOBS: int = int(
              getenv("EXECUTOR_MAX_CONCURRENT_JOBS", 100),
          )
+         self.REDIS_STREAM_BLOCK_MS: int = int(getenv("REDIS_STREAM_BLOCK_MS", 5000))

          # History storage settings
          self.HISTORY_DATABASE_URI: str = getenv("HISTORY_DATABASE_URI", "")

+         # S3 settings
+         self.S3_ENDPOINT_URL: str = getenv("S3_ENDPOINT_URL", "")
+         self.S3_ACCESS_KEY: str = getenv("S3_ACCESS_KEY", "")
+         self.S3_SECRET_KEY: str = getenv("S3_SECRET_KEY", "")
+         self.S3_REGION: str = getenv("S3_REGION", "us-east-1")
+         self.S3_DEFAULT_BUCKET: str = getenv("S3_DEFAULT_BUCKET", "avtomatika-payloads")
+         self.S3_MAX_CONCURRENCY: int = int(getenv("S3_MAX_CONCURRENCY", 100))
+
          # Rate limiting settings
          self.RATE_LIMITING_ENABLED: bool = getenv("RATE_LIMITING_ENABLED", "true").lower() == "true"
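
This hunk introduces seven new settings: a local staging directory for task files, the S3 connection parameters, and the Redis Stream blocking interval used by the executor. A minimal sketch of driving them through the environment, assuming `Config` can be instantiated directly as the diff suggests (the endpoint and credential values below are placeholders for a local MinIO, not part of the package):

```python
# Illustrative only: exercising the new 1.0b8 settings. The variable names come
# from the diff above; the endpoint/credential values are placeholders.
import os

os.environ.update({
    "S3_ENDPOINT_URL": "http://localhost:9000",
    "S3_ACCESS_KEY": "minioadmin",
    "S3_SECRET_KEY": "minioadmin",
    "S3_DEFAULT_BUCKET": "avtomatika-payloads",
    "REDIS_STREAM_BLOCK_MS": "5000",  # "0" falls back to the 0.1 s polling loop
})

from avtomatika.config import Config

config = Config()
assert config.S3_REGION == "us-east-1"    # default when S3_REGION is unset
assert config.S3_MAX_CONCURRENCY == 100   # default when unset
```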

{avtomatika-1.0b7 → avtomatika-1.0b8}/src/avtomatika/data_types.py

@@ -21,10 +21,11 @@ class JobContext(NamedTuple):
      state_history: dict[str, Any]
      client: ClientConfig
      actions: "ActionFactory"
-     data_stores: dict[str, Any] | None = None
+     data_stores: Any | None = None
      tracing_context: dict[str, Any] | None = None
      aggregation_results: dict[str, Any] | None = None
      webhook_url: str | None = None
+     task_files: Any | None = None


  class GPUInfo(NamedTuple):

{avtomatika-1.0b7 → avtomatika-1.0b8}/src/avtomatika/dispatcher.py

@@ -137,32 +137,17 @@ class Dispatcher:
          dispatch_strategy = task_info.get("dispatch_strategy", "default")
          resource_requirements = task_info.get("resource_requirements")

-         all_workers = await self.storage.get_available_workers()
-         logger.info(f"Found {len(all_workers)} available workers")
-         if not all_workers:
-             raise RuntimeError("No available workers")
-
-         # A worker is considered available if its status is 'idle' or not specified (for backward compatibility)
-         logger.debug(f"All available workers: {[w['worker_id'] for w in all_workers]}")
-         idle_workers = [w for w in all_workers if w.get("status", "idle") == "idle"]
-         logger.debug(f"Idle workers: {[w['worker_id'] for w in idle_workers]}")
-         if not idle_workers:
-             if busy_mo_workers := [
-                 w for w in all_workers if w.get("status") == "busy" and "multi_orchestrator_info" in w
-             ]:
-                 logger.warning(
-                     f"No idle workers. Found {len(busy_mo_workers)} busy workers "
-                     f"in multi-orchestrator mode. They are likely performing tasks for other Orchestrators.",
-                 )
-             raise RuntimeError("No idle workers (all are 'busy')")
+         candidate_ids = await self.storage.find_workers_for_task(task_type)
+         if not candidate_ids:
+             logger.warning(f"No idle workers found for task '{task_type}'")
+             raise RuntimeError(f"No suitable workers for task type '{task_type}'")
+
+         capable_workers = await self.storage.get_workers(candidate_ids)
+         logger.debug(f"Found {len(capable_workers)} capable workers for task '{task_type}'")

-         # Filter by task type
-         capable_workers = [w for w in idle_workers if task_type in w.get("supported_tasks", [])]
-         logger.debug(f"Capable workers for task '{task_type}': {[w['worker_id'] for w in capable_workers]}")
          if not capable_workers:
-             raise RuntimeError(f"No suitable workers for task type '{task_type}'")
+             raise RuntimeError(f"No suitable workers for task type '{task_type}' (data missing)")

-         # Filter by resource requirements
          if resource_requirements:
              compliant_workers = [w for w in capable_workers if self._is_worker_compliant(w, resource_requirements)]
              logger.debug(
@@ -175,7 +160,6 @@ class Dispatcher:
              )
              capable_workers = compliant_workers

-         # Filter by maximum cost
          max_cost = task_info.get("max_cost")
          if max_cost is not None:
              cost_compliant_workers = [w for w in capable_workers if w.get("cost_per_second", float("inf")) <= max_cost]
@@ -188,7 +172,6 @@ class Dispatcher:
              )
              capable_workers = cost_compliant_workers

-         # Select worker according to strategy
          if dispatch_strategy == "round_robin":
              selected_worker = self._select_round_robin(capable_workers, task_type)
          elif dispatch_strategy == "least_connections":
@@ -205,7 +188,6 @@ class Dispatcher:
              f"Dispatching task '{task_type}' to worker {worker_id} (strategy: {dispatch_strategy})",
          )

-         # --- Task creation and enqueuing ---
          task_id = task_info.get("task_id") or str(uuid4())
          payload = {
              "job_id": job_id,

{avtomatika-1.0b7 → avtomatika-1.0b8}/src/avtomatika/engine.py

@@ -19,6 +19,7 @@ from .app_keys import (
      HTTP_SESSION_KEY,
      REPUTATION_CALCULATOR_KEY,
      REPUTATION_CALCULATOR_TASK_KEY,
+     S3_SERVICE_KEY,
      SCHEDULER_KEY,
      SCHEDULER_TASK_KEY,
      WATCHER_KEY,
@@ -37,6 +38,7 @@ from .history.base import HistoryStorageBase
  from .history.noop import NoOpHistoryStorage
  from .logging_config import setup_logging
  from .reputation import ReputationCalculator
+ from .s3 import S3Service
  from .scheduler import Scheduler
  from .storage.base import StorageBackend
  from .telemetry import setup_telemetry
@@ -141,6 +143,11 @@ class OrchestratorEngine:
          self.history_storage = NoOpHistoryStorage()

      async def on_startup(self, app: web.Application) -> None:
+         # 1. Fail Fast: Check Storage Connection
+         if not await self.storage.ping():
+             logger.critical("Failed to connect to Storage Backend (Redis). Exiting.")
+             raise RuntimeError("Storage Backend is unavailable.")
+
          try:
              from opentelemetry.instrumentation.aiohttp_client import (
                  AioHttpClientInstrumentor,
@@ -152,6 +159,8 @@ class OrchestratorEngine:
                  "opentelemetry-instrumentation-aiohttp-client not found. AIOHTTP client instrumentation is disabled."
              )
          await self._setup_history_storage()
+         # Start history background worker
+         await self.history_storage.start()

          # Load client configs if the path is provided
          if self.config.CLIENTS_CONFIG_PATH:
@@ -188,6 +197,7 @@ class OrchestratorEngine:

          app[HTTP_SESSION_KEY] = ClientSession()
          self.webhook_sender = WebhookSender(app[HTTP_SESSION_KEY])
+         self.webhook_sender.start()
          self.dispatcher = Dispatcher(self.storage, self.config)
          app[DISPATCHER_KEY] = self.dispatcher
          app[EXECUTOR_KEY] = JobExecutor(self, self.history_storage)
@@ -196,6 +206,7 @@ class OrchestratorEngine:
          app[HEALTH_CHECKER_KEY] = HealthChecker(self)
          app[SCHEDULER_KEY] = Scheduler(self)
          app[WS_MANAGER_KEY] = self.ws_manager
+         app[S3_SERVICE_KEY] = S3Service(self.config, self.history_storage)

          app[EXECUTOR_TASK_KEY] = create_task(app[EXECUTOR_KEY].run())
          app[WATCHER_TASK_KEY] = create_task(app[WATCHER_KEY].run())
@@ -220,6 +231,13 @@ class OrchestratorEngine:
          logger.info("Closing WebSocket connections...")
          await self.ws_manager.close_all()

+         logger.info("Stopping WebhookSender...")
+         await self.webhook_sender.stop()
+
+         if S3_SERVICE_KEY in app:
+             logger.info("Closing S3 Service...")
+             await app[S3_SERVICE_KEY].close()
+
          logger.info("Cancelling background tasks...")
          app[HEALTH_CHECKER_TASK_KEY].cancel()
          app[WATCHER_TASK_KEY].cancel()
@@ -352,7 +370,7 @@ class OrchestratorEngine:
          )

          # Run in background to not block the main flow
-         create_task(self.webhook_sender.send(webhook_url, payload))
+         await self.webhook_sender.send(webhook_url, payload)

      def run(self) -> None:
          self.setup()
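
Two lifecycle changes stand out here: the fail-fast `storage.ping()` on startup, and the webhook path switching from fire-and-forget `create_task()` to an awaited `send()` bracketed by explicit `start()`/`stop()`. The latter matches the README's "bounded background queue" description; a sketch of that pattern, under the assumption that `send()` now only enqueues (the actual implementation is in `src/avtomatika/utils/webhook_sender.py`, changed +44 -2 in this release, and may differ in detail):

```python
# Sketch of a bounded background webhook queue; names and sizes are illustrative.
from asyncio import Queue, Task, create_task

from aiohttp import ClientSession

class WebhookSenderSketch:
    def __init__(self, session: ClientSession, maxsize: int = 1000) -> None:
        self._session = session
        self._queue: Queue[tuple[str, dict]] = Queue(maxsize=maxsize)
        self._task: Task | None = None

    def start(self) -> None:
        self._task = create_task(self._drain())

    async def send(self, url: str, payload: dict) -> None:
        # Cheap to await: blocks only when the queue is full (backpressure),
        # so callers no longer need fire-and-forget create_task() wrappers.
        await self._queue.put((url, payload))

    async def _drain(self) -> None:
        while True:
            url, payload = await self._queue.get()
            try:
                async with self._session.post(url, json=payload) as resp:
                    resp.raise_for_status()
            except Exception:
                pass  # a real sender would log and/or retry here
            finally:
                self._queue.task_done()

    async def stop(self) -> None:
        await self._queue.join()  # flush pending webhooks before shutdown
        if self._task:
            self._task.cancel()
```

With this shape, `stop()` can drain outstanding notifications before teardown, which is consistent with `on_shutdown` now awaiting it before cancelling the background tasks.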

{avtomatika-1.0b7 → avtomatika-1.0b8}/src/avtomatika/executor.py

@@ -47,6 +47,7 @@ except ImportError:
      inject = NoOpPropagate().inject
      TraceContextTextMapPropagator = NoOpTraceContextTextMapPropagator()  # Instantiate the class

+ from .app_keys import S3_SERVICE_KEY
  from .context import ActionFactory
  from .data_types import ClientConfig, JobContext
  from .history.base import HistoryStorageBase
@@ -74,7 +75,7 @@ class JobExecutor:
          self._running = False
          self._processing_messages: set[str] = set()

-     async def _process_job(self, job_id: str, message_id: str):
+     async def _process_job(self, job_id: str, message_id: str) -> None:
          """The core logic for processing a single job dequeued from storage."""
          if message_id in self._processing_messages:
              return
@@ -143,6 +144,11 @@ class JobExecutor:
              plan=client_config_dict.get("plan", "unknown"),
              params=client_config_dict.get("params", {}),
          )
+
+         # Get TaskFiles if S3 service is available
+         s3_service = self.engine.app.get(S3_SERVICE_KEY)
+         task_files = s3_service.get_task_files(job_id) if s3_service else None
+
          context = JobContext(
              job_id=job_id,
              current_state=job_state["current_state"],
@@ -153,6 +159,7 @@ class JobExecutor:
              data_stores=SimpleNamespace(**blueprint.data_stores),
              tracing_context=tracing_context,
              aggregation_results=job_state.get("aggregation_results"),
+             task_files=task_files,
          )

          try:
@@ -173,12 +180,17 @@ class JobExecutor:
                  params_to_inject["context"] = context
              if "actions" in param_names:
                  params_to_inject["actions"] = action_factory
+             if "task_files" in param_names:
+                 params_to_inject["task_files"] = task_files
          else:
              # New injection logic with prioritized lookup.
              context_as_dict = context._asdict()
              for param_name in param_names:
+                 # Direct injection of task_files
+                 if param_name == "task_files":
+                     params_to_inject[param_name] = task_files
                  # Look in JobContext fields first.
-                 if param_name in context_as_dict:
+                 elif param_name in context_as_dict:
                      params_to_inject[param_name] = context_as_dict[param_name]
                  # Then look in state_history (data from previous steps/workers).
                  elif param_name in context.state_history:
@@ -258,6 +270,15 @@ class JobExecutor:
              await self.storage.enqueue_job(job_id)
          else:
              logger.info(f"Job {job_id} reached terminal state {next_state}")
+
+             # Clean up S3 files if service is available
+             s3_service = self.engine.app.get(S3_SERVICE_KEY)
+             if s3_service:
+                 task_files = s3_service.get_task_files(job_id)
+                 if task_files:
+                     # Run cleanup in background to not block response
+                     create_task(task_files.cleanup())
+
              await self._check_and_resume_parent(job_state)
              # Send webhook for finished/failed jobs
              event_type = "job_finished" if next_state == "finished" else "job_failed"
@@ -522,7 +543,10 @@ class JobExecutor:
                  # Wait for an available slot before fetching a new job
                  await semaphore.acquire()

-                 result = await self.storage.dequeue_job()
+                 # Block for a configured time waiting for a job
+                 block_time = self.engine.config.REDIS_STREAM_BLOCK_MS
+                 result = await self.storage.dequeue_job(block=block_time if block_time > 0 else None)
+
                  if result:
                      job_id, message_id = result
                      task = create_task(self._process_job(job_id, message_id))
@@ -530,14 +554,18 @@ class JobExecutor:
                      # Release the semaphore slot when the task is done
                      task.add_done_callback(lambda _: semaphore.release())
                  else:
-                     # No job found, release the slot and wait a bit
+                     # Timeout reached, release slot and loop again
                      semaphore.release()
-                     # Prevent busy loop if storage returns None immediately
-                     await sleep(0.1)
+                     # Prevent busy loop if blocking is disabled (e.g. in tests) or failed
+                     if block_time <= 0:
+                         await sleep(0.1)
+
              except CancelledError:
                  break
              except Exception:
                  logger.exception("Error in JobExecutor main loop.")
+                 # If an error occurred (e.g. Redis connection lost), sleep briefly to avoid log spam
+                 semaphore.release()
                  await sleep(1)
          logger.info("JobExecutor stopped.")
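
The main-loop change threads `REDIS_STREAM_BLOCK_MS` into `dequeue_job(block=...)`, so the 0.1 s polling sleep survives only as a fallback when blocking is disabled. A sketch of what a blocking dequeue over a Redis Stream consumer group typically looks like (group, consumer, and stream names are invented for illustration; the real code is in the rewritten `src/avtomatika/storage/redis.py`):

```python
# Illustrative sketch of a blocking dequeue over a Redis Stream consumer group.
from redis.asyncio import Redis

async def dequeue_job(redis: Redis, block: int | None = None) -> tuple[str, str] | None:
    # With block set (in milliseconds), XREADGROUP parks the connection
    # server-side until a message arrives or the timeout elapses, so no
    # client-side polling is needed.
    resp = await redis.xreadgroup(
        "executors",           # consumer group (hypothetical name)
        "executor-1",          # consumer name (hypothetical)
        {"jobs:stream": ">"},  # read only new, undelivered messages
        count=1,
        block=block,
    )
    if not resp:
        return None  # timed out; the caller releases its semaphore slot and loops
    _stream, messages = resp[0]
    message_id, fields = messages[0]
    return fields[b"job_id"].decode(), message_id.decode()
```

Also worth noting: the new `semaphore.release()` in the generic `except Exception` branch fixes a slot leak; previously, an error raised between `acquire()` and task creation would have permanently consumed a concurrency slot.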

avtomatika-1.0b8/src/avtomatika/health_checker.py (new file)

@@ -0,0 +1,57 @@
+ """This module previously contained an active HealthChecker.
+ In the new architecture with heartbeat messages from workers,
+ the orchestrator no longer needs to actively poll workers.
+
+ Redis automatically deletes worker keys when their TTL expires,
+ and `storage.get_available_workers()` only retrieves active keys.
+
+ This file is left as a placeholder in case passive health-check
+ logic is needed in the future (e.g., for logging expired workers).
+ """
+
+ from asyncio import CancelledError, sleep
+ from logging import getLogger
+ from typing import TYPE_CHECKING
+
+ if TYPE_CHECKING:
+     from .engine import OrchestratorEngine
+
+ logger = getLogger(__name__)
+
+
+ class HealthChecker:
+     def __init__(self, engine: "OrchestratorEngine", interval_seconds: int = 600):
+         self.engine = engine
+         self.storage = engine.storage
+         self.interval_seconds = interval_seconds
+         self._running = False
+         from uuid import uuid4
+
+         self._instance_id = str(uuid4())
+
+     async def run(self):
+         logger.info(f"HealthChecker started (Active Index Cleanup, Instance ID: {self._instance_id}).")
+         self._running = True
+         while self._running:
+             try:
+                 # Use distributed lock to ensure only one instance cleans up
+                 if await self.storage.acquire_lock(
+                     "global_health_check_lock", self._instance_id, self.interval_seconds - 5
+                 ):
+                     try:
+                         await self.storage.cleanup_expired_workers()
+                     finally:
+                         # We don't release the lock immediately to prevent other instances from
+                         # running the same task if the interval is small.
+                         pass
+
+                 await sleep(self.interval_seconds)
+             except CancelledError:
+                 break
+             except Exception:
+                 logger.exception("Error in HealthChecker main loop.")
+                 await sleep(60)
+         logger.info("HealthChecker stopped.")
+
+     def stop(self):
+         self._running = False
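
The cleanup above relies on a storage-level `acquire_lock(name, owner, ttl)` primitive so that only one orchestrator instance runs the index sweep per interval. A common way to implement such a lock is the Redis `SET NX EX` pattern, sketched here as an assumption (the actual implementation belongs to the storage backend, e.g. the rewritten `storage/redis.py`):

```python
# Sketch: distributed lock via SET NX EX; the lock:* key prefix is illustrative.
from redis.asyncio import Redis

async def acquire_lock(redis: Redis, name: str, owner: str, ttl_seconds: int) -> bool:
    # NX makes the SET succeed for exactly one caller; EX auto-expires the lock
    # so a crashed holder cannot wedge it forever. Deliberately not releasing
    # after cleanup (as the HealthChecker above does) keeps the lock held for
    # the rest of the TTL window, preventing other instances from re-running
    # the same sweep within one interval.
    return bool(await redis.set(f"lock:{name}", owner, nx=True, ex=ttl_seconds))
```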