avtomatika 1.0b2__tar.gz → 1.0b3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {avtomatika-1.0b2/src/avtomatika.egg-info → avtomatika-1.0b3}/PKG-INFO +29 -11
- {avtomatika-1.0b2 → avtomatika-1.0b3}/README.md +19 -1
- {avtomatika-1.0b2 → avtomatika-1.0b3}/pyproject.toml +10 -10
- {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/__init__.py +2 -3
- {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/blueprint.py +5 -7
- {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/client_config_loader.py +18 -6
- {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/dispatcher.py +13 -19
- {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/engine.py +16 -16
- {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/executor.py +6 -3
- {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/ratelimit.py +3 -10
- {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/reputation.py +11 -2
- {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/storage/__init__.py +3 -3
- {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/storage/base.py +23 -0
- {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/storage/memory.py +34 -8
- {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/storage/redis.py +37 -20
- {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/telemetry.py +3 -3
- avtomatika-1.0b3/src/avtomatika/watcher.py +82 -0
- {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/worker_config_loader.py +7 -2
- {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/ws_manager.py +2 -1
- {avtomatika-1.0b2 → avtomatika-1.0b3/src/avtomatika.egg-info}/PKG-INFO +29 -11
- {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika.egg-info/SOURCES.txt +4 -0
- {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika.egg-info/requires.txt +8 -8
- {avtomatika-1.0b2 → avtomatika-1.0b3}/tests/test_client_config_loader.py +7 -6
- avtomatika-1.0b3/tests/test_config_validation.py +60 -0
- avtomatika-1.0b3/tests/test_dispatcher_extended.py +95 -0
- avtomatika-1.0b3/tests/test_memory_locking.py +44 -0
- avtomatika-1.0b3/tests/test_redis_locking.py +45 -0
- {avtomatika-1.0b2 → avtomatika-1.0b3}/tests/test_watcher.py +3 -0
- {avtomatika-1.0b2 → avtomatika-1.0b3}/tests/test_worker_config_loader.py +7 -4
- avtomatika-1.0b2/src/avtomatika/watcher.py +0 -68
- {avtomatika-1.0b2 → avtomatika-1.0b3}/LICENSE +0 -0
- {avtomatika-1.0b2 → avtomatika-1.0b3}/setup.cfg +0 -0
- {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/api.html +0 -0
- {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/compression.py +0 -0
- {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/config.py +0 -0
- {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/context.py +0 -0
- {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/data_types.py +0 -0
- {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/datastore.py +0 -0
- {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/health_checker.py +0 -0
- {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/history/base.py +0 -0
- {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/history/noop.py +0 -0
- {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/history/postgres.py +0 -0
- {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/history/sqlite.py +0 -0
- {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/logging_config.py +0 -0
- {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/metrics.py +0 -0
- {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/py.typed +0 -0
- {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/quota.py +0 -0
- {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika/security.py +0 -0
- {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika.egg-info/dependency_links.txt +0 -0
- {avtomatika-1.0b2 → avtomatika-1.0b3}/src/avtomatika.egg-info/top_level.txt +0 -0
- {avtomatika-1.0b2 → avtomatika-1.0b3}/tests/test_blueprint_conditions.py +0 -0
- {avtomatika-1.0b2 → avtomatika-1.0b3}/tests/test_blueprints.py +0 -0
- {avtomatika-1.0b2 → avtomatika-1.0b3}/tests/test_compression.py +0 -0
- {avtomatika-1.0b2 → avtomatika-1.0b3}/tests/test_context.py +0 -0
- {avtomatika-1.0b2 → avtomatika-1.0b3}/tests/test_dispatcher.py +0 -0
- {avtomatika-1.0b2 → avtomatika-1.0b3}/tests/test_engine.py +0 -0
- {avtomatika-1.0b2 → avtomatika-1.0b3}/tests/test_error_handling.py +0 -0
- {avtomatika-1.0b2 → avtomatika-1.0b3}/tests/test_executor.py +0 -0
- {avtomatika-1.0b2 → avtomatika-1.0b3}/tests/test_health_checker.py +0 -0
- {avtomatika-1.0b2 → avtomatika-1.0b3}/tests/test_history.py +0 -0
- {avtomatika-1.0b2 → avtomatika-1.0b3}/tests/test_integration.py +0 -0
- {avtomatika-1.0b2 → avtomatika-1.0b3}/tests/test_logging_config.py +0 -0
- {avtomatika-1.0b2 → avtomatika-1.0b3}/tests/test_memory_storage.py +0 -0
- {avtomatika-1.0b2 → avtomatika-1.0b3}/tests/test_metrics.py +0 -0
- {avtomatika-1.0b2 → avtomatika-1.0b3}/tests/test_noop_history.py +0 -0
- {avtomatika-1.0b2 → avtomatika-1.0b3}/tests/test_postgres_history.py +0 -0
- {avtomatika-1.0b2 → avtomatika-1.0b3}/tests/test_ratelimit.py +0 -0
- {avtomatika-1.0b2 → avtomatika-1.0b3}/tests/test_redis_storage.py +0 -0
- {avtomatika-1.0b2 → avtomatika-1.0b3}/tests/test_reputation.py +0 -0
- {avtomatika-1.0b2 → avtomatika-1.0b3}/tests/test_telemetry.py +0 -0
- {avtomatika-1.0b2 → avtomatika-1.0b3}/tests/test_ws_manager.py +0 -0
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: avtomatika
|
|
3
|
-
Version: 1.
|
|
4
|
-
Summary: A state-machine based orchestrator for long-running jobs.
|
|
3
|
+
Version: 1.0b3
|
|
4
|
+
Summary: A state-machine based orchestrator for long-running AI and other jobs.
|
|
5
5
|
Project-URL: Homepage, https://github.com/avtomatika-ai/avtomatika
|
|
6
6
|
Project-URL: Bug Tracker, https://github.com/avtomatika-ai/avtomatika/issues
|
|
7
7
|
Classifier: Development Status :: 4 - Beta
|
|
@@ -18,25 +18,25 @@ Requires-Dist: graphviz~=0.21
|
|
|
18
18
|
Requires-Dist: zstandard~=0.24
|
|
19
19
|
Requires-Dist: aioprometheus~=23.12
|
|
20
20
|
Provides-Extra: redis
|
|
21
|
-
Requires-Dist: redis~=
|
|
21
|
+
Requires-Dist: redis~=7.1; extra == "redis"
|
|
22
22
|
Requires-Dist: orjson~=3.11; extra == "redis"
|
|
23
23
|
Provides-Extra: history
|
|
24
|
-
Requires-Dist: aiosqlite~=0.
|
|
24
|
+
Requires-Dist: aiosqlite~=0.22; extra == "history"
|
|
25
25
|
Requires-Dist: asyncpg~=0.30; extra == "history"
|
|
26
26
|
Requires-Dist: orjson~=3.11; extra == "history"
|
|
27
27
|
Provides-Extra: telemetry
|
|
28
|
-
Requires-Dist: opentelemetry-api~=1.
|
|
29
|
-
Requires-Dist: opentelemetry-sdk~=1.
|
|
30
|
-
Requires-Dist: opentelemetry-exporter-otlp~=1.
|
|
28
|
+
Requires-Dist: opentelemetry-api~=1.39; extra == "telemetry"
|
|
29
|
+
Requires-Dist: opentelemetry-sdk~=1.39; extra == "telemetry"
|
|
30
|
+
Requires-Dist: opentelemetry-exporter-otlp~=1.39; extra == "telemetry"
|
|
31
31
|
Requires-Dist: opentelemetry-instrumentation-aiohttp-client~=0.59b0; extra == "telemetry"
|
|
32
32
|
Provides-Extra: test
|
|
33
|
-
Requires-Dist: pytest~=
|
|
33
|
+
Requires-Dist: pytest~=9.0; extra == "test"
|
|
34
34
|
Requires-Dist: pytest-asyncio~=1.1; extra == "test"
|
|
35
|
-
Requires-Dist: fakeredis~=2.
|
|
35
|
+
Requires-Dist: fakeredis~=2.33; extra == "test"
|
|
36
36
|
Requires-Dist: pytest-aiohttp~=1.1; extra == "test"
|
|
37
37
|
Requires-Dist: pytest-mock~=3.14; extra == "test"
|
|
38
38
|
Requires-Dist: aioresponses~=0.7; extra == "test"
|
|
39
|
-
Requires-Dist: backports.zstd; extra == "test"
|
|
39
|
+
Requires-Dist: backports.zstd~=1.2; extra == "test"
|
|
40
40
|
Requires-Dist: opentelemetry-instrumentation-aiohttp-client; extra == "test"
|
|
41
41
|
Provides-Extra: all
|
|
42
42
|
Requires-Dist: avtomatika[redis]; extra == "all"
|
|
@@ -285,7 +285,7 @@ Run multiple tasks simultaneously and gather their results.
|
|
|
285
285
|
@my_blueprint.handler_for("process_files")
|
|
286
286
|
async def fan_out_handler(initial_data, actions):
|
|
287
287
|
tasks_to_dispatch = [
|
|
288
|
-
{"task_type": "file_analysis", "params": {"file": file}}
|
|
288
|
+
{"task_type": "file_analysis", "params": {"file": file}})
|
|
289
289
|
for file in initial_data.get("files", [])
|
|
290
290
|
]
|
|
291
291
|
# Use dispatch_parallel to send all tasks at once.
|
|
@@ -332,6 +332,8 @@ async def cache_handler(data_stores):
|
|
|
332
332
|
|
|
333
333
|
The orchestrator's behavior can be configured through environment variables. Additionally, any configuration parameter loaded from environment variables can be programmatically overridden in your application code after the `Config` object has been initialized. This provides flexibility for different deployment and testing scenarios.
|
|
334
334
|
|
|
335
|
+
**Important:** The system employs **strict validation** for configuration files (`clients.toml`, `workers.toml`) at startup. If a configuration file is invalid (e.g., malformed TOML, missing required fields), the application will **fail fast** and exit with an error, rather than starting in a partially broken state. This ensures the security and integrity of the deployment.
|
|
336
|
+
|
|
335
337
|
### Fault Tolerance
|
|
336
338
|
|
|
337
339
|
The orchestrator has built-in mechanisms for handling failures based on the `error.code` field in a worker's response.
|
|
@@ -340,6 +342,13 @@ The orchestrator has built-in mechanisms for handling failures based on the `err
|
|
|
340
342
|
* **PERMANENT_ERROR**: A permanent error (e.g., a corrupted file). The task will be immediately sent to quarantine for manual investigation.
|
|
341
343
|
* **INVALID_INPUT_ERROR**: An error in the input data. The entire pipeline (Job) will be immediately moved to the failed state.
|
|
342
344
|
|
|
345
|
+
### High Availability & Distributed Locking
|
|
346
|
+
|
|
347
|
+
The architecture supports horizontal scaling. Multiple Orchestrator instances can run behind a load balancer.
|
|
348
|
+
|
|
349
|
+
* **Stateless API:** The API is stateless; all state is persisted in Redis.
|
|
350
|
+
* **Distributed Locking:** Background processes (`Watcher`, `ReputationCalculator`) use distributed locks (via Redis `SET NX`) to coordinate and prevent race conditions when multiple instances are active.
|
|
351
|
+
|
|
343
352
|
### Storage Backend
|
|
344
353
|
|
|
345
354
|
By default, the engine uses in-memory storage. For production, you must configure persistent storage via environment variables.
|
|
@@ -408,3 +417,12 @@ To run the `avtomatika` test suite:
|
|
|
408
417
|
```bash
|
|
409
418
|
pytest avtomatika/tests/
|
|
410
419
|
```
|
|
420
|
+
|
|
421
|
+
## Detailed Documentation
|
|
422
|
+
|
|
423
|
+
For a deeper dive into the system, please refer to the following documents in the `docs/` directory:
|
|
424
|
+
|
|
425
|
+
- [**Architecture Guide**](docs/architecture.md): A detailed overview of the system components and their interactions.
|
|
426
|
+
- [**API Reference**](docs/api_reference.md): Full specification of the HTTP API.
|
|
427
|
+
- [**Deployment Guide**](docs/deployment.md): Instructions for deploying with Gunicorn/Uvicorn and NGINX.
|
|
428
|
+
- [**Cookbook**](docs/cookbook/README.md): Examples and best practices for creating blueprints.
|
|
@@ -239,7 +239,7 @@ Run multiple tasks simultaneously and gather their results.
|
|
|
239
239
|
@my_blueprint.handler_for("process_files")
|
|
240
240
|
async def fan_out_handler(initial_data, actions):
|
|
241
241
|
tasks_to_dispatch = [
|
|
242
|
-
{"task_type": "file_analysis", "params": {"file": file}}
|
|
242
|
+
{"task_type": "file_analysis", "params": {"file": file}})
|
|
243
243
|
for file in initial_data.get("files", [])
|
|
244
244
|
]
|
|
245
245
|
# Use dispatch_parallel to send all tasks at once.
|
|
@@ -286,6 +286,8 @@ async def cache_handler(data_stores):
|
|
|
286
286
|
|
|
287
287
|
The orchestrator's behavior can be configured through environment variables. Additionally, any configuration parameter loaded from environment variables can be programmatically overridden in your application code after the `Config` object has been initialized. This provides flexibility for different deployment and testing scenarios.
|
|
288
288
|
|
|
289
|
+
**Important:** The system employs **strict validation** for configuration files (`clients.toml`, `workers.toml`) at startup. If a configuration file is invalid (e.g., malformed TOML, missing required fields), the application will **fail fast** and exit with an error, rather than starting in a partially broken state. This ensures the security and integrity of the deployment.
|
|
290
|
+
|
|
289
291
|
### Fault Tolerance
|
|
290
292
|
|
|
291
293
|
The orchestrator has built-in mechanisms for handling failures based on the `error.code` field in a worker's response.
|
|
@@ -294,6 +296,13 @@ The orchestrator has built-in mechanisms for handling failures based on the `err
|
|
|
294
296
|
* **PERMANENT_ERROR**: A permanent error (e.g., a corrupted file). The task will be immediately sent to quarantine for manual investigation.
|
|
295
297
|
* **INVALID_INPUT_ERROR**: An error in the input data. The entire pipeline (Job) will be immediately moved to the failed state.
|
|
296
298
|
|
|
299
|
+
### High Availability & Distributed Locking
|
|
300
|
+
|
|
301
|
+
The architecture supports horizontal scaling. Multiple Orchestrator instances can run behind a load balancer.
|
|
302
|
+
|
|
303
|
+
* **Stateless API:** The API is stateless; all state is persisted in Redis.
|
|
304
|
+
* **Distributed Locking:** Background processes (`Watcher`, `ReputationCalculator`) use distributed locks (via Redis `SET NX`) to coordinate and prevent race conditions when multiple instances are active.
|
|
305
|
+
|
|
297
306
|
### Storage Backend
|
|
298
307
|
|
|
299
308
|
By default, the engine uses in-memory storage. For production, you must configure persistent storage via environment variables.
|
|
@@ -362,3 +371,12 @@ To run the `avtomatika` test suite:
|
|
|
362
371
|
```bash
|
|
363
372
|
pytest avtomatika/tests/
|
|
364
373
|
```
|
|
374
|
+
|
|
375
|
+
## Detailed Documentation
|
|
376
|
+
|
|
377
|
+
For a deeper dive into the system, please refer to the following documents in the `docs/` directory:
|
|
378
|
+
|
|
379
|
+
- [**Architecture Guide**](docs/architecture.md): A detailed overview of the system components and their interactions.
|
|
380
|
+
- [**API Reference**](docs/api_reference.md): Full specification of the HTTP API.
|
|
381
|
+
- [**Deployment Guide**](docs/deployment.md): Instructions for deploying with Gunicorn/Uvicorn and NGINX.
|
|
382
|
+
- [**Cookbook**](docs/cookbook/README.md): Examples and best practices for creating blueprints.
|
|
@@ -4,8 +4,8 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "avtomatika"
|
|
7
|
-
version = "1.
|
|
8
|
-
description = "A state-machine based orchestrator for long-running jobs."
|
|
7
|
+
version = "1.0b3"
|
|
8
|
+
description = "A state-machine based orchestrator for long-running AI and other jobs."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.11"
|
|
11
11
|
classifiers = [
|
|
@@ -24,22 +24,22 @@ dependencies = [
|
|
|
24
24
|
]
|
|
25
25
|
|
|
26
26
|
[project.optional-dependencies]
|
|
27
|
-
redis = ["redis~=
|
|
28
|
-
history = ["aiosqlite~=0.
|
|
27
|
+
redis = ["redis~=7.1", "orjson~=3.11"]
|
|
28
|
+
history = ["aiosqlite~=0.22", "asyncpg~=0.30", "orjson~=3.11"]
|
|
29
29
|
telemetry = [
|
|
30
|
-
"opentelemetry-api~=1.
|
|
31
|
-
"opentelemetry-sdk~=1.
|
|
32
|
-
"opentelemetry-exporter-otlp~=1.
|
|
30
|
+
"opentelemetry-api~=1.39",
|
|
31
|
+
"opentelemetry-sdk~=1.39",
|
|
32
|
+
"opentelemetry-exporter-otlp~=1.39",
|
|
33
33
|
"opentelemetry-instrumentation-aiohttp-client~=0.59b0",
|
|
34
34
|
]
|
|
35
35
|
test = [
|
|
36
|
-
"pytest~=
|
|
36
|
+
"pytest~=9.0",
|
|
37
37
|
"pytest-asyncio~=1.1",
|
|
38
|
-
"fakeredis~=2.
|
|
38
|
+
"fakeredis~=2.33",
|
|
39
39
|
"pytest-aiohttp~=1.1",
|
|
40
40
|
"pytest-mock~=3.14",
|
|
41
41
|
"aioresponses~=0.7",
|
|
42
|
-
"backports.zstd",
|
|
42
|
+
"backports.zstd~=1.2",
|
|
43
43
|
"opentelemetry-instrumentation-aiohttp-client",
|
|
44
44
|
]
|
|
45
45
|
all = [
|
|
@@ -4,6 +4,7 @@
|
|
|
4
4
|
This module exposes the primary classes for building and running state-driven automations.
|
|
5
5
|
"""
|
|
6
6
|
|
|
7
|
+
import contextlib
|
|
7
8
|
from importlib.metadata import version
|
|
8
9
|
|
|
9
10
|
__version__ = version("avtomatika")
|
|
@@ -22,9 +23,7 @@ __all__ = [
|
|
|
22
23
|
"StorageBackend",
|
|
23
24
|
]
|
|
24
25
|
|
|
25
|
-
|
|
26
|
+
with contextlib.suppress(ImportError):
|
|
26
27
|
from .storage.redis import RedisStorage # noqa: F401
|
|
27
28
|
|
|
28
29
|
__all__.append("RedisStorage")
|
|
29
|
-
except ImportError:
|
|
30
|
-
pass
|
|
@@ -168,8 +168,7 @@ class StateMachineBlueprint:
|
|
|
168
168
|
for handler in self.conditional_handlers:
|
|
169
169
|
if handler.state == state and handler.evaluate(context):
|
|
170
170
|
return handler.func
|
|
171
|
-
default_handler
|
|
172
|
-
if default_handler:
|
|
171
|
+
if default_handler := self.handlers.get(state):
|
|
173
172
|
return default_handler
|
|
174
173
|
raise ValueError(
|
|
175
174
|
f"No suitable handler found for state '{state}' in blueprint '{self.name}' for the given context.",
|
|
@@ -230,12 +229,11 @@ class StateMachineBlueprint:
|
|
|
230
229
|
f"Could not parse handler '{handler_func.__name__}' for state '{handler_state}'. "
|
|
231
230
|
f"Graph may be incomplete. Error: {e}"
|
|
232
231
|
)
|
|
233
|
-
pass
|
|
234
232
|
for state in states:
|
|
235
233
|
dot.node(state, state)
|
|
236
234
|
|
|
237
|
-
if output_filename:
|
|
238
|
-
dot.render(output_filename, format=output_format, cleanup=True)
|
|
239
|
-
print(f"Graph rendered to {output_filename}.{output_format}")
|
|
240
|
-
else:
|
|
235
|
+
if not output_filename:
|
|
241
236
|
return dot.source
|
|
237
|
+
dot.render(output_filename, format=output_format, cleanup=True)
|
|
238
|
+
print(f"Graph rendered to {output_filename}.{output_format}")
|
|
239
|
+
return None
|
|
@@ -26,25 +26,37 @@ async def load_client_configs_to_redis(
|
|
|
26
26
|
config_path,
|
|
27
27
|
)
|
|
28
28
|
return
|
|
29
|
+
except Exception as e:
|
|
30
|
+
logger.error(f"Failed to parse client config file '{config_path}': {e}")
|
|
31
|
+
raise ValueError(f"Invalid client configuration file: {e}") from e
|
|
29
32
|
|
|
30
33
|
loaded_count = 0
|
|
31
34
|
for client_name, config in clients_data.items():
|
|
35
|
+
if not isinstance(config, dict):
|
|
36
|
+
logger.error(f"Client '{client_name}' configuration must be a table (dict).")
|
|
37
|
+
raise ValueError(f"Invalid configuration for client '{client_name}'")
|
|
38
|
+
|
|
32
39
|
token = config.get("token")
|
|
33
40
|
if not token:
|
|
34
|
-
logger.
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
41
|
+
logger.error(f"Client '{client_name}' is missing required 'token' field.")
|
|
42
|
+
raise ValueError(f"Missing token for client '{client_name}'")
|
|
43
|
+
|
|
44
|
+
if not isinstance(token, str):
|
|
45
|
+
logger.error(f"Token for client '{client_name}' must be a string.")
|
|
46
|
+
raise ValueError(f"Invalid token type for client '{client_name}'")
|
|
39
47
|
|
|
40
48
|
# Separate static config from dynamic quota values
|
|
41
49
|
static_config = {k: v for k, v in config.items() if k != "monthly_attempts"}
|
|
42
50
|
quota = config.get("monthly_attempts")
|
|
43
51
|
|
|
52
|
+
if quota is not None and not isinstance(quota, int):
|
|
53
|
+
logger.error(f"Quota 'monthly_attempts' for client '{client_name}' must be an integer.")
|
|
54
|
+
raise ValueError(f"Invalid quota type for client '{client_name}'")
|
|
55
|
+
|
|
44
56
|
try:
|
|
45
57
|
# Assume these storage methods will be implemented
|
|
46
58
|
await storage.save_client_config(token, static_config)
|
|
47
|
-
if quota is not None
|
|
59
|
+
if quota is not None:
|
|
48
60
|
await storage.initialize_client_quota(token, quota)
|
|
49
61
|
|
|
50
62
|
loaded_count += 1
|
|
@@ -28,15 +28,13 @@ class Dispatcher:
|
|
|
28
28
|
self.config = config
|
|
29
29
|
self._round_robin_indices: Dict[str, int] = defaultdict(int)
|
|
30
30
|
|
|
31
|
+
@staticmethod
|
|
31
32
|
def _is_worker_compliant(
|
|
32
|
-
self,
|
|
33
33
|
worker: Dict[str, Any],
|
|
34
34
|
requirements: Dict[str, Any],
|
|
35
35
|
) -> bool:
|
|
36
36
|
"""Checks if a worker meets the specified resource requirements."""
|
|
37
|
-
|
|
38
|
-
required_gpu = requirements.get("gpu_info")
|
|
39
|
-
if required_gpu:
|
|
37
|
+
if required_gpu := requirements.get("gpu_info"):
|
|
40
38
|
gpu_info = worker.get("resources", {}).get("gpu_info")
|
|
41
39
|
if not gpu_info:
|
|
42
40
|
return False
|
|
@@ -51,17 +49,15 @@ class Dispatcher:
|
|
|
51
49
|
):
|
|
52
50
|
return False
|
|
53
51
|
|
|
54
|
-
|
|
55
|
-
required_models = requirements.get("installed_models")
|
|
56
|
-
if required_models:
|
|
52
|
+
if required_models := requirements.get("installed_models"):
|
|
57
53
|
installed_models = {m["name"] for m in worker.get("installed_models", [])}
|
|
58
54
|
if not set(required_models).issubset(installed_models):
|
|
59
55
|
return False
|
|
60
56
|
|
|
61
57
|
return True
|
|
62
58
|
|
|
59
|
+
@staticmethod
|
|
63
60
|
def _select_default(
|
|
64
|
-
self,
|
|
65
61
|
workers: List[Dict[str, Any]],
|
|
66
62
|
task_type: str,
|
|
67
63
|
) -> Dict[str, Any]:
|
|
@@ -74,7 +70,7 @@ class Dispatcher:
|
|
|
74
70
|
"""
|
|
75
71
|
warm_workers = [w for w in workers if task_type in w.get("hot_cache", [])]
|
|
76
72
|
|
|
77
|
-
target_pool = warm_workers
|
|
73
|
+
target_pool = warm_workers or workers
|
|
78
74
|
|
|
79
75
|
# The `cost` field is deprecated but maintained for backward compatibility.
|
|
80
76
|
min_cost = min(w.get("cost", float("inf")) for w in target_pool)
|
|
@@ -95,8 +91,8 @@ class Dispatcher:
|
|
|
95
91
|
self._round_robin_indices[task_type] = idx + 1
|
|
96
92
|
return selected_worker
|
|
97
93
|
|
|
94
|
+
@staticmethod
|
|
98
95
|
def _select_least_connections(
|
|
99
|
-
self,
|
|
100
96
|
workers: List[Dict[str, Any]],
|
|
101
97
|
task_type: str,
|
|
102
98
|
) -> Dict[str, Any]:
|
|
@@ -105,15 +101,16 @@ class Dispatcher:
|
|
|
105
101
|
"""
|
|
106
102
|
return min(workers, key=lambda w: w.get("load", 0.0))
|
|
107
103
|
|
|
104
|
+
@staticmethod
|
|
108
105
|
def _select_cheapest(
|
|
109
|
-
self,
|
|
110
106
|
workers: List[Dict[str, Any]],
|
|
111
107
|
task_type: str,
|
|
112
108
|
) -> Dict[str, Any]:
|
|
113
109
|
"""Selects the cheapest worker based on 'cost_per_second'."""
|
|
114
110
|
return min(workers, key=lambda w: w.get("cost_per_second", float("inf")))
|
|
115
111
|
|
|
116
|
-
|
|
112
|
+
@staticmethod
|
|
113
|
+
def _get_best_value_score(worker: Dict[str, Any]) -> float:
|
|
117
114
|
"""Calculates a "score" for a worker using the formula cost / reputation.
|
|
118
115
|
The lower the score, the better.
|
|
119
116
|
"""
|
|
@@ -121,9 +118,7 @@ class Dispatcher:
|
|
|
121
118
|
# Default reputation is 1.0 if absent
|
|
122
119
|
reputation = worker.get("reputation", 1.0)
|
|
123
120
|
# Avoid division by zero
|
|
124
|
-
if reputation == 0
|
|
125
|
-
return float("inf")
|
|
126
|
-
return cost / reputation
|
|
121
|
+
return float("inf") if reputation == 0 else cost / reputation
|
|
127
122
|
|
|
128
123
|
def _select_best_value(
|
|
129
124
|
self,
|
|
@@ -153,10 +148,9 @@ class Dispatcher:
|
|
|
153
148
|
idle_workers = [w for w in all_workers if w.get("status", "idle") == "idle"]
|
|
154
149
|
logger.debug(f"Idle workers: {[w['worker_id'] for w in idle_workers]}")
|
|
155
150
|
if not idle_workers:
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
if busy_mo_workers:
|
|
151
|
+
if busy_mo_workers := [
|
|
152
|
+
w for w in all_workers if w.get("status") == "busy" and "multi_orchestrator_info" in w
|
|
153
|
+
]:
|
|
160
154
|
logger.warning(
|
|
161
155
|
f"No idle workers. Found {len(busy_mo_workers)} busy workers "
|
|
162
156
|
f"in multi-orchestrator mode. They are likely performing tasks for other Orchestrators.",
|
|
@@ -485,8 +485,7 @@ class OrchestratorEngine:
|
|
|
485
485
|
await self.storage.save_job_state(job_id, job_state)
|
|
486
486
|
# Optionally, trigger a specific 'cancelled' transition if defined in the blueprint
|
|
487
487
|
transitions = job_state.get("current_task_transitions", {})
|
|
488
|
-
next_state
|
|
489
|
-
if next_state:
|
|
488
|
+
if next_state := transitions.get("cancelled"):
|
|
490
489
|
job_state["current_state"] = next_state
|
|
491
490
|
job_state["status"] = "running" # It's running the cancellation handler now
|
|
492
491
|
await self.storage.save_job_state(job_id, job_state)
|
|
@@ -494,9 +493,7 @@ class OrchestratorEngine:
|
|
|
494
493
|
return web.json_response({"status": "result_accepted_cancelled"}, status=200)
|
|
495
494
|
|
|
496
495
|
transitions = job_state.get("current_task_transitions", {})
|
|
497
|
-
next_state
|
|
498
|
-
|
|
499
|
-
if next_state:
|
|
496
|
+
if next_state := transitions.get(result_status):
|
|
500
497
|
logging.info(f"Job {job_id} transitioning based on worker status '{result_status}' to state '{next_state}'")
|
|
501
498
|
|
|
502
499
|
worker_data = result.get("data")
|
|
@@ -602,7 +599,8 @@ class OrchestratorEngine:
|
|
|
602
599
|
await load_client_configs_to_redis(self.storage)
|
|
603
600
|
return web.json_response({"status": "db_flushed"}, status=200)
|
|
604
601
|
|
|
605
|
-
|
|
602
|
+
@staticmethod
|
|
603
|
+
async def _docs_handler(request: web.Request) -> web.Response:
|
|
606
604
|
from importlib import resources
|
|
607
605
|
|
|
608
606
|
try:
|
|
@@ -647,16 +645,7 @@ class OrchestratorEngine:
|
|
|
647
645
|
all_protected_apps.append(protected_app)
|
|
648
646
|
|
|
649
647
|
for app in all_protected_apps:
|
|
650
|
-
|
|
651
|
-
app.router.add_post("/jobs/{job_id}/cancel", self._cancel_job_handler)
|
|
652
|
-
if not isinstance(self.history_storage, NoOpHistoryStorage):
|
|
653
|
-
app.router.add_get("/jobs/{job_id}/history", self._get_job_history_handler)
|
|
654
|
-
app.router.add_get("/blueprints/{blueprint_name}/graph", self._get_blueprint_graph_handler)
|
|
655
|
-
app.router.add_get("/workers", self._get_workers_handler)
|
|
656
|
-
app.router.add_get("/jobs", self._get_jobs_handler)
|
|
657
|
-
app.router.add_get("/dashboard", self._get_dashboard_handler)
|
|
658
|
-
app.router.add_post("/admin/reload-workers", self._reload_worker_configs_handler)
|
|
659
|
-
|
|
648
|
+
self._register_common_routes(app)
|
|
660
649
|
if has_unversioned_routes:
|
|
661
650
|
self.app.add_subapp("/api/", protected_app)
|
|
662
651
|
for version, app in versioned_apps.items():
|
|
@@ -676,6 +665,17 @@ class OrchestratorEngine:
|
|
|
676
665
|
worker_app.router.add_get("/ws/{worker_id}", self._websocket_handler)
|
|
677
666
|
self.app.add_subapp("/_worker/", worker_app)
|
|
678
667
|
|
|
668
|
+
def _register_common_routes(self, app):
|
|
669
|
+
app.router.add_get("/jobs/{job_id}", self._get_job_status_handler)
|
|
670
|
+
app.router.add_post("/jobs/{job_id}/cancel", self._cancel_job_handler)
|
|
671
|
+
if not isinstance(self.history_storage, NoOpHistoryStorage):
|
|
672
|
+
app.router.add_get("/jobs/{job_id}/history", self._get_job_history_handler)
|
|
673
|
+
app.router.add_get("/blueprints/{blueprint_name}/graph", self._get_blueprint_graph_handler)
|
|
674
|
+
app.router.add_get("/workers", self._get_workers_handler)
|
|
675
|
+
app.router.add_get("/jobs", self._get_jobs_handler)
|
|
676
|
+
app.router.add_get("/dashboard", self._get_dashboard_handler)
|
|
677
|
+
app.router.add_post("/admin/reload-workers", self._reload_worker_configs_handler)
|
|
678
|
+
|
|
679
679
|
async def _websocket_handler(self, request: web.Request) -> web.WebSocketResponse:
|
|
680
680
|
worker_id = request.match_info.get("worker_id")
|
|
681
681
|
if not worker_id:
|
|
@@ -35,11 +35,13 @@ except ImportError:
|
|
|
35
35
|
def inject(self, *args, **kwargs):
|
|
36
36
|
pass
|
|
37
37
|
|
|
38
|
-
|
|
38
|
+
@staticmethod
|
|
39
|
+
def extract(*args, **kwargs):
|
|
39
40
|
return None
|
|
40
41
|
|
|
41
42
|
class NoOpTraceContextTextMapPropagator:
|
|
42
|
-
|
|
43
|
+
@staticmethod
|
|
44
|
+
def extract(*args, **kwargs):
|
|
43
45
|
return None
|
|
44
46
|
|
|
45
47
|
trace = NoOpTracer()
|
|
@@ -485,7 +487,8 @@ class JobExecutor:
|
|
|
485
487
|
await self.storage.save_job_state(parent_job_id, parent_job_state)
|
|
486
488
|
await self.storage.enqueue_job(parent_job_id)
|
|
487
489
|
|
|
488
|
-
|
|
490
|
+
@staticmethod
|
|
491
|
+
def _handle_task_completion(task: Task):
|
|
489
492
|
"""Callback to handle completion of a job processing task."""
|
|
490
493
|
try:
|
|
491
494
|
# This will re-raise any exception caught in the task
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
from contextlib import suppress
|
|
1
2
|
from typing import Awaitable, Callable
|
|
2
3
|
|
|
3
4
|
from aiohttp import web
|
|
@@ -23,23 +24,15 @@ def rate_limit_middleware_factory(
|
|
|
23
24
|
"""Rate-limiting middleware that uses the provided storage backend."""
|
|
24
25
|
# Determine the key for rate limiting (e.g., by worker_id or IP)
|
|
25
26
|
# For worker endpoints, we key by worker_id. For others, by IP.
|
|
26
|
-
key_identifier = request.match_info.get("worker_id", request.remote)
|
|
27
|
-
if not key_identifier:
|
|
28
|
-
# Fallback for cases where remote IP might not be available
|
|
29
|
-
key_identifier = "unknown"
|
|
27
|
+
key_identifier = request.match_info.get("worker_id", request.remote) or "unknown"
|
|
30
28
|
|
|
31
29
|
# Key by identifier and path to have per-endpoint limits
|
|
32
30
|
rate_limit_key = f"ratelimit:{key_identifier}:{request.path}"
|
|
33
31
|
|
|
34
|
-
|
|
32
|
+
with suppress(Exception):
|
|
35
33
|
count = await storage.increment_key_with_ttl(rate_limit_key, period)
|
|
36
34
|
if count > limit:
|
|
37
35
|
return web.json_response({"error": "Too Many Requests"}, status=429)
|
|
38
|
-
except Exception:
|
|
39
|
-
# If the rate limiter fails for any reason (e.g., Redis down),
|
|
40
|
-
# it's safer to let the request through than to block everything.
|
|
41
|
-
pass
|
|
42
|
-
|
|
43
36
|
return await handler(request)
|
|
44
37
|
|
|
45
38
|
return rate_limit_middleware
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
from asyncio import CancelledError, sleep
|
|
2
2
|
from logging import getLogger
|
|
3
3
|
from typing import TYPE_CHECKING
|
|
4
|
+
from uuid import uuid4
|
|
4
5
|
|
|
5
6
|
if TYPE_CHECKING:
|
|
6
7
|
from .engine import OrchestratorEngine
|
|
@@ -20,14 +21,22 @@ class ReputationCalculator:
|
|
|
20
21
|
self.history_storage = engine.history_storage
|
|
21
22
|
self.interval_seconds = interval_seconds
|
|
22
23
|
self._running = False
|
|
24
|
+
self._instance_id = str(uuid4())
|
|
23
25
|
|
|
24
26
|
async def run(self):
|
|
25
27
|
"""The main loop that periodically triggers reputation recalculation."""
|
|
26
|
-
logger.info("ReputationCalculator started.")
|
|
28
|
+
logger.info(f"ReputationCalculator started (Instance ID: {self._instance_id}).")
|
|
27
29
|
self._running = True
|
|
28
30
|
while self._running:
|
|
29
31
|
try:
|
|
30
|
-
|
|
32
|
+
# Attempt to acquire lock
|
|
33
|
+
if await self.storage.acquire_lock("global_reputation_lock", self._instance_id, 300):
|
|
34
|
+
try:
|
|
35
|
+
await self.calculate_all_reputations()
|
|
36
|
+
finally:
|
|
37
|
+
await self.storage.release_lock("global_reputation_lock", self._instance_id)
|
|
38
|
+
else:
|
|
39
|
+
logger.debug("ReputationCalculator lock held by another instance. Skipping.")
|
|
31
40
|
except CancelledError:
|
|
32
41
|
break
|
|
33
42
|
except Exception:
|
|
@@ -1,11 +1,11 @@
|
|
|
1
|
+
import contextlib
|
|
2
|
+
|
|
1
3
|
from .base import StorageBackend
|
|
2
4
|
from .memory import MemoryStorage
|
|
3
5
|
|
|
4
6
|
__all__ = ["StorageBackend", "MemoryStorage"]
|
|
5
7
|
|
|
6
|
-
|
|
8
|
+
with contextlib.suppress(ImportError):
|
|
7
9
|
from .redis import RedisStorage # noqa: F401
|
|
8
10
|
|
|
9
11
|
__all__.append("RedisStorage")
|
|
10
|
-
except ImportError:
|
|
11
|
-
pass
|
|
@@ -264,3 +264,26 @@ class StorageBackend(ABC):
|
|
|
264
264
|
Used for metrics.
|
|
265
265
|
"""
|
|
266
266
|
raise NotImplementedError
|
|
267
|
+
|
|
268
|
+
@abstractmethod
async def acquire_lock(self, key: str, holder_id: str, ttl: int) -> bool:
    """
    Attempts to acquire a distributed lock.

    Implementations are expected to let the lock expire automatically
    after ``ttl`` seconds so a crashed holder cannot block forever.

    :param key: The unique key of the lock (e.g., 'watcher_lock').
    :param holder_id: A unique identifier for the caller (e.g., UUID).
    :param ttl: Time-to-live for the lock in seconds.
    :return: True if the lock was acquired, False otherwise.
    """
    raise NotImplementedError
|
|
279
|
+
|
|
280
|
+
@abstractmethod
async def release_lock(self, key: str, holder_id: str) -> bool:
    """
    Releases a distributed lock if it is held by the specified holder_id.

    Implementations must not release a lock owned by a different holder
    (the holder check prevents one instance from freeing another's lock).

    :param key: The unique key of the lock.
    :param holder_id: The identifier of the caller who presumably holds the lock.
    :return: True if the lock was successfully released, False otherwise.
    """
    raise NotImplementedError
|
|
@@ -25,6 +25,7 @@ class MemoryStorage(StorageBackend):
|
|
|
25
25
|
self._worker_tokens: Dict[str, str] = {}
|
|
26
26
|
self._generic_keys: Dict[str, Any] = {}
|
|
27
27
|
self._generic_key_ttls: Dict[str, float] = {}
|
|
28
|
+
self._locks: Dict[str, tuple[str, float]] = {} # key -> (holder_id, expiry_time)
|
|
28
29
|
|
|
29
30
|
self._lock = Lock()
|
|
30
31
|
|
|
@@ -128,9 +129,11 @@ class MemoryStorage(StorageBackend):
|
|
|
128
129
|
async with self._lock:
|
|
129
130
|
now = monotonic()
|
|
130
131
|
active_workers = []
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
132
|
+
active_workers.extend(
|
|
133
|
+
worker_info
|
|
134
|
+
for worker_id, worker_info in self._workers.items()
|
|
135
|
+
if self._worker_ttls.get(worker_id, 0) > now
|
|
136
|
+
)
|
|
134
137
|
return active_workers
|
|
135
138
|
|
|
136
139
|
async def add_job_to_watch(self, job_id: str, timeout_at: float) -> None:
|
|
@@ -226,6 +229,7 @@ class MemoryStorage(StorageBackend):
|
|
|
226
229
|
self._quotas.clear()
|
|
227
230
|
self._generic_keys.clear()
|
|
228
231
|
self._generic_key_ttls.clear()
|
|
232
|
+
self._locks.clear()
|
|
229
233
|
|
|
230
234
|
async def get_job_queue_length(self) -> int:
|
|
231
235
|
# No lock needed for asyncio.Queue.qsize()
|
|
@@ -234,13 +238,9 @@ class MemoryStorage(StorageBackend):
|
|
|
234
238
|
async def get_active_worker_count(self) -> int:
|
|
235
239
|
async with self._lock:
|
|
236
240
|
now = monotonic()
|
|
237
|
-
count = 0
|
|
238
241
|
# Create a copy of keys to avoid issues with concurrent modifications
|
|
239
242
|
worker_ids = list(self._workers.keys())
|
|
240
|
-
for worker_id in worker_ids
|
|
241
|
-
if self._worker_ttls.get(worker_id, 0) > now:
|
|
242
|
-
count += 1
|
|
243
|
-
return count
|
|
243
|
+
return sum(self._worker_ttls.get(worker_id, 0) > now for worker_id in worker_ids)
|
|
244
244
|
|
|
245
245
|
async def get_worker_info(self, worker_id: str) -> Optional[Dict[str, Any]]:
|
|
246
246
|
async with self._lock:
|
|
@@ -273,3 +273,29 @@ class MemoryStorage(StorageBackend):
|
|
|
273
273
|
"average_bid": 0,
|
|
274
274
|
"error": "Statistics are not supported for MemoryStorage backend.",
|
|
275
275
|
}
|
|
276
|
+
|
|
277
|
+
async def acquire_lock(self, key: str, holder_id: str, ttl: int) -> bool:
    """Try to take the in-memory lock ``key`` on behalf of ``holder_id``.

    Strict semantics: while an unexpired entry exists for ``key`` the
    acquisition fails, even for the current holder. On success the lock
    is recorded with an expiry of ``ttl`` seconds from now.
    """
    async with self._lock:
        moment = monotonic()
        existing = self._locks.get(key)
        # Any live (unexpired) entry blocks acquisition, regardless of owner.
        if existing is not None and existing[1] > moment:
            return False
        self._locks[key] = (holder_id, moment + ttl)
        return True
|
|
291
|
+
|
|
292
|
+
async def release_lock(self, key: str, holder_id: str) -> bool:
    """Drop the in-memory lock ``key`` if — and only if — ``holder_id`` owns it.

    Returns True when the entry was removed; False when no entry exists
    or it belongs to a different holder (expiry is irrelevant here).
    """
    async with self._lock:
        entry = self._locks.get(key)
        if entry is None:
            return False
        owner = entry[0]
        if owner != holder_id:
            return False
        del self._locks[key]
        return True
|