avtomatika 1.0b4.tar.gz → 1.0b5.tar.gz
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- {avtomatika-1.0b4/src/avtomatika.egg-info → avtomatika-1.0b5}/PKG-INFO +11 -4
- {avtomatika-1.0b4 → avtomatika-1.0b5}/README.md +8 -1
- {avtomatika-1.0b4 → avtomatika-1.0b5}/pyproject.toml +5 -3
- {avtomatika-1.0b4 → avtomatika-1.0b5}/src/avtomatika/__init__.py +2 -2
- {avtomatika-1.0b4 → avtomatika-1.0b5}/src/avtomatika/blueprint.py +9 -11
- {avtomatika-1.0b4 → avtomatika-1.0b5}/src/avtomatika/config.py +7 -0
- {avtomatika-1.0b4 → avtomatika-1.0b5}/src/avtomatika/context.py +18 -18
- {avtomatika-1.0b4 → avtomatika-1.0b5}/src/avtomatika/data_types.py +6 -7
- {avtomatika-1.0b4 → avtomatika-1.0b5}/src/avtomatika/datastore.py +2 -2
- {avtomatika-1.0b4 → avtomatika-1.0b5}/src/avtomatika/dispatcher.py +20 -21
- {avtomatika-1.0b4 → avtomatika-1.0b5}/src/avtomatika/engine.py +70 -67
- {avtomatika-1.0b4 → avtomatika-1.0b5}/src/avtomatika/executor.py +168 -148
- {avtomatika-1.0b4 → avtomatika-1.0b5}/src/avtomatika/history/base.py +7 -7
- {avtomatika-1.0b4 → avtomatika-1.0b5}/src/avtomatika/history/noop.py +7 -7
- {avtomatika-1.0b4 → avtomatika-1.0b5}/src/avtomatika/history/postgres.py +7 -9
- {avtomatika-1.0b4 → avtomatika-1.0b5}/src/avtomatika/history/sqlite.py +7 -10
- {avtomatika-1.0b4 → avtomatika-1.0b5}/src/avtomatika/logging_config.py +1 -1
- {avtomatika-1.0b4 → avtomatika-1.0b5}/src/avtomatika/storage/__init__.py +2 -2
- {avtomatika-1.0b4 → avtomatika-1.0b5}/src/avtomatika/storage/base.py +31 -20
- {avtomatika-1.0b4 → avtomatika-1.0b5}/src/avtomatika/storage/memory.py +36 -43
- {avtomatika-1.0b4 → avtomatika-1.0b5}/src/avtomatika/storage/redis.py +124 -60
- {avtomatika-1.0b4 → avtomatika-1.0b5}/src/avtomatika/worker_config_loader.py +2 -2
- {avtomatika-1.0b4 → avtomatika-1.0b5}/src/avtomatika/ws_manager.py +1 -2
- {avtomatika-1.0b4 → avtomatika-1.0b5/src/avtomatika.egg-info}/PKG-INFO +11 -4
- {avtomatika-1.0b4 → avtomatika-1.0b5}/src/avtomatika.egg-info/requires.txt +2 -2
- {avtomatika-1.0b4 → avtomatika-1.0b5}/tests/test_engine.py +123 -88
- {avtomatika-1.0b4 → avtomatika-1.0b5}/tests/test_executor.py +24 -8
- {avtomatika-1.0b4 → avtomatika-1.0b5}/tests/test_history.py +4 -3
- {avtomatika-1.0b4 → avtomatika-1.0b5}/tests/test_logging_config.py +2 -2
- {avtomatika-1.0b4 → avtomatika-1.0b5}/LICENSE +0 -0
- {avtomatika-1.0b4 → avtomatika-1.0b5}/setup.cfg +0 -0
- {avtomatika-1.0b4 → avtomatika-1.0b5}/src/avtomatika/api.html +0 -0
- {avtomatika-1.0b4 → avtomatika-1.0b5}/src/avtomatika/client_config_loader.py +0 -0
- {avtomatika-1.0b4 → avtomatika-1.0b5}/src/avtomatika/compression.py +0 -0
- {avtomatika-1.0b4 → avtomatika-1.0b5}/src/avtomatika/health_checker.py +0 -0
- {avtomatika-1.0b4 → avtomatika-1.0b5}/src/avtomatika/metrics.py +0 -0
- {avtomatika-1.0b4 → avtomatika-1.0b5}/src/avtomatika/py.typed +0 -0
- {avtomatika-1.0b4 → avtomatika-1.0b5}/src/avtomatika/quota.py +0 -0
- {avtomatika-1.0b4 → avtomatika-1.0b5}/src/avtomatika/ratelimit.py +0 -0
- {avtomatika-1.0b4 → avtomatika-1.0b5}/src/avtomatika/reputation.py +0 -0
- {avtomatika-1.0b4 → avtomatika-1.0b5}/src/avtomatika/security.py +0 -0
- {avtomatika-1.0b4 → avtomatika-1.0b5}/src/avtomatika/telemetry.py +0 -0
- {avtomatika-1.0b4 → avtomatika-1.0b5}/src/avtomatika/watcher.py +0 -0
- {avtomatika-1.0b4 → avtomatika-1.0b5}/src/avtomatika.egg-info/SOURCES.txt +0 -0
- {avtomatika-1.0b4 → avtomatika-1.0b5}/src/avtomatika.egg-info/dependency_links.txt +0 -0
- {avtomatika-1.0b4 → avtomatika-1.0b5}/src/avtomatika.egg-info/top_level.txt +0 -0
- {avtomatika-1.0b4 → avtomatika-1.0b5}/tests/test_blueprint_conditions.py +0 -0
- {avtomatika-1.0b4 → avtomatika-1.0b5}/tests/test_blueprints.py +0 -0
- {avtomatika-1.0b4 → avtomatika-1.0b5}/tests/test_client_config_loader.py +0 -0
- {avtomatika-1.0b4 → avtomatika-1.0b5}/tests/test_compression.py +0 -0
- {avtomatika-1.0b4 → avtomatika-1.0b5}/tests/test_config_validation.py +0 -0
- {avtomatika-1.0b4 → avtomatika-1.0b5}/tests/test_context.py +0 -0
- {avtomatika-1.0b4 → avtomatika-1.0b5}/tests/test_dispatcher.py +0 -0
- {avtomatika-1.0b4 → avtomatika-1.0b5}/tests/test_dispatcher_extended.py +0 -0
- {avtomatika-1.0b4 → avtomatika-1.0b5}/tests/test_error_handling.py +0 -0
- {avtomatika-1.0b4 → avtomatika-1.0b5}/tests/test_health_checker.py +0 -0
- {avtomatika-1.0b4 → avtomatika-1.0b5}/tests/test_integration.py +0 -0
- {avtomatika-1.0b4 → avtomatika-1.0b5}/tests/test_memory_locking.py +0 -0
- {avtomatika-1.0b4 → avtomatika-1.0b5}/tests/test_memory_storage.py +0 -0
- {avtomatika-1.0b4 → avtomatika-1.0b5}/tests/test_metrics.py +0 -0
- {avtomatika-1.0b4 → avtomatika-1.0b5}/tests/test_noop_history.py +0 -0
- {avtomatika-1.0b4 → avtomatika-1.0b5}/tests/test_postgres_history.py +0 -0
- {avtomatika-1.0b4 → avtomatika-1.0b5}/tests/test_ratelimit.py +0 -0
- {avtomatika-1.0b4 → avtomatika-1.0b5}/tests/test_redis_locking.py +0 -0
- {avtomatika-1.0b4 → avtomatika-1.0b5}/tests/test_redis_storage.py +0 -0
- {avtomatika-1.0b4 → avtomatika-1.0b5}/tests/test_reputation.py +0 -0
- {avtomatika-1.0b4 → avtomatika-1.0b5}/tests/test_telemetry.py +0 -0
- {avtomatika-1.0b4 → avtomatika-1.0b5}/tests/test_watcher.py +0 -0
- {avtomatika-1.0b4 → avtomatika-1.0b5}/tests/test_worker_config_loader.py +0 -0
- {avtomatika-1.0b4 → avtomatika-1.0b5}/tests/test_ws_manager.py +0 -0
````diff
--- avtomatika-1.0b4/src/avtomatika.egg-info/PKG-INFO
+++ avtomatika-1.0b5/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: avtomatika
-Version: 1.0b4
+Version: 1.0b5
 Summary: A state-machine based orchestrator for long-running AI and other jobs.
 Project-URL: Homepage, https://github.com/avtomatika-ai/avtomatika
 Project-URL: Bug Tracker, https://github.com/avtomatika-ai/avtomatika/issues
@@ -17,13 +17,13 @@ Requires-Dist: python-json-logger~=4.0
 Requires-Dist: graphviz~=0.21
 Requires-Dist: zstandard~=0.24
 Requires-Dist: aioprometheus~=23.12
+Requires-Dist: msgpack~=1.1
+Requires-Dist: orjson~=3.11
 Provides-Extra: redis
 Requires-Dist: redis~=7.1; extra == "redis"
-Requires-Dist: orjson~=3.11; extra == "redis"
 Provides-Extra: history
 Requires-Dist: aiosqlite~=0.22; extra == "history"
 Requires-Dist: asyncpg~=0.30; extra == "history"
-Requires-Dist: orjson~=3.11; extra == "history"
 Provides-Extra: telemetry
 Requires-Dist: opentelemetry-api~=1.39; extra == "telemetry"
 Requires-Dist: opentelemetry-sdk~=1.39; extra == "telemetry"
@@ -360,18 +360,25 @@ The orchestrator has built-in mechanisms for handling failures based on the `err
 * **PERMANENT_ERROR**: A permanent error (e.g., a corrupted file). The task will be immediately sent to quarantine for manual investigation.
 * **INVALID_INPUT_ERROR**: An error in the input data. The entire pipeline (Job) will be immediately moved to the failed state.
 
+### Concurrency & Performance
+
+To prevent system overload during high traffic, the Orchestrator implements a backpressure mechanism for its internal job processing logic.
+
+* **`EXECUTOR_MAX_CONCURRENT_JOBS`**: Limits the number of job handlers running simultaneously within the Orchestrator process (default: `100`). If this limit is reached, new jobs remain in the Redis queue until a slot becomes available. This ensures the event loop remains responsive even with a massive backlog of pending jobs.
+
 ### High Availability & Distributed Locking
 
 The architecture supports horizontal scaling. Multiple Orchestrator instances can run behind a load balancer.
 
 * **Stateless API:** The API is stateless; all state is persisted in Redis.
+* **Instance Identity:** Each instance should have a unique `INSTANCE_ID` (defaults to hostname) for correct handling of Redis Streams consumer groups.
 * **Distributed Locking:** Background processes (`Watcher`, `ReputationCalculator`) use distributed locks (via Redis `SET NX`) to coordinate and prevent race conditions when multiple instances are active.
 
 ### Storage Backend
 
 By default, the engine uses in-memory storage. For production, you must configure persistent storage via environment variables.
 
-* **Redis (StorageBackend)**: For storing current job states.
+* **Redis (StorageBackend)**: For storing current job states (serialized with `msgpack`) and managing task queues (using Redis Streams with consumer groups).
 * Install:
   ```bash
   pip install "avtomatika[redis]"
````
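The updated storage bullet names the two mechanisms behind the new core dependencies: job state serialized with `msgpack`, and task queues built on Redis Streams with consumer groups. A minimal sketch of that pattern, assuming redis-py's asyncio client and a local Redis; the key, stream, group, and consumer names below are illustrative, not avtomatika's actual ones:

```python
# Sketch only: msgpack-serialized job state plus a Redis Streams consumer group.
import asyncio

import msgpack
from redis.asyncio import Redis
from redis.exceptions import ResponseError


async def main() -> None:
    r = Redis()

    # Persist current job state as a compact msgpack blob under a per-job key.
    await r.set("job:job-1", msgpack.packb({"id": "job-1", "current_state": "start"}))

    # Enqueue a task on a stream; each orchestrator instance reads through a
    # shared consumer group under its own INSTANCE_ID-derived consumer name.
    await r.xadd("tasks", {"payload": msgpack.packb({"job_id": "job-1"})})
    try:
        await r.xgroup_create("tasks", "orchestrators", id="0", mkstream=True)
    except ResponseError:
        pass  # BUSYGROUP: the group already exists.

    entries = await r.xreadgroup("orchestrators", "instance-a", {"tasks": ">"}, count=1)
    for _stream, messages in entries:
        for msg_id, fields in messages:
            print(msgpack.unpackb(fields[b"payload"]))
            await r.xack("tasks", "orchestrators", msg_id)

    await r.aclose()


asyncio.run(main())
```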
````diff
--- avtomatika-1.0b4/README.md
+++ avtomatika-1.0b5/README.md
@@ -314,18 +314,25 @@ The orchestrator has built-in mechanisms for handling failures based on the `err
 * **PERMANENT_ERROR**: A permanent error (e.g., a corrupted file). The task will be immediately sent to quarantine for manual investigation.
 * **INVALID_INPUT_ERROR**: An error in the input data. The entire pipeline (Job) will be immediately moved to the failed state.
 
+### Concurrency & Performance
+
+To prevent system overload during high traffic, the Orchestrator implements a backpressure mechanism for its internal job processing logic.
+
+* **`EXECUTOR_MAX_CONCURRENT_JOBS`**: Limits the number of job handlers running simultaneously within the Orchestrator process (default: `100`). If this limit is reached, new jobs remain in the Redis queue until a slot becomes available. This ensures the event loop remains responsive even with a massive backlog of pending jobs.
+
 ### High Availability & Distributed Locking
 
 The architecture supports horizontal scaling. Multiple Orchestrator instances can run behind a load balancer.
 
 * **Stateless API:** The API is stateless; all state is persisted in Redis.
+* **Instance Identity:** Each instance should have a unique `INSTANCE_ID` (defaults to hostname) for correct handling of Redis Streams consumer groups.
 * **Distributed Locking:** Background processes (`Watcher`, `ReputationCalculator`) use distributed locks (via Redis `SET NX`) to coordinate and prevent race conditions when multiple instances are active.
 
 ### Storage Backend
 
 By default, the engine uses in-memory storage. For production, you must configure persistent storage via environment variables.
 
-* **Redis (StorageBackend)**: For storing current job states.
+* **Redis (StorageBackend)**: For storing current job states (serialized with `msgpack`) and managing task queues (using Redis Streams with consumer groups).
 * Install:
   ```bash
   pip install "avtomatika[redis]"
````
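The Distributed Locking bullet above rests on Redis `SET NX`. A rough sketch of such a lock, again assuming redis-py; the key name, TTL, and Lua release script are illustrative rather than avtomatika's actual code:

```python
# Sketch of a SET NX-style distributed lock; all names are illustrative only.
import asyncio
from uuid import uuid4

from redis.asyncio import Redis

# Atomic compare-and-delete so we never release a lock that another instance
# re-acquired after our TTL expired.
RELEASE_LUA = """
if redis.call('get', KEYS[1]) == ARGV[1] then
    return redis.call('del', KEYS[1])
end
return 0
"""


async def main() -> None:
    r = Redis()
    token = str(uuid4())
    # NX: acquire only if free; EX: auto-expire so a crashed holder cannot deadlock.
    if await r.set("lock:watcher", token, nx=True, ex=30):
        try:
            print("lock held: run the watcher/reputation pass here")
        finally:
            await r.eval(RELEASE_LUA, 1, "lock:watcher", token)
    else:
        print("another instance is active; skipping this cycle")
    await r.aclose()


asyncio.run(main())
```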
```diff
--- avtomatika-1.0b4/pyproject.toml
+++ avtomatika-1.0b5/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "avtomatika"
-version = "1.0b4"
+version = "1.0b5"
 description = "A state-machine based orchestrator for long-running AI and other jobs."
 readme = "README.md"
 requires-python = ">=3.11"
@@ -21,11 +21,13 @@ dependencies = [
     "graphviz~=0.21",
     "zstandard~=0.24",
     "aioprometheus~=23.12",
+    "msgpack~=1.1",
+    "orjson~=3.11",
 ]
 
 [project.optional-dependencies]
-redis = ["redis~=7.1", "orjson~=3.11"]
-history = ["aiosqlite~=0.22", "asyncpg~=0.30", "orjson~=3.11"]
+redis = ["redis~=7.1"]
+history = ["aiosqlite~=0.22", "asyncpg~=0.30"]
 telemetry = [
     "opentelemetry-api~=1.39",
     "opentelemetry-sdk~=1.39",
```
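Net effect of the dependency move: `msgpack` and `orjson` are now installed unconditionally with the base package, while the `redis` and `history` extras are reduced to their respective backend drivers.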
```diff
--- avtomatika-1.0b4/src/avtomatika/__init__.py
+++ avtomatika-1.0b5/src/avtomatika/__init__.py
@@ -4,7 +4,7 @@
 This module exposes the primary classes for building and running state-driven automations.
 """
 
-import contextlib
+from contextlib import suppress
 from importlib.metadata import version
 
 __version__ = version("avtomatika")
@@ -23,7 +23,7 @@ __all__ = [
     "StorageBackend",
 ]
 
-with contextlib.suppress(ImportError):
+with suppress(ImportError):
    from .storage.redis import RedisStorage  # noqa: F401

    __all__.append("RedisStorage")
```
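The second hunk shows why the import changed: the module guards its optional Redis export with `suppress(ImportError)`, so installations without the `redis` extra simply skip the re-export. The same pattern in isolation (the module and class names here are made up):

```python
from contextlib import suppress

__all__ = ["Engine"]

# If the optional dependency is missing, the import fails silently and the
# symbol is simply not exported; callers can check __all__ for availability.
with suppress(ImportError):
    from optional_backend import OptionalStorage  # hypothetical optional extra

    __all__.append("OptionalStorage")
```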
```diff
--- avtomatika-1.0b4/src/avtomatika/blueprint.py
+++ avtomatika-1.0b5/src/avtomatika/blueprint.py
@@ -1,6 +1,6 @@
 from operator import eq, ge, gt, le, lt, ne
 from re import compile as re_compile
-from typing import Any, Callable,
+from typing import Any, Callable, NamedTuple
 
 from .datastore import AsyncDictStore
 
@@ -99,8 +99,6 @@ class HandlerDecorator:
 
     def when(self, condition_str: str) -> Callable:
         def decorator(func: Callable) -> Callable:
-            # We still register the base handler to ensure the state is known,
-            # but we can make it a no-op if only conditional handlers exist for a state.
             if self._state not in self._blueprint.handlers:
                 self._blueprint.handlers[self._state] = lambda: None  # Placeholder
 
@@ -115,8 +113,8 @@ class StateMachineBlueprint:
     def __init__(
         self,
         name: str,
-        api_endpoint:
-        api_version:
+        api_endpoint: str | None = None,
+        api_version: str | None = None,
         data_stores: Any = None,
     ):
         """Initializes a new blueprint.
@@ -132,14 +130,14 @@ class StateMachineBlueprint:
         self.name = name
         self.api_endpoint = api_endpoint
         self.api_version = api_version
-        self.data_stores:
-        self.handlers:
-        self.aggregator_handlers:
+        self.data_stores: dict[str, AsyncDictStore] = data_stores if data_stores is not None else {}
+        self.handlers: dict[str, Callable] = {}
+        self.aggregator_handlers: dict[str, Callable] = {}
         self.conditional_handlers: list[ConditionalHandler] = []
-        self.start_state:
+        self.start_state: str | None = None
         self.end_states: set[str] = set()
 
-    def add_data_store(self, name: str, initial_data:
+    def add_data_store(self, name: str, initial_data: dict[str, Any]):
         """Adds a named data store to the blueprint."""
         if name in self.data_stores:
             raise ValueError(f"Data store with name '{name}' already exists.")
@@ -174,7 +172,7 @@ class StateMachineBlueprint:
             f"No suitable handler found for state '{state}' in blueprint '{self.name}' for the given context.",
         )
 
-    def render_graph(self, output_filename:
+    def render_graph(self, output_filename: str | None = None, output_format: str = "png"):
         import ast
         import inspect
         import logging
```
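The modernized signatures above (`api_endpoint: str | None = None`, `add_data_store(name, initial_data: dict[str, Any])`) imply usage along these lines. This is a hedged sketch built only from the signatures visible in this diff; the blueprint name, endpoint, and store contents are invented:

```python
from avtomatika.blueprint import StateMachineBlueprint

bp = StateMachineBlueprint(
    name="video_pipeline",     # required
    api_endpoint="/v1/video",  # now annotated as str | None, default None
    api_version="1.0",
)
bp.add_data_store("counters", {"processed": 0})

# Registering the same store name twice raises, per the guard in the diff:
# ValueError: Data store with name 'counters' already exists.
```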
```diff
--- avtomatika-1.0b4/src/avtomatika/config.py
+++ avtomatika-1.0b5/src/avtomatika/config.py
@@ -1,4 +1,5 @@
 from os import getenv
+from socket import gethostname
 
 
 class Config:
@@ -7,6 +8,9 @@ class Config:
     """
 
     def __init__(self):
+        # Instance identity
+        self.INSTANCE_ID: str = getenv("INSTANCE_ID", gethostname())
+
         # Redis settings
         self.REDIS_HOST: str = getenv("REDIS_HOST", "")
         self.REDIS_PORT: int = int(getenv("REDIS_PORT", 6379))
@@ -45,6 +49,9 @@ class Config:
         self.WATCHER_INTERVAL_SECONDS: int = int(
             getenv("WATCHER_INTERVAL_SECONDS", 20),
         )
+        self.EXECUTOR_MAX_CONCURRENT_JOBS: int = int(
+            getenv("EXECUTOR_MAX_CONCURRENT_JOBS", 100),
+        )
 
         # History storage settings
         self.HISTORY_DATABASE_URI: str = getenv("HISTORY_DATABASE_URI", "")
```
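The new `EXECUTOR_MAX_CONCURRENT_JOBS` knob is the backpressure limit described in the README. A plausible shape for such a cap is a semaphore gating each job handler; this is a sketch, not the executor's actual implementation:

```python
# Sketch: bounding concurrent job handlers with a semaphore so the event loop
# stays responsive; avtomatika's executor may differ in detail.
import asyncio

MAX_CONCURRENT_JOBS = 100  # mirrors EXECUTOR_MAX_CONCURRENT_JOBS


async def handle_job(job_id: int) -> None:
    await asyncio.sleep(0.01)  # stand-in for real handler work


async def main() -> None:
    sem = asyncio.Semaphore(MAX_CONCURRENT_JOBS)

    async def guarded(job_id: int) -> None:
        # Jobs beyond the limit wait here instead of piling onto the loop; in
        # the real system they would simply stay unread in the Redis stream.
        async with sem:
            await handle_job(job_id)

    await asyncio.gather(*(guarded(i) for i in range(1_000)))


asyncio.run(main())
```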
```diff
--- avtomatika-1.0b4/src/avtomatika/context.py
+++ avtomatika-1.0b5/src/avtomatika/context.py
@@ -1,4 +1,4 @@
-from typing import Any
+from typing import Any
 
 
 class ActionFactory:
@@ -6,10 +6,10 @@ class ActionFactory:
 
     def __init__(self, job_id: str):
         self._job_id = job_id
-        self._next_state_val:
-        self._task_to_dispatch_val:
-        self._sub_blueprint_to_run_val:
-        self._parallel_tasks_to_dispatch_val:
+        self._next_state_val: str | None = None
+        self._task_to_dispatch_val: dict[str, Any] | None = None
+        self._sub_blueprint_to_run_val: dict[str, Any] | None = None
+        self._parallel_tasks_to_dispatch_val: dict[str, Any] | None = None
 
     def _check_for_existing_action(self):
         """
@@ -30,22 +30,22 @@ class ActionFactory:
         )
 
     @property
-    def next_state(self) ->
+    def next_state(self) -> str | None:
         return self._next_state_val
 
     @property
-    def task_to_dispatch(self) ->
+    def task_to_dispatch(self) -> dict[str, Any] | None:
         return self._task_to_dispatch_val
 
     @property
-    def sub_blueprint_to_run(self) ->
+    def sub_blueprint_to_run(self) -> dict[str, Any] | None:
         return self._sub_blueprint_to_run_val
 
     @property
-    def parallel_tasks_to_dispatch(self) ->
+    def parallel_tasks_to_dispatch(self) -> dict[str, Any] | None:
         return self._parallel_tasks_to_dispatch_val
 
-    def dispatch_parallel(self, tasks:
+    def dispatch_parallel(self, tasks: dict[str, Any] | None, aggregate_into: str) -> None:
         """
         Dispatches multiple tasks for parallel execution.
         """
@@ -65,12 +65,12 @@ class ActionFactory:
     def dispatch_task(
         self,
         task_type: str,
-        params:
-        transitions:
+        params: dict[str, Any],
+        transitions: dict[str, str],
         dispatch_strategy: str = "default",
-        resource_requirements:
-        timeout_seconds:
-        max_cost:
+        resource_requirements: dict[str, Any] | None = None,
+        timeout_seconds: int | None = None,
+        max_cost: float | None = None,
         priority: float = 0.0,
     ) -> None:
         """Dispatches a task to a worker for execution."""
@@ -91,7 +91,7 @@
         self,
         integration: str,
         message: str,
-        transitions:
+        transitions: dict[str, str],
     ) -> None:
         """Pauses the pipeline until an external signal (human approval) is received."""
         self._check_for_existing_action()
@@ -106,8 +106,8 @@
     def run_blueprint(
         self,
         blueprint_name: str,
-        initial_data:
-        transitions:
+        initial_data: dict[str, Any],
+        transitions: dict[str, str],
     ) -> None:
         """Runs a child blueprint and waits for its result."""
         self._check_for_existing_action()
```
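With the annotations restored, `dispatch_task`'s full signature is visible above. A handler built on it might look like the following sketch; only the keyword names and the `gpu_info` requirement key come from the diff, while the task type, params, state names, and inner requirement fields are invented:

```python
# Hypothetical handler body; assumes handlers receive a JobContext-like ctx.
async def on_new_job(ctx) -> None:
    ctx.actions.dispatch_task(
        task_type="transcode",
        params={"input_url": "s3://bucket/in.mp4"},
        transitions={"success": "publish", "failure": "quarantine"},
        dispatch_strategy="least_connections",
        resource_requirements={"gpu_info": {"min_vram_gb": 16}},
        timeout_seconds=600,
        max_cost=0.05,
        priority=1.0,
    )
```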
```diff
--- avtomatika-1.0b4/src/avtomatika/data_types.py
+++ avtomatika-1.0b5/src/avtomatika/data_types.py
@@ -1,4 +1,4 @@
-from typing import TYPE_CHECKING, Any,
+from typing import TYPE_CHECKING, Any, NamedTuple
 
 if TYPE_CHECKING:
     from .context import ActionFactory
@@ -9,8 +9,7 @@ class ClientConfig(NamedTuple):
 
     token: str
     plan: str
-
-    params: Dict[str, Any]
+    params: dict[str, Any]
 
 
 class JobContext(NamedTuple):
@@ -18,13 +17,13 @@ class JobContext(NamedTuple):
 
     job_id: str
     current_state: str
-    initial_data:
-    state_history:
+    initial_data: dict[str, Any]
+    state_history: dict[str, Any]
     client: ClientConfig
     actions: "ActionFactory"
     data_stores: Any = None
-    tracing_context:
-    aggregation_results:
+    tracing_context: dict[str, Any] = {}
+    aggregation_results: dict[str, Any] | None = None
 
 
 class GPUInfo(NamedTuple):
```
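One thing worth flagging in `JobContext`: `tracing_context: dict[str, Any] = {}` gives the NamedTuple a single shared dict as its default, so every instance that omits the field sees, and can mutate, the same object. A minimal demonstration of the hazard:

```python
from typing import Any, NamedTuple


class Ctx(NamedTuple):
    job_id: str
    tracing_context: dict[str, Any] = {}  # one dict shared by all defaults


a = Ctx("job-a")
b = Ctx("job-b")
a.tracing_context["trace_id"] = "abc"
print(b.tracing_context)  # {'trace_id': 'abc'} — leaked across instances
```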
```diff
--- avtomatika-1.0b4/src/avtomatika/datastore.py
+++ avtomatika-1.0b5/src/avtomatika/datastore.py
@@ -1,4 +1,4 @@
-from typing import Any
+from typing import Any
 
 
 class AsyncDictStore:
@@ -6,7 +6,7 @@ class AsyncDictStore:
     Simulates the behavior of a persistent store for use in blueprints.
     """
 
-    def __init__(self, initial_data:
+    def __init__(self, initial_data: dict[str, Any]):
         self._data = initial_data.copy()
 
     async def get(self, key: str) -> Any:
```
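Because `__init__` copies `initial_data`, blueprint-level defaults are shielded from later mutation. A quick usage sketch (assumes avtomatika is installed and that `get` returns the stored value, as its signature suggests):

```python
import asyncio

from avtomatika.datastore import AsyncDictStore


async def main() -> None:
    defaults = {"retries": 3}
    store = AsyncDictStore(defaults)   # takes a private copy of initial_data
    print(await store.get("retries"))  # 3
    defaults["retries"] = 99           # the copy shields the store from this
    print(await store.get("retries"))  # still 3


asyncio.run(main())
```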
```diff
--- avtomatika-1.0b4/src/avtomatika/dispatcher.py
+++ avtomatika-1.0b5/src/avtomatika/dispatcher.py
@@ -1,7 +1,7 @@
 from collections import defaultdict
 from logging import getLogger
 from random import choice
-from typing import Any
+from typing import Any
 from uuid import uuid4
 
 try:
@@ -26,12 +26,12 @@ class Dispatcher:
     def __init__(self, storage: StorageBackend, config: Config):
         self.storage = storage
         self.config = config
-        self._round_robin_indices:
+        self._round_robin_indices: dict[str, int] = defaultdict(int)
 
     @staticmethod
     def _is_worker_compliant(
-        worker:
-        requirements:
+        worker: dict[str, Any],
+        requirements: dict[str, Any],
     ) -> bool:
         """Checks if a worker meets the specified resource requirements."""
         if required_gpu := requirements.get("gpu_info"):
@@ -58,9 +58,9 @@ class Dispatcher:
 
     @staticmethod
     def _select_default(
-        workers:
+        workers: list[dict[str, Any]],
         task_type: str,
-    ) ->
+    ) -> dict[str, Any]:
         """Default strategy: first selects "warm" workers (those that have the
         task in their cache), and then selects the cheapest among them.
 
@@ -80,9 +80,9 @@
 
     def _select_round_robin(
         self,
-        workers:
+        workers: list[dict[str, Any]],
         task_type: str,
-    ) ->
+    ) -> dict[str, Any]:
         """ "Round Robin" strategy: distributes tasks sequentially among all
         available workers.
         """
@@ -93,9 +93,9 @@
 
     @staticmethod
     def _select_least_connections(
-        workers:
+        workers: list[dict[str, Any]],
         task_type: str,
-    ) ->
+    ) -> dict[str, Any]:
         """ "Least Connections" strategy: selects the worker with the fewest
         active tasks (based on the `load` field).
         """
@@ -103,14 +103,14 @@
 
     @staticmethod
     def _select_cheapest(
-        workers:
+        workers: list[dict[str, Any]],
         task_type: str,
-    ) ->
+    ) -> dict[str, Any]:
         """Selects the cheapest worker based on 'cost_per_second'."""
         return min(workers, key=lambda w: w.get("cost_per_second", float("inf")))
 
     @staticmethod
-    def _get_best_value_score(worker:
+    def _get_best_value_score(worker: dict[str, Any]) -> float:
         """Calculates a "score" for a worker using the formula cost / reputation.
         The lower the score, the better.
         """
@@ -122,13 +122,13 @@
 
     def _select_best_value(
         self,
-        workers:
+        workers: list[dict[str, Any]],
         task_type: str,
-    ) ->
+    ) -> dict[str, Any]:
         """Selects the worker with the best price-quality (reputation) ratio."""
         return min(workers, key=self._get_best_value_score)
 
-    async def dispatch(self, job_state:
+    async def dispatch(self, job_state: dict[str, Any], task_info: dict[str, Any]):
         job_id = job_state["id"]
         task_type = task_info.get("type")
         if not task_type:
@@ -142,7 +142,6 @@
         if not all_workers:
             raise RuntimeError("No available workers")
 
-        # 1. Filter by 'idle' status
         # A worker is considered available if its status is 'idle' or not specified (for backward compatibility)
         logger.debug(f"All available workers: {[w['worker_id'] for w in all_workers]}")
         idle_workers = [w for w in all_workers if w.get("status", "idle") == "idle"]
@@ -157,13 +156,13 @@
             )
             raise RuntimeError("No idle workers (all are 'busy')")
 
-        #
+        # Filter by task type
         capable_workers = [w for w in idle_workers if task_type in w.get("supported_tasks", [])]
         logger.debug(f"Capable workers for task '{task_type}': {[w['worker_id'] for w in capable_workers]}")
         if not capable_workers:
             raise RuntimeError(f"No suitable workers for task type '{task_type}'")
 
-        #
+        # Filter by resource requirements
         if resource_requirements:
             compliant_workers = [w for w in capable_workers if self._is_worker_compliant(w, resource_requirements)]
             logger.debug(
@@ -176,7 +175,7 @@
             )
             capable_workers = compliant_workers
 
-        #
+        # Filter by maximum cost
         max_cost = task_info.get("max_cost")
         if max_cost is not None:
             cost_compliant_workers = [w for w in capable_workers if w.get("cost_per_second", float("inf")) <= max_cost]
@@ -189,7 +188,7 @@
             )
             capable_workers = cost_compliant_workers
 
-        #
+        # Select worker according to strategy
         if dispatch_strategy == "round_robin":
             selected_worker = self._select_round_robin(capable_workers, task_type)
         elif dispatch_strategy == "least_connections":
```
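Per its docstring, the best-value strategy minimizes `cost / reputation`. A worked example of the scoring; the `cost_per_second` key appears in the diff, while the exact `reputation` field name is an assumption:

```python
# Illustrates the cost/reputation scoring; exact field handling may differ.
workers = [
    {"worker_id": "w1", "cost_per_second": 0.010, "reputation": 0.95},
    {"worker_id": "w2", "cost_per_second": 0.006, "reputation": 0.50},
]


def best_value_score(worker: dict) -> float:
    # Lower is better: cheap *and* reputable workers win.
    return worker["cost_per_second"] / worker["reputation"]


print(min(workers, key=best_value_score)["worker_id"])
# w1 scores ~0.0105, w2 scores 0.0120 -> w1 wins despite costing more per second.
```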