avtomatika 1.0b2-py3-none-any.whl → 1.0b4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- avtomatika/__init__.py +2 -3
- avtomatika/api.html +0 -11
- avtomatika/blueprint.py +5 -7
- avtomatika/client_config_loader.py +18 -6
- avtomatika/dispatcher.py +13 -19
- avtomatika/engine.py +52 -16
- avtomatika/executor.py +6 -3
- avtomatika/ratelimit.py +3 -10
- avtomatika/reputation.py +11 -2
- avtomatika/storage/__init__.py +3 -3
- avtomatika/storage/base.py +23 -0
- avtomatika/storage/memory.py +34 -8
- avtomatika/storage/redis.py +37 -20
- avtomatika/telemetry.py +3 -3
- avtomatika/watcher.py +39 -25
- avtomatika/worker_config_loader.py +7 -2
- avtomatika/ws_manager.py +2 -1
- {avtomatika-1.0b2.dist-info → avtomatika-1.0b4.dist-info}/METADATA +57 -11
- avtomatika-1.0b4.dist-info/RECORD +37 -0
- avtomatika-1.0b2.dist-info/RECORD +0 -37
- {avtomatika-1.0b2.dist-info → avtomatika-1.0b4.dist-info}/WHEEL +0 -0
- {avtomatika-1.0b2.dist-info → avtomatika-1.0b4.dist-info}/licenses/LICENSE +0 -0
- {avtomatika-1.0b2.dist-info → avtomatika-1.0b4.dist-info}/top_level.txt +0 -0
avtomatika/__init__.py
CHANGED
@@ -4,6 +4,7 @@
 This module exposes the primary classes for building and running state-driven automations.
 """
 
+import contextlib
 from importlib.metadata import version
 
 __version__ = version("avtomatika")
@@ -22,9 +23,7 @@ __all__ = [
     "StorageBackend",
 ]
 
-try:
+with contextlib.suppress(ImportError):
     from .storage.redis import RedisStorage  # noqa: F401
 
     __all__.append("RedisStorage")
-except ImportError:
-    pass
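The `try/except ImportError/pass` guard becomes `contextlib.suppress`, the standard-library idiom for optional extras. A minimal standalone sketch of the same pattern (generic names, not the package's own module):

```python
import contextlib

FEATURES: list[str] = []

# suppress(ImportError) swallows only a missing optional dependency;
# any other exception raised during the import still propagates.
with contextlib.suppress(ImportError):
    import redis  # noqa: F401  # optional extra, e.g. pip install avtomatika[redis]

    FEATURES.append("redis")

print(FEATURES)  # ["redis"] if the extra is installed, [] otherwise
```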
avtomatika/api.html
CHANGED
@@ -199,17 +199,6 @@
             { code: '202 Accepted', description: 'Job successfully accepted for processing.', body: { "status": "accepted", "job_id": "..." } }
         ]
     },
-    {
-        id: 'post-create-showcase-job',
-        name: 'Create a Full Showcase Job',
-        method: 'POST',
-        path: '/api/v1/jobs/full_showcase',
-        description: 'Creates and starts a new instance (Job) of the `full_showcase` blueprint. This blueprint demonstrates most of the features of the Avtomatika library.',
-        request: { body: { "path": "/path/to/video.mp4", "user_id": "user-123", "quality": "high" } },
-        responses: [
-            { code: '202 Accepted', description: 'Job successfully accepted for processing.', body: { "status": "accepted", "job_id": "..." } }
-        ]
-    },
     {
         id: 'get-job-status',
         name: 'Get Job Status',
avtomatika/blueprint.py
CHANGED
@@ -168,8 +168,7 @@ class StateMachineBlueprint:
         for handler in self.conditional_handlers:
             if handler.state == state and handler.evaluate(context):
                 return handler.func
-        default_handler = self.handlers.get(state)
-        if default_handler:
+        if default_handler := self.handlers.get(state):
             return default_handler
         raise ValueError(
             f"No suitable handler found for state '{state}' in blueprint '{self.name}' for the given context.",
@@ -230,12 +229,11 @@ class StateMachineBlueprint:
                     f"Could not parse handler '{handler_func.__name__}' for state '{handler_state}'. "
                     f"Graph may be incomplete. Error: {e}"
                 )
-                pass
         for state in states:
             dot.node(state, state)
 
-        if output_filename:
-            dot.render(output_filename, format=output_format, cleanup=True)
-            print(f"Graph rendered to {output_filename}.{output_format}")
-        else:
+        if not output_filename:
             return dot.source
+        dot.render(output_filename, format=output_format, cleanup=True)
+        print(f"Graph rendered to {output_filename}.{output_format}")
+        return None
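Both edits above replace a two-line assign-then-test with the walrus operator. A tiny sketch of the resulting control flow (hypothetical handler table, not the blueprint API):

```python
handlers = {"start": lambda: "started"}


def resolve(state: str):
    # := binds and tests in one expression; dict.get returns None on a miss.
    if handler := handlers.get(state):
        return handler
    raise ValueError(f"No suitable handler found for state '{state}'")


print(resolve("start")())  # -> "started"
```

One caveat of the idiom: a value stored under the key that happens to be falsy would be treated as missing, which is harmless when the values are functions.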
avtomatika/client_config_loader.py
CHANGED
@@ -26,25 +26,37 @@ async def load_client_configs_to_redis(
             config_path,
         )
         return
+    except Exception as e:
+        logger.error(f"Failed to parse client config file '{config_path}': {e}")
+        raise ValueError(f"Invalid client configuration file: {e}") from e
 
     loaded_count = 0
     for client_name, config in clients_data.items():
+        if not isinstance(config, dict):
+            logger.error(f"Client '{client_name}' configuration must be a table (dict).")
+            raise ValueError(f"Invalid configuration for client '{client_name}'")
+
         token = config.get("token")
         if not token:
-            logger.
-
-
-
-
+            logger.error(f"Client '{client_name}' is missing required 'token' field.")
+            raise ValueError(f"Missing token for client '{client_name}'")
+
+        if not isinstance(token, str):
+            logger.error(f"Token for client '{client_name}' must be a string.")
+            raise ValueError(f"Invalid token type for client '{client_name}'")
 
         # Separate static config from dynamic quota values
         static_config = {k: v for k, v in config.items() if k != "monthly_attempts"}
         quota = config.get("monthly_attempts")
 
+        if quota is not None and not isinstance(quota, int):
+            logger.error(f"Quota 'monthly_attempts' for client '{client_name}' must be an integer.")
+            raise ValueError(f"Invalid quota type for client '{client_name}'")
+
         try:
             # Assume these storage methods will be implemented
             await storage.save_client_config(token, static_config)
-            if quota is not None
+            if quota is not None:
                 await storage.initialize_client_quota(token, quota)
 
             loaded_count += 1
avtomatika/dispatcher.py
CHANGED
@@ -28,15 +28,13 @@ class Dispatcher:
         self.config = config
         self._round_robin_indices: Dict[str, int] = defaultdict(int)
 
+    @staticmethod
     def _is_worker_compliant(
-        self,
         worker: Dict[str, Any],
         requirements: Dict[str, Any],
     ) -> bool:
         """Checks if a worker meets the specified resource requirements."""
-
-        required_gpu = requirements.get("gpu_info")
-        if required_gpu:
+        if required_gpu := requirements.get("gpu_info"):
             gpu_info = worker.get("resources", {}).get("gpu_info")
             if not gpu_info:
                 return False
@@ -51,17 +49,15 @@ class Dispatcher:
         ):
             return False
 
-
-        required_models = requirements.get("installed_models")
-        if required_models:
+        if required_models := requirements.get("installed_models"):
             installed_models = {m["name"] for m in worker.get("installed_models", [])}
             if not set(required_models).issubset(installed_models):
                 return False
 
         return True
 
+    @staticmethod
     def _select_default(
-        self,
         workers: List[Dict[str, Any]],
         task_type: str,
     ) -> Dict[str, Any]:
@@ -74,7 +70,7 @@ class Dispatcher:
         """
         warm_workers = [w for w in workers if task_type in w.get("hot_cache", [])]
 
-        target_pool = warm_workers
+        target_pool = warm_workers or workers
 
         # The `cost` field is deprecated but maintained for backward compatibility.
         min_cost = min(w.get("cost", float("inf")) for w in target_pool)
@@ -95,8 +91,8 @@ class Dispatcher:
             self._round_robin_indices[task_type] = idx + 1
         return selected_worker
 
+    @staticmethod
     def _select_least_connections(
-        self,
         workers: List[Dict[str, Any]],
         task_type: str,
     ) -> Dict[str, Any]:
@@ -105,15 +101,16 @@ class Dispatcher:
         """
         return min(workers, key=lambda w: w.get("load", 0.0))
 
+    @staticmethod
     def _select_cheapest(
-        self,
         workers: List[Dict[str, Any]],
         task_type: str,
     ) -> Dict[str, Any]:
         """Selects the cheapest worker based on 'cost_per_second'."""
         return min(workers, key=lambda w: w.get("cost_per_second", float("inf")))
 
-
+    @staticmethod
+    def _get_best_value_score(worker: Dict[str, Any]) -> float:
         """Calculates a "score" for a worker using the formula cost / reputation.
         The lower the score, the better.
         """
@@ -121,9 +118,7 @@ class Dispatcher:
         # Default reputation is 1.0 if absent
         reputation = worker.get("reputation", 1.0)
         # Avoid division by zero
-        if reputation == 0:
-            return float("inf")
-        return cost / reputation
+        return float("inf") if reputation == 0 else cost / reputation
 
     def _select_best_value(
         self,
@@ -153,10 +148,9 @@ class Dispatcher:
         idle_workers = [w for w in all_workers if w.get("status", "idle") == "idle"]
         logger.debug(f"Idle workers: {[w['worker_id'] for w in idle_workers]}")
         if not idle_workers:
-
-
-
-            if busy_mo_workers:
+            if busy_mo_workers := [
+                w for w in all_workers if w.get("status") == "busy" and "multi_orchestrator_info" in w
+            ]:
                 logger.warning(
                     f"No idle workers. Found {len(busy_mo_workers)} busy workers "
                     f"in multi-orchestrator mode. They are likely performing tasks for other Orchestrators.",
avtomatika/engine.py
CHANGED
@@ -485,8 +485,7 @@ class OrchestratorEngine:
             await self.storage.save_job_state(job_id, job_state)
             # Optionally, trigger a specific 'cancelled' transition if defined in the blueprint
             transitions = job_state.get("current_task_transitions", {})
-            next_state = transitions.get("cancelled")
-            if next_state:
+            if next_state := transitions.get("cancelled"):
                 job_state["current_state"] = next_state
                 job_state["status"] = "running"  # It's running the cancellation handler now
                 await self.storage.save_job_state(job_id, job_state)
@@ -494,9 +493,7 @@ class OrchestratorEngine:
             return web.json_response({"status": "result_accepted_cancelled"}, status=200)
 
         transitions = job_state.get("current_task_transitions", {})
-        next_state = transitions.get(result_status)
-
-        if next_state:
+        if next_state := transitions.get(result_status):
             logging.info(f"Job {job_id} transitioning based on worker status '{result_status}' to state '{next_state}'")
 
             worker_data = result.get("data")
@@ -603,15 +600,52 @@ class OrchestratorEngine:
         return web.json_response({"status": "db_flushed"}, status=200)
 
     async def _docs_handler(self, request: web.Request) -> web.Response:
+        import json
         from importlib import resources
 
         try:
            content = resources.read_text("avtomatika", "api.html")
-            return web.Response(text=content, content_type="text/html")
         except FileNotFoundError:
            logger.error("api.html not found within the avtomatika package.")
            return web.json_response({"error": "Documentation file not found on server."}, status=500)
 
+        # Generate dynamic documentation for registered blueprints
+        blueprint_endpoints = []
+        for bp in self.blueprints.values():
+            if not bp.api_endpoint:
+                continue
+
+            version_prefix = f"/{bp.api_version}" if bp.api_version else ""
+            endpoint_path = bp.api_endpoint if bp.api_endpoint.startswith("/") else f"/{bp.api_endpoint}"
+            full_path = f"/api{version_prefix}{endpoint_path}"
+
+            blueprint_endpoints.append(
+                {
+                    "id": f"post-create-{bp.name.replace('_', '-')}",
+                    "name": f"Create {bp.name.replace('_', ' ').title()} Job",
+                    "method": "POST",
+                    "path": full_path,
+                    "description": f"Creates and starts a new instance (Job) of the `{bp.name}` blueprint.",
+                    "request": {"body": {"initial_data": {}}},
+                    "responses": [
+                        {
+                            "code": "202 Accepted",
+                            "description": "Job successfully accepted for processing.",
+                            "body": {"status": "accepted", "job_id": "..."},
+                        }
+                    ],
+                }
+            )
+
+        # Inject dynamic endpoints into the apiData structure in the HTML
+        if blueprint_endpoints:
+            endpoints_json = json.dumps(blueprint_endpoints, indent=2)
+            # We insert the new endpoints at the beginning of the 'Protected API' group
+            marker = "group: 'Protected API',\n  endpoints: ["
+            content = content.replace(marker, f"{marker}\n{endpoints_json.strip('[]')},")
+
+        return web.Response(text=content, content_type="text/html")
+
     def _setup_routes(self):
         public_app = web.Application()
         public_app.router.add_get("/status", status_handler)
@@ -647,16 +681,7 @@ class OrchestratorEngine:
         all_protected_apps.append(protected_app)
 
         for app in all_protected_apps:
-            app.router.add_get("/jobs/{job_id}", self._get_job_status_handler)
-            app.router.add_post("/jobs/{job_id}/cancel", self._cancel_job_handler)
-            if not isinstance(self.history_storage, NoOpHistoryStorage):
-                app.router.add_get("/jobs/{job_id}/history", self._get_job_history_handler)
-            app.router.add_get("/blueprints/{blueprint_name}/graph", self._get_blueprint_graph_handler)
-            app.router.add_get("/workers", self._get_workers_handler)
-            app.router.add_get("/jobs", self._get_jobs_handler)
-            app.router.add_get("/dashboard", self._get_dashboard_handler)
-            app.router.add_post("/admin/reload-workers", self._reload_worker_configs_handler)
-
+            self._register_common_routes(app)
         if has_unversioned_routes:
             self.app.add_subapp("/api/", protected_app)
         for version, app in versioned_apps.items():
@@ -676,6 +701,17 @@ class OrchestratorEngine:
         worker_app.router.add_get("/ws/{worker_id}", self._websocket_handler)
         self.app.add_subapp("/_worker/", worker_app)
 
+    def _register_common_routes(self, app):
+        app.router.add_get("/jobs/{job_id}", self._get_job_status_handler)
+        app.router.add_post("/jobs/{job_id}/cancel", self._cancel_job_handler)
+        if not isinstance(self.history_storage, NoOpHistoryStorage):
+            app.router.add_get("/jobs/{job_id}/history", self._get_job_history_handler)
+        app.router.add_get("/blueprints/{blueprint_name}/graph", self._get_blueprint_graph_handler)
+        app.router.add_get("/workers", self._get_workers_handler)
+        app.router.add_get("/jobs", self._get_jobs_handler)
+        app.router.add_get("/dashboard", self._get_dashboard_handler)
+        app.router.add_post("/admin/reload-workers", self._reload_worker_configs_handler)
+
     async def _websocket_handler(self, request: web.Request) -> web.WebSocketResponse:
         worker_id = request.match_info.get("worker_id")
         if not worker_id:
avtomatika/executor.py
CHANGED
@@ -35,11 +35,13 @@ except ImportError:
         def inject(self, *args, **kwargs):
             pass
 
-        def extract(self, *args, **kwargs):
+        @staticmethod
+        def extract(*args, **kwargs):
             return None
 
     class NoOpTraceContextTextMapPropagator:
-        def extract(self, *args, **kwargs):
+        @staticmethod
+        def extract(*args, **kwargs):
             return None
 
     trace = NoOpTracer()
@@ -485,7 +487,8 @@ class JobExecutor:
         await self.storage.save_job_state(parent_job_id, parent_job_state)
         await self.storage.enqueue_job(parent_job_id)
 
-    def _handle_task_completion(self, task: Task):
+    @staticmethod
+    def _handle_task_completion(task: Task):
         """Callback to handle completion of a job processing task."""
         try:
             # This will re-raise any exception caught in the task
avtomatika/ratelimit.py
CHANGED
@@ -1,3 +1,4 @@
+from contextlib import suppress
 from typing import Awaitable, Callable
 
 from aiohttp import web
@@ -23,23 +24,15 @@ def rate_limit_middleware_factory(
         """Rate-limiting middleware that uses the provided storage backend."""
         # Determine the key for rate limiting (e.g., by worker_id or IP)
         # For worker endpoints, we key by worker_id. For others, by IP.
-        key_identifier = request.match_info.get("worker_id", request.remote)
-        if not key_identifier:
-            # Fallback for cases where remote IP might not be available
-            key_identifier = "unknown"
+        key_identifier = request.match_info.get("worker_id", request.remote) or "unknown"
 
         # Key by identifier and path to have per-endpoint limits
        rate_limit_key = f"ratelimit:{key_identifier}:{request.path}"
 
-        try:
+        with suppress(Exception):
             count = await storage.increment_key_with_ttl(rate_limit_key, period)
             if count > limit:
                 return web.json_response({"error": "Too Many Requests"}, status=429)
-        except Exception:
-            # If the rate limiter fails for any reason (e.g., Redis down),
-            # it's safer to let the request through than to block everything.
-            pass
-
         return await handler(request)
 
     return rate_limit_middleware
avtomatika/reputation.py
CHANGED
@@ -1,6 +1,7 @@
 from asyncio import CancelledError, sleep
 from logging import getLogger
 from typing import TYPE_CHECKING
+from uuid import uuid4
 
 if TYPE_CHECKING:
     from .engine import OrchestratorEngine
@@ -20,14 +21,22 @@ class ReputationCalculator:
         self.history_storage = engine.history_storage
         self.interval_seconds = interval_seconds
         self._running = False
+        self._instance_id = str(uuid4())
 
     async def run(self):
         """The main loop that periodically triggers reputation recalculation."""
-        logger.info("ReputationCalculator started.")
+        logger.info(f"ReputationCalculator started (Instance ID: {self._instance_id}).")
         self._running = True
         while self._running:
             try:
-                await self.calculate_all_reputations()
+                # Attempt to acquire lock
+                if await self.storage.acquire_lock("global_reputation_lock", self._instance_id, 300):
+                    try:
+                        await self.calculate_all_reputations()
+                    finally:
+                        await self.storage.release_lock("global_reputation_lock", self._instance_id)
+                else:
+                    logger.debug("ReputationCalculator lock held by another instance. Skipping.")
             except CancelledError:
                 break
             except Exception:
avtomatika/storage/__init__.py
CHANGED
@@ -1,11 +1,11 @@
+import contextlib
+
 from .base import StorageBackend
 from .memory import MemoryStorage
 
 __all__ = ["StorageBackend", "MemoryStorage"]
 
-try:
+with contextlib.suppress(ImportError):
     from .redis import RedisStorage  # noqa: F401
 
     __all__.append("RedisStorage")
-except ImportError:
-    pass
avtomatika/storage/base.py
CHANGED
@@ -264,3 +264,26 @@ class StorageBackend(ABC):
         Used for metrics.
         """
         raise NotImplementedError
+
+    @abstractmethod
+    async def acquire_lock(self, key: str, holder_id: str, ttl: int) -> bool:
+        """
+        Attempts to acquire a distributed lock.
+
+        :param key: The unique key of the lock (e.g., 'watcher_lock').
+        :param holder_id: A unique identifier for the caller (e.g., UUID).
+        :param ttl: Time-to-live for the lock in seconds.
+        :return: True if the lock was acquired, False otherwise.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    async def release_lock(self, key: str, holder_id: str) -> bool:
+        """
+        Releases a distributed lock if it is held by the specified holder_id.
+
+        :param key: The unique key of the lock.
+        :param holder_id: The identifier of the caller who presumably holds the lock.
+        :return: True if the lock was successfully released, False otherwise.
+        """
+        raise NotImplementedError
avtomatika/storage/memory.py
CHANGED
@@ -25,6 +25,7 @@ class MemoryStorage(StorageBackend):
         self._worker_tokens: Dict[str, str] = {}
         self._generic_keys: Dict[str, Any] = {}
         self._generic_key_ttls: Dict[str, float] = {}
+        self._locks: Dict[str, tuple[str, float]] = {}  # key -> (holder_id, expiry_time)
 
         self._lock = Lock()
 
@@ -128,9 +129,11 @@ class MemoryStorage(StorageBackend):
         async with self._lock:
             now = monotonic()
             active_workers = []
-            for worker_id, worker_info in self._workers.items():
-                if self._worker_ttls.get(worker_id, 0) > now:
-                    active_workers.append(worker_info)
+            active_workers.extend(
+                worker_info
+                for worker_id, worker_info in self._workers.items()
+                if self._worker_ttls.get(worker_id, 0) > now
+            )
             return active_workers
 
     async def add_job_to_watch(self, job_id: str, timeout_at: float) -> None:
@@ -226,6 +229,7 @@ class MemoryStorage(StorageBackend):
             self._quotas.clear()
             self._generic_keys.clear()
             self._generic_key_ttls.clear()
+            self._locks.clear()
 
     async def get_job_queue_length(self) -> int:
         # No lock needed for asyncio.Queue.qsize()
@@ -234,13 +238,9 @@ class MemoryStorage(StorageBackend):
     async def get_active_worker_count(self) -> int:
         async with self._lock:
             now = monotonic()
-            count = 0
             # Create a copy of keys to avoid issues with concurrent modifications
             worker_ids = list(self._workers.keys())
-            for worker_id in worker_ids:
-                if self._worker_ttls.get(worker_id, 0) > now:
-                    count += 1
-            return count
+            return sum(self._worker_ttls.get(worker_id, 0) > now for worker_id in worker_ids)
 
     async def get_worker_info(self, worker_id: str) -> Optional[Dict[str, Any]]:
         async with self._lock:
@@ -273,3 +273,29 @@ class MemoryStorage(StorageBackend):
                 "average_bid": 0,
                 "error": "Statistics are not supported for MemoryStorage backend.",
             }
+
+    async def acquire_lock(self, key: str, holder_id: str, ttl: int) -> bool:
+        async with self._lock:
+            now = monotonic()
+            current_lock = self._locks.get(key)
+
+            # If the lock exists and hasn't expired, it is held: refuse.
+            # (Strict locking: even the current holder must wait or release first.)
+            if current_lock and current_lock[1] > now:
+                return False
+
+            # Acquire lock
+            self._locks[key] = (holder_id, now + ttl)
+            return True
+
+    async def release_lock(self, key: str, holder_id: str) -> bool:
+        async with self._lock:
+            current_lock = self._locks.get(key)
+            if current_lock:
+                owner, expiry = current_lock
+                # Only release if we are the owner
+                if owner == holder_id:
+                    del self._locks[key]
+                    return True
+            return False
avtomatika/storage/redis.py
CHANGED
@@ -25,9 +25,7 @@ class RedisStorage(StorageBackend):
         """Get the job state from Redis."""
         key = self._get_key(job_id)
         data = await self._redis.get(key)
-        if data:
-            return loads(data)
-        return None
+        return loads(data) if data else None
 
     async def get_priority_queue_stats(self, task_type: str) -> Dict[str, Any]:
         """Gets statistics for the priority queue (Sorted Set) for a given task type."""
@@ -74,7 +72,7 @@ class RedisStorage(StorageBackend):
         self,
         job_id: str,
         update_data: Dict[str, Any],
-    ) ->
+    ) -> dict[Any, Any] | None | Any:
         """Atomically update the job state in Redis using a transaction."""
         key = self._get_key(job_id)
 
@@ -134,10 +132,7 @@ class RedisStorage(StorageBackend):
         try:
             # BZPOPMAX returns a tuple (key, member, score)
             result = await self._redis.bzpopmax([key], timeout=timeout)
-            if result:
-                # result[1] contains the element (task data) in bytes
-                return loads(result[1])
-            return None
+            return loads(result[1]) if result else None
         except CancelledError:
             return None
         except ResponseError as e:
@@ -228,18 +223,13 @@ class RedisStorage(StorageBackend):
 
     async def get_available_workers(self) -> list[dict[str, Any]]:
         """Gets a list of active workers by scanning keys in Redis."""
-        workers = []
         worker_keys = [key async for key in self._redis.scan_iter("orchestrator:worker:info:*")]  # type: ignore[attr-defined]
 
         if not worker_keys:
             return []
 
         worker_data_list = await self._redis.mget(worker_keys)
-        for data in worker_data_list:
-            if data:
-                workers.append(loads(data))
-
-        return workers
+        return [loads(data) for data in worker_data_list if data]
 
     async def add_job_to_watch(self, job_id: str, timeout_at: float) -> None:
         """Adds a job to a Redis sorted set.
@@ -277,9 +267,7 @@ class RedisStorage(StorageBackend):
         try:
             # Lock for 1 second so that the while loop in the executor is not too tight
             result = await self._redis.brpop(["orchestrator:job_queue"], timeout=1)  # type: ignore[misc]
-            if result:
-                return result[1].decode("utf-8")
-            return None
+            return result[1].decode("utf-8") if result else None
         except CancelledError:
             return None
 
@@ -407,6 +395,35 @@ class RedisStorage(StorageBackend):
         """Gets the full info for a worker by its ID."""
         key = f"orchestrator:worker:info:{worker_id}"
         data = await self._redis.get(key)
-        if data:
-            return loads(data)
-        return None
+        return loads(data) if data else None
+
+    async def acquire_lock(self, key: str, holder_id: str, ttl: int) -> bool:
+        """Attempts to acquire a lock using Redis SET NX."""
+        redis_key = f"orchestrator:lock:{key}"
+        # Returns True if set was successful (key didn't exist), None otherwise
+        result = await self._redis.set(redis_key, holder_id, nx=True, ex=ttl)
+        return bool(result)
+
+    async def release_lock(self, key: str, holder_id: str) -> bool:
+        """Releases the lock using a Lua script to ensure ownership."""
+        redis_key = f"orchestrator:lock:{key}"
+
+        LUA_RELEASE_SCRIPT = """
+        if redis.call("get", KEYS[1]) == ARGV[1] then
+            return redis.call("del", KEYS[1])
+        else
+            return 0
+        end
+        """
+        try:
+            result = await self._redis.eval(LUA_RELEASE_SCRIPT, 1, redis_key, holder_id)
+            return bool(result)
+        except ResponseError as e:
+            # Fallback for fakeredis if needed, though fakeredis usually supports eval
+            if "unknown command" in str(e):
+                current_val = await self._redis.get(redis_key)
+                if current_val and current_val.decode("utf-8") == holder_id:
+                    await self._redis.delete(redis_key)
+                    return True
+                return False
+            raise e
avtomatika/telemetry.py
CHANGED
@@ -27,7 +27,8 @@ except ImportError:
         pass
 
     class DummyTracer:
-        def start_as_current_span(self, name, context=None):
+        @staticmethod
+        def start_as_current_span(name, context=None):
             return DummySpan()
 
     class NoOpTrace:
@@ -46,8 +47,7 @@ def setup_telemetry(service_name: str = "avtomatika"):
     resource = Resource(attributes={"service.name": service_name})
     provider = TracerProvider(resource=resource)
 
-    otlp_endpoint = getenv("OTEL_EXPORTER_OTLP_ENDPOINT")
-    if otlp_endpoint:
+    if otlp_endpoint := getenv("OTEL_EXPORTER_OTLP_ENDPOINT"):
         logger.info(f"OTLP exporter enabled, sending traces to {otlp_endpoint}")
         try:
             from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import (
avtomatika/watcher.py
CHANGED
@@ -1,6 +1,7 @@
 from asyncio import CancelledError, sleep
 from logging import getLogger
 from typing import TYPE_CHECKING
+from uuid import uuid4
 
 if TYPE_CHECKING:
     from .engine import OrchestratorEngine
@@ -17,43 +18,56 @@ class Watcher:
         self.config = engine.config
         self._running = False
         self.watch_interval_seconds = self.config.WATCHER_INTERVAL_SECONDS
+        self._instance_id = str(uuid4())
 
     async def run(self):
         """The main loop of the watcher."""
-        logger.info("Watcher started.")
+        logger.info(f"Watcher started (Instance ID: {self._instance_id}).")
         self._running = True
         while self._running:
             try:
                 await sleep(self.watch_interval_seconds)
-                logger.info("Watcher running check for timed out jobs...")
 
+                # Attempt to acquire distributed lock
+                # We set TTL slightly longer than the expected execution time,
+                # but shorter than the interval if possible.
+                # Actually, a fixed TTL like 60s is fine as long as we release it.
+                if not await self.storage.acquire_lock("global_watcher_lock", self._instance_id, 60):
+                    logger.debug("Watcher lock held by another instance. Skipping check.")
+                    continue
 
-                logger.
-
-                # Get the latest version to avoid overwriting
-                job_state = await self.storage.get_job_state(job_id)
-                if job_state and job_state["status"] == "waiting_for_worker":
-                    job_state["status"] = "failed"
-                    job_state["error_message"] = "Worker task timed out."
-                    await self.storage.save_job_state(job_id, job_state)
+                try:
+                    logger.info("Watcher running check for timed out jobs...")
+                    timed_out_job_ids = await self.storage.get_timed_out_jobs()
 
-
-
+                    for job_id in timed_out_job_ids:
+                        logger.warning(f"Job {job_id} timed out. Moving to failed state.")
+                        try:
+                            # Get the latest version to avoid overwriting
+                            job_state = await self.storage.get_job_state(job_id)
+                            if job_state and job_state["status"] == "waiting_for_worker":
+                                job_state["status"] = "failed"
+                                job_state["error_message"] = "Worker task timed out."
+                                await self.storage.save_job_state(job_id, job_state)
 
-
-
-
-
-
-
-
+                            # Increment the metric
+                            from . import metrics
+
+                            metrics.jobs_failed_total.inc(
+                                {
+                                    metrics.LABEL_BLUEPRINT: job_state.get(
+                                        "blueprint_name",
+                                        "unknown",
+                                    ),
+                                },
+                            )
+                        except Exception:
+                            logger.exception(
+                                f"Failed to update state for timed out job {job_id}",
                             )
-
-
-
-            )
+                finally:
+                    # Always release the lock so we (or others) can run next time
+                    await self.storage.release_lock("global_watcher_lock", self._instance_id)
 
             except CancelledError:
                 logger.info("Watcher received cancellation request.")
avtomatika/worker_config_loader.py
CHANGED
@@ -28,14 +28,19 @@ async def load_worker_configs_to_redis(storage: StorageBackend, config_path: str
             workers_config: Dict[str, Any] = load(f)
     except Exception as e:
         logger.error(f"Failed to load or parse worker config file '{config_path}': {e}")
-        return
+        raise ValueError(f"Invalid worker configuration file: {e}") from e
 
     for worker_id, config in workers_config.items():
+        if not isinstance(config, dict):
+            logger.error(f"Worker '{worker_id}' configuration must be a table.")
+            raise ValueError(f"Invalid configuration for worker '{worker_id}'")
+
         token = config.get("token")
         if not token:
             logger.warning(f"No token found for worker_id '{worker_id}' in {config_path}. Skipping.")
+            # Skipping might be safer here if we want to allow partial configs, but strict is better.
+            # Let's keep existing skip logic but log error? No, let's allow skip if user really wants.
             continue
-
         try:
             # Hash the token before storing it
             hashed_token = sha256(token.encode()).hexdigest()
avtomatika/ws_manager.py
CHANGED
@@ -46,7 +46,8 @@ class WebSocketManager:
         logger.warning(f"Cannot send command: No active WebSocket connection for worker {worker_id}.")
         return False
 
-    async def handle_message(self, worker_id: str, message: dict):
+    @staticmethod
+    async def handle_message(worker_id: str, message: dict):
         """Handles an incoming message from a worker."""
         event_type = message.get("event")
         if event_type == "progress_update":
{avtomatika-1.0b2.dist-info → avtomatika-1.0b4.dist-info}/METADATA
CHANGED
@@ -1,7 +1,7 @@
 Metadata-Version: 2.4
 Name: avtomatika
-Version: 1.0b2
-Summary: A state-machine based orchestrator for long-running jobs.
+Version: 1.0b4
+Summary: A state-machine based orchestrator for long-running AI and other jobs.
 Project-URL: Homepage, https://github.com/avtomatika-ai/avtomatika
 Project-URL: Bug Tracker, https://github.com/avtomatika-ai/avtomatika/issues
 Classifier: Development Status :: 4 - Beta
@@ -18,25 +18,25 @@ Requires-Dist: graphviz~=0.21
 Requires-Dist: zstandard~=0.24
 Requires-Dist: aioprometheus~=23.12
 Provides-Extra: redis
-Requires-Dist: redis~=
+Requires-Dist: redis~=7.1; extra == "redis"
 Requires-Dist: orjson~=3.11; extra == "redis"
 Provides-Extra: history
-Requires-Dist: aiosqlite~=0.
+Requires-Dist: aiosqlite~=0.22; extra == "history"
 Requires-Dist: asyncpg~=0.30; extra == "history"
 Requires-Dist: orjson~=3.11; extra == "history"
 Provides-Extra: telemetry
-Requires-Dist: opentelemetry-api~=1.
-Requires-Dist: opentelemetry-sdk~=1.
-Requires-Dist: opentelemetry-exporter-otlp~=1.
+Requires-Dist: opentelemetry-api~=1.39; extra == "telemetry"
+Requires-Dist: opentelemetry-sdk~=1.39; extra == "telemetry"
+Requires-Dist: opentelemetry-exporter-otlp~=1.39; extra == "telemetry"
 Requires-Dist: opentelemetry-instrumentation-aiohttp-client~=0.59b0; extra == "telemetry"
 Provides-Extra: test
-Requires-Dist: pytest~=
+Requires-Dist: pytest~=9.0; extra == "test"
 Requires-Dist: pytest-asyncio~=1.1; extra == "test"
-Requires-Dist: fakeredis~=2.
+Requires-Dist: fakeredis~=2.33; extra == "test"
 Requires-Dist: pytest-aiohttp~=1.1; extra == "test"
 Requires-Dist: pytest-mock~=3.14; extra == "test"
 Requires-Dist: aioresponses~=0.7; extra == "test"
-Requires-Dist: backports.zstd; extra == "test"
+Requires-Dist: backports.zstd~=1.2; extra == "test"
 Requires-Dist: opentelemetry-instrumentation-aiohttp-client; extra == "test"
 Provides-Extra: all
 Requires-Dist: avtomatika[redis]; extra == "all"
@@ -285,7 +285,7 @@ Run multiple tasks simultaneously and gather their results.
 @my_blueprint.handler_for("process_files")
 async def fan_out_handler(initial_data, actions):
     tasks_to_dispatch = [
-        {"task_type": "file_analysis", "params": {"file": file}}
+        {"task_type": "file_analysis", "params": {"file": file}})
         for file in initial_data.get("files", [])
     ]
     # Use dispatch_parallel to send all tasks at once.
@@ -332,6 +332,26 @@ async def cache_handler(data_stores):
 
 The orchestrator's behavior can be configured through environment variables. Additionally, any configuration parameter loaded from environment variables can be programmatically overridden in your application code after the `Config` object has been initialized. This provides flexibility for different deployment and testing scenarios.
 
+**Important:** The system employs **strict validation** for configuration files (`clients.toml`, `workers.toml`) at startup. If a configuration file is invalid (e.g., malformed TOML, missing required fields), the application will **fail fast** and exit with an error, rather than starting in a partially broken state. This ensures the security and integrity of the deployment.
+
+### Configuration Files
+
+To manage access and worker settings securely, Avtomatika uses TOML configuration files.
+
+- **`clients.toml`**: Defines API clients, their tokens, plans, and quotas.
+  ```toml
+  [client_premium]
+  token = "secret-token-123"
+  plan = "premium"
+  ```
+- **`workers.toml`**: Defines individual tokens for workers to enhance security.
+  ```toml
+  [gpu-worker-01]
+  token = "worker-secret-456"
+  ```
+
+For detailed specifications and examples, please refer to the [**Configuration Guide**](docs/configuration.md).
+
 ### Fault Tolerance
 
 The orchestrator has built-in mechanisms for handling failures based on the `error.code` field in a worker's response.
@@ -340,6 +360,13 @@ The orchestrator has built-in mechanisms for handling failures based on the `error.code` field in a worker's response.
 * **PERMANENT_ERROR**: A permanent error (e.g., a corrupted file). The task will be immediately sent to quarantine for manual investigation.
 * **INVALID_INPUT_ERROR**: An error in the input data. The entire pipeline (Job) will be immediately moved to the failed state.
 
+### High Availability & Distributed Locking
+
+The architecture supports horizontal scaling. Multiple Orchestrator instances can run behind a load balancer.
+
+* **Stateless API:** The API is stateless; all state is persisted in Redis.
+* **Distributed Locking:** Background processes (`Watcher`, `ReputationCalculator`) use distributed locks (via Redis `SET NX`) to coordinate and prevent race conditions when multiple instances are active.
+
 ### Storage Backend
 
 By default, the engine uses in-memory storage. For production, you must configure persistent storage via environment variables.
@@ -408,3 +435,22 @@ To run the `avtomatika` test suite:
 ```bash
 pytest avtomatika/tests/
 ```
+
+### Interactive API Documentation
+
+Avtomatika provides a built-in interactive API documentation page (similar to Swagger UI) that is automatically generated based on your registered blueprints.
+
+* **Endpoint:** `/_public/docs`
+* **Features:**
+  * **List of all system endpoints:** Detailed documentation for Public, Protected, and Worker API groups.
+  * **Dynamic Blueprint Documentation:** Automatically generates and lists documentation for all blueprints registered in the engine, including their specific API endpoints.
+  * **Interactive Testing:** Allows you to test API calls directly from the browser. You can provide authentication tokens, parameters, and request bodies to see real server responses.
+
+## Detailed Documentation
+
+For a deeper dive into the system, please refer to the following documents:
+
+- [**Architecture Guide**](https://github.com/avtomatika-ai/avtomatika/blob/main/docs/architecture.md): A detailed overview of the system components and their interactions.
+- [**API Reference**](https://github.com/avtomatika-ai/avtomatika/blob/main/docs/api_reference.md): Full specification of the HTTP API.
+- [**Deployment Guide**](https://github.com/avtomatika-ai/avtomatika/blob/main/docs/deployment.md): Instructions for deploying with Gunicorn/Uvicorn and NGINX.
+- [**Cookbook**](https://github.com/avtomatika-ai/avtomatika/blob/main/docs/cookbook/README.md): Examples and best practices for creating blueprints.
avtomatika-1.0b4.dist-info/RECORD
ADDED
@@ -0,0 +1,37 @@
+avtomatika/__init__.py,sha256=nlk59j7YcK1gapRUVfHjvFZVAD_PZoamgHEptchP3TA,698
+avtomatika/api.html,sha256=RLx-D1uFCSAXIf_2WgFlSTWrWPcmonNYM-9oNanKXBg,32835
+avtomatika/blueprint.py,sha256=Hx5h0upr_IYbCy1ebUTpXw4bnt5yYhgWtdPLVE1_h48,9403
+avtomatika/client_config_loader.py,sha256=zVVHZlxSqZUaNpZ4zoU0T1CFYXdxy-3vKSmPcaFuHSY,2772
+avtomatika/compression.py,sha256=bhA1kw4YrCR3I3kdquZSY0fAzCrRrjtz55uepzLUDKI,2498
+avtomatika/config.py,sha256=0vlMfVMjxwVUC8m_NglGocC_EoklzAc0qmt3UJbxm10,2087
+avtomatika/context.py,sha256=rnF09jqQGkaKlax8P5ku9USwijSm6dommDGZbeVrzLk,4295
+avtomatika/data_types.py,sha256=g-g5hPnCpzeATgOn5v7EvDm5ps314owFJD5iWJ6IPR0,1425
+avtomatika/datastore.py,sha256=ERMyiFYQpAhVYijxzTrrdm6jtIPFf4dngWIa0qod3Wc,551
+avtomatika/dispatcher.py,sha256=a_7DjJwSXbW-ZzqcjZG0ZXMYDD2JLZxpQRIzHOrjeow,9688
+avtomatika/engine.py,sha256=-y_gwj1YK_X3QZ6h02KntVep7vRtZWYf1RYFnXtUmP8,39213
+avtomatika/executor.py,sha256=JHwT2DR-Hbrb_-Le1-mVaXiiQ7z-PkMsuIYB9ciiVo0,21201
+avtomatika/health_checker.py,sha256=WXwvRJ-3cZC2Udc_ogsyIQp7VzcvJjq_IaqzkTdE0TE,1265
+avtomatika/logging_config.py,sha256=e0-eEEGHw1zz9ZshzXaxfavV0uZfamRNdcAeHnrgBYQ,1370
+avtomatika/metrics.py,sha256=7XDhr_xMJ9JpElpZmBG7R0ml7AMdAp9UYp_W-i7tyLg,1858
+avtomatika/py.typed,sha256=CT_L7gw2MLcQY-X0vs-xB5Vr0wzvGo7GuQYPI_qwJE8,65
+avtomatika/quota.py,sha256=DNcaL6k0J1REeP8sVqbY9FprY_3BSr2SxM2Vf4mEqdw,1612
+avtomatika/ratelimit.py,sha256=hFGW5oN9G6_W_jnHmopXW8bRjjzlvanY19MLghsNLE8,1306
+avtomatika/reputation.py,sha256=IHcaIAILWZftPPmXj5En28OSDNK7U8ivQ-w30zIF8fk,3748
+avtomatika/security.py,sha256=afj28O3xB20EmA75DAQCQm_QKzx_tX2Qv9zE9TlcFvM,4441
+avtomatika/telemetry.py,sha256=ZBt1_xJ36PzDSz-zdCXeNp58NiezUgbqvMctTG25PT0,2352
+avtomatika/watcher.py,sha256=IHaqSqp3XSGXjRY-LEeTG9BJpq2nqJSnmjY_Vdvk3jo,3493
+avtomatika/worker_config_loader.py,sha256=Ir8jbZ_07U8NAcu3r_EXM1jQvNpVEvHRP0k9vsq3mio,2255
+avtomatika/ws_manager.py,sha256=v3nz-w4AhoV_vqs3y8twXaMxm7s52wg2wzCMzTkPd8M,3081
+avtomatika/history/base.py,sha256=p0zItsdxFzd889LujV8py6GwK4CUfqAt8QL915mrT4k,1680
+avtomatika/history/noop.py,sha256=Hk5yJsS4S5G5A7NRRMEafIV_IFI9hddSwEvRg2Reh0M,982
+avtomatika/history/postgres.py,sha256=zanh_WktXM_C8ZPsYGiI1x4ScyHDNE1LVOeYiN72NdY,7685
+avtomatika/history/sqlite.py,sha256=hivl--uJ47MosrD6qhBwW3KYAQvVgbPNM6UYYatqFKM,8862
+avtomatika/storage/__init__.py,sha256=ygqv240XuYuHjU_2eci0J3FWoJLNSRpUFA2GzBrHMKg,259
+avtomatika/storage/base.py,sha256=BCC7uAQrko1UCwZo5kGF-0blwJiFcLCcT-pMnhYAxqY,10494
+avtomatika/storage/memory.py,sha256=7VhQO02SbYc65uDTOY9g43CVOgsodxzg-WYo0JGpUec,11387
+avtomatika/storage/redis.py,sha256=kgNUJuwcxQvCzul0m5COKhDnfJGKReMNeWxtG_BGfLc,18171
+avtomatika-1.0b4.dist-info/licenses/LICENSE,sha256=tqCjw9Y1vbU-hLcWi__7wQstLbt2T1XWPdbQYqCxuWY,1072
+avtomatika-1.0b4.dist-info/METADATA,sha256=o_4U54i5frChX81Jyw6SZcg18RCxloSj5uQ4A8dwilQ,20927
+avtomatika-1.0b4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+avtomatika-1.0b4.dist-info/top_level.txt,sha256=gLDWhA_wxHj0I6fG5X8vw9fE0HSN4hTE2dEJzeVS2x8,11
+avtomatika-1.0b4.dist-info/RECORD,,
avtomatika-1.0b2.dist-info/RECORD
DELETED
@@ -1,37 +0,0 @@
-avtomatika/__init__.py,sha256=BtYVoi9xy3yuMDMmzvLTO3AfbL2xeEjVz2bP9GhQeHU,675
-avtomatika/api.html,sha256=Z-Ikqrle7YPXagx2D-C5ylVZicLQFSsIzPsHCQgqMHM,33628
-avtomatika/blueprint.py,sha256=RvDidn2xkziZOOnJlkV30SMp59GuscASz11GolxK7fE,9445
-avtomatika/client_config_loader.py,sha256=N41juqx9CF2XZ3-WRXspkLG2eoGhj9LkWdaMCCCk33E,1962
-avtomatika/compression.py,sha256=bhA1kw4YrCR3I3kdquZSY0fAzCrRrjtz55uepzLUDKI,2498
-avtomatika/config.py,sha256=0vlMfVMjxwVUC8m_NglGocC_EoklzAc0qmt3UJbxm10,2087
-avtomatika/context.py,sha256=rnF09jqQGkaKlax8P5ku9USwijSm6dommDGZbeVrzLk,4295
-avtomatika/data_types.py,sha256=g-g5hPnCpzeATgOn5v7EvDm5ps314owFJD5iWJ6IPR0,1425
-avtomatika/datastore.py,sha256=ERMyiFYQpAhVYijxzTrrdm6jtIPFf4dngWIa0qod3Wc,551
-avtomatika/dispatcher.py,sha256=2YW-A0QXD4T6RF3KiNHflGs8LeLNrMlo5NScE4T49-o,10015
-avtomatika/engine.py,sha256=IeBy7pnWjERZwTIAp02wcaKggRMi5Jof-02kB2LfAKc,37482
-avtomatika/executor.py,sha256=-gbLfluZfwtLnAYI8GdK1xswA_DYp_-yoLnP4AWlcyQ,21157
-avtomatika/health_checker.py,sha256=WXwvRJ-3cZC2Udc_ogsyIQp7VzcvJjq_IaqzkTdE0TE,1265
-avtomatika/logging_config.py,sha256=e0-eEEGHw1zz9ZshzXaxfavV0uZfamRNdcAeHnrgBYQ,1370
-avtomatika/metrics.py,sha256=7XDhr_xMJ9JpElpZmBG7R0ml7AMdAp9UYp_W-i7tyLg,1858
-avtomatika/py.typed,sha256=CT_L7gw2MLcQY-X0vs-xB5Vr0wzvGo7GuQYPI_qwJE8,65
-avtomatika/quota.py,sha256=DNcaL6k0J1REeP8sVqbY9FprY_3BSr2SxM2Vf4mEqdw,1612
-avtomatika/ratelimit.py,sha256=bZSmdCHviSGMVDNOKTBSswDl6P9Dc63BKLqKU50twpg,1579
-avtomatika/reputation.py,sha256=XFEqDW4TEqjrWgGB2KwnBBJF1RpwLlZPgFwxjPl2z8w,3221
-avtomatika/security.py,sha256=afj28O3xB20EmA75DAQCQm_QKzx_tX2Qv9zE9TlcFvM,4441
-avtomatika/telemetry.py,sha256=FIbbYjNX0JhpfT7UHjx6EdcFUGF1DpQDrbicIp0ZGvA,2353
-avtomatika/watcher.py,sha256=djqwgK8wu73LL2Rtxa6mV-EXimFO7T-iRQV5U_26dc8,2571
-avtomatika/worker_config_loader.py,sha256=2Fu2gU3O5H9VJ62D_Yc4gT19vV_be8DJZWzrTTovq8E,1788
-avtomatika/ws_manager.py,sha256=X420oV9_vnujV78VWuOVNqiMZAMQzfw6_0Ep694LJjQ,3069
-avtomatika/history/base.py,sha256=p0zItsdxFzd889LujV8py6GwK4CUfqAt8QL915mrT4k,1680
-avtomatika/history/noop.py,sha256=Hk5yJsS4S5G5A7NRRMEafIV_IFI9hddSwEvRg2Reh0M,982
-avtomatika/history/postgres.py,sha256=zanh_WktXM_C8ZPsYGiI1x4ScyHDNE1LVOeYiN72NdY,7685
-avtomatika/history/sqlite.py,sha256=hivl--uJ47MosrD6qhBwW3KYAQvVgbPNM6UYYatqFKM,8862
-avtomatika/storage/__init__.py,sha256=TFb3_Ab6K1tlGaBihMqhzwdwCarMG2uRpATDW372YNA,235
-avtomatika/storage/base.py,sha256=rcHbTB_0s4ZhkxIACnp9f6Ow7Jvr9pHx96t3VUf4lIc,9581
-avtomatika/storage/memory.py,sha256=Wqi879hw1cZBIhMDk_bKPaWhnH33QEl4aUIxu9g3M0M,10282
-avtomatika/storage/redis.py,sha256=eT65_tvRufy-txiKfSSs5UmKBeEio8aWG0UTBfj9DdY,17088
-avtomatika-1.0b2.dist-info/licenses/LICENSE,sha256=tqCjw9Y1vbU-hLcWi__7wQstLbt2T1XWPdbQYqCxuWY,1072
-avtomatika-1.0b2.dist-info/METADATA,sha256=IofXNi3-o-YDIfLKrBIT0mV5osHPvRyYX95b_9WI8dY,18083
-avtomatika-1.0b2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-avtomatika-1.0b2.dist-info/top_level.txt,sha256=gLDWhA_wxHj0I6fG5X8vw9fE0HSN4hTE2dEJzeVS2x8,11
-avtomatika-1.0b2.dist-info/RECORD,,
{avtomatika-1.0b2.dist-info → avtomatika-1.0b4.dist-info}/WHEEL
File without changes

{avtomatika-1.0b2.dist-info → avtomatika-1.0b4.dist-info}/licenses/LICENSE
File without changes

{avtomatika-1.0b2.dist-info → avtomatika-1.0b4.dist-info}/top_level.txt
File without changes