avtomatika 1.0b8__py3-none-any.whl → 1.0b10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- avtomatika/api/handlers.py +5 -257
- avtomatika/api/routes.py +42 -63
- avtomatika/api.html +1 -1
- avtomatika/app_keys.py +1 -0
- avtomatika/blueprint.py +3 -2
- avtomatika/config.py +8 -0
- avtomatika/constants.py +75 -25
- avtomatika/data_types.py +2 -22
- avtomatika/dispatcher.py +4 -0
- avtomatika/engine.py +119 -7
- avtomatika/executor.py +19 -19
- avtomatika/logging_config.py +16 -7
- avtomatika/s3.py +96 -40
- avtomatika/scheduler_config_loader.py +5 -2
- avtomatika/security.py +56 -74
- avtomatika/services/__init__.py +0 -0
- avtomatika/services/worker_service.py +267 -0
- avtomatika/storage/base.py +10 -0
- avtomatika/storage/memory.py +15 -4
- avtomatika/storage/redis.py +42 -11
- avtomatika/telemetry.py +8 -7
- avtomatika/utils/webhook_sender.py +3 -3
- avtomatika/watcher.py +4 -2
- avtomatika/ws_manager.py +16 -8
- {avtomatika-1.0b8.dist-info → avtomatika-1.0b10.dist-info}/METADATA +47 -15
- avtomatika-1.0b10.dist-info/RECORD +48 -0
- {avtomatika-1.0b8.dist-info → avtomatika-1.0b10.dist-info}/WHEEL +1 -1
- avtomatika-1.0b8.dist-info/RECORD +0 -46
- {avtomatika-1.0b8.dist-info → avtomatika-1.0b10.dist-info}/licenses/LICENSE +0 -0
- {avtomatika-1.0b8.dist-info → avtomatika-1.0b10.dist-info}/top_level.txt +0 -0
avtomatika/api/handlers.py
CHANGED
|
@@ -3,7 +3,7 @@ from logging import getLogger
|
|
|
3
3
|
from typing import Any, Callable
|
|
4
4
|
from uuid import uuid4
|
|
5
5
|
|
|
6
|
-
from aiohttp import WSMsgType, web
|
|
6
|
+
from aiohttp import web
|
|
7
7
|
from aioprometheus import render
|
|
8
8
|
from orjson import OPT_INDENT_2, dumps, loads
|
|
9
9
|
|
|
@@ -14,31 +14,22 @@ from ..app_keys import (
|
|
|
14
14
|
from ..blueprint import StateMachineBlueprint
|
|
15
15
|
from ..client_config_loader import load_client_configs_to_redis
|
|
16
16
|
from ..constants import (
|
|
17
|
-
|
|
18
|
-
ERROR_CODE_PERMANENT,
|
|
19
|
-
ERROR_CODE_TRANSIENT,
|
|
20
|
-
JOB_STATUS_CANCELLED,
|
|
21
|
-
JOB_STATUS_FAILED,
|
|
17
|
+
COMMAND_CANCEL_TASK,
|
|
22
18
|
JOB_STATUS_PENDING,
|
|
23
|
-
JOB_STATUS_QUARANTINED,
|
|
24
19
|
JOB_STATUS_RUNNING,
|
|
25
20
|
JOB_STATUS_WAITING_FOR_HUMAN,
|
|
26
|
-
JOB_STATUS_WAITING_FOR_PARALLEL,
|
|
27
21
|
JOB_STATUS_WAITING_FOR_WORKER,
|
|
28
|
-
TASK_STATUS_CANCELLED,
|
|
29
|
-
TASK_STATUS_FAILURE,
|
|
30
|
-
TASK_STATUS_SUCCESS,
|
|
31
22
|
)
|
|
32
23
|
from ..worker_config_loader import load_worker_configs_to_redis
|
|
33
24
|
|
|
34
25
|
logger = getLogger(__name__)
|
|
35
26
|
|
|
36
27
|
|
|
37
|
-
def json_dumps(obj) -> str:
|
|
28
|
+
def json_dumps(obj: Any) -> str:
|
|
38
29
|
return dumps(obj).decode("utf-8")
|
|
39
30
|
|
|
40
31
|
|
|
41
|
-
def json_response(data, **kwargs) -> web.Response:
|
|
32
|
+
def json_response(data: Any, **kwargs: Any) -> web.Response:
|
|
42
33
|
return web.json_response(data, dumps=json_dumps, **kwargs)
|
|
43
34
|
|
|
44
35
|
|
|
@@ -138,7 +129,7 @@ async def cancel_job_handler(request: web.Request) -> web.Response:
|
|
|
138
129
|
|
|
139
130
|
# Attempt WebSocket-based cancellation if supported
|
|
140
131
|
if worker_info and worker_info.get("capabilities", {}).get("websockets"):
|
|
141
|
-
command = {"command":
|
|
132
|
+
command = {"command": COMMAND_CANCEL_TASK, "task_id": task_id, "job_id": job_id}
|
|
142
133
|
sent = await engine.ws_manager.send_command(worker_id, command)
|
|
143
134
|
if sent:
|
|
144
135
|
return json_response({"status": "cancellation_request_sent"})
|
|
@@ -208,143 +199,6 @@ async def get_dashboard_handler(request: web.Request) -> web.Response:
|
|
|
208
199
|
return json_response(dashboard_data)
|
|
209
200
|
|
|
210
201
|
|
|
211
|
-
async def task_result_handler(request: web.Request) -> web.Response:
|
|
212
|
-
engine = request.app[ENGINE_KEY]
|
|
213
|
-
try:
|
|
214
|
-
data = await request.json(loads=loads)
|
|
215
|
-
job_id = data.get("job_id")
|
|
216
|
-
task_id = data.get("task_id")
|
|
217
|
-
result = data.get("result", {})
|
|
218
|
-
result_status = result.get("status", TASK_STATUS_SUCCESS)
|
|
219
|
-
error_message = result.get("error")
|
|
220
|
-
payload_worker_id = data.get("worker_id")
|
|
221
|
-
except Exception:
|
|
222
|
-
return json_response({"error": "Invalid JSON body"}, status=400)
|
|
223
|
-
|
|
224
|
-
# Security check: Ensure the worker_id from the payload matches the authenticated worker
|
|
225
|
-
authenticated_worker_id = request.get("worker_id")
|
|
226
|
-
if not authenticated_worker_id:
|
|
227
|
-
return json_response({"error": "Could not identify authenticated worker."}, status=500)
|
|
228
|
-
|
|
229
|
-
if payload_worker_id and payload_worker_id != authenticated_worker_id:
|
|
230
|
-
return json_response(
|
|
231
|
-
{
|
|
232
|
-
"error": f"Forbidden: Authenticated worker '{authenticated_worker_id}' "
|
|
233
|
-
f"cannot submit results for another worker '{payload_worker_id}'.",
|
|
234
|
-
},
|
|
235
|
-
status=403,
|
|
236
|
-
)
|
|
237
|
-
|
|
238
|
-
if not job_id or not task_id:
|
|
239
|
-
return json_response({"error": "job_id and task_id are required"}, status=400)
|
|
240
|
-
|
|
241
|
-
job_state = await engine.storage.get_job_state(job_id)
|
|
242
|
-
if not job_state:
|
|
243
|
-
return json_response({"error": "Job not found"}, status=404)
|
|
244
|
-
|
|
245
|
-
# Handle parallel task completion
|
|
246
|
-
if job_state.get("status") == JOB_STATUS_WAITING_FOR_PARALLEL:
|
|
247
|
-
await engine.storage.remove_job_from_watch(f"{job_id}:{task_id}")
|
|
248
|
-
job_state.setdefault("aggregation_results", {})[task_id] = result
|
|
249
|
-
job_state.setdefault("active_branches", []).remove(task_id)
|
|
250
|
-
|
|
251
|
-
if not job_state["active_branches"]:
|
|
252
|
-
logger.info(f"All parallel branches for job {job_id} have completed.")
|
|
253
|
-
job_state["status"] = JOB_STATUS_RUNNING
|
|
254
|
-
job_state["current_state"] = job_state["aggregation_target"]
|
|
255
|
-
await engine.storage.save_job_state(job_id, job_state)
|
|
256
|
-
await engine.storage.enqueue_job(job_id)
|
|
257
|
-
else:
|
|
258
|
-
logger.info(
|
|
259
|
-
f"Branch {task_id} for job {job_id} completed. Waiting for {len(job_state['active_branches'])} more.",
|
|
260
|
-
)
|
|
261
|
-
await engine.storage.save_job_state(job_id, job_state)
|
|
262
|
-
|
|
263
|
-
return json_response({"status": "parallel_branch_result_accepted"}, status=200)
|
|
264
|
-
|
|
265
|
-
await engine.storage.remove_job_from_watch(job_id)
|
|
266
|
-
|
|
267
|
-
import time
|
|
268
|
-
|
|
269
|
-
now = time.monotonic()
|
|
270
|
-
dispatched_at = job_state.get("task_dispatched_at", now)
|
|
271
|
-
duration_ms = int((now - dispatched_at) * 1000)
|
|
272
|
-
|
|
273
|
-
await engine.history_storage.log_job_event(
|
|
274
|
-
{
|
|
275
|
-
"job_id": job_id,
|
|
276
|
-
"state": job_state.get("current_state"),
|
|
277
|
-
"event_type": "task_finished",
|
|
278
|
-
"duration_ms": duration_ms,
|
|
279
|
-
"worker_id": authenticated_worker_id,
|
|
280
|
-
"context_snapshot": {**job_state, "result": result},
|
|
281
|
-
},
|
|
282
|
-
)
|
|
283
|
-
|
|
284
|
-
job_state["tracing_context"] = {str(k): v for k, v in request.headers.items()}
|
|
285
|
-
|
|
286
|
-
if result_status == TASK_STATUS_FAILURE:
|
|
287
|
-
error_details = result.get("error", {})
|
|
288
|
-
error_type = ERROR_CODE_TRANSIENT
|
|
289
|
-
error_message = "No error details provided."
|
|
290
|
-
|
|
291
|
-
if isinstance(error_details, dict):
|
|
292
|
-
error_type = error_details.get("code", ERROR_CODE_TRANSIENT)
|
|
293
|
-
error_message = error_details.get("message", "No error message provided.")
|
|
294
|
-
elif isinstance(error_details, str):
|
|
295
|
-
error_message = error_details
|
|
296
|
-
|
|
297
|
-
logger.warning(f"Task {task_id} for job {job_id} failed with error type '{error_type}'.")
|
|
298
|
-
|
|
299
|
-
if error_type == ERROR_CODE_PERMANENT:
|
|
300
|
-
job_state["status"] = JOB_STATUS_QUARANTINED
|
|
301
|
-
job_state["error_message"] = f"Task failed with permanent error: {error_message}"
|
|
302
|
-
await engine.storage.save_job_state(job_id, job_state)
|
|
303
|
-
await engine.storage.quarantine_job(job_id)
|
|
304
|
-
elif error_type == ERROR_CODE_INVALID_INPUT:
|
|
305
|
-
job_state["status"] = JOB_STATUS_FAILED
|
|
306
|
-
job_state["error_message"] = f"Task failed due to invalid input: {error_message}"
|
|
307
|
-
await engine.storage.save_job_state(job_id, job_state)
|
|
308
|
-
else: # TRANSIENT_ERROR
|
|
309
|
-
await engine.handle_task_failure(job_state, task_id, error_message)
|
|
310
|
-
|
|
311
|
-
return json_response({"status": "result_accepted_failure"}, status=200)
|
|
312
|
-
|
|
313
|
-
if result_status == TASK_STATUS_CANCELLED:
|
|
314
|
-
logger.info(f"Task {task_id} for job {job_id} was cancelled by worker.")
|
|
315
|
-
job_state["status"] = JOB_STATUS_CANCELLED
|
|
316
|
-
await engine.storage.save_job_state(job_id, job_state)
|
|
317
|
-
transitions = job_state.get("current_task_transitions", {})
|
|
318
|
-
if next_state := transitions.get("cancelled"):
|
|
319
|
-
job_state["current_state"] = next_state
|
|
320
|
-
job_state["status"] = JOB_STATUS_RUNNING
|
|
321
|
-
await engine.storage.save_job_state(job_id, job_state)
|
|
322
|
-
await engine.storage.enqueue_job(job_id)
|
|
323
|
-
return json_response({"status": "result_accepted_cancelled"}, status=200)
|
|
324
|
-
|
|
325
|
-
transitions = job_state.get("current_task_transitions", {})
|
|
326
|
-
if next_state := transitions.get(result_status):
|
|
327
|
-
logger.info(f"Job {job_id} transitioning based on worker status '{result_status}' to state '{next_state}'")
|
|
328
|
-
|
|
329
|
-
worker_data = result.get("data")
|
|
330
|
-
if worker_data and isinstance(worker_data, dict):
|
|
331
|
-
if "state_history" not in job_state:
|
|
332
|
-
job_state["state_history"] = {}
|
|
333
|
-
job_state["state_history"].update(worker_data)
|
|
334
|
-
|
|
335
|
-
job_state["current_state"] = next_state
|
|
336
|
-
job_state["status"] = JOB_STATUS_RUNNING
|
|
337
|
-
await engine.storage.save_job_state(job_id, job_state)
|
|
338
|
-
await engine.storage.enqueue_job(job_id)
|
|
339
|
-
else:
|
|
340
|
-
logger.error(f"Job {job_id} failed. Worker returned unhandled status '{result_status}'.")
|
|
341
|
-
job_state["status"] = JOB_STATUS_FAILED
|
|
342
|
-
job_state["error_message"] = f"Worker returned unhandled status: {result_status}"
|
|
343
|
-
await engine.storage.save_job_state(job_id, job_state)
|
|
344
|
-
|
|
345
|
-
return json_response({"status": "result_accepted_success"}, status=200)
|
|
346
|
-
|
|
347
|
-
|
|
348
202
|
async def human_approval_webhook_handler(request: web.Request) -> web.Response:
|
|
349
203
|
engine = request.app[ENGINE_KEY]
|
|
350
204
|
job_id = request.match_info.get("job_id")
|
|
@@ -441,109 +295,3 @@ async def docs_handler(request: web.Request) -> web.Response:
|
|
|
441
295
|
content = content.replace(marker, f"{marker}\n{endpoints_json.strip('[]')},")
|
|
442
296
|
|
|
443
297
|
return web.Response(text=content, content_type="text/html")
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
async def websocket_handler(request: web.Request) -> web.WebSocketResponse:
|
|
447
|
-
engine = request.app[ENGINE_KEY]
|
|
448
|
-
worker_id = request.match_info.get("worker_id")
|
|
449
|
-
if not worker_id:
|
|
450
|
-
raise web.HTTPBadRequest(text="worker_id is required")
|
|
451
|
-
|
|
452
|
-
ws = web.WebSocketResponse()
|
|
453
|
-
await ws.prepare(request)
|
|
454
|
-
|
|
455
|
-
await engine.ws_manager.register(worker_id, ws)
|
|
456
|
-
try:
|
|
457
|
-
async for msg in ws:
|
|
458
|
-
if msg.type == WSMsgType.TEXT:
|
|
459
|
-
try:
|
|
460
|
-
data = msg.json()
|
|
461
|
-
await engine.ws_manager.handle_message(worker_id, data)
|
|
462
|
-
except Exception as e:
|
|
463
|
-
logger.error(f"Error processing WebSocket message from {worker_id}: {e}")
|
|
464
|
-
elif msg.type == WSMsgType.ERROR:
|
|
465
|
-
logger.error(f"WebSocket connection for {worker_id} closed with exception {ws.exception()}")
|
|
466
|
-
break
|
|
467
|
-
finally:
|
|
468
|
-
await engine.ws_manager.unregister(worker_id)
|
|
469
|
-
return ws
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
async def handle_get_next_task(request: web.Request) -> web.Response:
|
|
473
|
-
engine = request.app[ENGINE_KEY]
|
|
474
|
-
worker_id = request.match_info.get("worker_id")
|
|
475
|
-
if not worker_id:
|
|
476
|
-
return json_response({"error": "worker_id is required in path"}, status=400)
|
|
477
|
-
|
|
478
|
-
logger.debug(f"Worker {worker_id} is requesting a new task.")
|
|
479
|
-
task = await engine.storage.dequeue_task_for_worker(worker_id, engine.config.WORKER_POLL_TIMEOUT_SECONDS)
|
|
480
|
-
|
|
481
|
-
if task:
|
|
482
|
-
logger.info(f"Sending task {task.get('task_id')} to worker {worker_id}")
|
|
483
|
-
return json_response(task, status=200)
|
|
484
|
-
logger.debug(f"No tasks for worker {worker_id}, responding 204.")
|
|
485
|
-
return web.Response(status=204)
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
async def worker_update_handler(request: web.Request) -> web.Response:
|
|
489
|
-
engine = request.app[ENGINE_KEY]
|
|
490
|
-
worker_id = request.match_info.get("worker_id")
|
|
491
|
-
if not worker_id:
|
|
492
|
-
return json_response({"error": "worker_id is required in path"}, status=400)
|
|
493
|
-
|
|
494
|
-
ttl = engine.config.WORKER_HEALTH_CHECK_INTERVAL_SECONDS * 2
|
|
495
|
-
update_data = None
|
|
496
|
-
|
|
497
|
-
if request.can_read_body:
|
|
498
|
-
try:
|
|
499
|
-
update_data = await request.json(loads=loads)
|
|
500
|
-
except Exception:
|
|
501
|
-
logger.warning(
|
|
502
|
-
f"Received PATCH from worker {worker_id} with non-JSON body. Treating as TTL-only heartbeat."
|
|
503
|
-
)
|
|
504
|
-
|
|
505
|
-
if update_data:
|
|
506
|
-
updated_worker = await engine.storage.update_worker_status(worker_id, update_data, ttl)
|
|
507
|
-
if not updated_worker:
|
|
508
|
-
return json_response({"error": "Worker not found"}, status=404)
|
|
509
|
-
|
|
510
|
-
await engine.history_storage.log_worker_event(
|
|
511
|
-
{
|
|
512
|
-
"worker_id": worker_id,
|
|
513
|
-
"event_type": "status_update",
|
|
514
|
-
"worker_info_snapshot": updated_worker,
|
|
515
|
-
},
|
|
516
|
-
)
|
|
517
|
-
return json_response(updated_worker, status=200)
|
|
518
|
-
else:
|
|
519
|
-
refreshed = await engine.storage.refresh_worker_ttl(worker_id, ttl)
|
|
520
|
-
if not refreshed:
|
|
521
|
-
return json_response({"error": "Worker not found"}, status=404)
|
|
522
|
-
return json_response({"status": "ttl_refreshed"})
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
async def register_worker_handler(request: web.Request) -> web.Response:
|
|
526
|
-
engine = request.app[ENGINE_KEY]
|
|
527
|
-
worker_data = request.get("worker_registration_data")
|
|
528
|
-
if not worker_data:
|
|
529
|
-
return json_response({"error": "Worker data not found in request"}, status=500)
|
|
530
|
-
|
|
531
|
-
worker_id = worker_data.get("worker_id")
|
|
532
|
-
if not worker_id:
|
|
533
|
-
return json_response({"error": "Missing required field: worker_id"}, status=400)
|
|
534
|
-
|
|
535
|
-
ttl = engine.config.WORKER_HEALTH_CHECK_INTERVAL_SECONDS * 2
|
|
536
|
-
await engine.storage.register_worker(worker_id, worker_data, ttl)
|
|
537
|
-
|
|
538
|
-
logger.info(
|
|
539
|
-
f"Worker '{worker_id}' registered with info: {worker_data}",
|
|
540
|
-
)
|
|
541
|
-
|
|
542
|
-
await engine.history_storage.log_worker_event(
|
|
543
|
-
{
|
|
544
|
-
"worker_id": worker_id,
|
|
545
|
-
"event_type": "registered",
|
|
546
|
-
"worker_info_snapshot": worker_data,
|
|
547
|
-
},
|
|
548
|
-
)
|
|
549
|
-
return json_response({"status": "registered"}, status=200)
|
avtomatika/api/routes.py
CHANGED
|
@@ -5,8 +5,7 @@ from aiohttp import web
|
|
|
5
5
|
from ..app_keys import ENGINE_KEY
|
|
6
6
|
from ..history.noop import NoOpHistoryStorage
|
|
7
7
|
from ..quota import quota_middleware_factory
|
|
8
|
-
from ..
|
|
9
|
-
from ..security import client_auth_middleware_factory, worker_auth_middleware_factory
|
|
8
|
+
from ..security import client_auth_middleware_factory
|
|
10
9
|
from .handlers import (
|
|
11
10
|
cancel_job_handler,
|
|
12
11
|
create_job_handler_factory,
|
|
@@ -19,15 +18,10 @@ from .handlers import (
|
|
|
19
18
|
get_jobs_handler,
|
|
20
19
|
get_quarantined_jobs_handler,
|
|
21
20
|
get_workers_handler,
|
|
22
|
-
handle_get_next_task,
|
|
23
21
|
human_approval_webhook_handler,
|
|
24
22
|
metrics_handler,
|
|
25
|
-
register_worker_handler,
|
|
26
23
|
reload_worker_configs_handler,
|
|
27
24
|
status_handler,
|
|
28
|
-
task_result_handler,
|
|
29
|
-
websocket_handler,
|
|
30
|
-
worker_update_handler,
|
|
31
25
|
)
|
|
32
26
|
|
|
33
27
|
if TYPE_CHECKING:
|
|
@@ -35,7 +29,7 @@ if TYPE_CHECKING:
|
|
|
35
29
|
|
|
36
30
|
|
|
37
31
|
def setup_routes(app: web.Application, engine: "OrchestratorEngine") -> None:
|
|
38
|
-
"""Sets up
|
|
32
|
+
"""Sets up application routes for Public and Client APIs."""
|
|
39
33
|
|
|
40
34
|
# --- Public API (Unprotected) ---
|
|
41
35
|
public_app = web.Application()
|
|
@@ -49,61 +43,46 @@ def setup_routes(app: web.Application, engine: "OrchestratorEngine") -> None:
|
|
|
49
43
|
app.add_subapp("/_public/", public_app)
|
|
50
44
|
|
|
51
45
|
# --- Protected API (Client Access) ---
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
if bp.api_version
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
# --- Worker API (Worker Access) ---
|
|
93
|
-
worker_auth_middleware = worker_auth_middleware_factory(engine.storage, engine.config)
|
|
94
|
-
worker_middlewares = [worker_auth_middleware]
|
|
95
|
-
if engine.config.RATE_LIMITING_ENABLED:
|
|
96
|
-
worker_rate_limiter = rate_limit_middleware_factory(storage=engine.storage, limit=5, period=60)
|
|
97
|
-
worker_middlewares.append(worker_rate_limiter)
|
|
98
|
-
|
|
99
|
-
worker_app = web.Application(middlewares=worker_middlewares)
|
|
100
|
-
worker_app[ENGINE_KEY] = engine
|
|
101
|
-
worker_app.router.add_post("/workers/register", register_worker_handler)
|
|
102
|
-
worker_app.router.add_get("/workers/{worker_id}/tasks/next", handle_get_next_task)
|
|
103
|
-
worker_app.router.add_patch("/workers/{worker_id}", worker_update_handler)
|
|
104
|
-
worker_app.router.add_post("/tasks/result", task_result_handler)
|
|
105
|
-
worker_app.router.add_get("/ws/{worker_id}", websocket_handler)
|
|
106
|
-
app.add_subapp("/_worker/", worker_app)
|
|
46
|
+
if engine.config.ENABLE_CLIENT_API:
|
|
47
|
+
auth_middleware = client_auth_middleware_factory(engine.storage)
|
|
48
|
+
quota_middleware = quota_middleware_factory(engine.storage)
|
|
49
|
+
api_middlewares = [auth_middleware, quota_middleware]
|
|
50
|
+
|
|
51
|
+
protected_app = web.Application(middlewares=api_middlewares)
|
|
52
|
+
protected_app[ENGINE_KEY] = engine
|
|
53
|
+
versioned_apps: dict[str, web.Application] = {}
|
|
54
|
+
has_unversioned_routes = False
|
|
55
|
+
|
|
56
|
+
# Register Blueprint routes
|
|
57
|
+
for bp in engine.blueprints.values():
|
|
58
|
+
if not bp.api_endpoint:
|
|
59
|
+
continue
|
|
60
|
+
endpoint = bp.api_endpoint if bp.api_endpoint.startswith("/") else f"/{bp.api_endpoint}"
|
|
61
|
+
|
|
62
|
+
handler = create_job_handler_factory(bp)
|
|
63
|
+
|
|
64
|
+
if bp.api_version:
|
|
65
|
+
if bp.api_version not in versioned_apps:
|
|
66
|
+
versioned_apps[bp.api_version] = web.Application(middlewares=api_middlewares)
|
|
67
|
+
versioned_apps[bp.api_version][ENGINE_KEY] = engine
|
|
68
|
+
versioned_apps[bp.api_version].router.add_post(endpoint, handler)
|
|
69
|
+
else:
|
|
70
|
+
protected_app.router.add_post(endpoint, handler)
|
|
71
|
+
has_unversioned_routes = True
|
|
72
|
+
|
|
73
|
+
# Common routes for all protected apps
|
|
74
|
+
all_protected_apps = list(versioned_apps.values())
|
|
75
|
+
if has_unversioned_routes:
|
|
76
|
+
all_protected_apps.append(protected_app)
|
|
77
|
+
|
|
78
|
+
for sub_app in all_protected_apps:
|
|
79
|
+
_register_common_routes(sub_app, engine)
|
|
80
|
+
|
|
81
|
+
# Mount protected apps
|
|
82
|
+
if has_unversioned_routes:
|
|
83
|
+
app.add_subapp("/api/", protected_app)
|
|
84
|
+
for version, sub_app in versioned_apps.items():
|
|
85
|
+
app.add_subapp(f"/api/{version}", sub_app)
|
|
107
86
|
|
|
108
87
|
|
|
109
88
|
def _register_common_routes(app: web.Application, engine: "OrchestratorEngine") -> None:
|
avtomatika/api.html
CHANGED
|
@@ -211,7 +211,7 @@
|
|
|
211
211
|
],
|
|
212
212
|
request: { body: null },
|
|
213
213
|
responses: [
|
|
214
|
-
{ code: '200 OK', description: 'Successful response.', body: { "id": "...", "status": "..." } }
|
|
214
|
+
{ code: '200 OK', description: 'Successful response.', body: { "id": "...", "status": "running", "progress": 0.75, "progress_message": "Processing..." } }
|
|
215
215
|
]
|
|
216
216
|
},
|
|
217
217
|
{
|
avtomatika/app_keys.py
CHANGED
|
@@ -31,3 +31,4 @@ REPUTATION_CALCULATOR_TASK_KEY = AppKey("reputation_calculator_task", Task)
|
|
|
31
31
|
HEALTH_CHECKER_TASK_KEY = AppKey("health_checker_task", Task)
|
|
32
32
|
SCHEDULER_TASK_KEY = AppKey("scheduler_task", Task)
|
|
33
33
|
S3_SERVICE_KEY = AppKey("s3_service", "S3Service")
|
|
34
|
+
WORKER_SERVICE_KEY = AppKey("worker_service", "WorkerService")
|
avtomatika/blueprint.py
CHANGED
|
@@ -62,7 +62,8 @@ class ConditionalHandler:
|
|
|
62
62
|
try:
|
|
63
63
|
context_area = getattr(context, self.condition.area)
|
|
64
64
|
actual_value = context_area[self.condition.field]
|
|
65
|
-
|
|
65
|
+
result = self.condition.op(actual_value, self.condition.value)
|
|
66
|
+
return bool(result)
|
|
66
67
|
except (AttributeError, KeyError):
|
|
67
68
|
return False
|
|
68
69
|
|
|
@@ -279,7 +280,7 @@ class StateMachineBlueprint:
|
|
|
279
280
|
f"No suitable handler found for state '{state}' in blueprint '{self.name}' for the given context.",
|
|
280
281
|
)
|
|
281
282
|
|
|
282
|
-
def render_graph(self, output_filename: str | None = None, output_format: str = "png"):
|
|
283
|
+
def render_graph(self, output_filename: str | None = None, output_format: str = "png") -> str | None:
|
|
283
284
|
from graphviz import Digraph # type: ignore[import]
|
|
284
285
|
|
|
285
286
|
dot = Digraph(comment=f"State Machine for {self.name}")
|
avtomatika/config.py
CHANGED
|
@@ -25,6 +25,7 @@ class Config:
|
|
|
25
25
|
# API server settings
|
|
26
26
|
self.API_HOST: str = getenv("API_HOST", "0.0.0.0")
|
|
27
27
|
self.API_PORT: int = int(getenv("API_PORT", 8080))
|
|
28
|
+
self.ENABLE_CLIENT_API: bool = getenv("ENABLE_CLIENT_API", "true").lower() == "true"
|
|
28
29
|
|
|
29
30
|
# Security settings
|
|
30
31
|
self.CLIENT_TOKEN: str = getenv(
|
|
@@ -33,6 +34,13 @@ class Config:
|
|
|
33
34
|
)
|
|
34
35
|
self.GLOBAL_WORKER_TOKEN: str = getenv("GLOBAL_WORKER_TOKEN", "secure-worker-token")
|
|
35
36
|
|
|
37
|
+
# TLS / mTLS settings
|
|
38
|
+
self.TLS_ENABLED: bool = getenv("TLS_ENABLED", "false").lower() == "true"
|
|
39
|
+
self.TLS_CERT_PATH: str = getenv("TLS_CERT_PATH", "")
|
|
40
|
+
self.TLS_KEY_PATH: str = getenv("TLS_KEY_PATH", "")
|
|
41
|
+
self.TLS_CA_PATH: str = getenv("TLS_CA_PATH", "")
|
|
42
|
+
self.TLS_REQUIRE_CLIENT_CERT: bool = getenv("TLS_REQUIRE_CLIENT_CERT", "false").lower() == "true"
|
|
43
|
+
|
|
36
44
|
# Logging settings
|
|
37
45
|
self.LOG_LEVEL: str = getenv("LOG_LEVEL", "INFO").upper()
|
|
38
46
|
self.LOG_FORMAT: str = getenv("LOG_FORMAT", "json") # "text" or "json"
|
avtomatika/constants.py
CHANGED
|
@@ -1,30 +1,80 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Centralized constants for the Avtomatika protocol.
|
|
3
|
-
|
|
3
|
+
(Legacy wrapper, pointing to rxon.constants)
|
|
4
4
|
"""
|
|
5
5
|
|
|
6
|
-
|
|
7
|
-
AUTH_HEADER_CLIENT
|
|
8
|
-
AUTH_HEADER_WORKER
|
|
6
|
+
from rxon.constants import (
|
|
7
|
+
AUTH_HEADER_CLIENT,
|
|
8
|
+
AUTH_HEADER_WORKER,
|
|
9
|
+
COMMAND_CANCEL_TASK,
|
|
10
|
+
ENDPOINT_TASK_NEXT,
|
|
11
|
+
ENDPOINT_TASK_RESULT,
|
|
12
|
+
ENDPOINT_WORKER_HEARTBEAT,
|
|
13
|
+
ENDPOINT_WORKER_REGISTER,
|
|
14
|
+
ERROR_CODE_DEPENDENCY,
|
|
15
|
+
ERROR_CODE_INTEGRITY_MISMATCH,
|
|
16
|
+
ERROR_CODE_INTERNAL,
|
|
17
|
+
ERROR_CODE_INVALID_INPUT,
|
|
18
|
+
ERROR_CODE_PERMANENT,
|
|
19
|
+
ERROR_CODE_RESOURCE_EXHAUSTED,
|
|
20
|
+
ERROR_CODE_SECURITY,
|
|
21
|
+
ERROR_CODE_TIMEOUT,
|
|
22
|
+
ERROR_CODE_TRANSIENT,
|
|
23
|
+
JOB_STATUS_CANCELLED,
|
|
24
|
+
JOB_STATUS_ERROR,
|
|
25
|
+
JOB_STATUS_FAILED,
|
|
26
|
+
JOB_STATUS_FINISHED,
|
|
27
|
+
JOB_STATUS_PENDING,
|
|
28
|
+
JOB_STATUS_QUARANTINED,
|
|
29
|
+
JOB_STATUS_RUNNING,
|
|
30
|
+
JOB_STATUS_WAITING_FOR_HUMAN,
|
|
31
|
+
JOB_STATUS_WAITING_FOR_PARALLEL,
|
|
32
|
+
JOB_STATUS_WAITING_FOR_WORKER,
|
|
33
|
+
MSG_TYPE_PROGRESS,
|
|
34
|
+
PROTOCOL_VERSION,
|
|
35
|
+
PROTOCOL_VERSION_HEADER,
|
|
36
|
+
STS_TOKEN_ENDPOINT,
|
|
37
|
+
TASK_STATUS_CANCELLED,
|
|
38
|
+
TASK_STATUS_FAILURE,
|
|
39
|
+
TASK_STATUS_SUCCESS,
|
|
40
|
+
WORKER_API_PREFIX,
|
|
41
|
+
WS_ENDPOINT,
|
|
42
|
+
)
|
|
9
43
|
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
44
|
+
__all__ = [
|
|
45
|
+
"AUTH_HEADER_CLIENT",
|
|
46
|
+
"AUTH_HEADER_WORKER",
|
|
47
|
+
"COMMAND_CANCEL_TASK",
|
|
48
|
+
"ENDPOINT_TASK_NEXT",
|
|
49
|
+
"ENDPOINT_TASK_RESULT",
|
|
50
|
+
"ENDPOINT_WORKER_HEARTBEAT",
|
|
51
|
+
"ENDPOINT_WORKER_REGISTER",
|
|
52
|
+
"ERROR_CODE_DEPENDENCY",
|
|
53
|
+
"ERROR_CODE_INTEGRITY_MISMATCH",
|
|
54
|
+
"ERROR_CODE_INTERNAL",
|
|
55
|
+
"ERROR_CODE_INVALID_INPUT",
|
|
56
|
+
"ERROR_CODE_PERMANENT",
|
|
57
|
+
"ERROR_CODE_RESOURCE_EXHAUSTED",
|
|
58
|
+
"ERROR_CODE_SECURITY",
|
|
59
|
+
"ERROR_CODE_TIMEOUT",
|
|
60
|
+
"ERROR_CODE_TRANSIENT",
|
|
61
|
+
"JOB_STATUS_CANCELLED",
|
|
62
|
+
"JOB_STATUS_ERROR",
|
|
63
|
+
"JOB_STATUS_FAILED",
|
|
64
|
+
"JOB_STATUS_FINISHED",
|
|
65
|
+
"JOB_STATUS_PENDING",
|
|
66
|
+
"JOB_STATUS_QUARANTINED",
|
|
67
|
+
"JOB_STATUS_RUNNING",
|
|
68
|
+
"JOB_STATUS_WAITING_FOR_HUMAN",
|
|
69
|
+
"JOB_STATUS_WAITING_FOR_PARALLEL",
|
|
70
|
+
"JOB_STATUS_WAITING_FOR_WORKER",
|
|
71
|
+
"MSG_TYPE_PROGRESS",
|
|
72
|
+
"PROTOCOL_VERSION",
|
|
73
|
+
"PROTOCOL_VERSION_HEADER",
|
|
74
|
+
"STS_TOKEN_ENDPOINT",
|
|
75
|
+
"TASK_STATUS_CANCELLED",
|
|
76
|
+
"TASK_STATUS_FAILURE",
|
|
77
|
+
"TASK_STATUS_SUCCESS",
|
|
78
|
+
"WORKER_API_PREFIX",
|
|
79
|
+
"WS_ENDPOINT",
|
|
80
|
+
]
|
avtomatika/data_types.py
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
from typing import TYPE_CHECKING, Any, NamedTuple
|
|
2
2
|
|
|
3
|
+
from rxon.models import InstalledModel, Resources
|
|
4
|
+
|
|
3
5
|
if TYPE_CHECKING:
|
|
4
6
|
from .context import ActionFactory
|
|
5
7
|
|
|
@@ -28,28 +30,6 @@ class JobContext(NamedTuple):
|
|
|
28
30
|
task_files: Any | None = None
|
|
29
31
|
|
|
30
32
|
|
|
31
|
-
class GPUInfo(NamedTuple):
|
|
32
|
-
"""Information about the graphics processor."""
|
|
33
|
-
|
|
34
|
-
model: str
|
|
35
|
-
vram_gb: int
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
class Resources(NamedTuple):
|
|
39
|
-
"""Information about worker resources."""
|
|
40
|
-
|
|
41
|
-
max_concurrent_tasks: int
|
|
42
|
-
gpu_info: GPUInfo | None
|
|
43
|
-
cpu_cores: int
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
class InstalledModel(NamedTuple):
|
|
47
|
-
"""Information about the installed ML model."""
|
|
48
|
-
|
|
49
|
-
name: str
|
|
50
|
-
version: str
|
|
51
|
-
|
|
52
|
-
|
|
53
33
|
class WorkerInfo(NamedTuple):
|
|
54
34
|
"""Complete information about the worker, transmitted upon registration."""
|
|
55
35
|
|
avtomatika/dispatcher.py
CHANGED
|
@@ -184,6 +184,9 @@ class Dispatcher:
|
|
|
184
184
|
selected_worker = self._select_default(capable_workers, task_type)
|
|
185
185
|
|
|
186
186
|
worker_id = selected_worker.get("worker_id")
|
|
187
|
+
if not worker_id:
|
|
188
|
+
raise RuntimeError(f"Selected worker for task '{task_type}' has no worker_id")
|
|
189
|
+
|
|
187
190
|
logger.info(
|
|
188
191
|
f"Dispatching task '{task_type}' to worker {worker_id} (strategy: {dispatch_strategy})",
|
|
189
192
|
)
|
|
@@ -195,6 +198,7 @@ class Dispatcher:
|
|
|
195
198
|
"type": task_type,
|
|
196
199
|
"params": task_info.get("params", {}),
|
|
197
200
|
"tracing_context": {},
|
|
201
|
+
"params_metadata": job_state.get("data_metadata"),
|
|
198
202
|
}
|
|
199
203
|
# Inject tracing context into the payload, not headers
|
|
200
204
|
inject(payload["tracing_context"], context=job_state.get("tracing_context"))
|