avtomatika 1.0b6__py3-none-any.whl → 1.0b7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- avtomatika/api/handlers.py +549 -0
- avtomatika/api/routes.py +118 -0
- avtomatika/app_keys.py +32 -0
- avtomatika/blueprint.py +125 -54
- avtomatika/context.py +2 -2
- avtomatika/data_types.py +3 -2
- avtomatika/dispatcher.py +1 -1
- avtomatika/engine.py +52 -601
- avtomatika/executor.py +21 -16
- avtomatika/scheduler.py +8 -8
- avtomatika/storage/memory.py +12 -7
- avtomatika/utils/__init__.py +0 -0
- avtomatika/utils/webhook_sender.py +54 -0
- avtomatika/watcher.py +1 -3
- {avtomatika-1.0b6.dist-info → avtomatika-1.0b7.dist-info}/METADATA +43 -3
- {avtomatika-1.0b6.dist-info → avtomatika-1.0b7.dist-info}/RECORD +19 -14
- {avtomatika-1.0b6.dist-info → avtomatika-1.0b7.dist-info}/WHEEL +0 -0
- {avtomatika-1.0b6.dist-info → avtomatika-1.0b7.dist-info}/licenses/LICENSE +0 -0
- {avtomatika-1.0b6.dist-info → avtomatika-1.0b7.dist-info}/top_level.txt +0 -0
avtomatika/engine.py
CHANGED
|
@@ -1,68 +1,50 @@
|
|
|
1
|
-
from asyncio import Task, create_task, gather, get_running_loop, wait_for
|
|
2
1
|
from asyncio import TimeoutError as AsyncTimeoutError
|
|
2
|
+
from asyncio import create_task, gather, get_running_loop, wait_for
|
|
3
3
|
from logging import getLogger
|
|
4
|
-
from typing import Any
|
|
4
|
+
from typing import Any
|
|
5
5
|
from uuid import uuid4
|
|
6
6
|
|
|
7
|
-
from aiohttp import ClientSession, WSMsgType, web
|
|
8
|
-
from aiohttp.web import AppKey
|
|
9
|
-
from aioprometheus import render
|
|
10
|
-
from orjson import OPT_INDENT_2, dumps, loads
|
|
7
|
+
from aiohttp import ClientSession, web
|
|
8
|
+
from orjson import dumps
|
|
11
9
|
|
|
12
10
|
from . import metrics
|
|
11
|
+
from .api.routes import setup_routes
|
|
12
|
+
from .app_keys import (
|
|
13
|
+
DISPATCHER_KEY,
|
|
14
|
+
ENGINE_KEY,
|
|
15
|
+
EXECUTOR_KEY,
|
|
16
|
+
EXECUTOR_TASK_KEY,
|
|
17
|
+
HEALTH_CHECKER_KEY,
|
|
18
|
+
HEALTH_CHECKER_TASK_KEY,
|
|
19
|
+
HTTP_SESSION_KEY,
|
|
20
|
+
REPUTATION_CALCULATOR_KEY,
|
|
21
|
+
REPUTATION_CALCULATOR_TASK_KEY,
|
|
22
|
+
SCHEDULER_KEY,
|
|
23
|
+
SCHEDULER_TASK_KEY,
|
|
24
|
+
WATCHER_KEY,
|
|
25
|
+
WATCHER_TASK_KEY,
|
|
26
|
+
WS_MANAGER_KEY,
|
|
27
|
+
)
|
|
13
28
|
from .blueprint import StateMachineBlueprint
|
|
14
29
|
from .client_config_loader import load_client_configs_to_redis
|
|
15
30
|
from .compression import compression_middleware
|
|
16
31
|
from .config import Config
|
|
17
|
-
from .constants import (
|
|
18
|
-
ERROR_CODE_INVALID_INPUT,
|
|
19
|
-
ERROR_CODE_PERMANENT,
|
|
20
|
-
ERROR_CODE_TRANSIENT,
|
|
21
|
-
JOB_STATUS_CANCELLED,
|
|
22
|
-
JOB_STATUS_FAILED,
|
|
23
|
-
JOB_STATUS_PENDING,
|
|
24
|
-
JOB_STATUS_QUARANTINED,
|
|
25
|
-
JOB_STATUS_RUNNING,
|
|
26
|
-
JOB_STATUS_WAITING_FOR_HUMAN,
|
|
27
|
-
JOB_STATUS_WAITING_FOR_PARALLEL,
|
|
28
|
-
JOB_STATUS_WAITING_FOR_WORKER,
|
|
29
|
-
TASK_STATUS_CANCELLED,
|
|
30
|
-
TASK_STATUS_FAILURE,
|
|
31
|
-
TASK_STATUS_SUCCESS,
|
|
32
|
-
)
|
|
32
|
+
from .constants import JOB_STATUS_FAILED, JOB_STATUS_PENDING, JOB_STATUS_QUARANTINED, JOB_STATUS_WAITING_FOR_WORKER
|
|
33
33
|
from .dispatcher import Dispatcher
|
|
34
34
|
from .executor import JobExecutor
|
|
35
35
|
from .health_checker import HealthChecker
|
|
36
36
|
from .history.base import HistoryStorageBase
|
|
37
37
|
from .history.noop import NoOpHistoryStorage
|
|
38
38
|
from .logging_config import setup_logging
|
|
39
|
-
from .quota import quota_middleware_factory
|
|
40
|
-
from .ratelimit import rate_limit_middleware_factory
|
|
41
39
|
from .reputation import ReputationCalculator
|
|
42
40
|
from .scheduler import Scheduler
|
|
43
|
-
from .security import client_auth_middleware_factory, worker_auth_middleware_factory
|
|
44
41
|
from .storage.base import StorageBackend
|
|
45
42
|
from .telemetry import setup_telemetry
|
|
43
|
+
from .utils.webhook_sender import WebhookPayload, WebhookSender
|
|
46
44
|
from .watcher import Watcher
|
|
47
45
|
from .worker_config_loader import load_worker_configs_to_redis
|
|
48
46
|
from .ws_manager import WebSocketManager
|
|
49
47
|
|
|
50
|
-
# Application keys for storing components
|
|
51
|
-
ENGINE_KEY = AppKey("engine", "OrchestratorEngine")
|
|
52
|
-
HTTP_SESSION_KEY = AppKey("http_session", ClientSession)
|
|
53
|
-
DISPATCHER_KEY = AppKey("dispatcher", Dispatcher)
|
|
54
|
-
EXECUTOR_KEY = AppKey("executor", JobExecutor)
|
|
55
|
-
WATCHER_KEY = AppKey("watcher", Watcher)
|
|
56
|
-
REPUTATION_CALCULATOR_KEY = AppKey("reputation_calculator", ReputationCalculator)
|
|
57
|
-
HEALTH_CHECKER_KEY = AppKey("health_checker", HealthChecker)
|
|
58
|
-
SCHEDULER_KEY = AppKey("scheduler", Scheduler)
|
|
59
|
-
|
|
60
|
-
EXECUTOR_TASK_KEY = AppKey("executor_task", Task)
|
|
61
|
-
WATCHER_TASK_KEY = AppKey("watcher_task", Task)
|
|
62
|
-
REPUTATION_CALCULATOR_TASK_KEY = AppKey("reputation_calculator_task", Task)
|
|
63
|
-
HEALTH_CHECKER_TASK_KEY = AppKey("health_checker_task", Task)
|
|
64
|
-
SCHEDULER_TASK_KEY = AppKey("scheduler_task", Task)
|
|
65
|
-
|
|
66
48
|
metrics.init_metrics()
|
|
67
49
|
|
|
68
50
|
logger = getLogger(__name__)
|
|
@@ -76,14 +58,6 @@ def json_response(data: Any, **kwargs: Any) -> web.Response:
|
|
|
76
58
|
return web.json_response(data, dumps=json_dumps, **kwargs)
|
|
77
59
|
|
|
78
60
|
|
|
79
|
-
async def status_handler(_request: web.Request) -> web.Response:
|
|
80
|
-
return json_response({"status": "ok"})
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
async def metrics_handler(_request: web.Request) -> web.Response:
|
|
84
|
-
return web.Response(body=render(), content_type="text/plain")
|
|
85
|
-
|
|
86
|
-
|
|
87
61
|
class OrchestratorEngine:
|
|
88
62
|
def __init__(self, storage: StorageBackend, config: Config):
|
|
89
63
|
setup_logging(config.LOG_LEVEL, config.LOG_FORMAT, config.TZ)
|
|
@@ -97,7 +71,7 @@ class OrchestratorEngine:
|
|
|
97
71
|
self.app[ENGINE_KEY] = self
|
|
98
72
|
self._setup_done = False
|
|
99
73
|
|
|
100
|
-
def register_blueprint(self, blueprint: StateMachineBlueprint):
|
|
74
|
+
def register_blueprint(self, blueprint: StateMachineBlueprint) -> None:
|
|
101
75
|
if self._setup_done:
|
|
102
76
|
raise RuntimeError("Cannot register blueprints after engine setup.")
|
|
103
77
|
if blueprint.name in self.blueprints:
|
|
@@ -107,15 +81,15 @@ class OrchestratorEngine:
|
|
|
107
81
|
blueprint.validate()
|
|
108
82
|
self.blueprints[blueprint.name] = blueprint
|
|
109
83
|
|
|
110
|
-
def setup(self):
|
|
84
|
+
def setup(self) -> None:
|
|
111
85
|
if self._setup_done:
|
|
112
86
|
return
|
|
113
|
-
self._setup_routes()
|
|
87
|
+
setup_routes(self.app, self)
|
|
114
88
|
self.app.on_startup.append(self.on_startup)
|
|
115
89
|
self.app.on_shutdown.append(self.on_shutdown)
|
|
116
90
|
self._setup_done = True
|
|
117
91
|
|
|
118
|
-
async def _setup_history_storage(self):
|
|
92
|
+
async def _setup_history_storage(self) -> None:
|
|
119
93
|
from importlib import import_module
|
|
120
94
|
|
|
121
95
|
uri = self.config.HISTORY_DATABASE_URI
|
|
@@ -166,7 +140,7 @@ class OrchestratorEngine:
|
|
|
166
140
|
)
|
|
167
141
|
self.history_storage = NoOpHistoryStorage()
|
|
168
142
|
|
|
169
|
-
async def on_startup(self, app: web.Application):
|
|
143
|
+
async def on_startup(self, app: web.Application) -> None:
|
|
170
144
|
try:
|
|
171
145
|
from opentelemetry.instrumentation.aiohttp_client import (
|
|
172
146
|
AioHttpClientInstrumentor,
|
|
@@ -213,6 +187,7 @@ class OrchestratorEngine:
|
|
|
213
187
|
)
|
|
214
188
|
|
|
215
189
|
app[HTTP_SESSION_KEY] = ClientSession()
|
|
190
|
+
self.webhook_sender = WebhookSender(app[HTTP_SESSION_KEY])
|
|
216
191
|
self.dispatcher = Dispatcher(self.storage, self.config)
|
|
217
192
|
app[DISPATCHER_KEY] = self.dispatcher
|
|
218
193
|
app[EXECUTOR_KEY] = JobExecutor(self, self.history_storage)
|
|
@@ -220,6 +195,7 @@ class OrchestratorEngine:
|
|
|
220
195
|
app[REPUTATION_CALCULATOR_KEY] = ReputationCalculator(self)
|
|
221
196
|
app[HEALTH_CHECKER_KEY] = HealthChecker(self)
|
|
222
197
|
app[SCHEDULER_KEY] = Scheduler(self)
|
|
198
|
+
app[WS_MANAGER_KEY] = self.ws_manager
|
|
223
199
|
|
|
224
200
|
app[EXECUTOR_TASK_KEY] = create_task(app[EXECUTOR_KEY].run())
|
|
225
201
|
app[WATCHER_TASK_KEY] = create_task(app[WATCHER_KEY].run())
|
|
@@ -227,7 +203,7 @@ class OrchestratorEngine:
|
|
|
227
203
|
app[HEALTH_CHECKER_TASK_KEY] = create_task(app[HEALTH_CHECKER_KEY].run())
|
|
228
204
|
app[SCHEDULER_TASK_KEY] = create_task(app[SCHEDULER_KEY].run())
|
|
229
205
|
|
|
230
|
-
async def on_shutdown(self, app: web.Application):
|
|
206
|
+
async def on_shutdown(self, app: web.Application) -> None:
|
|
231
207
|
logger.info("Shutdown sequence started.")
|
|
232
208
|
app[EXECUTOR_KEY].stop()
|
|
233
209
|
app[WATCHER_KEY].stop()
|
|
@@ -324,295 +300,23 @@ class OrchestratorEngine:
|
|
|
324
300
|
logger.info(f"Created background job {job_id} for blueprint '{blueprint_name}' (source: {source})")
|
|
325
301
|
return job_id
|
|
326
302
|
|
|
327
|
-
def
|
|
328
|
-
|
|
329
|
-
try:
|
|
330
|
-
initial_data = await request.json(loads=loads)
|
|
331
|
-
except Exception:
|
|
332
|
-
return json_response({"error": "Invalid JSON body"}, status=400)
|
|
333
|
-
|
|
334
|
-
client_config = request["client_config"]
|
|
335
|
-
carrier = {str(k): v for k, v in request.headers.items()}
|
|
336
|
-
|
|
337
|
-
job_id = str(uuid4())
|
|
338
|
-
job_state = {
|
|
339
|
-
"id": job_id,
|
|
340
|
-
"blueprint_name": blueprint.name,
|
|
341
|
-
"current_state": blueprint.start_state,
|
|
342
|
-
"initial_data": initial_data,
|
|
343
|
-
"state_history": {},
|
|
344
|
-
"status": JOB_STATUS_PENDING,
|
|
345
|
-
"tracing_context": carrier,
|
|
346
|
-
"client_config": client_config,
|
|
347
|
-
}
|
|
348
|
-
await self.storage.save_job_state(job_id, job_state)
|
|
349
|
-
await self.storage.enqueue_job(job_id)
|
|
350
|
-
metrics.jobs_total.inc({metrics.LABEL_BLUEPRINT: blueprint.name})
|
|
351
|
-
return json_response({"status": "accepted", "job_id": job_id}, status=202)
|
|
352
|
-
|
|
353
|
-
return handler
|
|
354
|
-
|
|
355
|
-
async def _get_job_status_handler(self, request: web.Request) -> web.Response:
|
|
356
|
-
job_id = request.match_info.get("job_id")
|
|
357
|
-
if not job_id:
|
|
358
|
-
return json_response({"error": "job_id is required in path"}, status=400)
|
|
359
|
-
job_state = await self.storage.get_job_state(job_id)
|
|
360
|
-
if not job_state:
|
|
361
|
-
return json_response({"error": "Job not found"}, status=404)
|
|
362
|
-
return json_response(job_state, status=200)
|
|
363
|
-
|
|
364
|
-
async def _cancel_job_handler(self, request: web.Request) -> web.Response:
|
|
365
|
-
job_id = request.match_info.get("job_id")
|
|
366
|
-
if not job_id:
|
|
367
|
-
return json_response({"error": "job_id is required in path"}, status=400)
|
|
368
|
-
|
|
369
|
-
job_state = await self.storage.get_job_state(job_id)
|
|
370
|
-
if not job_state:
|
|
371
|
-
return json_response({"error": "Job not found"}, status=404)
|
|
372
|
-
|
|
373
|
-
if job_state.get("status") != JOB_STATUS_WAITING_FOR_WORKER:
|
|
374
|
-
return json_response(
|
|
375
|
-
{"error": "Job is not in a state that can be cancelled (must be waiting for a worker)."},
|
|
376
|
-
status=409,
|
|
377
|
-
)
|
|
378
|
-
|
|
379
|
-
worker_id = job_state.get("task_worker_id")
|
|
380
|
-
if not worker_id:
|
|
381
|
-
return json_response(
|
|
382
|
-
{"error": "Cannot cancel job: worker_id not found in job state."},
|
|
383
|
-
status=500,
|
|
384
|
-
)
|
|
385
|
-
|
|
386
|
-
worker_info = await self.storage.get_worker_info(worker_id)
|
|
387
|
-
task_id = job_state.get("current_task_id")
|
|
388
|
-
if not task_id:
|
|
389
|
-
return json_response(
|
|
390
|
-
{"error": "Cannot cancel job: task_id not found in job state."},
|
|
391
|
-
status=500,
|
|
392
|
-
)
|
|
393
|
-
|
|
394
|
-
# Set Redis flag as a reliable fallback/primary mechanism
|
|
395
|
-
await self.storage.set_task_cancellation_flag(task_id)
|
|
396
|
-
|
|
397
|
-
# Attempt WebSocket-based cancellation if supported
|
|
398
|
-
if worker_info and worker_info.get("capabilities", {}).get("websockets"):
|
|
399
|
-
command = {"command": "cancel_task", "task_id": task_id, "job_id": job_id}
|
|
400
|
-
sent = await self.ws_manager.send_command(worker_id, command)
|
|
401
|
-
if sent:
|
|
402
|
-
return json_response({"status": "cancellation_request_sent"})
|
|
403
|
-
else:
|
|
404
|
-
logger.warning(f"Failed to send WebSocket cancellation for task {task_id}, but Redis flag is set.")
|
|
405
|
-
# Proceed to return success, as the Redis flag will handle it
|
|
406
|
-
|
|
407
|
-
return json_response({"status": "cancellation_request_accepted"})
|
|
408
|
-
|
|
409
|
-
async def _get_job_history_handler(self, request: web.Request) -> web.Response:
|
|
410
|
-
job_id = request.match_info.get("job_id")
|
|
411
|
-
if not job_id:
|
|
412
|
-
return json_response({"error": "job_id is required in path"}, status=400)
|
|
413
|
-
history = await self.history_storage.get_job_history(job_id)
|
|
414
|
-
return json_response(history)
|
|
415
|
-
|
|
416
|
-
async def _get_blueprint_graph_handler(self, request: web.Request) -> web.Response:
|
|
417
|
-
blueprint_name = request.match_info.get("blueprint_name")
|
|
418
|
-
if not blueprint_name:
|
|
419
|
-
return json_response({"error": "blueprint_name is required in path"}, status=400)
|
|
420
|
-
|
|
421
|
-
blueprint = self.blueprints.get(blueprint_name)
|
|
422
|
-
if not blueprint:
|
|
423
|
-
return json_response({"error": "Blueprint not found"}, status=404)
|
|
424
|
-
|
|
425
|
-
try:
|
|
426
|
-
graph_dot = blueprint.render_graph()
|
|
427
|
-
return web.Response(text=graph_dot, content_type="text/vnd.graphviz")
|
|
428
|
-
except FileNotFoundError:
|
|
429
|
-
error_msg = "Graphviz is not installed on the server. Cannot generate graph."
|
|
430
|
-
logger.error(error_msg)
|
|
431
|
-
return json_response({"error": error_msg}, status=501)
|
|
432
|
-
|
|
433
|
-
async def _get_workers_handler(self, request: web.Request) -> web.Response:
|
|
434
|
-
workers = await self.storage.get_available_workers()
|
|
435
|
-
return json_response(workers)
|
|
436
|
-
|
|
437
|
-
async def _get_jobs_handler(self, request: web.Request) -> web.Response:
|
|
438
|
-
try:
|
|
439
|
-
limit = int(request.query.get("limit", "100"))
|
|
440
|
-
offset = int(request.query.get("offset", "0"))
|
|
441
|
-
except ValueError:
|
|
442
|
-
return json_response({"error": "Invalid limit/offset parameter"}, status=400)
|
|
443
|
-
|
|
444
|
-
jobs = await self.history_storage.get_jobs(limit=limit, offset=offset)
|
|
445
|
-
return json_response(jobs)
|
|
446
|
-
|
|
447
|
-
async def _get_dashboard_handler(self, request: web.Request) -> web.Response:
|
|
448
|
-
worker_count = await self.storage.get_active_worker_count()
|
|
449
|
-
queue_length = await self.storage.get_job_queue_length()
|
|
450
|
-
job_summary = await self.history_storage.get_job_summary()
|
|
451
|
-
|
|
452
|
-
dashboard_data = {
|
|
453
|
-
"workers": {"total": worker_count},
|
|
454
|
-
"jobs": {"queued": queue_length, **job_summary},
|
|
455
|
-
}
|
|
456
|
-
return json_response(dashboard_data)
|
|
457
|
-
|
|
458
|
-
async def _task_result_handler(self, request: web.Request) -> web.Response:
|
|
459
|
-
import logging
|
|
460
|
-
|
|
461
|
-
try:
|
|
462
|
-
data = await request.json(loads=loads)
|
|
463
|
-
job_id = data.get("job_id")
|
|
464
|
-
task_id = data.get("task_id")
|
|
465
|
-
result = data.get("result", {})
|
|
466
|
-
result_status = result.get("status", TASK_STATUS_SUCCESS)
|
|
467
|
-
error_message = result.get("error")
|
|
468
|
-
payload_worker_id = data.get("worker_id")
|
|
469
|
-
except Exception:
|
|
470
|
-
return json_response({"error": "Invalid JSON body"}, status=400)
|
|
471
|
-
|
|
472
|
-
# Security check: Ensure the worker_id from the payload matches the authenticated worker
|
|
473
|
-
authenticated_worker_id = request.get("worker_id")
|
|
474
|
-
if not authenticated_worker_id:
|
|
475
|
-
# This should not happen if the auth middleware is working correctly
|
|
476
|
-
return json_response({"error": "Could not identify authenticated worker."}, status=500)
|
|
477
|
-
|
|
478
|
-
if payload_worker_id and payload_worker_id != authenticated_worker_id:
|
|
479
|
-
return json_response(
|
|
480
|
-
{
|
|
481
|
-
"error": f"Forbidden: Authenticated worker '{authenticated_worker_id}' "
|
|
482
|
-
f"cannot submit results for another worker '{payload_worker_id}'.",
|
|
483
|
-
},
|
|
484
|
-
status=403,
|
|
485
|
-
)
|
|
486
|
-
|
|
487
|
-
if not job_id or not task_id:
|
|
488
|
-
return json_response({"error": "job_id and task_id are required"}, status=400)
|
|
489
|
-
|
|
490
|
-
job_state = await self.storage.get_job_state(job_id)
|
|
491
|
-
if not job_state:
|
|
492
|
-
return json_response({"error": "Job not found"}, status=404)
|
|
493
|
-
|
|
494
|
-
# Handle parallel task completion
|
|
495
|
-
if job_state.get("status") == JOB_STATUS_WAITING_FOR_PARALLEL:
|
|
496
|
-
await self.storage.remove_job_from_watch(f"{job_id}:{task_id}")
|
|
497
|
-
job_state.setdefault("aggregation_results", {})[task_id] = result
|
|
498
|
-
job_state.setdefault("active_branches", []).remove(task_id)
|
|
499
|
-
|
|
500
|
-
if not job_state["active_branches"]:
|
|
501
|
-
logger.info(f"All parallel branches for job {job_id} have completed.")
|
|
502
|
-
job_state["status"] = JOB_STATUS_RUNNING
|
|
503
|
-
job_state["current_state"] = job_state["aggregation_target"]
|
|
504
|
-
await self.storage.save_job_state(job_id, job_state)
|
|
505
|
-
await self.storage.enqueue_job(job_id)
|
|
506
|
-
else:
|
|
507
|
-
logger.info(
|
|
508
|
-
f"Branch {task_id} for job {job_id} completed. "
|
|
509
|
-
f"Waiting for {len(job_state['active_branches'])} more.",
|
|
510
|
-
)
|
|
511
|
-
await self.storage.save_job_state(job_id, job_state)
|
|
512
|
-
|
|
513
|
-
return json_response({"status": "parallel_branch_result_accepted"}, status=200)
|
|
514
|
-
|
|
515
|
-
await self.storage.remove_job_from_watch(job_id)
|
|
516
|
-
|
|
517
|
-
import time
|
|
518
|
-
|
|
519
|
-
now = time.monotonic()
|
|
520
|
-
dispatched_at = job_state.get("task_dispatched_at", now)
|
|
521
|
-
duration_ms = int((now - dispatched_at) * 1000)
|
|
522
|
-
|
|
523
|
-
await self.history_storage.log_job_event(
|
|
524
|
-
{
|
|
525
|
-
"job_id": job_id,
|
|
526
|
-
"state": job_state.get("current_state"),
|
|
527
|
-
"event_type": "task_finished",
|
|
528
|
-
"duration_ms": duration_ms,
|
|
529
|
-
"worker_id": authenticated_worker_id, # Use authenticated worker_id
|
|
530
|
-
"context_snapshot": {**job_state, "result": result},
|
|
531
|
-
},
|
|
532
|
-
)
|
|
533
|
-
|
|
534
|
-
job_state["tracing_context"] = {str(k): v for k, v in request.headers.items()}
|
|
535
|
-
|
|
536
|
-
if result_status == TASK_STATUS_FAILURE:
|
|
537
|
-
error_details = result.get("error", {})
|
|
538
|
-
error_type = ERROR_CODE_TRANSIENT
|
|
539
|
-
error_message = "No error details provided."
|
|
540
|
-
|
|
541
|
-
if isinstance(error_details, dict):
|
|
542
|
-
error_type = error_details.get("code", ERROR_CODE_TRANSIENT)
|
|
543
|
-
error_message = error_details.get("message", "No error message provided.")
|
|
544
|
-
elif isinstance(error_details, str):
|
|
545
|
-
# Fallback for old format where `error` was just a string
|
|
546
|
-
error_message = error_details
|
|
547
|
-
|
|
548
|
-
logging.warning(f"Task {task_id} for job {job_id} failed with error type '{error_type}'.")
|
|
549
|
-
|
|
550
|
-
if error_type == ERROR_CODE_PERMANENT:
|
|
551
|
-
job_state["status"] = JOB_STATUS_QUARANTINED
|
|
552
|
-
job_state["error_message"] = f"Task failed with permanent error: {error_message}"
|
|
553
|
-
await self.storage.save_job_state(job_id, job_state)
|
|
554
|
-
await self.storage.quarantine_job(job_id)
|
|
555
|
-
elif error_type == ERROR_CODE_INVALID_INPUT:
|
|
556
|
-
job_state["status"] = JOB_STATUS_FAILED
|
|
557
|
-
job_state["error_message"] = f"Task failed due to invalid input: {error_message}"
|
|
558
|
-
await self.storage.save_job_state(job_id, job_state)
|
|
559
|
-
else: # TRANSIENT_ERROR or any other/unspecified error
|
|
560
|
-
await self._handle_task_failure(job_state, task_id, error_message)
|
|
561
|
-
|
|
562
|
-
return json_response({"status": "result_accepted_failure"}, status=200)
|
|
563
|
-
|
|
564
|
-
if result_status == TASK_STATUS_CANCELLED:
|
|
565
|
-
logging.info(f"Task {task_id} for job {job_id} was cancelled by worker.")
|
|
566
|
-
job_state["status"] = JOB_STATUS_CANCELLED
|
|
567
|
-
await self.storage.save_job_state(job_id, job_state)
|
|
568
|
-
# Optionally, trigger a specific 'cancelled' transition if defined in the blueprint
|
|
569
|
-
transitions = job_state.get("current_task_transitions", {})
|
|
570
|
-
if next_state := transitions.get("cancelled"):
|
|
571
|
-
job_state["current_state"] = next_state
|
|
572
|
-
job_state["status"] = JOB_STATUS_RUNNING # It's running the cancellation handler now
|
|
573
|
-
await self.storage.save_job_state(job_id, job_state)
|
|
574
|
-
await self.storage.enqueue_job(job_id)
|
|
575
|
-
return json_response({"status": "result_accepted_cancelled"}, status=200)
|
|
576
|
-
|
|
577
|
-
transitions = job_state.get("current_task_transitions", {})
|
|
578
|
-
if next_state := transitions.get(result_status):
|
|
579
|
-
logging.info(f"Job {job_id} transitioning based on worker status '{result_status}' to state '{next_state}'")
|
|
580
|
-
|
|
581
|
-
worker_data = result.get("data")
|
|
582
|
-
if worker_data and isinstance(worker_data, dict):
|
|
583
|
-
if "state_history" not in job_state:
|
|
584
|
-
job_state["state_history"] = {}
|
|
585
|
-
job_state["state_history"].update(worker_data)
|
|
586
|
-
|
|
587
|
-
job_state["current_state"] = next_state
|
|
588
|
-
job_state["status"] = JOB_STATUS_RUNNING
|
|
589
|
-
await self.storage.save_job_state(job_id, job_state)
|
|
590
|
-
await self.storage.enqueue_job(job_id)
|
|
591
|
-
else:
|
|
592
|
-
logging.error(f"Job {job_id} failed. Worker returned unhandled status '{result_status}'.")
|
|
593
|
-
job_state["status"] = JOB_STATUS_FAILED
|
|
594
|
-
job_state["error_message"] = f"Worker returned unhandled status: {result_status}"
|
|
595
|
-
await self.storage.save_job_state(job_id, job_state)
|
|
596
|
-
|
|
597
|
-
return json_response({"status": "result_accepted_success"}, status=200)
|
|
598
|
-
|
|
599
|
-
async def _handle_task_failure(self, job_state: dict, task_id: str, error_message: str | None):
|
|
600
|
-
import logging
|
|
601
|
-
|
|
303
|
+
async def handle_task_failure(self, job_state: dict[str, Any], task_id: str, error_message: str | None) -> None:
|
|
304
|
+
"""Handles a transient task failure by retrying or quarantining."""
|
|
602
305
|
job_id = job_state["id"]
|
|
603
306
|
retry_count = job_state.get("retry_count", 0)
|
|
604
307
|
max_retries = self.config.JOB_MAX_RETRIES
|
|
605
308
|
|
|
606
309
|
if retry_count < max_retries:
|
|
607
310
|
job_state["retry_count"] = retry_count + 1
|
|
608
|
-
|
|
311
|
+
logger.info(f"Retrying task for job {job_id}. Attempt {retry_count + 1}/{max_retries}.")
|
|
609
312
|
|
|
610
313
|
task_info = job_state.get("current_task_info")
|
|
611
314
|
if not task_info:
|
|
612
|
-
|
|
315
|
+
logger.error(f"Cannot retry job {job_id}: missing 'current_task_info' in job state.")
|
|
613
316
|
job_state["status"] = JOB_STATUS_FAILED
|
|
614
317
|
job_state["error_message"] = "Cannot retry: original task info not found."
|
|
615
318
|
await self.storage.save_job_state(job_id, job_state)
|
|
319
|
+
await self.send_job_webhook(job_state, "job_failed")
|
|
616
320
|
return
|
|
617
321
|
|
|
618
322
|
now = get_running_loop().time()
|
|
@@ -626,284 +330,31 @@ class OrchestratorEngine:
|
|
|
626
330
|
|
|
627
331
|
await self.dispatcher.dispatch(job_state, task_info)
|
|
628
332
|
else:
|
|
629
|
-
|
|
333
|
+
logger.critical(f"Job {job_id} has failed {max_retries + 1} times. Moving to quarantine.")
|
|
630
334
|
job_state["status"] = JOB_STATUS_QUARANTINED
|
|
631
335
|
job_state["error_message"] = f"Task failed after {max_retries + 1} attempts: {error_message}"
|
|
632
336
|
await self.storage.save_job_state(job_id, job_state)
|
|
633
337
|
await self.storage.quarantine_job(job_id)
|
|
338
|
+
await self.send_job_webhook(job_state, "job_quarantined")
|
|
634
339
|
|
|
635
|
-
async def _human_approval_webhook_handler(self, request: web.Request) -> web.Response:
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
data = await request.json(loads=loads)
|
|
641
|
-
decision = data.get("decision")
|
|
642
|
-
if not decision:
|
|
643
|
-
return json_response({"error": "decision is required in body"}, status=400)
|
|
644
|
-
except Exception:
|
|
645
|
-
return json_response({"error": "Invalid JSON body"}, status=400)
|
|
646
|
-
job_state = await self.storage.get_job_state(job_id)
|
|
647
|
-
if not job_state:
|
|
648
|
-
return json_response({"error": "Job not found"}, status=404)
|
|
649
|
-
if job_state.get("status") not in [JOB_STATUS_WAITING_FOR_WORKER, JOB_STATUS_WAITING_FOR_HUMAN]:
|
|
650
|
-
return json_response({"error": "Job is not in a state that can be approved"}, status=409)
|
|
651
|
-
transitions = job_state.get("current_task_transitions", {})
|
|
652
|
-
next_state = transitions.get(decision)
|
|
653
|
-
if not next_state:
|
|
654
|
-
return json_response({"error": f"Invalid decision '{decision}' for this job"}, status=400)
|
|
655
|
-
job_state["current_state"] = next_state
|
|
656
|
-
job_state["status"] = JOB_STATUS_RUNNING
|
|
657
|
-
await self.storage.save_job_state(job_id, job_state)
|
|
658
|
-
await self.storage.enqueue_job(job_id)
|
|
659
|
-
return json_response({"status": "approval_received", "job_id": job_id})
|
|
660
|
-
|
|
661
|
-
async def _get_quarantined_jobs_handler(self, request: web.Request) -> web.Response:
|
|
662
|
-
"""Returns a list of all job IDs in the quarantine queue."""
|
|
663
|
-
jobs = await self.storage.get_quarantined_jobs()
|
|
664
|
-
return json_response(jobs)
|
|
665
|
-
|
|
666
|
-
async def _reload_worker_configs_handler(self, request: web.Request) -> web.Response:
|
|
667
|
-
"""Handles the dynamic reloading of worker configurations."""
|
|
668
|
-
logger.info("Received request to reload worker configurations.")
|
|
669
|
-
if not self.config.WORKERS_CONFIG_PATH:
|
|
670
|
-
return json_response(
|
|
671
|
-
{"error": "WORKERS_CONFIG_PATH is not set, cannot reload configs."},
|
|
672
|
-
status=400,
|
|
673
|
-
)
|
|
674
|
-
|
|
675
|
-
await load_worker_configs_to_redis(self.storage, self.config.WORKERS_CONFIG_PATH)
|
|
676
|
-
return json_response({"status": "worker_configs_reloaded"})
|
|
677
|
-
|
|
678
|
-
async def _flush_db_handler(self, request: web.Request) -> web.Response:
|
|
679
|
-
logger.warning("Received request to flush the database.")
|
|
680
|
-
await self.storage.flush_all()
|
|
681
|
-
await load_client_configs_to_redis(self.storage)
|
|
682
|
-
return json_response({"status": "db_flushed"}, status=200)
|
|
683
|
-
|
|
684
|
-
async def _docs_handler(self, request: web.Request) -> web.Response:
|
|
685
|
-
from importlib import resources
|
|
686
|
-
|
|
687
|
-
try:
|
|
688
|
-
content = resources.read_text("avtomatika", "api.html")
|
|
689
|
-
except FileNotFoundError:
|
|
690
|
-
logger.error("api.html not found within the avtomatika package.")
|
|
691
|
-
return json_response({"error": "Documentation file not found on server."}, status=500)
|
|
692
|
-
|
|
693
|
-
# Generate dynamic documentation for registered blueprints
|
|
694
|
-
blueprint_endpoints = []
|
|
695
|
-
for bp in self.blueprints.values():
|
|
696
|
-
if not bp.api_endpoint:
|
|
697
|
-
continue
|
|
698
|
-
|
|
699
|
-
version_prefix = f"/{bp.api_version}" if bp.api_version else ""
|
|
700
|
-
endpoint_path = bp.api_endpoint if bp.api_endpoint.startswith("/") else f"/{bp.api_endpoint}"
|
|
701
|
-
full_path = f"/api{version_prefix}{endpoint_path}"
|
|
702
|
-
|
|
703
|
-
blueprint_endpoints.append(
|
|
704
|
-
{
|
|
705
|
-
"id": f"post-create-{bp.name.replace('_', '-')}",
|
|
706
|
-
"name": f"Create {bp.name.replace('_', ' ').title()} Job",
|
|
707
|
-
"method": "POST",
|
|
708
|
-
"path": full_path,
|
|
709
|
-
"description": f"Creates and starts a new instance (Job) of the `{bp.name}` blueprint.",
|
|
710
|
-
"request": {"body": {"initial_data": {}}},
|
|
711
|
-
"responses": [
|
|
712
|
-
{
|
|
713
|
-
"code": "202 Accepted",
|
|
714
|
-
"description": "Job successfully accepted for processing.",
|
|
715
|
-
"body": {"status": "accepted", "job_id": "..."},
|
|
716
|
-
}
|
|
717
|
-
],
|
|
718
|
-
}
|
|
719
|
-
)
|
|
720
|
-
|
|
721
|
-
# Inject dynamic endpoints into the apiData structure in the HTML
|
|
722
|
-
if blueprint_endpoints:
|
|
723
|
-
endpoints_json = dumps(blueprint_endpoints, option=OPT_INDENT_2).decode("utf-8")
|
|
724
|
-
# We insert the new endpoints at the beginning of the 'Protected API' group
|
|
725
|
-
marker = "group: 'Protected API',\n endpoints: ["
|
|
726
|
-
content = content.replace(marker, f"{marker}\n{endpoints_json.strip('[]')},")
|
|
727
|
-
|
|
728
|
-
return web.Response(text=content, content_type="text/html")
|
|
729
|
-
|
|
730
|
-
def _setup_routes(self):
|
|
731
|
-
public_app = web.Application()
|
|
732
|
-
public_app.router.add_get("/status", status_handler)
|
|
733
|
-
public_app.router.add_get("/metrics", metrics_handler)
|
|
734
|
-
public_app.router.add_post("/webhooks/approval/{job_id}", self._human_approval_webhook_handler)
|
|
735
|
-
public_app.router.add_post("/debug/flush_db", self._flush_db_handler)
|
|
736
|
-
public_app.router.add_get("/docs", self._docs_handler)
|
|
737
|
-
public_app.router.add_get("/jobs/quarantined", self._get_quarantined_jobs_handler)
|
|
738
|
-
self.app.add_subapp("/_public/", public_app)
|
|
739
|
-
|
|
740
|
-
auth_middleware = client_auth_middleware_factory(self.storage)
|
|
741
|
-
quota_middleware = quota_middleware_factory(self.storage)
|
|
742
|
-
api_middlewares = [auth_middleware, quota_middleware]
|
|
743
|
-
|
|
744
|
-
protected_app = web.Application(middlewares=api_middlewares)
|
|
745
|
-
versioned_apps: dict[str, web.Application] = {}
|
|
746
|
-
has_unversioned_routes = False
|
|
747
|
-
|
|
748
|
-
for bp in self.blueprints.values():
|
|
749
|
-
if not bp.api_endpoint:
|
|
750
|
-
continue
|
|
751
|
-
endpoint = bp.api_endpoint if bp.api_endpoint.startswith("/") else f"/{bp.api_endpoint}"
|
|
752
|
-
if bp.api_version:
|
|
753
|
-
if bp.api_version not in versioned_apps:
|
|
754
|
-
versioned_apps[bp.api_version] = web.Application(middlewares=api_middlewares)
|
|
755
|
-
versioned_apps[bp.api_version].router.add_post(endpoint, self._create_job_handler(bp))
|
|
756
|
-
else:
|
|
757
|
-
protected_app.router.add_post(endpoint, self._create_job_handler(bp))
|
|
758
|
-
has_unversioned_routes = True
|
|
759
|
-
|
|
760
|
-
all_protected_apps = list(versioned_apps.values())
|
|
761
|
-
if has_unversioned_routes:
|
|
762
|
-
all_protected_apps.append(protected_app)
|
|
763
|
-
|
|
764
|
-
for app in all_protected_apps:
|
|
765
|
-
self._register_common_routes(app)
|
|
766
|
-
if has_unversioned_routes:
|
|
767
|
-
self.app.add_subapp("/api/", protected_app)
|
|
768
|
-
for version, app in versioned_apps.items():
|
|
769
|
-
self.app.add_subapp(f"/api/{version}", app)
|
|
770
|
-
|
|
771
|
-
worker_auth_middleware = worker_auth_middleware_factory(self.storage, self.config)
|
|
772
|
-
worker_middlewares = [worker_auth_middleware]
|
|
773
|
-
if self.config.RATE_LIMITING_ENABLED:
|
|
774
|
-
worker_rate_limiter = rate_limit_middleware_factory(storage=self.storage, limit=5, period=60)
|
|
775
|
-
worker_middlewares.append(worker_rate_limiter)
|
|
776
|
-
|
|
777
|
-
worker_app = web.Application(middlewares=worker_middlewares)
|
|
778
|
-
worker_app.router.add_post("/workers/register", self._register_worker_handler)
|
|
779
|
-
worker_app.router.add_get("/workers/{worker_id}/tasks/next", self._handle_get_next_task)
|
|
780
|
-
worker_app.router.add_patch("/workers/{worker_id}", self._worker_update_handler)
|
|
781
|
-
worker_app.router.add_post("/tasks/result", self._task_result_handler)
|
|
782
|
-
worker_app.router.add_get("/ws/{worker_id}", self._websocket_handler)
|
|
783
|
-
self.app.add_subapp("/_worker/", worker_app)
|
|
784
|
-
|
|
785
|
-
def _register_common_routes(self, app):
    """Attach the endpoints shared by every protected API sub-application.

    Registers the job status and cancel routes, the blueprint graph route,
    the worker and job listings, the dashboard and the worker-config reload
    route on ``app.router``. The job history route is added only when
    ``self.history_storage`` is not a ``NoOpHistoryStorage`` instance.

    Args:
        app: The application whose router receives the routes.
    """
    router = app.router
    add_get, add_post = router.add_get, router.add_post

    add_get("/jobs/{job_id}", self._get_job_status_handler)
    add_post("/jobs/{job_id}/cancel", self._cancel_job_handler)

    # The history endpoint is skipped for the no-op history storage.
    history_is_noop = isinstance(self.history_storage, NoOpHistoryStorage)
    if not history_is_noop:
        add_get("/jobs/{job_id}/history", self._get_job_history_handler)

    read_only_routes = (
        ("/blueprints/{blueprint_name}/graph", self._get_blueprint_graph_handler),
        ("/workers", self._get_workers_handler),
        ("/jobs", self._get_jobs_handler),
        ("/dashboard", self._get_dashboard_handler),
    )
    for path, handler in read_only_routes:
        add_get(path, handler)

    add_post("/admin/reload-workers", self._reload_worker_configs_handler)
|
|
795
|
-
|
|
796
|
-
async def _websocket_handler(self, request: web.Request) -> web.WebSocketResponse:
    """Serve the WebSocket connection of a single worker.

    The prepared socket is registered with ``self.ws_manager`` under the
    ``worker_id`` path parameter. Every TEXT frame is decoded as JSON and
    passed to ``ws_manager.handle_message``. The worker is unregistered when
    the receive loop ends, whether normally or through an exception.

    Args:
        request: The incoming upgrade request; ``worker_id`` is read from
            its ``match_info``.

    Returns:
        The ``WebSocketResponse`` that served the connection.

    Raises:
        web.HTTPBadRequest: If no ``worker_id`` is present in the path.
    """
    worker_id = request.match_info.get("worker_id")
    if not worker_id:
        raise web.HTTPBadRequest(text="worker_id is required")

    ws = web.WebSocketResponse()
    await ws.prepare(request)

    await self.ws_manager.register(worker_id, ws)
    try:
        async for msg in ws:
            if msg.type == WSMsgType.TEXT:
                try:
                    data = msg.json()
                    await self.ws_manager.handle_message(worker_id, data)
                except Exception as e:
                    # A single bad message must not close the connection, but the
                    # traceback is needed to diagnose it, so log via exception().
                    logger.exception(f"Error processing WebSocket message from {worker_id}: {e}")
            elif msg.type == WSMsgType.ERROR:
                logger.error(f"WebSocket connection for {worker_id} closed with exception {ws.exception()}")
                break
    finally:
        # Always drop the registration, even if the receive loop raised.
        await self.ws_manager.unregister(worker_id)
    return ws
|
|
819
|
-
|
|
820
|
-
async def _handle_get_next_task(self, request: web.Request) -> web.Response:
    """Hand the next queued task to the polling worker, if there is one.

    Calls ``self.storage.dequeue_task_for_worker`` with the worker id from
    the path and ``config.WORKER_POLL_TIMEOUT_SECONDS``.

    Returns:
        400 with a JSON error when the path carries no ``worker_id``,
        200 with the task as JSON when the storage returned a truthy task,
        204 with an empty body otherwise.
    """
    worker_id = request.match_info.get("worker_id")
    if not worker_id:
        return json_response({"error": "worker_id is required in path"}, status=400)

    logger.debug(f"Worker {worker_id} is requesting a new task.")
    next_task = await self.storage.dequeue_task_for_worker(
        worker_id,
        self.config.WORKER_POLL_TIMEOUT_SECONDS,
    )

    # Nothing was dequeued: answer with an empty 204.
    if not next_task:
        logger.debug(f"No tasks for worker {worker_id}, responding 204.")
        return web.Response(status=204)

    task_id = next_task.get("task_id")
    logger.info(f"Sending task {task_id} to worker {worker_id}")
    return json_response(next_task, status=200)
|
|
833
|
-
|
|
834
|
-
async def _worker_update_handler(self, request: web.Request) -> web.Response:
|
|
835
|
-
"""
|
|
836
|
-
Handles both full updates and lightweight heartbeats for a worker.
|
|
837
|
-
|
|
838
|
-
If the request has a JSON body, it updates the worker's data.
|
|
839
|
-
In either case, it refreshes the worker's TTL, serving as a heartbeat.
|
|
840
|
-
"""
|
|
841
|
-
worker_id = request.match_info.get("worker_id")
|
|
842
|
-
if not worker_id:
|
|
843
|
-
return json_response({"error": "worker_id is required in path"}, status=400)
|
|
844
|
-
|
|
845
|
-
ttl = self.config.WORKER_HEALTH_CHECK_INTERVAL_SECONDS * 2
|
|
846
|
-
update_data = None
|
|
847
|
-
|
|
848
|
-
# Check for body content without consuming it if it's not JSON
|
|
849
|
-
if request.can_read_body:
|
|
850
|
-
try:
|
|
851
|
-
update_data = await request.json(loads=loads)
|
|
852
|
-
except Exception:
|
|
853
|
-
logger.warning(
|
|
854
|
-
f"Received PATCH from worker {worker_id} with non-JSON body. Treating as TTL-only heartbeat."
|
|
855
|
-
)
|
|
340
|
+
async def send_job_webhook(self, job_state: dict[str, Any], event: str) -> None:
|
|
341
|
+
"""Sends a webhook notification for a job event."""
|
|
342
|
+
webhook_url = job_state.get("webhook_url")
|
|
343
|
+
if not webhook_url:
|
|
344
|
+
return
|
|
856
345
|
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
|
|
861
|
-
|
|
862
|
-
|
|
863
|
-
await self.history_storage.log_worker_event(
|
|
864
|
-
{
|
|
865
|
-
"worker_id": worker_id,
|
|
866
|
-
"event_type": "status_update",
|
|
867
|
-
"worker_info_snapshot": updated_worker,
|
|
868
|
-
},
|
|
869
|
-
)
|
|
870
|
-
return json_response(updated_worker, status=200)
|
|
871
|
-
else:
|
|
872
|
-
# Lightweight TTL-only heartbeat path
|
|
873
|
-
refreshed = await self.storage.refresh_worker_ttl(worker_id, ttl)
|
|
874
|
-
if not refreshed:
|
|
875
|
-
return json_response({"error": "Worker not found"}, status=404)
|
|
876
|
-
return json_response({"status": "ttl_refreshed"})
|
|
877
|
-
|
|
878
|
-
async def _register_worker_handler(self, request: web.Request) -> web.Response:
|
|
879
|
-
# The worker_registration_data is attached by the auth middleware
|
|
880
|
-
# to avoid reading the request body twice.
|
|
881
|
-
worker_data = request.get("worker_registration_data")
|
|
882
|
-
if not worker_data:
|
|
883
|
-
return json_response({"error": "Worker data not found in request"}, status=500)
|
|
884
|
-
|
|
885
|
-
worker_id = worker_data.get("worker_id")
|
|
886
|
-
# This check is redundant if the middleware works, but good for safety
|
|
887
|
-
if not worker_id:
|
|
888
|
-
return json_response({"error": "Missing required field: worker_id"}, status=400)
|
|
889
|
-
|
|
890
|
-
ttl = self.config.WORKER_HEALTH_CHECK_INTERVAL_SECONDS * 2
|
|
891
|
-
await self.storage.register_worker(worker_id, worker_data, ttl)
|
|
892
|
-
|
|
893
|
-
logger.info(
|
|
894
|
-
f"Worker '{worker_id}' registered with info: {worker_data}",
|
|
346
|
+
payload = WebhookPayload(
|
|
347
|
+
event=event,
|
|
348
|
+
job_id=job_state["id"],
|
|
349
|
+
status=job_state["status"],
|
|
350
|
+
result=job_state.get("state_history"), # Or specific result
|
|
351
|
+
error=job_state.get("error_message"),
|
|
895
352
|
)
|
|
896
353
|
|
|
897
|
-
|
|
898
|
-
|
|
899
|
-
"worker_id": worker_id,
|
|
900
|
-
"event_type": "registered",
|
|
901
|
-
"worker_info_snapshot": worker_data,
|
|
902
|
-
},
|
|
903
|
-
)
|
|
904
|
-
return json_response({"status": "registered"}, status=200)
|
|
354
|
+
# Run in background to not block the main flow
|
|
355
|
+
create_task(self.webhook_sender.send(webhook_url, payload))
|
|
905
356
|
|
|
906
|
-
def run(self):
|
|
357
|
+
def run(self) -> None:
|
|
907
358
|
self.setup()
|
|
908
359
|
print(
|
|
909
360
|
f"Starting OrchestratorEngine API server on {self.config.API_HOST}:{self.config.API_PORT} in blocking mode."
|