avtomatika 1.0b6__py3-none-any.whl → 1.0b8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- avtomatika/api/handlers.py +549 -0
- avtomatika/api/routes.py +118 -0
- avtomatika/app_keys.py +33 -0
- avtomatika/blueprint.py +125 -54
- avtomatika/config.py +10 -0
- avtomatika/context.py +2 -2
- avtomatika/data_types.py +4 -2
- avtomatika/dispatcher.py +9 -27
- avtomatika/engine.py +70 -601
- avtomatika/executor.py +55 -22
- avtomatika/health_checker.py +23 -5
- avtomatika/history/base.py +60 -6
- avtomatika/history/noop.py +18 -7
- avtomatika/history/postgres.py +8 -6
- avtomatika/history/sqlite.py +7 -5
- avtomatika/metrics.py +1 -1
- avtomatika/reputation.py +46 -40
- avtomatika/s3.py +323 -0
- avtomatika/scheduler.py +8 -8
- avtomatika/storage/base.py +45 -4
- avtomatika/storage/memory.py +56 -13
- avtomatika/storage/redis.py +185 -252
- avtomatika/utils/__init__.py +0 -0
- avtomatika/utils/webhook_sender.py +96 -0
- avtomatika/watcher.py +34 -38
- avtomatika/ws_manager.py +7 -6
- {avtomatika-1.0b6.dist-info → avtomatika-1.0b8.dist-info}/METADATA +91 -3
- avtomatika-1.0b8.dist-info/RECORD +46 -0
- avtomatika-1.0b6.dist-info/RECORD +0 -40
- {avtomatika-1.0b6.dist-info → avtomatika-1.0b8.dist-info}/WHEEL +0 -0
- {avtomatika-1.0b6.dist-info → avtomatika-1.0b8.dist-info}/licenses/LICENSE +0 -0
- {avtomatika-1.0b6.dist-info → avtomatika-1.0b8.dist-info}/top_level.txt +0 -0
avtomatika/engine.py
CHANGED
|
@@ -1,68 +1,52 @@
|
|
|
1
|
-
from asyncio import Task, create_task, gather, get_running_loop, wait_for
|
|
2
1
|
from asyncio import TimeoutError as AsyncTimeoutError
|
|
2
|
+
from asyncio import create_task, gather, get_running_loop, wait_for
|
|
3
3
|
from logging import getLogger
|
|
4
|
-
from typing import Any
|
|
4
|
+
from typing import Any
|
|
5
5
|
from uuid import uuid4
|
|
6
6
|
|
|
7
|
-
from aiohttp import ClientSession,
|
|
8
|
-
from
|
|
9
|
-
from aioprometheus import render
|
|
10
|
-
from orjson import OPT_INDENT_2, dumps, loads
|
|
7
|
+
from aiohttp import ClientSession, web
|
|
8
|
+
from orjson import dumps
|
|
11
9
|
|
|
12
10
|
from . import metrics
|
|
11
|
+
from .api.routes import setup_routes
|
|
12
|
+
from .app_keys import (
|
|
13
|
+
DISPATCHER_KEY,
|
|
14
|
+
ENGINE_KEY,
|
|
15
|
+
EXECUTOR_KEY,
|
|
16
|
+
EXECUTOR_TASK_KEY,
|
|
17
|
+
HEALTH_CHECKER_KEY,
|
|
18
|
+
HEALTH_CHECKER_TASK_KEY,
|
|
19
|
+
HTTP_SESSION_KEY,
|
|
20
|
+
REPUTATION_CALCULATOR_KEY,
|
|
21
|
+
REPUTATION_CALCULATOR_TASK_KEY,
|
|
22
|
+
S3_SERVICE_KEY,
|
|
23
|
+
SCHEDULER_KEY,
|
|
24
|
+
SCHEDULER_TASK_KEY,
|
|
25
|
+
WATCHER_KEY,
|
|
26
|
+
WATCHER_TASK_KEY,
|
|
27
|
+
WS_MANAGER_KEY,
|
|
28
|
+
)
|
|
13
29
|
from .blueprint import StateMachineBlueprint
|
|
14
30
|
from .client_config_loader import load_client_configs_to_redis
|
|
15
31
|
from .compression import compression_middleware
|
|
16
32
|
from .config import Config
|
|
17
|
-
from .constants import (
|
|
18
|
-
ERROR_CODE_INVALID_INPUT,
|
|
19
|
-
ERROR_CODE_PERMANENT,
|
|
20
|
-
ERROR_CODE_TRANSIENT,
|
|
21
|
-
JOB_STATUS_CANCELLED,
|
|
22
|
-
JOB_STATUS_FAILED,
|
|
23
|
-
JOB_STATUS_PENDING,
|
|
24
|
-
JOB_STATUS_QUARANTINED,
|
|
25
|
-
JOB_STATUS_RUNNING,
|
|
26
|
-
JOB_STATUS_WAITING_FOR_HUMAN,
|
|
27
|
-
JOB_STATUS_WAITING_FOR_PARALLEL,
|
|
28
|
-
JOB_STATUS_WAITING_FOR_WORKER,
|
|
29
|
-
TASK_STATUS_CANCELLED,
|
|
30
|
-
TASK_STATUS_FAILURE,
|
|
31
|
-
TASK_STATUS_SUCCESS,
|
|
32
|
-
)
|
|
33
|
+
from .constants import JOB_STATUS_FAILED, JOB_STATUS_PENDING, JOB_STATUS_QUARANTINED, JOB_STATUS_WAITING_FOR_WORKER
|
|
33
34
|
from .dispatcher import Dispatcher
|
|
34
35
|
from .executor import JobExecutor
|
|
35
36
|
from .health_checker import HealthChecker
|
|
36
37
|
from .history.base import HistoryStorageBase
|
|
37
38
|
from .history.noop import NoOpHistoryStorage
|
|
38
39
|
from .logging_config import setup_logging
|
|
39
|
-
from .quota import quota_middleware_factory
|
|
40
|
-
from .ratelimit import rate_limit_middleware_factory
|
|
41
40
|
from .reputation import ReputationCalculator
|
|
41
|
+
from .s3 import S3Service
|
|
42
42
|
from .scheduler import Scheduler
|
|
43
|
-
from .security import client_auth_middleware_factory, worker_auth_middleware_factory
|
|
44
43
|
from .storage.base import StorageBackend
|
|
45
44
|
from .telemetry import setup_telemetry
|
|
45
|
+
from .utils.webhook_sender import WebhookPayload, WebhookSender
|
|
46
46
|
from .watcher import Watcher
|
|
47
47
|
from .worker_config_loader import load_worker_configs_to_redis
|
|
48
48
|
from .ws_manager import WebSocketManager
|
|
49
49
|
|
|
50
|
-
# Application keys for storing components
|
|
51
|
-
ENGINE_KEY = AppKey("engine", "OrchestratorEngine")
|
|
52
|
-
HTTP_SESSION_KEY = AppKey("http_session", ClientSession)
|
|
53
|
-
DISPATCHER_KEY = AppKey("dispatcher", Dispatcher)
|
|
54
|
-
EXECUTOR_KEY = AppKey("executor", JobExecutor)
|
|
55
|
-
WATCHER_KEY = AppKey("watcher", Watcher)
|
|
56
|
-
REPUTATION_CALCULATOR_KEY = AppKey("reputation_calculator", ReputationCalculator)
|
|
57
|
-
HEALTH_CHECKER_KEY = AppKey("health_checker", HealthChecker)
|
|
58
|
-
SCHEDULER_KEY = AppKey("scheduler", Scheduler)
|
|
59
|
-
|
|
60
|
-
EXECUTOR_TASK_KEY = AppKey("executor_task", Task)
|
|
61
|
-
WATCHER_TASK_KEY = AppKey("watcher_task", Task)
|
|
62
|
-
REPUTATION_CALCULATOR_TASK_KEY = AppKey("reputation_calculator_task", Task)
|
|
63
|
-
HEALTH_CHECKER_TASK_KEY = AppKey("health_checker_task", Task)
|
|
64
|
-
SCHEDULER_TASK_KEY = AppKey("scheduler_task", Task)
|
|
65
|
-
|
|
66
50
|
metrics.init_metrics()
|
|
67
51
|
|
|
68
52
|
logger = getLogger(__name__)
|
|
@@ -76,14 +60,6 @@ def json_response(data: Any, **kwargs: Any) -> web.Response:
|
|
|
76
60
|
return web.json_response(data, dumps=json_dumps, **kwargs)
|
|
77
61
|
|
|
78
62
|
|
|
79
|
-
async def status_handler(_request: web.Request) -> web.Response:
|
|
80
|
-
return json_response({"status": "ok"})
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
async def metrics_handler(_request: web.Request) -> web.Response:
|
|
84
|
-
return web.Response(body=render(), content_type="text/plain")
|
|
85
|
-
|
|
86
|
-
|
|
87
63
|
class OrchestratorEngine:
|
|
88
64
|
def __init__(self, storage: StorageBackend, config: Config):
|
|
89
65
|
setup_logging(config.LOG_LEVEL, config.LOG_FORMAT, config.TZ)
|
|
@@ -97,7 +73,7 @@ class OrchestratorEngine:
|
|
|
97
73
|
self.app[ENGINE_KEY] = self
|
|
98
74
|
self._setup_done = False
|
|
99
75
|
|
|
100
|
-
def register_blueprint(self, blueprint: StateMachineBlueprint):
|
|
76
|
+
def register_blueprint(self, blueprint: StateMachineBlueprint) -> None:
|
|
101
77
|
if self._setup_done:
|
|
102
78
|
raise RuntimeError("Cannot register blueprints after engine setup.")
|
|
103
79
|
if blueprint.name in self.blueprints:
|
|
@@ -107,15 +83,15 @@ class OrchestratorEngine:
|
|
|
107
83
|
blueprint.validate()
|
|
108
84
|
self.blueprints[blueprint.name] = blueprint
|
|
109
85
|
|
|
110
|
-
def setup(self):
|
|
86
|
+
def setup(self) -> None:
|
|
111
87
|
if self._setup_done:
|
|
112
88
|
return
|
|
113
|
-
self._setup_routes()
|
|
89
|
+
setup_routes(self.app, self)
|
|
114
90
|
self.app.on_startup.append(self.on_startup)
|
|
115
91
|
self.app.on_shutdown.append(self.on_shutdown)
|
|
116
92
|
self._setup_done = True
|
|
117
93
|
|
|
118
|
-
async def _setup_history_storage(self):
|
|
94
|
+
async def _setup_history_storage(self) -> None:
|
|
119
95
|
from importlib import import_module
|
|
120
96
|
|
|
121
97
|
uri = self.config.HISTORY_DATABASE_URI
|
|
@@ -166,7 +142,12 @@ class OrchestratorEngine:
|
|
|
166
142
|
)
|
|
167
143
|
self.history_storage = NoOpHistoryStorage()
|
|
168
144
|
|
|
169
|
-
async def on_startup(self, app: web.Application):
|
|
145
|
+
async def on_startup(self, app: web.Application) -> None:
|
|
146
|
+
# 1. Fail Fast: Check Storage Connection
|
|
147
|
+
if not await self.storage.ping():
|
|
148
|
+
logger.critical("Failed to connect to Storage Backend (Redis). Exiting.")
|
|
149
|
+
raise RuntimeError("Storage Backend is unavailable.")
|
|
150
|
+
|
|
170
151
|
try:
|
|
171
152
|
from opentelemetry.instrumentation.aiohttp_client import (
|
|
172
153
|
AioHttpClientInstrumentor,
|
|
@@ -178,6 +159,8 @@ class OrchestratorEngine:
|
|
|
178
159
|
"opentelemetry-instrumentation-aiohttp-client not found. AIOHTTP client instrumentation is disabled."
|
|
179
160
|
)
|
|
180
161
|
await self._setup_history_storage()
|
|
162
|
+
# Start history background worker
|
|
163
|
+
await self.history_storage.start()
|
|
181
164
|
|
|
182
165
|
# Load client configs if the path is provided
|
|
183
166
|
if self.config.CLIENTS_CONFIG_PATH:
|
|
@@ -213,6 +196,8 @@ class OrchestratorEngine:
|
|
|
213
196
|
)
|
|
214
197
|
|
|
215
198
|
app[HTTP_SESSION_KEY] = ClientSession()
|
|
199
|
+
self.webhook_sender = WebhookSender(app[HTTP_SESSION_KEY])
|
|
200
|
+
self.webhook_sender.start()
|
|
216
201
|
self.dispatcher = Dispatcher(self.storage, self.config)
|
|
217
202
|
app[DISPATCHER_KEY] = self.dispatcher
|
|
218
203
|
app[EXECUTOR_KEY] = JobExecutor(self, self.history_storage)
|
|
@@ -220,6 +205,8 @@ class OrchestratorEngine:
|
|
|
220
205
|
app[REPUTATION_CALCULATOR_KEY] = ReputationCalculator(self)
|
|
221
206
|
app[HEALTH_CHECKER_KEY] = HealthChecker(self)
|
|
222
207
|
app[SCHEDULER_KEY] = Scheduler(self)
|
|
208
|
+
app[WS_MANAGER_KEY] = self.ws_manager
|
|
209
|
+
app[S3_SERVICE_KEY] = S3Service(self.config, self.history_storage)
|
|
223
210
|
|
|
224
211
|
app[EXECUTOR_TASK_KEY] = create_task(app[EXECUTOR_KEY].run())
|
|
225
212
|
app[WATCHER_TASK_KEY] = create_task(app[WATCHER_KEY].run())
|
|
@@ -227,7 +214,7 @@ class OrchestratorEngine:
|
|
|
227
214
|
app[HEALTH_CHECKER_TASK_KEY] = create_task(app[HEALTH_CHECKER_KEY].run())
|
|
228
215
|
app[SCHEDULER_TASK_KEY] = create_task(app[SCHEDULER_KEY].run())
|
|
229
216
|
|
|
230
|
-
async def on_shutdown(self, app: web.Application):
|
|
217
|
+
async def on_shutdown(self, app: web.Application) -> None:
|
|
231
218
|
logger.info("Shutdown sequence started.")
|
|
232
219
|
app[EXECUTOR_KEY].stop()
|
|
233
220
|
app[WATCHER_KEY].stop()
|
|
@@ -244,6 +231,13 @@ class OrchestratorEngine:
|
|
|
244
231
|
logger.info("Closing WebSocket connections...")
|
|
245
232
|
await self.ws_manager.close_all()
|
|
246
233
|
|
|
234
|
+
logger.info("Stopping WebhookSender...")
|
|
235
|
+
await self.webhook_sender.stop()
|
|
236
|
+
|
|
237
|
+
if S3_SERVICE_KEY in app:
|
|
238
|
+
logger.info("Closing S3 Service...")
|
|
239
|
+
await app[S3_SERVICE_KEY].close()
|
|
240
|
+
|
|
247
241
|
logger.info("Cancelling background tasks...")
|
|
248
242
|
app[HEALTH_CHECKER_TASK_KEY].cancel()
|
|
249
243
|
app[WATCHER_TASK_KEY].cancel()
|
|
@@ -324,295 +318,23 @@ class OrchestratorEngine:
|
|
|
324
318
|
logger.info(f"Created background job {job_id} for blueprint '{blueprint_name}' (source: {source})")
|
|
325
319
|
return job_id
|
|
326
320
|
|
|
327
|
-
def _create_job_handler(self, blueprint: StateMachineBlueprint):
|
|
328
|
-
|
|
329
|
-
try:
|
|
330
|
-
initial_data = await request.json(loads=loads)
|
|
331
|
-
except Exception:
|
|
332
|
-
return json_response({"error": "Invalid JSON body"}, status=400)
|
|
333
|
-
|
|
334
|
-
client_config = request["client_config"]
|
|
335
|
-
carrier = {str(k): v for k, v in request.headers.items()}
|
|
336
|
-
|
|
337
|
-
job_id = str(uuid4())
|
|
338
|
-
job_state = {
|
|
339
|
-
"id": job_id,
|
|
340
|
-
"blueprint_name": blueprint.name,
|
|
341
|
-
"current_state": blueprint.start_state,
|
|
342
|
-
"initial_data": initial_data,
|
|
343
|
-
"state_history": {},
|
|
344
|
-
"status": JOB_STATUS_PENDING,
|
|
345
|
-
"tracing_context": carrier,
|
|
346
|
-
"client_config": client_config,
|
|
347
|
-
}
|
|
348
|
-
await self.storage.save_job_state(job_id, job_state)
|
|
349
|
-
await self.storage.enqueue_job(job_id)
|
|
350
|
-
metrics.jobs_total.inc({metrics.LABEL_BLUEPRINT: blueprint.name})
|
|
351
|
-
return json_response({"status": "accepted", "job_id": job_id}, status=202)
|
|
352
|
-
|
|
353
|
-
return handler
|
|
354
|
-
|
|
355
|
-
async def _get_job_status_handler(self, request: web.Request) -> web.Response:
|
|
356
|
-
job_id = request.match_info.get("job_id")
|
|
357
|
-
if not job_id:
|
|
358
|
-
return json_response({"error": "job_id is required in path"}, status=400)
|
|
359
|
-
job_state = await self.storage.get_job_state(job_id)
|
|
360
|
-
if not job_state:
|
|
361
|
-
return json_response({"error": "Job not found"}, status=404)
|
|
362
|
-
return json_response(job_state, status=200)
|
|
363
|
-
|
|
364
|
-
async def _cancel_job_handler(self, request: web.Request) -> web.Response:
|
|
365
|
-
job_id = request.match_info.get("job_id")
|
|
366
|
-
if not job_id:
|
|
367
|
-
return json_response({"error": "job_id is required in path"}, status=400)
|
|
368
|
-
|
|
369
|
-
job_state = await self.storage.get_job_state(job_id)
|
|
370
|
-
if not job_state:
|
|
371
|
-
return json_response({"error": "Job not found"}, status=404)
|
|
372
|
-
|
|
373
|
-
if job_state.get("status") != JOB_STATUS_WAITING_FOR_WORKER:
|
|
374
|
-
return json_response(
|
|
375
|
-
{"error": "Job is not in a state that can be cancelled (must be waiting for a worker)."},
|
|
376
|
-
status=409,
|
|
377
|
-
)
|
|
378
|
-
|
|
379
|
-
worker_id = job_state.get("task_worker_id")
|
|
380
|
-
if not worker_id:
|
|
381
|
-
return json_response(
|
|
382
|
-
{"error": "Cannot cancel job: worker_id not found in job state."},
|
|
383
|
-
status=500,
|
|
384
|
-
)
|
|
385
|
-
|
|
386
|
-
worker_info = await self.storage.get_worker_info(worker_id)
|
|
387
|
-
task_id = job_state.get("current_task_id")
|
|
388
|
-
if not task_id:
|
|
389
|
-
return json_response(
|
|
390
|
-
{"error": "Cannot cancel job: task_id not found in job state."},
|
|
391
|
-
status=500,
|
|
392
|
-
)
|
|
393
|
-
|
|
394
|
-
# Set Redis flag as a reliable fallback/primary mechanism
|
|
395
|
-
await self.storage.set_task_cancellation_flag(task_id)
|
|
396
|
-
|
|
397
|
-
# Attempt WebSocket-based cancellation if supported
|
|
398
|
-
if worker_info and worker_info.get("capabilities", {}).get("websockets"):
|
|
399
|
-
command = {"command": "cancel_task", "task_id": task_id, "job_id": job_id}
|
|
400
|
-
sent = await self.ws_manager.send_command(worker_id, command)
|
|
401
|
-
if sent:
|
|
402
|
-
return json_response({"status": "cancellation_request_sent"})
|
|
403
|
-
else:
|
|
404
|
-
logger.warning(f"Failed to send WebSocket cancellation for task {task_id}, but Redis flag is set.")
|
|
405
|
-
# Proceed to return success, as the Redis flag will handle it
|
|
406
|
-
|
|
407
|
-
return json_response({"status": "cancellation_request_accepted"})
|
|
408
|
-
|
|
409
|
-
async def _get_job_history_handler(self, request: web.Request) -> web.Response:
|
|
410
|
-
job_id = request.match_info.get("job_id")
|
|
411
|
-
if not job_id:
|
|
412
|
-
return json_response({"error": "job_id is required in path"}, status=400)
|
|
413
|
-
history = await self.history_storage.get_job_history(job_id)
|
|
414
|
-
return json_response(history)
|
|
415
|
-
|
|
416
|
-
async def _get_blueprint_graph_handler(self, request: web.Request) -> web.Response:
|
|
417
|
-
blueprint_name = request.match_info.get("blueprint_name")
|
|
418
|
-
if not blueprint_name:
|
|
419
|
-
return json_response({"error": "blueprint_name is required in path"}, status=400)
|
|
420
|
-
|
|
421
|
-
blueprint = self.blueprints.get(blueprint_name)
|
|
422
|
-
if not blueprint:
|
|
423
|
-
return json_response({"error": "Blueprint not found"}, status=404)
|
|
424
|
-
|
|
425
|
-
try:
|
|
426
|
-
graph_dot = blueprint.render_graph()
|
|
427
|
-
return web.Response(text=graph_dot, content_type="text/vnd.graphviz")
|
|
428
|
-
except FileNotFoundError:
|
|
429
|
-
error_msg = "Graphviz is not installed on the server. Cannot generate graph."
|
|
430
|
-
logger.error(error_msg)
|
|
431
|
-
return json_response({"error": error_msg}, status=501)
|
|
432
|
-
|
|
433
|
-
async def _get_workers_handler(self, request: web.Request) -> web.Response:
|
|
434
|
-
workers = await self.storage.get_available_workers()
|
|
435
|
-
return json_response(workers)
|
|
436
|
-
|
|
437
|
-
async def _get_jobs_handler(self, request: web.Request) -> web.Response:
|
|
438
|
-
try:
|
|
439
|
-
limit = int(request.query.get("limit", "100"))
|
|
440
|
-
offset = int(request.query.get("offset", "0"))
|
|
441
|
-
except ValueError:
|
|
442
|
-
return json_response({"error": "Invalid limit/offset parameter"}, status=400)
|
|
443
|
-
|
|
444
|
-
jobs = await self.history_storage.get_jobs(limit=limit, offset=offset)
|
|
445
|
-
return json_response(jobs)
|
|
446
|
-
|
|
447
|
-
async def _get_dashboard_handler(self, request: web.Request) -> web.Response:
|
|
448
|
-
worker_count = await self.storage.get_active_worker_count()
|
|
449
|
-
queue_length = await self.storage.get_job_queue_length()
|
|
450
|
-
job_summary = await self.history_storage.get_job_summary()
|
|
451
|
-
|
|
452
|
-
dashboard_data = {
|
|
453
|
-
"workers": {"total": worker_count},
|
|
454
|
-
"jobs": {"queued": queue_length, **job_summary},
|
|
455
|
-
}
|
|
456
|
-
return json_response(dashboard_data)
|
|
457
|
-
|
|
458
|
-
async def _task_result_handler(self, request: web.Request) -> web.Response:
|
|
459
|
-
import logging
|
|
460
|
-
|
|
461
|
-
try:
|
|
462
|
-
data = await request.json(loads=loads)
|
|
463
|
-
job_id = data.get("job_id")
|
|
464
|
-
task_id = data.get("task_id")
|
|
465
|
-
result = data.get("result", {})
|
|
466
|
-
result_status = result.get("status", TASK_STATUS_SUCCESS)
|
|
467
|
-
error_message = result.get("error")
|
|
468
|
-
payload_worker_id = data.get("worker_id")
|
|
469
|
-
except Exception:
|
|
470
|
-
return json_response({"error": "Invalid JSON body"}, status=400)
|
|
471
|
-
|
|
472
|
-
# Security check: Ensure the worker_id from the payload matches the authenticated worker
|
|
473
|
-
authenticated_worker_id = request.get("worker_id")
|
|
474
|
-
if not authenticated_worker_id:
|
|
475
|
-
# This should not happen if the auth middleware is working correctly
|
|
476
|
-
return json_response({"error": "Could not identify authenticated worker."}, status=500)
|
|
477
|
-
|
|
478
|
-
if payload_worker_id and payload_worker_id != authenticated_worker_id:
|
|
479
|
-
return json_response(
|
|
480
|
-
{
|
|
481
|
-
"error": f"Forbidden: Authenticated worker '{authenticated_worker_id}' "
|
|
482
|
-
f"cannot submit results for another worker '{payload_worker_id}'.",
|
|
483
|
-
},
|
|
484
|
-
status=403,
|
|
485
|
-
)
|
|
486
|
-
|
|
487
|
-
if not job_id or not task_id:
|
|
488
|
-
return json_response({"error": "job_id and task_id are required"}, status=400)
|
|
489
|
-
|
|
490
|
-
job_state = await self.storage.get_job_state(job_id)
|
|
491
|
-
if not job_state:
|
|
492
|
-
return json_response({"error": "Job not found"}, status=404)
|
|
493
|
-
|
|
494
|
-
# Handle parallel task completion
|
|
495
|
-
if job_state.get("status") == JOB_STATUS_WAITING_FOR_PARALLEL:
|
|
496
|
-
await self.storage.remove_job_from_watch(f"{job_id}:{task_id}")
|
|
497
|
-
job_state.setdefault("aggregation_results", {})[task_id] = result
|
|
498
|
-
job_state.setdefault("active_branches", []).remove(task_id)
|
|
499
|
-
|
|
500
|
-
if not job_state["active_branches"]:
|
|
501
|
-
logger.info(f"All parallel branches for job {job_id} have completed.")
|
|
502
|
-
job_state["status"] = JOB_STATUS_RUNNING
|
|
503
|
-
job_state["current_state"] = job_state["aggregation_target"]
|
|
504
|
-
await self.storage.save_job_state(job_id, job_state)
|
|
505
|
-
await self.storage.enqueue_job(job_id)
|
|
506
|
-
else:
|
|
507
|
-
logger.info(
|
|
508
|
-
f"Branch {task_id} for job {job_id} completed. "
|
|
509
|
-
f"Waiting for {len(job_state['active_branches'])} more.",
|
|
510
|
-
)
|
|
511
|
-
await self.storage.save_job_state(job_id, job_state)
|
|
512
|
-
|
|
513
|
-
return json_response({"status": "parallel_branch_result_accepted"}, status=200)
|
|
514
|
-
|
|
515
|
-
await self.storage.remove_job_from_watch(job_id)
|
|
516
|
-
|
|
517
|
-
import time
|
|
518
|
-
|
|
519
|
-
now = time.monotonic()
|
|
520
|
-
dispatched_at = job_state.get("task_dispatched_at", now)
|
|
521
|
-
duration_ms = int((now - dispatched_at) * 1000)
|
|
522
|
-
|
|
523
|
-
await self.history_storage.log_job_event(
|
|
524
|
-
{
|
|
525
|
-
"job_id": job_id,
|
|
526
|
-
"state": job_state.get("current_state"),
|
|
527
|
-
"event_type": "task_finished",
|
|
528
|
-
"duration_ms": duration_ms,
|
|
529
|
-
"worker_id": authenticated_worker_id, # Use authenticated worker_id
|
|
530
|
-
"context_snapshot": {**job_state, "result": result},
|
|
531
|
-
},
|
|
532
|
-
)
|
|
533
|
-
|
|
534
|
-
job_state["tracing_context"] = {str(k): v for k, v in request.headers.items()}
|
|
535
|
-
|
|
536
|
-
if result_status == TASK_STATUS_FAILURE:
|
|
537
|
-
error_details = result.get("error", {})
|
|
538
|
-
error_type = ERROR_CODE_TRANSIENT
|
|
539
|
-
error_message = "No error details provided."
|
|
540
|
-
|
|
541
|
-
if isinstance(error_details, dict):
|
|
542
|
-
error_type = error_details.get("code", ERROR_CODE_TRANSIENT)
|
|
543
|
-
error_message = error_details.get("message", "No error message provided.")
|
|
544
|
-
elif isinstance(error_details, str):
|
|
545
|
-
# Fallback for old format where `error` was just a string
|
|
546
|
-
error_message = error_details
|
|
547
|
-
|
|
548
|
-
logging.warning(f"Task {task_id} for job {job_id} failed with error type '{error_type}'.")
|
|
549
|
-
|
|
550
|
-
if error_type == ERROR_CODE_PERMANENT:
|
|
551
|
-
job_state["status"] = JOB_STATUS_QUARANTINED
|
|
552
|
-
job_state["error_message"] = f"Task failed with permanent error: {error_message}"
|
|
553
|
-
await self.storage.save_job_state(job_id, job_state)
|
|
554
|
-
await self.storage.quarantine_job(job_id)
|
|
555
|
-
elif error_type == ERROR_CODE_INVALID_INPUT:
|
|
556
|
-
job_state["status"] = JOB_STATUS_FAILED
|
|
557
|
-
job_state["error_message"] = f"Task failed due to invalid input: {error_message}"
|
|
558
|
-
await self.storage.save_job_state(job_id, job_state)
|
|
559
|
-
else: # TRANSIENT_ERROR or any other/unspecified error
|
|
560
|
-
await self._handle_task_failure(job_state, task_id, error_message)
|
|
561
|
-
|
|
562
|
-
return json_response({"status": "result_accepted_failure"}, status=200)
|
|
563
|
-
|
|
564
|
-
if result_status == TASK_STATUS_CANCELLED:
|
|
565
|
-
logging.info(f"Task {task_id} for job {job_id} was cancelled by worker.")
|
|
566
|
-
job_state["status"] = JOB_STATUS_CANCELLED
|
|
567
|
-
await self.storage.save_job_state(job_id, job_state)
|
|
568
|
-
# Optionally, trigger a specific 'cancelled' transition if defined in the blueprint
|
|
569
|
-
transitions = job_state.get("current_task_transitions", {})
|
|
570
|
-
if next_state := transitions.get("cancelled"):
|
|
571
|
-
job_state["current_state"] = next_state
|
|
572
|
-
job_state["status"] = JOB_STATUS_RUNNING # It's running the cancellation handler now
|
|
573
|
-
await self.storage.save_job_state(job_id, job_state)
|
|
574
|
-
await self.storage.enqueue_job(job_id)
|
|
575
|
-
return json_response({"status": "result_accepted_cancelled"}, status=200)
|
|
576
|
-
|
|
577
|
-
transitions = job_state.get("current_task_transitions", {})
|
|
578
|
-
if next_state := transitions.get(result_status):
|
|
579
|
-
logging.info(f"Job {job_id} transitioning based on worker status '{result_status}' to state '{next_state}'")
|
|
580
|
-
|
|
581
|
-
worker_data = result.get("data")
|
|
582
|
-
if worker_data and isinstance(worker_data, dict):
|
|
583
|
-
if "state_history" not in job_state:
|
|
584
|
-
job_state["state_history"] = {}
|
|
585
|
-
job_state["state_history"].update(worker_data)
|
|
586
|
-
|
|
587
|
-
job_state["current_state"] = next_state
|
|
588
|
-
job_state["status"] = JOB_STATUS_RUNNING
|
|
589
|
-
await self.storage.save_job_state(job_id, job_state)
|
|
590
|
-
await self.storage.enqueue_job(job_id)
|
|
591
|
-
else:
|
|
592
|
-
logging.error(f"Job {job_id} failed. Worker returned unhandled status '{result_status}'.")
|
|
593
|
-
job_state["status"] = JOB_STATUS_FAILED
|
|
594
|
-
job_state["error_message"] = f"Worker returned unhandled status: {result_status}"
|
|
595
|
-
await self.storage.save_job_state(job_id, job_state)
|
|
596
|
-
|
|
597
|
-
return json_response({"status": "result_accepted_success"}, status=200)
|
|
598
|
-
|
|
599
|
-
async def _handle_task_failure(self, job_state: dict, task_id: str, error_message: str | None):
|
|
600
|
-
import logging
|
|
601
|
-
|
|
321
|
+
async def handle_task_failure(self, job_state: dict[str, Any], task_id: str, error_message: str | None) -> None:
|
|
322
|
+
"""Handles a transient task failure by retrying or quarantining."""
|
|
602
323
|
job_id = job_state["id"]
|
|
603
324
|
retry_count = job_state.get("retry_count", 0)
|
|
604
325
|
max_retries = self.config.JOB_MAX_RETRIES
|
|
605
326
|
|
|
606
327
|
if retry_count < max_retries:
|
|
607
328
|
job_state["retry_count"] = retry_count + 1
|
|
608
|
-
|
|
329
|
+
logger.info(f"Retrying task for job {job_id}. Attempt {retry_count + 1}/{max_retries}.")
|
|
609
330
|
|
|
610
331
|
task_info = job_state.get("current_task_info")
|
|
611
332
|
if not task_info:
|
|
612
|
-
|
|
333
|
+
logger.error(f"Cannot retry job {job_id}: missing 'current_task_info' in job state.")
|
|
613
334
|
job_state["status"] = JOB_STATUS_FAILED
|
|
614
335
|
job_state["error_message"] = "Cannot retry: original task info not found."
|
|
615
336
|
await self.storage.save_job_state(job_id, job_state)
|
|
337
|
+
await self.send_job_webhook(job_state, "job_failed")
|
|
616
338
|
return
|
|
617
339
|
|
|
618
340
|
now = get_running_loop().time()
|
|
@@ -626,284 +348,31 @@ class OrchestratorEngine:
|
|
|
626
348
|
|
|
627
349
|
await self.dispatcher.dispatch(job_state, task_info)
|
|
628
350
|
else:
|
|
629
|
-
|
|
351
|
+
logger.critical(f"Job {job_id} has failed {max_retries + 1} times. Moving to quarantine.")
|
|
630
352
|
job_state["status"] = JOB_STATUS_QUARANTINED
|
|
631
353
|
job_state["error_message"] = f"Task failed after {max_retries + 1} attempts: {error_message}"
|
|
632
354
|
await self.storage.save_job_state(job_id, job_state)
|
|
633
355
|
await self.storage.quarantine_job(job_id)
|
|
356
|
+
await self.send_job_webhook(job_state, "job_quarantined")
|
|
634
357
|
|
|
635
|
-
async def _human_approval_webhook_handler(self, request: web.Request) -> web.Response:
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
data = await request.json(loads=loads)
|
|
641
|
-
decision = data.get("decision")
|
|
642
|
-
if not decision:
|
|
643
|
-
return json_response({"error": "decision is required in body"}, status=400)
|
|
644
|
-
except Exception:
|
|
645
|
-
return json_response({"error": "Invalid JSON body"}, status=400)
|
|
646
|
-
job_state = await self.storage.get_job_state(job_id)
|
|
647
|
-
if not job_state:
|
|
648
|
-
return json_response({"error": "Job not found"}, status=404)
|
|
649
|
-
if job_state.get("status") not in [JOB_STATUS_WAITING_FOR_WORKER, JOB_STATUS_WAITING_FOR_HUMAN]:
|
|
650
|
-
return json_response({"error": "Job is not in a state that can be approved"}, status=409)
|
|
651
|
-
transitions = job_state.get("current_task_transitions", {})
|
|
652
|
-
next_state = transitions.get(decision)
|
|
653
|
-
if not next_state:
|
|
654
|
-
return json_response({"error": f"Invalid decision '{decision}' for this job"}, status=400)
|
|
655
|
-
job_state["current_state"] = next_state
|
|
656
|
-
job_state["status"] = JOB_STATUS_RUNNING
|
|
657
|
-
await self.storage.save_job_state(job_id, job_state)
|
|
658
|
-
await self.storage.enqueue_job(job_id)
|
|
659
|
-
return json_response({"status": "approval_received", "job_id": job_id})
|
|
660
|
-
|
|
661
|
-
async def _get_quarantined_jobs_handler(self, request: web.Request) -> web.Response:
|
|
662
|
-
"""Returns a list of all job IDs in the quarantine queue."""
|
|
663
|
-
jobs = await self.storage.get_quarantined_jobs()
|
|
664
|
-
return json_response(jobs)
|
|
665
|
-
|
|
666
|
-
async def _reload_worker_configs_handler(self, request: web.Request) -> web.Response:
|
|
667
|
-
"""Handles the dynamic reloading of worker configurations."""
|
|
668
|
-
logger.info("Received request to reload worker configurations.")
|
|
669
|
-
if not self.config.WORKERS_CONFIG_PATH:
|
|
670
|
-
return json_response(
|
|
671
|
-
{"error": "WORKERS_CONFIG_PATH is not set, cannot reload configs."},
|
|
672
|
-
status=400,
|
|
673
|
-
)
|
|
674
|
-
|
|
675
|
-
await load_worker_configs_to_redis(self.storage, self.config.WORKERS_CONFIG_PATH)
|
|
676
|
-
return json_response({"status": "worker_configs_reloaded"})
|
|
677
|
-
|
|
678
|
-
async def _flush_db_handler(self, request: web.Request) -> web.Response:
|
|
679
|
-
logger.warning("Received request to flush the database.")
|
|
680
|
-
await self.storage.flush_all()
|
|
681
|
-
await load_client_configs_to_redis(self.storage)
|
|
682
|
-
return json_response({"status": "db_flushed"}, status=200)
|
|
683
|
-
|
|
684
|
-
async def _docs_handler(self, request: web.Request) -> web.Response:
|
|
685
|
-
from importlib import resources
|
|
686
|
-
|
|
687
|
-
try:
|
|
688
|
-
content = resources.read_text("avtomatika", "api.html")
|
|
689
|
-
except FileNotFoundError:
|
|
690
|
-
logger.error("api.html not found within the avtomatika package.")
|
|
691
|
-
return json_response({"error": "Documentation file not found on server."}, status=500)
|
|
692
|
-
|
|
693
|
-
# Generate dynamic documentation for registered blueprints
|
|
694
|
-
blueprint_endpoints = []
|
|
695
|
-
for bp in self.blueprints.values():
|
|
696
|
-
if not bp.api_endpoint:
|
|
697
|
-
continue
|
|
698
|
-
|
|
699
|
-
version_prefix = f"/{bp.api_version}" if bp.api_version else ""
|
|
700
|
-
endpoint_path = bp.api_endpoint if bp.api_endpoint.startswith("/") else f"/{bp.api_endpoint}"
|
|
701
|
-
full_path = f"/api{version_prefix}{endpoint_path}"
|
|
702
|
-
|
|
703
|
-
blueprint_endpoints.append(
|
|
704
|
-
{
|
|
705
|
-
"id": f"post-create-{bp.name.replace('_', '-')}",
|
|
706
|
-
"name": f"Create {bp.name.replace('_', ' ').title()} Job",
|
|
707
|
-
"method": "POST",
|
|
708
|
-
"path": full_path,
|
|
709
|
-
"description": f"Creates and starts a new instance (Job) of the `{bp.name}` blueprint.",
|
|
710
|
-
"request": {"body": {"initial_data": {}}},
|
|
711
|
-
"responses": [
|
|
712
|
-
{
|
|
713
|
-
"code": "202 Accepted",
|
|
714
|
-
"description": "Job successfully accepted for processing.",
|
|
715
|
-
"body": {"status": "accepted", "job_id": "..."},
|
|
716
|
-
}
|
|
717
|
-
],
|
|
718
|
-
}
|
|
719
|
-
)
|
|
720
|
-
|
|
721
|
-
# Inject dynamic endpoints into the apiData structure in the HTML
|
|
722
|
-
if blueprint_endpoints:
|
|
723
|
-
endpoints_json = dumps(blueprint_endpoints, option=OPT_INDENT_2).decode("utf-8")
|
|
724
|
-
# We insert the new endpoints at the beginning of the 'Protected API' group
|
|
725
|
-
marker = "group: 'Protected API',\n endpoints: ["
|
|
726
|
-
content = content.replace(marker, f"{marker}\n{endpoints_json.strip('[]')},")
|
|
727
|
-
|
|
728
|
-
return web.Response(text=content, content_type="text/html")
|
|
729
|
-
|
|
730
|
-
def _setup_routes(self):
|
|
731
|
-
public_app = web.Application()
|
|
732
|
-
public_app.router.add_get("/status", status_handler)
|
|
733
|
-
public_app.router.add_get("/metrics", metrics_handler)
|
|
734
|
-
public_app.router.add_post("/webhooks/approval/{job_id}", self._human_approval_webhook_handler)
|
|
735
|
-
public_app.router.add_post("/debug/flush_db", self._flush_db_handler)
|
|
736
|
-
public_app.router.add_get("/docs", self._docs_handler)
|
|
737
|
-
public_app.router.add_get("/jobs/quarantined", self._get_quarantined_jobs_handler)
|
|
738
|
-
self.app.add_subapp("/_public/", public_app)
|
|
739
|
-
|
|
740
|
-
auth_middleware = client_auth_middleware_factory(self.storage)
|
|
741
|
-
quota_middleware = quota_middleware_factory(self.storage)
|
|
742
|
-
api_middlewares = [auth_middleware, quota_middleware]
|
|
743
|
-
|
|
744
|
-
protected_app = web.Application(middlewares=api_middlewares)
|
|
745
|
-
versioned_apps: dict[str, web.Application] = {}
|
|
746
|
-
has_unversioned_routes = False
|
|
747
|
-
|
|
748
|
-
for bp in self.blueprints.values():
|
|
749
|
-
if not bp.api_endpoint:
|
|
750
|
-
continue
|
|
751
|
-
endpoint = bp.api_endpoint if bp.api_endpoint.startswith("/") else f"/{bp.api_endpoint}"
|
|
752
|
-
if bp.api_version:
|
|
753
|
-
if bp.api_version not in versioned_apps:
|
|
754
|
-
versioned_apps[bp.api_version] = web.Application(middlewares=api_middlewares)
|
|
755
|
-
versioned_apps[bp.api_version].router.add_post(endpoint, self._create_job_handler(bp))
|
|
756
|
-
else:
|
|
757
|
-
protected_app.router.add_post(endpoint, self._create_job_handler(bp))
|
|
758
|
-
has_unversioned_routes = True
|
|
759
|
-
|
|
760
|
-
all_protected_apps = list(versioned_apps.values())
|
|
761
|
-
if has_unversioned_routes:
|
|
762
|
-
all_protected_apps.append(protected_app)
|
|
763
|
-
|
|
764
|
-
for app in all_protected_apps:
|
|
765
|
-
self._register_common_routes(app)
|
|
766
|
-
if has_unversioned_routes:
|
|
767
|
-
self.app.add_subapp("/api/", protected_app)
|
|
768
|
-
for version, app in versioned_apps.items():
|
|
769
|
-
self.app.add_subapp(f"/api/{version}", app)
|
|
770
|
-
|
|
771
|
-
worker_auth_middleware = worker_auth_middleware_factory(self.storage, self.config)
|
|
772
|
-
worker_middlewares = [worker_auth_middleware]
|
|
773
|
-
if self.config.RATE_LIMITING_ENABLED:
|
|
774
|
-
worker_rate_limiter = rate_limit_middleware_factory(storage=self.storage, limit=5, period=60)
|
|
775
|
-
worker_middlewares.append(worker_rate_limiter)
|
|
776
|
-
|
|
777
|
-
worker_app = web.Application(middlewares=worker_middlewares)
|
|
778
|
-
worker_app.router.add_post("/workers/register", self._register_worker_handler)
|
|
779
|
-
worker_app.router.add_get("/workers/{worker_id}/tasks/next", self._handle_get_next_task)
|
|
780
|
-
worker_app.router.add_patch("/workers/{worker_id}", self._worker_update_handler)
|
|
781
|
-
worker_app.router.add_post("/tasks/result", self._task_result_handler)
|
|
782
|
-
worker_app.router.add_get("/ws/{worker_id}", self._websocket_handler)
|
|
783
|
-
self.app.add_subapp("/_worker/", worker_app)
|
|
784
|
-
|
|
785
|
-
def _register_common_routes(self, app):
|
|
786
|
-
app.router.add_get("/jobs/{job_id}", self._get_job_status_handler)
|
|
787
|
-
app.router.add_post("/jobs/{job_id}/cancel", self._cancel_job_handler)
|
|
788
|
-
if not isinstance(self.history_storage, NoOpHistoryStorage):
|
|
789
|
-
app.router.add_get("/jobs/{job_id}/history", self._get_job_history_handler)
|
|
790
|
-
app.router.add_get("/blueprints/{blueprint_name}/graph", self._get_blueprint_graph_handler)
|
|
791
|
-
app.router.add_get("/workers", self._get_workers_handler)
|
|
792
|
-
app.router.add_get("/jobs", self._get_jobs_handler)
|
|
793
|
-
app.router.add_get("/dashboard", self._get_dashboard_handler)
|
|
794
|
-
app.router.add_post("/admin/reload-workers", self._reload_worker_configs_handler)
|
|
795
|
-
|
|
796
|
-
async def _websocket_handler(self, request: web.Request) -> web.WebSocketResponse:
|
|
797
|
-
worker_id = request.match_info.get("worker_id")
|
|
798
|
-
if not worker_id:
|
|
799
|
-
raise web.HTTPBadRequest(text="worker_id is required")
|
|
800
|
-
|
|
801
|
-
ws = web.WebSocketResponse()
|
|
802
|
-
await ws.prepare(request)
|
|
803
|
-
|
|
804
|
-
await self.ws_manager.register(worker_id, ws)
|
|
805
|
-
try:
|
|
806
|
-
async for msg in ws:
|
|
807
|
-
if msg.type == WSMsgType.TEXT:
|
|
808
|
-
try:
|
|
809
|
-
data = msg.json()
|
|
810
|
-
await self.ws_manager.handle_message(worker_id, data)
|
|
811
|
-
except Exception as e:
|
|
812
|
-
logger.error(f"Error processing WebSocket message from {worker_id}: {e}")
|
|
813
|
-
elif msg.type == WSMsgType.ERROR:
|
|
814
|
-
logger.error(f"WebSocket connection for {worker_id} closed with exception {ws.exception()}")
|
|
815
|
-
break
|
|
816
|
-
finally:
|
|
817
|
-
await self.ws_manager.unregister(worker_id)
|
|
818
|
-
return ws
|
|
819
|
-
|
|
820
|
-
async def _handle_get_next_task(self, request: web.Request) -> web.Response:
|
|
821
|
-
worker_id = request.match_info.get("worker_id")
|
|
822
|
-
if not worker_id:
|
|
823
|
-
return json_response({"error": "worker_id is required in path"}, status=400)
|
|
824
|
-
|
|
825
|
-
logger.debug(f"Worker {worker_id} is requesting a new task.")
|
|
826
|
-
task = await self.storage.dequeue_task_for_worker(worker_id, self.config.WORKER_POLL_TIMEOUT_SECONDS)
|
|
827
|
-
|
|
828
|
-
if task:
|
|
829
|
-
logger.info(f"Sending task {task.get('task_id')} to worker {worker_id}")
|
|
830
|
-
return json_response(task, status=200)
|
|
831
|
-
logger.debug(f"No tasks for worker {worker_id}, responding 204.")
|
|
832
|
-
return web.Response(status=204)
|
|
833
|
-
|
|
834
|
-
async def _worker_update_handler(self, request: web.Request) -> web.Response:
|
|
835
|
-
"""
|
|
836
|
-
Handles both full updates and lightweight heartbeats for a worker.
|
|
837
|
-
|
|
838
|
-
If the request has a JSON body, it updates the worker's data.
|
|
839
|
-
In either case, it refreshes the worker's TTL, serving as a heartbeat.
|
|
840
|
-
"""
|
|
841
|
-
worker_id = request.match_info.get("worker_id")
|
|
842
|
-
if not worker_id:
|
|
843
|
-
return json_response({"error": "worker_id is required in path"}, status=400)
|
|
844
|
-
|
|
845
|
-
ttl = self.config.WORKER_HEALTH_CHECK_INTERVAL_SECONDS * 2
|
|
846
|
-
update_data = None
|
|
847
|
-
|
|
848
|
-
# Check for body content without consuming it if it's not JSON
|
|
849
|
-
if request.can_read_body:
|
|
850
|
-
try:
|
|
851
|
-
update_data = await request.json(loads=loads)
|
|
852
|
-
except Exception:
|
|
853
|
-
logger.warning(
|
|
854
|
-
f"Received PATCH from worker {worker_id} with non-JSON body. Treating as TTL-only heartbeat."
|
|
855
|
-
)
|
|
358
|
+
async def send_job_webhook(self, job_state: dict[str, Any], event: str) -> None:
|
|
359
|
+
"""Sends a webhook notification for a job event."""
|
|
360
|
+
webhook_url = job_state.get("webhook_url")
|
|
361
|
+
if not webhook_url:
|
|
362
|
+
return
|
|
856
363
|
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
|
|
861
|
-
|
|
862
|
-
|
|
863
|
-
await self.history_storage.log_worker_event(
|
|
864
|
-
{
|
|
865
|
-
"worker_id": worker_id,
|
|
866
|
-
"event_type": "status_update",
|
|
867
|
-
"worker_info_snapshot": updated_worker,
|
|
868
|
-
},
|
|
869
|
-
)
|
|
870
|
-
return json_response(updated_worker, status=200)
|
|
871
|
-
else:
|
|
872
|
-
# Lightweight TTL-only heartbeat path
|
|
873
|
-
refreshed = await self.storage.refresh_worker_ttl(worker_id, ttl)
|
|
874
|
-
if not refreshed:
|
|
875
|
-
return json_response({"error": "Worker not found"}, status=404)
|
|
876
|
-
return json_response({"status": "ttl_refreshed"})
|
|
877
|
-
|
|
878
|
-
async def _register_worker_handler(self, request: web.Request) -> web.Response:
|
|
879
|
-
# The worker_registration_data is attached by the auth middleware
|
|
880
|
-
# to avoid reading the request body twice.
|
|
881
|
-
worker_data = request.get("worker_registration_data")
|
|
882
|
-
if not worker_data:
|
|
883
|
-
return json_response({"error": "Worker data not found in request"}, status=500)
|
|
884
|
-
|
|
885
|
-
worker_id = worker_data.get("worker_id")
|
|
886
|
-
# This check is redundant if the middleware works, but good for safety
|
|
887
|
-
if not worker_id:
|
|
888
|
-
return json_response({"error": "Missing required field: worker_id"}, status=400)
|
|
889
|
-
|
|
890
|
-
ttl = self.config.WORKER_HEALTH_CHECK_INTERVAL_SECONDS * 2
|
|
891
|
-
await self.storage.register_worker(worker_id, worker_data, ttl)
|
|
892
|
-
|
|
893
|
-
logger.info(
|
|
894
|
-
f"Worker '{worker_id}' registered with info: {worker_data}",
|
|
364
|
+
payload = WebhookPayload(
|
|
365
|
+
event=event,
|
|
366
|
+
job_id=job_state["id"],
|
|
367
|
+
status=job_state["status"],
|
|
368
|
+
result=job_state.get("state_history"), # Or specific result
|
|
369
|
+
error=job_state.get("error_message"),
|
|
895
370
|
)
|
|
896
371
|
|
|
897
|
-
|
|
898
|
-
|
|
899
|
-
"worker_id": worker_id,
|
|
900
|
-
"event_type": "registered",
|
|
901
|
-
"worker_info_snapshot": worker_data,
|
|
902
|
-
},
|
|
903
|
-
)
|
|
904
|
-
return json_response({"status": "registered"}, status=200)
|
|
372
|
+
# Run in background to not block the main flow
|
|
373
|
+
await self.webhook_sender.send(webhook_url, payload)
|
|
905
374
|
|
|
906
|
-
def run(self):
|
|
375
|
+
def run(self) -> None:
|
|
907
376
|
self.setup()
|
|
908
377
|
print(
|
|
909
378
|
f"Starting OrchestratorEngine API server on {self.config.API_HOST}:{self.config.API_PORT} in blocking mode."
|