avtomatika 1.0b5__py3-none-any.whl → 1.0b7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- avtomatika/api/handlers.py +549 -0
- avtomatika/api/routes.py +118 -0
- avtomatika/app_keys.py +32 -0
- avtomatika/blueprint.py +125 -54
- avtomatika/config.py +4 -0
- avtomatika/constants.py +30 -0
- avtomatika/context.py +2 -2
- avtomatika/data_types.py +3 -2
- avtomatika/dispatcher.py +1 -1
- avtomatika/engine.py +103 -577
- avtomatika/executor.py +21 -16
- avtomatika/history/postgres.py +56 -13
- avtomatika/history/sqlite.py +54 -34
- avtomatika/logging_config.py +58 -7
- avtomatika/scheduler.py +119 -0
- avtomatika/scheduler_config_loader.py +41 -0
- avtomatika/security.py +3 -5
- avtomatika/storage/base.py +17 -3
- avtomatika/storage/memory.py +50 -8
- avtomatika/storage/redis.py +17 -0
- avtomatika/utils/__init__.py +0 -0
- avtomatika/utils/webhook_sender.py +54 -0
- avtomatika/watcher.py +1 -3
- {avtomatika-1.0b5.dist-info → avtomatika-1.0b7.dist-info}/METADATA +77 -4
- avtomatika-1.0b7.dist-info/RECORD +45 -0
- avtomatika-1.0b5.dist-info/RECORD +0 -37
- {avtomatika-1.0b5.dist-info → avtomatika-1.0b7.dist-info}/WHEEL +0 -0
- {avtomatika-1.0b5.dist-info → avtomatika-1.0b7.dist-info}/licenses/LICENSE +0 -0
- {avtomatika-1.0b5.dist-info → avtomatika-1.0b7.dist-info}/top_level.txt +0 -0
avtomatika/engine.py
CHANGED
|
@@ -1,48 +1,50 @@
|
|
|
1
|
-
from asyncio import Task, create_task, gather, get_running_loop, wait_for
|
|
2
1
|
from asyncio import TimeoutError as AsyncTimeoutError
|
|
2
|
+
from asyncio import create_task, gather, get_running_loop, wait_for
|
|
3
3
|
from logging import getLogger
|
|
4
|
-
from typing import Any
|
|
4
|
+
from typing import Any
|
|
5
5
|
from uuid import uuid4
|
|
6
6
|
|
|
7
|
-
from aiohttp import ClientSession,
|
|
8
|
-
from
|
|
9
|
-
from aioprometheus import render
|
|
10
|
-
from orjson import OPT_INDENT_2, dumps, loads
|
|
7
|
+
from aiohttp import ClientSession, web
|
|
8
|
+
from orjson import dumps
|
|
11
9
|
|
|
12
10
|
from . import metrics
|
|
11
|
+
from .api.routes import setup_routes
|
|
12
|
+
from .app_keys import (
|
|
13
|
+
DISPATCHER_KEY,
|
|
14
|
+
ENGINE_KEY,
|
|
15
|
+
EXECUTOR_KEY,
|
|
16
|
+
EXECUTOR_TASK_KEY,
|
|
17
|
+
HEALTH_CHECKER_KEY,
|
|
18
|
+
HEALTH_CHECKER_TASK_KEY,
|
|
19
|
+
HTTP_SESSION_KEY,
|
|
20
|
+
REPUTATION_CALCULATOR_KEY,
|
|
21
|
+
REPUTATION_CALCULATOR_TASK_KEY,
|
|
22
|
+
SCHEDULER_KEY,
|
|
23
|
+
SCHEDULER_TASK_KEY,
|
|
24
|
+
WATCHER_KEY,
|
|
25
|
+
WATCHER_TASK_KEY,
|
|
26
|
+
WS_MANAGER_KEY,
|
|
27
|
+
)
|
|
13
28
|
from .blueprint import StateMachineBlueprint
|
|
14
29
|
from .client_config_loader import load_client_configs_to_redis
|
|
15
30
|
from .compression import compression_middleware
|
|
16
31
|
from .config import Config
|
|
32
|
+
from .constants import JOB_STATUS_FAILED, JOB_STATUS_PENDING, JOB_STATUS_QUARANTINED, JOB_STATUS_WAITING_FOR_WORKER
|
|
17
33
|
from .dispatcher import Dispatcher
|
|
18
34
|
from .executor import JobExecutor
|
|
19
35
|
from .health_checker import HealthChecker
|
|
20
36
|
from .history.base import HistoryStorageBase
|
|
21
37
|
from .history.noop import NoOpHistoryStorage
|
|
22
38
|
from .logging_config import setup_logging
|
|
23
|
-
from .quota import quota_middleware_factory
|
|
24
|
-
from .ratelimit import rate_limit_middleware_factory
|
|
25
39
|
from .reputation import ReputationCalculator
|
|
26
|
-
from .
|
|
40
|
+
from .scheduler import Scheduler
|
|
27
41
|
from .storage.base import StorageBackend
|
|
28
42
|
from .telemetry import setup_telemetry
|
|
43
|
+
from .utils.webhook_sender import WebhookPayload, WebhookSender
|
|
29
44
|
from .watcher import Watcher
|
|
30
45
|
from .worker_config_loader import load_worker_configs_to_redis
|
|
31
46
|
from .ws_manager import WebSocketManager
|
|
32
47
|
|
|
33
|
-
# Application keys for storing components
|
|
34
|
-
ENGINE_KEY = AppKey("engine", "OrchestratorEngine")
|
|
35
|
-
HTTP_SESSION_KEY = AppKey("http_session", ClientSession)
|
|
36
|
-
DISPATCHER_KEY = AppKey("dispatcher", Dispatcher)
|
|
37
|
-
EXECUTOR_KEY = AppKey("executor", JobExecutor)
|
|
38
|
-
WATCHER_KEY = AppKey("watcher", Watcher)
|
|
39
|
-
REPUTATION_CALCULATOR_KEY = AppKey("reputation_calculator", ReputationCalculator)
|
|
40
|
-
HEALTH_CHECKER_KEY = AppKey("health_checker", HealthChecker)
|
|
41
|
-
EXECUTOR_TASK_KEY = AppKey("executor_task", Task)
|
|
42
|
-
WATCHER_TASK_KEY = AppKey("watcher_task", Task)
|
|
43
|
-
REPUTATION_CALCULATOR_TASK_KEY = AppKey("reputation_calculator_task", Task)
|
|
44
|
-
HEALTH_CHECKER_TASK_KEY = AppKey("health_checker_task", Task)
|
|
45
|
-
|
|
46
48
|
metrics.init_metrics()
|
|
47
49
|
|
|
48
50
|
logger = getLogger(__name__)
|
|
@@ -56,17 +58,9 @@ def json_response(data: Any, **kwargs: Any) -> web.Response:
|
|
|
56
58
|
return web.json_response(data, dumps=json_dumps, **kwargs)
|
|
57
59
|
|
|
58
60
|
|
|
59
|
-
async def status_handler(_request: web.Request) -> web.Response:
|
|
60
|
-
return json_response({"status": "ok"})
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
async def metrics_handler(_request: web.Request) -> web.Response:
|
|
64
|
-
return web.Response(body=render(), content_type="text/plain")
|
|
65
|
-
|
|
66
|
-
|
|
67
61
|
class OrchestratorEngine:
|
|
68
62
|
def __init__(self, storage: StorageBackend, config: Config):
|
|
69
|
-
setup_logging(config.LOG_LEVEL, config.LOG_FORMAT)
|
|
63
|
+
setup_logging(config.LOG_LEVEL, config.LOG_FORMAT, config.TZ)
|
|
70
64
|
setup_telemetry()
|
|
71
65
|
self.storage = storage
|
|
72
66
|
self.config = config
|
|
@@ -77,7 +71,7 @@ class OrchestratorEngine:
|
|
|
77
71
|
self.app[ENGINE_KEY] = self
|
|
78
72
|
self._setup_done = False
|
|
79
73
|
|
|
80
|
-
def register_blueprint(self, blueprint: StateMachineBlueprint):
|
|
74
|
+
def register_blueprint(self, blueprint: StateMachineBlueprint) -> None:
|
|
81
75
|
if self._setup_done:
|
|
82
76
|
raise RuntimeError("Cannot register blueprints after engine setup.")
|
|
83
77
|
if blueprint.name in self.blueprints:
|
|
@@ -87,15 +81,15 @@ class OrchestratorEngine:
|
|
|
87
81
|
blueprint.validate()
|
|
88
82
|
self.blueprints[blueprint.name] = blueprint
|
|
89
83
|
|
|
90
|
-
def setup(self):
|
|
84
|
+
def setup(self) -> None:
|
|
91
85
|
if self._setup_done:
|
|
92
86
|
return
|
|
93
|
-
self.
|
|
87
|
+
setup_routes(self.app, self)
|
|
94
88
|
self.app.on_startup.append(self.on_startup)
|
|
95
89
|
self.app.on_shutdown.append(self.on_shutdown)
|
|
96
90
|
self._setup_done = True
|
|
97
91
|
|
|
98
|
-
async def _setup_history_storage(self):
|
|
92
|
+
async def _setup_history_storage(self) -> None:
|
|
99
93
|
from importlib import import_module
|
|
100
94
|
|
|
101
95
|
uri = self.config.HISTORY_DATABASE_URI
|
|
@@ -115,7 +109,7 @@ class OrchestratorEngine:
|
|
|
115
109
|
storage_class = module.SQLiteHistoryStorage
|
|
116
110
|
parsed_uri = urlparse(uri)
|
|
117
111
|
db_path = parsed_uri.path
|
|
118
|
-
storage_args = [db_path]
|
|
112
|
+
storage_args = [db_path, self.config.TZ]
|
|
119
113
|
except ImportError as e:
|
|
120
114
|
logger.error(f"Could not import SQLiteHistoryStorage, perhaps aiosqlite is not installed? Error: {e}")
|
|
121
115
|
self.history_storage = NoOpHistoryStorage()
|
|
@@ -125,7 +119,7 @@ class OrchestratorEngine:
|
|
|
125
119
|
try:
|
|
126
120
|
module = import_module(".history.postgres", package="avtomatika")
|
|
127
121
|
storage_class = module.PostgresHistoryStorage
|
|
128
|
-
storage_args = [uri]
|
|
122
|
+
storage_args = [uri, self.config.TZ]
|
|
129
123
|
except ImportError as e:
|
|
130
124
|
logger.error(f"Could not import PostgresHistoryStorage, perhaps asyncpg is not installed? Error: {e}")
|
|
131
125
|
self.history_storage = NoOpHistoryStorage()
|
|
@@ -146,7 +140,7 @@ class OrchestratorEngine:
|
|
|
146
140
|
)
|
|
147
141
|
self.history_storage = NoOpHistoryStorage()
|
|
148
142
|
|
|
149
|
-
async def on_startup(self, app: web.Application):
|
|
143
|
+
async def on_startup(self, app: web.Application) -> None:
|
|
150
144
|
try:
|
|
151
145
|
from opentelemetry.instrumentation.aiohttp_client import (
|
|
152
146
|
AioHttpClientInstrumentor,
|
|
@@ -193,24 +187,29 @@ class OrchestratorEngine:
|
|
|
193
187
|
)
|
|
194
188
|
|
|
195
189
|
app[HTTP_SESSION_KEY] = ClientSession()
|
|
190
|
+
self.webhook_sender = WebhookSender(app[HTTP_SESSION_KEY])
|
|
196
191
|
self.dispatcher = Dispatcher(self.storage, self.config)
|
|
197
192
|
app[DISPATCHER_KEY] = self.dispatcher
|
|
198
193
|
app[EXECUTOR_KEY] = JobExecutor(self, self.history_storage)
|
|
199
194
|
app[WATCHER_KEY] = Watcher(self)
|
|
200
195
|
app[REPUTATION_CALCULATOR_KEY] = ReputationCalculator(self)
|
|
201
196
|
app[HEALTH_CHECKER_KEY] = HealthChecker(self)
|
|
197
|
+
app[SCHEDULER_KEY] = Scheduler(self)
|
|
198
|
+
app[WS_MANAGER_KEY] = self.ws_manager
|
|
202
199
|
|
|
203
200
|
app[EXECUTOR_TASK_KEY] = create_task(app[EXECUTOR_KEY].run())
|
|
204
201
|
app[WATCHER_TASK_KEY] = create_task(app[WATCHER_KEY].run())
|
|
205
202
|
app[REPUTATION_CALCULATOR_TASK_KEY] = create_task(app[REPUTATION_CALCULATOR_KEY].run())
|
|
206
203
|
app[HEALTH_CHECKER_TASK_KEY] = create_task(app[HEALTH_CHECKER_KEY].run())
|
|
204
|
+
app[SCHEDULER_TASK_KEY] = create_task(app[SCHEDULER_KEY].run())
|
|
207
205
|
|
|
208
|
-
async def on_shutdown(self, app: web.Application):
|
|
206
|
+
async def on_shutdown(self, app: web.Application) -> None:
|
|
209
207
|
logger.info("Shutdown sequence started.")
|
|
210
208
|
app[EXECUTOR_KEY].stop()
|
|
211
209
|
app[WATCHER_KEY].stop()
|
|
212
210
|
app[REPUTATION_CALCULATOR_KEY].stop()
|
|
213
211
|
app[HEALTH_CHECKER_KEY].stop()
|
|
212
|
+
app[SCHEDULER_KEY].stop()
|
|
214
213
|
logger.info("Background task running flags set to False.")
|
|
215
214
|
|
|
216
215
|
if hasattr(self.history_storage, "close"):
|
|
@@ -226,6 +225,8 @@ class OrchestratorEngine:
|
|
|
226
225
|
app[WATCHER_TASK_KEY].cancel()
|
|
227
226
|
app[REPUTATION_CALCULATOR_TASK_KEY].cancel()
|
|
228
227
|
app[EXECUTOR_TASK_KEY].cancel()
|
|
228
|
+
# Scheduler task manages its own loop cancellation in stop(), but just in case:
|
|
229
|
+
app[SCHEDULER_TASK_KEY].cancel()
|
|
229
230
|
logger.info("Background tasks cancelled.")
|
|
230
231
|
|
|
231
232
|
logger.info("Gathering background tasks with a 10s timeout...")
|
|
@@ -236,6 +237,7 @@ class OrchestratorEngine:
|
|
|
236
237
|
app[WATCHER_TASK_KEY],
|
|
237
238
|
app[REPUTATION_CALCULATOR_TASK_KEY],
|
|
238
239
|
app[EXECUTOR_TASK_KEY],
|
|
240
|
+
app[SCHEDULER_TASK_KEY],
|
|
239
241
|
return_exceptions=True,
|
|
240
242
|
),
|
|
241
243
|
timeout=10.0,
|
|
@@ -249,586 +251,110 @@ class OrchestratorEngine:
|
|
|
249
251
|
logger.info("HTTP session closed.")
|
|
250
252
|
logger.info("Shutdown sequence finished.")
|
|
251
253
|
|
|
252
|
-
def
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
job_id = str(uuid4())
|
|
263
|
-
job_state = {
|
|
264
|
-
"id": job_id,
|
|
265
|
-
"blueprint_name": blueprint.name,
|
|
266
|
-
"current_state": blueprint.start_state,
|
|
267
|
-
"initial_data": initial_data,
|
|
268
|
-
"state_history": {},
|
|
269
|
-
"status": "pending",
|
|
270
|
-
"tracing_context": carrier,
|
|
271
|
-
"client_config": client_config,
|
|
272
|
-
}
|
|
273
|
-
await self.storage.save_job_state(job_id, job_state)
|
|
274
|
-
await self.storage.enqueue_job(job_id)
|
|
275
|
-
metrics.jobs_total.inc({metrics.LABEL_BLUEPRINT: blueprint.name})
|
|
276
|
-
return json_response({"status": "accepted", "job_id": job_id}, status=202)
|
|
277
|
-
|
|
278
|
-
return handler
|
|
279
|
-
|
|
280
|
-
async def _get_job_status_handler(self, request: web.Request) -> web.Response:
|
|
281
|
-
job_id = request.match_info.get("job_id")
|
|
282
|
-
if not job_id:
|
|
283
|
-
return json_response({"error": "job_id is required in path"}, status=400)
|
|
284
|
-
job_state = await self.storage.get_job_state(job_id)
|
|
285
|
-
if not job_state:
|
|
286
|
-
return json_response({"error": "Job not found"}, status=404)
|
|
287
|
-
return json_response(job_state, status=200)
|
|
288
|
-
|
|
289
|
-
async def _cancel_job_handler(self, request: web.Request) -> web.Response:
|
|
290
|
-
job_id = request.match_info.get("job_id")
|
|
291
|
-
if not job_id:
|
|
292
|
-
return json_response({"error": "job_id is required in path"}, status=400)
|
|
293
|
-
|
|
294
|
-
job_state = await self.storage.get_job_state(job_id)
|
|
295
|
-
if not job_state:
|
|
296
|
-
return json_response({"error": "Job not found"}, status=404)
|
|
297
|
-
|
|
298
|
-
if job_state.get("status") != "waiting_for_worker":
|
|
299
|
-
return json_response(
|
|
300
|
-
{"error": "Job is not in a state that can be cancelled (must be waiting for a worker)."},
|
|
301
|
-
status=409,
|
|
302
|
-
)
|
|
303
|
-
|
|
304
|
-
worker_id = job_state.get("task_worker_id")
|
|
305
|
-
if not worker_id:
|
|
306
|
-
return json_response(
|
|
307
|
-
{"error": "Cannot cancel job: worker_id not found in job state."},
|
|
308
|
-
status=500,
|
|
309
|
-
)
|
|
310
|
-
|
|
311
|
-
worker_info = await self.storage.get_worker_info(worker_id)
|
|
312
|
-
task_id = job_state.get("current_task_id")
|
|
313
|
-
if not task_id:
|
|
314
|
-
return json_response(
|
|
315
|
-
{"error": "Cannot cancel job: task_id not found in job state."},
|
|
316
|
-
status=500,
|
|
317
|
-
)
|
|
318
|
-
|
|
319
|
-
# Set Redis flag as a reliable fallback/primary mechanism
|
|
320
|
-
await self.storage.set_task_cancellation_flag(task_id)
|
|
321
|
-
|
|
322
|
-
# Attempt WebSocket-based cancellation if supported
|
|
323
|
-
if worker_info and worker_info.get("capabilities", {}).get("websockets"):
|
|
324
|
-
command = {"command": "cancel_task", "task_id": task_id, "job_id": job_id}
|
|
325
|
-
sent = await self.ws_manager.send_command(worker_id, command)
|
|
326
|
-
if sent:
|
|
327
|
-
return json_response({"status": "cancellation_request_sent"})
|
|
328
|
-
else:
|
|
329
|
-
logger.warning(f"Failed to send WebSocket cancellation for task {task_id}, but Redis flag is set.")
|
|
330
|
-
# Proceed to return success, as the Redis flag will handle it
|
|
331
|
-
|
|
332
|
-
return json_response({"status": "cancellation_request_accepted"})
|
|
333
|
-
|
|
334
|
-
async def _get_job_history_handler(self, request: web.Request) -> web.Response:
|
|
335
|
-
job_id = request.match_info.get("job_id")
|
|
336
|
-
if not job_id:
|
|
337
|
-
return json_response({"error": "job_id is required in path"}, status=400)
|
|
338
|
-
history = await self.history_storage.get_job_history(job_id)
|
|
339
|
-
return json_response(history)
|
|
340
|
-
|
|
341
|
-
async def _get_blueprint_graph_handler(self, request: web.Request) -> web.Response:
|
|
342
|
-
blueprint_name = request.match_info.get("blueprint_name")
|
|
343
|
-
if not blueprint_name:
|
|
344
|
-
return json_response({"error": "blueprint_name is required in path"}, status=400)
|
|
345
|
-
|
|
254
|
+
async def create_background_job(
|
|
255
|
+
self,
|
|
256
|
+
blueprint_name: str,
|
|
257
|
+
initial_data: dict[str, Any],
|
|
258
|
+
source: str = "internal",
|
|
259
|
+
) -> str:
|
|
260
|
+
"""Creates a job directly, bypassing the HTTP API layer.
|
|
261
|
+
Useful for internal schedulers and triggers.
|
|
262
|
+
"""
|
|
346
263
|
blueprint = self.blueprints.get(blueprint_name)
|
|
347
264
|
if not blueprint:
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
return json_response({"error": error_msg}, status=501)
|
|
357
|
-
|
|
358
|
-
async def _get_workers_handler(self, request: web.Request) -> web.Response:
|
|
359
|
-
workers = await self.storage.get_available_workers()
|
|
360
|
-
return json_response(workers)
|
|
361
|
-
|
|
362
|
-
async def _get_jobs_handler(self, request: web.Request) -> web.Response:
|
|
363
|
-
try:
|
|
364
|
-
limit = int(request.query.get("limit", "100"))
|
|
365
|
-
offset = int(request.query.get("offset", "0"))
|
|
366
|
-
except ValueError:
|
|
367
|
-
return json_response({"error": "Invalid limit/offset parameter"}, status=400)
|
|
368
|
-
|
|
369
|
-
jobs = await self.history_storage.get_jobs(limit=limit, offset=offset)
|
|
370
|
-
return json_response(jobs)
|
|
371
|
-
|
|
372
|
-
async def _get_dashboard_handler(self, request: web.Request) -> web.Response:
|
|
373
|
-
worker_count = await self.storage.get_active_worker_count()
|
|
374
|
-
queue_length = await self.storage.get_job_queue_length()
|
|
375
|
-
job_summary = await self.history_storage.get_job_summary()
|
|
376
|
-
|
|
377
|
-
dashboard_data = {
|
|
378
|
-
"workers": {"total": worker_count},
|
|
379
|
-
"jobs": {"queued": queue_length, **job_summary},
|
|
265
|
+
raise ValueError(f"Blueprint '{blueprint_name}' not found.")
|
|
266
|
+
|
|
267
|
+
job_id = str(uuid4())
|
|
268
|
+
# Use a special internal client config
|
|
269
|
+
client_config = {
|
|
270
|
+
"token": "internal-scheduler",
|
|
271
|
+
"plan": "system",
|
|
272
|
+
"params": {"source": source},
|
|
380
273
|
}
|
|
381
|
-
return json_response(dashboard_data)
|
|
382
|
-
|
|
383
|
-
async def _task_result_handler(self, request: web.Request) -> web.Response:
|
|
384
|
-
import logging
|
|
385
|
-
|
|
386
|
-
try:
|
|
387
|
-
data = await request.json(loads=loads)
|
|
388
|
-
job_id = data.get("job_id")
|
|
389
|
-
task_id = data.get("task_id")
|
|
390
|
-
result = data.get("result", {})
|
|
391
|
-
result_status = result.get("status", "success")
|
|
392
|
-
error_message = result.get("error")
|
|
393
|
-
payload_worker_id = data.get("worker_id")
|
|
394
|
-
except Exception:
|
|
395
|
-
return json_response({"error": "Invalid JSON body"}, status=400)
|
|
396
|
-
|
|
397
|
-
# Security check: Ensure the worker_id from the payload matches the authenticated worker
|
|
398
|
-
authenticated_worker_id = request.get("worker_id")
|
|
399
|
-
if not authenticated_worker_id:
|
|
400
|
-
# This should not happen if the auth middleware is working correctly
|
|
401
|
-
return json_response({"error": "Could not identify authenticated worker."}, status=500)
|
|
402
|
-
|
|
403
|
-
if payload_worker_id and payload_worker_id != authenticated_worker_id:
|
|
404
|
-
return json_response(
|
|
405
|
-
{
|
|
406
|
-
"error": f"Forbidden: Authenticated worker '{authenticated_worker_id}' "
|
|
407
|
-
f"cannot submit results for another worker '{payload_worker_id}'.",
|
|
408
|
-
},
|
|
409
|
-
status=403,
|
|
410
|
-
)
|
|
411
|
-
|
|
412
|
-
if not job_id or not task_id:
|
|
413
|
-
return json_response({"error": "job_id and task_id are required"}, status=400)
|
|
414
|
-
|
|
415
|
-
job_state = await self.storage.get_job_state(job_id)
|
|
416
|
-
if not job_state:
|
|
417
|
-
return json_response({"error": "Job not found"}, status=404)
|
|
418
|
-
|
|
419
|
-
# Handle parallel task completion
|
|
420
|
-
if job_state.get("status") == "waiting_for_parallel_tasks":
|
|
421
|
-
await self.storage.remove_job_from_watch(f"{job_id}:{task_id}")
|
|
422
|
-
job_state.setdefault("aggregation_results", {})[task_id] = result
|
|
423
|
-
job_state.setdefault("active_branches", []).remove(task_id)
|
|
424
|
-
|
|
425
|
-
if not job_state["active_branches"]:
|
|
426
|
-
logger.info(f"All parallel branches for job {job_id} have completed.")
|
|
427
|
-
job_state["status"] = "running"
|
|
428
|
-
job_state["current_state"] = job_state["aggregation_target"]
|
|
429
|
-
await self.storage.save_job_state(job_id, job_state)
|
|
430
|
-
await self.storage.enqueue_job(job_id)
|
|
431
|
-
else:
|
|
432
|
-
logger.info(
|
|
433
|
-
f"Branch {task_id} for job {job_id} completed. "
|
|
434
|
-
f"Waiting for {len(job_state['active_branches'])} more.",
|
|
435
|
-
)
|
|
436
|
-
await self.storage.save_job_state(job_id, job_state)
|
|
437
|
-
|
|
438
|
-
return json_response({"status": "parallel_branch_result_accepted"}, status=200)
|
|
439
|
-
|
|
440
|
-
await self.storage.remove_job_from_watch(job_id)
|
|
441
|
-
|
|
442
|
-
import time
|
|
443
274
|
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
275
|
+
job_state = {
|
|
276
|
+
"id": job_id,
|
|
277
|
+
"blueprint_name": blueprint.name,
|
|
278
|
+
"current_state": blueprint.start_state,
|
|
279
|
+
"initial_data": initial_data,
|
|
280
|
+
"state_history": {},
|
|
281
|
+
"status": JOB_STATUS_PENDING,
|
|
282
|
+
"tracing_context": {},
|
|
283
|
+
"client_config": client_config,
|
|
284
|
+
}
|
|
285
|
+
await self.storage.save_job_state(job_id, job_state)
|
|
286
|
+
await self.storage.enqueue_job(job_id)
|
|
287
|
+
metrics.jobs_total.inc({metrics.LABEL_BLUEPRINT: blueprint.name})
|
|
447
288
|
|
|
289
|
+
# Log the creation in history as well (so we can track scheduled jobs)
|
|
448
290
|
await self.history_storage.log_job_event(
|
|
449
291
|
{
|
|
450
292
|
"job_id": job_id,
|
|
451
|
-
"state":
|
|
452
|
-
"event_type": "
|
|
453
|
-
"
|
|
454
|
-
"
|
|
455
|
-
|
|
456
|
-
},
|
|
293
|
+
"state": "pending",
|
|
294
|
+
"event_type": "job_created",
|
|
295
|
+
"context_snapshot": job_state,
|
|
296
|
+
"metadata": {"source": source, "scheduled": True},
|
|
297
|
+
}
|
|
457
298
|
)
|
|
458
299
|
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
if result_status == "failure":
|
|
462
|
-
error_details = result.get("error", {})
|
|
463
|
-
error_type = "TRANSIENT_ERROR"
|
|
464
|
-
error_message = "No error details provided."
|
|
465
|
-
|
|
466
|
-
if isinstance(error_details, dict):
|
|
467
|
-
error_type = error_details.get("code", "TRANSIENT_ERROR")
|
|
468
|
-
error_message = error_details.get("message", "No error message provided.")
|
|
469
|
-
elif isinstance(error_details, str):
|
|
470
|
-
# Fallback for old format where `error` was just a string
|
|
471
|
-
error_message = error_details
|
|
472
|
-
|
|
473
|
-
logging.warning(f"Task {task_id} for job {job_id} failed with error type '{error_type}'.")
|
|
474
|
-
|
|
475
|
-
if error_type == "PERMANENT_ERROR":
|
|
476
|
-
job_state["status"] = "quarantined"
|
|
477
|
-
job_state["error_message"] = f"Task failed with permanent error: {error_message}"
|
|
478
|
-
await self.storage.save_job_state(job_id, job_state)
|
|
479
|
-
await self.storage.quarantine_job(job_id)
|
|
480
|
-
elif error_type == "INVALID_INPUT_ERROR":
|
|
481
|
-
job_state["status"] = "failed"
|
|
482
|
-
job_state["error_message"] = f"Task failed due to invalid input: {error_message}"
|
|
483
|
-
await self.storage.save_job_state(job_id, job_state)
|
|
484
|
-
else: # TRANSIENT_ERROR or any other/unspecified error
|
|
485
|
-
await self._handle_task_failure(job_state, task_id, error_message)
|
|
486
|
-
|
|
487
|
-
return json_response({"status": "result_accepted_failure"}, status=200)
|
|
488
|
-
|
|
489
|
-
if result_status == "cancelled":
|
|
490
|
-
logging.info(f"Task {task_id} for job {job_id} was cancelled by worker.")
|
|
491
|
-
job_state["status"] = "cancelled"
|
|
492
|
-
await self.storage.save_job_state(job_id, job_state)
|
|
493
|
-
# Optionally, trigger a specific 'cancelled' transition if defined in the blueprint
|
|
494
|
-
transitions = job_state.get("current_task_transitions", {})
|
|
495
|
-
if next_state := transitions.get("cancelled"):
|
|
496
|
-
job_state["current_state"] = next_state
|
|
497
|
-
job_state["status"] = "running" # It's running the cancellation handler now
|
|
498
|
-
await self.storage.save_job_state(job_id, job_state)
|
|
499
|
-
await self.storage.enqueue_job(job_id)
|
|
500
|
-
return json_response({"status": "result_accepted_cancelled"}, status=200)
|
|
501
|
-
|
|
502
|
-
transitions = job_state.get("current_task_transitions", {})
|
|
503
|
-
if next_state := transitions.get(result_status):
|
|
504
|
-
logging.info(f"Job {job_id} transitioning based on worker status '{result_status}' to state '{next_state}'")
|
|
505
|
-
|
|
506
|
-
worker_data = result.get("data")
|
|
507
|
-
if worker_data and isinstance(worker_data, dict):
|
|
508
|
-
if "state_history" not in job_state:
|
|
509
|
-
job_state["state_history"] = {}
|
|
510
|
-
job_state["state_history"].update(worker_data)
|
|
511
|
-
|
|
512
|
-
job_state["current_state"] = next_state
|
|
513
|
-
job_state["status"] = "running"
|
|
514
|
-
await self.storage.save_job_state(job_id, job_state)
|
|
515
|
-
await self.storage.enqueue_job(job_id)
|
|
516
|
-
else:
|
|
517
|
-
logging.error(f"Job {job_id} failed. Worker returned unhandled status '{result_status}'.")
|
|
518
|
-
job_state["status"] = "failed"
|
|
519
|
-
job_state["error_message"] = f"Worker returned unhandled status: {result_status}"
|
|
520
|
-
await self.storage.save_job_state(job_id, job_state)
|
|
521
|
-
|
|
522
|
-
return json_response({"status": "result_accepted_success"}, status=200)
|
|
523
|
-
|
|
524
|
-
async def _handle_task_failure(self, job_state: dict, task_id: str, error_message: str | None):
|
|
525
|
-
import logging
|
|
300
|
+
logger.info(f"Created background job {job_id} for blueprint '{blueprint_name}' (source: {source})")
|
|
301
|
+
return job_id
|
|
526
302
|
|
|
303
|
+
async def handle_task_failure(self, job_state: dict[str, Any], task_id: str, error_message: str | None) -> None:
|
|
304
|
+
"""Handles a transient task failure by retrying or quarantining."""
|
|
527
305
|
job_id = job_state["id"]
|
|
528
306
|
retry_count = job_state.get("retry_count", 0)
|
|
529
307
|
max_retries = self.config.JOB_MAX_RETRIES
|
|
530
308
|
|
|
531
309
|
if retry_count < max_retries:
|
|
532
310
|
job_state["retry_count"] = retry_count + 1
|
|
533
|
-
|
|
311
|
+
logger.info(f"Retrying task for job {job_id}. Attempt {retry_count + 1}/{max_retries}.")
|
|
534
312
|
|
|
535
313
|
task_info = job_state.get("current_task_info")
|
|
536
314
|
if not task_info:
|
|
537
|
-
|
|
538
|
-
job_state["status"] =
|
|
315
|
+
logger.error(f"Cannot retry job {job_id}: missing 'current_task_info' in job state.")
|
|
316
|
+
job_state["status"] = JOB_STATUS_FAILED
|
|
539
317
|
job_state["error_message"] = "Cannot retry: original task info not found."
|
|
540
318
|
await self.storage.save_job_state(job_id, job_state)
|
|
319
|
+
await self.send_job_webhook(job_state, "job_failed")
|
|
541
320
|
return
|
|
542
321
|
|
|
543
322
|
now = get_running_loop().time()
|
|
544
323
|
timeout_seconds = task_info.get("timeout_seconds", self.config.WORKER_TIMEOUT_SECONDS)
|
|
545
324
|
timeout_at = now + timeout_seconds
|
|
546
325
|
|
|
547
|
-
job_state["status"] =
|
|
326
|
+
job_state["status"] = JOB_STATUS_WAITING_FOR_WORKER
|
|
548
327
|
job_state["task_dispatched_at"] = now
|
|
549
328
|
await self.storage.save_job_state(job_id, job_state)
|
|
550
329
|
await self.storage.add_job_to_watch(job_id, timeout_at)
|
|
551
330
|
|
|
552
331
|
await self.dispatcher.dispatch(job_state, task_info)
|
|
553
332
|
else:
|
|
554
|
-
|
|
555
|
-
job_state["status"] =
|
|
333
|
+
logger.critical(f"Job {job_id} has failed {max_retries + 1} times. Moving to quarantine.")
|
|
334
|
+
job_state["status"] = JOB_STATUS_QUARANTINED
|
|
556
335
|
job_state["error_message"] = f"Task failed after {max_retries + 1} attempts: {error_message}"
|
|
557
336
|
await self.storage.save_job_state(job_id, job_state)
|
|
558
337
|
await self.storage.quarantine_job(job_id)
|
|
338
|
+
await self.send_job_webhook(job_state, "job_quarantined")
|
|
559
339
|
|
|
560
|
-
async def
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
data = await request.json(loads=loads)
|
|
566
|
-
decision = data.get("decision")
|
|
567
|
-
if not decision:
|
|
568
|
-
return json_response({"error": "decision is required in body"}, status=400)
|
|
569
|
-
except Exception:
|
|
570
|
-
return json_response({"error": "Invalid JSON body"}, status=400)
|
|
571
|
-
job_state = await self.storage.get_job_state(job_id)
|
|
572
|
-
if not job_state:
|
|
573
|
-
return json_response({"error": "Job not found"}, status=404)
|
|
574
|
-
if job_state.get("status") not in ["waiting_for_worker", "waiting_for_human"]:
|
|
575
|
-
return json_response({"error": "Job is not in a state that can be approved"}, status=409)
|
|
576
|
-
transitions = job_state.get("current_task_transitions", {})
|
|
577
|
-
next_state = transitions.get(decision)
|
|
578
|
-
if not next_state:
|
|
579
|
-
return json_response({"error": f"Invalid decision '{decision}' for this job"}, status=400)
|
|
580
|
-
job_state["current_state"] = next_state
|
|
581
|
-
job_state["status"] = "running"
|
|
582
|
-
await self.storage.save_job_state(job_id, job_state)
|
|
583
|
-
await self.storage.enqueue_job(job_id)
|
|
584
|
-
return json_response({"status": "approval_received", "job_id": job_id})
|
|
585
|
-
|
|
586
|
-
async def _get_quarantined_jobs_handler(self, request: web.Request) -> web.Response:
|
|
587
|
-
"""Returns a list of all job IDs in the quarantine queue."""
|
|
588
|
-
jobs = await self.storage.get_quarantined_jobs()
|
|
589
|
-
return json_response(jobs)
|
|
590
|
-
|
|
591
|
-
async def _reload_worker_configs_handler(self, request: web.Request) -> web.Response:
|
|
592
|
-
"""Handles the dynamic reloading of worker configurations."""
|
|
593
|
-
logger.info("Received request to reload worker configurations.")
|
|
594
|
-
if not self.config.WORKERS_CONFIG_PATH:
|
|
595
|
-
return json_response(
|
|
596
|
-
{"error": "WORKERS_CONFIG_PATH is not set, cannot reload configs."},
|
|
597
|
-
status=400,
|
|
598
|
-
)
|
|
599
|
-
|
|
600
|
-
await load_worker_configs_to_redis(self.storage, self.config.WORKERS_CONFIG_PATH)
|
|
601
|
-
return json_response({"status": "worker_configs_reloaded"})
|
|
602
|
-
|
|
603
|
-
async def _flush_db_handler(self, request: web.Request) -> web.Response:
|
|
604
|
-
logger.warning("Received request to flush the database.")
|
|
605
|
-
await self.storage.flush_all()
|
|
606
|
-
await load_client_configs_to_redis(self.storage)
|
|
607
|
-
return json_response({"status": "db_flushed"}, status=200)
|
|
608
|
-
|
|
609
|
-
async def _docs_handler(self, request: web.Request) -> web.Response:
|
|
610
|
-
from importlib import resources
|
|
611
|
-
|
|
612
|
-
try:
|
|
613
|
-
content = resources.read_text("avtomatika", "api.html")
|
|
614
|
-
except FileNotFoundError:
|
|
615
|
-
logger.error("api.html not found within the avtomatika package.")
|
|
616
|
-
return json_response({"error": "Documentation file not found on server."}, status=500)
|
|
617
|
-
|
|
618
|
-
# Generate dynamic documentation for registered blueprints
|
|
619
|
-
blueprint_endpoints = []
|
|
620
|
-
for bp in self.blueprints.values():
|
|
621
|
-
if not bp.api_endpoint:
|
|
622
|
-
continue
|
|
623
|
-
|
|
624
|
-
version_prefix = f"/{bp.api_version}" if bp.api_version else ""
|
|
625
|
-
endpoint_path = bp.api_endpoint if bp.api_endpoint.startswith("/") else f"/{bp.api_endpoint}"
|
|
626
|
-
full_path = f"/api{version_prefix}{endpoint_path}"
|
|
627
|
-
|
|
628
|
-
blueprint_endpoints.append(
|
|
629
|
-
{
|
|
630
|
-
"id": f"post-create-{bp.name.replace('_', '-')}",
|
|
631
|
-
"name": f"Create {bp.name.replace('_', ' ').title()} Job",
|
|
632
|
-
"method": "POST",
|
|
633
|
-
"path": full_path,
|
|
634
|
-
"description": f"Creates and starts a new instance (Job) of the `{bp.name}` blueprint.",
|
|
635
|
-
"request": {"body": {"initial_data": {}}},
|
|
636
|
-
"responses": [
|
|
637
|
-
{
|
|
638
|
-
"code": "202 Accepted",
|
|
639
|
-
"description": "Job successfully accepted for processing.",
|
|
640
|
-
"body": {"status": "accepted", "job_id": "..."},
|
|
641
|
-
}
|
|
642
|
-
],
|
|
643
|
-
}
|
|
644
|
-
)
|
|
645
|
-
|
|
646
|
-
# Inject dynamic endpoints into the apiData structure in the HTML
|
|
647
|
-
if blueprint_endpoints:
|
|
648
|
-
endpoints_json = dumps(blueprint_endpoints, option=OPT_INDENT_2).decode("utf-8")
|
|
649
|
-
# We insert the new endpoints at the beginning of the 'Protected API' group
|
|
650
|
-
marker = "group: 'Protected API',\n endpoints: ["
|
|
651
|
-
content = content.replace(marker, f"{marker}\n{endpoints_json.strip('[]')},")
|
|
652
|
-
|
|
653
|
-
return web.Response(text=content, content_type="text/html")
|
|
654
|
-
|
|
655
|
-
def _setup_routes(self):
    """Create the public, client and worker sub-applications and mount them on ``self.app``.

    Three groups of routes are wired up, in this order:

    * ``/_public/`` - an application created without middlewares that
      serves status, metrics, docs, the approval webhook, the debug
      flush endpoint and the quarantined-jobs listing.
    * ``/api/`` and ``/api/<version>`` - applications that share the
      middlewares built by ``client_auth_middleware_factory`` and
      ``quota_middleware_factory``. Every blueprint that declares an
      ``api_endpoint`` gets a POST route on the application matching its
      ``api_version`` (or on the unversioned one when it has none).
    * ``/_worker/`` - an application guarded by the middleware built by
      ``worker_auth_middleware_factory`` and, when
      ``config.RATE_LIMITING_ENABLED`` is set, by a rate limiter.
    """
    # ---- Public routes ----
    public = web.Application()
    public_router = public.router
    public_router.add_get("/status", status_handler)
    public_router.add_get("/metrics", metrics_handler)
    public_router.add_post("/webhooks/approval/{job_id}", self._human_approval_webhook_handler)
    public_router.add_post("/debug/flush_db", self._flush_db_handler)
    public_router.add_get("/docs", self._docs_handler)
    public_router.add_get("/jobs/quarantined", self._get_quarantined_jobs_handler)
    self.app.add_subapp("/_public/", public)

    # ---- Client API routes ----
    # The same middleware list object is handed to every client-facing application.
    client_middlewares = [
        client_auth_middleware_factory(self.storage),
        quota_middleware_factory(self.storage),
    ]
    unversioned_app = web.Application(middlewares=client_middlewares)
    apps_by_version: dict[str, web.Application] = {}
    unversioned_in_use = False

    for blueprint in self.blueprints.values():
        raw_endpoint = blueprint.api_endpoint
        if not raw_endpoint:
            # Blueprints without an API endpoint are not exposed over HTTP here.
            continue
        route_path = raw_endpoint if raw_endpoint.startswith("/") else f"/{raw_endpoint}"
        version = blueprint.api_version

        if not version:
            unversioned_app.router.add_post(route_path, self._create_job_handler(blueprint))
            unversioned_in_use = True
            continue

        # Versioned applications are created on first use only.
        if version not in apps_by_version:
            apps_by_version[version] = web.Application(middlewares=client_middlewares)
        apps_by_version[version].router.add_post(route_path, self._create_job_handler(blueprint))

    # Common routes go onto every client application before any of them is mounted.
    client_apps = [*apps_by_version.values()]
    if unversioned_in_use:
        client_apps.append(unversioned_app)
    for client_app in client_apps:
        self._register_common_routes(client_app)

    if unversioned_in_use:
        self.app.add_subapp("/api/", unversioned_app)
    for version, versioned_app in apps_by_version.items():
        self.app.add_subapp(f"/api/{version}", versioned_app)

    # ---- Worker routes ----
    worker_middlewares = [worker_auth_middleware_factory(self.storage, self.config)]
    if self.config.RATE_LIMITING_ENABLED:
        worker_middlewares.append(rate_limit_middleware_factory(storage=self.storage, limit=5, period=60))

    worker = web.Application(middlewares=worker_middlewares)
    worker_router = worker.router
    worker_router.add_post("/workers/register", self._register_worker_handler)
    worker_router.add_get("/workers/{worker_id}/tasks/next", self._handle_get_next_task)
    worker_router.add_patch("/workers/{worker_id}", self._worker_update_handler)
    worker_router.add_post("/tasks/result", self._task_result_handler)
    worker_router.add_get("/ws/{worker_id}", self._websocket_handler)
    self.app.add_subapp("/_worker/", worker)
|
|
709
|
-
|
|
710
|
-
def _register_common_routes(self, app):
    """Attach the job, blueprint, worker and admin routes shared by every client API application.

    Args:
        app: The application whose router receives the routes.

    The ``/jobs/{job_id}/history`` route is only added when
    ``self.history_storage`` is not a ``NoOpHistoryStorage`` instance.
    """
    router = app.router
    add_get = router.add_get
    add_post = router.add_post

    # Ordered table of (registrar, path, handler); registration order is preserved.
    route_table = [
        (add_get, "/jobs/{job_id}", self._get_job_status_handler),
        (add_post, "/jobs/{job_id}/cancel", self._cancel_job_handler),
    ]
    history_disabled = isinstance(self.history_storage, NoOpHistoryStorage)
    if not history_disabled:
        route_table.append((add_get, "/jobs/{job_id}/history", self._get_job_history_handler))
    route_table.extend(
        [
            (add_get, "/blueprints/{blueprint_name}/graph", self._get_blueprint_graph_handler),
            (add_get, "/workers", self._get_workers_handler),
            (add_get, "/jobs", self._get_jobs_handler),
            (add_get, "/dashboard", self._get_dashboard_handler),
            (add_post, "/admin/reload-workers", self._reload_worker_configs_handler),
        ]
    )

    for register, path, handler in route_table:
        register(path, handler)
|
|
720
|
-
|
|
721
|
-
async def _websocket_handler(self, request: web.Request) -> web.WebSocketResponse:
    """Serve a worker's WebSocket connection for as long as it stays open.

    The socket is registered with ``self.ws_manager`` under the
    ``worker_id`` taken from the URL, each text frame is decoded as JSON
    and passed to ``ws_manager.handle_message``, and the worker is
    unregistered when the loop ends for any reason.

    Args:
        request: The incoming upgrade request; ``worker_id`` is read
            from its match info.

    Returns:
        The WebSocket response object that was prepared for the request.

    Raises:
        web.HTTPBadRequest: If no ``worker_id`` is present in the path.
    """
    worker_id = request.match_info.get("worker_id")
    if not worker_id:
        raise web.HTTPBadRequest(text="worker_id is required")

    socket = web.WebSocketResponse()
    await socket.prepare(request)
    await self.ws_manager.register(worker_id, socket)

    try:
        async for frame in socket:
            frame_type = frame.type

            if frame_type == WSMsgType.ERROR:
                logger.error(f"WebSocket connection for {worker_id} closed with exception {socket.exception()}")
                break

            if frame_type == WSMsgType.TEXT:
                # A failure on one message is logged and must not end the connection.
                try:
                    payload = frame.json()
                    await self.ws_manager.handle_message(worker_id, payload)
                except Exception as e:
                    logger.error(f"Error processing WebSocket message from {worker_id}: {e}")
    finally:
        # Always drop the registration, including on errors or cancellation.
        await self.ws_manager.unregister(worker_id)

    return socket
|
|
744
|
-
|
|
745
|
-
async def _handle_get_next_task(self, request: web.Request) -> web.Response:
    """Hand the next queued task to the polling worker, if there is one.

    Args:
        request: The poll request; ``worker_id`` is read from its match info.

    Returns:
        A 400 JSON error when ``worker_id`` is missing, a 200 JSON
        response carrying the task when the storage returns one, and an
        empty 204 response otherwise.
    """
    worker_id = request.match_info.get("worker_id")
    if not worker_id:
        return json_response({"error": "worker_id is required in path"}, status=400)

    logger.debug(f"Worker {worker_id} is requesting a new task.")
    poll_timeout = self.config.WORKER_POLL_TIMEOUT_SECONDS
    next_task = await self.storage.dequeue_task_for_worker(worker_id, poll_timeout)

    if not next_task:
        logger.debug(f"No tasks for worker {worker_id}, responding 204.")
        return web.Response(status=204)

    logger.info(f"Sending task {next_task.get('task_id')} to worker {worker_id}")
    return json_response(next_task, status=200)
|
|
758
|
-
|
|
759
|
-
async def _worker_update_handler(self, request: web.Request) -> web.Response:
|
|
760
|
-
"""
|
|
761
|
-
Handles both full updates and lightweight heartbeats for a worker.
|
|
762
|
-
|
|
763
|
-
If the request has a JSON body, it updates the worker's data.
|
|
764
|
-
In either case, it refreshes the worker's TTL, serving as a heartbeat.
|
|
765
|
-
"""
|
|
766
|
-
worker_id = request.match_info.get("worker_id")
|
|
767
|
-
if not worker_id:
|
|
768
|
-
return json_response({"error": "worker_id is required in path"}, status=400)
|
|
769
|
-
|
|
770
|
-
ttl = self.config.WORKER_HEALTH_CHECK_INTERVAL_SECONDS * 2
|
|
771
|
-
update_data = None
|
|
772
|
-
|
|
773
|
-
# Check for body content without consuming it if it's not JSON
|
|
774
|
-
if request.can_read_body:
|
|
775
|
-
try:
|
|
776
|
-
update_data = await request.json(loads=loads)
|
|
777
|
-
except Exception:
|
|
778
|
-
logger.warning(
|
|
779
|
-
f"Received PATCH from worker {worker_id} with non-JSON body. Treating as TTL-only heartbeat."
|
|
780
|
-
)
|
|
340
|
+
async def send_job_webhook(self, job_state: dict[str, Any], event: str) -> None:
|
|
341
|
+
"""Sends a webhook notification for a job event."""
|
|
342
|
+
webhook_url = job_state.get("webhook_url")
|
|
343
|
+
if not webhook_url:
|
|
344
|
+
return
|
|
781
345
|
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
|
|
788
|
-
await self.history_storage.log_worker_event(
|
|
789
|
-
{
|
|
790
|
-
"worker_id": worker_id,
|
|
791
|
-
"event_type": "status_update",
|
|
792
|
-
"worker_info_snapshot": updated_worker,
|
|
793
|
-
},
|
|
794
|
-
)
|
|
795
|
-
return json_response(updated_worker, status=200)
|
|
796
|
-
else:
|
|
797
|
-
# Lightweight TTL-only heartbeat path
|
|
798
|
-
refreshed = await self.storage.refresh_worker_ttl(worker_id, ttl)
|
|
799
|
-
if not refreshed:
|
|
800
|
-
return json_response({"error": "Worker not found"}, status=404)
|
|
801
|
-
return json_response({"status": "ttl_refreshed"})
|
|
802
|
-
|
|
803
|
-
async def _register_worker_handler(self, request: web.Request) -> web.Response:
|
|
804
|
-
# The worker_registration_data is attached by the auth middleware
|
|
805
|
-
# to avoid reading the request body twice.
|
|
806
|
-
worker_data = request.get("worker_registration_data")
|
|
807
|
-
if not worker_data:
|
|
808
|
-
return json_response({"error": "Worker data not found in request"}, status=500)
|
|
809
|
-
|
|
810
|
-
worker_id = worker_data.get("worker_id")
|
|
811
|
-
# This check is redundant if the middleware works, but good for safety
|
|
812
|
-
if not worker_id:
|
|
813
|
-
return json_response({"error": "Missing required field: worker_id"}, status=400)
|
|
814
|
-
|
|
815
|
-
ttl = self.config.WORKER_HEALTH_CHECK_INTERVAL_SECONDS * 2
|
|
816
|
-
await self.storage.register_worker(worker_id, worker_data, ttl)
|
|
817
|
-
|
|
818
|
-
logger.info(
|
|
819
|
-
f"Worker '{worker_id}' registered with info: {worker_data}",
|
|
346
|
+
payload = WebhookPayload(
|
|
347
|
+
event=event,
|
|
348
|
+
job_id=job_state["id"],
|
|
349
|
+
status=job_state["status"],
|
|
350
|
+
result=job_state.get("state_history"), # Or specific result
|
|
351
|
+
error=job_state.get("error_message"),
|
|
820
352
|
)
|
|
821
353
|
|
|
822
|
-
|
|
823
|
-
|
|
824
|
-
"worker_id": worker_id,
|
|
825
|
-
"event_type": "registered",
|
|
826
|
-
"worker_info_snapshot": worker_data,
|
|
827
|
-
},
|
|
828
|
-
)
|
|
829
|
-
return json_response({"status": "registered"}, status=200)
|
|
354
|
+
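# Send in the background so the caller is not blocked while the webhook is delivered.
# NOTE(review): the Task returned by create_task() is discarded; asyncio keeps only weak references to tasks, so consider retaining it until it completes.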
|
|
355
|
+
create_task(self.webhook_sender.send(webhook_url, payload))
|
|
830
356
|
|
|
831
|
-
def run(self):
|
|
357
|
+
def run(self) -> None:
|
|
832
358
|
self.setup()
|
|
833
359
|
print(
|
|
834
360
|
f"Starting OrchestratorEngine API server on {self.config.API_HOST}:{self.config.API_PORT} in blocking mode."
|