avtomatika 1.0b4-py3-none-any.whl → 1.0b6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- avtomatika/__init__.py +2 -2
- avtomatika/blueprint.py +9 -11
- avtomatika/config.py +11 -0
- avtomatika/constants.py +30 -0
- avtomatika/context.py +18 -18
- avtomatika/data_types.py +6 -7
- avtomatika/datastore.py +2 -2
- avtomatika/dispatcher.py +20 -21
- avtomatika/engine.py +170 -92
- avtomatika/executor.py +168 -148
- avtomatika/history/base.py +7 -7
- avtomatika/history/noop.py +7 -7
- avtomatika/history/postgres.py +63 -22
- avtomatika/history/sqlite.py +61 -44
- avtomatika/logging_config.py +59 -8
- avtomatika/scheduler.py +119 -0
- avtomatika/scheduler_config_loader.py +41 -0
- avtomatika/security.py +3 -5
- avtomatika/storage/__init__.py +2 -2
- avtomatika/storage/base.py +48 -23
- avtomatika/storage/memory.py +76 -46
- avtomatika/storage/redis.py +141 -60
- avtomatika/worker_config_loader.py +2 -2
- avtomatika/ws_manager.py +1 -2
- {avtomatika-1.0b4.dist-info → avtomatika-1.0b6.dist-info}/METADATA +45 -5
- avtomatika-1.0b6.dist-info/RECORD +40 -0
- avtomatika-1.0b4.dist-info/RECORD +0 -37
- {avtomatika-1.0b4.dist-info → avtomatika-1.0b6.dist-info}/WHEEL +0 -0
- {avtomatika-1.0b4.dist-info → avtomatika-1.0b6.dist-info}/licenses/LICENSE +0 -0
- {avtomatika-1.0b4.dist-info → avtomatika-1.0b6.dist-info}/top_level.txt +0 -0
avtomatika/engine.py
CHANGED
@@ -1,18 +1,35 @@
 from asyncio import Task, create_task, gather, get_running_loop, wait_for
 from asyncio import TimeoutError as AsyncTimeoutError
 from logging import getLogger
-from typing import
+from typing import Any, Callable
 from uuid import uuid4
 
 from aiohttp import ClientSession, WSMsgType, web
 from aiohttp.web import AppKey
 from aioprometheus import render
+from orjson import OPT_INDENT_2, dumps, loads
 
 from . import metrics
 from .blueprint import StateMachineBlueprint
 from .client_config_loader import load_client_configs_to_redis
 from .compression import compression_middleware
 from .config import Config
+from .constants import (
+    ERROR_CODE_INVALID_INPUT,
+    ERROR_CODE_PERMANENT,
+    ERROR_CODE_TRANSIENT,
+    JOB_STATUS_CANCELLED,
+    JOB_STATUS_FAILED,
+    JOB_STATUS_PENDING,
+    JOB_STATUS_QUARANTINED,
+    JOB_STATUS_RUNNING,
+    JOB_STATUS_WAITING_FOR_HUMAN,
+    JOB_STATUS_WAITING_FOR_PARALLEL,
+    JOB_STATUS_WAITING_FOR_WORKER,
+    TASK_STATUS_CANCELLED,
+    TASK_STATUS_FAILURE,
+    TASK_STATUS_SUCCESS,
+)
 from .dispatcher import Dispatcher
 from .executor import JobExecutor
 from .health_checker import HealthChecker
@@ -22,6 +39,7 @@ from .logging_config import setup_logging
 from .quota import quota_middleware_factory
 from .ratelimit import rate_limit_middleware_factory
 from .reputation import ReputationCalculator
+from .scheduler import Scheduler
 from .security import client_auth_middleware_factory, worker_auth_middleware_factory
 from .storage.base import StorageBackend
 from .telemetry import setup_telemetry
@@ -37,20 +55,29 @@ EXECUTOR_KEY = AppKey("executor", JobExecutor)
 WATCHER_KEY = AppKey("watcher", Watcher)
 REPUTATION_CALCULATOR_KEY = AppKey("reputation_calculator", ReputationCalculator)
 HEALTH_CHECKER_KEY = AppKey("health_checker", HealthChecker)
+SCHEDULER_KEY = AppKey("scheduler", Scheduler)
+
 EXECUTOR_TASK_KEY = AppKey("executor_task", Task)
 WATCHER_TASK_KEY = AppKey("watcher_task", Task)
 REPUTATION_CALCULATOR_TASK_KEY = AppKey("reputation_calculator_task", Task)
 HEALTH_CHECKER_TASK_KEY = AppKey("health_checker_task", Task)
-
+SCHEDULER_TASK_KEY = AppKey("scheduler_task", Task)
 
 metrics.init_metrics()
 
-
 logger = getLogger(__name__)
 
 
+def json_dumps(obj: Any) -> str:
+    return dumps(obj).decode("utf-8")
+
+
+def json_response(data: Any, **kwargs: Any) -> web.Response:
+    return web.json_response(data, dumps=json_dumps, **kwargs)
+
+
 async def status_handler(_request: web.Request) -> web.Response:
-    return
+    return json_response({"status": "ok"})
 
 
 async def metrics_handler(_request: web.Request) -> web.Response:
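Note on the hunk above: the handlers now serialize with orjson instead of the standard json module. The sketch below is not part of the package; it only illustrates the pattern the new `json_dumps`/`json_response` helpers rely on: `orjson.dumps()` returns bytes, while aiohttp expects the `dumps` callable passed to `web.json_response` to return str, and `request.json()` accepts a custom `loads`.

```python
from typing import Any

from aiohttp import web
from orjson import dumps, loads


def json_dumps(obj: Any) -> str:
    # orjson serializes to bytes; aiohttp's json_response needs a str from its dumps callable.
    return dumps(obj).decode("utf-8")


def json_response(data: Any, **kwargs: Any) -> web.Response:
    return web.json_response(data, dumps=json_dumps, **kwargs)


async def echo(request: web.Request) -> web.Response:
    body = await request.json(loads=loads)  # orjson.loads accepts bytes or str
    return json_response({"echo": body})


app = web.Application()
app.router.add_post("/echo", echo)
# To serve: web.run_app(app)
```

Doing the decode once in a shared helper keeps every handler's `json_response(...)` call free of per-call encode/decode handling.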
@@ -59,11 +86,11 @@ async def metrics_handler(_request: web.Request) -> web.Response:
 
 class OrchestratorEngine:
     def __init__(self, storage: StorageBackend, config: Config):
-        setup_logging(config.LOG_LEVEL, config.LOG_FORMAT)
+        setup_logging(config.LOG_LEVEL, config.LOG_FORMAT, config.TZ)
         setup_telemetry()
         self.storage = storage
         self.config = config
-        self.blueprints:
+        self.blueprints: dict[str, StateMachineBlueprint] = {}
         self.history_storage: HistoryStorageBase = NoOpHistoryStorage()
         self.ws_manager = WebSocketManager()
         self.app = web.Application(middlewares=[compression_middleware])
@@ -108,7 +135,7 @@ class OrchestratorEngine:
                 storage_class = module.SQLiteHistoryStorage
                 parsed_uri = urlparse(uri)
                 db_path = parsed_uri.path
-                storage_args = [db_path]
+                storage_args = [db_path, self.config.TZ]
             except ImportError as e:
                 logger.error(f"Could not import SQLiteHistoryStorage, perhaps aiosqlite is not installed? Error: {e}")
                 self.history_storage = NoOpHistoryStorage()
@@ -118,7 +145,7 @@ class OrchestratorEngine:
             try:
                 module = import_module(".history.postgres", package="avtomatika")
                 storage_class = module.PostgresHistoryStorage
-                storage_args = [uri]
+                storage_args = [uri, self.config.TZ]
             except ImportError as e:
                 logger.error(f"Could not import PostgresHistoryStorage, perhaps asyncpg is not installed? Error: {e}")
                 self.history_storage = NoOpHistoryStorage()
@@ -192,11 +219,13 @@ class OrchestratorEngine:
         app[WATCHER_KEY] = Watcher(self)
         app[REPUTATION_CALCULATOR_KEY] = ReputationCalculator(self)
         app[HEALTH_CHECKER_KEY] = HealthChecker(self)
+        app[SCHEDULER_KEY] = Scheduler(self)
 
         app[EXECUTOR_TASK_KEY] = create_task(app[EXECUTOR_KEY].run())
         app[WATCHER_TASK_KEY] = create_task(app[WATCHER_KEY].run())
         app[REPUTATION_CALCULATOR_TASK_KEY] = create_task(app[REPUTATION_CALCULATOR_KEY].run())
         app[HEALTH_CHECKER_TASK_KEY] = create_task(app[HEALTH_CHECKER_KEY].run())
+        app[SCHEDULER_TASK_KEY] = create_task(app[SCHEDULER_KEY].run())
 
     async def on_shutdown(self, app: web.Application):
         logger.info("Shutdown sequence started.")
@@ -204,6 +233,7 @@ class OrchestratorEngine:
         app[WATCHER_KEY].stop()
         app[REPUTATION_CALCULATOR_KEY].stop()
         app[HEALTH_CHECKER_KEY].stop()
+        app[SCHEDULER_KEY].stop()
         logger.info("Background task running flags set to False.")
 
         if hasattr(self.history_storage, "close"):
@@ -219,6 +249,8 @@ class OrchestratorEngine:
         app[WATCHER_TASK_KEY].cancel()
         app[REPUTATION_CALCULATOR_TASK_KEY].cancel()
         app[EXECUTOR_TASK_KEY].cancel()
+        # Scheduler task manages its own loop cancellation in stop(), but just in case:
+        app[SCHEDULER_TASK_KEY].cancel()
         logger.info("Background tasks cancelled.")
 
         logger.info("Gathering background tasks with a 10s timeout...")
@@ -229,6 +261,7 @@ class OrchestratorEngine:
                 app[WATCHER_TASK_KEY],
                 app[REPUTATION_CALCULATOR_TASK_KEY],
                 app[EXECUTOR_TASK_KEY],
+                app[SCHEDULER_TASK_KEY],
                 return_exceptions=True,
             ),
             timeout=10.0,
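Note: the startup/shutdown hunks above wire the new Scheduler exactly like the existing Watcher, ReputationCalculator and HealthChecker: construct it with the engine, start `run()` via `create_task`, call `stop()` on shutdown, then cancel and gather with a timeout. `avtomatika/scheduler.py` (+119 lines) is not shown in this view, so the loop below is only a hypothetical sketch of the run()/stop() contract that wiring assumes, not the package's implementation.

```python
import asyncio
from logging import getLogger

logger = getLogger(__name__)


class PeriodicService:
    """Hypothetical background service following the run()/stop() pattern above."""

    def __init__(self, engine, interval: float = 1.0):
        self.engine = engine
        self.interval = interval
        self._running = False

    async def run(self) -> None:
        self._running = True
        while self._running:
            try:
                await self._tick()          # e.g. check for due schedules
            except asyncio.CancelledError:  # task cancelled during shutdown
                break
            except Exception:
                logger.exception("Periodic service tick failed")
            await asyncio.sleep(self.interval)

    def stop(self) -> None:
        # Mirrors the other services: flip the running flag; the task is also
        # cancelled and gathered with a timeout by on_shutdown.
        self._running = False

    async def _tick(self) -> None:
        ...
```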
@@ -242,12 +275,61 @@ class OrchestratorEngine:
         logger.info("HTTP session closed.")
         logger.info("Shutdown sequence finished.")
 
+    async def create_background_job(
+        self,
+        blueprint_name: str,
+        initial_data: dict[str, Any],
+        source: str = "internal",
+    ) -> str:
+        """Creates a job directly, bypassing the HTTP API layer.
+        Useful for internal schedulers and triggers.
+        """
+        blueprint = self.blueprints.get(blueprint_name)
+        if not blueprint:
+            raise ValueError(f"Blueprint '{blueprint_name}' not found.")
+
+        job_id = str(uuid4())
+        # Use a special internal client config
+        client_config = {
+            "token": "internal-scheduler",
+            "plan": "system",
+            "params": {"source": source},
+        }
+
+        job_state = {
+            "id": job_id,
+            "blueprint_name": blueprint.name,
+            "current_state": blueprint.start_state,
+            "initial_data": initial_data,
+            "state_history": {},
+            "status": JOB_STATUS_PENDING,
+            "tracing_context": {},
+            "client_config": client_config,
+        }
+        await self.storage.save_job_state(job_id, job_state)
+        await self.storage.enqueue_job(job_id)
+        metrics.jobs_total.inc({metrics.LABEL_BLUEPRINT: blueprint.name})
+
+        # Log the creation in history as well (so we can track scheduled jobs)
+        await self.history_storage.log_job_event(
+            {
+                "job_id": job_id,
+                "state": "pending",
+                "event_type": "job_created",
+                "context_snapshot": job_state,
+                "metadata": {"source": source, "scheduled": True},
+            }
+        )
+
+        logger.info(f"Created background job {job_id} for blueprint '{blueprint_name}' (source: {source})")
+        return job_id
+
     def _create_job_handler(self, blueprint: StateMachineBlueprint) -> Callable:
         async def handler(request: web.Request) -> web.Response:
             try:
-                initial_data = await request.json()
+                initial_data = await request.json(loads=loads)
             except Exception:
-                return
+                return json_response({"error": "Invalid JSON body"}, status=400)
 
             client_config = request["client_config"]
             carrier = {str(k): v for k, v in request.headers.items()}
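Note: `create_background_job` (added above) lets internal components enqueue jobs without going through the HTTP layer. A minimal, assumed usage sketch follows; the blueprint name, payload, and source string are placeholders, not values from the package.

```python
from avtomatika.engine import OrchestratorEngine


async def trigger_nightly_cleanup(engine: OrchestratorEngine) -> str:
    # Hypothetical caller such as the new Scheduler or another internal trigger.
    job_id = await engine.create_background_job(
        blueprint_name="nightly_cleanup",      # hypothetical blueprint
        initial_data={"retention_days": 30},   # hypothetical payload
        source="cron:nightly_cleanup",         # recorded in client_config params and history metadata
    )
    return job_id
```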
@@ -259,44 +341,44 @@ class OrchestratorEngine:
                 "current_state": blueprint.start_state,
                 "initial_data": initial_data,
                 "state_history": {},
-                "status":
+                "status": JOB_STATUS_PENDING,
                 "tracing_context": carrier,
                 "client_config": client_config,
             }
             await self.storage.save_job_state(job_id, job_state)
             await self.storage.enqueue_job(job_id)
             metrics.jobs_total.inc({metrics.LABEL_BLUEPRINT: blueprint.name})
-            return
+            return json_response({"status": "accepted", "job_id": job_id}, status=202)
 
         return handler
 
     async def _get_job_status_handler(self, request: web.Request) -> web.Response:
         job_id = request.match_info.get("job_id")
         if not job_id:
-            return
+            return json_response({"error": "job_id is required in path"}, status=400)
         job_state = await self.storage.get_job_state(job_id)
         if not job_state:
-            return
-        return
+            return json_response({"error": "Job not found"}, status=404)
+        return json_response(job_state, status=200)
 
     async def _cancel_job_handler(self, request: web.Request) -> web.Response:
         job_id = request.match_info.get("job_id")
         if not job_id:
-            return
+            return json_response({"error": "job_id is required in path"}, status=400)
 
         job_state = await self.storage.get_job_state(job_id)
         if not job_state:
-            return
+            return json_response({"error": "Job not found"}, status=404)
 
-        if job_state.get("status") !=
-            return
+        if job_state.get("status") != JOB_STATUS_WAITING_FOR_WORKER:
+            return json_response(
                 {"error": "Job is not in a state that can be cancelled (must be waiting for a worker)."},
                 status=409,
             )
 
         worker_id = job_state.get("task_worker_id")
         if not worker_id:
-            return
+            return json_response(
                 {"error": "Cannot cancel job: worker_id not found in job state."},
                 status=500,
             )
@@ -304,7 +386,7 @@ class OrchestratorEngine:
         worker_info = await self.storage.get_worker_info(worker_id)
         task_id = job_state.get("current_task_id")
         if not task_id:
-            return
+            return json_response(
                 {"error": "Cannot cancel job: task_id not found in job state."},
                 status=500,
             )
@@ -317,28 +399,28 @@ class OrchestratorEngine:
         command = {"command": "cancel_task", "task_id": task_id, "job_id": job_id}
         sent = await self.ws_manager.send_command(worker_id, command)
         if sent:
-            return
+            return json_response({"status": "cancellation_request_sent"})
         else:
             logger.warning(f"Failed to send WebSocket cancellation for task {task_id}, but Redis flag is set.")
             # Proceed to return success, as the Redis flag will handle it
 
-        return
+        return json_response({"status": "cancellation_request_accepted"})
 
     async def _get_job_history_handler(self, request: web.Request) -> web.Response:
         job_id = request.match_info.get("job_id")
         if not job_id:
-            return
+            return json_response({"error": "job_id is required in path"}, status=400)
         history = await self.history_storage.get_job_history(job_id)
-        return
+        return json_response(history)
 
     async def _get_blueprint_graph_handler(self, request: web.Request) -> web.Response:
         blueprint_name = request.match_info.get("blueprint_name")
         if not blueprint_name:
-            return
+            return json_response({"error": "blueprint_name is required in path"}, status=400)
 
         blueprint = self.blueprints.get(blueprint_name)
         if not blueprint:
-            return
+            return json_response({"error": "Blueprint not found"}, status=404)
 
         try:
             graph_dot = blueprint.render_graph()
@@ -346,21 +428,21 @@ class OrchestratorEngine:
         except FileNotFoundError:
             error_msg = "Graphviz is not installed on the server. Cannot generate graph."
             logger.error(error_msg)
-            return
+            return json_response({"error": error_msg}, status=501)
 
     async def _get_workers_handler(self, request: web.Request) -> web.Response:
         workers = await self.storage.get_available_workers()
-        return
+        return json_response(workers)
 
     async def _get_jobs_handler(self, request: web.Request) -> web.Response:
         try:
             limit = int(request.query.get("limit", "100"))
             offset = int(request.query.get("offset", "0"))
         except ValueError:
-            return
+            return json_response({"error": "Invalid limit/offset parameter"}, status=400)
 
         jobs = await self.history_storage.get_jobs(limit=limit, offset=offset)
-        return
+        return json_response(jobs)
 
     async def _get_dashboard_handler(self, request: web.Request) -> web.Response:
         worker_count = await self.storage.get_active_worker_count()
@@ -371,30 +453,30 @@ class OrchestratorEngine:
             "workers": {"total": worker_count},
             "jobs": {"queued": queue_length, **job_summary},
         }
-        return
+        return json_response(dashboard_data)
 
     async def _task_result_handler(self, request: web.Request) -> web.Response:
         import logging
 
         try:
-            data = await request.json()
+            data = await request.json(loads=loads)
             job_id = data.get("job_id")
             task_id = data.get("task_id")
             result = data.get("result", {})
-            result_status = result.get("status",
+            result_status = result.get("status", TASK_STATUS_SUCCESS)
             error_message = result.get("error")
             payload_worker_id = data.get("worker_id")
         except Exception:
-            return
+            return json_response({"error": "Invalid JSON body"}, status=400)
 
         # Security check: Ensure the worker_id from the payload matches the authenticated worker
         authenticated_worker_id = request.get("worker_id")
         if not authenticated_worker_id:
            # This should not happen if the auth middleware is working correctly
-            return
+            return json_response({"error": "Could not identify authenticated worker."}, status=500)
 
         if payload_worker_id and payload_worker_id != authenticated_worker_id:
-            return
+            return json_response(
                 {
                     "error": f"Forbidden: Authenticated worker '{authenticated_worker_id}' "
                     f"cannot submit results for another worker '{payload_worker_id}'.",
@@ -403,21 +485,21 @@ class OrchestratorEngine:
             )
 
         if not job_id or not task_id:
-            return
+            return json_response({"error": "job_id and task_id are required"}, status=400)
 
         job_state = await self.storage.get_job_state(job_id)
         if not job_state:
-            return
+            return json_response({"error": "Job not found"}, status=404)
 
         # Handle parallel task completion
-        if job_state.get("status") ==
+        if job_state.get("status") == JOB_STATUS_WAITING_FOR_PARALLEL:
             await self.storage.remove_job_from_watch(f"{job_id}:{task_id}")
             job_state.setdefault("aggregation_results", {})[task_id] = result
             job_state.setdefault("active_branches", []).remove(task_id)
 
             if not job_state["active_branches"]:
                 logger.info(f"All parallel branches for job {job_id} have completed.")
-                job_state["status"] =
+                job_state["status"] = JOB_STATUS_RUNNING
                 job_state["current_state"] = job_state["aggregation_target"]
                 await self.storage.save_job_state(job_id, job_state)
                 await self.storage.enqueue_job(job_id)
@@ -428,7 +510,7 @@ class OrchestratorEngine:
                 )
                 await self.storage.save_job_state(job_id, job_state)
 
-            return
+            return json_response({"status": "parallel_branch_result_accepted"}, status=200)
 
         await self.storage.remove_job_from_watch(job_id)
 
@@ -451,13 +533,13 @@ class OrchestratorEngine:
 
         job_state["tracing_context"] = {str(k): v for k, v in request.headers.items()}
 
-        if result_status ==
+        if result_status == TASK_STATUS_FAILURE:
             error_details = result.get("error", {})
-            error_type =
+            error_type = ERROR_CODE_TRANSIENT
             error_message = "No error details provided."
 
             if isinstance(error_details, dict):
-                error_type = error_details.get("code",
+                error_type = error_details.get("code", ERROR_CODE_TRANSIENT)
                 error_message = error_details.get("message", "No error message provided.")
             elif isinstance(error_details, str):
                 # Fallback for old format where `error` was just a string
@@ -465,32 +547,32 @@ class OrchestratorEngine:
 
             logging.warning(f"Task {task_id} for job {job_id} failed with error type '{error_type}'.")
 
-            if error_type ==
-                job_state["status"] =
+            if error_type == ERROR_CODE_PERMANENT:
+                job_state["status"] = JOB_STATUS_QUARANTINED
                 job_state["error_message"] = f"Task failed with permanent error: {error_message}"
                 await self.storage.save_job_state(job_id, job_state)
                 await self.storage.quarantine_job(job_id)
-            elif error_type ==
-                job_state["status"] =
+            elif error_type == ERROR_CODE_INVALID_INPUT:
+                job_state["status"] = JOB_STATUS_FAILED
                 job_state["error_message"] = f"Task failed due to invalid input: {error_message}"
                 await self.storage.save_job_state(job_id, job_state)
             else:  # TRANSIENT_ERROR or any other/unspecified error
                 await self._handle_task_failure(job_state, task_id, error_message)
 
-            return
+            return json_response({"status": "result_accepted_failure"}, status=200)
 
-        if result_status ==
+        if result_status == TASK_STATUS_CANCELLED:
             logging.info(f"Task {task_id} for job {job_id} was cancelled by worker.")
-            job_state["status"] =
+            job_state["status"] = JOB_STATUS_CANCELLED
             await self.storage.save_job_state(job_id, job_state)
             # Optionally, trigger a specific 'cancelled' transition if defined in the blueprint
             transitions = job_state.get("current_task_transitions", {})
             if next_state := transitions.get("cancelled"):
                 job_state["current_state"] = next_state
-                job_state["status"] =
+                job_state["status"] = JOB_STATUS_RUNNING  # It's running the cancellation handler now
                 await self.storage.save_job_state(job_id, job_state)
                 await self.storage.enqueue_job(job_id)
-            return
+            return json_response({"status": "result_accepted_cancelled"}, status=200)
 
         transitions = job_state.get("current_task_transitions", {})
         if next_state := transitions.get(result_status):
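Note: the failure-handling hunks above read `result.status` and `result.error.code`/`result.error.message` from the worker's submitted result and branch on the new error-code constants. The sketch below shows that payload shape; field names come from the handler code, while the concrete status and error-code strings live in the new `avtomatika/constants.py` (not shown in this diff), so symbolic constants and placeholder values are used.

```python
from avtomatika.constants import ERROR_CODE_TRANSIENT, TASK_STATUS_FAILURE

failure_report = {
    "job_id": "<job-id>",        # job the task belongs to
    "task_id": "<task-id>",      # task that was dispatched to the worker
    "worker_id": "<worker-id>",  # must match the authenticated worker
    "result": {
        "status": TASK_STATUS_FAILURE,
        "error": {
            # transient -> retried via _handle_task_failure, permanent -> quarantined,
            # invalid input -> job marked failed
            "code": ERROR_CODE_TRANSIENT,
            "message": "downstream service timed out",  # placeholder message
        },
    },
}
```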
@@ -503,16 +585,16 @@ class OrchestratorEngine:
             job_state["state_history"].update(worker_data)
 
             job_state["current_state"] = next_state
-            job_state["status"] =
+            job_state["status"] = JOB_STATUS_RUNNING
             await self.storage.save_job_state(job_id, job_state)
             await self.storage.enqueue_job(job_id)
         else:
             logging.error(f"Job {job_id} failed. Worker returned unhandled status '{result_status}'.")
-            job_state["status"] =
+            job_state["status"] = JOB_STATUS_FAILED
             job_state["error_message"] = f"Worker returned unhandled status: {result_status}"
             await self.storage.save_job_state(job_id, job_state)
 
-        return
+        return json_response({"status": "result_accepted_success"}, status=200)
 
     async def _handle_task_failure(self, job_state: dict, task_id: str, error_message: str | None):
         import logging
@@ -528,7 +610,7 @@ class OrchestratorEngine:
         task_info = job_state.get("current_task_info")
         if not task_info:
             logging.error(f"Cannot retry job {job_id}: missing 'current_task_info' in job state.")
-            job_state["status"] =
+            job_state["status"] = JOB_STATUS_FAILED
             job_state["error_message"] = "Cannot retry: original task info not found."
             await self.storage.save_job_state(job_id, job_state)
             return
@@ -537,7 +619,7 @@ class OrchestratorEngine:
             timeout_seconds = task_info.get("timeout_seconds", self.config.WORKER_TIMEOUT_SECONDS)
             timeout_at = now + timeout_seconds
 
-            job_state["status"] =
+            job_state["status"] = JOB_STATUS_WAITING_FOR_WORKER
             job_state["task_dispatched_at"] = now
             await self.storage.save_job_state(job_id, job_state)
             await self.storage.add_job_to_watch(job_id, timeout_at)
@@ -545,7 +627,7 @@ class OrchestratorEngine:
             await self.dispatcher.dispatch(job_state, task_info)
         else:
             logging.critical(f"Job {job_id} has failed {max_retries + 1} times. Moving to quarantine.")
-            job_state["status"] =
+            job_state["status"] = JOB_STATUS_QUARANTINED
             job_state["error_message"] = f"Task failed after {max_retries + 1} attempts: {error_message}"
             await self.storage.save_job_state(job_id, job_state)
             await self.storage.quarantine_job(job_id)
@@ -553,61 +635,60 @@ class OrchestratorEngine:
     async def _human_approval_webhook_handler(self, request: web.Request) -> web.Response:
         job_id = request.match_info.get("job_id")
         if not job_id:
-            return
+            return json_response({"error": "job_id is required in path"}, status=400)
         try:
-            data = await request.json()
+            data = await request.json(loads=loads)
             decision = data.get("decision")
             if not decision:
-                return
+                return json_response({"error": "decision is required in body"}, status=400)
         except Exception:
-            return
+            return json_response({"error": "Invalid JSON body"}, status=400)
         job_state = await self.storage.get_job_state(job_id)
         if not job_state:
-            return
-        if job_state.get("status") not in [
-            return
+            return json_response({"error": "Job not found"}, status=404)
+        if job_state.get("status") not in [JOB_STATUS_WAITING_FOR_WORKER, JOB_STATUS_WAITING_FOR_HUMAN]:
+            return json_response({"error": "Job is not in a state that can be approved"}, status=409)
         transitions = job_state.get("current_task_transitions", {})
         next_state = transitions.get(decision)
         if not next_state:
-            return
+            return json_response({"error": f"Invalid decision '{decision}' for this job"}, status=400)
         job_state["current_state"] = next_state
-        job_state["status"] =
+        job_state["status"] = JOB_STATUS_RUNNING
         await self.storage.save_job_state(job_id, job_state)
         await self.storage.enqueue_job(job_id)
-        return
+        return json_response({"status": "approval_received", "job_id": job_id})
 
     async def _get_quarantined_jobs_handler(self, request: web.Request) -> web.Response:
         """Returns a list of all job IDs in the quarantine queue."""
         jobs = await self.storage.get_quarantined_jobs()
-        return
+        return json_response(jobs)
 
     async def _reload_worker_configs_handler(self, request: web.Request) -> web.Response:
         """Handles the dynamic reloading of worker configurations."""
         logger.info("Received request to reload worker configurations.")
         if not self.config.WORKERS_CONFIG_PATH:
-            return
+            return json_response(
                 {"error": "WORKERS_CONFIG_PATH is not set, cannot reload configs."},
                 status=400,
             )
 
         await load_worker_configs_to_redis(self.storage, self.config.WORKERS_CONFIG_PATH)
-        return
+        return json_response({"status": "worker_configs_reloaded"})
 
     async def _flush_db_handler(self, request: web.Request) -> web.Response:
         logger.warning("Received request to flush the database.")
         await self.storage.flush_all()
         await load_client_configs_to_redis(self.storage)
-        return
+        return json_response({"status": "db_flushed"}, status=200)
 
     async def _docs_handler(self, request: web.Request) -> web.Response:
-        import json
         from importlib import resources
 
         try:
             content = resources.read_text("avtomatika", "api.html")
         except FileNotFoundError:
             logger.error("api.html not found within the avtomatika package.")
-            return
+            return json_response({"error": "Documentation file not found on server."}, status=500)
 
         # Generate dynamic documentation for registered blueprints
         blueprint_endpoints = []
@@ -639,7 +720,7 @@ class OrchestratorEngine:
 
         # Inject dynamic endpoints into the apiData structure in the HTML
        if blueprint_endpoints:
-            endpoints_json =
+            endpoints_json = dumps(blueprint_endpoints, option=OPT_INDENT_2).decode("utf-8")
             # We insert the new endpoints at the beginning of the 'Protected API' group
             marker = "group: 'Protected API',\n endpoints: ["
             content = content.replace(marker, f"{marker}\n{endpoints_json.strip('[]')},")
@@ -661,7 +742,7 @@ class OrchestratorEngine:
         api_middlewares = [auth_middleware, quota_middleware]
 
         protected_app = web.Application(middlewares=api_middlewares)
-        versioned_apps:
+        versioned_apps: dict[str, web.Application] = {}
         has_unversioned_routes = False
 
         for bp in self.blueprints.values():
@@ -739,14 +820,14 @@ class OrchestratorEngine:
     async def _handle_get_next_task(self, request: web.Request) -> web.Response:
         worker_id = request.match_info.get("worker_id")
         if not worker_id:
-            return
+            return json_response({"error": "worker_id is required in path"}, status=400)
 
         logger.debug(f"Worker {worker_id} is requesting a new task.")
         task = await self.storage.dequeue_task_for_worker(worker_id, self.config.WORKER_POLL_TIMEOUT_SECONDS)
 
         if task:
             logger.info(f"Sending task {task.get('task_id')} to worker {worker_id}")
-            return
+            return json_response(task, status=200)
         logger.debug(f"No tasks for worker {worker_id}, responding 204.")
         return web.Response(status=204)
 
@@ -759,7 +840,7 @@ class OrchestratorEngine:
         """
         worker_id = request.match_info.get("worker_id")
         if not worker_id:
-            return
+            return json_response({"error": "worker_id is required in path"}, status=400)
 
         ttl = self.config.WORKER_HEALTH_CHECK_INTERVAL_SECONDS * 2
         update_data = None
@@ -767,11 +848,8 @@ class OrchestratorEngine:
         # Check for body content without consuming it if it's not JSON
         if request.can_read_body:
             try:
-                update_data = await request.json()
+                update_data = await request.json(loads=loads)
             except Exception:
-                # This can happen if the body is present but not valid JSON.
-                # We can treat it as a lightweight heartbeat or return an error.
-                # For robustness, let's treat it as a lightweight ping but log a warning.
                 logger.warning(
                     f"Received PATCH from worker {worker_id} with non-JSON body. Treating as TTL-only heartbeat."
                 )
@@ -780,7 +858,7 @@ class OrchestratorEngine:
             # Full update path
             updated_worker = await self.storage.update_worker_status(worker_id, update_data, ttl)
             if not updated_worker:
-                return
+                return json_response({"error": "Worker not found"}, status=404)
 
             await self.history_storage.log_worker_event(
                 {
@@ -789,25 +867,25 @@ class OrchestratorEngine:
                     "worker_info_snapshot": updated_worker,
                 },
             )
-            return
+            return json_response(updated_worker, status=200)
         else:
             # Lightweight TTL-only heartbeat path
             refreshed = await self.storage.refresh_worker_ttl(worker_id, ttl)
             if not refreshed:
-                return
-            return
+                return json_response({"error": "Worker not found"}, status=404)
+            return json_response({"status": "ttl_refreshed"})
 
     async def _register_worker_handler(self, request: web.Request) -> web.Response:
         # The worker_registration_data is attached by the auth middleware
         # to avoid reading the request body twice.
         worker_data = request.get("worker_registration_data")
         if not worker_data:
-            return
+            return json_response({"error": "Worker data not found in request"}, status=500)
 
         worker_id = worker_data.get("worker_id")
         # This check is redundant if the middleware works, but good for safety
         if not worker_id:
-            return
+            return json_response({"error": "Missing required field: worker_id"}, status=400)
 
         ttl = self.config.WORKER_HEALTH_CHECK_INTERVAL_SECONDS * 2
         await self.storage.register_worker(worker_id, worker_data, ttl)
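Note: the heartbeat hunks above accept a PATCH with an optional JSON body: a JSON body performs a full worker status update, while an empty or non-JSON body only refreshes the worker's TTL (set to twice `WORKER_HEALTH_CHECK_INTERVAL_SECONDS`). The worker-side sketch below is an assumption: the orchestrator address, route path, and status fields are hypothetical, since the route registration is not visible in this diff.

```python
from aiohttp import ClientSession

ORCHESTRATOR = "http://localhost:8080"  # assumed address
WORKER_ID = "worker-1"                  # placeholder


async def heartbeat(full_update: bool) -> None:
    url = f"{ORCHESTRATOR}/workers/{WORKER_ID}"  # hypothetical route
    async with ClientSession() as session:
        if full_update:
            payload = {"status": "idle", "capacity": 4}  # hypothetical status fields
            async with session.patch(url, json=payload) as resp:
                print(resp.status, await resp.json())
        else:
            # No body: the engine treats this as a TTL-only refresh.
            async with session.patch(url) as resp:
                print(resp.status, await resp.json())

# To run: asyncio.run(heartbeat(full_update=False))
```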
@@ -823,7 +901,7 @@ class OrchestratorEngine:
                 "worker_info_snapshot": worker_data,
             },
         )
-        return
+        return json_response({"status": "registered"}, status=200)
 
     def run(self):
         self.setup()