avtomatika 1.0b6-py3-none-any.whl → 1.0b8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
avtomatika/engine.py CHANGED
@@ -1,68 +1,52 @@
- from asyncio import Task, create_task, gather, get_running_loop, wait_for
  from asyncio import TimeoutError as AsyncTimeoutError
+ from asyncio import create_task, gather, get_running_loop, wait_for
  from logging import getLogger
- from typing import Any, Callable
+ from typing import Any
  from uuid import uuid4

- from aiohttp import ClientSession, WSMsgType, web
- from aiohttp.web import AppKey
- from aioprometheus import render
- from orjson import OPT_INDENT_2, dumps, loads
+ from aiohttp import ClientSession, web
+ from orjson import dumps

  from . import metrics
+ from .api.routes import setup_routes
+ from .app_keys import (
+     DISPATCHER_KEY,
+     ENGINE_KEY,
+     EXECUTOR_KEY,
+     EXECUTOR_TASK_KEY,
+     HEALTH_CHECKER_KEY,
+     HEALTH_CHECKER_TASK_KEY,
+     HTTP_SESSION_KEY,
+     REPUTATION_CALCULATOR_KEY,
+     REPUTATION_CALCULATOR_TASK_KEY,
+     S3_SERVICE_KEY,
+     SCHEDULER_KEY,
+     SCHEDULER_TASK_KEY,
+     WATCHER_KEY,
+     WATCHER_TASK_KEY,
+     WS_MANAGER_KEY,
+ )
  from .blueprint import StateMachineBlueprint
  from .client_config_loader import load_client_configs_to_redis
  from .compression import compression_middleware
  from .config import Config
- from .constants import (
-     ERROR_CODE_INVALID_INPUT,
-     ERROR_CODE_PERMANENT,
-     ERROR_CODE_TRANSIENT,
-     JOB_STATUS_CANCELLED,
-     JOB_STATUS_FAILED,
-     JOB_STATUS_PENDING,
-     JOB_STATUS_QUARANTINED,
-     JOB_STATUS_RUNNING,
-     JOB_STATUS_WAITING_FOR_HUMAN,
-     JOB_STATUS_WAITING_FOR_PARALLEL,
-     JOB_STATUS_WAITING_FOR_WORKER,
-     TASK_STATUS_CANCELLED,
-     TASK_STATUS_FAILURE,
-     TASK_STATUS_SUCCESS,
- )
+ from .constants import JOB_STATUS_FAILED, JOB_STATUS_PENDING, JOB_STATUS_QUARANTINED, JOB_STATUS_WAITING_FOR_WORKER
  from .dispatcher import Dispatcher
  from .executor import JobExecutor
  from .health_checker import HealthChecker
  from .history.base import HistoryStorageBase
  from .history.noop import NoOpHistoryStorage
  from .logging_config import setup_logging
- from .quota import quota_middleware_factory
- from .ratelimit import rate_limit_middleware_factory
  from .reputation import ReputationCalculator
+ from .s3 import S3Service
  from .scheduler import Scheduler
- from .security import client_auth_middleware_factory, worker_auth_middleware_factory
  from .storage.base import StorageBackend
  from .telemetry import setup_telemetry
+ from .utils.webhook_sender import WebhookPayload, WebhookSender
  from .watcher import Watcher
  from .worker_config_loader import load_worker_configs_to_redis
  from .ws_manager import WebSocketManager

- # Application keys for storing components
- ENGINE_KEY = AppKey("engine", "OrchestratorEngine")
- HTTP_SESSION_KEY = AppKey("http_session", ClientSession)
- DISPATCHER_KEY = AppKey("dispatcher", Dispatcher)
- EXECUTOR_KEY = AppKey("executor", JobExecutor)
- WATCHER_KEY = AppKey("watcher", Watcher)
- REPUTATION_CALCULATOR_KEY = AppKey("reputation_calculator", ReputationCalculator)
- HEALTH_CHECKER_KEY = AppKey("health_checker", HealthChecker)
- SCHEDULER_KEY = AppKey("scheduler", Scheduler)
-
- EXECUTOR_TASK_KEY = AppKey("executor_task", Task)
- WATCHER_TASK_KEY = AppKey("watcher_task", Task)
- REPUTATION_CALCULATOR_TASK_KEY = AppKey("reputation_calculator_task", Task)
- HEALTH_CHECKER_TASK_KEY = AppKey("health_checker_task", Task)
- SCHEDULER_TASK_KEY = AppKey("scheduler_task", Task)
-
  metrics.init_metrics()

  logger = getLogger(__name__)
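
Note: the module-level `AppKey` constants deleted above reappear as imports from a new `avtomatika/app_keys.py`, so handler modules (such as the new `api.routes`) can share typed keys without importing `engine`. A minimal sketch of aiohttp's `AppKey` pattern as used here; `app_keys.py` itself is not shown in this diff, so a plain `str` stands in for the real component types:

```python
# Hedged sketch of the pattern now centralized in avtomatika/app_keys.py
# (that file is not part of this diff; only its exported names are).
from aiohttp import web
from aiohttp.web import AppKey

# AppKey pairs a name with a type, so app[HTTP_SESSION_KEY] type-checks.
HTTP_SESSION_KEY: AppKey[str] = AppKey("http_session", str)

async def handler(request: web.Request) -> web.Response:
    # Handlers in any module can import the key instead of the engine.
    return web.Response(text=request.app[HTTP_SESSION_KEY])

app = web.Application()
app[HTTP_SESSION_KEY] = "shared component"  # stand-in for ClientSession()
app.router.add_get("/demo", handler)
```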
@@ -76,14 +60,6 @@ def json_response(data: Any, **kwargs: Any) -> web.Response:
      return web.json_response(data, dumps=json_dumps, **kwargs)


- async def status_handler(_request: web.Request) -> web.Response:
-     return json_response({"status": "ok"})
-
-
- async def metrics_handler(_request: web.Request) -> web.Response:
-     return web.Response(body=render(), content_type="text/plain")
-
-
  class OrchestratorEngine:
      def __init__(self, storage: StorageBackend, config: Config):
          setup_logging(config.LOG_LEVEL, config.LOG_FORMAT, config.TZ)
@@ -97,7 +73,7 @@ class OrchestratorEngine:
          self.app[ENGINE_KEY] = self
          self._setup_done = False

-     def register_blueprint(self, blueprint: StateMachineBlueprint):
+     def register_blueprint(self, blueprint: StateMachineBlueprint) -> None:
          if self._setup_done:
              raise RuntimeError("Cannot register blueprints after engine setup.")
          if blueprint.name in self.blueprints:
@@ -107,15 +83,15 @@ class OrchestratorEngine:
          blueprint.validate()
          self.blueprints[blueprint.name] = blueprint

-     def setup(self):
+     def setup(self) -> None:
          if self._setup_done:
              return
-         self._setup_routes()
+         setup_routes(self.app, self)
          self.app.on_startup.append(self.on_startup)
          self.app.on_shutdown.append(self.on_shutdown)
          self._setup_done = True

-     async def _setup_history_storage(self):
+     async def _setup_history_storage(self) -> None:
          from importlib import import_module

          uri = self.config.HISTORY_DATABASE_URI
@@ -166,7 +142,12 @@ class OrchestratorEngine:
          )
          self.history_storage = NoOpHistoryStorage()

-     async def on_startup(self, app: web.Application):
+     async def on_startup(self, app: web.Application) -> None:
+         # 1. Fail Fast: Check Storage Connection
+         if not await self.storage.ping():
+             logger.critical("Failed to connect to Storage Backend (Redis). Exiting.")
+             raise RuntimeError("Storage Backend is unavailable.")
+
          try:
              from opentelemetry.instrumentation.aiohttp_client import (
                  AioHttpClientInstrumentor,
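
Note: the new fail-fast block assumes `StorageBackend.ping()` answers quickly with a boolean. For a Redis-backed storage this would plausibly delegate to the client's `PING`; a minimal sketch under that assumption (the actual backend implementation is outside this diff):

```python
# Hedged sketch: a Redis-backed ping() as the fail-fast check assumes.
# The real avtomatika StorageBackend implementation is not in this diff.
from redis.asyncio import Redis
from redis.exceptions import RedisError

class RedisStorageSketch:
    def __init__(self, url: str = "redis://localhost:6379/0") -> None:
        self._redis = Redis.from_url(url)

    async def ping(self) -> bool:
        try:
            return bool(await self._redis.ping())  # True on PONG
        except (RedisError, OSError):
            return False
```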
@@ -178,6 +159,8 @@ class OrchestratorEngine:
                  "opentelemetry-instrumentation-aiohttp-client not found. AIOHTTP client instrumentation is disabled."
              )
          await self._setup_history_storage()
+         # Start history background worker
+         await self.history_storage.start()

          # Load client configs if the path is provided
          if self.config.CLIENTS_CONFIG_PATH:
@@ -213,6 +196,8 @@ class OrchestratorEngine:
          )

          app[HTTP_SESSION_KEY] = ClientSession()
+         self.webhook_sender = WebhookSender(app[HTTP_SESSION_KEY])
+         self.webhook_sender.start()
          self.dispatcher = Dispatcher(self.storage, self.config)
          app[DISPATCHER_KEY] = self.dispatcher
          app[EXECUTOR_KEY] = JobExecutor(self, self.history_storage)
@@ -220,6 +205,8 @@ class OrchestratorEngine:
          app[REPUTATION_CALCULATOR_KEY] = ReputationCalculator(self)
          app[HEALTH_CHECKER_KEY] = HealthChecker(self)
          app[SCHEDULER_KEY] = Scheduler(self)
+         app[WS_MANAGER_KEY] = self.ws_manager
+         app[S3_SERVICE_KEY] = S3Service(self.config, self.history_storage)

          app[EXECUTOR_TASK_KEY] = create_task(app[EXECUTOR_KEY].run())
          app[WATCHER_TASK_KEY] = create_task(app[WATCHER_KEY].run())
@@ -227,7 +214,7 @@ class OrchestratorEngine:
          app[HEALTH_CHECKER_TASK_KEY] = create_task(app[HEALTH_CHECKER_KEY].run())
          app[SCHEDULER_TASK_KEY] = create_task(app[SCHEDULER_KEY].run())

-     async def on_shutdown(self, app: web.Application):
+     async def on_shutdown(self, app: web.Application) -> None:
          logger.info("Shutdown sequence started.")
          app[EXECUTOR_KEY].stop()
          app[WATCHER_KEY].stop()
@@ -244,6 +231,13 @@ class OrchestratorEngine:
          logger.info("Closing WebSocket connections...")
          await self.ws_manager.close_all()

+         logger.info("Stopping WebhookSender...")
+         await self.webhook_sender.stop()
+
+         if S3_SERVICE_KEY in app:
+             logger.info("Closing S3 Service...")
+             await app[S3_SERVICE_KEY].close()
+
          logger.info("Cancelling background tasks...")
          app[HEALTH_CHECKER_TASK_KEY].cancel()
          app[WATCHER_TASK_KEY].cancel()
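
Note: `WebhookSender` is started right after the shared `ClientSession` is created and stopped before background tasks are cancelled, which suggests a queue-draining background worker. A minimal sketch of such a sender under that assumption (the real `utils/webhook_sender.py` is not shown in this diff, and its API may differ beyond `start`/`send`/`stop`):

```python
# Hedged sketch of a start/stop/send webhook sender with a background
# drain loop; not the actual avtomatika implementation.
from asyncio import Queue, Task, create_task
from typing import Any

from aiohttp import ClientSession

class WebhookSenderSketch:
    def __init__(self, session: ClientSession) -> None:
        self._session = session
        self._queue: Queue[tuple[str, dict[str, Any]] | None] = Queue()
        self._task: Task[None] | None = None

    def start(self) -> None:
        self._task = create_task(self._run())

    async def send(self, url: str, payload: dict[str, Any]) -> None:
        # Enqueue only; delivery happens in the background worker,
        # so callers never block on the receiver's latency.
        await self._queue.put((url, payload))

    async def stop(self) -> None:
        await self._queue.put(None)  # sentinel: drain, then exit
        if self._task is not None:
            await self._task

    async def _run(self) -> None:
        while (item := await self._queue.get()) is not None:
            url, payload = item
            try:
                await self._session.post(url, json=payload)
            except Exception:
                pass  # a real sender would log and/or retry here
```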
@@ -324,295 +318,23 @@ class OrchestratorEngine:
          logger.info(f"Created background job {job_id} for blueprint '{blueprint_name}' (source: {source})")
          return job_id

-     def _create_job_handler(self, blueprint: StateMachineBlueprint) -> Callable:
-         async def handler(request: web.Request) -> web.Response:
-             try:
-                 initial_data = await request.json(loads=loads)
-             except Exception:
-                 return json_response({"error": "Invalid JSON body"}, status=400)
-
-             client_config = request["client_config"]
-             carrier = {str(k): v for k, v in request.headers.items()}
-
-             job_id = str(uuid4())
-             job_state = {
-                 "id": job_id,
-                 "blueprint_name": blueprint.name,
-                 "current_state": blueprint.start_state,
-                 "initial_data": initial_data,
-                 "state_history": {},
-                 "status": JOB_STATUS_PENDING,
-                 "tracing_context": carrier,
-                 "client_config": client_config,
-             }
-             await self.storage.save_job_state(job_id, job_state)
-             await self.storage.enqueue_job(job_id)
-             metrics.jobs_total.inc({metrics.LABEL_BLUEPRINT: blueprint.name})
-             return json_response({"status": "accepted", "job_id": job_id}, status=202)
-
-         return handler
-
-     async def _get_job_status_handler(self, request: web.Request) -> web.Response:
-         job_id = request.match_info.get("job_id")
-         if not job_id:
-             return json_response({"error": "job_id is required in path"}, status=400)
-         job_state = await self.storage.get_job_state(job_id)
-         if not job_state:
-             return json_response({"error": "Job not found"}, status=404)
-         return json_response(job_state, status=200)
-
-     async def _cancel_job_handler(self, request: web.Request) -> web.Response:
-         job_id = request.match_info.get("job_id")
-         if not job_id:
-             return json_response({"error": "job_id is required in path"}, status=400)
-
-         job_state = await self.storage.get_job_state(job_id)
-         if not job_state:
-             return json_response({"error": "Job not found"}, status=404)
-
-         if job_state.get("status") != JOB_STATUS_WAITING_FOR_WORKER:
-             return json_response(
-                 {"error": "Job is not in a state that can be cancelled (must be waiting for a worker)."},
-                 status=409,
-             )
-
-         worker_id = job_state.get("task_worker_id")
-         if not worker_id:
-             return json_response(
-                 {"error": "Cannot cancel job: worker_id not found in job state."},
-                 status=500,
-             )
-
-         worker_info = await self.storage.get_worker_info(worker_id)
-         task_id = job_state.get("current_task_id")
-         if not task_id:
-             return json_response(
-                 {"error": "Cannot cancel job: task_id not found in job state."},
-                 status=500,
-             )
-
-         # Set Redis flag as a reliable fallback/primary mechanism
-         await self.storage.set_task_cancellation_flag(task_id)
-
-         # Attempt WebSocket-based cancellation if supported
-         if worker_info and worker_info.get("capabilities", {}).get("websockets"):
-             command = {"command": "cancel_task", "task_id": task_id, "job_id": job_id}
-             sent = await self.ws_manager.send_command(worker_id, command)
-             if sent:
-                 return json_response({"status": "cancellation_request_sent"})
-             else:
-                 logger.warning(f"Failed to send WebSocket cancellation for task {task_id}, but Redis flag is set.")
-                 # Proceed to return success, as the Redis flag will handle it
-
-         return json_response({"status": "cancellation_request_accepted"})
-
-     async def _get_job_history_handler(self, request: web.Request) -> web.Response:
-         job_id = request.match_info.get("job_id")
-         if not job_id:
-             return json_response({"error": "job_id is required in path"}, status=400)
-         history = await self.history_storage.get_job_history(job_id)
-         return json_response(history)
-
-     async def _get_blueprint_graph_handler(self, request: web.Request) -> web.Response:
-         blueprint_name = request.match_info.get("blueprint_name")
-         if not blueprint_name:
-             return json_response({"error": "blueprint_name is required in path"}, status=400)
-
-         blueprint = self.blueprints.get(blueprint_name)
-         if not blueprint:
-             return json_response({"error": "Blueprint not found"}, status=404)
-
-         try:
-             graph_dot = blueprint.render_graph()
-             return web.Response(text=graph_dot, content_type="text/vnd.graphviz")
-         except FileNotFoundError:
-             error_msg = "Graphviz is not installed on the server. Cannot generate graph."
-             logger.error(error_msg)
-             return json_response({"error": error_msg}, status=501)
-
-     async def _get_workers_handler(self, request: web.Request) -> web.Response:
-         workers = await self.storage.get_available_workers()
-         return json_response(workers)
-
-     async def _get_jobs_handler(self, request: web.Request) -> web.Response:
-         try:
-             limit = int(request.query.get("limit", "100"))
-             offset = int(request.query.get("offset", "0"))
-         except ValueError:
-             return json_response({"error": "Invalid limit/offset parameter"}, status=400)
-
-         jobs = await self.history_storage.get_jobs(limit=limit, offset=offset)
-         return json_response(jobs)
-
-     async def _get_dashboard_handler(self, request: web.Request) -> web.Response:
-         worker_count = await self.storage.get_active_worker_count()
-         queue_length = await self.storage.get_job_queue_length()
-         job_summary = await self.history_storage.get_job_summary()
-
-         dashboard_data = {
-             "workers": {"total": worker_count},
-             "jobs": {"queued": queue_length, **job_summary},
-         }
-         return json_response(dashboard_data)
-
-     async def _task_result_handler(self, request: web.Request) -> web.Response:
-         import logging
-
-         try:
-             data = await request.json(loads=loads)
-             job_id = data.get("job_id")
-             task_id = data.get("task_id")
-             result = data.get("result", {})
-             result_status = result.get("status", TASK_STATUS_SUCCESS)
-             error_message = result.get("error")
-             payload_worker_id = data.get("worker_id")
-         except Exception:
-             return json_response({"error": "Invalid JSON body"}, status=400)
-
-         # Security check: Ensure the worker_id from the payload matches the authenticated worker
-         authenticated_worker_id = request.get("worker_id")
-         if not authenticated_worker_id:
-             # This should not happen if the auth middleware is working correctly
-             return json_response({"error": "Could not identify authenticated worker."}, status=500)
-
-         if payload_worker_id and payload_worker_id != authenticated_worker_id:
-             return json_response(
-                 {
-                     "error": f"Forbidden: Authenticated worker '{authenticated_worker_id}' "
-                     f"cannot submit results for another worker '{payload_worker_id}'.",
-                 },
-                 status=403,
-             )
-
-         if not job_id or not task_id:
-             return json_response({"error": "job_id and task_id are required"}, status=400)
-
-         job_state = await self.storage.get_job_state(job_id)
-         if not job_state:
-             return json_response({"error": "Job not found"}, status=404)
-
-         # Handle parallel task completion
-         if job_state.get("status") == JOB_STATUS_WAITING_FOR_PARALLEL:
-             await self.storage.remove_job_from_watch(f"{job_id}:{task_id}")
-             job_state.setdefault("aggregation_results", {})[task_id] = result
-             job_state.setdefault("active_branches", []).remove(task_id)
-
-             if not job_state["active_branches"]:
-                 logger.info(f"All parallel branches for job {job_id} have completed.")
-                 job_state["status"] = JOB_STATUS_RUNNING
-                 job_state["current_state"] = job_state["aggregation_target"]
-                 await self.storage.save_job_state(job_id, job_state)
-                 await self.storage.enqueue_job(job_id)
-             else:
-                 logger.info(
-                     f"Branch {task_id} for job {job_id} completed. "
-                     f"Waiting for {len(job_state['active_branches'])} more.",
-                 )
-                 await self.storage.save_job_state(job_id, job_state)
-
-             return json_response({"status": "parallel_branch_result_accepted"}, status=200)
-
-         await self.storage.remove_job_from_watch(job_id)
-
-         import time
-
-         now = time.monotonic()
-         dispatched_at = job_state.get("task_dispatched_at", now)
-         duration_ms = int((now - dispatched_at) * 1000)
-
-         await self.history_storage.log_job_event(
-             {
-                 "job_id": job_id,
-                 "state": job_state.get("current_state"),
-                 "event_type": "task_finished",
-                 "duration_ms": duration_ms,
-                 "worker_id": authenticated_worker_id,  # Use authenticated worker_id
-                 "context_snapshot": {**job_state, "result": result},
-             },
-         )
-
-         job_state["tracing_context"] = {str(k): v for k, v in request.headers.items()}
-
-         if result_status == TASK_STATUS_FAILURE:
-             error_details = result.get("error", {})
-             error_type = ERROR_CODE_TRANSIENT
-             error_message = "No error details provided."
-
-             if isinstance(error_details, dict):
-                 error_type = error_details.get("code", ERROR_CODE_TRANSIENT)
-                 error_message = error_details.get("message", "No error message provided.")
-             elif isinstance(error_details, str):
-                 # Fallback for old format where `error` was just a string
-                 error_message = error_details
-
-             logging.warning(f"Task {task_id} for job {job_id} failed with error type '{error_type}'.")
-
-             if error_type == ERROR_CODE_PERMANENT:
-                 job_state["status"] = JOB_STATUS_QUARANTINED
-                 job_state["error_message"] = f"Task failed with permanent error: {error_message}"
-                 await self.storage.save_job_state(job_id, job_state)
-                 await self.storage.quarantine_job(job_id)
-             elif error_type == ERROR_CODE_INVALID_INPUT:
-                 job_state["status"] = JOB_STATUS_FAILED
-                 job_state["error_message"] = f"Task failed due to invalid input: {error_message}"
-                 await self.storage.save_job_state(job_id, job_state)
-             else:  # TRANSIENT_ERROR or any other/unspecified error
-                 await self._handle_task_failure(job_state, task_id, error_message)
-
-             return json_response({"status": "result_accepted_failure"}, status=200)
-
-         if result_status == TASK_STATUS_CANCELLED:
-             logging.info(f"Task {task_id} for job {job_id} was cancelled by worker.")
-             job_state["status"] = JOB_STATUS_CANCELLED
-             await self.storage.save_job_state(job_id, job_state)
-             # Optionally, trigger a specific 'cancelled' transition if defined in the blueprint
-             transitions = job_state.get("current_task_transitions", {})
-             if next_state := transitions.get("cancelled"):
-                 job_state["current_state"] = next_state
-                 job_state["status"] = JOB_STATUS_RUNNING  # It's running the cancellation handler now
-                 await self.storage.save_job_state(job_id, job_state)
-                 await self.storage.enqueue_job(job_id)
-             return json_response({"status": "result_accepted_cancelled"}, status=200)
-
-         transitions = job_state.get("current_task_transitions", {})
-         if next_state := transitions.get(result_status):
-             logging.info(f"Job {job_id} transitioning based on worker status '{result_status}' to state '{next_state}'")
-
-             worker_data = result.get("data")
-             if worker_data and isinstance(worker_data, dict):
-                 if "state_history" not in job_state:
-                     job_state["state_history"] = {}
-                 job_state["state_history"].update(worker_data)
-
-             job_state["current_state"] = next_state
-             job_state["status"] = JOB_STATUS_RUNNING
-             await self.storage.save_job_state(job_id, job_state)
-             await self.storage.enqueue_job(job_id)
-         else:
-             logging.error(f"Job {job_id} failed. Worker returned unhandled status '{result_status}'.")
-             job_state["status"] = JOB_STATUS_FAILED
-             job_state["error_message"] = f"Worker returned unhandled status: {result_status}"
-             await self.storage.save_job_state(job_id, job_state)
-
-         return json_response({"status": "result_accepted_success"}, status=200)
-
-     async def _handle_task_failure(self, job_state: dict, task_id: str, error_message: str | None):
-         import logging
-
+     async def handle_task_failure(self, job_state: dict[str, Any], task_id: str, error_message: str | None) -> None:
+         """Handles a transient task failure by retrying or quarantining."""
          job_id = job_state["id"]
          retry_count = job_state.get("retry_count", 0)
          max_retries = self.config.JOB_MAX_RETRIES

          if retry_count < max_retries:
              job_state["retry_count"] = retry_count + 1
-             logging.info(f"Retrying task for job {job_id}. Attempt {retry_count + 1}/{max_retries}.")
+             logger.info(f"Retrying task for job {job_id}. Attempt {retry_count + 1}/{max_retries}.")

              task_info = job_state.get("current_task_info")
              if not task_info:
-                 logging.error(f"Cannot retry job {job_id}: missing 'current_task_info' in job state.")
+                 logger.error(f"Cannot retry job {job_id}: missing 'current_task_info' in job state.")
                  job_state["status"] = JOB_STATUS_FAILED
                  job_state["error_message"] = "Cannot retry: original task info not found."
                  await self.storage.save_job_state(job_id, job_state)
+                 await self.send_job_webhook(job_state, "job_failed")
                  return

              now = get_running_loop().time()
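
Note: none of the handlers deleted in this hunk appear elsewhere in `engine.py`; given the new `from .api.routes import setup_routes` import and the `setup_routes(self.app, self)` call in `setup()`, they have presumably moved into a dedicated routes module rather than being dropped. A minimal sketch of that entry point's likely shape; only the `setup_routes(app, engine)` signature is implied by this diff, everything else here is illustrative:

```python
# Hypothetical sketch of avtomatika/api/routes.py, which is not part of
# this diff. The status handler mirrors the one deleted from engine.py.
from aiohttp import web

async def status_handler(_request: web.Request) -> web.Response:
    return web.json_response({"status": "ok"})

def setup_routes(app: web.Application, engine: object) -> None:
    # Public endpoints live under /_public/, mirroring the deleted
    # _setup_routes(); engine gives handlers access to storage, config, etc.
    public_app = web.Application()
    public_app.router.add_get("/status", status_handler)
    app.add_subapp("/_public/", public_app)
```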
@@ -626,284 +348,31 @@ class OrchestratorEngine:

              await self.dispatcher.dispatch(job_state, task_info)
          else:
-             logging.critical(f"Job {job_id} has failed {max_retries + 1} times. Moving to quarantine.")
+             logger.critical(f"Job {job_id} has failed {max_retries + 1} times. Moving to quarantine.")
              job_state["status"] = JOB_STATUS_QUARANTINED
              job_state["error_message"] = f"Task failed after {max_retries + 1} attempts: {error_message}"
              await self.storage.save_job_state(job_id, job_state)
              await self.storage.quarantine_job(job_id)
+             await self.send_job_webhook(job_state, "job_quarantined")

-     async def _human_approval_webhook_handler(self, request: web.Request) -> web.Response:
-         job_id = request.match_info.get("job_id")
-         if not job_id:
-             return json_response({"error": "job_id is required in path"}, status=400)
-         try:
-             data = await request.json(loads=loads)
-             decision = data.get("decision")
-             if not decision:
-                 return json_response({"error": "decision is required in body"}, status=400)
-         except Exception:
-             return json_response({"error": "Invalid JSON body"}, status=400)
-         job_state = await self.storage.get_job_state(job_id)
-         if not job_state:
-             return json_response({"error": "Job not found"}, status=404)
-         if job_state.get("status") not in [JOB_STATUS_WAITING_FOR_WORKER, JOB_STATUS_WAITING_FOR_HUMAN]:
-             return json_response({"error": "Job is not in a state that can be approved"}, status=409)
-         transitions = job_state.get("current_task_transitions", {})
-         next_state = transitions.get(decision)
-         if not next_state:
-             return json_response({"error": f"Invalid decision '{decision}' for this job"}, status=400)
-         job_state["current_state"] = next_state
-         job_state["status"] = JOB_STATUS_RUNNING
-         await self.storage.save_job_state(job_id, job_state)
-         await self.storage.enqueue_job(job_id)
-         return json_response({"status": "approval_received", "job_id": job_id})
-
-     async def _get_quarantined_jobs_handler(self, request: web.Request) -> web.Response:
-         """Returns a list of all job IDs in the quarantine queue."""
-         jobs = await self.storage.get_quarantined_jobs()
-         return json_response(jobs)
-
-     async def _reload_worker_configs_handler(self, request: web.Request) -> web.Response:
-         """Handles the dynamic reloading of worker configurations."""
-         logger.info("Received request to reload worker configurations.")
-         if not self.config.WORKERS_CONFIG_PATH:
-             return json_response(
-                 {"error": "WORKERS_CONFIG_PATH is not set, cannot reload configs."},
-                 status=400,
-             )
-
-         await load_worker_configs_to_redis(self.storage, self.config.WORKERS_CONFIG_PATH)
-         return json_response({"status": "worker_configs_reloaded"})
-
-     async def _flush_db_handler(self, request: web.Request) -> web.Response:
-         logger.warning("Received request to flush the database.")
-         await self.storage.flush_all()
-         await load_client_configs_to_redis(self.storage)
-         return json_response({"status": "db_flushed"}, status=200)
-
-     async def _docs_handler(self, request: web.Request) -> web.Response:
-         from importlib import resources
-
-         try:
-             content = resources.read_text("avtomatika", "api.html")
-         except FileNotFoundError:
-             logger.error("api.html not found within the avtomatika package.")
-             return json_response({"error": "Documentation file not found on server."}, status=500)
-
-         # Generate dynamic documentation for registered blueprints
-         blueprint_endpoints = []
-         for bp in self.blueprints.values():
-             if not bp.api_endpoint:
-                 continue
-
-             version_prefix = f"/{bp.api_version}" if bp.api_version else ""
-             endpoint_path = bp.api_endpoint if bp.api_endpoint.startswith("/") else f"/{bp.api_endpoint}"
-             full_path = f"/api{version_prefix}{endpoint_path}"
-
-             blueprint_endpoints.append(
-                 {
-                     "id": f"post-create-{bp.name.replace('_', '-')}",
-                     "name": f"Create {bp.name.replace('_', ' ').title()} Job",
-                     "method": "POST",
-                     "path": full_path,
-                     "description": f"Creates and starts a new instance (Job) of the `{bp.name}` blueprint.",
-                     "request": {"body": {"initial_data": {}}},
-                     "responses": [
-                         {
-                             "code": "202 Accepted",
-                             "description": "Job successfully accepted for processing.",
-                             "body": {"status": "accepted", "job_id": "..."},
-                         }
-                     ],
-                 }
-             )
-
-         # Inject dynamic endpoints into the apiData structure in the HTML
-         if blueprint_endpoints:
-             endpoints_json = dumps(blueprint_endpoints, option=OPT_INDENT_2).decode("utf-8")
-             # We insert the new endpoints at the beginning of the 'Protected API' group
-             marker = "group: 'Protected API',\n endpoints: ["
-             content = content.replace(marker, f"{marker}\n{endpoints_json.strip('[]')},")
-
-         return web.Response(text=content, content_type="text/html")
-
-     def _setup_routes(self):
-         public_app = web.Application()
-         public_app.router.add_get("/status", status_handler)
-         public_app.router.add_get("/metrics", metrics_handler)
-         public_app.router.add_post("/webhooks/approval/{job_id}", self._human_approval_webhook_handler)
-         public_app.router.add_post("/debug/flush_db", self._flush_db_handler)
-         public_app.router.add_get("/docs", self._docs_handler)
-         public_app.router.add_get("/jobs/quarantined", self._get_quarantined_jobs_handler)
-         self.app.add_subapp("/_public/", public_app)
-
-         auth_middleware = client_auth_middleware_factory(self.storage)
-         quota_middleware = quota_middleware_factory(self.storage)
-         api_middlewares = [auth_middleware, quota_middleware]
-
-         protected_app = web.Application(middlewares=api_middlewares)
-         versioned_apps: dict[str, web.Application] = {}
-         has_unversioned_routes = False
-
-         for bp in self.blueprints.values():
-             if not bp.api_endpoint:
-                 continue
-             endpoint = bp.api_endpoint if bp.api_endpoint.startswith("/") else f"/{bp.api_endpoint}"
-             if bp.api_version:
-                 if bp.api_version not in versioned_apps:
-                     versioned_apps[bp.api_version] = web.Application(middlewares=api_middlewares)
-                 versioned_apps[bp.api_version].router.add_post(endpoint, self._create_job_handler(bp))
-             else:
-                 protected_app.router.add_post(endpoint, self._create_job_handler(bp))
-                 has_unversioned_routes = True
-
-         all_protected_apps = list(versioned_apps.values())
-         if has_unversioned_routes:
-             all_protected_apps.append(protected_app)
-
-         for app in all_protected_apps:
-             self._register_common_routes(app)
-         if has_unversioned_routes:
-             self.app.add_subapp("/api/", protected_app)
-         for version, app in versioned_apps.items():
-             self.app.add_subapp(f"/api/{version}", app)
-
-         worker_auth_middleware = worker_auth_middleware_factory(self.storage, self.config)
-         worker_middlewares = [worker_auth_middleware]
-         if self.config.RATE_LIMITING_ENABLED:
-             worker_rate_limiter = rate_limit_middleware_factory(storage=self.storage, limit=5, period=60)
-             worker_middlewares.append(worker_rate_limiter)
-
-         worker_app = web.Application(middlewares=worker_middlewares)
-         worker_app.router.add_post("/workers/register", self._register_worker_handler)
-         worker_app.router.add_get("/workers/{worker_id}/tasks/next", self._handle_get_next_task)
-         worker_app.router.add_patch("/workers/{worker_id}", self._worker_update_handler)
-         worker_app.router.add_post("/tasks/result", self._task_result_handler)
-         worker_app.router.add_get("/ws/{worker_id}", self._websocket_handler)
-         self.app.add_subapp("/_worker/", worker_app)
-
-     def _register_common_routes(self, app):
-         app.router.add_get("/jobs/{job_id}", self._get_job_status_handler)
-         app.router.add_post("/jobs/{job_id}/cancel", self._cancel_job_handler)
-         if not isinstance(self.history_storage, NoOpHistoryStorage):
-             app.router.add_get("/jobs/{job_id}/history", self._get_job_history_handler)
-         app.router.add_get("/blueprints/{blueprint_name}/graph", self._get_blueprint_graph_handler)
-         app.router.add_get("/workers", self._get_workers_handler)
-         app.router.add_get("/jobs", self._get_jobs_handler)
-         app.router.add_get("/dashboard", self._get_dashboard_handler)
-         app.router.add_post("/admin/reload-workers", self._reload_worker_configs_handler)
-
-     async def _websocket_handler(self, request: web.Request) -> web.WebSocketResponse:
-         worker_id = request.match_info.get("worker_id")
-         if not worker_id:
-             raise web.HTTPBadRequest(text="worker_id is required")
-
-         ws = web.WebSocketResponse()
-         await ws.prepare(request)
-
-         await self.ws_manager.register(worker_id, ws)
-         try:
-             async for msg in ws:
-                 if msg.type == WSMsgType.TEXT:
-                     try:
-                         data = msg.json()
-                         await self.ws_manager.handle_message(worker_id, data)
-                     except Exception as e:
-                         logger.error(f"Error processing WebSocket message from {worker_id}: {e}")
-                 elif msg.type == WSMsgType.ERROR:
-                     logger.error(f"WebSocket connection for {worker_id} closed with exception {ws.exception()}")
-                     break
-         finally:
-             await self.ws_manager.unregister(worker_id)
-         return ws
-
-     async def _handle_get_next_task(self, request: web.Request) -> web.Response:
-         worker_id = request.match_info.get("worker_id")
-         if not worker_id:
-             return json_response({"error": "worker_id is required in path"}, status=400)
-
-         logger.debug(f"Worker {worker_id} is requesting a new task.")
-         task = await self.storage.dequeue_task_for_worker(worker_id, self.config.WORKER_POLL_TIMEOUT_SECONDS)
-
-         if task:
-             logger.info(f"Sending task {task.get('task_id')} to worker {worker_id}")
-             return json_response(task, status=200)
-         logger.debug(f"No tasks for worker {worker_id}, responding 204.")
-         return web.Response(status=204)
-
-     async def _worker_update_handler(self, request: web.Request) -> web.Response:
-         """
-         Handles both full updates and lightweight heartbeats for a worker.
-
-         If the request has a JSON body, it updates the worker's data.
-         In either case, it refreshes the worker's TTL, serving as a heartbeat.
-         """
-         worker_id = request.match_info.get("worker_id")
-         if not worker_id:
-             return json_response({"error": "worker_id is required in path"}, status=400)
-
-         ttl = self.config.WORKER_HEALTH_CHECK_INTERVAL_SECONDS * 2
-         update_data = None
-
-         # Check for body content without consuming it if it's not JSON
-         if request.can_read_body:
-             try:
-                 update_data = await request.json(loads=loads)
-             except Exception:
-                 logger.warning(
-                     f"Received PATCH from worker {worker_id} with non-JSON body. Treating as TTL-only heartbeat."
-                 )
+     async def send_job_webhook(self, job_state: dict[str, Any], event: str) -> None:
+         """Sends a webhook notification for a job event."""
+         webhook_url = job_state.get("webhook_url")
+         if not webhook_url:
+             return

-         if update_data:
-             # Full update path
-             updated_worker = await self.storage.update_worker_status(worker_id, update_data, ttl)
-             if not updated_worker:
-                 return json_response({"error": "Worker not found"}, status=404)
-
-             await self.history_storage.log_worker_event(
-                 {
-                     "worker_id": worker_id,
-                     "event_type": "status_update",
-                     "worker_info_snapshot": updated_worker,
-                 },
-             )
-             return json_response(updated_worker, status=200)
-         else:
-             # Lightweight TTL-only heartbeat path
-             refreshed = await self.storage.refresh_worker_ttl(worker_id, ttl)
-             if not refreshed:
-                 return json_response({"error": "Worker not found"}, status=404)
-             return json_response({"status": "ttl_refreshed"})
-
-     async def _register_worker_handler(self, request: web.Request) -> web.Response:
-         # The worker_registration_data is attached by the auth middleware
-         # to avoid reading the request body twice.
-         worker_data = request.get("worker_registration_data")
-         if not worker_data:
-             return json_response({"error": "Worker data not found in request"}, status=500)
-
-         worker_id = worker_data.get("worker_id")
-         # This check is redundant if the middleware works, but good for safety
-         if not worker_id:
-             return json_response({"error": "Missing required field: worker_id"}, status=400)
-
-         ttl = self.config.WORKER_HEALTH_CHECK_INTERVAL_SECONDS * 2
-         await self.storage.register_worker(worker_id, worker_data, ttl)
-
-         logger.info(
-             f"Worker '{worker_id}' registered with info: {worker_data}",
+         payload = WebhookPayload(
+             event=event,
+             job_id=job_state["id"],
+             status=job_state["status"],
+             result=job_state.get("state_history"),  # Or specific result
+             error=job_state.get("error_message"),
          )

-         await self.history_storage.log_worker_event(
-             {
-                 "worker_id": worker_id,
-                 "event_type": "registered",
-                 "worker_info_snapshot": worker_data,
-             },
-         )
-         return json_response({"status": "registered"}, status=200)
+         # Run in background to not block the main flow
+         await self.webhook_sender.send(webhook_url, payload)

-     def run(self):
+     def run(self) -> None:
          self.setup()
          print(
              f"Starting OrchestratorEngine API server on {self.config.API_HOST}:{self.config.API_PORT} in blocking mode."