avtomatika 1.0b6-py3-none-any.whl → 1.0b7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
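
Note for orientation (not part of the published diff): most of this release is structural. The aiohttp AppKey constants move out of engine.py into a new avtomatika/app_keys.py module, the HTTP route handlers move into avtomatika/api/routes.py (wired up through setup_routes(self.app, self)), and engine.py gains outbound webhook notifications via avtomatika/utils/webhook_sender.py. The source of those new modules does not appear in this diff, so the two sketches below are reconstructions from the import lists and call sites visible here, not the package's actual code.

A plausible sketch of avtomatika/app_keys.py, mirroring the AppKey definitions removed from engine.py; the WS_MANAGER_KEY definition is an assumption, since it is new in 1.0b7 and never shown:

    # Hypothetical reconstruction; the names come from the import block added
    # to engine.py below. String forward references stand in for the component
    # classes so the sketch stays free of circular imports.
    from asyncio import Task

    from aiohttp import ClientSession
    from aiohttp.web import AppKey

    ENGINE_KEY = AppKey("engine", "OrchestratorEngine")
    HTTP_SESSION_KEY = AppKey("http_session", ClientSession)
    DISPATCHER_KEY = AppKey("dispatcher", "Dispatcher")
    WS_MANAGER_KEY = AppKey("ws_manager", "WebSocketManager")  # assumed definition
    # ... EXECUTOR_KEY, WATCHER_KEY, REPUTATION_CALCULATOR_KEY, HEALTH_CHECKER_KEY
    # and SCHEDULER_KEY follow the same pattern.

    # One key per long-running background task started in on_startup().
    EXECUTOR_TASK_KEY = AppKey("executor_task", Task)
    SCHEDULER_TASK_KEY = AppKey("scheduler_task", Task)
    # ... and likewise for the watcher, reputation calculator and health checker.

And a plausible shape for avtomatika/utils/webhook_sender.py, inferred from the WebhookPayload(...) constructor call and the webhook_sender.send(webhook_url, payload) call site added in this version; the payload field names are taken from that call site, everything else is an assumption:

    from dataclasses import dataclass
    from typing import Any

    from aiohttp import ClientSession


    @dataclass
    class WebhookPayload:
        event: str
        job_id: str
        status: str
        result: dict[str, Any] | None = None
        error: str | None = None


    class WebhookSender:
        def __init__(self, session: ClientSession) -> None:
            self._session = session

        async def send(self, url: str, payload: WebhookPayload) -> None:
            # Fire-and-forget delivery; engine.py wraps this call in
            # create_task() so job processing is never blocked on it.
            async with self._session.post(url, json=payload.__dict__) as resp:
                resp.raise_for_status()
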
avtomatika/engine.py CHANGED
@@ -1,68 +1,50 @@
- from asyncio import Task, create_task, gather, get_running_loop, wait_for
  from asyncio import TimeoutError as AsyncTimeoutError
+ from asyncio import create_task, gather, get_running_loop, wait_for
  from logging import getLogger
- from typing import Any, Callable
+ from typing import Any
  from uuid import uuid4

- from aiohttp import ClientSession, WSMsgType, web
- from aiohttp.web import AppKey
- from aioprometheus import render
- from orjson import OPT_INDENT_2, dumps, loads
+ from aiohttp import ClientSession, web
+ from orjson import dumps

  from . import metrics
+ from .api.routes import setup_routes
+ from .app_keys import (
+     DISPATCHER_KEY,
+     ENGINE_KEY,
+     EXECUTOR_KEY,
+     EXECUTOR_TASK_KEY,
+     HEALTH_CHECKER_KEY,
+     HEALTH_CHECKER_TASK_KEY,
+     HTTP_SESSION_KEY,
+     REPUTATION_CALCULATOR_KEY,
+     REPUTATION_CALCULATOR_TASK_KEY,
+     SCHEDULER_KEY,
+     SCHEDULER_TASK_KEY,
+     WATCHER_KEY,
+     WATCHER_TASK_KEY,
+     WS_MANAGER_KEY,
+ )
  from .blueprint import StateMachineBlueprint
  from .client_config_loader import load_client_configs_to_redis
  from .compression import compression_middleware
  from .config import Config
- from .constants import (
-     ERROR_CODE_INVALID_INPUT,
-     ERROR_CODE_PERMANENT,
-     ERROR_CODE_TRANSIENT,
-     JOB_STATUS_CANCELLED,
-     JOB_STATUS_FAILED,
-     JOB_STATUS_PENDING,
-     JOB_STATUS_QUARANTINED,
-     JOB_STATUS_RUNNING,
-     JOB_STATUS_WAITING_FOR_HUMAN,
-     JOB_STATUS_WAITING_FOR_PARALLEL,
-     JOB_STATUS_WAITING_FOR_WORKER,
-     TASK_STATUS_CANCELLED,
-     TASK_STATUS_FAILURE,
-     TASK_STATUS_SUCCESS,
- )
+ from .constants import JOB_STATUS_FAILED, JOB_STATUS_PENDING, JOB_STATUS_QUARANTINED, JOB_STATUS_WAITING_FOR_WORKER
  from .dispatcher import Dispatcher
  from .executor import JobExecutor
  from .health_checker import HealthChecker
  from .history.base import HistoryStorageBase
  from .history.noop import NoOpHistoryStorage
  from .logging_config import setup_logging
- from .quota import quota_middleware_factory
- from .ratelimit import rate_limit_middleware_factory
  from .reputation import ReputationCalculator
  from .scheduler import Scheduler
- from .security import client_auth_middleware_factory, worker_auth_middleware_factory
  from .storage.base import StorageBackend
  from .telemetry import setup_telemetry
+ from .utils.webhook_sender import WebhookPayload, WebhookSender
  from .watcher import Watcher
  from .worker_config_loader import load_worker_configs_to_redis
  from .ws_manager import WebSocketManager

- # Application keys for storing components
- ENGINE_KEY = AppKey("engine", "OrchestratorEngine")
- HTTP_SESSION_KEY = AppKey("http_session", ClientSession)
- DISPATCHER_KEY = AppKey("dispatcher", Dispatcher)
- EXECUTOR_KEY = AppKey("executor", JobExecutor)
- WATCHER_KEY = AppKey("watcher", Watcher)
- REPUTATION_CALCULATOR_KEY = AppKey("reputation_calculator", ReputationCalculator)
- HEALTH_CHECKER_KEY = AppKey("health_checker", HealthChecker)
- SCHEDULER_KEY = AppKey("scheduler", Scheduler)
-
- EXECUTOR_TASK_KEY = AppKey("executor_task", Task)
- WATCHER_TASK_KEY = AppKey("watcher_task", Task)
- REPUTATION_CALCULATOR_TASK_KEY = AppKey("reputation_calculator_task", Task)
- HEALTH_CHECKER_TASK_KEY = AppKey("health_checker_task", Task)
- SCHEDULER_TASK_KEY = AppKey("scheduler_task", Task)
-
  metrics.init_metrics()

  logger = getLogger(__name__)
@@ -76,14 +58,6 @@ def json_response(data: Any, **kwargs: Any) -> web.Response:
      return web.json_response(data, dumps=json_dumps, **kwargs)


- async def status_handler(_request: web.Request) -> web.Response:
-     return json_response({"status": "ok"})
-
-
- async def metrics_handler(_request: web.Request) -> web.Response:
-     return web.Response(body=render(), content_type="text/plain")
-
-
  class OrchestratorEngine:
      def __init__(self, storage: StorageBackend, config: Config):
          setup_logging(config.LOG_LEVEL, config.LOG_FORMAT, config.TZ)
@@ -97,7 +71,7 @@ class OrchestratorEngine:
          self.app[ENGINE_KEY] = self
          self._setup_done = False

-     def register_blueprint(self, blueprint: StateMachineBlueprint):
+     def register_blueprint(self, blueprint: StateMachineBlueprint) -> None:
          if self._setup_done:
              raise RuntimeError("Cannot register blueprints after engine setup.")
          if blueprint.name in self.blueprints:
@@ -107,15 +81,15 @@ class OrchestratorEngine:
          blueprint.validate()
          self.blueprints[blueprint.name] = blueprint

-     def setup(self):
+     def setup(self) -> None:
          if self._setup_done:
              return
-         self._setup_routes()
+         setup_routes(self.app, self)
          self.app.on_startup.append(self.on_startup)
          self.app.on_shutdown.append(self.on_shutdown)
          self._setup_done = True

-     async def _setup_history_storage(self):
+     async def _setup_history_storage(self) -> None:
          from importlib import import_module

          uri = self.config.HISTORY_DATABASE_URI
@@ -166,7 +140,7 @@ class OrchestratorEngine:
              )
              self.history_storage = NoOpHistoryStorage()

-     async def on_startup(self, app: web.Application):
+     async def on_startup(self, app: web.Application) -> None:
          try:
              from opentelemetry.instrumentation.aiohttp_client import (
                  AioHttpClientInstrumentor,
@@ -213,6 +187,7 @@ class OrchestratorEngine:
          )

          app[HTTP_SESSION_KEY] = ClientSession()
+         self.webhook_sender = WebhookSender(app[HTTP_SESSION_KEY])
          self.dispatcher = Dispatcher(self.storage, self.config)
          app[DISPATCHER_KEY] = self.dispatcher
          app[EXECUTOR_KEY] = JobExecutor(self, self.history_storage)
@@ -220,6 +195,7 @@ class OrchestratorEngine:
          app[REPUTATION_CALCULATOR_KEY] = ReputationCalculator(self)
          app[HEALTH_CHECKER_KEY] = HealthChecker(self)
          app[SCHEDULER_KEY] = Scheduler(self)
+         app[WS_MANAGER_KEY] = self.ws_manager

          app[EXECUTOR_TASK_KEY] = create_task(app[EXECUTOR_KEY].run())
          app[WATCHER_TASK_KEY] = create_task(app[WATCHER_KEY].run())
@@ -227,7 +203,7 @@ class OrchestratorEngine:
          app[HEALTH_CHECKER_TASK_KEY] = create_task(app[HEALTH_CHECKER_KEY].run())
          app[SCHEDULER_TASK_KEY] = create_task(app[SCHEDULER_KEY].run())

-     async def on_shutdown(self, app: web.Application):
+     async def on_shutdown(self, app: web.Application) -> None:
          logger.info("Shutdown sequence started.")
          app[EXECUTOR_KEY].stop()
          app[WATCHER_KEY].stop()
@@ -324,295 +300,23 @@ class OrchestratorEngine:
          logger.info(f"Created background job {job_id} for blueprint '{blueprint_name}' (source: {source})")
          return job_id

-     def _create_job_handler(self, blueprint: StateMachineBlueprint) -> Callable:
-         async def handler(request: web.Request) -> web.Response:
-             try:
-                 initial_data = await request.json(loads=loads)
-             except Exception:
-                 return json_response({"error": "Invalid JSON body"}, status=400)
-
-             client_config = request["client_config"]
-             carrier = {str(k): v for k, v in request.headers.items()}
-
-             job_id = str(uuid4())
-             job_state = {
-                 "id": job_id,
-                 "blueprint_name": blueprint.name,
-                 "current_state": blueprint.start_state,
-                 "initial_data": initial_data,
-                 "state_history": {},
-                 "status": JOB_STATUS_PENDING,
-                 "tracing_context": carrier,
-                 "client_config": client_config,
-             }
-             await self.storage.save_job_state(job_id, job_state)
-             await self.storage.enqueue_job(job_id)
-             metrics.jobs_total.inc({metrics.LABEL_BLUEPRINT: blueprint.name})
-             return json_response({"status": "accepted", "job_id": job_id}, status=202)
-
-         return handler
-
-     async def _get_job_status_handler(self, request: web.Request) -> web.Response:
-         job_id = request.match_info.get("job_id")
-         if not job_id:
-             return json_response({"error": "job_id is required in path"}, status=400)
-         job_state = await self.storage.get_job_state(job_id)
-         if not job_state:
-             return json_response({"error": "Job not found"}, status=404)
-         return json_response(job_state, status=200)
-
-     async def _cancel_job_handler(self, request: web.Request) -> web.Response:
-         job_id = request.match_info.get("job_id")
-         if not job_id:
-             return json_response({"error": "job_id is required in path"}, status=400)
-
-         job_state = await self.storage.get_job_state(job_id)
-         if not job_state:
-             return json_response({"error": "Job not found"}, status=404)
-
-         if job_state.get("status") != JOB_STATUS_WAITING_FOR_WORKER:
-             return json_response(
-                 {"error": "Job is not in a state that can be cancelled (must be waiting for a worker)."},
-                 status=409,
-             )
-
-         worker_id = job_state.get("task_worker_id")
-         if not worker_id:
-             return json_response(
-                 {"error": "Cannot cancel job: worker_id not found in job state."},
-                 status=500,
-             )
-
-         worker_info = await self.storage.get_worker_info(worker_id)
-         task_id = job_state.get("current_task_id")
-         if not task_id:
-             return json_response(
-                 {"error": "Cannot cancel job: task_id not found in job state."},
-                 status=500,
-             )
-
-         # Set Redis flag as a reliable fallback/primary mechanism
-         await self.storage.set_task_cancellation_flag(task_id)
-
-         # Attempt WebSocket-based cancellation if supported
-         if worker_info and worker_info.get("capabilities", {}).get("websockets"):
-             command = {"command": "cancel_task", "task_id": task_id, "job_id": job_id}
-             sent = await self.ws_manager.send_command(worker_id, command)
-             if sent:
-                 return json_response({"status": "cancellation_request_sent"})
-             else:
-                 logger.warning(f"Failed to send WebSocket cancellation for task {task_id}, but Redis flag is set.")
-                 # Proceed to return success, as the Redis flag will handle it
-
-         return json_response({"status": "cancellation_request_accepted"})
-
-     async def _get_job_history_handler(self, request: web.Request) -> web.Response:
-         job_id = request.match_info.get("job_id")
-         if not job_id:
-             return json_response({"error": "job_id is required in path"}, status=400)
-         history = await self.history_storage.get_job_history(job_id)
-         return json_response(history)
-
-     async def _get_blueprint_graph_handler(self, request: web.Request) -> web.Response:
-         blueprint_name = request.match_info.get("blueprint_name")
-         if not blueprint_name:
-             return json_response({"error": "blueprint_name is required in path"}, status=400)
-
-         blueprint = self.blueprints.get(blueprint_name)
-         if not blueprint:
-             return json_response({"error": "Blueprint not found"}, status=404)
-
-         try:
-             graph_dot = blueprint.render_graph()
-             return web.Response(text=graph_dot, content_type="text/vnd.graphviz")
-         except FileNotFoundError:
-             error_msg = "Graphviz is not installed on the server. Cannot generate graph."
-             logger.error(error_msg)
-             return json_response({"error": error_msg}, status=501)
-
-     async def _get_workers_handler(self, request: web.Request) -> web.Response:
-         workers = await self.storage.get_available_workers()
-         return json_response(workers)
-
-     async def _get_jobs_handler(self, request: web.Request) -> web.Response:
-         try:
-             limit = int(request.query.get("limit", "100"))
-             offset = int(request.query.get("offset", "0"))
-         except ValueError:
-             return json_response({"error": "Invalid limit/offset parameter"}, status=400)
-
-         jobs = await self.history_storage.get_jobs(limit=limit, offset=offset)
-         return json_response(jobs)
-
-     async def _get_dashboard_handler(self, request: web.Request) -> web.Response:
-         worker_count = await self.storage.get_active_worker_count()
-         queue_length = await self.storage.get_job_queue_length()
-         job_summary = await self.history_storage.get_job_summary()
-
-         dashboard_data = {
-             "workers": {"total": worker_count},
-             "jobs": {"queued": queue_length, **job_summary},
-         }
-         return json_response(dashboard_data)
-
-     async def _task_result_handler(self, request: web.Request) -> web.Response:
-         import logging
-
-         try:
-             data = await request.json(loads=loads)
-             job_id = data.get("job_id")
-             task_id = data.get("task_id")
-             result = data.get("result", {})
-             result_status = result.get("status", TASK_STATUS_SUCCESS)
-             error_message = result.get("error")
-             payload_worker_id = data.get("worker_id")
-         except Exception:
-             return json_response({"error": "Invalid JSON body"}, status=400)
-
-         # Security check: Ensure the worker_id from the payload matches the authenticated worker
-         authenticated_worker_id = request.get("worker_id")
-         if not authenticated_worker_id:
-             # This should not happen if the auth middleware is working correctly
-             return json_response({"error": "Could not identify authenticated worker."}, status=500)
-
-         if payload_worker_id and payload_worker_id != authenticated_worker_id:
-             return json_response(
-                 {
-                     "error": f"Forbidden: Authenticated worker '{authenticated_worker_id}' "
-                     f"cannot submit results for another worker '{payload_worker_id}'.",
-                 },
-                 status=403,
-             )
-
-         if not job_id or not task_id:
-             return json_response({"error": "job_id and task_id are required"}, status=400)
-
-         job_state = await self.storage.get_job_state(job_id)
-         if not job_state:
-             return json_response({"error": "Job not found"}, status=404)
-
-         # Handle parallel task completion
-         if job_state.get("status") == JOB_STATUS_WAITING_FOR_PARALLEL:
-             await self.storage.remove_job_from_watch(f"{job_id}:{task_id}")
-             job_state.setdefault("aggregation_results", {})[task_id] = result
-             job_state.setdefault("active_branches", []).remove(task_id)
-
-             if not job_state["active_branches"]:
-                 logger.info(f"All parallel branches for job {job_id} have completed.")
-                 job_state["status"] = JOB_STATUS_RUNNING
-                 job_state["current_state"] = job_state["aggregation_target"]
-                 await self.storage.save_job_state(job_id, job_state)
-                 await self.storage.enqueue_job(job_id)
-             else:
-                 logger.info(
-                     f"Branch {task_id} for job {job_id} completed. "
-                     f"Waiting for {len(job_state['active_branches'])} more.",
-                 )
-                 await self.storage.save_job_state(job_id, job_state)
-
-             return json_response({"status": "parallel_branch_result_accepted"}, status=200)
-
-         await self.storage.remove_job_from_watch(job_id)
-
-         import time
-
-         now = time.monotonic()
-         dispatched_at = job_state.get("task_dispatched_at", now)
-         duration_ms = int((now - dispatched_at) * 1000)
-
-         await self.history_storage.log_job_event(
-             {
-                 "job_id": job_id,
-                 "state": job_state.get("current_state"),
-                 "event_type": "task_finished",
-                 "duration_ms": duration_ms,
-                 "worker_id": authenticated_worker_id,  # Use authenticated worker_id
-                 "context_snapshot": {**job_state, "result": result},
-             },
-         )
-
-         job_state["tracing_context"] = {str(k): v for k, v in request.headers.items()}
-
-         if result_status == TASK_STATUS_FAILURE:
-             error_details = result.get("error", {})
-             error_type = ERROR_CODE_TRANSIENT
-             error_message = "No error details provided."
-
-             if isinstance(error_details, dict):
-                 error_type = error_details.get("code", ERROR_CODE_TRANSIENT)
-                 error_message = error_details.get("message", "No error message provided.")
-             elif isinstance(error_details, str):
-                 # Fallback for old format where `error` was just a string
-                 error_message = error_details
-
-             logging.warning(f"Task {task_id} for job {job_id} failed with error type '{error_type}'.")
-
-             if error_type == ERROR_CODE_PERMANENT:
-                 job_state["status"] = JOB_STATUS_QUARANTINED
-                 job_state["error_message"] = f"Task failed with permanent error: {error_message}"
-                 await self.storage.save_job_state(job_id, job_state)
-                 await self.storage.quarantine_job(job_id)
-             elif error_type == ERROR_CODE_INVALID_INPUT:
-                 job_state["status"] = JOB_STATUS_FAILED
-                 job_state["error_message"] = f"Task failed due to invalid input: {error_message}"
-                 await self.storage.save_job_state(job_id, job_state)
-             else:  # TRANSIENT_ERROR or any other/unspecified error
-                 await self._handle_task_failure(job_state, task_id, error_message)
-
-             return json_response({"status": "result_accepted_failure"}, status=200)
-
-         if result_status == TASK_STATUS_CANCELLED:
-             logging.info(f"Task {task_id} for job {job_id} was cancelled by worker.")
-             job_state["status"] = JOB_STATUS_CANCELLED
-             await self.storage.save_job_state(job_id, job_state)
-             # Optionally, trigger a specific 'cancelled' transition if defined in the blueprint
-             transitions = job_state.get("current_task_transitions", {})
-             if next_state := transitions.get("cancelled"):
-                 job_state["current_state"] = next_state
-                 job_state["status"] = JOB_STATUS_RUNNING  # It's running the cancellation handler now
-                 await self.storage.save_job_state(job_id, job_state)
-                 await self.storage.enqueue_job(job_id)
-             return json_response({"status": "result_accepted_cancelled"}, status=200)
-
-         transitions = job_state.get("current_task_transitions", {})
-         if next_state := transitions.get(result_status):
-             logging.info(f"Job {job_id} transitioning based on worker status '{result_status}' to state '{next_state}'")
-
-             worker_data = result.get("data")
-             if worker_data and isinstance(worker_data, dict):
-                 if "state_history" not in job_state:
-                     job_state["state_history"] = {}
-                 job_state["state_history"].update(worker_data)
-
-             job_state["current_state"] = next_state
-             job_state["status"] = JOB_STATUS_RUNNING
-             await self.storage.save_job_state(job_id, job_state)
-             await self.storage.enqueue_job(job_id)
-         else:
-             logging.error(f"Job {job_id} failed. Worker returned unhandled status '{result_status}'.")
-             job_state["status"] = JOB_STATUS_FAILED
-             job_state["error_message"] = f"Worker returned unhandled status: {result_status}"
-             await self.storage.save_job_state(job_id, job_state)
-
-         return json_response({"status": "result_accepted_success"}, status=200)
-
-     async def _handle_task_failure(self, job_state: dict, task_id: str, error_message: str | None):
-         import logging
-
+     async def handle_task_failure(self, job_state: dict[str, Any], task_id: str, error_message: str | None) -> None:
+         """Handles a transient task failure by retrying or quarantining."""
          job_id = job_state["id"]
          retry_count = job_state.get("retry_count", 0)
          max_retries = self.config.JOB_MAX_RETRIES

          if retry_count < max_retries:
              job_state["retry_count"] = retry_count + 1
-             logging.info(f"Retrying task for job {job_id}. Attempt {retry_count + 1}/{max_retries}.")
+             logger.info(f"Retrying task for job {job_id}. Attempt {retry_count + 1}/{max_retries}.")

              task_info = job_state.get("current_task_info")
              if not task_info:
-                 logging.error(f"Cannot retry job {job_id}: missing 'current_task_info' in job state.")
+                 logger.error(f"Cannot retry job {job_id}: missing 'current_task_info' in job state.")
                  job_state["status"] = JOB_STATUS_FAILED
                  job_state["error_message"] = "Cannot retry: original task info not found."
                  await self.storage.save_job_state(job_id, job_state)
+                 await self.send_job_webhook(job_state, "job_failed")
                  return

              now = get_running_loop().time()
@@ -626,284 +330,31 @@ class OrchestratorEngine:

              await self.dispatcher.dispatch(job_state, task_info)
          else:
-             logging.critical(f"Job {job_id} has failed {max_retries + 1} times. Moving to quarantine.")
+             logger.critical(f"Job {job_id} has failed {max_retries + 1} times. Moving to quarantine.")
              job_state["status"] = JOB_STATUS_QUARANTINED
              job_state["error_message"] = f"Task failed after {max_retries + 1} attempts: {error_message}"
              await self.storage.save_job_state(job_id, job_state)
              await self.storage.quarantine_job(job_id)
+             await self.send_job_webhook(job_state, "job_quarantined")

-     async def _human_approval_webhook_handler(self, request: web.Request) -> web.Response:
-         job_id = request.match_info.get("job_id")
-         if not job_id:
-             return json_response({"error": "job_id is required in path"}, status=400)
-         try:
-             data = await request.json(loads=loads)
-             decision = data.get("decision")
-             if not decision:
-                 return json_response({"error": "decision is required in body"}, status=400)
-         except Exception:
-             return json_response({"error": "Invalid JSON body"}, status=400)
-         job_state = await self.storage.get_job_state(job_id)
-         if not job_state:
-             return json_response({"error": "Job not found"}, status=404)
-         if job_state.get("status") not in [JOB_STATUS_WAITING_FOR_WORKER, JOB_STATUS_WAITING_FOR_HUMAN]:
-             return json_response({"error": "Job is not in a state that can be approved"}, status=409)
-         transitions = job_state.get("current_task_transitions", {})
-         next_state = transitions.get(decision)
-         if not next_state:
-             return json_response({"error": f"Invalid decision '{decision}' for this job"}, status=400)
-         job_state["current_state"] = next_state
-         job_state["status"] = JOB_STATUS_RUNNING
-         await self.storage.save_job_state(job_id, job_state)
-         await self.storage.enqueue_job(job_id)
-         return json_response({"status": "approval_received", "job_id": job_id})
-
-     async def _get_quarantined_jobs_handler(self, request: web.Request) -> web.Response:
-         """Returns a list of all job IDs in the quarantine queue."""
-         jobs = await self.storage.get_quarantined_jobs()
-         return json_response(jobs)
-
-     async def _reload_worker_configs_handler(self, request: web.Request) -> web.Response:
-         """Handles the dynamic reloading of worker configurations."""
-         logger.info("Received request to reload worker configurations.")
-         if not self.config.WORKERS_CONFIG_PATH:
-             return json_response(
-                 {"error": "WORKERS_CONFIG_PATH is not set, cannot reload configs."},
-                 status=400,
-             )
-
-         await load_worker_configs_to_redis(self.storage, self.config.WORKERS_CONFIG_PATH)
-         return json_response({"status": "worker_configs_reloaded"})
-
-     async def _flush_db_handler(self, request: web.Request) -> web.Response:
-         logger.warning("Received request to flush the database.")
-         await self.storage.flush_all()
-         await load_client_configs_to_redis(self.storage)
-         return json_response({"status": "db_flushed"}, status=200)
-
-     async def _docs_handler(self, request: web.Request) -> web.Response:
-         from importlib import resources
-
-         try:
-             content = resources.read_text("avtomatika", "api.html")
-         except FileNotFoundError:
-             logger.error("api.html not found within the avtomatika package.")
-             return json_response({"error": "Documentation file not found on server."}, status=500)
-
-         # Generate dynamic documentation for registered blueprints
-         blueprint_endpoints = []
-         for bp in self.blueprints.values():
-             if not bp.api_endpoint:
-                 continue
-
-             version_prefix = f"/{bp.api_version}" if bp.api_version else ""
-             endpoint_path = bp.api_endpoint if bp.api_endpoint.startswith("/") else f"/{bp.api_endpoint}"
-             full_path = f"/api{version_prefix}{endpoint_path}"
-
-             blueprint_endpoints.append(
-                 {
-                     "id": f"post-create-{bp.name.replace('_', '-')}",
-                     "name": f"Create {bp.name.replace('_', ' ').title()} Job",
-                     "method": "POST",
-                     "path": full_path,
-                     "description": f"Creates and starts a new instance (Job) of the `{bp.name}` blueprint.",
-                     "request": {"body": {"initial_data": {}}},
-                     "responses": [
-                         {
-                             "code": "202 Accepted",
-                             "description": "Job successfully accepted for processing.",
-                             "body": {"status": "accepted", "job_id": "..."},
-                         }
-                     ],
-                 }
-             )
-
-         # Inject dynamic endpoints into the apiData structure in the HTML
-         if blueprint_endpoints:
-             endpoints_json = dumps(blueprint_endpoints, option=OPT_INDENT_2).decode("utf-8")
-             # We insert the new endpoints at the beginning of the 'Protected API' group
-             marker = "group: 'Protected API',\n endpoints: ["
-             content = content.replace(marker, f"{marker}\n{endpoints_json.strip('[]')},")
-
-         return web.Response(text=content, content_type="text/html")
-
-     def _setup_routes(self):
-         public_app = web.Application()
-         public_app.router.add_get("/status", status_handler)
-         public_app.router.add_get("/metrics", metrics_handler)
-         public_app.router.add_post("/webhooks/approval/{job_id}", self._human_approval_webhook_handler)
-         public_app.router.add_post("/debug/flush_db", self._flush_db_handler)
-         public_app.router.add_get("/docs", self._docs_handler)
-         public_app.router.add_get("/jobs/quarantined", self._get_quarantined_jobs_handler)
-         self.app.add_subapp("/_public/", public_app)
-
-         auth_middleware = client_auth_middleware_factory(self.storage)
-         quota_middleware = quota_middleware_factory(self.storage)
-         api_middlewares = [auth_middleware, quota_middleware]
-
-         protected_app = web.Application(middlewares=api_middlewares)
-         versioned_apps: dict[str, web.Application] = {}
-         has_unversioned_routes = False
-
-         for bp in self.blueprints.values():
-             if not bp.api_endpoint:
-                 continue
-             endpoint = bp.api_endpoint if bp.api_endpoint.startswith("/") else f"/{bp.api_endpoint}"
-             if bp.api_version:
-                 if bp.api_version not in versioned_apps:
-                     versioned_apps[bp.api_version] = web.Application(middlewares=api_middlewares)
-                 versioned_apps[bp.api_version].router.add_post(endpoint, self._create_job_handler(bp))
-             else:
-                 protected_app.router.add_post(endpoint, self._create_job_handler(bp))
-                 has_unversioned_routes = True
-
-         all_protected_apps = list(versioned_apps.values())
-         if has_unversioned_routes:
-             all_protected_apps.append(protected_app)
-
-         for app in all_protected_apps:
-             self._register_common_routes(app)
-         if has_unversioned_routes:
-             self.app.add_subapp("/api/", protected_app)
-         for version, app in versioned_apps.items():
-             self.app.add_subapp(f"/api/{version}", app)
-
-         worker_auth_middleware = worker_auth_middleware_factory(self.storage, self.config)
-         worker_middlewares = [worker_auth_middleware]
-         if self.config.RATE_LIMITING_ENABLED:
-             worker_rate_limiter = rate_limit_middleware_factory(storage=self.storage, limit=5, period=60)
-             worker_middlewares.append(worker_rate_limiter)
-
-         worker_app = web.Application(middlewares=worker_middlewares)
-         worker_app.router.add_post("/workers/register", self._register_worker_handler)
-         worker_app.router.add_get("/workers/{worker_id}/tasks/next", self._handle_get_next_task)
-         worker_app.router.add_patch("/workers/{worker_id}", self._worker_update_handler)
-         worker_app.router.add_post("/tasks/result", self._task_result_handler)
-         worker_app.router.add_get("/ws/{worker_id}", self._websocket_handler)
-         self.app.add_subapp("/_worker/", worker_app)
-
-     def _register_common_routes(self, app):
-         app.router.add_get("/jobs/{job_id}", self._get_job_status_handler)
-         app.router.add_post("/jobs/{job_id}/cancel", self._cancel_job_handler)
-         if not isinstance(self.history_storage, NoOpHistoryStorage):
-             app.router.add_get("/jobs/{job_id}/history", self._get_job_history_handler)
-         app.router.add_get("/blueprints/{blueprint_name}/graph", self._get_blueprint_graph_handler)
-         app.router.add_get("/workers", self._get_workers_handler)
-         app.router.add_get("/jobs", self._get_jobs_handler)
-         app.router.add_get("/dashboard", self._get_dashboard_handler)
-         app.router.add_post("/admin/reload-workers", self._reload_worker_configs_handler)
-
-     async def _websocket_handler(self, request: web.Request) -> web.WebSocketResponse:
-         worker_id = request.match_info.get("worker_id")
-         if not worker_id:
-             raise web.HTTPBadRequest(text="worker_id is required")
-
-         ws = web.WebSocketResponse()
-         await ws.prepare(request)
-
-         await self.ws_manager.register(worker_id, ws)
-         try:
-             async for msg in ws:
-                 if msg.type == WSMsgType.TEXT:
-                     try:
-                         data = msg.json()
-                         await self.ws_manager.handle_message(worker_id, data)
-                     except Exception as e:
-                         logger.error(f"Error processing WebSocket message from {worker_id}: {e}")
-                 elif msg.type == WSMsgType.ERROR:
-                     logger.error(f"WebSocket connection for {worker_id} closed with exception {ws.exception()}")
-                     break
-         finally:
-             await self.ws_manager.unregister(worker_id)
-         return ws
-
-     async def _handle_get_next_task(self, request: web.Request) -> web.Response:
-         worker_id = request.match_info.get("worker_id")
-         if not worker_id:
-             return json_response({"error": "worker_id is required in path"}, status=400)
-
-         logger.debug(f"Worker {worker_id} is requesting a new task.")
-         task = await self.storage.dequeue_task_for_worker(worker_id, self.config.WORKER_POLL_TIMEOUT_SECONDS)
-
-         if task:
-             logger.info(f"Sending task {task.get('task_id')} to worker {worker_id}")
-             return json_response(task, status=200)
-         logger.debug(f"No tasks for worker {worker_id}, responding 204.")
-         return web.Response(status=204)
-
-     async def _worker_update_handler(self, request: web.Request) -> web.Response:
-         """
-         Handles both full updates and lightweight heartbeats for a worker.
-
-         If the request has a JSON body, it updates the worker's data.
-         In either case, it refreshes the worker's TTL, serving as a heartbeat.
-         """
-         worker_id = request.match_info.get("worker_id")
-         if not worker_id:
-             return json_response({"error": "worker_id is required in path"}, status=400)
-
-         ttl = self.config.WORKER_HEALTH_CHECK_INTERVAL_SECONDS * 2
-         update_data = None
-
-         # Check for body content without consuming it if it's not JSON
-         if request.can_read_body:
-             try:
-                 update_data = await request.json(loads=loads)
-             except Exception:
-                 logger.warning(
-                     f"Received PATCH from worker {worker_id} with non-JSON body. Treating as TTL-only heartbeat."
-                 )
+     async def send_job_webhook(self, job_state: dict[str, Any], event: str) -> None:
+         """Sends a webhook notification for a job event."""
+         webhook_url = job_state.get("webhook_url")
+         if not webhook_url:
+             return

-         if update_data:
-             # Full update path
-             updated_worker = await self.storage.update_worker_status(worker_id, update_data, ttl)
-             if not updated_worker:
-                 return json_response({"error": "Worker not found"}, status=404)
-
-             await self.history_storage.log_worker_event(
-                 {
-                     "worker_id": worker_id,
-                     "event_type": "status_update",
-                     "worker_info_snapshot": updated_worker,
-                 },
-             )
-             return json_response(updated_worker, status=200)
-         else:
-             # Lightweight TTL-only heartbeat path
-             refreshed = await self.storage.refresh_worker_ttl(worker_id, ttl)
-             if not refreshed:
-                 return json_response({"error": "Worker not found"}, status=404)
-             return json_response({"status": "ttl_refreshed"})
-
-     async def _register_worker_handler(self, request: web.Request) -> web.Response:
-         # The worker_registration_data is attached by the auth middleware
-         # to avoid reading the request body twice.
-         worker_data = request.get("worker_registration_data")
-         if not worker_data:
-             return json_response({"error": "Worker data not found in request"}, status=500)
-
-         worker_id = worker_data.get("worker_id")
-         # This check is redundant if the middleware works, but good for safety
-         if not worker_id:
-             return json_response({"error": "Missing required field: worker_id"}, status=400)
-
-         ttl = self.config.WORKER_HEALTH_CHECK_INTERVAL_SECONDS * 2
-         await self.storage.register_worker(worker_id, worker_data, ttl)
-
-         logger.info(
-             f"Worker '{worker_id}' registered with info: {worker_data}",
+         payload = WebhookPayload(
+             event=event,
+             job_id=job_state["id"],
+             status=job_state["status"],
+             result=job_state.get("state_history"),  # Or specific result
+             error=job_state.get("error_message"),
          )

-         await self.history_storage.log_worker_event(
-             {
-                 "worker_id": worker_id,
-                 "event_type": "registered",
-                 "worker_info_snapshot": worker_data,
-             },
-         )
-         return json_response({"status": "registered"}, status=200)
+         # Run in background to not block the main flow
+         create_task(self.webhook_sender.send(webhook_url, payload))

-     def run(self):
+     def run(self) -> None:
          self.setup()
          print(
              f"Starting OrchestratorEngine API server on {self.config.API_HOST}:{self.config.API_PORT} in blocking mode."