avtomatika 1.0b5-py3-none-any.whl → 1.0b7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
avtomatika/engine.py CHANGED
@@ -1,48 +1,50 @@
- from asyncio import Task, create_task, gather, get_running_loop, wait_for
  from asyncio import TimeoutError as AsyncTimeoutError
+ from asyncio import create_task, gather, get_running_loop, wait_for
  from logging import getLogger
- from typing import Any, Callable
+ from typing import Any
  from uuid import uuid4

- from aiohttp import ClientSession, WSMsgType, web
- from aiohttp.web import AppKey
- from aioprometheus import render
- from orjson import OPT_INDENT_2, dumps, loads
+ from aiohttp import ClientSession, web
+ from orjson import dumps

  from . import metrics
+ from .api.routes import setup_routes
+ from .app_keys import (
+     DISPATCHER_KEY,
+     ENGINE_KEY,
+     EXECUTOR_KEY,
+     EXECUTOR_TASK_KEY,
+     HEALTH_CHECKER_KEY,
+     HEALTH_CHECKER_TASK_KEY,
+     HTTP_SESSION_KEY,
+     REPUTATION_CALCULATOR_KEY,
+     REPUTATION_CALCULATOR_TASK_KEY,
+     SCHEDULER_KEY,
+     SCHEDULER_TASK_KEY,
+     WATCHER_KEY,
+     WATCHER_TASK_KEY,
+     WS_MANAGER_KEY,
+ )
  from .blueprint import StateMachineBlueprint
  from .client_config_loader import load_client_configs_to_redis
  from .compression import compression_middleware
  from .config import Config
+ from .constants import JOB_STATUS_FAILED, JOB_STATUS_PENDING, JOB_STATUS_QUARANTINED, JOB_STATUS_WAITING_FOR_WORKER
  from .dispatcher import Dispatcher
  from .executor import JobExecutor
  from .health_checker import HealthChecker
  from .history.base import HistoryStorageBase
  from .history.noop import NoOpHistoryStorage
  from .logging_config import setup_logging
- from .quota import quota_middleware_factory
- from .ratelimit import rate_limit_middleware_factory
  from .reputation import ReputationCalculator
- from .security import client_auth_middleware_factory, worker_auth_middleware_factory
+ from .scheduler import Scheduler
  from .storage.base import StorageBackend
  from .telemetry import setup_telemetry
+ from .utils.webhook_sender import WebhookPayload, WebhookSender
  from .watcher import Watcher
  from .worker_config_loader import load_worker_configs_to_redis
  from .ws_manager import WebSocketManager

- # Application keys for storing components
- ENGINE_KEY = AppKey("engine", "OrchestratorEngine")
- HTTP_SESSION_KEY = AppKey("http_session", ClientSession)
- DISPATCHER_KEY = AppKey("dispatcher", Dispatcher)
- EXECUTOR_KEY = AppKey("executor", JobExecutor)
- WATCHER_KEY = AppKey("watcher", Watcher)
- REPUTATION_CALCULATOR_KEY = AppKey("reputation_calculator", ReputationCalculator)
- HEALTH_CHECKER_KEY = AppKey("health_checker", HealthChecker)
- EXECUTOR_TASK_KEY = AppKey("executor_task", Task)
- WATCHER_TASK_KEY = AppKey("watcher_task", Task)
- REPUTATION_CALCULATOR_TASK_KEY = AppKey("reputation_calculator_task", Task)
- HEALTH_CHECKER_TASK_KEY = AppKey("health_checker_task", Task)
-
  metrics.init_metrics()

  logger = getLogger(__name__)
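
Note: the AppKey declarations removed above were centralized in a new avtomatika/app_keys.py module, which this diff shows only through its import list. A rough sketch of that module for orientation: the key names come from the import above and the first group mirrors the definitions deleted from engine.py; the value types of the three new keys, and the use of string forward references instead of the concrete classes, are assumptions.

# Hypothetical avtomatika/app_keys.py -- a sketch, not the published module.
from asyncio import Task

from aiohttp.web import AppKey

ENGINE_KEY = AppKey("engine", "OrchestratorEngine")
HTTP_SESSION_KEY = AppKey("http_session", "ClientSession")
DISPATCHER_KEY = AppKey("dispatcher", "Dispatcher")
EXECUTOR_KEY = AppKey("executor", "JobExecutor")
WATCHER_KEY = AppKey("watcher", "Watcher")
REPUTATION_CALCULATOR_KEY = AppKey("reputation_calculator", "ReputationCalculator")
HEALTH_CHECKER_KEY = AppKey("health_checker", "HealthChecker")
SCHEDULER_KEY = AppKey("scheduler", "Scheduler")           # new in 1.0b7 (assumed value type)
WS_MANAGER_KEY = AppKey("ws_manager", "WebSocketManager")  # new in 1.0b7 (assumed value type)
EXECUTOR_TASK_KEY = AppKey("executor_task", Task)
WATCHER_TASK_KEY = AppKey("watcher_task", Task)
REPUTATION_CALCULATOR_TASK_KEY = AppKey("reputation_calculator_task", Task)
HEALTH_CHECKER_TASK_KEY = AppKey("health_checker_task", Task)
SCHEDULER_TASK_KEY = AppKey("scheduler_task", Task)        # new in 1.0b7

Centralizing the keys lets the new api.routes module and the engine share typed access to the same application state without importing each other's internals.
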
@@ -56,17 +58,9 @@ def json_response(data: Any, **kwargs: Any) -> web.Response:
      return web.json_response(data, dumps=json_dumps, **kwargs)


- async def status_handler(_request: web.Request) -> web.Response:
-     return json_response({"status": "ok"})
-
-
- async def metrics_handler(_request: web.Request) -> web.Response:
-     return web.Response(body=render(), content_type="text/plain")
-
-
  class OrchestratorEngine:
      def __init__(self, storage: StorageBackend, config: Config):
-         setup_logging(config.LOG_LEVEL, config.LOG_FORMAT)
+         setup_logging(config.LOG_LEVEL, config.LOG_FORMAT, config.TZ)
          setup_telemetry()
          self.storage = storage
          self.config = config
@@ -77,7 +71,7 @@ class OrchestratorEngine:
          self.app[ENGINE_KEY] = self
          self._setup_done = False

-     def register_blueprint(self, blueprint: StateMachineBlueprint):
+     def register_blueprint(self, blueprint: StateMachineBlueprint) -> None:
          if self._setup_done:
              raise RuntimeError("Cannot register blueprints after engine setup.")
          if blueprint.name in self.blueprints:
@@ -87,15 +81,15 @@ class OrchestratorEngine:
          blueprint.validate()
          self.blueprints[blueprint.name] = blueprint

-     def setup(self):
+     def setup(self) -> None:
          if self._setup_done:
              return
-         self._setup_routes()
+         setup_routes(self.app, self)
          self.app.on_startup.append(self.on_startup)
          self.app.on_shutdown.append(self.on_shutdown)
          self._setup_done = True

-     async def _setup_history_storage(self):
+     async def _setup_history_storage(self) -> None:
          from importlib import import_module

          uri = self.config.HISTORY_DATABASE_URI
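
Note: HTTP routing now lives in the new avtomatika/api/routes.py module and is attached with a single setup_routes(self.app, self) call. That module is not part of this file's diff; the sketch below shows only the expected entry-point shape, inferred from the call above and from the _setup_routes/_register_common_routes methods removed near the end of this diff. Handler names and wiring here are illustrative.

# Hypothetical skeleton of avtomatika/api/routes.py (illustrative, not the shipped code).
from aiohttp import web


async def status_handler(_request: web.Request) -> web.Response:
    # Counterpart of the status_handler removed from engine.py above.
    return web.json_response({"status": "ok"})


def setup_routes(app: web.Application, engine) -> None:
    # "engine" gives the real handlers access to storage, blueprints, ws_manager, etc.
    public_app = web.Application()
    public_app.router.add_get("/status", status_handler)
    app.add_subapp("/_public/", public_app)
    # The protected /api/ sub-apps (client auth + quota middlewares) and the
    # /_worker/ sub-app (worker auth, optional rate limiting) would be assembled
    # here as well, mirroring the OrchestratorEngine._setup_routes removed below.
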
@@ -115,7 +109,7 @@ class OrchestratorEngine:
                  storage_class = module.SQLiteHistoryStorage
                  parsed_uri = urlparse(uri)
                  db_path = parsed_uri.path
-                 storage_args = [db_path]
+                 storage_args = [db_path, self.config.TZ]
              except ImportError as e:
                  logger.error(f"Could not import SQLiteHistoryStorage, perhaps aiosqlite is not installed? Error: {e}")
                  self.history_storage = NoOpHistoryStorage()
@@ -125,7 +119,7 @@ class OrchestratorEngine:
              try:
                  module = import_module(".history.postgres", package="avtomatika")
                  storage_class = module.PostgresHistoryStorage
-                 storage_args = [uri]
+                 storage_args = [uri, self.config.TZ]
              except ImportError as e:
                  logger.error(f"Could not import PostgresHistoryStorage, perhaps asyncpg is not installed? Error: {e}")
                  self.history_storage = NoOpHistoryStorage()
@@ -146,7 +140,7 @@ class OrchestratorEngine:
              )
              self.history_storage = NoOpHistoryStorage()

-     async def on_startup(self, app: web.Application):
+     async def on_startup(self, app: web.Application) -> None:
          try:
              from opentelemetry.instrumentation.aiohttp_client import (
                  AioHttpClientInstrumentor,
@@ -193,24 +187,29 @@ class OrchestratorEngine:
          )

          app[HTTP_SESSION_KEY] = ClientSession()
+         self.webhook_sender = WebhookSender(app[HTTP_SESSION_KEY])
          self.dispatcher = Dispatcher(self.storage, self.config)
          app[DISPATCHER_KEY] = self.dispatcher
          app[EXECUTOR_KEY] = JobExecutor(self, self.history_storage)
          app[WATCHER_KEY] = Watcher(self)
          app[REPUTATION_CALCULATOR_KEY] = ReputationCalculator(self)
          app[HEALTH_CHECKER_KEY] = HealthChecker(self)
+         app[SCHEDULER_KEY] = Scheduler(self)
+         app[WS_MANAGER_KEY] = self.ws_manager

          app[EXECUTOR_TASK_KEY] = create_task(app[EXECUTOR_KEY].run())
          app[WATCHER_TASK_KEY] = create_task(app[WATCHER_KEY].run())
          app[REPUTATION_CALCULATOR_TASK_KEY] = create_task(app[REPUTATION_CALCULATOR_KEY].run())
          app[HEALTH_CHECKER_TASK_KEY] = create_task(app[HEALTH_CHECKER_KEY].run())
+         app[SCHEDULER_TASK_KEY] = create_task(app[SCHEDULER_KEY].run())

-     async def on_shutdown(self, app: web.Application):
+     async def on_shutdown(self, app: web.Application) -> None:
          logger.info("Shutdown sequence started.")
          app[EXECUTOR_KEY].stop()
          app[WATCHER_KEY].stop()
          app[REPUTATION_CALCULATOR_KEY].stop()
          app[HEALTH_CHECKER_KEY].stop()
+         app[SCHEDULER_KEY].stop()
          logger.info("Background task running flags set to False.")

          if hasattr(self.history_storage, "close"):
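
Note: the new Scheduler follows the same lifecycle contract as the executor, watcher, reputation calculator and health checker: constructed with the engine, started via create_task(component.run()) in on_startup, and stopped via stop() (plus a defensive cancel) in on_shutdown. A minimal sketch of that contract, assuming the scheduler submits work through the create_background_job() method added later in this diff; the class name, interval and blueprint name are invented for illustration.

# Illustrative background-component skeleton (assumed interface, not the shipped Scheduler).
from asyncio import sleep


class PeriodicScheduler:
    def __init__(self, engine) -> None:
        self.engine = engine
        self._running = False

    async def run(self) -> None:
        # Started with create_task(...) in on_startup, like the other components.
        self._running = True
        while self._running:
            # A real scheduler would evaluate its schedule here; this sketch just
            # submits one hypothetical blueprint once a minute.
            await self.engine.create_background_job("nightly_cleanup", {}, source="scheduler")
            await sleep(60)

    def stop(self) -> None:
        # Called from on_shutdown; the task is also cancelled there as a safety net.
        self._running = False
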
@@ -226,6 +225,8 @@ class OrchestratorEngine:
          app[WATCHER_TASK_KEY].cancel()
          app[REPUTATION_CALCULATOR_TASK_KEY].cancel()
          app[EXECUTOR_TASK_KEY].cancel()
+         # Scheduler task manages its own loop cancellation in stop(), but just in case:
+         app[SCHEDULER_TASK_KEY].cancel()
          logger.info("Background tasks cancelled.")

          logger.info("Gathering background tasks with a 10s timeout...")
@@ -236,6 +237,7 @@ class OrchestratorEngine:
                  app[WATCHER_TASK_KEY],
                  app[REPUTATION_CALCULATOR_TASK_KEY],
                  app[EXECUTOR_TASK_KEY],
+                 app[SCHEDULER_TASK_KEY],
                  return_exceptions=True,
              ),
              timeout=10.0,
@@ -249,586 +251,110 @@ class OrchestratorEngine:
          logger.info("HTTP session closed.")
          logger.info("Shutdown sequence finished.")

-     def _create_job_handler(self, blueprint: StateMachineBlueprint) -> Callable:
-         async def handler(request: web.Request) -> web.Response:
-             try:
-                 initial_data = await request.json(loads=loads)
-             except Exception:
-                 return json_response({"error": "Invalid JSON body"}, status=400)
-
-             client_config = request["client_config"]
-             carrier = {str(k): v for k, v in request.headers.items()}
-
-             job_id = str(uuid4())
-             job_state = {
-                 "id": job_id,
-                 "blueprint_name": blueprint.name,
-                 "current_state": blueprint.start_state,
-                 "initial_data": initial_data,
-                 "state_history": {},
-                 "status": "pending",
-                 "tracing_context": carrier,
-                 "client_config": client_config,
-             }
-             await self.storage.save_job_state(job_id, job_state)
-             await self.storage.enqueue_job(job_id)
-             metrics.jobs_total.inc({metrics.LABEL_BLUEPRINT: blueprint.name})
-             return json_response({"status": "accepted", "job_id": job_id}, status=202)
-
-         return handler
-
-     async def _get_job_status_handler(self, request: web.Request) -> web.Response:
-         job_id = request.match_info.get("job_id")
-         if not job_id:
-             return json_response({"error": "job_id is required in path"}, status=400)
-         job_state = await self.storage.get_job_state(job_id)
-         if not job_state:
-             return json_response({"error": "Job not found"}, status=404)
-         return json_response(job_state, status=200)
-
-     async def _cancel_job_handler(self, request: web.Request) -> web.Response:
-         job_id = request.match_info.get("job_id")
-         if not job_id:
-             return json_response({"error": "job_id is required in path"}, status=400)
-
-         job_state = await self.storage.get_job_state(job_id)
-         if not job_state:
-             return json_response({"error": "Job not found"}, status=404)
-
-         if job_state.get("status") != "waiting_for_worker":
-             return json_response(
-                 {"error": "Job is not in a state that can be cancelled (must be waiting for a worker)."},
-                 status=409,
-             )
-
-         worker_id = job_state.get("task_worker_id")
-         if not worker_id:
-             return json_response(
-                 {"error": "Cannot cancel job: worker_id not found in job state."},
-                 status=500,
-             )
-
-         worker_info = await self.storage.get_worker_info(worker_id)
-         task_id = job_state.get("current_task_id")
-         if not task_id:
-             return json_response(
-                 {"error": "Cannot cancel job: task_id not found in job state."},
-                 status=500,
-             )
-
-         # Set Redis flag as a reliable fallback/primary mechanism
-         await self.storage.set_task_cancellation_flag(task_id)
-
-         # Attempt WebSocket-based cancellation if supported
-         if worker_info and worker_info.get("capabilities", {}).get("websockets"):
-             command = {"command": "cancel_task", "task_id": task_id, "job_id": job_id}
-             sent = await self.ws_manager.send_command(worker_id, command)
-             if sent:
-                 return json_response({"status": "cancellation_request_sent"})
-             else:
-                 logger.warning(f"Failed to send WebSocket cancellation for task {task_id}, but Redis flag is set.")
-                 # Proceed to return success, as the Redis flag will handle it
-
-         return json_response({"status": "cancellation_request_accepted"})
-
-     async def _get_job_history_handler(self, request: web.Request) -> web.Response:
-         job_id = request.match_info.get("job_id")
-         if not job_id:
-             return json_response({"error": "job_id is required in path"}, status=400)
-         history = await self.history_storage.get_job_history(job_id)
-         return json_response(history)
-
-     async def _get_blueprint_graph_handler(self, request: web.Request) -> web.Response:
-         blueprint_name = request.match_info.get("blueprint_name")
-         if not blueprint_name:
-             return json_response({"error": "blueprint_name is required in path"}, status=400)
-
+     async def create_background_job(
+         self,
+         blueprint_name: str,
+         initial_data: dict[str, Any],
+         source: str = "internal",
+     ) -> str:
+         """Creates a job directly, bypassing the HTTP API layer.
+         Useful for internal schedulers and triggers.
+         """
          blueprint = self.blueprints.get(blueprint_name)
          if not blueprint:
-             return json_response({"error": "Blueprint not found"}, status=404)
-
-         try:
-             graph_dot = blueprint.render_graph()
-             return web.Response(text=graph_dot, content_type="text/vnd.graphviz")
-         except FileNotFoundError:
-             error_msg = "Graphviz is not installed on the server. Cannot generate graph."
-             logger.error(error_msg)
-             return json_response({"error": error_msg}, status=501)
-
-     async def _get_workers_handler(self, request: web.Request) -> web.Response:
-         workers = await self.storage.get_available_workers()
-         return json_response(workers)
-
-     async def _get_jobs_handler(self, request: web.Request) -> web.Response:
-         try:
-             limit = int(request.query.get("limit", "100"))
-             offset = int(request.query.get("offset", "0"))
-         except ValueError:
-             return json_response({"error": "Invalid limit/offset parameter"}, status=400)
-
-         jobs = await self.history_storage.get_jobs(limit=limit, offset=offset)
-         return json_response(jobs)
-
-     async def _get_dashboard_handler(self, request: web.Request) -> web.Response:
-         worker_count = await self.storage.get_active_worker_count()
-         queue_length = await self.storage.get_job_queue_length()
-         job_summary = await self.history_storage.get_job_summary()
-
-         dashboard_data = {
-             "workers": {"total": worker_count},
-             "jobs": {"queued": queue_length, **job_summary},
+             raise ValueError(f"Blueprint '{blueprint_name}' not found.")
+
+         job_id = str(uuid4())
+         # Use a special internal client config
+         client_config = {
+             "token": "internal-scheduler",
+             "plan": "system",
+             "params": {"source": source},
          }
-         return json_response(dashboard_data)
-
-     async def _task_result_handler(self, request: web.Request) -> web.Response:
-         import logging
-
-         try:
-             data = await request.json(loads=loads)
-             job_id = data.get("job_id")
-             task_id = data.get("task_id")
-             result = data.get("result", {})
-             result_status = result.get("status", "success")
-             error_message = result.get("error")
-             payload_worker_id = data.get("worker_id")
-         except Exception:
-             return json_response({"error": "Invalid JSON body"}, status=400)
-
-         # Security check: Ensure the worker_id from the payload matches the authenticated worker
-         authenticated_worker_id = request.get("worker_id")
-         if not authenticated_worker_id:
-             # This should not happen if the auth middleware is working correctly
-             return json_response({"error": "Could not identify authenticated worker."}, status=500)
-
-         if payload_worker_id and payload_worker_id != authenticated_worker_id:
-             return json_response(
-                 {
-                     "error": f"Forbidden: Authenticated worker '{authenticated_worker_id}' "
-                     f"cannot submit results for another worker '{payload_worker_id}'.",
-                 },
-                 status=403,
-             )
-
-         if not job_id or not task_id:
-             return json_response({"error": "job_id and task_id are required"}, status=400)
-
-         job_state = await self.storage.get_job_state(job_id)
-         if not job_state:
-             return json_response({"error": "Job not found"}, status=404)
-
-         # Handle parallel task completion
-         if job_state.get("status") == "waiting_for_parallel_tasks":
-             await self.storage.remove_job_from_watch(f"{job_id}:{task_id}")
-             job_state.setdefault("aggregation_results", {})[task_id] = result
-             job_state.setdefault("active_branches", []).remove(task_id)
-
-             if not job_state["active_branches"]:
-                 logger.info(f"All parallel branches for job {job_id} have completed.")
-                 job_state["status"] = "running"
-                 job_state["current_state"] = job_state["aggregation_target"]
-                 await self.storage.save_job_state(job_id, job_state)
-                 await self.storage.enqueue_job(job_id)
-             else:
-                 logger.info(
-                     f"Branch {task_id} for job {job_id} completed. "
-                     f"Waiting for {len(job_state['active_branches'])} more.",
-                 )
-                 await self.storage.save_job_state(job_id, job_state)
-
-             return json_response({"status": "parallel_branch_result_accepted"}, status=200)
-
-         await self.storage.remove_job_from_watch(job_id)
-
-         import time

-         now = time.monotonic()
-         dispatched_at = job_state.get("task_dispatched_at", now)
-         duration_ms = int((now - dispatched_at) * 1000)
+         job_state = {
+             "id": job_id,
+             "blueprint_name": blueprint.name,
+             "current_state": blueprint.start_state,
+             "initial_data": initial_data,
+             "state_history": {},
+             "status": JOB_STATUS_PENDING,
+             "tracing_context": {},
+             "client_config": client_config,
+         }
+         await self.storage.save_job_state(job_id, job_state)
+         await self.storage.enqueue_job(job_id)
+         metrics.jobs_total.inc({metrics.LABEL_BLUEPRINT: blueprint.name})

+         # Log the creation in history as well (so we can track scheduled jobs)
          await self.history_storage.log_job_event(
              {
                  "job_id": job_id,
-                 "state": job_state.get("current_state"),
-                 "event_type": "task_finished",
-                 "duration_ms": duration_ms,
-                 "worker_id": authenticated_worker_id, # Use authenticated worker_id
-                 "context_snapshot": {**job_state, "result": result},
-             },
+                 "state": "pending",
+                 "event_type": "job_created",
+                 "context_snapshot": job_state,
+                 "metadata": {"source": source, "scheduled": True},
+             }
          )

-         job_state["tracing_context"] = {str(k): v for k, v in request.headers.items()}
-
-         if result_status == "failure":
-             error_details = result.get("error", {})
-             error_type = "TRANSIENT_ERROR"
-             error_message = "No error details provided."
-
-             if isinstance(error_details, dict):
-                 error_type = error_details.get("code", "TRANSIENT_ERROR")
-                 error_message = error_details.get("message", "No error message provided.")
-             elif isinstance(error_details, str):
-                 # Fallback for old format where `error` was just a string
-                 error_message = error_details
-
-             logging.warning(f"Task {task_id} for job {job_id} failed with error type '{error_type}'.")
-
-             if error_type == "PERMANENT_ERROR":
-                 job_state["status"] = "quarantined"
-                 job_state["error_message"] = f"Task failed with permanent error: {error_message}"
-                 await self.storage.save_job_state(job_id, job_state)
-                 await self.storage.quarantine_job(job_id)
-             elif error_type == "INVALID_INPUT_ERROR":
-                 job_state["status"] = "failed"
-                 job_state["error_message"] = f"Task failed due to invalid input: {error_message}"
-                 await self.storage.save_job_state(job_id, job_state)
-             else: # TRANSIENT_ERROR or any other/unspecified error
-                 await self._handle_task_failure(job_state, task_id, error_message)
-
-             return json_response({"status": "result_accepted_failure"}, status=200)
-
-         if result_status == "cancelled":
-             logging.info(f"Task {task_id} for job {job_id} was cancelled by worker.")
-             job_state["status"] = "cancelled"
-             await self.storage.save_job_state(job_id, job_state)
-             # Optionally, trigger a specific 'cancelled' transition if defined in the blueprint
-             transitions = job_state.get("current_task_transitions", {})
-             if next_state := transitions.get("cancelled"):
-                 job_state["current_state"] = next_state
-                 job_state["status"] = "running" # It's running the cancellation handler now
-                 await self.storage.save_job_state(job_id, job_state)
-                 await self.storage.enqueue_job(job_id)
-             return json_response({"status": "result_accepted_cancelled"}, status=200)
-
-         transitions = job_state.get("current_task_transitions", {})
-         if next_state := transitions.get(result_status):
-             logging.info(f"Job {job_id} transitioning based on worker status '{result_status}' to state '{next_state}'")
-
-             worker_data = result.get("data")
-             if worker_data and isinstance(worker_data, dict):
-                 if "state_history" not in job_state:
-                     job_state["state_history"] = {}
-                 job_state["state_history"].update(worker_data)
-
-             job_state["current_state"] = next_state
-             job_state["status"] = "running"
-             await self.storage.save_job_state(job_id, job_state)
-             await self.storage.enqueue_job(job_id)
-         else:
-             logging.error(f"Job {job_id} failed. Worker returned unhandled status '{result_status}'.")
-             job_state["status"] = "failed"
-             job_state["error_message"] = f"Worker returned unhandled status: {result_status}"
-             await self.storage.save_job_state(job_id, job_state)
-
-         return json_response({"status": "result_accepted_success"}, status=200)
-
-     async def _handle_task_failure(self, job_state: dict, task_id: str, error_message: str | None):
-         import logging
+         logger.info(f"Created background job {job_id} for blueprint '{blueprint_name}' (source: {source})")
+         return job_id

+     async def handle_task_failure(self, job_state: dict[str, Any], task_id: str, error_message: str | None) -> None:
+         """Handles a transient task failure by retrying or quarantining."""
          job_id = job_state["id"]
          retry_count = job_state.get("retry_count", 0)
          max_retries = self.config.JOB_MAX_RETRIES

          if retry_count < max_retries:
              job_state["retry_count"] = retry_count + 1
-             logging.info(f"Retrying task for job {job_id}. Attempt {retry_count + 1}/{max_retries}.")
+             logger.info(f"Retrying task for job {job_id}. Attempt {retry_count + 1}/{max_retries}.")

              task_info = job_state.get("current_task_info")
              if not task_info:
-                 logging.error(f"Cannot retry job {job_id}: missing 'current_task_info' in job state.")
-                 job_state["status"] = "failed"
+                 logger.error(f"Cannot retry job {job_id}: missing 'current_task_info' in job state.")
+                 job_state["status"] = JOB_STATUS_FAILED
                  job_state["error_message"] = "Cannot retry: original task info not found."
                  await self.storage.save_job_state(job_id, job_state)
+                 await self.send_job_webhook(job_state, "job_failed")
                  return

              now = get_running_loop().time()
              timeout_seconds = task_info.get("timeout_seconds", self.config.WORKER_TIMEOUT_SECONDS)
              timeout_at = now + timeout_seconds

-             job_state["status"] = "waiting_for_worker"
+             job_state["status"] = JOB_STATUS_WAITING_FOR_WORKER
              job_state["task_dispatched_at"] = now
              await self.storage.save_job_state(job_id, job_state)
              await self.storage.add_job_to_watch(job_id, timeout_at)

              await self.dispatcher.dispatch(job_state, task_info)
          else:
-             logging.critical(f"Job {job_id} has failed {max_retries + 1} times. Moving to quarantine.")
-             job_state["status"] = "quarantined"
+             logger.critical(f"Job {job_id} has failed {max_retries + 1} times. Moving to quarantine.")
+             job_state["status"] = JOB_STATUS_QUARANTINED
              job_state["error_message"] = f"Task failed after {max_retries + 1} attempts: {error_message}"
              await self.storage.save_job_state(job_id, job_state)
              await self.storage.quarantine_job(job_id)
+             await self.send_job_webhook(job_state, "job_quarantined")

-     async def _human_approval_webhook_handler(self, request: web.Request) -> web.Response:
-         job_id = request.match_info.get("job_id")
-         if not job_id:
-             return json_response({"error": "job_id is required in path"}, status=400)
-         try:
-             data = await request.json(loads=loads)
-             decision = data.get("decision")
-             if not decision:
-                 return json_response({"error": "decision is required in body"}, status=400)
-         except Exception:
-             return json_response({"error": "Invalid JSON body"}, status=400)
-         job_state = await self.storage.get_job_state(job_id)
-         if not job_state:
-             return json_response({"error": "Job not found"}, status=404)
-         if job_state.get("status") not in ["waiting_for_worker", "waiting_for_human"]:
-             return json_response({"error": "Job is not in a state that can be approved"}, status=409)
-         transitions = job_state.get("current_task_transitions", {})
-         next_state = transitions.get(decision)
-         if not next_state:
-             return json_response({"error": f"Invalid decision '{decision}' for this job"}, status=400)
-         job_state["current_state"] = next_state
-         job_state["status"] = "running"
-         await self.storage.save_job_state(job_id, job_state)
-         await self.storage.enqueue_job(job_id)
-         return json_response({"status": "approval_received", "job_id": job_id})
-
-     async def _get_quarantined_jobs_handler(self, request: web.Request) -> web.Response:
-         """Returns a list of all job IDs in the quarantine queue."""
-         jobs = await self.storage.get_quarantined_jobs()
-         return json_response(jobs)
-
-     async def _reload_worker_configs_handler(self, request: web.Request) -> web.Response:
-         """Handles the dynamic reloading of worker configurations."""
-         logger.info("Received request to reload worker configurations.")
-         if not self.config.WORKERS_CONFIG_PATH:
-             return json_response(
-                 {"error": "WORKERS_CONFIG_PATH is not set, cannot reload configs."},
-                 status=400,
-             )
-
-         await load_worker_configs_to_redis(self.storage, self.config.WORKERS_CONFIG_PATH)
-         return json_response({"status": "worker_configs_reloaded"})
-
-     async def _flush_db_handler(self, request: web.Request) -> web.Response:
-         logger.warning("Received request to flush the database.")
-         await self.storage.flush_all()
-         await load_client_configs_to_redis(self.storage)
-         return json_response({"status": "db_flushed"}, status=200)
-
-     async def _docs_handler(self, request: web.Request) -> web.Response:
-         from importlib import resources
-
-         try:
-             content = resources.read_text("avtomatika", "api.html")
-         except FileNotFoundError:
-             logger.error("api.html not found within the avtomatika package.")
-             return json_response({"error": "Documentation file not found on server."}, status=500)
-
-         # Generate dynamic documentation for registered blueprints
-         blueprint_endpoints = []
-         for bp in self.blueprints.values():
-             if not bp.api_endpoint:
-                 continue
-
-             version_prefix = f"/{bp.api_version}" if bp.api_version else ""
-             endpoint_path = bp.api_endpoint if bp.api_endpoint.startswith("/") else f"/{bp.api_endpoint}"
-             full_path = f"/api{version_prefix}{endpoint_path}"
-
-             blueprint_endpoints.append(
-                 {
-                     "id": f"post-create-{bp.name.replace('_', '-')}",
-                     "name": f"Create {bp.name.replace('_', ' ').title()} Job",
-                     "method": "POST",
-                     "path": full_path,
-                     "description": f"Creates and starts a new instance (Job) of the `{bp.name}` blueprint.",
-                     "request": {"body": {"initial_data": {}}},
-                     "responses": [
-                         {
-                             "code": "202 Accepted",
-                             "description": "Job successfully accepted for processing.",
-                             "body": {"status": "accepted", "job_id": "..."},
-                         }
-                     ],
-                 }
-             )
-
-         # Inject dynamic endpoints into the apiData structure in the HTML
-         if blueprint_endpoints:
-             endpoints_json = dumps(blueprint_endpoints, option=OPT_INDENT_2).decode("utf-8")
-             # We insert the new endpoints at the beginning of the 'Protected API' group
-             marker = "group: 'Protected API',\n endpoints: ["
-             content = content.replace(marker, f"{marker}\n{endpoints_json.strip('[]')},")
-
-         return web.Response(text=content, content_type="text/html")
-
-     def _setup_routes(self):
-         public_app = web.Application()
-         public_app.router.add_get("/status", status_handler)
-         public_app.router.add_get("/metrics", metrics_handler)
-         public_app.router.add_post("/webhooks/approval/{job_id}", self._human_approval_webhook_handler)
-         public_app.router.add_post("/debug/flush_db", self._flush_db_handler)
-         public_app.router.add_get("/docs", self._docs_handler)
-         public_app.router.add_get("/jobs/quarantined", self._get_quarantined_jobs_handler)
-         self.app.add_subapp("/_public/", public_app)
-
-         auth_middleware = client_auth_middleware_factory(self.storage)
-         quota_middleware = quota_middleware_factory(self.storage)
-         api_middlewares = [auth_middleware, quota_middleware]
-
-         protected_app = web.Application(middlewares=api_middlewares)
-         versioned_apps: dict[str, web.Application] = {}
-         has_unversioned_routes = False
-
-         for bp in self.blueprints.values():
-             if not bp.api_endpoint:
-                 continue
-             endpoint = bp.api_endpoint if bp.api_endpoint.startswith("/") else f"/{bp.api_endpoint}"
-             if bp.api_version:
-                 if bp.api_version not in versioned_apps:
-                     versioned_apps[bp.api_version] = web.Application(middlewares=api_middlewares)
-                 versioned_apps[bp.api_version].router.add_post(endpoint, self._create_job_handler(bp))
-             else:
-                 protected_app.router.add_post(endpoint, self._create_job_handler(bp))
-                 has_unversioned_routes = True
-
-         all_protected_apps = list(versioned_apps.values())
-         if has_unversioned_routes:
-             all_protected_apps.append(protected_app)
-
-         for app in all_protected_apps:
-             self._register_common_routes(app)
-         if has_unversioned_routes:
-             self.app.add_subapp("/api/", protected_app)
-         for version, app in versioned_apps.items():
-             self.app.add_subapp(f"/api/{version}", app)
-
-         worker_auth_middleware = worker_auth_middleware_factory(self.storage, self.config)
-         worker_middlewares = [worker_auth_middleware]
-         if self.config.RATE_LIMITING_ENABLED:
-             worker_rate_limiter = rate_limit_middleware_factory(storage=self.storage, limit=5, period=60)
-             worker_middlewares.append(worker_rate_limiter)
-
-         worker_app = web.Application(middlewares=worker_middlewares)
-         worker_app.router.add_post("/workers/register", self._register_worker_handler)
-         worker_app.router.add_get("/workers/{worker_id}/tasks/next", self._handle_get_next_task)
-         worker_app.router.add_patch("/workers/{worker_id}", self._worker_update_handler)
-         worker_app.router.add_post("/tasks/result", self._task_result_handler)
-         worker_app.router.add_get("/ws/{worker_id}", self._websocket_handler)
-         self.app.add_subapp("/_worker/", worker_app)
-
-     def _register_common_routes(self, app):
-         app.router.add_get("/jobs/{job_id}", self._get_job_status_handler)
-         app.router.add_post("/jobs/{job_id}/cancel", self._cancel_job_handler)
-         if not isinstance(self.history_storage, NoOpHistoryStorage):
-             app.router.add_get("/jobs/{job_id}/history", self._get_job_history_handler)
-         app.router.add_get("/blueprints/{blueprint_name}/graph", self._get_blueprint_graph_handler)
-         app.router.add_get("/workers", self._get_workers_handler)
-         app.router.add_get("/jobs", self._get_jobs_handler)
-         app.router.add_get("/dashboard", self._get_dashboard_handler)
-         app.router.add_post("/admin/reload-workers", self._reload_worker_configs_handler)
-
-     async def _websocket_handler(self, request: web.Request) -> web.WebSocketResponse:
-         worker_id = request.match_info.get("worker_id")
-         if not worker_id:
-             raise web.HTTPBadRequest(text="worker_id is required")
-
-         ws = web.WebSocketResponse()
-         await ws.prepare(request)
-
-         await self.ws_manager.register(worker_id, ws)
-         try:
-             async for msg in ws:
-                 if msg.type == WSMsgType.TEXT:
-                     try:
-                         data = msg.json()
-                         await self.ws_manager.handle_message(worker_id, data)
-                     except Exception as e:
-                         logger.error(f"Error processing WebSocket message from {worker_id}: {e}")
-                 elif msg.type == WSMsgType.ERROR:
-                     logger.error(f"WebSocket connection for {worker_id} closed with exception {ws.exception()}")
-                     break
-         finally:
-             await self.ws_manager.unregister(worker_id)
-         return ws
-
-     async def _handle_get_next_task(self, request: web.Request) -> web.Response:
-         worker_id = request.match_info.get("worker_id")
-         if not worker_id:
-             return json_response({"error": "worker_id is required in path"}, status=400)
-
-         logger.debug(f"Worker {worker_id} is requesting a new task.")
-         task = await self.storage.dequeue_task_for_worker(worker_id, self.config.WORKER_POLL_TIMEOUT_SECONDS)
-
-         if task:
-             logger.info(f"Sending task {task.get('task_id')} to worker {worker_id}")
-             return json_response(task, status=200)
-         logger.debug(f"No tasks for worker {worker_id}, responding 204.")
-         return web.Response(status=204)
-
-     async def _worker_update_handler(self, request: web.Request) -> web.Response:
-         """
-         Handles both full updates and lightweight heartbeats for a worker.
-
-         If the request has a JSON body, it updates the worker's data.
-         In either case, it refreshes the worker's TTL, serving as a heartbeat.
-         """
-         worker_id = request.match_info.get("worker_id")
-         if not worker_id:
-             return json_response({"error": "worker_id is required in path"}, status=400)
-
-         ttl = self.config.WORKER_HEALTH_CHECK_INTERVAL_SECONDS * 2
-         update_data = None
-
-         # Check for body content without consuming it if it's not JSON
-         if request.can_read_body:
-             try:
-                 update_data = await request.json(loads=loads)
-             except Exception:
-                 logger.warning(
-                     f"Received PATCH from worker {worker_id} with non-JSON body. Treating as TTL-only heartbeat."
-                 )
+     async def send_job_webhook(self, job_state: dict[str, Any], event: str) -> None:
+         """Sends a webhook notification for a job event."""
+         webhook_url = job_state.get("webhook_url")
+         if not webhook_url:
+             return

-         if update_data:
-             # Full update path
-             updated_worker = await self.storage.update_worker_status(worker_id, update_data, ttl)
-             if not updated_worker:
-                 return json_response({"error": "Worker not found"}, status=404)
-
-             await self.history_storage.log_worker_event(
-                 {
-                     "worker_id": worker_id,
-                     "event_type": "status_update",
-                     "worker_info_snapshot": updated_worker,
-                 },
-             )
-             return json_response(updated_worker, status=200)
-         else:
-             # Lightweight TTL-only heartbeat path
-             refreshed = await self.storage.refresh_worker_ttl(worker_id, ttl)
-             if not refreshed:
-                 return json_response({"error": "Worker not found"}, status=404)
-             return json_response({"status": "ttl_refreshed"})
-
-     async def _register_worker_handler(self, request: web.Request) -> web.Response:
-         # The worker_registration_data is attached by the auth middleware
-         # to avoid reading the request body twice.
-         worker_data = request.get("worker_registration_data")
-         if not worker_data:
-             return json_response({"error": "Worker data not found in request"}, status=500)
-
-         worker_id = worker_data.get("worker_id")
-         # This check is redundant if the middleware works, but good for safety
-         if not worker_id:
-             return json_response({"error": "Missing required field: worker_id"}, status=400)
-
-         ttl = self.config.WORKER_HEALTH_CHECK_INTERVAL_SECONDS * 2
-         await self.storage.register_worker(worker_id, worker_data, ttl)
-
-         logger.info(
-             f"Worker '{worker_id}' registered with info: {worker_data}",
+         payload = WebhookPayload(
+             event=event,
+             job_id=job_state["id"],
+             status=job_state["status"],
+             result=job_state.get("state_history"), # Or specific result
+             error=job_state.get("error_message"),
          )

-         await self.history_storage.log_worker_event(
-             {
-                 "worker_id": worker_id,
-                 "event_type": "registered",
-                 "worker_info_snapshot": worker_data,
-             },
-         )
-         return json_response({"status": "registered"}, status=200)
+         # Run in background to not block the main flow
+         create_task(self.webhook_sender.send(webhook_url, payload))

-     def run(self):
+     def run(self) -> None:
          self.setup()
          print(
              f"Starting OrchestratorEngine API server on {self.config.API_HOST}:{self.config.API_PORT} in blocking mode."