avtomatika 1.0b6__py3-none-any.whl → 1.0b8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- avtomatika/api/handlers.py +549 -0
- avtomatika/api/routes.py +118 -0
- avtomatika/app_keys.py +33 -0
- avtomatika/blueprint.py +125 -54
- avtomatika/config.py +10 -0
- avtomatika/context.py +2 -2
- avtomatika/data_types.py +4 -2
- avtomatika/dispatcher.py +9 -27
- avtomatika/engine.py +70 -601
- avtomatika/executor.py +55 -22
- avtomatika/health_checker.py +23 -5
- avtomatika/history/base.py +60 -6
- avtomatika/history/noop.py +18 -7
- avtomatika/history/postgres.py +8 -6
- avtomatika/history/sqlite.py +7 -5
- avtomatika/metrics.py +1 -1
- avtomatika/reputation.py +46 -40
- avtomatika/s3.py +323 -0
- avtomatika/scheduler.py +8 -8
- avtomatika/storage/base.py +45 -4
- avtomatika/storage/memory.py +56 -13
- avtomatika/storage/redis.py +185 -252
- avtomatika/utils/__init__.py +0 -0
- avtomatika/utils/webhook_sender.py +96 -0
- avtomatika/watcher.py +34 -38
- avtomatika/ws_manager.py +7 -6
- {avtomatika-1.0b6.dist-info → avtomatika-1.0b8.dist-info}/METADATA +91 -3
- avtomatika-1.0b8.dist-info/RECORD +46 -0
- avtomatika-1.0b6.dist-info/RECORD +0 -40
- {avtomatika-1.0b6.dist-info → avtomatika-1.0b8.dist-info}/WHEEL +0 -0
- {avtomatika-1.0b6.dist-info → avtomatika-1.0b8.dist-info}/licenses/LICENSE +0 -0
- {avtomatika-1.0b6.dist-info → avtomatika-1.0b8.dist-info}/top_level.txt +0 -0
avtomatika/executor.py
CHANGED
|
@@ -1,5 +1,4 @@
|
|
|
1
1
|
from asyncio import CancelledError, Task, create_task, sleep
|
|
2
|
-
from inspect import signature
|
|
3
2
|
from logging import getLogger
|
|
4
3
|
from time import monotonic
|
|
5
4
|
from types import SimpleNamespace
|
|
@@ -48,6 +47,7 @@ except ImportError:
|
|
|
48
47
|
inject = NoOpPropagate().inject
|
|
49
48
|
TraceContextTextMapPropagator = NoOpTraceContextTextMapPropagator() # Instantiate the class
|
|
50
49
|
|
|
50
|
+
from .app_keys import S3_SERVICE_KEY
|
|
51
51
|
from .context import ActionFactory
|
|
52
52
|
from .data_types import ClientConfig, JobContext
|
|
53
53
|
from .history.base import HistoryStorageBase
|
|
@@ -75,7 +75,7 @@ class JobExecutor:
|
|
|
75
75
|
self._running = False
|
|
76
76
|
self._processing_messages: set[str] = set()
|
|
77
77
|
|
|
78
|
-
async def _process_job(self, job_id: str, message_id: str):
|
|
78
|
+
async def _process_job(self, job_id: str, message_id: str) -> None:
|
|
79
79
|
"""The core logic for processing a single job dequeued from storage."""
|
|
80
80
|
if message_id in self._processing_messages:
|
|
81
81
|
return
|
|
@@ -144,6 +144,11 @@ class JobExecutor:
|
|
|
144
144
|
plan=client_config_dict.get("plan", "unknown"),
|
|
145
145
|
params=client_config_dict.get("params", {}),
|
|
146
146
|
)
|
|
147
|
+
|
|
148
|
+
# Get TaskFiles if S3 service is available
|
|
149
|
+
s3_service = self.engine.app.get(S3_SERVICE_KEY)
|
|
150
|
+
task_files = s3_service.get_task_files(job_id) if s3_service else None
|
|
151
|
+
|
|
147
152
|
context = JobContext(
|
|
148
153
|
job_id=job_id,
|
|
149
154
|
current_state=job_state["current_state"],
|
|
@@ -154,6 +159,7 @@ class JobExecutor:
|
|
|
154
159
|
data_stores=SimpleNamespace(**blueprint.data_stores),
|
|
155
160
|
tracing_context=tracing_context,
|
|
156
161
|
aggregation_results=job_state.get("aggregation_results"),
|
|
162
|
+
task_files=task_files,
|
|
157
163
|
)
|
|
158
164
|
|
|
159
165
|
try:
|
|
@@ -167,19 +173,24 @@ class JobExecutor:
|
|
|
167
173
|
handler = blueprint.find_handler(context.current_state, context)
|
|
168
174
|
|
|
169
175
|
# Build arguments for the handler dynamically.
|
|
170
|
-
|
|
171
|
-
params_to_inject = {}
|
|
176
|
+
param_names = blueprint.get_handler_params(handler)
|
|
177
|
+
params_to_inject: dict[str, Any] = {}
|
|
172
178
|
|
|
173
|
-
if "context" in
|
|
179
|
+
if "context" in param_names:
|
|
174
180
|
params_to_inject["context"] = context
|
|
175
|
-
if "actions" in
|
|
181
|
+
if "actions" in param_names:
|
|
176
182
|
params_to_inject["actions"] = action_factory
|
|
183
|
+
if "task_files" in param_names:
|
|
184
|
+
params_to_inject["task_files"] = task_files
|
|
177
185
|
else:
|
|
178
186
|
# New injection logic with prioritized lookup.
|
|
179
187
|
context_as_dict = context._asdict()
|
|
180
|
-
for param_name in
|
|
188
|
+
for param_name in param_names:
|
|
189
|
+
# Direct injection of task_files
|
|
190
|
+
if param_name == "task_files":
|
|
191
|
+
params_to_inject[param_name] = task_files
|
|
181
192
|
# Look in JobContext fields first.
|
|
182
|
-
|
|
193
|
+
elif param_name in context_as_dict:
|
|
183
194
|
params_to_inject[param_name] = context_as_dict[param_name]
|
|
184
195
|
# Then look in state_history (data from previous steps/workers).
|
|
185
196
|
elif param_name in context.state_history:
|
|
@@ -232,7 +243,7 @@ class JobExecutor:
|
|
|
232
243
|
job_state: dict[str, Any],
|
|
233
244
|
next_state: str,
|
|
234
245
|
duration_ms: int,
|
|
235
|
-
):
|
|
246
|
+
) -> None:
|
|
236
247
|
job_id = job_state["id"]
|
|
237
248
|
previous_state = job_state["current_state"]
|
|
238
249
|
logger.info(f"Job {job_id} transitioning from {previous_state} to {next_state}")
|
|
@@ -259,14 +270,29 @@ class JobExecutor:
|
|
|
259
270
|
await self.storage.enqueue_job(job_id)
|
|
260
271
|
else:
|
|
261
272
|
logger.info(f"Job {job_id} reached terminal state {next_state}")
|
|
273
|
+
|
|
274
|
+
# Clean up S3 files if service is available
|
|
275
|
+
s3_service = self.engine.app.get(S3_SERVICE_KEY)
|
|
276
|
+
if s3_service:
|
|
277
|
+
task_files = s3_service.get_task_files(job_id)
|
|
278
|
+
if task_files:
|
|
279
|
+
# Run cleanup in background to not block response
|
|
280
|
+
create_task(task_files.cleanup())
|
|
281
|
+
|
|
262
282
|
await self._check_and_resume_parent(job_state)
|
|
283
|
+
# Send webhook for finished/failed jobs
|
|
284
|
+
event_type = "job_finished" if next_state == "finished" else "job_failed"
|
|
285
|
+
# Since _check_and_resume_parent is for sub-jobs, we only send webhook if it's a top-level job
|
|
286
|
+
# or if the user explicitly requested it for sub-jobs (by providing webhook_url).
|
|
287
|
+
# The current logic stores webhook_url in job_state, so we just check it.
|
|
288
|
+
await self.engine.send_job_webhook(job_state, event_type)
|
|
263
289
|
|
|
264
290
|
async def _handle_dispatch(
|
|
265
291
|
self,
|
|
266
292
|
job_state: dict[str, Any],
|
|
267
293
|
task_info: dict[str, Any],
|
|
268
294
|
duration_ms: int,
|
|
269
|
-
):
|
|
295
|
+
) -> None:
|
|
270
296
|
job_id = job_state["id"]
|
|
271
297
|
current_state = job_state["current_state"]
|
|
272
298
|
|
|
@@ -302,7 +328,6 @@ class JobExecutor:
|
|
|
302
328
|
await self.storage.save_job_state(job_id, job_state)
|
|
303
329
|
await self.storage.add_job_to_watch(job_id, timeout_at)
|
|
304
330
|
|
|
305
|
-
# Now, dispatch the task
|
|
306
331
|
await self.dispatcher.dispatch(job_state, task_info)
|
|
307
332
|
|
|
308
333
|
async def _handle_run_blueprint(
|
|
@@ -310,7 +335,7 @@ class JobExecutor:
|
|
|
310
335
|
parent_job_state: dict[str, Any],
|
|
311
336
|
sub_blueprint_info: dict[str, Any],
|
|
312
337
|
duration_ms: int,
|
|
313
|
-
):
|
|
338
|
+
) -> None:
|
|
314
339
|
parent_job_id = parent_job_state["id"]
|
|
315
340
|
child_job_id = str(uuid4())
|
|
316
341
|
|
|
@@ -350,7 +375,7 @@ class JobExecutor:
|
|
|
350
375
|
job_state: dict[str, Any],
|
|
351
376
|
parallel_info: dict[str, Any],
|
|
352
377
|
duration_ms: int,
|
|
353
|
-
):
|
|
378
|
+
) -> None:
|
|
354
379
|
job_id = job_state["id"]
|
|
355
380
|
tasks_to_dispatch = parallel_info["tasks"]
|
|
356
381
|
aggregate_into = parallel_info["aggregate_into"]
|
|
@@ -398,7 +423,7 @@ class JobExecutor:
|
|
|
398
423
|
job_state: dict[str, Any],
|
|
399
424
|
error: Exception,
|
|
400
425
|
duration_ms: int,
|
|
401
|
-
):
|
|
426
|
+
) -> None:
|
|
402
427
|
"""Handles failures that occur *during the execution of a handler*.
|
|
403
428
|
|
|
404
429
|
This is different from a task failure reported by a worker. This logic
|
|
@@ -447,13 +472,14 @@ class JobExecutor:
|
|
|
447
472
|
await self.storage.quarantine_job(job_id)
|
|
448
473
|
# If this quarantined job was a sub-job, we must now resume its parent.
|
|
449
474
|
await self._check_and_resume_parent(job_state)
|
|
475
|
+
await self.engine.send_job_webhook(job_state, "job_quarantined")
|
|
450
476
|
from . import metrics
|
|
451
477
|
|
|
452
478
|
metrics.jobs_failed_total.inc(
|
|
453
479
|
{metrics.LABEL_BLUEPRINT: job_state.get("blueprint_name", "unknown")},
|
|
454
480
|
)
|
|
455
481
|
|
|
456
|
-
async def _check_and_resume_parent(self, child_job_state: dict[str, Any]):
|
|
482
|
+
async def _check_and_resume_parent(self, child_job_state: dict[str, Any]) -> None:
|
|
457
483
|
"""Checks if a completed job was a sub-job. If so, it resumes the parent
|
|
458
484
|
job, passing the success/failure outcome of the child.
|
|
459
485
|
"""
|
|
@@ -493,7 +519,7 @@ class JobExecutor:
|
|
|
493
519
|
await self.storage.enqueue_job(parent_job_id)
|
|
494
520
|
|
|
495
521
|
@staticmethod
|
|
496
|
-
def _handle_task_completion(task: Task):
|
|
522
|
+
def _handle_task_completion(task: Task) -> None:
|
|
497
523
|
"""Callback to handle completion of a job processing task."""
|
|
498
524
|
try:
|
|
499
525
|
# This will re-raise any exception caught in the task
|
|
@@ -505,7 +531,7 @@ class JobExecutor:
|
|
|
505
531
|
# Log any other exceptions that occurred in the task.
|
|
506
532
|
logger.exception("Unhandled exception in job processing task")
|
|
507
533
|
|
|
508
|
-
async def run(self):
|
|
534
|
+
async def run(self) -> None:
|
|
509
535
|
import asyncio
|
|
510
536
|
|
|
511
537
|
logger.info("JobExecutor started.")
|
|
@@ -517,7 +543,10 @@ class JobExecutor:
|
|
|
517
543
|
# Wait for an available slot before fetching a new job
|
|
518
544
|
await semaphore.acquire()
|
|
519
545
|
|
|
520
|
-
|
|
546
|
+
# Block for a configured time waiting for a job
|
|
547
|
+
block_time = self.engine.config.REDIS_STREAM_BLOCK_MS
|
|
548
|
+
result = await self.storage.dequeue_job(block=block_time if block_time > 0 else None)
|
|
549
|
+
|
|
521
550
|
if result:
|
|
522
551
|
job_id, message_id = result
|
|
523
552
|
task = create_task(self._process_job(job_id, message_id))
|
|
@@ -525,16 +554,20 @@ class JobExecutor:
|
|
|
525
554
|
# Release the semaphore slot when the task is done
|
|
526
555
|
task.add_done_callback(lambda _: semaphore.release())
|
|
527
556
|
else:
|
|
528
|
-
#
|
|
557
|
+
# Timeout reached, release slot and loop again
|
|
529
558
|
semaphore.release()
|
|
530
|
-
# Prevent busy loop if
|
|
531
|
-
|
|
559
|
+
# Prevent busy loop if blocking is disabled (e.g. in tests) or failed
|
|
560
|
+
if block_time <= 0:
|
|
561
|
+
await sleep(0.1)
|
|
562
|
+
|
|
532
563
|
except CancelledError:
|
|
533
564
|
break
|
|
534
565
|
except Exception:
|
|
535
566
|
logger.exception("Error in JobExecutor main loop.")
|
|
567
|
+
# If an error occurred (e.g. Redis connection lost), sleep briefly to avoid log spam
|
|
568
|
+
semaphore.release()
|
|
536
569
|
await sleep(1)
|
|
537
570
|
logger.info("JobExecutor stopped.")
|
|
538
571
|
|
|
539
|
-
def stop(self):
|
|
572
|
+
def stop(self) -> None:
|
|
540
573
|
self._running = False
|
avtomatika/health_checker.py
CHANGED
|
@@ -20,19 +20,37 @@ logger = getLogger(__name__)
|
|
|
20
20
|
|
|
21
21
|
|
|
22
22
|
class HealthChecker:
|
|
23
|
-
def __init__(self, engine: "OrchestratorEngine"):
|
|
23
|
+
def __init__(self, engine: "OrchestratorEngine", interval_seconds: int = 600):
|
|
24
|
+
self.engine = engine
|
|
25
|
+
self.storage = engine.storage
|
|
26
|
+
self.interval_seconds = interval_seconds
|
|
24
27
|
self._running = False
|
|
28
|
+
from uuid import uuid4
|
|
29
|
+
|
|
30
|
+
self._instance_id = str(uuid4())
|
|
25
31
|
|
|
26
32
|
async def run(self):
|
|
27
|
-
logger.info("HealthChecker
|
|
33
|
+
logger.info(f"HealthChecker started (Active Index Cleanup, Instance ID: {self._instance_id}).")
|
|
28
34
|
self._running = True
|
|
29
35
|
while self._running:
|
|
30
36
|
try:
|
|
31
|
-
#
|
|
32
|
-
|
|
33
|
-
|
|
37
|
+
# Use distributed lock to ensure only one instance cleans up
|
|
38
|
+
if await self.storage.acquire_lock(
|
|
39
|
+
"global_health_check_lock", self._instance_id, self.interval_seconds - 5
|
|
40
|
+
):
|
|
41
|
+
try:
|
|
42
|
+
await self.storage.cleanup_expired_workers()
|
|
43
|
+
finally:
|
|
44
|
+
# We don't release the lock immediately to prevent other instances from
|
|
45
|
+
# running the same task if the interval is small.
|
|
46
|
+
pass
|
|
47
|
+
|
|
48
|
+
await sleep(self.interval_seconds)
|
|
34
49
|
except CancelledError:
|
|
35
50
|
break
|
|
51
|
+
except Exception:
|
|
52
|
+
logger.exception("Error in HealthChecker main loop.")
|
|
53
|
+
await sleep(60)
|
|
36
54
|
logger.info("HealthChecker stopped.")
|
|
37
55
|
|
|
38
56
|
def stop(self):
|
avtomatika/history/base.py
CHANGED
|
@@ -1,25 +1,79 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import contextlib
|
|
1
3
|
from abc import ABC, abstractmethod
|
|
4
|
+
from logging import getLogger
|
|
2
5
|
from typing import Any
|
|
3
6
|
|
|
7
|
+
logger = getLogger(__name__)
|
|
8
|
+
|
|
4
9
|
|
|
5
10
|
class HistoryStorageBase(ABC):
|
|
6
11
|
"""Abstract base class for a history store.
|
|
7
|
-
|
|
12
|
+
Implements buffered asynchronous logging to avoid blocking the main loop.
|
|
8
13
|
"""
|
|
9
14
|
|
|
15
|
+
def __init__(self):
|
|
16
|
+
self._queue: asyncio.Queue[tuple[str, dict[str, Any]]] = asyncio.Queue(maxsize=5000)
|
|
17
|
+
self._worker_task: asyncio.Task | None = None
|
|
18
|
+
|
|
19
|
+
async def start(self) -> None:
|
|
20
|
+
"""Starts the background worker for writing logs."""
|
|
21
|
+
if not self._worker_task:
|
|
22
|
+
self._worker_task = asyncio.create_task(self._worker())
|
|
23
|
+
logger.info("HistoryStorage background worker started.")
|
|
24
|
+
|
|
25
|
+
async def close(self) -> None:
|
|
26
|
+
"""Stops the background worker and closes resources."""
|
|
27
|
+
if self._worker_task:
|
|
28
|
+
self._worker_task.cancel()
|
|
29
|
+
with contextlib.suppress(asyncio.CancelledError):
|
|
30
|
+
await self._worker_task
|
|
31
|
+
self._worker_task = None
|
|
32
|
+
logger.info("HistoryStorage background worker stopped.")
|
|
33
|
+
|
|
10
34
|
@abstractmethod
|
|
11
|
-
async def initialize(self):
|
|
35
|
+
async def initialize(self) -> None:
|
|
12
36
|
"""Performs initialization, e.g., creating tables in the DB."""
|
|
13
37
|
raise NotImplementedError
|
|
14
38
|
|
|
39
|
+
async def log_job_event(self, event_data: dict[str, Any]) -> None:
|
|
40
|
+
"""Queues a job event for logging."""
|
|
41
|
+
try:
|
|
42
|
+
self._queue.put_nowait(("job", event_data))
|
|
43
|
+
except asyncio.QueueFull:
|
|
44
|
+
logger.warning("History queue full! Dropping job event.")
|
|
45
|
+
|
|
46
|
+
async def log_worker_event(self, event_data: dict[str, Any]) -> None:
|
|
47
|
+
"""Queues a worker event for logging."""
|
|
48
|
+
try:
|
|
49
|
+
self._queue.put_nowait(("worker", event_data))
|
|
50
|
+
except asyncio.QueueFull:
|
|
51
|
+
logger.warning("History queue full! Dropping worker event.")
|
|
52
|
+
|
|
53
|
+
async def _worker(self) -> None:
|
|
54
|
+
while True:
|
|
55
|
+
try:
|
|
56
|
+
kind, data = await self._queue.get()
|
|
57
|
+
try:
|
|
58
|
+
if kind == "job":
|
|
59
|
+
await self._persist_job_event(data)
|
|
60
|
+
elif kind == "worker":
|
|
61
|
+
await self._persist_worker_event(data)
|
|
62
|
+
except Exception as e:
|
|
63
|
+
logger.error(f"Error persisting history event: {e}")
|
|
64
|
+
finally:
|
|
65
|
+
self._queue.task_done()
|
|
66
|
+
except asyncio.CancelledError:
|
|
67
|
+
break
|
|
68
|
+
|
|
15
69
|
@abstractmethod
|
|
16
|
-
async def
|
|
17
|
-
"""
|
|
70
|
+
async def _persist_job_event(self, event_data: dict[str, Any]) -> None:
|
|
71
|
+
"""Actual implementation of writing a job event to storage."""
|
|
18
72
|
raise NotImplementedError
|
|
19
73
|
|
|
20
74
|
@abstractmethod
|
|
21
|
-
async def
|
|
22
|
-
"""
|
|
75
|
+
async def _persist_worker_event(self, event_data: dict[str, Any]) -> None:
|
|
76
|
+
"""Actual implementation of writing a worker event to storage."""
|
|
23
77
|
raise NotImplementedError
|
|
24
78
|
|
|
25
79
|
@abstractmethod
|
avtomatika/history/noop.py
CHANGED
|
@@ -8,20 +8,31 @@ class NoOpHistoryStorage(HistoryStorageBase):
|
|
|
8
8
|
Used when history storage is not configured.
|
|
9
9
|
"""
|
|
10
10
|
|
|
11
|
-
|
|
12
|
-
|
|
11
|
+
def __init__(self):
|
|
12
|
+
super().__init__()
|
|
13
|
+
|
|
14
|
+
async def start(self) -> None:
|
|
15
|
+
pass
|
|
16
|
+
|
|
17
|
+
async def close(self) -> None:
|
|
18
|
+
pass
|
|
19
|
+
|
|
20
|
+
async def initialize(self) -> None:
|
|
21
|
+
pass
|
|
22
|
+
|
|
23
|
+
async def log_job_event(self, event_data: dict[str, Any]) -> None:
|
|
24
|
+
pass
|
|
25
|
+
|
|
26
|
+
async def log_worker_event(self, event_data: dict[str, Any]) -> None:
|
|
13
27
|
pass
|
|
14
28
|
|
|
15
|
-
async def
|
|
16
|
-
# Do nothing
|
|
29
|
+
async def _persist_job_event(self, event_data: dict[str, Any]) -> None:
|
|
17
30
|
pass
|
|
18
31
|
|
|
19
|
-
async def
|
|
20
|
-
# Do nothing
|
|
32
|
+
async def _persist_worker_event(self, event_data: dict[str, Any]) -> None:
|
|
21
33
|
pass
|
|
22
34
|
|
|
23
35
|
async def get_job_history(self, job_id: str) -> list[dict[str, Any]]:
|
|
24
|
-
# Always return an empty list
|
|
25
36
|
return []
|
|
26
37
|
|
|
27
38
|
async def get_jobs(self, limit: int = 100, offset: int = 0) -> list[dict[str, Any]]:
|
avtomatika/history/postgres.py
CHANGED
|
@@ -46,19 +46,20 @@ class PostgresHistoryStorage(HistoryStorageBase, ABC):
|
|
|
46
46
|
"""Implementation of the history store based on asyncpg for PostgreSQL."""
|
|
47
47
|
|
|
48
48
|
def __init__(self, dsn: str, tz_name: str = "UTC"):
|
|
49
|
+
super().__init__()
|
|
49
50
|
self._dsn = dsn
|
|
50
51
|
self._pool: Pool | None = None
|
|
51
52
|
self.tz_name = tz_name
|
|
52
53
|
self.tz = ZoneInfo(tz_name)
|
|
53
54
|
|
|
54
|
-
async def _setup_connection(self, conn: Connection):
|
|
55
|
+
async def _setup_connection(self, conn: Connection) -> None:
|
|
55
56
|
"""Configures the connection session with the correct timezone."""
|
|
56
57
|
try:
|
|
57
58
|
await conn.execute(f"SET TIME ZONE '{self.tz_name}'")
|
|
58
59
|
except PostgresError as e:
|
|
59
60
|
logger.error(f"Failed to set timezone '{self.tz_name}' for PG connection: {e}")
|
|
60
61
|
|
|
61
|
-
async def initialize(self):
|
|
62
|
+
async def initialize(self) -> None:
|
|
62
63
|
"""Initializes the connection pool to PostgreSQL and creates tables."""
|
|
63
64
|
try:
|
|
64
65
|
# We use init parameter to configure each new connection in the pool
|
|
@@ -75,13 +76,14 @@ class PostgresHistoryStorage(HistoryStorageBase, ABC):
|
|
|
75
76
|
logger.error(f"Failed to initialize PostgreSQL history storage: {e}")
|
|
76
77
|
raise
|
|
77
78
|
|
|
78
|
-
async def close(self):
|
|
79
|
-
"""Closes the connection pool."""
|
|
79
|
+
async def close(self) -> None:
|
|
80
|
+
"""Closes the connection pool and background worker."""
|
|
81
|
+
await super().close()
|
|
80
82
|
if self._pool:
|
|
81
83
|
await self._pool.close()
|
|
82
84
|
logger.info("PostgreSQL history storage connection pool closed.")
|
|
83
85
|
|
|
84
|
-
async def
|
|
86
|
+
async def _persist_job_event(self, event_data: dict[str, Any]) -> None:
|
|
85
87
|
"""Logs a job lifecycle event to PostgreSQL."""
|
|
86
88
|
if not self._pool:
|
|
87
89
|
raise RuntimeError("History storage is not initialized.")
|
|
@@ -117,7 +119,7 @@ class PostgresHistoryStorage(HistoryStorageBase, ABC):
|
|
|
117
119
|
except PostgresError as e:
|
|
118
120
|
logger.error(f"Failed to log job event to PostgreSQL: {e}")
|
|
119
121
|
|
|
120
|
-
async def
|
|
122
|
+
async def _persist_worker_event(self, event_data: dict[str, Any]) -> None:
|
|
121
123
|
"""Logs a worker lifecycle event to PostgreSQL."""
|
|
122
124
|
if not self._pool:
|
|
123
125
|
raise RuntimeError("History storage is not initialized.")
|
avtomatika/history/sqlite.py
CHANGED
|
@@ -49,11 +49,12 @@ class SQLiteHistoryStorage(HistoryStorageBase):
|
|
|
49
49
|
"""
|
|
50
50
|
|
|
51
51
|
def __init__(self, db_path: str, tz_name: str = "UTC"):
|
|
52
|
+
super().__init__()
|
|
52
53
|
self._db_path = db_path
|
|
53
54
|
self._conn: Connection | None = None
|
|
54
55
|
self.tz = ZoneInfo(tz_name)
|
|
55
56
|
|
|
56
|
-
async def initialize(self):
|
|
57
|
+
async def initialize(self) -> None:
|
|
57
58
|
"""Initializes the database connection and creates tables if they don't exist."""
|
|
58
59
|
try:
|
|
59
60
|
self._conn = await connect(self._db_path)
|
|
@@ -68,8 +69,9 @@ class SQLiteHistoryStorage(HistoryStorageBase):
|
|
|
68
69
|
logger.error(f"Failed to initialize SQLite history storage: {e}")
|
|
69
70
|
raise
|
|
70
71
|
|
|
71
|
-
async def close(self):
|
|
72
|
-
"""Closes the database connection."""
|
|
72
|
+
async def close(self) -> None:
|
|
73
|
+
"""Closes the database connection and background worker."""
|
|
74
|
+
await super().close()
|
|
73
75
|
if self._conn:
|
|
74
76
|
await self._conn.close()
|
|
75
77
|
logger.info("SQLite history storage connection closed.")
|
|
@@ -91,7 +93,7 @@ class SQLiteHistoryStorage(HistoryStorageBase):
|
|
|
91
93
|
|
|
92
94
|
return item
|
|
93
95
|
|
|
94
|
-
async def
|
|
96
|
+
async def _persist_job_event(self, event_data: dict[str, Any]) -> None:
|
|
95
97
|
"""Logs a job lifecycle event to the job_history table."""
|
|
96
98
|
if not self._conn:
|
|
97
99
|
raise RuntimeError("History storage is not initialized.")
|
|
@@ -128,7 +130,7 @@ class SQLiteHistoryStorage(HistoryStorageBase):
|
|
|
128
130
|
except Error as e:
|
|
129
131
|
logger.error(f"Failed to log job event: {e}")
|
|
130
132
|
|
|
131
|
-
async def
|
|
133
|
+
async def _persist_worker_event(self, event_data: dict[str, Any]) -> None:
|
|
132
134
|
"""Logs a worker lifecycle event to the worker_history table."""
|
|
133
135
|
if not self._conn:
|
|
134
136
|
raise RuntimeError("History storage is not initialized.")
|
avtomatika/metrics.py
CHANGED
avtomatika/reputation.py
CHANGED
|
@@ -52,48 +52,54 @@ class ReputationCalculator:
|
|
|
52
52
|
async def calculate_all_reputations(self):
|
|
53
53
|
"""Calculates and updates the reputation for all active workers."""
|
|
54
54
|
logger.info("Starting reputation calculation for all workers...")
|
|
55
|
-
|
|
56
|
-
|
|
55
|
+
|
|
56
|
+
# Get only IDs of active workers to avoid O(N) scan of all data
|
|
57
|
+
worker_ids = await self.storage.get_active_worker_ids()
|
|
58
|
+
|
|
59
|
+
if not worker_ids:
|
|
57
60
|
logger.info("No active workers found for reputation calculation.")
|
|
58
61
|
return
|
|
59
62
|
|
|
60
|
-
for
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
63
|
+
logger.info(f"Recalculating reputation for {len(worker_ids)} workers.")
|
|
64
|
+
|
|
65
|
+
for worker_id in worker_ids:
|
|
66
|
+
if not self._running:
|
|
67
|
+
break
|
|
68
|
+
|
|
69
|
+
try:
|
|
70
|
+
history = await self.history_storage.get_worker_history(
|
|
71
|
+
worker_id,
|
|
72
|
+
since_days=REPUTATION_HISTORY_DAYS,
|
|
73
|
+
)
|
|
74
|
+
|
|
75
|
+
# Count only task completion events
|
|
76
|
+
task_finished_events = [event for event in history if event.get("event_type") == "task_finished"]
|
|
77
|
+
|
|
78
|
+
if not task_finished_events:
|
|
79
|
+
# If there is no history, skip to next worker
|
|
80
|
+
continue
|
|
81
|
+
|
|
82
|
+
successful_tasks = 0
|
|
83
|
+
for event in task_finished_events:
|
|
84
|
+
# Extract the result from the snapshot
|
|
85
|
+
snapshot = event.get("context_snapshot", {})
|
|
86
|
+
result = snapshot.get("result", {})
|
|
87
|
+
if result.get("status") == "success":
|
|
88
|
+
successful_tasks += 1
|
|
89
|
+
|
|
90
|
+
total_tasks = len(task_finished_events)
|
|
91
|
+
new_reputation = successful_tasks / total_tasks if total_tasks > 0 else 1.0
|
|
92
|
+
new_reputation = round(new_reputation, 4)
|
|
93
|
+
|
|
94
|
+
await self.storage.update_worker_data(
|
|
95
|
+
worker_id,
|
|
96
|
+
{"reputation": new_reputation},
|
|
97
|
+
)
|
|
98
|
+
|
|
99
|
+
# Throttling: Small sleep to prevent DB spikes
|
|
100
|
+
await sleep(0.1)
|
|
101
|
+
|
|
102
|
+
except Exception as e:
|
|
103
|
+
logger.error(f"Failed to calculate reputation for worker {worker_id}: {e}")
|
|
98
104
|
|
|
99
105
|
logger.info("Reputation calculation finished.")
|