avtomatika-1.0b8-py3-none-any.whl → avtomatika-1.0b10-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- avtomatika/api/handlers.py +5 -257
- avtomatika/api/routes.py +42 -63
- avtomatika/api.html +1 -1
- avtomatika/app_keys.py +1 -0
- avtomatika/blueprint.py +3 -2
- avtomatika/config.py +8 -0
- avtomatika/constants.py +75 -25
- avtomatika/data_types.py +2 -22
- avtomatika/dispatcher.py +4 -0
- avtomatika/engine.py +119 -7
- avtomatika/executor.py +19 -19
- avtomatika/logging_config.py +16 -7
- avtomatika/s3.py +96 -40
- avtomatika/scheduler_config_loader.py +5 -2
- avtomatika/security.py +56 -74
- avtomatika/services/__init__.py +0 -0
- avtomatika/services/worker_service.py +267 -0
- avtomatika/storage/base.py +10 -0
- avtomatika/storage/memory.py +15 -4
- avtomatika/storage/redis.py +42 -11
- avtomatika/telemetry.py +8 -7
- avtomatika/utils/webhook_sender.py +3 -3
- avtomatika/watcher.py +4 -2
- avtomatika/ws_manager.py +16 -8
- {avtomatika-1.0b8.dist-info → avtomatika-1.0b10.dist-info}/METADATA +47 -15
- avtomatika-1.0b10.dist-info/RECORD +48 -0
- {avtomatika-1.0b8.dist-info → avtomatika-1.0b10.dist-info}/WHEEL +1 -1
- avtomatika-1.0b8.dist-info/RECORD +0 -46
- {avtomatika-1.0b8.dist-info → avtomatika-1.0b10.dist-info}/licenses/LICENSE +0 -0
- {avtomatika-1.0b8.dist-info → avtomatika-1.0b10.dist-info}/top_level.txt +0 -0
avtomatika/security.py
CHANGED

```diff
@@ -10,6 +10,62 @@ from .storage.base import StorageBackend
 
 Handler = Callable[[web.Request], Awaitable[web.Response]]
 
 
+async def verify_worker_auth(
+    storage: StorageBackend,
+    config: Config,
+    token: str | None,
+    cert_identity: str | None,
+    worker_id_hint: str | None,
+) -> str:
+    """
+    Verifies worker authentication using token or mTLS.
+    Returns authenticated worker_id.
+    Raises ValueError (400), PermissionError (401/403) on failure.
+    """
+    # mTLS Check
+    if cert_identity:
+        if worker_id_hint and cert_identity != worker_id_hint:
+            raise PermissionError(
+                f"Unauthorized: Certificate CN '{cert_identity}' does not match worker_id '{worker_id_hint}'"
+            )
+        return cert_identity
+
+    # Token Check
+    if not token:
+        raise PermissionError(f"Missing {AUTH_HEADER_WORKER} header or client certificate")
+
+    hashed_provided_token = sha256(token.encode()).hexdigest()
+
+    # STS Access Token
+    token_worker_id = await storage.verify_worker_access_token(hashed_provided_token)
+    if token_worker_id:
+        if worker_id_hint and token_worker_id != worker_id_hint:
+            raise PermissionError(
+                f"Unauthorized: Access Token belongs to '{token_worker_id}', but request is for '{worker_id_hint}'"
+            )
+        return token_worker_id
+
+    # Individual/Global Token
+    if not worker_id_hint:
+        if config.GLOBAL_WORKER_TOKEN and token == config.GLOBAL_WORKER_TOKEN:
+            return "unknown_authenticated_by_global_token"
+
+        raise PermissionError("Unauthorized: Invalid token or missing worker_id hint")
+
+    # Individual Token for specific worker
+    expected_token_hash = await storage.get_worker_token(worker_id_hint)
+    if expected_token_hash:
+        if hashed_provided_token == expected_token_hash:
+            return worker_id_hint
+        raise PermissionError("Unauthorized: Invalid individual worker token")
+
+    # Global Token Fallback
+    if config.GLOBAL_WORKER_TOKEN and token == config.GLOBAL_WORKER_TOKEN:
+        return worker_id_hint
+
+    raise PermissionError("Unauthorized: No valid token found")
+
+
 def client_auth_middleware_factory(
     storage: StorageBackend,
 ) -> Any:
@@ -38,77 +94,3 @@ def client_auth_middleware_factory(
         return await handler(request)
 
     return middleware
-
-
-def worker_auth_middleware_factory(
-    storage: StorageBackend,
-    config: Config,
-) -> Any:
-    """
-    Middleware factory for worker authentication.
-    It supports both individual tokens and a global fallback token for backward compatibility.
-    It also attaches the authenticated worker_id to the request.
-    """
-
-    @web.middleware
-    async def middleware(request: web.Request, handler: Handler) -> web.Response:
-        provided_token = request.headers.get(AUTH_HEADER_WORKER)
-        if not provided_token:
-            return web.json_response(
-                {"error": f"Missing {AUTH_HEADER_WORKER} header"},
-                status=401,
-            )
-
-        worker_id = request.match_info.get("worker_id")
-        data = None
-
-        # For specific endpoints, worker_id is in the body.
-        # We need to read the body here, which can be tricky as it's a stream.
-        # We clone the request to allow the handler to read the body again.
-        if not worker_id and (request.path.endswith("/register") or request.path.endswith("/tasks/result")):
-            try:
-                cloned_request = request.clone()
-                data = await cloned_request.json()
-                worker_id = data.get("worker_id")
-                # Attach the parsed data to the request so the handler doesn't need to re-parse
-                if request.path.endswith("/register"):
-                    request["worker_registration_data"] = data
-            except Exception:
-                return web.json_response({"error": "Invalid JSON body"}, status=400)
-
-        # If no worker_id could be determined from path or body, we can only validate against the global token.
-        if not worker_id:
-            if provided_token == config.GLOBAL_WORKER_TOKEN:
-                # We don't know the worker_id, so we can't attach it.
-                return await handler(request)
-            else:
-                return web.json_response(
-                    {"error": "Unauthorized: Invalid token or missing worker_id"},
-                    status=401,
-                )
-
-        # --- Individual Token Check ---
-        expected_token_hash = await storage.get_worker_token(worker_id)
-        if expected_token_hash:
-            hashed_provided_token = sha256(provided_token.encode()).hexdigest()
-            if hashed_provided_token == expected_token_hash:
-                request["worker_id"] = worker_id  # Attach authenticated worker_id
-                return await handler(request)
-            else:
-                # If an individual token exists, we do not fall back to the global token.
-                return web.json_response(
-                    {"error": "Unauthorized: Invalid individual worker token"},
-                    status=401,
-                )
-
-        # --- Global Token Fallback ---
-        if config.GLOBAL_WORKER_TOKEN and provided_token == config.GLOBAL_WORKER_TOKEN:
-            request["worker_id"] = worker_id  # Attach authenticated worker_id
-            return await handler(request)
-
-        return web.json_response(
-            {"error": "Unauthorized: No valid token found"},
-            status=401,
-        )
-
-    return middleware
```
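The refactor inverts error handling: where `worker_auth_middleware_factory` answered with `web.json_response` errors itself, `verify_worker_auth` raises and leaves the HTTP mapping to its caller (per its docstring, `ValueError` maps to 400 and `PermissionError` to 401/403). A minimal sketch of how a caller might consume it; the handler name, app-key wiring, and `cert_identity` extraction are illustrative assumptions, not the package's actual route code:

```python
from aiohttp import web

# Assumed import paths: verify_worker_auth is defined in avtomatika/security.py
# as shown above; AUTH_HEADER_WORKER is presumed to live in the constants module.
from avtomatika.security import verify_worker_auth
from avtomatika.constants import AUTH_HEADER_WORKER

async def heartbeat_handler(request: web.Request) -> web.Response:
    storage = request.app["storage"]  # hypothetical app wiring
    config = request.app["config"]
    try:
        worker_id = await verify_worker_auth(
            storage,
            config,
            token=request.headers.get(AUTH_HEADER_WORKER),
            cert_identity=request.get("cert_identity"),  # set by a TLS layer, assumed
            worker_id_hint=request.match_info.get("worker_id"),
        )
    except ValueError as e:
        # Docstring contract: bad input -> 400
        return web.json_response({"error": str(e)}, status=400)
    except PermissionError as e:
        # Docstring contract: auth failure -> 401/403
        return web.json_response({"error": str(e)}, status=401)
    return web.json_response({"worker_id": worker_id})
```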
avtomatika/services/__init__.py
ADDED
File without changes (new empty file)

avtomatika/services/worker_service.py
ADDED

```diff
@@ -0,0 +1,267 @@
+from hashlib import sha256
+from logging import getLogger
+from secrets import token_urlsafe
+from time import monotonic
+from typing import Any, Optional
+
+from rxon.models import TokenResponse
+from rxon.validators import validate_identifier
+
+from ..app_keys import S3_SERVICE_KEY
+from ..config import Config
+from ..constants import (
+    ERROR_CODE_DEPENDENCY,
+    ERROR_CODE_INTEGRITY_MISMATCH,
+    ERROR_CODE_INVALID_INPUT,
+    ERROR_CODE_PERMANENT,
+    ERROR_CODE_SECURITY,
+    ERROR_CODE_TRANSIENT,
+    JOB_STATUS_CANCELLED,
+    JOB_STATUS_FAILED,
+    JOB_STATUS_QUARANTINED,
+    JOB_STATUS_RUNNING,
+    JOB_STATUS_WAITING_FOR_PARALLEL,
+    TASK_STATUS_CANCELLED,
+    TASK_STATUS_FAILURE,
+    TASK_STATUS_SUCCESS,
+)
+from ..history.base import HistoryStorageBase
+from ..storage.base import StorageBackend
+
+logger = getLogger(__name__)
+
+
+class WorkerService:
+    def __init__(
+        self,
+        storage: StorageBackend,
+        history_storage: HistoryStorageBase,
+        config: Config,
+        engine: Any,
+    ):
+        self.storage = storage
+        self.history_storage = history_storage
+        self.config = config
+        self.engine = engine
+
+    async def register_worker(self, worker_data: dict[str, Any]) -> None:
+        """
+        Registers a new worker.
+        :param worker_data: Raw dictionary from request (to be validated/converted to Model later)
+        """
+        worker_id = worker_data.get("worker_id")
+        if not worker_id:
+            raise ValueError("Missing required field: worker_id")
+
+        validate_identifier(worker_id, "worker_id")
+
+        # S3 Consistency Check
+        s3_service = self.engine.app.get(S3_SERVICE_KEY)
+        if s3_service:
+            orchestrator_s3_hash = s3_service.get_config_hash()
+            worker_capabilities = worker_data.get("capabilities", {})
+            worker_s3_hash = worker_capabilities.get("s3_config_hash")
+
+            if orchestrator_s3_hash and worker_s3_hash and orchestrator_s3_hash != worker_s3_hash:
+                logger.warning(
+                    f"Worker '{worker_id}' has a different S3 configuration hash! "
+                    f"Orchestrator: {orchestrator_s3_hash}, Worker: {worker_s3_hash}. "
+                    "This may lead to 'split-brain' storage issues."
+                )
+
+        ttl = self.config.WORKER_HEALTH_CHECK_INTERVAL_SECONDS * 2
+        await self.storage.register_worker(worker_id, worker_data, ttl)
+
+        logger.info(f"Worker '{worker_id}' registered with info: {worker_data}")
+
+        await self.history_storage.log_worker_event(
+            {
+                "worker_id": worker_id,
+                "event_type": "registered",
+                "worker_info_snapshot": worker_data,
+            }
+        )
+
+    async def get_next_task(self, worker_id: str) -> Optional[dict[str, Any]]:
+        """
+        Retrieves the next task for a worker using long-polling configuration.
+        """
+        logger.debug(f"Worker {worker_id} is requesting a new task.")
+        return await self.storage.dequeue_task_for_worker(worker_id, self.config.WORKER_POLL_TIMEOUT_SECONDS)
+
+    async def process_task_result(self, result_payload: dict[str, Any], authenticated_worker_id: str) -> str:
+        """
+        Processes a task result submitted by a worker.
+        Returns a status string constant.
+        """
+        payload_worker_id = result_payload.get("worker_id")
+
+        if payload_worker_id and payload_worker_id != authenticated_worker_id:
+            raise PermissionError(
+                f"Forbidden: Authenticated worker '{authenticated_worker_id}' "
+                f"cannot submit results for another worker '{payload_worker_id}'."
+            )
+
+        job_id = result_payload.get("job_id")
+        task_id = result_payload.get("task_id")
+        result_data = result_payload.get("result", {})
+
+        if not job_id or not task_id:
+            raise ValueError("job_id and task_id are required")
+
+        job_state = await self.storage.get_job_state(job_id)
+        if not job_state:
+            raise LookupError("Job not found")
+
+        if job_state.get("status") == JOB_STATUS_WAITING_FOR_PARALLEL:
+            await self.storage.remove_job_from_watch(f"{job_id}:{task_id}")
+            job_state.setdefault("aggregation_results", {})[task_id] = result_data
+
+            branches = job_state.setdefault("active_branches", [])
+            if task_id in branches:
+                branches.remove(task_id)
+
+            if not branches:
+                logger.info(f"All parallel branches for job {job_id} have completed.")
+                job_state["status"] = JOB_STATUS_RUNNING
+                job_state["current_state"] = job_state["aggregation_target"]
+                await self.storage.save_job_state(job_id, job_state)
+                await self.storage.enqueue_job(job_id)
+            else:
+                logger.info(
+                    f"Branch {task_id} for job {job_id} completed. Waiting for {len(branches)} more.",
+                )
+                await self.storage.save_job_state(job_id, job_state)
+
+            return "parallel_branch_result_accepted"
+
+        await self.storage.remove_job_from_watch(job_id)
+
+        now = monotonic()
+        dispatched_at = job_state.get("task_dispatched_at", now)
+        duration_ms = int((now - dispatched_at) * 1000)
+
+        await self.history_storage.log_job_event(
+            {
+                "job_id": job_id,
+                "state": job_state.get("current_state"),
+                "event_type": "task_finished",
+                "duration_ms": duration_ms,
+                "worker_id": authenticated_worker_id,
+                "context_snapshot": {**job_state, "result": result_data},
+            },
+        )
+
+        result_status = result_data.get("status", TASK_STATUS_SUCCESS)  # Default to success? Constant?
+
+        if result_status == TASK_STATUS_FAILURE:
+            return await self._handle_task_failure(job_state, task_id, result_data)
+
+        if result_status == TASK_STATUS_CANCELLED:
+            logger.info(f"Task {task_id} for job {job_id} was cancelled by worker.")
+            job_state["status"] = JOB_STATUS_CANCELLED
+            await self.storage.save_job_state(job_id, job_state)
+
+            transitions = job_state.get("current_task_transitions", {})
+            if next_state := transitions.get("cancelled"):
+                job_state["current_state"] = next_state
+                job_state["status"] = JOB_STATUS_RUNNING
+                await self.storage.save_job_state(job_id, job_state)
+                await self.storage.enqueue_job(job_id)
+            return "result_accepted_cancelled"
+
+        transitions = job_state.get("current_task_transitions", {})
+        result_status = result_data.get("status", TASK_STATUS_SUCCESS)
+        next_state = transitions.get(result_status)
+
+        if next_state:
+            logger.info(f"Job {job_id} transitioning based on worker status '{result_status}' to state '{next_state}'")
+
+            worker_data_content = result_data.get("data")
+            if worker_data_content and isinstance(worker_data_content, dict):
+                if "state_history" not in job_state:
+                    job_state["state_history"] = {}
+                job_state["state_history"].update(worker_data_content)
+
+            data_metadata = result_payload.get("data_metadata")
+            if data_metadata:
+                if "data_metadata" not in job_state:
+                    job_state["data_metadata"] = {}
+                job_state["data_metadata"].update(data_metadata)
+                logger.debug(f"Stored data metadata for job {job_id}: {list(data_metadata.keys())}")
+
+            job_state["current_state"] = next_state
+            job_state["status"] = JOB_STATUS_RUNNING
+            await self.storage.save_job_state(job_id, job_state)
+            await self.storage.enqueue_job(job_id)
+            return "result_accepted_success"
+        else:
+            logger.error(f"Job {job_id} failed. Worker returned unhandled status '{result_status}'.")
+            job_state["status"] = JOB_STATUS_FAILED
+            job_state["error_message"] = f"Worker returned unhandled status: {result_status}"
+            await self.storage.save_job_state(job_id, job_state)
+            return "result_accepted_failure"
+
+    async def _handle_task_failure(self, job_state: dict, task_id: str, result_data: dict) -> str:
+        error_details = result_data.get("error", {})
+        error_type = ERROR_CODE_TRANSIENT
+        error_message = "No error details provided."
+
+        if isinstance(error_details, dict):
+            error_type = error_details.get("code", ERROR_CODE_TRANSIENT)
+            error_message = error_details.get("message", "No error message provided.")
+        elif isinstance(error_details, str):
+            error_message = error_details
+
+        job_id = job_state["id"]
+        logger.warning(f"Task {task_id} for job {job_id} failed with error type '{error_type}'.")
+
+        if error_type in (ERROR_CODE_PERMANENT, ERROR_CODE_SECURITY, ERROR_CODE_DEPENDENCY):
+            job_state["status"] = JOB_STATUS_QUARANTINED
+            job_state["error_message"] = f"Task failed with permanent error ({error_type}): {error_message}"
+            await self.storage.save_job_state(job_id, job_state)
+            await self.storage.quarantine_job(job_id)
+        elif error_type == ERROR_CODE_INVALID_INPUT:
+            job_state["status"] = JOB_STATUS_FAILED
+            job_state["error_message"] = f"Task failed due to invalid input: {error_message}"
+            await self.storage.save_job_state(job_id, job_state)
+        elif error_type == ERROR_CODE_INTEGRITY_MISMATCH:
+            job_state["status"] = JOB_STATUS_FAILED
+            job_state["error_message"] = f"Task failed due to data integrity mismatch: {error_message}"
+            await self.storage.save_job_state(job_id, job_state)
+            logger.critical(f"Data integrity mismatch detected for job {job_id}: {error_message}")
+        else:
+            await self.engine.handle_task_failure(job_state, task_id, error_message)
+        return "result_accepted_failure"
+
+    async def issue_access_token(self, worker_id: str) -> TokenResponse:
+        """Generates and stores a temporary access token."""
+        raw_token = token_urlsafe(32)
+        token_hash = sha256(raw_token.encode()).hexdigest()
+        ttl = 3600
+
+        await self.storage.save_worker_access_token(worker_id, token_hash, ttl)
+        logger.info(f"Issued temporary access token for worker {worker_id}")
+
+        return TokenResponse(access_token=raw_token, expires_in=ttl, worker_id=worker_id)
+
+    async def update_worker_heartbeat(
+        self, worker_id: str, update_data: Optional[dict[str, Any]]
+    ) -> Optional[dict[str, Any]]:
+        """Updates worker TTL and status."""
+        ttl = self.config.WORKER_HEALTH_CHECK_INTERVAL_SECONDS * 2
+
+        if update_data:
+            updated_worker = await self.storage.update_worker_status(worker_id, update_data, ttl)
+            if updated_worker:
+                await self.history_storage.log_worker_event(
+                    {
+                        "worker_id": worker_id,
+                        "event_type": "status_update",
+                        "worker_info_snapshot": updated_worker,
+                    },
+                )
+            return updated_worker
+        else:
+            refreshed = await self.storage.refresh_worker_ttl(worker_id, ttl)
+            return {"status": "ttl_refreshed"} if refreshed else None
```
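Note the hash-at-rest design in `issue_access_token`: the raw token goes to the worker exactly once while only its sha256 digest is persisted, and `verify_worker_auth` (security.py above) hashes whatever the worker presents before calling `storage.verify_worker_access_token`. A self-contained toy version of that round trip; a plain dict stands in for the storage backends and the TTL is omitted:

```python
from hashlib import sha256
from secrets import token_urlsafe

# Simplified illustration of the STS flow above, not the package's classes:
# a leaked dump of `store` never reveals usable credentials, because only
# hashes are kept and lookup is by hash of the presented token.
store: dict[str, str] = {}  # token hash -> worker_id

def issue(worker_id: str) -> str:
    raw = token_urlsafe(32)
    store[sha256(raw.encode()).hexdigest()] = worker_id
    return raw  # handed to the worker once, never persisted

def verify(presented: str) -> str | None:
    return store.get(sha256(presented.encode()).hexdigest())

token = issue("worker-42")
assert verify(token) == "worker-42"
assert verify("forged-token") is None
```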
avtomatika/storage/base.py
CHANGED

```diff
@@ -292,6 +292,16 @@ class StorageBackend(ABC):
         """Retrieves an individual token for a specific worker."""
         raise NotImplementedError
 
+    @abstractmethod
+    async def save_worker_access_token(self, worker_id: str, token: str, ttl: int) -> None:
+        """Saves a temporary access token for a worker (STS)."""
+        raise NotImplementedError
+
+    @abstractmethod
+    async def verify_worker_access_token(self, token: str) -> str | None:
+        """Verifies a temporary access token and returns the associated worker_id if valid."""
+        raise NotImplementedError
+
     @abstractmethod
     async def get_worker_info(self, worker_id: str) -> dict[str, Any] | None:
         """Get complete information about a worker by its ID."""
```
avtomatika/storage/memory.py
CHANGED

```diff
@@ -12,12 +12,12 @@ class MemoryStorage(StorageBackend):
     Not persistent.
     """
 
-    def __init__(self):
+    def __init__(self) -> None:
         self._jobs: dict[str, dict[str, Any]] = {}
         self._workers: dict[str, dict[str, Any]] = {}
         self._worker_ttls: dict[str, float] = {}
-        self._worker_task_queues: dict[str, PriorityQueue] = {}
-        self._job_queue = Queue()
+        self._worker_task_queues: dict[str, PriorityQueue[Any]] = {}
+        self._job_queue: Queue[str] = Queue()
         self._quarantine_queue: list[str] = []
         self._watched_jobs: dict[str, float] = {}
         self._client_configs: dict[str, dict[str, Any]] = {}
@@ -189,10 +189,11 @@ class MemoryStorage(StorageBackend):
         async with self._lock:
             self._watched_jobs.pop(job_id, None)
 
-    async def get_timed_out_jobs(self) -> list[str]:
+    async def get_timed_out_jobs(self, limit: int = 100) -> list[str]:
         async with self._lock:
             now = monotonic()
             timed_out_ids = [job_id for job_id, timeout_at in self._watched_jobs.items() if timeout_at <= now]
+            timed_out_ids = timed_out_ids[:limit]
             for job_id in timed_out_ids:
                 self._watched_jobs.pop(job_id, None)
             return timed_out_ids
@@ -331,6 +332,16 @@ class MemoryStorage(StorageBackend):
         async with self._lock:
             return self._worker_tokens.get(worker_id)
 
+    async def save_worker_access_token(self, worker_id: str, token: str, ttl: int) -> None:
+        async with self._lock:
+            self._generic_keys[f"sts:{token}"] = worker_id
+            self._generic_key_ttls[f"sts:{token}"] = monotonic() + ttl
+
+    async def verify_worker_access_token(self, token: str) -> str | None:
+        async with self._lock:
+            await self._clean_expired()
+            return self._generic_keys.get(f"sts:{token}")
+
     async def set_task_cancellation_flag(self, task_id: str) -> None:
         key = f"task_cancel:{task_id}"
         await self.increment_key_with_ttl(key, 3600)
```
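The new `limit` parameter on `get_timed_out_jobs` caps how many watched jobs a single call drains, matching the batching the Redis backend does below. A hedged sketch of the kind of polling loop that benefits; `storage` and `fail_job` are stand-ins, not the package's actual `Watcher`:

```python
from asyncio import sleep

# Hypothetical watcher-style loop: each pass handles at most `limit`
# expirations, so a sudden burst of timed-out jobs cannot monopolize the
# event loop in one iteration.
async def watch_loop(storage, fail_job, interval: float = 1.0) -> None:
    while True:
        timed_out = await storage.get_timed_out_jobs(limit=100)
        for job_id in timed_out:
            await fail_job(job_id)
        # A full batch suggests more are pending; poll again immediately.
        if len(timed_out) < 100:
            await sleep(interval)
```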
avtomatika/storage/redis.py
CHANGED

```diff
@@ -95,7 +95,7 @@ class RedisStorage(StorageBackend):
         self,
         job_id: str,
         update_data: dict[str, Any],
-    ) -> dict[
+    ) -> dict[str, Any]:
         """Atomically update the job state in Redis using a transaction."""
         key = self._get_key(job_id)
 
@@ -104,7 +104,7 @@ class RedisStorage(StorageBackend):
         try:
             await pipe.watch(key)
             current_state_raw = await pipe.get(key)
-            current_state = self._unpack(current_state_raw) if current_state_raw else {}
+            current_state: dict[str, Any] = self._unpack(current_state_raw) if current_state_raw else {}
             current_state.update(update_data)
 
             pipe.multi()
@@ -147,7 +147,7 @@ class RedisStorage(StorageBackend):
         key = f"orchestrator:worker:info:{worker_id}"
         tasks_key = f"orchestrator:worker:tasks:{worker_id}"
 
-        tasks = await self._redis.smembers(tasks_key)  # type: ignore
+        tasks = await self._redis.smembers(tasks_key)  # type: ignore[var-annotated]
 
         async with self._redis.pipeline(transaction=True) as pipe:
             pipe.delete(key)
@@ -156,7 +156,7 @@ class RedisStorage(StorageBackend):
             pipe.srem("orchestrator:index:workers:idle", worker_id)
 
             for task in tasks:
-                task_str = task.decode("utf-8") if isinstance(task, bytes) else task
+                task_str = task.decode("utf-8") if isinstance(task, bytes) else str(task)
                 pipe.srem(f"orchestrator:index:workers:task:{task_str}", worker_id)
 
             await pipe.execute()
@@ -204,8 +204,8 @@ class RedisStorage(StorageBackend):
         """Finds idle workers that support the given task using set intersection."""
         task_index = f"orchestrator:index:workers:task:{task_type}"
         idle_index = "orchestrator:index:workers:idle"
-        worker_ids = await self._redis.sinter(task_index, idle_index)  # type: ignore
-        return [wid.decode("utf-8") if isinstance(wid, bytes) else wid for wid in worker_ids]
+        worker_ids = await self._redis.sinter(task_index, idle_index)  # type: ignore[var-annotated]
+        return [wid.decode("utf-8") if isinstance(wid, bytes) else str(wid) for wid in worker_ids]
 
     async def enqueue_task_for_worker(self, worker_id: str, task_payload: dict[str, Any], priority: float) -> None:
         key = f"orchestrator:task_queue:{worker_id}"
@@ -274,13 +274,14 @@ class RedisStorage(StorageBackend):
             existence = await pipe.execute()
         dead_ids = [worker_ids[i] for i, exists in enumerate(existence) if not exists]
         for wid in dead_ids:
-            tasks = await self._redis.smembers(f"orchestrator:worker:tasks:{wid}")  # type: ignore
+            tasks = await self._redis.smembers(f"orchestrator:worker:tasks:{wid}")  # type: ignore[var-annotated]
             async with self._redis.pipeline(transaction=True) as p:
                 p.delete(f"orchestrator:worker:tasks:{wid}")
                 p.srem("orchestrator:index:workers:all", wid)
                 p.srem("orchestrator:index:workers:idle", wid)
                 for t in tasks:
-
+                    t_str = t.decode() if isinstance(t, bytes) else str(t)
+                    p.srem(f"orchestrator:index:workers:task:{t_str}", wid)
                 await p.execute()
 
     async def add_job_to_watch(self, job_id: str, timeout_at: float) -> None:
@@ -291,10 +292,33 @@ class RedisStorage(StorageBackend):
 
     async def get_timed_out_jobs(self, limit: int = 100) -> list[str]:
         now = get_running_loop().time()
-
+        # Lua script to atomically fetch and remove timed out jobs
+        LUA_POP_TIMEOUTS = """
+        local now = ARGV[1]
+        local limit = ARGV[2]
+        local ids = redis.call('ZRANGEBYSCORE', KEYS[1], 0, now, 'LIMIT', 0, limit)
+        if #ids > 0 then
+            redis.call('ZREM', KEYS[1], unpack(ids))
+        end
+        return ids
+        """
+        try:
+            sha = await self._redis.script_load(LUA_POP_TIMEOUTS)
+            ids = await self._redis.evalsha(sha, 1, "orchestrator:watched_jobs", now, limit)
+        except NoScriptError:
+            ids = await self._redis.eval(LUA_POP_TIMEOUTS, 1, "orchestrator:watched_jobs", now, limit)
+        except ResponseError as e:
+            # Fallback for Redis versions that don't support script_load/evalsha or other errors
+            if "unknown command" in str(e).lower():
+                logger.warning("Redis does not support LUA scripts. Falling back to non-atomic get_timed_out_jobs.")
+                ids = await self._redis.zrangebyscore("orchestrator:watched_jobs", 0, now, start=0, num=limit)
+                if ids:
+                    await self._redis.zrem("orchestrator:watched_jobs", *ids)  # type: ignore
+            else:
+                raise e
+
         if ids:
-
-            return [i.decode("utf-8") for i in ids]
+            return [i.decode("utf-8") if isinstance(i, bytes) else i for i in ids]
         return []
 
     async def enqueue_job(self, job_id: str) -> None:
@@ -411,6 +435,13 @@ class RedisStorage(StorageBackend):
         token = await self._redis.get(f"orchestrator:worker:token:{worker_id}")
         return token.decode("utf-8") if token else None
 
+    async def save_worker_access_token(self, worker_id: str, token: str, ttl: int) -> None:
+        await self._redis.set(f"orchestrator:sts:token:{token}", worker_id, ex=ttl)
+
+    async def verify_worker_access_token(self, token: str) -> str | None:
+        worker_id = await self._redis.get(f"orchestrator:sts:token:{token}")
+        return worker_id.decode("utf-8") if worker_id else None
+
     async def acquire_lock(self, key: str, holder_id: str, ttl: int) -> bool:
         return bool(await self._redis.set(f"orchestrator:lock:{key}", holder_id, nx=True, ex=ttl))
 
```

(Several removed lines in the `@@ -274` and `@@ -291` hunks are blank in the source diff rendering and are left as-is.)
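The Lua script exists because `ZRANGEBYSCORE` followed by a separate `ZREM` is a read-then-delete race: two orchestrator processes could both read the same ids before either removes them, and a timed-out job would be failed twice. A standalone sketch of the same pattern using redis-py's `register_script`, which caches the script SHA and transparently falls back to `EVAL` on `NOSCRIPT`, much like the `evalsha`/`NoScriptError` handling above; the key name is illustrative:

```python
from redis.asyncio import Redis

# Atomic "pop expired members" from a sorted set: range query and removal
# run as one script, so concurrent callers cannot pop the same ids.
POP_EXPIRED = """
local ids = redis.call('ZRANGEBYSCORE', KEYS[1], 0, ARGV[1], 'LIMIT', 0, ARGV[2])
if #ids > 0 then
    redis.call('ZREM', KEYS[1], unpack(ids))
end
return ids
"""

async def pop_expired(r: Redis, key: str, now: float, limit: int = 100) -> list[str]:
    script = r.register_script(POP_EXPIRED)  # handles EVALSHA caching/fallback
    ids = await script(keys=[key], args=[now, limit])
    return [i.decode() if isinstance(i, bytes) else i for i in ids]
```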
avtomatika/telemetry.py
CHANGED

```diff
@@ -1,5 +1,6 @@
 from logging import getLogger
 from os import getenv
+from typing import Any
 
 logger = getLogger(__name__)
 
@@ -17,28 +18,28 @@ except ImportError:
     TELEMETRY_ENABLED = False
 
     class DummySpan:
-        def __enter__(self):
+        def __enter__(self) -> "DummySpan":
             return self
 
-        def __exit__(self, *args):
+        def __exit__(self, *args: Any) -> None:
             pass
 
-        def set_attribute(self, key, value):
+        def set_attribute(self, key: str, value: Any) -> None:
             pass
 
     class DummyTracer:
         @staticmethod
-        def start_as_current_span(name, context=None):
+        def start_as_current_span(name: str, context: Any = None) -> DummySpan:
             return DummySpan()
 
     class NoOpTrace:
-        def get_tracer(self, name):
+        def get_tracer(self, name: str) -> DummyTracer:
             return DummyTracer()
 
-    trace = NoOpTrace()
+    trace: Any = NoOpTrace()  # type: ignore[no-redef]
 
 
-def setup_telemetry(service_name: str = "avtomatika"):
+def setup_telemetry(service_name: str = "avtomatika") -> Any:
     """Configures OpenTelemetry for the application if installed."""
     if not TELEMETRY_ENABLED:
         logger.info("opentelemetry-sdk not found. Telemetry is disabled.")
```
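The point of the dummy classes is that call sites never branch on whether OpenTelemetry is installed: they import `trace` and use the same span API either way, because the `except ImportError` branch binds `trace` to a no-op stand-in with a matching shape. A small usage sketch under that assumption (the import path mirrors the module shown above):

```python
from avtomatika.telemetry import trace  # assumed import path

tracer = trace.get_tracer(__name__)

def handle_job(job_id: str) -> None:
    # With opentelemetry-sdk installed this records a real span; without it,
    # DummySpan makes the context manager a harmless no-op.
    with tracer.start_as_current_span("handle_job") as span:
        span.set_attribute("job.id", job_id)
```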
avtomatika/utils/webhook_sender.py
CHANGED

```diff
@@ -1,8 +1,8 @@
-from asyncio import CancelledError, Queue, QueueFull, create_task, sleep
+from asyncio import CancelledError, Queue, QueueFull, Task, create_task, sleep
 from contextlib import suppress
 from dataclasses import asdict, dataclass
 from logging import getLogger
-from typing import Any
+from typing import Any, Optional
 
 from aiohttp import ClientSession, ClientTimeout
 
@@ -24,7 +24,7 @@ class WebhookSender:
         self.timeout = ClientTimeout(total=10)
         self.max_retries = 3
         self._queue: Queue[tuple[str, WebhookPayload]] = Queue(maxsize=1000)
-        self._worker_task = None
+        self._worker_task: Optional[Task[None]] = None
 
     def start(self) -> None:
         if not self._worker_task:
```
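The context lines reveal WebhookSender's delivery model: a bounded `Queue` drained by a single background `Task`, created lazily in `start()`. A stripped-down sketch of that pattern; the HTTP, retry, and shutdown logic of the real class are omitted:

```python
from asyncio import Queue, QueueFull, Task, create_task

# Toy version of the bounded fire-and-forget pattern above, not the
# package's WebhookSender.
class TinySender:
    def __init__(self) -> None:
        self._queue: Queue[str] = Queue(maxsize=1000)
        self._worker_task: Task[None] | None = None

    def start(self) -> None:
        if not self._worker_task:
            self._worker_task = create_task(self._run())

    def enqueue(self, event: str) -> None:
        try:
            self._queue.put_nowait(event)
        except QueueFull:
            pass  # shed load rather than block the caller's event loop

    async def _run(self) -> None:
        while True:
            event = await self._queue.get()
            print("would POST:", event)  # real class does HTTP with retries
```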
avtomatika/watcher.py
CHANGED

```diff
@@ -3,6 +3,8 @@ from logging import getLogger
 from typing import TYPE_CHECKING
 from uuid import uuid4
 
+from .constants import JOB_STATUS_FAILED, JOB_STATUS_WAITING_FOR_WORKER
+
 if TYPE_CHECKING:
     from .engine import OrchestratorEngine
 
@@ -38,8 +40,8 @@ class Watcher:
         try:
             # Get the latest version to avoid overwriting
             job_state = await self.storage.get_job_state(job_id)
-            if job_state and job_state["status"] ==
-                job_state["status"] =
+            if job_state and job_state["status"] == JOB_STATUS_WAITING_FOR_WORKER:
+                job_state["status"] = JOB_STATUS_FAILED
                 job_state["error_message"] = "Worker task timed out."
                 await self.storage.save_job_state(job_id, job_state)
```

(The two removed lines are cut short in the source diff rendering and are left as shown.)