baqueue 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
baqueue/serializer.py ADDED
@@ -0,0 +1,124 @@
1
+ """Job serialization and deserialization."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import importlib
7
+ from datetime import datetime, timezone
8
+ from typing import Any
9
+ from uuid import uuid4
10
+
11
+
12
def _now_ts() -> float:
    """Return the current moment as a POSIX timestamp (float seconds).

    The clock is read as a timezone-aware UTC ``datetime`` before being
    converted, so the result does not depend on the local timezone.
    """
    current_utc = datetime.now(timezone.utc)
    return current_utc.timestamp()
14
+
15
+
16
class JobPayload:
    """Plain record describing one queued job, convertible to and from JSON.

    Every name in ``__slots__`` is accepted as a keyword-only constructor
    argument and is emitted by :meth:`to_dict` under the same key, in the
    same order.
    """

    __slots__ = (
        "id",
        "job_class",
        "data",
        "queue",
        "attempts",
        "max_attempts",
        "backoff",
        "timeout",
        "tags",
        "batch_id",
        "delay_until",
        "created_at",
        "updated_at",
        "started_at",
        "completed_at",
        "failed_at",
        "status",
        "error",
    )

    def __init__(
        self,
        *,
        id: str | None = None,
        job_class: str = "",
        data: dict[str, Any] | None = None,
        queue: str = "default",
        attempts: int = 0,
        max_attempts: int = 3,
        backoff: str | list[int] = "exponential",
        timeout: int = 60,
        tags: list[str] | None = None,
        batch_id: str | None = None,
        delay_until: float | None = None,
        created_at: float | None = None,
        updated_at: float | None = None,
        started_at: float | None = None,
        completed_at: float | None = None,
        failed_at: float | None = None,
        status: str = "pending",
        error: str | None = None,
    ):
        """Populate every slot, filling in defaults for falsy optional values.

        A falsy ``id`` is replaced by a random 32-character hex string, a
        falsy ``data`` / ``tags`` by a fresh empty container, a falsy
        ``created_at`` by the current timestamp, and a falsy ``updated_at``
        by the resolved ``created_at``.
        """
        # Identity and routing.
        self.id = id if id else uuid4().hex
        self.job_class = job_class
        self.queue = queue
        self.batch_id = batch_id
        self.tags = tags if tags else []

        # Arguments handed to the job.
        self.data = data if data else {}

        # Retry / execution policy.
        self.attempts = attempts
        self.max_attempts = max_attempts
        self.backoff = backoff
        self.timeout = timeout
        self.delay_until = delay_until

        # Lifecycle timestamps; updated_at falls back to the creation time.
        self.created_at = created_at if created_at else _now_ts()
        self.updated_at = updated_at if updated_at else self.created_at
        self.started_at = started_at
        self.completed_at = completed_at
        self.failed_at = failed_at

        # Current state.
        self.status = status
        self.error = error

    def to_dict(self) -> dict[str, Any]:
        """Return all fields as a dict keyed by field name, in slot order."""
        return {field: getattr(self, field) for field in JobPayload.__slots__}

    def to_json(self) -> str:
        """Serialize the payload to a JSON string."""
        as_mapping = self.to_dict()
        return json.dumps(as_mapping)

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> JobPayload:
        """Build a payload by passing ``data`` as keyword arguments."""
        payload = cls(**data)
        return payload

    @classmethod
    def from_json(cls, raw: str) -> JobPayload:
        """Build a payload from a JSON string produced by :meth:`to_json`."""
        decoded = json.loads(raw)
        return cls.from_dict(decoded)
113
+
114
+
115
def resolve_job_class(class_path: str):
    """Dynamically import and return the object named by a dotted path.

    The path is first interpreted as ``<module>.<attribute>``, exactly as
    before. If that module prefix cannot be imported, progressively shorter
    prefixes are tried and the remaining components are looked up as nested
    attributes. This lets paths built from ``__qualname__`` (for example
    ``pkg.mod.Outer.Inner``, as produced by ``get_class_path``) resolve.

    Args:
        class_path: Dotted path such as ``"package.module.ClassName"``.

    Returns:
        The attribute found at the end of the path.

    Raises:
        ValueError: If ``class_path`` contains no dot.
        ModuleNotFoundError: If no prefix of the path is an importable
            module that leads to the requested attribute.
        AttributeError: If the module imports but lacks the attribute.
    """
    # NOTE(review): importing a module runs its top-level code. The path is
    # read from stored job payloads, so it should only come from trusted
    # producers.
    if "." not in class_path:
        raise ValueError(
            f"Invalid job class path {class_path!r}: "
            "expected 'package.module.ClassName'"
        )

    parts = class_path.split(".")
    first_error: ModuleNotFoundError | None = None

    # Longest module prefix first, so paths that resolved before still
    # resolve to the same object.
    for split_at in range(len(parts) - 1, 0, -1):
        module_path = ".".join(parts[:split_at])
        try:
            module = importlib.import_module(module_path)
        except ModuleNotFoundError as exc:
            missing = exc.name or ""
            prefix_is_missing = (
                module_path == missing or module_path.startswith(missing + ".")
            )
            if not missing or not prefix_is_missing:
                # The module exists but one of its own imports is missing;
                # surface that error unchanged.
                raise
            if first_error is None:
                first_error = exc
            continue

        target = module
        try:
            for attr in parts[split_at:]:
                target = getattr(target, attr)
        except AttributeError:
            if first_error is None:
                # Plain "<module>.<attribute>" lookup failed: same error as
                # the single getattr would have raised.
                raise
            # The fallback did not help; report the original import failure.
            raise first_error from None
        return target

    if first_error is None:  # defensive: the loop always runs at least once
        raise ModuleNotFoundError(f"No module named {parts[0]!r}", name=parts[0])
    raise first_error
120
+
121
+
122
def get_class_path(cls: type) -> str:
    """Return ``"<module>.<qualified name>"`` for the given class.

    The module part comes from ``cls.__module__`` and the name part from
    ``cls.__qualname__``.
    """
    module_name = cls.__module__
    qualified_name = cls.__qualname__
    return f"{module_name}.{qualified_name}"
baqueue/supervisor.py ADDED
@@ -0,0 +1,206 @@
1
+ """Supervisor - manages a pool of workers with auto-scaling and graceful shutdown."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import logging
7
+ import signal
8
+ import os
9
+ from typing import Any
10
+
11
+ from baqueue.config import SupervisorConfig
12
+ from baqueue.drivers.base import BaseDriver
13
+ from baqueue.events import EventBus
14
+ from baqueue.pruner import Pruner
15
+ from baqueue.worker import Worker
16
+
17
+ logger = logging.getLogger("baqueue.supervisor")
18
+
19
+
20
class Supervisor:
    """Manages a pool of workers for one or more queues.

    Supports auto-balancing, scaling, and graceful shutdown.
    """

    def __init__(
        self,
        driver: BaseDriver,
        config: SupervisorConfig | None = None,
        events: EventBus | None = None,
        balancer: Any | None = None,
        pruner: Pruner | None = None,
    ):
        """Store collaborators and initialise empty worker/task bookkeeping.

        Args:
            driver: Queue backend shared by every spawned worker.
            config: Supervisor settings; a default ``SupervisorConfig`` is
                created when omitted.
            events: Event bus; ``EventBus.default()`` is used when omitted.
            balancer: Optional object whose ``recommend`` coroutine returns a
                desired worker count.
            pruner: Optional pruner started and stopped with the supervisor.
        """
        self.driver = driver
        self.config = config or SupervisorConfig()
        self.events = events or EventBus.default()
        self.balancer = balancer
        self.pruner = pruner
        self._workers: list[Worker] = []
        self._tasks: list[asyncio.Task] = []
        self._running = False
        self._delayed_task: asyncio.Task | None = None
        self._heartbeat_task: asyncio.Task | None = None
        self._balance_task: asyncio.Task | None = None
        self._pruner_task: asyncio.Task | None = None
        # Shutdown task started from a signal handler. Held here because the
        # event loop only keeps weak references to tasks.
        self._stop_task: asyncio.Task | None = None

    @property
    def is_running(self) -> bool:
        """Whether ``start`` has been called and ``stop`` has not yet run."""
        return self._running

    @property
    def worker_count(self) -> int:
        """Number of workers currently tracked by the supervisor."""
        return len(self._workers)

    @property
    def stats(self) -> dict[str, Any]:
        """Snapshot of supervisor configuration and per-worker stats."""
        return {
            "name": self.config.name,
            "queues": self.config.queues,
            "balance": self.config.balance,
            "workers": len(self._workers),
            "running": self._running,
            "worker_stats": [w.stats for w in self._workers],
        }

    async def start(self) -> None:
        """Start the supervisor and its worker pool.

        Spawns ``config.min_workers`` workers, starts the auxiliary loops
        (delayed-job polling, heartbeat, and optionally balancing and
        pruning), then waits for the initially spawned worker tasks.
        """
        self._running = True
        logger.info(
            "Supervisor '%s' starting with %d workers on queues %s",
            self.config.name, self.config.min_workers, self.config.queues,
        )
        await self.events.emit("supervisor.started", supervisor=self.config.name)

        self._setup_signal_handlers()

        for i in range(self.config.min_workers):
            self._spawn_worker(i)

        await self._report_stats()
        self._delayed_task = asyncio.create_task(self._poll_delayed())
        self._heartbeat_task = asyncio.create_task(self._heartbeat_loop())

        if self.balancer:
            self._balance_task = asyncio.create_task(self._balance_loop())

        if self.pruner:
            self._pruner_task = asyncio.create_task(self.pruner.start())

        try:
            # Only the tasks present at this point are awaited; workers added
            # later by scale() are tracked in self._tasks and reaped in stop().
            await asyncio.gather(*self._tasks, return_exceptions=True)
        except asyncio.CancelledError:
            pass

    async def stop(self) -> None:
        """Stop all workers and auxiliary loops, then emit ``supervisor.stopped``."""
        logger.info("Supervisor '%s' shutting down...", self.config.name)
        self._running = False

        await self._report_stats()
        for w in self._workers:
            w.stop()

        if self.pruner:
            self.pruner.stop()

        if self._delayed_task:
            self._delayed_task.cancel()
        if self._heartbeat_task:
            self._heartbeat_task.cancel()
        if self._balance_task:
            self._balance_task.cancel()
        if self._pruner_task:
            self._pruner_task.cancel()

        aux_tasks = [
            t for t in (self._delayed_task, self._heartbeat_task, self._balance_task, self._pruner_task)
            if t is not None
        ]
        if aux_tasks:
            await asyncio.gather(*aux_tasks, return_exceptions=True)
        self._delayed_task = None
        self._heartbeat_task = None
        self._balance_task = None
        self._pruner_task = None

        # NOTE(review): worker tasks are cancelled right after being asked to
        # stop, so a job that is mid-flight is interrupted rather than
        # drained. Confirm whether a grace period is wanted here.
        for task in self._tasks:
            task.cancel()

        await asyncio.gather(*self._tasks, return_exceptions=True)
        self._tasks.clear()
        self._workers.clear()
        await self._report_stats()

        await self.events.emit("supervisor.stopped", supervisor=self.config.name)
        logger.info("Supervisor '%s' stopped", self.config.name)

    def _spawn_worker(self, index: int) -> Worker:
        """Create a worker named ``<supervisor>-worker-<index>`` and schedule it."""
        worker = Worker(
            driver=self.driver,
            queues=list(self.config.queues),
            events=self.events,
            sleep_interval=self.config.sleep,
            timeout=self.config.timeout,
            name=f"{self.config.name}-worker-{index}",
        )
        self._workers.append(worker)
        task = asyncio.create_task(worker.start())
        self._tasks.append(task)
        return worker

    async def scale(self, count: int) -> None:
        """Scale worker pool to the specified count.

        ``count`` is clamped to ``[config.min_workers, config.max_workers]``.
        Scaling down asks the most recently added workers to stop; their
        tasks stay in ``self._tasks`` until ``stop`` reaps them.
        """
        count = max(self.config.min_workers, min(count, self.config.max_workers))
        current = len(self._workers)

        if count > current:
            for i in range(current, count):
                self._spawn_worker(i)
            logger.info("Scaled up to %d workers", count)
        elif count < current:
            for _ in range(current - count):
                worker = self._workers.pop()
                worker.stop()
            logger.info("Scaled down to %d workers", count)

    async def _poll_delayed(self) -> None:
        """Periodically move delayed jobs into their queues."""
        while self._running:
            try:
                await self.driver.pop_delayed()
            except Exception:
                logger.exception("Error polling delayed jobs")
            await asyncio.sleep(1)

    async def _balance_loop(self) -> None:
        """Periodically rebalance workers across queues."""
        while self._running:
            try:
                if self.balancer:
                    new_count = await self.balancer.recommend(
                        self.driver, self.config.queues, len(self._workers)
                    )
                    if new_count != len(self._workers):
                        await self.scale(new_count)
            except Exception:
                logger.exception("Error in balance loop")
            await asyncio.sleep(5)

    async def _heartbeat_loop(self) -> None:
        """Report supervisor stats to the driver once per second while running."""
        while self._running:
            await self._report_stats()
            await asyncio.sleep(1)

    async def _report_stats(self) -> None:
        """Send the current stats to the driver; failures are logged, not raised."""
        try:
            await self.driver.report_supervisor(self.stats)
        except Exception:
            logger.exception("Failed to report supervisor stats")

    def _request_stop(self) -> None:
        """Signal-handler callback: start ``stop()`` unless one is already running.

        The task is kept on ``self._stop_task`` so it cannot be
        garbage-collected before it finishes. A second signal that arrives
        during shutdown is ignored rather than starting a concurrent
        ``stop()``.
        """
        if self._stop_task is not None and not self._stop_task.done():
            return
        self._stop_task = asyncio.create_task(self.stop())

    def _setup_signal_handlers(self) -> None:
        """Install SIGTERM/SIGINT handlers that trigger ``stop`` (skipped on Windows)."""
        if os.name == "nt":
            return
        loop = asyncio.get_running_loop()
        for sig in (signal.SIGTERM, signal.SIGINT):
            loop.add_signal_handler(sig, self._request_stop)
baqueue/worker.py ADDED
@@ -0,0 +1,165 @@
1
+ """Worker - processes jobs from queues."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import logging
7
+ import traceback
8
+ from typing import Any
9
+
10
+ from baqueue.drivers.base import BaseDriver
11
+ from baqueue.events import EventBus
12
+ from baqueue.job import Job, FunctionJob
13
+ from baqueue.retry import compute_delay, should_retry
14
+ from baqueue.serializer import JobPayload, resolve_job_class
15
+
16
+ logger = logging.getLogger("baqueue.worker")
17
+
18
+
19
class Worker:
    """Pulls and executes jobs from one or more queues."""

    def __init__(
        self,
        driver: BaseDriver,
        queues: list[str],
        events: EventBus | None = None,
        sleep_interval: float = 1.0,
        timeout: int = 60,
        name: str = "worker-0",
    ):
        """Configure the worker.

        Args:
            driver: Queue backend used to pop, complete, release and fail jobs.
            queues: Queue names, polled in the given order.
            events: Event bus; ``EventBus.default()`` is used when omitted.
            sleep_interval: Seconds to sleep when no job was available.
            timeout: Job timeout in seconds, used when the payload's own
                ``timeout`` is falsy.
            name: Worker name used in logs, events and stats.
        """
        self.driver = driver
        self.queues = queues
        self.events = events or EventBus.default()
        self.sleep_interval = sleep_interval
        self.timeout = timeout
        self.name = name
        self._running = False
        self._current_job: JobPayload | None = None
        self._jobs_processed = 0
        self._jobs_failed = 0

    @property
    def is_running(self) -> bool:
        """Whether the worker loop is currently active."""
        return self._running

    @property
    def stats(self) -> dict[str, Any]:
        """Snapshot of the worker's identity, state and job counters."""
        return {
            "name": self.name,
            "queues": self.queues,
            "running": self._running,
            "current_job": self._current_job.id if self._current_job else None,
            "jobs_processed": self._jobs_processed,
            "jobs_failed": self._jobs_failed,
        }

    async def start(self) -> None:
        """Start the worker loop.

        Runs until ``stop`` is called or the task is cancelled. An unexpected
        error while fetching or processing a job is logged and the loop
        continues after ``sleep_interval``, so one transient driver failure
        does not end the worker.
        """
        self._running = True
        await self.events.emit("worker.started", worker=self.name)
        logger.info("Worker %s started on queues %s", self.name, self.queues)

        try:
            while self._running:
                try:
                    job = await self._fetch_next()
                    if job:
                        await self._process(job)
                        # Look for the next job right away.
                        continue
                except asyncio.CancelledError:
                    # Never swallow cancellation; the outer handler deals with it.
                    raise
                except Exception:
                    logger.exception(
                        "Worker %s hit an unexpected error; continuing", self.name
                    )
                # Reached when the queues were empty or after an error.
                await asyncio.sleep(self.sleep_interval)
        except asyncio.CancelledError:
            pass
        finally:
            self._running = False
            await self.events.emit("worker.stopped", worker=self.name)
            logger.info("Worker %s stopped", self.name)

    def stop(self) -> None:
        """Ask the loop to exit after the current iteration."""
        self._running = False

    async def _fetch_next(self) -> JobPayload | None:
        """Return the first job available across ``self.queues``, or ``None``."""
        for queue in self.queues:
            job = await self.driver.pop(queue)
            if job:
                return job
        return None

    async def _process(self, payload: JobPayload) -> None:
        """Run one job, then complete, retry or fail it.

        On success the job is completed, metrics and events are recorded, the
        optional ``on_success`` hook runs, and batch completion is checked.
        On any exception the job is released for retry if attempts remain.
        Otherwise it is failed, ``on_failure`` is invoked on a fresh job
        instance, and batch failure is checked.
        """
        self._current_job = payload
        job_timeout = payload.timeout or self.timeout

        try:
            await self.events.emit("job.started", payload=payload, worker=self.name)
            await self.driver.record_metric(payload.queue, "processing", 1)
            logger.debug("Processing job %s (%s)", payload.id, payload.job_class)

            job_instance = self._instantiate(payload)
            result = await asyncio.wait_for(
                job_instance.handle(**payload.data),
                timeout=job_timeout,
            )

            await self.driver.complete(payload)
            await self.driver.record_metric(payload.queue, "completed", 1)
            await self.events.emit("job.completed", payload=payload, result=result, worker=self.name)
            self._jobs_processed += 1

            if hasattr(job_instance, "on_success"):
                try:
                    await job_instance.on_success(result, payload)
                except Exception:
                    logger.exception("Error in on_success for job %s", payload.id)

            await self._check_batch_completion(payload)

        except Exception as exc:
            error_msg = f"{type(exc).__name__}: {exc}\n{traceback.format_exc()}"
            logger.warning("Job %s failed (attempt %d): %s", payload.id, payload.attempts, exc)

            if should_retry(payload.attempts, payload.max_attempts):
                delay = compute_delay(payload.backoff, payload.attempts)
                await self.driver.release(payload, delay=delay)
                await self.events.emit("job.retrying", payload=payload, error=error_msg, delay=delay)
            else:
                await self.driver.fail(payload, error_msg)
                await self.driver.record_metric(payload.queue, "failed", 1)
                await self.events.emit("job.failed", payload=payload, error=error_msg, worker=self.name)
                self._jobs_failed += 1

                # Instantiation is inside the guard: when the job failed
                # because its class could not be resolved, resolving it again
                # raises too. That must not skip the batch bookkeeping below
                # or escape from _process.
                try:
                    job_instance = self._instantiate(payload)
                    await job_instance.on_failure(exc, payload)
                except Exception:
                    logger.exception("Error in on_failure for job %s", payload.id)

                await self._check_batch_failure(payload)
        finally:
            self._current_job = None

    def _instantiate(self, payload: JobPayload) -> Job:
        """Resolve ``payload.job_class`` and return a runnable job object.

        A resolved ``FunctionJob`` instance is returned as-is; anything else
        is treated as a class and called with no arguments.
        """
        cls = resolve_job_class(payload.job_class)
        if isinstance(cls, FunctionJob):
            return cls
        return cls()

    async def _check_batch_completion(self, payload: JobPayload) -> None:
        """Bump the batch's completed counter and emit ``batch.completed`` when done."""
        if not payload.batch_id:
            return
        batch = await self.driver.increment_batch_counter(payload.batch_id, "completed_count", 1)
        if not batch:
            return
        done = batch.get("completed_count", 0) + batch.get("failed_count", 0)
        # Equality, not >=, so only the worker that pushed `done` to total fires the event.
        if done == batch.get("total", 0):
            await self.events.emit("batch.completed", batch_id=payload.batch_id, batch=batch)

    async def _check_batch_failure(self, payload: JobPayload) -> None:
        """Bump the batch's failed counter and emit ``batch.failed`` on the first failure."""
        if not payload.batch_id:
            return
        batch = await self.driver.increment_batch_counter(payload.batch_id, "failed_count", 1)
        if not batch:
            return
        # Fire batch.failed exactly once on the first failure (the increment that
        # transitioned failed_count from 0 to 1).
        if batch.get("allow_failures", False) is False and batch.get("failed_count", 0) == 1:
            await self.events.emit("batch.failed", batch_id=payload.batch_id, batch=batch)