queue-max 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,426 @@
1
+ """Worker implementation for Queue Max.
2
+
3
+ Workers run in their own thread, polling the queue for jobs,
4
+ executing the processing function, and managing job lifecycle.
5
+ Includes state machine, callbacks, async support, and auto-scaling pool.
6
+ """
7
+
8
+ import asyncio
9
+ import logging
10
+ import threading
11
+ import time
12
+ import traceback
13
+ from dataclasses import dataclass, field
14
+ from enum import Enum
15
+ from typing import Any, Callable, Dict, List, Optional, Union
16
+
17
+ from queue_max.core.queue import Queue
18
+ from queue_max.utils.helpers import get_env_int, is_retryable_error, now_iso
19
+
20
+ logger = logging.getLogger("queue_max.worker")
21
+
22
+
23
class WorkerState(Enum):
    """Lifecycle states of a worker.

    The member values are the lowercase strings exposed through
    ``Worker.state`` and the ``"state"`` entry of ``Worker.get_stats()``.
    """

    # Constructed, ``start()`` not yet called.
    INITIALIZED = "initialized"
    # ``start()`` is launching the background thread.
    STARTING = "starting"
    # Background thread has been launched.
    RUNNING = "running"
    # ``stop()`` was requested and is waiting for the thread.
    STOPPING = "stopping"
    # Thread finished within the ``stop()`` timeout.
    STOPPED = "stopped"
    # Thread was still alive when the ``stop()`` timeout expired.
    ERROR = "error"
31
+
32
+
33
@dataclass
class WorkerStats:
    """Counters and timestamps describing a single worker.

    Every field has a default, so ``WorkerStats()`` is a valid empty record.
    """

    # Identity and textual state of the worker.
    worker_id: str = ""
    state: str = "initialized"

    # Job outcome counters.
    processed: int = 0
    failed: int = 0
    retried: int = 0

    # Timestamps stored as strings; ``None`` until the event has happened.
    started_at: Optional[str] = None
    stopped_at: Optional[str] = None
    last_heartbeat_at: Optional[str] = None

    # Text of the most recent error, if any.
    last_error: Optional[str] = None

    # Accumulated seconds spent processing jobs.
    total_runtime_seconds: float = 0.0

    # Id of the job currently being processed, ``None`` when idle.
    current_job_id: Optional[int] = None

    # Derived figures.
    throughput_jobs_per_hour: float = 0.0
    uptime_seconds: float = 0.0
49
+
50
+
51
class Worker:
    """Worker for processing queue jobs.

    Runs a processing loop in a dedicated daemon thread: it pops jobs from
    the queue, runs ``process_function`` on each payload, reports the
    outcome back to the queue, fires the optional callbacks and sends
    heartbeats after processed jobs.

    Attributes:
        worker_id: Unique identifier for this worker.
        process_function: Function that processes job payloads.
        queue: Queue instance to pull jobs from.
    """

    def __init__(
        self,
        worker_id: str,
        process_function: Callable[[Dict[str, Any]], Any],
        queue: Optional["Queue"] = None,
        on_job_start: Optional[Callable] = None,
        on_job_complete: Optional[Callable] = None,
        on_job_error: Optional[Callable] = None,
        poll_interval: float = 1.0,
        job_timeout: Optional[float] = None,
    ):
        """Initialize a worker.

        Args:
            worker_id: Unique identifier for the worker.
            process_function: Function that processes job payloads.
            queue: Queue instance (creates a new one if None).
            on_job_start: Callback when a job starts processing. Called with
                keyword arguments ``worker_id``, ``job_id`` and ``payload``.
            on_job_complete: Callback when a job completes. Called with
                keyword arguments ``worker_id``, ``job_id`` and ``result``.
            on_job_error: Callback when a job fails. Called with keyword
                arguments ``worker_id``, ``job_id``, ``error`` and
                ``permanent``.
            poll_interval: Seconds between polls when queue is empty.
            job_timeout: Max seconds to wait for a job. The job runs in a
                helper thread; when the timeout expires the job is reported
                as failed, but the helper thread cannot be interrupted and
                keeps running until ``process_function`` returns.
        """
        self.worker_id = worker_id
        self.process_function = process_function
        self.queue = queue or Queue()
        self.on_job_start = on_job_start
        self.on_job_complete = on_job_complete
        self.on_job_error = on_job_error
        self.poll_interval = poll_interval
        self.job_timeout = job_timeout

        self._thread: Optional[threading.Thread] = None
        self._stop_event = threading.Event()
        self._state = WorkerState.INITIALIZED
        self._current_job: Optional[Any] = None
        self._current_job_start: Optional[float] = None
        self._job_mutex = threading.Lock()
        self._stats: WorkerStats = WorkerStats(worker_id=worker_id)
        self._last_heartbeat_time = 0.0
        # HEARTBEAT_INTERVAL is divided by 1000 to obtain seconds.
        self._heartbeat_interval = get_env_int("HEARTBEAT_INTERVAL", 5000) / 1000.0
        self._start_time: Optional[float] = None

    @property
    def state(self) -> str:
        """Current state as its lowercase string value."""
        return self._state.value

    @property
    def is_running(self) -> bool:
        """True while the worker is in the RUNNING state."""
        return self._state == WorkerState.RUNNING

    def start(self) -> None:
        """Start the worker in a background thread.

        Does nothing (besides logging a warning) when the worker is already
        starting/running, or when a thread from an earlier run is still
        alive.
        """
        if self._state in (WorkerState.RUNNING, WorkerState.STARTING):
            logger.warning(f"Worker {self.worker_id} is already {self._state.value}")
            return
        if self._thread is not None and self._thread.is_alive():
            # A previous stop() timed out (or is still in progress). Clearing
            # the stop event now would revive that thread next to a new one,
            # giving two loops for the same worker id.
            logger.warning(
                f"Worker {self.worker_id}: previous thread is still alive, not starting again"
            )
            return
        self._state = WorkerState.STARTING
        self._start_time = time.monotonic()
        self._stats.started_at = now_iso()
        self._stop_event.clear()
        self._thread = threading.Thread(
            target=self._run_loop,
            name=f"Worker-{self.worker_id}",
            daemon=True,
        )
        self._thread.start()
        self._state = WorkerState.RUNNING
        logger.info(f"Worker {self.worker_id} started")

    def stop(self, timeout: float = 10.0) -> None:
        """Gracefully stop the worker.

        The final state is STOPPED when the thread has finished and ERROR
        when it is still alive after ``timeout`` seconds.

        Args:
            timeout: Max seconds to wait for the worker to stop.
        """
        if self._state not in (WorkerState.RUNNING, WorkerState.STARTING):
            return
        self._state = WorkerState.STOPPING
        self._stop_event.set()
        thread = self._thread
        if thread is not None and thread.is_alive():
            thread.join(timeout=timeout)
        # Decide the final state even when the thread had already died on
        # its own; otherwise the worker would stay in STOPPING forever.
        still_alive = thread is not None and thread.is_alive()
        self._state = WorkerState.ERROR if still_alive else WorkerState.STOPPED
        self._stats.stopped_at = now_iso()
        logger.info(f"Worker {self.worker_id} stopped ({self._state.value})")

    def _run_loop(self) -> None:
        """Main worker processing loop; runs until the stop event is set."""
        while not self._stop_event.is_set():
            try:
                job = self.queue.pop_job(self.worker_id)
            except Exception as e:
                logger.exception(f"Worker {self.worker_id}: pop error: {e}")
                self._stats.last_error = str(e)
                self._idle_wait()
                continue
            if job is None:
                self._idle_wait()
                continue
            try:
                self._process_job(job)
            except Exception as e:
                # _process_job handles job failures itself; anything that
                # still escapes (e.g. the queue failing while recording the
                # outcome) must not silently kill the worker thread.
                logger.exception(f"Worker {self.worker_id}: unexpected processing error: {e}")
                self._stats.last_error = str(e)
                self._idle_wait()
            self._send_heartbeat()

    def _process_job(self, job: Any) -> None:
        """Process a single job with timeout, callbacks, and error handling.

        Args:
            job: Job object popped from the queue; its ``id``, ``shard_id``,
                ``payload``, ``tentativas`` and ``max_tentativas``
                attributes are read.
        """
        start_time = time.monotonic()
        with self._job_mutex:
            self._current_job = job
            self._current_job_start = start_time
            self._stats.current_job_id = job.id

        if self.on_job_start:
            try:
                self.on_job_start(worker_id=self.worker_id, job_id=job.id, payload=job.payload)
            except Exception as e:
                logger.error(f"Worker {self.worker_id}: on_job_start error: {e}")

        try:
            if self.job_timeout:
                result = self._execute_with_timeout(job.payload)
            else:
                result = self.process_function(job.payload)
            self.queue.complete_job(job.id, job.shard_id)
            self._stats.processed += 1
            self._stats.total_runtime_seconds += time.monotonic() - start_time
            if self.on_job_complete:
                try:
                    self.on_job_complete(worker_id=self.worker_id, job_id=job.id, result=result)
                except Exception as e:
                    logger.error(f"Worker {self.worker_id}: on_job_complete error: {e}")
        except Exception as e:
            permanent = not is_retryable_error(e)
            self.queue.fail_job(job.id, job.shard_id, e, permanent=permanent)
            # Count as failed when the error is permanent or this attempt
            # reaches max_tentativas; otherwise count it as a retry.
            if permanent or job.tentativas + 1 >= job.max_tentativas:
                self._stats.failed += 1
            else:
                self._stats.retried += 1
            self._stats.last_error = str(e)
            self._stats.total_runtime_seconds += time.monotonic() - start_time
            if self.on_job_error:
                try:
                    self.on_job_error(
                        worker_id=self.worker_id,
                        job_id=job.id,
                        error=str(e),
                        permanent=permanent,
                    )
                except Exception as cb_err:
                    logger.error(f"Worker {self.worker_id}: on_job_error error: {cb_err}")
        finally:
            with self._job_mutex:
                self._current_job = None
                self._current_job_start = None
                self._stats.current_job_id = None

    def _execute_with_timeout(self, payload: Dict) -> Any:
        """Execute ``process_function`` in a helper thread with ``job_timeout``.

        Args:
            payload: Job payload passed to ``process_function``.

        Returns:
            Whatever ``process_function`` returns.

        Raises:
            TimeoutError: If no result is available within ``job_timeout``
                seconds. The helper thread keeps running in the background.
            Exception: Any exception raised by ``process_function``.
        """
        from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeoutError

        # Do not use the executor as a context manager: leaving the ``with``
        # block calls shutdown(wait=True), which blocks until the job
        # finishes and therefore defeats the timeout.
        executor = ThreadPoolExecutor(max_workers=1)
        try:
            future = executor.submit(self.process_function, payload)
            try:
                return future.result(timeout=self.job_timeout)
            except FuturesTimeoutError as exc:
                raise TimeoutError(f"Job exceeded {self.job_timeout}s timeout") from exc
        finally:
            executor.shutdown(wait=False)

    def _idle_wait(self) -> None:
        """Sleep for ``poll_interval`` seconds, waking early on stop."""
        self._stop_event.wait(timeout=self.poll_interval)

    def _send_heartbeat(self) -> None:
        """Send a heartbeat for every shard if the interval has elapsed."""
        now = time.monotonic()
        if now - self._last_heartbeat_time >= self._heartbeat_interval:
            try:
                for sid in range(self.queue.num_shards):
                    self.queue.heartbeat(sid, self.worker_id)
                self._last_heartbeat_time = now
                self._stats.last_heartbeat_at = now_iso()
            except Exception as e:
                logger.exception(f"Worker {self.worker_id}: heartbeat error: {e}")

    def get_current_job(self) -> Optional[Any]:
        """Return the job being processed right now, or None when idle."""
        with self._job_mutex:
            return self._current_job

    def get_stats(self) -> Dict[str, Any]:
        """Return a snapshot of the worker statistics as a plain dict.

        ``throughput_jobs_per_hour`` is the processed count divided by the
        accumulated processing time; ``uptime_seconds`` is measured from the
        last ``start()`` call.
        """
        runtime_hours = self._stats.total_runtime_seconds / 3600
        processed = self._stats.processed
        throughput = round(processed / runtime_hours, 2) if runtime_hours > 0 else 0.0
        uptime = time.monotonic() - self._start_time if self._start_time is not None else 0
        return {
            "worker_id": self.worker_id,
            "state": self._state.value,
            "is_running": self._state == WorkerState.RUNNING,
            "processed": processed,
            "failed": self._stats.failed,
            "retried": self._stats.retried,
            "started_at": self._stats.started_at,
            "stopped_at": self._stats.stopped_at,
            "last_heartbeat_at": self._stats.last_heartbeat_at,
            "last_error": self._stats.last_error,
            "total_runtime_seconds": round(self._stats.total_runtime_seconds, 2),
            "throughput_jobs_per_hour": throughput,
            "uptime_seconds": round(uptime, 2),
            "current_job_id": self._stats.current_job_id,
        }

    def __repr__(self) -> str:
        return f"Worker(id='{self.worker_id}', state={self._state.value})"
262
+
263
+
264
class AsyncWorker(Worker):
    """Worker that supports async/await process functions.

    The worker thread owns a private asyncio event loop. Coroutine
    ``process_function``s are awaited on that loop; plain functions are
    called directly. Job tracking, callbacks and ``job_timeout`` follow the
    same contract as ``Worker``; callbacks may additionally be coroutine
    functions, in which case they are awaited.
    """

    def _run_loop(self) -> None:
        """Main loop: pop jobs and run each one to completion on the loop."""
        self._loop = asyncio.new_event_loop()
        asyncio.set_event_loop(self._loop)
        try:
            while not self._stop_event.is_set():
                try:
                    job = self.queue.pop_job(self.worker_id)
                except Exception as e:
                    logger.exception(f"AsyncWorker {self.worker_id}: pop error: {e}")
                    self._stats.last_error = str(e)
                    # Wait on the stop event (not time.sleep) so stop() is
                    # honoured promptly.
                    self._idle_wait()
                    continue
                if job is None:
                    self._idle_wait()
                    continue
                try:
                    self._loop.run_until_complete(self._process_async(job))
                except Exception as e:
                    # Failure while recording the outcome; keep the thread
                    # alive instead of dying silently.
                    logger.exception(
                        f"AsyncWorker {self.worker_id}: unexpected processing error: {e}"
                    )
                    self._stats.last_error = str(e)
                    self._idle_wait()
                self._send_heartbeat()
        finally:
            # Release the loop's resources when the thread exits.
            try:
                self._loop.run_until_complete(self._loop.shutdown_asyncgens())
            finally:
                self._loop.close()

    async def _invoke_callback(self, name: str, callback: Optional[Callable], **kwargs: Any) -> None:
        """Call ``callback`` with ``kwargs``, awaiting it if it returns a coroutine.

        Errors raised by the callback are logged and swallowed so that a
        faulty callback cannot change the outcome of a job.
        """
        if not callback:
            return
        try:
            outcome = callback(**kwargs)
            if asyncio.iscoroutine(outcome):
                await outcome
        except Exception as e:
            logger.error(f"Worker {self.worker_id}: {name} error: {e}")

    async def _run_process_function(self, payload: Any) -> Any:
        """Run ``process_function`` on ``payload``, honouring ``job_timeout``.

        Raises:
            TimeoutError: If ``job_timeout`` is set and expires first.
        """
        if asyncio.iscoroutinefunction(self.process_function):
            pending = self.process_function(payload)
            if not self.job_timeout:
                return await pending
            try:
                return await asyncio.wait_for(pending, timeout=self.job_timeout)
            except asyncio.TimeoutError:
                raise TimeoutError(f"Job exceeded {self.job_timeout}s timeout") from None
        if self.job_timeout:
            return self._execute_with_timeout(payload)
        return self.process_function(payload)

    async def _process_async(self, job: Any) -> None:
        """Process one job: track it, run it, record the outcome, fire callbacks."""
        start_time = time.monotonic()
        # Track the current job so get_current_job() reflects async work.
        with self._job_mutex:
            self._current_job = job
            self._current_job_start = start_time
            self._stats.current_job_id = job.id

        await self._invoke_callback(
            "on_job_start",
            self.on_job_start,
            worker_id=self.worker_id,
            job_id=job.id,
            payload=job.payload,
        )

        try:
            result = await self._run_process_function(job.payload)
            self.queue.complete_job(job.id, job.shard_id)
            self._stats.processed += 1
            self._stats.total_runtime_seconds += time.monotonic() - start_time
            await self._invoke_callback(
                "on_job_complete",
                self.on_job_complete,
                worker_id=self.worker_id,
                job_id=job.id,
                result=result,
            )
        except Exception as e:
            permanent = not is_retryable_error(e)
            self.queue.fail_job(job.id, job.shard_id, e, permanent=permanent)
            if permanent or job.tentativas + 1 >= job.max_tentativas:
                self._stats.failed += 1
            else:
                self._stats.retried += 1
            self._stats.total_runtime_seconds += time.monotonic() - start_time
            self._stats.last_error = str(e)
            await self._invoke_callback(
                "on_job_error",
                self.on_job_error,
                worker_id=self.worker_id,
                job_id=job.id,
                error=str(e),
                permanent=permanent,
            )
        finally:
            with self._job_mutex:
                self._current_job = None
                self._current_job_start = None
                self._stats.current_job_id = None
303
+
304
+
305
class WorkerPool:
    """Manages multiple Worker instances for parallel processing.

    Supports auto-scaling based on queue depth: a background thread
    periodically reads the queue's ``pending`` count and adds or removes one
    worker at a time within ``[min_workers, max_workers]``.

    Can be used as a context manager; leaving the block calls ``stop_all()``.

    Attributes:
        workers: List of Worker instances.
    """

    def __init__(
        self,
        workers: Optional[List["Worker"]] = None,
        auto_scale: bool = False,
        min_workers: int = 1,
        max_workers: int = 10,
        scale_up_threshold: int = 100,
        scale_down_threshold: int = 10,
        scale_check_interval: float = 60.0,
    ):
        """Initialize the pool.

        Args:
            workers: Initial workers. The list is copied; the queue of the
                first worker is used for auto-scaling decisions.
            auto_scale: Whether ``start_all()`` launches the scaling thread.
            min_workers: Lower bound for scale-down.
            max_workers: Upper bound for scale-up.
            scale_up_threshold: Add a worker when pending jobs exceed this.
            scale_down_threshold: Remove a worker when pending jobs drop
                below this.
            scale_check_interval: Seconds between scaling checks.
        """
        # Copy so the pool never mutates the caller's list.
        self.workers: List["Worker"] = list(workers) if workers else []
        self.auto_scale = auto_scale
        self.min_workers = min_workers
        self.max_workers = max_workers
        self.scale_up_threshold = scale_up_threshold
        self.scale_down_threshold = scale_down_threshold
        self.scale_check_interval = scale_check_interval
        self._queue = self.workers[0].queue if self.workers else None
        self._scale_thread: Optional[threading.Thread] = None
        self._stop_scale = threading.Event()

    def add_worker(self, worker: "Worker") -> None:
        """Add a worker to the pool (it is not started automatically)."""
        self.workers.append(worker)
        if self._queue is None:
            # A pool created empty has no queue yet; adopt the first one so
            # auto-scaling can work.
            self._queue = worker.queue

    def remove_worker(self, worker: "Worker") -> None:
        """Stop ``worker`` and remove it from the pool, if it is a member."""
        if worker in self.workers:
            worker.stop()
            self.workers.remove(worker)

    def start_all(self) -> None:
        """Start every worker and, if enabled, the auto-scaling thread."""
        for worker in self.workers:
            worker.start()
        if self.auto_scale and self._queue:
            self._start_auto_scaling()

    def stop_all(self, timeout: float = 10.0) -> None:
        """Stop auto-scaling first, then stop every worker.

        Args:
            timeout: Max seconds to wait for each worker to stop.
        """
        self._stop_auto_scaling()
        for worker in list(self.workers):
            worker.stop(timeout=timeout)

    def _start_auto_scaling(self) -> None:
        """Launch the scaling thread unless one is already running."""
        if self._scale_thread is not None and self._scale_thread.is_alive():
            return
        self._stop_scale.clear()
        self._scale_thread = threading.Thread(target=self._scale_loop, daemon=True)
        self._scale_thread.start()

    def _stop_auto_scaling(self) -> None:
        """Signal the scaling thread to stop and wait briefly for it."""
        self._stop_scale.set()
        if self._scale_thread and self._scale_thread.is_alive():
            self._scale_thread.join(timeout=5)

    def _scale_loop(self) -> None:
        """Run scaling checks every ``scale_check_interval`` seconds."""
        while not self._stop_scale.is_set():
            try:
                self._check_and_scale()
            except Exception as e:
                logger.error(f"Auto-scaling error: {e}")
            self._stop_scale.wait(timeout=self.scale_check_interval)

    def _check_and_scale(self) -> None:
        """Compare the pending count with the thresholds and scale by one."""
        if not self._queue:
            return
        stats = self._queue.get_stats()
        pending = stats.get("pending", 0)
        current = len(self.workers)
        if pending > self.scale_up_threshold and current < self.max_workers:
            self._scale_to(min(self.max_workers, current + 1), f"pending={pending}")
        elif pending < self.scale_down_threshold and current > self.min_workers:
            self._scale_to(max(self.min_workers, current - 1), f"pending={pending}")

    def _scale_to(self, target: int, reason: str) -> None:
        """Grow or shrink the pool to ``target`` workers.

        New workers are clones of the first worker: same class, process
        function, callbacks, poll interval and job timeout, so that e.g. an
        async pool is not extended with synchronous workers.

        Args:
            target: Desired number of workers.
            reason: Text included in the log message.
        """
        current = len(self.workers)
        if target == current:
            return
        if target > current and not self.workers:
            logger.warning("Cannot scale up: pool has no worker to use as template")
            return
        logger.info(f"Scaling workers {current} -> {target}: {reason}")
        if target > current:
            template = self.workers[0]
            taken_ids = {w.worker_id for w in self.workers}
            index = current
            for _ in range(target - current):
                index += 1
                # Skip ids already in use (possible after earlier removals).
                while f"pool-worker-{index}" in taken_ids:
                    index += 1
                worker_id = f"pool-worker-{index}"
                taken_ids.add(worker_id)
                w = type(template)(
                    worker_id=worker_id,
                    process_function=template.process_function,
                    queue=self._queue,
                    on_job_start=template.on_job_start,
                    on_job_complete=template.on_job_complete,
                    on_job_error=template.on_job_error,
                    poll_interval=template.poll_interval,
                    job_timeout=template.job_timeout,
                )
                w.start()
                self.workers.append(w)
        else:
            for w in self.workers[target:]:
                w.stop()
            self.workers = self.workers[:target]

    def get_stats(self) -> Dict[str, Any]:
        """Return per-worker stats plus pool-wide totals."""
        per_worker = [w.get_stats() for w in self.workers]
        return {
            "workers": per_worker,
            "total_workers": len(self.workers),
            "total_processed": sum(w["processed"] for w in per_worker),
            "total_failed": sum(w["failed"] for w in per_worker),
            "total_retried": sum(w["retried"] for w in per_worker),
            "auto_scale_enabled": self.auto_scale,
        }

    def wait_for_idle(self, timeout: Optional[float] = None) -> bool:
        """Block until no worker reports a current job.

        Args:
            timeout: Max seconds to wait; ``None`` waits indefinitely and
                ``0`` performs a single check.

        Returns:
            True if every worker is idle, False if the timeout expired.
        """
        deadline = None if timeout is None else time.monotonic() + timeout
        while True:
            if all(w.get_current_job() is None for w in self.workers):
                return True
            pause = 0.5
            if deadline is not None:
                remaining = deadline - time.monotonic()
                if remaining <= 0:
                    return False
                # Never sleep past the deadline.
                pause = min(pause, remaining)
            time.sleep(pause)

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.stop_all()
@@ -0,0 +1,25 @@
1
+ """Custom exceptions for the Queue Max library."""
2
+
3
+
4
class QueueError(Exception):
    """Common base class for every queue-related error.

    Catch this type to handle any error raised by the queue library.
    """
6
+
7
+
8
class RateLimitError(QueueError):
    """Signals that the rate limit was exceeded and the operation timed out."""
10
+
11
+
12
class CircuitBreakerOpenError(QueueError):
    """Signals that the circuit breaker is open and is rejecting requests."""
14
+
15
+
16
class JobFailedError(QueueError):
    """Signals that a job failed permanently."""
18
+
19
+
20
class ShardError(QueueError):
    """Signals an error tied to one specific shard."""
22
+
23
+
24
class ConfigurationError(QueueError):
    """Signals that the supplied configuration is invalid."""
@@ -0,0 +1,5 @@
1
+ """Data models for Queue Max."""
2
+
3
+ from queue_max.models.job import Job, JobPriority, JobResult, JobStatus
4
+
5
+ __all__ = ["Job", "JobStatus", "JobPriority", "JobResult"]