queue-max 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- queue_max/__init__.py +62 -0
- queue_max/cli.py +373 -0
- queue_max/contrib/__init__.py +7 -0
- queue_max/contrib/django/__init__.py +61 -0
- queue_max/contrib/django/management/__init__.py +0 -0
- queue_max/contrib/django/management/commands/__init__.py +0 -0
- queue_max/contrib/django/management/commands/queue_purge.py +19 -0
- queue_max/contrib/django/management/commands/queue_stats.py +39 -0
- queue_max/contrib/django/management/commands/queue_worker.py +69 -0
- queue_max/contrib/fastapi/__init__.py +117 -0
- queue_max/contrib/flask/__init__.py +99 -0
- queue_max/core/__init__.py +16 -0
- queue_max/core/circuit_breaker.py +162 -0
- queue_max/core/database.py +253 -0
- queue_max/core/decorator.py +346 -0
- queue_max/core/queue.py +420 -0
- queue_max/core/rate_limiter.py +214 -0
- queue_max/core/worker.py +426 -0
- queue_max/exceptions.py +25 -0
- queue_max/models/__init__.py +5 -0
- queue_max/models/job.py +340 -0
- queue_max/py.typed +0 -0
- queue_max/utils/__init__.py +23 -0
- queue_max/utils/helpers.py +156 -0
- queue_max-0.1.0.dist-info/METADATA +233 -0
- queue_max-0.1.0.dist-info/RECORD +30 -0
- queue_max-0.1.0.dist-info/WHEEL +5 -0
- queue_max-0.1.0.dist-info/entry_points.txt +2 -0
- queue_max-0.1.0.dist-info/licenses/LICENSE +21 -0
- queue_max-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
"""FastAPI integration for Robusta Queue.
|
|
2
|
+
|
|
3
|
+
Provides background task processing via dependency injection
|
|
4
|
+
and middleware for automatic queue management.
|
|
5
|
+
|
|
6
|
+
Usage:
|
|
7
|
+
from fastapi import FastAPI
|
|
8
|
+
from queue_max.contrib.fastapi import QueueMiddleware, BackgroundQueue
|
|
9
|
+
|
|
10
|
+
app = FastAPI()
|
|
11
|
+
app.add_middleware(QueueMiddleware, max_workers=4)
|
|
12
|
+
|
|
13
|
+
@app.post('/webhook')
|
|
14
|
+
async def webhook(payload: dict, background: BackgroundQueue):
|
|
15
|
+
background.enqueue('process_webhook', payload=payload)
|
|
16
|
+
return {'status': 'accepted'}
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
import logging
|
|
20
|
+
from typing import Any, Callable, Dict, Optional
|
|
21
|
+
|
|
22
|
+
from queue_max import Queue, Worker
|
|
23
|
+
|
|
24
|
+
logger = logging.getLogger("queue_max.fastapi")
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class BackgroundQueue:
    """FastAPI dependency for background task enqueuing.

    Thin wrapper around a queue object: it stamps every payload with the
    name of the task to run and forwards it to ``queue.enqueue``.
    """

    def __init__(self, queue: "Queue"):
        """Store the queue that enqueued tasks are forwarded to.

        Args:
            queue: Object exposing ``enqueue(payload=, pagina_id=, priority=)``.
        """
        self._queue = queue

    def enqueue(
        self,
        task_name: str,
        payload: Dict[str, Any],
        pagina_id: Optional[int] = None,
        priority: int = 0,
    ) -> Dict[str, Any]:
        """Enqueue a background task.

        The payload handed to the queue is a copy of ``payload`` with an
        extra ``"task"`` entry holding ``task_name``. The caller's dict is
        never mutated.

        Args:
            task_name: Name of the task to enqueue.
            payload: Task payload data.
            pagina_id: Optional ID for consistent sharding.
            priority: Task priority (0, 1, 2).

        Returns:
            Whatever the underlying ``queue.enqueue`` returns (documented
            upstream as a dict with 'id' and 'shard_id').
        """
        full_payload = {"task": task_name, **payload}
        # The explicit task_name must win: without this, a payload that
        # happens to carry its own "task" key would silently reroute the job.
        full_payload["task"] = task_name
        return self._queue.enqueue(
            payload=full_payload,
            pagina_id=pagina_id,
            priority=priority,
        )
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
class QueueMiddleware:
    """ASGI middleware that manages the queue worker pool lifecycle.

    Workers are started once the wrapped application reports a successful
    startup and are stopped when it reports shutdown. Nothing is started
    unless a ``process_function`` was supplied.

    Attributes:
        app: The wrapped ASGI application.
        queue: Queue the workers consume from.
        max_workers: Number of workers to start.
        process_function: Callable handed to every worker.
    """

    def __init__(
        self,
        app: Any,
        queue: Optional["Queue"] = None,
        max_workers: int = 2,
        process_function: Optional[Callable] = None,
    ):
        """Wrap ``app`` and prepare (but do not start) the worker pool.

        Args:
            app: ASGI application (or next middleware) to wrap.
            queue: Queue instance (a default ``Queue()`` is created if falsy).
            max_workers: Number of workers to start.
            process_function: Job handler; when falsy no workers are started.
        """
        self.app = app
        self.queue = queue or Queue()
        self.max_workers = max_workers
        self.process_function = process_function
        self._pool = None

        # When installed through ``add_middleware`` the wrapped object is the
        # next ASGI app in the stack, which generally has no ``on_event``.
        # Calling it unconditionally raised AttributeError, so the hooks are
        # only registered when the wrapped app really offers them. The
        # lifespan handling in ``__call__`` covers every other case.
        on_event = getattr(app, "on_event", None)
        if callable(on_event):
            on_event("startup")(self._on_startup)
            on_event("shutdown")(self._on_shutdown)

    async def _on_startup(self) -> None:
        """Application startup hook: start workers if a handler is configured."""
        if self.process_function:
            self._start_workers()

    async def _on_shutdown(self) -> None:
        """Application shutdown hook: stop any running workers."""
        self._stop_workers()

    def _start_workers(self) -> None:
        """Start background worker pool (no-op if already started)."""
        if not self.process_function:
            return
        # Both the on_event hook and the lifespan path may fire for the same
        # application; never start a second pool on top of a running one.
        if getattr(self, "_pool", None) is not None:
            return
        from queue_max import WorkerPool

        workers = [
            Worker(
                worker_id=f"fastapi-worker-{i + 1}",
                process_function=self.process_function,
                queue=self.queue,
            )
            for i in range(self.max_workers)
        ]
        self._pool = WorkerPool(workers)
        self._pool.start_all()
        logger.info(f"Started {self.max_workers} FastAPI worker(s)")

    def _stop_workers(self) -> None:
        """Stop background workers gracefully (no-op if none are running)."""
        pool = getattr(self, "_pool", None)
        if pool is None:
            return
        # Clear the reference first so a repeated shutdown signal is harmless.
        self._pool = None
        pool.stop_all()
        logger.info("FastAPI workers stopped")

    async def __call__(self, scope, receive, send) -> None:
        """ASGI callable.

        Non-lifespan scopes are forwarded untouched. For the lifespan scope
        the outgoing messages are observed so workers start after the inner
        application finished starting and stop when it has shut down.
        """
        if scope["type"] != "lifespan":
            await self.app(scope, receive, send)
            return

        async def lifespan_send(message) -> None:
            kind = message.get("type")
            if kind == "lifespan.startup.complete":
                self._start_workers()
            elif kind in ("lifespan.shutdown.complete", "lifespan.shutdown.failed"):
                self._stop_workers()
            await send(message)

        await self.app(scope, receive, lifespan_send)
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
"""Flask integration for Robusta Queue.
|
|
2
|
+
|
|
3
|
+
Provides an extension pattern for easy integration with Flask apps.
|
|
4
|
+
|
|
5
|
+
Usage:
|
|
6
|
+
from flask import Flask
|
|
7
|
+
from queue_max.contrib.flask import QueueExtension
|
|
8
|
+
|
|
9
|
+
app = Flask(__name__)
|
|
10
|
+
queue = QueueExtension(app)
|
|
11
|
+
|
|
12
|
+
@queue.task
|
|
13
|
+
def send_notification(user_id):
|
|
14
|
+
# send notification
|
|
15
|
+
pass
|
|
16
|
+
|
|
17
|
+
@app.route('/notify/<int:user_id>')
|
|
18
|
+
def notify(user_id):
|
|
19
|
+
send_notification.delay(user_id=user_id)
|
|
20
|
+
return 'OK'
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
import logging
|
|
24
|
+
from typing import Any, Callable, Dict, Optional
|
|
25
|
+
|
|
26
|
+
from queue_max import Queue as BaseQueue
|
|
27
|
+
from queue_max import task as base_task
|
|
28
|
+
|
|
29
|
+
logger = logging.getLogger("queue_max.flask")
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class QueueExtension:
    """Flask extension exposing a queue and a ``@task`` decorator.

    The extension registers itself under ``app.extensions['queue_max']``.

    Attributes:
        queue: The underlying Queue instance.
        app: The Flask application most recently passed to the constructor
            or to ``init_app`` (``None`` until one is provided).
    """

    def __init__(
        self,
        app: Any = None,
        queue: Optional["BaseQueue"] = None,
    ):
        """Initialize the extension.

        Args:
            app: Flask application instance (optional, can be init_app later).
            queue: Queue instance (creates default if None).
        """
        self.queue = queue or BaseQueue()
        self.app = app
        if app is not None:
            self.init_app(app)

    def init_app(self, app: Any) -> None:
        """Initialize the extension with a Flask app.

        Args:
            app: Flask application instance.
        """
        # Keep the documented ``app`` attribute accurate when the extension
        # is created first and bound to an application later.
        self.app = app
        app.extensions = getattr(app, "extensions", {})
        app.extensions["queue_max"] = self

    def task(
        self,
        func: Optional[Callable] = None,
        priority: int = 0,
        max_retries: Optional[int] = None,
    ) -> Callable:
        """Decorator that registers a function as a queue task.

        Can be used with or without arguments:

            @queue.task
            def my_task():
                pass

            @queue.task(priority=2)
            def my_task():
                pass

        Args:
            func: The function to decorate (when used without arguments).
            priority: Task priority (0, 1, 2).
            max_retries: Maximum retry attempts.

        Returns:
            The decorated function when ``func`` is given, otherwise the
            decorator produced by the base ``task`` factory.
        """
        decorator = base_task(queue=self.queue, priority=priority, max_retries=max_retries)
        return decorator if func is None else decorator(func)

    def get_stats(self) -> Dict[str, Any]:
        """Get queue statistics."""
        return self.queue.get_stats()
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
"""Core modules for Robusta Queue."""
|
|
2
|
+
|
|
3
|
+
from queue_max.core.circuit_breaker import CircuitBreaker, CircuitState
|
|
4
|
+
from queue_max.core.rate_limiter import RateLimitUnit, RateLimiter
|
|
5
|
+
from queue_max.core.worker import AsyncWorker, Worker, WorkerPool, WorkerState
|
|
6
|
+
|
|
7
|
+
__all__ = [
|
|
8
|
+
"RateLimiter",
|
|
9
|
+
"RateLimitUnit",
|
|
10
|
+
"CircuitBreaker",
|
|
11
|
+
"CircuitState",
|
|
12
|
+
"Worker",
|
|
13
|
+
"AsyncWorker",
|
|
14
|
+
"WorkerPool",
|
|
15
|
+
"WorkerState",
|
|
16
|
+
]
|
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
"""Circuit breaker implementation for Queue Max.
|
|
2
|
+
|
|
3
|
+
Implements the standard circuit breaker pattern with three states:
|
|
4
|
+
- CLOSED: Normal operation, requests pass through
|
|
5
|
+
- OPEN: Circuit is tripped, requests are rejected immediately
|
|
6
|
+
- HALF_OPEN: Testing state, allows one request through to check if service recovered
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import threading
|
|
10
|
+
import time
|
|
11
|
+
from enum import Enum
|
|
12
|
+
from typing import Callable, Optional
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class CircuitState(Enum):
    """Circuit breaker states."""

    CLOSED = "closed"
    OPEN = "open"
    HALF_OPEN = "half_open"


class CircuitBreaker:
    """Circuit breaker for external service calls.

    Prevents cascading failures by failing fast when a service is unhealthy.
    While HALF_OPEN exactly one probe call is admitted at a time; its outcome
    decides whether the circuit closes again or re-opens.

    Attributes:
        failure_threshold: Number of consecutive failures to trip the circuit.
        recovery_timeout: Seconds to wait before attempting recovery (HALF_OPEN).
        on_state_change: Optional ``callback(old_state, new_state)``.
        state: Current circuit state.
    """

    def __init__(
        self,
        failure_threshold: int = 5,
        recovery_timeout: float = 60.0,
        on_state_change: Optional[Callable[[CircuitState, CircuitState], None]] = None,
    ):
        """Initialize the circuit breaker.

        Args:
            failure_threshold: Consecutive failures before opening (default: 5).
            recovery_timeout: Seconds before attempting recovery (default: 60).
            on_state_change: Optional callback for state transitions. It is
                invoked after the internal lock has been released, so it may
                safely call back into this breaker.
        """
        self.failure_threshold = failure_threshold
        self.recovery_timeout = recovery_timeout
        self.on_state_change = on_state_change

        self.state = CircuitState.CLOSED
        self._failure_count = 0
        self._last_failure_time = 0.0
        # True while the single HALF_OPEN probe call is running.
        self._probe_in_flight = False
        self._mutex = threading.Lock()

    def call(self, func: Callable, *args, **kwargs):
        """Execute a function with circuit breaker protection.

        Args:
            func: The function to execute.
            *args: Arguments for the function.
            **kwargs: Keyword arguments for the function.

        Returns:
            The return value of the function.

        Raises:
            CircuitBreakerOpenError: If the circuit is open, or half-open
                with a probe call already in flight.
            Exception: Re-raises any exception from the called function.
        """
        if not self._try_call():
            from queue_max.exceptions import CircuitBreakerOpenError

            raise CircuitBreakerOpenError(
                f"Circuit breaker is OPEN (state: {self.state.value}). "
                f"Recovery in {self._recovery_remaining():.0f}s. "
                f"Failures: {self._failure_count}/{self.failure_threshold}"
            )

        try:
            result = func(*args, **kwargs)
        except Exception:
            self._on_failure()
            raise
        except BaseException:
            # KeyboardInterrupt & co. say nothing about the service's health,
            # but the probe slot must be freed or HALF_OPEN would reject
            # every later call forever.
            self._release_probe()
            raise
        self._on_success()
        return result

    def _try_call(self) -> bool:
        """Check if a call should be allowed through.

        Returns:
            True if the call is allowed, False if rejected.
        """
        transition = None
        with self._mutex:
            if self.state == CircuitState.CLOSED:
                allowed = True
            elif self.state == CircuitState.OPEN:
                elapsed = time.monotonic() - self._last_failure_time
                if elapsed >= self.recovery_timeout:
                    transition = self._set_state(CircuitState.HALF_OPEN)
                    self._probe_in_flight = True
                    allowed = True
                else:
                    allowed = False
            elif self.state == CircuitState.HALF_OPEN:
                # Only one probe at a time may test the recovering service.
                allowed = not self._probe_in_flight
                if allowed:
                    self._probe_in_flight = True
            else:
                allowed = False
        self._notify(transition)
        return allowed

    def _on_success(self) -> None:
        """Handle a successful call."""
        transition = None
        with self._mutex:
            if self.state == CircuitState.HALF_OPEN:
                transition = self._set_state(CircuitState.CLOSED)
            self._failure_count = 0
            self._probe_in_flight = False
        self._notify(transition)

    def _on_failure(self) -> None:
        """Handle a failed call."""
        transition = None
        with self._mutex:
            self._failure_count += 1
            self._last_failure_time = time.monotonic()
            if self.state == CircuitState.HALF_OPEN:
                transition = self._set_state(CircuitState.OPEN)
                self._probe_in_flight = False
            elif (
                self.state == CircuitState.CLOSED
                and self._failure_count >= self.failure_threshold
            ):
                transition = self._set_state(CircuitState.OPEN)
        self._notify(transition)

    def _release_probe(self) -> None:
        """Free the HALF_OPEN probe slot without recording an outcome."""
        with self._mutex:
            self._probe_in_flight = False

    def _set_state(self, new_state: CircuitState) -> Optional[tuple]:
        """Set the circuit state. Callers must hold ``_mutex``.

        The callback is deliberately NOT invoked here: the lock is not
        reentrant, so calling user code while holding it would deadlock any
        callback that touches the breaker (e.g. ``get_stats()``).

        Args:
            new_state: The new circuit state.

        Returns:
            ``(old_state, new_state)`` if the state changed, else ``None``;
            pass the result to ``_notify`` once the lock is released.
        """
        old_state = self.state
        self.state = new_state
        if old_state != new_state:
            return (old_state, new_state)
        return None

    def _notify(self, transition: Optional[tuple]) -> None:
        """Invoke ``on_state_change`` for a transition, outside the lock."""
        if transition is not None and self.on_state_change:
            self.on_state_change(*transition)

    def _recovery_remaining(self) -> float:
        """Calculate seconds remaining until recovery attempt.

        Only an OPEN circuit is waiting for recovery; every other state
        reports ``0.0``.
        """
        if self.state != CircuitState.OPEN:
            return 0.0
        remaining = self.recovery_timeout - (time.monotonic() - self._last_failure_time)
        return max(0.0, remaining)

    def reset(self) -> None:
        """Reset the circuit breaker to closed state."""
        with self._mutex:
            transition = self._set_state(CircuitState.CLOSED)
            self._failure_count = 0
            self._last_failure_time = 0.0
            self._probe_in_flight = False
        self._notify(transition)

    def get_stats(self) -> dict:
        """Get current circuit breaker statistics.

        Returns:
            Dict with 'state', 'failure_count', 'failure_threshold',
            'recovery_timeout' and 'recovery_remaining' (rounded to 0.1s).
        """
        with self._mutex:
            return {
                "state": self.state.value,
                "failure_count": self._failure_count,
                "failure_threshold": self.failure_threshold,
                "recovery_timeout": self.recovery_timeout,
                "recovery_remaining": round(self._recovery_remaining(), 1),
            }
|
|
@@ -0,0 +1,253 @@
|
|
|
1
|
+
"""Database and shard management for Queue Max.
|
|
2
|
+
|
|
3
|
+
Each shard is an independent SQLite database file with WAL mode.
|
|
4
|
+
Uses thread-local connections for thread safety.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import json, os, sqlite3, threading, time, traceback
|
|
8
|
+
from contextlib import contextmanager
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
from datetime import datetime, timedelta, timezone
|
|
11
|
+
from typing import Any, Dict, List, Optional
|
|
12
|
+
|
|
13
|
+
from queue_max.models.job import Job, JobStatus
|
|
14
|
+
from queue_max.utils.helpers import backoff_delay, get_env_int, now_iso
|
|
15
|
+
|
|
16
|
+
NUM_SHARDS = get_env_int("NUM_SHARDS", 6)
|
|
17
|
+
DB_BUSY_TIMEOUT = get_env_int("DB_BUSY_TIMEOUT", 30000)
|
|
18
|
+
DATA_DIR = os.environ.get("DATA_DIR", "./data")
|
|
19
|
+
CACHE_SIZE = get_env_int("CACHE_SIZE", 10000)
|
|
20
|
+
MMAP_SIZE = get_env_int("MMAP_SIZE", 268435456)
|
|
21
|
+
|
|
22
|
+
SCHEMA_SQL = """
|
|
23
|
+
CREATE TABLE IF NOT EXISTS fila (
|
|
24
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT, pagina_id INTEGER NULL,
|
|
25
|
+
payload TEXT NOT NULL, status TEXT DEFAULT 'pending',
|
|
26
|
+
priority INTEGER DEFAULT 0, tentativas INTEGER DEFAULT 0,
|
|
27
|
+
max_tentativas INTEGER DEFAULT 3, retry_delay INTEGER DEFAULT 60,
|
|
28
|
+
last_error TEXT NULL, error_type TEXT NULL, error_stack TEXT NULL,
|
|
29
|
+
worker_id TEXT NULL, heartbeat TEXT NULL,
|
|
30
|
+
created_at TEXT DEFAULT (datetime('now')),
|
|
31
|
+
started_at TEXT NULL, completed_at TEXT NULL, next_retry_at TEXT NULL
|
|
32
|
+
);
|
|
33
|
+
CREATE TABLE IF NOT EXISTS shard_metadata (
|
|
34
|
+
shard_id INTEGER PRIMARY KEY, version INTEGER DEFAULT 1,
|
|
35
|
+
created_at TEXT DEFAULT (datetime('now')), last_vacuum TEXT NULL,
|
|
36
|
+
total_jobs_processed INTEGER DEFAULT 0, total_jobs_failed INTEGER DEFAULT 0
|
|
37
|
+
);
|
|
38
|
+
CREATE TABLE IF NOT EXISTS dead_letter_queue (
|
|
39
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT, original_job_id INTEGER,
|
|
40
|
+
payload TEXT NOT NULL, error TEXT NOT NULL, error_type TEXT NOT NULL,
|
|
41
|
+
failed_at TEXT DEFAULT (datetime('now')), shard_id INTEGER
|
|
42
|
+
);
|
|
43
|
+
"""
|
|
44
|
+
INDEXES_SQL = [
|
|
45
|
+
"CREATE INDEX IF NOT EXISTS idx_status_priority ON fila(status, priority DESC);",
|
|
46
|
+
"CREATE INDEX IF NOT EXISTS idx_next_retry ON fila(next_retry_at) WHERE status = 'pending';",
|
|
47
|
+
"CREATE INDEX IF NOT EXISTS idx_heartbeat ON fila(heartbeat) WHERE status = 'processing';",
|
|
48
|
+
"CREATE INDEX IF NOT EXISTS idx_created_at ON fila(created_at);",
|
|
49
|
+
"CREATE INDEX IF NOT EXISTS idx_status_created ON fila(status, created_at);",
|
|
50
|
+
"CREATE INDEX IF NOT EXISTS idx_dlq_failed_at ON dead_letter_queue(failed_at);",
|
|
51
|
+
]
|
|
52
|
+
PRAGMAS_SQL = [
|
|
53
|
+
"PRAGMA journal_mode = WAL;", "PRAGMA synchronous = NORMAL;",
|
|
54
|
+
"PRAGMA cache_size = {};".format(CACHE_SIZE),
|
|
55
|
+
"PRAGMA mmap_size = {};".format(MMAP_SIZE),
|
|
56
|
+
"PRAGMA temp_store = MEMORY;",
|
|
57
|
+
"PRAGMA busy_timeout = {};".format(DB_BUSY_TIMEOUT),
|
|
58
|
+
]
|
|
59
|
+
POP_JOB_SELECT_SQL = "SELECT * FROM fila WHERE status='pending' AND (next_retry_at IS NULL OR next_retry_at<=?) ORDER BY priority DESC, id ASC LIMIT 1;"
|
|
60
|
+
CLAIM_JOB_SQL = "UPDATE fila SET status='processing', worker_id=?, heartbeat=?, started_at=?, tentativas=tentativas+1 WHERE id=? AND status='pending';"
|
|
61
|
+
COMPLETE_JOB_SQL = "DELETE FROM fila WHERE id=?;"
|
|
62
|
+
FAIL_JOB_SQL = "UPDATE fila SET status='failed', last_error=?, error_type=?, error_stack=?, worker_id=NULL, heartbeat=NULL, completed_at=? WHERE id=?;"
|
|
63
|
+
RETRY_SCHEDULE_SQL = "UPDATE fila SET status='pending', next_retry_at=?, last_error=?, error_type=?, error_stack=? WHERE id=?;"
|
|
64
|
+
RETRY_FAILED_SQL = "UPDATE fila SET status='pending', next_retry_at=?, worker_id=NULL, heartbeat=NULL, last_error=NULL, error_type=NULL, error_stack=NULL WHERE status='failed';"
|
|
65
|
+
HEARTBEAT_SQL = "UPDATE fila SET heartbeat=? WHERE worker_id=? AND status='processing';"
|
|
66
|
+
RECOVER_ORPHAN_SQL = "UPDATE fila SET status='pending', worker_id=NULL, heartbeat=NULL, next_retry_at=?, last_error='Recovered orphan', error_type='OrphanRecovery' WHERE status='processing' AND (heartbeat IS NULL OR heartbeat<?);"
|
|
67
|
+
MOVE_TO_DLQ_SQL = "INSERT INTO dead_letter_queue (original_job_id, payload, error, error_type, shard_id) VALUES (?, ?, ?, ?, ?);"
|
|
68
|
+
UPDATE_META_PROCESSED_SQL = "UPDATE shard_metadata SET total_jobs_processed=total_jobs_processed+1 WHERE shard_id=?;"
|
|
69
|
+
UPDATE_META_FAILED_SQL = "UPDATE shard_metadata SET total_jobs_failed=total_jobs_failed+1 WHERE shard_id=?;"
|
|
70
|
+
INSERT_META_SQL = "INSERT OR IGNORE INTO shard_metadata (shard_id, version) VALUES (?, ?);"
|
|
71
|
+
GET_METRICS_SQL = "SELECT COUNT(CASE WHEN status='pending' THEN 1 END) as pending, COUNT(CASE WHEN status='processing' THEN 1 END) as processing, COUNT(CASE WHEN status='failed' THEN 1 END) as failed, AVG(CASE WHEN status='processing' AND started_at IS NOT NULL THEN (julianday('now')-julianday(started_at))*86400.0 END) as avg_processing_time FROM fila;"
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
@dataclass
class ShardMetrics:
    """Counters and health flag describing a single shard."""

    # Identifier of the shard these numbers belong to.
    shard_id: int
    # Row counts in the job table, grouped by status.
    pending: int = 0
    processing: int = 0
    failed: int = 0
    # Mean seconds that currently-processing jobs have been running, if any.
    avg_processing_time: Optional[float] = None
    # Lifetime counters read from the shard metadata table.
    total_jobs_processed: int = 0
    total_jobs_failed: int = 0
    # Set to False, with the error text, when the metrics could not be read.
    is_healthy: bool = True
    last_error: Optional[str] = None
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
class ShardManager:
    """Manages SQLite shards with thread-local connections.

    Every shard is a separate ``shard_<id>.db`` file inside ``data_dir``.
    Each thread lazily opens its own connection per shard; connections are
    never shared between threads.
    """

    def __init__(self, num_shards: int = NUM_SHARDS, data_dir: str = DATA_DIR):
        """Create the data directory and initialise every shard database.

        Args:
            num_shards: Number of shard files to manage (must be >= 1).
            data_dir: Directory holding the shard files; created if missing.

        Raises:
            ValueError: If ``num_shards`` is smaller than 1.
        """
        if num_shards < 1:
            raise ValueError(f"num_shards must be >= 1, got {num_shards}")
        self.num_shards = num_shards
        self.data_dir = data_dir
        self.vacuum_interval_hours = 24
        self._local = threading.local()
        self._last_vacuum_time: Dict[int, float] = {}
        os.makedirs(data_dir, exist_ok=True)
        self._init_all_shards()

    def _init_all_shards(self) -> None:
        """Apply pragmas, schema, indexes and the metadata row to each shard."""
        for shard_id in range(self.num_shards):
            db_path = os.path.join(self.data_dir, f"shard_{shard_id}.db")
            conn = sqlite3.connect(db_path, timeout=DB_BUSY_TIMEOUT / 1000)
            try:
                for pragma in PRAGMAS_SQL:
                    conn.execute(pragma)
                conn.executescript(SCHEMA_SQL)
                for index_sql in INDEXES_SQL:
                    conn.execute(index_sql)
                # INSERT OR IGNORE is idempotent, so run it unconditionally:
                # a file left behind by an interrupted first start would
                # otherwise never get its metadata row and the processed /
                # failed counters would silently stay at zero.
                conn.execute(INSERT_META_SQL, (shard_id, 1))
                conn.commit()
            finally:
                conn.close()

    @contextmanager
    def get_connection(self, shard_id: int):
        """Yield this thread's connection for ``shard_id``.

        Rolls back and re-raises if the ``with`` body raises. Does not commit
        and does not close the connection; callers commit explicitly.
        """
        conn = self._get_connection(shard_id)
        try:
            yield conn
        except Exception:
            conn.rollback()
            raise

    def _get_connection(self, shard_id: int) -> sqlite3.Connection:
        """Return (opening on first use) this thread's connection to a shard."""
        if not hasattr(self._local, "connections"):
            self._local.connections = {}
        if shard_id not in self._local.connections:
            db_path = os.path.join(self.data_dir, f"shard_{shard_id}.db")
            conn = sqlite3.connect(db_path, timeout=DB_BUSY_TIMEOUT / 1000)
            conn.row_factory = sqlite3.Row
            for pragma in PRAGMAS_SQL:
                conn.execute(pragma)
            self._local.connections[shard_id] = conn
        return self._local.connections[shard_id]

    def insert_job(
        self,
        shard_id: int,
        payload: Dict[str, Any],
        pagina_id: Optional[int] = None,
        priority: int = 0,
        max_retries: Optional[int] = None,
    ) -> int:
        """Insert a pending job and return its row id.

        Args:
            shard_id: Shard to insert into.
            payload: JSON-serialisable job payload.
            pagina_id: Optional identifier stored alongside the job.
            priority: Job priority; higher values are popped first.
            max_retries: Attempt limit. ``None`` falls back to the
                ``QUEUE_MAX_RETRIES`` setting (default 3); an explicit ``0``
                is honoured rather than replaced by the default.

        Returns:
            The id of the newly inserted row.
        """
        if max_retries is None:
            max_retries = get_env_int("QUEUE_MAX_RETRIES", 3)
        conn = self._get_connection(shard_id)
        cur = conn.execute(
            "INSERT INTO fila (pagina_id, payload, priority, max_tentativas) VALUES (?, ?, ?, ?)",
            (pagina_id, json.dumps(payload), priority, max_retries),
        )
        conn.commit()
        return cur.lastrowid

    def pop_job(self, shard_id: int, worker_id: str) -> Optional[Job]:
        """Claim the next runnable pending job for ``worker_id``.

        Returns:
            The claimed job, or ``None`` when nothing is runnable, the claim
            was lost, or SQLite raised an ``OperationalError``. The returned
            object is built from the row as read before the claim UPDATE;
            only ``status`` and ``worker_id`` are refreshed on it.
        """
        conn = self._get_connection(shard_id)
        now = now_iso()
        try:
            conn.execute("BEGIN IMMEDIATE")
            row = conn.execute(POP_JOB_SELECT_SQL, (now,)).fetchone()
            if row is None:
                conn.commit()
                return None
            job_id = row["id"]
            cur = conn.execute(CLAIM_JOB_SQL, (worker_id, now, now, job_id))
            conn.commit()
            if cur.rowcount == 0:
                return None
            job = Job.from_row(dict(row), shard_id=shard_id)
            job.status = JobStatus.PROCESSING
            job.worker_id = worker_id
            return job
        except sqlite3.OperationalError:
            conn.rollback()
            return None

    def complete_job(self, shard_id: int, job_id: int) -> None:
        """Delete a finished job and bump the shard's processed counter."""
        # Route through get_connection so a failing statement rolls back.
        # Otherwise the thread-local connection would keep an open
        # transaction and every later BEGIN IMMEDIATE on it would fail.
        with self.get_connection(shard_id) as conn:
            conn.execute("BEGIN IMMEDIATE")
            conn.execute(COMPLETE_JOB_SQL, (job_id,))
            conn.execute(UPDATE_META_PROCESSED_SQL, (shard_id,))
            conn.commit()

    def fail_job(self, shard_id: int, job_id: int, error: Exception, permanent: bool = False) -> None:
        """Record a failure: schedule a retry or fail the job permanently.

        Args:
            shard_id: Shard holding the job.
            job_id: Id of the failed job.
            error: The exception that caused the failure.
            permanent: When True, mark the job failed and copy it to the
                dead letter queue without considering retries.
        """
        now = now_iso()
        error_type = type(error).__name__
        error_message = str(error)
        error_stack = "".join(traceback.format_exception(type(error), error, error.__traceback__))
        with self.get_connection(shard_id) as conn:
            if permanent:
                row = conn.execute("SELECT payload FROM fila WHERE id=?", (job_id,)).fetchone()
                conn.execute(FAIL_JOB_SQL, (error_message, error_type, error_stack, now, job_id))
                if row:
                    conn.execute(
                        MOVE_TO_DLQ_SQL,
                        (job_id, row["payload"], error_message, error_type, shard_id),
                    )
                conn.execute(UPDATE_META_FAILED_SQL, (shard_id,))
            else:
                row = conn.execute(
                    "SELECT tentativas, max_tentativas FROM fila WHERE id=?", (job_id,)
                ).fetchone()
                if row:
                    # NOTE(review): the claim UPDATE already increments
                    # tentativas, so the +1 here looks like it counts the
                    # current attempt twice (max_tentativas=3 gives two runs).
                    # Left unchanged; confirm the intended limit.
                    attempts = row["tentativas"] + 1
                    if attempts >= row["max_tentativas"]:
                        conn.commit()
                        return self.fail_job(shard_id, job_id, error, permanent=True)
                    delay = backoff_delay(attempts)
                    next_retry = (
                        (datetime.now(timezone.utc) + timedelta(seconds=delay))
                        .isoformat(timespec="milliseconds")
                        .replace("+00:00", "Z")
                    )
                    conn.execute(
                        RETRY_SCHEDULE_SQL,
                        (next_retry, error_message, error_type, error_stack, job_id),
                    )
            conn.commit()

    def retry_failed_jobs(self, shard_id: int) -> int:
        """Reset every failed job to pending; return how many were reset."""
        conn = self._get_connection(shard_id)
        cur = conn.execute(RETRY_FAILED_SQL, (now_iso(),))
        conn.commit()
        return cur.rowcount

    def cleanup_old_jobs(self, shard_id: int, days: int = 7) -> int:
        """Delete finished jobs and dead letters older than ``days`` days.

        Returns:
            Number of rows removed from the job table (dead letter rows are
            deleted too but not counted).
        """
        cutoff = (
            (datetime.now(timezone.utc) - timedelta(days=days))
            .isoformat(timespec="milliseconds")
            .replace("+00:00", "Z")
        )
        conn = self._get_connection(shard_id)
        # created_at / failed_at default to datetime('now'), i.e.
        # 'YYYY-MM-DD HH:MM:SS', while the cutoff is ISO 'YYYY-MM-DDTHH:MM:SS.mmmZ'.
        # Compared as raw strings ' ' sorts before 'T', so every row from the
        # cutoff day looked older than the cutoff. Normalising both sides with
        # SQLite's datetime() makes the comparison format independent.
        cur = conn.execute(
            "DELETE FROM fila WHERE datetime(created_at)<datetime(?) AND status IN ('failed','completed')",
            (cutoff,),
        )
        conn.execute("DELETE FROM dead_letter_queue WHERE datetime(failed_at)<datetime(?)", (cutoff,))
        conn.commit()
        self._maybe_vacuum(shard_id)
        return cur.rowcount

    def _maybe_vacuum(self, shard_id: int) -> None:
        """VACUUM the shard if ``vacuum_interval_hours`` elapsed since the last run."""
        if self.vacuum_interval_hours <= 0:
            return
        now = time.time()
        if now - self._last_vacuum_time.get(shard_id, 0) < self.vacuum_interval_hours * 3600:
            return
        try:
            conn = self._get_connection(shard_id)
            conn.execute("VACUUM")
            conn.execute(
                "UPDATE shard_metadata SET last_vacuum=? WHERE shard_id=?",
                (now_iso(), shard_id),
            )
            conn.commit()
            self._last_vacuum_time[shard_id] = now
        except sqlite3.Error:
            # Vacuuming is best-effort maintenance: a database error here
            # (e.g. the file is busy) must not fail the cleanup that
            # triggered it. The next cleanup simply tries again.
            pass

    def get_failed_jobs(self, shard_id: int, limit: int = 100) -> List[Job]:
        """Return up to ``limit`` failed jobs, newest id first."""
        conn = self._get_connection(shard_id)
        rows = conn.execute(
            "SELECT * FROM fila WHERE status='failed' ORDER BY id DESC LIMIT ?", (limit,)
        ).fetchall()
        return [Job.from_row(dict(r), shard_id=shard_id) for r in rows]

    def get_dead_letter_queue(self, shard_id: int, limit: int = 100) -> List[Dict]:
        """Return up to ``limit`` dead letter rows as dicts, most recent first."""
        conn = self._get_connection(shard_id)
        rows = conn.execute(
            "SELECT * FROM dead_letter_queue ORDER BY failed_at DESC LIMIT ?", (limit,)
        ).fetchall()
        return [dict(r) for r in rows]

    def get_processing_jobs(self, shard_id: int) -> List[Job]:
        """Return every job currently in the processing state, oldest id first."""
        conn = self._get_connection(shard_id)
        rows = conn.execute(
            "SELECT * FROM fila WHERE status='processing' ORDER BY id ASC"
        ).fetchall()
        return [Job.from_row(dict(r), shard_id=shard_id) for r in rows]

    def heartbeat(self, shard_id: int, worker_id: str) -> None:
        """Refresh the heartbeat of all jobs ``worker_id`` is processing."""
        conn = self._get_connection(shard_id)
        conn.execute(HEARTBEAT_SQL, (now_iso(), worker_id))
        conn.commit()

    def recover_orphans(self, shard_id: int, stuck_timeout: int = 30000) -> int:
        """Return stale processing jobs to pending.

        Args:
            shard_id: Shard to scan.
            stuck_timeout: Heartbeat age in milliseconds after which a
                processing job is considered orphaned.

        Returns:
            Number of jobs recovered.
        """
        now = now_iso()
        threshold = (
            (datetime.now(timezone.utc) - timedelta(seconds=stuck_timeout / 1000))
            .isoformat(timespec="milliseconds")
            .replace("+00:00", "Z")
        )
        conn = self._get_connection(shard_id)
        cur = conn.execute(RECOVER_ORPHAN_SQL, (now, threshold))
        conn.commit()
        return cur.rowcount

    def get_metrics(self, shard_id: int) -> ShardMetrics:
        """Collect counters for one shard.

        Never raises: on any error an unhealthy ``ShardMetrics`` carrying
        the error text is returned instead.
        """
        try:
            conn = self._get_connection(shard_id)
            row = conn.execute(GET_METRICS_SQL).fetchone()
            meta = conn.execute(
                "SELECT total_jobs_processed, total_jobs_failed FROM shard_metadata WHERE shard_id=?",
                (shard_id,),
            ).fetchone()
            return ShardMetrics(
                shard_id,
                row["pending"] or 0,
                row["processing"] or 0,
                row["failed"] or 0,
                row["avg_processing_time"],
                meta["total_jobs_processed"] if meta else 0,
                meta["total_jobs_failed"] if meta else 0,
            )
        except Exception as e:
            return ShardMetrics(shard_id=shard_id, is_healthy=False, last_error=str(e))

    def get_stats(self, shard_id: int) -> Dict[str, int]:
        """Return the pending / processing / failed counts of one shard."""
        metrics = self.get_metrics(shard_id)
        return {
            "pending": metrics.pending,
            "processing": metrics.processing,
            "failed": metrics.failed,
        }

    def get_all_stats(self) -> Dict[str, int]:
        """Return pending / processing / failed counts summed over all shards."""
        total: Dict[str, int] = {"pending": 0, "processing": 0, "failed": 0}
        for shard_id in range(self.num_shards):
            shard_stats = self.get_stats(shard_id)
            for key in total:
                total[key] += shard_stats[key]
        return total

    def close_all(self) -> None:
        """Close the calling thread's shard connections.

        Connections are thread-local, so connections opened by other
        threads are not visible here and stay open.
        """
        if hasattr(self._local, "connections"):
            for conn in self._local.connections.values():
                try:
                    conn.close()
                except Exception:
                    # Best-effort teardown: keep closing the remaining ones.
                    pass
            self._local.connections.clear()
|