queue-max 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,117 @@
1
+ """FastAPI integration for Robusta Queue.
2
+
3
+ Provides background task processing via dependency injection
4
+ and middleware for automatic queue management.
5
+
6
+ Usage:
7
+ from fastapi import FastAPI
8
+ from queue_max.contrib.fastapi import QueueMiddleware, BackgroundQueue
9
+
10
+ app = FastAPI()
11
+ app.add_middleware(QueueMiddleware, max_workers=4)
12
+
13
+ @app.post('/webhook')
14
+ async def webhook(payload: dict, background: BackgroundQueue):
15
+ background.enqueue('process_webhook', payload=payload)
16
+ return {'status': 'accepted'}
17
+ """
18
+
19
+ import logging
20
+ from typing import Any, Callable, Dict, Optional
21
+
22
+ from queue_max import Queue, Worker
23
+
24
+ logger = logging.getLogger("queue_max.fastapi")
25
+
26
+
27
class BackgroundQueue:
    """FastAPI dependency for background task enqueuing.

    Thin wrapper around a queue object: endpoints call :meth:`enqueue`
    and the call is forwarded to the wrapped queue's ``enqueue`` method.
    """

    def __init__(self, queue: "Queue"):
        """Remember the queue that tasks are forwarded to.

        Args:
            queue: Object exposing
                ``enqueue(payload=..., pagina_id=..., priority=...)``.
        """
        self._queue = queue

    def enqueue(
        self,
        task_name: str,
        payload: Dict[str, Any],
        pagina_id: Optional[int] = None,
        priority: int = 0,
    ) -> Dict[str, Any]:
        """Enqueue a background task.

        The task name is stored in the payload under the ``"task"`` key.

        Args:
            task_name: Name of the task to enqueue.
            payload: Task payload data. It is copied, never mutated.
            pagina_id: Optional ID for consistent sharding.
            priority: Task priority (0, 1, 2).

        Returns:
            Whatever the wrapped queue's ``enqueue`` returns (documented
            by the package as a dict with 'id' and 'shard_id').
        """
        full_payload = {"task": task_name, **payload}
        # Re-assert the explicit task name: a stray "task" key inside the
        # caller's payload must not silently reroute the job.  Assigning to
        # an existing key keeps "task" first in the serialized payload.
        full_payload["task"] = task_name
        return self._queue.enqueue(
            payload=full_payload,
            pagina_id=pagina_id,
            priority=priority,
        )
61
+
62
+
63
class QueueMiddleware:
    """ASGI middleware for Robusta Queue lifecycle management.

    Starts the background worker pool when the application starts and
    stops it on shutdown.  The lifecycle is driven by the ASGI ``lifespan``
    messages that pass through :meth:`__call__`.  If the wrapped ``app``
    also exposes ``on_event`` the startup/shutdown hooks are registered
    there too; starting and stopping are idempotent, so both paths can
    fire without creating a second pool.
    """

    def __init__(
        self,
        app: Any,
        queue: "Optional[Queue]" = None,
        max_workers: int = 2,
        process_function: Optional[Callable] = None,
    ):
        """Wrap ``app`` and prepare (but do not start) the worker pool.

        Args:
            app: The ASGI application (or next middleware) to delegate to.
            queue: Queue the workers consume from; a default ``Queue()`` is
                created when omitted.
            max_workers: Number of workers to create on startup.
            process_function: Callable handed to every worker.  When it is
                not set, no workers are started.
        """
        self.app = app
        # Identity check rather than truthiness, so a supplied queue that
        # happens to be falsy (e.g. empty) is not replaced.
        self.queue = queue if queue is not None else Queue()
        self.max_workers = max_workers
        self.process_function = process_function
        self._pool = None

        # NOTE(review): when installed through ``add_middleware`` the object
        # received here is the next ASGI layer, which may not provide
        # ``on_event``; calling it unconditionally raised AttributeError.
        on_event = getattr(app, "on_event", None)
        if callable(on_event):

            @on_event("startup")
            async def startup():
                if self.process_function:
                    self._start_workers()

            @on_event("shutdown")
            async def shutdown():
                self._stop_workers()

    def _start_workers(self) -> None:
        """Start the background worker pool (no-op if already running)."""
        if not self.process_function or getattr(self, "_pool", None) is not None:
            return
        from queue_max import WorkerPool

        workers = [
            Worker(
                worker_id=f"fastapi-worker-{index + 1}",
                process_function=self.process_function,
                queue=self.queue,
            )
            for index in range(self.max_workers)
        ]
        pool = WorkerPool(workers)
        pool.start_all()
        # Publish the pool only after it started successfully.
        self._pool = pool
        logger.info("Started %s FastAPI worker(s)", self.max_workers)

    def _stop_workers(self) -> None:
        """Stop background workers gracefully (no-op if none are running)."""
        pool = getattr(self, "_pool", None)
        if pool is None:
            return
        # Clear first so a second shutdown signal does not stop twice.
        self._pool = None
        pool.stop_all()
        logger.info("FastAPI workers stopped")

    async def __call__(self, scope, receive, send) -> None:
        """ASGI callable.

        Non-lifespan scopes are passed straight through.  For the
        ``lifespan`` scope the ``receive`` channel is wrapped so that the
        startup and shutdown messages start and stop the worker pool before
        being handed to the wrapped application.
        """
        if scope["type"] != "lifespan":
            await self.app(scope, receive, send)
            return

        async def lifespan_receive():
            message = await receive()
            kind = message["type"]
            if kind == "lifespan.startup":
                self._start_workers()
            elif kind == "lifespan.shutdown":
                self._stop_workers()
            return message

        await self.app(scope, lifespan_receive, send)
@@ -0,0 +1,99 @@
1
+ """Flask integration for Robusta Queue.
2
+
3
+ Provides an extension pattern for easy integration with Flask apps.
4
+
5
+ Usage:
6
+ from flask import Flask
7
+ from queue_max.contrib.flask import QueueExtension
8
+
9
+ app = Flask(__name__)
10
+ queue = QueueExtension(app)
11
+
12
+ @queue.task
13
+ def send_notification(user_id):
14
+ # send notification
15
+ pass
16
+
17
+ @app.route('/notify/<int:user_id>')
18
+ def notify(user_id):
19
+ send_notification.delay(user_id=user_id)
20
+ return 'OK'
21
+ """
22
+
23
+ import logging
24
+ from typing import Any, Callable, Dict, Optional
25
+
26
+ from queue_max import Queue as BaseQueue
27
+ from queue_max import task as base_task
28
+
29
+ logger = logging.getLogger("queue_max.flask")
30
+
31
+
32
class QueueExtension:
    """Flask extension for Robusta Queue.

    Registers itself under ``app.extensions['queue_max']`` and offers a
    ``@task`` decorator bound to the extension's queue.

    Attributes:
        queue: The underlying Queue instance.
        app: The application passed to the constructor (``None`` when the
            extension is bound later through :meth:`init_app`).
    """

    def __init__(
        self,
        app: Any = None,
        queue: "Optional[BaseQueue]" = None,
    ):
        """Initialize the extension.

        Args:
            app: Flask application instance (optional, can be init_app later).
            queue: Queue instance (creates default if None).
        """
        # Identity check rather than truthiness, so a supplied queue that
        # evaluates as falsy (e.g. an empty one) is still used.
        self.queue = queue if queue is not None else BaseQueue()
        self.app = app
        if app is not None:
            self.init_app(app)

    def init_app(self, app: Any) -> None:
        """Register the extension on a Flask app.

        Creates ``app.extensions`` when the attribute is missing and stores
        this object under the ``"queue_max"`` key.

        Args:
            app: Flask application instance.
        """
        if not hasattr(app, "extensions"):
            app.extensions = {}
        app.extensions["queue_max"] = self

    def task(
        self,
        func: Optional[Callable] = None,
        priority: int = 0,
        max_retries: Optional[int] = None,
    ) -> Callable:
        """Decorator that registers a function as a queue task.

        Can be used with or without arguments:

            @queue.task
            def my_task():
                pass

            @queue.task(priority=2)
            def my_task():
                pass

        Args:
            func: The function to decorate (when used without arguments).
            priority: Task priority (0, 1, 2).
            max_retries: Maximum retry attempts.

        Returns:
            The result of applying the package-level ``task`` decorator to
            ``func``, or the decorator itself when ``func`` is omitted.
        """
        decorator = base_task(
            queue=self.queue, priority=priority, max_retries=max_retries
        )
        return decorator if func is None else decorator(func)

    def get_stats(self) -> Dict[str, Any]:
        """Return the underlying queue's statistics."""
        return self.queue.get_stats()
@@ -0,0 +1,16 @@
1
+ """Core modules for Robusta Queue."""
2
+
3
+ from queue_max.core.circuit_breaker import CircuitBreaker, CircuitState
4
+ from queue_max.core.rate_limiter import RateLimitUnit, RateLimiter
5
+ from queue_max.core.worker import AsyncWorker, Worker, WorkerPool, WorkerState
6
+
7
+ __all__ = [
8
+ "RateLimiter",
9
+ "RateLimitUnit",
10
+ "CircuitBreaker",
11
+ "CircuitState",
12
+ "Worker",
13
+ "AsyncWorker",
14
+ "WorkerPool",
15
+ "WorkerState",
16
+ ]
@@ -0,0 +1,162 @@
1
+ """Circuit breaker implementation for Queue Max.
2
+
3
+ Implements the standard circuit breaker pattern with three states:
4
+ - CLOSED: Normal operation, requests pass through
5
+ - OPEN: Circuit is tripped, requests are rejected immediately
6
+ - HALF_OPEN: Testing state, allows one request through to check if service recovered
7
+ """
8
+
9
+ import threading
10
+ import time
11
+ from enum import Enum
12
+ from typing import Callable, Optional
13
+
14
+
15
class CircuitState(Enum):
    """Circuit breaker states."""

    CLOSED = "closed"        # normal operation, calls pass through
    OPEN = "open"            # tripped, calls are rejected
    HALF_OPEN = "half_open"  # recovery test, a single probe call is allowed


class CircuitBreaker:
    """Circuit breaker for external service calls.

    Prevents cascading failures by failing fast when a service is unhealthy.

    Attributes:
        failure_threshold: Number of consecutive failures to trip the circuit.
        recovery_timeout: Seconds to wait before attempting recovery (HALF_OPEN).
        on_state_change: Optional ``callback(old_state, new_state)``.
        state: Current circuit state.
    """

    def __init__(
        self,
        failure_threshold: int = 5,
        recovery_timeout: float = 60.0,
        on_state_change: Optional[Callable[[CircuitState, CircuitState], None]] = None,
    ):
        """Initialize the circuit breaker.

        Args:
            failure_threshold: Consecutive failures before opening (default: 5).
            recovery_timeout: Seconds before attempting recovery (default: 60).
            on_state_change: Optional callback for state transitions.
        """
        self.failure_threshold = failure_threshold
        self.recovery_timeout = recovery_timeout
        self.on_state_change = on_state_change

        self.state = CircuitState.CLOSED
        self._failure_count = 0
        self._last_failure_time = 0.0
        # True while the single HALF_OPEN probe call is running.
        self._probe_in_flight = False
        # Re-entrant lock: on_state_change is invoked while the lock is
        # held, and a callback that calls get_stats()/reset() on this
        # breaker would deadlock on a plain Lock.
        self._mutex = threading.RLock()

    def call(self, func: Callable, *args, **kwargs):
        """Execute a function with circuit breaker protection.

        Args:
            func: The function to execute.
            *args: Arguments for the function.
            **kwargs: Keyword arguments for the function.

        Returns:
            The return value of the function.

        Raises:
            CircuitBreakerOpenError: If the call is rejected (circuit open,
                or half-open with a probe already running).
            Exception: Re-raises any exception from the called function.
        """
        if not self._try_call():
            from queue_max.exceptions import CircuitBreakerOpenError

            raise CircuitBreakerOpenError(
                f"Circuit breaker is OPEN (state: {self.state.value}). "
                f"Recovery in {self._recovery_remaining():.0f}s. "
                f"Failures: {self._failure_count}/{self.failure_threshold}"
            )

        try:
            result = func(*args, **kwargs)
            self._on_success()
            return result
        except Exception:
            self._on_failure()
            raise
        except BaseException:
            # Not counted as a failure (e.g. KeyboardInterrupt), but the
            # probe slot must be released or HALF_OPEN would reject forever.
            with self._mutex:
                self._probe_in_flight = False
            raise

    def _try_call(self) -> bool:
        """Check if a call should be allowed through.

        Returns:
            True if the call is allowed, False if rejected.
        """
        with self._mutex:
            if self.state == CircuitState.CLOSED:
                return True
            if self.state == CircuitState.OPEN:
                elapsed = time.monotonic() - self._last_failure_time
                if elapsed < self.recovery_timeout:
                    return False
                self._set_state(CircuitState.HALF_OPEN)
                # Set after the transition so a raising callback cannot
                # leave the probe slot permanently taken.
                self._probe_in_flight = True
                return True
            if self.state == CircuitState.HALF_OPEN:
                # Only one probe at a time while testing recovery.
                if self._probe_in_flight:
                    return False
                self._probe_in_flight = True
                return True
            return False

    def _on_success(self) -> None:
        """Handle a successful call: close the circuit and clear failures."""
        with self._mutex:
            self._probe_in_flight = False
            if self.state == CircuitState.HALF_OPEN:
                self._set_state(CircuitState.CLOSED)
            self._failure_count = 0

    def _on_failure(self) -> None:
        """Handle a failed call: count it and open the circuit if needed."""
        with self._mutex:
            self._probe_in_flight = False
            self._failure_count += 1
            self._last_failure_time = time.monotonic()
            if self.state == CircuitState.HALF_OPEN:
                # The probe failed; go straight back to OPEN.
                self._set_state(CircuitState.OPEN)
            elif (
                self.state == CircuitState.CLOSED
                and self._failure_count >= self.failure_threshold
            ):
                self._set_state(CircuitState.OPEN)

    def _set_state(self, new_state: CircuitState) -> None:
        """Set the circuit state and trigger callback if configured.

        The callback only fires when the state actually changes.

        Args:
            new_state: The new circuit state.
        """
        old_state = self.state
        self.state = new_state
        if self.on_state_change and old_state != new_state:
            self.on_state_change(old_state, new_state)

    def _recovery_remaining(self) -> float:
        """Calculate seconds remaining until recovery attempt.

        Returns 0.0 unless the circuit is OPEN; there is nothing to wait
        for in the other states.
        """
        if self.state != CircuitState.OPEN:
            return 0.0
        remaining = self.recovery_timeout - (time.monotonic() - self._last_failure_time)
        return max(0.0, remaining)

    def reset(self) -> None:
        """Reset the circuit breaker to closed state."""
        with self._mutex:
            self._set_state(CircuitState.CLOSED)
            self._failure_count = 0
            self._last_failure_time = 0.0
            self._probe_in_flight = False

    def get_stats(self) -> dict:
        """Get current circuit breaker statistics.

        Returns:
            Dict with 'state', 'failure_count', 'failure_threshold',
            'recovery_timeout' and 'recovery_remaining' (rounded to 0.1 s).
        """
        with self._mutex:
            return {
                "state": self.state.value,
                "failure_count": self._failure_count,
                "failure_threshold": self.failure_threshold,
                "recovery_timeout": self.recovery_timeout,
                "recovery_remaining": round(self._recovery_remaining(), 1),
            }
@@ -0,0 +1,253 @@
1
+ """Database and shard management for Queue Max.
2
+
3
+ Each shard is an independent SQLite database file with WAL mode.
4
+ Uses thread-local connections for thread safety.
5
+ """
6
+
7
+ import json, os, sqlite3, threading, time, traceback
8
+ from contextlib import contextmanager
9
+ from dataclasses import dataclass
10
+ from datetime import datetime, timedelta, timezone
11
+ from typing import Any, Dict, List, Optional
12
+
13
+ from queue_max.models.job import Job, JobStatus
14
+ from queue_max.utils.helpers import backoff_delay, get_env_int, now_iso
15
+
16
# --- Tunables, read once at import time (environment overrides defaults) ---
NUM_SHARDS = get_env_int("NUM_SHARDS", 6)
# Milliseconds: fed to PRAGMA busy_timeout and divided by 1000 for sqlite3.connect.
DB_BUSY_TIMEOUT = get_env_int("DB_BUSY_TIMEOUT", 30000)
DATA_DIR = os.environ.get("DATA_DIR", "./data")
CACHE_SIZE = get_env_int("CACHE_SIZE", 10000)
MMAP_SIZE = get_env_int("MMAP_SIZE", 268435456)

# --- Schema: job table ("fila"), per-shard counters, dead-letter queue ---
SCHEMA_SQL = """
CREATE TABLE IF NOT EXISTS fila (
    id INTEGER PRIMARY KEY AUTOINCREMENT, pagina_id INTEGER NULL,
    payload TEXT NOT NULL, status TEXT DEFAULT 'pending',
    priority INTEGER DEFAULT 0, tentativas INTEGER DEFAULT 0,
    max_tentativas INTEGER DEFAULT 3, retry_delay INTEGER DEFAULT 60,
    last_error TEXT NULL, error_type TEXT NULL, error_stack TEXT NULL,
    worker_id TEXT NULL, heartbeat TEXT NULL,
    created_at TEXT DEFAULT (datetime('now')),
    started_at TEXT NULL, completed_at TEXT NULL, next_retry_at TEXT NULL
);
CREATE TABLE IF NOT EXISTS shard_metadata (
    shard_id INTEGER PRIMARY KEY, version INTEGER DEFAULT 1,
    created_at TEXT DEFAULT (datetime('now')), last_vacuum TEXT NULL,
    total_jobs_processed INTEGER DEFAULT 0, total_jobs_failed INTEGER DEFAULT 0
);
CREATE TABLE IF NOT EXISTS dead_letter_queue (
    id INTEGER PRIMARY KEY AUTOINCREMENT, original_job_id INTEGER,
    payload TEXT NOT NULL, error TEXT NOT NULL, error_type TEXT NOT NULL,
    failed_at TEXT DEFAULT (datetime('now')), shard_id INTEGER
);
"""

# Executed one statement at a time after the schema script.
INDEXES_SQL = [
    "CREATE INDEX IF NOT EXISTS idx_status_priority ON fila(status, priority DESC);",
    "CREATE INDEX IF NOT EXISTS idx_next_retry ON fila(next_retry_at) WHERE status = 'pending';",
    "CREATE INDEX IF NOT EXISTS idx_heartbeat ON fila(heartbeat) WHERE status = 'processing';",
    "CREATE INDEX IF NOT EXISTS idx_created_at ON fila(created_at);",
    "CREATE INDEX IF NOT EXISTS idx_status_created ON fila(status, created_at);",
    "CREATE INDEX IF NOT EXISTS idx_dlq_failed_at ON dead_letter_queue(failed_at);",
]

# Applied to every connection right after it is opened.
PRAGMAS_SQL = [
    "PRAGMA journal_mode = WAL;",
    "PRAGMA synchronous = NORMAL;",
    f"PRAGMA cache_size = {CACHE_SIZE};",
    f"PRAGMA mmap_size = {MMAP_SIZE};",
    "PRAGMA temp_store = MEMORY;",
    f"PRAGMA busy_timeout = {DB_BUSY_TIMEOUT};",
]

# --- Job lifecycle statements ---
# Next runnable job: pending, retry time reached, highest priority, oldest first.
POP_JOB_SELECT_SQL = (
    "SELECT * FROM fila "
    "WHERE status='pending' AND (next_retry_at IS NULL OR next_retry_at<=?) "
    "ORDER BY priority DESC, id ASC LIMIT 1;"
)
# Claiming a job also increments its attempt counter (tentativas).
CLAIM_JOB_SQL = (
    "UPDATE fila SET status='processing', worker_id=?, heartbeat=?, started_at=?, "
    "tentativas=tentativas+1 WHERE id=? AND status='pending';"
)
COMPLETE_JOB_SQL = "DELETE FROM fila WHERE id=?;"
FAIL_JOB_SQL = (
    "UPDATE fila SET status='failed', last_error=?, error_type=?, error_stack=?, "
    "worker_id=NULL, heartbeat=NULL, completed_at=? WHERE id=?;"
)
RETRY_SCHEDULE_SQL = (
    "UPDATE fila SET status='pending', next_retry_at=?, last_error=?, "
    "error_type=?, error_stack=? WHERE id=?;"
)
RETRY_FAILED_SQL = (
    "UPDATE fila SET status='pending', next_retry_at=?, worker_id=NULL, heartbeat=NULL, "
    "last_error=NULL, error_type=NULL, error_stack=NULL WHERE status='failed';"
)
HEARTBEAT_SQL = "UPDATE fila SET heartbeat=? WHERE worker_id=? AND status='processing';"
# Processing jobs whose heartbeat is missing or older than the cutoff go back to pending.
RECOVER_ORPHAN_SQL = (
    "UPDATE fila SET status='pending', worker_id=NULL, heartbeat=NULL, next_retry_at=?, "
    "last_error='Recovered orphan', error_type='OrphanRecovery' "
    "WHERE status='processing' AND (heartbeat IS NULL OR heartbeat<?);"
)
MOVE_TO_DLQ_SQL = (
    "INSERT INTO dead_letter_queue (original_job_id, payload, error, error_type, shard_id) "
    "VALUES (?, ?, ?, ?, ?);"
)

# --- Shard bookkeeping and metrics ---
UPDATE_META_PROCESSED_SQL = "UPDATE shard_metadata SET total_jobs_processed=total_jobs_processed+1 WHERE shard_id=?;"
UPDATE_META_FAILED_SQL = "UPDATE shard_metadata SET total_jobs_failed=total_jobs_failed+1 WHERE shard_id=?;"
INSERT_META_SQL = "INSERT OR IGNORE INTO shard_metadata (shard_id, version) VALUES (?, ?);"
# Per-status counts plus the average age (seconds) of jobs currently processing.
GET_METRICS_SQL = (
    "SELECT COUNT(CASE WHEN status='pending' THEN 1 END) as pending, "
    "COUNT(CASE WHEN status='processing' THEN 1 END) as processing, "
    "COUNT(CASE WHEN status='failed' THEN 1 END) as failed, "
    "AVG(CASE WHEN status='processing' AND started_at IS NOT NULL "
    "THEN (julianday('now')-julianday(started_at))*86400.0 END) as avg_processing_time "
    "FROM fila;"
)
72
+
73
+
74
+ @dataclass
75
+ class ShardMetrics:
76
+ shard_id: int; pending: int = 0; processing: int = 0; failed: int = 0
77
+ avg_processing_time: Optional[float] = None
78
+ total_jobs_processed: int = 0; total_jobs_failed: int = 0
79
+ is_healthy: bool = True; last_error: Optional[str] = None
80
+
81
+
82
class ShardManager:
    """Manages SQLite shards with thread-local connections.

    Each shard is a separate ``shard_<id>.db`` file inside ``data_dir``.
    Connections are cached per thread and per shard, so every method works
    on the calling thread's own connection.
    """

    def __init__(self, num_shards: int = NUM_SHARDS, data_dir: str = DATA_DIR):
        """Create the data directory and initialise every shard file.

        Args:
            num_shards: Number of shard databases (must be >= 1).
            data_dir: Directory that holds the shard files.

        Raises:
            ValueError: If ``num_shards`` is smaller than 1.
        """
        if num_shards < 1:
            raise ValueError(f"num_shards must be >= 1, got {num_shards}")
        self.num_shards = num_shards
        self.data_dir = data_dir
        self.vacuum_interval_hours = 24
        self._local = threading.local()
        self._last_vacuum_time: Dict[int, float] = {}
        os.makedirs(data_dir, exist_ok=True)
        self._init_all_shards()

    @staticmethod
    def _iso_utc(moment: datetime) -> str:
        """Format an aware UTC datetime as ISO-8601 with ms and a 'Z' suffix."""
        return moment.isoformat(timespec="milliseconds").replace("+00:00", "Z")

    def _shard_path(self, shard_id: int) -> str:
        """Return the database file path of a shard."""
        return os.path.join(self.data_dir, f"shard_{shard_id}.db")

    def _init_all_shards(self) -> None:
        """Apply pragmas, schema and indexes to every shard file."""
        for shard_id in range(self.num_shards):
            conn = sqlite3.connect(
                self._shard_path(shard_id), timeout=DB_BUSY_TIMEOUT / 1000
            )
            try:
                for pragma in PRAGMAS_SQL:
                    conn.execute(pragma)
                conn.executescript(SCHEMA_SQL)
                for index_sql in INDEXES_SQL:
                    conn.execute(index_sql)
                # INSERT OR IGNORE is idempotent, so run it unconditionally:
                # an existing file without its metadata row would otherwise
                # never get one and the counter UPDATEs would match nothing.
                conn.execute(INSERT_META_SQL, (shard_id, 1))
                conn.commit()
            finally:
                conn.close()

    @contextmanager
    def get_connection(self, shard_id: int):
        """Yield this thread's connection; roll back if the body raises.

        The connection is neither committed nor closed on exit; callers
        commit explicitly.
        """
        conn = self._get_connection(shard_id)
        try:
            yield conn
        except Exception:
            conn.rollback()
            raise

    def _get_connection(self, shard_id: int) -> sqlite3.Connection:
        """Return (creating on first use) this thread's connection to a shard."""
        cache = getattr(self._local, "connections", None)
        if cache is None:
            cache = self._local.connections = {}
        conn = cache.get(shard_id)
        if conn is None:
            conn = sqlite3.connect(
                self._shard_path(shard_id), timeout=DB_BUSY_TIMEOUT / 1000
            )
            conn.row_factory = sqlite3.Row
            for pragma in PRAGMAS_SQL:
                conn.execute(pragma)
            cache[shard_id] = conn
        return conn

    def insert_job(self, shard_id: int, payload: Dict[str, Any], pagina_id: Optional[int] = None, priority: int = 0, max_retries: Optional[int] = None) -> int:
        """Insert a pending job and return its row id.

        Args:
            shard_id: Target shard.
            payload: JSON-serialisable job payload.
            pagina_id: Optional id stored alongside the job.
            priority: Job priority; higher values are popped first.
            max_retries: Attempt limit stored in ``max_tentativas``; when
                None the ``QUEUE_MAX_RETRIES`` setting (default 3) is used.
        """
        # Only None selects the default; an explicit 0 is honoured.
        if max_retries is None:
            max_retries = get_env_int("QUEUE_MAX_RETRIES", 3)
        conn = self._get_connection(shard_id)
        cur = conn.execute(
            "INSERT INTO fila (pagina_id, payload, priority, max_tentativas) VALUES (?, ?, ?, ?)",
            (pagina_id, json.dumps(payload), priority, max_retries),
        )
        conn.commit()
        return cur.lastrowid

    def pop_job(self, shard_id: int, worker_id: str) -> Optional[Job]:
        """Claim the next runnable job for ``worker_id``.

        Returns:
            The claimed job, or None when nothing is runnable, the claim
            did not update a row, or SQLite raised an OperationalError
            (the transaction is rolled back in that case).
        """
        conn = self._get_connection(shard_id)
        now = now_iso()
        try:
            # IMMEDIATE takes the write lock up front so select + claim
            # happen in one transaction.
            conn.execute("BEGIN IMMEDIATE")
            row = conn.execute(POP_JOB_SELECT_SQL, (now,)).fetchone()
            if row is None:
                conn.commit()
                return None
            claimed = conn.execute(CLAIM_JOB_SQL, (worker_id, now, now, row["id"]))
            conn.commit()
        except sqlite3.OperationalError:
            conn.rollback()
            return None
        if claimed.rowcount == 0:
            return None
        # NOTE: built from the row as read before the claim, then patched.
        job = Job.from_row(dict(row), shard_id=shard_id)
        job.status = JobStatus.PROCESSING
        job.worker_id = worker_id
        return job

    def complete_job(self, shard_id: int, job_id: int) -> None:
        """Delete a finished job and bump the shard's processed counter."""
        # get_connection rolls back on error, so a failed statement cannot
        # leave a transaction open on the cached connection.
        with self.get_connection(shard_id) as conn:
            conn.execute("BEGIN IMMEDIATE")
            conn.execute(COMPLETE_JOB_SQL, (job_id,))
            conn.execute(UPDATE_META_PROCESSED_SQL, (shard_id,))
            conn.commit()

    def fail_job(self, shard_id: int, job_id: int, error: Exception, permanent: bool = False) -> None:
        """Record a failure: schedule a retry or fail the job for good.

        Args:
            shard_id: Shard that holds the job.
            job_id: Id of the failed job.
            error: The exception raised by the job.
            permanent: Skip retries and fail immediately.

        A non-permanent failure is retried (with ``backoff_delay``) while
        the job's attempt count is below ``max_tentativas``; otherwise the
        job is marked failed, copied to the dead-letter queue and counted
        in the shard metadata.  An unknown ``job_id`` on the retry path is
        a no-op.
        """
        now = now_iso()
        error_type = type(error).__name__
        error_message = str(error)
        error_stack = "".join(
            traceback.format_exception(type(error), error, error.__traceback__)
        )
        with self.get_connection(shard_id) as conn:
            if not permanent:
                row = conn.execute(
                    "SELECT tentativas, max_tentativas FROM fila WHERE id=?", (job_id,)
                ).fetchone()
                if row is None:
                    conn.commit()
                    return
                # tentativas was already incremented when the job was
                # claimed, so it is the number of attempts made so far;
                # adding 1 again cost every job one of its attempts.
                attempts = row["tentativas"]
                if attempts < row["max_tentativas"]:
                    delay = backoff_delay(attempts)
                    next_retry = self._iso_utc(
                        datetime.now(timezone.utc) + timedelta(seconds=delay)
                    )
                    conn.execute(
                        RETRY_SCHEDULE_SQL,
                        (next_retry, error_message, error_type, error_stack, job_id),
                    )
                    conn.commit()
                    return
                # Attempts exhausted: fall through to the permanent path.
            row = conn.execute(
                "SELECT payload FROM fila WHERE id=?", (job_id,)
            ).fetchone()
            conn.execute(
                FAIL_JOB_SQL, (error_message, error_type, error_stack, now, job_id)
            )
            if row:
                conn.execute(
                    MOVE_TO_DLQ_SQL,
                    (job_id, row["payload"], error_message, error_type, shard_id),
                )
            conn.execute(UPDATE_META_FAILED_SQL, (shard_id,))
            conn.commit()

    def retry_failed_jobs(self, shard_id: int) -> int:
        """Move every failed job back to pending; return how many changed."""
        conn = self._get_connection(shard_id)
        cur = conn.execute(RETRY_FAILED_SQL, (now_iso(),))
        conn.commit()
        return cur.rowcount

    def cleanup_old_jobs(self, shard_id: int, days: int = 7) -> int:
        """Delete old finished jobs and dead-letter rows.

        Args:
            shard_id: Shard to clean.
            days: Rows older than this many days are removed.

        Returns:
            Number of rows deleted from the job table.
        """
        # created_at / failed_at are filled by SQLite's datetime('now'),
        # i.e. 'YYYY-MM-DD HH:MM:SS' in UTC.  The cutoff must use the same
        # layout: the comparison is textual and an ISO 'T...Z' string sorts
        # after every timestamp of the same day.
        cutoff = (datetime.now(timezone.utc) - timedelta(days=days)).strftime(
            "%Y-%m-%d %H:%M:%S"
        )
        conn = self._get_connection(shard_id)
        cur = conn.execute(
            "DELETE FROM fila WHERE created_at<? AND status IN ('failed','completed')",
            (cutoff,),
        )
        conn.execute("DELETE FROM dead_letter_queue WHERE failed_at<?", (cutoff,))
        conn.commit()
        self._maybe_vacuum(shard_id)
        return cur.rowcount

    def _maybe_vacuum(self, shard_id: int) -> None:
        """VACUUM the shard if ``vacuum_interval_hours`` have elapsed."""
        if self.vacuum_interval_hours <= 0:
            return
        now = time.time()
        last = self._last_vacuum_time.get(shard_id, 0)
        if now - last < self.vacuum_interval_hours * 3600:
            return
        try:
            conn = self._get_connection(shard_id)
            conn.execute("VACUUM")
            conn.execute(
                "UPDATE shard_metadata SET last_vacuum=? WHERE shard_id=?",
                (now_iso(), shard_id),
            )
            conn.commit()
            self._last_vacuum_time[shard_id] = now
        except Exception:
            # Deliberately best-effort: a failed VACUUM must not fail the
            # cleanup that triggered it; it is retried on the next call.
            pass

    def get_failed_jobs(self, shard_id: int, limit: int = 100) -> List[Job]:
        """Return up to ``limit`` failed jobs, newest id first."""
        conn = self._get_connection(shard_id)
        rows = conn.execute(
            "SELECT * FROM fila WHERE status='failed' ORDER BY id DESC LIMIT ?",
            (limit,),
        ).fetchall()
        return [Job.from_row(dict(row), shard_id=shard_id) for row in rows]

    def get_dead_letter_queue(self, shard_id: int, limit: int = 100) -> List[Dict]:
        """Return up to ``limit`` dead-letter rows as dicts, latest first."""
        conn = self._get_connection(shard_id)
        rows = conn.execute(
            "SELECT * FROM dead_letter_queue ORDER BY failed_at DESC LIMIT ?",
            (limit,),
        ).fetchall()
        return [dict(row) for row in rows]

    def get_processing_jobs(self, shard_id: int) -> List[Job]:
        """Return all jobs currently in 'processing', oldest id first."""
        conn = self._get_connection(shard_id)
        rows = conn.execute(
            "SELECT * FROM fila WHERE status='processing' ORDER BY id ASC"
        ).fetchall()
        return [Job.from_row(dict(row), shard_id=shard_id) for row in rows]

    def heartbeat(self, shard_id: int, worker_id: str) -> None:
        """Refresh the heartbeat of every job the worker is processing."""
        conn = self._get_connection(shard_id)
        conn.execute(HEARTBEAT_SQL, (now_iso(), worker_id))
        conn.commit()

    def recover_orphans(self, shard_id: int, stuck_timeout: int = 30000) -> int:
        """Return stuck 'processing' jobs to 'pending'.

        Args:
            shard_id: Shard to scan.
            stuck_timeout: Heartbeat age in milliseconds after which a job
                counts as orphaned.

        Returns:
            Number of jobs recovered.
        """
        now = now_iso()
        threshold = self._iso_utc(
            datetime.now(timezone.utc) - timedelta(seconds=stuck_timeout / 1000)
        )
        conn = self._get_connection(shard_id)
        cur = conn.execute(RECOVER_ORPHAN_SQL, (now, threshold))
        conn.commit()
        return cur.rowcount

    def get_metrics(self, shard_id: int) -> ShardMetrics:
        """Collect metrics for a shard.

        Never raises: any error yields a ShardMetrics with
        ``is_healthy=False`` and the error text in ``last_error``.
        """
        try:
            conn = self._get_connection(shard_id)
            row = conn.execute(GET_METRICS_SQL).fetchone()
            meta = conn.execute(
                "SELECT total_jobs_processed, total_jobs_failed FROM shard_metadata WHERE shard_id=?",
                (shard_id,),
            ).fetchone()
            processed = meta["total_jobs_processed"] if meta else 0
            failed_total = meta["total_jobs_failed"] if meta else 0
            return ShardMetrics(
                shard_id,
                row["pending"] or 0,
                row["processing"] or 0,
                row["failed"] or 0,
                row["avg_processing_time"],
                processed,
                failed_total,
            )
        except Exception as exc:
            return ShardMetrics(shard_id=shard_id, is_healthy=False, last_error=str(exc))

    def get_stats(self, shard_id: int) -> Dict[str, int]:
        """Return the pending/processing/failed counts of one shard."""
        metrics = self.get_metrics(shard_id)
        return {
            "pending": metrics.pending,
            "processing": metrics.processing,
            "failed": metrics.failed,
        }

    def get_all_stats(self) -> Dict[str, int]:
        """Return the pending/processing/failed counts summed over all shards."""
        total: Dict[str, int] = {"pending": 0, "processing": 0, "failed": 0}
        for shard_id in range(self.num_shards):
            stats = self.get_stats(shard_id)
            for key in total:
                total[key] += stats[key]
        return total

    def close_all(self) -> None:
        """Close the calling thread's cached connections.

        Connections opened by other threads live in their own thread-local
        cache and are not touched.
        """
        cache = getattr(self._local, "connections", None)
        if cache is None:
            return
        for conn in cache.values():
            try:
                conn.close()
            except Exception:
                # Best-effort: keep closing the remaining connections.
                pass
        cache.clear()