mcp-ticketer 0.1.26__py3-none-any.whl → 0.1.27__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



@@ -0,0 +1,322 @@
+"""Queue health monitoring and alerting system."""
+
+import asyncio
+import logging
+import time
+from datetime import datetime, timedelta
+from enum import Enum
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+import psutil
+
+from .manager import WorkerManager
+from .queue import Queue, QueueStatus
+
+logger = logging.getLogger(__name__)
+
+
+class HealthStatus(str, Enum):
+    """Health status levels."""
+    HEALTHY = "healthy"
+    WARNING = "warning"
+    CRITICAL = "critical"
+    FAILED = "failed"
+
+
+class HealthAlert:
+    """Health alert with severity and details."""
+
+    def __init__(
+        self,
+        level: HealthStatus,
+        message: str,
+        details: Optional[Dict[str, Any]] = None,
+        timestamp: Optional[datetime] = None
+    ):
+        self.level = level
+        self.message = message
+        self.details = details or {}
+        self.timestamp = timestamp or datetime.now()
+
+    def __str__(self) -> str:
+        return f"[{self.level.upper()}] {self.message}"
+
+
+class QueueHealthMonitor:
+    """Monitors queue health and provides immediate alerts."""
+
+    # Health check thresholds
+    WORKER_TIMEOUT_SECONDS = 30  # Worker should process items within 30s
+    STUCK_ITEM_THRESHOLD = 300  # 5 minutes for stuck items
+    HIGH_FAILURE_RATE = 0.3  # 30% failure rate is concerning
+    QUEUE_BACKLOG_WARNING = 10  # Warn if more than 10 pending items
+    QUEUE_BACKLOG_CRITICAL = 50  # Critical if more than 50 pending items
+
+    def __init__(self, queue: Optional[Queue] = None):
+        """Initialize health monitor.
+
+        Args:
+            queue: Queue instance to monitor. Creates new if None.
+        """
+        self.queue = queue or Queue()
+        self.manager = WorkerManager()
+        self.last_check = datetime.now()
+        self.alerts: List[HealthAlert] = []
+
+    def check_health(self) -> Dict[str, Any]:
+        """Perform comprehensive health check.
+
+        Returns:
+            Health status with alerts and metrics
+        """
+        self.alerts.clear()
+
+        # Check worker status
+        worker_health = self._check_worker_health()
+
+        # Check queue status
+        queue_health = self._check_queue_health()
+
+        # Check for stuck items
+        stuck_health = self._check_stuck_items()
+
+        # Check failure rates
+        failure_health = self._check_failure_rates()
+
+        # Determine overall health
+        overall_status = self._determine_overall_status()
+
+        health_report = {
+            "status": overall_status,
+            "timestamp": datetime.now().isoformat(),
+            "alerts": [{"level": alert.level, "message": alert.message, "details": alert.details} for alert in self.alerts],
+            "metrics": {
+                "worker": worker_health,
+                "queue": queue_health,
+                "stuck_items": stuck_health,
+                "failure_rate": failure_health
+            }
+        }
+
+        self.last_check = datetime.now()
+        return health_report
+
+    def _check_worker_health(self) -> Dict[str, Any]:
+        """Check worker process health."""
+        worker_status = self.manager.get_status()
+
+        metrics = {
+            "running": worker_status["running"],
+            "pid": worker_status.get("pid"),
+            "cpu_percent": worker_status.get("cpu_percent", 0),
+            "memory_mb": worker_status.get("memory_mb", 0)
+        }
+
+        if not worker_status["running"]:
+            # Check if we have pending items but no worker
+            pending_count = self.queue.get_pending_count()
+            if pending_count > 0:
+                self.alerts.append(HealthAlert(
+                    HealthStatus.CRITICAL,
+                    f"Worker not running but {pending_count} items pending",
+                    {"pending_count": pending_count, "action": "start_worker"}
+                ))
+            else:
+                self.alerts.append(HealthAlert(
+                    HealthStatus.WARNING,
+                    "Worker not running (no pending items)",
+                    {"action": "worker_idle"}
+                ))
+        else:
+            # Worker is running, check if it's responsive
+            pid = worker_status.get("pid")
+            if pid:
+                try:
+                    process = psutil.Process(pid)
+                    # Check if worker has been idle too long with pending items
+                    pending_count = self.queue.get_pending_count()
+                    if pending_count > 0:
+                        # Check for items that have been pending too long
+                        old_pending = self._get_old_pending_items()
+                        if old_pending:
+                            self.alerts.append(HealthAlert(
+                                HealthStatus.WARNING,
+                                f"Worker running but {len(old_pending)} items pending for >30s",
+                                {"old_pending_count": len(old_pending), "worker_pid": pid}
+                            ))
+                except (psutil.NoSuchProcess, psutil.AccessDenied):
+                    self.alerts.append(HealthAlert(
+                        HealthStatus.CRITICAL,
+                        "Worker PID exists but process not accessible",
+                        {"pid": pid, "action": "restart_worker"}
+                    ))
+
+        return metrics
+
+    def _check_queue_health(self) -> Dict[str, Any]:
+        """Check queue status and backlog."""
+        stats = self.queue.get_stats()
+
+        pending = stats.get("pending", 0)
+        processing = stats.get("processing", 0)
+        failed = stats.get("failed", 0)
+        completed = stats.get("completed", 0)
+
+        metrics = {
+            "pending": pending,
+            "processing": processing,
+            "failed": failed,
+            "completed": completed,
+            "total": pending + processing + failed + completed
+        }
+
+        # Check backlog levels
+        if pending >= self.QUEUE_BACKLOG_CRITICAL:
+            self.alerts.append(HealthAlert(
+                HealthStatus.CRITICAL,
+                f"Critical queue backlog: {pending} pending items",
+                {"pending_count": pending, "action": "scale_workers"}
+            ))
+        elif pending >= self.QUEUE_BACKLOG_WARNING:
+            self.alerts.append(HealthAlert(
+                HealthStatus.WARNING,
+                f"High queue backlog: {pending} pending items",
+                {"pending_count": pending}
+            ))
+
+        # Check for too many processing items (might indicate stuck workers)
+        if processing > 5:  # Should rarely have more than a few processing
+            self.alerts.append(HealthAlert(
+                HealthStatus.WARNING,
+                f"Many items in processing state: {processing}",
+                {"processing_count": processing, "action": "check_stuck_items"}
+            ))
+
+        return metrics
+
+    def _check_stuck_items(self) -> Dict[str, Any]:
+        """Check for items stuck in processing state."""
+        # Reset stuck items first
+        self.queue.reset_stuck_items(timeout_minutes=5)  # 5 minute timeout
+
+        # Get current stuck items
+        stuck_items = self._get_stuck_processing_items()
+
+        metrics = {
+            "stuck_count": len(stuck_items),
+            "stuck_items": [item.id for item in stuck_items]
+        }
+
+        if stuck_items:
+            self.alerts.append(HealthAlert(
+                HealthStatus.WARNING,
+                f"Found {len(stuck_items)} stuck items, auto-reset applied",
+                {"stuck_items": [item.id for item in stuck_items], "action": "items_reset"}
+            ))
+
+        return metrics
+
+    def _check_failure_rates(self) -> Dict[str, Any]:
+        """Check recent failure rates."""
+        stats = self.queue.get_stats()
+
+        total_items = sum(stats.values())
+        failed_items = stats.get("failed", 0)
+
+        failure_rate = failed_items / total_items if total_items > 0 else 0
+
+        metrics = {
+            "failure_rate": failure_rate,
+            "failed_count": failed_items,
+            "total_count": total_items
+        }
+
+        if failure_rate >= self.HIGH_FAILURE_RATE and total_items >= 10:
+            self.alerts.append(HealthAlert(
+                HealthStatus.CRITICAL,
+                f"High failure rate: {failure_rate:.1%} ({failed_items}/{total_items})",
+                {"failure_rate": failure_rate, "action": "investigate_failures"}
+            ))
+
+        return metrics
+
+    def _determine_overall_status(self) -> HealthStatus:
+        """Determine overall health status from alerts."""
+        if not self.alerts:
+            return HealthStatus.HEALTHY
+
+        # Check for critical alerts
+        if any(alert.level == HealthStatus.CRITICAL for alert in self.alerts):
+            return HealthStatus.CRITICAL
+
+        # Check for warnings
+        if any(alert.level == HealthStatus.WARNING for alert in self.alerts):
+            return HealthStatus.WARNING
+
+        return HealthStatus.HEALTHY
+
+    def _get_old_pending_items(self) -> List:
+        """Get items that have been pending for too long."""
+        cutoff_time = datetime.now() - timedelta(seconds=self.WORKER_TIMEOUT_SECONDS)
+
+        items = self.queue.list_items(status=QueueStatus.PENDING, limit=100)
+        return [
+            item for item in items
+            if item.created_at < cutoff_time
+        ]
+
+    def _get_stuck_processing_items(self) -> List:
+        """Get items stuck in processing state."""
+        cutoff_time = datetime.now() - timedelta(seconds=self.STUCK_ITEM_THRESHOLD)
+
+        items = self.queue.list_items(status=QueueStatus.PROCESSING, limit=100)
+        return [
+            item for item in items
+            if item.created_at < cutoff_time
+        ]
+
+    def get_immediate_alerts(self) -> List[HealthAlert]:
+        """Get alerts that require immediate attention."""
+        return [
+            alert for alert in self.alerts
+            if alert.level in [HealthStatus.CRITICAL, HealthStatus.FAILED]
+        ]
+
+    def auto_repair(self) -> Dict[str, Any]:
+        """Attempt automatic repair of detected issues."""
+        repair_actions = []
+
+        # Check health first
+        health = self.check_health()
+
+        for alert in self.alerts:
+            action = alert.details.get("action")
+
+            if action == "start_worker":
+                try:
+                    if self.manager.start():
+                        repair_actions.append(f"Started worker for {alert.details.get('pending_count')} pending items")
+                    else:
+                        repair_actions.append("Failed to start worker")
+                except Exception as e:
+                    repair_actions.append(f"Error starting worker: {e}")
+
+            elif action == "restart_worker":
+                try:
+                    self.manager.stop()
+                    time.sleep(2)
+                    if self.manager.start():
+                        repair_actions.append("Restarted unresponsive worker")
+                    else:
+                        repair_actions.append("Failed to restart worker")
+                except Exception as e:
+                    repair_actions.append(f"Error restarting worker: {e}")
+
+            elif action == "items_reset":
+                repair_actions.append(f"Reset {alert.details.get('stuck_items', [])} stuck items")
+
+        return {
+            "actions_taken": repair_actions,
+            "timestamp": datetime.now().isoformat()
+        }
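
The added module wires on-demand health checks into the existing WorkerManager and Queue. A minimal usage sketch, assuming the module is importable as mcp_ticketer.queue.health (the import path is a guess; the class and method names come from the added file above):

    from mcp_ticketer.queue.health import QueueHealthMonitor  # path assumed, not shown in this diff

    monitor = QueueHealthMonitor()                 # wraps a default Queue and WorkerManager
    report = monitor.check_health()                # {"status", "timestamp", "alerts", "metrics"}

    if report["status"] != "healthy":
        for alert in monitor.get_immediate_alerts():   # CRITICAL/FAILED alerts only
            print(alert)                               # e.g. "[CRITICAL] Worker not running but 3 items pending"
        repairs = monitor.auto_repair()                # may start or restart the worker
        print(repairs["actions_taken"])

Note that auto_repair() calls check_health() again internally, so it acts on a fresh set of alerts rather than on whatever the previous caller saw.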
@@ -180,7 +180,7 @@ class Queue:
         return queue_id
 
     def get_next_pending(self) -> Optional[QueueItem]:
-        """Get next pending item from queue.
+        """Get next pending item from queue atomically.
 
         Returns:
             Next pending QueueItem or None if queue is empty
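
The hunk below rewrites get_next_pending() around a compare-and-set claim: the UPDATE only matches a row that is still PENDING, its rowcount tells the caller whether it won the race, and BEGIN IMMEDIATE takes SQLite's write lock up front so the SELECT and UPDATE cannot interleave with another writer. A self-contained sketch of the same pattern against a throwaway table (the schema here is illustrative, not the package's):

    import sqlite3

    conn = sqlite3.connect(":memory:")
    conn.execute("CREATE TABLE queue (id TEXT PRIMARY KEY, status TEXT)")
    conn.execute("INSERT INTO queue VALUES ('item-1', 'pending')")
    conn.commit()

    def claim(conn, item_id):
        """Return True only if this caller moved the row from pending to processing."""
        conn.execute("BEGIN IMMEDIATE")   # acquire the write lock before touching the row
        cur = conn.execute(
            "UPDATE queue SET status = 'processing' WHERE id = ? AND status = 'pending'",
            (item_id,),
        )
        if cur.rowcount == 1:             # we changed the row, so the claim is ours
            conn.commit()
            return True
        conn.rollback()                   # someone else already claimed it
        return False

    print(claim(conn, "item-1"))  # True  - the first claim wins
    print(claim(conn, "item-1"))  # False - the row is no longer 'pending'

The same rowcount check is what lets the new get_next_pending() return None instead of handing one item to two workers.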
@@ -188,34 +188,50 @@
         """
         with self._lock:
             with sqlite3.connect(self.db_path) as conn:
-                # Get next pending item ordered by creation time
-                cursor = conn.execute(
-                    """
-                    SELECT * FROM queue
-                    WHERE status = ?
-                    ORDER BY created_at
-                    LIMIT 1
-                    """,
-                    (QueueStatus.PENDING.value,),
-                )
-
-                row = cursor.fetchone()
-                if row:
-                    # Mark as processing
-                    conn.execute(
+                # Use a transaction to atomically get and update the item
+                conn.execute("BEGIN IMMEDIATE")
+                try:
+                    # Get next pending item ordered by creation time
+                    cursor = conn.execute(
                         """
-                        UPDATE queue
-                        SET status = ?
-                        WHERE id = ?
+                        SELECT * FROM queue
+                        WHERE status = ?
+                        ORDER BY created_at
+                        LIMIT 1
                         """,
-                        (QueueStatus.PROCESSING.value, row[0]),
+                        (QueueStatus.PENDING.value,),
                     )
-                    conn.commit()
 
-                    # Create QueueItem from row and update status
-                    item = QueueItem.from_row(row)
-                    item.status = QueueStatus.PROCESSING
-                    return item
+                    row = cursor.fetchone()
+                    if row:
+                        # Atomically mark as processing
+                        update_cursor = conn.execute(
+                            """
+                            UPDATE queue
+                            SET status = ?, processed_at = ?
+                            WHERE id = ? AND status = ?
+                            """,
+                            (QueueStatus.PROCESSING.value, datetime.now().isoformat(), row[0], QueueStatus.PENDING.value),
+                        )
+
+                        # Check if update was successful (prevents race conditions)
+                        if update_cursor.rowcount == 1:
+                            conn.commit()
+                            # Create QueueItem from row and update status
+                            item = QueueItem.from_row(row)
+                            item.status = QueueStatus.PROCESSING
+                            return item
+                        else:
+                            # Item was already taken by another worker
+                            conn.rollback()
+                            return None
+                    else:
+                        conn.rollback()
+                        return None
+
+                except Exception:
+                    conn.rollback()
+                    raise
 
         return None
 
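The last hunk gives update_status() and increment_retry() an optional expected_status parameter and meaningful return values, so callers can detect a lost race instead of silently overwriting state. A sketch of how a worker might use them (the import path, the fake result, and the retry cap are assumptions; the classes, methods, and QueueStatus values are taken from this diff):

    from mcp_ticketer.queue import Queue, QueueStatus  # path assumed, not shown in this diff

    queue = Queue()
    item = queue.get_next_pending()        # atomically claims one PENDING item, or returns None

    if item is not None:
        try:
            result = {"handled_by": "example-worker"}      # stand-in for real processing
            ok = queue.update_status(
                item.id,
                QueueStatus.COMPLETED,
                result=result,
                expected_status=QueueStatus.PROCESSING,    # compare-and-set: False if we lost the item
            )
            if not ok:
                print(f"Item {item.id} changed state underneath us; result discarded")
        except Exception as exc:
            retries = queue.increment_retry(item.id, expected_status=QueueStatus.PROCESSING)
            if retries == -1:
                print(f"Item {item.id} was no longer PROCESSING; retry not recorded")
            elif retries >= 3:                             # retry cap chosen for the example
                queue.update_status(item.id, QueueStatus.FAILED, error_message=str(exc))
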
@@ -225,67 +241,132 @@ class Queue:
         status: QueueStatus,
         error_message: Optional[str] = None,
         result: Optional[dict[str, Any]] = None,
-    ):
-        """Update queue item status.
+        expected_status: Optional[QueueStatus] = None,
+    ) -> bool:
+        """Update queue item status atomically.
 
         Args:
             queue_id: Queue item ID
             status: New status
             error_message: Error message if failed
             result: Result data if completed
+            expected_status: Expected current status (for atomic updates)
 
+        Returns:
+            True if update was successful, False if item was in unexpected state
         """
         with self._lock:
             with sqlite3.connect(self.db_path) as conn:
-                processed_at = (
-                    datetime.now().isoformat()
-                    if status in [QueueStatus.COMPLETED, QueueStatus.FAILED]
-                    else None
-                )
-
-                conn.execute(
-                    """
-                    UPDATE queue
-                    SET status = ?, processed_at = ?,
-                        error_message = ?, result = ?
-                    WHERE id = ?
-                    """,
-                    (
-                        status.value,
-                        processed_at,
-                        error_message,
-                        json.dumps(result) if result else None,
-                        queue_id,
-                    ),
-                )
-                conn.commit()
+                conn.execute("BEGIN IMMEDIATE")
+                try:
+                    processed_at = (
+                        datetime.now().isoformat()
+                        if status in [QueueStatus.COMPLETED, QueueStatus.FAILED]
+                        else None
+                    )
 
-    def increment_retry(self, queue_id: str) -> int:
-        """Increment retry count for item.
+                    if expected_status:
+                        # Atomic update with status check
+                        cursor = conn.execute(
+                            """
+                            UPDATE queue
+                            SET status = ?, processed_at = ?,
+                                error_message = ?, result = ?
+                            WHERE id = ? AND status = ?
+                            """,
+                            (
+                                status.value,
+                                processed_at,
+                                error_message,
+                                json.dumps(result) if result else None,
+                                queue_id,
+                                expected_status.value,
+                            ),
+                        )
+                        success = cursor.rowcount == 1
+                    else:
+                        # Regular update
+                        cursor = conn.execute(
+                            """
+                            UPDATE queue
+                            SET status = ?, processed_at = ?,
+                                error_message = ?, result = ?
+                            WHERE id = ?
+                            """,
+                            (
+                                status.value,
+                                processed_at,
+                                error_message,
+                                json.dumps(result) if result else None,
+                                queue_id,
+                            ),
+                        )
+                        success = cursor.rowcount == 1
+
+                    if success:
+                        conn.commit()
+                    else:
+                        conn.rollback()
+
+                    return success
+
+                except Exception:
+                    conn.rollback()
+                    raise
+
+    def increment_retry(self, queue_id: str, expected_status: Optional[QueueStatus] = None) -> int:
+        """Increment retry count and reset to pending atomically.
 
         Args:
             queue_id: Queue item ID
+            expected_status: Expected current status for atomic operation
 
         Returns:
-            New retry count
+            New retry count, or -1 if operation failed
 
         """
         with self._lock:
            with sqlite3.connect(self.db_path) as conn:
-                cursor = conn.execute(
-                    """
-                    UPDATE queue
-                    SET retry_count = retry_count + 1,
-                        status = ?
-                    WHERE id = ?
-                    RETURNING retry_count
-                    """,
-                    (QueueStatus.PENDING.value, queue_id),
-                )
-
-                result = cursor.fetchone()
-                conn.commit()
-                return result[0] if result else 0
+                conn.execute("BEGIN IMMEDIATE")
+                try:
+                    if expected_status:
+                        # Atomic increment with status check
+                        cursor = conn.execute(
+                            """
+                            UPDATE queue
+                            SET retry_count = retry_count + 1,
+                                status = ?, processed_at = NULL,
+                                error_message = NULL
+                            WHERE id = ? AND status = ?
+                            RETURNING retry_count
+                            """,
+                            (QueueStatus.PENDING.value, queue_id, expected_status.value),
+                        )
+                    else:
+                        # Regular increment
+                        cursor = conn.execute(
+                            """
+                            UPDATE queue
+                            SET retry_count = retry_count + 1,
+                                status = ?, processed_at = NULL,
+                                error_message = NULL
+                            WHERE id = ?
+                            RETURNING retry_count
+                            """,
+                            (QueueStatus.PENDING.value, queue_id),
+                        )
+
+                    result = cursor.fetchone()
+                    if result:
+                        conn.commit()
+                        return result[0]
+                    else:
+                        conn.rollback()
+                        return -1
+
+                except Exception:
+                    conn.rollback()
+                    raise
 
     def get_item(self, queue_id: str) -> Optional[QueueItem]:
         """Get specific queue item by ID.