smartify-ai 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smartify/__init__.py +3 -0
- smartify/agents/__init__.py +0 -0
- smartify/agents/adapters/__init__.py +13 -0
- smartify/agents/adapters/anthropic.py +253 -0
- smartify/agents/adapters/openai.py +289 -0
- smartify/api/__init__.py +26 -0
- smartify/api/auth.py +352 -0
- smartify/api/errors.py +380 -0
- smartify/api/events.py +345 -0
- smartify/api/server.py +992 -0
- smartify/cli/__init__.py +1 -0
- smartify/cli/main.py +430 -0
- smartify/engine/__init__.py +64 -0
- smartify/engine/approval.py +479 -0
- smartify/engine/orchestrator.py +1365 -0
- smartify/engine/scheduler.py +380 -0
- smartify/engine/spark.py +294 -0
- smartify/guardrails/__init__.py +22 -0
- smartify/guardrails/breakers.py +409 -0
- smartify/models/__init__.py +61 -0
- smartify/models/grid.py +625 -0
- smartify/notifications/__init__.py +22 -0
- smartify/notifications/webhook.py +556 -0
- smartify/state/__init__.py +46 -0
- smartify/state/checkpoint.py +558 -0
- smartify/state/resume.py +301 -0
- smartify/state/store.py +370 -0
- smartify/tools/__init__.py +17 -0
- smartify/tools/base.py +196 -0
- smartify/tools/builtin/__init__.py +79 -0
- smartify/tools/builtin/file.py +464 -0
- smartify/tools/builtin/http.py +195 -0
- smartify/tools/builtin/shell.py +137 -0
- smartify/tools/mcp/__init__.py +33 -0
- smartify/tools/mcp/adapter.py +157 -0
- smartify/tools/mcp/client.py +334 -0
- smartify/tools/mcp/registry.py +130 -0
- smartify/validator/__init__.py +0 -0
- smartify/validator/validate.py +271 -0
- smartify/workspace/__init__.py +5 -0
- smartify/workspace/manager.py +248 -0
- smartify_ai-0.1.0.dist-info/METADATA +201 -0
- smartify_ai-0.1.0.dist-info/RECORD +46 -0
- smartify_ai-0.1.0.dist-info/WHEEL +4 -0
- smartify_ai-0.1.0.dist-info/entry_points.txt +2 -0
- smartify_ai-0.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,558 @@
|
|
|
1
|
+
"""Checkpoint and resume support for Smartify grid runs.
|
|
2
|
+
|
|
3
|
+
Provides durable execution by:
|
|
4
|
+
1. Checkpointing node outputs before marking complete
|
|
5
|
+
2. Storing grid spec and scheduler state for resume
|
|
6
|
+
3. Webhook retry queue for failed deliveries
|
|
7
|
+
4. Resume incomplete runs on startup
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import json
|
|
11
|
+
import logging
|
|
12
|
+
import sqlite3
|
|
13
|
+
from dataclasses import dataclass
|
|
14
|
+
from datetime import datetime, timedelta
|
|
15
|
+
from enum import Enum
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from typing import Any, Dict, List, Optional, Set
|
|
18
|
+
from uuid import uuid4
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class CheckpointStatus(str, Enum):
    """Lifecycle status of a grid-run checkpoint.

    Subclasses ``str`` so ``.value`` round-trips cleanly through the
    SQLite TEXT columns used by :class:`CheckpointStore`.
    """

    ACTIVE = "active"  # Run in progress, checkpoints being written
    COMPLETED = "completed"  # Run finished successfully
    FAILED = "failed"  # Run failed
    ABANDONED = "abandoned"  # Run stopped/crashed, can be resumed
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class WebhookDeliveryStatus(str, Enum):
    """Delivery state of a queued webhook retry job.

    Subclasses ``str`` so ``.value`` round-trips cleanly through the
    SQLite TEXT columns used by :class:`CheckpointStore`.
    """

    PENDING = "pending"      # queued, first retry not yet attempted
    DELIVERED = "delivered"  # delivery succeeded; job kept until cleanup
    FAILED = "failed"        # exhausted max_attempts; terminal
    RETRYING = "retrying"    # failed at least once, another attempt scheduled
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@dataclass
class Checkpoint:
    """A checkpoint for a grid run.

    Snapshot of a run's execution state, persisted by
    :class:`CheckpointStore` so an interrupted run can be resumed from
    the last completed node. List/dict fields are stored as JSON text
    in SQLite; datetimes as ISO-8601 strings.
    """

    checkpoint_id: str  # "ckpt-" + 12 hex chars (see create_checkpoint)
    run_id: str         # unique per run (UNIQUE column in the DB)
    grid_id: str
    grid_yaml: str  # Original grid YAML for reconstruction
    status: CheckpointStatus

    # Execution state
    completed_nodes: List[str]  # Node IDs that completed successfully
    failed_nodes: List[str]  # Node IDs that failed
    running_nodes: List[str]  # Node IDs that were running at checkpoint

    # Context state
    inputs: Dict[str, Any]
    outputs: Dict[str, Dict[str, Any]]  # node_id -> output
    total_tokens: int  # running sum across checkpointed nodes
    total_cost: float  # running sum across checkpointed nodes

    # Timestamps (naive local datetimes as produced by datetime.now())
    created_at: datetime
    updated_at: datetime

    # Resume info
    resume_count: int = 0  # times this run has been resumed
    last_error: Optional[str] = None  # most recent node/run error, if any
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
@dataclass
class WebhookRetryJob:
    """A webhook delivery queued for retry.

    Persisted by :class:`CheckpointStore` in the ``webhook_retry_queue``
    table; ``payload`` and ``headers`` are stored as JSON text.
    """

    job_id: str  # "whj-" + 12 hex chars (see queue_webhook_retry)
    event_type: str
    grid_id: str
    webhook_url: str
    payload: Dict[str, Any]
    headers: Dict[str, str]
    secret: Optional[str]  # optional signing secret; may be None

    status: WebhookDeliveryStatus
    attempts: int      # delivery attempts made so far
    max_attempts: int  # terminal FAILED once attempts reaches this

    created_at: datetime
    next_retry_at: datetime  # earliest time the job is eligible for retry
    last_error: Optional[str] = None  # error message from the last attempt
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
class CheckpointStore:
    """SQLite-based checkpoint storage.

    Extends the base state store with checkpoint-specific tables
    for durable execution and resume support.

    Two tables are managed:

    * ``checkpoints`` — one row per run (``run_id`` is UNIQUE), holding
      the grid YAML plus JSON-encoded node/output state for resume.
    * ``webhook_retry_queue`` — failed webhook deliveries scheduled for
      retry with exponential backoff.

    NOTE(review): each method opens its own short-lived connection and
    the update methods do a read-modify-write (get, mutate, save) across
    separate connections — not safe under concurrent writers to the same
    run; confirm callers serialize updates per run_id.
    """

    def __init__(self, db_path: str = "smartify_state.db"):
        # Path to the SQLite file; a fresh connection is opened per call.
        self.db_path = db_path
        self._init_db()

    def _init_db(self) -> None:
        """Initialize checkpoint tables.

        Idempotent: uses CREATE TABLE/INDEX IF NOT EXISTS, so it is safe
        to call on every startup against an existing database.
        """
        with sqlite3.connect(self.db_path) as conn:
            # Checkpoints table - stores run state for resume
            conn.execute("""
                CREATE TABLE IF NOT EXISTS checkpoints (
                    checkpoint_id TEXT PRIMARY KEY,
                    run_id TEXT NOT NULL UNIQUE,
                    grid_id TEXT NOT NULL,
                    grid_yaml TEXT NOT NULL,
                    status TEXT NOT NULL,
                    completed_nodes TEXT NOT NULL,
                    failed_nodes TEXT NOT NULL,
                    running_nodes TEXT NOT NULL,
                    inputs TEXT NOT NULL,
                    outputs TEXT NOT NULL,
                    total_tokens INTEGER DEFAULT 0,
                    total_cost REAL DEFAULT 0.0,
                    created_at TEXT NOT NULL,
                    updated_at TEXT NOT NULL,
                    resume_count INTEGER DEFAULT 0,
                    last_error TEXT
                )
            """)

            # Webhook retry queue
            conn.execute("""
                CREATE TABLE IF NOT EXISTS webhook_retry_queue (
                    job_id TEXT PRIMARY KEY,
                    event_type TEXT NOT NULL,
                    grid_id TEXT NOT NULL,
                    webhook_url TEXT NOT NULL,
                    payload TEXT NOT NULL,
                    headers TEXT NOT NULL,
                    secret TEXT,
                    status TEXT NOT NULL,
                    attempts INTEGER DEFAULT 0,
                    max_attempts INTEGER DEFAULT 3,
                    created_at TEXT NOT NULL,
                    next_retry_at TEXT NOT NULL,
                    last_error TEXT
                )
            """)

            # Indexes
            conn.execute("CREATE INDEX IF NOT EXISTS idx_checkpoints_status ON checkpoints(status)")
            conn.execute("CREATE INDEX IF NOT EXISTS idx_checkpoints_grid ON checkpoints(grid_id)")
            conn.execute("CREATE INDEX IF NOT EXISTS idx_webhook_queue_status ON webhook_retry_queue(status)")
            conn.execute("CREATE INDEX IF NOT EXISTS idx_webhook_queue_retry ON webhook_retry_queue(next_retry_at)")

            conn.commit()
        logger.info(f"Checkpoint store initialized at {self.db_path}")

    # =========================================================================
    # Checkpoint Operations
    # =========================================================================

    def create_checkpoint(
        self,
        run_id: str,
        grid_id: str,
        grid_yaml: str,
        inputs: Dict[str, Any],
    ) -> Checkpoint:
        """Create a new checkpoint for a run.

        Starts in ACTIVE status with empty node lists and zero totals,
        then persists it immediately. Because run_id is UNIQUE and saving
        uses INSERT OR REPLACE, creating a checkpoint for an existing
        run_id overwrites the previous row.
        """
        checkpoint = Checkpoint(
            checkpoint_id=f"ckpt-{uuid4().hex[:12]}",
            run_id=run_id,
            grid_id=grid_id,
            grid_yaml=grid_yaml,
            status=CheckpointStatus.ACTIVE,
            completed_nodes=[],
            failed_nodes=[],
            running_nodes=[],
            inputs=inputs,
            outputs={},
            total_tokens=0,
            total_cost=0.0,
            # NOTE(review): naive local time; mixing hosts/timezones would
            # skew next_retry/cleanup comparisons — confirm single-host use.
            created_at=datetime.now(),
            updated_at=datetime.now(),
        )

        self._save_checkpoint(checkpoint)
        logger.info(f"Created checkpoint {checkpoint.checkpoint_id} for run {run_id}")
        return checkpoint

    def _save_checkpoint(self, checkpoint: Checkpoint) -> None:
        """Save checkpoint to database.

        Full-row upsert via INSERT OR REPLACE; list/dict fields are
        JSON-encoded, datetimes stored as ISO-8601 text.
        """
        with sqlite3.connect(self.db_path) as conn:
            conn.execute("""
                INSERT OR REPLACE INTO checkpoints
                (checkpoint_id, run_id, grid_id, grid_yaml, status,
                 completed_nodes, failed_nodes, running_nodes,
                 inputs, outputs, total_tokens, total_cost,
                 created_at, updated_at, resume_count, last_error)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """, (
                checkpoint.checkpoint_id,
                checkpoint.run_id,
                checkpoint.grid_id,
                checkpoint.grid_yaml,
                checkpoint.status.value,
                json.dumps(checkpoint.completed_nodes),
                json.dumps(checkpoint.failed_nodes),
                json.dumps(checkpoint.running_nodes),
                json.dumps(checkpoint.inputs),
                json.dumps(checkpoint.outputs),
                checkpoint.total_tokens,
                checkpoint.total_cost,
                checkpoint.created_at.isoformat(),
                checkpoint.updated_at.isoformat(),
                checkpoint.resume_count,
                checkpoint.last_error,
            ))
            conn.commit()

    def get_checkpoint(self, run_id: str) -> Optional[Checkpoint]:
        """Get checkpoint by run ID, or None if no row exists."""
        with sqlite3.connect(self.db_path) as conn:
            conn.row_factory = sqlite3.Row
            cursor = conn.execute(
                "SELECT * FROM checkpoints WHERE run_id = ?", (run_id,)
            )
            row = cursor.fetchone()

            if not row:
                return None

            return self._row_to_checkpoint(row)

    def _row_to_checkpoint(self, row: sqlite3.Row) -> Checkpoint:
        """Convert database row to Checkpoint (inverse of _save_checkpoint)."""
        return Checkpoint(
            checkpoint_id=row['checkpoint_id'],
            run_id=row['run_id'],
            grid_id=row['grid_id'],
            grid_yaml=row['grid_yaml'],
            status=CheckpointStatus(row['status']),
            completed_nodes=json.loads(row['completed_nodes']),
            failed_nodes=json.loads(row['failed_nodes']),
            running_nodes=json.loads(row['running_nodes']),
            inputs=json.loads(row['inputs']),
            outputs=json.loads(row['outputs']),
            total_tokens=row['total_tokens'],
            total_cost=row['total_cost'],
            created_at=datetime.fromisoformat(row['created_at']),
            updated_at=datetime.fromisoformat(row['updated_at']),
            resume_count=row['resume_count'],
            last_error=row['last_error'],
        )

    def checkpoint_node_complete(
        self,
        run_id: str,
        node_id: str,
        output: Dict[str, Any],
        tokens_used: int = 0,
        cost: float = 0.0,
    ) -> None:
        """Record a node completion in the checkpoint.

        Moves node_id from running to completed (idempotent on the list
        membership, but tokens/cost are re-added on repeated calls),
        stores its output, and persists the updated row. No-op with a
        warning if the run has no checkpoint.
        """
        checkpoint = self.get_checkpoint(run_id)
        if not checkpoint:
            logger.warning(f"No checkpoint found for run {run_id}")
            return

        # Update checkpoint state
        if node_id not in checkpoint.completed_nodes:
            checkpoint.completed_nodes.append(node_id)
        if node_id in checkpoint.running_nodes:
            checkpoint.running_nodes.remove(node_id)

        checkpoint.outputs[node_id] = output
        checkpoint.total_tokens += tokens_used
        checkpoint.total_cost += cost
        checkpoint.updated_at = datetime.now()

        self._save_checkpoint(checkpoint)
        logger.debug(f"Checkpointed node {node_id} for run {run_id}")

    def checkpoint_node_failed(
        self,
        run_id: str,
        node_id: str,
        error: str,
    ) -> None:
        """Record a node failure in the checkpoint.

        Moves node_id from running to failed and records the error as
        the checkpoint's last_error. Silently no-ops if the run has no
        checkpoint.
        """
        checkpoint = self.get_checkpoint(run_id)
        if not checkpoint:
            return

        if node_id not in checkpoint.failed_nodes:
            checkpoint.failed_nodes.append(node_id)
        if node_id in checkpoint.running_nodes:
            checkpoint.running_nodes.remove(node_id)

        checkpoint.last_error = f"Node {node_id}: {error}"
        checkpoint.updated_at = datetime.now()

        self._save_checkpoint(checkpoint)

    def checkpoint_node_started(self, run_id: str, node_id: str) -> None:
        """Record that a node has started execution.

        Adds node_id to running_nodes so a crash mid-node is visible on
        resume. Silently no-ops if the run has no checkpoint.
        """
        checkpoint = self.get_checkpoint(run_id)
        if not checkpoint:
            return

        if node_id not in checkpoint.running_nodes:
            checkpoint.running_nodes.append(node_id)
        checkpoint.updated_at = datetime.now()

        self._save_checkpoint(checkpoint)

    def mark_completed(self, run_id: str) -> None:
        """Mark a run as completed and clear its running-node list."""
        checkpoint = self.get_checkpoint(run_id)
        if checkpoint:
            checkpoint.status = CheckpointStatus.COMPLETED
            checkpoint.running_nodes = []
            checkpoint.updated_at = datetime.now()
            self._save_checkpoint(checkpoint)
            logger.info(f"Marked run {run_id} as completed")

    def mark_failed(self, run_id: str, error: str) -> None:
        """Mark a run as failed, clearing running nodes and recording the error."""
        checkpoint = self.get_checkpoint(run_id)
        if checkpoint:
            checkpoint.status = CheckpointStatus.FAILED
            checkpoint.running_nodes = []
            checkpoint.last_error = error
            checkpoint.updated_at = datetime.now()
            self._save_checkpoint(checkpoint)
            logger.info(f"Marked run {run_id} as failed: {error}")

    def mark_abandoned(self, run_id: str) -> None:
        """Mark a run as abandoned (can be resumed).

        Unlike mark_completed/mark_failed, running_nodes is kept so a
        resume can see what was in flight.
        """
        checkpoint = self.get_checkpoint(run_id)
        if checkpoint:
            checkpoint.status = CheckpointStatus.ABANDONED
            checkpoint.updated_at = datetime.now()
            self._save_checkpoint(checkpoint)

    def get_resumable_runs(self) -> List[Checkpoint]:
        """Get all runs that can be resumed (status ACTIVE or ABANDONED).

        ACTIVE rows are included because a crash leaves runs in ACTIVE
        without a chance to mark them ABANDONED.
        """
        with sqlite3.connect(self.db_path) as conn:
            conn.row_factory = sqlite3.Row
            cursor = conn.execute(
                "SELECT * FROM checkpoints WHERE status IN (?, ?)",
                (CheckpointStatus.ACTIVE.value, CheckpointStatus.ABANDONED.value)
            )
            return [self._row_to_checkpoint(row) for row in cursor]

    def increment_resume_count(self, run_id: str) -> None:
        """Increment the resume count for a run and flip it back to ACTIVE."""
        checkpoint = self.get_checkpoint(run_id)
        if checkpoint:
            checkpoint.resume_count += 1
            checkpoint.status = CheckpointStatus.ACTIVE
            checkpoint.updated_at = datetime.now()
            self._save_checkpoint(checkpoint)

    def delete_checkpoint(self, run_id: str) -> bool:
        """Delete a checkpoint.

        Returns:
            True if a row was deleted, False if run_id had no checkpoint.
        """
        with sqlite3.connect(self.db_path) as conn:
            cursor = conn.execute(
                "DELETE FROM checkpoints WHERE run_id = ?", (run_id,)
            )
            conn.commit()
            return cursor.rowcount > 0

    # =========================================================================
    # Webhook Retry Queue
    # =========================================================================

    def queue_webhook_retry(
        self,
        event_type: str,
        grid_id: str,
        webhook_url: str,
        payload: Dict[str, Any],
        headers: Dict[str, str],
        secret: Optional[str] = None,
        max_attempts: int = 3,
        retry_delay_seconds: float = 60.0,
    ) -> WebhookRetryJob:
        """Queue a webhook for retry.

        The first retry is scheduled retry_delay_seconds from now (the
        original delivery attempt is assumed to have already failed).
        """
        job = WebhookRetryJob(
            job_id=f"whj-{uuid4().hex[:12]}",
            event_type=event_type,
            grid_id=grid_id,
            webhook_url=webhook_url,
            payload=payload,
            headers=headers,
            secret=secret,
            status=WebhookDeliveryStatus.PENDING,
            attempts=0,
            max_attempts=max_attempts,
            created_at=datetime.now(),
            next_retry_at=datetime.now() + timedelta(seconds=retry_delay_seconds),
        )

        self._save_webhook_job(job)
        logger.info(f"Queued webhook retry job {job.job_id} for {webhook_url}")
        return job

    def _save_webhook_job(self, job: WebhookRetryJob) -> None:
        """Save webhook job to database (full-row upsert, mirrors _save_checkpoint)."""
        with sqlite3.connect(self.db_path) as conn:
            conn.execute("""
                INSERT OR REPLACE INTO webhook_retry_queue
                (job_id, event_type, grid_id, webhook_url, payload, headers,
                 secret, status, attempts, max_attempts, created_at, next_retry_at, last_error)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """, (
                job.job_id,
                job.event_type,
                job.grid_id,
                job.webhook_url,
                json.dumps(job.payload),
                json.dumps(job.headers),
                job.secret,
                job.status.value,
                job.attempts,
                job.max_attempts,
                job.created_at.isoformat(),
                job.next_retry_at.isoformat(),
                job.last_error,
            ))
            conn.commit()

    def get_pending_webhook_jobs(self, limit: int = 100) -> List[WebhookRetryJob]:
        """Get webhook jobs ready for retry (PENDING/RETRYING, due now), oldest first.

        NOTE(review): next_retry_at is compared lexicographically as ISO
        text — correct only while all timestamps share the same isoformat
        shape (consistently naive or consistently offset-aware).
        """
        now = datetime.now().isoformat()

        with sqlite3.connect(self.db_path) as conn:
            conn.row_factory = sqlite3.Row
            cursor = conn.execute("""
                SELECT * FROM webhook_retry_queue
                WHERE status IN (?, ?) AND next_retry_at <= ?
                ORDER BY next_retry_at ASC
                LIMIT ?
            """, (
                WebhookDeliveryStatus.PENDING.value,
                WebhookDeliveryStatus.RETRYING.value,
                now,
                limit,
            ))

            return [self._row_to_webhook_job(row) for row in cursor]

    def _row_to_webhook_job(self, row: sqlite3.Row) -> WebhookRetryJob:
        """Convert database row to WebhookRetryJob (inverse of _save_webhook_job)."""
        return WebhookRetryJob(
            job_id=row['job_id'],
            event_type=row['event_type'],
            grid_id=row['grid_id'],
            webhook_url=row['webhook_url'],
            payload=json.loads(row['payload']),
            headers=json.loads(row['headers']),
            secret=row['secret'],
            status=WebhookDeliveryStatus(row['status']),
            attempts=row['attempts'],
            max_attempts=row['max_attempts'],
            created_at=datetime.fromisoformat(row['created_at']),
            next_retry_at=datetime.fromisoformat(row['next_retry_at']),
            last_error=row['last_error'],
        )

    def mark_webhook_delivered(self, job_id: str) -> None:
        """Mark a webhook job as delivered.

        Also counts the successful delivery as an attempt (single UPDATE,
        no read-modify-write).
        """
        with sqlite3.connect(self.db_path) as conn:
            conn.execute("""
                UPDATE webhook_retry_queue
                SET status = ?, attempts = attempts + 1
                WHERE job_id = ?
            """, (WebhookDeliveryStatus.DELIVERED.value, job_id))
            conn.commit()

    def mark_webhook_failed(
        self,
        job_id: str,
        error: str,
        retry_delay_seconds: float = 60.0,
        backoff_multiplier: float = 2.0,
    ) -> None:
        """Mark a webhook job as failed, schedule retry if attempts remain.

        Delay for attempt k (1-based) is
        retry_delay_seconds * backoff_multiplier ** (k - 1); once
        attempts reaches max_attempts the job goes terminal FAILED.
        Silently no-ops on an unknown job_id.
        """
        job = self._get_webhook_job(job_id)
        if not job:
            return

        job.attempts += 1
        job.last_error = error

        if job.attempts >= job.max_attempts:
            job.status = WebhookDeliveryStatus.FAILED
            logger.warning(f"Webhook job {job_id} failed permanently after {job.attempts} attempts")
        else:
            job.status = WebhookDeliveryStatus.RETRYING
            # Exponential backoff
            delay = retry_delay_seconds * (backoff_multiplier ** (job.attempts - 1))
            job.next_retry_at = datetime.now() + timedelta(seconds=delay)
            logger.info(f"Webhook job {job_id} retry scheduled in {delay}s")

        self._save_webhook_job(job)

    def _get_webhook_job(self, job_id: str) -> Optional[WebhookRetryJob]:
        """Get a webhook job by ID, or None if it does not exist."""
        with sqlite3.connect(self.db_path) as conn:
            conn.row_factory = sqlite3.Row
            cursor = conn.execute(
                "SELECT * FROM webhook_retry_queue WHERE job_id = ?", (job_id,)
            )
            row = cursor.fetchone()
            return self._row_to_webhook_job(row) if row else None

    def cleanup_old_jobs(self, days: int = 7) -> int:
        """Clean up completed/failed jobs older than N days.

        Only terminal states (DELIVERED/FAILED) are purged; pending and
        retrying jobs are never touched.

        Returns:
            Number of rows deleted.
        """
        cutoff = (datetime.now() - timedelta(days=days)).isoformat()

        with sqlite3.connect(self.db_path) as conn:
            cursor = conn.execute("""
                DELETE FROM webhook_retry_queue
                WHERE status IN (?, ?) AND created_at < ?
            """, (
                WebhookDeliveryStatus.DELIVERED.value,
                WebhookDeliveryStatus.FAILED.value,
                cutoff,
            ))
            conn.commit()

        deleted = cursor.rowcount
        if deleted:
            logger.info(f"Cleaned up {deleted} old webhook jobs")
        return deleted

    def get_queue_stats(self) -> Dict[str, int]:
        """Get webhook queue statistics.

        Returns:
            Mapping of every WebhookDeliveryStatus value to its row
            count (statuses with no rows are present with 0).
        """
        with sqlite3.connect(self.db_path) as conn:
            cursor = conn.execute("""
                SELECT status, COUNT(*) as count
                FROM webhook_retry_queue
                GROUP BY status
            """)

            # Pre-seed with zeros so absent statuses still appear.
            stats = {status.value: 0 for status in WebhookDeliveryStatus}
            for row in cursor:
                stats[row[0]] = row[1]

            return stats
|
|
547
|
+
|
|
548
|
+
|
|
549
|
+
# Singleton instance
_checkpoint_store: Optional[CheckpointStore] = None


def get_checkpoint_store(db_path: str = "smartify_state.db") -> CheckpointStore:
    """Get or create the checkpoint store singleton.

    The store is created lazily on first call; every later call returns
    the same instance.

    Args:
        db_path: SQLite file path used to create the store. Honored only
            on the first call; afterwards the existing singleton is
            returned regardless of this argument.

    Returns:
        The process-wide CheckpointStore instance.
    """
    global _checkpoint_store
    if _checkpoint_store is None:
        _checkpoint_store = CheckpointStore(db_path)
    elif _checkpoint_store.db_path != db_path:
        # Previously this mismatch was silently ignored, handing callers a
        # store bound to a different database than they asked for. Keep the
        # singleton behavior (backward compatible) but make the surprise loud.
        logger.warning(
            "get_checkpoint_store called with db_path=%r but singleton already "
            "uses %r; returning existing store",
            db_path,
            _checkpoint_store.db_path,
        )
    return _checkpoint_store
|