ouroboros-ai 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ouroboros/__init__.py +15 -0
- ouroboros/__main__.py +9 -0
- ouroboros/bigbang/__init__.py +39 -0
- ouroboros/bigbang/ambiguity.py +464 -0
- ouroboros/bigbang/interview.py +530 -0
- ouroboros/bigbang/seed_generator.py +610 -0
- ouroboros/cli/__init__.py +9 -0
- ouroboros/cli/commands/__init__.py +7 -0
- ouroboros/cli/commands/config.py +79 -0
- ouroboros/cli/commands/init.py +425 -0
- ouroboros/cli/commands/run.py +201 -0
- ouroboros/cli/commands/status.py +85 -0
- ouroboros/cli/formatters/__init__.py +31 -0
- ouroboros/cli/formatters/panels.py +157 -0
- ouroboros/cli/formatters/progress.py +112 -0
- ouroboros/cli/formatters/tables.py +166 -0
- ouroboros/cli/main.py +60 -0
- ouroboros/config/__init__.py +81 -0
- ouroboros/config/loader.py +292 -0
- ouroboros/config/models.py +332 -0
- ouroboros/core/__init__.py +62 -0
- ouroboros/core/ac_tree.py +401 -0
- ouroboros/core/context.py +472 -0
- ouroboros/core/errors.py +246 -0
- ouroboros/core/seed.py +212 -0
- ouroboros/core/types.py +205 -0
- ouroboros/evaluation/__init__.py +110 -0
- ouroboros/evaluation/consensus.py +350 -0
- ouroboros/evaluation/mechanical.py +351 -0
- ouroboros/evaluation/models.py +235 -0
- ouroboros/evaluation/pipeline.py +286 -0
- ouroboros/evaluation/semantic.py +302 -0
- ouroboros/evaluation/trigger.py +278 -0
- ouroboros/events/__init__.py +5 -0
- ouroboros/events/base.py +80 -0
- ouroboros/events/decomposition.py +153 -0
- ouroboros/events/evaluation.py +248 -0
- ouroboros/execution/__init__.py +44 -0
- ouroboros/execution/atomicity.py +451 -0
- ouroboros/execution/decomposition.py +481 -0
- ouroboros/execution/double_diamond.py +1386 -0
- ouroboros/execution/subagent.py +275 -0
- ouroboros/observability/__init__.py +63 -0
- ouroboros/observability/drift.py +383 -0
- ouroboros/observability/logging.py +504 -0
- ouroboros/observability/retrospective.py +338 -0
- ouroboros/orchestrator/__init__.py +78 -0
- ouroboros/orchestrator/adapter.py +391 -0
- ouroboros/orchestrator/events.py +278 -0
- ouroboros/orchestrator/runner.py +597 -0
- ouroboros/orchestrator/session.py +486 -0
- ouroboros/persistence/__init__.py +23 -0
- ouroboros/persistence/checkpoint.py +511 -0
- ouroboros/persistence/event_store.py +183 -0
- ouroboros/persistence/migrations/__init__.py +1 -0
- ouroboros/persistence/migrations/runner.py +100 -0
- ouroboros/persistence/migrations/scripts/001_initial.sql +20 -0
- ouroboros/persistence/schema.py +56 -0
- ouroboros/persistence/uow.py +230 -0
- ouroboros/providers/__init__.py +28 -0
- ouroboros/providers/base.py +133 -0
- ouroboros/providers/claude_code_adapter.py +212 -0
- ouroboros/providers/litellm_adapter.py +316 -0
- ouroboros/py.typed +0 -0
- ouroboros/resilience/__init__.py +67 -0
- ouroboros/resilience/lateral.py +595 -0
- ouroboros/resilience/stagnation.py +727 -0
- ouroboros/routing/__init__.py +60 -0
- ouroboros/routing/complexity.py +272 -0
- ouroboros/routing/downgrade.py +664 -0
- ouroboros/routing/escalation.py +340 -0
- ouroboros/routing/router.py +204 -0
- ouroboros/routing/tiers.py +247 -0
- ouroboros/secondary/__init__.py +40 -0
- ouroboros/secondary/scheduler.py +467 -0
- ouroboros/secondary/todo_registry.py +483 -0
- ouroboros_ai-0.1.0.dist-info/METADATA +607 -0
- ouroboros_ai-0.1.0.dist-info/RECORD +81 -0
- ouroboros_ai-0.1.0.dist-info/WHEEL +4 -0
- ouroboros_ai-0.1.0.dist-info/entry_points.txt +2 -0
- ouroboros_ai-0.1.0.dist-info/licenses/LICENSE +21 -0
ouroboros/persistence/checkpoint.py
@@ -0,0 +1,511 @@
"""Checkpoint and recovery system for workflow persistence.

This module provides:
- CheckpointData: Data model for checkpoint state
- CheckpointStore: Save/load checkpoints with integrity validation
- Recovery logic with rollback support (max 3 levels per NFR11)
- PeriodicCheckpointer: Background task for automatic checkpointing
"""

import asyncio
import fcntl
import hashlib
import json
from collections.abc import Awaitable, Callable
from contextlib import contextmanager
from dataclasses import asdict, dataclass
from datetime import UTC, datetime
from pathlib import Path
from typing import Any, Iterator

from ouroboros.core.errors import PersistenceError
from ouroboros.core.types import Result


@contextmanager
def _file_lock(file_path: Path, exclusive: bool = True) -> Iterator[None]:
    """Context manager for file locking to prevent race conditions.

    Args:
        file_path: Path to the file to lock.
        exclusive: If True, use exclusive lock (for writes).
            If False, use shared lock (for reads).

    Yields:
        None when lock is acquired.
    """
    lock_path = file_path.with_suffix(file_path.suffix + ".lock")
    lock_path.parent.mkdir(parents=True, exist_ok=True)

    with open(lock_path, "w") as lock_file:
        lock_type = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
        try:
            fcntl.flock(lock_file.fileno(), lock_type)
            yield
        finally:
            fcntl.flock(lock_file.fileno(), fcntl.LOCK_UN)


@dataclass(frozen=True, slots=True)
class CheckpointData:
    """Immutable checkpoint data for workflow state.

    Attributes:
        seed_id: Unique identifier for the seed being executed.
        phase: Current execution phase (e.g., "planning", "execution").
        state: Arbitrary state data as JSON-serializable dict.
        timestamp: UTC timestamp when checkpoint was created.
        hash: SHA-256 hash of serialized data for integrity validation.
    """

    seed_id: str
    phase: str
    state: dict[str, Any]
    timestamp: datetime
    hash: str

    @classmethod
    def create(
        cls, seed_id: str, phase: str, state: dict[str, Any]
    ) -> "CheckpointData":
        """Create a new checkpoint with automatic hash generation.

        Args:
            seed_id: Unique identifier for the seed.
            phase: Current execution phase.
            state: State data to checkpoint.

        Returns:
            New CheckpointData instance with computed hash.
        """
        timestamp = datetime.now(UTC)
        # Create temporary instance without hash to compute it
        temp_data = {
            "seed_id": seed_id,
            "phase": phase,
            "state": state,
            "timestamp": timestamp.isoformat(),
        }
        serialized = json.dumps(temp_data, sort_keys=True)
        hash_value = hashlib.sha256(serialized.encode()).hexdigest()

        return cls(
            seed_id=seed_id,
            phase=phase,
            state=state,
            timestamp=timestamp,
            hash=hash_value,
        )

    def validate_integrity(self) -> Result[bool, str]:
        """Validate checkpoint integrity by recomputing hash.

        Returns:
            Result.ok(True) if hash matches, Result.err with details if corrupted.
        """
        temp_data = {
            "seed_id": self.seed_id,
            "phase": self.phase,
            "state": self.state,
            "timestamp": self.timestamp.isoformat(),
        }
        serialized = json.dumps(temp_data, sort_keys=True)
        computed_hash = hashlib.sha256(serialized.encode()).hexdigest()

        if computed_hash != self.hash:
            return Result.err(
                f"Hash mismatch: expected {self.hash}, got {computed_hash}"
            )
        return Result.ok(True)

    def to_dict(self) -> dict[str, Any]:
        """Convert checkpoint to JSON-serializable dict.

        Returns:
            Dict representation suitable for JSON serialization.
        """
        data = asdict(self)
        data["timestamp"] = self.timestamp.isoformat()
        return data

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "CheckpointData":
        """Reconstruct checkpoint from dict.

        Args:
            data: Dict with checkpoint data.

        Returns:
            CheckpointData instance.

        Raises:
            ValueError: If timestamp parsing fails.
        """
        timestamp_str = data["timestamp"]
        timestamp = datetime.fromisoformat(timestamp_str)
        return cls(
            seed_id=data["seed_id"],
            phase=data["phase"],
            state=data["state"],
            timestamp=timestamp,
            hash=data["hash"],
        )


class CheckpointStore:
    """Store for persisting and recovering checkpoints with integrity validation.

    Checkpoints are stored as JSON files in ~/.ouroboros/data/checkpoints/.
    Each checkpoint is validated with SHA-256 hash for integrity.
    Supports rollback up to 3 levels (NFR11) when corruption is detected.

    Usage:
        store = CheckpointStore()
        store.initialize()

        # Save checkpoint
        checkpoint = CheckpointData.create("seed-123", "planning", {"step": 1})
        result = store.save(checkpoint)

        # Load latest valid checkpoint with automatic rollback
        result = store.load("seed-123")
        if result.is_ok:
            checkpoint = result.value
    """

    MAX_ROLLBACK_DEPTH = 3

    def __init__(self, base_path: Path | None = None) -> None:
        """Initialize checkpoint store.

        Args:
            base_path: Base directory for checkpoints.
                Defaults to ~/.ouroboros/data/checkpoints/
        """
        if base_path is None:
            base_path = Path.home() / ".ouroboros" / "data" / "checkpoints"
        self._base_path = base_path

    def initialize(self) -> None:
        """Create checkpoint directory if it doesn't exist.

        This method is idempotent - safe to call multiple times.
        """
        self._base_path.mkdir(parents=True, exist_ok=True)

    def save(self, checkpoint: CheckpointData) -> Result[None, PersistenceError]:
        """Save checkpoint to disk.

        The checkpoint is rotated: existing checkpoints are shifted to .1, .2, .3
        for rollback support (max 3 levels per NFR11).

        Uses file locking to prevent race conditions during concurrent access.

        Args:
            checkpoint: Checkpoint data to save.

        Returns:
            Result.ok(None) on success, Result.err(PersistenceError) on failure.
        """
        try:
            checkpoint_path = self._get_checkpoint_path(checkpoint.seed_id)

            # Use file locking to prevent race conditions
            with _file_lock(checkpoint_path, exclusive=True):
                # Rotate existing checkpoints for rollback support
                self._rotate_checkpoints(checkpoint.seed_id)

                # Write new checkpoint
                with checkpoint_path.open("w") as f:
                    json.dump(checkpoint.to_dict(), f, indent=2)

            return Result.ok(None)
        except Exception as e:
            return Result.err(
                PersistenceError(
                    f"Failed to save checkpoint: {e}",
                    operation="write",
                    details={"seed_id": checkpoint.seed_id, "phase": checkpoint.phase},
                )
            )

    def load(self, seed_id: str) -> Result[CheckpointData, PersistenceError]:
        """Load latest valid checkpoint with automatic rollback on corruption.

        Attempts to load the latest checkpoint. If corrupted (hash mismatch or
        parse error), automatically rolls back to previous checkpoint up to 3 levels.
        Logs corruption details for debugging.

        Args:
            seed_id: Seed identifier to load checkpoint for.

        Returns:
            Result.ok(CheckpointData) with valid checkpoint,
            Result.err(PersistenceError) if no valid checkpoint found.
        """
        # Try loading checkpoints in order: current, .1, .2, .3
        for level in range(self.MAX_ROLLBACK_DEPTH + 1):
            result = self._load_checkpoint_level(seed_id, level)
            if result.is_ok:
                if level > 0:
                    # Log successful recovery after rollback
                    print(
                        f"Recovered checkpoint for {seed_id} "
                        f"from rollback level {level}"
                    )
                return result

            # Log corruption details for debugging
            error = result.error
            print(
                f"Checkpoint corruption at level {level} for {seed_id}: "
                f"{error.message}"
            )

        # No valid checkpoint found at any level
        return Result.err(
            PersistenceError(
                f"No valid checkpoint found for seed {seed_id} "
                f"(tried {self.MAX_ROLLBACK_DEPTH + 1} levels)",
                operation="load",
                details={"seed_id": seed_id},
            )
        )

    def _load_checkpoint_level(
        self, seed_id: str, level: int
    ) -> Result[CheckpointData, PersistenceError]:
        """Load checkpoint at specific rollback level.

        Uses file locking to prevent race conditions during concurrent access.

        Args:
            seed_id: Seed identifier.
            level: Rollback level (0=current, 1-3=previous).

        Returns:
            Result.ok(CheckpointData) if valid, Result.err otherwise.
        """
        checkpoint_path = self._get_checkpoint_path(seed_id, level)

        if not checkpoint_path.exists():
            return Result.err(
                PersistenceError(
                    f"Checkpoint not found at level {level}",
                    operation="read",
                    details={"seed_id": seed_id, "level": level},
                )
            )

        try:
            # Use shared lock for reading
            with _file_lock(checkpoint_path, exclusive=False):
                with checkpoint_path.open("r") as f:
                    data = json.load(f)

            checkpoint = CheckpointData.from_dict(data)

            # Validate integrity
            validation_result = checkpoint.validate_integrity()
            if validation_result.is_err:
                return Result.err(
                    PersistenceError(
                        f"Checkpoint integrity validation failed: "
                        f"{validation_result.error}",
                        operation="validate",
                        details={"seed_id": seed_id, "level": level},
                    )
                )

            return Result.ok(checkpoint)

        except json.JSONDecodeError as e:
            return Result.err(
                PersistenceError(
                    f"Failed to parse checkpoint JSON: {e}",
                    operation="parse",
                    details={"seed_id": seed_id, "level": level},
                )
            )
        except Exception as e:
            return Result.err(
                PersistenceError(
                    f"Failed to load checkpoint: {e}",
                    operation="read",
                    details={"seed_id": seed_id, "level": level},
                )
            )

    def _rotate_checkpoints(self, seed_id: str) -> None:
        """Rotate existing checkpoints for rollback support.

        Shifts checkpoints: current -> .1, .1 -> .2, .2 -> .3
        Oldest checkpoint (.3) is deleted if it exists.

        Args:
            seed_id: Seed identifier for checkpoint rotation.
        """
        # Delete oldest checkpoint (.3) if it exists
        oldest_path = self._get_checkpoint_path(seed_id, self.MAX_ROLLBACK_DEPTH)
        if oldest_path.exists():
            oldest_path.unlink()

        # Shift existing checkpoints
        for level in range(self.MAX_ROLLBACK_DEPTH - 1, -1, -1):
            current_path = self._get_checkpoint_path(seed_id, level)
            if current_path.exists():
                next_path = self._get_checkpoint_path(seed_id, level + 1)
                current_path.rename(next_path)

    def _get_checkpoint_path(self, seed_id: str, level: int = 0) -> Path:
        """Get file path for checkpoint at specific rollback level.

        Args:
            seed_id: Seed identifier.
            level: Rollback level (0=current, 1-3=previous).

        Returns:
            Path to checkpoint file.
        """
        filename = f"checkpoint_{seed_id}.json"
        if level > 0:
            filename = f"checkpoint_{seed_id}.json.{level}"
        return self._base_path / filename


class PeriodicCheckpointer:
    """Background task for automatic periodic checkpointing.

    Runs a background asyncio task that calls a checkpoint callback
    at regular intervals (default 5 minutes per AC2).

    Usage:
        async def checkpoint_callback():
            # Get current state and save checkpoint
            checkpoint = CheckpointData.create("seed-123", "planning", state)
            store.save(checkpoint)

        checkpointer = PeriodicCheckpointer(checkpoint_callback, interval=300)
        await checkpointer.start()

        # Later, when done
        await checkpointer.stop()
    """

    def __init__(
        self,
        checkpoint_callback: Callable[[], Awaitable[None]],
        interval: int = 300,  # 5 minutes default
    ) -> None:
        """Initialize periodic checkpointer.

        Args:
            checkpoint_callback: Async function to call for checkpointing.
            interval: Interval in seconds between checkpoints (default 300 = 5 min).
        """
        self._callback = checkpoint_callback
        self._interval = interval
        self._task: asyncio.Task | None = None
        self._stop_event = asyncio.Event()

    async def start(self) -> None:
        """Start the periodic checkpointing background task.

        This method is idempotent - calling it multiple times is safe.
        """
        if self._task is None or self._task.done():
            self._stop_event.clear()
            self._task = asyncio.create_task(self._run())

    async def stop(self) -> None:
        """Stop the periodic checkpointing background task.

        Waits for the current checkpoint to complete before stopping.
        """
        if self._task is not None and not self._task.done():
            self._stop_event.set()
            await self._task
        self._task = None

    async def _run(self) -> None:
        """Internal background task loop."""
        while not self._stop_event.is_set():
            try:
                # Wait for interval or stop event
                await asyncio.wait_for(
                    self._stop_event.wait(), timeout=self._interval
                )
                # If we get here, stop event was set
                break
            except asyncio.TimeoutError:
                # Timeout means it's time to checkpoint
                try:
                    await self._callback()
                except Exception as e:
                    # Log error but continue checkpointing
                    print(f"Periodic checkpoint failed: {e}")


class RecoveryManager:
    """Manager for workflow recovery on startup.

    Handles loading the latest valid checkpoint and restoring workflow state.
    Provides recovery status and logging for debugging.

    Usage:
        store = CheckpointStore()
        manager = RecoveryManager(store)

        result = await manager.recover("seed-123")
        if result.is_ok:
            checkpoint = result.value
            # Restore workflow state from checkpoint
    """

    def __init__(self, checkpoint_store: CheckpointStore) -> None:
        """Initialize recovery manager.

        Args:
            checkpoint_store: CheckpointStore instance for loading checkpoints.
        """
        self._store = checkpoint_store

    async def recover(
        self, seed_id: str
    ) -> Result[CheckpointData | None, PersistenceError]:
        """Recover workflow state from latest valid checkpoint.

        Attempts to load the latest checkpoint. If not found or corrupted,
        uses automatic rollback. Returns None if no checkpoint exists
        (normal for first run).

        Args:
            seed_id: Seed identifier to recover.

        Returns:
            Result.ok(CheckpointData) if checkpoint loaded,
            Result.ok(None) if no checkpoint exists (normal),
            Result.err(PersistenceError) if recovery failed after rollback.
        """
        result = self._store.load(seed_id)

        if result.is_err:
            error = result.error
            # Check if error is due to no checkpoint (normal for first run)
            # Match both "not found" and "no valid checkpoint found"
            error_msg_lower = error.message.lower()
            if "not found" in error_msg_lower or "no valid checkpoint found" in error_msg_lower:
                print(f"No checkpoint found for {seed_id} - starting fresh")
                return Result.ok(None)

            # Other errors indicate corruption/recovery failure
            print(f"Recovery failed for {seed_id}: {error.message}")
            return Result.err(error)

        checkpoint = result.value
        print(
            f"Recovered checkpoint for {seed_id} "
            f"from phase '{checkpoint.phase}' "
            f"at {checkpoint.timestamp.isoformat()}"
        )
        return Result.ok(checkpoint)
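
Taken together, the classes above compose into a recover-then-checkpoint loop: restore on startup via RecoveryManager, then keep saving snapshots via PeriodicCheckpointer while work runs. The sketch below is illustrative only, not code from the package; the "seed-123" id, "execution" phase, and the toy state dict are placeholders, and it simply wires together the documented APIs.

    import asyncio

    from ouroboros.persistence.checkpoint import (
        CheckpointData,
        CheckpointStore,
        PeriodicCheckpointer,
        RecoveryManager,
    )


    async def main() -> None:
        store = CheckpointStore()  # defaults to ~/.ouroboros/data/checkpoints/
        store.initialize()

        # On startup: load the newest valid checkpoint, rolling back up to 3 levels.
        recovered = await RecoveryManager(store).recover("seed-123")
        state = recovered.value.state if recovered.is_ok and recovered.value else {"step": 0}

        async def checkpoint_callback() -> None:
            # Snapshot whatever the workflow currently considers its state.
            store.save(CheckpointData.create("seed-123", "execution", state))

        checkpointer = PeriodicCheckpointer(checkpoint_callback, interval=300)
        await checkpointer.start()
        try:
            state["step"] = state.get("step", 0) + 1  # stand-in for real work
        finally:
            await checkpointer.stop()


    asyncio.run(main())

Because save() rotates files before writing, a run of this loop leaves checkpoint_seed-123.json plus up to three older copies (.json.1 through .json.3) in the checkpoint directory.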
ouroboros/persistence/event_store.py
@@ -0,0 +1,183 @@
"""EventStore implementation for event sourcing.

Provides async methods for appending and replaying events using SQLAlchemy Core
with aiosqlite backend.
"""


from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncEngine, create_async_engine

from ouroboros.core.errors import PersistenceError
from ouroboros.events.base import BaseEvent
from ouroboros.persistence.schema import events_table, metadata


class EventStore:
    """Event store for persisting and replaying events.

    Uses SQLAlchemy Core with aiosqlite for async database operations.
    All operations are transactional for atomicity.

    Usage:
        store = EventStore("sqlite+aiosqlite:///events.db")
        await store.initialize()

        # Append event
        await store.append(event)

        # Replay events for an aggregate
        events = await store.replay("seed", "seed-123")

        # Close when done
        await store.close()
    """

    def __init__(self, database_url: str) -> None:
        """Initialize EventStore with database URL.

        Args:
            database_url: SQLAlchemy database URL.
                For async SQLite: "sqlite+aiosqlite:///path/to/db.sqlite"
        """
        self._database_url = database_url
        self._engine: AsyncEngine | None = None

    async def initialize(self) -> None:
        """Initialize the database connection and create tables if needed.

        This method is idempotent - calling it multiple times is safe.
        """
        if self._engine is None:
            self._engine = create_async_engine(
                self._database_url,
                echo=False,
            )

            # Create all tables defined in metadata
            async with self._engine.begin() as conn:
                await conn.run_sync(metadata.create_all)

    async def append(self, event: BaseEvent) -> None:
        """Append an event to the store.

        The operation is wrapped in a transaction for atomicity.
        If the insert fails, the transaction is rolled back.

        Args:
            event: The event to append.

        Raises:
            PersistenceError: If the append operation fails.
        """
        if self._engine is None:
            raise PersistenceError(
                "EventStore not initialized. Call initialize() first.",
                operation="append",
            )

        try:
            async with self._engine.begin() as conn:
                await conn.execute(
                    events_table.insert().values(**event.to_db_dict())
                )
        except Exception as e:
            raise PersistenceError(
                f"Failed to append event: {e}",
                operation="insert",
                table="events",
                details={"event_id": event.id, "event_type": event.type},
            ) from e

    async def append_batch(self, events: list[BaseEvent]) -> None:
        """Append multiple events atomically in a single transaction.

        All events are inserted in a single transaction. If any insert fails,
        the entire batch is rolled back, ensuring atomicity.

        This is more efficient than calling append() multiple times and
        guarantees that either all events are persisted or none are.

        Args:
            events: List of events to append.

        Raises:
            PersistenceError: If the batch operation fails. No events
                will be persisted if this is raised.
        """
        if self._engine is None:
            raise PersistenceError(
                "EventStore not initialized. Call initialize() first.",
                operation="append_batch",
            )

        if not events:
            return  # Nothing to do

        try:
            async with self._engine.begin() as conn:
                # Insert all events in a single statement within one transaction
                await conn.execute(
                    events_table.insert(),
                    [event.to_db_dict() for event in events],
                )
        except Exception as e:
            raise PersistenceError(
                f"Failed to append event batch: {e}",
                operation="insert_batch",
                table="events",
                details={
                    "batch_size": len(events),
                    "event_ids": [e.id for e in events[:5]],  # First 5 for debugging
                },
            ) from e

    async def replay(
        self, aggregate_type: str, aggregate_id: str
    ) -> list[BaseEvent]:
        """Replay all events for a specific aggregate.

        The operation uses a transaction for read consistency.

        Args:
            aggregate_type: The type of aggregate (e.g., "seed", "execution").
            aggregate_id: The unique identifier of the aggregate.

        Returns:
            List of events for the aggregate, ordered by timestamp.

        Raises:
            PersistenceError: If the replay operation fails.
        """
        if self._engine is None:
            raise PersistenceError(
                "EventStore not initialized. Call initialize() first.",
                operation="replay",
            )

        try:
            async with self._engine.begin() as conn:
                result = await conn.execute(
                    select(events_table)
                    .where(events_table.c.aggregate_type == aggregate_type)
                    .where(events_table.c.aggregate_id == aggregate_id)
                    .order_by(events_table.c.timestamp)
                )
                rows = result.mappings().all()
                return [BaseEvent.from_db_row(dict(row)) for row in rows]
        except Exception as e:
            raise PersistenceError(
                f"Failed to replay events: {e}",
                operation="select",
                table="events",
                details={
                    "aggregate_type": aggregate_type,
                    "aggregate_id": aggregate_id,
                },
            ) from e

    async def close(self) -> None:
        """Close the database connection."""
        if self._engine is not None:
            await self._engine.dispose()
            self._engine = None
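
As a quick orientation to the EventStore API above, here is a minimal end-to-end sketch. It is illustrative only: the database path and the "seed-123" aggregate id are placeholders, and appending is only hinted at in a comment because BaseEvent construction lives in ouroboros.events.base and is not shown in this diff.

    import asyncio

    from ouroboros.persistence.event_store import EventStore


    async def main() -> None:
        # File-backed async SQLite; the URL form matches the class docstring.
        store = EventStore("sqlite+aiosqlite:///events.db")
        await store.initialize()  # idempotent; creates the events table if missing
        try:
            # Appending would look like:  await store.append(some_base_event)
            # or, atomically:             await store.append_batch(list_of_events)
            history = await store.replay("seed", "seed-123")  # ordered by timestamp
            print(f"{len(history)} events recorded for seed-123")
        finally:
            await store.close()


    asyncio.run(main())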
ouroboros/persistence/migrations/__init__.py
@@ -0,0 +1 @@
"""Database migrations module."""