asap-protocol 0.3.0__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- asap/__init__.py +1 -1
- asap/cli.py +137 -2
- asap/errors.py +167 -0
- asap/examples/README.md +81 -10
- asap/examples/auth_patterns.py +212 -0
- asap/examples/error_recovery.py +248 -0
- asap/examples/long_running.py +287 -0
- asap/examples/mcp_integration.py +240 -0
- asap/examples/multi_step_workflow.py +134 -0
- asap/examples/orchestration.py +293 -0
- asap/examples/rate_limiting.py +137 -0
- asap/examples/run_demo.py +9 -4
- asap/examples/secure_handler.py +84 -0
- asap/examples/state_migration.py +240 -0
- asap/examples/streaming_response.py +108 -0
- asap/examples/websocket_concept.py +129 -0
- asap/mcp/__init__.py +43 -0
- asap/mcp/client.py +224 -0
- asap/mcp/protocol.py +179 -0
- asap/mcp/server.py +333 -0
- asap/mcp/server_runner.py +40 -0
- asap/models/__init__.py +4 -0
- asap/models/base.py +0 -3
- asap/models/constants.py +76 -1
- asap/models/entities.py +58 -7
- asap/models/envelope.py +14 -1
- asap/models/ids.py +8 -4
- asap/models/parts.py +33 -3
- asap/models/validators.py +16 -0
- asap/observability/__init__.py +6 -0
- asap/observability/dashboards/README.md +24 -0
- asap/observability/dashboards/asap-detailed.json +131 -0
- asap/observability/dashboards/asap-red.json +129 -0
- asap/observability/logging.py +81 -1
- asap/observability/metrics.py +15 -1
- asap/observability/trace_parser.py +238 -0
- asap/observability/trace_ui.py +218 -0
- asap/observability/tracing.py +293 -0
- asap/state/machine.py +15 -2
- asap/state/snapshot.py +0 -9
- asap/testing/__init__.py +31 -0
- asap/testing/assertions.py +108 -0
- asap/testing/fixtures.py +113 -0
- asap/testing/mocks.py +152 -0
- asap/transport/__init__.py +31 -0
- asap/transport/cache.py +180 -0
- asap/transport/circuit_breaker.py +194 -0
- asap/transport/client.py +989 -72
- asap/transport/compression.py +389 -0
- asap/transport/handlers.py +106 -53
- asap/transport/middleware.py +64 -39
- asap/transport/server.py +461 -94
- asap/transport/validators.py +320 -0
- asap/utils/__init__.py +7 -0
- asap/utils/sanitization.py +134 -0
- asap_protocol-1.0.0.dist-info/METADATA +264 -0
- asap_protocol-1.0.0.dist-info/RECORD +70 -0
- asap_protocol-0.3.0.dist-info/METADATA +0 -227
- asap_protocol-0.3.0.dist-info/RECORD +0 -37
- {asap_protocol-0.3.0.dist-info → asap_protocol-1.0.0.dist-info}/WHEEL +0 -0
- {asap_protocol-0.3.0.dist-info → asap_protocol-1.0.0.dist-info}/entry_points.txt +0 -0
- {asap_protocol-0.3.0.dist-info → asap_protocol-1.0.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,248 @@
|
|
|
1
|
+
"""Error recovery patterns example for ASAP protocol.
|
|
2
|
+
|
|
3
|
+
This module demonstrates retry with backoff, circuit breaker, and fallback
|
|
4
|
+
patterns using ASAP's built-in support and small helpers you can reuse.
|
|
5
|
+
|
|
6
|
+
Patterns:
|
|
7
|
+
1. Retry with backoff: RetryConfig + ASAPClient, or a standalone retry loop.
|
|
8
|
+
2. Circuit breaker: CircuitBreaker (and ASAPClient with circuit_breaker_enabled).
|
|
9
|
+
3. Fallback: Try primary operation; on failure use fallback result or backup agent.
|
|
10
|
+
|
|
11
|
+
Run:
|
|
12
|
+
uv run python -m asap.examples.error_recovery
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import argparse
|
|
18
|
+
import random
|
|
19
|
+
import time
|
|
20
|
+
from typing import Callable, Sequence, TypeVar
|
|
21
|
+
|
|
22
|
+
from asap.observability import get_logger
|
|
23
|
+
from asap.transport.circuit_breaker import CircuitBreaker, CircuitState
|
|
24
|
+
from asap.transport.client import RetryConfig
|
|
25
|
+
|
|
26
|
+
# Module-level logger; the helpers below call it with an event name plus keyword fields.
logger = get_logger(__name__)

# Generic result type so retry_with_backoff / with_fallback return whatever the callable returns.
T = TypeVar("T")

# Demo defaults (short delays so the example runs quickly)
DEFAULT_MAX_RETRIES = 3  # default number of retries after the first attempt
DEFAULT_BASE_DELAY = 0.05  # seconds; first backoff delay before doubling
DEFAULT_MAX_DELAY = 0.5  # seconds; cap applied to the exponential delay
DEFAULT_CB_THRESHOLD = 2  # passed as CircuitBreaker(threshold=...) in demo_circuit_breaker
DEFAULT_CB_TIMEOUT = 0.2  # seconds; passed as CircuitBreaker(timeout=...) and slept on in the demo
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def retry_with_backoff(
    fn: Callable[[], T],
    max_retries: int = DEFAULT_MAX_RETRIES,
    base_delay: float = DEFAULT_BASE_DELAY,
    max_delay: float = DEFAULT_MAX_DELAY,
    jitter: bool = True,
) -> T:
    """Call fn(); on exception, retry with exponential backoff until success or max_retries.

    The delay before retry number ``attempt`` (0-based) is
    ``min(base_delay * 2**attempt, max_delay)``. With ``jitter`` enabled the
    delay is then multiplied by a random factor in ``[0.5, 1.5)``, so a
    jittered delay may exceed ``max_delay`` by up to 50%.

    Only ``Exception`` subclasses are retried. ``KeyboardInterrupt``,
    ``SystemExit`` and other ``BaseException``-only signals propagate
    immediately so that Ctrl-C or interpreter shutdown is never delayed by
    a backoff sleep.

    Args:
        fn: Callable that may raise. No arguments.
        max_retries: Number of retries after the first attempt (total attempts = max_retries + 1).
        base_delay: Base delay in seconds for exponential backoff.
        max_delay: Cap on the un-jittered delay in seconds.
        jitter: If True, scale each delay by a random factor in [0.5, 1.5).

    Returns:
        Result of fn().

    Raises:
        Exception: The last exception raised by fn() if all attempts fail.
        RuntimeError: If max_retries is negative, so fn() is never called.
    """
    for attempt in range(max_retries + 1):
        try:
            return fn()
        except Exception as exc:
            # Out of retries: re-raise with the original traceback intact.
            if attempt == max_retries:
                raise
            delay = min(base_delay * (2**attempt), max_delay)
            if jitter:
                # Non-cryptographic randomness is fine for spreading retries.
                delay *= 0.5 + random.random()  # nosec B311
            logger.info(
                "asap.error_recovery.retry",
                attempt=attempt + 1,
                max_retries=max_retries,
                delay_seconds=round(delay, 3),
                error=str(exc),
            )
            time.sleep(delay)
    # Only reachable when the loop body never ran (negative max_retries).
    raise RuntimeError("retry_with_backoff: no result and no exception")
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def with_fallback(
    primary_fn: Callable[[], T],
    fallback_fn: Callable[[], T],
) -> T:
    """Return primary_fn()'s result, or fallback_fn()'s result if the primary raises.

    Useful when a backup exists (e.g. cached value, secondary agent, default payload).
    A warning is logged with the primary error before the fallback is invoked.

    Args:
        primary_fn: Operation that may raise.
        fallback_fn: Invoked only when primary_fn raises an Exception; any
            exception it raises propagates to the caller.

    Returns:
        Result of primary_fn or fallback_fn.
    """
    try:
        outcome = primary_fn()
    except Exception as primary_failure:
        logger.warning(
            "asap.error_recovery.fallback",
            primary_error=str(primary_failure),
            message="Using fallback",
        )
        outcome = fallback_fn()
    return outcome
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def demo_retry_with_backoff(
    fails_then_succeeds_at: int = 2,
    max_retries: int = DEFAULT_MAX_RETRIES,
) -> None:
    """Demonstrate retry with backoff using a flaky callable.

    The callable raises ConnectionError on every call numbered below
    ``fails_then_succeeds_at`` and returns ``"ok"`` from that call onwards
    (so it fails ``fails_then_succeeds_at - 1`` times with the default setup).

    Args:
        fails_then_succeeds_at: 1-based call number at which the callable starts succeeding.
        max_retries: Retry budget handed to retry_with_backoff.
    """
    call_count = 0

    def flaky_op() -> str:
        nonlocal call_count
        call_count += 1
        if call_count >= fails_then_succeeds_at:
            return "ok"
        raise ConnectionError(f"Simulated failure (call #{call_count})")

    outcome = retry_with_backoff(
        flaky_op,
        max_retries=max_retries,
        base_delay=DEFAULT_BASE_DELAY,
        max_delay=DEFAULT_MAX_DELAY,
    )
    logger.info("asap.error_recovery.retry_demo_complete", result=outcome, calls=call_count)
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def demo_circuit_breaker(
    threshold: int = DEFAULT_CB_THRESHOLD,
    timeout: float = DEFAULT_CB_TIMEOUT,
) -> None:
    """Demonstrate circuit breaker: record failures until OPEN, wait, then HALF_OPEN and recover.

    The walk-through checks each expected state with an assert and logs the
    state after every transition.

    Args:
        threshold: Number of failures recorded before the breaker is expected to be OPEN.
        timeout: Seconds passed to the breaker as its timeout; the demo sleeps
            slightly longer than this before probing again.

    Raises:
        AssertionError: If the breaker is not in the expected state at any step.
    """
    breaker = CircuitBreaker(threshold=threshold, timeout=timeout)

    # CLOSED -> record failures until OPEN
    for _ in range(threshold):
        breaker.record_failure()
    assert breaker.get_state() == CircuitState.OPEN  # nosec B101
    assert breaker.can_attempt() is False  # nosec B101
    logger.info(
        "asap.error_recovery.circuit_open",
        state=breaker.get_state().value,
        consecutive_failures=breaker.get_consecutive_failures(),
    )

    # Wait for timeout -> HALF_OPEN
    # The extra 0.05s is a margin so the timeout has definitely elapsed.
    time.sleep(timeout + 0.05)
    # NOTE(review): can_attempt() is checked before get_state(); presumably it is the
    # call that moves the breaker to HALF_OPEN. If so, running under `python -O`
    # (asserts stripped) would skip the transition — confirm against CircuitBreaker.
    assert breaker.can_attempt() is True  # nosec B101
    assert breaker.get_state() == CircuitState.HALF_OPEN  # nosec B101
    logger.info(
        "asap.error_recovery.circuit_half_open",
        state=breaker.get_state().value,
    )

    # Success -> CLOSED
    breaker.record_success()
    assert breaker.get_state() == CircuitState.CLOSED  # nosec B101
    logger.info(
        "asap.error_recovery.circuit_closed",
        state=breaker.get_state().value,
    )
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def demo_fallback() -> None:
    """Show the fallback pattern: a primary that always raises and a fallback that supplies a default.

    The value returned by with_fallback is logged when the demo finishes.
    """

    def default_payload() -> str:
        return '{"status": "fallback", "message": "default result"}'

    def unavailable_primary() -> str:
        raise RuntimeError("Primary agent unavailable")

    outcome = with_fallback(unavailable_primary, default_payload)
    logger.info("asap.error_recovery.fallback_demo_complete", result=outcome)
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def show_client_retry_config() -> None:
    """Build a sample RetryConfig and log how it is meant to be used with ASAPClient.

    Nothing is sent anywhere; the function only constructs the config and
    logs two of its fields together with a usage hint.
    """
    sample_settings = {
        "max_retries": 3,
        "base_delay": 1.0,
        "max_delay": 60.0,
        "jitter": True,
        "circuit_breaker_enabled": True,
        "circuit_breaker_threshold": 5,
        "circuit_breaker_timeout": 60.0,
    }
    retry_config = RetryConfig(**sample_settings)
    usage_hint = (
        "Use ASAPClient(..., retry_config=RetryConfig(...)); CircuitOpenError when circuit is open"
    )
    logger.info(
        "asap.error_recovery.client_retry_config",
        message=usage_hint,
        max_retries=retry_config.max_retries,
        circuit_breaker_enabled=retry_config.circuit_breaker_enabled,
    )
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def run_demo(
    skip_retry: bool = False,
    skip_circuit: bool = False,
    skip_fallback: bool = False,
) -> None:
    """Run all error recovery demos (retry, circuit breaker, fallback).

    Args:
        skip_retry: If True, do not run demo_retry_with_backoff().
        skip_circuit: If True, do not run demo_circuit_breaker().
        skip_fallback: If True, do not run demo_fallback().
    """
    if not skip_retry:
        demo_retry_with_backoff()
    if not skip_circuit:
        demo_circuit_breaker()
    if not skip_fallback:
        demo_fallback()
    # NOTE(review): nesting was reconstructed from a listing without indentation;
    # this call is assumed to run unconditionally (no flag names it) — confirm.
    show_client_retry_config()
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
def parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace:
    """Parse command-line arguments for the error recovery demo.

    Args:
        argv: Argument list to parse; None lets argparse read sys.argv.

    Returns:
        Namespace with boolean ``skip_retry``, ``skip_circuit`` and ``skip_fallback``.
    """
    cli = argparse.ArgumentParser(
        description="Error recovery patterns: retry with backoff, circuit breaker, fallback."
    )
    # Each demo has one opt-out switch; all share the same store_true shape.
    skip_switches = (
        ("--skip-retry", "Skip retry demo."),
        ("--skip-circuit", "Skip circuit breaker demo."),
        ("--skip-fallback", "Skip fallback demo."),
    )
    for flag, description in skip_switches:
        cli.add_argument(flag, action="store_true", help=description)
    return cli.parse_args(argv)
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def main(argv: Sequence[str] | None = None) -> None:
    """Entry point: parse the CLI flags and run the selected error recovery demos.

    Args:
        argv: Argument list forwarded to parse_args; None means use sys.argv.
    """
    options = parse_args(argv)
    run_demo(
        skip_retry=options.skip_retry,
        skip_circuit=options.skip_circuit,
        skip_fallback=options.skip_fallback,
    )


# Allow `python -m asap.examples.error_recovery` to run the demo directly.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,287 @@
|
|
|
1
|
+
"""Long-running task with checkpoints example for ASAP protocol.
|
|
2
|
+
|
|
3
|
+
This module demonstrates saving task state as snapshots and resuming after
|
|
4
|
+
a "crash" (e.g. process exit, failure). Use StateSnapshot and a SnapshotStore
|
|
5
|
+
to persist progress so work can continue from the last checkpoint.
|
|
6
|
+
|
|
7
|
+
Scenario:
|
|
8
|
+
- A task runs in multiple steps (e.g. step 1, 2, 3, ...).
|
|
9
|
+
- After each step we save a StateSnapshot to the store.
|
|
10
|
+
- If the process crashes or stops, we can resume by loading the latest
|
|
11
|
+
snapshot and continuing from the next step.
|
|
12
|
+
|
|
13
|
+
Run:
|
|
14
|
+
uv run python -m asap.examples.long_running
|
|
15
|
+
uv run python -m asap.examples.long_running --crash-after 2 # Simulate crash after step 2
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import argparse
|
|
21
|
+
from datetime import datetime, timezone
|
|
22
|
+
from typing import Any, Protocol, Sequence, runtime_checkable
|
|
23
|
+
|
|
24
|
+
from asap.models.entities import StateSnapshot
|
|
25
|
+
from asap.models.ids import generate_id
|
|
26
|
+
from asap.models.types import TaskID
|
|
27
|
+
from asap.observability import get_logger
|
|
28
|
+
from asap.state.snapshot import InMemorySnapshotStore
|
|
29
|
+
|
|
30
|
+
# Module-level logger; functions below call it with an event name plus keyword fields.
logger = get_logger(__name__)

# Public names of this example module (InMemorySnapshotStore is re-exported from asap.state.snapshot).
__all__ = [
    "KEY_COMPLETED",
    "KEY_PARTIAL_RESULT",
    "KEY_PROGRESS_PCT",
    "KEY_STEP",
    "InMemorySnapshotStore",
    "SnapshotStoreLike",
    "create_snapshot",
    "resume_from_store",
    "run_demo",
    "run_steps",
]

# Keys used in snapshot data for this example
KEY_STEP = "step"  # step number the snapshot was taken after
KEY_PROGRESS_PCT = "progress_pct"  # integer progress percentage
KEY_PARTIAL_RESULT = "partial_result"  # dict of results accumulated so far
KEY_COMPLETED = "completed"  # True only on the snapshot of the final step
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
@runtime_checkable
class SnapshotStoreLike(Protocol):
    """Minimal protocol for snapshot save/get (for type hints in this example).

    Any object providing these three methods can be passed to run_steps() and
    resume_from_store(). Because the protocol is runtime-checkable,
    isinstance() may be used against it; such checks only verify that the
    methods exist, not their signatures.
    """

    # Persist one snapshot; called after every step in this example.
    def save(self, snapshot: StateSnapshot) -> None: ...
    # Fetch a snapshot for a task, or None if there is none. This example always
    # passes version=None and treats the result as the latest snapshot.
    def get(self, task_id: TaskID, version: int | None = None) -> StateSnapshot | None: ...
    # Not called anywhere in this example; presumably lists the stored version
    # numbers for the task — confirm against InMemorySnapshotStore.
    def list_versions(self, task_id: TaskID) -> list[int]: ...
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def create_snapshot(
    task_id: str,
    version: int,
    step: int,
    progress_pct: int,
    partial_result: dict[str, Any],
    completed: bool = False,
    checkpoint: bool = True,
) -> StateSnapshot:
    """Assemble a StateSnapshot describing the progress of the long-running task.

    A new snapshot ID is generated and the creation time is set to the
    current UTC time on every call.

    Args:
        task_id: Parent task ID.
        version: Snapshot version (monotonically increasing).
        step: Current step number (1-based).
        progress_pct: Progress percentage (0–100).
        partial_result: Result data accumulated so far.
        completed: Whether the task is fully completed.
        checkpoint: Whether this snapshot is a significant checkpoint.

    Returns:
        StateSnapshot ready to save to a store.
    """
    snapshot_id = generate_id()
    progress_data: dict[str, Any] = {
        KEY_STEP: step,
        KEY_PROGRESS_PCT: progress_pct,
        KEY_PARTIAL_RESULT: partial_result,
        KEY_COMPLETED: completed,
    }
    taken_at = datetime.now(timezone.utc)
    return StateSnapshot(
        id=snapshot_id,
        task_id=task_id,
        version=version,
        data=progress_data,
        checkpoint=checkpoint,
        created_at=taken_at,
    )
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def run_steps(
    store: SnapshotStoreLike,
    task_id: str,
    num_steps: int,
    crash_after_step: int | None = None,
) -> StateSnapshot | None:
    """Run the long-running task: execute steps 1..num_steps and save a snapshot after each.

    If crash_after_step is set, stop after that step (simulating a crash).
    The latest snapshot remains in the store so the task can be resumed.

    Every snapshot receives its own copy of the accumulated items, so a
    checkpoint saved for step N keeps showing exactly N items even after
    later steps have run.

    Args:
        store: Snapshot store to persist state.
        task_id: Task identifier.
        num_steps: Total number of steps (e.g. 5).
        crash_after_step: If set, stop after this step (1-based). None = no crash.
            The loop stops after the first step whose number is >= this value.

    Returns:
        Latest snapshot after the run, or None if no step was executed.
    """
    items: list[str] = []
    last_snapshot: StateSnapshot | None = None

    for step in range(1, num_steps + 1):
        progress_pct = (step * 100) // num_steps
        items.append(f"result_step_{step}")

        completed = step == num_steps
        # The snapshot version simply tracks the step number in this example.
        version = step
        snapshot = create_snapshot(
            task_id=task_id,
            version=version,
            step=step,
            progress_pct=progress_pct,
            # Fresh dict AND fresh list: a shallow dict copy would leave every
            # snapshot pointing at the same, still-growing items list.
            partial_result={"items": list(items), "last_step": step},
            completed=completed,
            checkpoint=True,
        )
        store.save(snapshot)
        last_snapshot = snapshot
        logger.info(
            "asap.long_running.checkpoint",
            task_id=task_id,
            step=step,
            version=version,
            progress_pct=progress_pct,
        )

        if crash_after_step is not None and step >= crash_after_step:
            logger.warning(
                "asap.long_running.crash_simulated",
                task_id=task_id,
                after_step=step,
            )
            break

    return last_snapshot
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def resume_from_store(
    store: SnapshotStoreLike,
    task_id: str,
    num_steps: int,
) -> StateSnapshot | None:
    """Resume a long-running task from the latest snapshot in the store.

    Loads the latest StateSnapshot for task_id, reads the last completed step,
    and runs from (step + 1) to num_steps, saving a snapshot after each step.

    The loaded snapshot is never modified: its accumulated items are copied
    before new results are appended, and each new snapshot receives its own
    copy as well.

    Args:
        store: Snapshot store where state was persisted.
        task_id: Task identifier.
        num_steps: Total number of steps (must match original task).

    Returns:
        None if the store has no snapshot for task_id. The loaded snapshot,
        unchanged, if it already covers all num_steps. Otherwise the snapshot
        saved for the final step executed during the resume.
    """
    latest = store.get(task_id, version=None)
    if latest is None:
        logger.warning("asap.long_running.no_snapshot", task_id=task_id)
        return None

    data = latest.data
    last_step = data.get(KEY_STEP, 0)
    stored_result = data.get(KEY_PARTIAL_RESULT) or {}
    # Copy the dict and its items list so appending below cannot alter the
    # snapshot that is already persisted in the store.
    partial_result = dict(stored_result)
    partial_result["items"] = list(stored_result.get("items", []))

    if last_step >= num_steps:
        logger.info("asap.long_running.already_complete", task_id=task_id)
        return latest

    logger.info(
        "asap.long_running.resuming",
        task_id=task_id,
        from_step=last_step + 1,
        num_steps=num_steps,
    )

    last_snapshot: StateSnapshot | None = latest
    for step in range(last_step + 1, num_steps + 1):
        progress_pct = (step * 100) // num_steps
        partial_result["items"].append(f"result_step_{step}")
        partial_result["last_step"] = step
        completed = step == num_steps
        # The snapshot version simply tracks the step number in this example.
        version = step
        snapshot = create_snapshot(
            task_id=task_id,
            version=version,
            step=step,
            progress_pct=progress_pct,
            # Independent copy per snapshot so later steps do not rewrite it.
            partial_result={**partial_result, "items": list(partial_result["items"])},
            completed=completed,
            checkpoint=True,
        )
        store.save(snapshot)
        last_snapshot = snapshot
        logger.info(
            "asap.long_running.checkpoint",
            task_id=task_id,
            step=step,
            version=version,
            progress_pct=progress_pct,
        )

    return last_snapshot
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
def run_demo(
    num_steps: int = 5,
    crash_after_step: int | None = 2,
) -> None:
    """Run the end-to-end demo: execute until the simulated crash, then resume to completion.

    Uses a fresh in-memory store and a newly generated task ID.

    Args:
        num_steps: Total number of steps.
        crash_after_step: Step after which to simulate a crash (1-based). None = no crash.

    Raises:
        SystemExit: With code 1 if resuming yields no snapshot or the final
            snapshot is not marked completed.
    """
    snapshot_store: InMemorySnapshotStore = InMemorySnapshotStore()
    task_id = generate_id()

    # First pass: run until the simulated crash (or to the end if none).
    run_steps(snapshot_store, task_id, num_steps, crash_after_step=crash_after_step)

    # Second pass: pick up from whatever the store holds.
    outcome = resume_from_store(snapshot_store, task_id, num_steps)
    if outcome is None or not outcome.data.get(KEY_COMPLETED, False):
        raise SystemExit(1)

    final_step = outcome.data[KEY_STEP]
    logger.info("asap.long_running.demo_complete", task_id=task_id, final_step=final_step)
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
def parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace:
    """Parse command-line arguments for the long-running demo.

    Args:
        argv: Optional list of CLI arguments for testing; None lets argparse read sys.argv.

    Returns:
        Parsed argparse namespace with ``num_steps`` (int) and ``crash_after``
        (int, or None when ``--crash-after 0`` was given).
    """
    cli = argparse.ArgumentParser(
        description="Long-running task with checkpoints (save snapshot, resume after crash)."
    )
    cli.add_argument("--num-steps", type=int, default=5, help="Total number of steps in the task.")
    cli.add_argument(
        "--crash-after",
        type=int,
        default=2,
        metavar="N",
        help="Simulate crash after step N (1-based). Use 0 to disable crash.",
    )
    namespace = cli.parse_args(argv)
    # 0 is the CLI spelling of "never crash"; downstream code expects None for that.
    namespace.crash_after = namespace.crash_after or None
    return namespace
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
def main(argv: Sequence[str] | None = None) -> None:
    """Entry point for the long-running task demo: checkpoint, crash, resume.

    Args:
        argv: Argument list forwarded to parse_args; None means use sys.argv.
    """
    options = parse_args(argv)
    run_demo(
        num_steps=options.num_steps,
        crash_after_step=options.crash_after,
    )


# Allow `python -m asap.examples.long_running` to run the demo directly.
if __name__ == "__main__":
    main()
|