pyworkflow-engine 0.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dashboard/backend/app/__init__.py +1 -0
- dashboard/backend/app/config.py +32 -0
- dashboard/backend/app/controllers/__init__.py +6 -0
- dashboard/backend/app/controllers/run_controller.py +86 -0
- dashboard/backend/app/controllers/workflow_controller.py +33 -0
- dashboard/backend/app/dependencies/__init__.py +5 -0
- dashboard/backend/app/dependencies/storage.py +50 -0
- dashboard/backend/app/repositories/__init__.py +6 -0
- dashboard/backend/app/repositories/run_repository.py +80 -0
- dashboard/backend/app/repositories/workflow_repository.py +27 -0
- dashboard/backend/app/rest/__init__.py +8 -0
- dashboard/backend/app/rest/v1/__init__.py +12 -0
- dashboard/backend/app/rest/v1/health.py +33 -0
- dashboard/backend/app/rest/v1/runs.py +133 -0
- dashboard/backend/app/rest/v1/workflows.py +41 -0
- dashboard/backend/app/schemas/__init__.py +23 -0
- dashboard/backend/app/schemas/common.py +16 -0
- dashboard/backend/app/schemas/event.py +24 -0
- dashboard/backend/app/schemas/hook.py +25 -0
- dashboard/backend/app/schemas/run.py +54 -0
- dashboard/backend/app/schemas/step.py +28 -0
- dashboard/backend/app/schemas/workflow.py +31 -0
- dashboard/backend/app/server.py +87 -0
- dashboard/backend/app/services/__init__.py +6 -0
- dashboard/backend/app/services/run_service.py +240 -0
- dashboard/backend/app/services/workflow_service.py +155 -0
- dashboard/backend/main.py +18 -0
- docs/concepts/cancellation.mdx +362 -0
- docs/concepts/continue-as-new.mdx +434 -0
- docs/concepts/events.mdx +266 -0
- docs/concepts/fault-tolerance.mdx +370 -0
- docs/concepts/hooks.mdx +552 -0
- docs/concepts/limitations.mdx +167 -0
- docs/concepts/schedules.mdx +775 -0
- docs/concepts/sleep.mdx +312 -0
- docs/concepts/steps.mdx +301 -0
- docs/concepts/workflows.mdx +255 -0
- docs/guides/cli.mdx +942 -0
- docs/guides/configuration.mdx +560 -0
- docs/introduction.mdx +155 -0
- docs/quickstart.mdx +279 -0
- examples/__init__.py +1 -0
- examples/celery/__init__.py +1 -0
- examples/celery/durable/docker-compose.yml +55 -0
- examples/celery/durable/pyworkflow.config.yaml +12 -0
- examples/celery/durable/workflows/__init__.py +122 -0
- examples/celery/durable/workflows/basic.py +87 -0
- examples/celery/durable/workflows/batch_processing.py +102 -0
- examples/celery/durable/workflows/cancellation.py +273 -0
- examples/celery/durable/workflows/child_workflow_patterns.py +240 -0
- examples/celery/durable/workflows/child_workflows.py +202 -0
- examples/celery/durable/workflows/continue_as_new.py +260 -0
- examples/celery/durable/workflows/fault_tolerance.py +210 -0
- examples/celery/durable/workflows/hooks.py +211 -0
- examples/celery/durable/workflows/idempotency.py +112 -0
- examples/celery/durable/workflows/long_running.py +99 -0
- examples/celery/durable/workflows/retries.py +101 -0
- examples/celery/durable/workflows/schedules.py +209 -0
- examples/celery/transient/01_basic_workflow.py +91 -0
- examples/celery/transient/02_fault_tolerance.py +257 -0
- examples/celery/transient/__init__.py +20 -0
- examples/celery/transient/pyworkflow.config.yaml +25 -0
- examples/local/__init__.py +1 -0
- examples/local/durable/01_basic_workflow.py +94 -0
- examples/local/durable/02_file_storage.py +132 -0
- examples/local/durable/03_retries.py +169 -0
- examples/local/durable/04_long_running.py +119 -0
- examples/local/durable/05_event_log.py +145 -0
- examples/local/durable/06_idempotency.py +148 -0
- examples/local/durable/07_hooks.py +334 -0
- examples/local/durable/08_cancellation.py +233 -0
- examples/local/durable/09_child_workflows.py +198 -0
- examples/local/durable/10_child_workflow_patterns.py +265 -0
- examples/local/durable/11_continue_as_new.py +249 -0
- examples/local/durable/12_schedules.py +198 -0
- examples/local/durable/__init__.py +1 -0
- examples/local/transient/01_quick_tasks.py +87 -0
- examples/local/transient/02_retries.py +130 -0
- examples/local/transient/03_sleep.py +141 -0
- examples/local/transient/__init__.py +1 -0
- pyworkflow/__init__.py +256 -0
- pyworkflow/aws/__init__.py +68 -0
- pyworkflow/aws/context.py +234 -0
- pyworkflow/aws/handler.py +184 -0
- pyworkflow/aws/testing.py +310 -0
- pyworkflow/celery/__init__.py +41 -0
- pyworkflow/celery/app.py +198 -0
- pyworkflow/celery/scheduler.py +315 -0
- pyworkflow/celery/tasks.py +1746 -0
- pyworkflow/cli/__init__.py +132 -0
- pyworkflow/cli/__main__.py +6 -0
- pyworkflow/cli/commands/__init__.py +1 -0
- pyworkflow/cli/commands/hooks.py +640 -0
- pyworkflow/cli/commands/quickstart.py +495 -0
- pyworkflow/cli/commands/runs.py +773 -0
- pyworkflow/cli/commands/scheduler.py +130 -0
- pyworkflow/cli/commands/schedules.py +794 -0
- pyworkflow/cli/commands/setup.py +703 -0
- pyworkflow/cli/commands/worker.py +413 -0
- pyworkflow/cli/commands/workflows.py +1257 -0
- pyworkflow/cli/output/__init__.py +1 -0
- pyworkflow/cli/output/formatters.py +321 -0
- pyworkflow/cli/output/styles.py +121 -0
- pyworkflow/cli/utils/__init__.py +1 -0
- pyworkflow/cli/utils/async_helpers.py +30 -0
- pyworkflow/cli/utils/config.py +130 -0
- pyworkflow/cli/utils/config_generator.py +344 -0
- pyworkflow/cli/utils/discovery.py +53 -0
- pyworkflow/cli/utils/docker_manager.py +651 -0
- pyworkflow/cli/utils/interactive.py +364 -0
- pyworkflow/cli/utils/storage.py +115 -0
- pyworkflow/config.py +329 -0
- pyworkflow/context/__init__.py +63 -0
- pyworkflow/context/aws.py +230 -0
- pyworkflow/context/base.py +416 -0
- pyworkflow/context/local.py +930 -0
- pyworkflow/context/mock.py +381 -0
- pyworkflow/core/__init__.py +0 -0
- pyworkflow/core/exceptions.py +353 -0
- pyworkflow/core/registry.py +313 -0
- pyworkflow/core/scheduled.py +328 -0
- pyworkflow/core/step.py +494 -0
- pyworkflow/core/workflow.py +294 -0
- pyworkflow/discovery.py +248 -0
- pyworkflow/engine/__init__.py +0 -0
- pyworkflow/engine/events.py +879 -0
- pyworkflow/engine/executor.py +682 -0
- pyworkflow/engine/replay.py +273 -0
- pyworkflow/observability/__init__.py +19 -0
- pyworkflow/observability/logging.py +234 -0
- pyworkflow/primitives/__init__.py +33 -0
- pyworkflow/primitives/child_handle.py +174 -0
- pyworkflow/primitives/child_workflow.py +372 -0
- pyworkflow/primitives/continue_as_new.py +101 -0
- pyworkflow/primitives/define_hook.py +150 -0
- pyworkflow/primitives/hooks.py +97 -0
- pyworkflow/primitives/resume_hook.py +210 -0
- pyworkflow/primitives/schedule.py +545 -0
- pyworkflow/primitives/shield.py +96 -0
- pyworkflow/primitives/sleep.py +100 -0
- pyworkflow/runtime/__init__.py +21 -0
- pyworkflow/runtime/base.py +179 -0
- pyworkflow/runtime/celery.py +310 -0
- pyworkflow/runtime/factory.py +101 -0
- pyworkflow/runtime/local.py +706 -0
- pyworkflow/scheduler/__init__.py +9 -0
- pyworkflow/scheduler/local.py +248 -0
- pyworkflow/serialization/__init__.py +0 -0
- pyworkflow/serialization/decoder.py +146 -0
- pyworkflow/serialization/encoder.py +162 -0
- pyworkflow/storage/__init__.py +54 -0
- pyworkflow/storage/base.py +612 -0
- pyworkflow/storage/config.py +185 -0
- pyworkflow/storage/dynamodb.py +1315 -0
- pyworkflow/storage/file.py +827 -0
- pyworkflow/storage/memory.py +549 -0
- pyworkflow/storage/postgres.py +1161 -0
- pyworkflow/storage/schemas.py +486 -0
- pyworkflow/storage/sqlite.py +1136 -0
- pyworkflow/utils/__init__.py +0 -0
- pyworkflow/utils/duration.py +177 -0
- pyworkflow/utils/schedule.py +391 -0
- pyworkflow_engine-0.1.7.dist-info/METADATA +687 -0
- pyworkflow_engine-0.1.7.dist-info/RECORD +196 -0
- pyworkflow_engine-0.1.7.dist-info/WHEEL +5 -0
- pyworkflow_engine-0.1.7.dist-info/entry_points.txt +2 -0
- pyworkflow_engine-0.1.7.dist-info/licenses/LICENSE +21 -0
- pyworkflow_engine-0.1.7.dist-info/top_level.txt +5 -0
- tests/examples/__init__.py +0 -0
- tests/integration/__init__.py +0 -0
- tests/integration/test_cancellation.py +330 -0
- tests/integration/test_child_workflows.py +439 -0
- tests/integration/test_continue_as_new.py +428 -0
- tests/integration/test_dynamodb_storage.py +1146 -0
- tests/integration/test_fault_tolerance.py +369 -0
- tests/integration/test_schedule_storage.py +484 -0
- tests/unit/__init__.py +0 -0
- tests/unit/backends/__init__.py +1 -0
- tests/unit/backends/test_dynamodb_storage.py +1554 -0
- tests/unit/backends/test_postgres_storage.py +1281 -0
- tests/unit/backends/test_sqlite_storage.py +1460 -0
- tests/unit/conftest.py +41 -0
- tests/unit/test_cancellation.py +364 -0
- tests/unit/test_child_workflows.py +680 -0
- tests/unit/test_continue_as_new.py +441 -0
- tests/unit/test_event_limits.py +316 -0
- tests/unit/test_executor.py +320 -0
- tests/unit/test_fault_tolerance.py +334 -0
- tests/unit/test_hooks.py +495 -0
- tests/unit/test_registry.py +261 -0
- tests/unit/test_replay.py +420 -0
- tests/unit/test_schedule_schemas.py +285 -0
- tests/unit/test_schedule_utils.py +286 -0
- tests/unit/test_scheduled_workflow.py +274 -0
- tests/unit/test_step.py +353 -0
- tests/unit/test_workflow.py +243 -0
|
@@ -0,0 +1,370 @@
|
|
|
1
|
+
---
|
|
2
|
+
title: 'Fault Tolerance & Auto Recovery'
|
|
3
|
+
description: 'How PyWorkflow automatically recovers workflows from worker crashes'
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
## What is Fault Tolerance?
|
|
7
|
+
|
|
8
|
+
In distributed systems, workers can fail unexpectedly due to crashes, OOM kills, network issues, or deployments. PyWorkflow's fault tolerance ensures your workflows survive these failures and automatically resume from where they left off.
|
|
9
|
+
|
|
10
|
+
<CardGroup cols={2}>
|
|
11
|
+
<Card title="Automatic Detection" icon="radar">
|
|
12
|
+
Worker crashes are detected automatically when Celery requeues tasks.
|
|
13
|
+
</Card>
|
|
14
|
+
<Card title="Event Replay" icon="rotate">
|
|
15
|
+
Completed steps are restored from the event log without re-execution.
|
|
16
|
+
</Card>
|
|
17
|
+
<Card title="Checkpoint Resume" icon="play">
|
|
18
|
+
Workflows continue from the last successful checkpoint, not from the beginning.
|
|
19
|
+
</Card>
|
|
20
|
+
<Card title="Configurable Limits" icon="sliders">
|
|
21
|
+
Control recovery attempts and behavior per workflow or globally.
|
|
22
|
+
</Card>
|
|
23
|
+
</CardGroup>
|
|
24
|
+
|
|
25
|
+
## How Auto Recovery Works
|
|
26
|
+
|
|
27
|
+
When a worker crashes mid-workflow, PyWorkflow automatically recovers:
|
|
28
|
+
|
|
29
|
+
```
|
|
30
|
+
┌─────────────────────────────────────────────────────────────────────┐
|
|
31
|
+
│ Worker A crashes while executing workflow │
|
|
32
|
+
│ │
|
|
33
|
+
│ 1. Celery detects WorkerLostError │
|
|
34
|
+
│ 2. Task is requeued to the broker │
|
|
35
|
+
│ 3. Worker B picks up the task │
|
|
36
|
+
│ 4. Detects workflow is in RUNNING/INTERRUPTED status │
|
|
37
|
+
│ 5. Loads event log from storage │
|
|
38
|
+
│ 6. Replays events to restore state: │
|
|
39
|
+
│ - Step results are cached (skip re-execution) │
|
|
40
|
+
│ - Pending sleeps are marked complete │
|
|
41
|
+
│ 7. Workflow continues from last checkpoint │
|
|
42
|
+
│ │
|
|
43
|
+
│ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ │
|
|
44
|
+
│ │ Step 1 │───▶│ Step 2 │───▶│ Crash! │ │ Step 3 │ │
|
|
45
|
+
│ │(complete)│ │(complete)│ │ │ │(pending) │ │
|
|
46
|
+
│ └──────────┘ └──────────┘ └────┬─────┘ └──────────┘ │
|
|
47
|
+
│ │ │
|
|
48
|
+
│ │ Recovery │
|
|
49
|
+
│ ▼ │
|
|
50
|
+
│ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ │
|
|
51
|
+
│ │ Step 1 │───▶│ Step 2 │───▶│ Resume │───▶│ Step 3 │ │
|
|
52
|
+
│ │ (replay) │ │ (replay) │ │ │ │(execute) │ │
|
|
53
|
+
│ └──────────┘ └──────────┘ └──────────┘ └──────────┘ │
|
|
54
|
+
└─────────────────────────────────────────────────────────────────────┘
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
The key insight is that **event sourcing makes recovery possible**. Every completed step is recorded as an event, so on recovery PyWorkflow replays those events to restore the workflow's state without re-executing steps that have already completed.
|
|
58
|
+
|
|
59
|
+
## Configuration
|
|
60
|
+
|
|
61
|
+
<Tabs>
|
|
62
|
+
<Tab title="Decorator">
|
|
63
|
+
Configure recovery per-workflow using the `@workflow` decorator:
|
|
64
|
+
|
|
65
|
+
```python
|
|
66
|
+
from pyworkflow import workflow, step
|
|
67
|
+
|
|
68
|
+
@workflow(
|
|
69
|
+
recover_on_worker_loss=True, # Enable auto recovery
|
|
70
|
+
max_recovery_attempts=5, # Allow up to 5 recovery attempts
|
|
71
|
+
)
|
|
72
|
+
async def resilient_workflow(order_id: str):
|
|
73
|
+
# Steps that complete before a crash won't re-execute
|
|
74
|
+
order = await validate_order(order_id)
|
|
75
|
+
payment = await process_payment(order)
|
|
76
|
+
|
|
77
|
+
# If the worker crashes here, the workflow resumes from this point
|
|
78
|
+
await create_shipment(order)
|
|
79
|
+
return {"status": "completed"}
|
|
80
|
+
```
|
|
81
|
+
</Tab>
|
|
82
|
+
<Tab title="Config File">
|
|
83
|
+
Set defaults in `pyworkflow.config.yaml`:
|
|
84
|
+
|
|
85
|
+
```yaml
|
|
86
|
+
# pyworkflow.config.yaml
|
|
87
|
+
|
|
88
|
+
module: myapp.workflows
|
|
89
|
+
runtime: celery
|
|
90
|
+
|
|
91
|
+
# Fault tolerance defaults
|
|
92
|
+
recovery:
|
|
93
|
+
recover_on_worker_loss: true
|
|
94
|
+
max_recovery_attempts: 3
|
|
95
|
+
|
|
96
|
+
storage:
|
|
97
|
+
backend: file
|
|
98
|
+
path: ./workflow_data
|
|
99
|
+
|
|
100
|
+
celery:
|
|
101
|
+
broker: redis://localhost:6379/0
|
|
102
|
+
```
|
|
103
|
+
</Tab>
|
|
104
|
+
<Tab title="Programmatic">
|
|
105
|
+
Configure globally in your application:
|
|
106
|
+
|
|
107
|
+
```python
|
|
108
|
+
import pyworkflow
|
|
109
|
+
|
|
110
|
+
pyworkflow.configure(
|
|
111
|
+
default_recover_on_worker_loss=True,
|
|
112
|
+
default_max_recovery_attempts=3,
|
|
113
|
+
)
|
|
114
|
+
```
|
|
115
|
+
</Tab>
|
|
116
|
+
</Tabs>
|
|
117
|
+
|
|
118
|
+
### Configuration Options
|
|
119
|
+
|
|
120
|
+
| Option | Type | Default | Description |
|
|
121
|
+
|--------|------|---------|-------------|
|
|
122
|
+
| `recover_on_worker_loss` | `bool` | `True` (durable) / `False` (transient) | Enable automatic recovery on worker crash |
|
|
123
|
+
| `max_recovery_attempts` | `int` | `3` | Maximum number of recovery attempts before marking as failed |
|
|
124
|
+
|
|
125
|
+
### Configuration Priority
|
|
126
|
+
|
|
127
|
+
When resolving recovery settings, PyWorkflow uses this priority order:
|
|
128
|
+
|
|
129
|
+
| Priority | Source | Example |
|
|
130
|
+
|----------|--------|---------|
|
|
131
|
+
| 1 (highest) | `@workflow()` decorator | `@workflow(recover_on_worker_loss=True)` |
|
|
132
|
+
| 2 | `pyworkflow.configure()` | `configure(default_recover_on_worker_loss=True)` |
|
|
133
|
+
| 3 | Config file | `recovery.recover_on_worker_loss: true` |
|
|
134
|
+
| 4 (lowest) | Built-in defaults | `True` for durable, `False` for transient |
|
|
135
|
+
|
|
136
|
+
## Durable vs Transient Workflows
|
|
137
|
+
|
|
138
|
+
Recovery behavior differs based on workflow durability:
|
|
139
|
+
|
|
140
|
+
<Tabs>
|
|
141
|
+
<Tab title="Durable Workflows">
|
|
142
|
+
**Durable workflows resume from the last checkpoint.**
|
|
143
|
+
|
|
144
|
+
```python
|
|
145
|
+
@workflow(durable=True, recover_on_worker_loss=True)
|
|
146
|
+
async def durable_pipeline(data_id: str):
|
|
147
|
+
# These steps are recorded as events
|
|
148
|
+
data = await fetch_data(data_id) # Event: step_completed
|
|
149
|
+
data = await validate_data(data) # Event: step_completed
|
|
150
|
+
|
|
151
|
+
await sleep("10m") # Event: sleep_started
|
|
152
|
+
|
|
153
|
+
# If a crash happens here, on recovery:
|
|
154
|
+
# - fetch_data and validate_data results restored from events
|
|
155
|
+
# - sleep marked as complete
|
|
156
|
+
# - Execution continues from transform_data
|
|
157
|
+
|
|
158
|
+
data = await transform_data(data) # Executes after recovery
|
|
159
|
+
return data
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
**Recovery process:**
|
|
163
|
+
1. Load event log from storage
|
|
164
|
+
2. Replay `step_completed` events (restore cached results)
|
|
165
|
+
3. Complete pending `sleep_started` events
|
|
166
|
+
4. Continue execution from the next step
|
|
167
|
+
</Tab>
|
|
168
|
+
<Tab title="Transient Workflows">
|
|
169
|
+
**Transient workflows restart from the beginning.**
|
|
170
|
+
|
|
171
|
+
```python
|
|
172
|
+
@workflow(durable=False, recover_on_worker_loss=True)
|
|
173
|
+
async def transient_batch(batch_id: str):
|
|
174
|
+
# No events are recorded
|
|
175
|
+
items = await fetch_items(batch_id)
|
|
176
|
+
items = await process_items(items)
|
|
177
|
+
|
|
178
|
+
# If a crash happens here, on recovery:
|
|
179
|
+
# - No event log to replay
|
|
180
|
+
# - Workflow starts over from fetch_items
|
|
181
|
+
|
|
182
|
+
items = await finalize_items(items)
|
|
183
|
+
return items
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
<Warning>
|
|
187
|
+
Transient workflows with recovery enabled will **restart from scratch** on each recovery attempt. Ensure your steps are idempotent if using this pattern.
|
|
188
|
+
</Warning>
|
|
189
|
+
</Tab>
|
|
190
|
+
</Tabs>
|
|
191
|
+
|
|
192
|
+
## Workflow States
|
|
193
|
+
|
|
194
|
+
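Auto recovery introduces a new workflow state, `INTERRUPTED`, which a workflow enters when its worker crashes and leaves when it is recovered back to `RUNNING`: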
|
|
195
|
+
|
|
196
|
+
```
|
|
197
|
+
┌─────────────┐
|
|
198
|
+
│ PENDING │ Workflow created, waiting to start
|
|
199
|
+
└──────┬──────┘
|
|
200
|
+
│
|
|
201
|
+
▼
|
|
202
|
+
┌─────────────┐
|
|
203
|
+
│ RUNNING │ Workflow is executing
|
|
204
|
+
└──────┬──────┘
|
|
205
|
+
│
|
|
206
|
+
├────────────────┬────────────────┐
|
|
207
|
+
│ │ │
|
|
208
|
+
▼ ▼ ▼
|
|
209
|
+
┌─────────────┐ ┌─────────────┐ ┌─────────────┐
|
|
210
|
+
│ SUSPENDED │ │ INTERRUPTED │ │ FAILED │
|
|
211
|
+
└──────┬──────┘ └──────┬──────┘ └─────────────┘
|
|
212
|
+
│ │
|
|
213
|
+
│ │ (auto recovery)
|
|
214
|
+
│ ▼
|
|
215
|
+
│ ┌─────────────┐
|
|
216
|
+
│ │ RUNNING │ Recovered, resuming
|
|
217
|
+
│ └──────┬──────┘
|
|
218
|
+
│ │
|
|
219
|
+
└────────────────┤
|
|
220
|
+
│
|
|
221
|
+
▼
|
|
222
|
+
┌─────────────┐
|
|
223
|
+
│ COMPLETED │ Workflow finished
|
|
224
|
+
└─────────────┘
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
| Status | Description |
|
|
228
|
+
|--------|-------------|
|
|
229
|
+
| `INTERRUPTED` | Worker crashed; workflow is awaiting recovery |
|
|
230
|
+
| `RUNNING` | Workflow is executing (or has been recovered) |
|
|
231
|
+
|
|
232
|
+
## Monitoring Recovery
|
|
233
|
+
|
|
234
|
+
Use the CLI to monitor workflows that have been interrupted or recovered:
|
|
235
|
+
|
|
236
|
+
```bash
|
|
237
|
+
# List interrupted workflows
|
|
238
|
+
pyworkflow runs list --status interrupted
|
|
239
|
+
|
|
240
|
+
# List all runs of a specific workflow, with recovery info
|
|
241
|
+
pyworkflow runs list --workflow my_workflow
|
|
242
|
+
|
|
243
|
+
# View detailed status including recovery attempts
|
|
244
|
+
pyworkflow runs status <run_id>
|
|
245
|
+
|
|
246
|
+
# View full event log including WORKFLOW_INTERRUPTED events
|
|
247
|
+
pyworkflow runs logs <run_id>
|
|
248
|
+
```
|
|
249
|
+
|
|
250
|
+
Example output:
|
|
251
|
+
```
|
|
252
|
+
$ pyworkflow runs status abc123
|
|
253
|
+
Run ID: abc123
|
|
254
|
+
Workflow: data_pipeline
|
|
255
|
+
Status: running
|
|
256
|
+
Started: 2025-01-15 10:30:00
|
|
257
|
+
Recovery: 2/3 attempts (recover_on_worker_loss: true)
|
|
258
|
+
Last Event: workflow.resumed (2025-01-15 10:35:00)
|
|
259
|
+
```
|
|
260
|
+
|
|
261
|
+
## When to Disable Recovery
|
|
262
|
+
|
|
263
|
+
<AccordionGroup>
|
|
264
|
+
<Accordion title="Non-idempotent external calls">
|
|
265
|
+
If your workflow makes calls that can't be safely repeated (e.g., charging a credit card without idempotency keys), disable recovery or implement compensation logic.
|
|
266
|
+
|
|
267
|
+
```python
|
|
268
|
+
@workflow(recover_on_worker_loss=False)
|
|
269
|
+
async def non_idempotent_workflow():
|
|
270
|
+
# This payment call might succeed, but the worker could crash before the result is recorded
|
|
271
|
+
# Recovery would charge the customer again!
|
|
272
|
+
await charge_credit_card(amount) # Dangerous without idempotency
|
|
273
|
+
```
|
|
274
|
+
</Accordion>
|
|
275
|
+
|
|
276
|
+
<Accordion title="Critical workflows requiring human review">
|
|
277
|
+
Some workflows should fail loudly and require human intervention rather than automatic recovery.
|
|
278
|
+
|
|
279
|
+
```python
|
|
280
|
+
@workflow(
|
|
281
|
+
recover_on_worker_loss=False,
|
|
282
|
+
max_recovery_attempts=0,
|
|
283
|
+
)
|
|
284
|
+
async def critical_financial_workflow():
|
|
285
|
+
# Any failure should be reviewed by humans
|
|
286
|
+
await transfer_funds(amount)
|
|
287
|
+
```
|
|
288
|
+
</Accordion>
|
|
289
|
+
|
|
290
|
+
<Accordion title="External systems without rollback">
|
|
291
|
+
If your workflow interacts with systems that don't support rollback or compensation, partial re-execution could leave inconsistent state.
|
|
292
|
+
|
|
293
|
+
```python
|
|
294
|
+
@workflow(recover_on_worker_loss=False)
|
|
295
|
+
async def legacy_integration():
|
|
296
|
+
# Legacy system has no rollback capability
|
|
297
|
+
await update_legacy_system(data)
|
|
298
|
+
```
|
|
299
|
+
</Accordion>
|
|
300
|
+
</AccordionGroup>
|
|
301
|
+
|
|
302
|
+
## Best Practices
|
|
303
|
+
|
|
304
|
+
<AccordionGroup>
|
|
305
|
+
<Accordion title="Make steps idempotent">
|
|
306
|
+
Design steps to produce the same result when called multiple times with the same input. Use idempotency keys for external API calls.
|
|
307
|
+
|
|
308
|
+
```python
|
|
309
|
+
@step()
|
|
310
|
+
async def create_order(order_id: str):
|
|
311
|
+
# Use order_id as idempotency key
|
|
312
|
+
return await api.create_order(
|
|
313
|
+
order_id=order_id,
|
|
314
|
+
idempotency_key=f"order-{order_id}"
|
|
315
|
+
)
|
|
316
|
+
```
|
|
317
|
+
</Accordion>
|
|
318
|
+
|
|
319
|
+
<Accordion title="Set appropriate recovery limits">
|
|
320
|
+
Don't allow unlimited recovery attempts. Set `max_recovery_attempts` based on your tolerance for repeated failures.
|
|
321
|
+
|
|
322
|
+
```python
|
|
323
|
+
@workflow(
|
|
324
|
+
recover_on_worker_loss=True,
|
|
325
|
+
max_recovery_attempts=3, # Fail after 3 attempts
|
|
326
|
+
)
|
|
327
|
+
async def bounded_recovery_workflow():
|
|
328
|
+
pass
|
|
329
|
+
```
|
|
330
|
+
</Accordion>
|
|
331
|
+
|
|
332
|
+
<Accordion title="Monitor interrupted workflows">
|
|
333
|
+
Set up alerts for workflows that frequently reach `INTERRUPTED` status. Frequent interruptions may indicate infrastructure issues such as unstable workers.
|
|
334
|
+
|
|
335
|
+
```bash
|
|
336
|
+
# Example: Alert if more than 5 interrupted workflows in an hour
|
|
337
|
+
pyworkflow runs list --status interrupted --since 1h | wc -l
|
|
338
|
+
```
|
|
339
|
+
</Accordion>
|
|
340
|
+
|
|
341
|
+
<Accordion title="Use durable mode for critical workflows">
|
|
342
|
+
Critical business workflows should always use durable mode to ensure proper recovery.
|
|
343
|
+
|
|
344
|
+
```python
|
|
345
|
+
@workflow(
|
|
346
|
+
durable=True, # Event sourcing enabled
|
|
347
|
+
recover_on_worker_loss=True,
|
|
348
|
+
)
|
|
349
|
+
async def critical_business_workflow():
|
|
350
|
+
pass
|
|
351
|
+
```
|
|
352
|
+
</Accordion>
|
|
353
|
+
</AccordionGroup>
|
|
354
|
+
|
|
355
|
+
## Next Steps
|
|
356
|
+
|
|
357
|
+
<CardGroup cols={2}>
|
|
358
|
+
<Card title="Workflows" icon="diagram-project" href="/concepts/workflows">
|
|
359
|
+
Learn about workflow lifecycle and configuration.
|
|
360
|
+
</Card>
|
|
361
|
+
<Card title="Events" icon="timeline" href="/concepts/events">
|
|
362
|
+
Understand the event sourcing model that enables recovery.
|
|
363
|
+
</Card>
|
|
364
|
+
<Card title="Configuration" icon="gear" href="/guides/configuration">
|
|
365
|
+
Configure fault tolerance settings globally.
|
|
366
|
+
</Card>
|
|
367
|
+
<Card title="CLI Guide" icon="terminal" href="/guides/cli">
|
|
368
|
+
Monitor and manage workflows from the command line.
|
|
369
|
+
</Card>
|
|
370
|
+
</CardGroup>
|