pyworkflow-engine 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (145) hide show
  1. pyworkflow/__init__.py +10 -1
  2. pyworkflow/celery/tasks.py +272 -24
  3. pyworkflow/cli/__init__.py +4 -1
  4. pyworkflow/cli/commands/runs.py +4 -4
  5. pyworkflow/cli/commands/setup.py +203 -4
  6. pyworkflow/cli/utils/config_generator.py +76 -3
  7. pyworkflow/cli/utils/docker_manager.py +232 -0
  8. pyworkflow/context/__init__.py +13 -0
  9. pyworkflow/context/base.py +26 -0
  10. pyworkflow/context/local.py +80 -0
  11. pyworkflow/context/step_context.py +295 -0
  12. pyworkflow/core/registry.py +6 -1
  13. pyworkflow/core/step.py +141 -0
  14. pyworkflow/core/workflow.py +56 -0
  15. pyworkflow/engine/events.py +30 -0
  16. pyworkflow/engine/replay.py +39 -0
  17. pyworkflow/primitives/child_workflow.py +1 -1
  18. pyworkflow/runtime/local.py +1 -1
  19. pyworkflow/storage/__init__.py +14 -0
  20. pyworkflow/storage/base.py +35 -0
  21. pyworkflow/storage/cassandra.py +1747 -0
  22. pyworkflow/storage/config.py +69 -0
  23. pyworkflow/storage/dynamodb.py +31 -2
  24. pyworkflow/storage/file.py +28 -0
  25. pyworkflow/storage/memory.py +18 -0
  26. pyworkflow/storage/mysql.py +1159 -0
  27. pyworkflow/storage/postgres.py +27 -2
  28. pyworkflow/storage/schemas.py +4 -3
  29. pyworkflow/storage/sqlite.py +25 -2
  30. {pyworkflow_engine-0.1.7.dist-info → pyworkflow_engine-0.1.9.dist-info}/METADATA +7 -4
  31. pyworkflow_engine-0.1.9.dist-info/RECORD +91 -0
  32. pyworkflow_engine-0.1.9.dist-info/top_level.txt +1 -0
  33. dashboard/backend/app/__init__.py +0 -1
  34. dashboard/backend/app/config.py +0 -32
  35. dashboard/backend/app/controllers/__init__.py +0 -6
  36. dashboard/backend/app/controllers/run_controller.py +0 -86
  37. dashboard/backend/app/controllers/workflow_controller.py +0 -33
  38. dashboard/backend/app/dependencies/__init__.py +0 -5
  39. dashboard/backend/app/dependencies/storage.py +0 -50
  40. dashboard/backend/app/repositories/__init__.py +0 -6
  41. dashboard/backend/app/repositories/run_repository.py +0 -80
  42. dashboard/backend/app/repositories/workflow_repository.py +0 -27
  43. dashboard/backend/app/rest/__init__.py +0 -8
  44. dashboard/backend/app/rest/v1/__init__.py +0 -12
  45. dashboard/backend/app/rest/v1/health.py +0 -33
  46. dashboard/backend/app/rest/v1/runs.py +0 -133
  47. dashboard/backend/app/rest/v1/workflows.py +0 -41
  48. dashboard/backend/app/schemas/__init__.py +0 -23
  49. dashboard/backend/app/schemas/common.py +0 -16
  50. dashboard/backend/app/schemas/event.py +0 -24
  51. dashboard/backend/app/schemas/hook.py +0 -25
  52. dashboard/backend/app/schemas/run.py +0 -54
  53. dashboard/backend/app/schemas/step.py +0 -28
  54. dashboard/backend/app/schemas/workflow.py +0 -31
  55. dashboard/backend/app/server.py +0 -87
  56. dashboard/backend/app/services/__init__.py +0 -6
  57. dashboard/backend/app/services/run_service.py +0 -240
  58. dashboard/backend/app/services/workflow_service.py +0 -155
  59. dashboard/backend/main.py +0 -18
  60. docs/concepts/cancellation.mdx +0 -362
  61. docs/concepts/continue-as-new.mdx +0 -434
  62. docs/concepts/events.mdx +0 -266
  63. docs/concepts/fault-tolerance.mdx +0 -370
  64. docs/concepts/hooks.mdx +0 -552
  65. docs/concepts/limitations.mdx +0 -167
  66. docs/concepts/schedules.mdx +0 -775
  67. docs/concepts/sleep.mdx +0 -312
  68. docs/concepts/steps.mdx +0 -301
  69. docs/concepts/workflows.mdx +0 -255
  70. docs/guides/cli.mdx +0 -942
  71. docs/guides/configuration.mdx +0 -560
  72. docs/introduction.mdx +0 -155
  73. docs/quickstart.mdx +0 -279
  74. examples/__init__.py +0 -1
  75. examples/celery/__init__.py +0 -1
  76. examples/celery/durable/docker-compose.yml +0 -55
  77. examples/celery/durable/pyworkflow.config.yaml +0 -12
  78. examples/celery/durable/workflows/__init__.py +0 -122
  79. examples/celery/durable/workflows/basic.py +0 -87
  80. examples/celery/durable/workflows/batch_processing.py +0 -102
  81. examples/celery/durable/workflows/cancellation.py +0 -273
  82. examples/celery/durable/workflows/child_workflow_patterns.py +0 -240
  83. examples/celery/durable/workflows/child_workflows.py +0 -202
  84. examples/celery/durable/workflows/continue_as_new.py +0 -260
  85. examples/celery/durable/workflows/fault_tolerance.py +0 -210
  86. examples/celery/durable/workflows/hooks.py +0 -211
  87. examples/celery/durable/workflows/idempotency.py +0 -112
  88. examples/celery/durable/workflows/long_running.py +0 -99
  89. examples/celery/durable/workflows/retries.py +0 -101
  90. examples/celery/durable/workflows/schedules.py +0 -209
  91. examples/celery/transient/01_basic_workflow.py +0 -91
  92. examples/celery/transient/02_fault_tolerance.py +0 -257
  93. examples/celery/transient/__init__.py +0 -20
  94. examples/celery/transient/pyworkflow.config.yaml +0 -25
  95. examples/local/__init__.py +0 -1
  96. examples/local/durable/01_basic_workflow.py +0 -94
  97. examples/local/durable/02_file_storage.py +0 -132
  98. examples/local/durable/03_retries.py +0 -169
  99. examples/local/durable/04_long_running.py +0 -119
  100. examples/local/durable/05_event_log.py +0 -145
  101. examples/local/durable/06_idempotency.py +0 -148
  102. examples/local/durable/07_hooks.py +0 -334
  103. examples/local/durable/08_cancellation.py +0 -233
  104. examples/local/durable/09_child_workflows.py +0 -198
  105. examples/local/durable/10_child_workflow_patterns.py +0 -265
  106. examples/local/durable/11_continue_as_new.py +0 -249
  107. examples/local/durable/12_schedules.py +0 -198
  108. examples/local/durable/__init__.py +0 -1
  109. examples/local/transient/01_quick_tasks.py +0 -87
  110. examples/local/transient/02_retries.py +0 -130
  111. examples/local/transient/03_sleep.py +0 -141
  112. examples/local/transient/__init__.py +0 -1
  113. pyworkflow_engine-0.1.7.dist-info/RECORD +0 -196
  114. pyworkflow_engine-0.1.7.dist-info/top_level.txt +0 -5
  115. tests/examples/__init__.py +0 -0
  116. tests/integration/__init__.py +0 -0
  117. tests/integration/test_cancellation.py +0 -330
  118. tests/integration/test_child_workflows.py +0 -439
  119. tests/integration/test_continue_as_new.py +0 -428
  120. tests/integration/test_dynamodb_storage.py +0 -1146
  121. tests/integration/test_fault_tolerance.py +0 -369
  122. tests/integration/test_schedule_storage.py +0 -484
  123. tests/unit/__init__.py +0 -0
  124. tests/unit/backends/__init__.py +0 -1
  125. tests/unit/backends/test_dynamodb_storage.py +0 -1554
  126. tests/unit/backends/test_postgres_storage.py +0 -1281
  127. tests/unit/backends/test_sqlite_storage.py +0 -1460
  128. tests/unit/conftest.py +0 -41
  129. tests/unit/test_cancellation.py +0 -364
  130. tests/unit/test_child_workflows.py +0 -680
  131. tests/unit/test_continue_as_new.py +0 -441
  132. tests/unit/test_event_limits.py +0 -316
  133. tests/unit/test_executor.py +0 -320
  134. tests/unit/test_fault_tolerance.py +0 -334
  135. tests/unit/test_hooks.py +0 -495
  136. tests/unit/test_registry.py +0 -261
  137. tests/unit/test_replay.py +0 -420
  138. tests/unit/test_schedule_schemas.py +0 -285
  139. tests/unit/test_schedule_utils.py +0 -286
  140. tests/unit/test_scheduled_workflow.py +0 -274
  141. tests/unit/test_step.py +0 -353
  142. tests/unit/test_workflow.py +0 -243
  143. {pyworkflow_engine-0.1.7.dist-info → pyworkflow_engine-0.1.9.dist-info}/WHEEL +0 -0
  144. {pyworkflow_engine-0.1.7.dist-info → pyworkflow_engine-0.1.9.dist-info}/entry_points.txt +0 -0
  145. {pyworkflow_engine-0.1.7.dist-info → pyworkflow_engine-0.1.9.dist-info}/licenses/LICENSE +0 -0
@@ -1,370 +0,0 @@
1
- ---
2
- title: 'Fault Tolerance & Auto Recovery'
3
- description: 'How PyWorkflow automatically recovers workflows from worker crashes'
4
- ---
5
-
6
- ## What is Fault Tolerance?
7
-
8
- In distributed systems, workers can fail unexpectedly due to crashes, OOM kills, network issues, or deployments. PyWorkflow's fault tolerance ensures your workflows survive these failures and automatically resume from where they left off.
9
-
10
- <CardGroup cols={2}>
11
- <Card title="Automatic Detection" icon="radar">
12
- Worker crashes are detected automatically when Celery requeues tasks.
13
- </Card>
14
- <Card title="Event Replay" icon="rotate">
15
- Completed steps are restored from the event log without re-execution.
16
- </Card>
17
- <Card title="Checkpoint Resume" icon="play">
18
- Workflows continue from the last successful checkpoint, not from the beginning.
19
- </Card>
20
- <Card title="Configurable Limits" icon="sliders">
21
- Control recovery attempts and behavior per workflow or globally.
22
- </Card>
23
- </CardGroup>
24
-
25
- ## How Auto Recovery Works
26
-
27
- When a worker crashes mid-workflow, PyWorkflow automatically recovers:
28
-
29
- ```
30
- ┌─────────────────────────────────────────────────────────────────────┐
31
- │ Worker A crashes while executing workflow │
32
- │ │
33
- │ 1. Celery detects WorkerLostError │
34
- │ 2. Task is requeued to the broker │
35
- │ 3. Worker B picks up the task │
36
- │ 4. Detects workflow is in RUNNING/INTERRUPTED status │
37
- │ 5. Loads event log from storage │
38
- │ 6. Replays events to restore state: │
39
- │ - Step results are cached (skip re-execution) │
40
- │ - Pending sleeps are marked complete │
41
- │ 7. Workflow continues from last checkpoint │
42
- │ │
43
- │ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ │
44
- │ │ Step 1 │───▶│ Step 2 │───▶│ Crash! │ │ Step 3 │ │
45
- │ │(complete)│ │(complete)│ │ │ │(pending) │ │
46
- │ └──────────┘ └──────────┘ └────┬─────┘ └──────────┘ │
47
- │ │ │
48
- │ │ Recovery │
49
- │ ▼ │
50
- │ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ │
51
- │ │ Step 1 │───▶│ Step 2 │───▶│ Resume │───▶│ Step 3 │ │
52
- │ │ (replay) │ │ (replay) │ │ │ │(execute) │ │
53
- │ └──────────┘ └──────────┘ └──────────┘ └──────────┘ │
54
- └─────────────────────────────────────────────────────────────────────┘
55
- ```
56
-
57
- The key insight is that **event sourcing makes recovery possible**. All completed steps are recorded as events, so on recovery, PyWorkflow simply replays those events to restore the workflow's state without re-executing any work.
58
-
59
- ## Configuration
60
-
61
- <Tabs>
62
- <Tab title="Decorator">
63
- Configure recovery per-workflow using the `@workflow` decorator:
64
-
65
- ```python
66
- from pyworkflow import workflow, step
67
-
68
- @workflow(
69
- recover_on_worker_loss=True, # Enable auto recovery
70
- max_recovery_attempts=5, # Allow up to 5 recovery attempts
71
- )
72
- async def resilient_workflow(order_id: str):
73
- # Steps that complete before a crash won't re-execute
74
- order = await validate_order(order_id)
75
- payment = await process_payment(order)
76
-
77
- # If the worker crashes here, the workflow resumes from this point
78
- await create_shipment(order)
79
- return {"status": "completed"}
80
- ```
81
- </Tab>
82
- <Tab title="Config File">
83
- Set defaults in `pyworkflow.config.yaml`:
84
-
85
- ```yaml
86
- # pyworkflow.config.yaml
87
-
88
- module: myapp.workflows
89
- runtime: celery
90
-
91
- # Fault tolerance defaults
92
- recovery:
93
- recover_on_worker_loss: true
94
- max_recovery_attempts: 3
95
-
96
- storage:
97
- backend: file
98
- path: ./workflow_data
99
-
100
- celery:
101
- broker: redis://localhost:6379/0
102
- ```
103
- </Tab>
104
- <Tab title="Programmatic">
105
- Configure globally in your application:
106
-
107
- ```python
108
- import pyworkflow
109
-
110
- pyworkflow.configure(
111
- default_recover_on_worker_loss=True,
112
- default_max_recovery_attempts=3,
113
- )
114
- ```
115
- </Tab>
116
- </Tabs>
117
-
118
- ### Configuration Options
119
-
120
- | Option | Type | Default | Description |
121
- |--------|------|---------|-------------|
122
- | `recover_on_worker_loss` | `bool` | `True` (durable) / `False` (transient) | Enable automatic recovery on worker crash |
123
- | `max_recovery_attempts` | `int` | `3` | Maximum number of recovery attempts before marking as failed |
124
-
125
- ### Configuration Priority
126
-
127
- When resolving recovery settings, PyWorkflow uses this priority order:
128
-
129
- | Priority | Source | Example |
130
- |----------|--------|---------|
131
- | 1 (highest) | `@workflow()` decorator | `@workflow(recover_on_worker_loss=True)` |
132
- | 2 | `pyworkflow.configure()` | `configure(default_recover_on_worker_loss=True)` |
133
- | 3 | Config file | `recovery.recover_on_worker_loss: true` |
134
- | 4 (lowest) | Built-in defaults | `True` for durable, `False` for transient |
135
-
136
- ## Durable vs Transient Workflows
137
-
138
- Recovery behavior differs based on workflow durability:
139
-
140
- <Tabs>
141
- <Tab title="Durable Workflows">
142
- **Durable workflows resume from the last checkpoint.**
143
-
144
- ```python
145
- @workflow(durable=True, recover_on_worker_loss=True)
146
- async def durable_pipeline(data_id: str):
147
- # These steps are recorded as events
148
- data = await fetch_data(data_id) # Event: step_completed
149
- data = await validate_data(data) # Event: step_completed
150
-
151
- await sleep("10m") # Event: sleep_started
152
-
153
- # If a crash happens here, on recovery:
154
- # - fetch_data and validate_data results restored from events
155
- # - sleep marked as complete
156
- # - Execution continues from transform_data
157
-
158
- data = await transform_data(data) # Executes after recovery
159
- return data
160
- ```
161
-
162
- **Recovery process:**
163
- 1. Load event log from storage
164
- 2. Replay `step_completed` events (restore cached results)
165
- 3. Complete pending `sleep_started` events
166
- 4. Continue execution from the next step
167
- </Tab>
168
- <Tab title="Transient Workflows">
169
- **Transient workflows restart from the beginning.**
170
-
171
- ```python
172
- @workflow(durable=False, recover_on_worker_loss=True)
173
- async def transient_batch(batch_id: str):
174
- # No events are recorded
175
- items = await fetch_items(batch_id)
176
- items = await process_items(items)
177
-
178
- # If a crash happens here, on recovery:
179
- # - No event log to replay
180
- # - Workflow starts over from fetch_items
181
-
182
- items = await finalize_items(items)
183
- return items
184
- ```
185
-
186
- <Warning>
187
- Transient workflows with recovery enabled will **restart from scratch** on each recovery attempt. Ensure your steps are idempotent if using this pattern.
188
- </Warning>
189
- </Tab>
190
- </Tabs>
191
-
192
- ## Workflow States
193
-
194
- Auto recovery introduces a new workflow state, `INTERRUPTED`:
195
-
196
- ```
197
- ┌─────────────┐
198
- │ PENDING │ Workflow created, waiting to start
199
- └──────┬──────┘
200
-
201
-
202
- ┌─────────────┐
203
- │ RUNNING │ Workflow is executing
204
- └──────┬──────┘
205
-
206
- ├────────────────┬────────────────┐
207
- │ │ │
208
- ▼ ▼ ▼
209
- ┌─────────────┐ ┌─────────────┐ ┌─────────────┐
210
- │ SUSPENDED │ │ INTERRUPTED │ │ FAILED │
211
- └──────┬──────┘ └──────┬──────┘ └─────────────┘
212
- │ │
213
- │ │ (auto recovery)
214
- │ ▼
215
- │ ┌─────────────┐
216
- │ │ RUNNING │ Recovered, resuming
217
- │ └──────┬──────┘
218
- │ │
219
- └────────────────┤
220
-
221
-
222
- ┌─────────────┐
223
- │ COMPLETED │ Workflow finished
224
- └─────────────┘
225
- ```
226
-
227
- | Status | Description |
228
- |--------|-------------|
229
- | `INTERRUPTED` | Worker crashed; workflow is awaiting recovery |
230
- | `RUNNING` | Workflow is executing (or has been recovered) |
231
-
232
- ## Monitoring Recovery
233
-
234
- Use the CLI to monitor workflows that have been interrupted or recovered:
235
-
236
- ```bash
237
- # List interrupted workflows
238
- pyworkflow runs list --status interrupted
239
-
240
- # List all runs of a specific workflow, with recovery info
241
- pyworkflow runs list --workflow my_workflow
242
-
243
- # View detailed status including recovery attempts
244
- pyworkflow runs status <run_id>
245
-
246
- # View full event log including WORKFLOW_INTERRUPTED events
247
- pyworkflow runs logs <run_id>
248
- ```
249
-
250
- Example output:
251
- ```
252
- $ pyworkflow runs status abc123
253
- Run ID: abc123
254
- Workflow: data_pipeline
255
- Status: running
256
- Started: 2025-01-15 10:30:00
257
- Recovery: 2/3 attempts (recover_on_worker_loss: true)
258
- Last Event: workflow.resumed (2025-01-15 10:35:00)
259
- ```
260
-
261
- ## When to Disable Recovery
262
-
263
- <AccordionGroup>
264
- <Accordion title="Non-idempotent external calls">
265
- If your workflow makes calls that can't be safely repeated (e.g., charging a credit card without idempotency keys), disable recovery or implement compensation logic.
266
-
267
- ```python
268
- @workflow(recover_on_worker_loss=False)
269
- async def non_idempotent_workflow():
270
- # This payment call might succeed, but the worker could crash before the result is recorded.
271
- # Recovery would charge the customer again!
272
- await charge_credit_card(amount) # Dangerous without idempotency
273
- ```
274
- </Accordion>
275
-
276
- <Accordion title="Critical workflows requiring human review">
277
- Some workflows should fail loudly and require human intervention rather than automatic recovery.
278
-
279
- ```python
280
- @workflow(
281
- recover_on_worker_loss=False,
282
- max_recovery_attempts=0,
283
- )
284
- async def critical_financial_workflow():
285
- # Any failure should be reviewed by humans
286
- await transfer_funds(amount)
287
- ```
288
- </Accordion>
289
-
290
- <Accordion title="External systems without rollback">
291
- If your workflow interacts with systems that don't support rollback or compensation, partial re-execution could leave inconsistent state.
292
-
293
- ```python
294
- @workflow(recover_on_worker_loss=False)
295
- async def legacy_integration():
296
- # Legacy system has no rollback capability
297
- await update_legacy_system(data)
298
- ```
299
- </Accordion>
300
- </AccordionGroup>
301
-
302
- ## Best Practices
303
-
304
- <AccordionGroup>
305
- <Accordion title="Make steps idempotent">
306
- Design steps to produce the same result when called multiple times with the same input. Use idempotency keys for external API calls.
307
-
308
- ```python
309
- @step()
310
- async def create_order(order_id: str):
311
- # Use order_id as idempotency key
312
- return await api.create_order(
313
- order_id=order_id,
314
- idempotency_key=f"order-{order_id}"
315
- )
316
- ```
317
- </Accordion>
318
-
319
- <Accordion title="Set appropriate recovery limits">
320
- Don't allow unlimited recovery attempts. Set `max_recovery_attempts` based on your tolerance for repeated failures.
321
-
322
- ```python
323
- @workflow(
324
- recover_on_worker_loss=True,
325
- max_recovery_attempts=3, # Fail after 3 attempts
326
- )
327
- async def bounded_recovery_workflow():
328
- pass
329
- ```
330
- </Accordion>
331
-
332
- <Accordion title="Monitor interrupted workflows">
333
- Set up alerts for workflows that reach `INTERRUPTED` status frequently. This may indicate infrastructure issues.
334
-
335
- ```bash
336
- # Example: count workflows interrupted in the last hour (alert if the count exceeds 5)
337
- pyworkflow runs list --status interrupted --since 1h | wc -l
338
- ```
339
- </Accordion>
340
-
341
- <Accordion title="Use durable mode for critical workflows">
342
- Critical business workflows should always use durable mode to ensure proper recovery.
343
-
344
- ```python
345
- @workflow(
346
- durable=True, # Event sourcing enabled
347
- recover_on_worker_loss=True,
348
- )
349
- async def critical_business_workflow():
350
- pass
351
- ```
352
- </Accordion>
353
- </AccordionGroup>
354
-
355
- ## Next Steps
356
-
357
- <CardGroup cols={2}>
358
- <Card title="Workflows" icon="diagram-project" href="/concepts/workflows">
359
- Learn about workflow lifecycle and configuration.
360
- </Card>
361
- <Card title="Events" icon="timeline" href="/concepts/events">
362
- Understand the event sourcing model that enables recovery.
363
- </Card>
364
- <Card title="Configuration" icon="gear" href="/guides/configuration">
365
- Configure fault tolerance settings globally.
366
- </Card>
367
- <Card title="CLI Guide" icon="terminal" href="/guides/cli">
368
- Monitor and manage workflows from the command line.
369
- </Card>
370
- </CardGroup>