pyworkflow-engine 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (196) hide show
  1. dashboard/backend/app/__init__.py +1 -0
  2. dashboard/backend/app/config.py +32 -0
  3. dashboard/backend/app/controllers/__init__.py +6 -0
  4. dashboard/backend/app/controllers/run_controller.py +86 -0
  5. dashboard/backend/app/controllers/workflow_controller.py +33 -0
  6. dashboard/backend/app/dependencies/__init__.py +5 -0
  7. dashboard/backend/app/dependencies/storage.py +50 -0
  8. dashboard/backend/app/repositories/__init__.py +6 -0
  9. dashboard/backend/app/repositories/run_repository.py +80 -0
  10. dashboard/backend/app/repositories/workflow_repository.py +27 -0
  11. dashboard/backend/app/rest/__init__.py +8 -0
  12. dashboard/backend/app/rest/v1/__init__.py +12 -0
  13. dashboard/backend/app/rest/v1/health.py +33 -0
  14. dashboard/backend/app/rest/v1/runs.py +133 -0
  15. dashboard/backend/app/rest/v1/workflows.py +41 -0
  16. dashboard/backend/app/schemas/__init__.py +23 -0
  17. dashboard/backend/app/schemas/common.py +16 -0
  18. dashboard/backend/app/schemas/event.py +24 -0
  19. dashboard/backend/app/schemas/hook.py +25 -0
  20. dashboard/backend/app/schemas/run.py +54 -0
  21. dashboard/backend/app/schemas/step.py +28 -0
  22. dashboard/backend/app/schemas/workflow.py +31 -0
  23. dashboard/backend/app/server.py +87 -0
  24. dashboard/backend/app/services/__init__.py +6 -0
  25. dashboard/backend/app/services/run_service.py +240 -0
  26. dashboard/backend/app/services/workflow_service.py +155 -0
  27. dashboard/backend/main.py +18 -0
  28. docs/concepts/cancellation.mdx +362 -0
  29. docs/concepts/continue-as-new.mdx +434 -0
  30. docs/concepts/events.mdx +266 -0
  31. docs/concepts/fault-tolerance.mdx +370 -0
  32. docs/concepts/hooks.mdx +552 -0
  33. docs/concepts/limitations.mdx +167 -0
  34. docs/concepts/schedules.mdx +775 -0
  35. docs/concepts/sleep.mdx +312 -0
  36. docs/concepts/steps.mdx +301 -0
  37. docs/concepts/workflows.mdx +255 -0
  38. docs/guides/cli.mdx +942 -0
  39. docs/guides/configuration.mdx +560 -0
  40. docs/introduction.mdx +155 -0
  41. docs/quickstart.mdx +279 -0
  42. examples/__init__.py +1 -0
  43. examples/celery/__init__.py +1 -0
  44. examples/celery/durable/docker-compose.yml +55 -0
  45. examples/celery/durable/pyworkflow.config.yaml +12 -0
  46. examples/celery/durable/workflows/__init__.py +122 -0
  47. examples/celery/durable/workflows/basic.py +87 -0
  48. examples/celery/durable/workflows/batch_processing.py +102 -0
  49. examples/celery/durable/workflows/cancellation.py +273 -0
  50. examples/celery/durable/workflows/child_workflow_patterns.py +240 -0
  51. examples/celery/durable/workflows/child_workflows.py +202 -0
  52. examples/celery/durable/workflows/continue_as_new.py +260 -0
  53. examples/celery/durable/workflows/fault_tolerance.py +210 -0
  54. examples/celery/durable/workflows/hooks.py +211 -0
  55. examples/celery/durable/workflows/idempotency.py +112 -0
  56. examples/celery/durable/workflows/long_running.py +99 -0
  57. examples/celery/durable/workflows/retries.py +101 -0
  58. examples/celery/durable/workflows/schedules.py +209 -0
  59. examples/celery/transient/01_basic_workflow.py +91 -0
  60. examples/celery/transient/02_fault_tolerance.py +257 -0
  61. examples/celery/transient/__init__.py +20 -0
  62. examples/celery/transient/pyworkflow.config.yaml +25 -0
  63. examples/local/__init__.py +1 -0
  64. examples/local/durable/01_basic_workflow.py +94 -0
  65. examples/local/durable/02_file_storage.py +132 -0
  66. examples/local/durable/03_retries.py +169 -0
  67. examples/local/durable/04_long_running.py +119 -0
  68. examples/local/durable/05_event_log.py +145 -0
  69. examples/local/durable/06_idempotency.py +148 -0
  70. examples/local/durable/07_hooks.py +334 -0
  71. examples/local/durable/08_cancellation.py +233 -0
  72. examples/local/durable/09_child_workflows.py +198 -0
  73. examples/local/durable/10_child_workflow_patterns.py +265 -0
  74. examples/local/durable/11_continue_as_new.py +249 -0
  75. examples/local/durable/12_schedules.py +198 -0
  76. examples/local/durable/__init__.py +1 -0
  77. examples/local/transient/01_quick_tasks.py +87 -0
  78. examples/local/transient/02_retries.py +130 -0
  79. examples/local/transient/03_sleep.py +141 -0
  80. examples/local/transient/__init__.py +1 -0
  81. pyworkflow/__init__.py +256 -0
  82. pyworkflow/aws/__init__.py +68 -0
  83. pyworkflow/aws/context.py +234 -0
  84. pyworkflow/aws/handler.py +184 -0
  85. pyworkflow/aws/testing.py +310 -0
  86. pyworkflow/celery/__init__.py +41 -0
  87. pyworkflow/celery/app.py +198 -0
  88. pyworkflow/celery/scheduler.py +315 -0
  89. pyworkflow/celery/tasks.py +1746 -0
  90. pyworkflow/cli/__init__.py +132 -0
  91. pyworkflow/cli/__main__.py +6 -0
  92. pyworkflow/cli/commands/__init__.py +1 -0
  93. pyworkflow/cli/commands/hooks.py +640 -0
  94. pyworkflow/cli/commands/quickstart.py +495 -0
  95. pyworkflow/cli/commands/runs.py +773 -0
  96. pyworkflow/cli/commands/scheduler.py +130 -0
  97. pyworkflow/cli/commands/schedules.py +794 -0
  98. pyworkflow/cli/commands/setup.py +703 -0
  99. pyworkflow/cli/commands/worker.py +413 -0
  100. pyworkflow/cli/commands/workflows.py +1257 -0
  101. pyworkflow/cli/output/__init__.py +1 -0
  102. pyworkflow/cli/output/formatters.py +321 -0
  103. pyworkflow/cli/output/styles.py +121 -0
  104. pyworkflow/cli/utils/__init__.py +1 -0
  105. pyworkflow/cli/utils/async_helpers.py +30 -0
  106. pyworkflow/cli/utils/config.py +130 -0
  107. pyworkflow/cli/utils/config_generator.py +344 -0
  108. pyworkflow/cli/utils/discovery.py +53 -0
  109. pyworkflow/cli/utils/docker_manager.py +651 -0
  110. pyworkflow/cli/utils/interactive.py +364 -0
  111. pyworkflow/cli/utils/storage.py +115 -0
  112. pyworkflow/config.py +329 -0
  113. pyworkflow/context/__init__.py +63 -0
  114. pyworkflow/context/aws.py +230 -0
  115. pyworkflow/context/base.py +416 -0
  116. pyworkflow/context/local.py +930 -0
  117. pyworkflow/context/mock.py +381 -0
  118. pyworkflow/core/__init__.py +0 -0
  119. pyworkflow/core/exceptions.py +353 -0
  120. pyworkflow/core/registry.py +313 -0
  121. pyworkflow/core/scheduled.py +328 -0
  122. pyworkflow/core/step.py +494 -0
  123. pyworkflow/core/workflow.py +294 -0
  124. pyworkflow/discovery.py +248 -0
  125. pyworkflow/engine/__init__.py +0 -0
  126. pyworkflow/engine/events.py +879 -0
  127. pyworkflow/engine/executor.py +682 -0
  128. pyworkflow/engine/replay.py +273 -0
  129. pyworkflow/observability/__init__.py +19 -0
  130. pyworkflow/observability/logging.py +234 -0
  131. pyworkflow/primitives/__init__.py +33 -0
  132. pyworkflow/primitives/child_handle.py +174 -0
  133. pyworkflow/primitives/child_workflow.py +372 -0
  134. pyworkflow/primitives/continue_as_new.py +101 -0
  135. pyworkflow/primitives/define_hook.py +150 -0
  136. pyworkflow/primitives/hooks.py +97 -0
  137. pyworkflow/primitives/resume_hook.py +210 -0
  138. pyworkflow/primitives/schedule.py +545 -0
  139. pyworkflow/primitives/shield.py +96 -0
  140. pyworkflow/primitives/sleep.py +100 -0
  141. pyworkflow/runtime/__init__.py +21 -0
  142. pyworkflow/runtime/base.py +179 -0
  143. pyworkflow/runtime/celery.py +310 -0
  144. pyworkflow/runtime/factory.py +101 -0
  145. pyworkflow/runtime/local.py +706 -0
  146. pyworkflow/scheduler/__init__.py +9 -0
  147. pyworkflow/scheduler/local.py +248 -0
  148. pyworkflow/serialization/__init__.py +0 -0
  149. pyworkflow/serialization/decoder.py +146 -0
  150. pyworkflow/serialization/encoder.py +162 -0
  151. pyworkflow/storage/__init__.py +54 -0
  152. pyworkflow/storage/base.py +612 -0
  153. pyworkflow/storage/config.py +185 -0
  154. pyworkflow/storage/dynamodb.py +1315 -0
  155. pyworkflow/storage/file.py +827 -0
  156. pyworkflow/storage/memory.py +549 -0
  157. pyworkflow/storage/postgres.py +1161 -0
  158. pyworkflow/storage/schemas.py +486 -0
  159. pyworkflow/storage/sqlite.py +1136 -0
  160. pyworkflow/utils/__init__.py +0 -0
  161. pyworkflow/utils/duration.py +177 -0
  162. pyworkflow/utils/schedule.py +391 -0
  163. pyworkflow_engine-0.1.7.dist-info/METADATA +687 -0
  164. pyworkflow_engine-0.1.7.dist-info/RECORD +196 -0
  165. pyworkflow_engine-0.1.7.dist-info/WHEEL +5 -0
  166. pyworkflow_engine-0.1.7.dist-info/entry_points.txt +2 -0
  167. pyworkflow_engine-0.1.7.dist-info/licenses/LICENSE +21 -0
  168. pyworkflow_engine-0.1.7.dist-info/top_level.txt +5 -0
  169. tests/examples/__init__.py +0 -0
  170. tests/integration/__init__.py +0 -0
  171. tests/integration/test_cancellation.py +330 -0
  172. tests/integration/test_child_workflows.py +439 -0
  173. tests/integration/test_continue_as_new.py +428 -0
  174. tests/integration/test_dynamodb_storage.py +1146 -0
  175. tests/integration/test_fault_tolerance.py +369 -0
  176. tests/integration/test_schedule_storage.py +484 -0
  177. tests/unit/__init__.py +0 -0
  178. tests/unit/backends/__init__.py +1 -0
  179. tests/unit/backends/test_dynamodb_storage.py +1554 -0
  180. tests/unit/backends/test_postgres_storage.py +1281 -0
  181. tests/unit/backends/test_sqlite_storage.py +1460 -0
  182. tests/unit/conftest.py +41 -0
  183. tests/unit/test_cancellation.py +364 -0
  184. tests/unit/test_child_workflows.py +680 -0
  185. tests/unit/test_continue_as_new.py +441 -0
  186. tests/unit/test_event_limits.py +316 -0
  187. tests/unit/test_executor.py +320 -0
  188. tests/unit/test_fault_tolerance.py +334 -0
  189. tests/unit/test_hooks.py +495 -0
  190. tests/unit/test_registry.py +261 -0
  191. tests/unit/test_replay.py +420 -0
  192. tests/unit/test_schedule_schemas.py +285 -0
  193. tests/unit/test_schedule_utils.py +286 -0
  194. tests/unit/test_scheduled_workflow.py +274 -0
  195. tests/unit/test_step.py +353 -0
  196. tests/unit/test_workflow.py +243 -0
@@ -0,0 +1,494 @@
1
+ """
2
+ @step decorator for defining workflow steps.
3
+
4
+ Steps are isolated, retryable units of work that:
5
+ - Execute actual business logic
6
+ - Have automatic retry on failure
7
+ - Cache results for replay
8
+ - Run independently (can be distributed)
9
+
10
+ Supports multiple runtimes:
11
+ - Local: In-process execution with optional event sourcing
12
+ - Celery: Distributed execution via Celery workers
13
+ - AWS: AWS Durable Lambda Functions with automatic checkpointing
14
+ """
15
+
16
+ import functools
17
+ import hashlib
18
+ from collections.abc import Callable
19
+ from typing import Any
20
+
21
+ from loguru import logger
22
+
23
+ from pyworkflow.context import get_context, has_context
24
+ from pyworkflow.core.exceptions import FatalError, RetryableError
25
+ from pyworkflow.core.registry import register_step
26
+ from pyworkflow.engine.events import (
27
+ create_step_completed_event,
28
+ create_step_failed_event,
29
+ create_step_started_event,
30
+ )
31
+ from pyworkflow.serialization.encoder import serialize, serialize_args, serialize_kwargs
32
+
33
+
34
+ def _get_aws_context() -> Any | None:
35
+ """
36
+ Get the current AWS workflow context if running in AWS environment.
37
+
38
+ Returns None if not in AWS context or AWS module not available.
39
+ """
40
+ try:
41
+ from pyworkflow.aws.context import get_aws_context
42
+
43
+ return get_aws_context()
44
+ except ImportError:
45
+ # AWS module not installed
46
+ return None
47
+
48
+
49
def step(
    name: str | None = None,
    max_retries: int = 3,
    retry_delay: str | int | list[int] = "exponential",
    timeout: int | None = None,
    metadata: dict[str, Any] | None = None,
) -> Callable:
    """
    Decorator to mark functions as workflow steps.

    Steps are isolated units of work with automatic retry and result caching.
    They can be called both within workflows and independently.

    May be used either bare (``@step``) or with arguments (``@step(...)``).

    Args:
        name: Optional step name (defaults to function name)
        max_retries: Maximum number of retry attempts (default: 3)
        retry_delay: Retry delay strategy:
            - "exponential": Exponential backoff (1s, 2s, 4s, 8s, ...)
            - int: Fixed delay in seconds
            - List[int]: Custom delays for each retry
        timeout: Optional timeout in seconds
        metadata: Optional metadata dictionary

    Returns:
        Decorated step function

    Examples:
        @step
        async def simple_step(x: int):
            return x * 2

        @step(max_retries=5, retry_delay=10)
        async def api_call(url: str):
            response = await httpx.get(url)
            return response.json()

        @step(retry_delay=[5, 30, 300])
        async def custom_retry_step():
            # Retries: after 5s, then 30s, then 300s
            pass
    """

    def decorator(func: Callable) -> Callable:
        # ``name`` may be the decorated function itself when the bare
        # ``@step`` form is used, so only trust it when it is a string.
        step_name = name if isinstance(name, str) else func.__name__

        @functools.wraps(func)
        async def wrapper(*args: Any, **kwargs: Any) -> Any:
            # 1) AWS Durable Lambda: the AWS SDK owns checkpointing, so the
            #    call is delegated wholesale to the AWS context.
            aws_ctx = _get_aws_context()
            if aws_ctx is not None:
                logger.debug(f"Step {step_name} running in AWS context, delegating to AWS SDK")
                # Delegate to AWS context for checkpointed execution
                return aws_ctx.execute_step(func, *args, step_name=step_name, **kwargs)

            # 2) No workflow context: plain function call, no retries/events.
            if not has_context():
                logger.debug(f"Step {step_name} called outside workflow, executing directly")
                return await func(*args, **kwargs)

            ctx = get_context()

            # Check for cancellation before executing step
            ctx.check_cancellation()

            # 3) Transient mode: retry in-process without event sourcing.
            if not ctx.is_durable:
                logger.debug(
                    f"Step {step_name} in transient mode, executing directly",
                    run_id=ctx.run_id,
                )
                return await _execute_with_retries(
                    func, args, kwargs, step_name, max_retries, retry_delay
                )

            # 4) Durable mode: event-sourced execution with replay support.
            # The step ID is deterministic in (name, args, kwargs) so replays
            # map onto the same recorded events.
            step_id = _generate_step_id(step_name, args, kwargs)

            # Replay: step already completed — return the cached result
            # instead of re-executing.
            if not ctx.should_execute_step(step_id):
                logger.debug(
                    f"Step {step_name} already completed, using cached result",
                    run_id=ctx.run_id,
                    step_id=step_id,
                )
                return ctx.get_step_result(step_id)

            # Are we resuming from a previously scheduled retry?
            retry_state = ctx.get_retry_state(step_id)
            if retry_state:
                current_attempt = retry_state["current_attempt"]
                resume_at = retry_state.get("resume_at")

                # During replay, re-suspend if the retry delay has not
                # elapsed yet.
                if ctx.is_replaying and resume_at:
                    from datetime import UTC, datetime

                    now = datetime.now(UTC)
                    if now < resume_at:
                        # Not ready to retry yet - re-raise suspension
                        logger.debug(
                            f"Retry delay not elapsed for {step_name}, re-suspending",
                            run_id=ctx.run_id,
                            step_id=step_id,
                            current_attempt=current_attempt,
                            resume_at=resume_at.isoformat(),
                        )
                        from pyworkflow.core.exceptions import SuspensionSignal

                        raise SuspensionSignal(
                            reason=f"retry:{step_id}",
                            resume_at=resume_at,
                            step_id=step_id,
                            attempt=current_attempt,
                        )
            else:
                current_attempt = 1

            # Validate event limits before executing step
            await ctx.validate_event_limits()

            # Record step start event
            start_event = create_step_started_event(
                run_id=ctx.run_id,
                step_id=step_id,
                step_name=step_name,
                args=serialize_args(*args),
                kwargs=serialize_kwargs(**kwargs),
                attempt=current_attempt,
            )
            await ctx.storage.record_event(start_event)  # type: ignore[union-attr]

            logger.info(
                f"Executing step: {step_name} (attempt {current_attempt}/{max_retries + 1})",
                run_id=ctx.run_id,
                step_id=step_id,
                step_name=step_name,
                attempt=current_attempt,
            )

            # Check for cancellation again just before executing the body.
            ctx.check_cancellation()

            try:
                # Execute step function
                result = await func(*args, **kwargs)

                # Record completion event
                completion_event = create_step_completed_event(
                    run_id=ctx.run_id,
                    step_id=step_id,
                    result=serialize(result),
                    step_name=step_name,
                )
                await ctx.storage.record_event(completion_event)  # type: ignore[union-attr]

                # Cache result for replay
                ctx.cache_step_result(step_id, result)

                # Clear retry state on success
                ctx.clear_retry_state(step_id)

                logger.info(
                    f"Step completed: {step_name}",
                    run_id=ctx.run_id,
                    step_id=step_id,
                )

                return result

            except FatalError as e:
                # Fatal error - don't retry
                logger.error(
                    f"Step failed (fatal): {step_name}",
                    run_id=ctx.run_id,
                    step_id=step_id,
                    error=str(e),
                )

                # Record failure event
                failure_event = create_step_failed_event(
                    run_id=ctx.run_id,
                    step_id=step_id,
                    error=str(e),
                    error_type=type(e).__name__,
                    is_retryable=False,
                    attempt=current_attempt,
                )
                await ctx.storage.record_event(failure_event)  # type: ignore[union-attr]

                # Clear retry state
                ctx.clear_retry_state(step_id)

                raise

            except Exception as e:
                # Every non-fatal error is treated as retriable. (FatalError
                # is handled above; listing RetryableError separately in the
                # except tuple would be redundant since Exception covers it.)
                is_retryable_error = isinstance(e, RetryableError)

                # Check if we have retries left
                if current_attempt <= max_retries:
                    # We can retry
                    next_attempt = current_attempt + 1

                    # Calculate retry delay
                    delay_seconds: float
                    if isinstance(e, RetryableError) and e.retry_after is not None:
                        # Use RetryableError's specified delay
                        delay_seconds = float(e.get_retry_delay_seconds() or 0)
                    else:
                        # Use step's configured retry delay strategy
                        delay_seconds = _get_retry_delay(retry_delay, current_attempt - 1)

                    # Calculate resume time
                    from datetime import UTC, datetime, timedelta

                    resume_at = datetime.now(UTC) + timedelta(seconds=delay_seconds)

                    logger.warning(
                        f"Step failed (retriable): {step_name}, "
                        f"retrying in {delay_seconds}s (attempt {next_attempt}/{max_retries + 1})",
                        run_id=ctx.run_id,
                        step_id=step_id,
                        error=str(e),
                        current_attempt=current_attempt,
                        next_attempt=next_attempt,
                    )

                    # Record STEP_FAILED event
                    failure_event = create_step_failed_event(
                        run_id=ctx.run_id,
                        step_id=step_id,
                        error=str(e),
                        error_type=type(e).__name__,
                        is_retryable=True,
                        attempt=current_attempt,
                    )
                    await ctx.storage.record_event(failure_event)  # type: ignore[union-attr]

                    # Record STEP_RETRYING event
                    from pyworkflow.engine.events import create_step_retrying_event

                    retrying_event = create_step_retrying_event(
                        run_id=ctx.run_id,
                        step_id=step_id,
                        attempt=next_attempt,
                        retry_after=str(int(delay_seconds)),
                        error=str(e),
                    )
                    # Add additional fields to event data
                    retrying_event.data["resume_at"] = resume_at.isoformat()
                    retrying_event.data["retry_strategy"] = str(retry_delay)
                    retrying_event.data["max_retries"] = max_retries
                    await ctx.storage.record_event(retrying_event)  # type: ignore[union-attr]

                    # Update retry state in context
                    ctx.set_retry_state(
                        step_id=step_id,
                        attempt=next_attempt,
                        resume_at=resume_at,
                        max_retries=max_retries,
                        retry_delay=retry_delay,
                        last_error=str(e),
                    )

                    # Raise suspension signal to pause workflow
                    # Note: The workflow-level exception handler will schedule automatic resumption
                    from pyworkflow.core.exceptions import SuspensionSignal

                    raise SuspensionSignal(
                        reason=f"retry:{step_id}",
                        resume_at=resume_at,
                        step_id=step_id,
                        attempt=next_attempt,
                    )

                else:
                    # Max retries exhausted
                    logger.error(
                        f"Step failed after {max_retries + 1} attempts: {step_name}",
                        run_id=ctx.run_id,
                        step_id=step_id,
                        error=str(e),
                        total_attempts=current_attempt,
                    )

                    # Record final STEP_FAILED event
                    failure_event = create_step_failed_event(
                        run_id=ctx.run_id,
                        step_id=step_id,
                        error=str(e),
                        error_type=type(e).__name__,
                        is_retryable=True,
                        attempt=current_attempt,
                    )
                    await ctx.storage.record_event(failure_event)  # type: ignore[union-attr]

                    ctx.clear_retry_state(step_id)

                    # Convert to RetryableError if it wasn't already
                    if not is_retryable_error:
                        raise RetryableError(
                            f"Step {step_name} failed after {max_retries + 1} attempts: {e}"
                        ) from e
                    else:
                        raise

        # Register step
        register_step(
            name=step_name,
            func=wrapper,
            original_func=func,
            max_retries=max_retries,
            retry_delay=str(retry_delay),
            timeout=timeout,
            metadata=metadata,
        )

        # Store metadata on wrapper
        wrapper.__step__ = True  # type: ignore[attr-defined]
        wrapper.__step_name__ = step_name  # type: ignore[attr-defined]
        wrapper.__step_max_retries__ = max_retries  # type: ignore[attr-defined]
        wrapper.__step_retry_delay__ = retry_delay  # type: ignore[attr-defined]
        wrapper.__step_timeout__ = timeout  # type: ignore[attr-defined]
        wrapper.__step_metadata__ = metadata or {}  # type: ignore[attr-defined]

        return wrapper

    # Support the bare ``@step`` form shown in the docstring: the decorated
    # function arrives as the first positional argument (``name``).
    if callable(name):
        return decorator(name)

    return decorator
381
+
382
+
383
+ async def _execute_with_retries(
384
+ func: Callable,
385
+ args: tuple,
386
+ kwargs: dict,
387
+ step_name: str,
388
+ max_retries: int,
389
+ retry_delay: str | int | list[int],
390
+ ) -> Any:
391
+ """
392
+ Execute a step function with retry logic (for transient mode).
393
+
394
+ Args:
395
+ func: The step function to execute
396
+ args: Positional arguments
397
+ kwargs: Keyword arguments
398
+ step_name: Name of the step for logging
399
+ max_retries: Maximum number of retry attempts
400
+ retry_delay: Retry delay strategy
401
+
402
+ Returns:
403
+ Result of the function
404
+
405
+ Raises:
406
+ Exception: If all retries exhausted
407
+ """
408
+ import asyncio
409
+
410
+ last_error: Exception | None = None
411
+
412
+ for attempt in range(max_retries + 1):
413
+ try:
414
+ return await func(*args, **kwargs)
415
+
416
+ except FatalError:
417
+ # Fatal errors are not retried
418
+ raise
419
+
420
+ except Exception as e:
421
+ last_error = e
422
+
423
+ if attempt < max_retries:
424
+ # Calculate delay
425
+ delay = _get_retry_delay(retry_delay, attempt)
426
+
427
+ logger.warning(
428
+ f"Step {step_name} failed (attempt {attempt + 1}/{max_retries + 1}), "
429
+ f"retrying in {delay}s",
430
+ error=str(e),
431
+ )
432
+
433
+ await asyncio.sleep(delay)
434
+ else:
435
+ # All retries exhausted
436
+ logger.error(
437
+ f"Step {step_name} failed after {max_retries + 1} attempts",
438
+ error=str(e),
439
+ )
440
+
441
+ assert last_error is not None # mypy: guaranteed by loop logic
442
+ raise last_error
443
+
444
+
445
+ def _get_retry_delay(retry_delay: str | int | list[int], attempt: int) -> float:
446
+ """
447
+ Calculate retry delay based on strategy.
448
+
449
+ Args:
450
+ retry_delay: Delay strategy ("exponential", int, or list)
451
+ attempt: Current attempt number (0-indexed)
452
+
453
+ Returns:
454
+ Delay in seconds
455
+ """
456
+ if retry_delay == "exponential":
457
+ # Exponential backoff: 1, 2, 4, 8, 16, ... (capped at 300s)
458
+ return min(2**attempt, 300)
459
+ elif isinstance(retry_delay, int):
460
+ return retry_delay
461
+ elif isinstance(retry_delay, list):
462
+ # Use custom delays, fall back to last value if out of range
463
+ if attempt < len(retry_delay):
464
+ return retry_delay[attempt]
465
+ return retry_delay[-1] if retry_delay else 1
466
+ else:
467
+ # Default to 1 second
468
+ return 1
469
+
470
+
471
def _generate_step_id(step_name: str, args: tuple, kwargs: dict) -> str:
    """
    Build a deterministic identifier for one step invocation.

    Hashing the step name together with its serialized arguments guarantees
    that the same call site maps onto the same ID across replays.

    Args:
        step_name: Step name
        args: Positional arguments
        kwargs: Keyword arguments

    Returns:
        Deterministic step ID of the form ``step_<name>_<hash16>``
    """
    # Fingerprint = name + serialized positional and keyword arguments.
    fingerprint = ":".join(
        (step_name, serialize_args(*args), serialize_kwargs(**kwargs))
    )
    digest = hashlib.sha256(fingerprint.encode()).hexdigest()

    return f"step_{step_name}_{digest[:16]}"