pyworkflow-engine 0.1.7__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
- dashboard/backend/app/__init__.py +1 -0
- dashboard/backend/app/config.py +32 -0
- dashboard/backend/app/controllers/__init__.py +6 -0
- dashboard/backend/app/controllers/run_controller.py +86 -0
- dashboard/backend/app/controllers/workflow_controller.py +33 -0
- dashboard/backend/app/dependencies/__init__.py +5 -0
- dashboard/backend/app/dependencies/storage.py +50 -0
- dashboard/backend/app/repositories/__init__.py +6 -0
- dashboard/backend/app/repositories/run_repository.py +80 -0
- dashboard/backend/app/repositories/workflow_repository.py +27 -0
- dashboard/backend/app/rest/__init__.py +8 -0
- dashboard/backend/app/rest/v1/__init__.py +12 -0
- dashboard/backend/app/rest/v1/health.py +33 -0
- dashboard/backend/app/rest/v1/runs.py +133 -0
- dashboard/backend/app/rest/v1/workflows.py +41 -0
- dashboard/backend/app/schemas/__init__.py +23 -0
- dashboard/backend/app/schemas/common.py +16 -0
- dashboard/backend/app/schemas/event.py +24 -0
- dashboard/backend/app/schemas/hook.py +25 -0
- dashboard/backend/app/schemas/run.py +54 -0
- dashboard/backend/app/schemas/step.py +28 -0
- dashboard/backend/app/schemas/workflow.py +31 -0
- dashboard/backend/app/server.py +87 -0
- dashboard/backend/app/services/__init__.py +6 -0
- dashboard/backend/app/services/run_service.py +240 -0
- dashboard/backend/app/services/workflow_service.py +155 -0
- dashboard/backend/main.py +18 -0
- docs/concepts/cancellation.mdx +362 -0
- docs/concepts/continue-as-new.mdx +434 -0
- docs/concepts/events.mdx +266 -0
- docs/concepts/fault-tolerance.mdx +370 -0
- docs/concepts/hooks.mdx +552 -0
- docs/concepts/limitations.mdx +167 -0
- docs/concepts/schedules.mdx +775 -0
- docs/concepts/sleep.mdx +312 -0
- docs/concepts/steps.mdx +301 -0
- docs/concepts/workflows.mdx +255 -0
- docs/guides/cli.mdx +942 -0
- docs/guides/configuration.mdx +560 -0
- docs/introduction.mdx +155 -0
- docs/quickstart.mdx +279 -0
- examples/__init__.py +1 -0
- examples/celery/__init__.py +1 -0
- examples/celery/durable/docker-compose.yml +55 -0
- examples/celery/durable/pyworkflow.config.yaml +12 -0
- examples/celery/durable/workflows/__init__.py +122 -0
- examples/celery/durable/workflows/basic.py +87 -0
- examples/celery/durable/workflows/batch_processing.py +102 -0
- examples/celery/durable/workflows/cancellation.py +273 -0
- examples/celery/durable/workflows/child_workflow_patterns.py +240 -0
- examples/celery/durable/workflows/child_workflows.py +202 -0
- examples/celery/durable/workflows/continue_as_new.py +260 -0
- examples/celery/durable/workflows/fault_tolerance.py +210 -0
- examples/celery/durable/workflows/hooks.py +211 -0
- examples/celery/durable/workflows/idempotency.py +112 -0
- examples/celery/durable/workflows/long_running.py +99 -0
- examples/celery/durable/workflows/retries.py +101 -0
- examples/celery/durable/workflows/schedules.py +209 -0
- examples/celery/transient/01_basic_workflow.py +91 -0
- examples/celery/transient/02_fault_tolerance.py +257 -0
- examples/celery/transient/__init__.py +20 -0
- examples/celery/transient/pyworkflow.config.yaml +25 -0
- examples/local/__init__.py +1 -0
- examples/local/durable/01_basic_workflow.py +94 -0
- examples/local/durable/02_file_storage.py +132 -0
- examples/local/durable/03_retries.py +169 -0
- examples/local/durable/04_long_running.py +119 -0
- examples/local/durable/05_event_log.py +145 -0
- examples/local/durable/06_idempotency.py +148 -0
- examples/local/durable/07_hooks.py +334 -0
- examples/local/durable/08_cancellation.py +233 -0
- examples/local/durable/09_child_workflows.py +198 -0
- examples/local/durable/10_child_workflow_patterns.py +265 -0
- examples/local/durable/11_continue_as_new.py +249 -0
- examples/local/durable/12_schedules.py +198 -0
- examples/local/durable/__init__.py +1 -0
- examples/local/transient/01_quick_tasks.py +87 -0
- examples/local/transient/02_retries.py +130 -0
- examples/local/transient/03_sleep.py +141 -0
- examples/local/transient/__init__.py +1 -0
- pyworkflow/__init__.py +256 -0
- pyworkflow/aws/__init__.py +68 -0
- pyworkflow/aws/context.py +234 -0
- pyworkflow/aws/handler.py +184 -0
- pyworkflow/aws/testing.py +310 -0
- pyworkflow/celery/__init__.py +41 -0
- pyworkflow/celery/app.py +198 -0
- pyworkflow/celery/scheduler.py +315 -0
- pyworkflow/celery/tasks.py +1746 -0
- pyworkflow/cli/__init__.py +132 -0
- pyworkflow/cli/__main__.py +6 -0
- pyworkflow/cli/commands/__init__.py +1 -0
- pyworkflow/cli/commands/hooks.py +640 -0
- pyworkflow/cli/commands/quickstart.py +495 -0
- pyworkflow/cli/commands/runs.py +773 -0
- pyworkflow/cli/commands/scheduler.py +130 -0
- pyworkflow/cli/commands/schedules.py +794 -0
- pyworkflow/cli/commands/setup.py +703 -0
- pyworkflow/cli/commands/worker.py +413 -0
- pyworkflow/cli/commands/workflows.py +1257 -0
- pyworkflow/cli/output/__init__.py +1 -0
- pyworkflow/cli/output/formatters.py +321 -0
- pyworkflow/cli/output/styles.py +121 -0
- pyworkflow/cli/utils/__init__.py +1 -0
- pyworkflow/cli/utils/async_helpers.py +30 -0
- pyworkflow/cli/utils/config.py +130 -0
- pyworkflow/cli/utils/config_generator.py +344 -0
- pyworkflow/cli/utils/discovery.py +53 -0
- pyworkflow/cli/utils/docker_manager.py +651 -0
- pyworkflow/cli/utils/interactive.py +364 -0
- pyworkflow/cli/utils/storage.py +115 -0
- pyworkflow/config.py +329 -0
- pyworkflow/context/__init__.py +63 -0
- pyworkflow/context/aws.py +230 -0
- pyworkflow/context/base.py +416 -0
- pyworkflow/context/local.py +930 -0
- pyworkflow/context/mock.py +381 -0
- pyworkflow/core/__init__.py +0 -0
- pyworkflow/core/exceptions.py +353 -0
- pyworkflow/core/registry.py +313 -0
- pyworkflow/core/scheduled.py +328 -0
- pyworkflow/core/step.py +494 -0
- pyworkflow/core/workflow.py +294 -0
- pyworkflow/discovery.py +248 -0
- pyworkflow/engine/__init__.py +0 -0
- pyworkflow/engine/events.py +879 -0
- pyworkflow/engine/executor.py +682 -0
- pyworkflow/engine/replay.py +273 -0
- pyworkflow/observability/__init__.py +19 -0
- pyworkflow/observability/logging.py +234 -0
- pyworkflow/primitives/__init__.py +33 -0
- pyworkflow/primitives/child_handle.py +174 -0
- pyworkflow/primitives/child_workflow.py +372 -0
- pyworkflow/primitives/continue_as_new.py +101 -0
- pyworkflow/primitives/define_hook.py +150 -0
- pyworkflow/primitives/hooks.py +97 -0
- pyworkflow/primitives/resume_hook.py +210 -0
- pyworkflow/primitives/schedule.py +545 -0
- pyworkflow/primitives/shield.py +96 -0
- pyworkflow/primitives/sleep.py +100 -0
- pyworkflow/runtime/__init__.py +21 -0
- pyworkflow/runtime/base.py +179 -0
- pyworkflow/runtime/celery.py +310 -0
- pyworkflow/runtime/factory.py +101 -0
- pyworkflow/runtime/local.py +706 -0
- pyworkflow/scheduler/__init__.py +9 -0
- pyworkflow/scheduler/local.py +248 -0
- pyworkflow/serialization/__init__.py +0 -0
- pyworkflow/serialization/decoder.py +146 -0
- pyworkflow/serialization/encoder.py +162 -0
- pyworkflow/storage/__init__.py +54 -0
- pyworkflow/storage/base.py +612 -0
- pyworkflow/storage/config.py +185 -0
- pyworkflow/storage/dynamodb.py +1315 -0
- pyworkflow/storage/file.py +827 -0
- pyworkflow/storage/memory.py +549 -0
- pyworkflow/storage/postgres.py +1161 -0
- pyworkflow/storage/schemas.py +486 -0
- pyworkflow/storage/sqlite.py +1136 -0
- pyworkflow/utils/__init__.py +0 -0
- pyworkflow/utils/duration.py +177 -0
- pyworkflow/utils/schedule.py +391 -0
- pyworkflow_engine-0.1.7.dist-info/METADATA +687 -0
- pyworkflow_engine-0.1.7.dist-info/RECORD +196 -0
- pyworkflow_engine-0.1.7.dist-info/WHEEL +5 -0
- pyworkflow_engine-0.1.7.dist-info/entry_points.txt +2 -0
- pyworkflow_engine-0.1.7.dist-info/licenses/LICENSE +21 -0
- pyworkflow_engine-0.1.7.dist-info/top_level.txt +5 -0
- tests/examples/__init__.py +0 -0
- tests/integration/__init__.py +0 -0
- tests/integration/test_cancellation.py +330 -0
- tests/integration/test_child_workflows.py +439 -0
- tests/integration/test_continue_as_new.py +428 -0
- tests/integration/test_dynamodb_storage.py +1146 -0
- tests/integration/test_fault_tolerance.py +369 -0
- tests/integration/test_schedule_storage.py +484 -0
- tests/unit/__init__.py +0 -0
- tests/unit/backends/__init__.py +1 -0
- tests/unit/backends/test_dynamodb_storage.py +1554 -0
- tests/unit/backends/test_postgres_storage.py +1281 -0
- tests/unit/backends/test_sqlite_storage.py +1460 -0
- tests/unit/conftest.py +41 -0
- tests/unit/test_cancellation.py +364 -0
- tests/unit/test_child_workflows.py +680 -0
- tests/unit/test_continue_as_new.py +441 -0
- tests/unit/test_event_limits.py +316 -0
- tests/unit/test_executor.py +320 -0
- tests/unit/test_fault_tolerance.py +334 -0
- tests/unit/test_hooks.py +495 -0
- tests/unit/test_registry.py +261 -0
- tests/unit/test_replay.py +420 -0
- tests/unit/test_schedule_schemas.py +285 -0
- tests/unit/test_schedule_utils.py +286 -0
- tests/unit/test_scheduled_workflow.py +274 -0
- tests/unit/test_step.py +353 -0
- tests/unit/test_workflow.py +243 -0
pyworkflow/core/step.py
ADDED
@@ -0,0 +1,494 @@

"""
@step decorator for defining workflow steps.

Steps are isolated, retryable units of work that:
- Execute actual business logic
- Have automatic retry on failure
- Cache results for replay
- Run independently (can be distributed)

Supports multiple runtimes:
- Local: In-process execution with optional event sourcing
- Celery: Distributed execution via Celery workers
- AWS: AWS Durable Lambda Functions with automatic checkpointing
"""

import functools
import hashlib
from collections.abc import Callable
from typing import Any

from loguru import logger

from pyworkflow.context import get_context, has_context
from pyworkflow.core.exceptions import FatalError, RetryableError
from pyworkflow.core.registry import register_step
from pyworkflow.engine.events import (
    create_step_completed_event,
    create_step_failed_event,
    create_step_started_event,
)
from pyworkflow.serialization.encoder import serialize, serialize_args, serialize_kwargs


def _get_aws_context() -> Any | None:
    """
    Get the current AWS workflow context if running in AWS environment.

    Returns None if not in AWS context or AWS module not available.
    """
    try:
        from pyworkflow.aws.context import get_aws_context

        return get_aws_context()
    except ImportError:
        # AWS module not installed
        return None


def step(
    name: str | None = None,
    max_retries: int = 3,
    retry_delay: str | int | list[int] = "exponential",
    timeout: int | None = None,
    metadata: dict[str, Any] | None = None,
) -> Callable:
    """
    Decorator to mark functions as workflow steps.

    Steps are isolated units of work with automatic retry and result caching.
    They can be called both within workflows and independently.

    Args:
        name: Optional step name (defaults to function name)
        max_retries: Maximum number of retry attempts (default: 3)
        retry_delay: Retry delay strategy:
            - "exponential": Exponential backoff (1s, 2s, 4s, 8s, ...)
            - int: Fixed delay in seconds
            - List[int]: Custom delays for each retry
        timeout: Optional timeout in seconds
        metadata: Optional metadata dictionary

    Returns:
        Decorated step function

    Examples:
        @step
        async def simple_step(x: int):
            return x * 2

        @step(max_retries=5, retry_delay=10)
        async def api_call(url: str):
            response = await httpx.get(url)
            return response.json()

        @step(retry_delay=[5, 30, 300])
        async def custom_retry_step():
            # Retries: after 5s, then 30s, then 300s
            pass
    """

    def decorator(func: Callable) -> Callable:
        step_name = name or func.__name__

        @functools.wraps(func)
        async def wrapper(*args: Any, **kwargs: Any) -> Any:
            # Check if running in AWS Durable Lambda context
            aws_ctx = _get_aws_context()
            if aws_ctx is not None:
                logger.debug(f"Step {step_name} running in AWS context, delegating to AWS SDK")
                # Delegate to AWS context for checkpointed execution
                return aws_ctx.execute_step(func, *args, step_name=step_name, **kwargs)

            # Check if we're in a workflow context
            if not has_context():
                # Called outside workflow - execute directly
                logger.debug(f"Step {step_name} called outside workflow, executing directly")
                return await func(*args, **kwargs)

            ctx = get_context()

            # Check for cancellation before executing step
            ctx.check_cancellation()

            # Transient mode: execute directly without event sourcing
            # Retries are still supported via direct execution
            if not ctx.is_durable:
                logger.debug(
                    f"Step {step_name} in transient mode, executing directly",
                    run_id=ctx.run_id,
                )
                return await _execute_with_retries(
                    func, args, kwargs, step_name, max_retries, retry_delay
                )

            # Durable mode: use event sourcing
            # Generate step ID (deterministic based on name + args)
            step_id = _generate_step_id(step_name, args, kwargs)

            # Check if step has already completed (replay)
            if not ctx.should_execute_step(step_id):
                logger.debug(
                    f"Step {step_name} already completed, using cached result",
                    run_id=ctx.run_id,
                    step_id=step_id,
                )
                return ctx.get_step_result(step_id)

            # Check if we're resuming from a retry
            retry_state = ctx.get_retry_state(step_id)
            if retry_state:
                current_attempt = retry_state["current_attempt"]
                resume_at = retry_state.get("resume_at")

                # Check if retry delay has elapsed during replay
                if ctx.is_replaying and resume_at:
                    from datetime import UTC, datetime

                    now = datetime.now(UTC)
                    if now < resume_at:
                        # Not ready to retry yet - re-raise suspension
                        logger.debug(
                            f"Retry delay not elapsed for {step_name}, re-suspending",
                            run_id=ctx.run_id,
                            step_id=step_id,
                            current_attempt=current_attempt,
                            resume_at=resume_at.isoformat(),
                        )
                        from pyworkflow.core.exceptions import SuspensionSignal

                        raise SuspensionSignal(
                            reason=f"retry:{step_id}",
                            resume_at=resume_at,
                            step_id=step_id,
                            attempt=current_attempt,
                        )
            else:
                current_attempt = 1

            # Validate event limits before executing step
            await ctx.validate_event_limits()

            # Record step start event
            start_event = create_step_started_event(
                run_id=ctx.run_id,
                step_id=step_id,
                step_name=step_name,
                args=serialize_args(*args),
                kwargs=serialize_kwargs(**kwargs),
                attempt=current_attempt,
            )
            await ctx.storage.record_event(start_event)  # type: ignore[union-attr]

            logger.info(
                f"Executing step: {step_name} (attempt {current_attempt}/{max_retries + 1})",
                run_id=ctx.run_id,
                step_id=step_id,
                step_name=step_name,
                attempt=current_attempt,
            )

            # Check for cancellation before executing step
            ctx.check_cancellation()

            try:
                # Execute step function
                result = await func(*args, **kwargs)

                # Record completion event
                completion_event = create_step_completed_event(
                    run_id=ctx.run_id,
                    step_id=step_id,
                    result=serialize(result),
                    step_name=step_name,
                )
                await ctx.storage.record_event(completion_event)  # type: ignore[union-attr]

                # Cache result for replay
                ctx.cache_step_result(step_id, result)

                # Clear retry state on success
                ctx.clear_retry_state(step_id)

                logger.info(
                    f"Step completed: {step_name}",
                    run_id=ctx.run_id,
                    step_id=step_id,
                )

                return result

            except FatalError as e:
                # Fatal error - don't retry
                logger.error(
                    f"Step failed (fatal): {step_name}",
                    run_id=ctx.run_id,
                    step_id=step_id,
                    error=str(e),
                )

                # Record failure event
                failure_event = create_step_failed_event(
                    run_id=ctx.run_id,
                    step_id=step_id,
                    error=str(e),
                    error_type=type(e).__name__,
                    is_retryable=False,
                    attempt=current_attempt,
                )
                await ctx.storage.record_event(failure_event)  # type: ignore[union-attr]

                # Clear retry state
                ctx.clear_retry_state(step_id)

                raise

            except (RetryableError, Exception) as e:
                # Handle retriable errors (RetryableError or generic Exception)
                # FatalError is already handled above
                is_retryable_error = isinstance(e, RetryableError)

                # Check if we have retries left
                if current_attempt <= max_retries:
                    # We can retry
                    next_attempt = current_attempt + 1

                    # Calculate retry delay
                    delay_seconds: float
                    if isinstance(e, RetryableError) and e.retry_after is not None:
                        # Use RetryableError's specified delay
                        delay_seconds = float(e.get_retry_delay_seconds() or 0)
                    else:
                        # Use step's configured retry delay strategy
                        delay_seconds = _get_retry_delay(retry_delay, current_attempt - 1)

                    # Calculate resume time
                    from datetime import UTC, datetime, timedelta

                    resume_at = datetime.now(UTC) + timedelta(seconds=delay_seconds)

                    logger.warning(
                        f"Step failed (retriable): {step_name}, "
                        f"retrying in {delay_seconds}s (attempt {next_attempt}/{max_retries + 1})",
                        run_id=ctx.run_id,
                        step_id=step_id,
                        error=str(e),
                        current_attempt=current_attempt,
                        next_attempt=next_attempt,
                    )

                    # Record STEP_FAILED event
                    failure_event = create_step_failed_event(
                        run_id=ctx.run_id,
                        step_id=step_id,
                        error=str(e),
                        error_type=type(e).__name__,
                        is_retryable=True,
                        attempt=current_attempt,
                    )
                    await ctx.storage.record_event(failure_event)  # type: ignore[union-attr]

                    # Record STEP_RETRYING event
                    from pyworkflow.engine.events import create_step_retrying_event

                    retrying_event = create_step_retrying_event(
                        run_id=ctx.run_id,
                        step_id=step_id,
                        attempt=next_attempt,
                        retry_after=str(int(delay_seconds)),
                        error=str(e),
                    )
                    # Add additional fields to event data
                    retrying_event.data["resume_at"] = resume_at.isoformat()
                    retrying_event.data["retry_strategy"] = str(retry_delay)
                    retrying_event.data["max_retries"] = max_retries
                    await ctx.storage.record_event(retrying_event)  # type: ignore[union-attr]

                    # Update retry state in context
                    ctx.set_retry_state(
                        step_id=step_id,
                        attempt=next_attempt,
                        resume_at=resume_at,
                        max_retries=max_retries,
                        retry_delay=retry_delay,
                        last_error=str(e),
                    )

                    # Raise suspension signal to pause workflow
                    # Note: The workflow-level exception handler will schedule automatic resumption
                    from pyworkflow.core.exceptions import SuspensionSignal

                    raise SuspensionSignal(
                        reason=f"retry:{step_id}",
                        resume_at=resume_at,
                        step_id=step_id,
                        attempt=next_attempt,
                    )

                else:
                    # Max retries exhausted
                    logger.error(
                        f"Step failed after {max_retries + 1} attempts: {step_name}",
                        run_id=ctx.run_id,
                        step_id=step_id,
                        error=str(e),
                        total_attempts=current_attempt,
                    )

                    # Record final STEP_FAILED event
                    failure_event = create_step_failed_event(
                        run_id=ctx.run_id,
                        step_id=step_id,
                        error=str(e),
                        error_type=type(e).__name__,
                        is_retryable=True,
                        attempt=current_attempt,
                    )
                    await ctx.storage.record_event(failure_event)  # type: ignore[union-attr]

                    ctx.clear_retry_state(step_id)

                    # Convert to RetryableError if it wasn't already
                    if not is_retryable_error:
                        raise RetryableError(
                            f"Step {step_name} failed after {max_retries + 1} attempts: {e}"
                        ) from e
                    else:
                        raise

        # Register step
        register_step(
            name=step_name,
            func=wrapper,
            original_func=func,
            max_retries=max_retries,
            retry_delay=str(retry_delay),
            timeout=timeout,
            metadata=metadata,
        )

        # Store metadata on wrapper
        wrapper.__step__ = True  # type: ignore[attr-defined]
        wrapper.__step_name__ = step_name  # type: ignore[attr-defined]
        wrapper.__step_max_retries__ = max_retries  # type: ignore[attr-defined]
        wrapper.__step_retry_delay__ = retry_delay  # type: ignore[attr-defined]
        wrapper.__step_timeout__ = timeout  # type: ignore[attr-defined]
        wrapper.__step_metadata__ = metadata or {}  # type: ignore[attr-defined]

        return wrapper

    return decorator


async def _execute_with_retries(
    func: Callable,
    args: tuple,
    kwargs: dict,
    step_name: str,
    max_retries: int,
    retry_delay: str | int | list[int],
) -> Any:
    """
    Execute a step function with retry logic (for transient mode).

    Args:
        func: The step function to execute
        args: Positional arguments
        kwargs: Keyword arguments
        step_name: Name of the step for logging
        max_retries: Maximum number of retry attempts
        retry_delay: Retry delay strategy

    Returns:
        Result of the function

    Raises:
        Exception: If all retries exhausted
    """
    import asyncio

    last_error: Exception | None = None

    for attempt in range(max_retries + 1):
        try:
            return await func(*args, **kwargs)

        except FatalError:
            # Fatal errors are not retried
            raise

        except Exception as e:
            last_error = e

            if attempt < max_retries:
                # Calculate delay
                delay = _get_retry_delay(retry_delay, attempt)

                logger.warning(
                    f"Step {step_name} failed (attempt {attempt + 1}/{max_retries + 1}), "
                    f"retrying in {delay}s",
                    error=str(e),
                )

                await asyncio.sleep(delay)
            else:
                # All retries exhausted
                logger.error(
                    f"Step {step_name} failed after {max_retries + 1} attempts",
                    error=str(e),
                )

    assert last_error is not None  # mypy: guaranteed by loop logic
    raise last_error


def _get_retry_delay(retry_delay: str | int | list[int], attempt: int) -> float:
    """
    Calculate retry delay based on strategy.

    Args:
        retry_delay: Delay strategy ("exponential", int, or list)
        attempt: Current attempt number (0-indexed)

    Returns:
        Delay in seconds
    """
    if retry_delay == "exponential":
        # Exponential backoff: 1, 2, 4, 8, 16, ... (capped at 300s)
        return min(2**attempt, 300)
    elif isinstance(retry_delay, int):
        return retry_delay
    elif isinstance(retry_delay, list):
        # Use custom delays, fall back to last value if out of range
        if attempt < len(retry_delay):
            return retry_delay[attempt]
        return retry_delay[-1] if retry_delay else 1
    else:
        # Default to 1 second
        return 1


def _generate_step_id(step_name: str, args: tuple, kwargs: dict) -> str:
    """
    Generate deterministic step ID based on name and arguments.

    This ensures the same step with same arguments always gets the same ID,
    enabling proper replay behavior.

    Args:
        step_name: Step name
        args: Positional arguments
        kwargs: Keyword arguments

    Returns:
        Deterministic step ID
    """
    # Serialize arguments
    args_str = serialize_args(*args)
    kwargs_str = serialize_kwargs(**kwargs)

    # Create hash of step name + arguments
    content = f"{step_name}:{args_str}:{kwargs_str}"
    hash_hex = hashlib.sha256(content.encode()).hexdigest()[:16]

    return f"step_{step_name}_{hash_hex}"
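The two private helpers at the bottom of the file are easy to sanity-check in isolation. The snippet below imports them directly, which is purely illustrative (the leading underscore marks them as internal); the expected values follow from the code above, and the step-ID determinism assumes the serialization encoder produces identical strings for identical arguments, as the docstring relies on.

from pyworkflow.core.step import _generate_step_id, _get_retry_delay

# Delay strategies (attempt is 0-indexed):
assert _get_retry_delay("exponential", 0) == 1     # 2**0
assert _get_retry_delay("exponential", 3) == 8     # 2**3
assert _get_retry_delay("exponential", 20) == 300  # capped at 300s
assert _get_retry_delay(10, 5) == 10               # fixed delay in seconds
assert _get_retry_delay([5, 30, 300], 1) == 30     # custom schedule
assert _get_retry_delay([5, 30, 300], 9) == 300    # clamps to the last entry

# Step IDs hash "name:serialized_args:serialized_kwargs", so the same call
# always yields the same ID (and therefore a cache hit on replay):
a = _generate_step_id("charge", ("cust-1",), {"amount": 10})
b = _generate_step_id("charge", ("cust-1",), {"amount": 10})
assert a == b and a.startswith("step_charge_")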