pyworkflow-engine 0.1.7__py3-none-any.whl
- dashboard/backend/app/__init__.py +1 -0
- dashboard/backend/app/config.py +32 -0
- dashboard/backend/app/controllers/__init__.py +6 -0
- dashboard/backend/app/controllers/run_controller.py +86 -0
- dashboard/backend/app/controllers/workflow_controller.py +33 -0
- dashboard/backend/app/dependencies/__init__.py +5 -0
- dashboard/backend/app/dependencies/storage.py +50 -0
- dashboard/backend/app/repositories/__init__.py +6 -0
- dashboard/backend/app/repositories/run_repository.py +80 -0
- dashboard/backend/app/repositories/workflow_repository.py +27 -0
- dashboard/backend/app/rest/__init__.py +8 -0
- dashboard/backend/app/rest/v1/__init__.py +12 -0
- dashboard/backend/app/rest/v1/health.py +33 -0
- dashboard/backend/app/rest/v1/runs.py +133 -0
- dashboard/backend/app/rest/v1/workflows.py +41 -0
- dashboard/backend/app/schemas/__init__.py +23 -0
- dashboard/backend/app/schemas/common.py +16 -0
- dashboard/backend/app/schemas/event.py +24 -0
- dashboard/backend/app/schemas/hook.py +25 -0
- dashboard/backend/app/schemas/run.py +54 -0
- dashboard/backend/app/schemas/step.py +28 -0
- dashboard/backend/app/schemas/workflow.py +31 -0
- dashboard/backend/app/server.py +87 -0
- dashboard/backend/app/services/__init__.py +6 -0
- dashboard/backend/app/services/run_service.py +240 -0
- dashboard/backend/app/services/workflow_service.py +155 -0
- dashboard/backend/main.py +18 -0
- docs/concepts/cancellation.mdx +362 -0
- docs/concepts/continue-as-new.mdx +434 -0
- docs/concepts/events.mdx +266 -0
- docs/concepts/fault-tolerance.mdx +370 -0
- docs/concepts/hooks.mdx +552 -0
- docs/concepts/limitations.mdx +167 -0
- docs/concepts/schedules.mdx +775 -0
- docs/concepts/sleep.mdx +312 -0
- docs/concepts/steps.mdx +301 -0
- docs/concepts/workflows.mdx +255 -0
- docs/guides/cli.mdx +942 -0
- docs/guides/configuration.mdx +560 -0
- docs/introduction.mdx +155 -0
- docs/quickstart.mdx +279 -0
- examples/__init__.py +1 -0
- examples/celery/__init__.py +1 -0
- examples/celery/durable/docker-compose.yml +55 -0
- examples/celery/durable/pyworkflow.config.yaml +12 -0
- examples/celery/durable/workflows/__init__.py +122 -0
- examples/celery/durable/workflows/basic.py +87 -0
- examples/celery/durable/workflows/batch_processing.py +102 -0
- examples/celery/durable/workflows/cancellation.py +273 -0
- examples/celery/durable/workflows/child_workflow_patterns.py +240 -0
- examples/celery/durable/workflows/child_workflows.py +202 -0
- examples/celery/durable/workflows/continue_as_new.py +260 -0
- examples/celery/durable/workflows/fault_tolerance.py +210 -0
- examples/celery/durable/workflows/hooks.py +211 -0
- examples/celery/durable/workflows/idempotency.py +112 -0
- examples/celery/durable/workflows/long_running.py +99 -0
- examples/celery/durable/workflows/retries.py +101 -0
- examples/celery/durable/workflows/schedules.py +209 -0
- examples/celery/transient/01_basic_workflow.py +91 -0
- examples/celery/transient/02_fault_tolerance.py +257 -0
- examples/celery/transient/__init__.py +20 -0
- examples/celery/transient/pyworkflow.config.yaml +25 -0
- examples/local/__init__.py +1 -0
- examples/local/durable/01_basic_workflow.py +94 -0
- examples/local/durable/02_file_storage.py +132 -0
- examples/local/durable/03_retries.py +169 -0
- examples/local/durable/04_long_running.py +119 -0
- examples/local/durable/05_event_log.py +145 -0
- examples/local/durable/06_idempotency.py +148 -0
- examples/local/durable/07_hooks.py +334 -0
- examples/local/durable/08_cancellation.py +233 -0
- examples/local/durable/09_child_workflows.py +198 -0
- examples/local/durable/10_child_workflow_patterns.py +265 -0
- examples/local/durable/11_continue_as_new.py +249 -0
- examples/local/durable/12_schedules.py +198 -0
- examples/local/durable/__init__.py +1 -0
- examples/local/transient/01_quick_tasks.py +87 -0
- examples/local/transient/02_retries.py +130 -0
- examples/local/transient/03_sleep.py +141 -0
- examples/local/transient/__init__.py +1 -0
- pyworkflow/__init__.py +256 -0
- pyworkflow/aws/__init__.py +68 -0
- pyworkflow/aws/context.py +234 -0
- pyworkflow/aws/handler.py +184 -0
- pyworkflow/aws/testing.py +310 -0
- pyworkflow/celery/__init__.py +41 -0
- pyworkflow/celery/app.py +198 -0
- pyworkflow/celery/scheduler.py +315 -0
- pyworkflow/celery/tasks.py +1746 -0
- pyworkflow/cli/__init__.py +132 -0
- pyworkflow/cli/__main__.py +6 -0
- pyworkflow/cli/commands/__init__.py +1 -0
- pyworkflow/cli/commands/hooks.py +640 -0
- pyworkflow/cli/commands/quickstart.py +495 -0
- pyworkflow/cli/commands/runs.py +773 -0
- pyworkflow/cli/commands/scheduler.py +130 -0
- pyworkflow/cli/commands/schedules.py +794 -0
- pyworkflow/cli/commands/setup.py +703 -0
- pyworkflow/cli/commands/worker.py +413 -0
- pyworkflow/cli/commands/workflows.py +1257 -0
- pyworkflow/cli/output/__init__.py +1 -0
- pyworkflow/cli/output/formatters.py +321 -0
- pyworkflow/cli/output/styles.py +121 -0
- pyworkflow/cli/utils/__init__.py +1 -0
- pyworkflow/cli/utils/async_helpers.py +30 -0
- pyworkflow/cli/utils/config.py +130 -0
- pyworkflow/cli/utils/config_generator.py +344 -0
- pyworkflow/cli/utils/discovery.py +53 -0
- pyworkflow/cli/utils/docker_manager.py +651 -0
- pyworkflow/cli/utils/interactive.py +364 -0
- pyworkflow/cli/utils/storage.py +115 -0
- pyworkflow/config.py +329 -0
- pyworkflow/context/__init__.py +63 -0
- pyworkflow/context/aws.py +230 -0
- pyworkflow/context/base.py +416 -0
- pyworkflow/context/local.py +930 -0
- pyworkflow/context/mock.py +381 -0
- pyworkflow/core/__init__.py +0 -0
- pyworkflow/core/exceptions.py +353 -0
- pyworkflow/core/registry.py +313 -0
- pyworkflow/core/scheduled.py +328 -0
- pyworkflow/core/step.py +494 -0
- pyworkflow/core/workflow.py +294 -0
- pyworkflow/discovery.py +248 -0
- pyworkflow/engine/__init__.py +0 -0
- pyworkflow/engine/events.py +879 -0
- pyworkflow/engine/executor.py +682 -0
- pyworkflow/engine/replay.py +273 -0
- pyworkflow/observability/__init__.py +19 -0
- pyworkflow/observability/logging.py +234 -0
- pyworkflow/primitives/__init__.py +33 -0
- pyworkflow/primitives/child_handle.py +174 -0
- pyworkflow/primitives/child_workflow.py +372 -0
- pyworkflow/primitives/continue_as_new.py +101 -0
- pyworkflow/primitives/define_hook.py +150 -0
- pyworkflow/primitives/hooks.py +97 -0
- pyworkflow/primitives/resume_hook.py +210 -0
- pyworkflow/primitives/schedule.py +545 -0
- pyworkflow/primitives/shield.py +96 -0
- pyworkflow/primitives/sleep.py +100 -0
- pyworkflow/runtime/__init__.py +21 -0
- pyworkflow/runtime/base.py +179 -0
- pyworkflow/runtime/celery.py +310 -0
- pyworkflow/runtime/factory.py +101 -0
- pyworkflow/runtime/local.py +706 -0
- pyworkflow/scheduler/__init__.py +9 -0
- pyworkflow/scheduler/local.py +248 -0
- pyworkflow/serialization/__init__.py +0 -0
- pyworkflow/serialization/decoder.py +146 -0
- pyworkflow/serialization/encoder.py +162 -0
- pyworkflow/storage/__init__.py +54 -0
- pyworkflow/storage/base.py +612 -0
- pyworkflow/storage/config.py +185 -0
- pyworkflow/storage/dynamodb.py +1315 -0
- pyworkflow/storage/file.py +827 -0
- pyworkflow/storage/memory.py +549 -0
- pyworkflow/storage/postgres.py +1161 -0
- pyworkflow/storage/schemas.py +486 -0
- pyworkflow/storage/sqlite.py +1136 -0
- pyworkflow/utils/__init__.py +0 -0
- pyworkflow/utils/duration.py +177 -0
- pyworkflow/utils/schedule.py +391 -0
- pyworkflow_engine-0.1.7.dist-info/METADATA +687 -0
- pyworkflow_engine-0.1.7.dist-info/RECORD +196 -0
- pyworkflow_engine-0.1.7.dist-info/WHEEL +5 -0
- pyworkflow_engine-0.1.7.dist-info/entry_points.txt +2 -0
- pyworkflow_engine-0.1.7.dist-info/licenses/LICENSE +21 -0
- pyworkflow_engine-0.1.7.dist-info/top_level.txt +5 -0
- tests/examples/__init__.py +0 -0
- tests/integration/__init__.py +0 -0
- tests/integration/test_cancellation.py +330 -0
- tests/integration/test_child_workflows.py +439 -0
- tests/integration/test_continue_as_new.py +428 -0
- tests/integration/test_dynamodb_storage.py +1146 -0
- tests/integration/test_fault_tolerance.py +369 -0
- tests/integration/test_schedule_storage.py +484 -0
- tests/unit/__init__.py +0 -0
- tests/unit/backends/__init__.py +1 -0
- tests/unit/backends/test_dynamodb_storage.py +1554 -0
- tests/unit/backends/test_postgres_storage.py +1281 -0
- tests/unit/backends/test_sqlite_storage.py +1460 -0
- tests/unit/conftest.py +41 -0
- tests/unit/test_cancellation.py +364 -0
- tests/unit/test_child_workflows.py +680 -0
- tests/unit/test_continue_as_new.py +441 -0
- tests/unit/test_event_limits.py +316 -0
- tests/unit/test_executor.py +320 -0
- tests/unit/test_fault_tolerance.py +334 -0
- tests/unit/test_hooks.py +495 -0
- tests/unit/test_registry.py +261 -0
- tests/unit/test_replay.py +420 -0
- tests/unit/test_schedule_schemas.py +285 -0
- tests/unit/test_schedule_utils.py +286 -0
- tests/unit/test_scheduled_workflow.py +274 -0
- tests/unit/test_step.py +353 -0
- tests/unit/test_workflow.py +243 -0
pyworkflow/celery/tasks.py

@@ -0,0 +1,1746 @@

```python
"""
Celery tasks for distributed workflow and step execution.

These tasks enable:
- Distributed step execution across workers
- Automatic retry with exponential backoff
- Scheduled sleep resumption
- Workflow orchestration
- Fault tolerance with automatic recovery on worker failures
"""

import asyncio
import uuid
from collections.abc import Callable
from datetime import UTC, datetime
from typing import Any

from celery import Task
from celery.exceptions import WorkerLostError
from loguru import logger

from pyworkflow.celery.app import celery_app
from pyworkflow.core.exceptions import (
    CancellationError,
    ContinueAsNewSignal,
    FatalError,
    RetryableError,
    SuspensionSignal,
)
from pyworkflow.core.registry import WorkflowMetadata, get_workflow
from pyworkflow.core.workflow import execute_workflow_with_context
from pyworkflow.engine.events import (
    EventType,
    create_child_workflow_cancelled_event,
    create_workflow_cancelled_event,
    create_workflow_continued_as_new_event,
    create_workflow_interrupted_event,
    create_workflow_started_event,
)
from pyworkflow.serialization.decoder import deserialize_args, deserialize_kwargs
from pyworkflow.serialization.encoder import serialize_args, serialize_kwargs
from pyworkflow.storage.base import StorageBackend
from pyworkflow.storage.schemas import RunStatus, WorkflowRun


class WorkflowTask(Task):
    """Base task class for workflow execution with custom error handling."""

    autoretry_for = (RetryableError,)
    retry_kwargs = {"max_retries": 3}
    retry_backoff = True
    retry_backoff_max = 600
    retry_jitter = True

    def on_failure(self, exc, task_id, args, kwargs, einfo):
        """
        Handle task failure.

        Detects worker loss and handles recovery appropriately:
        - WorkerLostError: Infrastructure failure, may trigger recovery
        - Other exceptions: Application failure
        """
        is_worker_loss = isinstance(exc, WorkerLostError)

        if is_worker_loss:
            logger.warning(
                f"Task {self.name} interrupted due to worker loss",
                task_id=task_id,
                error=str(exc),
            )
            # Note: Recovery is handled when the task is requeued and picked up
            # by another worker. See _handle_workflow_recovery() for logic.
        else:
            logger.error(
                f"Task {self.name} failed",
                task_id=task_id,
                error=str(exc),
                traceback=einfo.traceback if einfo else None,
            )

    def on_retry(self, exc, task_id, args, kwargs, einfo):
        """Handle task retry."""
        logger.warning(
            f"Task {self.name} retrying",
            task_id=task_id,
            error=str(exc),
            retry_count=self.request.retries,
        )
```
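For reference, Celery's `autoretry_for` machinery retries `RetryableError` up to three times here, delaying the n-th retry by roughly `2**n` seconds and capping it at `retry_backoff_max` (600 s); with `retry_jitter` enabled the actual delay is drawn uniformly between zero and that bound. A standalone sketch of the schedule these class attributes produce (illustration only, not part of the package):

```python
import random

RETRY_BACKOFF_MAX = 600  # mirrors WorkflowTask.retry_backoff_max

def retry_delay(retries: int, jitter: bool = True) -> float:
    """Approximate delay before the next attempt: 1s, 2s, 4s, ... capped at 600s."""
    delay = min(2**retries, RETRY_BACKOFF_MAX)
    return random.uniform(0, delay) if jitter else float(delay)

# Without jitter, the three permitted retries fire after ~1s, 2s, and 4s.
print([retry_delay(n, jitter=False) for n in range(3)])  # [1.0, 2.0, 4.0]
```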
```python
@celery_app.task(
    name="pyworkflow.execute_step",
    base=WorkflowTask,
    bind=True,
    queue="pyworkflow.steps",
)
def execute_step_task(
    self: WorkflowTask,
    step_name: str,
    args_json: str,
    kwargs_json: str,
    run_id: str,
    step_id: str,
    max_retries: int = 3,
    storage_config: dict[str, Any] | None = None,
) -> Any:
    """
    Execute a workflow step in a Celery worker.

    This task runs a single step and handles retries automatically.

    Args:
        step_name: Name of the step function
        args_json: Serialized positional arguments
        kwargs_json: Serialized keyword arguments
        run_id: Workflow run ID
        step_id: Step execution ID
        max_retries: Maximum retry attempts
        storage_config: Storage backend configuration

    Returns:
        Step result (serialized)

    Raises:
        FatalError: For non-retriable errors
        RetryableError: For retriable errors (triggers automatic retry)
    """
    from pyworkflow.core.registry import _registry

    logger.info(
        f"Executing step: {step_name}",
        run_id=run_id,
        step_id=step_id,
        attempt=self.request.retries + 1,
    )

    # Get step metadata
    step_meta = _registry.get_step(step_name)
    if not step_meta:
        raise FatalError(f"Step '{step_name}' not found in registry")

    # Deserialize arguments
    args = deserialize_args(args_json)
    kwargs = deserialize_kwargs(kwargs_json)

    # Execute step function
    try:
        # Get the original function (unwrapped from decorator)
        step_func = step_meta.original_func

        # Execute the step
        if asyncio.iscoroutinefunction(step_func):
            result = asyncio.run(step_func(*args, **kwargs))
        else:
            result = step_func(*args, **kwargs)

        logger.info(
            f"Step completed: {step_name}",
            run_id=run_id,
            step_id=step_id,
        )

        return result

    except FatalError:
        logger.error(f"Step failed (fatal): {step_name}", run_id=run_id, step_id=step_id)
        raise

    except RetryableError as e:
        logger.warning(
            f"Step failed (retriable): {step_name}",
            run_id=run_id,
            step_id=step_id,
            retry_after=e.retry_after,
        )
        # Let Celery handle the retry
        raise self.retry(exc=e, countdown=e.get_retry_delay_seconds() or 60)

    except Exception as e:
        logger.error(
            f"Step failed (unexpected): {step_name}",
            run_id=run_id,
            step_id=step_id,
            error=str(e),
            exc_info=True,
        )
        # Treat unexpected errors as retriable
        raise self.retry(exc=RetryableError(str(e)), countdown=60)
```
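Dispatching a step through this task only requires the two serialized argument payloads plus the run and step identifiers. A hedged sketch of a producer-side call (the step name and IDs are made up; `serialize_args`/`serialize_kwargs` are the encoders imported above):

```python
from pyworkflow.celery.tasks import execute_step_task
from pyworkflow.serialization.encoder import serialize_args, serialize_kwargs

# Enqueue onto the "pyworkflow.steps" queue; a worker deserializes and executes.
async_result = execute_step_task.delay(
    step_name="charge_card",                # hypothetical registered step
    args_json=serialize_args("cust_42"),
    kwargs_json=serialize_kwargs(amount_cents=1999),
    run_id="run_0123456789abcdef",          # hypothetical run ID
    step_id="step_1",
)
```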
```python
@celery_app.task(
    name="pyworkflow.start_workflow",
    queue="pyworkflow.workflows",
)
def start_workflow_task(
    workflow_name: str,
    args_json: str,
    kwargs_json: str,
    run_id: str,
    storage_config: dict[str, Any] | None = None,
    idempotency_key: str | None = None,
) -> str:
    """
    Start a workflow execution.

    This task executes on Celery workers and runs the workflow directly.

    Args:
        workflow_name: Name of the workflow
        args_json: Serialized positional arguments
        kwargs_json: Serialized keyword arguments
        run_id: Workflow run ID (generated by the caller)
        storage_config: Storage backend configuration
        idempotency_key: Optional idempotency key

    Returns:
        Workflow run ID
    """
    logger.info(f"Starting workflow on worker: {workflow_name}", run_id=run_id)

    # Get workflow metadata
    workflow_meta = get_workflow(workflow_name)
    if not workflow_meta:
        raise ValueError(f"Workflow '{workflow_name}' not found in registry")

    # Deserialize arguments
    args = deserialize_args(args_json)
    kwargs = deserialize_kwargs(kwargs_json)

    # Get storage backend
    storage = _get_storage_backend(storage_config)

    # Execute workflow directly on worker
    result_run_id = asyncio.run(
        _start_workflow_on_worker(
            workflow_meta=workflow_meta,
            args=args,
            kwargs=kwargs,
            storage=storage,
            storage_config=storage_config,
            idempotency_key=idempotency_key,
            run_id=run_id,
        )
    )

    logger.info(f"Workflow execution initiated: {workflow_name}", run_id=result_run_id)
    return result_run_id
```
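Because `_start_workflow_on_worker` (further down) short-circuits when an `idempotency_key` already maps to a run, producers can re-send this task safely. A sketch under that assumption (workflow name and key are illustrative):

```python
import uuid

from pyworkflow.celery.tasks import start_workflow_task
from pyworkflow.serialization.encoder import serialize_args, serialize_kwargs

payload = dict(
    workflow_name="order_fulfillment",   # hypothetical registered workflow
    args_json=serialize_args("order_42"),
    kwargs_json=serialize_kwargs(),
    idempotency_key="order_42:fulfill",  # same key => same logical run
)
# A retried producer may enqueue twice; the second send short-circuits on the key.
start_workflow_task.delay(run_id=f"run_{uuid.uuid4().hex[:16]}", **payload)
start_workflow_task.delay(run_id=f"run_{uuid.uuid4().hex[:16]}", **payload)
```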
```python
@celery_app.task(
    name="pyworkflow.start_child_workflow",
    queue="pyworkflow.workflows",
)
def start_child_workflow_task(
    workflow_name: str,
    args_json: str,
    kwargs_json: str,
    child_run_id: str,
    storage_config: dict[str, Any] | None,
    parent_run_id: str,
    child_id: str,
    wait_for_completion: bool,
) -> str:
    """
    Start a child workflow execution on a Celery worker.

    This task executes child workflows and handles parent notification
    when the child completes or fails.

    Args:
        workflow_name: Name of the child workflow
        args_json: Serialized positional arguments
        kwargs_json: Serialized keyword arguments
        child_run_id: Child workflow run ID (already created by parent)
        storage_config: Storage backend configuration
        parent_run_id: Parent workflow run ID
        child_id: Deterministic child ID for replay
        wait_for_completion: Whether parent is waiting for child

    Returns:
        Child workflow run ID
    """
    logger.info(
        f"Starting child workflow on worker: {workflow_name}",
        child_run_id=child_run_id,
        parent_run_id=parent_run_id,
    )

    # Get workflow metadata
    workflow_meta = get_workflow(workflow_name)
    if not workflow_meta:
        raise ValueError(f"Workflow '{workflow_name}' not found in registry")

    # Deserialize arguments
    args = deserialize_args(args_json)
    kwargs = deserialize_kwargs(kwargs_json)

    # Get storage backend
    storage = _get_storage_backend(storage_config)

    # Execute child workflow on worker
    asyncio.run(
        _execute_child_workflow_on_worker(
            workflow_func=workflow_meta.func,
            workflow_name=workflow_name,
            args=args,
            kwargs=kwargs,
            child_run_id=child_run_id,
            storage=storage,
            storage_config=storage_config,
            parent_run_id=parent_run_id,
            child_id=child_id,
            wait_for_completion=wait_for_completion,
        )
    )

    logger.info(
        f"Child workflow execution completed: {workflow_name}",
        child_run_id=child_run_id,
    )
    return child_run_id
```
```python
async def _execute_child_workflow_on_worker(
    workflow_func: Callable[..., Any],
    workflow_name: str,
    args: tuple[Any, ...],
    kwargs: dict[str, Any],
    child_run_id: str,
    storage: StorageBackend,
    storage_config: dict[str, Any] | None,
    parent_run_id: str,
    child_id: str,
    wait_for_completion: bool,
) -> None:
    """
    Execute a child workflow on a Celery worker and notify the parent on completion.

    This handles:
    1. Executing the child workflow
    2. Recording completion/failure events in the parent's log
    3. Triggering parent resumption if waiting
    """
    # Ensure storage is connected (some backends like SQLite require this)
    if hasattr(storage, "connect"):
        await storage.connect()

    from pyworkflow.engine.events import (
        create_child_workflow_completed_event,
        create_child_workflow_failed_event,
    )
    from pyworkflow.serialization.encoder import serialize

    try:
        # Update status to RUNNING
        await storage.update_run_status(child_run_id, RunStatus.RUNNING)

        # Execute the child workflow
        result = await execute_workflow_with_context(
            run_id=child_run_id,
            workflow_func=workflow_func,
            workflow_name=workflow_name,
            args=args,
            kwargs=kwargs,
            storage=storage,
            durable=True,
            event_log=None,  # Fresh execution
        )

        # Update status to COMPLETED
        serialized_result = serialize(result)
        await storage.update_run_status(child_run_id, RunStatus.COMPLETED, result=serialized_result)

        # Record completion in parent's log
        completion_event = create_child_workflow_completed_event(
            run_id=parent_run_id,
            child_id=child_id,
            child_run_id=child_run_id,
            result=serialized_result,
        )
        await storage.record_event(completion_event)

        logger.info(
            f"Child workflow completed: {workflow_name}",
            parent_run_id=parent_run_id,
            child_run_id=child_run_id,
        )

        # If parent is waiting, trigger resumption
        if wait_for_completion:
            await _trigger_parent_resumption_celery(parent_run_id, storage, storage_config)

    except SuspensionSignal as e:
        # Child workflow suspended (e.g., sleep, hook).
        # Update status and don't notify parent yet - handled on child resumption.
        await storage.update_run_status(child_run_id, RunStatus.SUSPENDED)
        logger.debug(
            f"Child workflow suspended: {workflow_name}",
            parent_run_id=parent_run_id,
            child_run_id=child_run_id,
        )

        # Schedule automatic resumption if we have a resume_at time
        resume_at = e.data.get("resume_at") if e.data else None
        if resume_at:
            schedule_workflow_resumption(child_run_id, resume_at, storage_config)

    except ContinueAsNewSignal as e:
        # Child workflow continuing as new execution
        from pyworkflow.core.registry import get_workflow

        child_workflow_meta = get_workflow(workflow_name)
        if not child_workflow_meta:
            raise ValueError(f"Workflow '{workflow_name}' not found in registry")

        new_run_id = await _handle_continue_as_new_celery(
            current_run_id=child_run_id,
            workflow_meta=child_workflow_meta,
            storage=storage,
            storage_config=storage_config,
            new_args=e.workflow_args,
            new_kwargs=e.workflow_kwargs,
            parent_run_id=parent_run_id,
        )

        logger.info(
            f"Child workflow continued as new: {workflow_name}",
            old_run_id=child_run_id,
            new_run_id=new_run_id,
            parent_run_id=parent_run_id,
        )

    except Exception as e:
        # Child workflow failed
        error_msg = str(e)
        error_type = type(e).__name__

        await storage.update_run_status(child_run_id, RunStatus.FAILED, error=error_msg)

        # Record failure in parent's log
        failure_event = create_child_workflow_failed_event(
            run_id=parent_run_id,
            child_id=child_id,
            child_run_id=child_run_id,
            error=error_msg,
            error_type=error_type,
        )
        await storage.record_event(failure_event)

        logger.error(
            f"Child workflow failed: {workflow_name}",
            parent_run_id=parent_run_id,
            child_run_id=child_run_id,
            error=error_msg,
        )

        # If parent is waiting, trigger resumption (will raise error on replay)
        if wait_for_completion:
            await _trigger_parent_resumption_celery(parent_run_id, storage, storage_config)
```
```python
async def _trigger_parent_resumption_celery(
    parent_run_id: str,
    storage: StorageBackend,
    storage_config: dict[str, Any] | None,
) -> None:
    """
    Trigger parent workflow resumption after a child completes.

    Checks whether the parent is suspended and schedules resumption via Celery.
    """
    parent_run = await storage.get_run(parent_run_id)
    if parent_run and parent_run.status == RunStatus.SUSPENDED:
        logger.debug(
            "Triggering parent resumption via Celery",
            parent_run_id=parent_run_id,
        )
        # Schedule immediate resumption via Celery
        schedule_workflow_resumption(parent_run_id, datetime.now(UTC), storage_config)
```
```python
async def _notify_parent_of_child_completion(
    run: WorkflowRun,
    storage: StorageBackend,
    storage_config: dict[str, Any] | None,
    status: RunStatus,
    result: str | None = None,
    error: str | None = None,
    error_type: str | None = None,
) -> None:
    """
    Notify the parent workflow that a child has completed, failed, or been cancelled.

    This is called when a child workflow reaches a terminal state during resume.
    It records the appropriate event in the parent's log and triggers resumption
    if the parent was waiting.

    Args:
        run: The child workflow run
        storage: Storage backend
        storage_config: Storage configuration for Celery tasks
        status: Terminal status (COMPLETED, FAILED, CANCELLED)
        result: Serialized result (for COMPLETED)
        error: Error message (for FAILED/CANCELLED)
        error_type: Error type name (for FAILED)
    """
    from pyworkflow.engine.events import (
        create_child_workflow_cancelled_event,
        create_child_workflow_completed_event,
        create_child_workflow_failed_event,
    )

    if not run.parent_run_id:
        return  # Not a child workflow

    parent_run_id = run.parent_run_id
    child_run_id = run.run_id

    # Find child_id and wait_for_completion from the parent's events
    parent_events = await storage.get_events(parent_run_id)
    child_id = None
    wait_for_completion = False

    for event in parent_events:
        if (
            event.type == EventType.CHILD_WORKFLOW_STARTED
            and event.data.get("child_run_id") == child_run_id
        ):
            child_id = event.data.get("child_id")
            wait_for_completion = event.data.get("wait_for_completion", False)
            break

    if not child_id:
        logger.warning(
            "Could not find child_id in parent events for resumed child workflow",
            parent_run_id=parent_run_id,
            child_run_id=child_run_id,
        )
        return

    # Record the appropriate event in the parent's log
    if status == RunStatus.COMPLETED:
        event = create_child_workflow_completed_event(
            run_id=parent_run_id,
            child_id=child_id,
            child_run_id=child_run_id,
            result=result,
        )
    elif status == RunStatus.FAILED:
        event = create_child_workflow_failed_event(
            run_id=parent_run_id,
            child_id=child_id,
            child_run_id=child_run_id,
            error=error or "Unknown error",
            error_type=error_type or "Exception",
        )
    elif status == RunStatus.CANCELLED:
        event = create_child_workflow_cancelled_event(
            run_id=parent_run_id,
            child_id=child_id,
            child_run_id=child_run_id,
            reason=error,
        )
    else:
        return  # Not a terminal state we handle

    await storage.record_event(event)

    logger.info(
        f"Notified parent of child workflow {status.value}",
        parent_run_id=parent_run_id,
        child_run_id=child_run_id,
        child_id=child_id,
    )

    # Trigger parent resumption if waiting
    if wait_for_completion:
        await _trigger_parent_resumption_celery(parent_run_id, storage, storage_config)
```
```python
async def _handle_workflow_recovery(
    run: WorkflowRun,
    storage: StorageBackend,
    worker_id: str | None = None,
) -> bool:
    """
    Handle workflow recovery from worker failure.

    Called when a workflow is found in RUNNING status but we're starting fresh.
    This indicates a previous worker crashed.

    Args:
        run: Existing workflow run record
        storage: Storage backend
        worker_id: ID of the current worker

    Returns:
        True if recovery should proceed, False if max attempts exceeded
    """
    # Check if recovery is enabled for this workflow
    if not run.recover_on_worker_loss:
        logger.warning(
            "Workflow recovery disabled, marking as failed",
            run_id=run.run_id,
            workflow_name=run.workflow_name,
        )
        await storage.update_run_status(
            run_id=run.run_id,
            status=RunStatus.FAILED,
            error="Worker lost and recovery is disabled",
        )
        return False

    # Check recovery attempt limit
    new_attempts = run.recovery_attempts + 1
    if new_attempts > run.max_recovery_attempts:
        logger.error(
            "Workflow exceeded max recovery attempts",
            run_id=run.run_id,
            workflow_name=run.workflow_name,
            recovery_attempts=run.recovery_attempts,
            max_recovery_attempts=run.max_recovery_attempts,
        )
        await storage.update_run_status(
            run_id=run.run_id,
            status=RunStatus.FAILED,
            error=f"Exceeded max recovery attempts ({run.max_recovery_attempts})",
        )
        return False

    # Get last event sequence
    events = await storage.get_events(run.run_id)
    last_event_sequence = max((e.sequence or 0 for e in events), default=0) if events else None

    # Record interruption event
    interrupted_event = create_workflow_interrupted_event(
        run_id=run.run_id,
        reason="worker_lost",
        worker_id=worker_id,
        last_event_sequence=last_event_sequence,
        error="Worker process terminated unexpectedly",
        recovery_attempt=new_attempts,
        recoverable=True,
    )
    await storage.record_event(interrupted_event)

    # Update recovery attempts counter
    # Note: We need to update the run record with the new recovery_attempts count
    run.recovery_attempts = new_attempts
    await storage.update_run_recovery_attempts(run.run_id, new_attempts)

    logger.info(
        "Workflow recovery initiated",
        run_id=run.run_id,
        workflow_name=run.workflow_name,
        recovery_attempt=new_attempts,
        max_recovery_attempts=run.max_recovery_attempts,
    )

    return True
```
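Note the shape of the guard: `max_recovery_attempts` bounds restarts, not total executions, so a limit of 3 allows up to four executions (the original plus three recoveries). A toy rendering of the same accounting, not part of the package:

```python
def may_recover(recovery_attempts: int, max_recovery_attempts: int) -> bool:
    # Same test as _handle_workflow_recovery: the (attempts + 1)-th recovery
    # is allowed only while it does not exceed the configured maximum.
    return recovery_attempts + 1 <= max_recovery_attempts

attempts = 0
while may_recover(attempts, 3):  # pretend the worker dies on every execution
    attempts += 1

print(attempts)  # 3 -> the 4th crash would mark the run FAILED
```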
```python
async def _recover_workflow_on_worker(
    run: WorkflowRun,
    workflow_meta: WorkflowMetadata,
    storage: StorageBackend,
    storage_config: dict[str, Any] | None = None,
) -> str:
    """
    Recover a workflow after worker failure.

    This is similar to resuming a suspended workflow, but specifically handles
    the recovery scenario after a worker crash.

    Args:
        run: Existing workflow run record
        workflow_meta: Workflow metadata
        storage: Storage backend
        storage_config: Storage configuration for child tasks

    Returns:
        Workflow run ID
    """
    run_id = run.run_id
    workflow_name = run.workflow_name

    logger.info(
        f"Recovering workflow execution: {workflow_name}",
        run_id=run_id,
        workflow_name=workflow_name,
        recovery_attempt=run.recovery_attempts,
    )

    # Update status to RUNNING (from RUNNING or INTERRUPTED)
    await storage.update_run_status(run_id=run_id, status=RunStatus.RUNNING)

    # Load event log for replay
    events = await storage.get_events(run_id)

    # Complete any pending sleeps (mark them as done before resuming)
    events = await _complete_pending_sleeps(run_id, events, storage)

    # Deserialize arguments
    args = deserialize_args(run.input_args)
    kwargs = deserialize_kwargs(run.input_kwargs)

    # Execute workflow with event replay
    try:
        result = await execute_workflow_with_context(
            workflow_func=workflow_meta.func,
            run_id=run_id,
            workflow_name=workflow_name,
            storage=storage,
            args=args,
            kwargs=kwargs,
            event_log=events,
        )

        # Update run status to completed
        await storage.update_run_status(
            run_id=run_id, status=RunStatus.COMPLETED, result=serialize_args(result)
        )

        # Cancel all running children (TERMINATE policy)
        await _handle_parent_completion(run_id, RunStatus.COMPLETED, storage)

        logger.info(
            f"Workflow recovered and completed: {workflow_name}",
            run_id=run_id,
            workflow_name=workflow_name,
            recovery_attempt=run.recovery_attempts,
        )

        return run_id

    except SuspensionSignal as e:
        # Workflow suspended again
        await storage.update_run_status(run_id=run_id, status=RunStatus.SUSPENDED)

        logger.info(
            f"Recovered workflow suspended: {e.reason}",
            run_id=run_id,
            workflow_name=workflow_name,
            reason=e.reason,
        )

        # Schedule automatic resumption if we have a resume_at time
        resume_at = e.data.get("resume_at") if e.data else None
        if resume_at:
            schedule_workflow_resumption(run_id, resume_at, storage_config=storage_config)
            logger.info(
                "Scheduled automatic workflow resumption",
                run_id=run_id,
                resume_at=resume_at.isoformat(),
            )

        return run_id

    except ContinueAsNewSignal as e:
        # Workflow continuing as new execution
        new_run_id = await _handle_continue_as_new_celery(
            current_run_id=run_id,
            workflow_meta=workflow_meta,
            storage=storage,
            storage_config=storage_config,
            new_args=e.workflow_args,
            new_kwargs=e.workflow_kwargs,
        )

        # Cancel all running children (TERMINATE policy)
        await _handle_parent_completion(run_id, RunStatus.CONTINUED_AS_NEW, storage)

        logger.info(
            f"Recovered workflow continued as new: {workflow_name}",
            old_run_id=run_id,
            new_run_id=new_run_id,
        )

        return run_id

    except Exception as e:
        # Workflow failed during recovery
        await storage.update_run_status(run_id=run_id, status=RunStatus.FAILED, error=str(e))

        # Cancel all running children (TERMINATE policy)
        await _handle_parent_completion(run_id, RunStatus.FAILED, storage)

        logger.error(
            f"Workflow failed during recovery: {workflow_name}",
            run_id=run_id,
            workflow_name=workflow_name,
            error=str(e),
            exc_info=True,
        )

        raise
```
```python
async def _start_workflow_on_worker(
    workflow_meta: WorkflowMetadata,
    args: tuple,
    kwargs: dict,
    storage: StorageBackend,
    storage_config: dict[str, Any] | None = None,
    idempotency_key: str | None = None,
    run_id: str | None = None,
) -> str:
    """
    Internal function to start a workflow on a Celery worker.

    This mirrors the logic from testing.py but runs on workers.
    Handles recovery scenarios when picking up a task from a crashed worker.

    Args:
        workflow_meta: Workflow metadata
        args: Workflow positional arguments
        kwargs: Workflow keyword arguments
        storage: Storage backend
        storage_config: Storage configuration for child tasks
        idempotency_key: Optional idempotency key
        run_id: Pre-generated run ID (if None, generates a new one)
    """
    from pyworkflow.config import get_config

    # Ensure storage is connected (some backends like SQLite require this)
    if hasattr(storage, "connect"):
        await storage.connect()

    workflow_name = workflow_meta.name
    config = get_config()

    # Check idempotency key
    if idempotency_key:
        existing_run = await storage.get_run_by_idempotency_key(idempotency_key)
        if existing_run:
            # Check if this is a recovery scenario (workflow was RUNNING but worker crashed)
            if existing_run.status == RunStatus.RUNNING:
                # Check if this is truly a crashed worker or just a duplicate task execution
                from datetime import timedelta

                run_age = datetime.now(UTC) - existing_run.created_at
                if run_age < timedelta(seconds=30):
                    logger.info(
                        f"Run with idempotency key '{idempotency_key}' already exists and was created recently. "
                        "Likely duplicate task execution, skipping.",
                        run_id=existing_run.run_id,
                    )
                    return existing_run.run_id

                # This is a recovery scenario - worker crashed while running
                can_recover = await _handle_workflow_recovery(
                    run=existing_run,
                    storage=storage,
                    worker_id=None,  # TODO: Get actual worker ID from Celery
                )
                if can_recover:
                    # Continue with recovery - resume workflow from last checkpoint
                    return await _recover_workflow_on_worker(
                        run=existing_run,
                        workflow_meta=workflow_meta,
                        storage=storage,
                        storage_config=storage_config,
                    )
                else:
                    # Recovery disabled or max attempts exceeded
                    return existing_run.run_id
            elif existing_run.status == RunStatus.INTERRUPTED:
                # Previous recovery attempt also failed, try again
                can_recover = await _handle_workflow_recovery(
                    run=existing_run,
                    storage=storage,
                    worker_id=None,
                )
                if can_recover:
                    return await _recover_workflow_on_worker(
                        run=existing_run,
                        workflow_meta=workflow_meta,
                        storage=storage,
                        storage_config=storage_config,
                    )
                else:
                    return existing_run.run_id
            else:
                # Workflow already completed/failed/etc.
                logger.info(
                    f"Workflow with idempotency key '{idempotency_key}' already exists",
                    run_id=existing_run.run_id,
                    status=existing_run.status.value,
                )
                return existing_run.run_id

    # Use provided run_id or generate a new one
    if run_id is None:
        run_id = f"run_{uuid.uuid4().hex[:16]}"

    # Check if run already exists (recovery scenario without idempotency key)
    existing_run = await storage.get_run(run_id)
    if existing_run and existing_run.status == RunStatus.RUNNING:
        # This is a recovery scenario
        can_recover = await _handle_workflow_recovery(
            run=existing_run,
            storage=storage,
            worker_id=None,
        )
        if can_recover:
            return await _recover_workflow_on_worker(
                run=existing_run,
                workflow_meta=workflow_meta,
                storage=storage,
                storage_config=storage_config,
            )
        else:
            return existing_run.run_id

    logger.info(
        f"Starting workflow execution on worker: {workflow_name}",
        run_id=run_id,
        workflow_name=workflow_name,
    )

    # Determine recovery settings
    # Priority: workflow decorator > global config > defaults based on durable mode
    recover_on_worker_loss = getattr(
        workflow_meta.func, "__workflow_recover_on_worker_loss__", None
    )
    max_recovery_attempts = getattr(workflow_meta.func, "__workflow_max_recovery_attempts__", None)
    is_durable = getattr(workflow_meta.func, "__workflow_durable__", True)

    if recover_on_worker_loss is None:
        recover_on_worker_loss = config.default_recover_on_worker_loss
        if recover_on_worker_loss is None:
            # Default: True for durable, False for transient
            recover_on_worker_loss = is_durable if is_durable is not None else True

    if max_recovery_attempts is None:
        max_recovery_attempts = config.default_max_recovery_attempts

    # Create workflow run record
    run = WorkflowRun(
        run_id=run_id,
        workflow_name=workflow_name,
        status=RunStatus.RUNNING,
        created_at=datetime.now(UTC),
        started_at=datetime.now(UTC),
        input_args=serialize_args(*args),
        input_kwargs=serialize_kwargs(**kwargs),
        idempotency_key=idempotency_key,
        max_duration=workflow_meta.max_duration,
        metadata={},  # Run-level metadata (not from decorator)
        recovery_attempts=0,
        max_recovery_attempts=max_recovery_attempts,
        recover_on_worker_loss=recover_on_worker_loss,
    )

    await storage.create_run(run)

    # Record workflow started event
    start_event = create_workflow_started_event(
        run_id=run_id,
        workflow_name=workflow_name,
        args=serialize_args(*args),
        kwargs=serialize_kwargs(**kwargs),
        metadata={},  # Run-level metadata
    )

    await storage.record_event(start_event)

    # Execute workflow
    try:
        result = await execute_workflow_with_context(
            workflow_func=workflow_meta.func,
            run_id=run_id,
            workflow_name=workflow_name,
            storage=storage,
            args=args,
            kwargs=kwargs,
        )

        # Update run status to completed
        await storage.update_run_status(
            run_id=run_id, status=RunStatus.COMPLETED, result=serialize_args(result)
        )

        # Cancel all running children (TERMINATE policy)
        await _handle_parent_completion(run_id, RunStatus.COMPLETED, storage)

        logger.info(
            f"Workflow completed successfully on worker: {workflow_name}",
            run_id=run_id,
            workflow_name=workflow_name,
        )

        return run_id

    except CancellationError as e:
        # Workflow was cancelled
        cancelled_event = create_workflow_cancelled_event(
            run_id=run_id,
            reason=e.reason,
            cleanup_completed=True,  # If we got here, cleanup has completed
        )
        await storage.record_event(cancelled_event)
        await storage.update_run_status(run_id=run_id, status=RunStatus.CANCELLED)
        await storage.clear_cancellation_flag(run_id)

        # Cancel all running children (TERMINATE policy)
        await _handle_parent_completion(run_id, RunStatus.CANCELLED, storage)

        logger.info(
            f"Workflow cancelled on worker: {workflow_name}",
            run_id=run_id,
            workflow_name=workflow_name,
            reason=e.reason,
        )

        return run_id

    except SuspensionSignal as e:
        # Workflow suspended (sleep or hook)
        await storage.update_run_status(run_id=run_id, status=RunStatus.SUSPENDED)

        logger.info(
            f"Workflow suspended on worker: {e.reason}",
            run_id=run_id,
            workflow_name=workflow_name,
            reason=e.reason,
        )

        # Schedule automatic resumption if we have a resume_at time
        resume_at = e.data.get("resume_at") if e.data else None
        if resume_at:
            schedule_workflow_resumption(run_id, resume_at, storage_config=storage_config)
            logger.info(
                "Scheduled automatic workflow resumption",
                run_id=run_id,
                resume_at=resume_at.isoformat(),
            )

        return run_id

    except ContinueAsNewSignal as e:
        # Workflow continuing as new execution
        new_run_id = await _handle_continue_as_new_celery(
            current_run_id=run_id,
            workflow_meta=workflow_meta,
            storage=storage,
            storage_config=storage_config,
            new_args=e.workflow_args,
            new_kwargs=e.workflow_kwargs,
        )

        # Cancel all running children (TERMINATE policy)
        await _handle_parent_completion(run_id, RunStatus.CONTINUED_AS_NEW, storage)

        logger.info(
            f"Workflow continued as new on worker: {workflow_name}",
            old_run_id=run_id,
            new_run_id=new_run_id,
        )

        return run_id

    except Exception as e:
        # Workflow failed
        await storage.update_run_status(run_id=run_id, status=RunStatus.FAILED, error=str(e))

        # Cancel all running children (TERMINATE policy)
        await _handle_parent_completion(run_id, RunStatus.FAILED, storage)

        logger.error(
            f"Workflow failed on worker: {workflow_name}",
            run_id=run_id,
            workflow_name=workflow_name,
            error=str(e),
            exc_info=True,
        )

        raise
```
```python
@celery_app.task(
    name="pyworkflow.resume_workflow",
    queue="pyworkflow.schedules",
)
def resume_workflow_task(
    run_id: str,
    storage_config: dict[str, Any] | None = None,
) -> Any | None:
    """
    Resume a suspended workflow.

    This task is scheduled automatically when a workflow suspends (e.g., for sleep).
    It executes on Celery workers and runs the workflow directly.

    Args:
        run_id: Workflow run ID to resume
        storage_config: Storage backend configuration

    Returns:
        Workflow result if completed, None if suspended again
    """
    logger.info(f"Resuming workflow on worker: {run_id}")

    # Get storage backend
    storage = _get_storage_backend(storage_config)

    # Resume workflow directly on worker
    result = asyncio.run(_resume_workflow_on_worker(run_id, storage, storage_config))

    if result is not None:
        logger.info(f"Workflow completed on worker: {run_id}")
    else:
        logger.info(f"Workflow suspended again on worker: {run_id}")

    return result
```
```python
@celery_app.task(
    name="pyworkflow.execute_scheduled_workflow",
    queue="pyworkflow.schedules",
)
def execute_scheduled_workflow_task(
    schedule_id: str,
    scheduled_time: str,
    storage_config: dict[str, Any] | None = None,
) -> str | None:
    """
    Execute a workflow from a schedule.

    This task is triggered by the PyWorkflow scheduler when a schedule is due.
    It starts a new workflow run and tracks it against the schedule.

    Args:
        schedule_id: Schedule identifier
        scheduled_time: ISO format scheduled execution time
        storage_config: Storage backend configuration

    Returns:
        Workflow run ID if started, None if skipped
    """
    logger.info("Executing scheduled workflow", schedule_id=schedule_id)

    storage = _get_storage_backend(storage_config)

    return asyncio.run(
        _execute_scheduled_workflow(
            schedule_id=schedule_id,
            scheduled_time=datetime.fromisoformat(scheduled_time),
            storage=storage,
            storage_config=storage_config,
        )
    )
```
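The scheduled time crosses the broker as an ISO-8601 string and is parsed back with `datetime.fromisoformat`, so a scheduler process can fire a due schedule like this (the schedule ID is illustrative):

```python
from datetime import UTC, datetime

from pyworkflow.celery.tasks import execute_scheduled_workflow_task

# Enqueue a due schedule; the worker re-parses the timestamp on arrival.
execute_scheduled_workflow_task.delay(
    schedule_id="sched_0a1b2c3d",  # hypothetical schedule identifier
    scheduled_time=datetime.now(UTC).isoformat(),
)
```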
async def _execute_scheduled_workflow(
|
|
1162
|
+
schedule_id: str,
|
|
1163
|
+    scheduled_time: datetime,
+    storage: StorageBackend,
+    storage_config: dict[str, Any] | None,
+) -> str | None:
+    """
+    Execute a scheduled workflow with tracking.
+
+    Args:
+        schedule_id: Schedule identifier
+        scheduled_time: When the schedule was supposed to trigger
+        storage: Storage backend
+        storage_config: Storage configuration for serialization
+
+    Returns:
+        Workflow run ID if started, None if skipped
+    """
+    # Ensure storage is connected (some backends like SQLite require this)
+    if hasattr(storage, "connect"):
+        await storage.connect()
+
+    from pyworkflow.engine.events import create_schedule_triggered_event
+    from pyworkflow.storage.schemas import ScheduleStatus
+
+    # Get schedule
+    schedule = await storage.get_schedule(schedule_id)
+    if not schedule:
+        logger.error(f"Schedule not found: {schedule_id}")
+        return None
+
+    if schedule.status != ScheduleStatus.ACTIVE:
+        logger.info(f"Schedule not active: {schedule_id}")
+        return None
+
+    # Get workflow
+    workflow_meta = get_workflow(schedule.workflow_name)
+    if not workflow_meta:
+        logger.error(f"Workflow not found: {schedule.workflow_name}")
+        schedule.failed_runs += 1
+        schedule.updated_at = datetime.now(UTC)
+        await storage.update_schedule(schedule)
+        return None
+
+    # Deserialize arguments
+    args = deserialize_args(schedule.args)
+    kwargs = deserialize_kwargs(schedule.kwargs)
+
+    # Generate run_id
+    run_id = f"sched_{schedule_id[:8]}_{uuid.uuid4().hex[:8]}"
+
+    # Add to running runs
+    await storage.add_running_run(schedule_id, run_id)
+
+    # Update schedule stats
+    schedule.total_runs += 1
+    schedule.last_run_at = datetime.now(UTC)
+    schedule.last_run_id = run_id
+    await storage.update_schedule(schedule)
+
+    try:
+        # Serialize args for Celery task
+        args_json = serialize_args(*args)
+        kwargs_json = serialize_kwargs(**kwargs)
+
+        # Start the workflow via Celery
+        # Note: start_workflow_task will create the run record
+        start_workflow_task.delay(
+            workflow_name=schedule.workflow_name,
+            args_json=args_json,
+            kwargs_json=kwargs_json,
+            run_id=run_id,
+            storage_config=storage_config,
+            metadata={"schedule_id": schedule_id, "scheduled_time": scheduled_time.isoformat()},
+        )
+
+        # Record trigger event - use schedule_id as run_id since workflow run may not exist yet
+        trigger_event = create_schedule_triggered_event(
+            run_id=schedule_id,  # Use schedule_id for event association
+            schedule_id=schedule_id,
+            scheduled_time=scheduled_time,
+            actual_time=datetime.now(UTC),
+            workflow_run_id=run_id,
+        )
+        await storage.record_event(trigger_event)
+
+        logger.info(
+            f"Started scheduled workflow: {schedule.workflow_name}",
+            schedule_id=schedule_id,
+            run_id=run_id,
+        )
+
+        return run_id
+
+    except Exception as e:
+        logger.error(f"Failed to start scheduled workflow: {e}")
+        await storage.remove_running_run(schedule_id, run_id)
+        schedule.failed_runs += 1
+        schedule.updated_at = datetime.now(UTC)
+        await storage.update_schedule(schedule)
+        raise
+
+
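The scheduled run ID above embeds a schedule prefix so runs triggered by a schedule are easy to spot in storage. A minimal sketch of the naming scheme, using only the stdlib; the `sched_` format is taken from the diff, everything else is illustrative:

```python
import uuid

def make_scheduled_run_id(schedule_id: str) -> str:
    # Mirrors the format above: first 8 chars of the schedule ID
    # plus 8 random hex chars, e.g. "sched_a1b2c3d4_9f8e7d6c".
    return f"sched_{schedule_id[:8]}_{uuid.uuid4().hex[:8]}"

print(make_scheduled_run_id("a1b2c3d4e5f6"))
```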
+async def _complete_pending_sleeps(
+    run_id: str,
+    events: list[Any],
+    storage: StorageBackend,
+) -> list[Any]:
+    """
+    Record SLEEP_COMPLETED events for any pending sleeps.
+
+    When resuming a workflow, we need to mark sleeps as completed
+    so the replay logic knows to skip them.
+
+    Args:
+        run_id: Workflow run ID
+        events: Current event list
+        storage: Storage backend
+
+    Returns:
+        Updated event list with SLEEP_COMPLETED events appended
+    """
+    from pyworkflow.engine.events import EventType, create_sleep_completed_event
+
+    # Find pending sleeps (SLEEP_STARTED without SLEEP_COMPLETED)
+    started_sleeps = set()
+    completed_sleeps = set()
+
+    for event in events:
+        if event.type == EventType.SLEEP_STARTED:
+            started_sleeps.add(event.data.get("sleep_id"))
+        elif event.type == EventType.SLEEP_COMPLETED:
+            completed_sleeps.add(event.data.get("sleep_id"))
+
+    pending_sleeps = started_sleeps - completed_sleeps
+
+    if not pending_sleeps:
+        return events
+
+    # Record SLEEP_COMPLETED for each pending sleep
+    updated_events = list(events)
+    for sleep_id in pending_sleeps:
+        complete_event = create_sleep_completed_event(
+            run_id=run_id,
+            sleep_id=sleep_id,
+        )
+        await storage.record_event(complete_event)
+        updated_events.append(complete_event)
+        logger.debug(f"Recorded SLEEP_COMPLETED for {sleep_id}", run_id=run_id)
+
+    return updated_events
+
+
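The pending-sleep detection above is a plain set difference over the event log. A self-contained sketch with stand-in event dicts (the real events are objects with `.type` and `.data`; these literals are only for illustration):

```python
# Stand-in event log: a SLEEP_STARTED without a matching SLEEP_COMPLETED is pending.
events = [
    {"type": "SLEEP_STARTED", "data": {"sleep_id": "sleep-1"}},
    {"type": "SLEEP_COMPLETED", "data": {"sleep_id": "sleep-1"}},
    {"type": "SLEEP_STARTED", "data": {"sleep_id": "sleep-2"}},
]

started = {e["data"]["sleep_id"] for e in events if e["type"] == "SLEEP_STARTED"}
completed = {e["data"]["sleep_id"] for e in events if e["type"] == "SLEEP_COMPLETED"}

print(started - completed)  # {'sleep-2'} -> gets a SLEEP_COMPLETED on resume
```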
+async def _resume_workflow_on_worker(
+    run_id: str,
+    storage: StorageBackend,
+    storage_config: dict[str, Any] | None = None,
+) -> Any | None:
+    """
+    Internal function to resume workflow on Celery worker.
+
+    This mirrors the logic from testing.py but runs on workers.
+    """
+    from pyworkflow.core.exceptions import WorkflowNotFoundError
+
+    # Ensure storage is connected (some backends like SQLite require this)
+    if hasattr(storage, "connect"):
+        await storage.connect()
+
+    # Load workflow run
+    run = await storage.get_run(run_id)
+    if not run:
+        raise WorkflowNotFoundError(run_id)
+
+    # Check if workflow was cancelled while suspended
+    if run.status == RunStatus.CANCELLED:
+        logger.info(
+            "Workflow was cancelled while suspended, skipping resume",
+            run_id=run_id,
+            workflow_name=run.workflow_name,
+        )
+        return None
+
+    # Check for cancellation flag
+    cancellation_requested = await storage.check_cancellation_flag(run_id)
+
+    logger.info(
+        f"Resuming workflow execution on worker: {run.workflow_name}",
+        run_id=run_id,
+        workflow_name=run.workflow_name,
+        current_status=run.status.value,
+        cancellation_requested=cancellation_requested,
+    )
+
+    # Get workflow function
+    workflow_meta = get_workflow(run.workflow_name)
+    if not workflow_meta:
+        raise ValueError(f"Workflow '{run.workflow_name}' not registered")
+
+    # Load event log
+    events = await storage.get_events(run_id)
+
+    # Complete any pending sleeps (mark them as done before resuming)
+    events = await _complete_pending_sleeps(run_id, events, storage)
+
+    # Deserialize arguments
+    args = deserialize_args(run.input_args)
+    kwargs = deserialize_kwargs(run.input_kwargs)
+
+    # Update status to running
+    await storage.update_run_status(run_id=run_id, status=RunStatus.RUNNING)
+
+    # Execute workflow with event replay
+    try:
+        result = await execute_workflow_with_context(
+            workflow_func=workflow_meta.func,
+            run_id=run_id,
+            workflow_name=run.workflow_name,
+            storage=storage,
+            args=args,
+            kwargs=kwargs,
+            event_log=events,
+            cancellation_requested=cancellation_requested,
+        )
+
+        # Update run status to completed
+        await storage.update_run_status(
+            run_id=run_id, status=RunStatus.COMPLETED, result=serialize_args(result)
+        )
+
+        # Clear cancellation flag if any
+        await storage.clear_cancellation_flag(run_id)
+
+        # Cancel all running children (TERMINATE policy)
+        await _handle_parent_completion(run_id, RunStatus.COMPLETED, storage)
+
+        # Notify parent if this is a child workflow
+        await _notify_parent_of_child_completion(
+            run=run,
+            storage=storage,
+            storage_config=storage_config,
+            status=RunStatus.COMPLETED,
+            result=serialize_args(result),
+        )
+
+        logger.info(
+            f"Workflow resumed and completed on worker: {run.workflow_name}",
+            run_id=run_id,
+            workflow_name=run.workflow_name,
+        )
+
+        return result
+
+    except CancellationError as e:
+        # Workflow was cancelled
+        cancelled_event = create_workflow_cancelled_event(
+            run_id=run_id,
+            reason=e.reason,
+            cleanup_completed=True,
+        )
+        await storage.record_event(cancelled_event)
+        await storage.update_run_status(run_id=run_id, status=RunStatus.CANCELLED)
+        await storage.clear_cancellation_flag(run_id)
+
+        # Cancel all running children (TERMINATE policy)
+        await _handle_parent_completion(run_id, RunStatus.CANCELLED, storage)
+
+        # Notify parent if this is a child workflow
+        await _notify_parent_of_child_completion(
+            run=run,
+            storage=storage,
+            storage_config=storage_config,
+            status=RunStatus.CANCELLED,
+            error=e.reason,
+        )
+
+        logger.info(
+            f"Workflow cancelled on resume on worker: {run.workflow_name}",
+            run_id=run_id,
+            workflow_name=run.workflow_name,
+            reason=e.reason,
+        )
+
+        return None
+
+    except SuspensionSignal as e:
+        # Workflow suspended again
+        await storage.update_run_status(run_id=run_id, status=RunStatus.SUSPENDED)
+
+        logger.info(
+            f"Workflow suspended again on worker: {e.reason}",
+            run_id=run_id,
+            workflow_name=run.workflow_name,
+            reason=e.reason,
+        )
+
+        # Schedule automatic resumption if we have a resume_at time
+        resume_at = e.data.get("resume_at") if e.data else None
+        if resume_at:
+            schedule_workflow_resumption(run_id, resume_at, storage_config=storage_config)
+            logger.info(
+                "Scheduled automatic workflow resumption",
+                run_id=run_id,
+                resume_at=resume_at.isoformat(),
+            )
+
+        return None
+
+    except ContinueAsNewSignal as e:
+        # Workflow continuing as new execution
+        workflow_meta = get_workflow(run.workflow_name)
+        if not workflow_meta:
+            raise ValueError(f"Workflow {run.workflow_name} not registered")
+
+        new_run_id = await _handle_continue_as_new_celery(
+            current_run_id=run_id,
+            workflow_meta=workflow_meta,
+            storage=storage,
+            storage_config=storage_config,
+            new_args=e.workflow_args,
+            new_kwargs=e.workflow_kwargs,
+            parent_run_id=run.parent_run_id,
+        )
+
+        # Cancel all running children (TERMINATE policy)
+        await _handle_parent_completion(run_id, RunStatus.CONTINUED_AS_NEW, storage)
+
+        logger.info(
+            f"Workflow continued as new on resume: {run.workflow_name}",
+            old_run_id=run_id,
+            new_run_id=new_run_id,
+        )
+
+        return None
+
+    except Exception as e:
+        # Workflow failed
+        error_msg = str(e)
+        error_type = type(e).__name__
+        await storage.update_run_status(run_id=run_id, status=RunStatus.FAILED, error=error_msg)
+
+        # Cancel all running children (TERMINATE policy)
+        await _handle_parent_completion(run_id, RunStatus.FAILED, storage)
+
+        # Notify parent if this is a child workflow
+        await _notify_parent_of_child_completion(
+            run=run,
+            storage=storage,
+            storage_config=storage_config,
+            status=RunStatus.FAILED,
+            error=error_msg,
+            error_type=error_type,
+        )
+
+        logger.error(
+            f"Workflow failed on resume on worker: {run.workflow_name}",
+            run_id=run_id,
+            workflow_name=run.workflow_name,
+            error=error_msg,
+            exc_info=True,
+        )
+
+        raise
+
+
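The resume path above maps each way the replayed workflow can exit onto a run status: normal return becomes COMPLETED, CancellationError becomes CANCELLED, SuspensionSignal becomes SUSPENDED (optionally scheduling its own resumption), ContinueAsNewSignal becomes CONTINUED_AS_NEW, and anything else becomes FAILED. A condensed sketch of that control flow with stand-in signal classes (not the package's real exception hierarchy):

```python
class CancellationError(Exception): ...
class SuspensionSignal(Exception): ...
class ContinueAsNewSignal(Exception): ...

async def resume_outcome(replay) -> str:
    # Collapses the try/except ladder above into a status string.
    try:
        await replay()
        return "COMPLETED"
    except CancellationError:
        return "CANCELLED"
    except SuspensionSignal:
        return "SUSPENDED"          # may schedule its own resumption
    except ContinueAsNewSignal:
        return "CONTINUED_AS_NEW"   # chains into a fresh run
    except Exception:
        return "FAILED"             # re-raised after bookkeeping in the real code
```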
+def _get_storage_backend(config: dict[str, Any] | None = None) -> StorageBackend:
+    """
+    Get storage backend from configuration.
+
+    This is an alias for config_to_storage for backward compatibility.
+    """
+    from pyworkflow.storage.config import config_to_storage
+
+    return config_to_storage(config)
+
+
+def schedule_workflow_resumption(
+    run_id: str,
+    resume_at: datetime,
+    storage_config: dict[str, Any] | None = None,
+) -> None:
+    """
+    Schedule automatic workflow resumption after sleep.
+
+    Args:
+        run_id: Workflow run ID
+        resume_at: When to resume the workflow
+        storage_config: Storage backend configuration to pass to the resume task
+    """
+    from datetime import UTC
+
+    # Calculate delay in seconds
+    now = datetime.now(UTC)
+    delay_seconds = max(0, int((resume_at - now).total_seconds()))
+
+    logger.info(
+        "Scheduling workflow resumption",
+        run_id=run_id,
+        resume_at=resume_at.isoformat(),
+        delay_seconds=delay_seconds,
+    )
+
+    # Schedule the resume task
+    resume_workflow_task.apply_async(
+        args=[run_id],
+        kwargs={"storage_config": storage_config},
+        countdown=delay_seconds,
+    )
+
+
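Resumption uses Celery's `countdown` rather than an absolute ETA, so the delay is computed from wall-clock time and clamped at zero when the resume time is already past. The arithmetic in isolation:

```python
from datetime import UTC, datetime, timedelta

resume_at = datetime.now(UTC) + timedelta(minutes=5)
delay_seconds = max(0, int((resume_at - datetime.now(UTC)).total_seconds()))
print(delay_seconds)  # ~300; a resume_at in the past clamps to 0 (resume immediately)
```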
+async def _handle_parent_completion(
+    run_id: str,
+    status: RunStatus,
+    storage: StorageBackend,
+) -> None:
+    """
+    Handle parent workflow completion by cancelling all running children.
+
+    When a parent workflow reaches a terminal state (COMPLETED, FAILED, CANCELLED),
+    all running child workflows are automatically cancelled. This implements the
+    TERMINATE parent close policy.
+
+    Args:
+        run_id: Parent workflow run ID
+        status: Terminal status of the parent workflow
+        storage: Storage backend
+    """
+    from pyworkflow.engine.executor import cancel_workflow
+
+    # Get all non-terminal children
+    children = await storage.get_children(run_id)
+    non_terminal_statuses = {
+        RunStatus.PENDING,
+        RunStatus.RUNNING,
+        RunStatus.SUSPENDED,
+        RunStatus.INTERRUPTED,
+    }
+
+    running_children = [c for c in children if c.status in non_terminal_statuses]
+
+    if not running_children:
+        return
+
+    logger.info(
+        f"Cancelling {len(running_children)} child workflow(s) due to parent {status.value}",
+        parent_run_id=run_id,
+        parent_status=status.value,
+        child_count=len(running_children),
+    )
+
+    # Cancel each running child
+    for child in running_children:
+        try:
+            reason = f"Parent workflow {run_id} {status.value}"
+
+            # Cancel the child workflow
+            await cancel_workflow(
+                run_id=child.run_id,
+                reason=reason,
+                storage=storage,
+            )
+
+            # Find the child_id from parent's events
+            events = await storage.get_events(run_id)
+            child_id = None
+            for event in events:
+                if (
+                    event.type == EventType.CHILD_WORKFLOW_STARTED
+                    and event.data.get("child_run_id") == child.run_id
+                ):
+                    child_id = event.data.get("child_id")
+                    break
+
+            # Record cancellation event in parent's log
+            if child_id:
+                cancel_event = create_child_workflow_cancelled_event(
+                    run_id=run_id,
+                    child_id=child_id,
+                    child_run_id=child.run_id,
+                    reason=reason,
+                )
+                await storage.record_event(cancel_event)
+
+            logger.info(
+                f"Cancelled child workflow: {child.workflow_name}",
+                parent_run_id=run_id,
+                child_run_id=child.run_id,
+                child_workflow_name=child.workflow_name,
+            )
+
+        except Exception as e:
+            # Log error but don't fail parent completion
+            logger.error(
+                f"Failed to cancel child workflow: {child.workflow_name}",
+                parent_run_id=run_id,
+                child_run_id=child.run_id,
+                error=str(e),
+            )
+
+
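The TERMINATE policy above only touches children that can still make progress. A sketch of the status filter with a stand-in enum; the four non-terminal states match the set in the diff, while the tuples and values are illustrative:

```python
from enum import Enum

class RunStatus(Enum):
    PENDING = "pending"
    RUNNING = "running"
    SUSPENDED = "suspended"
    INTERRUPTED = "interrupted"
    COMPLETED = "completed"

NON_TERMINAL = {RunStatus.PENDING, RunStatus.RUNNING, RunStatus.SUSPENDED, RunStatus.INTERRUPTED}

children = [("child-1", RunStatus.COMPLETED), ("child-2", RunStatus.RUNNING)]
to_cancel = [run_id for run_id, status in children if status in NON_TERMINAL]
print(to_cancel)  # ['child-2'] -- only still-active children get cancelled
```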
+async def _handle_continue_as_new_celery(
+    current_run_id: str,
+    workflow_meta: WorkflowMetadata,
+    storage: StorageBackend,
+    storage_config: dict[str, Any] | None,
+    new_args: tuple,
+    new_kwargs: dict,
+    parent_run_id: str | None = None,
+) -> str:
+    """
+    Handle continue-as-new in Celery context.
+
+    This function:
+    1. Generates new run_id
+    2. Records WORKFLOW_CONTINUED_AS_NEW event in current run
+    3. Updates current run status to CONTINUED_AS_NEW
+    4. Updates current run's continued_to_run_id
+    5. Creates new WorkflowRun with continued_from_run_id
+    6. Schedules new workflow execution via Celery
+
+    Args:
+        current_run_id: The run ID of the current workflow
+        workflow_meta: Workflow metadata
+        storage: Storage backend
+        storage_config: Storage configuration for serialization
+        new_args: Arguments for the new workflow
+        new_kwargs: Keyword arguments for the new workflow
+        parent_run_id: Parent run ID if this is a child workflow
+
+    Returns:
+        New run ID
+    """
+    # Generate new run_id
+    new_run_id = f"run_{uuid.uuid4().hex[:16]}"
+
+    # Serialize arguments
+    args_json = serialize_args(*new_args)
+    kwargs_json = serialize_kwargs(**new_kwargs)
+
+    # Record continuation event in current run's log
+    continuation_event = create_workflow_continued_as_new_event(
+        run_id=current_run_id,
+        new_run_id=new_run_id,
+        args=args_json,
+        kwargs=kwargs_json,
+    )
+    await storage.record_event(continuation_event)
+
+    # Update current run status and link to new run
+    await storage.update_run_status(
+        run_id=current_run_id,
+        status=RunStatus.CONTINUED_AS_NEW,
+    )
+    await storage.update_run_continuation(
+        run_id=current_run_id,
+        continued_to_run_id=new_run_id,
+    )
+
+    # Get current run to copy metadata
+    current_run = await storage.get_run(current_run_id)
+    nesting_depth = current_run.nesting_depth if current_run else 0
+
+    # Create new workflow run linked to current
+    new_run = WorkflowRun(
+        run_id=new_run_id,
+        workflow_name=workflow_meta.name,
+        status=RunStatus.PENDING,
+        created_at=datetime.now(UTC),
+        input_args=args_json,
+        input_kwargs=kwargs_json,
+        continued_from_run_id=current_run_id,
+        nesting_depth=nesting_depth,
+        parent_run_id=parent_run_id,
+    )
+    await storage.create_run(new_run)
+
+    # Schedule new workflow execution via Celery
+    start_workflow_task.delay(
+        workflow_name=workflow_meta.name,
+        args_json=args_json,
+        kwargs_json=kwargs_json,
+        run_id=new_run_id,
+        storage_config=storage_config,
+    )
+
+    return new_run_id
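Continue-as-new leaves a doubly linked chain of runs: the old run records `continued_to_run_id` and the new run records `continued_from_run_id`, which lets tooling walk a workflow's full history across executions. A sketch of the linkage with a stand-in dataclass (not the package's WorkflowRun schema):

```python
from dataclasses import dataclass

@dataclass
class Run:
    run_id: str
    continued_from_run_id: str | None = None
    continued_to_run_id: str | None = None

old = Run("run_aaaa")
new = Run("run_bbbb", continued_from_run_id=old.run_id)
old.continued_to_run_id = new.run_id

# Walking the links in either direction recovers the execution chain.
assert new.continued_from_run_id == old.run_id
assert old.continued_to_run_id == new.run_id
```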