pyworkflow-engine 0.1.11__py3-none-any.whl → 0.1.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyworkflow/__init__.py +1 -1
- pyworkflow/celery/app.py +97 -3
- pyworkflow/celery/loop.py +108 -0
- pyworkflow/celery/singleton.py +368 -0
- pyworkflow/celery/tasks.py +553 -111
- pyworkflow/cli/commands/worker.py +13 -16
- pyworkflow/config.py +5 -0
- pyworkflow/context/base.py +4 -0
- pyworkflow/context/local.py +27 -1
- pyworkflow/context/step_context.py +1 -11
- pyworkflow/core/step.py +43 -15
- pyworkflow/core/validation.py +112 -0
- pyworkflow/engine/events.py +44 -30
- pyworkflow/engine/executor.py +21 -1
- pyworkflow/engine/replay.py +0 -39
- pyworkflow/observability/logging.py +43 -1
- pyworkflow/runtime/celery.py +1 -1
- pyworkflow/runtime/local.py +41 -1
- pyworkflow/storage/config.py +81 -2
- pyworkflow/storage/postgres.py +103 -34
- {pyworkflow_engine-0.1.11.dist-info → pyworkflow_engine-0.1.13.dist-info}/METADATA +1 -1
- {pyworkflow_engine-0.1.11.dist-info → pyworkflow_engine-0.1.13.dist-info}/RECORD +26 -23
- {pyworkflow_engine-0.1.11.dist-info → pyworkflow_engine-0.1.13.dist-info}/WHEEL +0 -0
- {pyworkflow_engine-0.1.11.dist-info → pyworkflow_engine-0.1.13.dist-info}/entry_points.txt +0 -0
- {pyworkflow_engine-0.1.11.dist-info → pyworkflow_engine-0.1.13.dist-info}/licenses/LICENSE +0 -0
- {pyworkflow_engine-0.1.11.dist-info → pyworkflow_engine-0.1.13.dist-info}/top_level.txt +0 -0
pyworkflow/celery/tasks.py
CHANGED
@@ -3,13 +3,14 @@ Celery tasks for distributed workflow and step execution.
 
 These tasks enable:
 - Distributed step execution across workers
-- Automatic retry with exponential backoff
+- Automatic retry with exponential backoff and jitter (via Celery)
 - Scheduled sleep resumption
 - Workflow orchestration
 - Fault tolerance with automatic recovery on worker failures
 """
 
 import asyncio
+import random
 import uuid
 from collections.abc import Callable
 from datetime import UTC, datetime
@@ -18,11 +19,12 @@ from typing import TYPE_CHECKING, Any
 if TYPE_CHECKING:
     from pyworkflow.context.step_context import StepContext
 
-from celery import
-from celery.exceptions import WorkerLostError
+from celery.exceptions import MaxRetriesExceededError, Retry
 from loguru import logger
 
 from pyworkflow.celery.app import celery_app
+from pyworkflow.celery.loop import run_async
+from pyworkflow.celery.singleton import SingletonWorkflowTask
 from pyworkflow.core.exceptions import (
     CancellationError,
     ContinueAsNewSignal,
@@ -31,6 +33,7 @@ from pyworkflow.core.exceptions import (
     SuspensionSignal,
 )
 from pyworkflow.core.registry import WorkflowMetadata, get_workflow
+from pyworkflow.core.validation import validate_step_parameters
 from pyworkflow.core.workflow import execute_workflow_with_context
 from pyworkflow.engine.events import (
     EventType,
@@ -39,6 +42,7 @@ from pyworkflow.engine.events import (
     create_workflow_continued_as_new_event,
     create_workflow_interrupted_event,
     create_workflow_started_event,
+    create_workflow_suspended_event,
 )
 from pyworkflow.serialization.decoder import deserialize_args, deserialize_kwargs
 from pyworkflow.serialization.encoder import serialize_args, serialize_kwargs
@@ -46,59 +50,39 @@ from pyworkflow.storage.base import StorageBackend
 from pyworkflow.storage.schemas import RunStatus, WorkflowRun
 
 
-
-
-
-
-
-    retry_backoff = True
-    retry_backoff_max = 600
-    retry_jitter = True
-
-    def on_failure(self, exc, task_id, args, kwargs, einfo):
-        """
-        Handle task failure.
+def _calculate_exponential_backoff(
+    attempt: int, base: float = 2.0, max_delay: float = 300.0
+) -> float:
+    """
+    Calculate exponential backoff delay with jitter.
 
-
-
-
-
-        is_worker_loss = isinstance(exc, WorkerLostError)
+    Args:
+        attempt: Current retry attempt (0-indexed)
+        base: Base delay multiplier (default: 2.0)
+        max_delay: Maximum delay in seconds (default: 300s / 5 minutes)
 
-
-
-                f"Task {self.name} interrupted due to worker loss",
-                task_id=task_id,
-                error=str(exc),
-            )
-            # Note: Recovery is handled when the task is requeued and picked up
-            # by another worker. See _handle_workflow_recovery() for logic.
-        else:
-            logger.error(
-                f"Task {self.name} failed",
-                task_id=task_id,
-                error=str(exc),
-                traceback=einfo.traceback if einfo else None,
-            )
+    Returns:
+        Delay in seconds with jitter applied
 
-
-
-
-
-
-
-
-
+    Formula: min(base * 2^attempt, max_delay) * (0.5 + random(0, 0.5))
+    This gives delays like: ~1s, ~2s, ~4s, ~8s, ~16s, ... capped at max_delay
+    """
+    delay = min(base * (2**attempt), max_delay)
+    # Add jitter: multiply by random factor between 0.5 and 1.0
+    # This prevents thundering herd when multiple tasks retry simultaneously
+    jitter = 0.5 + random.random() * 0.5
+    return delay * jitter
 
 
 @celery_app.task(
     name="pyworkflow.execute_step",
-    base=
+    base=SingletonWorkflowTask,
     bind=True,
     queue="pyworkflow.steps",
+    unique_on=["run_id", "step_id"],
 )
 def execute_step_task(
-    self:
+    self: SingletonWorkflowTask,
     step_name: str,
     args_json: str,
     kwargs_json: str,
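For reference, the new helper's delay schedule is easy to trace by hand: with the defaults base=2.0 and max_delay=300.0, the raw delays are 2, 4, 8, 16, 32, ... seconds, and the jitter factor in [0.5, 1.0) scales each one down by up to half, which is where the docstring's "~1s, ~2s, ~4s" figures come from. A minimal sketch of that arithmetic (illustration only, not part of the package):

import random

def backoff(attempt: int, base: float = 2.0, max_delay: float = 300.0) -> float:
    # Same formula as _calculate_exponential_backoff above, for tracing by hand.
    return min(base * (2 ** attempt), max_delay) * (0.5 + random.random() * 0.5)

for attempt in range(5):
    print(attempt, round(backoff(attempt), 1))
# attempt 0 -> 1.0-2.0s, 1 -> 2.0-4.0s, 2 -> 4.0-8.0s, 3 -> 8.0-16.0s, 4 -> 16.0-32.0s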
@@ -132,9 +116,13 @@ def execute_step_task(
         Step result (serialized)
 
     Raises:
-        FatalError: For non-retriable errors
-        RetryableError: For retriable errors (triggers automatic retry)
+        FatalError: For non-retriable errors after all retries exhausted
     """
+    # Ensure logging is configured in forked worker process
+    from pyworkflow.celery.app import _configure_worker_logging
+
+    _configure_worker_logging()
+
     from pyworkflow.core.registry import _registry
 
     logger.info(
@@ -144,11 +132,32 @@ def execute_step_task(
         attempt=self.request.retries + 1,
     )
 
+    # Check workflow status before executing - bail out if workflow is in terminal state
+    storage = _get_storage_backend(storage_config)
+    run = run_async(_get_workflow_run_safe(storage, run_id))
+    if run is None:
+        logger.warning(
+            f"Workflow run not found, skipping step execution: {step_name}",
+            run_id=run_id,
+            step_id=step_id,
+        )
+        return None
+
+    # Only proceed if workflow is in a state where step execution makes sense
+    if run.status not in (RunStatus.RUNNING, RunStatus.SUSPENDED):
+        logger.warning(
+            f"Workflow in terminal state ({run.status.value}), skipping step execution: {step_name}",
+            run_id=run_id,
+            step_id=step_id,
+            workflow_status=run.status.value,
+        )
+        return None
+
     # Get step metadata
     step_meta = _registry.get_step(step_name)
     if not step_meta:
         # Record failure and resume workflow
-
+        run_async(
             _record_step_failure_and_resume(
                 storage_config=storage_config,
                 run_id=run_id,
@@ -161,10 +170,28 @@ def execute_step_task(
         )
         raise FatalError(f"Step '{step_name}' not found in registry")
 
+    # Ignore processing step if already completed (idempotency)
+    events = run_async(storage.get_events(run_id))
+    already_completed = any(
+        evt.type == EventType.STEP_COMPLETED and evt.data.get("step_id") == step_id
+        for evt in events
+    )
+    if already_completed:
+        logger.warning(
+            "Step already completed by another task, skipping execution",
+            run_id=run_id,
+            step_id=step_id,
+            step_name=step_name,
+        )
+        return None
+
     # Deserialize arguments
     args = deserialize_args(args_json)
     kwargs = deserialize_kwargs(kwargs_json)
 
+    # Validate parameters before execution on worker (defense in depth)
+    validate_step_parameters(step_meta.original_func, args, kwargs, step_name)
+
     # Set up step context if provided (read-only mode)
     step_context_token = None
     readonly_token = None
@@ -197,7 +224,7 @@ def execute_step_task(
 
         # Execute the step
         if asyncio.iscoroutinefunction(step_func):
-            result =
+            result = run_async(step_func(*args, **kwargs))
         else:
             result = step_func(*args, **kwargs)
 
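run_async comes from the new pyworkflow/celery/loop.py module (+108 lines, not shown in this diff) and replaces the per-call event-loop handling used previously. A common shape for such a helper is to keep one event loop alive per worker process instead of creating and tearing one down on every call, as asyncio.run does; the following is a sketch under that assumption, not the module's actual contents:

import asyncio
from collections.abc import Coroutine
from typing import Any

_loop: asyncio.AbstractEventLoop | None = None

def run_async(coro: Coroutine[Any, Any, Any]) -> Any:
    # Reuse a single loop per worker process; create it lazily on first use.
    global _loop
    if _loop is None or _loop.is_closed():
        _loop = asyncio.new_event_loop()
        asyncio.set_event_loop(_loop)
    return _loop.run_until_complete(coro)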
@@ -208,7 +235,7 @@ def execute_step_task(
         )
 
         # Record STEP_COMPLETED event and trigger workflow resumption
-
+        run_async(
             _record_step_completion_and_resume(
                 storage_config=storage_config,
                 run_id=run_id,
@@ -220,10 +247,23 @@ def execute_step_task(
 
         return result
 
+    except Retry:
+        # Celery retry in progress - let it propagate correctly
+        raise
+
+    except MaxRetriesExceededError:
+        # Celery hit its internal retry limit - treat as fatal
+        logger.error(
+            f"Step exceeded Celery retry limit: {step_name}",
+            run_id=run_id,
+            step_id=step_id,
+        )
+        raise
+
     except FatalError as e:
         logger.error(f"Step failed (fatal): {step_name}", run_id=run_id, step_id=step_id)
         # Record failure and resume workflow (workflow will fail on replay)
-
+        run_async(
             _record_step_failure_and_resume(
                 storage_config=storage_config,
                 run_id=run_id,
@@ -239,16 +279,22 @@ def execute_step_task(
     except RetryableError as e:
         # Check if we have retries left
         if self.request.retries < max_retries:
+            # Use explicit retry_after if provided, otherwise use exponential backoff
+            countdown = (
+                e.retry_after
+                if e.retry_after
+                else _calculate_exponential_backoff(self.request.retries)
+            )
             logger.warning(
-                f"Step failed (retriable): {step_name}, retrying...",
+                f"Step failed (retriable): {step_name}, retrying in {countdown:.1f}s...",
                 run_id=run_id,
                 step_id=step_id,
-
+                countdown=countdown,
                 attempt=self.request.retries + 1,
                 max_retries=max_retries,
             )
             # Let Celery handle the retry - don't resume workflow yet
-            raise self.retry(
+            raise self.retry(countdown=countdown, exc=e)
         else:
             # Max retries exhausted - record failure and resume workflow
             logger.error(
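As this branch shows, a RetryableError that carries a retry_after value takes precedence over the computed backoff and is passed straight to self.retry(countdown=...). A hedged illustration of how a step author could use that, with the exception's constructor signature assumed rather than taken from the package:

from pyworkflow.core.exceptions import RetryableError

def fetch_exchange_rate(currency: str) -> float:
    # Hypothetical step body: ask Celery to retry this step in roughly 30 seconds
    # instead of following the exponential backoff schedule.
    raise RetryableError(f"rate limited while fetching {currency}", retry_after=30)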
@@ -256,7 +302,7 @@ def execute_step_task(
                 run_id=run_id,
                 step_id=step_id,
             )
-
+            run_async(
                 _record_step_failure_and_resume(
                     storage_config=storage_config,
                     run_id=run_id,
@@ -267,20 +313,23 @@ def execute_step_task(
                     is_retryable=False,  # Mark as not retryable since we exhausted retries
                 )
             )
-            raise
+            raise FatalError(f"Step '{step_name}' failed after retries: {str(e)}") from e
 
     except Exception as e:
         # Check if we have retries left
         if self.request.retries < max_retries:
+            # Use exponential backoff for unexpected errors
+            countdown = _calculate_exponential_backoff(self.request.retries)
             logger.warning(
-                f"Step failed (unexpected): {step_name}, retrying...",
+                f"Step failed (unexpected): {step_name}, retrying in {countdown:.1f}s...",
                 run_id=run_id,
                 step_id=step_id,
                 error=str(e),
+                countdown=countdown,
                 attempt=self.request.retries + 1,
             )
-            # Treat unexpected errors as retriable
-            raise self.retry(exc=
+            # Treat unexpected errors as retriable with exponential backoff
+            raise self.retry(exc=e, countdown=countdown)
         else:
             # Max retries exhausted
             logger.error(
@@ -290,7 +339,7 @@ def execute_step_task(
                 error=str(e),
                 exc_info=True,
             )
-
+            run_async(
                 _record_step_failure_and_resume(
                     storage_config=storage_config,
                     run_id=run_id,
@@ -301,7 +350,7 @@ def execute_step_task(
                     is_retryable=False,
                 )
             )
-            raise
+            raise FatalError(f"Step '{step_name}' failed after retries: {str(e)}") from e
 
     finally:
         # Clean up step context
@@ -323,9 +372,16 @@ async def _record_step_completion_and_resume(
     result: Any,
 ) -> None:
     """
-    Record STEP_COMPLETED event and trigger workflow resumption.
+    Record STEP_COMPLETED event and trigger workflow resumption if safe.
 
     Called by execute_step_task after successful step execution.
+
+    Only schedules resume if WORKFLOW_SUSPENDED event exists, indicating
+    the workflow has fully suspended. This prevents race conditions where
+    a step completes before the workflow has suspended.
+
+    Idempotency: If STEP_COMPLETED already exists for this step_id, skip
+    recording and resume scheduling (another task already handled it).
     """
     from pyworkflow.engine.events import create_step_completed_event
     from pyworkflow.serialization.encoder import serialize
@@ -337,6 +393,21 @@ async def _record_step_completion_and_resume(
     if hasattr(storage, "connect"):
         await storage.connect()
 
+    # Idempotency check: skip if step already completed
+    events = await storage.get_events(run_id)
+    already_completed = any(
+        evt.type == EventType.STEP_COMPLETED and evt.data.get("step_id") == step_id
+        for evt in events
+    )
+    if already_completed:
+        logger.info(
+            "Step already completed by another task, skipping",
+            run_id=run_id,
+            step_id=step_id,
+            step_name=step_name,
+        )
+        return
+
     # Record STEP_COMPLETED event
     completion_event = create_step_completed_event(
         run_id=run_id,
@@ -346,15 +417,33 @@ async def _record_step_completion_and_resume(
     )
     await storage.record_event(completion_event)
 
-    #
-
+    # Refresh events to include the one we just recorded
+    events = await storage.get_events(run_id)
 
-
-
-
-
-
-
+    # Check if workflow has suspended (WORKFLOW_SUSPENDED event exists)
+    # Only schedule resume if workflow has properly suspended
+    has_suspended = any(evt.type == EventType.WORKFLOW_SUSPENDED for evt in events)
+
+    if has_suspended:
+        # Workflow has suspended, safe to schedule resume
+        schedule_workflow_resumption(
+            run_id, datetime.now(UTC), storage_config, triggered_by="step_completed"
+        )
+        logger.info(
+            "Step completed and workflow resumption scheduled",
+            run_id=run_id,
+            step_id=step_id,
+            step_name=step_name,
+        )
+    else:
+        # Workflow hasn't suspended yet - don't schedule resume
+        # The suspension handler will check for step completion and schedule resume
+        logger.info(
+            "Step completed but workflow not yet suspended, skipping resume scheduling",
+            run_id=run_id,
+            step_id=step_id,
+            step_name=step_name,
+        )
 
 
 async def _record_step_failure_and_resume(
@@ -367,10 +456,17 @@ async def _record_step_failure_and_resume(
     is_retryable: bool,
 ) -> None:
     """
-    Record STEP_FAILED event and trigger workflow resumption.
+    Record STEP_FAILED event and trigger workflow resumption if safe.
 
     Called by execute_step_task after step failure (when retries are exhausted).
     The workflow will fail when it replays and sees the failure event.
+
+    Only schedules resume if WORKFLOW_SUSPENDED event exists, indicating
+    the workflow has fully suspended. This prevents race conditions where
+    a step fails before the workflow has suspended.
+
+    Idempotency: If STEP_COMPLETED or terminal STEP_FAILED already exists
+    for this step_id, skip recording and resume scheduling.
     """
     from pyworkflow.engine.events import create_step_failed_event
 
@@ -381,6 +477,26 @@ async def _record_step_failure_and_resume(
     if hasattr(storage, "connect"):
         await storage.connect()
 
+    # Idempotency check: skip if step already completed or terminally failed
+    events = await storage.get_events(run_id)
+    already_handled = any(
+        (evt.type == EventType.STEP_COMPLETED and evt.data.get("step_id") == step_id)
+        or (
+            evt.type == EventType.STEP_FAILED
+            and evt.data.get("step_id") == step_id
+            and not evt.data.get("is_retryable", True)
+        )
+        for evt in events
+    )
+    if already_handled:
+        logger.info(
+            "Step already completed/failed by another task, skipping",
+            run_id=run_id,
+            step_id=step_id,
+            step_name=step_name,
+        )
+        return
+
     # Record STEP_FAILED event
     failure_event = create_step_failed_event(
         run_id=run_id,
@@ -392,16 +508,54 @@ async def _record_step_failure_and_resume(
     )
     await storage.record_event(failure_event)
 
-    #
-
+    # Refresh events to include the one we just recorded
+    events = await storage.get_events(run_id)
 
-
-
-
-
-
-
-
+    # Check if workflow has suspended (WORKFLOW_SUSPENDED event exists)
+    # Only schedule resume if workflow has properly suspended
+    has_suspended = any(evt.type == EventType.WORKFLOW_SUSPENDED for evt in events)
+
+    if has_suspended:
+        # Workflow has suspended, safe to schedule resume
+        schedule_workflow_resumption(
+            run_id, datetime.now(UTC), storage_config, triggered_by="step_failed"
+        )
+        logger.info(
+            "Step failed and workflow resumption scheduled",
+            run_id=run_id,
+            step_id=step_id,
+            step_name=step_name,
+            error=error,
+        )
+    else:
+        # Workflow hasn't suspended yet - don't schedule resume
+        # The suspension handler will check for step failure and schedule resume
+        logger.info(
+            "Step failed but workflow not yet suspended, skipping resume scheduling",
+            run_id=run_id,
+            step_id=step_id,
+            step_name=step_name,
+            error=error,
+        )
+
+
+async def _get_workflow_run_safe(
+    storage: StorageBackend,
+    run_id: str,
+) -> WorkflowRun | None:
+    """
+    Safely get workflow run with proper storage connection handling.
+
+    Args:
+        storage: Storage backend
+        run_id: Workflow run ID
+
+    Returns:
+        WorkflowRun or None if not found
+    """
+    if hasattr(storage, "connect"):
+        await storage.connect()
+    return await storage.get_run(run_id)
 
 
 def _resolve_context_class(class_name: str) -> type["StepContext"] | None:
@@ -430,7 +584,9 @@ def _resolve_context_class(class_name: str) -> type["StepContext"] | None:
|
|
|
430
584
|
|
|
431
585
|
@celery_app.task(
|
|
432
586
|
name="pyworkflow.start_workflow",
|
|
587
|
+
base=SingletonWorkflowTask,
|
|
433
588
|
queue="pyworkflow.workflows",
|
|
589
|
+
unique_on=["run_id"],
|
|
434
590
|
)
|
|
435
591
|
def start_workflow_task(
|
|
436
592
|
workflow_name: str,
|
|
@@ -456,7 +612,17 @@ def start_workflow_task(
     Returns:
         Workflow run ID
     """
-
+    # Ensure logging is configured in forked worker process
+    from pyworkflow.celery.app import _configure_worker_logging
+
+    _configure_worker_logging()
+
+    logger.info(
+        f"START_WORKFLOW_TASK ENTRY: {workflow_name}",
+        run_id=run_id,
+        idempotency_key=idempotency_key,
+        celery_task_id=start_workflow_task.request.id,
+    )
 
     # Get workflow metadata
     workflow_meta = get_workflow(workflow_name)
@@ -471,7 +637,7 @@ def start_workflow_task(
     storage = _get_storage_backend(storage_config)
 
     # Execute workflow directly on worker
-    result_run_id =
+    result_run_id = run_async(
         _start_workflow_on_worker(
             workflow_meta=workflow_meta,
             args=args,
@@ -489,7 +655,9 @@
 
 @celery_app.task(
     name="pyworkflow.start_child_workflow",
+    base=SingletonWorkflowTask,
     queue="pyworkflow.workflows",
+    unique_on=["child_run_id"],
 )
 def start_child_workflow_task(
     workflow_name: str,
@@ -520,6 +688,11 @@ def start_child_workflow_task(
     Returns:
         Child workflow run ID
     """
+    # Ensure logging is configured in forked worker process
+    from pyworkflow.celery.app import _configure_worker_logging
+
+    _configure_worker_logging()
+
     logger.info(
         f"Starting child workflow on worker: {workflow_name}",
         child_run_id=child_run_id,
@@ -539,7 +712,7 @@ def start_child_workflow_task(
     storage = _get_storage_backend(storage_config)
 
     # Execute child workflow on worker
-
+    run_async(
         _execute_child_workflow_on_worker(
             workflow_func=workflow_meta.func,
             workflow_name=workflow_name,
@@ -633,19 +806,62 @@ async def _execute_child_workflow_on_worker(
         await _trigger_parent_resumption_celery(parent_run_id, storage, storage_config)
 
     except SuspensionSignal as e:
-        # Child workflow suspended (e.g., sleep, hook)
+        # Child workflow suspended (e.g., sleep, hook, step dispatch)
         # Update status and don't notify parent yet - handled on child resumption
         await storage.update_run_status(child_run_id, RunStatus.SUSPENDED)
+
+        # Record WORKFLOW_SUSPENDED event
+        step_id = e.data.get("step_id") if e.data else None
+        step_name = e.data.get("step_name") if e.data else None
+        sleep_id = e.data.get("sleep_id") if e.data else None
+        hook_id = e.data.get("hook_id") if e.data else None
+        nested_child_id = e.data.get("child_id") if e.data else None
+
+        suspended_event = create_workflow_suspended_event(
+            run_id=child_run_id,
+            reason=e.reason,
+            step_id=step_id,
+            step_name=step_name,
+            sleep_id=sleep_id,
+            hook_id=hook_id,
+            child_id=nested_child_id,
+        )
+        await storage.record_event(suspended_event)
+
         logger.debug(
             f"Child workflow suspended: {workflow_name}",
             parent_run_id=parent_run_id,
             child_run_id=child_run_id,
         )
 
+        # For step dispatch suspensions, check if step already completed/failed
+        if step_id and e.reason.startswith("step_dispatch:"):
+            events = await storage.get_events(child_run_id)
+            step_finished = any(
+                evt.type in (EventType.STEP_COMPLETED, EventType.STEP_FAILED)
+                and evt.data.get("step_id") == step_id
+                for evt in events
+            )
+            if step_finished:
+                logger.info(
+                    "Child step finished before suspension completed, scheduling resume",
+                    child_run_id=child_run_id,
+                    step_id=step_id,
+                )
+                schedule_workflow_resumption(
+                    child_run_id,
+                    datetime.now(UTC),
+                    storage_config=storage_config,
+                    triggered_by="child_suspension_step_race",
+                )
+                return
+
         # Schedule automatic resumption if we have a resume_at time
         resume_at = e.data.get("resume_at") if e.data else None
         if resume_at:
-            schedule_workflow_resumption(
+            schedule_workflow_resumption(
+                child_run_id, resume_at, storage_config, triggered_by="child_sleep_hook"
+            )
 
     except ContinueAsNewSignal as e:
         # Child workflow continuing as new execution
@@ -718,7 +934,9 @@ async def _trigger_parent_resumption_celery(
         parent_run_id=parent_run_id,
     )
     # Schedule immediate resumption via Celery
-    schedule_workflow_resumption(
+    schedule_workflow_resumption(
+        parent_run_id, datetime.now(UTC), storage_config, triggered_by="child_completed"
+    )
 
 
 async def _notify_parent_of_child_completion(
@@ -978,9 +1196,27 @@ async def _recover_workflow_on_worker(
         return run_id
 
     except SuspensionSignal as e:
-        # Workflow suspended again
+        # Workflow suspended again (during recovery)
         await storage.update_run_status(run_id=run_id, status=RunStatus.SUSPENDED)
 
+        # Record WORKFLOW_SUSPENDED event
+        step_id = e.data.get("step_id") if e.data else None
+        step_name = e.data.get("step_name") if e.data else None
+        sleep_id = e.data.get("sleep_id") if e.data else None
+        hook_id = e.data.get("hook_id") if e.data else None
+        child_id = e.data.get("child_id") if e.data else None
+
+        suspended_event = create_workflow_suspended_event(
+            run_id=run_id,
+            reason=e.reason,
+            step_id=step_id,
+            step_name=step_name,
+            sleep_id=sleep_id,
+            hook_id=hook_id,
+            child_id=child_id,
+        )
+        await storage.record_event(suspended_event)
+
         logger.info(
             f"Recovered workflow suspended: {e.reason}",
             run_id=run_id,
@@ -988,10 +1224,34 @@ async def _recover_workflow_on_worker(
             reason=e.reason,
         )
 
+        # For step dispatch suspensions, check if step already completed/failed
+        if step_id and e.reason.startswith("step_dispatch:"):
+            events = await storage.get_events(run_id)
+            step_finished = any(
+                evt.type in (EventType.STEP_COMPLETED, EventType.STEP_FAILED)
+                and evt.data.get("step_id") == step_id
+                for evt in events
+            )
+            if step_finished:
+                logger.info(
+                    "Step finished before recovery suspension completed, scheduling resume",
+                    run_id=run_id,
+                    step_id=step_id,
+                )
+                schedule_workflow_resumption(
+                    run_id,
+                    datetime.now(UTC),
+                    storage_config=storage_config,
+                    triggered_by="recovery_suspension_step_race",
+                )
+                return run_id
+
         # Schedule automatic resumption if we have a resume_at time
         resume_at = e.data.get("resume_at") if e.data else None
         if resume_at:
-            schedule_workflow_resumption(
+            schedule_workflow_resumption(
+                run_id, resume_at, storage_config=storage_config, triggered_by="recovery_sleep_hook"
+            )
             logger.info(
                 "Scheduled automatic workflow resumption",
                 run_id=run_id,
@@ -1076,10 +1336,22 @@ async def _start_workflow_on_worker(
     workflow_name = workflow_meta.name
     config = get_config()
 
+    run = await storage.get_run(run_id) if run_id else None
+    logger.debug(
+        f"_START_WORKFLOW_ON_WORKER ENTRY: {workflow_name} with run_id={run_id} and status={run.status.value if run else 'N/A'}",
+        run_id=run_id,
+    )
+
     # Check idempotency key
     if idempotency_key:
         existing_run = await storage.get_run_by_idempotency_key(idempotency_key)
         if existing_run:
+            logger.info(
+                "IDEMPOTENCY CHECK: Found existing run",
+                run_id=existing_run.run_id,
+                status=existing_run.status.value,
+                idempotency_key=idempotency_key,
+            )
             # Check if this is a recovery scenario (workflow was RUNNING but worker crashed)
             if existing_run.status == RunStatus.RUNNING:
                 # Check if this is truly a crashed worker or just a duplicate task execution
@@ -1140,27 +1412,76 @@ async def _start_workflow_on_worker(
     if run_id is None:
         run_id = f"run_{uuid.uuid4().hex[:16]}"
 
-    # Check if run already exists
+    # Check if run already exists
     existing_run = await storage.get_run(run_id)
-    if existing_run
-
-
-
-
-            worker_id=None,
+    if existing_run:
+        logger.info(
+            f"RUN_ID CHECK: Found existing run with status {existing_run.status.value}",
+            run_id=run_id,
+            status=existing_run.status.value,
         )
-
-
+
+        if existing_run.status == RunStatus.RUNNING:
+            # Recovery scenario - worker crashed while running
+            can_recover = await _handle_workflow_recovery(
                 run=existing_run,
-                workflow_meta=workflow_meta,
                 storage=storage,
-
+                worker_id=None,
+            )
+            if can_recover:
+                return await _recover_workflow_on_worker(
+                    run=existing_run,
+                    workflow_meta=workflow_meta,
+                    storage=storage,
+                    storage_config=storage_config,
+                )
+            else:
+                return existing_run.run_id
+
+        elif existing_run.status == RunStatus.SUSPENDED:
+            # Workflow is suspended - this start_workflow_task is a duplicate
+            # (scheduled during race condition before workflow suspended)
+            # Return existing run_id - resume_workflow_task will handle it
+            logger.info(
+                "DUPLICATE START: Workflow already suspended, returning existing run",
+                run_id=run_id,
+                status=existing_run.status.value,
+            )
+            return existing_run.run_id
+
+        elif existing_run.status in (
+            RunStatus.COMPLETED,
+            RunStatus.FAILED,
+            RunStatus.CANCELLED,
+        ):
+            # Terminal status - workflow already finished
+            logger.info(
+                f"TERMINAL STATUS: Workflow already {existing_run.status.value}, returning existing run",
+                run_id=run_id,
+                status=existing_run.status.value,
             )
-        else:
             return existing_run.run_id
 
+        elif existing_run.status == RunStatus.INTERRUPTED:
+            # Previous recovery failed, try again
+            can_recover = await _handle_workflow_recovery(
+                run=existing_run,
+                storage=storage,
+                worker_id=None,
+            )
+            if can_recover:
+                return await _recover_workflow_on_worker(
+                    run=existing_run,
+                    workflow_meta=workflow_meta,
+                    storage=storage,
+                    storage_config=storage_config,
+                )
+            else:
+                return existing_run.run_id
+
+    # Only reach here if no existing run found
     logger.info(
-        f"
+        f"FRESH START: Creating new workflow run: {workflow_name}",
         run_id=run_id,
        workflow_name=workflow_name,
     )
@@ -1265,9 +1586,28 @@ async def _start_workflow_on_worker(
         return run_id
 
     except SuspensionSignal as e:
-        # Workflow suspended (sleep or
+        # Workflow suspended (sleep, hook, or step dispatch)
         await storage.update_run_status(run_id=run_id, status=RunStatus.SUSPENDED)
 
+        # Record WORKFLOW_SUSPENDED event - this signals that suspension is complete
+        # and resume can be safely scheduled
+        step_id = e.data.get("step_id") if e.data else None
+        step_name = e.data.get("step_name") if e.data else None
+        sleep_id = e.data.get("sleep_id") if e.data else None
+        hook_id = e.data.get("hook_id") if e.data else None
+        child_id = e.data.get("child_id") if e.data else None
+
+        suspended_event = create_workflow_suspended_event(
+            run_id=run_id,
+            reason=e.reason,
+            step_id=step_id,
+            step_name=step_name,
+            sleep_id=sleep_id,
+            hook_id=hook_id,
+            child_id=child_id,
+        )
+        await storage.record_event(suspended_event)
+
         logger.info(
             f"Workflow suspended on worker: {e.reason}",
             run_id=run_id,
@@ -1275,10 +1615,35 @@ async def _start_workflow_on_worker(
             reason=e.reason,
         )
 
-        #
+        # For step dispatch suspensions, check if step already completed/failed (race condition)
+        # If so, schedule resume immediately
+        if step_id and e.reason.startswith("step_dispatch:"):
+            events = await storage.get_events(run_id)
+            step_finished = any(
+                evt.type in (EventType.STEP_COMPLETED, EventType.STEP_FAILED)
+                and evt.data.get("step_id") == step_id
+                for evt in events
+            )
+            if step_finished:
+                logger.info(
+                    "Step finished before suspension completed, scheduling resume",
+                    run_id=run_id,
+                    step_id=step_id,
+                )
+                schedule_workflow_resumption(
+                    run_id,
+                    datetime.now(UTC),
+                    storage_config=storage_config,
+                    triggered_by="resume_suspension_step_race",
+                )
+                return run_id
+
+        # Schedule automatic resumption if we have a resume_at time (for sleep/hook)
         resume_at = e.data.get("resume_at") if e.data else None
         if resume_at:
-            schedule_workflow_resumption(
+            schedule_workflow_resumption(
+                run_id, resume_at, storage_config=storage_config, triggered_by="resume_sleep_hook"
+            )
             logger.info(
                 "Scheduled automatic workflow resumption",
                 run_id=run_id,
@@ -1332,7 +1697,9 @@ async def _start_workflow_on_worker(
 
 @celery_app.task(
     name="pyworkflow.resume_workflow",
+    base=SingletonWorkflowTask,
     queue="pyworkflow.schedules",
+    unique_on=["run_id"],
 )
 def resume_workflow_task(
     run_id: str,
@@ -1351,13 +1718,22 @@ def resume_workflow_task(
     Returns:
         Workflow result if completed, None if suspended again
     """
-
+    # Ensure logging is configured in forked worker process
+    from pyworkflow.celery.app import _configure_worker_logging
+
+    _configure_worker_logging()
+
+    logger.info(
+        f"RESUME_WORKFLOW_TASK ENTRY: {run_id}",
+        run_id=run_id,
+        celery_task_id=resume_workflow_task.request.id,
+    )
 
     # Get storage backend
     storage = _get_storage_backend(storage_config)
 
     # Resume workflow directly on worker
-    result =
+    result = run_async(_resume_workflow_on_worker(run_id, storage, storage_config))
 
     if result is not None:
         logger.info(f"Workflow completed on worker: {run_id}")
@@ -1369,7 +1745,9 @@
 
 @celery_app.task(
     name="pyworkflow.execute_scheduled_workflow",
+    base=SingletonWorkflowTask,
     queue="pyworkflow.schedules",
+    # No unique_on - scheduled workflows create new runs each time, no deduplication needed
 )
 def execute_scheduled_workflow_task(
     schedule_id: str,
@@ -1390,11 +1768,16 @@ def execute_scheduled_workflow_task(
     Returns:
         Workflow run ID if started, None if skipped
     """
+    # Ensure logging is configured in forked worker process
+    from pyworkflow.celery.app import _configure_worker_logging
+
+    _configure_worker_logging()
+
     logger.info("Executing scheduled workflow", schedule_id=schedule_id)
 
     storage = _get_storage_backend(storage_config)
 
-    return
+    return run_async(
         _execute_scheduled_workflow(
             schedule_id=schedule_id,
             scheduled_time=datetime.fromisoformat(scheduled_time),
@@ -1587,6 +1970,19 @@ async def _resume_workflow_on_worker(
         )
         return None
 
+    # Prevent duplicate resume execution
+    # Multiple resume tasks can be scheduled for the same workflow (e.g., race
+    # condition between step completion and suspension handler). Only proceed
+    # if the workflow is actually SUSPENDED. If status is RUNNING, another
+    # resume task got there first.
+    if run.status != RunStatus.SUSPENDED:
+        logger.info(
+            f"Workflow status is {run.status.value}, not SUSPENDED - skipping duplicate resume",
+            run_id=run_id,
+            workflow_name=run.workflow_name,
+        )
+        return None
+
     # Check for cancellation flag
     cancellation_requested = await storage.check_cancellation_flag(run_id)
 
@@ -1692,9 +2088,27 @@ async def _resume_workflow_on_worker(
         return None
 
     except SuspensionSignal as e:
-        # Workflow suspended again
+        # Workflow suspended again (during resume)
         await storage.update_run_status(run_id=run_id, status=RunStatus.SUSPENDED)
 
+        # Record WORKFLOW_SUSPENDED event
+        step_id = e.data.get("step_id") if e.data else None
+        step_name = e.data.get("step_name") if e.data else None
+        sleep_id = e.data.get("sleep_id") if e.data else None
+        hook_id = e.data.get("hook_id") if e.data else None
+        child_id = e.data.get("child_id") if e.data else None
+
+        suspended_event = create_workflow_suspended_event(
+            run_id=run_id,
+            reason=e.reason,
+            step_id=step_id,
+            step_name=step_name,
+            sleep_id=sleep_id,
+            hook_id=hook_id,
+            child_id=child_id,
+        )
+        await storage.record_event(suspended_event)
+
         logger.info(
             f"Workflow suspended again on worker: {e.reason}",
             run_id=run_id,
@@ -1702,10 +2116,34 @@ async def _resume_workflow_on_worker(
             reason=e.reason,
         )
 
+        # For step dispatch suspensions, check if step already completed/failed
+        if step_id and e.reason.startswith("step_dispatch:"):
+            events = await storage.get_events(run_id)
+            step_finished = any(
+                evt.type in (EventType.STEP_COMPLETED, EventType.STEP_FAILED)
+                and evt.data.get("step_id") == step_id
+                for evt in events
+            )
+            if step_finished:
+                logger.info(
+                    "Step finished before resume suspension completed, scheduling resume",
+                    run_id=run_id,
+                    step_id=step_id,
+                )
+                schedule_workflow_resumption(
+                    run_id,
+                    datetime.now(UTC),
+                    storage_config=storage_config,
+                    triggered_by="start_suspension_step_race",
+                )
+                return None
+
         # Schedule automatic resumption if we have a resume_at time
         resume_at = e.data.get("resume_at") if e.data else None
         if resume_at:
-            schedule_workflow_resumption(
+            schedule_workflow_resumption(
+                run_id, resume_at, storage_config=storage_config, triggered_by="start_sleep_hook"
+            )
             logger.info(
                 "Scheduled automatic workflow resumption",
                 run_id=run_id,
@@ -1779,13 +2217,15 @@ def _get_storage_backend(config: dict[str, Any] | None = None) -> StorageBackend
     """
     from pyworkflow.storage.config import config_to_storage
 
-
+    storage = config_to_storage(config)
+    return storage
 
 
 def schedule_workflow_resumption(
     run_id: str,
     resume_at: datetime,
     storage_config: dict[str, Any] | None = None,
+    triggered_by: str = "unknown",
 ) -> None:
     """
     Schedule automatic workflow resumption after sleep.
@@ -1794,6 +2234,7 @@ def schedule_workflow_resumption(
         run_id: Workflow run ID
         resume_at: When to resume the workflow
         storage_config: Storage backend configuration to pass to the resume task
+        triggered_by: What triggered this resume scheduling (for debugging)
     """
     from datetime import UTC
 
@@ -1802,10 +2243,11 @@ def schedule_workflow_resumption(
     delay_seconds = max(0, int((resume_at - now).total_seconds()))
 
     logger.info(
-        "
+        f"SCHEDULE_RESUME: {triggered_by}",
        run_id=run_id,
        resume_at=resume_at.isoformat(),
        delay_seconds=delay_seconds,
+        triggered_by=triggered_by,
     )
 
     # Schedule the resume task