pyworkflow-engine 0.1.11__py3-none-any.whl → 0.1.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyworkflow/__init__.py +1 -1
- pyworkflow/celery/app.py +87 -3
- pyworkflow/celery/loop.py +108 -0
- pyworkflow/celery/tasks.py +537 -75
- pyworkflow/cli/commands/worker.py +13 -16
- pyworkflow/config.py +5 -0
- pyworkflow/context/base.py +4 -0
- pyworkflow/context/local.py +27 -1
- pyworkflow/context/step_context.py +1 -11
- pyworkflow/core/step.py +35 -15
- pyworkflow/engine/events.py +44 -30
- pyworkflow/engine/executor.py +21 -1
- pyworkflow/engine/replay.py +0 -39
- pyworkflow/observability/logging.py +43 -1
- pyworkflow/runtime/celery.py +1 -1
- pyworkflow/runtime/local.py +41 -1
- pyworkflow/storage/config.py +81 -2
- pyworkflow/storage/postgres.py +103 -34
- {pyworkflow_engine-0.1.11.dist-info → pyworkflow_engine-0.1.12.dist-info}/METADATA +1 -1
- {pyworkflow_engine-0.1.11.dist-info → pyworkflow_engine-0.1.12.dist-info}/RECORD +24 -23
- {pyworkflow_engine-0.1.11.dist-info → pyworkflow_engine-0.1.12.dist-info}/WHEEL +0 -0
- {pyworkflow_engine-0.1.11.dist-info → pyworkflow_engine-0.1.12.dist-info}/entry_points.txt +0 -0
- {pyworkflow_engine-0.1.11.dist-info → pyworkflow_engine-0.1.12.dist-info}/licenses/LICENSE +0 -0
- {pyworkflow_engine-0.1.11.dist-info → pyworkflow_engine-0.1.12.dist-info}/top_level.txt +0 -0
pyworkflow/celery/tasks.py
CHANGED
@@ -3,13 +3,14 @@ Celery tasks for distributed workflow and step execution.
 
 These tasks enable:
 - Distributed step execution across workers
-- Automatic retry with exponential backoff
+- Automatic retry with exponential backoff and jitter (via Celery)
 - Scheduled sleep resumption
 - Workflow orchestration
 - Fault tolerance with automatic recovery on worker failures
 """
 
 import asyncio
+import random
 import uuid
 from collections.abc import Callable
 from datetime import UTC, datetime
@@ -19,10 +20,11 @@ if TYPE_CHECKING:
     from pyworkflow.context.step_context import StepContext
 
 from celery import Task
-from celery.exceptions import WorkerLostError
+from celery.exceptions import MaxRetriesExceededError, Retry, WorkerLostError
 from loguru import logger
 
 from pyworkflow.celery.app import celery_app
+from pyworkflow.celery.loop import run_async
 from pyworkflow.core.exceptions import (
     CancellationError,
     ContinueAsNewSignal,
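The new `run_async` helper (imported above from `pyworkflow/celery/loop.py`, a file added in this release) replaces ad-hoc event-loop handling in the Celery tasks below. Its implementation is not part of this diff; a minimal sketch of such a helper, assuming it reuses one event loop per worker process, looks like this:

```python
# Illustrative sketch only -- the real pyworkflow/celery/loop.py is not shown in
# this diff. A helper like run_async() typically keeps one event loop alive per
# worker process so each Celery task can run coroutines without repeatedly
# creating and tearing down loops via asyncio.run().
import asyncio
from collections.abc import Coroutine
from typing import Any, TypeVar

T = TypeVar("T")

_loop: asyncio.AbstractEventLoop | None = None


def run_async(coro: Coroutine[Any, Any, T]) -> T:
    """Run a coroutine to completion on a process-wide event loop."""
    global _loop
    if _loop is None or _loop.is_closed():
        _loop = asyncio.new_event_loop()
        asyncio.set_event_loop(_loop)
    return _loop.run_until_complete(coro)
```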
@@ -39,6 +41,7 @@ from pyworkflow.engine.events import (
     create_workflow_continued_as_new_event,
     create_workflow_interrupted_event,
     create_workflow_started_event,
+    create_workflow_suspended_event,
 )
 from pyworkflow.serialization.decoder import deserialize_args, deserialize_kwargs
 from pyworkflow.serialization.encoder import serialize_args, serialize_kwargs
@@ -46,14 +49,38 @@ from pyworkflow.storage.base import StorageBackend
 from pyworkflow.storage.schemas import RunStatus, WorkflowRun
 
 
+def _calculate_exponential_backoff(
+    attempt: int, base: float = 2.0, max_delay: float = 300.0
+) -> float:
+    """
+    Calculate exponential backoff delay with jitter.
+
+    Args:
+        attempt: Current retry attempt (0-indexed)
+        base: Base delay multiplier (default: 2.0)
+        max_delay: Maximum delay in seconds (default: 300s / 5 minutes)
+
+    Returns:
+        Delay in seconds with jitter applied
+
+    Formula: min(base * 2^attempt, max_delay) * (0.5 + random(0, 0.5))
+    This gives delays like: ~1s, ~2s, ~4s, ~8s, ~16s, ... capped at max_delay
+    """
+    delay = min(base * (2**attempt), max_delay)
+    # Add jitter: multiply by random factor between 0.5 and 1.0
+    # This prevents thundering herd when multiple tasks retry simultaneously
+    jitter = 0.5 + random.random() * 0.5
+    return delay * jitter
+
+
 class WorkflowTask(Task):
     """Base task class for workflow execution with custom error handling."""
 
-
-
-
-
-
+    # Allow unlimited Celery-level retries - our code controls the actual limit
+    # via the max_retries parameter passed to execute_step_task
+    max_retries = None
+    # Prevent message requeue loops when task fails
+    acks_on_failure_or_timeout = True
 
     def on_failure(self, exc, task_id, args, kwargs, einfo):
         """
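The new `_calculate_exponential_backoff` caps the deterministic delay at `min(base * 2**attempt, max_delay)` and then scales it by a random factor in [0.5, 1.0). A quick worked check of the ranges that formula yields with the defaults (illustration only, not package code):

```python
# Worked check of the backoff ranges implied by the formula above:
# deterministic cap = min(2.0 * 2**attempt, 300.0), then the jitter factor
# scales it into the half-open interval [0.5, 1.0) of that cap.
def backoff_range(attempt: int, base: float = 2.0, max_delay: float = 300.0) -> tuple[float, float]:
    capped = min(base * (2**attempt), max_delay)
    return (capped * 0.5, capped * 1.0)


for attempt in range(6):
    lo, hi = backoff_range(attempt)
    print(f"attempt {attempt}: {lo:.1f}s - {hi:.1f}s")
# attempt 0: 1.0s - 2.0s
# attempt 1: 2.0s - 4.0s
# attempt 2: 4.0s - 8.0s
# attempt 3: 8.0s - 16.0s
# attempt 4: 16.0s - 32.0s
# attempt 5: 32.0s - 64.0s
```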
@@ -64,7 +91,6 @@ class WorkflowTask(Task):
         - Other exceptions: Application failure
         """
         is_worker_loss = isinstance(exc, WorkerLostError)
-
         if is_worker_loss:
             logger.warning(
                 f"Task {self.name} interrupted due to worker loss",
@@ -75,7 +101,7 @@ class WorkflowTask(Task):
             # by another worker. See _handle_workflow_recovery() for logic.
         else:
             logger.error(
-                f"Task {self.name} failed",
+                f"Task {self.name} failed: {str(exc)}",
                 task_id=task_id,
                 error=str(exc),
                 traceback=einfo.traceback if einfo else None,
@@ -132,9 +158,13 @@ def execute_step_task(
         Step result (serialized)
 
     Raises:
-        FatalError: For non-retriable errors
-        RetryableError: For retriable errors (triggers automatic retry)
+        FatalError: For non-retriable errors after all retries exhausted
     """
+    # Ensure logging is configured in forked worker process
+    from pyworkflow.celery.app import _configure_worker_logging
+
+    _configure_worker_logging()
+
     from pyworkflow.core.registry import _registry
 
     logger.info(
@@ -144,11 +174,32 @@ def execute_step_task(
         attempt=self.request.retries + 1,
     )
 
+    # Check workflow status before executing - bail out if workflow is in terminal state
+    storage = _get_storage_backend(storage_config)
+    run = run_async(_get_workflow_run_safe(storage, run_id))
+    if run is None:
+        logger.warning(
+            f"Workflow run not found, skipping step execution: {step_name}",
+            run_id=run_id,
+            step_id=step_id,
+        )
+        return None
+
+    # Only proceed if workflow is in a state where step execution makes sense
+    if run.status not in (RunStatus.RUNNING, RunStatus.SUSPENDED):
+        logger.warning(
+            f"Workflow in terminal state ({run.status.value}), skipping step execution: {step_name}",
+            run_id=run_id,
+            step_id=step_id,
+            workflow_status=run.status.value,
+        )
+        return None
+
     # Get step metadata
     step_meta = _registry.get_step(step_name)
     if not step_meta:
         # Record failure and resume workflow
-
+        run_async(
             _record_step_failure_and_resume(
                 storage_config=storage_config,
                 run_id=run_id,
@@ -197,7 +248,7 @@ def execute_step_task(
 
         # Execute the step
         if asyncio.iscoroutinefunction(step_func):
-            result =
+            result = run_async(step_func(*args, **kwargs))
         else:
             result = step_func(*args, **kwargs)
 
@@ -208,7 +259,7 @@ def execute_step_task(
         )
 
         # Record STEP_COMPLETED event and trigger workflow resumption
-
+        run_async(
             _record_step_completion_and_resume(
                 storage_config=storage_config,
                 run_id=run_id,
@@ -220,10 +271,23 @@ def execute_step_task(
 
         return result
 
+    except Retry:
+        # Celery retry in progress - let it propagate correctly
+        raise
+
+    except MaxRetriesExceededError:
+        # Celery hit its internal retry limit - treat as fatal
+        logger.error(
+            f"Step exceeded Celery retry limit: {step_name}",
+            run_id=run_id,
+            step_id=step_id,
+        )
+        raise
+
     except FatalError as e:
         logger.error(f"Step failed (fatal): {step_name}", run_id=run_id, step_id=step_id)
         # Record failure and resume workflow (workflow will fail on replay)
-
+        run_async(
             _record_step_failure_and_resume(
                 storage_config=storage_config,
                 run_id=run_id,
@@ -239,16 +303,22 @@ def execute_step_task(
     except RetryableError as e:
         # Check if we have retries left
         if self.request.retries < max_retries:
+            # Use explicit retry_after if provided, otherwise use exponential backoff
+            countdown = (
+                e.retry_after
+                if e.retry_after
+                else _calculate_exponential_backoff(self.request.retries)
+            )
             logger.warning(
-                f"Step failed (retriable): {step_name}, retrying...",
+                f"Step failed (retriable): {step_name}, retrying in {countdown:.1f}s...",
                 run_id=run_id,
                 step_id=step_id,
-
+                countdown=countdown,
                 attempt=self.request.retries + 1,
                 max_retries=max_retries,
             )
             # Let Celery handle the retry - don't resume workflow yet
-            raise self.retry(
+            raise self.retry(countdown=countdown, exc=e)
         else:
             # Max retries exhausted - record failure and resume workflow
             logger.error(
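With this hunk the worker honors an explicit `retry_after` carried by a `RetryableError` and otherwise falls back to the jittered exponential backoff. A hedged sketch of a step requesting a specific delay follows; the `retry_after` constructor keyword is an assumption, since the diff only shows the worker reading `e.retry_after`:

```python
# Hypothetical step illustrating the retry_after path added above. Whether
# RetryableError accepts retry_after as a constructor keyword is an assumption;
# the diff only shows the worker reading `e.retry_after`.
import random

from pyworkflow.core.exceptions import RetryableError


def flaky_step(order_id: str) -> str:
    # Simulate a dependency that is temporarily unavailable.
    if random.random() < 0.3:
        # Ask the worker to wait a specific time before the next attempt
        # instead of the default jittered exponential backoff.
        raise RetryableError(
            f"Upstream busy while processing {order_id}", retry_after=15.0
        )
    return f"processed:{order_id}"
```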
@@ -256,7 +326,7 @@ def execute_step_task(
                 run_id=run_id,
                 step_id=step_id,
             )
-
+            run_async(
                 _record_step_failure_and_resume(
                     storage_config=storage_config,
                     run_id=run_id,
@@ -267,20 +337,23 @@ def execute_step_task(
                     is_retryable=False,  # Mark as not retryable since we exhausted retries
                 )
             )
-            raise
+            raise FatalError(f"Step '{step_name}' failed after retries: {str(e)}") from e
 
     except Exception as e:
         # Check if we have retries left
         if self.request.retries < max_retries:
+            # Use exponential backoff for unexpected errors
+            countdown = _calculate_exponential_backoff(self.request.retries)
             logger.warning(
-                f"Step failed (unexpected): {step_name}, retrying...",
+                f"Step failed (unexpected): {step_name}, retrying in {countdown:.1f}s...",
                 run_id=run_id,
                 step_id=step_id,
                 error=str(e),
+                countdown=countdown,
                 attempt=self.request.retries + 1,
             )
-            # Treat unexpected errors as retriable
-            raise self.retry(exc=
+            # Treat unexpected errors as retriable with exponential backoff
+            raise self.retry(exc=e, countdown=countdown)
         else:
             # Max retries exhausted
             logger.error(
@@ -290,7 +363,7 @@ def execute_step_task(
                 error=str(e),
                 exc_info=True,
             )
-
+            run_async(
                 _record_step_failure_and_resume(
                     storage_config=storage_config,
                     run_id=run_id,
@@ -301,7 +374,7 @@ def execute_step_task(
                     is_retryable=False,
                 )
             )
-            raise
+            raise FatalError(f"Step '{step_name}' failed after retries: {str(e)}") from e
 
     finally:
         # Clean up step context
@@ -323,9 +396,16 @@ async def _record_step_completion_and_resume(
     result: Any,
 ) -> None:
     """
-    Record STEP_COMPLETED event and trigger workflow resumption.
+    Record STEP_COMPLETED event and trigger workflow resumption if safe.
 
     Called by execute_step_task after successful step execution.
+
+    Only schedules resume if WORKFLOW_SUSPENDED event exists, indicating
+    the workflow has fully suspended. This prevents race conditions where
+    a step completes before the workflow has suspended.
+
+    Idempotency: If STEP_COMPLETED already exists for this step_id, skip
+    recording and resume scheduling (another task already handled it).
     """
     from pyworkflow.engine.events import create_step_completed_event
     from pyworkflow.serialization.encoder import serialize
@@ -337,6 +417,21 @@ async def _record_step_completion_and_resume(
     if hasattr(storage, "connect"):
         await storage.connect()
 
+    # Idempotency check: skip if step already completed
+    events = await storage.get_events(run_id)
+    already_completed = any(
+        evt.type == EventType.STEP_COMPLETED and evt.data.get("step_id") == step_id
+        for evt in events
+    )
+    if already_completed:
+        logger.info(
+            "Step already completed by another task, skipping",
+            run_id=run_id,
+            step_id=step_id,
+            step_name=step_name,
+        )
+        return
+
     # Record STEP_COMPLETED event
     completion_event = create_step_completed_event(
         run_id=run_id,
@@ -346,15 +441,33 @@ async def _record_step_completion_and_resume(
     )
     await storage.record_event(completion_event)
 
-    #
-
+    # Refresh events to include the one we just recorded
+    events = await storage.get_events(run_id)
 
-
-
-
-
-
-
+    # Check if workflow has suspended (WORKFLOW_SUSPENDED event exists)
+    # Only schedule resume if workflow has properly suspended
+    has_suspended = any(evt.type == EventType.WORKFLOW_SUSPENDED for evt in events)
+
+    if has_suspended:
+        # Workflow has suspended, safe to schedule resume
+        schedule_workflow_resumption(
+            run_id, datetime.now(UTC), storage_config, triggered_by="step_completed"
+        )
+        logger.info(
+            "Step completed and workflow resumption scheduled",
+            run_id=run_id,
+            step_id=step_id,
+            step_name=step_name,
+        )
+    else:
+        # Workflow hasn't suspended yet - don't schedule resume
+        # The suspension handler will check for step completion and schedule resume
+        logger.info(
+            "Step completed but workflow not yet suspended, skipping resume scheduling",
+            run_id=run_id,
+            step_id=step_id,
+            step_name=step_name,
+        )
 
 
 async def _record_step_failure_and_resume(
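Both the completion and the failure handlers now scan the event log twice: once to skip work another task already recorded, and once to confirm a `WORKFLOW_SUSPENDED` event exists before scheduling a resume. A generic restatement of those two scans (illustration only, assuming `EventType` is exposed by `pyworkflow.engine.events` as the surrounding code suggests):

```python
# Generic restatement of the event-log scans used by the handlers above
# (illustration only; relies on the same EventType / evt.data shape the diff uses).
from pyworkflow.engine.events import EventType


def step_already_recorded(events, step_id: str) -> bool:
    """True if this step already has a completion or failure event in the log."""
    return any(
        evt.type in (EventType.STEP_COMPLETED, EventType.STEP_FAILED)
        and evt.data.get("step_id") == step_id
        for evt in events
    )


def safe_to_resume(events) -> bool:
    """Resume may only be scheduled once the workflow has recorded its suspension."""
    return any(evt.type == EventType.WORKFLOW_SUSPENDED for evt in events)
```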
@@ -367,10 +480,17 @@ async def _record_step_failure_and_resume(
     is_retryable: bool,
 ) -> None:
     """
-    Record STEP_FAILED event and trigger workflow resumption.
+    Record STEP_FAILED event and trigger workflow resumption if safe.
 
     Called by execute_step_task after step failure (when retries are exhausted).
     The workflow will fail when it replays and sees the failure event.
+
+    Only schedules resume if WORKFLOW_SUSPENDED event exists, indicating
+    the workflow has fully suspended. This prevents race conditions where
+    a step fails before the workflow has suspended.
+
+    Idempotency: If STEP_COMPLETED or terminal STEP_FAILED already exists
+    for this step_id, skip recording and resume scheduling.
     """
     from pyworkflow.engine.events import create_step_failed_event
 
@@ -381,6 +501,26 @@ async def _record_step_failure_and_resume(
     if hasattr(storage, "connect"):
         await storage.connect()
 
+    # Idempotency check: skip if step already completed or terminally failed
+    events = await storage.get_events(run_id)
+    already_handled = any(
+        (evt.type == EventType.STEP_COMPLETED and evt.data.get("step_id") == step_id)
+        or (
+            evt.type == EventType.STEP_FAILED
+            and evt.data.get("step_id") == step_id
+            and not evt.data.get("is_retryable", True)
+        )
+        for evt in events
+    )
+    if already_handled:
+        logger.info(
+            "Step already completed/failed by another task, skipping",
+            run_id=run_id,
+            step_id=step_id,
+            step_name=step_name,
+        )
+        return
+
     # Record STEP_FAILED event
     failure_event = create_step_failed_event(
         run_id=run_id,
@@ -392,16 +532,54 @@ async def _record_step_failure_and_resume(
     )
     await storage.record_event(failure_event)
 
-    #
-
+    # Refresh events to include the one we just recorded
+    events = await storage.get_events(run_id)
 
-
-
-
-
-
-
-
+    # Check if workflow has suspended (WORKFLOW_SUSPENDED event exists)
+    # Only schedule resume if workflow has properly suspended
+    has_suspended = any(evt.type == EventType.WORKFLOW_SUSPENDED for evt in events)
+
+    if has_suspended:
+        # Workflow has suspended, safe to schedule resume
+        schedule_workflow_resumption(
+            run_id, datetime.now(UTC), storage_config, triggered_by="step_failed"
+        )
+        logger.info(
+            "Step failed and workflow resumption scheduled",
+            run_id=run_id,
+            step_id=step_id,
+            step_name=step_name,
+            error=error,
+        )
+    else:
+        # Workflow hasn't suspended yet - don't schedule resume
+        # The suspension handler will check for step failure and schedule resume
+        logger.info(
+            "Step failed but workflow not yet suspended, skipping resume scheduling",
+            run_id=run_id,
+            step_id=step_id,
+            step_name=step_name,
+            error=error,
+        )
+
+
+async def _get_workflow_run_safe(
+    storage: StorageBackend,
+    run_id: str,
+) -> WorkflowRun | None:
+    """
+    Safely get workflow run with proper storage connection handling.
+
+    Args:
+        storage: Storage backend
+        run_id: Workflow run ID
+
+    Returns:
+        WorkflowRun or None if not found
+    """
+    if hasattr(storage, "connect"):
+        await storage.connect()
+    return await storage.get_run(run_id)
 
 
 def _resolve_context_class(class_name: str) -> type["StepContext"] | None:
@@ -430,6 +608,7 @@ def _resolve_context_class(class_name: str) -> type["StepContext"] | None:
 
 @celery_app.task(
     name="pyworkflow.start_workflow",
+    base=WorkflowTask,
     queue="pyworkflow.workflows",
 )
 def start_workflow_task(
@@ -456,7 +635,17 @@ def start_workflow_task(
     Returns:
         Workflow run ID
     """
-
+    # Ensure logging is configured in forked worker process
+    from pyworkflow.celery.app import _configure_worker_logging
+
+    _configure_worker_logging()
+
+    logger.info(
+        f"START_WORKFLOW_TASK ENTRY: {workflow_name}",
+        run_id=run_id,
+        idempotency_key=idempotency_key,
+        celery_task_id=start_workflow_task.request.id,
+    )
 
     # Get workflow metadata
     workflow_meta = get_workflow(workflow_name)
@@ -471,7 +660,7 @@ def start_workflow_task(
     storage = _get_storage_backend(storage_config)
 
     # Execute workflow directly on worker
-    result_run_id =
+    result_run_id = run_async(
         _start_workflow_on_worker(
             workflow_meta=workflow_meta,
             args=args,
@@ -489,6 +678,7 @@ def start_workflow_task(
 
 @celery_app.task(
     name="pyworkflow.start_child_workflow",
+    base=WorkflowTask,
     queue="pyworkflow.workflows",
 )
 def start_child_workflow_task(
@@ -520,6 +710,11 @@ def start_child_workflow_task(
     Returns:
         Child workflow run ID
     """
+    # Ensure logging is configured in forked worker process
+    from pyworkflow.celery.app import _configure_worker_logging
+
+    _configure_worker_logging()
+
     logger.info(
         f"Starting child workflow on worker: {workflow_name}",
         child_run_id=child_run_id,
@@ -539,7 +734,7 @@ def start_child_workflow_task(
     storage = _get_storage_backend(storage_config)
 
     # Execute child workflow on worker
-
+    run_async(
         _execute_child_workflow_on_worker(
             workflow_func=workflow_meta.func,
             workflow_name=workflow_name,
@@ -633,19 +828,62 @@ async def _execute_child_workflow_on_worker(
         await _trigger_parent_resumption_celery(parent_run_id, storage, storage_config)
 
     except SuspensionSignal as e:
-        # Child workflow suspended (e.g., sleep, hook)
+        # Child workflow suspended (e.g., sleep, hook, step dispatch)
         # Update status and don't notify parent yet - handled on child resumption
         await storage.update_run_status(child_run_id, RunStatus.SUSPENDED)
+
+        # Record WORKFLOW_SUSPENDED event
+        step_id = e.data.get("step_id") if e.data else None
+        step_name = e.data.get("step_name") if e.data else None
+        sleep_id = e.data.get("sleep_id") if e.data else None
+        hook_id = e.data.get("hook_id") if e.data else None
+        nested_child_id = e.data.get("child_id") if e.data else None
+
+        suspended_event = create_workflow_suspended_event(
+            run_id=child_run_id,
+            reason=e.reason,
+            step_id=step_id,
+            step_name=step_name,
+            sleep_id=sleep_id,
+            hook_id=hook_id,
+            child_id=nested_child_id,
+        )
+        await storage.record_event(suspended_event)
+
         logger.debug(
             f"Child workflow suspended: {workflow_name}",
             parent_run_id=parent_run_id,
            child_run_id=child_run_id,
         )
 
+        # For step dispatch suspensions, check if step already completed/failed
+        if step_id and e.reason.startswith("step_dispatch:"):
+            events = await storage.get_events(child_run_id)
+            step_finished = any(
+                evt.type in (EventType.STEP_COMPLETED, EventType.STEP_FAILED)
+                and evt.data.get("step_id") == step_id
+                for evt in events
+            )
+            if step_finished:
+                logger.info(
+                    "Child step finished before suspension completed, scheduling resume",
+                    child_run_id=child_run_id,
+                    step_id=step_id,
+                )
+                schedule_workflow_resumption(
+                    child_run_id,
+                    datetime.now(UTC),
+                    storage_config=storage_config,
+                    triggered_by="child_suspension_step_race",
+                )
+                return
+
         # Schedule automatic resumption if we have a resume_at time
         resume_at = e.data.get("resume_at") if e.data else None
         if resume_at:
-            schedule_workflow_resumption(
+            schedule_workflow_resumption(
+                child_run_id, resume_at, storage_config, triggered_by="child_sleep_hook"
+            )
 
     except ContinueAsNewSignal as e:
         # Child workflow continuing as new execution
@@ -718,7 +956,9 @@ async def _trigger_parent_resumption_celery(
         parent_run_id=parent_run_id,
     )
     # Schedule immediate resumption via Celery
-    schedule_workflow_resumption(
+    schedule_workflow_resumption(
+        parent_run_id, datetime.now(UTC), storage_config, triggered_by="child_completed"
+    )
 
 
 async def _notify_parent_of_child_completion(
@@ -978,9 +1218,27 @@ async def _recover_workflow_on_worker(
         return run_id
 
     except SuspensionSignal as e:
-        # Workflow suspended again
+        # Workflow suspended again (during recovery)
         await storage.update_run_status(run_id=run_id, status=RunStatus.SUSPENDED)
 
+        # Record WORKFLOW_SUSPENDED event
+        step_id = e.data.get("step_id") if e.data else None
+        step_name = e.data.get("step_name") if e.data else None
+        sleep_id = e.data.get("sleep_id") if e.data else None
+        hook_id = e.data.get("hook_id") if e.data else None
+        child_id = e.data.get("child_id") if e.data else None
+
+        suspended_event = create_workflow_suspended_event(
+            run_id=run_id,
+            reason=e.reason,
+            step_id=step_id,
+            step_name=step_name,
+            sleep_id=sleep_id,
+            hook_id=hook_id,
+            child_id=child_id,
+        )
+        await storage.record_event(suspended_event)
+
         logger.info(
             f"Recovered workflow suspended: {e.reason}",
             run_id=run_id,
@@ -988,10 +1246,34 @@ async def _recover_workflow_on_worker(
             reason=e.reason,
         )
 
+        # For step dispatch suspensions, check if step already completed/failed
+        if step_id and e.reason.startswith("step_dispatch:"):
+            events = await storage.get_events(run_id)
+            step_finished = any(
+                evt.type in (EventType.STEP_COMPLETED, EventType.STEP_FAILED)
+                and evt.data.get("step_id") == step_id
+                for evt in events
+            )
+            if step_finished:
+                logger.info(
+                    "Step finished before recovery suspension completed, scheduling resume",
+                    run_id=run_id,
+                    step_id=step_id,
+                )
+                schedule_workflow_resumption(
+                    run_id,
+                    datetime.now(UTC),
+                    storage_config=storage_config,
+                    triggered_by="recovery_suspension_step_race",
+                )
+                return run_id
+
         # Schedule automatic resumption if we have a resume_at time
         resume_at = e.data.get("resume_at") if e.data else None
         if resume_at:
-            schedule_workflow_resumption(
+            schedule_workflow_resumption(
+                run_id, resume_at, storage_config=storage_config, triggered_by="recovery_sleep_hook"
+            )
             logger.info(
                 "Scheduled automatic workflow resumption",
                 run_id=run_id,
@@ -1076,10 +1358,22 @@ async def _start_workflow_on_worker(
     workflow_name = workflow_meta.name
     config = get_config()
 
+    run = await storage.get_run(run_id) if run_id else None
+    logger.debug(
+        f"_START_WORKFLOW_ON_WORKER ENTRY: {workflow_name} with run_id={run_id} and status={run.status.value if run else 'N/A'}",
+        run_id=run_id,
+    )
+
     # Check idempotency key
     if idempotency_key:
         existing_run = await storage.get_run_by_idempotency_key(idempotency_key)
         if existing_run:
+            logger.info(
+                "IDEMPOTENCY CHECK: Found existing run",
+                run_id=existing_run.run_id,
+                status=existing_run.status.value,
+                idempotency_key=idempotency_key,
+            )
             # Check if this is a recovery scenario (workflow was RUNNING but worker crashed)
             if existing_run.status == RunStatus.RUNNING:
                 # Check if this is truly a crashed worker or just a duplicate task execution
@@ -1140,27 +1434,76 @@ async def _start_workflow_on_worker(
     if run_id is None:
         run_id = f"run_{uuid.uuid4().hex[:16]}"
 
-    # Check if run already exists
+    # Check if run already exists
     existing_run = await storage.get_run(run_id)
-    if existing_run
-
-
-
-
-            worker_id=None,
+    if existing_run:
+        logger.info(
+            f"RUN_ID CHECK: Found existing run with status {existing_run.status.value}",
+            run_id=run_id,
+            status=existing_run.status.value,
         )
-
-
+
+        if existing_run.status == RunStatus.RUNNING:
+            # Recovery scenario - worker crashed while running
+            can_recover = await _handle_workflow_recovery(
                 run=existing_run,
-                workflow_meta=workflow_meta,
                 storage=storage,
-
+                worker_id=None,
+            )
+            if can_recover:
+                return await _recover_workflow_on_worker(
+                    run=existing_run,
+                    workflow_meta=workflow_meta,
+                    storage=storage,
+                    storage_config=storage_config,
+                )
+            else:
+                return existing_run.run_id
+
+        elif existing_run.status == RunStatus.SUSPENDED:
+            # Workflow is suspended - this start_workflow_task is a duplicate
+            # (scheduled during race condition before workflow suspended)
+            # Return existing run_id - resume_workflow_task will handle it
+            logger.info(
+                "DUPLICATE START: Workflow already suspended, returning existing run",
+                run_id=run_id,
+                status=existing_run.status.value,
            )
-        else:
            return existing_run.run_id
 
+        elif existing_run.status in (
+            RunStatus.COMPLETED,
+            RunStatus.FAILED,
+            RunStatus.CANCELLED,
+        ):
+            # Terminal status - workflow already finished
+            logger.info(
+                f"TERMINAL STATUS: Workflow already {existing_run.status.value}, returning existing run",
+                run_id=run_id,
+                status=existing_run.status.value,
+            )
+            return existing_run.run_id
+
+        elif existing_run.status == RunStatus.INTERRUPTED:
+            # Previous recovery failed, try again
+            can_recover = await _handle_workflow_recovery(
+                run=existing_run,
+                storage=storage,
+                worker_id=None,
+            )
+            if can_recover:
+                return await _recover_workflow_on_worker(
+                    run=existing_run,
+                    workflow_meta=workflow_meta,
+                    storage=storage,
+                    storage_config=storage_config,
+                )
+            else:
+                return existing_run.run_id
+
+    # Only reach here if no existing run found
     logger.info(
-        f"
+        f"FRESH START: Creating new workflow run: {workflow_name}",
         run_id=run_id,
         workflow_name=workflow_name,
     )
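`_start_workflow_on_worker` now dispatches on the status of any pre-existing run: RUNNING and INTERRUPTED attempt recovery, SUSPENDED and the terminal states return the existing `run_id`, and only a missing run falls through to a fresh start. A condensed restatement of that dispatch (illustration only, reusing the `RunStatus` values from the diff):

```python
# Condensed restatement of the status dispatch added above (illustration only).
from pyworkflow.storage.schemas import RunStatus

TERMINAL = {RunStatus.COMPLETED, RunStatus.FAILED, RunStatus.CANCELLED}


def action_for_existing_run(status: RunStatus) -> str:
    """Summarize what the worker does when a run with this status already exists."""
    if status in (RunStatus.RUNNING, RunStatus.INTERRUPTED):
        return "attempt recovery, else return existing run_id"
    if status == RunStatus.SUSPENDED:
        return "duplicate start: return existing run_id (resume task handles it)"
    if status in TERMINAL:
        return "already finished: return existing run_id"
    return "fresh start: create a new workflow run"
```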
@@ -1265,9 +1608,28 @@ async def _start_workflow_on_worker(
         return run_id
 
     except SuspensionSignal as e:
-        # Workflow suspended (sleep or
+        # Workflow suspended (sleep, hook, or step dispatch)
         await storage.update_run_status(run_id=run_id, status=RunStatus.SUSPENDED)
 
+        # Record WORKFLOW_SUSPENDED event - this signals that suspension is complete
+        # and resume can be safely scheduled
+        step_id = e.data.get("step_id") if e.data else None
+        step_name = e.data.get("step_name") if e.data else None
+        sleep_id = e.data.get("sleep_id") if e.data else None
+        hook_id = e.data.get("hook_id") if e.data else None
+        child_id = e.data.get("child_id") if e.data else None
+
+        suspended_event = create_workflow_suspended_event(
+            run_id=run_id,
+            reason=e.reason,
+            step_id=step_id,
+            step_name=step_name,
+            sleep_id=sleep_id,
+            hook_id=hook_id,
+            child_id=child_id,
+        )
+        await storage.record_event(suspended_event)
+
         logger.info(
             f"Workflow suspended on worker: {e.reason}",
             run_id=run_id,
@@ -1275,10 +1637,35 @@ async def _start_workflow_on_worker(
             reason=e.reason,
         )
 
-        #
+        # For step dispatch suspensions, check if step already completed/failed (race condition)
+        # If so, schedule resume immediately
+        if step_id and e.reason.startswith("step_dispatch:"):
+            events = await storage.get_events(run_id)
+            step_finished = any(
+                evt.type in (EventType.STEP_COMPLETED, EventType.STEP_FAILED)
+                and evt.data.get("step_id") == step_id
+                for evt in events
+            )
+            if step_finished:
+                logger.info(
+                    "Step finished before suspension completed, scheduling resume",
+                    run_id=run_id,
+                    step_id=step_id,
+                )
+                schedule_workflow_resumption(
+                    run_id,
+                    datetime.now(UTC),
+                    storage_config=storage_config,
+                    triggered_by="resume_suspension_step_race",
+                )
+                return run_id
+
+        # Schedule automatic resumption if we have a resume_at time (for sleep/hook)
         resume_at = e.data.get("resume_at") if e.data else None
         if resume_at:
-            schedule_workflow_resumption(
+            schedule_workflow_resumption(
+                run_id, resume_at, storage_config=storage_config, triggered_by="resume_sleep_hook"
+            )
             logger.info(
                 "Scheduled automatic workflow resumption",
                 run_id=run_id,
@@ -1332,6 +1719,7 @@ async def _start_workflow_on_worker(
 
 @celery_app.task(
     name="pyworkflow.resume_workflow",
+    base=WorkflowTask,
     queue="pyworkflow.schedules",
 )
 def resume_workflow_task(
@@ -1351,13 +1739,22 @@ def resume_workflow_task(
     Returns:
         Workflow result if completed, None if suspended again
     """
-
+    # Ensure logging is configured in forked worker process
+    from pyworkflow.celery.app import _configure_worker_logging
+
+    _configure_worker_logging()
+
+    logger.info(
+        f"RESUME_WORKFLOW_TASK ENTRY: {run_id}",
+        run_id=run_id,
+        celery_task_id=resume_workflow_task.request.id,
+    )
 
     # Get storage backend
     storage = _get_storage_backend(storage_config)
 
     # Resume workflow directly on worker
-    result =
+    result = run_async(_resume_workflow_on_worker(run_id, storage, storage_config))
 
     if result is not None:
         logger.info(f"Workflow completed on worker: {run_id}")
@@ -1369,6 +1766,7 @@ def resume_workflow_task(
 
 @celery_app.task(
     name="pyworkflow.execute_scheduled_workflow",
+    base=WorkflowTask,
     queue="pyworkflow.schedules",
 )
 def execute_scheduled_workflow_task(
@@ -1390,11 +1788,16 @@ def execute_scheduled_workflow_task(
     Returns:
         Workflow run ID if started, None if skipped
     """
+    # Ensure logging is configured in forked worker process
+    from pyworkflow.celery.app import _configure_worker_logging
+
+    _configure_worker_logging()
+
     logger.info("Executing scheduled workflow", schedule_id=schedule_id)
 
     storage = _get_storage_backend(storage_config)
 
-    return
+    return run_async(
         _execute_scheduled_workflow(
             schedule_id=schedule_id,
             scheduled_time=datetime.fromisoformat(scheduled_time),
@@ -1587,6 +1990,19 @@ async def _resume_workflow_on_worker(
         )
         return None
 
+    # Prevent duplicate resume execution
+    # Multiple resume tasks can be scheduled for the same workflow (e.g., race
+    # condition between step completion and suspension handler). Only proceed
+    # if the workflow is actually SUSPENDED. If status is RUNNING, another
+    # resume task got there first.
+    if run.status != RunStatus.SUSPENDED:
+        logger.info(
+            f"Workflow status is {run.status.value}, not SUSPENDED - skipping duplicate resume",
+            run_id=run_id,
+            workflow_name=run.workflow_name,
+        )
+        return None
+
     # Check for cancellation flag
     cancellation_requested = await storage.check_cancellation_flag(run_id)
 
@@ -1692,9 +2108,27 @@ async def _resume_workflow_on_worker(
         return None
 
     except SuspensionSignal as e:
-        # Workflow suspended again
+        # Workflow suspended again (during resume)
         await storage.update_run_status(run_id=run_id, status=RunStatus.SUSPENDED)
 
+        # Record WORKFLOW_SUSPENDED event
+        step_id = e.data.get("step_id") if e.data else None
+        step_name = e.data.get("step_name") if e.data else None
+        sleep_id = e.data.get("sleep_id") if e.data else None
+        hook_id = e.data.get("hook_id") if e.data else None
+        child_id = e.data.get("child_id") if e.data else None
+
+        suspended_event = create_workflow_suspended_event(
+            run_id=run_id,
+            reason=e.reason,
+            step_id=step_id,
+            step_name=step_name,
+            sleep_id=sleep_id,
+            hook_id=hook_id,
+            child_id=child_id,
+        )
+        await storage.record_event(suspended_event)
+
         logger.info(
             f"Workflow suspended again on worker: {e.reason}",
             run_id=run_id,
@@ -1702,10 +2136,34 @@ async def _resume_workflow_on_worker(
             reason=e.reason,
         )
 
+        # For step dispatch suspensions, check if step already completed/failed
+        if step_id and e.reason.startswith("step_dispatch:"):
+            events = await storage.get_events(run_id)
+            step_finished = any(
+                evt.type in (EventType.STEP_COMPLETED, EventType.STEP_FAILED)
+                and evt.data.get("step_id") == step_id
+                for evt in events
+            )
+            if step_finished:
+                logger.info(
+                    "Step finished before resume suspension completed, scheduling resume",
+                    run_id=run_id,
+                    step_id=step_id,
+                )
+                schedule_workflow_resumption(
+                    run_id,
+                    datetime.now(UTC),
+                    storage_config=storage_config,
+                    triggered_by="start_suspension_step_race",
+                )
+                return None
+
         # Schedule automatic resumption if we have a resume_at time
         resume_at = e.data.get("resume_at") if e.data else None
         if resume_at:
-            schedule_workflow_resumption(
+            schedule_workflow_resumption(
+                run_id, resume_at, storage_config=storage_config, triggered_by="start_sleep_hook"
+            )
             logger.info(
                 "Scheduled automatic workflow resumption",
                 run_id=run_id,
@@ -1779,13 +2237,15 @@ def _get_storage_backend(config: dict[str, Any] | None = None) -> StorageBackend
     """
     from pyworkflow.storage.config import config_to_storage
 
-
+    storage = config_to_storage(config)
+    return storage
 
 
 def schedule_workflow_resumption(
     run_id: str,
     resume_at: datetime,
     storage_config: dict[str, Any] | None = None,
+    triggered_by: str = "unknown",
 ) -> None:
     """
     Schedule automatic workflow resumption after sleep.
@@ -1794,6 +2254,7 @@ def schedule_workflow_resumption(
         run_id: Workflow run ID
         resume_at: When to resume the workflow
         storage_config: Storage backend configuration to pass to the resume task
+        triggered_by: What triggered this resume scheduling (for debugging)
     """
     from datetime import UTC
 
@@ -1802,10 +2263,11 @@ def schedule_workflow_resumption(
     delay_seconds = max(0, int((resume_at - now).total_seconds()))
 
     logger.info(
-        "
+        f"SCHEDULE_RESUME: {triggered_by}",
         run_id=run_id,
         resume_at=resume_at.isoformat(),
         delay_seconds=delay_seconds,
+        triggered_by=triggered_by,
     )
 
     # Schedule the resume task