pyworkflow-engine 0.1.10__py3-none-any.whl → 0.1.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,13 +3,14 @@ Celery tasks for distributed workflow and step execution.
  
  These tasks enable:
  - Distributed step execution across workers
- - Automatic retry with exponential backoff
+ - Automatic retry with exponential backoff and jitter (via Celery)
  - Scheduled sleep resumption
  - Workflow orchestration
  - Fault tolerance with automatic recovery on worker failures
  """
  
  import asyncio
+ import random
  import uuid
  from collections.abc import Callable
  from datetime import UTC, datetime
@@ -19,10 +20,11 @@ if TYPE_CHECKING:
      from pyworkflow.context.step_context import StepContext
  
  from celery import Task
- from celery.exceptions import WorkerLostError
+ from celery.exceptions import MaxRetriesExceededError, Retry, WorkerLostError
  from loguru import logger
  
  from pyworkflow.celery.app import celery_app
+ from pyworkflow.celery.loop import run_async
  from pyworkflow.core.exceptions import (
      CancellationError,
      ContinueAsNewSignal,
@@ -39,6 +41,7 @@ from pyworkflow.engine.events import (
      create_workflow_continued_as_new_event,
      create_workflow_interrupted_event,
      create_workflow_started_event,
+     create_workflow_suspended_event,
  )
  from pyworkflow.serialization.decoder import deserialize_args, deserialize_kwargs
  from pyworkflow.serialization.encoder import serialize_args, serialize_kwargs
@@ -46,14 +49,38 @@ from pyworkflow.storage.base import StorageBackend
  from pyworkflow.storage.schemas import RunStatus, WorkflowRun
  
  
+ def _calculate_exponential_backoff(
+     attempt: int, base: float = 2.0, max_delay: float = 300.0
+ ) -> float:
+     """
+     Calculate exponential backoff delay with jitter.
+ 
+     Args:
+         attempt: Current retry attempt (0-indexed)
+         base: Base delay multiplier (default: 2.0)
+         max_delay: Maximum delay in seconds (default: 300s / 5 minutes)
+ 
+     Returns:
+         Delay in seconds with jitter applied
+ 
+     Formula: min(base * 2^attempt, max_delay) * (0.5 + random(0, 0.5))
+     This gives delays like: ~1s, ~2s, ~4s, ~8s, ~16s, ... capped at max_delay
+     """
+     delay = min(base * (2**attempt), max_delay)
+     # Add jitter: multiply by random factor between 0.5 and 1.0
+     # This prevents thundering herd when multiple tasks retry simultaneously
+     jitter = 0.5 + random.random() * 0.5
+     return delay * jitter
+ 
+ 
  class WorkflowTask(Task):
      """Base task class for workflow execution with custom error handling."""
  
-     autoretry_for = (RetryableError,)
-     retry_kwargs = {"max_retries": 3}
-     retry_backoff = True
-     retry_backoff_max = 600
-     retry_jitter = True
+     # Allow unlimited Celery-level retries - our code controls the actual limit
+     # via the max_retries parameter passed to execute_step_task
+     max_retries = None
+     # Prevent message requeue loops when task fails
+     acks_on_failure_or_timeout = True
  
      def on_failure(self, exc, task_id, args, kwargs, einfo):
          """
@@ -64,7 +91,6 @@ class WorkflowTask(Task):
          - Other exceptions: Application failure
          """
          is_worker_loss = isinstance(exc, WorkerLostError)
-
          if is_worker_loss:
              logger.warning(
                  f"Task {self.name} interrupted due to worker loss",
@@ -75,7 +101,7 @@ class WorkflowTask(Task):
              # by another worker. See _handle_workflow_recovery() for logic.
          else:
              logger.error(
-                 f"Task {self.name} failed",
+                 f"Task {self.name} failed: {str(exc)}",
                  task_id=task_id,
                  error=str(exc),
                  traceback=einfo.traceback if einfo else None,
@@ -132,9 +158,13 @@ def execute_step_task(
          Step result (serialized)
  
      Raises:
-         FatalError: For non-retriable errors
-         RetryableError: For retriable errors (triggers automatic retry)
+         FatalError: For non-retriable errors after all retries exhausted
      """
+     # Ensure logging is configured in forked worker process
+     from pyworkflow.celery.app import _configure_worker_logging
+ 
+     _configure_worker_logging()
+ 
      from pyworkflow.core.registry import _registry
  
      logger.info(
@@ -144,11 +174,32 @@ def execute_step_task(
          attempt=self.request.retries + 1,
      )
  
+     # Check workflow status before executing - bail out if workflow is in terminal state
+     storage = _get_storage_backend(storage_config)
+     run = run_async(_get_workflow_run_safe(storage, run_id))
+     if run is None:
+         logger.warning(
+             f"Workflow run not found, skipping step execution: {step_name}",
+             run_id=run_id,
+             step_id=step_id,
+         )
+         return None
+ 
+     # Only proceed if workflow is in a state where step execution makes sense
+     if run.status not in (RunStatus.RUNNING, RunStatus.SUSPENDED):
+         logger.warning(
+             f"Workflow in terminal state ({run.status.value}), skipping step execution: {step_name}",
+             run_id=run_id,
+             step_id=step_id,
+             workflow_status=run.status.value,
+         )
+         return None
+ 
      # Get step metadata
      step_meta = _registry.get_step(step_name)
      if not step_meta:
          # Record failure and resume workflow
-         asyncio.run(
+         run_async(
              _record_step_failure_and_resume(
                  storage_config=storage_config,
                  run_id=run_id,
@@ -197,7 +248,7 @@ def execute_step_task(
  
          # Execute the step
          if asyncio.iscoroutinefunction(step_func):
-             result = asyncio.run(step_func(*args, **kwargs))
+             result = run_async(step_func(*args, **kwargs))
          else:
              result = step_func(*args, **kwargs)
  
@@ -208,7 +259,7 @@ def execute_step_task(
          )
  
          # Record STEP_COMPLETED event and trigger workflow resumption
-         asyncio.run(
+         run_async(
              _record_step_completion_and_resume(
                  storage_config=storage_config,
                  run_id=run_id,
@@ -220,10 +271,23 @@ def execute_step_task(
  
          return result
  
+     except Retry:
+         # Celery retry in progress - let it propagate correctly
+         raise
+ 
+     except MaxRetriesExceededError:
+         # Celery hit its internal retry limit - treat as fatal
+         logger.error(
+             f"Step exceeded Celery retry limit: {step_name}",
+             run_id=run_id,
+             step_id=step_id,
+         )
+         raise
+ 
      except FatalError as e:
          logger.error(f"Step failed (fatal): {step_name}", run_id=run_id, step_id=step_id)
          # Record failure and resume workflow (workflow will fail on replay)
-         asyncio.run(
+         run_async(
              _record_step_failure_and_resume(
                  storage_config=storage_config,
                  run_id=run_id,
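
The ordering of the new handlers matters: Celery's self.retry() signals a retry by raising celery.exceptions.Retry, so an in-flight retry has to be re-raised before the broad except Exception handler further down could treat it as an ordinary failure. A minimal standalone sketch of that pattern (the task name and payload shape are invented for illustration; this is not the package's code):

    from celery import Celery
    from celery.exceptions import Retry

    app = Celery("sketch")  # hypothetical app, illustration only


    @app.task(bind=True, max_retries=None)
    def flaky(self, payload: dict) -> dict:
        try:
            if payload.get("transient"):
                # Asking for a retry raises celery.exceptions.Retry
                raise self.retry(countdown=30)
            return payload
        except Retry:
            # An in-flight retry is not a failure: re-raise it untouched
            raise
        except Exception as exc:
            # Everything else is treated as retriable, as execute_step_task does
            raise self.retry(exc=exc, countdown=30)
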
@@ -239,16 +303,22 @@ def execute_step_task(
      except RetryableError as e:
          # Check if we have retries left
          if self.request.retries < max_retries:
+             # Use explicit retry_after if provided, otherwise use exponential backoff
+             countdown = (
+                 e.retry_after
+                 if e.retry_after
+                 else _calculate_exponential_backoff(self.request.retries)
+             )
              logger.warning(
-                 f"Step failed (retriable): {step_name}, retrying...",
+                 f"Step failed (retriable): {step_name}, retrying in {countdown:.1f}s...",
                  run_id=run_id,
                  step_id=step_id,
-                 retry_after=e.retry_after,
+                 countdown=countdown,
                  attempt=self.request.retries + 1,
                  max_retries=max_retries,
              )
              # Let Celery handle the retry - don't resume workflow yet
-             raise self.retry(exc=e, countdown=e.get_retry_delay_seconds() or 60)
+             raise self.retry(countdown=countdown, exc=e)
          else:
              # Max retries exhausted - record failure and resume workflow
              logger.error(
@@ -256,7 +326,7 @@ def execute_step_task(
                  run_id=run_id,
                  step_id=step_id,
              )
-             asyncio.run(
+             run_async(
                  _record_step_failure_and_resume(
                      storage_config=storage_config,
                      run_id=run_id,
@@ -267,20 +337,23 @@ def execute_step_task(
                      is_retryable=False,  # Mark as not retryable since we exhausted retries
                  )
              )
-             raise
+             raise FatalError(f"Step '{step_name}' failed after retries: {str(e)}") from e
  
      except Exception as e:
          # Check if we have retries left
          if self.request.retries < max_retries:
+             # Use exponential backoff for unexpected errors
+             countdown = _calculate_exponential_backoff(self.request.retries)
              logger.warning(
-                 f"Step failed (unexpected): {step_name}, retrying...",
+                 f"Step failed (unexpected): {step_name}, retrying in {countdown:.1f}s...",
                  run_id=run_id,
                  step_id=step_id,
                  error=str(e),
+                 countdown=countdown,
                  attempt=self.request.retries + 1,
              )
-             # Treat unexpected errors as retriable
-             raise self.retry(exc=RetryableError(str(e)), countdown=60)
+             # Treat unexpected errors as retriable with exponential backoff
+             raise self.retry(exc=e, countdown=countdown)
          else:
              # Max retries exhausted
              logger.error(
@@ -290,7 +363,7 @@ def execute_step_task(
                  error=str(e),
                  exc_info=True,
              )
-             asyncio.run(
+             run_async(
                  _record_step_failure_and_resume(
                      storage_config=storage_config,
                      run_id=run_id,
@@ -301,7 +374,7 @@ def execute_step_task(
                      is_retryable=False,
                  )
              )
-             raise
+             raise FatalError(f"Step '{step_name}' failed after retries: {str(e)}") from e
  
      finally:
          # Clean up step context
@@ -323,9 +396,16 @@ async def _record_step_completion_and_resume(
      result: Any,
  ) -> None:
      """
-     Record STEP_COMPLETED event and trigger workflow resumption.
+     Record STEP_COMPLETED event and trigger workflow resumption if safe.
  
      Called by execute_step_task after successful step execution.
+ 
+     Only schedules resume if WORKFLOW_SUSPENDED event exists, indicating
+     the workflow has fully suspended. This prevents race conditions where
+     a step completes before the workflow has suspended.
+ 
+     Idempotency: If STEP_COMPLETED already exists for this step_id, skip
+     recording and resume scheduling (another task already handled it).
      """
      from pyworkflow.engine.events import create_step_completed_event
      from pyworkflow.serialization.encoder import serialize
@@ -337,6 +417,21 @@ async def _record_step_completion_and_resume(
      if hasattr(storage, "connect"):
          await storage.connect()
  
+     # Idempotency check: skip if step already completed
+     events = await storage.get_events(run_id)
+     already_completed = any(
+         evt.type == EventType.STEP_COMPLETED and evt.data.get("step_id") == step_id
+         for evt in events
+     )
+     if already_completed:
+         logger.info(
+             "Step already completed by another task, skipping",
+             run_id=run_id,
+             step_id=step_id,
+             step_name=step_name,
+         )
+         return
+ 
      # Record STEP_COMPLETED event
      completion_event = create_step_completed_event(
          run_id=run_id,
@@ -346,15 +441,33 @@ async def _record_step_completion_and_resume(
      )
      await storage.record_event(completion_event)
  
-     # Schedule workflow resumption immediately
-     schedule_workflow_resumption(run_id, datetime.now(UTC), storage_config)
+     # Refresh events to include the one we just recorded
+     events = await storage.get_events(run_id)
  
-     logger.info(
-         "Step completed and workflow resumption scheduled",
-         run_id=run_id,
-         step_id=step_id,
-         step_name=step_name,
-     )
+     # Check if workflow has suspended (WORKFLOW_SUSPENDED event exists)
+     # Only schedule resume if workflow has properly suspended
+     has_suspended = any(evt.type == EventType.WORKFLOW_SUSPENDED for evt in events)
+ 
+     if has_suspended:
+         # Workflow has suspended, safe to schedule resume
+         schedule_workflow_resumption(
+             run_id, datetime.now(UTC), storage_config, triggered_by="step_completed"
+         )
+         logger.info(
+             "Step completed and workflow resumption scheduled",
+             run_id=run_id,
+             step_id=step_id,
+             step_name=step_name,
+         )
+     else:
+         # Workflow hasn't suspended yet - don't schedule resume
+         # The suspension handler will check for step completion and schedule resume
+         logger.info(
+             "Step completed but workflow not yet suspended, skipping resume scheduling",
+             run_id=run_id,
+             step_id=step_id,
+             step_name=step_name,
+         )
  
  
  async def _record_step_failure_and_resume(
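
The intent of the gate above is that a resume is scheduled exactly once, by whichever side observes both facts: the step has finished and the workflow has recorded WORKFLOW_SUSPENDED. The suspension handlers later in this diff perform the mirror-image check. A condensed sketch of the predicate, assuming events expose .type and a .data dict as the evt.type / evt.data.get() usage here suggests (the function name is hypothetical, not part of the package):

    def _resume_is_safe(events, step_id) -> bool:
        """Both halves of the race must be visible before scheduling a resume."""
        step_finished = any(
            evt.type in (EventType.STEP_COMPLETED, EventType.STEP_FAILED)
            and evt.data.get("step_id") == step_id
            for evt in events
        )
        workflow_suspended = any(
            evt.type == EventType.WORKFLOW_SUSPENDED for evt in events
        )
        return step_finished and workflow_suspended
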
@@ -367,10 +480,17 @@ async def _record_step_failure_and_resume(
      is_retryable: bool,
  ) -> None:
      """
-     Record STEP_FAILED event and trigger workflow resumption.
+     Record STEP_FAILED event and trigger workflow resumption if safe.
  
      Called by execute_step_task after step failure (when retries are exhausted).
      The workflow will fail when it replays and sees the failure event.
+ 
+     Only schedules resume if WORKFLOW_SUSPENDED event exists, indicating
+     the workflow has fully suspended. This prevents race conditions where
+     a step fails before the workflow has suspended.
+ 
+     Idempotency: If STEP_COMPLETED or terminal STEP_FAILED already exists
+     for this step_id, skip recording and resume scheduling.
      """
      from pyworkflow.engine.events import create_step_failed_event
  
@@ -381,6 +501,26 @@ async def _record_step_failure_and_resume(
      if hasattr(storage, "connect"):
          await storage.connect()
  
+     # Idempotency check: skip if step already completed or terminally failed
+     events = await storage.get_events(run_id)
+     already_handled = any(
+         (evt.type == EventType.STEP_COMPLETED and evt.data.get("step_id") == step_id)
+         or (
+             evt.type == EventType.STEP_FAILED
+             and evt.data.get("step_id") == step_id
+             and not evt.data.get("is_retryable", True)
+         )
+         for evt in events
+     )
+     if already_handled:
+         logger.info(
+             "Step already completed/failed by another task, skipping",
+             run_id=run_id,
+             step_id=step_id,
+             step_name=step_name,
+         )
+         return
+ 
      # Record STEP_FAILED event
      failure_event = create_step_failed_event(
          run_id=run_id,
@@ -392,16 +532,54 @@ async def _record_step_failure_and_resume(
      )
      await storage.record_event(failure_event)
  
-     # Schedule workflow resumption - workflow will fail on replay
-     schedule_workflow_resumption(run_id, datetime.now(UTC), storage_config)
+     # Refresh events to include the one we just recorded
+     events = await storage.get_events(run_id)
  
-     logger.info(
-         "Step failed and workflow resumption scheduled",
-         run_id=run_id,
-         step_id=step_id,
-         step_name=step_name,
-         error=error,
-     )
+     # Check if workflow has suspended (WORKFLOW_SUSPENDED event exists)
+     # Only schedule resume if workflow has properly suspended
+     has_suspended = any(evt.type == EventType.WORKFLOW_SUSPENDED for evt in events)
+ 
+     if has_suspended:
+         # Workflow has suspended, safe to schedule resume
+         schedule_workflow_resumption(
+             run_id, datetime.now(UTC), storage_config, triggered_by="step_failed"
+         )
+         logger.info(
+             "Step failed and workflow resumption scheduled",
+             run_id=run_id,
+             step_id=step_id,
+             step_name=step_name,
+             error=error,
+         )
+     else:
+         # Workflow hasn't suspended yet - don't schedule resume
+         # The suspension handler will check for step failure and schedule resume
+         logger.info(
+             "Step failed but workflow not yet suspended, skipping resume scheduling",
+             run_id=run_id,
+             step_id=step_id,
+             step_name=step_name,
+             error=error,
+         )
+ 
+ 
+ async def _get_workflow_run_safe(
+     storage: StorageBackend,
+     run_id: str,
+ ) -> WorkflowRun | None:
+     """
+     Safely get workflow run with proper storage connection handling.
+ 
+     Args:
+         storage: Storage backend
+         run_id: Workflow run ID
+ 
+     Returns:
+         WorkflowRun or None if not found
+     """
+     if hasattr(storage, "connect"):
+         await storage.connect()
+     return await storage.get_run(run_id)
  
  
  def _resolve_context_class(class_name: str) -> type["StepContext"] | None:
@@ -430,6 +608,7 @@ def _resolve_context_class(class_name: str) -> type["StepContext"] | None:
  
  @celery_app.task(
      name="pyworkflow.start_workflow",
+     base=WorkflowTask,
      queue="pyworkflow.workflows",
  )
  def start_workflow_task(
@@ -456,7 +635,17 @@ def start_workflow_task(
      Returns:
          Workflow run ID
      """
-     logger.info(f"Starting workflow on worker: {workflow_name}", run_id=run_id)
+     # Ensure logging is configured in forked worker process
+     from pyworkflow.celery.app import _configure_worker_logging
+ 
+     _configure_worker_logging()
+ 
+     logger.info(
+         f"START_WORKFLOW_TASK ENTRY: {workflow_name}",
+         run_id=run_id,
+         idempotency_key=idempotency_key,
+         celery_task_id=start_workflow_task.request.id,
+     )
  
      # Get workflow metadata
      workflow_meta = get_workflow(workflow_name)
@@ -471,7 +660,7 @@ def start_workflow_task(
      storage = _get_storage_backend(storage_config)
  
      # Execute workflow directly on worker
-     result_run_id = asyncio.run(
+     result_run_id = run_async(
          _start_workflow_on_worker(
              workflow_meta=workflow_meta,
              args=args,
@@ -489,6 +678,7 @@ def start_workflow_task(
  
  @celery_app.task(
      name="pyworkflow.start_child_workflow",
+     base=WorkflowTask,
      queue="pyworkflow.workflows",
  )
  def start_child_workflow_task(
@@ -520,6 +710,11 @@ def start_child_workflow_task(
      Returns:
          Child workflow run ID
      """
+     # Ensure logging is configured in forked worker process
+     from pyworkflow.celery.app import _configure_worker_logging
+ 
+     _configure_worker_logging()
+ 
      logger.info(
          f"Starting child workflow on worker: {workflow_name}",
          child_run_id=child_run_id,
@@ -539,7 +734,7 @@ def start_child_workflow_task(
      storage = _get_storage_backend(storage_config)
  
      # Execute child workflow on worker
-     asyncio.run(
+     run_async(
          _execute_child_workflow_on_worker(
              workflow_func=workflow_meta.func,
              workflow_name=workflow_name,
@@ -633,19 +828,62 @@ async def _execute_child_workflow_on_worker(
          await _trigger_parent_resumption_celery(parent_run_id, storage, storage_config)
  
      except SuspensionSignal as e:
-         # Child workflow suspended (e.g., sleep, hook)
+         # Child workflow suspended (e.g., sleep, hook, step dispatch)
          # Update status and don't notify parent yet - handled on child resumption
          await storage.update_run_status(child_run_id, RunStatus.SUSPENDED)
+ 
+         # Record WORKFLOW_SUSPENDED event
+         step_id = e.data.get("step_id") if e.data else None
+         step_name = e.data.get("step_name") if e.data else None
+         sleep_id = e.data.get("sleep_id") if e.data else None
+         hook_id = e.data.get("hook_id") if e.data else None
+         nested_child_id = e.data.get("child_id") if e.data else None
+ 
+         suspended_event = create_workflow_suspended_event(
+             run_id=child_run_id,
+             reason=e.reason,
+             step_id=step_id,
+             step_name=step_name,
+             sleep_id=sleep_id,
+             hook_id=hook_id,
+             child_id=nested_child_id,
+         )
+         await storage.record_event(suspended_event)
+ 
          logger.debug(
              f"Child workflow suspended: {workflow_name}",
              parent_run_id=parent_run_id,
              child_run_id=child_run_id,
          )
  
+         # For step dispatch suspensions, check if step already completed/failed
+         if step_id and e.reason.startswith("step_dispatch:"):
+             events = await storage.get_events(child_run_id)
+             step_finished = any(
+                 evt.type in (EventType.STEP_COMPLETED, EventType.STEP_FAILED)
+                 and evt.data.get("step_id") == step_id
+                 for evt in events
+             )
+             if step_finished:
+                 logger.info(
+                     "Child step finished before suspension completed, scheduling resume",
+                     child_run_id=child_run_id,
+                     step_id=step_id,
+                 )
+                 schedule_workflow_resumption(
+                     child_run_id,
+                     datetime.now(UTC),
+                     storage_config=storage_config,
+                     triggered_by="child_suspension_step_race",
+                 )
+                 return
+ 
          # Schedule automatic resumption if we have a resume_at time
          resume_at = e.data.get("resume_at") if e.data else None
          if resume_at:
-             schedule_workflow_resumption(child_run_id, resume_at, storage_config)
+             schedule_workflow_resumption(
+                 child_run_id, resume_at, storage_config, triggered_by="child_sleep_hook"
+             )
  
      except ContinueAsNewSignal as e:
          # Child workflow continuing as new execution
@@ -718,7 +956,9 @@ async def _trigger_parent_resumption_celery(
          parent_run_id=parent_run_id,
      )
      # Schedule immediate resumption via Celery
-     schedule_workflow_resumption(parent_run_id, datetime.now(UTC), storage_config)
+     schedule_workflow_resumption(
+         parent_run_id, datetime.now(UTC), storage_config, triggered_by="child_completed"
+     )
  
  
  async def _notify_parent_of_child_completion(
@@ -978,9 +1218,27 @@ async def _recover_workflow_on_worker(
          return run_id
  
      except SuspensionSignal as e:
-         # Workflow suspended again
+         # Workflow suspended again (during recovery)
          await storage.update_run_status(run_id=run_id, status=RunStatus.SUSPENDED)
  
+         # Record WORKFLOW_SUSPENDED event
+         step_id = e.data.get("step_id") if e.data else None
+         step_name = e.data.get("step_name") if e.data else None
+         sleep_id = e.data.get("sleep_id") if e.data else None
+         hook_id = e.data.get("hook_id") if e.data else None
+         child_id = e.data.get("child_id") if e.data else None
+ 
+         suspended_event = create_workflow_suspended_event(
+             run_id=run_id,
+             reason=e.reason,
+             step_id=step_id,
+             step_name=step_name,
+             sleep_id=sleep_id,
+             hook_id=hook_id,
+             child_id=child_id,
+         )
+         await storage.record_event(suspended_event)
+ 
          logger.info(
              f"Recovered workflow suspended: {e.reason}",
              run_id=run_id,
@@ -988,10 +1246,34 @@ async def _recover_workflow_on_worker(
              reason=e.reason,
          )
  
+         # For step dispatch suspensions, check if step already completed/failed
+         if step_id and e.reason.startswith("step_dispatch:"):
+             events = await storage.get_events(run_id)
+             step_finished = any(
+                 evt.type in (EventType.STEP_COMPLETED, EventType.STEP_FAILED)
+                 and evt.data.get("step_id") == step_id
+                 for evt in events
+             )
+             if step_finished:
+                 logger.info(
+                     "Step finished before recovery suspension completed, scheduling resume",
+                     run_id=run_id,
+                     step_id=step_id,
+                 )
+                 schedule_workflow_resumption(
+                     run_id,
+                     datetime.now(UTC),
+                     storage_config=storage_config,
+                     triggered_by="recovery_suspension_step_race",
+                 )
+                 return run_id
+ 
          # Schedule automatic resumption if we have a resume_at time
          resume_at = e.data.get("resume_at") if e.data else None
          if resume_at:
-             schedule_workflow_resumption(run_id, resume_at, storage_config=storage_config)
+             schedule_workflow_resumption(
+                 run_id, resume_at, storage_config=storage_config, triggered_by="recovery_sleep_hook"
+             )
              logger.info(
                  "Scheduled automatic workflow resumption",
                  run_id=run_id,
@@ -1076,10 +1358,22 @@ async def _start_workflow_on_worker(
      workflow_name = workflow_meta.name
      config = get_config()
  
+     run = await storage.get_run(run_id) if run_id else None
+     logger.debug(
+         f"_START_WORKFLOW_ON_WORKER ENTRY: {workflow_name} with run_id={run_id} and status={run.status.value if run else 'N/A'}",
+         run_id=run_id,
+     )
+ 
      # Check idempotency key
      if idempotency_key:
          existing_run = await storage.get_run_by_idempotency_key(idempotency_key)
          if existing_run:
+             logger.info(
+                 "IDEMPOTENCY CHECK: Found existing run",
+                 run_id=existing_run.run_id,
+                 status=existing_run.status.value,
+                 idempotency_key=idempotency_key,
+             )
              # Check if this is a recovery scenario (workflow was RUNNING but worker crashed)
              if existing_run.status == RunStatus.RUNNING:
                  # Check if this is truly a crashed worker or just a duplicate task execution
@@ -1140,27 +1434,76 @@ async def _start_workflow_on_worker(
      if run_id is None:
          run_id = f"run_{uuid.uuid4().hex[:16]}"
  
-     # Check if run already exists (recovery scenario without idempotency key)
+     # Check if run already exists
      existing_run = await storage.get_run(run_id)
-     if existing_run and existing_run.status == RunStatus.RUNNING:
-         # This is a recovery scenario
-         can_recover = await _handle_workflow_recovery(
-             run=existing_run,
-             storage=storage,
-             worker_id=None,
+     if existing_run:
+         logger.info(
+             f"RUN_ID CHECK: Found existing run with status {existing_run.status.value}",
+             run_id=run_id,
+             status=existing_run.status.value,
          )
-         if can_recover:
-             return await _recover_workflow_on_worker(
+ 
+         if existing_run.status == RunStatus.RUNNING:
+             # Recovery scenario - worker crashed while running
+             can_recover = await _handle_workflow_recovery(
                  run=existing_run,
-                 workflow_meta=workflow_meta,
                  storage=storage,
-                 storage_config=storage_config,
+                 worker_id=None,
+             )
+             if can_recover:
+                 return await _recover_workflow_on_worker(
+                     run=existing_run,
+                     workflow_meta=workflow_meta,
+                     storage=storage,
+                     storage_config=storage_config,
+                 )
+             else:
+                 return existing_run.run_id
+ 
+         elif existing_run.status == RunStatus.SUSPENDED:
+             # Workflow is suspended - this start_workflow_task is a duplicate
+             # (scheduled during race condition before workflow suspended)
+             # Return existing run_id - resume_workflow_task will handle it
+             logger.info(
+                 "DUPLICATE START: Workflow already suspended, returning existing run",
+                 run_id=run_id,
+                 status=existing_run.status.value,
              )
-         else:
              return existing_run.run_id
  
+         elif existing_run.status in (
+             RunStatus.COMPLETED,
+             RunStatus.FAILED,
+             RunStatus.CANCELLED,
+         ):
+             # Terminal status - workflow already finished
+             logger.info(
+                 f"TERMINAL STATUS: Workflow already {existing_run.status.value}, returning existing run",
+                 run_id=run_id,
+                 status=existing_run.status.value,
+             )
+             return existing_run.run_id
+ 
+         elif existing_run.status == RunStatus.INTERRUPTED:
+             # Previous recovery failed, try again
+             can_recover = await _handle_workflow_recovery(
+                 run=existing_run,
+                 storage=storage,
+                 worker_id=None,
+             )
+             if can_recover:
+                 return await _recover_workflow_on_worker(
+                     run=existing_run,
+                     workflow_meta=workflow_meta,
+                     storage=storage,
+                     storage_config=storage_config,
+                 )
+             else:
+                 return existing_run.run_id
+ 
+     # Only reach here if no existing run found
      logger.info(
-         f"Starting workflow execution on worker: {workflow_name}",
+         f"FRESH START: Creating new workflow run: {workflow_name}",
          run_id=run_id,
          workflow_name=workflow_name,
      )
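
Condensed, the new existing-run handling above dispatches on status roughly as follows (an editorial summary of this hunk, not package code):

    RUNNING                     -> attempt crash recovery, else return the existing run_id
    SUSPENDED                   -> duplicate start; return the existing run_id (resume task handles it)
    COMPLETED/FAILED/CANCELLED  -> terminal; return the existing run_id
    INTERRUPTED                 -> retry recovery, else return the existing run_id
    no existing run             -> fall through and create a fresh run
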
@@ -1265,9 +1608,28 @@ async def _start_workflow_on_worker(
          return run_id
  
      except SuspensionSignal as e:
-         # Workflow suspended (sleep or hook)
+         # Workflow suspended (sleep, hook, or step dispatch)
          await storage.update_run_status(run_id=run_id, status=RunStatus.SUSPENDED)
  
+         # Record WORKFLOW_SUSPENDED event - this signals that suspension is complete
+         # and resume can be safely scheduled
+         step_id = e.data.get("step_id") if e.data else None
+         step_name = e.data.get("step_name") if e.data else None
+         sleep_id = e.data.get("sleep_id") if e.data else None
+         hook_id = e.data.get("hook_id") if e.data else None
+         child_id = e.data.get("child_id") if e.data else None
+ 
+         suspended_event = create_workflow_suspended_event(
+             run_id=run_id,
+             reason=e.reason,
+             step_id=step_id,
+             step_name=step_name,
+             sleep_id=sleep_id,
+             hook_id=hook_id,
+             child_id=child_id,
+         )
+         await storage.record_event(suspended_event)
+ 
          logger.info(
              f"Workflow suspended on worker: {e.reason}",
              run_id=run_id,
@@ -1275,10 +1637,35 @@ async def _start_workflow_on_worker(
              reason=e.reason,
          )
  
-         # Schedule automatic resumption if we have a resume_at time
+         # For step dispatch suspensions, check if step already completed/failed (race condition)
+         # If so, schedule resume immediately
+         if step_id and e.reason.startswith("step_dispatch:"):
+             events = await storage.get_events(run_id)
+             step_finished = any(
+                 evt.type in (EventType.STEP_COMPLETED, EventType.STEP_FAILED)
+                 and evt.data.get("step_id") == step_id
+                 for evt in events
+             )
+             if step_finished:
+                 logger.info(
+                     "Step finished before suspension completed, scheduling resume",
+                     run_id=run_id,
+                     step_id=step_id,
+                 )
+                 schedule_workflow_resumption(
+                     run_id,
+                     datetime.now(UTC),
+                     storage_config=storage_config,
+                     triggered_by="resume_suspension_step_race",
+                 )
+                 return run_id
+ 
+         # Schedule automatic resumption if we have a resume_at time (for sleep/hook)
          resume_at = e.data.get("resume_at") if e.data else None
          if resume_at:
-             schedule_workflow_resumption(run_id, resume_at, storage_config=storage_config)
+             schedule_workflow_resumption(
+                 run_id, resume_at, storage_config=storage_config, triggered_by="resume_sleep_hook"
+             )
              logger.info(
                  "Scheduled automatic workflow resumption",
                  run_id=run_id,
@@ -1332,6 +1719,7 @@ async def _start_workflow_on_worker(
  
  @celery_app.task(
      name="pyworkflow.resume_workflow",
+     base=WorkflowTask,
      queue="pyworkflow.schedules",
  )
  def resume_workflow_task(
@@ -1351,13 +1739,22 @@ def resume_workflow_task(
      Returns:
          Workflow result if completed, None if suspended again
      """
-     logger.info(f"Resuming workflow on worker: {run_id}")
+     # Ensure logging is configured in forked worker process
+     from pyworkflow.celery.app import _configure_worker_logging
+ 
+     _configure_worker_logging()
+ 
+     logger.info(
+         f"RESUME_WORKFLOW_TASK ENTRY: {run_id}",
+         run_id=run_id,
+         celery_task_id=resume_workflow_task.request.id,
+     )
  
      # Get storage backend
      storage = _get_storage_backend(storage_config)
  
      # Resume workflow directly on worker
-     result = asyncio.run(_resume_workflow_on_worker(run_id, storage, storage_config))
+     result = run_async(_resume_workflow_on_worker(run_id, storage, storage_config))
  
      if result is not None:
          logger.info(f"Workflow completed on worker: {run_id}")
@@ -1369,6 +1766,7 @@ def resume_workflow_task(
  
  @celery_app.task(
      name="pyworkflow.execute_scheduled_workflow",
+     base=WorkflowTask,
      queue="pyworkflow.schedules",
  )
  def execute_scheduled_workflow_task(
@@ -1390,11 +1788,16 @@ def execute_scheduled_workflow_task(
      Returns:
          Workflow run ID if started, None if skipped
      """
+     # Ensure logging is configured in forked worker process
+     from pyworkflow.celery.app import _configure_worker_logging
+ 
+     _configure_worker_logging()
+ 
      logger.info("Executing scheduled workflow", schedule_id=schedule_id)
  
      storage = _get_storage_backend(storage_config)
  
-     return asyncio.run(
+     return run_async(
          _execute_scheduled_workflow(
              schedule_id=schedule_id,
              scheduled_time=datetime.fromisoformat(scheduled_time),
@@ -1587,6 +1990,19 @@ async def _resume_workflow_on_worker(
          )
          return None
  
+     # Prevent duplicate resume execution
+     # Multiple resume tasks can be scheduled for the same workflow (e.g., race
+     # condition between step completion and suspension handler). Only proceed
+     # if the workflow is actually SUSPENDED. If status is RUNNING, another
+     # resume task got there first.
+     if run.status != RunStatus.SUSPENDED:
+         logger.info(
+             f"Workflow status is {run.status.value}, not SUSPENDED - skipping duplicate resume",
+             run_id=run_id,
+             workflow_name=run.workflow_name,
+         )
+         return None
+ 
      # Check for cancellation flag
      cancellation_requested = await storage.check_cancellation_flag(run_id)
  
@@ -1692,9 +2108,27 @@ async def _resume_workflow_on_worker(
          return None
  
      except SuspensionSignal as e:
-         # Workflow suspended again
+         # Workflow suspended again (during resume)
          await storage.update_run_status(run_id=run_id, status=RunStatus.SUSPENDED)
  
+         # Record WORKFLOW_SUSPENDED event
+         step_id = e.data.get("step_id") if e.data else None
+         step_name = e.data.get("step_name") if e.data else None
+         sleep_id = e.data.get("sleep_id") if e.data else None
+         hook_id = e.data.get("hook_id") if e.data else None
+         child_id = e.data.get("child_id") if e.data else None
+ 
+         suspended_event = create_workflow_suspended_event(
+             run_id=run_id,
+             reason=e.reason,
+             step_id=step_id,
+             step_name=step_name,
+             sleep_id=sleep_id,
+             hook_id=hook_id,
+             child_id=child_id,
+         )
+         await storage.record_event(suspended_event)
+ 
          logger.info(
              f"Workflow suspended again on worker: {e.reason}",
              run_id=run_id,
@@ -1702,10 +2136,34 @@ async def _resume_workflow_on_worker(
              reason=e.reason,
          )
  
+         # For step dispatch suspensions, check if step already completed/failed
+         if step_id and e.reason.startswith("step_dispatch:"):
+             events = await storage.get_events(run_id)
+             step_finished = any(
+                 evt.type in (EventType.STEP_COMPLETED, EventType.STEP_FAILED)
+                 and evt.data.get("step_id") == step_id
+                 for evt in events
+             )
+             if step_finished:
+                 logger.info(
+                     "Step finished before resume suspension completed, scheduling resume",
+                     run_id=run_id,
+                     step_id=step_id,
+                 )
+                 schedule_workflow_resumption(
+                     run_id,
+                     datetime.now(UTC),
+                     storage_config=storage_config,
+                     triggered_by="start_suspension_step_race",
+                 )
+                 return None
+ 
          # Schedule automatic resumption if we have a resume_at time
          resume_at = e.data.get("resume_at") if e.data else None
          if resume_at:
-             schedule_workflow_resumption(run_id, resume_at, storage_config=storage_config)
+             schedule_workflow_resumption(
+                 run_id, resume_at, storage_config=storage_config, triggered_by="start_sleep_hook"
+             )
              logger.info(
                  "Scheduled automatic workflow resumption",
                  run_id=run_id,
@@ -1779,13 +2237,15 @@ def _get_storage_backend(config: dict[str, Any] | None = None) -> StorageBackend
      """
      from pyworkflow.storage.config import config_to_storage
  
-     return config_to_storage(config)
+     storage = config_to_storage(config)
+     return storage
  
  
  def schedule_workflow_resumption(
      run_id: str,
      resume_at: datetime,
      storage_config: dict[str, Any] | None = None,
+     triggered_by: str = "unknown",
  ) -> None:
      """
      Schedule automatic workflow resumption after sleep.
@@ -1794,6 +2254,7 @@ def schedule_workflow_resumption(
          run_id: Workflow run ID
          resume_at: When to resume the workflow
          storage_config: Storage backend configuration to pass to the resume task
+         triggered_by: What triggered this resume scheduling (for debugging)
      """
      from datetime import UTC
  
@@ -1802,10 +2263,11 @@ def schedule_workflow_resumption(
      delay_seconds = max(0, int((resume_at - now).total_seconds()))
  
      logger.info(
-         "Scheduling workflow resumption",
+         f"SCHEDULE_RESUME: {triggered_by}",
          run_id=run_id,
          resume_at=resume_at.isoformat(),
          delay_seconds=delay_seconds,
+         triggered_by=triggered_by,
      )
  
      # Schedule the resume task
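
The new triggered_by parameter is what feeds the SCHEDULE_RESUME log line above, so every scheduled resume now carries the reason it was requested. A hedged usage sketch, assuming the function is imported from this module as the diff's call sites do (the run_id and delay are invented):

    from datetime import UTC, datetime, timedelta

    # Resume a suspended run in five minutes, tagging why the resume was scheduled.
    schedule_workflow_resumption(
        run_id="run_0123456789abcdef",
        resume_at=datetime.now(UTC) + timedelta(minutes=5),
        storage_config=None,
        triggered_by="child_sleep_hook",
    )
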