pyworkflow-engine 0.1.11__py3-none-any.whl → 0.1.13__py3-none-any.whl

This diff covers publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
@@ -3,13 +3,14 @@ Celery tasks for distributed workflow and step execution.
 
 These tasks enable:
 - Distributed step execution across workers
-- Automatic retry with exponential backoff
+- Automatic retry with exponential backoff and jitter (via Celery)
 - Scheduled sleep resumption
 - Workflow orchestration
 - Fault tolerance with automatic recovery on worker failures
 """
 
 import asyncio
+import random
 import uuid
 from collections.abc import Callable
 from datetime import UTC, datetime
@@ -18,11 +19,12 @@ from typing import TYPE_CHECKING, Any
 if TYPE_CHECKING:
     from pyworkflow.context.step_context import StepContext
 
-from celery import Task
-from celery.exceptions import WorkerLostError
+from celery.exceptions import MaxRetriesExceededError, Retry
 from loguru import logger
 
 from pyworkflow.celery.app import celery_app
+from pyworkflow.celery.loop import run_async
+from pyworkflow.celery.singleton import SingletonWorkflowTask
 from pyworkflow.core.exceptions import (
     CancellationError,
     ContinueAsNewSignal,
@@ -31,6 +33,7 @@ from pyworkflow.core.exceptions import (
     SuspensionSignal,
 )
 from pyworkflow.core.registry import WorkflowMetadata, get_workflow
+from pyworkflow.core.validation import validate_step_parameters
 from pyworkflow.core.workflow import execute_workflow_with_context
 from pyworkflow.engine.events import (
     EventType,
@@ -39,6 +42,7 @@ from pyworkflow.engine.events import (
     create_workflow_continued_as_new_event,
     create_workflow_interrupted_event,
     create_workflow_started_event,
+    create_workflow_suspended_event,
 )
 from pyworkflow.serialization.decoder import deserialize_args, deserialize_kwargs
 from pyworkflow.serialization.encoder import serialize_args, serialize_kwargs
@@ -46,59 +50,39 @@ from pyworkflow.storage.base import StorageBackend
 from pyworkflow.storage.schemas import RunStatus, WorkflowRun
 
 
-class WorkflowTask(Task):
-    """Base task class for workflow execution with custom error handling."""
-
-    autoretry_for = (RetryableError,)
-    retry_kwargs = {"max_retries": 3}
-    retry_backoff = True
-    retry_backoff_max = 600
-    retry_jitter = True
-
-    def on_failure(self, exc, task_id, args, kwargs, einfo):
-        """
-        Handle task failure.
+def _calculate_exponential_backoff(
+    attempt: int, base: float = 2.0, max_delay: float = 300.0
+) -> float:
+    """
+    Calculate exponential backoff delay with jitter.
 
-        Detects worker loss and handles recovery appropriately:
-        - WorkerLostError: Infrastructure failure, may trigger recovery
-        - Other exceptions: Application failure
-        """
-        is_worker_loss = isinstance(exc, WorkerLostError)
+    Args:
+        attempt: Current retry attempt (0-indexed)
+        base: Base delay multiplier (default: 2.0)
+        max_delay: Maximum delay in seconds (default: 300s / 5 minutes)
 
-        if is_worker_loss:
-            logger.warning(
-                f"Task {self.name} interrupted due to worker loss",
-                task_id=task_id,
-                error=str(exc),
-            )
-            # Note: Recovery is handled when the task is requeued and picked up
-            # by another worker. See _handle_workflow_recovery() for logic.
-        else:
-            logger.error(
-                f"Task {self.name} failed",
-                task_id=task_id,
-                error=str(exc),
-                traceback=einfo.traceback if einfo else None,
-            )
+    Returns:
+        Delay in seconds with jitter applied
 
-    def on_retry(self, exc, task_id, args, kwargs, einfo):
-        """Handle task retry."""
-        logger.warning(
-            f"Task {self.name} retrying",
-            task_id=task_id,
-            error=str(exc),
-            retry_count=self.request.retries,
-        )
+    Formula: min(base * 2^attempt, max_delay) * (0.5 + random(0, 0.5))
+    This gives delays like: ~1s, ~2s, ~4s, ~8s, ~16s, ... capped at max_delay
+    """
+    delay = min(base * (2**attempt), max_delay)
+    # Add jitter: multiply by random factor between 0.5 and 1.0
+    # This prevents thundering herd when multiple tasks retry simultaneously
+    jitter = 0.5 + random.random() * 0.5
+    return delay * jitter
 
 
 @celery_app.task(
     name="pyworkflow.execute_step",
-    base=WorkflowTask,
+    base=SingletonWorkflowTask,
     bind=True,
     queue="pyworkflow.steps",
+    unique_on=["run_id", "step_id"],
 )
 def execute_step_task(
-    self: WorkflowTask,
+    self: SingletonWorkflowTask,
     step_name: str,
     args_json: str,
     kwargs_json: str,
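The behavior of the new backoff helper is easy to check in isolation. The snippet below is illustrative only; it restates the formula from _calculate_exponential_backoff outside the module so the produced delays can be printed directly:

import random

def backoff(attempt: int, base: float = 2.0, max_delay: float = 300.0) -> float:
    # min(base * 2^attempt, max_delay), scaled by a jitter factor in [0.5, 1.0)
    delay = min(base * (2 ** attempt), max_delay)
    return delay * (0.5 + random.random() * 0.5)

random.seed(0)  # fixed seed so the sample run is reproducible
print([round(backoff(a), 1) for a in range(6)])
# Without jitter the capped delays would be 2, 4, 8, 16, 32 and 64 seconds.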
@@ -132,9 +116,13 @@ def execute_step_task(
         Step result (serialized)
 
     Raises:
-        FatalError: For non-retriable errors
-        RetryableError: For retriable errors (triggers automatic retry)
+        FatalError: For non-retriable errors after all retries exhausted
     """
+    # Ensure logging is configured in forked worker process
+    from pyworkflow.celery.app import _configure_worker_logging
+
+    _configure_worker_logging()
+
     from pyworkflow.core.registry import _registry
 
     logger.info(
@@ -144,11 +132,32 @@ def execute_step_task(
         attempt=self.request.retries + 1,
     )
 
+    # Check workflow status before executing - bail out if workflow is in terminal state
+    storage = _get_storage_backend(storage_config)
+    run = run_async(_get_workflow_run_safe(storage, run_id))
+    if run is None:
+        logger.warning(
+            f"Workflow run not found, skipping step execution: {step_name}",
+            run_id=run_id,
+            step_id=step_id,
+        )
+        return None
+
+    # Only proceed if workflow is in a state where step execution makes sense
+    if run.status not in (RunStatus.RUNNING, RunStatus.SUSPENDED):
+        logger.warning(
+            f"Workflow in terminal state ({run.status.value}), skipping step execution: {step_name}",
+            run_id=run_id,
+            step_id=step_id,
+            workflow_status=run.status.value,
+        )
+        return None
+
     # Get step metadata
     step_meta = _registry.get_step(step_name)
     if not step_meta:
         # Record failure and resume workflow
-        asyncio.run(
+        run_async(
             _record_step_failure_and_resume(
                 storage_config=storage_config,
                 run_id=run_id,
@@ -161,10 +170,28 @@ def execute_step_task(
         )
         raise FatalError(f"Step '{step_name}' not found in registry")
 
+    # Ignore processing step if already completed (idempotency)
+    events = run_async(storage.get_events(run_id))
+    already_completed = any(
+        evt.type == EventType.STEP_COMPLETED and evt.data.get("step_id") == step_id
+        for evt in events
+    )
+    if already_completed:
+        logger.warning(
+            "Step already completed by another task, skipping execution",
+            run_id=run_id,
+            step_id=step_id,
+            step_name=step_name,
+        )
+        return None
+
     # Deserialize arguments
     args = deserialize_args(args_json)
     kwargs = deserialize_kwargs(kwargs_json)
 
+    # Validate parameters before execution on worker (defense in depth)
+    validate_step_parameters(step_meta.original_func, args, kwargs, step_name)
+
     # Set up step context if provided (read-only mode)
     step_context_token = None
    readonly_token = None
@@ -197,7 +224,7 @@ def execute_step_task(
 
         # Execute the step
         if asyncio.iscoroutinefunction(step_func):
-            result = asyncio.run(step_func(*args, **kwargs))
+            result = run_async(step_func(*args, **kwargs))
         else:
             result = step_func(*args, **kwargs)
 
@@ -208,7 +235,7 @@ def execute_step_task(
         )
 
         # Record STEP_COMPLETED event and trigger workflow resumption
-        asyncio.run(
+        run_async(
             _record_step_completion_and_resume(
                 storage_config=storage_config,
                 run_id=run_id,
@@ -220,10 +247,23 @@ def execute_step_task(
 
         return result
 
+    except Retry:
+        # Celery retry in progress - let it propagate correctly
+        raise
+
+    except MaxRetriesExceededError:
+        # Celery hit its internal retry limit - treat as fatal
+        logger.error(
+            f"Step exceeded Celery retry limit: {step_name}",
+            run_id=run_id,
+            step_id=step_id,
+        )
+        raise
+
     except FatalError as e:
         logger.error(f"Step failed (fatal): {step_name}", run_id=run_id, step_id=step_id)
         # Record failure and resume workflow (workflow will fail on replay)
-        asyncio.run(
+        run_async(
             _record_step_failure_and_resume(
                 storage_config=storage_config,
                 run_id=run_id,
@@ -239,16 +279,22 @@ def execute_step_task(
     except RetryableError as e:
         # Check if we have retries left
         if self.request.retries < max_retries:
+            # Use explicit retry_after if provided, otherwise use exponential backoff
+            countdown = (
+                e.retry_after
+                if e.retry_after
+                else _calculate_exponential_backoff(self.request.retries)
+            )
             logger.warning(
-                f"Step failed (retriable): {step_name}, retrying...",
+                f"Step failed (retriable): {step_name}, retrying in {countdown:.1f}s...",
                 run_id=run_id,
                 step_id=step_id,
-                retry_after=e.retry_after,
+                countdown=countdown,
                 attempt=self.request.retries + 1,
                 max_retries=max_retries,
             )
             # Let Celery handle the retry - don't resume workflow yet
-            raise self.retry(exc=e, countdown=e.get_retry_delay_seconds() or 60)
+            raise self.retry(countdown=countdown, exc=e)
         else:
             # Max retries exhausted - record failure and resume workflow
             logger.error(
@@ -256,7 +302,7 @@ def execute_step_task(
                 run_id=run_id,
                 step_id=step_id,
             )
-            asyncio.run(
+            run_async(
                 _record_step_failure_and_resume(
                     storage_config=storage_config,
                     run_id=run_id,
@@ -267,20 +313,23 @@ def execute_step_task(
                     is_retryable=False,  # Mark as not retryable since we exhausted retries
                 )
             )
-            raise
+            raise FatalError(f"Step '{step_name}' failed after retries: {str(e)}") from e
 
     except Exception as e:
         # Check if we have retries left
         if self.request.retries < max_retries:
+            # Use exponential backoff for unexpected errors
+            countdown = _calculate_exponential_backoff(self.request.retries)
             logger.warning(
-                f"Step failed (unexpected): {step_name}, retrying...",
+                f"Step failed (unexpected): {step_name}, retrying in {countdown:.1f}s...",
                 run_id=run_id,
                 step_id=step_id,
                 error=str(e),
+                countdown=countdown,
                 attempt=self.request.retries + 1,
             )
-            # Treat unexpected errors as retriable
-            raise self.retry(exc=RetryableError(str(e)), countdown=60)
+            # Treat unexpected errors as retriable with exponential backoff
+            raise self.retry(exc=e, countdown=countdown)
         else:
             # Max retries exhausted
             logger.error(
@@ -290,7 +339,7 @@ def execute_step_task(
                 error=str(e),
                 exc_info=True,
             )
-            asyncio.run(
+            run_async(
                 _record_step_failure_and_resume(
                     storage_config=storage_config,
                     run_id=run_id,
@@ -301,7 +350,7 @@ def execute_step_task(
                     is_retryable=False,
                 )
             )
-            raise
+            raise FatalError(f"Step '{step_name}' failed after retries: {str(e)}") from e
 
     finally:
         # Clean up step context
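The retry handling above leans on standard Celery task machinery: self.retry() raises celery.exceptions.Retry (which is why the new except Retry: clause simply re-raises it), and self.request.retries carries the attempt count across re-deliveries of the same task id. A minimal, self-contained illustration of that pattern, separate from pyworkflow:

from celery import Celery

app = Celery("retry_demo", broker="memory://")

class TransientError(Exception):
    """Stand-in for a temporary failure."""

@app.task(bind=True, max_retries=3)
def flaky_step(self, payload: str) -> str:
    try:
        if self.request.retries < 2:  # fail the first two attempts
            raise TransientError("backend not ready")
        return payload.upper()
    except TransientError as exc:
        # Task.retry() raises celery.exceptions.Retry, which Celery interprets
        # as "re-queue this task after `countdown` seconds".
        delay = min(2.0 * (2 ** self.request.retries), 300.0)
        raise self.retry(exc=exc, countdown=delay)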
@@ -323,9 +372,16 @@ async def _record_step_completion_and_resume(
     result: Any,
 ) -> None:
     """
-    Record STEP_COMPLETED event and trigger workflow resumption.
+    Record STEP_COMPLETED event and trigger workflow resumption if safe.
 
     Called by execute_step_task after successful step execution.
+
+    Only schedules resume if WORKFLOW_SUSPENDED event exists, indicating
+    the workflow has fully suspended. This prevents race conditions where
+    a step completes before the workflow has suspended.
+
+    Idempotency: If STEP_COMPLETED already exists for this step_id, skip
+    recording and resume scheduling (another task already handled it).
     """
     from pyworkflow.engine.events import create_step_completed_event
     from pyworkflow.serialization.encoder import serialize
@@ -337,6 +393,21 @@ async def _record_step_completion_and_resume(
     if hasattr(storage, "connect"):
         await storage.connect()
 
+    # Idempotency check: skip if step already completed
+    events = await storage.get_events(run_id)
+    already_completed = any(
+        evt.type == EventType.STEP_COMPLETED and evt.data.get("step_id") == step_id
+        for evt in events
+    )
+    if already_completed:
+        logger.info(
+            "Step already completed by another task, skipping",
+            run_id=run_id,
+            step_id=step_id,
+            step_name=step_name,
+        )
+        return
+
     # Record STEP_COMPLETED event
     completion_event = create_step_completed_event(
         run_id=run_id,
@@ -346,15 +417,33 @@ async def _record_step_completion_and_resume(
     )
     await storage.record_event(completion_event)
 
-    # Schedule workflow resumption immediately
-    schedule_workflow_resumption(run_id, datetime.now(UTC), storage_config)
+    # Refresh events to include the one we just recorded
+    events = await storage.get_events(run_id)
 
-    logger.info(
-        "Step completed and workflow resumption scheduled",
-        run_id=run_id,
-        step_id=step_id,
-        step_name=step_name,
-    )
+    # Check if workflow has suspended (WORKFLOW_SUSPENDED event exists)
+    # Only schedule resume if workflow has properly suspended
+    has_suspended = any(evt.type == EventType.WORKFLOW_SUSPENDED for evt in events)
+
+    if has_suspended:
+        # Workflow has suspended, safe to schedule resume
+        schedule_workflow_resumption(
+            run_id, datetime.now(UTC), storage_config, triggered_by="step_completed"
+        )
+        logger.info(
+            "Step completed and workflow resumption scheduled",
+            run_id=run_id,
+            step_id=step_id,
+            step_name=step_name,
+        )
+    else:
+        # Workflow hasn't suspended yet - don't schedule resume
+        # The suspension handler will check for step completion and schedule resume
+        logger.info(
+            "Step completed but workflow not yet suspended, skipping resume scheduling",
+            run_id=run_id,
+            step_id=step_id,
+            step_name=step_name,
+        )
 
 
 async def _record_step_failure_and_resume(
@@ -367,10 +456,17 @@ async def _record_step_failure_and_resume(
     is_retryable: bool,
 ) -> None:
     """
-    Record STEP_FAILED event and trigger workflow resumption.
+    Record STEP_FAILED event and trigger workflow resumption if safe.
 
     Called by execute_step_task after step failure (when retries are exhausted).
     The workflow will fail when it replays and sees the failure event.
+
+    Only schedules resume if WORKFLOW_SUSPENDED event exists, indicating
+    the workflow has fully suspended. This prevents race conditions where
+    a step fails before the workflow has suspended.
+
+    Idempotency: If STEP_COMPLETED or terminal STEP_FAILED already exists
+    for this step_id, skip recording and resume scheduling.
     """
     from pyworkflow.engine.events import create_step_failed_event
 
@@ -381,6 +477,26 @@ async def _record_step_failure_and_resume(
     if hasattr(storage, "connect"):
         await storage.connect()
 
+    # Idempotency check: skip if step already completed or terminally failed
+    events = await storage.get_events(run_id)
+    already_handled = any(
+        (evt.type == EventType.STEP_COMPLETED and evt.data.get("step_id") == step_id)
+        or (
+            evt.type == EventType.STEP_FAILED
+            and evt.data.get("step_id") == step_id
+            and not evt.data.get("is_retryable", True)
+        )
+        for evt in events
+    )
+    if already_handled:
+        logger.info(
+            "Step already completed/failed by another task, skipping",
+            run_id=run_id,
+            step_id=step_id,
+            step_name=step_name,
+        )
+        return
+
     # Record STEP_FAILED event
     failure_event = create_step_failed_event(
         run_id=run_id,
@@ -392,16 +508,54 @@ async def _record_step_failure_and_resume(
     )
     await storage.record_event(failure_event)
 
-    # Schedule workflow resumption - workflow will fail on replay
-    schedule_workflow_resumption(run_id, datetime.now(UTC), storage_config)
+    # Refresh events to include the one we just recorded
+    events = await storage.get_events(run_id)
 
-    logger.info(
-        "Step failed and workflow resumption scheduled",
-        run_id=run_id,
-        step_id=step_id,
-        step_name=step_name,
-        error=error,
-    )
+    # Check if workflow has suspended (WORKFLOW_SUSPENDED event exists)
+    # Only schedule resume if workflow has properly suspended
+    has_suspended = any(evt.type == EventType.WORKFLOW_SUSPENDED for evt in events)
+
+    if has_suspended:
+        # Workflow has suspended, safe to schedule resume
+        schedule_workflow_resumption(
+            run_id, datetime.now(UTC), storage_config, triggered_by="step_failed"
+        )
+        logger.info(
+            "Step failed and workflow resumption scheduled",
+            run_id=run_id,
+            step_id=step_id,
+            step_name=step_name,
+            error=error,
+        )
+    else:
+        # Workflow hasn't suspended yet - don't schedule resume
+        # The suspension handler will check for step failure and schedule resume
+        logger.info(
+            "Step failed but workflow not yet suspended, skipping resume scheduling",
+            run_id=run_id,
+            step_id=step_id,
+            step_name=step_name,
+            error=error,
+        )
+
+
+async def _get_workflow_run_safe(
+    storage: StorageBackend,
+    run_id: str,
+) -> WorkflowRun | None:
+    """
+    Safely get workflow run with proper storage connection handling.
+
+    Args:
+        storage: Storage backend
+        run_id: Workflow run ID
+
+    Returns:
+        WorkflowRun or None if not found
+    """
+    if hasattr(storage, "connect"):
+        await storage.connect()
+    return await storage.get_run(run_id)
 
 
 def _resolve_context_class(class_name: str) -> type["StepContext"] | None:
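Both record-and-resume helpers now treat the event log as the source of truth for the resume handshake: the step side schedules a resume only once a WORKFLOW_SUSPENDED event is visible, and the suspension side (in later hunks) performs the mirror-image check for a step that already finished. A small stand-alone illustration of that two-sided check, using plain dicts in place of pyworkflow's event objects:

def should_schedule_resume(events: list[dict], step_id: str) -> bool:
    # Resume only when the step has finished AND the workflow has suspended.
    step_done = any(
        e["type"] in ("STEP_COMPLETED", "STEP_FAILED")
        and e["data"].get("step_id") == step_id
        for e in events
    )
    suspended = any(e["type"] == "WORKFLOW_SUSPENDED" for e in events)
    return step_done and suspended

log = [{"type": "STEP_COMPLETED", "data": {"step_id": "s1"}}]
print(should_schedule_resume(log, "s1"))  # False: workflow not suspended yet
log.append({"type": "WORKFLOW_SUSPENDED", "data": {}})
print(should_schedule_resume(log, "s1"))  # True: safe to schedule the resume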
@@ -430,7 +584,9 @@ def _resolve_context_class(class_name: str) -> type["StepContext"] | None:
 
 @celery_app.task(
     name="pyworkflow.start_workflow",
+    base=SingletonWorkflowTask,
     queue="pyworkflow.workflows",
+    unique_on=["run_id"],
 )
 def start_workflow_task(
     workflow_name: str,
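SingletonWorkflowTask and the unique_on option come from pyworkflow.celery.singleton, which is not part of this diff, so the sketch below is a generic illustration of the usual deduplication pattern rather than the package's actual code. Unrecognized keyword arguments passed to the task decorator become attributes on the generated task class, which is how a unique_on list can reach a custom base class; the base class can then take a short-lived lock keyed on those arguments (Redis is just one possible backend, assumed here) and skip execution when the lock is already held:

import json

import redis
from celery import Task

class DedupTask(Task):
    """Generic sketch: skip execution when a task with the same key is in flight."""

    unique_on: list[str] = []

    def __call__(self, *args, **kwargs):
        if not self.unique_on:
            return super().__call__(*args, **kwargs)
        # Build a stable lock key from the selected keyword arguments.
        key = "dedup:" + self.name + ":" + json.dumps(
            {k: kwargs.get(k) for k in self.unique_on}, sort_keys=True
        )
        client = redis.Redis()
        # SET NX EX: only the first claimant runs; the lock expires on its own.
        if not client.set(key, "1", nx=True, ex=300):
            return None
        try:
            return super().__call__(*args, **kwargs)
        finally:
            client.delete(key)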
@@ -456,7 +612,17 @@ def start_workflow_task(
     Returns:
         Workflow run ID
     """
-    logger.info(f"Starting workflow on worker: {workflow_name}", run_id=run_id)
+    # Ensure logging is configured in forked worker process
+    from pyworkflow.celery.app import _configure_worker_logging
+
+    _configure_worker_logging()
+
+    logger.info(
+        f"START_WORKFLOW_TASK ENTRY: {workflow_name}",
+        run_id=run_id,
+        idempotency_key=idempotency_key,
+        celery_task_id=start_workflow_task.request.id,
+    )
 
     # Get workflow metadata
     workflow_meta = get_workflow(workflow_name)
@@ -471,7 +637,7 @@ def start_workflow_task(
     storage = _get_storage_backend(storage_config)
 
     # Execute workflow directly on worker
-    result_run_id = asyncio.run(
+    result_run_id = run_async(
         _start_workflow_on_worker(
             workflow_meta=workflow_meta,
             args=args,
@@ -489,7 +655,9 @@
 
 @celery_app.task(
     name="pyworkflow.start_child_workflow",
+    base=SingletonWorkflowTask,
     queue="pyworkflow.workflows",
+    unique_on=["child_run_id"],
 )
 def start_child_workflow_task(
     workflow_name: str,
@@ -520,6 +688,11 @@ def start_child_workflow_task(
     Returns:
         Child workflow run ID
     """
+    # Ensure logging is configured in forked worker process
+    from pyworkflow.celery.app import _configure_worker_logging
+
+    _configure_worker_logging()
+
     logger.info(
         f"Starting child workflow on worker: {workflow_name}",
         child_run_id=child_run_id,
@@ -539,7 +712,7 @@ def start_child_workflow_task(
     storage = _get_storage_backend(storage_config)
 
     # Execute child workflow on worker
-    asyncio.run(
+    run_async(
         _execute_child_workflow_on_worker(
             workflow_func=workflow_meta.func,
             workflow_name=workflow_name,
@@ -633,19 +806,62 @@ async def _execute_child_workflow_on_worker(
         await _trigger_parent_resumption_celery(parent_run_id, storage, storage_config)
 
     except SuspensionSignal as e:
-        # Child workflow suspended (e.g., sleep, hook)
+        # Child workflow suspended (e.g., sleep, hook, step dispatch)
         # Update status and don't notify parent yet - handled on child resumption
         await storage.update_run_status(child_run_id, RunStatus.SUSPENDED)
+
+        # Record WORKFLOW_SUSPENDED event
+        step_id = e.data.get("step_id") if e.data else None
+        step_name = e.data.get("step_name") if e.data else None
+        sleep_id = e.data.get("sleep_id") if e.data else None
+        hook_id = e.data.get("hook_id") if e.data else None
+        nested_child_id = e.data.get("child_id") if e.data else None
+
+        suspended_event = create_workflow_suspended_event(
+            run_id=child_run_id,
+            reason=e.reason,
+            step_id=step_id,
+            step_name=step_name,
+            sleep_id=sleep_id,
+            hook_id=hook_id,
+            child_id=nested_child_id,
+        )
+        await storage.record_event(suspended_event)
+
         logger.debug(
             f"Child workflow suspended: {workflow_name}",
             parent_run_id=parent_run_id,
             child_run_id=child_run_id,
         )
 
+        # For step dispatch suspensions, check if step already completed/failed
+        if step_id and e.reason.startswith("step_dispatch:"):
+            events = await storage.get_events(child_run_id)
+            step_finished = any(
+                evt.type in (EventType.STEP_COMPLETED, EventType.STEP_FAILED)
+                and evt.data.get("step_id") == step_id
+                for evt in events
+            )
+            if step_finished:
+                logger.info(
+                    "Child step finished before suspension completed, scheduling resume",
+                    child_run_id=child_run_id,
+                    step_id=step_id,
+                )
+                schedule_workflow_resumption(
+                    child_run_id,
+                    datetime.now(UTC),
+                    storage_config=storage_config,
+                    triggered_by="child_suspension_step_race",
+                )
+                return
+
         # Schedule automatic resumption if we have a resume_at time
         resume_at = e.data.get("resume_at") if e.data else None
         if resume_at:
-            schedule_workflow_resumption(child_run_id, resume_at, storage_config)
+            schedule_workflow_resumption(
+                child_run_id, resume_at, storage_config, triggered_by="child_sleep_hook"
+            )
 
     except ContinueAsNewSignal as e:
         # Child workflow continuing as new execution
@@ -718,7 +934,9 @@ async def _trigger_parent_resumption_celery(
         parent_run_id=parent_run_id,
     )
    # Schedule immediate resumption via Celery
-    schedule_workflow_resumption(parent_run_id, datetime.now(UTC), storage_config)
+    schedule_workflow_resumption(
+        parent_run_id, datetime.now(UTC), storage_config, triggered_by="child_completed"
+    )
 
 
 async def _notify_parent_of_child_completion(
@@ -978,9 +1196,27 @@ async def _recover_workflow_on_worker(
         return run_id
 
     except SuspensionSignal as e:
-        # Workflow suspended again
+        # Workflow suspended again (during recovery)
         await storage.update_run_status(run_id=run_id, status=RunStatus.SUSPENDED)
 
+        # Record WORKFLOW_SUSPENDED event
+        step_id = e.data.get("step_id") if e.data else None
+        step_name = e.data.get("step_name") if e.data else None
+        sleep_id = e.data.get("sleep_id") if e.data else None
+        hook_id = e.data.get("hook_id") if e.data else None
+        child_id = e.data.get("child_id") if e.data else None
+
+        suspended_event = create_workflow_suspended_event(
+            run_id=run_id,
+            reason=e.reason,
+            step_id=step_id,
+            step_name=step_name,
+            sleep_id=sleep_id,
+            hook_id=hook_id,
+            child_id=child_id,
+        )
+        await storage.record_event(suspended_event)
+
         logger.info(
             f"Recovered workflow suspended: {e.reason}",
             run_id=run_id,
@@ -988,10 +1224,34 @@ async def _recover_workflow_on_worker(
             reason=e.reason,
         )
 
+        # For step dispatch suspensions, check if step already completed/failed
+        if step_id and e.reason.startswith("step_dispatch:"):
+            events = await storage.get_events(run_id)
+            step_finished = any(
+                evt.type in (EventType.STEP_COMPLETED, EventType.STEP_FAILED)
+                and evt.data.get("step_id") == step_id
+                for evt in events
+            )
+            if step_finished:
+                logger.info(
+                    "Step finished before recovery suspension completed, scheduling resume",
+                    run_id=run_id,
+                    step_id=step_id,
+                )
+                schedule_workflow_resumption(
+                    run_id,
+                    datetime.now(UTC),
+                    storage_config=storage_config,
+                    triggered_by="recovery_suspension_step_race",
+                )
+                return run_id
+
         # Schedule automatic resumption if we have a resume_at time
         resume_at = e.data.get("resume_at") if e.data else None
         if resume_at:
-            schedule_workflow_resumption(run_id, resume_at, storage_config=storage_config)
+            schedule_workflow_resumption(
+                run_id, resume_at, storage_config=storage_config, triggered_by="recovery_sleep_hook"
+            )
             logger.info(
                 "Scheduled automatic workflow resumption",
                 run_id=run_id,
@@ -1076,10 +1336,22 @@ async def _start_workflow_on_worker(
     workflow_name = workflow_meta.name
     config = get_config()
 
+    run = await storage.get_run(run_id) if run_id else None
+    logger.debug(
+        f"_START_WORKFLOW_ON_WORKER ENTRY: {workflow_name} with run_id={run_id} and status={run.status.value if run else 'N/A'}",
+        run_id=run_id,
+    )
+
     # Check idempotency key
     if idempotency_key:
         existing_run = await storage.get_run_by_idempotency_key(idempotency_key)
         if existing_run:
+            logger.info(
+                "IDEMPOTENCY CHECK: Found existing run",
+                run_id=existing_run.run_id,
+                status=existing_run.status.value,
+                idempotency_key=idempotency_key,
+            )
             # Check if this is a recovery scenario (workflow was RUNNING but worker crashed)
             if existing_run.status == RunStatus.RUNNING:
                 # Check if this is truly a crashed worker or just a duplicate task execution
@@ -1140,27 +1412,76 @@ async def _start_workflow_on_worker(
     if run_id is None:
         run_id = f"run_{uuid.uuid4().hex[:16]}"
 
-    # Check if run already exists (recovery scenario without idempotency key)
+    # Check if run already exists
     existing_run = await storage.get_run(run_id)
-    if existing_run and existing_run.status == RunStatus.RUNNING:
-        # This is a recovery scenario
-        can_recover = await _handle_workflow_recovery(
-            run=existing_run,
-            storage=storage,
-            worker_id=None,
+    if existing_run:
+        logger.info(
+            f"RUN_ID CHECK: Found existing run with status {existing_run.status.value}",
+            run_id=run_id,
+            status=existing_run.status.value,
         )
-        if can_recover:
-            return await _recover_workflow_on_worker(
+
+        if existing_run.status == RunStatus.RUNNING:
+            # Recovery scenario - worker crashed while running
+            can_recover = await _handle_workflow_recovery(
                 run=existing_run,
-                workflow_meta=workflow_meta,
                 storage=storage,
-                storage_config=storage_config,
+                worker_id=None,
+            )
+            if can_recover:
+                return await _recover_workflow_on_worker(
+                    run=existing_run,
+                    workflow_meta=workflow_meta,
+                    storage=storage,
+                    storage_config=storage_config,
+                )
+            else:
+                return existing_run.run_id
+
+        elif existing_run.status == RunStatus.SUSPENDED:
+            # Workflow is suspended - this start_workflow_task is a duplicate
+            # (scheduled during race condition before workflow suspended)
+            # Return existing run_id - resume_workflow_task will handle it
+            logger.info(
+                "DUPLICATE START: Workflow already suspended, returning existing run",
+                run_id=run_id,
+                status=existing_run.status.value,
+            )
+            return existing_run.run_id
+
+        elif existing_run.status in (
+            RunStatus.COMPLETED,
+            RunStatus.FAILED,
+            RunStatus.CANCELLED,
+        ):
+            # Terminal status - workflow already finished
+            logger.info(
+                f"TERMINAL STATUS: Workflow already {existing_run.status.value}, returning existing run",
+                run_id=run_id,
+                status=existing_run.status.value,
             )
-        else:
             return existing_run.run_id
 
+        elif existing_run.status == RunStatus.INTERRUPTED:
+            # Previous recovery failed, try again
+            can_recover = await _handle_workflow_recovery(
+                run=existing_run,
+                storage=storage,
+                worker_id=None,
+            )
+            if can_recover:
+                return await _recover_workflow_on_worker(
+                    run=existing_run,
+                    workflow_meta=workflow_meta,
+                    storage=storage,
+                    storage_config=storage_config,
+                )
+            else:
+                return existing_run.run_id
+
+    # Only reach here if no existing run found
     logger.info(
-        f"Starting workflow execution on worker: {workflow_name}",
+        f"FRESH START: Creating new workflow run: {workflow_name}",
        run_id=run_id,
         workflow_name=workflow_name,
     )
@@ -1265,9 +1586,28 @@ async def _start_workflow_on_worker(
         return run_id
 
     except SuspensionSignal as e:
-        # Workflow suspended (sleep or hook)
+        # Workflow suspended (sleep, hook, or step dispatch)
         await storage.update_run_status(run_id=run_id, status=RunStatus.SUSPENDED)
 
+        # Record WORKFLOW_SUSPENDED event - this signals that suspension is complete
+        # and resume can be safely scheduled
+        step_id = e.data.get("step_id") if e.data else None
+        step_name = e.data.get("step_name") if e.data else None
+        sleep_id = e.data.get("sleep_id") if e.data else None
+        hook_id = e.data.get("hook_id") if e.data else None
+        child_id = e.data.get("child_id") if e.data else None
+
+        suspended_event = create_workflow_suspended_event(
+            run_id=run_id,
+            reason=e.reason,
+            step_id=step_id,
+            step_name=step_name,
+            sleep_id=sleep_id,
+            hook_id=hook_id,
+            child_id=child_id,
+        )
+        await storage.record_event(suspended_event)
+
         logger.info(
             f"Workflow suspended on worker: {e.reason}",
             run_id=run_id,
@@ -1275,10 +1615,35 @@ async def _start_workflow_on_worker(
             reason=e.reason,
         )
 
-        # Schedule automatic resumption if we have a resume_at time
+        # For step dispatch suspensions, check if step already completed/failed (race condition)
+        # If so, schedule resume immediately
+        if step_id and e.reason.startswith("step_dispatch:"):
+            events = await storage.get_events(run_id)
+            step_finished = any(
+                evt.type in (EventType.STEP_COMPLETED, EventType.STEP_FAILED)
+                and evt.data.get("step_id") == step_id
+                for evt in events
+            )
+            if step_finished:
+                logger.info(
+                    "Step finished before suspension completed, scheduling resume",
+                    run_id=run_id,
+                    step_id=step_id,
+                )
+                schedule_workflow_resumption(
+                    run_id,
+                    datetime.now(UTC),
+                    storage_config=storage_config,
+                    triggered_by="resume_suspension_step_race",
+                )
+                return run_id
+
+        # Schedule automatic resumption if we have a resume_at time (for sleep/hook)
         resume_at = e.data.get("resume_at") if e.data else None
         if resume_at:
-            schedule_workflow_resumption(run_id, resume_at, storage_config=storage_config)
+            schedule_workflow_resumption(
+                run_id, resume_at, storage_config=storage_config, triggered_by="resume_sleep_hook"
+            )
             logger.info(
                 "Scheduled automatic workflow resumption",
                 run_id=run_id,
@@ -1332,7 +1697,9 @@
 
 @celery_app.task(
     name="pyworkflow.resume_workflow",
+    base=SingletonWorkflowTask,
     queue="pyworkflow.schedules",
+    unique_on=["run_id"],
 )
 def resume_workflow_task(
     run_id: str,
@@ -1351,13 +1718,22 @@ def resume_workflow_task(
     Returns:
         Workflow result if completed, None if suspended again
     """
-    logger.info(f"Resuming workflow on worker: {run_id}")
+    # Ensure logging is configured in forked worker process
+    from pyworkflow.celery.app import _configure_worker_logging
+
+    _configure_worker_logging()
+
+    logger.info(
+        f"RESUME_WORKFLOW_TASK ENTRY: {run_id}",
+        run_id=run_id,
+        celery_task_id=resume_workflow_task.request.id,
+    )
 
     # Get storage backend
     storage = _get_storage_backend(storage_config)
 
     # Resume workflow directly on worker
-    result = asyncio.run(_resume_workflow_on_worker(run_id, storage, storage_config))
+    result = run_async(_resume_workflow_on_worker(run_id, storage, storage_config))
 
     if result is not None:
         logger.info(f"Workflow completed on worker: {run_id}")
@@ -1369,7 +1745,9 @@
 
 
 @celery_app.task(
     name="pyworkflow.execute_scheduled_workflow",
+    base=SingletonWorkflowTask,
     queue="pyworkflow.schedules",
+    # No unique_on - scheduled workflows create new runs each time, no deduplication needed
 )
 def execute_scheduled_workflow_task(
@@ -1390,11 +1768,16 @@ def execute_scheduled_workflow_task(
     Returns:
         Workflow run ID if started, None if skipped
     """
+    # Ensure logging is configured in forked worker process
+    from pyworkflow.celery.app import _configure_worker_logging
+
+    _configure_worker_logging()
+
     logger.info("Executing scheduled workflow", schedule_id=schedule_id)
 
     storage = _get_storage_backend(storage_config)
 
-    return asyncio.run(
+    return run_async(
         _execute_scheduled_workflow(
             schedule_id=schedule_id,
             scheduled_time=datetime.fromisoformat(scheduled_time),
@@ -1587,6 +1970,19 @@ async def _resume_workflow_on_worker(
         )
         return None
 
+    # Prevent duplicate resume execution
+    # Multiple resume tasks can be scheduled for the same workflow (e.g., race
+    # condition between step completion and suspension handler). Only proceed
+    # if the workflow is actually SUSPENDED. If status is RUNNING, another
+    # resume task got there first.
+    if run.status != RunStatus.SUSPENDED:
+        logger.info(
+            f"Workflow status is {run.status.value}, not SUSPENDED - skipping duplicate resume",
+            run_id=run_id,
+            workflow_name=run.workflow_name,
+        )
+        return None
+
     # Check for cancellation flag
     cancellation_requested = await storage.check_cancellation_flag(run_id)
 
@@ -1692,9 +2088,27 @@ async def _resume_workflow_on_worker(
         return None
 
     except SuspensionSignal as e:
-        # Workflow suspended again
+        # Workflow suspended again (during resume)
         await storage.update_run_status(run_id=run_id, status=RunStatus.SUSPENDED)
 
+        # Record WORKFLOW_SUSPENDED event
+        step_id = e.data.get("step_id") if e.data else None
+        step_name = e.data.get("step_name") if e.data else None
+        sleep_id = e.data.get("sleep_id") if e.data else None
+        hook_id = e.data.get("hook_id") if e.data else None
+        child_id = e.data.get("child_id") if e.data else None
+
+        suspended_event = create_workflow_suspended_event(
+            run_id=run_id,
+            reason=e.reason,
+            step_id=step_id,
+            step_name=step_name,
+            sleep_id=sleep_id,
+            hook_id=hook_id,
+            child_id=child_id,
+        )
+        await storage.record_event(suspended_event)
+
         logger.info(
             f"Workflow suspended again on worker: {e.reason}",
             run_id=run_id,
@@ -1702,10 +2116,34 @@ async def _resume_workflow_on_worker(
             reason=e.reason,
         )
 
+        # For step dispatch suspensions, check if step already completed/failed
+        if step_id and e.reason.startswith("step_dispatch:"):
+            events = await storage.get_events(run_id)
+            step_finished = any(
+                evt.type in (EventType.STEP_COMPLETED, EventType.STEP_FAILED)
+                and evt.data.get("step_id") == step_id
+                for evt in events
+            )
+            if step_finished:
+                logger.info(
+                    "Step finished before resume suspension completed, scheduling resume",
+                    run_id=run_id,
+                    step_id=step_id,
+                )
+                schedule_workflow_resumption(
+                    run_id,
+                    datetime.now(UTC),
+                    storage_config=storage_config,
+                    triggered_by="start_suspension_step_race",
+                )
+                return None
+
         # Schedule automatic resumption if we have a resume_at time
         resume_at = e.data.get("resume_at") if e.data else None
         if resume_at:
-            schedule_workflow_resumption(run_id, resume_at, storage_config=storage_config)
+            schedule_workflow_resumption(
+                run_id, resume_at, storage_config=storage_config, triggered_by="start_sleep_hook"
+            )
             logger.info(
                 "Scheduled automatic workflow resumption",
                 run_id=run_id,
@@ -1779,13 +2217,15 @@ def _get_storage_backend(config: dict[str, Any] | None = None) -> StorageBackend
     """
     from pyworkflow.storage.config import config_to_storage
 
-    return config_to_storage(config)
+    storage = config_to_storage(config)
+    return storage
 
 
 def schedule_workflow_resumption(
     run_id: str,
     resume_at: datetime,
     storage_config: dict[str, Any] | None = None,
+    triggered_by: str = "unknown",
 ) -> None:
     """
     Schedule automatic workflow resumption after sleep.
@@ -1794,6 +2234,7 @@ def schedule_workflow_resumption(
         run_id: Workflow run ID
         resume_at: When to resume the workflow
         storage_config: Storage backend configuration to pass to the resume task
+        triggered_by: What triggered this resume scheduling (for debugging)
     """
     from datetime import UTC
 
@@ -1802,10 +2243,11 @@ def schedule_workflow_resumption(
     delay_seconds = max(0, int((resume_at - now).total_seconds()))
 
     logger.info(
-        "Scheduling workflow resumption",
+        f"SCHEDULE_RESUME: {triggered_by}",
         run_id=run_id,
         resume_at=resume_at.isoformat(),
         delay_seconds=delay_seconds,
+        triggered_by=triggered_by,
     )
 
     # Schedule the resume task
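The hunk stops just before the actual dispatch, so the call that consumes delay_seconds is not shown here. As a generic Celery illustration (task name and arguments are placeholders, not the package's), deferred dispatch is normally done with apply_async using either a relative countdown or an absolute eta:

from datetime import UTC, datetime, timedelta

from celery import Celery

app = Celery("schedule_demo", broker="memory://")

@app.task(name="demo.resume")
def resume(run_id: str) -> None:
    print(f"resuming {run_id}")

resume_at = datetime.now(UTC) + timedelta(minutes=5)
delay_seconds = max(0, int((resume_at - datetime.now(UTC)).total_seconds()))

# countdown is relative (in seconds); eta accepts the absolute datetime instead.
resume.apply_async(kwargs={"run_id": "run_abc123"}, countdown=delay_seconds)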