pyworkflow-engine 0.1.12__py3-none-any.whl → 0.1.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pyworkflow/__init__.py CHANGED
@@ -29,7 +29,7 @@ Quick Start:
  >>> run_id = await start(my_workflow, "Alice")
  """

- __version__ = "0.1.12"
+ __version__ = "0.1.14"

  # Configuration
  from pyworkflow.config import (
pyworkflow/celery/app.py CHANGED
@@ -194,6 +194,16 @@ def create_celery_app(
          worker_task_log_format="[%(asctime)s: %(levelname)s/%(processName)s] [%(task_name)s(%(task_id)s)] %(message)s",
      )

+     # Configure singleton locking for Redis brokers
+     # This enables distributed locking to prevent duplicate task execution
+     is_redis_broker = broker_url.startswith("redis://") or broker_url.startswith("rediss://")
+     if is_redis_broker:
+         app.conf.update(
+             singleton_backend_url=broker_url,
+             singleton_key_prefix="pyworkflow:lock:",
+             singleton_lock_expiry=3600,  # 1 hour TTL (safety net)
+         )
+
      # Note: Logging is configured via Celery signals (worker_init, worker_process_init)
      # to ensure proper initialization AFTER process forking.
      # See on_worker_init() and on_worker_process_init() below.
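The singleton_* settings above are read by the SingletonWorkflowTask base class added in the new module below; the lock itself reduces to a single atomic Redis SET NX EX call. A minimal sketch of that primitive, outside the diff, with placeholder URL, key, and task ID values:

import redis

r = redis.from_url("redis://localhost:6379/0", decode_responses=True)

# First caller wins: SET with nx=True only sets the key if it does not already exist,
# and ex= gives it a TTL as a safety net against locks orphaned by crashed workers.
lock_key = "pyworkflow:lock:" + "d41d8cd98f00b204e9800998ecf8427e"  # prefix + MD5 of task name/args
acquired = bool(r.set(lock_key, "task-id-123", nx=True, ex=3600))
if not acquired:
    holder = r.get(lock_key)  # task ID currently holding the lock
    print(f"duplicate blocked; existing task: {holder}")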
pyworkflow/celery/singleton.py ADDED
@@ -0,0 +1,370 @@
+ """
+ Singleton task implementation for PyWorkflow.
+
+ Provides Redis-based distributed locking to prevent duplicate task execution.
+ Self-contained implementation (no external dependencies beyond redis).
+
+ Based on:
+ - steinitzu/celery-singleton library concepts
+ - FlowHunt's battle-tested refinements for retry-safe lock management
+ """
+
+ import inspect
+ import json
+ from hashlib import md5
+ from typing import Any
+ from uuid import uuid4
+
+ from celery import Task
+ from celery.exceptions import WorkerLostError
+ from loguru import logger
+
+
+ def generate_lock_key(
+     task_name: str,
+     task_args: list[Any] | tuple[Any, ...] | None = None,
+     task_kwargs: dict[str, Any] | None = None,
+     key_prefix: str = "pyworkflow:lock:",
+ ) -> str:
+     """
+     Generate a unique lock key for a task based on its name and arguments.
+
+     Uses MD5 hash to keep key length reasonable while ensuring uniqueness.
+     """
+     str_args = json.dumps(task_args or [], sort_keys=True, default=str)
+     str_kwargs = json.dumps(task_kwargs or {}, sort_keys=True, default=str)
+     task_hash = md5((task_name + str_args + str_kwargs).encode()).hexdigest()
+     return key_prefix + task_hash
+
+
+ class SingletonConfig:
+     """Configuration for singleton task behavior."""
+
+     def __init__(self, app: Any):
+         self.app = app
+
+     @property
+     def backend_url(self) -> str | None:
+         return self.app.conf.get("singleton_backend_url")
+
+     @property
+     def key_prefix(self) -> str:
+         return self.app.conf.get("singleton_key_prefix", "pyworkflow:lock:")
+
+     @property
+     def lock_expiry(self) -> int:
+         return self.app.conf.get("singleton_lock_expiry", 3600)
+
+     @property
+     def raise_on_duplicate(self) -> bool:
+         return self.app.conf.get("singleton_raise_on_duplicate", False)
+
+
+ class RedisLockBackend:
+     """Redis backend for distributed locking."""
+
+     def __init__(self, url: str):
+         import redis
+
+         self.redis = redis.from_url(url, decode_responses=True)
+
+     def lock(self, lock_key: str, task_id: str, expiry: int | None = None) -> bool:
+         """Acquire lock atomically. Returns True if acquired."""
+         return bool(self.redis.set(lock_key, task_id, nx=True, ex=expiry))
+
+     def unlock(self, lock_key: str) -> None:
+         """Release the lock."""
+         self.redis.delete(lock_key)
+
+     def get(self, lock_key: str) -> str | None:
+         """Get the task ID holding the lock."""
+         return self.redis.get(lock_key)
+
+
+ class DuplicateTaskError(Exception):
+     """Raised when attempting to queue a duplicate singleton task."""
+
+     def __init__(self, message: str, task_id: str):
+         self.task_id = task_id
+         super().__init__(message)
+
+
+ class SingletonWorkflowTask(Task):
+     """
+     Base class for singleton workflow tasks with distributed locking.
+
+     Features:
+     - Redis-based lock prevents duplicate execution
+     - Support for unique_on with nested dict/list access (e.g., "data.run_id")
+     - Retry-safe: lock released in on_retry callback to allow retry to acquire it
+     - Lock released on success or when max retries exceeded
+     - Time-based lock expiry as safety net
+
+     Configuration:
+         unique_on: List of argument names to use for uniqueness (e.g., ["run_id", "step_id"])
+             Supports nested access with dot notation (e.g., ["data.run_id"])
+         raise_on_duplicate: If True, raise DuplicateTaskError instead of returning existing result
+         lock_expiry: Lock TTL in seconds (default: 3600 = 1 hour)
+
+     Example:
+         @celery_app.task(
+             base=SingletonWorkflowTask,
+             unique_on=["run_id", "step_id"],
+         )
+         def my_task(run_id: str, step_id: str, data: dict):
+             ...
+     """
+
+     abstract = True
+
+     # Singleton configuration (can be overridden per-task)
+     unique_on: list[str] | str | None = None
+     raise_on_duplicate: bool | None = None
+     lock_expiry: int | None = None
+
+     # Lock behavior
+     release_lock_on_success: bool = True
+     release_lock_on_failure: bool = False  # Only release on max retries exceeded
+
+     # Celery task settings
+     max_retries: int | None = None
+     acks_on_failure_or_timeout: bool = True
+
+     # Cached instances (class-level, shared across task instances)
+     _singleton_backend: RedisLockBackend | None = None
+     _singleton_config: SingletonConfig | None = None
+
+     @property
+     def singleton_config(self) -> SingletonConfig:
+         if self._singleton_config is None:
+             self._singleton_config = SingletonConfig(self.app)
+         return self._singleton_config
+
+     @property
+     def singleton_backend(self) -> RedisLockBackend | None:
+         if self._singleton_backend is None:
+             url = self.singleton_config.backend_url
+             if not url:
+                 # Try broker URL if it's Redis
+                 broker = self.app.conf.broker_url or ""
+                 if broker.startswith("redis://") or broker.startswith("rediss://"):
+                     url = broker
+             if url:
+                 self._singleton_backend = RedisLockBackend(url)
+         return self._singleton_backend
+
+     @property
+     def _lock_expiry(self) -> int:
+         if self.lock_expiry is not None:
+             return self.lock_expiry
+         return self.singleton_config.lock_expiry
+
+     @property
+     def _raise_on_duplicate(self) -> bool:
+         if self.raise_on_duplicate is not None:
+             return self.raise_on_duplicate
+         return self.singleton_config.raise_on_duplicate
+
+     def generate_lock(
+         self,
+         task_name: str,
+         task_args: list[Any] | tuple[Any, ...] | None = None,
+         task_kwargs: dict[str, Any] | None = None,
+     ) -> str:
+         """Generate lock key, supporting nested attribute access via unique_on."""
+         unique_on = self.unique_on
+         task_args = task_args or []
+         task_kwargs = task_kwargs or {}
+
+         if unique_on:
+             if isinstance(unique_on, str):
+                 unique_on = [unique_on]
+
+             # Bind arguments to function signature
+             sig = inspect.signature(self.run)
+             bound = sig.bind(*task_args, **task_kwargs).arguments
+
+             unique_args: list[Any] = []
+             for key in unique_on:
+                 keys = key.split(".")
+                 if keys[0] not in bound:
+                     raise ValueError(f"Key '{keys[0]}' not found in task arguments")
+
+                 value = bound[keys[0]]
+                 # Navigate nested structure (supports one level of nesting)
+                 if len(keys) == 2:
+                     nested_key = keys[1]
+                     if isinstance(value, dict):
+                         if nested_key not in value:
+                             raise ValueError(f"Key '{nested_key}' not found in dict")
+                         unique_args.append(value[nested_key])
+                     elif isinstance(value, (list, tuple)):
+                         unique_args.append(value[int(nested_key)])
+                     elif hasattr(value, nested_key):
+                         unique_args.append(getattr(value, nested_key))
+                     else:
+                         raise ValueError(f"Key '{key}' has unsupported type")
+                 elif len(keys) == 1:
+                     unique_args.append(value)
+                 else:
+                     raise ValueError(f"Key '{key}' has too many levels (max 2)")
+
+             return generate_lock_key(
+                 task_name,
+                 unique_args,
+                 {},
+                 key_prefix=self.singleton_config.key_prefix,
+             )
+         else:
+             return generate_lock_key(
+                 task_name,
+                 list(task_args),
+                 task_kwargs,
+                 key_prefix=self.singleton_config.key_prefix,
+             )
+
+     def acquire_lock(self, lock_key: str, task_id: str) -> bool:
+         """Attempt to acquire lock. Returns True if successful."""
+         backend = self.singleton_backend
+         if backend is None:
+             return True  # No Redis = no locking
+         return backend.lock(lock_key, task_id, expiry=self._lock_expiry)
+
+     def release_lock(
+         self,
+         task_args: list[Any] | tuple[Any, ...] | None = None,
+         task_kwargs: dict[str, Any] | None = None,
+     ) -> None:
+         """Release the lock for this task."""
+         backend = self.singleton_backend
+         if backend is None:
+             return
+         lock_key = self.generate_lock(self.name, task_args, task_kwargs)
+         backend.unlock(lock_key)
+
+     def get_existing_task_id(self, lock_key: str) -> str | None:
+         """Get task ID holding the lock, if any."""
+         backend = self.singleton_backend
+         if backend is None:
+             return None
+         return backend.get(lock_key)
+
+     def apply_async(
+         self,
+         args: list[Any] | tuple[Any, ...] | None = None,
+         kwargs: dict[str, Any] | None = None,
+         task_id: str | None = None,
+         **options: Any,
+     ) -> Any:
+         """Override apply_async to implement singleton behavior."""
+         args = args or []
+         kwargs = kwargs or {}
+         task_id = task_id or str(uuid4())
+
+         backend = self.singleton_backend
+         if backend is None:
+             # No Redis = normal behavior
+             return super().apply_async(args, kwargs, task_id=task_id, **options)
+
+         lock_key = self.generate_lock(self.name, args, kwargs)
+
+         # Try to acquire lock and run
+         if self.acquire_lock(lock_key, task_id):
+             try:
+                 return super().apply_async(args, kwargs, task_id=task_id, **options)
+             except Exception:
+                 # Release lock if apply_async fails
+                 backend.unlock(lock_key)
+                 raise
+
+         # Lock not acquired - check for existing task
+         existing_task_id = self.get_existing_task_id(lock_key)
+         if existing_task_id:
+             logger.debug(
+                 "Singleton: duplicate task blocked",
+                 task=self.name,
+                 existing_task_id=existing_task_id,
+             )
+             if self._raise_on_duplicate:
+                 raise DuplicateTaskError(f"Duplicate of task {existing_task_id}", existing_task_id)
+             return self.AsyncResult(existing_task_id)
+
+         # Race condition: lock disappeared, retry
+         if self.acquire_lock(lock_key, task_id):
+             try:
+                 return super().apply_async(args, kwargs, task_id=task_id, **options)
+             except Exception:
+                 backend.unlock(lock_key)
+                 raise
+
+         # Still can't acquire - return existing or submit anyway
+         existing_task_id = self.get_existing_task_id(lock_key)
+         if existing_task_id:
+             return self.AsyncResult(existing_task_id)
+
+         # Fallback: submit anyway (rare edge case)
+         logger.warning(f"Singleton lock unstable, submitting anyway: {self.name}")
+         return super().apply_async(args, kwargs, task_id=task_id, **options)
+
+     def on_success(
+         self, retval: Any, task_id: str, args: tuple[Any, ...], kwargs: dict[str, Any]
+     ) -> None:
+         """Release lock on successful task completion."""
+         if self.release_lock_on_success:
+             self.release_lock(task_args=args, task_kwargs=kwargs)
+
+     def on_failure(
+         self,
+         exc: Exception,
+         task_id: str,
+         args: tuple[Any, ...],
+         kwargs: dict[str, Any],
+         einfo: Any,
+     ) -> None:
+         """
+         Retry-aware lock management on failure.
+
+         - If task will retry: Keep lock
+         - If max retries exceeded: Release lock
+         """
+         max_retries_exceeded = False
+         if hasattr(self, "request") and self.request:
+             current_retries = getattr(self.request, "retries", 0)
+             max_retries = self.max_retries if self.max_retries is not None else 3
+             max_retries_exceeded = current_retries >= max_retries
+
+         if self.release_lock_on_failure or max_retries_exceeded:
+             self.release_lock(task_args=args, task_kwargs=kwargs)
+             if max_retries_exceeded:
+                 logger.warning(
+                     f"Task {self.name} failed after {current_retries} retries. Lock released.",
+                     task_id=task_id,
+                     error=str(exc),
+                 )
+
+         # Log appropriately
+         if isinstance(exc, WorkerLostError):
+             logger.warning("Task interrupted due to worker loss", task_id=task_id)
+         else:
+             logger.error(
+                 f"Task {self.name} failed: {exc}",
+                 task_id=task_id,
+                 traceback=einfo.traceback if einfo else None,
+             )
+
+     def on_retry(
+         self,
+         exc: Exception,
+         task_id: str,
+         args: tuple[Any, ...],
+         kwargs: dict[str, Any],
+         einfo: Any,
+     ) -> None:
+         """Release lock during retry to allow retry task to acquire it."""
+         # Release lock so retry can acquire it via apply_async()
+         self.release_lock(task_args=args, task_kwargs=kwargs)
+         logger.warning(
+             f"Task {self.name} retrying (lock released for retry)",
+             task_id=task_id,
+             retry_count=self.request.retries,
+         )
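Taken together, the module above makes deduplication a property of the task base class. A usage sketch (illustration only, not part of the diff; the task name and arguments are hypothetical, and a Redis broker or singleton_backend_url must be configured):

from pyworkflow.celery.app import celery_app
from pyworkflow.celery.singleton import SingletonWorkflowTask

@celery_app.task(base=SingletonWorkflowTask, bind=True, unique_on=["run_id", "step_id"])
def process_step(self, run_id: str, step_id: str, payload: dict):
    ...

# Two submissions with the same unique_on values map to the same lock key, so the
# second apply_async() cannot acquire the lock and returns an AsyncResult pointing
# at the first task instead of enqueueing a duplicate.
first = process_step.apply_async(kwargs={"run_id": "run-1", "step_id": "s-1", "payload": {}})
second = process_step.apply_async(kwargs={"run_id": "run-1", "step_id": "s-1", "payload": {}})
assert first.id == second.id  # holds while the first task still owns the lock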
@@ -19,12 +19,12 @@ from typing import TYPE_CHECKING, Any
  if TYPE_CHECKING:
      from pyworkflow.context.step_context import StepContext

- from celery import Task
- from celery.exceptions import MaxRetriesExceededError, Retry, WorkerLostError
+ from celery.exceptions import MaxRetriesExceededError, Retry
  from loguru import logger

  from pyworkflow.celery.app import celery_app
  from pyworkflow.celery.loop import run_async
+ from pyworkflow.celery.singleton import SingletonWorkflowTask
  from pyworkflow.core.exceptions import (
      CancellationError,
      ContinueAsNewSignal,
@@ -33,6 +33,7 @@ from pyworkflow.core.exceptions import (
      SuspensionSignal,
  )
  from pyworkflow.core.registry import WorkflowMetadata, get_workflow
+ from pyworkflow.core.validation import validate_step_parameters
  from pyworkflow.core.workflow import execute_workflow_with_context
  from pyworkflow.engine.events import (
      EventType,
@@ -73,58 +74,15 @@ def _calculate_exponential_backoff(
      return delay * jitter


- class WorkflowTask(Task):
-     """Base task class for workflow execution with custom error handling."""
-
-     # Allow unlimited Celery-level retries - our code controls the actual limit
-     # via the max_retries parameter passed to execute_step_task
-     max_retries = None
-     # Prevent message requeue loops when task fails
-     acks_on_failure_or_timeout = True
-
-     def on_failure(self, exc, task_id, args, kwargs, einfo):
-         """
-         Handle task failure.
-
-         Detects worker loss and handles recovery appropriately:
-         - WorkerLostError: Infrastructure failure, may trigger recovery
-         - Other exceptions: Application failure
-         """
-         is_worker_loss = isinstance(exc, WorkerLostError)
-         if is_worker_loss:
-             logger.warning(
-                 f"Task {self.name} interrupted due to worker loss",
-                 task_id=task_id,
-                 error=str(exc),
-             )
-             # Note: Recovery is handled when the task is requeued and picked up
-             # by another worker. See _handle_workflow_recovery() for logic.
-         else:
-             logger.error(
-                 f"Task {self.name} failed: {str(exc)}",
-                 task_id=task_id,
-                 error=str(exc),
-                 traceback=einfo.traceback if einfo else None,
-             )
-
-     def on_retry(self, exc, task_id, args, kwargs, einfo):
-         """Handle task retry."""
-         logger.warning(
-             f"Task {self.name} retrying",
-             task_id=task_id,
-             error=str(exc),
-             retry_count=self.request.retries,
-         )
-
-
  @celery_app.task(
      name="pyworkflow.execute_step",
-     base=WorkflowTask,
+     base=SingletonWorkflowTask,
      bind=True,
      queue="pyworkflow.steps",
+     unique_on=["run_id", "step_id"],
  )
  def execute_step_task(
-     self: WorkflowTask,
+     self: SingletonWorkflowTask,
      step_name: str,
      args_json: str,
      kwargs_json: str,
@@ -212,10 +170,28 @@ def execute_step_task(
          )
          raise FatalError(f"Step '{step_name}' not found in registry")

+     # Ignore processing step if already completed (idempotency)
+     events = run_async(storage.get_events(run_id))
+     already_completed = any(
+         evt.type == EventType.STEP_COMPLETED and evt.data.get("step_id") == step_id
+         for evt in events
+     )
+     if already_completed:
+         logger.warning(
+             "Step already completed by another task, skipping execution",
+             run_id=run_id,
+             step_id=step_id,
+             step_name=step_name,
+         )
+         return None
+
      # Deserialize arguments
      args = deserialize_args(args_json)
      kwargs = deserialize_kwargs(kwargs_json)

+     # Validate parameters before execution on worker (defense in depth)
+     validate_step_parameters(step_meta.original_func, args, kwargs, step_name)
+
      # Set up step context if provided (read-only mode)
      step_context_token = None
      readonly_token = None
@@ -345,7 +321,7 @@ def execute_step_task(
          # Use exponential backoff for unexpected errors
          countdown = _calculate_exponential_backoff(self.request.retries)
          logger.warning(
-             f"Step failed (unexpected): {step_name}, retrying in {countdown:.1f}s...",
+             f"Step failed (unexpected): {step_name}, retrying in {countdown:.1f}s...: {str(e)}",
              run_id=run_id,
              step_id=step_id,
              error=str(e),
@@ -608,8 +584,9 @@ def _resolve_context_class(class_name: str) -> type["StepContext"] | None:

  @celery_app.task(
      name="pyworkflow.start_workflow",
-     base=WorkflowTask,
+     base=SingletonWorkflowTask,
      queue="pyworkflow.workflows",
+     unique_on=["run_id"],
  )
  def start_workflow_task(
      workflow_name: str,
@@ -678,8 +655,9 @@ def start_workflow_task(

  @celery_app.task(
      name="pyworkflow.start_child_workflow",
-     base=WorkflowTask,
+     base=SingletonWorkflowTask,
      queue="pyworkflow.workflows",
+     unique_on=["child_run_id"],
  )
  def start_child_workflow_task(
      workflow_name: str,
@@ -1719,12 +1697,14 @@ async def _start_workflow_on_worker(

  @celery_app.task(
      name="pyworkflow.resume_workflow",
-     base=WorkflowTask,
+     base=SingletonWorkflowTask,
      queue="pyworkflow.schedules",
+     unique_on=["run_id"],
  )
  def resume_workflow_task(
      run_id: str,
      storage_config: dict[str, Any] | None = None,
+     triggered_by_hook_id: str | None = None,
  ) -> Any | None:
      """
      Resume a suspended workflow.
@@ -1735,6 +1715,9 @@ def resume_workflow_task(
      Args:
          run_id: Workflow run ID to resume
          storage_config: Storage backend configuration
+         triggered_by_hook_id: Optional hook ID that triggered this resume.
+             Used to prevent spurious resumes when a workflow
+             has already moved past the triggering hook.

      Returns:
          Workflow result if completed, None if suspended again
@@ -1748,13 +1731,18 @@ def resume_workflow_task(
          f"RESUME_WORKFLOW_TASK ENTRY: {run_id}",
          run_id=run_id,
          celery_task_id=resume_workflow_task.request.id,
+         triggered_by_hook_id=triggered_by_hook_id,
      )

      # Get storage backend
      storage = _get_storage_backend(storage_config)

      # Resume workflow directly on worker
-     result = run_async(_resume_workflow_on_worker(run_id, storage, storage_config))
+     result = run_async(
+         _resume_workflow_on_worker(
+             run_id, storage, storage_config, triggered_by_hook_id=triggered_by_hook_id
+         )
+     )

      if result is not None:
          logger.info(f"Workflow completed on worker: {run_id}")
@@ -1766,8 +1754,9 @@ def resume_workflow_task(

  @celery_app.task(
      name="pyworkflow.execute_scheduled_workflow",
-     base=WorkflowTask,
+     base=SingletonWorkflowTask,
      queue="pyworkflow.schedules",
+     # No unique_on - scheduled workflows create new runs each time, no deduplication needed
  )
  def execute_scheduled_workflow_task(
      schedule_id: str,
@@ -1960,15 +1949,81 @@ async def _complete_pending_sleeps(
      return updated_events


+ def _is_hook_still_relevant(hook_id: str, events: list[Any]) -> bool:
+     """
+     Check if a hook is still relevant for resuming the workflow.
+
+     A hook is "still relevant" if there are no newer hooks created after
+     this hook was received. This prevents spurious resumes when:
+     1. resume_hook() is called multiple times for the same hook
+     2. The workflow moved past the first resume and created a new hook
+     3. The duplicate resume task runs but the workflow is now waiting on a different hook
+
+     Args:
+         hook_id: The hook ID that triggered the resume
+         events: List of workflow events
+
+     Returns:
+         True if the hook is still relevant, False if workflow has moved past it
+     """
+     from pyworkflow.engine.events import EventType
+
+     # Sort events by sequence to process in order
+     sorted_events = sorted(events, key=lambda e: e.sequence or 0)
+
+     # Find the sequence number of HOOK_RECEIVED for this hook
+     hook_received_sequence = None
+     for event in sorted_events:
+         if event.type == EventType.HOOK_RECEIVED and event.data.get("hook_id") == hook_id:
+             hook_received_sequence = event.sequence
+             break
+
+     if hook_received_sequence is None:
+         # Hook was never received - shouldn't happen, but allow resume
+         logger.warning(
+             f"Hook {hook_id} was not found in HOOK_RECEIVED events, allowing resume",
+             hook_id=hook_id,
+         )
+         return True
+
+     # Check if there's a HOOK_CREATED event after this hook was received
+     # (indicating the workflow has moved past this hook and created a new one)
+     for event in sorted_events:
+         if event.type == EventType.HOOK_CREATED:
+             event_sequence = event.sequence or 0
+             if event_sequence > hook_received_sequence:
+                 # There's a newer hook - this resume is stale
+                 newer_hook_id = event.data.get("hook_id")
+                 logger.debug(
+                     f"Found newer hook {newer_hook_id} (seq {event_sequence}) "
+                     f"after triggered hook {hook_id} (received at seq {hook_received_sequence})",
+                     hook_id=hook_id,
+                     newer_hook_id=newer_hook_id,
+                 )
+                 return False
+
+     # No newer hooks created - this resume is still relevant
+     return True
+
+
  async def _resume_workflow_on_worker(
      run_id: str,
      storage: StorageBackend,
      storage_config: dict[str, Any] | None = None,
+     triggered_by_hook_id: str | None = None,
  ) -> Any | None:
      """
      Internal function to resume workflow on Celery worker.

      This mirrors the logic from testing.py but runs on workers.
+
+     Args:
+         run_id: Workflow run ID to resume
+         storage: Storage backend
+         storage_config: Storage configuration for task dispatch
+         triggered_by_hook_id: Optional hook ID that triggered this resume.
+             If provided, we verify the hook is still relevant
+             before resuming to prevent spurious resumes.
      """
      from pyworkflow.core.exceptions import WorkflowNotFoundError

@@ -2003,6 +2058,22 @@ async def _resume_workflow_on_worker(
          )
          return None

+     # If this resume was triggered by a specific hook, verify the hook is still relevant.
+     # A hook is "stale" if the workflow has already moved past it (created a newer hook).
+     # This prevents spurious resumes from duplicate resume_hook() calls.
+     if triggered_by_hook_id:
+         events = await storage.get_events(run_id)
+         hook_still_relevant = _is_hook_still_relevant(triggered_by_hook_id, events)
+         if not hook_still_relevant:
+             logger.info(
+                 f"Hook {triggered_by_hook_id} is no longer relevant (workflow moved past it), "
+                 "skipping spurious resume",
+                 run_id=run_id,
+                 workflow_name=run.workflow_name,
+                 triggered_by_hook_id=triggered_by_hook_id,
+             )
+             return None
+
      # Check for cancellation flag
      cancellation_requested = await storage.check_cancellation_flag(run_id)
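For context, a hypothetical dispatch sketch showing how a hook-triggered resume could pass triggered_by_hook_id; the actual call site is not shown in this diff. The task name and queue come from the resume_workflow_task decorator above; the run and hook IDs are placeholders:

from pyworkflow.celery.app import celery_app

celery_app.send_task(
    "pyworkflow.resume_workflow",
    kwargs={
        "run_id": "run-1",                      # placeholder run ID
        "storage_config": None,                 # assumed to select the default storage backend
        "triggered_by_hook_id": "hook-abc123",  # lets the worker skip stale resumes
    },
    queue="pyworkflow.schedules",
)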