pyworkflow-engine 0.1.12__py3-none-any.whl → 0.1.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyworkflow/__init__.py +1 -1
- pyworkflow/celery/app.py +10 -0
- pyworkflow/celery/singleton.py +370 -0
- pyworkflow/celery/tasks.py +125 -54
- pyworkflow/context/local.py +46 -0
- pyworkflow/core/step.py +8 -0
- pyworkflow/core/validation.py +112 -0
- pyworkflow/primitives/resume_hook.py +2 -1
- pyworkflow/runtime/base.py +4 -0
- pyworkflow/runtime/celery.py +12 -1
- pyworkflow/runtime/local.py +8 -0
- pyworkflow/storage/base.py +4 -1
- pyworkflow/storage/cassandra.py +30 -25
- pyworkflow/storage/dynamodb.py +32 -16
- pyworkflow/storage/file.py +39 -13
- pyworkflow/storage/memory.py +28 -11
- pyworkflow/storage/mysql.py +27 -11
- pyworkflow/storage/postgres.py +29 -12
- pyworkflow/storage/sqlite.py +29 -12
- {pyworkflow_engine-0.1.12.dist-info → pyworkflow_engine-0.1.14.dist-info}/METADATA +1 -1
- {pyworkflow_engine-0.1.12.dist-info → pyworkflow_engine-0.1.14.dist-info}/RECORD +25 -23
- {pyworkflow_engine-0.1.12.dist-info → pyworkflow_engine-0.1.14.dist-info}/WHEEL +0 -0
- {pyworkflow_engine-0.1.12.dist-info → pyworkflow_engine-0.1.14.dist-info}/entry_points.txt +0 -0
- {pyworkflow_engine-0.1.12.dist-info → pyworkflow_engine-0.1.14.dist-info}/licenses/LICENSE +0 -0
- {pyworkflow_engine-0.1.12.dist-info → pyworkflow_engine-0.1.14.dist-info}/top_level.txt +0 -0
pyworkflow/__init__.py
CHANGED
pyworkflow/celery/app.py
CHANGED
@@ -194,6 +194,16 @@ def create_celery_app(
         worker_task_log_format="[%(asctime)s: %(levelname)s/%(processName)s] [%(task_name)s(%(task_id)s)] %(message)s",
     )

+    # Configure singleton locking for Redis brokers
+    # This enables distributed locking to prevent duplicate task execution
+    is_redis_broker = broker_url.startswith("redis://") or broker_url.startswith("rediss://")
+    if is_redis_broker:
+        app.conf.update(
+            singleton_backend_url=broker_url,
+            singleton_key_prefix="pyworkflow:lock:",
+            singleton_lock_expiry=3600,  # 1 hour TTL (safety net)
+        )
+
     # Note: Logging is configured via Celery signals (worker_init, worker_process_init)
     # to ensure proper initialization AFTER process forking.
     # See on_worker_init() and on_worker_process_init() below.
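The `singleton_*` keys are ordinary Celery settings, so a deployment can tune them after the app is created. A minimal sketch (it assumes `create_celery_app` accepts the broker URL, as the hunk suggests; the override values are illustrative, not package defaults):

```python
from pyworkflow.celery.app import create_celery_app

# Redis broker, so the hunk above enables singleton locking automatically
app = create_celery_app(broker_url="redis://localhost:6379/0")  # parameter name assumed

# Optional per-deployment overrides of the defaults set in create_celery_app
app.conf.update(
    singleton_key_prefix="myapp:lock:",    # namespace locks per application
    singleton_lock_expiry=7200,            # widen the safety-net TTL to 2 hours
    singleton_raise_on_duplicate=False,    # duplicates return the existing AsyncResult
)
```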
pyworkflow/celery/singleton.py
ADDED
@@ -0,0 +1,370 @@
+"""
+Singleton task implementation for PyWorkflow.
+
+Provides Redis-based distributed locking to prevent duplicate task execution.
+Self-contained implementation (no external dependencies beyond redis).
+
+Based on:
+- steinitzu/celery-singleton library concepts
+- FlowHunt's battle-tested refinements for retry-safe lock management
+"""
+
+import inspect
+import json
+from hashlib import md5
+from typing import Any
+from uuid import uuid4
+
+from celery import Task
+from celery.exceptions import WorkerLostError
+from loguru import logger
+
+
+def generate_lock_key(
+    task_name: str,
+    task_args: list[Any] | tuple[Any, ...] | None = None,
+    task_kwargs: dict[str, Any] | None = None,
+    key_prefix: str = "pyworkflow:lock:",
+) -> str:
+    """
+    Generate a unique lock key for a task based on its name and arguments.
+
+    Uses MD5 hash to keep key length reasonable while ensuring uniqueness.
+    """
+    str_args = json.dumps(task_args or [], sort_keys=True, default=str)
+    str_kwargs = json.dumps(task_kwargs or {}, sort_keys=True, default=str)
+    task_hash = md5((task_name + str_args + str_kwargs).encode()).hexdigest()
+    return key_prefix + task_hash
+
+
+class SingletonConfig:
+    """Configuration for singleton task behavior."""
+
+    def __init__(self, app: Any):
+        self.app = app
+
+    @property
+    def backend_url(self) -> str | None:
+        return self.app.conf.get("singleton_backend_url")
+
+    @property
+    def key_prefix(self) -> str:
+        return self.app.conf.get("singleton_key_prefix", "pyworkflow:lock:")
+
+    @property
+    def lock_expiry(self) -> int:
+        return self.app.conf.get("singleton_lock_expiry", 3600)
+
+    @property
+    def raise_on_duplicate(self) -> bool:
+        return self.app.conf.get("singleton_raise_on_duplicate", False)
+
+
+class RedisLockBackend:
+    """Redis backend for distributed locking."""
+
+    def __init__(self, url: str):
+        import redis
+
+        self.redis = redis.from_url(url, decode_responses=True)
+
+    def lock(self, lock_key: str, task_id: str, expiry: int | None = None) -> bool:
+        """Acquire lock atomically. Returns True if acquired."""
+        return bool(self.redis.set(lock_key, task_id, nx=True, ex=expiry))
+
+    def unlock(self, lock_key: str) -> None:
+        """Release the lock."""
+        self.redis.delete(lock_key)
+
+    def get(self, lock_key: str) -> str | None:
+        """Get the task ID holding the lock."""
+        return self.redis.get(lock_key)
+
+
+class DuplicateTaskError(Exception):
+    """Raised when attempting to queue a duplicate singleton task."""
+
+    def __init__(self, message: str, task_id: str):
+        self.task_id = task_id
+        super().__init__(message)
+
+
+class SingletonWorkflowTask(Task):
+    """
+    Base class for singleton workflow tasks with distributed locking.
+
+    Features:
+    - Redis-based lock prevents duplicate execution
+    - Support for unique_on with nested dict/list access (e.g., "data.run_id")
+    - Retry-safe: lock released in on_retry callback to allow retry to acquire it
+    - Lock released on success or when max retries exceeded
+    - Time-based lock expiry as safety net
+
+    Configuration:
+        unique_on: List of argument names to use for uniqueness (e.g., ["run_id", "step_id"])
+            Supports nested access with dot notation (e.g., ["data.run_id"])
+        raise_on_duplicate: If True, raise DuplicateTaskError instead of returning existing result
+        lock_expiry: Lock TTL in seconds (default: 3600 = 1 hour)
+
+    Example:
+        @celery_app.task(
+            base=SingletonWorkflowTask,
+            unique_on=["run_id", "step_id"],
+        )
+        def my_task(run_id: str, step_id: str, data: dict):
+            ...
+    """
+
+    abstract = True
+
+    # Singleton configuration (can be overridden per-task)
+    unique_on: list[str] | str | None = None
+    raise_on_duplicate: bool | None = None
+    lock_expiry: int | None = None
+
+    # Lock behavior
+    release_lock_on_success: bool = True
+    release_lock_on_failure: bool = False  # Only release on max retries exceeded
+
+    # Celery task settings
+    max_retries: int | None = None
+    acks_on_failure_or_timeout: bool = True
+
+    # Cached instances (class-level, shared across task instances)
+    _singleton_backend: RedisLockBackend | None = None
+    _singleton_config: SingletonConfig | None = None
+
+    @property
+    def singleton_config(self) -> SingletonConfig:
+        if self._singleton_config is None:
+            self._singleton_config = SingletonConfig(self.app)
+        return self._singleton_config
+
+    @property
+    def singleton_backend(self) -> RedisLockBackend | None:
+        if self._singleton_backend is None:
+            url = self.singleton_config.backend_url
+            if not url:
+                # Try broker URL if it's Redis
+                broker = self.app.conf.broker_url or ""
+                if broker.startswith("redis://") or broker.startswith("rediss://"):
+                    url = broker
+            if url:
+                self._singleton_backend = RedisLockBackend(url)
+        return self._singleton_backend
+
+    @property
+    def _lock_expiry(self) -> int:
+        if self.lock_expiry is not None:
+            return self.lock_expiry
+        return self.singleton_config.lock_expiry
+
+    @property
+    def _raise_on_duplicate(self) -> bool:
+        if self.raise_on_duplicate is not None:
+            return self.raise_on_duplicate
+        return self.singleton_config.raise_on_duplicate
+
+    def generate_lock(
+        self,
+        task_name: str,
+        task_args: list[Any] | tuple[Any, ...] | None = None,
+        task_kwargs: dict[str, Any] | None = None,
+    ) -> str:
+        """Generate lock key, supporting nested attribute access via unique_on."""
+        unique_on = self.unique_on
+        task_args = task_args or []
+        task_kwargs = task_kwargs or {}
+
+        if unique_on:
+            if isinstance(unique_on, str):
+                unique_on = [unique_on]
+
+            # Bind arguments to function signature
+            sig = inspect.signature(self.run)
+            bound = sig.bind(*task_args, **task_kwargs).arguments
+
+            unique_args: list[Any] = []
+            for key in unique_on:
+                keys = key.split(".")
+                if keys[0] not in bound:
+                    raise ValueError(f"Key '{keys[0]}' not found in task arguments")
+
+                value = bound[keys[0]]
+                # Navigate nested structure (supports one level of nesting)
+                if len(keys) == 2:
+                    nested_key = keys[1]
+                    if isinstance(value, dict):
+                        if nested_key not in value:
+                            raise ValueError(f"Key '{nested_key}' not found in dict")
+                        unique_args.append(value[nested_key])
+                    elif isinstance(value, (list, tuple)):
+                        unique_args.append(value[int(nested_key)])
+                    elif hasattr(value, nested_key):
+                        unique_args.append(getattr(value, nested_key))
+                    else:
+                        raise ValueError(f"Key '{key}' has unsupported type")
+                elif len(keys) == 1:
+                    unique_args.append(value)
+                else:
+                    raise ValueError(f"Key '{key}' has too many levels (max 2)")
+
+            return generate_lock_key(
+                task_name,
+                unique_args,
+                {},
+                key_prefix=self.singleton_config.key_prefix,
+            )
+        else:
+            return generate_lock_key(
+                task_name,
+                list(task_args),
+                task_kwargs,
+                key_prefix=self.singleton_config.key_prefix,
+            )
+
+    def acquire_lock(self, lock_key: str, task_id: str) -> bool:
+        """Attempt to acquire lock. Returns True if successful."""
+        backend = self.singleton_backend
+        if backend is None:
+            return True  # No Redis = no locking
+        return backend.lock(lock_key, task_id, expiry=self._lock_expiry)
+
+    def release_lock(
+        self,
+        task_args: list[Any] | tuple[Any, ...] | None = None,
+        task_kwargs: dict[str, Any] | None = None,
+    ) -> None:
+        """Release the lock for this task."""
+        backend = self.singleton_backend
+        if backend is None:
+            return
+        lock_key = self.generate_lock(self.name, task_args, task_kwargs)
+        backend.unlock(lock_key)
+
+    def get_existing_task_id(self, lock_key: str) -> str | None:
+        """Get task ID holding the lock, if any."""
+        backend = self.singleton_backend
+        if backend is None:
+            return None
+        return backend.get(lock_key)
+
+    def apply_async(
+        self,
+        args: list[Any] | tuple[Any, ...] | None = None,
+        kwargs: dict[str, Any] | None = None,
+        task_id: str | None = None,
+        **options: Any,
+    ) -> Any:
+        """Override apply_async to implement singleton behavior."""
+        args = args or []
+        kwargs = kwargs or {}
+        task_id = task_id or str(uuid4())
+
+        backend = self.singleton_backend
+        if backend is None:
+            # No Redis = normal behavior
+            return super().apply_async(args, kwargs, task_id=task_id, **options)
+
+        lock_key = self.generate_lock(self.name, args, kwargs)
+
+        # Try to acquire lock and run
+        if self.acquire_lock(lock_key, task_id):
+            try:
+                return super().apply_async(args, kwargs, task_id=task_id, **options)
+            except Exception:
+                # Release lock if apply_async fails
+                backend.unlock(lock_key)
+                raise
+
+        # Lock not acquired - check for existing task
+        existing_task_id = self.get_existing_task_id(lock_key)
+        if existing_task_id:
+            logger.debug(
+                "Singleton: duplicate task blocked",
+                task=self.name,
+                existing_task_id=existing_task_id,
+            )
+            if self._raise_on_duplicate:
+                raise DuplicateTaskError(f"Duplicate of task {existing_task_id}", existing_task_id)
+            return self.AsyncResult(existing_task_id)
+
+        # Race condition: lock disappeared, retry
+        if self.acquire_lock(lock_key, task_id):
+            try:
+                return super().apply_async(args, kwargs, task_id=task_id, **options)
+            except Exception:
+                backend.unlock(lock_key)
+                raise
+
+        # Still can't acquire - return existing or submit anyway
+        existing_task_id = self.get_existing_task_id(lock_key)
+        if existing_task_id:
+            return self.AsyncResult(existing_task_id)
+
+        # Fallback: submit anyway (rare edge case)
+        logger.warning(f"Singleton lock unstable, submitting anyway: {self.name}")
+        return super().apply_async(args, kwargs, task_id=task_id, **options)
+
+    def on_success(
+        self, retval: Any, task_id: str, args: tuple[Any, ...], kwargs: dict[str, Any]
+    ) -> None:
+        """Release lock on successful task completion."""
+        if self.release_lock_on_success:
+            self.release_lock(task_args=args, task_kwargs=kwargs)
+
+    def on_failure(
+        self,
+        exc: Exception,
+        task_id: str,
+        args: tuple[Any, ...],
+        kwargs: dict[str, Any],
+        einfo: Any,
+    ) -> None:
+        """
+        Retry-aware lock management on failure.
+
+        - If task will retry: Keep lock
+        - If max retries exceeded: Release lock
+        """
+        max_retries_exceeded = False
+        if hasattr(self, "request") and self.request:
+            current_retries = getattr(self.request, "retries", 0)
+            max_retries = self.max_retries if self.max_retries is not None else 3
+            max_retries_exceeded = current_retries >= max_retries
+
+        if self.release_lock_on_failure or max_retries_exceeded:
+            self.release_lock(task_args=args, task_kwargs=kwargs)
+            if max_retries_exceeded:
+                logger.warning(
+                    f"Task {self.name} failed after {current_retries} retries. Lock released.",
+                    task_id=task_id,
+                    error=str(exc),
+                )
+
+        # Log appropriately
+        if isinstance(exc, WorkerLostError):
+            logger.warning("Task interrupted due to worker loss", task_id=task_id)
+        else:
+            logger.error(
+                f"Task {self.name} failed: {exc}",
+                task_id=task_id,
+                traceback=einfo.traceback if einfo else None,
+            )
+
+    def on_retry(
+        self,
+        exc: Exception,
+        task_id: str,
+        args: tuple[Any, ...],
+        kwargs: dict[str, Any],
+        einfo: Any,
+    ) -> None:
+        """Release lock during retry to allow retry task to acquire it."""
+        # Release lock so retry can acquire it via apply_async()
+        self.release_lock(task_args=args, task_kwargs=kwargs)
+        logger.warning(
+            f"Task {self.name} retrying (lock released for retry)",
+            task_id=task_id,
+            retry_count=self.request.retries,
+        )
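Lock keys are deterministic: the key is the MD5 of the task name plus JSON-canonicalized arguments (`sort_keys=True`), so two submissions with the same effective arguments collide on one Redis key and only the first `SET NX` wins. A quick sketch against the module above:

```python
from pyworkflow.celery.singleton import generate_lock_key

# kwarg order does not matter because json.dumps(..., sort_keys=True) canonicalizes it
k1 = generate_lock_key("pyworkflow.execute_step", ["run-1"], {"step_id": "s1", "attempt": 1})
k2 = generate_lock_key("pyworkflow.execute_step", ["run-1"], {"attempt": 1, "step_id": "s1"})
assert k1 == k2
assert k1.startswith("pyworkflow:lock:")

# Any change to the identifying arguments produces a different key
k3 = generate_lock_key("pyworkflow.execute_step", ["run-2"], {"step_id": "s1", "attempt": 1})
assert k3 != k1
```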
pyworkflow/celery/tasks.py
CHANGED
@@ -19,12 +19,12 @@ from typing import TYPE_CHECKING, Any
 if TYPE_CHECKING:
     from pyworkflow.context.step_context import StepContext

-from celery import Task
-from celery.exceptions import MaxRetriesExceededError, Retry, WorkerLostError
+from celery.exceptions import MaxRetriesExceededError, Retry
 from loguru import logger

 from pyworkflow.celery.app import celery_app
 from pyworkflow.celery.loop import run_async
+from pyworkflow.celery.singleton import SingletonWorkflowTask
 from pyworkflow.core.exceptions import (
     CancellationError,
     ContinueAsNewSignal,
@@ -33,6 +33,7 @@ from pyworkflow.core.exceptions import (
     SuspensionSignal,
 )
 from pyworkflow.core.registry import WorkflowMetadata, get_workflow
+from pyworkflow.core.validation import validate_step_parameters
 from pyworkflow.core.workflow import execute_workflow_with_context
 from pyworkflow.engine.events import (
     EventType,
@@ -73,58 +74,15 @@ def _calculate_exponential_backoff(
     return delay * jitter


-class WorkflowTask(Task):
-    """Base task class for workflow execution with custom error handling."""
-
-    # Allow unlimited Celery-level retries - our code controls the actual limit
-    # via the max_retries parameter passed to execute_step_task
-    max_retries = None
-    # Prevent message requeue loops when task fails
-    acks_on_failure_or_timeout = True
-
-    def on_failure(self, exc, task_id, args, kwargs, einfo):
-        """
-        Handle task failure.
-
-        Detects worker loss and handles recovery appropriately:
-        - WorkerLostError: Infrastructure failure, may trigger recovery
-        - Other exceptions: Application failure
-        """
-        is_worker_loss = isinstance(exc, WorkerLostError)
-        if is_worker_loss:
-            logger.warning(
-                f"Task {self.name} interrupted due to worker loss",
-                task_id=task_id,
-                error=str(exc),
-            )
-            # Note: Recovery is handled when the task is requeued and picked up
-            # by another worker. See _handle_workflow_recovery() for logic.
-        else:
-            logger.error(
-                f"Task {self.name} failed: {str(exc)}",
-                task_id=task_id,
-                error=str(exc),
-                traceback=einfo.traceback if einfo else None,
-            )
-
-    def on_retry(self, exc, task_id, args, kwargs, einfo):
-        """Handle task retry."""
-        logger.warning(
-            f"Task {self.name} retrying",
-            task_id=task_id,
-            error=str(exc),
-            retry_count=self.request.retries,
-        )
-
-
 @celery_app.task(
     name="pyworkflow.execute_step",
-    base=WorkflowTask,
+    base=SingletonWorkflowTask,
     bind=True,
     queue="pyworkflow.steps",
+    unique_on=["run_id", "step_id"],
 )
 def execute_step_task(
-    self: WorkflowTask,
+    self: SingletonWorkflowTask,
     step_name: str,
     args_json: str,
     kwargs_json: str,
@@ -212,10 +170,28 @@ def execute_step_task(
         )
         raise FatalError(f"Step '{step_name}' not found in registry")

+    # Ignore processing step if already completed (idempotency)
+    events = run_async(storage.get_events(run_id))
+    already_completed = any(
+        evt.type == EventType.STEP_COMPLETED and evt.data.get("step_id") == step_id
+        for evt in events
+    )
+    if already_completed:
+        logger.warning(
+            "Step already completed by another task, skipping execution",
+            run_id=run_id,
+            step_id=step_id,
+            step_name=step_name,
+        )
+        return None
+
     # Deserialize arguments
     args = deserialize_args(args_json)
     kwargs = deserialize_kwargs(kwargs_json)

+    # Validate parameters before execution on worker (defense in depth)
+    validate_step_parameters(step_meta.original_func, args, kwargs, step_name)
+
     # Set up step context if provided (read-only mode)
     step_context_token = None
     readonly_token = None
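The check above is a straight scan of the run's event log. A standalone restatement of the predicate (the dict events are hypothetical stand-ins for pyworkflow's event objects, not the real API):

```python
def step_already_completed(events: list[dict], step_id: str) -> bool:
    """True when a STEP_COMPLETED event for this step_id is already recorded."""
    return any(
        e["type"] == "step_completed" and e["data"].get("step_id") == step_id
        for e in events
    )

history = [
    {"type": "step_started", "data": {"step_id": "s1"}},
    {"type": "step_completed", "data": {"step_id": "s1"}},
]
assert step_already_completed(history, "s1")      # duplicate delivery: skip, return None
assert not step_already_completed(history, "s2")  # not yet run: execute normally
```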
@@ -345,7 +321,7 @@ def execute_step_task(
         # Use exponential backoff for unexpected errors
         countdown = _calculate_exponential_backoff(self.request.retries)
         logger.warning(
-            f"Step failed (unexpected): {step_name}, retrying in {countdown:.1f}s
+            f"Step failed (unexpected): {step_name}, retrying in {countdown:.1f}s...: {str(e)}",
             run_id=run_id,
             step_id=step_id,
             error=str(e),
@@ -608,8 +584,9 @@ def _resolve_context_class(class_name: str) -> type["StepContext"] | None:

 @celery_app.task(
     name="pyworkflow.start_workflow",
-    base=WorkflowTask,
+    base=SingletonWorkflowTask,
     queue="pyworkflow.workflows",
+    unique_on=["run_id"],
 )
 def start_workflow_task(
     workflow_name: str,
@@ -678,8 +655,9 @@ def start_workflow_task(

 @celery_app.task(
     name="pyworkflow.start_child_workflow",
-    base=WorkflowTask,
+    base=SingletonWorkflowTask,
     queue="pyworkflow.workflows",
+    unique_on=["child_run_id"],
 )
 def start_child_workflow_task(
     workflow_name: str,
@@ -1719,12 +1697,14 @@ async def _start_workflow_on_worker(

 @celery_app.task(
     name="pyworkflow.resume_workflow",
-    base=WorkflowTask,
+    base=SingletonWorkflowTask,
     queue="pyworkflow.schedules",
+    unique_on=["run_id"],
 )
 def resume_workflow_task(
     run_id: str,
     storage_config: dict[str, Any] | None = None,
+    triggered_by_hook_id: str | None = None,
 ) -> Any | None:
     """
     Resume a suspended workflow.
@@ -1735,6 +1715,9 @@ def resume_workflow_task(
     Args:
         run_id: Workflow run ID to resume
         storage_config: Storage backend configuration
+        triggered_by_hook_id: Optional hook ID that triggered this resume.
+            Used to prevent spurious resumes when a workflow
+            has already moved past the triggering hook.

     Returns:
         Workflow result if completed, None if suspended again
@@ -1748,13 +1731,18 @@ def resume_workflow_task(
         f"RESUME_WORKFLOW_TASK ENTRY: {run_id}",
         run_id=run_id,
         celery_task_id=resume_workflow_task.request.id,
+        triggered_by_hook_id=triggered_by_hook_id,
     )

     # Get storage backend
     storage = _get_storage_backend(storage_config)

     # Resume workflow directly on worker
-    result = run_async(_resume_workflow_on_worker(run_id, storage, storage_config))
+    result = run_async(
+        _resume_workflow_on_worker(
+            run_id, storage, storage_config, triggered_by_hook_id=triggered_by_hook_id
+        )
+    )

     if result is not None:
         logger.info(f"Workflow completed on worker: {run_id}")
@@ -1766,8 +1754,9 @@ def resume_workflow_task(

 @celery_app.task(
     name="pyworkflow.execute_scheduled_workflow",
-    base=WorkflowTask,
+    base=SingletonWorkflowTask,
     queue="pyworkflow.schedules",
+    # No unique_on - scheduled workflows create new runs each time, no deduplication needed
 )
 def execute_scheduled_workflow_task(
     schedule_id: str,
@@ -1960,15 +1949,81 @@ async def _complete_pending_sleeps(
     return updated_events


+def _is_hook_still_relevant(hook_id: str, events: list[Any]) -> bool:
+    """
+    Check if a hook is still relevant for resuming the workflow.
+
+    A hook is "still relevant" if there are no newer hooks created after
+    this hook was received. This prevents spurious resumes when:
+    1. resume_hook() is called multiple times for the same hook
+    2. The workflow moved past the first resume and created a new hook
+    3. The duplicate resume task runs but the workflow is now waiting on a different hook
+
+    Args:
+        hook_id: The hook ID that triggered the resume
+        events: List of workflow events
+
+    Returns:
+        True if the hook is still relevant, False if workflow has moved past it
+    """
+    from pyworkflow.engine.events import EventType
+
+    # Sort events by sequence to process in order
+    sorted_events = sorted(events, key=lambda e: e.sequence or 0)
+
+    # Find the sequence number of HOOK_RECEIVED for this hook
+    hook_received_sequence = None
+    for event in sorted_events:
+        if event.type == EventType.HOOK_RECEIVED and event.data.get("hook_id") == hook_id:
+            hook_received_sequence = event.sequence
+            break
+
+    if hook_received_sequence is None:
+        # Hook was never received - shouldn't happen, but allow resume
+        logger.warning(
+            f"Hook {hook_id} was not found in HOOK_RECEIVED events, allowing resume",
+            hook_id=hook_id,
+        )
+        return True
+
+    # Check if there's a HOOK_CREATED event after this hook was received
+    # (indicating the workflow has moved past this hook and created a new one)
+    for event in sorted_events:
+        if event.type == EventType.HOOK_CREATED:
+            event_sequence = event.sequence or 0
+            if event_sequence > hook_received_sequence:
+                # There's a newer hook - this resume is stale
+                newer_hook_id = event.data.get("hook_id")
+                logger.debug(
+                    f"Found newer hook {newer_hook_id} (seq {event_sequence}) "
+                    f"after triggered hook {hook_id} (received at seq {hook_received_sequence})",
+                    hook_id=hook_id,
+                    newer_hook_id=newer_hook_id,
+                )
+                return False
+
+    # No newer hooks created - this resume is still relevant
+    return True
+
+
 async def _resume_workflow_on_worker(
     run_id: str,
     storage: StorageBackend,
     storage_config: dict[str, Any] | None = None,
+    triggered_by_hook_id: str | None = None,
 ) -> Any | None:
     """
     Internal function to resume workflow on Celery worker.

     This mirrors the logic from testing.py but runs on workers.
+
+    Args:
+        run_id: Workflow run ID to resume
+        storage: Storage backend
+        storage_config: Storage configuration for task dispatch
+        triggered_by_hook_id: Optional hook ID that triggered this resume.
+            If provided, we verify the hook is still relevant
+            before resuming to prevent spurious resumes.
     """
     from pyworkflow.core.exceptions import WorkflowNotFoundError

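The staleness rule reduces to a sequence comparison: a resume triggered by hook H is stale exactly when some hook was created after H's HOOK_RECEIVED event. A self-contained sketch of that rule (the `Evt` dataclass is a hypothetical stand-in for pyworkflow's event objects, not the real API):

```python
from dataclasses import dataclass

@dataclass
class Evt:
    type: str       # "hook_received" | "hook_created"
    hook_id: str
    sequence: int

def is_stale(hook_id: str, events: list[Evt]) -> bool:
    received = next(
        (e.sequence for e in events if e.type == "hook_received" and e.hook_id == hook_id),
        None,
    )
    if received is None:
        return False  # unknown hook: allow the resume, as _is_hook_still_relevant does
    return any(e.type == "hook_created" and e.sequence > received for e in events)

events = [
    Evt("hook_created", "h1", 1),
    Evt("hook_received", "h1", 2),   # resume_hook("h1") enqueued a resume task
    Evt("hook_created", "h2", 3),    # workflow moved on and now waits on h2
]
assert is_stale("h1", events)        # a duplicate resume for h1 is skipped
assert not is_stale("h2", events)
```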
@@ -2003,6 +2058,22 @@ async def _resume_workflow_on_worker(
         )
         return None

+    # If this resume was triggered by a specific hook, verify the hook is still relevant.
+    # A hook is "stale" if the workflow has already moved past it (created a newer hook).
+    # This prevents spurious resumes from duplicate resume_hook() calls.
+    if triggered_by_hook_id:
+        events = await storage.get_events(run_id)
+        hook_still_relevant = _is_hook_still_relevant(triggered_by_hook_id, events)
+        if not hook_still_relevant:
+            logger.info(
+                f"Hook {triggered_by_hook_id} is no longer relevant (workflow moved past it), "
+                "skipping spurious resume",
+                run_id=run_id,
+                workflow_name=run.workflow_name,
+                triggered_by_hook_id=triggered_by_hook_id,
+            )
+            return None
+
     # Check for cancellation flag
     cancellation_requested = await storage.check_cancellation_flag(run_id)
