edda-framework 0.6.0__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- edda/__init__.py +39 -5
- edda/app.py +383 -223
- edda/channels.py +992 -0
- edda/compensation.py +22 -22
- edda/context.py +77 -51
- edda/integrations/opentelemetry/hooks.py +7 -2
- edda/locking.py +130 -67
- edda/replay.py +312 -82
- edda/storage/models.py +165 -24
- edda/storage/protocol.py +575 -122
- edda/storage/sqlalchemy_storage.py +2073 -319
- edda/viewer_ui/app.py +558 -127
- edda/viewer_ui/components.py +81 -68
- edda/viewer_ui/data_service.py +61 -25
- edda/viewer_ui/theme.py +200 -0
- edda/workflow.py +43 -0
- {edda_framework-0.6.0.dist-info → edda_framework-0.8.0.dist-info}/METADATA +167 -9
- {edda_framework-0.6.0.dist-info → edda_framework-0.8.0.dist-info}/RECORD +21 -20
- {edda_framework-0.6.0.dist-info → edda_framework-0.8.0.dist-info}/WHEEL +1 -1
- edda/events.py +0 -505
- {edda_framework-0.6.0.dist-info → edda_framework-0.8.0.dist-info}/entry_points.txt +0 -0
- {edda_framework-0.6.0.dist-info → edda_framework-0.8.0.dist-info}/licenses/LICENSE +0 -0
edda/compensation.py
CHANGED
@@ -5,9 +5,12 @@ This module provides compensation transaction support for implementing
 the Saga pattern with automatic rollback on failure.
 """
 
+import logging
 from collections.abc import Callable
 from typing import TYPE_CHECKING, Any, TypeVar
 
+logger = logging.getLogger(__name__)
+
 if TYPE_CHECKING:
     from edda.context import WorkflowContext
 
@@ -197,12 +200,12 @@ async def execute_compensations(ctx: "WorkflowContext") -> None:
 
     # If no compensations, nothing to do
    if not compensations:
-        print(f"[Compensation] No compensations to execute for {ctx.instance_id}")
+        logger.debug("No compensations to execute for %s", ctx.instance_id)
        return
 
    # Mark as compensating BEFORE execution for crash recovery
    # This allows auto-resume to detect and restart incomplete compensation
-    print(f"[Compensation] Starting compensation execution for {ctx.instance_id}")
+    logger.debug("Starting compensation execution for %s", ctx.instance_id)
    await ctx._update_status("compensating", {"started_at": None})
 
    # Get already executed compensations to avoid duplicate execution
@@ -221,8 +224,10 @@ async def execute_compensations(ctx: "WorkflowContext") -> None:
 
        # Skip if already executed (idempotency)
        if compensation_id in executed_compensation_ids:
-            print(
-                f"[Compensation] Skipping already executed compensation: {activity_name} (id={compensation_id})"
+            logger.debug(
+                "Skipping already executed compensation: %s (id=%s)",
+                activity_name,
+                compensation_id,
            )
            continue
 
@@ -232,20 +237,18 @@ async def execute_compensations(ctx: "WorkflowContext") -> None:
 
        # Skip if activity_name is None or not a string
        if not isinstance(activity_name, str):
-            print(f"[Compensation] Invalid activity_name: {activity_name}. Skipping.")
+            logger.warning("Invalid activity_name: %s. Skipping.", activity_name)
            continue
 
        # Log compensation execution
-        print(f"[Compensation] Executing compensation: {activity_name} (id={compensation_id})")
+        logger.info("Executing compensation: %s (id=%s)", activity_name, compensation_id)
 
        try:
            # Look up compensation function from registry
            compensation_func = _COMPENSATION_REGISTRY.get(activity_name)
 
            if compensation_func is None:
-                print(
-                    f"[Compensation] Warning: Function '{activity_name}' not found in registry. Skipping."
-                )
+                logger.warning("Function '%s' not found in registry. Skipping.", activity_name)
                continue
 
            # Execute the compensation function directly
@@ -271,20 +274,21 @@ async def execute_compensations(ctx: "WorkflowContext") -> None:
                # This is expected in concurrent cancellation scenarios - silently ignore
                error_msg = str(record_error)
                if "UNIQUE constraint" in error_msg or "UNIQUE" in error_msg:
-                    print(
-                        f"[Compensation] {activity_name} already recorded by another process, skipping duplicate record"
+                    logger.debug(
+                        "%s already recorded by another process, skipping duplicate record",
+                        activity_name,
                    )
                else:
                    # Other errors should be logged but not break the compensation flow
-                    print(
-                        f"[Compensation] Warning: Failed to record {activity_name} execution: {record_error}"
-                    )
+                    logger.warning("Failed to record %s execution: %s", activity_name, record_error)
 
-            print(f"[Compensation] Successfully executed compensation: {activity_name}")
+            logger.info("Successfully executed compensation: %s", activity_name)
 
        except Exception as error:
            # Log but don't fail the rollback
-            print(f"[Compensation] Failed to execute compensation {activity_name}: {error}")
+            logger.error(
+                "Failed to execute compensation %s: %s", activity_name, error, exc_info=True
+            )
 
            # Record compensation failure in history
            try:
@@ -304,13 +308,9 @@ async def execute_compensations(ctx: "WorkflowContext") -> None:
                # UNIQUE constraint error means another process already recorded this failure
                error_msg = str(record_error)
                if "UNIQUE constraint" in error_msg or "UNIQUE" in error_msg:
-                    print(
-                        f"[Compensation] {activity_name} failure already recorded by another process"
-                    )
+                    logger.debug("%s failure already recorded by another process", activity_name)
                else:
-                    print(
-                        f"[Compensation] Warning: Failed to record compensation failure: {record_error}"
-                    )
+                    logger.warning("Failed to record compensation failure: %s", record_error)
 
 
 async def clear_compensations(ctx: "WorkflowContext") -> None:
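The compensation.py changes are a wholesale migration from print() to a module-level logger with lazy %-style arguments. A minimal standalone sketch of the same pattern (generic Python, not Edda code; handle() is a made-up stand-in):

import logging

logger = logging.getLogger(__name__)

def handle(activity_name: str, compensation_id: str) -> None:
    # %-style arguments are deferred: the string is only interpolated
    # if the DEBUG level is actually enabled on this logger.
    logger.debug("Skipping already executed compensation: %s (id=%s)",
                 activity_name, compensation_id)
    try:
        raise RuntimeError("boom")
    except Exception as error:
        # exc_info=True attaches the full traceback to the log record,
        # which a bare print() of the exception never captured.
        logger.error("Failed to execute compensation %s: %s",
                     activity_name, error, exc_info=True)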
edda/context.py
CHANGED
@@ -9,7 +9,7 @@ from collections.abc import AsyncIterator
 from contextlib import asynccontextmanager
 from typing import TYPE_CHECKING, Any, cast
 
-from edda.
+from edda.channels import ChannelMessage, ReceivedEvent
 from edda.storage.protocol import StorageProtocol
 
 if TYPE_CHECKING:
@@ -191,6 +191,28 @@ class WorkflowContext:
                     extensions=extensions,
                 )
                 self._history_cache[activity_id] = received_event
+            elif event_type == "ChannelMessageReceived":
+                # Cache the message data for receive() replay
+                from datetime import UTC, datetime
+
+                raw_data = event_data.get("data", event_data.get("payload", {}))
+                data: dict[str, Any] | bytes = (
+                    raw_data if isinstance(raw_data, (dict, bytes)) else {}
+                )
+                # Parse published_at if available, otherwise use current time
+                published_at_str = event_data.get("published_at")
+                if published_at_str:
+                    published_at = datetime.fromisoformat(published_at_str.replace("Z", "+00:00"))
+                else:
+                    published_at = datetime.now(UTC)
+                message = ChannelMessage(
+                    data=data,
+                    channel=event_data.get("channel", "unknown"),
+                    id=event_data.get("id", "unknown"),
+                    metadata=event_data.get("metadata") or {},
+                    published_at=published_at,
+                )
+                self._history_cache[activity_id] = message
             elif event_type == "TimerExpired":
                 # Cache the timer result for wait_timer replay
                 # Timer returns None, so we cache the result field
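The new ChannelMessageReceived branch rebuilds a ChannelMessage from recorded history so a replayed receive() can return the cached message instead of blocking again. The timestamp handling is the fiddly part; a small self-contained sketch of just that logic (parse_published_at is a hypothetical helper, not part of Edda):

from datetime import UTC, datetime
from typing import Any

def parse_published_at(event_data: dict[str, Any]) -> datetime:
    # datetime.fromisoformat() rejects a trailing "Z" on older Pythons,
    # so normalize it to an explicit +00:00 offset before parsing.
    published_at_str = event_data.get("published_at")
    if published_at_str:
        return datetime.fromisoformat(published_at_str.replace("Z", "+00:00"))
    # No recorded timestamp: fall back to the current UTC time.
    return datetime.now(UTC)

print(parse_published_at({"published_at": "2024-06-01T12:00:00Z"}))
print(parse_published_at({}))  # falls back to now(UTC)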
@@ -340,56 +362,6 @@ class WorkflowContext:
         """
         await self.storage.update_instance_status(self.instance_id, status, output_data)
 
-    async def _register_event_subscription(
-        self,
-        event_type: str,
-        timeout_seconds: int | None = None,
-        activity_id: str | None = None,
-    ) -> None:
-        """
-        Register an event subscription for wait_event (internal use only).
-
-        This is called when a workflow calls wait_event() and needs to pause
-        until a matching event arrives.
-
-        Args:
-            event_type: CloudEvent type to wait for
-            timeout_seconds: Optional timeout in seconds
-            activity_id: The activity ID where wait_event was called
-        """
-        from datetime import UTC, datetime, timedelta
-
-        timeout_at = None
-        if timeout_seconds is not None:
-            timeout_at = datetime.now(UTC) + timedelta(seconds=timeout_seconds)
-
-        await self.storage.add_event_subscription(
-            instance_id=self.instance_id,
-            event_type=event_type,
-            timeout_at=timeout_at,
-        )
-
-        # Update current activity ID
-        if activity_id is not None:
-            await self.storage.update_instance_activity(self.instance_id, activity_id)
-
-    async def _record_event_received(self, activity_id: str, event_data: dict[str, Any]) -> None:
-        """
-        Record that an event was received during wait_event (internal use only).
-
-        This is called when resuming a workflow after an event arrives.
-
-        Args:
-            activity_id: The activity ID where wait_event was called
-            event_data: The received event data
-        """
-        await self.storage.append_history(
-            instance_id=self.instance_id,
-            activity_id=activity_id,
-            event_type="EventReceived",
-            event_data={"event_data": event_data},
-        )
-
     async def _push_compensation(self, compensation_action: Any, activity_id: str) -> None:
         """
         Register a compensation action for this workflow (internal use only).
@@ -479,6 +451,60 @@ class WorkflowContext:
         """
         return self.storage.in_transaction()
 
+    async def recur(self, **kwargs: Any) -> None:
+        """
+        Restart the workflow with fresh history (Erlang-style tail recursion).
+
+        This method prevents unbounded history growth in long-running loops by:
+        1. Completing the current workflow instance (marking it as "recurred")
+        2. Archiving the current history (not deleting it)
+        3. Starting a new workflow instance with the provided arguments
+        4. Linking the new instance to the old one via `continued_from`
+
+        This is similar to Erlang's tail recursion pattern, where calling the same
+        function at the end of a loop prevents stack growth. In Edda, `recur()`
+        prevents history growth.
+
+        Args:
+            **kwargs: Arguments to pass to the new workflow instance.
+                These become the input parameters for the next iteration.
+
+        Raises:
+            RecurException: Always raised to signal the ReplayEngine to handle
+                the recur operation. This exception should not be caught.
+
+        Example:
+            >>> @workflow
+            ... async def notification_service(ctx: WorkflowContext, processed_count: int = 0):
+            ...     await join_group(ctx, group="order_watchers")
+            ...
+            ...     count = 0
+            ...     while True:
+            ...         msg = await wait_message(ctx, channel="order.completed")
+            ...         await send_notification(ctx, msg.data, activity_id=f"notify:{msg.id}")
+            ...
+            ...         count += 1
+            ...         if count >= 1000:
+            ...             # Reset history every 1000 iterations
+            ...             await ctx.recur(processed_count=processed_count + count)
+            ...             # Code after recur() is never executed
+
+        Note:
+            - Group memberships are NOT automatically transferred. You must re-join
+              groups in the new iteration if needed.
+            - The old workflow's history is archived, not deleted.
+            - The new instance has a `continued_from` field pointing to the old instance.
+            - During replay, if recur() was already called, this raises immediately
+              without re-executing previous activities.
+        """
+        from edda.pydantic_utils import to_json_dict
+        from edda.workflow import RecurException
+
+        # Convert Pydantic models and Enums to JSON-compatible values
+        processed_kwargs = {k: to_json_dict(v) for k, v in kwargs.items()}
+
+        raise RecurException(kwargs=processed_kwargs)
+
     def __repr__(self) -> str:
         """String representation of the context."""
         return (
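recur() itself only normalizes the arguments and raises; per the docstring, the ReplayEngine is responsible for archiving history and starting the linked successor instance. A rough sketch of that control flow under stated assumptions (the exception shape matches the raise site above; run_instance is a hypothetical engine loop, not Edda's implementation):

from typing import Any

class RecurException(Exception):
    """Signals the engine to restart the workflow with fresh history."""
    def __init__(self, kwargs: dict[str, Any]) -> None:
        super().__init__("workflow recurred")
        self.kwargs = kwargs

async def run_instance(workflow, **inputs: Any) -> None:
    # Hypothetical engine loop: each RecurException ends one instance
    # and starts a successor linked via continued_from.
    while True:
        try:
            await workflow(**inputs)
            return  # workflow completed normally
        except RecurException as recur:
            # Mark the old instance "recurred" and archive its history
            # (sketched away here); the kwargs seed the next instance.
            inputs = recur.kwargs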
edda/integrations/opentelemetry/hooks.py
CHANGED
@@ -22,9 +22,14 @@ try:
     from opentelemetry.context import Context
     from opentelemetry.sdk.resources import Resource
     from opentelemetry.sdk.trace import TracerProvider
-    from opentelemetry.sdk.trace.export import
+    from opentelemetry.sdk.trace.export import (
+        BatchSpanProcessor,
+        ConsoleSpanExporter,
+    )
     from opentelemetry.trace import Span, Status, StatusCode, Tracer
-    from opentelemetry.trace.propagation.tracecontext import
+    from opentelemetry.trace.propagation.tracecontext import (
+        TraceContextTextMapPropagator,
+    )
 
     _OPENTELEMETRY_AVAILABLE = True
 except ImportError:
edda/locking.py
CHANGED
@@ -6,7 +6,9 @@ distributed locks in multi-pod deployments.
 """
 
 import asyncio
+import logging
 import os
+import random
 import uuid
 from collections.abc import AsyncIterator
 from contextlib import asynccontextmanager, suppress
@@ -14,6 +16,8 @@ from typing import Any
 
 from edda.storage.protocol import StorageProtocol
 
+logger = logging.getLogger(__name__)
+
 
 def generate_worker_id(service_name: str) -> str:
     """
@@ -188,6 +192,7 @@ async def _refresh_lock_periodically(
 
 async def cleanup_stale_locks_periodically(
     storage: StorageProtocol,
+    worker_id: str,
     interval: int = 60,
 ) -> None:
     """
@@ -199,30 +204,49 @@ async def cleanup_stale_locks_periodically(
     Note: This function only cleans up locks without resuming workflows.
     For automatic workflow resumption, use auto_resume_stale_workflows_periodically().
 
+    Uses system-level locking to ensure only one pod executes cleanup at a time.
+
     Example:
         >>> asyncio.create_task(
-        ...     cleanup_stale_locks_periodically(storage, interval=60)
+        ...     cleanup_stale_locks_periodically(storage, worker_id, interval=60)
        ... )
 
    Args:
        storage: Storage backend
+        worker_id: Unique identifier for this worker (for global lock coordination)
        interval: Cleanup interval in seconds (default: 60)
    """
    with suppress(asyncio.CancelledError):
        while True:
-            await asyncio.sleep(interval)
+            # Add jitter to prevent thundering herd in multi-pod deployments
+            jitter = random.uniform(0, interval * 0.3)
+            await asyncio.sleep(interval + jitter)
+
+            # Try to acquire global lock for this task
+            lock_acquired = await storage.try_acquire_system_lock(
+                lock_name="cleanup_stale_locks",
+                worker_id=worker_id,
+                timeout_seconds=interval,
+            )
+
+            if not lock_acquired:
+                # Another pod is handling this task
+                continue
 
-            # Clean up stale locks
-            workflows = await storage.cleanup_stale_locks()
+            try:
+                # Clean up stale locks
+                workflows = await storage.cleanup_stale_locks()
 
-            if len(workflows) > 0:
-                # In a real implementation, use proper logging
-                print(f"Cleaned up {len(workflows)} stale locks")
+                if len(workflows) > 0:
+                    logger.info("Cleaned up %d stale locks", len(workflows))
+            finally:
+                await storage.release_system_lock("cleanup_stale_locks", worker_id)
 
 
 async def auto_resume_stale_workflows_periodically(
     storage: StorageProtocol,
     replay_engine: Any,
+    worker_id: str,
     interval: int = 60,
 ) -> None:
     """
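Both periodic tasks now share one discipline: sleep with jitter, try to take a named system-wide lock, do the work, always release. A condensed standalone sketch of that loop (run_periodically is illustrative; the storage methods mirror the StorageProtocol calls visible in this diff):

import asyncio
import random

async def run_periodically(storage, worker_id: str, interval: int = 60) -> None:
    while True:
        # Jitter spreads wakeups across pods: sleep in [interval, 1.3 * interval).
        await asyncio.sleep(interval + random.uniform(0, interval * 0.3))

        if not await storage.try_acquire_system_lock(
            lock_name="cleanup_stale_locks",
            worker_id=worker_id,
            timeout_seconds=interval,
        ):
            continue  # another pod won this round; try again next tick
        try:
            await storage.cleanup_stale_locks()
        finally:
            # Release even if cleanup raised, so the lock never wedges.
            await storage.release_system_lock("cleanup_stale_locks", worker_id)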
@@ -231,83 +255,122 @@ async def auto_resume_stale_workflows_periodically(
     This combines lock cleanup with automatic workflow resumption, ensuring
     that workflows interrupted by worker crashes are automatically recovered.
 
+    Uses system-level locking to ensure only one pod executes this task at a time,
+    preventing duplicate workflow execution (CRITICAL for safety).
+
     Example:
         >>> asyncio.create_task(
         ...     auto_resume_stale_workflows_periodically(
-        ...         storage, replay_engine, interval=60
+        ...         storage, replay_engine, worker_id, interval=60
        ...     )
        ... )
 
    Args:
        storage: Storage backend
        replay_engine: ReplayEngine instance for resuming workflows
+        worker_id: Unique identifier for this worker (for global lock coordination)
        interval: Cleanup interval in seconds (default: 60)
    """
    with suppress(asyncio.CancelledError):
        while True:
-            await asyncio.sleep(interval)
+            # Add jitter to prevent thundering herd in multi-pod deployments
+            jitter = random.uniform(0, interval * 0.3)
+            await asyncio.sleep(interval + jitter)
+
+            # Try to acquire global lock for this task
+            lock_acquired = await storage.try_acquire_system_lock(
+                lock_name="auto_resume_stale_workflows",
+                worker_id=worker_id,
+                timeout_seconds=interval,
+            )
 
-            … (22 removed lines from the 0.6.0 loop body were not captured in this view)
+            if not lock_acquired:
+                # Another pod is handling this task
+                continue
+
+            try:
+                # Clean up stale locks and get workflows to resume
+                workflows_to_resume = await storage.cleanup_stale_locks()
+
+                if len(workflows_to_resume) > 0:
+                    logger.info("Cleaned up %d stale locks", len(workflows_to_resume))
+
+                # Auto-resume workflows
+                for workflow in workflows_to_resume:
+                    instance_id = workflow["instance_id"]
+                    workflow_name = workflow["workflow_name"]
+                    source_hash = workflow["source_hash"]
+                    status = workflow.get("status", "running")
+
+                    try:
+                        # Special handling for workflows in compensating state
+                        if status == "compensating":
+                            # Workflow crashed during compensation execution
+                            # Only re-execute compensations, don't run workflow function
+                            logger.info(
+                                "Auto-resuming compensating workflow: %s "
+                                "(compensation recovery only, no workflow execution)",
+                                instance_id,
+                            )
+                            success = await replay_engine.resume_compensating_workflow(
+                                instance_id
+                            )
+                            if success:
+                                logger.info(
+                                    "Successfully completed compensations for: %s",
+                                    instance_id,
+                                )
+                            else:
+                                logger.warning(
+                                    "Failed to complete compensations for: %s", instance_id
+                                )
+                            continue
+
+                        # Normal workflow resumption (status='running')
+                        # Check if workflow definition matches current Saga registry
+                        # This prevents resuming workflows with outdated/incompatible code
+                        current_definition = await storage.get_current_workflow_definition(
+                            workflow_name
                        )
 
-                        … (18 removed lines from the 0.6.0 definition check were not captured in this view)
+                        if current_definition is None:
+                            logger.warning(
+                                "Skipping auto-resume for %s: "
+                                "workflow '%s' not found in registry",
+                                instance_id,
+                                workflow_name,
+                            )
+                            continue
+
+                        if current_definition["source_hash"] != source_hash:
+                            logger.warning(
+                                "Skipping auto-resume for %s: "
+                                "workflow definition has changed "
+                                "(old hash: %s..., new hash: %s...)",
+                                instance_id,
+                                source_hash[:8],
+                                current_definition["source_hash"][:8],
+                            )
+                            continue
+
+                        # Hash matches - safe to resume
+                        logger.info(
+                            "Auto-resuming workflow: %s (instance: %s)",
+                            workflow_name,
+                            instance_id,
                        )
-                        … (8 removed lines from the 0.6.0 loop body were not captured in this view)
+                        await replay_engine.resume_by_name(instance_id, workflow_name)
+                        logger.info("Successfully resumed workflow: %s", instance_id)
+                    except Exception as e:
+                        # Log error but continue with other workflows
+                        logger.error(
+                            "Failed to auto-resume workflow %s: %s",
+                            instance_id,
+                            e,
+                            exc_info=True,
                        )
-                        … (2 removed lines not captured in this view)
-                        # Hash matches - safe to resume
-                        print(f"Auto-resuming workflow: {workflow_name} (instance: {instance_id})")
-                        await replay_engine.resume_by_name(instance_id, workflow_name)
-                        print(f"Successfully resumed workflow: {instance_id}")
-                    except Exception as e:
-                        # Log error but continue with other workflows
-                        # In a real implementation, use proper logging
-                        print(f"Failed to auto-resume workflow {instance_id}: {e}")
+            finally:
+                await storage.release_system_lock("auto_resume_stale_workflows", worker_id)
 
 
 class LockNotAcquiredError(Exception):
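The gate above compares the instance's stored source_hash with the registry's current definition, so a redeployed pod never resumes an instance against changed workflow code. This diff does not show how the hash is computed; one plausible sketch (source_hash_of is a hypothetical helper, not Edda's):

import hashlib
import inspect

def source_hash_of(func) -> str:
    # Hash the workflow function's source text; any edit to the body
    # yields a different digest, which the auto-resume loop would treat
    # as an incompatible definition and skip.
    source = inspect.getsource(func)
    return hashlib.sha256(source.encode("utf-8")).hexdigest()

async def greet_workflow(ctx, name: str) -> str:
    return f"hello {name}"

print(source_hash_of(greet_workflow)[:8])  # compare old[:8] vs new[:8], as logged above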