edda-framework 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
edda/app.py ADDED
@@ -0,0 +1,996 @@
+"""
+Main application module for the Edda framework.
+
+This module provides the EddaApp class, an ASGI application for
+handling CloudEvents and executing workflows.
+"""
+
+import asyncio
+import json
+import sys
+from collections.abc import Callable
+from typing import Any
+
+import uvloop
+from cloudevents.exceptions import GenericException as CloudEventsException
+from cloudevents.http import from_http
+from sqlalchemy.ext.asyncio import create_async_engine
+
+from edda import workflow
+from edda.hooks import WorkflowHooks
+from edda.locking import auto_resume_stale_workflows_periodically, generate_worker_id
+from edda.outbox.relayer import OutboxRelayer
+from edda.replay import ReplayEngine
+from edda.retry import RetryPolicy
+from edda.storage.sqlalchemy_storage import SQLAlchemyStorage
+
+
+class EddaApp:
+    """
+    ASGI-compatible workflow application with distributed execution support.
+
+    This is the main entry point for the Edda framework. It handles:
+    - the CloudEvents HTTP endpoint
+    - event routing and workflow triggering
+    - distributed locking and coordination
+    - storage management
+    """
+
+    def __init__(
+        self,
+        service_name: str,
+        db_url: str,
+        outbox_enabled: bool = False,
+        broker_url: str = "http://broker-ingress.knative-eventing.svc.cluster.local/default/default",
+        hooks: WorkflowHooks | None = None,
+        default_retry_policy: RetryPolicy | None = None,
+    ):
+        """
+        Initialize the Edda application.
+
+        Args:
+            service_name: Service name for distributed execution (e.g., "order-service")
+            db_url: Database URL (e.g., "sqlite:///workflow.db")
+            outbox_enabled: Enable the transactional outbox pattern
+            broker_url: Knative Broker URL for outbox publishing
+            hooks: Optional WorkflowHooks implementation for observability
+            default_retry_policy: Default retry policy for all activities.
+                If None, uses DEFAULT_RETRY_POLICY (5 attempts, exponential backoff).
+                Can be overridden per activity using @activity(retry_policy=...).
+        """
+        self.db_url = db_url
+        self.service_name = service_name
+        self.outbox_enabled = outbox_enabled
+        self.broker_url = broker_url
+        self.hooks = hooks
+        self.default_retry_policy = default_retry_policy
+
+        # Generate a unique worker ID for this process
+        self.worker_id = generate_worker_id(service_name)
+
+        # Initialize storage
+        self.storage = self._create_storage(db_url)
+
+        # Event handlers registry
+        self.event_handlers: dict[str, list[Callable[..., Any]]] = {}
+
+        # Replay engine (will be initialized in initialize())
+        self.replay_engine: ReplayEngine | None = None
+
+        # Outbox relayer (will be initialized if outbox_enabled)
+        self.outbox_relayer: OutboxRelayer | None = None
+
+        # Background tasks, plus strong references to short-lived event tasks
+        # so they are not garbage-collected before they finish
+        self._background_tasks: list[asyncio.Task[Any]] = []
+        self._event_tasks: set[asyncio.Task[Any]] = set()
+        self._initialized = False
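
A minimal construction sketch, assuming only the constructor shown above (the service name and SQLite URL are illustrative values):

    from edda.app import EddaApp

    app = EddaApp(
        service_name="order-service",
        db_url="sqlite:///workflow.db",  # any SQLAlchemy-style URL works
    )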
+
+    def _create_storage(self, db_url: str) -> SQLAlchemyStorage:
+        """
+        Create a storage backend from a database URL.
+
+        Supports SQLite, PostgreSQL, and MySQL via SQLAlchemy.
+
+        Args:
+            db_url: Database URL in SQLAlchemy format
+                Examples:
+                - SQLite: "sqlite:///saga.db" or "sqlite+aiosqlite:///saga.db"
+                - PostgreSQL: "postgresql+asyncpg://user:pass@localhost/dbname"
+                - MySQL: "mysql+aiomysql://user:pass@localhost/dbname"
+
+        Returns:
+            SQLAlchemyStorage instance
+        """
+        # Convert plain sqlite:// URLs to use the aiosqlite driver.
+        # This branch also covers "sqlite:///:memory:", so no separate
+        # in-memory case is needed.
+        if db_url.startswith("sqlite:///"):
+            db_url = db_url.replace("sqlite:///", "sqlite+aiosqlite:///", 1)
+
+        # Create async engine
+        engine = create_async_engine(
+            db_url,
+            echo=False,  # Set to True for SQL logging
+            future=True,
+        )
+
+        return SQLAlchemyStorage(engine)
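
The driver rewrite above is a plain string substitution; a quick sketch of its behavior (plain Python, nothing framework-specific):

    assert "sqlite:///saga.db".replace(
        "sqlite:///", "sqlite+aiosqlite:///", 1
    ) == "sqlite+aiosqlite:///saga.db"
    # The in-memory form is caught by the same branch:
    assert "sqlite:///:memory:".replace(
        "sqlite:///", "sqlite+aiosqlite:///", 1
    ) == "sqlite+aiosqlite:///:memory:"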
+
+    async def initialize(self) -> None:
+        """
+        Initialize the application.
+
+        This should be called before the app starts receiving requests.
+        """
+        if self._initialized:
+            return
+
+        # Install uvloop for better performance.
+        # Python 3.12+ uses asyncio.set_event_loop_policy() instead of uvloop.install()
+        if sys.version_info >= (3, 12):
+            asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
+        else:
+            uvloop.install()
+
+        # Initialize storage
+        await self.storage.initialize()
+
+        # Initialize replay engine
+        self.replay_engine = ReplayEngine(
+            storage=self.storage,
+            service_name=self.service_name,
+            worker_id=self.worker_id,
+            hooks=self.hooks,
+            default_retry_policy=self.default_retry_policy,
+        )
+
+        # Set global replay engine for workflow decorator
+        workflow.set_replay_engine(self.replay_engine)
+
+        # Initialize outbox relayer if enabled
+        if self.outbox_enabled:
+            self.outbox_relayer = OutboxRelayer(
+                storage=self.storage,
+                broker_url=self.broker_url,
+                poll_interval=1.0,
+                max_retries=3,
+                batch_size=10,
+            )
+            await self.outbox_relayer.start()
+
+        # Auto-register all @workflow-decorated workflows
+        self._auto_register_workflows()
+
+        # Start background tasks
+        self._start_background_tasks()
+
+        self._initialized = True
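
When not running under an ASGI server, the same lifecycle can be driven explicitly; a sketch assuming only the constructor and methods shown in this file:

    import asyncio

    from edda.app import EddaApp


    async def main() -> None:
        app = EddaApp(service_name="demo", db_url="sqlite:///:memory:")
        await app.initialize()  # storage, replay engine, background tasks
        try:
            ...  # hand the app to a server, or drive events manually
        finally:
            await app.shutdown()  # cancels background tasks, closes storage


    asyncio.run(main())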
+
+    async def shutdown(self) -> None:
+        """
+        Shut down the application and clean up resources.
+
+        This should be called when the app is shutting down.
+        """
+        # Stop outbox relayer if enabled
+        if self.outbox_relayer:
+            await self.outbox_relayer.stop()
+
+        # Cancel background tasks
+        for task in self._background_tasks:
+            task.cancel()
+
+        # Wait for tasks to complete
+        await asyncio.gather(*self._background_tasks, return_exceptions=True)
+
+        # Close storage
+        await self.storage.close()
+
+        self._initialized = False
+
+    def _start_background_tasks(self) -> None:
+        """Start background maintenance tasks."""
+        # Task to clean up stale locks and auto-resume workflows
+        auto_resume_task = asyncio.create_task(
+            auto_resume_stale_workflows_periodically(
+                self.storage,
+                self.replay_engine,
+                interval=60,  # Check every 60 seconds
+            )
+        )
+        self._background_tasks.append(auto_resume_task)
+
+        # Task to check expired timers and resume workflows
+        timer_check_task = asyncio.create_task(
+            self._check_expired_timers_periodically(interval=10)  # Check every 10 seconds
+        )
+        self._background_tasks.append(timer_check_task)
+
+        # Task to check expired event timeouts and fail workflows
+        event_timeout_task = asyncio.create_task(
+            self._check_expired_event_timeouts_periodically(interval=10)  # Check every 10 seconds
+        )
+        self._background_tasks.append(event_timeout_task)
+
+    def _auto_register_workflows(self) -> None:
+        """
+        Auto-register workflows with event_handler=True as CloudEvent handlers.
+
+        Only workflows explicitly marked with @workflow(event_handler=True) will be
+        auto-registered. For each eligible workflow, a default handler is registered that:
+        1. Extracts data from the CloudEvent
+        2. Starts the workflow with the data as kwargs
+
+        Manual @app.on_event() registrations take precedence.
+        """
+        from edda.workflow import get_all_workflows
+
+        for workflow_name, workflow_instance in get_all_workflows().items():
+            # Only register if event_handler=True
+            if not workflow_instance.event_handler:
+                continue
+
+            # Skip if already manually registered (manual takes precedence)
+            if workflow_name not in self.event_handlers:
+                self._register_default_workflow_handler(workflow_name, workflow_instance)
+
+    def _register_default_workflow_handler(self, event_type: str, wf: Any) -> None:
+        """
+        Register a default CloudEvent handler for a workflow.
+
+        The default handler extracts the CloudEvent data and passes it
+        as kwargs to workflow.start().
+
+        Args:
+            event_type: CloudEvent type (same as the workflow name)
+            wf: Workflow instance to start when the event is received
+        """
+
+        async def default_handler(event: Any) -> None:
+            """Default handler that starts the workflow with CloudEvent data."""
+            # Extract data from the CloudEvent
+            data = event.get_data()
+
+            # Start the workflow with the data as kwargs
+            if isinstance(data, dict):
+                await wf.start(**data)
+            else:
+                # If data is not a dict, start without arguments
+                await wf.start()
+
+        # Register the handler
+        if event_type not in self.event_handlers:
+            self.event_handlers[event_type] = []
+        self.event_handlers[event_type].append(default_handler)
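
A sketch of a workflow that this auto-registration would pick up. The import path and exact decorator signature are assumptions; only the event_handler=True flag is documented above, and the kwargs are supplied from the event payload via wf.start(**data):

    from edda.workflow import workflow  # assumed import path


    @workflow(event_handler=True)  # registered under the workflow's name as the event type
    async def order_created(order_id: str):  # parameters mirror the event payload
        ...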
+
+    def on_event(
+        self, event_type: str, proto_type: type[Any] | None = None
+    ) -> Callable[[Callable[..., Any]], Callable[..., Any]]:
+        """
+        Decorator to register an event handler.
+
+        Example:
+            >>> @app.on_event("order.created")
+            ... async def handle_order_created(event):
+            ...     await order_workflow.start(...)
+
+        Args:
+            event_type: CloudEvent type to handle
+            proto_type: Optional protobuf message type
+
+        Returns:
+            Decorator function
+        """
+
+        def decorator(func: Callable[..., Any]) -> Callable[..., Any]:
+            if event_type not in self.event_handlers:
+                self.event_handlers[event_type] = []
+            self.event_handlers[event_type].append(func)
+
+            # Store proto_type if provided
+            if proto_type is not None:
+                func._proto_type = proto_type  # type: ignore[attr-defined]
+
+            return func
+
+        return decorator
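
Expanding the doctest above into a slightly fuller handler sketch (order_workflow and the payload key are illustrative):

    @app.on_event("order.created")
    async def handle_order_created(event):
        data = event.get_data() or {}
        await order_workflow.start(order_id=data.get("order_id"))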
+
+    async def handle_cloudevent(self, event: Any, wait: bool = False) -> None:
+        """
+        Handle an incoming CloudEvent.
+
+        This routes the event to registered handlers and delivers events
+        to waiting workflows.
+
+        By default, handlers are executed as background tasks to avoid blocking
+        the HTTP response. Set wait=True for synchronous execution (useful for testing).
+
+        Args:
+            event: CloudEvent instance
+            wait: If True, wait for handlers to complete before returning.
+                If False (default), execute handlers as background tasks.
+        """
+        event_type = event["type"]
+
+        # Find handlers for this event type
+        handlers = self.event_handlers.get(event_type, [])
+
+        if wait:
+            # Synchronous execution (for tests)
+            for handler in handlers:
+                await self._run_handler(handler, event, event_type)
+            await self._deliver_event_to_waiting_workflows_safe(event)
+        else:
+            # Background execution (for production). Keep strong references to
+            # the tasks so they are not garbage-collected before completion.
+            for handler in handlers:
+                task = asyncio.create_task(self._run_handler(handler, event, event_type))
+                self._event_tasks.add(task)
+                task.add_done_callback(self._event_tasks.discard)
+            task = asyncio.create_task(self._deliver_event_to_waiting_workflows_safe(event))
+            self._event_tasks.add(task)
+            task.add_done_callback(self._event_tasks.discard)
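
In tests, wait=True makes delivery synchronous; a sketch assuming pytest-asyncio and an EddaApp instance named app (attribute values are illustrative):

    import pytest
    from cloudevents.http import CloudEvent


    @pytest.mark.asyncio
    async def test_order_created() -> None:
        event = CloudEvent({"type": "order.created", "source": "test"}, {"order_id": "42"})
        await app.handle_cloudevent(event, wait=True)  # handlers finish before this returns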
+
+    async def _run_handler(self, handler: Callable[..., Any], event: Any, event_type: str) -> None:
+        """
+        Run a CloudEvent handler with error handling.
+
+        Args:
+            handler: Event handler function
+            event: CloudEvent instance
+            event_type: Event type for logging
+        """
+        try:
+            await handler(event)
+        except Exception as e:
+            # Log error (in a real implementation, use proper logging)
+            print(f"Error handling event {event_type}: {e}")
+            import traceback
+
+            traceback.print_exc()
+
+    async def _deliver_event_to_waiting_workflows_safe(self, event: Any) -> None:
+        """
+        Deliver event to waiting workflows with error handling.
+
+        Args:
+            event: CloudEvent instance
+        """
+        try:
+            await self._deliver_event_to_waiting_workflows(event)
+        except Exception as e:
+            print(f"Error delivering event to waiting workflows: {e}")
+            import traceback
+
+            traceback.print_exc()
+
+    async def _deliver_event_to_waiting_workflows(self, event: Any) -> None:
+        """
+        Deliver event to workflows waiting for this event type.
+
+        This method:
+        1. Finds workflows waiting for the event type
+        2. Records event data to workflow history
+        3. Removes event subscription
+        4. Resumes the workflow
+
+        Args:
+            event: CloudEvent instance
+        """
+        event_type = event["type"]
+        event_data = event.get_data()
+
+        # Extract CloudEvents metadata
+        event_metadata = {
+            "type": event["type"],
+            "source": event["source"],
+            "id": event["id"],
+            "time": event.get("time"),
+            "datacontenttype": event.get("datacontenttype"),
+            "subject": event.get("subject"),
+        }
+
+        # Extract extension attributes (any attributes not in the standard set)
+        standard_attrs = {
+            "type",
+            "source",
+            "id",
+            "time",
+            "datacontenttype",
+            "subject",
+            "specversion",
+            "data",
+            "data_base64",
+        }
+        extensions = {k: v for k, v in event.get_attributes().items() if k not in standard_attrs}
+
+        # Find workflows waiting for this event type
+        waiting_instances = await self.storage.find_waiting_instances(event_type)
+
+        if not waiting_instances:
+            return  # No workflows waiting for this event
+
+        print(
+            f"[EventDelivery] Found {len(waiting_instances)} workflow(s) waiting for '{event_type}'"
+        )
+
+        for subscription in waiting_instances:
+            instance_id = subscription["instance_id"]
+
+            # Get workflow instance
+            instance = await self.storage.get_instance(instance_id)
+            if not instance:
+                print(f"[EventDelivery] Warning: Instance {instance_id} not found, skipping")
+                continue
+
+            # Check if instance is still waiting
+            if instance.get("status") != "waiting_for_event":
+                print(
+                    f"[EventDelivery] Warning: Instance {instance_id} "
+                    f"status is '{instance.get('status')}', expected 'waiting_for_event', skipping"
+                )
+                continue
+
+            # Get activity_id from the subscription (stored when wait_event was called)
+            activity_id = subscription.get("activity_id")
+            if not activity_id:
+                print(
+                    f"[EventDelivery] Warning: No activity_id in subscription for {instance_id}, skipping"
+                )
+                continue
+
+            workflow_name = instance["workflow_name"]
+
+            # Distributed Coroutines: Acquire lock FIRST to prevent race conditions.
+            # This ensures only ONE pod processes this event, even if multiple pods
+            # receive the event simultaneously.
+            lock_acquired = await self.storage.try_acquire_lock(
+                instance_id, self.worker_id, timeout_seconds=300
+            )
+
+            if not lock_acquired:
+                print(
+                    f"[EventDelivery] Another worker is processing {instance_id}, skipping "
+                    "(distributed coroutine - lock already held)"
+                )
+                continue
+
+            try:
+                print(
+                    f"[EventDelivery] Delivering event to workflow {instance_id} (activity_id: {activity_id})"
+                )
+
+                # 1. Record event data and metadata to history
+                try:
+                    await self.storage.append_history(
+                        instance_id,
+                        activity_id=activity_id,
+                        event_type="EventReceived",
+                        event_data={
+                            "payload": event_data,
+                            "metadata": event_metadata,
+                            "extensions": extensions,
+                        },
+                    )
+                except Exception as history_error:
+                    # If the history entry already exists (UNIQUE constraint), this event was
+                    # already delivered by another worker in a multi-process environment.
+                    # Skip workflow resumption to prevent duplicate processing.
+                    print(
+                        f"[EventDelivery] History already exists for activity_id {activity_id}: {history_error}"
+                    )
+                    print(
+                        f"[EventDelivery] Event '{event_type}' was already delivered by another worker, skipping"
+                    )
+                    continue
+
+                # 2. Remove event subscription
+                await self.storage.remove_event_subscription(instance_id, event_type)
+
+                # 3. Resume workflow (lock already held by this worker - distributed coroutine pattern)
+                if self.replay_engine is None:
+                    print("[EventDelivery] Error: Replay engine not initialized")
+                    continue
+
+                await self.replay_engine.resume_by_name(
+                    instance_id, workflow_name, already_locked=True
+                )
+
+                print(
+                    f"[EventDelivery] ✅ Resumed workflow {instance_id} after receiving '{event_type}'"
+                )
+
+            except Exception as e:
+                print(f"[EventDelivery] ❌ Error resuming workflow {instance_id}: {e}")
+                import traceback
+
+                traceback.print_exc()
+
+            finally:
+                # Always release the lock, even if an error occurred
+                await self.storage.release_lock(instance_id, self.worker_id)
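
The lock-first shape used above (and again in the timer and timeout checkers below) reduces to this skeleton, with storage and worker_id as defined in this class:

    if await self.storage.try_acquire_lock(instance_id, self.worker_id, timeout_seconds=300):
        try:
            ...  # record history, remove the subscription, resume the workflow
        finally:
            # Release even on failure so other workers are not blocked
            await self.storage.release_lock(instance_id, self.worker_id)
    else:
        ...  # another worker holds the lock; skip this instance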
+
+    async def _check_expired_timers(self) -> None:
+        """
+        Check for expired timers and resume waiting workflows.
+
+        This method:
+        1. Finds timers that have expired
+        2. Records timer expiration to workflow history
+        3. Removes timer subscription
+        4. Resumes the workflow
+
+        Note:
+            This is called periodically by a background task.
+            Timer expiration is recorded to history to enable deterministic replay.
+            During replay, wait_timer() will find this history entry and skip the wait.
+        """
+        # Find expired timers
+        expired_timers = await self.storage.find_expired_timers()
+
+        if not expired_timers:
+            return  # No expired timers
+
+        print(f"[TimerCheck] Found {len(expired_timers)} expired timer(s)")
+
+        for timer in expired_timers:
+            instance_id = timer["instance_id"]
+            timer_id = timer["timer_id"]
+            workflow_name = timer["workflow_name"]
+            activity_id = timer.get("activity_id")
+
+            if not activity_id:
+                print(f"[TimerCheck] Warning: No activity_id in timer for {instance_id}, skipping")
+                continue
+
+            # Get workflow instance
+            instance = await self.storage.get_instance(instance_id)
+            if not instance:
+                print(f"[TimerCheck] Warning: Instance {instance_id} not found, skipping")
+                continue
+
+            # Check if instance is still waiting for timer
+            if instance.get("status") != "waiting_for_timer":
+                print(
+                    f"[TimerCheck] Warning: Instance {instance_id} "
+                    f"status is '{instance.get('status')}', expected 'waiting_for_timer', skipping"
+                )
+                continue
+
+            # Distributed Coroutines: Acquire lock FIRST to prevent race conditions.
+            # This ensures only ONE pod processes this timer, even if multiple pods
+            # check timers simultaneously.
+            lock_acquired = await self.storage.try_acquire_lock(
+                instance_id, self.worker_id, timeout_seconds=300
+            )
+
+            if not lock_acquired:
+                print(
+                    f"[TimerCheck] Another worker is processing {instance_id}, skipping "
+                    "(distributed coroutine - lock already held)"
+                )
+                continue
+
+            try:
+                print(
+                    f"[TimerCheck] Timer '{timer_id}' expired for workflow {instance_id} (activity_id: {activity_id})"
+                )
+
+                # 1. Record timer expiration to history (allows deterministic replay).
+                # During replay, wait_timer() will find this entry and skip the wait.
+                try:
+                    await self.storage.append_history(
+                        instance_id,
+                        activity_id=activity_id,
+                        event_type="TimerExpired",
+                        event_data={
+                            "result": None,
+                            "timer_id": timer_id,
+                            "expires_at": timer["expires_at"],
+                        },
+                    )
+                except Exception as history_error:
+                    # If the history entry already exists (UNIQUE constraint), this timer was
+                    # already processed by another worker in a multi-process environment.
+                    # Skip workflow resumption to prevent duplicate processing.
+                    print(
+                        f"[TimerCheck] History already exists for activity_id {activity_id}: {history_error}"
+                    )
+                    print(
+                        f"[TimerCheck] Timer '{timer_id}' was already processed by another worker, skipping"
+                    )
+                    continue
+
+                # 2. Remove timer subscription
+                await self.storage.remove_timer_subscription(instance_id, timer_id)
+
+                # 3. Resume workflow (lock already held by this worker - distributed coroutine pattern)
+                if self.replay_engine is None:
+                    print("[TimerCheck] Error: Replay engine not initialized")
+                    continue
+
+                await self.replay_engine.resume_by_name(
+                    instance_id, workflow_name, already_locked=True
+                )
+
+                print(
+                    f"[TimerCheck] ✅ Resumed workflow {instance_id} after timer '{timer_id}' expired"
+                )
+
+            except Exception as e:
+                print(f"[TimerCheck] ❌ Error resuming workflow {instance_id}: {e}")
+                import traceback
+
+                traceback.print_exc()
+
+            finally:
+                # Always release the lock, even if an error occurred
+                await self.storage.release_lock(instance_id, self.worker_id)
+
+    async def _check_expired_timers_periodically(self, interval: int = 10) -> None:
+        """
+        Background task to periodically check for expired timers.
+
+        Args:
+            interval: Check interval in seconds (default: 10)
+
+        Note:
+            This runs indefinitely until the application is shut down.
+            The actual resume time may be slightly later than the specified
+            duration depending on the check interval.
+        """
+        while True:
+            try:
+                await asyncio.sleep(interval)
+                await self._check_expired_timers()
+            except Exception as e:
+                print(f"[TimerCheck] Error in periodic timer check: {e}")
+                import traceback
+
+                traceback.print_exc()
+
+    async def _check_expired_event_timeouts(self) -> None:
+        """
+        Check for event subscriptions that have timed out and fail those workflows.
+
+        This method:
+        1. Finds all event subscriptions where timeout_at <= now
+        2. For each timeout, acquires the workflow lock (Lock-First pattern)
+        3. Records EventTimeout to history
+        4. Removes the event subscription
+        5. Fails the workflow with EventTimeoutError
+        """
+        # Find all expired event subscriptions
+        expired = await self.storage.find_expired_event_subscriptions()
+
+        if not expired:
+            return
+
+        print(f"[EventTimeoutCheck] Found {len(expired)} expired event subscriptions")
+
+        for subscription in expired:
+            instance_id = subscription["instance_id"]
+            event_type = subscription["event_type"]
+            timeout_at = subscription["timeout_at"]
+            created_at = subscription["created_at"]
+
+            # Lock-First pattern: Try to acquire the lock before processing.
+            # If we can't get the lock, another worker is processing this workflow.
+            lock_acquired = await self.storage.try_acquire_lock(instance_id, self.worker_id)
+            if not lock_acquired:
+                print(
+                    f"[EventTimeoutCheck] Could not acquire lock for workflow {instance_id}, skipping (another worker is processing)"
+                )
+                continue
+
+            try:
+                print(
+                    f"[EventTimeoutCheck] Event '{event_type}' timed out for workflow {instance_id}"
+                )
+
+                # Get workflow instance
+                instance = await self.storage.get_instance(instance_id)
+                if not instance:
+                    print(f"[EventTimeoutCheck] Workflow {instance_id} not found")
+                    continue
+
+                # Get activity_id from the subscription (stored when wait_event was called)
+                activity_id = subscription.get("activity_id")
+                if not activity_id:
+                    print(
+                        f"[EventTimeoutCheck] Warning: No activity_id in subscription for {instance_id}, skipping"
+                    )
+                    continue
+
+                # 1. Record event timeout to history.
+                # This allows the workflow to see what happened during replay.
+                try:
+                    await self.storage.append_history(
+                        instance_id,
+                        activity_id=activity_id,
+                        event_type="EventTimeout",
+                        event_data={
+                            "event_type": event_type,
+                            "timeout_at": timeout_at,
+                            "error_message": f"Event '{event_type}' did not arrive within timeout",
+                        },
+                    )
+                except Exception as history_error:
+                    # If the history entry already exists, this timeout was already processed
+                    print(
+                        f"[EventTimeoutCheck] History already exists for activity_id {activity_id}: {history_error}"
+                    )
+                    print(
+                        f"[EventTimeoutCheck] Timeout for '{event_type}' was already processed, skipping"
+                    )
+                    continue
+
+                # 2. Remove event subscription
+                await self.storage.remove_event_subscription(instance_id, event_type)
+
+                # 3. Fail the workflow with EventTimeoutError.
+                # Create error details similar to workflow failure.
+                import traceback
+
+                # Get timeout_seconds from timeout_at and created_at
+                from datetime import datetime
+
+                from edda.events import EventTimeoutError
+
+                try:
+                    timeout_dt = datetime.fromisoformat(timeout_at)
+                    created_dt = datetime.fromisoformat(created_at)
+                    # Calculate the original timeout duration (timeout_at - created_at)
+                    timeout_seconds = int((timeout_dt - created_dt).total_seconds())
+                except Exception:
+                    timeout_seconds = 0  # Fallback
+
+                error = EventTimeoutError(event_type, timeout_seconds)
+                stack_trace = "".join(
+                    traceback.format_exception(type(error), error, error.__traceback__)
+                )
+
+                # Update workflow status to failed with error details
+                await self.storage.update_instance_status(
+                    instance_id,
+                    "failed",
+                    {
+                        "error_message": str(error),
+                        "error_type": "EventTimeoutError",
+                        "stack_trace": stack_trace,
+                    },
+                )
+
+                print(
+                    f"[EventTimeoutCheck] ✅ Marked workflow {instance_id} as failed due to event timeout"
+                )
+
+            except Exception as e:
+                print(f"[EventTimeoutCheck] ❌ Error processing timeout for {instance_id}: {e}")
+                import traceback
+
+                traceback.print_exc()
+
+            finally:
+                # Always release the lock
+                await self.storage.release_lock(instance_id, self.worker_id)
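
The timeout-duration recovery above is plain ISO-8601 arithmetic; a worked example with illustrative timestamps:

    from datetime import datetime

    created = datetime.fromisoformat("2025-01-01T00:00:00")
    timeout = datetime.fromisoformat("2025-01-01T00:05:00")
    assert int((timeout - created).total_seconds()) == 300  # the original 5-minute timeout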
+
+    async def _check_expired_event_timeouts_periodically(self, interval: int = 10) -> None:
+        """
+        Background task to periodically check for expired event timeouts.
+
+        Args:
+            interval: Check interval in seconds (default: 10)
+
+        Note:
+            This runs indefinitely until the application is shut down.
+        """
+        while True:
+            try:
+                await asyncio.sleep(interval)
+                await self._check_expired_event_timeouts()
+            except Exception as e:
+                print(f"[EventTimeoutCheck] Error in periodic timeout check: {e}")
+                import traceback
+
+                traceback.print_exc()
+
+    # -------------------------------------------------------------------------
+    # ASGI Interface
+    # -------------------------------------------------------------------------
+
+    async def __call__(
+        self,
+        scope: dict[str, Any],
+        receive: Callable[[], Any],
+        send: Callable[[dict[str, Any]], Any],
+    ) -> None:
+        """
+        ASGI interface.
+
+        Args:
+            scope: ASGI scope dictionary
+            receive: Async function to receive messages
+            send: Async function to send messages
+        """
+        # Initialize if not already done
+        if not self._initialized:
+            await self.initialize()
+
+        if scope["type"] == "lifespan":
+            await self._handle_lifespan(scope, receive, send)
+        elif scope["type"] == "http":
+            await self._handle_http(scope, receive, send)
+        else:
+            raise NotImplementedError(f"Unsupported scope type: {scope['type']}")
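
Because the class implements the ASGI callable protocol, any ASGI server can host it; a sketch assuming uvicorn is installed and an EddaApp instance named app lives in a module named main:

    import uvicorn

    if __name__ == "__main__":
        # Lifespan events trigger initialize()/shutdown() via _handle_lifespan above
        uvicorn.run("main:app", host="0.0.0.0", port=8080)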
+
+    async def _handle_lifespan(
+        self,
+        _scope: dict[str, Any],
+        receive: Callable[[], Any],
+        send: Callable[[dict[str, Any]], Any],
+    ) -> None:
+        """Handle ASGI lifespan events."""
+        while True:
+            message = await receive()
+            if message["type"] == "lifespan.startup":
+                await self.initialize()
+                await send({"type": "lifespan.startup.complete"})
+            elif message["type"] == "lifespan.shutdown":
+                await self.shutdown()
+                await send({"type": "lifespan.shutdown.complete"})
+                return
+
+    async def _handle_http(
+        self,
+        scope: dict[str, Any],
+        receive: Callable[[], Any],
+        send: Callable[[dict[str, Any]], Any],
+    ) -> None:
+        """Handle HTTP request (CloudEvents and API endpoints)."""
+        # Get request path and method
+        path = scope.get("path", "/")
+        method = scope.get("method", "GET")
+
+        # Route to appropriate handler
+        if path.startswith("/cancel/") and method == "POST":
+            await self._handle_cancel_request(scope, receive, send)
+        else:
+            # Default: CloudEvents handler
+            await self._handle_cloudevent_request(scope, receive, send)
+
+    async def _handle_cloudevent_request(
+        self,
+        scope: dict[str, Any],
+        receive: Callable[[], Any],
+        send: Callable[[dict[str, Any]], Any],
+    ) -> None:
+        """
+        Handle CloudEvent HTTP request.
+
+        CloudEvents HTTP Binding compliant responses:
+        - 202 Accepted: Event accepted for async processing
+        - 400 Bad Request: CloudEvents parsing/validation error (non-retryable)
+        - 500 Internal Server Error: Internal error (retryable)
+        """
+        # Read request body
+        body = b""
+        while True:
+            message = await receive()
+            if message["type"] == "http.disconnect":
+                return  # Client went away; nothing to respond to
+            if message["type"] == "http.request":
+                body += message.get("body", b"")
+                if not message.get("more_body", False):
+                    break
+
+        # Parse and handle CloudEvent
+        try:
+            headers = {k.decode("latin1"): v.decode("latin1") for k, v in scope.get("headers", [])}
+
+            # Create CloudEvent from HTTP request
+            event = from_http(headers, body)
+
+            # Handle the event (background task execution)
+            await self.handle_cloudevent(event)
+
+            # Success: 202 Accepted (async processing)
+            status = 202
+            response_body: dict[str, Any] = {"status": "accepted"}
+
+        except (ValueError, TypeError, KeyError, CloudEventsException) as e:
+            # CloudEvents parsing/validation error: 400 Bad Request (non-retryable)
+            status = 400
+            response_body = {
+                "error": str(e),
+                "error_type": type(e).__name__,
+                "retryable": False,
+            }
+
+        except Exception as e:
+            # Internal error: 500 Internal Server Error (retryable)
+            status = 500
+            response_body = {
+                "error": str(e),
+                "error_type": type(e).__name__,
+                "retryable": True,
+            }
+
+        # Send response (only once, at the end)
+        await send(
+            {
+                "type": "http.response.start",
+                "status": status,
+                "headers": [[b"content-type", b"application/json"]],
+            }
+        )
+        await send(
+            {
+                "type": "http.response.body",
+                "body": json.dumps(response_body).encode("utf-8"),
+            }
+        )
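
A client-side sketch posting a binary-mode CloudEvent to this endpoint (httpx is an assumed dependency of the example, not of the package; the URL and header values are illustrative):

    import httpx

    resp = httpx.post(
        "http://localhost:8080/",
        headers={
            "ce-specversion": "1.0",
            "ce-type": "order.created",
            "ce-source": "example/client",
            "ce-id": "1",
        },
        json={"order_id": "42"},
    )
    assert resp.status_code == 202  # accepted for asynchronous processing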
+
+    async def _handle_cancel_request(
+        self,
+        scope: dict[str, Any],
+        receive: Callable[[], Any],
+        send: Callable[[dict[str, Any]], Any],
+    ) -> None:
+        """Handle workflow cancellation request."""
+        # Extract instance_id from path: /cancel/{instance_id}
+        path = scope.get("path", "")
+        instance_id = path.split("/cancel/")[-1]
+
+        # Determine response (default: error)
+        status = 500
+        response_body: dict[str, Any] = {"error": "Unknown error"}
+
+        if not instance_id:
+            status = 400
+            response_body = {"error": "Missing instance_id"}
+        else:
+            # Consume request body (even if we don't use it)
+            while True:
+                message = await receive()
+                if message["type"] == "http.disconnect":
+                    return  # Client went away; nothing to respond to
+                if message["type"] == "http.request" and not message.get("more_body", False):
+                    break
+
+            # Try to cancel the workflow
+            try:
+                if self.replay_engine is None:
+                    raise RuntimeError("Replay engine not initialized")
+
+                success = await self.replay_engine.cancel_workflow(
+                    instance_id=instance_id, cancelled_by="api_user"
+                )
+
+                if success:
+                    # Successfully cancelled
+                    status = 200
+                    response_body = {"status": "cancelled", "instance_id": instance_id}
+                else:
+                    # Could not cancel (not found or already completed/failed)
+                    status = 400
+                    response_body = {
+                        "error": "Cannot cancel workflow (not found or already completed/failed/cancelled)"
+                    }
+
+            except Exception as e:
+                # Internal error - log detailed traceback
+                print(f"[Cancel] Error cancelling workflow {instance_id}: {e}")
+                import traceback
+
+                traceback.print_exc()
+
+                status = 500
+                response_body = {"error": str(e), "type": type(e).__name__}
+
+        # Send response (only once, at the end)
+        await send(
+            {
+                "type": "http.response.start",
+                "status": status,
+                "headers": [[b"content-type", b"application/json"]],
+            }
+        )
+        await send(
+            {
+                "type": "http.response.body",
+                "body": json.dumps(response_body).encode("utf-8"),
+            }
+        )
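
And a matching sketch for the cancellation endpoint (again assuming httpx; the instance ID is a placeholder):

    import httpx

    instance_id = "..."  # a real workflow instance ID
    resp = httpx.post(f"http://localhost:8080/cancel/{instance_id}")
    print(resp.status_code, resp.json())  # 200 on success, 400/500 otherwise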