jararaca 0.3.11a15__py3-none-any.whl → 0.3.12__py3-none-any.whl

This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.

Potentially problematic release: this version of jararaca might be problematic.

@@ -1,18 +1,36 @@
  import asyncio
  import inspect
  import logging
+ import random
  import signal
+ import time
+ import uuid
  from abc import ABC
  from contextlib import asynccontextmanager, suppress
- from dataclasses import dataclass
+ from dataclasses import dataclass, field
  from datetime import UTC, datetime
- from typing import Any, AsyncContextManager, AsyncGenerator, Type, get_origin
+ from typing import (
+     Any,
+     AsyncContextManager,
+     AsyncGenerator,
+     Awaitable,
+     Optional,
+     Type,
+     get_origin,
+ )
  from urllib.parse import parse_qs, urlparse

  import aio_pika
  import aio_pika.abc
  import uvloop
- from aio_pika.exceptions import AMQPError, ChannelClosed, ChannelNotFoundEntity
+ from aio_pika.exceptions import (
+     AMQPChannelError,
+     AMQPConnectionError,
+     AMQPError,
+     ChannelClosed,
+     ChannelNotFoundEntity,
+     ConnectionClosed,
+ )
  from pydantic import BaseModel

  from jararaca.broker_backend import MessageBrokerBackend
@@ -38,9 +56,12 @@ from jararaca.microservice import (
      MessageBusTransactionData,
      Microservice,
      SchedulerTransactionData,
+     ShutdownState,
+     provide_shutdown_state,
  )
  from jararaca.scheduler.decorators import ScheduledActionData
  from jararaca.utils.rabbitmq_utils import RabbitmqUtils
+ from jararaca.utils.retry import RetryConfig, retry_with_backoff

  logger = logging.getLogger(__name__)

@@ -50,6 +71,34 @@ class AioPikaWorkerConfig:
      url: str
      exchange: str
      prefetch_count: int
+     connection_retry_config: RetryConfig = field(
+         default_factory=lambda: RetryConfig(
+             max_retries=15,
+             initial_delay=1.0,
+             max_delay=60.0,
+             backoff_factor=2.0,
+         )
+     )
+     consumer_retry_config: RetryConfig = field(
+         default_factory=lambda: RetryConfig(
+             max_retries=15,
+             initial_delay=0.5,
+             max_delay=40.0,
+             backoff_factor=2.0,
+         )
+     )
+     # Connection health monitoring settings
+     connection_heartbeat_interval: float = 30.0  # seconds
+     connection_health_check_interval: float = 10.0  # seconds
+     reconnection_backoff_config: RetryConfig = field(
+         default_factory=lambda: RetryConfig(
+             max_retries=-1,  # Infinite retries for reconnection
+             initial_delay=2.0,
+             max_delay=120.0,
+             backoff_factor=2.0,
+             jitter=True,
+         )
+     )


  class AioPikaMessage(MessageOf[Message]):
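
Note: RetryConfig is imported from jararaca.utils.retry, which is outside this diff. A minimal sketch consistent with how it is constructed and read in the code above (field names inferred from the call sites; defaults are hypothetical, not the package's actual source):

    from dataclasses import dataclass

    @dataclass
    class RetryConfig:
        # Hypothetical reconstruction based only on usage in this diff.
        max_retries: int = 3          # -1 is treated as "retry forever" by the reconnection loop
        initial_delay: float = 1.0    # seconds before the first retry
        max_delay: float = 60.0       # cap applied after exponential growth
        backoff_factor: float = 2.0   # multiplier applied per attempt
        jitter: bool = False          # randomize each delay by +/-25%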
@@ -97,6 +146,20 @@ class MessageBusConsumer(ABC):

      def shutdown(self) -> None: ...

+     async def close(self) -> None:
+         """Close all resources related to the consumer"""
+
+
+ class _WorkerShutdownState(ShutdownState):
+     def __init__(self, shutdown_event: asyncio.Event):
+         self.shutdown_event = shutdown_event
+
+     def request_shutdown(self) -> None:
+         self.shutdown_event.set()
+
+     def is_shutdown_requested(self) -> bool:
+         return self.shutdown_event.is_set()
+

  class AioPikaMicroserviceConsumer(MessageBusConsumer):
      def __init__(
@@ -115,107 +178,741 @@ class AioPikaMicroserviceConsumer(MessageBusConsumer):
          self.incoming_map: dict[str, MessageHandlerData] = {}
          self.uow_context_provider = uow_context_provider
          self.shutdown_event = asyncio.Event()
+         self.shutdown_state = _WorkerShutdownState(self.shutdown_event)
          self.lock = asyncio.Lock()
          self.tasks: set[asyncio.Task[Any]] = set()
+         self.connection: aio_pika.abc.AbstractConnection | None = None
+         self.channels: dict[str, aio_pika.abc.AbstractChannel] = {}
+
+         # Connection resilience attributes
+         self.connection_healthy = False
+         self.connection_lock = asyncio.Lock()
+         self.reconnection_event = asyncio.Event()
+         self.reconnection_in_progress = False
+         self.consumer_tags: dict[str, str] = {}  # Track consumer tags for cleanup
+         self.health_check_task: asyncio.Task[Any] | None = None
+         self.reconnection_task: asyncio.Task[Any] | None = None
+
+     async def _verify_infrastructure(self) -> bool:
+         """
+         Verify that the required RabbitMQ infrastructure (exchanges, queues) exists.
+         Returns True if all required infrastructure is in place.
+         """
+         try:
+             async with self.connect() as connection:
+                 # Create a main channel just for checking infrastructure
+                 async with connection.channel() as main_channel:
+                     # Get existing exchange and queues to verify infrastructure is in place
+                     await RabbitmqUtils.get_main_exchange(
+                         channel=main_channel,
+                         exchange_name=self.config.exchange,
+                     )
+                     await RabbitmqUtils.get_dl_exchange(channel=main_channel)
+                     await RabbitmqUtils.get_dl_queue(channel=main_channel)
+                     return True
+         except (ChannelNotFoundEntity, ChannelClosed, AMQPError) as e:
+             logger.critical(
+                 f"Required exchange or queue infrastructure not found. "
+                 f"Please use the declare command first to create the required infrastructure. Error: {e}"
+             )
+             return False
+
+     async def _setup_message_handler_consumer(
+         self, handler: MessageHandlerData
+     ) -> bool:
+         """
+         Set up a consumer for a message handler with retry mechanism.
+         Returns True if successful, False otherwise.
+         """
+         queue_name = f"{handler.message_type.MESSAGE_TOPIC}.{handler.instance_callable.__module__}.{handler.instance_callable.__qualname__}"
+         routing_key = f"{handler.message_type.MESSAGE_TOPIC}.#"
+
+         async def setup_consumer() -> None:
+             # Wait for connection to be healthy if reconnection is in progress
+             if self.reconnection_in_progress:
+                 await self.reconnection_event.wait()
+
+             # Create a channel using the context manager
+             async with self.create_channel(queue_name) as channel:
+                 queue = await RabbitmqUtils.get_queue(
+                     channel=channel, queue_name=queue_name
+                 )

-     async def consume(self) -> None:
+                 # Configure consumer and get the consumer tag
+                 consumer_tag = await queue.consume(
+                     callback=MessageHandlerCallback(
+                         consumer=self,
+                         queue_name=queue_name,
+                         routing_key=routing_key,
+                         message_handler=handler,
+                     ),
+                     no_ack=handler.spec.auto_ack,
+                 )

-         connection = await aio_pika.connect(self.config.url)
+                 # Store consumer tag for cleanup
+                 self.consumer_tags[queue_name] = consumer_tag

-         channel = await connection.channel()
+                 logger.info(
+                     f"Consuming message handler {queue_name} on dedicated channel"
+                 )

-         await channel.set_qos(prefetch_count=self.config.prefetch_count)
+         try:
+             # Setup with retry
+             await retry_with_backoff(
+                 setup_consumer,
+                 retry_config=self.config.consumer_retry_config,
+                 retry_exceptions=(
+                     ChannelNotFoundEntity,
+                     ChannelClosed,
+                     AMQPError,
+                     AMQPConnectionError,
+                     AMQPChannelError,
+                     ConnectionClosed,
+                 ),
+             )
+             return True
+         except Exception as e:
+             logger.error(
+                 f"Failed to setup consumer for queue '{queue_name}' after retries: {e}"
+             )
+             return False
+
+     async def _setup_scheduled_action_consumer(
+         self, scheduled_action: ScheduledActionData
+     ) -> bool:
+         """
+         Set up a consumer for a scheduled action with retry mechanism.
+         Returns True if successful, False otherwise.
+         """
+         queue_name = f"{scheduled_action.callable.__module__}.{scheduled_action.callable.__qualname__}"
+         routing_key = queue_name
+
+         async def setup_consumer() -> None:
+             # Wait for connection to be healthy if reconnection is in progress
+             if self.reconnection_in_progress:
+                 await self.reconnection_event.wait()
+
+             # Create a channel using the context manager
+             async with self.create_channel(queue_name) as channel:
+                 queue = await RabbitmqUtils.get_queue(
+                     channel=channel, queue_name=queue_name
+                 )
+
+                 # Configure consumer and get the consumer tag
+                 consumer_tag = await queue.consume(
+                     callback=ScheduledMessageHandlerCallback(
+                         consumer=self,
+                         queue_name=queue_name,
+                         routing_key=routing_key,
+                         scheduled_action=scheduled_action,
+                     ),
+                     no_ack=True,
+                 )
+
+                 # Store consumer tag for cleanup
+                 self.consumer_tags[queue_name] = consumer_tag
+
+                 logger.info(f"Consuming scheduler {queue_name} on dedicated channel")

-         # Get existing exchange and queues
          try:
-             exchange = await RabbitmqUtils.get_main_exchange(
-                 channel=channel,
-                 exchange_name=self.config.exchange,
+             # Setup with retry
+             await retry_with_backoff(
+                 setup_consumer,
+                 retry_config=self.config.consumer_retry_config,
+                 retry_exceptions=(
+                     ChannelNotFoundEntity,
+                     ChannelClosed,
+                     AMQPError,
+                     AMQPConnectionError,
+                     AMQPChannelError,
+                     ConnectionClosed,
+                 ),
              )
+             return True
+         except Exception as e:
+             logger.error(
+                 f"Failed to setup consumer for scheduler queue '{queue_name}' after retries: {e}"
+             )
+             return False

-             dlx = await RabbitmqUtils.get_dl_exchange(channel=channel)
-             dlq = await RabbitmqUtils.get_dl_queue(channel=channel)
-         except (ChannelNotFoundEntity, ChannelClosed, AMQPError) as e:
-             logger.critical(
-                 f"Required exchange or queue infrastructure not found and passive mode is enabled. "
-                 f"Please use the declare command first to create the required infrastructure. Error: {e}"
+     async def consume(self) -> None:
+         """
+         Main consume method that sets up all message handlers and scheduled actions with retry mechanisms.
+         """
+         # Establish initial connection
+         async with self.connect() as connection:
+             self.connection_healthy = True
+
+             # Start connection health monitoring
+             self.health_check_task = asyncio.create_task(
+                 self._monitor_connection_health()
              )
-             self.shutdown_event.set()
+
+             # Verify infrastructure with retry
+             infra_check_success = await retry_with_backoff(
+                 self._verify_infrastructure,
+                 retry_config=self.config.connection_retry_config,
+                 retry_exceptions=(Exception,),
+             )
+
+             if not infra_check_success:
+                 logger.critical(
+                     "Failed to verify RabbitMQ infrastructure. Shutting down."
+                 )
+                 self.shutdown_event.set()
+                 return
+
+             async def wait_for(
+                 type: str, name: str, coroutine: Awaitable[bool]
+             ) -> tuple[str, str, bool]:
+                 return type, name, await coroutine
+
+             tasks: set[asyncio.Task[tuple[str, str, bool]]] = set()
+
+             # Setup message handlers
+             for handler in self.message_handler_set:
+                 queue_name = f"{handler.message_type.MESSAGE_TOPIC}.{handler.instance_callable.__module__}.{handler.instance_callable.__qualname__}"
+                 self.incoming_map[queue_name] = handler
+
+                 tasks.add(
+                     task := asyncio.create_task(
+                         wait_for(
+                             "message_handler",
+                             queue_name,
+                             self._setup_message_handler_consumer(handler),
+                         )
+                     )
+                 )
+
+             # Setup scheduled actions
+             for scheduled_action in self.scheduled_actions:
+                 queue_name = f"{scheduled_action.callable.__module__}.{scheduled_action.callable.__qualname__}"
+                 tasks.add(
+                     task := asyncio.create_task(
+                         wait_for(
+                             "scheduled_action",
+                             queue_name,
+                             self._setup_scheduled_action_consumer(scheduled_action),
+                         )
+                     )
+                 )
+
+             async def handle_task_results() -> None:
+                 for task in asyncio.as_completed(tasks):
+                     type, name, success = await task
+                     if success:
+                         logger.info(f"Successfully set up {type} consumer for {name}")
+                     else:
+                         logger.warning(
+                             f"Failed to set up {type} consumer for {name}, will not process messages from this queue"
+                         )
+
+             handle_task_results_task = asyncio.create_task(handle_task_results())
+
+             # Wait for shutdown signal
+             await self.shutdown_event.wait()
+             logger.info("Shutdown event received, stopping consumers")
+
+             # Cancel health monitoring
+             if self.health_check_task:
+                 self.health_check_task.cancel()
+                 with suppress(asyncio.CancelledError):
+                     await self.health_check_task
+
+             # Cancel reconnection task if running
+             if self.reconnection_task:
+                 self.reconnection_task.cancel()
+                 with suppress(asyncio.CancelledError):
+                     await self.reconnection_task
+
+             handle_task_results_task.cancel()
+             with suppress(asyncio.CancelledError):
+                 await handle_task_results_task
+             for task in tasks:
+                 if not task.done():
+                     task.cancel()
+                     with suppress(asyncio.CancelledError):
+                         await task
+             logger.info("Worker shutting down")
+
+             # Wait for all tasks to complete
+             await self.wait_all_tasks_done()
+
+             # Close all channels and the connection
+             await self.close_channels_and_connection()
+
+     async def wait_all_tasks_done(self) -> None:
+         if not self.tasks:
              return

-         for handler in self.message_handler_set:
+         logger.info(f"Waiting for {len(self.tasks)} in-flight tasks to complete")
+         async with self.lock:
+             # Use gather with return_exceptions=True to ensure all tasks are awaited
+             # even if some raise exceptions
+             results = await asyncio.gather(*self.tasks, return_exceptions=True)

-             queue_name = f"{handler.message_type.MESSAGE_TOPIC}.{handler.instance_callable.__module__}.{handler.instance_callable.__qualname__}"
-             routing_key = f"{handler.message_type.MESSAGE_TOPIC}.#"
+             # Log any exceptions that occurred
+             for result in results:
+                 if isinstance(result, Exception):
+                     logger.error(f"Task raised an exception during shutdown: {result}")
+
+     async def close_channels_and_connection(self) -> None:
+         """Close all channels and then the connection"""
+         logger.info("Closing channels and connection...")
+         await self._cleanup_connection()

-             self.incoming_map[queue_name] = handler
+     def shutdown(self) -> None:
+         """Signal for shutdown"""
+         logger.info("Initiating graceful shutdown")
+         self.shutdown_event.set()

+     async def close(self) -> None:
+         """Implement MessageBusConsumer.close for cleanup"""
+         logger.info("Closing consumer...")
+         self.shutdown()
+
+         # Cancel health monitoring
+         if self.health_check_task:
+             self.health_check_task.cancel()
+             with suppress(asyncio.CancelledError):
+                 await self.health_check_task
+
+         # Cancel reconnection task if running
+         if self.reconnection_task:
+             self.reconnection_task.cancel()
+             with suppress(asyncio.CancelledError):
+                 await self.reconnection_task
+
+         await self.wait_all_tasks_done()
+         await self.close_channels_and_connection()
+
+     async def get_channel(self, queue_name: str) -> aio_pika.abc.AbstractChannel | None:
+         """
+         Get the channel for a specific queue, or None if not found.
+         This helps with error handling when a channel might have been closed.
+         """
+         # If reconnection is in progress, wait for it to complete
+         if self.reconnection_in_progress:
              try:
-                 queue = await RabbitmqUtils.get_queue(
-                     channel=channel, queue_name=queue_name
-                 )
-             except (ChannelNotFoundEntity, ChannelClosed, AMQPError) as e:
-                 logger.error(
-                     f"Queue '{queue_name}' not found and passive mode is enabled. "
-                     f"Please use the declare command first to create the queue. Error: {e}"
+                 await asyncio.wait_for(self.reconnection_event.wait(), timeout=30.0)
+             except asyncio.TimeoutError:
+                 logger.warning(
+                     f"Timeout waiting for reconnection when getting channel for {queue_name}"
                  )
-                 continue
+                 return None

-             await queue.consume(
-                 callback=MessageHandlerCallback(
-                     consumer=self,
-                     queue_name=queue_name,
-                     routing_key=routing_key,
-                     message_handler=handler,
-                 ),
-                 no_ack=handler.spec.auto_ack,
+         if queue_name not in self.channels:
+             logger.warning(f"No channel found for queue {queue_name}")
+             return None
+
+         try:
+             channel = self.channels[queue_name]
+             if channel.is_closed:
+                 logger.warning(f"Channel for queue {queue_name} is closed")
+                 # Remove the closed channel
+                 del self.channels[queue_name]
+
+                 # Attempt to recreate the channel if connection is healthy
+                 if (
+                     self.connection
+                     and not self.connection.is_closed
+                     and self.connection_healthy
+                 ):
+                     try:
+                         logger.info(f"Creating new channel for {queue_name}")
+                         self.channels[queue_name] = await self.connection.channel()
+                         await self.channels[queue_name].set_qos(
+                             prefetch_count=self.config.prefetch_count
+                         )
+                         return self.channels[queue_name]
+                     except Exception as e:
+                         logger.error(
+                             f"Failed to recreate channel for {queue_name}: {e}"
+                         )
+                         # Trigger reconnection if channel creation fails
+                         self._trigger_reconnection()
+                         return None
+                 else:
+                     # Connection is not healthy, trigger reconnection
+                     self._trigger_reconnection()
+                     return None
+             return channel
+         except Exception as e:
+             logger.error(f"Error accessing channel for queue {queue_name}: {e}")
+             # Trigger reconnection on any channel access error
+             self._trigger_reconnection()
+             return None
+
+     async def _establish_channel(self, queue_name: str) -> aio_pika.abc.AbstractChannel:
+         """
+         Creates a new channel for the specified queue with proper QoS settings.
+         """
+         if self.connection is None or self.connection.is_closed:
+             logger.warning(
+                 f"Cannot create channel for {queue_name}: connection is not available"
              )
+             raise RuntimeError("Connection is not available")

-             logger.info(f"Consuming message handler {queue_name}")
+         logger.debug(f"Creating channel for queue {queue_name}")
+         channel = await self.connection.channel()
+         await channel.set_qos(prefetch_count=self.config.prefetch_count)
+         logger.debug(f"Created channel for queue {queue_name}")
+         return channel

-         for scheduled_action in self.scheduled_actions:
+     @asynccontextmanager
+     async def create_channel(
+         self, queue_name: str
+     ) -> AsyncGenerator[aio_pika.abc.AbstractChannel, None]:
+         """
+         Create and yield a channel for the specified queue with retry mechanism.
+         This context manager ensures the channel is properly managed.
+         """
+         try:
+             # Create a new channel with retry
+             channel = await retry_with_backoff(
+                 fn=lambda: self._establish_channel(queue_name),
+                 retry_config=self.config.consumer_retry_config,
+                 retry_exceptions=(
+                     AMQPConnectionError,
+                     AMQPChannelError,
+                     ConnectionError,
+                 ),
+             )

-             queue_name = f"{scheduled_action.callable.__module__}.{scheduled_action.callable.__qualname__}"
+             # Save in the channels dict for tracking
+             self.channels[queue_name] = channel
+             logger.debug(f"Created new channel for queue {queue_name}")

-             routing_key = queue_name
+             try:
+                 yield channel
+             finally:
+                 # Don't close the channel here as it might be used later
+                 # It will be closed during shutdown
+                 pass
+         except aio_pika.exceptions.AMQPError as e:
+             logger.error(
+                 f"Error creating channel for queue {queue_name} after retries: {e}"
+             )
+             raise

+     async def _establish_connection(self) -> aio_pika.abc.AbstractConnection:
+         """
+         Creates a new RabbitMQ connection with retry logic.
+         """
+         try:
+             logger.info("Establishing connection to RabbitMQ")
+             connection = await aio_pika.connect(
+                 self.config.url,
+                 heartbeat=self.config.connection_heartbeat_interval,
+             )
+             logger.info("Connected to RabbitMQ successfully")
+             return connection
+         except Exception as e:
+             logger.error(f"Failed to connect to RabbitMQ: {e}")
+             raise
+
+     @asynccontextmanager
+     async def connect(self) -> AsyncGenerator[aio_pika.abc.AbstractConnection, None]:
+         """
+         Create and manage the main connection to RabbitMQ with automatic retry.
+         """
+         if self.connection is not None and not self.connection.is_closed:
+             logger.debug("Connection already exists, reusing existing connection")
              try:
-                 queue = await RabbitmqUtils.get_queue(
-                     channel=channel, queue_name=queue_name
-                 )
-             except (ChannelNotFoundEntity, ChannelClosed, AMQPError) as e:
-                 logger.error(
-                     f"Scheduler queue '{queue_name}' not found and passive mode is enabled. "
-                     f"Please use the declare command first to create the queue. Error: {e}"
-                 )
-                 continue
+                 yield self.connection
+             finally:
+                 # The existing connection will be handled by close_channels_and_connection
+                 pass
+             return

-             await queue.consume(
-                 callback=ScheduledMessageHandlerCallback(
-                     consumer=self,
-                     queue_name=queue_name,
-                     routing_key=routing_key,
-                     scheduled_action=scheduled_action,
+         try:
+             # Create a new connection with retry
+             self.connection = await retry_with_backoff(
+                 self._establish_connection,
+                 retry_config=self.config.connection_retry_config,
+                 retry_exceptions=(
+                     AMQPConnectionError,
+                     ConnectionError,
+                     OSError,
+                     TimeoutError,
                  ),
-                 no_ack=True,
              )

-             logger.info(f"Consuming scheduler {queue_name}")
+             try:
+                 yield self.connection
+             finally:
+                 # Don't close the connection here; it will be closed in close_channels_and_connection
+                 pass
+         except Exception as e:
+             logger.error(
+                 f"Failed to establish connection to RabbitMQ after retries: {e}"
+             )
+             if self.connection:
+                 try:
+                     await self.connection.close()
+                 except Exception as close_error:
+                     logger.error(
+                         f"Error closing connection after connect failure: {close_error}"
+                     )
+                 self.connection = None
+             raise

-         await self.shutdown_event.wait()
-         logger.info("Worker shutting down")
+     @asynccontextmanager
+     async def get_channel_ctx(
+         self, queue_name: str
+     ) -> AsyncGenerator[aio_pika.abc.AbstractChannel, None]:
+         """
+         Get a channel for a specific queue as a context manager.
+         This is safer than using get_channel directly as it ensures proper error handling.
+         """
+         max_retries = 3
+         retry_delay = 1.0
+
+         for attempt in range(max_retries):
+             try:
+                 channel = await self.get_channel(queue_name)
+                 if channel is not None:
+                     try:
+                         yield channel
+                         return
+                     finally:
+                         # We don't close the channel here as it's managed by the consumer
+                         pass
+
+                 # No channel available, check connection state
+                 if (
+                     self.connection
+                     and not self.connection.is_closed
+                     and self.connection_healthy
+                 ):
+                     # Try to create a new channel
+                     async with self.create_channel(queue_name) as new_channel:
+                         yield new_channel
+                         return
+                 else:
+                     # Connection is not healthy, wait for reconnection
+                     if self.reconnection_in_progress:
+                         try:
+                             await asyncio.wait_for(
+                                 self.reconnection_event.wait(), timeout=30.0
+                             )
+                             # Retry after reconnection
+                             continue
+                         except asyncio.TimeoutError:
+                             logger.warning(
+                                 f"Timeout waiting for reconnection for queue {queue_name}"
+                             )

-         await self.wait_all_tasks_done()
+                     # Still no connection, trigger reconnection
+                     if not self.reconnection_in_progress:
+                         self._trigger_reconnection()

-         await channel.close()
-         await connection.close()
+                 if attempt < max_retries - 1:
+                     logger.info(
+                         f"Retrying channel access for {queue_name} in {retry_delay}s"
+                     )
+                     await asyncio.sleep(retry_delay)
+                     retry_delay *= 2
+                 else:
+                     raise RuntimeError(
+                         f"Cannot get channel for queue {queue_name}: no connection available after {max_retries} attempts"
+                     )

-     async def wait_all_tasks_done(self) -> None:
-         async with self.lock:
-             await asyncio.gather(*self.tasks)
+             except Exception as e:
+                 if attempt < max_retries - 1:
+                     logger.warning(
+                         f"Error getting channel for {queue_name}, retrying: {e}"
+                     )
+                     await asyncio.sleep(retry_delay)
+                     retry_delay *= 2
+                 else:
+                     logger.error(
+                         f"Failed to get channel for {queue_name} after {max_retries} attempts: {e}"
+                     )
+                     raise
+
+     async def _monitor_connection_health(self) -> None:
+         """
+         Monitor connection health and trigger reconnection if needed.
+         This runs as a background task.
+         """
+         while not self.shutdown_event.is_set():
+             try:
+                 await asyncio.sleep(self.config.connection_health_check_interval)

-     def shutdown(self) -> None:
-         self.shutdown_event.set()
+                 if self.shutdown_event.is_set():
+                     break
+
+                 # Check connection health
+                 if not await self._is_connection_healthy():
+                     logger.warning(
+                         "Connection health check failed, triggering reconnection"
+                     )
+                     if not self.reconnection_in_progress:
+                         self._trigger_reconnection()
+
+             except asyncio.CancelledError:
+                 logger.info("Connection health monitoring cancelled")
+                 break
+             except Exception as e:
+                 logger.error(f"Error in connection health monitoring: {e}")
+                 await asyncio.sleep(5)  # Wait before retrying
+
+     async def _is_connection_healthy(self) -> bool:
+         """
+         Check if the connection is healthy.
+         """
+         try:
+             if self.connection is None or self.connection.is_closed:
+                 return False
+
+             # Try to create a temporary channel to test connection
+             async with self.connection.channel() as test_channel:
+                 # If we can create a channel, connection is healthy
+                 return True
+
+         except Exception as e:
+             logger.debug(f"Connection health check failed: {e}")
+             return False
+
+     def _trigger_reconnection(self) -> None:
+         """
+         Trigger reconnection process.
+         """
+         if not self.reconnection_in_progress and not self.shutdown_event.is_set():
+             self.reconnection_in_progress = True
+             self.connection_healthy = False
+             self.reconnection_event.clear()
+
+             # Start reconnection task
+             self.reconnection_task = asyncio.create_task(self._handle_reconnection())
+             self.reconnection_task.add_done_callback(self._on_reconnection_done)
+
+     def _on_reconnection_done(self, task: asyncio.Task[Any]) -> None:
+         """
+         Handle completion of reconnection task.
+         """
+         self.reconnection_in_progress = False
+         if task.exception():
+             logger.error(f"Reconnection task failed: {task.exception()}")
+         else:
+             logger.info("Reconnection completed successfully")
+
+     async def _handle_reconnection(self) -> None:
+         """
+         Handle the reconnection process with exponential backoff.
+         """
+         logger.info("Starting reconnection process")
+
+         # Close existing connection and channels
+         await self._cleanup_connection()
+
+         reconnection_config = self.config.reconnection_backoff_config
+         attempt = 0
+
+         while not self.shutdown_event.is_set():
+             try:
+                 attempt += 1
+                 logger.info(f"Reconnection attempt {attempt}")
+
+                 # Establish new connection
+                 self.connection = await self._establish_connection()
+                 self.connection_healthy = True
+
+                 # Re-establish all consumers
+                 await self._reestablish_consumers()
+
+                 logger.info("Reconnection successful")
+                 self.reconnection_event.set()
+                 return
+
+             except Exception as e:
+                 logger.error(f"Reconnection attempt {attempt} failed: {e}")
+
+                 if self.shutdown_event.is_set():
+                     break
+
+                 # Calculate backoff delay
+                 delay = reconnection_config.initial_delay * (
+                     reconnection_config.backoff_factor ** (attempt - 1)
+                 )
+                 if reconnection_config.jitter:
+                     jitter_amount = delay * 0.25
+                     delay = delay + random.uniform(-jitter_amount, jitter_amount)
+                     delay = max(delay, 0.1)
+
+                 delay = min(delay, reconnection_config.max_delay)
+
+                 logger.info(f"Retrying reconnection in {delay:.2f} seconds")
+                 await asyncio.sleep(delay)
+
+     async def _cleanup_connection(self) -> None:
+         """
+         Clean up existing connection and channels.
+         """
+         # Cancel existing consumers
+         for queue_name, channel in self.channels.items():
+             try:
+                 if not channel.is_closed:
+                     # Cancel consumer if we have its tag
+                     if queue_name in self.consumer_tags:
+                         try:
+                             queue = await channel.get_queue(queue_name, ensure=False)
+                             if queue:
+                                 await queue.cancel(self.consumer_tags[queue_name])
+                         except Exception as cancel_error:
+                             logger.warning(
+                                 f"Error cancelling consumer for {queue_name}: {cancel_error}"
+                             )
+                         del self.consumer_tags[queue_name]
+             except Exception as e:
+                 logger.warning(f"Error cancelling consumer for {queue_name}: {e}")
+
+         # Close channels
+         for queue_name, channel in self.channels.items():
+             try:
+                 if not channel.is_closed:
+                     await channel.close()
+             except Exception as e:
+                 logger.warning(f"Error closing channel for {queue_name}: {e}")
+
+         self.channels.clear()
+
+         # Close connection
+         if self.connection and not self.connection.is_closed:
+             try:
+                 await self.connection.close()
+             except Exception as e:
+                 logger.warning(f"Error closing connection: {e}")
+
+         self.connection = None
+         self.connection_healthy = False
+
+     async def _reestablish_consumers(self) -> None:
+         """
+         Re-establish all consumers after reconnection.
+         """
+         logger.info("Re-establishing consumers after reconnection")
+
+         # Re-establish message handlers
+         for handler in self.message_handler_set:
+             queue_name = f"{handler.message_type.MESSAGE_TOPIC}.{handler.instance_callable.__module__}.{handler.instance_callable.__qualname__}"
+             try:
+                 await self._setup_message_handler_consumer(handler)
+                 logger.info(f"Re-established consumer for {queue_name}")
+             except Exception as e:
+                 logger.error(f"Failed to re-establish consumer for {queue_name}: {e}")
+
+         # Re-establish scheduled actions
+         for scheduled_action in self.scheduled_actions:
+             queue_name = f"{scheduled_action.callable.__module__}.{scheduled_action.callable.__qualname__}"
+             try:
+                 await self._setup_scheduled_action_consumer(scheduled_action)
+                 logger.info(f"Re-established scheduler consumer for {queue_name}")
+             except Exception as e:
+                 logger.error(
+                     f"Failed to re-establish scheduler consumer for {queue_name}: {e}"
+                 )


  def create_message_bus(
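
The setup code above leans on retry_with_backoff from jararaca.utils.retry, which this diff does not show. Judging only from its call sites (a zero-argument async callable passed positionally or as fn=, plus retry_config= and retry_exceptions=, with the awaited result returned), a compatible sketch could be:

    import asyncio
    import random
    from typing import Awaitable, Callable, TypeVar

    T = TypeVar("T")

    async def retry_with_backoff(
        fn: Callable[[], Awaitable[T]],
        retry_config: RetryConfig,
        retry_exceptions: tuple[type[BaseException], ...] = (Exception,),
    ) -> T:
        # Hypothetical reconstruction; the real helper may differ.
        attempt = 0
        while True:
            try:
                return await fn()
            except retry_exceptions:
                attempt += 1
                # max_retries=-1 is used above to mean "retry indefinitely"
                if 0 <= retry_config.max_retries < attempt:
                    raise
                delay = min(
                    retry_config.initial_delay
                    * retry_config.backoff_factor ** (attempt - 1),
                    retry_config.max_delay,
                )
                if retry_config.jitter:
                    delay = max(0.1, delay + random.uniform(-0.25 * delay, 0.25 * delay))
                await asyncio.sleep(delay)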
@@ -254,10 +951,150 @@ def create_message_bus(
      exchange = query_params["exchange"][0]
      prefetch_count = int(query_params["prefetch_count"][0])

+     # Parse optional retry configuration parameters
+     connection_retry_config = RetryConfig()
+     consumer_retry_config = RetryConfig(
+         max_retries=30, initial_delay=5, max_delay=60.0, backoff_factor=3.0
+     )
+
+     # Parse optional reconnection configuration parameters
+     reconnection_backoff_config = RetryConfig(
+         max_retries=-1,  # Infinite retries for reconnection
+         initial_delay=2.0,
+         max_delay=120.0,
+         backoff_factor=2.0,
+         jitter=True,
+     )
+
+     # Parse heartbeat and health check intervals
+     connection_heartbeat_interval = 30.0
+     connection_health_check_interval = 10.0
+
+     # Connection retry config parameters
+     if (
+         "connection_retry_max" in query_params
+         and query_params["connection_retry_max"][0].isdigit()
+     ):
+         connection_retry_config.max_retries = int(
+             query_params["connection_retry_max"][0]
+         )
+
+     if "connection_retry_delay" in query_params:
+         try:
+             connection_retry_config.initial_delay = float(
+                 query_params["connection_retry_delay"][0]
+             )
+         except ValueError:
+             pass
+
+     if "connection_retry_max_delay" in query_params:
+         try:
+             connection_retry_config.max_delay = float(
+                 query_params["connection_retry_max_delay"][0]
+             )
+         except ValueError:
+             pass
+
+     if "connection_retry_backoff" in query_params:
+         try:
+             connection_retry_config.backoff_factor = float(
+                 query_params["connection_retry_backoff"][0]
+             )
+         except ValueError:
+             pass
+
+     # Consumer retry config parameters
+     if (
+         "consumer_retry_max" in query_params
+         and query_params["consumer_retry_max"][0].isdigit()
+     ):
+         consumer_retry_config.max_retries = int(
+             query_params["consumer_retry_max"][0]
+         )
+
+     if "consumer_retry_delay" in query_params:
+         try:
+             consumer_retry_config.initial_delay = float(
+                 query_params["consumer_retry_delay"][0]
+             )
+         except ValueError:
+             pass
+
+     if "consumer_retry_max_delay" in query_params:
+         try:
+             consumer_retry_config.max_delay = float(
+                 query_params["consumer_retry_max_delay"][0]
+             )
+         except ValueError:
+             pass
+
+     if "consumer_retry_backoff" in query_params:
+         try:
+             consumer_retry_config.backoff_factor = float(
+                 query_params["consumer_retry_backoff"][0]
+             )
+         except ValueError:
+             pass
+
+     # Reconnection backoff config parameters
+     if (
+         "reconnection_retry_max" in query_params
+         and query_params["reconnection_retry_max"][0].isdigit()
+     ):
+         reconnection_backoff_config.max_retries = int(
+             query_params["reconnection_retry_max"][0]
+         )
+
+     if "reconnection_retry_delay" in query_params:
+         try:
+             reconnection_backoff_config.initial_delay = float(
+                 query_params["reconnection_retry_delay"][0]
+             )
+         except ValueError:
+             pass
+
+     if "reconnection_retry_max_delay" in query_params:
+         try:
+             reconnection_backoff_config.max_delay = float(
+                 query_params["reconnection_retry_max_delay"][0]
+             )
+         except ValueError:
+             pass
+
+     if "reconnection_retry_backoff" in query_params:
+         try:
+             reconnection_backoff_config.backoff_factor = float(
+                 query_params["reconnection_retry_backoff"][0]
+             )
+         except ValueError:
+             pass
+
+     # Heartbeat and health check intervals
+     if "connection_heartbeat_interval" in query_params:
+         try:
+             connection_heartbeat_interval = float(
+                 query_params["connection_heartbeat_interval"][0]
+             )
+         except ValueError:
+             pass
+
+     if "connection_health_check_interval" in query_params:
+         try:
+             connection_health_check_interval = float(
+                 query_params["connection_health_check_interval"][0]
+             )
+         except ValueError:
+             pass
+
      config = AioPikaWorkerConfig(
          url=broker_url,
          exchange=exchange,
          prefetch_count=prefetch_count,
+         connection_retry_config=connection_retry_config,
+         consumer_retry_config=consumer_retry_config,
+         connection_heartbeat_interval=connection_heartbeat_interval,
+         connection_health_check_interval=connection_health_check_interval,
+         reconnection_backoff_config=reconnection_backoff_config,
      )

      return AioPikaMicroserviceConsumer(
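
All of the new settings are driven by the broker URL's query string, so a deployment can tune retry behavior without code changes. A hypothetical URL exercising several of the parameters parsed above (host and credentials are placeholders):

    amqp://guest:guest@localhost:5672/?exchange=main&prefetch_count=10&connection_retry_max=20&connection_retry_delay=1.5&consumer_retry_backoff=2.0&reconnection_retry_max_delay=300&connection_heartbeat_interval=60&connection_health_check_interval=15

Note that malformed values are silently ignored (the except ValueError: pass branches), so a typo falls back to the built-in defaults rather than failing fast.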
@@ -291,8 +1128,42 @@ class ScheduledMessageHandlerCallback:
      ) -> None:

          if self.consumer.shutdown_event.is_set():
+             logger.info(
+                 f"Shutdown in progress. Requeuing scheduled message for {self.queue_name}"
+             )
+             try:
+                 # Use channel context for requeuing
+                 async with self.consumer.get_channel_ctx(self.queue_name):
+                     await aio_pika_message.reject(requeue=True)
+             except RuntimeError:
+                 logger.warning(
+                     f"Could not requeue scheduled message during shutdown - channel not available"
+                 )
+             except Exception as e:
+                 logger.error(
+                     f"Failed to requeue scheduled message during shutdown: {e}"
+                 )
              return

+         # Check if connection is healthy before processing
+         if not self.consumer.connection_healthy:
+             logger.warning(
+                 f"Connection not healthy, requeuing scheduled message for {self.queue_name}"
+             )
+             try:
+                 # Wait briefly for potential reconnection
+                 await asyncio.sleep(0.1)
+                 if not self.consumer.connection_healthy:
+                     # Still not healthy, requeue the message
+                     async with self.consumer.get_channel_ctx(self.queue_name):
+                         await aio_pika_message.reject(requeue=True)
+                     return
+             except Exception as e:
+                 logger.error(
+                     f"Failed to requeue scheduled message due to connection issues: {e}"
+                 )
+                 return
+
          async with self.consumer.lock:
              task = asyncio.create_task(self.handle_message(aio_pika_message))
              self.consumer.tasks.add(task)
@@ -300,14 +1171,48 @@ class ScheduledMessageHandlerCallback:

      def handle_message_consume_done(self, task: asyncio.Task[Any]) -> None:
          self.consumer.tasks.discard(task)
+         if task.cancelled():
+             logger.warning(f"Scheduled task for {self.queue_name} was cancelled")
+             return
+
+         if (error := task.exception()) is not None:
+             logger.exception(
+                 f"Error processing scheduled action {self.queue_name}", exc_info=error
+             )

      async def handle_message(
          self, aio_pika_message: aio_pika.abc.AbstractIncomingMessage
      ) -> None:

          if self.consumer.shutdown_event.is_set():
-             logger.info("Shutdown event set. Rqueuing message")
-             await aio_pika_message.reject(requeue=True)
+             logger.info(f"Shutdown event set. Requeuing message for {self.queue_name}")
+             try:
+                 # Use channel context for requeuing
+                 async with self.consumer.get_channel_ctx(self.queue_name):
+                     await aio_pika_message.reject(requeue=True)
+                     return
+             except RuntimeError:
+                 logger.warning(
+                     f"Could not requeue message during shutdown - channel not available"
+                 )
+             except Exception as e:
+                 logger.error(f"Failed to requeue message during shutdown: {e}")
+             return
+
+         # Check connection health before processing
+         if not self.consumer.connection_healthy:
+             logger.warning(
+                 f"Connection not healthy, requeuing scheduled message for {self.queue_name}"
+             )
+             try:
+                 async with self.consumer.get_channel_ctx(self.queue_name):
+                     await aio_pika_message.reject(requeue=True)
+                     return
+             except Exception as e:
+                 logger.error(
+                     f"Failed to requeue scheduled message due to connection issues: {e}"
+                 )
+                 return

          sig = inspect.signature(self.scheduled_action.callable)
          if len(sig.parameters) == 1:
@@ -352,18 +1257,19 @@ class ScheduledMessageHandlerCallback:
          args: tuple[Any, ...],
          kwargs: dict[str, Any],
      ) -> None:
-         async with self.consumer.uow_context_provider(
-             AppTransactionContext(
-                 controller_member_reflect=scheduled_action.controller_member,
-                 transaction_data=SchedulerTransactionData(
-                     scheduled_to=datetime.now(UTC),
-                     cron_expression=scheduled_action.spec.cron,
-                     triggered_at=datetime.now(UTC),
-                 ),
-             )
-         ):
+         with provide_shutdown_state(self.consumer.shutdown_state):
+             async with self.consumer.uow_context_provider(
+                 AppTransactionContext(
+                     controller_member_reflect=scheduled_action.controller_member,
+                     transaction_data=SchedulerTransactionData(
+                         scheduled_to=datetime.now(UTC),
+                         cron_expression=scheduled_action.spec.cron,
+                         triggered_at=datetime.now(UTC),
+                     ),
+                 )
+             ):

-             await scheduled_action.callable(*args, **kwargs)
+                 await scheduled_action.callable(*args, **kwargs)


  class MessageHandlerCallback:
@@ -379,13 +1285,44 @@ class MessageHandlerCallback:
          self.queue_name = queue_name
          self.routing_key = routing_key
          self.message_handler = message_handler
+         self.retry_state: dict[str, dict[str, Any]] = {}

      async def message_consumer(
          self, aio_pika_message: aio_pika.abc.AbstractIncomingMessage
      ) -> None:
          if self.consumer.shutdown_event.is_set():
+             logger.info(
+                 f"Shutdown in progress. Requeuing message for {self.queue_name}"
+             )
+             try:
+                 # Use channel context for requeuing
+                 async with self.consumer.get_channel_ctx(self.queue_name):
+                     await aio_pika_message.reject(requeue=True)
+             except RuntimeError:
+                 logger.warning(
+                     f"Could not requeue message during shutdown - channel not available"
+                 )
+             except Exception as e:
+                 logger.error(f"Failed to requeue message during shutdown: {e}")
              return

+         # Check if connection is healthy before processing
+         if not self.consumer.connection_healthy:
+             logger.warning(
+                 f"Connection not healthy, requeuing message for {self.queue_name}"
+             )
+             try:
+                 # Wait briefly for potential reconnection
+                 await asyncio.sleep(0.1)
+                 if not self.consumer.connection_healthy:
+                     # Still not healthy, requeue the message
+                     async with self.consumer.get_channel_ctx(self.queue_name):
+                         await aio_pika_message.reject(requeue=True)
+                     return
+             except Exception as e:
+                 logger.error(f"Failed to requeue message due to connection issues: {e}")
+                 return
+
          async with self.consumer.lock:
              task = asyncio.create_task(self.handle_message(aio_pika_message))
              self.consumer.tasks.add(task)
@@ -394,10 +1331,13 @@ class MessageHandlerCallback:
      def handle_message_consume_done(self, task: asyncio.Task[Any]) -> None:
          self.consumer.tasks.discard(task)
          if task.cancelled():
+             logger.warning(f"Task for queue {self.queue_name} was cancelled")
              return

          if (error := task.exception()) is not None:
-             logger.exception("Error processing message", exc_info=error)
+             logger.exception(
+                 f"Error processing message for queue {self.queue_name}", exc_info=error
+             )

      async def __call__(
          self, aio_pika_message: aio_pika.abc.AbstractIncomingMessage
@@ -408,14 +1348,203 @@ class MessageHandlerCallback:
          self,
          aio_pika_message: aio_pika.abc.AbstractIncomingMessage,
          requeue: bool = False,
+         retry_count: int = 0,
+         exception: Optional[BaseException] = None,
      ) -> None:
-         if self.message_handler.spec.auto_ack is False:
-             await aio_pika_message.reject(requeue=requeue)
-         elif requeue:
-             logger.warning(
-                 f"Message {aio_pika_message.message_id} ({self.queue_name}) cannot be requeued because auto_ack is enabled"
+         """
+         Handle rejecting a message, with support for retry with exponential backoff.
+
+         Args:
+             aio_pika_message: The message to reject
+             requeue: Whether to requeue the message directly (True) or handle with retry logic (False)
+             retry_count: The current retry count for this message
+             exception: The exception that caused the rejection, if any
+         """
+         message_id = aio_pika_message.message_id or str(uuid.uuid4())
+
+         # If auto_ack is enabled, we cannot retry the message through RabbitMQ reject mechanism
+         if self.message_handler.spec.auto_ack:
+             if requeue:
+                 logger.warning(
+                     f"Message {message_id} ({self.queue_name}) cannot be requeued because auto_ack is enabled"
+                 )
+             return
+
+         try:
+             # Check if we should retry with backoff
+             if (
+                 not requeue
+                 and self.message_handler.spec.requeue_on_exception
+                 and exception is not None
+             ):
+                 # Get retry config from consumer
+                 retry_config = self.consumer.config.consumer_retry_config
+
+                 # Check if we reached max retries
+                 if retry_count >= retry_config.max_retries:
+                     logger.warning(
+                         f"Message {message_id} ({self.queue_name}) failed after {retry_count} retries, "
+                         f"dead-lettering: {str(exception)}"
+                     )
+                     # Dead-letter the message after max retries
+                     try:
+                         async with self.consumer.get_channel_ctx(self.queue_name):
+                             await aio_pika_message.reject(requeue=False)
+                     except Exception as e:
+                         logger.error(f"Failed to dead-letter message {message_id}: {e}")
+                     return
+
+                 # Calculate delay for this retry attempt
+                 delay = retry_config.initial_delay * (
+                     retry_config.backoff_factor**retry_count
+                 )
+                 if retry_config.jitter:
+                     jitter_amount = delay * 0.25
+                     delay = delay + random.uniform(-jitter_amount, jitter_amount)
+                     delay = max(
+                         delay, 0.1
+                     )  # Ensure delay doesn't go negative due to jitter
+
+                 delay = min(delay, retry_config.max_delay)
+
+                 logger.info(
+                     f"Message {message_id} ({self.queue_name}) failed with {str(exception)}, "
+                     f"retry {retry_count+1}/{retry_config.max_retries} scheduled in {delay:.2f}s"
+                 )
+
+                 # Store retry state for this message
+                 self.retry_state[message_id] = {
+                     "retry_count": retry_count + 1,
+                     "last_exception": exception,
+                     "next_retry": time.time() + delay,
+                 }
+
+                 # Schedule retry after delay
+                 asyncio.create_task(
+                     self._delayed_retry(
+                         aio_pika_message, delay, retry_count + 1, exception
+                     )
+                 )
+
+                 # Acknowledge the current message since we'll handle retry ourselves
+                 try:
+                     async with self.consumer.get_channel_ctx(self.queue_name):
+                         await aio_pika_message.ack()
+                 except Exception as e:
+                     logger.error(
+                         f"Failed to acknowledge message {message_id} for retry: {e}"
+                     )
+                 return
+
+             # Standard reject without retry or with immediate requeue
+             try:
+                 async with self.consumer.get_channel_ctx(self.queue_name):
+                     await aio_pika_message.reject(requeue=requeue)
+                     if requeue:
+                         logger.info(
+                             f"Message {message_id} ({self.queue_name}) requeued for immediate retry"
+                         )
+                     else:
+                         logger.info(
+                             f"Message {message_id} ({self.queue_name}) rejected without requeue"
+                         )
+             except Exception as e:
+                 logger.error(f"Failed to reject message {message_id}: {e}")
+
+         except Exception as e:
+             logger.exception(
+                 f"Unexpected error in handle_reject_message for {message_id} ({self.queue_name}): {e}"
              )

+     async def _delayed_retry(
+         self,
+         aio_pika_message: aio_pika.abc.AbstractIncomingMessage,
+         delay: float,
+         retry_count: int,
+         exception: Optional[BaseException],
+     ) -> None:
+         """
+         Handle delayed retry of a message after exponential backoff delay.
+
+         Args:
+             aio_pika_message: The original message
+             delay: Delay in seconds before retrying
+             retry_count: The current retry count (after increment)
+             exception: The exception that caused the failure
+         """
+         message_id = aio_pika_message.message_id or str(uuid.uuid4())
+
+         try:
+             # Wait for the backoff delay
+             await asyncio.sleep(delay)
+
+             # Get message body and properties for republishing
+             message_body = aio_pika_message.body
+             headers = (
+                 aio_pika_message.headers.copy() if aio_pika_message.headers else {}
+             )
+
+             # Add retry information to headers
+             headers["x-retry-count"] = retry_count
+             if exception:
+                 headers["x-last-error"] = str(exception)
+
+             # Clean up retry state
+             if message_id in self.retry_state:
+                 del self.retry_state[message_id]
+
+             # Republish the message to the same queue with retry logic
+             max_attempts = 3
+             for attempt in range(max_attempts):
+                 try:
+                     async with self.consumer.get_channel_ctx(
+                         self.queue_name
+                     ) as channel:
+                         exchange = await RabbitmqUtils.get_main_exchange(
+                             channel=channel,
+                             exchange_name=self.consumer.config.exchange,
+                         )
+
+                         await exchange.publish(
+                             aio_pika.Message(
+                                 body=message_body,
+                                 headers=headers,
+                                 message_id=message_id,
+                                 content_type=aio_pika_message.content_type,
+                                 content_encoding=aio_pika_message.content_encoding,
+                                 delivery_mode=aio_pika_message.delivery_mode,
+                             ),
+                             routing_key=self.routing_key,
+                         )
+
+                     logger.info(
+                         f"Message {message_id} ({self.queue_name}) republished for retry {retry_count}"
+                     )
+                     return
+
+                 except Exception as e:
+                     if attempt < max_attempts - 1:
+                         logger.warning(
+                             f"Failed to republish message {message_id} (attempt {attempt + 1}): {e}"
+                         )
+                         await asyncio.sleep(1.0 * (attempt + 1))  # Exponential backoff
+                     else:
+                         logger.error(
+                             f"Failed to republish message {message_id} after {max_attempts} attempts: {e}"
+                         )
+                         raise
+
+         except Exception as e:
+             logger.exception(
+                 f"Failed to execute delayed retry for message {message_id} ({self.queue_name}): {e}"
+             )
+             # If we fail to republish, try to dead-letter the original message
+             try:
+                 if message_id in self.retry_state:
+                     del self.retry_state[message_id]
+             except Exception:
+                 pass
+
      async def handle_message(
          self, aio_pika_message: aio_pika.abc.AbstractIncomingMessage
      ) -> None:
@@ -472,51 +1601,92 @@ class MessageHandlerCallback:
          incoming_message_spec = MessageHandler.get_message_incoming(handler)
          assert incoming_message_spec is not None

-         async with self.consumer.uow_context_provider(
-             AppTransactionContext(
-                 controller_member_reflect=handler_data.controller_member,
-                 transaction_data=MessageBusTransactionData(
-                     message=builded_message,
-                     topic=routing_key,
-                 ),
-             )
-         ):
-             ctx: AsyncContextManager[Any]
-             if incoming_message_spec.timeout is not None:
-                 ctx = asyncio.timeout(incoming_message_spec.timeout)
-             else:
-                 ctx = none_context()
-             async with ctx:
-                 try:
-                     with provide_bus_message_controller(
-                         AioPikaMessageBusController(aio_pika_message)
-                     ):
-                         await handler(builded_message)
-                     if not incoming_message_spec.auto_ack:
-                         with suppress(aio_pika.MessageProcessError):
-                             await aio_pika_message.ack()
-                 except BaseException as base_exc:
-                     if incoming_message_spec.exception_handler is not None:
-                         try:
-                             incoming_message_spec.exception_handler(base_exc)
-                         except Exception as nested_exc:
+         with provide_shutdown_state(self.consumer.shutdown_state):
+             async with self.consumer.uow_context_provider(
+                 AppTransactionContext(
+                     controller_member_reflect=handler_data.controller_member,
+                     transaction_data=MessageBusTransactionData(
+                         message=builded_message,
+                         topic=routing_key,
+                     ),
+                 )
+             ):
+                 ctx: AsyncContextManager[Any]
+                 if incoming_message_spec.timeout is not None:
+                     ctx = asyncio.timeout(incoming_message_spec.timeout)
+                 else:
+                     ctx = none_context()
+                 async with ctx:
+                     try:
+                         with provide_bus_message_controller(
+                             AioPikaMessageBusController(aio_pika_message)
+                         ):
+                             await handler(builded_message)
+                         if not incoming_message_spec.auto_ack:
+                             with suppress(aio_pika.MessageProcessError):
+                                 # Use channel context for acknowledgement with retry
+                                 try:
+                                     async with self.consumer.get_channel_ctx(
+                                         self.queue_name
+                                     ):
+                                         await aio_pika_message.ack()
+                                 except Exception as ack_error:
+                                     logger.warning(
+                                         f"Failed to acknowledge message {aio_pika_message.message_id or 'unknown'}: {ack_error}"
+                                     )
+                                     # Message will be redelivered if ack fails, which is acceptable
+                     except BaseException as base_exc:
+                         # Get message id for logging
+                         message_id = aio_pika_message.message_id or str(uuid.uuid4())
+
+                         # Extract retry count from headers if available
+                         headers = aio_pika_message.headers or {}
+                         retry_count = int(str(headers.get("x-retry-count", 0)))
+
+                         # Process exception handler if configured
+                         if incoming_message_spec.exception_handler is not None:
+                             try:
+                                 incoming_message_spec.exception_handler(base_exc)
+                             except Exception as nested_exc:
+                                 logger.exception(
+                                     f"Error processing exception handler for message {message_id}: {base_exc} | {nested_exc}"
+                                 )
+                         else:
                              logger.exception(
-                                 f"Error processing exception handler: {base_exc} | {nested_exc}"
+                                 f"Error processing message {message_id} on topic {routing_key}: {str(base_exc)}"
+                             )
+
+                         # Handle rejection with retry logic
+                         if incoming_message_spec.requeue_on_exception:
+                             # Use our retry with backoff mechanism
+                             await self.handle_reject_message(
+                                 aio_pika_message,
+                                 requeue=False,  # Don't requeue directly, use our backoff mechanism
+                                 retry_count=retry_count,
+                                 exception=base_exc,
+                             )
+                         else:
+                             # Message shouldn't be retried, reject it
+                             await self.handle_reject_message(
+                                 aio_pika_message, requeue=False, exception=base_exc
                              )
                      else:
-                         logger.exception(
-                             f"Error processing message on topic {routing_key}"
-                         )
-                     if incoming_message_spec.requeue_on_exception:
-                         await self.handle_reject_message(aio_pika_message, requeue=True)
-                     else:
-                         await self.handle_reject_message(
-                             aio_pika_message, requeue=False
-                         )
-                 else:
-                     logger.info(
-                         f"Message {aio_pika_message.message_id}#{self.queue_name} processed successfully"
-                     )
+                         # Message processed successfully, log and clean up any retry state
+                         message_id = aio_pika_message.message_id or str(uuid.uuid4())
+                         if message_id in self.retry_state:
+                             del self.retry_state[message_id]
+
+                         # Log success with retry information if applicable
+                         headers = aio_pika_message.headers or {}
+                         if "x-retry-count" in headers:
+                             retry_count = int(str(headers.get("x-retry-count", 0)))
+                             logger.info(
+                                 f"Message {message_id}#{self.queue_name} processed successfully after {retry_count} retries"
+                             )
+                         else:
+                             logger.info(
+                                 f"Message {message_id}#{self.queue_name} processed successfully"
+                             )


  @asynccontextmanager
@@ -614,19 +1784,64 @@ class MessageBusWorker:
      def start_sync(self) -> None:

          def on_shutdown(loop: asyncio.AbstractEventLoop) -> None:
-             logger.info("Shutting down")
-             self.consumer.shutdown()
+             logger.info("Shutting down - signal received")
+             # Schedule the shutdown to run in the event loop
+             asyncio.create_task(self._graceful_shutdown())
+             # wait until the shutdown is complete

          with asyncio.Runner(loop_factory=uvloop.new_event_loop) as runner:
-             runner.get_loop().add_signal_handler(
-                 signal.SIGINT, on_shutdown, runner.get_loop()
-             )
+             loop = runner.get_loop()
+             loop.add_signal_handler(signal.SIGINT, on_shutdown, loop)
+             # Add graceful shutdown handler for SIGTERM as well
+             loop.add_signal_handler(signal.SIGTERM, on_shutdown, loop)
              runner.run(self.start_async())

+     async def _graceful_shutdown(self) -> None:
+         """Handles graceful shutdown process"""
+         logger.info("Initiating graceful shutdown sequence")
+         # Use the comprehensive close method that handles shutdown, task waiting and connection cleanup
+
+         self.consumer.shutdown()
+         logger.info("Graceful shutdown completed")
+

  class AioPikaMessageBusController(BusMessageController):
      def __init__(self, aio_pika_message: aio_pika.abc.AbstractIncomingMessage):
          self.aio_pika_message = aio_pika_message
+         # We access consumer callback through context if available
+         self._callback: Optional[MessageHandlerCallback] = None
+
+     def _get_callback(self) -> MessageHandlerCallback:
+         """
+         Find the callback associated with this message.
+         This allows us to access the retry mechanisms.
+         """
+         if self._callback is None:
+             # Get the context from current frame's locals
+             frame = inspect.currentframe()
+             if frame is not None:
+                 try:
+                     caller_frame = frame.f_back
+                     if caller_frame is not None:
+                         # Check for context with handler callback
+                         callback_ref = None
+                         # Look for handler_message call context
+                         while caller_frame is not None:
+                             if "self" in caller_frame.f_locals:
+                                 self_obj = caller_frame.f_locals["self"]
+                                 if isinstance(self_obj, MessageHandlerCallback):
+                                     callback_ref = self_obj
+                                     break
+                             caller_frame = caller_frame.f_back
+                         # Save callback reference if we found it
+                         self._callback = callback_ref
+                 finally:
+                     del frame  # Avoid reference cycles
+
+         if self._callback is None:
+             raise RuntimeError("Could not find callback context for message retry")
+
+         return self._callback

      async def ack(self) -> None:
          await self.aio_pika_message.ack()
@@ -638,7 +1853,41 @@ class AioPikaMessageBusController(BusMessageController):
          await self.aio_pika_message.reject()

      async def retry(self) -> None:
-         await self.aio_pika_message.reject(requeue=True)
+         """
+         Retry the message immediately by rejecting with requeue flag.
+         This doesn't use the exponential backoff mechanism.
+         """
+         callback = self._get_callback()
+         await callback.handle_reject_message(self.aio_pika_message, requeue=True)

      async def retry_later(self, delay: int) -> None:
-         raise NotImplementedError("Not implemented")
+         """
+         Retry the message after a specified delay using the exponential backoff mechanism.
+
+         Args:
+             delay: Minimum delay in seconds before retrying
+         """
+         try:
+             callback = self._get_callback()
+
+             # Get current retry count from message headers
+             headers = self.aio_pika_message.headers or {}
+             retry_count = int(str(headers.get("x-retry-count", 0)))
+
+             # Handle retry with explicit delay
+             asyncio.create_task(
+                 callback._delayed_retry(
+                     self.aio_pika_message,
+                     float(delay),
+                     retry_count + 1,
+                     None,  # No specific exception
+                 )
+             )
+
+             # Acknowledge the current message since we'll republish
+             await self.aio_pika_message.ack()
+
+         except Exception as e:
+             logger.exception(f"Failed to schedule retry_later: {e}")
+             # Fall back to immediate retry
+             await self.aio_pika_message.reject(requeue=True)
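
For reference, the per-message backoff in handle_reject_message follows delay = initial_delay * backoff_factor ** retry_count, clipped to max_delay, with optional +/-25% jitter. A small sketch of the schedule this yields for the consumer_retry_config defaults set in create_message_bus (initial_delay=5, backoff_factor=3.0, max_delay=60.0):

    def backoff_schedule(
        initial_delay: float, backoff_factor: float, max_delay: float, retries: int
    ) -> list[float]:
        # Mirrors the delay formula used in handle_reject_message (jitter omitted).
        return [min(initial_delay * backoff_factor**n, max_delay) for n in range(retries)]

    print(backoff_schedule(5, 3.0, 60.0, 5))  # [5.0, 15.0, 45.0, 60.0, 60.0]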