jararaca 0.3.12a13__py3-none-any.whl → 0.3.12a14__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.

Potentially problematic release: this version of jararaca might be problematic.

@@ -23,7 +23,14 @@ from urllib.parse import parse_qs, urlparse
  import aio_pika
  import aio_pika.abc
  import uvloop
- from aio_pika.exceptions import AMQPError, ChannelClosed, ChannelNotFoundEntity
+ from aio_pika.exceptions import (
+ AMQPChannelError,
+ AMQPConnectionError,
+ AMQPError,
+ ChannelClosed,
+ ChannelNotFoundEntity,
+ ConnectionClosed,
+ )
  from pydantic import BaseModel

  from jararaca.broker_backend import MessageBrokerBackend
@@ -80,6 +87,18 @@ class AioPikaWorkerConfig:
  backoff_factor=2.0,
  )
  )
+ # Connection health monitoring settings
+ connection_heartbeat_interval: float = 30.0 # seconds
+ connection_health_check_interval: float = 10.0 # seconds
+ reconnection_backoff_config: RetryConfig = field(
+ default_factory=lambda: RetryConfig(
+ max_retries=-1, # Infinite retries for reconnection
+ initial_delay=2.0,
+ max_delay=120.0,
+ backoff_factor=2.0,
+ jitter=True,
+ )
+ )


  class AioPikaMessage(MessageOf[Message]):
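
The new fields above extend AioPikaWorkerConfig with heartbeat, health-check, and reconnection-backoff settings. A minimal sketch of overriding them when building the config directly; the import path and the exact set of other constructor arguments are assumptions, not shown in this diff:

# Sketch only: module path assumed; defaults mirror the values in the diff above.
from jararaca.messagebus.worker import AioPikaWorkerConfig, RetryConfig

config = AioPikaWorkerConfig(
    url="amqp://guest:guest@localhost/",
    exchange="jararaca_ex",
    prefetch_count=10,
    connection_heartbeat_interval=30.0,     # seconds between AMQP heartbeats
    connection_health_check_interval=10.0,  # seconds between background health probes
    reconnection_backoff_config=RetryConfig(
        max_retries=-1,                     # -1 = keep retrying reconnection forever
        initial_delay=2.0,
        max_delay=120.0,
        backoff_factor=2.0,
        jitter=True,
    ),
)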
@@ -165,6 +184,15 @@ class AioPikaMicroserviceConsumer(MessageBusConsumer):
  self.connection: aio_pika.abc.AbstractConnection | None = None
  self.channels: dict[str, aio_pika.abc.AbstractChannel] = {}

+ # Connection resilience attributes
+ self.connection_healthy = False
+ self.connection_lock = asyncio.Lock()
+ self.reconnection_event = asyncio.Event()
+ self.reconnection_in_progress = False
+ self.consumer_tags: dict[str, str] = {} # Track consumer tags for cleanup
+ self.health_check_task: asyncio.Task[Any] | None = None
+ self.reconnection_task: asyncio.Task[Any] | None = None
+
  async def _verify_infrastructure(self) -> bool:
  """
  Verify that the required RabbitMQ infrastructure (exchanges, queues) exists.
@@ -200,14 +228,18 @@ class AioPikaMicroserviceConsumer(MessageBusConsumer):
  routing_key = f"{handler.message_type.MESSAGE_TOPIC}.#"

  async def setup_consumer() -> None:
+ # Wait for connection to be healthy if reconnection is in progress
+ if self.reconnection_in_progress:
+ await self.reconnection_event.wait()
+
  # Create a channel using the context manager
  async with self.create_channel(queue_name) as channel:
  queue = await RabbitmqUtils.get_queue(
  channel=channel, queue_name=queue_name
  )

- # Configure consumer right away while in the context
- await queue.consume(
+ # Configure consumer and get the consumer tag
+ consumer_tag = await queue.consume(
  callback=MessageHandlerCallback(
  consumer=self,
  queue_name=queue_name,
@@ -217,6 +249,9 @@ class AioPikaMicroserviceConsumer(MessageBusConsumer):
  no_ack=handler.spec.auto_ack,
  )

+ # Store consumer tag for cleanup
+ self.consumer_tags[queue_name] = consumer_tag
+
  logger.info(
  f"Consuming message handler {queue_name} on dedicated channel"
  )
@@ -226,7 +261,14 @@ class AioPikaMicroserviceConsumer(MessageBusConsumer):
  await retry_with_backoff(
  setup_consumer,
  retry_config=self.config.consumer_retry_config,
- retry_exceptions=(ChannelNotFoundEntity, ChannelClosed, AMQPError),
+ retry_exceptions=(
+ ChannelNotFoundEntity,
+ ChannelClosed,
+ AMQPError,
+ AMQPConnectionError,
+ AMQPChannelError,
+ ConnectionClosed,
+ ),
  )
  return True
  except Exception as e:
@@ -246,14 +288,18 @@ class AioPikaMicroserviceConsumer(MessageBusConsumer):
  routing_key = queue_name

  async def setup_consumer() -> None:
+ # Wait for connection to be healthy if reconnection is in progress
+ if self.reconnection_in_progress:
+ await self.reconnection_event.wait()
+
  # Create a channel using the context manager
  async with self.create_channel(queue_name) as channel:
  queue = await RabbitmqUtils.get_queue(
  channel=channel, queue_name=queue_name
  )

- # Configure consumer right away while in the context
- await queue.consume(
+ # Configure consumer and get the consumer tag
+ consumer_tag = await queue.consume(
  callback=ScheduledMessageHandlerCallback(
  consumer=self,
  queue_name=queue_name,
@@ -263,6 +309,9 @@ class AioPikaMicroserviceConsumer(MessageBusConsumer):
  no_ack=True,
  )

+ # Store consumer tag for cleanup
+ self.consumer_tags[queue_name] = consumer_tag
+
  logger.info(f"Consuming scheduler {queue_name} on dedicated channel")

  try:
@@ -270,7 +319,14 @@ class AioPikaMicroserviceConsumer(MessageBusConsumer):
  await retry_with_backoff(
  setup_consumer,
  retry_config=self.config.consumer_retry_config,
- retry_exceptions=(ChannelNotFoundEntity, ChannelClosed, AMQPError),
+ retry_exceptions=(
+ ChannelNotFoundEntity,
+ ChannelClosed,
+ AMQPError,
+ AMQPConnectionError,
+ AMQPChannelError,
+ ConnectionClosed,
+ ),
  )
  return True
  except Exception as e:
@@ -283,98 +339,107 @@ class AioPikaMicroserviceConsumer(MessageBusConsumer):
  """
  Main consume method that sets up all message handlers and scheduled actions with retry mechanisms.
  """
- # Verify infrastructure with retry
- infra_check_success = await retry_with_backoff(
- self._verify_infrastructure,
- retry_config=self.config.connection_retry_config,
- retry_exceptions=(Exception,),
- )
+ # Establish initial connection
+ async with self.connect() as connection:
+ self.connection_healthy = True

- if not infra_check_success:
- logger.critical("Failed to verify RabbitMQ infrastructure. Shutting down.")
- self.shutdown_event.set()
- return
+ # Start connection health monitoring
+ self.health_check_task = asyncio.create_task(
+ self._monitor_connection_health()
+ )

- async def wait_for(
- type: str, name: str, coroutine: Awaitable[bool]
- ) -> tuple[str, str, bool]:
- return type, name, await coroutine
+ # Verify infrastructure with retry
+ infra_check_success = await retry_with_backoff(
+ self._verify_infrastructure,
+ retry_config=self.config.connection_retry_config,
+ retry_exceptions=(Exception,),
+ )

- tasks: set[asyncio.Task[tuple[str, str, bool]]] = set()
+ if not infra_check_success:
+ logger.critical(
+ "Failed to verify RabbitMQ infrastructure. Shutting down."
+ )
+ self.shutdown_event.set()
+ return

- # Setup message handlers
- for handler in self.message_handler_set:
- queue_name = f"{handler.message_type.MESSAGE_TOPIC}.{handler.instance_callable.__module__}.{handler.instance_callable.__qualname__}"
- self.incoming_map[queue_name] = handler
-
- tasks.add(
- task := asyncio.create_task(
- wait_for(
- "message_handler",
- queue_name,
- self._setup_message_handler_consumer(handler),
+ async def wait_for(
+ type: str, name: str, coroutine: Awaitable[bool]
+ ) -> tuple[str, str, bool]:
+ return type, name, await coroutine
+
+ tasks: set[asyncio.Task[tuple[str, str, bool]]] = set()
+
+ # Setup message handlers
+ for handler in self.message_handler_set:
+ queue_name = f"{handler.message_type.MESSAGE_TOPIC}.{handler.instance_callable.__module__}.{handler.instance_callable.__qualname__}"
+ self.incoming_map[queue_name] = handler
+
+ tasks.add(
+ task := asyncio.create_task(
+ wait_for(
+ "message_handler",
+ queue_name,
+ self._setup_message_handler_consumer(handler),
+ )
  )
  )
- )
- # task.add_done_callback(tasks.discard)
- # success = await self._setup_message_handler_consumer(handler)
- # if not success:
- # logger.warning(
- # f"Failed to set up consumer for {queue_name}, will not process messages from this queue"
- # )
-
- # Setup scheduled actions
- for scheduled_action in self.scheduled_actions:

- queue_name = f"{scheduled_action.callable.__module__}.{scheduled_action.callable.__qualname__}"
- tasks.add(
- task := asyncio.create_task(
- wait_for(
- "scheduled_action",
- queue_name,
- self._setup_scheduled_action_consumer(scheduled_action),
+ # Setup scheduled actions
+ for scheduled_action in self.scheduled_actions:
+ queue_name = f"{scheduled_action.callable.__module__}.{scheduled_action.callable.__qualname__}"
+ tasks.add(
+ task := asyncio.create_task(
+ wait_for(
+ "scheduled_action",
+ queue_name,
+ self._setup_scheduled_action_consumer(scheduled_action),
+ )
  )
  )
- )
- # task.add_done_callback(tasks.discard)
-
- # success = await self._setup_scheduled_action_consumer(scheduled_action)
- # if not success:
- # queue_name = f"{scheduled_action.callable.__module__}.{scheduled_action.callable.__qualname__}"
- # logger.warning(
- # f"Failed to set up consumer for scheduled action {queue_name}, will not process scheduled tasks from this queue"
- # )
-
- async def handle_task_results() -> None:
- for task in asyncio.as_completed(tasks):
- type, name, success = await task
- if success:
- logger.info(f"Successfully set up {type} consumer for {name}")
- else:
- logger.warning(
- f"Failed to set up {type} consumer for {name}, will not process messages from this queue"
- )

- handle_task_results_task = asyncio.create_task(handle_task_results())
-
- # Wait for shutdown signal
- await self.shutdown_event.wait()
- logger.info("Shutdown event received, stopping consumers")
- handle_task_results_task.cancel()
- with suppress(asyncio.CancelledError):
- await handle_task_results_task
- for task in tasks:
- if not task.done():
- task.cancel()
+ async def handle_task_results() -> None:
+ for task in asyncio.as_completed(tasks):
+ type, name, success = await task
+ if success:
+ logger.info(f"Successfully set up {type} consumer for {name}")
+ else:
+ logger.warning(
+ f"Failed to set up {type} consumer for {name}, will not process messages from this queue"
+ )
+
+ handle_task_results_task = asyncio.create_task(handle_task_results())
+
+ # Wait for shutdown signal
+ await self.shutdown_event.wait()
+ logger.info("Shutdown event received, stopping consumers")
+
+ # Cancel health monitoring
+ if self.health_check_task:
+ self.health_check_task.cancel()
  with suppress(asyncio.CancelledError):
- await task
- logger.info("Worker shutting down")
+ await self.health_check_task

- # Wait for all tasks to complete
- await self.wait_all_tasks_done()
+ # Cancel reconnection task if running
+ if self.reconnection_task:
+ self.reconnection_task.cancel()
+ with suppress(asyncio.CancelledError):
+ await self.reconnection_task

- # Close all channels and the connection
- await self.close_channels_and_connection()
+ handle_task_results_task.cancel()
+ with suppress(asyncio.CancelledError):
+ await handle_task_results_task
+ for task in tasks:
+ if not task.done():
+ task.cancel()
+ with suppress(asyncio.CancelledError):
+ await task
+ logger.info("Worker shutting down")
+
+ # Wait for all tasks to complete
+ await self.wait_all_tasks_done()
+
+ # Close all channels and the connection
+ await self.close_channels_and_connection()

  async def wait_all_tasks_done(self) -> None:
  if not self.tasks:
@@ -393,41 +458,8 @@ class AioPikaMicroserviceConsumer(MessageBusConsumer):

  async def close_channels_and_connection(self) -> None:
  """Close all channels and then the connection"""
- # Close all channels
- channel_close_tasks = []
- for queue_name, channel in self.channels.items():
- try:
- if not channel.is_closed:
- logger.info(f"Closing channel for queue {queue_name}")
- channel_close_tasks.append(channel.close())
- else:
- logger.info(f"Channel for queue {queue_name} already closed")
- except Exception as e:
- logger.error(
- f"Error preparing to close channel for queue {queue_name}: {e}"
- )
-
- # Wait for all channels to close (if any)
- if channel_close_tasks:
- try:
- await asyncio.gather(*channel_close_tasks, return_exceptions=True)
- except Exception as e:
- logger.error(f"Error during channel closures: {e}")
-
- # Clear channels dictionary
- self.channels.clear()
-
- # Close the connection
- if self.connection:
- try:
- if not self.connection.is_closed:
- logger.info("Closing RabbitMQ connection")
- await self.connection.close()
- else:
- logger.info("RabbitMQ connection already closed")
- except Exception as e:
- logger.error(f"Error closing RabbitMQ connection: {e}")
- self.connection = None
+ logger.info("Closing channels and connection...")
+ await self._cleanup_connection()

  def shutdown(self) -> None:
  """Signal for shutdown"""
@@ -436,7 +468,21 @@ class AioPikaMicroserviceConsumer(MessageBusConsumer):

  async def close(self) -> None:
  """Implement MessageBusConsumer.close for cleanup"""
+ logger.info("Closing consumer...")
  self.shutdown()
+
+ # Cancel health monitoring
+ if self.health_check_task:
+ self.health_check_task.cancel()
+ with suppress(asyncio.CancelledError):
+ await self.health_check_task
+
+ # Cancel reconnection task if running
+ if self.reconnection_task:
+ self.reconnection_task.cancel()
+ with suppress(asyncio.CancelledError):
+ await self.reconnection_task
+
  await self.wait_all_tasks_done()
  await self.close_channels_and_connection()

@@ -445,6 +491,16 @@ class AioPikaMicroserviceConsumer(MessageBusConsumer):
  Get the channel for a specific queue, or None if not found.
  This helps with error handling when a channel might have been closed.
  """
+ # If reconnection is in progress, wait for it to complete
+ if self.reconnection_in_progress:
+ try:
+ await asyncio.wait_for(self.reconnection_event.wait(), timeout=30.0)
+ except asyncio.TimeoutError:
+ logger.warning(
+ f"Timeout waiting for reconnection when getting channel for {queue_name}"
+ )
+ return None
+
  if queue_name not in self.channels:
  logger.warning(f"No channel found for queue {queue_name}")
  return None
@@ -453,18 +509,38 @@ class AioPikaMicroserviceConsumer(MessageBusConsumer):
  channel = self.channels[queue_name]
  if channel.is_closed:
  logger.warning(f"Channel for queue {queue_name} is closed")
- # Attempt to recreate the channel if needed
- if self.connection and not self.connection.is_closed:
- logger.info(f"Creating new channel for {queue_name}")
- self.channels[queue_name] = await self.connection.channel()
- await self.channels[queue_name].set_qos(
- prefetch_count=self.config.prefetch_count
- )
- return self.channels[queue_name]
- return None
+ # Remove the closed channel
+ del self.channels[queue_name]
+
+ # Attempt to recreate the channel if connection is healthy
+ if (
+ self.connection
+ and not self.connection.is_closed
+ and self.connection_healthy
+ ):
+ try:
+ logger.info(f"Creating new channel for {queue_name}")
+ self.channels[queue_name] = await self.connection.channel()
+ await self.channels[queue_name].set_qos(
+ prefetch_count=self.config.prefetch_count
+ )
+ return self.channels[queue_name]
+ except Exception as e:
+ logger.error(
+ f"Failed to recreate channel for {queue_name}: {e}"
+ )
+ # Trigger reconnection if channel creation fails
+ self._trigger_reconnection()
+ return None
+ else:
+ # Connection is not healthy, trigger reconnection
+ self._trigger_reconnection()
+ return None
  return channel
  except Exception as e:
  logger.error(f"Error accessing channel for queue {queue_name}: {e}")
+ # Trigger reconnection on any channel access error
+ self._trigger_reconnection()
  return None

  async def _establish_channel(self, queue_name: str) -> aio_pika.abc.AbstractChannel:
@@ -497,8 +573,8 @@ class AioPikaMicroserviceConsumer(MessageBusConsumer):
  fn=lambda: self._establish_channel(queue_name),
  retry_config=self.config.consumer_retry_config,
  retry_exceptions=(
- aio_pika.exceptions.AMQPConnectionError,
- aio_pika.exceptions.AMQPChannelError,
+ AMQPConnectionError,
+ AMQPChannelError,
  ConnectionError,
  ),
  )
@@ -525,7 +601,10 @@ class AioPikaMicroserviceConsumer(MessageBusConsumer):
  """
  try:
  logger.info("Establishing connection to RabbitMQ")
- connection = await aio_pika.connect(self.config.url)
+ connection = await aio_pika.connect(
+ self.config.url,
+ heartbeat=self.config.connection_heartbeat_interval,
+ )
  logger.info("Connected to RabbitMQ successfully")
  return connection
  except Exception as e:
@@ -552,7 +631,7 @@ class AioPikaMicroserviceConsumer(MessageBusConsumer):
  self._establish_connection,
  retry_config=self.config.connection_retry_config,
  retry_exceptions=(
- aio_pika.exceptions.AMQPConnectionError,
+ AMQPConnectionError,
  ConnectionError,
  OSError,
  TimeoutError,
@@ -586,22 +665,254 @@ class AioPikaMicroserviceConsumer(MessageBusConsumer):
  Get a channel for a specific queue as a context manager.
  This is safer than using get_channel directly as it ensures proper error handling.
  """
- channel = await self.get_channel(queue_name)
- if channel is None:
- if self.connection and not self.connection.is_closed:
- # Try to create a new channel
- async with self.create_channel(queue_name) as new_channel:
- yield new_channel
- else:
- raise RuntimeError(
- f"Cannot get channel for queue {queue_name}: no connection available"
- )
+ max_retries = 3
+ retry_delay = 1.0
+
+ for attempt in range(max_retries):
+ try:
+ channel = await self.get_channel(queue_name)
+ if channel is not None:
+ try:
+ yield channel
+ return
+ finally:
+ # We don't close the channel here as it's managed by the consumer
+ pass
+
+ # No channel available, check connection state
+ if (
+ self.connection
+ and not self.connection.is_closed
+ and self.connection_healthy
+ ):
+ # Try to create a new channel
+ async with self.create_channel(queue_name) as new_channel:
+ yield new_channel
+ return
+ else:
+ # Connection is not healthy, wait for reconnection
+ if self.reconnection_in_progress:
+ try:
+ await asyncio.wait_for(
+ self.reconnection_event.wait(), timeout=30.0
+ )
+ # Retry after reconnection
+ continue
+ except asyncio.TimeoutError:
+ logger.warning(
+ f"Timeout waiting for reconnection for queue {queue_name}"
+ )
+
+ # Still no connection, trigger reconnection
+ if not self.reconnection_in_progress:
+ self._trigger_reconnection()
+
+ if attempt < max_retries - 1:
+ logger.info(
+ f"Retrying channel access for {queue_name} in {retry_delay}s"
+ )
+ await asyncio.sleep(retry_delay)
+ retry_delay *= 2
+ else:
+ raise RuntimeError(
+ f"Cannot get channel for queue {queue_name}: no connection available after {max_retries} attempts"
+ )
+
+ except Exception as e:
+ if attempt < max_retries - 1:
+ logger.warning(
+ f"Error getting channel for {queue_name}, retrying: {e}"
+ )
+ await asyncio.sleep(retry_delay)
+ retry_delay *= 2
+ else:
+ logger.error(
+ f"Failed to get channel for {queue_name} after {max_retries} attempts: {e}"
+ )
+ raise
+
+ async def _monitor_connection_health(self) -> None:
+ """
+ Monitor connection health and trigger reconnection if needed.
+ This runs as a background task.
+ """
+ while not self.shutdown_event.is_set():
+ try:
+ await asyncio.sleep(self.config.connection_health_check_interval)
+
+ if self.shutdown_event.is_set():
+ break
+
+ # Check connection health
+ if not await self._is_connection_healthy():
+ logger.warning(
+ "Connection health check failed, triggering reconnection"
+ )
+ if not self.reconnection_in_progress:
+ self._trigger_reconnection()
+
+ except asyncio.CancelledError:
+ logger.info("Connection health monitoring cancelled")
+ break
+ except Exception as e:
+ logger.error(f"Error in connection health monitoring: {e}")
+ await asyncio.sleep(5) # Wait before retrying
+
+ async def _is_connection_healthy(self) -> bool:
+ """
+ Check if the connection is healthy.
+ """
+ try:
+ if self.connection is None or self.connection.is_closed:
+ return False
+
+ # Try to create a temporary channel to test connection
+ async with self.connection.channel() as test_channel:
+ # If we can create a channel, connection is healthy
+ return True
+
+ except Exception as e:
+ logger.debug(f"Connection health check failed: {e}")
+ return False
+
+ def _trigger_reconnection(self) -> None:
+ """
+ Trigger reconnection process.
+ """
+ if not self.reconnection_in_progress and not self.shutdown_event.is_set():
+ self.reconnection_in_progress = True
+ self.connection_healthy = False
+ self.reconnection_event.clear()
+
+ # Start reconnection task
+ self.reconnection_task = asyncio.create_task(self._handle_reconnection())
+ self.reconnection_task.add_done_callback(self._on_reconnection_done)
+
+ def _on_reconnection_done(self, task: asyncio.Task[Any]) -> None:
+ """
+ Handle completion of reconnection task.
+ """
+ self.reconnection_in_progress = False
+ if task.exception():
+ logger.error(f"Reconnection task failed: {task.exception()}")
  else:
+ logger.info("Reconnection completed successfully")
+
+ async def _handle_reconnection(self) -> None:
+ """
+ Handle the reconnection process with exponential backoff.
+ """
+ logger.info("Starting reconnection process")
+
+ # Close existing connection and channels
+ await self._cleanup_connection()
+
+ reconnection_config = self.config.reconnection_backoff_config
+ attempt = 0
+
+ while not self.shutdown_event.is_set():
  try:
- yield channel
- finally:
- # We don't close the channel here as it's managed by the consumer
- pass
+ attempt += 1
+ logger.info(f"Reconnection attempt {attempt}")
+
+ # Establish new connection
+ self.connection = await self._establish_connection()
+ self.connection_healthy = True
+
+ # Re-establish all consumers
+ await self._reestablish_consumers()
+
+ logger.info("Reconnection successful")
+ self.reconnection_event.set()
+ return
+
+ except Exception as e:
+ logger.error(f"Reconnection attempt {attempt} failed: {e}")
+
+ if self.shutdown_event.is_set():
+ break
+
+ # Calculate backoff delay
+ delay = reconnection_config.initial_delay * (
+ reconnection_config.backoff_factor ** (attempt - 1)
+ )
+ if reconnection_config.jitter:
+ jitter_amount = delay * 0.25
+ delay = delay + random.uniform(-jitter_amount, jitter_amount)
+ delay = max(delay, 0.1)
+
+ delay = min(delay, reconnection_config.max_delay)
+
+ logger.info(f"Retrying reconnection in {delay:.2f} seconds")
+ await asyncio.sleep(delay)
+
+ async def _cleanup_connection(self) -> None:
+ """
+ Clean up existing connection and channels.
+ """
+ # Cancel existing consumers
+ for queue_name, channel in self.channels.items():
+ try:
+ if not channel.is_closed:
+ # Cancel consumer if we have its tag
+ if queue_name in self.consumer_tags:
+ try:
+ queue = await channel.get_queue(queue_name, ensure=False)
+ if queue:
+ await queue.cancel(self.consumer_tags[queue_name])
+ except Exception as cancel_error:
+ logger.warning(
+ f"Error cancelling consumer for {queue_name}: {cancel_error}"
+ )
+ del self.consumer_tags[queue_name]
+ except Exception as e:
+ logger.warning(f"Error cancelling consumer for {queue_name}: {e}")
+
+ # Close channels
+ for queue_name, channel in self.channels.items():
+ try:
+ if not channel.is_closed:
+ await channel.close()
+ except Exception as e:
+ logger.warning(f"Error closing channel for {queue_name}: {e}")
+
+ self.channels.clear()
+
+ # Close connection
+ if self.connection and not self.connection.is_closed:
+ try:
+ await self.connection.close()
+ except Exception as e:
+ logger.warning(f"Error closing connection: {e}")
+
+ self.connection = None
+ self.connection_healthy = False
+
+ async def _reestablish_consumers(self) -> None:
+ """
+ Re-establish all consumers after reconnection.
+ """
+ logger.info("Re-establishing consumers after reconnection")
+
+ # Re-establish message handlers
+ for handler in self.message_handler_set:
+ queue_name = f"{handler.message_type.MESSAGE_TOPIC}.{handler.instance_callable.__module__}.{handler.instance_callable.__qualname__}"
+ try:
+ await self._setup_message_handler_consumer(handler)
+ logger.info(f"Re-established consumer for {queue_name}")
+ except Exception as e:
+ logger.error(f"Failed to re-establish consumer for {queue_name}: {e}")
+
+ # Re-establish scheduled actions
+ for scheduled_action in self.scheduled_actions:
+ queue_name = f"{scheduled_action.callable.__module__}.{scheduled_action.callable.__qualname__}"
+ try:
+ await self._setup_scheduled_action_consumer(scheduled_action)
+ logger.info(f"Re-established scheduler consumer for {queue_name}")
+ except Exception as e:
+ logger.error(
+ f"Failed to re-establish scheduler consumer for {queue_name}: {e}"
+ )


  def create_message_bus(
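
For orientation, the backoff loop in _handle_reconnection above produces the schedule below; this is just a restatement of the arithmetic shown in the diff, with jitter (±25%, floored at 0.1 s) left out:

# Sketch of the reconnection backoff computed in _handle_reconnection,
# using the default reconnection_backoff_config from this release.
def reconnection_delay(attempt: int, initial_delay: float = 2.0,
                       backoff_factor: float = 2.0, max_delay: float = 120.0) -> float:
    # delay = initial_delay * backoff_factor ** (attempt - 1), capped at max_delay
    return min(initial_delay * (backoff_factor ** (attempt - 1)), max_delay)

# With the defaults, attempts 1..8 wait roughly 2, 4, 8, 16, 32, 64, 120, 120 seconds.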
@@ -646,6 +957,19 @@ def create_message_bus(
  max_retries=30, initial_delay=5, max_delay=60.0, backoff_factor=3.0
  )

+ # Parse optional reconnection configuration parameters
+ reconnection_backoff_config = RetryConfig(
+ max_retries=-1, # Infinite retries for reconnection
+ initial_delay=2.0,
+ max_delay=120.0,
+ backoff_factor=2.0,
+ jitter=True,
+ )
+
+ # Parse heartbeat and health check intervals
+ connection_heartbeat_interval = 30.0
+ connection_health_check_interval = 10.0
+
  # Connection retry config parameters
  if (
  "connection_retry_max" in query_params
@@ -712,12 +1036,65 @@ def create_message_bus(
  except ValueError:
  pass

+ # Reconnection backoff config parameters
+ if (
+ "reconnection_retry_max" in query_params
+ and query_params["reconnection_retry_max"][0].isdigit()
+ ):
+ reconnection_backoff_config.max_retries = int(
+ query_params["reconnection_retry_max"][0]
+ )
+
+ if "reconnection_retry_delay" in query_params:
+ try:
+ reconnection_backoff_config.initial_delay = float(
+ query_params["reconnection_retry_delay"][0]
+ )
+ except ValueError:
+ pass
+
+ if "reconnection_retry_max_delay" in query_params:
+ try:
+ reconnection_backoff_config.max_delay = float(
+ query_params["reconnection_retry_max_delay"][0]
+ )
+ except ValueError:
+ pass
+
+ if "reconnection_retry_backoff" in query_params:
+ try:
+ reconnection_backoff_config.backoff_factor = float(
+ query_params["reconnection_retry_backoff"][0]
+ )
+ except ValueError:
+ pass
+
+ # Heartbeat and health check intervals
+ if "connection_heartbeat_interval" in query_params:
+ try:
+ connection_heartbeat_interval = float(
+ query_params["connection_heartbeat_interval"][0]
+ )
+ except ValueError:
+ pass
+
+ if "connection_health_check_interval" in query_params:
+ try:
+ connection_health_check_interval = float(
+ query_params["connection_health_check_interval"][0]
+ )
+ except ValueError:
+ pass
+
  config = AioPikaWorkerConfig(
  url=broker_url,
  exchange=exchange,
  prefetch_count=prefetch_count,
  connection_retry_config=connection_retry_config,
  consumer_retry_config=consumer_retry_config,
+ connection_heartbeat_interval=connection_heartbeat_interval,
+ connection_health_check_interval=connection_health_check_interval,
+ reconnection_backoff_config=reconnection_backoff_config,
  )

  return AioPikaMicroserviceConsumer(
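
Given the parsing added above, these settings appear to be tunable from the broker URL's query string. A sketch; the amqp scheme and any pre-existing parameters such as exchange are assumptions based on context, only the reconnection and interval parameter names are taken from the code above:

# Sketch only: query parameter names come from the new parsing in create_message_bus.
broker_url = (
    "amqp://guest:guest@localhost/"
    "?exchange=jararaca_ex"
    "&reconnection_retry_max=10"
    "&reconnection_retry_delay=2.0"
    "&reconnection_retry_max_delay=120.0"
    "&reconnection_retry_backoff=2.0"
    "&connection_heartbeat_interval=30.0"
    "&connection_health_check_interval=10.0"
)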
@@ -768,6 +1145,25 @@ class ScheduledMessageHandlerCallback:
  )
  return

+ # Check if connection is healthy before processing
+ if not self.consumer.connection_healthy:
+ logger.warning(
+ f"Connection not healthy, requeuing scheduled message for {self.queue_name}"
+ )
+ try:
+ # Wait briefly for potential reconnection
+ await asyncio.sleep(0.1)
+ if not self.consumer.connection_healthy:
+ # Still not healthy, requeue the message
+ async with self.consumer.get_channel_ctx(self.queue_name):
+ await aio_pika_message.reject(requeue=True)
+ return
+ except Exception as e:
+ logger.error(
+ f"Failed to requeue scheduled message due to connection issues: {e}"
+ )
+ return
+
  async with self.consumer.lock:
  task = asyncio.create_task(self.handle_message(aio_pika_message))
  self.consumer.tasks.add(task)
@@ -803,6 +1199,21 @@ class ScheduledMessageHandlerCallback:
  logger.error(f"Failed to requeue message during shutdown: {e}")
  return

+ # Check connection health before processing
+ if not self.consumer.connection_healthy:
+ logger.warning(
+ f"Connection not healthy, requeuing scheduled message for {self.queue_name}"
+ )
+ try:
+ async with self.consumer.get_channel_ctx(self.queue_name):
+ await aio_pika_message.reject(requeue=True)
+ return
+ except Exception as e:
+ logger.error(
+ f"Failed to requeue scheduled message due to connection issues: {e}"
+ )
+ return
+
  sig = inspect.signature(self.scheduled_action.callable)
  if len(sig.parameters) == 1:

@@ -895,6 +1306,23 @@ class MessageHandlerCallback:
  logger.error(f"Failed to requeue message during shutdown: {e}")
  return

+ # Check if connection is healthy before processing
+ if not self.consumer.connection_healthy:
+ logger.warning(
+ f"Connection not healthy, requeuing message for {self.queue_name}"
+ )
+ try:
+ # Wait briefly for potential reconnection
+ await asyncio.sleep(0.1)
+ if not self.consumer.connection_healthy:
+ # Still not healthy, requeue the message
+ async with self.consumer.get_channel_ctx(self.queue_name):
+ await aio_pika_message.reject(requeue=True)
+ return
+ except Exception as e:
+ logger.error(f"Failed to requeue message due to connection issues: {e}")
+ return
+
  async with self.consumer.lock:
  task = asyncio.create_task(self.handle_message(aio_pika_message))
  self.consumer.tasks.add(task)
@@ -959,8 +1387,11 @@ class MessageHandlerCallback:
  f"dead-lettering: {str(exception)}"
  )
  # Dead-letter the message after max retries
- async with self.consumer.get_channel_ctx(self.queue_name):
- await aio_pika_message.reject(requeue=False)
+ try:
+ async with self.consumer.get_channel_ctx(self.queue_name):
+ await aio_pika_message.reject(requeue=False)
+ except Exception as e:
+ logger.error(f"Failed to dead-letter message {message_id}: {e}")
  return

  # Calculate delay for this retry attempt
@@ -996,29 +1427,33 @@ class MessageHandlerCallback:
  )

  # Acknowledge the current message since we'll handle retry ourselves
- async with self.consumer.get_channel_ctx(self.queue_name):
- await aio_pika_message.ack()
+ try:
+ async with self.consumer.get_channel_ctx(self.queue_name):
+ await aio_pika_message.ack()
+ except Exception as e:
+ logger.error(
+ f"Failed to acknowledge message {message_id} for retry: {e}"
+ )
  return

  # Standard reject without retry or with immediate requeue
- async with self.consumer.get_channel_ctx(self.queue_name):
- await aio_pika_message.reject(requeue=requeue)
- if requeue:
- logger.info(
- f"Message {message_id} ({self.queue_name}) requeued for immediate retry"
- )
- else:
- logger.info(
- f"Message {message_id} ({self.queue_name}) rejected without requeue"
- )
+ try:
+ async with self.consumer.get_channel_ctx(self.queue_name):
+ await aio_pika_message.reject(requeue=requeue)
+ if requeue:
+ logger.info(
+ f"Message {message_id} ({self.queue_name}) requeued for immediate retry"
+ )
+ else:
+ logger.info(
+ f"Message {message_id} ({self.queue_name}) rejected without requeue"
+ )
+ except Exception as e:
+ logger.error(f"Failed to reject message {message_id}: {e}")

- except RuntimeError as e:
- logger.error(
- f"Error rejecting message {message_id} ({self.queue_name}): {e}"
- )
  except Exception as e:
  logger.exception(
- f"Unexpected error rejecting message {message_id} ({self.queue_name}): {e}"
+ f"Unexpected error in handle_reject_message for {message_id} ({self.queue_name}): {e}"
  )

  async def _delayed_retry(
@@ -1033,7 +1468,7 @@ class MessageHandlerCallback:

  Args:
  aio_pika_message: The original message
- delay: Delay in seconds before retry
+ delay: Delay in seconds before retrying
  retry_count: The current retry count (after increment)
  exception: The exception that caused the failure
  """
@@ -1058,28 +1493,46 @@ class MessageHandlerCallback:
  if message_id in self.retry_state:
  del self.retry_state[message_id]

- # Republish the message to the same queue
- async with self.consumer.get_channel_ctx(self.queue_name) as channel:
- exchange = await RabbitmqUtils.get_main_exchange(
- channel=channel,
- exchange_name=self.consumer.config.exchange,
- )
+ # Republish the message to the same queue with retry logic
+ max_attempts = 3
+ for attempt in range(max_attempts):
+ try:
+ async with self.consumer.get_channel_ctx(
+ self.queue_name
+ ) as channel:
+ exchange = await RabbitmqUtils.get_main_exchange(
+ channel=channel,
+ exchange_name=self.consumer.config.exchange,
+ )

- await exchange.publish(
- aio_pika.Message(
- body=message_body,
- headers=headers,
- message_id=message_id,
- content_type=aio_pika_message.content_type,
- content_encoding=aio_pika_message.content_encoding,
- delivery_mode=aio_pika_message.delivery_mode,
- ),
- routing_key=self.routing_key,
- )
+ await exchange.publish(
+ aio_pika.Message(
+ body=message_body,
+ headers=headers,
+ message_id=message_id,
+ content_type=aio_pika_message.content_type,
+ content_encoding=aio_pika_message.content_encoding,
+ delivery_mode=aio_pika_message.delivery_mode,
+ ),
+ routing_key=self.routing_key,
+ )

- logger.info(
- f"Message {message_id} ({self.queue_name}) republished for retry {retry_count}"
- )
+ logger.info(
+ f"Message {message_id} ({self.queue_name}) republished for retry {retry_count}"
+ )
+ return
+
+ except Exception as e:
+ if attempt < max_attempts - 1:
+ logger.warning(
+ f"Failed to republish message {message_id} (attempt {attempt + 1}): {e}"
+ )
+ await asyncio.sleep(1.0 * (attempt + 1)) # Exponential backoff
+ else:
+ logger.error(
+ f"Failed to republish message {message_id} after {max_attempts} attempts: {e}"
+ )
+ raise

  except Exception as e:
  logger.exception(
@@ -1171,11 +1624,17 @@ class MessageHandlerCallback:
  await handler(builded_message)
  if not incoming_message_spec.auto_ack:
  with suppress(aio_pika.MessageProcessError):
- # Use channel context for acknowledgement
- async with self.consumer.get_channel_ctx(
- self.queue_name
- ):
- await aio_pika_message.ack()
+ # Use channel context for acknowledgement with retry
+ try:
+ async with self.consumer.get_channel_ctx(
+ self.queue_name
+ ):
+ await aio_pika_message.ack()
+ except Exception as ack_error:
+ logger.warning(
+ f"Failed to acknowledge message {aio_pika_message.message_id or 'unknown'}: {ack_error}"
+ )
+ # Message will be redelivered if ack fails, which is acceptable
  except BaseException as base_exc:
  # Get message id for logging
  message_id = aio_pika_message.message_id or str(uuid.uuid4())