omnibase_infra 0.2.8__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. omnibase_infra/__init__.py +1 -1
  2. omnibase_infra/enums/__init__.py +4 -0
  3. omnibase_infra/enums/enum_declarative_node_violation.py +102 -0
  4. omnibase_infra/errors/__init__.py +18 -0
  5. omnibase_infra/errors/repository/__init__.py +78 -0
  6. omnibase_infra/errors/repository/errors_repository.py +424 -0
  7. omnibase_infra/event_bus/adapters/__init__.py +31 -0
  8. omnibase_infra/event_bus/adapters/adapter_protocol_event_publisher_kafka.py +517 -0
  9. omnibase_infra/mixins/mixin_async_circuit_breaker.py +113 -1
  10. omnibase_infra/models/__init__.py +9 -0
  11. omnibase_infra/models/event_bus/__init__.py +22 -0
  12. omnibase_infra/models/event_bus/model_consumer_retry_config.py +367 -0
  13. omnibase_infra/models/event_bus/model_dlq_config.py +177 -0
  14. omnibase_infra/models/event_bus/model_idempotency_config.py +131 -0
  15. omnibase_infra/models/event_bus/model_offset_policy_config.py +107 -0
  16. omnibase_infra/models/resilience/model_circuit_breaker_config.py +15 -0
  17. omnibase_infra/models/validation/__init__.py +8 -0
  18. omnibase_infra/models/validation/model_declarative_node_validation_result.py +139 -0
  19. omnibase_infra/models/validation/model_declarative_node_violation.py +169 -0
  20. omnibase_infra/nodes/architecture_validator/__init__.py +28 -7
  21. omnibase_infra/nodes/architecture_validator/constants.py +36 -0
  22. omnibase_infra/nodes/architecture_validator/handlers/__init__.py +28 -0
  23. omnibase_infra/nodes/architecture_validator/handlers/contract.yaml +120 -0
  24. omnibase_infra/nodes/architecture_validator/handlers/handler_architecture_validation.py +359 -0
  25. omnibase_infra/nodes/architecture_validator/node.py +1 -0
  26. omnibase_infra/nodes/architecture_validator/node_architecture_validator.py +48 -336
  27. omnibase_infra/nodes/contract_registry_reducer/reducer.py +12 -2
  28. omnibase_infra/nodes/node_ledger_projection_compute/__init__.py +16 -2
  29. omnibase_infra/nodes/node_ledger_projection_compute/contract.yaml +14 -4
  30. omnibase_infra/nodes/node_ledger_projection_compute/handlers/__init__.py +18 -0
  31. omnibase_infra/nodes/node_ledger_projection_compute/handlers/contract.yaml +53 -0
  32. omnibase_infra/nodes/node_ledger_projection_compute/handlers/handler_ledger_projection.py +354 -0
  33. omnibase_infra/nodes/node_ledger_projection_compute/node.py +20 -256
  34. omnibase_infra/nodes/node_registry_effect/node.py +20 -73
  35. omnibase_infra/protocols/protocol_dispatch_engine.py +90 -0
  36. omnibase_infra/runtime/__init__.py +11 -0
  37. omnibase_infra/runtime/baseline_subscriptions.py +150 -0
  38. omnibase_infra/runtime/db/__init__.py +73 -0
  39. omnibase_infra/runtime/db/models/__init__.py +41 -0
  40. omnibase_infra/runtime/db/models/model_repository_runtime_config.py +211 -0
  41. omnibase_infra/runtime/db/postgres_repository_runtime.py +545 -0
  42. omnibase_infra/runtime/event_bus_subcontract_wiring.py +455 -24
  43. omnibase_infra/runtime/kafka_contract_source.py +13 -5
  44. omnibase_infra/runtime/service_message_dispatch_engine.py +112 -0
  45. omnibase_infra/runtime/service_runtime_host_process.py +6 -11
  46. omnibase_infra/services/__init__.py +36 -0
  47. omnibase_infra/services/contract_publisher/__init__.py +95 -0
  48. omnibase_infra/services/contract_publisher/config.py +199 -0
  49. omnibase_infra/services/contract_publisher/errors.py +243 -0
  50. omnibase_infra/services/contract_publisher/models/__init__.py +28 -0
  51. omnibase_infra/services/contract_publisher/models/model_contract_error.py +67 -0
  52. omnibase_infra/services/contract_publisher/models/model_infra_error.py +62 -0
  53. omnibase_infra/services/contract_publisher/models/model_publish_result.py +112 -0
  54. omnibase_infra/services/contract_publisher/models/model_publish_stats.py +79 -0
  55. omnibase_infra/services/contract_publisher/service.py +617 -0
  56. omnibase_infra/services/contract_publisher/sources/__init__.py +52 -0
  57. omnibase_infra/services/contract_publisher/sources/model_discovered.py +155 -0
  58. omnibase_infra/services/contract_publisher/sources/protocol.py +101 -0
  59. omnibase_infra/services/contract_publisher/sources/source_composite.py +309 -0
  60. omnibase_infra/services/contract_publisher/sources/source_filesystem.py +174 -0
  61. omnibase_infra/services/contract_publisher/sources/source_package.py +221 -0
  62. omnibase_infra/services/observability/__init__.py +40 -0
  63. omnibase_infra/services/observability/agent_actions/__init__.py +64 -0
  64. omnibase_infra/services/observability/agent_actions/config.py +209 -0
  65. omnibase_infra/services/observability/agent_actions/consumer.py +1320 -0
  66. omnibase_infra/services/observability/agent_actions/models/__init__.py +87 -0
  67. omnibase_infra/services/observability/agent_actions/models/model_agent_action.py +142 -0
  68. omnibase_infra/services/observability/agent_actions/models/model_detection_failure.py +125 -0
  69. omnibase_infra/services/observability/agent_actions/models/model_envelope.py +85 -0
  70. omnibase_infra/services/observability/agent_actions/models/model_execution_log.py +159 -0
  71. omnibase_infra/services/observability/agent_actions/models/model_performance_metric.py +130 -0
  72. omnibase_infra/services/observability/agent_actions/models/model_routing_decision.py +138 -0
  73. omnibase_infra/services/observability/agent_actions/models/model_transformation_event.py +124 -0
  74. omnibase_infra/services/observability/agent_actions/tests/__init__.py +20 -0
  75. omnibase_infra/services/observability/agent_actions/tests/test_consumer.py +1154 -0
  76. omnibase_infra/services/observability/agent_actions/tests/test_models.py +645 -0
  77. omnibase_infra/services/observability/agent_actions/tests/test_writer.py +709 -0
  78. omnibase_infra/services/observability/agent_actions/writer_postgres.py +926 -0
  79. omnibase_infra/validation/__init__.py +12 -0
  80. omnibase_infra/validation/contracts/declarative_node.validation.yaml +143 -0
  81. omnibase_infra/validation/infra_validators.py +4 -1
  82. omnibase_infra/validation/validation_exemptions.yaml +111 -0
  83. omnibase_infra/validation/validator_declarative_node.py +850 -0
  84. {omnibase_infra-0.2.8.dist-info → omnibase_infra-0.3.0.dist-info}/METADATA +2 -2
  85. {omnibase_infra-0.2.8.dist-info → omnibase_infra-0.3.0.dist-info}/RECORD +88 -30
  86. {omnibase_infra-0.2.8.dist-info → omnibase_infra-0.3.0.dist-info}/WHEEL +0 -0
  87. {omnibase_infra-0.2.8.dist-info → omnibase_infra-0.3.0.dist-info}/entry_points.txt +0 -0
  88. {omnibase_infra-0.2.8.dist-info → omnibase_infra-0.3.0.dist-info}/licenses/LICENSE +0 -0
@@ -191,6 +191,7 @@ class MixinAsyncCircuitBreaker:
191
191
  reset_timeout: float = 60.0,
192
192
  service_name: str = "unknown",
193
193
  transport_type: EnumInfraTransportType = EnumInfraTransportType.HTTP,
194
+ half_open_successes: int = 1,
194
195
  ) -> None:
195
196
  """Initialize circuit breaker state and configuration.
196
197
 
@@ -202,9 +203,11 @@ class MixinAsyncCircuitBreaker:
202
203
  reset_timeout: Seconds before automatic reset (default: 60.0)
203
204
  service_name: Service identifier for error context (e.g., "kafka.dev")
204
205
  transport_type: Transport type for error context (default: HTTP)
206
+ half_open_successes: Successful requests required to close circuit
207
+ from half-open state (default: 1)
205
208
 
206
209
  Raises:
207
- ValueError: If threshold < 1 or reset_timeout < 0
210
+ ValueError: If threshold < 1 or reset_timeout < 0 or half_open_successes < 1
208
211
 
209
212
  Example:
210
213
  ```python
@@ -215,6 +218,7 @@ class MixinAsyncCircuitBreaker:
215
218
  reset_timeout=config.circuit_breaker_reset_timeout,
216
219
  service_name=f"my-service.{config.environment}",
217
220
  transport_type=EnumInfraTransportType.HTTP,
221
+ half_open_successes=config.circuit_breaker_half_open_successes,
218
222
  )
219
223
  ```
220
224
  """
@@ -243,15 +247,30 @@ class MixinAsyncCircuitBreaker:
243
247
  parameter="reset_timeout",
244
248
  value=reset_timeout,
245
249
  )
250
+ if half_open_successes < 1:
251
+ context = ModelInfraErrorContext.with_correlation(
252
+ transport_type=transport_type,
253
+ operation="init_circuit_breaker",
254
+ target_name=service_name,
255
+ )
256
+ raise ProtocolConfigurationError(
257
+ f"Circuit breaker half_open_successes must be >= 1, got {half_open_successes}",
258
+ context=context,
259
+ parameter="half_open_successes",
260
+ value=half_open_successes,
261
+ )
246
262
 
247
263
  # State variables
248
264
  self._circuit_breaker_failures = 0
249
265
  self._circuit_breaker_open = False
250
266
  self._circuit_breaker_open_until: float = 0.0
267
+ self._circuit_breaker_half_open = False
268
+ self._circuit_breaker_half_open_success_count = 0
251
269
 
252
270
  # Configuration
253
271
  self.circuit_breaker_threshold = threshold
254
272
  self.circuit_breaker_reset_timeout = reset_timeout
273
+ self.circuit_breaker_half_open_successes = half_open_successes
255
274
  self.service_name = service_name
256
275
  self._cb_transport_type = (
257
276
  transport_type # Use private name to avoid property conflicts
@@ -265,6 +284,7 @@ class MixinAsyncCircuitBreaker:
265
284
  extra={
266
285
  "threshold": threshold,
267
286
  "reset_timeout": reset_timeout,
287
+ "half_open_successes": half_open_successes,
268
288
  "transport_type": transport_type.value,
269
289
  },
270
290
  )
@@ -298,6 +318,7 @@ class MixinAsyncCircuitBreaker:
298
318
  reset_timeout_seconds=60.0,
299
319
  service_name=f"kafka.{environment}",
300
320
  transport_type=EnumInfraTransportType.KAFKA,
321
+ half_open_successes=2,
301
322
  )
302
323
  self._init_circuit_breaker_from_config(config)
303
324
  ```
@@ -311,6 +332,7 @@ class MixinAsyncCircuitBreaker:
311
332
  reset_timeout=config.reset_timeout_seconds,
312
333
  service_name=config.service_name,
313
334
  transport_type=config.transport_type,
335
+ half_open_successes=config.half_open_successes,
314
336
  )
315
337
 
316
338
  async def _check_circuit_breaker(
@@ -388,12 +410,15 @@ class MixinAsyncCircuitBreaker:
388
410
  if current_time >= self._circuit_breaker_open_until:
389
411
  # Transition to HALF_OPEN (atomic write protected by caller's lock)
390
412
  self._circuit_breaker_open = False
413
+ self._circuit_breaker_half_open = True
414
+ self._circuit_breaker_half_open_success_count = 0
391
415
  self._circuit_breaker_failures = 0
392
416
  logger.info(
393
417
  f"Circuit breaker transitioning to half-open for {self.service_name}",
394
418
  extra={
395
419
  "service": self.service_name,
396
420
  "operation": operation,
421
+ "required_successes": self.circuit_breaker_half_open_successes,
397
422
  },
398
423
  )
399
424
  else:
@@ -484,6 +509,26 @@ class MixinAsyncCircuitBreaker:
484
509
  # Increment failure counter (atomic write protected by caller's lock)
485
510
  self._circuit_breaker_failures += 1
486
511
 
512
+ # If in half-open state, any failure immediately re-opens the circuit
513
+ if self._circuit_breaker_half_open:
514
+ self._circuit_breaker_open = True
515
+ self._circuit_breaker_half_open = False
516
+ self._circuit_breaker_half_open_success_count = 0
517
+ self._circuit_breaker_open_until = (
518
+ time.time() + self.circuit_breaker_reset_timeout
519
+ )
520
+
521
+ logger.warning(
522
+ f"Circuit breaker re-opened for {self.service_name} after failure in half-open state",
523
+ extra={
524
+ "service": self.service_name,
525
+ "operation": operation,
526
+ "reset_timeout": self.circuit_breaker_reset_timeout,
527
+ "correlation_id": str(correlation_id) if correlation_id else None,
528
+ },
529
+ )
530
+ return
531
+
487
532
  # Check if threshold reached
488
533
  if self._circuit_breaker_failures >= self.circuit_breaker_threshold:
489
534
  # Transition to OPEN state (atomic write protected by caller's lock)
@@ -564,6 +609,39 @@ class MixinAsyncCircuitBreaker:
564
609
  )
565
610
  # Still proceed but log the violation for debugging
566
611
 
612
+ # If in half-open state, track successes
613
+ if self._circuit_breaker_half_open:
614
+ self._circuit_breaker_half_open_success_count += 1
615
+
616
+ if (
617
+ self._circuit_breaker_half_open_success_count
618
+ >= self.circuit_breaker_half_open_successes
619
+ ):
620
+ # Enough successes - transition to CLOSED
621
+ logger.info(
622
+ f"Circuit breaker closed for {self.service_name} after {self._circuit_breaker_half_open_success_count} successful requests in half-open state",
623
+ extra={
624
+ "service": self.service_name,
625
+ "half_open_successes": self._circuit_breaker_half_open_success_count,
626
+ "required_successes": self.circuit_breaker_half_open_successes,
627
+ },
628
+ )
629
+ self._circuit_breaker_half_open = False
630
+ self._circuit_breaker_half_open_success_count = 0
631
+ self._circuit_breaker_failures = 0
632
+ self._circuit_breaker_open_until = 0.0
633
+ else:
634
+ # Still in half-open, waiting for more successes
635
+ logger.debug(
636
+ f"Circuit breaker half-open success {self._circuit_breaker_half_open_success_count}/{self.circuit_breaker_half_open_successes} for {self.service_name}",
637
+ extra={
638
+ "service": self.service_name,
639
+ "half_open_successes": self._circuit_breaker_half_open_success_count,
640
+ "required_successes": self.circuit_breaker_half_open_successes,
641
+ },
642
+ )
643
+ return
644
+
567
645
  # Log state transition if circuit was open or had failures
568
646
  if self._circuit_breaker_open or self._circuit_breaker_failures > 0:
569
647
  previous_state = "open" if self._circuit_breaker_open else "closed"
@@ -578,6 +656,8 @@ class MixinAsyncCircuitBreaker:
578
656
 
579
657
  # Reset state (atomic write protected by caller's lock)
580
658
  self._circuit_breaker_open = False
659
+ self._circuit_breaker_half_open = False
660
+ self._circuit_breaker_half_open_success_count = 0
581
661
  self._circuit_breaker_failures = 0
582
662
  self._circuit_breaker_open_until = 0.0
583
663
 
@@ -621,10 +701,15 @@ class MixinAsyncCircuitBreaker:
621
701
 
622
702
  # Read state variables with safe defaults for uninitialized state
623
703
  cb_open = getattr(self, "_circuit_breaker_open", False)
704
+ cb_half_open = getattr(self, "_circuit_breaker_half_open", False)
624
705
  cb_open_until = getattr(self, "_circuit_breaker_open_until", 0.0)
625
706
  cb_failures = getattr(self, "_circuit_breaker_failures", 0)
626
707
  cb_threshold = getattr(self, "circuit_breaker_threshold", 5)
627
708
  cb_reset_timeout = getattr(self, "circuit_breaker_reset_timeout", 60.0)
709
+ cb_half_open_successes = getattr(self, "circuit_breaker_half_open_successes", 1)
710
+ cb_half_open_success_count = getattr(
711
+ self, "_circuit_breaker_half_open_success_count", 0
712
+ )
628
713
 
629
714
  # Calculate state: closed, open, or half_open
630
715
  current_time = time.time()
@@ -635,6 +720,9 @@ class MixinAsyncCircuitBreaker:
635
720
  else:
636
721
  cb_state = "open"
637
722
  seconds_until_half_open = round(cb_open_until - current_time, 2)
723
+ elif cb_half_open:
724
+ cb_state = "half_open"
725
+ seconds_until_half_open = None
638
726
  else:
639
727
  cb_state = "closed"
640
728
  seconds_until_half_open = None
@@ -645,12 +733,36 @@ class MixinAsyncCircuitBreaker:
645
733
  "failures": cb_failures,
646
734
  "threshold": cb_threshold,
647
735
  "reset_timeout_seconds": cb_reset_timeout,
736
+ "half_open_successes_required": cb_half_open_successes,
648
737
  }
649
738
 
650
739
  if seconds_until_half_open is not None:
651
740
  result["seconds_until_half_open"] = seconds_until_half_open
652
741
 
742
+ if cb_state == "half_open":
743
+ result["half_open_success_count"] = cb_half_open_success_count
744
+
653
745
  return result
654
746
 
747
def get_circuit_breaker_state(self) -> dict[str, JsonType]:
    """Return current circuit breaker state for external introspection.

    This is the public API for accessing circuit breaker state. Adapters and
    other external consumers should use this method rather than accessing
    private attributes.

    The method is a thin pass-through: it returns whatever
    `_get_circuit_breaker_state()` builds, unchanged.

    Returns:
        dict containing:
        - initialized: Whether circuit breaker has been initialized
        - state: Current state ("closed", "open", or "half_open")
        - failures: Current failure count
        - threshold: Configured failure threshold
        - reset_timeout_seconds: Configured reset timeout
        - half_open_successes_required: Configured number of successful
          requests needed to close the circuit from half-open state
        - seconds_until_half_open: Seconds until half_open (only when open)
        - half_open_success_count: Successes recorded so far in the
          current half-open window (only when state is "half_open")
    """
    return self._get_circuit_breaker_state()
766
+
655
767
 
656
768
  __all__ = ["EnumCircuitState", "MixinAsyncCircuitBreaker", "ModelCircuitBreakerConfig"]
@@ -22,6 +22,11 @@ from omnibase_infra.models.dispatch import (
22
22
  ModelTopicParser,
23
23
  )
24
24
  from omnibase_infra.models.errors import ModelHandlerValidationError
25
+ from omnibase_infra.models.event_bus import (
26
+ ModelConsumerRetryConfig,
27
+ ModelIdempotencyConfig,
28
+ ModelOffsetPolicyConfig,
29
+ )
25
30
  from omnibase_infra.models.handlers import ModelHandlerIdentifier
26
31
  from omnibase_infra.models.health import ModelHealthCheckResult
27
32
  from omnibase_infra.models.logging import ModelLogContext
@@ -84,6 +89,10 @@ __all__: list[str] = [
84
89
  # Dispatch models
85
90
  "EnumDispatchStatus",
86
91
  "EnumTopicStandard",
92
+ # Event bus models
93
+ "ModelConsumerRetryConfig",
94
+ "ModelIdempotencyConfig",
95
+ "ModelOffsetPolicyConfig",
87
96
  # Resilience models
88
97
  "ModelCircuitBreakerConfig",
89
98
  # Validation models
@@ -0,0 +1,22 @@
1
+ # SPDX-FileCopyrightText: 2025 OmniNode Team <info@omninode.ai>
2
+ #
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ """Event bus models for message consumption, idempotency, and DLQ configuration."""
5
+
6
+ from omnibase_infra.models.event_bus.model_consumer_retry_config import (
7
+ ModelConsumerRetryConfig,
8
+ )
9
+ from omnibase_infra.models.event_bus.model_dlq_config import ModelDlqConfig
10
+ from omnibase_infra.models.event_bus.model_idempotency_config import (
11
+ ModelIdempotencyConfig,
12
+ )
13
+ from omnibase_infra.models.event_bus.model_offset_policy_config import (
14
+ ModelOffsetPolicyConfig,
15
+ )
16
+
17
+ __all__ = [
18
+ "ModelConsumerRetryConfig",
19
+ "ModelDlqConfig",
20
+ "ModelIdempotencyConfig",
21
+ "ModelOffsetPolicyConfig",
22
+ ]
@@ -0,0 +1,367 @@
1
+ # SPDX-License-Identifier: MIT
2
+ # Copyright (c) 2025 OmniNode Team
3
+ """Consumer-side retry configuration model.
4
+
5
+ This module provides the configuration model for consumer-side retry behavior
6
+ when message handlers fail. It distinguishes between:
7
+
8
+ - **Content errors** (non-retryable): Malformed messages, validation failures,
9
+ business logic errors. These will fail regardless of retry attempts.
10
+
11
+ - **Infrastructure errors** (retryable): Network timeouts, temporary service
12
+ unavailability, rate limiting. These may succeed on retry.
13
+
14
+ The model uses exponential backoff with optional jitter to prevent thundering
15
+ herd problems in distributed systems.
16
+
17
+ Example:
18
+ >>> config = ModelConsumerRetryConfig(
19
+ ... max_attempts=5,
20
+ ... backoff_ms=500,
21
+ ... backoff_multiplier=2.0,
22
+ ... jitter_enabled=True,
23
+ ... )
24
+ >>> config.calculate_delay_ms(attempt=3) # Returns ~2000ms + jitter
25
+
26
+ See Also:
27
+ - docs/patterns/error_recovery_patterns.md: Error recovery patterns
28
+ - docs/patterns/dispatcher_resilience.md: Dispatcher resilience patterns
29
+ """
30
+
31
+ from __future__ import annotations
32
+
33
+ import random
34
+ from typing import Literal
35
+
36
+ from pydantic import BaseModel, ConfigDict, Field, field_validator
37
+
38
+ from omnibase_core.errors import OnexError
39
+
40
+
41
class ModelConsumerRetryConfig(BaseModel):
    """Consumer-side retry configuration.

    Controls retry behavior when message handlers fail. Distinguishes between
    content errors (non-retryable) and infrastructure errors (retryable).

    The model is immutable (``frozen=True``) and rejects unknown fields
    (``extra="forbid"``).

    Attributes:
        max_attempts: Maximum attempts before giving up. Includes the
            initial attempt, so max_attempts=3 means 1 initial + 2 retries.
        backoff_ms: Base backoff delay in milliseconds. For exponential
            backoff, retry ``n`` (1-based) waits
            ``backoff_ms * backoff_multiplier ** (n - 1)``, i.e. the first
            retry waits exactly ``backoff_ms``.
        backoff_multiplier: Exponential backoff multiplier. A value of 2.0
            doubles the delay with each retry.
        jitter_enabled: When True, adds random jitter (0-25% of delay) to
            prevent thundering herd when multiple consumers retry
            simultaneously. The jittered delay never exceeds max_backoff_ms.
        backoff_strategy: Strategy for calculating delays between retries.
            "exponential" multiplies the delay each retry, "fixed" uses a
            constant delay.
        max_backoff_ms: Maximum backoff delay cap to prevent excessive waits.
            Must be >= backoff_ms.

    Example:
        ```python
        from omnibase_infra.models.event_bus import ModelConsumerRetryConfig

        # Standard configuration with exponential backoff
        config = ModelConsumerRetryConfig(
            max_attempts=3,
            backoff_ms=1000,
            backoff_multiplier=2.0,
            jitter_enabled=True,
        )

        # Conservative configuration for critical operations
        config = ModelConsumerRetryConfig.create_conservative()

        # Aggressive configuration for resilient operations
        config = ModelConsumerRetryConfig.create_aggressive()
        ```

    Configuration Guidelines:
        - Critical operations: Use lower max_attempts (2-3), higher backoff
        - Best-effort operations: Use higher max_attempts (5+), lower backoff
        - High-concurrency: Always enable jitter to prevent thundering herd
    """

    model_config = ConfigDict(
        frozen=True,
        extra="forbid",
        json_schema_extra={
            "examples": [
                {
                    "max_attempts": 3,
                    "backoff_ms": 1000,
                    "backoff_multiplier": 2.0,
                    "jitter_enabled": True,
                    "backoff_strategy": "exponential",
                    "max_backoff_ms": 30000,
                },
                {
                    "max_attempts": 5,
                    "backoff_ms": 500,
                    "backoff_multiplier": 1.5,
                    "jitter_enabled": True,
                    "backoff_strategy": "exponential",
                    "max_backoff_ms": 60000,
                },
            ]
        },
    )

    max_attempts: int = Field(
        default=3,
        ge=1,
        le=10,
        description="Maximum retry attempts before giving up (1-10). "
        "Includes initial attempt, so 3 means 1 initial + 2 retries.",
    )

    # NOTE: the description text below is emitted in the JSON schema and is
    # kept verbatim; the exponent actually used by calculate_delay_ms() is
    # (attempt - 1), so the first retry waits exactly backoff_ms.
    backoff_ms: int = Field(
        default=1000,
        ge=100,
        le=60000,
        description="Base backoff delay in milliseconds (100-60000). "
        "For exponential backoff, subsequent delays are backoff_ms * (multiplier ^ attempt).",
    )

    backoff_multiplier: float = Field(
        default=2.0,
        ge=1.0,
        le=4.0,
        description="Exponential backoff multiplier (1.0-4.0). "
        "A value of 2.0 doubles the delay with each retry.",
    )

    jitter_enabled: bool = Field(
        default=True,
        description="Add random jitter (0-25% of delay) to backoff. "
        "Prevents thundering herd when multiple consumers retry simultaneously.",
    )

    backoff_strategy: Literal["exponential", "fixed"] = Field(
        default="exponential",
        description="Backoff strategy: 'exponential' multiplies delay each retry, "
        "'fixed' uses constant delay.",
    )

    # Declared after backoff_ms on purpose: the validator below reads the
    # already-validated backoff_ms from the validation info.
    max_backoff_ms: int = Field(
        default=30000,
        ge=1000,
        le=300000,
        description="Maximum backoff delay cap in milliseconds (1000-300000). "
        "Prevents excessive waits in exponential backoff.",
    )

    @field_validator("max_backoff_ms")
    @classmethod
    def validate_max_backoff_greater_than_base(cls, v: int, info: object) -> int:
        """Ensure max_backoff_ms is at least as large as backoff_ms.

        Equality is allowed; only a cap strictly below the base delay is
        rejected.

        Args:
            v: The max_backoff_ms value to validate.
            info: Pydantic validation info containing other field values.

        Returns:
            The validated max_backoff_ms value.

        Raises:
            ValueError: If max_backoff_ms is less than backoff_ms.
        """
        # ``info`` is typed as ``object`` here, so read ``.data`` defensively.
        # If backoff_ms is missing from the data (for example because it
        # failed its own validation), fall back to its default of 1000.
        data = getattr(info, "data", None) or {}
        base_delay = data.get("backoff_ms", 1000)
        if v < base_delay:
            msg = f"max_backoff_ms ({v}) must be >= backoff_ms ({base_delay})"
            raise ValueError(msg)
        return v

    def calculate_delay_ms(
        self, attempt: int, include_jitter: bool | None = None
    ) -> int:
        """Calculate delay in milliseconds for a specific retry attempt.

        Args:
            attempt: The retry attempt number (1-based). Attempt 1 is the first
                retry after the initial failure.
            include_jitter: Whether to include jitter. If None, uses the
                jitter_enabled setting from configuration.

        Returns:
            Delay in milliseconds for the specified attempt, capped at
            max_backoff_ms. The cap also applies after jitter has been
            added, so the result never exceeds max_backoff_ms.

        Raises:
            OnexError: If attempt is less than 1 or exceeds allowed retries.
                With max_attempts=1 there are no retries, so every call
                raises.

        Example:
            >>> config = ModelConsumerRetryConfig(backoff_ms=1000, backoff_multiplier=2.0)
            >>> config.calculate_delay_ms(1)  # ~1000ms
            >>> config.calculate_delay_ms(2)  # ~2000ms (max_attempts=3 allows 2 retries)
        """
        if attempt < 1:
            msg = f"Attempt must be >= 1, got {attempt}"
            raise OnexError(msg)

        # max_attempts includes the initial attempt, so valid retries are 1 to max_attempts-1
        max_retry = self.max_attempts - 1
        if attempt > max_retry:
            msg = f"Attempt {attempt} exceeds max retries {max_retry} (max_attempts={self.max_attempts} includes initial attempt)"
            raise OnexError(msg)

        if self.backoff_strategy == "exponential":
            # Exponential: backoff_ms * (multiplier ^ (attempt - 1))
            delay = self.backoff_ms * (self.backoff_multiplier ** (attempt - 1))
        else:
            # Fixed: constant delay
            delay = float(self.backoff_ms)

        # Cap at max_backoff_ms
        cap = float(self.max_backoff_ms)
        delay = min(delay, cap)

        # Add jitter if enabled
        use_jitter = (
            include_jitter if include_jitter is not None else self.jitter_enabled
        )
        if use_jitter:
            # Add 0-25% random jitter, then re-apply the cap so that jitter
            # can never push the wait beyond the documented maximum.
            jitter_factor = random.random() * 0.25
            delay = min(delay + delay * jitter_factor, cap)

        return int(delay)

    def get_all_delays_ms(self, include_jitter: bool = False) -> list[int]:
        """Get all delay times for the complete retry sequence.

        Args:
            include_jitter: Whether to include jitter in calculations.
                Defaults to False for predictable results.

        Returns:
            List of delays in milliseconds for each retry attempt.
            Since max_attempts includes the initial attempt, there are
            max_attempts - 1 retries, and thus max_attempts - 1 delays.
            The list is empty when max_attempts is 1.

        Example:
            >>> config = ModelConsumerRetryConfig(max_attempts=3, backoff_ms=1000)
            >>> config.get_all_delays_ms()
            [1000, 2000]  # 2 retries (max_attempts=3 includes initial)
        """
        # max_attempts includes initial attempt, so we have max_attempts - 1 retries
        return [
            self.calculate_delay_ms(i, include_jitter=include_jitter)
            for i in range(1, self.max_attempts)
        ]

    def get_total_retry_time_ms(self) -> int:
        """Get total time that all retries will take.

        Returns:
            Total time in milliseconds for all retry attempts (without jitter).
            Since max_attempts includes the initial attempt, this sums delays
            for max_attempts - 1 retries.

        Example:
            >>> config = ModelConsumerRetryConfig(max_attempts=3, backoff_ms=1000)
            >>> config.get_total_retry_time_ms()
            3000  # 1000 + 2000 (2 retries for max_attempts=3)
        """
        return sum(self.get_all_delays_ms(include_jitter=False))

    @classmethod
    def create_conservative(cls) -> ModelConsumerRetryConfig:
        """Create conservative retry configuration for critical operations.

        Conservative configuration minimizes retry attempts and uses longer
        delays to reduce load on failing services.

        Returns:
            ModelConsumerRetryConfig with conservative settings.

        Example:
            >>> config = ModelConsumerRetryConfig.create_conservative()
            >>> config.max_attempts
            2
        """
        return cls(
            max_attempts=2,
            backoff_ms=2000,
            backoff_multiplier=2.0,
            jitter_enabled=True,
            backoff_strategy="exponential",
            max_backoff_ms=30000,
        )

    @classmethod
    def create_standard(cls) -> ModelConsumerRetryConfig:
        """Create standard retry configuration for typical operations.

        Standard configuration balances reliability with reasonable latency.

        Returns:
            ModelConsumerRetryConfig with standard settings.

        Example:
            >>> config = ModelConsumerRetryConfig.create_standard()
            >>> config.max_attempts
            3
        """
        return cls(
            max_attempts=3,
            backoff_ms=1000,
            backoff_multiplier=2.0,
            jitter_enabled=True,
            backoff_strategy="exponential",
            max_backoff_ms=30000,
        )

    @classmethod
    def create_aggressive(cls) -> ModelConsumerRetryConfig:
        """Create aggressive retry configuration for resilient operations.

        Aggressive configuration maximizes retry attempts with shorter delays
        for operations that need high availability.

        Returns:
            ModelConsumerRetryConfig with aggressive settings.

        Example:
            >>> config = ModelConsumerRetryConfig.create_aggressive()
            >>> config.max_attempts
            5
        """
        return cls(
            max_attempts=5,
            backoff_ms=500,
            backoff_multiplier=1.5,
            jitter_enabled=True,
            backoff_strategy="exponential",
            max_backoff_ms=15000,
        )

    @classmethod
    def create_no_retry(cls) -> ModelConsumerRetryConfig:
        """Create configuration with no retries (fail-fast).

        Use for operations where retries are not appropriate, such as
        idempotency-sensitive operations or when circuit breaker is preferred.

        With max_attempts=1 there are zero retries: get_all_delays_ms()
        returns an empty list and calculate_delay_ms() raises for any attempt.

        Returns:
            ModelConsumerRetryConfig with no retry attempts.

        Example:
            >>> config = ModelConsumerRetryConfig.create_no_retry()
            >>> config.max_attempts
            1
        """
        return cls(
            max_attempts=1,
            backoff_ms=100,
            backoff_multiplier=1.0,
            jitter_enabled=False,
            backoff_strategy="fixed",
            max_backoff_ms=1000,
        )
363
+
364
+
365
+ __all__: list[str] = [
366
+ "ModelConsumerRetryConfig",
367
+ ]