omnibase_infra 0.2.8__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. omnibase_infra/__init__.py +1 -1
  2. omnibase_infra/enums/__init__.py +4 -0
  3. omnibase_infra/enums/enum_declarative_node_violation.py +102 -0
  4. omnibase_infra/errors/__init__.py +18 -0
  5. omnibase_infra/errors/repository/__init__.py +78 -0
  6. omnibase_infra/errors/repository/errors_repository.py +424 -0
  7. omnibase_infra/event_bus/adapters/__init__.py +31 -0
  8. omnibase_infra/event_bus/adapters/adapter_protocol_event_publisher_kafka.py +517 -0
  9. omnibase_infra/mixins/mixin_async_circuit_breaker.py +113 -1
  10. omnibase_infra/models/__init__.py +9 -0
  11. omnibase_infra/models/event_bus/__init__.py +22 -0
  12. omnibase_infra/models/event_bus/model_consumer_retry_config.py +367 -0
  13. omnibase_infra/models/event_bus/model_dlq_config.py +177 -0
  14. omnibase_infra/models/event_bus/model_idempotency_config.py +131 -0
  15. omnibase_infra/models/event_bus/model_offset_policy_config.py +107 -0
  16. omnibase_infra/models/resilience/model_circuit_breaker_config.py +15 -0
  17. omnibase_infra/models/validation/__init__.py +8 -0
  18. omnibase_infra/models/validation/model_declarative_node_validation_result.py +139 -0
  19. omnibase_infra/models/validation/model_declarative_node_violation.py +169 -0
  20. omnibase_infra/nodes/architecture_validator/__init__.py +28 -7
  21. omnibase_infra/nodes/architecture_validator/constants.py +36 -0
  22. omnibase_infra/nodes/architecture_validator/handlers/__init__.py +28 -0
  23. omnibase_infra/nodes/architecture_validator/handlers/contract.yaml +120 -0
  24. omnibase_infra/nodes/architecture_validator/handlers/handler_architecture_validation.py +359 -0
  25. omnibase_infra/nodes/architecture_validator/node.py +1 -0
  26. omnibase_infra/nodes/architecture_validator/node_architecture_validator.py +48 -336
  27. omnibase_infra/nodes/contract_registry_reducer/reducer.py +12 -2
  28. omnibase_infra/nodes/node_ledger_projection_compute/__init__.py +16 -2
  29. omnibase_infra/nodes/node_ledger_projection_compute/contract.yaml +14 -4
  30. omnibase_infra/nodes/node_ledger_projection_compute/handlers/__init__.py +18 -0
  31. omnibase_infra/nodes/node_ledger_projection_compute/handlers/contract.yaml +53 -0
  32. omnibase_infra/nodes/node_ledger_projection_compute/handlers/handler_ledger_projection.py +354 -0
  33. omnibase_infra/nodes/node_ledger_projection_compute/node.py +20 -256
  34. omnibase_infra/nodes/node_registry_effect/node.py +20 -73
  35. omnibase_infra/protocols/protocol_dispatch_engine.py +90 -0
  36. omnibase_infra/runtime/__init__.py +11 -0
  37. omnibase_infra/runtime/baseline_subscriptions.py +150 -0
  38. omnibase_infra/runtime/db/__init__.py +73 -0
  39. omnibase_infra/runtime/db/models/__init__.py +41 -0
  40. omnibase_infra/runtime/db/models/model_repository_runtime_config.py +211 -0
  41. omnibase_infra/runtime/db/postgres_repository_runtime.py +545 -0
  42. omnibase_infra/runtime/event_bus_subcontract_wiring.py +455 -24
  43. omnibase_infra/runtime/kafka_contract_source.py +13 -5
  44. omnibase_infra/runtime/service_message_dispatch_engine.py +112 -0
  45. omnibase_infra/runtime/service_runtime_host_process.py +6 -11
  46. omnibase_infra/services/__init__.py +36 -0
  47. omnibase_infra/services/contract_publisher/__init__.py +95 -0
  48. omnibase_infra/services/contract_publisher/config.py +199 -0
  49. omnibase_infra/services/contract_publisher/errors.py +243 -0
  50. omnibase_infra/services/contract_publisher/models/__init__.py +28 -0
  51. omnibase_infra/services/contract_publisher/models/model_contract_error.py +67 -0
  52. omnibase_infra/services/contract_publisher/models/model_infra_error.py +62 -0
  53. omnibase_infra/services/contract_publisher/models/model_publish_result.py +112 -0
  54. omnibase_infra/services/contract_publisher/models/model_publish_stats.py +79 -0
  55. omnibase_infra/services/contract_publisher/service.py +617 -0
  56. omnibase_infra/services/contract_publisher/sources/__init__.py +52 -0
  57. omnibase_infra/services/contract_publisher/sources/model_discovered.py +155 -0
  58. omnibase_infra/services/contract_publisher/sources/protocol.py +101 -0
  59. omnibase_infra/services/contract_publisher/sources/source_composite.py +309 -0
  60. omnibase_infra/services/contract_publisher/sources/source_filesystem.py +174 -0
  61. omnibase_infra/services/contract_publisher/sources/source_package.py +221 -0
  62. omnibase_infra/services/observability/__init__.py +40 -0
  63. omnibase_infra/services/observability/agent_actions/__init__.py +64 -0
  64. omnibase_infra/services/observability/agent_actions/config.py +209 -0
  65. omnibase_infra/services/observability/agent_actions/consumer.py +1320 -0
  66. omnibase_infra/services/observability/agent_actions/models/__init__.py +87 -0
  67. omnibase_infra/services/observability/agent_actions/models/model_agent_action.py +142 -0
  68. omnibase_infra/services/observability/agent_actions/models/model_detection_failure.py +125 -0
  69. omnibase_infra/services/observability/agent_actions/models/model_envelope.py +85 -0
  70. omnibase_infra/services/observability/agent_actions/models/model_execution_log.py +159 -0
  71. omnibase_infra/services/observability/agent_actions/models/model_performance_metric.py +130 -0
  72. omnibase_infra/services/observability/agent_actions/models/model_routing_decision.py +138 -0
  73. omnibase_infra/services/observability/agent_actions/models/model_transformation_event.py +124 -0
  74. omnibase_infra/services/observability/agent_actions/tests/__init__.py +20 -0
  75. omnibase_infra/services/observability/agent_actions/tests/test_consumer.py +1154 -0
  76. omnibase_infra/services/observability/agent_actions/tests/test_models.py +645 -0
  77. omnibase_infra/services/observability/agent_actions/tests/test_writer.py +709 -0
  78. omnibase_infra/services/observability/agent_actions/writer_postgres.py +926 -0
  79. omnibase_infra/validation/__init__.py +12 -0
  80. omnibase_infra/validation/contracts/declarative_node.validation.yaml +143 -0
  81. omnibase_infra/validation/infra_validators.py +4 -1
  82. omnibase_infra/validation/validation_exemptions.yaml +111 -0
  83. omnibase_infra/validation/validator_declarative_node.py +850 -0
  84. {omnibase_infra-0.2.8.dist-info → omnibase_infra-0.3.0.dist-info}/METADATA +2 -2
  85. {omnibase_infra-0.2.8.dist-info → omnibase_infra-0.3.0.dist-info}/RECORD +88 -30
  86. {omnibase_infra-0.2.8.dist-info → omnibase_infra-0.3.0.dist-info}/WHEEL +0 -0
  87. {omnibase_infra-0.2.8.dist-info → omnibase_infra-0.3.0.dist-info}/entry_points.txt +0 -0
  88. {omnibase_infra-0.2.8.dist-info → omnibase_infra-0.3.0.dist-info}/licenses/LICENSE +0 -0
@@ -13,11 +13,42 @@ Architecture:
     3. Creating Kafka subscriptions with appropriate consumer groups
     4. Bridging received messages to the MessageDispatchEngine
     5. Managing subscription lifecycle (creation and cleanup)
+    6. Classifying errors as content vs infrastructure for proper handling
 
 This follows the ARCH-002 principle: "Runtime owns all Kafka plumbing."
 Nodes and handlers declare their topic requirements in contracts, but
 never directly interact with Kafka consumers or producers.
 
+Error Classification:
+    The wiring distinguishes between two error categories:
+
+    Content Errors (non-retryable):
+        Schema validation failures, malformed payloads, missing required fields,
+        type conversion errors. These will NOT fix themselves with retry.
+        Default behavior: Send to DLQ and commit offset (dlq_and_commit).
+        Identified by: ProtocolConfigurationError, json.JSONDecodeError,
+        pydantic.ValidationError
+
+    Infrastructure Errors (potentially retryable):
+        Database timeouts, network failures, service unavailability.
+        These errors MAY fix themselves after retry.
+        Default behavior: Fail fast (fail_fast) to avoid hiding infrastructure
+        fires in the DLQ.
+        Identified by: RuntimeHostError and subclasses (InfraConnectionError,
+        InfraTimeoutError, InfraUnavailableError, etc.)
+
+DLQ Consumer Group Alignment:
+    IMPORTANT: The consumer_group used for DLQ publishing MUST match the
+    consumer_group used when subscribing to topics. This is critical for:
+    - Traceability: DLQ messages can be correlated back to their source consumer
+    - Replay operations: DLQ replay tools can identify which consumer group failed
+    - Debugging: Operations teams can trace failures to specific consumer groups
+
+    The wiring ensures this alignment by:
+    1. Computing consumer_group as "{environment}.{node_name}" in wire_subscriptions
+    2. Passing this same consumer_group to _create_dispatch_callback
+    3. Using it in all _publish_to_dlq calls within the callback closure
+
 Topic Resolution:
     Topic suffixes from contracts follow the ONEX naming convention:
     onex.{kind}.{producer}.{event-name}.v{n}
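
Read as a predicate, the classification above is small enough to sketch. The following is an illustrative restatement only, assuming just the exception types named in the docstring; the wiring itself classifies inline in its dispatch callback's except clauses rather than through a helper like this:

```python
# Illustrative sketch, not code from the package: restates the documented
# content-vs-infrastructure classification rules.
import json

from pydantic import ValidationError

from omnibase_infra.errors import ProtocolConfigurationError, RuntimeHostError


def classify_error(error: Exception) -> str:
    """Return 'content' for non-retryable errors, 'infra' for everything else."""
    # Content errors will never succeed on retry: DLQ and commit by default
    if isinstance(
        error, (ProtocolConfigurationError, ValidationError, json.JSONDecodeError)
    ):
        return "content"
    # RuntimeHostError and subclasses, plus anything unexpected, are treated as
    # infrastructure: fail fast by default so outages are not hidden in the DLQ
    return "infra"
```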
@@ -32,11 +63,14 @@ Topic Resolution:
 
 Related:
     - OMN-1621: Runtime consumes event_bus subcontract for contract-driven wiring
+    - OMN-1740: Error classification (content vs infra) in wiring
     - ModelEventBusSubcontract: Contract model defining subscribe/publish topics
     - MessageDispatchEngine: Dispatch engine that processes received messages
     - EventBusKafka: Kafka event bus implementation
 
 .. versionadded:: 0.2.5
+.. versionchanged:: 0.2.9
+    Added error classification (content vs infrastructure) with DLQ integration.
 """
 
 from __future__ import annotations
@@ -46,6 +80,7 @@ import logging
 from collections.abc import Awaitable, Callable
 from pathlib import Path
 from typing import TYPE_CHECKING
+from uuid import UUID, uuid4
 
 import yaml
 from pydantic import ValidationError
@@ -59,8 +94,18 @@ from omnibase_core.protocols.event_bus.protocol_event_message import (
     ProtocolEventMessage,
 )
 from omnibase_infra.enums import EnumInfraTransportType
-from omnibase_infra.errors import ModelInfraErrorContext, RuntimeHostError
-from omnibase_infra.protocols import ProtocolDispatchEngine
+from omnibase_infra.errors import (
+    ModelInfraErrorContext,
+    ProtocolConfigurationError,
+    RuntimeHostError,
+)
+from omnibase_infra.models.event_bus import (
+    ModelConsumerRetryConfig,
+    ModelDlqConfig,
+    ModelIdempotencyConfig,
+    ModelOffsetPolicyConfig,
+)
+from omnibase_infra.protocols import ProtocolDispatchEngine, ProtocolIdempotencyStore
 
 if TYPE_CHECKING:
     from omnibase_infra.event_bus.event_bus_inmemory import EventBusInmemory
@@ -82,9 +127,28 @@ class EventBusSubcontractWiring:
     - Resolve topic suffixes to full topic names with environment prefix
     - Create Kafka subscriptions with appropriate consumer groups
     - Deserialize incoming messages to ModelEventEnvelope
+    - Check idempotency and skip duplicate messages (if enabled)
+    - Classify errors as content (DLQ) vs infrastructure (fail-fast)
     - Dispatch envelopes to MessageDispatchEngine
     - Manage subscription lifecycle (cleanup on shutdown)
 
+    Error Classification:
+        Content Errors (non-retryable): ProtocolConfigurationError, ValidationError,
+        json.JSONDecodeError. Default: DLQ and commit offset.
+
+        Infrastructure Errors (retryable): RuntimeHostError and subclasses.
+        Default: Fail-fast (no DLQ, no commit).
+
+    Idempotency:
+        When configured with an idempotency store and enabled config, the wiring
+        deduplicates messages based on the `envelope_id` field from the envelope.
+        Messages with the same envelope_id (within a topic domain) are processed
+        only once - duplicates are logged and skipped.
+
+        Requirements when idempotency is enabled:
+        - All envelopes MUST have a non-None envelope_id field
+        - Missing envelope_id raises ProtocolConfigurationError
+
     Thread Safety:
         This class is designed for single-threaded async use. All subscription
         operations should be performed from a single async context. The underlying
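
The idempotency gate described in this docstring is a single check-and-record call, keyed by envelope_id and namespaced by the topic. A condensed sketch of that flow, based on the store call shown later in this diff (the wiring itself raises ProtocolConfigurationError with Kafka transport context rather than ValueError):

```python
from typing import Any
from uuid import UUID, uuid4


async def should_dispatch(store: Any, config: Any, envelope: Any, topic: str) -> bool:
    """Return True to dispatch the envelope, False to skip a duplicate."""
    if store is None or not config.enabled:
        return True  # gate disabled: every message is dispatched
    if envelope.envelope_id is None:
        # Treated as a content error when idempotency is enabled
        raise ValueError("Envelope missing envelope_id for idempotency")
    correlation_id: UUID = envelope.correlation_id or uuid4()
    # True only the first time this envelope_id is seen within the topic's
    # namespace (the "domain"); duplicates return False and are skipped
    return await store.check_and_record(
        message_id=envelope.envelope_id,
        domain=topic,
        correlation_id=correlation_id,
    )
```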
@@ -94,13 +158,26 @@ class EventBusSubcontractWiring:
     Example:
         ```python
         from omnibase_infra.runtime import EventBusSubcontractWiring
+        from omnibase_infra.models.event_bus import (
+            ModelIdempotencyConfig,
+            ModelDlqConfig,
+            ModelConsumerRetryConfig,
+            ModelOffsetPolicyConfig,
+        )
+        from omnibase_infra.idempotency import StoreIdempotencyInmemory
         from omnibase_core.models.contracts.subcontracts import ModelEventBusSubcontract
 
-        # Create wiring with event bus and dispatch engine
+        # Create wiring with full error handling configuration
         wiring = EventBusSubcontractWiring(
             event_bus=event_bus,
             dispatch_engine=dispatch_engine,
             environment="dev",
+            node_name="my-handler",
+            idempotency_store=StoreIdempotencyInmemory(),
+            idempotency_config=ModelIdempotencyConfig(enabled=True),
+            dlq_config=ModelDlqConfig(enabled=True),
+            retry_config=ModelConsumerRetryConfig.create_standard(),
+            offset_policy=ModelOffsetPolicyConfig(),
         )
 
         # Wire subscriptions from subcontract
@@ -118,10 +195,20 @@ class EventBusSubcontractWiring:
         _event_bus: The event bus implementation (Kafka or in-memory)
         _dispatch_engine: Engine to dispatch received messages to handlers
         _environment: Environment prefix for topics (e.g., 'dev', 'prod')
+        _node_name: Name of the node/handler for consumer group and logging
+        _idempotency_store: Optional store for tracking processed messages
+        _idempotency_config: Configuration for idempotency behavior
+        _dlq_config: Configuration for Dead Letter Queue behavior
+        _retry_config: Configuration for consumer-side retry behavior
+        _offset_policy: Configuration for offset commit strategy
         _unsubscribe_callables: List of callables to unsubscribe from topics
         _logger: Logger for debug and error messages
+        _retry_counts: Tracks retry attempts per message (by correlation_id)
 
     .. versionadded:: 0.2.5
+    .. versionchanged:: 0.2.9
+        Added idempotency gate support via idempotency_store and idempotency_config.
+        Added error classification (content vs infrastructure) with DLQ integration.
     """
 
     def __init__(
@@ -129,6 +216,12 @@
         event_bus: ProtocolEventBusSubscriber,
         dispatch_engine: ProtocolDispatchEngine,
         environment: str,
+        node_name: str,
+        idempotency_store: ProtocolIdempotencyStore | None = None,
+        idempotency_config: ModelIdempotencyConfig | None = None,
+        dlq_config: ModelDlqConfig | None = None,
+        retry_config: ModelConsumerRetryConfig | None = None,
+        offset_policy: ModelOffsetPolicyConfig | None = None,
     ) -> None:
         """Initialize event bus wiring.
 
@@ -141,6 +234,20 @@
                 Must be frozen (registrations complete) before wiring subscriptions.
             environment: Environment prefix for topics (e.g., 'dev', 'prod').
                 Used to resolve topic suffixes to full topic names.
+            node_name: Name of the node/handler for consumer group identification and logging.
+            idempotency_store: Optional idempotency store for message deduplication.
+                If provided with enabled config, messages are deduplicated by envelope_id.
+            idempotency_config: Optional configuration for idempotency behavior.
+                If None, idempotency checking is disabled.
+            dlq_config: Optional configuration for Dead Letter Queue behavior.
+                Controls how content vs infrastructure errors are handled.
+                If None, uses defaults (content -> DLQ, infra -> fail-fast).
+            retry_config: Optional configuration for consumer-side retry behavior.
+                Controls retry attempts and backoff for infrastructure errors.
+                If None, uses standard defaults (3 attempts, exponential backoff).
+            offset_policy: Optional configuration for offset commit strategy.
+                Controls when offsets are committed relative to handler execution.
+                If None, uses commit_after_handler (at-least-once delivery).
 
         Note:
             The dispatch_engine should be frozen before wiring subscriptions.
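
For orientation, this is what the constructor falls back to when every optional argument above is left as None (model and method names appear in this diff; the per-field defaults are inferred from the Args text above, not read from the models' source):

```python
from omnibase_infra.models.event_bus import (
    ModelConsumerRetryConfig,
    ModelDlqConfig,
    ModelIdempotencyConfig,
    ModelOffsetPolicyConfig,
)

# Fallbacks applied in __init__ (see the hunk after next):
idempotency = ModelIdempotencyConfig()  # idempotency checking disabled
dlq = ModelDlqConfig()  # inferred: on_content_error="dlq_and_commit",
                        # on_infra_exhausted="fail_fast"
retry = ModelConsumerRetryConfig.create_standard()  # 3 attempts, exponential backoff
offsets = ModelOffsetPolicyConfig()  # commit_strategy="commit_after_handler"
                                     # (at-least-once delivery)
```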
@@ -155,8 +262,16 @@
         self._event_bus = event_bus
         self._dispatch_engine = dispatch_engine
         self._environment = environment
+        self._node_name = node_name
+        self._idempotency_store = idempotency_store
+        self._idempotency_config = idempotency_config or ModelIdempotencyConfig()
+        self._dlq_config = dlq_config or ModelDlqConfig()
+        self._retry_config = retry_config or ModelConsumerRetryConfig.create_standard()
+        self._offset_policy = offset_policy or ModelOffsetPolicyConfig()
         self._unsubscribe_callables: list[Callable[[], Awaitable[None]]] = []
         self._logger = logging.getLogger(__name__)
+        # Track retry attempts per correlation_id for infrastructure errors
+        self._retry_counts: dict[UUID, int] = {}
 
     def resolve_topic(self, topic_suffix: str) -> str:
         """Resolve topic suffix to full topic name with environment prefix.
@@ -202,6 +317,11 @@
         - Multiple instances of the same node load-balance message processing
         - Different environments are completely isolated
 
+        IMPORTANT: The same consumer_group is used for both subscriptions and
+        DLQ publishing to maintain traceability. DLQ messages include the
+        consumer_group that originally processed the message, enabling
+        correlation during replay and debugging.
+
         Args:
             subcontract: The event_bus subcontract from a handler's contract.
                 Contains subscribe_topics list with topic suffixes.
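
A worked example of how these names line up (the consumer group format and topic suffix convention come from this module's docstrings; the exact separator resolve_topic places between the environment prefix and the suffix is an assumption):

```python
environment = "dev"
node_name = "my-handler"

# Computed in wire_subscriptions and reused for DLQ publishing
consumer_group = f"{environment}.{node_name}"  # -> "dev.my-handler"

# A subscribe topic suffix following the ONEX convention
# onex.{kind}.{producer}.{event-name}.v{n}
topic_suffix = "onex.evt.my-handler.order-created.v1"  # hypothetical example

# Assumed full-topic shape: environment prefix, then the suffix
full_topic = f"{environment}.{topic_suffix}"  # -> "dev.onex.evt.my-handler.order-created.v1"
```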
@@ -228,43 +348,218 @@
 
         for topic_suffix in subcontract.subscribe_topics:
             full_topic = self.resolve_topic(topic_suffix)
-            group_id = f"{self._environment}.{node_name}"
+            # Consumer group ID derived from environment and node_name
+            # This same group_id is passed to DLQ publishing for traceability
+            consumer_group = f"{self._environment}.{node_name}"
 
-            # Create dispatch callback for this topic
-            callback = self._create_dispatch_callback(full_topic)
+            # Create dispatch callback for this topic, capturing the consumer_group
+            # used for this subscription to ensure DLQ messages have consistent
+            # consumer_group metadata
+            callback = self._create_dispatch_callback(full_topic, consumer_group)
 
             # Subscribe and store unsubscribe callable
             unsubscribe = await self._event_bus.subscribe(
                 topic=full_topic,
-                group_id=group_id,
+                group_id=consumer_group,
                 on_message=callback,
             )
             self._unsubscribe_callables.append(unsubscribe)
 
             self._logger.info(
-                "Wired subscription: topic=%s, group_id=%s, node=%s",
+                "Wired subscription: topic=%s, consumer_group=%s, node=%s",
                 full_topic,
-                group_id,
+                consumer_group,
                 node_name,
             )
 
+    def _should_commit_after_handler(self) -> bool:
+        """Check if offset should be committed after handler execution.
+
+        Returns:
+            True if offset_policy is commit_after_handler (at-least-once).
+        """
+        return self._offset_policy.commit_strategy == "commit_after_handler"
+
+    async def _commit_offset(
+        self,
+        message: ProtocolEventMessage,
+        correlation_id: UUID | None,
+    ) -> None:
+        """Commit Kafka offset for the processed message.
+
+        Delegates to the event bus if it supports offset commits.
+        This is a no-op for event buses that don't support explicit commits.
+
+        Args:
+            message: The message whose offset should be committed.
+            correlation_id: Optional correlation ID for logging.
+        """
+        # Duck-type check for commit_offset method
+        commit_fn = getattr(self._event_bus, "commit_offset", None)
+        if commit_fn is not None and callable(commit_fn):
+            try:
+                await commit_fn(message)
+                self._logger.debug(
+                    "offset_committed topic=%s offset=%s correlation_id=%s",
+                    getattr(message, "topic", "unknown"),
+                    getattr(message, "offset", "unknown"),
+                    str(correlation_id) if correlation_id else "none",
+                )
+            except Exception as e:
+                self._logger.warning(
+                    "offset_commit_failed topic=%s error=%s correlation_id=%s",
+                    getattr(message, "topic", "unknown"),
+                    str(e),
+                    str(correlation_id) if correlation_id else "none",
+                )
+
+    async def _publish_to_dlq(
+        self,
+        topic: str,
+        message: ProtocolEventMessage,
+        error: Exception,
+        correlation_id: UUID,
+        error_category: str,
+        consumer_group: str,
+    ) -> None:
+        """Publish failed message to Dead Letter Queue.
+
+        Delegates to the event bus if it supports DLQ publishing.
+        Falls back to logging if DLQ is not available.
+
+        Args:
+            topic: The original topic the message was consumed from.
+            message: The message that failed processing.
+            error: The exception that caused the failure.
+            correlation_id: Correlation ID for tracing.
+            error_category: Either "content" or "infra" for classification.
+            consumer_group: The consumer group ID that was subscribed to this topic.
+                This should match the group_id used in wire_subscriptions() for
+                consistent traceability in DLQ messages.
+        """
+        if not self._dlq_config.enabled:
+            self._logger.debug(
+                "dlq_disabled topic=%s correlation_id=%s error_category=%s",
+                topic,
+                str(correlation_id),
+                error_category,
+            )
+            return
+
+        # Duck-type check for DLQ publish method
+        publish_dlq_fn = getattr(self._event_bus, "_publish_raw_to_dlq", None)
+        if publish_dlq_fn is not None and callable(publish_dlq_fn):
+            try:
+                await publish_dlq_fn(
+                    original_topic=topic,
+                    raw_msg=message,
+                    error=error,
+                    correlation_id=correlation_id,
+                    failure_type=f"{error_category}_error",
+                    consumer_group=consumer_group,
+                )
+                self._logger.warning(
+                    "dlq_published topic=%s error_category=%s error_type=%s "
+                    "correlation_id=%s",
+                    topic,
+                    error_category,
+                    type(error).__name__,
+                    str(correlation_id),
+                )
+            except Exception as dlq_error:
+                self._logger.exception(
+                    "dlq_publish_failed topic=%s error=%s correlation_id=%s",
+                    topic,
+                    str(dlq_error),
+                    str(correlation_id),
+                )
+        else:
+            # Fallback: log at ERROR level if DLQ not available
+            self._logger.error(
+                "dlq_not_available topic=%s error_category=%s error_type=%s "
+                "error_message=%s correlation_id=%s",
+                topic,
+                error_category,
+                type(error).__name__,
+                str(error),
+                str(correlation_id),
+            )
+
+    def _get_retry_count(self, correlation_id: UUID) -> int:
+        """Get current retry count for a correlation ID.
+
+        Args:
+            correlation_id: The correlation ID to check.
+
+        Returns:
+            Current retry count (0 if not tracked).
+        """
+        return self._retry_counts.get(correlation_id, 0)
+
+    def _increment_retry_count(self, correlation_id: UUID) -> int:
+        """Increment retry count for a correlation ID.
+
+        Args:
+            correlation_id: The correlation ID to increment.
+
+        Returns:
+            New retry count after increment.
+        """
+        current = self._retry_counts.get(correlation_id, 0)
+        self._retry_counts[correlation_id] = current + 1
+        return current + 1
+
+    def _clear_retry_count(self, correlation_id: UUID) -> None:
+        """Clear retry count for a correlation ID after successful processing.
+
+        Args:
+            correlation_id: The correlation ID to clear.
+        """
+        self._retry_counts.pop(correlation_id, None)
+
+    def _is_retry_exhausted(self, correlation_id: UUID) -> bool:
+        """Check if retry budget is exhausted for a correlation ID.
+
+        Args:
+            correlation_id: The correlation ID to check.
+
+        Returns:
+            True if retry attempts exceed max_attempts from config.
+        """
+        return self._get_retry_count(correlation_id) >= self._retry_config.max_attempts
+
     def _create_dispatch_callback(
         self,
         topic: str,
+        consumer_group: str,
     ) -> Callable[[ProtocolEventMessage], Awaitable[None]]:
         """Create callback that bridges Kafka consumer to dispatch engine.
 
         Creates an async callback function that:
         1. Receives ProtocolEventMessage from the Kafka consumer
         2. Deserializes the message value to ModelEventEnvelope
-        3. Dispatches the envelope to the MessageDispatchEngine
-
-        Error Handling:
-        - Deserialization errors are logged and the message is skipped
-        - Dispatch errors are propagated (handled by the event bus DLQ logic)
+        3. Checks idempotency (if enabled) to skip duplicate messages
+        4. Dispatches the envelope to the MessageDispatchEngine
+        5. Classifies errors as content (DLQ) vs infrastructure (fail-fast)
+        6. Manages offset commits based on policy
+
+        Error Classification:
+            Content Errors (ProtocolConfigurationError, ValidationError, JSONDecodeError):
+            - Non-retryable (will never succeed with retry)
+            - Default: DLQ and commit offset
+            - Policy override via dlq_config.on_content_error
+
+            Infrastructure Errors (RuntimeHostError and subclasses):
+            - Potentially retryable (may succeed after service recovery)
+            - Default: Fail-fast (no DLQ, no commit, re-raise)
+            - If retry exhausted and policy allows: DLQ and commit
+            - Policy override via dlq_config.on_infra_exhausted
 
         Args:
             topic: The full topic name for routing context in logs.
+            consumer_group: The consumer group ID used for this topic subscription.
+                This is passed to DLQ publishing to ensure consistent traceability
+                between subscriptions and their associated DLQ messages.
 
         Returns:
             Async callback function compatible with event bus subscribe().
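
Both _commit_offset and _publish_to_dlq in the hunk above probe the event bus for an optional method instead of widening the subscriber protocol. The pattern, reduced to a minimal sketch (argument types deliberately loose; not code from the package):

```python
from typing import Any


async def commit_if_supported(event_bus: Any, message: Any) -> None:
    """Duck-typed capability probe, mirroring _commit_offset above."""
    commit_fn = getattr(event_bus, "commit_offset", None)
    if commit_fn is not None and callable(commit_fn):
        # Kafka bus: advances the consumer offset for this message
        await commit_fn(message)
    # Buses without commit_offset (e.g. the in-memory bus) fall through as a no-op
```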
@@ -272,34 +567,169 @@
 
         async def callback(message: ProtocolEventMessage) -> None:
             """Process incoming Kafka message and dispatch to engine."""
+            envelope: ModelEventEnvelope[object] | None = None
+            correlation_id: UUID = uuid4()  # Default if not in envelope
+
             try:
                 envelope = self._deserialize_to_envelope(message)
+                correlation_id = envelope.correlation_id or uuid4()
+
+                # Idempotency gate: check for duplicate messages
+                if self._idempotency_store and self._idempotency_config.enabled:
+                    envelope_id = envelope.envelope_id
+                    if envelope_id is None:
+                        # Missing envelope_id is a content error when idempotency is enabled
+                        raise ProtocolConfigurationError(
+                            "Envelope missing envelope_id for idempotency",
+                            context=ModelInfraErrorContext.with_correlation(
+                                correlation_id=correlation_id,
+                                transport_type=EnumInfraTransportType.KAFKA,
+                                operation="idempotency_check",
+                            ),
+                        )
+
+                    is_new = await self._idempotency_store.check_and_record(
+                        message_id=envelope_id,
+                        domain=topic,  # Use topic as domain for namespace isolation
+                        correlation_id=correlation_id,
+                    )
+                    if not is_new:
+                        # Duplicate - skip processing but commit offset to prevent
+                        # infinite redelivery. This is critical: even though we don't
+                        # reprocess the message, we must advance the consumer offset.
+                        self._logger.info(
+                            "idempotency_skip envelope_id=%s topic=%s "
+                            "correlation_id=%s node=%s reason=duplicate_message",
+                            str(envelope_id),
+                            topic,
+                            str(correlation_id),
+                            self._node_name,
+                        )
+                        # Commit offset for duplicate to prevent infinite redelivery
+                        if self._should_commit_after_handler():
+                            await self._commit_offset(message, correlation_id)
+                        return  # Skip dispatch
+
                 # Dispatch via ProtocolDispatchEngine interface
                 await self._dispatch_engine.dispatch(topic, envelope)
-            except json.JSONDecodeError as e:
-                self._logger.exception(
-                    "Failed to deserialize message from topic '%s': %s",
+
+                # Success - commit offset if policy requires and clear retry count
+                if self._should_commit_after_handler():
+                    await self._commit_offset(message, correlation_id)
+                self._clear_retry_count(correlation_id)
+
+            except (json.JSONDecodeError, ValidationError) as e:
+                # Content error: malformed JSON or schema validation failure
+                # These are non-retryable - the message will never parse correctly
+                self._logger.warning(
+                    "content_error_deserialization topic=%s error_type=%s "
+                    "error=%s correlation_id=%s",
                     topic,
-                    e,
+                    type(e).__name__,
+                    str(e),
+                    str(correlation_id),
                 )
-                # Wrap in OnexError per CLAUDE.md: "OnexError Only"
-                raise RuntimeHostError(
-                    f"Failed to deserialize message from topic '{topic}'",
+
+                if self._dlq_config.on_content_error == "dlq_and_commit":
+                    await self._publish_to_dlq(
+                        topic, message, e, correlation_id, "content", consumer_group
+                    )
+                    await self._commit_offset(message, correlation_id)
+                    return  # Handled - don't re-raise
+
+                # fail_fast - wrap and re-raise
+                raise ProtocolConfigurationError(
+                    f"Content error: failed to deserialize message from topic '{topic}'",
                     context=ModelInfraErrorContext.with_correlation(
+                        correlation_id=correlation_id,
                         transport_type=EnumInfraTransportType.KAFKA,
                         operation="event_bus_deserialize",
                     ),
                 ) from e
+
+            except ProtocolConfigurationError as e:
+                # Content error: already classified as non-retryable
+                self._logger.warning(
+                    "content_error_configuration topic=%s error=%s correlation_id=%s",
+                    topic,
+                    str(e),
+                    str(correlation_id),
+                )
+
+                if self._dlq_config.on_content_error == "dlq_and_commit":
+                    await self._publish_to_dlq(
+                        topic, message, e, correlation_id, "content", consumer_group
+                    )
+                    await self._commit_offset(message, correlation_id)
+                    return  # Handled - don't re-raise
+
+                # fail_fast - re-raise without wrapping (already proper OnexError)
+                raise
+
+            except RuntimeHostError as e:
+                # Infrastructure error: potentially retryable
+                # Track retry attempts and check exhaustion
+                retry_count = self._increment_retry_count(correlation_id)
+                is_exhausted = self._is_retry_exhausted(correlation_id)
+
+                # TRY400 disabled: logger.error intentional to avoid leaking stack traces
+                self._logger.error(  # noqa: TRY400
+                    "infra_error topic=%s error_type=%s error=%s "
+                    "retry_count=%d max_attempts=%d exhausted=%s correlation_id=%s",
+                    topic,
+                    type(e).__name__,
+                    str(e),
+                    retry_count,
+                    self._retry_config.max_attempts,
+                    is_exhausted,
+                    str(correlation_id),
+                )
+
+                if is_exhausted:
+                    # Retry budget exhausted - check policy
+                    if self._dlq_config.on_infra_exhausted == "dlq_and_commit":
+                        await self._publish_to_dlq(
+                            topic, message, e, correlation_id, "infra", consumer_group
+                        )
+                        await self._commit_offset(message, correlation_id)
+                        self._clear_retry_count(correlation_id)
+                        return  # Handled - don't re-raise
+
+                # fail_fast (default) - re-raise without committing
+                # Kafka will redeliver the message
+                raise
+
             except Exception as e:
+                # Unexpected error - classify as infrastructure error
+                # This catches errors from handlers that aren't properly wrapped
+                retry_count = self._increment_retry_count(correlation_id)
+                is_exhausted = self._is_retry_exhausted(correlation_id)
+
                 self._logger.exception(
-                    "Failed to dispatch message from topic '%s': %s",
+                    "unexpected_error topic=%s error_type=%s error=%s "
+                    "retry_count=%d exhausted=%s correlation_id=%s",
                     topic,
-                    e,
+                    type(e).__name__,
+                    str(e),
+                    retry_count,
+                    is_exhausted,
+                    str(correlation_id),
                 )
-                # Wrap in OnexError per CLAUDE.md: "OnexError Only"
+
+                if is_exhausted:
+                    if self._dlq_config.on_infra_exhausted == "dlq_and_commit":
+                        await self._publish_to_dlq(
+                            topic, message, e, correlation_id, "infra", consumer_group
+                        )
+                        await self._commit_offset(message, correlation_id)
+                        self._clear_retry_count(correlation_id)
+                        return
+
+                # Wrap in RuntimeHostError and re-raise
                 raise RuntimeHostError(
                     f"Failed to dispatch message from topic '{topic}'",
                     context=ModelInfraErrorContext.with_correlation(
+                        correlation_id=correlation_id,
                         transport_type=EnumInfraTransportType.KAFKA,
                         operation="event_bus_dispatch",
                     ),
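
Taken together, the callback's except clauses collapse into a small decision table. A sketch that restates the branches above (the outcome strings are descriptive, not identifiers from the module):

```python
from typing import Any


def error_outcome(category: str, retries_exhausted: bool, dlq_config: Any) -> str:
    """Summarize the dispatch callback's error-handling branches."""
    if category == "content":
        # Non-retryable: retry bookkeeping never applies
        if dlq_config.on_content_error == "dlq_and_commit":
            return "publish to DLQ, commit offset, swallow error"
        return "fail fast: re-raise"
    # "infra" covers RuntimeHostError and unexpected exceptions alike
    if retries_exhausted and dlq_config.on_infra_exhausted == "dlq_and_commit":
        return "publish to DLQ, commit offset, clear retry count"
    return "fail fast: re-raise without committing; Kafka redelivers"
```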
@@ -368,6 +798,7 @@ class EventBusSubcontractWiring:
                 )
 
         self._unsubscribe_callables.clear()
+        self._retry_counts.clear()
 
         if cleanup_count > 0:
             self._logger.info(