omnibase_infra 0.2.7__py3-none-any.whl → 0.2.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- omnibase_infra/__init__.py +1 -1
- omnibase_infra/enums/__init__.py +4 -0
- omnibase_infra/enums/enum_declarative_node_violation.py +102 -0
- omnibase_infra/event_bus/adapters/__init__.py +31 -0
- omnibase_infra/event_bus/adapters/adapter_protocol_event_publisher_kafka.py +517 -0
- omnibase_infra/mixins/mixin_async_circuit_breaker.py +113 -1
- omnibase_infra/models/__init__.py +9 -0
- omnibase_infra/models/event_bus/__init__.py +22 -0
- omnibase_infra/models/event_bus/model_consumer_retry_config.py +367 -0
- omnibase_infra/models/event_bus/model_dlq_config.py +177 -0
- omnibase_infra/models/event_bus/model_idempotency_config.py +131 -0
- omnibase_infra/models/event_bus/model_offset_policy_config.py +107 -0
- omnibase_infra/models/resilience/model_circuit_breaker_config.py +15 -0
- omnibase_infra/models/validation/__init__.py +8 -0
- omnibase_infra/models/validation/model_declarative_node_validation_result.py +139 -0
- omnibase_infra/models/validation/model_declarative_node_violation.py +169 -0
- omnibase_infra/nodes/architecture_validator/__init__.py +28 -7
- omnibase_infra/nodes/architecture_validator/constants.py +36 -0
- omnibase_infra/nodes/architecture_validator/handlers/__init__.py +28 -0
- omnibase_infra/nodes/architecture_validator/handlers/contract.yaml +120 -0
- omnibase_infra/nodes/architecture_validator/handlers/handler_architecture_validation.py +359 -0
- omnibase_infra/nodes/architecture_validator/node.py +1 -0
- omnibase_infra/nodes/architecture_validator/node_architecture_validator.py +48 -336
- omnibase_infra/nodes/node_ledger_projection_compute/__init__.py +16 -2
- omnibase_infra/nodes/node_ledger_projection_compute/contract.yaml +14 -4
- omnibase_infra/nodes/node_ledger_projection_compute/handlers/__init__.py +18 -0
- omnibase_infra/nodes/node_ledger_projection_compute/handlers/contract.yaml +53 -0
- omnibase_infra/nodes/node_ledger_projection_compute/handlers/handler_ledger_projection.py +354 -0
- omnibase_infra/nodes/node_ledger_projection_compute/node.py +20 -256
- omnibase_infra/nodes/node_registry_effect/node.py +20 -73
- omnibase_infra/protocols/protocol_dispatch_engine.py +90 -0
- omnibase_infra/runtime/__init__.py +11 -0
- omnibase_infra/runtime/baseline_subscriptions.py +150 -0
- omnibase_infra/runtime/event_bus_subcontract_wiring.py +455 -24
- omnibase_infra/runtime/kafka_contract_source.py +13 -5
- omnibase_infra/runtime/service_message_dispatch_engine.py +112 -0
- omnibase_infra/runtime/service_runtime_host_process.py +6 -11
- omnibase_infra/services/__init__.py +36 -0
- omnibase_infra/services/contract_publisher/__init__.py +95 -0
- omnibase_infra/services/contract_publisher/config.py +199 -0
- omnibase_infra/services/contract_publisher/errors.py +243 -0
- omnibase_infra/services/contract_publisher/models/__init__.py +28 -0
- omnibase_infra/services/contract_publisher/models/model_contract_error.py +67 -0
- omnibase_infra/services/contract_publisher/models/model_infra_error.py +62 -0
- omnibase_infra/services/contract_publisher/models/model_publish_result.py +112 -0
- omnibase_infra/services/contract_publisher/models/model_publish_stats.py +79 -0
- omnibase_infra/services/contract_publisher/service.py +617 -0
- omnibase_infra/services/contract_publisher/sources/__init__.py +52 -0
- omnibase_infra/services/contract_publisher/sources/model_discovered.py +155 -0
- omnibase_infra/services/contract_publisher/sources/protocol.py +101 -0
- omnibase_infra/services/contract_publisher/sources/source_composite.py +309 -0
- omnibase_infra/services/contract_publisher/sources/source_filesystem.py +174 -0
- omnibase_infra/services/contract_publisher/sources/source_package.py +221 -0
- omnibase_infra/services/observability/__init__.py +40 -0
- omnibase_infra/services/observability/agent_actions/__init__.py +64 -0
- omnibase_infra/services/observability/agent_actions/config.py +209 -0
- omnibase_infra/services/observability/agent_actions/consumer.py +1320 -0
- omnibase_infra/services/observability/agent_actions/models/__init__.py +87 -0
- omnibase_infra/services/observability/agent_actions/models/model_agent_action.py +142 -0
- omnibase_infra/services/observability/agent_actions/models/model_detection_failure.py +125 -0
- omnibase_infra/services/observability/agent_actions/models/model_envelope.py +85 -0
- omnibase_infra/services/observability/agent_actions/models/model_execution_log.py +159 -0
- omnibase_infra/services/observability/agent_actions/models/model_performance_metric.py +130 -0
- omnibase_infra/services/observability/agent_actions/models/model_routing_decision.py +138 -0
- omnibase_infra/services/observability/agent_actions/models/model_transformation_event.py +124 -0
- omnibase_infra/services/observability/agent_actions/tests/__init__.py +20 -0
- omnibase_infra/services/observability/agent_actions/tests/test_consumer.py +1154 -0
- omnibase_infra/services/observability/agent_actions/tests/test_models.py +645 -0
- omnibase_infra/services/observability/agent_actions/tests/test_writer.py +709 -0
- omnibase_infra/services/observability/agent_actions/writer_postgres.py +926 -0
- omnibase_infra/validation/__init__.py +12 -0
- omnibase_infra/validation/contracts/declarative_node.validation.yaml +143 -0
- omnibase_infra/validation/validation_exemptions.yaml +93 -0
- omnibase_infra/validation/validator_declarative_node.py +850 -0
- {omnibase_infra-0.2.7.dist-info → omnibase_infra-0.2.9.dist-info}/METADATA +3 -3
- {omnibase_infra-0.2.7.dist-info → omnibase_infra-0.2.9.dist-info}/RECORD +79 -27
- {omnibase_infra-0.2.7.dist-info → omnibase_infra-0.2.9.dist-info}/WHEEL +0 -0
- {omnibase_infra-0.2.7.dist-info → omnibase_infra-0.2.9.dist-info}/entry_points.txt +0 -0
- {omnibase_infra-0.2.7.dist-info → omnibase_infra-0.2.9.dist-info}/licenses/LICENSE +0 -0
@@ -13,11 +13,42 @@ Architecture:
 3. Creating Kafka subscriptions with appropriate consumer groups
 4. Bridging received messages to the MessageDispatchEngine
 5. Managing subscription lifecycle (creation and cleanup)
+6. Classifying errors as content vs infrastructure for proper handling
 
 This follows the ARCH-002 principle: "Runtime owns all Kafka plumbing."
 Nodes and handlers declare their topic requirements in contracts, but
 never directly interact with Kafka consumers or producers.
 
+Error Classification:
+    The wiring distinguishes between two error categories:
+
+    Content Errors (non-retryable):
+        Schema validation failures, malformed payloads, missing required fields,
+        type conversion errors. These will NOT fix themselves with retry.
+        Default behavior: Send to DLQ and commit offset (dlq_and_commit).
+        Identified by: ProtocolConfigurationError, json.JSONDecodeError,
+        pydantic.ValidationError
+
+    Infrastructure Errors (potentially retryable):
+        Database timeouts, network failures, service unavailability.
+        These errors MAY fix themselves after retry.
+        Default behavior: Fail fast (fail_fast) to avoid hiding infrastructure
+        fires in the DLQ.
+        Identified by: RuntimeHostError and subclasses (InfraConnectionError,
+        InfraTimeoutError, InfraUnavailableError, etc.)
+
+DLQ Consumer Group Alignment:
+    IMPORTANT: The consumer_group used for DLQ publishing MUST match the
+    consumer_group used when subscribing to topics. This is critical for:
+    - Traceability: DLQ messages can be correlated back to their source consumer
+    - Replay operations: DLQ replay tools can identify which consumer group failed
+    - Debugging: Operations teams can trace failures to specific consumer groups
+
+    The wiring ensures this alignment by:
+    1. Computing consumer_group as "{environment}.{node_name}" in wire_subscriptions
+    2. Passing this same consumer_group to _create_dispatch_callback
+    3. Using it in all _publish_to_dlq calls within the callback closure
+
 Topic Resolution:
     Topic suffixes from contracts follow the ONEX naming convention:
     onex.{kind}.{producer}.{event-name}.v{n}
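The classification rule this hunk documents is compact enough to sketch standalone. A minimal illustration, with stand-in exception classes in place of the real `omnibase_infra.errors` types (only `json.JSONDecodeError` and pydantic's `ValidationError` are the actual classes named above):

```python
# Hypothetical sketch of the content-vs-infrastructure rule described above.
# The two stand-in classes are placeholders for the real
# omnibase_infra.errors.ProtocolConfigurationError / RuntimeHostError.
import json

from pydantic import ValidationError


class ProtocolConfigurationError(Exception):  # stand-in
    pass


class RuntimeHostError(Exception):  # stand-in
    pass


def classify_error(error: Exception) -> str:
    """Return 'content' for non-retryable errors, 'infra' otherwise."""
    if isinstance(
        error, (ProtocolConfigurationError, json.JSONDecodeError, ValidationError)
    ):
        return "content"  # will never succeed on retry -> DLQ and commit
    # RuntimeHostError and anything unclassified may recover -> fail fast by default
    return "infra"


assert classify_error(ProtocolConfigurationError("bad schema")) == "content"
assert classify_error(RuntimeHostError("db timeout")) == "infra"
```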
@@ -32,11 +63,14 @@ Topic Resolution:
 
 Related:
 - OMN-1621: Runtime consumes event_bus subcontract for contract-driven wiring
+- OMN-1740: Error classification (content vs infra) in wiring
 - ModelEventBusSubcontract: Contract model defining subscribe/publish topics
 - MessageDispatchEngine: Dispatch engine that processes received messages
 - EventBusKafka: Kafka event bus implementation
 
 .. versionadded:: 0.2.5
+.. versionchanged:: 0.2.9
+    Added error classification (content vs infrastructure) with DLQ integration.
 """
 
 from __future__ import annotations
@@ -46,6 +80,7 @@ import logging
 from collections.abc import Awaitable, Callable
 from pathlib import Path
 from typing import TYPE_CHECKING
+from uuid import UUID, uuid4
 
 import yaml
 from pydantic import ValidationError
@@ -59,8 +94,18 @@ from omnibase_core.protocols.event_bus.protocol_event_message import (
     ProtocolEventMessage,
 )
 from omnibase_infra.enums import EnumInfraTransportType
-from omnibase_infra.errors import
-
+from omnibase_infra.errors import (
+    ModelInfraErrorContext,
+    ProtocolConfigurationError,
+    RuntimeHostError,
+)
+from omnibase_infra.models.event_bus import (
+    ModelConsumerRetryConfig,
+    ModelDlqConfig,
+    ModelIdempotencyConfig,
+    ModelOffsetPolicyConfig,
+)
+from omnibase_infra.protocols import ProtocolDispatchEngine, ProtocolIdempotencyStore
 
 if TYPE_CHECKING:
     from omnibase_infra.event_bus.event_bus_inmemory import EventBusInmemory
@@ -82,9 +127,28 @@ class EventBusSubcontractWiring:
     - Resolve topic suffixes to full topic names with environment prefix
     - Create Kafka subscriptions with appropriate consumer groups
    - Deserialize incoming messages to ModelEventEnvelope
+    - Check idempotency and skip duplicate messages (if enabled)
+    - Classify errors as content (DLQ) vs infrastructure (fail-fast)
     - Dispatch envelopes to MessageDispatchEngine
     - Manage subscription lifecycle (cleanup on shutdown)
 
+    Error Classification:
+        Content Errors (non-retryable): ProtocolConfigurationError, ValidationError,
+        json.JSONDecodeError. Default: DLQ and commit offset.
+
+        Infrastructure Errors (retryable): RuntimeHostError and subclasses.
+        Default: Fail-fast (no DLQ, no commit).
+
+    Idempotency:
+        When configured with an idempotency store and enabled config, the wiring
+        deduplicates messages based on the `envelope_id` field from the envelope.
+        Messages with the same envelope_id (within a topic domain) are processed
+        only once - duplicates are logged and skipped.
+
+        Requirements when idempotency is enabled:
+        - All envelopes MUST have a non-None envelope_id field
+        - Missing envelope_id raises ProtocolConfigurationError
+
     Thread Safety:
         This class is designed for single-threaded async use. All subscription
         operations should be performed from a single async context. The underlying
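The idempotency gate only relies on the `check_and_record` call shape that appears later in this diff (keyword arguments `message_id`, `domain`, `correlation_id`, returning True for first-seen messages). A minimal in-memory sketch of that contract; the class name here is hypothetical, and the shipped implementation is `StoreIdempotencyInmemory`:

```python
# Illustrative in-memory store matching the check_and_record call shape
# used by the wiring. The real implementation ships in omnibase_infra;
# this sketch only demonstrates the deduplication semantics.
from uuid import UUID


class SketchIdempotencyStore:  # hypothetical name
    def __init__(self) -> None:
        # (domain, message_id) pairs already seen; the wiring passes the
        # full topic name as the domain for namespace isolation.
        self._seen: set[tuple[str, UUID]] = set()

    async def check_and_record(
        self,
        message_id: UUID,
        domain: str,
        correlation_id: UUID,
    ) -> bool:
        """Return True the first time a (domain, message_id) pair is seen."""
        key = (domain, message_id)
        if key in self._seen:
            return False  # duplicate: caller logs, commits offset, skips dispatch
        self._seen.add(key)
        return True
```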
@@ -94,13 +158,26 @@ class EventBusSubcontractWiring:
     Example:
         ```python
         from omnibase_infra.runtime import EventBusSubcontractWiring
+        from omnibase_infra.models.event_bus import (
+            ModelIdempotencyConfig,
+            ModelDlqConfig,
+            ModelConsumerRetryConfig,
+            ModelOffsetPolicyConfig,
+        )
+        from omnibase_infra.idempotency import StoreIdempotencyInmemory
         from omnibase_core.models.contracts.subcontracts import ModelEventBusSubcontract
 
-        # Create wiring with
+        # Create wiring with full error handling configuration
         wiring = EventBusSubcontractWiring(
             event_bus=event_bus,
             dispatch_engine=dispatch_engine,
             environment="dev",
+            node_name="my-handler",
+            idempotency_store=StoreIdempotencyInmemory(),
+            idempotency_config=ModelIdempotencyConfig(enabled=True),
+            dlq_config=ModelDlqConfig(enabled=True),
+            retry_config=ModelConsumerRetryConfig.create_standard(),
+            offset_policy=ModelOffsetPolicyConfig(),
         )
 
         # Wire subscriptions from subcontract
@@ -118,10 +195,20 @@ class EventBusSubcontractWiring:
         _event_bus: The event bus implementation (Kafka or in-memory)
         _dispatch_engine: Engine to dispatch received messages to handlers
         _environment: Environment prefix for topics (e.g., 'dev', 'prod')
+        _node_name: Name of the node/handler for consumer group and logging
+        _idempotency_store: Optional store for tracking processed messages
+        _idempotency_config: Configuration for idempotency behavior
+        _dlq_config: Configuration for Dead Letter Queue behavior
+        _retry_config: Configuration for consumer-side retry behavior
+        _offset_policy: Configuration for offset commit strategy
         _unsubscribe_callables: List of callables to unsubscribe from topics
         _logger: Logger for debug and error messages
+        _retry_counts: Tracks retry attempts per message (by correlation_id)
 
     .. versionadded:: 0.2.5
+    .. versionchanged:: 0.2.9
+        Added idempotency gate support via idempotency_store and idempotency_config.
+        Added error classification (content vs infrastructure) with DLQ integration.
     """
 
     def __init__(
@@ -129,6 +216,12 @@ class EventBusSubcontractWiring:
         event_bus: ProtocolEventBusSubscriber,
         dispatch_engine: ProtocolDispatchEngine,
         environment: str,
+        node_name: str,
+        idempotency_store: ProtocolIdempotencyStore | None = None,
+        idempotency_config: ModelIdempotencyConfig | None = None,
+        dlq_config: ModelDlqConfig | None = None,
+        retry_config: ModelConsumerRetryConfig | None = None,
+        offset_policy: ModelOffsetPolicyConfig | None = None,
     ) -> None:
         """Initialize event bus wiring.
 
@@ -141,6 +234,20 @@ class EventBusSubcontractWiring:
                 Must be frozen (registrations complete) before wiring subscriptions.
             environment: Environment prefix for topics (e.g., 'dev', 'prod').
                 Used to resolve topic suffixes to full topic names.
+            node_name: Name of the node/handler for consumer group identification and logging.
+            idempotency_store: Optional idempotency store for message deduplication.
+                If provided with enabled config, messages are deduplicated by envelope_id.
+            idempotency_config: Optional configuration for idempotency behavior.
+                If None, idempotency checking is disabled.
+            dlq_config: Optional configuration for Dead Letter Queue behavior.
+                Controls how content vs infrastructure errors are handled.
+                If None, uses defaults (content -> DLQ, infra -> fail-fast).
+            retry_config: Optional configuration for consumer-side retry behavior.
+                Controls retry attempts and backoff for infrastructure errors.
+                If None, uses standard defaults (3 attempts, exponential backoff).
+            offset_policy: Optional configuration for offset commit strategy.
+                Controls when offsets are committed relative to handler execution.
+                If None, uses commit_after_handler (at-least-once delivery).
 
         Note:
             The dispatch_engine should be frozen before wiring subscriptions.
@@ -155,8 +262,16 @@ class EventBusSubcontractWiring:
         self._event_bus = event_bus
         self._dispatch_engine = dispatch_engine
         self._environment = environment
+        self._node_name = node_name
+        self._idempotency_store = idempotency_store
+        self._idempotency_config = idempotency_config or ModelIdempotencyConfig()
+        self._dlq_config = dlq_config or ModelDlqConfig()
+        self._retry_config = retry_config or ModelConsumerRetryConfig.create_standard()
+        self._offset_policy = offset_policy or ModelOffsetPolicyConfig()
         self._unsubscribe_callables: list[Callable[[], Awaitable[None]]] = []
         self._logger = logging.getLogger(__name__)
+        # Track retry attempts per correlation_id for infrastructure errors
+        self._retry_counts: dict[UUID, int] = {}
 
     def resolve_topic(self, topic_suffix: str) -> str:
         """Resolve topic suffix to full topic name with environment prefix.
@@ -202,6 +317,11 @@ class EventBusSubcontractWiring:
         - Multiple instances of the same node load-balance message processing
         - Different environments are completely isolated
 
+        IMPORTANT: The same consumer_group is used for both subscriptions and
+        DLQ publishing to maintain traceability. DLQ messages include the
+        consumer_group that originally processed the message, enabling
+        correlation during replay and debugging.
+
         Args:
             subcontract: The event_bus subcontract from a handler's contract.
                 Contains subscribe_topics list with topic suffixes.
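The alignment invariant amounts to deriving one string and reusing it everywhere. A small sketch of the naming convention stated in the docstring above:

```python
# Sketch of the consumer-group alignment invariant: the group used to
# subscribe must be the same string later attached to any DLQ message
# produced for that subscription.
def derive_consumer_group(environment: str, node_name: str) -> str:
    """Consumer group naming convention: '{environment}.{node_name}'."""
    return f"{environment}.{node_name}"


group = derive_consumer_group("dev", "my-handler")
assert group == "dev.my-handler"
# The same `group` value is passed both to subscribe(group_id=group) and
# into the dispatch callback that publishes to the DLQ.
```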
@@ -228,43 +348,218 @@ class EventBusSubcontractWiring:
 
         for topic_suffix in subcontract.subscribe_topics:
             full_topic = self.resolve_topic(topic_suffix)
-
+            # Consumer group ID derived from environment and node_name
+            # This same group_id is passed to DLQ publishing for traceability
+            consumer_group = f"{self._environment}.{node_name}"
 
-            # Create dispatch callback for this topic
-
+            # Create dispatch callback for this topic, capturing the consumer_group
+            # used for this subscription to ensure DLQ messages have consistent
+            # consumer_group metadata
+            callback = self._create_dispatch_callback(full_topic, consumer_group)
 
             # Subscribe and store unsubscribe callable
             unsubscribe = await self._event_bus.subscribe(
                 topic=full_topic,
-                group_id=
+                group_id=consumer_group,
                 on_message=callback,
             )
             self._unsubscribe_callables.append(unsubscribe)
 
             self._logger.info(
-                "Wired subscription: topic=%s,
+                "Wired subscription: topic=%s, consumer_group=%s, node=%s",
                 full_topic,
-
+                consumer_group,
                 node_name,
             )
 
+    def _should_commit_after_handler(self) -> bool:
+        """Check if offset should be committed after handler execution.
+
+        Returns:
+            True if offset_policy is commit_after_handler (at-least-once).
+        """
+        return self._offset_policy.commit_strategy == "commit_after_handler"
+
+    async def _commit_offset(
+        self,
+        message: ProtocolEventMessage,
+        correlation_id: UUID | None,
+    ) -> None:
+        """Commit Kafka offset for the processed message.
+
+        Delegates to the event bus if it supports offset commits.
+        This is a no-op for event buses that don't support explicit commits.
+
+        Args:
+            message: The message whose offset should be committed.
+            correlation_id: Optional correlation ID for logging.
+        """
+        # Duck-type check for commit_offset method
+        commit_fn = getattr(self._event_bus, "commit_offset", None)
+        if commit_fn is not None and callable(commit_fn):
+            try:
+                await commit_fn(message)
+                self._logger.debug(
+                    "offset_committed topic=%s offset=%s correlation_id=%s",
+                    getattr(message, "topic", "unknown"),
+                    getattr(message, "offset", "unknown"),
+                    str(correlation_id) if correlation_id else "none",
+                )
+            except Exception as e:
+                self._logger.warning(
+                    "offset_commit_failed topic=%s error=%s correlation_id=%s",
+                    getattr(message, "topic", "unknown"),
+                    str(e),
+                    str(correlation_id) if correlation_id else "none",
+                )
+
+    async def _publish_to_dlq(
+        self,
+        topic: str,
+        message: ProtocolEventMessage,
+        error: Exception,
+        correlation_id: UUID,
+        error_category: str,
+        consumer_group: str,
+    ) -> None:
+        """Publish failed message to Dead Letter Queue.
+
+        Delegates to the event bus if it supports DLQ publishing.
+        Falls back to logging if DLQ is not available.
+
+        Args:
+            topic: The original topic the message was consumed from.
+            message: The message that failed processing.
+            error: The exception that caused the failure.
+            correlation_id: Correlation ID for tracing.
+            error_category: Either "content" or "infra" for classification.
+            consumer_group: The consumer group ID that was subscribed to this topic.
+                This should match the group_id used in wire_subscriptions() for
+                consistent traceability in DLQ messages.
+        """
+        if not self._dlq_config.enabled:
+            self._logger.debug(
+                "dlq_disabled topic=%s correlation_id=%s error_category=%s",
+                topic,
+                str(correlation_id),
+                error_category,
+            )
+            return
+
+        # Duck-type check for DLQ publish method
+        publish_dlq_fn = getattr(self._event_bus, "_publish_raw_to_dlq", None)
+        if publish_dlq_fn is not None and callable(publish_dlq_fn):
+            try:
+                await publish_dlq_fn(
+                    original_topic=topic,
+                    raw_msg=message,
+                    error=error,
+                    correlation_id=correlation_id,
+                    failure_type=f"{error_category}_error",
+                    consumer_group=consumer_group,
+                )
+                self._logger.warning(
+                    "dlq_published topic=%s error_category=%s error_type=%s "
+                    "correlation_id=%s",
+                    topic,
+                    error_category,
+                    type(error).__name__,
+                    str(correlation_id),
+                )
+            except Exception as dlq_error:
+                self._logger.exception(
+                    "dlq_publish_failed topic=%s error=%s correlation_id=%s",
+                    topic,
+                    str(dlq_error),
+                    str(correlation_id),
+                )
+        else:
+            # Fallback: log at ERROR level if DLQ not available
+            self._logger.error(
+                "dlq_not_available topic=%s error_category=%s error_type=%s "
+                "error_message=%s correlation_id=%s",
+                topic,
+                error_category,
+                type(error).__name__,
+                str(error),
+                str(correlation_id),
+            )
+
+    def _get_retry_count(self, correlation_id: UUID) -> int:
+        """Get current retry count for a correlation ID.
+
+        Args:
+            correlation_id: The correlation ID to check.
+
+        Returns:
+            Current retry count (0 if not tracked).
+        """
+        return self._retry_counts.get(correlation_id, 0)
+
+    def _increment_retry_count(self, correlation_id: UUID) -> int:
+        """Increment retry count for a correlation ID.
+
+        Args:
+            correlation_id: The correlation ID to increment.
+
+        Returns:
+            New retry count after increment.
+        """
+        current = self._retry_counts.get(correlation_id, 0)
+        self._retry_counts[correlation_id] = current + 1
+        return current + 1
+
+    def _clear_retry_count(self, correlation_id: UUID) -> None:
+        """Clear retry count for a correlation ID after successful processing.
+
+        Args:
+            correlation_id: The correlation ID to clear.
+        """
+        self._retry_counts.pop(correlation_id, None)
+
+    def _is_retry_exhausted(self, correlation_id: UUID) -> bool:
+        """Check if retry budget is exhausted for a correlation ID.
+
+        Args:
+            correlation_id: The correlation ID to check.
+
+        Returns:
+            True if retry attempts exceed max_attempts from config.
+        """
+        return self._get_retry_count(correlation_id) >= self._retry_config.max_attempts
+
     def _create_dispatch_callback(
         self,
         topic: str,
+        consumer_group: str,
     ) -> Callable[[ProtocolEventMessage], Awaitable[None]]:
         """Create callback that bridges Kafka consumer to dispatch engine.
 
         Creates an async callback function that:
         1. Receives ProtocolEventMessage from the Kafka consumer
         2. Deserializes the message value to ModelEventEnvelope
-        3.
-
-
-
-
+        3. Checks idempotency (if enabled) to skip duplicate messages
+        4. Dispatches the envelope to the MessageDispatchEngine
+        5. Classifies errors as content (DLQ) vs infrastructure (fail-fast)
+        6. Manages offset commits based on policy
+
+        Error Classification:
+            Content Errors (ProtocolConfigurationError, ValidationError, JSONDecodeError):
+            - Non-retryable (will never succeed with retry)
+            - Default: DLQ and commit offset
+            - Policy override via dlq_config.on_content_error
+
+            Infrastructure Errors (RuntimeHostError and subclasses):
+            - Potentially retryable (may succeed after service recovery)
+            - Default: Fail-fast (no DLQ, no commit, re-raise)
+            - If retry exhausted and policy allows: DLQ and commit
+            - Policy override via dlq_config.on_infra_exhausted
 
         Args:
             topic: The full topic name for routing context in logs.
+            consumer_group: The consumer group ID used for this topic subscription.
+                This is passed to DLQ publishing to ensure consistent traceability
+                between subscriptions and their associated DLQ messages.
 
         Returns:
             Async callback function compatible with event bus subscribe().
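The retry helpers added in this hunk are plain dict bookkeeping keyed by correlation_id. A self-contained sketch under the stated default of 3 attempts; the `RetryLedger` name is hypothetical, since the wiring inlines this state as `_retry_counts`:

```python
# Sketch of the retry bookkeeping above: a dict keyed by correlation_id,
# incremented per infrastructure error, cleared on success, and compared
# against a max_attempts budget.
from uuid import UUID, uuid4


class RetryLedger:  # hypothetical name
    def __init__(self, max_attempts: int = 3) -> None:
        self._max_attempts = max_attempts
        self._counts: dict[UUID, int] = {}

    def increment(self, correlation_id: UUID) -> int:
        """Record one more failed attempt and return the new count."""
        self._counts[correlation_id] = self._counts.get(correlation_id, 0) + 1
        return self._counts[correlation_id]

    def is_exhausted(self, correlation_id: UUID) -> bool:
        """True once attempts reach the budget."""
        return self._counts.get(correlation_id, 0) >= self._max_attempts

    def clear(self, correlation_id: UUID) -> None:
        """Forget a message after successful processing (or after DLQ)."""
        self._counts.pop(correlation_id, None)


ledger = RetryLedger(max_attempts=3)
cid = uuid4()
for _ in range(3):
    ledger.increment(cid)
assert ledger.is_exhausted(cid)  # third failure spends the budget -> DLQ or fail-fast
```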
@@ -272,34 +567,169 @@ class EventBusSubcontractWiring:
 
         async def callback(message: ProtocolEventMessage) -> None:
             """Process incoming Kafka message and dispatch to engine."""
+            envelope: ModelEventEnvelope[object] | None = None
+            correlation_id: UUID = uuid4()  # Default if not in envelope
+
             try:
                 envelope = self._deserialize_to_envelope(message)
+                correlation_id = envelope.correlation_id or uuid4()
+
+                # Idempotency gate: check for duplicate messages
+                if self._idempotency_store and self._idempotency_config.enabled:
+                    envelope_id = envelope.envelope_id
+                    if envelope_id is None:
+                        # Missing envelope_id is a content error when idempotency is enabled
+                        raise ProtocolConfigurationError(
+                            "Envelope missing envelope_id for idempotency",
+                            context=ModelInfraErrorContext.with_correlation(
+                                correlation_id=correlation_id,
+                                transport_type=EnumInfraTransportType.KAFKA,
+                                operation="idempotency_check",
+                            ),
+                        )
+
+                    is_new = await self._idempotency_store.check_and_record(
+                        message_id=envelope_id,
+                        domain=topic,  # Use topic as domain for namespace isolation
+                        correlation_id=correlation_id,
+                    )
+                    if not is_new:
+                        # Duplicate - skip processing but commit offset to prevent
+                        # infinite redelivery. This is critical: even though we don't
+                        # reprocess the message, we must advance the consumer offset.
+                        self._logger.info(
+                            "idempotency_skip envelope_id=%s topic=%s "
+                            "correlation_id=%s node=%s reason=duplicate_message",
+                            str(envelope_id),
+                            topic,
+                            str(correlation_id),
+                            self._node_name,
+                        )
+                        # Commit offset for duplicate to prevent infinite redelivery
+                        if self._should_commit_after_handler():
+                            await self._commit_offset(message, correlation_id)
+                        return  # Skip dispatch
+
                 # Dispatch via ProtocolDispatchEngine interface
                 await self._dispatch_engine.dispatch(topic, envelope)
-
-
-
+
+                # Success - commit offset if policy requires and clear retry count
+                if self._should_commit_after_handler():
+                    await self._commit_offset(message, correlation_id)
+                    self._clear_retry_count(correlation_id)
+
+            except (json.JSONDecodeError, ValidationError) as e:
+                # Content error: malformed JSON or schema validation failure
+                # These are non-retryable - the message will never parse correctly
+                self._logger.warning(
+                    "content_error_deserialization topic=%s error_type=%s "
+                    "error=%s correlation_id=%s",
                     topic,
-                    e,
+                    type(e).__name__,
+                    str(e),
+                    str(correlation_id),
                 )
-
-
-
+
+                if self._dlq_config.on_content_error == "dlq_and_commit":
+                    await self._publish_to_dlq(
+                        topic, message, e, correlation_id, "content", consumer_group
+                    )
+                    await self._commit_offset(message, correlation_id)
+                    return  # Handled - don't re-raise
+
+                # fail_fast - wrap and re-raise
+                raise ProtocolConfigurationError(
+                    f"Content error: failed to deserialize message from topic '{topic}'",
                     context=ModelInfraErrorContext.with_correlation(
+                        correlation_id=correlation_id,
                         transport_type=EnumInfraTransportType.KAFKA,
                         operation="event_bus_deserialize",
                     ),
                 ) from e
+
+            except ProtocolConfigurationError as e:
+                # Content error: already classified as non-retryable
+                self._logger.warning(
+                    "content_error_configuration topic=%s error=%s correlation_id=%s",
+                    topic,
+                    str(e),
+                    str(correlation_id),
+                )
+
+                if self._dlq_config.on_content_error == "dlq_and_commit":
+                    await self._publish_to_dlq(
+                        topic, message, e, correlation_id, "content", consumer_group
+                    )
+                    await self._commit_offset(message, correlation_id)
+                    return  # Handled - don't re-raise
+
+                # fail_fast - re-raise without wrapping (already proper OnexError)
+                raise
+
+            except RuntimeHostError as e:
+                # Infrastructure error: potentially retryable
+                # Track retry attempts and check exhaustion
+                retry_count = self._increment_retry_count(correlation_id)
+                is_exhausted = self._is_retry_exhausted(correlation_id)
+
+                # TRY400 disabled: logger.error intentional to avoid leaking stack traces
+                self._logger.error(  # noqa: TRY400
+                    "infra_error topic=%s error_type=%s error=%s "
+                    "retry_count=%d max_attempts=%d exhausted=%s correlation_id=%s",
+                    topic,
+                    type(e).__name__,
+                    str(e),
+                    retry_count,
+                    self._retry_config.max_attempts,
+                    is_exhausted,
+                    str(correlation_id),
+                )
+
+                if is_exhausted:
+                    # Retry budget exhausted - check policy
+                    if self._dlq_config.on_infra_exhausted == "dlq_and_commit":
+                        await self._publish_to_dlq(
+                            topic, message, e, correlation_id, "infra", consumer_group
+                        )
+                        await self._commit_offset(message, correlation_id)
+                        self._clear_retry_count(correlation_id)
+                        return  # Handled - don't re-raise
+
+                # fail_fast (default) - re-raise without committing
+                # Kafka will redeliver the message
+                raise
+
             except Exception as e:
+                # Unexpected error - classify as infrastructure error
+                # This catches errors from handlers that aren't properly wrapped
+                retry_count = self._increment_retry_count(correlation_id)
+                is_exhausted = self._is_retry_exhausted(correlation_id)
+
                 self._logger.exception(
-                    "
+                    "unexpected_error topic=%s error_type=%s error=%s "
+                    "retry_count=%d exhausted=%s correlation_id=%s",
                     topic,
-                    e,
+                    type(e).__name__,
+                    str(e),
+                    retry_count,
+                    is_exhausted,
+                    str(correlation_id),
                 )
-
+
+                if is_exhausted:
+                    if self._dlq_config.on_infra_exhausted == "dlq_and_commit":
+                        await self._publish_to_dlq(
+                            topic, message, e, correlation_id, "infra", consumer_group
+                        )
+                        await self._commit_offset(message, correlation_id)
+                        self._clear_retry_count(correlation_id)
+                        return
+
+                # Wrap in RuntimeHostError and re-raise
                 raise RuntimeHostError(
                     f"Failed to dispatch message from topic '{topic}'",
                     context=ModelInfraErrorContext.with_correlation(
+                        correlation_id=correlation_id,
                         transport_type=EnumInfraTransportType.KAFKA,
                         operation="event_bus_dispatch",
                     ),
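Stripped of logging and envelope handling, the callback's error branches reduce to a small decision tree. A hedged sketch using the dlq_and_commit / fail_fast policy strings from this hunk; `handle_failure` and its callable parameters are illustrative names, not the wiring's API:

```python
# Condensed sketch of the callback's error-handling decision tree above.
# dlq() and commit() stand in for _publish_to_dlq / _commit_offset.
from collections.abc import Awaitable, Callable


async def handle_failure(
    category: str,                          # "content" or "infra"
    policy: str,                            # "dlq_and_commit" or "fail_fast"
    retries_exhausted: bool,
    dlq: Callable[[], Awaitable[None]],     # publish the failed message to the DLQ
    commit: Callable[[], Awaitable[None]],  # commit the Kafka offset
) -> bool:
    """Return True if the error was absorbed; False means re-raise (fail fast)."""
    if category == "content":
        # Non-retryable: either park in the DLQ and advance, or fail fast.
        if policy == "dlq_and_commit":
            await dlq()
            await commit()
            return True
        return False
    # Infrastructure: the DLQ is only considered once the retry budget is spent.
    if retries_exhausted and policy == "dlq_and_commit":
        await dlq()
        await commit()
        return True
    return False  # re-raise; Kafka redelivers the message
```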
@@ -368,6 +798,7 @@ class EventBusSubcontractWiring:
             )
 
         self._unsubscribe_callables.clear()
+        self._retry_counts.clear()
 
         if cleanup_count > 0:
             self._logger.info(