omnibase_infra 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (161)
  1. omnibase_infra/__init__.py +1 -1
  2. omnibase_infra/adapters/adapter_onex_tool_execution.py +451 -0
  3. omnibase_infra/capabilities/__init__.py +15 -0
  4. omnibase_infra/capabilities/capability_inference_rules.py +211 -0
  5. omnibase_infra/capabilities/contract_capability_extractor.py +221 -0
  6. omnibase_infra/capabilities/intent_type_extractor.py +160 -0
  7. omnibase_infra/cli/commands.py +1 -1
  8. omnibase_infra/configs/widget_mapping.yaml +176 -0
  9. omnibase_infra/contracts/handlers/filesystem/handler_contract.yaml +5 -2
  10. omnibase_infra/contracts/handlers/mcp/handler_contract.yaml +5 -2
  11. omnibase_infra/enums/__init__.py +6 -0
  12. omnibase_infra/enums/enum_handler_error_type.py +10 -0
  13. omnibase_infra/enums/enum_handler_source_mode.py +72 -0
  14. omnibase_infra/enums/enum_kafka_acks.py +99 -0
  15. omnibase_infra/errors/error_compute_registry.py +4 -1
  16. omnibase_infra/errors/error_event_bus_registry.py +4 -1
  17. omnibase_infra/errors/error_infra.py +3 -1
  18. omnibase_infra/errors/error_policy_registry.py +4 -1
  19. omnibase_infra/event_bus/event_bus_kafka.py +1 -1
  20. omnibase_infra/event_bus/models/config/model_kafka_event_bus_config.py +59 -10
  21. omnibase_infra/handlers/__init__.py +8 -1
  22. omnibase_infra/handlers/handler_consul.py +7 -1
  23. omnibase_infra/handlers/handler_db.py +10 -3
  24. omnibase_infra/handlers/handler_graph.py +10 -5
  25. omnibase_infra/handlers/handler_http.py +8 -2
  26. omnibase_infra/handlers/handler_intent.py +387 -0
  27. omnibase_infra/handlers/handler_mcp.py +745 -63
  28. omnibase_infra/handlers/handler_vault.py +11 -5
  29. omnibase_infra/handlers/mixins/mixin_consul_kv.py +4 -3
  30. omnibase_infra/handlers/mixins/mixin_consul_service.py +2 -1
  31. omnibase_infra/handlers/registration_storage/handler_registration_storage_postgres.py +7 -0
  32. omnibase_infra/handlers/service_discovery/handler_service_discovery_consul.py +308 -4
  33. omnibase_infra/handlers/service_discovery/models/model_service_info.py +10 -0
  34. omnibase_infra/mixins/mixin_async_circuit_breaker.py +3 -2
  35. omnibase_infra/mixins/mixin_node_introspection.py +42 -7
  36. omnibase_infra/mixins/mixin_retry_execution.py +1 -1
  37. omnibase_infra/models/discovery/model_introspection_config.py +11 -0
  38. omnibase_infra/models/handlers/__init__.py +48 -5
  39. omnibase_infra/models/handlers/model_bootstrap_handler_descriptor.py +162 -0
  40. omnibase_infra/models/handlers/model_contract_discovery_result.py +6 -4
  41. omnibase_infra/models/handlers/model_handler_descriptor.py +15 -0
  42. omnibase_infra/models/handlers/model_handler_source_config.py +220 -0
  43. omnibase_infra/models/mcp/__init__.py +15 -0
  44. omnibase_infra/models/mcp/model_mcp_contract_config.py +80 -0
  45. omnibase_infra/models/mcp/model_mcp_server_config.py +67 -0
  46. omnibase_infra/models/mcp/model_mcp_tool_definition.py +73 -0
  47. omnibase_infra/models/mcp/model_mcp_tool_parameter.py +35 -0
  48. omnibase_infra/models/registration/model_node_capabilities.py +11 -0
  49. omnibase_infra/models/registration/model_node_introspection_event.py +9 -0
  50. omnibase_infra/models/runtime/model_handler_contract.py +25 -9
  51. omnibase_infra/models/runtime/model_loaded_handler.py +9 -0
  52. omnibase_infra/nodes/architecture_validator/contract_architecture_validator.yaml +0 -5
  53. omnibase_infra/nodes/architecture_validator/registry/registry_infra_architecture_validator.py +17 -10
  54. omnibase_infra/nodes/effects/contract.yaml +0 -5
  55. omnibase_infra/nodes/node_registration_orchestrator/contract.yaml +7 -0
  56. omnibase_infra/nodes/node_registration_orchestrator/handlers/handler_node_introspected.py +86 -1
  57. omnibase_infra/nodes/node_registration_orchestrator/introspection_event_router.py +3 -3
  58. omnibase_infra/nodes/node_registration_orchestrator/plugin.py +1 -1
  59. omnibase_infra/nodes/node_registration_orchestrator/registry/registry_infra_node_registration_orchestrator.py +9 -8
  60. omnibase_infra/nodes/node_registration_orchestrator/timeout_coordinator.py +4 -3
  61. omnibase_infra/nodes/node_registration_orchestrator/wiring.py +14 -13
  62. omnibase_infra/nodes/node_registration_storage_effect/contract.yaml +0 -5
  63. omnibase_infra/nodes/node_registration_storage_effect/node.py +4 -1
  64. omnibase_infra/nodes/node_registration_storage_effect/registry/registry_infra_registration_storage.py +47 -26
  65. omnibase_infra/nodes/node_registry_effect/contract.yaml +0 -5
  66. omnibase_infra/nodes/node_registry_effect/handlers/handler_partial_retry.py +2 -1
  67. omnibase_infra/nodes/node_service_discovery_effect/registry/registry_infra_service_discovery.py +28 -20
  68. omnibase_infra/plugins/examples/plugin_json_normalizer.py +2 -2
  69. omnibase_infra/plugins/examples/plugin_json_normalizer_error_handling.py +2 -2
  70. omnibase_infra/plugins/plugin_compute_base.py +16 -2
  71. omnibase_infra/protocols/__init__.py +2 -0
  72. omnibase_infra/protocols/protocol_container_aware.py +200 -0
  73. omnibase_infra/protocols/protocol_event_projector.py +1 -1
  74. omnibase_infra/runtime/__init__.py +90 -1
  75. omnibase_infra/runtime/binding_config_resolver.py +102 -37
  76. omnibase_infra/runtime/constants_notification.py +75 -0
  77. omnibase_infra/runtime/contract_handler_discovery.py +6 -1
  78. omnibase_infra/runtime/handler_bootstrap_source.py +507 -0
  79. omnibase_infra/runtime/handler_contract_config_loader.py +603 -0
  80. omnibase_infra/runtime/handler_contract_source.py +267 -186
  81. omnibase_infra/runtime/handler_identity.py +81 -0
  82. omnibase_infra/runtime/handler_plugin_loader.py +19 -2
  83. omnibase_infra/runtime/handler_registry.py +11 -3
  84. omnibase_infra/runtime/handler_source_resolver.py +326 -0
  85. omnibase_infra/runtime/mixin_semver_cache.py +25 -1
  86. omnibase_infra/runtime/mixins/__init__.py +7 -0
  87. omnibase_infra/runtime/mixins/mixin_projector_notification_publishing.py +566 -0
  88. omnibase_infra/runtime/mixins/mixin_projector_sql_operations.py +31 -10
  89. omnibase_infra/runtime/models/__init__.py +24 -0
  90. omnibase_infra/runtime/models/model_health_check_result.py +2 -1
  91. omnibase_infra/runtime/models/model_projector_notification_config.py +171 -0
  92. omnibase_infra/runtime/models/model_transition_notification_outbox_config.py +112 -0
  93. omnibase_infra/runtime/models/model_transition_notification_outbox_metrics.py +140 -0
  94. omnibase_infra/runtime/models/model_transition_notification_publisher_metrics.py +357 -0
  95. omnibase_infra/runtime/projector_plugin_loader.py +1 -1
  96. omnibase_infra/runtime/projector_shell.py +229 -1
  97. omnibase_infra/runtime/protocol_lifecycle_executor.py +6 -6
  98. omnibase_infra/runtime/protocols/__init__.py +10 -0
  99. omnibase_infra/runtime/registry/registry_protocol_binding.py +16 -15
  100. omnibase_infra/runtime/registry_contract_source.py +693 -0
  101. omnibase_infra/runtime/registry_policy.py +9 -326
  102. omnibase_infra/runtime/secret_resolver.py +4 -2
  103. omnibase_infra/runtime/service_kernel.py +11 -3
  104. omnibase_infra/runtime/service_message_dispatch_engine.py +4 -2
  105. omnibase_infra/runtime/service_runtime_host_process.py +589 -106
  106. omnibase_infra/runtime/transition_notification_outbox.py +1190 -0
  107. omnibase_infra/runtime/transition_notification_publisher.py +764 -0
  108. omnibase_infra/runtime/util_container_wiring.py +6 -5
  109. omnibase_infra/runtime/util_wiring.py +17 -4
  110. omnibase_infra/schemas/schema_transition_notification_outbox.sql +245 -0
  111. omnibase_infra/services/__init__.py +21 -0
  112. omnibase_infra/services/corpus_capture.py +7 -1
  113. omnibase_infra/services/mcp/__init__.py +31 -0
  114. omnibase_infra/services/mcp/mcp_server_lifecycle.py +449 -0
  115. omnibase_infra/services/mcp/service_mcp_tool_discovery.py +411 -0
  116. omnibase_infra/services/mcp/service_mcp_tool_registry.py +329 -0
  117. omnibase_infra/services/mcp/service_mcp_tool_sync.py +547 -0
  118. omnibase_infra/services/registry_api/__init__.py +40 -0
  119. omnibase_infra/services/registry_api/main.py +261 -0
  120. omnibase_infra/services/registry_api/models/__init__.py +66 -0
  121. omnibase_infra/services/registry_api/models/model_capability_widget_mapping.py +38 -0
  122. omnibase_infra/services/registry_api/models/model_pagination_info.py +48 -0
  123. omnibase_infra/services/registry_api/models/model_registry_discovery_response.py +73 -0
  124. omnibase_infra/services/registry_api/models/model_registry_health_response.py +49 -0
  125. omnibase_infra/services/registry_api/models/model_registry_instance_view.py +88 -0
  126. omnibase_infra/services/registry_api/models/model_registry_node_view.py +88 -0
  127. omnibase_infra/services/registry_api/models/model_registry_summary.py +60 -0
  128. omnibase_infra/services/registry_api/models/model_response_list_instances.py +43 -0
  129. omnibase_infra/services/registry_api/models/model_response_list_nodes.py +51 -0
  130. omnibase_infra/services/registry_api/models/model_warning.py +49 -0
  131. omnibase_infra/services/registry_api/models/model_widget_defaults.py +28 -0
  132. omnibase_infra/services/registry_api/models/model_widget_mapping.py +51 -0
  133. omnibase_infra/services/registry_api/routes.py +371 -0
  134. omnibase_infra/services/registry_api/service.py +837 -0
  135. omnibase_infra/services/service_capability_query.py +4 -4
  136. omnibase_infra/services/service_health.py +3 -2
  137. omnibase_infra/services/service_timeout_emitter.py +20 -3
  138. omnibase_infra/services/service_timeout_scanner.py +7 -3
  139. omnibase_infra/services/session/__init__.py +56 -0
  140. omnibase_infra/services/session/config_consumer.py +120 -0
  141. omnibase_infra/services/session/config_store.py +139 -0
  142. omnibase_infra/services/session/consumer.py +1007 -0
  143. omnibase_infra/services/session/protocol_session_aggregator.py +117 -0
  144. omnibase_infra/services/session/store.py +997 -0
  145. omnibase_infra/utils/__init__.py +19 -0
  146. omnibase_infra/utils/util_atomic_file.py +261 -0
  147. omnibase_infra/utils/util_db_transaction.py +239 -0
  148. omnibase_infra/utils/util_dsn_validation.py +1 -1
  149. omnibase_infra/utils/util_retry_optimistic.py +281 -0
  150. omnibase_infra/validation/__init__.py +3 -19
  151. omnibase_infra/validation/contracts/security.validation.yaml +114 -0
  152. omnibase_infra/validation/infra_validators.py +35 -24
  153. omnibase_infra/validation/validation_exemptions.yaml +140 -9
  154. omnibase_infra/validation/validator_chain_propagation.py +2 -2
  155. omnibase_infra/validation/validator_runtime_shape.py +1 -1
  156. omnibase_infra/validation/validator_security.py +473 -370
  157. {omnibase_infra-0.2.1.dist-info → omnibase_infra-0.2.3.dist-info}/METADATA +3 -3
  158. {omnibase_infra-0.2.1.dist-info → omnibase_infra-0.2.3.dist-info}/RECORD +161 -98
  159. {omnibase_infra-0.2.1.dist-info → omnibase_infra-0.2.3.dist-info}/WHEEL +0 -0
  160. {omnibase_infra-0.2.1.dist-info → omnibase_infra-0.2.3.dist-info}/entry_points.txt +0 -0
  161. {omnibase_infra-0.2.1.dist-info → omnibase_infra-0.2.3.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,1007 @@
1
+ # SPDX-License-Identifier: MIT
2
+ # Copyright (c) 2025 OmniNode Team
3
+ """Kafka consumer for Claude Code session events.
4
+
5
+ Implements at-least-once delivery with manual offset commits.
6
+ Events are processed through a ProtocolSessionAggregator.
7
+
8
+ This consumer subscribes to Claude Code hook event topics and processes
9
+ incoming events through the session aggregation system. It implements
10
+ several resilience patterns:
11
+
12
+ - At-least-once delivery: Manual offset commits after successful processing
13
+ - Circuit breaker: Prevents cascade failures when downstream is unhealthy
14
+ - Graceful shutdown: Properly drains and commits before exiting
15
+ - Observability: Structured logging with correlation IDs
16
+
17
+ Architecture:
18
+ ```
19
+ Kafka Topics (session/prompt/tool)
20
+ |
21
+ v
22
+ SessionEventConsumer
23
+ |
24
+ v (process_event)
25
+ ProtocolSessionAggregator
26
+ |
27
+ v
28
+ Session Snapshots (storage)
29
+ ```
30
+
31
+ Related Tickets:
32
+ - OMN-1401: Session storage in OmniMemory (current)
33
+ - OMN-1400: Hook handlers emit to Kafka
34
+ - OMN-1402: Learning compute node (consumer of snapshots)
35
+ - OMN-1526: Moved from omniclaude to omnibase_infra
36
+
37
+ Example:
38
+ >>> from omnibase_infra.services.session import SessionEventConsumer, ConfigSessionConsumer
39
+ >>> from my_aggregator import MySessionAggregator
40
+ >>>
41
+ >>> config = ConfigSessionConsumer()
42
+ >>> aggregator = MySessionAggregator()
43
+ >>> consumer = SessionEventConsumer(config=config, aggregator=aggregator)
44
+ >>>
45
+ >>> # Start consuming (blocking)
46
+ >>> await consumer.start()
47
+ >>>
48
+ >>> # Or use context manager
49
+ >>> async with consumer:
50
+ ... await consumer.run()
51
+
52
+ Moved from omniclaude as part of OMN-1526 architectural cleanup.
53
+ """
54
+
55
+ from __future__ import annotations
56
+
57
+ import asyncio
58
+ import logging
59
+ from datetime import UTC, datetime
60
+ from enum import StrEnum
61
+ from uuid import UUID, uuid4
62
+
63
+ from aiokafka import AIOKafkaConsumer
64
+ from aiokafka.errors import KafkaError
65
+ from pydantic import ValidationError
66
+
67
+ from omnibase_infra.services.session.config_consumer import ConfigSessionConsumer
68
+ from omnibase_infra.services.session.protocol_session_aggregator import (
69
+ ProtocolSessionAggregator,
70
+ )
71
+
72
+ # TODO(OMN-1526): These imports need resolution - schemas remain in omniclaude
73
+ # The consumer depends on hook event schemas which are domain-specific to omniclaude.
74
+ # Options to resolve:
75
+ # 1. Move schemas to a shared package (omnibase-schemas)
76
+ # 2. Pass schema types as generic parameters
77
+ # 3. Use raw dict processing without schema validation
78
+ #
79
+ # For now, commenting out the direct imports and using a protocol-based approach.
80
+ #
81
+ # Original imports from omniclaude:
82
+ # from omniclaude.hooks.schemas import (
83
+ # HookEventType,
84
+ # ModelHookEventEnvelope,
85
+ # ModelHookPromptSubmittedPayload,
86
+ # ModelHookSessionEndedPayload,
87
+ # ModelHookSessionStartedPayload,
88
+ # ModelHookToolExecutedPayload,
89
+ # )
90
+
91
+
92
+ logger = logging.getLogger(__name__)
93
+
94
+
95
+ # =============================================================================
96
+ # Enums
97
+ # =============================================================================
98
+
99
+
100
+ class EnumCircuitState(StrEnum):
101
+ """Circuit breaker states.
102
+
103
+ The circuit breaker protects the consumer from cascade failures when
104
+ the downstream aggregator is unhealthy.
105
+
106
+ State Transitions:
107
+ CLOSED -> OPEN: After consecutive_failures >= threshold
108
+ OPEN -> HALF_OPEN: After circuit_breaker_timeout_seconds elapsed
109
+ HALF_OPEN -> CLOSED: After successful processing
110
+ HALF_OPEN -> OPEN: After failure in half-open state
111
+ """
112
+
113
+ CLOSED = "closed"
114
+ OPEN = "open"
115
+ HALF_OPEN = "half_open"
116
+
117
+
118
+ # =============================================================================
119
+ # Consumer Metrics
120
+ # =============================================================================
121
+
122
+
123
+ class ConsumerMetrics:
124
+ """Metrics tracking for the session event consumer.
125
+
126
+ Tracks processing statistics for observability and monitoring.
127
+ Thread-safe via asyncio lock protection.
128
+
129
+ Attributes:
130
+ messages_received: Total messages received from Kafka.
131
+ messages_processed: Successfully processed messages.
132
+ messages_failed: Messages that failed processing.
133
+ messages_skipped: Messages skipped (invalid, duplicate, etc.).
134
+ circuit_opens: Number of times circuit breaker opened.
135
+ last_message_at: Timestamp of last received message.
136
+ """
137
+
138
+ def __init__(self) -> None:
139
+ """Initialize metrics with zero values."""
140
+ self.messages_received: int = 0
141
+ self.messages_processed: int = 0
142
+ self.messages_failed: int = 0
143
+ self.messages_skipped: int = 0
144
+ self.circuit_opens: int = 0
145
+ self.last_message_at: datetime | None = None
146
+ self._lock = asyncio.Lock()
147
+
148
+ async def record_received(self) -> None:
149
+ """Record a message received."""
150
+ async with self._lock:
151
+ self.messages_received += 1
152
+ self.last_message_at = datetime.now(UTC)
153
+
154
+ async def record_processed(self) -> None:
155
+ """Record a successfully processed message."""
156
+ async with self._lock:
157
+ self.messages_processed += 1
158
+
159
+ async def record_failed(self) -> None:
160
+ """Record a failed message."""
161
+ async with self._lock:
162
+ self.messages_failed += 1
163
+
164
+ async def record_skipped(self) -> None:
165
+ """Record a skipped message."""
166
+ async with self._lock:
167
+ self.messages_skipped += 1
168
+
169
+ async def record_circuit_open(self) -> None:
170
+ """Record a circuit breaker open event."""
171
+ async with self._lock:
172
+ self.circuit_opens += 1
173
+
174
+ async def snapshot(self) -> dict[str, object]:
175
+ """Get a snapshot of current metrics.
176
+
177
+ Returns:
178
+ Dictionary with all metric values.
179
+ """
180
+ async with self._lock:
181
+ return {
182
+ "messages_received": self.messages_received,
183
+ "messages_processed": self.messages_processed,
184
+ "messages_failed": self.messages_failed,
185
+ "messages_skipped": self.messages_skipped,
186
+ "circuit_opens": self.circuit_opens,
187
+ "last_message_at": (
188
+ self.last_message_at.isoformat() if self.last_message_at else None
189
+ ),
190
+ }
191
+
192
+
193
+ # =============================================================================
194
+ # Session Event Consumer
195
+ # =============================================================================
196
+
197
+
198
+ class SessionEventConsumer:
199
+ """Kafka consumer for Claude Code hook events.
200
+
201
+ Consumes events from session/prompt/tool topics and processes
202
+ them through an aggregator. Implements at-least-once delivery
203
+ with manual offset commits.
204
+
205
+ Features:
206
+ - **At-least-once delivery**: Offsets committed only after successful
207
+ processing. If the consumer crashes before commit, messages will be
208
+ reprocessed on restart (aggregator must be idempotent).
209
+
210
+ - **Circuit breaker**: Protects against cascade failures when the
211
+ downstream aggregator is unhealthy. Opens after consecutive failures
212
+ exceed threshold, allowing time for recovery.
213
+
214
+ - **Graceful shutdown**: Drains in-flight messages and commits offsets
215
+ before exiting. Responds to stop() or SIGTERM signals.
216
+
217
+ - **Observability**: Structured logging with correlation IDs, plus
218
+ metrics tracking for monitoring dashboards.
219
+
220
+ Thread Safety:
221
+ This consumer is designed for single-threaded async execution.
222
+ Multiple consumers can run in parallel with different group_ids
223
+ for horizontal scaling.
224
+
225
+ Example:
226
+ >>> config = ConfigSessionConsumer()
227
+ >>> aggregator = InMemorySessionAggregator()
228
+ >>> consumer = SessionEventConsumer(config=config, aggregator=aggregator)
229
+ >>>
230
+ >>> # Start consuming
231
+ >>> await consumer.start()
232
+ >>>
233
+ >>> # Or in application lifecycle
234
+ >>> await consumer.start()
235
+ >>> try:
236
+ ... await consumer.run()
237
+ ... finally:
238
+ ... await consumer.stop()
239
+
240
+ Attributes:
241
+ metrics: Consumer metrics for observability.
242
+ is_running: Whether the consumer is currently running.
243
+ circuit_state: Current circuit breaker state.
244
+ """
245
+
246
def __init__(
    self,
    config: ConfigSessionConsumer,
    aggregator: ProtocolSessionAggregator,
) -> None:
    """Initialize the session event consumer.

    No network I/O happens here; the Kafka connection is established
    lazily in start().

    Args:
        config: Consumer configuration (topics, timeouts, circuit breaker).
        aggregator: Session aggregator implementing ProtocolSessionAggregator.
            Must be idempotent, since at-least-once delivery can replay
            events after a crash-before-commit.
    """
    self._config = config
    self._aggregator = aggregator

    # Kafka client is created/connected in start() and torn down in stop().
    self._consumer: AIOKafkaConsumer | None = None
    self._running = False
    self._shutdown_event = asyncio.Event()

    # Circuit breaker bookkeeping.
    self._consecutive_failures = 0
    self._circuit_state = EnumCircuitState.CLOSED
    self._circuit_opened_at: datetime | None = None
    self._circuit_lock = asyncio.Lock()
    self._consumer_paused = False  # True while partitions are paused by the breaker
    self._half_open_successes = 0  # successes observed while HALF_OPEN

    # Observability.
    self.metrics = ConsumerMetrics()

    # Short unique ID so log lines from parallel consumers are separable.
    self._consumer_id = f"session-consumer-{uuid4().hex[:8]}"

    logger.info(
        "SessionEventConsumer initialized",
        extra={
            "consumer_id": self._consumer_id,
            "topics": self._config.topics,
            "group_id": self._config.group_id,
            "bootstrap_servers": self._config.bootstrap_servers,
        },
    )
295
+
296
+ # =========================================================================
297
+ # Properties
298
+ # =========================================================================
299
+
300
@property
def is_running(self) -> bool:
    """Whether start() has completed and stop() has not yet been called."""
    return self._running
308
+
309
@property
def circuit_state(self) -> EnumCircuitState:
    """Current circuit breaker state (CLOSED, OPEN, or HALF_OPEN)."""
    return self._circuit_state
317
+
318
@property
def consumer_id(self) -> str:
    """Unique identifier used to tag this consumer's log lines and traces."""
    return self._consumer_id
326
+
327
+ # =========================================================================
328
+ # Lifecycle Methods
329
+ # =========================================================================
330
+
331
async def start(self) -> None:
    """Connect to Kafka and mark the consumer as running.

    Auto-commit is disabled so offsets are committed manually after
    successful processing (at-least-once semantics). Calling start()
    while already running logs a warning and returns without effect.

    Raises:
        KafkaError: If the connection to Kafka fails.
    """
    # Idempotent: a second start() is a logged no-op.
    if self._running:
        logger.warning(
            "Consumer already running",
            extra={"consumer_id": self._consumer_id},
        )
        return

    correlation_id = uuid4()

    logger.info(
        "Starting SessionEventConsumer",
        extra={
            "consumer_id": self._consumer_id,
            "correlation_id": str(correlation_id),
            "topics": self._config.topics,
        },
    )

    try:
        self._consumer = AIOKafkaConsumer(
            *self._config.topics,
            bootstrap_servers=self._config.bootstrap_servers,
            group_id=self._config.group_id,
            auto_offset_reset=self._config.auto_offset_reset,
            enable_auto_commit=False,  # Manual commits for at-least-once
            max_poll_records=self._config.max_poll_records,
        )
        await self._consumer.start()
        self._running = True
        self._shutdown_event.clear()
    except KafkaError as e:
        logger.exception(
            "Failed to start consumer",
            extra={
                "consumer_id": self._consumer_id,
                "correlation_id": str(correlation_id),
                "error": str(e),
            },
        )
        raise
    else:
        logger.info(
            "SessionEventConsumer started",
            extra={
                "consumer_id": self._consumer_id,
                "correlation_id": str(correlation_id),
                "topics": self._config.topics,
                "group_id": self._config.group_id,
            },
        )
397
+
398
async def stop(self) -> None:
    """Stop the consumer gracefully; safe to call multiple times.

    Signals the consume loop to exit, resumes partitions if the circuit
    breaker left them paused, closes the Kafka connection, and logs a
    final metrics snapshot. Calling stop() when not running is a no-op.
    """
    if not self._running:
        logger.debug(
            "Consumer not running, nothing to stop",
            extra={"consumer_id": self._consumer_id},
        )
        return

    correlation_id = uuid4()

    logger.info(
        "Stopping SessionEventConsumer",
        extra={
            "consumer_id": self._consumer_id,
            "correlation_id": str(correlation_id),
        },
    )

    # Flag shutdown so the consume loop exits at its next iteration.
    self._running = False
    self._shutdown_event.set()

    # Un-pause before closing so the client is left in a clean state.
    if self._consumer is not None and self._consumer_paused:
        await self._resume_consumer(correlation_id)

    if self._consumer is not None:
        try:
            await self._consumer.stop()
        except Exception as e:
            # Best-effort close: log the problem and continue shutdown.
            logger.warning(
                "Error stopping Kafka consumer",
                extra={
                    "consumer_id": self._consumer_id,
                    "correlation_id": str(correlation_id),
                    "error": str(e),
                },
            )
        finally:
            self._consumer = None

    # Emit final counters so operators can see what this run handled.
    metrics_snapshot = await self.metrics.snapshot()
    logger.info(
        "SessionEventConsumer stopped",
        extra={
            "consumer_id": self._consumer_id,
            "correlation_id": str(correlation_id),
            "final_metrics": metrics_snapshot,
        },
    )
460
+
461
async def run(self) -> None:
    """Block and consume messages until stop() or a fatal error.

    Offsets are committed only after successful processing, giving
    at-least-once delivery. Must be preceded by a successful start().

    Raises:
        RuntimeError: If start() has not been called first.
    """
    # Guard: run() without start() would dereference a None consumer.
    if not self._running or self._consumer is None:
        raise RuntimeError(
            "Consumer not started. Call start() before run().",
        )

    correlation_id = uuid4()

    logger.info(
        "Starting consume loop",
        extra={
            "consumer_id": self._consumer_id,
            "correlation_id": str(correlation_id),
        },
    )

    await self._consume_loop(correlation_id)
494
+
495
async def __aenter__(self) -> SessionEventConsumer:
    """Start the consumer when entering an ``async with`` block.

    Returns:
        Self, so the consumer can be bound with ``as`` and chained.
    """
    await self.start()
    return self
509
+
510
async def __aexit__(
    self,
    exc_type: type[BaseException] | None,
    exc_val: BaseException | None,
    exc_tb: object,
) -> None:
    """Stop the consumer when leaving an ``async with`` block."""
    await self.stop()
521
+
522
+ # =========================================================================
523
+ # Consume Loop
524
+ # =========================================================================
525
+
526
async def _consume_loop(self, correlation_id: UUID) -> None:
    """Poll Kafka and process messages with at-least-once semantics.

    Commit policy per message:
      - processed successfully      -> commit
      - rejected by aggregator      -> commit (avoids reprocessing)
      - schema ValidationError      -> commit (malformed; retry is futile)
      - any other processing error  -> NO commit (message is redelivered)

    Args:
        correlation_id: Correlation ID for tracing this consume session.
    """
    if self._consumer is None:
        # Defensive: run() already checks this, but guard direct calls.
        logger.error(
            "Consumer is None in consume loop",
            extra={
                "consumer_id": self._consumer_id,
                "correlation_id": str(correlation_id),
            },
        )
        return

    try:
        async for record in self._consumer:
            # Honor shutdown before doing any work on this record.
            if not self._running:
                logger.debug(
                    "Shutdown signal received, exiting consume loop",
                    extra={
                        "consumer_id": self._consumer_id,
                        "correlation_id": str(correlation_id),
                    },
                )
                break

            await self.metrics.record_received()

            # If the circuit is open, block until it recovers, then
            # process this same record -- nothing is dropped while open.
            if await self._is_circuit_open():
                await self._wait_for_circuit_recovery(correlation_id)

            msg_cid = uuid4()
            try:
                accepted = await self._process_message(record, msg_cid)

                if accepted:
                    # Commit only after the aggregator succeeded.
                    await self._consumer.commit()
                    await self.metrics.record_processed()
                    await self._record_success()

                    logger.debug(
                        "Message processed and committed",
                        extra={
                            "consumer_id": self._consumer_id,
                            "correlation_id": str(msg_cid),
                            "topic": record.topic,
                            "partition": record.partition,
                            "offset": record.offset,
                        },
                    )
                else:
                    # Rejected (duplicate etc.); commit so it is not
                    # redelivered forever.
                    await self._consumer.commit()
                    await self.metrics.record_skipped()

                    logger.debug(
                        "Message skipped (rejected by aggregator)",
                        extra={
                            "consumer_id": self._consumer_id,
                            "correlation_id": str(msg_cid),
                            "topic": record.topic,
                        },
                    )

            except ValidationError as e:
                # Malformed payload: committing is safe because a retry
                # could never succeed.
                await self._consumer.commit()
                await self.metrics.record_skipped()

                logger.warning(
                    "Message skipped due to validation error",
                    extra={
                        "consumer_id": self._consumer_id,
                        "correlation_id": str(msg_cid),
                        "topic": record.topic,
                        "error": str(e),
                    },
                )

            except Exception as e:
                # Transient failure: leave the offset uncommitted so the
                # message is redelivered, and feed the circuit breaker.
                await self.metrics.record_failed()
                await self._record_failure()

                logger.exception(
                    "Error processing message",
                    extra={
                        "consumer_id": self._consumer_id,
                        "correlation_id": str(msg_cid),
                        "topic": record.topic,
                        "partition": record.partition,
                        "offset": record.offset,
                        "error": str(e),
                        "consecutive_failures": self._consecutive_failures,
                    },
                )

    except asyncio.CancelledError:
        logger.info(
            "Consume loop cancelled",
            extra={
                "consumer_id": self._consumer_id,
                "correlation_id": str(correlation_id),
            },
        )
        raise

    except Exception as e:
        logger.exception(
            "Unexpected error in consume loop",
            extra={
                "consumer_id": self._consumer_id,
                "correlation_id": str(correlation_id),
                "error": str(e),
            },
        )
        raise

    finally:
        logger.info(
            "Consume loop exiting",
            extra={
                "consumer_id": self._consumer_id,
                "correlation_id": str(correlation_id),
            },
        )
668
+ )
669
+
670
+ # =========================================================================
671
+ # Message Processing
672
+ # =========================================================================
673
+
674
+ async def _process_message(self, message: object, correlation_id: UUID) -> bool:
675
+ """Process a single message through the aggregator.
676
+
677
+ Deserializes the message payload and dispatches it to the aggregator.
678
+
679
+ Note: Schema validation is delegated to the aggregator since schemas
680
+ are domain-specific (omniclaude). This consumer is schema-agnostic.
681
+
682
+ Args:
683
+ message: Kafka ConsumerRecord with topic, value, etc.
684
+ correlation_id: Correlation ID for this processing attempt.
685
+
686
+ Returns:
687
+ True if processed successfully, False if rejected (duplicate, etc.).
688
+
689
+ Raises:
690
+ ValidationError: If the message payload fails schema validation.
691
+ Exception: If the aggregator raises an error during processing.
692
+ """
693
+ # Extract message value
694
+ value = getattr(message, "value", None)
695
+ if value is None:
696
+ logger.warning(
697
+ "Message has no value",
698
+ extra={
699
+ "consumer_id": self._consumer_id,
700
+ "correlation_id": str(correlation_id),
701
+ "topic": getattr(message, "topic", "unknown"),
702
+ },
703
+ )
704
+ return False
705
+
706
+ # Decode bytes to string
707
+ if isinstance(value, bytes):
708
+ value = value.decode("utf-8")
709
+
710
+ # TODO(OMN-1526): Schema parsing moved to aggregator
711
+ # The original code parsed ModelHookEventEnvelope here, but that
712
+ # creates a dependency on omniclaude.hooks.schemas. The aggregator
713
+ # is now responsible for schema validation.
714
+ #
715
+ # Original code:
716
+ # envelope = ModelHookEventEnvelope.model_validate_json(value)
717
+ # payload = envelope.payload
718
+ # result = await self._aggregator.process_event(envelope, correlation_id)
719
+
720
+ logger.debug(
721
+ "Processing event",
722
+ extra={
723
+ "consumer_id": self._consumer_id,
724
+ "correlation_id": str(correlation_id),
725
+ "topic": getattr(message, "topic", "unknown"),
726
+ },
727
+ )
728
+
729
+ # Pass raw JSON string to aggregator - let it handle schema validation
730
+ result = await self._aggregator.process_event(value, correlation_id)
731
+
732
+ return result
733
+
734
+ # =========================================================================
735
+ # Circuit Breaker
736
+ # =========================================================================
737
+
738
+ async def _is_circuit_open(self) -> bool:
739
+ """Check if circuit breaker is open.
740
+
741
+ If the circuit is open, checks if enough time has passed to
742
+ transition to half-open state for a test request.
743
+
744
+ Returns:
745
+ True if circuit is open and should block processing.
746
+ """
747
+ async with self._circuit_lock:
748
+ if self._circuit_state == EnumCircuitState.CLOSED:
749
+ return False
750
+
751
+ if self._circuit_state == EnumCircuitState.HALF_OPEN:
752
+ # Allow test request
753
+ return False
754
+
755
+ # Circuit is OPEN - check if timeout has elapsed
756
+ if self._circuit_opened_at is not None:
757
+ elapsed = (datetime.now(UTC) - self._circuit_opened_at).total_seconds()
758
+ if elapsed >= self._config.circuit_breaker_timeout_seconds:
759
+ # Transition to half-open
760
+ self._circuit_state = EnumCircuitState.HALF_OPEN
761
+ logger.info(
762
+ "Circuit breaker transitioning to half-open",
763
+ extra={
764
+ "consumer_id": self._consumer_id,
765
+ "elapsed_seconds": elapsed,
766
+ },
767
+ )
768
+ return False
769
+
770
+ return True
771
+
772
+ async def _wait_for_circuit_recovery(self, correlation_id: UUID) -> None:
773
+ """Pause consumer and wait for circuit breaker to recover.
774
+
775
+ Called when the circuit is open. This method:
776
+ 1. Pauses the Kafka consumer to stop fetching new messages
777
+ 2. Waits in a loop until circuit transitions to HALF_OPEN or CLOSED
778
+ 3. Resumes the consumer before returning
779
+
780
+ This ensures no messages are lost during circuit open state - the current
781
+ message will be processed after this method returns, and no new messages
782
+ are fetched while waiting.
783
+
784
+ Args:
785
+ correlation_id: Correlation ID for logging.
786
+ """
787
+ if self._consumer is None:
788
+ return
789
+
790
+ # Pause the consumer to stop fetching new messages
791
+ await self._pause_consumer(correlation_id)
792
+
793
+ logger.warning(
794
+ "Circuit breaker is open, consumer paused - waiting for recovery",
795
+ extra={
796
+ "consumer_id": self._consumer_id,
797
+ "correlation_id": str(correlation_id),
798
+ "timeout_seconds": self._config.circuit_breaker_timeout_seconds,
799
+ },
800
+ )
801
+
802
+ # Wait in a loop until circuit is no longer open
803
+ check_interval = min(1.0, self._config.circuit_breaker_timeout_seconds / 10)
804
+ while self._running:
805
+ # Check if circuit has recovered
806
+ if not await self._is_circuit_open():
807
+ logger.info(
808
+ "Circuit breaker recovered, resuming consumer",
809
+ extra={
810
+ "consumer_id": self._consumer_id,
811
+ "correlation_id": str(correlation_id),
812
+ "circuit_state": self._circuit_state.value,
813
+ },
814
+ )
815
+ break
816
+
817
+ # Check for shutdown signal
818
+ if self._shutdown_event.is_set():
819
+ logger.debug(
820
+ "Shutdown signal received while waiting for circuit recovery",
821
+ extra={
822
+ "consumer_id": self._consumer_id,
823
+ "correlation_id": str(correlation_id),
824
+ },
825
+ )
826
+ break
827
+
828
+ # Wait before checking again
829
+ await asyncio.sleep(check_interval)
830
+
831
+ # Resume the consumer before returning
832
+ await self._resume_consumer(correlation_id)
833
+
834
+ async def _pause_consumer(self, correlation_id: UUID) -> None:
835
+ """Pause the Kafka consumer on all assigned partitions.
836
+
837
+ Args:
838
+ correlation_id: Correlation ID for logging.
839
+ """
840
+ if self._consumer is None or self._consumer_paused:
841
+ return
842
+
843
+ try:
844
+ partitions = self._consumer.assignment()
845
+ if partitions:
846
+ self._consumer.pause(*partitions)
847
+ self._consumer_paused = True
848
+ logger.debug(
849
+ "Consumer paused",
850
+ extra={
851
+ "consumer_id": self._consumer_id,
852
+ "correlation_id": str(correlation_id),
853
+ "partitions": [str(p) for p in partitions],
854
+ },
855
+ )
856
+ except Exception as e:
857
+ logger.warning(
858
+ "Failed to pause consumer",
859
+ extra={
860
+ "consumer_id": self._consumer_id,
861
+ "correlation_id": str(correlation_id),
862
+ "error": str(e),
863
+ },
864
+ )
865
+
866
+ async def _resume_consumer(self, correlation_id: UUID) -> None:
867
+ """Resume the Kafka consumer on all assigned partitions.
868
+
869
+ Args:
870
+ correlation_id: Correlation ID for logging.
871
+ """
872
+ if self._consumer is None or not self._consumer_paused:
873
+ return
874
+
875
+ try:
876
+ partitions = self._consumer.assignment()
877
+ if partitions:
878
+ self._consumer.resume(*partitions)
879
+ self._consumer_paused = False
880
+ logger.debug(
881
+ "Consumer resumed",
882
+ extra={
883
+ "consumer_id": self._consumer_id,
884
+ "correlation_id": str(correlation_id),
885
+ "partitions": [str(p) for p in partitions],
886
+ },
887
+ )
888
+ except Exception as e:
889
+ logger.warning(
890
+ "Failed to resume consumer",
891
+ extra={
892
+ "consumer_id": self._consumer_id,
893
+ "correlation_id": str(correlation_id),
894
+ "error": str(e),
895
+ },
896
+ )
897
+
898
+ async def _record_failure(self) -> None:
899
+ """Record a processing failure for circuit breaker.
900
+
901
+ Increments consecutive failure count and opens circuit if
902
+ threshold is exceeded. Also resets half-open success counter
903
+ when circuit opens.
904
+ """
905
+ async with self._circuit_lock:
906
+ self._consecutive_failures += 1
907
+
908
+ if self._consecutive_failures >= self._config.circuit_breaker_threshold:
909
+ if self._circuit_state != EnumCircuitState.OPEN:
910
+ self._circuit_state = EnumCircuitState.OPEN
911
+ self._circuit_opened_at = datetime.now(UTC)
912
+ self._half_open_successes = 0
913
+ await self.metrics.record_circuit_open()
914
+
915
+ logger.warning(
916
+ "Circuit breaker opened",
917
+ extra={
918
+ "consumer_id": self._consumer_id,
919
+ "consecutive_failures": self._consecutive_failures,
920
+ "threshold": self._config.circuit_breaker_threshold,
921
+ },
922
+ )
923
+
924
+ async def _record_success(self) -> None:
925
+ """Record a processing success for circuit breaker.
926
+
927
+ Resets consecutive failure count. In half-open state, tracks
928
+ successful requests and closes circuit once threshold is met.
929
+ Also ensures consumer is resumed if it was paused.
930
+ """
931
+ should_resume = False
932
+ async with self._circuit_lock:
933
+ self._consecutive_failures = 0
934
+
935
+ if self._circuit_state == EnumCircuitState.HALF_OPEN:
936
+ self._half_open_successes += 1
937
+
938
+ if (
939
+ self._half_open_successes
940
+ >= self._config.circuit_breaker_half_open_successes
941
+ ):
942
+ self._circuit_state = EnumCircuitState.CLOSED
943
+ self._circuit_opened_at = None
944
+ self._half_open_successes = 0
945
+ should_resume = self._consumer_paused
946
+
947
+ logger.info(
948
+ "Circuit breaker closed after successful requests in half-open",
949
+ extra={
950
+ "consumer_id": self._consumer_id,
951
+ "successes_required": self._config.circuit_breaker_half_open_successes,
952
+ },
953
+ )
954
+ else:
955
+ logger.debug(
956
+ "Circuit breaker half-open success recorded",
957
+ extra={
958
+ "consumer_id": self._consumer_id,
959
+ "current_successes": self._half_open_successes,
960
+ "required_successes": self._config.circuit_breaker_half_open_successes,
961
+ },
962
+ )
963
+
964
+ # Resume consumer outside the lock if needed (safety check)
965
+ if should_resume:
966
+ await self._resume_consumer(uuid4())
967
+
968
+ # =========================================================================
969
+ # Health Check
970
+ # =========================================================================
971
+
972
+ async def health_check(self) -> dict[str, object]:
973
+ """Check consumer health status.
974
+
975
+ Returns a dictionary with health information for monitoring
976
+ and diagnostics.
977
+
978
+ Returns:
979
+ Dictionary with health status including:
980
+ - healthy: Overall health (running and circuit closed)
981
+ - running: Whether consume loop is active
982
+ - circuit_state: Current circuit breaker state
983
+ - consumer_id: Unique consumer identifier
984
+ - metrics: Current metrics snapshot
985
+ """
986
+ metrics_snapshot = await self.metrics.snapshot()
987
+
988
+ return {
989
+ "healthy": self._running and self._circuit_state == EnumCircuitState.CLOSED,
990
+ "running": self._running,
991
+ "circuit_state": self._circuit_state.value,
992
+ "consumer_paused": self._consumer_paused,
993
+ "consumer_id": self._consumer_id,
994
+ "group_id": self._config.group_id,
995
+ "topics": self._config.topics,
996
+ "consecutive_failures": self._consecutive_failures,
997
+ "half_open_successes": self._half_open_successes,
998
+ "metrics": metrics_snapshot,
999
+ }
1000
+
1001
+
1002
# Public API of this module: the consumer itself, its metrics container,
# the circuit-breaker state enum, and the aggregator protocol it targets.
__all__ = [
    "SessionEventConsumer",
    "ConsumerMetrics",
    "EnumCircuitState",
    "ProtocolSessionAggregator",
]