omnibase_infra 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- omnibase_infra/__init__.py +1 -1
- omnibase_infra/adapters/adapter_onex_tool_execution.py +451 -0
- omnibase_infra/capabilities/__init__.py +15 -0
- omnibase_infra/capabilities/capability_inference_rules.py +211 -0
- omnibase_infra/capabilities/contract_capability_extractor.py +221 -0
- omnibase_infra/capabilities/intent_type_extractor.py +160 -0
- omnibase_infra/cli/commands.py +1 -1
- omnibase_infra/configs/widget_mapping.yaml +176 -0
- omnibase_infra/contracts/handlers/filesystem/handler_contract.yaml +5 -2
- omnibase_infra/contracts/handlers/mcp/handler_contract.yaml +5 -2
- omnibase_infra/enums/__init__.py +6 -0
- omnibase_infra/enums/enum_handler_error_type.py +10 -0
- omnibase_infra/enums/enum_handler_source_mode.py +72 -0
- omnibase_infra/enums/enum_kafka_acks.py +99 -0
- omnibase_infra/errors/error_compute_registry.py +4 -1
- omnibase_infra/errors/error_event_bus_registry.py +4 -1
- omnibase_infra/errors/error_infra.py +3 -1
- omnibase_infra/errors/error_policy_registry.py +4 -1
- omnibase_infra/event_bus/event_bus_kafka.py +1 -1
- omnibase_infra/event_bus/models/config/model_kafka_event_bus_config.py +59 -10
- omnibase_infra/handlers/__init__.py +8 -1
- omnibase_infra/handlers/handler_consul.py +7 -1
- omnibase_infra/handlers/handler_db.py +10 -3
- omnibase_infra/handlers/handler_graph.py +10 -5
- omnibase_infra/handlers/handler_http.py +8 -2
- omnibase_infra/handlers/handler_intent.py +387 -0
- omnibase_infra/handlers/handler_mcp.py +745 -63
- omnibase_infra/handlers/handler_vault.py +11 -5
- omnibase_infra/handlers/mixins/mixin_consul_kv.py +4 -3
- omnibase_infra/handlers/mixins/mixin_consul_service.py +2 -1
- omnibase_infra/handlers/registration_storage/handler_registration_storage_postgres.py +7 -0
- omnibase_infra/handlers/service_discovery/handler_service_discovery_consul.py +308 -4
- omnibase_infra/handlers/service_discovery/models/model_service_info.py +10 -0
- omnibase_infra/mixins/mixin_async_circuit_breaker.py +3 -2
- omnibase_infra/mixins/mixin_node_introspection.py +42 -7
- omnibase_infra/mixins/mixin_retry_execution.py +1 -1
- omnibase_infra/models/discovery/model_introspection_config.py +11 -0
- omnibase_infra/models/handlers/__init__.py +48 -5
- omnibase_infra/models/handlers/model_bootstrap_handler_descriptor.py +162 -0
- omnibase_infra/models/handlers/model_contract_discovery_result.py +6 -4
- omnibase_infra/models/handlers/model_handler_descriptor.py +15 -0
- omnibase_infra/models/handlers/model_handler_source_config.py +220 -0
- omnibase_infra/models/mcp/__init__.py +15 -0
- omnibase_infra/models/mcp/model_mcp_contract_config.py +80 -0
- omnibase_infra/models/mcp/model_mcp_server_config.py +67 -0
- omnibase_infra/models/mcp/model_mcp_tool_definition.py +73 -0
- omnibase_infra/models/mcp/model_mcp_tool_parameter.py +35 -0
- omnibase_infra/models/registration/model_node_capabilities.py +11 -0
- omnibase_infra/models/registration/model_node_introspection_event.py +9 -0
- omnibase_infra/models/runtime/model_handler_contract.py +25 -9
- omnibase_infra/models/runtime/model_loaded_handler.py +9 -0
- omnibase_infra/nodes/architecture_validator/contract_architecture_validator.yaml +0 -5
- omnibase_infra/nodes/architecture_validator/registry/registry_infra_architecture_validator.py +17 -10
- omnibase_infra/nodes/effects/contract.yaml +0 -5
- omnibase_infra/nodes/node_registration_orchestrator/contract.yaml +7 -0
- omnibase_infra/nodes/node_registration_orchestrator/handlers/handler_node_introspected.py +86 -1
- omnibase_infra/nodes/node_registration_orchestrator/introspection_event_router.py +3 -3
- omnibase_infra/nodes/node_registration_orchestrator/plugin.py +1 -1
- omnibase_infra/nodes/node_registration_orchestrator/registry/registry_infra_node_registration_orchestrator.py +9 -8
- omnibase_infra/nodes/node_registration_orchestrator/timeout_coordinator.py +4 -3
- omnibase_infra/nodes/node_registration_orchestrator/wiring.py +14 -13
- omnibase_infra/nodes/node_registration_storage_effect/contract.yaml +0 -5
- omnibase_infra/nodes/node_registration_storage_effect/node.py +4 -1
- omnibase_infra/nodes/node_registration_storage_effect/registry/registry_infra_registration_storage.py +47 -26
- omnibase_infra/nodes/node_registry_effect/contract.yaml +0 -5
- omnibase_infra/nodes/node_registry_effect/handlers/handler_partial_retry.py +2 -1
- omnibase_infra/nodes/node_service_discovery_effect/registry/registry_infra_service_discovery.py +28 -20
- omnibase_infra/plugins/examples/plugin_json_normalizer.py +2 -2
- omnibase_infra/plugins/examples/plugin_json_normalizer_error_handling.py +2 -2
- omnibase_infra/plugins/plugin_compute_base.py +16 -2
- omnibase_infra/protocols/__init__.py +2 -0
- omnibase_infra/protocols/protocol_container_aware.py +200 -0
- omnibase_infra/protocols/protocol_event_projector.py +1 -1
- omnibase_infra/runtime/__init__.py +90 -1
- omnibase_infra/runtime/binding_config_resolver.py +102 -37
- omnibase_infra/runtime/constants_notification.py +75 -0
- omnibase_infra/runtime/contract_handler_discovery.py +6 -1
- omnibase_infra/runtime/handler_bootstrap_source.py +507 -0
- omnibase_infra/runtime/handler_contract_config_loader.py +603 -0
- omnibase_infra/runtime/handler_contract_source.py +267 -186
- omnibase_infra/runtime/handler_identity.py +81 -0
- omnibase_infra/runtime/handler_plugin_loader.py +19 -2
- omnibase_infra/runtime/handler_registry.py +11 -3
- omnibase_infra/runtime/handler_source_resolver.py +326 -0
- omnibase_infra/runtime/mixin_semver_cache.py +25 -1
- omnibase_infra/runtime/mixins/__init__.py +7 -0
- omnibase_infra/runtime/mixins/mixin_projector_notification_publishing.py +566 -0
- omnibase_infra/runtime/mixins/mixin_projector_sql_operations.py +31 -10
- omnibase_infra/runtime/models/__init__.py +24 -0
- omnibase_infra/runtime/models/model_health_check_result.py +2 -1
- omnibase_infra/runtime/models/model_projector_notification_config.py +171 -0
- omnibase_infra/runtime/models/model_transition_notification_outbox_config.py +112 -0
- omnibase_infra/runtime/models/model_transition_notification_outbox_metrics.py +140 -0
- omnibase_infra/runtime/models/model_transition_notification_publisher_metrics.py +357 -0
- omnibase_infra/runtime/projector_plugin_loader.py +1 -1
- omnibase_infra/runtime/projector_shell.py +229 -1
- omnibase_infra/runtime/protocol_lifecycle_executor.py +6 -6
- omnibase_infra/runtime/protocols/__init__.py +10 -0
- omnibase_infra/runtime/registry/registry_protocol_binding.py +16 -15
- omnibase_infra/runtime/registry_contract_source.py +693 -0
- omnibase_infra/runtime/registry_policy.py +9 -326
- omnibase_infra/runtime/secret_resolver.py +4 -2
- omnibase_infra/runtime/service_kernel.py +11 -3
- omnibase_infra/runtime/service_message_dispatch_engine.py +4 -2
- omnibase_infra/runtime/service_runtime_host_process.py +589 -106
- omnibase_infra/runtime/transition_notification_outbox.py +1190 -0
- omnibase_infra/runtime/transition_notification_publisher.py +764 -0
- omnibase_infra/runtime/util_container_wiring.py +6 -5
- omnibase_infra/runtime/util_wiring.py +17 -4
- omnibase_infra/schemas/schema_transition_notification_outbox.sql +245 -0
- omnibase_infra/services/__init__.py +21 -0
- omnibase_infra/services/corpus_capture.py +7 -1
- omnibase_infra/services/mcp/__init__.py +31 -0
- omnibase_infra/services/mcp/mcp_server_lifecycle.py +449 -0
- omnibase_infra/services/mcp/service_mcp_tool_discovery.py +411 -0
- omnibase_infra/services/mcp/service_mcp_tool_registry.py +329 -0
- omnibase_infra/services/mcp/service_mcp_tool_sync.py +547 -0
- omnibase_infra/services/registry_api/__init__.py +40 -0
- omnibase_infra/services/registry_api/main.py +261 -0
- omnibase_infra/services/registry_api/models/__init__.py +66 -0
- omnibase_infra/services/registry_api/models/model_capability_widget_mapping.py +38 -0
- omnibase_infra/services/registry_api/models/model_pagination_info.py +48 -0
- omnibase_infra/services/registry_api/models/model_registry_discovery_response.py +73 -0
- omnibase_infra/services/registry_api/models/model_registry_health_response.py +49 -0
- omnibase_infra/services/registry_api/models/model_registry_instance_view.py +88 -0
- omnibase_infra/services/registry_api/models/model_registry_node_view.py +88 -0
- omnibase_infra/services/registry_api/models/model_registry_summary.py +60 -0
- omnibase_infra/services/registry_api/models/model_response_list_instances.py +43 -0
- omnibase_infra/services/registry_api/models/model_response_list_nodes.py +51 -0
- omnibase_infra/services/registry_api/models/model_warning.py +49 -0
- omnibase_infra/services/registry_api/models/model_widget_defaults.py +28 -0
- omnibase_infra/services/registry_api/models/model_widget_mapping.py +51 -0
- omnibase_infra/services/registry_api/routes.py +371 -0
- omnibase_infra/services/registry_api/service.py +837 -0
- omnibase_infra/services/service_capability_query.py +4 -4
- omnibase_infra/services/service_health.py +3 -2
- omnibase_infra/services/service_timeout_emitter.py +20 -3
- omnibase_infra/services/service_timeout_scanner.py +7 -3
- omnibase_infra/services/session/__init__.py +56 -0
- omnibase_infra/services/session/config_consumer.py +120 -0
- omnibase_infra/services/session/config_store.py +139 -0
- omnibase_infra/services/session/consumer.py +1007 -0
- omnibase_infra/services/session/protocol_session_aggregator.py +117 -0
- omnibase_infra/services/session/store.py +997 -0
- omnibase_infra/utils/__init__.py +19 -0
- omnibase_infra/utils/util_atomic_file.py +261 -0
- omnibase_infra/utils/util_db_transaction.py +239 -0
- omnibase_infra/utils/util_dsn_validation.py +1 -1
- omnibase_infra/utils/util_retry_optimistic.py +281 -0
- omnibase_infra/validation/__init__.py +3 -19
- omnibase_infra/validation/contracts/security.validation.yaml +114 -0
- omnibase_infra/validation/infra_validators.py +35 -24
- omnibase_infra/validation/validation_exemptions.yaml +140 -9
- omnibase_infra/validation/validator_chain_propagation.py +2 -2
- omnibase_infra/validation/validator_runtime_shape.py +1 -1
- omnibase_infra/validation/validator_security.py +473 -370
- {omnibase_infra-0.2.1.dist-info → omnibase_infra-0.2.3.dist-info}/METADATA +3 -3
- {omnibase_infra-0.2.1.dist-info → omnibase_infra-0.2.3.dist-info}/RECORD +161 -98
- {omnibase_infra-0.2.1.dist-info → omnibase_infra-0.2.3.dist-info}/WHEEL +0 -0
- {omnibase_infra-0.2.1.dist-info → omnibase_infra-0.2.3.dist-info}/entry_points.txt +0 -0
- {omnibase_infra-0.2.1.dist-info → omnibase_infra-0.2.3.dist-info}/licenses/LICENSE +0 -0
omnibase_infra/services/session/consumer.py

@@ -0,0 +1,1007 @@
# SPDX-License-Identifier: MIT
# Copyright (c) 2025 OmniNode Team
"""Kafka consumer for Claude Code session events.

Implements at-least-once delivery with manual offset commits.
Events are processed through a ProtocolSessionAggregator.

This consumer subscribes to Claude Code hook event topics and processes
incoming events through the session aggregation system. It implements
several resilience patterns:

- At-least-once delivery: Manual offset commits after successful processing
- Circuit breaker: Prevents cascade failures when downstream is unhealthy
- Graceful shutdown: Properly drains and commits before exiting
- Observability: Structured logging with correlation IDs

Architecture:
    ```
    Kafka Topics (session/prompt/tool)
        |
        v
    SessionEventConsumer
        |
        v (process_event)
    ProtocolSessionAggregator
        |
        v
    Session Snapshots (storage)
    ```

Related Tickets:
- OMN-1401: Session storage in OmniMemory (current)
- OMN-1400: Hook handlers emit to Kafka
- OMN-1402: Learning compute node (consumer of snapshots)
- OMN-1526: Moved from omniclaude to omnibase_infra

Example:
    >>> from omnibase_infra.services.session import SessionEventConsumer, ConfigSessionConsumer
    >>> from my_aggregator import MySessionAggregator
    >>>
    >>> config = ConfigSessionConsumer()
    >>> aggregator = MySessionAggregator()
    >>> consumer = SessionEventConsumer(config=config, aggregator=aggregator)
    >>>
    >>> # Start consuming (blocking)
    >>> await consumer.start()
    >>>
    >>> # Or use context manager
    >>> async with consumer:
    ...     await consumer.run()

Moved from omniclaude as part of OMN-1526 architectural cleanup.
"""

from __future__ import annotations

import asyncio
import logging
from datetime import UTC, datetime
from enum import StrEnum
from uuid import UUID, uuid4

from aiokafka import AIOKafkaConsumer
from aiokafka.errors import KafkaError
from pydantic import ValidationError

from omnibase_infra.services.session.config_consumer import ConfigSessionConsumer
from omnibase_infra.services.session.protocol_session_aggregator import (
    ProtocolSessionAggregator,
)

# TODO(OMN-1526): These imports need resolution - schemas remain in omniclaude
# The consumer depends on hook event schemas which are domain-specific to omniclaude.
# Options to resolve:
# 1. Move schemas to a shared package (omnibase-schemas)
# 2. Pass schema types as generic parameters
# 3. Use raw dict processing without schema validation
#
# For now, commenting out the direct imports and using a protocol-based approach.
#
# Original imports from omniclaude:
# from omniclaude.hooks.schemas import (
#     HookEventType,
#     ModelHookEventEnvelope,
#     ModelHookPromptSubmittedPayload,
#     ModelHookSessionEndedPayload,
#     ModelHookSessionStartedPayload,
#     ModelHookToolExecutedPayload,
# )


logger = logging.getLogger(__name__)


# =============================================================================
# Enums
# =============================================================================


class EnumCircuitState(StrEnum):
    """Circuit breaker states.

    The circuit breaker protects the consumer from cascade failures when
    the downstream aggregator is unhealthy.

    State Transitions:
        CLOSED -> OPEN: After consecutive_failures >= threshold
        OPEN -> HALF_OPEN: After circuit_breaker_timeout_seconds elapsed
        HALF_OPEN -> CLOSED: After successful processing
        HALF_OPEN -> OPEN: After failure in half-open state
    """

    CLOSED = "closed"
    OPEN = "open"
    HALF_OPEN = "half_open"


# =============================================================================
# Consumer Metrics
# =============================================================================


class ConsumerMetrics:
    """Metrics tracking for the session event consumer.

    Tracks processing statistics for observability and monitoring.
    Thread-safe via asyncio lock protection.

    Attributes:
        messages_received: Total messages received from Kafka.
        messages_processed: Successfully processed messages.
        messages_failed: Messages that failed processing.
        messages_skipped: Messages skipped (invalid, duplicate, etc.).
        circuit_opens: Number of times circuit breaker opened.
        last_message_at: Timestamp of last received message.
    """

    def __init__(self) -> None:
        """Initialize metrics with zero values."""
        self.messages_received: int = 0
        self.messages_processed: int = 0
        self.messages_failed: int = 0
        self.messages_skipped: int = 0
        self.circuit_opens: int = 0
        self.last_message_at: datetime | None = None
        self._lock = asyncio.Lock()

    async def record_received(self) -> None:
        """Record a message received."""
        async with self._lock:
            self.messages_received += 1
            self.last_message_at = datetime.now(UTC)

    async def record_processed(self) -> None:
        """Record a successfully processed message."""
        async with self._lock:
            self.messages_processed += 1

    async def record_failed(self) -> None:
        """Record a failed message."""
        async with self._lock:
            self.messages_failed += 1

    async def record_skipped(self) -> None:
        """Record a skipped message."""
        async with self._lock:
            self.messages_skipped += 1

    async def record_circuit_open(self) -> None:
        """Record a circuit breaker open event."""
        async with self._lock:
            self.circuit_opens += 1

    async def snapshot(self) -> dict[str, object]:
        """Get a snapshot of current metrics.

        Returns:
            Dictionary with all metric values.
        """
        async with self._lock:
            return {
                "messages_received": self.messages_received,
                "messages_processed": self.messages_processed,
                "messages_failed": self.messages_failed,
                "messages_skipped": self.messages_skipped,
                "circuit_opens": self.circuit_opens,
                "last_message_at": (
                    self.last_message_at.isoformat() if self.last_message_at else None
                ),
            }


# =============================================================================
# Session Event Consumer
# =============================================================================


class SessionEventConsumer:
    """Kafka consumer for Claude Code hook events.

    Consumes events from session/prompt/tool topics and processes
    them through an aggregator. Implements at-least-once delivery
    with manual offset commits.

    Features:
    - **At-least-once delivery**: Offsets committed only after successful
      processing. If the consumer crashes before commit, messages will be
      reprocessed on restart (aggregator must be idempotent).

    - **Circuit breaker**: Protects against cascade failures when the
      downstream aggregator is unhealthy. Opens after consecutive failures
      exceed threshold, allowing time for recovery.

    - **Graceful shutdown**: Drains in-flight messages and commits offsets
      before exiting. Responds to stop() or SIGTERM signals.

    - **Observability**: Structured logging with correlation IDs, plus
      metrics tracking for monitoring dashboards.

    Thread Safety:
        This consumer is designed for single-threaded async execution.
        Multiple consumers can run in parallel with different group_ids
        for horizontal scaling.

    Example:
        >>> config = ConfigSessionConsumer()
        >>> aggregator = InMemorySessionAggregator()
        >>> consumer = SessionEventConsumer(config=config, aggregator=aggregator)
        >>>
        >>> # Start consuming
        >>> await consumer.start()
        >>>
        >>> # Or in application lifecycle
        >>> await consumer.start()
        >>> try:
        ...     await consumer.run()
        ... finally:
        ...     await consumer.stop()

    Attributes:
        metrics: Consumer metrics for observability.
        is_running: Whether the consumer is currently running.
        circuit_state: Current circuit breaker state.
    """

    def __init__(
        self,
        config: ConfigSessionConsumer,
        aggregator: ProtocolSessionAggregator,
    ) -> None:
        """Initialize the session event consumer.

        Args:
            config: Consumer configuration (topics, timeouts, circuit breaker).
            aggregator: Session aggregator implementing ProtocolSessionAggregator.
                The aggregator must be idempotent to support at-least-once delivery.

        Example:
            >>> config = ConfigSessionConsumer(
            ...     bootstrap_servers="192.168.86.200:29092",
            ...     group_id="my-consumer-group",
            ... )
            >>> aggregator = InMemorySessionAggregator()
            >>> consumer = SessionEventConsumer(config, aggregator)
        """
        self._config = config
        self._aggregator = aggregator
        self._consumer: AIOKafkaConsumer | None = None
        self._running = False
        self._shutdown_event = asyncio.Event()

        # Circuit breaker state
        self._consecutive_failures = 0
        self._circuit_state = EnumCircuitState.CLOSED
        self._circuit_opened_at: datetime | None = None
        self._circuit_lock = asyncio.Lock()
        self._consumer_paused = False  # Track pause state for circuit breaker
        self._half_open_successes = 0  # Track successes in half-open state

        # Metrics
        self.metrics = ConsumerMetrics()

        # Consumer ID for logging
        self._consumer_id = f"session-consumer-{uuid4().hex[:8]}"

        logger.info(
            "SessionEventConsumer initialized",
            extra={
                "consumer_id": self._consumer_id,
                "topics": self._config.topics,
                "group_id": self._config.group_id,
                "bootstrap_servers": self._config.bootstrap_servers,
            },
        )

    # =========================================================================
    # Properties
    # =========================================================================

    @property
    def is_running(self) -> bool:
        """Check if the consumer is currently running.

        Returns:
            True if start() has been called and stop() has not.
        """
        return self._running

    @property
    def circuit_state(self) -> EnumCircuitState:
        """Get the current circuit breaker state.

        Returns:
            Current circuit state (CLOSED, OPEN, or HALF_OPEN).
        """
        return self._circuit_state

    @property
    def consumer_id(self) -> str:
        """Get the unique consumer identifier.

        Returns:
            Consumer ID string for logging and tracing.
        """
        return self._consumer_id

    # =========================================================================
    # Lifecycle Methods
    # =========================================================================

    async def start(self) -> None:
        """Start the consumer and connect to Kafka.

        Creates the Kafka consumer with manual offset commits disabled
        (for at-least-once semantics) and starts the connection.

        Raises:
            RuntimeError: If the consumer is already running.
            KafkaError: If connection to Kafka fails.

        Example:
            >>> await consumer.start()
            >>> # Consumer is now connected, ready for run()
        """
        if self._running:
            logger.warning(
                "Consumer already running",
                extra={"consumer_id": self._consumer_id},
            )
            return

        correlation_id = uuid4()

        logger.info(
            "Starting SessionEventConsumer",
            extra={
                "consumer_id": self._consumer_id,
                "correlation_id": str(correlation_id),
                "topics": self._config.topics,
            },
        )

        try:
            self._consumer = AIOKafkaConsumer(
                *self._config.topics,
                bootstrap_servers=self._config.bootstrap_servers,
                group_id=self._config.group_id,
                auto_offset_reset=self._config.auto_offset_reset,
                enable_auto_commit=False,  # Manual commits for at-least-once
                max_poll_records=self._config.max_poll_records,
            )

            await self._consumer.start()
            self._running = True
            self._shutdown_event.clear()

            logger.info(
                "SessionEventConsumer started",
                extra={
                    "consumer_id": self._consumer_id,
                    "correlation_id": str(correlation_id),
                    "topics": self._config.topics,
                    "group_id": self._config.group_id,
                },
            )

        except KafkaError as e:
            logger.exception(
                "Failed to start consumer",
                extra={
                    "consumer_id": self._consumer_id,
                    "correlation_id": str(correlation_id),
                    "error": str(e),
                },
            )
            raise

    async def stop(self) -> None:
        """Stop the consumer gracefully.

        Signals the consume loop to exit, waits for in-flight processing
        to complete, and closes the Kafka consumer connection. Safe to
        call multiple times.

        Example:
            >>> await consumer.stop()
            >>> # Consumer is now stopped and disconnected
        """
        if not self._running:
            logger.debug(
                "Consumer not running, nothing to stop",
                extra={"consumer_id": self._consumer_id},
            )
            return

        correlation_id = uuid4()

        logger.info(
            "Stopping SessionEventConsumer",
            extra={
                "consumer_id": self._consumer_id,
                "correlation_id": str(correlation_id),
            },
        )

        # Signal shutdown
        self._running = False
        self._shutdown_event.set()

        # Resume consumer if paused (cleanup before stop)
        if self._consumer is not None and self._consumer_paused:
            await self._resume_consumer(correlation_id)

        # Close consumer connection
        if self._consumer is not None:
            try:
                await self._consumer.stop()
            except Exception as e:
                logger.warning(
                    "Error stopping Kafka consumer",
                    extra={
                        "consumer_id": self._consumer_id,
                        "correlation_id": str(correlation_id),
                        "error": str(e),
                    },
                )
            finally:
                self._consumer = None

        # Log final metrics
        metrics_snapshot = await self.metrics.snapshot()
        logger.info(
            "SessionEventConsumer stopped",
            extra={
                "consumer_id": self._consumer_id,
                "correlation_id": str(correlation_id),
                "final_metrics": metrics_snapshot,
            },
        )

    async def run(self) -> None:
        """Run the main consume loop.

        Continuously consumes messages from Kafka topics and processes them
        through the aggregator. Implements at-least-once delivery by committing
        offsets only after successful processing.

        This method blocks until stop() is called or an unrecoverable error
        occurs. Use this after calling start().

        Example:
            >>> await consumer.start()
            >>> try:
            ...     await consumer.run()
            ... finally:
            ...     await consumer.stop()
        """
        if not self._running or self._consumer is None:
            raise RuntimeError(
                "Consumer not started. Call start() before run().",
            )

        correlation_id = uuid4()

        logger.info(
            "Starting consume loop",
            extra={
                "consumer_id": self._consumer_id,
                "correlation_id": str(correlation_id),
            },
        )

        await self._consume_loop(correlation_id)

    async def __aenter__(self) -> SessionEventConsumer:
        """Async context manager entry.

        Starts the consumer and returns self for use in async with blocks.

        Returns:
            Self for chaining.

        Example:
            >>> async with SessionEventConsumer(config, aggregator) as consumer:
            ...     await consumer.run()
        """
        await self.start()
        return self

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: object,
    ) -> None:
        """Async context manager exit.

        Stops the consumer on exit from async with block.
        """
        await self.stop()

    # =========================================================================
    # Consume Loop
    # =========================================================================

    async def _consume_loop(self, correlation_id: UUID) -> None:
        """Main consumption loop with at-least-once semantics.

        Continuously polls Kafka for messages and processes them through
        the aggregator. Commits offsets only after successful processing.

        Args:
            correlation_id: Correlation ID for tracing this consume session.
        """
        if self._consumer is None:
            logger.error(
                "Consumer is None in consume loop",
                extra={
                    "consumer_id": self._consumer_id,
                    "correlation_id": str(correlation_id),
                },
            )
            return

        try:
            async for message in self._consumer:
                # Check shutdown signal
                if not self._running:
                    logger.debug(
                        "Shutdown signal received, exiting consume loop",
                        extra={
                            "consumer_id": self._consumer_id,
                            "correlation_id": str(correlation_id),
                        },
                    )
                    break

                # Record message received
                await self.metrics.record_received()

                # Check circuit breaker - if open, pause and wait for recovery
                # IMPORTANT: We do NOT skip this message. After circuit recovers,
                # we process it normally. This prevents message loss during circuit open.
                if await self._is_circuit_open():
                    await self._wait_for_circuit_recovery(correlation_id)
                    # Fall through to process this message after recovery

                # Process the message
                message_correlation_id = uuid4()
                try:
                    success = await self._process_message(
                        message, message_correlation_id
                    )

                    if success:
                        # Commit offset after successful processing
                        await self._consumer.commit()
                        await self.metrics.record_processed()
                        await self._record_success()

                        logger.debug(
                            "Message processed and committed",
                            extra={
                                "consumer_id": self._consumer_id,
                                "correlation_id": str(message_correlation_id),
                                "topic": message.topic,
                                "partition": message.partition,
                                "offset": message.offset,
                            },
                        )
                    else:
                        # Processing returned False (rejected, duplicate, etc.)
                        # Still commit to avoid reprocessing
                        await self._consumer.commit()
                        await self.metrics.record_skipped()

                        logger.debug(
                            "Message skipped (rejected by aggregator)",
                            extra={
                                "consumer_id": self._consumer_id,
                                "correlation_id": str(message_correlation_id),
                                "topic": message.topic,
                            },
                        )

                except ValidationError as e:
                    # Schema validation error - skip and commit
                    # These messages are malformed and will never succeed
                    await self._consumer.commit()
                    await self.metrics.record_skipped()

                    logger.warning(
                        "Message skipped due to validation error",
                        extra={
                            "consumer_id": self._consumer_id,
                            "correlation_id": str(message_correlation_id),
                            "topic": message.topic,
                            "error": str(e),
                        },
                    )

                except Exception as e:
                    # Processing error - record failure, don't commit
                    await self.metrics.record_failed()
                    await self._record_failure()

                    logger.exception(
                        "Error processing message",
                        extra={
                            "consumer_id": self._consumer_id,
                            "correlation_id": str(message_correlation_id),
                            "topic": message.topic,
                            "partition": message.partition,
                            "offset": message.offset,
                            "error": str(e),
                            "consecutive_failures": self._consecutive_failures,
                        },
                    )

        except asyncio.CancelledError:
            logger.info(
                "Consume loop cancelled",
                extra={
                    "consumer_id": self._consumer_id,
                    "correlation_id": str(correlation_id),
                },
            )
            raise

        except Exception as e:
            logger.exception(
                "Unexpected error in consume loop",
                extra={
                    "consumer_id": self._consumer_id,
                    "correlation_id": str(correlation_id),
                    "error": str(e),
                },
            )
            raise

        finally:
            logger.info(
                "Consume loop exiting",
                extra={
                    "consumer_id": self._consumer_id,
                    "correlation_id": str(correlation_id),
                },
            )

    # =========================================================================
    # Message Processing
    # =========================================================================

    async def _process_message(self, message: object, correlation_id: UUID) -> bool:
        """Process a single message through the aggregator.

        Deserializes the message payload and dispatches it to the aggregator.

        Note: Schema validation is delegated to the aggregator since schemas
        are domain-specific (omniclaude). This consumer is schema-agnostic.

        Args:
            message: Kafka ConsumerRecord with topic, value, etc.
            correlation_id: Correlation ID for this processing attempt.

        Returns:
            True if processed successfully, False if rejected (duplicate, etc.).

        Raises:
            ValidationError: If the message payload fails schema validation.
            Exception: If the aggregator raises an error during processing.
        """
        # Extract message value
        value = getattr(message, "value", None)
        if value is None:
            logger.warning(
                "Message has no value",
                extra={
                    "consumer_id": self._consumer_id,
                    "correlation_id": str(correlation_id),
                    "topic": getattr(message, "topic", "unknown"),
                },
            )
            return False

        # Decode bytes to string
        if isinstance(value, bytes):
            value = value.decode("utf-8")

        # TODO(OMN-1526): Schema parsing moved to aggregator
        # The original code parsed ModelHookEventEnvelope here, but that
        # creates a dependency on omniclaude.hooks.schemas. The aggregator
        # is now responsible for schema validation.
        #
        # Original code:
        # envelope = ModelHookEventEnvelope.model_validate_json(value)
        # payload = envelope.payload
        # result = await self._aggregator.process_event(envelope, correlation_id)

        logger.debug(
            "Processing event",
            extra={
                "consumer_id": self._consumer_id,
                "correlation_id": str(correlation_id),
                "topic": getattr(message, "topic", "unknown"),
            },
        )

        # Pass raw JSON string to aggregator - let it handle schema validation
        result = await self._aggregator.process_event(value, correlation_id)

        return result

    # =========================================================================
    # Circuit Breaker
    # =========================================================================

    async def _is_circuit_open(self) -> bool:
        """Check if circuit breaker is open.

        If the circuit is open, checks if enough time has passed to
        transition to half-open state for a test request.

        Returns:
            True if circuit is open and should block processing.
        """
        async with self._circuit_lock:
            if self._circuit_state == EnumCircuitState.CLOSED:
                return False

            if self._circuit_state == EnumCircuitState.HALF_OPEN:
                # Allow test request
                return False

            # Circuit is OPEN - check if timeout has elapsed
            if self._circuit_opened_at is not None:
                elapsed = (datetime.now(UTC) - self._circuit_opened_at).total_seconds()
                if elapsed >= self._config.circuit_breaker_timeout_seconds:
                    # Transition to half-open
                    self._circuit_state = EnumCircuitState.HALF_OPEN
                    logger.info(
                        "Circuit breaker transitioning to half-open",
                        extra={
                            "consumer_id": self._consumer_id,
                            "elapsed_seconds": elapsed,
                        },
                    )
                    return False

            return True

    async def _wait_for_circuit_recovery(self, correlation_id: UUID) -> None:
        """Pause consumer and wait for circuit breaker to recover.

        Called when the circuit is open. This method:
        1. Pauses the Kafka consumer to stop fetching new messages
        2. Waits in a loop until circuit transitions to HALF_OPEN or CLOSED
        3. Resumes the consumer before returning

        This ensures no messages are lost during circuit open state - the current
        message will be processed after this method returns, and no new messages
        are fetched while waiting.

        Args:
            correlation_id: Correlation ID for logging.
        """
        if self._consumer is None:
            return

        # Pause the consumer to stop fetching new messages
        await self._pause_consumer(correlation_id)

        logger.warning(
            "Circuit breaker is open, consumer paused - waiting for recovery",
            extra={
                "consumer_id": self._consumer_id,
                "correlation_id": str(correlation_id),
                "timeout_seconds": self._config.circuit_breaker_timeout_seconds,
            },
        )

        # Wait in a loop until circuit is no longer open
        check_interval = min(1.0, self._config.circuit_breaker_timeout_seconds / 10)
        while self._running:
            # Check if circuit has recovered
            if not await self._is_circuit_open():
                logger.info(
                    "Circuit breaker recovered, resuming consumer",
                    extra={
                        "consumer_id": self._consumer_id,
                        "correlation_id": str(correlation_id),
                        "circuit_state": self._circuit_state.value,
                    },
                )
                break

            # Check for shutdown signal
            if self._shutdown_event.is_set():
                logger.debug(
                    "Shutdown signal received while waiting for circuit recovery",
                    extra={
                        "consumer_id": self._consumer_id,
                        "correlation_id": str(correlation_id),
                    },
                )
                break

            # Wait before checking again
            await asyncio.sleep(check_interval)

        # Resume the consumer before returning
        await self._resume_consumer(correlation_id)

    async def _pause_consumer(self, correlation_id: UUID) -> None:
        """Pause the Kafka consumer on all assigned partitions.

        Args:
            correlation_id: Correlation ID for logging.
        """
        if self._consumer is None or self._consumer_paused:
            return

        try:
            partitions = self._consumer.assignment()
            if partitions:
                self._consumer.pause(*partitions)
                self._consumer_paused = True
                logger.debug(
                    "Consumer paused",
                    extra={
                        "consumer_id": self._consumer_id,
                        "correlation_id": str(correlation_id),
                        "partitions": [str(p) for p in partitions],
                    },
                )
        except Exception as e:
            logger.warning(
                "Failed to pause consumer",
                extra={
                    "consumer_id": self._consumer_id,
                    "correlation_id": str(correlation_id),
                    "error": str(e),
                },
            )

    async def _resume_consumer(self, correlation_id: UUID) -> None:
        """Resume the Kafka consumer on all assigned partitions.

        Args:
            correlation_id: Correlation ID for logging.
        """
        if self._consumer is None or not self._consumer_paused:
            return

        try:
            partitions = self._consumer.assignment()
            if partitions:
                self._consumer.resume(*partitions)
                self._consumer_paused = False
                logger.debug(
                    "Consumer resumed",
                    extra={
                        "consumer_id": self._consumer_id,
                        "correlation_id": str(correlation_id),
                        "partitions": [str(p) for p in partitions],
                    },
                )
        except Exception as e:
            logger.warning(
                "Failed to resume consumer",
                extra={
                    "consumer_id": self._consumer_id,
                    "correlation_id": str(correlation_id),
                    "error": str(e),
                },
            )

    async def _record_failure(self) -> None:
        """Record a processing failure for circuit breaker.

        Increments consecutive failure count and opens circuit if
        threshold is exceeded. Also resets half-open success counter
        when circuit opens.
        """
        async with self._circuit_lock:
            self._consecutive_failures += 1

            if self._consecutive_failures >= self._config.circuit_breaker_threshold:
                if self._circuit_state != EnumCircuitState.OPEN:
                    self._circuit_state = EnumCircuitState.OPEN
                    self._circuit_opened_at = datetime.now(UTC)
                    self._half_open_successes = 0
                    await self.metrics.record_circuit_open()

                    logger.warning(
                        "Circuit breaker opened",
                        extra={
                            "consumer_id": self._consumer_id,
                            "consecutive_failures": self._consecutive_failures,
                            "threshold": self._config.circuit_breaker_threshold,
                        },
                    )

    async def _record_success(self) -> None:
        """Record a processing success for circuit breaker.

        Resets consecutive failure count. In half-open state, tracks
        successful requests and closes circuit once threshold is met.
        Also ensures consumer is resumed if it was paused.
        """
        should_resume = False
        async with self._circuit_lock:
            self._consecutive_failures = 0

            if self._circuit_state == EnumCircuitState.HALF_OPEN:
                self._half_open_successes += 1

                if (
                    self._half_open_successes
                    >= self._config.circuit_breaker_half_open_successes
                ):
                    self._circuit_state = EnumCircuitState.CLOSED
                    self._circuit_opened_at = None
                    self._half_open_successes = 0
                    should_resume = self._consumer_paused

                    logger.info(
                        "Circuit breaker closed after successful requests in half-open",
                        extra={
                            "consumer_id": self._consumer_id,
                            "successes_required": self._config.circuit_breaker_half_open_successes,
                        },
                    )
                else:
                    logger.debug(
                        "Circuit breaker half-open success recorded",
                        extra={
                            "consumer_id": self._consumer_id,
                            "current_successes": self._half_open_successes,
                            "required_successes": self._config.circuit_breaker_half_open_successes,
                        },
                    )

        # Resume consumer outside the lock if needed (safety check)
        if should_resume:
            await self._resume_consumer(uuid4())

    # =========================================================================
    # Health Check
    # =========================================================================

    async def health_check(self) -> dict[str, object]:
        """Check consumer health status.

        Returns a dictionary with health information for monitoring
        and diagnostics.

        Returns:
            Dictionary with health status including:
            - healthy: Overall health (running and circuit closed)
            - running: Whether consume loop is active
            - circuit_state: Current circuit breaker state
            - consumer_id: Unique consumer identifier
            - metrics: Current metrics snapshot
        """
        metrics_snapshot = await self.metrics.snapshot()

        return {
            "healthy": self._running and self._circuit_state == EnumCircuitState.CLOSED,
            "running": self._running,
            "circuit_state": self._circuit_state.value,
            "consumer_paused": self._consumer_paused,
            "consumer_id": self._consumer_id,
            "group_id": self._config.group_id,
            "topics": self._config.topics,
            "consecutive_failures": self._consecutive_failures,
            "half_open_successes": self._half_open_successes,
            "metrics": metrics_snapshot,
        }


__all__ = [
    "SessionEventConsumer",
    "ConsumerMetrics",
    "EnumCircuitState",
    "ProtocolSessionAggregator",
]