omnibase_infra-0.3.1-py3-none-any.whl → omnibase_infra-0.4.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- omnibase_infra/__init__.py +1 -1
- omnibase_infra/enums/__init__.py +3 -0
- omnibase_infra/enums/enum_consumer_group_purpose.py +9 -0
- omnibase_infra/enums/enum_postgres_error_code.py +188 -0
- omnibase_infra/errors/__init__.py +4 -0
- omnibase_infra/errors/error_infra.py +60 -0
- omnibase_infra/handlers/__init__.py +3 -0
- omnibase_infra/handlers/handler_slack_webhook.py +426 -0
- omnibase_infra/handlers/models/__init__.py +14 -0
- omnibase_infra/handlers/models/enum_alert_severity.py +36 -0
- omnibase_infra/handlers/models/model_slack_alert.py +24 -0
- omnibase_infra/handlers/models/model_slack_alert_payload.py +77 -0
- omnibase_infra/handlers/models/model_slack_alert_result.py +73 -0
- omnibase_infra/handlers/registration_storage/handler_registration_storage_postgres.py +29 -20
- omnibase_infra/mixins/__init__.py +14 -0
- omnibase_infra/mixins/mixin_node_introspection.py +42 -20
- omnibase_infra/mixins/mixin_postgres_error_response.py +314 -0
- omnibase_infra/mixins/mixin_postgres_op_executor.py +298 -0
- omnibase_infra/models/__init__.py +3 -0
- omnibase_infra/models/discovery/model_dependency_spec.py +1 -0
- omnibase_infra/models/discovery/model_discovered_capabilities.py +1 -1
- omnibase_infra/models/discovery/model_introspection_config.py +28 -1
- omnibase_infra/models/discovery/model_introspection_performance_metrics.py +1 -0
- omnibase_infra/models/discovery/model_introspection_task_config.py +1 -0
- omnibase_infra/{nodes/effects/models → models}/model_backend_result.py +22 -6
- omnibase_infra/models/projection/__init__.py +11 -0
- omnibase_infra/models/projection/model_contract_projection.py +170 -0
- omnibase_infra/models/projection/model_topic_projection.py +148 -0
- omnibase_infra/models/runtime/__init__.py +4 -0
- omnibase_infra/models/runtime/model_resolved_dependencies.py +116 -0
- omnibase_infra/nodes/contract_registry_reducer/__init__.py +5 -0
- omnibase_infra/nodes/contract_registry_reducer/contract.yaml +6 -5
- omnibase_infra/nodes/contract_registry_reducer/contract_registration_event_router.py +689 -0
- omnibase_infra/nodes/contract_registry_reducer/reducer.py +9 -26
- omnibase_infra/nodes/effects/__init__.py +1 -1
- omnibase_infra/nodes/effects/models/__init__.py +6 -4
- omnibase_infra/nodes/effects/models/model_registry_response.py +1 -1
- omnibase_infra/nodes/effects/protocol_consul_client.py +1 -1
- omnibase_infra/nodes/effects/protocol_postgres_adapter.py +1 -1
- omnibase_infra/nodes/effects/registry_effect.py +1 -1
- omnibase_infra/nodes/node_contract_persistence_effect/__init__.py +101 -0
- omnibase_infra/nodes/node_contract_persistence_effect/contract.yaml +490 -0
- omnibase_infra/nodes/node_contract_persistence_effect/handlers/__init__.py +74 -0
- omnibase_infra/nodes/node_contract_persistence_effect/handlers/handler_postgres_cleanup_topics.py +217 -0
- omnibase_infra/nodes/node_contract_persistence_effect/handlers/handler_postgres_contract_upsert.py +242 -0
- omnibase_infra/nodes/node_contract_persistence_effect/handlers/handler_postgres_deactivate.py +194 -0
- omnibase_infra/nodes/node_contract_persistence_effect/handlers/handler_postgres_heartbeat.py +243 -0
- omnibase_infra/nodes/node_contract_persistence_effect/handlers/handler_postgres_mark_stale.py +208 -0
- omnibase_infra/nodes/node_contract_persistence_effect/handlers/handler_postgres_topic_update.py +298 -0
- omnibase_infra/nodes/node_contract_persistence_effect/models/__init__.py +15 -0
- omnibase_infra/nodes/node_contract_persistence_effect/models/model_persistence_result.py +52 -0
- omnibase_infra/nodes/node_contract_persistence_effect/node.py +131 -0
- omnibase_infra/nodes/node_contract_persistence_effect/registry/__init__.py +27 -0
- omnibase_infra/nodes/node_contract_persistence_effect/registry/registry_infra_contract_persistence_effect.py +251 -0
- omnibase_infra/nodes/node_registration_orchestrator/models/model_postgres_intent_payload.py +8 -12
- omnibase_infra/nodes/node_registry_effect/models/__init__.py +2 -2
- omnibase_infra/nodes/node_slack_alerter_effect/__init__.py +33 -0
- omnibase_infra/nodes/node_slack_alerter_effect/contract.yaml +291 -0
- omnibase_infra/nodes/node_slack_alerter_effect/node.py +106 -0
- omnibase_infra/projectors/__init__.py +6 -0
- omnibase_infra/projectors/projection_reader_contract.py +1301 -0
- omnibase_infra/runtime/__init__.py +12 -0
- omnibase_infra/runtime/baseline_subscriptions.py +13 -6
- omnibase_infra/runtime/contract_dependency_resolver.py +455 -0
- omnibase_infra/runtime/contract_registration_event_router.py +500 -0
- omnibase_infra/runtime/db/__init__.py +4 -0
- omnibase_infra/runtime/db/models/__init__.py +15 -10
- omnibase_infra/runtime/db/models/model_db_operation.py +40 -0
- omnibase_infra/runtime/db/models/model_db_param.py +24 -0
- omnibase_infra/runtime/db/models/model_db_repository_contract.py +40 -0
- omnibase_infra/runtime/db/models/model_db_return.py +26 -0
- omnibase_infra/runtime/db/models/model_db_safety_policy.py +32 -0
- omnibase_infra/runtime/emit_daemon/event_registry.py +34 -22
- omnibase_infra/runtime/event_bus_subcontract_wiring.py +63 -23
- omnibase_infra/runtime/intent_execution_router.py +430 -0
- omnibase_infra/runtime/models/__init__.py +6 -0
- omnibase_infra/runtime/models/model_contract_registry_config.py +41 -0
- omnibase_infra/runtime/models/model_intent_execution_summary.py +79 -0
- omnibase_infra/runtime/models/model_runtime_config.py +8 -0
- omnibase_infra/runtime/protocols/__init__.py +16 -0
- omnibase_infra/runtime/protocols/protocol_intent_executor.py +107 -0
- omnibase_infra/runtime/publisher_topic_scoped.py +16 -11
- omnibase_infra/runtime/registry_policy.py +29 -15
- omnibase_infra/runtime/request_response_wiring.py +793 -0
- omnibase_infra/runtime/service_kernel.py +295 -8
- omnibase_infra/runtime/service_runtime_host_process.py +149 -5
- omnibase_infra/runtime/util_version.py +5 -1
- omnibase_infra/schemas/schema_latency_baseline.sql +135 -0
- omnibase_infra/services/contract_publisher/config.py +4 -4
- omnibase_infra/services/contract_publisher/service.py +8 -5
- omnibase_infra/services/observability/injection_effectiveness/__init__.py +67 -0
- omnibase_infra/services/observability/injection_effectiveness/config.py +295 -0
- omnibase_infra/services/observability/injection_effectiveness/consumer.py +1461 -0
- omnibase_infra/services/observability/injection_effectiveness/models/__init__.py +32 -0
- omnibase_infra/services/observability/injection_effectiveness/models/model_agent_match.py +79 -0
- omnibase_infra/services/observability/injection_effectiveness/models/model_context_utilization.py +118 -0
- omnibase_infra/services/observability/injection_effectiveness/models/model_latency_breakdown.py +107 -0
- omnibase_infra/services/observability/injection_effectiveness/models/model_pattern_utilization.py +46 -0
- omnibase_infra/services/observability/injection_effectiveness/writer_postgres.py +596 -0
- omnibase_infra/services/registry_api/models/__init__.py +25 -0
- omnibase_infra/services/registry_api/models/model_contract_ref.py +44 -0
- omnibase_infra/services/registry_api/models/model_contract_view.py +81 -0
- omnibase_infra/services/registry_api/models/model_response_contracts.py +50 -0
- omnibase_infra/services/registry_api/models/model_response_topics.py +50 -0
- omnibase_infra/services/registry_api/models/model_topic_summary.py +57 -0
- omnibase_infra/services/registry_api/models/model_topic_view.py +63 -0
- omnibase_infra/services/registry_api/routes.py +205 -6
- omnibase_infra/services/registry_api/service.py +528 -1
- omnibase_infra/utils/__init__.py +7 -0
- omnibase_infra/utils/util_db_error_context.py +292 -0
- omnibase_infra/validation/infra_validators.py +3 -1
- omnibase_infra/validation/validation_exemptions.yaml +65 -0
- {omnibase_infra-0.3.1.dist-info → omnibase_infra-0.4.0.dist-info}/METADATA +3 -3
- {omnibase_infra-0.3.1.dist-info → omnibase_infra-0.4.0.dist-info}/RECORD +117 -58
- {omnibase_infra-0.3.1.dist-info → omnibase_infra-0.4.0.dist-info}/WHEEL +0 -0
- {omnibase_infra-0.3.1.dist-info → omnibase_infra-0.4.0.dist-info}/entry_points.txt +0 -0
- {omnibase_infra-0.3.1.dist-info → omnibase_infra-0.4.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,1461 @@
|
|
|
1
|
+
# SPDX-License-Identifier: MIT
|
|
2
|
+
# Copyright (c) 2025 OmniNode Team
|
|
3
|
+
"""Async Kafka Consumer for Injection Effectiveness Observability.
|
|
4
|
+
|
|
5
|
+
This module provides an async Kafka consumer for injection effectiveness events.
|
|
6
|
+
Events are consumed from multiple topics, validated using Pydantic models,
|
|
7
|
+
and persisted to PostgreSQL via the WriterInjectionEffectivenessPostgres.
|
|
8
|
+
|
|
9
|
+
Design Decisions:
|
|
10
|
+
- Per-partition offset tracking: Commit only successfully persisted partitions
|
|
11
|
+
- Batch processing: Configurable batch size and timeout
|
|
12
|
+
- Circuit breaker: Resilience via writer's MixinAsyncCircuitBreaker
|
|
13
|
+
- Health check: HTTP endpoint for Kubernetes probes
|
|
14
|
+
- Graceful shutdown: Signal handling with drain and commit
|
|
15
|
+
|
|
16
|
+
Critical Invariant:
|
|
17
|
+
For each (topic, partition), commit offsets only up to the highest offset
|
|
18
|
+
that has been successfully persisted for that partition.
|
|
19
|
+
Never commit offsets for partitions that had write failures in the batch.
|
|
20
|
+
|
|
21
|
+
Topics consumed:
|
|
22
|
+
- onex.evt.omniclaude.context-utilization.v1
|
|
23
|
+
- onex.evt.omniclaude.agent-match.v1
|
|
24
|
+
- onex.evt.omniclaude.latency-breakdown.v1
|
|
25
|
+
|
|
26
|
+
Related Tickets:
|
|
27
|
+
- OMN-1890: Injection effectiveness observability consumer (current)
|
|
28
|
+
- OMN-1889: Emit injection metrics from omniclaude hooks (producer)
|
|
29
|
+
- OMN-1743: Agent actions consumer (reference pattern)
|
|
30
|
+
|
|
31
|
+
Example:
|
|
32
|
+
>>> from omnibase_infra.services.observability.injection_effectiveness import (
|
|
33
|
+
... InjectionEffectivenessConsumer,
|
|
34
|
+
... ConfigInjectionEffectivenessConsumer,
|
|
35
|
+
... )
|
|
36
|
+
>>>
|
|
37
|
+
>>> config = ConfigInjectionEffectivenessConsumer(
|
|
38
|
+
... kafka_bootstrap_servers="localhost:9092",
|
|
39
|
+
... postgres_dsn="postgresql://postgres:secret@localhost:5432/omninode_bridge",
|
|
40
|
+
... )
|
|
41
|
+
>>> consumer = InjectionEffectivenessConsumer(config)
|
|
42
|
+
>>>
|
|
43
|
+
>>> # Run consumer (blocking)
|
|
44
|
+
>>> await consumer.start()
|
|
45
|
+
>>> await consumer.run()
|
|
46
|
+
|
|
47
|
+
# Or run as module:
|
|
48
|
+
# python -m omnibase_infra.services.observability.injection_effectiveness.consumer
|
|
49
|
+
"""
|
|
50
|
+
|
|
51
|
+
from __future__ import annotations
|
|
52
|
+
|
|
53
|
+
import asyncio
|
|
54
|
+
import json
|
|
55
|
+
import logging
|
|
56
|
+
import signal
|
|
57
|
+
from collections.abc import Callable, Coroutine
|
|
58
|
+
from datetime import UTC, datetime
|
|
59
|
+
from enum import StrEnum
|
|
60
|
+
from typing import TYPE_CHECKING
|
|
61
|
+
from urllib.parse import urlparse, urlunparse
|
|
62
|
+
from uuid import UUID, uuid4
|
|
63
|
+
|
|
64
|
+
import asyncpg
|
|
65
|
+
from aiohttp import web
|
|
66
|
+
from aiokafka import AIOKafkaConsumer, TopicPartition
|
|
67
|
+
from aiokafka.errors import KafkaError
|
|
68
|
+
from pydantic import BaseModel, ValidationError
|
|
69
|
+
|
|
70
|
+
from omnibase_core.errors import OnexError
|
|
71
|
+
from omnibase_core.types import JsonType
|
|
72
|
+
from omnibase_infra.services.observability.injection_effectiveness.config import (
|
|
73
|
+
ConfigInjectionEffectivenessConsumer,
|
|
74
|
+
)
|
|
75
|
+
from omnibase_infra.services.observability.injection_effectiveness.models import (
|
|
76
|
+
ModelAgentMatchEvent,
|
|
77
|
+
ModelContextUtilizationEvent,
|
|
78
|
+
ModelLatencyBreakdownEvent,
|
|
79
|
+
)
|
|
80
|
+
from omnibase_infra.services.observability.injection_effectiveness.writer_postgres import (
|
|
81
|
+
WriterInjectionEffectivenessPostgres,
|
|
82
|
+
)
|
|
83
|
+
|
|
84
|
+
if TYPE_CHECKING:
|
|
85
|
+
from aiokafka.structs import ConsumerRecord
|
|
86
|
+
|
|
87
|
+
logger = logging.getLogger(__name__)
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
# =============================================================================
|
|
91
|
+
# Utility Functions
|
|
92
|
+
# =============================================================================
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def mask_dsn_password(dsn: str) -> str:
|
|
96
|
+
"""Mask password in a PostgreSQL DSN for safe logging.
|
|
97
|
+
|
|
98
|
+
Parses the DSN and replaces any password component with '***'.
|
|
99
|
+
Handles standard PostgreSQL connection string formats.
|
|
100
|
+
|
|
101
|
+
Args:
|
|
102
|
+
dsn: PostgreSQL connection string, e.g.,
|
|
103
|
+
'postgresql://user:password@host:port/db'
|
|
104
|
+
|
|
105
|
+
Returns:
|
|
106
|
+
DSN with password replaced by '***'. If parsing fails or no password
|
|
107
|
+
is present, returns the original DSN (safe - no password to mask).
|
|
108
|
+
|
|
109
|
+
Examples:
|
|
110
|
+
>>> mask_dsn_password("postgresql://user:secret@localhost:5432/db")
|
|
111
|
+
'postgresql://user:***@localhost:5432/db'
|
|
112
|
+
|
|
113
|
+
>>> mask_dsn_password("postgresql://user@localhost/db")
|
|
114
|
+
'postgresql://user@localhost/db'
|
|
115
|
+
|
|
116
|
+
>>> mask_dsn_password("invalid-dsn")
|
|
117
|
+
'invalid-dsn'
|
|
118
|
+
"""
|
|
119
|
+
try:
|
|
120
|
+
parsed = urlparse(dsn)
|
|
121
|
+
|
|
122
|
+
# No password present - safe to return as-is
|
|
123
|
+
if not parsed.password:
|
|
124
|
+
return dsn
|
|
125
|
+
|
|
126
|
+
# Reconstruct netloc with masked password
|
|
127
|
+
# Format: user:***@host:port or user:***@host
|
|
128
|
+
if parsed.port:
|
|
129
|
+
masked_netloc = f"{parsed.username}:***@{parsed.hostname}:{parsed.port}"
|
|
130
|
+
else:
|
|
131
|
+
masked_netloc = f"{parsed.username}:***@{parsed.hostname}"
|
|
132
|
+
|
|
133
|
+
# Reconstruct the full DSN with masked password
|
|
134
|
+
masked = urlunparse(
|
|
135
|
+
(
|
|
136
|
+
parsed.scheme,
|
|
137
|
+
masked_netloc,
|
|
138
|
+
parsed.path,
|
|
139
|
+
parsed.params,
|
|
140
|
+
parsed.query,
|
|
141
|
+
parsed.fragment,
|
|
142
|
+
)
|
|
143
|
+
)
|
|
144
|
+
return masked
|
|
145
|
+
|
|
146
|
+
except Exception:
|
|
147
|
+
# If parsing fails, return original (likely no password to mask)
|
|
148
|
+
# Log at debug level to avoid noise
|
|
149
|
+
logger.debug("Failed to parse DSN for masking, returning as-is")
|
|
150
|
+
return dsn
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
# =============================================================================
|
|
154
|
+
# Type Aliases and Constants
|
|
155
|
+
# =============================================================================
|
|
156
|
+
|
|
157
|
+
# Map topics to their Pydantic model class
|
|
158
|
+
TOPIC_TO_MODEL: dict[str, type[BaseModel]] = {
|
|
159
|
+
"onex.evt.omniclaude.context-utilization.v1": ModelContextUtilizationEvent,
|
|
160
|
+
"onex.evt.omniclaude.agent-match.v1": ModelAgentMatchEvent,
|
|
161
|
+
"onex.evt.omniclaude.latency-breakdown.v1": ModelLatencyBreakdownEvent,
|
|
162
|
+
}
|
|
163
|
+
|
|
164
|
+
# Map topics to writer method names
|
|
165
|
+
TOPIC_TO_WRITER_METHOD: dict[str, str] = {
|
|
166
|
+
"onex.evt.omniclaude.context-utilization.v1": "write_context_utilization",
|
|
167
|
+
"onex.evt.omniclaude.agent-match.v1": "write_agent_match",
|
|
168
|
+
"onex.evt.omniclaude.latency-breakdown.v1": "write_latency_breakdowns",
|
|
169
|
+
}
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
# =============================================================================
|
|
173
|
+
# Enums
|
|
174
|
+
# =============================================================================
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
class EnumHealthStatus(StrEnum):
|
|
178
|
+
"""Health check status values.
|
|
179
|
+
|
|
180
|
+
Used by the health check endpoint to indicate consumer health.
|
|
181
|
+
|
|
182
|
+
Status Semantics:
|
|
183
|
+
HEALTHY: Consumer running, circuit closed, recent successful write
|
|
184
|
+
DEGRADED: Consumer running but circuit open (retrying)
|
|
185
|
+
UNHEALTHY: Consumer stopped or no writes for extended period
|
|
186
|
+
"""
|
|
187
|
+
|
|
188
|
+
HEALTHY = "healthy"
|
|
189
|
+
DEGRADED = "degraded"
|
|
190
|
+
UNHEALTHY = "unhealthy"
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
# =============================================================================
|
|
194
|
+
# Consumer Metrics
|
|
195
|
+
# =============================================================================
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
class ConsumerMetrics:
|
|
199
|
+
"""Metrics tracking for the injection effectiveness consumer.
|
|
200
|
+
|
|
201
|
+
Tracks processing statistics for observability and monitoring.
|
|
202
|
+
Thread-safe via asyncio lock protection.
|
|
203
|
+
|
|
204
|
+
Attributes:
|
|
205
|
+
messages_received: Total messages received from Kafka.
|
|
206
|
+
messages_processed: Successfully processed messages.
|
|
207
|
+
messages_failed: Messages that failed processing.
|
|
208
|
+
messages_skipped: Messages skipped (invalid, duplicate, etc.).
|
|
209
|
+
batches_processed: Number of batches successfully processed.
|
|
210
|
+
commit_failures: Number of offset commit failures (tracks persistent issues).
|
|
211
|
+
last_poll_at: Timestamp of last Kafka poll.
|
|
212
|
+
last_successful_write_at: Timestamp of last successful database write.
|
|
213
|
+
last_commit_failure_at: Timestamp of last commit failure (for diagnostics).
|
|
214
|
+
started_at: Timestamp when metrics were initialized (consumer start time).
|
|
215
|
+
"""
|
|
216
|
+
|
|
217
|
+
def __init__(self) -> None:
|
|
218
|
+
"""Initialize metrics with zero values."""
|
|
219
|
+
self.messages_received: int = 0
|
|
220
|
+
self.messages_processed: int = 0
|
|
221
|
+
self.messages_failed: int = 0
|
|
222
|
+
self.messages_skipped: int = 0
|
|
223
|
+
self.batches_processed: int = 0
|
|
224
|
+
self.commit_failures: int = 0
|
|
225
|
+
self.last_poll_at: datetime | None = None
|
|
226
|
+
self.last_successful_write_at: datetime | None = None
|
|
227
|
+
self.last_commit_failure_at: datetime | None = None
|
|
228
|
+
self.started_at: datetime = datetime.now(UTC)
|
|
229
|
+
self._lock = asyncio.Lock()
|
|
230
|
+
|
|
231
|
+
async def record_received(self, count: int = 1) -> None:
|
|
232
|
+
"""Record messages received."""
|
|
233
|
+
async with self._lock:
|
|
234
|
+
self.messages_received += count
|
|
235
|
+
self.last_poll_at = datetime.now(UTC)
|
|
236
|
+
|
|
237
|
+
async def record_processed(self, count: int = 1) -> None:
|
|
238
|
+
"""Record successfully processed messages."""
|
|
239
|
+
async with self._lock:
|
|
240
|
+
self.messages_processed += count
|
|
241
|
+
self.last_successful_write_at = datetime.now(UTC)
|
|
242
|
+
|
|
243
|
+
async def record_failed(self, count: int = 1) -> None:
|
|
244
|
+
"""Record failed messages."""
|
|
245
|
+
async with self._lock:
|
|
246
|
+
self.messages_failed += count
|
|
247
|
+
|
|
248
|
+
async def record_skipped(self, count: int = 1) -> None:
|
|
249
|
+
"""Record skipped messages."""
|
|
250
|
+
async with self._lock:
|
|
251
|
+
self.messages_skipped += count
|
|
252
|
+
|
|
253
|
+
async def record_batch_processed(self) -> None:
|
|
254
|
+
"""Record a successfully processed batch."""
|
|
255
|
+
async with self._lock:
|
|
256
|
+
self.batches_processed += 1
|
|
257
|
+
|
|
258
|
+
async def record_polled(self) -> None:
|
|
259
|
+
"""Record a poll attempt (updates last_poll_at regardless of message count).
|
|
260
|
+
|
|
261
|
+
This method should be called after every successful Kafka poll, even when
|
|
262
|
+
the poll returns no messages. This prevents false DEGRADED health status
|
|
263
|
+
on low-traffic topics where empty polls are normal.
|
|
264
|
+
|
|
265
|
+
See: CodeRabbit PR #220 feedback - last_poll_at was only updated via
|
|
266
|
+
record_received(), causing stale timestamps on empty polls.
|
|
267
|
+
"""
|
|
268
|
+
async with self._lock:
|
|
269
|
+
self.last_poll_at = datetime.now(UTC)
|
|
270
|
+
|
|
271
|
+
async def record_commit_failure(self) -> None:
|
|
272
|
+
"""Record an offset commit failure for tracking consecutive failures.
|
|
273
|
+
|
|
274
|
+
Commit failures don't lose data (messages will be reprocessed on restart),
|
|
275
|
+
but persistent failures may indicate Kafka connectivity issues that require
|
|
276
|
+
investigation. This metric tracks consecutive failures - a successful commit
|
|
277
|
+
resets the counter via reset_commit_failures().
|
|
278
|
+
"""
|
|
279
|
+
async with self._lock:
|
|
280
|
+
self.commit_failures += 1
|
|
281
|
+
self.last_commit_failure_at = datetime.now(UTC)
|
|
282
|
+
|
|
283
|
+
async def reset_commit_failures(self) -> None:
|
|
284
|
+
"""Reset consecutive commit failure counter after successful commit.
|
|
285
|
+
|
|
286
|
+
Called after a successful offset commit to reset the consecutive failure
|
|
287
|
+
tracking. This ensures the "persistent failures" warning only triggers
|
|
288
|
+
when failures are truly consecutive, not spread across time.
|
|
289
|
+
"""
|
|
290
|
+
async with self._lock:
|
|
291
|
+
self.commit_failures = 0
|
|
292
|
+
|
|
293
|
+
async def snapshot(self) -> dict[str, object]:
|
|
294
|
+
"""Get a snapshot of current metrics.
|
|
295
|
+
|
|
296
|
+
Returns:
|
|
297
|
+
Dictionary with all metric values.
|
|
298
|
+
"""
|
|
299
|
+
async with self._lock:
|
|
300
|
+
return {
|
|
301
|
+
"messages_received": self.messages_received,
|
|
302
|
+
"messages_processed": self.messages_processed,
|
|
303
|
+
"messages_failed": self.messages_failed,
|
|
304
|
+
"messages_skipped": self.messages_skipped,
|
|
305
|
+
"batches_processed": self.batches_processed,
|
|
306
|
+
"commit_failures": self.commit_failures,
|
|
307
|
+
"last_poll_at": (
|
|
308
|
+
self.last_poll_at.isoformat() if self.last_poll_at else None
|
|
309
|
+
),
|
|
310
|
+
"last_successful_write_at": (
|
|
311
|
+
self.last_successful_write_at.isoformat()
|
|
312
|
+
if self.last_successful_write_at
|
|
313
|
+
else None
|
|
314
|
+
),
|
|
315
|
+
"last_commit_failure_at": (
|
|
316
|
+
self.last_commit_failure_at.isoformat()
|
|
317
|
+
if self.last_commit_failure_at
|
|
318
|
+
else None
|
|
319
|
+
),
|
|
320
|
+
"started_at": self.started_at.isoformat(),
|
|
321
|
+
}
|
|
322
|
+
|
|
323
|
+
|
|
324
|
+
# =============================================================================
|
|
325
|
+
# Injection Effectiveness Consumer
|
|
326
|
+
# =============================================================================
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
class InjectionEffectivenessConsumer:
|
|
330
|
+
"""Async Kafka consumer for injection effectiveness events.
|
|
331
|
+
|
|
332
|
+
Consumes events from multiple injection effectiveness topics and persists them
|
|
333
|
+
to PostgreSQL. Implements at-least-once delivery with per-partition
|
|
334
|
+
offset tracking to ensure no message loss on partial batch failures.
|
|
335
|
+
|
|
336
|
+
Features:
|
|
337
|
+
- **Per-partition offset tracking**: Commit only successfully persisted
|
|
338
|
+
partitions. Partial batch failures do not cause message loss.
|
|
339
|
+
|
|
340
|
+
- **Batch processing**: Configurable batch size and timeout for
|
|
341
|
+
efficient database writes via executemany.
|
|
342
|
+
|
|
343
|
+
- **Circuit breaker**: Database resilience via writer's circuit breaker.
|
|
344
|
+
Consumer degrades gracefully when database is unavailable.
|
|
345
|
+
|
|
346
|
+
- **Health check endpoint**: HTTP server for Kubernetes liveness
|
|
347
|
+
and readiness probes.
|
|
348
|
+
|
|
349
|
+
- **Graceful shutdown**: Signal handling with drain and final commit.
|
|
350
|
+
|
|
351
|
+
Thread Safety:
|
|
352
|
+
This consumer is designed for single-threaded async execution.
|
|
353
|
+
Multiple consumers can run with different group_ids for horizontal
|
|
354
|
+
scaling (partition assignment via Kafka consumer groups).
|
|
355
|
+
|
|
356
|
+
Example:
|
|
357
|
+
>>> config = ConfigInjectionEffectivenessConsumer(
|
|
358
|
+
... kafka_bootstrap_servers="localhost:9092",
|
|
359
|
+
... postgres_dsn="postgresql://postgres:secret@localhost:5432/omninode_bridge",
|
|
360
|
+
... )
|
|
361
|
+
>>> consumer = InjectionEffectivenessConsumer(config)
|
|
362
|
+
>>>
|
|
363
|
+
>>> await consumer.start()
|
|
364
|
+
>>> try:
|
|
365
|
+
... await consumer.run()
|
|
366
|
+
... finally:
|
|
367
|
+
... await consumer.stop()
|
|
368
|
+
|
|
369
|
+
Attributes:
|
|
370
|
+
metrics: Consumer metrics for observability.
|
|
371
|
+
is_running: Whether the consumer is currently running.
|
|
372
|
+
"""
|
|
373
|
+
|
|
374
|
+
def __init__(self, config: ConfigInjectionEffectivenessConsumer) -> None:
|
|
375
|
+
"""Initialize the injection effectiveness consumer.
|
|
376
|
+
|
|
377
|
+
Args:
|
|
378
|
+
config: Consumer configuration (Kafka, PostgreSQL, batch settings).
|
|
379
|
+
|
|
380
|
+
Example:
|
|
381
|
+
>>> config = ConfigInjectionEffectivenessConsumer(
|
|
382
|
+
... kafka_bootstrap_servers="localhost:9092",
|
|
383
|
+
... postgres_dsn="postgresql://postgres:secret@localhost:5432/omninode_bridge",
|
|
384
|
+
... )
|
|
385
|
+
>>> consumer = InjectionEffectivenessConsumer(config)
|
|
386
|
+
"""
|
|
387
|
+
self._config = config
|
|
388
|
+
self._consumer: AIOKafkaConsumer | None = None
|
|
389
|
+
self._pool: asyncpg.Pool | None = None
|
|
390
|
+
self._writer: WriterInjectionEffectivenessPostgres | None = None
|
|
391
|
+
self._running = False
|
|
392
|
+
self._shutdown_event = asyncio.Event()
|
|
393
|
+
|
|
394
|
+
# Health check server
|
|
395
|
+
self._health_app: web.Application | None = None
|
|
396
|
+
self._health_runner: web.AppRunner | None = None
|
|
397
|
+
self._health_site: web.TCPSite | None = None
|
|
398
|
+
|
|
399
|
+
# Metrics
|
|
400
|
+
self.metrics = ConsumerMetrics()
|
|
401
|
+
|
|
402
|
+
# Consumer ID for logging
|
|
403
|
+
self._consumer_id = f"injection-effectiveness-consumer-{uuid4().hex[:8]}"
|
|
404
|
+
|
|
405
|
+
logger.info(
|
|
406
|
+
"InjectionEffectivenessConsumer initialized",
|
|
407
|
+
extra={
|
|
408
|
+
"consumer_id": self._consumer_id,
|
|
409
|
+
"topics": self._config.topics,
|
|
410
|
+
"group_id": self._config.kafka_group_id,
|
|
411
|
+
"bootstrap_servers": self._config.kafka_bootstrap_servers,
|
|
412
|
+
"postgres_dsn": mask_dsn_password(self._config.postgres_dsn),
|
|
413
|
+
"batch_size": self._config.batch_size,
|
|
414
|
+
"batch_timeout_ms": self._config.batch_timeout_ms,
|
|
415
|
+
},
|
|
416
|
+
)
|
|
417
|
+
|
|
418
|
+
# =========================================================================
|
|
419
|
+
# Properties
|
|
420
|
+
# =========================================================================
|
|
421
|
+
|
|
422
|
+
@property
|
|
423
|
+
def is_running(self) -> bool:
|
|
424
|
+
"""Check if the consumer is currently running.
|
|
425
|
+
|
|
426
|
+
Returns:
|
|
427
|
+
True if start() has been called and stop() has not.
|
|
428
|
+
"""
|
|
429
|
+
return self._running
|
|
430
|
+
|
|
431
|
+
@property
|
|
432
|
+
def consumer_id(self) -> str:
|
|
433
|
+
"""Get the unique consumer identifier.
|
|
434
|
+
|
|
435
|
+
Returns:
|
|
436
|
+
Consumer ID string for logging and tracing.
|
|
437
|
+
"""
|
|
438
|
+
return self._consumer_id
|
|
439
|
+
|
|
440
|
+
# =========================================================================
|
|
441
|
+
# Lifecycle Methods
|
|
442
|
+
# =========================================================================
|
|
443
|
+
|
|
444
|
+
async def start(self) -> None:
|
|
445
|
+
"""Start the consumer, pool, writer, and health check server.
|
|
446
|
+
|
|
447
|
+
Creates the asyncpg pool, initializes the writer, creates the Kafka
|
|
448
|
+
consumer, and starts the health check HTTP server.
|
|
449
|
+
|
|
450
|
+
Raises:
|
|
451
|
+
RuntimeError: If the consumer is already running.
|
|
452
|
+
asyncpg.PostgresError: If database connection fails.
|
|
453
|
+
KafkaError: If Kafka connection fails.
|
|
454
|
+
|
|
455
|
+
Example:
|
|
456
|
+
>>> await consumer.start()
|
|
457
|
+
>>> # Consumer is now connected, ready for run()
|
|
458
|
+
"""
|
|
459
|
+
if self._running:
|
|
460
|
+
logger.warning(
|
|
461
|
+
"Consumer already running",
|
|
462
|
+
extra={"consumer_id": self._consumer_id},
|
|
463
|
+
)
|
|
464
|
+
return
|
|
465
|
+
|
|
466
|
+
correlation_id = uuid4()
|
|
467
|
+
|
|
468
|
+
logger.info(
|
|
469
|
+
"Starting InjectionEffectivenessConsumer",
|
|
470
|
+
extra={
|
|
471
|
+
"consumer_id": self._consumer_id,
|
|
472
|
+
"correlation_id": str(correlation_id),
|
|
473
|
+
"topics": self._config.topics,
|
|
474
|
+
},
|
|
475
|
+
)
|
|
476
|
+
|
|
477
|
+
try:
|
|
478
|
+
# Create PostgreSQL pool with configurable sizes
|
|
479
|
+
self._pool = await asyncpg.create_pool(
|
|
480
|
+
dsn=self._config.postgres_dsn,
|
|
481
|
+
min_size=self._config.pool_min_size,
|
|
482
|
+
max_size=self._config.pool_max_size,
|
|
483
|
+
)
|
|
484
|
+
logger.info(
|
|
485
|
+
"PostgreSQL pool created",
|
|
486
|
+
extra={
|
|
487
|
+
"consumer_id": self._consumer_id,
|
|
488
|
+
"correlation_id": str(correlation_id),
|
|
489
|
+
"postgres_dsn": mask_dsn_password(self._config.postgres_dsn),
|
|
490
|
+
},
|
|
491
|
+
)
|
|
492
|
+
|
|
493
|
+
# Create writer with pool injection
|
|
494
|
+
self._writer = WriterInjectionEffectivenessPostgres(
|
|
495
|
+
pool=self._pool,
|
|
496
|
+
circuit_breaker_threshold=self._config.circuit_breaker_threshold,
|
|
497
|
+
circuit_breaker_reset_timeout=self._config.circuit_breaker_reset_timeout,
|
|
498
|
+
circuit_breaker_half_open_successes=self._config.circuit_breaker_half_open_successes,
|
|
499
|
+
minimum_support_threshold=self._config.min_pattern_support,
|
|
500
|
+
)
|
|
501
|
+
|
|
502
|
+
# Create Kafka consumer
|
|
503
|
+
self._consumer = AIOKafkaConsumer(
|
|
504
|
+
*self._config.topics,
|
|
505
|
+
bootstrap_servers=self._config.kafka_bootstrap_servers,
|
|
506
|
+
group_id=self._config.kafka_group_id,
|
|
507
|
+
auto_offset_reset=self._config.auto_offset_reset,
|
|
508
|
+
enable_auto_commit=False, # Manual commits for at-least-once
|
|
509
|
+
max_poll_records=self._config.batch_size,
|
|
510
|
+
)
|
|
511
|
+
|
|
512
|
+
await self._consumer.start()
|
|
513
|
+
logger.info(
|
|
514
|
+
"Kafka consumer started",
|
|
515
|
+
extra={
|
|
516
|
+
"consumer_id": self._consumer_id,
|
|
517
|
+
"correlation_id": str(correlation_id),
|
|
518
|
+
"topics": self._config.topics,
|
|
519
|
+
"group_id": self._config.kafka_group_id,
|
|
520
|
+
},
|
|
521
|
+
)
|
|
522
|
+
|
|
523
|
+
# Start health check server
|
|
524
|
+
await self._start_health_server()
|
|
525
|
+
|
|
526
|
+
self._running = True
|
|
527
|
+
self._shutdown_event.clear()
|
|
528
|
+
|
|
529
|
+
logger.info(
|
|
530
|
+
"InjectionEffectivenessConsumer started",
|
|
531
|
+
extra={
|
|
532
|
+
"consumer_id": self._consumer_id,
|
|
533
|
+
"correlation_id": str(correlation_id),
|
|
534
|
+
},
|
|
535
|
+
)
|
|
536
|
+
|
|
537
|
+
except Exception as e:
|
|
538
|
+
logger.exception(
|
|
539
|
+
"Failed to start consumer",
|
|
540
|
+
extra={
|
|
541
|
+
"consumer_id": self._consumer_id,
|
|
542
|
+
"correlation_id": str(correlation_id),
|
|
543
|
+
"error": str(e),
|
|
544
|
+
},
|
|
545
|
+
)
|
|
546
|
+
# Cleanup any partial initialization
|
|
547
|
+
await self._cleanup_resources(correlation_id)
|
|
548
|
+
raise
|
|
549
|
+
|
|
550
|
+
async def stop(self) -> None:
|
|
551
|
+
"""Stop the consumer gracefully.
|
|
552
|
+
|
|
553
|
+
Signals the consume loop to exit, waits for in-flight processing,
|
|
554
|
+
commits final offsets, and closes all connections. Safe to call
|
|
555
|
+
multiple times.
|
|
556
|
+
|
|
557
|
+
Example:
|
|
558
|
+
>>> await consumer.stop()
|
|
559
|
+
>>> # Consumer is now stopped and disconnected
|
|
560
|
+
"""
|
|
561
|
+
if not self._running:
|
|
562
|
+
logger.debug(
|
|
563
|
+
"Consumer not running, nothing to stop",
|
|
564
|
+
extra={"consumer_id": self._consumer_id},
|
|
565
|
+
)
|
|
566
|
+
return
|
|
567
|
+
|
|
568
|
+
correlation_id = uuid4()
|
|
569
|
+
|
|
570
|
+
logger.info(
|
|
571
|
+
"Stopping InjectionEffectivenessConsumer",
|
|
572
|
+
extra={
|
|
573
|
+
"consumer_id": self._consumer_id,
|
|
574
|
+
"correlation_id": str(correlation_id),
|
|
575
|
+
},
|
|
576
|
+
)
|
|
577
|
+
|
|
578
|
+
# Signal shutdown
|
|
579
|
+
self._running = False
|
|
580
|
+
self._shutdown_event.set()
|
|
581
|
+
|
|
582
|
+
# Cleanup resources
|
|
583
|
+
await self._cleanup_resources(correlation_id)
|
|
584
|
+
|
|
585
|
+
# Log final metrics
|
|
586
|
+
metrics_snapshot = await self.metrics.snapshot()
|
|
587
|
+
logger.info(
|
|
588
|
+
"InjectionEffectivenessConsumer stopped",
|
|
589
|
+
extra={
|
|
590
|
+
"consumer_id": self._consumer_id,
|
|
591
|
+
"correlation_id": str(correlation_id),
|
|
592
|
+
"final_metrics": metrics_snapshot,
|
|
593
|
+
},
|
|
594
|
+
)
|
|
595
|
+
|
|
596
|
+
async def _cleanup_resources(self, correlation_id: UUID) -> None:
|
|
597
|
+
"""Clean up all resources during shutdown.
|
|
598
|
+
|
|
599
|
+
Args:
|
|
600
|
+
correlation_id: Correlation ID for logging.
|
|
601
|
+
"""
|
|
602
|
+
# Stop health check server
|
|
603
|
+
if self._health_site is not None:
|
|
604
|
+
await self._health_site.stop()
|
|
605
|
+
self._health_site = None
|
|
606
|
+
|
|
607
|
+
if self._health_runner is not None:
|
|
608
|
+
await self._health_runner.cleanup()
|
|
609
|
+
self._health_runner = None
|
|
610
|
+
|
|
611
|
+
self._health_app = None
|
|
612
|
+
|
|
613
|
+
# Stop Kafka consumer
|
|
614
|
+
if self._consumer is not None:
|
|
615
|
+
try:
|
|
616
|
+
await self._consumer.stop()
|
|
617
|
+
except Exception as e:
|
|
618
|
+
logger.warning(
|
|
619
|
+
"Error stopping Kafka consumer",
|
|
620
|
+
extra={
|
|
621
|
+
"consumer_id": self._consumer_id,
|
|
622
|
+
"correlation_id": str(correlation_id),
|
|
623
|
+
"error": str(e),
|
|
624
|
+
},
|
|
625
|
+
)
|
|
626
|
+
finally:
|
|
627
|
+
self._consumer = None
|
|
628
|
+
|
|
629
|
+
# Close PostgreSQL pool
|
|
630
|
+
if self._pool is not None:
|
|
631
|
+
try:
|
|
632
|
+
await self._pool.close()
|
|
633
|
+
except Exception as e:
|
|
634
|
+
logger.warning(
|
|
635
|
+
"Error closing PostgreSQL pool",
|
|
636
|
+
extra={
|
|
637
|
+
"consumer_id": self._consumer_id,
|
|
638
|
+
"correlation_id": str(correlation_id),
|
|
639
|
+
"error": str(e),
|
|
640
|
+
},
|
|
641
|
+
)
|
|
642
|
+
finally:
|
|
643
|
+
self._pool = None
|
|
644
|
+
|
|
645
|
+
self._writer = None
|
|
646
|
+
|
|
647
|
+
async def run(self) -> None:
|
|
648
|
+
"""Run the main consume loop.
|
|
649
|
+
|
|
650
|
+
Continuously consumes messages from Kafka topics, processes them
|
|
651
|
+
in batches, and writes to PostgreSQL. Implements at-least-once
|
|
652
|
+
delivery by committing offsets only after successful writes.
|
|
653
|
+
|
|
654
|
+
This method blocks until stop() is called or an unrecoverable error
|
|
655
|
+
occurs. Use this after calling start().
|
|
656
|
+
|
|
657
|
+
Example:
|
|
658
|
+
>>> await consumer.start()
|
|
659
|
+
>>> try:
|
|
660
|
+
... await consumer.run()
|
|
661
|
+
... finally:
|
|
662
|
+
... await consumer.stop()
|
|
663
|
+
"""
|
|
664
|
+
if not self._running or self._consumer is None:
|
|
665
|
+
raise OnexError(
|
|
666
|
+
"Consumer not started. Call start() before run().",
|
|
667
|
+
)
|
|
668
|
+
|
|
669
|
+
correlation_id = uuid4()
|
|
670
|
+
|
|
671
|
+
logger.info(
|
|
672
|
+
"Starting consume loop",
|
|
673
|
+
extra={
|
|
674
|
+
"consumer_id": self._consumer_id,
|
|
675
|
+
"correlation_id": str(correlation_id),
|
|
676
|
+
},
|
|
677
|
+
)
|
|
678
|
+
|
|
679
|
+
await self._consume_loop(correlation_id)
|
|
680
|
+
|
|
681
|
+
async def __aenter__(self) -> InjectionEffectivenessConsumer:
|
|
682
|
+
"""Async context manager entry.
|
|
683
|
+
|
|
684
|
+
Starts the consumer and returns self for use in async with blocks.
|
|
685
|
+
|
|
686
|
+
Returns:
|
|
687
|
+
Self for chaining.
|
|
688
|
+
|
|
689
|
+
Example:
|
|
690
|
+
>>> async with InjectionEffectivenessConsumer(config) as consumer:
|
|
691
|
+
... await consumer.run()
|
|
692
|
+
"""
|
|
693
|
+
await self.start()
|
|
694
|
+
return self
|
|
695
|
+
|
|
696
|
+
async def __aexit__(
|
|
697
|
+
self,
|
|
698
|
+
exc_type: type[BaseException] | None,
|
|
699
|
+
exc_val: BaseException | None,
|
|
700
|
+
exc_tb: object,
|
|
701
|
+
) -> None:
|
|
702
|
+
"""Async context manager exit.
|
|
703
|
+
|
|
704
|
+
Stops the consumer on exit from async with block.
|
|
705
|
+
"""
|
|
706
|
+
await self.stop()
|
|
707
|
+
|
|
708
|
+
# =========================================================================
|
|
709
|
+
# Consume Loop
|
|
710
|
+
# =========================================================================
|
|
711
|
+
|
|
712
|
+
async def _consume_loop(self, correlation_id: UUID) -> None:
|
|
713
|
+
"""Main consumption loop with batch processing.
|
|
714
|
+
|
|
715
|
+
Polls Kafka for messages, accumulates batches, processes them,
|
|
716
|
+
and commits offsets for successfully written partitions only.
|
|
717
|
+
|
|
718
|
+
Args:
|
|
719
|
+
correlation_id: Correlation ID for tracing this consume session.
|
|
720
|
+
"""
|
|
721
|
+
if self._consumer is None:
|
|
722
|
+
logger.error(
|
|
723
|
+
"Consumer is None in consume loop",
|
|
724
|
+
extra={
|
|
725
|
+
"consumer_id": self._consumer_id,
|
|
726
|
+
"correlation_id": str(correlation_id),
|
|
727
|
+
},
|
|
728
|
+
)
|
|
729
|
+
return
|
|
730
|
+
|
|
731
|
+
batch_timeout_seconds = self._config.batch_timeout_ms / 1000.0
|
|
732
|
+
|
|
733
|
+
try:
|
|
734
|
+
while self._running:
|
|
735
|
+
# Poll with timeout for batch accumulation
|
|
736
|
+
try:
|
|
737
|
+
records = await asyncio.wait_for(
|
|
738
|
+
self._consumer.getmany(
|
|
739
|
+
timeout_ms=self._config.batch_timeout_ms,
|
|
740
|
+
max_records=self._config.batch_size,
|
|
741
|
+
),
|
|
742
|
+
timeout=batch_timeout_seconds
|
|
743
|
+
+ self._config.poll_timeout_buffer_seconds,
|
|
744
|
+
)
|
|
745
|
+
except TimeoutError:
|
|
746
|
+
# Poll timeout is normal, continue loop
|
|
747
|
+
continue
|
|
748
|
+
|
|
749
|
+
# Record poll time even if no messages - prevents false DEGRADED
|
|
750
|
+
# health status on low-traffic topics (CodeRabbit PR #220 feedback)
|
|
751
|
+
await self.metrics.record_polled()
|
|
752
|
+
|
|
753
|
+
if not records:
|
|
754
|
+
continue
|
|
755
|
+
|
|
756
|
+
# Flatten all messages from all partitions
|
|
757
|
+
messages: list[ConsumerRecord] = []
|
|
758
|
+
for tp_messages in records.values():
|
|
759
|
+
messages.extend(tp_messages)
|
|
760
|
+
|
|
761
|
+
if not messages:
|
|
762
|
+
continue
|
|
763
|
+
|
|
764
|
+
await self.metrics.record_received(len(messages))
|
|
765
|
+
|
|
766
|
+
# Process batch and get successful offsets per partition
|
|
767
|
+
batch_correlation_id = uuid4()
|
|
768
|
+
successful_offsets = await self._process_batch(
|
|
769
|
+
messages, batch_correlation_id
|
|
770
|
+
)
|
|
771
|
+
|
|
772
|
+
# Commit only successful offsets
|
|
773
|
+
if successful_offsets:
|
|
774
|
+
await self._commit_offsets(successful_offsets, batch_correlation_id)
|
|
775
|
+
await self.metrics.record_batch_processed()
|
|
776
|
+
|
|
777
|
+
except asyncio.CancelledError:
|
|
778
|
+
logger.info(
|
|
779
|
+
"Consume loop cancelled",
|
|
780
|
+
extra={
|
|
781
|
+
"consumer_id": self._consumer_id,
|
|
782
|
+
"correlation_id": str(correlation_id),
|
|
783
|
+
},
|
|
784
|
+
)
|
|
785
|
+
raise
|
|
786
|
+
|
|
787
|
+
except KafkaError as e:
|
|
788
|
+
logger.exception(
|
|
789
|
+
"Kafka error in consume loop",
|
|
790
|
+
extra={
|
|
791
|
+
"consumer_id": self._consumer_id,
|
|
792
|
+
"correlation_id": str(correlation_id),
|
|
793
|
+
"error": str(e),
|
|
794
|
+
},
|
|
795
|
+
)
|
|
796
|
+
raise
|
|
797
|
+
|
|
798
|
+
except Exception as e:
|
|
799
|
+
logger.exception(
|
|
800
|
+
"Unexpected error in consume loop",
|
|
801
|
+
extra={
|
|
802
|
+
"consumer_id": self._consumer_id,
|
|
803
|
+
"correlation_id": str(correlation_id),
|
|
804
|
+
"error": str(e),
|
|
805
|
+
},
|
|
806
|
+
)
|
|
807
|
+
raise
|
|
808
|
+
|
|
809
|
+
finally:
|
|
810
|
+
logger.info(
|
|
811
|
+
"Consume loop exiting",
|
|
812
|
+
extra={
|
|
813
|
+
"consumer_id": self._consumer_id,
|
|
814
|
+
"correlation_id": str(correlation_id),
|
|
815
|
+
},
|
|
816
|
+
)
|
|
817
|
+
|
|
818
|
+
# =========================================================================
|
|
819
|
+
# Batch Processing
|
|
820
|
+
# =========================================================================
|
|
821
|
+
|
|
822
|
+
@staticmethod
|
|
823
|
+
def _track_skipped_offset(
|
|
824
|
+
skipped_offsets: dict[TopicPartition, int],
|
|
825
|
+
msg: ConsumerRecord,
|
|
826
|
+
) -> None:
|
|
827
|
+
"""Track offset for a skipped message to enable commit after processing.
|
|
828
|
+
|
|
829
|
+
Skipped messages (tombstones, invalid UTF-8, JSON errors, validation errors)
|
|
830
|
+
must have their offsets committed to avoid reprocessing. This helper updates
|
|
831
|
+
the skipped_offsets dict with the highest offset seen for each partition.
|
|
832
|
+
|
|
833
|
+
Args:
|
|
834
|
+
skipped_offsets: Dictionary mapping TopicPartition to highest skipped offset.
|
|
835
|
+
msg: The ConsumerRecord being skipped.
|
|
836
|
+
"""
|
|
837
|
+
tp = TopicPartition(msg.topic, msg.partition)
|
|
838
|
+
current = skipped_offsets.get(tp, -1)
|
|
839
|
+
skipped_offsets[tp] = max(current, msg.offset)
|
|
840
|
+
|
|
841
|
+
async def _process_batch(
|
|
842
|
+
self,
|
|
843
|
+
messages: list[ConsumerRecord],
|
|
844
|
+
correlation_id: UUID,
|
|
845
|
+
) -> dict[TopicPartition, int]:
|
|
846
|
+
"""Process batch and return highest successful offset per partition.
|
|
847
|
+
|
|
848
|
+
Groups messages by topic, validates them, writes each topic's batch
|
|
849
|
+
to PostgreSQL, and tracks successful offsets per partition.
|
|
850
|
+
|
|
851
|
+
Args:
|
|
852
|
+
messages: List of Kafka ConsumerRecords to process.
|
|
853
|
+
correlation_id: Correlation ID for tracing.
|
|
854
|
+
|
|
855
|
+
Returns:
|
|
856
|
+
Dictionary mapping TopicPartition to highest successful offset.
|
|
857
|
+
Only partitions with successful writes are included.
|
|
858
|
+
"""
|
|
859
|
+
if self._writer is None:
|
|
860
|
+
logger.error(
|
|
861
|
+
"Writer is None during batch processing",
|
|
862
|
+
extra={
|
|
863
|
+
"consumer_id": self._consumer_id,
|
|
864
|
+
"correlation_id": str(correlation_id),
|
|
865
|
+
},
|
|
866
|
+
)
|
|
867
|
+
return {}
|
|
868
|
+
|
|
869
|
+
successful_offsets: dict[TopicPartition, int] = {}
|
|
870
|
+
# Track skipped message offsets separately to preserve them on write failures
|
|
871
|
+
skipped_offsets: dict[TopicPartition, int] = {}
|
|
872
|
+
parsed_skipped: int = 0
|
|
873
|
+
|
|
874
|
+
# Group messages by topic with their ConsumerRecord for offset tracking
|
|
875
|
+
by_topic: dict[str, list[tuple[ConsumerRecord, BaseModel]]] = {}
|
|
876
|
+
|
|
877
|
+
for msg in messages:
|
|
878
|
+
# Guard against tombstones (compacted topic deletions)
|
|
879
|
+
if msg.value is None:
|
|
880
|
+
logger.warning(
|
|
881
|
+
"Skipping tombstone message",
|
|
882
|
+
extra={
|
|
883
|
+
"consumer_id": self._consumer_id,
|
|
884
|
+
"correlation_id": str(correlation_id),
|
|
885
|
+
"topic": msg.topic,
|
|
886
|
+
"partition": msg.partition,
|
|
887
|
+
"offset": msg.offset,
|
|
888
|
+
},
|
|
889
|
+
)
|
|
890
|
+
parsed_skipped += 1
|
|
891
|
+
self._track_skipped_offset(skipped_offsets, msg)
|
|
892
|
+
continue
|
|
893
|
+
|
|
894
|
+
try:
|
|
895
|
+
# Decode message value with UTF-8 guard
|
|
896
|
+
value = msg.value
|
|
897
|
+
if isinstance(value, bytes):
|
|
898
|
+
try:
|
|
899
|
+
value = value.decode("utf-8")
|
|
900
|
+
except UnicodeDecodeError as e:
|
|
901
|
+
logger.warning(
|
|
902
|
+
"Skipping message with invalid UTF-8 encoding",
|
|
903
|
+
extra={
|
|
904
|
+
"consumer_id": self._consumer_id,
|
|
905
|
+
"correlation_id": str(correlation_id),
|
|
906
|
+
"topic": msg.topic,
|
|
907
|
+
"partition": msg.partition,
|
|
908
|
+
"offset": msg.offset,
|
|
909
|
+
"error": str(e),
|
|
910
|
+
},
|
|
911
|
+
)
|
|
912
|
+
parsed_skipped += 1
|
|
913
|
+
self._track_skipped_offset(skipped_offsets, msg)
|
|
914
|
+
continue
|
|
915
|
+
|
|
916
|
+
payload = json.loads(value)
|
|
917
|
+
|
|
918
|
+
# Get model class for topic
|
|
919
|
+
model_cls = TOPIC_TO_MODEL.get(msg.topic)
|
|
920
|
+
if model_cls is None:
|
|
921
|
+
logger.warning(
|
|
922
|
+
"Unknown topic, skipping message",
|
|
923
|
+
extra={
|
|
924
|
+
"consumer_id": self._consumer_id,
|
|
925
|
+
"correlation_id": str(correlation_id),
|
|
926
|
+
"topic": msg.topic,
|
|
927
|
+
},
|
|
928
|
+
)
|
|
929
|
+
parsed_skipped += 1
|
|
930
|
+
self._track_skipped_offset(skipped_offsets, msg)
|
|
931
|
+
continue
|
|
932
|
+
|
|
933
|
+
# Validate with Pydantic model
|
|
934
|
+
model = model_cls.model_validate(payload)
|
|
935
|
+
by_topic.setdefault(msg.topic, []).append((msg, model))
|
|
936
|
+
|
|
937
|
+
except json.JSONDecodeError as e:
|
|
938
|
+
logger.warning(
|
|
939
|
+
"Failed to decode JSON message",
|
|
940
|
+
extra={
|
|
941
|
+
"consumer_id": self._consumer_id,
|
|
942
|
+
"correlation_id": str(correlation_id),
|
|
943
|
+
"topic": msg.topic,
|
|
944
|
+
"partition": msg.partition,
|
|
945
|
+
"offset": msg.offset,
|
|
946
|
+
"error": str(e),
|
|
947
|
+
},
|
|
948
|
+
)
|
|
949
|
+
parsed_skipped += 1
|
|
950
|
+
self._track_skipped_offset(skipped_offsets, msg)
|
|
951
|
+
|
|
952
|
+
except ValidationError as e:
|
|
953
|
+
logger.warning(
|
|
954
|
+
"Message validation failed",
|
|
955
|
+
extra={
|
|
956
|
+
"consumer_id": self._consumer_id,
|
|
957
|
+
"correlation_id": str(correlation_id),
|
|
958
|
+
"topic": msg.topic,
|
|
959
|
+
"partition": msg.partition,
|
|
960
|
+
"offset": msg.offset,
|
|
961
|
+
"error": str(e),
|
|
962
|
+
},
|
|
963
|
+
)
|
|
964
|
+
parsed_skipped += 1
|
|
965
|
+
self._track_skipped_offset(skipped_offsets, msg)
|
|
966
|
+
|
|
967
|
+
if parsed_skipped > 0:
|
|
968
|
+
await self.metrics.record_skipped(parsed_skipped)
|
|
969
|
+
|
|
970
|
+
# Write each topic's batch to PostgreSQL
|
|
971
|
+
for topic, items in by_topic.items():
|
|
972
|
+
writer_method_name = TOPIC_TO_WRITER_METHOD.get(topic)
|
|
973
|
+
if writer_method_name is None:
|
|
974
|
+
logger.warning(
|
|
975
|
+
"No writer method for topic",
|
|
976
|
+
extra={
|
|
977
|
+
"consumer_id": self._consumer_id,
|
|
978
|
+
"correlation_id": str(correlation_id),
|
|
979
|
+
"topic": topic,
|
|
980
|
+
},
|
|
981
|
+
)
|
|
982
|
+
continue
|
|
983
|
+
|
|
984
|
+
writer_method: Callable[
|
|
985
|
+
[list[BaseModel], UUID], Coroutine[object, object, int]
|
|
986
|
+
] = getattr(self._writer, writer_method_name)
|
|
987
|
+
models = [item[1] for item in items]
|
|
988
|
+
|
|
989
|
+
# Extract correlation_id from events. Models use default_factory=uuid4,
|
|
990
|
+
# so correlation_id is ALWAYS present - this is defensive iteration.
|
|
991
|
+
event_correlation_id: UUID | None = None
|
|
992
|
+
for _, model in items:
|
|
993
|
+
if hasattr(model, "correlation_id"):
|
|
994
|
+
event_correlation_id = model.correlation_id
|
|
995
|
+
break
|
|
996
|
+
|
|
997
|
+
# Use event correlation_id (always present via default_factory), or batch fallback.
|
|
998
|
+
# The assertion guards against impossible state - models guarantee correlation_id.
|
|
999
|
+
writer_correlation_id = event_correlation_id or correlation_id
|
|
1000
|
+
assert writer_correlation_id is not None, (
|
|
1001
|
+
"correlation_id must be present - models use default_factory=uuid4"
|
|
1002
|
+
)
|
|
1003
|
+
|
|
1004
|
+
try:
|
|
1005
|
+
written_count = await writer_method(models, writer_correlation_id)
|
|
1006
|
+
|
|
1007
|
+
# Record successful offsets per partition for this topic
|
|
1008
|
+
for msg, _ in items:
|
|
1009
|
+
tp = TopicPartition(msg.topic, msg.partition)
|
|
1010
|
+
current = successful_offsets.get(tp, -1)
|
|
1011
|
+
successful_offsets[tp] = max(current, msg.offset)
|
|
1012
|
+
|
|
1013
|
+
await self.metrics.record_processed(written_count)
|
|
1014
|
+
|
|
1015
|
+
logger.debug(
|
|
1016
|
+
"Wrote batch for topic",
|
|
1017
|
+
extra={
|
|
1018
|
+
"consumer_id": self._consumer_id,
|
|
1019
|
+
"correlation_id": str(correlation_id),
|
|
1020
|
+
"topic": topic,
|
|
1021
|
+
"count": written_count,
|
|
1022
|
+
},
|
|
1023
|
+
)
|
|
1024
|
+
|
|
1025
|
+
except Exception:
|
|
1026
|
+
# Write failed for this topic - don't update offsets for its partitions
|
|
1027
|
+
logger.exception(
|
|
1028
|
+
"Failed to write batch for topic",
|
|
1029
|
+
extra={
|
|
1030
|
+
"consumer_id": self._consumer_id,
|
|
1031
|
+
"correlation_id": str(correlation_id),
|
|
1032
|
+
"topic": topic,
|
|
1033
|
+
"count": len(models),
|
|
1034
|
+
},
|
|
1035
|
+
)
|
|
1036
|
+
await self.metrics.record_failed(len(models))
|
|
1037
|
+
# Remove any offsets we may have tracked for failed partitions
|
|
1038
|
+
for msg, _ in items:
|
|
1039
|
+
tp = TopicPartition(msg.topic, msg.partition)
|
|
1040
|
+
# Only remove if this batch was the only contributor
|
|
1041
|
+
# In practice, we don't add until success, so this is safe
|
|
1042
|
+
successful_offsets.pop(tp, None)
|
|
1043
|
+
|
|
1044
|
+
# Merge skipped message offsets into successful_offsets
|
|
1045
|
+
# Skipped messages (tombstones, invalid UTF-8, JSON errors, validation errors)
|
|
1046
|
+
# must always have their offsets committed to avoid reprocessing
|
|
1047
|
+
for tp, offset in skipped_offsets.items():
|
|
1048
|
+
current = successful_offsets.get(tp, -1)
|
|
1049
|
+
successful_offsets[tp] = max(current, offset)
|
|
1050
|
+
|
|
1051
|
+
return successful_offsets
|
|
1052
|
+
|
|
1053
|
+
async def _commit_offsets(
|
|
1054
|
+
self,
|
|
1055
|
+
offsets: dict[TopicPartition, int],
|
|
1056
|
+
correlation_id: UUID,
|
|
1057
|
+
) -> None:
|
|
1058
|
+
"""Commit only successfully persisted offsets per partition.
|
|
1059
|
+
|
|
1060
|
+
Commits offset + 1 for each partition (next offset to consume).
|
|
1061
|
+
|
|
1062
|
+
Args:
|
|
1063
|
+
offsets: Dictionary mapping TopicPartition to highest persisted offset.
|
|
1064
|
+
correlation_id: Correlation ID for tracing.
|
|
1065
|
+
"""
|
|
1066
|
+
if not offsets or self._consumer is None:
|
|
1067
|
+
return
|
|
1068
|
+
|
|
1069
|
+
# Build commit offsets (offset + 1 = next offset to consume)
|
|
1070
|
+
commit_offsets: dict[TopicPartition, int] = {
|
|
1071
|
+
tp: offset + 1 for tp, offset in offsets.items()
|
|
1072
|
+
}
|
|
1073
|
+
|
|
1074
|
+
try:
|
|
1075
|
+
await self._consumer.commit(commit_offsets)
|
|
1076
|
+
|
|
1077
|
+
# Reset consecutive failure counter on successful commit
|
|
1078
|
+
await self.metrics.reset_commit_failures()
|
|
1079
|
+
|
|
1080
|
+
logger.debug(
|
|
1081
|
+
"Committed offsets",
|
|
1082
|
+
extra={
|
|
1083
|
+
"consumer_id": self._consumer_id,
|
|
1084
|
+
"correlation_id": str(correlation_id),
|
|
1085
|
+
"partitions": len(commit_offsets),
|
|
1086
|
+
},
|
|
1087
|
+
)
|
|
1088
|
+
|
|
1089
|
+
except KafkaError:
|
|
1090
|
+
# Track commit failures to identify persistent issues
|
|
1091
|
+
await self.metrics.record_commit_failure()
|
|
1092
|
+
|
|
1093
|
+
# Get current failure count for warning threshold
|
|
1094
|
+
metrics_snapshot = await self.metrics.snapshot()
|
|
1095
|
+
commit_failures = metrics_snapshot.get("commit_failures", 0)
|
|
1096
|
+
|
|
1097
|
+
# Escalate logging level if failures are persistent (5+ consecutive)
|
|
1098
|
+
if isinstance(commit_failures, int) and commit_failures >= 5:
|
|
1099
|
+
logger.exception(
|
|
1100
|
+
"Persistent commit failures detected - may indicate Kafka "
|
|
1101
|
+
"connectivity issues requiring investigation",
|
|
1102
|
+
extra={
|
|
1103
|
+
"consumer_id": self._consumer_id,
|
|
1104
|
+
"correlation_id": str(correlation_id),
|
|
1105
|
+
"commit_failures": commit_failures,
|
|
1106
|
+
},
|
|
1107
|
+
)
|
|
1108
|
+
else:
|
|
1109
|
+
logger.exception(
|
|
1110
|
+
"Failed to commit offsets",
|
|
1111
|
+
extra={
|
|
1112
|
+
"consumer_id": self._consumer_id,
|
|
1113
|
+
"correlation_id": str(correlation_id),
|
|
1114
|
+
"commit_failures": commit_failures,
|
|
1115
|
+
},
|
|
1116
|
+
)
|
|
1117
|
+
# Don't re-raise - messages will be reprocessed on restart
|
|
1118
|
+
# (at-least-once delivery semantics preserved)
|
|
1119
|
+
|
|
1120
|
+
# =========================================================================
|
|
1121
|
+
# Health Check Server
|
|
1122
|
+
# =========================================================================
|
|
1123
|
+
|
|
1124
|
+
async def _start_health_server(self) -> None:
|
|
1125
|
+
"""Start minimal HTTP health check server.
|
|
1126
|
+
|
|
1127
|
+
Starts an aiohttp server on the configured port with health check endpoints:
|
|
1128
|
+
- /health: Full health status (backwards compatible)
|
|
1129
|
+
- /health/live: Kubernetes liveness probe (process running)
|
|
1130
|
+
- /health/ready: Kubernetes readiness probe (dependencies connected)
|
|
1131
|
+
"""
|
|
1132
|
+
self._health_app = web.Application()
|
|
1133
|
+
self._health_app.router.add_get("/health", self._health_handler)
|
|
1134
|
+
self._health_app.router.add_get("/health/live", self._liveness_handler)
|
|
1135
|
+
self._health_app.router.add_get("/health/ready", self._readiness_handler)
|
|
1136
|
+
|
|
1137
|
+
self._health_runner = web.AppRunner(self._health_app)
|
|
1138
|
+
await self._health_runner.setup()
|
|
1139
|
+
|
|
1140
|
+
self._health_site = web.TCPSite(
|
|
1141
|
+
self._health_runner,
|
|
1142
|
+
host=self._config.health_check_host, # Configurable - see config.py for security notes
|
|
1143
|
+
port=self._config.health_check_port,
|
|
1144
|
+
)
|
|
1145
|
+
await self._health_site.start()
|
|
1146
|
+
|
|
1147
|
+
logger.info(
|
|
1148
|
+
"Health check server started",
|
|
1149
|
+
extra={
|
|
1150
|
+
"consumer_id": self._consumer_id,
|
|
1151
|
+
"host": self._config.health_check_host,
|
|
1152
|
+
"port": self._config.health_check_port,
|
|
1153
|
+
"endpoints": ["/health", "/health/live", "/health/ready"],
|
|
1154
|
+
},
|
|
1155
|
+
)
|
|
1156
|
+
|
|
1157
|
+
    def _determine_health_status(
        self,
        metrics_snapshot: dict[str, object],
        circuit_state: dict[str, JsonType],
    ) -> EnumHealthStatus:
        """Determine consumer health status based on current state.

        Health status determination rules (in priority order):
        1. UNHEALTHY: Consumer is not running (stopped or crashed)
        2. DEGRADED: Circuit breaker is open or half-open (database issues, retrying)
        3. DEGRADED: Last poll exceeds poll staleness threshold (consumer not polling)
        4. DEGRADED: No writes yet AND consumer running > startup_grace_period_seconds (configurable)
        5. DEGRADED: Last successful write exceeds staleness threshold (with messages received)
        6. HEALTHY: All other cases (running, circuit closed, recent activity or in grace period)

        The startup grace period (configurable via startup_grace_period_seconds, default 60s)
        allows the consumer to be considered healthy immediately after starting, before any
        messages have been consumed.

        Args:
            metrics_snapshot: Snapshot of current consumer metrics including
                timestamps for started_at, last_poll_at, and last_successful_write_at.
            circuit_state: Current circuit breaker state from the writer,
                containing at minimum a "state" key.

        Returns:
            EnumHealthStatus indicating current health:
            - HEALTHY: Fully operational
            - DEGRADED: Running but with issues (circuit open/half-open, stale polls/writes)
            - UNHEALTHY: Not running
        """
        # Rule 1: Consumer not running -> UNHEALTHY
        if not self._running:
            return EnumHealthStatus.UNHEALTHY

        # Rule 2: Circuit breaker open or half-open -> DEGRADED
        circuit_breaker_state = circuit_state.get("state")
        if circuit_breaker_state in ("open", "half_open"):
            return EnumHealthStatus.DEGRADED

        # Rule 3: Check poll staleness (consumer not polling Kafka)
        last_poll = metrics_snapshot.get("last_poll_at")
        if last_poll is not None:
            try:
                last_poll_dt = datetime.fromisoformat(str(last_poll))
                poll_age_seconds = (datetime.now(UTC) - last_poll_dt).total_seconds()
                if poll_age_seconds > self._config.health_check_poll_staleness_seconds:
                    # Poll exceeds staleness threshold -> DEGRADED
                    return EnumHealthStatus.DEGRADED
            except (ValueError, TypeError):
                # Parse error - continue to other checks
                pass

        # Check for recent successful write (within staleness threshold)
        last_write = metrics_snapshot.get("last_successful_write_at")
        messages_received = metrics_snapshot.get("messages_received", 0)

        if last_write is None:
            # No writes yet - check startup grace period (configurable, default 60s)
            started_at_str = metrics_snapshot.get("started_at")
            if started_at_str is not None:
                try:
                    started_at_dt = datetime.fromisoformat(str(started_at_str))
                    age_seconds = (datetime.now(UTC) - started_at_dt).total_seconds()
                    if age_seconds <= self._config.startup_grace_period_seconds:
                        # Rule 6: Consumer just started, healthy even without writes
                        return EnumHealthStatus.HEALTHY
                    else:
                        # Rule 4: Consumer running > grace period with no writes -> DEGRADED
                        return EnumHealthStatus.DEGRADED
                except (ValueError, TypeError):
                    # Parse error - fallback to healthy
                    return EnumHealthStatus.HEALTHY
            else:
                # No started_at timestamp (shouldn't happen) - assume healthy
                return EnumHealthStatus.HEALTHY
        else:
            # Check if last write was recent (within staleness threshold)
            # Only consider stale if we have received messages (active traffic)
            try:
                last_write_dt = datetime.fromisoformat(str(last_write))
                write_age_seconds = (datetime.now(UTC) - last_write_dt).total_seconds()
                if (
                    write_age_seconds > self._config.health_check_staleness_seconds
                    and isinstance(messages_received, int)
                    and messages_received > 0
                ):
                    # Rule 5: Last write exceeds staleness threshold with traffic -> DEGRADED
                    return EnumHealthStatus.DEGRADED
                else:
                    # Rule 6: Recent write or no traffic -> HEALTHY
                    return EnumHealthStatus.HEALTHY
            except (ValueError, TypeError):
                # Parse error - fallback to healthy
                return EnumHealthStatus.HEALTHY

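The priority ordering above can be hard to trace through the nested try/except blocks. As a simplified sketch only (hypothetical flat inputs instead of the real metrics snapshot, and assumed threshold values where the module reads them from config), the same decision order reduces to:

def sketch_health_status(
    running: bool,
    circuit_state: str,           # "closed", "open", or "half_open"
    poll_age_s: float | None,     # seconds since last poll, None if never polled
    write_age_s: float | None,    # seconds since last successful write, None if none yet
    uptime_s: float,              # seconds since the consumer started
    messages_received: int,
    poll_stale_s: float = 120.0,  # illustrative thresholds; real values come from config
    write_stale_s: float = 300.0,
    grace_s: float = 60.0,
) -> str:
    if not running:
        return "unhealthy"                                        # Rule 1
    if circuit_state in ("open", "half_open"):
        return "degraded"                                         # Rule 2
    if poll_age_s is not None and poll_age_s > poll_stale_s:
        return "degraded"                                         # Rule 3
    if write_age_s is None:
        return "healthy" if uptime_s <= grace_s else "degraded"   # Rules 6 / 4
    if write_age_s > write_stale_s and messages_received > 0:
        return "degraded"                                         # Rule 5
    return "healthy"                                              # Rule 6
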
    async def _health_handler(self, request: web.Request) -> web.Response:
        """Handle health check requests.

        Returns JSON with health status based on:
        - Consumer running state
        - Circuit breaker state (from writer)
        - Last successful write timestamp

        Args:
            request: aiohttp request object.

        Returns:
            JSON response with health status.
        """
        metrics_snapshot = await self.metrics.snapshot()
        circuit_state = self._writer.get_circuit_breaker_state() if self._writer else {}

        # Determine health status using shared logic
        status = self._determine_health_status(metrics_snapshot, circuit_state)

        response_body = {
            "status": status.value,
            "consumer_running": self._running,
            "consumer_id": self._consumer_id,
            "last_poll_time": metrics_snapshot.get("last_poll_at"),
            "last_successful_write": metrics_snapshot.get("last_successful_write_at"),
            "circuit_breaker_state": circuit_state.get("state", "unknown"),
            "messages_processed": metrics_snapshot.get("messages_processed", 0),
            "messages_failed": metrics_snapshot.get("messages_failed", 0),
            "batches_processed": metrics_snapshot.get("batches_processed", 0),
        }

        # Return appropriate HTTP status code
        http_status = 200 if status == EnumHealthStatus.HEALTHY else 503

        return web.json_response(response_body, status=http_status)

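Note that only HEALTHY maps to HTTP 200; both DEGRADED and UNHEALTHY are served as 503, so anything short of fully healthy is taken out of rotation by probes that key off the status code. A /health body built by this handler has the shape below (the values are made up for illustration, not actual output):

# Illustrative /health response body; every value here is hypothetical.
example_health_body = {
    "status": "degraded",
    "consumer_running": True,
    "consumer_id": "injection-effectiveness-7f3a",
    "last_poll_time": "2025-01-01T12:00:00+00:00",
    "last_successful_write": "2025-01-01T11:49:30+00:00",
    "circuit_breaker_state": "half_open",
    "messages_processed": 1042,
    "messages_failed": 3,
    "batches_processed": 87,
}
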
    async def _liveness_handler(self, request: web.Request) -> web.Response:
        """Handle Kubernetes liveness probe requests.

        Liveness indicates the process is running and not deadlocked.
        Returns 200 if the consumer event loop is responsive.
        Returns 503 if the consumer is not running.

        This is a minimal check - if we can respond to this request,
        the event loop is not blocked and the process is alive.

        Args:
            request: aiohttp request object.

        Returns:
            JSON response with liveness status.
        """
        # If we can respond, the process is alive
        is_alive = self._running

        response_body = {
            "status": "alive" if is_alive else "dead",
            "consumer_id": self._consumer_id,
        }

        return web.json_response(response_body, status=200 if is_alive else 503)

    async def _readiness_handler(self, request: web.Request) -> web.Response:
        """Handle Kubernetes readiness probe requests.

        Readiness indicates the consumer can accept work - all dependencies
        are connected and the circuit breaker is not open.

        Dependencies checked:
        - PostgreSQL pool connected
        - Kafka consumer initialized
        - Writer available
        - Circuit breaker not in OPEN state

        Args:
            request: aiohttp request object.

        Returns:
            JSON response with readiness status and dependency states.
        """
        dependencies_ready = {
            "postgres_pool": self._pool is not None,
            "kafka_consumer": self._consumer is not None,
            "writer": self._writer is not None,
        }

        # Check circuit breaker - OPEN means not ready to accept work
        circuit_state = self._writer.get_circuit_breaker_state() if self._writer else {}
        circuit_ready = circuit_state.get("state") != "open"
        dependencies_ready["circuit_breaker"] = circuit_ready

        all_ready = all(dependencies_ready.values()) and self._running

        response_body = {
            "status": "ready" if all_ready else "not_ready",
            "consumer_id": self._consumer_id,
            "consumer_running": self._running,
            "dependencies": dependencies_ready,
            "circuit_breaker_state": circuit_state.get("state", "unknown"),
        }

        return web.json_response(response_body, status=200 if all_ready else 503)

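Because the readiness body enumerates each dependency by name, a caller can report exactly which one is blocking rotation rather than just "not ready". A small sketch (the helper name and the sample body are illustrative, not part of the module):

def not_ready_dependencies(readiness_body: dict[str, object]) -> list[str]:
    # Return the names of dependencies the readiness probe reported as not ready.
    deps = readiness_body.get("dependencies", {})
    if not isinstance(deps, dict):
        return []
    return [name for name, ready in deps.items() if not ready]


# Example: a hypothetical body where only the circuit breaker blocks readiness.
sample = {
    "status": "not_ready",
    "dependencies": {
        "postgres_pool": True,
        "kafka_consumer": True,
        "writer": True,
        "circuit_breaker": False,
    },
}
print(not_ready_dependencies(sample))  # ['circuit_breaker']
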
    # =========================================================================
    # Health Check (Direct API)
    # =========================================================================

    async def health_check(self) -> dict[str, object]:
        """Check consumer health status.

        Returns a dictionary with health information for programmatic access.

        Returns:
            Dictionary with health status including:
            - status: Overall health (healthy, degraded, unhealthy)
            - consumer_running: Whether consume loop is active
            - circuit_breaker_state: Current circuit breaker state
            - consumer_id: Unique consumer identifier
            - metrics: Current metrics snapshot
        """
        metrics_snapshot = await self.metrics.snapshot()
        circuit_state = self._writer.get_circuit_breaker_state() if self._writer else {}

        # Determine health status using shared logic
        status = self._determine_health_status(metrics_snapshot, circuit_state)

        return {
            "status": status.value,
            "consumer_running": self._running,
            "consumer_id": self._consumer_id,
            "group_id": self._config.kafka_group_id,
            "topics": self._config.topics,
            "circuit_breaker_state": circuit_state,
            "metrics": metrics_snapshot,
        }


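For callers that embed the consumer rather than scrape the HTTP endpoints, health_check() gives the same verdict programmatically. A sketch of a periodic watcher, reusing the module's asyncio import and logger; the 30-second interval and the watcher name are arbitrary choices for illustration:

async def watch_health(consumer: InjectionEffectivenessConsumer) -> None:
    # Poll the direct health API and log anything that is not healthy.
    while True:
        report = await consumer.health_check()
        if report["status"] != "healthy":
            logger.warning("Consumer not healthy", extra={"report": report})
        await asyncio.sleep(30)
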
# =============================================================================
# Entry Point
# =============================================================================


async def _main() -> None:
    """Main entry point for running the consumer as a module."""
    # Load configuration from environment
    config = ConfigInjectionEffectivenessConsumer()

    logger.info(
        "Starting injection effectiveness consumer",
        extra={
            "topics": config.topics,
            "bootstrap_servers": config.kafka_bootstrap_servers,
            "postgres_dsn": mask_dsn_password(config.postgres_dsn),
            "group_id": config.kafka_group_id,
            "health_port": config.health_check_port,
        },
    )

    consumer = InjectionEffectivenessConsumer(config)

    # Set up signal handlers
    loop = asyncio.get_running_loop()
    shutdown_task: asyncio.Task[None] | None = None

    def signal_handler() -> None:
        nonlocal shutdown_task
        logger.info("Received shutdown signal")
        # Only create shutdown task once to avoid race conditions
        if shutdown_task is None:
            shutdown_task = asyncio.create_task(consumer.stop())

    for sig in (signal.SIGTERM, signal.SIGINT):
        loop.add_signal_handler(sig, signal_handler)

    try:
        await consumer.start()
        await consumer.run()
    except asyncio.CancelledError:
        logger.info("Consumer cancelled")
    finally:
        # Ensure shutdown task completes if it was started by signal handler
        if shutdown_task is not None:
            if not shutdown_task.done():
                await shutdown_task
            # Task already completed, no action needed
        else:
            # No signal received, perform clean shutdown
            await consumer.stop()


if __name__ == "__main__":
    # Configure logging
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    )

    asyncio.run(_main())


__all__ = [
    "InjectionEffectivenessConsumer",
    "ConsumerMetrics",
    "EnumHealthStatus",
    "TOPIC_TO_MODEL",
    "TOPIC_TO_WRITER_METHOD",
    "mask_dsn_password",
]