omnibase_infra 0.3.1__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (117)
  1. omnibase_infra/__init__.py +1 -1
  2. omnibase_infra/enums/__init__.py +3 -0
  3. omnibase_infra/enums/enum_consumer_group_purpose.py +9 -0
  4. omnibase_infra/enums/enum_postgres_error_code.py +188 -0
  5. omnibase_infra/errors/__init__.py +4 -0
  6. omnibase_infra/errors/error_infra.py +60 -0
  7. omnibase_infra/handlers/__init__.py +3 -0
  8. omnibase_infra/handlers/handler_slack_webhook.py +426 -0
  9. omnibase_infra/handlers/models/__init__.py +14 -0
  10. omnibase_infra/handlers/models/enum_alert_severity.py +36 -0
  11. omnibase_infra/handlers/models/model_slack_alert.py +24 -0
  12. omnibase_infra/handlers/models/model_slack_alert_payload.py +77 -0
  13. omnibase_infra/handlers/models/model_slack_alert_result.py +73 -0
  14. omnibase_infra/handlers/registration_storage/handler_registration_storage_postgres.py +29 -20
  15. omnibase_infra/mixins/__init__.py +14 -0
  16. omnibase_infra/mixins/mixin_node_introspection.py +42 -20
  17. omnibase_infra/mixins/mixin_postgres_error_response.py +314 -0
  18. omnibase_infra/mixins/mixin_postgres_op_executor.py +298 -0
  19. omnibase_infra/models/__init__.py +3 -0
  20. omnibase_infra/models/discovery/model_dependency_spec.py +1 -0
  21. omnibase_infra/models/discovery/model_discovered_capabilities.py +1 -1
  22. omnibase_infra/models/discovery/model_introspection_config.py +28 -1
  23. omnibase_infra/models/discovery/model_introspection_performance_metrics.py +1 -0
  24. omnibase_infra/models/discovery/model_introspection_task_config.py +1 -0
  25. omnibase_infra/{nodes/effects/models → models}/model_backend_result.py +22 -6
  26. omnibase_infra/models/projection/__init__.py +11 -0
  27. omnibase_infra/models/projection/model_contract_projection.py +170 -0
  28. omnibase_infra/models/projection/model_topic_projection.py +148 -0
  29. omnibase_infra/models/runtime/__init__.py +4 -0
  30. omnibase_infra/models/runtime/model_resolved_dependencies.py +116 -0
  31. omnibase_infra/nodes/contract_registry_reducer/__init__.py +5 -0
  32. omnibase_infra/nodes/contract_registry_reducer/contract.yaml +6 -5
  33. omnibase_infra/nodes/contract_registry_reducer/contract_registration_event_router.py +689 -0
  34. omnibase_infra/nodes/contract_registry_reducer/reducer.py +9 -26
  35. omnibase_infra/nodes/effects/__init__.py +1 -1
  36. omnibase_infra/nodes/effects/models/__init__.py +6 -4
  37. omnibase_infra/nodes/effects/models/model_registry_response.py +1 -1
  38. omnibase_infra/nodes/effects/protocol_consul_client.py +1 -1
  39. omnibase_infra/nodes/effects/protocol_postgres_adapter.py +1 -1
  40. omnibase_infra/nodes/effects/registry_effect.py +1 -1
  41. omnibase_infra/nodes/node_contract_persistence_effect/__init__.py +101 -0
  42. omnibase_infra/nodes/node_contract_persistence_effect/contract.yaml +490 -0
  43. omnibase_infra/nodes/node_contract_persistence_effect/handlers/__init__.py +74 -0
  44. omnibase_infra/nodes/node_contract_persistence_effect/handlers/handler_postgres_cleanup_topics.py +217 -0
  45. omnibase_infra/nodes/node_contract_persistence_effect/handlers/handler_postgres_contract_upsert.py +242 -0
  46. omnibase_infra/nodes/node_contract_persistence_effect/handlers/handler_postgres_deactivate.py +194 -0
  47. omnibase_infra/nodes/node_contract_persistence_effect/handlers/handler_postgres_heartbeat.py +243 -0
  48. omnibase_infra/nodes/node_contract_persistence_effect/handlers/handler_postgres_mark_stale.py +208 -0
  49. omnibase_infra/nodes/node_contract_persistence_effect/handlers/handler_postgres_topic_update.py +298 -0
  50. omnibase_infra/nodes/node_contract_persistence_effect/models/__init__.py +15 -0
  51. omnibase_infra/nodes/node_contract_persistence_effect/models/model_persistence_result.py +52 -0
  52. omnibase_infra/nodes/node_contract_persistence_effect/node.py +131 -0
  53. omnibase_infra/nodes/node_contract_persistence_effect/registry/__init__.py +27 -0
  54. omnibase_infra/nodes/node_contract_persistence_effect/registry/registry_infra_contract_persistence_effect.py +251 -0
  55. omnibase_infra/nodes/node_registration_orchestrator/models/model_postgres_intent_payload.py +8 -12
  56. omnibase_infra/nodes/node_registry_effect/models/__init__.py +2 -2
  57. omnibase_infra/nodes/node_slack_alerter_effect/__init__.py +33 -0
  58. omnibase_infra/nodes/node_slack_alerter_effect/contract.yaml +291 -0
  59. omnibase_infra/nodes/node_slack_alerter_effect/node.py +106 -0
  60. omnibase_infra/projectors/__init__.py +6 -0
  61. omnibase_infra/projectors/projection_reader_contract.py +1301 -0
  62. omnibase_infra/runtime/__init__.py +12 -0
  63. omnibase_infra/runtime/baseline_subscriptions.py +13 -6
  64. omnibase_infra/runtime/contract_dependency_resolver.py +455 -0
  65. omnibase_infra/runtime/contract_registration_event_router.py +500 -0
  66. omnibase_infra/runtime/db/__init__.py +4 -0
  67. omnibase_infra/runtime/db/models/__init__.py +15 -10
  68. omnibase_infra/runtime/db/models/model_db_operation.py +40 -0
  69. omnibase_infra/runtime/db/models/model_db_param.py +24 -0
  70. omnibase_infra/runtime/db/models/model_db_repository_contract.py +40 -0
  71. omnibase_infra/runtime/db/models/model_db_return.py +26 -0
  72. omnibase_infra/runtime/db/models/model_db_safety_policy.py +32 -0
  73. omnibase_infra/runtime/emit_daemon/event_registry.py +34 -22
  74. omnibase_infra/runtime/event_bus_subcontract_wiring.py +63 -23
  75. omnibase_infra/runtime/intent_execution_router.py +430 -0
  76. omnibase_infra/runtime/models/__init__.py +6 -0
  77. omnibase_infra/runtime/models/model_contract_registry_config.py +41 -0
  78. omnibase_infra/runtime/models/model_intent_execution_summary.py +79 -0
  79. omnibase_infra/runtime/models/model_runtime_config.py +8 -0
  80. omnibase_infra/runtime/protocols/__init__.py +16 -0
  81. omnibase_infra/runtime/protocols/protocol_intent_executor.py +107 -0
  82. omnibase_infra/runtime/publisher_topic_scoped.py +16 -11
  83. omnibase_infra/runtime/registry_policy.py +29 -15
  84. omnibase_infra/runtime/request_response_wiring.py +793 -0
  85. omnibase_infra/runtime/service_kernel.py +295 -8
  86. omnibase_infra/runtime/service_runtime_host_process.py +149 -5
  87. omnibase_infra/runtime/util_version.py +5 -1
  88. omnibase_infra/schemas/schema_latency_baseline.sql +135 -0
  89. omnibase_infra/services/contract_publisher/config.py +4 -4
  90. omnibase_infra/services/contract_publisher/service.py +8 -5
  91. omnibase_infra/services/observability/injection_effectiveness/__init__.py +67 -0
  92. omnibase_infra/services/observability/injection_effectiveness/config.py +295 -0
  93. omnibase_infra/services/observability/injection_effectiveness/consumer.py +1461 -0
  94. omnibase_infra/services/observability/injection_effectiveness/models/__init__.py +32 -0
  95. omnibase_infra/services/observability/injection_effectiveness/models/model_agent_match.py +79 -0
  96. omnibase_infra/services/observability/injection_effectiveness/models/model_context_utilization.py +118 -0
  97. omnibase_infra/services/observability/injection_effectiveness/models/model_latency_breakdown.py +107 -0
  98. omnibase_infra/services/observability/injection_effectiveness/models/model_pattern_utilization.py +46 -0
  99. omnibase_infra/services/observability/injection_effectiveness/writer_postgres.py +596 -0
  100. omnibase_infra/services/registry_api/models/__init__.py +25 -0
  101. omnibase_infra/services/registry_api/models/model_contract_ref.py +44 -0
  102. omnibase_infra/services/registry_api/models/model_contract_view.py +81 -0
  103. omnibase_infra/services/registry_api/models/model_response_contracts.py +50 -0
  104. omnibase_infra/services/registry_api/models/model_response_topics.py +50 -0
  105. omnibase_infra/services/registry_api/models/model_topic_summary.py +57 -0
  106. omnibase_infra/services/registry_api/models/model_topic_view.py +63 -0
  107. omnibase_infra/services/registry_api/routes.py +205 -6
  108. omnibase_infra/services/registry_api/service.py +528 -1
  109. omnibase_infra/utils/__init__.py +7 -0
  110. omnibase_infra/utils/util_db_error_context.py +292 -0
  111. omnibase_infra/validation/infra_validators.py +3 -1
  112. omnibase_infra/validation/validation_exemptions.yaml +65 -0
  113. {omnibase_infra-0.3.1.dist-info → omnibase_infra-0.4.0.dist-info}/METADATA +3 -3
  114. {omnibase_infra-0.3.1.dist-info → omnibase_infra-0.4.0.dist-info}/RECORD +117 -58
  115. {omnibase_infra-0.3.1.dist-info → omnibase_infra-0.4.0.dist-info}/WHEEL +0 -0
  116. {omnibase_infra-0.3.1.dist-info → omnibase_infra-0.4.0.dist-info}/entry_points.txt +0 -0
  117. {omnibase_infra-0.3.1.dist-info → omnibase_infra-0.4.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,1461 @@
+ # SPDX-License-Identifier: MIT
+ # Copyright (c) 2025 OmniNode Team
+ """Async Kafka Consumer for Injection Effectiveness Observability.
+
+ This module provides an async Kafka consumer for injection effectiveness events.
+ Events are consumed from multiple topics, validated using Pydantic models,
+ and persisted to PostgreSQL via the WriterInjectionEffectivenessPostgres.
+
+ Design Decisions:
+     - Per-partition offset tracking: Commit only successfully persisted partitions
+     - Batch processing: Configurable batch size and timeout
+     - Circuit breaker: Resilience via writer's MixinAsyncCircuitBreaker
+     - Health check: HTTP endpoint for Kubernetes probes
+     - Graceful shutdown: Signal handling with drain and commit
+
+ Critical Invariant:
+     For each (topic, partition), commit offsets only up to the highest offset
+     that has been successfully persisted for that partition.
+     Never commit offsets for partitions that had write failures in the batch.
+
+ Topics consumed:
+     - onex.evt.omniclaude.context-utilization.v1
+     - onex.evt.omniclaude.agent-match.v1
+     - onex.evt.omniclaude.latency-breakdown.v1
+
+ Related Tickets:
+     - OMN-1890: Injection effectiveness observability consumer (current)
+     - OMN-1889: Emit injection metrics from omniclaude hooks (producer)
+     - OMN-1743: Agent actions consumer (reference pattern)
+
+ Example:
+     >>> from omnibase_infra.services.observability.injection_effectiveness import (
+     ...     InjectionEffectivenessConsumer,
+     ...     ConfigInjectionEffectivenessConsumer,
+     ... )
+     >>>
+     >>> config = ConfigInjectionEffectivenessConsumer(
+     ...     kafka_bootstrap_servers="localhost:9092",
+     ...     postgres_dsn="postgresql://postgres:secret@localhost:5432/omninode_bridge",
+     ... )
+     >>> consumer = InjectionEffectivenessConsumer(config)
+     >>>
+     >>> # Run consumer (blocking)
+     >>> await consumer.start()
+     >>> await consumer.run()
+
+     # Or run as module:
+     # python -m omnibase_infra.services.observability.injection_effectiveness.consumer
+ """
+
+ from __future__ import annotations
+
+ import asyncio
+ import json
+ import logging
+ import signal
+ from collections.abc import Callable, Coroutine
+ from datetime import UTC, datetime
+ from enum import StrEnum
+ from typing import TYPE_CHECKING
+ from urllib.parse import urlparse, urlunparse
+ from uuid import UUID, uuid4
+
+ import asyncpg
+ from aiohttp import web
+ from aiokafka import AIOKafkaConsumer, TopicPartition
+ from aiokafka.errors import KafkaError
+ from pydantic import BaseModel, ValidationError
+
+ from omnibase_core.errors import OnexError
+ from omnibase_core.types import JsonType
+ from omnibase_infra.services.observability.injection_effectiveness.config import (
+     ConfigInjectionEffectivenessConsumer,
+ )
+ from omnibase_infra.services.observability.injection_effectiveness.models import (
+     ModelAgentMatchEvent,
+     ModelContextUtilizationEvent,
+     ModelLatencyBreakdownEvent,
+ )
+ from omnibase_infra.services.observability.injection_effectiveness.writer_postgres import (
+     WriterInjectionEffectivenessPostgres,
+ )
+
+ if TYPE_CHECKING:
+     from aiokafka.structs import ConsumerRecord
+
+ logger = logging.getLogger(__name__)
+
+
+ # =============================================================================
+ # Utility Functions
+ # =============================================================================
+
+
+ def mask_dsn_password(dsn: str) -> str:
+     """Mask password in a PostgreSQL DSN for safe logging.
+
+     Parses the DSN and replaces any password component with '***'.
+     Handles standard PostgreSQL connection string formats.
+
+     Args:
+         dsn: PostgreSQL connection string, e.g.,
+             'postgresql://user:password@host:port/db'
+
+     Returns:
+         DSN with password replaced by '***'. If parsing fails or no password
+         is present, returns the original DSN (safe - no password to mask).
+
+     Examples:
+         >>> mask_dsn_password("postgresql://user:secret@localhost:5432/db")
+         'postgresql://user:***@localhost:5432/db'
+
+         >>> mask_dsn_password("postgresql://user@localhost/db")
+         'postgresql://user@localhost/db'
+
+         >>> mask_dsn_password("invalid-dsn")
+         'invalid-dsn'
+     """
+     try:
+         parsed = urlparse(dsn)
+
+         # No password present - safe to return as-is
+         if not parsed.password:
+             return dsn
+
+         # Reconstruct netloc with masked password
+         # Format: user:***@host:port or user:***@host
+         if parsed.port:
+             masked_netloc = f"{parsed.username}:***@{parsed.hostname}:{parsed.port}"
+         else:
+             masked_netloc = f"{parsed.username}:***@{parsed.hostname}"
+
+         # Reconstruct the full DSN with masked password
+         masked = urlunparse(
+             (
+                 parsed.scheme,
+                 masked_netloc,
+                 parsed.path,
+                 parsed.params,
+                 parsed.query,
+                 parsed.fragment,
+             )
+         )
+         return masked
+
+     except Exception:
+         # If parsing fails, return original (likely no password to mask)
+         # Log at debug level to avoid noise
+         logger.debug("Failed to parse DSN for masking, returning as-is")
+         return dsn
+
+
+ # =============================================================================
+ # Type Aliases and Constants
+ # =============================================================================
+
+ # Map topics to their Pydantic model class
+ TOPIC_TO_MODEL: dict[str, type[BaseModel]] = {
+     "onex.evt.omniclaude.context-utilization.v1": ModelContextUtilizationEvent,
+     "onex.evt.omniclaude.agent-match.v1": ModelAgentMatchEvent,
+     "onex.evt.omniclaude.latency-breakdown.v1": ModelLatencyBreakdownEvent,
+ }
+
+ # Map topics to writer method names
+ TOPIC_TO_WRITER_METHOD: dict[str, str] = {
+     "onex.evt.omniclaude.context-utilization.v1": "write_context_utilization",
+     "onex.evt.omniclaude.agent-match.v1": "write_agent_match",
+     "onex.evt.omniclaude.latency-breakdown.v1": "write_latency_breakdowns",
+ }
+
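The two maps above drive per-topic dispatch in _process_batch further down. A minimal sketch of that lookup (the route helper and the raw payload are illustrative, not part of the module):

    import json

    def route(topic: str, raw: bytes) -> tuple[BaseModel, str] | None:
        # Resolve the Pydantic model and writer method name for a topic.
        # Unknown topics yield None; the consumer skips them but still
        # commits their offsets so they are not re-read.
        model_cls = TOPIC_TO_MODEL.get(topic)
        method_name = TOPIC_TO_WRITER_METHOD.get(topic)
        if model_cls is None or method_name is None:
            return None
        return model_cls.model_validate(json.loads(raw)), method_name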
+
+ # =============================================================================
+ # Enums
+ # =============================================================================
+
+
+ class EnumHealthStatus(StrEnum):
+     """Health check status values.
+
+     Used by the health check endpoint to indicate consumer health.
+
+     Status Semantics:
+         HEALTHY: Consumer running, circuit closed, recent successful write
+         DEGRADED: Consumer running but circuit open (retrying)
+         UNHEALTHY: Consumer stopped or no writes for extended period
+     """
+
+     HEALTHY = "healthy"
+     DEGRADED = "degraded"
+     UNHEALTHY = "unhealthy"
+
+
+ # =============================================================================
+ # Consumer Metrics
+ # =============================================================================
+
+
+ class ConsumerMetrics:
+     """Metrics tracking for the injection effectiveness consumer.
+
+     Tracks processing statistics for observability and monitoring.
+     Thread-safe via asyncio lock protection.
+
+     Attributes:
+         messages_received: Total messages received from Kafka.
+         messages_processed: Successfully processed messages.
+         messages_failed: Messages that failed processing.
+         messages_skipped: Messages skipped (invalid, duplicate, etc.).
+         batches_processed: Number of batches successfully processed.
+         commit_failures: Number of offset commit failures (tracks persistent issues).
+         last_poll_at: Timestamp of last Kafka poll.
+         last_successful_write_at: Timestamp of last successful database write.
+         last_commit_failure_at: Timestamp of last commit failure (for diagnostics).
+         started_at: Timestamp when metrics were initialized (consumer start time).
+     """
+
+     def __init__(self) -> None:
+         """Initialize metrics with zero values."""
+         self.messages_received: int = 0
+         self.messages_processed: int = 0
+         self.messages_failed: int = 0
+         self.messages_skipped: int = 0
+         self.batches_processed: int = 0
+         self.commit_failures: int = 0
+         self.last_poll_at: datetime | None = None
+         self.last_successful_write_at: datetime | None = None
+         self.last_commit_failure_at: datetime | None = None
+         self.started_at: datetime = datetime.now(UTC)
+         self._lock = asyncio.Lock()
+
+     async def record_received(self, count: int = 1) -> None:
+         """Record messages received."""
+         async with self._lock:
+             self.messages_received += count
+             self.last_poll_at = datetime.now(UTC)
+
+     async def record_processed(self, count: int = 1) -> None:
+         """Record successfully processed messages."""
+         async with self._lock:
+             self.messages_processed += count
+             self.last_successful_write_at = datetime.now(UTC)
+
+     async def record_failed(self, count: int = 1) -> None:
+         """Record failed messages."""
+         async with self._lock:
+             self.messages_failed += count
+
+     async def record_skipped(self, count: int = 1) -> None:
+         """Record skipped messages."""
+         async with self._lock:
+             self.messages_skipped += count
+
+     async def record_batch_processed(self) -> None:
+         """Record a successfully processed batch."""
+         async with self._lock:
+             self.batches_processed += 1
+
+     async def record_polled(self) -> None:
+         """Record a poll attempt (updates last_poll_at regardless of message count).
+
+         This method should be called after every successful Kafka poll, even when
+         the poll returns no messages. This prevents false DEGRADED health status
+         on low-traffic topics where empty polls are normal.
+
+         See: CodeRabbit PR #220 feedback - last_poll_at was only updated via
+         record_received(), causing stale timestamps on empty polls.
+         """
+         async with self._lock:
+             self.last_poll_at = datetime.now(UTC)
+
+     async def record_commit_failure(self) -> None:
+         """Record an offset commit failure for tracking consecutive failures.
+
+         Commit failures don't lose data (messages will be reprocessed on restart),
+         but persistent failures may indicate Kafka connectivity issues that require
+         investigation. This metric tracks consecutive failures - a successful commit
+         resets the counter via reset_commit_failures().
+         """
+         async with self._lock:
+             self.commit_failures += 1
+             self.last_commit_failure_at = datetime.now(UTC)
+
+     async def reset_commit_failures(self) -> None:
+         """Reset consecutive commit failure counter after successful commit.
+
+         Called after a successful offset commit to reset the consecutive failure
+         tracking. This ensures the "persistent failures" warning only triggers
+         when failures are truly consecutive, not spread across time.
+         """
+         async with self._lock:
+             self.commit_failures = 0
+
+     async def snapshot(self) -> dict[str, object]:
+         """Get a snapshot of current metrics.
+
+         Returns:
+             Dictionary with all metric values.
+         """
+         async with self._lock:
+             return {
+                 "messages_received": self.messages_received,
+                 "messages_processed": self.messages_processed,
+                 "messages_failed": self.messages_failed,
+                 "messages_skipped": self.messages_skipped,
+                 "batches_processed": self.batches_processed,
+                 "commit_failures": self.commit_failures,
+                 "last_poll_at": (
+                     self.last_poll_at.isoformat() if self.last_poll_at else None
+                 ),
+                 "last_successful_write_at": (
+                     self.last_successful_write_at.isoformat()
+                     if self.last_successful_write_at
+                     else None
+                 ),
+                 "last_commit_failure_at": (
+                     self.last_commit_failure_at.isoformat()
+                     if self.last_commit_failure_at
+                     else None
+                 ),
+                 "started_at": self.started_at.isoformat(),
+             }
+
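A short usage sketch for the metrics API above (counts are hypothetical):

    import asyncio

    async def demo() -> None:
        metrics = ConsumerMetrics()
        await metrics.record_received(10)    # a batch of ten arrived
        await metrics.record_processed(9)    # nine rows persisted
        await metrics.record_failed(1)       # one write failed
        snapshot = await metrics.snapshot()  # counters plus ISO-8601 timestamps
        print(snapshot["messages_processed"], snapshot["last_poll_at"])

    asyncio.run(demo())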
+
+ # =============================================================================
+ # Injection Effectiveness Consumer
+ # =============================================================================
+
+
+ class InjectionEffectivenessConsumer:
+     """Async Kafka consumer for injection effectiveness events.
+
+     Consumes events from multiple injection effectiveness topics and persists them
+     to PostgreSQL. Implements at-least-once delivery with per-partition
+     offset tracking to ensure no message loss on partial batch failures.
+
+     Features:
+         - **Per-partition offset tracking**: Commit only successfully persisted
+           partitions. Partial batch failures do not cause message loss.
+
+         - **Batch processing**: Configurable batch size and timeout for
+           efficient database writes via executemany.
+
+         - **Circuit breaker**: Database resilience via writer's circuit breaker.
+           Consumer degrades gracefully when database is unavailable.
+
+         - **Health check endpoint**: HTTP server for Kubernetes liveness
+           and readiness probes.
+
+         - **Graceful shutdown**: Signal handling with drain and final commit.
+
+     Thread Safety:
+         This consumer is designed for single-threaded async execution.
+         Multiple consumers can run with different group_ids for horizontal
+         scaling (partition assignment via Kafka consumer groups).
+
+     Example:
+         >>> config = ConfigInjectionEffectivenessConsumer(
+         ...     kafka_bootstrap_servers="localhost:9092",
+         ...     postgres_dsn="postgresql://postgres:secret@localhost:5432/omninode_bridge",
+         ... )
+         >>> consumer = InjectionEffectivenessConsumer(config)
+         >>>
+         >>> await consumer.start()
+         >>> try:
+         ...     await consumer.run()
+         ... finally:
+         ...     await consumer.stop()
+
+     Attributes:
+         metrics: Consumer metrics for observability.
+         is_running: Whether the consumer is currently running.
+     """
+
+     def __init__(self, config: ConfigInjectionEffectivenessConsumer) -> None:
+         """Initialize the injection effectiveness consumer.
+
+         Args:
+             config: Consumer configuration (Kafka, PostgreSQL, batch settings).
+
+         Example:
+             >>> config = ConfigInjectionEffectivenessConsumer(
+             ...     kafka_bootstrap_servers="localhost:9092",
+             ...     postgres_dsn="postgresql://postgres:secret@localhost:5432/omninode_bridge",
+             ... )
+             >>> consumer = InjectionEffectivenessConsumer(config)
+         """
+         self._config = config
+         self._consumer: AIOKafkaConsumer | None = None
+         self._pool: asyncpg.Pool | None = None
+         self._writer: WriterInjectionEffectivenessPostgres | None = None
+         self._running = False
+         self._shutdown_event = asyncio.Event()
+
+         # Health check server
+         self._health_app: web.Application | None = None
+         self._health_runner: web.AppRunner | None = None
+         self._health_site: web.TCPSite | None = None
+
+         # Metrics
+         self.metrics = ConsumerMetrics()
+
+         # Consumer ID for logging
+         self._consumer_id = f"injection-effectiveness-consumer-{uuid4().hex[:8]}"
+
+         logger.info(
+             "InjectionEffectivenessConsumer initialized",
+             extra={
+                 "consumer_id": self._consumer_id,
+                 "topics": self._config.topics,
+                 "group_id": self._config.kafka_group_id,
+                 "bootstrap_servers": self._config.kafka_bootstrap_servers,
+                 "postgres_dsn": mask_dsn_password(self._config.postgres_dsn),
+                 "batch_size": self._config.batch_size,
+                 "batch_timeout_ms": self._config.batch_timeout_ms,
+             },
+         )
+
+     # =========================================================================
+     # Properties
+     # =========================================================================
+
+     @property
+     def is_running(self) -> bool:
+         """Check if the consumer is currently running.
+
+         Returns:
+             True if start() has been called and stop() has not.
+         """
+         return self._running
+
+     @property
+     def consumer_id(self) -> str:
+         """Get the unique consumer identifier.
+
+         Returns:
+             Consumer ID string for logging and tracing.
+         """
+         return self._consumer_id
+
+     # =========================================================================
+     # Lifecycle Methods
+     # =========================================================================
+
+     async def start(self) -> None:
+         """Start the consumer, pool, writer, and health check server.
+
+         Creates the asyncpg pool, initializes the writer, creates the Kafka
+         consumer, and starts the health check HTTP server.
+
+         Raises:
+             RuntimeError: If the consumer is already running.
+             asyncpg.PostgresError: If database connection fails.
+             KafkaError: If Kafka connection fails.
+
+         Example:
+             >>> await consumer.start()
+             >>> # Consumer is now connected, ready for run()
+         """
+         if self._running:
+             logger.warning(
+                 "Consumer already running",
+                 extra={"consumer_id": self._consumer_id},
+             )
+             return
+
+         correlation_id = uuid4()
+
+         logger.info(
+             "Starting InjectionEffectivenessConsumer",
+             extra={
+                 "consumer_id": self._consumer_id,
+                 "correlation_id": str(correlation_id),
+                 "topics": self._config.topics,
+             },
+         )
+
+         try:
+             # Create PostgreSQL pool with configurable sizes
+             self._pool = await asyncpg.create_pool(
+                 dsn=self._config.postgres_dsn,
+                 min_size=self._config.pool_min_size,
+                 max_size=self._config.pool_max_size,
+             )
+             logger.info(
+                 "PostgreSQL pool created",
+                 extra={
+                     "consumer_id": self._consumer_id,
+                     "correlation_id": str(correlation_id),
+                     "postgres_dsn": mask_dsn_password(self._config.postgres_dsn),
+                 },
+             )
+
+             # Create writer with pool injection
+             self._writer = WriterInjectionEffectivenessPostgres(
+                 pool=self._pool,
+                 circuit_breaker_threshold=self._config.circuit_breaker_threshold,
+                 circuit_breaker_reset_timeout=self._config.circuit_breaker_reset_timeout,
+                 circuit_breaker_half_open_successes=self._config.circuit_breaker_half_open_successes,
+                 minimum_support_threshold=self._config.min_pattern_support,
+             )
+
+             # Create Kafka consumer
+             self._consumer = AIOKafkaConsumer(
+                 *self._config.topics,
+                 bootstrap_servers=self._config.kafka_bootstrap_servers,
+                 group_id=self._config.kafka_group_id,
+                 auto_offset_reset=self._config.auto_offset_reset,
+                 enable_auto_commit=False,  # Manual commits for at-least-once
+                 max_poll_records=self._config.batch_size,
+             )
+
+             await self._consumer.start()
+             logger.info(
+                 "Kafka consumer started",
+                 extra={
+                     "consumer_id": self._consumer_id,
+                     "correlation_id": str(correlation_id),
+                     "topics": self._config.topics,
+                     "group_id": self._config.kafka_group_id,
+                 },
+             )
+
+             # Start health check server
+             await self._start_health_server()
+
+             self._running = True
+             self._shutdown_event.clear()
+
+             logger.info(
+                 "InjectionEffectivenessConsumer started",
+                 extra={
+                     "consumer_id": self._consumer_id,
+                     "correlation_id": str(correlation_id),
+                 },
+             )
+
+         except Exception as e:
+             logger.exception(
+                 "Failed to start consumer",
+                 extra={
+                     "consumer_id": self._consumer_id,
+                     "correlation_id": str(correlation_id),
+                     "error": str(e),
+                 },
+             )
+             # Cleanup any partial initialization
+             await self._cleanup_resources(correlation_id)
+             raise
+
+     async def stop(self) -> None:
+         """Stop the consumer gracefully.
+
+         Signals the consume loop to exit, waits for in-flight processing,
+         commits final offsets, and closes all connections. Safe to call
+         multiple times.
+
+         Example:
+             >>> await consumer.stop()
+             >>> # Consumer is now stopped and disconnected
+         """
+         if not self._running:
+             logger.debug(
+                 "Consumer not running, nothing to stop",
+                 extra={"consumer_id": self._consumer_id},
+             )
+             return
+
+         correlation_id = uuid4()
+
+         logger.info(
+             "Stopping InjectionEffectivenessConsumer",
+             extra={
+                 "consumer_id": self._consumer_id,
+                 "correlation_id": str(correlation_id),
+             },
+         )
+
+         # Signal shutdown
+         self._running = False
+         self._shutdown_event.set()
+
+         # Cleanup resources
+         await self._cleanup_resources(correlation_id)
+
+         # Log final metrics
+         metrics_snapshot = await self.metrics.snapshot()
+         logger.info(
+             "InjectionEffectivenessConsumer stopped",
+             extra={
+                 "consumer_id": self._consumer_id,
+                 "correlation_id": str(correlation_id),
+                 "final_metrics": metrics_snapshot,
+             },
+         )
+
+     async def _cleanup_resources(self, correlation_id: UUID) -> None:
+         """Clean up all resources during shutdown.
+
+         Args:
+             correlation_id: Correlation ID for logging.
+         """
+         # Stop health check server
+         if self._health_site is not None:
+             await self._health_site.stop()
+             self._health_site = None
+
+         if self._health_runner is not None:
+             await self._health_runner.cleanup()
+             self._health_runner = None
+
+         self._health_app = None
+
+         # Stop Kafka consumer
+         if self._consumer is not None:
+             try:
+                 await self._consumer.stop()
+             except Exception as e:
+                 logger.warning(
+                     "Error stopping Kafka consumer",
+                     extra={
+                         "consumer_id": self._consumer_id,
+                         "correlation_id": str(correlation_id),
+                         "error": str(e),
+                     },
+                 )
+             finally:
+                 self._consumer = None
+
+         # Close PostgreSQL pool
+         if self._pool is not None:
+             try:
+                 await self._pool.close()
+             except Exception as e:
+                 logger.warning(
+                     "Error closing PostgreSQL pool",
+                     extra={
+                         "consumer_id": self._consumer_id,
+                         "correlation_id": str(correlation_id),
+                         "error": str(e),
+                     },
+                 )
+             finally:
+                 self._pool = None
+
+         self._writer = None
+
+     async def run(self) -> None:
+         """Run the main consume loop.
+
+         Continuously consumes messages from Kafka topics, processes them
+         in batches, and writes to PostgreSQL. Implements at-least-once
+         delivery by committing offsets only after successful writes.
+
+         This method blocks until stop() is called or an unrecoverable error
+         occurs. Use this after calling start().
+
+         Example:
+             >>> await consumer.start()
+             >>> try:
+             ...     await consumer.run()
+             ... finally:
+             ...     await consumer.stop()
+         """
+         if not self._running or self._consumer is None:
+             raise OnexError(
+                 "Consumer not started. Call start() before run().",
+             )
+
+         correlation_id = uuid4()
+
+         logger.info(
+             "Starting consume loop",
+             extra={
+                 "consumer_id": self._consumer_id,
+                 "correlation_id": str(correlation_id),
+             },
+         )
+
+         await self._consume_loop(correlation_id)
+
+     async def __aenter__(self) -> InjectionEffectivenessConsumer:
+         """Async context manager entry.
+
+         Starts the consumer and returns self for use in async with blocks.
+
+         Returns:
+             Self for chaining.
+
+         Example:
+             >>> async with InjectionEffectivenessConsumer(config) as consumer:
+             ...     await consumer.run()
+         """
+         await self.start()
+         return self
+
+     async def __aexit__(
+         self,
+         exc_type: type[BaseException] | None,
+         exc_val: BaseException | None,
+         exc_tb: object,
+     ) -> None:
+         """Async context manager exit.
+
+         Stops the consumer on exit from async with block.
+         """
+         await self.stop()
+
+     # =========================================================================
+     # Consume Loop
+     # =========================================================================
+
+     async def _consume_loop(self, correlation_id: UUID) -> None:
+         """Main consumption loop with batch processing.
+
+         Polls Kafka for messages, accumulates batches, processes them,
+         and commits offsets for successfully written partitions only.
+
+         Args:
+             correlation_id: Correlation ID for tracing this consume session.
+         """
+         if self._consumer is None:
+             logger.error(
+                 "Consumer is None in consume loop",
+                 extra={
+                     "consumer_id": self._consumer_id,
+                     "correlation_id": str(correlation_id),
+                 },
+             )
+             return
+
+         batch_timeout_seconds = self._config.batch_timeout_ms / 1000.0
+
+         try:
+             while self._running:
+                 # Poll with timeout for batch accumulation
+                 try:
+                     records = await asyncio.wait_for(
+                         self._consumer.getmany(
+                             timeout_ms=self._config.batch_timeout_ms,
+                             max_records=self._config.batch_size,
+                         ),
+                         timeout=batch_timeout_seconds
+                         + self._config.poll_timeout_buffer_seconds,
+                     )
+                 except TimeoutError:
+                     # Poll timeout is normal, continue loop
+                     continue
+
+                 # Record poll time even if no messages - prevents false DEGRADED
+                 # health status on low-traffic topics (CodeRabbit PR #220 feedback)
+                 await self.metrics.record_polled()
+
+                 if not records:
+                     continue
+
+                 # Flatten all messages from all partitions
+                 messages: list[ConsumerRecord] = []
+                 for tp_messages in records.values():
+                     messages.extend(tp_messages)
+
+                 if not messages:
+                     continue
+
+                 await self.metrics.record_received(len(messages))
+
+                 # Process batch and get successful offsets per partition
+                 batch_correlation_id = uuid4()
+                 successful_offsets = await self._process_batch(
+                     messages, batch_correlation_id
+                 )
+
+                 # Commit only successful offsets
+                 if successful_offsets:
+                     await self._commit_offsets(successful_offsets, batch_correlation_id)
+                     await self.metrics.record_batch_processed()
+
+         except asyncio.CancelledError:
+             logger.info(
+                 "Consume loop cancelled",
+                 extra={
+                     "consumer_id": self._consumer_id,
+                     "correlation_id": str(correlation_id),
+                 },
+             )
+             raise
+
+         except KafkaError as e:
+             logger.exception(
+                 "Kafka error in consume loop",
+                 extra={
+                     "consumer_id": self._consumer_id,
+                     "correlation_id": str(correlation_id),
+                     "error": str(e),
+                 },
+             )
+             raise
+
+         except Exception as e:
+             logger.exception(
+                 "Unexpected error in consume loop",
+                 extra={
+                     "consumer_id": self._consumer_id,
+                     "correlation_id": str(correlation_id),
+                     "error": str(e),
+                 },
+             )
+             raise
+
+         finally:
+             logger.info(
+                 "Consume loop exiting",
+                 extra={
+                     "consumer_id": self._consumer_id,
+                     "correlation_id": str(correlation_id),
+                 },
+             )
+
+     # =========================================================================
+     # Batch Processing
+     # =========================================================================
+
+     @staticmethod
+     def _track_skipped_offset(
+         skipped_offsets: dict[TopicPartition, int],
+         msg: ConsumerRecord,
+     ) -> None:
+         """Track offset for a skipped message to enable commit after processing.
+
+         Skipped messages (tombstones, invalid UTF-8, JSON errors, validation errors)
+         must have their offsets committed to avoid reprocessing. This helper updates
+         the skipped_offsets dict with the highest offset seen for each partition.
+
+         Args:
+             skipped_offsets: Dictionary mapping TopicPartition to highest skipped offset.
+             msg: The ConsumerRecord being skipped.
+         """
+         tp = TopicPartition(msg.topic, msg.partition)
+         current = skipped_offsets.get(tp, -1)
+         skipped_offsets[tp] = max(current, msg.offset)
+
+     async def _process_batch(
+         self,
+         messages: list[ConsumerRecord],
+         correlation_id: UUID,
+     ) -> dict[TopicPartition, int]:
+         """Process batch and return highest successful offset per partition.
+
+         Groups messages by topic, validates them, writes each topic's batch
+         to PostgreSQL, and tracks successful offsets per partition.
+
+         Args:
+             messages: List of Kafka ConsumerRecords to process.
+             correlation_id: Correlation ID for tracing.
+
+         Returns:
+             Dictionary mapping TopicPartition to highest successful offset.
+             Only partitions with successful writes are included.
+         """
+         if self._writer is None:
+             logger.error(
+                 "Writer is None during batch processing",
+                 extra={
+                     "consumer_id": self._consumer_id,
+                     "correlation_id": str(correlation_id),
+                 },
+             )
+             return {}
+
+         successful_offsets: dict[TopicPartition, int] = {}
+         # Track skipped message offsets separately to preserve them on write failures
+         skipped_offsets: dict[TopicPartition, int] = {}
+         parsed_skipped: int = 0
+
+         # Group messages by topic with their ConsumerRecord for offset tracking
+         by_topic: dict[str, list[tuple[ConsumerRecord, BaseModel]]] = {}
+
+         for msg in messages:
+             # Guard against tombstones (compacted topic deletions)
+             if msg.value is None:
+                 logger.warning(
+                     "Skipping tombstone message",
+                     extra={
+                         "consumer_id": self._consumer_id,
+                         "correlation_id": str(correlation_id),
+                         "topic": msg.topic,
+                         "partition": msg.partition,
+                         "offset": msg.offset,
+                     },
+                 )
+                 parsed_skipped += 1
+                 self._track_skipped_offset(skipped_offsets, msg)
+                 continue
+
+             try:
+                 # Decode message value with UTF-8 guard
+                 value = msg.value
+                 if isinstance(value, bytes):
+                     try:
+                         value = value.decode("utf-8")
+                     except UnicodeDecodeError as e:
+                         logger.warning(
+                             "Skipping message with invalid UTF-8 encoding",
+                             extra={
+                                 "consumer_id": self._consumer_id,
+                                 "correlation_id": str(correlation_id),
+                                 "topic": msg.topic,
+                                 "partition": msg.partition,
+                                 "offset": msg.offset,
+                                 "error": str(e),
+                             },
+                         )
+                         parsed_skipped += 1
+                         self._track_skipped_offset(skipped_offsets, msg)
+                         continue
+
+                 payload = json.loads(value)
+
+                 # Get model class for topic
+                 model_cls = TOPIC_TO_MODEL.get(msg.topic)
+                 if model_cls is None:
+                     logger.warning(
+                         "Unknown topic, skipping message",
+                         extra={
+                             "consumer_id": self._consumer_id,
+                             "correlation_id": str(correlation_id),
+                             "topic": msg.topic,
+                         },
+                     )
+                     parsed_skipped += 1
+                     self._track_skipped_offset(skipped_offsets, msg)
+                     continue
+
+                 # Validate with Pydantic model
+                 model = model_cls.model_validate(payload)
+                 by_topic.setdefault(msg.topic, []).append((msg, model))
+
+             except json.JSONDecodeError as e:
+                 logger.warning(
+                     "Failed to decode JSON message",
+                     extra={
+                         "consumer_id": self._consumer_id,
+                         "correlation_id": str(correlation_id),
+                         "topic": msg.topic,
+                         "partition": msg.partition,
+                         "offset": msg.offset,
+                         "error": str(e),
+                     },
+                 )
+                 parsed_skipped += 1
+                 self._track_skipped_offset(skipped_offsets, msg)
+
+             except ValidationError as e:
+                 logger.warning(
+                     "Message validation failed",
+                     extra={
+                         "consumer_id": self._consumer_id,
+                         "correlation_id": str(correlation_id),
+                         "topic": msg.topic,
+                         "partition": msg.partition,
+                         "offset": msg.offset,
+                         "error": str(e),
+                     },
+                 )
+                 parsed_skipped += 1
+                 self._track_skipped_offset(skipped_offsets, msg)
+
+         if parsed_skipped > 0:
+             await self.metrics.record_skipped(parsed_skipped)
+
+         # Write each topic's batch to PostgreSQL
+         for topic, items in by_topic.items():
+             writer_method_name = TOPIC_TO_WRITER_METHOD.get(topic)
+             if writer_method_name is None:
+                 logger.warning(
+                     "No writer method for topic",
+                     extra={
+                         "consumer_id": self._consumer_id,
+                         "correlation_id": str(correlation_id),
+                         "topic": topic,
+                     },
+                 )
+                 continue
+
+             writer_method: Callable[
+                 [list[BaseModel], UUID], Coroutine[object, object, int]
+             ] = getattr(self._writer, writer_method_name)
+             models = [item[1] for item in items]
+
+             # Extract correlation_id from events. Models use default_factory=uuid4,
+             # so correlation_id is ALWAYS present - this is defensive iteration.
+             event_correlation_id: UUID | None = None
+             for _, model in items:
+                 if hasattr(model, "correlation_id"):
+                     event_correlation_id = model.correlation_id
+                     break
+
+             # Use event correlation_id (always present via default_factory), or batch fallback.
+             # The assertion guards against impossible state - models guarantee correlation_id.
+             writer_correlation_id = event_correlation_id or correlation_id
+             assert writer_correlation_id is not None, (
+                 "correlation_id must be present - models use default_factory=uuid4"
+             )
+
+             try:
+                 written_count = await writer_method(models, writer_correlation_id)
+
+                 # Record successful offsets per partition for this topic
+                 for msg, _ in items:
+                     tp = TopicPartition(msg.topic, msg.partition)
+                     current = successful_offsets.get(tp, -1)
+                     successful_offsets[tp] = max(current, msg.offset)
+
+                 await self.metrics.record_processed(written_count)
+
+                 logger.debug(
+                     "Wrote batch for topic",
+                     extra={
+                         "consumer_id": self._consumer_id,
+                         "correlation_id": str(correlation_id),
+                         "topic": topic,
+                         "count": written_count,
+                     },
+                 )
+
+             except Exception:
+                 # Write failed for this topic - don't update offsets for its partitions
+                 logger.exception(
+                     "Failed to write batch for topic",
+                     extra={
+                         "consumer_id": self._consumer_id,
+                         "correlation_id": str(correlation_id),
+                         "topic": topic,
+                         "count": len(models),
+                     },
+                 )
+                 await self.metrics.record_failed(len(models))
+                 # Remove any offsets we may have tracked for failed partitions
+                 for msg, _ in items:
+                     tp = TopicPartition(msg.topic, msg.partition)
+                     # Only remove if this batch was the only contributor
+                     # In practice, we don't add until success, so this is safe
+                     successful_offsets.pop(tp, None)
+
+         # Merge skipped message offsets into successful_offsets
+         # Skipped messages (tombstones, invalid UTF-8, JSON errors, validation errors)
+         # must always have their offsets committed to avoid reprocessing
+         for tp, offset in skipped_offsets.items():
+             current = successful_offsets.get(tp, -1)
+             successful_offsets[tp] = max(current, offset)
+
+         return successful_offsets
+
+     async def _commit_offsets(
+         self,
+         offsets: dict[TopicPartition, int],
+         correlation_id: UUID,
+     ) -> None:
+         """Commit only successfully persisted offsets per partition.
+
+         Commits offset + 1 for each partition (next offset to consume).
+
+         Args:
+             offsets: Dictionary mapping TopicPartition to highest persisted offset.
+             correlation_id: Correlation ID for tracing.
+         """
+         if not offsets or self._consumer is None:
+             return
+
+         # Build commit offsets (offset + 1 = next offset to consume)
+         commit_offsets: dict[TopicPartition, int] = {
+             tp: offset + 1 for tp, offset in offsets.items()
+         }
+
+         try:
+             await self._consumer.commit(commit_offsets)
+
+             # Reset consecutive failure counter on successful commit
+             await self.metrics.reset_commit_failures()
+
+             logger.debug(
+                 "Committed offsets",
+                 extra={
+                     "consumer_id": self._consumer_id,
+                     "correlation_id": str(correlation_id),
+                     "partitions": len(commit_offsets),
+                 },
+             )
+
+         except KafkaError:
+             # Track commit failures to identify persistent issues
+             await self.metrics.record_commit_failure()
+
+             # Get current failure count for warning threshold
+             metrics_snapshot = await self.metrics.snapshot()
+             commit_failures = metrics_snapshot.get("commit_failures", 0)
+
+             # Escalate logging level if failures are persistent (5+ consecutive)
+             if isinstance(commit_failures, int) and commit_failures >= 5:
+                 logger.exception(
+                     "Persistent commit failures detected - may indicate Kafka "
+                     "connectivity issues requiring investigation",
+                     extra={
+                         "consumer_id": self._consumer_id,
+                         "correlation_id": str(correlation_id),
+                         "commit_failures": commit_failures,
+                     },
+                 )
+             else:
+                 logger.exception(
+                     "Failed to commit offsets",
+                     extra={
+                         "consumer_id": self._consumer_id,
+                         "correlation_id": str(correlation_id),
+                         "commit_failures": commit_failures,
+                     },
+                 )
+             # Don't re-raise - messages will be reprocessed on restart
+             # (at-least-once delivery semantics preserved)
+
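A worked example of the invariant enforced by _process_batch and _commit_offsets above (topic and offsets are hypothetical):

    from aiokafka import TopicPartition

    # A batch touched two partitions: p0 persisted offsets 5-7; p1's write failed,
    # so p1 never enters the map and its messages are redelivered (at-least-once).
    successful = {TopicPartition("onex.evt.omniclaude.agent-match.v1", 0): 7}
    commit = {tp: offset + 1 for tp, offset in successful.items()}
    # -> commits 8 for p0 (the next offset to consume); nothing for p1.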
+     # =========================================================================
+     # Health Check Server
+     # =========================================================================
+
+     async def _start_health_server(self) -> None:
+         """Start minimal HTTP health check server.
+
+         Starts an aiohttp server on the configured port with health check endpoints:
+         - /health: Full health status (backwards compatible)
+         - /health/live: Kubernetes liveness probe (process running)
+         - /health/ready: Kubernetes readiness probe (dependencies connected)
+         """
+         self._health_app = web.Application()
+         self._health_app.router.add_get("/health", self._health_handler)
+         self._health_app.router.add_get("/health/live", self._liveness_handler)
+         self._health_app.router.add_get("/health/ready", self._readiness_handler)
+
+         self._health_runner = web.AppRunner(self._health_app)
+         await self._health_runner.setup()
+
+         self._health_site = web.TCPSite(
+             self._health_runner,
+             host=self._config.health_check_host,  # Configurable - see config.py for security notes
+             port=self._config.health_check_port,
+         )
+         await self._health_site.start()
+
+         logger.info(
+             "Health check server started",
+             extra={
+                 "consumer_id": self._consumer_id,
+                 "host": self._config.health_check_host,
+                 "port": self._config.health_check_port,
+                 "endpoints": ["/health", "/health/live", "/health/ready"],
+             },
+         )
+
+     def _determine_health_status(
+         self,
+         metrics_snapshot: dict[str, object],
+         circuit_state: dict[str, JsonType],
+     ) -> EnumHealthStatus:
+         """Determine consumer health status based on current state.
+
+         Health status determination rules (in priority order):
+         1. UNHEALTHY: Consumer is not running (stopped or crashed)
+         2. DEGRADED: Circuit breaker is open or half-open (database issues, retrying)
+         3. DEGRADED: Last poll exceeds poll staleness threshold (consumer not polling)
+         4. DEGRADED: No writes yet AND consumer running > startup_grace_period_seconds (configurable)
+         5. DEGRADED: Last successful write exceeds staleness threshold (with messages received)
+         6. HEALTHY: All other cases (running, circuit closed, recent activity or in grace period)
+
+         The startup grace period (configurable via startup_grace_period_seconds, default 60s)
+         allows the consumer to be considered healthy immediately after starting, before any
+         messages have been consumed.
+
+         Args:
+             metrics_snapshot: Snapshot of current consumer metrics including
+                 timestamps for started_at, last_poll_at, and last_successful_write_at.
+             circuit_state: Current circuit breaker state from the writer,
+                 containing at minimum a "state" key.
+
+         Returns:
+             EnumHealthStatus indicating current health:
+             - HEALTHY: Fully operational
+             - DEGRADED: Running but with issues (circuit open/half-open, stale polls/writes)
+             - UNHEALTHY: Not running
+         """
+         # Rule 1: Consumer not running -> UNHEALTHY
+         if not self._running:
+             return EnumHealthStatus.UNHEALTHY
+
+         # Rule 2: Circuit breaker open or half-open -> DEGRADED
+         circuit_breaker_state = circuit_state.get("state")
+         if circuit_breaker_state in ("open", "half_open"):
+             return EnumHealthStatus.DEGRADED
+
+         # Rule 3: Check poll staleness (consumer not polling Kafka)
+         last_poll = metrics_snapshot.get("last_poll_at")
+         if last_poll is not None:
+             try:
+                 last_poll_dt = datetime.fromisoformat(str(last_poll))
+                 poll_age_seconds = (datetime.now(UTC) - last_poll_dt).total_seconds()
+                 if poll_age_seconds > self._config.health_check_poll_staleness_seconds:
+                     # Poll exceeds staleness threshold -> DEGRADED
+                     return EnumHealthStatus.DEGRADED
+             except (ValueError, TypeError):
+                 # Parse error - continue to other checks
+                 pass
+
+         # Check for recent successful write (within staleness threshold)
+         last_write = metrics_snapshot.get("last_successful_write_at")
+         messages_received = metrics_snapshot.get("messages_received", 0)
+
+         if last_write is None:
+             # No writes yet - check startup grace period (configurable, default 60s)
+             started_at_str = metrics_snapshot.get("started_at")
+             if started_at_str is not None:
+                 try:
+                     started_at_dt = datetime.fromisoformat(str(started_at_str))
+                     age_seconds = (datetime.now(UTC) - started_at_dt).total_seconds()
+                     if age_seconds <= self._config.startup_grace_period_seconds:
+                         # Rule 6: Consumer just started, healthy even without writes
+                         return EnumHealthStatus.HEALTHY
+                     else:
+                         # Rule 4: Consumer running > grace period with no writes -> DEGRADED
+                         return EnumHealthStatus.DEGRADED
+                 except (ValueError, TypeError):
+                     # Parse error - fallback to healthy
+                     return EnumHealthStatus.HEALTHY
+             else:
+                 # No started_at timestamp (shouldn't happen) - assume healthy
+                 return EnumHealthStatus.HEALTHY
+         else:
+             # Check if last write was recent (within staleness threshold)
+             # Only consider stale if we have received messages (active traffic)
+             try:
+                 last_write_dt = datetime.fromisoformat(str(last_write))
+                 write_age_seconds = (datetime.now(UTC) - last_write_dt).total_seconds()
+                 if (
+                     write_age_seconds > self._config.health_check_staleness_seconds
+                     and isinstance(messages_received, int)
+                     and messages_received > 0
+                 ):
+                     # Rule 5: Last write exceeds staleness threshold with traffic -> DEGRADED
+                     return EnumHealthStatus.DEGRADED
+                 else:
+                     # Rule 6: Recent write or no traffic -> HEALTHY
+                     return EnumHealthStatus.HEALTHY
+             except (ValueError, TypeError):
+                 # Parse error - fallback to healthy
+                 return EnumHealthStatus.HEALTHY
+
+     async def _health_handler(self, request: web.Request) -> web.Response:
+         """Handle health check requests.
+
+         Returns JSON with health status based on:
+         - Consumer running state
+         - Circuit breaker state (from writer)
+         - Last successful write timestamp
+
+         Args:
+             request: aiohttp request object.
+
+         Returns:
+             JSON response with health status.
+         """
+         metrics_snapshot = await self.metrics.snapshot()
+         circuit_state = self._writer.get_circuit_breaker_state() if self._writer else {}
+
+         # Determine health status using shared logic
+         status = self._determine_health_status(metrics_snapshot, circuit_state)
+
+         response_body = {
+             "status": status.value,
+             "consumer_running": self._running,
+             "consumer_id": self._consumer_id,
+             "last_poll_time": metrics_snapshot.get("last_poll_at"),
+             "last_successful_write": metrics_snapshot.get("last_successful_write_at"),
+             "circuit_breaker_state": circuit_state.get("state", "unknown"),
+             "messages_processed": metrics_snapshot.get("messages_processed", 0),
+             "messages_failed": metrics_snapshot.get("messages_failed", 0),
+             "batches_processed": metrics_snapshot.get("batches_processed", 0),
+         }
+
+         # Return appropriate HTTP status code
+         http_status = 200 if status == EnumHealthStatus.HEALTHY else 503
+
+         return web.json_response(response_body, status=http_status)
+
+     async def _liveness_handler(self, request: web.Request) -> web.Response:
+         """Handle Kubernetes liveness probe requests.
+
+         Liveness indicates the process is running and not deadlocked.
+         Returns 200 if the consumer event loop is responsive.
+         Returns 503 if the consumer is not running.
+
+         This is a minimal check - if we can respond to this request,
+         the event loop is not blocked and the process is alive.
+
+         Args:
+             request: aiohttp request object.
+
+         Returns:
+             JSON response with liveness status.
+         """
+         # If we can respond, the process is alive
+         is_alive = self._running
+
+         response_body = {
+             "status": "alive" if is_alive else "dead",
+             "consumer_id": self._consumer_id,
+         }
+
+         return web.json_response(response_body, status=200 if is_alive else 503)
+
+     async def _readiness_handler(self, request: web.Request) -> web.Response:
+         """Handle Kubernetes readiness probe requests.
+
+         Readiness indicates the consumer can accept work - all dependencies
+         are connected and the circuit breaker is not open.
+
+         Dependencies checked:
+         - PostgreSQL pool connected
+         - Kafka consumer initialized
+         - Writer available
+         - Circuit breaker not in OPEN state
+
+         Args:
+             request: aiohttp request object.
+
+         Returns:
+             JSON response with readiness status and dependency states.
+         """
+         dependencies_ready = {
+             "postgres_pool": self._pool is not None,
+             "kafka_consumer": self._consumer is not None,
+             "writer": self._writer is not None,
+         }
+
+         # Check circuit breaker - OPEN means not ready to accept work
+         circuit_state = self._writer.get_circuit_breaker_state() if self._writer else {}
+         circuit_ready = circuit_state.get("state") != "open"
+         dependencies_ready["circuit_breaker"] = circuit_ready
+
+         all_ready = all(dependencies_ready.values()) and self._running
+
+         response_body = {
+             "status": "ready" if all_ready else "not_ready",
+             "consumer_id": self._consumer_id,
+             "consumer_running": self._running,
+             "dependencies": dependencies_ready,
+             "circuit_breaker_state": circuit_state.get("state", "unknown"),
+         }
+
+         return web.json_response(response_body, status=200 if all_ready else 503)
+
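A client-side probe sketch against the three endpoints above (host and port are assumptions for illustration; the service takes them from config):

    import asyncio
    import aiohttp

    async def probe(base: str = "http://localhost:8080") -> None:
        async with aiohttp.ClientSession() as session:
            for path in ("/health", "/health/live", "/health/ready"):
                async with session.get(f"{base}{path}") as resp:
                    # 200 means healthy/alive/ready; 503 signals probe failure
                    print(path, resp.status, await resp.json())

    asyncio.run(probe())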
+     # =========================================================================
+     # Health Check (Direct API)
+     # =========================================================================
+
+     async def health_check(self) -> dict[str, object]:
+         """Check consumer health status.
+
+         Returns a dictionary with health information for programmatic access.
+
+         Returns:
+             Dictionary with health status including:
+             - status: Overall health (healthy, degraded, unhealthy)
+             - consumer_running: Whether consume loop is active
+             - circuit_breaker_state: Current circuit breaker state
+             - consumer_id: Unique consumer identifier
+             - metrics: Current metrics snapshot
+         """
+         metrics_snapshot = await self.metrics.snapshot()
+         circuit_state = self._writer.get_circuit_breaker_state() if self._writer else {}
+
+         # Determine health status using shared logic
+         status = self._determine_health_status(metrics_snapshot, circuit_state)
+
+         return {
+             "status": status.value,
+             "consumer_running": self._running,
+             "consumer_id": self._consumer_id,
+             "group_id": self._config.kafka_group_id,
+             "topics": self._config.topics,
+             "circuit_breaker_state": circuit_state,
+             "metrics": metrics_snapshot,
+         }
+
+
+ # =============================================================================
+ # Entry Point
+ # =============================================================================
+
+
+ async def _main() -> None:
+     """Main entry point for running the consumer as a module."""
+     # Load configuration from environment
+     config = ConfigInjectionEffectivenessConsumer()
+
+     logger.info(
+         "Starting injection effectiveness consumer",
+         extra={
+             "topics": config.topics,
+             "bootstrap_servers": config.kafka_bootstrap_servers,
+             "postgres_dsn": mask_dsn_password(config.postgres_dsn),
+             "group_id": config.kafka_group_id,
+             "health_port": config.health_check_port,
+         },
+     )
+
+     consumer = InjectionEffectivenessConsumer(config)
+
+     # Set up signal handlers
+     loop = asyncio.get_running_loop()
+     shutdown_task: asyncio.Task[None] | None = None
+
+     def signal_handler() -> None:
+         nonlocal shutdown_task
+         logger.info("Received shutdown signal")
+         # Only create shutdown task once to avoid race conditions
+         if shutdown_task is None:
+             shutdown_task = asyncio.create_task(consumer.stop())
+
+     for sig in (signal.SIGTERM, signal.SIGINT):
+         loop.add_signal_handler(sig, signal_handler)
+
+     try:
+         await consumer.start()
+         await consumer.run()
+     except asyncio.CancelledError:
+         logger.info("Consumer cancelled")
+     finally:
+         # Ensure shutdown task completes if it was started by signal handler
+         if shutdown_task is not None:
+             if not shutdown_task.done():
+                 await shutdown_task
+             # Task already completed, no action needed
+         else:
+             # No signal received, perform clean shutdown
+             await consumer.stop()
+
+
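For comparison, the async context manager defined earlier allows a shorter entry point; a sketch without the signal handling above (_main_simple is illustrative, not part of the module):

    async def _main_simple() -> None:
        # __aenter__ calls start(); __aexit__ calls stop() even on error.
        async with InjectionEffectivenessConsumer(
            ConfigInjectionEffectivenessConsumer()
        ) as consumer:
            await consumer.run()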
+ if __name__ == "__main__":
+     # Configure logging
+     logging.basicConfig(
+         level=logging.INFO,
+         format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+     )
+
+     asyncio.run(_main())
+
+
+ __all__ = [
+     "InjectionEffectivenessConsumer",
+     "ConsumerMetrics",
+     "EnumHealthStatus",
+     "TOPIC_TO_MODEL",
+     "TOPIC_TO_WRITER_METHOD",
+     "mask_dsn_password",
+ ]