omnibase_infra 0.2.7__py3-none-any.whl → 0.2.9__py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Files changed (79)
  1. omnibase_infra/__init__.py +1 -1
  2. omnibase_infra/enums/__init__.py +4 -0
  3. omnibase_infra/enums/enum_declarative_node_violation.py +102 -0
  4. omnibase_infra/event_bus/adapters/__init__.py +31 -0
  5. omnibase_infra/event_bus/adapters/adapter_protocol_event_publisher_kafka.py +517 -0
  6. omnibase_infra/mixins/mixin_async_circuit_breaker.py +113 -1
  7. omnibase_infra/models/__init__.py +9 -0
  8. omnibase_infra/models/event_bus/__init__.py +22 -0
  9. omnibase_infra/models/event_bus/model_consumer_retry_config.py +367 -0
  10. omnibase_infra/models/event_bus/model_dlq_config.py +177 -0
  11. omnibase_infra/models/event_bus/model_idempotency_config.py +131 -0
  12. omnibase_infra/models/event_bus/model_offset_policy_config.py +107 -0
  13. omnibase_infra/models/resilience/model_circuit_breaker_config.py +15 -0
  14. omnibase_infra/models/validation/__init__.py +8 -0
  15. omnibase_infra/models/validation/model_declarative_node_validation_result.py +139 -0
  16. omnibase_infra/models/validation/model_declarative_node_violation.py +169 -0
  17. omnibase_infra/nodes/architecture_validator/__init__.py +28 -7
  18. omnibase_infra/nodes/architecture_validator/constants.py +36 -0
  19. omnibase_infra/nodes/architecture_validator/handlers/__init__.py +28 -0
  20. omnibase_infra/nodes/architecture_validator/handlers/contract.yaml +120 -0
  21. omnibase_infra/nodes/architecture_validator/handlers/handler_architecture_validation.py +359 -0
  22. omnibase_infra/nodes/architecture_validator/node.py +1 -0
  23. omnibase_infra/nodes/architecture_validator/node_architecture_validator.py +48 -336
  24. omnibase_infra/nodes/node_ledger_projection_compute/__init__.py +16 -2
  25. omnibase_infra/nodes/node_ledger_projection_compute/contract.yaml +14 -4
  26. omnibase_infra/nodes/node_ledger_projection_compute/handlers/__init__.py +18 -0
  27. omnibase_infra/nodes/node_ledger_projection_compute/handlers/contract.yaml +53 -0
  28. omnibase_infra/nodes/node_ledger_projection_compute/handlers/handler_ledger_projection.py +354 -0
  29. omnibase_infra/nodes/node_ledger_projection_compute/node.py +20 -256
  30. omnibase_infra/nodes/node_registry_effect/node.py +20 -73
  31. omnibase_infra/protocols/protocol_dispatch_engine.py +90 -0
  32. omnibase_infra/runtime/__init__.py +11 -0
  33. omnibase_infra/runtime/baseline_subscriptions.py +150 -0
  34. omnibase_infra/runtime/event_bus_subcontract_wiring.py +455 -24
  35. omnibase_infra/runtime/kafka_contract_source.py +13 -5
  36. omnibase_infra/runtime/service_message_dispatch_engine.py +112 -0
  37. omnibase_infra/runtime/service_runtime_host_process.py +6 -11
  38. omnibase_infra/services/__init__.py +36 -0
  39. omnibase_infra/services/contract_publisher/__init__.py +95 -0
  40. omnibase_infra/services/contract_publisher/config.py +199 -0
  41. omnibase_infra/services/contract_publisher/errors.py +243 -0
  42. omnibase_infra/services/contract_publisher/models/__init__.py +28 -0
  43. omnibase_infra/services/contract_publisher/models/model_contract_error.py +67 -0
  44. omnibase_infra/services/contract_publisher/models/model_infra_error.py +62 -0
  45. omnibase_infra/services/contract_publisher/models/model_publish_result.py +112 -0
  46. omnibase_infra/services/contract_publisher/models/model_publish_stats.py +79 -0
  47. omnibase_infra/services/contract_publisher/service.py +617 -0
  48. omnibase_infra/services/contract_publisher/sources/__init__.py +52 -0
  49. omnibase_infra/services/contract_publisher/sources/model_discovered.py +155 -0
  50. omnibase_infra/services/contract_publisher/sources/protocol.py +101 -0
  51. omnibase_infra/services/contract_publisher/sources/source_composite.py +309 -0
  52. omnibase_infra/services/contract_publisher/sources/source_filesystem.py +174 -0
  53. omnibase_infra/services/contract_publisher/sources/source_package.py +221 -0
  54. omnibase_infra/services/observability/__init__.py +40 -0
  55. omnibase_infra/services/observability/agent_actions/__init__.py +64 -0
  56. omnibase_infra/services/observability/agent_actions/config.py +209 -0
  57. omnibase_infra/services/observability/agent_actions/consumer.py +1320 -0
  58. omnibase_infra/services/observability/agent_actions/models/__init__.py +87 -0
  59. omnibase_infra/services/observability/agent_actions/models/model_agent_action.py +142 -0
  60. omnibase_infra/services/observability/agent_actions/models/model_detection_failure.py +125 -0
  61. omnibase_infra/services/observability/agent_actions/models/model_envelope.py +85 -0
  62. omnibase_infra/services/observability/agent_actions/models/model_execution_log.py +159 -0
  63. omnibase_infra/services/observability/agent_actions/models/model_performance_metric.py +130 -0
  64. omnibase_infra/services/observability/agent_actions/models/model_routing_decision.py +138 -0
  65. omnibase_infra/services/observability/agent_actions/models/model_transformation_event.py +124 -0
  66. omnibase_infra/services/observability/agent_actions/tests/__init__.py +20 -0
  67. omnibase_infra/services/observability/agent_actions/tests/test_consumer.py +1154 -0
  68. omnibase_infra/services/observability/agent_actions/tests/test_models.py +645 -0
  69. omnibase_infra/services/observability/agent_actions/tests/test_writer.py +709 -0
  70. omnibase_infra/services/observability/agent_actions/writer_postgres.py +926 -0
  71. omnibase_infra/validation/__init__.py +12 -0
  72. omnibase_infra/validation/contracts/declarative_node.validation.yaml +143 -0
  73. omnibase_infra/validation/validation_exemptions.yaml +93 -0
  74. omnibase_infra/validation/validator_declarative_node.py +850 -0
  75. {omnibase_infra-0.2.7.dist-info → omnibase_infra-0.2.9.dist-info}/METADATA +3 -3
  76. {omnibase_infra-0.2.7.dist-info → omnibase_infra-0.2.9.dist-info}/RECORD +79 -27
  77. {omnibase_infra-0.2.7.dist-info → omnibase_infra-0.2.9.dist-info}/WHEEL +0 -0
  78. {omnibase_infra-0.2.7.dist-info → omnibase_infra-0.2.9.dist-info}/entry_points.txt +0 -0
  79. {omnibase_infra-0.2.7.dist-info → omnibase_infra-0.2.9.dist-info}/licenses/LICENSE +0 -0
omnibase_infra/services/observability/agent_actions/consumer.py
@@ -0,0 +1,1320 @@
1
+ # SPDX-License-Identifier: MIT
2
+ # Copyright (c) 2025 OmniNode Team
3
+ """Async Kafka Consumer for Agent Actions Observability.
4
+
5
+ This module provides an async Kafka consumer for agent observability events.
6
+ Events are consumed from multiple topics, validated using Pydantic models,
7
+ and persisted to PostgreSQL via the WriterAgentActionsPostgres.
8
+
9
+ Design Decisions:
10
+ - Per-partition offset tracking: Commit only successfully persisted partitions
11
+ - Batch processing: Configurable batch size and timeout
12
+ - Circuit breaker: Resilience via writer's MixinAsyncCircuitBreaker
13
+ - Health check: HTTP endpoint for Kubernetes probes
14
+ - Graceful shutdown: Signal handling with drain and commit
15
+
16
+ Critical Invariant:
17
+ For each (topic, partition), commit offsets only up to the highest offset
18
+ that has been successfully persisted for that partition.
19
+ Never commit offsets for partitions that had write failures in the batch.
20
+
21
+ Topics consumed:
22
+ - agent-actions
23
+ - agent-routing-decisions
24
+ - agent-transformation-events
25
+ - router-performance-metrics
26
+ - agent-detection-failures
27
+ - agent-execution-logs
28
+
29
+ Related Tickets:
30
+ - OMN-1743: Migrate agent_actions_consumer to omnibase_infra (current)
31
+ - OMN-1526: Session consumer moved from omniclaude (reference pattern)
32
+
33
+ Example:
34
+ >>> from omnibase_infra.services.observability.agent_actions import (
35
+ ... AgentActionsConsumer,
36
+ ... ConfigAgentActionsConsumer,
37
+ ... )
38
+ >>>
39
+ >>> config = ConfigAgentActionsConsumer(
40
+ ... kafka_bootstrap_servers="localhost:9092",
41
+ ... postgres_dsn="postgresql://postgres:secret@localhost:5432/omninode_bridge",
42
+ ... )
43
+ >>> consumer = AgentActionsConsumer(config)
44
+ >>>
45
+ >>> # Run consumer (blocking)
46
+ >>> await consumer.start()
47
+ >>> await consumer.run()
48
+
49
+ # Or run as module:
50
+ # python -m omnibase_infra.services.observability.agent_actions.consumer
51
+ """
52
+
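# A minimal, separate-script sketch of the lifecycle described above, using the
# async context manager defined later in this module (__aenter__ calls start(),
# __aexit__ calls stop()). The broker address and DSN are placeholders.
import asyncio

from omnibase_infra.services.observability.agent_actions import (
    AgentActionsConsumer,
    ConfigAgentActionsConsumer,
)


async def main() -> None:
    config = ConfigAgentActionsConsumer(
        kafka_bootstrap_servers="localhost:9092",
        postgres_dsn="postgresql://postgres:secret@localhost:5432/omninode_bridge",
    )
    async with AgentActionsConsumer(config) as consumer:
        await consumer.run()


if __name__ == "__main__":
    asyncio.run(main())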
53
+ from __future__ import annotations
54
+
55
+ import asyncio
56
+ import json
57
+ import logging
58
+ import signal
59
+ from collections.abc import Callable, Coroutine
60
+ from datetime import UTC, datetime
61
+ from enum import StrEnum
62
+ from typing import TYPE_CHECKING
63
+ from urllib.parse import urlparse, urlunparse
64
+ from uuid import UUID, uuid4
65
+
66
+ import asyncpg
67
+ from aiohttp import web
68
+ from aiokafka import AIOKafkaConsumer, TopicPartition
69
+ from aiokafka.errors import KafkaError
70
+ from pydantic import BaseModel, ValidationError
71
+
72
+ from omnibase_core.errors import OnexError
73
+ from omnibase_core.types import JsonType
74
+ from omnibase_infra.services.observability.agent_actions.config import (
75
+ ConfigAgentActionsConsumer,
76
+ )
77
+ from omnibase_infra.services.observability.agent_actions.models import (
78
+ ModelAgentAction,
79
+ ModelDetectionFailure,
80
+ ModelExecutionLog,
81
+ ModelPerformanceMetric,
82
+ ModelRoutingDecision,
83
+ ModelTransformationEvent,
84
+ )
85
+ from omnibase_infra.services.observability.agent_actions.writer_postgres import (
86
+ WriterAgentActionsPostgres,
87
+ )
88
+
89
+ if TYPE_CHECKING:
90
+ from aiokafka.structs import ConsumerRecord
91
+
92
+ logger = logging.getLogger(__name__)
93
+
94
+
95
+ # =============================================================================
96
+ # Utility Functions
97
+ # =============================================================================
98
+
99
+
100
+ def mask_dsn_password(dsn: str) -> str:
101
+ """Mask password in a PostgreSQL DSN for safe logging.
102
+
103
+ Parses the DSN and replaces any password component with '***'.
104
+ Handles standard PostgreSQL connection string formats.
105
+
106
+ Args:
107
+ dsn: PostgreSQL connection string, e.g.,
108
+ 'postgresql://user:password@host:port/db'
109
+
110
+ Returns:
111
+ DSN with password replaced by '***'. If parsing fails or no password
112
+ is present, returns the original DSN (safe - no password to mask).
113
+
114
+ Examples:
115
+ >>> mask_dsn_password("postgresql://user:secret@localhost:5432/db")
116
+ 'postgresql://user:***@localhost:5432/db'
117
+
118
+ >>> mask_dsn_password("postgresql://user@localhost/db")
119
+ 'postgresql://user@localhost/db'
120
+
121
+ >>> mask_dsn_password("invalid-dsn")
122
+ 'invalid-dsn'
123
+ """
124
+ try:
125
+ parsed = urlparse(dsn)
126
+
127
+ # No password present - safe to return as-is
128
+ if not parsed.password:
129
+ return dsn
130
+
131
+ # Reconstruct netloc with masked password
132
+ # Format: user:***@host:port or user:***@host
133
+ if parsed.port:
134
+ masked_netloc = f"{parsed.username}:***@{parsed.hostname}:{parsed.port}"
135
+ else:
136
+ masked_netloc = f"{parsed.username}:***@{parsed.hostname}"
137
+
138
+ # Reconstruct the full DSN with masked password
139
+ masked = urlunparse(
140
+ (
141
+ parsed.scheme,
142
+ masked_netloc,
143
+ parsed.path,
144
+ parsed.params,
145
+ parsed.query,
146
+ parsed.fragment,
147
+ )
148
+ )
149
+ return masked
150
+
151
+ except Exception:
152
+ # If parsing fails, return original (likely no password to mask)
153
+ # Log at debug level to avoid noise
154
+ logger.debug("Failed to parse DSN for masking, returning as-is")
155
+ return dsn
156
+
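# Masking sketch: only the password component changes; host, port, database and
# any query string survive untouched. The sslmode argument below is just an
# illustrative query parameter.
assert (
    mask_dsn_password("postgresql://user:secret@db.internal:5432/obs?sslmode=require")
    == "postgresql://user:***@db.internal:5432/obs?sslmode=require"
)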
157
+
158
+ # =============================================================================
159
+ # Type Aliases and Constants
160
+ # =============================================================================
161
+
162
+ # Map topics to their Pydantic model class
163
+ TOPIC_TO_MODEL: dict[str, type[BaseModel]] = {
164
+ "agent-actions": ModelAgentAction,
165
+ "agent-routing-decisions": ModelRoutingDecision,
166
+ "agent-transformation-events": ModelTransformationEvent,
167
+ "router-performance-metrics": ModelPerformanceMetric,
168
+ "agent-detection-failures": ModelDetectionFailure,
169
+ "agent-execution-logs": ModelExecutionLog,
170
+ }
171
+
172
+ # Map topics to writer method names
173
+ TOPIC_TO_WRITER_METHOD: dict[str, str] = {
174
+ "agent-actions": "write_agent_actions",
175
+ "agent-routing-decisions": "write_routing_decisions",
176
+ "agent-transformation-events": "write_transformation_events",
177
+ "router-performance-metrics": "write_performance_metrics",
178
+ "agent-detection-failures": "write_detection_failures",
179
+ "agent-execution-logs": "write_execution_logs",
180
+ }
181
+
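# Dispatch sketch showing how the two tables above are used together; it
# mirrors the per-message handling in _process_batch further down. The writer
# instance and the ad-hoc correlation UUID are assumptions of this sketch.
async def _route_one(
    writer: WriterAgentActionsPostgres, topic: str, raw: bytes
) -> int:
    model_cls = TOPIC_TO_MODEL[topic]                       # e.g. ModelAgentAction
    model = model_cls.model_validate(json.loads(raw))       # Pydantic validation
    write = getattr(writer, TOPIC_TO_WRITER_METHOD[topic])  # e.g. write_agent_actions
    return await write([model], uuid4())                    # rows written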
182
+
183
+ # =============================================================================
184
+ # Enums
185
+ # =============================================================================
186
+
187
+
188
+ class EnumHealthStatus(StrEnum):
189
+ """Health check status values.
190
+
191
+ Used by the health check endpoint to indicate consumer health.
192
+
193
+ Status Semantics:
194
+ HEALTHY: Consumer running, circuit closed, recent successful write
195
+ DEGRADED: Consumer running but circuit open (retrying)
196
+ UNHEALTHY: Consumer stopped or no writes for extended period
197
+ """
198
+
199
+ HEALTHY = "healthy"
200
+ DEGRADED = "degraded"
201
+ UNHEALTHY = "unhealthy"
202
+
203
+
204
+ # =============================================================================
205
+ # Consumer Metrics
206
+ # =============================================================================
207
+
208
+
209
+ class ConsumerMetrics:
210
+ """Metrics tracking for the agent actions consumer.
211
+
212
+ Tracks processing statistics for observability and monitoring.
213
+ Safe for concurrent use by asyncio tasks via an asyncio.Lock (coroutine-safe, not thread-safe).
214
+
215
+ Attributes:
216
+ messages_received: Total messages received from Kafka.
217
+ messages_processed: Successfully processed messages.
218
+ messages_failed: Messages that failed processing.
219
+ messages_skipped: Messages skipped (invalid, duplicate, etc.).
220
+ batches_processed: Number of batches successfully processed.
221
+ last_poll_at: Timestamp of last Kafka poll.
222
+ last_successful_write_at: Timestamp of last successful database write.
223
+ started_at: Timestamp when metrics were initialized (consumer start time).
224
+ """
225
+
226
+ def __init__(self) -> None:
227
+ """Initialize metrics with zero values."""
228
+ self.messages_received: int = 0
229
+ self.messages_processed: int = 0
230
+ self.messages_failed: int = 0
231
+ self.messages_skipped: int = 0
232
+ self.batches_processed: int = 0
233
+ self.last_poll_at: datetime | None = None
234
+ self.last_successful_write_at: datetime | None = None
235
+ self.started_at: datetime = datetime.now(UTC)
236
+ self._lock = asyncio.Lock()
237
+
238
+ async def record_received(self, count: int = 1) -> None:
239
+ """Record messages received."""
240
+ async with self._lock:
241
+ self.messages_received += count
242
+ self.last_poll_at = datetime.now(UTC)
243
+
244
+ async def record_processed(self, count: int = 1) -> None:
245
+ """Record successfully processed messages."""
246
+ async with self._lock:
247
+ self.messages_processed += count
248
+ self.last_successful_write_at = datetime.now(UTC)
249
+
250
+ async def record_failed(self, count: int = 1) -> None:
251
+ """Record failed messages."""
252
+ async with self._lock:
253
+ self.messages_failed += count
254
+
255
+ async def record_skipped(self, count: int = 1) -> None:
256
+ """Record skipped messages."""
257
+ async with self._lock:
258
+ self.messages_skipped += count
259
+
260
+ async def record_batch_processed(self) -> None:
261
+ """Record a successfully processed batch."""
262
+ async with self._lock:
263
+ self.batches_processed += 1
264
+
265
+ async def record_polled(self) -> None:
266
+ """Record a poll attempt (updates last_poll_at regardless of message count).
267
+
268
+ This method should be called after every successful Kafka poll, even when
269
+ the poll returns no messages. This prevents false DEGRADED health status
270
+ on low-traffic topics where empty polls are normal.
271
+
272
+ See: CodeRabbit PR #220 feedback - last_poll_at was only updated via
273
+ record_received(), causing stale timestamps on empty polls.
274
+ """
275
+ async with self._lock:
276
+ self.last_poll_at = datetime.now(UTC)
277
+
278
+ async def snapshot(self) -> dict[str, object]:
279
+ """Get a snapshot of current metrics.
280
+
281
+ Returns:
282
+ Dictionary with all metric values.
283
+ """
284
+ async with self._lock:
285
+ return {
286
+ "messages_received": self.messages_received,
287
+ "messages_processed": self.messages_processed,
288
+ "messages_failed": self.messages_failed,
289
+ "messages_skipped": self.messages_skipped,
290
+ "batches_processed": self.batches_processed,
291
+ "last_poll_at": (
292
+ self.last_poll_at.isoformat() if self.last_poll_at else None
293
+ ),
294
+ "last_successful_write_at": (
295
+ self.last_successful_write_at.isoformat()
296
+ if self.last_successful_write_at
297
+ else None
298
+ ),
299
+ "started_at": self.started_at.isoformat(),
300
+ }
301
+
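# Usage sketch for ConsumerMetrics with illustrative counts: record activity
# from coroutines, then read a JSON-serialisable snapshot.
async def _metrics_demo() -> None:
    metrics = ConsumerMetrics()
    await metrics.record_received(3)        # one poll returned 3 messages
    await metrics.record_processed(2)       # 2 rows persisted successfully
    await metrics.record_skipped(1)         # 1 message failed validation
    await metrics.record_batch_processed()
    snap = await metrics.snapshot()
    assert snap["messages_received"] == 3 and snap["batches_processed"] == 1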
302
+
303
+ # =============================================================================
304
+ # Agent Actions Consumer
305
+ # =============================================================================
306
+
307
+
308
+ class AgentActionsConsumer:
309
+ """Async Kafka consumer for agent observability events.
310
+
311
+ Consumes events from multiple observability topics and persists them
312
+ to PostgreSQL. Implements at-least-once delivery with per-partition
313
+ offset tracking to ensure no message loss on partial batch failures.
314
+
315
+ Features:
316
+ - **Per-partition offset tracking**: Commit only successfully persisted
317
+ partitions. Partial batch failures do not cause message loss.
318
+
319
+ - **Batch processing**: Configurable batch size and timeout for
320
+ efficient database writes via executemany.
321
+
322
+ - **Circuit breaker**: Database resilience via writer's circuit breaker.
323
+ Consumer degrades gracefully when database is unavailable.
324
+
325
+ - **Health check endpoint**: HTTP server for Kubernetes liveness
326
+ and readiness probes.
327
+
328
+ - **Graceful shutdown**: Signal handling with drain and final commit.
329
+
330
+ Thread Safety:
331
+ This consumer is designed for single-threaded async execution.
332
+ Multiple consumers can run with different group_ids for horizontal
333
+ scaling (partition assignment via Kafka consumer groups).
334
+
335
+ Example:
336
+ >>> config = ConfigAgentActionsConsumer(
337
+ ... kafka_bootstrap_servers="localhost:9092",
338
+ ... postgres_dsn="postgresql://postgres:secret@localhost:5432/omninode_bridge",
339
+ ... )
340
+ >>> consumer = AgentActionsConsumer(config)
341
+ >>>
342
+ >>> await consumer.start()
343
+ >>> try:
344
+ ... await consumer.run()
345
+ ... finally:
346
+ ... await consumer.stop()
347
+
348
+ Attributes:
349
+ metrics: Consumer metrics for observability.
350
+ is_running: Whether the consumer is currently running.
351
+ """
352
+
353
+ def __init__(self, config: ConfigAgentActionsConsumer) -> None:
354
+ """Initialize the agent actions consumer.
355
+
356
+ Args:
357
+ config: Consumer configuration (Kafka, PostgreSQL, batch settings).
358
+
359
+ Example:
360
+ >>> config = ConfigAgentActionsConsumer(
361
+ ... kafka_bootstrap_servers="localhost:9092",
362
+ ... postgres_dsn="postgresql://postgres:secret@localhost:5432/omninode_bridge",
363
+ ... )
364
+ >>> consumer = AgentActionsConsumer(config)
365
+ """
366
+ self._config = config
367
+ self._consumer: AIOKafkaConsumer | None = None
368
+ self._pool: asyncpg.Pool | None = None
369
+ self._writer: WriterAgentActionsPostgres | None = None
370
+ self._running = False
371
+ self._shutdown_event = asyncio.Event()
372
+
373
+ # Health check server
374
+ self._health_app: web.Application | None = None
375
+ self._health_runner: web.AppRunner | None = None
376
+ self._health_site: web.TCPSite | None = None
377
+
378
+ # Metrics
379
+ self.metrics = ConsumerMetrics()
380
+
381
+ # Consumer ID for logging
382
+ self._consumer_id = f"agent-actions-consumer-{uuid4().hex[:8]}"
383
+
384
+ logger.info(
385
+ "AgentActionsConsumer initialized",
386
+ extra={
387
+ "consumer_id": self._consumer_id,
388
+ "topics": self._config.topics,
389
+ "group_id": self._config.kafka_group_id,
390
+ "bootstrap_servers": self._config.kafka_bootstrap_servers,
391
+ "postgres_dsn": mask_dsn_password(self._config.postgres_dsn),
392
+ "batch_size": self._config.batch_size,
393
+ "batch_timeout_ms": self._config.batch_timeout_ms,
394
+ },
395
+ )
396
+
397
+ # =========================================================================
398
+ # Properties
399
+ # =========================================================================
400
+
401
+ @property
402
+ def is_running(self) -> bool:
403
+ """Check if the consumer is currently running.
404
+
405
+ Returns:
406
+ True if start() has been called and stop() has not.
407
+ """
408
+ return self._running
409
+
410
+ @property
411
+ def consumer_id(self) -> str:
412
+ """Get the unique consumer identifier.
413
+
414
+ Returns:
415
+ Consumer ID string for logging and tracing.
416
+ """
417
+ return self._consumer_id
418
+
419
+ # =========================================================================
420
+ # Lifecycle Methods
421
+ # =========================================================================
422
+
423
+ async def start(self) -> None:
424
+ """Start the consumer, pool, writer, and health check server.
425
+
426
+ Creates the asyncpg pool, initializes the writer, creates the Kafka
427
+ consumer, and starts the health check HTTP server.
428
+
429
+ Raises:
430
+ RuntimeError: If the consumer is already running.
431
+ asyncpg.PostgresError: If database connection fails.
432
+ KafkaError: If Kafka connection fails.
433
+
434
+ Example:
435
+ >>> await consumer.start()
436
+ >>> # Consumer is now connected, ready for run()
437
+ """
438
+ if self._running:
439
+ logger.warning(
440
+ "Consumer already running",
441
+ extra={"consumer_id": self._consumer_id},
442
+ )
443
+ return
444
+
445
+ correlation_id = uuid4()
446
+
447
+ logger.info(
448
+ "Starting AgentActionsConsumer",
449
+ extra={
450
+ "consumer_id": self._consumer_id,
451
+ "correlation_id": str(correlation_id),
452
+ "topics": self._config.topics,
453
+ },
454
+ )
455
+
456
+ try:
457
+ # Create PostgreSQL pool
458
+ self._pool = await asyncpg.create_pool(
459
+ dsn=self._config.postgres_dsn,
460
+ min_size=2,
461
+ max_size=10,
462
+ )
463
+ logger.info(
464
+ "PostgreSQL pool created",
465
+ extra={
466
+ "consumer_id": self._consumer_id,
467
+ "correlation_id": str(correlation_id),
468
+ "postgres_dsn": mask_dsn_password(self._config.postgres_dsn),
469
+ },
470
+ )
471
+
472
+ # Create writer with pool injection
473
+ self._writer = WriterAgentActionsPostgres(
474
+ pool=self._pool,
475
+ circuit_breaker_threshold=self._config.circuit_breaker_threshold,
476
+ circuit_breaker_reset_timeout=self._config.circuit_breaker_reset_timeout,
477
+ circuit_breaker_half_open_successes=self._config.circuit_breaker_half_open_successes,
478
+ )
479
+
480
+ # Create Kafka consumer
481
+ self._consumer = AIOKafkaConsumer(
482
+ *self._config.topics,
483
+ bootstrap_servers=self._config.kafka_bootstrap_servers,
484
+ group_id=self._config.kafka_group_id,
485
+ auto_offset_reset=self._config.auto_offset_reset,
486
+ enable_auto_commit=False, # Manual commits for at-least-once
487
+ max_poll_records=self._config.batch_size,
488
+ )
489
+
490
+ await self._consumer.start()
491
+ logger.info(
492
+ "Kafka consumer started",
493
+ extra={
494
+ "consumer_id": self._consumer_id,
495
+ "correlation_id": str(correlation_id),
496
+ "topics": self._config.topics,
497
+ "group_id": self._config.kafka_group_id,
498
+ },
499
+ )
500
+
501
+ # Start health check server
502
+ await self._start_health_server()
503
+
504
+ self._running = True
505
+ self._shutdown_event.clear()
506
+
507
+ logger.info(
508
+ "AgentActionsConsumer started",
509
+ extra={
510
+ "consumer_id": self._consumer_id,
511
+ "correlation_id": str(correlation_id),
512
+ },
513
+ )
514
+
515
+ except Exception as e:
516
+ logger.exception(
517
+ "Failed to start consumer",
518
+ extra={
519
+ "consumer_id": self._consumer_id,
520
+ "correlation_id": str(correlation_id),
521
+ "error": str(e),
522
+ },
523
+ )
524
+ # Cleanup any partial initialization
525
+ await self._cleanup_resources(correlation_id)
526
+ raise
527
+
528
+ async def stop(self) -> None:
529
+ """Stop the consumer gracefully.
530
+
531
+ Signals the consume loop to exit, waits for in-flight processing,
532
+ commits final offsets, and closes all connections. Safe to call
533
+ multiple times.
534
+
535
+ Example:
536
+ >>> await consumer.stop()
537
+ >>> # Consumer is now stopped and disconnected
538
+ """
539
+ if not self._running:
540
+ logger.debug(
541
+ "Consumer not running, nothing to stop",
542
+ extra={"consumer_id": self._consumer_id},
543
+ )
544
+ return
545
+
546
+ correlation_id = uuid4()
547
+
548
+ logger.info(
549
+ "Stopping AgentActionsConsumer",
550
+ extra={
551
+ "consumer_id": self._consumer_id,
552
+ "correlation_id": str(correlation_id),
553
+ },
554
+ )
555
+
556
+ # Signal shutdown
557
+ self._running = False
558
+ self._shutdown_event.set()
559
+
560
+ # Cleanup resources
561
+ await self._cleanup_resources(correlation_id)
562
+
563
+ # Log final metrics
564
+ metrics_snapshot = await self.metrics.snapshot()
565
+ logger.info(
566
+ "AgentActionsConsumer stopped",
567
+ extra={
568
+ "consumer_id": self._consumer_id,
569
+ "correlation_id": str(correlation_id),
570
+ "final_metrics": metrics_snapshot,
571
+ },
572
+ )
573
+
574
+ async def _cleanup_resources(self, correlation_id: UUID) -> None:
575
+ """Clean up all resources during shutdown.
576
+
577
+ Args:
578
+ correlation_id: Correlation ID for logging.
579
+ """
580
+ # Stop health check server
581
+ if self._health_site is not None:
582
+ await self._health_site.stop()
583
+ self._health_site = None
584
+
585
+ if self._health_runner is not None:
586
+ await self._health_runner.cleanup()
587
+ self._health_runner = None
588
+
589
+ self._health_app = None
590
+
591
+ # Stop Kafka consumer
592
+ if self._consumer is not None:
593
+ try:
594
+ await self._consumer.stop()
595
+ except Exception as e:
596
+ logger.warning(
597
+ "Error stopping Kafka consumer",
598
+ extra={
599
+ "consumer_id": self._consumer_id,
600
+ "correlation_id": str(correlation_id),
601
+ "error": str(e),
602
+ },
603
+ )
604
+ finally:
605
+ self._consumer = None
606
+
607
+ # Close PostgreSQL pool
608
+ if self._pool is not None:
609
+ try:
610
+ await self._pool.close()
611
+ except Exception as e:
612
+ logger.warning(
613
+ "Error closing PostgreSQL pool",
614
+ extra={
615
+ "consumer_id": self._consumer_id,
616
+ "correlation_id": str(correlation_id),
617
+ "error": str(e),
618
+ },
619
+ )
620
+ finally:
621
+ self._pool = None
622
+
623
+ self._writer = None
624
+
625
+ async def run(self) -> None:
626
+ """Run the main consume loop.
627
+
628
+ Continuously consumes messages from Kafka topics, processes them
629
+ in batches, and writes to PostgreSQL. Implements at-least-once
630
+ delivery by committing offsets only after successful writes.
631
+
632
+ This method blocks until stop() is called or an unrecoverable error
633
+ occurs. Use this after calling start().
634
+
635
+ Example:
636
+ >>> await consumer.start()
637
+ >>> try:
638
+ ... await consumer.run()
639
+ ... finally:
640
+ ... await consumer.stop()
641
+ """
642
+ if not self._running or self._consumer is None:
643
+ raise OnexError(
644
+ "Consumer not started. Call start() before run().",
645
+ )
646
+
647
+ correlation_id = uuid4()
648
+
649
+ logger.info(
650
+ "Starting consume loop",
651
+ extra={
652
+ "consumer_id": self._consumer_id,
653
+ "correlation_id": str(correlation_id),
654
+ },
655
+ )
656
+
657
+ await self._consume_loop(correlation_id)
658
+
659
+ async def __aenter__(self) -> AgentActionsConsumer:
660
+ """Async context manager entry.
661
+
662
+ Starts the consumer and returns self for use in async with blocks.
663
+
664
+ Returns:
665
+ Self for chaining.
666
+
667
+ Example:
668
+ >>> async with AgentActionsConsumer(config) as consumer:
669
+ ... await consumer.run()
670
+ """
671
+ await self.start()
672
+ return self
673
+
674
+ async def __aexit__(
675
+ self,
676
+ exc_type: type[BaseException] | None,
677
+ exc_val: BaseException | None,
678
+ exc_tb: object,
679
+ ) -> None:
680
+ """Async context manager exit.
681
+
682
+ Stops the consumer on exit from async with block.
683
+ """
684
+ await self.stop()
685
+
686
+ # =========================================================================
687
+ # Consume Loop
688
+ # =========================================================================
689
+
690
+ async def _consume_loop(self, correlation_id: UUID) -> None:
691
+ """Main consumption loop with batch processing.
692
+
693
+ Polls Kafka for messages, accumulates batches, processes them,
694
+ and commits offsets for successfully written partitions only.
695
+
696
+ Args:
697
+ correlation_id: Correlation ID for tracing this consume session.
698
+ """
699
+ if self._consumer is None:
700
+ logger.error(
701
+ "Consumer is None in consume loop",
702
+ extra={
703
+ "consumer_id": self._consumer_id,
704
+ "correlation_id": str(correlation_id),
705
+ },
706
+ )
707
+ return
708
+
709
+ batch_timeout_seconds = self._config.batch_timeout_ms / 1000.0
710
+
711
+ try:
712
+ while self._running:
713
+ # Poll with timeout for batch accumulation
714
+ try:
715
+ records = await asyncio.wait_for(
716
+ self._consumer.getmany(
717
+ timeout_ms=self._config.batch_timeout_ms,
718
+ max_records=self._config.batch_size,
719
+ ),
720
+ timeout=batch_timeout_seconds
721
+ + self._config.poll_timeout_buffer_seconds,
722
+ )
723
+ except TimeoutError:
724
+ # Poll timeout is normal, continue loop
725
+ continue
726
+
727
+ # Record poll time even if no messages - prevents false DEGRADED
728
+ # health status on low-traffic topics (CodeRabbit PR #220 feedback)
729
+ await self.metrics.record_polled()
730
+
731
+ if not records:
732
+ continue
733
+
734
+ # Flatten all messages from all partitions
735
+ messages: list[ConsumerRecord] = []
736
+ for tp_messages in records.values():
737
+ messages.extend(tp_messages)
738
+
739
+ if not messages:
740
+ continue
741
+
742
+ await self.metrics.record_received(len(messages))
743
+
744
+ # Process batch and get successful offsets per partition
745
+ batch_correlation_id = uuid4()
746
+ successful_offsets = await self._process_batch(
747
+ messages, batch_correlation_id
748
+ )
749
+
750
+ # Commit only successful offsets
751
+ if successful_offsets:
752
+ await self._commit_offsets(successful_offsets, batch_correlation_id)
753
+ await self.metrics.record_batch_processed()
754
+
755
+ except asyncio.CancelledError:
756
+ logger.info(
757
+ "Consume loop cancelled",
758
+ extra={
759
+ "consumer_id": self._consumer_id,
760
+ "correlation_id": str(correlation_id),
761
+ },
762
+ )
763
+ raise
764
+
765
+ except KafkaError as e:
766
+ logger.exception(
767
+ "Kafka error in consume loop",
768
+ extra={
769
+ "consumer_id": self._consumer_id,
770
+ "correlation_id": str(correlation_id),
771
+ "error": str(e),
772
+ },
773
+ )
774
+ raise
775
+
776
+ except Exception as e:
777
+ logger.exception(
778
+ "Unexpected error in consume loop",
779
+ extra={
780
+ "consumer_id": self._consumer_id,
781
+ "correlation_id": str(correlation_id),
782
+ "error": str(e),
783
+ },
784
+ )
785
+ raise
786
+
787
+ finally:
788
+ logger.info(
789
+ "Consume loop exiting",
790
+ extra={
791
+ "consumer_id": self._consumer_id,
792
+ "correlation_id": str(correlation_id),
793
+ },
794
+ )
795
+
796
+ # =========================================================================
797
+ # Batch Processing
798
+ # =========================================================================
799
+
800
+ async def _process_batch(
801
+ self,
802
+ messages: list[ConsumerRecord],
803
+ correlation_id: UUID,
804
+ ) -> dict[TopicPartition, int]:
805
+ """Process batch and return highest successful offset per partition.
806
+
807
+ Groups messages by topic, validates them, writes each topic's batch
808
+ to PostgreSQL, and tracks successful offsets per partition.
809
+
810
+ Args:
811
+ messages: List of Kafka ConsumerRecords to process.
812
+ correlation_id: Correlation ID for tracing.
813
+
814
+ Returns:
815
+ Dictionary mapping TopicPartition to highest successful offset.
816
+ Only partitions with successful writes are included.
817
+ """
818
+ if self._writer is None:
819
+ logger.error(
820
+ "Writer is None during batch processing",
821
+ extra={
822
+ "consumer_id": self._consumer_id,
823
+ "correlation_id": str(correlation_id),
824
+ },
825
+ )
826
+ return {}
827
+
828
+ successful_offsets: dict[TopicPartition, int] = {}
829
+ # Track skipped message offsets separately to preserve them on write failures
830
+ skipped_offsets: dict[TopicPartition, int] = {}
831
+ parsed_skipped: int = 0
832
+
833
+ # Group messages by topic with their ConsumerRecord for offset tracking
834
+ by_topic: dict[str, list[tuple[ConsumerRecord, BaseModel]]] = {}
835
+
836
+ for msg in messages:
837
+ # Guard against tombstones (compacted topic deletions)
838
+ if msg.value is None:
839
+ logger.warning(
840
+ "Skipping tombstone message",
841
+ extra={
842
+ "consumer_id": self._consumer_id,
843
+ "correlation_id": str(correlation_id),
844
+ "topic": msg.topic,
845
+ "partition": msg.partition,
846
+ "offset": msg.offset,
847
+ },
848
+ )
849
+ parsed_skipped += 1
850
+ tp = TopicPartition(msg.topic, msg.partition)
851
+ current = skipped_offsets.get(tp, -1)
852
+ skipped_offsets[tp] = max(current, msg.offset)
853
+ continue
854
+
855
+ try:
856
+ # Decode message value with UTF-8 guard
857
+ value = msg.value
858
+ if isinstance(value, bytes):
859
+ try:
860
+ value = value.decode("utf-8")
861
+ except UnicodeDecodeError as e:
862
+ logger.warning(
863
+ "Skipping message with invalid UTF-8 encoding",
864
+ extra={
865
+ "consumer_id": self._consumer_id,
866
+ "correlation_id": str(correlation_id),
867
+ "topic": msg.topic,
868
+ "partition": msg.partition,
869
+ "offset": msg.offset,
870
+ "error": str(e),
871
+ },
872
+ )
873
+ parsed_skipped += 1
874
+ tp = TopicPartition(msg.topic, msg.partition)
875
+ current = skipped_offsets.get(tp, -1)
876
+ skipped_offsets[tp] = max(current, msg.offset)
877
+ continue
878
+
879
+ payload = json.loads(value)
880
+
881
+ # Get model class for topic
882
+ model_cls = TOPIC_TO_MODEL.get(msg.topic)
883
+ if model_cls is None:
884
+ logger.warning(
885
+ "Unknown topic, skipping message",
886
+ extra={
887
+ "consumer_id": self._consumer_id,
888
+ "correlation_id": str(correlation_id),
889
+ "topic": msg.topic,
890
+ },
891
+ )
892
+ parsed_skipped += 1
893
+ # Track offset separately to preserve on write failures
894
+ tp = TopicPartition(msg.topic, msg.partition)
895
+ current = skipped_offsets.get(tp, -1)
896
+ skipped_offsets[tp] = max(current, msg.offset)
897
+ continue
898
+
899
+ # Validate with Pydantic model
900
+ model = model_cls.model_validate(payload)
901
+ by_topic.setdefault(msg.topic, []).append((msg, model))
902
+
903
+ except json.JSONDecodeError as e:
904
+ logger.warning(
905
+ "Failed to decode JSON message",
906
+ extra={
907
+ "consumer_id": self._consumer_id,
908
+ "correlation_id": str(correlation_id),
909
+ "topic": msg.topic,
910
+ "partition": msg.partition,
911
+ "offset": msg.offset,
912
+ "error": str(e),
913
+ },
914
+ )
915
+ parsed_skipped += 1
916
+ # Skip malformed messages but track offset separately to preserve on write failures
917
+ tp = TopicPartition(msg.topic, msg.partition)
918
+ current = skipped_offsets.get(tp, -1)
919
+ skipped_offsets[tp] = max(current, msg.offset)
920
+
921
+ except ValidationError as e:
922
+ logger.warning(
923
+ "Message validation failed",
924
+ extra={
925
+ "consumer_id": self._consumer_id,
926
+ "correlation_id": str(correlation_id),
927
+ "topic": msg.topic,
928
+ "partition": msg.partition,
929
+ "offset": msg.offset,
930
+ "error": str(e),
931
+ },
932
+ )
933
+ parsed_skipped += 1
934
+ # Skip invalid messages but track offset separately to preserve on write failures
935
+ tp = TopicPartition(msg.topic, msg.partition)
936
+ current = skipped_offsets.get(tp, -1)
937
+ skipped_offsets[tp] = max(current, msg.offset)
938
+
939
+ if parsed_skipped > 0:
940
+ await self.metrics.record_skipped(parsed_skipped)
941
+
942
+ # Write each topic's batch to PostgreSQL
943
+ for topic, items in by_topic.items():
944
+ writer_method_name = TOPIC_TO_WRITER_METHOD.get(topic)
945
+ if writer_method_name is None:
946
+ logger.warning(
947
+ "No writer method for topic",
948
+ extra={
949
+ "consumer_id": self._consumer_id,
950
+ "correlation_id": str(correlation_id),
951
+ "topic": topic,
952
+ },
953
+ )
954
+ continue
955
+
956
+ writer_method: Callable[
957
+ [list[BaseModel], UUID | None], Coroutine[object, object, int]
958
+ ] = getattr(self._writer, writer_method_name)
959
+ models = [item[1] for item in items]
960
+
961
+ try:
962
+ written_count = await writer_method(models, correlation_id)
963
+
964
+ # Record successful offsets per partition for this topic
965
+ for msg, _ in items:
966
+ tp = TopicPartition(msg.topic, msg.partition)
967
+ current = successful_offsets.get(tp, -1)
968
+ successful_offsets[tp] = max(current, msg.offset)
969
+
970
+ await self.metrics.record_processed(written_count)
971
+
972
+ logger.debug(
973
+ "Wrote batch for topic",
974
+ extra={
975
+ "consumer_id": self._consumer_id,
976
+ "correlation_id": str(correlation_id),
977
+ "topic": topic,
978
+ "count": written_count,
979
+ },
980
+ )
981
+
982
+ except Exception:
983
+ # Write failed for this topic - don't update offsets for its partitions
984
+ logger.exception(
985
+ "Failed to write batch for topic",
986
+ extra={
987
+ "consumer_id": self._consumer_id,
988
+ "correlation_id": str(correlation_id),
989
+ "topic": topic,
990
+ "count": len(models),
991
+ },
992
+ )
993
+ await self.metrics.record_failed(len(models))
994
+ # Remove any offsets we may have tracked for failed partitions
995
+ for msg, _ in items:
996
+ tp = TopicPartition(msg.topic, msg.partition)
997
+ # Only remove if this batch was the only contributor
998
+ # In practice, we don't add until success, so this is safe
999
+ successful_offsets.pop(tp, None)
1000
+
1001
+ # Merge skipped message offsets into successful_offsets
1002
+ # Skipped messages (tombstones, invalid UTF-8, JSON errors, validation errors)
1003
+ # must always have their offsets committed to avoid reprocessing
1004
+ for tp, offset in skipped_offsets.items():
1005
+ current = successful_offsets.get(tp, -1)
1006
+ successful_offsets[tp] = max(current, offset)
1007
+
1008
+ return successful_offsets
1009
+
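# Worked example of the bookkeeping above, with illustrative offsets: one poll
# delivers offsets 10, 11 and 12 on partition 0 of "agent-actions"; offset 11
# fails JSON decoding (skipped) while 10 and 12 validate and persist.
_tp = TopicPartition("agent-actions", 0)
_successful = {_tp: 12}   # highest offset whose write succeeded
_skipped = {_tp: 11}      # malformed message; must not be re-consumed
_successful[_tp] = max(_successful.get(_tp, -1), _skipped[_tp])  # -> {_tp: 12}
# _commit_offsets() below will then commit 13, the next offset to consume.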
1010
+ async def _commit_offsets(
1011
+ self,
1012
+ offsets: dict[TopicPartition, int],
1013
+ correlation_id: UUID,
1014
+ ) -> None:
1015
+ """Commit only successfully persisted offsets per partition.
1016
+
1017
+ Commits offset + 1 for each partition (next offset to consume).
1018
+
1019
+ Args:
1020
+ offsets: Dictionary mapping TopicPartition to highest persisted offset.
1021
+ correlation_id: Correlation ID for tracing.
1022
+ """
1023
+ if not offsets or self._consumer is None:
1024
+ return
1025
+
1026
+ # Build commit offsets (offset + 1 = next offset to consume)
1027
+ commit_offsets: dict[TopicPartition, int] = {
1028
+ tp: offset + 1 for tp, offset in offsets.items()
1029
+ }
1030
+
1031
+ try:
1032
+ await self._consumer.commit(commit_offsets)
1033
+
1034
+ logger.debug(
1035
+ "Committed offsets",
1036
+ extra={
1037
+ "consumer_id": self._consumer_id,
1038
+ "correlation_id": str(correlation_id),
1039
+ "partitions": len(commit_offsets),
1040
+ },
1041
+ )
1042
+
1043
+ except KafkaError:
1044
+ logger.exception(
1045
+ "Failed to commit offsets",
1046
+ extra={
1047
+ "consumer_id": self._consumer_id,
1048
+ "correlation_id": str(correlation_id),
1049
+ },
1050
+ )
1051
+ # Don't re-raise - messages will be reprocessed on restart
1052
+
1053
+ # =========================================================================
1054
+ # Health Check Server
1055
+ # =========================================================================
1056
+
1057
+ async def _start_health_server(self) -> None:
1058
+ """Start minimal HTTP health check server.
1059
+
1060
+ Starts an aiohttp server on the configured port with a /health endpoint.
1061
+ """
1062
+ self._health_app = web.Application()
1063
+ self._health_app.router.add_get("/health", self._health_handler)
1064
+
1065
+ self._health_runner = web.AppRunner(self._health_app)
1066
+ await self._health_runner.setup()
1067
+
1068
+ self._health_site = web.TCPSite(
1069
+ self._health_runner,
1070
+ host=self._config.health_check_host, # Configurable - see config.py for security notes
1071
+ port=self._config.health_check_port,
1072
+ )
1073
+ await self._health_site.start()
1074
+
1075
+ logger.info(
1076
+ "Health check server started",
1077
+ extra={
1078
+ "consumer_id": self._consumer_id,
1079
+ "host": self._config.health_check_host,
1080
+ "port": self._config.health_check_port,
1081
+ },
1082
+ )
1083
+
1084
+ def _determine_health_status(
1085
+ self,
1086
+ metrics_snapshot: dict[str, object],
1087
+ circuit_state: dict[str, JsonType],
1088
+ ) -> EnumHealthStatus:
1089
+ """Determine consumer health status based on current state.
1090
+
1091
+ Health status determination rules (in priority order):
1092
+ 1. UNHEALTHY: Consumer is not running (stopped or crashed)
1093
+ 2. DEGRADED: Circuit breaker is open or half-open (database issues, retrying)
1094
+ 3. DEGRADED: Last poll exceeds poll staleness threshold (consumer not polling)
1095
+ 4. DEGRADED: No writes yet AND consumer running > 60s (startup grace period exceeded)
1096
+ 5. DEGRADED: Last successful write exceeds staleness threshold (with messages received)
1097
+ 6. HEALTHY: All other cases (running, circuit closed, recent activity or in grace period)
1098
+
1099
+ The 60-second startup grace period allows the consumer to be considered
1100
+ healthy immediately after starting, before any messages have been consumed.
1101
+
1102
+ Args:
1103
+ metrics_snapshot: Snapshot of current consumer metrics including
1104
+ timestamps for started_at, last_poll_at, and last_successful_write_at.
1105
+ circuit_state: Current circuit breaker state from the writer,
1106
+ containing at minimum a "state" key.
1107
+
1108
+ Returns:
1109
+ EnumHealthStatus indicating current health:
1110
+ - HEALTHY: Fully operational
1111
+ - DEGRADED: Running but with issues (circuit open/half-open, stale polls/writes)
1112
+ - UNHEALTHY: Not running
1113
+ """
1114
+ # Rule 1: Consumer not running -> UNHEALTHY
1115
+ if not self._running:
1116
+ return EnumHealthStatus.UNHEALTHY
1117
+
1118
+ # Rule 2: Circuit breaker open or half-open -> DEGRADED
1119
+ circuit_breaker_state = circuit_state.get("state")
1120
+ if circuit_breaker_state in ("open", "half_open"):
1121
+ return EnumHealthStatus.DEGRADED
1122
+
1123
+ # Rule 3: Check poll staleness (consumer not polling Kafka)
1124
+ last_poll = metrics_snapshot.get("last_poll_at")
1125
+ if last_poll is not None:
1126
+ try:
1127
+ last_poll_dt = datetime.fromisoformat(str(last_poll))
1128
+ poll_age_seconds = (datetime.now(UTC) - last_poll_dt).total_seconds()
1129
+ if poll_age_seconds > self._config.health_check_poll_staleness_seconds:
1130
+ # Poll exceeds staleness threshold -> DEGRADED
1131
+ return EnumHealthStatus.DEGRADED
1132
+ except (ValueError, TypeError):
1133
+ # Parse error - continue to other checks
1134
+ pass
1135
+
1136
+ # Check for recent successful write (within staleness threshold)
1137
+ last_write = metrics_snapshot.get("last_successful_write_at")
1138
+ messages_received = metrics_snapshot.get("messages_received", 0)
1139
+
1140
+ if last_write is None:
1141
+ # No writes yet - check startup grace period (60 seconds)
1142
+ started_at_str = metrics_snapshot.get("started_at")
1143
+ if started_at_str is not None:
1144
+ try:
1145
+ started_at_dt = datetime.fromisoformat(str(started_at_str))
1146
+ age_seconds = (datetime.now(UTC) - started_at_dt).total_seconds()
1147
+ if age_seconds <= 60.0:
1148
+ # Rule 6: Consumer just started, healthy even without writes
1149
+ return EnumHealthStatus.HEALTHY
1150
+ else:
1151
+ # Rule 4: Consumer running > 60s with no writes -> DEGRADED
1152
+ return EnumHealthStatus.DEGRADED
1153
+ except (ValueError, TypeError):
1154
+ # Parse error - fallback to healthy
1155
+ return EnumHealthStatus.HEALTHY
1156
+ else:
1157
+ # No started_at timestamp (shouldn't happen) - assume healthy
1158
+ return EnumHealthStatus.HEALTHY
1159
+ else:
1160
+ # Check if last write was recent (within staleness threshold)
1161
+ # Only consider stale if we have received messages (active traffic)
1162
+ try:
1163
+ last_write_dt = datetime.fromisoformat(str(last_write))
1164
+ write_age_seconds = (datetime.now(UTC) - last_write_dt).total_seconds()
1165
+ if (
1166
+ write_age_seconds > self._config.health_check_staleness_seconds
1167
+ and isinstance(messages_received, int)
1168
+ and messages_received > 0
1169
+ ):
1170
+ # Rule 5: Last write exceeds staleness threshold with traffic -> DEGRADED
1171
+ return EnumHealthStatus.DEGRADED
1172
+ else:
1173
+ # Rule 6: Recent write or no traffic -> HEALTHY
1174
+ return EnumHealthStatus.HEALTHY
1175
+ except (ValueError, TypeError):
1176
+ # Parse error - fallback to healthy
1177
+ return EnumHealthStatus.HEALTHY
1178
+
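# Test-style sketch of the rules above, with placeholder settings; forcing
# _running directly is purely for demonstration and assumes the remaining
# ConfigAgentActionsConsumer fields have usable defaults.
def _demo_health_rules() -> None:
    config = ConfigAgentActionsConsumer(
        kafka_bootstrap_servers="localhost:9092",
        postgres_dsn="postgresql://postgres:secret@localhost:5432/omninode_bridge",
    )
    consumer = AgentActionsConsumer(config)
    snapshot: dict[str, object] = {
        "started_at": datetime.now(UTC).isoformat(),
        "last_poll_at": datetime.now(UTC).isoformat(),
        "last_successful_write_at": None,
        "messages_received": 0,
    }

    consumer._running = False  # Rule 1
    assert consumer._determine_health_status(snapshot, {}) is EnumHealthStatus.UNHEALTHY

    consumer._running = True  # Rule 2: circuit open
    assert (
        consumer._determine_health_status(snapshot, {"state": "open"})
        is EnumHealthStatus.DEGRADED
    )

    # Circuit closed, recent poll, still inside the 60s startup grace period
    assert (
        consumer._determine_health_status(snapshot, {"state": "closed"})
        is EnumHealthStatus.HEALTHY
    )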
1179
+ async def _health_handler(self, request: web.Request) -> web.Response:
1180
+ """Handle health check requests.
1181
+
1182
+ Returns JSON with health status based on:
1183
+ - Consumer running state
1184
+ - Circuit breaker state (from writer)
1185
+ - Last successful write timestamp
1186
+
1187
+ Args:
1188
+ request: aiohttp request object.
1189
+
1190
+ Returns:
1191
+ JSON response with health status.
1192
+ """
1193
+ metrics_snapshot = await self.metrics.snapshot()
1194
+ circuit_state = self._writer.get_circuit_breaker_state() if self._writer else {}
1195
+
1196
+ # Determine health status using shared logic
1197
+ status = self._determine_health_status(metrics_snapshot, circuit_state)
1198
+
1199
+ response_body = {
1200
+ "status": status.value,
1201
+ "consumer_running": self._running,
1202
+ "consumer_id": self._consumer_id,
1203
+ "last_poll_time": metrics_snapshot.get("last_poll_at"),
1204
+ "last_successful_write": metrics_snapshot.get("last_successful_write_at"),
1205
+ "circuit_breaker_state": circuit_state.get("state", "unknown"),
1206
+ "messages_processed": metrics_snapshot.get("messages_processed", 0),
1207
+ "messages_failed": metrics_snapshot.get("messages_failed", 0),
1208
+ "batches_processed": metrics_snapshot.get("batches_processed", 0),
1209
+ }
1210
+
1211
+ # Return appropriate HTTP status code
1212
+ http_status = 200 if status == EnumHealthStatus.HEALTHY else 503
1213
+
1214
+ return web.json_response(response_body, status=http_status)
1215
+
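# Probe sketch for the endpoint above using only the standard library. Host and
# port come from ConfigAgentActionsConsumer (health_check_host /
# health_check_port); the 8080 default here is just a placeholder.
import json
import urllib.error
import urllib.request


def probe_health(host: str = "localhost", port: int = 8080) -> tuple[int, dict]:
    """Return (http_status, body); 200 when HEALTHY, 503 when degraded/unhealthy."""
    url = f"http://{host}:{port}/health"
    try:
        with urllib.request.urlopen(url) as resp:
            return resp.status, json.load(resp)
    except urllib.error.HTTPError as err:  # 503 responses still carry a JSON body
        return err.code, json.load(err)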
1216
+ # =========================================================================
1217
+ # Health Check (Direct API)
1218
+ # =========================================================================
1219
+
1220
+ async def health_check(self) -> dict[str, object]:
1221
+ """Check consumer health status.
1222
+
1223
+ Returns a dictionary with health information for programmatic access.
1224
+
1225
+ Returns:
1226
+ Dictionary with health status including:
1227
+ - status: Overall health (healthy, degraded, unhealthy)
1228
+ - consumer_running: Whether consume loop is active
1229
+ - circuit_breaker_state: Current circuit breaker state
1230
+ - consumer_id: Unique consumer identifier
1231
+ - metrics: Current metrics snapshot
1232
+ """
1233
+ metrics_snapshot = await self.metrics.snapshot()
1234
+ circuit_state = self._writer.get_circuit_breaker_state() if self._writer else {}
1235
+
1236
+ # Determine health status using shared logic
1237
+ status = self._determine_health_status(metrics_snapshot, circuit_state)
1238
+
1239
+ return {
1240
+ "status": status.value,
1241
+ "consumer_running": self._running,
1242
+ "consumer_id": self._consumer_id,
1243
+ "group_id": self._config.kafka_group_id,
1244
+ "topics": self._config.topics,
1245
+ "circuit_breaker_state": circuit_state,
1246
+ "metrics": metrics_snapshot,
1247
+ }
1248
+
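# Programmatic usage sketch for health_check(), e.g. from an in-process
# watchdog task; `consumer` is an already-started AgentActionsConsumer.
async def _log_health(consumer: AgentActionsConsumer) -> None:
    report = await consumer.health_check()
    if report["status"] != EnumHealthStatus.HEALTHY.value:
        logger.warning("Consumer not healthy", extra={"health": report})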
1249
+
1250
+ # =============================================================================
1251
+ # Entry Point
1252
+ # =============================================================================
1253
+
1254
+
1255
+ async def _main() -> None:
1256
+ """Main entry point for running the consumer as a module."""
1257
+ # Load configuration from environment
1258
+ config = ConfigAgentActionsConsumer()
1259
+
1260
+ logger.info(
1261
+ "Starting agent actions consumer",
1262
+ extra={
1263
+ "topics": config.topics,
1264
+ "bootstrap_servers": config.kafka_bootstrap_servers,
1265
+ "postgres_dsn": mask_dsn_password(config.postgres_dsn),
1266
+ "group_id": config.kafka_group_id,
1267
+ "health_port": config.health_check_port,
1268
+ },
1269
+ )
1270
+
1271
+ consumer = AgentActionsConsumer(config)
1272
+
1273
+ # Set up signal handlers
1274
+ loop = asyncio.get_running_loop()
1275
+ shutdown_task: asyncio.Task[None] | None = None
1276
+
1277
+ def signal_handler() -> None:
1278
+ nonlocal shutdown_task
1279
+ logger.info("Received shutdown signal")
1280
+ # Only create shutdown task once to avoid race conditions
1281
+ if shutdown_task is None:
1282
+ shutdown_task = asyncio.create_task(consumer.stop())
1283
+
1284
+ for sig in (signal.SIGTERM, signal.SIGINT):
1285
+ loop.add_signal_handler(sig, signal_handler)
1286
+
1287
+ try:
1288
+ await consumer.start()
1289
+ await consumer.run()
1290
+ except asyncio.CancelledError:
1291
+ logger.info("Consumer cancelled")
1292
+ finally:
1293
+ # Ensure shutdown task completes if it was started by signal handler
1294
+ if shutdown_task is not None:
1295
+ if not shutdown_task.done():
1296
+ await shutdown_task
1297
+ # Task already completed, no action needed
1298
+ else:
1299
+ # No signal received, perform clean shutdown
1300
+ await consumer.stop()
1301
+
1302
+
1303
+ if __name__ == "__main__":
1304
+ # Configure logging
1305
+ logging.basicConfig(
1306
+ level=logging.INFO,
1307
+ format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
1308
+ )
1309
+
1310
+ asyncio.run(_main())
1311
+
1312
+
1313
+ __all__ = [
1314
+ "AgentActionsConsumer",
1315
+ "ConsumerMetrics",
1316
+ "EnumHealthStatus",
1317
+ "TOPIC_TO_MODEL",
1318
+ "TOPIC_TO_WRITER_METHOD",
1319
+ "mask_dsn_password",
1320
+ ]