agentflow-runtime 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100) hide show
  1. agentflow_runtime-1.1.0.dist-info/METADATA +55 -0
  2. agentflow_runtime-1.1.0.dist-info/RECORD +100 -0
  3. agentflow_runtime-1.1.0.dist-info/WHEEL +4 -0
  4. agentflow_runtime-1.1.0.dist-info/licenses/LICENSE +21 -0
  5. src/__init__.py +0 -0
  6. src/constants.py +3 -0
  7. src/ingestion/__init__.py +0 -0
  8. src/ingestion/cdc/__init__.py +5 -0
  9. src/ingestion/cdc/normalizer.py +186 -0
  10. src/ingestion/connectors/__init__.py +0 -0
  11. src/ingestion/connectors/mysql_cdc.py +63 -0
  12. src/ingestion/connectors/postgres_cdc.py +68 -0
  13. src/ingestion/producers/__init__.py +0 -0
  14. src/ingestion/producers/event_producer.py +237 -0
  15. src/ingestion/schemas/__init__.py +0 -0
  16. src/ingestion/schemas/events.py +147 -0
  17. src/ingestion/tenant_router.py +80 -0
  18. src/logger.py +41 -0
  19. src/orchestration/__init__.py +0 -0
  20. src/orchestration/dags/__init__.py +0 -0
  21. src/orchestration/dags/daily_batch.py +201 -0
  22. src/processing/__init__.py +0 -0
  23. src/processing/event_replayer.py +250 -0
  24. src/processing/flink_jobs/Dockerfile +55 -0
  25. src/processing/flink_jobs/__init__.py +0 -0
  26. src/processing/flink_jobs/checkpointing.py +32 -0
  27. src/processing/flink_jobs/session_aggregation.py +212 -0
  28. src/processing/flink_jobs/session_aggregator.py +199 -0
  29. src/processing/flink_jobs/stream_processor.py +316 -0
  30. src/processing/iceberg_sink.py +348 -0
  31. src/processing/local_pipeline.py +452 -0
  32. src/processing/outbox.py +273 -0
  33. src/processing/tracing.py +36 -0
  34. src/processing/transformations/__init__.py +0 -0
  35. src/processing/transformations/enrichment.py +125 -0
  36. src/quality/__init__.py +0 -0
  37. src/quality/monitors/__init__.py +0 -0
  38. src/quality/monitors/freshness_monitor.py +166 -0
  39. src/quality/monitors/metrics_collector.py +367 -0
  40. src/quality/validators/__init__.py +0 -0
  41. src/quality/validators/schema_validator.py +119 -0
  42. src/quality/validators/semantic_validator.py +202 -0
  43. src/serving/__init__.py +0 -0
  44. src/serving/api/__init__.py +0 -0
  45. src/serving/api/alert_dispatcher.py +51 -0
  46. src/serving/api/alerts/__init__.py +38 -0
  47. src/serving/api/alerts/dispatcher.py +299 -0
  48. src/serving/api/alerts/escalation.py +290 -0
  49. src/serving/api/alerts/evaluator.py +81 -0
  50. src/serving/api/alerts/history.py +115 -0
  51. src/serving/api/analytics.py +543 -0
  52. src/serving/api/auth/__init__.py +46 -0
  53. src/serving/api/auth/key_rotation.py +400 -0
  54. src/serving/api/auth/manager.py +406 -0
  55. src/serving/api/auth/middleware.py +331 -0
  56. src/serving/api/main.py +390 -0
  57. src/serving/api/middleware/logging.py +41 -0
  58. src/serving/api/middleware/tracing.py +51 -0
  59. src/serving/api/rate_limiter.py +76 -0
  60. src/serving/api/routers/__init__.py +0 -0
  61. src/serving/api/routers/admin.py +150 -0
  62. src/serving/api/routers/admin_ui.py +93 -0
  63. src/serving/api/routers/agent_query.py +639 -0
  64. src/serving/api/routers/alerts.py +134 -0
  65. src/serving/api/routers/batch.py +231 -0
  66. src/serving/api/routers/contracts.py +98 -0
  67. src/serving/api/routers/deadletter.py +337 -0
  68. src/serving/api/routers/lineage.py +218 -0
  69. src/serving/api/routers/search.py +103 -0
  70. src/serving/api/routers/slo.py +231 -0
  71. src/serving/api/routers/stream.py +141 -0
  72. src/serving/api/routers/webhooks.py +93 -0
  73. src/serving/api/security.py +83 -0
  74. src/serving/api/telemetry.py +66 -0
  75. src/serving/api/templates/admin.html +214 -0
  76. src/serving/api/versioning.py +328 -0
  77. src/serving/api/webhook_dispatcher.py +423 -0
  78. src/serving/backends/__init__.py +117 -0
  79. src/serving/backends/clickhouse_backend.py +310 -0
  80. src/serving/backends/duckdb_backend.py +268 -0
  81. src/serving/cache.py +169 -0
  82. src/serving/db_pool.py +105 -0
  83. src/serving/masking.py +122 -0
  84. src/serving/semantic_layer/__init__.py +0 -0
  85. src/serving/semantic_layer/catalog.py +177 -0
  86. src/serving/semantic_layer/contract_registry.py +258 -0
  87. src/serving/semantic_layer/entity_type_registry.py +107 -0
  88. src/serving/semantic_layer/nl_engine.py +189 -0
  89. src/serving/semantic_layer/query/__init__.py +3 -0
  90. src/serving/semantic_layer/query/contracts.py +47 -0
  91. src/serving/semantic_layer/query/engine.py +81 -0
  92. src/serving/semantic_layer/query/entity_queries.py +221 -0
  93. src/serving/semantic_layer/query/metric_queries.py +84 -0
  94. src/serving/semantic_layer/query/nl_queries.py +305 -0
  95. src/serving/semantic_layer/query/sql_builder.py +113 -0
  96. src/serving/semantic_layer/query/sql_guard.py +3 -0
  97. src/serving/semantic_layer/query_engine.py +5 -0
  98. src/serving/semantic_layer/schema_evolution.py +175 -0
  99. src/serving/semantic_layer/search_index.py +337 -0
  100. src/serving/semantic_layer/sql_guard.py +56 -0
@@ -0,0 +1,273 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import json
5
+ import os
6
+ from collections.abc import Callable
7
+ from contextlib import nullcontext
8
+ from datetime import UTC, datetime, timedelta
9
+
10
+ import duckdb
11
+ import structlog
12
+ from confluent_kafka import KafkaException
13
+ from opentelemetry import trace
14
+
15
+ from src.processing.tracing import inject_trace_to_kafka_headers, telemetry_disabled
16
+
17
# Module-wide structlog logger used for outbox delivery and retry events.
logger = structlog.get_logger()
# Tracer used to open the "kafka.produce" span around each publish.
tracer = trace.get_tracer("agentflow.outbox")

# Fallback bootstrap servers; KAFKA_BOOTSTRAP_SERVERS is read once at import time.
DEFAULT_KAFKA_BOOTSTRAP = os.getenv("KAFKA_BOOTSTRAP_SERVERS", "localhost:9092")
21
+
22
+
23
def ensure_outbox_table(conn) -> None:
    """Create the ``outbox`` table on *conn* if it is not there yet.

    The DDL uses ``CREATE TABLE IF NOT EXISTS``, so repeated calls against the
    same database leave an existing table untouched.
    """
    create_outbox_sql = """
        CREATE TABLE IF NOT EXISTS outbox (
            id TEXT PRIMARY KEY,
            event_id TEXT NOT NULL,
            payload JSON NOT NULL,
            topic TEXT NOT NULL,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            sent_at TIMESTAMP,
            status TEXT DEFAULT 'pending',
            retry_count INTEGER DEFAULT 0,
            next_attempt_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            last_error TEXT
        )
        """
    conn.execute(create_outbox_sql)
40
+
41
+
42
+ class OutboxProcessor:
43
+ def __init__(
44
+ self,
45
+ duckdb_path: str | None = None,
46
+ conn=None,
47
+ producer: Callable[[str, dict], None] | None = None,
48
+ bootstrap_servers: str | None = None,
49
+ max_retries: int = 5,
50
+ ) -> None:
51
+ if conn is None and duckdb_path is None:
52
+ raise ValueError("duckdb_path or conn is required")
53
+ self._owns_conn = conn is None
54
+ self._conn = conn if conn is not None else duckdb.connect(str(duckdb_path))
55
+ self._producer = producer or self._produce_to_kafka
56
+ self._bootstrap_servers = bootstrap_servers or DEFAULT_KAFKA_BOOTSTRAP
57
+ self._max_retries = max_retries
58
+ ensure_outbox_table(self._conn)
59
+
60
+ def close(self) -> None:
61
+ if self._owns_conn and self._conn is not None:
62
+ self._conn.close()
63
+ self._conn = None
64
+
65
+ async def run_forever(self) -> None:
66
+ try:
67
+ while True:
68
+ await asyncio.sleep(2)
69
+ try:
70
+ self.process_pending()
71
+ except duckdb.Error as exc:
72
+ logger.warning(
73
+ "outbox_processing_failed",
74
+ error=str(exc),
75
+ bootstrap_servers=self._bootstrap_servers,
76
+ owns_connection=self._owns_conn,
77
+ exc_info=True,
78
+ )
79
+ finally:
80
+ self.close()
81
+
82
+ def process_pending(self, limit: int = 100) -> int:
83
+ rows = self._conn.execute(
84
+ """
85
+ SELECT id, event_id, payload, topic, retry_count
86
+ FROM outbox
87
+ WHERE status = 'pending'
88
+ AND (next_attempt_at IS NULL OR next_attempt_at <= ?)
89
+ ORDER BY created_at
90
+ LIMIT ?
91
+ """,
92
+ [datetime.now(UTC), limit],
93
+ ).fetchall()
94
+ processed = 0
95
+ for row in rows:
96
+ if self._process_row(row):
97
+ processed += 1
98
+ return processed
99
+
100
+ def process_entry(self, outbox_id: str) -> bool:
101
+ row = self._conn.execute(
102
+ """
103
+ SELECT id, event_id, payload, topic, retry_count
104
+ FROM outbox
105
+ WHERE id = ?
106
+ AND status = 'pending'
107
+ """,
108
+ [outbox_id],
109
+ ).fetchone()
110
+ if row is None:
111
+ return False
112
+ return self._process_row(row)
113
+
114
+ def _process_row(self, row) -> bool:
115
+ outbox_id, event_id, payload, topic, retry_count = row
116
+ decoded_payload = self._decode_payload(payload)
117
+ try:
118
+ self._producer(topic, decoded_payload)
119
+ except (BufferError, ConnectionError, TimeoutError, KafkaException, RuntimeError) as exc:
120
+ error_message = str(exc)
121
+ if isinstance(exc, RuntimeError) and not (
122
+ error_message.startswith("KafkaError{")
123
+ or "Kafka message(s) were not delivered" in error_message
124
+ ):
125
+ raise
126
+ next_retry_count = int(retry_count or 0) + 1
127
+ logger.warning(
128
+ "outbox_delivery_retry_scheduled",
129
+ outbox_id=outbox_id,
130
+ event_id=event_id,
131
+ topic=topic,
132
+ retry_count=next_retry_count,
133
+ error=error_message,
134
+ exc_info=True,
135
+ )
136
+ self._schedule_retry(
137
+ outbox_id=outbox_id,
138
+ event_id=event_id,
139
+ retry_count=next_retry_count,
140
+ error_message=error_message,
141
+ )
142
+ return False
143
+ self._mark_sent(outbox_id=outbox_id, event_id=event_id)
144
+ return True
145
+
146
+ def _mark_sent(self, outbox_id: str, event_id: str) -> None:
147
+ sent_at = datetime.now(UTC)
148
+ self._conn.execute("BEGIN TRANSACTION")
149
+ try:
150
+ self._conn.execute(
151
+ """
152
+ UPDATE outbox
153
+ SET status = 'sent',
154
+ sent_at = ?,
155
+ last_error = NULL
156
+ WHERE id = ?
157
+ """,
158
+ [sent_at, outbox_id],
159
+ )
160
+ self._conn.execute(
161
+ "UPDATE dead_letter_events SET status = 'replayed' WHERE event_id = ?",
162
+ [event_id],
163
+ )
164
+ self._conn.execute("COMMIT")
165
+ except Exception: # nosec B110 - rollback must preserve the original replay failure
166
+ # Transaction rollback must happen before unexpected errors propagate.
167
+ self._conn.execute("ROLLBACK")
168
+ raise
169
+
170
+ def _schedule_retry(
171
+ self,
172
+ outbox_id: str,
173
+ event_id: str,
174
+ retry_count: int,
175
+ error_message: str,
176
+ ) -> None:
177
+ status = "pending"
178
+ retry_delay_seconds = 2**retry_count
179
+ is_kafka_error = (
180
+ error_message.startswith("KafkaError{")
181
+ or "Kafka message(s) were not delivered" in error_message
182
+ )
183
+ if is_kafka_error:
184
+ retry_delay_seconds = max(retry_delay_seconds, 30)
185
+ next_attempt_at: datetime | None = datetime.now(UTC) + timedelta(
186
+ seconds=retry_delay_seconds
187
+ )
188
+ self._conn.execute("BEGIN TRANSACTION")
189
+ try:
190
+ if retry_count >= self._max_retries:
191
+ status = "failed"
192
+ next_attempt_at = None
193
+ self._conn.execute(
194
+ """
195
+ UPDATE outbox
196
+ SET status = ?,
197
+ retry_count = ?,
198
+ next_attempt_at = ?,
199
+ last_error = ?
200
+ WHERE id = ?
201
+ """,
202
+ [status, retry_count, next_attempt_at, error_message, outbox_id],
203
+ )
204
+ if status == "failed":
205
+ self._conn.execute(
206
+ "UPDATE dead_letter_events SET status = 'failed' WHERE event_id = ?",
207
+ [event_id],
208
+ )
209
+ self._conn.execute("COMMIT")
210
+ except Exception: # nosec B110 - rollback must preserve the original retry scheduling failure
211
+ # Transaction rollback must happen before unexpected errors propagate.
212
+ self._conn.execute("ROLLBACK")
213
+ raise
214
+
215
+ def _decode_payload(self, payload) -> dict:
216
+ if isinstance(payload, dict):
217
+ return payload
218
+ if isinstance(payload, str):
219
+ decoded = json.loads(payload)
220
+ if isinstance(decoded, dict):
221
+ return decoded
222
+ raise ValueError("Outbox payload must be a JSON object.")
223
+
224
+ def _produce_to_kafka(self, topic: str, payload: dict) -> None:
225
+ from confluent_kafka import Producer
226
+
227
+ delivery_errors: list[str] = []
228
+
229
+ def on_delivery(err, msg) -> None:
230
+ del msg
231
+ if err is not None:
232
+ delivery_errors.append(str(err))
233
+
234
+ producer = Producer({"bootstrap.servers": self._bootstrap_servers})
235
+ produce_span = (
236
+ tracer.start_as_current_span("kafka.produce")
237
+ if not telemetry_disabled()
238
+ else nullcontext()
239
+ )
240
+ with produce_span as span:
241
+ if span is not None and span.is_recording():
242
+ span.set_attribute("topic", topic)
243
+ event_type = payload.get("event_type")
244
+ if event_type is not None:
245
+ span.set_attribute("event_type", str(event_type))
246
+ tenant_id = payload.get("tenant_id") or structlog.contextvars.get_contextvars().get(
247
+ "tenant_id"
248
+ )
249
+ if tenant_id is not None:
250
+ span.set_attribute("tenant_id", str(tenant_id))
251
+ headers = inject_trace_to_kafka_headers({})
252
+ try:
253
+ producer.produce(
254
+ topic,
255
+ key=str(payload.get("event_id", "")),
256
+ value=json.dumps(payload).encode("utf-8"),
257
+ headers=list(headers.items()) or None,
258
+ on_delivery=on_delivery,
259
+ )
260
+ except TypeError as exc:
261
+ if "on_delivery" not in str(exc):
262
+ raise
263
+ producer.produce(
264
+ topic,
265
+ key=str(payload.get("event_id", "")),
266
+ value=json.dumps(payload).encode("utf-8"),
267
+ headers=list(headers.items()) or None,
268
+ )
269
+ remaining = producer.flush(10)
270
+ if delivery_errors:
271
+ raise RuntimeError(delivery_errors[0])
272
+ if remaining != 0:
273
+ raise RuntimeError(f"{remaining} Kafka message(s) were not delivered")
@@ -0,0 +1,36 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ from collections.abc import Mapping, Sequence
5
+
6
+ from opentelemetry.context import Context
7
+ from opentelemetry.propagate import extract, inject
8
+
9
+
10
def telemetry_disabled() -> bool:
    """Return True when OTEL_SDK_DISABLED is set to "true" (case-insensitive)."""
    flag = os.getenv("OTEL_SDK_DISABLED", "")
    return flag.lower() == "true"
12
+
13
+
14
def inject_trace_to_kafka_headers(
    headers: Mapping[str, bytes] | None = None,
) -> dict[str, bytes]:
    """Return a copy of *headers* with the current trace context added.

    The propagation fields written by ``inject`` are UTF-8 encoded and
    overwrite same-named entries. When telemetry is disabled the copy is
    returned unchanged. The input mapping is never mutated.
    """
    merged: dict[str, bytes] = dict(headers or {})
    if not telemetry_disabled():
        propagation_fields: dict[str, str] = {}
        inject(propagation_fields)
        merged.update(
            (name, text.encode("utf-8")) for name, text in propagation_fields.items()
        )
    return merged
26
+
27
+
28
def extract_trace_from_kafka_headers(
    headers: Mapping[str, bytes] | Sequence[tuple[str, bytes]] | None,
) -> Context:
    """Build a trace context from Kafka headers.

    Accepts a mapping or a sequence of ``(key, value)`` pairs. Only values
    that are ``bytes`` are considered; they are decoded as UTF-8. With
    telemetry disabled or no headers, the context is extracted from an empty
    carrier.
    """
    if telemetry_disabled() or headers is None:
        return extract({})

    pairs = headers.items() if isinstance(headers, Mapping) else headers
    carrier: dict[str, str] = {}
    for name, raw in pairs:
        if isinstance(raw, bytes):
            carrier[str(name)] = raw.decode("utf-8")
    return extract(carrier)
File without changes
@@ -0,0 +1,125 @@
1
+ """Event enrichment functions for the processing layer.
2
+
3
+ Pure functions that add derived fields to events. Used by Flink jobs
4
+ and batch transformations alike — keeping logic DRY across streaming and batch.
5
+ """
6
+
7
+ from decimal import Decimal
8
+
9
# Exclusive upper bounds on an order's total for the small / medium / large
# buckets; totals at or above the large bound are bucketed as "whale".
ORDER_SIZE_SMALL_MAX_TOTAL = Decimal("50")
ORDER_SIZE_MEDIUM_MAX_TOTAL = Decimal("200")
ORDER_SIZE_LARGE_MAX_TOTAL = Decimal("1000")
# Viewport widths strictly below this value are flagged as mobile.
MOBILE_VIEWPORT_MAX_WIDTH = 768
# Payment amounts strictly above these thresholds add to the risk score.
PAYMENT_RISK_HIGH_AMOUNT_THRESHOLD = 500
PAYMENT_RISK_MEDIUM_AMOUNT_THRESHOLD = 200
# Score cut-offs: "high" is strictly above the first, "medium" at or above the second.
PAYMENT_RISK_HIGH_SCORE_THRESHOLD = 0.5
PAYMENT_RISK_MEDIUM_SCORE_THRESHOLD = 0.2
17
+
18
+
19
def enrich_order(event: dict) -> dict:
    """Add derived fields to an order event.

    Adds, under ``event["_derived"]``:
    - item_count: total number of items (sum of quantities)
    - unique_products: number of distinct product ids
    - avg_item_price: total divided by item_count, rounded to 2 decimals
    - order_size_bucket: small/medium/large/whale

    Missing or null ``items``, ``total_amount`` and ``quantity`` values are
    treated as empty / zero instead of raising.

    The event is updated in place and the same dict is returned.
    """
    # `or` also covers keys that are present but explicitly null.
    items = event.get("items") or []
    total = Decimal(str(event.get("total_amount") or 0))

    item_count = sum((item.get("quantity") or 0) for item in items)
    unique_products = len({item["product_id"] for item in items if "product_id" in item})
    # Guard against division by zero for orders without items.
    avg_price = total / item_count if item_count > 0 else Decimal("0")

    if total < ORDER_SIZE_SMALL_MAX_TOTAL:
        bucket = "small"
    elif total < ORDER_SIZE_MEDIUM_MAX_TOTAL:
        bucket = "medium"
    elif total < ORDER_SIZE_LARGE_MAX_TOTAL:
        bucket = "large"
    else:
        bucket = "whale"

    event["_derived"] = {
        "item_count": item_count,
        "unique_products": unique_products,
        "avg_item_price": float(avg_price.quantize(Decimal("0.01"))),
        "order_size_bucket": bucket,
    }
    return event
51
+
52
+
53
def enrich_clickstream(event: dict) -> dict:
    """Annotate a clickstream event with page and device classification.

    Writes ``event["_derived"]`` with:
    - is_mobile: True when a viewport width is present and below the mobile limit
    - page_category: product_detail / cart / checkout / search / home / other
    - is_product_page: True only for product detail pages

    The event is updated in place and the same dict is returned.
    """
    viewport = event.get("viewport_width")
    page_url = event.get("page_url", "")

    # Checked in order; the first fragment contained in the URL decides.
    fragment_categories = (
        ("/products/", "product_detail"),
        ("/cart", "cart"),
        ("/checkout", "checkout"),
        ("/search", "search"),
    )
    for fragment, category in fragment_categories:
        if fragment in page_url:
            page_category = category
            break
    else:
        page_category = "home" if page_url == "/" else "other"

    if viewport is None:
        is_mobile = False
    else:
        is_mobile = viewport < MOBILE_VIEWPORT_MAX_WIDTH

    event["_derived"] = {
        "is_mobile": is_mobile,
        "page_category": page_category,
        "is_product_page": page_category == "product_detail",
    }
    return event
89
+
90
+
91
def compute_payment_risk_score(event: dict) -> dict:
    """Attach a heuristic fraud-risk score to a payment event.

    Contributions, summed in this order:
    - amount above the high threshold: +0.3, above the medium one: +0.1
    - method "card": +0.1, method "wallet": +0.15, anything else: nothing
    - missing or empty user_id: +0.3

    ``risk_score`` is the sum capped at 1.0; ``risk_level`` is "high" above the
    high score threshold, "medium" at or above the medium one, else "low".
    The event is updated in place and the same dict is returned.
    """
    amount = float(event.get("amount", 0))
    method = event.get("method")

    if amount > PAYMENT_RISK_HIGH_AMOUNT_THRESHOLD:
        amount_risk = 0.3
    elif amount > PAYMENT_RISK_MEDIUM_AMOUNT_THRESHOLD:
        amount_risk = 0.1
    else:
        amount_risk = 0.0

    if method == "card":
        method_risk = 0.1
    elif method == "wallet":
        method_risk = 0.15
    else:
        method_risk = 0.0

    identity_risk = 0.0 if event.get("user_id") else 0.3

    score = amount_risk + method_risk + identity_risk

    if score > PAYMENT_RISK_HIGH_SCORE_THRESHOLD:
        level = "high"
    elif score >= PAYMENT_RISK_MEDIUM_SCORE_THRESHOLD:
        level = "medium"
    else:
        level = "low"

    event["_derived"] = {
        "risk_score": min(score, 1.0),
        "risk_level": level,
    }
    return event
File without changes
File without changes
@@ -0,0 +1,166 @@
1
+ """Monitors data freshness across all pipeline stages.
2
+
3
+ Checks that events flow through the pipeline within SLA bounds.
4
+ Exposes metrics to Prometheus and triggers alerts on SLA breaches.
5
+
6
+ SLA: end-to-end latency (ingestion → serving) < 30 seconds for p99.
7
+ """
8
+
9
import json
import os
from collections import defaultdict, deque
from datetime import UTC, datetime

import structlog
from confluent_kafka import Consumer, KafkaError
from prometheus_client import Gauge, Histogram, start_http_server
17
+
18
logger = structlog.get_logger()

# Latency budget in seconds; FRESHNESS_SLA_SECONDS is parsed with int() once
# at import time, so a non-numeric value fails the import.
FRESHNESS_SLA_SECONDS = int(os.getenv("FRESHNESS_SLA_SECONDS", "30"))

# Prometheus metrics
# Distribution of (now - event timestamp) per topic and event type.
PIPELINE_LATENCY = Histogram(
    "agentflow_pipeline_latency_seconds",
    "End-to-end pipeline latency in seconds",
    ["topic", "event_type"],
    buckets=[0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 30.0, 60.0],
)

# Share of recently seen events per topic whose latency met the SLA.
# NOTE(review): the description says "rolling 5-min window", but the monitor
# keeps the last 1000 events per topic rather than a time window; confirm
# which one is intended.
SLA_COMPLIANCE = Gauge(
    "agentflow_sla_compliance_ratio",
    "Ratio of events within SLA (rolling 5-min window)",
    ["topic"],
)

# NOTE(review): this metric is only ever incremented and its name ends in
# `_total`, a suffix Prometheus naming conventions reserve for counters;
# consider switching to Counter.
EVENTS_PROCESSED = Gauge(
    "agentflow_freshness_events_total",
    "Total events checked by freshness monitor",
    ["topic"],
)
41
+
42
+
43
class FreshnessMonitor:
    """Consumes from validated topics and measures pipeline latency.

    For every message, latency is the difference between the current UTC time
    and the event's ``timestamp`` field. It is recorded in Prometheus and
    compared against ``FRESHNESS_SLA_SECONDS``; breaches are logged.
    """

    def __init__(self, bootstrap_servers: str, topics: list[str]):
        """Create the Kafka consumer and the per-topic SLA windows.

        Args:
            bootstrap_servers: Kafka bootstrap servers for the consumer.
            topics: Topics subscribed to when ``start`` is called.
        """
        self.consumer = Consumer(
            {
                "bootstrap.servers": bootstrap_servers,
                "group.id": "agentflow-freshness-monitor",
                "auto.offset.reset": "latest",
                "enable.auto.commit": True,
            }
        )
        self.topics = topics
        self._window_size = 1000  # last N events per topic
        # Bounded deques drop the oldest result automatically once full.
        self._sla_window: dict[str, deque[bool]] = defaultdict(
            lambda: deque(maxlen=self._window_size)
        )

    def start(self, metrics_port: int = 8001):
        """Start monitoring loop with Prometheus metrics endpoint.

        Blocks until interrupted with Ctrl-C; the consumer is always closed
        on exit.
        """
        start_http_server(metrics_port)
        logger.info("freshness_monitor_started", topics=self.topics, port=metrics_port)

        self.consumer.subscribe(self.topics)

        try:
            while True:
                msg = self.consumer.poll(timeout=1.0)
                if msg is None:
                    continue
                err = msg.error()
                if err:
                    # Reaching the end of a partition is expected, not an error.
                    if err.code() != KafkaError._PARTITION_EOF:
                        logger.error("kafka_error", error=str(err))
                    continue

                self._process_message(msg)
        except KeyboardInterrupt:
            logger.info("freshness_monitor_stopping")
        finally:
            self.consumer.close()

    def _log_skipped(self, msg, reason: str, **details) -> None:
        """Log that a message was skipped, with its Kafka coordinates."""
        logger.warning(
            "freshness_message_skipped",
            topic=msg.topic(),
            partition=msg.partition(),
            offset=msg.offset(),
            reason=reason,
            **details,
        )

    def _process_message(self, msg):
        """Record latency and SLA compliance for one consumed message.

        Messages without a value, with a payload that is not a JSON object,
        or with a missing or unparsable ``timestamp`` are logged and skipped.
        """
        topic = msg.topic()

        raw_value = msg.value()
        if raw_value is None:
            # A message without a value would otherwise crash the loop on .decode().
            self._log_skipped(msg, reason="empty_payload")
            return

        try:
            event = json.loads(raw_value.decode())
        except (json.JSONDecodeError, UnicodeDecodeError) as exc:
            self._log_skipped(msg, reason="invalid_payload", error=str(exc), exc_info=True)
            return

        if not isinstance(event, dict):
            # Valid JSON that is not an object has no fields to read.
            self._log_skipped(
                msg,
                reason="invalid_payload",
                error=f"expected a JSON object, got {type(event).__name__}",
            )
            return

        # Calculate latency from event timestamp to now
        event_ts_str = event.get("timestamp")
        if not event_ts_str:
            self._log_skipped(
                msg,
                reason="missing_timestamp",
                event_id=event.get("event_id"),
                event_type=event.get("event_type", "unknown"),
            )
            return

        try:
            event_ts = datetime.fromisoformat(event_ts_str)
            # Naive timestamps are interpreted as UTC.
            if event_ts.tzinfo is None:
                event_ts = event_ts.replace(tzinfo=UTC)
            now = datetime.now(UTC)
            latency = (now - event_ts).total_seconds()
        except (ValueError, TypeError) as exc:
            self._log_skipped(
                msg,
                reason="invalid_timestamp",
                event_id=event.get("event_id"),
                event_type=event.get("event_type", "unknown"),
                timestamp=event_ts_str,
                error=str(exc),
                exc_info=True,
            )
            return

        event_type = event.get("event_type", "unknown")

        # Record metrics
        PIPELINE_LATENCY.labels(topic=topic, event_type=event_type).observe(latency)
        EVENTS_PROCESSED.labels(topic=topic).inc()

        # Track SLA compliance over the last N events of this topic
        within_sla = latency <= FRESHNESS_SLA_SECONDS
        window = self._sla_window[topic]
        window.append(within_sla)

        compliance = sum(window) / len(window)
        SLA_COMPLIANCE.labels(topic=topic).set(compliance)

        if not within_sla:
            logger.warning(
                "sla_breach",
                topic=topic,
                event_type=event_type,
                latency_seconds=round(latency, 2),
                sla_seconds=FRESHNESS_SLA_SECONDS,
                event_id=event.get("event_id"),
            )
159
+
160
+
161
if __name__ == "__main__":
    # Script entry point: watch the validated and aggregated topics using the
    # brokers named by KAFKA_BOOTSTRAP_SERVERS (default: local broker).
    kafka_servers = os.getenv("KAFKA_BOOTSTRAP_SERVERS", "localhost:9092")
    monitored_topics = ["events.validated", "sessions.aggregated"]
    monitor = FreshnessMonitor(bootstrap_servers=kafka_servers, topics=monitored_topics)
    monitor.start()