agentflow-runtime 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentflow_runtime-1.1.0.dist-info/METADATA +55 -0
- agentflow_runtime-1.1.0.dist-info/RECORD +100 -0
- agentflow_runtime-1.1.0.dist-info/WHEEL +4 -0
- agentflow_runtime-1.1.0.dist-info/licenses/LICENSE +21 -0
- src/__init__.py +0 -0
- src/constants.py +3 -0
- src/ingestion/__init__.py +0 -0
- src/ingestion/cdc/__init__.py +5 -0
- src/ingestion/cdc/normalizer.py +186 -0
- src/ingestion/connectors/__init__.py +0 -0
- src/ingestion/connectors/mysql_cdc.py +63 -0
- src/ingestion/connectors/postgres_cdc.py +68 -0
- src/ingestion/producers/__init__.py +0 -0
- src/ingestion/producers/event_producer.py +237 -0
- src/ingestion/schemas/__init__.py +0 -0
- src/ingestion/schemas/events.py +147 -0
- src/ingestion/tenant_router.py +80 -0
- src/logger.py +41 -0
- src/orchestration/__init__.py +0 -0
- src/orchestration/dags/__init__.py +0 -0
- src/orchestration/dags/daily_batch.py +201 -0
- src/processing/__init__.py +0 -0
- src/processing/event_replayer.py +250 -0
- src/processing/flink_jobs/Dockerfile +55 -0
- src/processing/flink_jobs/__init__.py +0 -0
- src/processing/flink_jobs/checkpointing.py +32 -0
- src/processing/flink_jobs/session_aggregation.py +212 -0
- src/processing/flink_jobs/session_aggregator.py +199 -0
- src/processing/flink_jobs/stream_processor.py +316 -0
- src/processing/iceberg_sink.py +348 -0
- src/processing/local_pipeline.py +452 -0
- src/processing/outbox.py +273 -0
- src/processing/tracing.py +36 -0
- src/processing/transformations/__init__.py +0 -0
- src/processing/transformations/enrichment.py +125 -0
- src/quality/__init__.py +0 -0
- src/quality/monitors/__init__.py +0 -0
- src/quality/monitors/freshness_monitor.py +166 -0
- src/quality/monitors/metrics_collector.py +367 -0
- src/quality/validators/__init__.py +0 -0
- src/quality/validators/schema_validator.py +119 -0
- src/quality/validators/semantic_validator.py +202 -0
- src/serving/__init__.py +0 -0
- src/serving/api/__init__.py +0 -0
- src/serving/api/alert_dispatcher.py +51 -0
- src/serving/api/alerts/__init__.py +38 -0
- src/serving/api/alerts/dispatcher.py +299 -0
- src/serving/api/alerts/escalation.py +290 -0
- src/serving/api/alerts/evaluator.py +81 -0
- src/serving/api/alerts/history.py +115 -0
- src/serving/api/analytics.py +543 -0
- src/serving/api/auth/__init__.py +46 -0
- src/serving/api/auth/key_rotation.py +400 -0
- src/serving/api/auth/manager.py +406 -0
- src/serving/api/auth/middleware.py +331 -0
- src/serving/api/main.py +390 -0
- src/serving/api/middleware/logging.py +41 -0
- src/serving/api/middleware/tracing.py +51 -0
- src/serving/api/rate_limiter.py +76 -0
- src/serving/api/routers/__init__.py +0 -0
- src/serving/api/routers/admin.py +150 -0
- src/serving/api/routers/admin_ui.py +93 -0
- src/serving/api/routers/agent_query.py +639 -0
- src/serving/api/routers/alerts.py +134 -0
- src/serving/api/routers/batch.py +231 -0
- src/serving/api/routers/contracts.py +98 -0
- src/serving/api/routers/deadletter.py +337 -0
- src/serving/api/routers/lineage.py +218 -0
- src/serving/api/routers/search.py +103 -0
- src/serving/api/routers/slo.py +231 -0
- src/serving/api/routers/stream.py +141 -0
- src/serving/api/routers/webhooks.py +93 -0
- src/serving/api/security.py +83 -0
- src/serving/api/telemetry.py +66 -0
- src/serving/api/templates/admin.html +214 -0
- src/serving/api/versioning.py +328 -0
- src/serving/api/webhook_dispatcher.py +423 -0
- src/serving/backends/__init__.py +117 -0
- src/serving/backends/clickhouse_backend.py +310 -0
- src/serving/backends/duckdb_backend.py +268 -0
- src/serving/cache.py +169 -0
- src/serving/db_pool.py +105 -0
- src/serving/masking.py +122 -0
- src/serving/semantic_layer/__init__.py +0 -0
- src/serving/semantic_layer/catalog.py +177 -0
- src/serving/semantic_layer/contract_registry.py +258 -0
- src/serving/semantic_layer/entity_type_registry.py +107 -0
- src/serving/semantic_layer/nl_engine.py +189 -0
- src/serving/semantic_layer/query/__init__.py +3 -0
- src/serving/semantic_layer/query/contracts.py +47 -0
- src/serving/semantic_layer/query/engine.py +81 -0
- src/serving/semantic_layer/query/entity_queries.py +221 -0
- src/serving/semantic_layer/query/metric_queries.py +84 -0
- src/serving/semantic_layer/query/nl_queries.py +305 -0
- src/serving/semantic_layer/query/sql_builder.py +113 -0
- src/serving/semantic_layer/query/sql_guard.py +3 -0
- src/serving/semantic_layer/query_engine.py +5 -0
- src/serving/semantic_layer/schema_evolution.py +175 -0
- src/serving/semantic_layer/search_index.py +337 -0
- src/serving/semantic_layer/sql_guard.py +56 -0
src/processing/outbox.py
ADDED
|
@@ -0,0 +1,273 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import json
|
|
5
|
+
import os
|
|
6
|
+
from collections.abc import Callable
|
|
7
|
+
from contextlib import nullcontext
|
|
8
|
+
from datetime import UTC, datetime, timedelta
|
|
9
|
+
|
|
10
|
+
import duckdb
|
|
11
|
+
import structlog
|
|
12
|
+
from confluent_kafka import KafkaException
|
|
13
|
+
from opentelemetry import trace
|
|
14
|
+
|
|
15
|
+
from src.processing.tracing import inject_trace_to_kafka_headers, telemetry_disabled
|
|
16
|
+
|
|
17
|
+
# Module-level structlog logger used by the outbox processor for retry and
# failure events.
logger = structlog.get_logger()
# OpenTelemetry tracer; the Kafka producer path opens its "kafka.produce" span
# through it.
tracer = trace.get_tracer("agentflow.outbox")

# Fallback Kafka bootstrap servers, read once at import time from the
# KAFKA_BOOTSTRAP_SERVERS environment variable.
DEFAULT_KAFKA_BOOTSTRAP = os.getenv("KAFKA_BOOTSTRAP_SERVERS", "localhost:9092")
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def ensure_outbox_table(conn) -> None:
    """Create the ``outbox`` table on *conn* when it is not there yet.

    The statement uses ``CREATE TABLE IF NOT EXISTS``, so calling this more
    than once is harmless.

    Args:
        conn: Object exposing ``execute(sql)``; presumably a DuckDB
            connection -- confirm against callers.
    """
    create_outbox_sql = """
        CREATE TABLE IF NOT EXISTS outbox (
            id TEXT PRIMARY KEY,
            event_id TEXT NOT NULL,
            payload JSON NOT NULL,
            topic TEXT NOT NULL,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            sent_at TIMESTAMP,
            status TEXT DEFAULT 'pending',
            retry_count INTEGER DEFAULT 0,
            next_attempt_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            last_error TEXT
        )
        """
    conn.execute(create_outbox_sql)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class OutboxProcessor:
|
|
43
|
+
def __init__(
|
|
44
|
+
self,
|
|
45
|
+
duckdb_path: str | None = None,
|
|
46
|
+
conn=None,
|
|
47
|
+
producer: Callable[[str, dict], None] | None = None,
|
|
48
|
+
bootstrap_servers: str | None = None,
|
|
49
|
+
max_retries: int = 5,
|
|
50
|
+
) -> None:
|
|
51
|
+
if conn is None and duckdb_path is None:
|
|
52
|
+
raise ValueError("duckdb_path or conn is required")
|
|
53
|
+
self._owns_conn = conn is None
|
|
54
|
+
self._conn = conn if conn is not None else duckdb.connect(str(duckdb_path))
|
|
55
|
+
self._producer = producer or self._produce_to_kafka
|
|
56
|
+
self._bootstrap_servers = bootstrap_servers or DEFAULT_KAFKA_BOOTSTRAP
|
|
57
|
+
self._max_retries = max_retries
|
|
58
|
+
ensure_outbox_table(self._conn)
|
|
59
|
+
|
|
60
|
+
def close(self) -> None:
|
|
61
|
+
if self._owns_conn and self._conn is not None:
|
|
62
|
+
self._conn.close()
|
|
63
|
+
self._conn = None
|
|
64
|
+
|
|
65
|
+
async def run_forever(self) -> None:
|
|
66
|
+
try:
|
|
67
|
+
while True:
|
|
68
|
+
await asyncio.sleep(2)
|
|
69
|
+
try:
|
|
70
|
+
self.process_pending()
|
|
71
|
+
except duckdb.Error as exc:
|
|
72
|
+
logger.warning(
|
|
73
|
+
"outbox_processing_failed",
|
|
74
|
+
error=str(exc),
|
|
75
|
+
bootstrap_servers=self._bootstrap_servers,
|
|
76
|
+
owns_connection=self._owns_conn,
|
|
77
|
+
exc_info=True,
|
|
78
|
+
)
|
|
79
|
+
finally:
|
|
80
|
+
self.close()
|
|
81
|
+
|
|
82
|
+
def process_pending(self, limit: int = 100) -> int:
|
|
83
|
+
rows = self._conn.execute(
|
|
84
|
+
"""
|
|
85
|
+
SELECT id, event_id, payload, topic, retry_count
|
|
86
|
+
FROM outbox
|
|
87
|
+
WHERE status = 'pending'
|
|
88
|
+
AND (next_attempt_at IS NULL OR next_attempt_at <= ?)
|
|
89
|
+
ORDER BY created_at
|
|
90
|
+
LIMIT ?
|
|
91
|
+
""",
|
|
92
|
+
[datetime.now(UTC), limit],
|
|
93
|
+
).fetchall()
|
|
94
|
+
processed = 0
|
|
95
|
+
for row in rows:
|
|
96
|
+
if self._process_row(row):
|
|
97
|
+
processed += 1
|
|
98
|
+
return processed
|
|
99
|
+
|
|
100
|
+
def process_entry(self, outbox_id: str) -> bool:
|
|
101
|
+
row = self._conn.execute(
|
|
102
|
+
"""
|
|
103
|
+
SELECT id, event_id, payload, topic, retry_count
|
|
104
|
+
FROM outbox
|
|
105
|
+
WHERE id = ?
|
|
106
|
+
AND status = 'pending'
|
|
107
|
+
""",
|
|
108
|
+
[outbox_id],
|
|
109
|
+
).fetchone()
|
|
110
|
+
if row is None:
|
|
111
|
+
return False
|
|
112
|
+
return self._process_row(row)
|
|
113
|
+
|
|
114
|
+
def _process_row(self, row) -> bool:
|
|
115
|
+
outbox_id, event_id, payload, topic, retry_count = row
|
|
116
|
+
decoded_payload = self._decode_payload(payload)
|
|
117
|
+
try:
|
|
118
|
+
self._producer(topic, decoded_payload)
|
|
119
|
+
except (BufferError, ConnectionError, TimeoutError, KafkaException, RuntimeError) as exc:
|
|
120
|
+
error_message = str(exc)
|
|
121
|
+
if isinstance(exc, RuntimeError) and not (
|
|
122
|
+
error_message.startswith("KafkaError{")
|
|
123
|
+
or "Kafka message(s) were not delivered" in error_message
|
|
124
|
+
):
|
|
125
|
+
raise
|
|
126
|
+
next_retry_count = int(retry_count or 0) + 1
|
|
127
|
+
logger.warning(
|
|
128
|
+
"outbox_delivery_retry_scheduled",
|
|
129
|
+
outbox_id=outbox_id,
|
|
130
|
+
event_id=event_id,
|
|
131
|
+
topic=topic,
|
|
132
|
+
retry_count=next_retry_count,
|
|
133
|
+
error=error_message,
|
|
134
|
+
exc_info=True,
|
|
135
|
+
)
|
|
136
|
+
self._schedule_retry(
|
|
137
|
+
outbox_id=outbox_id,
|
|
138
|
+
event_id=event_id,
|
|
139
|
+
retry_count=next_retry_count,
|
|
140
|
+
error_message=error_message,
|
|
141
|
+
)
|
|
142
|
+
return False
|
|
143
|
+
self._mark_sent(outbox_id=outbox_id, event_id=event_id)
|
|
144
|
+
return True
|
|
145
|
+
|
|
146
|
+
def _mark_sent(self, outbox_id: str, event_id: str) -> None:
|
|
147
|
+
sent_at = datetime.now(UTC)
|
|
148
|
+
self._conn.execute("BEGIN TRANSACTION")
|
|
149
|
+
try:
|
|
150
|
+
self._conn.execute(
|
|
151
|
+
"""
|
|
152
|
+
UPDATE outbox
|
|
153
|
+
SET status = 'sent',
|
|
154
|
+
sent_at = ?,
|
|
155
|
+
last_error = NULL
|
|
156
|
+
WHERE id = ?
|
|
157
|
+
""",
|
|
158
|
+
[sent_at, outbox_id],
|
|
159
|
+
)
|
|
160
|
+
self._conn.execute(
|
|
161
|
+
"UPDATE dead_letter_events SET status = 'replayed' WHERE event_id = ?",
|
|
162
|
+
[event_id],
|
|
163
|
+
)
|
|
164
|
+
self._conn.execute("COMMIT")
|
|
165
|
+
except Exception: # nosec B110 - rollback must preserve the original replay failure
|
|
166
|
+
# Transaction rollback must happen before unexpected errors propagate.
|
|
167
|
+
self._conn.execute("ROLLBACK")
|
|
168
|
+
raise
|
|
169
|
+
|
|
170
|
+
def _schedule_retry(
|
|
171
|
+
self,
|
|
172
|
+
outbox_id: str,
|
|
173
|
+
event_id: str,
|
|
174
|
+
retry_count: int,
|
|
175
|
+
error_message: str,
|
|
176
|
+
) -> None:
|
|
177
|
+
status = "pending"
|
|
178
|
+
retry_delay_seconds = 2**retry_count
|
|
179
|
+
is_kafka_error = (
|
|
180
|
+
error_message.startswith("KafkaError{")
|
|
181
|
+
or "Kafka message(s) were not delivered" in error_message
|
|
182
|
+
)
|
|
183
|
+
if is_kafka_error:
|
|
184
|
+
retry_delay_seconds = max(retry_delay_seconds, 30)
|
|
185
|
+
next_attempt_at: datetime | None = datetime.now(UTC) + timedelta(
|
|
186
|
+
seconds=retry_delay_seconds
|
|
187
|
+
)
|
|
188
|
+
self._conn.execute("BEGIN TRANSACTION")
|
|
189
|
+
try:
|
|
190
|
+
if retry_count >= self._max_retries:
|
|
191
|
+
status = "failed"
|
|
192
|
+
next_attempt_at = None
|
|
193
|
+
self._conn.execute(
|
|
194
|
+
"""
|
|
195
|
+
UPDATE outbox
|
|
196
|
+
SET status = ?,
|
|
197
|
+
retry_count = ?,
|
|
198
|
+
next_attempt_at = ?,
|
|
199
|
+
last_error = ?
|
|
200
|
+
WHERE id = ?
|
|
201
|
+
""",
|
|
202
|
+
[status, retry_count, next_attempt_at, error_message, outbox_id],
|
|
203
|
+
)
|
|
204
|
+
if status == "failed":
|
|
205
|
+
self._conn.execute(
|
|
206
|
+
"UPDATE dead_letter_events SET status = 'failed' WHERE event_id = ?",
|
|
207
|
+
[event_id],
|
|
208
|
+
)
|
|
209
|
+
self._conn.execute("COMMIT")
|
|
210
|
+
except Exception: # nosec B110 - rollback must preserve the original retry scheduling failure
|
|
211
|
+
# Transaction rollback must happen before unexpected errors propagate.
|
|
212
|
+
self._conn.execute("ROLLBACK")
|
|
213
|
+
raise
|
|
214
|
+
|
|
215
|
+
def _decode_payload(self, payload) -> dict:
|
|
216
|
+
if isinstance(payload, dict):
|
|
217
|
+
return payload
|
|
218
|
+
if isinstance(payload, str):
|
|
219
|
+
decoded = json.loads(payload)
|
|
220
|
+
if isinstance(decoded, dict):
|
|
221
|
+
return decoded
|
|
222
|
+
raise ValueError("Outbox payload must be a JSON object.")
|
|
223
|
+
|
|
224
|
+
def _produce_to_kafka(self, topic: str, payload: dict) -> None:
|
|
225
|
+
from confluent_kafka import Producer
|
|
226
|
+
|
|
227
|
+
delivery_errors: list[str] = []
|
|
228
|
+
|
|
229
|
+
def on_delivery(err, msg) -> None:
|
|
230
|
+
del msg
|
|
231
|
+
if err is not None:
|
|
232
|
+
delivery_errors.append(str(err))
|
|
233
|
+
|
|
234
|
+
producer = Producer({"bootstrap.servers": self._bootstrap_servers})
|
|
235
|
+
produce_span = (
|
|
236
|
+
tracer.start_as_current_span("kafka.produce")
|
|
237
|
+
if not telemetry_disabled()
|
|
238
|
+
else nullcontext()
|
|
239
|
+
)
|
|
240
|
+
with produce_span as span:
|
|
241
|
+
if span is not None and span.is_recording():
|
|
242
|
+
span.set_attribute("topic", topic)
|
|
243
|
+
event_type = payload.get("event_type")
|
|
244
|
+
if event_type is not None:
|
|
245
|
+
span.set_attribute("event_type", str(event_type))
|
|
246
|
+
tenant_id = payload.get("tenant_id") or structlog.contextvars.get_contextvars().get(
|
|
247
|
+
"tenant_id"
|
|
248
|
+
)
|
|
249
|
+
if tenant_id is not None:
|
|
250
|
+
span.set_attribute("tenant_id", str(tenant_id))
|
|
251
|
+
headers = inject_trace_to_kafka_headers({})
|
|
252
|
+
try:
|
|
253
|
+
producer.produce(
|
|
254
|
+
topic,
|
|
255
|
+
key=str(payload.get("event_id", "")),
|
|
256
|
+
value=json.dumps(payload).encode("utf-8"),
|
|
257
|
+
headers=list(headers.items()) or None,
|
|
258
|
+
on_delivery=on_delivery,
|
|
259
|
+
)
|
|
260
|
+
except TypeError as exc:
|
|
261
|
+
if "on_delivery" not in str(exc):
|
|
262
|
+
raise
|
|
263
|
+
producer.produce(
|
|
264
|
+
topic,
|
|
265
|
+
key=str(payload.get("event_id", "")),
|
|
266
|
+
value=json.dumps(payload).encode("utf-8"),
|
|
267
|
+
headers=list(headers.items()) or None,
|
|
268
|
+
)
|
|
269
|
+
remaining = producer.flush(10)
|
|
270
|
+
if delivery_errors:
|
|
271
|
+
raise RuntimeError(delivery_errors[0])
|
|
272
|
+
if remaining != 0:
|
|
273
|
+
raise RuntimeError(f"{remaining} Kafka message(s) were not delivered")
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
from collections.abc import Mapping, Sequence
|
|
5
|
+
|
|
6
|
+
from opentelemetry.context import Context
|
|
7
|
+
from opentelemetry.propagate import extract, inject
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def telemetry_disabled() -> bool:
    """Report whether ``OTEL_SDK_DISABLED`` is set to ``true`` (any letter case)."""
    flag = os.getenv("OTEL_SDK_DISABLED", "")
    return flag.lower() == "true"
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def inject_trace_to_kafka_headers(
    headers: Mapping[str, bytes] | None = None,
) -> dict[str, bytes]:
    """Return a copy of *headers* with the current trace context added.

    The propagator writes string values into a scratch carrier; each one is
    UTF-8 encoded before being stored. When telemetry is disabled the copy is
    returned untouched.

    Args:
        headers: Existing Kafka headers; ``None`` or an empty mapping yields
            an empty starting dict.

    Returns:
        A new dict; the input mapping is not modified.
    """
    merged: dict[str, bytes] = dict(headers) if headers else {}
    if not telemetry_disabled():
        text_carrier: dict[str, str] = {}
        inject(text_carrier)
        merged.update(
            {name: text.encode("utf-8") for name, text in text_carrier.items()}
        )
    return merged
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def extract_trace_from_kafka_headers(
    headers: Mapping[str, bytes] | Sequence[tuple[str, bytes]] | None,
) -> Context:
    """Build a trace ``Context`` from Kafka message headers.

    Args:
        headers: Either a mapping or a sequence of ``(key, value)`` pairs.
            Entries whose value is not ``bytes`` are ignored.

    Returns:
        The context extracted from the decoded headers, or the result of
        extracting from an empty carrier when telemetry is disabled or no
        headers were given.
    """
    if headers is None or telemetry_disabled():
        return extract({})

    pairs = headers.items() if isinstance(headers, Mapping) else headers
    text_carrier = {}
    for key, value in pairs:
        if isinstance(value, bytes):
            text_carrier[str(key)] = value.decode("utf-8")
    return extract(text_carrier)
|
|
File without changes
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
"""Event enrichment functions for the processing layer.
|
|
2
|
+
|
|
3
|
+
Pure functions that add derived fields to events. Used by Flink jobs
|
|
4
|
+
and batch transformations alike — keeping logic DRY across streaming and batch.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from decimal import Decimal
|
|
8
|
+
|
|
9
|
+
# Exclusive upper bounds on an order's total_amount for the "small", "medium"
# and "large" buckets; totals at or above the last bound are "whale".
ORDER_SIZE_SMALL_MAX_TOTAL = Decimal("50")
ORDER_SIZE_MEDIUM_MAX_TOTAL = Decimal("200")
ORDER_SIZE_LARGE_MAX_TOTAL = Decimal("1000")
# Viewport widths strictly below this value are flagged as mobile.
MOBILE_VIEWPORT_MAX_WIDTH = 768
# Payment amounts strictly above these thresholds raise the risk score.
PAYMENT_RISK_HIGH_AMOUNT_THRESHOLD = 500
PAYMENT_RISK_MEDIUM_AMOUNT_THRESHOLD = 200
# Risk-level cut-offs: score > HIGH is "high", score >= MEDIUM is "medium",
# anything lower is "low".
PAYMENT_RISK_HIGH_SCORE_THRESHOLD = 0.5
PAYMENT_RISK_MEDIUM_SCORE_THRESHOLD = 0.2
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def enrich_order(event: dict) -> dict:
    """Attach order-level derived metrics to *event* under ``_derived``.

    Derived keys:
        - item_count: sum of ``quantity`` over all items
        - unique_products: count of distinct ``product_id`` values
        - avg_item_price: ``total_amount`` / item_count, rounded to cents
          (0.0 when there are no items)
        - order_size_bucket: small / medium / large / whale by total

    The event is modified in place and also returned.
    """
    line_items = event.get("items", [])
    order_total = Decimal(str(event.get("total_amount", 0)))

    quantity_sum = 0
    product_ids = set()
    for line in line_items:
        quantity_sum += line.get("quantity", 0)
        if "product_id" in line:
            product_ids.add(line["product_id"])

    if quantity_sum > 0:
        mean_price = order_total / quantity_sum
    else:
        mean_price = Decimal("0")

    # First bound the total falls under wins; nothing matching means "whale".
    size_bucket = "whale"
    for upper_bound, label in (
        (ORDER_SIZE_SMALL_MAX_TOTAL, "small"),
        (ORDER_SIZE_MEDIUM_MAX_TOTAL, "medium"),
        (ORDER_SIZE_LARGE_MAX_TOTAL, "large"),
    ):
        if order_total < upper_bound:
            size_bucket = label
            break

    derived = {
        "item_count": quantity_sum,
        "unique_products": len(product_ids),
        "avg_item_price": float(mean_price.quantize(Decimal("0.01"))),
        "order_size_bucket": size_bucket,
    }
    event["_derived"] = derived
    return event
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def enrich_clickstream(event: dict) -> dict:
    """Attach page and device classification to *event* under ``_derived``.

    Derived keys:
        - is_mobile: viewport_width is present and below the mobile maximum
        - page_category: product_detail / cart / checkout / search / home /
          other, chosen from the first matching fragment of ``page_url``
        - is_product_page: True only for the product_detail category

    The event is modified in place and also returned.
    """
    viewport_width = event.get("viewport_width")
    url = event.get("page_url", "")

    # Fragments are checked in priority order; the exact "/" home check only
    # applies when none of them occurs in the URL.
    fragment_categories = (
        ("/products/", "product_detail"),
        ("/cart", "cart"),
        ("/checkout", "checkout"),
        ("/search", "search"),
    )
    for fragment, label in fragment_categories:
        if fragment in url:
            category = label
            break
    else:
        category = "home" if url == "/" else "other"

    event["_derived"] = {
        "is_mobile": viewport_width is not None
        and viewport_width < MOBILE_VIEWPORT_MAX_WIDTH,
        "page_category": category,
        "is_product_page": category == "product_detail",
    }
    return event
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def compute_payment_risk_score(event: dict) -> dict:
    """Attach a heuristic fraud score to a payment event under ``_derived``.

    The score is the sum of three contributions:
        - amount: 0.3 above the high threshold, 0.1 above the medium one
        - method: 0.1 for "card", 0.15 for "wallet", nothing otherwise
        - missing or empty user_id: 0.3

    ``risk_score`` is capped at 1.0 and ``risk_level`` is high / medium / low
    according to the score thresholds. The event is modified in place and
    also returned.
    """
    amount = float(event.get("amount", 0))
    method = event.get("method")

    if amount > PAYMENT_RISK_HIGH_AMOUNT_THRESHOLD:
        amount_part = 0.3
    elif amount > PAYMENT_RISK_MEDIUM_AMOUNT_THRESHOLD:
        amount_part = 0.1
    else:
        amount_part = 0.0

    if method == "card":
        method_part = 0.1
    elif method == "wallet":
        method_part = 0.15
    else:
        method_part = 0.0

    user_part = 0.0 if event.get("user_id") else 0.3

    # Summed left to right in the same order the contributions are described.
    score = amount_part + method_part + user_part

    if score > PAYMENT_RISK_HIGH_SCORE_THRESHOLD:
        level = "high"
    elif score >= PAYMENT_RISK_MEDIUM_SCORE_THRESHOLD:
        level = "medium"
    else:
        level = "low"

    event["_derived"] = {"risk_score": min(score, 1.0), "risk_level": level}
    return event
|
src/quality/__init__.py
ADDED
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
"""Monitors data freshness across all pipeline stages.
|
|
2
|
+
|
|
3
|
+
Checks that events flow through the pipeline within SLA bounds.
|
|
4
|
+
Exposes metrics to Prometheus and triggers alerts on SLA breaches.
|
|
5
|
+
|
|
6
|
+
SLA: end-to-end latency (ingestion → serving) < 30 seconds for p99.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import json
|
|
10
|
+
import os
|
|
11
|
+
from collections import defaultdict
|
|
12
|
+
from datetime import UTC, datetime
|
|
13
|
+
|
|
14
|
+
import structlog
|
|
15
|
+
from confluent_kafka import Consumer, KafkaError
|
|
16
|
+
from prometheus_client import Gauge, Histogram, start_http_server
|
|
17
|
+
|
|
18
|
+
logger = structlog.get_logger()

# Latency budget in seconds; read once at import time from the
# FRESHNESS_SLA_SECONDS environment variable (default 30).
FRESHNESS_SLA_SECONDS = int(os.getenv("FRESHNESS_SLA_SECONDS", "30"))

# Prometheus metrics
# Histogram of event-timestamp-to-now latency, labelled by topic and event type.
PIPELINE_LATENCY = Histogram(
    "agentflow_pipeline_latency_seconds",
    "End-to-end pipeline latency in seconds",
    ["topic", "event_type"],
    buckets=[0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 30.0, 60.0],
)

# NOTE(review): the help text says "rolling 5-min window", but FreshnessMonitor
# keeps the last N events per topic rather than a time window -- confirm which
# is intended.
SLA_COMPLIANCE = Gauge(
    "agentflow_sla_compliance_ratio",
    "Ratio of events within SLA (rolling 5-min window)",
    ["topic"],
)

# NOTE(review): declared as a Gauge although the name ends in "_total" and the
# monitor only ever calls .inc() on it; a Counter may have been intended --
# confirm before changing the exported metric type.
EVENTS_PROCESSED = Gauge(
    "agentflow_freshness_events_total",
    "Total events checked by freshness monitor",
    ["topic"],
)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class FreshnessMonitor:
    """Consumes from validated topics and measures pipeline latency.

    For every message the latency between the event's ``timestamp`` field and
    the current time is recorded in Prometheus, and a per-topic SLA compliance
    ratio is kept over the most recent events.
    """

    def __init__(self, bootstrap_servers: str, topics: list[str]):
        """Create the Kafka consumer and the per-topic SLA windows.

        Args:
            bootstrap_servers: Kafka bootstrap servers for the consumer.
            topics: Topics to subscribe to when ``start`` is called.
        """
        # Local import: the module only imports ``defaultdict`` from collections.
        from collections import deque

        self.consumer = Consumer(
            {
                "bootstrap.servers": bootstrap_servers,
                "group.id": "agentflow-freshness-monitor",
                "auto.offset.reset": "latest",
                "enable.auto.commit": True,
            }
        )
        self.topics = topics
        self._window_size = 1000  # last N events per topic
        # A bounded deque drops the oldest entry itself once the window is full.
        self._sla_window = defaultdict(lambda: deque(maxlen=self._window_size))

    def start(self, metrics_port: int = 8001):
        """Start monitoring loop with Prometheus metrics endpoint.

        Blocks until interrupted with ``KeyboardInterrupt``; the consumer is
        closed on the way out.
        """
        start_http_server(metrics_port)
        logger.info("freshness_monitor_started", topics=self.topics, port=metrics_port)

        self.consumer.subscribe(self.topics)

        try:
            while True:
                msg = self.consumer.poll(timeout=1.0)
                if msg is None:
                    continue
                err = msg.error()
                if err:
                    # End-of-partition is routine; only real errors are logged.
                    if err.code() != KafkaError._PARTITION_EOF:
                        logger.error("kafka_error", error=str(err))
                    continue

                self._process_message(msg)
        except KeyboardInterrupt:
            logger.info("freshness_monitor_stopping")
        finally:
            self.consumer.close()

    def _log_skipped(self, msg, reason: str, **fields) -> None:
        """Log a ``freshness_message_skipped`` warning for *msg*."""
        logger.warning(
            "freshness_message_skipped",
            topic=msg.topic(),
            partition=msg.partition(),
            offset=msg.offset(),
            reason=reason,
            **fields,
        )

    def _process_message(self, msg):
        """Record latency and SLA metrics for one consumed message.

        Messages that cannot be measured (no value, payload that is not a
        JSON object, missing or unparsable timestamp) are logged and skipped
        so that a single bad message does not stop the monitor.
        """
        topic = msg.topic()

        raw_value = msg.value()
        if raw_value is None:
            # A message without a value (e.g. a tombstone) has nothing to decode.
            self._log_skipped(msg, "invalid_payload", error="message has no value")
            return

        try:
            event = json.loads(raw_value.decode())
        except (json.JSONDecodeError, UnicodeDecodeError) as exc:
            self._log_skipped(msg, "invalid_payload", error=str(exc), exc_info=True)
            return

        if not isinstance(event, dict):
            # Valid JSON that is not an object has no fields to read.
            self._log_skipped(
                msg,
                "invalid_payload",
                error=f"expected a JSON object, got {type(event).__name__}",
            )
            return

        event_type = event.get("event_type", "unknown")

        # Calculate latency from event timestamp to now
        event_ts_str = event.get("timestamp")
        if not event_ts_str:
            self._log_skipped(
                msg,
                "missing_timestamp",
                event_id=event.get("event_id"),
                event_type=event_type,
            )
            return

        try:
            event_ts = datetime.fromisoformat(event_ts_str)
            if event_ts.tzinfo is None:
                # Naive timestamps are treated as UTC.
                event_ts = event_ts.replace(tzinfo=UTC)
            latency = (datetime.now(UTC) - event_ts).total_seconds()
        except (ValueError, TypeError) as exc:
            self._log_skipped(
                msg,
                "invalid_timestamp",
                event_id=event.get("event_id"),
                event_type=event_type,
                timestamp=event_ts_str,
                error=str(exc),
                exc_info=True,
            )
            return

        # Record metrics
        PIPELINE_LATENCY.labels(topic=topic, event_type=event_type).observe(latency)
        EVENTS_PROCESSED.labels(topic=topic).inc()

        # Track SLA compliance over the last ``_window_size`` events per topic
        within_sla = latency <= FRESHNESS_SLA_SECONDS
        window = self._sla_window[topic]
        window.append(within_sla)

        compliance = sum(window) / len(window)
        SLA_COMPLIANCE.labels(topic=topic).set(compliance)

        if not within_sla:
            logger.warning(
                "sla_breach",
                topic=topic,
                event_type=event_type,
                latency_seconds=round(latency, 2),
                sla_seconds=FRESHNESS_SLA_SECONDS,
                event_id=event.get("event_id"),
            )
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
if __name__ == "__main__":
    # Script entry point: monitor the validated-event and aggregated-session
    # topics against the brokers named by KAFKA_BOOTSTRAP_SERVERS.
    kafka_servers = os.getenv("KAFKA_BOOTSTRAP_SERVERS", "localhost:9092")
    monitored_topics = ["events.validated", "sessions.aggregated"]
    monitor = FreshnessMonitor(bootstrap_servers=kafka_servers, topics=monitored_topics)
    monitor.start()
|