agentflow-runtime 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentflow_runtime-1.1.0.dist-info/METADATA +55 -0
- agentflow_runtime-1.1.0.dist-info/RECORD +100 -0
- agentflow_runtime-1.1.0.dist-info/WHEEL +4 -0
- agentflow_runtime-1.1.0.dist-info/licenses/LICENSE +21 -0
- src/__init__.py +0 -0
- src/constants.py +3 -0
- src/ingestion/__init__.py +0 -0
- src/ingestion/cdc/__init__.py +5 -0
- src/ingestion/cdc/normalizer.py +186 -0
- src/ingestion/connectors/__init__.py +0 -0
- src/ingestion/connectors/mysql_cdc.py +63 -0
- src/ingestion/connectors/postgres_cdc.py +68 -0
- src/ingestion/producers/__init__.py +0 -0
- src/ingestion/producers/event_producer.py +237 -0
- src/ingestion/schemas/__init__.py +0 -0
- src/ingestion/schemas/events.py +147 -0
- src/ingestion/tenant_router.py +80 -0
- src/logger.py +41 -0
- src/orchestration/__init__.py +0 -0
- src/orchestration/dags/__init__.py +0 -0
- src/orchestration/dags/daily_batch.py +201 -0
- src/processing/__init__.py +0 -0
- src/processing/event_replayer.py +250 -0
- src/processing/flink_jobs/Dockerfile +55 -0
- src/processing/flink_jobs/__init__.py +0 -0
- src/processing/flink_jobs/checkpointing.py +32 -0
- src/processing/flink_jobs/session_aggregation.py +212 -0
- src/processing/flink_jobs/session_aggregator.py +199 -0
- src/processing/flink_jobs/stream_processor.py +316 -0
- src/processing/iceberg_sink.py +348 -0
- src/processing/local_pipeline.py +452 -0
- src/processing/outbox.py +273 -0
- src/processing/tracing.py +36 -0
- src/processing/transformations/__init__.py +0 -0
- src/processing/transformations/enrichment.py +125 -0
- src/quality/__init__.py +0 -0
- src/quality/monitors/__init__.py +0 -0
- src/quality/monitors/freshness_monitor.py +166 -0
- src/quality/monitors/metrics_collector.py +367 -0
- src/quality/validators/__init__.py +0 -0
- src/quality/validators/schema_validator.py +119 -0
- src/quality/validators/semantic_validator.py +202 -0
- src/serving/__init__.py +0 -0
- src/serving/api/__init__.py +0 -0
- src/serving/api/alert_dispatcher.py +51 -0
- src/serving/api/alerts/__init__.py +38 -0
- src/serving/api/alerts/dispatcher.py +299 -0
- src/serving/api/alerts/escalation.py +290 -0
- src/serving/api/alerts/evaluator.py +81 -0
- src/serving/api/alerts/history.py +115 -0
- src/serving/api/analytics.py +543 -0
- src/serving/api/auth/__init__.py +46 -0
- src/serving/api/auth/key_rotation.py +400 -0
- src/serving/api/auth/manager.py +406 -0
- src/serving/api/auth/middleware.py +331 -0
- src/serving/api/main.py +390 -0
- src/serving/api/middleware/logging.py +41 -0
- src/serving/api/middleware/tracing.py +51 -0
- src/serving/api/rate_limiter.py +76 -0
- src/serving/api/routers/__init__.py +0 -0
- src/serving/api/routers/admin.py +150 -0
- src/serving/api/routers/admin_ui.py +93 -0
- src/serving/api/routers/agent_query.py +639 -0
- src/serving/api/routers/alerts.py +134 -0
- src/serving/api/routers/batch.py +231 -0
- src/serving/api/routers/contracts.py +98 -0
- src/serving/api/routers/deadletter.py +337 -0
- src/serving/api/routers/lineage.py +218 -0
- src/serving/api/routers/search.py +103 -0
- src/serving/api/routers/slo.py +231 -0
- src/serving/api/routers/stream.py +141 -0
- src/serving/api/routers/webhooks.py +93 -0
- src/serving/api/security.py +83 -0
- src/serving/api/telemetry.py +66 -0
- src/serving/api/templates/admin.html +214 -0
- src/serving/api/versioning.py +328 -0
- src/serving/api/webhook_dispatcher.py +423 -0
- src/serving/backends/__init__.py +117 -0
- src/serving/backends/clickhouse_backend.py +310 -0
- src/serving/backends/duckdb_backend.py +268 -0
- src/serving/cache.py +169 -0
- src/serving/db_pool.py +105 -0
- src/serving/masking.py +122 -0
- src/serving/semantic_layer/__init__.py +0 -0
- src/serving/semantic_layer/catalog.py +177 -0
- src/serving/semantic_layer/contract_registry.py +258 -0
- src/serving/semantic_layer/entity_type_registry.py +107 -0
- src/serving/semantic_layer/nl_engine.py +189 -0
- src/serving/semantic_layer/query/__init__.py +3 -0
- src/serving/semantic_layer/query/contracts.py +47 -0
- src/serving/semantic_layer/query/engine.py +81 -0
- src/serving/semantic_layer/query/entity_queries.py +221 -0
- src/serving/semantic_layer/query/metric_queries.py +84 -0
- src/serving/semantic_layer/query/nl_queries.py +305 -0
- src/serving/semantic_layer/query/sql_builder.py +113 -0
- src/serving/semantic_layer/query/sql_guard.py +3 -0
- src/serving/semantic_layer/query_engine.py +5 -0
- src/serving/semantic_layer/schema_evolution.py +175 -0
- src/serving/semantic_layer/search_index.py +337 -0
- src/serving/semantic_layer/sql_guard.py +56 -0
|
@@ -0,0 +1,367 @@
|
|
|
1
|
+
"""Collects and exposes pipeline health metrics.
|
|
2
|
+
|
|
3
|
+
Aggregates metrics from Kafka consumer groups, Flink jobs,
|
|
4
|
+
and quality checks into a unified health status.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import os
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
from datetime import UTC, datetime
|
|
10
|
+
from enum import StrEnum
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
import duckdb
|
|
14
|
+
import httpx
|
|
15
|
+
import structlog
|
|
16
|
+
import yaml # type: ignore[import-untyped]
|
|
17
|
+
from confluent_kafka import KafkaException
|
|
18
|
+
from prometheus_client import Gauge
|
|
19
|
+
from pyiceberg.exceptions import NoSuchPropertyException, RESTError, ValidationError
|
|
20
|
+
|
|
21
|
+
# Module-level structlog logger used by the health checks below for
# structured warning events.
logger = structlog.get_logger()
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class HealthStatus(StrEnum):
|
|
25
|
+
HEALTHY = "healthy"
|
|
26
|
+
DEGRADED = "degraded"
|
|
27
|
+
UNHEALTHY = "unhealthy"
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# Per-component health gauge, labelled by component name.
# Value encoding (see the description string): 1=healthy, 0.5=degraded,
# 0=unhealthy.
PIPELINE_HEALTH = Gauge(
    "agentflow_pipeline_health",
    "Pipeline health status (1=healthy, 0.5=degraded, 0=unhealthy)",
    ["component"],
)

# Kafka consumer-group lag gauge, labelled by group, topic and partition.
# NOTE(review): nothing in the visible part of this module sets this gauge —
# confirm where it is populated.
CONSUMER_LAG = Gauge(
    "agentflow_consumer_lag",
    "Kafka consumer group lag",
    ["group_id", "topic", "partition"],
)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class CheckSource(StrEnum):
|
|
44
|
+
LIVE = "live"
|
|
45
|
+
PLACEHOLDER = "placeholder"
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
@dataclass
class ComponentHealth:
    """Result of a single component health check."""

    # Short component identifier; also used as the gauge label value.
    name: str
    status: HealthStatus
    # Human-readable summary of the check outcome.
    message: str
    # Timestamp recorded when the check result was built.
    last_check: datetime
    # Check-specific measurements; keys differ per component.
    metrics: dict
    # LIVE unless the check had to fall back to placeholder values.
    source: CheckSource = CheckSource.LIVE
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
@dataclass
class PipelineHealth:
    """Snapshot of overall pipeline health plus the per-component results."""

    overall: HealthStatus
    components: list[ComponentHealth]
    checked_at: datetime

    def to_dict(self) -> dict:
        """Return a plain-dict view of the snapshot.

        ``checked_at`` is rendered with ``isoformat()`` and each component's
        ``source`` is reduced to its string value; ``last_check`` is not
        included in the per-component entries.
        """

        def _component_entry(component: ComponentHealth) -> dict:
            # Flatten one component into the public dict shape.
            return {
                "name": component.name,
                "status": component.status,
                "message": component.message,
                "metrics": component.metrics,
                "source": component.source.value,
            }

        return {
            "status": self.overall,
            "checked_at": self.checked_at.isoformat(),
            "components": [_component_entry(item) for item in self.components],
        }
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
class HealthCollector:
    """Aggregates health from all pipeline components.

    Every ``_check_*`` method probes one dependency and returns a
    ``ComponentHealth``. ``collect`` runs them in registration order,
    publishes one ``PIPELINE_HEALTH`` gauge sample per component and derives
    the overall status from the worst component.
    """

    def __init__(self) -> None:
        # Zero-argument bound methods, executed in this order by collect().
        self._checks: list = [
            self._check_kafka,
            self._check_flink,
            self._check_freshness,
            self._check_quality_score,
            self._check_iceberg,
        ]

    def collect(self) -> PipelineHealth:
        """Run every registered check and summarise the results.

        Returns:
            A ``PipelineHealth`` whose ``overall`` is UNHEALTHY if any
            component is UNHEALTHY, otherwise DEGRADED if any component is
            DEGRADED, otherwise HEALTHY.
        """
        components = [check() for check in self._checks]

        # Overall status: worst component determines it
        statuses = {c.status for c in components}
        if HealthStatus.UNHEALTHY in statuses:
            overall = HealthStatus.UNHEALTHY
        elif HealthStatus.DEGRADED in statuses:
            overall = HealthStatus.DEGRADED
        else:
            overall = HealthStatus.HEALTHY

        # Built once instead of once per component; StrEnum members hash and
        # compare like their string values, so lookups behave as before.
        gauge_values = {
            HealthStatus.HEALTHY: 1.0,
            HealthStatus.DEGRADED: 0.5,
            HealthStatus.UNHEALTHY: 0.0,
        }
        for c in components:
            PIPELINE_HEALTH.labels(component=c.name).set(gauge_values[c.status])

        return PipelineHealth(
            overall=overall,
            components=components,
            checked_at=datetime.now(UTC),
        )

    def _check_kafka(self) -> ComponentHealth:
        """Check Kafka broker connectivity via cluster metadata.

        Reads ``KAFKA_BOOTSTRAP_SERVERS`` (default ``localhost:9092``).
        A metadata failure or an empty broker list yields UNHEALTHY;
        otherwise the check is HEALTHY and reports broker/topic counts.
        """
        # Imported lazily, as in the original, so the admin client is only
        # loaded when the check actually runs.
        from confluent_kafka.admin import AdminClient

        bootstrap = os.getenv("KAFKA_BOOTSTRAP_SERVERS", "localhost:9092")
        try:
            admin = AdminClient({"bootstrap.servers": bootstrap})
            cluster_meta = admin.list_topics(timeout=5)
        except (KafkaException, OSError) as exc:
            logger.warning(
                "kafka_check_unavailable",
                bootstrap_servers=bootstrap,
                error=str(exc),
                exc_info=True,
            )
            return ComponentHealth(
                name="kafka",
                status=HealthStatus.UNHEALTHY,
                message=f"Kafka unavailable: {exc}",
                last_check=datetime.now(UTC),
                metrics={"brokers": 0, "topics": 0},
                source=CheckSource.PLACEHOLDER,
            )

        topic_count = len(cluster_meta.topics)
        broker_count = len(cluster_meta.brokers)

        if broker_count == 0:
            return ComponentHealth(
                name="kafka",
                status=HealthStatus.UNHEALTHY,
                message="No brokers available",
                last_check=datetime.now(UTC),
                # "topics" added so every kafka result exposes the same keys.
                metrics={"brokers": 0, "topics": topic_count},
            )

        return ComponentHealth(
            name="kafka",
            status=HealthStatus.HEALTHY,
            message=f"{broker_count} brokers, {topic_count} topics",
            last_check=datetime.now(UTC),
            metrics={"brokers": broker_count, "topics": topic_count},
        )

    def _check_flink(self) -> ComponentHealth:
        """Check the Flink JobManager ``/overview`` endpoint.

        Reads ``FLINK_JOBMANAGER_URL`` (default ``http://localhost:8081``).
        An HTTP or JSON-decoding failure yields UNHEALTHY. Otherwise the
        check is DEGRADED when any job has failed or none is running, and
        HEALTHY when at least one job runs and none has failed.
        """
        flink_url = os.getenv("FLINK_JOBMANAGER_URL", "http://localhost:8081")
        try:
            resp = httpx.get(f"{flink_url}/overview", timeout=5)
            resp.raise_for_status()
            data = resp.json()
        except (httpx.HTTPError, ValueError) as exc:
            logger.warning(
                "flink_check_unavailable",
                flink_url=flink_url,
                error=str(exc),
                exc_info=True,
            )
            return ComponentHealth(
                name="flink",
                status=HealthStatus.UNHEALTHY,
                message=f"Flink unavailable: {exc}",
                last_check=datetime.now(UTC),
                metrics={"running_jobs": None, "failed_jobs": None},
                source=CheckSource.PLACEHOLDER,
            )

        running = data.get("jobs-running", 0)
        failed = data.get("jobs-failed", 0)

        if failed > 0:
            status = HealthStatus.DEGRADED
            msg = f"{running} running, {failed} failed"
        elif running == 0:
            status = HealthStatus.DEGRADED
            msg = "No running jobs"
        else:
            status = HealthStatus.HEALTHY
            msg = f"{running} jobs running"

        return ComponentHealth(
            name="flink",
            status=status,
            message=msg,
            last_check=datetime.now(UTC),
            metrics={"running_jobs": running, "failed_jobs": failed},
        )

    def _check_freshness(self) -> ComponentHealth:
        """Check data freshness from the most recent pipeline event.

        Queries ``MAX(processed_at)`` from ``pipeline_events`` in the DuckDB
        file named by ``DUCKDB_PATH`` (default ``agentflow_demo.duckdb``) and
        compares its age with ``FRESHNESS_SLA_SECONDS`` (default 30):
        HEALTHY up to the SLA, DEGRADED up to three times the SLA, UNHEALTHY
        beyond that. A query failure or an empty result yields a DEGRADED
        placeholder.
        """
        db_path = os.getenv("DUCKDB_PATH", "agentflow_demo.duckdb")
        row = None
        try:
            conn = duckdb.connect(db_path, read_only=True)
            try:
                row = conn.execute("SELECT MAX(processed_at) FROM pipeline_events").fetchone()
            finally:
                # Close even when the query fails so the handle is not leaked.
                conn.close()
        except duckdb.Error as exc:
            logger.warning(
                "freshness_check_unavailable",
                db_path=db_path,
                error=str(exc),
                exc_info=True,
            )

        if row and row[0]:
            last_event = row[0]
            if isinstance(last_event, datetime):
                # Naive values are treated as UTC (as before); aware values
                # keep their own offset instead of having it overwritten.
                if last_event.tzinfo is None:
                    last_event = last_event.replace(tzinfo=UTC)
                age_s = (datetime.now(UTC) - last_event).total_seconds()
            else:
                # NOTE(review): a non-datetime value gets the -1.0 sentinel,
                # which satisfies the SLA test below and reports HEALTHY —
                # confirm this is intended.
                age_s = -1.0

            sla = int(os.getenv("FRESHNESS_SLA_SECONDS", "30"))
            if age_s <= sla:
                status = HealthStatus.HEALTHY
            elif age_s <= sla * 3:
                status = HealthStatus.DEGRADED
            else:
                status = HealthStatus.UNHEALTHY

            return ComponentHealth(
                name="freshness",
                status=status,
                message=f"Last event {age_s:.0f}s ago (SLA: {sla}s)",
                last_check=datetime.now(UTC),
                metrics={
                    "last_event_age_seconds": round(age_s, 1),
                    "sla_seconds": sla,
                },
                source=CheckSource.LIVE,
            )

        return ComponentHealth(
            name="freshness",
            status=HealthStatus.DEGRADED,
            message="No pipeline events found (run local pipeline first)",
            last_check=datetime.now(UTC),
            metrics={"last_event_age_seconds": None},
            source=CheckSource.PLACEHOLDER,
        )

    def _check_quality_score(self) -> ComponentHealth:
        """Check data quality from dead letter ratio in pipeline events.

        Counts rows of ``pipeline_events`` from the last hour and how many of
        them have topic ``events.deadletter``. A pass rate of at least 0.99
        is HEALTHY, at least 0.95 is DEGRADED, anything lower is UNHEALTHY.
        A query failure or zero rows yields a DEGRADED placeholder.
        """
        db_path = os.getenv("DUCKDB_PATH", "agentflow_demo.duckdb")
        row = None
        try:
            conn = duckdb.connect(db_path, read_only=True)
            try:
                row = conn.execute("""
                    SELECT
                        COUNT(*) as total,
                        COUNT(*) FILTER (
                            WHERE topic = 'events.deadletter'
                        ) as dead
                    FROM pipeline_events
                    WHERE processed_at >= NOW() - INTERVAL '1 hour'
                """).fetchone()
            finally:
                # Close even when the query fails so the handle is not leaked.
                conn.close()
        except duckdb.Error as exc:
            logger.warning(
                "quality_check_unavailable",
                db_path=db_path,
                error=str(exc),
                exc_info=True,
            )

        if row and row[0] and row[0] > 0:
            total, dead = row[0], row[1]
            pass_rate = (total - dead) / total
            if pass_rate >= 0.99:
                status = HealthStatus.HEALTHY
            elif pass_rate >= 0.95:
                status = HealthStatus.DEGRADED
            else:
                status = HealthStatus.UNHEALTHY

            return ComponentHealth(
                name="quality",
                status=status,
                message=f"Pass rate: {pass_rate:.1%} ({dead}/{total} rejected)",
                last_check=datetime.now(UTC),
                metrics={
                    "pass_rate": round(pass_rate, 4),
                    "total_events": total,
                    "rejected_events": dead,
                },
                source=CheckSource.LIVE,
            )

        return ComponentHealth(
            name="quality",
            status=HealthStatus.DEGRADED,
            message="No pipeline events found (run local pipeline first)",
            last_check=datetime.now(UTC),
            metrics={"pass_rate": None},
            source=CheckSource.PLACEHOLDER,
        )

    def _check_iceberg(self) -> ComponentHealth:
        """Check Iceberg catalog accessibility and row counts.

        Uses the config file named by ``AGENTFLOW_ICEBERG_CONFIG`` (default
        ``config/iceberg.yaml``). A missing config file or a failure while
        loading the sink or reading row counts yields DEGRADED; otherwise the
        check is HEALTHY and reports per-table and total row counts.
        """
        config_path = Path(os.getenv("AGENTFLOW_ICEBERG_CONFIG", "config/iceberg.yaml"))
        if not config_path.exists():
            return ComponentHealth(
                name="iceberg",
                status=HealthStatus.DEGRADED,
                message="Iceberg config not found",
                last_check=datetime.now(UTC),
                metrics={"row_counts": {}},
                source=CheckSource.PLACEHOLDER,
            )

        try:
            # Imported inside the try so an ImportError is reported as a
            # degraded check rather than propagating.
            from src.processing.iceberg_sink import IcebergSink

            sink = IcebergSink(config_path=config_path)
            row_counts = sink.row_counts()
        except (
            ImportError,
            OSError,
            KeyError,
            ValueError,
            yaml.YAMLError,
            NoSuchPropertyException,
            RESTError,
            ValidationError,
        ) as exc:
            logger.warning(
                "iceberg_check_unavailable",
                config_path=str(config_path),
                error=str(exc),
                exc_info=True,
            )
            return ComponentHealth(
                name="iceberg",
                status=HealthStatus.DEGRADED,
                message=f"Iceberg unavailable: {exc}",
                last_check=datetime.now(UTC),
                metrics={"row_counts": {}},
                source=CheckSource.PLACEHOLDER,
            )

        total_rows = sum(row_counts.values())
        return ComponentHealth(
            name="iceberg",
            status=HealthStatus.HEALTHY,
            message=f"{len(row_counts)} tables, {total_rows} rows",
            last_check=datetime.now(UTC),
            metrics={
                "row_counts": row_counts,
                "total_rows": total_rows,
            },
            source=CheckSource.LIVE,
        )
|
|
File without changes
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
"""Schema validation for incoming events.
|
|
2
|
+
|
|
3
|
+
Validates events against their Pydantic schemas before they enter the storage layer.
|
|
4
|
+
Returns structured validation results with error details for observability.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from dataclasses import dataclass, field
|
|
8
|
+
from datetime import UTC, datetime
|
|
9
|
+
|
|
10
|
+
from pydantic import ValidationError
|
|
11
|
+
|
|
12
|
+
from src.ingestion.schemas.events import (
|
|
13
|
+
CdcEvent,
|
|
14
|
+
ClickstreamEvent,
|
|
15
|
+
OrderEvent,
|
|
16
|
+
PaymentEvent,
|
|
17
|
+
ProductEvent,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
|
|
22
|
+
class ValidationResult:
|
|
23
|
+
is_valid: bool
|
|
24
|
+
event_id: str
|
|
25
|
+
event_type: str
|
|
26
|
+
errors: list[dict] = field(default_factory=list)
|
|
27
|
+
validated_at: datetime = field(default_factory=lambda: datetime.now(UTC))
|
|
28
|
+
|
|
29
|
+
def to_dict(self) -> dict:
|
|
30
|
+
return {
|
|
31
|
+
"is_valid": self.is_valid,
|
|
32
|
+
"event_id": self.event_id,
|
|
33
|
+
"event_type": self.event_type,
|
|
34
|
+
"errors": self.errors,
|
|
35
|
+
"validated_at": self.validated_at.isoformat(),
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
# Map event type prefixes to their Pydantic models.
# Lookup is first-match over insertion order, so the order of entries matters
# if prefixes ever overlap.
_SCHEMA_MAP: dict = {
    "order.": OrderEvent,
    "payment.": PaymentEvent,
    "click": ClickstreamEvent,
    "page_view": ClickstreamEvent,
    "add_to_cart": ClickstreamEvent,
    "product.": ProductEvent,
}

# Values of the "source" field that mark an event as a CDC record.
_CDC_SOURCES = {"postgres_cdc", "mysql_cdc"}
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _get_model_for_event(event_type: str):  # -> BaseModel subclass | None
    """Return the schema model registered for *event_type*, if any.

    The first entry of ``_SCHEMA_MAP`` whose key is a prefix of
    ``event_type`` wins. Returns ``None`` when nothing matches or when
    ``event_type`` is not a string (e.g. a JSON ``null``), so malformed
    input is reported as an unknown type instead of raising.
    """
    if not isinstance(event_type, str):
        return None
    # startswith() already covers the exact-match case.
    for prefix, model in _SCHEMA_MAP.items():
        if event_type.startswith(prefix):
            return model
    return None
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def validate_event(raw_event: dict) -> ValidationResult:
    """Validate a single event against its schema.

    Args:
        raw_event: Raw event dict (already parsed from JSON).

    Returns:
        ValidationResult with is_valid=True if the event passes,
        or is_valid=False with structured error details. An event with no
        matching schema is reported with a single ``unknown_event_type``
        error.
    """
    event_id = raw_event.get("event_id", "unknown")
    event_type = raw_event.get("event_type", "unknown")

    # CDC events are recognised by their envelope, not by event_type.
    if _is_cdc_event(raw_event):
        model = CdcEvent
    else:
        model = _get_model_for_event(event_type)

    if model is None:
        no_schema = {"type": "unknown_event_type", "msg": f"No schema for: {event_type}"}
        return ValidationResult(
            is_valid=False,
            event_id=event_id,
            event_type=event_type,
            errors=[no_schema],
        )

    try:
        model.model_validate(raw_event)
    except ValidationError as exc:
        # Keep only the fields needed for reporting from each Pydantic error.
        details = []
        for problem in exc.errors():
            details.append(
                {
                    "type": problem["type"],
                    "loc": list(problem["loc"]),
                    "msg": problem["msg"],
                }
            )
        return ValidationResult(
            is_valid=False,
            event_id=event_id,
            event_type=event_type,
            errors=details,
        )

    return ValidationResult(is_valid=True, event_id=event_id, event_type=event_type)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def _is_cdc_event(raw_event: dict) -> bool:
    """Return True when *raw_event* looks like a CDC record.

    An event qualifies when its ``source`` is one of ``_CDC_SOURCES`` and it
    carries both ``operation`` and ``source_metadata`` keys.
    """
    source = raw_event.get("source")
    # Guard with isinstance: an unhashable source (dict or list) would raise
    # TypeError on set membership instead of simply not matching.
    return (
        isinstance(source, str)
        and source in _CDC_SOURCES
        and "operation" in raw_event
        and "source_metadata" in raw_event
    )
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def validate_batch(events: list[dict]) -> tuple[list[dict], list[ValidationResult]]:
    """Validate a batch of events. Returns (valid_events, failed_results).

    Valid events are returned as the original dicts; invalid ones are
    represented by their ``ValidationResult``. Input order is preserved in
    both lists.
    """
    accepted: list[dict] = []
    rejected: list[ValidationResult] = []
    for candidate in events:
        outcome = validate_event(candidate)
        if not outcome.is_valid:
            rejected.append(outcome)
            continue
        accepted.append(candidate)
    return accepted, rejected
|
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
"""Semantic validation: business rules that go beyond schema correctness.
|
|
2
|
+
|
|
3
|
+
Schema validation checks structure. Semantic validation checks meaning:
|
|
4
|
+
- Does the order total actually match line items?
|
|
5
|
+
- Is the payment amount within reasonable bounds?
|
|
6
|
+
- Does the user_id reference a plausible user?
|
|
7
|
+
|
|
8
|
+
These rules catch data quality issues that pass schema validation
|
|
9
|
+
but would cause AI agents to give wrong answers.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import argparse
|
|
13
|
+
from dataclasses import dataclass, field
|
|
14
|
+
from datetime import UTC, datetime
|
|
15
|
+
from decimal import Decimal
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
|
|
19
|
+
class SemanticIssue:
|
|
20
|
+
rule: str
|
|
21
|
+
severity: str # "error" | "warning"
|
|
22
|
+
field: str
|
|
23
|
+
message: str
|
|
24
|
+
actual_value: str | None = None
|
|
25
|
+
expected: str | None = None
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass
class SemanticResult:
    """Outcome of running the semantic rules on one event."""

    event_id: str
    event_type: str
    is_clean: bool
    # Findings produced by the rules; defaults to an empty list.
    issues: list[SemanticIssue] = field(default_factory=list)
    # Defaults to the current UTC time when the result is created.
    checked_at: datetime = field(default_factory=lambda: datetime.now(UTC))

    def to_dict(self) -> dict:
        """Return the result as a plain dict with an ISO-8601 timestamp.

        Each issue is reduced to its rule, severity, field and message;
        ``actual_value`` and ``expected`` are not included.
        """
        serialized_issues = []
        for issue in self.issues:
            serialized_issues.append(
                {
                    "rule": issue.rule,
                    "severity": issue.severity,
                    "field": issue.field,
                    "message": issue.message,
                }
            )
        return {
            "event_id": self.event_id,
            "event_type": self.event_type,
            "is_clean": self.is_clean,
            "issues": serialized_issues,
            "checked_at": self.checked_at.isoformat(),
        }
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
# ── Rule definitions ────────────────────────────────────────────
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _check_order_total_consistency(event: dict) -> list[SemanticIssue]:
    """Order total must match sum of (quantity * unit_price) for all items.

    Values are converted through ``str`` into ``Decimal``; a difference of
    more than 0.01 between the stated and computed totals is reported as an
    error.
    """
    stated_total = Decimal(str(event.get("total_amount", 0)))

    # Accumulate from int 0, matching what sum() does with no start value.
    computed_total = 0
    for line_item in event.get("items", []):
        quantity = Decimal(str(line_item.get("quantity", 0)))
        unit_price = Decimal(str(line_item.get("unit_price", 0)))
        computed_total = computed_total + quantity * unit_price

    if abs(stated_total - computed_total) <= Decimal("0.01"):
        return []

    return [
        SemanticIssue(
            rule="order_total_consistency",
            severity="error",
            field="total_amount",
            message=f"Stated total {stated_total} != computed {computed_total}",
            actual_value=str(stated_total),
            expected=str(computed_total),
        )
    ]
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _check_payment_amount_bounds(event: dict) -> list[SemanticIssue]:
    """Payment amount should be between $0.50 and $50,000.

    An amount below the minimum is an error; an amount above the maximum is
    only a warning.
    """
    amount = Decimal(str(event.get("amount", 0)))

    if amount < Decimal("0.50"):
        return [
            SemanticIssue(
                rule="payment_min_amount",
                severity="error",
                field="amount",
                message=f"Payment amount {amount} below minimum $0.50",
                actual_value=str(amount),
            )
        ]

    if amount > Decimal("50000"):
        return [
            SemanticIssue(
                rule="payment_max_amount",
                severity="warning",
                field="amount",
                message=f"Payment amount {amount} exceeds $50,000 — needs manual review",
                actual_value=str(amount),
            )
        ]

    return []
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def _check_payment_failure_reason(event: dict) -> list[SemanticIssue]:
    """Failed payments must have a failure_reason.

    Emits a warning when ``status`` is ``"failed"`` and ``failure_reason``
    is missing or falsy.
    """
    is_failed = event.get("status") == "failed"
    if not is_failed or event.get("failure_reason"):
        return []

    return [
        SemanticIssue(
            rule="payment_failure_reason_required",
            severity="warning",
            field="failure_reason",
            message="Failed payment missing failure_reason",
        )
    ]
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def _check_clickstream_session_id(event: dict) -> list[SemanticIssue]:
    """Clickstream events must have a session_id.

    Emits an error when ``session_id`` is missing or falsy.
    """
    if event.get("session_id"):
        return []

    return [
        SemanticIssue(
            rule="clickstream_session_required",
            severity="error",
            field="session_id",
            message="Clickstream event missing session_id",
        )
    ]
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def _check_product_price_sanity(event: dict) -> list[SemanticIssue]:
    """Product price should be between $0 and $100,000.

    Emits a warning when the price is negative or above 100000. Both
    findings use the ``product_price_sanity`` rule name and warning
    severity.
    """
    price = Decimal(str(event.get("price", 0)))

    # Lower bound promised by the docstring; previously never checked.
    if price < 0:
        return [
            SemanticIssue(
                rule="product_price_sanity",
                severity="warning",
                field="price",
                message=f"Product price {price} is negative",
                actual_value=str(price),
            )
        ]

    if price > Decimal("100000"):
        return [
            SemanticIssue(
                rule="product_price_sanity",
                severity="warning",
                field="price",
                message=f"Product price {price} seems unreasonably high",
                actual_value=str(price),
            )
        ]

    return []
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
# ── Rule registry ───────────────────────────────────────────────
|
|
157
|
+
|
|
158
|
+
# Event-type prefix -> rule functions to run for matching events.
# Every key that is a prefix of the event type contributes its rules, and
# each rule function takes the event dict and returns a list of issues.
_RULES: dict[str, list] = {
    "order.": [_check_order_total_consistency],
    "payment.": [_check_payment_amount_bounds, _check_payment_failure_reason],
    "click": [_check_clickstream_session_id],
    "page_view": [_check_clickstream_session_id],
    "add_to_cart": [_check_clickstream_session_id],
    "product.": [_check_product_price_sanity],
}
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def validate_semantics(event: dict) -> SemanticResult:
    """Run all applicable semantic rules on an event.

    Rules are selected by matching the event type against the prefixes in
    ``_RULES``. The result is clean unless at least one issue has severity
    ``"error"``; warnings are reported but do not make the event unclean.
    """
    event_id = event.get("event_id", "unknown")
    event_type = event.get("event_type", "unknown")

    findings: list[SemanticIssue] = []
    for prefix, rule_fns in _RULES.items():
        # startswith() also covers an event type equal to the prefix.
        if not event_type.startswith(prefix):
            continue
        for rule_fn in rule_fns:
            findings.extend(rule_fn(event))

    blocking = [issue for issue in findings if issue.severity == "error"]

    return SemanticResult(
        event_id=event_id,
        event_type=event_type,
        is_clean=not blocking,
        issues=findings,
    )
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
if __name__ == "__main__":
    # Small command-line smoke test: with --check-all, validate one sample
    # order and print a one-line summary.
    cli = argparse.ArgumentParser(description="Run semantic validation checks")
    cli.add_argument("--check-all", action="store_true", help="Run all checks on sample data")
    options = cli.parse_args()

    if options.check_all:
        # Stated total equals 2 * 50.00, so no issue is expected.
        demo_order = {
            "event_id": "test-001",
            "event_type": "order.created",
            "total_amount": "100.00",
            "items": [{"quantity": 2, "unit_price": "50.00", "product_id": "P1"}],
        }
        outcome = validate_semantics(demo_order)
        print(f"Order check: is_clean={outcome.is_clean}, issues={len(outcome.issues)}")
|
src/serving/__init__.py
ADDED
|
File without changes
|
|
File without changes
|