agentflow-runtime 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentflow_runtime-1.1.0.dist-info/METADATA +55 -0
- agentflow_runtime-1.1.0.dist-info/RECORD +100 -0
- agentflow_runtime-1.1.0.dist-info/WHEEL +4 -0
- agentflow_runtime-1.1.0.dist-info/licenses/LICENSE +21 -0
- src/__init__.py +0 -0
- src/constants.py +3 -0
- src/ingestion/__init__.py +0 -0
- src/ingestion/cdc/__init__.py +5 -0
- src/ingestion/cdc/normalizer.py +186 -0
- src/ingestion/connectors/__init__.py +0 -0
- src/ingestion/connectors/mysql_cdc.py +63 -0
- src/ingestion/connectors/postgres_cdc.py +68 -0
- src/ingestion/producers/__init__.py +0 -0
- src/ingestion/producers/event_producer.py +237 -0
- src/ingestion/schemas/__init__.py +0 -0
- src/ingestion/schemas/events.py +147 -0
- src/ingestion/tenant_router.py +80 -0
- src/logger.py +41 -0
- src/orchestration/__init__.py +0 -0
- src/orchestration/dags/__init__.py +0 -0
- src/orchestration/dags/daily_batch.py +201 -0
- src/processing/__init__.py +0 -0
- src/processing/event_replayer.py +250 -0
- src/processing/flink_jobs/Dockerfile +55 -0
- src/processing/flink_jobs/__init__.py +0 -0
- src/processing/flink_jobs/checkpointing.py +32 -0
- src/processing/flink_jobs/session_aggregation.py +212 -0
- src/processing/flink_jobs/session_aggregator.py +199 -0
- src/processing/flink_jobs/stream_processor.py +316 -0
- src/processing/iceberg_sink.py +348 -0
- src/processing/local_pipeline.py +452 -0
- src/processing/outbox.py +273 -0
- src/processing/tracing.py +36 -0
- src/processing/transformations/__init__.py +0 -0
- src/processing/transformations/enrichment.py +125 -0
- src/quality/__init__.py +0 -0
- src/quality/monitors/__init__.py +0 -0
- src/quality/monitors/freshness_monitor.py +166 -0
- src/quality/monitors/metrics_collector.py +367 -0
- src/quality/validators/__init__.py +0 -0
- src/quality/validators/schema_validator.py +119 -0
- src/quality/validators/semantic_validator.py +202 -0
- src/serving/__init__.py +0 -0
- src/serving/api/__init__.py +0 -0
- src/serving/api/alert_dispatcher.py +51 -0
- src/serving/api/alerts/__init__.py +38 -0
- src/serving/api/alerts/dispatcher.py +299 -0
- src/serving/api/alerts/escalation.py +290 -0
- src/serving/api/alerts/evaluator.py +81 -0
- src/serving/api/alerts/history.py +115 -0
- src/serving/api/analytics.py +543 -0
- src/serving/api/auth/__init__.py +46 -0
- src/serving/api/auth/key_rotation.py +400 -0
- src/serving/api/auth/manager.py +406 -0
- src/serving/api/auth/middleware.py +331 -0
- src/serving/api/main.py +390 -0
- src/serving/api/middleware/logging.py +41 -0
- src/serving/api/middleware/tracing.py +51 -0
- src/serving/api/rate_limiter.py +76 -0
- src/serving/api/routers/__init__.py +0 -0
- src/serving/api/routers/admin.py +150 -0
- src/serving/api/routers/admin_ui.py +93 -0
- src/serving/api/routers/agent_query.py +639 -0
- src/serving/api/routers/alerts.py +134 -0
- src/serving/api/routers/batch.py +231 -0
- src/serving/api/routers/contracts.py +98 -0
- src/serving/api/routers/deadletter.py +337 -0
- src/serving/api/routers/lineage.py +218 -0
- src/serving/api/routers/search.py +103 -0
- src/serving/api/routers/slo.py +231 -0
- src/serving/api/routers/stream.py +141 -0
- src/serving/api/routers/webhooks.py +93 -0
- src/serving/api/security.py +83 -0
- src/serving/api/telemetry.py +66 -0
- src/serving/api/templates/admin.html +214 -0
- src/serving/api/versioning.py +328 -0
- src/serving/api/webhook_dispatcher.py +423 -0
- src/serving/backends/__init__.py +117 -0
- src/serving/backends/clickhouse_backend.py +310 -0
- src/serving/backends/duckdb_backend.py +268 -0
- src/serving/cache.py +169 -0
- src/serving/db_pool.py +105 -0
- src/serving/masking.py +122 -0
- src/serving/semantic_layer/__init__.py +0 -0
- src/serving/semantic_layer/catalog.py +177 -0
- src/serving/semantic_layer/contract_registry.py +258 -0
- src/serving/semantic_layer/entity_type_registry.py +107 -0
- src/serving/semantic_layer/nl_engine.py +189 -0
- src/serving/semantic_layer/query/__init__.py +3 -0
- src/serving/semantic_layer/query/contracts.py +47 -0
- src/serving/semantic_layer/query/engine.py +81 -0
- src/serving/semantic_layer/query/entity_queries.py +221 -0
- src/serving/semantic_layer/query/metric_queries.py +84 -0
- src/serving/semantic_layer/query/nl_queries.py +305 -0
- src/serving/semantic_layer/query/sql_builder.py +113 -0
- src/serving/semantic_layer/query/sql_guard.py +3 -0
- src/serving/semantic_layer/query_engine.py +5 -0
- src/serving/semantic_layer/schema_evolution.py +175 -0
- src/serving/semantic_layer/search_index.py +337 -0
- src/serving/semantic_layer/sql_guard.py +56 -0
|
@@ -0,0 +1,316 @@
|
|
|
1
|
+
"""Core Flink streaming job: validates, enriches, and routes events.
|
|
2
|
+
|
|
3
|
+
Pipeline: Kafka source → Schema validation → Enrichment → Deduplication → Kafka sink (`events.validated`)
|
|
4
|
+
Invalid events are routed to a dead letter topic with error metadata.
|
|
5
|
+
|
|
6
|
+
This is the main entry point for the Flink cluster. Submit with:
|
|
7
|
+
flink run -py stream_processor.py
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import json
|
|
11
|
+
import os
|
|
12
|
+
from datetime import timedelta
|
|
13
|
+
|
|
14
|
+
from pyflink.common import Types, WatermarkStrategy
|
|
15
|
+
from pyflink.common.serialization import SimpleStringSchema
|
|
16
|
+
from pyflink.common.watermark_strategy import TimestampAssigner
|
|
17
|
+
from pyflink.datastream import StreamExecutionEnvironment
|
|
18
|
+
from pyflink.datastream.connectors.kafka import (
|
|
19
|
+
KafkaOffsetsInitializer,
|
|
20
|
+
KafkaRecordSerializationSchema,
|
|
21
|
+
KafkaSink,
|
|
22
|
+
KafkaSource,
|
|
23
|
+
)
|
|
24
|
+
from pyflink.datastream.functions import MapFunction, ProcessFunction
|
|
25
|
+
from pyflink.datastream.output_tag import OutputTag
|
|
26
|
+
|
|
27
|
+
# Side output for invalid events: ValidateAndEnrich emits JSON-encoded error
# records under this tag, and build_pipeline() sinks them to "events.deadletter".
DEAD_LETTER_TAG = OutputTag("dead-letter", Types.STRING())
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _event_tenant(event: dict) -> str:
|
|
32
|
+
source_metadata = event.get("source_metadata", {})
|
|
33
|
+
metadata_tenant = source_metadata.get("tenant") if isinstance(source_metadata, dict) else None
|
|
34
|
+
tenant = event.get("tenant") or metadata_tenant
|
|
35
|
+
return str(tenant) if tenant else "default"
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class EventTimestampAssigner(TimestampAssigner):
    """Extracts event_time from the JSON payload for watermark generation."""

    def extract_timestamp(self, value, record_timestamp):
        """Return the event time of *value* in epoch milliseconds.

        Args:
            value: Raw record value, expected to be a JSON-encoded object.
            record_timestamp: Timestamp already attached to the record; used as
                the fallback whenever the payload carries no usable timestamp.

        Returns:
            Epoch milliseconds parsed from the event's ``timestamp`` field
            (naive values are treated as UTC), or ``record_timestamp`` when the
            payload is not a JSON object, the field is missing, or it cannot be
            parsed as an ISO-8601 string.
        """
        from datetime import UTC, datetime

        from src.ingestion.cdc.normalizer import is_debezium_event, normalize_debezium_event

        try:
            event = json.loads(value)
            if not isinstance(event, dict):
                # Valid JSON that is not an object (array, number, string) has no
                # "timestamp" field to read; fall back instead of raising.
                return record_timestamp
            if is_debezium_event(event):
                # Best-effort topic for tenant resolution: Debezium value-only
                # deserializer drops the Kafka topic name (Codex review P1).
                # Wrappers should populate `event["topic"]` before this point;
                # without it we still fall back to source.name → 'default'.
                event = normalize_debezium_event(event, topic=event.get("topic"))
            ts = datetime.fromisoformat(event["timestamp"])
            if ts.tzinfo is None:
                ts = ts.replace(tzinfo=UTC)
            return int(ts.timestamp() * 1000)
        except (json.JSONDecodeError, KeyError, TypeError, ValueError):
            # TypeError covers a non-string `timestamp` (e.g. null or epoch int)
            # and a None record value; an uncaught error here would fail the job.
            return record_timestamp
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class ValidateAndEnrich(ProcessFunction):
    """Validates, enriches, and routes events using the shared quality layer.

    Pipeline per event:
    1. Parse JSON (the payload must decode to a JSON object)
    2. Debezium CDC normalization via ingestion.cdc.normalizer
    3. Schema validation via quality.validators.schema_validator
    4. Semantic validation via quality.validators.semantic_validator
    5. Domain enrichment via processing.transformations.enrichment
    6. Processing metadata (latency, version)

    Invalid events (parse, normalization, schema or semantic errors) → dead letter topic.
    """

    @staticmethod
    def _dead_letter(ctx, payload):
        """Route *payload* (a JSON-serializable dict) to the dead-letter side output.

        This is a generator: call it with ``yield from``. PyFlink emits side
        outputs by yielding ``(OutputTag, value)`` from ``process_element``; its
        ``ProcessFunction.Context`` does not provide an ``output`` method. A
        context that does expose a callable ``output`` (for example a test
        double) is still honoured so existing callers keep working.
        """
        message = json.dumps(payload)
        emit = getattr(ctx, "output", None)
        if callable(emit):
            emit(DEAD_LETTER_TAG, message)
        else:
            yield DEAD_LETTER_TAG, message

    def process_element(self, value, ctx: ProcessFunction.Context):
        """Validate and enrich one raw record.

        Args:
            value: Raw record value (JSON text).
            ctx: Process-function context supplied by the runtime.

        Yields:
            The enriched event serialized as JSON on the main output, or a
            ``(DEAD_LETTER_TAG, json_text)`` pair when the record is rejected.
        """
        from datetime import UTC, datetime

        from src.ingestion.cdc.normalizer import is_debezium_event, normalize_debezium_event
        from src.processing.transformations.enrichment import (
            compute_payment_risk_score,
            enrich_clickstream,
            enrich_order,
        )
        from src.quality.validators.schema_validator import validate_event
        from src.quality.validators.semantic_validator import validate_semantics

        # 1. Parse JSON
        try:
            event = json.loads(value)
        except json.JSONDecodeError as e:
            yield from self._dead_letter(
                ctx,
                {
                    "raw": value[:1000],
                    "error": f"JSON parse error: {e}",
                    "stage": "parse",
                },
            )
            return

        if not isinstance(event, dict):
            # Arrays, numbers and strings are valid JSON but cannot be events;
            # without this guard the `.get()` calls below would fail the job.
            yield from self._dead_letter(
                ctx,
                {
                    "raw": value[:1000],
                    "error": f"Expected a JSON object, got {type(event).__name__}",
                    "stage": "parse",
                },
            )
            return

        # 2. CDC normalization
        try:
            if is_debezium_event(event):
                # Debezium's value-only deserializer drops the Kafka topic name;
                # Flink wrappers should inject `event["topic"]` from
                # KafkaSourceMetadata when available so tenant resolution sees
                # the prefixed topic.
                event = normalize_debezium_event(event, topic=event.get("topic"))
        except ValueError as e:
            yield from self._dead_letter(
                ctx,
                {
                    "raw": value[:1000],
                    "error": str(e),
                    "stage": "cdc_normalization",
                },
            )
            return

        event_id = event.get("event_id", "unknown")
        event_type = event.get("event_type", "unknown")
        event["tenant"] = _event_tenant(event)
        is_cdc_event = event.get("source") in {"postgres_cdc", "mysql_cdc"} and "operation" in event

        # 3. Schema validation (Pydantic models)
        schema_result = validate_event(event)
        if not schema_result.is_valid:
            yield from self._dead_letter(
                ctx,
                {
                    "event_id": event_id,
                    "error": schema_result.errors,
                    "stage": "schema_validation",
                },
            )
            return

        # 4. Semantic validation (business rules); only "error" severity issues
        #    reject the event, anything milder lets it continue.
        semantic_result = validate_semantics(event)
        if not semantic_result.is_clean:
            error_issues = [
                i.to_dict()
                if hasattr(i, "to_dict")
                else {
                    "rule": i.rule,
                    "severity": i.severity,
                    "field": i.field,
                    "message": i.message,
                }
                for i in semantic_result.issues
                if i.severity == "error"
            ]
            if error_issues:
                yield from self._dead_letter(
                    ctx,
                    {
                        "event_id": event_id,
                        "error": error_issues,
                        "stage": "semantic_validation",
                    },
                )
                return

        # 5. Domain enrichment by event type (CDC events pass through untouched)
        if is_cdc_event:
            pass
        elif event_type.startswith("order."):
            event = enrich_order(event)
        elif event_type in ("click", "page_view", "add_to_cart"):
            event = enrich_clickstream(event)
        elif event_type.startswith("payment."):
            event = compute_payment_risk_score(event)

        # 6. Processing metadata
        now = datetime.now(UTC)
        try:
            event_ts = datetime.fromisoformat(event["timestamp"])
            if event_ts.tzinfo is None:
                event_ts = event_ts.replace(tzinfo=UTC)
            latency_ms = int((now - event_ts).total_seconds() * 1000)
        except (KeyError, ValueError, TypeError):
            # -1 marks "latency unknown"; a missing or unparsable timestamp
            # must not fail an event that already passed validation.
            latency_ms = -1

        event["_enriched"] = {
            "processing_time": now.isoformat(),
            "pipeline_latency_ms": latency_ms,
            "processor_version": "1.0.0",
        }

        event["_partition_key"] = (
            event.get("user_id")
            or event.get("order_id")
            or event.get("entity_id")
            or event["event_id"]
        )

        yield json.dumps(event)
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
class DeduplicateByEventId(MapFunction):
    """Deduplicates events using a Flink keyed state with TTL.

    Events with the same event_id within the TTL window are dropped.
    This handles at-least-once delivery from Kafka producers.

    The function must run on a keyed stream: the "seen" flag is per-key state,
    so the key chosen upstream decides what counts as a duplicate.
    """

    def open(self, runtime_context):
        """Register the per-key "seen" flag with a 10 minute time-to-live."""
        from pyflink.datastream.state import StateTtlConfig, ValueStateDescriptor

        # StateTtlConfig expects PyFlink's own time type, not datetime.timedelta.
        # NOTE(review): `Time` is the documented argument type; the Duration
        # fallback is for releases that no longer ship `Time` — confirm against
        # the PyFlink version actually deployed.
        try:
            from pyflink.common.time import Time

            ttl = Time.minutes(10)
        except ImportError:
            from pyflink.common import Duration

            ttl = Duration.of_minutes(10)

        ttl_config = (
            StateTtlConfig.new_builder(ttl)
            .set_update_type(StateTtlConfig.UpdateType.OnCreateAndWrite)
            .build()
        )

        state_desc = ValueStateDescriptor("seen", Types.BOOLEAN())
        state_desc.enable_time_to_live(ttl_config)
        self.seen_state = runtime_context.get_state(state_desc)

    def map(self, value):
        """Return *value* the first time its key is seen, ``None`` for repeats.

        Callers are expected to filter out the ``None`` results afterwards.
        """
        if self.seen_state.value():
            return None  # duplicate
        self.seen_state.update(True)
        return value
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def build_pipeline():
    """Assemble the streaming job graph and return the execution environment.

    Topology: multi-topic Kafka source → ValidateAndEnrich → (side output)
    ``events.deadletter`` and (main output) key by ``event_id`` →
    DeduplicateByEventId → ``events.validated``.

    Environment variables:
        FLINK_PARALLELISM: job parallelism (default ``2``).
        KAFKA_BOOTSTRAP_SERVERS: Kafka brokers (default ``localhost:9092``).

    Returns:
        The configured ``StreamExecutionEnvironment``; the caller invokes
        ``execute()`` on it.
    """
    # WatermarkStrategy takes PyFlink's Duration wrapper rather than
    # datetime.timedelta, so import it here where it is needed.
    from pyflink.common import Duration

    env = StreamExecutionEnvironment.get_execution_environment()

    # Checkpointing for exactly-once
    # NOTE(review): neither Kafka sink below sets a delivery guarantee, so the
    # builder default applies — confirm that matches the exactly-once intent.
    env.enable_checkpointing(30_000)  # 30s
    env.get_checkpoint_config().set_min_pause_between_checkpoints(10_000)
    env.set_parallelism(int(os.getenv("FLINK_PARALLELISM", "2")))

    bootstrap_servers = os.getenv("KAFKA_BOOTSTRAP_SERVERS", "localhost:9092")

    # Multi-topic Kafka source
    source = (
        KafkaSource.builder()
        .set_bootstrap_servers(bootstrap_servers)
        .set_topics(
            "orders.raw",
            "payments.raw",
            "clicks.raw",
            "products.cdc",
            "cdc.postgres.public.orders_v2",
            "cdc.postgres.public.users_enriched",
            "cdc.mysql.agentflow_demo.products_current",
            "cdc.mysql.agentflow_demo.sessions_aggregated",
        )
        .set_group_id("agentflow-stream-processor")
        .set_starting_offsets(KafkaOffsetsInitializer.earliest())
        .set_value_only_deserializer(SimpleStringSchema())
        .build()
    )

    # Tolerate events arriving up to 5 seconds out of order.
    watermark_strategy = WatermarkStrategy.for_bounded_out_of_orderness(
        Duration.of_seconds(5)
    ).with_timestamp_assigner(EventTimestampAssigner())

    # Main pipeline
    stream = env.from_source(source, watermark_strategy, "kafka-source")

    # Validate + enrich (with dead letter side output)
    validated = stream.process(ValidateAndEnrich(), output_type=Types.STRING())

    # Dead letter sink
    dead_letter_sink = (
        KafkaSink.builder()
        .set_bootstrap_servers(bootstrap_servers)
        .set_record_serializer(
            KafkaRecordSerializationSchema.builder()
            .set_topic("events.deadletter")
            .set_value_serialization_schema(SimpleStringSchema())
            .build()
        )
        .build()
    )

    validated.get_side_output(DEAD_LETTER_TAG).sink_to(dead_letter_sink)

    # Deduplicate by event_id; duplicates come back from the map as None and
    # are removed by the filter that follows.
    deduped = (
        validated.key_by(lambda x: json.loads(x).get("event_id", ""))
        .map(DeduplicateByEventId(), output_type=Types.STRING())
        .filter(lambda x: x is not None)
    )

    # Validated events sink (for downstream consumers)
    validated_sink = (
        KafkaSink.builder()
        .set_bootstrap_servers(bootstrap_servers)
        .set_record_serializer(
            KafkaRecordSerializationSchema.builder()
            .set_topic("events.validated")
            .set_value_serialization_schema(SimpleStringSchema())
            .build()
        )
        .build()
    )

    deduped.sink_to(validated_sink)

    return env
|
|
312
|
+
|
|
313
|
+
|
|
314
|
+
if __name__ == "__main__":
    # Script entry point: build the job graph and submit it under the job name.
    build_pipeline().execute("agentflow-stream-processor")
|
|
@@ -0,0 +1,348 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import os
|
|
5
|
+
from datetime import UTC, datetime
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
import pyarrow as pa
|
|
10
|
+
import yaml # type: ignore[import-untyped]
|
|
11
|
+
from pyiceberg.catalog import load_catalog
|
|
12
|
+
from pyiceberg.partitioning import PartitionField, PartitionSpec
|
|
13
|
+
from pyiceberg.schema import Schema
|
|
14
|
+
from pyiceberg.transforms import DayTransform, HourTransform
|
|
15
|
+
from pyiceberg.types import (
|
|
16
|
+
BooleanType,
|
|
17
|
+
DoubleType,
|
|
18
|
+
IntegerType,
|
|
19
|
+
NestedField,
|
|
20
|
+
StringType,
|
|
21
|
+
TimestampType,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
# Iceberg schema for the "orders" table; every column is required.
# Rows are (field id, column name, column type).
ORDERS_SCHEMA = Schema(
    *(
        NestedField(field_id=column_id, name=column, field_type=column_type, required=True)
        for column_id, column, column_type in (
            (1, "event_id", StringType()),
            (2, "event_type", StringType()),
            (3, "order_id", StringType()),
            (4, "user_id", StringType()),
            (5, "status", StringType()),
            (6, "total_amount", DoubleType()),
            (7, "currency", StringType()),
            (8, "item_count", IntegerType()),
            (9, "unique_products", IntegerType()),
            (10, "order_size_bucket", StringType()),
            (11, "created_at", TimestampType()),
            (12, "payload_json", StringType()),
        )
    )
)
|
|
38
|
+
|
|
39
|
+
# Iceberg schema for the "payments" table.
# Rows are (field id, column name, column type, required); only the two risk
# columns are optional.
PAYMENTS_SCHEMA = Schema(
    *(
        NestedField(field_id=column_id, name=column, field_type=column_type, required=mandatory)
        for column_id, column, column_type, mandatory in (
            (1, "event_id", StringType(), True),
            (2, "event_type", StringType(), True),
            (3, "payment_id", StringType(), True),
            (4, "order_id", StringType(), True),
            (5, "user_id", StringType(), True),
            (6, "amount", DoubleType(), True),
            (7, "currency", StringType(), True),
            (8, "method", StringType(), True),
            (9, "status", StringType(), True),
            (10, "risk_score", DoubleType(), False),
            (11, "risk_level", StringType(), False),
            (12, "created_at", TimestampType(), True),
            (13, "payload_json", StringType(), True),
        )
    )
)
|
|
54
|
+
|
|
55
|
+
# Iceberg schema for the "clickstream" table.
# Rows are (field id, column name, column type, required).
CLICKSTREAM_SCHEMA = Schema(
    *(
        NestedField(field_id=column_id, name=column, field_type=column_type, required=mandatory)
        for column_id, column, column_type, mandatory in (
            (1, "event_id", StringType(), True),
            (2, "event_type", StringType(), True),
            (3, "session_id", StringType(), True),
            (4, "user_id", StringType(), False),
            (5, "page_url", StringType(), True),
            (6, "referrer", StringType(), False),
            (7, "user_agent", StringType(), True),
            (8, "viewport_width", IntegerType(), False),
            (9, "product_id", StringType(), False),
            (10, "is_mobile", BooleanType(), False),
            (11, "page_category", StringType(), False),
            (12, "is_product_page", BooleanType(), False),
            (13, "created_at", TimestampType(), True),
            (14, "payload_json", StringType(), True),
        )
    )
)
|
|
71
|
+
|
|
72
|
+
# Iceberg schema for the "inventory" table; every column is required.
# Rows are (field id, column name, column type).
INVENTORY_SCHEMA = Schema(
    *(
        NestedField(field_id=column_id, name=column, field_type=column_type, required=True)
        for column_id, column, column_type in (
            (1, "event_id", StringType()),
            (2, "event_type", StringType()),
            (3, "product_id", StringType()),
            (4, "name", StringType()),
            (5, "category", StringType()),
            (6, "price", DoubleType()),
            (7, "currency", StringType()),
            (8, "in_stock", BooleanType()),
            (9, "stock_quantity", IntegerType()),
            (10, "created_at", TimestampType()),
            (11, "payload_json", StringType()),
        )
    )
)
|
|
85
|
+
|
|
86
|
+
# Iceberg schema for the "dead_letter" table.
# Rows are (field id, column name, column type, required); the event identity
# columns are optional.
DEAD_LETTER_SCHEMA = Schema(
    *(
        NestedField(field_id=column_id, name=column, field_type=column_type, required=mandatory)
        for column_id, column, column_type, mandatory in (
            (1, "event_id", StringType(), False),
            (2, "event_type", StringType(), False),
            (3, "reason", StringType(), True),
            (4, "source_topic", StringType(), True),
            (5, "received_at", TimestampType(), True),
            (6, "payload_json", StringType(), True),
        )
    )
)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
class IcebergSink:
|
|
97
|
+
def __init__(self, config_path: str | Path = "config/iceberg.yaml"):
|
|
98
|
+
self.config_path = Path(config_path)
|
|
99
|
+
config = yaml.safe_load(self.config_path.read_text(encoding="utf-8")) or {}
|
|
100
|
+
self._config = config["iceberg"]
|
|
101
|
+
self.namespace = self._config["namespace"]
|
|
102
|
+
self.table_configs = {table["name"]: table for table in self._config["tables"]}
|
|
103
|
+
catalog_type = self._config["catalog_type"]
|
|
104
|
+
catalog_properties = {
|
|
105
|
+
"type": catalog_type,
|
|
106
|
+
"uri": self._resolve_catalog_uri(self._config["catalog_uri"]),
|
|
107
|
+
"warehouse": self._resolve_warehouse(
|
|
108
|
+
self._config["warehouse"],
|
|
109
|
+
catalog_type,
|
|
110
|
+
),
|
|
111
|
+
}
|
|
112
|
+
catalog_properties.update(self._config.get("catalog_properties", {}))
|
|
113
|
+
self.catalog = load_catalog(
|
|
114
|
+
self._config.get("catalog_name", "agentflow"),
|
|
115
|
+
**catalog_properties,
|
|
116
|
+
)
|
|
117
|
+
self.catalog.create_namespace_if_not_exists(self.namespace)
|
|
118
|
+
|
|
119
|
+
def create_tables_if_not_exist(self) -> None:
|
|
120
|
+
for table_name in self.table_configs:
|
|
121
|
+
identifier = self._identifier(table_name)
|
|
122
|
+
if self.catalog.table_exists(identifier):
|
|
123
|
+
continue
|
|
124
|
+
self.catalog.create_table(
|
|
125
|
+
identifier,
|
|
126
|
+
schema=self._schema_for_table(table_name),
|
|
127
|
+
partition_spec=self._partition_spec_for_table(table_name),
|
|
128
|
+
)
|
|
129
|
+
|
|
130
|
+
def write_batch(self, table_name: str, records: list[dict[str, Any]]) -> int:
|
|
131
|
+
if not records:
|
|
132
|
+
return 0
|
|
133
|
+
self.create_tables_if_not_exist()
|
|
134
|
+
table = self.catalog.load_table(self._identifier(table_name))
|
|
135
|
+
normalized_records = [self._normalize_record(table_name, record) for record in records]
|
|
136
|
+
arrow_table = pa.Table.from_pylist(
|
|
137
|
+
normalized_records,
|
|
138
|
+
schema=table.schema().as_arrow(),
|
|
139
|
+
)
|
|
140
|
+
table.append(arrow_table)
|
|
141
|
+
return len(normalized_records)
|
|
142
|
+
|
|
143
|
+
def row_counts(self) -> dict[str, int]:
|
|
144
|
+
counts: dict[str, int] = {}
|
|
145
|
+
for table_name in self.table_configs:
|
|
146
|
+
identifier = self._identifier(table_name)
|
|
147
|
+
if not self.catalog.table_exists(identifier):
|
|
148
|
+
counts[table_name] = 0
|
|
149
|
+
continue
|
|
150
|
+
table = self.catalog.load_table(identifier)
|
|
151
|
+
counts[table_name] = table.scan().count()
|
|
152
|
+
return counts
|
|
153
|
+
|
|
154
|
+
def _identifier(self, table_name: str) -> tuple[str, str]:
|
|
155
|
+
return self.namespace, table_name
|
|
156
|
+
|
|
157
|
+
def _resolve_catalog_uri(self, value: str) -> str:
|
|
158
|
+
prefix = "sqlite:///"
|
|
159
|
+
if not value.startswith(prefix):
|
|
160
|
+
return value
|
|
161
|
+
raw_path = value[len(prefix) :]
|
|
162
|
+
catalog_path = Path(raw_path)
|
|
163
|
+
if not catalog_path.is_absolute():
|
|
164
|
+
catalog_path = (self.config_path.parent / catalog_path).resolve()
|
|
165
|
+
catalog_path.parent.mkdir(parents=True, exist_ok=True)
|
|
166
|
+
return f"{prefix}{catalog_path.as_posix()}"
|
|
167
|
+
|
|
168
|
+
def _resolve_warehouse(self, value: str, catalog_type: str) -> str:
|
|
169
|
+
if catalog_type == "rest":
|
|
170
|
+
return value
|
|
171
|
+
if "://" in value or value.startswith("file:"):
|
|
172
|
+
return value
|
|
173
|
+
warehouse_path = Path(value)
|
|
174
|
+
if not warehouse_path.is_absolute():
|
|
175
|
+
warehouse_path = (self.config_path.parent / warehouse_path).resolve()
|
|
176
|
+
warehouse_path.mkdir(parents=True, exist_ok=True)
|
|
177
|
+
if os.name == "nt":
|
|
178
|
+
return f"file:{warehouse_path.as_posix()}"
|
|
179
|
+
return warehouse_path.as_posix()
|
|
180
|
+
|
|
181
|
+
def _schema_for_table(self, table_name: str) -> Schema:
|
|
182
|
+
schemas = {
|
|
183
|
+
"orders": ORDERS_SCHEMA,
|
|
184
|
+
"payments": PAYMENTS_SCHEMA,
|
|
185
|
+
"clickstream": CLICKSTREAM_SCHEMA,
|
|
186
|
+
"inventory": INVENTORY_SCHEMA,
|
|
187
|
+
"dead_letter": DEAD_LETTER_SCHEMA,
|
|
188
|
+
}
|
|
189
|
+
return schemas[table_name]
|
|
190
|
+
|
|
191
|
+
def _partition_spec_for_table(self, table_name: str) -> PartitionSpec:
|
|
192
|
+
schema = self._schema_for_table(table_name)
|
|
193
|
+
fields: list[PartitionField] = []
|
|
194
|
+
for index, expression in enumerate(
|
|
195
|
+
self.table_configs[table_name].get("partition_by", []),
|
|
196
|
+
start=1,
|
|
197
|
+
):
|
|
198
|
+
source_name: str
|
|
199
|
+
transform: DayTransform | HourTransform
|
|
200
|
+
if expression.startswith("days(") and expression.endswith(")"):
|
|
201
|
+
source_name = expression[5:-1]
|
|
202
|
+
transform = DayTransform()
|
|
203
|
+
suffix = "day"
|
|
204
|
+
elif expression.startswith("hours(") and expression.endswith(")"):
|
|
205
|
+
source_name = expression[6:-1]
|
|
206
|
+
transform = HourTransform()
|
|
207
|
+
suffix = "hour"
|
|
208
|
+
else:
|
|
209
|
+
msg = f"Unsupported partition transform: {expression}"
|
|
210
|
+
raise ValueError(msg)
|
|
211
|
+
source_field = schema.find_field(source_name)
|
|
212
|
+
fields.append(
|
|
213
|
+
PartitionField(
|
|
214
|
+
source_id=source_field.field_id,
|
|
215
|
+
field_id=1000 + index,
|
|
216
|
+
transform=transform,
|
|
217
|
+
name=f"{source_name}_{suffix}",
|
|
218
|
+
)
|
|
219
|
+
)
|
|
220
|
+
return PartitionSpec(*fields)
|
|
221
|
+
|
|
222
|
+
def _normalize_record(self, table_name: str, record: dict[str, Any]) -> dict[str, Any]:
|
|
223
|
+
if table_name == "orders":
|
|
224
|
+
return self._normalize_order(record)
|
|
225
|
+
if table_name == "payments":
|
|
226
|
+
return self._normalize_payment(record)
|
|
227
|
+
if table_name == "clickstream":
|
|
228
|
+
return self._normalize_clickstream(record)
|
|
229
|
+
if table_name == "inventory":
|
|
230
|
+
return self._normalize_inventory(record)
|
|
231
|
+
if table_name == "dead_letter":
|
|
232
|
+
return self._normalize_dead_letter(record)
|
|
233
|
+
msg = f"Unsupported table: {table_name}"
|
|
234
|
+
raise ValueError(msg)
|
|
235
|
+
|
|
236
|
+
def _normalize_order(self, record: dict[str, Any]) -> dict[str, Any]:
|
|
237
|
+
derived = record.get("_derived", {})
|
|
238
|
+
items = record.get("items", [])
|
|
239
|
+
return {
|
|
240
|
+
"event_id": str(record["event_id"]),
|
|
241
|
+
"event_type": str(record["event_type"]),
|
|
242
|
+
"order_id": str(record["order_id"]),
|
|
243
|
+
"user_id": str(record["user_id"]),
|
|
244
|
+
"status": str(record["status"]),
|
|
245
|
+
"total_amount": float(record["total_amount"]),
|
|
246
|
+
"currency": str(record.get("currency", "USD")),
|
|
247
|
+
"item_count": int(
|
|
248
|
+
derived.get(
|
|
249
|
+
"item_count",
|
|
250
|
+
sum(item.get("quantity", 0) for item in items),
|
|
251
|
+
)
|
|
252
|
+
),
|
|
253
|
+
"unique_products": int(
|
|
254
|
+
derived.get(
|
|
255
|
+
"unique_products",
|
|
256
|
+
len({item.get("product_id") for item in items if item.get("product_id")}),
|
|
257
|
+
)
|
|
258
|
+
),
|
|
259
|
+
"order_size_bucket": str(derived.get("order_size_bucket", "unknown")),
|
|
260
|
+
"created_at": self._coerce_timestamp(record.get("timestamp")),
|
|
261
|
+
"payload_json": self._dump_payload(record),
|
|
262
|
+
}
|
|
263
|
+
|
|
264
|
+
def _normalize_payment(self, record: dict[str, Any]) -> dict[str, Any]:
|
|
265
|
+
derived = record.get("_derived", {})
|
|
266
|
+
return {
|
|
267
|
+
"event_id": str(record["event_id"]),
|
|
268
|
+
"event_type": str(record["event_type"]),
|
|
269
|
+
"payment_id": str(record["payment_id"]),
|
|
270
|
+
"order_id": str(record["order_id"]),
|
|
271
|
+
"user_id": str(record["user_id"]),
|
|
272
|
+
"amount": float(record["amount"]),
|
|
273
|
+
"currency": str(record.get("currency", "USD")),
|
|
274
|
+
"method": str(record["method"]),
|
|
275
|
+
"status": str(record["status"]),
|
|
276
|
+
"risk_score": (float(derived["risk_score"]) if "risk_score" in derived else None),
|
|
277
|
+
"risk_level": (str(derived["risk_level"]) if "risk_level" in derived else None),
|
|
278
|
+
"created_at": self._coerce_timestamp(record.get("timestamp")),
|
|
279
|
+
"payload_json": self._dump_payload(record),
|
|
280
|
+
}
|
|
281
|
+
|
|
282
|
+
def _normalize_clickstream(self, record: dict[str, Any]) -> dict[str, Any]:
|
|
283
|
+
derived = record.get("_derived", {})
|
|
284
|
+
return {
|
|
285
|
+
"event_id": str(record["event_id"]),
|
|
286
|
+
"event_type": str(record["event_type"]),
|
|
287
|
+
"session_id": str(record["session_id"]),
|
|
288
|
+
"user_id": (str(record["user_id"]) if record.get("user_id") is not None else None),
|
|
289
|
+
"page_url": str(record["page_url"]),
|
|
290
|
+
"referrer": (str(record["referrer"]) if record.get("referrer") is not None else None),
|
|
291
|
+
"user_agent": str(record["user_agent"]),
|
|
292
|
+
"viewport_width": (
|
|
293
|
+
int(record["viewport_width"]) if record.get("viewport_width") is not None else None
|
|
294
|
+
),
|
|
295
|
+
"product_id": (
|
|
296
|
+
str(record["product_id"]) if record.get("product_id") is not None else None
|
|
297
|
+
),
|
|
298
|
+
"is_mobile": derived.get("is_mobile"),
|
|
299
|
+
"page_category": derived.get("page_category"),
|
|
300
|
+
"is_product_page": derived.get("is_product_page"),
|
|
301
|
+
"created_at": self._coerce_timestamp(record.get("timestamp")),
|
|
302
|
+
"payload_json": self._dump_payload(record),
|
|
303
|
+
}
|
|
304
|
+
|
|
305
|
+
def _normalize_inventory(self, record: dict[str, Any]) -> dict[str, Any]:
|
|
306
|
+
return {
|
|
307
|
+
"event_id": str(record["event_id"]),
|
|
308
|
+
"event_type": str(record["event_type"]),
|
|
309
|
+
"product_id": str(record["product_id"]),
|
|
310
|
+
"name": str(record["name"]),
|
|
311
|
+
"category": str(record["category"]),
|
|
312
|
+
"price": float(record["price"]),
|
|
313
|
+
"currency": str(record.get("currency", "USD")),
|
|
314
|
+
"in_stock": bool(record["in_stock"]),
|
|
315
|
+
"stock_quantity": int(record["stock_quantity"]),
|
|
316
|
+
"created_at": self._coerce_timestamp(record.get("timestamp")),
|
|
317
|
+
"payload_json": self._dump_payload(record),
|
|
318
|
+
}
|
|
319
|
+
|
|
320
|
+
def _normalize_dead_letter(self, record: dict[str, Any]) -> dict[str, Any]:
|
|
321
|
+
return {
|
|
322
|
+
"event_id": (str(record["event_id"]) if record.get("event_id") is not None else None),
|
|
323
|
+
"event_type": (
|
|
324
|
+
str(record["event_type"]) if record.get("event_type") is not None else None
|
|
325
|
+
),
|
|
326
|
+
"reason": str(record["reason"]),
|
|
327
|
+
"source_topic": str(record.get("source_topic", "events.deadletter")),
|
|
328
|
+
"received_at": self._coerce_timestamp(record.get("received_at", datetime.now(UTC))),
|
|
329
|
+
"payload_json": (
|
|
330
|
+
str(record["payload_json"])
|
|
331
|
+
if "payload_json" in record
|
|
332
|
+
else self._dump_payload(record.get("payload", record))
|
|
333
|
+
),
|
|
334
|
+
}
|
|
335
|
+
|
|
336
|
+
def _coerce_timestamp(self, value: Any) -> datetime:
|
|
337
|
+
if isinstance(value, datetime):
|
|
338
|
+
timestamp = value
|
|
339
|
+
elif value is None:
|
|
340
|
+
timestamp = datetime.now(UTC)
|
|
341
|
+
else:
|
|
342
|
+
timestamp = datetime.fromisoformat(str(value).replace("Z", "+00:00"))
|
|
343
|
+
if timestamp.tzinfo is None:
|
|
344
|
+
return timestamp
|
|
345
|
+
return timestamp.astimezone(UTC).replace(tzinfo=None)
|
|
346
|
+
|
|
347
|
+
def _dump_payload(self, payload: Any) -> str:
|
|
348
|
+
return json.dumps(payload, default=str, sort_keys=True)
|