agentflow-runtime 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100) hide show
  1. agentflow_runtime-1.1.0.dist-info/METADATA +55 -0
  2. agentflow_runtime-1.1.0.dist-info/RECORD +100 -0
  3. agentflow_runtime-1.1.0.dist-info/WHEEL +4 -0
  4. agentflow_runtime-1.1.0.dist-info/licenses/LICENSE +21 -0
  5. src/__init__.py +0 -0
  6. src/constants.py +3 -0
  7. src/ingestion/__init__.py +0 -0
  8. src/ingestion/cdc/__init__.py +5 -0
  9. src/ingestion/cdc/normalizer.py +186 -0
  10. src/ingestion/connectors/__init__.py +0 -0
  11. src/ingestion/connectors/mysql_cdc.py +63 -0
  12. src/ingestion/connectors/postgres_cdc.py +68 -0
  13. src/ingestion/producers/__init__.py +0 -0
  14. src/ingestion/producers/event_producer.py +237 -0
  15. src/ingestion/schemas/__init__.py +0 -0
  16. src/ingestion/schemas/events.py +147 -0
  17. src/ingestion/tenant_router.py +80 -0
  18. src/logger.py +41 -0
  19. src/orchestration/__init__.py +0 -0
  20. src/orchestration/dags/__init__.py +0 -0
  21. src/orchestration/dags/daily_batch.py +201 -0
  22. src/processing/__init__.py +0 -0
  23. src/processing/event_replayer.py +250 -0
  24. src/processing/flink_jobs/Dockerfile +55 -0
  25. src/processing/flink_jobs/__init__.py +0 -0
  26. src/processing/flink_jobs/checkpointing.py +32 -0
  27. src/processing/flink_jobs/session_aggregation.py +212 -0
  28. src/processing/flink_jobs/session_aggregator.py +199 -0
  29. src/processing/flink_jobs/stream_processor.py +316 -0
  30. src/processing/iceberg_sink.py +348 -0
  31. src/processing/local_pipeline.py +452 -0
  32. src/processing/outbox.py +273 -0
  33. src/processing/tracing.py +36 -0
  34. src/processing/transformations/__init__.py +0 -0
  35. src/processing/transformations/enrichment.py +125 -0
  36. src/quality/__init__.py +0 -0
  37. src/quality/monitors/__init__.py +0 -0
  38. src/quality/monitors/freshness_monitor.py +166 -0
  39. src/quality/monitors/metrics_collector.py +367 -0
  40. src/quality/validators/__init__.py +0 -0
  41. src/quality/validators/schema_validator.py +119 -0
  42. src/quality/validators/semantic_validator.py +202 -0
  43. src/serving/__init__.py +0 -0
  44. src/serving/api/__init__.py +0 -0
  45. src/serving/api/alert_dispatcher.py +51 -0
  46. src/serving/api/alerts/__init__.py +38 -0
  47. src/serving/api/alerts/dispatcher.py +299 -0
  48. src/serving/api/alerts/escalation.py +290 -0
  49. src/serving/api/alerts/evaluator.py +81 -0
  50. src/serving/api/alerts/history.py +115 -0
  51. src/serving/api/analytics.py +543 -0
  52. src/serving/api/auth/__init__.py +46 -0
  53. src/serving/api/auth/key_rotation.py +400 -0
  54. src/serving/api/auth/manager.py +406 -0
  55. src/serving/api/auth/middleware.py +331 -0
  56. src/serving/api/main.py +390 -0
  57. src/serving/api/middleware/logging.py +41 -0
  58. src/serving/api/middleware/tracing.py +51 -0
  59. src/serving/api/rate_limiter.py +76 -0
  60. src/serving/api/routers/__init__.py +0 -0
  61. src/serving/api/routers/admin.py +150 -0
  62. src/serving/api/routers/admin_ui.py +93 -0
  63. src/serving/api/routers/agent_query.py +639 -0
  64. src/serving/api/routers/alerts.py +134 -0
  65. src/serving/api/routers/batch.py +231 -0
  66. src/serving/api/routers/contracts.py +98 -0
  67. src/serving/api/routers/deadletter.py +337 -0
  68. src/serving/api/routers/lineage.py +218 -0
  69. src/serving/api/routers/search.py +103 -0
  70. src/serving/api/routers/slo.py +231 -0
  71. src/serving/api/routers/stream.py +141 -0
  72. src/serving/api/routers/webhooks.py +93 -0
  73. src/serving/api/security.py +83 -0
  74. src/serving/api/telemetry.py +66 -0
  75. src/serving/api/templates/admin.html +214 -0
  76. src/serving/api/versioning.py +328 -0
  77. src/serving/api/webhook_dispatcher.py +423 -0
  78. src/serving/backends/__init__.py +117 -0
  79. src/serving/backends/clickhouse_backend.py +310 -0
  80. src/serving/backends/duckdb_backend.py +268 -0
  81. src/serving/cache.py +169 -0
  82. src/serving/db_pool.py +105 -0
  83. src/serving/masking.py +122 -0
  84. src/serving/semantic_layer/__init__.py +0 -0
  85. src/serving/semantic_layer/catalog.py +177 -0
  86. src/serving/semantic_layer/contract_registry.py +258 -0
  87. src/serving/semantic_layer/entity_type_registry.py +107 -0
  88. src/serving/semantic_layer/nl_engine.py +189 -0
  89. src/serving/semantic_layer/query/__init__.py +3 -0
  90. src/serving/semantic_layer/query/contracts.py +47 -0
  91. src/serving/semantic_layer/query/engine.py +81 -0
  92. src/serving/semantic_layer/query/entity_queries.py +221 -0
  93. src/serving/semantic_layer/query/metric_queries.py +84 -0
  94. src/serving/semantic_layer/query/nl_queries.py +305 -0
  95. src/serving/semantic_layer/query/sql_builder.py +113 -0
  96. src/serving/semantic_layer/query/sql_guard.py +3 -0
  97. src/serving/semantic_layer/query_engine.py +5 -0
  98. src/serving/semantic_layer/schema_evolution.py +175 -0
  99. src/serving/semantic_layer/search_index.py +337 -0
  100. src/serving/semantic_layer/sql_guard.py +56 -0
@@ -0,0 +1,316 @@
1
+ """Core Flink streaming job: validates, enriches, and routes events.
2
+
3
+ Pipeline: Kafka source → Schema validation → Enrichment → Deduplication → Iceberg sink
4
+ Invalid events are routed to a dead letter topic with error metadata.
5
+
6
+ This is the main entry point for the Flink cluster. Submit with:
7
+ flink run -py stream_processor.py
8
+ """
9
+
10
+ import json
11
+ import os
12
+ from datetime import timedelta
13
+
14
+ from pyflink.common import Types, WatermarkStrategy
15
+ from pyflink.common.serialization import SimpleStringSchema
16
+ from pyflink.common.watermark_strategy import TimestampAssigner
17
+ from pyflink.datastream import StreamExecutionEnvironment
18
+ from pyflink.datastream.connectors.kafka import (
19
+ KafkaOffsetsInitializer,
20
+ KafkaRecordSerializationSchema,
21
+ KafkaSink,
22
+ KafkaSource,
23
+ )
24
+ from pyflink.datastream.functions import MapFunction, ProcessFunction
25
+ from pyflink.datastream.output_tag import OutputTag
26
+
27
# Side output for invalid events. Records emitted under this tag are JSON
# strings (hence Types.STRING()).
DEAD_LETTER_TAG = OutputTag("dead-letter", Types.STRING())
29
+
30
+
31
+ def _event_tenant(event: dict) -> str:
32
+ source_metadata = event.get("source_metadata", {})
33
+ metadata_tenant = source_metadata.get("tenant") if isinstance(source_metadata, dict) else None
34
+ tenant = event.get("tenant") or metadata_tenant
35
+ return str(tenant) if tenant else "default"
36
+
37
+
38
class EventTimestampAssigner(TimestampAssigner):
    """Extracts event_time from the JSON payload for watermark generation.

    Any payload whose timestamp cannot be extracted falls back to the
    ``record_timestamp`` passed in, so a single malformed record never raises
    out of the assigner.
    """

    def extract_timestamp(self, value, record_timestamp):
        """Return the event time of *value* in epoch milliseconds.

        Args:
            value: Raw JSON string of the event.
            record_timestamp: Fallback timestamp returned unchanged when the
                payload carries no usable ``timestamp`` field.

        Returns:
            Milliseconds since the epoch parsed from the event's ISO-8601
            ``timestamp`` (naive values are treated as UTC), or
            ``record_timestamp`` on any parse failure.
        """
        from datetime import UTC, datetime

        from src.ingestion.cdc.normalizer import is_debezium_event, normalize_debezium_event

        try:
            event = json.loads(value)
            if is_debezium_event(event):
                # Best-effort topic for tenant resolution: the payload only
                # carries a topic if an upstream wrapper populated
                # `event["topic"]`; otherwise None is passed through.
                event = normalize_debezium_event(event, topic=event.get("topic"))
            ts = datetime.fromisoformat(event["timestamp"])
            if ts.tzinfo is None:
                ts = ts.replace(tzinfo=UTC)
            return int(ts.timestamp() * 1000)
        except (json.JSONDecodeError, KeyError, ValueError, TypeError, AttributeError):
            # TypeError / AttributeError cover valid JSON that is not an
            # object (list, number, null) and non-string timestamp values.
            return record_timestamp
60
+
61
+
62
class ValidateAndEnrich(ProcessFunction):
    """Validates, enriches, and routes events using the shared quality layer.

    Pipeline per event:
    1. Parse JSON
    2. Schema validation via quality.validators.schema_validator
    3. Semantic validation via quality.validators.semantic_validator
    4. Domain enrichment via processing.transformations.enrichment
    5. Processing metadata (latency, version)

    Invalid events (parse, CDC normalization, schema or semantic errors) are
    emitted on the ``DEAD_LETTER_TAG`` side output as JSON strings carrying
    ``error`` and ``stage`` fields.
    """

    def process_element(self, value, ctx: ProcessFunction.Context):
        """Process one raw JSON string.

        Yields either the enriched event as a JSON string on the main output,
        or a ``(DEAD_LETTER_TAG, json_string)`` pair on the side output.
        ``ctx`` is part of the ProcessFunction signature and is not used.
        """
        from datetime import UTC, datetime

        from src.ingestion.cdc.normalizer import is_debezium_event, normalize_debezium_event
        from src.processing.transformations.enrichment import (
            compute_payment_risk_score,
            enrich_clickstream,
            enrich_order,
        )
        from src.quality.validators.schema_validator import validate_event
        from src.quality.validators.semantic_validator import validate_semantics

        # PyFlink emits side-output records by yielding (OutputTag, value);
        # ProcessFunction.Context does not provide an output() method.

        # 1. Parse JSON
        try:
            event = json.loads(value)
        except json.JSONDecodeError as e:
            yield (
                DEAD_LETTER_TAG,
                json.dumps(
                    {
                        "raw": value[:1000],
                        "error": f"JSON parse error: {e}",
                        "stage": "parse",
                    }
                ),
            )
            return

        # Valid JSON that is not an object (list, number, null, ...) cannot be
        # processed by the dict-based steps below; dead-letter it instead of
        # failing on `.get`.
        if not isinstance(event, dict):
            yield (
                DEAD_LETTER_TAG,
                json.dumps(
                    {
                        "raw": value[:1000],
                        "error": (
                            "JSON parse error: expected an object, "
                            f"got {type(event).__name__}"
                        ),
                        "stage": "parse",
                    }
                ),
            )
            return

        try:
            if is_debezium_event(event):
                # The topic is only available if an upstream wrapper injected
                # `event["topic"]`; otherwise None is passed to the normalizer.
                event = normalize_debezium_event(event, topic=event.get("topic"))
        except ValueError as e:
            yield (
                DEAD_LETTER_TAG,
                json.dumps(
                    {
                        "raw": value[:1000],
                        "error": str(e),
                        "stage": "cdc_normalization",
                    }
                ),
            )
            return

        event_id = event.get("event_id", "unknown")
        event_type = event.get("event_type", "unknown")
        event["tenant"] = _event_tenant(event)
        is_cdc_event = event.get("source") in {"postgres_cdc", "mysql_cdc"} and "operation" in event

        # 2. Schema validation (Pydantic models)
        schema_result = validate_event(event)
        if not schema_result.is_valid:
            yield (
                DEAD_LETTER_TAG,
                json.dumps(
                    {
                        "event_id": event_id,
                        "error": schema_result.errors,
                        "stage": "schema_validation",
                    }
                ),
            )
            return

        # 3. Semantic validation (business rules)
        semantic_result = validate_semantics(event)
        if not semantic_result.is_clean:
            # Only issues with severity "error" reject the event; other
            # severities let it continue through the pipeline.
            error_issues = [
                i.to_dict()
                if hasattr(i, "to_dict")
                else {
                    "rule": i.rule,
                    "severity": i.severity,
                    "field": i.field,
                    "message": i.message,
                }
                for i in semantic_result.issues
                if i.severity == "error"
            ]
            if error_issues:
                yield (
                    DEAD_LETTER_TAG,
                    json.dumps(
                        {
                            "event_id": event_id,
                            "error": error_issues,
                            "stage": "semantic_validation",
                        }
                    ),
                )
                return

        # 4. Domain enrichment by event type (CDC events pass through as-is)
        if is_cdc_event:
            pass
        elif event_type.startswith("order."):
            event = enrich_order(event)
        elif event_type in ("click", "page_view", "add_to_cart"):
            event = enrich_clickstream(event)
        elif event_type.startswith("payment."):
            event = compute_payment_risk_score(event)

        # 5. Processing metadata
        now = datetime.now(UTC)
        try:
            event_ts = datetime.fromisoformat(event["timestamp"])
            if event_ts.tzinfo is None:
                event_ts = event_ts.replace(tzinfo=UTC)
            latency_ms = int((now - event_ts).total_seconds() * 1000)
        except (KeyError, ValueError, TypeError):
            # -1 marks "latency unknown" (missing or unparsable timestamp).
            latency_ms = -1

        event["_enriched"] = {
            "processing_time": now.isoformat(),
            "pipeline_latency_ms": latency_ms,
            "processor_version": "1.0.0",
        }

        event["_partition_key"] = (
            event.get("user_id")
            or event.get("order_id")
            or event.get("entity_id")
            or event["event_id"]
        )

        yield json.dumps(event)
205
+
206
+
207
class DeduplicateByEventId(MapFunction):
    """Drops repeated events by tracking a per-key "seen" flag with a TTL.

    The first value observed for a key is passed through and the flag is
    set; while the flag is still alive (10 minutes from creation/write) any
    further value for that key maps to ``None``.
    """

    def open(self, runtime_context):
        """Register the TTL-backed boolean ``seen`` state."""
        from pyflink.datastream.state import StateTtlConfig, ValueStateDescriptor

        # NOTE(review): confirm that the pinned PyFlink version accepts a
        # datetime.timedelta here rather than its own Time/Duration type.
        ttl_builder = StateTtlConfig.new_builder(timedelta(minutes=10))
        ttl_builder = ttl_builder.set_update_type(StateTtlConfig.UpdateType.OnCreateAndWrite)

        descriptor = ValueStateDescriptor("seen", Types.BOOLEAN())
        descriptor.enable_time_to_live(ttl_builder.build())
        self.seen_state = runtime_context.get_state(descriptor)

    def map(self, value):
        """Return *value* the first time its key is seen, else ``None``."""
        already_seen = self.seen_state.value()
        if not already_seen:
            self.seen_state.update(True)
            return value
        return None  # duplicate
232
+
233
+
234
def build_pipeline():
    """Assemble the streaming job and return its execution environment.

    Topology: multi-topic Kafka source -> ValidateAndEnrich (dead-letter side
    output -> ``events.deadletter``) -> key by ``event_id`` ->
    DeduplicateByEventId -> ``events.validated``.

    Environment variables:
        FLINK_PARALLELISM: job parallelism (default ``2``).
        KAFKA_BOOTSTRAP_SERVERS: Kafka brokers (default ``localhost:9092``).

    Returns:
        The configured ``StreamExecutionEnvironment``; the caller is
        responsible for calling ``execute``.
    """
    # WatermarkStrategy.for_bounded_out_of_orderness takes a PyFlink Duration,
    # not a datetime.timedelta.
    from pyflink.common import Duration

    env = StreamExecutionEnvironment.get_execution_environment()

    # Checkpoint every 30s with at least 10s between checkpoints.
    env.enable_checkpointing(30_000)  # 30s
    env.get_checkpoint_config().set_min_pause_between_checkpoints(10_000)
    env.set_parallelism(int(os.getenv("FLINK_PARALLELISM", "2")))

    bootstrap_servers = os.getenv("KAFKA_BOOTSTRAP_SERVERS", "localhost:9092")

    # Multi-topic Kafka source
    source = (
        KafkaSource.builder()
        .set_bootstrap_servers(bootstrap_servers)
        .set_topics(
            "orders.raw",
            "payments.raw",
            "clicks.raw",
            "products.cdc",
            "cdc.postgres.public.orders_v2",
            "cdc.postgres.public.users_enriched",
            "cdc.mysql.agentflow_demo.products_current",
            "cdc.mysql.agentflow_demo.sessions_aggregated",
        )
        .set_group_id("agentflow-stream-processor")
        .set_starting_offsets(KafkaOffsetsInitializer.earliest())
        .set_value_only_deserializer(SimpleStringSchema())
        .build()
    )

    # Tolerate events arriving up to 5 seconds out of order.
    watermark_strategy = WatermarkStrategy.for_bounded_out_of_orderness(
        Duration.of_seconds(5)
    ).with_timestamp_assigner(EventTimestampAssigner())

    # Main pipeline
    stream = env.from_source(source, watermark_strategy, "kafka-source")

    # Validate + enrich (with dead letter side output)
    validated = stream.process(ValidateAndEnrich(), output_type=Types.STRING())

    # Dead letter sink
    dead_letter_sink = (
        KafkaSink.builder()
        .set_bootstrap_servers(bootstrap_servers)
        .set_record_serializer(
            KafkaRecordSerializationSchema.builder()
            .set_topic("events.deadletter")
            .set_value_serialization_schema(SimpleStringSchema())
            .build()
        )
        .build()
    )

    validated.get_side_output(DEAD_LETTER_TAG).sink_to(dead_letter_sink)

    # Deduplicate by event_id; DeduplicateByEventId maps duplicates to None,
    # which the filter then removes.
    deduped = (
        validated.key_by(lambda x: json.loads(x).get("event_id", ""))
        .map(DeduplicateByEventId(), output_type=Types.STRING())
        .filter(lambda x: x is not None)
    )

    # Validated events sink (for downstream consumers)
    validated_sink = (
        KafkaSink.builder()
        .set_bootstrap_servers(bootstrap_servers)
        .set_record_serializer(
            KafkaRecordSerializationSchema.builder()
            .set_topic("events.validated")
            .set_value_serialization_schema(SimpleStringSchema())
            .build()
        )
        .build()
    )

    deduped.sink_to(validated_sink)

    return env
312
+
313
+
314
if __name__ == "__main__":
    # Script entry point: build the job graph and run it under the job name
    # "agentflow-stream-processor".
    build_pipeline().execute("agentflow-stream-processor")
@@ -0,0 +1,348 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import os
5
+ from datetime import UTC, datetime
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+ import pyarrow as pa
10
+ import yaml # type: ignore[import-untyped]
11
+ from pyiceberg.catalog import load_catalog
12
+ from pyiceberg.partitioning import PartitionField, PartitionSpec
13
+ from pyiceberg.schema import Schema
14
+ from pyiceberg.transforms import DayTransform, HourTransform
15
+ from pyiceberg.types import (
16
+ BooleanType,
17
+ DoubleType,
18
+ IntegerType,
19
+ NestedField,
20
+ StringType,
21
+ TimestampType,
22
+ )
23
+
24
# Iceberg table schemas. Field ids are local to each schema and start at 1;
# every schema ends with a timestamp column followed by `payload_json`.

# Order events.
ORDERS_SCHEMA = Schema(
    NestedField(1, "event_id", StringType(), required=True),
    NestedField(2, "event_type", StringType(), required=True),
    NestedField(3, "order_id", StringType(), required=True),
    NestedField(4, "user_id", StringType(), required=True),
    NestedField(5, "status", StringType(), required=True),
    NestedField(6, "total_amount", DoubleType(), required=True),
    NestedField(7, "currency", StringType(), required=True),
    NestedField(8, "item_count", IntegerType(), required=True),
    NestedField(9, "unique_products", IntegerType(), required=True),
    NestedField(10, "order_size_bucket", StringType(), required=True),
    NestedField(11, "created_at", TimestampType(), required=True),
    NestedField(12, "payload_json", StringType(), required=True),
)

# Payment events; the risk columns are optional.
PAYMENTS_SCHEMA = Schema(
    NestedField(1, "event_id", StringType(), required=True),
    NestedField(2, "event_type", StringType(), required=True),
    NestedField(3, "payment_id", StringType(), required=True),
    NestedField(4, "order_id", StringType(), required=True),
    NestedField(5, "user_id", StringType(), required=True),
    NestedField(6, "amount", DoubleType(), required=True),
    NestedField(7, "currency", StringType(), required=True),
    NestedField(8, "method", StringType(), required=True),
    NestedField(9, "status", StringType(), required=True),
    NestedField(10, "risk_score", DoubleType(), required=False),
    NestedField(11, "risk_level", StringType(), required=False),
    NestedField(12, "created_at", TimestampType(), required=True),
    NestedField(13, "payload_json", StringType(), required=True),
)

# Clickstream events; user, referrer, viewport, product and derived flags
# are optional.
CLICKSTREAM_SCHEMA = Schema(
    NestedField(1, "event_id", StringType(), required=True),
    NestedField(2, "event_type", StringType(), required=True),
    NestedField(3, "session_id", StringType(), required=True),
    NestedField(4, "user_id", StringType(), required=False),
    NestedField(5, "page_url", StringType(), required=True),
    NestedField(6, "referrer", StringType(), required=False),
    NestedField(7, "user_agent", StringType(), required=True),
    NestedField(8, "viewport_width", IntegerType(), required=False),
    NestedField(9, "product_id", StringType(), required=False),
    NestedField(10, "is_mobile", BooleanType(), required=False),
    NestedField(11, "page_category", StringType(), required=False),
    NestedField(12, "is_product_page", BooleanType(), required=False),
    NestedField(13, "created_at", TimestampType(), required=True),
    NestedField(14, "payload_json", StringType(), required=True),
)

# Product / inventory events.
INVENTORY_SCHEMA = Schema(
    NestedField(1, "event_id", StringType(), required=True),
    NestedField(2, "event_type", StringType(), required=True),
    NestedField(3, "product_id", StringType(), required=True),
    NestedField(4, "name", StringType(), required=True),
    NestedField(5, "category", StringType(), required=True),
    NestedField(6, "price", DoubleType(), required=True),
    NestedField(7, "currency", StringType(), required=True),
    NestedField(8, "in_stock", BooleanType(), required=True),
    NestedField(9, "stock_quantity", IntegerType(), required=True),
    NestedField(10, "created_at", TimestampType(), required=True),
    NestedField(11, "payload_json", StringType(), required=True),
)

# Rejected events; the event identity columns are optional.
DEAD_LETTER_SCHEMA = Schema(
    NestedField(1, "event_id", StringType(), required=False),
    NestedField(2, "event_type", StringType(), required=False),
    NestedField(3, "reason", StringType(), required=True),
    NestedField(4, "source_topic", StringType(), required=True),
    NestedField(5, "received_at", TimestampType(), required=True),
    NestedField(6, "payload_json", StringType(), required=True),
)
94
+
95
+
96
+ class IcebergSink:
97
+ def __init__(self, config_path: str | Path = "config/iceberg.yaml"):
98
+ self.config_path = Path(config_path)
99
+ config = yaml.safe_load(self.config_path.read_text(encoding="utf-8")) or {}
100
+ self._config = config["iceberg"]
101
+ self.namespace = self._config["namespace"]
102
+ self.table_configs = {table["name"]: table for table in self._config["tables"]}
103
+ catalog_type = self._config["catalog_type"]
104
+ catalog_properties = {
105
+ "type": catalog_type,
106
+ "uri": self._resolve_catalog_uri(self._config["catalog_uri"]),
107
+ "warehouse": self._resolve_warehouse(
108
+ self._config["warehouse"],
109
+ catalog_type,
110
+ ),
111
+ }
112
+ catalog_properties.update(self._config.get("catalog_properties", {}))
113
+ self.catalog = load_catalog(
114
+ self._config.get("catalog_name", "agentflow"),
115
+ **catalog_properties,
116
+ )
117
+ self.catalog.create_namespace_if_not_exists(self.namespace)
118
+
119
+ def create_tables_if_not_exist(self) -> None:
120
+ for table_name in self.table_configs:
121
+ identifier = self._identifier(table_name)
122
+ if self.catalog.table_exists(identifier):
123
+ continue
124
+ self.catalog.create_table(
125
+ identifier,
126
+ schema=self._schema_for_table(table_name),
127
+ partition_spec=self._partition_spec_for_table(table_name),
128
+ )
129
+
130
+ def write_batch(self, table_name: str, records: list[dict[str, Any]]) -> int:
131
+ if not records:
132
+ return 0
133
+ self.create_tables_if_not_exist()
134
+ table = self.catalog.load_table(self._identifier(table_name))
135
+ normalized_records = [self._normalize_record(table_name, record) for record in records]
136
+ arrow_table = pa.Table.from_pylist(
137
+ normalized_records,
138
+ schema=table.schema().as_arrow(),
139
+ )
140
+ table.append(arrow_table)
141
+ return len(normalized_records)
142
+
143
+ def row_counts(self) -> dict[str, int]:
144
+ counts: dict[str, int] = {}
145
+ for table_name in self.table_configs:
146
+ identifier = self._identifier(table_name)
147
+ if not self.catalog.table_exists(identifier):
148
+ counts[table_name] = 0
149
+ continue
150
+ table = self.catalog.load_table(identifier)
151
+ counts[table_name] = table.scan().count()
152
+ return counts
153
+
154
+ def _identifier(self, table_name: str) -> tuple[str, str]:
155
+ return self.namespace, table_name
156
+
157
+ def _resolve_catalog_uri(self, value: str) -> str:
158
+ prefix = "sqlite:///"
159
+ if not value.startswith(prefix):
160
+ return value
161
+ raw_path = value[len(prefix) :]
162
+ catalog_path = Path(raw_path)
163
+ if not catalog_path.is_absolute():
164
+ catalog_path = (self.config_path.parent / catalog_path).resolve()
165
+ catalog_path.parent.mkdir(parents=True, exist_ok=True)
166
+ return f"{prefix}{catalog_path.as_posix()}"
167
+
168
+ def _resolve_warehouse(self, value: str, catalog_type: str) -> str:
169
+ if catalog_type == "rest":
170
+ return value
171
+ if "://" in value or value.startswith("file:"):
172
+ return value
173
+ warehouse_path = Path(value)
174
+ if not warehouse_path.is_absolute():
175
+ warehouse_path = (self.config_path.parent / warehouse_path).resolve()
176
+ warehouse_path.mkdir(parents=True, exist_ok=True)
177
+ if os.name == "nt":
178
+ return f"file:{warehouse_path.as_posix()}"
179
+ return warehouse_path.as_posix()
180
+
181
+ def _schema_for_table(self, table_name: str) -> Schema:
182
+ schemas = {
183
+ "orders": ORDERS_SCHEMA,
184
+ "payments": PAYMENTS_SCHEMA,
185
+ "clickstream": CLICKSTREAM_SCHEMA,
186
+ "inventory": INVENTORY_SCHEMA,
187
+ "dead_letter": DEAD_LETTER_SCHEMA,
188
+ }
189
+ return schemas[table_name]
190
+
191
+ def _partition_spec_for_table(self, table_name: str) -> PartitionSpec:
192
+ schema = self._schema_for_table(table_name)
193
+ fields: list[PartitionField] = []
194
+ for index, expression in enumerate(
195
+ self.table_configs[table_name].get("partition_by", []),
196
+ start=1,
197
+ ):
198
+ source_name: str
199
+ transform: DayTransform | HourTransform
200
+ if expression.startswith("days(") and expression.endswith(")"):
201
+ source_name = expression[5:-1]
202
+ transform = DayTransform()
203
+ suffix = "day"
204
+ elif expression.startswith("hours(") and expression.endswith(")"):
205
+ source_name = expression[6:-1]
206
+ transform = HourTransform()
207
+ suffix = "hour"
208
+ else:
209
+ msg = f"Unsupported partition transform: {expression}"
210
+ raise ValueError(msg)
211
+ source_field = schema.find_field(source_name)
212
+ fields.append(
213
+ PartitionField(
214
+ source_id=source_field.field_id,
215
+ field_id=1000 + index,
216
+ transform=transform,
217
+ name=f"{source_name}_{suffix}",
218
+ )
219
+ )
220
+ return PartitionSpec(*fields)
221
+
222
+ def _normalize_record(self, table_name: str, record: dict[str, Any]) -> dict[str, Any]:
223
+ if table_name == "orders":
224
+ return self._normalize_order(record)
225
+ if table_name == "payments":
226
+ return self._normalize_payment(record)
227
+ if table_name == "clickstream":
228
+ return self._normalize_clickstream(record)
229
+ if table_name == "inventory":
230
+ return self._normalize_inventory(record)
231
+ if table_name == "dead_letter":
232
+ return self._normalize_dead_letter(record)
233
+ msg = f"Unsupported table: {table_name}"
234
+ raise ValueError(msg)
235
+
236
+ def _normalize_order(self, record: dict[str, Any]) -> dict[str, Any]:
237
+ derived = record.get("_derived", {})
238
+ items = record.get("items", [])
239
+ return {
240
+ "event_id": str(record["event_id"]),
241
+ "event_type": str(record["event_type"]),
242
+ "order_id": str(record["order_id"]),
243
+ "user_id": str(record["user_id"]),
244
+ "status": str(record["status"]),
245
+ "total_amount": float(record["total_amount"]),
246
+ "currency": str(record.get("currency", "USD")),
247
+ "item_count": int(
248
+ derived.get(
249
+ "item_count",
250
+ sum(item.get("quantity", 0) for item in items),
251
+ )
252
+ ),
253
+ "unique_products": int(
254
+ derived.get(
255
+ "unique_products",
256
+ len({item.get("product_id") for item in items if item.get("product_id")}),
257
+ )
258
+ ),
259
+ "order_size_bucket": str(derived.get("order_size_bucket", "unknown")),
260
+ "created_at": self._coerce_timestamp(record.get("timestamp")),
261
+ "payload_json": self._dump_payload(record),
262
+ }
263
+
264
+ def _normalize_payment(self, record: dict[str, Any]) -> dict[str, Any]:
265
+ derived = record.get("_derived", {})
266
+ return {
267
+ "event_id": str(record["event_id"]),
268
+ "event_type": str(record["event_type"]),
269
+ "payment_id": str(record["payment_id"]),
270
+ "order_id": str(record["order_id"]),
271
+ "user_id": str(record["user_id"]),
272
+ "amount": float(record["amount"]),
273
+ "currency": str(record.get("currency", "USD")),
274
+ "method": str(record["method"]),
275
+ "status": str(record["status"]),
276
+ "risk_score": (float(derived["risk_score"]) if "risk_score" in derived else None),
277
+ "risk_level": (str(derived["risk_level"]) if "risk_level" in derived else None),
278
+ "created_at": self._coerce_timestamp(record.get("timestamp")),
279
+ "payload_json": self._dump_payload(record),
280
+ }
281
+
282
+ def _normalize_clickstream(self, record: dict[str, Any]) -> dict[str, Any]:
283
+ derived = record.get("_derived", {})
284
+ return {
285
+ "event_id": str(record["event_id"]),
286
+ "event_type": str(record["event_type"]),
287
+ "session_id": str(record["session_id"]),
288
+ "user_id": (str(record["user_id"]) if record.get("user_id") is not None else None),
289
+ "page_url": str(record["page_url"]),
290
+ "referrer": (str(record["referrer"]) if record.get("referrer") is not None else None),
291
+ "user_agent": str(record["user_agent"]),
292
+ "viewport_width": (
293
+ int(record["viewport_width"]) if record.get("viewport_width") is not None else None
294
+ ),
295
+ "product_id": (
296
+ str(record["product_id"]) if record.get("product_id") is not None else None
297
+ ),
298
+ "is_mobile": derived.get("is_mobile"),
299
+ "page_category": derived.get("page_category"),
300
+ "is_product_page": derived.get("is_product_page"),
301
+ "created_at": self._coerce_timestamp(record.get("timestamp")),
302
+ "payload_json": self._dump_payload(record),
303
+ }
304
+
305
+ def _normalize_inventory(self, record: dict[str, Any]) -> dict[str, Any]:
306
+ return {
307
+ "event_id": str(record["event_id"]),
308
+ "event_type": str(record["event_type"]),
309
+ "product_id": str(record["product_id"]),
310
+ "name": str(record["name"]),
311
+ "category": str(record["category"]),
312
+ "price": float(record["price"]),
313
+ "currency": str(record.get("currency", "USD")),
314
+ "in_stock": bool(record["in_stock"]),
315
+ "stock_quantity": int(record["stock_quantity"]),
316
+ "created_at": self._coerce_timestamp(record.get("timestamp")),
317
+ "payload_json": self._dump_payload(record),
318
+ }
319
+
320
+ def _normalize_dead_letter(self, record: dict[str, Any]) -> dict[str, Any]:
321
+ return {
322
+ "event_id": (str(record["event_id"]) if record.get("event_id") is not None else None),
323
+ "event_type": (
324
+ str(record["event_type"]) if record.get("event_type") is not None else None
325
+ ),
326
+ "reason": str(record["reason"]),
327
+ "source_topic": str(record.get("source_topic", "events.deadletter")),
328
+ "received_at": self._coerce_timestamp(record.get("received_at", datetime.now(UTC))),
329
+ "payload_json": (
330
+ str(record["payload_json"])
331
+ if "payload_json" in record
332
+ else self._dump_payload(record.get("payload", record))
333
+ ),
334
+ }
335
+
336
+ def _coerce_timestamp(self, value: Any) -> datetime:
337
+ if isinstance(value, datetime):
338
+ timestamp = value
339
+ elif value is None:
340
+ timestamp = datetime.now(UTC)
341
+ else:
342
+ timestamp = datetime.fromisoformat(str(value).replace("Z", "+00:00"))
343
+ if timestamp.tzinfo is None:
344
+ return timestamp
345
+ return timestamp.astimezone(UTC).replace(tzinfo=None)
346
+
347
+ def _dump_payload(self, payload: Any) -> str:
348
+ return json.dumps(payload, default=str, sort_keys=True)