agentflow-runtime 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentflow_runtime-1.1.0.dist-info/METADATA +55 -0
- agentflow_runtime-1.1.0.dist-info/RECORD +100 -0
- agentflow_runtime-1.1.0.dist-info/WHEEL +4 -0
- agentflow_runtime-1.1.0.dist-info/licenses/LICENSE +21 -0
- src/__init__.py +0 -0
- src/constants.py +3 -0
- src/ingestion/__init__.py +0 -0
- src/ingestion/cdc/__init__.py +5 -0
- src/ingestion/cdc/normalizer.py +186 -0
- src/ingestion/connectors/__init__.py +0 -0
- src/ingestion/connectors/mysql_cdc.py +63 -0
- src/ingestion/connectors/postgres_cdc.py +68 -0
- src/ingestion/producers/__init__.py +0 -0
- src/ingestion/producers/event_producer.py +237 -0
- src/ingestion/schemas/__init__.py +0 -0
- src/ingestion/schemas/events.py +147 -0
- src/ingestion/tenant_router.py +80 -0
- src/logger.py +41 -0
- src/orchestration/__init__.py +0 -0
- src/orchestration/dags/__init__.py +0 -0
- src/orchestration/dags/daily_batch.py +201 -0
- src/processing/__init__.py +0 -0
- src/processing/event_replayer.py +250 -0
- src/processing/flink_jobs/Dockerfile +55 -0
- src/processing/flink_jobs/__init__.py +0 -0
- src/processing/flink_jobs/checkpointing.py +32 -0
- src/processing/flink_jobs/session_aggregation.py +212 -0
- src/processing/flink_jobs/session_aggregator.py +199 -0
- src/processing/flink_jobs/stream_processor.py +316 -0
- src/processing/iceberg_sink.py +348 -0
- src/processing/local_pipeline.py +452 -0
- src/processing/outbox.py +273 -0
- src/processing/tracing.py +36 -0
- src/processing/transformations/__init__.py +0 -0
- src/processing/transformations/enrichment.py +125 -0
- src/quality/__init__.py +0 -0
- src/quality/monitors/__init__.py +0 -0
- src/quality/monitors/freshness_monitor.py +166 -0
- src/quality/monitors/metrics_collector.py +367 -0
- src/quality/validators/__init__.py +0 -0
- src/quality/validators/schema_validator.py +119 -0
- src/quality/validators/semantic_validator.py +202 -0
- src/serving/__init__.py +0 -0
- src/serving/api/__init__.py +0 -0
- src/serving/api/alert_dispatcher.py +51 -0
- src/serving/api/alerts/__init__.py +38 -0
- src/serving/api/alerts/dispatcher.py +299 -0
- src/serving/api/alerts/escalation.py +290 -0
- src/serving/api/alerts/evaluator.py +81 -0
- src/serving/api/alerts/history.py +115 -0
- src/serving/api/analytics.py +543 -0
- src/serving/api/auth/__init__.py +46 -0
- src/serving/api/auth/key_rotation.py +400 -0
- src/serving/api/auth/manager.py +406 -0
- src/serving/api/auth/middleware.py +331 -0
- src/serving/api/main.py +390 -0
- src/serving/api/middleware/logging.py +41 -0
- src/serving/api/middleware/tracing.py +51 -0
- src/serving/api/rate_limiter.py +76 -0
- src/serving/api/routers/__init__.py +0 -0
- src/serving/api/routers/admin.py +150 -0
- src/serving/api/routers/admin_ui.py +93 -0
- src/serving/api/routers/agent_query.py +639 -0
- src/serving/api/routers/alerts.py +134 -0
- src/serving/api/routers/batch.py +231 -0
- src/serving/api/routers/contracts.py +98 -0
- src/serving/api/routers/deadletter.py +337 -0
- src/serving/api/routers/lineage.py +218 -0
- src/serving/api/routers/search.py +103 -0
- src/serving/api/routers/slo.py +231 -0
- src/serving/api/routers/stream.py +141 -0
- src/serving/api/routers/webhooks.py +93 -0
- src/serving/api/security.py +83 -0
- src/serving/api/telemetry.py +66 -0
- src/serving/api/templates/admin.html +214 -0
- src/serving/api/versioning.py +328 -0
- src/serving/api/webhook_dispatcher.py +423 -0
- src/serving/backends/__init__.py +117 -0
- src/serving/backends/clickhouse_backend.py +310 -0
- src/serving/backends/duckdb_backend.py +268 -0
- src/serving/cache.py +169 -0
- src/serving/db_pool.py +105 -0
- src/serving/masking.py +122 -0
- src/serving/semantic_layer/__init__.py +0 -0
- src/serving/semantic_layer/catalog.py +177 -0
- src/serving/semantic_layer/contract_registry.py +258 -0
- src/serving/semantic_layer/entity_type_registry.py +107 -0
- src/serving/semantic_layer/nl_engine.py +189 -0
- src/serving/semantic_layer/query/__init__.py +3 -0
- src/serving/semantic_layer/query/contracts.py +47 -0
- src/serving/semantic_layer/query/engine.py +81 -0
- src/serving/semantic_layer/query/entity_queries.py +221 -0
- src/serving/semantic_layer/query/metric_queries.py +84 -0
- src/serving/semantic_layer/query/nl_queries.py +305 -0
- src/serving/semantic_layer/query/sql_builder.py +113 -0
- src/serving/semantic_layer/query/sql_guard.py +3 -0
- src/serving/semantic_layer/query_engine.py +5 -0
- src/serving/semantic_layer/schema_evolution.py +175 -0
- src/serving/semantic_layer/search_index.py +337 -0
- src/serving/semantic_layer/sql_guard.py +56 -0
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
"""Daily batch DAG: compaction, aggregation, and quality reports.
|
|
2
|
+
|
|
3
|
+
Runs daily at 02:00 UTC when streaming traffic is lowest.
|
|
4
|
+
Handles Iceberg maintenance and pre-computes aggregates for fast serving.
|
|
5
|
+
|
|
6
|
+
Local mode uses DuckDB; production uses Trino/Iceberg catalog.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import os
|
|
10
|
+
|
|
11
|
+
import duckdb
|
|
12
|
+
from dagster import (
|
|
13
|
+
AssetExecutionContext,
|
|
14
|
+
Definitions,
|
|
15
|
+
ScheduleDefinition,
|
|
16
|
+
asset,
|
|
17
|
+
define_asset_job,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
# DuckDB database location; falls back to an in-memory database when
# DUCKDB_PATH is not set. Read once, at import time.
DB_PATH = os.environ.get("DUCKDB_PATH", ":memory:")


def _get_conn():
    """Open and return a new DuckDB connection to ``DB_PATH``.

    The caller owns the returned connection and is responsible for closing it.
    """
    connection = duckdb.connect(DB_PATH)
    return connection
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@asset(group_name="maintenance")
def iceberg_snapshot_expiry(context: AssetExecutionContext):
    """Report on Iceberg snapshot expiry (intended retention: 30 days).

    Local mode (``DB_PATH`` is ``":memory:"``) is a no-op and returns
    ``{"mode": "local", "tables_processed": 0}``.

    Otherwise the asset logs the intent and returns the number of target
    tables. The catalog call itself is not wired in yet; see the placeholder
    comment in the body.
    """
    running_locally = DB_PATH == ":memory:"
    if not running_locally:
        context.log.info("Expiring Iceberg snapshots older than 30 days")
        target_tables = ("orders_v2", "sessions_aggregated", "products_current")
        # Production: catalog.expire_snapshots(t, older_than_days=30)
        return {"tables_processed": len(target_tables)}

    context.log.info("Local mode: snapshot expiry is a no-op")
    return {"mode": "local", "tables_processed": 0}
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@asset(group_name="maintenance", deps=[iceberg_snapshot_expiry])
def iceberg_compaction(context: AssetExecutionContext):
    """Compact data files after snapshot expiry has run.

    Target: 128-512 MB per file for optimal read performance.

    When ``DB_PATH`` is ``":memory:"`` this is a no-op and returns
    ``{"mode": "local", "tables_compacted": 0}``. For any other database
    path it issues a DuckDB ``CHECKPOINT`` and returns
    ``{"tables_compacted": 3}``.
    """
    if DB_PATH == ":memory:":
        context.log.info("Local mode: compaction is a no-op")
        return {"mode": "local", "tables_compacted": 0}

    context.log.info("Compacting data files")
    conn = _get_conn()
    try:
        conn.execute("CHECKPOINT")
    finally:
        # Release the database handle even when CHECKPOINT fails.
        conn.close()
    # NOTE(review): the count is a fixed value, not derived from the work done.
    return {"tables_compacted": 3}
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
@asset(group_name="aggregation")
def daily_user_profiles(context: AssetExecutionContext):
    """Pre-compute user profile aggregates for fast entity lookups.

    Creates ``users_enriched`` if it is missing, aggregates every
    non-cancelled row of ``orders_v2`` per ``user_id`` and upserts one row
    per user.

    Returns:
        ``{"users_updated": <number of users aggregated>}``.
    """
    conn = _get_conn()
    try:
        conn.execute("""
            CREATE TABLE IF NOT EXISTS users_enriched (
                user_id VARCHAR PRIMARY KEY,
                total_orders INTEGER DEFAULT 0,
                total_spent DECIMAL(10,2) DEFAULT 0,
                first_order_at TIMESTAMP,
                last_order_at TIMESTAMP,
                preferred_category VARCHAR
            )
        """)

        result = conn.execute("""
            SELECT
                user_id,
                COUNT(*) as total_orders,
                SUM(total_amount) as total_spent,
                MIN(created_at) as first_order_at,
                MAX(created_at) as last_order_at
            FROM orders_v2
            WHERE status != 'cancelled'
            GROUP BY user_id
        """).fetchall()

        for row in result:
            conn.execute(
                """
                INSERT OR REPLACE INTO users_enriched
                    (user_id, total_orders, total_spent,
                     first_order_at, last_order_at)
                VALUES (?, ?, ?, ?, ?)
                """,
                list(row),
            )
    finally:
        # Always release the connection, including when orders_v2 is missing
        # or an upsert fails part-way through.
        conn.close()

    context.log.info("User profiles updated: %d", len(result))
    return {"users_updated": len(result)}
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
@asset(group_name="aggregation")
def daily_product_metrics(context: AssetExecutionContext):
    """Report the current number of rows in ``products_current``.

    Returns:
        ``{"products_updated": <row count>}``.
    """
    conn = _get_conn()
    try:
        count = conn.execute("SELECT COUNT(*) FROM products_current").fetchone()[0]
    finally:
        # Close the connection even when the table is missing and the query raises.
        conn.close()

    context.log.info("Product metrics refreshed: %d", count)
    return {"products_updated": count}
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
@asset(
    group_name="quality",
    deps=[daily_user_profiles, daily_product_metrics],
)
def daily_quality_report(context: AssetExecutionContext):
    """Generate the daily data quality report.

    Checks performed:
    - Row count per table; a table that cannot be queried is recorded with
      ``rows=0`` and a ``status`` starting with ``"error"``.
    - Dead letter ratio: share of ``pipeline_events`` rows whose topic is
      ``events.deadletter``; recorded as ``None`` when it cannot be computed.

    Returns:
        ``{"checks": <per-check results>, "checks_failed": <count of checks
        whose status starts with "error">}``.
    """
    checks = {}
    conn = _get_conn()
    try:
        for table in [
            "orders_v2",
            "users_enriched",
            "products_current",
            "sessions_aggregated",
        ]:
            try:
                row = conn.execute(
                    f"SELECT COUNT(*) FROM {table}"  # nosec B608 - table comes from the fixed health-check allowlist
                ).fetchone()
                checks[table] = {"rows": row[0], "status": "ok"}
            except duckdb.Error as e:
                checks[table] = {"rows": 0, "status": f"error: {e}"}

        # Dead letter ratio
        try:
            total = conn.execute("SELECT COUNT(*) FROM pipeline_events").fetchone()[0]
            dead = conn.execute(
                "SELECT COUNT(*) FROM pipeline_events WHERE topic = 'events.deadletter'"
            ).fetchone()[0]
            dl_ratio = dead / total if total > 0 else 0.0
            checks["dead_letter_ratio"] = {"value": round(dl_ratio, 4)}
        except duckdb.Error:
            checks["dead_letter_ratio"] = {"value": None}
    finally:
        # Release the connection even if something other than duckdb.Error escapes.
        conn.close()

    # Only entries that carry a "status" key can count as failed; the
    # dead-letter entry has none and is therefore never counted here.
    failed = sum(
        1
        for v in checks.values()
        if isinstance(v, dict) and v.get("status", "").startswith("error")
    )
    context.log.info("Quality report: %s", checks)
    return {"checks": checks, "checks_failed": failed}
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
# ── Job & Schedule ──────────────────────────────────────────────

# Single job that selects all five assets defined in this module.
daily_maintenance_job = define_asset_job(
    name="daily_maintenance",
    selection=[
        iceberg_snapshot_expiry,
        iceberg_compaction,
        daily_user_profiles,
        daily_product_metrics,
        daily_quality_report,
    ],
)

# Cron trigger for daily_maintenance_job.
daily_schedule = ScheduleDefinition(
    job=daily_maintenance_job,
    cron_schedule="0 2 * * *",  # 02:00 UTC daily
)

# Bundles the assets, the job and the schedule into one Definitions object.
# The asset list mirrors the job selection above; keep the two in sync.
defs = Definitions(
    assets=[
        iceberg_snapshot_expiry,
        iceberg_compaction,
        daily_user_profiles,
        daily_product_metrics,
        daily_quality_report,
    ],
    jobs=[daily_maintenance_job],
    schedules=[daily_schedule],
)
|
|
File without changes
|
|
@@ -0,0 +1,250 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import os
|
|
5
|
+
from collections.abc import Callable
|
|
6
|
+
from contextlib import nullcontext
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from datetime import UTC, datetime
|
|
9
|
+
from uuid import uuid4
|
|
10
|
+
|
|
11
|
+
import structlog
|
|
12
|
+
from opentelemetry import trace
|
|
13
|
+
|
|
14
|
+
from src.processing.outbox import OutboxProcessor, ensure_outbox_table
|
|
15
|
+
from src.processing.tracing import inject_trace_to_kafka_headers, telemetry_disabled
|
|
16
|
+
from src.quality.validators.schema_validator import validate_event
|
|
17
|
+
from src.quality.validators.semantic_validator import validate_semantics
|
|
18
|
+
|
|
19
|
+
# Bootstrap servers used when EventReplayer is constructed without an explicit
# value; read from KAFKA_BOOTSTRAP_SERVERS once, at import time.
DEFAULT_KAFKA_BOOTSTRAP = os.getenv("KAFKA_BOOTSTRAP_SERVERS", "localhost:9092")
# Topic written into the outbox row for every replayed event.
DEFAULT_REPLAY_TOPIC = "events.raw"
# Tracer that produces the "kafka.produce" span in EventReplayer._produce_to_kafka.
tracer = trace.get_tracer("agentflow.event_replayer")
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class DeadLetterEventNotFoundError(LookupError):
    """Raised when no ``dead_letter_events`` row matches the requested event id."""
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class ReplayValidationError(ValueError):
    """Raised when a dead-letter payload cannot be decoded or fails validation."""
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass
class ReplayResult:
    """Outcome of one ``EventReplayer.replay`` call."""

    # Id of the dead-letter event that was replayed.
    event_id: str
    # "replayed" or "replay_pending", as set by EventReplayer.replay.
    status: str
    # Retry counter stored for the event after this attempt.
    retry_count: int
    # Timestamp recorded for this attempt.
    last_retried_at: datetime
    # Payload that was validated and written to the outbox.
    payload: dict
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def ensure_dead_letter_table(conn) -> None:
    """Create ``dead_letter_events`` if needed and make sure it has ``tenant_id``.

    Two statements are executed on ``conn`` in order: the ``CREATE TABLE IF
    NOT EXISTS`` and then an ``ALTER TABLE ... ADD COLUMN IF NOT EXISTS`` for
    ``tenant_id``, which covers tables created before that column existed.
    """
    create_table_sql = """
        CREATE TABLE IF NOT EXISTS dead_letter_events (
            event_id TEXT PRIMARY KEY,
            tenant_id TEXT DEFAULT 'default',
            event_type TEXT,
            payload JSON,
            failure_reason TEXT,
            failure_detail TEXT,
            received_at TIMESTAMP,
            retry_count INTEGER DEFAULT 0,
            last_retried_at TIMESTAMP,
            status TEXT DEFAULT 'failed'
        )
        """
    add_tenant_column_sql = (
        "ALTER TABLE dead_letter_events ADD COLUMN IF NOT EXISTS tenant_id TEXT DEFAULT 'default'"
    )
    for statement in (create_table_sql, add_tenant_column_sql):
        conn.execute(statement)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
class EventReplayer:
|
|
64
|
+
def __init__(
|
|
65
|
+
self,
|
|
66
|
+
conn,
|
|
67
|
+
producer: Callable[[str, dict], None] | None = None,
|
|
68
|
+
bootstrap_servers: str | None = None,
|
|
69
|
+
) -> None:
|
|
70
|
+
self._conn = conn
|
|
71
|
+
self._producer = producer or self._produce_to_kafka
|
|
72
|
+
self._bootstrap_servers = bootstrap_servers or DEFAULT_KAFKA_BOOTSTRAP
|
|
73
|
+
ensure_dead_letter_table(self._conn)
|
|
74
|
+
ensure_outbox_table(self._conn)
|
|
75
|
+
|
|
76
|
+
def replay(
|
|
77
|
+
self,
|
|
78
|
+
event_id: str,
|
|
79
|
+
corrected_payload: dict | None = None,
|
|
80
|
+
) -> ReplayResult:
|
|
81
|
+
row = self._load_row(event_id)
|
|
82
|
+
payload = self._decoded_payload(row["payload"])
|
|
83
|
+
candidate = dict(payload)
|
|
84
|
+
if corrected_payload:
|
|
85
|
+
candidate.update(corrected_payload)
|
|
86
|
+
self._validate(candidate)
|
|
87
|
+
|
|
88
|
+
replayed_at = datetime.now(UTC)
|
|
89
|
+
retry_count = int(row["retry_count"] or 0) + 1
|
|
90
|
+
outbox_id = str(uuid4())
|
|
91
|
+
self._conn.execute("BEGIN TRANSACTION")
|
|
92
|
+
try:
|
|
93
|
+
self._conn.execute(
|
|
94
|
+
"""
|
|
95
|
+
UPDATE dead_letter_events
|
|
96
|
+
SET payload = ?, status = 'replay_pending', retry_count = ?, last_retried_at = ?
|
|
97
|
+
WHERE event_id = ?
|
|
98
|
+
""",
|
|
99
|
+
[
|
|
100
|
+
json.dumps(candidate),
|
|
101
|
+
retry_count,
|
|
102
|
+
replayed_at,
|
|
103
|
+
event_id,
|
|
104
|
+
],
|
|
105
|
+
)
|
|
106
|
+
self._conn.execute(
|
|
107
|
+
"""
|
|
108
|
+
INSERT INTO outbox (
|
|
109
|
+
id,
|
|
110
|
+
event_id,
|
|
111
|
+
payload,
|
|
112
|
+
topic,
|
|
113
|
+
created_at,
|
|
114
|
+
sent_at,
|
|
115
|
+
status,
|
|
116
|
+
retry_count,
|
|
117
|
+
next_attempt_at,
|
|
118
|
+
last_error
|
|
119
|
+
)
|
|
120
|
+
VALUES (?, ?, ?, ?, ?, NULL, 'pending', 0, ?, NULL)
|
|
121
|
+
""",
|
|
122
|
+
[
|
|
123
|
+
outbox_id,
|
|
124
|
+
event_id,
|
|
125
|
+
json.dumps(candidate),
|
|
126
|
+
DEFAULT_REPLAY_TOPIC,
|
|
127
|
+
replayed_at,
|
|
128
|
+
replayed_at,
|
|
129
|
+
],
|
|
130
|
+
)
|
|
131
|
+
self._conn.execute("COMMIT")
|
|
132
|
+
except Exception: # nosec B110 - rollback must preserve the original replay failure
|
|
133
|
+
# Transaction rollback must happen before unexpected errors propagate.
|
|
134
|
+
self._conn.execute("ROLLBACK")
|
|
135
|
+
raise
|
|
136
|
+
status = "replay_pending"
|
|
137
|
+
processor = OutboxProcessor(
|
|
138
|
+
conn=self._conn,
|
|
139
|
+
producer=self._producer,
|
|
140
|
+
bootstrap_servers=self._bootstrap_servers,
|
|
141
|
+
)
|
|
142
|
+
if processor.process_entry(outbox_id):
|
|
143
|
+
status = "replayed"
|
|
144
|
+
return ReplayResult(
|
|
145
|
+
event_id=event_id,
|
|
146
|
+
status=status,
|
|
147
|
+
retry_count=retry_count,
|
|
148
|
+
last_retried_at=replayed_at,
|
|
149
|
+
payload=candidate,
|
|
150
|
+
)
|
|
151
|
+
|
|
152
|
+
def dismiss(self, event_id: str) -> None:
|
|
153
|
+
self._load_row(event_id)
|
|
154
|
+
self._conn.execute(
|
|
155
|
+
"UPDATE dead_letter_events SET status = 'dismissed' WHERE event_id = ?",
|
|
156
|
+
[event_id],
|
|
157
|
+
)
|
|
158
|
+
|
|
159
|
+
def _load_row(self, event_id: str) -> dict:
|
|
160
|
+
row = self._conn.execute(
|
|
161
|
+
"""
|
|
162
|
+
SELECT
|
|
163
|
+
event_id,
|
|
164
|
+
payload,
|
|
165
|
+
retry_count
|
|
166
|
+
FROM dead_letter_events
|
|
167
|
+
WHERE event_id = ?
|
|
168
|
+
""",
|
|
169
|
+
[event_id],
|
|
170
|
+
).fetchone()
|
|
171
|
+
if row is None:
|
|
172
|
+
raise DeadLetterEventNotFoundError(event_id)
|
|
173
|
+
return {
|
|
174
|
+
"event_id": row[0],
|
|
175
|
+
"payload": row[1],
|
|
176
|
+
"retry_count": row[2],
|
|
177
|
+
}
|
|
178
|
+
|
|
179
|
+
def _decoded_payload(self, payload) -> dict:
|
|
180
|
+
if isinstance(payload, dict):
|
|
181
|
+
return payload
|
|
182
|
+
if isinstance(payload, str):
|
|
183
|
+
decoded = json.loads(payload)
|
|
184
|
+
if isinstance(decoded, dict):
|
|
185
|
+
return decoded
|
|
186
|
+
raise ReplayValidationError("Dead-letter payload must be a JSON object.")
|
|
187
|
+
|
|
188
|
+
def _validate(self, payload: dict) -> None:
|
|
189
|
+
schema_result = validate_event(payload)
|
|
190
|
+
if not schema_result.is_valid:
|
|
191
|
+
first_error = schema_result.errors[0] if schema_result.errors else {}
|
|
192
|
+
raise ReplayValidationError(first_error.get("msg", "Schema validation failed."))
|
|
193
|
+
|
|
194
|
+
semantic_result = validate_semantics(payload)
|
|
195
|
+
semantic_errors = [
|
|
196
|
+
issue.message for issue in semantic_result.issues if issue.severity == "error"
|
|
197
|
+
]
|
|
198
|
+
if semantic_errors:
|
|
199
|
+
raise ReplayValidationError(semantic_errors[0])
|
|
200
|
+
|
|
201
|
+
def _produce_to_kafka(self, topic: str, payload: dict) -> None:
|
|
202
|
+
from confluent_kafka import Producer
|
|
203
|
+
|
|
204
|
+
delivery_errors: list[str] = []
|
|
205
|
+
|
|
206
|
+
def on_delivery(err, msg) -> None:
|
|
207
|
+
del msg
|
|
208
|
+
if err is not None:
|
|
209
|
+
delivery_errors.append(str(err))
|
|
210
|
+
|
|
211
|
+
producer = Producer({"bootstrap.servers": self._bootstrap_servers})
|
|
212
|
+
produce_span = (
|
|
213
|
+
tracer.start_as_current_span("kafka.produce")
|
|
214
|
+
if not telemetry_disabled()
|
|
215
|
+
else nullcontext()
|
|
216
|
+
)
|
|
217
|
+
with produce_span as span:
|
|
218
|
+
if span is not None and span.is_recording():
|
|
219
|
+
span.set_attribute("topic", topic)
|
|
220
|
+
event_type = payload.get("event_type")
|
|
221
|
+
if event_type is not None:
|
|
222
|
+
span.set_attribute("event_type", str(event_type))
|
|
223
|
+
tenant_id = payload.get("tenant_id") or structlog.contextvars.get_contextvars().get(
|
|
224
|
+
"tenant_id"
|
|
225
|
+
)
|
|
226
|
+
if tenant_id is not None:
|
|
227
|
+
span.set_attribute("tenant_id", str(tenant_id))
|
|
228
|
+
headers = inject_trace_to_kafka_headers({})
|
|
229
|
+
try:
|
|
230
|
+
producer.produce(
|
|
231
|
+
topic,
|
|
232
|
+
key=str(payload.get("event_id", "")),
|
|
233
|
+
value=json.dumps(payload).encode("utf-8"),
|
|
234
|
+
headers=list(headers.items()) or None,
|
|
235
|
+
on_delivery=on_delivery,
|
|
236
|
+
)
|
|
237
|
+
except TypeError as exc:
|
|
238
|
+
if "on_delivery" not in str(exc):
|
|
239
|
+
raise
|
|
240
|
+
producer.produce(
|
|
241
|
+
topic,
|
|
242
|
+
key=str(payload.get("event_id", "")),
|
|
243
|
+
value=json.dumps(payload).encode("utf-8"),
|
|
244
|
+
headers=list(headers.items()) or None,
|
|
245
|
+
)
|
|
246
|
+
remaining = producer.flush(10)
|
|
247
|
+
if delivery_errors:
|
|
248
|
+
raise RuntimeError(delivery_errors[0])
|
|
249
|
+
if remaining != 0:
|
|
250
|
+
raise RuntimeError(f"{remaining} Kafka message(s) were not delivered")
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
# syntax=docker/dockerfile:1
# The directive above selects the BuildKit Dockerfile frontend, which the
# heredoc RUN at the end of this file requires.

FROM python:3.11-slim-bookworm

ARG FLINK_VERSION=1.19.1
ARG SCALA_VERSION=2.12
ARG PYFLINK_KAFKA_JAR_VERSION=3.3.0-1.19

# Installs the JRE, unpacks Flink under /opt/flink, adds the Kafka SQL
# connector jar and the S3 filesystem plugin, then installs PyFlink and the
# Python dependencies. /opt/java is a symlink to the installed JVM directory,
# resolved at build time so JAVA_HOME is correct on every CPU architecture.
RUN apt-get update \
    && apt-get install -y --no-install-recommends bash ca-certificates curl openjdk-17-jre-headless \
    && ln -s "$(dirname "$(dirname "$(readlink -f "$(command -v java)")")")" /opt/java \
    && curl -fsSL "https://archive.apache.org/dist/flink/flink-${FLINK_VERSION}/flink-${FLINK_VERSION}-bin-scala_${SCALA_VERSION}.tgz" -o /tmp/flink.tgz \
    && tar -xzf /tmp/flink.tgz -C /opt \
    && ln -s "/opt/flink-${FLINK_VERSION}" /opt/flink \
    && curl -fsSL "https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-kafka/${PYFLINK_KAFKA_JAR_VERSION}/flink-sql-connector-kafka-${PYFLINK_KAFKA_JAR_VERSION}.jar" -o "/opt/flink/lib/flink-sql-connector-kafka-${PYFLINK_KAFKA_JAR_VERSION}.jar" \
    && mkdir -p /opt/flink/plugins/s3-fs-hadoop \
    && ln -s "/opt/flink/opt/flink-s3-fs-hadoop-${FLINK_VERSION}.jar" "/opt/flink/plugins/s3-fs-hadoop/flink-s3-fs-hadoop-${FLINK_VERSION}.jar" \
    && python -m pip install --no-cache-dir --upgrade pip \
    && python -m pip install --no-cache-dir "apache-flink==${FLINK_VERSION}" confluent-kafka pydantic structlog \
    && rm -rf /var/lib/apt/lists/* /tmp/flink.tgz

WORKDIR /opt/agentflow
ENV FLINK_HOME=/opt/flink
ENV JAVA_HOME=/opt/java
ENV PATH="/opt/flink/bin:${PATH}"
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1
ENV PYFLINK_CLIENT_EXECUTABLE=/usr/local/bin/python
ENV PYTHONPATH=/opt/agentflow

COPY . /opt/agentflow/src

# sitecustomize.py sits on PYTHONPATH, so Python imports it at interpreter
# start-up. It wraps two PyFlink factories so that they also accept
# datetime.timedelta values.
RUN cat <<'PY' > /opt/agentflow/sitecustomize.py
from datetime import timedelta as _timedelta

try:
    from pyflink.common import WatermarkStrategy
    from pyflink.common.time import Duration, Time
    from pyflink.datastream.state import StateTtlConfig
except Exception:
    # Best effort: interpreter start-up must never fail because of this shim.
    pass
else:
    _original_watermark = WatermarkStrategy.for_bounded_out_of_orderness
    _original_ttl_builder = StateTtlConfig.new_builder

    def _patched_watermark(max_out_of_orderness):
        if isinstance(max_out_of_orderness, _timedelta):
            max_out_of_orderness = Duration.of_millis(int(max_out_of_orderness.total_seconds() * 1000))
        return _original_watermark(max_out_of_orderness)

    def _patched_ttl_builder(ttl):
        if isinstance(ttl, _timedelta):
            ttl = Time.milliseconds(int(ttl.total_seconds() * 1000))
        return _original_ttl_builder(ttl)

    WatermarkStrategy.for_bounded_out_of_orderness = staticmethod(_patched_watermark)
    StateTtlConfig.new_builder = staticmethod(_patched_ttl_builder)
PY
|
|
File without changes
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def _checkpoint_constants() -> tuple[object, object]:
    """Return the ``(checkpointing mode, externalized cleanup)`` pair.

    The PyFlink enum members are returned when PyFlink can be imported. When
    it is not installed, the plain strings ``"EXACTLY_ONCE"`` and
    ``"RETAIN_ON_CANCELLATION"`` are returned instead.
    """
    try:
        from pyflink.datastream import CheckpointingMode
        from pyflink.datastream.checkpoint_config import ExternalizedCheckpointCleanup
    except ModuleNotFoundError:
        mode, cleanup = "EXACTLY_ONCE", "RETAIN_ON_CANCELLATION"
    else:
        mode = CheckpointingMode.EXACTLY_ONCE
        cleanup = ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION
    return mode, cleanup
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def configure_checkpointing(env: Any) -> None:
    """Apply the project's checkpoint settings to a Flink execution environment.

    Settings applied: checkpoint every 60 s in the mode returned by
    ``_checkpoint_constants``, at least 30 s between checkpoints, a 120 s
    timeout, one concurrent checkpoint, externalized checkpoints with the
    returned cleanup policy, and checkpoint storage under
    ``FLINK_CHECKPOINT_DIR`` (default ``file:///tmp/flink-checkpoints``).

    Args:
        env: Object exposing ``enable_checkpointing`` and
            ``get_checkpoint_config``.
    """
    checkpoint_mode, cleanup_mode = _checkpoint_constants()
    checkpoint_dir = os.getenv("FLINK_CHECKPOINT_DIR", "file:///tmp/flink-checkpoints")

    env.enable_checkpointing(60_000)
    config = env.get_checkpoint_config()
    config.set_checkpointing_mode(checkpoint_mode)
    config.set_min_pause_between_checkpoints(30_000)
    config.set_checkpoint_timeout(120_000)
    config.set_max_concurrent_checkpoints(1)
    config.enable_externalized_checkpoints(cleanup_mode)

    # PyFlink's set_checkpoint_storage expects a CheckpointStorage object;
    # a plain path string goes through set_checkpoint_storage_dir. Config
    # objects without that method keep receiving the original call.
    set_storage_dir = getattr(config, "set_checkpoint_storage_dir", None)
    if callable(set_storage_dir):
        set_storage_dir(checkpoint_dir)
    else:
        config.set_checkpoint_storage(checkpoint_dir)
|