agentflow-runtime 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100):
  1. agentflow_runtime-1.1.0.dist-info/METADATA +55 -0
  2. agentflow_runtime-1.1.0.dist-info/RECORD +100 -0
  3. agentflow_runtime-1.1.0.dist-info/WHEEL +4 -0
  4. agentflow_runtime-1.1.0.dist-info/licenses/LICENSE +21 -0
  5. src/__init__.py +0 -0
  6. src/constants.py +3 -0
  7. src/ingestion/__init__.py +0 -0
  8. src/ingestion/cdc/__init__.py +5 -0
  9. src/ingestion/cdc/normalizer.py +186 -0
  10. src/ingestion/connectors/__init__.py +0 -0
  11. src/ingestion/connectors/mysql_cdc.py +63 -0
  12. src/ingestion/connectors/postgres_cdc.py +68 -0
  13. src/ingestion/producers/__init__.py +0 -0
  14. src/ingestion/producers/event_producer.py +237 -0
  15. src/ingestion/schemas/__init__.py +0 -0
  16. src/ingestion/schemas/events.py +147 -0
  17. src/ingestion/tenant_router.py +80 -0
  18. src/logger.py +41 -0
  19. src/orchestration/__init__.py +0 -0
  20. src/orchestration/dags/__init__.py +0 -0
  21. src/orchestration/dags/daily_batch.py +201 -0
  22. src/processing/__init__.py +0 -0
  23. src/processing/event_replayer.py +250 -0
  24. src/processing/flink_jobs/Dockerfile +55 -0
  25. src/processing/flink_jobs/__init__.py +0 -0
  26. src/processing/flink_jobs/checkpointing.py +32 -0
  27. src/processing/flink_jobs/session_aggregation.py +212 -0
  28. src/processing/flink_jobs/session_aggregator.py +199 -0
  29. src/processing/flink_jobs/stream_processor.py +316 -0
  30. src/processing/iceberg_sink.py +348 -0
  31. src/processing/local_pipeline.py +452 -0
  32. src/processing/outbox.py +273 -0
  33. src/processing/tracing.py +36 -0
  34. src/processing/transformations/__init__.py +0 -0
  35. src/processing/transformations/enrichment.py +125 -0
  36. src/quality/__init__.py +0 -0
  37. src/quality/monitors/__init__.py +0 -0
  38. src/quality/monitors/freshness_monitor.py +166 -0
  39. src/quality/monitors/metrics_collector.py +367 -0
  40. src/quality/validators/__init__.py +0 -0
  41. src/quality/validators/schema_validator.py +119 -0
  42. src/quality/validators/semantic_validator.py +202 -0
  43. src/serving/__init__.py +0 -0
  44. src/serving/api/__init__.py +0 -0
  45. src/serving/api/alert_dispatcher.py +51 -0
  46. src/serving/api/alerts/__init__.py +38 -0
  47. src/serving/api/alerts/dispatcher.py +299 -0
  48. src/serving/api/alerts/escalation.py +290 -0
  49. src/serving/api/alerts/evaluator.py +81 -0
  50. src/serving/api/alerts/history.py +115 -0
  51. src/serving/api/analytics.py +543 -0
  52. src/serving/api/auth/__init__.py +46 -0
  53. src/serving/api/auth/key_rotation.py +400 -0
  54. src/serving/api/auth/manager.py +406 -0
  55. src/serving/api/auth/middleware.py +331 -0
  56. src/serving/api/main.py +390 -0
  57. src/serving/api/middleware/logging.py +41 -0
  58. src/serving/api/middleware/tracing.py +51 -0
  59. src/serving/api/rate_limiter.py +76 -0
  60. src/serving/api/routers/__init__.py +0 -0
  61. src/serving/api/routers/admin.py +150 -0
  62. src/serving/api/routers/admin_ui.py +93 -0
  63. src/serving/api/routers/agent_query.py +639 -0
  64. src/serving/api/routers/alerts.py +134 -0
  65. src/serving/api/routers/batch.py +231 -0
  66. src/serving/api/routers/contracts.py +98 -0
  67. src/serving/api/routers/deadletter.py +337 -0
  68. src/serving/api/routers/lineage.py +218 -0
  69. src/serving/api/routers/search.py +103 -0
  70. src/serving/api/routers/slo.py +231 -0
  71. src/serving/api/routers/stream.py +141 -0
  72. src/serving/api/routers/webhooks.py +93 -0
  73. src/serving/api/security.py +83 -0
  74. src/serving/api/telemetry.py +66 -0
  75. src/serving/api/templates/admin.html +214 -0
  76. src/serving/api/versioning.py +328 -0
  77. src/serving/api/webhook_dispatcher.py +423 -0
  78. src/serving/backends/__init__.py +117 -0
  79. src/serving/backends/clickhouse_backend.py +310 -0
  80. src/serving/backends/duckdb_backend.py +268 -0
  81. src/serving/cache.py +169 -0
  82. src/serving/db_pool.py +105 -0
  83. src/serving/masking.py +122 -0
  84. src/serving/semantic_layer/__init__.py +0 -0
  85. src/serving/semantic_layer/catalog.py +177 -0
  86. src/serving/semantic_layer/contract_registry.py +258 -0
  87. src/serving/semantic_layer/entity_type_registry.py +107 -0
  88. src/serving/semantic_layer/nl_engine.py +189 -0
  89. src/serving/semantic_layer/query/__init__.py +3 -0
  90. src/serving/semantic_layer/query/contracts.py +47 -0
  91. src/serving/semantic_layer/query/engine.py +81 -0
  92. src/serving/semantic_layer/query/entity_queries.py +221 -0
  93. src/serving/semantic_layer/query/metric_queries.py +84 -0
  94. src/serving/semantic_layer/query/nl_queries.py +305 -0
  95. src/serving/semantic_layer/query/sql_builder.py +113 -0
  96. src/serving/semantic_layer/query/sql_guard.py +3 -0
  97. src/serving/semantic_layer/query_engine.py +5 -0
  98. src/serving/semantic_layer/schema_evolution.py +175 -0
  99. src/serving/semantic_layer/search_index.py +337 -0
  100. src/serving/semantic_layer/sql_guard.py +56 -0
@@ -0,0 +1,201 @@
1
+ """Daily batch DAG: compaction, aggregation, and quality reports.
2
+
3
+ Runs daily at 02:00 UTC when streaming traffic is lowest.
4
+ Handles Iceberg maintenance and pre-computes aggregates for fast serving.
5
+
6
+ Local mode uses DuckDB; production uses Trino/Iceberg catalog.
7
+ """
8
+
9
+ import os
10
+
11
+ import duckdb
12
+ from dagster import (
13
+ AssetExecutionContext,
14
+ Definitions,
15
+ ScheduleDefinition,
16
+ asset,
17
+ define_asset_job,
18
+ )
19
+
20
# DuckDB database location. The in-memory default is also what the maintenance
# assets in this module compare against to detect "local mode".
DB_PATH = os.environ.get("DUCKDB_PATH", ":memory:")
21
+
22
+
23
def _get_conn():
    """Open a DuckDB connection to the database configured in ``DB_PATH``."""
    connection = duckdb.connect(DB_PATH)
    return connection
25
+
26
+
27
@asset(group_name="maintenance")
def iceberg_snapshot_expiry(context: AssetExecutionContext):
    """Report on Iceberg snapshot expiry for the maintained tables.

    With the in-memory DuckDB default this only logs that expiry is a no-op.
    Otherwise it logs the 30-day retention intent and reports how many tables
    are in scope; the actual catalog call is still a placeholder comment.
    """
    maintained_tables = ("orders_v2", "sessions_aggregated", "products_current")

    if DB_PATH != ":memory:":
        context.log.info("Expiring Iceberg snapshots older than 30 days")
        # Production: catalog.expire_snapshots(t, older_than_days=30)
        return {"tables_processed": len(maintained_tables)}

    context.log.info("Local mode: snapshot expiry is a no-op")
    return {"mode": "local", "tables_processed": 0}
43
+
44
+
45
@asset(group_name="maintenance", deps=[iceberg_snapshot_expiry])
def iceberg_compaction(context: AssetExecutionContext):
    """Compact storage after snapshot expiry.

    Target for Iceberg data files: 128-512 MB per file for optimal read
    performance.

    With the in-memory DuckDB default (local mode) this is a no-op. With a
    configured database it issues a DuckDB ``CHECKPOINT``.

    Returns:
        A dict with ``tables_compacted`` (plus ``mode`` in local mode).
    """
    if DB_PATH == ":memory:":
        context.log.info("Local mode: compaction is a no-op")
        return {"mode": "local", "tables_compacted": 0}

    context.log.info("Compacting data files")
    conn = _get_conn()
    try:
        conn.execute("CHECKPOINT")
    finally:
        # Release the connection even when CHECKPOINT fails.
        conn.close()
    return {"tables_compacted": 3}
61
+
62
+
63
@asset(group_name="aggregation")
def daily_user_profiles(context: AssetExecutionContext):
    """Pre-compute user profile aggregates for fast entity lookups.

    Materializes ``users_enriched`` from the non-cancelled rows of
    ``orders_v2``: one row per user with order count, total spend, and first
    and last order timestamps.

    Returns:
        ``{"users_updated": <number of users written>}``.
    """
    conn = _get_conn()
    try:
        conn.execute("""
            CREATE TABLE IF NOT EXISTS users_enriched (
                user_id VARCHAR PRIMARY KEY,
                total_orders INTEGER DEFAULT 0,
                total_spent DECIMAL(10,2) DEFAULT 0,
                first_order_at TIMESTAMP,
                last_order_at TIMESTAMP,
                preferred_category VARCHAR
            )
        """)

        result = conn.execute("""
            SELECT
                user_id,
                COUNT(*) as total_orders,
                SUM(total_amount) as total_spent,
                MIN(created_at) as first_order_at,
                MAX(created_at) as last_order_at
            FROM orders_v2
            WHERE status != 'cancelled'
            GROUP BY user_id
        """).fetchall()

        for row in result:
            conn.execute(
                """
                INSERT OR REPLACE INTO users_enriched
                    (user_id, total_orders, total_spent,
                     first_order_at, last_order_at)
                VALUES (?, ?, ?, ?, ?)
                """,
                list(row),
            )
    finally:
        # Close the connection even when a query fails (e.g. orders_v2 is missing).
        conn.close()

    context.log.info("User profiles updated: %d", len(result))
    return {"users_updated": len(result)}
108
+
109
+
110
@asset(group_name="aggregation")
def daily_product_metrics(context: AssetExecutionContext):
    """Report how many rows ``products_current`` holds.

    Returns:
        ``{"products_updated": <row count of products_current>}``.
    """
    conn = _get_conn()
    try:
        count = conn.execute("SELECT COUNT(*) FROM products_current").fetchone()[0]
    finally:
        # Close the connection even when the count query fails.
        conn.close()

    context.log.info("Product metrics refreshed: %d", count)
    return {"products_updated": count}
120
+
121
+
122
@asset(
    group_name="quality",
    deps=[daily_user_profiles, daily_product_metrics],
)
def daily_quality_report(context: AssetExecutionContext):
    """Generate the daily data quality report.

    Checks performed:
    - Row count per table. A table that cannot be queried is recorded with an
      ``error: ...`` status and counted in ``checks_failed``.
    - Dead letter ratio: the share of ``pipeline_events`` rows whose topic is
      ``events.deadletter``, or ``None`` when it cannot be computed.

    Returns:
        ``{"checks": <per-check results>, "checks_failed": <int>}``.
    """
    tables = (
        "orders_v2",
        "users_enriched",
        "products_current",
        "sessions_aggregated",
    )
    checks = {}

    conn = _get_conn()
    try:
        for table in tables:
            try:
                row = conn.execute(
                    f"SELECT COUNT(*) FROM {table}"  # nosec B608 - table comes from the fixed health-check allowlist
                ).fetchone()
                checks[table] = {"rows": row[0], "status": "ok"}
            except duckdb.Error as e:
                checks[table] = {"rows": 0, "status": f"error: {e}"}

        # Dead letter ratio
        try:
            total = conn.execute("SELECT COUNT(*) FROM pipeline_events").fetchone()[0]
            dead = conn.execute(
                "SELECT COUNT(*) FROM pipeline_events WHERE topic = 'events.deadletter'"
            ).fetchone()[0]
            dl_ratio = dead / total if total > 0 else 0.0
            checks["dead_letter_ratio"] = {"value": round(dl_ratio, 4)}
        except duckdb.Error as exc:
            # Keep the report best-effort, but leave a trace of why the ratio is missing.
            context.log.warning("Dead letter ratio unavailable: %s", exc)
            checks["dead_letter_ratio"] = {"value": None}
    finally:
        conn.close()

    # Only the per-table entries carry a "status"; the ratio entry never counts as failed.
    failed = sum(1 for v in checks.values() if v.get("status", "").startswith("error"))
    context.log.info("Quality report: %s", checks)
    return {"checks": checks, "checks_failed": failed}
171
+
172
+
173
+ # ── Job & Schedule ──────────────────────────────────────────────
174
+
175
# Every asset defined in this module, in declaration order. Shared by the job
# selection and the Definitions registry below.
_DAILY_ASSETS = [
    iceberg_snapshot_expiry,
    iceberg_compaction,
    daily_user_profiles,
    daily_product_metrics,
    daily_quality_report,
]

daily_maintenance_job = define_asset_job(
    name="daily_maintenance",
    selection=list(_DAILY_ASSETS),
)

daily_schedule = ScheduleDefinition(
    job=daily_maintenance_job,
    cron_schedule="0 2 * * *",  # 02:00 UTC daily
)

defs = Definitions(
    assets=list(_DAILY_ASSETS),
    jobs=[daily_maintenance_job],
    schedules=[daily_schedule],
)
File without changes
@@ -0,0 +1,250 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import os
5
+ from collections.abc import Callable
6
+ from contextlib import nullcontext
7
+ from dataclasses import dataclass
8
+ from datetime import UTC, datetime
9
+ from uuid import uuid4
10
+
11
+ import structlog
12
+ from opentelemetry import trace
13
+
14
+ from src.processing.outbox import OutboxProcessor, ensure_outbox_table
15
+ from src.processing.tracing import inject_trace_to_kafka_headers, telemetry_disabled
16
+ from src.quality.validators.schema_validator import validate_event
17
+ from src.quality.validators.semantic_validator import validate_semantics
18
+
19
# Kafka bootstrap servers used when a replayer is built without an explicit value.
DEFAULT_KAFKA_BOOTSTRAP = os.environ.get("KAFKA_BOOTSTRAP_SERVERS", "localhost:9092")
# Topic that replayed dead-letter events are queued for.
DEFAULT_REPLAY_TOPIC = "events.raw"
tracer = trace.get_tracer("agentflow.event_replayer")
22
+
23
+
24
class DeadLetterEventNotFoundError(LookupError):
    """Raised when no dead-letter row exists for the requested event id."""
26
+
27
+
28
class ReplayValidationError(ValueError):
    """Raised when a dead-letter payload cannot be accepted for replay."""
30
+
31
+
32
@dataclass
class ReplayResult:
    """Outcome of one replay attempt for a dead-letter event."""

    # Identifier of the dead-letter event that was replayed.
    event_id: str
    # "replayed" once the outbox entry was processed, otherwise "replay_pending".
    status: str
    # Retry counter after this attempt.
    retry_count: int
    # Timestamp recorded for this attempt.
    last_retried_at: datetime
    # Payload that was validated and queued (stored payload plus corrections).
    payload: dict
39
+
40
+
41
def ensure_dead_letter_table(conn) -> None:
    """Create ``dead_letter_events`` if needed and make sure it has ``tenant_id``.

    Runs two statements on ``conn``: a ``CREATE TABLE IF NOT EXISTS`` followed
    by an ``ADD COLUMN IF NOT EXISTS`` for ``tenant_id``, which covers tables
    created without that column.
    """
    create_table_sql = """
        CREATE TABLE IF NOT EXISTS dead_letter_events (
            event_id TEXT PRIMARY KEY,
            tenant_id TEXT DEFAULT 'default',
            event_type TEXT,
            payload JSON,
            failure_reason TEXT,
            failure_detail TEXT,
            received_at TIMESTAMP,
            retry_count INTEGER DEFAULT 0,
            last_retried_at TIMESTAMP,
            status TEXT DEFAULT 'failed'
        )
        """
    add_tenant_column_sql = (
        "ALTER TABLE dead_letter_events ADD COLUMN IF NOT EXISTS tenant_id TEXT DEFAULT 'default'"
    )
    for statement in (create_table_sql, add_tenant_column_sql):
        conn.execute(statement)
61
+
62
+
63
+ class EventReplayer:
64
+ def __init__(
65
+ self,
66
+ conn,
67
+ producer: Callable[[str, dict], None] | None = None,
68
+ bootstrap_servers: str | None = None,
69
+ ) -> None:
70
+ self._conn = conn
71
+ self._producer = producer or self._produce_to_kafka
72
+ self._bootstrap_servers = bootstrap_servers or DEFAULT_KAFKA_BOOTSTRAP
73
+ ensure_dead_letter_table(self._conn)
74
+ ensure_outbox_table(self._conn)
75
+
76
+ def replay(
77
+ self,
78
+ event_id: str,
79
+ corrected_payload: dict | None = None,
80
+ ) -> ReplayResult:
81
+ row = self._load_row(event_id)
82
+ payload = self._decoded_payload(row["payload"])
83
+ candidate = dict(payload)
84
+ if corrected_payload:
85
+ candidate.update(corrected_payload)
86
+ self._validate(candidate)
87
+
88
+ replayed_at = datetime.now(UTC)
89
+ retry_count = int(row["retry_count"] or 0) + 1
90
+ outbox_id = str(uuid4())
91
+ self._conn.execute("BEGIN TRANSACTION")
92
+ try:
93
+ self._conn.execute(
94
+ """
95
+ UPDATE dead_letter_events
96
+ SET payload = ?, status = 'replay_pending', retry_count = ?, last_retried_at = ?
97
+ WHERE event_id = ?
98
+ """,
99
+ [
100
+ json.dumps(candidate),
101
+ retry_count,
102
+ replayed_at,
103
+ event_id,
104
+ ],
105
+ )
106
+ self._conn.execute(
107
+ """
108
+ INSERT INTO outbox (
109
+ id,
110
+ event_id,
111
+ payload,
112
+ topic,
113
+ created_at,
114
+ sent_at,
115
+ status,
116
+ retry_count,
117
+ next_attempt_at,
118
+ last_error
119
+ )
120
+ VALUES (?, ?, ?, ?, ?, NULL, 'pending', 0, ?, NULL)
121
+ """,
122
+ [
123
+ outbox_id,
124
+ event_id,
125
+ json.dumps(candidate),
126
+ DEFAULT_REPLAY_TOPIC,
127
+ replayed_at,
128
+ replayed_at,
129
+ ],
130
+ )
131
+ self._conn.execute("COMMIT")
132
+ except Exception: # nosec B110 - rollback must preserve the original replay failure
133
+ # Transaction rollback must happen before unexpected errors propagate.
134
+ self._conn.execute("ROLLBACK")
135
+ raise
136
+ status = "replay_pending"
137
+ processor = OutboxProcessor(
138
+ conn=self._conn,
139
+ producer=self._producer,
140
+ bootstrap_servers=self._bootstrap_servers,
141
+ )
142
+ if processor.process_entry(outbox_id):
143
+ status = "replayed"
144
+ return ReplayResult(
145
+ event_id=event_id,
146
+ status=status,
147
+ retry_count=retry_count,
148
+ last_retried_at=replayed_at,
149
+ payload=candidate,
150
+ )
151
+
152
+ def dismiss(self, event_id: str) -> None:
153
+ self._load_row(event_id)
154
+ self._conn.execute(
155
+ "UPDATE dead_letter_events SET status = 'dismissed' WHERE event_id = ?",
156
+ [event_id],
157
+ )
158
+
159
+ def _load_row(self, event_id: str) -> dict:
160
+ row = self._conn.execute(
161
+ """
162
+ SELECT
163
+ event_id,
164
+ payload,
165
+ retry_count
166
+ FROM dead_letter_events
167
+ WHERE event_id = ?
168
+ """,
169
+ [event_id],
170
+ ).fetchone()
171
+ if row is None:
172
+ raise DeadLetterEventNotFoundError(event_id)
173
+ return {
174
+ "event_id": row[0],
175
+ "payload": row[1],
176
+ "retry_count": row[2],
177
+ }
178
+
179
+ def _decoded_payload(self, payload) -> dict:
180
+ if isinstance(payload, dict):
181
+ return payload
182
+ if isinstance(payload, str):
183
+ decoded = json.loads(payload)
184
+ if isinstance(decoded, dict):
185
+ return decoded
186
+ raise ReplayValidationError("Dead-letter payload must be a JSON object.")
187
+
188
+ def _validate(self, payload: dict) -> None:
189
+ schema_result = validate_event(payload)
190
+ if not schema_result.is_valid:
191
+ first_error = schema_result.errors[0] if schema_result.errors else {}
192
+ raise ReplayValidationError(first_error.get("msg", "Schema validation failed."))
193
+
194
+ semantic_result = validate_semantics(payload)
195
+ semantic_errors = [
196
+ issue.message for issue in semantic_result.issues if issue.severity == "error"
197
+ ]
198
+ if semantic_errors:
199
+ raise ReplayValidationError(semantic_errors[0])
200
+
201
+ def _produce_to_kafka(self, topic: str, payload: dict) -> None:
202
+ from confluent_kafka import Producer
203
+
204
+ delivery_errors: list[str] = []
205
+
206
+ def on_delivery(err, msg) -> None:
207
+ del msg
208
+ if err is not None:
209
+ delivery_errors.append(str(err))
210
+
211
+ producer = Producer({"bootstrap.servers": self._bootstrap_servers})
212
+ produce_span = (
213
+ tracer.start_as_current_span("kafka.produce")
214
+ if not telemetry_disabled()
215
+ else nullcontext()
216
+ )
217
+ with produce_span as span:
218
+ if span is not None and span.is_recording():
219
+ span.set_attribute("topic", topic)
220
+ event_type = payload.get("event_type")
221
+ if event_type is not None:
222
+ span.set_attribute("event_type", str(event_type))
223
+ tenant_id = payload.get("tenant_id") or structlog.contextvars.get_contextvars().get(
224
+ "tenant_id"
225
+ )
226
+ if tenant_id is not None:
227
+ span.set_attribute("tenant_id", str(tenant_id))
228
+ headers = inject_trace_to_kafka_headers({})
229
+ try:
230
+ producer.produce(
231
+ topic,
232
+ key=str(payload.get("event_id", "")),
233
+ value=json.dumps(payload).encode("utf-8"),
234
+ headers=list(headers.items()) or None,
235
+ on_delivery=on_delivery,
236
+ )
237
+ except TypeError as exc:
238
+ if "on_delivery" not in str(exc):
239
+ raise
240
+ producer.produce(
241
+ topic,
242
+ key=str(payload.get("event_id", "")),
243
+ value=json.dumps(payload).encode("utf-8"),
244
+ headers=list(headers.items()) or None,
245
+ )
246
+ remaining = producer.flush(10)
247
+ if delivery_errors:
248
+ raise RuntimeError(delivery_errors[0])
249
+ if remaining != 0:
250
+ raise RuntimeError(f"{remaining} Kafka message(s) were not delivered")
@@ -0,0 +1,55 @@
1
# syntax=docker/dockerfile:1
# The directive above must stay on the first line: the heredoc RUN at the end
# of this file needs a BuildKit Dockerfile frontend that supports here-documents.
FROM python:3.11-slim-bookworm

ARG FLINK_VERSION=1.19.1
ARG SCALA_VERSION=2.12
ARG PYFLINK_KAFKA_JAR_VERSION=3.3.0-1.19

# Install the JRE, unpack Flink, add the Kafka SQL connector, enable the S3
# filesystem plugin and install the Python dependencies in a single layer.
# /opt/java is linked to wherever apt installed the JVM so that JAVA_HOME does
# not depend on the CPU architecture in the directory name.
RUN apt-get update \
    && apt-get install -y --no-install-recommends bash ca-certificates curl openjdk-17-jre-headless \
    && ln -s "$(dirname "$(dirname "$(readlink -f "$(command -v java)")")")" /opt/java \
    && curl -fsSL "https://archive.apache.org/dist/flink/flink-${FLINK_VERSION}/flink-${FLINK_VERSION}-bin-scala_${SCALA_VERSION}.tgz" -o /tmp/flink.tgz \
    && tar -xzf /tmp/flink.tgz -C /opt \
    && ln -s "/opt/flink-${FLINK_VERSION}" /opt/flink \
    && curl -fsSL "https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-kafka/${PYFLINK_KAFKA_JAR_VERSION}/flink-sql-connector-kafka-${PYFLINK_KAFKA_JAR_VERSION}.jar" -o "/opt/flink/lib/flink-sql-connector-kafka-${PYFLINK_KAFKA_JAR_VERSION}.jar" \
    && mkdir -p /opt/flink/plugins/s3-fs-hadoop \
    && ln -s "/opt/flink/opt/flink-s3-fs-hadoop-${FLINK_VERSION}.jar" "/opt/flink/plugins/s3-fs-hadoop/flink-s3-fs-hadoop-${FLINK_VERSION}.jar" \
    && python -m pip install --no-cache-dir --upgrade pip \
    && python -m pip install --no-cache-dir "apache-flink==${FLINK_VERSION}" confluent-kafka pydantic structlog \
    && rm -rf /var/lib/apt/lists/* /tmp/flink.tgz

WORKDIR /opt/agentflow
ENV FLINK_HOME=/opt/flink
ENV JAVA_HOME=/opt/java
ENV PATH="/opt/flink/bin:${PATH}"
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1
ENV PYFLINK_CLIENT_EXECUTABLE=/usr/local/bin/python
ENV PYTHONPATH=/opt/agentflow

COPY . /opt/agentflow/src

# sitecustomize.py sits on PYTHONPATH and wraps two PyFlink factories so they
# also accept datetime.timedelta arguments.
RUN cat <<'PY' > /opt/agentflow/sitecustomize.py
from datetime import timedelta as _timedelta

try:
    from pyflink.common import WatermarkStrategy
    from pyflink.common.time import Duration, Time
    from pyflink.datastream.state import StateTtlConfig
except Exception:
    # Best effort: leave the interpreter untouched when PyFlink cannot be imported.
    pass
else:
    _original_watermark = WatermarkStrategy.for_bounded_out_of_orderness
    _original_ttl_builder = StateTtlConfig.new_builder

    def _patched_watermark(max_out_of_orderness):
        if isinstance(max_out_of_orderness, _timedelta):
            max_out_of_orderness = Duration.of_millis(int(max_out_of_orderness.total_seconds() * 1000))
        return _original_watermark(max_out_of_orderness)

    def _patched_ttl_builder(ttl):
        if isinstance(ttl, _timedelta):
            ttl = Time.milliseconds(int(ttl.total_seconds() * 1000))
        return _original_ttl_builder(ttl)

    WatermarkStrategy.for_bounded_out_of_orderness = staticmethod(_patched_watermark)
    StateTtlConfig.new_builder = staticmethod(_patched_ttl_builder)
PY
File without changes
@@ -0,0 +1,32 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ from typing import Any
5
+
6
+
7
def _checkpoint_constants() -> tuple[object, object]:
    """Return the checkpointing mode and externalized-cleanup policy to use.

    Yields the PyFlink enum members when PyFlink is importable, and their
    names as plain strings when the module is not installed.
    """
    try:
        from pyflink.datastream import CheckpointingMode
        from pyflink.datastream.checkpoint_config import ExternalizedCheckpointCleanup
    except ModuleNotFoundError:
        mode, cleanup = "EXACTLY_ONCE", "RETAIN_ON_CANCELLATION"
    else:
        mode = CheckpointingMode.EXACTLY_ONCE
        cleanup = ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION
    return mode, cleanup
18
+
19
+
20
def configure_checkpointing(env: Any) -> None:
    """Enable and tune checkpointing on a stream execution environment.

    Applies a 60 s checkpoint interval, the mode and cleanup policy returned
    by ``_checkpoint_constants``, a 30 s minimum pause, a 120 s timeout, at
    most one concurrent checkpoint, and a storage location read from
    ``FLINK_CHECKPOINT_DIR`` (default ``file:///tmp/flink-checkpoints``).

    Args:
        env: Object exposing ``enable_checkpointing`` and
            ``get_checkpoint_config``.
    """
    checkpoint_mode, cleanup_mode = _checkpoint_constants()

    env.enable_checkpointing(60_000)
    config = env.get_checkpoint_config()
    config.set_checkpointing_mode(checkpoint_mode)
    config.set_min_pause_between_checkpoints(30_000)
    config.set_checkpoint_timeout(120_000)
    config.set_max_concurrent_checkpoints(1)
    config.enable_externalized_checkpoints(cleanup_mode)

    checkpoint_dir = os.getenv("FLINK_CHECKPOINT_DIR", "file:///tmp/flink-checkpoints")
    # PyFlink's set_checkpoint_storage expects a CheckpointStorage object; plain
    # path strings go through set_checkpoint_storage_dir. Keep the original call
    # as a fallback for config objects that only expose set_checkpoint_storage.
    set_storage_dir = getattr(config, "set_checkpoint_storage_dir", None)
    if callable(set_storage_dir):
        set_storage_dir(checkpoint_dir)
    else:
        config.set_checkpoint_storage(checkpoint_dir)