agentflow-runtime 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100) hide show
  1. agentflow_runtime-1.1.0.dist-info/METADATA +55 -0
  2. agentflow_runtime-1.1.0.dist-info/RECORD +100 -0
  3. agentflow_runtime-1.1.0.dist-info/WHEEL +4 -0
  4. agentflow_runtime-1.1.0.dist-info/licenses/LICENSE +21 -0
  5. src/__init__.py +0 -0
  6. src/constants.py +3 -0
  7. src/ingestion/__init__.py +0 -0
  8. src/ingestion/cdc/__init__.py +5 -0
  9. src/ingestion/cdc/normalizer.py +186 -0
  10. src/ingestion/connectors/__init__.py +0 -0
  11. src/ingestion/connectors/mysql_cdc.py +63 -0
  12. src/ingestion/connectors/postgres_cdc.py +68 -0
  13. src/ingestion/producers/__init__.py +0 -0
  14. src/ingestion/producers/event_producer.py +237 -0
  15. src/ingestion/schemas/__init__.py +0 -0
  16. src/ingestion/schemas/events.py +147 -0
  17. src/ingestion/tenant_router.py +80 -0
  18. src/logger.py +41 -0
  19. src/orchestration/__init__.py +0 -0
  20. src/orchestration/dags/__init__.py +0 -0
  21. src/orchestration/dags/daily_batch.py +201 -0
  22. src/processing/__init__.py +0 -0
  23. src/processing/event_replayer.py +250 -0
  24. src/processing/flink_jobs/Dockerfile +55 -0
  25. src/processing/flink_jobs/__init__.py +0 -0
  26. src/processing/flink_jobs/checkpointing.py +32 -0
  27. src/processing/flink_jobs/session_aggregation.py +212 -0
  28. src/processing/flink_jobs/session_aggregator.py +199 -0
  29. src/processing/flink_jobs/stream_processor.py +316 -0
  30. src/processing/iceberg_sink.py +348 -0
  31. src/processing/local_pipeline.py +452 -0
  32. src/processing/outbox.py +273 -0
  33. src/processing/tracing.py +36 -0
  34. src/processing/transformations/__init__.py +0 -0
  35. src/processing/transformations/enrichment.py +125 -0
  36. src/quality/__init__.py +0 -0
  37. src/quality/monitors/__init__.py +0 -0
  38. src/quality/monitors/freshness_monitor.py +166 -0
  39. src/quality/monitors/metrics_collector.py +367 -0
  40. src/quality/validators/__init__.py +0 -0
  41. src/quality/validators/schema_validator.py +119 -0
  42. src/quality/validators/semantic_validator.py +202 -0
  43. src/serving/__init__.py +0 -0
  44. src/serving/api/__init__.py +0 -0
  45. src/serving/api/alert_dispatcher.py +51 -0
  46. src/serving/api/alerts/__init__.py +38 -0
  47. src/serving/api/alerts/dispatcher.py +299 -0
  48. src/serving/api/alerts/escalation.py +290 -0
  49. src/serving/api/alerts/evaluator.py +81 -0
  50. src/serving/api/alerts/history.py +115 -0
  51. src/serving/api/analytics.py +543 -0
  52. src/serving/api/auth/__init__.py +46 -0
  53. src/serving/api/auth/key_rotation.py +400 -0
  54. src/serving/api/auth/manager.py +406 -0
  55. src/serving/api/auth/middleware.py +331 -0
  56. src/serving/api/main.py +390 -0
  57. src/serving/api/middleware/logging.py +41 -0
  58. src/serving/api/middleware/tracing.py +51 -0
  59. src/serving/api/rate_limiter.py +76 -0
  60. src/serving/api/routers/__init__.py +0 -0
  61. src/serving/api/routers/admin.py +150 -0
  62. src/serving/api/routers/admin_ui.py +93 -0
  63. src/serving/api/routers/agent_query.py +639 -0
  64. src/serving/api/routers/alerts.py +134 -0
  65. src/serving/api/routers/batch.py +231 -0
  66. src/serving/api/routers/contracts.py +98 -0
  67. src/serving/api/routers/deadletter.py +337 -0
  68. src/serving/api/routers/lineage.py +218 -0
  69. src/serving/api/routers/search.py +103 -0
  70. src/serving/api/routers/slo.py +231 -0
  71. src/serving/api/routers/stream.py +141 -0
  72. src/serving/api/routers/webhooks.py +93 -0
  73. src/serving/api/security.py +83 -0
  74. src/serving/api/telemetry.py +66 -0
  75. src/serving/api/templates/admin.html +214 -0
  76. src/serving/api/versioning.py +328 -0
  77. src/serving/api/webhook_dispatcher.py +423 -0
  78. src/serving/backends/__init__.py +117 -0
  79. src/serving/backends/clickhouse_backend.py +310 -0
  80. src/serving/backends/duckdb_backend.py +268 -0
  81. src/serving/cache.py +169 -0
  82. src/serving/db_pool.py +105 -0
  83. src/serving/masking.py +122 -0
  84. src/serving/semantic_layer/__init__.py +0 -0
  85. src/serving/semantic_layer/catalog.py +177 -0
  86. src/serving/semantic_layer/contract_registry.py +258 -0
  87. src/serving/semantic_layer/entity_type_registry.py +107 -0
  88. src/serving/semantic_layer/nl_engine.py +189 -0
  89. src/serving/semantic_layer/query/__init__.py +3 -0
  90. src/serving/semantic_layer/query/contracts.py +47 -0
  91. src/serving/semantic_layer/query/engine.py +81 -0
  92. src/serving/semantic_layer/query/entity_queries.py +221 -0
  93. src/serving/semantic_layer/query/metric_queries.py +84 -0
  94. src/serving/semantic_layer/query/nl_queries.py +305 -0
  95. src/serving/semantic_layer/query/sql_builder.py +113 -0
  96. src/serving/semantic_layer/query/sql_guard.py +3 -0
  97. src/serving/semantic_layer/query_engine.py +5 -0
  98. src/serving/semantic_layer/schema_evolution.py +175 -0
  99. src/serving/semantic_layer/search_index.py +337 -0
  100. src/serving/semantic_layer/sql_guard.py +56 -0
@@ -0,0 +1,212 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import os
5
+ from collections.abc import Mapping
6
+ from dataclasses import dataclass
7
+ from datetime import UTC, datetime, timedelta
8
+ from typing import TYPE_CHECKING, Any
9
+
10
+ from src.processing.flink_jobs.checkpointing import configure_checkpointing
11
+
12
+ if TYPE_CHECKING:
13
+ from pyflink.datastream import StreamExecutionEnvironment
14
+
15
+
16
# Default inactivity gap; SessionAggregator uses it as the default `session_gap`.
SESSION_GAP = timedelta(minutes=30)
17
+
18
+
19
@dataclass
class _SessionState:
    """Mutable running totals for a single open session."""

    start_time: datetime
    last_time: datetime
    event_count: int
    total_value: float

    def to_snapshot(self) -> dict[str, object]:
        """Return a JSON-friendly dict; both timestamps become ISO-8601 strings."""
        exported: dict[str, object] = {}
        exported["start_time"] = self.start_time.isoformat()
        exported["last_time"] = self.last_time.isoformat()
        exported["event_count"] = self.event_count
        exported["total_value"] = self.total_value
        return exported

    @classmethod
    def from_snapshot(cls, snapshot: Mapping[str, object]) -> _SessionState:
        """Rebuild a state object from a mapping shaped like ``to_snapshot`` output."""
        started = _parse_timestamp(snapshot["start_time"])
        ended = _parse_timestamp(snapshot["last_time"])
        count = int(snapshot["event_count"])
        total = float(snapshot["total_value"])
        return cls(
            start_time=started,
            last_time=ended,
            event_count=count,
            total_value=total,
        )
42
+
43
+
44
+ def _parse_timestamp(value: object) -> datetime:
45
+ if not isinstance(value, str):
46
+ raise TypeError("timestamp must be an ISO-8601 string")
47
+
48
+ parsed = datetime.fromisoformat(value)
49
+ if parsed.tzinfo is None:
50
+ return parsed.replace(tzinfo=UTC)
51
+ return parsed
52
+
53
+
54
def _closed_session(user_id: str, state: _SessionState) -> dict[str, object]:
    """Build the output record for a session that has ended.

    Timestamps are emitted as ISO-8601 strings and ``status`` is always ``"closed"``.
    """
    record: dict[str, object] = {"user_id": user_id}
    record["session_start"] = state.start_time.isoformat()
    record["session_end"] = state.last_time.isoformat()
    record["event_count"] = state.event_count
    record["total_value"] = state.total_value
    record["status"] = "closed"
    return record
63
+
64
+
65
class SessionAggregator:
    """In-memory gap-based sessionizer that keeps one open session per user id."""

    def __init__(self, session_gap: timedelta = SESSION_GAP):
        self._session_gap = session_gap
        self._state: dict[str, _SessionState] = {}

    def process_event(self, event: Mapping[str, object]) -> list[dict[str, object]]:
        """Fold one event into its user's session.

        Reads ``user_id`` and ``timestamp`` (both required) and ``value``
        (optional; a missing or falsy value counts as 0.0).

        Returns:
            A one-element list holding the closed-session record when the event
            is more than the session gap after the user's last event (a new
            session is then started from this event); otherwise an empty list.
        """
        user_id = str(event["user_id"])
        event_time = _parse_timestamp(event["timestamp"])
        amount = float(event.get("value", 0.0) or 0.0)

        existing = self._state.get(user_id)
        continues = existing is not None and not (
            event_time - existing.last_time > self._session_gap
        )
        if continues:
            # Same session: widen its bounds (events may be out of order) and accumulate.
            existing.start_time = min(existing.start_time, event_time)
            existing.last_time = max(existing.last_time, event_time)
            existing.event_count += 1
            existing.total_value += amount
            return []

        # Either the first event for this user or the gap was exceeded: start fresh.
        self._state[user_id] = _SessionState(
            start_time=event_time,
            last_time=event_time,
            event_count=1,
            total_value=amount,
        )
        if existing is None:
            return []
        return [_closed_session(user_id, existing)]

    def snapshot(self) -> dict[str, dict[str, object]]:
        """Export every open session as a plain dict keyed by user id."""
        exported: dict[str, dict[str, object]] = {}
        for user_id, state in self._state.items():
            exported[user_id] = state.to_snapshot()
        return exported

    def restore(self, snapshot: Mapping[str, Mapping[str, object]]) -> None:
        """Replace all in-memory state with the sessions described by ``snapshot``."""
        rebuilt: dict[str, _SessionState] = {}
        for user_id, saved in snapshot.items():
            rebuilt[str(user_id)] = _SessionState.from_snapshot(saved)
        # Assign only after every entry parsed, so a bad snapshot leaves state untouched.
        self._state = rebuilt
108
+
109
+
110
def build_session_pipeline(
    env: StreamExecutionEnvironment,
    source_topic: str,
    sink_topic: str,
) -> Any:
    """Wire a Kafka -> session aggregation -> Kafka pipeline onto ``env``.

    Args:
        env: Flink execution environment; checkpointing is configured on it via
            ``configure_checkpointing`` before the pipeline is attached.
        source_topic: Kafka topic whose string values are parsed as JSON events.
        sink_topic: Kafka topic that receives closed-session JSON strings.

    Returns:
        The stream produced by the keyed process step (already attached to the sink).

    Raises:
        RuntimeError: if PyFlink cannot be imported.
    """
    try:
        from pyflink.common import Types
        from pyflink.common.serialization import SimpleStringSchema
        from pyflink.common.watermark_strategy import WatermarkStrategy
        from pyflink.datastream.connectors.kafka import (
            KafkaOffsetsInitializer,
            KafkaRecordSerializationSchema,
            KafkaSink,
            KafkaSource,
        )
        from pyflink.datastream.functions import KeyedProcessFunction
        from pyflink.datastream.state import MapStateDescriptor
    except ModuleNotFoundError as exc:
        raise RuntimeError(
            "PyFlink is not installed. Install the project with the 'flink' extra."
        ) from exc

    class FlinkSessionAggregator(KeyedProcessFunction):
        """Runs ``SessionAggregator`` against a Flink map state of JSON snapshots."""

        def open(self, runtime_context):
            self.state = runtime_context.get_map_state(
                MapStateDescriptor("session_state", Types.STRING(), Types.STRING())
            )

        def process_element(self, raw_event, ctx):
            payload = json.loads(raw_event)
            uid = str(payload["user_id"])
            worker = SessionAggregator()

            # Rehydrate the saved session (if any) into the throwaway aggregator.
            if self.state.contains(uid):
                saved = json.loads(self.state.get(uid))
                worker.restore({uid: saved})

            for closed in worker.process_event(payload):
                yield json.dumps(closed)

            # Persist the (possibly new) open session after emitting closed ones.
            self.state.put(uid, json.dumps(worker.snapshot()[uid]))

    configure_checkpointing(env)

    brokers = os.getenv("KAFKA_BOOTSTRAP_SERVERS", "localhost:9092")

    kafka_source = (
        KafkaSource.builder()
        .set_bootstrap_servers(brokers)
        .set_topics(source_topic)
        .set_group_id("agentflow-session-aggregation")
        .set_starting_offsets(KafkaOffsetsInitializer.earliest())
        .set_value_only_deserializer(SimpleStringSchema())
        .build()
    )

    raw_events = env.from_source(
        kafka_source,
        WatermarkStrategy.for_monotonous_timestamps(),
        "Session Aggregation Source",
    )
    keyed_events = raw_events.key_by(lambda raw: json.loads(raw)["user_id"])
    sessions = keyed_events.process(FlinkSessionAggregator(), output_type=Types.STRING())

    sink_builder = KafkaSink.builder().set_bootstrap_servers(brokers)
    record_serializer = (
        KafkaRecordSerializationSchema.builder()
        .set_topic(sink_topic)
        .set_value_serialization_schema(SimpleStringSchema())
        .build()
    )
    kafka_sink = sink_builder.set_record_serializer(record_serializer).build()

    sessions.sink_to(kafka_sink)
    return sessions
192
+
193
+
194
def main() -> None:
    """Build the session pipeline from environment settings and run the job.

    Topics come from ``FLINK_SOURCE_TOPIC`` and ``FLINK_SESSION_SINK_TOPIC``,
    defaulting to ``events.validated`` and ``sessions.aggregated``.

    Raises:
        RuntimeError: if PyFlink cannot be imported.
    """
    try:
        from pyflink.datastream import StreamExecutionEnvironment
    except ModuleNotFoundError as exc:
        raise RuntimeError(
            "PyFlink is not installed. Install the project with the 'flink' extra."
        ) from exc

    env = StreamExecutionEnvironment.get_execution_environment()
    topic_in = os.getenv("FLINK_SOURCE_TOPIC", "events.validated")
    topic_out = os.getenv("FLINK_SESSION_SINK_TOPIC", "sessions.aggregated")
    build_session_pipeline(env=env, source_topic=topic_in, sink_topic=topic_out)
    env.execute("agentflow-session-aggregation")
209
+
210
+
211
# Run the job only when this file is executed as a script.
if __name__ == "__main__":
    main()
@@ -0,0 +1,199 @@
1
+ """Session aggregation Flink job: builds user sessions from clickstream events.
2
+
3
+ Groups clickstream events into sessions using a 30-minute gap-based window.
4
+ Outputs session summaries with: duration, page count, conversion signals, funnel stage.
5
+
6
+ Submit with:
7
+ flink run -py session_aggregator.py
8
+ """
9
+
10
+ import json
11
+ import os
12
+ from datetime import UTC, datetime, timedelta
13
+
14
+ from pyflink.common import Types, WatermarkStrategy
15
+ from pyflink.common.serialization import SimpleStringSchema
16
+ from pyflink.common.watermark_strategy import TimestampAssigner
17
+ from pyflink.datastream import StreamExecutionEnvironment
18
+ from pyflink.datastream.connectors.kafka import (
19
+ KafkaOffsetsInitializer,
20
+ KafkaRecordSerializationSchema,
21
+ KafkaSink,
22
+ KafkaSource,
23
+ )
24
+ from pyflink.datastream.functions import KeyedProcessFunction
25
+ from pyflink.datastream.state import ValueStateDescriptor
26
+
27
# Inactivity gap, in minutes, used to derive SESSION_GAP_MS.
SESSION_GAP_MINUTES = 30
# Same gap in milliseconds; added to an event timestamp to compute the gap-expiry timer.
SESSION_GAP_MS = SESSION_GAP_MINUTES * 60 * 1000
# Out-of-orderness bound passed to the watermark strategy in build_pipeline().
WATERMARK_OUT_OF_ORDERNESS_SECONDS = 10
# Interval handed to env.enable_checkpointing() in build_pipeline().
CHECKPOINT_INTERVAL_MS = 30_000
31
+
32
+
33
class ClickTimestampAssigner(TimestampAssigner):
    """Derives event time (epoch milliseconds) from each record's ``timestamp`` field."""

    def extract_timestamp(self, value, record_timestamp):
        """Return the event's ISO-8601 ``timestamp`` as epoch milliseconds.

        Naive timestamps are interpreted as UTC. If the record cannot be decoded
        (invalid JSON, a payload that is not a JSON object, a missing or
        non-string ``timestamp``, or unparseable timestamp text),
        ``record_timestamp`` is returned instead.
        """
        try:
            event = json.loads(value)
            ts = datetime.fromisoformat(event["timestamp"])
            if ts.tzinfo is None:
                ts = ts.replace(tzinfo=UTC)
            return int(ts.timestamp() * 1000)
        except (json.JSONDecodeError, KeyError, ValueError, TypeError):
            # TypeError: non-string timestamp (e.g. null or a number), a payload
            # that is a JSON list/scalar, or a non-string record value. Without
            # it, one malformed record would raise instead of falling back.
            return record_timestamp
43
+
44
+
45
class SessionWindowFunction(KeyedProcessFunction):
    """Accumulates clickstream events into sessions using event-time timers.

    State per key:
        - session_data: JSON with accumulated pages, first/last event time, event count
        - timer_ts: timestamp of the currently registered gap-expiry timer

    When the timer fires (event time reached ``SESSION_GAP_MS`` past the latest
    event of the session), a session summary is emitted and the state is cleared.
    """

    def open(self, runtime_context):
        """Create the two per-key value states used by this function."""
        self.session_state = runtime_context.get_state(
            ValueStateDescriptor("session_data", Types.STRING())
        )
        self.timer_state = runtime_context.get_state(ValueStateDescriptor("timer_ts", Types.LONG()))

    def process_element(self, value, ctx: KeyedProcessFunction.Context):
        """Fold one JSON click event into the session and re-arm the gap timer.

        Emits nothing; summaries are produced by ``on_timer``.
        """
        event = json.loads(value)
        event_ts = ctx.timestamp()

        current = self.session_state.value()
        if current:
            session = json.loads(current)
        else:
            session = {
                "session_id": event.get("session_id", ctx.get_current_key()),
                "user_id": event.get("user_id"),
                "first_event_ts": event_ts,
                "last_event_ts": event_ts,
                "event_count": 0,
                "pages": [],
                "has_add_to_cart": False,
                "has_checkout": False,
                "product_ids_viewed": [],
            }

        # Events can arrive out of order, so widen the session bounds instead of
        # overwriting them; otherwise a late event moves last_event_ts backwards
        # and can yield a negative duration.
        session["first_event_ts"] = min(session["first_event_ts"], event_ts)
        session["last_event_ts"] = max(session["last_event_ts"], event_ts)
        session["event_count"] += 1

        # `or ""` guards against an explicit null page_url, which would make the
        # substring test below raise TypeError.
        page = event.get("page_url") or ""
        if page and page not in session["pages"]:
            session["pages"].append(page)

        if event.get("event_type") == "add_to_cart":
            session["has_add_to_cart"] = True
        if "/checkout" in page:
            session["has_checkout"] = True

        pid = event.get("product_id")
        if pid and pid not in session["product_ids_viewed"]:
            session["product_ids_viewed"].append(pid)

        self.session_state.update(json.dumps(session))

        # Reset gap timer relative to the latest event seen so far.
        old_timer = self.timer_state.value()
        if old_timer is not None:
            ctx.timer_service().delete_event_time_timer(old_timer)

        new_timer = session["last_event_ts"] + SESSION_GAP_MS
        ctx.timer_service().register_event_time_timer(new_timer)
        self.timer_state.update(new_timer)

    @staticmethod
    def _funnel_stage(session):
        """Return the deepest funnel stage reached by the accumulated session."""
        if session["has_checkout"]:
            return "checkout"
        if session["has_add_to_cart"]:
            return "add_to_cart"
        if session["product_ids_viewed"]:
            return "product_view"
        if session["event_count"] > 1:
            return "browse"
        return "bounce"

    def on_timer(self, timestamp, ctx: KeyedProcessFunction.OnTimerContext):
        """Session gap expired — emit session summary."""
        current = self.session_state.value()
        if not current:
            return

        session = json.loads(current)
        duration_ms = session["last_event_ts"] - session["first_event_ts"]

        summary = {
            "session_id": session["session_id"],
            "user_id": session["user_id"],
            "started_at": datetime.fromtimestamp(
                session["first_event_ts"] / 1000, tz=UTC
            ).isoformat(),
            "ended_at": datetime.fromtimestamp(session["last_event_ts"] / 1000, tz=UTC).isoformat(),
            "duration_seconds": duration_ms / 1000,
            "event_count": session["event_count"],
            "unique_pages": len(session["pages"]),
            "products_viewed": len(session["product_ids_viewed"]),
            "funnel_stage": self._funnel_stage(session),
            "is_conversion": session["has_checkout"],
        }

        # Emit
        yield json.dumps(summary)

        # Clear state so the next event for this key starts a new session.
        self.session_state.clear()
        self.timer_state.clear()
151
+
152
+
153
def build_pipeline():
    """Assemble the clickstream sessionization job and return its environment.

    Reads ``FLINK_PARALLELISM`` (default 2) and ``KAFKA_BOOTSTRAP_SERVERS``
    (default ``localhost:9092``) from the environment, consumes ``clicks.raw``,
    keys by ``session_id`` (``"unknown"`` when absent) and writes summaries to
    ``sessions.aggregated``.

    Returns:
        The configured ``StreamExecutionEnvironment``; the caller starts the job
        with ``execute()``.
    """
    # Local import keeps this fix independent of the module-level import list.
    from pyflink.common import Duration

    env = StreamExecutionEnvironment.get_execution_environment()
    env.enable_checkpointing(CHECKPOINT_INTERVAL_MS)
    env.set_parallelism(int(os.getenv("FLINK_PARALLELISM", "2")))

    bootstrap_servers = os.getenv("KAFKA_BOOTSTRAP_SERVERS", "localhost:9092")

    source = (
        KafkaSource.builder()
        .set_bootstrap_servers(bootstrap_servers)
        .set_topics("clicks.raw")
        .set_group_id("agentflow-session-aggregator")
        .set_starting_offsets(KafkaOffsetsInitializer.earliest())
        .set_value_only_deserializer(SimpleStringSchema())
        .build()
    )

    # PyFlink expects a pyflink.common.Duration here, not a datetime.timedelta.
    watermark_strategy = WatermarkStrategy.for_bounded_out_of_orderness(
        Duration.of_seconds(WATERMARK_OUT_OF_ORDERNESS_SECONDS)
    ).with_timestamp_assigner(ClickTimestampAssigner())

    stream = env.from_source(source, watermark_strategy, "clicks-source")

    sessions = stream.key_by(lambda x: json.loads(x).get("session_id", "unknown")).process(
        SessionWindowFunction(), output_type=Types.STRING()
    )

    sink = (
        KafkaSink.builder()
        .set_bootstrap_servers(bootstrap_servers)
        .set_record_serializer(
            KafkaRecordSerializationSchema.builder()
            .set_topic("sessions.aggregated")
            .set_value_serialization_schema(SimpleStringSchema())
            .build()
        )
        .build()
    )

    sessions.sink_to(sink)

    return env
195
+
196
+
197
# Build the job and submit it when this file is run as a script.
if __name__ == "__main__":
    pipeline = build_pipeline()
    pipeline.execute("agentflow-session-aggregator")