agentflow-runtime 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentflow_runtime-1.1.0.dist-info/METADATA +55 -0
- agentflow_runtime-1.1.0.dist-info/RECORD +100 -0
- agentflow_runtime-1.1.0.dist-info/WHEEL +4 -0
- agentflow_runtime-1.1.0.dist-info/licenses/LICENSE +21 -0
- src/__init__.py +0 -0
- src/constants.py +3 -0
- src/ingestion/__init__.py +0 -0
- src/ingestion/cdc/__init__.py +5 -0
- src/ingestion/cdc/normalizer.py +186 -0
- src/ingestion/connectors/__init__.py +0 -0
- src/ingestion/connectors/mysql_cdc.py +63 -0
- src/ingestion/connectors/postgres_cdc.py +68 -0
- src/ingestion/producers/__init__.py +0 -0
- src/ingestion/producers/event_producer.py +237 -0
- src/ingestion/schemas/__init__.py +0 -0
- src/ingestion/schemas/events.py +147 -0
- src/ingestion/tenant_router.py +80 -0
- src/logger.py +41 -0
- src/orchestration/__init__.py +0 -0
- src/orchestration/dags/__init__.py +0 -0
- src/orchestration/dags/daily_batch.py +201 -0
- src/processing/__init__.py +0 -0
- src/processing/event_replayer.py +250 -0
- src/processing/flink_jobs/Dockerfile +55 -0
- src/processing/flink_jobs/__init__.py +0 -0
- src/processing/flink_jobs/checkpointing.py +32 -0
- src/processing/flink_jobs/session_aggregation.py +212 -0
- src/processing/flink_jobs/session_aggregator.py +199 -0
- src/processing/flink_jobs/stream_processor.py +316 -0
- src/processing/iceberg_sink.py +348 -0
- src/processing/local_pipeline.py +452 -0
- src/processing/outbox.py +273 -0
- src/processing/tracing.py +36 -0
- src/processing/transformations/__init__.py +0 -0
- src/processing/transformations/enrichment.py +125 -0
- src/quality/__init__.py +0 -0
- src/quality/monitors/__init__.py +0 -0
- src/quality/monitors/freshness_monitor.py +166 -0
- src/quality/monitors/metrics_collector.py +367 -0
- src/quality/validators/__init__.py +0 -0
- src/quality/validators/schema_validator.py +119 -0
- src/quality/validators/semantic_validator.py +202 -0
- src/serving/__init__.py +0 -0
- src/serving/api/__init__.py +0 -0
- src/serving/api/alert_dispatcher.py +51 -0
- src/serving/api/alerts/__init__.py +38 -0
- src/serving/api/alerts/dispatcher.py +299 -0
- src/serving/api/alerts/escalation.py +290 -0
- src/serving/api/alerts/evaluator.py +81 -0
- src/serving/api/alerts/history.py +115 -0
- src/serving/api/analytics.py +543 -0
- src/serving/api/auth/__init__.py +46 -0
- src/serving/api/auth/key_rotation.py +400 -0
- src/serving/api/auth/manager.py +406 -0
- src/serving/api/auth/middleware.py +331 -0
- src/serving/api/main.py +390 -0
- src/serving/api/middleware/logging.py +41 -0
- src/serving/api/middleware/tracing.py +51 -0
- src/serving/api/rate_limiter.py +76 -0
- src/serving/api/routers/__init__.py +0 -0
- src/serving/api/routers/admin.py +150 -0
- src/serving/api/routers/admin_ui.py +93 -0
- src/serving/api/routers/agent_query.py +639 -0
- src/serving/api/routers/alerts.py +134 -0
- src/serving/api/routers/batch.py +231 -0
- src/serving/api/routers/contracts.py +98 -0
- src/serving/api/routers/deadletter.py +337 -0
- src/serving/api/routers/lineage.py +218 -0
- src/serving/api/routers/search.py +103 -0
- src/serving/api/routers/slo.py +231 -0
- src/serving/api/routers/stream.py +141 -0
- src/serving/api/routers/webhooks.py +93 -0
- src/serving/api/security.py +83 -0
- src/serving/api/telemetry.py +66 -0
- src/serving/api/templates/admin.html +214 -0
- src/serving/api/versioning.py +328 -0
- src/serving/api/webhook_dispatcher.py +423 -0
- src/serving/backends/__init__.py +117 -0
- src/serving/backends/clickhouse_backend.py +310 -0
- src/serving/backends/duckdb_backend.py +268 -0
- src/serving/cache.py +169 -0
- src/serving/db_pool.py +105 -0
- src/serving/masking.py +122 -0
- src/serving/semantic_layer/__init__.py +0 -0
- src/serving/semantic_layer/catalog.py +177 -0
- src/serving/semantic_layer/contract_registry.py +258 -0
- src/serving/semantic_layer/entity_type_registry.py +107 -0
- src/serving/semantic_layer/nl_engine.py +189 -0
- src/serving/semantic_layer/query/__init__.py +3 -0
- src/serving/semantic_layer/query/contracts.py +47 -0
- src/serving/semantic_layer/query/engine.py +81 -0
- src/serving/semantic_layer/query/entity_queries.py +221 -0
- src/serving/semantic_layer/query/metric_queries.py +84 -0
- src/serving/semantic_layer/query/nl_queries.py +305 -0
- src/serving/semantic_layer/query/sql_builder.py +113 -0
- src/serving/semantic_layer/query/sql_guard.py +3 -0
- src/serving/semantic_layer/query_engine.py +5 -0
- src/serving/semantic_layer/schema_evolution.py +175 -0
- src/serving/semantic_layer/search_index.py +337 -0
- src/serving/semantic_layer/sql_guard.py +56 -0
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import os
|
|
5
|
+
from collections.abc import Mapping
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from datetime import UTC, datetime, timedelta
|
|
8
|
+
from typing import TYPE_CHECKING, Any
|
|
9
|
+
|
|
10
|
+
from src.processing.flink_jobs.checkpointing import configure_checkpointing
|
|
11
|
+
|
|
12
|
+
if TYPE_CHECKING:
|
|
13
|
+
from pyflink.datastream import StreamExecutionEnvironment
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
SESSION_GAP = timedelta(minutes=30)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass
|
|
20
|
+
class _SessionState:
|
|
21
|
+
start_time: datetime
|
|
22
|
+
last_time: datetime
|
|
23
|
+
event_count: int
|
|
24
|
+
total_value: float
|
|
25
|
+
|
|
26
|
+
def to_snapshot(self) -> dict[str, object]:
|
|
27
|
+
return {
|
|
28
|
+
"start_time": self.start_time.isoformat(),
|
|
29
|
+
"last_time": self.last_time.isoformat(),
|
|
30
|
+
"event_count": self.event_count,
|
|
31
|
+
"total_value": self.total_value,
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
@classmethod
|
|
35
|
+
def from_snapshot(cls, snapshot: Mapping[str, object]) -> _SessionState:
|
|
36
|
+
return cls(
|
|
37
|
+
start_time=_parse_timestamp(snapshot["start_time"]),
|
|
38
|
+
last_time=_parse_timestamp(snapshot["last_time"]),
|
|
39
|
+
event_count=int(snapshot["event_count"]),
|
|
40
|
+
total_value=float(snapshot["total_value"]),
|
|
41
|
+
)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _parse_timestamp(value: object) -> datetime:
|
|
45
|
+
if not isinstance(value, str):
|
|
46
|
+
raise TypeError("timestamp must be an ISO-8601 string")
|
|
47
|
+
|
|
48
|
+
parsed = datetime.fromisoformat(value)
|
|
49
|
+
if parsed.tzinfo is None:
|
|
50
|
+
return parsed.replace(tzinfo=UTC)
|
|
51
|
+
return parsed
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _closed_session(user_id: str, state: _SessionState) -> dict[str, object]:
|
|
55
|
+
return {
|
|
56
|
+
"user_id": user_id,
|
|
57
|
+
"session_start": state.start_time.isoformat(),
|
|
58
|
+
"session_end": state.last_time.isoformat(),
|
|
59
|
+
"event_count": state.event_count,
|
|
60
|
+
"total_value": state.total_value,
|
|
61
|
+
"status": "closed",
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
class SessionAggregator:
    """Gap-based sessionizer that keeps one open session per user in memory.

    An event extends the user's open session unless it arrives more than
    ``session_gap`` after the session's last event, in which case the open
    session is emitted as closed and a new one is started from that event.
    """

    def __init__(self, session_gap: timedelta = SESSION_GAP):
        self._session_gap = session_gap
        # Open sessions keyed by stringified user id.
        self._state: dict[str, _SessionState] = {}

    def process_event(self, event: Mapping[str, object]) -> list[dict[str, object]]:
        """Fold one event into the per-user state.

        Args:
            event: mapping with ``user_id`` and ``timestamp`` (ISO-8601 string);
                an optional ``value`` is added to the session total, with a
                missing or falsy value counted as ``0.0``.

        Returns:
            A one-element list holding the closed-session record when this
            event ended the previous session, otherwise an empty list.
        """
        user_id = str(event["user_id"])
        event_time = _parse_timestamp(event["timestamp"])
        value = float(event.get("value", 0.0) or 0.0)

        open_session = self._state.get(user_id)
        within_gap = (
            open_session is not None
            and event_time - open_session.last_time <= self._session_gap
        )
        if within_gap:
            # min/max keep the bounds correct when events arrive out of order.
            open_session.start_time = min(open_session.start_time, event_time)
            open_session.last_time = max(open_session.last_time, event_time)
            open_session.event_count += 1
            open_session.total_value += value
            return []

        # Either the user's first event or the gap was exceeded: emit the old
        # session (if there was one) and start a fresh one from this event.
        emitted: list[dict[str, object]] = []
        if open_session is not None:
            emitted.append(_closed_session(user_id, open_session))
        self._state[user_id] = _SessionState(
            start_time=event_time,
            last_time=event_time,
            event_count=1,
            total_value=value,
        )
        return emitted

    def snapshot(self) -> dict[str, dict[str, object]]:
        """Return a JSON-serializable copy of every open session, keyed by user id."""
        exported: dict[str, dict[str, object]] = {}
        for user_id, state in self._state.items():
            exported[user_id] = state.to_snapshot()
        return exported

    def restore(self, snapshot: Mapping[str, Mapping[str, object]]) -> None:
        """Replace all in-memory state with the sessions described by ``snapshot``."""
        rebuilt: dict[str, _SessionState] = {}
        for user_id, state in snapshot.items():
            rebuilt[str(user_id)] = _SessionState.from_snapshot(state)
        # Assign only after every entry parsed, so a bad snapshot leaves the
        # previous state untouched.
        self._state = rebuilt
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def build_session_pipeline(
    env: StreamExecutionEnvironment,
    source_topic: str,
    sink_topic: str,
) -> Any:
    """Attach a Kafka -> session aggregation -> Kafka pipeline to ``env``.

    Raw JSON events are read from ``source_topic``, keyed by ``user_id`` and
    run through ``SessionAggregator``; each closed session is written to
    ``sink_topic`` as a JSON string. The Kafka brokers come from the
    ``KAFKA_BOOTSTRAP_SERVERS`` environment variable (default
    ``localhost:9092``).

    Returns:
        The stream of serialized closed-session records (already connected
        to the sink).

    Raises:
        RuntimeError: if PyFlink cannot be imported.
    """
    try:
        from pyflink.common import Types
        from pyflink.common.serialization import SimpleStringSchema
        from pyflink.common.watermark_strategy import WatermarkStrategy
        from pyflink.datastream.connectors.kafka import (
            KafkaOffsetsInitializer,
            KafkaRecordSerializationSchema,
            KafkaSink,
            KafkaSource,
        )
        from pyflink.datastream.functions import KeyedProcessFunction
        from pyflink.datastream.state import MapStateDescriptor
    except ModuleNotFoundError as exc:
        raise RuntimeError(
            "PyFlink is not installed. Install the project with the 'flink' extra."
        ) from exc

    class FlinkSessionAggregator(KeyedProcessFunction):
        """Keyed operator that persists each user's session as JSON in map state."""

        def open(self, runtime_context):
            state_descriptor = MapStateDescriptor(
                "session_state",
                Types.STRING(),
                Types.STRING(),
            )
            self.state = runtime_context.get_map_state(state_descriptor)

        def process_element(self, raw_event, ctx):
            event = json.loads(raw_event)
            user_id = str(event["user_id"])

            # A fresh aggregator per element; prior state is loaded from Flink.
            aggregator = SessionAggregator()
            if self.state.contains(user_id):
                stored = json.loads(self.state.get(user_id))
                aggregator.restore({user_id: stored})

            closed_sessions = aggregator.process_event(event)
            yield from (json.dumps(closed) for closed in closed_sessions)

            # process_event always leaves an open session for this user.
            self.state.put(user_id, json.dumps(aggregator.snapshot()[user_id]))

    def _user_key(raw):
        return json.loads(raw)["user_id"]

    configure_checkpointing(env)

    kafka_servers = os.getenv("KAFKA_BOOTSTRAP_SERVERS", "localhost:9092")

    kafka_source = (
        KafkaSource.builder()
        .set_bootstrap_servers(kafka_servers)
        .set_topics(source_topic)
        .set_group_id("agentflow-session-aggregation")
        .set_starting_offsets(KafkaOffsetsInitializer.earliest())
        .set_value_only_deserializer(SimpleStringSchema())
        .build()
    )

    raw_events = env.from_source(
        kafka_source,
        WatermarkStrategy.for_monotonous_timestamps(),
        "Session Aggregation Source",
    )
    keyed_events = raw_events.key_by(_user_key)
    sessions = keyed_events.process(FlinkSessionAggregator(), output_type=Types.STRING())

    record_serializer = (
        KafkaRecordSerializationSchema.builder()
        .set_topic(sink_topic)
        .set_value_serialization_schema(SimpleStringSchema())
        .build()
    )
    kafka_sink = (
        KafkaSink.builder()
        .set_bootstrap_servers(kafka_servers)
        .set_record_serializer(record_serializer)
        .build()
    )

    sessions.sink_to(kafka_sink)
    return sessions
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def main() -> None:
    """Build and run the session aggregation job.

    Topics are read from ``FLINK_SOURCE_TOPIC`` (default ``events.validated``)
    and ``FLINK_SESSION_SINK_TOPIC`` (default ``sessions.aggregated``).

    Raises:
        RuntimeError: if PyFlink cannot be imported.
    """
    try:
        from pyflink.datastream import StreamExecutionEnvironment
    except ModuleNotFoundError as exc:
        raise RuntimeError(
            "PyFlink is not installed. Install the project with the 'flink' extra."
        ) from exc

    env = StreamExecutionEnvironment.get_execution_environment()

    source_topic = os.getenv("FLINK_SOURCE_TOPIC", "events.validated")
    sink_topic = os.getenv("FLINK_SESSION_SINK_TOPIC", "sessions.aggregated")
    build_session_pipeline(env=env, source_topic=source_topic, sink_topic=sink_topic)

    env.execute("agentflow-session-aggregation")
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
if __name__ == "__main__":
|
|
212
|
+
main()
|
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
"""Session aggregation Flink job: builds user sessions from clickstream events.
|
|
2
|
+
|
|
3
|
+
Groups clickstream events into sessions using a 30-minute gap-based window.
|
|
4
|
+
Outputs session summaries with: duration, page count, conversion signals, funnel stage.
|
|
5
|
+
|
|
6
|
+
Submit with:
|
|
7
|
+
flink run -py session_aggregator.py
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import json
|
|
11
|
+
import os
|
|
12
|
+
from datetime import UTC, datetime, timedelta
|
|
13
|
+
|
|
14
|
+
from pyflink.common import Types, WatermarkStrategy
|
|
15
|
+
from pyflink.common.serialization import SimpleStringSchema
|
|
16
|
+
from pyflink.common.watermark_strategy import TimestampAssigner
|
|
17
|
+
from pyflink.datastream import StreamExecutionEnvironment
|
|
18
|
+
from pyflink.datastream.connectors.kafka import (
|
|
19
|
+
KafkaOffsetsInitializer,
|
|
20
|
+
KafkaRecordSerializationSchema,
|
|
21
|
+
KafkaSink,
|
|
22
|
+
KafkaSource,
|
|
23
|
+
)
|
|
24
|
+
from pyflink.datastream.functions import KeyedProcessFunction
|
|
25
|
+
from pyflink.datastream.state import ValueStateDescriptor
|
|
26
|
+
|
|
27
|
+
SESSION_GAP_MINUTES = 30
|
|
28
|
+
SESSION_GAP_MS = SESSION_GAP_MINUTES * 60 * 1000
|
|
29
|
+
WATERMARK_OUT_OF_ORDERNESS_SECONDS = 10
|
|
30
|
+
CHECKPOINT_INTERVAL_MS = 30_000
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class ClickTimestampAssigner(TimestampAssigner):
    """Derives the event-time timestamp from the ``timestamp`` field of a JSON click."""

    def extract_timestamp(self, value, record_timestamp):
        """Return the event's timestamp in epoch milliseconds.

        Args:
            value: raw JSON string of the click event.
            record_timestamp: timestamp already attached to the record; used
                as the fallback.

        Returns:
            The parsed ``timestamp`` field as epoch milliseconds (naive values
            are treated as UTC), or ``record_timestamp`` when the payload is
            not valid JSON, has no ``timestamp`` field, or the field cannot be
            parsed.
        """
        try:
            event = json.loads(value)
            ts = datetime.fromisoformat(event["timestamp"])
            if ts.tzinfo is None:
                ts = ts.replace(tzinfo=UTC)
            return int(ts.timestamp() * 1000)
        except (json.JSONDecodeError, KeyError, ValueError, TypeError):
            # TypeError covers a non-string ``timestamp`` (number, null) and a
            # JSON payload that is not an object; both are malformed input and
            # get the same fallback as the other parse failures.
            return record_timestamp
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class SessionWindowFunction(KeyedProcessFunction):
    """Accumulates clickstream events into sessions using event-time timers.

    State per key:
    - session_data: JSON with accumulated pages, first/last event time, event count
    - timer_ts: timestamp of the currently registered gap-expiry timer

    Every event re-registers the timer at ``last event time + SESSION_GAP_MS``.
    When the timer fires (no newer event within the gap), the session summary
    is emitted and the state is cleared.
    """

    def open(self, runtime_context):
        """Acquire the two value states used by this operator."""
        self.session_state = runtime_context.get_state(
            ValueStateDescriptor("session_data", Types.STRING())
        )
        self.timer_state = runtime_context.get_state(ValueStateDescriptor("timer_ts", Types.LONG()))

    def process_element(self, value, ctx: KeyedProcessFunction.Context):
        """Fold one raw JSON click into the session and reset the gap timer."""
        event = json.loads(value)
        event_ts = ctx.timestamp()

        current = self.session_state.value()
        if current:
            session = json.loads(current)
        else:
            session = {
                "session_id": event.get("session_id", ctx.get_current_key()),
                "user_id": event.get("user_id"),
                "first_event_ts": event_ts,
                "last_event_ts": event_ts,
                "event_count": 0,
                "pages": [],
                "has_add_to_cart": False,
                "has_checkout": False,
                "product_ids_viewed": [],
            }

        # Events may arrive out of order, so widen the bounds with min/max
        # instead of overwriting: a late event must not move last_event_ts
        # backwards, and an early one must be able to lower first_event_ts.
        session["first_event_ts"] = min(session["first_event_ts"], event_ts)
        session["last_event_ts"] = max(session["last_event_ts"], event_ts)
        session["event_count"] += 1

        # ``or ""`` also covers an explicit JSON null, which would otherwise
        # break the substring check below.
        page = event.get("page_url") or ""
        if page and page not in session["pages"]:
            session["pages"].append(page)

        if event.get("event_type") == "add_to_cart":
            session["has_add_to_cart"] = True
        if "/checkout" in page:
            session["has_checkout"] = True

        pid = event.get("product_id")
        if pid and pid not in session["product_ids_viewed"]:
            session["product_ids_viewed"].append(pid)

        self.session_state.update(json.dumps(session))

        # Reset gap timer. Compare against None so a timer registered at
        # timestamp 0 is still deleted.
        old_timer = self.timer_state.value()
        if old_timer is not None:
            ctx.timer_service().delete_event_time_timer(old_timer)

        # Anchor the timer on the latest event seen, not on this event, so a
        # late event cannot shorten the session.
        new_timer = session["last_event_ts"] + SESSION_GAP_MS
        ctx.timer_service().register_event_time_timer(new_timer)
        self.timer_state.update(new_timer)

    def on_timer(self, timestamp, ctx: KeyedProcessFunction.OnTimerContext):
        """Session gap expired — emit session summary and clear the state."""
        current = self.session_state.value()
        if not current:
            return

        session = json.loads(current)
        duration_ms = session["last_event_ts"] - session["first_event_ts"]

        # Determine funnel stage: the deepest stage reached wins.
        if session["has_checkout"]:
            funnel_stage = "checkout"
        elif session["has_add_to_cart"]:
            funnel_stage = "add_to_cart"
        elif session["product_ids_viewed"]:
            funnel_stage = "product_view"
        elif session["event_count"] > 1:
            funnel_stage = "browse"
        else:
            funnel_stage = "bounce"

        summary = {
            "session_id": session["session_id"],
            "user_id": session["user_id"],
            "started_at": datetime.fromtimestamp(
                session["first_event_ts"] / 1000, tz=UTC
            ).isoformat(),
            "ended_at": datetime.fromtimestamp(session["last_event_ts"] / 1000, tz=UTC).isoformat(),
            "duration_seconds": duration_ms / 1000,
            "event_count": session["event_count"],
            "unique_pages": len(session["pages"]),
            "products_viewed": len(session["product_ids_viewed"]),
            "funnel_stage": funnel_stage,
            "is_conversion": session["has_checkout"],
        }

        # Emit
        yield json.dumps(summary)

        # Clear state
        self.session_state.clear()
        self.timer_state.clear()
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def build_pipeline():
    """Assemble the clickstream sessionization job and return its environment.

    Reads raw JSON clicks from the ``clicks.raw`` Kafka topic, keys them by
    ``session_id`` (``"unknown"`` when absent), runs them through
    ``SessionWindowFunction`` and writes the summaries to
    ``sessions.aggregated``. Parallelism comes from ``FLINK_PARALLELISM``
    (default 2) and the brokers from ``KAFKA_BOOTSTRAP_SERVERS`` (default
    ``localhost:9092``).

    Returns:
        The configured ``StreamExecutionEnvironment``; the caller is
        responsible for calling ``execute`` on it.
    """

    def _session_key(raw):
        return json.loads(raw).get("session_id", "unknown")

    env = StreamExecutionEnvironment.get_execution_environment()
    env.enable_checkpointing(CHECKPOINT_INTERVAL_MS)
    parallelism = int(os.getenv("FLINK_PARALLELISM", "2"))
    env.set_parallelism(parallelism)

    kafka_servers = os.getenv("KAFKA_BOOTSTRAP_SERVERS", "localhost:9092")

    clicks_source = (
        KafkaSource.builder()
        .set_bootstrap_servers(kafka_servers)
        .set_topics("clicks.raw")
        .set_group_id("agentflow-session-aggregator")
        .set_starting_offsets(KafkaOffsetsInitializer.earliest())
        .set_value_only_deserializer(SimpleStringSchema())
        .build()
    )

    # Event time comes from the click payload, with bounded lateness allowed.
    max_lateness = timedelta(seconds=WATERMARK_OUT_OF_ORDERNESS_SECONDS)
    watermarks = WatermarkStrategy.for_bounded_out_of_orderness(max_lateness)
    watermarks = watermarks.with_timestamp_assigner(ClickTimestampAssigner())

    clicks = env.from_source(clicks_source, watermarks, "clicks-source")

    keyed_clicks = clicks.key_by(_session_key)
    summaries = keyed_clicks.process(SessionWindowFunction(), output_type=Types.STRING())

    summary_serializer = (
        KafkaRecordSerializationSchema.builder()
        .set_topic("sessions.aggregated")
        .set_value_serialization_schema(SimpleStringSchema())
        .build()
    )
    summaries_sink = (
        KafkaSink.builder()
        .set_bootstrap_servers(kafka_servers)
        .set_record_serializer(summary_serializer)
        .build()
    )

    summaries.sink_to(summaries_sink)

    return env
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
if __name__ == "__main__":
|
|
198
|
+
pipeline = build_pipeline()
|
|
199
|
+
pipeline.execute("agentflow-session-aggregator")
|