datafun_streaming-0.1.0-py3-none-any.whl
This diff shows the content of a publicly available package version as released to one of the supported registries. It is provided for informational purposes only and reflects the package as it appears in that public registry.
- datafun_streaming/__init__.py +1 -0
- datafun_streaming/_version.py +24 -0
- datafun_streaming/core/__init__.py +1 -0
- datafun_streaming/core/types.py +16 -0
- datafun_streaming/data_validation/__init__.py +1 -0
- datafun_streaming/data_validation/errors.py +24 -0
- datafun_streaming/data_validation/reference.py +63 -0
- datafun_streaming/data_validation/types.py +42 -0
- datafun_streaming/data_validation/validation_utils.py +143 -0
- datafun_streaming/io/__init__.py +1 -0
- datafun_streaming/io/errors.py +50 -0
- datafun_streaming/io/io_utils.py +109 -0
- datafun_streaming/kafka/__init__.py +1 -0
- datafun_streaming/kafka/errors.py +150 -0
- datafun_streaming/kafka/kafka_admin_utils.py +211 -0
- datafun_streaming/kafka/kafka_connection_utils.py +46 -0
- datafun_streaming/kafka/kafka_consumer_utils.py +62 -0
- datafun_streaming/kafka/kafka_producer_utils.py +96 -0
- datafun_streaming/kafka/kafka_settings.py +79 -0
- datafun_streaming/py.typed +0 -0
- datafun_streaming/stats/__init__.py +1 -0
- datafun_streaming/stats/stats_utils.py +110 -0
- datafun_streaming/storage/__init__.py +1 -0
- datafun_streaming/storage/duckdb_utils.py +244 -0
- datafun_streaming/visualization/__init__.py +1 -0
- datafun_streaming/visualization/chart_utils.py +150 -0
- datafun_streaming-0.1.0.dist-info/METADATA +168 -0
- datafun_streaming-0.1.0.dist-info/RECORD +30 -0
- datafun_streaming-0.1.0.dist-info/WHEEL +4 -0
- datafun_streaming-0.1.0.dist-info/licenses/LICENSE +21 -0
datafun_streaming/kafka/kafka_admin_utils.py
@@ -0,0 +1,211 @@
+"""src/datafun_streaming/kafka/kafka_admin_utils.py.
+
+Kafka topic management helpers for streaming examples.
+"""
+
+# === IMPORTS ===
+
+import time
+
+from confluent_kafka import Consumer, TopicPartition
+from confluent_kafka.admin import AdminClient
+from confluent_kafka.cimpl import KafkaException, NewTopic
+
+from datafun_streaming.kafka.errors import kafka_admin_failed_message
+from datafun_streaming.kafka.kafka_settings import KafkaSettings
+
+# === EXPORTS ===
+
+__all__ = [
+    "create_admin_client",
+    "create_topic",
+    "delete_topic",
+    "list_topics",
+    "topic_exists",
+    "get_topic_message_count",
+]
+
+# === DECLARE CONSTANTS ===
+
+# rdkafka's internal broker handshake takes up to ~4 seconds on Windows
+# before the AdminClient is ready to accept calls. These constants
+# control how long we wait and how many times we retry.
+ADMIN_READY_RETRIES: int = 5
+ADMIN_READY_DELAY_SECONDS: float = 2.0
+
+
+# === DEFINE ADMIN HELPER FUNCTIONS ===
+
+
+def create_admin_client(settings: KafkaSettings) -> AdminClient:
+    """Create a Kafka AdminClient."""
+    return AdminClient({"bootstrap.servers": settings.bootstrap_servers})
+
+
+def create_topic(
+    admin: AdminClient,
+    topic: str,
+    *,
+    num_partitions: int = 1,
+    replication_factor: int = 1,
+) -> None:
+    """Create a Kafka topic if it does not already exist.
+
+    Arguments:
+        admin: An AdminClient instance.
+        topic: The topic name to create.
+        num_partitions: Number of partitions (default 1 for local dev).
+        replication_factor: Replication factor (default 1 for local dev).
+
+    Raises:
+        RuntimeError: If topic creation fails.
+    """
+    if topic_exists(admin, topic):
+        return
+
+    new_topic = NewTopic(
+        topic,
+        num_partitions=num_partitions,
+        replication_factor=replication_factor,
+    )
+
+    futures = admin.create_topics([new_topic])
+
+    for topic_name, future in futures.items():
+        try:
+            future.result()
+        except KafkaException as error:
+            msg = (
+                f"Failed to create topic {topic_name!r}.\n"
+                f"Kafka reported: {error}\n\n"
+                "Check that Kafka is running and that you have permission to create topics."
+            )
+            raise RuntimeError(msg) from error
+
+
+def delete_topic(admin: AdminClient, topic: str) -> None:
+    """Delete a Kafka topic if it exists.
+
+    Deleting a topic removes all its messages. Run the producer again
+    after deleting to repopulate the topic.
+
+    Arguments:
+        admin: An AdminClient instance.
+        topic: The topic name to delete.
+
+    Raises:
+        RuntimeError: If topic deletion fails.
+    """
+    if not topic_exists(admin, topic):
+        return
+
+    futures = admin.delete_topics([topic])
+
+    for topic_name, future in futures.items():
+        try:
+            future.result()
+        except KafkaException as error:
+            msg = (
+                f"Failed to delete topic {topic_name!r}.\n"
+                f"Kafka reported: {error}\n\n"
+                "Check that Kafka is running and that you have permission to delete topics."
+            )
+            raise RuntimeError(msg) from error
+
+
+def list_topics(admin: AdminClient) -> list[str]:
+    """Return a sorted list of topic names currently in Kafka.
+
+    Retries several times to allow rdkafka's broker handshake to complete.
+    On Windows, the handshake can take up to ~4 seconds after the AdminClient
+    is created, which causes an immediate call to fail with a transport error.
+
+    Arguments:
+        admin: An AdminClient instance.
+
+    Returns:
+        A sorted list of topic name strings.
+
+    Raises:
+        RuntimeError: If Kafka is unreachable after all retries.
+    """
+    last_error: Exception | None = None
+
+    for attempt in range(1, ADMIN_READY_RETRIES + 1):
+        try:
+            metadata = admin.list_topics(timeout=5)
+            return sorted(metadata.topics.keys())
+        except KafkaException as error:
+            last_error = error
+            if attempt < ADMIN_READY_RETRIES:
+                time.sleep(ADMIN_READY_DELAY_SECONDS)
+
+    msg = kafka_admin_failed_message(
+        operation="list_topics",
+        topic="(all)",
+        detail=(
+            f"Kafka did not respond after {ADMIN_READY_RETRIES} attempts.\n"
+            f" Last error: {last_error}"
+        ),
+    )
+    raise RuntimeError(msg) from last_error
+
+
+def topic_exists(admin: AdminClient, topic: str) -> bool:
+    """Return True if the topic already exists in Kafka."""
+    return topic in list_topics(admin)
+
+
+def get_topic_message_count(
+    admin: AdminClient, topic: str, settings: KafkaSettings
+) -> int:
+    """Return the total number of messages available in a topic.
+
+    Sums the span between the low and high watermark offsets across
+    all partitions. This reflects the messages currently retained,
+    not the number of unread messages for a specific consumer group.
+
+    Arguments:
+        admin: An AdminClient instance.
+        topic: The topic name to inspect.
+        settings: KafkaSettings instance containing configuration.
+
+    Returns:
+        Total message count across all partitions, or 0 if the topic is empty.
+
+    Raises:
+        RuntimeError: If topic metadata cannot be retrieved.
+    """
+    try:
+        metadata = admin.list_topics(topic=topic, timeout=5)
+    except KafkaException as error:
+        msg = kafka_admin_failed_message(
+            operation="list_topics",
+            topic=topic,
+            detail=str(error),
+        )
+        raise RuntimeError(msg) from error
+
+    topic_metadata = metadata.topics.get(topic)
+    if topic_metadata is None:
+        return 0
+
+    bootstrap_servers = settings.bootstrap_servers
+    temp_consumer = Consumer(
+        {
+            "bootstrap.servers": bootstrap_servers,
+            "group.id": "_offset_inspector",
+            "enable.auto.commit": "false",
+        }
+    )
+
+    total = 0
+    try:
+        for partition_id in topic_metadata.partitions:
+            tp = TopicPartition(topic, partition_id)
+            low, high = temp_consumer.get_watermark_offsets(tp, timeout=5)
+            total += max(0, high - low)
+    finally:
+        temp_consumer.close()
+
+    return total
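Taken together, the admin helpers cover the whole topic lifecycle: create, list, check existence, count messages, and delete. A minimal usage sketch, not shipped in the package; the topic name "demo-topic" is illustrative:

    from datafun_streaming.kafka.kafka_admin_utils import (
        create_admin_client,
        create_topic,
        get_topic_message_count,
        list_topics,
    )
    from datafun_streaming.kafka.kafka_settings import KafkaSettings

    settings = KafkaSettings.from_env()
    admin = create_admin_client(settings)

    create_topic(admin, "demo-topic")      # no-op if the topic already exists
    print(list_topics(admin))              # sorted topic names
    print(get_topic_message_count(admin, "demo-topic", settings))

Note that topic_exists calls list_topics, so every create_topic and delete_topic call pays the retry-guarded metadata round trip before touching the topic.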
datafun_streaming/kafka/kafka_connection_utils.py
@@ -0,0 +1,46 @@
+"""src/datafun_streaming/kafka/kafka_connection_utils.py.
+
+Kafka connection helpers for streaming examples.
+"""
+
+# === IMPORTS ===
+
+import socket
+
+from datafun_streaming.kafka.errors import kafka_not_reachable_message
+from datafun_streaming.kafka.kafka_settings import KafkaSettings
+
+# === EXPORTS ===
+
+__all__ = [
+    "verify_kafka_connection",
+]
+
+
+def verify_kafka_connection(settings: KafkaSettings) -> None:
+    """Verify that the Kafka bootstrap server is reachable."""
+    bootstrap_server = settings.bootstrap_servers.split(",")[0].strip()
+
+    if ":" not in bootstrap_server:
+        msg = (
+            "KAFKA_BOOTSTRAP_SERVERS must include host and port, "
+            f"but got {bootstrap_server!r}."
+        )
+        raise ConnectionError(msg)
+
+    host, port_text = bootstrap_server.rsplit(":", 1)
+
+    try:
+        port = int(port_text)
+    except ValueError as error:
+        msg = f"KAFKA_BOOTSTRAP_SERVERS has an invalid port. Got {bootstrap_server!r}."
+        raise ConnectionError(msg) from error
+
+    try:
+        with socket.create_connection((host, port), timeout=5):
+            return
+    except OSError as error:
+        msg = kafka_not_reachable_message(
+            bootstrap_servers=settings.bootstrap_servers,
+        )
+        raise ConnectionError(msg) from error
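verify_kafka_connection is a cheap TCP-level preflight: it only checks that the first bootstrap host accepts a socket connection, not that a working broker is listening behind it. A minimal sketch, assuming Kafka on the default localhost:9092:

    from datafun_streaming.kafka.kafka_connection_utils import verify_kafka_connection
    from datafun_streaming.kafka.kafka_settings import KafkaSettings

    settings = KafkaSettings.from_env()
    try:
        verify_kafka_connection(settings)   # raises ConnectionError on failure
    except ConnectionError as error:
        raise SystemExit(f"Kafka preflight failed: {error}") from error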
datafun_streaming/kafka/kafka_consumer_utils.py
@@ -0,0 +1,62 @@
+"""src/datafun_streaming/kafka/kafka_consumer_utils.py.
+
+Consumer helpers for Kafka messages.
+"""
+
+# === IMPORTS ===
+
+import logging
+from typing import Any
+
+from confluent_kafka import Consumer
+
+from datafun_streaming.io.io_utils import row_from_json
+from datafun_streaming.kafka.errors import kafka_consume_failed_message
+from datafun_streaming.kafka.kafka_settings import KafkaSettings
+
+# === EXPORTS ===
+
+__all__ = [
+    "create_consumer",
+    "consume_kafka_message",
+]
+
+# === DEFINE HELPER FUNCTIONS ===
+
+
+def create_consumer(settings: KafkaSettings) -> Consumer:
+    """Create a Kafka consumer."""
+    return Consumer(
+        settings.consumer_config(),
+        logger=logging.getLogger("rdkafka.consumer"),
+    )
+
+
+def consume_kafka_message(
+    *,
+    consumer: Any,
+    timeout_seconds: float,
+) -> dict[str, Any] | None:
+    """Consume one Kafka message and return it as a row dictionary."""
+    message = consumer.poll(timeout_seconds)
+
+    if message is None:
+        return None
+
+    if message.error():
+        msg = kafka_consume_failed_message(detail=str(message.error()))
+        raise RuntimeError(msg)
+
+    raw_value = message.value()
+
+    if raw_value is None:
+        return None
+
+    row = row_from_json(raw_value.decode("utf-8"))
+
+    raw_key = message.key()
+    row["_kafka_key"] = raw_key.decode("utf-8") if raw_key else ""
+    row["_kafka_partition"] = message.partition()
+    row["_kafka_offset"] = message.offset()
+
+    return row
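consume_kafka_message polls for at most one message per call, so the caller owns the loop. A minimal consumer-loop sketch, not part of the package; stop it with Ctrl+C:

    from datafun_streaming.kafka.kafka_consumer_utils import (
        consume_kafka_message,
        create_consumer,
    )
    from datafun_streaming.kafka.kafka_settings import KafkaSettings

    settings = KafkaSettings.from_env()
    consumer = create_consumer(settings)
    consumer.subscribe([settings.topic])
    try:
        while True:
            row = consume_kafka_message(consumer=consumer, timeout_seconds=1.0)
            if row is None:
                continue                    # no message within the timeout
            print(row["_kafka_offset"], row)
    finally:
        consumer.close()

Each returned row carries the _kafka_key, _kafka_partition, and _kafka_offset fields that the helper adds alongside the decoded JSON payload.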
datafun_streaming/kafka/kafka_producer_utils.py
@@ -0,0 +1,96 @@
+"""src/datafun_streaming/kafka/kafka_producer_utils.py.
+
+Kafka producer helpers for streaming examples.
+"""
+
+# === IMPORTS ===
+
+import logging
+from typing import Any
+
+from confluent_kafka import Producer
+
+from datafun_streaming.io.io_utils import row_to_json
+from datafun_streaming.kafka.errors import (
+    kafka_delivery_failed_message,
+)
+from datafun_streaming.kafka.kafka_settings import KafkaSettings
+
+# === EXPORTS ===
+
+__all__ = [
+    "create_producer",
+    "produce_kafka_message",
+]
+
+# === DEFINE HELPER FUNCTIONS ===
+
+
+def create_producer(settings: KafkaSettings) -> Producer:
+    """Create a Kafka producer.
+
+    Arguments:
+        settings: KafkaSettings object with producer configuration.
+
+    Returns:
+        A confluent_kafka.Producer instance.
+    """
+    return Producer(
+        settings.producer_config(),
+        logger=logging.getLogger("rdkafka.producer"),
+    )
+
+
+def produce_kafka_message(
+    *,
+    producer: Producer,
+    topic: str,
+    key: str,
+    message: dict[str, Any],
+) -> None:
+    """Produce one dictionary message to Kafka as JSON.
+
+    Arguments:
+        *: All arguments after the asterisk must be passed as keyword arguments.
+        producer: A confluent_kafka.Producer instance.
+        topic: The Kafka topic to produce to.
+        key: The Kafka message key.
+        message: The message dictionary to produce.
+
+    Returns:
+        None
+
+    Raises:
+        RuntimeError: If Kafka reports a delivery failure.
+
+    This function encodes the message as JSON and produces it to Kafka with the given key.
+    It uses a delivery callback to check for delivery errors and raises a RuntimeError if any occur.
+
+    """
+    delivery_errors: list[str] = []
+
+    def delivery_report(error: Any, delivered_message: Any) -> None:
+        """Record Kafka delivery failure details."""
+        if error is not None:
+            delivery_errors.append(str(error))
+
+    producer.produce(
+        topic=topic,
+        key=key.encode("utf-8"),
+        value=row_to_json(message).encode("utf-8"),
+        callback=delivery_report,
+    )
+
+    producer.poll(0)
+
+    remaining = producer.flush(timeout=10)
+
+    if remaining > 0:
+        detail = f"{remaining} Kafka message(s) were not delivered before timeout."
+        msg = kafka_delivery_failed_message(detail=detail)
+        raise RuntimeError(msg)
+
+    if delivery_errors:
+        detail = "; ".join(delivery_errors)
+        msg = kafka_delivery_failed_message(detail=detail)
+        raise RuntimeError(msg)
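Because produce_kafka_message flushes after every call, each message is confirmed (or fails loudly) before the function returns; this trades throughput for simple, synchronous error handling, which suits these teaching examples. A minimal sketch; the key and payload are illustrative:

    from datafun_streaming.kafka.kafka_producer_utils import (
        create_producer,
        produce_kafka_message,
    )
    from datafun_streaming.kafka.kafka_settings import KafkaSettings

    settings = KafkaSettings.from_env()
    producer = create_producer(settings)
    produce_kafka_message(
        producer=producer,
        topic=settings.topic,
        key="store-42",                     # partitioning key (illustrative)
        message={"store": "store-42", "total": 19.99},
    )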
datafun_streaming/kafka/kafka_settings.py
@@ -0,0 +1,79 @@
+"""src/datafun_streaming/kafka/kafka_settings.py.
+
+Kafka settings for producer and consumer examples.
+"""
+
+# === IMPORTS ===
+
+from dataclasses import dataclass
+import os
+from typing import Self
+
+from dotenv import load_dotenv
+
+# === EXPORTS ===
+
+__all__ = [
+    "KafkaSettings",
+    "DEFAULT_AUTO_OFFSET_RESET",
+    "DEFAULT_BOOTSTRAP_SERVERS",
+    "DEFAULT_GROUP_ID",
+    "DEFAULT_TOPIC",
+]
+
+# === DECLARE DEFAULTS ===
+
+DEFAULT_BOOTSTRAP_SERVERS = "localhost:9092"
+DEFAULT_TOPIC = "product-sales-case"
+DEFAULT_GROUP_ID = "streaming-consumer-group-A"
+DEFAULT_AUTO_OFFSET_RESET = "earliest"
+
+
+# === DECLARE A FROZEN (IMMUTABLE) DATA CLASS FOR KAFKA SETTINGS ===
+
+
+@dataclass(frozen=True)
+class KafkaSettings:
+    """Kafka settings for producer and consumer examples."""
+
+    bootstrap_servers: str
+    topic: str
+    group_id: str
+    auto_offset_reset: str
+
+    @classmethod
+    def from_env(cls) -> Self:
+        """Create Kafka settings from environment variables."""
+        load_dotenv()
+
+        return cls(
+            bootstrap_servers=os.getenv(
+                "KAFKA_BOOTSTRAP_SERVERS",
+                DEFAULT_BOOTSTRAP_SERVERS,
+            ),
+            topic=os.getenv("KAFKA_TOPIC", DEFAULT_TOPIC),
+            group_id=os.getenv("KAFKA_GROUP_ID", DEFAULT_GROUP_ID),
+            auto_offset_reset=os.getenv(
+                "KAFKA_AUTO_OFFSET_RESET",
+                DEFAULT_AUTO_OFFSET_RESET,
+            ),
+        )
+
+    def producer_config(self) -> dict[str, str]:
+        """Return a confluent-kafka producer configuration."""
+        return {
+            "bootstrap.servers": self.bootstrap_servers,
+            "log_level": "3",
+            "message.timeout.ms": "5000",
+            "socket.timeout.ms": "5000",
+            "request.timeout.ms": "5000",
+        }
+
+    def consumer_config(self) -> dict[str, str]:
+        """Return a confluent-kafka consumer configuration."""
+        return {
+            "bootstrap.servers": self.bootstrap_servers,
+            "log_level": "3",
+            "group.id": self.group_id,
+            "auto.offset.reset": self.auto_offset_reset,
+        }
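All four settings come from environment variables (loaded from a .env file when present), falling back to the declared defaults. A minimal sketch of reading and inspecting the settings:

    from datafun_streaming.kafka.kafka_settings import KafkaSettings

    settings = KafkaSettings.from_env()
    print(settings.bootstrap_servers)   # "localhost:9092" unless KAFKA_BOOTSTRAP_SERVERS is set
    print(settings.producer_config())   # dict passed straight to confluent_kafka.Producer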
datafun_streaming/py.typed
File without changes
datafun_streaming/stats/__init__.py
@@ -0,0 +1 @@
+"""Statistical utilities for streaming data analysis."""
datafun_streaming/stats/stats_utils.py
@@ -0,0 +1,110 @@
+"""stats/stats_utils.py.
+
+Running statistics for streaming data.
+
+Provides a RunningStats class that tracks count, sum, mean, min, and max
+for a stream of numeric values without storing the full history.
+
+This is domain-agnostic: it works on any numeric field from any message.
+Pass it a value on each message and read the current statistics at any time.
+
+Author: Denise Case
+Date: 2026-05
+"""
+
+# === IMPORTS ===
+
+from dataclasses import dataclass
+
+# === EXPORTS ===
+
+__all__ = [
+    "RunningStats",
+]
+
+# === DEFINE RUNNING STATS CLASS ===
+
+
+@dataclass
+class RunningStats:
+    """Accumulates running statistics for a stream of numeric values.
+
+    Updates incrementally (one value at a time) without storing history.
+    Safe to use inside a message processing loop.
+
+    The fields are named minimum and maximum rather than min and max,
+    which would shadow the built-in functions.
+    Access the smallest and largest values seen
+    via the minimum and maximum attributes.
+
+    Attributes:
+        count: Number of values received so far.
+        total: Running sum of all values.
+        mean: Running mean of all values.
+        minimum: Minimum value seen so far.
+        maximum: Maximum value seen so far.
+
+    Example:
+        stats = RunningStats()
+        for message in messages:
+            stats.update(message["total"])
+            print(f"count={stats.count} mean={stats.mean:.2f}")
+    """
+
+    count: int = 0
+    total: float = 0.0
+    mean: float = 0.0
+    minimum: float = float("inf")
+    maximum: float = float("-inf")
+
+    def update(self, value: float) -> None:
+        """Update statistics with one new value.
+
+        Arguments:
+            value: The new numeric value to include.
+
+        Returns:
+            None.
+        """
+        self.count += 1
+        self.total += value
+        self.mean = self.total / self.count
+        if value < self.minimum:
+            self.minimum = value
+        if value > self.maximum:
+            self.maximum = value
+
+    def reset(self) -> None:
+        """Reset all statistics to their initial state.
+
+        Use this to start a new window or clear accumulated state.
+
+        Returns:
+            None.
+        """
+        self.count = 0
+        self.total = 0.0
+        self.mean = 0.0
+        self.minimum = float("inf")
+        self.maximum = float("-inf")
+
+    @property
+    def is_empty(self) -> bool:
+        """Return True if no values have been received yet."""
+        return self.count == 0
+
+    def summary(self) -> str:
+        """Return a formatted summary string for logging.
+
+        Returns:
+            A single-line string with all current statistics.
+        """
+        if self.is_empty:
+            return "RunningStats: no values received yet."
+        return (
+            f"count={self.count} "
+            f"total={self.total:,.2f} "
+            f"mean={self.mean:,.2f} "
+            f"minimum={self.minimum:,.2f} "
+            f"maximum={self.maximum:,.2f}"
+        )
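RunningStats recomputes the mean as total / count on every update, so memory stays constant no matter how long the stream runs. A short worked example, with values chosen for easy arithmetic:

    from datafun_streaming.stats.stats_utils import RunningStats

    stats = RunningStats()
    for value in [10.0, 12.5, 7.5]:
        stats.update(value)

    print(stats.summary())
    # count=3 total=30.00 mean=10.00 minimum=7.50 maximum=12.50

    stats.reset()                       # start a new window
    print(stats.is_empty)               # True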
datafun_streaming/storage/__init__.py
@@ -0,0 +1 @@
+"""Persistence backends for streaming data."""