datafun-streaming 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,211 @@
1
+ """src/datafun_streaming/kafka/kafka_admin_utils.py.
2
+
3
+ Kafka topic management helpers for streaming examples.
4
+ """
5
+
6
+ # === IMPORTS ===
7
+
8
+ import time
9
+
10
+ from confluent_kafka import Consumer, TopicPartition
11
+ from confluent_kafka.admin import AdminClient
12
+ from confluent_kafka.cimpl import KafkaException, NewTopic
13
+
14
+ from datafun_streaming.kafka.errors import kafka_admin_failed_message
15
+ from datafun_streaming.kafka.kafka_settings import KafkaSettings
16
+
17
+ # === EXPORTS ===
18
+
19
+ __all__ = [
20
+ "create_admin_client",
21
+ "create_topic",
22
+ "delete_topic",
23
+ "list_topics",
24
+ "topic_exists",
25
+ "get_topic_message_count",
26
+ ]
27
+
28
+ # === DECLARE CONSTANTS ===
29
+
30
+ # rdkafka's internal broker handshake takes up to ~4 seconds on Windows
31
+ # before the AdminClient is ready to accept calls. These constants
32
+ # control how long we wait and how many times we retry.
33
+ ADMIN_READY_RETRIES: int = 5
34
+ ADMIN_READY_DELAY_SECONDS: float = 2.0
35
+
36
+
37
+ # === DEFINE ADMIN HELPER FUNCTIONS ===
38
+
39
+
40
def create_admin_client(settings: KafkaSettings) -> AdminClient:
    """Build an AdminClient pointed at the configured bootstrap servers."""
    admin_config = {"bootstrap.servers": settings.bootstrap_servers}
    return AdminClient(admin_config)
43
+
44
+
45
def create_topic(
    admin: AdminClient,
    topic: str,
    *,
    num_partitions: int = 1,
    replication_factor: int = 1,
) -> None:
    """Create a Kafka topic, doing nothing if it already exists.

    Arguments:
        admin: An AdminClient instance.
        topic: The topic name to create.
        num_partitions: Number of partitions (default 1 for local dev).
        replication_factor: Replication factor (default 1 for local dev).

    Raises:
        RuntimeError: If topic creation fails.
    """
    if topic_exists(admin, topic):
        return

    spec = NewTopic(
        topic,
        num_partitions=num_partitions,
        replication_factor=replication_factor,
    )

    # create_topics returns a dict of topic name -> future; wait on each.
    for topic_name, pending in admin.create_topics([spec]).items():
        try:
            pending.result()
        except KafkaException as error:
            msg = (
                f"Failed to create topic {topic_name!r}.\n"
                f"Kafka reported: {error}\n\n"
                "Check that Kafka is running and that you have permission to create topics."
            )
            raise RuntimeError(msg) from error
84
+
85
+
86
def delete_topic(admin: AdminClient, topic: str) -> None:
    """Delete a Kafka topic, doing nothing if it does not exist.

    Deleting a topic discards all of its messages; run the producer again
    afterwards to repopulate it.

    Arguments:
        admin: An AdminClient instance.
        topic: The topic name to delete.

    Raises:
        RuntimeError: If topic deletion fails.
    """
    if not topic_exists(admin, topic):
        return

    # delete_topics returns a dict of topic name -> future; wait on each.
    for topic_name, pending in admin.delete_topics([topic]).items():
        try:
            pending.result()
        except KafkaException as error:
            msg = (
                f"Failed to delete topic {topic_name!r}.\n"
                f"Kafka reported: {error}\n\n"
                "Check that Kafka is running and that you have permission to delete topics."
            )
            raise RuntimeError(msg) from error
114
+
115
+
116
def list_topics(admin: AdminClient) -> list[str]:
    """Return a sorted list of topic names currently known to Kafka.

    Retries several times because rdkafka's broker handshake can take a few
    seconds (notably on Windows) after the AdminClient is created, which
    makes an immediate metadata call fail with a transport error.

    Arguments:
        admin: An AdminClient instance.

    Returns:
        A sorted list of topic name strings.

    Raises:
        RuntimeError: If Kafka is unreachable after all retries.
    """
    last_error: Exception | None = None

    for attempt in range(ADMIN_READY_RETRIES):
        try:
            metadata = admin.list_topics(timeout=5)
        except KafkaException as error:
            last_error = error
            # Give the broker handshake time before the next attempt.
            if attempt + 1 < ADMIN_READY_RETRIES:
                time.sleep(ADMIN_READY_DELAY_SECONDS)
        else:
            return sorted(metadata.topics.keys())

    msg = kafka_admin_failed_message(
        operation="list_topics",
        topic="(all)",
        detail=(
            f"Kafka did not respond after {ADMIN_READY_RETRIES} attempts.\n"
            f" Last error: {last_error}"
        ),
    )
    raise RuntimeError(msg) from last_error
152
+
153
+
154
def topic_exists(admin: AdminClient, topic: str) -> bool:
    """Return True if the topic already exists in Kafka."""
    known_topics = list_topics(admin)
    return topic in known_topics
157
+
158
+
159
def get_topic_message_count(
    admin: AdminClient, topic: str, settings: KafkaSettings
) -> int:
    """Return the total message count across all partitions of a topic.

    Adds up the high-water minus low-water offset for every partition.
    This reflects the messages available in the topic overall, not the
    unread count for any particular consumer group.

    Arguments:
        admin: An AdminClient instance.
        topic: The topic name to inspect.
        settings: KafkaSettings instance containing configuration.

    Returns:
        Total message count across all partitions, or 0 if the topic
        is missing or empty.

    Raises:
        RuntimeError: If topic metadata cannot be retrieved.
    """
    try:
        metadata = admin.list_topics(topic=topic, timeout=5)
    except KafkaException as error:
        msg = kafka_admin_failed_message(
            operation="list_topics",
            topic=topic,
            detail=str(error),
        )
        raise RuntimeError(msg) from error

    topic_metadata = metadata.topics.get(topic)
    if topic_metadata is None:
        return 0

    # Throwaway consumer used only to read watermark offsets; it never
    # commits, so it does not disturb real consumer groups.
    inspector = Consumer(
        {
            "bootstrap.servers": settings.bootstrap_servers,
            "group.id": "_offset_inspector",
            "enable.auto.commit": "false",
        }
    )

    try:
        per_partition = []
        for partition_id in topic_metadata.partitions:
            partition = TopicPartition(topic, partition_id)
            low, high = inspector.get_watermark_offsets(partition, timeout=5)
            per_partition.append(max(0, high - low))
        return sum(per_partition)
    finally:
        inspector.close()
@@ -0,0 +1,46 @@
1
+ """src/datafun_streaming/kafka/kafka_connection_utils.py.
2
+
3
+ Kafka connection helpers for streaming examples.
4
+ """
5
+
6
+ # === IMPORTS ===
7
+
8
+ import socket
9
+
10
+ from datafun_streaming.kafka.errors import kafka_not_reachable_message
11
+ from datafun_streaming.kafka.kafka_settings import KafkaSettings
12
+
13
+ # === EXPORTS ===
14
+
15
+ __all__ = [
16
+ "verify_kafka_connection",
17
+ ]
18
+
19
+
20
def verify_kafka_connection(settings: KafkaSettings) -> None:
    """Verify that the first Kafka bootstrap server accepts TCP connections.

    Only the first entry of the (comma-separated) bootstrap server list is
    checked. Bracketed IPv6 literals such as "[::1]:9092" are supported.

    Arguments:
        settings: KafkaSettings providing the bootstrap server list.

    Raises:
        ConnectionError: If the address is malformed or the server is not
            reachable within the 5-second timeout.
    """
    bootstrap_server = settings.bootstrap_servers.split(",")[0].strip()

    if ":" not in bootstrap_server:
        msg = (
            "KAFKA_BOOTSTRAP_SERVERS must include host and port, "
            f"but got {bootstrap_server!r}."
        )
        raise ConnectionError(msg)

    host, port_text = bootstrap_server.rsplit(":", 1)
    # socket.create_connection expects a bare IPv6 address, so drop the
    # URL-style brackets from literals like "[::1]".
    host = host.removeprefix("[").removesuffix("]")

    try:
        port = int(port_text)
    except ValueError as error:
        msg = f"KAFKA_BOOTSTRAP_SERVERS has an invalid port. Got {bootstrap_server!r}."
        raise ConnectionError(msg) from error

    try:
        # A successful TCP connect is enough; close immediately.
        with socket.create_connection((host, port), timeout=5):
            return
    except OSError as error:
        msg = kafka_not_reachable_message(
            bootstrap_servers=settings.bootstrap_servers,
        )
        raise ConnectionError(msg) from error
@@ -0,0 +1,62 @@
1
+ """src/datafun_streaming/kafka/kafka_consumer_utils.py.
2
+
3
+ Consumer helpers for Kafka messages.
4
+ """
5
+
6
+ # === IMPORTS ===
7
+
8
+ import logging
9
+ from typing import Any
10
+
11
+ from confluent_kafka import Consumer
12
+
13
+ from datafun_streaming.io.io_utils import row_from_json
14
+ from datafun_streaming.kafka.errors import kafka_consume_failed_message
15
+ from datafun_streaming.kafka.kafka_settings import KafkaSettings
16
+
17
+ # === EXPORTS ===
18
+
19
+ __all__ = [
20
+ "create_consumer",
21
+ "consume_kafka_message",
22
+ ]
23
+
24
+ # === DEFINE HELPER FUNCTIONS ===
25
+
26
+
27
def create_consumer(settings: KafkaSettings) -> Consumer:
    """Create a Kafka consumer wired to the "rdkafka.consumer" logger."""
    consumer_logger = logging.getLogger("rdkafka.consumer")
    return Consumer(settings.consumer_config(), logger=consumer_logger)
33
+
34
+
35
+ def consume_kafka_message(
36
+ *,
37
+ consumer: Any,
38
+ timeout_seconds: float,
39
+ ) -> dict[str, Any] | None:
40
+ """Consume one Kafka message and return it as a row dictionary."""
41
+ message = consumer.poll(timeout_seconds)
42
+
43
+ if message is None:
44
+ return None
45
+
46
+ if message.error():
47
+ msg = kafka_consume_failed_message(detail=str(message.error()))
48
+ raise RuntimeError(msg)
49
+
50
+ raw_value = message.value()
51
+
52
+ if raw_value is None:
53
+ return None
54
+
55
+ row = row_from_json(raw_value.decode("utf-8"))
56
+
57
+ raw_key = message.key()
58
+ row["_kafka_key"] = raw_key.decode("utf-8") if raw_key else ""
59
+ row["_kafka_partition"] = message.partition()
60
+ row["_kafka_offset"] = message.offset()
61
+
62
+ return row
@@ -0,0 +1,96 @@
1
+ """src/datafun_streaming/kafka/kafka_producer_utils.py.
2
+
3
+ Kafka producer helpers for streaming examples.
4
+ """
5
+
6
+ # === IMPORTS ===
7
+
8
+ import logging
9
+ from typing import Any
10
+
11
+ from confluent_kafka import Producer
12
+
13
+ from datafun_streaming.io.io_utils import row_to_json
14
+ from datafun_streaming.kafka.errors import (
15
+ kafka_delivery_failed_message,
16
+ )
17
+ from datafun_streaming.kafka.kafka_settings import KafkaSettings
18
+
19
+ # === EXPORTS ===
20
+
21
+ __all__ = [
22
+ "create_producer",
23
+ "produce_kafka_message",
24
+ ]
25
+
26
+ # === DEFINE HELPER FUNCTIONS ===
27
+
28
+
29
def create_producer(settings: KafkaSettings) -> Producer:
    """Create a Kafka producer.

    Arguments:
        settings: KafkaSettings object with producer configuration.

    Returns:
        A confluent_kafka.Producer wired to the "rdkafka.producer" logger.
    """
    producer_logger = logging.getLogger("rdkafka.producer")
    return Producer(settings.producer_config(), logger=producer_logger)
42
+
43
+
44
def produce_kafka_message(
    *,
    producer: Producer,
    topic: str,
    key: str,
    message: dict[str, Any],
) -> None:
    """Serialize one dictionary to JSON and produce it to Kafka.

    All arguments are keyword-only. A delivery callback collects any
    errors reported by librdkafka, and the producer is flushed so that
    failures surface synchronously.

    Arguments:
        producer: A confluent_kafka.Producer instance.
        topic: The Kafka topic to produce to.
        key: The Kafka message key (UTF-8 encoded).
        message: The message dictionary to produce.

    Returns:
        None

    Raises:
        RuntimeError: If Kafka reports a delivery failure or messages
            remain undelivered after the flush timeout.
    """
    failures: list[str] = []

    def _on_delivery(error: Any, _delivered: Any) -> None:
        """Collect any delivery error reported by librdkafka."""
        if error is not None:
            failures.append(str(error))

    payload = row_to_json(message).encode("utf-8")
    producer.produce(
        topic=topic,
        key=key.encode("utf-8"),
        value=payload,
        callback=_on_delivery,
    )

    # Serve delivery callbacks, then block until the queue drains.
    producer.poll(0)
    undelivered = producer.flush(timeout=10)

    if undelivered > 0:
        detail = f"{undelivered} Kafka message(s) were not delivered before timeout."
        msg = kafka_delivery_failed_message(detail=detail)
        raise RuntimeError(msg)

    if failures:
        detail = "; ".join(failures)
        msg = kafka_delivery_failed_message(detail=detail)
        raise RuntimeError(msg)
@@ -0,0 +1,79 @@
1
+ """src/datafun_streaming/kafka/kafka_settings.py.
2
+
3
+ Kafka settings for producer and consumer examples.
4
+ """
5
+
6
+ # === IMPORTS ===
7
+
8
+ from dataclasses import dataclass
9
+ import os
10
+ from typing import Self
11
+
12
+ from dotenv import load_dotenv
13
+
14
+ # === EXPORTS ===
15
+
16
+ __all__ = [
17
+ "KafkaSettings",
18
+ "DEFAULT_AUTO_OFFSET_RESET",
19
+ "DEFAULT_BOOTSTRAP_SERVERS",
20
+ "DEFAULT_GROUP_ID",
21
+ "DEFAULT_TOPIC",
22
+ ]
23
+
24
+ # === DECLARE DEFAULTS ===
25
+
26
+ DEFAULT_BOOTSTRAP_SERVERS = "localhost:9092"
27
+ DEFAULT_TOPIC = "product-sales-case"
28
+ DEFAULT_GROUP_ID = "streaming-consumer-group-A"
29
+ DEFAULT_AUTO_OFFSET_RESET = "earliest"
30
+
31
+
32
+ # === DECLARE A FROZEN (IMMUTABLE) DATA CLASS FOR KAFKA SETTINGS ===
33
+
34
+
35
+ @dataclass(frozen=True)
36
+ class KafkaSettings:
37
+ """Kafka settings for producer and consumer examples."""
38
+
39
+ bootstrap_servers: str
40
+ topic: str
41
+ group_id: str
42
+ auto_offset_reset: str
43
+
44
+ @classmethod
45
+ def from_env(cls) -> Self:
46
+ """Create Kafka settings from environment variables."""
47
+ load_dotenv()
48
+
49
+ return cls(
50
+ bootstrap_servers=os.getenv(
51
+ "KAFKA_BOOTSTRAP_SERVERS",
52
+ DEFAULT_BOOTSTRAP_SERVERS,
53
+ ),
54
+ topic=os.getenv("KAFKA_TOPIC", DEFAULT_TOPIC),
55
+ group_id=os.getenv("KAFKA_GROUP_ID", DEFAULT_GROUP_ID),
56
+ auto_offset_reset=os.getenv(
57
+ "KAFKA_AUTO_OFFSET_RESET",
58
+ DEFAULT_AUTO_OFFSET_RESET,
59
+ ),
60
+ )
61
+
62
+ def producer_config(self) -> dict[str, str]:
63
+ """Return a confluent-kafka producer configuration."""
64
+ return {
65
+ "bootstrap.servers": self.bootstrap_servers,
66
+ "log_level": "3",
67
+ "message.timeout.ms": "5000",
68
+ "socket.timeout.ms": "5000",
69
+ "request.timeout.ms": "5000",
70
+ }
71
+
72
+ def consumer_config(self) -> dict[str, str]:
73
+ """Return a confluent-kafka consumer configuration."""
74
+ return {
75
+ "bootstrap.servers": self.bootstrap_servers,
76
+ "log_level": "3",
77
+ "group.id": self.group_id,
78
+ "auto.offset.reset": self.auto_offset_reset,
79
+ }
File without changes
@@ -0,0 +1 @@
1
+ """Statistical utilities for streaming data analysis."""
@@ -0,0 +1,110 @@
1
+ """stats/stats_utils.py.
2
+
3
+ Running statistics for streaming data.
4
+
5
+ Provides a RunningStats class that tracks count, sum, mean, min, and max
6
+ for a stream of numeric values without storing the full history.
7
+
8
+ This is domain-agnostic: it works on any numeric field from any message.
9
+ Pass it a value on each message and read the current statistics at any time.
10
+
11
+ Author: Denise Case
12
+ Date: 2026-05
13
+ """
14
+
15
+ # === IMPORTS ===
16
+
17
+ from dataclasses import dataclass
18
+
19
+ # === EXPORTS ===
20
+
21
+ __all__ = [
22
+ "RunningStats",
23
+ ]
24
+
25
+ # === DEFINE RUNNING STATS CLASS ===
26
+
27
+
28
@dataclass
class RunningStats:
    """Incrementally tracks statistics for a stream of numeric values.

    Each call to update() folds one value into the running count, sum,
    mean, minimum, and maximum without storing any history, so it is
    safe to use inside a message-processing loop.

    The attributes are named minimum and maximum (not min and max) to
    avoid shadowing the built-in functions.

    Attributes:
        count: Number of values received so far.
        total: Running sum of all values.
        mean: Running mean of all values.
        minimum: Minimum value seen so far.
        maximum: Maximum value seen so far.

    Example:
        stats = RunningStats()
        for message in messages:
            stats.update(message["total"])
            print(f"count={stats.count} mean={stats.mean:.2f}")
    """

    count: int = 0
    total: float = 0.0
    mean: float = 0.0
    minimum: float = float("inf")
    maximum: float = float("-inf")

    def update(self, value: float) -> None:
        """Fold one new value into the running statistics.

        Arguments:
            value: The new numeric value to include.

        Returns:
            None.
        """
        self.count += 1
        self.total += value
        self.mean = self.total / self.count
        self.minimum = min(self.minimum, value)
        self.maximum = max(self.maximum, value)

    def reset(self) -> None:
        """Restore every statistic to its freshly-constructed state.

        Use this to start a new window or clear accumulated state.

        Returns:
            None.
        """
        self.count = 0
        self.total = 0.0
        self.mean = 0.0
        self.minimum = float("inf")
        self.maximum = float("-inf")

    @property
    def is_empty(self) -> bool:
        """Return True while no values have been received."""
        return not self.count

    def summary(self) -> str:
        """Return a single-line formatted report of the current statistics.

        Returns:
            A one-line string with all current statistics, or a placeholder
            when no values have been received.
        """
        if self.is_empty:
            return "RunningStats: no values received yet."
        parts = [
            f"count={self.count}",
            f"total={self.total:,.2f}",
            f"mean={self.mean:,.2f}",
            f"minimum={self.minimum:,.2f}",
            f"maximum={self.maximum:,.2f}",
        ]
        return " ".join(parts)
@@ -0,0 +1 @@
1
+ """Persistence backends for streaming data."""