tkati-core 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tkati_core/consumer.py ADDED
@@ -0,0 +1,268 @@
1
+ """Kafka consumer utilities for reading messages into PyArrow tables."""
2
+
3
+ import time
4
+ from io import BytesIO
5
+ from typing import TYPE_CHECKING
6
+
7
+ import orjson
8
+ import pyarrow as pa
9
+ from confluent_kafka import Consumer
10
+ from loguru import logger
11
+ from pyarrow import json as pa_json
12
+
13
+ if TYPE_CHECKING:
14
+ from tkati_core.settings import KafkaInputSettings
15
+
16
+
17
class KafkaConsumer:
    """
    A Kafka consumer wrapper that reads messages into PyArrow tables or Python lists.

    The constructor creates the underlying ``confluent_kafka.Consumer``,
    subscribes it to ``topic_name`` and derives two PyArrow schemas from
    ``input_schema``: one used while parsing JSON and one the parsed table is
    cast to afterwards. The read methods never commit offsets; call
    :meth:`commit` explicitly.

    Instances can be used as context managers; the consumer is closed on exit.
    """

    # Substrings marking configuration keys whose values must never be logged.
    _SENSITIVE_KEY_MARKERS = ("password", "secret", "key.pem")

    @classmethod
    def from_input_settings(cls, settings: "KafkaInputSettings") -> "KafkaConsumer":
        """
        Construct a KafkaConsumer from a KafkaInputSettings instance.

        Sets enable.auto.commit=False — offsets must be committed explicitly via .commit().
        """
        kafka_config: dict[str, str | bool] = {
            "bootstrap.servers": settings.topic.broker,
            "group.id": settings.consumer.group_id,
            "auto.offset.reset": settings.consumer.auto_offset_reset,
            "enable.auto.commit": False,
        }
        return cls(
            kafka_config=kafka_config,
            topic_name=settings.topic.name,
            input_schema=settings.topic.schema,
        )

    def __init__(
        self,
        kafka_config: dict[str, str | bool],
        topic_name: str,
        input_schema: dict[str, str],
    ) -> None:
        """
        Initialize the Kafka consumer with the provided configuration.

        Args:
            kafka_config: Dictionary of Kafka consumer configuration parameters.
                Common keys include:
                - 'bootstrap.servers': Kafka broker addresses
                - 'group.id': Consumer group ID
                - 'auto.offset.reset': Offset reset behavior
                - 'enable.auto.commit': Whether to auto-commit offsets
            topic_name: Name of the topic to subscribe to.
            input_schema: Mapping of field name to type name. Supported type
                names are 'string', 'int', 'int32', 'int64', 'uint8', 'uint32',
                'uint64' and 'timestamp[ms]'; any other name falls back to
                string (with a warning).
        """
        self.consumer = Consumer(kafka_config)
        self.topic_name = topic_name
        self.input_schema = input_schema

        self.consumer.subscribe([self.topic_name])

        self.parse_schema, self.cast_schema = self._build_schemas(input_schema)

        # Log a redacted copy: the raw config may carry credentials.
        logger.info(
            f"Initialized KafkaConsumer with config: {self._redact_config(kafka_config)} and topic: {topic_name}"
        )

    def __enter__(self) -> "KafkaConsumer":
        """Return the consumer itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Close the consumer when leaving the ``with`` block."""
        self.close()

    @classmethod
    def _redact_config(
        cls, kafka_config: dict[str, str | bool]
    ) -> dict[str, str | bool]:
        """Return a copy of ``kafka_config`` with credential-like values masked."""
        return {
            key: "***"
            if any(marker in key.lower() for marker in cls._SENSITIVE_KEY_MARKERS)
            else value
            for key, value in kafka_config.items()
        }

    @staticmethod
    def _build_schemas(input_schema: dict[str, str]) -> tuple[pa.Schema, pa.Schema]:
        """Build the (parse_schema, cast_schema) pair for ``input_schema``."""
        # type name -> (parse_type, cast_type). Timestamps are parsed as int64
        # and only converted to timestamp("ms") by the cast afterwards.
        type_mapping: dict[str, tuple[pa.DataType, pa.DataType]] = {
            "string": (pa.string(), pa.string()),
            "int32": (pa.int32(), pa.int32()),
            "int64": (pa.int64(), pa.int64()),
            "uint32": (pa.uint32(), pa.uint32()),
            "uint64": (pa.uint64(), pa.uint64()),
            "uint8": (pa.uint8(), pa.uint8()),
            "int": (pa.int32(), pa.int32()),
            "timestamp[ms]": (pa.int64(), pa.timestamp("ms")),
        }

        parse_schema_fields = []
        cast_schema_fields = []

        for field_name, field_type in input_schema.items():
            types = type_mapping.get(field_type)

            if types is None:
                logger.warning(
                    f"Unsupported field type '{field_type}' for field '{field_name}'. Defaulting to string."
                )
                types = (pa.string(), pa.string())

            parse_type, cast_type = types
            parse_schema_fields.append(pa.field(field_name, parse_type))
            cast_schema_fields.append(pa.field(field_name, cast_type))

        return pa.schema(parse_schema_fields), pa.schema(cast_schema_fields)

    def _consume_batch(
        self,
        aggregation_interval_seconds: int,
        max_events_to_aggregate: int,
    ) -> tuple[list, int]:
        """
        Consume raw messages from Kafka within the given time and count limits.

        Returns a tuple of (messages, events_read) where messages is a list of
        confluent_kafka Message objects (without errors).
        """
        if self.topic_name:
            logger.info(
                f"Consuming events from topic(s): {self.topic_name} for up to {aggregation_interval_seconds}s or {max_events_to_aggregate} events"
            )
        else:
            logger.info(
                f"Consuming events for up to {aggregation_interval_seconds}s or {max_events_to_aggregate} events"
            )

        # Monotonic clock: elapsed time must not jump when the wall clock is adjusted.
        start_time = time.monotonic()
        events_read = 0
        poll_timeout = 10
        valid_messages = []

        while events_read < max_events_to_aggregate:
            elapsed = time.monotonic() - start_time
            remaining_time = aggregation_interval_seconds - elapsed

            if remaining_time <= 0:
                logger.info(f"Reached time limit of {aggregation_interval_seconds}s")
                break

            remaining_messages = max_events_to_aggregate - events_read
            # Never block past the overall deadline.
            batch_timeout = min(poll_timeout, remaining_time)
            messages = self.consumer.consume(
                num_messages=min(remaining_messages, 1_000_000),
                timeout=batch_timeout,
            )

            if not messages:
                continue

            for msg in messages:
                if msg.error():
                    logger.warning(f"Consumer error: {msg.error()}")
                    continue
                valid_messages.append(msg)
                events_read += 1

        elapsed_total = time.monotonic() - start_time
        logger.info(f"Consumed {events_read} events in {elapsed_total:.2f}s")
        return valid_messages, events_read

    # WARNING: This function breaks if any single message is malformed JSON. We may
    # want to enhance it to handle individual message errors more gracefully.
    def read_arrow(
        self,
        aggregation_interval_seconds: int,
        max_events_to_aggregate: int,
    ) -> pa.Table | None:
        """
        Read messages from the subscribed topic into a PyArrow table.

        Args:
            aggregation_interval_seconds: Maximum time in seconds to consume messages.
            max_events_to_aggregate: Maximum number of events to consume.

        Returns:
            A PyArrow Table containing the parsed events, or None if no data was consumed.

        Notes:
            - Does NOT commit offsets. The caller is responsible for managing consumer lifecycle.
            - Does NOT (re)subscribe; the subscription is made once in the constructor.
            - Messages without a value (tombstones) are skipped with a warning.
            - Raises exceptions on JSON parsing errors.
            - Uses permissive parsing that ignores unexpected fields in JSON messages.
        """
        valid_messages, events_read = self._consume_batch(
            aggregation_interval_seconds, max_events_to_aggregate
        )

        if events_read == 0:
            logger.info("No data consumed from topic.")
            return None

        # A message value can be None (tombstone); writing it to the buffer would fail.
        payloads = [
            value for msg in valid_messages if (value := msg.value()) is not None
        ]
        skipped = events_read - len(payloads)
        if skipped:
            logger.warning(f"Skipped {skipped} messages without a value")
        if not payloads:
            logger.info("No data consumed from topic.")
            return None

        # Newline-delimited JSON; the trailing empty item terminates the last line.
        buffer = BytesIO(b"\n".join([*payloads, b""]))

        parse_options = pa_json.ParseOptions(
            explicit_schema=self.parse_schema,
            unexpected_field_behavior="ignore",
        )

        try:
            table = pa_json.read_json(buffer, parse_options=parse_options)
            table = table.cast(self.cast_schema)
        except Exception as e:
            logger.error(f"Failed to parse JSON with PyArrow: {e}")
            raise

        actual_rows = len(table)
        expected_rows = len(payloads)
        if actual_rows != expected_rows:
            logger.warning(
                f"Row count mismatch: consumed {expected_rows} messages, but parsed {actual_rows} rows. {expected_rows - actual_rows} messages may have been skipped."
            )
        else:
            logger.info(
                f"Successfully parsed {actual_rows} rows matching {expected_rows} consumed messages"
            )

        return table

    def read_pylist(
        self,
        aggregation_interval_seconds: int,
        max_events_to_aggregate: int,
    ) -> list[dict] | None:
        """
        Read messages from the subscribed topic into a list of dicts.

        Same batching semantics as read_arrow (time + count limits).
        Messages that fail JSON parsing are skipped (logged as errors).

        Returns:
            A list of parsed event dicts, or None if no data was consumed.

        Notes:
            - Does NOT commit offsets. The caller is responsible for managing consumer lifecycle.
        """
        valid_messages, events_read = self._consume_batch(
            aggregation_interval_seconds, max_events_to_aggregate
        )

        if events_read == 0:
            logger.info("No data consumed from topic.")
            return None

        rows = []
        for msg in valid_messages:
            try:
                rows.append(orjson.loads(msg.value()))
            except Exception as e:
                # Best effort by design: one bad message must not drop the batch.
                logger.error(f"Error parsing message from topic {msg.topic()}: {e}")

        logger.info(f"Successfully parsed {len(rows)} rows")
        return rows if rows else None

    def commit(self) -> None:
        """
        Commit the current offsets for all subscribed topics.
        """
        self.consumer.commit()
        logger.debug("Committed offsets")

    def close(self) -> None:
        """
        Close the Kafka consumer and release resources.
        """
        self.consumer.close()
        logger.info("Closed KafkaConsumer")
tkati_core/producer.py ADDED
@@ -0,0 +1,105 @@
1
+ """Kafka producer utilities for writing PyArrow tables as messages."""
2
+
3
+ from typing import TYPE_CHECKING, Literal
4
+
5
+ import orjson
6
+ import pyarrow as pa
7
+ from confluent_kafka import Producer
8
+ from loguru import logger
9
+
10
+ if TYPE_CHECKING:
11
+ from tkati_core.settings import KafkaOutputSettings
12
+
13
+
14
class KafkaProducer:
    """
    A Kafka producer wrapper that writes data as messages.

    Supports producing from PyArrow tables/batches or plain Python dicts.

    For Arrow-based production, two serialization formats are controlled by the
    topic's ``format`` setting:
    - ``"json"``: produces one Kafka message per row, serialized with orjson.
    - ``"arrow-batch"``: produces the entire table as a single Arrow IPC message.

    The optional ``key_column`` setting (from ``KafkaTopicSettings``) names the
    column whose value is used as the Kafka message key for each row (JSON format only).

    Instances can be used as context managers; pending messages are flushed on exit.
    """

    def __init__(
        self,
        kafka_config: dict[str, str],
        topic_name: str,
        format: Literal["json", "arrow-batch"] = "json",
        key_column: str | None = None,
    ) -> None:
        """
        Create the underlying ``confluent_kafka.Producer``.

        Args:
            kafka_config: Configuration passed verbatim to ``Producer``.
            topic_name: Topic every message is produced to.
            format: Serialization used by :meth:`produce_arrow`.
            key_column: Optional row field whose value becomes the message key.
        """
        self.producer = Producer(kafka_config)
        self.topic_name = topic_name
        self.format = format
        self.key_column = key_column
        logger.info(
            f"Initialized KafkaProducer with topic: {topic_name}, format: {format}"
        )

    @classmethod
    def from_output_settings(cls, settings: "KafkaOutputSettings") -> "KafkaProducer":
        """
        Construct a KafkaProducer from a KafkaOutputSettings instance.
        """
        return cls(
            kafka_config={"bootstrap.servers": settings.topic.broker},
            topic_name=settings.topic.name,
            format=settings.topic.format,
            key_column=settings.topic.key_column,
        )

    def __enter__(self) -> "KafkaProducer":
        """Return the producer itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Flush pending messages when leaving the ``with`` block."""
        self.close()

    @staticmethod
    def _on_delivery(err, msg) -> None:
        """Delivery callback: log messages the broker did not accept."""
        if err is not None:
            logger.error(f"Delivery failed for topic {msg.topic()}: {err}")

    def _produce(self, value: bytes, key: str | None = None) -> None:
        """Enqueue one message, draining the local queue once if it is full."""
        try:
            self.producer.produce(
                self.topic_name, value=value, key=key, on_delivery=self._on_delivery
            )
        except BufferError:
            # The local queue is full (e.g. a large table produced row by row):
            # wait for in-flight messages to be delivered, then retry once.
            self.producer.flush()
            self.producer.produce(
                self.topic_name, value=value, key=key, on_delivery=self._on_delivery
            )
        # Serve delivery callbacks without blocking so the queue keeps draining.
        self.producer.poll(0)

    def produce_arrow(self, data: pa.Table | pa.RecordBatch) -> None:
        """
        Produce data to the configured topic.

        For ``"json"`` format each row becomes a separate Kafka message serialized
        with orjson. If ``key_column`` is set, its value is used as the message key.

        For ``"arrow-batch"`` format the entire table is serialized as a single
        Arrow IPC stream message.

        Raises:
            ValueError: If ``format`` is neither ``"json"`` nor ``"arrow-batch"``.
        """
        if self.format == "json":
            self.produce_pylist(data.to_pylist())
        elif self.format == "arrow-batch":
            table = (
                data if isinstance(data, pa.Table) else pa.Table.from_batches([data])
            )
            buf = pa.BufferOutputStream()
            with pa.ipc.new_stream(buf, table.schema) as writer:
                for batch in table.to_batches():
                    writer.write_batch(batch)
            self._produce(value=buf.getvalue().to_pybytes())
        else:
            # Previously an unknown format silently produced nothing.
            raise ValueError(f"Unsupported producer format: {self.format!r}")

    def produce_pylist(self, rows: list[dict]) -> None:
        """
        Produce a list of dicts to the configured topic as JSON messages.

        Each dict becomes a separate Kafka message serialized with orjson.
        If ``key_column`` is set, its value is used as the Kafka message key;
        rows where the column is missing or ``None`` are produced without a key.
        """
        for row in rows:
            key_value = row.get(self.key_column) if self.key_column else None
            # A None value must not turn into the literal key "None".
            key = None if key_value is None else str(key_value)
            self._produce(value=orjson.dumps(row), key=key)

    def flush(self) -> None:
        """
        Block until all queued messages have been delivered.
        """
        self.producer.flush()
        logger.debug("Flushed KafkaProducer")

    def close(self) -> None:
        """
        Flush pending messages.

        The underlying producer is not closed explicitly here; this only
        guarantees that nothing is left in the local queue.
        """
        self.producer.flush()
        logger.info("Closed KafkaProducer")
tkati_core/py.typed ADDED
File without changes
tkati_core/settings.py ADDED
@@ -0,0 +1,66 @@
1
+ import os
2
+ from typing import Literal
3
+
4
+ from loguru import logger
5
+ from pydantic import BaseModel, Field
6
+ from pydantic_settings import (
7
+ BaseSettings,
8
+ PydanticBaseSettingsSource,
9
+ SettingsConfigDict,
10
+ TomlConfigSettingsSource,
11
+ )
12
+
13
# Path of the TOML settings file; overridable through the SETTINGS_FILE
# environment variable, defaulting to "settings.toml" (relative to the CWD).
SETTINGS_FILE = os.getenv("SETTINGS_FILE", "settings.toml")
14
+
15
+
16
# Settings describing one Kafka topic (used for both input and output).
class KafkaTopicSettings(BaseModel):
    # Broker address(es); passed as "bootstrap.servers" by the consumer/producer factories.
    broker: str
    # Topic name to subscribe or produce to.
    name: str
    # Field name -> type name (e.g. "string", "int64", "timestamp[ms]").
    # NOTE(review): the type-ignore is presumably needed because `schema`
    # shadows an attribute of BaseModel — confirm.
    schema: dict[str, str] = Field(default_factory=dict)  # type: ignore
    # Message serialization: one JSON message per row, or one Arrow IPC message per table.
    format: Literal["json", "arrow-batch"] = "json"
    # Optional row field whose value is used as the Kafka message key (JSON format only).
    key_column: str | None = None
22
+
23
+
24
# Parameters local to one consumer of a topic.
class KafkaConsumerSettings(BaseModel):
    # Consumer group; passed as "group.id".
    group_id: str
    # Maximum number of events per batch (the README passes it as
    # max_events_to_aggregate).
    batch_size: int = 1000
    # Maximum seconds to wait for a batch (the README passes it as
    # aggregation_interval_seconds).
    batch_timeout_sec: int = 5
    # Passed as "auto.offset.reset".
    auto_offset_reset: str = "latest"
29
+
30
+
31
# Input side of a node: the topic to read plus the consumer parameters.
class KafkaInputSettings(BaseModel):
    topic: KafkaTopicSettings
    consumer: KafkaConsumerSettings
34
+
35
+
36
# Output side of a node: the topic to write to.
class KafkaOutputSettings(BaseModel):
    topic: KafkaTopicSettings
38
+
39
+
40
+ logger.info(f"Using settings file: {os.path.abspath(SETTINGS_FILE)}")
41
+
42
+
43
# Base class for application settings that are additionally loaded from the
# TOML file named by SETTINGS_FILE. Subclass it and declare the fields needed.
class TomlBaseSettings(BaseSettings):
    model_config = SettingsConfigDict(
        toml_file=SETTINGS_FILE,
        env_file=".env",
        extra="ignore",
        env_nested_delimiter="__",
    )

    @classmethod
    def settings_customise_sources(
        cls,
        settings_cls: type[BaseSettings],
        init_settings: PydanticBaseSettingsSource,
        env_settings: PydanticBaseSettingsSource,
        dotenv_settings: PydanticBaseSettingsSource,
        file_secret_settings: PydanticBaseSettingsSource,
    ) -> tuple[PydanticBaseSettingsSource, ...]:
        """Return the settings sources in order, with the TOML file source last."""
        toml_settings = TomlConfigSettingsSource(settings_cls)
        ordered_sources = [
            init_settings,
            env_settings,
            dotenv_settings,
            file_secret_settings,
            toml_settings,
        ]
        return tuple(ordered_sources)
tkati_core/testing.py ADDED
@@ -0,0 +1,11 @@
1
+ from collections.abc import Generator
2
+
3
+ import pytest
4
+ from confluent_kafka.admin import AdminClient
5
+
6
+
7
@pytest.fixture(scope="function")
def kafka_admin_client() -> Generator[AdminClient, None, None]:
    """Yield a Kafka AdminClient for integration tests.

    Connects to ``localhost:9092``, so a broker (e.g. Redpanda) must be
    listening there.
    """
    client_config = {"bootstrap.servers": "localhost:9092"}
    yield AdminClient(client_config)
@@ -0,0 +1,118 @@
1
+ Metadata-Version: 2.4
2
+ Name: tkati-core
3
+ Version: 0.1.0
4
+ Summary: Add your description here
5
+ Requires-Python: >=3.13
6
+ Description-Content-Type: text/markdown
7
+ Requires-Dist: confluent-kafka>=2.11.0
8
+ Requires-Dist: loguru>=0.7.3
9
+ Requires-Dist: orjson>=3.9.0
10
+ Requires-Dist: pyarrow>=21.0.0
11
+ Requires-Dist: pydantic-settings>=2.11.0
12
+
13
+ # tkati-core
14
+
15
+ For now we assume that each node gets one input and produces one output in
16
+ the form of a Kafka stream.
17
+
18
+ ## Settings
19
+
20
+ The general form of the settings is:
21
+
22
+ ```toml
23
+ [input.topic]
24
+ # definition of input stream:
25
+ # - broker
26
+ # - topic name
27
+ # - message schema
28
+ # - message format = "json" / "arrow-batch"
29
+
30
+ [input.consumer]
31
+ # parameters local to this consumer
32
+ # - group_id
33
+ # - batch_size
34
+ # - batch_timeout_sec
35
+ # - auto_offset_reset
36
+
37
+ [output.topic]
38
+ # definition of output stream
39
+ # - broker
40
+ # - topic name
41
+ # - message schema
42
+ # - message format = "json" / "arrow-batch"
43
+ # - key_column (optional) = column to use as the Kafka message key
44
+
45
+ [...]
46
+ # settings specific to node function
47
+ ```
48
+
49
+ ## Usage
50
+
51
+ ### Constructing a consumer from settings
52
+
53
+ Use `KafkaConsumer.from_input_settings` to construct a consumer directly from
54
+ `KafkaInputSettings` — no need to manually map fields to Confluent Kafka config keys.
55
+
56
+ ```python
57
+ from tkati_core.settings import TomlBaseSettings, KafkaInputSettings
58
+ from tkati_core.consumer import KafkaConsumer
59
+
60
+ class AppSettings(TomlBaseSettings):
61
+ input: KafkaInputSettings
62
+ # ...
63
+
64
+ settings = AppSettings()
65
+ consumer = KafkaConsumer.from_input_settings(settings.input)
66
+
67
+ # Read a batch
68
+ table = consumer.read_arrow(
69
+ aggregation_interval_seconds=settings.input.consumer.batch_timeout_sec,
70
+ max_events_to_aggregate=settings.input.consumer.batch_size,
71
+ )
72
+ consumer.commit()
73
+ ```
74
+
75
+ The factory method sets `enable.auto.commit=False` — offsets must be committed explicitly
76
+ via `consumer.commit()`.
77
+
78
+ ### Constructing a producer from settings
79
+
80
+ Use `KafkaProducer.from_output_settings` to construct a producer directly from
81
+ `KafkaOutputSettings`. It accepts PyArrow tables or record batches and handles
82
+ serialization according to the topic's `format` setting.
83
+
84
+ ```python
85
+ from tkati_core.settings import TomlBaseSettings, KafkaOutputSettings
86
+ from tkati_core.producer import KafkaProducer
87
+
88
+ class AppSettings(TomlBaseSettings):
89
+ output: KafkaOutputSettings
90
+ # ...
91
+
92
+ settings = AppSettings()
93
+ producer = KafkaProducer.from_output_settings(settings.output)
94
+
95
+ # Produce a PyArrow table (one message per row for "json" format)
96
+ producer.produce_arrow(table)
97
+ producer.flush()
98
+ producer.close() # flushes and releases resources
99
+ ```
100
+
101
+ **Formats** — controlled by `output.topic.format` in `settings.toml`:
102
+
103
+ - `"json"` *(default)*: each row becomes a separate Kafka message serialized with orjson.
104
+ - `"arrow-batch"`: the entire table is serialized as a single Arrow IPC stream message.
105
+
106
+ **Message keys** — controlled by `output.topic.key_column` in `settings.toml`:
107
+
108
+ ```toml
109
+ [output.topic]
110
+ broker = "localhost:9092"
111
+ name = "my-output-topic"
112
+ key_column = "customer_id" # column whose value becomes the Kafka message key
113
+ ```
114
+
115
+ `key_column` is optional. When omitted (or `None`), messages are produced without a key.
116
+ When set, the value of that column for each row is used as the Kafka message key
117
+ (JSON format only — ignored for `"arrow-batch"`). This determines which Kafka partition
118
+ each message is routed to.
@@ -0,0 +1,9 @@
1
+ tkati_core/consumer.py,sha256=y9rVVDWpncuwJWhxvj6psx0b01GAVuONu_9JtvCBwFI,9519
2
+ tkati_core/producer.py,sha256=wGJW0E9UWzDDlgkii5-jpHnCwLdQQPuHaMI2Ctsk43k,3731
3
+ tkati_core/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ tkati_core/settings.py,sha256=RUhSB6Kfq43FjAJd0zRuq57_qKrgiPrYnu4ZLB3_s_s,1699
5
+ tkati_core/testing.py,sha256=cVE5uYkENk9KgQ0g3i5VCQ0YBdL3bPlRQbB1uLKDI5s,379
6
+ tkati_core-0.1.0.dist-info/METADATA,sha256=sGKFKV8ht9e1CtM7qWLsVXw45xDVKSqFe5GbP7VvZZ8,3369
7
+ tkati_core-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
8
+ tkati_core-0.1.0.dist-info/top_level.txt,sha256=HjR7-66EdB2ZTWc3U1qLVWwc435OAwrN58iJGhJE9UY,11
9
+ tkati_core-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1 @@
1
+ tkati_core