tkati-core 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tkati_core-0.1.0/PKG-INFO +118 -0
- tkati_core-0.1.0/README.md +106 -0
- tkati_core-0.1.0/pyproject.toml +28 -0
- tkati_core-0.1.0/setup.cfg +4 -0
- tkati_core-0.1.0/tests/test_consumer.py +172 -0
- tkati_core-0.1.0/tests/test_producer.py +198 -0
- tkati_core-0.1.0/tkati_core/consumer.py +268 -0
- tkati_core-0.1.0/tkati_core/producer.py +105 -0
- tkati_core-0.1.0/tkati_core/py.typed +0 -0
- tkati_core-0.1.0/tkati_core/settings.py +66 -0
- tkati_core-0.1.0/tkati_core/testing.py +11 -0
- tkati_core-0.1.0/tkati_core.egg-info/PKG-INFO +118 -0
- tkati_core-0.1.0/tkati_core.egg-info/SOURCES.txt +14 -0
- tkati_core-0.1.0/tkati_core.egg-info/dependency_links.txt +1 -0
- tkati_core-0.1.0/tkati_core.egg-info/requires.txt +5 -0
- tkati_core-0.1.0/tkati_core.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: tkati-core
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Add your description here
|
|
5
|
+
Requires-Python: >=3.13
|
|
6
|
+
Description-Content-Type: text/markdown
|
|
7
|
+
Requires-Dist: confluent-kafka>=2.11.0
|
|
8
|
+
Requires-Dist: loguru>=0.7.3
|
|
9
|
+
Requires-Dist: orjson>=3.9.0
|
|
10
|
+
Requires-Dist: pyarrow>=21.0.0
|
|
11
|
+
Requires-Dist: pydantic-settings>=2.11.0
|
|
12
|
+
|
|
13
|
+
# tkati-core
|
|
14
|
+
|
|
15
|
+
For now we assume that each node gets one input and produces one output in a
|
|
16
|
+
form of kafka stream.
|
|
17
|
+
|
|
18
|
+
## Settings
|
|
19
|
+
|
|
20
|
+
General form of settings is:
|
|
21
|
+
|
|
22
|
+
```toml
|
|
23
|
+
[input.topic]
|
|
24
|
+
# definition of input stream:
|
|
25
|
+
# - broker
|
|
26
|
+
# - topic name
|
|
27
|
+
# - message schema
|
|
28
|
+
# - message format = "json" / "arrow-batch"
|
|
29
|
+
|
|
30
|
+
[input.consumer]
|
|
31
|
+
# parameters local to this consumer
|
|
32
|
+
# - group_id
|
|
33
|
+
# - batch_size
|
|
34
|
+
# - batch_timeout_sec
|
|
35
|
+
# - auto_offset_reset
|
|
36
|
+
|
|
37
|
+
[output.topic]
|
|
38
|
+
# definition of output stream
|
|
39
|
+
# - broker
|
|
40
|
+
# - topic name
|
|
41
|
+
# - message schema
|
|
42
|
+
# - message format = "json" / "arrow-batch"
|
|
43
|
+
# - key_column (optional) = column to use as the Kafka message key
|
|
44
|
+
|
|
45
|
+
[...]
|
|
46
|
+
# settings specific to node function
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
## Usage
|
|
50
|
+
|
|
51
|
+
### Constructing a consumer from settings
|
|
52
|
+
|
|
53
|
+
Use `KafkaArrowConsumer.from_input_settings` to construct a consumer directly from
|
|
54
|
+
`KafkaInputSettings` — no need to manually map fields to Confluent Kafka config keys.
|
|
55
|
+
|
|
56
|
+
```python
|
|
57
|
+
from tkati_core.settings import TomlBaseSettings, KafkaInputSettings
|
|
58
|
+
from tkati_core.consumer import KafkaArrowConsumer
|
|
59
|
+
|
|
60
|
+
class AppSettings(TomlBaseSettings):
|
|
61
|
+
input: KafkaInputSettings
|
|
62
|
+
# ...
|
|
63
|
+
|
|
64
|
+
settings = AppSettings()
|
|
65
|
+
consumer = KafkaArrowConsumer.from_input_settings(settings.input)
|
|
66
|
+
|
|
67
|
+
# Read a batch
|
|
68
|
+
table = consumer.read_to_pyarrow(
|
|
69
|
+
aggregation_interval_seconds=settings.input.consumer.batch_timeout_sec,
|
|
70
|
+
max_events_to_aggregate=settings.input.consumer.batch_size,
|
|
71
|
+
)
|
|
72
|
+
consumer.commit()
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
The factory method sets `enable.auto.commit=False` — offsets must be committed explicitly
|
|
76
|
+
via `consumer.commit()`.
|
|
77
|
+
|
|
78
|
+
### Constructing a producer from settings
|
|
79
|
+
|
|
80
|
+
Use `KafkaArrowProducer.from_output_settings` to construct a producer directly from
|
|
81
|
+
`KafkaOutputSettings`. It accepts PyArrow tables or record batches and handles
|
|
82
|
+
serialization according to the topic's `format` setting.
|
|
83
|
+
|
|
84
|
+
```python
|
|
85
|
+
from tkati_core.settings import TomlBaseSettings, KafkaOutputSettings
|
|
86
|
+
from tkati_core.producer import KafkaArrowProducer
|
|
87
|
+
|
|
88
|
+
class AppSettings(TomlBaseSettings):
|
|
89
|
+
output: KafkaOutputSettings
|
|
90
|
+
# ...
|
|
91
|
+
|
|
92
|
+
settings = AppSettings()
|
|
93
|
+
producer = KafkaArrowProducer.from_output_settings(settings.output)
|
|
94
|
+
|
|
95
|
+
# Produce a PyArrow table (one message per row for "json" format)
|
|
96
|
+
producer.produce(table)
|
|
97
|
+
producer.flush()
|
|
98
|
+
producer.close() # flushes and releases resources
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
**Formats** — controlled by `output.topic.format` in `settings.toml`:
|
|
102
|
+
|
|
103
|
+
- `"json"` *(default)*: each row becomes a separate Kafka message serialized with orjson.
|
|
104
|
+
- `"arrow-batch"`: the entire table is serialized as a single Arrow IPC stream message.
|
|
105
|
+
|
|
106
|
+
**Message keys** — controlled by `output.topic.key_column` in `settings.toml`:
|
|
107
|
+
|
|
108
|
+
```toml
|
|
109
|
+
[output.topic]
|
|
110
|
+
broker = "localhost:9092"
|
|
111
|
+
name = "my-output-topic"
|
|
112
|
+
key_column = "customer_id" # column whose value becomes the Kafka message key
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
`key_column` is optional. When omitted (or `None`), messages are produced without a key.
|
|
116
|
+
When set, the value of that column for each row is used as the Kafka message key
|
|
117
|
+
(JSON format only — ignored for `"arrow-batch"`). This determines which Kafka partition
|
|
118
|
+
each message is routed to.
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
# tkati-core
|
|
2
|
+
|
|
3
|
+
For now we assume that each node gets one input and produces one output in the
|
|
4
|
+
form of a Kafka stream.
|
|
5
|
+
|
|
6
|
+
## Settings
|
|
7
|
+
|
|
8
|
+
General form of settings is:
|
|
9
|
+
|
|
10
|
+
```toml
|
|
11
|
+
[input.topic]
|
|
12
|
+
# definition of input stream:
|
|
13
|
+
# - broker
|
|
14
|
+
# - topic name
|
|
15
|
+
# - message schema
|
|
16
|
+
# - message format = "json" / "arrow-batch"
|
|
17
|
+
|
|
18
|
+
[input.consumer]
|
|
19
|
+
# parameters local to this consumer
|
|
20
|
+
# - group_id
|
|
21
|
+
# - batch_size
|
|
22
|
+
# - batch_timeout_sec
|
|
23
|
+
# - auto_offset_reset
|
|
24
|
+
|
|
25
|
+
[output.topic]
|
|
26
|
+
# definition of output stream
|
|
27
|
+
# - broker
|
|
28
|
+
# - topic name
|
|
29
|
+
# - message schema
|
|
30
|
+
# - message format = "json" / "arrow-batch"
|
|
31
|
+
# - key_column (optional) = column to use as the Kafka message key
|
|
32
|
+
|
|
33
|
+
[...]
|
|
34
|
+
# settings specific to node function
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Usage
|
|
38
|
+
|
|
39
|
+
### Constructing a consumer from settings
|
|
40
|
+
|
|
41
|
+
Use `KafkaConsumer.from_input_settings` to construct a consumer directly from
|
|
42
|
+
`KafkaInputSettings` — no need to manually map fields to Confluent Kafka config keys.
|
|
43
|
+
|
|
44
|
+
```python
|
|
45
|
+
from tkati_core.settings import TomlBaseSettings, KafkaInputSettings
|
|
46
|
+
from tkati_core.consumer import KafkaConsumer
|
|
47
|
+
|
|
48
|
+
class AppSettings(TomlBaseSettings):
|
|
49
|
+
input: KafkaInputSettings
|
|
50
|
+
# ...
|
|
51
|
+
|
|
52
|
+
settings = AppSettings()
|
|
53
|
+
consumer = KafkaConsumer.from_input_settings(settings.input)
|
|
54
|
+
|
|
55
|
+
# Read a batch
|
|
56
|
+
table = consumer.read_arrow(
|
|
57
|
+
aggregation_interval_seconds=settings.input.consumer.batch_timeout_sec,
|
|
58
|
+
max_events_to_aggregate=settings.input.consumer.batch_size,
|
|
59
|
+
)
|
|
60
|
+
consumer.commit()
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
The factory method sets `enable.auto.commit=False` — offsets must be committed explicitly
|
|
64
|
+
via `consumer.commit()`.
|
|
65
|
+
|
|
66
|
+
### Constructing a producer from settings
|
|
67
|
+
|
|
68
|
+
Use `KafkaProducer.from_output_settings` to construct a producer directly from
|
|
69
|
+
`KafkaOutputSettings`. It accepts PyArrow tables or record batches and handles
|
|
70
|
+
serialization according to the topic's `format` setting.
|
|
71
|
+
|
|
72
|
+
```python
|
|
73
|
+
from tkati_core.settings import TomlBaseSettings, KafkaOutputSettings
|
|
74
|
+
from tkati_core.producer import KafkaProducer
|
|
75
|
+
|
|
76
|
+
class AppSettings(TomlBaseSettings):
|
|
77
|
+
output: KafkaOutputSettings
|
|
78
|
+
# ...
|
|
79
|
+
|
|
80
|
+
settings = AppSettings()
|
|
81
|
+
producer = KafkaProducer.from_output_settings(settings.output)
|
|
82
|
+
|
|
83
|
+
# Produce a PyArrow table (one message per row for "json" format)
|
|
84
|
+
producer.produce_arrow(table)
|
|
85
|
+
producer.flush()
|
|
86
|
+
producer.close() # flushes and releases resources
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
**Formats** — controlled by `output.topic.format` in `settings.toml`:
|
|
90
|
+
|
|
91
|
+
- `"json"` *(default)*: each row becomes a separate Kafka message serialized with orjson.
|
|
92
|
+
- `"arrow-batch"`: the entire table is serialized as a single Arrow IPC stream message.
|
|
93
|
+
|
|
94
|
+
**Message keys** — controlled by `output.topic.key_column` in `settings.toml`:
|
|
95
|
+
|
|
96
|
+
```toml
|
|
97
|
+
[output.topic]
|
|
98
|
+
broker = "localhost:9092"
|
|
99
|
+
name = "my-output-topic"
|
|
100
|
+
key_column = "customer_id" # column whose value becomes the Kafka message key
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
`key_column` is optional. When omitted (or `None`), messages are produced without a key.
|
|
104
|
+
When set, the value of that column for each row is used as the Kafka message key
|
|
105
|
+
(JSON format only — ignored for `"arrow-batch"`). This determines which Kafka partition
|
|
106
|
+
each message is routed to.
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "tkati-core"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Kafka consumer and producer helpers that exchange data as PyArrow tables"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.13"
|
|
7
|
+
dependencies = [
|
|
8
|
+
"confluent-kafka>=2.11.0",
|
|
9
|
+
"loguru>=0.7.3",
|
|
10
|
+
"orjson>=3.9.0",
|
|
11
|
+
"pyarrow>=21.0.0",
|
|
12
|
+
"pydantic-settings>=2.11.0",
|
|
13
|
+
]
|
|
14
|
+
|
|
15
|
+
[dependency-groups]
|
|
16
|
+
dev = ["pytest>=9.0.1"]
|
|
17
|
+
|
|
18
|
+
[build-system]
|
|
19
|
+
requires = ["setuptools", "wheel"]
|
|
20
|
+
build-backend = "setuptools.build_meta"
|
|
21
|
+
|
|
22
|
+
[tool.setuptools.packages.find]
|
|
23
|
+
where = ["."]
|
|
24
|
+
include = ["tkati_core"]
|
|
25
|
+
|
|
26
|
+
[tool.uv-workspace-codegen]
|
|
27
|
+
generate = true
|
|
28
|
+
template_type = "package"
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
import orjson
|
|
2
|
+
import pyarrow as pa
|
|
3
|
+
import pytest
|
|
4
|
+
from confluent_kafka import Producer
|
|
5
|
+
from tkati_core.consumer import KafkaConsumer
|
|
6
|
+
from tkati_core.settings import KafkaInputSettings
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def test_from_input_settings_sets_attributes(input_settings: KafkaInputSettings):
    """The factory copies the topic name and the raw schema onto the consumer."""
    topic = input_settings.topic
    kafka_consumer = KafkaConsumer.from_input_settings(input_settings)
    try:
        assert kafka_consumer.topic_name == topic.name
        assert kafka_consumer.input_schema == topic.schema
    finally:
        # Always release the underlying Kafka client, even when an assert fails.
        kafka_consumer.close()
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def test_from_input_settings_builds_arrow_schemas(input_settings: KafkaInputSettings):
    """`id` is parsed as an Arrow string and `value` is cast to int64."""
    kafka_consumer = KafkaConsumer.from_input_settings(input_settings)
    try:
        id_type = kafka_consumer.parse_schema.field("id").type
        assert id_type == pa.string()
        value_type = kafka_consumer.cast_schema.field("value").type
        assert value_type == pa.int64()
    finally:
        kafka_consumer.close()
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def test_read_arrow_returns_none_on_empty_topic(
    input_settings: KafkaInputSettings, kafka_input_topic: str
):
    """read_arrow returns None when no message is consumed within the window."""
    limits = {"aggregation_interval_seconds": 2, "max_events_to_aggregate": 10}
    kafka_consumer = KafkaConsumer.from_input_settings(input_settings)
    try:
        outcome = kafka_consumer.read_arrow(**limits)
        assert outcome is None
    finally:
        kafka_consumer.close()
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def test_read_arrow_reads_json_messages(
    input_settings: KafkaInputSettings,
    kafka_input_topic: str,
    raw_producer: Producer,
):
    """Three JSON messages come back as a three-row table with typed columns."""
    payloads = [
        orjson.dumps({"id": ident, "value": number})
        for number, ident in enumerate("abc", start=1)
    ]
    for payload in payloads:
        raw_producer.produce(kafka_input_topic, value=payload)
    raw_producer.flush()

    kafka_consumer = KafkaConsumer.from_input_settings(input_settings)
    try:
        table = kafka_consumer.read_arrow(
            aggregation_interval_seconds=5,
            max_events_to_aggregate=10,
        )
    finally:
        kafka_consumer.close()

    assert table is not None
    assert len(table) == 3
    schema = table.schema
    assert schema.field("id").type == pa.string()
    assert schema.field("value").type == pa.int64()
    # Sort before comparing: the test does not rely on delivery order.
    assert sorted(table.column("id").to_pylist()) == ["a", "b", "c"]  # type: ignore
    assert sorted(table.column("value").to_pylist()) == [1, 2, 3]  # type: ignore
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def test_read_arrow_respects_max_events(
    input_settings: KafkaInputSettings,
    kafka_input_topic: str,
    raw_producer: Producer,
):
    """With ten messages produced, a cap of three yields exactly three rows."""
    for number in range(10):
        payload = orjson.dumps({"id": str(number), "value": number})
        raw_producer.produce(kafka_input_topic, value=payload)
    raw_producer.flush()

    kafka_consumer = KafkaConsumer.from_input_settings(input_settings)
    try:
        table = kafka_consumer.read_arrow(
            aggregation_interval_seconds=5,
            max_events_to_aggregate=3,
        )
    finally:
        kafka_consumer.close()

    assert table is not None
    assert len(table) == 3
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def test_read_arrow_timestamp_casting(
    kafka_input_topic: str,
    raw_producer: Producer,
    run_id: str,
):
    """Verify timestamp[ms] fields are parsed as int64 and cast to pa.timestamp('ms')."""
    from tkati_core.settings import (
        KafkaConsumerSettings,
        KafkaInputSettings,
        KafkaTopicSettings,
    )

    epoch_ms = 1_700_000_000_000

    settings = KafkaInputSettings(
        topic=KafkaTopicSettings(
            broker="localhost:9092",
            name=kafka_input_topic,
            schema={"ts": "timestamp[ms]"},
        ),
        consumer=KafkaConsumerSettings(
            # A per-run group id, so each run uses a distinct consumer group.
            group_id=f"test-ts-{run_id}",
            auto_offset_reset="earliest",
        ),
    )

    raw_producer.produce(kafka_input_topic, value=orjson.dumps({"ts": epoch_ms}))
    raw_producer.flush()

    consumer = KafkaConsumer.from_input_settings(settings)
    try:
        table = consumer.read_arrow(aggregation_interval_seconds=5, max_events_to_aggregate=1)
    finally:
        consumer.close()

    assert table is not None
    assert table.schema.field("ts").type == pa.timestamp("ms")
    # Compare the stored epoch value directly. The column has no timezone, so
    # as_py() yields a naive datetime and datetime.timestamp() would interpret
    # it in the machine's local timezone, failing on hosts that are not on UTC.
    assert table.column("ts").cast(pa.int64()).to_pylist() == [epoch_ms]
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def test_read_pylist_returns_none_on_empty_topic(
    input_settings: KafkaInputSettings, kafka_input_topic: str
):
    """read_pylist returns None when no message is consumed within the window."""
    limits = {"aggregation_interval_seconds": 2, "max_events_to_aggregate": 10}
    kafka_consumer = KafkaConsumer.from_input_settings(input_settings)
    try:
        outcome = kafka_consumer.read_pylist(**limits)
        assert outcome is None
    finally:
        kafka_consumer.close()
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def test_read_pylist_reads_json_messages(
    input_settings: KafkaInputSettings,
    kafka_input_topic: str,
    raw_producer: Producer,
):
    """Three JSON messages come back as three dict rows with matching values."""
    payloads = [
        orjson.dumps({"id": ident, "value": number})
        for number, ident in enumerate("abc", start=1)
    ]
    for payload in payloads:
        raw_producer.produce(kafka_input_topic, value=payload)
    raw_producer.flush()

    kafka_consumer = KafkaConsumer.from_input_settings(input_settings)
    try:
        rows = kafka_consumer.read_pylist(
            aggregation_interval_seconds=5,
            max_events_to_aggregate=10,
        )
    finally:
        kafka_consumer.close()

    assert rows is not None
    assert len(rows) == 3
    # Sort before comparing: the test does not rely on delivery order.
    assert sorted(row["id"] for row in rows) == ["a", "b", "c"]
    assert sorted(row["value"] for row in rows) == [1, 2, 3]
|
|
@@ -0,0 +1,198 @@
|
|
|
1
|
+
import time
|
|
2
|
+
|
|
3
|
+
import orjson
|
|
4
|
+
import pyarrow as pa
|
|
5
|
+
from confluent_kafka import Consumer
|
|
6
|
+
from tkati_core.producer import KafkaProducer
|
|
7
|
+
from tkati_core.settings import KafkaOutputSettings, KafkaTopicSettings
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _consume_all(consumer: Consumer, topic: str, count: int, timeout: float = 10.0) -> list:
    """Consume up to `count` messages from `topic`, returning raw confluent Message objects.

    Subscribes `consumer` to `topic` and polls in one-second slices until
    `count` error-free messages have been collected or `timeout` seconds have
    elapsed, whichever comes first. The returned list can therefore hold fewer
    than `count` messages; callers assert on its length. Messages that carry an
    error are dropped.
    """
    consumer.subscribe([topic])
    messages: list = []
    # Monotonic clock: a wall-clock adjustment must not stretch or cut the wait.
    deadline = time.monotonic() + timeout
    while len(messages) < count and time.monotonic() < deadline:
        batch = consumer.consume(num_messages=count - len(messages), timeout=1.0)
        messages.extend(m for m in batch if not m.error())
    return messages
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def test_from_output_settings_sets_attributes(output_settings: KafkaOutputSettings):
    """The factory copies topic name, format and key column onto the producer."""
    topic = output_settings.topic
    kafka_producer = KafkaProducer.from_output_settings(output_settings)
    try:
        assert kafka_producer.topic_name == topic.name
        assert kafka_producer.format == topic.format
        assert kafka_producer.key_column == topic.key_column
    finally:
        kafka_producer.close()
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def test_produce_json_format(
    output_settings: KafkaOutputSettings,
    kafka_output_topic: str,
    raw_consumer: Consumer,
):
    """A two-row table is produced as two JSON messages carrying the row values."""
    table = pa.table({"name": ["alice", "bob"], "score": [10, 20]})

    kafka_producer = KafkaProducer.from_output_settings(output_settings)
    try:
        kafka_producer.produce_arrow(table)
        kafka_producer.flush()
    finally:
        kafka_producer.close()

    received = _consume_all(raw_consumer, kafka_output_topic, count=2)
    assert len(received) == 2
    decoded = [orjson.loads(message.value()) for message in received]
    # Compare as sets: the test does not rely on delivery order.
    seen_names = {row["name"] for row in decoded}
    seen_scores = {row["score"] for row in decoded}
    assert seen_names == {"alice", "bob"}
    assert seen_scores == {10, 20}
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def test_produce_json_format_no_message_key_by_default(
    output_settings: KafkaOutputSettings,
    kafka_output_topic: str,
    raw_consumer: Consumer,
):
    """With the fixture settings, the produced message carries no key."""
    single_row = pa.table({"x": [1]})

    kafka_producer = KafkaProducer.from_output_settings(output_settings)
    try:
        kafka_producer.produce_arrow(single_row)
        kafka_producer.flush()
    finally:
        kafka_producer.close()

    received = _consume_all(raw_consumer, kafka_output_topic, count=1)
    first_message = received[0]
    assert first_message.key() is None
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def test_produce_json_format_with_key_column(
    kafka_output_topic: str,
    raw_consumer: Consumer,
    run_id: str,
):
    """When key_column is set, each message key is that column's value for the row."""
    topic_settings = KafkaTopicSettings(
        broker="localhost:9092",
        name=kafka_output_topic,
        key_column="user_id",
    )
    settings = KafkaOutputSettings(topic=topic_settings)
    table = pa.table({"user_id": ["u1", "u2"], "value": [100, 200]})

    kafka_producer = KafkaProducer.from_output_settings(settings)
    try:
        kafka_producer.produce_arrow(table)
        kafka_producer.flush()
    finally:
        kafka_producer.close()

    received = _consume_all(raw_consumer, kafka_output_topic, count=2)
    assert len(received) == 2
    seen_keys = {message.key().decode() for message in received}
    assert seen_keys == {"u1", "u2"}
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def test_produce_arrow_batch_format(
    kafka_output_topic: str,
    raw_consumer: Consumer,
    run_id: str,
):
    """In "arrow-batch" format the table travels as one Arrow IPC stream message."""
    topic_settings = KafkaTopicSettings(
        broker="localhost:9092",
        name=kafka_output_topic,
        format="arrow-batch",
    )
    settings = KafkaOutputSettings(topic=topic_settings)
    original = pa.table({"id": ["x", "y", "z"], "n": [1, 2, 3]})

    kafka_producer = KafkaProducer.from_output_settings(settings)
    try:
        kafka_producer.produce_arrow(original)
        kafka_producer.flush()
    finally:
        kafka_producer.close()

    received = _consume_all(raw_consumer, kafka_output_topic, count=1)
    assert len(received) == 1

    # Decode the single payload back into a table and compare it with the input.
    recovered = pa.ipc.open_stream(received[0].value()).read_all()

    assert recovered.schema == original.schema
    assert recovered.equals(original)
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def test_produce_record_batch(
    output_settings: KafkaOutputSettings,
    kafka_output_topic: str,
    raw_consumer: Consumer,
):
    """produce_arrow() accepts a pa.RecordBatch and emits one JSON message per row."""
    record_batch = pa.record_batch({"x": [7, 8]})

    kafka_producer = KafkaProducer.from_output_settings(output_settings)
    try:
        kafka_producer.produce_arrow(record_batch)
        kafka_producer.flush()
    finally:
        kafka_producer.close()

    received = _consume_all(raw_consumer, kafka_output_topic, count=2)
    assert len(received) == 2
    seen_values = {orjson.loads(message.value())["x"] for message in received}
    assert seen_values == {7, 8}
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def test_produce_pylist_json_format(
    output_settings: KafkaOutputSettings,
    kafka_output_topic: str,
    raw_consumer: Consumer,
):
    """A list of two dict rows is produced as two JSON messages carrying the row values."""
    rows = [{"name": "alice", "score": 10}, {"name": "bob", "score": 20}]

    kafka_producer = KafkaProducer.from_output_settings(output_settings)
    try:
        kafka_producer.produce_pylist(rows)
        kafka_producer.flush()
    finally:
        kafka_producer.close()

    received = _consume_all(raw_consumer, kafka_output_topic, count=2)
    assert len(received) == 2
    decoded = [orjson.loads(message.value()) for message in received]
    # Compare as sets: the test does not rely on delivery order.
    seen_names = {row["name"] for row in decoded}
    seen_scores = {row["score"] for row in decoded}
    assert seen_names == {"alice", "bob"}
    assert seen_scores == {10, 20}
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def test_produce_pylist_with_key_column(
    kafka_output_topic: str,
    raw_consumer: Consumer,
    run_id: str,
):
    """When key_column is set, produce_pylist keys each message by that field of the row."""
    topic_settings = KafkaTopicSettings(
        broker="localhost:9092",
        name=kafka_output_topic,
        key_column="user_id",
    )
    settings = KafkaOutputSettings(topic=topic_settings)
    rows = [{"user_id": "u1", "value": 100}, {"user_id": "u2", "value": 200}]

    kafka_producer = KafkaProducer.from_output_settings(settings)
    try:
        kafka_producer.produce_pylist(rows)
        kafka_producer.flush()
    finally:
        kafka_producer.close()

    received = _consume_all(raw_consumer, kafka_output_topic, count=2)
    assert len(received) == 2
    seen_keys = {message.key().decode() for message in received}
    assert seen_keys == {"u1", "u2"}
|
|
@@ -0,0 +1,268 @@
|
|
|
1
|
+
"""Kafka consumer utilities for reading messages into PyArrow tables."""
|
|
2
|
+
|
|
3
|
+
import time
|
|
4
|
+
from io import BytesIO
|
|
5
|
+
from typing import TYPE_CHECKING
|
|
6
|
+
|
|
7
|
+
import orjson
|
|
8
|
+
import pyarrow as pa
|
|
9
|
+
from confluent_kafka import Consumer
|
|
10
|
+
from loguru import logger
|
|
11
|
+
from pyarrow import json as pa_json
|
|
12
|
+
|
|
13
|
+
if TYPE_CHECKING:
|
|
14
|
+
from tkati_core.settings import KafkaInputSettings
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class KafkaConsumer:
|
|
18
|
+
"""
|
|
19
|
+
A Kafka consumer wrapper that reads messages into PyArrow tables or Python lists.
|
|
20
|
+
|
|
21
|
+
This class manages the Kafka consumer lifecycle, topic subscription,
|
|
22
|
+
and provides a convenient interface for reading messages as PyArrow tables
|
|
23
|
+
or plain Python dicts.
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
@classmethod
def from_input_settings(cls, settings: "KafkaInputSettings") -> "KafkaConsumer":
    """
    Build a KafkaConsumer from a KafkaInputSettings instance.

    The broker, group id and offset-reset policy are taken from the settings;
    enable.auto.commit is always set to False, so offsets must be committed
    explicitly via .commit().
    """
    topic = settings.topic
    consumer_settings = settings.consumer

    kafka_config: dict[str, str | bool] = {}
    kafka_config["bootstrap.servers"] = topic.broker
    kafka_config["group.id"] = consumer_settings.group_id
    kafka_config["auto.offset.reset"] = consumer_settings.auto_offset_reset
    kafka_config["enable.auto.commit"] = False

    return cls(
        kafka_config=kafka_config,
        topic_name=topic.name,
        input_schema=topic.schema,
    )
|
|
44
|
+
|
|
45
|
+
def __init__(
    self,
    kafka_config: dict[str, str | bool],
    topic_name: str,
    input_schema: dict[str, str],
) -> None:
    """
    Initialize the Kafka consumer with the provided configuration.

    Creates the underlying confluent_kafka Consumer, subscribes it to
    topic_name, and derives two PyArrow schemas from input_schema:
    parse_schema (the types used while reading JSON) and cast_schema (the
    types the parsed table is cast to afterwards). The two differ only for
    "timestamp[ms]", which is parsed as int64 and cast to timestamp("ms").

    Args:
        kafka_config: Dictionary of Kafka consumer configuration parameters.
            Common keys include:
            - 'bootstrap.servers': Kafka broker addresses
            - 'group.id': Consumer group ID
            - 'auto.offset.reset': Offset reset behavior
            - 'enable.auto.commit': Whether to auto-commit offsets
        topic_name: Name of the topic to subscribe to.
        input_schema: Mapping of field name to type name. Supported type
            names are "string", "int", "int32", "int64", "uint8", "uint32",
            "uint64" and "timestamp[ms]". Any other type name is logged as a
            warning and treated as string.
    """
    self.consumer = Consumer(kafka_config)
    self.topic_name = topic_name
    self.input_schema = input_schema

    self.consumer.subscribe([self.topic_name])

    # Create PyArrow schema based on input_schema
    # type -> (parse_type, cast_type)
    type_mapping: dict[str, tuple[pa.DataType, pa.DataType]] = {
        "string": (pa.string(), pa.string()),
        "int32": (pa.int32(), pa.int32()),
        "int64": (pa.int64(), pa.int64()),
        "uint32": (pa.uint32(), pa.uint32()),
        "uint64": (pa.uint64(), pa.uint64()),
        "uint8": (pa.uint8(), pa.uint8()),
        "int": (pa.int32(), pa.int32()),
        "timestamp[ms]": (pa.int64(), pa.timestamp("ms")),
    }

    parse_schema_fields = []
    cast_schema_fields = []

    for field_name, field_type in input_schema.items():
        types = type_mapping.get(field_type)

        if types is not None:
            parse_type, cast_type = types
        else:
            logger.warning(
                f"Unsupported field type '{field_type}' for field '{field_name}'. Defaulting to string."
            )
            parse_type, cast_type = (pa.string(), pa.string())

        parse_schema_fields.append(pa.field(field_name, parse_type))
        cast_schema_fields.append(pa.field(field_name, cast_type))

    self.parse_schema = pa.schema(parse_schema_fields)
    self.cast_schema = pa.schema(cast_schema_fields)

    # kafka_config is caller-supplied and may carry credentials (for example
    # 'sasl.password' or 'ssl.key.password'); mask those before logging so
    # secrets never reach the log output.
    sensitive_markers = ("password", "secret", "key.pem")
    loggable_config = {
        key: "***" if any(marker in key.lower() for marker in sensitive_markers) else value
        for key, value in kafka_config.items()
    }

    logger.info(
        f"Initialized KafkaConsumer with config: {loggable_config} and topic: {topic_name}"
    )
|
|
104
|
+
|
|
105
|
+
def _consume_batch(
    self,
    aggregation_interval_seconds: int,
    max_events_to_aggregate: int,
) -> tuple[list, int]:
    """
    Consume raw messages from Kafka within the given time and count limits.

    Polls the consumer repeatedly until either max_events_to_aggregate
    error-free messages have been collected or aggregation_interval_seconds
    have elapsed. Messages that carry an error are logged and skipped; they
    do not count towards the limit.

    Args:
        aggregation_interval_seconds: Maximum time in seconds to keep polling.
        max_events_to_aggregate: Maximum number of messages to collect.

    Returns:
        A tuple of (messages, events_read) where messages is a list of
        confluent_kafka Message objects (without errors) and events_read is
        the number of messages in that list.
    """
    if self.topic_name:
        logger.info(
            f"Consuming events from topic(s): {self.topic_name} for up to {aggregation_interval_seconds}s or {max_events_to_aggregate} events"
        )
    else:
        logger.info(
            f"Consuming events for up to {aggregation_interval_seconds}s or {max_events_to_aggregate} events"
        )

    # Monotonic clock: the time budget must not be distorted by wall-clock
    # adjustments (NTP corrections, manual changes) while polling.
    start_time = time.monotonic()
    events_read = 0
    poll_timeout = 10  # upper bound in seconds for a single consume() call
    valid_messages = []

    while events_read < max_events_to_aggregate:
        elapsed = time.monotonic() - start_time
        remaining_time = aggregation_interval_seconds - elapsed

        if remaining_time <= 0:
            logger.info(f"Reached time limit of {aggregation_interval_seconds}s")
            break

        remaining_messages = max_events_to_aggregate - events_read
        # Never wait past the overall deadline in a single poll.
        batch_timeout = min(poll_timeout, remaining_time)
        messages = self.consumer.consume(
            # NOTE(review): the 1_000_000 cap presumably mirrors the largest
            # num_messages that consume() accepts — confirm against the docs.
            num_messages=min(remaining_messages, 1_000_000),
            timeout=batch_timeout,
        )

        if not messages:
            # Nothing arrived in this poll; loop again to re-check the deadline.
            continue

        for msg in messages:
            if msg.error():
                logger.info(f"Consumer error: {msg.error()}")
                continue
            valid_messages.append(msg)
            events_read += 1

    elapsed_total = time.monotonic() - start_time
    logger.info(f"Consumed {events_read} events in {elapsed_total:.2f}s")
    return valid_messages, events_read
|
|
158
|
+
|
|
159
|
+
# WARNING: This function breaks if any single message is malformed JSON. We may
|
|
160
|
+
# want to enhance it to handle individual message errors more gracefully.
|
|
161
|
+
def read_arrow(
    self,
    aggregation_interval_seconds: int,
    max_events_to_aggregate: int,
) -> pa.Table | None:
    """
    Read messages from subscribed topics into a PyArrow table.

    Args:
        aggregation_interval_seconds: Maximum time in seconds to consume messages.
        max_events_to_aggregate: Maximum number of events to consume.

    Returns:
        A PyArrow Table containing the parsed events, or None if no data was
        consumed or every consumed message carried a null payload.

    Raises:
        Exception: Any error raised while parsing the batch as JSON or casting
            it to ``self.cast_schema``; it is logged and re-raised unchanged.

    Notes:
        - Does NOT commit offsets. The caller is responsible for managing consumer lifecycle.
        - Does NOT subscribe to topics. The consumer must be pre-subscribed.
        - Raises exceptions on JSON parsing errors.
        - Uses permissive parsing that ignores unexpected fields in JSON messages.
        - Messages whose value is None (e.g. tombstones) are skipped with a warning
          instead of aborting the whole batch.
    """
    valid_messages, events_read = self._consume_batch(
        aggregation_interval_seconds, max_events_to_aggregate
    )

    if events_read == 0:
        logger.info("No data consumed from topic.")
        return None

    # A message may legitimately carry no value; writing None into the buffer
    # would raise TypeError, so keep only real payloads.
    payloads = [
        value for msg in valid_messages if (value := msg.value()) is not None
    ]
    null_payloads = len(valid_messages) - len(payloads)
    if null_payloads:
        logger.warning(f"Skipped {null_payloads} messages with a null payload")

    if not payloads:
        # Nothing left to parse; an empty buffer is not valid input for read_json.
        logger.info("All consumed messages had a null payload; nothing to parse.")
        return None

    # Assemble newline-delimited JSON: one message payload per line.
    buffer = BytesIO()
    for value in payloads:
        buffer.write(value)
        buffer.write(b"\n")
    buffer.seek(0)

    parse_options = pa_json.ParseOptions(
        explicit_schema=self.parse_schema,
        unexpected_field_behavior="ignore",
    )

    try:
        table = pa_json.read_json(buffer, parse_options=parse_options)
        table = table.cast(self.cast_schema)
        actual_rows = len(table)

        # The comparison stays against events_read so that skipped messages
        # are surfaced by the mismatch warning as well.
        if actual_rows != events_read:
            logger.warning(
                f"Row count mismatch: consumed {events_read} messages, but parsed {actual_rows} rows. {events_read - actual_rows} messages may have been skipped."
            )
        else:
            logger.info(
                f"Successfully parsed {actual_rows} rows matching {events_read} consumed messages"
            )

    except Exception as e:
        logger.error(f"Failed to parse JSON with PyArrow: {e}")
        raise

    return table
|
|
220
|
+
|
|
221
|
+
def read_pylist(
    self,
    aggregation_interval_seconds: int,
    max_events_to_aggregate: int,
) -> list[dict] | None:
    """
    Consume one batch and decode every message payload with orjson.

    Batching follows the same time and count limits as read_arrow. A message
    whose payload cannot be decoded is logged as an error and left out of the
    result; the remaining messages are still returned.

    Returns:
        The decoded events as a list of dicts, or None when nothing was
        consumed or no message could be decoded.

    Notes:
        - Does NOT commit offsets. The caller is responsible for managing consumer lifecycle.
    """
    batch, consumed = self._consume_batch(
        aggregation_interval_seconds, max_events_to_aggregate
    )

    if not consumed:
        logger.info("No data consumed from topic.")
        return None

    parsed = []
    for record in batch:
        try:
            event = orjson.loads(record.value())
        except Exception as exc:
            logger.error(f"Error parsing message from topic {record.topic()}: {exc}")
        else:
            parsed.append(event)

    logger.info(f"Successfully parsed {len(parsed)} rows")
    return parsed or None
|
|
255
|
+
|
|
256
|
+
def commit(self) -> None:
    """
    Commit the current offsets for all subscribed topics.

    Delegates to ``self.consumer.commit()`` with no arguments, so the
    underlying client's default commit mode applies, then logs at debug level.
    """
    self.consumer.commit()
    logger.debug("Committed offsets")
|
|
262
|
+
|
|
263
|
+
def close(self) -> None:
    """
    Close the Kafka consumer and release resources.

    Delegates to ``self.consumer.close()`` and logs the shutdown. This method
    does not call ``commit()`` itself.
    """
    self.consumer.close()
    logger.info("Closed KafkaConsumer")
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
"""Kafka producer utilities for writing PyArrow tables as messages."""
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING, Literal
|
|
4
|
+
|
|
5
|
+
import orjson
|
|
6
|
+
import pyarrow as pa
|
|
7
|
+
from confluent_kafka import Producer
|
|
8
|
+
from loguru import logger
|
|
9
|
+
|
|
10
|
+
if TYPE_CHECKING:
|
|
11
|
+
from tkati_core.settings import KafkaOutputSettings
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class KafkaProducer:
    """
    A Kafka producer wrapper that writes data as messages.

    Supports producing from PyArrow tables/batches or plain Python dicts.

    For Arrow-based production, two serialization formats are controlled by the
    topic's ``format`` setting:
    - ``"json"``: produces one Kafka message per row, serialized with orjson.
    - ``"arrow-batch"``: produces the entire table as a single Arrow IPC message.

    The optional ``key_column`` setting (from ``KafkaTopicSettings``) names the
    column whose value is used as the Kafka message key for each row (JSON format only).

    Messages are only queued by the ``produce_*`` methods; call ``flush()`` or
    ``close()`` to wait for delivery.
    """

    def __init__(
        self,
        kafka_config: dict[str, str],
        topic_name: str,
        format: Literal["json", "arrow-batch"] = "json",
        key_column: str | None = None,
    ) -> None:
        """
        Create the underlying confluent_kafka ``Producer``.

        Args:
            kafka_config: Configuration dict passed straight to ``Producer``.
            topic_name: Topic every message is produced to.
            format: Serialization used by ``produce_arrow``.
            key_column: Optional row field whose value becomes the message key.
        """
        self.producer = Producer(kafka_config)
        self.topic_name = topic_name
        self.format = format
        self.key_column = key_column
        logger.info(
            f"Initialized KafkaProducer with topic: {topic_name}, format: {format}"
        )

    @classmethod
    def from_output_settings(cls, settings: "KafkaOutputSettings") -> "KafkaProducer":
        """
        Construct a KafkaProducer from a KafkaOutputSettings instance.

        The topic's broker becomes ``bootstrap.servers``; name, format and
        key_column are taken from ``settings.topic`` unchanged.
        """
        return cls(
            kafka_config={"bootstrap.servers": settings.topic.broker},
            topic_name=settings.topic.name,
            format=settings.topic.format,
            key_column=settings.topic.key_column,
        )

    def _send(self, **message) -> None:
        """
        Queue one message on the configured topic, retrying once on a full queue.

        ``produce()`` raises BufferError when the client's local queue is full.
        In that case the queue is drained with ``flush()`` and the message is
        queued again; a second failure propagates to the caller.
        """
        try:
            self.producer.produce(self.topic_name, **message)
        except BufferError:
            logger.warning("Local producer queue is full; flushing before retrying")
            self.producer.flush()
            self.producer.produce(self.topic_name, **message)

    def produce_arrow(self, data: pa.Table | pa.RecordBatch) -> None:
        """
        Produce data to the configured topic.

        For ``"json"`` format each row becomes a separate Kafka message serialized
        with orjson. If ``key_column`` is set, its value is used as the message key.

        For ``"arrow-batch"`` format the entire table is serialized as a single
        Arrow IPC stream message.

        Raises:
            ValueError: If ``self.format`` is neither ``"json"`` nor
                ``"arrow-batch"``; previously such data was silently dropped.
        """
        if self.format == "json":
            self.produce_pylist(data.to_pylist())
        elif self.format == "arrow-batch":
            # A lone RecordBatch is wrapped so both input kinds share one code path.
            table = (
                data if isinstance(data, pa.Table) else pa.Table.from_batches([data])
            )
            buf = pa.BufferOutputStream()
            with pa.ipc.new_stream(buf, table.schema) as writer:
                for batch in table.to_batches():
                    writer.write_batch(batch)
            self._send(value=buf.getvalue().to_pybytes())
        else:
            raise ValueError(
                f"Unsupported format: {self.format!r}; expected 'json' or 'arrow-batch'"
            )

    def produce_pylist(self, rows: list[dict]) -> None:
        """
        Produce a list of dicts to the configured topic as JSON messages.

        Each dict becomes a separate Kafka message serialized with orjson.
        If ``key_column`` is set, its value is used as the Kafka message key.
        Rows that lack the key column are produced without a key.
        """
        for row in rows:
            key = (
                str(row[self.key_column])
                if self.key_column and self.key_column in row
                else None
            )
            self._send(value=orjson.dumps(row), key=key)

    def flush(self) -> None:
        """
        Block until all queued messages have been delivered.
        """
        self.producer.flush()
        logger.debug("Flushed KafkaProducer")

    def close(self) -> None:
        """
        Flush pending messages before the producer is discarded.

        Only ``flush()`` is called on the underlying producer; no separate
        close call is made here.
        """
        self.producer.flush()
        logger.info("Closed KafkaProducer")
|
|
File without changes
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from typing import Literal
|
|
3
|
+
|
|
4
|
+
from loguru import logger
|
|
5
|
+
from pydantic import BaseModel, Field
|
|
6
|
+
from pydantic_settings import (
|
|
7
|
+
BaseSettings,
|
|
8
|
+
PydanticBaseSettingsSource,
|
|
9
|
+
SettingsConfigDict,
|
|
10
|
+
TomlConfigSettingsSource,
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
# Path of the TOML settings file; the SETTINGS_FILE environment variable
# overrides the default "settings.toml".
SETTINGS_FILE = os.environ.get("SETTINGS_FILE", "settings.toml")
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class KafkaTopicSettings(BaseModel):
    """Definition of one Kafka stream: where it lives and how its messages look."""

    # Broker address; KafkaProducer.from_output_settings passes it as "bootstrap.servers".
    broker: str
    # Topic name.
    name: str
    # Message schema as a str -> str mapping (presumably field name -> type name;
    # confirm against the consumer). Empty by default.
    # NOTE(review): the "type: ignore" is presumably needed because the field name
    # shadows an attribute of BaseModel - confirm.
    schema: dict[str, str] = Field(default_factory=dict)  # type: ignore
    # Message serialization format.
    format: Literal["json", "arrow-batch"] = "json"
    # Optional column whose value is used as the Kafka message key.
    key_column: str | None = None
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class KafkaConsumerSettings(BaseModel):
    """Parameters local to one consumer of an input topic."""

    # Consumer group identifier (required).
    group_id: str
    # The README usage example passes this as max_events_to_aggregate.
    batch_size: int = 1000
    # The README usage example passes this as aggregation_interval_seconds.
    batch_timeout_sec: int = 5
    # Presumably forwarded to the Kafka "auto.offset.reset" option - confirm in the consumer factory.
    auto_offset_reset: str = "latest"
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class KafkaInputSettings(BaseModel):
    """Input side of a node: the topic to read plus the consumer parameters."""

    # Input stream definition (the [input.topic] table).
    topic: KafkaTopicSettings
    # Consumer-specific parameters (the [input.consumer] table).
    consumer: KafkaConsumerSettings
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class KafkaOutputSettings(BaseModel):
    """Output side of a node: the topic that results are produced to."""

    # Output stream definition (the [output.topic] table).
    topic: KafkaTopicSettings
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
# Runs at import time: records the absolute path of the settings file in use.
logger.info(f"Using settings file: {os.path.abspath(SETTINGS_FILE)}")
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class TomlBaseSettings(BaseSettings):
    """
    Base settings class that adds the TOML settings file as a configuration source.

    Subclasses declare their own fields. Values are looked up in init arguments,
    environment variables, the ``.env`` file, file secrets and finally the TOML
    file named by ``SETTINGS_FILE``.
    """

    model_config = SettingsConfigDict(
        env_file=".env",
        env_nested_delimiter="__",
        extra="ignore",
        toml_file=SETTINGS_FILE,
    )

    @classmethod
    def settings_customise_sources(
        cls,
        settings_cls: type[BaseSettings],
        init_settings: PydanticBaseSettingsSource,
        env_settings: PydanticBaseSettingsSource,
        dotenv_settings: PydanticBaseSettingsSource,
        file_secret_settings: PydanticBaseSettingsSource,
    ) -> tuple[PydanticBaseSettingsSource, ...]:
        """
        Return the default sources followed by the TOML file source.

        The TOML source is listed last, after every default source.
        """
        toml_settings = TomlConfigSettingsSource(settings_cls)
        default_sources = (
            init_settings,
            env_settings,
            dotenv_settings,
            file_secret_settings,
        )
        return (*default_sources, toml_settings)
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
from collections.abc import Generator
|
|
2
|
+
|
|
3
|
+
import pytest
|
|
4
|
+
from confluent_kafka.admin import AdminClient
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@pytest.fixture(scope="function")
def kafka_admin_client() -> Generator[AdminClient, None, None]:
    """Yield a Kafka AdminClient for integration tests; expects Redpanda on localhost:9092."""
    client_config = {"bootstrap.servers": "localhost:9092"}
    yield AdminClient(client_config)
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: tkati-core
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Add your description here
|
|
5
|
+
Requires-Python: >=3.13
|
|
6
|
+
Description-Content-Type: text/markdown
|
|
7
|
+
Requires-Dist: confluent-kafka>=2.11.0
|
|
8
|
+
Requires-Dist: loguru>=0.7.3
|
|
9
|
+
Requires-Dist: orjson>=3.9.0
|
|
10
|
+
Requires-Dist: pyarrow>=21.0.0
|
|
11
|
+
Requires-Dist: pydantic-settings>=2.11.0
|
|
12
|
+
|
|
13
|
+
# tkati-core
|
|
14
|
+
|
|
15
|
+
For now we assume that each node gets one input and produces one output in the
|
|
16
|
+
form of a Kafka stream.
|
|
17
|
+
|
|
18
|
+
## Settings
|
|
19
|
+
|
|
20
|
+
General form of settings is:
|
|
21
|
+
|
|
22
|
+
```toml
|
|
23
|
+
[input.topic]
|
|
24
|
+
# definition of input stream:
|
|
25
|
+
# - broker
|
|
26
|
+
# - topic name
|
|
27
|
+
# - message schema
|
|
28
|
+
# - message format = "json" / "arrow-batch"
|
|
29
|
+
|
|
30
|
+
[input.consumer]
|
|
31
|
+
# parameters local to this consumer
|
|
32
|
+
# - group_id
|
|
33
|
+
# - batch_size
|
|
34
|
+
# - batch_timeout_sec
|
|
35
|
+
# - auto_offset_reset
|
|
36
|
+
|
|
37
|
+
[output.topic]
|
|
38
|
+
# definition of output stream
|
|
39
|
+
# - broker
|
|
40
|
+
# - topic name
|
|
41
|
+
# - message schema
|
|
42
|
+
# - message format = "json" / "arrow-batch"
|
|
43
|
+
# - key_column (optional) = column to use as the Kafka message key
|
|
44
|
+
|
|
45
|
+
[...]
|
|
46
|
+
# settings specific to node function
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
## Usage
|
|
50
|
+
|
|
51
|
+
### Constructing a consumer from settings
|
|
52
|
+
|
|
53
|
+
Use `KafkaArrowConsumer.from_input_settings` to construct a consumer directly from
|
|
54
|
+
`KafkaInputSettings` — no need to manually map fields to Confluent Kafka config keys.
|
|
55
|
+
|
|
56
|
+
```python
|
|
57
|
+
from tkati_core.settings import TomlBaseSettings, KafkaInputSettings
|
|
58
|
+
from tkati_core.consumer import KafkaArrowConsumer
|
|
59
|
+
|
|
60
|
+
class AppSettings(TomlBaseSettings):
|
|
61
|
+
input: KafkaInputSettings
|
|
62
|
+
# ...
|
|
63
|
+
|
|
64
|
+
settings = AppSettings()
|
|
65
|
+
consumer = KafkaArrowConsumer.from_input_settings(settings.input)
|
|
66
|
+
|
|
67
|
+
# Read a batch
|
|
68
|
+
table = consumer.read_arrow(
|
|
69
|
+
aggregation_interval_seconds=settings.input.consumer.batch_timeout_sec,
|
|
70
|
+
max_events_to_aggregate=settings.input.consumer.batch_size,
|
|
71
|
+
)
|
|
72
|
+
consumer.commit()
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
The factory method sets `enable.auto.commit=False` — offsets must be committed explicitly
|
|
76
|
+
via `consumer.commit()`.
|
|
77
|
+
|
|
78
|
+
### Constructing a producer from settings
|
|
79
|
+
|
|
80
|
+
Use `KafkaProducer.from_output_settings` to construct a producer directly from
|
|
81
|
+
`KafkaOutputSettings`. It accepts PyArrow tables or record batches and handles
|
|
82
|
+
serialization according to the topic's `format` setting.
|
|
83
|
+
|
|
84
|
+
```python
|
|
85
|
+
from tkati_core.settings import TomlBaseSettings, KafkaOutputSettings
|
|
86
|
+
from tkati_core.producer import KafkaProducer
|
|
87
|
+
|
|
88
|
+
class AppSettings(TomlBaseSettings):
|
|
89
|
+
output: KafkaOutputSettings
|
|
90
|
+
# ...
|
|
91
|
+
|
|
92
|
+
settings = AppSettings()
|
|
93
|
+
producer = KafkaProducer.from_output_settings(settings.output)
|
|
94
|
+
|
|
95
|
+
# Produce a PyArrow table (one message per row for "json" format)
|
|
96
|
+
producer.produce_arrow(table)
|
|
97
|
+
producer.flush()
|
|
98
|
+
producer.close() # flushes and releases resources
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
**Formats** — controlled by `output.topic.format` in `settings.toml`:
|
|
102
|
+
|
|
103
|
+
- `"json"` *(default)*: each row becomes a separate Kafka message serialized with orjson.
|
|
104
|
+
- `"arrow-batch"`: the entire table is serialized as a single Arrow IPC stream message.
|
|
105
|
+
|
|
106
|
+
**Message keys** — controlled by `output.topic.key_column` in `settings.toml`:
|
|
107
|
+
|
|
108
|
+
```toml
|
|
109
|
+
[output.topic]
|
|
110
|
+
broker = "localhost:9092"
|
|
111
|
+
name = "my-output-topic"
|
|
112
|
+
key_column = "customer_id" # column whose value becomes the Kafka message key
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
`key_column` is optional. When omitted (or `None`), messages are produced without a key.
|
|
116
|
+
When set, the value of that column for each row is used as the Kafka message key
|
|
117
|
+
(JSON format only — ignored for `"arrow-batch"`). This determines which Kafka partition
|
|
118
|
+
each message is routed to.
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
tests/test_consumer.py
|
|
4
|
+
tests/test_producer.py
|
|
5
|
+
tkati_core/consumer.py
|
|
6
|
+
tkati_core/producer.py
|
|
7
|
+
tkati_core/py.typed
|
|
8
|
+
tkati_core/settings.py
|
|
9
|
+
tkati_core/testing.py
|
|
10
|
+
tkati_core.egg-info/PKG-INFO
|
|
11
|
+
tkati_core.egg-info/SOURCES.txt
|
|
12
|
+
tkati_core.egg-info/dependency_links.txt
|
|
13
|
+
tkati_core.egg-info/requires.txt
|
|
14
|
+
tkati_core.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
tkati_core
|