PyPI - tkati-core - Versions diffs - 0.1.0__py3-none-any.whl - Mend

tkati-core 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

tkati_core/consumer.py +268 -0
tkati_core/producer.py +105 -0
tkati_core/py.typed +0 -0
tkati_core/settings.py +66 -0
tkati_core/testing.py +11 -0
tkati_core-0.1.0.dist-info/METADATA +118 -0
tkati_core-0.1.0.dist-info/RECORD +9 -0
tkati_core-0.1.0.dist-info/WHEEL +5 -0
tkati_core-0.1.0.dist-info/top_level.txt +1 -0

tkati_core/consumer.py ADDED Viewed

@@ -0,0 +1,268 @@
+"""Kafka consumer utilities for reading messages into PyArrow tables."""
+import time
+from io import BytesIO
+from typing import TYPE_CHECKING
+import orjson
+import pyarrow as pa
+from confluent_kafka import Consumer
+from loguru import logger
+from pyarrow import json as pa_json
+if TYPE_CHECKING:
+    from tkati_core.settings import KafkaInputSettings
+class KafkaConsumer:
+    """
+    A Kafka consumer wrapper that reads messages into PyArrow tables or Python lists.
+    This class manages the Kafka consumer lifecycle, topic subscription,
+    and provides a convenient interface for reading messages as PyArrow tables
+    or plain Python dicts.
+    """
+    @classmethod
+    def from_input_settings(cls, settings: "KafkaInputSettings") -> "KafkaConsumer":
+        """
+        Construct a KafkaConsumer from a KafkaInputSettings instance.
+        Sets enable.auto.commit=False — offsets must be committed explicitly via .commit().
+        """
+        kafka_config: dict[str, str | bool] = {
+            "bootstrap.servers": settings.topic.broker,
+            "group.id": settings.consumer.group_id,
+            "auto.offset.reset": settings.consumer.auto_offset_reset,
+            "enable.auto.commit": False,
+        }
+        return cls(
+            kafka_config=kafka_config,
+            topic_name=settings.topic.name,
+            input_schema=settings.topic.schema,
+        )
+    def __init__(
+        self,
+        kafka_config: dict[str, str | bool],
+        topic_name: str,
+        input_schema: dict[str, str],
+    ) -> None:
+        """
+        Initialize the Kafka consumer with the provided configuration.
+        Args:
+            kafka_config: Dictionary of Kafka consumer configuration parameters.
+                         Common keys include:
+                         - 'bootstrap.servers': Kafka broker addresses
+                         - 'group.id': Consumer group ID
+                         - 'auto.offset.reset': Offset reset behavior
+                         - 'enable.auto.commit': Whether to auto-commit offsets
+        """
+        self.consumer = Consumer(kafka_config)
+        self.topic_name = topic_name
+        self.input_schema = input_schema
+        self.consumer.subscribe([self.topic_name])
+        # Create PyArrow schema based on input_schema
+        # type -> (parse_type, cast_type)
+        type_mapping: dict[str, tuple[pa.DataType, pa.DataType]] = {
+            "string": (pa.string(), pa.string()),
+            "int32": (pa.int32(), pa.int32()),
+            "int64": (pa.int64(), pa.int64()),
+            "uint32": (pa.uint32(), pa.uint32()),
+            "uint64": (pa.uint64(), pa.uint64()),
+            "uint8": (pa.uint8(), pa.uint8()),
+            "int": (pa.int32(), pa.int32()),
+            "timestamp[ms]": (pa.int64(), pa.timestamp("ms")),
+        }
+        parse_schema_fields = []
+        cast_schema_fields = []
+        for field_name, field_type in input_schema.items():
+            types = type_mapping.get(field_type)
+            if types is not None:
+                parse_type, cast_type = types
+            else:
+                logger.warning(
+                    f"Unsupported field type '{field_type}' for field '{field_name}'. Defaulting to string."
+                )
+                parse_type, cast_type = (pa.string(), pa.string())
+            parse_schema_fields.append(pa.field(field_name, parse_type))
+            cast_schema_fields.append(pa.field(field_name, cast_type))
+        self.parse_schema = pa.schema(parse_schema_fields)
+        self.cast_schema = pa.schema(cast_schema_fields)
+        logger.info(
+            f"Initialized KafkaConsumer with config: {kafka_config} and topic: {topic_name}"
+        )
+    def _consume_batch(
+        self,
+        aggregation_interval_seconds: int,
+        max_events_to_aggregate: int,
+    ) -> tuple[list, int]:
+        """
+        Consume raw messages from Kafka within the given time and count limits.
+        Returns a tuple of (messages, events_read) where messages is a list of
+        confluent_kafka Message objects (without errors).
+        """
+        if self.topic_name:
+            logger.info(
+                f"Consuming events from topic(s): {self.topic_name} for up to {aggregation_interval_seconds}s or {max_events_to_aggregate} events"
+            )
+        else:
+            logger.info(
+                f"Consuming events for up to {aggregation_interval_seconds}s or {max_events_to_aggregate} events"
+            )
+        start_time = time.time()
+        events_read = 0
+        poll_timeout = 10
+        valid_messages = []
+        while events_read < max_events_to_aggregate:
+            elapsed = time.time() - start_time
+            remaining_time = aggregation_interval_seconds - elapsed
+            if remaining_time <= 0:
+                logger.info(f"Reached time limit of {aggregation_interval_seconds}s")
+                break
+            remaining_messages = max_events_to_aggregate - events_read
+            batch_timeout = min(poll_timeout, remaining_time)
+            messages = self.consumer.consume(
+                num_messages=min(remaining_messages, 1_000_000),
+                timeout=batch_timeout,
+            )
+            if not messages:
+                continue
+            for msg in messages:
+                if msg.error():
+                    logger.info(f"Consumer error: {msg.error()}")
+                    continue
+                valid_messages.append(msg)
+                events_read += 1
+        elapsed_total = time.time() - start_time
+        logger.info(f"Consumed {events_read} events in {elapsed_total:.2f}s")
+        return valid_messages, events_read
+    # WARNING: This function breaks if any single message is malformed JSON. We may
+    # want to enhance it to handle individual message errors more gracefully.
+    def read_arrow(
+        self,
+        aggregation_interval_seconds: int,
+        max_events_to_aggregate: int,
+    ) -> pa.Table | None:
+        """
+        Read messages from subscribed topics into a PyArrow table.
+        Args:
+            aggregation_interval_seconds: Maximum time in seconds to consume messages.
+            max_events_to_aggregate: Maximum number of events to consume.
+        Returns:
+            A PyArrow Table containing the parsed events, or None if no data was consumed.
+        Notes:
+            - Does NOT commit offsets. The caller is responsible for managing consumer lifecycle.
+            - Does NOT subscribe to topics. The consumer must be pre-subscribed.
+            - Raises exceptions on JSON parsing errors.
+            - Uses permissive parsing that ignores unexpected fields in JSON messages.
+        """
+        valid_messages, events_read = self._consume_batch(
+            aggregation_interval_seconds, max_events_to_aggregate
+        )
+        if events_read == 0:
+            logger.info("No data consumed from topic.")
+            return None
+        buffer = BytesIO()
+        for msg in valid_messages:
+            buffer.write(msg.value())
+            buffer.write(b"\n")
+        buffer.seek(0)
+        parse_options = pa_json.ParseOptions(
+            explicit_schema=self.parse_schema,
+            unexpected_field_behavior="ignore",
+        )
+        try:
+            table = pa_json.read_json(buffer, parse_options=parse_options)
+            table = table.cast(self.cast_schema)
+            actual_rows = len(table)
+            if actual_rows != events_read:
+                logger.warning(
+                    f"Row count mismatch: consumed {events_read} messages, but parsed {actual_rows} rows. {events_read - actual_rows} messages may have been skipped."
+                )
+            else:
+                logger.info(
+                    f"Successfully parsed {actual_rows} rows matching {events_read} consumed messages"
+                )
+        except Exception as e:
+            logger.error(f"Failed to parse JSON with PyArrow: {e}")
+            raise
+        return table
+    def read_pylist(
+        self,
+        aggregation_interval_seconds: int,
+        max_events_to_aggregate: int,
+    ) -> list[dict] | None:
+        """
+        Read messages from subscribed topics into a list of dicts.
+        Same batching semantics as read_arrow (time + count limits).
+        Messages that fail JSON parsing are skipped (logged as errors).
+        Returns:
+            A list of parsed event dicts, or None if no data was consumed.
+        Notes:
+            - Does NOT commit offsets. The caller is responsible for managing consumer lifecycle.
+        """
+        valid_messages, events_read = self._consume_batch(
+            aggregation_interval_seconds, max_events_to_aggregate
+        )
+        if events_read == 0:
+            logger.info("No data consumed from topic.")
+            return None
+        rows = []
+        for msg in valid_messages:
+            try:
+                rows.append(orjson.loads(msg.value()))
+            except Exception as e:
+                logger.error(f"Error parsing message from topic {msg.topic()}: {e}")
+        logger.info(f"Successfully parsed {len(rows)} rows")
+        return rows if rows else None
+    def commit(self) -> None:
+        """
+        Commit the current offsets for all subscribed topics.
+        """
+        self.consumer.commit()
+        logger.debug("Committed offsets")
+    def close(self) -> None:
+        """
+        Close the Kafka consumer and release resources.
+        """
+        self.consumer.close()
+        logger.info("Closed KafkaConsumer")

tkati_core/producer.py ADDED Viewed

@@ -0,0 +1,105 @@
+"""Kafka producer utilities for writing PyArrow tables as messages."""
+from typing import TYPE_CHECKING, Literal
+import orjson
+import pyarrow as pa
+from confluent_kafka import Producer
+from loguru import logger
+if TYPE_CHECKING:
+    from tkati_core.settings import KafkaOutputSettings
+class KafkaProducer:
+    """
+    A Kafka producer wrapper that writes data as messages.
+    Supports producing from PyArrow tables/batches or plain Python dicts.
+    For Arrow-based production, two serialization formats are controlled by the
+    topic's ``format`` setting:
+    - ``"json"``: produces one Kafka message per row, serialized with orjson.
+    - ``"arrow-batch"``: produces the entire table as a single Arrow IPC message.
+    The optional ``key_column`` setting (from ``KafkaTopicSettings``) names the
+    column whose value is used as the Kafka message key for each row (JSON format only).
+    """
+    def __init__(
+        self,
+        kafka_config: dict[str, str],
+        topic_name: str,
+        format: Literal["json", "arrow-batch"] = "json",
+        key_column: str | None = None,
+    ) -> None:
+        self.producer = Producer(kafka_config)
+        self.topic_name = topic_name
+        self.format = format
+        self.key_column = key_column
+        logger.info(
+            f"Initialized KafkaProducer with topic: {topic_name}, format: {format}"
+        )
+    @classmethod
+    def from_output_settings(cls, settings: "KafkaOutputSettings") -> "KafkaProducer":
+        """
+        Construct a KafkaProducer from a KafkaOutputSettings instance.
+        """
+        return cls(
+            kafka_config={"bootstrap.servers": settings.topic.broker},
+            topic_name=settings.topic.name,
+            format=settings.topic.format,
+            key_column=settings.topic.key_column,
+        )
+    def produce_arrow(self, data: pa.Table | pa.RecordBatch) -> None:
+        """
+        Produce data to the configured topic.
+        For ``"json"`` format each row becomes a separate Kafka message serialized
+        with orjson. If ``key_column`` is set, its value is used as the message key.
+        For ``"arrow-batch"`` format the entire table is serialized as a single
+        Arrow IPC stream message.
+        """
+        if self.format == "json":
+            self.produce_pylist(data.to_pylist())
+        elif self.format == "arrow-batch":
+            table = (
+                data if isinstance(data, pa.Table) else pa.Table.from_batches([data])
+            )
+            buf = pa.BufferOutputStream()
+            with pa.ipc.new_stream(buf, table.schema) as writer:
+                for batch in table.to_batches():
+                    writer.write_batch(batch)
+            self.producer.produce(self.topic_name, value=buf.getvalue().to_pybytes())
+    def produce_pylist(self, rows: list[dict]) -> None:
+        """
+        Produce a list of dicts to the configured topic as JSON messages.
+        Each dict becomes a separate Kafka message serialized with orjson.
+        If ``key_column`` is set, its value is used as the Kafka message key.
+        """
+        for row in rows:
+            key = (
+                str(row[self.key_column])
+                if self.key_column and self.key_column in row
+                else None
+            )
+            self.producer.produce(self.topic_name, value=orjson.dumps(row), key=key)
+    def flush(self) -> None:
+        """
+        Block until all queued messages have been delivered.
+        """
+        self.producer.flush()
+        logger.debug("Flushed KafkaProducer")
+    def close(self) -> None:
+        """
+        Flush pending messages and release resources.
+        """
+        self.producer.flush()
+        logger.info("Closed KafkaProducer")

tkati_core/py.typed ADDED Viewed

File without changes

tkati_core/settings.py ADDED Viewed

@@ -0,0 +1,66 @@
+import os
+from typing import Literal
+from loguru import logger
+from pydantic import BaseModel, Field
+from pydantic_settings import (
+    BaseSettings,
+    PydanticBaseSettingsSource,
+    SettingsConfigDict,
+    TomlConfigSettingsSource,
+)
+# Path of the TOML settings file; taken from the SETTINGS_FILE environment
+# variable and falling back to "settings.toml" when it is not set.
+SETTINGS_FILE = os.getenv("SETTINGS_FILE", "settings.toml")
+class KafkaTopicSettings(BaseModel):
+    # Describes a single Kafka topic: broker, topic name, message schema and format.
+    # Broker address; KafkaConsumer and KafkaProducer pass it as "bootstrap.servers".
+    broker: str
+    # Topic name.
+    name: str
+    # Mapping of field name to type name; KafkaConsumer builds its PyArrow schemas from it.
+    # NOTE(review): the "type: ignore" presumably silences the clash with
+    # BaseModel's own `schema` attribute — confirm.
+    schema: dict[str, str] = Field(default_factory=dict)  # type: ignore
+    # Message serialization: one JSON message per row, or one Arrow IPC message per table.
+    format: Literal["json", "arrow-batch"] = "json"
+    # Optional row field whose value KafkaProducer uses as the message key.
+    key_column: str | None = None
+class KafkaConsumerSettings(BaseModel):
+    # Parameters local to one consumer of a topic.
+    # Consumer group; KafkaConsumer.from_input_settings passes it as "group.id".
+    group_id: str
+    # NOTE(review): not read by KafkaConsumer itself; the README passes it as
+    # max_events_to_aggregate — confirm against callers.
+    batch_size: int = 1000
+    # NOTE(review): not read by KafkaConsumer itself; the README passes it as
+    # aggregation_interval_seconds — confirm against callers.
+    batch_timeout_sec: int = 5
+    # Passed as "auto.offset.reset" by KafkaConsumer.from_input_settings.
+    auto_offset_reset: str = "latest"
+class KafkaInputSettings(BaseModel):
+    # Input side of a node: the topic to read plus the consumer parameters.
+    # Accepted by KafkaConsumer.from_input_settings.
+    topic: KafkaTopicSettings
+    consumer: KafkaConsumerSettings
+class KafkaOutputSettings(BaseModel):
+    # Output side of a node: the topic to write to.
+    # Accepted by KafkaProducer.from_output_settings.
+    topic: KafkaTopicSettings
+logger.info(f"Using settings file: {os.path.abspath(SETTINGS_FILE)}")
+class TomlBaseSettings(BaseSettings):
+    model_config = SettingsConfigDict(
+        toml_file=SETTINGS_FILE,
+        env_file=".env",
+        extra="ignore",
+        env_nested_delimiter="__",
+    )
+    @classmethod
+    def settings_customise_sources(
+        cls,
+        settings_cls: type[BaseSettings],
+        init_settings: PydanticBaseSettingsSource,
+        env_settings: PydanticBaseSettingsSource,
+        dotenv_settings: PydanticBaseSettingsSource,
+        file_secret_settings: PydanticBaseSettingsSource,
+    ) -> tuple[PydanticBaseSettingsSource, ...]:
+        return (
+            init_settings,
+            env_settings,
+            dotenv_settings,
+            file_secret_settings,
+            TomlConfigSettingsSource(settings_cls),
+        )

tkati_core/testing.py ADDED Viewed

@@ -0,0 +1,11 @@
+from collections.abc import Generator
+import pytest
+from confluent_kafka.admin import AdminClient
+@pytest.fixture(scope="function")
+def kafka_admin_client() -> Generator[AdminClient, None, None]:
+    """Provides a Kafka AdminClient for integration tests. Requires Redpanda on localhost:9092."""
+    admin = AdminClient({"bootstrap.servers": "localhost:9092"})
+    yield admin

tkati_core-0.1.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,118 @@
+Metadata-Version: 2.4
+Name: tkati-core
+Version: 0.1.0
+Summary: Kafka consumer/producer utilities and TOML-based settings for tkati nodes
+Requires-Python: >=3.13
+Description-Content-Type: text/markdown
+Requires-Dist: confluent-kafka>=2.11.0
+Requires-Dist: loguru>=0.7.3
+Requires-Dist: orjson>=3.9.0
+Requires-Dist: pyarrow>=21.0.0
+Requires-Dist: pydantic-settings>=2.11.0
+# tkati-core
+For now we assume that each node reads one input and produces one output, each in
+the form of a Kafka stream.
+## Settings
+General form of settings is:
+```toml
+[input.topic]
+# definition of input stream:
+# - broker
+# - topic name
+# - message schema
+# - message format = "json" / "arrow-batch"
+[input.consumer]
+# parameters local to this consumer
+# - group_id
+# - batch_size
+# - batch_timeout_sec
+# - auto_offset_reset
+[output.topic]
+# definition of output stream
+# - broker
+# - topic name
+# - message schema
+# - message format = "json" / "arrow-batch"
+# - key_column (optional) = column to use as the Kafka message key
+[...]
+# settings specific to node function
+```
+## Usage
+### Constructing a consumer from settings
+Use `KafkaConsumer.from_input_settings` to construct a consumer directly from
+`KafkaInputSettings` — no need to manually map fields to Confluent Kafka config keys.
+```python
+from tkati_core.settings import TomlBaseSettings, KafkaInputSettings
+from tkati_core.consumer import KafkaConsumer
+class AppSettings(TomlBaseSettings):
+    input: KafkaInputSettings
+    # ...
+settings = AppSettings()
+consumer = KafkaConsumer.from_input_settings(settings.input)
+# Read a batch
+table = consumer.read_arrow(
+    aggregation_interval_seconds=settings.input.consumer.batch_timeout_sec,
+    max_events_to_aggregate=settings.input.consumer.batch_size,
+)
+consumer.commit()
+```
+The factory method sets `enable.auto.commit=False` — offsets must be committed explicitly
+via `consumer.commit()`.
+### Constructing a producer from settings
+Use `KafkaProducer.from_output_settings` to construct a producer directly from
+`KafkaOutputSettings`. It accepts PyArrow tables or record batches and handles
+serialization according to the topic's `format` setting.
+```python
+from tkati_core.settings import TomlBaseSettings, KafkaOutputSettings
+from tkati_core.producer import KafkaProducer
+class AppSettings(TomlBaseSettings):
+    output: KafkaOutputSettings
+    # ...
+settings = AppSettings()
+producer = KafkaProducer.from_output_settings(settings.output)
+# Produce a PyArrow table (one message per row for "json" format)
+producer.produce_arrow(table)
+producer.flush()
+producer.close()  # flushes and releases resources
+```
+**Formats** — controlled by `output.topic.format` in `settings.toml`:
+- `"json"` *(default)*: each row becomes a separate Kafka message serialized with orjson.
+- `"arrow-batch"`: the entire table is serialized as a single Arrow IPC stream message.
+**Message keys** — controlled by `output.topic.key_column` in `settings.toml`:
+```toml
+[output.topic]
+broker = "localhost:9092"
+name = "my-output-topic"
+key_column = "customer_id"   # column whose value becomes the Kafka message key
+```
+`key_column` is optional. When omitted (or `None`), messages are produced without a key.
+When set, the value of that column for each row is used as the Kafka message key
+(JSON format only — ignored for `"arrow-batch"`). This determines which Kafka partition
+each message is routed to.

tkati_core-0.1.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,9 @@
+tkati_core/consumer.py,sha256=y9rVVDWpncuwJWhxvj6psx0b01GAVuONu_9JtvCBwFI,9519
+tkati_core/producer.py,sha256=wGJW0E9UWzDDlgkii5-jpHnCwLdQQPuHaMI2Ctsk43k,3731
+tkati_core/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+tkati_core/settings.py,sha256=RUhSB6Kfq43FjAJd0zRuq57_qKrgiPrYnu4ZLB3_s_s,1699
+tkati_core/testing.py,sha256=cVE5uYkENk9KgQ0g3i5VCQ0YBdL3bPlRQbB1uLKDI5s,379
+tkati_core-0.1.0.dist-info/METADATA,sha256=sGKFKV8ht9e1CtM7qWLsVXw45xDVKSqFe5GbP7VvZZ8,3369
+tkati_core-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
+tkati_core-0.1.0.dist-info/top_level.txt,sha256=HjR7-66EdB2ZTWc3U1qLVWwc435OAwrN58iJGhJE9UY,11
+tkati_core-0.1.0.dist-info/RECORD,,

tkati_core-0.1.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,5 @@
+Wheel-Version: 1.0
+Generator: setuptools (82.0.1)
+Root-Is-Purelib: true
+Tag: py3-none-any

tkati_core-0.1.0.dist-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ tkati_core