bizon 0.0.9__tar.gz → 0.0.11__tar.gz

This diff shows the changes between two package versions as they were published to their public registry. It is provided for informational purposes only.
Files changed (93)
  1. {bizon-0.0.9 → bizon-0.0.11}/PKG-INFO +2 -1
  2. {bizon-0.0.9 → bizon-0.0.11}/bizon/common/models.py +2 -0
  3. bizon-0.0.11/bizon/destinations/bigquery_streaming/src/config.py +55 -0
  4. bizon-0.0.11/bizon/destinations/bigquery_streaming/src/destination.py +148 -0
  5. bizon-0.0.11/bizon/destinations/bigquery_streaming/src/proto_utils.py +91 -0
  6. {bizon-0.0.9 → bizon-0.0.11}/bizon/destinations/config.py +1 -0
  7. {bizon-0.0.9 → bizon-0.0.11}/bizon/destinations/destination.py +18 -3
  8. {bizon-0.0.9 → bizon-0.0.11}/bizon/destinations/models.py +27 -0
  9. {bizon-0.0.9 → bizon-0.0.11}/bizon/engine/pipeline/producer.py +11 -0
  10. {bizon-0.0.9 → bizon-0.0.11}/bizon/engine/queue/adapters/kafka/queue.py +4 -0
  11. {bizon-0.0.9 → bizon-0.0.11}/bizon/engine/queue/adapters/python_queue/queue.py +6 -0
  12. {bizon-0.0.9 → bizon-0.0.11}/bizon/engine/queue/adapters/rabbitmq/queue.py +5 -0
  13. {bizon-0.0.9 → bizon-0.0.11}/bizon/engine/queue/queue.py +6 -1
  14. {bizon-0.0.9 → bizon-0.0.11}/bizon/engine/runner/adapters/thread.py +2 -0
  15. {bizon-0.0.9 → bizon-0.0.11}/bizon/source/discover.py +1 -1
  16. {bizon-0.0.9 → bizon-0.0.11}/bizon/sources/kafka/src/source.py +31 -32
  17. {bizon-0.0.9 → bizon-0.0.11}/bizon/sources/kafka/tests/kafka_pipeline.py +1 -1
  18. {bizon-0.0.9 → bizon-0.0.11}/pyproject.toml +2 -1
  19. {bizon-0.0.9 → bizon-0.0.11}/LICENSE +0 -0
  20. {bizon-0.0.9 → bizon-0.0.11}/README.md +0 -0
  21. {bizon-0.0.9 → bizon-0.0.11}/bizon/__main__.py +0 -0
  22. {bizon-0.0.9 → bizon-0.0.11}/bizon/cli/__init__.py +0 -0
  23. {bizon-0.0.9 → bizon-0.0.11}/bizon/cli/main.py +0 -0
  24. {bizon-0.0.9 → bizon-0.0.11}/bizon/cli/utils.py +0 -0
  25. {bizon-0.0.9 → bizon-0.0.11}/bizon/common/errors/backoff.py +0 -0
  26. {bizon-0.0.9 → bizon-0.0.11}/bizon/common/errors/errors.py +0 -0
  27. {bizon-0.0.9 → bizon-0.0.11}/bizon/destinations/bigquery/config/bigquery.example.yml +0 -0
  28. {bizon-0.0.9 → bizon-0.0.11}/bizon/destinations/bigquery/src/config.py +0 -0
  29. {bizon-0.0.9 → bizon-0.0.11}/bizon/destinations/bigquery/src/destination.py +0 -0
  30. {bizon-0.0.9 → bizon-0.0.11}/bizon/destinations/buffer.py +0 -0
  31. {bizon-0.0.9 → bizon-0.0.11}/bizon/destinations/file/src/config.py +0 -0
  32. {bizon-0.0.9 → bizon-0.0.11}/bizon/destinations/file/src/destination.py +0 -0
  33. {bizon-0.0.9 → bizon-0.0.11}/bizon/destinations/logger/src/config.py +0 -0
  34. {bizon-0.0.9 → bizon-0.0.11}/bizon/destinations/logger/src/destination.py +0 -0
  35. {bizon-0.0.9 → bizon-0.0.11}/bizon/engine/backend/adapters/sqlalchemy/backend.py +0 -0
  36. {bizon-0.0.9 → bizon-0.0.11}/bizon/engine/backend/adapters/sqlalchemy/config.py +0 -0
  37. {bizon-0.0.9 → bizon-0.0.11}/bizon/engine/backend/backend.py +0 -0
  38. {bizon-0.0.9 → bizon-0.0.11}/bizon/engine/backend/config.py +0 -0
  39. {bizon-0.0.9 → bizon-0.0.11}/bizon/engine/backend/models.py +0 -0
  40. {bizon-0.0.9 → bizon-0.0.11}/bizon/engine/config.py +0 -0
  41. {bizon-0.0.9 → bizon-0.0.11}/bizon/engine/engine.py +0 -0
  42. {bizon-0.0.9 → bizon-0.0.11}/bizon/engine/pipeline/consumer.py +0 -0
  43. {bizon-0.0.9 → bizon-0.0.11}/bizon/engine/pipeline/models.py +0 -0
  44. {bizon-0.0.9 → bizon-0.0.11}/bizon/engine/queue/adapters/kafka/config.py +0 -0
  45. {bizon-0.0.9 → bizon-0.0.11}/bizon/engine/queue/adapters/kafka/consumer.py +0 -0
  46. {bizon-0.0.9 → bizon-0.0.11}/bizon/engine/queue/adapters/python_queue/config.py +0 -0
  47. {bizon-0.0.9 → bizon-0.0.11}/bizon/engine/queue/adapters/python_queue/consumer.py +0 -0
  48. {bizon-0.0.9 → bizon-0.0.11}/bizon/engine/queue/adapters/rabbitmq/config.py +0 -0
  49. {bizon-0.0.9 → bizon-0.0.11}/bizon/engine/queue/adapters/rabbitmq/consumer.py +0 -0
  50. {bizon-0.0.9 → bizon-0.0.11}/bizon/engine/queue/config.py +0 -0
  51. {bizon-0.0.9 → bizon-0.0.11}/bizon/engine/runner/adapters/process.py +0 -0
  52. {bizon-0.0.9 → bizon-0.0.11}/bizon/engine/runner/config.py +0 -0
  53. {bizon-0.0.9 → bizon-0.0.11}/bizon/engine/runner/runner.py +0 -0
  54. {bizon-0.0.9 → bizon-0.0.11}/bizon/source/auth/authenticators/abstract_oauth.py +0 -0
  55. {bizon-0.0.9 → bizon-0.0.11}/bizon/source/auth/authenticators/abstract_token.py +0 -0
  56. {bizon-0.0.9 → bizon-0.0.11}/bizon/source/auth/authenticators/basic.py +0 -0
  57. {bizon-0.0.9 → bizon-0.0.11}/bizon/source/auth/authenticators/cookies.py +0 -0
  58. {bizon-0.0.9 → bizon-0.0.11}/bizon/source/auth/authenticators/oauth.py +0 -0
  59. {bizon-0.0.9 → bizon-0.0.11}/bizon/source/auth/authenticators/token.py +0 -0
  60. {bizon-0.0.9 → bizon-0.0.11}/bizon/source/auth/builder.py +0 -0
  61. {bizon-0.0.9 → bizon-0.0.11}/bizon/source/auth/config.py +0 -0
  62. {bizon-0.0.9 → bizon-0.0.11}/bizon/source/config.py +0 -0
  63. {bizon-0.0.9 → bizon-0.0.11}/bizon/source/cursor.py +0 -0
  64. {bizon-0.0.9 → bizon-0.0.11}/bizon/source/models.py +0 -0
  65. {bizon-0.0.9 → bizon-0.0.11}/bizon/source/session.py +0 -0
  66. {bizon-0.0.9 → bizon-0.0.11}/bizon/source/source.py +0 -0
  67. {bizon-0.0.9 → bizon-0.0.11}/bizon/sources/dummy/config/api_key.example.yml +0 -0
  68. {bizon-0.0.9 → bizon-0.0.11}/bizon/sources/dummy/config/api_key_kafka.example.yml +0 -0
  69. {bizon-0.0.9 → bizon-0.0.11}/bizon/sources/dummy/src/fake_api.py +0 -0
  70. {bizon-0.0.9 → bizon-0.0.11}/bizon/sources/dummy/src/source.py +0 -0
  71. {bizon-0.0.9 → bizon-0.0.11}/bizon/sources/dummy/tests/dummy_pipeline.py +0 -0
  72. {bizon-0.0.9 → bizon-0.0.11}/bizon/sources/dummy/tests/dummy_pipeline_bigquery_backend.py +0 -0
  73. {bizon-0.0.9 → bizon-0.0.11}/bizon/sources/dummy/tests/dummy_pipeline_kafka.py +0 -0
  74. {bizon-0.0.9 → bizon-0.0.11}/bizon/sources/dummy/tests/dummy_pipeline_rabbitmq.py +0 -0
  75. {bizon-0.0.9 → bizon-0.0.11}/bizon/sources/dummy/tests/dummy_pipeline_write_data_bigquery.py +0 -0
  76. {bizon-0.0.9 → bizon-0.0.11}/bizon/sources/dummy/tests/dummy_pipeline_write_data_bigquery_through_kafka.py +0 -0
  77. {bizon-0.0.9 → bizon-0.0.11}/bizon/sources/gsheets/config/default_auth.example.yml +0 -0
  78. {bizon-0.0.9 → bizon-0.0.11}/bizon/sources/gsheets/config/service_account.example.yml +0 -0
  79. {bizon-0.0.9 → bizon-0.0.11}/bizon/sources/gsheets/src/source.py +0 -0
  80. {bizon-0.0.9 → bizon-0.0.11}/bizon/sources/gsheets/tests/gsheets_pipeline.py +0 -0
  81. {bizon-0.0.9 → bizon-0.0.11}/bizon/sources/hubspot/config/api_key.example.yml +0 -0
  82. {bizon-0.0.9 → bizon-0.0.11}/bizon/sources/hubspot/config/oauth.example.yml +0 -0
  83. {bizon-0.0.9 → bizon-0.0.11}/bizon/sources/hubspot/src/hubspot_base.py +0 -0
  84. {bizon-0.0.9 → bizon-0.0.11}/bizon/sources/hubspot/src/hubspot_objects.py +0 -0
  85. {bizon-0.0.9 → bizon-0.0.11}/bizon/sources/hubspot/src/models/hs_object.py +0 -0
  86. {bizon-0.0.9 → bizon-0.0.11}/bizon/sources/hubspot/tests/hubspot_pipeline.py +0 -0
  87. {bizon-0.0.9 → bizon-0.0.11}/bizon/sources/kafka/config/kafka.example.yml +0 -0
  88. {bizon-0.0.9 → bizon-0.0.11}/bizon/sources/periscope/config/periscope_charts.example.yml +0 -0
  89. {bizon-0.0.9 → bizon-0.0.11}/bizon/sources/periscope/config/periscope_dashboards.example.yml +0 -0
  90. {bizon-0.0.9 → bizon-0.0.11}/bizon/sources/periscope/src/source.py +0 -0
  91. {bizon-0.0.9 → bizon-0.0.11}/bizon/sources/periscope/tests/periscope_pipeline_charts.py +0 -0
  92. {bizon-0.0.9 → bizon-0.0.11}/bizon/sources/periscope/tests/periscope_pipeline_dashboard.py +0 -0
  93. {bizon-0.0.9 → bizon-0.0.11}/bizon/utils.py +0 -0
{bizon-0.0.9 → bizon-0.0.11}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: bizon
- Version: 0.0.9
+ Version: 0.0.11
  Summary: Extract and load your data reliably from API Clients with native fault-tolerant and checkpointing mechanism.
  Author: Antoine Balliet
  Author-email: antoine.balliet@gmail.com
@@ -31,6 +31,7 @@ Requires-Dist: loguru (>=0.7.2,<0.8.0)
  Requires-Dist: pandas (>=2.2.2,<3.0.0) ; extra == "bigquery"
  Requires-Dist: pendulum (>=3.0.0,<4.0.0)
  Requires-Dist: pika (>=1.3.2,<2.0.0) ; extra == "rabbitmq"
+ Requires-Dist: protobuf (==4.24.0)
  Requires-Dist: psycopg2-binary (>=2.9.9,<3.0.0) ; extra == "postgres"
  Requires-Dist: pyarrow (>=16.1.0,<17.0.0)
  Requires-Dist: pydantic (>=2.8.2,<3.0.0)

{bizon-0.0.9 → bizon-0.0.11}/bizon/common/models.py
@@ -3,6 +3,7 @@ from typing import Union
  from pydantic import BaseModel, ConfigDict, Field
 
  from bizon.destinations.bigquery.src.config import BigQueryConfig
+ from bizon.destinations.bigquery_streaming.src.config import BigQueryStreamingConfig
  from bizon.destinations.file.src.config import FileDestinationConfig
  from bizon.destinations.logger.src.config import LoggerConfig
  from bizon.engine.config import EngineConfig
@@ -24,6 +25,7 @@ class BizonConfig(BaseModel):
 
      destination: Union[
          BigQueryConfig,
+         BigQueryStreamingConfig,
          LoggerConfig,
          FileDestinationConfig,
      ] = Field(

bizon-0.0.11/bizon/destinations/bigquery_streaming/src/config.py
@@ -0,0 +1,55 @@
+ from enum import Enum
+ from typing import Literal, Optional
+ 
+ from pydantic import BaseModel, Field, field_validator
+ 
+ from bizon.destinations.config import (
+     AbstractDestinationConfig,
+     AbstractDestinationDetailsConfig,
+     DestinationTypes,
+ )
+ 
+ 
+ class GCSBufferFormat(str, Enum):
+     PARQUET = "parquet"
+     CSV = "csv"
+ 
+ 
+ class TimePartitioning(str, Enum):
+     DAY = "DAY"
+     HOUR = "HOUR"
+     MONTH = "MONTH"
+     YEAR = "YEAR"
+ 
+ 
+ class BigQueryAuthentication(BaseModel):
+     service_account_key: str = Field(
+         description="Service Account Key JSON string. If empty it will be infered",
+         default="",
+     )
+ 
+ 
+ class BigQueryConfigDetails(AbstractDestinationDetailsConfig):
+     project_id: str
+     dataset_id: str
+     dataset_location: Optional[str] = "US"
+     table_id: Optional[str] = Field(
+         default=None, description="Table ID, if not provided it will be inferred from source name"
+     )
+     time_partitioning: Optional[TimePartitioning] = Field(
+         default=TimePartitioning.DAY, description="BigQuery Time partitioning type"
+     )
+     authentication: Optional[BigQueryAuthentication] = None
+ 
+     buffer_size: int = Field(default=0, description="Buffer size in MB")
+ 
+     @field_validator("buffer_size", mode="after")
+     def validate_buffer_size(cls, value: int) -> int:
+         if value != 0:
+             raise ValueError("Buffer size must be 0, we directly stream to BigQuery")
+         return value
+ 
+ 
+ class BigQueryStreamingConfig(AbstractDestinationConfig):
+     name: Literal[DestinationTypes.BIGQUERY_STREAMING]
+     config: BigQueryConfigDetails

bizon-0.0.11/bizon/destinations/bigquery_streaming/src/destination.py
@@ -0,0 +1,148 @@
+ import json
+ import os
+ import tempfile
+ from typing import List, Tuple
+ 
+ from google.api_core.exceptions import NotFound
+ from google.cloud import bigquery, bigquery_storage_v1, storage
+ from google.cloud.bigquery import DatasetReference, TimePartitioning
+ from google.cloud.bigquery_storage_v1.types import AppendRowsRequest, ProtoRows
+ from loguru import logger
+ 
+ from bizon.common.models import SyncMetadata
+ from bizon.destinations.config import NormalizationType
+ from bizon.destinations.destination import AbstractDestination
+ from bizon.destinations.models import DestinationRecord
+ from bizon.engine.backend.backend import AbstractBackend
+ 
+ from .config import BigQueryConfigDetails
+ from .proto_utils import get_proto_schema_and_class
+ 
+ 
+ class BigQueryStreamingDestination(AbstractDestination):
+ 
+     def __init__(self, sync_metadata: SyncMetadata, config: BigQueryConfigDetails, backend: AbstractBackend):
+         super().__init__(sync_metadata, config, backend)
+         self.config: BigQueryConfigDetails = config
+ 
+         if config.authentication and config.authentication.service_account_key:
+             with tempfile.NamedTemporaryFile(delete=False) as temp:
+                 temp.write(config.authentication.service_account_key.encode())
+                 temp_file_path = temp.name
+             os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = temp_file_path
+ 
+         self.project_id = config.project_id
+         self.bq_client = bigquery.Client(project=self.project_id)
+         self.bq_storage_client = bigquery_storage_v1.BigQueryWriteClient()
+         self.gcs_client = storage.Client(project=self.project_id)
+         self.dataset_id = config.dataset_id
+         self.dataset_location = config.dataset_location
+ 
+     @property
+     def table_id(self) -> str:
+         tabled_id = self.config.table_id or f"{self.sync_metadata.source_name}_{self.sync_metadata.stream_name}"
+         return f"{self.project_id}.{self.dataset_id}.{tabled_id}"
+ 
+     def get_bigquery_schema(self, destination_records: List[DestinationRecord]) -> List[bigquery.SchemaField]:
+ 
+         # we keep raw data in the column source_data
+         if self.config.normalization.type == NormalizationType.NONE:
+             return [
+                 bigquery.SchemaField("_source_record_id", "STRING", mode="REQUIRED"),
+                 bigquery.SchemaField("_source_timestamp", "TIMESTAMP", mode="REQUIRED"),
+                 bigquery.SchemaField("_source_data", "STRING", mode="NULLABLE"),
+                 bigquery.SchemaField("_bizon_extracted_at", "TIMESTAMP", mode="REQUIRED"),
+                 bigquery.SchemaField(
+                     "_bizon_loaded_at", "TIMESTAMP", mode="REQUIRED", default_value_expression="CURRENT_TIMESTAMP()"
+                 ),
+                 bigquery.SchemaField("_bizon_id", "STRING", mode="REQUIRED"),
+             ]
+ 
+         elif self.config.normalization.type == NormalizationType.DEBEZIUM:
+             assert (
+                 "_bizon_message_key" in destination_records[0].source_data
+             ), "Debezium records must have a '_bizon_message_key' key"
+             message_keys = json.loads(destination_records[0].source_data["_bizon_message_key"])
+             return [bigquery.SchemaField(key, "STRING", mode="NULLABLE") for key in message_keys] + [
+                 bigquery.SchemaField("_source_data", "STRING", mode="NULLABLE"),
+                 bigquery.SchemaField("_source_record_id", "STRING", mode="REQUIRED"),
+                 bigquery.SchemaField("_source_timestamp", "TIMESTAMP", mode="REQUIRED"),
+                 bigquery.SchemaField("_bizon_extracted_at", "TIMESTAMP", mode="REQUIRED"),
+                 bigquery.SchemaField(
+                     "_bizon_loaded_at", "TIMESTAMP", mode="REQUIRED", default_value_expression="CURRENT_TIMESTAMP()"
+                 ),
+                 bigquery.SchemaField("_bizon_id", "STRING", mode="REQUIRED"),
+             ]
+ 
+         # If normalization is tabular, we parse key / value pairs to columns
+         elif self.config.normalization.type == NormalizationType.TABULAR:
+             first_record_keys = destination_records[0].source_data.keys()
+             return [bigquery.SchemaField(key, "STRING", mode="NULLABLE") for key in first_record_keys] + [
+                 bigquery.SchemaField("_source_record_id", "STRING", mode="REQUIRED"),
+                 bigquery.SchemaField("_source_timestamp", "TIMESTAMP", mode="REQUIRED"),
+                 bigquery.SchemaField("_bizon_extracted_at", "TIMESTAMP", mode="REQUIRED"),
+                 bigquery.SchemaField(
+                     "_bizon_loaded_at", "TIMESTAMP", mode="REQUIRED", default_value_expression="CURRENT_TIMESTAMP()"
+                 ),
+                 bigquery.SchemaField("_bizon_id", "STRING", mode="REQUIRED"),
+             ]
+ 
+         raise NotImplementedError(f"Normalization type {self.config.normalization.type} is not supported")
+ 
+     def check_connection(self) -> bool:
+         dataset_ref = DatasetReference(self.project_id, self.dataset_id)
+ 
+         try:
+             self.bq_client.get_dataset(dataset_ref)
+         except NotFound:
+             dataset = bigquery.Dataset(dataset_ref)
+             dataset.location = self.dataset_location
+             dataset = self.bq_client.create_dataset(dataset)
+         return True
+ 
+     def load_to_bigquery_via_streaming(self, destination_records: List[DestinationRecord]) -> str:
+         clustering_keys = []
+ 
+         if self.config.normalization.type == NormalizationType.DEBEZIUM:
+             clustering_keys = list(json.loads(destination_records[0].source_data["_bizon_message_key"]).keys())
+ 
+         # Create table if it doesnt exist
+         schema = self.get_bigquery_schema(destination_records=destination_records)
+         table = bigquery.Table(self.table_id, schema=schema)
+         time_partitioning = TimePartitioning(field="_bizon_loaded_at", type_=self.config.time_partitioning)
+         table.time_partitioning = time_partitioning
+ 
+         if clustering_keys:
+             table.clustering_fields = clustering_keys
+ 
+         table = self.bq_client.create_table(table, exists_ok=True)
+ 
+         # Create the stream
+         write_client = self.bq_storage_client
+         tabled_id = self.config.table_id or f"{self.sync_metadata.source_name}_{self.sync_metadata.stream_name}"
+         parent = write_client.table_path(self.project_id, self.dataset_id, tabled_id)
+         stream_name = f"{parent}/_default"
+ 
+         # Generating the protocol buffer representation of the message descriptor.
+         proto_schema, TableRow = get_proto_schema_and_class(clustering_keys)
+ 
+         serialized_rows = [
+             record.to_protobuf_serialization(
+                 TableRow, debezium=self.config.normalization.type == NormalizationType.DEBEZIUM
+             )
+             for record in destination_records
+         ]
+ 
+         request = AppendRowsRequest(
+             write_stream=stream_name,
+             proto_rows=AppendRowsRequest.ProtoData(
+                 rows=ProtoRows(serialized_rows=serialized_rows),
+                 writer_schema=proto_schema,
+             ),
+         )
+         response = write_client.append_rows(iter([request]))
+         assert response.code().name == "OK"
+ 
+     def write_records(self, destination_records: List[DestinationRecord]) -> Tuple[bool, str]:
+         self.load_to_bigquery_via_streaming(destination_records=destination_records)
+         return True, ""

bizon-0.0.11/bizon/destinations/bigquery_streaming/src/proto_utils.py
@@ -0,0 +1,91 @@
+ from typing import List, Tuple, Type
+ 
+ from google.cloud.bigquery_storage_v1.types import ProtoSchema
+ from google.protobuf.descriptor_pb2 import (
+     DescriptorProto,
+     FieldDescriptorProto,
+     FileDescriptorProto,
+ )
+ from google.protobuf.descriptor_pool import DescriptorPool
+ from google.protobuf.message import Message
+ from google.protobuf.message_factory import GetMessageClassesForFiles
+ 
+ 
+ def get_proto_schema_and_class(clustering_keys: List[str] = None) -> Tuple[ProtoSchema, Type[Message]]:
+     # Define the FileDescriptorProto
+     file_descriptor_proto = FileDescriptorProto()
+     file_descriptor_proto.name = "dynamic.proto"
+     file_descriptor_proto.package = "dynamic_package"
+ 
+     # Define the TableRow message schema
+     message_descriptor = DescriptorProto()
+     message_descriptor.name = "TableRow"
+ 
+     # Add fields to the message, only use TYPE_STRING, BigQuery does not support other types
+     # It does not imapact data types in final table
+ 
+     # https://stackoverflow.com/questions/70489919/protobuf-type-for-bigquery-timestamp-field
+     fields = [
+         {"name": "_bizon_id", "type": FieldDescriptorProto.TYPE_STRING, "label": FieldDescriptorProto.LABEL_REQUIRED},
+         {
+             "name": "_bizon_extracted_at",
+             "type": FieldDescriptorProto.TYPE_STRING,
+             "label": FieldDescriptorProto.LABEL_REQUIRED,
+         },
+         {
+             "name": "_bizon_loaded_at",
+             "type": FieldDescriptorProto.TYPE_STRING,
+             "label": FieldDescriptorProto.LABEL_REQUIRED,
+         },
+         {
+             "name": "_source_record_id",
+             "type": FieldDescriptorProto.TYPE_STRING,
+             "label": FieldDescriptorProto.LABEL_REQUIRED,
+         },
+         {
+             "name": "_source_timestamp",
+             "type": FieldDescriptorProto.TYPE_STRING,
+             "label": FieldDescriptorProto.LABEL_REQUIRED,
+         },
+         {
+             "name": "_source_data",
+             "type": FieldDescriptorProto.TYPE_STRING,
+             "label": FieldDescriptorProto.LABEL_OPTIONAL,
+         },
+     ]
+ 
+     if clustering_keys:
+         for key in clustering_keys:
+             fields.append(
+                 {
+                     "name": key,
+                     "type": FieldDescriptorProto.TYPE_STRING,
+                     "label": FieldDescriptorProto.LABEL_OPTIONAL,
+                 }
+             )
+ 
+     for i, field in enumerate(fields, start=1):
+         field_descriptor = message_descriptor.field.add()
+         field_descriptor.name = field["name"]
+         field_descriptor.number = i
+         field_descriptor.type = field["type"]
+         field_descriptor.label = field["label"]
+ 
+     # Add the message to the file descriptor
+     file_descriptor_proto.message_type.add().CopyFrom(message_descriptor)
+ 
+     # Create a DescriptorPool and register the FileDescriptorProto
+     pool = DescriptorPool()
+     pool.Add(file_descriptor_proto)
+ 
+     # Use the registered file name to fetch the message classes
+     message_classes = GetMessageClassesForFiles(["dynamic.proto"], pool=pool)
+ 
+     # Fetch the TableRow class
+     table_row_class = message_classes["dynamic_package.TableRow"]
+ 
+     # Create the ProtoSchema
+     proto_schema = ProtoSchema()
+     proto_schema.proto_descriptor.CopyFrom(message_descriptor)
+ 
+     return proto_schema, table_row_class
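
The helper above pairs with the streaming destination: its return values feed the BigQuery Storage Write API default stream. A minimal standalone sketch of that hand-off (table path and field values are placeholders; the request is only built here, not sent):

    from google.cloud.bigquery_storage_v1.types import AppendRowsRequest, ProtoRows

    from bizon.destinations.bigquery_streaming.src.proto_utils import get_proto_schema_and_class

    # Build the dynamic descriptor and message class, with one extra clustering column "id"
    proto_schema, TableRow = get_proto_schema_and_class(clustering_keys=["id"])

    # Populate a row; every field is a string, matching the descriptor above
    row = TableRow()
    row._bizon_id = "00000000-0000-0000-0000-000000000000"
    row._bizon_extracted_at = "1700000000"
    row._bizon_loaded_at = "1700000000"
    row._source_record_id = "record-1"
    row._source_timestamp = "1700000000"
    row._source_data = '{"id": "record-1"}'
    row.id = "record-1"

    # Wrap the serialized row the same way load_to_bigquery_via_streaming does
    request = AppendRowsRequest(
        write_stream="projects/my-project/datasets/my_dataset/tables/my_table/_default",
        proto_rows=AppendRowsRequest.ProtoData(
            rows=ProtoRows(serialized_rows=[row.SerializeToString()]),
            writer_schema=proto_schema,
        ),
    )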

{bizon-0.0.9 → bizon-0.0.11}/bizon/destinations/config.py
@@ -6,6 +6,7 @@ from pydantic import BaseModel, ConfigDict, Field
 
  class DestinationTypes(str, Enum):
      BIGQUERY = "bigquery"
+     BIGQUERY_STREAMING = "bigquery_streaming"
      LOGGER = "logger"
      FILE = "file"
 

{bizon-0.0.9 → bizon-0.0.11}/bizon/destinations/destination.py
@@ -85,11 +85,16 @@ class AbstractDestination(ABC):
              pagination=self.buffer.pagination,
          )
 
+         logger.info(
+             f"Writing in destination from source iteration {self.buffer.from_iteration} to {self.buffer.to_iteration}"
+         )
+ 
          success, error_msg = self.write_records(destination_records=self.buffer.records)
 
          if success:
              # We wrote records to destination so we keep it
              destination_iteration.records_written = len(self.buffer.records)
+             logger.info(f"Successfully wrote {destination_iteration.records_written} records to destination")
 
          else:
              # We failed to write records to destination so we keep the error message
@@ -117,6 +122,11 @@ class AbstractDestination(ABC):
 
          # Last iteration, write all records to destination
          if last_iteration:
+ 
+             if len(self.buffer.records) == 0 and self.config.buffer_size == 0:
+                 logger.warning("No records to write to destination, already written, buffer is empty.")
+                 return DestinationBufferStatus.RECORDS_WRITTEN
+ 
              logger.debug("Writing last iteration records to destination")
              assert len(destination_records) == 0, "Last iteration should not have any records"
              destination_iteration = self.buffer_flush_handler(session=session)
@@ -147,9 +157,9 @@ class AbstractDestination(ABC):
              logger.warning("No records to write to destination. Check source and queue provider.")
              return DestinationBufferStatus.NO_RECORDS
 
-         # Write records to destination if buffer size is 0
+         # Write records to destination if buffer size is 0 or streaming
          if self.buffer.buffer_size == 0:
-             logger.info("Writing last iteration records to destination")
+             logger.info("Writing records to destination.")
              self.buffer.add_source_iteration_records_to_buffer(
                  iteration=iteration, records=destination_records, pagination=pagination
              )
@@ -160,7 +170,7 @@ class AbstractDestination(ABC):
          logger.debug(f"Buffer free space {self.buffer.buffer_free_space_pct}%")
          logger.debug(f"Buffer current size {self.buffer.current_size} bytes")
          logger.info(
-             f"Buffer ripeness {self.buffer.ripeness / 60} min. Max ripeness {self.buffer.buffer_flush_timeout / 60} min."  # noqa
+             f"Buffer ripeness {round(self.buffer.ripeness / 60, 2)} min. Max ripeness {round(self.buffer.buffer_flush_timeout / 60, 2)} min."  # noqa
          )
 
          # Write buffer to destination if buffer is ripe and create a new buffer for the new iteration
@@ -258,6 +268,11 @@ class DestinationFactory:
 
              return BigQueryDestination(sync_metadata=sync_metadata, config=config.config, backend=backend)
 
+         elif config.name == DestinationTypes.BIGQUERY_STREAMING:
+             from .bigquery_streaming.src.destination import BigQueryStreamingDestination
+ 
+             return BigQueryStreamingDestination(sync_metadata=sync_metadata, config=config.config, backend=backend)
+ 
          elif config.name == DestinationTypes.FILE:
              from .file.src.destination import FileDestination
 

{bizon-0.0.9 → bizon-0.0.11}/bizon/destinations/models.py
@@ -1,7 +1,9 @@
  import json
  from datetime import datetime
+ from typing import Type
  from uuid import uuid4
 
+ from google.protobuf.message import Message
  from pydantic import BaseModel, Field
  from pytz import UTC
 
@@ -81,3 +83,28 @@ class DestinationRecord(BaseModel):
              "_source_timestamp": self.source_timestamp,
              "_source_data": json.dumps(self.source_data),
          }
+ 
+     def to_protobuf_serialization(self, TableRowClass: Type[Message], debezium=False):
+ 
+         record = TableRowClass()
+         record._bizon_id = self.bizon_id
+         record._bizon_extracted_at = str(int(self.bizon_extracted_at.timestamp()))
+         record._bizon_loaded_at = str(int(self.bizon_loaded_at.timestamp()))
+         record._source_record_id = self.source_record_id
+         record._source_timestamp = str(int(self.source_timestamp.timestamp()))
+ 
+         if debezium:
+             parsed_debezium_keys = json.loads(self.source_data["_bizon_message_key"])
+             if parsed_debezium_keys:
+                 for _key in parsed_debezium_keys:
+                     setattr(record, _key, str(parsed_debezium_keys[_key]))
+             if self.source_data.get("op") == "d":
+                 source_data = {"__deleted": True, **self.source_data["before"]}
+             else:
+                 source_data = {"__deleted": False, **self.source_data["after"]}
+ 
+             record._source_data = json.dumps(source_data)
+         else:
+             record._source_data = json.dumps(self.source_data)
+ 
+         return record.SerializeToString()
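
This serialization is consumed together with the dynamic message class from proto_utils.py: the destination passes the generated TableRow class in and collects one serialized protobuf row per record. A rough sketch (here record is assumed to be a DestinationRecord already produced by the pipeline; the clustering key is a placeholder):

    from bizon.destinations.bigquery_streaming.src.proto_utils import get_proto_schema_and_class

    proto_schema, TableRow = get_proto_schema_and_class(clustering_keys=["id"])

    # One serialized row per record, ready to be wrapped in ProtoRows(serialized_rows=[...])
    serialized_row = record.to_protobuf_serialization(TableRow, debezium=False)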

{bizon-0.0.9 → bizon-0.0.11}/bizon/engine/pipeline/producer.py
@@ -104,6 +104,8 @@ class Producer:
 
          while not cursor.is_finished:
 
+             timestamp_start_iteration = datetime.now(tz=UTC)
+ 
              # Handle the case where last cursor already reach max_iterations
              terminate = self.handle_max_iterations(cursor)
              if terminate:
@@ -178,6 +180,15 @@ class Producer:
                  return_value = PipelineReturnStatus.SOURCE_ERROR
                  break
 
+             # Items in queue
+             items_in_queue = f"{self.queue.get_size()} items in queue." if self.queue.get_size() else ""
+ 
+             logger.info(
+                 (
+                     f"Iteration {cursor.iteration} finished in {datetime.now(tz=UTC) - timestamp_start_iteration}. {items_in_queue}"
+                 )
+             )
+ 
          logger.info("Terminating destination ...")
 
          try:

{bizon-0.0.9 → bizon-0.0.11}/bizon/engine/queue/adapters/kafka/queue.py
@@ -1,4 +1,5 @@
  import json
+ from typing import Union
 
  from kafka import KafkaProducer
  from loguru import logger
@@ -36,6 +37,9 @@ class KafkaQueue(AbstractQueue):
      def on_error(e):
          logger.error(f"Error sending message: {e}")
 
+     def get_size(self) -> Union[int, None]:
+         return None
+ 
      def put_queue_message(self, queue_message: QueueMessage):
          future = self.producer.send(
              topic=self.config.queue.topic,

{bizon-0.0.9 → bizon-0.0.11}/bizon/engine/queue/adapters/python_queue/queue.py
@@ -1,6 +1,7 @@
  import random
  import time
  from multiprocessing import Queue
+ from typing import Union
 
  from loguru import logger
 
@@ -52,6 +53,11 @@ class PythonQueue(AbstractQueue):
              time.sleep(random.random())
              return self.get()
 
+     def get_size(self) -> Union[int, None]:
+         if hasattr(self.queue, "qsize"):
+             return self.queue.qsize()
+         return None
+ 
      def terminate(self, iteration: int) -> bool:
          self.put(source_records=[], iteration=iteration, signal=QUEUE_TERMINATION)
          logger.info("Sent termination signal to destination.")

{bizon-0.0.9 → bizon-0.0.11}/bizon/engine/queue/adapters/rabbitmq/queue.py
@@ -1,3 +1,5 @@
+ from typing import Union
+ 
  import pika
  from loguru import logger
 
@@ -31,6 +33,9 @@ class RabbitMQ(AbstractQueue):
              body=queue_message.model_dump_json(),
          )
 
+     def get_size(self) -> Union[int, None]:
+         return None
+ 
      def get(self) -> QueueMessage:
          raise NotImplementedError(
              "RabbitMQ does not support getting messages from the queue, directly use callback in consumer."

{bizon-0.0.9 → bizon-0.0.11}/bizon/engine/queue/queue.py
@@ -1,6 +1,6 @@
  from abc import ABC, abstractmethod
  from datetime import datetime
- from typing import List, Optional
+ from typing import List, Optional, Union
 
  from pydantic import BaseModel
  from pytz import UTC
@@ -45,6 +45,11 @@ class AbstractQueue(ABC):
          """Get a QueueMessage object from the queue system"""
          pass
 
+     @abstractmethod
+     def get_size(self) -> Union[int, None]:
+         """If queue is compatible, return size of the queue"""
+         pass
+ 
      @abstractmethod
      def terminate(self, iteration: int) -> bool:
          """Send a termination signal in the queue system"""

{bizon-0.0.9 → bizon-0.0.11}/bizon/engine/runner/adapters/thread.py
@@ -1,5 +1,6 @@
  import concurrent.futures
  import time
+ import traceback
 
  from loguru import logger
 
@@ -75,5 +76,6 @@ class ThreadRunner(AbstractRunner):
              future_consumer.result()
          except Exception as e:
              logger.error(f"Consumer thread stopped running with error {e}")
+             logger.error(traceback.format_exc())
 
          return True

{bizon-0.0.9 → bizon-0.0.11}/bizon/source/discover.py
@@ -143,7 +143,7 @@ def parse_streams_from_filepath(source_name: str, filepath: str, skip_unavailabl
 
      # Transform the relative path to a python import path and import the module
      python_import_path = get_python_import_path(relative_path)
-     logger.info(f"Importing {python_import_path}")
+     logger.debug(f"Importing {python_import_path}")
 
      try:
          source_module = importlib.import_module(python_import_path, package="sources")

{bizon-0.0.9 → bizon-0.0.11}/bizon/sources/kafka/src/source.py
@@ -1,7 +1,7 @@
  import io
  import json
+ import logging
  import struct
- from concurrent.futures import ThreadPoolExecutor, as_completed
  from datetime import datetime, timezone
  from enum import Enum
  from functools import lru_cache
@@ -18,6 +18,9 @@ from bizon.source.config import SourceConfig
  from bizon.source.models import SourceIteration, SourceRecord
  from bizon.source.source import AbstractSource
 
+ silent_logger = logging.getLogger()
+ silent_logger.addHandler(logging.StreamHandler())
+ 
 
  class SchemaRegistryType(str, Enum):
      APICURIO = "apicurio"
@@ -98,7 +101,10 @@ class KafkaSource(AbstractSource):
          }
 
          # Consumer instance
-         self.consumer = Consumer(self.kafka_consumer_conf)
+         self.consumer = Consumer(self.kafka_consumer_conf, logger=silent_logger)
+ 
+         # Consumers for each worker thread
+         self.consumers_cached: Mapping[int, Consumer] = {}
 
      @staticmethod
      def streams() -> List[str]:
@@ -194,25 +200,17 @@ class KafkaSource(AbstractSource):
          global_id = self.parse_global_id_from_serialized_message(header_message)
          return self.get_parsed_avro_schema(global_id).to_json()
 
-     def read_partition(self, partition: int, topic_offsets: TopicOffsets) -> List[SourceRecord]:
+     def parse_encoded_messages(self, encoded_messages: list) -> List[SourceRecord]:
+ 
          records = []
-         encoded_messages = []
 
          # Set the source timestamp to now, otherwise it will be overwritten by the message timestamp
          source_timestamp = datetime.now(tz=timezone.utc)
 
-         # Set consumer offset params
-         consumer = Consumer(self.kafka_consumer_conf)
-         consumer.assign([TopicPartition(self.config.topic, partition, topic_offsets.get_partition_offset(partition))])
-         consumer.seek(TopicPartition(self.config.topic, partition, topic_offsets.get_partition_offset(partition)))
- 
-         # Read messages
-         encoded_messages.extend(consumer.consume(self.config.batch_size, timeout=self.config.consumer_timeout))
- 
          for message in encoded_messages:
              if not message.value():
                  logger.debug(
-                     f"Message for partition {partition} and offset {message.offset()} and topic {self.config.topic} is empty, skipping."
+                     f"Message for partition {message.partition()} and offset {message.offset()} and topic {self.config.topic} is empty, skipping."
                  )
                  continue
 
@@ -233,43 +231,44 @@ class KafkaSource(AbstractSource):
                      data[self.config.timestamp_ms_name] / 1000, tz=timezone.utc
                  )
 
+                 self.topic_offsets.set_partition_offset(message.partition(), message.offset() + 1)
+ 
                  records.append(
                      SourceRecord(
-                         id=f"part_{partition}_offset_{message.offset()}",
+                         id=f"part_{message.partition()}_offset_{message.offset()}",
                          timestamp=source_timestamp,
                          data=data,
                      )
                  )
+ 
              except Exception as e:
                  logger.error(
-                     f"Error while decoding message for partition {partition}: {e} at offset {message.offset()}"
+                     f"Error while decoding message for partition {message.partition()}: {e} at offset {message.offset()}"
                  )
                  continue
 
-         # Update the offset for the partition
-         if encoded_messages:
-             topic_offsets.set_partition_offset(partition, encoded_messages[-1].offset() + 1)
-         else:
-             logger.warning(f"No new messages found for partition {partition}")
- 
-         consumer.close()
- 
          return records
 
      def read_topic(self, pagination: dict = None) -> SourceIteration:
          nb_partitions = self.get_number_of_partitions()
 
          # Setup offset_pagination
-         topic_offsets = TopicOffsets.model_validate(pagination) if pagination else self.get_offset_partitions()
+         self.topic_offsets = TopicOffsets.model_validate(pagination) if pagination else self.get_offset_partitions()
 
-         # Use ThreadPoolExecutor to parallelize reading partitions
-         records = []
-         with ThreadPoolExecutor(max_workers=min(nb_partitions, self.config.max_consumer_threads)) as executor:
-             futures = {executor.submit(self.read_partition, i, topic_offsets): i for i in range(nb_partitions)}
-             for future in as_completed(futures):
-                 partition_records = future.result()
-                 records.extend(partition_records)
+         self.consumer.assign(
+             [
+                 TopicPartition(self.config.topic, partition, self.topic_offsets.get_partition_offset(partition))
+                 for partition in range(nb_partitions)
+             ]
+         )
 
+         t1 = datetime.now()
+         encoded_messages = self.consumer.consume(self.config.batch_size, timeout=self.config.consumer_timeout)
+         logger.info(f"Read Kafka: {len(encoded_messages)} messages in {datetime.now() - t1}")
+ 
+         records = self.parse_encoded_messages(encoded_messages)
+ 
+         # Update the offset for the partition
          if not records:
              logger.info("No new records found, stopping iteration")
              return SourceIteration(
@@ -278,7 +277,7 @@ class KafkaSource(AbstractSource):
              )
 
          return SourceIteration(
-             next_pagination=topic_offsets.model_dump(),
+             next_pagination=self.topic_offsets.model_dump(),
              records=records,
          )
 

{bizon-0.0.9 → bizon-0.0.11}/bizon/sources/kafka/tests/kafka_pipeline.py
@@ -4,6 +4,6 @@ from bizon.engine.engine import RunnerFactory
 
  if __name__ == "__main__":
      runner = RunnerFactory.create_from_yaml(
-         filepath=os.path.abspath("bizon/sources/kafka/config/kafka_teams_users_eu_west1_c511.yml")
+         filepath=os.path.abspath("bizon/sources/kafka/config/kafka_teams_users.yml")
      )
      runner.run()

{bizon-0.0.9 → bizon-0.0.11}/pyproject.toml
@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "bizon"
- version = "0.0.9"
+ version = "0.0.11"
  description = "Extract and load your data reliably from API Clients with native fault-tolerant and checkpointing mechanism."
  authors = ["Antoine Balliet <antoine.balliet@gmail.com>", "Anas El Mhamdi <anas.elmhamdi@gmail.com>"]
  readme = "README.md"
@@ -44,6 +44,7 @@ python-dotenv = "^1.0.1"
  gspread = { version = "^6.1.2", optional = true }
  click = "^8.1.7"
  pytz = "^2024.2"
+ protobuf = "4.24.0"
 
  [tool.poetry.extras]
  postgres = ["psycopg2-binary"]