bizon 0.0.9__py3-none-any.whl → 0.0.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bizon/common/models.py +2 -0
- bizon/destinations/bigquery_streaming/src/config.py +55 -0
- bizon/destinations/bigquery_streaming/src/destination.py +148 -0
- bizon/destinations/bigquery_streaming/src/proto_utils.py +91 -0
- bizon/destinations/config.py +1 -0
- bizon/destinations/destination.py +18 -3
- bizon/destinations/models.py +27 -0
- bizon/engine/pipeline/producer.py +11 -0
- bizon/engine/queue/adapters/kafka/queue.py +4 -0
- bizon/engine/queue/adapters/python_queue/queue.py +6 -0
- bizon/engine/queue/adapters/rabbitmq/queue.py +5 -0
- bizon/engine/queue/queue.py +6 -1
- bizon/engine/runner/adapters/thread.py +2 -0
- bizon/source/discover.py +1 -1
- bizon/sources/kafka/src/source.py +31 -32
- bizon/sources/kafka/tests/kafka_pipeline.py +1 -1
- {bizon-0.0.9.dist-info → bizon-0.0.11.dist-info}/METADATA +2 -1
- {bizon-0.0.9.dist-info → bizon-0.0.11.dist-info}/RECORD +21 -18
- {bizon-0.0.9.dist-info → bizon-0.0.11.dist-info}/LICENSE +0 -0
- {bizon-0.0.9.dist-info → bizon-0.0.11.dist-info}/WHEEL +0 -0
- {bizon-0.0.9.dist-info → bizon-0.0.11.dist-info}/entry_points.txt +0 -0
bizon/common/models.py
CHANGED
@@ -3,6 +3,7 @@ from typing import Union
 from pydantic import BaseModel, ConfigDict, Field
 
 from bizon.destinations.bigquery.src.config import BigQueryConfig
+from bizon.destinations.bigquery_streaming.src.config import BigQueryStreamingConfig
 from bizon.destinations.file.src.config import FileDestinationConfig
 from bizon.destinations.logger.src.config import LoggerConfig
 from bizon.engine.config import EngineConfig
@@ -24,6 +25,7 @@ class BizonConfig(BaseModel):
 
     destination: Union[
         BigQueryConfig,
+        BigQueryStreamingConfig,
         LoggerConfig,
         FileDestinationConfig,
     ] = Field(
bizon/destinations/bigquery_streaming/src/config.py
ADDED
@@ -0,0 +1,55 @@
+from enum import Enum
+from typing import Literal, Optional
+
+from pydantic import BaseModel, Field, field_validator
+
+from bizon.destinations.config import (
+    AbstractDestinationConfig,
+    AbstractDestinationDetailsConfig,
+    DestinationTypes,
+)
+
+
+class GCSBufferFormat(str, Enum):
+    PARQUET = "parquet"
+    CSV = "csv"
+
+
+class TimePartitioning(str, Enum):
+    DAY = "DAY"
+    HOUR = "HOUR"
+    MONTH = "MONTH"
+    YEAR = "YEAR"
+
+
+class BigQueryAuthentication(BaseModel):
+    service_account_key: str = Field(
+        description="Service Account Key JSON string. If empty it will be infered",
+        default="",
+    )
+
+
+class BigQueryConfigDetails(AbstractDestinationDetailsConfig):
+    project_id: str
+    dataset_id: str
+    dataset_location: Optional[str] = "US"
+    table_id: Optional[str] = Field(
+        default=None, description="Table ID, if not provided it will be inferred from source name"
+    )
+    time_partitioning: Optional[TimePartitioning] = Field(
+        default=TimePartitioning.DAY, description="BigQuery Time partitioning type"
+    )
+    authentication: Optional[BigQueryAuthentication] = None
+
+    buffer_size: int = Field(default=0, description="Buffer size in MB")
+
+    @field_validator("buffer_size", mode="after")
+    def validate_buffer_size(cls, value: int) -> int:
+        if value != 0:
+            raise ValueError("Buffer size must be 0, we directly stream to BigQuery")
+        return value
+
+
+class BigQueryStreamingConfig(AbstractDestinationConfig):
+    name: Literal[DestinationTypes.BIGQUERY_STREAMING]
+    config: BigQueryConfigDetails
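For context: the buffer_size validator above pins the buffer to 0 because this destination streams rows straight to BigQuery instead of batching them. A minimal, self-contained sketch of that behaviour, using a plain pydantic model rather than the actual bizon classes (the class name here is illustrative):

from pydantic import BaseModel, Field, field_validator


class StreamingDetailsSketch(BaseModel):
    # Mirrors BigQueryConfigDetails.buffer_size from the diff above
    buffer_size: int = Field(default=0, description="Buffer size in MB")

    @field_validator("buffer_size", mode="after")
    def validate_buffer_size(cls, value: int) -> int:
        if value != 0:
            raise ValueError("Buffer size must be 0, we directly stream to BigQuery")
        return value


StreamingDetailsSketch()                    # accepted: defaults to 0
try:
    StreamingDetailsSketch(buffer_size=50)  # rejected by the validator
except ValueError as exc:                   # pydantic v2 ValidationError subclasses ValueError
    print(exc)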
bizon/destinations/bigquery_streaming/src/destination.py
ADDED
@@ -0,0 +1,148 @@
+import json
+import os
+import tempfile
+from typing import List, Tuple
+
+from google.api_core.exceptions import NotFound
+from google.cloud import bigquery, bigquery_storage_v1, storage
+from google.cloud.bigquery import DatasetReference, TimePartitioning
+from google.cloud.bigquery_storage_v1.types import AppendRowsRequest, ProtoRows
+from loguru import logger
+
+from bizon.common.models import SyncMetadata
+from bizon.destinations.config import NormalizationType
+from bizon.destinations.destination import AbstractDestination
+from bizon.destinations.models import DestinationRecord
+from bizon.engine.backend.backend import AbstractBackend
+
+from .config import BigQueryConfigDetails
+from .proto_utils import get_proto_schema_and_class
+
+
+class BigQueryStreamingDestination(AbstractDestination):
+
+    def __init__(self, sync_metadata: SyncMetadata, config: BigQueryConfigDetails, backend: AbstractBackend):
+        super().__init__(sync_metadata, config, backend)
+        self.config: BigQueryConfigDetails = config
+
+        if config.authentication and config.authentication.service_account_key:
+            with tempfile.NamedTemporaryFile(delete=False) as temp:
+                temp.write(config.authentication.service_account_key.encode())
+                temp_file_path = temp.name
+            os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = temp_file_path
+
+        self.project_id = config.project_id
+        self.bq_client = bigquery.Client(project=self.project_id)
+        self.bq_storage_client = bigquery_storage_v1.BigQueryWriteClient()
+        self.gcs_client = storage.Client(project=self.project_id)
+        self.dataset_id = config.dataset_id
+        self.dataset_location = config.dataset_location
+
+    @property
+    def table_id(self) -> str:
+        tabled_id = self.config.table_id or f"{self.sync_metadata.source_name}_{self.sync_metadata.stream_name}"
+        return f"{self.project_id}.{self.dataset_id}.{tabled_id}"
+
+    def get_bigquery_schema(self, destination_records: List[DestinationRecord]) -> List[bigquery.SchemaField]:
+
+        # we keep raw data in the column source_data
+        if self.config.normalization.type == NormalizationType.NONE:
+            return [
+                bigquery.SchemaField("_source_record_id", "STRING", mode="REQUIRED"),
+                bigquery.SchemaField("_source_timestamp", "TIMESTAMP", mode="REQUIRED"),
+                bigquery.SchemaField("_source_data", "STRING", mode="NULLABLE"),
+                bigquery.SchemaField("_bizon_extracted_at", "TIMESTAMP", mode="REQUIRED"),
+                bigquery.SchemaField(
+                    "_bizon_loaded_at", "TIMESTAMP", mode="REQUIRED", default_value_expression="CURRENT_TIMESTAMP()"
+                ),
+                bigquery.SchemaField("_bizon_id", "STRING", mode="REQUIRED"),
+            ]
+
+        elif self.config.normalization.type == NormalizationType.DEBEZIUM:
+            assert (
+                "_bizon_message_key" in destination_records[0].source_data
+            ), "Debezium records must have a '_bizon_message_key' key"
+            message_keys = json.loads(destination_records[0].source_data["_bizon_message_key"])
+            return [bigquery.SchemaField(key, "STRING", mode="NULLABLE") for key in message_keys] + [
+                bigquery.SchemaField("_source_data", "STRING", mode="NULLABLE"),
+                bigquery.SchemaField("_source_record_id", "STRING", mode="REQUIRED"),
+                bigquery.SchemaField("_source_timestamp", "TIMESTAMP", mode="REQUIRED"),
+                bigquery.SchemaField("_bizon_extracted_at", "TIMESTAMP", mode="REQUIRED"),
+                bigquery.SchemaField(
+                    "_bizon_loaded_at", "TIMESTAMP", mode="REQUIRED", default_value_expression="CURRENT_TIMESTAMP()"
+                ),
+                bigquery.SchemaField("_bizon_id", "STRING", mode="REQUIRED"),
+            ]
+
+        # If normalization is tabular, we parse key / value pairs to columns
+        elif self.config.normalization.type == NormalizationType.TABULAR:
+            first_record_keys = destination_records[0].source_data.keys()
+            return [bigquery.SchemaField(key, "STRING", mode="NULLABLE") for key in first_record_keys] + [
+                bigquery.SchemaField("_source_record_id", "STRING", mode="REQUIRED"),
+                bigquery.SchemaField("_source_timestamp", "TIMESTAMP", mode="REQUIRED"),
+                bigquery.SchemaField("_bizon_extracted_at", "TIMESTAMP", mode="REQUIRED"),
+                bigquery.SchemaField(
+                    "_bizon_loaded_at", "TIMESTAMP", mode="REQUIRED", default_value_expression="CURRENT_TIMESTAMP()"
+                ),
+                bigquery.SchemaField("_bizon_id", "STRING", mode="REQUIRED"),
+            ]
+
+        raise NotImplementedError(f"Normalization type {self.config.normalization.type} is not supported")
+
+    def check_connection(self) -> bool:
+        dataset_ref = DatasetReference(self.project_id, self.dataset_id)
+
+        try:
+            self.bq_client.get_dataset(dataset_ref)
+        except NotFound:
+            dataset = bigquery.Dataset(dataset_ref)
+            dataset.location = self.dataset_location
+            dataset = self.bq_client.create_dataset(dataset)
+        return True
+
+    def load_to_bigquery_via_streaming(self, destination_records: List[DestinationRecord]) -> str:
+        clustering_keys = []
+
+        if self.config.normalization.type == NormalizationType.DEBEZIUM:
+            clustering_keys = list(json.loads(destination_records[0].source_data["_bizon_message_key"]).keys())
+
+        # Create table if it doesnt exist
+        schema = self.get_bigquery_schema(destination_records=destination_records)
+        table = bigquery.Table(self.table_id, schema=schema)
+        time_partitioning = TimePartitioning(field="_bizon_loaded_at", type_=self.config.time_partitioning)
+        table.time_partitioning = time_partitioning
+
+        if clustering_keys:
+            table.clustering_fields = clustering_keys
+
+        table = self.bq_client.create_table(table, exists_ok=True)
+
+        # Create the stream
+        write_client = self.bq_storage_client
+        tabled_id = self.config.table_id or f"{self.sync_metadata.source_name}_{self.sync_metadata.stream_name}"
+        parent = write_client.table_path(self.project_id, self.dataset_id, tabled_id)
+        stream_name = f"{parent}/_default"
+
+        # Generating the protocol buffer representation of the message descriptor.
+        proto_schema, TableRow = get_proto_schema_and_class(clustering_keys)
+
+        serialized_rows = [
+            record.to_protobuf_serialization(
+                TableRow, debezium=self.config.normalization.type == NormalizationType.DEBEZIUM
+            )
+            for record in destination_records
+        ]
+
+        request = AppendRowsRequest(
+            write_stream=stream_name,
+            proto_rows=AppendRowsRequest.ProtoData(
+                rows=ProtoRows(serialized_rows=serialized_rows),
+                writer_schema=proto_schema,
+            ),
+        )
+        response = write_client.append_rows(iter([request]))
+        assert response.code().name == "OK"
+
+    def write_records(self, destination_records: List[DestinationRecord]) -> Tuple[bool, str]:
+        self.load_to_bigquery_via_streaming(destination_records=destination_records)
+        return True, ""
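Condensed from the module above, the table-provisioning step amounts to the sketch below; the project, dataset, table and clustering names are placeholders and the schema is trimmed to two fields:

from google.cloud import bigquery
from google.cloud.bigquery import TimePartitioning

client = bigquery.Client(project="my-project")  # placeholder project

table = bigquery.Table(
    "my-project.raw_data.kafka_users",  # placeholder fully-qualified table id
    schema=[
        bigquery.SchemaField("_source_data", "STRING", mode="NULLABLE"),
        bigquery.SchemaField(
            "_bizon_loaded_at", "TIMESTAMP", mode="REQUIRED", default_value_expression="CURRENT_TIMESTAMP()"
        ),
    ],
)
table.time_partitioning = TimePartitioning(field="_bizon_loaded_at", type_="DAY")
table.clustering_fields = ["id"]  # only set when Debezium message keys are present
table = client.create_table(table, exists_ok=True)  # idempotent: no error if the table already exists

Rows are then appended through the Storage Write API `_default` stream, serialized with the dynamically built protobuf class from proto_utils.py below.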
bizon/destinations/bigquery_streaming/src/proto_utils.py
ADDED
@@ -0,0 +1,91 @@
+from typing import List, Tuple, Type
+
+from google.cloud.bigquery_storage_v1.types import ProtoSchema
+from google.protobuf.descriptor_pb2 import (
+    DescriptorProto,
+    FieldDescriptorProto,
+    FileDescriptorProto,
+)
+from google.protobuf.descriptor_pool import DescriptorPool
+from google.protobuf.message import Message
+from google.protobuf.message_factory import GetMessageClassesForFiles
+
+
+def get_proto_schema_and_class(clustering_keys: List[str] = None) -> Tuple[ProtoSchema, Type[Message]]:
+    # Define the FileDescriptorProto
+    file_descriptor_proto = FileDescriptorProto()
+    file_descriptor_proto.name = "dynamic.proto"
+    file_descriptor_proto.package = "dynamic_package"
+
+    # Define the TableRow message schema
+    message_descriptor = DescriptorProto()
+    message_descriptor.name = "TableRow"
+
+    # Add fields to the message, only use TYPE_STRING, BigQuery does not support other types
+    # It does not imapact data types in final table
+
+    # https://stackoverflow.com/questions/70489919/protobuf-type-for-bigquery-timestamp-field
+    fields = [
+        {"name": "_bizon_id", "type": FieldDescriptorProto.TYPE_STRING, "label": FieldDescriptorProto.LABEL_REQUIRED},
+        {
+            "name": "_bizon_extracted_at",
+            "type": FieldDescriptorProto.TYPE_STRING,
+            "label": FieldDescriptorProto.LABEL_REQUIRED,
+        },
+        {
+            "name": "_bizon_loaded_at",
+            "type": FieldDescriptorProto.TYPE_STRING,
+            "label": FieldDescriptorProto.LABEL_REQUIRED,
+        },
+        {
+            "name": "_source_record_id",
+            "type": FieldDescriptorProto.TYPE_STRING,
+            "label": FieldDescriptorProto.LABEL_REQUIRED,
+        },
+        {
+            "name": "_source_timestamp",
+            "type": FieldDescriptorProto.TYPE_STRING,
+            "label": FieldDescriptorProto.LABEL_REQUIRED,
+        },
+        {
+            "name": "_source_data",
+            "type": FieldDescriptorProto.TYPE_STRING,
+            "label": FieldDescriptorProto.LABEL_OPTIONAL,
+        },
+    ]
+
+    if clustering_keys:
+        for key in clustering_keys:
+            fields.append(
+                {
+                    "name": key,
+                    "type": FieldDescriptorProto.TYPE_STRING,
+                    "label": FieldDescriptorProto.LABEL_OPTIONAL,
+                }
+            )
+
+    for i, field in enumerate(fields, start=1):
+        field_descriptor = message_descriptor.field.add()
+        field_descriptor.name = field["name"]
+        field_descriptor.number = i
+        field_descriptor.type = field["type"]
+        field_descriptor.label = field["label"]
+
+    # Add the message to the file descriptor
+    file_descriptor_proto.message_type.add().CopyFrom(message_descriptor)
+
+    # Create a DescriptorPool and register the FileDescriptorProto
+    pool = DescriptorPool()
+    pool.Add(file_descriptor_proto)
+
+    # Use the registered file name to fetch the message classes
+    message_classes = GetMessageClassesForFiles(["dynamic.proto"], pool=pool)
+
+    # Fetch the TableRow class
+    table_row_class = message_classes["dynamic_package.TableRow"]
+
+    # Create the ProtoSchema
+    proto_schema = ProtoSchema()
+    proto_schema.proto_descriptor.CopyFrom(message_descriptor)
+
+    return proto_schema, table_row_class
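A usage sketch for the helper above, showing how the generated TableRow class is filled and serialized; the attribute names match to_protobuf_serialization in bizon/destinations/models.py, and the values are hypothetical:

from bizon.destinations.bigquery_streaming.src.proto_utils import get_proto_schema_and_class

# Assume one Debezium key column, "id"; every generated field is a STRING.
proto_schema, TableRow = get_proto_schema_and_class(clustering_keys=["id"])

row = TableRow()
row._bizon_id = "8a6f0c1e"                  # hypothetical values
row._bizon_extracted_at = "1700000000"      # epoch seconds rendered as strings
row._bizon_loaded_at = "1700000000"
row._source_record_id = "part_0_offset_42"
row._source_timestamp = "1700000000"
row._source_data = '{"id": 1, "email": "a@b.c"}'
row.id = "1"                                # extra column created for the clustering key

payload = row.SerializeToString()           # bytes that go into ProtoRows.serialized_rows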
bizon/destinations/config.py
CHANGED
bizon/destinations/destination.py
CHANGED
@@ -85,11 +85,16 @@ class AbstractDestination(ABC):
             pagination=self.buffer.pagination,
         )
 
+        logger.info(
+            f"Writing in destination from source iteration {self.buffer.from_iteration} to {self.buffer.to_iteration}"
+        )
+
         success, error_msg = self.write_records(destination_records=self.buffer.records)
 
         if success:
             # We wrote records to destination so we keep it
             destination_iteration.records_written = len(self.buffer.records)
+            logger.info(f"Successfully wrote {destination_iteration.records_written} records to destination")
 
         else:
             # We failed to write records to destination so we keep the error message
@@ -117,6 +122,11 @@ class AbstractDestination(ABC):
 
         # Last iteration, write all records to destination
        if last_iteration:
+
+            if len(self.buffer.records) == 0 and self.config.buffer_size == 0:
+                logger.warning("No records to write to destination, already written, buffer is empty.")
+                return DestinationBufferStatus.RECORDS_WRITTEN
+
             logger.debug("Writing last iteration records to destination")
             assert len(destination_records) == 0, "Last iteration should not have any records"
             destination_iteration = self.buffer_flush_handler(session=session)
@@ -147,9 +157,9 @@ class AbstractDestination(ABC):
             logger.warning("No records to write to destination. Check source and queue provider.")
             return DestinationBufferStatus.NO_RECORDS
 
-        # Write records to destination if buffer size is 0
+        # Write records to destination if buffer size is 0 or streaming
         if self.buffer.buffer_size == 0:
-            logger.info("Writing
+            logger.info("Writing records to destination.")
             self.buffer.add_source_iteration_records_to_buffer(
                 iteration=iteration, records=destination_records, pagination=pagination
             )
@@ -160,7 +170,7 @@ class AbstractDestination(ABC):
         logger.debug(f"Buffer free space {self.buffer.buffer_free_space_pct}%")
         logger.debug(f"Buffer current size {self.buffer.current_size} bytes")
         logger.info(
-            f"Buffer ripeness {self.buffer.ripeness / 60} min. Max ripeness {self.buffer.buffer_flush_timeout / 60} min."  # noqa
+            f"Buffer ripeness {round(self.buffer.ripeness / 60, 2)} min. Max ripeness {round(self.buffer.buffer_flush_timeout / 60, 2)} min."  # noqa
         )
 
         # Write buffer to destination if buffer is ripe and create a new buffer for the new iteration
@@ -258,6 +268,11 @@ class DestinationFactory:
 
             return BigQueryDestination(sync_metadata=sync_metadata, config=config.config, backend=backend)
 
+        elif config.name == DestinationTypes.BIGQUERY_STREAMING:
+            from .bigquery_streaming.src.destination import BigQueryStreamingDestination
+
+            return BigQueryStreamingDestination(sync_metadata=sync_metadata, config=config.config, backend=backend)
+
         elif config.name == DestinationTypes.FILE:
             from .file.src.destination import FileDestination
 
bizon/destinations/models.py
CHANGED
@@ -1,7 +1,9 @@
 import json
 from datetime import datetime
+from typing import Type
 from uuid import uuid4
 
+from google.protobuf.message import Message
 from pydantic import BaseModel, Field
 from pytz import UTC
 
@@ -81,3 +83,28 @@ class DestinationRecord(BaseModel):
             "_source_timestamp": self.source_timestamp,
             "_source_data": json.dumps(self.source_data),
         }
+
+    def to_protobuf_serialization(self, TableRowClass: Type[Message], debezium=False):
+
+        record = TableRowClass()
+        record._bizon_id = self.bizon_id
+        record._bizon_extracted_at = str(int(self.bizon_extracted_at.timestamp()))
+        record._bizon_loaded_at = str(int(self.bizon_loaded_at.timestamp()))
+        record._source_record_id = self.source_record_id
+        record._source_timestamp = str(int(self.source_timestamp.timestamp()))
+
+        if debezium:
+            parsed_debezium_keys = json.loads(self.source_data["_bizon_message_key"])
+            if parsed_debezium_keys:
+                for _key in parsed_debezium_keys:
+                    setattr(record, _key, str(parsed_debezium_keys[_key]))
+            if self.source_data.get("op") == "d":
+                source_data = {"__deleted": True, **self.source_data["before"]}
+            else:
+                source_data = {"__deleted": False, **self.source_data["after"]}
+
+            record._source_data = json.dumps(source_data)
+        else:
+            record._source_data = json.dumps(self.source_data)
+
+        return record.SerializeToString()
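The Debezium branch of to_protobuf_serialization flattens the change-event envelope: delete events keep the "before" image and are flagged with "__deleted". A standalone sketch with a hypothetical record:

# Hypothetical Debezium-style payload, shaped like the source_data handled above
source_data = {
    "_bizon_message_key": '{"id": 42}',
    "op": "d",                                        # "d" marks a delete event
    "before": {"id": 42, "email": "old@example.com"},
    "after": None,
}

if source_data.get("op") == "d":
    flattened = {"__deleted": True, **source_data["before"]}
else:
    flattened = {"__deleted": False, **source_data["after"]}

print(flattened)  # {'__deleted': True, 'id': 42, 'email': 'old@example.com'}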
bizon/engine/pipeline/producer.py
CHANGED
@@ -104,6 +104,8 @@ class Producer:
 
         while not cursor.is_finished:
 
+            timestamp_start_iteration = datetime.now(tz=UTC)
+
             # Handle the case where last cursor already reach max_iterations
             terminate = self.handle_max_iterations(cursor)
             if terminate:
@@ -178,6 +180,15 @@ class Producer:
                 return_value = PipelineReturnStatus.SOURCE_ERROR
                 break
 
+            # Items in queue
+            items_in_queue = f"{self.queue.get_size()} items in queue." if self.queue.get_size() else ""
+
+            logger.info(
+                (
+                    f"Iteration {cursor.iteration} finished in {datetime.now(tz=UTC) - timestamp_start_iteration}. {items_in_queue}"
+                )
+            )
+
         logger.info("Terminating destination ...")
 
         try:
bizon/engine/queue/adapters/kafka/queue.py
CHANGED
@@ -1,4 +1,5 @@
 import json
+from typing import Union
 
 from kafka import KafkaProducer
 from loguru import logger
@@ -36,6 +37,9 @@ class KafkaQueue(AbstractQueue):
        def on_error(e):
            logger.error(f"Error sending message: {e}")
 
+    def get_size(self) -> Union[int, None]:
+        return None
+
     def put_queue_message(self, queue_message: QueueMessage):
         future = self.producer.send(
             topic=self.config.queue.topic,
bizon/engine/queue/adapters/python_queue/queue.py
CHANGED
@@ -1,6 +1,7 @@
 import random
 import time
 from multiprocessing import Queue
+from typing import Union
 
 from loguru import logger
 
@@ -52,6 +53,11 @@ class PythonQueue(AbstractQueue):
            time.sleep(random.random())
            return self.get()
 
+    def get_size(self) -> Union[int, None]:
+        if hasattr(self.queue, "qsize"):
+            return self.queue.qsize()
+        return None
+
     def terminate(self, iteration: int) -> bool:
         self.put(source_records=[], iteration=iteration, signal=QUEUE_TERMINATION)
         logger.info("Sent termination signal to destination.")
bizon/engine/queue/adapters/rabbitmq/queue.py
CHANGED
@@ -1,3 +1,5 @@
+from typing import Union
+
 import pika
 from loguru import logger
 
@@ -31,6 +33,9 @@ class RabbitMQ(AbstractQueue):
            body=queue_message.model_dump_json(),
        )
 
+    def get_size(self) -> Union[int, None]:
+        return None
+
     def get(self) -> QueueMessage:
         raise NotImplementedError(
             "RabbitMQ does not support getting messages from the queue, directly use callback in consumer."
bizon/engine/queue/queue.py
CHANGED
@@ -1,6 +1,6 @@
 from abc import ABC, abstractmethod
 from datetime import datetime
-from typing import List, Optional
+from typing import List, Optional, Union
 
 from pydantic import BaseModel
 from pytz import UTC
@@ -45,6 +45,11 @@ class AbstractQueue(ABC):
        """Get a QueueMessage object from the queue system"""
        pass
 
+    @abstractmethod
+    def get_size(self) -> Union[int, None]:
+        """If queue is compatible, return size of the queue"""
+        pass
+
     @abstractmethod
     def terminate(self, iteration: int) -> bool:
         """Send a termination signal in the queue system"""
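Every queue adapter now implements get_size; adapters that cannot report a depth return None (Kafka, RabbitMQ), while the Python queue returns qsize() when available. An illustrative sketch of the contract, not the actual bizon adapters:

from collections import deque
from typing import Union


class InMemoryQueueSketch:
    """Hypothetical adapter whose backend can report its depth."""

    def __init__(self):
        self._items = deque()

    def get_size(self) -> Union[int, None]:
        return len(self._items)


class OpaqueBrokerQueueSketch:
    """Hypothetical adapter backed by a broker that cannot report depth."""

    def get_size(self) -> Union[int, None]:
        return None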
bizon/engine/runner/adapters/thread.py
CHANGED
@@ -1,5 +1,6 @@
 import concurrent.futures
 import time
+import traceback
 
 from loguru import logger
 
@@ -75,5 +76,6 @@ class ThreadRunner(AbstractRunner):
            future_consumer.result()
        except Exception as e:
            logger.error(f"Consumer thread stopped running with error {e}")
+            logger.error(traceback.format_exc())
 
        return True
bizon/source/discover.py
CHANGED
@@ -143,7 +143,7 @@ def parse_streams_from_filepath(source_name: str, filepath: str, skip_unavailabl
 
    # Transform the relative path to a python import path and import the module
    python_import_path = get_python_import_path(relative_path)
-    logger.
+    logger.debug(f"Importing {python_import_path}")
 
    try:
        source_module = importlib.import_module(python_import_path, package="sources")
bizon/sources/kafka/src/source.py
CHANGED
@@ -1,7 +1,7 @@
 import io
 import json
+import logging
 import struct
-from concurrent.futures import ThreadPoolExecutor, as_completed
 from datetime import datetime, timezone
 from enum import Enum
 from functools import lru_cache
@@ -18,6 +18,9 @@ from bizon.source.config import SourceConfig
 from bizon.source.models import SourceIteration, SourceRecord
 from bizon.source.source import AbstractSource
 
+silent_logger = logging.getLogger()
+silent_logger.addHandler(logging.StreamHandler())
+
 
 class SchemaRegistryType(str, Enum):
     APICURIO = "apicurio"
@@ -98,7 +101,10 @@ class KafkaSource(AbstractSource):
        }
 
        # Consumer instance
-        self.consumer = Consumer(self.kafka_consumer_conf)
+        self.consumer = Consumer(self.kafka_consumer_conf, logger=silent_logger)
+
+        # Consumers for each worker thread
+        self.consumers_cached: Mapping[int, Consumer] = {}
 
    @staticmethod
    def streams() -> List[str]:
@@ -194,25 +200,17 @@
        global_id = self.parse_global_id_from_serialized_message(header_message)
        return self.get_parsed_avro_schema(global_id).to_json()
 
-    def
+    def parse_encoded_messages(self, encoded_messages: list) -> List[SourceRecord]:
+
        records = []
-        encoded_messages = []
 
        # Set the source timestamp to now, otherwise it will be overwritten by the message timestamp
        source_timestamp = datetime.now(tz=timezone.utc)
 
-        # Set consumer offset params
-        consumer = Consumer(self.kafka_consumer_conf)
-        consumer.assign([TopicPartition(self.config.topic, partition, topic_offsets.get_partition_offset(partition))])
-        consumer.seek(TopicPartition(self.config.topic, partition, topic_offsets.get_partition_offset(partition)))
-
-        # Read messages
-        encoded_messages.extend(consumer.consume(self.config.batch_size, timeout=self.config.consumer_timeout))
-
        for message in encoded_messages:
            if not message.value():
                logger.debug(
-                    f"Message for partition {partition} and offset {message.offset()} and topic {self.config.topic} is empty, skipping."
+                    f"Message for partition {message.partition()} and offset {message.offset()} and topic {self.config.topic} is empty, skipping."
                )
                continue
 
@@ -233,43 +231,44 @@ class KafkaSource(AbstractSource):
                        data[self.config.timestamp_ms_name] / 1000, tz=timezone.utc
                    )
 
+                self.topic_offsets.set_partition_offset(message.partition(), message.offset() + 1)
+
                records.append(
                    SourceRecord(
-                        id=f"part_{partition}_offset_{message.offset()}",
+                        id=f"part_{message.partition()}_offset_{message.offset()}",
                        timestamp=source_timestamp,
                        data=data,
                    )
                )
+
            except Exception as e:
                logger.error(
-                    f"Error while decoding message for partition {partition}: {e} at offset {message.offset()}"
+                    f"Error while decoding message for partition {message.partition()}: {e} at offset {message.offset()}"
                )
                continue
 
-        # Update the offset for the partition
-        if encoded_messages:
-            topic_offsets.set_partition_offset(partition, encoded_messages[-1].offset() + 1)
-        else:
-            logger.warning(f"No new messages found for partition {partition}")
-
-        consumer.close()
-
        return records
 
    def read_topic(self, pagination: dict = None) -> SourceIteration:
        nb_partitions = self.get_number_of_partitions()
 
        # Setup offset_pagination
-        topic_offsets = TopicOffsets.model_validate(pagination) if pagination else self.get_offset_partitions()
+        self.topic_offsets = TopicOffsets.model_validate(pagination) if pagination else self.get_offset_partitions()
 
-
-
-
-
-
-
-            records.extend(partition_records)
+        self.consumer.assign(
+            [
+                TopicPartition(self.config.topic, partition, self.topic_offsets.get_partition_offset(partition))
+                for partition in range(nb_partitions)
+            ]
+        )
 
+        t1 = datetime.now()
+        encoded_messages = self.consumer.consume(self.config.batch_size, timeout=self.config.consumer_timeout)
+        logger.info(f"Read Kafka: {len(encoded_messages)} messages in {datetime.now() - t1}")
+
+        records = self.parse_encoded_messages(encoded_messages)
+
+        # Update the offset for the partition
        if not records:
            logger.info("No new records found, stopping iteration")
            return SourceIteration(
@@ -278,7 +277,7 @@ class KafkaSource(AbstractSource):
        )
 
        return SourceIteration(
-            next_pagination=topic_offsets.model_dump(),
+            next_pagination=self.topic_offsets.model_dump(),
            records=records,
        )
 
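The refactor above replaces one consumer per partition with a single consumer assigned to every partition, consumed in one batched call, with offsets tracked per message. A sketch of that read pattern with confluent-kafka; the broker, topic, group and offsets are placeholders:

from confluent_kafka import Consumer, TopicPartition

conf = {"bootstrap.servers": "localhost:9092", "group.id": "bizon-sketch"}  # placeholder config
consumer = Consumer(conf)

topic = "teams_users"            # placeholder topic
offsets = {0: 0, 1: 0}           # partition -> next offset to read, as TopicOffsets would track

# Assign all partitions at their stored offsets in a single call
consumer.assign([TopicPartition(topic, partition, offset) for partition, offset in offsets.items()])

messages = consumer.consume(100, timeout=10)   # one batch across all assigned partitions
for message in messages:
    if not message.value():
        continue                               # skip empty payloads, as the source does
    # advance the stored offset for this message's partition
    offsets[message.partition()] = message.offset() + 1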
bizon/sources/kafka/tests/kafka_pipeline.py
CHANGED
@@ -4,6 +4,6 @@ from bizon.engine.engine import RunnerFactory
 
 if __name__ == "__main__":
     runner = RunnerFactory.create_from_yaml(
-        filepath=os.path.abspath("bizon/sources/kafka/config/
+        filepath=os.path.abspath("bizon/sources/kafka/config/kafka_teams_users.yml")
     )
     runner.run()
{bizon-0.0.9.dist-info → bizon-0.0.11.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: bizon
-Version: 0.0.
+Version: 0.0.11
 Summary: Extract and load your data reliably from API Clients with native fault-tolerant and checkpointing mechanism.
 Author: Antoine Balliet
 Author-email: antoine.balliet@gmail.com
@@ -31,6 +31,7 @@ Requires-Dist: loguru (>=0.7.2,<0.8.0)
 Requires-Dist: pandas (>=2.2.2,<3.0.0) ; extra == "bigquery"
 Requires-Dist: pendulum (>=3.0.0,<4.0.0)
 Requires-Dist: pika (>=1.3.2,<2.0.0) ; extra == "rabbitmq"
+Requires-Dist: protobuf (==4.24.0)
 Requires-Dist: psycopg2-binary (>=2.9.9,<3.0.0) ; extra == "postgres"
 Requires-Dist: pyarrow (>=16.1.0,<17.0.0)
 Requires-Dist: pydantic (>=2.8.2,<3.0.0)
{bizon-0.0.9.dist-info → bizon-0.0.11.dist-info}/RECORD
CHANGED
@@ -4,18 +4,21 @@ bizon/cli/main.py,sha256=QOTy8nrfj7bJSXkd98OywKpYUD3zShsBUxBiodmeku8,3212
 bizon/cli/utils.py,sha256=aZ47YjFfifHkW95bAVzWfEQD3ZnxGSMT32bkRLmc5-c,953
 bizon/common/errors/backoff.py,sha256=z7RkQt1Npdh0sfD3hBDaiWQKe4iqS6ewvT1Q4Fds5aU,508
 bizon/common/errors/errors.py,sha256=mrYx1uE2kOuR2pEaB7ztK1l2m0E4V-_-hxq-DuILerY,682
-bizon/common/models.py,sha256=
+bizon/common/models.py,sha256=SE06zZjvJbljyR0nTB0wmVotQS-YDr9Y1OGbLT7XVek,1869
 bizon/destinations/bigquery/config/bigquery.example.yml,sha256=mvKtFS_PUuekyMh9xssuwRfFwLtR-rVvpIy5xmF5__k,1261
 bizon/destinations/bigquery/src/config.py,sha256=QlD-FdBJ8Q6nKPrOf5q28lHnyFE8khT41dSR1s2meeM,1378
 bizon/destinations/bigquery/src/destination.py,sha256=tPxE0IpHbR4zDkW5HaiHkgeDRDY2AibIPzY9iftZ2Uc,11079
+bizon/destinations/bigquery_streaming/src/config.py,sha256=6NtsOJJ8rl0U96DpQdYvE50i8Wry6d8OiuBR1FKHjNE,1582
+bizon/destinations/bigquery_streaming/src/destination.py,sha256=6sSSdhqv4LIMbrIDcInKed0pNR0r_qKU3eG2BCX0JYQ,7322
+bizon/destinations/bigquery_streaming/src/proto_utils.py,sha256=n6OP5KEnyVdl17NjCxVPNsV7zewp1fbYDckbaHFrIUM,3305
 bizon/destinations/buffer.py,sha256=bFYkaoge-3AyKfGolqsuB3PWWtdPt65Fllrz-3X_uMI,2594
-bizon/destinations/config.py,sha256=
-bizon/destinations/destination.py,sha256=
+bizon/destinations/config.py,sha256=gBiEfVYARlH9BcSKI5Fb6rLkSUfIElf39MeMdk0HJOc,1732
+bizon/destinations/destination.py,sha256=D6EMs_sq3HG2x9yR54aknDgx3TrU2VbxfmzpH1GWVCc,11795
 bizon/destinations/file/src/config.py,sha256=C4BBIKzBH5343iLGR3aCubAGjPo0b2LegsCLjb77uFA,513
 bizon/destinations/file/src/destination.py,sha256=1VCrVdtzAzwSKgYq0JUOc3r2cM7314dV-eIoAFhM_64,1003
 bizon/destinations/logger/src/config.py,sha256=AWY3R9q3ZjD3uQ_KBq8VcW60deKSIHe3qtgCKjdywKk,433
 bizon/destinations/logger/src/destination.py,sha256=xTt03F3AMI9KhQno2tGoCr3eacrO62qjnOlpeEHk6tQ,868
-bizon/destinations/models.py,sha256=
+bizon/destinations/models.py,sha256=w2wZy8alcqc89rdmrwDUWxVZvI01ON7m6rcidKqk-Cc,4834
 bizon/engine/backend/adapters/sqlalchemy/backend.py,sha256=R0CztRGc3_6PdIIgbbrDYD2OJRNhq9PPmD6PYK7-fjk,15567
 bizon/engine/backend/adapters/sqlalchemy/config.py,sha256=K-FpE_-VHnTSAQOduouhXFVy43EkrKbeZLqr9_OfeMw,1846
 bizon/engine/backend/backend.py,sha256=Bodqoo5qJHV0H2zJJeGytaHGiNZmBjnLBxiRgq6M3kE,5844
@@ -25,20 +28,20 @@ bizon/engine/config.py,sha256=cKgI1IfzDncoxG3FsKUz-Aa3fU41ucQPaafjjhKeU90,2039
 bizon/engine/engine.py,sha256=bdQksSQfxkeAHbbe52_MbqTJieOURjlMGYtkCCaDtuc,990
 bizon/engine/pipeline/consumer.py,sha256=HU3G2_h5ZUM217mnKSktdvib2nRc9r8OzvqWodRdFk0,424
 bizon/engine/pipeline/models.py,sha256=kfr_kqkJMEVlWX35rJiYMCuEBCrNhsx9R0a19E39i14,216
-bizon/engine/pipeline/producer.py,sha256=
+bizon/engine/pipeline/producer.py,sha256=k0dzSa6_7PiTJF0UtX0BAfvSTS0h7XFVZ3JN_3_ZvQQ,8330
 bizon/engine/queue/adapters/kafka/config.py,sha256=o7GAb_ls9N0nQV04B6Y4XjLo-Q57x28r63gjFG9LvVg,1091
 bizon/engine/queue/adapters/kafka/consumer.py,sha256=mh25mTjO7w6CGwJDWtxHVocwZi6DbTIVncm81rmhKrw,2576
-bizon/engine/queue/adapters/kafka/queue.py,sha256=
+bizon/engine/queue/adapters/kafka/queue.py,sha256=IS6akN7F81lkAajQdgqSqlqAg3r8uXbw6SdByDgvdMM,1965
 bizon/engine/queue/adapters/python_queue/config.py,sha256=D_CAuWJtdMQmQcm9gq9YBrkeFHAxZKRc7kIISliyp_4,847
 bizon/engine/queue/adapters/python_queue/consumer.py,sha256=yEoDF6QEmr9gjNGxXRqypdIHIJ50lQh_fFDhDXk6_g8,1566
-bizon/engine/queue/adapters/python_queue/queue.py,sha256=
+bizon/engine/queue/adapters/python_queue/queue.py,sha256=VVc5A7qU2wgWEeeG6UOmgkmoIiwZ7GZGjSiBThloFzk,2259
 bizon/engine/queue/adapters/rabbitmq/config.py,sha256=9N_7WREvNjJgcNTC3Y2kHII-iId2MZa3ssHHks6PyAs,987
 bizon/engine/queue/adapters/rabbitmq/consumer.py,sha256=cN6K8wSBIQUSuRD7VsNltS6ElZ32PW92ZXiugzIDPJU,2019
-bizon/engine/queue/adapters/rabbitmq/queue.py,sha256=
+bizon/engine/queue/adapters/rabbitmq/queue.py,sha256=gaTCIY_mCfWt8LCjfEymZuIiwqPkYQoVvaOacRYgLJo,1709
 bizon/engine/queue/config.py,sha256=PN9Je_Q9Sxo-3fI8lI6rZPQ9oeWatnD4rzUTWj3NhnA,792
-bizon/engine/queue/queue.py,sha256=
+bizon/engine/queue/queue.py,sha256=_pOhmDZs79V7XgjthrKsFxZvae_4_cvol97jcZ-YR3g,2926
 bizon/engine/runner/adapters/process.py,sha256=idyknLADcmhCS4614WtyO-FqaYChV243gvjzPWvk0KE,2525
-bizon/engine/runner/adapters/thread.py,sha256=
+bizon/engine/runner/adapters/thread.py,sha256=QyDW-D8fkpYFefKth7OQoDRzURuxYdX7on2NephBLzY,2683
 bizon/engine/runner/config.py,sha256=QPgfy6YnS-EW8nhpTg1aRHshbGz5QTrQ5R3pDmLkIE0,1272
 bizon/engine/runner/runner.py,sha256=1njU4KoFPhkP_oMWewH5bWTxt38Vhz-Y4e5hpcl2jF4,7812
 bizon/source/auth/authenticators/abstract_oauth.py,sha256=gJ40Sbrt0lnHfLupzkzOvUmse3X0Fp2XRHHqjqnVXdI,5274
@@ -51,7 +54,7 @@ bizon/source/auth/builder.py,sha256=hc4zBNj31LZc-QqgIyx1VQEYTm9Xv81vY5pJiwQroJo,
 bizon/source/auth/config.py,sha256=2jjcBLP95XsCkfKxdUei4X2yHI2WX92lJb8D8Txw86g,750
 bizon/source/config.py,sha256=DPwJsBfU48yMvCw-pQCEha4X-IUjvmnQzjTwgsaCxAA,2307
 bizon/source/cursor.py,sha256=TSgWe1T9b4x7EEsbk22hwTWwVXCk5vdrs9eaHNhrevo,3983
-bizon/source/discover.py,sha256=
+bizon/source/discover.py,sha256=C0_SnFxeHpz4VernxAfu2gbnQuoqv0cWX9z5J3WlCKw,11120
 bizon/source/models.py,sha256=iVp0H4muOWGst1W5DuxEVtHIY6lewOV8zDZUqvPTcBk,1337
 bizon/source/session.py,sha256=z4dZlKC_PD8w_utTuAqs1vsfGuRkxHh5WQZhVKamNd0,1979
 bizon/source/source.py,sha256=NhxMU1yXgi7mL64RyeymOYNqRk6fad9v_S8lhvXYUI0,3390
@@ -76,16 +79,16 @@ bizon/sources/hubspot/src/hubspot_objects.py,sha256=EmABx9XD8q6g4Uc5mHLv5YYl5KcI
 bizon/sources/hubspot/src/models/hs_object.py,sha256=-Y20H3-nenJyySMlvM4TPttPz4O8qm3ArKP_I8pxsuo,1235
 bizon/sources/hubspot/tests/hubspot_pipeline.py,sha256=e6dCF5_MHMySkeiF6kKrSAuCa_48J22-ZeSCZSjrfUI,216
 bizon/sources/kafka/config/kafka.example.yml,sha256=ZyHBmSWZ_5WQaBr9WzD05PuE6vi3hhYgHh2VZ-IU-Iw,755
-bizon/sources/kafka/src/source.py,sha256=
-bizon/sources/kafka/tests/kafka_pipeline.py,sha256=
+bizon/sources/kafka/src/source.py,sha256=wPCtrQ7qolaRzOYPUvzp6vuBSdx1I5FBniKjqNyYYJ8,10972
+bizon/sources/kafka/tests/kafka_pipeline.py,sha256=txi2-Tvg4Ydgk6iYp-GqDRXqWj1Sb5rrg9Q0hbBA114,238
 bizon/sources/periscope/config/periscope_charts.example.yml,sha256=rpFDAWeU5oZ3UOiX0sSAgd1X5lv6t-s3iqiDPnRqutU,477
 bizon/sources/periscope/config/periscope_dashboards.example.yml,sha256=sN2iGGqCQCvrMXcwxNGq_dR7-KZ1KtYdXmNYKXlfEpg,481
 bizon/sources/periscope/src/source.py,sha256=AZM-HDDjdTWj8akeeofQ_-G8YlnNHEKi2mjEQSYwOvE,7638
 bizon/sources/periscope/tests/periscope_pipeline_charts.py,sha256=mU0JtfhS1KmWsS3iovGhGxK7iPVWiYzjBM_QfRL3ZQI,275
 bizon/sources/periscope/tests/periscope_pipeline_dashboard.py,sha256=vZKN7UfH-lQIWrnfjPqQFjZm28UIw2m9OSg4yS-Wckk,279
 bizon/utils.py,sha256=HXaPiyxpWKoy3XN5vSYOve1ezlFeOYin3aFqTjcabUQ,81
-bizon-0.0.
-bizon-0.0.
-bizon-0.0.
-bizon-0.0.
-bizon-0.0.
+bizon-0.0.11.dist-info/LICENSE,sha256=AW7SjYVT2bBnXOxgDxqy_e_JF8jDCFlMCaPCF11wFDI,1072
+bizon-0.0.11.dist-info/METADATA,sha256=DyUyvYgGZqHsFiMWoxaTr7OoY0GYbYs5rUx_ueMVfVo,5682
+bizon-0.0.11.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+bizon-0.0.11.dist-info/entry_points.txt,sha256=wtCd-6JswSY8lPWYSvOf7ASX1zfKgmgXtgg5XQS5274,44
+bizon-0.0.11.dist-info/RECORD,,
{bizon-0.0.9.dist-info → bizon-0.0.11.dist-info}/LICENSE
File without changes
{bizon-0.0.9.dist-info → bizon-0.0.11.dist-info}/WHEEL
File without changes
{bizon-0.0.9.dist-info → bizon-0.0.11.dist-info}/entry_points.txt
File without changes