bizon 0.0.10__tar.gz → 0.0.13__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {bizon-0.0.10 → bizon-0.0.13}/PKG-INFO +3 -3
- {bizon-0.0.10 → bizon-0.0.13}/bizon/common/models.py +2 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/destinations/bigquery/src/destination.py +49 -92
- bizon-0.0.13/bizon/destinations/bigquery_streaming/src/config.py +43 -0
- bizon-0.0.13/bizon/destinations/bigquery_streaming/src/destination.py +154 -0
- bizon-0.0.13/bizon/destinations/bigquery_streaming/src/proto_utils.py +91 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/destinations/buffer.py +16 -9
- {bizon-0.0.10 → bizon-0.0.13}/bizon/destinations/config.py +1 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/destinations/destination.py +35 -36
- {bizon-0.0.10 → bizon-0.0.13}/bizon/destinations/file/src/destination.py +4 -6
- {bizon-0.0.10 → bizon-0.0.13}/bizon/destinations/logger/src/destination.py +4 -4
- bizon-0.0.13/bizon/destinations/models.py +31 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/backend/adapters/sqlalchemy/backend.py +1 -1
- {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/backend/backend.py +1 -1
- {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/pipeline/producer.py +39 -6
- {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/queue/adapters/python_queue/config.py +6 -2
- {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/queue/adapters/python_queue/consumer.py +3 -4
- {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/queue/adapters/python_queue/queue.py +9 -5
- {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/queue/config.py +2 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/queue/queue.py +22 -9
- {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/runner/adapters/thread.py +2 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/source/cursor.py +7 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/source/models.py +11 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/kafka/src/source.py +124 -52
- {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/kafka/tests/kafka_pipeline.py +1 -1
- {bizon-0.0.10 → bizon-0.0.13}/pyproject.toml +8 -7
- bizon-0.0.10/bizon/destinations/models.py +0 -83
- {bizon-0.0.10 → bizon-0.0.13}/LICENSE +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/README.md +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/__main__.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/cli/__init__.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/cli/main.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/cli/utils.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/common/errors/backoff.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/common/errors/errors.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/destinations/bigquery/config/bigquery.example.yml +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/destinations/bigquery/src/config.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/destinations/file/src/config.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/destinations/logger/src/config.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/backend/adapters/sqlalchemy/config.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/backend/config.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/backend/models.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/config.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/engine.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/pipeline/consumer.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/pipeline/models.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/queue/adapters/kafka/config.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/queue/adapters/kafka/consumer.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/queue/adapters/kafka/queue.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/queue/adapters/rabbitmq/config.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/queue/adapters/rabbitmq/consumer.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/queue/adapters/rabbitmq/queue.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/runner/adapters/process.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/runner/config.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/runner/runner.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/source/auth/authenticators/abstract_oauth.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/source/auth/authenticators/abstract_token.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/source/auth/authenticators/basic.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/source/auth/authenticators/cookies.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/source/auth/authenticators/oauth.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/source/auth/authenticators/token.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/source/auth/builder.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/source/auth/config.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/source/config.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/source/discover.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/source/session.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/source/source.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/dummy/config/api_key.example.yml +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/dummy/config/api_key_kafka.example.yml +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/dummy/src/fake_api.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/dummy/src/source.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/dummy/tests/dummy_pipeline.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/dummy/tests/dummy_pipeline_bigquery_backend.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/dummy/tests/dummy_pipeline_kafka.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/dummy/tests/dummy_pipeline_rabbitmq.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/dummy/tests/dummy_pipeline_write_data_bigquery.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/dummy/tests/dummy_pipeline_write_data_bigquery_through_kafka.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/gsheets/config/default_auth.example.yml +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/gsheets/config/service_account.example.yml +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/gsheets/src/source.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/gsheets/tests/gsheets_pipeline.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/hubspot/config/api_key.example.yml +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/hubspot/config/oauth.example.yml +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/hubspot/src/hubspot_base.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/hubspot/src/hubspot_objects.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/hubspot/src/models/hs_object.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/hubspot/tests/hubspot_pipeline.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/kafka/config/kafka.example.yml +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/periscope/config/periscope_charts.example.yml +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/periscope/config/periscope_dashboards.example.yml +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/periscope/src/source.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/periscope/tests/periscope_pipeline_charts.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/periscope/tests/periscope_pipeline_dashboard.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/utils.py +0 -0
{bizon-0.0.10 → bizon-0.0.13}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: bizon
-Version: 0.0.10
+Version: 0.0.13
 Summary: Extract and load your data reliably from API Clients with native fault-tolerant and checkpointing mechanism.
 Author: Antoine Balliet
 Author-email: antoine.balliet@gmail.com
@@ -20,7 +20,6 @@ Requires-Dist: backoff (>=2.2.1,<3.0.0)
 Requires-Dist: click (>=8.1.7,<9.0.0)
 Requires-Dist: confluent-kafka (>=2.6.0,<3.0.0) ; extra == "kafka"
 Requires-Dist: dpath (>=2.2.0,<3.0.0)
-Requires-Dist: faker (>=26.0.0,<27.0.0)
 Requires-Dist: fastavro (>=1.9.7,<2.0.0) ; extra == "kafka"
 Requires-Dist: google-cloud-bigquery (>=3.25.0,<4.0.0) ; extra == "bigquery"
 Requires-Dist: google-cloud-bigquery-storage (>=2.25.0,<3.0.0) ; extra == "bigquery"
@@ -28,9 +27,10 @@ Requires-Dist: google-cloud-storage (>=2.17.0,<3.0.0)
 Requires-Dist: gspread (>=6.1.2,<7.0.0) ; extra == "gsheets"
 Requires-Dist: kafka-python (>=2.0.2,<3.0.0) ; extra == "kafka"
 Requires-Dist: loguru (>=0.7.2,<0.8.0)
-Requires-Dist: pandas (>=2.2.2,<3.0.0) ; extra == "bigquery"
 Requires-Dist: pendulum (>=3.0.0,<4.0.0)
 Requires-Dist: pika (>=1.3.2,<2.0.0) ; extra == "rabbitmq"
+Requires-Dist: polars (>=1.16.0,<2.0.0)
+Requires-Dist: protobuf (>=4.24.0,<5.0.0) ; extra == "bigquery"
 Requires-Dist: psycopg2-binary (>=2.9.9,<3.0.0) ; extra == "postgres"
 Requires-Dist: pyarrow (>=16.1.0,<17.0.0)
 Requires-Dist: pydantic (>=2.8.2,<3.0.0)
{bizon-0.0.10 → bizon-0.0.13}/bizon/common/models.py

@@ -3,6 +3,7 @@ from typing import Union
 from pydantic import BaseModel, ConfigDict, Field
 
 from bizon.destinations.bigquery.src.config import BigQueryConfig
+from bizon.destinations.bigquery_streaming.src.config import BigQueryStreamingConfig
 from bizon.destinations.file.src.config import FileDestinationConfig
 from bizon.destinations.logger.src.config import LoggerConfig
 from bizon.engine.config import EngineConfig
@@ -24,6 +25,7 @@ class BizonConfig(BaseModel):
 
     destination: Union[
         BigQueryConfig,
+        BigQueryStreamingConfig,
         LoggerConfig,
         FileDestinationConfig,
     ] = Field(
{bizon-0.0.10 → bizon-0.0.13}/bizon/destinations/bigquery/src/destination.py

@@ -2,22 +2,19 @@ import io
 import json
 import os
 import tempfile
+import traceback
 from typing import List, Tuple
 from uuid import uuid4
 
-import pandas as pd
-import pyarrow as pa
-import pyarrow.parquet as pq
+import polars as pl
 from google.api_core.exceptions import NotFound
 from google.cloud import bigquery, storage
 from google.cloud.bigquery import DatasetReference, TimePartitioning
 from loguru import logger
-from pytz import UTC
 
 from bizon.common.models import SyncMetadata
 from bizon.destinations.config import NormalizationType
 from bizon.destinations.destination import AbstractDestination
-from bizon.destinations.models import DestinationRecord
 from bizon.engine.backend.backend import AbstractBackend
 from bizon.source.config import SourceSyncModes
 
@@ -62,7 +59,7 @@ class BigQueryDestination(AbstractDestination):
         elif self.sync_metadata.sync_mode == SourceSyncModes.STREAM:
             return f"{self.table_id}"
 
-    def get_bigquery_schema(self,
+    def get_bigquery_schema(self, df_destination_records: pl.DataFrame) -> List[bigquery.SchemaField]:
 
         # we keep raw data in the column source_data
         if self.config.normalization.type == NormalizationType.NONE:
@@ -77,26 +74,13 @@ class BigQueryDestination(AbstractDestination):
             bigquery.SchemaField("_bizon_id", "STRING", mode="REQUIRED"),
         ]
 
-        elif self.config.normalization.type == NormalizationType.DEBEZIUM:
-            assert (
-                "_bizon_message_key" in destination_records[0].source_data
-            ), "Debezium records must have a '_bizon_message_key' key"
-            message_keys = json.loads(destination_records[0].source_data["_bizon_message_key"])
-            return [bigquery.SchemaField(key, "STRING", mode="NULLABLE") for key in message_keys] + [
-                bigquery.SchemaField("_source_data", "STRING", mode="NULLABLE"),
-                bigquery.SchemaField("_source_record_id", "STRING", mode="REQUIRED"),
-                bigquery.SchemaField("_source_timestamp", "TIMESTAMP", mode="REQUIRED"),
-                bigquery.SchemaField("_bizon_extracted_at", "TIMESTAMP", mode="REQUIRED"),
-                bigquery.SchemaField(
-                    "_bizon_loaded_at", "TIMESTAMP", mode="REQUIRED", default_value_expression="CURRENT_TIMESTAMP()"
-                ),
-                bigquery.SchemaField("_bizon_id", "STRING", mode="REQUIRED"),
-            ]
-
         # If normalization is tabular, we parse key / value pairs to columns
         elif self.config.normalization.type == NormalizationType.TABULAR:
-
-
+
+            # We use the first record to infer the schema of tabular data (key / value pairs)
+            source_data_keys = list(json.loads(df_destination_records["source_data"][0]).keys())
+
+            return [bigquery.SchemaField(key, "STRING", mode="NULLABLE") for key in source_data_keys] + [
                 bigquery.SchemaField("_source_record_id", "STRING", mode="REQUIRED"),
                 bigquery.SchemaField("_source_timestamp", "TIMESTAMP", mode="REQUIRED"),
                 bigquery.SchemaField("_bizon_extracted_at", "TIMESTAMP", mode="REQUIRED"),
@@ -108,58 +92,6 @@ class BigQueryDestination(AbstractDestination):
 
         raise NotImplementedError(f"Normalization type {self.config.normalization.type} is not supported")
 
-    def get_batch_records_as_df(self, destination_records: List[DestinationRecord]) -> pd.DataFrame:
-
-        # We keep raw data in a column -> convert the SourceRecord to a DestinationRecord
-        if self.config.normalization.type == NormalizationType.NONE:
-            df = pd.DataFrame([record.to_dict_raw_json_data(parquet=True) for record in destination_records])
-            df["_bizon_loaded_at"] = pd.Timestamp.now(tz=UTC)
-
-        # If normalization is tabular, we can just convert the data to a DataFrame parsing first-level keys
-        elif self.config.normalization.type == NormalizationType.TABULAR:
-            list_data_dict = [record.source_data for record in destination_records]
-            df = pd.DataFrame(list_data_dict).astype(str)
-            df["_bizon_id"] = [uuid4().hex for _ in range(len(destination_records))]
-
-            df["_bizon_extracted_at"] = [
-                int(record.source_timestamp.timestamp() * 1_000_000) for record in destination_records
-            ]
-
-            df["_bizon_loaded_at"] = pd.Timestamp.now(tz=UTC)
-
-            df["_source_record_id"] = [record.source_record_id for record in destination_records]
-
-            # We need to convert the source datetime to a int timestamp
-            df["_source_timestamp"] = [
-                int(record.source_timestamp.timestamp() * 1_000_000) for record in destination_records
-            ]
-
-        elif self.config.normalization.type == NormalizationType.DEBEZIUM:
-            df = pd.DataFrame([record.to_dict_debezium(parquet=True) for record in destination_records])
-            df["_bizon_loaded_at"] = pd.Timestamp.now(tz=UTC)
-
-        else:
-            raise NotImplementedError(f"Normalization type {self.config.normalization.type} is not supported")
-
-        return df
-
-    def convert_and_upload_to_buffer(self, destination_records: List[DestinationRecord]):
-
-        df = self.get_batch_records_as_df(destination_records)
-
-        # Convert DataFrame to Parquet in-memory
-        if self.buffer_format == "parquet":
-            table = pa.Table.from_pandas(df)
-            buffer = io.BytesIO()
-            pq.write_table(table, buffer)
-            buffer.seek(0)
-
-            # Upload the Parquet file to GCS
-            file_name = f"{self.sync_metadata.source_name}/{self.sync_metadata.stream_name}/{str(uuid4())}.parquet"
-            blob = self.buffer_bucket.blob(file_name)
-            blob.upload_from_file(buffer, content_type="application/octet-stream")
-            return file_name
-
     def check_connection(self) -> bool:
         dataset_ref = DatasetReference(self.project_id, self.dataset_id)
 
@@ -179,7 +111,25 @@ class BigQueryDestination(AbstractDestination):
         # https://cloud.google.com/python/docs/reference/storage/latest/retry_timeout
         # https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.dbapi.DataError
 
-    def
+    def convert_and_upload_to_buffer(self, df_destination_records: pl.DataFrame) -> str:
+
+        if self.buffer_format == "parquet":
+
+            # Upload the Parquet file to GCS
+            file_name = f"{self.sync_metadata.source_name}/{self.sync_metadata.stream_name}/{str(uuid4())}.parquet"
+
+            with io.BytesIO() as stream:
+                df_destination_records.write_parquet(stream)
+                stream.seek(0)
+
+                blob = self.buffer_bucket.blob(file_name)
+                blob.upload_from_file(stream, content_type="application/octet-stream")
+
+            return file_name
+
+        raise NotImplementedError(f"Buffer format {self.buffer_format} is not supported")
+
+    def load_to_bigquery(self, gcs_file: str, df_destination_records: pl.DataFrame):
 
         # We always partition by the loaded_at field
         time_partitioning = TimePartitioning(field="_bizon_loaded_at", type_=self.config.time_partitioning)
@@ -187,34 +137,41 @@ class BigQueryDestination(AbstractDestination):
         job_config = bigquery.LoadJobConfig(
             source_format=bigquery.SourceFormat.PARQUET,
             write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
-            schema=self.get_bigquery_schema(
+            schema=self.get_bigquery_schema(df_destination_records=df_destination_records),
             time_partitioning=time_partitioning,
         )
 
-        if self.config.normalization.type == NormalizationType.DEBEZIUM:
-            job_config.clustering_fields = list(
-                json.loads(destination_records[0].source_data["_bizon_message_key"]).keys()
-            )
-
         load_job = self.bq_client.load_table_from_uri(
             f"gs://{self.buffer_bucket_name}/{gcs_file}", self.temp_table_id, job_config=job_config
         )
+        result = load_job.result()  # Waits for the job to complete
+        assert result.state == "DONE", f"Job failed with state {result.state} with error {result.error_result}"
+
+    def write_records(self, df_destination_records: pl.DataFrame) -> Tuple[bool, str]:
+
+        # Rename fields to match BigQuery schema
+        df_destination_records = df_destination_records.rename(
+            {
+                # Bizon fields
+                "bizon_extracted_at": "_bizon_extracted_at",
+                "bizon_id": "_bizon_id",
+                "bizon_loaded_at": "_bizon_loaded_at",
+                # Source fields
+                "source_record_id": "_source_record_id",
+                "source_timestamp": "_source_timestamp",
+                "source_data": "_source_data",
+            },
+        )
 
-
-
-    def write_records(self, destination_records: List[DestinationRecord]) -> Tuple[bool, str]:
-
-        # Here we can check if these IDs are already present in BigQuery
-        # Using SourceRecord.id values
-
-        gs_file_name = self.convert_and_upload_to_buffer(destination_records=destination_records)
+        gs_file_name = self.convert_and_upload_to_buffer(df_destination_records=df_destination_records)
 
         try:
-            self.load_to_bigquery(gs_file_name,
+            self.load_to_bigquery(gcs_file=gs_file_name, df_destination_records=df_destination_records)
             self.cleanup(gs_file_name)
         except Exception as e:
             self.cleanup(gs_file_name)
             logger.error(f"Error loading data to BigQuery: {e}")
+            logger.error(traceback.format_exc())
             return False, str(e)
         return True, ""
 
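Note: the reworked `convert_and_upload_to_buffer` above serializes the whole polars buffer to Parquet in memory before uploading it to the GCS buffer bucket. A minimal standalone sketch of that round trip, with placeholder data and the GCS upload omitted:

```python
# Sketch only: placeholder frame, GCS upload omitted.
import io

import polars as pl

df = pl.DataFrame(
    {
        "_source_record_id": ["1", "2"],
        "_source_data": ['{"a": 1}', '{"a": 2}'],
    }
)

with io.BytesIO() as stream:
    df.write_parquet(stream)  # serialize the buffer to Parquet in memory
    stream.seek(0)
    payload = stream.read()   # bytes that would be passed to blob.upload_from_file

print(f"{len(payload)} bytes ready for upload")
```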
bizon-0.0.13/bizon/destinations/bigquery_streaming/src/config.py (new file)

@@ -0,0 +1,43 @@
+from enum import Enum
+from typing import Literal, Optional
+
+from pydantic import BaseModel, Field, field_validator
+
+from bizon.destinations.config import (
+    AbstractDestinationConfig,
+    AbstractDestinationDetailsConfig,
+    DestinationTypes,
+)
+
+
+class TimePartitioning(str, Enum):
+    DAY = "DAY"
+    HOUR = "HOUR"
+    MONTH = "MONTH"
+    YEAR = "YEAR"
+
+
+class BigQueryAuthentication(BaseModel):
+    service_account_key: str = Field(
+        description="Service Account Key JSON string. If empty it will be infered",
+        default="",
+    )
+
+
+class BigQueryStreamingConfigDetails(AbstractDestinationDetailsConfig):
+    project_id: str
+    dataset_id: str
+    dataset_location: Optional[str] = "US"
+    table_id: Optional[str] = Field(
+        default=None, description="Table ID, if not provided it will be inferred from source name"
+    )
+    time_partitioning: Optional[TimePartitioning] = Field(
+        default=TimePartitioning.DAY, description="BigQuery Time partitioning type"
+    )
+    authentication: Optional[BigQueryAuthentication] = None
+    bq_max_rows_per_request: Optional[int] = Field(30000, description="Max rows per buffer streaming request.")
+
+
+class BigQueryStreamingConfig(AbstractDestinationConfig):
+    name: Literal[DestinationTypes.BIGQUERY_STREAMING]
+    config: BigQueryStreamingConfigDetails
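Note: the defaulting behaviour these new models encode can be reproduced with a simplified stand-in. The class below is illustrative only; the real `BigQueryStreamingConfigDetails` also inherits fields from `AbstractDestinationDetailsConfig` that are not shown in this diff.

```python
# Simplified stand-in for BigQueryStreamingConfigDetails (not the real bizon class).
from enum import Enum
from typing import Optional

from pydantic import BaseModel, Field


class TimePartitioning(str, Enum):
    DAY = "DAY"
    HOUR = "HOUR"
    MONTH = "MONTH"
    YEAR = "YEAR"


class StreamingDetails(BaseModel):
    project_id: str
    dataset_id: str
    dataset_location: Optional[str] = "US"
    table_id: Optional[str] = Field(default=None)
    time_partitioning: Optional[TimePartitioning] = TimePartitioning.DAY
    bq_max_rows_per_request: Optional[int] = 30000


details = StreamingDetails(project_id="my-gcp-project", dataset_id="raw_data")  # placeholder values
assert details.time_partitioning is TimePartitioning.DAY  # DAY partitioning by default
assert details.bq_max_rows_per_request == 30000           # default streaming batch size
```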
bizon-0.0.13/bizon/destinations/bigquery_streaming/src/destination.py (new file)

@@ -0,0 +1,154 @@
+import os
+import tempfile
+from concurrent.futures import ThreadPoolExecutor
+from typing import List, Tuple, Type
+
+import polars as pl
+from google.api_core.exceptions import NotFound
+from google.cloud import bigquery, bigquery_storage_v1
+from google.cloud.bigquery import DatasetReference, TimePartitioning
+from google.cloud.bigquery_storage_v1.types import (
+    AppendRowsRequest,
+    ProtoRows,
+    ProtoSchema,
+)
+from google.protobuf.message import Message
+
+from bizon.common.models import SyncMetadata
+from bizon.destinations.config import NormalizationType
+from bizon.destinations.destination import AbstractDestination
+from bizon.engine.backend.backend import AbstractBackend
+
+from .config import BigQueryStreamingConfigDetails
+from .proto_utils import get_proto_schema_and_class
+
+
+class BigQueryStreamingDestination(AbstractDestination):
+
+    def __init__(self, sync_metadata: SyncMetadata, config: BigQueryStreamingConfigDetails, backend: AbstractBackend):
+        super().__init__(sync_metadata, config, backend)
+        self.config: BigQueryStreamingConfigDetails = config
+
+        if config.authentication and config.authentication.service_account_key:
+            with tempfile.NamedTemporaryFile(delete=False) as temp:
+                temp.write(config.authentication.service_account_key.encode())
+                temp_file_path = temp.name
+            os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = temp_file_path
+
+        self.project_id = config.project_id
+        self.bq_client = bigquery.Client(project=self.project_id)
+        self.bq_storage_client = bigquery_storage_v1.BigQueryWriteClient()
+        self.dataset_id = config.dataset_id
+        self.dataset_location = config.dataset_location
+        self.bq_max_rows_per_request = config.bq_max_rows_per_request
+
+    @property
+    def table_id(self) -> str:
+        tabled_id = self.config.table_id or f"{self.sync_metadata.source_name}_{self.sync_metadata.stream_name}"
+        return f"{self.project_id}.{self.dataset_id}.{tabled_id}"
+
+    def get_bigquery_schema(self) -> List[bigquery.SchemaField]:
+
+        # we keep raw data in the column source_data
+        if self.config.normalization.type == NormalizationType.NONE:
+            return [
+                bigquery.SchemaField("_source_record_id", "STRING", mode="REQUIRED"),
+                bigquery.SchemaField("_source_timestamp", "TIMESTAMP", mode="REQUIRED"),
+                bigquery.SchemaField("_source_data", "STRING", mode="NULLABLE"),
+                bigquery.SchemaField("_bizon_extracted_at", "TIMESTAMP", mode="REQUIRED"),
+                bigquery.SchemaField(
+                    "_bizon_loaded_at", "TIMESTAMP", mode="REQUIRED", default_value_expression="CURRENT_TIMESTAMP()"
+                ),
+                bigquery.SchemaField("_bizon_id", "STRING", mode="REQUIRED"),
+            ]
+
+        raise NotImplementedError(f"Normalization type {self.config.normalization.type} is not supported")
+
+    def check_connection(self) -> bool:
+        dataset_ref = DatasetReference(self.project_id, self.dataset_id)
+
+        try:
+            self.bq_client.get_dataset(dataset_ref)
+        except NotFound:
+            dataset = bigquery.Dataset(dataset_ref)
+            dataset.location = self.dataset_location
+            dataset = self.bq_client.create_dataset(dataset)
+        return True
+
+    def append_rows_to_stream(
+        self,
+        write_client: bigquery_storage_v1.BigQueryWriteClient,
+        stream_name: str,
+        proto_schema: ProtoSchema,
+        serialized_rows: List[bytes],
+    ):
+        request = AppendRowsRequest(
+            write_stream=stream_name,
+            proto_rows=AppendRowsRequest.ProtoData(
+                rows=ProtoRows(serialized_rows=serialized_rows),
+                writer_schema=proto_schema,
+            ),
+        )
+        response = write_client.append_rows(iter([request]))
+        return response.code().name
+
+    @staticmethod
+    def to_protobuf_serialization(TableRowClass: Type[Message], row: dict) -> bytes:
+        """Convert a row to a protobuf serialization"""
+        record = TableRowClass()
+        record._bizon_id = row["bizon_id"]
+        record._bizon_extracted_at = row["bizon_extracted_at"].strftime("%Y-%m-%d %H:%M:%S.%f")
+        record._bizon_loaded_at = row["bizon_loaded_at"].strftime("%Y-%m-%d %H:%M:%S.%f")
+        record._source_record_id = row["source_record_id"]
+        record._source_timestamp = row["source_timestamp"].strftime("%Y-%m-%d %H:%M:%S.%f")
+        record._source_data = row["source_data"]
+        return record.SerializeToString()
+
+    def load_to_bigquery_via_streaming(self, df_destination_records: pl.DataFrame) -> str:
+        # TODO: for now no clustering keys
+        clustering_keys = []
+
+        # Create table if it doesnt exist
+        schema = self.get_bigquery_schema()
+        table = bigquery.Table(self.table_id, schema=schema)
+        time_partitioning = TimePartitioning(field="_bizon_loaded_at", type_=self.config.time_partitioning)
+        table.time_partitioning = time_partitioning
+
+        table = self.bq_client.create_table(table, exists_ok=True)
+
+        # Create the stream
+        write_client = self.bq_storage_client
+        tabled_id = self.config.table_id or f"{self.sync_metadata.source_name}_{self.sync_metadata.stream_name}"
+        parent = write_client.table_path(self.project_id, self.dataset_id, tabled_id)
+        stream_name = f"{parent}/_default"
+
+        # Generating the protocol buffer representation of the message descriptor.
+        proto_schema, TableRow = get_proto_schema_and_class(clustering_keys)
+
+        serialized_rows = [
+            self.to_protobuf_serialization(TableRowClass=TableRow, row=row)
+            for row in df_destination_records.iter_rows(named=True)
+        ]
+
+        results = []
+        with ThreadPoolExecutor() as executor:
+            futures = [
+                executor.submit(self.append_rows_to_stream, write_client, stream_name, proto_schema, batch_rows)
+                for batch_rows in self.batch(serialized_rows)
+            ]
+            for future in futures:
+                results.append(future.result())
+
+        assert all([r == "OK" for r in results]) is True, "Failed to append rows to stream"
+
+    def write_records(self, df_destination_records: pl.DataFrame) -> Tuple[bool, str]:
+        self.load_to_bigquery_via_streaming(df_destination_records=df_destination_records)
+        return True, ""
+
+    def batch(self, iterable):
+        """
+        Yield successive batches of size `batch_size` from `iterable`.
+        """
+
+        for i in range(0, len(iterable), self.bq_max_rows_per_request):
+            yield iterable[i : i + self.bq_max_rows_per_request]  # noqa
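Note: the `batch` helper above caps each AppendRows request at `bq_max_rows_per_request` serialized rows. The same chunking logic, re-implemented standalone for illustration:

```python
# Standalone re-implementation of the chunking logic, for illustration only.
from typing import Iterator, List


def batch(rows: List[bytes], max_rows: int = 30000) -> Iterator[List[bytes]]:
    # Yield slices of at most `max_rows` items, preserving order.
    for i in range(0, len(rows), max_rows):
        yield rows[i : i + max_rows]


chunks = list(batch([b"row"] * 70_000, max_rows=30_000))
assert [len(c) for c in chunks] == [30_000, 30_000, 10_000]
```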
bizon-0.0.13/bizon/destinations/bigquery_streaming/src/proto_utils.py (new file)

@@ -0,0 +1,91 @@
+from typing import List, Tuple, Type
+
+from google.cloud.bigquery_storage_v1.types import ProtoSchema
+from google.protobuf.descriptor_pb2 import (
+    DescriptorProto,
+    FieldDescriptorProto,
+    FileDescriptorProto,
+)
+from google.protobuf.descriptor_pool import DescriptorPool
+from google.protobuf.message import Message
+from google.protobuf.message_factory import GetMessageClassesForFiles
+
+
+def get_proto_schema_and_class(clustering_keys: List[str] = None) -> Tuple[ProtoSchema, Type[Message]]:
+    # Define the FileDescriptorProto
+    file_descriptor_proto = FileDescriptorProto()
+    file_descriptor_proto.name = "dynamic.proto"
+    file_descriptor_proto.package = "dynamic_package"
+
+    # Define the TableRow message schema
+    message_descriptor = DescriptorProto()
+    message_descriptor.name = "TableRow"
+
+    # Add fields to the message, only use TYPE_STRING, BigQuery does not support other types
+    # It does not imapact data types in final table
+
+    # https://stackoverflow.com/questions/70489919/protobuf-type-for-bigquery-timestamp-field
+    fields = [
+        {"name": "_bizon_id", "type": FieldDescriptorProto.TYPE_STRING, "label": FieldDescriptorProto.LABEL_REQUIRED},
+        {
+            "name": "_bizon_extracted_at",
+            "type": FieldDescriptorProto.TYPE_STRING,
+            "label": FieldDescriptorProto.LABEL_REQUIRED,
+        },
+        {
+            "name": "_bizon_loaded_at",
+            "type": FieldDescriptorProto.TYPE_STRING,
+            "label": FieldDescriptorProto.LABEL_REQUIRED,
+        },
+        {
+            "name": "_source_record_id",
+            "type": FieldDescriptorProto.TYPE_STRING,
+            "label": FieldDescriptorProto.LABEL_REQUIRED,
+        },
+        {
+            "name": "_source_timestamp",
+            "type": FieldDescriptorProto.TYPE_STRING,
+            "label": FieldDescriptorProto.LABEL_REQUIRED,
+        },
+        {
+            "name": "_source_data",
+            "type": FieldDescriptorProto.TYPE_STRING,
+            "label": FieldDescriptorProto.LABEL_OPTIONAL,
+        },
+    ]
+
+    if clustering_keys:
+        for key in clustering_keys:
+            fields.append(
+                {
+                    "name": key,
+                    "type": FieldDescriptorProto.TYPE_STRING,
+                    "label": FieldDescriptorProto.LABEL_OPTIONAL,
+                }
+            )
+
+    for i, field in enumerate(fields, start=1):
+        field_descriptor = message_descriptor.field.add()
+        field_descriptor.name = field["name"]
+        field_descriptor.number = i
+        field_descriptor.type = field["type"]
+        field_descriptor.label = field["label"]
+
+    # Add the message to the file descriptor
+    file_descriptor_proto.message_type.add().CopyFrom(message_descriptor)
+
+    # Create a DescriptorPool and register the FileDescriptorProto
+    pool = DescriptorPool()
+    pool.Add(file_descriptor_proto)
+
+    # Use the registered file name to fetch the message classes
+    message_classes = GetMessageClassesForFiles(["dynamic.proto"], pool=pool)
+
+    # Fetch the TableRow class
+    table_row_class = message_classes["dynamic_package.TableRow"]
+
+    # Create the ProtoSchema
+    proto_schema = ProtoSchema()
+    proto_schema.proto_descriptor.CopyFrom(message_descriptor)
+
+    return proto_schema, table_row_class
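Note: assuming bizon 0.0.13 is installed with the `bigquery` extra, the dynamically generated `TableRow` class can be exercised as follows; all field values are placeholders.

```python
# Usage sketch; field names come from the descriptor defined above, values are placeholders.
from bizon.destinations.bigquery_streaming.src.proto_utils import get_proto_schema_and_class

proto_schema, TableRow = get_proto_schema_and_class(clustering_keys=None)

row = TableRow()
row._bizon_id = "3f2ab4c0d1e2f3a4b5c6d7e8f9a0b1c2"       # placeholder UUID hex
row._bizon_extracted_at = "2024-01-01 00:00:00.000000"
row._bizon_loaded_at = "2024-01-01 00:00:01.000000"
row._source_record_id = "42"
row._source_timestamp = "2024-01-01 00:00:00.000000"
row._source_data = '{"hello": "world"}'

serialized = row.SerializeToString()  # bytes suitable for ProtoRows(serialized_rows=[...])
```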
{bizon-0.0.10 → bizon-0.0.13}/bizon/destinations/buffer.py

@@ -1,8 +1,11 @@
-import sys
 from datetime import datetime
 from typing import List
 
-from
+from loguru import logger
+from polars import DataFrame
+from pytz import UTC
+
+from .models import destination_record_schema
 
 
 class DestinationBuffer:
@@ -10,15 +13,15 @@ class DestinationBuffer:
     def __init__(self, buffer_size: int, buffer_flush_timeout: int) -> None:
         self.buffer_size = buffer_size * 1024 * 1024  # Convert to bytes
         self.buffer_flush_timeout = buffer_flush_timeout
-        self.
+        self.df_destination_records: DataFrame = DataFrame(schema=destination_record_schema)
         self._iterations: List[int] = []
         self.pagination = {}
-        self.modified_at: List[datetime] = [datetime.
+        self.modified_at: List[datetime] = [datetime.now(tz=UTC)]
 
     @property
     def current_size(self) -> int:
         """Return buffer size"""
-        return
+        return self.df_destination_records.estimated_size(unit="b")
 
     @property
     def buffer_free_space_pct(self) -> float:
@@ -61,16 +64,20 @@ class DestinationBuffer:
 
     def flush(self):
         """Flush buffer"""
-        self.
+        self.df_destination_records = DataFrame(schema=destination_record_schema)
         self._iterations = []
         self.pagination = {}
         self.modified_at = []
 
     def add_source_iteration_records_to_buffer(
-        self, iteration: int,
+        self, iteration: int, df_destination_records: DataFrame, pagination: dict = None
     ):
         """Add records for the given iteration to buffer"""
-        self.
+        self.df_destination_records.vstack(df_destination_records, in_place=True)
         self._iterations.append(iteration)
         self.pagination = pagination
-        self.modified_at.append(datetime.
+        self.modified_at.append(datetime.now(tz=UTC))
+
+        logger.info(
+            f"Added {df_destination_records.height} records to buffer for iteration {iteration} - {self.df_destination_records.estimated_size(unit='mb')} MB"
+        )