bizon 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bizon/common/models.py +2 -0
- bizon/connectors/destinations/bigquery/src/config.py +1 -0
- bizon/connectors/destinations/bigquery/src/destination.py +3 -1
- bizon/connectors/destinations/bigquery_streaming/src/config.py +6 -5
- bizon/connectors/destinations/bigquery_streaming/src/destination.py +9 -4
- bizon/connectors/destinations/bigquery_streaming_v2/src/config.py +6 -1
- bizon/connectors/destinations/bigquery_streaming_v2/src/destination.py +230 -45
- bizon/connectors/destinations/bigquery_streaming_v2/src/proto_utils.py +1 -13
- bizon/connectors/destinations/file/src/config.py +1 -0
- bizon/connectors/destinations/file/src/destination.py +3 -1
- bizon/connectors/destinations/logger/src/config.py +1 -0
- bizon/connectors/destinations/logger/src/destination.py +3 -0
- bizon/connectors/sources/kafka/config/kafka.example.yml +1 -3
- bizon/connectors/sources/kafka/config/kafka_debezium.example.yml +1 -3
- bizon/connectors/sources/kafka/src/config.py +0 -6
- bizon/connectors/sources/kafka/src/decode.py +71 -66
- bizon/connectors/sources/kafka/src/source.py +44 -24
- bizon/connectors/sources/kafka/tests/kafka_pipeline.py +1 -1
- bizon/destination/config.py +9 -0
- bizon/destination/destination.py +37 -5
- bizon/engine/runner/adapters/streaming.py +60 -42
- bizon/engine/runner/runner.py +14 -7
- bizon/monitoring/config.py +12 -2
- bizon/monitoring/datadog/monitor.py +98 -14
- bizon/monitoring/monitor.py +41 -12
- bizon/monitoring/noop/monitor.py +22 -3
- bizon/source/source.py +1 -1
- {bizon-0.1.1.dist-info → bizon-0.1.2.dist-info}/METADATA +2 -1
- {bizon-0.1.1.dist-info → bizon-0.1.2.dist-info}/RECORD +32 -32
- {bizon-0.1.1.dist-info → bizon-0.1.2.dist-info}/WHEEL +1 -1
- {bizon-0.1.1.dist-info → bizon-0.1.2.dist-info}/LICENSE +0 -0
- {bizon-0.1.1.dist-info → bizon-0.1.2.dist-info}/entry_points.txt +0 -0
bizon/common/models.py
CHANGED
@@ -75,6 +75,7 @@ class SyncMetadata(BaseModel):
     stream_name: str
     sync_mode: SourceSyncModes
     destination_name: str
+    destination_alias: str

     @classmethod
     def from_bizon_config(cls, job_id: str, config: BizonConfig) -> "SyncMetadata":
@@ -85,4 +86,5 @@ class SyncMetadata(BaseModel):
             stream_name=config.source.stream,
             sync_mode=config.source.sync_mode,
             destination_name=config.destination.name,
+            destination_alias=config.destination.alias,
         )
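Editor's note: the two hunks above introduce a destination alias that travels from the destination config into the sync metadata. A minimal sketch of that pattern, using simplified stand-in models (not the package's actual `BizonConfig`/`SyncMetadata` classes):

```python
# Sketch only: simplified stand-ins for the config and metadata models changed above.
from pydantic import BaseModel


class DestinationConfig(BaseModel):
    name: str
    alias: str = "bigquery"  # field added in 0.1.2


class SyncMetadata(BaseModel):
    destination_name: str
    destination_alias: str  # field added in 0.1.2

    @classmethod
    def from_destination(cls, destination: DestinationConfig) -> "SyncMetadata":
        # The alias now travels alongside the connector name.
        return cls(destination_name=destination.name, destination_alias=destination.alias)


meta = SyncMetadata.from_destination(DestinationConfig(name="bigquery_streaming"))
assert meta.destination_alias == "bigquery"
```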
bizon/connectors/destinations/bigquery/src/config.py
CHANGED
@@ -123,5 +123,6 @@ class BigQueryConfigDetails(AbstractDestinationDetailsConfig):

 class BigQueryConfig(AbstractDestinationConfig):
     name: Literal[DestinationTypes.BIGQUERY]
+    alias: str = "bigquery"
     buffer_size: Optional[int] = 400
     config: BigQueryConfigDetails
bizon/connectors/destinations/bigquery/src/destination.py
CHANGED
@@ -14,6 +14,7 @@ from loguru import logger
 from bizon.common.models import SyncMetadata
 from bizon.destination.destination import AbstractDestination
 from bizon.engine.backend.backend import AbstractBackend
+from bizon.monitoring.monitor import AbstractMonitor
 from bizon.source.config import SourceSyncModes
 from bizon.source.source import AbstractSourceCallback

@@ -28,8 +29,9 @@ class BigQueryDestination(AbstractDestination):
         config: BigQueryConfigDetails,
         backend: AbstractBackend,
         source_callback: AbstractSourceCallback,
+        monitor: AbstractMonitor,
     ):
-        super().__init__(sync_metadata, config, backend, source_callback)
+        super().__init__(sync_metadata, config, backend, source_callback, monitor)
         self.config: BigQueryConfigDetails = config

         if config.authentication and config.authentication.service_account_key:
bizon/connectors/destinations/bigquery_streaming/src/config.py
CHANGED
@@ -41,16 +41,17 @@ class BigQueryStreamingConfigDetails(AbstractDestinationDetailsConfig):
         description="BigQuery Time partitioning type",
     )
     authentication: Optional[BigQueryAuthentication] = None
-    bq_max_rows_per_request: Optional[int] = Field(
+    bq_max_rows_per_request: Optional[int] = Field(
+        5000,
+        description="Max rows per buffer streaming request. Must not exceed 10000.",
+        le=10000,
+    )
     record_schemas: Optional[list[BigQueryRecordSchemaConfig]] = Field(
         default=None, description="Schema for the records. Required if unnest is set to true."
     )
-    use_legacy_streaming_api: bool = Field(
-        default=False,
-        description="[DEPRECATED] Use the legacy streaming API. This is required for some older BigQuery versions.",
-    )


 class BigQueryStreamingConfig(AbstractDestinationConfig):
     name: Literal[DestinationTypes.BIGQUERY_STREAMING]
+    alias: str = "bigquery"
     config: BigQueryStreamingConfigDetails
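Editor's note: `bq_max_rows_per_request` is now an explicit, validated config field (default 5000, upper bound 10000) rather than a hard-coded class constant. A minimal sketch of how a pydantic `le` bound behaves, using an illustrative model rather than the package's config class:

```python
# Sketch only: demonstrates the le=10000 bound used by the new bq_max_rows_per_request field.
from typing import Optional

from pydantic import BaseModel, Field, ValidationError


class StreamingDetails(BaseModel):
    bq_max_rows_per_request: Optional[int] = Field(
        5000,
        description="Max rows per buffer streaming request. Must not exceed 10000.",
        le=10000,
    )


print(StreamingDetails().bq_max_rows_per_request)                        # 5000 (default)
print(StreamingDetails(bq_max_rows_per_request=8000).bq_max_rows_per_request)  # accepted

try:
    StreamingDetails(bq_max_rows_per_request=20000)  # rejected by the le=10000 constraint
except ValidationError as e:
    print(e)
```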
bizon/connectors/destinations/bigquery_streaming/src/destination.py
CHANGED
@@ -36,6 +36,7 @@ from bizon.connectors.destinations.bigquery.src.config import (
 )
 from bizon.destination.destination import AbstractDestination
 from bizon.engine.backend.backend import AbstractBackend
+from bizon.monitoring.monitor import AbstractMonitor
 from bizon.source.callback import AbstractSourceCallback

 from .config import BigQueryStreamingConfigDetails
@@ -44,7 +45,6 @@ from .config import BigQueryStreamingConfigDetails
 class BigQueryStreamingDestination(AbstractDestination):

     # Add constants for limits
-    MAX_ROWS_PER_REQUEST = 5000  # 5000 (max is 10000)
     MAX_REQUEST_SIZE_BYTES = 5 * 1024 * 1024  # 5 MB (max is 10MB)
     MAX_ROW_SIZE_BYTES = 0.9 * 1024 * 1024  # 1 MB

@@ -54,8 +54,9 @@ class BigQueryStreamingDestination(AbstractDestination):
         config: BigQueryStreamingConfigDetails,
         backend: AbstractBackend,
         source_callback: AbstractSourceCallback,
+        monitor: AbstractMonitor,
     ):  # type: ignore
-        super().__init__(sync_metadata, config, backend, source_callback)
+        super().__init__(sync_metadata, config, backend, source_callback, monitor)
         self.config: BigQueryStreamingConfigDetails = config

         if config.authentication and config.authentication.service_account_key:
@@ -222,7 +223,7 @@ class BigQueryStreamingDestination(AbstractDestination):
            try:
                # Handle streaming batch
                if batch.get("stream_batch") and len(batch["stream_batch"]) > 0:
-
+                    self.bq_client.insert_rows_json(
                        table,
                        batch["stream_batch"],
                        row_ids=[None] * len(batch["stream_batch"]),
@@ -245,6 +246,10 @@ class BigQueryStreamingDestination(AbstractDestination):
                    if load_job.state != "DONE":
                        raise Exception(f"Failed to load rows to BigQuery: {load_job.errors}")

+                    self.monitor.track_large_records_synced(
+                        num_records=len(batch["json_batch"]), extra_tags={"destination_id": self.destination_id}
+                    )
+
            except Exception as e:
                logger.error(f"Error inserting batch: {str(e)}, type: {type(e)}")
                raise
@@ -347,7 +352,7 @@ class BigQueryStreamingDestination(AbstractDestination):

            # If adding this item would exceed either limit, yield current batch and start new one
            if (
-                len(current_batch) >= self.
+                len(current_batch) >= self.bq_max_rows_per_request
                or current_batch_size + item_size > self.MAX_REQUEST_SIZE_BYTES
            ):
                logger.debug(f"Yielding batch of {len(current_batch)} rows, size: {current_batch_size/1024/1024:.2f}MB")
bizon/connectors/destinations/bigquery_streaming_v2/src/config.py
CHANGED
@@ -41,7 +41,11 @@ class BigQueryStreamingV2ConfigDetails(AbstractDestinationDetailsConfig):
         description="BigQuery Time partitioning type",
     )
     authentication: Optional[BigQueryAuthentication] = None
-    bq_max_rows_per_request: Optional[int] = Field(
+    bq_max_rows_per_request: Optional[int] = Field(
+        5000,
+        description="Max rows per buffer streaming request. Must not exceed 10000.",
+        le=10000,
+    )
     record_schemas: Optional[list[BigQueryRecordSchemaConfig]] = Field(
         default=None, description="Schema for the records. Required if unnest is set to true."
     )
@@ -49,4 +53,5 @@ class BigQueryStreamingV2ConfigDetails(AbstractDestinationDetailsConfig):

 class BigQueryStreamingV2Config(AbstractDestinationConfig):
     name: Literal[DestinationTypes.BIGQUERY_STREAMING_V2]
+    alias: str = "bigquery"
     config: BigQueryStreamingV2ConfigDetails
bizon/connectors/destinations/bigquery_streaming_v2/src/destination.py
CHANGED
@@ -1,25 +1,44 @@
 import os
 import tempfile
-from concurrent.futures import ThreadPoolExecutor
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from datetime import datetime
 from typing import List, Tuple, Type

+import orjson
 import polars as pl
-
-from google.
+import urllib3.exceptions
+from google.api_core.client_options import ClientOptions
+from google.api_core.exceptions import (
+    Conflict,
+    InvalidArgument,
+    NotFound,
+    RetryError,
+    ServerError,
+    ServiceUnavailable,
+)
+from google.cloud import bigquery
 from google.cloud.bigquery import DatasetReference, TimePartitioning
+from google.cloud.bigquery_storage_v1 import BigQueryWriteClient
 from google.cloud.bigquery_storage_v1.types import (
     AppendRowsRequest,
     ProtoRows,
     ProtoSchema,
 )
-from google.protobuf.json_format import ParseDict
-from google.protobuf.message import Message
+from google.protobuf.json_format import MessageToDict, ParseDict, ParseError
+from google.protobuf.message import EncodeError, Message
 from loguru import logger
+from requests.exceptions import ConnectionError, SSLError, Timeout
+from tenacity import (
+    retry,
+    retry_if_exception_type,
+    stop_after_attempt,
+    wait_exponential,
+)

 from bizon.common.models import SyncMetadata
 from bizon.destination.destination import AbstractDestination
 from bizon.engine.backend.backend import AbstractBackend
+from bizon.monitoring.monitor import AbstractMonitor
 from bizon.source.callback import AbstractSourceCallback

 from .config import BigQueryStreamingV2ConfigDetails
@@ -29,9 +48,8 @@ from .proto_utils import get_proto_schema_and_class
 class BigQueryStreamingV2Destination(AbstractDestination):

     # Add constants for limits
-
-
-    MAX_ROW_SIZE_BYTES = 0.9 * 1024 * 1024  # 1 MB
+    MAX_REQUEST_SIZE_BYTES = 9.5 * 1024 * 1024  # 9.5 MB (max is 10MB)
+    MAX_ROW_SIZE_BYTES = 8 * 1024 * 1024  # 8 MB (max is 10MB)

     def __init__(
         self,
@@ -39,8 +57,9 @@ class BigQueryStreamingV2Destination(AbstractDestination):
         config: BigQueryStreamingV2ConfigDetails,
         backend: AbstractBackend,
         source_callback: AbstractSourceCallback,
+        monitor: AbstractMonitor,
     ):  # type: ignore
-        super().__init__(sync_metadata, config, backend, source_callback)
+        super().__init__(sync_metadata, config, backend, source_callback, monitor)
         self.config: BigQueryStreamingV2ConfigDetails = config

         if config.authentication and config.authentication.service_account_key:
@@ -51,10 +70,12 @@ class BigQueryStreamingV2Destination(AbstractDestination):

         self.project_id = config.project_id
         self.bq_client = bigquery.Client(project=self.project_id)
-        self.bq_storage_client = bigquery_storage_v1.BigQueryWriteClient()
         self.dataset_id = config.dataset_id
         self.dataset_location = config.dataset_location
         self.bq_max_rows_per_request = config.bq_max_rows_per_request
+        self.bq_storage_client_options = ClientOptions(
+            quota_project_id=self.project_id,
+        )

     @property
     def table_id(self) -> str:
@@ -102,13 +123,35 @@ class BigQueryStreamingV2Destination(AbstractDestination):
         dataset = self.bq_client.create_dataset(dataset)
         return True

+    @retry(
+        retry=retry_if_exception_type(
+            (
+                ServerError,
+                ServiceUnavailable,
+                SSLError,
+                ConnectionError,
+                Timeout,
+                RetryError,
+                urllib3.exceptions.ProtocolError,
+                urllib3.exceptions.SSLError,
+                InvalidArgument,
+            )
+        ),
+        wait=wait_exponential(multiplier=2, min=4, max=120),
+        stop=stop_after_attempt(8),
+        before_sleep=lambda retry_state: logger.warning(
+            f"Streaming append attempt {retry_state.attempt_number} failed. "
+            f"Retrying in {retry_state.next_action.sleep} seconds..."
+        ),
+    )
     def append_rows_to_stream(
         self,
-        write_client: bigquery_storage_v1.BigQueryWriteClient,
         stream_name: str,
         proto_schema: ProtoSchema,
         serialized_rows: List[bytes],
     ):
+        write_client = BigQueryWriteClient(client_options=self.bq_storage_client_options)
+
         request = AppendRowsRequest(
             write_stream=stream_name,
             proto_rows=AppendRowsRequest.ProtoData(
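Editor's note: `append_rows_to_stream` is now wrapped in a tenacity retry decorator that backs off exponentially on transient network/API errors and logs a warning before each sleep. A self-contained sketch of the same pattern applied to a toy flaky function (not the BigQuery call itself):

```python
# Sketch only: the tenacity retry pattern used above, applied to a toy flaky function.
from loguru import logger
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential

attempts = {"n": 0}


@retry(
    retry=retry_if_exception_type((ConnectionError, TimeoutError)),
    wait=wait_exponential(multiplier=2, min=1, max=10),
    stop=stop_after_attempt(5),
    before_sleep=lambda retry_state: logger.warning(
        f"Attempt {retry_state.attempt_number} failed. "
        f"Retrying in {retry_state.next_action.sleep} seconds..."
    ),
)
def flaky_append() -> str:
    # Fail twice with a retryable error, then succeed.
    attempts["n"] += 1
    if attempts["n"] < 3:
        raise ConnectionError("transient network error")
    return "OK"


print(flaky_append())  # "OK" after two retried failures
```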
@@ -116,11 +159,26 @@ class BigQueryStreamingV2Destination(AbstractDestination):
                 writer_schema=proto_schema,
             ),
         )
-
-
+        try:
+            response = write_client.append_rows(iter([request]))
+            return response.code().name
+        except Exception as e:
+            logger.error(f"Error in append_rows_to_stream: {str(e)}")
+            logger.error(f"Stream name: {stream_name}")
+            raise

     def safe_cast_record_values(self, row: dict):
+        """
+        Safe cast record values to the correct type for BigQuery.
+        """
         for col in self.record_schemas[self.destination_id]:
+
+            # Handle dicts as strings
+            if col.type in ["STRING", "JSON"]:
+                if isinstance(row[col.name], dict) or isinstance(row[col.name], list):
+                    row[col.name] = orjson.dumps(row[col.name]).decode("utf-8")
+
+            # Handle timestamps
             if col.type in ["TIMESTAMP", "DATETIME"] and col.default_value_expression is None:
                 if isinstance(row[col.name], int):
                     if row[col.name] > datetime(9999, 12, 31).timestamp():
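Editor's note: `safe_cast_record_values` now serializes dict and list values with orjson before they land in STRING/JSON columns, in addition to the existing timestamp normalization. A small sketch of the dict-to-string part, using toy column metadata rather than the package's `record_schemas` objects:

```python
# Sketch only: casting dict/list values to JSON strings the way safe_cast_record_values does.
import orjson

row = {"payload": {"user": "a", "tags": ["x", "y"]}, "name": "event"}

for col_name, col_type in [("payload", "JSON"), ("name", "STRING")]:
    value = row[col_name]
    if col_type in ["STRING", "JSON"] and isinstance(value, (dict, list)):
        # orjson.dumps returns bytes, so decode to a UTF-8 string for BigQuery.
        row[col_name] = orjson.dumps(value).decode("utf-8")

print(row["payload"])  # '{"user":"a","tags":["x","y"]}'
```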
@@ -143,15 +201,102 @@ class BigQueryStreamingV2Destination(AbstractDestination):
     @staticmethod
     def to_protobuf_serialization(TableRowClass: Type[Message], row: dict) -> bytes:
         """Convert a row to a Protobuf serialization."""
-
-
+        try:
+            record = ParseDict(row, TableRowClass())
+        except ParseError as e:
+            logger.error(f"Error serializing record: {e} for row: {row}.")
+            raise e

-
+        try:
+            serialized_record = record.SerializeToString()
+        except EncodeError as e:
+            logger.error(f"Error serializing record: {e} for row: {row}.")
+            raise e
+        return serialized_record

-
-
+    @staticmethod
+    def from_protobuf_serialization(
+        TableRowClass: Type[Message],
+        serialized_data: bytes,
+    ) -> dict:
+        """Convert protobuf serialization back to a dictionary."""
+        record = TableRowClass()
+        record.ParseFromString(serialized_data)
+        return MessageToDict(record, preserving_proto_field_name=True)
+
+    @retry(
+        retry=retry_if_exception_type(
+            (
+                ServerError,
+                ServiceUnavailable,
+                SSLError,
+                ConnectionError,
+                Timeout,
+                RetryError,
+                urllib3.exceptions.ProtocolError,
+                urllib3.exceptions.SSLError,
+            )
+        ),
+        wait=wait_exponential(multiplier=2, min=4, max=120),
+        stop=stop_after_attempt(8),
+        before_sleep=lambda retry_state: logger.warning(
+            f"Attempt {retry_state.attempt_number} failed. Retrying in {retry_state.next_action.sleep} seconds..."
+        ),
+    )
+    def process_streaming_batch(
+        self,
+        stream_name: str,
+        proto_schema: ProtoSchema,
+        batch: dict,
+        table_row_class: Type[Message],
+    ) -> List[Tuple[str, str]]:
+        """Process a single batch for streaming and/or large rows with retry logic."""
+        results = []
+        try:
+            # Handle streaming batch
+            if batch.get("stream_batch") and len(batch["stream_batch"]) > 0:
+                result = self.append_rows_to_stream(stream_name, proto_schema, batch["stream_batch"])
+                results.append(("streaming", result))
+
+            # Handle large rows batch
+            if batch.get("json_batch") and len(batch["json_batch"]) > 0:
+                # Deserialize protobuf bytes back to JSON for the load job
+                deserialized_rows = []
+                for serialized_row in batch["json_batch"]:
+                    deserialized_row = self.from_protobuf_serialization(table_row_class, serialized_row)
+                    deserialized_rows.append(deserialized_row)
+
+                # For large rows, we need to use the main client
+                job_config = bigquery.LoadJobConfig(
+                    source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
+                    schema=self.bq_client.get_table(self.table_id).schema,
+                    ignore_unknown_values=True,
+                )
+                load_job = self.bq_client.load_table_from_json(
+                    deserialized_rows, self.table_id, job_config=job_config, timeout=300
+                )
+                result = load_job.result()
+                if load_job.state != "DONE":
+                    raise Exception(f"Failed to load rows to BigQuery: {load_job.errors}")
+
+                # Track large rows
+                self.monitor.track_large_records_synced(
+                    num_records=len(batch["json_batch"]), extra_tags={"destination_id": self.destination_id}
+                )
+
+                results.append(("large_rows", "DONE"))
+
+            if not results:
+                results.append(("empty", "SKIPPED"))

-
+            return results
+        except Exception as e:
+            logger.error(f"Error processing batch: {str(e)}")
+            raise
+
+    def load_to_bigquery_via_streaming(self, df_destination_records: pl.DataFrame) -> str:
+
+        # Create table if it does not exist
         schema = self.get_bigquery_schema()
         table = bigquery.Table(self.table_id, schema=schema)
         time_partitioning = TimePartitioning(
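Editor's note: `from_protobuf_serialization` reverses `to_protobuf_serialization` so that oversized rows, already serialized for the Storage Write API, can be turned back into dicts for a JSON load job. A round-trip sketch using the well-known `Struct` message; the real code uses a TableRow class generated from the BigQuery schema:

```python
# Sketch only: ParseDict -> SerializeToString -> ParseFromString -> MessageToDict round trip.
from google.protobuf.json_format import MessageToDict, ParseDict
from google.protobuf.struct_pb2 import Struct

row = {"id": "42", "payload": "hello"}

# Serialize (what to_protobuf_serialization does with the generated TableRow class).
message = ParseDict(row, Struct())
serialized = message.SerializeToString()

# Deserialize (what from_protobuf_serialization does before the JSON load job).
restored = Struct()
restored.ParseFromString(serialized)
print(MessageToDict(restored, preserving_proto_field_name=True))  # {'id': '42', 'payload': 'hello'}
```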
@@ -159,31 +304,43 @@ class BigQueryStreamingV2Destination(AbstractDestination):
         )
         table.time_partitioning = time_partitioning

-
-
-
-
-
-
+        if self.clustering_keys and self.clustering_keys[self.destination_id]:
+            table.clustering_fields = self.clustering_keys[self.destination_id]
+        try:
+            table = self.bq_client.create_table(table)
+        except Conflict:
+            table = self.bq_client.get_table(self.table_id)
+            # Compare and update schema if needed
+            existing_fields = {field.name: field for field in table.schema}
+            new_fields = {field.name: field for field in self.get_bigquery_schema()}
+
+            # Find fields that need to be added
+            fields_to_add = [field for name, field in new_fields.items() if name not in existing_fields]
+
+            if fields_to_add:
+                logger.warning(f"Adding new fields to table schema: {[field.name for field in fields_to_add]}")
+                updated_schema = table.schema + fields_to_add
+                table.schema = updated_schema
+                table = self.bq_client.update_table(table, ["schema"])

         # Create the stream
         if self.destination_id:
             project, dataset, table_name = self.destination_id.split(".")
-
-            parent = write_client.table_path(project, dataset, table_name)
+            parent = BigQueryWriteClient.table_path(project, dataset, table_name)
         else:
-
-            parent = write_client.table_path(self.project_id, self.dataset_id, self.destination_id)
+            parent = BigQueryWriteClient.table_path(self.project_id, self.dataset_id, self.destination_id)

         stream_name = f"{parent}/_default"

         # Generating the protocol buffer representation of the message descriptor.
-        proto_schema, TableRow = get_proto_schema_and_class(schema
+        proto_schema, TableRow = get_proto_schema_and_class(schema)

         if self.config.unnest:
             serialized_rows = [
-                self.to_protobuf_serialization(
-
+                self.to_protobuf_serialization(
+                    TableRowClass=TableRow, row=self.safe_cast_record_values(orjson.loads(row))
+                )
+                for row in df_destination_records["source_data"].to_list()
             ]
         else:
             df_destination_records = df_destination_records.with_columns(
@@ -207,16 +364,43 @@ class BigQueryStreamingV2Destination(AbstractDestination):
                 for row in df_destination_records.iter_rows(named=True)
             ]

-
-
-
-
-
-
-
-
+        streaming_results = []
+        large_rows_results = []
+
+        # Collect all batches first
+        batches = list(self.batch(serialized_rows))
+
+        # Use ThreadPoolExecutor for parallel processing
+        max_workers = min(len(batches), self.config.max_concurrent_threads)
+        logger.info(f"Processing {len(batches)} batches with {max_workers} concurrent threads")
+
+        try:
+            with ThreadPoolExecutor(max_workers=max_workers) as executor:
+                # Submit all batch processing tasks
+                future_to_batch = {
+                    executor.submit(self.process_streaming_batch, stream_name, proto_schema, batch, TableRow): batch
+                    for batch in batches
+                }

-
+                # Collect results as they complete
+                for future in as_completed(future_to_batch):
+                    batch_results = future.result()
+                    for batch_type, result in batch_results:
+                        if batch_type == "streaming":
+                            streaming_results.append(result)
+                        if batch_type == "large_rows":
+                            large_rows_results.append(result)
+
+        except Exception as e:
+            logger.error(f"Error in multithreaded batch processing: {str(e)}, type: {type(e)}")
+            if isinstance(e, RetryError):
+                logger.error(f"Retry error details: {e.cause if hasattr(e, 'cause') else 'No cause available'}")
+            raise
+
+        if len(streaming_results) > 0:
+            assert all([r == "OK" for r in streaming_results]) is True, "Failed to append rows to stream"
+        if len(large_rows_results) > 0:
+            assert all([r == "DONE" for r in large_rows_results]) is True, "Failed to load rows to BigQuery"

     def write_records(self, df_destination_records: pl.DataFrame) -> Tuple[bool, str]:
         self.load_to_bigquery_via_streaming(df_destination_records=df_destination_records)
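Editor's note: batches are now processed concurrently. Each batch is submitted to a ThreadPoolExecutor and results are collected with `as_completed`, with parallelism capped by a max-concurrent-threads setting. A minimal sketch of that fan-out/fan-in pattern with a toy `process_batch` function (not the destination's method):

```python
# Sketch only: the submit/as_completed fan-out used by load_to_bigquery_via_streaming.
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, Tuple


def process_batch(batch: List[int]) -> Tuple[str, int]:
    # Stand-in for process_streaming_batch: returns a (kind, result) pair.
    return ("streaming", sum(batch))


batches = [[1, 2], [3, 4], [5, 6, 7]]
max_workers = min(len(batches), 4)  # cap parallelism, as the real code does

results = []
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    future_to_batch = {executor.submit(process_batch, batch): batch for batch in batches}
    for future in as_completed(future_to_batch):
        kind, value = future.result()  # re-raises any exception from the worker
        results.append(value)

print(sorted(results))  # [3, 7, 18]
```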
@@ -236,7 +420,7 @@ class BigQueryStreamingV2Destination(AbstractDestination):

            # If adding this item would exceed either limit, yield current batch and start new one
            if (
-                len(current_batch) >= self.
+                len(current_batch) >= self.bq_max_rows_per_request
                or current_batch_size + item_size > self.MAX_REQUEST_SIZE_BYTES
            ):
                logger.debug(f"Yielding batch of {len(current_batch)} rows, size: {current_batch_size/1024/1024:.2f}MB")
@@ -247,15 +431,16 @@ class BigQueryStreamingV2Destination(AbstractDestination):

            if item_size > self.MAX_ROW_SIZE_BYTES:
                large_rows.append(item)
-                logger.
+                logger.warning(f"Large row detected: {item_size} bytes")
            else:
                current_batch.append(item)
                current_batch_size += item_size

        # Yield the last batch
        if current_batch:
-            logger.
+            logger.info(
                f"Yielding streaming batch of {len(current_batch)} rows, size: {current_batch_size/1024/1024:.2f}MB"
            )
-
+        if large_rows:
+            logger.warning(f"Yielding large rows batch of {len(large_rows)} rows")
        yield {"stream_batch": current_batch, "json_batch": large_rows}
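Editor's note: the `batch` generator now splits rows by both a row-count cap (`bq_max_rows_per_request`) and a request-size cap, and routes rows above `MAX_ROW_SIZE_BYTES` into a separate "large rows" bucket that is written via a JSON load job instead of the streaming API. A standalone sketch of that splitting logic, using plain byte strings instead of protobuf-serialized rows:

```python
# Sketch only: count- and size-aware batching with a separate bucket for oversized rows.
from typing import Iterator, List


def batch_rows(
    rows: List[bytes],
    max_rows_per_request: int = 3,
    max_request_size_bytes: int = 50,
    max_row_size_bytes: int = 20,
) -> Iterator[dict]:
    current_batch: List[bytes] = []
    current_size = 0
    large_rows: List[bytes] = []

    for item in rows:
        item_size = len(item)
        # Flush if adding this item would exceed either limit.
        if current_batch and (
            len(current_batch) >= max_rows_per_request
            or current_size + item_size > max_request_size_bytes
        ):
            yield {"stream_batch": current_batch, "json_batch": []}
            current_batch, current_size = [], 0

        if item_size > max_row_size_bytes:
            large_rows.append(item)  # too big for streaming, goes to the load-job bucket
        else:
            current_batch.append(item)
            current_size += item_size

    # The last batch carries whatever is left, plus all large rows.
    if current_batch or large_rows:
        yield {"stream_batch": current_batch, "json_batch": large_rows}


rows = [b"a" * 10, b"b" * 10, b"c" * 10, b"d" * 30, b"e" * 10]
for chunk in batch_rows(rows):
    print(len(chunk["stream_batch"]), len(chunk["json_batch"]))  # 3 0, then 1 1
```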
bizon/connectors/destinations/bigquery_streaming_v2/src/proto_utils.py
CHANGED
@@ -32,9 +32,7 @@ def map_bq_type_to_field_descriptor(bq_type: str) -> int:
     return type_map.get(bq_type, FieldDescriptorProto.TYPE_STRING)  # Default to TYPE_STRING


-def get_proto_schema_and_class(
-    bq_schema: List[SchemaField], clustering_keys: List[str] = None
-) -> Tuple[ProtoSchema, Type[Message]]:
+def get_proto_schema_and_class(bq_schema: List[SchemaField]) -> Tuple[ProtoSchema, Type[Message]]:
     """Generate a ProtoSchema and a TableRow class for unnested BigQuery schema."""
     # Define the FileDescriptorProto
     file_descriptor_proto = FileDescriptorProto()
@@ -60,16 +58,6 @@ def get_proto_schema_and_class(
         for col in bq_schema
     ]

-    if clustering_keys:
-        for key in clustering_keys:
-            fields.append(
-                {
-                    "name": key,
-                    "type": FieldDescriptorProto.TYPE_STRING,
-                    "label": FieldDescriptorProto.LABEL_OPTIONAL,
-                }
-            )
-
     for i, field in enumerate(fields, start=1):
         field_descriptor = message_descriptor.field.add()
         field_descriptor.name = field["name"]
bizon/connectors/destinations/file/src/destination.py
CHANGED
@@ -6,6 +6,7 @@ import polars as pl
 from bizon.common.models import SyncMetadata
 from bizon.destination.destination import AbstractDestination
 from bizon.engine.backend.backend import AbstractBackend
+from bizon.monitoring.monitor import AbstractMonitor
 from bizon.source.callback import AbstractSourceCallback

 from .config import FileDestinationDetailsConfig
@@ -19,8 +20,9 @@ class FileDestination(AbstractDestination):
         config: FileDestinationDetailsConfig,
         backend: AbstractBackend,
         source_callback: AbstractSourceCallback,
+        monitor: AbstractMonitor,
     ):
-        super().__init__(sync_metadata, config, backend, source_callback)
+        super().__init__(sync_metadata, config, backend, source_callback, monitor)
         self.config: FileDestinationDetailsConfig = config

     def check_connection(self) -> bool:
bizon/connectors/destinations/logger/src/destination.py
CHANGED
@@ -6,6 +6,7 @@ from loguru import logger
 from bizon.common.models import SyncMetadata
 from bizon.destination.destination import AbstractDestination
 from bizon.engine.backend.backend import AbstractBackend
+from bizon.monitoring.monitor import AbstractMonitor
 from bizon.source.callback import AbstractSourceCallback

 from .config import LoggerDestinationConfig
@@ -19,12 +20,14 @@ class LoggerDestination(AbstractDestination):
         config: LoggerDestinationConfig,
         backend: AbstractBackend,
         source_callback: AbstractSourceCallback,
+        monitor: AbstractMonitor,
     ):
         super().__init__(
             sync_metadata=sync_metadata,
             config=config,
             backend=backend,
             source_callback=source_callback,
+            monitor=monitor,
         )

     def check_connection(self) -> bool:
bizon/connectors/sources/kafka/config/kafka.example.yml
CHANGED
@@ -8,8 +8,6 @@ source:

   topic: my-topic

-  nb_bytes_schema_id: 8
-
   batch_size: 1000
   consumer_timeout: 10
   bootstrap_servers: <bootstrap-severs>:9092
@@ -47,4 +45,4 @@ destination:
 # syncCursorInDBEvery: 100

 # runner:
-#   log_level: INFO
+#   log_level: INFO
bizon/connectors/sources/kafka/config/kafka_debezium.example.yml
CHANGED
@@ -10,8 +10,6 @@ source:

   topic: <TOPIC_NAME>

-  nb_bytes_schema_id: 8
-
   batch_size: 1000
   consumer_timeout: 10
   bootstrap_servers: <BOOTSTRAP_SERVERS>
@@ -109,4 +107,4 @@ engine:
   queue:
     type: python_queue
     config:
-      max_nb_messages: 1000000
+      max_nb_messages: 1000000
bizon/connectors/sources/kafka/src/config.py
CHANGED
@@ -66,10 +66,4 @@ class KafkaSourceConfig(SourceConfig):

     message_encoding: str = Field(default=MessageEncoding.AVRO, description="Encoding to use to decode the message")

-    # Schema ID header configuration
-    nb_bytes_schema_id: Literal[4, 8] = Field(
-        description="Number of bytes encode SchemaID in Kafka message. Standard is 4.",
-        default=4,
-    )
-
     authentication: KafkaAuthConfig = Field(..., description="Authentication configuration")
|