bizon 0.1.2__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bizon/alerting/alerts.py +0 -1
- bizon/common/models.py +182 -4
- bizon/connectors/destinations/bigquery/config/bigquery_incremental.example.yml +34 -0
- bizon/connectors/destinations/bigquery/src/config.py +0 -1
- bizon/connectors/destinations/bigquery/src/destination.py +16 -9
- bizon/connectors/destinations/bigquery_streaming/config/bigquery_streaming.example.yml +74 -0
- bizon/connectors/destinations/bigquery_streaming/src/destination.py +5 -43
- bizon/connectors/destinations/bigquery_streaming_v2/config/bigquery_streaming_v2.example.yml +79 -0
- bizon/connectors/destinations/bigquery_streaming_v2/src/destination.py +54 -50
- bizon/connectors/destinations/file/config/file.example.yml +40 -0
- bizon/connectors/destinations/file/config/file_incremental.example.yml +22 -0
- bizon/connectors/destinations/file/src/config.py +1 -1
- bizon/connectors/destinations/file/src/destination.py +59 -7
- bizon/connectors/destinations/logger/config/logger.example.yml +30 -0
- bizon/connectors/destinations/logger/config/logger_incremental.example.yml +21 -0
- bizon/connectors/destinations/logger/src/config.py +0 -2
- bizon/connectors/destinations/logger/src/destination.py +14 -3
- bizon/connectors/sources/cycle/src/source.py +2 -6
- bizon/connectors/sources/dummy/src/source.py +0 -4
- bizon/connectors/sources/gsheets/config/service_account_incremental.example.yml +51 -0
- bizon/connectors/sources/gsheets/src/source.py +2 -3
- bizon/connectors/sources/hubspot/config/api_key_incremental.example.yml +40 -0
- bizon/connectors/sources/hubspot/src/hubspot_base.py +0 -1
- bizon/connectors/sources/hubspot/src/hubspot_objects.py +3 -4
- bizon/connectors/sources/hubspot/src/models/hs_object.py +0 -1
- bizon/connectors/sources/kafka/config/kafka_streams.example.yml +124 -0
- bizon/connectors/sources/kafka/src/config.py +10 -6
- bizon/connectors/sources/kafka/src/decode.py +2 -2
- bizon/connectors/sources/kafka/src/source.py +147 -46
- bizon/connectors/sources/notion/config/api_key.example.yml +35 -0
- bizon/connectors/sources/notion/config/api_key_incremental.example.yml +48 -0
- bizon/connectors/sources/notion/src/__init__.py +0 -0
- bizon/connectors/sources/notion/src/config.py +59 -0
- bizon/connectors/sources/notion/src/source.py +1501 -0
- bizon/connectors/sources/notion/tests/notion_pipeline.py +7 -0
- bizon/connectors/sources/notion/tests/test_notion.py +113 -0
- bizon/connectors/sources/periscope/src/source.py +0 -6
- bizon/connectors/sources/pokeapi/src/source.py +0 -1
- bizon/connectors/sources/sana_ai/config/sana.example.yml +25 -0
- bizon/connectors/sources/sana_ai/src/source.py +85 -0
- bizon/destination/buffer.py +0 -1
- bizon/destination/config.py +0 -1
- bizon/destination/destination.py +1 -4
- bizon/engine/backend/adapters/sqlalchemy/backend.py +2 -5
- bizon/engine/backend/adapters/sqlalchemy/config.py +0 -1
- bizon/engine/config.py +0 -1
- bizon/engine/engine.py +0 -1
- bizon/engine/pipeline/consumer.py +0 -1
- bizon/engine/pipeline/producer.py +43 -6
- bizon/engine/queue/adapters/kafka/config.py +1 -1
- bizon/engine/queue/adapters/kafka/queue.py +0 -1
- bizon/engine/queue/adapters/python_queue/consumer.py +0 -1
- bizon/engine/queue/adapters/python_queue/queue.py +0 -2
- bizon/engine/queue/adapters/rabbitmq/consumer.py +0 -1
- bizon/engine/queue/adapters/rabbitmq/queue.py +0 -1
- bizon/engine/queue/config.py +0 -2
- bizon/engine/runner/adapters/process.py +0 -2
- bizon/engine/runner/adapters/streaming.py +55 -1
- bizon/engine/runner/adapters/thread.py +0 -2
- bizon/engine/runner/config.py +0 -1
- bizon/engine/runner/runner.py +0 -2
- bizon/monitoring/datadog/monitor.py +5 -3
- bizon/monitoring/noop/monitor.py +1 -1
- bizon/source/auth/authenticators/abstract_oauth.py +11 -3
- bizon/source/auth/authenticators/abstract_token.py +2 -1
- bizon/source/auth/authenticators/basic.py +1 -1
- bizon/source/auth/authenticators/cookies.py +2 -1
- bizon/source/auth/authenticators/oauth.py +8 -3
- bizon/source/config.py +6 -2
- bizon/source/cursor.py +8 -16
- bizon/source/discover.py +3 -6
- bizon/source/models.py +2 -2
- bizon/source/session.py +0 -1
- bizon/source/source.py +17 -2
- bizon/transform/config.py +0 -2
- bizon/transform/transform.py +0 -3
- bizon-0.3.0.dist-info/METADATA +323 -0
- bizon-0.3.0.dist-info/RECORD +142 -0
- {bizon-0.1.2.dist-info → bizon-0.3.0.dist-info}/WHEEL +1 -1
- bizon-0.3.0.dist-info/entry_points.txt +2 -0
- bizon-0.1.2.dist-info/METADATA +0 -179
- bizon-0.1.2.dist-info/RECORD +0 -123
- bizon-0.1.2.dist-info/entry_points.txt +0 -3
- {bizon-0.1.2.dist-info → bizon-0.3.0.dist-info/licenses}/LICENSE +0 -0
bizon/alerting/alerts.py
CHANGED
bizon/common/models.py
CHANGED
@@ -1,9 +1,12 @@
-from typing import Optional, Union
+from typing import Any, Optional, Union
 
-from pydantic import BaseModel, ConfigDict, Field
+from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
 
 from bizon.alerting.models import AlertingConfig
-from bizon.connectors.destinations.bigquery.src.config import
+from bizon.connectors.destinations.bigquery.src.config import (
+    BigQueryColumn,
+    BigQueryConfig,
+)
 from bizon.connectors.destinations.bigquery_streaming.src.config import (
     BigQueryStreamingConfig,
 )
@@ -18,8 +21,74 @@ from bizon.source.config import SourceConfig, SourceSyncModes
 from bizon.transform.config import TransformModel
 
 
-class
+class StreamSourceConfig(BaseModel):
+    """Source-specific stream routing configuration.
+
+    Uses extra='allow' to support source-specific fields like:
+    - topic (Kafka)
+    - endpoint (API sources)
+    - channel (other streaming sources)
+    """
+
+    model_config = ConfigDict(extra="allow")
+
+    # Common field for stream identifier
+    name: Optional[str] = Field(None, description="Stream identifier within the source")
+
+    # Kafka-specific
+    topic: Optional[str] = Field(None, description="Kafka topic name")
+
+    # API-specific
+    endpoint: Optional[str] = Field(None, description="API endpoint path")
+
+
+class StreamDestinationConfig(BaseModel):
+    """Destination configuration for a stream.
+
+    Supports destination-specific schema definitions and options.
+    Uses extra='allow' to support destination-specific overrides.
+    """
 
+    model_config = ConfigDict(extra="allow")
+
+    # Universal destination identifier
+    table_id: str = Field(..., description="Full destination identifier (e.g., project.dataset.table)")
+
+    # BigQuery-specific schema (can be extended for other destinations)
+    record_schema: Optional[list[BigQueryColumn]] = Field(None, description="Schema for the destination records")
+    clustering_keys: Optional[list[str]] = Field(None, description="Clustering keys for the destination table")
+
+
+class StreamConfig(BaseModel):
+    """Configuration for a single stream.
+
+    Consolidates source stream routing and destination configuration in one place,
+    eliminating duplication of destination_id between source and destination configs.
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    name: str = Field(..., description="Logical name for this stream")
+    source: StreamSourceConfig = Field(..., description="Source-specific routing configuration")
+    destination: StreamDestinationConfig = Field(
+        ..., description="Destination configuration including table and schema"
+    )
+
+    @field_validator("destination")
+    @classmethod
+    def validate_table_id_format(cls, v: StreamDestinationConfig) -> StreamDestinationConfig:
+        """Ensure table_id follows expected format for BigQuery-like destinations."""
+        if v.table_id:
+            parts = v.table_id.split(".")
+            if len(parts) != 3:
+                raise ValueError(
+                    f"table_id must be in format 'project.dataset.table', got: {v.table_id}. "
+                    f"Found {len(parts)} parts instead of 3."
+                )
+        return v
+
+
+class BizonConfig(BaseModel):
     # Forbid extra keys in the model
     model_config = ConfigDict(extra="forbid")
 
@@ -63,6 +132,115 @@ class BizonConfig(BaseModel):
         default=None,
     )
 
+    streams: Optional[list[StreamConfig]] = Field(
+        None,
+        description="Stream routing configuration (opt-in for multi-table streaming). "
+        "Consolidates source stream definitions with destination tables and schemas.",
+    )
+
+    @field_validator("streams")
+    @classmethod
+    def validate_streams_config(cls, v: Optional[list[StreamConfig]], info) -> Optional[list[StreamConfig]]:
+        """Validate streams configuration consistency."""
+        if not v:
+            return v
+
+        # Check for duplicate stream names
+        names = [s.name for s in v]
+        if len(names) != len(set(names)):
+            duplicates = [name for name in names if names.count(name) > 1]
+            raise ValueError(f"Duplicate stream names found in streams configuration: {set(duplicates)}")
+
+        # Check for duplicate table_ids
+        table_ids = [s.destination.table_id for s in v]
+        if len(table_ids) != len(set(table_ids)):
+            duplicates = [tid for tid in table_ids if table_ids.count(tid) > 1]
+            raise ValueError(f"Duplicate table_ids found in streams configuration: {set(duplicates)}")
+
+        # Validate that source sync_mode is 'stream' if streams config is used
+        source_config = info.data.get("source") if info.data else None
+        if source_config and source_config.sync_mode != SourceSyncModes.STREAM:
+            raise ValueError(
+                f"Configuration Error: 'streams' configuration requires source.sync_mode='stream'. "
+                f"Current sync_mode: {source_config.sync_mode}. "
+                f"Please update your config to use:\n"
+                f"  source:\n"
+                f"    sync_mode: stream"
+            )
+
+        return v
+
+    @model_validator(mode="before")
+    @classmethod
+    def inject_config_from_streams(cls, data: Any) -> Any:
+        """Inject source and destination config from streams.
+
+        This runs BEFORE field validation, enriching both source and destination
+        configs from the streams configuration. This allows:
+        1. Sources (like Kafka) to omit topics - they're extracted from streams
+        2. Destinations with unnest=true to work without duplicate record_schemas
+
+        This is source-agnostic: each source type can extract what it needs from streams.
+        """
+        if not isinstance(data, dict):
+            return data
+
+        streams = data.get("streams")
+        if not streams:
+            return data
+
+        source = data.get("source")
+        if source and isinstance(source, dict):
+            source_name = source.get("name")
+
+            # Kafka: inject topics from streams
+            if source_name == "kafka":
+                # Check if topics is missing, None, or empty list
+                if not source.get("topics") or source.get("topics") == []:
+                    topics = []
+                    for stream in streams:
+                        if isinstance(stream, dict):
+                            stream_src = stream.get("source", {})
+                            stream_dest = stream.get("destination", {})
+                            if stream_src.get("topic"):
+                                topics.append(
+                                    {
+                                        "name": stream_src.get("topic"),
+                                        "destination_id": stream_dest.get("table_id", ""),
+                                    }
+                                )
+                    if topics:
+                        source["topics"] = topics
+
+        destination = data.get("destination")
+        if not destination or not isinstance(destination, dict):
+            return data
+
+        destination_config = destination.get("config")
+        if not destination_config or not isinstance(destination_config, dict):
+            return data
+
+        # Only inject if record_schemas is not already set or is empty
+        if not destination_config.get("record_schemas"):
+            # Build record_schemas from streams
+            record_schemas = []
+            for stream in streams:
+                if isinstance(stream, dict):
+                    stream_dest = stream.get("destination", {})
+                    if stream_dest.get("record_schema"):
+                        record_schema_config = {
+                            "destination_id": stream_dest.get("table_id"),
+                            "record_schema": stream_dest.get("record_schema"),
+                            "clustering_keys": stream_dest.get("clustering_keys"),
+                        }
+                        record_schemas.append(record_schema_config)
+
+            # Inject into destination config
+            if record_schemas:
+                destination_config["record_schemas"] = record_schemas
+
+        return data
+
 
 class SyncMetadata(BaseModel):
     """Model which stores general metadata around a sync.
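For reference, a minimal illustrative sketch (not part of the diff) of how the new opt-in streams block could be written, based on the StreamConfig, StreamSourceConfig, and StreamDestinationConfig models and validators above; the stream name, topic, table_id, and schema below are placeholders:

source:
  name: kafka
  sync_mode: stream          # required by validate_streams_config when 'streams' is set
  # topics can be omitted: inject_config_from_streams fills them in from 'streams'

streams:
  - name: orders
    source:
      topic: orders-topic    # Kafka-specific routing field (extra fields allowed)
    destination:
      table_id: my-project.my_dataset.orders   # must be project.dataset.table
      record_schema:
        - name: id
          type: STRING
          mode: REQUIRED
      clustering_keys: ["id"]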
bizon/connectors/destinations/bigquery/config/bigquery_incremental.example.yml
ADDED
@@ -0,0 +1,34 @@
+name: hubspot contacts to bigquery (incremental)
+
+source:
+  name: hubspot
+  stream: contacts
+  sync_mode: incremental
+  cursor_field: updatedAt # HubSpot's timestamp field for filtering
+  properties:
+    strategy: all
+  authentication:
+    type: api_key
+    api_key: <MY_API_KEY>
+
+destination:
+  # Authentication: If empty it will be infered.
+  # Must have the bigquery.jobUser
+  # Must have the bigquery.dataEditor and storage.objectUser on the supplied dataset and bucket
+  name: bigquery
+  config:
+    buffer_size: 10 # in Mb
+    buffer_flush_timeout: 300 # in seconds
+    dataset_id: bizon_test
+    dataset_location: US
+    project_id: my-gcp-project-id
+    gcs_buffer_bucket: bizon-buffer
+    gcs_buffer_format: parquet
+    # Optional: service_account_key for explicit authentication
+    # service_account_key: >-
+    #   { ... }
+
+# How incremental sync works:
+# 1. First run: Behaves like full_refresh (fetches all data)
+# 2. Subsequent runs: Only fetches records where cursor_field > last_run
+# 3. Uses append-only strategy - new records are appended to existing data
bizon/connectors/destinations/bigquery/src/config.py
CHANGED
@@ -98,7 +98,6 @@ class BigQueryRecordSchemaConfig(BaseModel):
 
 
 class BigQueryConfigDetails(AbstractDestinationDetailsConfig):
-
     # Table details
     project_id: str = Field(..., description="BigQuery Project ID")
     dataset_id: str = Field(..., description="BigQuery Dataset ID")
bizon/connectors/destinations/bigquery/src/destination.py
CHANGED
@@ -22,7 +22,6 @@ from .config import BigQueryColumn, BigQueryConfigDetails
 
 
 class BigQueryDestination(AbstractDestination):
-
     def __init__(
         self,
         sync_metadata: SyncMetadata,
@@ -56,7 +55,6 @@ class BigQueryDestination(AbstractDestination):
 
     @property
     def temp_table_id(self) -> str:
-
         if self.sync_metadata.sync_mode == SourceSyncModes.FULL_REFRESH:
             return f"{self.table_id}_temp"
 
@@ -67,7 +65,6 @@ class BigQueryDestination(AbstractDestination):
         return f"{self.table_id}"
 
     def get_bigquery_schema(self, df_destination_records: pl.DataFrame) -> List[bigquery.SchemaField]:
-
         # Case we unnest the data
         if self.config.unnest:
             return [
@@ -113,9 +110,7 @@ class BigQueryDestination(AbstractDestination):
     # https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.dbapi.DataError
 
     def convert_and_upload_to_buffer(self, df_destination_records: pl.DataFrame) -> str:
-
         if self.buffer_format == "parquet":
-
             # Upload the Parquet file to GCS
             file_name = f"{self.sync_metadata.source_name}/{self.sync_metadata.stream_name}/{str(uuid4())}.parquet"
 
@@ -153,7 +148,6 @@ class BigQueryDestination(AbstractDestination):
         )
 
     def load_to_bigquery(self, gcs_file: str, df_destination_records: pl.DataFrame):
-
         # We always partition by the loaded_at field
         time_partitioning = TimePartitioning(field="_bizon_loaded_at", type_=self.config.time_partitioning)
 
@@ -171,7 +165,6 @@ class BigQueryDestination(AbstractDestination):
         assert result.state == "DONE", f"Job failed with state {result.state} with error {result.error_result}"
 
     def write_records(self, df_destination_records: pl.DataFrame) -> Tuple[bool, str]:
-
         # Rename fields to match BigQuery schema
         df_destination_records = df_destination_records.rename(
             {
@@ -201,13 +194,27 @@ class BigQueryDestination(AbstractDestination):
     def finalize(self):
         if self.sync_metadata.sync_mode == SourceSyncModes.FULL_REFRESH:
             logger.info(f"Loading temp table {self.temp_table_id} data into {self.table_id} ...")
-
+            query = f"CREATE OR REPLACE TABLE {self.table_id} AS SELECT * FROM {self.temp_table_id}"
+            result = self.bq_client.query(query)
+            bq_result = result.result()  # Waits for the job to completew
+            logger.info(f"BigQuery CREATE OR REPLACE query result: {bq_result}")
+            # Check if the destination table exists by fetching it; raise if it doesn't exist
+            try:
+                self.bq_client.get_table(self.table_id)
+            except NotFound:
+                logger.error(f"Table {self.table_id} not found")
+                raise Exception(f"Table {self.table_id} not found")
+            # Cleanup
             logger.info(f"Deleting temp table {self.temp_table_id} ...")
             self.bq_client.delete_table(self.temp_table_id, not_found_ok=True)
             return True
 
         elif self.sync_metadata.sync_mode == SourceSyncModes.INCREMENTAL:
-            #
+            # Append data from incremental temp table to main table
+            logger.info(f"Appending data from {self.temp_table_id} to {self.table_id} ...")
+            self.bq_client.query(f"INSERT INTO {self.table_id} SELECT * FROM {self.temp_table_id}").result()
+            logger.info(f"Deleting incremental temp table {self.temp_table_id} ...")
+            self.bq_client.delete_table(self.temp_table_id, not_found_ok=True)
             return True
 
         elif self.sync_metadata.sync_mode == SourceSyncModes.STREAM:
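For context, the finalize() additions above reduce to two plain BigQuery jobs followed by temp-table cleanup. A minimal standalone sketch of the same flow using google-cloud-bigquery (not bizon's API; table names and the sync_mode value are placeholders):

from google.api_core.exceptions import NotFound
from google.cloud import bigquery

client = bigquery.Client()
table_id = "my-project.my_dataset.my_table"            # placeholder target table
temp_table_id = "my-project.my_dataset.my_table_temp"  # placeholder temp table
sync_mode = "full_refresh"                             # or "incremental"

if sync_mode == "full_refresh":
    # Replace the target table with the temp table's content in one job
    client.query(f"CREATE OR REPLACE TABLE {table_id} AS SELECT * FROM {temp_table_id}").result()
    try:
        client.get_table(table_id)  # verify the target table exists after the swap
    except NotFound:
        raise Exception(f"Table {table_id} not found")
elif sync_mode == "incremental":
    # Append the temp table's rows to the existing target table
    client.query(f"INSERT INTO {table_id} SELECT * FROM {temp_table_id}").result()

# Both modes drop the temp table afterwards
client.delete_table(temp_table_id, not_found_ok=True)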
bizon/connectors/destinations/bigquery_streaming/config/bigquery_streaming.example.yml
ADDED
@@ -0,0 +1,74 @@
+# BigQuery Streaming Destination Configuration
+# Uses the BigQuery Storage Write API for low-latency inserts
+#
+# Use this destination when:
+# - You need near real-time data loading
+# - Low latency is more important than cost optimization
+# - Working with streaming/continuous data sources
+#
+# Requirements:
+# - Service account with bigquery.dataEditor role
+# - Dataset must already exist
+
+name: source_to_bigquery_streaming
+
+source:
+  name: <YOUR_SOURCE>
+  stream: <YOUR_STREAM>
+  authentication:
+    type: api_key
+    params:
+      token: <YOUR_API_KEY>
+
+destination:
+  name: bigquery_streaming
+  config:
+    # GCP Project ID
+    project_id: <YOUR_GCP_PROJECT>
+
+    # BigQuery dataset (must exist)
+    dataset_id: <YOUR_DATASET>
+
+    # Dataset location (US, EU, etc.)
+    dataset_location: US
+
+    # Time partitioning (optional)
+    time_partitioning:
+      type: DAY # Options: DAY, HOUR, MONTH, YEAR
+      field: _bizon_loaded_at
+
+    # Max rows per streaming request (max 10000)
+    bq_max_rows_per_request: 5000
+
+    # Buffer settings
+    buffer_size: 50 # MB before flushing
+    buffer_flush_timeout: 300 # Seconds before forcing flush
+
+    # Authentication (optional - uses ADC if not provided)
+    # authentication:
+    #   service_account_key: |
+    #     {
+    #       "type": "service_account",
+    #       "project_id": "<YOUR_GCP_PROJECT>",
+    #       ...
+    #     }
+
+    # Schema definition for unnesting (optional)
+    # Required if unnest: true
+    # unnest: true
+    # record_schemas:
+    #   - destination_id: my_table
+    #     record_schema:
+    #       - name: id
+    #         type: STRING
+    #         mode: REQUIRED
+    #       - name: created_at
+    #         type: TIMESTAMP
+    #         mode: NULLABLE
+
+engine:
+  backend:
+    type: bigquery
+    database: <YOUR_GCP_PROJECT>
+    schema: bizon_state
+    syncCursorInDBEvery: 10
bizon/connectors/destinations/bigquery_streaming/src/destination.py
CHANGED
@@ -1,6 +1,5 @@
 import os
 import tempfile
-from datetime import datetime
 from typing import List, Tuple
 
 import orjson
@@ -43,7 +42,6 @@ from .config import BigQueryStreamingConfigDetails
 
 
 class BigQueryStreamingDestination(AbstractDestination):
-
     # Add constants for limits
     MAX_REQUEST_SIZE_BYTES = 5 * 1024 * 1024  # 5 MB (max is 10MB)
     MAX_ROW_SIZE_BYTES = 0.9 * 1024 * 1024  # 1 MB
@@ -78,7 +76,6 @@ class BigQueryStreamingDestination(AbstractDestination):
         return self.destination_id or f"{self.project_id}.{self.dataset_id}.{tabled_id}"
 
     def get_bigquery_schema(self) -> List[bigquery.SchemaField]:
-
         if self.config.unnest:
             if len(list(self.record_schemas.keys())) == 1:
                 self.destination_id = list(self.record_schemas.keys())[0]
@@ -164,40 +161,6 @@ class BigQueryStreamingDestination(AbstractDestination):
         response = write_client.append_rows(iter([request]))
         return response.code().name
 
-    def safe_cast_record_values(self, row: dict):
-        """
-        Safe cast record values to the correct type for BigQuery.
-        """
-        for col in self.record_schemas[self.destination_id]:
-
-            # Handle dicts as strings
-            if col.type in [BigQueryColumnType.STRING, BigQueryColumnType.JSON]:
-                if isinstance(row[col.name], dict) or isinstance(row[col.name], list):
-                    row[col.name] = orjson.dumps(row[col.name]).decode("utf-8")
-
-            # Handle timestamps
-            if (
-                col.type in [BigQueryColumnType.TIMESTAMP, BigQueryColumnType.DATETIME]
-                and col.default_value_expression is None
-            ):
-                if isinstance(row[col.name], int):
-                    if row[col.name] > datetime(9999, 12, 31).timestamp():
-                        row[col.name] = datetime.fromtimestamp(row[col.name] / 1_000_000).strftime(
-                            "%Y-%m-%d %H:%M:%S.%f"
-                        )
-                    else:
-                        try:
-                            row[col.name] = datetime.fromtimestamp(row[col.name]).strftime("%Y-%m-%d %H:%M:%S.%f")
-                        except ValueError:
-                            error_message = (
-                                f"Error casting timestamp for destination '{self.destination_id}' column '{col.name}'. "
-                                f"Invalid timestamp value: {row[col.name]} ({type(row[col.name])}). "
-                                "Consider using a transformation."
-                            )
-                            logger.error(error_message)
-                            raise ValueError(error_message)
-        return row
-
     @retry(
         retry=retry_if_exception_type(
             (
@@ -284,10 +247,7 @@ class BigQueryStreamingDestination(AbstractDestination):
 
         if self.config.unnest:
             # We cannot use the `json_decode` method here because of the issue: https://github.com/pola-rs/polars/issues/22371
-            rows_to_insert = [
-                self.safe_cast_record_values(orjson.loads(row))
-                for row in df_destination_records["source_data"].to_list()
-            ]
+            rows_to_insert = [orjson.loads(row) for row in df_destination_records["source_data"].to_list()]
         else:
             df_destination_records = df_destination_records.with_columns(
                 pl.col("bizon_extracted_at").dt.strftime("%Y-%m-%d %H:%M:%S").alias("bizon_extracted_at"),
@@ -355,7 +315,9 @@ class BigQueryStreamingDestination(AbstractDestination):
                 len(current_batch) >= self.bq_max_rows_per_request
                 or current_batch_size + item_size > self.MAX_REQUEST_SIZE_BYTES
             ):
-                logger.debug(
+                logger.debug(
+                    f"Yielding batch of {len(current_batch)} rows, size: {current_batch_size / 1024 / 1024:.2f}MB"
+                )
                 yield {"stream_batch": current_batch, "json_batch": large_rows}
                 current_batch = []
                 current_batch_size = 0
@@ -371,7 +333,7 @@ class BigQueryStreamingDestination(AbstractDestination):
         # Yield the last batch
         if current_batch:
             logger.debug(
-                f"Yielding streaming batch of {len(current_batch)} rows, size: {current_batch_size/1024/1024:.2f}MB"
+                f"Yielding streaming batch of {len(current_batch)} rows, size: {current_batch_size / 1024 / 1024:.2f}MB"
            )
             logger.debug(f"Yielding large rows batch of {len(large_rows)} rows")
             yield {"stream_batch": current_batch, "json_batch": large_rows}
bizon/connectors/destinations/bigquery_streaming_v2/config/bigquery_streaming_v2.example.yml
ADDED
@@ -0,0 +1,79 @@
+# BigQuery Streaming V2 Destination Configuration
+# Uses the BigQuery Storage Write API (v2) for improved streaming performance
+#
+# Use this destination when:
+# - You need the latest BigQuery streaming features
+# - Working with high-volume streaming data
+# - Require better error handling and retry logic
+#
+# Differences from v1:
+# - Improved batching and retry logic
+# - Better handling of schema evolution
+# - Enhanced error reporting
+#
+# Requirements:
+# - Service account with bigquery.dataEditor role
+# - Dataset must already exist
+
+name: source_to_bigquery_streaming_v2
+
+source:
+  name: <YOUR_SOURCE>
+  stream: <YOUR_STREAM>
+  authentication:
+    type: api_key
+    params:
+      token: <YOUR_API_KEY>
+
+destination:
+  name: bigquery_streaming_v2
+  config:
+    # GCP Project ID
+    project_id: <YOUR_GCP_PROJECT>
+
+    # BigQuery dataset (must exist)
+    dataset_id: <YOUR_DATASET>
+
+    # Dataset location (US, EU, etc.)
+    dataset_location: US
+
+    # Time partitioning (optional)
+    time_partitioning:
+      type: DAY # Options: DAY, HOUR, MONTH, YEAR
+      field: _bizon_loaded_at
+
+    # Max rows per streaming request (max 10000)
+    bq_max_rows_per_request: 5000
+
+    # Buffer settings
+    buffer_size: 50 # MB before flushing
+    buffer_flush_timeout: 300 # Seconds before forcing flush
+
+    # Authentication (optional - uses ADC if not provided)
+    # authentication:
+    #   service_account_key: |
+    #     {
+    #       "type": "service_account",
+    #       "project_id": "<YOUR_GCP_PROJECT>",
+    #       ...
+    #     }
+
+    # Schema definition for unnesting (optional)
+    # Required if unnest: true
+    # unnest: true
+    # record_schemas:
+    #   - destination_id: my_table
+    #     record_schema:
+    #       - name: id
+    #         type: STRING
+    #         mode: REQUIRED
+    #       - name: created_at
+    #         type: TIMESTAMP
+    #         mode: NULLABLE
+
+engine:
+  backend:
+    type: bigquery
+    database: <YOUR_GCP_PROJECT>
+    schema: bizon_state
+    syncCursorInDBEvery: 10