bizon 0.2.0__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bizon/cli/main.py +14 -0
- bizon/connectors/destinations/bigquery/config/bigquery_incremental.example.yml +34 -0
- bizon/connectors/destinations/bigquery/src/destination.py +5 -1
- bizon/connectors/destinations/bigquery_streaming/src/destination.py +1 -38
- bizon/connectors/destinations/bigquery_streaming_v2/src/destination.py +50 -44
- bizon/connectors/destinations/file/config/file_incremental.example.yml +22 -0
- bizon/connectors/destinations/file/src/destination.py +59 -2
- bizon/connectors/destinations/logger/config/logger_incremental.example.yml +21 -0
- bizon/connectors/destinations/logger/src/destination.py +13 -1
- bizon/connectors/sources/gsheets/config/service_account_incremental.example.yml +51 -0
- bizon/connectors/sources/hubspot/config/api_key_incremental.example.yml +40 -0
- bizon/connectors/sources/notion/config/api_key_incremental.example.yml +48 -0
- bizon/connectors/sources/notion/src/source.py +343 -1
- bizon/engine/pipeline/producer.py +42 -1
- bizon/source/config.py +6 -0
- bizon/source/models.py +2 -1
- {bizon-0.2.0.dist-info → bizon-0.3.1.dist-info}/METADATA +125 -1
- {bizon-0.2.0.dist-info → bizon-0.3.1.dist-info}/RECORD +21 -15
- {bizon-0.2.0.dist-info → bizon-0.3.1.dist-info}/WHEEL +0 -0
- {bizon-0.2.0.dist-info → bizon-0.3.1.dist-info}/entry_points.txt +0 -0
- {bizon-0.2.0.dist-info → bizon-0.3.1.dist-info}/licenses/LICENSE +0 -0
bizon/cli/main.py
CHANGED
@@ -1,4 +1,5 @@
 import click
+from dotenv import find_dotenv, load_dotenv

 from bizon.engine.engine import RunnerFactory
 from bizon.engine.runner.config import LoggerLevel
@@ -95,15 +96,28 @@ def destination():
     show_default=True,
     help="Log level to use.",
 )
+@click.option(
+    "--env-file",
+    required=False,
+    type=click.Path(exists=True),
+    help="Path to .env file to load environment variables from.",
+)
 def run(
     filename: str,
     custom_source: str,
     runner: str,
     log_level: LoggerLevel,
+    env_file: str,
     help="Run a bizon pipeline from a YAML file.",
 ):
     """Run a bizon pipeline from a YAML file."""

+    # Load environment variables from .env file
+    if env_file:
+        load_dotenv(env_file)
+    else:
+        load_dotenv(find_dotenv(".env"))
+
     # Parse config from YAML file as a dictionary
     config = parse_from_yaml(filename)

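A minimal, standalone sketch of the resolution order this option adds, using python-dotenv's public API (the file names are placeholders; this is an illustration, not bizon's code verbatim):

# Illustration of the --env-file behaviour: an explicit path is loaded as-is,
# otherwise python-dotenv walks up from the working directory looking for ".env".
from dotenv import find_dotenv, load_dotenv

def load_environment(env_file: str | None = None) -> None:
    if env_file:
        load_dotenv(env_file)             # e.g. a path passed via --env-file
    else:
        load_dotenv(find_dotenv(".env"))  # nothing is loaded if no .env is found

load_environment()                   # picks up ./.env when present
load_environment(".env.production")  # hypothetical explicit file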
bizon/connectors/destinations/bigquery/config/bigquery_incremental.example.yml
ADDED
@@ -0,0 +1,34 @@
+name: hubspot contacts to bigquery (incremental)
+
+source:
+  name: hubspot
+  stream: contacts
+  sync_mode: incremental
+  cursor_field: updatedAt # HubSpot's timestamp field for filtering
+  properties:
+    strategy: all
+  authentication:
+    type: api_key
+    api_key: <MY_API_KEY>
+
+destination:
+  # Authentication: If empty it will be infered.
+  # Must have the bigquery.jobUser
+  # Must have the bigquery.dataEditor and storage.objectUser on the supplied dataset and bucket
+  name: bigquery
+  config:
+    buffer_size: 10 # in Mb
+    buffer_flush_timeout: 300 # in seconds
+    dataset_id: bizon_test
+    dataset_location: US
+    project_id: my-gcp-project-id
+    gcs_buffer_bucket: bizon-buffer
+    gcs_buffer_format: parquet
+    # Optional: service_account_key for explicit authentication
+    # service_account_key: >-
+    #   { ... }
+
+# How incremental sync works:
+# 1. First run: Behaves like full_refresh (fetches all data)
+# 2. Subsequent runs: Only fetches records where cursor_field > last_run
+# 3. Uses append-only strategy - new records are appended to existing data
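The trailing comments describe the cursor contract rather than showing it; below is a rough, hypothetical illustration of that contract in isolation (the fetch_records callable and the return convention are invented for the example, not taken from bizon's producer):

# Hypothetical sketch of cursor-based incremental sync: remember the largest
# cursor_field value seen and only request newer records on the next run.
def run_incremental(fetch_records, cursor_field: str, last_run: str | None):
    """Return (new_records, next_cursor) for one incremental run."""
    new_records, max_seen = [], last_run
    # First run: last_run is None, so the source behaves like full_refresh.
    for record in fetch_records(updated_after=last_run):
        value = record[cursor_field]          # e.g. "updatedAt" for HubSpot
        if last_run is None or value > last_run:
            new_records.append(record)        # append-only: nothing is overwritten
        if max_seen is None or value > max_seen:
            max_seen = value
    return new_records, max_seen              # max_seen becomes the next run's last_run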
bizon/connectors/destinations/bigquery/src/destination.py
CHANGED
@@ -210,7 +210,11 @@ class BigQueryDestination(AbstractDestination):
             return True

         elif self.sync_metadata.sync_mode == SourceSyncModes.INCREMENTAL:
-            #
+            # Append data from incremental temp table to main table
+            logger.info(f"Appending data from {self.temp_table_id} to {self.table_id} ...")
+            self.bq_client.query(f"INSERT INTO {self.table_id} SELECT * FROM {self.temp_table_id}").result()
+            logger.info(f"Deleting incremental temp table {self.temp_table_id} ...")
+            self.bq_client.delete_table(self.temp_table_id, not_found_ok=True)
             return True

         elif self.sync_metadata.sync_mode == SourceSyncModes.STREAM:
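For reference, the same append-then-drop step as a standalone snippet against the google-cloud-bigquery client (the table IDs are placeholders, not values from the package):

from google.cloud import bigquery

client = bigquery.Client()
main_table = "my-project.bizon_test.hubspot_contacts"              # placeholder
temp_table = "my-project.bizon_test.hubspot_contacts_incremental"  # placeholder

# Append the staged incremental rows into the main table, then drop the staging table.
client.query(f"INSERT INTO `{main_table}` SELECT * FROM `{temp_table}`").result()
client.delete_table(temp_table, not_found_ok=True)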
bizon/connectors/destinations/bigquery_streaming/src/destination.py
CHANGED
@@ -1,6 +1,5 @@
 import os
 import tempfile
-from datetime import datetime
 from typing import List, Tuple

 import orjson
@@ -162,39 +161,6 @@ class BigQueryStreamingDestination(AbstractDestination):
         response = write_client.append_rows(iter([request]))
         return response.code().name

-    def safe_cast_record_values(self, row: dict):
-        """
-        Safe cast record values to the correct type for BigQuery.
-        """
-        for col in self.record_schemas[self.destination_id]:
-            # Handle dicts as strings
-            if col.type in [BigQueryColumnType.STRING, BigQueryColumnType.JSON]:
-                if isinstance(row[col.name], dict) or isinstance(row[col.name], list):
-                    row[col.name] = orjson.dumps(row[col.name]).decode("utf-8")
-
-            # Handle timestamps
-            if (
-                col.type in [BigQueryColumnType.TIMESTAMP, BigQueryColumnType.DATETIME]
-                and col.default_value_expression is None
-            ):
-                if isinstance(row[col.name], int):
-                    if row[col.name] > datetime(9999, 12, 31).timestamp():
-                        row[col.name] = datetime.fromtimestamp(row[col.name] / 1_000_000).strftime(
-                            "%Y-%m-%d %H:%M:%S.%f"
-                        )
-                    else:
-                        try:
-                            row[col.name] = datetime.fromtimestamp(row[col.name]).strftime("%Y-%m-%d %H:%M:%S.%f")
-                        except ValueError:
-                            error_message = (
-                                f"Error casting timestamp for destination '{self.destination_id}' column '{col.name}'. "
-                                f"Invalid timestamp value: {row[col.name]} ({type(row[col.name])}). "
-                                "Consider using a transformation."
-                            )
-                            logger.error(error_message)
-                            raise ValueError(error_message)
-        return row
-
     @retry(
         retry=retry_if_exception_type(
             (
@@ -281,10 +247,7 @@ class BigQueryStreamingDestination(AbstractDestination):

         if self.config.unnest:
             # We cannot use the `json_decode` method here because of the issue: https://github.com/pola-rs/polars/issues/22371
-            rows_to_insert = [
-                self.safe_cast_record_values(orjson.loads(row))
-                for row in df_destination_records["source_data"].to_list()
-            ]
+            rows_to_insert = [orjson.loads(row) for row in df_destination_records["source_data"].to_list()]
         else:
             df_destination_records = df_destination_records.with_columns(
                 pl.col("bizon_extracted_at").dt.strftime("%Y-%m-%d %H:%M:%S").alias("bizon_extracted_at"),
bizon/connectors/destinations/bigquery_streaming_v2/src/destination.py
CHANGED
@@ -1,7 +1,6 @@
 import os
 import tempfile
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from datetime import datetime
 from typing import List, Tuple, Type

 import orjson
@@ -40,6 +39,7 @@ from bizon.destination.destination import AbstractDestination
 from bizon.engine.backend.backend import AbstractBackend
 from bizon.monitoring.monitor import AbstractMonitor
 from bizon.source.callback import AbstractSourceCallback
+from bizon.source.config import SourceSyncModes

 from .config import BigQueryStreamingV2ConfigDetails
 from .proto_utils import get_proto_schema_and_class
@@ -81,6 +81,17 @@ class BigQueryStreamingV2Destination(AbstractDestination):
         tabled_id = f"{self.sync_metadata.source_name}_{self.sync_metadata.stream_name}"
         return self.destination_id or f"{self.project_id}.{self.dataset_id}.{tabled_id}"

+    @property
+    def temp_table_id(self) -> str:
+        if self.sync_metadata.sync_mode == SourceSyncModes.FULL_REFRESH:
+            return f"{self.table_id}_temp"
+        elif self.sync_metadata.sync_mode == SourceSyncModes.INCREMENTAL:
+            return f"{self.table_id}_incremental"
+        elif self.sync_metadata.sync_mode == SourceSyncModes.STREAM:
+            return f"{self.table_id}"
+        # Default fallback
+        return f"{self.table_id}"
+
     def get_bigquery_schema(self) -> List[bigquery.SchemaField]:
         if self.config.unnest:
             if len(list(self.record_schemas.keys())) == 1:
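The property above boils down to a mapping from sync mode to a staging-table suffix; as a plain function for illustration (names are made up):

# Illustrative only: which table a run writes to, depending on sync mode.
def staging_table(table_id: str, sync_mode: str) -> str:
    suffix = {"full_refresh": "_temp", "incremental": "_incremental"}.get(sync_mode, "")
    return f"{table_id}{suffix}"  # "stream" (and anything else) writes straight to the main table

assert staging_table("proj.ds.contacts", "full_refresh") == "proj.ds.contacts_temp"
assert staging_table("proj.ds.contacts", "incremental") == "proj.ds.contacts_incremental"
assert staging_table("proj.ds.contacts", "stream") == "proj.ds.contacts"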
@@ -165,36 +176,6 @@ class BigQueryStreamingV2Destination(AbstractDestination):
             logger.error(f"Stream name: {stream_name}")
             raise

-    def safe_cast_record_values(self, row: dict):
-        """
-        Safe cast record values to the correct type for BigQuery.
-        """
-        for col in self.record_schemas[self.destination_id]:
-            # Handle dicts as strings
-            if col.type in ["STRING", "JSON"]:
-                if isinstance(row[col.name], dict) or isinstance(row[col.name], list):
-                    row[col.name] = orjson.dumps(row[col.name]).decode("utf-8")
-
-            # Handle timestamps
-            if col.type in ["TIMESTAMP", "DATETIME"] and col.default_value_expression is None:
-                if isinstance(row[col.name], int):
-                    if row[col.name] > datetime(9999, 12, 31).timestamp():
-                        row[col.name] = datetime.fromtimestamp(row[col.name] / 1_000_000).strftime(
-                            "%Y-%m-%d %H:%M:%S.%f"
-                        )
-                    else:
-                        try:
-                            row[col.name] = datetime.fromtimestamp(row[col.name]).strftime("%Y-%m-%d %H:%M:%S.%f")
-                        except ValueError:
-                            error_message = (
-                                f"Error casting timestamp for destination '{self.destination_id}' column '{col.name}'. "
-                                f"Invalid timestamp value: {row[col.name]} ({type(row[col.name])}). "
-                                "Consider using a transformation."
-                            )
-                            logger.error(error_message)
-                            raise ValueError(error_message)
-        return row
-
     @staticmethod
     def to_protobuf_serialization(TableRowClass: Type[Message], row: dict) -> bytes:
         """Convert a row to a Protobuf serialization."""
@@ -263,14 +244,14 @@ class BigQueryStreamingV2Destination(AbstractDestination):
             deserialized_row = self.from_protobuf_serialization(table_row_class, serialized_row)
             deserialized_rows.append(deserialized_row)

-        # For large rows, we need to use the main client
+        # For large rows, we need to use the main client (write to temp_table_id)
        job_config = bigquery.LoadJobConfig(
             source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
-            schema=self.bq_client.get_table(self.
+            schema=self.bq_client.get_table(self.temp_table_id).schema,
             ignore_unknown_values=True,
         )
         load_job = self.bq_client.load_table_from_json(
-            deserialized_rows, self.
+            deserialized_rows, self.temp_table_id, job_config=job_config, timeout=300
         )
         result = load_job.result()
         if load_job.state != "DONE":
@@ -292,9 +273,9 @@ class BigQueryStreamingV2Destination(AbstractDestination):
             raise

     def load_to_bigquery_via_streaming(self, df_destination_records: pl.DataFrame) -> str:
-        # Create table if it does not exist
+        # Create table if it does not exist (use temp_table_id for staging)
         schema = self.get_bigquery_schema()
-        table = bigquery.Table(self.
+        table = bigquery.Table(self.temp_table_id, schema=schema)
         time_partitioning = TimePartitioning(
             field=self.config.time_partitioning.field, type_=self.config.time_partitioning.type
         )
@@ -305,7 +286,7 @@ class BigQueryStreamingV2Destination(AbstractDestination):
         try:
             table = self.bq_client.create_table(table)
         except Conflict:
-            table = self.bq_client.get_table(self.
+            table = self.bq_client.get_table(self.temp_table_id)
             # Compare and update schema if needed
             existing_fields = {field.name: field for field in table.schema}
             new_fields = {field.name: field for field in self.get_bigquery_schema()}
@@ -319,12 +300,13 @@ class BigQueryStreamingV2Destination(AbstractDestination):
         table.schema = updated_schema
         table = self.bq_client.update_table(table, ["schema"])

-        # Create the stream
-
-
+        # Create the stream (use temp_table_id for staging)
+        temp_table_parts = self.temp_table_id.split(".")
+        if len(temp_table_parts) == 3:
+            project, dataset, table_name = temp_table_parts
             parent = BigQueryWriteClient.table_path(project, dataset, table_name)
         else:
-            parent = BigQueryWriteClient.table_path(self.project_id, self.dataset_id,
+            parent = BigQueryWriteClient.table_path(self.project_id, self.dataset_id, temp_table_parts[-1])

         stream_name = f"{parent}/_default"

@@ -333,9 +315,7 @@ class BigQueryStreamingV2Destination(AbstractDestination):

         if self.config.unnest:
             serialized_rows = [
-                self.to_protobuf_serialization(
-                    TableRowClass=TableRow, row=self.safe_cast_record_values(orjson.loads(row))
-                )
+                self.to_protobuf_serialization(TableRowClass=TableRow, row=orjson.loads(row))
                 for row in df_destination_records["source_data"].to_list()
             ]
         else:
@@ -442,3 +422,29 @@ class BigQueryStreamingV2Destination(AbstractDestination):
         if large_rows:
             logger.warning(f"Yielding large rows batch of {len(large_rows)} rows")
             yield {"stream_batch": current_batch, "json_batch": large_rows}
+
+    def finalize(self):
+        """Finalize the sync by moving data from temp table to main table based on sync mode."""
+        if self.sync_metadata.sync_mode == SourceSyncModes.FULL_REFRESH:
+            # Replace main table with temp table data
+            logger.info(f"Loading temp table {self.temp_table_id} data into {self.table_id} ...")
+            self.bq_client.query(
+                f"CREATE OR REPLACE TABLE {self.table_id} AS SELECT * FROM {self.temp_table_id}"
+            ).result()
+            logger.info(f"Deleting temp table {self.temp_table_id} ...")
+            self.bq_client.delete_table(self.temp_table_id, not_found_ok=True)
+            return True
+
+        elif self.sync_metadata.sync_mode == SourceSyncModes.INCREMENTAL:
+            # Append data from incremental temp table to main table
+            logger.info(f"Appending data from {self.temp_table_id} to {self.table_id} ...")
+            self.bq_client.query(f"INSERT INTO {self.table_id} SELECT * FROM {self.temp_table_id}").result()
+            logger.info(f"Deleting incremental temp table {self.temp_table_id} ...")
+            self.bq_client.delete_table(self.temp_table_id, not_found_ok=True)
+            return True
+
+        elif self.sync_metadata.sync_mode == SourceSyncModes.STREAM:
+            # Direct writes, no finalization needed
+            return True
+
+        return True
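The finalize step differs between the two staged modes only in the SQL it runs; a condensed, illustrative dispatch over the same client calls (placeholder names, not the package's code):

from google.cloud import bigquery

def promote_staging(client: bigquery.Client, main_table: str, temp_table: str, sync_mode: str) -> None:
    # full_refresh replaces the target; incremental appends to it; stream already wrote directly.
    if sync_mode == "full_refresh":
        client.query(f"CREATE OR REPLACE TABLE `{main_table}` AS SELECT * FROM `{temp_table}`").result()
    elif sync_mode == "incremental":
        client.query(f"INSERT INTO `{main_table}` SELECT * FROM `{temp_table}`").result()
    else:
        return
    client.delete_table(temp_table, not_found_ok=True)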
bizon/connectors/destinations/file/config/file_incremental.example.yml
ADDED
@@ -0,0 +1,22 @@
+name: dummy to file (incremental)
+
+source:
+  name: dummy
+  stream: creatures
+  sync_mode: incremental
+  cursor_field: updated_at # Field to filter records by timestamp
+  authentication:
+    type: api_key
+    params:
+      token: dummy_key
+
+destination:
+  name: file
+  config:
+    format: json
+
+# How incremental sync works with file destination:
+# 1. First run: Behaves like full_refresh (creates new file)
+# 2. Subsequent runs: Only fetches records where cursor_field > last_run
+# 3. New records are appended to the existing JSON file
+# 4. Writes to temp file (_incremental.json) then appends to main file on finalize
bizon/connectors/destinations/file/src/destination.py
CHANGED
@@ -1,13 +1,17 @@
+import os
+import shutil
 from typing import Tuple

 import orjson
 import polars as pl
+from loguru import logger

 from bizon.common.models import SyncMetadata
 from bizon.destination.destination import AbstractDestination
 from bizon.engine.backend.backend import AbstractBackend
 from bizon.monitoring.monitor import AbstractMonitor
 from bizon.source.callback import AbstractSourceCallback
+from bizon.source.config import SourceSyncModes

 from .config import FileDestinationDetailsConfig

@@ -24,6 +28,30 @@ class FileDestination(AbstractDestination):
         super().__init__(sync_metadata, config, backend, source_callback, monitor)
         self.config: FileDestinationDetailsConfig = config

+    @property
+    def file_path(self) -> str:
+        """Main output file path."""
+        return f"{self.destination_id}.json"
+
+    @property
+    def temp_file_path(self) -> str:
+        """Temp file path for FULL_REFRESH mode."""
+        if self.sync_metadata.sync_mode == SourceSyncModes.FULL_REFRESH.value:
+            return f"{self.destination_id}_temp.json"
+        elif self.sync_metadata.sync_mode == SourceSyncModes.INCREMENTAL.value:
+            return f"{self.destination_id}_incremental.json"
+        return self.file_path
+
+    @property
+    def write_path(self) -> str:
+        """Get the path to write to based on sync mode."""
+        if self.sync_metadata.sync_mode in [
+            SourceSyncModes.FULL_REFRESH.value,
+            SourceSyncModes.INCREMENTAL.value,
+        ]:
+            return self.temp_file_path
+        return self.file_path
+
     def check_connection(self) -> bool:
         return True

@@ -34,7 +62,7 @@ class FileDestination(AbstractDestination):
         if self.config.unnest:
             schema_keys = set([column.name for column in self.record_schemas[self.destination_id]])

-            with open(
+            with open(self.write_path, "a") as f:
                 for value in [orjson.loads(data) for data in df_destination_records["source_data"].to_list()]:
                     assert set(value.keys()) == schema_keys, "Keys do not match the schema"

@@ -46,6 +74,35 @@ class FileDestination(AbstractDestination):
                     f.write(f"{orjson.dumps(row).decode('utf-8')}\n")

         else:
-
+            # Append mode for incremental, overwrite for full refresh on first write
+            with open(self.write_path, "a") as f:
+                for record in df_destination_records.iter_rows(named=True):
+                    f.write(f"{orjson.dumps(record).decode('utf-8')}\n")

         return True, ""
+
+    def finalize(self) -> bool:
+        """Finalize the sync by moving temp file to main file based on sync mode."""
+        if self.sync_metadata.sync_mode == SourceSyncModes.FULL_REFRESH.value:
+            # Replace main file with temp file
+            if os.path.exists(self.temp_file_path):
+                logger.info(f"File destination: Moving {self.temp_file_path} to {self.file_path}")
+                shutil.move(self.temp_file_path, self.file_path)
+            return True
+
+        elif self.sync_metadata.sync_mode == SourceSyncModes.INCREMENTAL.value:
+            # Append temp file contents to main file
+            if os.path.exists(self.temp_file_path):
+                logger.info(f"File destination: Appending {self.temp_file_path} to {self.file_path}")
+                with open(self.file_path, "a") as main_file:
+                    with open(self.temp_file_path) as temp_file:
+                        main_file.write(temp_file.read())
+                os.remove(self.temp_file_path)
+            return True
+
+        elif self.sync_metadata.sync_mode == SourceSyncModes.STREAM.value:
+            # Direct writes, no finalization needed
+            logger.info("File destination: STREAM sync batch completed")
+            return True
+
+        return True
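The file destination applies the same stage-then-promote idea to plain JSONL files; a toy, self-contained version of that lifecycle (file names invented for the example):

import os
import shutil

def finalize_file(main_path: str, temp_path: str, sync_mode: str) -> None:
    # Toy version: full_refresh swaps the temp file in, incremental appends it, stream wrote directly.
    if not os.path.exists(temp_path):
        return
    if sync_mode == "full_refresh":
        shutil.move(temp_path, main_path)
    elif sync_mode == "incremental":
        with open(main_path, "a") as main_file, open(temp_path) as temp_file:
            main_file.write(temp_file.read())
        os.remove(temp_path)

# Records staged during a run, then promoted once at the end:
with open("creatures_incremental.json", "w") as f:
    f.write('{"id": 1}\n{"id": 2}\n')
finalize_file("creatures.json", "creatures_incremental.json", "incremental")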
bizon/connectors/destinations/logger/config/logger_incremental.example.yml
ADDED
@@ -0,0 +1,21 @@
+name: dummy to logger (incremental)
+
+source:
+  name: dummy
+  stream: creatures
+  sync_mode: incremental
+  cursor_field: updated_at # Field to filter records by timestamp
+  authentication:
+    type: api_key
+    params:
+      token: dummy_key
+
+destination:
+  name: logger
+  config:
+    dummy: dummy
+
+# How incremental sync works:
+# 1. First run: Behaves like full_refresh (fetches all data)
+# 2. Subsequent runs: Only fetches records where cursor_field > last_run
+# 3. Logger outputs records with [incremental] prefix for easy identification
bizon/connectors/destinations/logger/src/destination.py
CHANGED
@@ -8,6 +8,7 @@ from bizon.destination.destination import AbstractDestination
 from bizon.engine.backend.backend import AbstractBackend
 from bizon.monitoring.monitor import AbstractMonitor
 from bizon.source.callback import AbstractSourceCallback
+from bizon.source.config import SourceSyncModes

 from .config import LoggerDestinationConfig

@@ -36,6 +37,17 @@ class LoggerDestination(AbstractDestination):
         return True

     def write_records(self, df_destination_records: pl.DataFrame) -> Tuple[bool, str]:
+        sync_mode_label = f"[{self.sync_metadata.sync_mode}]" if self.sync_metadata.sync_mode else ""
         for record in df_destination_records.iter_rows(named=True):
-            logger.info(record[
+            logger.info(f"{sync_mode_label} {record['source_data']}")
         return True, ""
+
+    def finalize(self) -> bool:
+        """Finalize the sync - logs completion message based on sync mode."""
+        if self.sync_metadata.sync_mode == SourceSyncModes.FULL_REFRESH.value:
+            logger.info("Logger destination: FULL_REFRESH sync completed")
+        elif self.sync_metadata.sync_mode == SourceSyncModes.INCREMENTAL.value:
+            logger.info("Logger destination: INCREMENTAL sync completed (records appended)")
+        elif self.sync_metadata.sync_mode == SourceSyncModes.STREAM.value:
+            logger.info("Logger destination: STREAM sync batch completed")
+        return True
bizon/connectors/sources/gsheets/config/service_account_incremental.example.yml
ADDED
@@ -0,0 +1,51 @@
+name: gsheets incremental sync
+
+source:
+  name: gsheets
+  stream: worksheet
+  sync_mode: incremental
+  cursor_field: updated_at # Column name in your sheet containing timestamps
+  spreadsheet_url: <MY_SPREADSHEET_URL>
+  worksheet_name: Sheet1
+  service_account_key: >-
+    {
+      "type": "service_account",
+      "project_id": "<MY_GCP_PROJECT>",
+      "private_key_id": "xxx",
+      "private_key": "-----BEGIN PRIVATE KEY-----\n...\n-----END PRIVATE KEY-----\n",
+      "client_email": "bizon@<MY_GCP_PROJECT>.iam.gserviceaccount.com",
+      "client_id": "999999999999",
+      "auth_uri": "https://accounts.google.com/o/oauth2/auth",
+      "token_uri": "https://oauth2.googleapis.com/token",
+      "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
+      "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/...",
+      "universe_domain": "googleapis.com"
+    }
+
+destination:
+  name: bigquery
+  config:
+    project_id: <MY_GCP_PROJECT>
+    dataset_id: gsheets_data
+    dataset_location: US
+    gcs_buffer_bucket: <MY_GCS_BUCKET>
+    gcs_buffer_format: parquet
+
+engine:
+  backend:
+    type: bigquery
+    database: <MY_GCP_PROJECT>
+    schema: bizon_backend
+    syncCursorInDBEvery: 2
+
+# Incremental sync for Google Sheets:
+# - First run: Fetches all rows (full refresh behavior)
+# - Subsequent runs: Only fetches rows where cursor_field > last_run
+#
+# IMPORTANT: Your Google Sheet must have a timestamp column for incremental sync.
+# Common patterns:
+# - Add an "updated_at" column with formula: =NOW() (updates on edit)
+# - Use Google Apps Script to auto-update timestamps on row changes
+# - Manually maintain a "last_modified" column
+#
+# If your sheet doesn't have timestamps, use sync_mode: full_refresh instead.
bizon/connectors/sources/hubspot/config/api_key_incremental.example.yml
ADDED
@@ -0,0 +1,40 @@
+name: hubspot contacts incremental sync
+
+source:
+  name: hubspot
+  stream: contacts
+  sync_mode: incremental
+  cursor_field: updatedAt # HubSpot's timestamp field for contacts
+  properties:
+    strategy: all
+  authentication:
+    type: api_key
+    params:
+      token: <MY_API_KEY>
+
+destination:
+  name: bigquery
+  config:
+    project_id: <MY_GCP_PROJECT>
+    dataset_id: hubspot_data
+    dataset_location: US
+    gcs_buffer_bucket: <MY_GCS_BUCKET>
+    gcs_buffer_format: parquet
+
+engine:
+  backend:
+    type: bigquery
+    database: <MY_GCP_PROJECT>
+    schema: bizon_backend
+    syncCursorInDBEvery: 2
+
+# Incremental sync for HubSpot:
+# - First run: Fetches all contacts (full refresh behavior)
+# - Subsequent runs: Only fetches contacts where updatedAt > last_run
+#
+# Common cursor fields by stream:
+# - contacts: updatedAt
+# - companies: updatedAt
+# - deals: updatedAt
+# - tickets: updatedAt
+# - products: updatedAt
bizon/connectors/sources/notion/config/api_key_incremental.example.yml
ADDED
@@ -0,0 +1,48 @@
+name: notion pages incremental sync
+
+source:
+  name: notion
+  stream: pages # Options: databases, data_sources, pages, blocks, users
+  sync_mode: incremental
+  cursor_field: last_edited_time # Notion's timestamp field
+  authentication:
+    type: api_key
+    params:
+      token: secret_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx # Your Notion integration token
+
+  # List of database IDs to fetch data from
+  database_ids:
+    - "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
+
+  # Number of results per API call (1-100, default: 100)
+  page_size: 100
+
+destination:
+  name: bigquery
+  config:
+    project_id: <MY_GCP_PROJECT>
+    dataset_id: notion_data
+    dataset_location: US
+    gcs_buffer_bucket: <MY_GCS_BUCKET>
+    gcs_buffer_format: parquet
+
+engine:
+  backend:
+    type: bigquery
+    database: <MY_GCP_PROJECT>
+    schema: bizon_backend
+    syncCursorInDBEvery: 2
+
+# Incremental sync for Notion:
+# - First run: Fetches all pages/databases (full refresh behavior)
+# - Subsequent runs: Only fetches items where last_edited_time > last_run
+#
+# Supported streams for incremental sync:
+# - pages, all_pages: Uses Search API with last_edited_time filter
+# - databases, all_databases: Uses Search API to find updated data_sources
+# - blocks: First finds updated pages, then fetches their blocks
+# - blocks_markdown, all_blocks_markdown: Same as blocks, converts to markdown
+#
+# Not supported (falls back to full refresh):
+# - users: No timestamp filter available
+# - data_sources: Use databases stream instead
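As an illustration of what "Search API with last_edited_time filter" can amount to in practice, here is a generic sketch against the public Notion API (not the connector's implementation; the token and cursor values are placeholders): sort search results newest-first by last_edited_time and stop paginating once results fall behind the saved cursor.

import requests

NOTION_TOKEN = "secret_xxx"            # placeholder integration token
LAST_RUN = "2024-01-01T00:00:00.000Z"  # placeholder cursor from the previous run

response = requests.post(
    "https://api.notion.com/v1/search",
    headers={
        "Authorization": f"Bearer {NOTION_TOKEN}",
        "Notion-Version": "2022-06-28",
    },
    json={
        "filter": {"property": "object", "value": "page"},
        "sort": {"direction": "descending", "timestamp": "last_edited_time"},
        "page_size": 100,
    },
)
response.raise_for_status()

# Results arrive newest-first, so stop at the first page that predates the cursor.
updated_pages = []
for page in response.json()["results"]:
    if page["last_edited_time"] <= LAST_RUN:  # ISO-8601 strings compare lexicographically here
        break
    updated_pages.append(page)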