bizon-0.1.2-py3-none-any.whl → bizon-0.2.0-py3-none-any.whl

This diff shows the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
Files changed (77)
  1. bizon/alerting/alerts.py +0 -1
  2. bizon/common/models.py +182 -4
  3. bizon/connectors/destinations/bigquery/src/config.py +0 -1
  4. bizon/connectors/destinations/bigquery/src/destination.py +11 -8
  5. bizon/connectors/destinations/bigquery_streaming/config/bigquery_streaming.example.yml +74 -0
  6. bizon/connectors/destinations/bigquery_streaming/src/destination.py +4 -5
  7. bizon/connectors/destinations/bigquery_streaming_v2/config/bigquery_streaming_v2.example.yml +79 -0
  8. bizon/connectors/destinations/bigquery_streaming_v2/src/destination.py +4 -6
  9. bizon/connectors/destinations/file/config/file.example.yml +40 -0
  10. bizon/connectors/destinations/file/src/config.py +1 -1
  11. bizon/connectors/destinations/file/src/destination.py +0 -5
  12. bizon/connectors/destinations/logger/config/logger.example.yml +30 -0
  13. bizon/connectors/destinations/logger/src/config.py +0 -2
  14. bizon/connectors/destinations/logger/src/destination.py +1 -2
  15. bizon/connectors/sources/cycle/src/source.py +2 -6
  16. bizon/connectors/sources/dummy/src/source.py +0 -4
  17. bizon/connectors/sources/gsheets/src/source.py +2 -3
  18. bizon/connectors/sources/hubspot/src/hubspot_base.py +0 -1
  19. bizon/connectors/sources/hubspot/src/hubspot_objects.py +3 -4
  20. bizon/connectors/sources/hubspot/src/models/hs_object.py +0 -1
  21. bizon/connectors/sources/kafka/config/kafka_streams.example.yml +124 -0
  22. bizon/connectors/sources/kafka/src/config.py +10 -6
  23. bizon/connectors/sources/kafka/src/decode.py +2 -2
  24. bizon/connectors/sources/kafka/src/source.py +147 -46
  25. bizon/connectors/sources/notion/config/api_key.example.yml +35 -0
  26. bizon/connectors/sources/notion/src/__init__.py +0 -0
  27. bizon/connectors/sources/notion/src/config.py +59 -0
  28. bizon/connectors/sources/notion/src/source.py +1159 -0
  29. bizon/connectors/sources/notion/tests/notion_pipeline.py +7 -0
  30. bizon/connectors/sources/notion/tests/test_notion.py +113 -0
  31. bizon/connectors/sources/periscope/src/source.py +0 -6
  32. bizon/connectors/sources/pokeapi/src/source.py +0 -1
  33. bizon/connectors/sources/sana_ai/config/sana.example.yml +25 -0
  34. bizon/connectors/sources/sana_ai/src/source.py +85 -0
  35. bizon/destination/buffer.py +0 -1
  36. bizon/destination/config.py +0 -1
  37. bizon/destination/destination.py +1 -4
  38. bizon/engine/backend/adapters/sqlalchemy/backend.py +2 -5
  39. bizon/engine/backend/adapters/sqlalchemy/config.py +0 -1
  40. bizon/engine/config.py +0 -1
  41. bizon/engine/engine.py +0 -1
  42. bizon/engine/pipeline/consumer.py +0 -1
  43. bizon/engine/pipeline/producer.py +1 -5
  44. bizon/engine/queue/adapters/kafka/config.py +1 -1
  45. bizon/engine/queue/adapters/kafka/queue.py +0 -1
  46. bizon/engine/queue/adapters/python_queue/consumer.py +0 -1
  47. bizon/engine/queue/adapters/python_queue/queue.py +0 -2
  48. bizon/engine/queue/adapters/rabbitmq/consumer.py +0 -1
  49. bizon/engine/queue/adapters/rabbitmq/queue.py +0 -1
  50. bizon/engine/queue/config.py +0 -2
  51. bizon/engine/runner/adapters/process.py +0 -2
  52. bizon/engine/runner/adapters/streaming.py +55 -1
  53. bizon/engine/runner/adapters/thread.py +0 -2
  54. bizon/engine/runner/config.py +0 -1
  55. bizon/engine/runner/runner.py +0 -2
  56. bizon/monitoring/datadog/monitor.py +5 -3
  57. bizon/monitoring/noop/monitor.py +1 -1
  58. bizon/source/auth/authenticators/abstract_oauth.py +11 -3
  59. bizon/source/auth/authenticators/abstract_token.py +2 -1
  60. bizon/source/auth/authenticators/basic.py +1 -1
  61. bizon/source/auth/authenticators/cookies.py +2 -1
  62. bizon/source/auth/authenticators/oauth.py +8 -3
  63. bizon/source/config.py +0 -2
  64. bizon/source/cursor.py +8 -16
  65. bizon/source/discover.py +3 -6
  66. bizon/source/models.py +0 -1
  67. bizon/source/session.py +0 -1
  68. bizon/source/source.py +17 -2
  69. bizon/transform/config.py +0 -2
  70. bizon/transform/transform.py +0 -3
  71. {bizon-0.1.2.dist-info → bizon-0.2.0.dist-info}/METADATA +62 -42
  72. bizon-0.2.0.dist-info/RECORD +136 -0
  73. {bizon-0.1.2.dist-info → bizon-0.2.0.dist-info}/WHEEL +1 -1
  74. bizon-0.2.0.dist-info/entry_points.txt +2 -0
  75. bizon-0.1.2.dist-info/RECORD +0 -123
  76. bizon-0.1.2.dist-info/entry_points.txt +0 -3
  77. {bizon-0.1.2.dist-info → bizon-0.2.0.dist-info/licenses}/LICENSE +0 -0
bizon/connectors/sources/cycle/src/source.py
@@ -60,9 +60,7 @@ class CycleSource(AbstractSource):
  cursor: "PAGINATION_CURSOR"
  direction: AFTER
  }
- """.replace(
- "PAGINATION_CURSOR", pagination.get("endCursor")
- )
+ """.replace("PAGINATION_CURSOR", pagination.get("endCursor"))

  return pagination_str

@@ -99,9 +97,7 @@ class CycleSource(AbstractSource):
  }
  }
  }
- """.replace(
- "PAGINATION_STRING", pagination_str
- )
+ """.replace("PAGINATION_STRING", pagination_str)

  variables = {"slug": self.config.slug}

bizon/connectors/sources/dummy/src/source.py
@@ -1,4 +1,3 @@
- import random
  from typing import List, Literal, Tuple, Union

  from pydantic import Field
@@ -28,7 +27,6 @@ class DummySourceConfig(SourceConfig):


  class DummySource(AbstractSource):
-
  def __init__(self, config: DummySourceConfig):
  super().__init__(config)
  self.config = config
@@ -46,7 +44,6 @@ class DummySource(AbstractSource):
  return f"https://api.dummy.com/v1/{self.config.stream}"

  def get_authenticator(self) -> AuthBase:
-
  if self.config.authentication.type == AuthType.OAUTH:
  return AuthBuilder.oauth2(
  params=Oauth2AuthParams(
@@ -69,7 +66,6 @@ class DummySource(AbstractSource):
  return 5

  def get(self, pagination: dict = None) -> SourceIteration:
-
  response: dict = None

  # If no pagination data is passed, we want to reach first page
bizon/connectors/sources/gsheets/src/source.py
@@ -1,6 +1,7 @@
  import json
  import re
- from typing import Any, Counter, List, Tuple, Type
+ from collections import Counter
+ from typing import Any, List, Tuple
  from uuid import uuid4

  import google.auth
@@ -34,7 +35,6 @@ class GsheetsSourceConfig(SourceConfig):


  class GsheetsSource(AbstractSource):
-
  def __init__(self, config: GsheetsSourceConfig):
  super().__init__(config)
  self.config: GsheetsSourceConfig = config
@@ -49,7 +49,6 @@ class GsheetsSource(AbstractSource):
  return GsheetsSourceConfig

  def get_gspread_client(self) -> gspread.client.Client:
-
  if self.config.service_account_key:
  # use creds to create a client to interact with the Google Drive API
  credentials_dict = json.loads(self.config.service_account_key)
bizon/connectors/sources/hubspot/src/hubspot_base.py
@@ -19,7 +19,6 @@ URL_TOKEN_REFRESH = f"{URL_BASE}/oauth/v1/token"


  class HubSpotBaseSource(AbstractSource, ABC):
-
  def get_session(self) -> Session:
  """Apply custom strategy for HubSpot"""
  session = Session()
bizon/connectors/sources/hubspot/src/hubspot_objects.py
@@ -1,6 +1,7 @@
  import json
+ from collections.abc import Generator
  from enum import Enum
- from typing import Any, Generator, List, Optional
+ from typing import List, Optional

  from loguru import logger
  from pydantic import BaseModel, Field
@@ -31,7 +32,6 @@ class HubSpotSourceConfig(SourceConfig):


  class HubSpotObjectsSource(HubSpotBaseSource):
-
  api_version = "v3"

  object_path = f"crm/{api_version}/objects"
@@ -75,7 +75,6 @@ class HubSpotObjectsSource(HubSpotBaseSource):
  payload: Optional[dict] = None,
  headers=None,
  ) -> Generator[dict, None, None]:
-
  # Call HubSpot API
  response = self.session.call(
  method=method,
@@ -164,7 +163,7 @@ class HubSpotObjectsSource(HubSpotBaseSource):
  payload={"filterGroups": [{"filters": [{"operator": "HAS_PROPERTY", "propertyName": "hs_object_id"}]}]},
  )
  total = search_response["total"]
- logger.info(f"Number of {self.object} in HubSpot: {'{:,}'.format(total).replace(',', ' ')}")
+ logger.info(f"Number of {self.object} in HubSpot: {f'{total:,}'.replace(',', ' ')}")
  return total

  def list_properties(self) -> AllObjectProperties:
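Note on the logger.info change above: it only swaps str.format for a nested f-string; the rendered output is identical. A minimal illustration (not part of the package):

total = 1234567
# Both styles add comma thousands separators, then replace commas with spaces -> "1 234 567"
old_style = "{:,}".format(total).replace(",", " ")
new_style = f"{total:,}".replace(",", " ")
assert old_style == new_style == "1 234 567"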
bizon/connectors/sources/hubspot/src/models/hs_object.py
@@ -40,7 +40,6 @@ class HubSpotObject(BaseModel):
  cls,
  raw_obj: dict,
  ):
-
  properties = {}

  for property_name, property_value in raw_obj.get("properties", {}).items():
bizon/connectors/sources/kafka/config/kafka_streams.example.yml (new file)
@@ -0,0 +1,124 @@
+ # Example: Kafka source with streams configuration
+ # This demonstrates the unified streams config that consolidates
+ # topic-to-destination mapping with schema definitions.
+
+ name: kafka_streams_example
+
+ source:
+   name: kafka
+   stream: topic
+   sync_mode: stream
+   # No topics needed - they are automatically extracted from streams config
+   nb_bytes_schema_id: 4
+   timestamp_ms_name: ts_ms
+   batch_size: 100
+   consumer_timeout: 30
+   bootstrap_servers: your-kafka-broker:9092
+   group_id: your-consumer-group
+   authentication:
+     type: basic
+     schema_registry_url: https://your-schema-registry:8081
+     params:
+       username: your-kafka-username
+       password: your-kafka-password
+
+ destination:
+   name: bigquery_streaming_v2
+   config:
+     dataset_id: your_dataset
+     dataset_location: US
+     project_id: your-gcp-project
+     unnest: true
+     time_partitioning:
+       type: DAY
+       field: __inserted_at
+
+ # Streams configuration - consolidates topic -> table -> schema mapping
+ # Each stream defines:
+ #   - source: where to read from (topic for Kafka)
+ #   - destination: where to write (table_id + schema)
+ streams:
+   - name: "users"
+     source:
+       topic: "cdc.public.users"
+     destination:
+       table_id: "your-gcp-project.your_dataset.users"
+       clustering_keys:
+         - "id"
+       record_schema:
+         - name: "id"
+           type: "INTEGER"
+           mode: "REQUIRED"
+         - name: "email"
+           type: "STRING"
+           mode: "NULLABLE"
+         - name: "payload"
+           type: "JSON"
+           mode: "NULLABLE"
+         - name: "__operation"
+           type: "STRING"
+           mode: "NULLABLE"
+         - name: "__deleted"
+           type: "BOOLEAN"
+           mode: "NULLABLE"
+         - name: "__kafka_partition"
+           type: "INTEGER"
+           mode: "NULLABLE"
+         - name: "__kafka_offset"
+           type: "INTEGER"
+           mode: "NULLABLE"
+         - name: "__kafka_topic"
+           type: "STRING"
+           mode: "NULLABLE"
+         - name: "__event_timestamp"
+           type: "TIMESTAMP"
+           mode: "NULLABLE"
+         - name: "__inserted_at"
+           type: "TIMESTAMP"
+           mode: "NULLABLE"
+           default_value_expression: "CURRENT_TIMESTAMP()"
+
+   - name: "orders"
+     source:
+       topic: "cdc.public.orders"
+     destination:
+       table_id: "your-gcp-project.your_dataset.orders"
+       clustering_keys:
+         - "id"
+         - "user_id"
+       record_schema:
+         - name: "id"
+           type: "INTEGER"
+           mode: "REQUIRED"
+         - name: "user_id"
+           type: "INTEGER"
+           mode: "REQUIRED"
+         - name: "payload"
+           type: "JSON"
+           mode: "NULLABLE"
+         - name: "__operation"
+           type: "STRING"
+           mode: "NULLABLE"
+         - name: "__deleted"
+           type: "BOOLEAN"
+           mode: "NULLABLE"
+         - name: "__kafka_partition"
+           type: "INTEGER"
+           mode: "NULLABLE"
+         - name: "__kafka_offset"
+           type: "INTEGER"
+           mode: "NULLABLE"
+         - name: "__kafka_topic"
+           type: "STRING"
+           mode: "NULLABLE"
+         - name: "__event_timestamp"
+           type: "TIMESTAMP"
+           mode: "NULLABLE"
+         - name: "__inserted_at"
+           type: "TIMESTAMP"
+           mode: "NULLABLE"
+           default_value_expression: "CURRENT_TIMESTAMP()"
+
+ engine:
+   runner:
+     type: stream  # Required when using streams config
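For orientation: each entry under streams pairs one Kafka topic with one destination table and its schema. As a rough sketch (the class below is a simplified stand-in, not bizon's actual TopicConfig), the mapping performed by set_streams_config() further down in this diff reduces to:

from pydantic import BaseModel

class TopicConfigSketch(BaseModel):
    """Simplified stand-in for the Kafka source's TopicConfig."""
    name: str
    destination_id: str

# The "users" stream in the YAML above becomes:
users_topic = TopicConfigSketch(
    name="cdc.public.users",                               # streams[0].source.topic
    destination_id="your-gcp-project.your_dataset.users",  # streams[0].destination.table_id
)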
bizon/connectors/sources/kafka/src/config.py
@@ -1,5 +1,6 @@
+ from collections.abc import Mapping
  from enum import Enum
- from typing import Any, List, Literal, Mapping
+ from typing import Any, List, Literal, Optional

  from pydantic import BaseModel, Field

@@ -17,7 +18,6 @@ class MessageEncoding(str, Enum):


  class KafkaAuthConfig(AuthConfig):
-
  type: Literal[AuthType.BASIC] = AuthType.BASIC # username and password authentication

  # Schema registry authentication
@@ -45,16 +45,20 @@ class TopicConfig(BaseModel):


  class KafkaSourceConfig(SourceConfig):
-
- # Mandatory Kafka configuration
- topics: List[TopicConfig] = Field(..., description="Kafka topic, comma separated")
+ # Kafka configuration
+ topics: Optional[List[TopicConfig]] = Field(
+ default=[],
+ description="Kafka topics. Can be empty if using streams configuration to define topics.",
+ )
  bootstrap_servers: str = Field(..., description="Kafka bootstrap servers")
  group_id: str = Field(default="bizon", description="Kafka group id")

  skip_message_empty_value: bool = Field(
  default=True, description="Skip messages with empty value (tombstone messages)"
  )
-
+ skip_message_invalid_keys: bool = Field(
+ default=False, description="Skip messages with invalid keys (unparsable JSON keys)"
+ )
  # Kafka consumer configuration
  batch_size: int = Field(100, description="Kafka batch size, number of messages to fetch at once.")
  consumer_timeout: int = Field(10, description="Kafka consumer timeout in seconds, before returning batch.")
bizon/connectors/sources/kafka/src/decode.py
@@ -1,6 +1,6 @@
  import io
  import struct
- from functools import lru_cache
+ from functools import cache
  from typing import Tuple, Union

  import fastavro
@@ -20,7 +20,7 @@ class Hashabledict(dict):
  return hash(frozenset(self.items()))


- @lru_cache(maxsize=None)
+ @cache
  def parse_global_id_from_serialized_message(message: bytes) -> Tuple[int, int]:
  """
  Parse the global id from the serialized message.
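Context for the swap above: functools.cache (Python 3.9+) is an unbounded lru_cache(maxsize=None), so behaviour is unchanged. Both require hashable arguments, which is what the Hashabledict wrapper in this file provides for dict-shaped schemas; a minimal illustration (not bizon code):

from functools import cache

class Hashabledict(dict):
    # A dict that can be hashed, so it can flow through cached functions
    def __hash__(self):
        return hash(frozenset(self.items()))

@cache  # equivalent to @lru_cache(maxsize=None)
def schema_field_count(schema: Hashabledict) -> int:
    return len(schema)

print(schema_field_count(Hashabledict(id="int", email="str")))  # 2, cached on repeat calls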
bizon/connectors/sources/kafka/src/source.py
@@ -1,7 +1,8 @@
  import traceback
+ from collections.abc import Mapping
  from datetime import datetime
- from functools import lru_cache
- from typing import Any, List, Mapping, Tuple
+ from functools import cache
+ from typing import Any, List, Tuple

  import orjson
  from avro.schema import Schema, parse
@@ -59,13 +60,25 @@ class TopicOffsets(BaseModel):
  return sum([partition.last for partition in self.partitions.values()])


- class KafkaSource(AbstractSource):
+ def on_error(err: KafkaError):
+ # Fires for client-level errors (incl. DNS resolve failures)
+ if err.fatal():
+ logger.error(f"Kafka client error: {err} | fatal={err.fatal()} retriable={err.retriable()}")
+ raise KafkaException(err)
+ else:
+ logger.warning(f"Kafka client error: {err} | fatal={err.fatal()} retriable={err.retriable()}")
+

+ class KafkaSource(AbstractSource):
  def __init__(self, config: KafkaSourceConfig):
  super().__init__(config)

  self.config: KafkaSourceConfig = config

+ # Ensure topics is always a list (not None)
+ if self.config.topics is None:
+ self.config.topics = []
+
  # Kafka consumer configuration.
  if self.config.authentication.type == AuthType.BASIC:
  self.config.consumer_config["sasl.mechanisms"] = "PLAIN"
@@ -76,12 +89,58 @@ class KafkaSource(AbstractSource):
  self.config.consumer_config["group.id"] = self.config.group_id
  self.config.consumer_config["bootstrap.servers"] = self.config.bootstrap_servers

+ # Set the error callback
+ self.config.consumer_config["error_cb"] = on_error
+
  # Consumer instance
  self.consumer = Consumer(self.config.consumer_config)

  # Map topic_name to destination_id
  self.topic_map = {topic.name: topic.destination_id for topic in self.config.topics}

+ def set_streams_config(self, streams: list) -> None:
+ """Configure Kafka topics from streams config.
+
+ This method enriches self.config.topics from the streams configuration,
+ ensuring that subsequent source instantiations (e.g., in init_job) have
+ access to the topics without duplication in the YAML config.
+
+ When a top-level 'streams' configuration is present, this method:
+ 1. Extracts Kafka topics from streams (topic field)
+ 2. Builds TopicConfig objects with destination_id from streams
+ 3. Populates self.config.topics if empty (modifies bizon_config.source in-place)
+ 4. Updates topic_map for record routing
+
+ Args:
+ streams: List of StreamConfig objects from BizonConfig.streams
+ """
+ from .config import TopicConfig
+
+ # Extract topics from streams
+ topics_from_streams = []
+ streams_map = {}
+
+ for stream in streams:
+ if hasattr(stream.source, "topic") and stream.source.topic:
+ topic_name = stream.source.topic
+ streams_map[topic_name] = stream
+
+ # Build TopicConfig from stream
+ topic_config = TopicConfig(name=topic_name, destination_id=stream.destination.table_id)
+ topics_from_streams.append(topic_config)
+
+ # Populate self.config.topics from streams (modifies bizon_config.source in-place)
+ # This ensures check_connection() and subsequent source instantiations have topics
+ if not self.config.topics and topics_from_streams:
+ self.config.topics = topics_from_streams
+ logger.info(f"Kafka: Populated {len(topics_from_streams)} topics from streams config")
+ for topic_config in topics_from_streams:
+ logger.info(f"  - Topic: {topic_config.name} -> {topic_config.destination_id}")
+
+ # Update topic_map with destination table_ids from streams
+ for topic, stream_config in streams_map.items():
+ self.topic_map[topic] = stream_config.destination.table_id
+
  @staticmethod
  def streams() -> List[str]:
  return ["topic"]
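The docstring above, together with the check_connection() error message in the next hunk, implies a call order when the top-level streams config is used. A hedged sketch of that order (illustrative only; kafka_source_config and bizon_config are placeholder names, not bizon APIs):

source = KafkaSource(config=kafka_source_config)  # config.topics may still be empty here
source.set_streams_config(bizon_config.streams)   # fills config.topics and topic_map from the streams entries
ok, error = source.check_connection()             # now sees the topics derived from streams
if not ok:
    raise RuntimeError(error)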
@@ -102,31 +161,52 @@ class KafkaSource(AbstractSource):
  def check_connection(self) -> Tuple[bool | Any | None]:
  """Check the connection to the Kafka source"""

- logger.info(f"Found: {len(self.consumer.list_topics().topics)} topics")
+ # Validate that topics have been configured
+ if not self.config.topics:
+ error_msg = (
+ "No topics configured. Either provide topics in source config or use streams configuration. "
+ "If using streams config, ensure set_streams_config() is called before check_connection()."
+ )
+ logger.error(error_msg)
+ return False, error_msg
+
+ try:
+ # Use a short timeout to avoid hanging on connection issues
+ cluster_metadata = self.consumer.list_topics(timeout=self.config.consumer_timeout)
+ topics = cluster_metadata.topics
+
+ logger.info(f"Found: {len(topics)} topics")

- topics = self.consumer.list_topics().topics
+ config_topics = [topic.name for topic in self.config.topics]

- config_topics = [topic.name for topic in self.config.topics]
+ # Display consumer config
+ # We ignore the key sasl.password and sasl.username
+ consumer_config = self.config.consumer_config.copy()
+ consumer_config.pop("sasl.password", None)
+ consumer_config.pop("sasl.username", None)
+ logger.info(f"Consumer config: {consumer_config}")

- # Display consumer config
- # We ignore the key sasl.password and sasl.username
- consumer_config = self.config.consumer_config.copy()
- consumer_config.pop("sasl.password", None)
- consumer_config.pop("sasl.username", None)
- logger.info(f"Consumer config: {consumer_config}")
+ for topic in config_topics:
+ if topic not in topics:
+ logger.error(f"Topic {topic} not found, available topics: {topics.keys()}")
+ return False, f"Topic {topic} not found"

- for topic in config_topics:
- if topic not in topics:
- logger.error(f"Topic {topic} not found, available topics: {topics.keys()}")
- return False, f"Topic {topic} not found"
+ logger.info(f"Topic {topic} has {len(topics[topic].partitions)} partitions")

- logger.info(f"Topic {topic} has {len(topics[topic].partitions)} partitions")
+ return True, None

- return True, None
+ except KafkaException as e:
+ error_msg = f"Kafka connection failed: {e}"
+ logger.error(error_msg)
+ return False, error_msg
+ except Exception as e:
+ error_msg = f"Connection check failed: {e}"
+ logger.error(error_msg)
+ return False, error_msg

  def get_number_of_partitions(self, topic: str) -> int:
  """Get the number of partitions for the topic"""
- return len(self.consumer.list_topics().topics[topic].partitions)
+ return len(self.consumer.list_topics(timeout=self.config.consumer_timeout).topics[topic].partitions)

  def get_offset_partitions(self, topic: str) -> TopicOffsets:
  """Get the offsets for each partition of the topic"""
@@ -134,7 +214,9 @@ class KafkaSource(AbstractSource):
  partitions: Mapping[int, OffsetPartition] = {}

  for i in range(self.get_number_of_partitions(topic)):
- offsets = self.consumer.get_watermark_offsets(TopicPartition(topic, i))
+ offsets = self.consumer.get_watermark_offsets(
+ TopicPartition(topic, i), timeout=self.config.consumer_timeout
+ )
  partitions[i] = OffsetPartition(first=offsets[0], last=offsets[1])

  return TopicOffsets(name=topic, partitions=partitions)
@@ -147,7 +229,7 @@ class KafkaSource(AbstractSource):
  total_records += self.get_offset_partitions(topic).total_offset
  return total_records

- @lru_cache(maxsize=None)
+ @cache
  def get_schema_from_registry(self, global_id: int) -> Tuple[Hashabledict, Schema]:
  """Get the schema from the registry, return a hashable dict and an avro schema object"""

@@ -194,10 +276,8 @@ class KafkaSource(AbstractSource):
  hashable_dict_schema, avro_schema = self.get_schema_from_registry(global_id=global_id)
  except SchemaNotFound as e:
  logger.error(
- (
- f"Message on topic {message.topic()} partition {message.partition()} at offset {message.offset()} has a SchemaID of {global_id} which is not found in Registry."
- f"message value: {message.value()}."
- )
+ f"Message on topic {message.topic()} partition {message.partition()} at offset {message.offset()} has a SchemaID of {global_id} which is not found in Registry."
+ f"message value: {message.value()}."
  )
  logger.error(traceback.format_exc())
  raise e
@@ -235,35 +315,46 @@ class KafkaSource(AbstractSource):
  records = []

  for message in encoded_messages:
+ MESSAGE_LOG_METADATA = (
+ f"Message for topic {message.topic()} partition {message.partition()} and offset {message.offset()}"
+ )

  if message.error():
  # If the message is too large, we skip it and update the offset
  if message.error().code() == KafkaError.MSG_SIZE_TOO_LARGE:
  logger.error(
- (
- f"Message for topic {message.topic()} partition {message.partition()} and offset {message.offset()} is too large. "
- f"Raised MSG_SIZE_TOO_LARGE, if manually setting the offset, the message might not exist. Double-check in Confluent Cloud."
- )
+ f"{MESSAGE_LOG_METADATA} is too large. "
+ "Raised MSG_SIZE_TOO_LARGE, if manually setting the offset, the message might not exist. Double-check in Confluent Cloud."
  )

- logger.error(
- (
- f"Error while consuming message for topic {message.topic()} partition {message.partition()} and offset {message.offset()}: "
- f"{message.error()}"
- )
- )
+ logger.error(f"{MESSAGE_LOG_METADATA}: {message.error()}")
  raise KafkaException(message.error())

  # We skip tombstone messages
  if self.config.skip_message_empty_value and not message.value():
- logger.debug(
- f"Message for topic {message.topic()} partition {message.partition()} and offset {message.offset()} is empty, skipping."
- )
+ logger.debug(f"{MESSAGE_LOG_METADATA} is empty, skipping.")
  continue

+ # Parse message keys
+ if message.key():
+ try:
+ message_keys = orjson.loads(message.key().decode("utf-8"))
+ except orjson.JSONDecodeError as e:
+ # We skip messages with invalid keys
+ if self.config.skip_message_invalid_keys:
+ logger.warning(f"{MESSAGE_LOG_METADATA} has an invalid key={message.key()}, skipping.")
+ # Skip the message
+ continue
+
+ logger.error(
+ f"{MESSAGE_LOG_METADATA}: Error while parsing message key: {e}, raw key: {message.key()}"
+ )
+ raise e
+ else:
+ message_keys = {}
+
  # Decode the message
  try:
-
  decoded_message, hashable_dict_schema = self.decode(message)

  data = {
@@ -271,7 +362,7 @@ class KafkaSource(AbstractSource):
  "offset": message.offset(),
  "partition": message.partition(),
  "timestamp": message.timestamp()[1],
- "keys": orjson.loads(message.key().decode("utf-8")) if message.key() else {},
+ "keys": message_keys,
  "headers": (
  {key: value.decode("utf-8") for key, value in message.headers()} if message.headers() else {}
  ),
@@ -290,17 +381,27 @@ class KafkaSource(AbstractSource):

  except Exception as e:
  logger.error(
- (
- f"Error while decoding message for topic {message.topic()} on partition {message.partition()}: {e} at offset {message.offset()} "
- f"with value: {message.value()} and key: {message.key()}"
- )
+ f"{MESSAGE_LOG_METADATA}: Error while decoding message: {e} "
+ f"with value: {message.value()} and key: {message.key()}"
  )
- # Try to parse error message from the message
+
+ # Try to parse error message from the message value
  try:
  message_raw_text = message.value().decode("utf-8")
  logger.error(f"Parsed Kafka value: {message_raw_text}")
  except UnicodeDecodeError:
- logger.error("Message is not a valid UTF-8 string")
+ logger.error("Message value is not a valid UTF-8 string")
+
+ # Try to parse error message from the message headers
+ if message.headers():
+ try:
+ headers_dict = {key: value.decode("utf-8") for key, value in message.headers()}
+ logger.error(f"Parsed Kafka headers: {headers_dict}")
+ except UnicodeDecodeError as header_error:
+ logger.error(f"Some message headers are not valid UTF-8 strings: {header_error}")
+ logger.error(f"Raw message headers: {list(message.headers())}")
+ else:
+ logger.error("Message headers are None or empty")

  logger.error(traceback.format_exc())
  raise e
bizon/connectors/sources/notion/config/api_key.example.yml (new file)
@@ -0,0 +1,35 @@
+ # Notion Source Configuration
+ # This example shows how to configure the Notion source connector
+
+ source:
+   name: notion
+   stream: pages # Options: databases, data_sources, pages, blocks, users
+   authentication:
+     type: api_key
+     params:
+       token: secret_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx # Your Notion integration token
+
+   # List of database IDs to fetch data from
+   # Find the ID in the database URL: notion.so/{workspace}/{database_id}?v=...
+   database_ids:
+     - "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
+     - "yyyyyyyy-yyyy-yyyy-yyyy-yyyyyyyyyyyy"
+
+   # List of specific page IDs to fetch (optional)
+   # Find the ID in the page URL: notion.so/{page_id}
+   page_ids:
+     - "zzzzzzzz-zzzz-zzzz-zzzz-zzzzzzzzzzzz"
+
+   # Whether to fetch nested blocks recursively (default: true)
+   # Only applies to blocks stream
+   fetch_blocks_recursively: true
+
+   # Number of results per API call (1-100, default: 100)
+   page_size: 100
+
+ destination:
+   name: bigquery
+   config:
+     project_id: my-project
+     dataset_id: notion_data
+     # ... other destination config
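The comments above describe finding IDs by hand in Notion URLs. Purely as an illustration (this helper is not part of bizon or any Notion client), the trailing 32-hex-character ID can be pulled out of a database URL and re-dashed into the UUID form used by the placeholders in this example:

import re

def notion_database_id(url: str) -> str:
    """Illustrative only: extract the 32-hex-char ID from a notion.so database URL."""
    path = url.split("?")[0]  # drop the "?v=..." query string
    match = re.search(r"[0-9a-f]{32}$", path)
    if not match:
        raise ValueError(f"No database ID found in {url!r}")
    h = match.group(0)
    return f"{h[:8]}-{h[8:12]}-{h[12:16]}-{h[16:20]}-{h[20:]}"

# notion_database_id("https://notion.so/acme/My-DB-0123456789abcdef0123456789abcdef?v=1")
# -> "01234567-89ab-cdef-0123-456789abcdef"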
{bizon-0.1.2.dist-info → bizon-0.2.0.dist-info/licenses}/LICENSE: file moved without content changes.