bizon 0.1.2__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bizon/alerting/alerts.py +0 -1
- bizon/common/models.py +182 -4
- bizon/connectors/destinations/bigquery/src/config.py +0 -1
- bizon/connectors/destinations/bigquery/src/destination.py +11 -8
- bizon/connectors/destinations/bigquery_streaming/config/bigquery_streaming.example.yml +74 -0
- bizon/connectors/destinations/bigquery_streaming/src/destination.py +4 -5
- bizon/connectors/destinations/bigquery_streaming_v2/config/bigquery_streaming_v2.example.yml +79 -0
- bizon/connectors/destinations/bigquery_streaming_v2/src/destination.py +4 -6
- bizon/connectors/destinations/file/config/file.example.yml +40 -0
- bizon/connectors/destinations/file/src/config.py +1 -1
- bizon/connectors/destinations/file/src/destination.py +0 -5
- bizon/connectors/destinations/logger/config/logger.example.yml +30 -0
- bizon/connectors/destinations/logger/src/config.py +0 -2
- bizon/connectors/destinations/logger/src/destination.py +1 -2
- bizon/connectors/sources/cycle/src/source.py +2 -6
- bizon/connectors/sources/dummy/src/source.py +0 -4
- bizon/connectors/sources/gsheets/src/source.py +2 -3
- bizon/connectors/sources/hubspot/src/hubspot_base.py +0 -1
- bizon/connectors/sources/hubspot/src/hubspot_objects.py +3 -4
- bizon/connectors/sources/hubspot/src/models/hs_object.py +0 -1
- bizon/connectors/sources/kafka/config/kafka_streams.example.yml +124 -0
- bizon/connectors/sources/kafka/src/config.py +10 -6
- bizon/connectors/sources/kafka/src/decode.py +2 -2
- bizon/connectors/sources/kafka/src/source.py +147 -46
- bizon/connectors/sources/notion/config/api_key.example.yml +35 -0
- bizon/connectors/sources/notion/src/__init__.py +0 -0
- bizon/connectors/sources/notion/src/config.py +59 -0
- bizon/connectors/sources/notion/src/source.py +1159 -0
- bizon/connectors/sources/notion/tests/notion_pipeline.py +7 -0
- bizon/connectors/sources/notion/tests/test_notion.py +113 -0
- bizon/connectors/sources/periscope/src/source.py +0 -6
- bizon/connectors/sources/pokeapi/src/source.py +0 -1
- bizon/connectors/sources/sana_ai/config/sana.example.yml +25 -0
- bizon/connectors/sources/sana_ai/src/source.py +85 -0
- bizon/destination/buffer.py +0 -1
- bizon/destination/config.py +0 -1
- bizon/destination/destination.py +1 -4
- bizon/engine/backend/adapters/sqlalchemy/backend.py +2 -5
- bizon/engine/backend/adapters/sqlalchemy/config.py +0 -1
- bizon/engine/config.py +0 -1
- bizon/engine/engine.py +0 -1
- bizon/engine/pipeline/consumer.py +0 -1
- bizon/engine/pipeline/producer.py +1 -5
- bizon/engine/queue/adapters/kafka/config.py +1 -1
- bizon/engine/queue/adapters/kafka/queue.py +0 -1
- bizon/engine/queue/adapters/python_queue/consumer.py +0 -1
- bizon/engine/queue/adapters/python_queue/queue.py +0 -2
- bizon/engine/queue/adapters/rabbitmq/consumer.py +0 -1
- bizon/engine/queue/adapters/rabbitmq/queue.py +0 -1
- bizon/engine/queue/config.py +0 -2
- bizon/engine/runner/adapters/process.py +0 -2
- bizon/engine/runner/adapters/streaming.py +55 -1
- bizon/engine/runner/adapters/thread.py +0 -2
- bizon/engine/runner/config.py +0 -1
- bizon/engine/runner/runner.py +0 -2
- bizon/monitoring/datadog/monitor.py +5 -3
- bizon/monitoring/noop/monitor.py +1 -1
- bizon/source/auth/authenticators/abstract_oauth.py +11 -3
- bizon/source/auth/authenticators/abstract_token.py +2 -1
- bizon/source/auth/authenticators/basic.py +1 -1
- bizon/source/auth/authenticators/cookies.py +2 -1
- bizon/source/auth/authenticators/oauth.py +8 -3
- bizon/source/config.py +0 -2
- bizon/source/cursor.py +8 -16
- bizon/source/discover.py +3 -6
- bizon/source/models.py +0 -1
- bizon/source/session.py +0 -1
- bizon/source/source.py +17 -2
- bizon/transform/config.py +0 -2
- bizon/transform/transform.py +0 -3
- {bizon-0.1.2.dist-info → bizon-0.2.0.dist-info}/METADATA +62 -42
- bizon-0.2.0.dist-info/RECORD +136 -0
- {bizon-0.1.2.dist-info → bizon-0.2.0.dist-info}/WHEEL +1 -1
- bizon-0.2.0.dist-info/entry_points.txt +2 -0
- bizon-0.1.2.dist-info/RECORD +0 -123
- bizon-0.1.2.dist-info/entry_points.txt +0 -3
- {bizon-0.1.2.dist-info → bizon-0.2.0.dist-info/licenses}/LICENSE +0 -0
bizon/connectors/sources/cycle/src/source.py

@@ -60,9 +60,7 @@ class CycleSource(AbstractSource):
                 cursor: "PAGINATION_CURSOR"
                 direction: AFTER
             }
-        """.replace(
-            "PAGINATION_CURSOR", pagination.get("endCursor")
-        )
+        """.replace("PAGINATION_CURSOR", pagination.get("endCursor"))

         return pagination_str

@@ -99,9 +97,7 @@ class CycleSource(AbstractSource):
            }
            }
            }
-        """.replace(
-            "PAGINATION_STRING", pagination_str
-        )
+        """.replace("PAGINATION_STRING", pagination_str)

        variables = {"slug": self.config.slug}

bizon/connectors/sources/dummy/src/source.py

@@ -1,4 +1,3 @@
-import random
 from typing import List, Literal, Tuple, Union

 from pydantic import Field
@@ -28,7 +27,6 @@ class DummySourceConfig(SourceConfig):


 class DummySource(AbstractSource):
-
     def __init__(self, config: DummySourceConfig):
         super().__init__(config)
         self.config = config
@@ -46,7 +44,6 @@ class DummySource(AbstractSource):
         return f"https://api.dummy.com/v1/{self.config.stream}"

     def get_authenticator(self) -> AuthBase:
-
         if self.config.authentication.type == AuthType.OAUTH:
             return AuthBuilder.oauth2(
                 params=Oauth2AuthParams(
@@ -69,7 +66,6 @@ class DummySource(AbstractSource):
         return 5

     def get(self, pagination: dict = None) -> SourceIteration:
-
         response: dict = None

         # If no pagination data is passed, we want to reach first page
bizon/connectors/sources/gsheets/src/source.py

@@ -1,6 +1,7 @@
 import json
 import re
-from
+from collections import Counter
+from typing import Any, List, Tuple
 from uuid import uuid4

 import google.auth
@@ -34,7 +35,6 @@ class GsheetsSourceConfig(SourceConfig):


 class GsheetsSource(AbstractSource):
-
     def __init__(self, config: GsheetsSourceConfig):
         super().__init__(config)
         self.config: GsheetsSourceConfig = config
@@ -49,7 +49,6 @@ class GsheetsSource(AbstractSource):
         return GsheetsSourceConfig

     def get_gspread_client(self) -> gspread.client.Client:
-
         if self.config.service_account_key:
             # use creds to create a client to interact with the Google Drive API
             credentials_dict = json.loads(self.config.service_account_key)
bizon/connectors/sources/hubspot/src/hubspot_objects.py

@@ -1,6 +1,7 @@
 import json
+from collections.abc import Generator
 from enum import Enum
-from typing import
+from typing import List, Optional

 from loguru import logger
 from pydantic import BaseModel, Field
@@ -31,7 +32,6 @@ class HubSpotSourceConfig(SourceConfig):


 class HubSpotObjectsSource(HubSpotBaseSource):
-
     api_version = "v3"

     object_path = f"crm/{api_version}/objects"
@@ -75,7 +75,6 @@ class HubSpotObjectsSource(HubSpotBaseSource):
         payload: Optional[dict] = None,
         headers=None,
     ) -> Generator[dict, None, None]:
-
         # Call HubSpot API
         response = self.session.call(
             method=method,
@@ -164,7 +163,7 @@ class HubSpotObjectsSource(HubSpotBaseSource):
             payload={"filterGroups": [{"filters": [{"operator": "HAS_PROPERTY", "propertyName": "hs_object_id"}]}]},
         )
         total = search_response["total"]
-        logger.info(f"Number of {self.object} in HubSpot: {'{:,}'.
+        logger.info(f"Number of {self.object} in HubSpot: {f'{total:,}'.replace(',', ' ')}")
         return total

     def list_properties(self) -> AllObjectProperties:
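The reworked count log above formats the HubSpot total with spaces as thousands separators. A quick standalone check of that exact expression:

# Standalone check of the formatting expression in the new log line:
# a comma-grouped integer with the commas swapped for spaces.
total = 1234567
print(f"{total:,}".replace(",", " "))  # -> 1 234 567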
bizon/connectors/sources/kafka/config/kafka_streams.example.yml (new file)

@@ -0,0 +1,124 @@
+# Example: Kafka source with streams configuration
+# This demonstrates the unified streams config that consolidates
+# topic-to-destination mapping with schema definitions.
+
+name: kafka_streams_example
+
+source:
+  name: kafka
+  stream: topic
+  sync_mode: stream
+  # No topics needed - they are automatically extracted from streams config
+  nb_bytes_schema_id: 4
+  timestamp_ms_name: ts_ms
+  batch_size: 100
+  consumer_timeout: 30
+  bootstrap_servers: your-kafka-broker:9092
+  group_id: your-consumer-group
+  authentication:
+    type: basic
+    schema_registry_url: https://your-schema-registry:8081
+    params:
+      username: your-kafka-username
+      password: your-kafka-password
+
+destination:
+  name: bigquery_streaming_v2
+  config:
+    dataset_id: your_dataset
+    dataset_location: US
+    project_id: your-gcp-project
+    unnest: true
+    time_partitioning:
+      type: DAY
+      field: __inserted_at
+
+# Streams configuration - consolidates topic -> table -> schema mapping
+# Each stream defines:
+# - source: where to read from (topic for Kafka)
+# - destination: where to write (table_id + schema)
+streams:
+  - name: "users"
+    source:
+      topic: "cdc.public.users"
+    destination:
+      table_id: "your-gcp-project.your_dataset.users"
+      clustering_keys:
+        - "id"
+      record_schema:
+        - name: "id"
+          type: "INTEGER"
+          mode: "REQUIRED"
+        - name: "email"
+          type: "STRING"
+          mode: "NULLABLE"
+        - name: "payload"
+          type: "JSON"
+          mode: "NULLABLE"
+        - name: "__operation"
+          type: "STRING"
+          mode: "NULLABLE"
+        - name: "__deleted"
+          type: "BOOLEAN"
+          mode: "NULLABLE"
+        - name: "__kafka_partition"
+          type: "INTEGER"
+          mode: "NULLABLE"
+        - name: "__kafka_offset"
+          type: "INTEGER"
+          mode: "NULLABLE"
+        - name: "__kafka_topic"
+          type: "STRING"
+          mode: "NULLABLE"
+        - name: "__event_timestamp"
+          type: "TIMESTAMP"
+          mode: "NULLABLE"
+        - name: "__inserted_at"
+          type: "TIMESTAMP"
+          mode: "NULLABLE"
+          default_value_expression: "CURRENT_TIMESTAMP()"
+
+  - name: "orders"
+    source:
+      topic: "cdc.public.orders"
+    destination:
+      table_id: "your-gcp-project.your_dataset.orders"
+      clustering_keys:
+        - "id"
+        - "user_id"
+      record_schema:
+        - name: "id"
+          type: "INTEGER"
+          mode: "REQUIRED"
+        - name: "user_id"
+          type: "INTEGER"
+          mode: "REQUIRED"
+        - name: "payload"
+          type: "JSON"
+          mode: "NULLABLE"
+        - name: "__operation"
+          type: "STRING"
+          mode: "NULLABLE"
+        - name: "__deleted"
+          type: "BOOLEAN"
+          mode: "NULLABLE"
+        - name: "__kafka_partition"
+          type: "INTEGER"
+          mode: "NULLABLE"
+        - name: "__kafka_offset"
+          type: "INTEGER"
+          mode: "NULLABLE"
+        - name: "__kafka_topic"
+          type: "STRING"
+          mode: "NULLABLE"
+        - name: "__event_timestamp"
+          type: "TIMESTAMP"
+          mode: "NULLABLE"
+        - name: "__inserted_at"
+          type: "TIMESTAMP"
+          mode: "NULLABLE"
+          default_value_expression: "CURRENT_TIMESTAMP()"
+
+engine:
+  runner:
+    type: stream  # Required when using streams config
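The streams block above drives both topic subscription and destination routing. As a rough, illustrative sketch (not part of the package), the topic-to-table mapping this file encodes can be read back with PyYAML; the file path is a placeholder:

# Illustrative only: pull the topic -> BigQuery table mapping out of a streams
# config shaped like the example above. Assumes PyYAML is available.
import yaml

with open("kafka_streams.example.yml") as f:
    config = yaml.safe_load(f)

topic_to_table = {
    stream["source"]["topic"]: stream["destination"]["table_id"]
    for stream in config.get("streams", [])
}
print(topic_to_table)
# {'cdc.public.users': 'your-gcp-project.your_dataset.users',
#  'cdc.public.orders': 'your-gcp-project.your_dataset.orders'}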
bizon/connectors/sources/kafka/src/config.py

@@ -1,5 +1,6 @@
+from collections.abc import Mapping
 from enum import Enum
-from typing import Any, List, Literal,
+from typing import Any, List, Literal, Optional

 from pydantic import BaseModel, Field

@@ -17,7 +18,6 @@ class MessageEncoding(str, Enum):


 class KafkaAuthConfig(AuthConfig):
-
     type: Literal[AuthType.BASIC] = AuthType.BASIC  # username and password authentication

     # Schema registry authentication
@@ -45,16 +45,20 @@ class TopicConfig(BaseModel):


 class KafkaSourceConfig(SourceConfig):
-
-
-
+    # Kafka configuration
+    topics: Optional[List[TopicConfig]] = Field(
+        default=[],
+        description="Kafka topics. Can be empty if using streams configuration to define topics.",
+    )
     bootstrap_servers: str = Field(..., description="Kafka bootstrap servers")
     group_id: str = Field(default="bizon", description="Kafka group id")

     skip_message_empty_value: bool = Field(
         default=True, description="Skip messages with empty value (tombstone messages)"
     )
-
+    skip_message_invalid_keys: bool = Field(
+        default=False, description="Skip messages with invalid keys (unparsable JSON keys)"
+    )
     # Kafka consumer configuration
     batch_size: int = Field(100, description="Kafka batch size, number of messages to fetch at once.")
     consumer_timeout: int = Field(10, description="Kafka consumer timeout in seconds, before returning batch.")
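The practical effect of the new topics field is that a Kafka source config now validates with no topics at all, leaving them to be filled in later from the streams block. A minimal sketch of that pydantic pattern, using hypothetical stand-in models rather than bizon's own classes:

# Minimal sketch of the Optional-with-default-list pattern used above.
# MiniKafkaSourceConfig and MiniTopicConfig are hypothetical stand-ins,
# not the package's KafkaSourceConfig / TopicConfig.
from typing import List, Optional

from pydantic import BaseModel, Field


class MiniTopicConfig(BaseModel):
    name: str
    destination_id: str


class MiniKafkaSourceConfig(BaseModel):
    topics: Optional[List[MiniTopicConfig]] = Field(default=[])
    bootstrap_servers: str
    skip_message_invalid_keys: bool = False


# Validates even though no topics are given; they can be injected later
# from a streams configuration.
cfg = MiniKafkaSourceConfig(bootstrap_servers="your-kafka-broker:9092")
print(cfg.topics)  # []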
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import io
|
|
2
2
|
import struct
|
|
3
|
-
from functools import
|
|
3
|
+
from functools import cache
|
|
4
4
|
from typing import Tuple, Union
|
|
5
5
|
|
|
6
6
|
import fastavro
|
|
@@ -20,7 +20,7 @@ class Hashabledict(dict):
|
|
|
20
20
|
return hash(frozenset(self.items()))
|
|
21
21
|
|
|
22
22
|
|
|
23
|
-
@
|
|
23
|
+
@cache
|
|
24
24
|
def parse_global_id_from_serialized_message(message: bytes) -> Tuple[int, int]:
|
|
25
25
|
"""
|
|
26
26
|
Parse the global id from the serialized message.
|
|
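functools.cache memoizes on hashable arguments, which is presumably why decode.py pairs the Hashabledict helper (a dict hashed via a frozenset of its items) with its cached functions. A standalone sketch of that combination, independent of the connector:

# Standalone illustration of pairing functools.cache with a hashable dict
# subclass, as decode.py does. Not the package's code, just the pattern.
from functools import cache


class Hashabledict(dict):
    def __hash__(self):
        return hash(frozenset(self.items()))


@cache
def count_fields(schema: Hashabledict) -> int:
    # Runs once per distinct schema; repeat calls hit the cache.
    return len(schema)


s = Hashabledict({"type": "record", "name": "users"})
print(count_fields(s), count_fields(s))  # computed once, then served from cache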
bizon/connectors/sources/kafka/src/source.py

@@ -1,7 +1,8 @@
 import traceback
+from collections.abc import Mapping
 from datetime import datetime
-from functools import
-from typing import Any, List,
+from functools import cache
+from typing import Any, List, Tuple

 import orjson
 from avro.schema import Schema, parse
@@ -59,13 +60,25 @@ class TopicOffsets(BaseModel):
         return sum([partition.last for partition in self.partitions.values()])


-
+def on_error(err: KafkaError):
+    # Fires for client-level errors (incl. DNS resolve failures)
+    if err.fatal():
+        logger.error(f"Kafka client error: {err} | fatal={err.fatal()} retriable={err.retriable()}")
+        raise KafkaException(err)
+    else:
+        logger.warning(f"Kafka client error: {err} | fatal={err.fatal()} retriable={err.retriable()}")
+

+class KafkaSource(AbstractSource):
     def __init__(self, config: KafkaSourceConfig):
         super().__init__(config)

         self.config: KafkaSourceConfig = config

+        # Ensure topics is always a list (not None)
+        if self.config.topics is None:
+            self.config.topics = []
+
         # Kafka consumer configuration.
         if self.config.authentication.type == AuthType.BASIC:
             self.config.consumer_config["sasl.mechanisms"] = "PLAIN"
@@ -76,12 +89,58 @@ class KafkaSource(AbstractSource):
         self.config.consumer_config["group.id"] = self.config.group_id
         self.config.consumer_config["bootstrap.servers"] = self.config.bootstrap_servers

+        # Set the error callback
+        self.config.consumer_config["error_cb"] = on_error
+
         # Consumer instance
         self.consumer = Consumer(self.config.consumer_config)

         # Map topic_name to destination_id
         self.topic_map = {topic.name: topic.destination_id for topic in self.config.topics}

+    def set_streams_config(self, streams: list) -> None:
+        """Configure Kafka topics from streams config.
+
+        This method enriches self.config.topics from the streams configuration,
+        ensuring that subsequent source instantiations (e.g., in init_job) have
+        access to the topics without duplication in the YAML config.
+
+        When a top-level 'streams' configuration is present, this method:
+        1. Extracts Kafka topics from streams (topic field)
+        2. Builds TopicConfig objects with destination_id from streams
+        3. Populates self.config.topics if empty (modifies bizon_config.source in-place)
+        4. Updates topic_map for record routing
+
+        Args:
+            streams: List of StreamConfig objects from BizonConfig.streams
+        """
+        from .config import TopicConfig
+
+        # Extract topics from streams
+        topics_from_streams = []
+        streams_map = {}
+
+        for stream in streams:
+            if hasattr(stream.source, "topic") and stream.source.topic:
+                topic_name = stream.source.topic
+                streams_map[topic_name] = stream
+
+                # Build TopicConfig from stream
+                topic_config = TopicConfig(name=topic_name, destination_id=stream.destination.table_id)
+                topics_from_streams.append(topic_config)
+
+        # Populate self.config.topics from streams (modifies bizon_config.source in-place)
+        # This ensures check_connection() and subsequent source instantiations have topics
+        if not self.config.topics and topics_from_streams:
+            self.config.topics = topics_from_streams
+            logger.info(f"Kafka: Populated {len(topics_from_streams)} topics from streams config")
+            for topic_config in topics_from_streams:
+                logger.info(f"  - Topic: {topic_config.name} -> {topic_config.destination_id}")
+
+        # Update topic_map with destination table_ids from streams
+        for topic, stream_config in streams_map.items():
+            self.topic_map[topic] = stream_config.destination.table_id
+
     @staticmethod
     def streams() -> List[str]:
         return ["topic"]
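The docstring above spells out how a top-level streams block feeds the Kafka source. A hedged usage sketch of the intended call order, with simple dataclasses standing in for bizon's StreamConfig objects (only the attributes the method reads are modeled):

# Rough usage sketch: how a runner might hand the streams block to the Kafka
# source before checking the connection. The dataclasses below are stand-ins,
# not bizon's real StreamConfig/BizonConfig types.
from dataclasses import dataclass


@dataclass
class FakeStreamSource:
    topic: str


@dataclass
class FakeStreamDestination:
    table_id: str


@dataclass
class FakeStream:
    name: str
    source: FakeStreamSource
    destination: FakeStreamDestination


streams = [
    FakeStream(
        name="users",
        source=FakeStreamSource(topic="cdc.public.users"),
        destination=FakeStreamDestination(table_id="your-gcp-project.your_dataset.users"),
    )
]

# source = KafkaSource(config)          # config may omit `topics` entirely
# source.set_streams_config(streams)    # populates config.topics and topic_map
# ok, err = source.check_connection()   # now validates against the derived topics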
@@ -102,31 +161,52 @@ class KafkaSource(AbstractSource):
     def check_connection(self) -> Tuple[bool | Any | None]:
         """Check the connection to the Kafka source"""

-
+        # Validate that topics have been configured
+        if not self.config.topics:
+            error_msg = (
+                "No topics configured. Either provide topics in source config or use streams configuration. "
+                "If using streams config, ensure set_streams_config() is called before check_connection()."
+            )
+            logger.error(error_msg)
+            return False, error_msg
+
+        try:
+            # Use a short timeout to avoid hanging on connection issues
+            cluster_metadata = self.consumer.list_topics(timeout=self.config.consumer_timeout)
+            topics = cluster_metadata.topics
+
+            logger.info(f"Found: {len(topics)} topics")

-
+            config_topics = [topic.name for topic in self.config.topics]

-
+            # Display consumer config
+            # We ignore the key sasl.password and sasl.username
+            consumer_config = self.config.consumer_config.copy()
+            consumer_config.pop("sasl.password", None)
+            consumer_config.pop("sasl.username", None)
+            logger.info(f"Consumer config: {consumer_config}")

-
-
-
-
-        consumer_config.pop("sasl.username", None)
-        logger.info(f"Consumer config: {consumer_config}")
+            for topic in config_topics:
+                if topic not in topics:
+                    logger.error(f"Topic {topic} not found, available topics: {topics.keys()}")
+                    return False, f"Topic {topic} not found"

-
-            if topic not in topics:
-                logger.error(f"Topic {topic} not found, available topics: {topics.keys()}")
-                return False, f"Topic {topic} not found"
+                logger.info(f"Topic {topic} has {len(topics[topic].partitions)} partitions")

-
+            return True, None

-
+        except KafkaException as e:
+            error_msg = f"Kafka connection failed: {e}"
+            logger.error(error_msg)
+            return False, error_msg
+        except Exception as e:
+            error_msg = f"Connection check failed: {e}"
+            logger.error(error_msg)
+            return False, error_msg

     def get_number_of_partitions(self, topic: str) -> int:
         """Get the number of partitions for the topic"""
-        return len(self.consumer.list_topics().topics[topic].partitions)
+        return len(self.consumer.list_topics(timeout=self.config.consumer_timeout).topics[topic].partitions)

     def get_offset_partitions(self, topic: str) -> TopicOffsets:
         """Get the offsets for each partition of the topic"""
@@ -134,7 +214,9 @@ class KafkaSource(AbstractSource):
         partitions: Mapping[int, OffsetPartition] = {}

         for i in range(self.get_number_of_partitions(topic)):
-            offsets = self.consumer.get_watermark_offsets(
+            offsets = self.consumer.get_watermark_offsets(
+                TopicPartition(topic, i), timeout=self.config.consumer_timeout
+            )
             partitions[i] = OffsetPartition(first=offsets[0], last=offsets[1])

         return TopicOffsets(name=topic, partitions=partitions)
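get_watermark_offsets returns a (low, high) pair per partition, stored here as OffsetPartition(first=..., last=...); the TopicOffsets hunk earlier sums the last offsets for its record-count estimate. A small arithmetic sketch of that estimate with made-up numbers:

# Made-up watermark pairs for a 3-partition topic: (low, high) per partition.
watermarks = {0: (0, 1_200), 1: (0, 950), 2: (100, 2_050)}

# Mirrors TopicOffsets.total_offset in the diff: sum of the "last" (high) offsets.
total_offset = sum(high for _, high in watermarks.values())
print(total_offset)  # 4200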
@@ -147,7 +229,7 @@ class KafkaSource(AbstractSource):
             total_records += self.get_offset_partitions(topic).total_offset
         return total_records

-    @
+    @cache
     def get_schema_from_registry(self, global_id: int) -> Tuple[Hashabledict, Schema]:
         """Get the schema from the registry, return a hashable dict and an avro schema object"""

@@ -194,10 +276,8 @@ class KafkaSource(AbstractSource):
             hashable_dict_schema, avro_schema = self.get_schema_from_registry(global_id=global_id)
         except SchemaNotFound as e:
             logger.error(
-                (
-
-                    f"message value: {message.value()}."
-                )
+                f"Message on topic {message.topic()} partition {message.partition()} at offset {message.offset()} has a SchemaID of {global_id} which is not found in Registry."
+                f"message value: {message.value()}."
             )
             logger.error(traceback.format_exc())
             raise e
@@ -235,35 +315,46 @@ class KafkaSource(AbstractSource):
         records = []

         for message in encoded_messages:
+            MESSAGE_LOG_METADATA = (
+                f"Message for topic {message.topic()} partition {message.partition()} and offset {message.offset()}"
+            )

             if message.error():
                 # If the message is too large, we skip it and update the offset
                 if message.error().code() == KafkaError.MSG_SIZE_TOO_LARGE:
                     logger.error(
-
-
-                        f"Raised MSG_SIZE_TOO_LARGE, if manually setting the offset, the message might not exist. Double-check in Confluent Cloud."
-                    )
+                        f"{MESSAGE_LOG_METADATA} is too large. "
+                        "Raised MSG_SIZE_TOO_LARGE, if manually setting the offset, the message might not exist. Double-check in Confluent Cloud."
                     )

-                logger.error(
-                    (
-                        f"Error while consuming message for topic {message.topic()} partition {message.partition()} and offset {message.offset()}: "
-                        f"{message.error()}"
-                    )
-                )
+                logger.error(f"{MESSAGE_LOG_METADATA}: {message.error()}")
                 raise KafkaException(message.error())

             # We skip tombstone messages
             if self.config.skip_message_empty_value and not message.value():
-                logger.debug(
-                    f"Message for topic {message.topic()} partition {message.partition()} and offset {message.offset()} is empty, skipping."
-                )
+                logger.debug(f"{MESSAGE_LOG_METADATA} is empty, skipping.")
                 continue

+            # Parse message keys
+            if message.key():
+                try:
+                    message_keys = orjson.loads(message.key().decode("utf-8"))
+                except orjson.JSONDecodeError as e:
+                    # We skip messages with invalid keys
+                    if self.config.skip_message_invalid_keys:
+                        logger.warning(f"{MESSAGE_LOG_METADATA} has an invalid key={message.key()}, skipping.")
+                        # Skip the message
+                        continue
+
+                    logger.error(
+                        f"{MESSAGE_LOG_METADATA}: Error while parsing message key: {e}, raw key: {message.key()}"
+                    )
+                    raise e
+            else:
+                message_keys = {}
+
             # Decode the message
             try:
-
                 decoded_message, hashable_dict_schema = self.decode(message)

                 data = {
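The new key-parsing step treats Kafka message keys as JSON and, when skip_message_invalid_keys is enabled, drops records whose keys do not parse. A standalone sketch of that decision outside the consumer loop (orjson as in the connector):

# Standalone sketch of the key-handling decision added above: parse the key as
# JSON, and either skip or fail on unparsable keys depending on the flag.
import orjson

skip_message_invalid_keys = True

for raw_key in [b'{"id": 42}', b"not-json", None]:
    if not raw_key:
        message_keys = {}
    else:
        try:
            message_keys = orjson.loads(raw_key.decode("utf-8"))
        except orjson.JSONDecodeError:
            if skip_message_invalid_keys:
                print(f"skipping message with invalid key={raw_key!r}")
                continue
            raise
    print(message_keys)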
@@ -271,7 +362,7 @@ class KafkaSource(AbstractSource):
                     "offset": message.offset(),
                     "partition": message.partition(),
                     "timestamp": message.timestamp()[1],
-                    "keys":
+                    "keys": message_keys,
                     "headers": (
                         {key: value.decode("utf-8") for key, value in message.headers()} if message.headers() else {}
                     ),
@@ -290,17 +381,27 @@ class KafkaSource(AbstractSource):

             except Exception as e:
                 logger.error(
-
-
-                    f"with value: {message.value()} and key: {message.key()}"
-                )
+                    f"{MESSAGE_LOG_METADATA}: Error while decoding message: {e} "
+                    f"with value: {message.value()} and key: {message.key()}"
                 )
-
+
+                # Try to parse error message from the message value
                 try:
                     message_raw_text = message.value().decode("utf-8")
                     logger.error(f"Parsed Kafka value: {message_raw_text}")
                 except UnicodeDecodeError:
-                    logger.error("Message is not a valid UTF-8 string")
+                    logger.error("Message value is not a valid UTF-8 string")
+
+                # Try to parse error message from the message headers
+                if message.headers():
+                    try:
+                        headers_dict = {key: value.decode("utf-8") for key, value in message.headers()}
+                        logger.error(f"Parsed Kafka headers: {headers_dict}")
+                    except UnicodeDecodeError as header_error:
+                        logger.error(f"Some message headers are not valid UTF-8 strings: {header_error}")
+                        logger.error(f"Raw message headers: {list(message.headers())}")
+                else:
+                    logger.error("Message headers are None or empty")

                 logger.error(traceback.format_exc())
                 raise e
bizon/connectors/sources/notion/config/api_key.example.yml (new file)

@@ -0,0 +1,35 @@
+# Notion Source Configuration
+# This example shows how to configure the Notion source connector
+
+source:
+  name: notion
+  stream: pages  # Options: databases, data_sources, pages, blocks, users
+  authentication:
+    type: api_key
+    params:
+      token: secret_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx  # Your Notion integration token
+
+  # List of database IDs to fetch data from
+  # Find the ID in the database URL: notion.so/{workspace}/{database_id}?v=...
+  database_ids:
+    - "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
+    - "yyyyyyyy-yyyy-yyyy-yyyy-yyyyyyyyyyyy"
+
+  # List of specific page IDs to fetch (optional)
+  # Find the ID in the page URL: notion.so/{page_id}
+  page_ids:
+    - "zzzzzzzz-zzzz-zzzz-zzzz-zzzzzzzzzzzz"
+
+  # Whether to fetch nested blocks recursively (default: true)
+  # Only applies to blocks stream
+  fetch_blocks_recursively: true
+
+  # Number of results per API call (1-100, default: 100)
+  page_size: 100
+
+destination:
+  name: bigquery
+  config:
+    project_id: my-project
+    dataset_id: notion_data
+    # ... other destination config
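The comments above point at the database and page IDs embedded in Notion URLs. A small hypothetical helper, not part of the connector, showing one way to pull the 32-hex-character ID out of such a URL and hyphenate it:

# Illustrative helper, not part of the Notion connector: extract the 32-hex-char
# ID that Notion appends to database/page URLs and format it as a dashed UUID.
import re
import uuid


def extract_notion_id(url: str) -> str:
    match = re.search(r"[0-9a-f]{32}", url.replace("-", ""))
    if not match:
        raise ValueError(f"No Notion ID found in {url!r}")
    return str(uuid.UUID(match.group(0)))


print(extract_notion_id("https://www.notion.so/acme/Tasks-0123456789abcdef0123456789abcdef?v=1"))
# -> 01234567-89ab-cdef-0123-456789abcdef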