bizon 0.1.1__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bizon/alerting/alerts.py +0 -1
- bizon/common/models.py +184 -4
- bizon/connectors/destinations/bigquery/src/config.py +1 -1
- bizon/connectors/destinations/bigquery/src/destination.py +14 -9
- bizon/connectors/destinations/bigquery_streaming/config/bigquery_streaming.example.yml +74 -0
- bizon/connectors/destinations/bigquery_streaming/src/config.py +6 -5
- bizon/connectors/destinations/bigquery_streaming/src/destination.py +13 -9
- bizon/connectors/destinations/bigquery_streaming_v2/config/bigquery_streaming_v2.example.yml +79 -0
- bizon/connectors/destinations/bigquery_streaming_v2/src/config.py +6 -1
- bizon/connectors/destinations/bigquery_streaming_v2/src/destination.py +232 -49
- bizon/connectors/destinations/bigquery_streaming_v2/src/proto_utils.py +1 -13
- bizon/connectors/destinations/file/config/file.example.yml +40 -0
- bizon/connectors/destinations/file/src/config.py +2 -1
- bizon/connectors/destinations/file/src/destination.py +3 -6
- bizon/connectors/destinations/logger/config/logger.example.yml +30 -0
- bizon/connectors/destinations/logger/src/config.py +1 -2
- bizon/connectors/destinations/logger/src/destination.py +4 -2
- bizon/connectors/sources/cycle/src/source.py +2 -6
- bizon/connectors/sources/dummy/src/source.py +0 -4
- bizon/connectors/sources/gsheets/src/source.py +2 -3
- bizon/connectors/sources/hubspot/src/hubspot_base.py +0 -1
- bizon/connectors/sources/hubspot/src/hubspot_objects.py +3 -4
- bizon/connectors/sources/hubspot/src/models/hs_object.py +0 -1
- bizon/connectors/sources/kafka/config/kafka.example.yml +1 -3
- bizon/connectors/sources/kafka/config/kafka_debezium.example.yml +1 -3
- bizon/connectors/sources/kafka/config/kafka_streams.example.yml +124 -0
- bizon/connectors/sources/kafka/src/config.py +10 -12
- bizon/connectors/sources/kafka/src/decode.py +65 -60
- bizon/connectors/sources/kafka/src/source.py +182 -61
- bizon/connectors/sources/kafka/tests/kafka_pipeline.py +1 -1
- bizon/connectors/sources/notion/config/api_key.example.yml +35 -0
- bizon/connectors/sources/notion/src/__init__.py +0 -0
- bizon/connectors/sources/notion/src/config.py +59 -0
- bizon/connectors/sources/notion/src/source.py +1159 -0
- bizon/connectors/sources/notion/tests/notion_pipeline.py +7 -0
- bizon/connectors/sources/notion/tests/test_notion.py +113 -0
- bizon/connectors/sources/periscope/src/source.py +0 -6
- bizon/connectors/sources/pokeapi/src/source.py +0 -1
- bizon/connectors/sources/sana_ai/config/sana.example.yml +25 -0
- bizon/connectors/sources/sana_ai/src/source.py +85 -0
- bizon/destination/buffer.py +0 -1
- bizon/destination/config.py +9 -1
- bizon/destination/destination.py +38 -9
- bizon/engine/backend/adapters/sqlalchemy/backend.py +2 -5
- bizon/engine/backend/adapters/sqlalchemy/config.py +0 -1
- bizon/engine/config.py +0 -1
- bizon/engine/engine.py +0 -1
- bizon/engine/pipeline/consumer.py +0 -1
- bizon/engine/pipeline/producer.py +1 -5
- bizon/engine/queue/adapters/kafka/config.py +1 -1
- bizon/engine/queue/adapters/kafka/queue.py +0 -1
- bizon/engine/queue/adapters/python_queue/consumer.py +0 -1
- bizon/engine/queue/adapters/python_queue/queue.py +0 -2
- bizon/engine/queue/adapters/rabbitmq/consumer.py +0 -1
- bizon/engine/queue/adapters/rabbitmq/queue.py +0 -1
- bizon/engine/queue/config.py +0 -2
- bizon/engine/runner/adapters/process.py +0 -2
- bizon/engine/runner/adapters/streaming.py +114 -42
- bizon/engine/runner/adapters/thread.py +0 -2
- bizon/engine/runner/config.py +0 -1
- bizon/engine/runner/runner.py +14 -9
- bizon/monitoring/config.py +12 -2
- bizon/monitoring/datadog/monitor.py +100 -14
- bizon/monitoring/monitor.py +41 -12
- bizon/monitoring/noop/monitor.py +22 -3
- bizon/source/auth/authenticators/abstract_oauth.py +11 -3
- bizon/source/auth/authenticators/abstract_token.py +2 -1
- bizon/source/auth/authenticators/basic.py +1 -1
- bizon/source/auth/authenticators/cookies.py +2 -1
- bizon/source/auth/authenticators/oauth.py +8 -3
- bizon/source/config.py +0 -2
- bizon/source/cursor.py +8 -16
- bizon/source/discover.py +3 -6
- bizon/source/models.py +0 -1
- bizon/source/session.py +0 -1
- bizon/source/source.py +18 -3
- bizon/transform/config.py +0 -2
- bizon/transform/transform.py +0 -3
- {bizon-0.1.1.dist-info → bizon-0.2.0.dist-info}/METADATA +62 -41
- bizon-0.2.0.dist-info/RECORD +136 -0
- {bizon-0.1.1.dist-info → bizon-0.2.0.dist-info}/WHEEL +1 -1
- bizon-0.2.0.dist-info/entry_points.txt +2 -0
- bizon-0.1.1.dist-info/RECORD +0 -123
- bizon-0.1.1.dist-info/entry_points.txt +0 -3
- {bizon-0.1.1.dist-info → bizon-0.2.0.dist-info/licenses}/LICENSE +0 -0
bizon/connectors/sources/kafka/src/source.py

@@ -1,7 +1,8 @@
 import traceback
+from collections.abc import Mapping
 from datetime import datetime
-from functools import
-from typing import Any, List,
+from functools import cache
+from typing import Any, List, Tuple
 
 import orjson
 from avro.schema import Schema, parse
@@ -12,6 +13,7 @@ from confluent_kafka import (
     Message,
     TopicPartition,
 )
+from confluent_kafka.cimpl import KafkaException as CimplKafkaException
 from loguru import logger
 from pydantic import BaseModel
 from pytz import UTC
@@ -27,12 +29,13 @@ from .config import KafkaSourceConfig, MessageEncoding, SchemaRegistryType
 from .decode import (
     Hashabledict,
     decode_avro_message,
-    get_header_bytes,
     parse_global_id_from_serialized_message,
 )
 
 
 class SchemaNotFound(Exception):
+    """Schema not found in the Schema Registry"""
+
     pass
 
 
@@ -57,13 +60,25 @@ class TopicOffsets(BaseModel):
         return sum([partition.last for partition in self.partitions.values()])
 
 
-
+def on_error(err: KafkaError):
+    # Fires for client-level errors (incl. DNS resolve failures)
+    if err.fatal():
+        logger.error(f"Kafka client error: {err} | fatal={err.fatal()} retriable={err.retriable()}")
+        raise KafkaException(err)
+    else:
+        logger.warning(f"Kafka client error: {err} | fatal={err.fatal()} retriable={err.retriable()}")
+
 
+class KafkaSource(AbstractSource):
     def __init__(self, config: KafkaSourceConfig):
         super().__init__(config)
 
         self.config: KafkaSourceConfig = config
 
+        # Ensure topics is always a list (not None)
+        if self.config.topics is None:
+            self.config.topics = []
+
         # Kafka consumer configuration.
         if self.config.authentication.type == AuthType.BASIC:
             self.config.consumer_config["sasl.mechanisms"] = "PLAIN"
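The `on_error` callback added above is registered further down as the consumer's client-level error callback (`consumer_config["error_cb"]`), so fatal librdkafka errors (for example unreachable brokers) abort the run while retriable ones are only logged. A minimal sketch of the same wiring with a standalone confluent-kafka consumer, assuming a placeholder broker and group id:

```python
# Minimal sketch (not bizon code): wiring a client-level error callback into a
# confluent-kafka Consumer, mirroring the `error_cb` setup in the diff.
from confluent_kafka import Consumer, KafkaError, KafkaException


def on_error(err: KafkaError) -> None:
    # Called by librdkafka for client-level errors (e.g. all brokers down,
    # DNS resolution failures). Fatal errors abort the run, transient ones
    # are only reported so that polling can retry.
    if err.fatal():
        raise KafkaException(err)
    print(f"non-fatal kafka error: {err} (retriable={err.retriable()})")


consumer = Consumer(
    {
        "bootstrap.servers": "localhost:9092",  # placeholder
        "group.id": "bizon-example",            # placeholder
        "error_cb": on_error,
    }
)
consumer.poll(timeout=1.0)  # callbacks fire from poll()
consumer.close()
```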
@@ -74,11 +89,58 @@ class KafkaSource(AbstractSource):
         self.config.consumer_config["group.id"] = self.config.group_id
         self.config.consumer_config["bootstrap.servers"] = self.config.bootstrap_servers
 
+        # Set the error callback
+        self.config.consumer_config["error_cb"] = on_error
+
         # Consumer instance
-        self.consumer = Consumer(self.config.consumer_config
+        self.consumer = Consumer(self.config.consumer_config)
 
+        # Map topic_name to destination_id
         self.topic_map = {topic.name: topic.destination_id for topic in self.config.topics}
 
+    def set_streams_config(self, streams: list) -> None:
+        """Configure Kafka topics from streams config.
+
+        This method enriches self.config.topics from the streams configuration,
+        ensuring that subsequent source instantiations (e.g., in init_job) have
+        access to the topics without duplication in the YAML config.
+
+        When a top-level 'streams' configuration is present, this method:
+        1. Extracts Kafka topics from streams (topic field)
+        2. Builds TopicConfig objects with destination_id from streams
+        3. Populates self.config.topics if empty (modifies bizon_config.source in-place)
+        4. Updates topic_map for record routing
+
+        Args:
+            streams: List of StreamConfig objects from BizonConfig.streams
+        """
+        from .config import TopicConfig
+
+        # Extract topics from streams
+        topics_from_streams = []
+        streams_map = {}
+
+        for stream in streams:
+            if hasattr(stream.source, "topic") and stream.source.topic:
+                topic_name = stream.source.topic
+                streams_map[topic_name] = stream
+
+                # Build TopicConfig from stream
+                topic_config = TopicConfig(name=topic_name, destination_id=stream.destination.table_id)
+                topics_from_streams.append(topic_config)
+
+        # Populate self.config.topics from streams (modifies bizon_config.source in-place)
+        # This ensures check_connection() and subsequent source instantiations have topics
+        if not self.config.topics and topics_from_streams:
+            self.config.topics = topics_from_streams
+            logger.info(f"Kafka: Populated {len(topics_from_streams)} topics from streams config")
+            for topic_config in topics_from_streams:
+                logger.info(f"  - Topic: {topic_config.name} -> {topic_config.destination_id}")
+
+        # Update topic_map with destination table_ids from streams
+        for topic, stream_config in streams_map.items():
+            self.topic_map[topic] = stream_config.destination.table_id
+
     @staticmethod
     def streams() -> List[str]:
         return ["topic"]
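`set_streams_config` only relies on two attributes of each stream object, `stream.source.topic` and `stream.destination.table_id`, to build `TopicConfig` entries and the `topic_map` routing table. A toy sketch of the resulting mapping, using plain dataclasses in place of bizon's StreamConfig/TopicConfig models:

```python
# Toy sketch of the topic -> destination mapping built by set_streams_config().
# SimpleStream/SimpleSource/SimpleDestination stand in for bizon's models.
from dataclasses import dataclass


@dataclass
class SimpleSource:
    topic: str


@dataclass
class SimpleDestination:
    table_id: str


@dataclass
class SimpleStream:
    source: SimpleSource
    destination: SimpleDestination


streams = [
    SimpleStream(SimpleSource("orders.v1"), SimpleDestination("analytics.orders")),
    SimpleStream(SimpleSource("users.v1"), SimpleDestination("analytics.users")),
]

# Same routing table the Kafka source keeps in self.topic_map
topic_map = {s.source.topic: s.destination.table_id for s in streams}
assert topic_map["orders.v1"] == "analytics.orders"
print(topic_map)
```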
@@ -99,24 +161,52 @@ class KafkaSource(AbstractSource):
     def check_connection(self) -> Tuple[bool | Any | None]:
         """Check the connection to the Kafka source"""
 
-
+        # Validate that topics have been configured
+        if not self.config.topics:
+            error_msg = (
+                "No topics configured. Either provide topics in source config or use streams configuration. "
+                "If using streams config, ensure set_streams_config() is called before check_connection()."
+            )
+            logger.error(error_msg)
+            return False, error_msg
+
+        try:
+            # Use a short timeout to avoid hanging on connection issues
+            cluster_metadata = self.consumer.list_topics(timeout=self.config.consumer_timeout)
+            topics = cluster_metadata.topics
+
+            logger.info(f"Found: {len(topics)} topics")
+
+            config_topics = [topic.name for topic in self.config.topics]
 
-
+            # Display consumer config
+            # We ignore the key sasl.password and sasl.username
+            consumer_config = self.config.consumer_config.copy()
+            consumer_config.pop("sasl.password", None)
+            consumer_config.pop("sasl.username", None)
+            logger.info(f"Consumer config: {consumer_config}")
 
-
+            for topic in config_topics:
+                if topic not in topics:
+                    logger.error(f"Topic {topic} not found, available topics: {topics.keys()}")
+                    return False, f"Topic {topic} not found"
 
-
-            if topic not in topics:
-                logger.error(f"Topic {topic} not found, available topics: {topics.keys()}")
-                return False, f"Topic {topic} not found"
+                logger.info(f"Topic {topic} has {len(topics[topic].partitions)} partitions")
 
-
+            return True, None
 
-
+        except KafkaException as e:
+            error_msg = f"Kafka connection failed: {e}"
+            logger.error(error_msg)
+            return False, error_msg
+        except Exception as e:
+            error_msg = f"Connection check failed: {e}"
+            logger.error(error_msg)
+            return False, error_msg
 
     def get_number_of_partitions(self, topic: str) -> int:
         """Get the number of partitions for the topic"""
-        return len(self.consumer.list_topics().topics[topic].partitions)
+        return len(self.consumer.list_topics(timeout=self.config.consumer_timeout).topics[topic].partitions)
 
     def get_offset_partitions(self, topic: str) -> TopicOffsets:
         """Get the offsets for each partition of the topic"""
@@ -124,7 +214,9 @@ class KafkaSource(AbstractSource):
         partitions: Mapping[int, OffsetPartition] = {}
 
         for i in range(self.get_number_of_partitions(topic)):
-            offsets = self.consumer.get_watermark_offsets(
+            offsets = self.consumer.get_watermark_offsets(
+                TopicPartition(topic, i), timeout=self.config.consumer_timeout
+            )
             partitions[i] = OffsetPartition(first=offsets[0], last=offsets[1])
 
         return TopicOffsets(name=topic, partitions=partitions)
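Partition offsets are read through `get_watermark_offsets`, which returns the low and high watermark of a single partition; summing the high watermarks is what `TopicOffsets.total_offset` uses as a rough record count. A sketch of the same probe outside the source class, assuming a placeholder broker and topic:

```python
# Sketch (placeholder broker/topic): estimating a topic's size from watermark
# offsets, one partition at a time, with explicit timeouts as in the diff.
from confluent_kafka import Consumer, TopicPartition

consumer = Consumer({"bootstrap.servers": "localhost:9092", "group.id": "offsets-probe"})

topic = "orders.v1"  # placeholder
metadata = consumer.list_topics(topic, timeout=5.0)

total = 0
for partition_id in metadata.topics[topic].partitions:
    low, high = consumer.get_watermark_offsets(TopicPartition(topic, partition_id), timeout=5.0)
    total += high  # high watermark is the next offset to be written

print(f"approx. records ever written to {topic}: {total}")
consumer.close()
```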
@@ -137,7 +229,7 @@ class KafkaSource(AbstractSource):
             total_records += self.get_offset_partitions(topic).total_offset
         return total_records
 
-    @
+    @cache
     def get_schema_from_registry(self, global_id: int) -> Tuple[Hashabledict, Schema]:
         """Get the schema from the registry, return a hashable dict and an avro schema object"""
 
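`get_schema_from_registry` is now memoized with `functools.cache`, so each distinct schema id costs at most one registry round trip per source instance. A minimal sketch of the pattern with a stubbed fetch function standing in for the registry client:

```python
# Minimal sketch of the memoization pattern: functools.cache keyed on the
# schema id avoids repeated registry lookups. fetch_schema() is a stub here.
from functools import cache

CALLS = {"n": 0}


@cache
def get_schema(global_id: int) -> str:
    CALLS["n"] += 1
    return fetch_schema(global_id)


def fetch_schema(global_id: int) -> str:
    # Stand-in for an HTTP call to a schema registry
    return f'{{"type": "record", "name": "schema_{global_id}"}}'


get_schema(42)
get_schema(42)  # served from the cache, no second fetch
assert CALLS["n"] == 1
```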
@@ -174,40 +266,40 @@ class KafkaSource(AbstractSource):
         else:
             raise ValueError(f"Schema registry type {self.config.authentication.schema_registry_type} not supported")
 
-    def decode_avro(self, message: Message) -> dict:
-
-
-
-        )
-        global_id = parse_global_id_from_serialized_message(
-            nb_bytes_schema_id=self.config.nb_bytes_schema_id,
-            header_message_bytes=header_message_bytes,
+    def decode_avro(self, message: Message) -> Tuple[dict, dict]:
+        """Decode the message as avro and return the parsed message and the schema"""
+        global_id, nb_bytes_schema_id = parse_global_id_from_serialized_message(
+            message=message.value(),
         )
 
         try:
             hashable_dict_schema, avro_schema = self.get_schema_from_registry(global_id=global_id)
         except SchemaNotFound as e:
             logger.error(
-                (
-
-                    f"message value: {message.value()}."
-                )
+                f"Message on topic {message.topic()} partition {message.partition()} at offset {message.offset()} has a SchemaID of {global_id} which is not found in Registry."
+                f"message value: {message.value()}."
             )
             logger.error(traceback.format_exc())
             raise e
 
-        return
-
-
-
-
+        return (
+            decode_avro_message(
+                message_value=message.value(),
+                nb_bytes_schema_id=nb_bytes_schema_id,
+                avro_schema=avro_schema,
+            ),
+            hashable_dict_schema,
         )
 
-    def decode_utf_8(self, message: Message):
+    def decode_utf_8(self, message: Message) -> Tuple[dict, dict]:
+        """Decode the message as utf-8 and return the parsed message and the schema"""
         # Decode the message as utf-8
-        return orjson.loads(message.value().decode("utf-8"))
+        return orjson.loads(message.value().decode("utf-8")), {}
 
-    def decode(self, message):
+    def decode(self, message) -> Tuple[dict, dict]:
+        """Decode the message based on the encoding type
+        Returns parsed message and the schema
+        """
         if self.config.message_encoding == MessageEncoding.AVRO:
             return self.decode_avro(message)
 
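`decode_avro` first extracts the schema id from the raw bytes (the helper now also returns how many bytes the id occupied), then resolves the writer schema and decodes the payload. For Confluent-framed messages the wire format is one magic byte followed by a 4-byte big-endian schema id; the sketch below only illustrates the idea behind `parse_global_id_from_serialized_message`, it is not bizon's implementation (other registries use a different id width, hence the configurable byte count):

```python
# Sketch of parsing the Confluent wire-format header: 1 magic byte (0x00)
# followed by a 4-byte big-endian schema id, then the Avro-encoded payload.
import struct


def parse_schema_id(message: bytes) -> tuple[int, bytes]:
    """Return (schema_id, avro_payload) for a Confluent-framed message."""
    if len(message) < 5 or message[0] != 0:
        raise ValueError("not a Confluent-framed message")
    (schema_id,) = struct.unpack(">I", message[1:5])
    return schema_id, message[5:]


framed = b"\x00" + struct.pack(">I", 123) + b"avro-payload-bytes"
schema_id, payload = parse_schema_id(framed)
assert schema_id == 123
```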
@@ -218,50 +310,64 @@ class KafkaSource(AbstractSource):
         raise ValueError(f"Message encoding {self.config.message_encoding} not supported")
 
     def parse_encoded_messages(self, encoded_messages: list) -> List[SourceRecord]:
+        """Parse the encoded Kafka messages and return a list of SourceRecord"""
 
         records = []
 
         for message in encoded_messages:
+            MESSAGE_LOG_METADATA = (
+                f"Message for topic {message.topic()} partition {message.partition()} and offset {message.offset()}"
+            )
 
             if message.error():
                 # If the message is too large, we skip it and update the offset
                 if message.error().code() == KafkaError.MSG_SIZE_TOO_LARGE:
-                    logger.
-
-
-                        f"Raised MSG_SIZE_TOO_LARGE, we suppose the message does not exist. Double-check in Confluent Cloud."
-                    )
+                    logger.error(
+                        f"{MESSAGE_LOG_METADATA} is too large. "
+                        "Raised MSG_SIZE_TOO_LARGE, if manually setting the offset, the message might not exist. Double-check in Confluent Cloud."
                     )
-                    continue
 
-                logger.error(
-                    (
-                        f"Error while consuming message for topic {message.topic()} partition {message.partition()} and offset {message.offset()}: "
-                        f"{message.error()}"
-                    )
-                )
+                logger.error(f"{MESSAGE_LOG_METADATA}: {message.error()}")
                 raise KafkaException(message.error())
 
             # We skip tombstone messages
             if self.config.skip_message_empty_value and not message.value():
-                logger.debug(
-                    f"Message for topic {message.topic()} partition {message.partition()} and offset {message.offset()} is empty, skipping."
-                )
+                logger.debug(f"{MESSAGE_LOG_METADATA} is empty, skipping.")
                 continue
 
+            # Parse message keys
+            if message.key():
+                try:
+                    message_keys = orjson.loads(message.key().decode("utf-8"))
+                except orjson.JSONDecodeError as e:
+                    # We skip messages with invalid keys
+                    if self.config.skip_message_invalid_keys:
+                        logger.warning(f"{MESSAGE_LOG_METADATA} has an invalid key={message.key()}, skipping.")
+                        # Skip the message
+                        continue
+
+                    logger.error(
+                        f"{MESSAGE_LOG_METADATA}: Error while parsing message key: {e}, raw key: {message.key()}"
+                    )
+                    raise e
+            else:
+                message_keys = {}
+
             # Decode the message
             try:
+                decoded_message, hashable_dict_schema = self.decode(message)
 
                 data = {
                     "topic": message.topic(),
                     "offset": message.offset(),
                    "partition": message.partition(),
                     "timestamp": message.timestamp()[1],
-                    "keys":
+                    "keys": message_keys,
                     "headers": (
                         {key: value.decode("utf-8") for key, value in message.headers()} if message.headers() else {}
                     ),
-                    "value":
+                    "value": decoded_message,
+                    "schema": hashable_dict_schema,
                 }
 
                 records.append(
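Message keys are now parsed with `orjson`, absent keys become `{}`, and invalid keys either skip the record or re-raise depending on `skip_message_invalid_keys`. The same decision rule in isolation, as a small sketch rather than bizon's consumer loop:

```python
# Isolated sketch of the key-handling rule used in parse_encoded_messages():
# JSON keys are parsed, missing keys become {}, invalid keys either drop the
# record or raise depending on a flag. Not bizon code.
from typing import Optional

import orjson


def parse_key(raw_key: Optional[bytes], skip_invalid: bool) -> Optional[dict]:
    """Return the parsed key, {} when absent, or None when the record is skipped."""
    if not raw_key:
        return {}
    try:
        return orjson.loads(raw_key.decode("utf-8"))
    except orjson.JSONDecodeError:
        if skip_invalid:
            return None  # caller drops the record
        raise


assert parse_key(None, skip_invalid=False) == {}
assert parse_key(b'{"id": 1}', skip_invalid=False) == {"id": 1}
assert parse_key(b"not-json", skip_invalid=True) is None
```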
@@ -275,17 +381,27 @@ class KafkaSource(AbstractSource):
 
             except Exception as e:
                 logger.error(
-
-
-                    f"with value: {message.value()} and key: {message.key()}"
-                )
+                    f"{MESSAGE_LOG_METADATA}: Error while decoding message: {e} "
+                    f"with value: {message.value()} and key: {message.key()}"
                 )
-
+
+                # Try to parse error message from the message value
                 try:
                     message_raw_text = message.value().decode("utf-8")
                     logger.error(f"Parsed Kafka value: {message_raw_text}")
                 except UnicodeDecodeError:
-                    logger.error("Message is not a valid UTF-8 string")
+                    logger.error("Message value is not a valid UTF-8 string")
+
+                # Try to parse error message from the message headers
+                if message.headers():
+                    try:
+                        headers_dict = {key: value.decode("utf-8") for key, value in message.headers()}
+                        logger.error(f"Parsed Kafka headers: {headers_dict}")
+                    except UnicodeDecodeError as header_error:
+                        logger.error(f"Some message headers are not valid UTF-8 strings: {header_error}")
+                        logger.error(f"Raw message headers: {list(message.headers())}")
+                else:
+                    logger.error("Message headers are None or empty")
 
                 logger.error(traceback.format_exc())
                 raise e
@@ -358,4 +474,9 @@ class KafkaSource(AbstractSource):
         return self.read_topics_manually(pagination)
 
     def commit(self):
-
+        """Commit the offsets of the consumer"""
+        try:
+            self.consumer.commit(asynchronous=False)
+        except CimplKafkaException as e:
+            logger.error(f"Kafka exception occurred during commit: {e}")
+            logger.info("Gracefully exiting without committing offsets due to Kafka exception")
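`commit` now commits synchronously and catches the client-level `KafkaException` so a failed commit logs and exits gracefully instead of crashing the pipeline. A sketch of that pattern with a placeholder broker and group id:

```python
# Sketch (placeholder broker/group): synchronous offset commit that logs and
# continues instead of crashing when the client raises a KafkaException.
from confluent_kafka import Consumer, KafkaException

consumer = Consumer({"bootstrap.servers": "localhost:9092", "group.id": "bizon-example"})

try:
    # asynchronous=False blocks until the broker acknowledges the commit
    consumer.commit(asynchronous=False)
except KafkaException as exc:
    print(f"commit failed, leaving offsets untouched: {exc}")
finally:
    consumer.close()
```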
bizon/connectors/sources/kafka/tests/kafka_pipeline.py

@@ -3,5 +3,5 @@ import os
 from bizon.engine.engine import RunnerFactory
 
 if __name__ == "__main__":
-    runner = RunnerFactory.create_from_yaml(filepath=os.path.abspath("test-pipeline-streaming.yml"))
+    runner = RunnerFactory.create_from_yaml(filepath=os.path.abspath("test-pipeline-streaming-v2.yml"))
     runner.run()
bizon/connectors/sources/notion/config/api_key.example.yml (new file)

@@ -0,0 +1,35 @@
+# Notion Source Configuration
+# This example shows how to configure the Notion source connector
+
+source:
+  name: notion
+  stream: pages  # Options: databases, data_sources, pages, blocks, users
+  authentication:
+    type: api_key
+    params:
+      token: secret_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx  # Your Notion integration token
+
+  # List of database IDs to fetch data from
+  # Find the ID in the database URL: notion.so/{workspace}/{database_id}?v=...
+  database_ids:
+    - "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
+    - "yyyyyyyy-yyyy-yyyy-yyyy-yyyyyyyyyyyy"
+
+  # List of specific page IDs to fetch (optional)
+  # Find the ID in the page URL: notion.so/{page_id}
+  page_ids:
+    - "zzzzzzzz-zzzz-zzzz-zzzz-zzzzzzzzzzzz"
+
+  # Whether to fetch nested blocks recursively (default: true)
+  # Only applies to blocks stream
+  fetch_blocks_recursively: true
+
+  # Number of results per API call (1-100, default: 100)
+  page_size: 100
+
+destination:
+  name: bigquery
+  config:
+    project_id: my-project
+    dataset_id: notion_data
+    # ... other destination config
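A pipeline described by a YAML file like the example above is typically launched through `RunnerFactory.create_from_yaml`, the same entry point the updated Kafka test uses; the file name below is a placeholder:

```python
# Sketch: running a pipeline from a YAML config such as the Notion example
# above, via the RunnerFactory entry point shown in the Kafka test diff.
import os

from bizon.engine.engine import RunnerFactory

if __name__ == "__main__":
    runner = RunnerFactory.create_from_yaml(filepath=os.path.abspath("notion-pipeline.yml"))  # placeholder path
    runner.run()
```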
bizon/connectors/sources/notion/src/__init__.py: File without changes
bizon/connectors/sources/notion/src/config.py (new file)

@@ -0,0 +1,59 @@
+from enum import Enum
+from typing import Any, Dict, List
+
+from pydantic import Field
+
+from bizon.source.config import SourceConfig
+
+
+class NotionStreams(str, Enum):
+    DATABASES = "databases"
+    DATA_SOURCES = "data_sources"
+    PAGES = "pages"
+    BLOCKS = "blocks"
+    BLOCKS_MARKDOWN = "blocks_markdown"
+    USERS = "users"
+    # Streams that fetch all accessible content (no database_ids/page_ids required)
+    ALL_PAGES = "all_pages"
+    ALL_DATABASES = "all_databases"
+    ALL_DATA_SOURCES = "all_data_sources"
+    ALL_BLOCKS_MARKDOWN = "all_blocks_markdown"
+
+
+class NotionSourceConfig(SourceConfig):
+    stream: NotionStreams
+
+    database_ids: List[str] = Field(
+        default_factory=list,
+        description="List of Notion database IDs to fetch. Required for databases, data_sources, pages, and blocks streams.",
+    )
+    page_ids: List[str] = Field(
+        default_factory=list,
+        description="List of Notion page IDs to fetch. Used for pages and blocks streams.",
+    )
+    fetch_blocks_recursively: bool = Field(
+        default=True,
+        description="Whether to fetch nested blocks recursively. Only applies to blocks stream.",
+    )
+    max_recursion_depth: int = Field(
+        default=5,
+        ge=1,
+        le=100,
+        description="Maximum nesting depth for recursive block fetching. Prevents infinite loops.",
+    )
+    page_size: int = Field(
+        default=100,
+        ge=1,
+        le=100,
+        description="Number of results per page (max 100)",
+    )
+    max_workers: int = Field(
+        default=3,
+        ge=1,
+        le=10,
+        description="Number of concurrent workers for fetching blocks. Keep low to respect rate limits.",
+    )
+    database_filters: Dict[str, Any] = Field(
+        default_factory=dict,
+        description="Map of database_id -> Notion filter object. Filters are passed directly to Notion API.",
+    )