bizon 0.1.1__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff shows the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only and reflects the changes between those versions.
Files changed (85)
  1. bizon/alerting/alerts.py +0 -1
  2. bizon/common/models.py +184 -4
  3. bizon/connectors/destinations/bigquery/src/config.py +1 -1
  4. bizon/connectors/destinations/bigquery/src/destination.py +14 -9
  5. bizon/connectors/destinations/bigquery_streaming/config/bigquery_streaming.example.yml +74 -0
  6. bizon/connectors/destinations/bigquery_streaming/src/config.py +6 -5
  7. bizon/connectors/destinations/bigquery_streaming/src/destination.py +13 -9
  8. bizon/connectors/destinations/bigquery_streaming_v2/config/bigquery_streaming_v2.example.yml +79 -0
  9. bizon/connectors/destinations/bigquery_streaming_v2/src/config.py +6 -1
  10. bizon/connectors/destinations/bigquery_streaming_v2/src/destination.py +232 -49
  11. bizon/connectors/destinations/bigquery_streaming_v2/src/proto_utils.py +1 -13
  12. bizon/connectors/destinations/file/config/file.example.yml +40 -0
  13. bizon/connectors/destinations/file/src/config.py +2 -1
  14. bizon/connectors/destinations/file/src/destination.py +3 -6
  15. bizon/connectors/destinations/logger/config/logger.example.yml +30 -0
  16. bizon/connectors/destinations/logger/src/config.py +1 -2
  17. bizon/connectors/destinations/logger/src/destination.py +4 -2
  18. bizon/connectors/sources/cycle/src/source.py +2 -6
  19. bizon/connectors/sources/dummy/src/source.py +0 -4
  20. bizon/connectors/sources/gsheets/src/source.py +2 -3
  21. bizon/connectors/sources/hubspot/src/hubspot_base.py +0 -1
  22. bizon/connectors/sources/hubspot/src/hubspot_objects.py +3 -4
  23. bizon/connectors/sources/hubspot/src/models/hs_object.py +0 -1
  24. bizon/connectors/sources/kafka/config/kafka.example.yml +1 -3
  25. bizon/connectors/sources/kafka/config/kafka_debezium.example.yml +1 -3
  26. bizon/connectors/sources/kafka/config/kafka_streams.example.yml +124 -0
  27. bizon/connectors/sources/kafka/src/config.py +10 -12
  28. bizon/connectors/sources/kafka/src/decode.py +65 -60
  29. bizon/connectors/sources/kafka/src/source.py +182 -61
  30. bizon/connectors/sources/kafka/tests/kafka_pipeline.py +1 -1
  31. bizon/connectors/sources/notion/config/api_key.example.yml +35 -0
  32. bizon/connectors/sources/notion/src/__init__.py +0 -0
  33. bizon/connectors/sources/notion/src/config.py +59 -0
  34. bizon/connectors/sources/notion/src/source.py +1159 -0
  35. bizon/connectors/sources/notion/tests/notion_pipeline.py +7 -0
  36. bizon/connectors/sources/notion/tests/test_notion.py +113 -0
  37. bizon/connectors/sources/periscope/src/source.py +0 -6
  38. bizon/connectors/sources/pokeapi/src/source.py +0 -1
  39. bizon/connectors/sources/sana_ai/config/sana.example.yml +25 -0
  40. bizon/connectors/sources/sana_ai/src/source.py +85 -0
  41. bizon/destination/buffer.py +0 -1
  42. bizon/destination/config.py +9 -1
  43. bizon/destination/destination.py +38 -9
  44. bizon/engine/backend/adapters/sqlalchemy/backend.py +2 -5
  45. bizon/engine/backend/adapters/sqlalchemy/config.py +0 -1
  46. bizon/engine/config.py +0 -1
  47. bizon/engine/engine.py +0 -1
  48. bizon/engine/pipeline/consumer.py +0 -1
  49. bizon/engine/pipeline/producer.py +1 -5
  50. bizon/engine/queue/adapters/kafka/config.py +1 -1
  51. bizon/engine/queue/adapters/kafka/queue.py +0 -1
  52. bizon/engine/queue/adapters/python_queue/consumer.py +0 -1
  53. bizon/engine/queue/adapters/python_queue/queue.py +0 -2
  54. bizon/engine/queue/adapters/rabbitmq/consumer.py +0 -1
  55. bizon/engine/queue/adapters/rabbitmq/queue.py +0 -1
  56. bizon/engine/queue/config.py +0 -2
  57. bizon/engine/runner/adapters/process.py +0 -2
  58. bizon/engine/runner/adapters/streaming.py +114 -42
  59. bizon/engine/runner/adapters/thread.py +0 -2
  60. bizon/engine/runner/config.py +0 -1
  61. bizon/engine/runner/runner.py +14 -9
  62. bizon/monitoring/config.py +12 -2
  63. bizon/monitoring/datadog/monitor.py +100 -14
  64. bizon/monitoring/monitor.py +41 -12
  65. bizon/monitoring/noop/monitor.py +22 -3
  66. bizon/source/auth/authenticators/abstract_oauth.py +11 -3
  67. bizon/source/auth/authenticators/abstract_token.py +2 -1
  68. bizon/source/auth/authenticators/basic.py +1 -1
  69. bizon/source/auth/authenticators/cookies.py +2 -1
  70. bizon/source/auth/authenticators/oauth.py +8 -3
  71. bizon/source/config.py +0 -2
  72. bizon/source/cursor.py +8 -16
  73. bizon/source/discover.py +3 -6
  74. bizon/source/models.py +0 -1
  75. bizon/source/session.py +0 -1
  76. bizon/source/source.py +18 -3
  77. bizon/transform/config.py +0 -2
  78. bizon/transform/transform.py +0 -3
  79. {bizon-0.1.1.dist-info → bizon-0.2.0.dist-info}/METADATA +62 -41
  80. bizon-0.2.0.dist-info/RECORD +136 -0
  81. {bizon-0.1.1.dist-info → bizon-0.2.0.dist-info}/WHEEL +1 -1
  82. bizon-0.2.0.dist-info/entry_points.txt +2 -0
  83. bizon-0.1.1.dist-info/RECORD +0 -123
  84. bizon-0.1.1.dist-info/entry_points.txt +0 -3
  85. {bizon-0.1.1.dist-info → bizon-0.2.0.dist-info/licenses}/LICENSE +0 -0
bizon/connectors/sources/kafka/src/source.py
@@ -1,7 +1,8 @@
 import traceback
+from collections.abc import Mapping
 from datetime import datetime
-from functools import lru_cache
-from typing import Any, List, Mapping, Tuple
+from functools import cache
+from typing import Any, List, Tuple

 import orjson
 from avro.schema import Schema, parse
@@ -12,6 +13,7 @@ from confluent_kafka import (
     Message,
     TopicPartition,
 )
+from confluent_kafka.cimpl import KafkaException as CimplKafkaException
 from loguru import logger
 from pydantic import BaseModel
 from pytz import UTC
@@ -27,12 +29,13 @@ from .config import KafkaSourceConfig, MessageEncoding, SchemaRegistryType
 from .decode import (
     Hashabledict,
     decode_avro_message,
-    get_header_bytes,
     parse_global_id_from_serialized_message,
 )


 class SchemaNotFound(Exception):
+    """Schema not found in the Schema Registry"""
+
     pass


@@ -57,13 +60,25 @@ class TopicOffsets(BaseModel):
         return sum([partition.last for partition in self.partitions.values()])


-class KafkaSource(AbstractSource):
+def on_error(err: KafkaError):
+    # Fires for client-level errors (incl. DNS resolve failures)
+    if err.fatal():
+        logger.error(f"Kafka client error: {err} | fatal={err.fatal()} retriable={err.retriable()}")
+        raise KafkaException(err)
+    else:
+        logger.warning(f"Kafka client error: {err} | fatal={err.fatal()} retriable={err.retriable()}")
+

+class KafkaSource(AbstractSource):
     def __init__(self, config: KafkaSourceConfig):
         super().__init__(config)

         self.config: KafkaSourceConfig = config

+        # Ensure topics is always a list (not None)
+        if self.config.topics is None:
+            self.config.topics = []
+
         # Kafka consumer configuration.
         if self.config.authentication.type == AuthType.BASIC:
             self.config.consumer_config["sasl.mechanisms"] = "PLAIN"
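The error_cb hook registered in the next hunk is a standard librdkafka/confluent-kafka configuration property. A minimal, self-contained sketch of the pattern follows; the broker address and group id are placeholders, and the callback only mirrors the on_error logic added above rather than reproducing bizon's code:

from confluent_kafka import Consumer, KafkaError, KafkaException


def on_client_error(err: KafkaError) -> None:
    # error_cb receives client-level errors (DNS failures, all brokers down, ...).
    # Fatal errors abort the run; retriable ones are only logged.
    if err.fatal():
        raise KafkaException(err)
    print(f"non-fatal Kafka client error: {err} (retriable={err.retriable()})")


consumer = Consumer(
    {
        "bootstrap.servers": "localhost:9092",  # placeholder broker
        "group.id": "demo-group",               # placeholder group
        "error_cb": on_client_error,
    }
)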
@@ -74,11 +89,58 @@ class KafkaSource(AbstractSource):
         self.config.consumer_config["group.id"] = self.config.group_id
         self.config.consumer_config["bootstrap.servers"] = self.config.bootstrap_servers

+        # Set the error callback
+        self.config.consumer_config["error_cb"] = on_error
+
         # Consumer instance
-        self.consumer = Consumer(self.config.consumer_config, logger=logger)
+        self.consumer = Consumer(self.config.consumer_config)

+        # Map topic_name to destination_id
         self.topic_map = {topic.name: topic.destination_id for topic in self.config.topics}

+    def set_streams_config(self, streams: list) -> None:
+        """Configure Kafka topics from streams config.
+
+        This method enriches self.config.topics from the streams configuration,
+        ensuring that subsequent source instantiations (e.g., in init_job) have
+        access to the topics without duplication in the YAML config.
+
+        When a top-level 'streams' configuration is present, this method:
+        1. Extracts Kafka topics from streams (topic field)
+        2. Builds TopicConfig objects with destination_id from streams
+        3. Populates self.config.topics if empty (modifies bizon_config.source in-place)
+        4. Updates topic_map for record routing
+
+        Args:
+            streams: List of StreamConfig objects from BizonConfig.streams
+        """
+        from .config import TopicConfig
+
+        # Extract topics from streams
+        topics_from_streams = []
+        streams_map = {}
+
+        for stream in streams:
+            if hasattr(stream.source, "topic") and stream.source.topic:
+                topic_name = stream.source.topic
+                streams_map[topic_name] = stream
+
+                # Build TopicConfig from stream
+                topic_config = TopicConfig(name=topic_name, destination_id=stream.destination.table_id)
+                topics_from_streams.append(topic_config)
+
+        # Populate self.config.topics from streams (modifies bizon_config.source in-place)
+        # This ensures check_connection() and subsequent source instantiations have topics
+        if not self.config.topics and topics_from_streams:
+            self.config.topics = topics_from_streams
+            logger.info(f"Kafka: Populated {len(topics_from_streams)} topics from streams config")
+            for topic_config in topics_from_streams:
+                logger.info(f" - Topic: {topic_config.name} -> {topic_config.destination_id}")
+
+        # Update topic_map with destination table_ids from streams
+        for topic, stream_config in streams_map.items():
+            self.topic_map[topic] = stream_config.destination.table_id
+
     @staticmethod
     def streams() -> List[str]:
         return ["topic"]
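A rough illustration of the shape set_streams_config expects. The SimpleNamespace objects below are hypothetical stand-ins for the StreamConfig entries that BizonConfig.streams would provide, each exposing source.topic and destination.table_id:

from types import SimpleNamespace

# Hypothetical stand-ins for StreamConfig objects (not bizon classes).
streams = [
    SimpleNamespace(
        source=SimpleNamespace(topic="orders.v1"),
        destination=SimpleNamespace(table_id="raw_orders"),
    ),
    SimpleNamespace(
        source=SimpleNamespace(topic="customers.v1"),
        destination=SimpleNamespace(table_id="raw_customers"),
    ),
]

# source = KafkaSource(config)        # config.topics may still be empty here
# source.set_streams_config(streams)  # populates config.topics and topic_map
# source.topic_map -> {"orders.v1": "raw_orders", "customers.v1": "raw_customers"}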
@@ -99,24 +161,52 @@ class KafkaSource(AbstractSource):
     def check_connection(self) -> Tuple[bool | Any | None]:
         """Check the connection to the Kafka source"""

-        logger.info(f"Found: {len(self.consumer.list_topics().topics)} topics")
+        # Validate that topics have been configured
+        if not self.config.topics:
+            error_msg = (
+                "No topics configured. Either provide topics in source config or use streams configuration. "
+                "If using streams config, ensure set_streams_config() is called before check_connection()."
+            )
+            logger.error(error_msg)
+            return False, error_msg
+
+        try:
+            # Use a short timeout to avoid hanging on connection issues
+            cluster_metadata = self.consumer.list_topics(timeout=self.config.consumer_timeout)
+            topics = cluster_metadata.topics
+
+            logger.info(f"Found: {len(topics)} topics")
+
+            config_topics = [topic.name for topic in self.config.topics]

-        topics = self.consumer.list_topics().topics
+            # Display consumer config
+            # We ignore the key sasl.password and sasl.username
+            consumer_config = self.config.consumer_config.copy()
+            consumer_config.pop("sasl.password", None)
+            consumer_config.pop("sasl.username", None)
+            logger.info(f"Consumer config: {consumer_config}")

-        config_topics = [topic.name for topic in self.config.topics]
+            for topic in config_topics:
+                if topic not in topics:
+                    logger.error(f"Topic {topic} not found, available topics: {topics.keys()}")
+                    return False, f"Topic {topic} not found"

-        for topic in config_topics:
-            if topic not in topics:
-                logger.error(f"Topic {topic} not found, available topics: {topics.keys()}")
-                return False, f"Topic {topic} not found"
+                logger.info(f"Topic {topic} has {len(topics[topic].partitions)} partitions")

-            logger.info(f"Topic {topic} has {len(topics[topic].partitions)} partitions")
+            return True, None

-        return True, None
+        except KafkaException as e:
+            error_msg = f"Kafka connection failed: {e}"
+            logger.error(error_msg)
+            return False, error_msg
+        except Exception as e:
+            error_msg = f"Connection check failed: {e}"
+            logger.error(error_msg)
+            return False, error_msg

     def get_number_of_partitions(self, topic: str) -> int:
         """Get the number of partitions for the topic"""
-        return len(self.consumer.list_topics().topics[topic].partitions)
+        return len(self.consumer.list_topics(timeout=self.config.consumer_timeout).topics[topic].partitions)

     def get_offset_partitions(self, topic: str) -> TopicOffsets:
         """Get the offsets for each partition of the topic"""
@@ -124,7 +214,9 @@ class KafkaSource(AbstractSource):
         partitions: Mapping[int, OffsetPartition] = {}

         for i in range(self.get_number_of_partitions(topic)):
-            offsets = self.consumer.get_watermark_offsets(TopicPartition(topic, i))
+            offsets = self.consumer.get_watermark_offsets(
+                TopicPartition(topic, i), timeout=self.config.consumer_timeout
+            )
             partitions[i] = OffsetPartition(first=offsets[0], last=offsets[1])

         return TopicOffsets(name=topic, partitions=partitions)
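For reference, a standalone sketch of the two confluent_kafka metadata calls these hunks now invoke with explicit timeouts, so an unreachable cluster fails fast instead of hanging. Broker address and topic name are placeholders:

from confluent_kafka import Consumer, TopicPartition

consumer = Consumer({"bootstrap.servers": "localhost:9092", "group.id": "probe"})

# list_topics(timeout=...) raises KafkaException if the cluster does not answer in time.
metadata = consumer.list_topics(timeout=10)
for name, topic_metadata in metadata.topics.items():
    print(name, len(topic_metadata.partitions), "partitions")

# get_watermark_offsets returns a (low, high) tuple for one partition, which is what
# OffsetPartition(first=..., last=...) stores above.
low, high = consumer.get_watermark_offsets(TopicPartition("my-topic", 0), timeout=10)
print(f"partition 0: offsets {low}..{high}")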
@@ -137,7 +229,7 @@ class KafkaSource(AbstractSource):
             total_records += self.get_offset_partitions(topic).total_offset
         return total_records

-    @lru_cache(maxsize=None)
+    @cache
     def get_schema_from_registry(self, global_id: int) -> Tuple[Hashabledict, Schema]:
         """Get the schema from the registry, return a hashable dict and an avro schema object"""

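On the decorator swap above: functools.cache (Python 3.9+) is equivalent to lru_cache(maxsize=None), an unbounded cache keyed on the call arguments, which must be hashable. That is why the registry schema travels as a Hashabledict, and since get_schema_from_registry is a method, the instance itself is part of the cache key. A tiny illustration:

from functools import cache


@cache  # same behaviour as @lru_cache(maxsize=None): unbounded, keyed on hashable args
def fetch_schema(global_id: int) -> str:
    print(f"cache miss for schema {global_id}")
    return f"schema-{global_id}"


fetch_schema(42)  # prints the cache-miss line
fetch_schema(42)  # answered from the cache, nothing is printed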
@@ -174,40 +266,40 @@ class KafkaSource(AbstractSource):
         else:
             raise ValueError(f"Schema registry type {self.config.authentication.schema_registry_type} not supported")

-    def decode_avro(self, message: Message) -> dict:
-        # Get the header bytes and the global id from the message
-        header_message_bytes = get_header_bytes(
-            nb_bytes_schema_id=self.config.nb_bytes_schema_id, message=message.value()
-        )
-        global_id = parse_global_id_from_serialized_message(
-            nb_bytes_schema_id=self.config.nb_bytes_schema_id,
-            header_message_bytes=header_message_bytes,
+    def decode_avro(self, message: Message) -> Tuple[dict, dict]:
+        """Decode the message as avro and return the parsed message and the schema"""
+        global_id, nb_bytes_schema_id = parse_global_id_from_serialized_message(
+            message=message.value(),
         )

         try:
             hashable_dict_schema, avro_schema = self.get_schema_from_registry(global_id=global_id)
         except SchemaNotFound as e:
             logger.error(
-                (
-                    f"Message on topic {message.topic()} partition {message.partition()} at offset {message.offset()} has a SchemaID of {global_id} which is not found in Registry."
-                    f"message value: {message.value()}."
-                )
+                f"Message on topic {message.topic()} partition {message.partition()} at offset {message.offset()} has a SchemaID of {global_id} which is not found in Registry."
+                f"message value: {message.value()}."
             )
             logger.error(traceback.format_exc())
             raise e

-        return decode_avro_message(
-            message=message,
-            nb_bytes_schema_id=self.config.nb_bytes_schema_id,
-            hashable_dict_schema=hashable_dict_schema,
-            avro_schema=avro_schema,
+        return (
+            decode_avro_message(
+                message_value=message.value(),
+                nb_bytes_schema_id=nb_bytes_schema_id,
+                avro_schema=avro_schema,
+            ),
+            hashable_dict_schema,
         )

-    def decode_utf_8(self, message: Message):
+    def decode_utf_8(self, message: Message) -> Tuple[dict, dict]:
+        """Decode the message as utf-8 and return the parsed message and the schema"""
         # Decode the message as utf-8
-        return orjson.loads(message.value().decode("utf-8"))
+        return orjson.loads(message.value().decode("utf-8")), {}

-    def decode(self, message):
+    def decode(self, message) -> Tuple[dict, dict]:
+        """Decode the message based on the encoding type
+        Returns parsed message and the schema
+        """
         if self.config.message_encoding == MessageEncoding.AVRO:
             return self.decode_avro(message)

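The reworked decode_avro delegates header parsing to parse_global_id_from_serialized_message, which now also reports how many header bytes it consumed. The sketch below is not bizon's decode module; it only shows the common Confluent wire format such a parser has to handle (one 0x00 magic byte plus a 4-byte big-endian schema id before the Avro payload); other registries use wider ids, hence the returned byte count:

import struct


def parse_confluent_schema_id(message_value: bytes) -> tuple[int, int]:
    # Hypothetical helper, not bizon's implementation.
    if message_value[0] != 0:
        raise ValueError("not a Confluent-framed message")
    (schema_id,) = struct.unpack(">I", message_value[1:5])  # 4-byte big-endian id
    return schema_id, 5  # 5 header bytes consumed in total


framed = b"\x00" + struct.pack(">I", 1234) + b"<avro body>"
print(parse_confluent_schema_id(framed))  # (1234, 5)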
@@ -218,50 +310,64 @@ class KafkaSource(AbstractSource):
             raise ValueError(f"Message encoding {self.config.message_encoding} not supported")

     def parse_encoded_messages(self, encoded_messages: list) -> List[SourceRecord]:
+        """Parse the encoded Kafka messages and return a list of SourceRecord"""

         records = []

         for message in encoded_messages:
+            MESSAGE_LOG_METADATA = (
+                f"Message for topic {message.topic()} partition {message.partition()} and offset {message.offset()}"
+            )

             if message.error():
                 # If the message is too large, we skip it and update the offset
                 if message.error().code() == KafkaError.MSG_SIZE_TOO_LARGE:
-                    logger.warning(
-                        (
-                            f"Message for topic {message.topic()} partition {message.partition()} and offset {message.offset()} has been skipped. "
-                            f"Raised MSG_SIZE_TOO_LARGE, we suppose the message does not exist. Double-check in Confluent Cloud."
-                        )
+                    logger.error(
+                        f"{MESSAGE_LOG_METADATA} is too large. "
+                        "Raised MSG_SIZE_TOO_LARGE, if manually setting the offset, the message might not exist. Double-check in Confluent Cloud."
                     )
-                    continue

-                logger.error(
-                    (
-                        f"Error while consuming message for topic {message.topic()} partition {message.partition()} and offset {message.offset()}: "
-                        f"{message.error()}"
-                    )
-                )
+                logger.error(f"{MESSAGE_LOG_METADATA}: {message.error()}")
                 raise KafkaException(message.error())

             # We skip tombstone messages
             if self.config.skip_message_empty_value and not message.value():
-                logger.debug(
-                    f"Message for topic {message.topic()} partition {message.partition()} and offset {message.offset()} is empty, skipping."
-                )
+                logger.debug(f"{MESSAGE_LOG_METADATA} is empty, skipping.")
                 continue

+            # Parse message keys
+            if message.key():
+                try:
+                    message_keys = orjson.loads(message.key().decode("utf-8"))
+                except orjson.JSONDecodeError as e:
+                    # We skip messages with invalid keys
+                    if self.config.skip_message_invalid_keys:
+                        logger.warning(f"{MESSAGE_LOG_METADATA} has an invalid key={message.key()}, skipping.")
+                        # Skip the message
+                        continue
+
+                    logger.error(
+                        f"{MESSAGE_LOG_METADATA}: Error while parsing message key: {e}, raw key: {message.key()}"
+                    )
+                    raise e
+            else:
+                message_keys = {}
+
             # Decode the message
             try:
+                decoded_message, hashable_dict_schema = self.decode(message)

                 data = {
                     "topic": message.topic(),
                     "offset": message.offset(),
                     "partition": message.partition(),
                     "timestamp": message.timestamp()[1],
-                    "keys": orjson.loads(message.key().decode("utf-8")) if message.key() else {},
+                    "keys": message_keys,
                     "headers": (
                         {key: value.decode("utf-8") for key, value in message.headers()} if message.headers() else {}
                     ),
-                    "value": self.decode(message),
+                    "value": decoded_message,
+                    "schema": hashable_dict_schema,
                 }

                 records.append(
@@ -275,17 +381,27 @@ class KafkaSource(AbstractSource):

             except Exception as e:
                 logger.error(
-                    (
-                        f"Error while decoding message for topic {message.topic()} on partition {message.partition()}: {e} at offset {message.offset()} "
-                        f"with value: {message.value()} and key: {message.key()}"
-                    )
+                    f"{MESSAGE_LOG_METADATA}: Error while decoding message: {e} "
+                    f"with value: {message.value()} and key: {message.key()}"
                 )
-                # Try to parse error message from the message
+
+                # Try to parse error message from the message value
                 try:
                     message_raw_text = message.value().decode("utf-8")
                     logger.error(f"Parsed Kafka value: {message_raw_text}")
                 except UnicodeDecodeError:
-                    logger.error("Message is not a valid UTF-8 string")
+                    logger.error("Message value is not a valid UTF-8 string")
+
+                # Try to parse error message from the message headers
+                if message.headers():
+                    try:
+                        headers_dict = {key: value.decode("utf-8") for key, value in message.headers()}
+                        logger.error(f"Parsed Kafka headers: {headers_dict}")
+                    except UnicodeDecodeError as header_error:
+                        logger.error(f"Some message headers are not valid UTF-8 strings: {header_error}")
+                        logger.error(f"Raw message headers: {list(message.headers())}")
+                else:
+                    logger.error("Message headers are None or empty")

                 logger.error(traceback.format_exc())
                 raise e
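The key-handling branch added in parse_encoded_messages boils down to the following pattern (standalone sketch; skip_invalid plays the role of the skip_message_invalid_keys config flag):

from typing import Optional

import orjson


def parse_key(raw_key: Optional[bytes], skip_invalid: bool) -> Optional[dict]:
    """Return the parsed key, {} for key-less messages, or None to signal 'skip this record'."""
    if raw_key is None:
        return {}
    try:
        return orjson.loads(raw_key.decode("utf-8"))
    except orjson.JSONDecodeError:
        if skip_invalid:
            return None  # caller drops the record, mirroring the `continue` above
        raise


print(parse_key(b'{"id": 42}', skip_invalid=True))  # {'id': 42}
print(parse_key(None, skip_invalid=True))           # {} when the message has no key
print(parse_key(b"not-json", skip_invalid=True))    # None -> record is skipped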
@@ -358,4 +474,9 @@ class KafkaSource(AbstractSource):
         return self.read_topics_manually(pagination)

     def commit(self):
-        self.consumer.commit(asynchronous=False)
+        """Commit the offsets of the consumer"""
+        try:
+            self.consumer.commit(asynchronous=False)
+        except CimplKafkaException as e:
+            logger.error(f"Kafka exception occurred during commit: {e}")
+            logger.info("Gracefully exiting without committing offsets due to Kafka exception")
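Context on the commit change: confluent_kafka's commit() is fire-and-forget with asynchronous=True, while asynchronous=False blocks until the broker responds and raises KafkaException on failure, which is exactly the error the new try/except absorbs. A minimal sketch, with placeholder broker and group id:

from confluent_kafka import Consumer, KafkaException

consumer = Consumer(
    {"bootstrap.servers": "localhost:9092", "group.id": "demo", "enable.auto.commit": False}
)

try:
    # asynchronous=False waits for the broker's acknowledgement (or raises);
    # with nothing consumed yet this raises _NO_OFFSET, which is also caught here.
    consumer.commit(asynchronous=False)
except KafkaException as exc:
    print(f"commit failed, leaving offsets uncommitted: {exc}")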
bizon/connectors/sources/kafka/tests/kafka_pipeline.py
@@ -3,5 +3,5 @@ import os
 from bizon.engine.engine import RunnerFactory

 if __name__ == "__main__":
-    runner = RunnerFactory.create_from_yaml(filepath=os.path.abspath("test-pipeline-streaming.yml"))
+    runner = RunnerFactory.create_from_yaml(filepath=os.path.abspath("test-pipeline-streaming-v2.yml"))
     runner.run()
bizon/connectors/sources/notion/config/api_key.example.yml
@@ -0,0 +1,35 @@
+# Notion Source Configuration
+# This example shows how to configure the Notion source connector
+
+source:
+  name: notion
+  stream: pages  # Options: databases, data_sources, pages, blocks, users
+  authentication:
+    type: api_key
+    params:
+      token: secret_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx  # Your Notion integration token
+
+  # List of database IDs to fetch data from
+  # Find the ID in the database URL: notion.so/{workspace}/{database_id}?v=...
+  database_ids:
+    - "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
+    - "yyyyyyyy-yyyy-yyyy-yyyy-yyyyyyyyyyyy"
+
+  # List of specific page IDs to fetch (optional)
+  # Find the ID in the page URL: notion.so/{page_id}
+  page_ids:
+    - "zzzzzzzz-zzzz-zzzz-zzzz-zzzzzzzzzzzz"
+
+  # Whether to fetch nested blocks recursively (default: true)
+  # Only applies to blocks stream
+  fetch_blocks_recursively: true
+
+  # Number of results per API call (1-100, default: 100)
+  page_size: 100
+
+destination:
+  name: bigquery
+  config:
+    project_id: my-project
+    dataset_id: notion_data
+    # ... other destination config
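Assuming the example above is saved to a local YAML file, a pipeline would typically be launched the same way as the Kafka test pipeline shown earlier in this diff; the file name below is a placeholder:

import os

from bizon.engine.engine import RunnerFactory

if __name__ == "__main__":
    # "notion-pipeline.yml" stands in for the example config above.
    runner = RunnerFactory.create_from_yaml(filepath=os.path.abspath("notion-pipeline.yml"))
    runner.run()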
bizon/connectors/sources/notion/src/__init__.py (file without changes)
bizon/connectors/sources/notion/src/config.py
@@ -0,0 +1,59 @@
+from enum import Enum
+from typing import Any, Dict, List
+
+from pydantic import Field
+
+from bizon.source.config import SourceConfig
+
+
+class NotionStreams(str, Enum):
+    DATABASES = "databases"
+    DATA_SOURCES = "data_sources"
+    PAGES = "pages"
+    BLOCKS = "blocks"
+    BLOCKS_MARKDOWN = "blocks_markdown"
+    USERS = "users"
+    # Streams that fetch all accessible content (no database_ids/page_ids required)
+    ALL_PAGES = "all_pages"
+    ALL_DATABASES = "all_databases"
+    ALL_DATA_SOURCES = "all_data_sources"
+    ALL_BLOCKS_MARKDOWN = "all_blocks_markdown"
+
+
+class NotionSourceConfig(SourceConfig):
+    stream: NotionStreams
+
+    database_ids: List[str] = Field(
+        default_factory=list,
+        description="List of Notion database IDs to fetch. Required for databases, data_sources, pages, and blocks streams.",
+    )
+    page_ids: List[str] = Field(
+        default_factory=list,
+        description="List of Notion page IDs to fetch. Used for pages and blocks streams.",
+    )
+    fetch_blocks_recursively: bool = Field(
+        default=True,
+        description="Whether to fetch nested blocks recursively. Only applies to blocks stream.",
+    )
+    max_recursion_depth: int = Field(
+        default=5,
+        ge=1,
+        le=100,
+        description="Maximum nesting depth for recursive block fetching. Prevents infinite loops.",
+    )
+    page_size: int = Field(
+        default=100,
+        ge=1,
+        le=100,
+        description="Number of results per page (max 100)",
+    )
+    max_workers: int = Field(
+        default=3,
+        ge=1,
+        le=10,
+        description="Number of concurrent workers for fetching blocks. Keep low to respect rate limits.",
+    )
+    database_filters: Dict[str, Any] = Field(
+        default_factory=dict,
+        description="Map of database_id -> Notion filter object. Filters are passed directly to Notion API.",
+    )
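The ge/le bounds declared above are enforced by pydantic when the model is constructed. A stripped-down stand-in (not the real NotionSourceConfig, which also inherits required fields from SourceConfig) behaves like this, assuming pydantic v2:

from pydantic import BaseModel, Field, ValidationError


class PageSettings(BaseModel):
    # Simplified stand-in for two of the constrained fields above.
    page_size: int = Field(default=100, ge=1, le=100)
    max_workers: int = Field(default=3, ge=1, le=10)


print(PageSettings())              # page_size=100 max_workers=3
print(PageSettings(page_size=25))  # page_size=25 max_workers=3

try:
    PageSettings(page_size=500)    # violates le=100
except ValidationError as exc:
    print(exc.errors()[0]["type"])  # "less_than_equal" in pydantic v2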