bizon-0.1.2-py3-none-any.whl → bizon-0.2.0-py3-none-any.whl

This diff shows the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
Files changed (77)
  1. bizon/alerting/alerts.py +0 -1
  2. bizon/common/models.py +182 -4
  3. bizon/connectors/destinations/bigquery/src/config.py +0 -1
  4. bizon/connectors/destinations/bigquery/src/destination.py +11 -8
  5. bizon/connectors/destinations/bigquery_streaming/config/bigquery_streaming.example.yml +74 -0
  6. bizon/connectors/destinations/bigquery_streaming/src/destination.py +4 -5
  7. bizon/connectors/destinations/bigquery_streaming_v2/config/bigquery_streaming_v2.example.yml +79 -0
  8. bizon/connectors/destinations/bigquery_streaming_v2/src/destination.py +4 -6
  9. bizon/connectors/destinations/file/config/file.example.yml +40 -0
  10. bizon/connectors/destinations/file/src/config.py +1 -1
  11. bizon/connectors/destinations/file/src/destination.py +0 -5
  12. bizon/connectors/destinations/logger/config/logger.example.yml +30 -0
  13. bizon/connectors/destinations/logger/src/config.py +0 -2
  14. bizon/connectors/destinations/logger/src/destination.py +1 -2
  15. bizon/connectors/sources/cycle/src/source.py +2 -6
  16. bizon/connectors/sources/dummy/src/source.py +0 -4
  17. bizon/connectors/sources/gsheets/src/source.py +2 -3
  18. bizon/connectors/sources/hubspot/src/hubspot_base.py +0 -1
  19. bizon/connectors/sources/hubspot/src/hubspot_objects.py +3 -4
  20. bizon/connectors/sources/hubspot/src/models/hs_object.py +0 -1
  21. bizon/connectors/sources/kafka/config/kafka_streams.example.yml +124 -0
  22. bizon/connectors/sources/kafka/src/config.py +10 -6
  23. bizon/connectors/sources/kafka/src/decode.py +2 -2
  24. bizon/connectors/sources/kafka/src/source.py +147 -46
  25. bizon/connectors/sources/notion/config/api_key.example.yml +35 -0
  26. bizon/connectors/sources/notion/src/__init__.py +0 -0
  27. bizon/connectors/sources/notion/src/config.py +59 -0
  28. bizon/connectors/sources/notion/src/source.py +1159 -0
  29. bizon/connectors/sources/notion/tests/notion_pipeline.py +7 -0
  30. bizon/connectors/sources/notion/tests/test_notion.py +113 -0
  31. bizon/connectors/sources/periscope/src/source.py +0 -6
  32. bizon/connectors/sources/pokeapi/src/source.py +0 -1
  33. bizon/connectors/sources/sana_ai/config/sana.example.yml +25 -0
  34. bizon/connectors/sources/sana_ai/src/source.py +85 -0
  35. bizon/destination/buffer.py +0 -1
  36. bizon/destination/config.py +0 -1
  37. bizon/destination/destination.py +1 -4
  38. bizon/engine/backend/adapters/sqlalchemy/backend.py +2 -5
  39. bizon/engine/backend/adapters/sqlalchemy/config.py +0 -1
  40. bizon/engine/config.py +0 -1
  41. bizon/engine/engine.py +0 -1
  42. bizon/engine/pipeline/consumer.py +0 -1
  43. bizon/engine/pipeline/producer.py +1 -5
  44. bizon/engine/queue/adapters/kafka/config.py +1 -1
  45. bizon/engine/queue/adapters/kafka/queue.py +0 -1
  46. bizon/engine/queue/adapters/python_queue/consumer.py +0 -1
  47. bizon/engine/queue/adapters/python_queue/queue.py +0 -2
  48. bizon/engine/queue/adapters/rabbitmq/consumer.py +0 -1
  49. bizon/engine/queue/adapters/rabbitmq/queue.py +0 -1
  50. bizon/engine/queue/config.py +0 -2
  51. bizon/engine/runner/adapters/process.py +0 -2
  52. bizon/engine/runner/adapters/streaming.py +55 -1
  53. bizon/engine/runner/adapters/thread.py +0 -2
  54. bizon/engine/runner/config.py +0 -1
  55. bizon/engine/runner/runner.py +0 -2
  56. bizon/monitoring/datadog/monitor.py +5 -3
  57. bizon/monitoring/noop/monitor.py +1 -1
  58. bizon/source/auth/authenticators/abstract_oauth.py +11 -3
  59. bizon/source/auth/authenticators/abstract_token.py +2 -1
  60. bizon/source/auth/authenticators/basic.py +1 -1
  61. bizon/source/auth/authenticators/cookies.py +2 -1
  62. bizon/source/auth/authenticators/oauth.py +8 -3
  63. bizon/source/config.py +0 -2
  64. bizon/source/cursor.py +8 -16
  65. bizon/source/discover.py +3 -6
  66. bizon/source/models.py +0 -1
  67. bizon/source/session.py +0 -1
  68. bizon/source/source.py +17 -2
  69. bizon/transform/config.py +0 -2
  70. bizon/transform/transform.py +0 -3
  71. {bizon-0.1.2.dist-info → bizon-0.2.0.dist-info}/METADATA +62 -42
  72. bizon-0.2.0.dist-info/RECORD +136 -0
  73. {bizon-0.1.2.dist-info → bizon-0.2.0.dist-info}/WHEEL +1 -1
  74. bizon-0.2.0.dist-info/entry_points.txt +2 -0
  75. bizon-0.1.2.dist-info/RECORD +0 -123
  76. bizon-0.1.2.dist-info/entry_points.txt +0 -3
  77. {bizon-0.1.2.dist-info → bizon-0.2.0.dist-info/licenses}/LICENSE +0 -0
bizon/connectors/sources/cycle/src/source.py
@@ -60,9 +60,7 @@ class CycleSource(AbstractSource):
  cursor: "PAGINATION_CURSOR"
  direction: AFTER
  }
- """.replace(
- "PAGINATION_CURSOR", pagination.get("endCursor")
- )
+ """.replace("PAGINATION_CURSOR", pagination.get("endCursor"))

  return pagination_str

@@ -99,9 +97,7 @@ class CycleSource(AbstractSource):
  }
  }
  }
- """.replace(
- "PAGINATION_STRING", pagination_str
- )
+ """.replace("PAGINATION_STRING", pagination_str)

  variables = {"slug": self.config.slug}

bizon/connectors/sources/dummy/src/source.py
@@ -1,4 +1,3 @@
- import random
  from typing import List, Literal, Tuple, Union

  from pydantic import Field
@@ -28,7 +27,6 @@ class DummySourceConfig(SourceConfig):


  class DummySource(AbstractSource):
-
  def __init__(self, config: DummySourceConfig):
  super().__init__(config)
  self.config = config
@@ -46,7 +44,6 @@ class DummySource(AbstractSource):
  return f"https://api.dummy.com/v1/{self.config.stream}"

  def get_authenticator(self) -> AuthBase:
-
  if self.config.authentication.type == AuthType.OAUTH:
  return AuthBuilder.oauth2(
  params=Oauth2AuthParams(
@@ -69,7 +66,6 @@ class DummySource(AbstractSource):
  return 5

  def get(self, pagination: dict = None) -> SourceIteration:
-
  response: dict = None

  # If no pagination data is passed, we want to reach first page
bizon/connectors/sources/gsheets/src/source.py
@@ -1,6 +1,7 @@
  import json
  import re
- from typing import Any, Counter, List, Tuple, Type
+ from collections import Counter
+ from typing import Any, List, Tuple
  from uuid import uuid4

  import google.auth
@@ -34,7 +35,6 @@ class GsheetsSourceConfig(SourceConfig):


  class GsheetsSource(AbstractSource):
-
  def __init__(self, config: GsheetsSourceConfig):
  super().__init__(config)
  self.config: GsheetsSourceConfig = config
@@ -49,7 +49,6 @@ class GsheetsSource(AbstractSource):
  return GsheetsSourceConfig

  def get_gspread_client(self) -> gspread.client.Client:
-
  if self.config.service_account_key:
  # use creds to create a client to interact with the Google Drive API
  credentials_dict = json.loads(self.config.service_account_key)
bizon/connectors/sources/hubspot/src/hubspot_base.py
@@ -19,7 +19,6 @@ URL_TOKEN_REFRESH = f"{URL_BASE}/oauth/v1/token"


  class HubSpotBaseSource(AbstractSource, ABC):
-
  def get_session(self) -> Session:
  """Apply custom strategy for HubSpot"""
  session = Session()
bizon/connectors/sources/hubspot/src/hubspot_objects.py
@@ -1,6 +1,7 @@
  import json
+ from collections.abc import Generator
  from enum import Enum
- from typing import Any, Generator, List, Optional
+ from typing import List, Optional

  from loguru import logger
  from pydantic import BaseModel, Field
@@ -31,7 +32,6 @@ class HubSpotSourceConfig(SourceConfig):


  class HubSpotObjectsSource(HubSpotBaseSource):
-
  api_version = "v3"

  object_path = f"crm/{api_version}/objects"
@@ -75,7 +75,6 @@ class HubSpotObjectsSource(HubSpotBaseSource):
  payload: Optional[dict] = None,
  headers=None,
  ) -> Generator[dict, None, None]:
-
  # Call HubSpot API
  response = self.session.call(
  method=method,
@@ -164,7 +163,7 @@ class HubSpotObjectsSource(HubSpotBaseSource):
  payload={"filterGroups": [{"filters": [{"operator": "HAS_PROPERTY", "propertyName": "hs_object_id"}]}]},
  )
  total = search_response["total"]
- logger.info(f"Number of {self.object} in HubSpot: {'{:,}'.format(total).replace(',', ' ')}")
+ logger.info(f"Number of {self.object} in HubSpot: {f'{total:,}'.replace(',', ' ')}")
  return total

  def list_properties(self) -> AllObjectProperties:
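Note on the logger.info change above: it only swaps str.format for a nested f-string; the rendered output is identical. A minimal illustration (not part of the package):

total = 1234567
# Both styles add comma thousands separators, then replace commas with spaces -> "1 234 567"
old_style = "{:,}".format(total).replace(",", " ")
new_style = f"{total:,}".replace(",", " ")
assert old_style == new_style == "1 234 567"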
bizon/connectors/sources/hubspot/src/models/hs_object.py
@@ -40,7 +40,6 @@ class HubSpotObject(BaseModel):
  cls,
  raw_obj: dict,
  ):
-
  properties = {}

  for property_name, property_value in raw_obj.get("properties", {}).items():
bizon/connectors/sources/kafka/config/kafka_streams.example.yml (new file)
@@ -0,0 +1,124 @@
+ # Example: Kafka source with streams configuration
+ # This demonstrates the unified streams config that consolidates
+ # topic-to-destination mapping with schema definitions.
+
+ name: kafka_streams_example
+
+ source:
+   name: kafka
+   stream: topic
+   sync_mode: stream
+   # No topics needed - they are automatically extracted from streams config
+   nb_bytes_schema_id: 4
+   timestamp_ms_name: ts_ms
+   batch_size: 100
+   consumer_timeout: 30
+   bootstrap_servers: your-kafka-broker:9092
+   group_id: your-consumer-group
+   authentication:
+     type: basic
+     schema_registry_url: https://your-schema-registry:8081
+     params:
+       username: your-kafka-username
+       password: your-kafka-password
+
+ destination:
+   name: bigquery_streaming_v2
+   config:
+     dataset_id: your_dataset
+     dataset_location: US
+     project_id: your-gcp-project
+     unnest: true
+     time_partitioning:
+       type: DAY
+       field: __inserted_at
+
+ # Streams configuration - consolidates topic -> table -> schema mapping
+ # Each stream defines:
+ #   - source: where to read from (topic for Kafka)
+ #   - destination: where to write (table_id + schema)
+ streams:
+   - name: "users"
+     source:
+       topic: "cdc.public.users"
+     destination:
+       table_id: "your-gcp-project.your_dataset.users"
+       clustering_keys:
+         - "id"
+       record_schema:
+         - name: "id"
+           type: "INTEGER"
+           mode: "REQUIRED"
+         - name: "email"
+           type: "STRING"
+           mode: "NULLABLE"
+         - name: "payload"
+           type: "JSON"
+           mode: "NULLABLE"
+         - name: "__operation"
+           type: "STRING"
+           mode: "NULLABLE"
+         - name: "__deleted"
+           type: "BOOLEAN"
+           mode: "NULLABLE"
+         - name: "__kafka_partition"
+           type: "INTEGER"
+           mode: "NULLABLE"
+         - name: "__kafka_offset"
+           type: "INTEGER"
+           mode: "NULLABLE"
+         - name: "__kafka_topic"
+           type: "STRING"
+           mode: "NULLABLE"
+         - name: "__event_timestamp"
+           type: "TIMESTAMP"
+           mode: "NULLABLE"
+         - name: "__inserted_at"
+           type: "TIMESTAMP"
+           mode: "NULLABLE"
+           default_value_expression: "CURRENT_TIMESTAMP()"
+
+   - name: "orders"
+     source:
+       topic: "cdc.public.orders"
+     destination:
+       table_id: "your-gcp-project.your_dataset.orders"
+       clustering_keys:
+         - "id"
+         - "user_id"
+       record_schema:
+         - name: "id"
+           type: "INTEGER"
+           mode: "REQUIRED"
+         - name: "user_id"
+           type: "INTEGER"
+           mode: "REQUIRED"
+         - name: "payload"
+           type: "JSON"
+           mode: "NULLABLE"
+         - name: "__operation"
+           type: "STRING"
+           mode: "NULLABLE"
+         - name: "__deleted"
+           type: "BOOLEAN"
+           mode: "NULLABLE"
+         - name: "__kafka_partition"
+           type: "INTEGER"
+           mode: "NULLABLE"
+         - name: "__kafka_offset"
+           type: "INTEGER"
+           mode: "NULLABLE"
+         - name: "__kafka_topic"
+           type: "STRING"
+           mode: "NULLABLE"
+         - name: "__event_timestamp"
+           type: "TIMESTAMP"
+           mode: "NULLABLE"
+         - name: "__inserted_at"
+           type: "TIMESTAMP"
+           mode: "NULLABLE"
+           default_value_expression: "CURRENT_TIMESTAMP()"
+
+ engine:
+   runner:
+     type: stream  # Required when using streams config
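For orientation: each entry under streams pairs one Kafka topic with one destination table and its schema. As a rough sketch (the class below is a simplified stand-in, not bizon's actual TopicConfig), the mapping performed by set_streams_config() further down in this diff reduces to:

from pydantic import BaseModel

class TopicConfigSketch(BaseModel):
    """Simplified stand-in for the Kafka source's TopicConfig."""
    name: str
    destination_id: str

# The "users" stream in the YAML above becomes:
users_topic = TopicConfigSketch(
    name="cdc.public.users",                               # streams[0].source.topic
    destination_id="your-gcp-project.your_dataset.users",  # streams[0].destination.table_id
)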
bizon/connectors/sources/kafka/src/config.py
@@ -1,5 +1,6 @@
+ from collections.abc import Mapping
  from enum import Enum
- from typing import Any, List, Literal, Mapping
+ from typing import Any, List, Literal, Optional

  from pydantic import BaseModel, Field

@@ -17,7 +18,6 @@ class MessageEncoding(str, Enum):


  class KafkaAuthConfig(AuthConfig):
-
  type: Literal[AuthType.BASIC] = AuthType.BASIC # username and password authentication

  # Schema registry authentication
@@ -45,16 +45,20 @@ class TopicConfig(BaseModel):


  class KafkaSourceConfig(SourceConfig):
-
- # Mandatory Kafka configuration
- topics: List[TopicConfig] = Field(..., description="Kafka topic, comma separated")
+ # Kafka configuration
+ topics: Optional[List[TopicConfig]] = Field(
+ default=[],
+ description="Kafka topics. Can be empty if using streams configuration to define topics.",
+ )
  bootstrap_servers: str = Field(..., description="Kafka bootstrap servers")
  group_id: str = Field(default="bizon", description="Kafka group id")

  skip_message_empty_value: bool = Field(
  default=True, description="Skip messages with empty value (tombstone messages)"
  )
-
+ skip_message_invalid_keys: bool = Field(
+ default=False, description="Skip messages with invalid keys (unparsable JSON keys)"
+ )
  # Kafka consumer configuration
  batch_size: int = Field(100, description="Kafka batch size, number of messages to fetch at once.")
  consumer_timeout: int = Field(10, description="Kafka consumer timeout in seconds, before returning batch.")
bizon/connectors/sources/kafka/src/decode.py
@@ -1,6 +1,6 @@
  import io
  import struct
- from functools import lru_cache
+ from functools import cache
  from typing import Tuple, Union

  import fastavro
@@ -20,7 +20,7 @@ class Hashabledict(dict):
  return hash(frozenset(self.items()))


- @lru_cache(maxsize=None)
+ @cache
  def parse_global_id_from_serialized_message(message: bytes) -> Tuple[int, int]:
  """
  Parse the global id from the serialized message.
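Context for the swap above: functools.cache (Python 3.9+) is an unbounded lru_cache(maxsize=None), so behaviour is unchanged. Both require hashable arguments, which is what the Hashabledict wrapper in this file provides for dict-shaped schemas; a minimal illustration (not bizon code):

from functools import cache

class Hashabledict(dict):
    # A dict that can be hashed, so it can flow through cached functions
    def __hash__(self):
        return hash(frozenset(self.items()))

@cache  # equivalent to @lru_cache(maxsize=None)
def schema_field_count(schema: Hashabledict) -> int:
    return len(schema)

print(schema_field_count(Hashabledict(id="int", email="str")))  # 2, cached on repeat calls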
bizon/connectors/sources/kafka/src/source.py
@@ -1,7 +1,8 @@
  import traceback
+ from collections.abc import Mapping
  from datetime import datetime
- from functools import lru_cache
- from typing import Any, List, Mapping, Tuple
+ from functools import cache
+ from typing import Any, List, Tuple

  import orjson
  from avro.schema import Schema, parse
@@ -59,13 +60,25 @@ class TopicOffsets(BaseModel):
  return sum([partition.last for partition in self.partitions.values()])


- class KafkaSource(AbstractSource):
+ def on_error(err: KafkaError):
+ # Fires for client-level errors (incl. DNS resolve failures)
+ if err.fatal():
+ logger.error(f"Kafka client error: {err} | fatal={err.fatal()} retriable={err.retriable()}")
+ raise KafkaException(err)
+ else:
+ logger.warning(f"Kafka client error: {err} | fatal={err.fatal()} retriable={err.retriable()}")
+

+ class KafkaSource(AbstractSource):
  def __init__(self, config: KafkaSourceConfig):
  super().__init__(config)

  self.config: KafkaSourceConfig = config

+ # Ensure topics is always a list (not None)
+ if self.config.topics is None:
+ self.config.topics = []
+
  # Kafka consumer configuration.
  if self.config.authentication.type == AuthType.BASIC:
  self.config.consumer_config["sasl.mechanisms"] = "PLAIN"
@@ -76,12 +89,58 @@ class KafkaSource(AbstractSource):
  self.config.consumer_config["group.id"] = self.config.group_id
  self.config.consumer_config["bootstrap.servers"] = self.config.bootstrap_servers

+ # Set the error callback
+ self.config.consumer_config["error_cb"] = on_error
+
  # Consumer instance
  self.consumer = Consumer(self.config.consumer_config)

  # Map topic_name to destination_id
  self.topic_map = {topic.name: topic.destination_id for topic in self.config.topics}

+ def set_streams_config(self, streams: list) -> None:
+ """Configure Kafka topics from streams config.
+
+ This method enriches self.config.topics from the streams configuration,
+ ensuring that subsequent source instantiations (e.g., in init_job) have
+ access to the topics without duplication in the YAML config.
+
+ When a top-level 'streams' configuration is present, this method:
+ 1. Extracts Kafka topics from streams (topic field)
+ 2. Builds TopicConfig objects with destination_id from streams
+ 3. Populates self.config.topics if empty (modifies bizon_config.source in-place)
+ 4. Updates topic_map for record routing
+
+ Args:
+ streams: List of StreamConfig objects from BizonConfig.streams
+ """
+ from .config import TopicConfig
+
+ # Extract topics from streams
+ topics_from_streams = []
+ streams_map = {}
+
+ for stream in streams:
+ if hasattr(stream.source, "topic") and stream.source.topic:
+ topic_name = stream.source.topic
+ streams_map[topic_name] = stream
+
+ # Build TopicConfig from stream
+ topic_config = TopicConfig(name=topic_name, destination_id=stream.destination.table_id)
+ topics_from_streams.append(topic_config)
+
+ # Populate self.config.topics from streams (modifies bizon_config.source in-place)
+ # This ensures check_connection() and subsequent source instantiations have topics
+ if not self.config.topics and topics_from_streams:
+ self.config.topics = topics_from_streams
+ logger.info(f"Kafka: Populated {len(topics_from_streams)} topics from streams config")
+ for topic_config in topics_from_streams:
+ logger.info(f"  - Topic: {topic_config.name} -> {topic_config.destination_id}")
+
+ # Update topic_map with destination table_ids from streams
+ for topic, stream_config in streams_map.items():
+ self.topic_map[topic] = stream_config.destination.table_id
+
  @staticmethod
  def streams() -> List[str]:
  return ["topic"]
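The docstring above, together with the check_connection() error message in the next hunk, implies a call order when the top-level streams config is used. A hedged sketch of that order (illustrative only; kafka_source_config and bizon_config are placeholder names, not bizon APIs):

source = KafkaSource(config=kafka_source_config)  # config.topics may still be empty here
source.set_streams_config(bizon_config.streams)   # fills config.topics and topic_map from the streams entries
ok, error = source.check_connection()             # now sees the topics derived from streams
if not ok:
    raise RuntimeError(error)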
@@ -102,31 +161,52 @@ class KafkaSource(AbstractSource):
  def check_connection(self) -> Tuple[bool | Any | None]:
  """Check the connection to the Kafka source"""

- logger.info(f"Found: {len(self.consumer.list_topics().topics)} topics")
+ # Validate that topics have been configured
+ if not self.config.topics:
+ error_msg = (
+ "No topics configured. Either provide topics in source config or use streams configuration. "
+ "If using streams config, ensure set_streams_config() is called before check_connection()."
+ )
+ logger.error(error_msg)
+ return False, error_msg
+
+ try:
+ # Use a short timeout to avoid hanging on connection issues
+ cluster_metadata = self.consumer.list_topics(timeout=self.config.consumer_timeout)
+ topics = cluster_metadata.topics
+
+ logger.info(f"Found: {len(topics)} topics")

- topics = self.consumer.list_topics().topics
+ config_topics = [topic.name for topic in self.config.topics]

- config_topics = [topic.name for topic in self.config.topics]
+ # Display consumer config
+ # We ignore the key sasl.password and sasl.username
+ consumer_config = self.config.consumer_config.copy()
+ consumer_config.pop("sasl.password", None)
+ consumer_config.pop("sasl.username", None)
+ logger.info(f"Consumer config: {consumer_config}")

- # Display consumer config
- # We ignore the key sasl.password and sasl.username
- consumer_config = self.config.consumer_config.copy()
- consumer_config.pop("sasl.password", None)
- consumer_config.pop("sasl.username", None)
- logger.info(f"Consumer config: {consumer_config}")
+ for topic in config_topics:
+ if topic not in topics:
+ logger.error(f"Topic {topic} not found, available topics: {topics.keys()}")
+ return False, f"Topic {topic} not found"

- for topic in config_topics:
- if topic not in topics:
- logger.error(f"Topic {topic} not found, available topics: {topics.keys()}")
- return False, f"Topic {topic} not found"
+ logger.info(f"Topic {topic} has {len(topics[topic].partitions)} partitions")

- logger.info(f"Topic {topic} has {len(topics[topic].partitions)} partitions")
+ return True, None

- return True, None
+ except KafkaException as e:
+ error_msg = f"Kafka connection failed: {e}"
+ logger.error(error_msg)
+ return False, error_msg
+ except Exception as e:
+ error_msg = f"Connection check failed: {e}"
+ logger.error(error_msg)
+ return False, error_msg

  def get_number_of_partitions(self, topic: str) -> int:
  """Get the number of partitions for the topic"""
- return len(self.consumer.list_topics().topics[topic].partitions)
+ return len(self.consumer.list_topics(timeout=self.config.consumer_timeout).topics[topic].partitions)

  def get_offset_partitions(self, topic: str) -> TopicOffsets:
  """Get the offsets for each partition of the topic"""
@@ -134,7 +214,9 @@ class KafkaSource(AbstractSource):
  partitions: Mapping[int, OffsetPartition] = {}

  for i in range(self.get_number_of_partitions(topic)):
- offsets = self.consumer.get_watermark_offsets(TopicPartition(topic, i))
+ offsets = self.consumer.get_watermark_offsets(
+ TopicPartition(topic, i), timeout=self.config.consumer_timeout
+ )
  partitions[i] = OffsetPartition(first=offsets[0], last=offsets[1])

  return TopicOffsets(name=topic, partitions=partitions)
@@ -147,7 +229,7 @@ class KafkaSource(AbstractSource):
  total_records += self.get_offset_partitions(topic).total_offset
  return total_records

- @lru_cache(maxsize=None)
+ @cache
  def get_schema_from_registry(self, global_id: int) -> Tuple[Hashabledict, Schema]:
  """Get the schema from the registry, return a hashable dict and an avro schema object"""

@@ -194,10 +276,8 @@ class KafkaSource(AbstractSource):
  hashable_dict_schema, avro_schema = self.get_schema_from_registry(global_id=global_id)
  except SchemaNotFound as e:
  logger.error(
- (
- f"Message on topic {message.topic()} partition {message.partition()} at offset {message.offset()} has a SchemaID of {global_id} which is not found in Registry."
- f"message value: {message.value()}."
- )
+ f"Message on topic {message.topic()} partition {message.partition()} at offset {message.offset()} has a SchemaID of {global_id} which is not found in Registry."
+ f"message value: {message.value()}."
  )
  logger.error(traceback.format_exc())
  raise e
@@ -235,35 +315,46 @@ class KafkaSource(AbstractSource):
  records = []

  for message in encoded_messages:
+ MESSAGE_LOG_METADATA = (
+ f"Message for topic {message.topic()} partition {message.partition()} and offset {message.offset()}"
+ )

  if message.error():
  # If the message is too large, we skip it and update the offset
  if message.error().code() == KafkaError.MSG_SIZE_TOO_LARGE:
  logger.error(
- (
- f"Message for topic {message.topic()} partition {message.partition()} and offset {message.offset()} is too large. "
- f"Raised MSG_SIZE_TOO_LARGE, if manually setting the offset, the message might not exist. Double-check in Confluent Cloud."
- )
+ f"{MESSAGE_LOG_METADATA} is too large. "
+ "Raised MSG_SIZE_TOO_LARGE, if manually setting the offset, the message might not exist. Double-check in Confluent Cloud."
  )

- logger.error(
- (
- f"Error while consuming message for topic {message.topic()} partition {message.partition()} and offset {message.offset()}: "
- f"{message.error()}"
- )
- )
+ logger.error(f"{MESSAGE_LOG_METADATA}: {message.error()}")
  raise KafkaException(message.error())

  # We skip tombstone messages
  if self.config.skip_message_empty_value and not message.value():
- logger.debug(
- f"Message for topic {message.topic()} partition {message.partition()} and offset {message.offset()} is empty, skipping."
- )
+ logger.debug(f"{MESSAGE_LOG_METADATA} is empty, skipping.")
  continue

+ # Parse message keys
+ if message.key():
+ try:
+ message_keys = orjson.loads(message.key().decode("utf-8"))
+ except orjson.JSONDecodeError as e:
+ # We skip messages with invalid keys
+ if self.config.skip_message_invalid_keys:
+ logger.warning(f"{MESSAGE_LOG_METADATA} has an invalid key={message.key()}, skipping.")
+ # Skip the message
+ continue
+
+ logger.error(
+ f"{MESSAGE_LOG_METADATA}: Error while parsing message key: {e}, raw key: {message.key()}"
+ )
+ raise e
+ else:
+ message_keys = {}
+
  # Decode the message
  try:
-
  decoded_message, hashable_dict_schema = self.decode(message)

  data = {
@@ -271,7 +362,7 @@ class KafkaSource(AbstractSource):
  "offset": message.offset(),
  "partition": message.partition(),
  "timestamp": message.timestamp()[1],
- "keys": orjson.loads(message.key().decode("utf-8")) if message.key() else {},
+ "keys": message_keys,
  "headers": (
  {key: value.decode("utf-8") for key, value in message.headers()} if message.headers() else {}
  ),
@@ -290,17 +381,27 @@ class KafkaSource(AbstractSource):

  except Exception as e:
  logger.error(
- (
- f"Error while decoding message for topic {message.topic()} on partition {message.partition()}: {e} at offset {message.offset()} "
- f"with value: {message.value()} and key: {message.key()}"
- )
+ f"{MESSAGE_LOG_METADATA}: Error while decoding message: {e} "
+ f"with value: {message.value()} and key: {message.key()}"
  )
- # Try to parse error message from the message
+
+ # Try to parse error message from the message value
  try:
  message_raw_text = message.value().decode("utf-8")
  logger.error(f"Parsed Kafka value: {message_raw_text}")
  except UnicodeDecodeError:
- logger.error("Message is not a valid UTF-8 string")
+ logger.error("Message value is not a valid UTF-8 string")
+
+ # Try to parse error message from the message headers
+ if message.headers():
+ try:
+ headers_dict = {key: value.decode("utf-8") for key, value in message.headers()}
+ logger.error(f"Parsed Kafka headers: {headers_dict}")
+ except UnicodeDecodeError as header_error:
+ logger.error(f"Some message headers are not valid UTF-8 strings: {header_error}")
+ logger.error(f"Raw message headers: {list(message.headers())}")
+ else:
+ logger.error("Message headers are None or empty")

  logger.error(traceback.format_exc())
  raise e
bizon/connectors/sources/notion/config/api_key.example.yml (new file)
@@ -0,0 +1,35 @@
+ # Notion Source Configuration
+ # This example shows how to configure the Notion source connector
+
+ source:
+   name: notion
+   stream: pages # Options: databases, data_sources, pages, blocks, users
+   authentication:
+     type: api_key
+     params:
+       token: secret_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx # Your Notion integration token
+
+   # List of database IDs to fetch data from
+   # Find the ID in the database URL: notion.so/{workspace}/{database_id}?v=...
+   database_ids:
+     - "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
+     - "yyyyyyyy-yyyy-yyyy-yyyy-yyyyyyyyyyyy"
+
+   # List of specific page IDs to fetch (optional)
+   # Find the ID in the page URL: notion.so/{page_id}
+   page_ids:
+     - "zzzzzzzz-zzzz-zzzz-zzzz-zzzzzzzzzzzz"
+
+   # Whether to fetch nested blocks recursively (default: true)
+   # Only applies to blocks stream
+   fetch_blocks_recursively: true
+
+   # Number of results per API call (1-100, default: 100)
+   page_size: 100
+
+ destination:
+   name: bigquery
+   config:
+     project_id: my-project
+     dataset_id: notion_data
+     # ... other destination config
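The comments above describe finding IDs by hand in Notion URLs. Purely as an illustration (this helper is not part of bizon or any Notion client), the trailing 32-hex-character ID can be pulled out of a database URL and re-dashed into the UUID form used by the placeholders in this example:

import re

def notion_database_id(url: str) -> str:
    """Illustrative only: extract the 32-hex-char ID from a notion.so database URL."""
    path = url.split("?")[0]  # drop the "?v=..." query string
    match = re.search(r"[0-9a-f]{32}$", path)
    if not match:
        raise ValueError(f"No database ID found in {url!r}")
    h = match.group(0)
    return f"{h[:8]}-{h[8:12]}-{h[12:16]}-{h[16:20]}-{h[20:]}"

# notion_database_id("https://notion.so/acme/My-DB-0123456789abcdef0123456789abcdef?v=1")
# -> "01234567-89ab-cdef-0123-456789abcdef"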
{bizon-0.1.2.dist-info → bizon-0.2.0.dist-info/licenses}/LICENSE: file moved without content changes.