bizon 0.1.2__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. bizon/alerting/alerts.py +0 -1
  2. bizon/common/models.py +182 -4
  3. bizon/connectors/destinations/bigquery/src/config.py +0 -1
  4. bizon/connectors/destinations/bigquery/src/destination.py +11 -8
  5. bizon/connectors/destinations/bigquery_streaming/config/bigquery_streaming.example.yml +74 -0
  6. bizon/connectors/destinations/bigquery_streaming/src/destination.py +4 -5
  7. bizon/connectors/destinations/bigquery_streaming_v2/config/bigquery_streaming_v2.example.yml +79 -0
  8. bizon/connectors/destinations/bigquery_streaming_v2/src/destination.py +4 -6
  9. bizon/connectors/destinations/file/config/file.example.yml +40 -0
  10. bizon/connectors/destinations/file/src/config.py +1 -1
  11. bizon/connectors/destinations/file/src/destination.py +0 -5
  12. bizon/connectors/destinations/logger/config/logger.example.yml +30 -0
  13. bizon/connectors/destinations/logger/src/config.py +0 -2
  14. bizon/connectors/destinations/logger/src/destination.py +1 -2
  15. bizon/connectors/sources/cycle/src/source.py +2 -6
  16. bizon/connectors/sources/dummy/src/source.py +0 -4
  17. bizon/connectors/sources/gsheets/src/source.py +2 -3
  18. bizon/connectors/sources/hubspot/src/hubspot_base.py +0 -1
  19. bizon/connectors/sources/hubspot/src/hubspot_objects.py +3 -4
  20. bizon/connectors/sources/hubspot/src/models/hs_object.py +0 -1
  21. bizon/connectors/sources/kafka/config/kafka_streams.example.yml +124 -0
  22. bizon/connectors/sources/kafka/src/config.py +10 -6
  23. bizon/connectors/sources/kafka/src/decode.py +2 -2
  24. bizon/connectors/sources/kafka/src/source.py +147 -46
  25. bizon/connectors/sources/notion/config/api_key.example.yml +35 -0
  26. bizon/connectors/sources/notion/src/__init__.py +0 -0
  27. bizon/connectors/sources/notion/src/config.py +59 -0
  28. bizon/connectors/sources/notion/src/source.py +1159 -0
  29. bizon/connectors/sources/notion/tests/notion_pipeline.py +7 -0
  30. bizon/connectors/sources/notion/tests/test_notion.py +113 -0
  31. bizon/connectors/sources/periscope/src/source.py +0 -6
  32. bizon/connectors/sources/pokeapi/src/source.py +0 -1
  33. bizon/connectors/sources/sana_ai/config/sana.example.yml +25 -0
  34. bizon/connectors/sources/sana_ai/src/source.py +85 -0
  35. bizon/destination/buffer.py +0 -1
  36. bizon/destination/config.py +0 -1
  37. bizon/destination/destination.py +1 -4
  38. bizon/engine/backend/adapters/sqlalchemy/backend.py +2 -5
  39. bizon/engine/backend/adapters/sqlalchemy/config.py +0 -1
  40. bizon/engine/config.py +0 -1
  41. bizon/engine/engine.py +0 -1
  42. bizon/engine/pipeline/consumer.py +0 -1
  43. bizon/engine/pipeline/producer.py +1 -5
  44. bizon/engine/queue/adapters/kafka/config.py +1 -1
  45. bizon/engine/queue/adapters/kafka/queue.py +0 -1
  46. bizon/engine/queue/adapters/python_queue/consumer.py +0 -1
  47. bizon/engine/queue/adapters/python_queue/queue.py +0 -2
  48. bizon/engine/queue/adapters/rabbitmq/consumer.py +0 -1
  49. bizon/engine/queue/adapters/rabbitmq/queue.py +0 -1
  50. bizon/engine/queue/config.py +0 -2
  51. bizon/engine/runner/adapters/process.py +0 -2
  52. bizon/engine/runner/adapters/streaming.py +55 -1
  53. bizon/engine/runner/adapters/thread.py +0 -2
  54. bizon/engine/runner/config.py +0 -1
  55. bizon/engine/runner/runner.py +0 -2
  56. bizon/monitoring/datadog/monitor.py +5 -3
  57. bizon/monitoring/noop/monitor.py +1 -1
  58. bizon/source/auth/authenticators/abstract_oauth.py +11 -3
  59. bizon/source/auth/authenticators/abstract_token.py +2 -1
  60. bizon/source/auth/authenticators/basic.py +1 -1
  61. bizon/source/auth/authenticators/cookies.py +2 -1
  62. bizon/source/auth/authenticators/oauth.py +8 -3
  63. bizon/source/config.py +0 -2
  64. bizon/source/cursor.py +8 -16
  65. bizon/source/discover.py +3 -6
  66. bizon/source/models.py +0 -1
  67. bizon/source/session.py +0 -1
  68. bizon/source/source.py +17 -2
  69. bizon/transform/config.py +0 -2
  70. bizon/transform/transform.py +0 -3
  71. {bizon-0.1.2.dist-info → bizon-0.2.0.dist-info}/METADATA +62 -42
  72. bizon-0.2.0.dist-info/RECORD +136 -0
  73. {bizon-0.1.2.dist-info → bizon-0.2.0.dist-info}/WHEEL +1 -1
  74. bizon-0.2.0.dist-info/entry_points.txt +2 -0
  75. bizon-0.1.2.dist-info/RECORD +0 -123
  76. bizon-0.1.2.dist-info/entry_points.txt +0 -3
  77. {bizon-0.1.2.dist-info → bizon-0.2.0.dist-info/licenses}/LICENSE +0 -0
bizon/alerting/alerts.py CHANGED
@@ -7,7 +7,6 @@ from bizon.alerting.models import AlertingConfig, AlertMethod, LogLevel
 
 
 class AbstractAlert(ABC):
-
     def __init__(self, type: AlertMethod, config: AlertingConfig, log_levels: List[LogLevel] = [LogLevel.ERROR]):
         self.type = type
         self.config = config
bizon/common/models.py CHANGED
@@ -1,9 +1,12 @@
-from typing import Optional, Union
+from typing import Any, Optional, Union
 
-from pydantic import BaseModel, ConfigDict, Field
+from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
 
 from bizon.alerting.models import AlertingConfig
-from bizon.connectors.destinations.bigquery.src.config import BigQueryConfig
+from bizon.connectors.destinations.bigquery.src.config import (
+    BigQueryColumn,
+    BigQueryConfig,
+)
 from bizon.connectors.destinations.bigquery_streaming.src.config import (
     BigQueryStreamingConfig,
 )
@@ -18,8 +21,74 @@ from bizon.source.config import SourceConfig, SourceSyncModes
 from bizon.transform.config import TransformModel
 
 
-class BizonConfig(BaseModel):
+class StreamSourceConfig(BaseModel):
+    """Source-specific stream routing configuration.
+
+    Uses extra='allow' to support source-specific fields like:
+    - topic (Kafka)
+    - endpoint (API sources)
+    - channel (other streaming sources)
+    """
+
+    model_config = ConfigDict(extra="allow")
+
+    # Common field for stream identifier
+    name: Optional[str] = Field(None, description="Stream identifier within the source")
+
+    # Kafka-specific
+    topic: Optional[str] = Field(None, description="Kafka topic name")
+
+    # API-specific
+    endpoint: Optional[str] = Field(None, description="API endpoint path")
+
+
+class StreamDestinationConfig(BaseModel):
+    """Destination configuration for a stream.
+
+    Supports destination-specific schema definitions and options.
+    Uses extra='allow' to support destination-specific overrides.
+    """
 
+    model_config = ConfigDict(extra="allow")
+
+    # Universal destination identifier
+    table_id: str = Field(..., description="Full destination identifier (e.g., project.dataset.table)")
+
+    # BigQuery-specific schema (can be extended for other destinations)
+    record_schema: Optional[list[BigQueryColumn]] = Field(None, description="Schema for the destination records")
+    clustering_keys: Optional[list[str]] = Field(None, description="Clustering keys for the destination table")
+
+
+class StreamConfig(BaseModel):
+    """Configuration for a single stream.
+
+    Consolidates source stream routing and destination configuration in one place,
+    eliminating duplication of destination_id between source and destination configs.
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    name: str = Field(..., description="Logical name for this stream")
+    source: StreamSourceConfig = Field(..., description="Source-specific routing configuration")
+    destination: StreamDestinationConfig = Field(
+        ..., description="Destination configuration including table and schema"
+    )
+
+    @field_validator("destination")
+    @classmethod
+    def validate_table_id_format(cls, v: StreamDestinationConfig) -> StreamDestinationConfig:
+        """Ensure table_id follows expected format for BigQuery-like destinations."""
+        if v.table_id:
+            parts = v.table_id.split(".")
+            if len(parts) != 3:
+                raise ValueError(
+                    f"table_id must be in format 'project.dataset.table', got: {v.table_id}. "
+                    f"Found {len(parts)} parts instead of 3."
+                )
+        return v
+
+
+class BizonConfig(BaseModel):
     # Forbid extra keys in the model
     model_config = ConfigDict(extra="forbid")
 
@@ -63,6 +132,115 @@ class BizonConfig(BaseModel):
         default=None,
     )
 
+    streams: Optional[list[StreamConfig]] = Field(
+        None,
+        description="Stream routing configuration (opt-in for multi-table streaming). "
+        "Consolidates source stream definitions with destination tables and schemas.",
+    )
+
+    @field_validator("streams")
+    @classmethod
+    def validate_streams_config(cls, v: Optional[list[StreamConfig]], info) -> Optional[list[StreamConfig]]:
+        """Validate streams configuration consistency."""
+        if not v:
+            return v
+
+        # Check for duplicate stream names
+        names = [s.name for s in v]
+        if len(names) != len(set(names)):
+            duplicates = [name for name in names if names.count(name) > 1]
+            raise ValueError(f"Duplicate stream names found in streams configuration: {set(duplicates)}")
+
+        # Check for duplicate table_ids
+        table_ids = [s.destination.table_id for s in v]
+        if len(table_ids) != len(set(table_ids)):
+            duplicates = [tid for tid in table_ids if table_ids.count(tid) > 1]
+            raise ValueError(f"Duplicate table_ids found in streams configuration: {set(duplicates)}")
+
+        # Validate that source sync_mode is 'stream' if streams config is used
+        source_config = info.data.get("source") if info.data else None
+        if source_config and source_config.sync_mode != SourceSyncModes.STREAM:
+            raise ValueError(
+                f"Configuration Error: 'streams' configuration requires source.sync_mode='stream'. "
+                f"Current sync_mode: {source_config.sync_mode}. "
+                f"Please update your config to use:\n"
+                f"  source:\n"
+                f"    sync_mode: stream"
+            )
+
+        return v
+
+    @model_validator(mode="before")
+    @classmethod
+    def inject_config_from_streams(cls, data: Any) -> Any:
+        """Inject source and destination config from streams.
+
+        This runs BEFORE field validation, enriching both source and destination
+        configs from the streams configuration. This allows:
+        1. Sources (like Kafka) to omit topics - they're extracted from streams
+        2. Destinations with unnest=true to work without duplicate record_schemas
+
+        This is source-agnostic: each source type can extract what it needs from streams.
+        """
+        if not isinstance(data, dict):
+            return data
+
+        streams = data.get("streams")
+        if not streams:
+            return data
+
+        source = data.get("source")
+        if source and isinstance(source, dict):
+            source_name = source.get("name")
+
+            # Kafka: inject topics from streams
+            if source_name == "kafka":
+                # Check if topics is missing, None, or empty list
+                if not source.get("topics") or source.get("topics") == []:
+                    topics = []
+                    for stream in streams:
+                        if isinstance(stream, dict):
+                            stream_src = stream.get("source", {})
+                            stream_dest = stream.get("destination", {})
+                            if stream_src.get("topic"):
+                                topics.append(
+                                    {
+                                        "name": stream_src.get("topic"),
+                                        "destination_id": stream_dest.get("table_id", ""),
+                                    }
+                                )
+                    if topics:
+                        source["topics"] = topics
+
+        destination = data.get("destination")
+        if not destination or not isinstance(destination, dict):
+            return data
+
+        destination_config = destination.get("config")
+        if not destination_config or not isinstance(destination_config, dict):
+            return data
+
+        # Only inject if record_schemas is not already set or is empty
+        if not destination_config.get("record_schemas"):
+            # Build record_schemas from streams
+            record_schemas = []
+            for stream in streams:
+                if isinstance(stream, dict):
+                    stream_dest = stream.get("destination", {})
+                    if stream_dest.get("record_schema"):
+                        record_schema_config = {
+                            "destination_id": stream_dest.get("table_id"),
+                            "record_schema": stream_dest.get("record_schema"),
+                            "clustering_keys": stream_dest.get("clustering_keys"),
+                        }
+                        record_schemas.append(record_schema_config)
+
+            # Inject into destination config
+            if record_schemas:
+                destination_config["record_schemas"] = record_schemas
+
+        return data
+
 
 class SyncMetadata(BaseModel):
     """Model which stores general metadata around a sync.
bizon/connectors/destinations/bigquery/src/config.py CHANGED
@@ -98,7 +98,6 @@ class BigQueryRecordSchemaConfig(BaseModel):
 
 
 class BigQueryConfigDetails(AbstractDestinationDetailsConfig):
-
     # Table details
     project_id: str = Field(..., description="BigQuery Project ID")
     dataset_id: str = Field(..., description="BigQuery Dataset ID")
bizon/connectors/destinations/bigquery/src/destination.py CHANGED
@@ -22,7 +22,6 @@ from .config import BigQueryColumn, BigQueryConfigDetails
 
 
 class BigQueryDestination(AbstractDestination):
-
     def __init__(
         self,
         sync_metadata: SyncMetadata,
@@ -56,7 +55,6 @@ class BigQueryDestination(AbstractDestination):
 
     @property
     def temp_table_id(self) -> str:
-
         if self.sync_metadata.sync_mode == SourceSyncModes.FULL_REFRESH:
             return f"{self.table_id}_temp"
 
@@ -67,7 +65,6 @@ class BigQueryDestination(AbstractDestination):
         return f"{self.table_id}"
 
     def get_bigquery_schema(self, df_destination_records: pl.DataFrame) -> List[bigquery.SchemaField]:
-
         # Case we unnest the data
         if self.config.unnest:
             return [
@@ -113,9 +110,7 @@
     # https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.dbapi.DataError
 
     def convert_and_upload_to_buffer(self, df_destination_records: pl.DataFrame) -> str:
-
         if self.buffer_format == "parquet":
-
             # Upload the Parquet file to GCS
             file_name = f"{self.sync_metadata.source_name}/{self.sync_metadata.stream_name}/{str(uuid4())}.parquet"
 
@@ -153,7 +148,6 @@ class BigQueryDestination(AbstractDestination):
         )
 
     def load_to_bigquery(self, gcs_file: str, df_destination_records: pl.DataFrame):
-
         # We always partition by the loaded_at field
         time_partitioning = TimePartitioning(field="_bizon_loaded_at", type_=self.config.time_partitioning)
 
@@ -171,7 +165,6 @@ class BigQueryDestination(AbstractDestination):
         assert result.state == "DONE", f"Job failed with state {result.state} with error {result.error_result}"
 
     def write_records(self, df_destination_records: pl.DataFrame) -> Tuple[bool, str]:
-
         # Rename fields to match BigQuery schema
         df_destination_records = df_destination_records.rename(
             {
@@ -201,7 +194,17 @@
     def finalize(self):
         if self.sync_metadata.sync_mode == SourceSyncModes.FULL_REFRESH:
             logger.info(f"Loading temp table {self.temp_table_id} data into {self.table_id} ...")
-            self.bq_client.query(f"CREATE OR REPLACE TABLE {self.table_id} AS SELECT * FROM {self.temp_table_id}")
+            query = f"CREATE OR REPLACE TABLE {self.table_id} AS SELECT * FROM {self.temp_table_id}"
+            result = self.bq_client.query(query)
+            bq_result = result.result()  # Waits for the job to complete
+            logger.info(f"BigQuery CREATE OR REPLACE query result: {bq_result}")
+            # Check if the destination table exists by fetching it; raise if it doesn't exist
+            try:
+                self.bq_client.get_table(self.table_id)
+            except NotFound:
+                logger.error(f"Table {self.table_id} not found")
+                raise Exception(f"Table {self.table_id} not found")
+            # Cleanup
             logger.info(f"Deleting temp table {self.temp_table_id} ...")
             self.bq_client.delete_table(self.temp_table_id, not_found_ok=True)
             return True
bizon/connectors/destinations/bigquery_streaming/config/bigquery_streaming.example.yml ADDED
@@ -0,0 +1,74 @@
+# BigQuery Streaming Destination Configuration
+# Uses the BigQuery Storage Write API for low-latency inserts
+#
+# Use this destination when:
+# - You need near real-time data loading
+# - Low latency is more important than cost optimization
+# - Working with streaming/continuous data sources
+#
+# Requirements:
+# - Service account with bigquery.dataEditor role
+# - Dataset must already exist
+
+name: source_to_bigquery_streaming
+
+source:
+  name: <YOUR_SOURCE>
+  stream: <YOUR_STREAM>
+  authentication:
+    type: api_key
+    params:
+      token: <YOUR_API_KEY>
+
+destination:
+  name: bigquery_streaming
+  config:
+    # GCP Project ID
+    project_id: <YOUR_GCP_PROJECT>
+
+    # BigQuery dataset (must exist)
+    dataset_id: <YOUR_DATASET>
+
+    # Dataset location (US, EU, etc.)
+    dataset_location: US
+
+    # Time partitioning (optional)
+    time_partitioning:
+      type: DAY  # Options: DAY, HOUR, MONTH, YEAR
+      field: _bizon_loaded_at
+
+    # Max rows per streaming request (max 10000)
+    bq_max_rows_per_request: 5000
+
+    # Buffer settings
+    buffer_size: 50  # MB before flushing
+    buffer_flush_timeout: 300  # Seconds before forcing flush
+
+    # Authentication (optional - uses ADC if not provided)
+    # authentication:
+    #   service_account_key: |
+    #     {
+    #       "type": "service_account",
+    #       "project_id": "<YOUR_GCP_PROJECT>",
+    #       ...
+    #     }
+
+    # Schema definition for unnesting (optional)
+    # Required if unnest: true
+    # unnest: true
+    # record_schemas:
+    #   - destination_id: my_table
+    #     record_schema:
+    #       - name: id
+    #         type: STRING
+    #         mode: REQUIRED
+    #       - name: created_at
+    #         type: TIMESTAMP
+    #         mode: NULLABLE
+
+engine:
+  backend:
+    type: bigquery
+    database: <YOUR_GCP_PROJECT>
+    schema: bizon_state
+    syncCursorInDBEvery: 10
bizon/connectors/destinations/bigquery_streaming/src/destination.py CHANGED
@@ -43,7 +43,6 @@ from .config import BigQueryStreamingConfigDetails
 
 
 class BigQueryStreamingDestination(AbstractDestination):
-
     # Add constants for limits
     MAX_REQUEST_SIZE_BYTES = 5 * 1024 * 1024  # 5 MB (max is 10MB)
     MAX_ROW_SIZE_BYTES = 0.9 * 1024 * 1024  # 1 MB
@@ -78,7 +77,6 @@ class BigQueryStreamingDestination(AbstractDestination):
         return self.destination_id or f"{self.project_id}.{self.dataset_id}.{tabled_id}"
 
     def get_bigquery_schema(self) -> List[bigquery.SchemaField]:
-
         if self.config.unnest:
             if len(list(self.record_schemas.keys())) == 1:
                 self.destination_id = list(self.record_schemas.keys())[0]
@@ -169,7 +167,6 @@ class BigQueryStreamingDestination(AbstractDestination):
         Safe cast record values to the correct type for BigQuery.
         """
         for col in self.record_schemas[self.destination_id]:
-
             # Handle dicts as strings
             if col.type in [BigQueryColumnType.STRING, BigQueryColumnType.JSON]:
                 if isinstance(row[col.name], dict) or isinstance(row[col.name], list):
@@ -355,7 +352,9 @@ class BigQueryStreamingDestination(AbstractDestination):
                     len(current_batch) >= self.bq_max_rows_per_request
                     or current_batch_size + item_size > self.MAX_REQUEST_SIZE_BYTES
                 ):
-                    logger.debug(f"Yielding batch of {len(current_batch)} rows, size: {current_batch_size/1024/1024:.2f}MB")
+                    logger.debug(
+                        f"Yielding batch of {len(current_batch)} rows, size: {current_batch_size / 1024 / 1024:.2f}MB"
+                    )
                     yield {"stream_batch": current_batch, "json_batch": large_rows}
                     current_batch = []
                     current_batch_size = 0
@@ -371,7 +370,7 @@ class BigQueryStreamingDestination(AbstractDestination):
         # Yield the last batch
         if current_batch:
             logger.debug(
-                f"Yielding streaming batch of {len(current_batch)} rows, size: {current_batch_size/1024/1024:.2f}MB"
+                f"Yielding streaming batch of {len(current_batch)} rows, size: {current_batch_size / 1024 / 1024:.2f}MB"
             )
             logger.debug(f"Yielding large rows batch of {len(large_rows)} rows")
             yield {"stream_batch": current_batch, "json_batch": large_rows}
bizon/connectors/destinations/bigquery_streaming_v2/config/bigquery_streaming_v2.example.yml ADDED
@@ -0,0 +1,79 @@
+# BigQuery Streaming V2 Destination Configuration
+# Uses the BigQuery Storage Write API (v2) for improved streaming performance
+#
+# Use this destination when:
+# - You need the latest BigQuery streaming features
+# - Working with high-volume streaming data
+# - Require better error handling and retry logic
+#
+# Differences from v1:
+# - Improved batching and retry logic
+# - Better handling of schema evolution
+# - Enhanced error reporting
+#
+# Requirements:
+# - Service account with bigquery.dataEditor role
+# - Dataset must already exist
+
+name: source_to_bigquery_streaming_v2
+
+source:
+  name: <YOUR_SOURCE>
+  stream: <YOUR_STREAM>
+  authentication:
+    type: api_key
+    params:
+      token: <YOUR_API_KEY>
+
+destination:
+  name: bigquery_streaming_v2
+  config:
+    # GCP Project ID
+    project_id: <YOUR_GCP_PROJECT>
+
+    # BigQuery dataset (must exist)
+    dataset_id: <YOUR_DATASET>
+
+    # Dataset location (US, EU, etc.)
+    dataset_location: US
+
+    # Time partitioning (optional)
+    time_partitioning:
+      type: DAY  # Options: DAY, HOUR, MONTH, YEAR
+      field: _bizon_loaded_at
+
+    # Max rows per streaming request (max 10000)
+    bq_max_rows_per_request: 5000
+
+    # Buffer settings
+    buffer_size: 50  # MB before flushing
+    buffer_flush_timeout: 300  # Seconds before forcing flush
+
+    # Authentication (optional - uses ADC if not provided)
+    # authentication:
+    #   service_account_key: |
+    #     {
+    #       "type": "service_account",
+    #       "project_id": "<YOUR_GCP_PROJECT>",
+    #       ...
+    #     }
+
+    # Schema definition for unnesting (optional)
+    # Required if unnest: true
+    # unnest: true
+    # record_schemas:
+    #   - destination_id: my_table
+    #     record_schema:
+    #       - name: id
+    #         type: STRING
+    #         mode: REQUIRED
+    #       - name: created_at
+    #         type: TIMESTAMP
+    #         mode: NULLABLE
+
+engine:
+  backend:
+    type: bigquery
+    database: <YOUR_GCP_PROJECT>
+    schema: bizon_state
+    syncCursorInDBEvery: 10
bizon/connectors/destinations/bigquery_streaming_v2/src/destination.py CHANGED
@@ -46,7 +46,6 @@ from .proto_utils import get_proto_schema_and_class
 
 
 class BigQueryStreamingV2Destination(AbstractDestination):
-
     # Add constants for limits
     MAX_REQUEST_SIZE_BYTES = 9.5 * 1024 * 1024  # 9.5 MB (max is 10MB)
     MAX_ROW_SIZE_BYTES = 8 * 1024 * 1024  # 8 MB (max is 10MB)
@@ -83,7 +82,6 @@ class BigQueryStreamingV2Destination(AbstractDestination):
         return self.destination_id or f"{self.project_id}.{self.dataset_id}.{tabled_id}"
 
     def get_bigquery_schema(self) -> List[bigquery.SchemaField]:
-
         if self.config.unnest:
             if len(list(self.record_schemas.keys())) == 1:
                 self.destination_id = list(self.record_schemas.keys())[0]
@@ -172,7 +170,6 @@ class BigQueryStreamingV2Destination(AbstractDestination):
         Safe cast record values to the correct type for BigQuery.
         """
         for col in self.record_schemas[self.destination_id]:
-
             # Handle dicts as strings
             if col.type in ["STRING", "JSON"]:
                 if isinstance(row[col.name], dict) or isinstance(row[col.name], list):
@@ -295,7 +292,6 @@ class BigQueryStreamingV2Destination(AbstractDestination):
             raise
 
     def load_to_bigquery_via_streaming(self, df_destination_records: pl.DataFrame) -> str:
-
         # Create table if it does not exist
         schema = self.get_bigquery_schema()
         table = bigquery.Table(self.table_id, schema=schema)
@@ -423,7 +419,9 @@ class BigQueryStreamingV2Destination(AbstractDestination):
                     len(current_batch) >= self.bq_max_rows_per_request
                     or current_batch_size + item_size > self.MAX_REQUEST_SIZE_BYTES
                 ):
-                    logger.debug(f"Yielding batch of {len(current_batch)} rows, size: {current_batch_size/1024/1024:.2f}MB")
+                    logger.debug(
+                        f"Yielding batch of {len(current_batch)} rows, size: {current_batch_size / 1024 / 1024:.2f}MB"
+                    )
                     yield {"stream_batch": current_batch, "json_batch": large_rows}
                     current_batch = []
                     current_batch_size = 0
@@ -439,7 +437,7 @@ class BigQueryStreamingV2Destination(AbstractDestination):
         # Yield the last batch
         if current_batch:
             logger.info(
-                f"Yielding streaming batch of {len(current_batch)} rows, size: {current_batch_size/1024/1024:.2f}MB"
+                f"Yielding streaming batch of {len(current_batch)} rows, size: {current_batch_size / 1024 / 1024:.2f}MB"
            )
            if large_rows:
                logger.warning(f"Yielding large rows batch of {len(large_rows)} rows")
bizon/connectors/destinations/file/config/file.example.yml ADDED
@@ -0,0 +1,40 @@
+# File Destination Configuration
+# Writes records to a local JSON file (NDJSON format)
+#
+# Use this destination when:
+# - Exporting data for local analysis
+# - Creating data backups
+# - Debugging or inspecting extracted data
+# - Testing before loading to cloud destinations
+
+name: source_to_file
+
+source:
+  name: <YOUR_SOURCE>
+  stream: <YOUR_STREAM>
+  authentication:
+    type: api_key
+    params:
+      token: <YOUR_API_KEY>
+
+destination:
+  name: file
+  config:
+    # File path for output (relative or absolute)
+    # The destination_id will be used as the filename
+    destination_id: output.jsonl
+
+    # Output format (currently only json/ndjson supported)
+    format: json
+
+    # Buffer settings (optional)
+    # buffer_size: 50  # MB before flushing to file
+    # buffer_flush_timeout: 600  # Seconds before forcing flush
+
+# Optional: Use in-memory backend for testing
+engine:
+  backend:
+    type: sqlite_in_memory
+    config:
+      database: not_used
+      schema: not_used
bizon/connectors/destinations/file/src/config.py CHANGED
@@ -1,5 +1,5 @@
 from enum import Enum
-from typing import Literal, Optional
+from typing import Literal
 
 from pydantic import Field
 
bizon/connectors/destinations/file/src/destination.py CHANGED
@@ -13,7 +13,6 @@ from .config import FileDestinationDetailsConfig
 
 
 class FileDestination(AbstractDestination):
-
     def __init__(
         self,
         sync_metadata: SyncMetadata,
@@ -32,15 +31,11 @@ class FileDestination(AbstractDestination):
         return True
 
     def write_records(self, df_destination_records: pl.DataFrame) -> Tuple[bool, str]:
-
         if self.config.unnest:
-
             schema_keys = set([column.name for column in self.record_schemas[self.destination_id]])
 
             with open(f"{self.destination_id}.json", "a") as f:
-
                 for value in [orjson.loads(data) for data in df_destination_records["source_data"].to_list()]:
-
                     assert set(value.keys()) == schema_keys, "Keys do not match the schema"
 
                     # Unnest the source_data column
bizon/connectors/destinations/logger/config/logger.example.yml ADDED
@@ -0,0 +1,30 @@
+# Logger Destination Configuration
+# Prints records to console - useful for testing and debugging
+#
+# Use this destination when:
+# - Testing a new source connector
+# - Debugging data extraction issues
+# - Verifying record structure before loading to production
+
+name: source_to_logger
+
+source:
+  name: <YOUR_SOURCE>
+  stream: <YOUR_STREAM>
+  authentication:
+    type: api_key
+    params:
+      token: <YOUR_API_KEY>
+
+destination:
+  name: logger
+  config:
+    dummy: bizon  # Required placeholder field
+
+# Optional: Use in-memory backend for testing (no persistence)
+engine:
+  backend:
+    type: sqlite_in_memory
+    config:
+      database: not_used
+      schema: not_used
bizon/connectors/destinations/logger/src/config.py CHANGED
@@ -1,7 +1,5 @@
 from typing import Literal, Optional
 
-from pydantic import BaseModel
-
 from bizon.destination.config import (
     AbstractDestinationConfig,
     AbstractDestinationDetailsConfig,
bizon/connectors/destinations/logger/src/destination.py CHANGED
@@ -1,4 +1,4 @@
-from typing import List, Tuple
+from typing import Tuple
 
 import polars as pl
 from loguru import logger
@@ -13,7 +13,6 @@ from .config import LoggerDestinationConfig
 
 
 class LoggerDestination(AbstractDestination):
-
     def __init__(
         self,
         sync_metadata: SyncMetadata,