bizon 0.1.2__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bizon/alerting/alerts.py +0 -1
- bizon/common/models.py +182 -4
- bizon/connectors/destinations/bigquery/src/config.py +0 -1
- bizon/connectors/destinations/bigquery/src/destination.py +11 -8
- bizon/connectors/destinations/bigquery_streaming/config/bigquery_streaming.example.yml +74 -0
- bizon/connectors/destinations/bigquery_streaming/src/destination.py +4 -5
- bizon/connectors/destinations/bigquery_streaming_v2/config/bigquery_streaming_v2.example.yml +79 -0
- bizon/connectors/destinations/bigquery_streaming_v2/src/destination.py +4 -6
- bizon/connectors/destinations/file/config/file.example.yml +40 -0
- bizon/connectors/destinations/file/src/config.py +1 -1
- bizon/connectors/destinations/file/src/destination.py +0 -5
- bizon/connectors/destinations/logger/config/logger.example.yml +30 -0
- bizon/connectors/destinations/logger/src/config.py +0 -2
- bizon/connectors/destinations/logger/src/destination.py +1 -2
- bizon/connectors/sources/cycle/src/source.py +2 -6
- bizon/connectors/sources/dummy/src/source.py +0 -4
- bizon/connectors/sources/gsheets/src/source.py +2 -3
- bizon/connectors/sources/hubspot/src/hubspot_base.py +0 -1
- bizon/connectors/sources/hubspot/src/hubspot_objects.py +3 -4
- bizon/connectors/sources/hubspot/src/models/hs_object.py +0 -1
- bizon/connectors/sources/kafka/config/kafka_streams.example.yml +124 -0
- bizon/connectors/sources/kafka/src/config.py +10 -6
- bizon/connectors/sources/kafka/src/decode.py +2 -2
- bizon/connectors/sources/kafka/src/source.py +147 -46
- bizon/connectors/sources/notion/config/api_key.example.yml +35 -0
- bizon/connectors/sources/notion/src/__init__.py +0 -0
- bizon/connectors/sources/notion/src/config.py +59 -0
- bizon/connectors/sources/notion/src/source.py +1159 -0
- bizon/connectors/sources/notion/tests/notion_pipeline.py +7 -0
- bizon/connectors/sources/notion/tests/test_notion.py +113 -0
- bizon/connectors/sources/periscope/src/source.py +0 -6
- bizon/connectors/sources/pokeapi/src/source.py +0 -1
- bizon/connectors/sources/sana_ai/config/sana.example.yml +25 -0
- bizon/connectors/sources/sana_ai/src/source.py +85 -0
- bizon/destination/buffer.py +0 -1
- bizon/destination/config.py +0 -1
- bizon/destination/destination.py +1 -4
- bizon/engine/backend/adapters/sqlalchemy/backend.py +2 -5
- bizon/engine/backend/adapters/sqlalchemy/config.py +0 -1
- bizon/engine/config.py +0 -1
- bizon/engine/engine.py +0 -1
- bizon/engine/pipeline/consumer.py +0 -1
- bizon/engine/pipeline/producer.py +1 -5
- bizon/engine/queue/adapters/kafka/config.py +1 -1
- bizon/engine/queue/adapters/kafka/queue.py +0 -1
- bizon/engine/queue/adapters/python_queue/consumer.py +0 -1
- bizon/engine/queue/adapters/python_queue/queue.py +0 -2
- bizon/engine/queue/adapters/rabbitmq/consumer.py +0 -1
- bizon/engine/queue/adapters/rabbitmq/queue.py +0 -1
- bizon/engine/queue/config.py +0 -2
- bizon/engine/runner/adapters/process.py +0 -2
- bizon/engine/runner/adapters/streaming.py +55 -1
- bizon/engine/runner/adapters/thread.py +0 -2
- bizon/engine/runner/config.py +0 -1
- bizon/engine/runner/runner.py +0 -2
- bizon/monitoring/datadog/monitor.py +5 -3
- bizon/monitoring/noop/monitor.py +1 -1
- bizon/source/auth/authenticators/abstract_oauth.py +11 -3
- bizon/source/auth/authenticators/abstract_token.py +2 -1
- bizon/source/auth/authenticators/basic.py +1 -1
- bizon/source/auth/authenticators/cookies.py +2 -1
- bizon/source/auth/authenticators/oauth.py +8 -3
- bizon/source/config.py +0 -2
- bizon/source/cursor.py +8 -16
- bizon/source/discover.py +3 -6
- bizon/source/models.py +0 -1
- bizon/source/session.py +0 -1
- bizon/source/source.py +17 -2
- bizon/transform/config.py +0 -2
- bizon/transform/transform.py +0 -3
- {bizon-0.1.2.dist-info → bizon-0.2.0.dist-info}/METADATA +62 -42
- bizon-0.2.0.dist-info/RECORD +136 -0
- {bizon-0.1.2.dist-info → bizon-0.2.0.dist-info}/WHEEL +1 -1
- bizon-0.2.0.dist-info/entry_points.txt +2 -0
- bizon-0.1.2.dist-info/RECORD +0 -123
- bizon-0.1.2.dist-info/entry_points.txt +0 -3
- {bizon-0.1.2.dist-info → bizon-0.2.0.dist-info/licenses}/LICENSE +0 -0
bizon/alerting/alerts.py
CHANGED
bizon/common/models.py
CHANGED
@@ -1,9 +1,12 @@
-from typing import Optional, Union
+from typing import Any, Optional, Union

-from pydantic import BaseModel, ConfigDict, Field
+from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator

 from bizon.alerting.models import AlertingConfig
-from bizon.connectors.destinations.bigquery.src.config import
+from bizon.connectors.destinations.bigquery.src.config import (
+    BigQueryColumn,
+    BigQueryConfig,
+)
 from bizon.connectors.destinations.bigquery_streaming.src.config import (
     BigQueryStreamingConfig,
 )
@@ -18,8 +21,74 @@ from bizon.source.config import SourceConfig, SourceSyncModes
 from bizon.transform.config import TransformModel


-class
+class StreamSourceConfig(BaseModel):
+    """Source-specific stream routing configuration.
+
+    Uses extra='allow' to support source-specific fields like:
+    - topic (Kafka)
+    - endpoint (API sources)
+    - channel (other streaming sources)
+    """
+
+    model_config = ConfigDict(extra="allow")
+
+    # Common field for stream identifier
+    name: Optional[str] = Field(None, description="Stream identifier within the source")
+
+    # Kafka-specific
+    topic: Optional[str] = Field(None, description="Kafka topic name")
+
+    # API-specific
+    endpoint: Optional[str] = Field(None, description="API endpoint path")
+
+
+class StreamDestinationConfig(BaseModel):
+    """Destination configuration for a stream.
+
+    Supports destination-specific schema definitions and options.
+    Uses extra='allow' to support destination-specific overrides.
+    """

+    model_config = ConfigDict(extra="allow")
+
+    # Universal destination identifier
+    table_id: str = Field(..., description="Full destination identifier (e.g., project.dataset.table)")
+
+    # BigQuery-specific schema (can be extended for other destinations)
+    record_schema: Optional[list[BigQueryColumn]] = Field(None, description="Schema for the destination records")
+    clustering_keys: Optional[list[str]] = Field(None, description="Clustering keys for the destination table")
+
+
+class StreamConfig(BaseModel):
+    """Configuration for a single stream.
+
+    Consolidates source stream routing and destination configuration in one place,
+    eliminating duplication of destination_id between source and destination configs.
+    """
+
+    model_config = ConfigDict(extra="forbid")
+
+    name: str = Field(..., description="Logical name for this stream")
+    source: StreamSourceConfig = Field(..., description="Source-specific routing configuration")
+    destination: StreamDestinationConfig = Field(
+        ..., description="Destination configuration including table and schema"
+    )
+
+    @field_validator("destination")
+    @classmethod
+    def validate_table_id_format(cls, v: StreamDestinationConfig) -> StreamDestinationConfig:
+        """Ensure table_id follows expected format for BigQuery-like destinations."""
+        if v.table_id:
+            parts = v.table_id.split(".")
+            if len(parts) != 3:
+                raise ValueError(
+                    f"table_id must be in format 'project.dataset.table', got: {v.table_id}. "
+                    f"Found {len(parts)} parts instead of 3."
+                )
+        return v
+
+
+class BizonConfig(BaseModel):
     # Forbid extra keys in the model
     model_config = ConfigDict(extra="forbid")

@@ -63,6 +132,115 @@ class BizonConfig(BaseModel):
         default=None,
     )

+    streams: Optional[list[StreamConfig]] = Field(
+        None,
+        description="Stream routing configuration (opt-in for multi-table streaming). "
+        "Consolidates source stream definitions with destination tables and schemas.",
+    )
+
+    @field_validator("streams")
+    @classmethod
+    def validate_streams_config(cls, v: Optional[list[StreamConfig]], info) -> Optional[list[StreamConfig]]:
+        """Validate streams configuration consistency."""
+        if not v:
+            return v
+
+        # Check for duplicate stream names
+        names = [s.name for s in v]
+        if len(names) != len(set(names)):
+            duplicates = [name for name in names if names.count(name) > 1]
+            raise ValueError(f"Duplicate stream names found in streams configuration: {set(duplicates)}")
+
+        # Check for duplicate table_ids
+        table_ids = [s.destination.table_id for s in v]
+        if len(table_ids) != len(set(table_ids)):
+            duplicates = [tid for tid in table_ids if table_ids.count(tid) > 1]
+            raise ValueError(f"Duplicate table_ids found in streams configuration: {set(duplicates)}")
+
+        # Validate that source sync_mode is 'stream' if streams config is used
+        source_config = info.data.get("source") if info.data else None
+        if source_config and source_config.sync_mode != SourceSyncModes.STREAM:
+            raise ValueError(
+                f"Configuration Error: 'streams' configuration requires source.sync_mode='stream'. "
+                f"Current sync_mode: {source_config.sync_mode}. "
+                f"Please update your config to use:\n"
+                f" source:\n"
+                f" sync_mode: stream"
+            )
+
+        return v
+
+    @model_validator(mode="before")
+    @classmethod
+    def inject_config_from_streams(cls, data: Any) -> Any:
+        """Inject source and destination config from streams.
+
+        This runs BEFORE field validation, enriching both source and destination
+        configs from the streams configuration. This allows:
+        1. Sources (like Kafka) to omit topics - they're extracted from streams
+        2. Destinations with unnest=true to work without duplicate record_schemas
+
+        This is source-agnostic: each source type can extract what it needs from streams.
+        """
+        if not isinstance(data, dict):
+            return data
+
+        streams = data.get("streams")
+        if not streams:
+            return data
+
+        source = data.get("source")
+        if source and isinstance(source, dict):
+            source_name = source.get("name")
+
+            # Kafka: inject topics from streams
+            if source_name == "kafka":
+                # Check if topics is missing, None, or empty list
+                if not source.get("topics") or source.get("topics") == []:
+                    topics = []
+                    for stream in streams:
+                        if isinstance(stream, dict):
+                            stream_src = stream.get("source", {})
+                            stream_dest = stream.get("destination", {})
+                            if stream_src.get("topic"):
+                                topics.append(
+                                    {
+                                        "name": stream_src.get("topic"),
+                                        "destination_id": stream_dest.get("table_id", ""),
+                                    }
+                                )
+                    if topics:
+                        source["topics"] = topics
+
+        destination = data.get("destination")
+        if not destination or not isinstance(destination, dict):
+            return data
+
+        destination_config = destination.get("config")
+        if not destination_config or not isinstance(destination_config, dict):
+            return data
+
+        # Only inject if record_schemas is not already set or is empty
+        if not destination_config.get("record_schemas"):
+            # Build record_schemas from streams
+            record_schemas = []
+            for stream in streams:
+                if isinstance(stream, dict):
+                    stream_dest = stream.get("destination", {})
+                    if stream_dest.get("record_schema"):
+                        record_schema_config = {
+                            "destination_id": stream_dest.get("table_id"),
+                            "record_schema": stream_dest.get("record_schema"),
+                            "clustering_keys": stream_dest.get("clustering_keys"),
+                        }
+                        record_schemas.append(record_schema_config)
+
+            # Inject into destination config
+            if record_schemas:
+                destination_config["record_schemas"] = record_schemas
+
+        return data
+

 class SyncMetadata(BaseModel):
     """Model which stores general metadata around a sync.
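For orientation, here is a minimal standalone sketch of what the new inject_config_from_streams pre-validator effectively does to a raw config dict before field validation: Kafka topics and destination record_schemas are derived from the streams block. This is an illustrative reimplementation on a plain dict, not bizon's own code, and the config values (project, dataset, topic names) are hypothetical.

# Hypothetical raw config dict, as it would look before pydantic validation.
raw = {
    "source": {"name": "kafka", "sync_mode": "stream"},
    "destination": {"name": "bigquery_streaming", "config": {"unnest": True}},
    "streams": [
        {
            "name": "users",
            "source": {"topic": "users.v1"},
            "destination": {
                "table_id": "my-project.analytics.users",
                "record_schema": [{"name": "id", "type": "STRING", "mode": "REQUIRED"}],
            },
        }
    ],
}

# Derive Kafka topics from the streams block (mirrors the "before" validator).
raw["source"]["topics"] = [
    {"name": s["source"]["topic"], "destination_id": s["destination"]["table_id"]}
    for s in raw["streams"]
    if s["source"].get("topic")
]

# Derive destination record_schemas from the streams block.
raw["destination"]["config"]["record_schemas"] = [
    {
        "destination_id": s["destination"]["table_id"],
        "record_schema": s["destination"]["record_schema"],
        "clustering_keys": s["destination"].get("clustering_keys"),
    }
    for s in raw["streams"]
    if s["destination"].get("record_schema")
]

print(raw["source"]["topics"])
print(raw["destination"]["config"]["record_schemas"])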
bizon/connectors/destinations/bigquery/src/config.py
CHANGED
@@ -98,7 +98,6 @@ class BigQueryRecordSchemaConfig(BaseModel):


 class BigQueryConfigDetails(AbstractDestinationDetailsConfig):
-
     # Table details
     project_id: str = Field(..., description="BigQuery Project ID")
     dataset_id: str = Field(..., description="BigQuery Dataset ID")
bizon/connectors/destinations/bigquery/src/destination.py
CHANGED
@@ -22,7 +22,6 @@ from .config import BigQueryColumn, BigQueryConfigDetails


 class BigQueryDestination(AbstractDestination):
-
     def __init__(
         self,
         sync_metadata: SyncMetadata,
@@ -56,7 +55,6 @@ class BigQueryDestination(AbstractDestination):

     @property
     def temp_table_id(self) -> str:
-
         if self.sync_metadata.sync_mode == SourceSyncModes.FULL_REFRESH:
             return f"{self.table_id}_temp"

@@ -67,7 +65,6 @@ class BigQueryDestination(AbstractDestination):
         return f"{self.table_id}"

     def get_bigquery_schema(self, df_destination_records: pl.DataFrame) -> List[bigquery.SchemaField]:
-
         # Case we unnest the data
         if self.config.unnest:
             return [
@@ -113,9 +110,7 @@ class BigQueryDestination(AbstractDestination):
     # https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.dbapi.DataError

     def convert_and_upload_to_buffer(self, df_destination_records: pl.DataFrame) -> str:
-
         if self.buffer_format == "parquet":
-
             # Upload the Parquet file to GCS
             file_name = f"{self.sync_metadata.source_name}/{self.sync_metadata.stream_name}/{str(uuid4())}.parquet"

@@ -153,7 +148,6 @@ class BigQueryDestination(AbstractDestination):
         )

     def load_to_bigquery(self, gcs_file: str, df_destination_records: pl.DataFrame):
-
         # We always partition by the loaded_at field
         time_partitioning = TimePartitioning(field="_bizon_loaded_at", type_=self.config.time_partitioning)

@@ -171,7 +165,6 @@ class BigQueryDestination(AbstractDestination):
         assert result.state == "DONE", f"Job failed with state {result.state} with error {result.error_result}"

     def write_records(self, df_destination_records: pl.DataFrame) -> Tuple[bool, str]:
-
         # Rename fields to match BigQuery schema
         df_destination_records = df_destination_records.rename(
             {
@@ -201,7 +194,17 @@ class BigQueryDestination(AbstractDestination):
     def finalize(self):
         if self.sync_metadata.sync_mode == SourceSyncModes.FULL_REFRESH:
             logger.info(f"Loading temp table {self.temp_table_id} data into {self.table_id} ...")
-
+            query = f"CREATE OR REPLACE TABLE {self.table_id} AS SELECT * FROM {self.temp_table_id}"
+            result = self.bq_client.query(query)
+            bq_result = result.result()  # Waits for the job to completew
+            logger.info(f"BigQuery CREATE OR REPLACE query result: {bq_result}")
+            # Check if the destination table exists by fetching it; raise if it doesn't exist
+            try:
+                self.bq_client.get_table(self.table_id)
+            except NotFound:
+                logger.error(f"Table {self.table_id} not found")
+                raise Exception(f"Table {self.table_id} not found")
+            # Cleanup
             logger.info(f"Deleting temp table {self.temp_table_id} ...")
             self.bq_client.delete_table(self.temp_table_id, not_found_ok=True)
         return True
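The full-refresh finalize logic added above boils down to a swap-and-verify pattern against the Storage API client. Below is a condensed, standalone sketch of that pattern using the public google-cloud-bigquery calls; it is not bizon's class, and the table identifiers are placeholders.

from google.api_core.exceptions import NotFound
from google.cloud import bigquery


def swap_temp_into_final(client: bigquery.Client, table_id: str, temp_table_id: str) -> None:
    """Replace the final table with the temp table's contents, then clean up."""
    # Replace the destination table with the temp table's rows and wait for the job.
    client.query(f"CREATE OR REPLACE TABLE {table_id} AS SELECT * FROM {temp_table_id}").result()

    # Verify the destination table now exists; surface a clear error otherwise.
    try:
        client.get_table(table_id)
    except NotFound as exc:
        raise RuntimeError(f"Table {table_id} not found after full refresh") from exc

    # Drop the temp table; ignore if it is already gone.
    client.delete_table(temp_table_id, not_found_ok=True)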
bizon/connectors/destinations/bigquery_streaming/config/bigquery_streaming.example.yml
ADDED
@@ -0,0 +1,74 @@
+# BigQuery Streaming Destination Configuration
+# Uses the BigQuery Storage Write API for low-latency inserts
+#
+# Use this destination when:
+# - You need near real-time data loading
+# - Low latency is more important than cost optimization
+# - Working with streaming/continuous data sources
+#
+# Requirements:
+# - Service account with bigquery.dataEditor role
+# - Dataset must already exist
+
+name: source_to_bigquery_streaming
+
+source:
+  name: <YOUR_SOURCE>
+  stream: <YOUR_STREAM>
+  authentication:
+    type: api_key
+    params:
+      token: <YOUR_API_KEY>
+
+destination:
+  name: bigquery_streaming
+  config:
+    # GCP Project ID
+    project_id: <YOUR_GCP_PROJECT>
+
+    # BigQuery dataset (must exist)
+    dataset_id: <YOUR_DATASET>
+
+    # Dataset location (US, EU, etc.)
+    dataset_location: US
+
+    # Time partitioning (optional)
+    time_partitioning:
+      type: DAY # Options: DAY, HOUR, MONTH, YEAR
+      field: _bizon_loaded_at
+
+    # Max rows per streaming request (max 10000)
+    bq_max_rows_per_request: 5000
+
+    # Buffer settings
+    buffer_size: 50 # MB before flushing
+    buffer_flush_timeout: 300 # Seconds before forcing flush
+
+    # Authentication (optional - uses ADC if not provided)
+    # authentication:
+    #   service_account_key: |
+    #     {
+    #       "type": "service_account",
+    #       "project_id": "<YOUR_GCP_PROJECT>",
+    #       ...
+    #     }
+
+    # Schema definition for unnesting (optional)
+    # Required if unnest: true
+    # unnest: true
+    # record_schemas:
+    #   - destination_id: my_table
+    #     record_schema:
+    #       - name: id
+    #         type: STRING
+    #         mode: REQUIRED
+    #       - name: created_at
+    #         type: TIMESTAMP
+    #         mode: NULLABLE
+
+engine:
+  backend:
+    type: bigquery
+    database: <YOUR_GCP_PROJECT>
+    schema: bizon_state
+    syncCursorInDBEvery: 10
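A config written from this template can be sanity-checked before running a pipeline by parsing it and asserting that the keys the destination expects are present. A small sketch using PyYAML; the file path is hypothetical.

import yaml  # PyYAML

# Hypothetical path to a config file written from the template above.
with open("bigquery_streaming.yml") as fh:
    cfg = yaml.safe_load(fh)

# Spot-check the keys the bigquery_streaming destination expects.
assert cfg["destination"]["name"] == "bigquery_streaming"
for key in ("project_id", "dataset_id", "dataset_location"):
    assert key in cfg["destination"]["config"], f"missing destination.config.{key}"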
bizon/connectors/destinations/bigquery_streaming/src/destination.py
CHANGED
@@ -43,7 +43,6 @@ from .config import BigQueryStreamingConfigDetails


 class BigQueryStreamingDestination(AbstractDestination):
-
     # Add constants for limits
     MAX_REQUEST_SIZE_BYTES = 5 * 1024 * 1024  # 5 MB (max is 10MB)
     MAX_ROW_SIZE_BYTES = 0.9 * 1024 * 1024  # 1 MB
@@ -78,7 +77,6 @@ class BigQueryStreamingDestination(AbstractDestination):
         return self.destination_id or f"{self.project_id}.{self.dataset_id}.{tabled_id}"

     def get_bigquery_schema(self) -> List[bigquery.SchemaField]:
-
         if self.config.unnest:
             if len(list(self.record_schemas.keys())) == 1:
                 self.destination_id = list(self.record_schemas.keys())[0]
@@ -169,7 +167,6 @@ class BigQueryStreamingDestination(AbstractDestination):
         Safe cast record values to the correct type for BigQuery.
         """
         for col in self.record_schemas[self.destination_id]:
-
             # Handle dicts as strings
             if col.type in [BigQueryColumnType.STRING, BigQueryColumnType.JSON]:
                 if isinstance(row[col.name], dict) or isinstance(row[col.name], list):
@@ -355,7 +352,9 @@ class BigQueryStreamingDestination(AbstractDestination):
                 len(current_batch) >= self.bq_max_rows_per_request
                 or current_batch_size + item_size > self.MAX_REQUEST_SIZE_BYTES
             ):
-                logger.debug(
+                logger.debug(
+                    f"Yielding batch of {len(current_batch)} rows, size: {current_batch_size / 1024 / 1024:.2f}MB"
+                )
                 yield {"stream_batch": current_batch, "json_batch": large_rows}
                 current_batch = []
                 current_batch_size = 0
@@ -371,7 +370,7 @@ class BigQueryStreamingDestination(AbstractDestination):
         # Yield the last batch
         if current_batch:
             logger.debug(
-                f"Yielding streaming batch of {len(current_batch)} rows, size: {current_batch_size/1024/1024:.2f}MB"
+                f"Yielding streaming batch of {len(current_batch)} rows, size: {current_batch_size / 1024 / 1024:.2f}MB"
             )
             logger.debug(f"Yielding large rows batch of {len(large_rows)} rows")
             yield {"stream_batch": current_batch, "json_batch": large_rows}
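The yielding logic touched above follows a common two-cap batching pattern: flush a batch when either the per-request row limit (bq_max_rows_per_request) or the byte budget (MAX_REQUEST_SIZE_BYTES) would be exceeded. Below is a generic, self-contained sketch of that pattern, not the connector's actual method; it omits the separate large-row/JSON fallback path and uses JSON length as a stand-in for the real size estimate.

import json
from typing import Any, Dict, Iterable, Iterator, List


def batch_rows(
    rows: Iterable[Dict[str, Any]],
    max_rows: int = 5000,
    max_bytes: int = 5 * 1024 * 1024,
) -> Iterator[List[Dict[str, Any]]]:
    """Yield batches capped by row count and approximate serialized size."""
    batch: List[Dict[str, Any]] = []
    batch_size = 0
    for row in rows:
        item_size = len(json.dumps(row).encode("utf-8"))
        # Flush before this row would push the batch over either cap.
        if batch and (len(batch) >= max_rows or batch_size + item_size > max_bytes):
            yield batch
            batch, batch_size = [], 0
        batch.append(row)
        batch_size += item_size
    if batch:
        yield batch


# Example: 5 rows with a 2-row cap yields batches of 2, 2 and 1 rows.
for chunk in batch_rows(({"i": i} for i in range(5)), max_rows=2):
    print(len(chunk))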
bizon/connectors/destinations/bigquery_streaming_v2/config/bigquery_streaming_v2.example.yml
ADDED
@@ -0,0 +1,79 @@
+# BigQuery Streaming V2 Destination Configuration
+# Uses the BigQuery Storage Write API (v2) for improved streaming performance
+#
+# Use this destination when:
+# - You need the latest BigQuery streaming features
+# - Working with high-volume streaming data
+# - Require better error handling and retry logic
+#
+# Differences from v1:
+# - Improved batching and retry logic
+# - Better handling of schema evolution
+# - Enhanced error reporting
+#
+# Requirements:
+# - Service account with bigquery.dataEditor role
+# - Dataset must already exist
+
+name: source_to_bigquery_streaming_v2
+
+source:
+  name: <YOUR_SOURCE>
+  stream: <YOUR_STREAM>
+  authentication:
+    type: api_key
+    params:
+      token: <YOUR_API_KEY>
+
+destination:
+  name: bigquery_streaming_v2
+  config:
+    # GCP Project ID
+    project_id: <YOUR_GCP_PROJECT>
+
+    # BigQuery dataset (must exist)
+    dataset_id: <YOUR_DATASET>
+
+    # Dataset location (US, EU, etc.)
+    dataset_location: US
+
+    # Time partitioning (optional)
+    time_partitioning:
+      type: DAY # Options: DAY, HOUR, MONTH, YEAR
+      field: _bizon_loaded_at
+
+    # Max rows per streaming request (max 10000)
+    bq_max_rows_per_request: 5000
+
+    # Buffer settings
+    buffer_size: 50 # MB before flushing
+    buffer_flush_timeout: 300 # Seconds before forcing flush
+
+    # Authentication (optional - uses ADC if not provided)
+    # authentication:
+    #   service_account_key: |
+    #     {
+    #       "type": "service_account",
+    #       "project_id": "<YOUR_GCP_PROJECT>",
+    #       ...
+    #     }
+
+    # Schema definition for unnesting (optional)
+    # Required if unnest: true
+    # unnest: true
+    # record_schemas:
+    #   - destination_id: my_table
+    #     record_schema:
+    #       - name: id
+    #         type: STRING
+    #         mode: REQUIRED
+    #       - name: created_at
+    #         type: TIMESTAMP
+    #         mode: NULLABLE
+
+engine:
+  backend:
+    type: bigquery
+    database: <YOUR_GCP_PROJECT>
+    schema: bizon_state
+    syncCursorInDBEvery: 10
bizon/connectors/destinations/bigquery_streaming_v2/src/destination.py
CHANGED
@@ -46,7 +46,6 @@ from .proto_utils import get_proto_schema_and_class


 class BigQueryStreamingV2Destination(AbstractDestination):
-
     # Add constants for limits
     MAX_REQUEST_SIZE_BYTES = 9.5 * 1024 * 1024  # 9.5 MB (max is 10MB)
     MAX_ROW_SIZE_BYTES = 8 * 1024 * 1024  # 8 MB (max is 10MB)
@@ -83,7 +82,6 @@ class BigQueryStreamingV2Destination(AbstractDestination):
         return self.destination_id or f"{self.project_id}.{self.dataset_id}.{tabled_id}"

     def get_bigquery_schema(self) -> List[bigquery.SchemaField]:
-
         if self.config.unnest:
             if len(list(self.record_schemas.keys())) == 1:
                 self.destination_id = list(self.record_schemas.keys())[0]
@@ -172,7 +170,6 @@ class BigQueryStreamingV2Destination(AbstractDestination):
         Safe cast record values to the correct type for BigQuery.
         """
         for col in self.record_schemas[self.destination_id]:
-
             # Handle dicts as strings
             if col.type in ["STRING", "JSON"]:
                 if isinstance(row[col.name], dict) or isinstance(row[col.name], list):
@@ -295,7 +292,6 @@ class BigQueryStreamingV2Destination(AbstractDestination):
             raise

     def load_to_bigquery_via_streaming(self, df_destination_records: pl.DataFrame) -> str:
-
         # Create table if it does not exist
         schema = self.get_bigquery_schema()
         table = bigquery.Table(self.table_id, schema=schema)
@@ -423,7 +419,9 @@ class BigQueryStreamingV2Destination(AbstractDestination):
                 len(current_batch) >= self.bq_max_rows_per_request
                 or current_batch_size + item_size > self.MAX_REQUEST_SIZE_BYTES
             ):
-                logger.debug(
+                logger.debug(
+                    f"Yielding batch of {len(current_batch)} rows, size: {current_batch_size / 1024 / 1024:.2f}MB"
+                )
                 yield {"stream_batch": current_batch, "json_batch": large_rows}
                 current_batch = []
                 current_batch_size = 0
@@ -439,7 +437,7 @@ class BigQueryStreamingV2Destination(AbstractDestination):
         # Yield the last batch
         if current_batch:
             logger.info(
-                f"Yielding streaming batch of {len(current_batch)} rows, size: {current_batch_size/1024/1024:.2f}MB"
+                f"Yielding streaming batch of {len(current_batch)} rows, size: {current_batch_size / 1024 / 1024:.2f}MB"
             )
             if large_rows:
                 logger.warning(f"Yielding large rows batch of {len(large_rows)} rows")
bizon/connectors/destinations/file/config/file.example.yml
ADDED
@@ -0,0 +1,40 @@
+# File Destination Configuration
+# Writes records to a local JSON file (NDJSON format)
+#
+# Use this destination when:
+# - Exporting data for local analysis
+# - Creating data backups
+# - Debugging or inspecting extracted data
+# - Testing before loading to cloud destinations
+
+name: source_to_file
+
+source:
+  name: <YOUR_SOURCE>
+  stream: <YOUR_STREAM>
+  authentication:
+    type: api_key
+    params:
+      token: <YOUR_API_KEY>
+
+destination:
+  name: file
+  config:
+    # File path for output (relative or absolute)
+    # The destination_id will be used as the filename
+    destination_id: output.jsonl
+
+    # Output format (currently only json/ndjson supported)
+    format: json
+
+    # Buffer settings (optional)
+    # buffer_size: 50 # MB before flushing to file
+    # buffer_flush_timeout: 600 # Seconds before forcing flush
+
+# Optional: Use in-memory backend for testing
+engine:
+  backend:
+    type: sqlite_in_memory
+    config:
+      database: not_used
+      schema: not_used
bizon/connectors/destinations/file/src/destination.py
CHANGED
@@ -13,7 +13,6 @@ from .config import FileDestinationDetailsConfig


 class FileDestination(AbstractDestination):
-
     def __init__(
         self,
         sync_metadata: SyncMetadata,
@@ -32,15 +31,11 @@ class FileDestination(AbstractDestination):
         return True

     def write_records(self, df_destination_records: pl.DataFrame) -> Tuple[bool, str]:
-
         if self.config.unnest:
-
             schema_keys = set([column.name for column in self.record_schemas[self.destination_id]])

             with open(f"{self.destination_id}.json", "a") as f:
-
                 for value in [orjson.loads(data) for data in df_destination_records["source_data"].to_list()]:
-
                     assert set(value.keys()) == schema_keys, "Keys do not match the schema"

                     # Unnest the source_data column
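Since the file destination appends one JSON object per line, the output can be read back for inspection with polars. A short sketch; the file name is hypothetical and assumes the destination_id from the example config above plus the ".json" suffix that write_records appends.

import polars as pl

# Hypothetical output file produced by the file destination
# (the connector appends to "<destination_id>.json").
df = pl.read_ndjson("output.jsonl.json")
print(df.schema)
print(df.head())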
bizon/connectors/destinations/logger/config/logger.example.yml
ADDED
@@ -0,0 +1,30 @@
+# Logger Destination Configuration
+# Prints records to console - useful for testing and debugging
+#
+# Use this destination when:
+# - Testing a new source connector
+# - Debugging data extraction issues
+# - Verifying record structure before loading to production
+
+name: source_to_logger
+
+source:
+  name: <YOUR_SOURCE>
+  stream: <YOUR_STREAM>
+  authentication:
+    type: api_key
+    params:
+      token: <YOUR_API_KEY>
+
+destination:
+  name: logger
+  config:
+    dummy: bizon # Required placeholder field
+
+# Optional: Use in-memory backend for testing (no persistence)
+engine:
+  backend:
+    type: sqlite_in_memory
+    config:
+      database: not_used
+      schema: not_used
bizon/connectors/destinations/logger/src/destination.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import
+from typing import Tuple

 import polars as pl
 from loguru import logger
@@ -13,7 +13,6 @@ from .config import LoggerDestinationConfig


 class LoggerDestination(AbstractDestination):
-
     def __init__(
         self,
         sync_metadata: SyncMetadata,