bizon 0.0.14__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bizon/alerting/__init__.py +0 -0
- bizon/alerting/alerts.py +23 -0
- bizon/alerting/models.py +28 -0
- bizon/alerting/slack/__init__.py +0 -0
- bizon/alerting/slack/config.py +5 -0
- bizon/alerting/slack/handler.py +39 -0
- bizon/cli/main.py +7 -3
- bizon/common/models.py +31 -7
- bizon/{destinations → connectors/destinations}/bigquery/config/bigquery.example.yml +3 -4
- bizon/connectors/destinations/bigquery/src/config.py +127 -0
- bizon/{destinations → connectors/destinations}/bigquery/src/destination.py +46 -25
- bizon/connectors/destinations/bigquery_streaming/src/config.py +56 -0
- bizon/connectors/destinations/bigquery_streaming/src/destination.py +372 -0
- bizon/connectors/destinations/bigquery_streaming_v2/src/config.py +52 -0
- bizon/connectors/destinations/bigquery_streaming_v2/src/destination.py +261 -0
- bizon/{destinations/bigquery_streaming → connectors/destinations/bigquery_streaming_v2}/src/proto_utils.py +32 -26
- bizon/{destinations → connectors/destinations}/file/src/config.py +8 -3
- bizon/connectors/destinations/file/src/destination.py +54 -0
- bizon/{destinations → connectors/destinations}/logger/src/config.py +1 -1
- bizon/{destinations → connectors/destinations}/logger/src/destination.py +15 -3
- bizon/connectors/sources/cycle/config/cycle.example.yml +15 -0
- bizon/connectors/sources/cycle/src/source.py +133 -0
- bizon/{sources/periscope/tests/periscope_pipeline_dashboard.py → connectors/sources/cycle/tests/cycle_customers.py} +1 -1
- bizon/connectors/sources/dummy/config/dummy.example.yml +22 -0
- bizon/{sources → connectors/sources}/dummy/src/fake_api.py +6 -1
- bizon/{sources → connectors/sources}/dummy/src/source.py +18 -5
- bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline.py +5 -14
- bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_bigquery_backend.py +2 -2
- bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_kafka.py +2 -2
- bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_rabbitmq.py +2 -2
- bizon/connectors/sources/dummy/tests/dummy_pipeline_unnest.py +29 -0
- bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_write_data_bigquery.py +3 -3
- bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_write_data_bigquery_through_kafka.py +2 -2
- bizon/{sources → connectors/sources}/gsheets/config/default_auth.example.yml +4 -2
- bizon/{sources → connectors/sources}/gsheets/config/service_account.example.yml +4 -2
- bizon/{sources → connectors/sources}/hubspot/config/api_key.example.yml +4 -2
- bizon/{sources → connectors/sources}/hubspot/config/oauth.example.yml +4 -2
- bizon/{sources → connectors/sources}/hubspot/src/hubspot_objects.py +1 -1
- bizon/connectors/sources/kafka/config/kafka.example.yml +50 -0
- bizon/connectors/sources/kafka/config/kafka_debezium.example.yml +112 -0
- bizon/connectors/sources/kafka/src/callback.py +18 -0
- bizon/connectors/sources/kafka/src/config.py +75 -0
- bizon/connectors/sources/kafka/src/decode.py +88 -0
- bizon/connectors/sources/kafka/src/source.py +361 -0
- bizon/connectors/sources/kafka/tests/kafka_pipeline.py +7 -0
- bizon/connectors/sources/periscope/config/periscope_charts.example.yml +20 -0
- bizon/connectors/sources/periscope/config/periscope_dashboards.example.yml +20 -0
- bizon/{sources → connectors/sources}/periscope/src/source.py +136 -13
- bizon/connectors/sources/periscope/tests/periscope_pipeline_dashboard.py +9 -0
- bizon/connectors/sources/pokeapi/config/pokeapi_pokemon_to_json.example.yml +19 -0
- bizon/connectors/sources/pokeapi/config/pokeapi_pokemon_to_logger.example.yml +10 -0
- bizon/connectors/sources/pokeapi/src/source.py +79 -0
- bizon/{destinations → destination}/buffer.py +5 -0
- bizon/destination/config.py +74 -0
- bizon/{destinations → destination}/destination.py +71 -15
- bizon/engine/backend/adapters/sqlalchemy/backend.py +14 -23
- bizon/engine/engine.py +20 -1
- bizon/engine/pipeline/consumer.py +73 -5
- bizon/engine/pipeline/models.py +8 -3
- bizon/engine/pipeline/producer.py +18 -9
- bizon/engine/queue/adapters/kafka/consumer.py +2 -2
- bizon/engine/queue/adapters/kafka/queue.py +3 -2
- bizon/engine/queue/adapters/python_queue/consumer.py +40 -23
- bizon/engine/queue/adapters/python_queue/queue.py +19 -9
- bizon/engine/queue/adapters/rabbitmq/consumer.py +3 -6
- bizon/engine/queue/adapters/rabbitmq/queue.py +3 -2
- bizon/engine/queue/config.py +16 -0
- bizon/engine/queue/queue.py +17 -16
- bizon/engine/runner/adapters/process.py +15 -2
- bizon/engine/runner/adapters/streaming.py +103 -0
- bizon/engine/runner/adapters/thread.py +32 -9
- bizon/engine/runner/config.py +28 -0
- bizon/engine/runner/runner.py +107 -25
- bizon/monitoring/__init__.py +0 -0
- bizon/monitoring/config.py +29 -0
- bizon/monitoring/datadog/__init__.py +0 -0
- bizon/monitoring/datadog/monitor.py +69 -0
- bizon/monitoring/monitor.py +42 -0
- bizon/monitoring/noop/__init__.py +0 -0
- bizon/monitoring/noop/monitor.py +11 -0
- bizon/source/callback.py +24 -0
- bizon/source/config.py +3 -3
- bizon/source/cursor.py +1 -1
- bizon/source/discover.py +4 -3
- bizon/source/models.py +4 -2
- bizon/source/source.py +10 -2
- bizon/transform/config.py +8 -0
- bizon/transform/transform.py +48 -0
- bizon-0.1.1.dist-info/LICENSE +674 -0
- {bizon-0.0.14.dist-info → bizon-0.1.1.dist-info}/METADATA +25 -7
- bizon-0.1.1.dist-info/RECORD +123 -0
- {bizon-0.0.14.dist-info → bizon-0.1.1.dist-info}/WHEEL +1 -1
- bizon/destinations/bigquery/src/config.py +0 -51
- bizon/destinations/bigquery_streaming/src/config.py +0 -43
- bizon/destinations/bigquery_streaming/src/destination.py +0 -154
- bizon/destinations/config.py +0 -47
- bizon/destinations/file/src/destination.py +0 -27
- bizon/sources/dummy/config/api_key.example.yml +0 -20
- bizon/sources/dummy/config/api_key_kafka.example.yml +0 -27
- bizon/sources/kafka/config/kafka.example.yml +0 -38
- bizon/sources/kafka/src/source.py +0 -357
- bizon/sources/kafka/tests/kafka_pipeline.py +0 -9
- bizon/sources/periscope/config/periscope_charts.example.yml +0 -26
- bizon/sources/periscope/config/periscope_dashboards.example.yml +0 -26
- bizon-0.0.14.dist-info/LICENSE +0 -21
- bizon-0.0.14.dist-info/RECORD +0 -94
- /bizon/{sources → connectors/sources}/gsheets/src/source.py +0 -0
- /bizon/{sources → connectors/sources}/gsheets/tests/gsheets_pipeline.py +0 -0
- /bizon/{sources → connectors/sources}/hubspot/src/hubspot_base.py +0 -0
- /bizon/{sources → connectors/sources}/hubspot/src/models/hs_object.py +0 -0
- /bizon/{sources → connectors/sources}/hubspot/tests/hubspot_pipeline.py +0 -0
- /bizon/{sources → connectors/sources}/periscope/tests/periscope_pipeline_charts.py +0 -0
- /bizon/{destinations → destination}/models.py +0 -0
- {bizon-0.0.14.dist-info → bizon-0.1.1.dist-info}/entry_points.txt +0 -0
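The main structural change in 0.1.1 is the move of all source and destination connectors under bizon/connectors/, while the shared destination base package is renamed from bizon/destinations/ to bizon/destination/. A minimal sketch of how import paths shift for code that consumed the 0.0.14 layout (module paths are taken from the rename list above; the class names are the ones imported by the new connector code and are shown for illustration only):

# 0.0.14 layout (old module paths, shown as comments):
# from bizon.destinations.destination import AbstractDestination
# from bizon.destinations.config import AbstractDestinationConfig

# 0.1.1 layout, matching the renames listed above:
from bizon.destination.destination import AbstractDestination
from bizon.destination.config import AbstractDestinationConfig, DestinationTypes
from bizon.connectors.destinations.bigquery_streaming.src.destination import (
    BigQueryStreamingDestination,
)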
bizon/connectors/destinations/bigquery_streaming/src/destination.py
@@ -0,0 +1,372 @@
import os
import tempfile
from datetime import datetime
from typing import List, Tuple

import orjson
import polars as pl
import urllib3.exceptions
from google.api_core.exceptions import (
    Conflict,
    NotFound,
    RetryError,
    ServerError,
    ServiceUnavailable,
)
from google.cloud import bigquery, bigquery_storage_v1
from google.cloud.bigquery import DatasetReference, TimePartitioning
from google.cloud.bigquery_storage_v1.types import (
    AppendRowsRequest,
    ProtoRows,
    ProtoSchema,
)
from loguru import logger
from requests.exceptions import ConnectionError, SSLError, Timeout
from tenacity import (
    retry,
    retry_if_exception_type,
    stop_after_attempt,
    wait_exponential,
)

from bizon.common.models import SyncMetadata
from bizon.connectors.destinations.bigquery.src.config import (
    BigQueryColumnMode,
    BigQueryColumnType,
)
from bizon.destination.destination import AbstractDestination
from bizon.engine.backend.backend import AbstractBackend
from bizon.source.callback import AbstractSourceCallback

from .config import BigQueryStreamingConfigDetails


class BigQueryStreamingDestination(AbstractDestination):

    # Add constants for limits
    MAX_ROWS_PER_REQUEST = 5000  # 5000 (max is 10000)
    MAX_REQUEST_SIZE_BYTES = 5 * 1024 * 1024  # 5 MB (max is 10MB)
    MAX_ROW_SIZE_BYTES = 0.9 * 1024 * 1024  # 1 MB

    def __init__(
        self,
        sync_metadata: SyncMetadata,
        config: BigQueryStreamingConfigDetails,
        backend: AbstractBackend,
        source_callback: AbstractSourceCallback,
    ):  # type: ignore
        super().__init__(sync_metadata, config, backend, source_callback)
        self.config: BigQueryStreamingConfigDetails = config

        if config.authentication and config.authentication.service_account_key:
            with tempfile.NamedTemporaryFile(delete=False) as temp:
                temp.write(config.authentication.service_account_key.encode())
                temp_file_path = temp.name
            os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = temp_file_path

        self.project_id = config.project_id
        self.bq_client = bigquery.Client(project=self.project_id)
        self.bq_storage_client = bigquery_storage_v1.BigQueryWriteClient()
        self.dataset_id = config.dataset_id
        self.dataset_location = config.dataset_location
        self.bq_max_rows_per_request = config.bq_max_rows_per_request

    @property
    def table_id(self) -> str:
        tabled_id = f"{self.sync_metadata.source_name}_{self.sync_metadata.stream_name}"
        return self.destination_id or f"{self.project_id}.{self.dataset_id}.{tabled_id}"

    def get_bigquery_schema(self) -> List[bigquery.SchemaField]:

        if self.config.unnest:
            if len(list(self.record_schemas.keys())) == 1:
                self.destination_id = list(self.record_schemas.keys())[0]

            return [
                bigquery.SchemaField(
                    name=col.name,
                    field_type=col.type,
                    mode=col.mode,
                    description=col.description,
                    default_value_expression=col.default_value_expression,
                )
                for col in self.record_schemas[self.destination_id]
            ]

        # Case we don't unnest the data
        else:
            return [
                bigquery.SchemaField(
                    "_source_record_id",
                    BigQueryColumnType.STRING,
                    mode=BigQueryColumnMode.REQUIRED,
                    description="The source record id",
                ),
                bigquery.SchemaField(
                    "_source_timestamp",
                    BigQueryColumnType.TIMESTAMP,
                    mode=BigQueryColumnMode.REQUIRED,
                    description="The source timestamp",
                ),
                bigquery.SchemaField(
                    "_source_data",
                    BigQueryColumnType.JSON,
                    mode=BigQueryColumnMode.NULLABLE,
                    description="The source data",
                ),
                bigquery.SchemaField(
                    "_bizon_extracted_at",
                    BigQueryColumnType.TIMESTAMP,
                    mode=BigQueryColumnMode.REQUIRED,
                    description="The bizon extracted at",
                ),
                bigquery.SchemaField(
                    "_bizon_loaded_at",
                    BigQueryColumnType.TIMESTAMP,
                    mode=BigQueryColumnMode.REQUIRED,
                    default_value_expression="CURRENT_TIMESTAMP()",
                    description="The bizon loaded at",
                ),
                bigquery.SchemaField(
                    "_bizon_id",
                    BigQueryColumnType.STRING,
                    mode=BigQueryColumnMode.REQUIRED,
                    description="The bizon id",
                ),
            ]

    def check_connection(self) -> bool:
        dataset_ref = DatasetReference(self.project_id, self.dataset_id)

        try:
            self.bq_client.get_dataset(dataset_ref)
        except NotFound:
            dataset = bigquery.Dataset(dataset_ref)
            dataset.location = self.dataset_location
            dataset = self.bq_client.create_dataset(dataset)
        return True

    def append_rows_to_stream(
        self,
        write_client: bigquery_storage_v1.BigQueryWriteClient,
        stream_name: str,
        proto_schema: ProtoSchema,
        serialized_rows: List[bytes],
    ):
        request = AppendRowsRequest(
            write_stream=stream_name,
            proto_rows=AppendRowsRequest.ProtoData(
                rows=ProtoRows(serialized_rows=serialized_rows),
                writer_schema=proto_schema,
            ),
        )
        response = write_client.append_rows(iter([request]))
        return response.code().name

    def safe_cast_record_values(self, row: dict):
        """
        Safe cast record values to the correct type for BigQuery.
        """
        for col in self.record_schemas[self.destination_id]:

            # Handle dicts as strings
            if col.type in [BigQueryColumnType.STRING, BigQueryColumnType.JSON]:
                if isinstance(row[col.name], dict) or isinstance(row[col.name], list):
                    row[col.name] = orjson.dumps(row[col.name]).decode("utf-8")

            # Handle timestamps
            if (
                col.type in [BigQueryColumnType.TIMESTAMP, BigQueryColumnType.DATETIME]
                and col.default_value_expression is None
            ):
                if isinstance(row[col.name], int):
                    if row[col.name] > datetime(9999, 12, 31).timestamp():
                        row[col.name] = datetime.fromtimestamp(row[col.name] / 1_000_000).strftime(
                            "%Y-%m-%d %H:%M:%S.%f"
                        )
                    else:
                        try:
                            row[col.name] = datetime.fromtimestamp(row[col.name]).strftime("%Y-%m-%d %H:%M:%S.%f")
                        except ValueError:
                            error_message = (
                                f"Error casting timestamp for destination '{self.destination_id}' column '{col.name}'. "
                                f"Invalid timestamp value: {row[col.name]} ({type(row[col.name])}). "
                                "Consider using a transformation."
                            )
                            logger.error(error_message)
                            raise ValueError(error_message)
        return row

    @retry(
        retry=retry_if_exception_type(
            (
                ServerError,
                ServiceUnavailable,
                SSLError,
                ConnectionError,
                Timeout,
                RetryError,
                urllib3.exceptions.ProtocolError,
                urllib3.exceptions.SSLError,
            )
        ),
        wait=wait_exponential(multiplier=2, min=4, max=120),
        stop=stop_after_attempt(8),
        before_sleep=lambda retry_state: logger.warning(
            f"Attempt {retry_state.attempt_number} failed. Retrying in {retry_state.next_action.sleep} seconds..."
        ),
    )
    def _insert_batch(self, table, batch):
        """Helper method to insert a batch of rows with retry logic"""
        logger.debug(f"Inserting batch in table {table.table_id}")
        try:
            # Handle streaming batch
            if batch.get("stream_batch") and len(batch["stream_batch"]) > 0:
                return self.bq_client.insert_rows_json(
                    table,
                    batch["stream_batch"],
                    row_ids=[None] * len(batch["stream_batch"]),
                    timeout=300,  # 5 minutes timeout per request
                )

            # Handle large rows batch
            if batch.get("json_batch") and len(batch["json_batch"]) > 0:
                job_config = bigquery.LoadJobConfig(
                    source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
                    schema=table.schema,
                    ignore_unknown_values=True,
                )

                load_job = self.bq_client.load_table_from_json(
                    batch["json_batch"], table, job_config=job_config, timeout=300
                )
                load_job.result()

                if load_job.state != "DONE":
                    raise Exception(f"Failed to load rows to BigQuery: {load_job.errors}")

        except Exception as e:
            logger.error(f"Error inserting batch: {str(e)}, type: {type(e)}")
            raise

    def load_to_bigquery_via_legacy_streaming(self, df_destination_records: pl.DataFrame) -> str:
        # Create table if it does not exist
        schema = self.get_bigquery_schema()
        table = bigquery.Table(self.table_id, schema=schema)
        time_partitioning = TimePartitioning(
            field=self.config.time_partitioning.field, type_=self.config.time_partitioning.type
        )
        table.time_partitioning = time_partitioning

        if self.clustering_keys and self.clustering_keys[self.destination_id]:
            table.clustering_fields = self.clustering_keys[self.destination_id]
        try:
            table = self.bq_client.create_table(table)
        except Conflict:
            table = self.bq_client.get_table(self.table_id)
            # Compare and update schema if needed
            existing_fields = {field.name: field for field in table.schema}
            new_fields = {field.name: field for field in self.get_bigquery_schema()}

            # Find fields that need to be added
            fields_to_add = [field for name, field in new_fields.items() if name not in existing_fields]

            if fields_to_add:
                logger.warning(f"Adding new fields to table schema: {[field.name for field in fields_to_add]}")
                updated_schema = table.schema + fields_to_add
                table.schema = updated_schema
                table = self.bq_client.update_table(table, ["schema"])

        if self.config.unnest:
            # We cannot use the `json_decode` method here because of the issue: https://github.com/pola-rs/polars/issues/22371
            rows_to_insert = [
                self.safe_cast_record_values(orjson.loads(row))
                for row in df_destination_records["source_data"].to_list()
            ]
        else:
            df_destination_records = df_destination_records.with_columns(
                pl.col("bizon_extracted_at").dt.strftime("%Y-%m-%d %H:%M:%S").alias("bizon_extracted_at"),
                pl.col("bizon_loaded_at").dt.strftime("%Y-%m-%d %H:%M:%S").alias("bizon_loaded_at"),
                pl.col("source_timestamp").dt.strftime("%Y-%m-%d %H:%M:%S").alias("source_timestamp"),
            )
            df_destination_records = df_destination_records.rename(
                {
                    "bizon_id": "_bizon_id",
                    "bizon_extracted_at": "_bizon_extracted_at",
                    "bizon_loaded_at": "_bizon_loaded_at",
                    "source_record_id": "_source_record_id",
                    "source_timestamp": "_source_timestamp",
                    "source_data": "_source_data",
                }
            )
            rows_to_insert = [row for row in df_destination_records.iter_rows(named=True)]

        errors = []
        for batch in self.batch(rows_to_insert):
            try:
                batch_errors = self._insert_batch(table, batch)
                if batch_errors:
                    errors.extend(batch_errors)
            except Exception as e:
                logger.error(f"Failed to insert batch on destination {self.destination_id} after all retries: {str(e)}")
                if isinstance(e, RetryError):
                    logger.error(f"Retry error details: {e.cause if hasattr(e, 'cause') else 'No cause available'}")
                raise

        if errors:
            logger.error("Encountered errors while inserting rows:")
            for error in errors:
                if error.get("errors") and len(error["errors"]) > 0:
                    logger.error("The following row failed to be inserted:")
                    if batch.get("stream_batch") and len(batch["stream_batch"]) > 0:
                        logger.error(f"{batch['stream_batch'][error['index']]}")
                    else:
                        logger.error(f"{batch['json_batch'][error['index']]}")
                    for error_detail in error["errors"]:
                        logger.error(f"Location (column): {error_detail['location']}")
                        logger.error(f"Reason: {error_detail['reason']}")
                        logger.error(f"Message: {error_detail['message']}")
            raise Exception(f"Encountered errors while inserting rows: {errors}")

    def write_records(self, df_destination_records: pl.DataFrame) -> Tuple[bool, str]:
        logger.debug("Using BigQuery legacy streaming API...")
        self.load_to_bigquery_via_legacy_streaming(df_destination_records=df_destination_records)
        return True, ""

    def batch(self, iterable):
        """
        Yield successive batches respecting both row count and size limits.
        """
        current_batch = []
        current_batch_size = 0
        large_rows = []

        for item in iterable:
            # Estimate the size of the item (as JSON)
            item_size = len(str(item).encode("utf-8"))

            # If adding this item would exceed either limit, yield current batch and start new one
            if (
                len(current_batch) >= self.MAX_ROWS_PER_REQUEST
                or current_batch_size + item_size > self.MAX_REQUEST_SIZE_BYTES
            ):
                logger.debug(f"Yielding batch of {len(current_batch)} rows, size: {current_batch_size/1024/1024:.2f}MB")
                yield {"stream_batch": current_batch, "json_batch": large_rows}
                current_batch = []
                current_batch_size = 0
                large_rows = []

            if item_size > self.MAX_ROW_SIZE_BYTES:
                large_rows.append(item)
                logger.debug(f"Large row detected: {item_size} bytes")
            else:
                current_batch.append(item)
                current_batch_size += item_size

        # Yield the last batch
        if current_batch:
            logger.debug(
                f"Yielding streaming batch of {len(current_batch)} rows, size: {current_batch_size/1024/1024:.2f}MB"
            )
            logger.debug(f"Yielding large rows batch of {len(large_rows)} rows")
            yield {"stream_batch": current_batch, "json_batch": large_rows}
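The batch helper above is what routes rows either to the legacy insert_rows_json streaming path or, for oversized rows, to a load_table_from_json load job. A standalone sketch of the same splitting rule, with shrunken limits so the behaviour is easy to see (illustration only, not part of the package):

# Sketch of the stream/json split used by BigQueryStreamingDestination.batch
# (assumed behaviour; limits shrunk for demonstration purposes).
MAX_ROWS_PER_REQUEST = 3
MAX_REQUEST_SIZE_BYTES = 200
MAX_ROW_SIZE_BYTES = 60


def split_batches(rows):
    stream_batch, large_rows, batch_size = [], [], 0
    for row in rows:
        row_size = len(str(row).encode("utf-8"))
        # Flush when either the row-count or the byte-size limit would be exceeded
        if len(stream_batch) >= MAX_ROWS_PER_REQUEST or batch_size + row_size > MAX_REQUEST_SIZE_BYTES:
            yield {"stream_batch": stream_batch, "json_batch": large_rows}
            stream_batch, large_rows, batch_size = [], [], 0
        if row_size > MAX_ROW_SIZE_BYTES:
            large_rows.append(row)  # too large for the streaming insert, handled by a load job
        else:
            stream_batch.append(row)
            batch_size += row_size
    if stream_batch:
        yield {"stream_batch": stream_batch, "json_batch": large_rows}


rows = [{"id": i, "payload": "x" * (80 if i % 4 == 0 else 10)} for i in range(10)]
for b in split_batches(rows):
    print(len(b["stream_batch"]), "streamed,", len(b["json_batch"]), "via load job")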
bizon/connectors/destinations/bigquery_streaming_v2/src/config.py
@@ -0,0 +1,52 @@
from enum import Enum
from typing import Literal, Optional

from pydantic import BaseModel, Field

from bizon.connectors.destinations.bigquery.src.config import BigQueryRecordSchemaConfig
from bizon.destination.config import (
    AbstractDestinationConfig,
    AbstractDestinationDetailsConfig,
    DestinationTypes,
)


class TimePartitioningWindow(str, Enum):
    DAY = "DAY"
    HOUR = "HOUR"
    MONTH = "MONTH"
    YEAR = "YEAR"


class TimePartitioning(BaseModel):
    type: TimePartitioningWindow = Field(default=TimePartitioningWindow.DAY, description="Time partitioning type")
    field: Optional[str] = Field(
        "_bizon_loaded_at", description="Field to partition by. You can use a transformation to create this field."
    )


class BigQueryAuthentication(BaseModel):
    service_account_key: str = Field(
        description="Service Account Key JSON string. If empty it will be infered",
        default="",
    )


class BigQueryStreamingV2ConfigDetails(AbstractDestinationDetailsConfig):
    project_id: str
    dataset_id: str
    dataset_location: Optional[str] = "US"
    time_partitioning: Optional[TimePartitioning] = Field(
        default=TimePartitioning(type=TimePartitioningWindow.DAY, field="_bizon_loaded_at"),
        description="BigQuery Time partitioning type",
    )
    authentication: Optional[BigQueryAuthentication] = None
    bq_max_rows_per_request: Optional[int] = Field(30000, description="Max rows per buffer streaming request.")
    record_schemas: Optional[list[BigQueryRecordSchemaConfig]] = Field(
        default=None, description="Schema for the records. Required if unnest is set to true."
    )


class BigQueryStreamingV2Config(AbstractDestinationConfig):
    name: Literal[DestinationTypes.BIGQUERY_STREAMING_V2]
    config: BigQueryStreamingV2ConfigDetails
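For reference, the partitioning and authentication models defined above are plain pydantic models and can be exercised on their own. A minimal sketch with hypothetical values (BigQueryStreamingV2ConfigDetails additionally inherits fields such as unnest from AbstractDestinationDetailsConfig, which are not shown here):

# Minimal sketch using only the models defined in this file (values are hypothetical).
from bizon.connectors.destinations.bigquery_streaming_v2.src.config import (
    BigQueryAuthentication,
    TimePartitioning,
    TimePartitioningWindow,
)

partitioning = TimePartitioning(type=TimePartitioningWindow.HOUR, field="_bizon_loaded_at")
auth = BigQueryAuthentication()  # empty service_account_key -> credentials resolved from the environment

print(partitioning.type, partitioning.field)  # inspect the configured window and partition field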
bizon/connectors/destinations/bigquery_streaming_v2/src/destination.py
@@ -0,0 +1,261 @@
import os
import tempfile
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from typing import List, Tuple, Type

import polars as pl
from google.api_core.exceptions import NotFound
from google.cloud import bigquery, bigquery_storage_v1
from google.cloud.bigquery import DatasetReference, TimePartitioning
from google.cloud.bigquery_storage_v1.types import (
    AppendRowsRequest,
    ProtoRows,
    ProtoSchema,
)
from google.protobuf.json_format import ParseDict
from google.protobuf.message import Message
from loguru import logger

from bizon.common.models import SyncMetadata
from bizon.destination.destination import AbstractDestination
from bizon.engine.backend.backend import AbstractBackend
from bizon.source.callback import AbstractSourceCallback

from .config import BigQueryStreamingV2ConfigDetails
from .proto_utils import get_proto_schema_and_class


class BigQueryStreamingV2Destination(AbstractDestination):

    # Add constants for limits
    MAX_ROWS_PER_REQUEST = 5000  # 5000 (max is 10000)
    MAX_REQUEST_SIZE_BYTES = 5 * 1024 * 1024  # 5 MB (max is 10MB)
    MAX_ROW_SIZE_BYTES = 0.9 * 1024 * 1024  # 1 MB

    def __init__(
        self,
        sync_metadata: SyncMetadata,
        config: BigQueryStreamingV2ConfigDetails,
        backend: AbstractBackend,
        source_callback: AbstractSourceCallback,
    ):  # type: ignore
        super().__init__(sync_metadata, config, backend, source_callback)
        self.config: BigQueryStreamingV2ConfigDetails = config

        if config.authentication and config.authentication.service_account_key:
            with tempfile.NamedTemporaryFile(delete=False) as temp:
                temp.write(config.authentication.service_account_key.encode())
                temp_file_path = temp.name
            os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = temp_file_path

        self.project_id = config.project_id
        self.bq_client = bigquery.Client(project=self.project_id)
        self.bq_storage_client = bigquery_storage_v1.BigQueryWriteClient()
        self.dataset_id = config.dataset_id
        self.dataset_location = config.dataset_location
        self.bq_max_rows_per_request = config.bq_max_rows_per_request

    @property
    def table_id(self) -> str:
        tabled_id = f"{self.sync_metadata.source_name}_{self.sync_metadata.stream_name}"
        return self.destination_id or f"{self.project_id}.{self.dataset_id}.{tabled_id}"

    def get_bigquery_schema(self) -> List[bigquery.SchemaField]:

        if self.config.unnest:
            if len(list(self.record_schemas.keys())) == 1:
                self.destination_id = list(self.record_schemas.keys())[0]

            return [
                bigquery.SchemaField(
                    name=col.name,
                    field_type=col.type,
                    mode=col.mode,
                    description=col.description,
                    default_value_expression=col.default_value_expression,
                )
                for col in self.record_schemas[self.destination_id]
            ]

        # Case we don't unnest the data
        else:
            return [
                bigquery.SchemaField("_source_record_id", "STRING", mode="REQUIRED"),
                bigquery.SchemaField("_source_timestamp", "TIMESTAMP", mode="REQUIRED"),
                bigquery.SchemaField("_source_data", "JSON", mode="NULLABLE"),
                bigquery.SchemaField("_bizon_extracted_at", "TIMESTAMP", mode="REQUIRED"),
                bigquery.SchemaField(
                    "_bizon_loaded_at", "TIMESTAMP", mode="REQUIRED", default_value_expression="CURRENT_TIMESTAMP()"
                ),
                bigquery.SchemaField("_bizon_id", "STRING", mode="REQUIRED"),
            ]

    def check_connection(self) -> bool:
        dataset_ref = DatasetReference(self.project_id, self.dataset_id)

        try:
            self.bq_client.get_dataset(dataset_ref)
        except NotFound:
            dataset = bigquery.Dataset(dataset_ref)
            dataset.location = self.dataset_location
            dataset = self.bq_client.create_dataset(dataset)
        return True

    def append_rows_to_stream(
        self,
        write_client: bigquery_storage_v1.BigQueryWriteClient,
        stream_name: str,
        proto_schema: ProtoSchema,
        serialized_rows: List[bytes],
    ):
        request = AppendRowsRequest(
            write_stream=stream_name,
            proto_rows=AppendRowsRequest.ProtoData(
                rows=ProtoRows(serialized_rows=serialized_rows),
                writer_schema=proto_schema,
            ),
        )
        response = write_client.append_rows(iter([request]))
        return response.code().name

    def safe_cast_record_values(self, row: dict):
        for col in self.record_schemas[self.destination_id]:
            if col.type in ["TIMESTAMP", "DATETIME"] and col.default_value_expression is None:
                if isinstance(row[col.name], int):
                    if row[col.name] > datetime(9999, 12, 31).timestamp():
                        row[col.name] = datetime.fromtimestamp(row[col.name] / 1_000_000).strftime(
                            "%Y-%m-%d %H:%M:%S.%f"
                        )
                    else:
                        try:
                            row[col.name] = datetime.fromtimestamp(row[col.name]).strftime("%Y-%m-%d %H:%M:%S.%f")
                        except ValueError:
                            error_message = (
                                f"Error casting timestamp for destination '{self.destination_id}' column '{col.name}'. "
                                f"Invalid timestamp value: {row[col.name]} ({type(row[col.name])}). "
                                "Consider using a transformation."
                            )
                            logger.error(error_message)
                            raise ValueError(error_message)
        return row

    @staticmethod
    def to_protobuf_serialization(TableRowClass: Type[Message], row: dict) -> bytes:
        """Convert a row to a Protobuf serialization."""
        record = ParseDict(row, TableRowClass())
        return record.SerializeToString()

    def load_to_bigquery_via_streaming(self, df_destination_records: pl.DataFrame) -> str:

        # TODO: for now no clustering keys
        clustering_keys = []

        # Create table if it doesnt exist
        schema = self.get_bigquery_schema()
        table = bigquery.Table(self.table_id, schema=schema)
        time_partitioning = TimePartitioning(
            field=self.config.time_partitioning.field, type_=self.config.time_partitioning.type
        )
        table.time_partitioning = time_partitioning

        # Override bigquery client with project's destination id
        if self.destination_id:
            project, dataset, table_name = self.destination_id.split(".")
            self.bq_client = bigquery.Client(project=project)

        table = self.bq_client.create_table(table, exists_ok=True)

        # Create the stream
        if self.destination_id:
            project, dataset, table_name = self.destination_id.split(".")
            write_client = bigquery_storage_v1.BigQueryWriteClient()
            parent = write_client.table_path(project, dataset, table_name)
        else:
            write_client = self.bq_storage_client
            parent = write_client.table_path(self.project_id, self.dataset_id, self.destination_id)

        stream_name = f"{parent}/_default"

        # Generating the protocol buffer representation of the message descriptor.
        proto_schema, TableRow = get_proto_schema_and_class(schema, clustering_keys)

        if self.config.unnest:
            serialized_rows = [
                self.to_protobuf_serialization(TableRowClass=TableRow, row=self.safe_cast_record_values(row))
                for row in df_destination_records["source_data"].str.json_decode(infer_schema_length=None).to_list()
            ]
        else:
            df_destination_records = df_destination_records.with_columns(
                pl.col("bizon_extracted_at").dt.strftime("%Y-%m-%d %H:%M:%S").alias("bizon_extracted_at"),
                pl.col("bizon_loaded_at").dt.strftime("%Y-%m-%d %H:%M:%S").alias("bizon_loaded_at"),
                pl.col("source_timestamp").dt.strftime("%Y-%m-%d %H:%M:%S").alias("source_timestamp"),
            )
            df_destination_records = df_destination_records.rename(
                {
                    "bizon_id": "_bizon_id",
                    "bizon_extracted_at": "_bizon_extracted_at",
                    "bizon_loaded_at": "_bizon_loaded_at",
                    "source_record_id": "_source_record_id",
                    "source_timestamp": "_source_timestamp",
                    "source_data": "_source_data",
                }
            )

            serialized_rows = [
                self.to_protobuf_serialization(TableRowClass=TableRow, row=row)
                for row in df_destination_records.iter_rows(named=True)
            ]

        results = []
        with ThreadPoolExecutor() as executor:
            futures = [
                executor.submit(self.append_rows_to_stream, write_client, stream_name, proto_schema, batch_rows)
                for batch_rows in self.batch(serialized_rows)
            ]
            for future in futures:
                results.append(future.result())

        assert all([r == "OK" for r in results]) is True, "Failed to append rows to stream"

    def write_records(self, df_destination_records: pl.DataFrame) -> Tuple[bool, str]:
        self.load_to_bigquery_via_streaming(df_destination_records=df_destination_records)
        return True, ""

    def batch(self, iterable):
        """
        Yield successive batches respecting both row count and size limits.
        """
        current_batch = []
        current_batch_size = 0
        large_rows = []

        for item in iterable:
            # Estimate the size of the item (as JSON)
            item_size = len(str(item).encode("utf-8"))

            # If adding this item would exceed either limit, yield current batch and start new one
            if (
                len(current_batch) >= self.MAX_ROWS_PER_REQUEST
                or current_batch_size + item_size > self.MAX_REQUEST_SIZE_BYTES
            ):
                logger.debug(f"Yielding batch of {len(current_batch)} rows, size: {current_batch_size/1024/1024:.2f}MB")
                yield {"stream_batch": current_batch, "json_batch": large_rows}
                current_batch = []
                current_batch_size = 0
                large_rows = []

            if item_size > self.MAX_ROW_SIZE_BYTES:
                large_rows.append(item)
                logger.debug(f"Large row detected: {item_size} bytes")
            else:
                current_batch.append(item)
                current_batch_size += item_size

        # Yield the last batch
        if current_batch:
            logger.debug(
                f"Yielding streaming batch of {len(current_batch)} rows, size: {current_batch_size/1024/1024:.2f}MB"
            )
            logger.debug(f"Yielding large rows batch of {len(large_rows)} rows")
            yield {"stream_batch": current_batch, "json_batch": large_rows}
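In the v2 path above, each row is parsed into a protobuf message class generated at runtime by get_proto_schema_and_class and serialized before being appended to the table's _default write stream. As a stand-in for that step, the same ParseDict / SerializeToString round trip is shown with the well-known Struct type (illustration only, not the package's generated TableRow class):

# Stand-in for the to_protobuf_serialization step, using google.protobuf's
# well-known Struct type instead of the dynamically generated TableRow class.
from google.protobuf.json_format import ParseDict
from google.protobuf.struct_pb2 import Struct

row = {"_bizon_id": "abc-123", "_source_record_id": "42", "_source_data": '{"k": 1}'}
message = ParseDict(row, Struct())     # dict -> protobuf message
payload = message.SerializeToString()  # bytes placed into ProtoRows.serialized_rows
print(f"{len(payload)} bytes serialized")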