bizon 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bizon/common/models.py +2 -0
- bizon/connectors/destinations/bigquery/src/config.py +1 -0
- bizon/connectors/destinations/bigquery/src/destination.py +3 -1
- bizon/connectors/destinations/bigquery_streaming/src/config.py +6 -5
- bizon/connectors/destinations/bigquery_streaming/src/destination.py +9 -4
- bizon/connectors/destinations/bigquery_streaming_v2/src/config.py +6 -1
- bizon/connectors/destinations/bigquery_streaming_v2/src/destination.py +230 -45
- bizon/connectors/destinations/bigquery_streaming_v2/src/proto_utils.py +1 -13
- bizon/connectors/destinations/file/src/config.py +1 -0
- bizon/connectors/destinations/file/src/destination.py +3 -1
- bizon/connectors/destinations/logger/src/config.py +1 -0
- bizon/connectors/destinations/logger/src/destination.py +3 -0
- bizon/connectors/sources/kafka/config/kafka.example.yml +1 -3
- bizon/connectors/sources/kafka/config/kafka_debezium.example.yml +1 -3
- bizon/connectors/sources/kafka/src/config.py +0 -6
- bizon/connectors/sources/kafka/src/decode.py +71 -66
- bizon/connectors/sources/kafka/src/source.py +44 -24
- bizon/connectors/sources/kafka/tests/kafka_pipeline.py +1 -1
- bizon/destination/config.py +9 -0
- bizon/destination/destination.py +37 -5
- bizon/engine/runner/adapters/streaming.py +60 -42
- bizon/engine/runner/runner.py +14 -7
- bizon/monitoring/config.py +12 -2
- bizon/monitoring/datadog/monitor.py +98 -14
- bizon/monitoring/monitor.py +41 -12
- bizon/monitoring/noop/monitor.py +22 -3
- bizon/source/source.py +1 -1
- {bizon-0.1.1.dist-info → bizon-0.1.2.dist-info}/METADATA +2 -1
- {bizon-0.1.1.dist-info → bizon-0.1.2.dist-info}/RECORD +32 -32
- {bizon-0.1.1.dist-info → bizon-0.1.2.dist-info}/WHEEL +1 -1
- {bizon-0.1.1.dist-info → bizon-0.1.2.dist-info}/LICENSE +0 -0
- {bizon-0.1.1.dist-info → bizon-0.1.2.dist-info}/entry_points.txt +0 -0
bizon/common/models.py
CHANGED
@@ -75,6 +75,7 @@ class SyncMetadata(BaseModel):
     stream_name: str
     sync_mode: SourceSyncModes
     destination_name: str
+    destination_alias: str

     @classmethod
     def from_bizon_config(cls, job_id: str, config: BizonConfig) -> "SyncMetadata":
@@ -85,4 +86,5 @@ class SyncMetadata(BaseModel):
             stream_name=config.source.stream,
             sync_mode=config.source.sync_mode,
             destination_name=config.destination.name,
+            destination_alias=config.destination.alias,
         )
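Editor's note: the two hunks above introduce a destination alias that travels from the destination config into the sync metadata. A minimal sketch of that pattern, using simplified stand-in models (not the package's actual `BizonConfig`/`SyncMetadata` classes):

```python
# Sketch only: simplified stand-ins for the config and metadata models changed above.
from pydantic import BaseModel


class DestinationConfig(BaseModel):
    name: str
    alias: str = "bigquery"  # field added in 0.1.2


class SyncMetadata(BaseModel):
    destination_name: str
    destination_alias: str  # field added in 0.1.2

    @classmethod
    def from_destination(cls, destination: DestinationConfig) -> "SyncMetadata":
        # The alias now travels alongside the connector name.
        return cls(destination_name=destination.name, destination_alias=destination.alias)


meta = SyncMetadata.from_destination(DestinationConfig(name="bigquery_streaming"))
assert meta.destination_alias == "bigquery"
```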
bizon/connectors/destinations/bigquery/src/config.py
CHANGED
@@ -123,5 +123,6 @@ class BigQueryConfigDetails(AbstractDestinationDetailsConfig):

 class BigQueryConfig(AbstractDestinationConfig):
     name: Literal[DestinationTypes.BIGQUERY]
+    alias: str = "bigquery"
     buffer_size: Optional[int] = 400
     config: BigQueryConfigDetails
bizon/connectors/destinations/bigquery/src/destination.py
CHANGED
@@ -14,6 +14,7 @@ from loguru import logger
 from bizon.common.models import SyncMetadata
 from bizon.destination.destination import AbstractDestination
 from bizon.engine.backend.backend import AbstractBackend
+from bizon.monitoring.monitor import AbstractMonitor
 from bizon.source.config import SourceSyncModes
 from bizon.source.source import AbstractSourceCallback

@@ -28,8 +29,9 @@ class BigQueryDestination(AbstractDestination):
         config: BigQueryConfigDetails,
         backend: AbstractBackend,
         source_callback: AbstractSourceCallback,
+        monitor: AbstractMonitor,
     ):
-        super().__init__(sync_metadata, config, backend, source_callback)
+        super().__init__(sync_metadata, config, backend, source_callback, monitor)
         self.config: BigQueryConfigDetails = config

         if config.authentication and config.authentication.service_account_key:
bizon/connectors/destinations/bigquery_streaming/src/config.py
CHANGED
@@ -41,16 +41,17 @@ class BigQueryStreamingConfigDetails(AbstractDestinationDetailsConfig):
         description="BigQuery Time partitioning type",
     )
     authentication: Optional[BigQueryAuthentication] = None
-    bq_max_rows_per_request: Optional[int] = Field(
+    bq_max_rows_per_request: Optional[int] = Field(
+        5000,
+        description="Max rows per buffer streaming request. Must not exceed 10000.",
+        le=10000,
+    )
     record_schemas: Optional[list[BigQueryRecordSchemaConfig]] = Field(
         default=None, description="Schema for the records. Required if unnest is set to true."
     )
-    use_legacy_streaming_api: bool = Field(
-        default=False,
-        description="[DEPRECATED] Use the legacy streaming API. This is required for some older BigQuery versions.",
-    )


 class BigQueryStreamingConfig(AbstractDestinationConfig):
     name: Literal[DestinationTypes.BIGQUERY_STREAMING]
+    alias: str = "bigquery"
     config: BigQueryStreamingConfigDetails
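Editor's note: `bq_max_rows_per_request` is now an explicit, validated config field (default 5000, upper bound 10000) rather than a hard-coded class constant. A minimal sketch of how a pydantic `le` bound behaves, using an illustrative model rather than the package's config class:

```python
# Sketch only: demonstrates the le=10000 bound used by the new bq_max_rows_per_request field.
from typing import Optional

from pydantic import BaseModel, Field, ValidationError


class StreamingDetails(BaseModel):
    bq_max_rows_per_request: Optional[int] = Field(
        5000,
        description="Max rows per buffer streaming request. Must not exceed 10000.",
        le=10000,
    )


print(StreamingDetails().bq_max_rows_per_request)                        # 5000 (default)
print(StreamingDetails(bq_max_rows_per_request=8000).bq_max_rows_per_request)  # accepted

try:
    StreamingDetails(bq_max_rows_per_request=20000)  # rejected by the le=10000 constraint
except ValidationError as e:
    print(e)
```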
bizon/connectors/destinations/bigquery_streaming/src/destination.py
CHANGED
@@ -36,6 +36,7 @@ from bizon.connectors.destinations.bigquery.src.config import (
 )
 from bizon.destination.destination import AbstractDestination
 from bizon.engine.backend.backend import AbstractBackend
+from bizon.monitoring.monitor import AbstractMonitor
 from bizon.source.callback import AbstractSourceCallback

 from .config import BigQueryStreamingConfigDetails
@@ -44,7 +45,6 @@ from .config import BigQueryStreamingConfigDetails
 class BigQueryStreamingDestination(AbstractDestination):

     # Add constants for limits
-    MAX_ROWS_PER_REQUEST = 5000  # 5000 (max is 10000)
     MAX_REQUEST_SIZE_BYTES = 5 * 1024 * 1024  # 5 MB (max is 10MB)
     MAX_ROW_SIZE_BYTES = 0.9 * 1024 * 1024  # 1 MB

@@ -54,8 +54,9 @@ class BigQueryStreamingDestination(AbstractDestination):
         config: BigQueryStreamingConfigDetails,
         backend: AbstractBackend,
         source_callback: AbstractSourceCallback,
+        monitor: AbstractMonitor,
     ):  # type: ignore
-        super().__init__(sync_metadata, config, backend, source_callback)
+        super().__init__(sync_metadata, config, backend, source_callback, monitor)
         self.config: BigQueryStreamingConfigDetails = config

         if config.authentication and config.authentication.service_account_key:
@@ -222,7 +223,7 @@ class BigQueryStreamingDestination(AbstractDestination):
            try:
                # Handle streaming batch
                if batch.get("stream_batch") and len(batch["stream_batch"]) > 0:
-
+                    self.bq_client.insert_rows_json(
                        table,
                        batch["stream_batch"],
                        row_ids=[None] * len(batch["stream_batch"]),
@@ -245,6 +246,10 @@ class BigQueryStreamingDestination(AbstractDestination):
                    if load_job.state != "DONE":
                        raise Exception(f"Failed to load rows to BigQuery: {load_job.errors}")

+                    self.monitor.track_large_records_synced(
+                        num_records=len(batch["json_batch"]), extra_tags={"destination_id": self.destination_id}
+                    )
+
            except Exception as e:
                logger.error(f"Error inserting batch: {str(e)}, type: {type(e)}")
                raise
@@ -347,7 +352,7 @@ class BigQueryStreamingDestination(AbstractDestination):

            # If adding this item would exceed either limit, yield current batch and start new one
            if (
-                len(current_batch) >= self.
+                len(current_batch) >= self.bq_max_rows_per_request
                or current_batch_size + item_size > self.MAX_REQUEST_SIZE_BYTES
            ):
                logger.debug(f"Yielding batch of {len(current_batch)} rows, size: {current_batch_size/1024/1024:.2f}MB")
bizon/connectors/destinations/bigquery_streaming_v2/src/config.py
CHANGED
@@ -41,7 +41,11 @@ class BigQueryStreamingV2ConfigDetails(AbstractDestinationDetailsConfig):
         description="BigQuery Time partitioning type",
     )
     authentication: Optional[BigQueryAuthentication] = None
-    bq_max_rows_per_request: Optional[int] = Field(
+    bq_max_rows_per_request: Optional[int] = Field(
+        5000,
+        description="Max rows per buffer streaming request. Must not exceed 10000.",
+        le=10000,
+    )
     record_schemas: Optional[list[BigQueryRecordSchemaConfig]] = Field(
         default=None, description="Schema for the records. Required if unnest is set to true."
     )
@@ -49,4 +53,5 @@ class BigQueryStreamingV2ConfigDetails(AbstractDestinationDetailsConfig):

 class BigQueryStreamingV2Config(AbstractDestinationConfig):
     name: Literal[DestinationTypes.BIGQUERY_STREAMING_V2]
+    alias: str = "bigquery"
     config: BigQueryStreamingV2ConfigDetails
bizon/connectors/destinations/bigquery_streaming_v2/src/destination.py
CHANGED
@@ -1,25 +1,44 @@
 import os
 import tempfile
-from concurrent.futures import ThreadPoolExecutor
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from datetime import datetime
 from typing import List, Tuple, Type

+import orjson
 import polars as pl
-
-from google.
+import urllib3.exceptions
+from google.api_core.client_options import ClientOptions
+from google.api_core.exceptions import (
+    Conflict,
+    InvalidArgument,
+    NotFound,
+    RetryError,
+    ServerError,
+    ServiceUnavailable,
+)
+from google.cloud import bigquery
 from google.cloud.bigquery import DatasetReference, TimePartitioning
+from google.cloud.bigquery_storage_v1 import BigQueryWriteClient
 from google.cloud.bigquery_storage_v1.types import (
     AppendRowsRequest,
     ProtoRows,
     ProtoSchema,
 )
-from google.protobuf.json_format import ParseDict
-from google.protobuf.message import Message
+from google.protobuf.json_format import MessageToDict, ParseDict, ParseError
+from google.protobuf.message import EncodeError, Message
 from loguru import logger
+from requests.exceptions import ConnectionError, SSLError, Timeout
+from tenacity import (
+    retry,
+    retry_if_exception_type,
+    stop_after_attempt,
+    wait_exponential,
+)

 from bizon.common.models import SyncMetadata
 from bizon.destination.destination import AbstractDestination
 from bizon.engine.backend.backend import AbstractBackend
+from bizon.monitoring.monitor import AbstractMonitor
 from bizon.source.callback import AbstractSourceCallback

 from .config import BigQueryStreamingV2ConfigDetails
@@ -29,9 +48,8 @@ from .proto_utils import get_proto_schema_and_class
 class BigQueryStreamingV2Destination(AbstractDestination):

     # Add constants for limits
-
-
-    MAX_ROW_SIZE_BYTES = 0.9 * 1024 * 1024  # 1 MB
+    MAX_REQUEST_SIZE_BYTES = 9.5 * 1024 * 1024  # 9.5 MB (max is 10MB)
+    MAX_ROW_SIZE_BYTES = 8 * 1024 * 1024  # 8 MB (max is 10MB)

     def __init__(
         self,
@@ -39,8 +57,9 @@ class BigQueryStreamingV2Destination(AbstractDestination):
         config: BigQueryStreamingV2ConfigDetails,
         backend: AbstractBackend,
         source_callback: AbstractSourceCallback,
+        monitor: AbstractMonitor,
     ):  # type: ignore
-        super().__init__(sync_metadata, config, backend, source_callback)
+        super().__init__(sync_metadata, config, backend, source_callback, monitor)
         self.config: BigQueryStreamingV2ConfigDetails = config

         if config.authentication and config.authentication.service_account_key:
@@ -51,10 +70,12 @@ class BigQueryStreamingV2Destination(AbstractDestination):

         self.project_id = config.project_id
         self.bq_client = bigquery.Client(project=self.project_id)
-        self.bq_storage_client = bigquery_storage_v1.BigQueryWriteClient()
         self.dataset_id = config.dataset_id
         self.dataset_location = config.dataset_location
         self.bq_max_rows_per_request = config.bq_max_rows_per_request
+        self.bq_storage_client_options = ClientOptions(
+            quota_project_id=self.project_id,
+        )

     @property
     def table_id(self) -> str:
@@ -102,13 +123,35 @@ class BigQueryStreamingV2Destination(AbstractDestination):
         dataset = self.bq_client.create_dataset(dataset)
         return True

+    @retry(
+        retry=retry_if_exception_type(
+            (
+                ServerError,
+                ServiceUnavailable,
+                SSLError,
+                ConnectionError,
+                Timeout,
+                RetryError,
+                urllib3.exceptions.ProtocolError,
+                urllib3.exceptions.SSLError,
+                InvalidArgument,
+            )
+        ),
+        wait=wait_exponential(multiplier=2, min=4, max=120),
+        stop=stop_after_attempt(8),
+        before_sleep=lambda retry_state: logger.warning(
+            f"Streaming append attempt {retry_state.attempt_number} failed. "
+            f"Retrying in {retry_state.next_action.sleep} seconds..."
+        ),
+    )
     def append_rows_to_stream(
         self,
-        write_client: bigquery_storage_v1.BigQueryWriteClient,
         stream_name: str,
         proto_schema: ProtoSchema,
         serialized_rows: List[bytes],
     ):
+        write_client = BigQueryWriteClient(client_options=self.bq_storage_client_options)
+
         request = AppendRowsRequest(
             write_stream=stream_name,
             proto_rows=AppendRowsRequest.ProtoData(
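Editor's note: `append_rows_to_stream` is now wrapped in a tenacity retry decorator that backs off exponentially on transient network/API errors and logs a warning before each sleep. A self-contained sketch of the same pattern applied to a toy flaky function (not the BigQuery call itself):

```python
# Sketch only: the tenacity retry pattern used above, applied to a toy flaky function.
from loguru import logger
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential

attempts = {"n": 0}


@retry(
    retry=retry_if_exception_type((ConnectionError, TimeoutError)),
    wait=wait_exponential(multiplier=2, min=1, max=10),
    stop=stop_after_attempt(5),
    before_sleep=lambda retry_state: logger.warning(
        f"Attempt {retry_state.attempt_number} failed. "
        f"Retrying in {retry_state.next_action.sleep} seconds..."
    ),
)
def flaky_append() -> str:
    # Fail twice with a retryable error, then succeed.
    attempts["n"] += 1
    if attempts["n"] < 3:
        raise ConnectionError("transient network error")
    return "OK"


print(flaky_append())  # "OK" after two retried failures
```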
@@ -116,11 +159,26 @@ class BigQueryStreamingV2Destination(AbstractDestination):
                 writer_schema=proto_schema,
             ),
         )
-
-
+        try:
+            response = write_client.append_rows(iter([request]))
+            return response.code().name
+        except Exception as e:
+            logger.error(f"Error in append_rows_to_stream: {str(e)}")
+            logger.error(f"Stream name: {stream_name}")
+            raise

     def safe_cast_record_values(self, row: dict):
+        """
+        Safe cast record values to the correct type for BigQuery.
+        """
         for col in self.record_schemas[self.destination_id]:
+
+            # Handle dicts as strings
+            if col.type in ["STRING", "JSON"]:
+                if isinstance(row[col.name], dict) or isinstance(row[col.name], list):
+                    row[col.name] = orjson.dumps(row[col.name]).decode("utf-8")
+
+            # Handle timestamps
             if col.type in ["TIMESTAMP", "DATETIME"] and col.default_value_expression is None:
                 if isinstance(row[col.name], int):
                     if row[col.name] > datetime(9999, 12, 31).timestamp():
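Editor's note: `safe_cast_record_values` now serializes dict and list values with orjson before they land in STRING/JSON columns, in addition to the existing timestamp normalization. A small sketch of the dict-to-string part, using toy column metadata rather than the package's `record_schemas` objects:

```python
# Sketch only: casting dict/list values to JSON strings the way safe_cast_record_values does.
import orjson

row = {"payload": {"user": "a", "tags": ["x", "y"]}, "name": "event"}

for col_name, col_type in [("payload", "JSON"), ("name", "STRING")]:
    value = row[col_name]
    if col_type in ["STRING", "JSON"] and isinstance(value, (dict, list)):
        # orjson.dumps returns bytes, so decode to a UTF-8 string for BigQuery.
        row[col_name] = orjson.dumps(value).decode("utf-8")

print(row["payload"])  # '{"user":"a","tags":["x","y"]}'
```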
@@ -143,15 +201,102 @@ class BigQueryStreamingV2Destination(AbstractDestination):
     @staticmethod
     def to_protobuf_serialization(TableRowClass: Type[Message], row: dict) -> bytes:
         """Convert a row to a Protobuf serialization."""
-
-
+        try:
+            record = ParseDict(row, TableRowClass())
+        except ParseError as e:
+            logger.error(f"Error serializing record: {e} for row: {row}.")
+            raise e

-
+        try:
+            serialized_record = record.SerializeToString()
+        except EncodeError as e:
+            logger.error(f"Error serializing record: {e} for row: {row}.")
+            raise e
+        return serialized_record

-
-
+    @staticmethod
+    def from_protobuf_serialization(
+        TableRowClass: Type[Message],
+        serialized_data: bytes,
+    ) -> dict:
+        """Convert protobuf serialization back to a dictionary."""
+        record = TableRowClass()
+        record.ParseFromString(serialized_data)
+        return MessageToDict(record, preserving_proto_field_name=True)
+
+    @retry(
+        retry=retry_if_exception_type(
+            (
+                ServerError,
+                ServiceUnavailable,
+                SSLError,
+                ConnectionError,
+                Timeout,
+                RetryError,
+                urllib3.exceptions.ProtocolError,
+                urllib3.exceptions.SSLError,
+            )
+        ),
+        wait=wait_exponential(multiplier=2, min=4, max=120),
+        stop=stop_after_attempt(8),
+        before_sleep=lambda retry_state: logger.warning(
+            f"Attempt {retry_state.attempt_number} failed. Retrying in {retry_state.next_action.sleep} seconds..."
+        ),
+    )
+    def process_streaming_batch(
+        self,
+        stream_name: str,
+        proto_schema: ProtoSchema,
+        batch: dict,
+        table_row_class: Type[Message],
+    ) -> List[Tuple[str, str]]:
+        """Process a single batch for streaming and/or large rows with retry logic."""
+        results = []
+        try:
+            # Handle streaming batch
+            if batch.get("stream_batch") and len(batch["stream_batch"]) > 0:
+                result = self.append_rows_to_stream(stream_name, proto_schema, batch["stream_batch"])
+                results.append(("streaming", result))
+
+            # Handle large rows batch
+            if batch.get("json_batch") and len(batch["json_batch"]) > 0:
+                # Deserialize protobuf bytes back to JSON for the load job
+                deserialized_rows = []
+                for serialized_row in batch["json_batch"]:
+                    deserialized_row = self.from_protobuf_serialization(table_row_class, serialized_row)
+                    deserialized_rows.append(deserialized_row)
+
+                # For large rows, we need to use the main client
+                job_config = bigquery.LoadJobConfig(
+                    source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
+                    schema=self.bq_client.get_table(self.table_id).schema,
+                    ignore_unknown_values=True,
+                )
+                load_job = self.bq_client.load_table_from_json(
+                    deserialized_rows, self.table_id, job_config=job_config, timeout=300
+                )
+                result = load_job.result()
+                if load_job.state != "DONE":
+                    raise Exception(f"Failed to load rows to BigQuery: {load_job.errors}")
+
+                # Track large rows
+                self.monitor.track_large_records_synced(
+                    num_records=len(batch["json_batch"]), extra_tags={"destination_id": self.destination_id}
+                )
+
+                results.append(("large_rows", "DONE"))
+
+            if not results:
+                results.append(("empty", "SKIPPED"))

-
+            return results
+        except Exception as e:
+            logger.error(f"Error processing batch: {str(e)}")
+            raise
+
+    def load_to_bigquery_via_streaming(self, df_destination_records: pl.DataFrame) -> str:
+
+        # Create table if it does not exist
         schema = self.get_bigquery_schema()
         table = bigquery.Table(self.table_id, schema=schema)
         time_partitioning = TimePartitioning(
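Editor's note: `from_protobuf_serialization` reverses `to_protobuf_serialization` so that oversized rows, already serialized for the Storage Write API, can be turned back into dicts for a JSON load job. A round-trip sketch using the well-known `Struct` message; the real code uses a TableRow class generated from the BigQuery schema:

```python
# Sketch only: ParseDict -> SerializeToString -> ParseFromString -> MessageToDict round trip.
from google.protobuf.json_format import MessageToDict, ParseDict
from google.protobuf.struct_pb2 import Struct

row = {"id": "42", "payload": "hello"}

# Serialize (what to_protobuf_serialization does with the generated TableRow class).
message = ParseDict(row, Struct())
serialized = message.SerializeToString()

# Deserialize (what from_protobuf_serialization does before the JSON load job).
restored = Struct()
restored.ParseFromString(serialized)
print(MessageToDict(restored, preserving_proto_field_name=True))  # {'id': '42', 'payload': 'hello'}
```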
@@ -159,31 +304,43 @@ class BigQueryStreamingV2Destination(AbstractDestination):
         )
         table.time_partitioning = time_partitioning

-
-
-
-
-
-
+        if self.clustering_keys and self.clustering_keys[self.destination_id]:
+            table.clustering_fields = self.clustering_keys[self.destination_id]
+        try:
+            table = self.bq_client.create_table(table)
+        except Conflict:
+            table = self.bq_client.get_table(self.table_id)
+            # Compare and update schema if needed
+            existing_fields = {field.name: field for field in table.schema}
+            new_fields = {field.name: field for field in self.get_bigquery_schema()}
+
+            # Find fields that need to be added
+            fields_to_add = [field for name, field in new_fields.items() if name not in existing_fields]
+
+            if fields_to_add:
+                logger.warning(f"Adding new fields to table schema: {[field.name for field in fields_to_add]}")
+                updated_schema = table.schema + fields_to_add
+                table.schema = updated_schema
+                table = self.bq_client.update_table(table, ["schema"])

         # Create the stream
         if self.destination_id:
             project, dataset, table_name = self.destination_id.split(".")
-
-            parent = write_client.table_path(project, dataset, table_name)
+            parent = BigQueryWriteClient.table_path(project, dataset, table_name)
         else:
-
-            parent = write_client.table_path(self.project_id, self.dataset_id, self.destination_id)
+            parent = BigQueryWriteClient.table_path(self.project_id, self.dataset_id, self.destination_id)

         stream_name = f"{parent}/_default"

         # Generating the protocol buffer representation of the message descriptor.
-        proto_schema, TableRow = get_proto_schema_and_class(schema
+        proto_schema, TableRow = get_proto_schema_and_class(schema)

         if self.config.unnest:
             serialized_rows = [
-                self.to_protobuf_serialization(
-
+                self.to_protobuf_serialization(
+                    TableRowClass=TableRow, row=self.safe_cast_record_values(orjson.loads(row))
+                )
+                for row in df_destination_records["source_data"].to_list()
             ]
         else:
             df_destination_records = df_destination_records.with_columns(
@@ -207,16 +364,43 @@ class BigQueryStreamingV2Destination(AbstractDestination):
                 for row in df_destination_records.iter_rows(named=True)
             ]

-
-
-
-
-
-
-
-
+        streaming_results = []
+        large_rows_results = []
+
+        # Collect all batches first
+        batches = list(self.batch(serialized_rows))
+
+        # Use ThreadPoolExecutor for parallel processing
+        max_workers = min(len(batches), self.config.max_concurrent_threads)
+        logger.info(f"Processing {len(batches)} batches with {max_workers} concurrent threads")
+
+        try:
+            with ThreadPoolExecutor(max_workers=max_workers) as executor:
+                # Submit all batch processing tasks
+                future_to_batch = {
+                    executor.submit(self.process_streaming_batch, stream_name, proto_schema, batch, TableRow): batch
+                    for batch in batches
+                }

-
+                # Collect results as they complete
+                for future in as_completed(future_to_batch):
+                    batch_results = future.result()
+                    for batch_type, result in batch_results:
+                        if batch_type == "streaming":
+                            streaming_results.append(result)
+                        if batch_type == "large_rows":
+                            large_rows_results.append(result)
+
+        except Exception as e:
+            logger.error(f"Error in multithreaded batch processing: {str(e)}, type: {type(e)}")
+            if isinstance(e, RetryError):
+                logger.error(f"Retry error details: {e.cause if hasattr(e, 'cause') else 'No cause available'}")
+            raise
+
+        if len(streaming_results) > 0:
+            assert all([r == "OK" for r in streaming_results]) is True, "Failed to append rows to stream"
+        if len(large_rows_results) > 0:
+            assert all([r == "DONE" for r in large_rows_results]) is True, "Failed to load rows to BigQuery"

     def write_records(self, df_destination_records: pl.DataFrame) -> Tuple[bool, str]:
         self.load_to_bigquery_via_streaming(df_destination_records=df_destination_records)
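Editor's note: batches are now processed concurrently. Each batch is submitted to a ThreadPoolExecutor and results are collected with `as_completed`, with parallelism capped by a max-concurrent-threads setting. A minimal sketch of that fan-out/fan-in pattern with a toy `process_batch` function (not the destination's method):

```python
# Sketch only: the submit/as_completed fan-out used by load_to_bigquery_via_streaming.
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, Tuple


def process_batch(batch: List[int]) -> Tuple[str, int]:
    # Stand-in for process_streaming_batch: returns a (kind, result) pair.
    return ("streaming", sum(batch))


batches = [[1, 2], [3, 4], [5, 6, 7]]
max_workers = min(len(batches), 4)  # cap parallelism, as the real code does

results = []
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    future_to_batch = {executor.submit(process_batch, batch): batch for batch in batches}
    for future in as_completed(future_to_batch):
        kind, value = future.result()  # re-raises any exception from the worker
        results.append(value)

print(sorted(results))  # [3, 7, 18]
```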
@@ -236,7 +420,7 @@ class BigQueryStreamingV2Destination(AbstractDestination):

            # If adding this item would exceed either limit, yield current batch and start new one
            if (
-                len(current_batch) >= self.
+                len(current_batch) >= self.bq_max_rows_per_request
                or current_batch_size + item_size > self.MAX_REQUEST_SIZE_BYTES
            ):
                logger.debug(f"Yielding batch of {len(current_batch)} rows, size: {current_batch_size/1024/1024:.2f}MB")
@@ -247,15 +431,16 @@ class BigQueryStreamingV2Destination(AbstractDestination):

            if item_size > self.MAX_ROW_SIZE_BYTES:
                large_rows.append(item)
-                logger.
+                logger.warning(f"Large row detected: {item_size} bytes")
            else:
                current_batch.append(item)
                current_batch_size += item_size

        # Yield the last batch
        if current_batch:
-            logger.
+            logger.info(
                f"Yielding streaming batch of {len(current_batch)} rows, size: {current_batch_size/1024/1024:.2f}MB"
            )
-
+        if large_rows:
+            logger.warning(f"Yielding large rows batch of {len(large_rows)} rows")
        yield {"stream_batch": current_batch, "json_batch": large_rows}
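Editor's note: the `batch` generator now splits rows by both a row-count cap (`bq_max_rows_per_request`) and a request-size cap, and routes rows above `MAX_ROW_SIZE_BYTES` into a separate "large rows" bucket that is written via a JSON load job instead of the streaming API. A standalone sketch of that splitting logic, using plain byte strings instead of protobuf-serialized rows:

```python
# Sketch only: count- and size-aware batching with a separate bucket for oversized rows.
from typing import Iterator, List


def batch_rows(
    rows: List[bytes],
    max_rows_per_request: int = 3,
    max_request_size_bytes: int = 50,
    max_row_size_bytes: int = 20,
) -> Iterator[dict]:
    current_batch: List[bytes] = []
    current_size = 0
    large_rows: List[bytes] = []

    for item in rows:
        item_size = len(item)
        # Flush if adding this item would exceed either limit.
        if current_batch and (
            len(current_batch) >= max_rows_per_request
            or current_size + item_size > max_request_size_bytes
        ):
            yield {"stream_batch": current_batch, "json_batch": []}
            current_batch, current_size = [], 0

        if item_size > max_row_size_bytes:
            large_rows.append(item)  # too big for streaming, goes to the load-job bucket
        else:
            current_batch.append(item)
            current_size += item_size

    # The last batch carries whatever is left, plus all large rows.
    if current_batch or large_rows:
        yield {"stream_batch": current_batch, "json_batch": large_rows}


rows = [b"a" * 10, b"b" * 10, b"c" * 10, b"d" * 30, b"e" * 10]
for chunk in batch_rows(rows):
    print(len(chunk["stream_batch"]), len(chunk["json_batch"]))  # 3 0, then 1 1
```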
bizon/connectors/destinations/bigquery_streaming_v2/src/proto_utils.py
CHANGED
@@ -32,9 +32,7 @@ def map_bq_type_to_field_descriptor(bq_type: str) -> int:
     return type_map.get(bq_type, FieldDescriptorProto.TYPE_STRING)  # Default to TYPE_STRING


-def get_proto_schema_and_class(
-    bq_schema: List[SchemaField], clustering_keys: List[str] = None
-) -> Tuple[ProtoSchema, Type[Message]]:
+def get_proto_schema_and_class(bq_schema: List[SchemaField]) -> Tuple[ProtoSchema, Type[Message]]:
     """Generate a ProtoSchema and a TableRow class for unnested BigQuery schema."""
     # Define the FileDescriptorProto
     file_descriptor_proto = FileDescriptorProto()
@@ -60,16 +58,6 @@ def get_proto_schema_and_class(
         for col in bq_schema
     ]

-    if clustering_keys:
-        for key in clustering_keys:
-            fields.append(
-                {
-                    "name": key,
-                    "type": FieldDescriptorProto.TYPE_STRING,
-                    "label": FieldDescriptorProto.LABEL_OPTIONAL,
-                }
-            )
-
     for i, field in enumerate(fields, start=1):
         field_descriptor = message_descriptor.field.add()
         field_descriptor.name = field["name"]
bizon/connectors/destinations/file/src/destination.py
CHANGED
@@ -6,6 +6,7 @@ import polars as pl
 from bizon.common.models import SyncMetadata
 from bizon.destination.destination import AbstractDestination
 from bizon.engine.backend.backend import AbstractBackend
+from bizon.monitoring.monitor import AbstractMonitor
 from bizon.source.callback import AbstractSourceCallback

 from .config import FileDestinationDetailsConfig
@@ -19,8 +20,9 @@ class FileDestination(AbstractDestination):
         config: FileDestinationDetailsConfig,
         backend: AbstractBackend,
         source_callback: AbstractSourceCallback,
+        monitor: AbstractMonitor,
     ):
-        super().__init__(sync_metadata, config, backend, source_callback)
+        super().__init__(sync_metadata, config, backend, source_callback, monitor)
         self.config: FileDestinationDetailsConfig = config

     def check_connection(self) -> bool:
bizon/connectors/destinations/logger/src/destination.py
CHANGED
@@ -6,6 +6,7 @@ from loguru import logger
 from bizon.common.models import SyncMetadata
 from bizon.destination.destination import AbstractDestination
 from bizon.engine.backend.backend import AbstractBackend
+from bizon.monitoring.monitor import AbstractMonitor
 from bizon.source.callback import AbstractSourceCallback

 from .config import LoggerDestinationConfig
@@ -19,12 +20,14 @@ class LoggerDestination(AbstractDestination):
         config: LoggerDestinationConfig,
         backend: AbstractBackend,
         source_callback: AbstractSourceCallback,
+        monitor: AbstractMonitor,
     ):
         super().__init__(
             sync_metadata=sync_metadata,
             config=config,
             backend=backend,
             source_callback=source_callback,
+            monitor=monitor,
         )

     def check_connection(self) -> bool:
bizon/connectors/sources/kafka/config/kafka.example.yml
CHANGED
@@ -8,8 +8,6 @@ source:

   topic: my-topic

-  nb_bytes_schema_id: 8
-
   batch_size: 1000
   consumer_timeout: 10
   bootstrap_servers: <bootstrap-severs>:9092
@@ -47,4 +45,4 @@ destination:
 # syncCursorInDBEvery: 100

 # runner:
-#   log_level: INFO
+#   log_level: INFO
bizon/connectors/sources/kafka/config/kafka_debezium.example.yml
CHANGED
@@ -10,8 +10,6 @@ source:

   topic: <TOPIC_NAME>

-  nb_bytes_schema_id: 8
-
   batch_size: 1000
   consumer_timeout: 10
   bootstrap_servers: <BOOTSTRAP_SERVERS>
@@ -109,4 +107,4 @@ engine:
   queue:
     type: python_queue
     config:
-      max_nb_messages: 1000000
+      max_nb_messages: 1000000
bizon/connectors/sources/kafka/src/config.py
CHANGED
@@ -66,10 +66,4 @@ class KafkaSourceConfig(SourceConfig):

     message_encoding: str = Field(default=MessageEncoding.AVRO, description="Encoding to use to decode the message")

-    # Schema ID header configuration
-    nb_bytes_schema_id: Literal[4, 8] = Field(
-        description="Number of bytes encode SchemaID in Kafka message. Standard is 4.",
-        default=4,
-    )
-
     authentication: KafkaAuthConfig = Field(..., description="Authentication configuration")
|