bizon 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. bizon/common/models.py +2 -0
  2. bizon/connectors/destinations/bigquery/src/config.py +1 -0
  3. bizon/connectors/destinations/bigquery/src/destination.py +3 -1
  4. bizon/connectors/destinations/bigquery_streaming/src/config.py +6 -5
  5. bizon/connectors/destinations/bigquery_streaming/src/destination.py +9 -4
  6. bizon/connectors/destinations/bigquery_streaming_v2/src/config.py +6 -1
  7. bizon/connectors/destinations/bigquery_streaming_v2/src/destination.py +230 -45
  8. bizon/connectors/destinations/bigquery_streaming_v2/src/proto_utils.py +1 -13
  9. bizon/connectors/destinations/file/src/config.py +1 -0
  10. bizon/connectors/destinations/file/src/destination.py +3 -1
  11. bizon/connectors/destinations/logger/src/config.py +1 -0
  12. bizon/connectors/destinations/logger/src/destination.py +3 -0
  13. bizon/connectors/sources/kafka/config/kafka.example.yml +1 -3
  14. bizon/connectors/sources/kafka/config/kafka_debezium.example.yml +1 -3
  15. bizon/connectors/sources/kafka/src/config.py +0 -6
  16. bizon/connectors/sources/kafka/src/decode.py +71 -66
  17. bizon/connectors/sources/kafka/src/source.py +44 -24
  18. bizon/connectors/sources/kafka/tests/kafka_pipeline.py +1 -1
  19. bizon/destination/config.py +9 -0
  20. bizon/destination/destination.py +37 -5
  21. bizon/engine/runner/adapters/streaming.py +60 -42
  22. bizon/engine/runner/runner.py +14 -7
  23. bizon/monitoring/config.py +12 -2
  24. bizon/monitoring/datadog/monitor.py +98 -14
  25. bizon/monitoring/monitor.py +41 -12
  26. bizon/monitoring/noop/monitor.py +22 -3
  27. bizon/source/source.py +1 -1
  28. {bizon-0.1.1.dist-info → bizon-0.1.2.dist-info}/METADATA +2 -1
  29. {bizon-0.1.1.dist-info → bizon-0.1.2.dist-info}/RECORD +32 -32
  30. {bizon-0.1.1.dist-info → bizon-0.1.2.dist-info}/WHEEL +1 -1
  31. {bizon-0.1.1.dist-info → bizon-0.1.2.dist-info}/LICENSE +0 -0
  32. {bizon-0.1.1.dist-info → bizon-0.1.2.dist-info}/entry_points.txt +0 -0
bizon/common/models.py CHANGED
@@ -75,6 +75,7 @@ class SyncMetadata(BaseModel):
     stream_name: str
     sync_mode: SourceSyncModes
     destination_name: str
+    destination_alias: str
 
     @classmethod
     def from_bizon_config(cls, job_id: str, config: BizonConfig) -> "SyncMetadata":
@@ -85,4 +86,5 @@ class SyncMetadata(BaseModel):
             stream_name=config.source.stream,
             sync_mode=config.source.sync_mode,
             destination_name=config.destination.name,
+            destination_alias=config.destination.alias,
         )
bizon/connectors/destinations/bigquery/src/config.py CHANGED
@@ -123,5 +123,6 @@ class BigQueryConfigDetails(AbstractDestinationDetailsConfig):
 
 class BigQueryConfig(AbstractDestinationConfig):
     name: Literal[DestinationTypes.BIGQUERY]
+    alias: str = "bigquery"
     buffer_size: Optional[int] = 400
     config: BigQueryConfigDetails
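
Version 0.1.2 threads a destination alias through the configuration models: every destination config gains an `alias` field with a connector-specific default, and `SyncMetadata` now carries it as `destination_alias`. A minimal sketch (not bizon's actual classes, which also involve `BizonConfig` and the monitoring layer) of how such a defaulted field behaves:

```python
# Minimal sketch, assuming pydantic models shaped like the ones in the diff.
# Class names here are illustrative stand-ins, not bizon's real classes.
from pydantic import BaseModel


class DestinationConfig(BaseModel):
    name: str
    alias: str = "bigquery"  # per-connector default, as added in 0.1.2


class SyncMetadata(BaseModel):
    destination_name: str
    destination_alias: str


config = DestinationConfig(name="bigquery_streaming_v2")                    # alias falls back to "bigquery"
override = DestinationConfig(name="bigquery_streaming_v2", alias="analytics")

meta = SyncMetadata(destination_name=config.name, destination_alias=config.alias)
print(meta.destination_alias)   # bigquery
print(override.alias)           # analytics
```
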
bizon/connectors/destinations/bigquery/src/destination.py CHANGED
@@ -14,6 +14,7 @@ from loguru import logger
 from bizon.common.models import SyncMetadata
 from bizon.destination.destination import AbstractDestination
 from bizon.engine.backend.backend import AbstractBackend
+from bizon.monitoring.monitor import AbstractMonitor
 from bizon.source.config import SourceSyncModes
 from bizon.source.source import AbstractSourceCallback
 
@@ -28,8 +29,9 @@ class BigQueryDestination(AbstractDestination):
         config: BigQueryConfigDetails,
         backend: AbstractBackend,
         source_callback: AbstractSourceCallback,
+        monitor: AbstractMonitor,
     ):
-        super().__init__(sync_metadata, config, backend, source_callback)
+        super().__init__(sync_metadata, config, backend, source_callback, monitor)
         self.config: BigQueryConfigDetails = config
 
         if config.authentication and config.authentication.service_account_key:
bizon/connectors/destinations/bigquery_streaming/src/config.py CHANGED
@@ -41,16 +41,17 @@ class BigQueryStreamingConfigDetails(AbstractDestinationDetailsConfig):
         description="BigQuery Time partitioning type",
     )
     authentication: Optional[BigQueryAuthentication] = None
-    bq_max_rows_per_request: Optional[int] = Field(30000, description="Max rows per buffer streaming request.")
+    bq_max_rows_per_request: Optional[int] = Field(
+        5000,
+        description="Max rows per buffer streaming request. Must not exceed 10000.",
+        le=10000,
+    )
     record_schemas: Optional[list[BigQueryRecordSchemaConfig]] = Field(
         default=None, description="Schema for the records. Required if unnest is set to true."
     )
-    use_legacy_streaming_api: bool = Field(
-        default=False,
-        description="[DEPRECATED] Use the legacy streaming API. This is required for some older BigQuery versions.",
-    )
 
 
 class BigQueryStreamingConfig(AbstractDestinationConfig):
     name: Literal[DestinationTypes.BIGQUERY_STREAMING]
+    alias: str = "bigquery"
     config: BigQueryStreamingConfigDetails
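
Both streaming destinations drop the 30000-row default to 5000 and add a pydantic `le=10000` bound, so a config asking for more than 10000 rows per append now fails validation instead of being sent to the API. A minimal sketch of that constraint in isolation (the surrounding bizon config classes are omitted):

```python
# Sketch of the new bound: pydantic's `le=` keyword rejects values above
# 10000 at config-validation time. Field name mirrors the diff.
from pydantic import BaseModel, Field, ValidationError


class StreamingDetails(BaseModel):
    bq_max_rows_per_request: int = Field(
        5000,
        description="Max rows per buffer streaming request. Must not exceed 10000.",
        le=10000,
    )


print(StreamingDetails().bq_max_rows_per_request)                  # 5000 (new default)
print(StreamingDetails(bq_max_rows_per_request=10000))             # accepted at the cap

try:
    StreamingDetails(bq_max_rows_per_request=30000)                # old default, now rejected
except ValidationError as exc:
    print("rejected: value exceeds le=10000 bound")
```
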
bizon/connectors/destinations/bigquery_streaming/src/destination.py CHANGED
@@ -36,6 +36,7 @@ from bizon.connectors.destinations.bigquery.src.config import (
 )
 from bizon.destination.destination import AbstractDestination
 from bizon.engine.backend.backend import AbstractBackend
+from bizon.monitoring.monitor import AbstractMonitor
 from bizon.source.callback import AbstractSourceCallback
 
 from .config import BigQueryStreamingConfigDetails
@@ -44,7 +45,6 @@ from .config import BigQueryStreamingConfigDetails
 class BigQueryStreamingDestination(AbstractDestination):
 
     # Add constants for limits
-    MAX_ROWS_PER_REQUEST = 5000  # 5000 (max is 10000)
     MAX_REQUEST_SIZE_BYTES = 5 * 1024 * 1024  # 5 MB (max is 10MB)
     MAX_ROW_SIZE_BYTES = 0.9 * 1024 * 1024  # 1 MB
 
@@ -54,8 +54,9 @@ class BigQueryStreamingDestination(AbstractDestination):
         config: BigQueryStreamingConfigDetails,
         backend: AbstractBackend,
         source_callback: AbstractSourceCallback,
+        monitor: AbstractMonitor,
     ):  # type: ignore
-        super().__init__(sync_metadata, config, backend, source_callback)
+        super().__init__(sync_metadata, config, backend, source_callback, monitor)
         self.config: BigQueryStreamingConfigDetails = config
 
         if config.authentication and config.authentication.service_account_key:
@@ -222,7 +223,7 @@ class BigQueryStreamingDestination(AbstractDestination):
         try:
             # Handle streaming batch
             if batch.get("stream_batch") and len(batch["stream_batch"]) > 0:
-                return self.bq_client.insert_rows_json(
+                self.bq_client.insert_rows_json(
                     table,
                     batch["stream_batch"],
                     row_ids=[None] * len(batch["stream_batch"]),
@@ -245,6 +246,10 @@ class BigQueryStreamingDestination(AbstractDestination):
                 if load_job.state != "DONE":
                     raise Exception(f"Failed to load rows to BigQuery: {load_job.errors}")
 
+                self.monitor.track_large_records_synced(
+                    num_records=len(batch["json_batch"]), extra_tags={"destination_id": self.destination_id}
+                )
+
         except Exception as e:
             logger.error(f"Error inserting batch: {str(e)}, type: {type(e)}")
             raise
@@ -347,7 +352,7 @@ class BigQueryStreamingDestination(AbstractDestination):
 
             # If adding this item would exceed either limit, yield current batch and start new one
             if (
-                len(current_batch) >= self.MAX_ROWS_PER_REQUEST
+                len(current_batch) >= self.bq_max_rows_per_request
                 or current_batch_size + item_size > self.MAX_REQUEST_SIZE_BYTES
             ):
                 logger.debug(f"Yielding batch of {len(current_batch)} rows, size: {current_batch_size/1024/1024:.2f}MB")
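
In the legacy streaming destination, the handler no longer returns the result of `insert_rows_json`, and large-row load jobs are now counted through the monitor's `track_large_records_synced`. For context, a hedged sketch of the underlying client call with placeholder project and table names: `insert_rows_json` returns a list of per-row error dicts, empty when every row was accepted.

```python
# Sketch of the legacy streaming insert used above; names are placeholders.
# Passing None row_ids (as the diff does) skips best-effort de-duplication.
from google.cloud import bigquery

client = bigquery.Client(project="my-project")                     # placeholder project
table = client.get_table("my-project.my_dataset.my_table")         # placeholder table

rows = [{"id": 1, "payload": "a"}, {"id": 2, "payload": "b"}]
errors = client.insert_rows_json(
    table,
    rows,
    row_ids=[None] * len(rows),
)
if errors:
    raise RuntimeError(f"Streaming insert reported errors: {errors}")
```
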
bizon/connectors/destinations/bigquery_streaming_v2/src/config.py CHANGED
@@ -41,7 +41,11 @@ class BigQueryStreamingV2ConfigDetails(AbstractDestinationDetailsConfig):
         description="BigQuery Time partitioning type",
     )
     authentication: Optional[BigQueryAuthentication] = None
-    bq_max_rows_per_request: Optional[int] = Field(30000, description="Max rows per buffer streaming request.")
+    bq_max_rows_per_request: Optional[int] = Field(
+        5000,
+        description="Max rows per buffer streaming request. Must not exceed 10000.",
+        le=10000,
+    )
     record_schemas: Optional[list[BigQueryRecordSchemaConfig]] = Field(
         default=None, description="Schema for the records. Required if unnest is set to true."
     )
@@ -49,4 +53,5 @@ class BigQueryStreamingV2ConfigDetails(AbstractDestinationDetailsConfig):
 
 class BigQueryStreamingV2Config(AbstractDestinationConfig):
     name: Literal[DestinationTypes.BIGQUERY_STREAMING_V2]
+    alias: str = "bigquery"
     config: BigQueryStreamingV2ConfigDetails
bizon/connectors/destinations/bigquery_streaming_v2/src/destination.py CHANGED
@@ -1,25 +1,44 @@
 import os
 import tempfile
-from concurrent.futures import ThreadPoolExecutor
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from datetime import datetime
 from typing import List, Tuple, Type
 
+import orjson
 import polars as pl
-from google.api_core.exceptions import NotFound
-from google.cloud import bigquery, bigquery_storage_v1
+import urllib3.exceptions
+from google.api_core.client_options import ClientOptions
+from google.api_core.exceptions import (
+    Conflict,
+    InvalidArgument,
+    NotFound,
+    RetryError,
+    ServerError,
+    ServiceUnavailable,
+)
+from google.cloud import bigquery
 from google.cloud.bigquery import DatasetReference, TimePartitioning
+from google.cloud.bigquery_storage_v1 import BigQueryWriteClient
 from google.cloud.bigquery_storage_v1.types import (
     AppendRowsRequest,
     ProtoRows,
     ProtoSchema,
 )
-from google.protobuf.json_format import ParseDict
-from google.protobuf.message import Message
+from google.protobuf.json_format import MessageToDict, ParseDict, ParseError
+from google.protobuf.message import EncodeError, Message
 from loguru import logger
+from requests.exceptions import ConnectionError, SSLError, Timeout
+from tenacity import (
+    retry,
+    retry_if_exception_type,
+    stop_after_attempt,
+    wait_exponential,
+)
 
 from bizon.common.models import SyncMetadata
 from bizon.destination.destination import AbstractDestination
 from bizon.engine.backend.backend import AbstractBackend
+from bizon.monitoring.monitor import AbstractMonitor
 from bizon.source.callback import AbstractSourceCallback
 
 from .config import BigQueryStreamingV2ConfigDetails
@@ -29,9 +48,8 @@ from .proto_utils import get_proto_schema_and_class
 class BigQueryStreamingV2Destination(AbstractDestination):
 
     # Add constants for limits
-    MAX_ROWS_PER_REQUEST = 5000  # 5000 (max is 10000)
-    MAX_REQUEST_SIZE_BYTES = 5 * 1024 * 1024  # 5 MB (max is 10MB)
-    MAX_ROW_SIZE_BYTES = 0.9 * 1024 * 1024  # 1 MB
+    MAX_REQUEST_SIZE_BYTES = 9.5 * 1024 * 1024  # 9.5 MB (max is 10MB)
+    MAX_ROW_SIZE_BYTES = 8 * 1024 * 1024  # 8 MB (max is 10MB)
 
     def __init__(
         self,
@@ -39,8 +57,9 @@ class BigQueryStreamingV2Destination(AbstractDestination):
         config: BigQueryStreamingV2ConfigDetails,
         backend: AbstractBackend,
         source_callback: AbstractSourceCallback,
+        monitor: AbstractMonitor,
     ):  # type: ignore
-        super().__init__(sync_metadata, config, backend, source_callback)
+        super().__init__(sync_metadata, config, backend, source_callback, monitor)
         self.config: BigQueryStreamingV2ConfigDetails = config
 
         if config.authentication and config.authentication.service_account_key:
@@ -51,10 +70,12 @@ class BigQueryStreamingV2Destination(AbstractDestination):
 
         self.project_id = config.project_id
         self.bq_client = bigquery.Client(project=self.project_id)
-        self.bq_storage_client = bigquery_storage_v1.BigQueryWriteClient()
         self.dataset_id = config.dataset_id
         self.dataset_location = config.dataset_location
         self.bq_max_rows_per_request = config.bq_max_rows_per_request
+        self.bq_storage_client_options = ClientOptions(
+            quota_project_id=self.project_id,
+        )
 
     @property
     def table_id(self) -> str:
@@ -102,13 +123,35 @@ class BigQueryStreamingV2Destination(AbstractDestination):
         dataset = self.bq_client.create_dataset(dataset)
         return True
 
+    @retry(
+        retry=retry_if_exception_type(
+            (
+                ServerError,
+                ServiceUnavailable,
+                SSLError,
+                ConnectionError,
+                Timeout,
+                RetryError,
+                urllib3.exceptions.ProtocolError,
+                urllib3.exceptions.SSLError,
+                InvalidArgument,
+            )
+        ),
+        wait=wait_exponential(multiplier=2, min=4, max=120),
+        stop=stop_after_attempt(8),
+        before_sleep=lambda retry_state: logger.warning(
+            f"Streaming append attempt {retry_state.attempt_number} failed. "
+            f"Retrying in {retry_state.next_action.sleep} seconds..."
+        ),
+    )
     def append_rows_to_stream(
         self,
-        write_client: bigquery_storage_v1.BigQueryWriteClient,
         stream_name: str,
         proto_schema: ProtoSchema,
         serialized_rows: List[bytes],
     ):
+        write_client = BigQueryWriteClient(client_options=self.bq_storage_client_options)
+
         request = AppendRowsRequest(
             write_stream=stream_name,
             proto_rows=AppendRowsRequest.ProtoData(
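
`append_rows_to_stream` now builds a fresh `BigQueryWriteClient` per call (using the `ClientOptions` prepared in `__init__`) and is wrapped in a tenacity retry with exponential backoff on transient network and server errors. A self-contained sketch of the same retry pattern, with `flaky_call` standing in for the real append:

```python
# Sketch of the tenacity pattern used above: exponential backoff (4s up to
# 120s, multiplier 2), at most 8 attempts, retrying only on the listed
# exception types. `flaky_call` is a stand-in, not bizon's code.
import random

from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential


@retry(
    retry=retry_if_exception_type((ConnectionError, TimeoutError)),
    wait=wait_exponential(multiplier=2, min=4, max=120),
    stop=stop_after_attempt(8),
    before_sleep=lambda retry_state: print(
        f"Attempt {retry_state.attempt_number} failed, "
        f"sleeping {retry_state.next_action.sleep}s"
    ),
)
def flaky_call() -> str:
    # Fails transiently about half the time to exercise the retry loop.
    if random.random() < 0.5:
        raise ConnectionError("transient network hiccup")
    return "OK"


print(flaky_call())
```
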
@@ -116,11 +159,26 @@ class BigQueryStreamingV2Destination(AbstractDestination):
                 writer_schema=proto_schema,
             ),
         )
-        response = write_client.append_rows(iter([request]))
-        return response.code().name
+        try:
+            response = write_client.append_rows(iter([request]))
+            return response.code().name
+        except Exception as e:
+            logger.error(f"Error in append_rows_to_stream: {str(e)}")
+            logger.error(f"Stream name: {stream_name}")
+            raise
 
     def safe_cast_record_values(self, row: dict):
+        """
+        Safe cast record values to the correct type for BigQuery.
+        """
         for col in self.record_schemas[self.destination_id]:
+
+            # Handle dicts as strings
+            if col.type in ["STRING", "JSON"]:
+                if isinstance(row[col.name], dict) or isinstance(row[col.name], list):
+                    row[col.name] = orjson.dumps(row[col.name]).decode("utf-8")
+
+            # Handle timestamps
             if col.type in ["TIMESTAMP", "DATETIME"] and col.default_value_expression is None:
                 if isinstance(row[col.name], int):
                     if row[col.name] > datetime(9999, 12, 31).timestamp():
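
`safe_cast_record_values` now also serializes dict and list values into JSON strings for columns typed STRING or JSON, using orjson. The conversion in isolation, with an illustrative column layout:

```python
# Sketch of the dict/list-to-string cast added above: orjson.dumps returns
# bytes, so the value is decoded to UTF-8 before landing in a STRING/JSON
# column. The column name here is illustrative only.
import orjson

row = {"id": 1, "payload": {"a": [1, 2, 3], "b": "text"}}

for col_name in ("payload",):  # columns typed STRING or JSON
    value = row[col_name]
    if isinstance(value, (dict, list)):
        row[col_name] = orjson.dumps(value).decode("utf-8")

print(row["payload"])  # '{"a":[1,2,3],"b":"text"}'
```
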
@@ -143,15 +201,102 @@ class BigQueryStreamingV2Destination(AbstractDestination):
     @staticmethod
     def to_protobuf_serialization(TableRowClass: Type[Message], row: dict) -> bytes:
         """Convert a row to a Protobuf serialization."""
-        record = ParseDict(row, TableRowClass())
-        return record.SerializeToString()
+        try:
+            record = ParseDict(row, TableRowClass())
+        except ParseError as e:
+            logger.error(f"Error serializing record: {e} for row: {row}.")
+            raise e
 
-    def load_to_bigquery_via_streaming(self, df_destination_records: pl.DataFrame) -> str:
+        try:
+            serialized_record = record.SerializeToString()
+        except EncodeError as e:
+            logger.error(f"Error serializing record: {e} for row: {row}.")
+            raise e
+        return serialized_record
 
-        # TODO: for now no clustering keys
-        clustering_keys = []
+    @staticmethod
+    def from_protobuf_serialization(
+        TableRowClass: Type[Message],
+        serialized_data: bytes,
+    ) -> dict:
+        """Convert protobuf serialization back to a dictionary."""
+        record = TableRowClass()
+        record.ParseFromString(serialized_data)
+        return MessageToDict(record, preserving_proto_field_name=True)
+
+    @retry(
+        retry=retry_if_exception_type(
+            (
+                ServerError,
+                ServiceUnavailable,
+                SSLError,
+                ConnectionError,
+                Timeout,
+                RetryError,
+                urllib3.exceptions.ProtocolError,
+                urllib3.exceptions.SSLError,
+            )
+        ),
+        wait=wait_exponential(multiplier=2, min=4, max=120),
+        stop=stop_after_attempt(8),
+        before_sleep=lambda retry_state: logger.warning(
+            f"Attempt {retry_state.attempt_number} failed. Retrying in {retry_state.next_action.sleep} seconds..."
+        ),
+    )
+    def process_streaming_batch(
+        self,
+        stream_name: str,
+        proto_schema: ProtoSchema,
+        batch: dict,
+        table_row_class: Type[Message],
+    ) -> List[Tuple[str, str]]:
+        """Process a single batch for streaming and/or large rows with retry logic."""
+        results = []
+        try:
+            # Handle streaming batch
+            if batch.get("stream_batch") and len(batch["stream_batch"]) > 0:
+                result = self.append_rows_to_stream(stream_name, proto_schema, batch["stream_batch"])
+                results.append(("streaming", result))
+
+            # Handle large rows batch
+            if batch.get("json_batch") and len(batch["json_batch"]) > 0:
+                # Deserialize protobuf bytes back to JSON for the load job
+                deserialized_rows = []
+                for serialized_row in batch["json_batch"]:
+                    deserialized_row = self.from_protobuf_serialization(table_row_class, serialized_row)
+                    deserialized_rows.append(deserialized_row)
+
+                # For large rows, we need to use the main client
+                job_config = bigquery.LoadJobConfig(
+                    source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
+                    schema=self.bq_client.get_table(self.table_id).schema,
+                    ignore_unknown_values=True,
+                )
+                load_job = self.bq_client.load_table_from_json(
+                    deserialized_rows, self.table_id, job_config=job_config, timeout=300
+                )
+                result = load_job.result()
+                if load_job.state != "DONE":
+                    raise Exception(f"Failed to load rows to BigQuery: {load_job.errors}")
+
+                # Track large rows
+                self.monitor.track_large_records_synced(
+                    num_records=len(batch["json_batch"]), extra_tags={"destination_id": self.destination_id}
+                )
+
+                results.append(("large_rows", "DONE"))
+
+            if not results:
+                results.append(("empty", "SKIPPED"))
 
-        # Create table if it doesnt exist
+            return results
+        except Exception as e:
+            logger.error(f"Error processing batch: {str(e)}")
+            raise
+
+    def load_to_bigquery_via_streaming(self, df_destination_records: pl.DataFrame) -> str:
+
+        # Create table if it does not exist
         schema = self.get_bigquery_schema()
         table = bigquery.Table(self.table_id, schema=schema)
         time_partitioning = TimePartitioning(
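
Rows are serialized to protobuf for the Storage Write API, and oversized rows are deserialized back into dicts so they can go through a JSON load job instead. A round-trip sketch using the well-known `Struct` message as a stand-in for the dynamically generated `TableRow` class:

```python
# Round-trip sketch of the (de)serialization helpers above. Struct is only
# a stand-in here; bizon builds its TableRow class from proto_utils.
from google.protobuf.json_format import MessageToDict, ParseDict
from google.protobuf.struct_pb2 import Struct

row = {"id": "42", "payload": "hello"}

# to_protobuf_serialization: dict -> message -> bytes
message = ParseDict(row, Struct())
serialized = message.SerializeToString()

# from_protobuf_serialization: bytes -> message -> dict
restored = Struct()
restored.ParseFromString(serialized)
print(MessageToDict(restored))  # {'id': '42', 'payload': 'hello'}
```
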
@@ -159,31 +304,43 @@
         )
         table.time_partitioning = time_partitioning
 
-        # Override bigquery client with project's destination id
-        if self.destination_id:
-            project, dataset, table_name = self.destination_id.split(".")
-            self.bq_client = bigquery.Client(project=project)
-
-        table = self.bq_client.create_table(table, exists_ok=True)
+        if self.clustering_keys and self.clustering_keys[self.destination_id]:
+            table.clustering_fields = self.clustering_keys[self.destination_id]
+        try:
+            table = self.bq_client.create_table(table)
+        except Conflict:
+            table = self.bq_client.get_table(self.table_id)
+            # Compare and update schema if needed
+            existing_fields = {field.name: field for field in table.schema}
+            new_fields = {field.name: field for field in self.get_bigquery_schema()}
+
+            # Find fields that need to be added
+            fields_to_add = [field for name, field in new_fields.items() if name not in existing_fields]
+
+            if fields_to_add:
+                logger.warning(f"Adding new fields to table schema: {[field.name for field in fields_to_add]}")
+                updated_schema = table.schema + fields_to_add
+                table.schema = updated_schema
+                table = self.bq_client.update_table(table, ["schema"])
 
         # Create the stream
         if self.destination_id:
             project, dataset, table_name = self.destination_id.split(".")
-            write_client = bigquery_storage_v1.BigQueryWriteClient()
-            parent = write_client.table_path(project, dataset, table_name)
+            parent = BigQueryWriteClient.table_path(project, dataset, table_name)
         else:
-            write_client = self.bq_storage_client
-            parent = write_client.table_path(self.project_id, self.dataset_id, self.destination_id)
+            parent = BigQueryWriteClient.table_path(self.project_id, self.dataset_id, self.destination_id)
 
         stream_name = f"{parent}/_default"
 
         # Generating the protocol buffer representation of the message descriptor.
-        proto_schema, TableRow = get_proto_schema_and_class(schema, clustering_keys)
+        proto_schema, TableRow = get_proto_schema_and_class(schema)
 
         if self.config.unnest:
             serialized_rows = [
-                self.to_protobuf_serialization(TableRowClass=TableRow, row=self.safe_cast_record_values(row))
-                for row in df_destination_records["source_data"].str.json_decode(infer_schema_length=None).to_list()
+                self.to_protobuf_serialization(
+                    TableRowClass=TableRow, row=self.safe_cast_record_values(orjson.loads(row))
+                )
+                for row in df_destination_records["source_data"].to_list()
             ]
         else:
             df_destination_records = df_destination_records.with_columns(
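
Table creation now handles a `Conflict` by fetching the existing table and appending any fields it is missing, patching only the `schema` property. A sketch of that add-only schema evolution against a placeholder table; note that BigQuery only accepts additive changes (new NULLABLE or REPEATED columns) through this path:

```python
# Sketch of the add-only schema evolution above. Project/dataset/table
# names are placeholders, and the desired schema is illustrative.
from google.cloud import bigquery

client = bigquery.Client()
table = client.get_table("my-project.my_dataset.my_table")   # placeholder

desired = [
    bigquery.SchemaField("id", "STRING"),
    bigquery.SchemaField("payload", "STRING"),
    bigquery.SchemaField("new_col", "STRING"),                # not yet on the table
]

existing = {field.name for field in table.schema}
fields_to_add = [field for field in desired if field.name not in existing]

if fields_to_add:
    table.schema = list(table.schema) + fields_to_add
    table = client.update_table(table, ["schema"])            # patch only the schema
```
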
@@ -207,16 +364,43 @@
                 for row in df_destination_records.iter_rows(named=True)
             ]
 
-        results = []
-        with ThreadPoolExecutor() as executor:
-            futures = [
-                executor.submit(self.append_rows_to_stream, write_client, stream_name, proto_schema, batch_rows)
-                for batch_rows in self.batch(serialized_rows)
-            ]
-            for future in futures:
-                results.append(future.result())
+        streaming_results = []
+        large_rows_results = []
+
+        # Collect all batches first
+        batches = list(self.batch(serialized_rows))
+
+        # Use ThreadPoolExecutor for parallel processing
+        max_workers = min(len(batches), self.config.max_concurrent_threads)
+        logger.info(f"Processing {len(batches)} batches with {max_workers} concurrent threads")
+
+        try:
+            with ThreadPoolExecutor(max_workers=max_workers) as executor:
+                # Submit all batch processing tasks
+                future_to_batch = {
+                    executor.submit(self.process_streaming_batch, stream_name, proto_schema, batch, TableRow): batch
+                    for batch in batches
+                }
 
-        assert all([r == "OK" for r in results]) is True, "Failed to append rows to stream"
+                # Collect results as they complete
+                for future in as_completed(future_to_batch):
+                    batch_results = future.result()
+                    for batch_type, result in batch_results:
+                        if batch_type == "streaming":
+                            streaming_results.append(result)
+                        if batch_type == "large_rows":
+                            large_rows_results.append(result)
+
+        except Exception as e:
+            logger.error(f"Error in multithreaded batch processing: {str(e)}, type: {type(e)}")
+            if isinstance(e, RetryError):
+                logger.error(f"Retry error details: {e.cause if hasattr(e, 'cause') else 'No cause available'}")
+            raise
+
+        if len(streaming_results) > 0:
+            assert all([r == "OK" for r in streaming_results]) is True, "Failed to append rows to stream"
+        if len(large_rows_results) > 0:
+            assert all([r == "DONE" for r in large_rows_results]) is True, "Failed to load rows to BigQuery"
 
     def write_records(self, df_destination_records: pl.DataFrame) -> Tuple[bool, str]:
         self.load_to_bigquery_via_streaming(df_destination_records=df_destination_records)
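
Batches are now processed concurrently: one task per batch, a worker cap taken from `max_concurrent_threads`, and results collected with `as_completed` before being split into streaming and large-row outcomes. A stdlib sketch of the fan-out/fan-in pattern, with `process` standing in for `process_streaming_batch`:

```python
# Stdlib sketch of the pattern above: submit one task per batch, cap the
# worker count, and gather results as they complete. `process` is a
# stand-in, not bizon's method.
from concurrent.futures import ThreadPoolExecutor, as_completed

batches = [list(range(i, i + 3)) for i in range(0, 12, 3)]


def process(batch: list) -> tuple:
    return ("streaming", sum(batch))


max_workers = min(len(batches), 4)   # 4 mirrors a configurable thread cap
results = []

with ThreadPoolExecutor(max_workers=max_workers) as executor:
    future_to_batch = {executor.submit(process, batch): batch for batch in batches}
    for future in as_completed(future_to_batch):
        results.append(future.result())  # re-raises any worker exception here

print(results)
```
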
@@ -236,7 +420,7 @@ class BigQueryStreamingV2Destination(AbstractDestination):
 
             # If adding this item would exceed either limit, yield current batch and start new one
             if (
-                len(current_batch) >= self.MAX_ROWS_PER_REQUEST
+                len(current_batch) >= self.bq_max_rows_per_request
                 or current_batch_size + item_size > self.MAX_REQUEST_SIZE_BYTES
             ):
                 logger.debug(f"Yielding batch of {len(current_batch)} rows, size: {current_batch_size/1024/1024:.2f}MB")
@@ -247,15 +431,16 @@ class BigQueryStreamingV2Destination(AbstractDestination):
 
             if item_size > self.MAX_ROW_SIZE_BYTES:
                 large_rows.append(item)
-                logger.debug(f"Large row detected: {item_size} bytes")
+                logger.warning(f"Large row detected: {item_size} bytes")
             else:
                 current_batch.append(item)
                 current_batch_size += item_size
 
        # Yield the last batch
        if current_batch:
-            logger.debug(
+            logger.info(
                 f"Yielding streaming batch of {len(current_batch)} rows, size: {current_batch_size/1024/1024:.2f}MB"
             )
-            logger.debug(f"Yielding large rows batch of {len(large_rows)} rows")
+            if large_rows:
+                logger.warning(f"Yielding large rows batch of {len(large_rows)} rows")
         yield {"stream_batch": current_batch, "json_batch": large_rows}
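
The batcher caps each request by the configurable `bq_max_rows_per_request` and by total serialized size, and diverts rows above the per-row limit into a separate `json_batch` flushed with the final yield. A simplified sketch of those rules (limits shown are illustrative):

```python
# Simplified sketch of the batching rules above: cap batches by row count
# and by total bytes, and divert oversized rows into a large-row bucket.
MAX_ROWS_PER_REQUEST = 5000
MAX_REQUEST_SIZE_BYTES = int(9.5 * 1024 * 1024)
MAX_ROW_SIZE_BYTES = 8 * 1024 * 1024


def batch(serialized_rows):
    current, current_size, large_rows = [], 0, []
    for item in serialized_rows:
        item_size = len(item)
        if current and (
            len(current) >= MAX_ROWS_PER_REQUEST
            or current_size + item_size > MAX_REQUEST_SIZE_BYTES
        ):
            yield {"stream_batch": current, "json_batch": []}
            current, current_size = [], 0
        if item_size > MAX_ROW_SIZE_BYTES:
            large_rows.append(item)      # too big for the streaming path
        else:
            current.append(item)
            current_size += item_size
    if current or large_rows:
        yield {"stream_batch": current, "json_batch": large_rows}


print(sum(len(b["stream_batch"]) for b in batch([b"x" * 100] * 12000)))  # 12000
```
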
bizon/connectors/destinations/bigquery_streaming_v2/src/proto_utils.py CHANGED
@@ -32,9 +32,7 @@ def map_bq_type_to_field_descriptor(bq_type: str) -> int:
     return type_map.get(bq_type, FieldDescriptorProto.TYPE_STRING)  # Default to TYPE_STRING
 
 
-def get_proto_schema_and_class(
-    bq_schema: List[SchemaField], clustering_keys: List[str] = None
-) -> Tuple[ProtoSchema, Type[Message]]:
+def get_proto_schema_and_class(bq_schema: List[SchemaField]) -> Tuple[ProtoSchema, Type[Message]]:
     """Generate a ProtoSchema and a TableRow class for unnested BigQuery schema."""
     # Define the FileDescriptorProto
     file_descriptor_proto = FileDescriptorProto()
@@ -60,16 +58,6 @@ def get_proto_schema_and_class(
         for col in bq_schema
     ]
 
-    if clustering_keys:
-        for key in clustering_keys:
-            fields.append(
-                {
-                    "name": key,
-                    "type": FieldDescriptorProto.TYPE_STRING,
-                    "label": FieldDescriptorProto.LABEL_OPTIONAL,
-                }
-            )
-
     for i, field in enumerate(fields, start=1):
         field_descriptor = message_descriptor.field.add()
         field_descriptor.name = field["name"]
bizon/connectors/destinations/file/src/config.py CHANGED
@@ -20,4 +20,5 @@ class FileDestinationDetailsConfig(AbstractDestinationDetailsConfig):
 
 class FileDestinationConfig(AbstractDestinationConfig):
     name: Literal[DestinationTypes.FILE]
+    alias: str = "file"
     config: FileDestinationDetailsConfig
bizon/connectors/destinations/file/src/destination.py CHANGED
@@ -6,6 +6,7 @@ import polars as pl
 from bizon.common.models import SyncMetadata
 from bizon.destination.destination import AbstractDestination
 from bizon.engine.backend.backend import AbstractBackend
+from bizon.monitoring.monitor import AbstractMonitor
 from bizon.source.callback import AbstractSourceCallback
 
 from .config import FileDestinationDetailsConfig
@@ -19,8 +20,9 @@ class FileDestination(AbstractDestination):
         config: FileDestinationDetailsConfig,
         backend: AbstractBackend,
         source_callback: AbstractSourceCallback,
+        monitor: AbstractMonitor,
     ):
-        super().__init__(sync_metadata, config, backend, source_callback)
+        super().__init__(sync_metadata, config, backend, source_callback, monitor)
         self.config: FileDestinationDetailsConfig = config
 
     def check_connection(self) -> bool:
bizon/connectors/destinations/logger/src/config.py CHANGED
@@ -15,4 +15,5 @@ class LoggerDestinationConfig(AbstractDestinationDetailsConfig):
 
 class LoggerConfig(AbstractDestinationConfig):
     name: Literal[DestinationTypes.LOGGER]
+    alias: str = "logger"
     config: LoggerDestinationConfig
bizon/connectors/destinations/logger/src/destination.py CHANGED
@@ -6,6 +6,7 @@ from loguru import logger
 from bizon.common.models import SyncMetadata
 from bizon.destination.destination import AbstractDestination
 from bizon.engine.backend.backend import AbstractBackend
+from bizon.monitoring.monitor import AbstractMonitor
 from bizon.source.callback import AbstractSourceCallback
 
 from .config import LoggerDestinationConfig
@@ -19,12 +20,14 @@ class LoggerDestination(AbstractDestination):
         config: LoggerDestinationConfig,
         backend: AbstractBackend,
         source_callback: AbstractSourceCallback,
+        monitor: AbstractMonitor,
     ):
         super().__init__(
             sync_metadata=sync_metadata,
             config=config,
             backend=backend,
             source_callback=source_callback,
+            monitor=monitor,
         )
 
     def check_connection(self) -> bool:
bizon/connectors/sources/kafka/config/kafka.example.yml CHANGED
@@ -8,8 +8,6 @@ source:
 
   topic: my-topic
 
-  nb_bytes_schema_id: 8
-
   batch_size: 1000
   consumer_timeout: 10
   bootstrap_servers: <bootstrap-severs>:9092
@@ -47,4 +45,4 @@ destination:
   # syncCursorInDBEvery: 100
 
 # runner:
-#   log_level: INFO
+#   log_level: INFO
bizon/connectors/sources/kafka/config/kafka_debezium.example.yml CHANGED
@@ -10,8 +10,6 @@ source:
 
   topic: <TOPIC_NAME>
 
-  nb_bytes_schema_id: 8
-
   batch_size: 1000
   consumer_timeout: 10
   bootstrap_servers: <BOOTSTRAP_SERVERS>
@@ -109,4 +107,4 @@ engine:
   queue:
     type: python_queue
     config:
-      max_nb_messages: 1000000
+      max_nb_messages: 1000000
bizon/connectors/sources/kafka/src/config.py CHANGED
@@ -66,10 +66,4 @@ class KafkaSourceConfig(SourceConfig):
 
     message_encoding: str = Field(default=MessageEncoding.AVRO, description="Encoding to use to decode the message")
 
-    # Schema ID header configuration
-    nb_bytes_schema_id: Literal[4, 8] = Field(
-        description="Number of bytes encode SchemaID in Kafka message. Standard is 4.",
-        default=4,
-    )
-
     authentication: KafkaAuthConfig = Field(..., description="Authentication configuration")
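
The `nb_bytes_schema_id` option disappears from the Kafka source config and from both example YAMLs. For background, Confluent's standard wire format prefixes each Avro payload with a magic byte (0x00) and a big-endian 4-byte schema ID; how bizon's `decode.py` handles this in 0.1.2 is not shown in this section, so the sketch below is illustrative background only, not the package's code:

```python
# Hedged sketch of the standard Confluent wire format that the removed
# `nb_bytes_schema_id` option used to make configurable: one magic byte
# (0x00), then a big-endian 4-byte schema ID, then the Avro body.
import struct


def split_confluent_header(message: bytes):
    magic, schema_id = struct.unpack(">bI", message[:5])
    if magic != 0:
        raise ValueError(f"Unexpected magic byte: {magic}")
    return schema_id, message[5:]


schema_id, payload = split_confluent_header(b"\x00\x00\x00\x00\x2a" + b"avro-bytes")
print(schema_id)  # 42
```
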