bizon-0.0.14-py3-none-any.whl → bizon-0.1.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114)
  1. bizon/alerting/__init__.py +0 -0
  2. bizon/alerting/alerts.py +23 -0
  3. bizon/alerting/models.py +28 -0
  4. bizon/alerting/slack/__init__.py +0 -0
  5. bizon/alerting/slack/config.py +5 -0
  6. bizon/alerting/slack/handler.py +39 -0
  7. bizon/cli/main.py +7 -3
  8. bizon/common/models.py +31 -7
  9. bizon/{destinations → connectors/destinations}/bigquery/config/bigquery.example.yml +3 -4
  10. bizon/connectors/destinations/bigquery/src/config.py +127 -0
  11. bizon/{destinations → connectors/destinations}/bigquery/src/destination.py +46 -25
  12. bizon/connectors/destinations/bigquery_streaming/src/config.py +56 -0
  13. bizon/connectors/destinations/bigquery_streaming/src/destination.py +372 -0
  14. bizon/connectors/destinations/bigquery_streaming_v2/src/config.py +52 -0
  15. bizon/connectors/destinations/bigquery_streaming_v2/src/destination.py +261 -0
  16. bizon/{destinations/bigquery_streaming → connectors/destinations/bigquery_streaming_v2}/src/proto_utils.py +32 -26
  17. bizon/{destinations → connectors/destinations}/file/src/config.py +8 -3
  18. bizon/connectors/destinations/file/src/destination.py +54 -0
  19. bizon/{destinations → connectors/destinations}/logger/src/config.py +1 -1
  20. bizon/{destinations → connectors/destinations}/logger/src/destination.py +15 -3
  21. bizon/connectors/sources/cycle/config/cycle.example.yml +15 -0
  22. bizon/connectors/sources/cycle/src/source.py +133 -0
  23. bizon/{sources/periscope/tests/periscope_pipeline_dashboard.py → connectors/sources/cycle/tests/cycle_customers.py} +1 -1
  24. bizon/connectors/sources/dummy/config/dummy.example.yml +22 -0
  25. bizon/{sources → connectors/sources}/dummy/src/fake_api.py +6 -1
  26. bizon/{sources → connectors/sources}/dummy/src/source.py +18 -5
  27. bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline.py +5 -14
  28. bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_bigquery_backend.py +2 -2
  29. bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_kafka.py +2 -2
  30. bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_rabbitmq.py +2 -2
  31. bizon/connectors/sources/dummy/tests/dummy_pipeline_unnest.py +29 -0
  32. bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_write_data_bigquery.py +3 -3
  33. bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_write_data_bigquery_through_kafka.py +2 -2
  34. bizon/{sources → connectors/sources}/gsheets/config/default_auth.example.yml +4 -2
  35. bizon/{sources → connectors/sources}/gsheets/config/service_account.example.yml +4 -2
  36. bizon/{sources → connectors/sources}/hubspot/config/api_key.example.yml +4 -2
  37. bizon/{sources → connectors/sources}/hubspot/config/oauth.example.yml +4 -2
  38. bizon/{sources → connectors/sources}/hubspot/src/hubspot_objects.py +1 -1
  39. bizon/connectors/sources/kafka/config/kafka.example.yml +50 -0
  40. bizon/connectors/sources/kafka/config/kafka_debezium.example.yml +112 -0
  41. bizon/connectors/sources/kafka/src/callback.py +18 -0
  42. bizon/connectors/sources/kafka/src/config.py +75 -0
  43. bizon/connectors/sources/kafka/src/decode.py +88 -0
  44. bizon/connectors/sources/kafka/src/source.py +361 -0
  45. bizon/connectors/sources/kafka/tests/kafka_pipeline.py +7 -0
  46. bizon/connectors/sources/periscope/config/periscope_charts.example.yml +20 -0
  47. bizon/connectors/sources/periscope/config/periscope_dashboards.example.yml +20 -0
  48. bizon/{sources → connectors/sources}/periscope/src/source.py +136 -13
  49. bizon/connectors/sources/periscope/tests/periscope_pipeline_dashboard.py +9 -0
  50. bizon/connectors/sources/pokeapi/config/pokeapi_pokemon_to_json.example.yml +19 -0
  51. bizon/connectors/sources/pokeapi/config/pokeapi_pokemon_to_logger.example.yml +10 -0
  52. bizon/connectors/sources/pokeapi/src/source.py +79 -0
  53. bizon/{destinations → destination}/buffer.py +5 -0
  54. bizon/destination/config.py +74 -0
  55. bizon/{destinations → destination}/destination.py +71 -15
  56. bizon/engine/backend/adapters/sqlalchemy/backend.py +14 -23
  57. bizon/engine/engine.py +20 -1
  58. bizon/engine/pipeline/consumer.py +73 -5
  59. bizon/engine/pipeline/models.py +8 -3
  60. bizon/engine/pipeline/producer.py +18 -9
  61. bizon/engine/queue/adapters/kafka/consumer.py +2 -2
  62. bizon/engine/queue/adapters/kafka/queue.py +3 -2
  63. bizon/engine/queue/adapters/python_queue/consumer.py +40 -23
  64. bizon/engine/queue/adapters/python_queue/queue.py +19 -9
  65. bizon/engine/queue/adapters/rabbitmq/consumer.py +3 -6
  66. bizon/engine/queue/adapters/rabbitmq/queue.py +3 -2
  67. bizon/engine/queue/config.py +16 -0
  68. bizon/engine/queue/queue.py +17 -16
  69. bizon/engine/runner/adapters/process.py +15 -2
  70. bizon/engine/runner/adapters/streaming.py +103 -0
  71. bizon/engine/runner/adapters/thread.py +32 -9
  72. bizon/engine/runner/config.py +28 -0
  73. bizon/engine/runner/runner.py +107 -25
  74. bizon/monitoring/__init__.py +0 -0
  75. bizon/monitoring/config.py +29 -0
  76. bizon/monitoring/datadog/__init__.py +0 -0
  77. bizon/monitoring/datadog/monitor.py +69 -0
  78. bizon/monitoring/monitor.py +42 -0
  79. bizon/monitoring/noop/__init__.py +0 -0
  80. bizon/monitoring/noop/monitor.py +11 -0
  81. bizon/source/callback.py +24 -0
  82. bizon/source/config.py +3 -3
  83. bizon/source/cursor.py +1 -1
  84. bizon/source/discover.py +4 -3
  85. bizon/source/models.py +4 -2
  86. bizon/source/source.py +10 -2
  87. bizon/transform/config.py +8 -0
  88. bizon/transform/transform.py +48 -0
  89. bizon-0.1.1.dist-info/LICENSE +674 -0
  90. {bizon-0.0.14.dist-info → bizon-0.1.1.dist-info}/METADATA +25 -7
  91. bizon-0.1.1.dist-info/RECORD +123 -0
  92. {bizon-0.0.14.dist-info → bizon-0.1.1.dist-info}/WHEEL +1 -1
  93. bizon/destinations/bigquery/src/config.py +0 -51
  94. bizon/destinations/bigquery_streaming/src/config.py +0 -43
  95. bizon/destinations/bigquery_streaming/src/destination.py +0 -154
  96. bizon/destinations/config.py +0 -47
  97. bizon/destinations/file/src/destination.py +0 -27
  98. bizon/sources/dummy/config/api_key.example.yml +0 -20
  99. bizon/sources/dummy/config/api_key_kafka.example.yml +0 -27
  100. bizon/sources/kafka/config/kafka.example.yml +0 -38
  101. bizon/sources/kafka/src/source.py +0 -357
  102. bizon/sources/kafka/tests/kafka_pipeline.py +0 -9
  103. bizon/sources/periscope/config/periscope_charts.example.yml +0 -26
  104. bizon/sources/periscope/config/periscope_dashboards.example.yml +0 -26
  105. bizon-0.0.14.dist-info/LICENSE +0 -21
  106. bizon-0.0.14.dist-info/RECORD +0 -94
  107. /bizon/{sources → connectors/sources}/gsheets/src/source.py +0 -0
  108. /bizon/{sources → connectors/sources}/gsheets/tests/gsheets_pipeline.py +0 -0
  109. /bizon/{sources → connectors/sources}/hubspot/src/hubspot_base.py +0 -0
  110. /bizon/{sources → connectors/sources}/hubspot/src/models/hs_object.py +0 -0
  111. /bizon/{sources → connectors/sources}/hubspot/tests/hubspot_pipeline.py +0 -0
  112. /bizon/{sources → connectors/sources}/periscope/tests/periscope_pipeline_charts.py +0 -0
  113. /bizon/{destinations → destination}/models.py +0 -0
  114. {bizon-0.0.14.dist-info → bizon-0.1.1.dist-info}/entry_points.txt +0 -0
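The file listing above reflects a package restructure: source and destination connectors now live under bizon/connectors/, and the bizon/destinations package was renamed to bizon/destination. A hedged import-path sketch for downstream code, assuming the removed bizon.destinations.config module exported the same names as the new bizon.destination.config shown later in this diff:

# Before (0.0.14) -- assumed layout, inferred from the removed bizon/destinations/config.py
# from bizon.destinations.config import AbstractDestinationConfig

# After (0.1.1) -- paths taken from the new files in this diff
from bizon.destination.config import AbstractDestinationConfig
from bizon.connectors.destinations.bigquery_streaming_v2.src.config import (
    BigQueryStreamingV2Config,
)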
bizon/connectors/destinations/bigquery_streaming/src/destination.py (new file)
@@ -0,0 +1,372 @@
+ import os
+ import tempfile
+ from datetime import datetime
+ from typing import List, Tuple
+
+ import orjson
+ import polars as pl
+ import urllib3.exceptions
+ from google.api_core.exceptions import (
+     Conflict,
+     NotFound,
+     RetryError,
+     ServerError,
+     ServiceUnavailable,
+ )
+ from google.cloud import bigquery, bigquery_storage_v1
+ from google.cloud.bigquery import DatasetReference, TimePartitioning
+ from google.cloud.bigquery_storage_v1.types import (
+     AppendRowsRequest,
+     ProtoRows,
+     ProtoSchema,
+ )
+ from loguru import logger
+ from requests.exceptions import ConnectionError, SSLError, Timeout
+ from tenacity import (
+     retry,
+     retry_if_exception_type,
+     stop_after_attempt,
+     wait_exponential,
+ )
+
+ from bizon.common.models import SyncMetadata
+ from bizon.connectors.destinations.bigquery.src.config import (
+     BigQueryColumnMode,
+     BigQueryColumnType,
+ )
+ from bizon.destination.destination import AbstractDestination
+ from bizon.engine.backend.backend import AbstractBackend
+ from bizon.source.callback import AbstractSourceCallback
+
+ from .config import BigQueryStreamingConfigDetails
+
+
+ class BigQueryStreamingDestination(AbstractDestination):
+
+     # Add constants for limits
+     MAX_ROWS_PER_REQUEST = 5000  # 5000 (max is 10000)
+     MAX_REQUEST_SIZE_BYTES = 5 * 1024 * 1024  # 5 MB (max is 10MB)
+     MAX_ROW_SIZE_BYTES = 0.9 * 1024 * 1024  # 1 MB
+
+     def __init__(
+         self,
+         sync_metadata: SyncMetadata,
+         config: BigQueryStreamingConfigDetails,
+         backend: AbstractBackend,
+         source_callback: AbstractSourceCallback,
+     ):  # type: ignore
+         super().__init__(sync_metadata, config, backend, source_callback)
+         self.config: BigQueryStreamingConfigDetails = config
+
+         if config.authentication and config.authentication.service_account_key:
+             with tempfile.NamedTemporaryFile(delete=False) as temp:
+                 temp.write(config.authentication.service_account_key.encode())
+                 temp_file_path = temp.name
+                 os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = temp_file_path
+
+         self.project_id = config.project_id
+         self.bq_client = bigquery.Client(project=self.project_id)
+         self.bq_storage_client = bigquery_storage_v1.BigQueryWriteClient()
+         self.dataset_id = config.dataset_id
+         self.dataset_location = config.dataset_location
+         self.bq_max_rows_per_request = config.bq_max_rows_per_request
+
+     @property
+     def table_id(self) -> str:
+         tabled_id = f"{self.sync_metadata.source_name}_{self.sync_metadata.stream_name}"
+         return self.destination_id or f"{self.project_id}.{self.dataset_id}.{tabled_id}"
+
+     def get_bigquery_schema(self) -> List[bigquery.SchemaField]:
+
+         if self.config.unnest:
+             if len(list(self.record_schemas.keys())) == 1:
+                 self.destination_id = list(self.record_schemas.keys())[0]
+
+             return [
+                 bigquery.SchemaField(
+                     name=col.name,
+                     field_type=col.type,
+                     mode=col.mode,
+                     description=col.description,
+                     default_value_expression=col.default_value_expression,
+                 )
+                 for col in self.record_schemas[self.destination_id]
+             ]
+
+         # Case we don't unnest the data
+         else:
+             return [
+                 bigquery.SchemaField(
+                     "_source_record_id",
+                     BigQueryColumnType.STRING,
+                     mode=BigQueryColumnMode.REQUIRED,
+                     description="The source record id",
+                 ),
+                 bigquery.SchemaField(
+                     "_source_timestamp",
+                     BigQueryColumnType.TIMESTAMP,
+                     mode=BigQueryColumnMode.REQUIRED,
+                     description="The source timestamp",
+                 ),
+                 bigquery.SchemaField(
+                     "_source_data",
+                     BigQueryColumnType.JSON,
+                     mode=BigQueryColumnMode.NULLABLE,
+                     description="The source data",
+                 ),
+                 bigquery.SchemaField(
+                     "_bizon_extracted_at",
+                     BigQueryColumnType.TIMESTAMP,
+                     mode=BigQueryColumnMode.REQUIRED,
+                     description="The bizon extracted at",
+                 ),
+                 bigquery.SchemaField(
+                     "_bizon_loaded_at",
+                     BigQueryColumnType.TIMESTAMP,
+                     mode=BigQueryColumnMode.REQUIRED,
+                     default_value_expression="CURRENT_TIMESTAMP()",
+                     description="The bizon loaded at",
+                 ),
+                 bigquery.SchemaField(
+                     "_bizon_id",
+                     BigQueryColumnType.STRING,
+                     mode=BigQueryColumnMode.REQUIRED,
+                     description="The bizon id",
+                 ),
+             ]
+
+     def check_connection(self) -> bool:
+         dataset_ref = DatasetReference(self.project_id, self.dataset_id)
+
+         try:
+             self.bq_client.get_dataset(dataset_ref)
+         except NotFound:
+             dataset = bigquery.Dataset(dataset_ref)
+             dataset.location = self.dataset_location
+             dataset = self.bq_client.create_dataset(dataset)
+         return True
+
+     def append_rows_to_stream(
+         self,
+         write_client: bigquery_storage_v1.BigQueryWriteClient,
+         stream_name: str,
+         proto_schema: ProtoSchema,
+         serialized_rows: List[bytes],
+     ):
+         request = AppendRowsRequest(
+             write_stream=stream_name,
+             proto_rows=AppendRowsRequest.ProtoData(
+                 rows=ProtoRows(serialized_rows=serialized_rows),
+                 writer_schema=proto_schema,
+             ),
+         )
+         response = write_client.append_rows(iter([request]))
+         return response.code().name
+
+     def safe_cast_record_values(self, row: dict):
+         """
+         Safe cast record values to the correct type for BigQuery.
+         """
+         for col in self.record_schemas[self.destination_id]:
+
+             # Handle dicts as strings
+             if col.type in [BigQueryColumnType.STRING, BigQueryColumnType.JSON]:
+                 if isinstance(row[col.name], dict) or isinstance(row[col.name], list):
+                     row[col.name] = orjson.dumps(row[col.name]).decode("utf-8")
+
+             # Handle timestamps
+             if (
+                 col.type in [BigQueryColumnType.TIMESTAMP, BigQueryColumnType.DATETIME]
+                 and col.default_value_expression is None
+             ):
+                 if isinstance(row[col.name], int):
+                     if row[col.name] > datetime(9999, 12, 31).timestamp():
+                         row[col.name] = datetime.fromtimestamp(row[col.name] / 1_000_000).strftime(
+                             "%Y-%m-%d %H:%M:%S.%f"
+                         )
+                     else:
+                         try:
+                             row[col.name] = datetime.fromtimestamp(row[col.name]).strftime("%Y-%m-%d %H:%M:%S.%f")
+                         except ValueError:
+                             error_message = (
+                                 f"Error casting timestamp for destination '{self.destination_id}' column '{col.name}'. "
+                                 f"Invalid timestamp value: {row[col.name]} ({type(row[col.name])}). "
+                                 "Consider using a transformation."
+                             )
+                             logger.error(error_message)
+                             raise ValueError(error_message)
+         return row
+
+     @retry(
+         retry=retry_if_exception_type(
+             (
+                 ServerError,
+                 ServiceUnavailable,
+                 SSLError,
+                 ConnectionError,
+                 Timeout,
+                 RetryError,
+                 urllib3.exceptions.ProtocolError,
+                 urllib3.exceptions.SSLError,
+             )
+         ),
+         wait=wait_exponential(multiplier=2, min=4, max=120),
+         stop=stop_after_attempt(8),
+         before_sleep=lambda retry_state: logger.warning(
+             f"Attempt {retry_state.attempt_number} failed. Retrying in {retry_state.next_action.sleep} seconds..."
+         ),
+     )
+     def _insert_batch(self, table, batch):
+         """Helper method to insert a batch of rows with retry logic"""
+         logger.debug(f"Inserting batch in table {table.table_id}")
+         try:
+             # Handle streaming batch
+             if batch.get("stream_batch") and len(batch["stream_batch"]) > 0:
+                 return self.bq_client.insert_rows_json(
+                     table,
+                     batch["stream_batch"],
+                     row_ids=[None] * len(batch["stream_batch"]),
+                     timeout=300,  # 5 minutes timeout per request
+                 )
+
+             # Handle large rows batch
+             if batch.get("json_batch") and len(batch["json_batch"]) > 0:
+                 job_config = bigquery.LoadJobConfig(
+                     source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
+                     schema=table.schema,
+                     ignore_unknown_values=True,
+                 )
+
+                 load_job = self.bq_client.load_table_from_json(
+                     batch["json_batch"], table, job_config=job_config, timeout=300
+                 )
+                 load_job.result()
+
+                 if load_job.state != "DONE":
+                     raise Exception(f"Failed to load rows to BigQuery: {load_job.errors}")
+
+         except Exception as e:
+             logger.error(f"Error inserting batch: {str(e)}, type: {type(e)}")
+             raise
+
+     def load_to_bigquery_via_legacy_streaming(self, df_destination_records: pl.DataFrame) -> str:
+         # Create table if it does not exist
+         schema = self.get_bigquery_schema()
+         table = bigquery.Table(self.table_id, schema=schema)
+         time_partitioning = TimePartitioning(
+             field=self.config.time_partitioning.field, type_=self.config.time_partitioning.type
+         )
+         table.time_partitioning = time_partitioning
+
+         if self.clustering_keys and self.clustering_keys[self.destination_id]:
+             table.clustering_fields = self.clustering_keys[self.destination_id]
+         try:
+             table = self.bq_client.create_table(table)
+         except Conflict:
+             table = self.bq_client.get_table(self.table_id)
+             # Compare and update schema if needed
+             existing_fields = {field.name: field for field in table.schema}
+             new_fields = {field.name: field for field in self.get_bigquery_schema()}
+
+             # Find fields that need to be added
+             fields_to_add = [field for name, field in new_fields.items() if name not in existing_fields]
+
+             if fields_to_add:
+                 logger.warning(f"Adding new fields to table schema: {[field.name for field in fields_to_add]}")
+                 updated_schema = table.schema + fields_to_add
+                 table.schema = updated_schema
+                 table = self.bq_client.update_table(table, ["schema"])
+
+         if self.config.unnest:
+             # We cannot use the `json_decode` method here because of the issue: https://github.com/pola-rs/polars/issues/22371
+             rows_to_insert = [
+                 self.safe_cast_record_values(orjson.loads(row))
+                 for row in df_destination_records["source_data"].to_list()
+             ]
+         else:
+             df_destination_records = df_destination_records.with_columns(
+                 pl.col("bizon_extracted_at").dt.strftime("%Y-%m-%d %H:%M:%S").alias("bizon_extracted_at"),
+                 pl.col("bizon_loaded_at").dt.strftime("%Y-%m-%d %H:%M:%S").alias("bizon_loaded_at"),
+                 pl.col("source_timestamp").dt.strftime("%Y-%m-%d %H:%M:%S").alias("source_timestamp"),
+             )
+             df_destination_records = df_destination_records.rename(
+                 {
+                     "bizon_id": "_bizon_id",
+                     "bizon_extracted_at": "_bizon_extracted_at",
+                     "bizon_loaded_at": "_bizon_loaded_at",
+                     "source_record_id": "_source_record_id",
+                     "source_timestamp": "_source_timestamp",
+                     "source_data": "_source_data",
+                 }
+             )
+             rows_to_insert = [row for row in df_destination_records.iter_rows(named=True)]
+
+         errors = []
+         for batch in self.batch(rows_to_insert):
+             try:
+                 batch_errors = self._insert_batch(table, batch)
+                 if batch_errors:
+                     errors.extend(batch_errors)
+             except Exception as e:
+                 logger.error(f"Failed to insert batch on destination {self.destination_id} after all retries: {str(e)}")
+                 if isinstance(e, RetryError):
+                     logger.error(f"Retry error details: {e.cause if hasattr(e, 'cause') else 'No cause available'}")
+                 raise
+
+         if errors:
+             logger.error("Encountered errors while inserting rows:")
+             for error in errors:
+                 if error.get("errors") and len(error["errors"]) > 0:
+                     logger.error("The following row failed to be inserted:")
+                     if batch.get("stream_batch") and len(batch["stream_batch"]) > 0:
+                         logger.error(f"{batch['stream_batch'][error['index']]}")
+                     else:
+                         logger.error(f"{batch['json_batch'][error['index']]}")
+                     for error_detail in error["errors"]:
+                         logger.error(f"Location (column): {error_detail['location']}")
+                         logger.error(f"Reason: {error_detail['reason']}")
+                         logger.error(f"Message: {error_detail['message']}")
+             raise Exception(f"Encountered errors while inserting rows: {errors}")
+
+     def write_records(self, df_destination_records: pl.DataFrame) -> Tuple[bool, str]:
+         logger.debug("Using BigQuery legacy streaming API...")
+         self.load_to_bigquery_via_legacy_streaming(df_destination_records=df_destination_records)
+         return True, ""
+
+     def batch(self, iterable):
+         """
+         Yield successive batches respecting both row count and size limits.
+         """
+         current_batch = []
+         current_batch_size = 0
+         large_rows = []
+
+         for item in iterable:
+             # Estimate the size of the item (as JSON)
+             item_size = len(str(item).encode("utf-8"))
+
+             # If adding this item would exceed either limit, yield current batch and start new one
+             if (
+                 len(current_batch) >= self.MAX_ROWS_PER_REQUEST
+                 or current_batch_size + item_size > self.MAX_REQUEST_SIZE_BYTES
+             ):
+                 logger.debug(f"Yielding batch of {len(current_batch)} rows, size: {current_batch_size/1024/1024:.2f}MB")
+                 yield {"stream_batch": current_batch, "json_batch": large_rows}
+                 current_batch = []
+                 current_batch_size = 0
+                 large_rows = []
+
+             if item_size > self.MAX_ROW_SIZE_BYTES:
+                 large_rows.append(item)
+                 logger.debug(f"Large row detected: {item_size} bytes")
+             else:
+                 current_batch.append(item)
+                 current_batch_size += item_size
+
+         # Yield the last batch
+         if current_batch:
+             logger.debug(
+                 f"Yielding streaming batch of {len(current_batch)} rows, size: {current_batch_size/1024/1024:.2f}MB"
+             )
+             logger.debug(f"Yielding large rows batch of {len(large_rows)} rows")
+             yield {"stream_batch": current_batch, "json_batch": large_rows}
bizon/connectors/destinations/bigquery_streaming_v2/src/config.py (new file)
@@ -0,0 +1,52 @@
+ from enum import Enum
+ from typing import Literal, Optional
+
+ from pydantic import BaseModel, Field
+
+ from bizon.connectors.destinations.bigquery.src.config import BigQueryRecordSchemaConfig
+ from bizon.destination.config import (
+     AbstractDestinationConfig,
+     AbstractDestinationDetailsConfig,
+     DestinationTypes,
+ )
+
+
+ class TimePartitioningWindow(str, Enum):
+     DAY = "DAY"
+     HOUR = "HOUR"
+     MONTH = "MONTH"
+     YEAR = "YEAR"
+
+
+ class TimePartitioning(BaseModel):
+     type: TimePartitioningWindow = Field(default=TimePartitioningWindow.DAY, description="Time partitioning type")
+     field: Optional[str] = Field(
+         "_bizon_loaded_at", description="Field to partition by. You can use a transformation to create this field."
+     )
+
+
+ class BigQueryAuthentication(BaseModel):
+     service_account_key: str = Field(
+         description="Service Account Key JSON string. If empty it will be infered",
+         default="",
+     )
+
+
+ class BigQueryStreamingV2ConfigDetails(AbstractDestinationDetailsConfig):
+     project_id: str
+     dataset_id: str
+     dataset_location: Optional[str] = "US"
+     time_partitioning: Optional[TimePartitioning] = Field(
+         default=TimePartitioning(type=TimePartitioningWindow.DAY, field="_bizon_loaded_at"),
+         description="BigQuery Time partitioning type",
+     )
+     authentication: Optional[BigQueryAuthentication] = None
+     bq_max_rows_per_request: Optional[int] = Field(30000, description="Max rows per buffer streaming request.")
+     record_schemas: Optional[list[BigQueryRecordSchemaConfig]] = Field(
+         default=None, description="Schema for the records. Required if unnest is set to true."
+     )
+
+
+ class BigQueryStreamingV2Config(AbstractDestinationConfig):
+     name: Literal[DestinationTypes.BIGQUERY_STREAMING_V2]
+     config: BigQueryStreamingV2ConfigDetails
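BigQueryStreamingV2ConfigDetails only requires project_id and dataset_id; partitioning defaults to daily on _bizon_loaded_at and bq_max_rows_per_request defaults to 30000. A minimal construction sketch, assuming the fields inherited from AbstractDestinationDetailsConfig (not shown in this diff) all have defaults; the identifiers are placeholders:

from bizon.connectors.destinations.bigquery_streaming_v2.src.config import (
    BigQueryStreamingV2ConfigDetails,
)

details = BigQueryStreamingV2ConfigDetails(
    project_id="my-gcp-project",  # placeholder
    dataset_id="bizon_raw",       # placeholder
    dataset_location="EU",
)

print(details.time_partitioning.type)   # TimePartitioningWindow.DAY
print(details.time_partitioning.field)  # _bizon_loaded_at
print(details.bq_max_rows_per_request)  # 30000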
bizon/connectors/destinations/bigquery_streaming_v2/src/destination.py (new file)
@@ -0,0 +1,261 @@
+ import os
+ import tempfile
+ from concurrent.futures import ThreadPoolExecutor
+ from datetime import datetime
+ from typing import List, Tuple, Type
+
+ import polars as pl
+ from google.api_core.exceptions import NotFound
+ from google.cloud import bigquery, bigquery_storage_v1
+ from google.cloud.bigquery import DatasetReference, TimePartitioning
+ from google.cloud.bigquery_storage_v1.types import (
+     AppendRowsRequest,
+     ProtoRows,
+     ProtoSchema,
+ )
+ from google.protobuf.json_format import ParseDict
+ from google.protobuf.message import Message
+ from loguru import logger
+
+ from bizon.common.models import SyncMetadata
+ from bizon.destination.destination import AbstractDestination
+ from bizon.engine.backend.backend import AbstractBackend
+ from bizon.source.callback import AbstractSourceCallback
+
+ from .config import BigQueryStreamingV2ConfigDetails
+ from .proto_utils import get_proto_schema_and_class
+
+
+ class BigQueryStreamingV2Destination(AbstractDestination):
+
+     # Add constants for limits
+     MAX_ROWS_PER_REQUEST = 5000  # 5000 (max is 10000)
+     MAX_REQUEST_SIZE_BYTES = 5 * 1024 * 1024  # 5 MB (max is 10MB)
+     MAX_ROW_SIZE_BYTES = 0.9 * 1024 * 1024  # 1 MB
+
+     def __init__(
+         self,
+         sync_metadata: SyncMetadata,
+         config: BigQueryStreamingV2ConfigDetails,
+         backend: AbstractBackend,
+         source_callback: AbstractSourceCallback,
+     ):  # type: ignore
+         super().__init__(sync_metadata, config, backend, source_callback)
+         self.config: BigQueryStreamingV2ConfigDetails = config
+
+         if config.authentication and config.authentication.service_account_key:
+             with tempfile.NamedTemporaryFile(delete=False) as temp:
+                 temp.write(config.authentication.service_account_key.encode())
+                 temp_file_path = temp.name
+                 os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = temp_file_path
+
+         self.project_id = config.project_id
+         self.bq_client = bigquery.Client(project=self.project_id)
+         self.bq_storage_client = bigquery_storage_v1.BigQueryWriteClient()
+         self.dataset_id = config.dataset_id
+         self.dataset_location = config.dataset_location
+         self.bq_max_rows_per_request = config.bq_max_rows_per_request
+
+     @property
+     def table_id(self) -> str:
+         tabled_id = f"{self.sync_metadata.source_name}_{self.sync_metadata.stream_name}"
+         return self.destination_id or f"{self.project_id}.{self.dataset_id}.{tabled_id}"
+
+     def get_bigquery_schema(self) -> List[bigquery.SchemaField]:
+
+         if self.config.unnest:
+             if len(list(self.record_schemas.keys())) == 1:
+                 self.destination_id = list(self.record_schemas.keys())[0]
+
+             return [
+                 bigquery.SchemaField(
+                     name=col.name,
+                     field_type=col.type,
+                     mode=col.mode,
+                     description=col.description,
+                     default_value_expression=col.default_value_expression,
+                 )
+                 for col in self.record_schemas[self.destination_id]
+             ]
+
+         # Case we don't unnest the data
+         else:
+             return [
+                 bigquery.SchemaField("_source_record_id", "STRING", mode="REQUIRED"),
+                 bigquery.SchemaField("_source_timestamp", "TIMESTAMP", mode="REQUIRED"),
+                 bigquery.SchemaField("_source_data", "JSON", mode="NULLABLE"),
+                 bigquery.SchemaField("_bizon_extracted_at", "TIMESTAMP", mode="REQUIRED"),
+                 bigquery.SchemaField(
+                     "_bizon_loaded_at", "TIMESTAMP", mode="REQUIRED", default_value_expression="CURRENT_TIMESTAMP()"
+                 ),
+                 bigquery.SchemaField("_bizon_id", "STRING", mode="REQUIRED"),
+             ]
+
+     def check_connection(self) -> bool:
+         dataset_ref = DatasetReference(self.project_id, self.dataset_id)
+
+         try:
+             self.bq_client.get_dataset(dataset_ref)
+         except NotFound:
+             dataset = bigquery.Dataset(dataset_ref)
+             dataset.location = self.dataset_location
+             dataset = self.bq_client.create_dataset(dataset)
+         return True
+
+     def append_rows_to_stream(
+         self,
+         write_client: bigquery_storage_v1.BigQueryWriteClient,
+         stream_name: str,
+         proto_schema: ProtoSchema,
+         serialized_rows: List[bytes],
+     ):
+         request = AppendRowsRequest(
+             write_stream=stream_name,
+             proto_rows=AppendRowsRequest.ProtoData(
+                 rows=ProtoRows(serialized_rows=serialized_rows),
+                 writer_schema=proto_schema,
+             ),
+         )
+         response = write_client.append_rows(iter([request]))
+         return response.code().name
+
+     def safe_cast_record_values(self, row: dict):
+         for col in self.record_schemas[self.destination_id]:
+             if col.type in ["TIMESTAMP", "DATETIME"] and col.default_value_expression is None:
+                 if isinstance(row[col.name], int):
+                     if row[col.name] > datetime(9999, 12, 31).timestamp():
+                         row[col.name] = datetime.fromtimestamp(row[col.name] / 1_000_000).strftime(
+                             "%Y-%m-%d %H:%M:%S.%f"
+                         )
+                     else:
+                         try:
+                             row[col.name] = datetime.fromtimestamp(row[col.name]).strftime("%Y-%m-%d %H:%M:%S.%f")
+                         except ValueError:
+                             error_message = (
+                                 f"Error casting timestamp for destination '{self.destination_id}' column '{col.name}'. "
+                                 f"Invalid timestamp value: {row[col.name]} ({type(row[col.name])}). "
+                                 "Consider using a transformation."
+                             )
+                             logger.error(error_message)
+                             raise ValueError(error_message)
+         return row
+
+     @staticmethod
+     def to_protobuf_serialization(TableRowClass: Type[Message], row: dict) -> bytes:
+         """Convert a row to a Protobuf serialization."""
+         record = ParseDict(row, TableRowClass())
+         return record.SerializeToString()
+
+     def load_to_bigquery_via_streaming(self, df_destination_records: pl.DataFrame) -> str:
+
+         # TODO: for now no clustering keys
+         clustering_keys = []
+
+         # Create table if it doesnt exist
+         schema = self.get_bigquery_schema()
+         table = bigquery.Table(self.table_id, schema=schema)
+         time_partitioning = TimePartitioning(
+             field=self.config.time_partitioning.field, type_=self.config.time_partitioning.type
+         )
+         table.time_partitioning = time_partitioning
+
+         # Override bigquery client with project's destination id
+         if self.destination_id:
+             project, dataset, table_name = self.destination_id.split(".")
+             self.bq_client = bigquery.Client(project=project)
+
+         table = self.bq_client.create_table(table, exists_ok=True)
+
+         # Create the stream
+         if self.destination_id:
+             project, dataset, table_name = self.destination_id.split(".")
+             write_client = bigquery_storage_v1.BigQueryWriteClient()
+             parent = write_client.table_path(project, dataset, table_name)
+         else:
+             write_client = self.bq_storage_client
+             parent = write_client.table_path(self.project_id, self.dataset_id, self.destination_id)
+
+         stream_name = f"{parent}/_default"
+
+         # Generating the protocol buffer representation of the message descriptor.
+         proto_schema, TableRow = get_proto_schema_and_class(schema, clustering_keys)
+
+         if self.config.unnest:
+             serialized_rows = [
+                 self.to_protobuf_serialization(TableRowClass=TableRow, row=self.safe_cast_record_values(row))
+                 for row in df_destination_records["source_data"].str.json_decode(infer_schema_length=None).to_list()
+             ]
+         else:
+             df_destination_records = df_destination_records.with_columns(
+                 pl.col("bizon_extracted_at").dt.strftime("%Y-%m-%d %H:%M:%S").alias("bizon_extracted_at"),
+                 pl.col("bizon_loaded_at").dt.strftime("%Y-%m-%d %H:%M:%S").alias("bizon_loaded_at"),
+                 pl.col("source_timestamp").dt.strftime("%Y-%m-%d %H:%M:%S").alias("source_timestamp"),
+             )
+             df_destination_records = df_destination_records.rename(
+                 {
+                     "bizon_id": "_bizon_id",
+                     "bizon_extracted_at": "_bizon_extracted_at",
+                     "bizon_loaded_at": "_bizon_loaded_at",
+                     "source_record_id": "_source_record_id",
+                     "source_timestamp": "_source_timestamp",
+                     "source_data": "_source_data",
+                 }
+             )
+
+             serialized_rows = [
+                 self.to_protobuf_serialization(TableRowClass=TableRow, row=row)
+                 for row in df_destination_records.iter_rows(named=True)
+             ]
+
+         results = []
+         with ThreadPoolExecutor() as executor:
+             futures = [
+                 executor.submit(self.append_rows_to_stream, write_client, stream_name, proto_schema, batch_rows)
+                 for batch_rows in self.batch(serialized_rows)
+             ]
+             for future in futures:
+                 results.append(future.result())
+
+         assert all([r == "OK" for r in results]) is True, "Failed to append rows to stream"
+
+     def write_records(self, df_destination_records: pl.DataFrame) -> Tuple[bool, str]:
+         self.load_to_bigquery_via_streaming(df_destination_records=df_destination_records)
+         return True, ""
+
+     def batch(self, iterable):
+         """
+         Yield successive batches respecting both row count and size limits.
+         """
+         current_batch = []
+         current_batch_size = 0
+         large_rows = []
+
+         for item in iterable:
+             # Estimate the size of the item (as JSON)
+             item_size = len(str(item).encode("utf-8"))
+
+             # If adding this item would exceed either limit, yield current batch and start new one
+             if (
+                 len(current_batch) >= self.MAX_ROWS_PER_REQUEST
+                 or current_batch_size + item_size > self.MAX_REQUEST_SIZE_BYTES
+             ):
+                 logger.debug(f"Yielding batch of {len(current_batch)} rows, size: {current_batch_size/1024/1024:.2f}MB")
+                 yield {"stream_batch": current_batch, "json_batch": large_rows}
+                 current_batch = []
+                 current_batch_size = 0
+                 large_rows = []
+
+             if item_size > self.MAX_ROW_SIZE_BYTES:
+                 large_rows.append(item)
+                 logger.debug(f"Large row detected: {item_size} bytes")
+             else:
+                 current_batch.append(item)
+                 current_batch_size += item_size
+
+         # Yield the last batch
+         if current_batch:
+             logger.debug(
+                 f"Yielding streaming batch of {len(current_batch)} rows, size: {current_batch_size/1024/1024:.2f}MB"
+             )
+             logger.debug(f"Yielding large rows batch of {len(large_rows)} rows")
+             yield {"stream_batch": current_batch, "json_batch": large_rows}