bizon 0.0.10__tar.gz → 0.0.13__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94)
  1. {bizon-0.0.10 → bizon-0.0.13}/PKG-INFO +3 -3
  2. {bizon-0.0.10 → bizon-0.0.13}/bizon/common/models.py +2 -0
  3. {bizon-0.0.10 → bizon-0.0.13}/bizon/destinations/bigquery/src/destination.py +49 -92
  4. bizon-0.0.13/bizon/destinations/bigquery_streaming/src/config.py +43 -0
  5. bizon-0.0.13/bizon/destinations/bigquery_streaming/src/destination.py +154 -0
  6. bizon-0.0.13/bizon/destinations/bigquery_streaming/src/proto_utils.py +91 -0
  7. {bizon-0.0.10 → bizon-0.0.13}/bizon/destinations/buffer.py +16 -9
  8. {bizon-0.0.10 → bizon-0.0.13}/bizon/destinations/config.py +1 -0
  9. {bizon-0.0.10 → bizon-0.0.13}/bizon/destinations/destination.py +35 -36
  10. {bizon-0.0.10 → bizon-0.0.13}/bizon/destinations/file/src/destination.py +4 -6
  11. {bizon-0.0.10 → bizon-0.0.13}/bizon/destinations/logger/src/destination.py +4 -4
  12. bizon-0.0.13/bizon/destinations/models.py +31 -0
  13. {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/backend/adapters/sqlalchemy/backend.py +1 -1
  14. {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/backend/backend.py +1 -1
  15. {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/pipeline/producer.py +39 -6
  16. {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/queue/adapters/python_queue/config.py +6 -2
  17. {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/queue/adapters/python_queue/consumer.py +3 -4
  18. {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/queue/adapters/python_queue/queue.py +9 -5
  19. {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/queue/config.py +2 -0
  20. {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/queue/queue.py +22 -9
  21. {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/runner/adapters/thread.py +2 -0
  22. {bizon-0.0.10 → bizon-0.0.13}/bizon/source/cursor.py +7 -0
  23. {bizon-0.0.10 → bizon-0.0.13}/bizon/source/models.py +11 -0
  24. {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/kafka/src/source.py +124 -52
  25. {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/kafka/tests/kafka_pipeline.py +1 -1
  26. {bizon-0.0.10 → bizon-0.0.13}/pyproject.toml +8 -7
  27. bizon-0.0.10/bizon/destinations/models.py +0 -83
  28. {bizon-0.0.10 → bizon-0.0.13}/LICENSE +0 -0
  29. {bizon-0.0.10 → bizon-0.0.13}/README.md +0 -0
  30. {bizon-0.0.10 → bizon-0.0.13}/bizon/__main__.py +0 -0
  31. {bizon-0.0.10 → bizon-0.0.13}/bizon/cli/__init__.py +0 -0
  32. {bizon-0.0.10 → bizon-0.0.13}/bizon/cli/main.py +0 -0
  33. {bizon-0.0.10 → bizon-0.0.13}/bizon/cli/utils.py +0 -0
  34. {bizon-0.0.10 → bizon-0.0.13}/bizon/common/errors/backoff.py +0 -0
  35. {bizon-0.0.10 → bizon-0.0.13}/bizon/common/errors/errors.py +0 -0
  36. {bizon-0.0.10 → bizon-0.0.13}/bizon/destinations/bigquery/config/bigquery.example.yml +0 -0
  37. {bizon-0.0.10 → bizon-0.0.13}/bizon/destinations/bigquery/src/config.py +0 -0
  38. {bizon-0.0.10 → bizon-0.0.13}/bizon/destinations/file/src/config.py +0 -0
  39. {bizon-0.0.10 → bizon-0.0.13}/bizon/destinations/logger/src/config.py +0 -0
  40. {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/backend/adapters/sqlalchemy/config.py +0 -0
  41. {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/backend/config.py +0 -0
  42. {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/backend/models.py +0 -0
  43. {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/config.py +0 -0
  44. {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/engine.py +0 -0
  45. {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/pipeline/consumer.py +0 -0
  46. {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/pipeline/models.py +0 -0
  47. {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/queue/adapters/kafka/config.py +0 -0
  48. {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/queue/adapters/kafka/consumer.py +0 -0
  49. {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/queue/adapters/kafka/queue.py +0 -0
  50. {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/queue/adapters/rabbitmq/config.py +0 -0
  51. {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/queue/adapters/rabbitmq/consumer.py +0 -0
  52. {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/queue/adapters/rabbitmq/queue.py +0 -0
  53. {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/runner/adapters/process.py +0 -0
  54. {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/runner/config.py +0 -0
  55. {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/runner/runner.py +0 -0
  56. {bizon-0.0.10 → bizon-0.0.13}/bizon/source/auth/authenticators/abstract_oauth.py +0 -0
  57. {bizon-0.0.10 → bizon-0.0.13}/bizon/source/auth/authenticators/abstract_token.py +0 -0
  58. {bizon-0.0.10 → bizon-0.0.13}/bizon/source/auth/authenticators/basic.py +0 -0
  59. {bizon-0.0.10 → bizon-0.0.13}/bizon/source/auth/authenticators/cookies.py +0 -0
  60. {bizon-0.0.10 → bizon-0.0.13}/bizon/source/auth/authenticators/oauth.py +0 -0
  61. {bizon-0.0.10 → bizon-0.0.13}/bizon/source/auth/authenticators/token.py +0 -0
  62. {bizon-0.0.10 → bizon-0.0.13}/bizon/source/auth/builder.py +0 -0
  63. {bizon-0.0.10 → bizon-0.0.13}/bizon/source/auth/config.py +0 -0
  64. {bizon-0.0.10 → bizon-0.0.13}/bizon/source/config.py +0 -0
  65. {bizon-0.0.10 → bizon-0.0.13}/bizon/source/discover.py +0 -0
  66. {bizon-0.0.10 → bizon-0.0.13}/bizon/source/session.py +0 -0
  67. {bizon-0.0.10 → bizon-0.0.13}/bizon/source/source.py +0 -0
  68. {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/dummy/config/api_key.example.yml +0 -0
  69. {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/dummy/config/api_key_kafka.example.yml +0 -0
  70. {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/dummy/src/fake_api.py +0 -0
  71. {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/dummy/src/source.py +0 -0
  72. {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/dummy/tests/dummy_pipeline.py +0 -0
  73. {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/dummy/tests/dummy_pipeline_bigquery_backend.py +0 -0
  74. {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/dummy/tests/dummy_pipeline_kafka.py +0 -0
  75. {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/dummy/tests/dummy_pipeline_rabbitmq.py +0 -0
  76. {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/dummy/tests/dummy_pipeline_write_data_bigquery.py +0 -0
  77. {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/dummy/tests/dummy_pipeline_write_data_bigquery_through_kafka.py +0 -0
  78. {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/gsheets/config/default_auth.example.yml +0 -0
  79. {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/gsheets/config/service_account.example.yml +0 -0
  80. {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/gsheets/src/source.py +0 -0
  81. {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/gsheets/tests/gsheets_pipeline.py +0 -0
  82. {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/hubspot/config/api_key.example.yml +0 -0
  83. {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/hubspot/config/oauth.example.yml +0 -0
  84. {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/hubspot/src/hubspot_base.py +0 -0
  85. {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/hubspot/src/hubspot_objects.py +0 -0
  86. {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/hubspot/src/models/hs_object.py +0 -0
  87. {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/hubspot/tests/hubspot_pipeline.py +0 -0
  88. {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/kafka/config/kafka.example.yml +0 -0
  89. {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/periscope/config/periscope_charts.example.yml +0 -0
  90. {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/periscope/config/periscope_dashboards.example.yml +0 -0
  91. {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/periscope/src/source.py +0 -0
  92. {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/periscope/tests/periscope_pipeline_charts.py +0 -0
  93. {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/periscope/tests/periscope_pipeline_dashboard.py +0 -0
  94. {bizon-0.0.10 → bizon-0.0.13}/bizon/utils.py +0 -0
{bizon-0.0.10 → bizon-0.0.13}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: bizon
- Version: 0.0.10
+ Version: 0.0.13
  Summary: Extract and load your data reliably from API Clients with native fault-tolerant and checkpointing mechanism.
  Author: Antoine Balliet
  Author-email: antoine.balliet@gmail.com
@@ -20,7 +20,6 @@ Requires-Dist: backoff (>=2.2.1,<3.0.0)
  Requires-Dist: click (>=8.1.7,<9.0.0)
  Requires-Dist: confluent-kafka (>=2.6.0,<3.0.0) ; extra == "kafka"
  Requires-Dist: dpath (>=2.2.0,<3.0.0)
- Requires-Dist: faker (>=26.0.0,<27.0.0)
  Requires-Dist: fastavro (>=1.9.7,<2.0.0) ; extra == "kafka"
  Requires-Dist: google-cloud-bigquery (>=3.25.0,<4.0.0) ; extra == "bigquery"
  Requires-Dist: google-cloud-bigquery-storage (>=2.25.0,<3.0.0) ; extra == "bigquery"
@@ -28,9 +27,10 @@ Requires-Dist: google-cloud-storage (>=2.17.0,<3.0.0)
  Requires-Dist: gspread (>=6.1.2,<7.0.0) ; extra == "gsheets"
  Requires-Dist: kafka-python (>=2.0.2,<3.0.0) ; extra == "kafka"
  Requires-Dist: loguru (>=0.7.2,<0.8.0)
- Requires-Dist: pandas (>=2.2.2,<3.0.0) ; extra == "bigquery"
  Requires-Dist: pendulum (>=3.0.0,<4.0.0)
  Requires-Dist: pika (>=1.3.2,<2.0.0) ; extra == "rabbitmq"
+ Requires-Dist: polars (>=1.16.0,<2.0.0)
+ Requires-Dist: protobuf (>=4.24.0,<5.0.0) ; extra == "bigquery"
  Requires-Dist: psycopg2-binary (>=2.9.9,<3.0.0) ; extra == "postgres"
  Requires-Dist: pyarrow (>=16.1.0,<17.0.0)
  Requires-Dist: pydantic (>=2.8.2,<3.0.0)
{bizon-0.0.10 → bizon-0.0.13}/bizon/common/models.py
@@ -3,6 +3,7 @@ from typing import Union
  from pydantic import BaseModel, ConfigDict, Field
 
  from bizon.destinations.bigquery.src.config import BigQueryConfig
+ from bizon.destinations.bigquery_streaming.src.config import BigQueryStreamingConfig
  from bizon.destinations.file.src.config import FileDestinationConfig
  from bizon.destinations.logger.src.config import LoggerConfig
  from bizon.engine.config import EngineConfig
@@ -24,6 +25,7 @@ class BizonConfig(BaseModel):
 
      destination: Union[
          BigQueryConfig,
+         BigQueryStreamingConfig,
          LoggerConfig,
          FileDestinationConfig,
      ] = Field(
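
Note: the hunk above only widens the destination Union with the new streaming config. As a minimal illustration (simplified stand-in models, not the actual bizon classes), pydantic resolves such a Union via the Literal name field declared on each config:

# Hypothetical, simplified sketch; the real bizon configs carry many more fields.
from typing import Literal, Union

from pydantic import BaseModel


class BigQueryConfig(BaseModel):
    name: Literal["bigquery"]
    project_id: str


class BigQueryStreamingConfig(BaseModel):
    name: Literal["bigquery_streaming"]
    project_id: str


class PipelineConfig(BaseModel):
    destination: Union[BigQueryConfig, BigQueryStreamingConfig]


cfg = PipelineConfig.model_validate(
    {"destination": {"name": "bigquery_streaming", "project_id": "my-project"}}
)
assert isinstance(cfg.destination, BigQueryStreamingConfig)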
{bizon-0.0.10 → bizon-0.0.13}/bizon/destinations/bigquery/src/destination.py
@@ -2,22 +2,19 @@ import io
  import json
  import os
  import tempfile
+ import traceback
  from typing import List, Tuple
  from uuid import uuid4
 
- import pandas as pd
- import pyarrow as pa
- import pyarrow.parquet as pq
+ import polars as pl
  from google.api_core.exceptions import NotFound
  from google.cloud import bigquery, storage
  from google.cloud.bigquery import DatasetReference, TimePartitioning
  from loguru import logger
- from pytz import UTC
 
  from bizon.common.models import SyncMetadata
  from bizon.destinations.config import NormalizationType
  from bizon.destinations.destination import AbstractDestination
- from bizon.destinations.models import DestinationRecord
  from bizon.engine.backend.backend import AbstractBackend
  from bizon.source.config import SourceSyncModes
 
@@ -62,7 +59,7 @@ class BigQueryDestination(AbstractDestination):
          elif self.sync_metadata.sync_mode == SourceSyncModes.STREAM:
              return f"{self.table_id}"
 
-     def get_bigquery_schema(self, destination_records: List[DestinationRecord]) -> List[bigquery.SchemaField]:
+     def get_bigquery_schema(self, df_destination_records: pl.DataFrame) -> List[bigquery.SchemaField]:
 
          # we keep raw data in the column source_data
          if self.config.normalization.type == NormalizationType.NONE:
@@ -77,26 +74,13 @@
              bigquery.SchemaField("_bizon_id", "STRING", mode="REQUIRED"),
          ]
 
-         elif self.config.normalization.type == NormalizationType.DEBEZIUM:
-             assert (
-                 "_bizon_message_key" in destination_records[0].source_data
-             ), "Debezium records must have a '_bizon_message_key' key"
-             message_keys = json.loads(destination_records[0].source_data["_bizon_message_key"])
-             return [bigquery.SchemaField(key, "STRING", mode="NULLABLE") for key in message_keys] + [
-                 bigquery.SchemaField("_source_data", "STRING", mode="NULLABLE"),
-                 bigquery.SchemaField("_source_record_id", "STRING", mode="REQUIRED"),
-                 bigquery.SchemaField("_source_timestamp", "TIMESTAMP", mode="REQUIRED"),
-                 bigquery.SchemaField("_bizon_extracted_at", "TIMESTAMP", mode="REQUIRED"),
-                 bigquery.SchemaField(
-                     "_bizon_loaded_at", "TIMESTAMP", mode="REQUIRED", default_value_expression="CURRENT_TIMESTAMP()"
-                 ),
-                 bigquery.SchemaField("_bizon_id", "STRING", mode="REQUIRED"),
-             ]
-
          # If normalization is tabular, we parse key / value pairs to columns
          elif self.config.normalization.type == NormalizationType.TABULAR:
-             first_record_keys = destination_records[0].source_data.keys()
-             return [bigquery.SchemaField(key, "STRING", mode="NULLABLE") for key in first_record_keys] + [
+
+             # We use the first record to infer the schema of tabular data (key / value pairs)
+             source_data_keys = list(json.loads(df_destination_records["source_data"][0]).keys())
+
+             return [bigquery.SchemaField(key, "STRING", mode="NULLABLE") for key in source_data_keys] + [
                  bigquery.SchemaField("_source_record_id", "STRING", mode="REQUIRED"),
                  bigquery.SchemaField("_source_timestamp", "TIMESTAMP", mode="REQUIRED"),
                  bigquery.SchemaField("_bizon_extracted_at", "TIMESTAMP", mode="REQUIRED"),
@@ -108,58 +92,6 @@
 
          raise NotImplementedError(f"Normalization type {self.config.normalization.type} is not supported")
 
-     def get_batch_records_as_df(self, destination_records: List[DestinationRecord]) -> pd.DataFrame:
-
-         # We keep raw data in a column -> convert the SourceRecord to a DestinationRecord
-         if self.config.normalization.type == NormalizationType.NONE:
-             df = pd.DataFrame([record.to_dict_raw_json_data(parquet=True) for record in destination_records])
-             df["_bizon_loaded_at"] = pd.Timestamp.now(tz=UTC)
-
-         # If normalization is tabular, we can just convert the data to a DataFrame parsing first-level keys
-         elif self.config.normalization.type == NormalizationType.TABULAR:
-             list_data_dict = [record.source_data for record in destination_records]
-             df = pd.DataFrame(list_data_dict).astype(str)
-             df["_bizon_id"] = [uuid4().hex for _ in range(len(destination_records))]
-
-             df["_bizon_extracted_at"] = [
-                 int(record.source_timestamp.timestamp() * 1_000_000) for record in destination_records
-             ]
-
-             df["_bizon_loaded_at"] = pd.Timestamp.now(tz=UTC)
-
-             df["_source_record_id"] = [record.source_record_id for record in destination_records]
-
-             # We need to convert the source datetime to a int timestamp
-             df["_source_timestamp"] = [
-                 int(record.source_timestamp.timestamp() * 1_000_000) for record in destination_records
-             ]
-
-         elif self.config.normalization.type == NormalizationType.DEBEZIUM:
-             df = pd.DataFrame([record.to_dict_debezium(parquet=True) for record in destination_records])
-             df["_bizon_loaded_at"] = pd.Timestamp.now(tz=UTC)
-
-         else:
-             raise NotImplementedError(f"Normalization type {self.config.normalization.type} is not supported")
-
-         return df
-
-     def convert_and_upload_to_buffer(self, destination_records: List[DestinationRecord]):
-
-         df = self.get_batch_records_as_df(destination_records)
-
-         # Convert DataFrame to Parquet in-memory
-         if self.buffer_format == "parquet":
-             table = pa.Table.from_pandas(df)
-             buffer = io.BytesIO()
-             pq.write_table(table, buffer)
-             buffer.seek(0)
-
-             # Upload the Parquet file to GCS
-             file_name = f"{self.sync_metadata.source_name}/{self.sync_metadata.stream_name}/{str(uuid4())}.parquet"
-             blob = self.buffer_bucket.blob(file_name)
-             blob.upload_from_file(buffer, content_type="application/octet-stream")
-             return file_name
-
      def check_connection(self) -> bool:
          dataset_ref = DatasetReference(self.project_id, self.dataset_id)
 
@@ -179,7 +111,25 @@
          # https://cloud.google.com/python/docs/reference/storage/latest/retry_timeout
          # https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.dbapi.DataError
 
-     def load_to_bigquery(self, gcs_file: str, destination_records: List[DestinationRecord]):
+     def convert_and_upload_to_buffer(self, df_destination_records: pl.DataFrame) -> str:
+
+         if self.buffer_format == "parquet":
+
+             # Upload the Parquet file to GCS
+             file_name = f"{self.sync_metadata.source_name}/{self.sync_metadata.stream_name}/{str(uuid4())}.parquet"
+
+             with io.BytesIO() as stream:
+                 df_destination_records.write_parquet(stream)
+                 stream.seek(0)
+
+                 blob = self.buffer_bucket.blob(file_name)
+                 blob.upload_from_file(stream, content_type="application/octet-stream")
+
+             return file_name
+
+         raise NotImplementedError(f"Buffer format {self.buffer_format} is not supported")
+
+     def load_to_bigquery(self, gcs_file: str, df_destination_records: pl.DataFrame):
 
          # We always partition by the loaded_at field
          time_partitioning = TimePartitioning(field="_bizon_loaded_at", type_=self.config.time_partitioning)
@@ -187,34 +137,41 @@
          job_config = bigquery.LoadJobConfig(
              source_format=bigquery.SourceFormat.PARQUET,
              write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
-             schema=self.get_bigquery_schema(destination_records=destination_records),
+             schema=self.get_bigquery_schema(df_destination_records=df_destination_records),
              time_partitioning=time_partitioning,
          )
 
-         if self.config.normalization.type == NormalizationType.DEBEZIUM:
-             job_config.clustering_fields = list(
-                 json.loads(destination_records[0].source_data["_bizon_message_key"]).keys()
-             )
-
          load_job = self.bq_client.load_table_from_uri(
              f"gs://{self.buffer_bucket_name}/{gcs_file}", self.temp_table_id, job_config=job_config
          )
+         result = load_job.result()  # Waits for the job to complete
+         assert result.state == "DONE", f"Job failed with state {result.state} with error {result.error_result}"
+
+     def write_records(self, df_destination_records: pl.DataFrame) -> Tuple[bool, str]:
+
+         # Rename fields to match BigQuery schema
+         df_destination_records = df_destination_records.rename(
+             {
+                 # Bizon fields
+                 "bizon_extracted_at": "_bizon_extracted_at",
+                 "bizon_id": "_bizon_id",
+                 "bizon_loaded_at": "_bizon_loaded_at",
+                 # Source fields
+                 "source_record_id": "_source_record_id",
+                 "source_timestamp": "_source_timestamp",
+                 "source_data": "_source_data",
+             },
+         )
 
-         load_job.result()
-
-     def write_records(self, destination_records: List[DestinationRecord]) -> Tuple[bool, str]:
-
-         # Here we can check if these IDs are already present in BigQuery
-         # Using SourceRecord.id values
-
-         gs_file_name = self.convert_and_upload_to_buffer(destination_records=destination_records)
+         gs_file_name = self.convert_and_upload_to_buffer(df_destination_records=df_destination_records)
 
          try:
-             self.load_to_bigquery(gs_file_name, destination_records=destination_records)
+             self.load_to_bigquery(gcs_file=gs_file_name, df_destination_records=df_destination_records)
              self.cleanup(gs_file_name)
          except Exception as e:
              self.cleanup(gs_file_name)
              logger.error(f"Error loading data to BigQuery: {e}")
+             logger.error(traceback.format_exc())
              return False, str(e)
          return True, ""
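
The hunks above replace the pandas/pyarrow buffering path with polars: the DataFrame is written to an in-memory Parquet stream and uploaded to the GCS buffer bucket before a load job runs. A standalone sketch of that same pattern, assuming GCP credentials are configured and a placeholder bucket named my-buffer-bucket exists (neither is part of the diff):

# Illustrative only; bucket name and column values are placeholders.
import io
from uuid import uuid4

import polars as pl
from google.cloud import storage

df = pl.DataFrame({"source_record_id": ["1"], "source_data": ['{"a": 1}']})

with io.BytesIO() as stream:
    df.write_parquet(stream)  # serialize the DataFrame to Parquet in memory
    stream.seek(0)
    bucket = storage.Client().bucket("my-buffer-bucket")
    blob = bucket.blob(f"source/stream/{uuid4()}.parquet")
    blob.upload_from_file(stream, content_type="application/octet-stream")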
bizon-0.0.13/bizon/destinations/bigquery_streaming/src/config.py
@@ -0,0 +1,43 @@
+ from enum import Enum
+ from typing import Literal, Optional
+
+ from pydantic import BaseModel, Field, field_validator
+
+ from bizon.destinations.config import (
+     AbstractDestinationConfig,
+     AbstractDestinationDetailsConfig,
+     DestinationTypes,
+ )
+
+
+ class TimePartitioning(str, Enum):
+     DAY = "DAY"
+     HOUR = "HOUR"
+     MONTH = "MONTH"
+     YEAR = "YEAR"
+
+
+ class BigQueryAuthentication(BaseModel):
+     service_account_key: str = Field(
+         description="Service Account Key JSON string. If empty it will be infered",
+         default="",
+     )
+
+
+ class BigQueryStreamingConfigDetails(AbstractDestinationDetailsConfig):
+     project_id: str
+     dataset_id: str
+     dataset_location: Optional[str] = "US"
+     table_id: Optional[str] = Field(
+         default=None, description="Table ID, if not provided it will be inferred from source name"
+     )
+     time_partitioning: Optional[TimePartitioning] = Field(
+         default=TimePartitioning.DAY, description="BigQuery Time partitioning type"
+     )
+     authentication: Optional[BigQueryAuthentication] = None
+     bq_max_rows_per_request: Optional[int] = Field(30000, description="Max rows per buffer streaming request.")
+
+
+ class BigQueryStreamingConfig(AbstractDestinationConfig):
+     name: Literal[DestinationTypes.BIGQUERY_STREAMING]
+     config: BigQueryStreamingConfigDetails
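
For reference, a hedged sketch of the destination block this new config class is meant to accept. Field names come from the hunk above; values are placeholders; settings inherited from AbstractDestinationDetailsConfig (buffer, normalization, etc.) are omitted here and may be required in practice:

# Hypothetical destination block (plain dict, not validated here).
destination_block = {
    "name": "bigquery_streaming",        # DestinationTypes.BIGQUERY_STREAMING
    "config": {
        "project_id": "my-gcp-project",  # placeholder
        "dataset_id": "raw",             # placeholder
        "dataset_location": "US",
        "time_partitioning": "DAY",
        "bq_max_rows_per_request": 30000,
    },
}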
bizon-0.0.13/bizon/destinations/bigquery_streaming/src/destination.py
@@ -0,0 +1,154 @@
+ import os
+ import tempfile
+ from concurrent.futures import ThreadPoolExecutor
+ from typing import List, Tuple, Type
+
+ import polars as pl
+ from google.api_core.exceptions import NotFound
+ from google.cloud import bigquery, bigquery_storage_v1
+ from google.cloud.bigquery import DatasetReference, TimePartitioning
+ from google.cloud.bigquery_storage_v1.types import (
+     AppendRowsRequest,
+     ProtoRows,
+     ProtoSchema,
+ )
+ from google.protobuf.message import Message
+
+ from bizon.common.models import SyncMetadata
+ from bizon.destinations.config import NormalizationType
+ from bizon.destinations.destination import AbstractDestination
+ from bizon.engine.backend.backend import AbstractBackend
+
+ from .config import BigQueryStreamingConfigDetails
+ from .proto_utils import get_proto_schema_and_class
+
+
+ class BigQueryStreamingDestination(AbstractDestination):
+
+     def __init__(self, sync_metadata: SyncMetadata, config: BigQueryStreamingConfigDetails, backend: AbstractBackend):
+         super().__init__(sync_metadata, config, backend)
+         self.config: BigQueryStreamingConfigDetails = config
+
+         if config.authentication and config.authentication.service_account_key:
+             with tempfile.NamedTemporaryFile(delete=False) as temp:
+                 temp.write(config.authentication.service_account_key.encode())
+                 temp_file_path = temp.name
+                 os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = temp_file_path
+
+         self.project_id = config.project_id
+         self.bq_client = bigquery.Client(project=self.project_id)
+         self.bq_storage_client = bigquery_storage_v1.BigQueryWriteClient()
+         self.dataset_id = config.dataset_id
+         self.dataset_location = config.dataset_location
+         self.bq_max_rows_per_request = config.bq_max_rows_per_request
+
+     @property
+     def table_id(self) -> str:
+         tabled_id = self.config.table_id or f"{self.sync_metadata.source_name}_{self.sync_metadata.stream_name}"
+         return f"{self.project_id}.{self.dataset_id}.{tabled_id}"
+
+     def get_bigquery_schema(self) -> List[bigquery.SchemaField]:
+
+         # we keep raw data in the column source_data
+         if self.config.normalization.type == NormalizationType.NONE:
+             return [
+                 bigquery.SchemaField("_source_record_id", "STRING", mode="REQUIRED"),
+                 bigquery.SchemaField("_source_timestamp", "TIMESTAMP", mode="REQUIRED"),
+                 bigquery.SchemaField("_source_data", "STRING", mode="NULLABLE"),
+                 bigquery.SchemaField("_bizon_extracted_at", "TIMESTAMP", mode="REQUIRED"),
+                 bigquery.SchemaField(
+                     "_bizon_loaded_at", "TIMESTAMP", mode="REQUIRED", default_value_expression="CURRENT_TIMESTAMP()"
+                 ),
+                 bigquery.SchemaField("_bizon_id", "STRING", mode="REQUIRED"),
+             ]
+
+         raise NotImplementedError(f"Normalization type {self.config.normalization.type} is not supported")
+
+     def check_connection(self) -> bool:
+         dataset_ref = DatasetReference(self.project_id, self.dataset_id)
+
+         try:
+             self.bq_client.get_dataset(dataset_ref)
+         except NotFound:
+             dataset = bigquery.Dataset(dataset_ref)
+             dataset.location = self.dataset_location
+             dataset = self.bq_client.create_dataset(dataset)
+         return True
+
+     def append_rows_to_stream(
+         self,
+         write_client: bigquery_storage_v1.BigQueryWriteClient,
+         stream_name: str,
+         proto_schema: ProtoSchema,
+         serialized_rows: List[bytes],
+     ):
+         request = AppendRowsRequest(
+             write_stream=stream_name,
+             proto_rows=AppendRowsRequest.ProtoData(
+                 rows=ProtoRows(serialized_rows=serialized_rows),
+                 writer_schema=proto_schema,
+             ),
+         )
+         response = write_client.append_rows(iter([request]))
+         return response.code().name
+
+     @staticmethod
+     def to_protobuf_serialization(TableRowClass: Type[Message], row: dict) -> bytes:
+         """Convert a row to a protobuf serialization"""
+         record = TableRowClass()
+         record._bizon_id = row["bizon_id"]
+         record._bizon_extracted_at = row["bizon_extracted_at"].strftime("%Y-%m-%d %H:%M:%S.%f")
+         record._bizon_loaded_at = row["bizon_loaded_at"].strftime("%Y-%m-%d %H:%M:%S.%f")
+         record._source_record_id = row["source_record_id"]
+         record._source_timestamp = row["source_timestamp"].strftime("%Y-%m-%d %H:%M:%S.%f")
+         record._source_data = row["source_data"]
+         return record.SerializeToString()
+
+     def load_to_bigquery_via_streaming(self, df_destination_records: pl.DataFrame) -> str:
+         # TODO: for now no clustering keys
+         clustering_keys = []
+
+         # Create table if it doesnt exist
+         schema = self.get_bigquery_schema()
+         table = bigquery.Table(self.table_id, schema=schema)
+         time_partitioning = TimePartitioning(field="_bizon_loaded_at", type_=self.config.time_partitioning)
+         table.time_partitioning = time_partitioning
+
+         table = self.bq_client.create_table(table, exists_ok=True)
+
+         # Create the stream
+         write_client = self.bq_storage_client
+         tabled_id = self.config.table_id or f"{self.sync_metadata.source_name}_{self.sync_metadata.stream_name}"
+         parent = write_client.table_path(self.project_id, self.dataset_id, tabled_id)
+         stream_name = f"{parent}/_default"
+
+         # Generating the protocol buffer representation of the message descriptor.
+         proto_schema, TableRow = get_proto_schema_and_class(clustering_keys)
+
+         serialized_rows = [
+             self.to_protobuf_serialization(TableRowClass=TableRow, row=row)
+             for row in df_destination_records.iter_rows(named=True)
+         ]
+
+         results = []
+         with ThreadPoolExecutor() as executor:
+             futures = [
+                 executor.submit(self.append_rows_to_stream, write_client, stream_name, proto_schema, batch_rows)
+                 for batch_rows in self.batch(serialized_rows)
+             ]
+             for future in futures:
+                 results.append(future.result())
+
+         assert all([r == "OK" for r in results]) is True, "Failed to append rows to stream"
+
+     def write_records(self, df_destination_records: pl.DataFrame) -> Tuple[bool, str]:
+         self.load_to_bigquery_via_streaming(df_destination_records=df_destination_records)
+         return True, ""
+
+     def batch(self, iterable):
+         """
+         Yield successive batches of size `batch_size` from `iterable`.
+         """
+
+         for i in range(0, len(iterable), self.bq_max_rows_per_request):
+             yield iterable[i : i + self.bq_max_rows_per_request]  # noqa
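
The write path above fans serialized protobuf rows out to the Storage Write API default stream in batches of bq_max_rows_per_request. A minimal, self-contained sketch of that batching and fan-out pattern, with a stand-in append function in place of the real append_rows_to_stream call:

# Illustrative only; "append" simulates a successful per-batch gRPC status.
from concurrent.futures import ThreadPoolExecutor


def batch(items, size):
    # Yield successive slices of at most `size` items.
    for i in range(0, len(items), size):
        yield items[i : i + size]


def append(batch_rows):
    # Stand-in for append_rows_to_stream(); returns the status name of the append.
    return "OK"


rows = [b"row"] * 100_000
with ThreadPoolExecutor() as executor:
    futures = [executor.submit(append, b) for b in batch(rows, 30_000)]
    results = [f.result() for f in futures]

assert all(r == "OK" for r in results)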
bizon-0.0.13/bizon/destinations/bigquery_streaming/src/proto_utils.py
@@ -0,0 +1,91 @@
+ from typing import List, Tuple, Type
+
+ from google.cloud.bigquery_storage_v1.types import ProtoSchema
+ from google.protobuf.descriptor_pb2 import (
+     DescriptorProto,
+     FieldDescriptorProto,
+     FileDescriptorProto,
+ )
+ from google.protobuf.descriptor_pool import DescriptorPool
+ from google.protobuf.message import Message
+ from google.protobuf.message_factory import GetMessageClassesForFiles
+
+
+ def get_proto_schema_and_class(clustering_keys: List[str] = None) -> Tuple[ProtoSchema, Type[Message]]:
+     # Define the FileDescriptorProto
+     file_descriptor_proto = FileDescriptorProto()
+     file_descriptor_proto.name = "dynamic.proto"
+     file_descriptor_proto.package = "dynamic_package"
+
+     # Define the TableRow message schema
+     message_descriptor = DescriptorProto()
+     message_descriptor.name = "TableRow"
+
+     # Add fields to the message, only use TYPE_STRING, BigQuery does not support other types
+     # It does not imapact data types in final table
+
+     # https://stackoverflow.com/questions/70489919/protobuf-type-for-bigquery-timestamp-field
+     fields = [
+         {"name": "_bizon_id", "type": FieldDescriptorProto.TYPE_STRING, "label": FieldDescriptorProto.LABEL_REQUIRED},
+         {
+             "name": "_bizon_extracted_at",
+             "type": FieldDescriptorProto.TYPE_STRING,
+             "label": FieldDescriptorProto.LABEL_REQUIRED,
+         },
+         {
+             "name": "_bizon_loaded_at",
+             "type": FieldDescriptorProto.TYPE_STRING,
+             "label": FieldDescriptorProto.LABEL_REQUIRED,
+         },
+         {
+             "name": "_source_record_id",
+             "type": FieldDescriptorProto.TYPE_STRING,
+             "label": FieldDescriptorProto.LABEL_REQUIRED,
+         },
+         {
+             "name": "_source_timestamp",
+             "type": FieldDescriptorProto.TYPE_STRING,
+             "label": FieldDescriptorProto.LABEL_REQUIRED,
+         },
+         {
+             "name": "_source_data",
+             "type": FieldDescriptorProto.TYPE_STRING,
+             "label": FieldDescriptorProto.LABEL_OPTIONAL,
+         },
+     ]
+
+     if clustering_keys:
+         for key in clustering_keys:
+             fields.append(
+                 {
+                     "name": key,
+                     "type": FieldDescriptorProto.TYPE_STRING,
+                     "label": FieldDescriptorProto.LABEL_OPTIONAL,
+                 }
+             )
+
+     for i, field in enumerate(fields, start=1):
+         field_descriptor = message_descriptor.field.add()
+         field_descriptor.name = field["name"]
+         field_descriptor.number = i
+         field_descriptor.type = field["type"]
+         field_descriptor.label = field["label"]
+
+     # Add the message to the file descriptor
+     file_descriptor_proto.message_type.add().CopyFrom(message_descriptor)
+
+     # Create a DescriptorPool and register the FileDescriptorProto
+     pool = DescriptorPool()
+     pool.Add(file_descriptor_proto)
+
+     # Use the registered file name to fetch the message classes
+     message_classes = GetMessageClassesForFiles(["dynamic.proto"], pool=pool)
+
+     # Fetch the TableRow class
+     table_row_class = message_classes["dynamic_package.TableRow"]
+
+     # Create the ProtoSchema
+     proto_schema = ProtoSchema()
+     proto_schema.proto_descriptor.CopyFrom(message_descriptor)
+
+     return proto_schema, table_row_class
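
A hedged usage sketch of get_proto_schema_and_class: build the dynamic TableRow class, fill the string fields the streaming destination sets, and serialize to protobuf bytes. The import path mirrors the bigquery_streaming config import shown earlier in this diff; timestamps are already formatted as strings, as in to_protobuf_serialization above.

# Assumes bizon 0.0.13 with the bigquery extra is installed; values are placeholders.
from bizon.destinations.bigquery_streaming.src.proto_utils import get_proto_schema_and_class

proto_schema, TableRow = get_proto_schema_and_class(clustering_keys=None)

row = TableRow()
row._bizon_id = "0" * 32
row._bizon_extracted_at = "2024-01-01 00:00:00.000000"
row._bizon_loaded_at = "2024-01-01 00:00:01.000000"
row._source_record_id = "42"
row._source_timestamp = "2024-01-01 00:00:00.000000"
row._source_data = '{"hello": "world"}'

serialized = row.SerializeToString()  # bytes ready for ProtoRows(serialized_rows=[...])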
{bizon-0.0.10 → bizon-0.0.13}/bizon/destinations/buffer.py
@@ -1,8 +1,11 @@
- import sys
  from datetime import datetime
  from typing import List
 
- from bizon.destinations.models import DestinationRecord
+ from loguru import logger
+ from polars import DataFrame
+ from pytz import UTC
+
+ from .models import destination_record_schema
 
 
  class DestinationBuffer:
@@ -10,15 +13,15 @@ class DestinationBuffer:
      def __init__(self, buffer_size: int, buffer_flush_timeout: int) -> None:
          self.buffer_size = buffer_size * 1024 * 1024  # Convert to bytes
          self.buffer_flush_timeout = buffer_flush_timeout
-         self.records: List[DestinationRecord] = []
+         self.df_destination_records: DataFrame = DataFrame(schema=destination_record_schema)
          self._iterations: List[int] = []
          self.pagination = {}
-         self.modified_at: List[datetime] = [datetime.utcnow()]
+         self.modified_at: List[datetime] = [datetime.now(tz=UTC)]
 
      @property
      def current_size(self) -> int:
          """Return buffer size"""
-         return sys.getsizeof(self.records)
+         return self.df_destination_records.estimated_size(unit="b")
 
      @property
      def buffer_free_space_pct(self) -> float:
@@ -61,16 +64,20 @@
 
      def flush(self):
          """Flush buffer"""
-         self.records = []
+         self.df_destination_records = DataFrame(schema=destination_record_schema)
          self._iterations = []
          self.pagination = {}
          self.modified_at = []
 
      def add_source_iteration_records_to_buffer(
-         self, iteration: int, records: List[DestinationRecord], pagination: dict = None
+         self, iteration: int, df_destination_records: DataFrame, pagination: dict = None
      ):
          """Add records for the given iteration to buffer"""
-         self.records.extend(records)
+         self.df_destination_records.vstack(df_destination_records, in_place=True)
          self._iterations.append(iteration)
          self.pagination = pagination
-         self.modified_at.append(datetime.utcnow())
+         self.modified_at.append(datetime.now(tz=UTC))
+
+         logger.info(
+             f"Added {df_destination_records.height} records to buffer for iteration {iteration} - {self.df_destination_records.estimated_size(unit='mb')} MB"
+         )
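
The buffer now measures its contents with polars instead of sys.getsizeof on a Python list. A small sketch of the polars operations it relies on, using an illustrative two-column schema rather than the actual destination_record_schema:

# Illustrative schema and data; real buffers use the schema from bizon/destinations/models.py.
import polars as pl

schema = {"source_record_id": pl.Utf8, "source_data": pl.Utf8}
buffer_df = pl.DataFrame(schema=schema)  # empty DataFrame with the buffer schema

incoming = pl.DataFrame({"source_record_id": ["1", "2"], "source_data": ['{"a": 1}', '{"b": 2}']})
buffer_df.vstack(incoming, in_place=True)  # append a batch of records in place

print(buffer_df.height)                    # 2 records buffered
print(buffer_df.estimated_size(unit="b"))  # buffer size in bytes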
{bizon-0.0.10 → bizon-0.0.13}/bizon/destinations/config.py
@@ -6,6 +6,7 @@ from pydantic import BaseModel, ConfigDict, Field
 
  class DestinationTypes(str, Enum):
      BIGQUERY = "bigquery"
+     BIGQUERY_STREAMING = "bigquery_streaming"
      LOGGER = "logger"
      FILE = "file"