bizon 0.0.10__tar.gz → 0.0.13__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {bizon-0.0.10 → bizon-0.0.13}/PKG-INFO +3 -3
- {bizon-0.0.10 → bizon-0.0.13}/bizon/common/models.py +2 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/destinations/bigquery/src/destination.py +49 -92
- bizon-0.0.13/bizon/destinations/bigquery_streaming/src/config.py +43 -0
- bizon-0.0.13/bizon/destinations/bigquery_streaming/src/destination.py +154 -0
- bizon-0.0.13/bizon/destinations/bigquery_streaming/src/proto_utils.py +91 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/destinations/buffer.py +16 -9
- {bizon-0.0.10 → bizon-0.0.13}/bizon/destinations/config.py +1 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/destinations/destination.py +35 -36
- {bizon-0.0.10 → bizon-0.0.13}/bizon/destinations/file/src/destination.py +4 -6
- {bizon-0.0.10 → bizon-0.0.13}/bizon/destinations/logger/src/destination.py +4 -4
- bizon-0.0.13/bizon/destinations/models.py +31 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/backend/adapters/sqlalchemy/backend.py +1 -1
- {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/backend/backend.py +1 -1
- {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/pipeline/producer.py +39 -6
- {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/queue/adapters/python_queue/config.py +6 -2
- {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/queue/adapters/python_queue/consumer.py +3 -4
- {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/queue/adapters/python_queue/queue.py +9 -5
- {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/queue/config.py +2 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/queue/queue.py +22 -9
- {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/runner/adapters/thread.py +2 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/source/cursor.py +7 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/source/models.py +11 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/kafka/src/source.py +124 -52
- {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/kafka/tests/kafka_pipeline.py +1 -1
- {bizon-0.0.10 → bizon-0.0.13}/pyproject.toml +8 -7
- bizon-0.0.10/bizon/destinations/models.py +0 -83
- {bizon-0.0.10 → bizon-0.0.13}/LICENSE +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/README.md +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/__main__.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/cli/__init__.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/cli/main.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/cli/utils.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/common/errors/backoff.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/common/errors/errors.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/destinations/bigquery/config/bigquery.example.yml +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/destinations/bigquery/src/config.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/destinations/file/src/config.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/destinations/logger/src/config.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/backend/adapters/sqlalchemy/config.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/backend/config.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/backend/models.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/config.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/engine.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/pipeline/consumer.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/pipeline/models.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/queue/adapters/kafka/config.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/queue/adapters/kafka/consumer.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/queue/adapters/kafka/queue.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/queue/adapters/rabbitmq/config.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/queue/adapters/rabbitmq/consumer.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/queue/adapters/rabbitmq/queue.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/runner/adapters/process.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/runner/config.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/engine/runner/runner.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/source/auth/authenticators/abstract_oauth.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/source/auth/authenticators/abstract_token.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/source/auth/authenticators/basic.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/source/auth/authenticators/cookies.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/source/auth/authenticators/oauth.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/source/auth/authenticators/token.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/source/auth/builder.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/source/auth/config.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/source/config.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/source/discover.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/source/session.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/source/source.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/dummy/config/api_key.example.yml +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/dummy/config/api_key_kafka.example.yml +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/dummy/src/fake_api.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/dummy/src/source.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/dummy/tests/dummy_pipeline.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/dummy/tests/dummy_pipeline_bigquery_backend.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/dummy/tests/dummy_pipeline_kafka.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/dummy/tests/dummy_pipeline_rabbitmq.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/dummy/tests/dummy_pipeline_write_data_bigquery.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/dummy/tests/dummy_pipeline_write_data_bigquery_through_kafka.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/gsheets/config/default_auth.example.yml +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/gsheets/config/service_account.example.yml +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/gsheets/src/source.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/gsheets/tests/gsheets_pipeline.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/hubspot/config/api_key.example.yml +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/hubspot/config/oauth.example.yml +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/hubspot/src/hubspot_base.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/hubspot/src/hubspot_objects.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/hubspot/src/models/hs_object.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/hubspot/tests/hubspot_pipeline.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/kafka/config/kafka.example.yml +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/periscope/config/periscope_charts.example.yml +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/periscope/config/periscope_dashboards.example.yml +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/periscope/src/source.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/periscope/tests/periscope_pipeline_charts.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/sources/periscope/tests/periscope_pipeline_dashboard.py +0 -0
- {bizon-0.0.10 → bizon-0.0.13}/bizon/utils.py +0 -0
{bizon-0.0.10 → bizon-0.0.13}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: bizon
-Version: 0.0.10
+Version: 0.0.13
 Summary: Extract and load your data reliably from API Clients with native fault-tolerant and checkpointing mechanism.
 Author: Antoine Balliet
 Author-email: antoine.balliet@gmail.com
@@ -20,7 +20,6 @@ Requires-Dist: backoff (>=2.2.1,<3.0.0)
 Requires-Dist: click (>=8.1.7,<9.0.0)
 Requires-Dist: confluent-kafka (>=2.6.0,<3.0.0) ; extra == "kafka"
 Requires-Dist: dpath (>=2.2.0,<3.0.0)
-Requires-Dist: faker (>=26.0.0,<27.0.0)
 Requires-Dist: fastavro (>=1.9.7,<2.0.0) ; extra == "kafka"
 Requires-Dist: google-cloud-bigquery (>=3.25.0,<4.0.0) ; extra == "bigquery"
 Requires-Dist: google-cloud-bigquery-storage (>=2.25.0,<3.0.0) ; extra == "bigquery"
@@ -28,9 +27,10 @@ Requires-Dist: google-cloud-storage (>=2.17.0,<3.0.0)
 Requires-Dist: gspread (>=6.1.2,<7.0.0) ; extra == "gsheets"
 Requires-Dist: kafka-python (>=2.0.2,<3.0.0) ; extra == "kafka"
 Requires-Dist: loguru (>=0.7.2,<0.8.0)
-Requires-Dist: pandas (>=2.2.2,<3.0.0) ; extra == "bigquery"
 Requires-Dist: pendulum (>=3.0.0,<4.0.0)
 Requires-Dist: pika (>=1.3.2,<2.0.0) ; extra == "rabbitmq"
+Requires-Dist: polars (>=1.16.0,<2.0.0)
+Requires-Dist: protobuf (>=4.24.0,<5.0.0) ; extra == "bigquery"
 Requires-Dist: psycopg2-binary (>=2.9.9,<3.0.0) ; extra == "postgres"
 Requires-Dist: pyarrow (>=16.1.0,<17.0.0)
 Requires-Dist: pydantic (>=2.8.2,<3.0.0)
{bizon-0.0.10 → bizon-0.0.13}/bizon/common/models.py

@@ -3,6 +3,7 @@ from typing import Union
 from pydantic import BaseModel, ConfigDict, Field
 
 from bizon.destinations.bigquery.src.config import BigQueryConfig
+from bizon.destinations.bigquery_streaming.src.config import BigQueryStreamingConfig
 from bizon.destinations.file.src.config import FileDestinationConfig
 from bizon.destinations.logger.src.config import LoggerConfig
 from bizon.engine.config import EngineConfig
@@ -24,6 +25,7 @@ class BizonConfig(BaseModel):
 
     destination: Union[
         BigQueryConfig,
+        BigQueryStreamingConfig,
         LoggerConfig,
         FileDestinationConfig,
     ] = Field(
{bizon-0.0.10 → bizon-0.0.13}/bizon/destinations/bigquery/src/destination.py

@@ -2,22 +2,19 @@ import io
 import json
 import os
 import tempfile
+import traceback
 from typing import List, Tuple
 from uuid import uuid4
 
-import pandas as pd
-import pyarrow as pa
-import pyarrow.parquet as pq
+import polars as pl
 from google.api_core.exceptions import NotFound
 from google.cloud import bigquery, storage
 from google.cloud.bigquery import DatasetReference, TimePartitioning
 from loguru import logger
-from pytz import UTC
 
 from bizon.common.models import SyncMetadata
 from bizon.destinations.config import NormalizationType
 from bizon.destinations.destination import AbstractDestination
-from bizon.destinations.models import DestinationRecord
 from bizon.engine.backend.backend import AbstractBackend
 from bizon.source.config import SourceSyncModes
 
@@ -62,7 +59,7 @@ class BigQueryDestination(AbstractDestination):
         elif self.sync_metadata.sync_mode == SourceSyncModes.STREAM:
             return f"{self.table_id}"
 
-    def get_bigquery_schema(self,
+    def get_bigquery_schema(self, df_destination_records: pl.DataFrame) -> List[bigquery.SchemaField]:
 
         # we keep raw data in the column source_data
         if self.config.normalization.type == NormalizationType.NONE:
@@ -77,26 +74,13 @@ class BigQueryDestination(AbstractDestination):
             bigquery.SchemaField("_bizon_id", "STRING", mode="REQUIRED"),
         ]
 
-        elif self.config.normalization.type == NormalizationType.DEBEZIUM:
-            assert (
-                "_bizon_message_key" in destination_records[0].source_data
-            ), "Debezium records must have a '_bizon_message_key' key"
-            message_keys = json.loads(destination_records[0].source_data["_bizon_message_key"])
-            return [bigquery.SchemaField(key, "STRING", mode="NULLABLE") for key in message_keys] + [
-                bigquery.SchemaField("_source_data", "STRING", mode="NULLABLE"),
-                bigquery.SchemaField("_source_record_id", "STRING", mode="REQUIRED"),
-                bigquery.SchemaField("_source_timestamp", "TIMESTAMP", mode="REQUIRED"),
-                bigquery.SchemaField("_bizon_extracted_at", "TIMESTAMP", mode="REQUIRED"),
-                bigquery.SchemaField(
-                    "_bizon_loaded_at", "TIMESTAMP", mode="REQUIRED", default_value_expression="CURRENT_TIMESTAMP()"
-                ),
-                bigquery.SchemaField("_bizon_id", "STRING", mode="REQUIRED"),
-            ]
-
         # If normalization is tabular, we parse key / value pairs to columns
         elif self.config.normalization.type == NormalizationType.TABULAR:
-
-
+
+            # We use the first record to infer the schema of tabular data (key / value pairs)
+            source_data_keys = list(json.loads(df_destination_records["source_data"][0]).keys())
+
+            return [bigquery.SchemaField(key, "STRING", mode="NULLABLE") for key in source_data_keys] + [
                 bigquery.SchemaField("_source_record_id", "STRING", mode="REQUIRED"),
                 bigquery.SchemaField("_source_timestamp", "TIMESTAMP", mode="REQUIRED"),
                 bigquery.SchemaField("_bizon_extracted_at", "TIMESTAMP", mode="REQUIRED"),
@@ -108,58 +92,6 @@ class BigQueryDestination(AbstractDestination):
 
         raise NotImplementedError(f"Normalization type {self.config.normalization.type} is not supported")
 
-    def get_batch_records_as_df(self, destination_records: List[DestinationRecord]) -> pd.DataFrame:
-
-        # We keep raw data in a column -> convert the SourceRecord to a DestinationRecord
-        if self.config.normalization.type == NormalizationType.NONE:
-            df = pd.DataFrame([record.to_dict_raw_json_data(parquet=True) for record in destination_records])
-            df["_bizon_loaded_at"] = pd.Timestamp.now(tz=UTC)
-
-        # If normalization is tabular, we can just convert the data to a DataFrame parsing first-level keys
-        elif self.config.normalization.type == NormalizationType.TABULAR:
-            list_data_dict = [record.source_data for record in destination_records]
-            df = pd.DataFrame(list_data_dict).astype(str)
-            df["_bizon_id"] = [uuid4().hex for _ in range(len(destination_records))]
-
-            df["_bizon_extracted_at"] = [
-                int(record.source_timestamp.timestamp() * 1_000_000) for record in destination_records
-            ]
-
-            df["_bizon_loaded_at"] = pd.Timestamp.now(tz=UTC)
-
-            df["_source_record_id"] = [record.source_record_id for record in destination_records]
-
-            # We need to convert the source datetime to a int timestamp
-            df["_source_timestamp"] = [
-                int(record.source_timestamp.timestamp() * 1_000_000) for record in destination_records
-            ]
-
-        elif self.config.normalization.type == NormalizationType.DEBEZIUM:
-            df = pd.DataFrame([record.to_dict_debezium(parquet=True) for record in destination_records])
-            df["_bizon_loaded_at"] = pd.Timestamp.now(tz=UTC)
-
-        else:
-            raise NotImplementedError(f"Normalization type {self.config.normalization.type} is not supported")
-
-        return df
-
-    def convert_and_upload_to_buffer(self, destination_records: List[DestinationRecord]):
-
-        df = self.get_batch_records_as_df(destination_records)
-
-        # Convert DataFrame to Parquet in-memory
-        if self.buffer_format == "parquet":
-            table = pa.Table.from_pandas(df)
-            buffer = io.BytesIO()
-            pq.write_table(table, buffer)
-            buffer.seek(0)
-
-            # Upload the Parquet file to GCS
-            file_name = f"{self.sync_metadata.source_name}/{self.sync_metadata.stream_name}/{str(uuid4())}.parquet"
-            blob = self.buffer_bucket.blob(file_name)
-            blob.upload_from_file(buffer, content_type="application/octet-stream")
-            return file_name
-
     def check_connection(self) -> bool:
         dataset_ref = DatasetReference(self.project_id, self.dataset_id)
 
@@ -179,7 +111,25 @@ class BigQueryDestination(AbstractDestination):
         # https://cloud.google.com/python/docs/reference/storage/latest/retry_timeout
         # https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.dbapi.DataError
 
-    def
+    def convert_and_upload_to_buffer(self, df_destination_records: pl.DataFrame) -> str:
+
+        if self.buffer_format == "parquet":
+
+            # Upload the Parquet file to GCS
+            file_name = f"{self.sync_metadata.source_name}/{self.sync_metadata.stream_name}/{str(uuid4())}.parquet"
+
+            with io.BytesIO() as stream:
+                df_destination_records.write_parquet(stream)
+                stream.seek(0)
+
+                blob = self.buffer_bucket.blob(file_name)
+                blob.upload_from_file(stream, content_type="application/octet-stream")
+
+            return file_name
+
+        raise NotImplementedError(f"Buffer format {self.buffer_format} is not supported")
+
+    def load_to_bigquery(self, gcs_file: str, df_destination_records: pl.DataFrame):
 
         # We always partition by the loaded_at field
         time_partitioning = TimePartitioning(field="_bizon_loaded_at", type_=self.config.time_partitioning)
@@ -187,34 +137,41 @@ class BigQueryDestination(AbstractDestination):
         job_config = bigquery.LoadJobConfig(
             source_format=bigquery.SourceFormat.PARQUET,
             write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
-            schema=self.get_bigquery_schema(
+            schema=self.get_bigquery_schema(df_destination_records=df_destination_records),
             time_partitioning=time_partitioning,
         )
 
-        if self.config.normalization.type == NormalizationType.DEBEZIUM:
-            job_config.clustering_fields = list(
-                json.loads(destination_records[0].source_data["_bizon_message_key"]).keys()
-            )
-
         load_job = self.bq_client.load_table_from_uri(
             f"gs://{self.buffer_bucket_name}/{gcs_file}", self.temp_table_id, job_config=job_config
         )
+        result = load_job.result()  # Waits for the job to complete
+        assert result.state == "DONE", f"Job failed with state {result.state} with error {result.error_result}"
+
+    def write_records(self, df_destination_records: pl.DataFrame) -> Tuple[bool, str]:
+
+        # Rename fields to match BigQuery schema
+        df_destination_records = df_destination_records.rename(
+            {
+                # Bizon fields
+                "bizon_extracted_at": "_bizon_extracted_at",
+                "bizon_id": "_bizon_id",
+                "bizon_loaded_at": "_bizon_loaded_at",
+                # Source fields
+                "source_record_id": "_source_record_id",
+                "source_timestamp": "_source_timestamp",
+                "source_data": "_source_data",
+            },
+        )
 
-
-
-    def write_records(self, destination_records: List[DestinationRecord]) -> Tuple[bool, str]:
-
-        # Here we can check if these IDs are already present in BigQuery
-        # Using SourceRecord.id values
-
-        gs_file_name = self.convert_and_upload_to_buffer(destination_records=destination_records)
+        gs_file_name = self.convert_and_upload_to_buffer(df_destination_records=df_destination_records)
 
         try:
-            self.load_to_bigquery(gs_file_name,
+            self.load_to_bigquery(gcs_file=gs_file_name, df_destination_records=df_destination_records)
             self.cleanup(gs_file_name)
         except Exception as e:
             self.cleanup(gs_file_name)
             logger.error(f"Error loading data to BigQuery: {e}")
+            logger.error(traceback.format_exc())
             return False, str(e)
         return True, ""
 
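Note: the reworked `convert_and_upload_to_buffer` above serializes the whole polars buffer to Parquet in memory before uploading it to the GCS buffer bucket. A minimal standalone sketch of that round trip, with placeholder data and the GCS upload omitted:

```python
# Sketch only: placeholder frame, GCS upload omitted.
import io

import polars as pl

df = pl.DataFrame(
    {
        "_source_record_id": ["1", "2"],
        "_source_data": ['{"a": 1}', '{"a": 2}'],
    }
)

with io.BytesIO() as stream:
    df.write_parquet(stream)  # serialize the buffer to Parquet in memory
    stream.seek(0)
    payload = stream.read()   # bytes that would be passed to blob.upload_from_file

print(f"{len(payload)} bytes ready for upload")
```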
bizon-0.0.13/bizon/destinations/bigquery_streaming/src/config.py (new file)

@@ -0,0 +1,43 @@
+from enum import Enum
+from typing import Literal, Optional
+
+from pydantic import BaseModel, Field, field_validator
+
+from bizon.destinations.config import (
+    AbstractDestinationConfig,
+    AbstractDestinationDetailsConfig,
+    DestinationTypes,
+)
+
+
+class TimePartitioning(str, Enum):
+    DAY = "DAY"
+    HOUR = "HOUR"
+    MONTH = "MONTH"
+    YEAR = "YEAR"
+
+
+class BigQueryAuthentication(BaseModel):
+    service_account_key: str = Field(
+        description="Service Account Key JSON string. If empty it will be infered",
+        default="",
+    )
+
+
+class BigQueryStreamingConfigDetails(AbstractDestinationDetailsConfig):
+    project_id: str
+    dataset_id: str
+    dataset_location: Optional[str] = "US"
+    table_id: Optional[str] = Field(
+        default=None, description="Table ID, if not provided it will be inferred from source name"
+    )
+    time_partitioning: Optional[TimePartitioning] = Field(
+        default=TimePartitioning.DAY, description="BigQuery Time partitioning type"
+    )
+    authentication: Optional[BigQueryAuthentication] = None
+    bq_max_rows_per_request: Optional[int] = Field(30000, description="Max rows per buffer streaming request.")
+
+
+class BigQueryStreamingConfig(AbstractDestinationConfig):
+    name: Literal[DestinationTypes.BIGQUERY_STREAMING]
+    config: BigQueryStreamingConfigDetails
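Note: the defaulting behaviour these new models encode can be reproduced with a simplified stand-in. The class below is illustrative only; the real `BigQueryStreamingConfigDetails` also inherits fields from `AbstractDestinationDetailsConfig` that are not shown in this diff.

```python
# Simplified stand-in for BigQueryStreamingConfigDetails (not the real bizon class).
from enum import Enum
from typing import Optional

from pydantic import BaseModel, Field


class TimePartitioning(str, Enum):
    DAY = "DAY"
    HOUR = "HOUR"
    MONTH = "MONTH"
    YEAR = "YEAR"


class StreamingDetails(BaseModel):
    project_id: str
    dataset_id: str
    dataset_location: Optional[str] = "US"
    table_id: Optional[str] = Field(default=None)
    time_partitioning: Optional[TimePartitioning] = TimePartitioning.DAY
    bq_max_rows_per_request: Optional[int] = 30000


details = StreamingDetails(project_id="my-gcp-project", dataset_id="raw_data")  # placeholder values
assert details.time_partitioning is TimePartitioning.DAY  # DAY partitioning by default
assert details.bq_max_rows_per_request == 30000           # default streaming batch size
```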
bizon-0.0.13/bizon/destinations/bigquery_streaming/src/destination.py (new file)

@@ -0,0 +1,154 @@
+import os
+import tempfile
+from concurrent.futures import ThreadPoolExecutor
+from typing import List, Tuple, Type
+
+import polars as pl
+from google.api_core.exceptions import NotFound
+from google.cloud import bigquery, bigquery_storage_v1
+from google.cloud.bigquery import DatasetReference, TimePartitioning
+from google.cloud.bigquery_storage_v1.types import (
+    AppendRowsRequest,
+    ProtoRows,
+    ProtoSchema,
+)
+from google.protobuf.message import Message
+
+from bizon.common.models import SyncMetadata
+from bizon.destinations.config import NormalizationType
+from bizon.destinations.destination import AbstractDestination
+from bizon.engine.backend.backend import AbstractBackend
+
+from .config import BigQueryStreamingConfigDetails
+from .proto_utils import get_proto_schema_and_class
+
+
+class BigQueryStreamingDestination(AbstractDestination):
+
+    def __init__(self, sync_metadata: SyncMetadata, config: BigQueryStreamingConfigDetails, backend: AbstractBackend):
+        super().__init__(sync_metadata, config, backend)
+        self.config: BigQueryStreamingConfigDetails = config
+
+        if config.authentication and config.authentication.service_account_key:
+            with tempfile.NamedTemporaryFile(delete=False) as temp:
+                temp.write(config.authentication.service_account_key.encode())
+                temp_file_path = temp.name
+            os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = temp_file_path
+
+        self.project_id = config.project_id
+        self.bq_client = bigquery.Client(project=self.project_id)
+        self.bq_storage_client = bigquery_storage_v1.BigQueryWriteClient()
+        self.dataset_id = config.dataset_id
+        self.dataset_location = config.dataset_location
+        self.bq_max_rows_per_request = config.bq_max_rows_per_request
+
+    @property
+    def table_id(self) -> str:
+        tabled_id = self.config.table_id or f"{self.sync_metadata.source_name}_{self.sync_metadata.stream_name}"
+        return f"{self.project_id}.{self.dataset_id}.{tabled_id}"
+
+    def get_bigquery_schema(self) -> List[bigquery.SchemaField]:
+
+        # we keep raw data in the column source_data
+        if self.config.normalization.type == NormalizationType.NONE:
+            return [
+                bigquery.SchemaField("_source_record_id", "STRING", mode="REQUIRED"),
+                bigquery.SchemaField("_source_timestamp", "TIMESTAMP", mode="REQUIRED"),
+                bigquery.SchemaField("_source_data", "STRING", mode="NULLABLE"),
+                bigquery.SchemaField("_bizon_extracted_at", "TIMESTAMP", mode="REQUIRED"),
+                bigquery.SchemaField(
+                    "_bizon_loaded_at", "TIMESTAMP", mode="REQUIRED", default_value_expression="CURRENT_TIMESTAMP()"
+                ),
+                bigquery.SchemaField("_bizon_id", "STRING", mode="REQUIRED"),
+            ]
+
+        raise NotImplementedError(f"Normalization type {self.config.normalization.type} is not supported")
+
+    def check_connection(self) -> bool:
+        dataset_ref = DatasetReference(self.project_id, self.dataset_id)
+
+        try:
+            self.bq_client.get_dataset(dataset_ref)
+        except NotFound:
+            dataset = bigquery.Dataset(dataset_ref)
+            dataset.location = self.dataset_location
+            dataset = self.bq_client.create_dataset(dataset)
+        return True
+
+    def append_rows_to_stream(
+        self,
+        write_client: bigquery_storage_v1.BigQueryWriteClient,
+        stream_name: str,
+        proto_schema: ProtoSchema,
+        serialized_rows: List[bytes],
+    ):
+        request = AppendRowsRequest(
+            write_stream=stream_name,
+            proto_rows=AppendRowsRequest.ProtoData(
+                rows=ProtoRows(serialized_rows=serialized_rows),
+                writer_schema=proto_schema,
+            ),
+        )
+        response = write_client.append_rows(iter([request]))
+        return response.code().name
+
+    @staticmethod
+    def to_protobuf_serialization(TableRowClass: Type[Message], row: dict) -> bytes:
+        """Convert a row to a protobuf serialization"""
+        record = TableRowClass()
+        record._bizon_id = row["bizon_id"]
+        record._bizon_extracted_at = row["bizon_extracted_at"].strftime("%Y-%m-%d %H:%M:%S.%f")
+        record._bizon_loaded_at = row["bizon_loaded_at"].strftime("%Y-%m-%d %H:%M:%S.%f")
+        record._source_record_id = row["source_record_id"]
+        record._source_timestamp = row["source_timestamp"].strftime("%Y-%m-%d %H:%M:%S.%f")
+        record._source_data = row["source_data"]
+        return record.SerializeToString()
+
+    def load_to_bigquery_via_streaming(self, df_destination_records: pl.DataFrame) -> str:
+        # TODO: for now no clustering keys
+        clustering_keys = []
+
+        # Create table if it doesnt exist
+        schema = self.get_bigquery_schema()
+        table = bigquery.Table(self.table_id, schema=schema)
+        time_partitioning = TimePartitioning(field="_bizon_loaded_at", type_=self.config.time_partitioning)
+        table.time_partitioning = time_partitioning
+
+        table = self.bq_client.create_table(table, exists_ok=True)
+
+        # Create the stream
+        write_client = self.bq_storage_client
+        tabled_id = self.config.table_id or f"{self.sync_metadata.source_name}_{self.sync_metadata.stream_name}"
+        parent = write_client.table_path(self.project_id, self.dataset_id, tabled_id)
+        stream_name = f"{parent}/_default"
+
+        # Generating the protocol buffer representation of the message descriptor.
+        proto_schema, TableRow = get_proto_schema_and_class(clustering_keys)
+
+        serialized_rows = [
+            self.to_protobuf_serialization(TableRowClass=TableRow, row=row)
+            for row in df_destination_records.iter_rows(named=True)
+        ]
+
+        results = []
+        with ThreadPoolExecutor() as executor:
+            futures = [
+                executor.submit(self.append_rows_to_stream, write_client, stream_name, proto_schema, batch_rows)
+                for batch_rows in self.batch(serialized_rows)
+            ]
+            for future in futures:
+                results.append(future.result())
+
+        assert all([r == "OK" for r in results]) is True, "Failed to append rows to stream"
+
+    def write_records(self, df_destination_records: pl.DataFrame) -> Tuple[bool, str]:
+        self.load_to_bigquery_via_streaming(df_destination_records=df_destination_records)
+        return True, ""
+
+    def batch(self, iterable):
+        """
+        Yield successive batches of size `batch_size` from `iterable`.
+        """
+
+        for i in range(0, len(iterable), self.bq_max_rows_per_request):
+            yield iterable[i : i + self.bq_max_rows_per_request]  # noqa
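Note: the `batch` helper above caps each AppendRows request at `bq_max_rows_per_request` serialized rows. The same chunking logic, re-implemented standalone for illustration:

```python
# Standalone re-implementation of the chunking logic, for illustration only.
from typing import Iterator, List


def batch(rows: List[bytes], max_rows: int = 30000) -> Iterator[List[bytes]]:
    # Yield slices of at most `max_rows` items, preserving order.
    for i in range(0, len(rows), max_rows):
        yield rows[i : i + max_rows]


chunks = list(batch([b"row"] * 70_000, max_rows=30_000))
assert [len(c) for c in chunks] == [30_000, 30_000, 10_000]
```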
bizon-0.0.13/bizon/destinations/bigquery_streaming/src/proto_utils.py (new file)

@@ -0,0 +1,91 @@
+from typing import List, Tuple, Type
+
+from google.cloud.bigquery_storage_v1.types import ProtoSchema
+from google.protobuf.descriptor_pb2 import (
+    DescriptorProto,
+    FieldDescriptorProto,
+    FileDescriptorProto,
+)
+from google.protobuf.descriptor_pool import DescriptorPool
+from google.protobuf.message import Message
+from google.protobuf.message_factory import GetMessageClassesForFiles
+
+
+def get_proto_schema_and_class(clustering_keys: List[str] = None) -> Tuple[ProtoSchema, Type[Message]]:
+    # Define the FileDescriptorProto
+    file_descriptor_proto = FileDescriptorProto()
+    file_descriptor_proto.name = "dynamic.proto"
+    file_descriptor_proto.package = "dynamic_package"
+
+    # Define the TableRow message schema
+    message_descriptor = DescriptorProto()
+    message_descriptor.name = "TableRow"
+
+    # Add fields to the message, only use TYPE_STRING, BigQuery does not support other types
+    # It does not imapact data types in final table
+
+    # https://stackoverflow.com/questions/70489919/protobuf-type-for-bigquery-timestamp-field
+    fields = [
+        {"name": "_bizon_id", "type": FieldDescriptorProto.TYPE_STRING, "label": FieldDescriptorProto.LABEL_REQUIRED},
+        {
+            "name": "_bizon_extracted_at",
+            "type": FieldDescriptorProto.TYPE_STRING,
+            "label": FieldDescriptorProto.LABEL_REQUIRED,
+        },
+        {
+            "name": "_bizon_loaded_at",
+            "type": FieldDescriptorProto.TYPE_STRING,
+            "label": FieldDescriptorProto.LABEL_REQUIRED,
+        },
+        {
+            "name": "_source_record_id",
+            "type": FieldDescriptorProto.TYPE_STRING,
+            "label": FieldDescriptorProto.LABEL_REQUIRED,
+        },
+        {
+            "name": "_source_timestamp",
+            "type": FieldDescriptorProto.TYPE_STRING,
+            "label": FieldDescriptorProto.LABEL_REQUIRED,
+        },
+        {
+            "name": "_source_data",
+            "type": FieldDescriptorProto.TYPE_STRING,
+            "label": FieldDescriptorProto.LABEL_OPTIONAL,
+        },
+    ]
+
+    if clustering_keys:
+        for key in clustering_keys:
+            fields.append(
+                {
+                    "name": key,
+                    "type": FieldDescriptorProto.TYPE_STRING,
+                    "label": FieldDescriptorProto.LABEL_OPTIONAL,
+                }
+            )
+
+    for i, field in enumerate(fields, start=1):
+        field_descriptor = message_descriptor.field.add()
+        field_descriptor.name = field["name"]
+        field_descriptor.number = i
+        field_descriptor.type = field["type"]
+        field_descriptor.label = field["label"]
+
+    # Add the message to the file descriptor
+    file_descriptor_proto.message_type.add().CopyFrom(message_descriptor)
+
+    # Create a DescriptorPool and register the FileDescriptorProto
+    pool = DescriptorPool()
+    pool.Add(file_descriptor_proto)
+
+    # Use the registered file name to fetch the message classes
+    message_classes = GetMessageClassesForFiles(["dynamic.proto"], pool=pool)
+
+    # Fetch the TableRow class
+    table_row_class = message_classes["dynamic_package.TableRow"]
+
+    # Create the ProtoSchema
+    proto_schema = ProtoSchema()
+    proto_schema.proto_descriptor.CopyFrom(message_descriptor)
+
+    return proto_schema, table_row_class
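Note: assuming bizon 0.0.13 is installed with the `bigquery` extra, the dynamically generated `TableRow` class can be exercised as follows; all field values are placeholders.

```python
# Usage sketch; field names come from the descriptor defined above, values are placeholders.
from bizon.destinations.bigquery_streaming.src.proto_utils import get_proto_schema_and_class

proto_schema, TableRow = get_proto_schema_and_class(clustering_keys=None)

row = TableRow()
row._bizon_id = "3f2ab4c0d1e2f3a4b5c6d7e8f9a0b1c2"       # placeholder UUID hex
row._bizon_extracted_at = "2024-01-01 00:00:00.000000"
row._bizon_loaded_at = "2024-01-01 00:00:01.000000"
row._source_record_id = "42"
row._source_timestamp = "2024-01-01 00:00:00.000000"
row._source_data = '{"hello": "world"}'

serialized = row.SerializeToString()  # bytes suitable for ProtoRows(serialized_rows=[...])
```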
{bizon-0.0.10 → bizon-0.0.13}/bizon/destinations/buffer.py

@@ -1,8 +1,11 @@
-import sys
 from datetime import datetime
 from typing import List
 
-from
+from loguru import logger
+from polars import DataFrame
+from pytz import UTC
+
+from .models import destination_record_schema
 
 
 class DestinationBuffer:
@@ -10,15 +13,15 @@ class DestinationBuffer:
     def __init__(self, buffer_size: int, buffer_flush_timeout: int) -> None:
         self.buffer_size = buffer_size * 1024 * 1024  # Convert to bytes
         self.buffer_flush_timeout = buffer_flush_timeout
-        self.
+        self.df_destination_records: DataFrame = DataFrame(schema=destination_record_schema)
         self._iterations: List[int] = []
         self.pagination = {}
-        self.modified_at: List[datetime] = [datetime.
+        self.modified_at: List[datetime] = [datetime.now(tz=UTC)]
 
     @property
     def current_size(self) -> int:
         """Return buffer size"""
-        return
+        return self.df_destination_records.estimated_size(unit="b")
 
     @property
     def buffer_free_space_pct(self) -> float:
@@ -61,16 +64,20 @@ class DestinationBuffer:
 
     def flush(self):
         """Flush buffer"""
-        self.
+        self.df_destination_records = DataFrame(schema=destination_record_schema)
         self._iterations = []
         self.pagination = {}
         self.modified_at = []
 
     def add_source_iteration_records_to_buffer(
-        self, iteration: int,
+        self, iteration: int, df_destination_records: DataFrame, pagination: dict = None
     ):
         """Add records for the given iteration to buffer"""
-        self.
+        self.df_destination_records.vstack(df_destination_records, in_place=True)
         self._iterations.append(iteration)
         self.pagination = pagination
-        self.modified_at.append(datetime.
+        self.modified_at.append(datetime.now(tz=UTC))
+
+        logger.info(
+            f"Added {df_destination_records.height} records to buffer for iteration {iteration} - {self.df_destination_records.estimated_size(unit='mb')} MB"
+        )