bizon 0.0.11__py3-none-any.whl → 0.0.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bizon/destinations/bigquery/src/destination.py +49 -92
- bizon/destinations/bigquery_streaming/src/config.py +3 -15
- bizon/destinations/bigquery_streaming/src/destination.py +69 -63
- bizon/destinations/buffer.py +16 -9
- bizon/destinations/destination.py +24 -35
- bizon/destinations/file/src/destination.py +4 -6
- bizon/destinations/logger/src/destination.py +4 -4
- bizon/destinations/models.py +26 -105
- bizon/engine/backend/adapters/sqlalchemy/backend.py +1 -1
- bizon/engine/backend/backend.py +1 -1
- bizon/engine/pipeline/producer.py +39 -6
- bizon/engine/queue/adapters/python_queue/config.py +6 -2
- bizon/engine/queue/adapters/python_queue/consumer.py +3 -4
- bizon/engine/queue/adapters/python_queue/queue.py +9 -5
- bizon/engine/queue/config.py +2 -0
- bizon/engine/queue/queue.py +22 -9
- bizon/source/cursor.py +7 -0
- bizon/source/models.py +11 -0
- bizon/sources/kafka/src/source.py +96 -23
- bizon/sources/kafka/tests/kafka_pipeline.py +1 -1
- {bizon-0.0.11.dist-info → bizon-0.0.13.dist-info}/METADATA +3 -4
- {bizon-0.0.11.dist-info → bizon-0.0.13.dist-info}/RECORD +25 -25
- {bizon-0.0.11.dist-info → bizon-0.0.13.dist-info}/LICENSE +0 -0
- {bizon-0.0.11.dist-info → bizon-0.0.13.dist-info}/WHEEL +0 -0
- {bizon-0.0.11.dist-info → bizon-0.0.13.dist-info}/entry_points.txt +0 -0
bizon/destinations/bigquery/src/destination.py
CHANGED

@@ -2,22 +2,19 @@ import io
 import json
 import os
 import tempfile
+import traceback
 from typing import List, Tuple
 from uuid import uuid4
 
-import pandas as pd
-import pyarrow as pa
-import pyarrow.parquet as pq
+import polars as pl
 from google.api_core.exceptions import NotFound
 from google.cloud import bigquery, storage
 from google.cloud.bigquery import DatasetReference, TimePartitioning
 from loguru import logger
-from pytz import UTC
 
 from bizon.common.models import SyncMetadata
 from bizon.destinations.config import NormalizationType
 from bizon.destinations.destination import AbstractDestination
-from bizon.destinations.models import DestinationRecord
 from bizon.engine.backend.backend import AbstractBackend
 from bizon.source.config import SourceSyncModes
 
@@ -62,7 +59,7 @@ class BigQueryDestination(AbstractDestination):
         elif self.sync_metadata.sync_mode == SourceSyncModes.STREAM:
             return f"{self.table_id}"
 
-    def get_bigquery_schema(self, destination_records: List[DestinationRecord]) -> List[bigquery.SchemaField]:
+    def get_bigquery_schema(self, df_destination_records: pl.DataFrame) -> List[bigquery.SchemaField]:
 
         # we keep raw data in the column source_data
         if self.config.normalization.type == NormalizationType.NONE:
@@ -77,26 +74,13 @@ class BigQueryDestination(AbstractDestination):
                 bigquery.SchemaField("_bizon_id", "STRING", mode="REQUIRED"),
             ]
 
-        elif self.config.normalization.type == NormalizationType.DEBEZIUM:
-            assert (
-                "_bizon_message_key" in destination_records[0].source_data
-            ), "Debezium records must have a '_bizon_message_key' key"
-            message_keys = json.loads(destination_records[0].source_data["_bizon_message_key"])
-            return [bigquery.SchemaField(key, "STRING", mode="NULLABLE") for key in message_keys] + [
-                bigquery.SchemaField("_source_data", "STRING", mode="NULLABLE"),
-                bigquery.SchemaField("_source_record_id", "STRING", mode="REQUIRED"),
-                bigquery.SchemaField("_source_timestamp", "TIMESTAMP", mode="REQUIRED"),
-                bigquery.SchemaField("_bizon_extracted_at", "TIMESTAMP", mode="REQUIRED"),
-                bigquery.SchemaField(
-                    "_bizon_loaded_at", "TIMESTAMP", mode="REQUIRED", default_value_expression="CURRENT_TIMESTAMP()"
-                ),
-                bigquery.SchemaField("_bizon_id", "STRING", mode="REQUIRED"),
-            ]
-
         # If normalization is tabular, we parse key / value pairs to columns
         elif self.config.normalization.type == NormalizationType.TABULAR:
-
-
+
+            # We use the first record to infer the schema of tabular data (key / value pairs)
+            source_data_keys = list(json.loads(df_destination_records["source_data"][0]).keys())
+
+            return [bigquery.SchemaField(key, "STRING", mode="NULLABLE") for key in source_data_keys] + [
                 bigquery.SchemaField("_source_record_id", "STRING", mode="REQUIRED"),
                 bigquery.SchemaField("_source_timestamp", "TIMESTAMP", mode="REQUIRED"),
                 bigquery.SchemaField("_bizon_extracted_at", "TIMESTAMP", mode="REQUIRED"),
@@ -108,58 +92,6 @@ class BigQueryDestination(AbstractDestination):
 
         raise NotImplementedError(f"Normalization type {self.config.normalization.type} is not supported")
 
-    def get_batch_records_as_df(self, destination_records: List[DestinationRecord]) -> pd.DataFrame:
-
-        # We keep raw data in a column -> convert the SourceRecord to a DestinationRecord
-        if self.config.normalization.type == NormalizationType.NONE:
-            df = pd.DataFrame([record.to_dict_raw_json_data(parquet=True) for record in destination_records])
-            df["_bizon_loaded_at"] = pd.Timestamp.now(tz=UTC)
-
-        # If normalization is tabular, we can just convert the data to a DataFrame parsing first-level keys
-        elif self.config.normalization.type == NormalizationType.TABULAR:
-            list_data_dict = [record.source_data for record in destination_records]
-            df = pd.DataFrame(list_data_dict).astype(str)
-            df["_bizon_id"] = [uuid4().hex for _ in range(len(destination_records))]
-
-            df["_bizon_extracted_at"] = [
-                int(record.source_timestamp.timestamp() * 1_000_000) for record in destination_records
-            ]
-
-            df["_bizon_loaded_at"] = pd.Timestamp.now(tz=UTC)
-
-            df["_source_record_id"] = [record.source_record_id for record in destination_records]
-
-            # We need to convert the source datetime to a int timestamp
-            df["_source_timestamp"] = [
-                int(record.source_timestamp.timestamp() * 1_000_000) for record in destination_records
-            ]
-
-        elif self.config.normalization.type == NormalizationType.DEBEZIUM:
-            df = pd.DataFrame([record.to_dict_debezium(parquet=True) for record in destination_records])
-            df["_bizon_loaded_at"] = pd.Timestamp.now(tz=UTC)
-
-        else:
-            raise NotImplementedError(f"Normalization type {self.config.normalization.type} is not supported")
-
-        return df
-
-    def convert_and_upload_to_buffer(self, destination_records: List[DestinationRecord]):
-
-        df = self.get_batch_records_as_df(destination_records)
-
-        # Convert DataFrame to Parquet in-memory
-        if self.buffer_format == "parquet":
-            table = pa.Table.from_pandas(df)
-            buffer = io.BytesIO()
-            pq.write_table(table, buffer)
-            buffer.seek(0)
-
-            # Upload the Parquet file to GCS
-            file_name = f"{self.sync_metadata.source_name}/{self.sync_metadata.stream_name}/{str(uuid4())}.parquet"
-            blob = self.buffer_bucket.blob(file_name)
-            blob.upload_from_file(buffer, content_type="application/octet-stream")
-            return file_name
-
     def check_connection(self) -> bool:
         dataset_ref = DatasetReference(self.project_id, self.dataset_id)
 
@@ -179,7 +111,25 @@ class BigQueryDestination(AbstractDestination):
     # https://cloud.google.com/python/docs/reference/storage/latest/retry_timeout
     # https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.dbapi.DataError
 
-    def load_to_bigquery(self, gcs_file: str, destination_records: List[DestinationRecord]):
+    def convert_and_upload_to_buffer(self, df_destination_records: pl.DataFrame) -> str:
+
+        if self.buffer_format == "parquet":
+
+            # Upload the Parquet file to GCS
+            file_name = f"{self.sync_metadata.source_name}/{self.sync_metadata.stream_name}/{str(uuid4())}.parquet"
+
+            with io.BytesIO() as stream:
+                df_destination_records.write_parquet(stream)
+                stream.seek(0)
+
+                blob = self.buffer_bucket.blob(file_name)
+                blob.upload_from_file(stream, content_type="application/octet-stream")
+
+            return file_name
+
+        raise NotImplementedError(f"Buffer format {self.buffer_format} is not supported")
+
+    def load_to_bigquery(self, gcs_file: str, df_destination_records: pl.DataFrame):
 
         # We always partition by the loaded_at field
         time_partitioning = TimePartitioning(field="_bizon_loaded_at", type_=self.config.time_partitioning)
@@ -187,34 +137,41 @@ class BigQueryDestination(AbstractDestination):
         job_config = bigquery.LoadJobConfig(
            source_format=bigquery.SourceFormat.PARQUET,
            write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
-           schema=self.get_bigquery_schema(destination_records=destination_records),
+           schema=self.get_bigquery_schema(df_destination_records=df_destination_records),
            time_partitioning=time_partitioning,
        )
 
-        if self.config.normalization.type == NormalizationType.DEBEZIUM:
-            job_config.clustering_fields = list(
-                json.loads(destination_records[0].source_data["_bizon_message_key"]).keys()
-            )
-
         load_job = self.bq_client.load_table_from_uri(
            f"gs://{self.buffer_bucket_name}/{gcs_file}", self.temp_table_id, job_config=job_config
        )
+        result = load_job.result()  # Waits for the job to complete
+        assert result.state == "DONE", f"Job failed with state {result.state} with error {result.error_result}"
+
+    def write_records(self, df_destination_records: pl.DataFrame) -> Tuple[bool, str]:
+
+        # Rename fields to match BigQuery schema
+        df_destination_records = df_destination_records.rename(
+            {
+                # Bizon fields
+                "bizon_extracted_at": "_bizon_extracted_at",
+                "bizon_id": "_bizon_id",
+                "bizon_loaded_at": "_bizon_loaded_at",
+                # Source fields
+                "source_record_id": "_source_record_id",
+                "source_timestamp": "_source_timestamp",
+                "source_data": "_source_data",
+            },
+        )
 
-
-
-    def write_records(self, destination_records: List[DestinationRecord]) -> Tuple[bool, str]:
-
-        # Here we can check if these IDs are already present in BigQuery
-        # Using SourceRecord.id values
-
-        gs_file_name = self.convert_and_upload_to_buffer(destination_records=destination_records)
+        gs_file_name = self.convert_and_upload_to_buffer(df_destination_records=df_destination_records)
 
         try:
-            self.load_to_bigquery(gs_file_name, destination_records)
+            self.load_to_bigquery(gcs_file=gs_file_name, df_destination_records=df_destination_records)
             self.cleanup(gs_file_name)
         except Exception as e:
             self.cleanup(gs_file_name)
            logger.error(f"Error loading data to BigQuery: {e}")
+            logger.error(traceback.format_exc())
            return False, str(e)
        return True, ""
 
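In the rewritten destination, the buffered Polars DataFrame is written to Parquet in memory, staged in GCS, and then loaded into BigQuery with a load job. Below is a minimal sketch of that pattern outside Bizon, assuming a hypothetical bucket `my-staging-bucket` and table `my_project.my_dataset.my_table`; it is not the package's own code.

import io
from uuid import uuid4

import polars as pl
from google.cloud import bigquery, storage

df = pl.DataFrame({"source_record_id": ["1"], "source_data": ['{"a": 1}']})

# Write the DataFrame to Parquet in memory and upload it to a GCS staging blob
blob_name = f"staging/{uuid4()}.parquet"
bucket = storage.Client().bucket("my-staging-bucket")  # hypothetical bucket name
with io.BytesIO() as stream:
    df.write_parquet(stream)
    stream.seek(0)
    bucket.blob(blob_name).upload_from_file(stream, content_type="application/octet-stream")

# Append the staged Parquet file into the target table
job_config = bigquery.LoadJobConfig(
    source_format=bigquery.SourceFormat.PARQUET,
    write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
)
load_job = bigquery.Client().load_table_from_uri(
    f"gs://my-staging-bucket/{blob_name}", "my_project.my_dataset.my_table", job_config=job_config
)
load_job.result()  # raises if the load job fails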
bizon/destinations/bigquery_streaming/src/config.py
CHANGED

@@ -10,11 +10,6 @@ from bizon.destinations.config import (
 )
 
 
-class GCSBufferFormat(str, Enum):
-    PARQUET = "parquet"
-    CSV = "csv"
-
-
 class TimePartitioning(str, Enum):
     DAY = "DAY"
     HOUR = "HOUR"
@@ -29,7 +24,7 @@ class BigQueryAuthentication(BaseModel):
     )
 
 
-class BigQueryConfigDetails(AbstractDestinationDetailsConfig):
+class BigQueryStreamingConfigDetails(AbstractDestinationDetailsConfig):
     project_id: str
     dataset_id: str
     dataset_location: Optional[str] = "US"
@@ -40,16 +35,9 @@ class BigQueryConfigDetails(AbstractDestinationDetailsConfig):
         default=TimePartitioning.DAY, description="BigQuery Time partitioning type"
     )
     authentication: Optional[BigQueryAuthentication] = None
-
-    buffer_size: int = Field(default=0, description="Buffer size in MB")
-
-    @field_validator("buffer_size", mode="after")
-    def validate_buffer_size(cls, value: int) -> int:
-        if value != 0:
-            raise ValueError("Buffer size must be 0, we directly stream to BigQuery")
-        return value
+    bq_max_rows_per_request: Optional[int] = Field(30000, description="Max rows per buffer streaming request.")
 
 
 class BigQueryStreamingConfig(AbstractDestinationConfig):
     name: Literal[DestinationTypes.BIGQUERY_STREAMING]
-    config: BigQueryConfigDetails
+    config: BigQueryStreamingConfigDetails
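The config change drops the old GCS buffer-format enum and the `buffer_size` validator (streaming no longer stages through GCS) and adds `bq_max_rows_per_request`, which caps how many serialized rows go into one append request. A minimal pydantic sketch of the same field pattern, using an illustrative class name rather than the package's own:

from typing import Optional

from pydantic import BaseModel, Field


class StreamingDetails(BaseModel):  # illustrative stand-in, not the package's class
    project_id: str
    dataset_id: str
    dataset_location: Optional[str] = "US"
    bq_max_rows_per_request: Optional[int] = Field(30000, description="Max rows per streaming append request.")


details = StreamingDetails(project_id="my-project", dataset_id="my_dataset")
assert details.bq_max_rows_per_request == 30000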
bizon/destinations/bigquery_streaming/src/destination.py
CHANGED

@@ -1,29 +1,33 @@
-import json
 import os
 import tempfile
-from typing import List, Tuple
+from concurrent.futures import ThreadPoolExecutor
+from typing import List, Tuple, Type
 
+import polars as pl
 from google.api_core.exceptions import NotFound
-from google.cloud import bigquery, bigquery_storage_v1, storage
+from google.cloud import bigquery, bigquery_storage_v1
 from google.cloud.bigquery import DatasetReference, TimePartitioning
-from google.cloud.bigquery_storage_v1.types import
-
+from google.cloud.bigquery_storage_v1.types import (
+    AppendRowsRequest,
+    ProtoRows,
+    ProtoSchema,
+)
+from google.protobuf.message import Message
 
 from bizon.common.models import SyncMetadata
 from bizon.destinations.config import NormalizationType
 from bizon.destinations.destination import AbstractDestination
-from bizon.destinations.models import DestinationRecord
 from bizon.engine.backend.backend import AbstractBackend
 
-from .config import BigQueryConfigDetails
+from .config import BigQueryStreamingConfigDetails
 from .proto_utils import get_proto_schema_and_class
 
 
 class BigQueryStreamingDestination(AbstractDestination):
 
-    def __init__(self, sync_metadata: SyncMetadata, config: BigQueryConfigDetails, backend: AbstractBackend):
+    def __init__(self, sync_metadata: SyncMetadata, config: BigQueryStreamingConfigDetails, backend: AbstractBackend):
         super().__init__(sync_metadata, config, backend)
-        self.config: BigQueryConfigDetails = config
+        self.config: BigQueryStreamingConfigDetails = config
 
         if config.authentication and config.authentication.service_account_key:
             with tempfile.NamedTemporaryFile(delete=False) as temp:
@@ -34,16 +38,16 @@ class BigQueryStreamingDestination(AbstractDestination):
         self.project_id = config.project_id
         self.bq_client = bigquery.Client(project=self.project_id)
         self.bq_storage_client = bigquery_storage_v1.BigQueryWriteClient()
-        self.gcs_client = storage.Client(project=self.project_id)
         self.dataset_id = config.dataset_id
         self.dataset_location = config.dataset_location
+        self.bq_max_rows_per_request = config.bq_max_rows_per_request
 
     @property
     def table_id(self) -> str:
         tabled_id = self.config.table_id or f"{self.sync_metadata.source_name}_{self.sync_metadata.stream_name}"
         return f"{self.project_id}.{self.dataset_id}.{tabled_id}"
 
-    def get_bigquery_schema(self, destination_records: List[DestinationRecord]) -> List[bigquery.SchemaField]:
+    def get_bigquery_schema(self) -> List[bigquery.SchemaField]:
 
         # we keep raw data in the column source_data
         if self.config.normalization.type == NormalizationType.NONE:
@@ -58,35 +62,6 @@ class BigQueryStreamingDestination(AbstractDestination):
                 bigquery.SchemaField("_bizon_id", "STRING", mode="REQUIRED"),
             ]
 
-        elif self.config.normalization.type == NormalizationType.DEBEZIUM:
-            assert (
-                "_bizon_message_key" in destination_records[0].source_data
-            ), "Debezium records must have a '_bizon_message_key' key"
-            message_keys = json.loads(destination_records[0].source_data["_bizon_message_key"])
-            return [bigquery.SchemaField(key, "STRING", mode="NULLABLE") for key in message_keys] + [
-                bigquery.SchemaField("_source_data", "STRING", mode="NULLABLE"),
-                bigquery.SchemaField("_source_record_id", "STRING", mode="REQUIRED"),
-                bigquery.SchemaField("_source_timestamp", "TIMESTAMP", mode="REQUIRED"),
-                bigquery.SchemaField("_bizon_extracted_at", "TIMESTAMP", mode="REQUIRED"),
-                bigquery.SchemaField(
-                    "_bizon_loaded_at", "TIMESTAMP", mode="REQUIRED", default_value_expression="CURRENT_TIMESTAMP()"
-                ),
-                bigquery.SchemaField("_bizon_id", "STRING", mode="REQUIRED"),
-            ]
-
-        # If normalization is tabular, we parse key / value pairs to columns
-        elif self.config.normalization.type == NormalizationType.TABULAR:
-            first_record_keys = destination_records[0].source_data.keys()
-            return [bigquery.SchemaField(key, "STRING", mode="NULLABLE") for key in first_record_keys] + [
-                bigquery.SchemaField("_source_record_id", "STRING", mode="REQUIRED"),
-                bigquery.SchemaField("_source_timestamp", "TIMESTAMP", mode="REQUIRED"),
-                bigquery.SchemaField("_bizon_extracted_at", "TIMESTAMP", mode="REQUIRED"),
-                bigquery.SchemaField(
-                    "_bizon_loaded_at", "TIMESTAMP", mode="REQUIRED", default_value_expression="CURRENT_TIMESTAMP()"
-                ),
-                bigquery.SchemaField("_bizon_id", "STRING", mode="REQUIRED"),
-            ]
-
         raise NotImplementedError(f"Normalization type {self.config.normalization.type} is not supported")
 
     def check_connection(self) -> bool:
@@ -100,21 +75,45 @@ class BigQueryStreamingDestination(AbstractDestination):
             dataset = self.bq_client.create_dataset(dataset)
         return True
 
-    def load_to_bigquery_via_streaming(self, destination_records: List[DestinationRecord]) -> str:
+    def append_rows_to_stream(
+        self,
+        write_client: bigquery_storage_v1.BigQueryWriteClient,
+        stream_name: str,
+        proto_schema: ProtoSchema,
+        serialized_rows: List[bytes],
+    ):
+        request = AppendRowsRequest(
+            write_stream=stream_name,
+            proto_rows=AppendRowsRequest.ProtoData(
+                rows=ProtoRows(serialized_rows=serialized_rows),
+                writer_schema=proto_schema,
+            ),
+        )
+        response = write_client.append_rows(iter([request]))
+        return response.code().name
+
+    @staticmethod
+    def to_protobuf_serialization(TableRowClass: Type[Message], row: dict) -> bytes:
+        """Convert a row to a protobuf serialization"""
+        record = TableRowClass()
+        record._bizon_id = row["bizon_id"]
+        record._bizon_extracted_at = row["bizon_extracted_at"].strftime("%Y-%m-%d %H:%M:%S.%f")
+        record._bizon_loaded_at = row["bizon_loaded_at"].strftime("%Y-%m-%d %H:%M:%S.%f")
+        record._source_record_id = row["source_record_id"]
+        record._source_timestamp = row["source_timestamp"].strftime("%Y-%m-%d %H:%M:%S.%f")
+        record._source_data = row["source_data"]
+        return record.SerializeToString()
+
+    def load_to_bigquery_via_streaming(self, df_destination_records: pl.DataFrame) -> str:
+        # TODO: for now no clustering keys
         clustering_keys = []
 
-        if self.config.normalization.type == NormalizationType.DEBEZIUM:
-            clustering_keys = list(json.loads(destination_records[0].source_data["_bizon_message_key"]).keys())
-
         # Create table if it doesnt exist
-        schema = self.get_bigquery_schema(destination_records=destination_records)
+        schema = self.get_bigquery_schema()
         table = bigquery.Table(self.table_id, schema=schema)
         time_partitioning = TimePartitioning(field="_bizon_loaded_at", type_=self.config.time_partitioning)
         table.time_partitioning = time_partitioning
 
-        if clustering_keys:
-            table.clustering_fields = clustering_keys
-
         table = self.bq_client.create_table(table, exists_ok=True)
 
         # Create the stream
@@ -127,22 +126,29 @@ class BigQueryStreamingDestination(AbstractDestination):
         proto_schema, TableRow = get_proto_schema_and_class(clustering_keys)
 
         serialized_rows = [
-
-
-            )
-            for record in destination_records
+            self.to_protobuf_serialization(TableRowClass=TableRow, row=row)
+            for row in df_destination_records.iter_rows(named=True)
         ]
 
-
-
-
-
-
-
-
-
-
+        results = []
+        with ThreadPoolExecutor() as executor:
+            futures = [
+                executor.submit(self.append_rows_to_stream, write_client, stream_name, proto_schema, batch_rows)
+                for batch_rows in self.batch(serialized_rows)
+            ]
+            for future in futures:
+                results.append(future.result())
+
+        assert all([r == "OK" for r in results]) is True, "Failed to append rows to stream"
 
-    def write_records(self, destination_records: List[DestinationRecord]) -> Tuple[bool, str]:
-        self.load_to_bigquery_via_streaming(destination_records=destination_records)
+    def write_records(self, df_destination_records: pl.DataFrame) -> Tuple[bool, str]:
+        self.load_to_bigquery_via_streaming(df_destination_records=df_destination_records)
         return True, ""
+
+    def batch(self, iterable):
+        """
+        Yield successive batches of size `batch_size` from `iterable`.
+        """
+
+        for i in range(0, len(iterable), self.bq_max_rows_per_request):
+            yield iterable[i : i + self.bq_max_rows_per_request]  # noqa
bizon/destinations/buffer.py
CHANGED
@@ -1,8 +1,11 @@
-import sys
 from datetime import datetime
 from typing import List
 
-from
+from loguru import logger
+from polars import DataFrame
+from pytz import UTC
+
+from .models import destination_record_schema
 
 
 class DestinationBuffer:
@@ -10,15 +13,15 @@ class DestinationBuffer:
     def __init__(self, buffer_size: int, buffer_flush_timeout: int) -> None:
         self.buffer_size = buffer_size * 1024 * 1024  # Convert to bytes
         self.buffer_flush_timeout = buffer_flush_timeout
-        self.
+        self.df_destination_records: DataFrame = DataFrame(schema=destination_record_schema)
         self._iterations: List[int] = []
         self.pagination = {}
-        self.modified_at: List[datetime] = [datetime.
+        self.modified_at: List[datetime] = [datetime.now(tz=UTC)]
 
     @property
     def current_size(self) -> int:
         """Return buffer size"""
-        return
+        return self.df_destination_records.estimated_size(unit="b")
 
     @property
     def buffer_free_space_pct(self) -> float:
@@ -61,16 +64,20 @@ class DestinationBuffer:
 
     def flush(self):
         """Flush buffer"""
-        self.
+        self.df_destination_records = DataFrame(schema=destination_record_schema)
         self._iterations = []
         self.pagination = {}
         self.modified_at = []
 
     def add_source_iteration_records_to_buffer(
-        self, iteration: int,
+        self, iteration: int, df_destination_records: DataFrame, pagination: dict = None
     ):
         """Add records for the given iteration to buffer"""
-        self.
+        self.df_destination_records.vstack(df_destination_records, in_place=True)
         self._iterations.append(iteration)
         self.pagination = pagination
-        self.modified_at.append(datetime.
+        self.modified_at.append(datetime.now(tz=UTC))
+
+        logger.info(
+            f"Added {df_destination_records.height} records to buffer for iteration {iteration} - {self.df_destination_records.estimated_size(unit='mb')} MB"
+        )