bizon 0.0.8__py3-none-any.whl → 0.0.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bizon/destinations/bigquery/src/config.py +11 -0
- bizon/destinations/bigquery/src/destination.py +39 -3
- bizon/destinations/config.py +1 -0
- bizon/destinations/models.py +33 -0
- bizon/sources/kafka/src/source.py +4 -1
- bizon/sources/kafka/tests/kafka_pipeline.py +3 -1
- {bizon-0.0.8.dist-info → bizon-0.0.9.dist-info}/METADATA +1 -1
- {bizon-0.0.8.dist-info → bizon-0.0.9.dist-info}/RECORD +11 -11
- {bizon-0.0.8.dist-info → bizon-0.0.9.dist-info}/LICENSE +0 -0
- {bizon-0.0.8.dist-info → bizon-0.0.9.dist-info}/WHEEL +0 -0
- {bizon-0.0.8.dist-info → bizon-0.0.9.dist-info}/entry_points.txt +0 -0
bizon/destinations/bigquery/src/config.py
CHANGED

```diff
@@ -15,6 +15,13 @@ class GCSBufferFormat(str, Enum):
     CSV = "csv"


+class TimePartitioning(str, Enum):
+    DAY = "DAY"
+    HOUR = "HOUR"
+    MONTH = "MONTH"
+    YEAR = "YEAR"
+
+
 class BigQueryAuthentication(BaseModel):
     service_account_key: str = Field(
         description="Service Account Key JSON string. If empty it will be infered",
@@ -31,6 +38,10 @@ class BigQueryConfigDetails(AbstractDestinationDetailsConfig):
     )
     gcs_buffer_bucket: str
     gcs_buffer_format: Optional[GCSBufferFormat] = GCSBufferFormat.PARQUET
+
+    time_partitioning: Optional[TimePartitioning] = Field(
+        default=TimePartitioning.DAY, description="BigQuery Time partitioning type"
+    )
     authentication: Optional[BigQueryAuthentication] = None


```
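As a minimal sketch of how the new option behaves: the stand-in model below is hypothetical (the real BigQueryConfigDetails has other required fields not shown in this diff), but it illustrates that `time_partitioning` defaults to `DAY` and that plain strings such as `"HOUR"` are coerced by the str-based Enum.

```python
from enum import Enum
from typing import Optional

from pydantic import BaseModel, Field


class TimePartitioning(str, Enum):
    DAY = "DAY"
    HOUR = "HOUR"
    MONTH = "MONTH"
    YEAR = "YEAR"


class PartitioningOnlyConfig(BaseModel):
    # Hypothetical stand-in for BigQueryConfigDetails, reduced to the new field
    time_partitioning: Optional[TimePartitioning] = Field(
        default=TimePartitioning.DAY, description="BigQuery Time partitioning type"
    )


print(PartitioningOnlyConfig().time_partitioning)                          # TimePartitioning.DAY
print(PartitioningOnlyConfig(time_partitioning="HOUR").time_partitioning)  # TimePartitioning.HOUR
```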
bizon/destinations/bigquery/src/destination.py
CHANGED

```diff
@@ -1,4 +1,5 @@
 import io
+import json
 import os
 import tempfile
 from typing import List, Tuple
@@ -9,7 +10,7 @@ import pyarrow as pa
 import pyarrow.parquet as pq
 from google.api_core.exceptions import NotFound
 from google.cloud import bigquery, storage
-from google.cloud.bigquery import DatasetReference
+from google.cloud.bigquery import DatasetReference, TimePartitioning
 from loguru import logger
 from pytz import UTC

@@ -70,7 +71,25 @@ class BigQueryDestination(AbstractDestination):
                 bigquery.SchemaField("_source_timestamp", "TIMESTAMP", mode="REQUIRED"),
                 bigquery.SchemaField("_source_data", "STRING", mode="NULLABLE"),
                 bigquery.SchemaField("_bizon_extracted_at", "TIMESTAMP", mode="REQUIRED"),
-                bigquery.SchemaField(
+                bigquery.SchemaField(
+                    "_bizon_loaded_at", "TIMESTAMP", mode="REQUIRED", default_value_expression="CURRENT_TIMESTAMP()"
+                ),
+                bigquery.SchemaField("_bizon_id", "STRING", mode="REQUIRED"),
+            ]
+
+        elif self.config.normalization.type == NormalizationType.DEBEZIUM:
+            assert (
+                "_bizon_message_key" in destination_records[0].source_data
+            ), "Debezium records must have a '_bizon_message_key' key"
+            message_keys = json.loads(destination_records[0].source_data["_bizon_message_key"])
+            return [bigquery.SchemaField(key, "STRING", mode="NULLABLE") for key in message_keys] + [
+                bigquery.SchemaField("_source_data", "STRING", mode="NULLABLE"),
+                bigquery.SchemaField("_source_record_id", "STRING", mode="REQUIRED"),
+                bigquery.SchemaField("_source_timestamp", "TIMESTAMP", mode="REQUIRED"),
+                bigquery.SchemaField("_bizon_extracted_at", "TIMESTAMP", mode="REQUIRED"),
+                bigquery.SchemaField(
+                    "_bizon_loaded_at", "TIMESTAMP", mode="REQUIRED", default_value_expression="CURRENT_TIMESTAMP()"
+                ),
                 bigquery.SchemaField("_bizon_id", "STRING", mode="REQUIRED"),
             ]

```
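A short illustration of the new DEBEZIUM branch above: the Debezium message key (captured by the Kafka source as `_bizon_message_key`, see its diff further down) is a JSON object, and each of its keys becomes a leading nullable STRING column. The key payload below is made up.

```python
import json

from google.cloud import bigquery

# Hypothetical Debezium message key as stored in _bizon_message_key
message_key = '{"id": 42, "tenant_id": 7}'

key_fields = [bigquery.SchemaField(key, "STRING", mode="NULLABLE") for key in json.loads(message_key)]
print([field.name for field in key_fields])  # ['id', 'tenant_id']
```

The remaining hunks of destination.py continue below.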
```diff
@@ -81,7 +100,9 @@ class BigQueryDestination(AbstractDestination):
                 bigquery.SchemaField("_source_record_id", "STRING", mode="REQUIRED"),
                 bigquery.SchemaField("_source_timestamp", "TIMESTAMP", mode="REQUIRED"),
                 bigquery.SchemaField("_bizon_extracted_at", "TIMESTAMP", mode="REQUIRED"),
-                bigquery.SchemaField(
+                bigquery.SchemaField(
+                    "_bizon_loaded_at", "TIMESTAMP", mode="REQUIRED", default_value_expression="CURRENT_TIMESTAMP()"
+                ),
                 bigquery.SchemaField("_bizon_id", "STRING", mode="REQUIRED"),
             ]

@@ -113,6 +134,10 @@ class BigQueryDestination(AbstractDestination):
                 int(record.source_timestamp.timestamp() * 1_000_000) for record in destination_records
             ]

+        elif self.config.normalization.type == NormalizationType.DEBEZIUM:
+            df = pd.DataFrame([record.to_dict_debezium(parquet=True) for record in destination_records])
+            df["_bizon_loaded_at"] = pd.Timestamp.now(tz=UTC)
+
         else:
             raise NotImplementedError(f"Normalization type {self.config.normalization.type} is not supported")

```
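On why the `parquet=True` path emits integer microseconds: a sketch under the assumption (suggested by the pyarrow imports) that the GCS buffer is written as Parquet, where a `timestamp("us")` column stores epoch microseconds, so pre-converted integers round-trip cleanly.

```python
from datetime import datetime, timezone

import pyarrow as pa

ts = datetime(2024, 1, 1, 12, 0, 0, tzinfo=timezone.utc)
micros = int(ts.timestamp() * 1_000_000)  # 1704110400000000

# An integer interpreted as epoch microseconds round-trips to the same instant.
column = pa.array([micros], type=pa.timestamp("us", tz="UTC"))
print(column[0].as_py())  # 2024-01-01 12:00:00+00:00
```

The final hunk of destination.py, which wires partitioning and clustering into the load job, follows.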
```diff
@@ -155,15 +180,26 @@ class BigQueryDestination(AbstractDestination):
         # https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.dbapi.DataError

     def load_to_bigquery(self, gcs_file: str, destination_records: List[DestinationRecord]):
+
+        # We always partition by the loaded_at field
+        time_partitioning = TimePartitioning(field="_bizon_loaded_at", type_=self.config.time_partitioning)
+
         job_config = bigquery.LoadJobConfig(
             source_format=bigquery.SourceFormat.PARQUET,
             write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
             schema=self.get_bigquery_schema(destination_records=destination_records),
+            time_partitioning=time_partitioning,
         )

+        if self.config.normalization.type == NormalizationType.DEBEZIUM:
+            job_config.clustering_fields = list(
+                json.loads(destination_records[0].source_data["_bizon_message_key"]).keys()
+            )
+
         load_job = self.bq_client.load_table_from_uri(
             f"gs://{self.buffer_bucket_name}/{gcs_file}", self.temp_table_id, job_config=job_config
         )
+
         load_job.result()

     def write_records(self, destination_records: List[DestinationRecord]) -> Tuple[bool, str]:
```
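Once a load job configured this way has run, the table metadata reflects the settings. A quick way to confirm them, as a sketch assuming application default credentials and a placeholder table id:

```python
from google.cloud import bigquery

client = bigquery.Client()  # assumes application default credentials
table = client.get_table("my-project.my_dataset.my_temp_table")  # hypothetical table id

print(table.time_partitioning)  # e.g. TimePartitioning(field='_bizon_loaded_at', type_='DAY')
print(table.clustering_fields)  # e.g. ['id'] when the Debezium clustering branch applied
```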
bizon/destinations/config.py
CHANGED

```diff
@@ -13,6 +13,7 @@ class DestinationTypes(str, Enum):
 class NormalizationType(str, Enum):
     TABULAR = "tabular"  # Parse key / value pairs to columns
     NONE = "none"  # No normalization, raw data is stored
+    DEBEZIUM = "debezium"  # Debezium normalization


 class NormalizationConfig(BaseModel):
```
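Since NormalizationType is a str-based Enum, the new member compares equal to the plain string a YAML config would carry; a tiny sketch:

```python
from enum import Enum


class NormalizationType(str, Enum):
    TABULAR = "tabular"
    NONE = "none"
    DEBEZIUM = "debezium"


print(NormalizationType("debezium") is NormalizationType.DEBEZIUM)  # True
print(NormalizationType.DEBEZIUM == "debezium")                     # True
```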
bizon/destinations/models.py
CHANGED

```diff
@@ -27,6 +27,39 @@ class DestinationRecord(BaseModel):
             source_data=source_record.data,
         )

+    def to_dict_debezium(self, parquet: bool = False) -> dict:
+        """Return the record as a dict with Debezium data"""
+
+        # Extract keys from Debezium message key and unnest
+        parsed_debezium_keys = json.loads(self.source_data["_bizon_message_key"])
+
+        # Parse Debezium Operation and deleted record
+        if self.source_data.get("op") == "d":
+            parsed_source_data = {"__deleted": True, **self.source_data["before"]}
+        else:
+            parsed_source_data = {"__deleted": False, **self.source_data["after"]}
+
+        if parquet:
+            return {
+                **{k: str(v) for k, v in parsed_debezium_keys.items()},
+                "_bizon_id": self.bizon_id,
+                "_bizon_extracted_at": int(self.bizon_extracted_at.timestamp() * 1_000_000),
+                "_bizon_loaded_at": self.bizon_loaded_at.timestamp(),
+                "_source_record_id": self.source_record_id,
+                "_source_timestamp": int(self.source_timestamp.timestamp() * 1_000_000),
+                "_source_data": json.dumps(parsed_source_data),
+            }
+
+        return {
+            **{k: str(v) for k, v in parsed_debezium_keys.items()},
+            "_bizon_id": self.bizon_id,
+            "_bizon_extracted_at": self.bizon_extracted_at,
+            "_bizon_loaded_at": self.bizon_loaded_at,
+            "_source_record_id": self.source_record_id,
+            "_source_timestamp": self.source_timestamp,
+            "_source_data": json.dumps(parsed_source_data),
+        }
+
     def to_dict_raw_json_data(self, parquet: bool = False) -> str:
         """Return the record as a dict with raw JSON data"""

```
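To make the Debezium parsing concrete, here is the same key/op/before/after logic applied standalone to a made-up delete event (all values hypothetical): delete events (`"op": "d"`) keep the `before` image and set `__deleted`, everything else keeps the `after` image.

```python
import json

# Hypothetical source_data for a Debezium delete event
source_data = {
    "_bizon_message_key": '{"id": 42}',
    "op": "d",
    "before": {"id": 42, "email": "old@example.com"},
    "after": None,
}

parsed_debezium_keys = json.loads(source_data["_bizon_message_key"])
if source_data.get("op") == "d":
    parsed_source_data = {"__deleted": True, **source_data["before"]}
else:
    parsed_source_data = {"__deleted": False, **source_data["after"]}

print({k: str(v) for k, v in parsed_debezium_keys.items()})  # {'id': '42'}
print(json.dumps(parsed_source_data))
# {"__deleted": true, "id": 42, "email": "old@example.com"}
```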
bizon/sources/kafka/src/source.py
CHANGED

```diff
@@ -45,6 +45,8 @@ class KafkaSourceConfig(SourceConfig):
     consumer_timeout: int = Field(10, description="Kafka consumer timeout in seconds")
     group_id: str = Field("bizon", description="Kafka group id")

+    max_consumer_threads: int = Field(16, description="Maximum number of threads for the consumer")
+
     nb_bytes_schema_id: Literal[4, 8] = Field(
         4, description="Number of bytes for the schema id. 4 is the default for majority of the cases"
     )
@@ -223,6 +225,7 @@ class KafkaSource(AbstractSource):
             raise ValueError(f"Number of bytes for schema id {self.config.nb_bytes_schema_id} not supported")

         data = self.decode(message.value(), schema)
+        data["_bizon_message_key"] = message.key().decode("utf-8")

         # Get the source timestamp
         if self.parse_timestamp:
@@ -261,7 +264,7 @@ class KafkaSource(AbstractSource):

         # Use ThreadPoolExecutor to parallelize reading partitions
         records = []
-        with ThreadPoolExecutor(max_workers=nb_partitions) as executor:
+        with ThreadPoolExecutor(max_workers=min(nb_partitions, self.config.max_consumer_threads)) as executor:
             futures = {executor.submit(self.read_partition, i, topic_offsets): i for i in range(nb_partitions)}
             for future in as_completed(futures):
                 partition_records = future.result()
```
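The effect of the new cap, in isolation: the worker count is bounded by max_consumer_threads even when the topic has more partitions (the numbers below are illustrative).

```python
from concurrent.futures import ThreadPoolExecutor

nb_partitions = 64          # illustrative partition count
max_consumer_threads = 16   # default of the new KafkaSourceConfig field

workers = min(nb_partitions, max_consumer_threads)
print(workers)  # 16

# All partitions are still processed; they just share a bounded thread pool.
with ThreadPoolExecutor(max_workers=workers) as executor:
    results = list(executor.map(lambda p: f"partition-{p}", range(nb_partitions)))
print(len(results))  # 64
```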
bizon/sources/kafka/tests/kafka_pipeline.py
CHANGED

```diff
@@ -3,5 +3,7 @@ import os
 from bizon.engine.engine import RunnerFactory

 if __name__ == "__main__":
-    runner = RunnerFactory.create_from_yaml(
+    runner = RunnerFactory.create_from_yaml(
+        filepath=os.path.abspath("bizon/sources/kafka/config/kafka_teams_users_eu_west1_c511.yml")
+    )
     runner.run()
```
{bizon-0.0.8.dist-info → bizon-0.0.9.dist-info}/RECORD
CHANGED

```diff
@@ -6,16 +6,16 @@ bizon/common/errors/backoff.py,sha256=z7RkQt1Npdh0sfD3hBDaiWQKe4iqS6ewvT1Q4Fds5a
 bizon/common/errors/errors.py,sha256=mrYx1uE2kOuR2pEaB7ztK1l2m0E4V-_-hxq-DuILerY,682
 bizon/common/models.py,sha256=7_HKAxOyN9eK8hmqahzHhmK-TYVAuRtGOgf4iadE7FI,1751
 bizon/destinations/bigquery/config/bigquery.example.yml,sha256=mvKtFS_PUuekyMh9xssuwRfFwLtR-rVvpIy5xmF5__k,1261
-bizon/destinations/bigquery/src/config.py,sha256=
-bizon/destinations/bigquery/src/destination.py,sha256=
+bizon/destinations/bigquery/src/config.py,sha256=QlD-FdBJ8Q6nKPrOf5q28lHnyFE8khT41dSR1s2meeM,1378
+bizon/destinations/bigquery/src/destination.py,sha256=tPxE0IpHbR4zDkW5HaiHkgeDRDY2AibIPzY9iftZ2Uc,11079
 bizon/destinations/buffer.py,sha256=bFYkaoge-3AyKfGolqsuB3PWWtdPt65Fllrz-3X_uMI,2594
-bizon/destinations/config.py,sha256=
+bizon/destinations/config.py,sha256=jD4nkG-sg7mzJMFKLErQBkJu7ri0PMbCRVU3xIvFT7E,1686
 bizon/destinations/destination.py,sha256=VAyGPmowNimvK_joZj-6ESk2ezGxDZHnKCIpKRA-Vus,10995
 bizon/destinations/file/src/config.py,sha256=C4BBIKzBH5343iLGR3aCubAGjPo0b2LegsCLjb77uFA,513
 bizon/destinations/file/src/destination.py,sha256=1VCrVdtzAzwSKgYq0JUOc3r2cM7314dV-eIoAFhM_64,1003
 bizon/destinations/logger/src/config.py,sha256=AWY3R9q3ZjD3uQ_KBq8VcW60deKSIHe3qtgCKjdywKk,433
 bizon/destinations/logger/src/destination.py,sha256=xTt03F3AMI9KhQno2tGoCr3eacrO62qjnOlpeEHk6tQ,868
-bizon/destinations/models.py,sha256=
+bizon/destinations/models.py,sha256=hK7yXMoOArLJ5sUS9kgljXMBaq2vqu1l_7u707yS1KM,3630
 bizon/engine/backend/adapters/sqlalchemy/backend.py,sha256=R0CztRGc3_6PdIIgbbrDYD2OJRNhq9PPmD6PYK7-fjk,15567
 bizon/engine/backend/adapters/sqlalchemy/config.py,sha256=K-FpE_-VHnTSAQOduouhXFVy43EkrKbeZLqr9_OfeMw,1846
 bizon/engine/backend/backend.py,sha256=Bodqoo5qJHV0H2zJJeGytaHGiNZmBjnLBxiRgq6M3kE,5844
@@ -76,16 +76,16 @@ bizon/sources/hubspot/src/hubspot_objects.py,sha256=EmABx9XD8q6g4Uc5mHLv5YYl5KcI
 bizon/sources/hubspot/src/models/hs_object.py,sha256=-Y20H3-nenJyySMlvM4TPttPz4O8qm3ArKP_I8pxsuo,1235
 bizon/sources/hubspot/tests/hubspot_pipeline.py,sha256=e6dCF5_MHMySkeiF6kKrSAuCa_48J22-ZeSCZSjrfUI,216
 bizon/sources/kafka/config/kafka.example.yml,sha256=ZyHBmSWZ_5WQaBr9WzD05PuE6vi3hhYgHh2VZ-IU-Iw,755
-bizon/sources/kafka/src/source.py,sha256=
-bizon/sources/kafka/tests/kafka_pipeline.py,sha256=
+bizon/sources/kafka/src/source.py,sha256=28Cn_m8DOzsUdgbq0sUm36I7hB0TWRF6xEzg7TcrPrc,11343
+bizon/sources/kafka/tests/kafka_pipeline.py,sha256=DrMHq96ZDiQ2lWmxEf_aX7HmBg_qNOsSFGTuGmuhly8,252
 bizon/sources/periscope/config/periscope_charts.example.yml,sha256=rpFDAWeU5oZ3UOiX0sSAgd1X5lv6t-s3iqiDPnRqutU,477
 bizon/sources/periscope/config/periscope_dashboards.example.yml,sha256=sN2iGGqCQCvrMXcwxNGq_dR7-KZ1KtYdXmNYKXlfEpg,481
 bizon/sources/periscope/src/source.py,sha256=AZM-HDDjdTWj8akeeofQ_-G8YlnNHEKi2mjEQSYwOvE,7638
 bizon/sources/periscope/tests/periscope_pipeline_charts.py,sha256=mU0JtfhS1KmWsS3iovGhGxK7iPVWiYzjBM_QfRL3ZQI,275
 bizon/sources/periscope/tests/periscope_pipeline_dashboard.py,sha256=vZKN7UfH-lQIWrnfjPqQFjZm28UIw2m9OSg4yS-Wckk,279
 bizon/utils.py,sha256=HXaPiyxpWKoy3XN5vSYOve1ezlFeOYin3aFqTjcabUQ,81
-bizon-0.0.
-bizon-0.0.
-bizon-0.0.
-bizon-0.0.
-bizon-0.0.
+bizon-0.0.9.dist-info/LICENSE,sha256=AW7SjYVT2bBnXOxgDxqy_e_JF8jDCFlMCaPCF11wFDI,1072
+bizon-0.0.9.dist-info/METADATA,sha256=POEtr3jEzvy8ogs2WvJ0rnlcpqFXcnZjLwQebLWxNnw,5646
+bizon-0.0.9.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+bizon-0.0.9.dist-info/entry_points.txt,sha256=wtCd-6JswSY8lPWYSvOf7ASX1zfKgmgXtgg5XQS5274,44
+bizon-0.0.9.dist-info/RECORD,,
```
{bizon-0.0.8.dist-info → bizon-0.0.9.dist-info}/LICENSE
File without changes

{bizon-0.0.8.dist-info → bizon-0.0.9.dist-info}/WHEEL
File without changes

{bizon-0.0.8.dist-info → bizon-0.0.9.dist-info}/entry_points.txt
File without changes