bizon-0.0.8-py3-none-any.whl → bizon-0.0.9-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
bizon/destinations/bigquery/src/config.py

@@ -15,6 +15,13 @@ class GCSBufferFormat(str, Enum):
     CSV = "csv"
 
 
+class TimePartitioning(str, Enum):
+    DAY = "DAY"
+    HOUR = "HOUR"
+    MONTH = "MONTH"
+    YEAR = "YEAR"
+
+
 class BigQueryAuthentication(BaseModel):
     service_account_key: str = Field(
         description="Service Account Key JSON string. If empty it will be infered",
@@ -31,6 +38,10 @@ class BigQueryConfigDetails(AbstractDestinationDetailsConfig):
     )
     gcs_buffer_bucket: str
     gcs_buffer_format: Optional[GCSBufferFormat] = GCSBufferFormat.PARQUET
+
+    time_partitioning: Optional[TimePartitioning] = Field(
+        default=TimePartitioning.DAY, description="BigQuery Time partitioning type"
+    )
     authentication: Optional[BigQueryAuthentication] = None
 
 
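How the new field behaves at config-parsing time, as a minimal sketch (a stand-in model keeps the example self-contained; the real BigQueryConfigDetails has additional required fields):

from typing import Optional

from pydantic import BaseModel, Field

from bizon.destinations.bigquery.src.config import TimePartitioning

class PartitioningOnly(BaseModel):
    # Mirrors the new field: optional, defaulting to daily partitions.
    time_partitioning: Optional[TimePartitioning] = Field(
        default=TimePartitioning.DAY, description="BigQuery Time partitioning type"
    )

assert PartitioningOnly().time_partitioning is TimePartitioning.DAY
# Raw strings from a YAML config are coerced to the enum by pydantic.
assert PartitioningOnly(time_partitioning="HOUR").time_partitioning is TimePartitioning.HOUR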

bizon/destinations/bigquery/src/destination.py

@@ -1,4 +1,5 @@
 import io
+import json
 import os
 import tempfile
 from typing import List, Tuple
@@ -9,7 +10,7 @@ import pyarrow as pa
 import pyarrow.parquet as pq
 from google.api_core.exceptions import NotFound
 from google.cloud import bigquery, storage
-from google.cloud.bigquery import DatasetReference
+from google.cloud.bigquery import DatasetReference, TimePartitioning
 from loguru import logger
 from pytz import UTC
 
@@ -70,7 +71,25 @@ class BigQueryDestination(AbstractDestination):
                 bigquery.SchemaField("_source_timestamp", "TIMESTAMP", mode="REQUIRED"),
                 bigquery.SchemaField("_source_data", "STRING", mode="NULLABLE"),
                 bigquery.SchemaField("_bizon_extracted_at", "TIMESTAMP", mode="REQUIRED"),
-                bigquery.SchemaField("_bizon_loaded_at", "TIMESTAMP", mode="REQUIRED"),
+                bigquery.SchemaField(
+                    "_bizon_loaded_at", "TIMESTAMP", mode="REQUIRED", default_value_expression="CURRENT_TIMESTAMP()"
+                ),
+                bigquery.SchemaField("_bizon_id", "STRING", mode="REQUIRED"),
+            ]
+
+        elif self.config.normalization.type == NormalizationType.DEBEZIUM:
+            assert (
+                "_bizon_message_key" in destination_records[0].source_data
+            ), "Debezium records must have a '_bizon_message_key' key"
+            message_keys = json.loads(destination_records[0].source_data["_bizon_message_key"])
+            return [bigquery.SchemaField(key, "STRING", mode="NULLABLE") for key in message_keys] + [
+                bigquery.SchemaField("_source_data", "STRING", mode="NULLABLE"),
+                bigquery.SchemaField("_source_record_id", "STRING", mode="REQUIRED"),
+                bigquery.SchemaField("_source_timestamp", "TIMESTAMP", mode="REQUIRED"),
+                bigquery.SchemaField("_bizon_extracted_at", "TIMESTAMP", mode="REQUIRED"),
+                bigquery.SchemaField(
+                    "_bizon_loaded_at", "TIMESTAMP", mode="REQUIRED", default_value_expression="CURRENT_TIMESTAMP()"
+                ),
                 bigquery.SchemaField("_bizon_id", "STRING", mode="REQUIRED"),
             ]
 
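To make the new DEBEZIUM branch concrete: the schema is derived from the first record's Kafka message key, one nullable STRING column per key component, followed by the fixed _source_*/_bizon_* columns. A small sketch with a hypothetical two-column primary key:

import json

from google.cloud import bigquery

# Hypothetical Debezium message key: the source table's primary-key
# columns, serialized as JSON by the Kafka source.
message_key = '{"id": 42, "tenant_id": 7}'

key_fields = [
    bigquery.SchemaField(key, "STRING", mode="NULLABLE")
    for key in json.loads(message_key)
]
# -> columns "id" and "tenant_id", ahead of _source_data, _source_record_id, ...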
@@ -81,7 +100,9 @@ class BigQueryDestination(AbstractDestination):
                 bigquery.SchemaField("_source_record_id", "STRING", mode="REQUIRED"),
                 bigquery.SchemaField("_source_timestamp", "TIMESTAMP", mode="REQUIRED"),
                 bigquery.SchemaField("_bizon_extracted_at", "TIMESTAMP", mode="REQUIRED"),
-                bigquery.SchemaField("_bizon_loaded_at", "TIMESTAMP", mode="REQUIRED"),
+                bigquery.SchemaField(
+                    "_bizon_loaded_at", "TIMESTAMP", mode="REQUIRED", default_value_expression="CURRENT_TIMESTAMP()"
+                ),
                 bigquery.SchemaField("_bizon_id", "STRING", mode="REQUIRED"),
             ]
 
@@ -113,6 +134,10 @@ class BigQueryDestination(AbstractDestination):
                 int(record.source_timestamp.timestamp() * 1_000_000) for record in destination_records
             ]
 
+        elif self.config.normalization.type == NormalizationType.DEBEZIUM:
+            df = pd.DataFrame([record.to_dict_debezium(parquet=True) for record in destination_records])
+            df["_bizon_loaded_at"] = pd.Timestamp.now(tz=UTC)
+
         else:
             raise NotImplementedError(f"Normalization type {self.config.normalization.type} is not supported")
 
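The Debezium branch mirrors the existing ones: timestamps destined for Parquet are written as integer microseconds so they load into BigQuery TIMESTAMP columns losslessly, and _bizon_loaded_at is stamped once per batch. An illustrative sketch (values hypothetical):

from datetime import datetime, timezone

import pandas as pd
from pytz import UTC

source_ts = datetime(2024, 1, 15, 12, 30, tzinfo=timezone.utc)

# One row per record, as to_dict_debezium(parquet=True) would produce it.
df = pd.DataFrame([{"_source_timestamp": int(source_ts.timestamp() * 1_000_000)}])
df["_bizon_loaded_at"] = pd.Timestamp.now(tz=UTC)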
@@ -155,15 +180,26 @@ class BigQueryDestination(AbstractDestination):
         # https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.dbapi.DataError
 
     def load_to_bigquery(self, gcs_file: str, destination_records: List[DestinationRecord]):
+
+        # We always partition by the loaded_at field
+        time_partitioning = TimePartitioning(field="_bizon_loaded_at", type_=self.config.time_partitioning)
+
         job_config = bigquery.LoadJobConfig(
             source_format=bigquery.SourceFormat.PARQUET,
             write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
             schema=self.get_bigquery_schema(destination_records=destination_records),
+            time_partitioning=time_partitioning,
         )
 
+        if self.config.normalization.type == NormalizationType.DEBEZIUM:
+            job_config.clustering_fields = list(
+                json.loads(destination_records[0].source_data["_bizon_message_key"]).keys()
+            )
+
         load_job = self.bq_client.load_table_from_uri(
             f"gs://{self.buffer_bucket_name}/{gcs_file}", self.temp_table_id, job_config=job_config
         )
+
         load_job.result()
 
     def write_records(self, destination_records: List[DestinationRecord]) -> Tuple[bool, str]:
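
Putting the pieces together, the load job now declares time partitioning on _bizon_loaded_at for every table and, for Debezium streams, clusters on the primary-key columns. A standalone equivalent of the resulting configuration (clustering keys hypothetical):

from google.cloud import bigquery
from google.cloud.bigquery import TimePartitioning

job_config = bigquery.LoadJobConfig(
    source_format=bigquery.SourceFormat.PARQUET,
    write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
    time_partitioning=TimePartitioning(field="_bizon_loaded_at", type_="DAY"),
)
# Debezium only: cluster on the columns unnested from the message key.
job_config.clustering_fields = ["id", "tenant_id"]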

bizon/destinations/config.py

@@ -13,6 +13,7 @@ class DestinationTypes(str, Enum):
 class NormalizationType(str, Enum):
     TABULAR = "tabular"  # Parse key / value pairs to columns
     NONE = "none"  # No normalization, raw data is stored
+    DEBEZIUM = "debezium"  # Debezium normalization
 
 
 class NormalizationConfig(BaseModel):
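
The new member round-trips from the raw config value like its siblings; a one-line sanity check (import path assumed from the wheel layout):

from bizon.destinations.config import NormalizationType

assert NormalizationType("debezium") is NormalizationType.DEBEZIUM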

bizon/destinations/models.py

@@ -27,6 +27,39 @@ class DestinationRecord(BaseModel):
             source_data=source_record.data,
         )
 
+    def to_dict_debezium(self, parquet: bool = False) -> dict:
+        """Return the record as a dict with Debezium data"""
+
+        # Extract keys from Debezium message key and unnest
+        parsed_debezium_keys = json.loads(self.source_data["_bizon_message_key"])
+
+        # Parse Debezium Operation and deleted record
+        if self.source_data.get("op") == "d":
+            parsed_source_data = {"__deleted": True, **self.source_data["before"]}
+        else:
+            parsed_source_data = {"__deleted": False, **self.source_data["after"]}
+
+        if parquet:
+            return {
+                **{k: str(v) for k, v in parsed_debezium_keys.items()},
+                "_bizon_id": self.bizon_id,
+                "_bizon_extracted_at": int(self.bizon_extracted_at.timestamp() * 1_000_000),
+                "_bizon_loaded_at": self.bizon_loaded_at.timestamp(),
+                "_source_record_id": self.source_record_id,
+                "_source_timestamp": int(self.source_timestamp.timestamp() * 1_000_000),
+                "_source_data": json.dumps(parsed_source_data),
+            }
+
+        return {
+            **{k: str(v) for k, v in parsed_debezium_keys.items()},
+            "_bizon_id": self.bizon_id,
+            "_bizon_extracted_at": self.bizon_extracted_at,
+            "_bizon_loaded_at": self.bizon_loaded_at,
+            "_source_record_id": self.source_record_id,
+            "_source_timestamp": self.source_timestamp,
+            "_source_data": json.dumps(parsed_source_data),
+        }
+
     def to_dict_raw_json_data(self, parquet: bool = False) -> str:
         """Return the record as a dict with raw JSON data"""
 
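A worked example of the flattening logic above, using a hypothetical delete event ("op": "d", so the before-image is kept and flagged):

import json

source_data = {
    "op": "d",
    "before": {"id": 42, "email": "a@example.com"},
    "after": None,
    "_bizon_message_key": '{"id": 42}',
}

# Unnested key columns, stringified: {"id": "42"}
keys = {k: str(v) for k, v in json.loads(source_data["_bizon_message_key"]).items()}

if source_data.get("op") == "d":
    parsed = {"__deleted": True, **source_data["before"]}
else:
    parsed = {"__deleted": False, **source_data["after"]}

json.dumps(parsed)  # '{"__deleted": true, "id": 42, "email": "a@example.com"}'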

bizon/sources/kafka/src/source.py

@@ -45,6 +45,8 @@ class KafkaSourceConfig(SourceConfig):
     consumer_timeout: int = Field(10, description="Kafka consumer timeout in seconds")
     group_id: str = Field("bizon", description="Kafka group id")
 
+    max_consumer_threads: int = Field(16, description="Maximum number of threads for the consumer")
+
     nb_bytes_schema_id: Literal[4, 8] = Field(
         4, description="Number of bytes for the schema id. 4 is the default for majority of the cases"
     )
@@ -223,6 +225,7 @@ class KafkaSource(AbstractSource):
             raise ValueError(f"Number of bytes for schema id {self.config.nb_bytes_schema_id} not supported")
 
         data = self.decode(message.value(), schema)
+        data["_bizon_message_key"] = message.key().decode("utf-8")
 
         # Get the source timestamp
         if self.parse_timestamp:
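
For context, the Kafka client returns the record key as raw bytes; the decoded JSON string travels with the record so the BigQuery destination can recover the primary-key columns later. The new line assumes keyed topics, since key() is None for an unkeyed message. A tiny sketch of the same step:

# Hypothetical raw key bytes, as produced by a Debezium connector.
raw_key = b'{"id": 42}'

data = {"id": 42, "email": "a@example.com"}  # decoded message value
data["_bizon_message_key"] = raw_key.decode("utf-8")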
@@ -261,7 +264,7 @@
 
         # Use ThreadPoolExecutor to parallelize reading partitions
         records = []
-        with ThreadPoolExecutor(max_workers=nb_partitions) as executor:
+        with ThreadPoolExecutor(max_workers=min(nb_partitions, self.config.max_consumer_threads)) as executor:
             futures = {executor.submit(self.read_partition, i, topic_offsets): i for i in range(nb_partitions)}
             for future in as_completed(futures):
                 partition_records = future.result()
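
The new cap keeps wide topics from spawning one thread per partition: the pool size is now the smaller of the partition count and max_consumer_threads (default 16). For instance:

from concurrent.futures import ThreadPoolExecutor

nb_partitions = 64
max_consumer_threads = 16  # the new config default

# 16 workers service all 64 partitions; a 4-partition topic still gets 4.
with ThreadPoolExecutor(max_workers=min(nb_partitions, max_consumer_threads)) as executor:
    pass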

bizon/sources/kafka/tests/kafka_pipeline.py

@@ -3,5 +3,7 @@ import os
 from bizon.engine.engine import RunnerFactory
 
 if __name__ == "__main__":
-    runner = RunnerFactory.create_from_yaml(filepath=os.path.abspath("bizon/sources/kafka/config/kafka.yml"))
+    runner = RunnerFactory.create_from_yaml(
+        filepath=os.path.abspath("bizon/sources/kafka/config/kafka_teams_users_eu_west1_c511.yml")
+    )
     runner.run()

bizon-0.0.9.dist-info/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: bizon
-Version: 0.0.8
+Version: 0.0.9
 Summary: Extract and load your data reliably from API Clients with native fault-tolerant and checkpointing mechanism.
 Author: Antoine Balliet
 Author-email: antoine.balliet@gmail.com

bizon-0.0.9.dist-info/RECORD

@@ -6,16 +6,16 @@ bizon/common/errors/backoff.py,sha256=z7RkQt1Npdh0sfD3hBDaiWQKe4iqS6ewvT1Q4Fds5a
 bizon/common/errors/errors.py,sha256=mrYx1uE2kOuR2pEaB7ztK1l2m0E4V-_-hxq-DuILerY,682
 bizon/common/models.py,sha256=7_HKAxOyN9eK8hmqahzHhmK-TYVAuRtGOgf4iadE7FI,1751
 bizon/destinations/bigquery/config/bigquery.example.yml,sha256=mvKtFS_PUuekyMh9xssuwRfFwLtR-rVvpIy5xmF5__k,1261
-bizon/destinations/bigquery/src/config.py,sha256=HU04yJhFxnSbKi0R3B-xZCPT8CZ3l8KAUm-vRsFAT8w,1119
-bizon/destinations/bigquery/src/destination.py,sha256=m0lCvAuyM4FhbS29n4OuJI3g_7P4X_3d-erDrvb7igY,9138
+bizon/destinations/bigquery/src/config.py,sha256=QlD-FdBJ8Q6nKPrOf5q28lHnyFE8khT41dSR1s2meeM,1378
+bizon/destinations/bigquery/src/destination.py,sha256=tPxE0IpHbR4zDkW5HaiHkgeDRDY2AibIPzY9iftZ2Uc,11079
 bizon/destinations/buffer.py,sha256=bFYkaoge-3AyKfGolqsuB3PWWtdPt65Fllrz-3X_uMI,2594
-bizon/destinations/config.py,sha256=qncIW-rVJdViyNqYW9seBoBK0XMAbn45clC4CcHWrQw,1634
+bizon/destinations/config.py,sha256=jD4nkG-sg7mzJMFKLErQBkJu7ri0PMbCRVU3xIvFT7E,1686
 bizon/destinations/destination.py,sha256=VAyGPmowNimvK_joZj-6ESk2ezGxDZHnKCIpKRA-Vus,10995
 bizon/destinations/file/src/config.py,sha256=C4BBIKzBH5343iLGR3aCubAGjPo0b2LegsCLjb77uFA,513
 bizon/destinations/file/src/destination.py,sha256=1VCrVdtzAzwSKgYq0JUOc3r2cM7314dV-eIoAFhM_64,1003
 bizon/destinations/logger/src/config.py,sha256=AWY3R9q3ZjD3uQ_KBq8VcW60deKSIHe3qtgCKjdywKk,433
 bizon/destinations/logger/src/destination.py,sha256=xTt03F3AMI9KhQno2tGoCr3eacrO62qjnOlpeEHk6tQ,868
-bizon/destinations/models.py,sha256=FJw32LBF9mcniFDhss5lp_UT1cENTwRapaP6iHSUhD8,2111
+bizon/destinations/models.py,sha256=hK7yXMoOArLJ5sUS9kgljXMBaq2vqu1l_7u707yS1KM,3630
 bizon/engine/backend/adapters/sqlalchemy/backend.py,sha256=R0CztRGc3_6PdIIgbbrDYD2OJRNhq9PPmD6PYK7-fjk,15567
 bizon/engine/backend/adapters/sqlalchemy/config.py,sha256=K-FpE_-VHnTSAQOduouhXFVy43EkrKbeZLqr9_OfeMw,1846
 bizon/engine/backend/backend.py,sha256=Bodqoo5qJHV0H2zJJeGytaHGiNZmBjnLBxiRgq6M3kE,5844
@@ -76,16 +76,16 @@ bizon/sources/hubspot/src/hubspot_objects.py,sha256=EmABx9XD8q6g4Uc5mHLv5YYl5KcI
 bizon/sources/hubspot/src/models/hs_object.py,sha256=-Y20H3-nenJyySMlvM4TPttPz4O8qm3ArKP_I8pxsuo,1235
 bizon/sources/hubspot/tests/hubspot_pipeline.py,sha256=e6dCF5_MHMySkeiF6kKrSAuCa_48J22-ZeSCZSjrfUI,216
 bizon/sources/kafka/config/kafka.example.yml,sha256=ZyHBmSWZ_5WQaBr9WzD05PuE6vi3hhYgHh2VZ-IU-Iw,755
-bizon/sources/kafka/src/source.py,sha256=jbyk19RpXSS6CPbEhXa178OzcD-izk4biMr1EiOKBzk,11128
-bizon/sources/kafka/tests/kafka_pipeline.py,sha256=G22IyaStejMhh5zwyyKmLDBpzdigo1uT6qsIxcqVWpg,212
+bizon/sources/kafka/src/source.py,sha256=28Cn_m8DOzsUdgbq0sUm36I7hB0TWRF6xEzg7TcrPrc,11343
+bizon/sources/kafka/tests/kafka_pipeline.py,sha256=DrMHq96ZDiQ2lWmxEf_aX7HmBg_qNOsSFGTuGmuhly8,252
 bizon/sources/periscope/config/periscope_charts.example.yml,sha256=rpFDAWeU5oZ3UOiX0sSAgd1X5lv6t-s3iqiDPnRqutU,477
 bizon/sources/periscope/config/periscope_dashboards.example.yml,sha256=sN2iGGqCQCvrMXcwxNGq_dR7-KZ1KtYdXmNYKXlfEpg,481
 bizon/sources/periscope/src/source.py,sha256=AZM-HDDjdTWj8akeeofQ_-G8YlnNHEKi2mjEQSYwOvE,7638
 bizon/sources/periscope/tests/periscope_pipeline_charts.py,sha256=mU0JtfhS1KmWsS3iovGhGxK7iPVWiYzjBM_QfRL3ZQI,275
 bizon/sources/periscope/tests/periscope_pipeline_dashboard.py,sha256=vZKN7UfH-lQIWrnfjPqQFjZm28UIw2m9OSg4yS-Wckk,279
 bizon/utils.py,sha256=HXaPiyxpWKoy3XN5vSYOve1ezlFeOYin3aFqTjcabUQ,81
-bizon-0.0.8.dist-info/LICENSE,sha256=AW7SjYVT2bBnXOxgDxqy_e_JF8jDCFlMCaPCF11wFDI,1072
-bizon-0.0.8.dist-info/METADATA,sha256=XgDIPAHHLNd2__CX0K9gO1NJwg_UWsgYTEc073-6Uiw,5646
-bizon-0.0.8.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
-bizon-0.0.8.dist-info/entry_points.txt,sha256=wtCd-6JswSY8lPWYSvOf7ASX1zfKgmgXtgg5XQS5274,44
-bizon-0.0.8.dist-info/RECORD,,
+bizon-0.0.9.dist-info/LICENSE,sha256=AW7SjYVT2bBnXOxgDxqy_e_JF8jDCFlMCaPCF11wFDI,1072
+bizon-0.0.9.dist-info/METADATA,sha256=POEtr3jEzvy8ogs2WvJ0rnlcpqFXcnZjLwQebLWxNnw,5646
+bizon-0.0.9.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+bizon-0.0.9.dist-info/entry_points.txt,sha256=wtCd-6JswSY8lPWYSvOf7ASX1zfKgmgXtgg5XQS5274,44
+bizon-0.0.9.dist-info/RECORD,,