bizon 0.0.10__tar.gz → 0.0.11__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. {bizon-0.0.10 → bizon-0.0.11}/PKG-INFO +2 -1
  2. {bizon-0.0.10 → bizon-0.0.11}/bizon/common/models.py +2 -0
  3. bizon-0.0.11/bizon/destinations/bigquery_streaming/src/config.py +55 -0
  4. bizon-0.0.11/bizon/destinations/bigquery_streaming/src/destination.py +148 -0
  5. bizon-0.0.11/bizon/destinations/bigquery_streaming/src/proto_utils.py +91 -0
  6. {bizon-0.0.10 → bizon-0.0.11}/bizon/destinations/config.py +1 -0
  7. {bizon-0.0.10 → bizon-0.0.11}/bizon/destinations/destination.py +12 -2
  8. {bizon-0.0.10 → bizon-0.0.11}/bizon/destinations/models.py +27 -0
  9. {bizon-0.0.10 → bizon-0.0.11}/bizon/engine/runner/adapters/thread.py +2 -0
  10. {bizon-0.0.10 → bizon-0.0.11}/bizon/sources/kafka/src/source.py +31 -32
  11. {bizon-0.0.10 → bizon-0.0.11}/bizon/sources/kafka/tests/kafka_pipeline.py +1 -1
  12. {bizon-0.0.10 → bizon-0.0.11}/pyproject.toml +2 -1
  13. {bizon-0.0.10 → bizon-0.0.11}/LICENSE +0 -0
  14. {bizon-0.0.10 → bizon-0.0.11}/README.md +0 -0
  15. {bizon-0.0.10 → bizon-0.0.11}/bizon/__main__.py +0 -0
  16. {bizon-0.0.10 → bizon-0.0.11}/bizon/cli/__init__.py +0 -0
  17. {bizon-0.0.10 → bizon-0.0.11}/bizon/cli/main.py +0 -0
  18. {bizon-0.0.10 → bizon-0.0.11}/bizon/cli/utils.py +0 -0
  19. {bizon-0.0.10 → bizon-0.0.11}/bizon/common/errors/backoff.py +0 -0
  20. {bizon-0.0.10 → bizon-0.0.11}/bizon/common/errors/errors.py +0 -0
  21. {bizon-0.0.10 → bizon-0.0.11}/bizon/destinations/bigquery/config/bigquery.example.yml +0 -0
  22. {bizon-0.0.10 → bizon-0.0.11}/bizon/destinations/bigquery/src/config.py +0 -0
  23. {bizon-0.0.10 → bizon-0.0.11}/bizon/destinations/bigquery/src/destination.py +0 -0
  24. {bizon-0.0.10 → bizon-0.0.11}/bizon/destinations/buffer.py +0 -0
  25. {bizon-0.0.10 → bizon-0.0.11}/bizon/destinations/file/src/config.py +0 -0
  26. {bizon-0.0.10 → bizon-0.0.11}/bizon/destinations/file/src/destination.py +0 -0
  27. {bizon-0.0.10 → bizon-0.0.11}/bizon/destinations/logger/src/config.py +0 -0
  28. {bizon-0.0.10 → bizon-0.0.11}/bizon/destinations/logger/src/destination.py +0 -0
  29. {bizon-0.0.10 → bizon-0.0.11}/bizon/engine/backend/adapters/sqlalchemy/backend.py +0 -0
  30. {bizon-0.0.10 → bizon-0.0.11}/bizon/engine/backend/adapters/sqlalchemy/config.py +0 -0
  31. {bizon-0.0.10 → bizon-0.0.11}/bizon/engine/backend/backend.py +0 -0
  32. {bizon-0.0.10 → bizon-0.0.11}/bizon/engine/backend/config.py +0 -0
  33. {bizon-0.0.10 → bizon-0.0.11}/bizon/engine/backend/models.py +0 -0
  34. {bizon-0.0.10 → bizon-0.0.11}/bizon/engine/config.py +0 -0
  35. {bizon-0.0.10 → bizon-0.0.11}/bizon/engine/engine.py +0 -0
  36. {bizon-0.0.10 → bizon-0.0.11}/bizon/engine/pipeline/consumer.py +0 -0
  37. {bizon-0.0.10 → bizon-0.0.11}/bizon/engine/pipeline/models.py +0 -0
  38. {bizon-0.0.10 → bizon-0.0.11}/bizon/engine/pipeline/producer.py +0 -0
  39. {bizon-0.0.10 → bizon-0.0.11}/bizon/engine/queue/adapters/kafka/config.py +0 -0
  40. {bizon-0.0.10 → bizon-0.0.11}/bizon/engine/queue/adapters/kafka/consumer.py +0 -0
  41. {bizon-0.0.10 → bizon-0.0.11}/bizon/engine/queue/adapters/kafka/queue.py +0 -0
  42. {bizon-0.0.10 → bizon-0.0.11}/bizon/engine/queue/adapters/python_queue/config.py +0 -0
  43. {bizon-0.0.10 → bizon-0.0.11}/bizon/engine/queue/adapters/python_queue/consumer.py +0 -0
  44. {bizon-0.0.10 → bizon-0.0.11}/bizon/engine/queue/adapters/python_queue/queue.py +0 -0
  45. {bizon-0.0.10 → bizon-0.0.11}/bizon/engine/queue/adapters/rabbitmq/config.py +0 -0
  46. {bizon-0.0.10 → bizon-0.0.11}/bizon/engine/queue/adapters/rabbitmq/consumer.py +0 -0
  47. {bizon-0.0.10 → bizon-0.0.11}/bizon/engine/queue/adapters/rabbitmq/queue.py +0 -0
  48. {bizon-0.0.10 → bizon-0.0.11}/bizon/engine/queue/config.py +0 -0
  49. {bizon-0.0.10 → bizon-0.0.11}/bizon/engine/queue/queue.py +0 -0
  50. {bizon-0.0.10 → bizon-0.0.11}/bizon/engine/runner/adapters/process.py +0 -0
  51. {bizon-0.0.10 → bizon-0.0.11}/bizon/engine/runner/config.py +0 -0
  52. {bizon-0.0.10 → bizon-0.0.11}/bizon/engine/runner/runner.py +0 -0
  53. {bizon-0.0.10 → bizon-0.0.11}/bizon/source/auth/authenticators/abstract_oauth.py +0 -0
  54. {bizon-0.0.10 → bizon-0.0.11}/bizon/source/auth/authenticators/abstract_token.py +0 -0
  55. {bizon-0.0.10 → bizon-0.0.11}/bizon/source/auth/authenticators/basic.py +0 -0
  56. {bizon-0.0.10 → bizon-0.0.11}/bizon/source/auth/authenticators/cookies.py +0 -0
  57. {bizon-0.0.10 → bizon-0.0.11}/bizon/source/auth/authenticators/oauth.py +0 -0
  58. {bizon-0.0.10 → bizon-0.0.11}/bizon/source/auth/authenticators/token.py +0 -0
  59. {bizon-0.0.10 → bizon-0.0.11}/bizon/source/auth/builder.py +0 -0
  60. {bizon-0.0.10 → bizon-0.0.11}/bizon/source/auth/config.py +0 -0
  61. {bizon-0.0.10 → bizon-0.0.11}/bizon/source/config.py +0 -0
  62. {bizon-0.0.10 → bizon-0.0.11}/bizon/source/cursor.py +0 -0
  63. {bizon-0.0.10 → bizon-0.0.11}/bizon/source/discover.py +0 -0
  64. {bizon-0.0.10 → bizon-0.0.11}/bizon/source/models.py +0 -0
  65. {bizon-0.0.10 → bizon-0.0.11}/bizon/source/session.py +0 -0
  66. {bizon-0.0.10 → bizon-0.0.11}/bizon/source/source.py +0 -0
  67. {bizon-0.0.10 → bizon-0.0.11}/bizon/sources/dummy/config/api_key.example.yml +0 -0
  68. {bizon-0.0.10 → bizon-0.0.11}/bizon/sources/dummy/config/api_key_kafka.example.yml +0 -0
  69. {bizon-0.0.10 → bizon-0.0.11}/bizon/sources/dummy/src/fake_api.py +0 -0
  70. {bizon-0.0.10 → bizon-0.0.11}/bizon/sources/dummy/src/source.py +0 -0
  71. {bizon-0.0.10 → bizon-0.0.11}/bizon/sources/dummy/tests/dummy_pipeline.py +0 -0
  72. {bizon-0.0.10 → bizon-0.0.11}/bizon/sources/dummy/tests/dummy_pipeline_bigquery_backend.py +0 -0
  73. {bizon-0.0.10 → bizon-0.0.11}/bizon/sources/dummy/tests/dummy_pipeline_kafka.py +0 -0
  74. {bizon-0.0.10 → bizon-0.0.11}/bizon/sources/dummy/tests/dummy_pipeline_rabbitmq.py +0 -0
  75. {bizon-0.0.10 → bizon-0.0.11}/bizon/sources/dummy/tests/dummy_pipeline_write_data_bigquery.py +0 -0
  76. {bizon-0.0.10 → bizon-0.0.11}/bizon/sources/dummy/tests/dummy_pipeline_write_data_bigquery_through_kafka.py +0 -0
  77. {bizon-0.0.10 → bizon-0.0.11}/bizon/sources/gsheets/config/default_auth.example.yml +0 -0
  78. {bizon-0.0.10 → bizon-0.0.11}/bizon/sources/gsheets/config/service_account.example.yml +0 -0
  79. {bizon-0.0.10 → bizon-0.0.11}/bizon/sources/gsheets/src/source.py +0 -0
  80. {bizon-0.0.10 → bizon-0.0.11}/bizon/sources/gsheets/tests/gsheets_pipeline.py +0 -0
  81. {bizon-0.0.10 → bizon-0.0.11}/bizon/sources/hubspot/config/api_key.example.yml +0 -0
  82. {bizon-0.0.10 → bizon-0.0.11}/bizon/sources/hubspot/config/oauth.example.yml +0 -0
  83. {bizon-0.0.10 → bizon-0.0.11}/bizon/sources/hubspot/src/hubspot_base.py +0 -0
  84. {bizon-0.0.10 → bizon-0.0.11}/bizon/sources/hubspot/src/hubspot_objects.py +0 -0
  85. {bizon-0.0.10 → bizon-0.0.11}/bizon/sources/hubspot/src/models/hs_object.py +0 -0
  86. {bizon-0.0.10 → bizon-0.0.11}/bizon/sources/hubspot/tests/hubspot_pipeline.py +0 -0
  87. {bizon-0.0.10 → bizon-0.0.11}/bizon/sources/kafka/config/kafka.example.yml +0 -0
  88. {bizon-0.0.10 → bizon-0.0.11}/bizon/sources/periscope/config/periscope_charts.example.yml +0 -0
  89. {bizon-0.0.10 → bizon-0.0.11}/bizon/sources/periscope/config/periscope_dashboards.example.yml +0 -0
  90. {bizon-0.0.10 → bizon-0.0.11}/bizon/sources/periscope/src/source.py +0 -0
  91. {bizon-0.0.10 → bizon-0.0.11}/bizon/sources/periscope/tests/periscope_pipeline_charts.py +0 -0
  92. {bizon-0.0.10 → bizon-0.0.11}/bizon/sources/periscope/tests/periscope_pipeline_dashboard.py +0 -0
  93. {bizon-0.0.10 → bizon-0.0.11}/bizon/utils.py +0 -0
{bizon-0.0.10 → bizon-0.0.11}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: bizon
- Version: 0.0.10
+ Version: 0.0.11
  Summary: Extract and load your data reliably from API Clients with native fault-tolerant and checkpointing mechanism.
  Author: Antoine Balliet
  Author-email: antoine.balliet@gmail.com

@@ -31,6 +31,7 @@ Requires-Dist: loguru (>=0.7.2,<0.8.0)
  Requires-Dist: pandas (>=2.2.2,<3.0.0) ; extra == "bigquery"
  Requires-Dist: pendulum (>=3.0.0,<4.0.0)
  Requires-Dist: pika (>=1.3.2,<2.0.0) ; extra == "rabbitmq"
+ Requires-Dist: protobuf (==4.24.0)
  Requires-Dist: psycopg2-binary (>=2.9.9,<3.0.0) ; extra == "postgres"
  Requires-Dist: pyarrow (>=16.1.0,<17.0.0)
  Requires-Dist: pydantic (>=2.8.2,<3.0.0)
{bizon-0.0.10 → bizon-0.0.11}/bizon/common/models.py

@@ -3,6 +3,7 @@ from typing import Union
  from pydantic import BaseModel, ConfigDict, Field

  from bizon.destinations.bigquery.src.config import BigQueryConfig
+ from bizon.destinations.bigquery_streaming.src.config import BigQueryStreamingConfig
  from bizon.destinations.file.src.config import FileDestinationConfig
  from bizon.destinations.logger.src.config import LoggerConfig
  from bizon.engine.config import EngineConfig

@@ -24,6 +25,7 @@ class BizonConfig(BaseModel):

      destination: Union[
          BigQueryConfig,
+         BigQueryStreamingConfig,
          LoggerConfig,
          FileDestinationConfig,
      ] = Field(
bizon-0.0.11/bizon/destinations/bigquery_streaming/src/config.py

@@ -0,0 +1,55 @@
+ from enum import Enum
+ from typing import Literal, Optional
+
+ from pydantic import BaseModel, Field, field_validator
+
+ from bizon.destinations.config import (
+     AbstractDestinationConfig,
+     AbstractDestinationDetailsConfig,
+     DestinationTypes,
+ )
+
+
+ class GCSBufferFormat(str, Enum):
+     PARQUET = "parquet"
+     CSV = "csv"
+
+
+ class TimePartitioning(str, Enum):
+     DAY = "DAY"
+     HOUR = "HOUR"
+     MONTH = "MONTH"
+     YEAR = "YEAR"
+
+
+ class BigQueryAuthentication(BaseModel):
+     service_account_key: str = Field(
+         description="Service Account Key JSON string. If empty it will be infered",
+         default="",
+     )
+
+
+ class BigQueryConfigDetails(AbstractDestinationDetailsConfig):
+     project_id: str
+     dataset_id: str
+     dataset_location: Optional[str] = "US"
+     table_id: Optional[str] = Field(
+         default=None, description="Table ID, if not provided it will be inferred from source name"
+     )
+     time_partitioning: Optional[TimePartitioning] = Field(
+         default=TimePartitioning.DAY, description="BigQuery Time partitioning type"
+     )
+     authentication: Optional[BigQueryAuthentication] = None
+
+     buffer_size: int = Field(default=0, description="Buffer size in MB")
+
+     @field_validator("buffer_size", mode="after")
+     def validate_buffer_size(cls, value: int) -> int:
+         if value != 0:
+             raise ValueError("Buffer size must be 0, we directly stream to BigQuery")
+         return value
+
+
+ class BigQueryStreamingConfig(AbstractDestinationConfig):
+     name: Literal[DestinationTypes.BIGQUERY_STREAMING]
+     config: BigQueryConfigDetails
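For orientation, the sketch below shows how these models might be instantiated for the new destination. It is illustrative only: project, dataset and table names are placeholders, and it assumes the fields inherited from AbstractDestinationDetailsConfig (such as the normalization settings) have usable defaults.

    from bizon.destinations.bigquery_streaming.src.config import (
        BigQueryConfigDetails,
        BigQueryStreamingConfig,
        TimePartitioning,
    )
    from bizon.destinations.config import DestinationTypes

    # Placeholder identifiers, not real resources.
    details = BigQueryConfigDetails(
        project_id="my-gcp-project",
        dataset_id="bizon_raw",
        table_id="kafka_users",  # optional: inferred from source/stream name when omitted
        time_partitioning=TimePartitioning.DAY,
        buffer_size=0,  # the validator rejects any non-zero value
    )

    config = BigQueryStreamingConfig(
        name=DestinationTypes.BIGQUERY_STREAMING,
        config=details,
    )

The buffer_size validator encodes the design choice behind this destination: batches are streamed to BigQuery immediately rather than accumulated in a buffer.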
bizon-0.0.11/bizon/destinations/bigquery_streaming/src/destination.py

@@ -0,0 +1,148 @@
+ import json
+ import os
+ import tempfile
+ from typing import List, Tuple
+
+ from google.api_core.exceptions import NotFound
+ from google.cloud import bigquery, bigquery_storage_v1, storage
+ from google.cloud.bigquery import DatasetReference, TimePartitioning
+ from google.cloud.bigquery_storage_v1.types import AppendRowsRequest, ProtoRows
+ from loguru import logger
+
+ from bizon.common.models import SyncMetadata
+ from bizon.destinations.config import NormalizationType
+ from bizon.destinations.destination import AbstractDestination
+ from bizon.destinations.models import DestinationRecord
+ from bizon.engine.backend.backend import AbstractBackend
+
+ from .config import BigQueryConfigDetails
+ from .proto_utils import get_proto_schema_and_class
+
+
+ class BigQueryStreamingDestination(AbstractDestination):
+
+     def __init__(self, sync_metadata: SyncMetadata, config: BigQueryConfigDetails, backend: AbstractBackend):
+         super().__init__(sync_metadata, config, backend)
+         self.config: BigQueryConfigDetails = config
+
+         if config.authentication and config.authentication.service_account_key:
+             with tempfile.NamedTemporaryFile(delete=False) as temp:
+                 temp.write(config.authentication.service_account_key.encode())
+                 temp_file_path = temp.name
+             os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = temp_file_path
+
+         self.project_id = config.project_id
+         self.bq_client = bigquery.Client(project=self.project_id)
+         self.bq_storage_client = bigquery_storage_v1.BigQueryWriteClient()
+         self.gcs_client = storage.Client(project=self.project_id)
+         self.dataset_id = config.dataset_id
+         self.dataset_location = config.dataset_location
+
+     @property
+     def table_id(self) -> str:
+         tabled_id = self.config.table_id or f"{self.sync_metadata.source_name}_{self.sync_metadata.stream_name}"
+         return f"{self.project_id}.{self.dataset_id}.{tabled_id}"
+
+     def get_bigquery_schema(self, destination_records: List[DestinationRecord]) -> List[bigquery.SchemaField]:
+
+         # we keep raw data in the column source_data
+         if self.config.normalization.type == NormalizationType.NONE:
+             return [
+                 bigquery.SchemaField("_source_record_id", "STRING", mode="REQUIRED"),
+                 bigquery.SchemaField("_source_timestamp", "TIMESTAMP", mode="REQUIRED"),
+                 bigquery.SchemaField("_source_data", "STRING", mode="NULLABLE"),
+                 bigquery.SchemaField("_bizon_extracted_at", "TIMESTAMP", mode="REQUIRED"),
+                 bigquery.SchemaField(
+                     "_bizon_loaded_at", "TIMESTAMP", mode="REQUIRED", default_value_expression="CURRENT_TIMESTAMP()"
+                 ),
+                 bigquery.SchemaField("_bizon_id", "STRING", mode="REQUIRED"),
+             ]
+
+         elif self.config.normalization.type == NormalizationType.DEBEZIUM:
+             assert (
+                 "_bizon_message_key" in destination_records[0].source_data
+             ), "Debezium records must have a '_bizon_message_key' key"
+             message_keys = json.loads(destination_records[0].source_data["_bizon_message_key"])
+             return [bigquery.SchemaField(key, "STRING", mode="NULLABLE") for key in message_keys] + [
+                 bigquery.SchemaField("_source_data", "STRING", mode="NULLABLE"),
+                 bigquery.SchemaField("_source_record_id", "STRING", mode="REQUIRED"),
+                 bigquery.SchemaField("_source_timestamp", "TIMESTAMP", mode="REQUIRED"),
+                 bigquery.SchemaField("_bizon_extracted_at", "TIMESTAMP", mode="REQUIRED"),
+                 bigquery.SchemaField(
+                     "_bizon_loaded_at", "TIMESTAMP", mode="REQUIRED", default_value_expression="CURRENT_TIMESTAMP()"
+                 ),
+                 bigquery.SchemaField("_bizon_id", "STRING", mode="REQUIRED"),
+             ]
+
+         # If normalization is tabular, we parse key / value pairs to columns
+         elif self.config.normalization.type == NormalizationType.TABULAR:
+             first_record_keys = destination_records[0].source_data.keys()
+             return [bigquery.SchemaField(key, "STRING", mode="NULLABLE") for key in first_record_keys] + [
+                 bigquery.SchemaField("_source_record_id", "STRING", mode="REQUIRED"),
+                 bigquery.SchemaField("_source_timestamp", "TIMESTAMP", mode="REQUIRED"),
+                 bigquery.SchemaField("_bizon_extracted_at", "TIMESTAMP", mode="REQUIRED"),
+                 bigquery.SchemaField(
+                     "_bizon_loaded_at", "TIMESTAMP", mode="REQUIRED", default_value_expression="CURRENT_TIMESTAMP()"
+                 ),
+                 bigquery.SchemaField("_bizon_id", "STRING", mode="REQUIRED"),
+             ]
+
+         raise NotImplementedError(f"Normalization type {self.config.normalization.type} is not supported")
+
+     def check_connection(self) -> bool:
+         dataset_ref = DatasetReference(self.project_id, self.dataset_id)
+
+         try:
+             self.bq_client.get_dataset(dataset_ref)
+         except NotFound:
+             dataset = bigquery.Dataset(dataset_ref)
+             dataset.location = self.dataset_location
+             dataset = self.bq_client.create_dataset(dataset)
+         return True
+
+     def load_to_bigquery_via_streaming(self, destination_records: List[DestinationRecord]) -> str:
+         clustering_keys = []
+
+         if self.config.normalization.type == NormalizationType.DEBEZIUM:
+             clustering_keys = list(json.loads(destination_records[0].source_data["_bizon_message_key"]).keys())
+
+         # Create table if it doesnt exist
+         schema = self.get_bigquery_schema(destination_records=destination_records)
+         table = bigquery.Table(self.table_id, schema=schema)
+         time_partitioning = TimePartitioning(field="_bizon_loaded_at", type_=self.config.time_partitioning)
+         table.time_partitioning = time_partitioning
+
+         if clustering_keys:
+             table.clustering_fields = clustering_keys
+
+         table = self.bq_client.create_table(table, exists_ok=True)
+
+         # Create the stream
+         write_client = self.bq_storage_client
+         tabled_id = self.config.table_id or f"{self.sync_metadata.source_name}_{self.sync_metadata.stream_name}"
+         parent = write_client.table_path(self.project_id, self.dataset_id, tabled_id)
+         stream_name = f"{parent}/_default"
+
+         # Generating the protocol buffer representation of the message descriptor.
+         proto_schema, TableRow = get_proto_schema_and_class(clustering_keys)
+
+         serialized_rows = [
+             record.to_protobuf_serialization(
+                 TableRow, debezium=self.config.normalization.type == NormalizationType.DEBEZIUM
+             )
+             for record in destination_records
+         ]
+
+         request = AppendRowsRequest(
+             write_stream=stream_name,
+             proto_rows=AppendRowsRequest.ProtoData(
+                 rows=ProtoRows(serialized_rows=serialized_rows),
+                 writer_schema=proto_schema,
+             ),
+         )
+         response = write_client.append_rows(iter([request]))
+         assert response.code().name == "OK"
+
+     def write_records(self, destination_records: List[DestinationRecord]) -> Tuple[bool, str]:
+         self.load_to_bigquery_via_streaming(destination_records=destination_records)
+         return True, ""
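The write path serializes each DestinationRecord to protobuf and appends the batch to the table's `_default` write stream, so rows land in BigQuery without a load job or GCS staging. A rough standalone sketch of that append pattern, using the same client libraries imported above; the table coordinates are placeholders and the target table is assumed to already exist with a matching schema (this is not an excerpt from the package):

    from google.cloud import bigquery_storage_v1
    from google.cloud.bigquery_storage_v1.types import AppendRowsRequest, ProtoRows

    from bizon.destinations.bigquery_streaming.src.proto_utils import get_proto_schema_and_class

    write_client = bigquery_storage_v1.BigQueryWriteClient()
    # Placeholder project/dataset/table.
    parent = write_client.table_path("my-gcp-project", "bizon_raw", "kafka_users")

    proto_schema, TableRow = get_proto_schema_and_class()

    # All REQUIRED fields of the dynamic TableRow message must be set before serializing.
    row = TableRow()
    row._bizon_id = "example-uuid"
    row._bizon_extracted_at = "1700000000"
    row._bizon_loaded_at = "1700000000"
    row._source_record_id = "part_0_offset_42"
    row._source_timestamp = "1700000000"
    row._source_data = '{"hello": "world"}'

    request = AppendRowsRequest(
        write_stream=f"{parent}/_default",
        proto_rows=AppendRowsRequest.ProtoData(
            rows=ProtoRows(serialized_rows=[row.SerializeToString()]),
            writer_schema=proto_schema,
        ),
    )
    responses = write_client.append_rows(iter([request]))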
bizon-0.0.11/bizon/destinations/bigquery_streaming/src/proto_utils.py

@@ -0,0 +1,91 @@
+ from typing import List, Tuple, Type
+
+ from google.cloud.bigquery_storage_v1.types import ProtoSchema
+ from google.protobuf.descriptor_pb2 import (
+     DescriptorProto,
+     FieldDescriptorProto,
+     FileDescriptorProto,
+ )
+ from google.protobuf.descriptor_pool import DescriptorPool
+ from google.protobuf.message import Message
+ from google.protobuf.message_factory import GetMessageClassesForFiles
+
+
+ def get_proto_schema_and_class(clustering_keys: List[str] = None) -> Tuple[ProtoSchema, Type[Message]]:
+     # Define the FileDescriptorProto
+     file_descriptor_proto = FileDescriptorProto()
+     file_descriptor_proto.name = "dynamic.proto"
+     file_descriptor_proto.package = "dynamic_package"
+
+     # Define the TableRow message schema
+     message_descriptor = DescriptorProto()
+     message_descriptor.name = "TableRow"
+
+     # Add fields to the message, only use TYPE_STRING, BigQuery does not support other types
+     # It does not imapact data types in final table
+
+     # https://stackoverflow.com/questions/70489919/protobuf-type-for-bigquery-timestamp-field
+     fields = [
+         {"name": "_bizon_id", "type": FieldDescriptorProto.TYPE_STRING, "label": FieldDescriptorProto.LABEL_REQUIRED},
+         {
+             "name": "_bizon_extracted_at",
+             "type": FieldDescriptorProto.TYPE_STRING,
+             "label": FieldDescriptorProto.LABEL_REQUIRED,
+         },
+         {
+             "name": "_bizon_loaded_at",
+             "type": FieldDescriptorProto.TYPE_STRING,
+             "label": FieldDescriptorProto.LABEL_REQUIRED,
+         },
+         {
+             "name": "_source_record_id",
+             "type": FieldDescriptorProto.TYPE_STRING,
+             "label": FieldDescriptorProto.LABEL_REQUIRED,
+         },
+         {
+             "name": "_source_timestamp",
+             "type": FieldDescriptorProto.TYPE_STRING,
+             "label": FieldDescriptorProto.LABEL_REQUIRED,
+         },
+         {
+             "name": "_source_data",
+             "type": FieldDescriptorProto.TYPE_STRING,
+             "label": FieldDescriptorProto.LABEL_OPTIONAL,
+         },
+     ]
+
+     if clustering_keys:
+         for key in clustering_keys:
+             fields.append(
+                 {
+                     "name": key,
+                     "type": FieldDescriptorProto.TYPE_STRING,
+                     "label": FieldDescriptorProto.LABEL_OPTIONAL,
+                 }
+             )
+
+     for i, field in enumerate(fields, start=1):
+         field_descriptor = message_descriptor.field.add()
+         field_descriptor.name = field["name"]
+         field_descriptor.number = i
+         field_descriptor.type = field["type"]
+         field_descriptor.label = field["label"]
+
+     # Add the message to the file descriptor
+     file_descriptor_proto.message_type.add().CopyFrom(message_descriptor)
+
+     # Create a DescriptorPool and register the FileDescriptorProto
+     pool = DescriptorPool()
+     pool.Add(file_descriptor_proto)
+
+     # Use the registered file name to fetch the message classes
+     message_classes = GetMessageClassesForFiles(["dynamic.proto"], pool=pool)
+
+     # Fetch the TableRow class
+     table_row_class = message_classes["dynamic_package.TableRow"]
+
+     # Create the ProtoSchema
+     proto_schema = ProtoSchema()
+     proto_schema.proto_descriptor.CopyFrom(message_descriptor)
+
+     return proto_schema, table_row_class
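Because the descriptor is built at runtime, it can be handy to inspect what get_proto_schema_and_class produces. A small illustrative check, where "team_id" stands in for a hypothetical Debezium clustering key:

    from bizon.destinations.bigquery_streaming.src.proto_utils import get_proto_schema_and_class

    proto_schema, TableRow = get_proto_schema_and_class(clustering_keys=["team_id"])

    # The six fixed columns plus one optional STRING field per clustering key.
    print([f.name for f in proto_schema.proto_descriptor.field])
    # ['_bizon_id', '_bizon_extracted_at', '_bizon_loaded_at',
    #  '_source_record_id', '_source_timestamp', '_source_data', 'team_id']

    print(TableRow.DESCRIPTOR.full_name)  # dynamic_package.TableRow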
{bizon-0.0.10 → bizon-0.0.11}/bizon/destinations/config.py

@@ -6,6 +6,7 @@ from pydantic import BaseModel, ConfigDict, Field

  class DestinationTypes(str, Enum):
      BIGQUERY = "bigquery"
+     BIGQUERY_STREAMING = "bigquery_streaming"
      LOGGER = "logger"
      FILE = "file"

{bizon-0.0.10 → bizon-0.0.11}/bizon/destinations/destination.py

@@ -122,6 +122,11 @@ class AbstractDestination(ABC):

          # Last iteration, write all records to destination
          if last_iteration:
+
+             if len(self.buffer.records) == 0 and self.config.buffer_size == 0:
+                 logger.warning("No records to write to destination, already written, buffer is empty.")
+                 return DestinationBufferStatus.RECORDS_WRITTEN
+
              logger.debug("Writing last iteration records to destination")
              assert len(destination_records) == 0, "Last iteration should not have any records"
              destination_iteration = self.buffer_flush_handler(session=session)

@@ -152,9 +157,9 @@ class AbstractDestination(ABC):
              logger.warning("No records to write to destination. Check source and queue provider.")
              return DestinationBufferStatus.NO_RECORDS

-         # Write records to destination if buffer size is 0
+         # Write records to destination if buffer size is 0 or streaming
          if self.buffer.buffer_size == 0:
-             logger.info("Writing last iteration records to destination")
+             logger.info("Writing records to destination.")
              self.buffer.add_source_iteration_records_to_buffer(
                  iteration=iteration, records=destination_records, pagination=pagination
              )

@@ -263,6 +268,11 @@ class DestinationFactory:

              return BigQueryDestination(sync_metadata=sync_metadata, config=config.config, backend=backend)

+         elif config.name == DestinationTypes.BIGQUERY_STREAMING:
+             from .bigquery_streaming.src.destination import BigQueryStreamingDestination
+
+             return BigQueryStreamingDestination(sync_metadata=sync_metadata, config=config.config, backend=backend)
+
          elif config.name == DestinationTypes.FILE:
              from .file.src.destination import FileDestination

{bizon-0.0.10 → bizon-0.0.11}/bizon/destinations/models.py

@@ -1,7 +1,9 @@
  import json
  from datetime import datetime
+ from typing import Type
  from uuid import uuid4

+ from google.protobuf.message import Message
  from pydantic import BaseModel, Field
  from pytz import UTC

@@ -81,3 +83,28 @@ class DestinationRecord(BaseModel):
              "_source_timestamp": self.source_timestamp,
              "_source_data": json.dumps(self.source_data),
          }
+
+     def to_protobuf_serialization(self, TableRowClass: Type[Message], debezium=False):
+
+         record = TableRowClass()
+         record._bizon_id = self.bizon_id
+         record._bizon_extracted_at = str(int(self.bizon_extracted_at.timestamp()))
+         record._bizon_loaded_at = str(int(self.bizon_loaded_at.timestamp()))
+         record._source_record_id = self.source_record_id
+         record._source_timestamp = str(int(self.source_timestamp.timestamp()))
+
+         if debezium:
+             parsed_debezium_keys = json.loads(self.source_data["_bizon_message_key"])
+             if parsed_debezium_keys:
+                 for _key in parsed_debezium_keys:
+                     setattr(record, _key, str(parsed_debezium_keys[_key]))
+             if self.source_data.get("op") == "d":
+                 source_data = {"__deleted": True, **self.source_data["before"]}
+             else:
+                 source_data = {"__deleted": False, **self.source_data["after"]}
+
+             record._source_data = json.dumps(source_data)
+         else:
+             record._source_data = json.dumps(self.source_data)
+
+         return record.SerializeToString()
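The Debezium branch keeps the before image for deletes and the after image otherwise, tagging both with a __deleted flag inside _source_data. A standalone illustration of that mapping on a made-up change event (plain dicts, not the bizon API):

    import json

    event = {
        "_bizon_message_key": json.dumps({"id": 42}),
        "op": "d",  # Debezium delete event
        "before": {"id": 42, "name": "alice"},
        "after": None,
    }

    if event.get("op") == "d":
        source_data = {"__deleted": True, **event["before"]}
    else:
        source_data = {"__deleted": False, **event["after"]}

    print(json.dumps(source_data))
    # {"__deleted": true, "id": 42, "name": "alice"}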
{bizon-0.0.10 → bizon-0.0.11}/bizon/engine/runner/adapters/thread.py

@@ -1,5 +1,6 @@
  import concurrent.futures
  import time
+ import traceback

  from loguru import logger

@@ -75,5 +76,6 @@ class ThreadRunner(AbstractRunner):
                  future_consumer.result()
              except Exception as e:
                  logger.error(f"Consumer thread stopped running with error {e}")
+                 logger.error(traceback.format_exc())

          return True
{bizon-0.0.10 → bizon-0.0.11}/bizon/sources/kafka/src/source.py

@@ -1,7 +1,7 @@
  import io
  import json
+ import logging
  import struct
- from concurrent.futures import ThreadPoolExecutor, as_completed
  from datetime import datetime, timezone
  from enum import Enum
  from functools import lru_cache

@@ -18,6 +18,9 @@ from bizon.source.config import SourceConfig
  from bizon.source.models import SourceIteration, SourceRecord
  from bizon.source.source import AbstractSource

+ silent_logger = logging.getLogger()
+ silent_logger.addHandler(logging.StreamHandler())
+

  class SchemaRegistryType(str, Enum):
      APICURIO = "apicurio"

@@ -98,7 +101,10 @@ class KafkaSource(AbstractSource):
          }

          # Consumer instance
-         self.consumer = Consumer(self.kafka_consumer_conf, logger=logger)
+         self.consumer = Consumer(self.kafka_consumer_conf, logger=silent_logger)
+
+         # Consumers for each worker thread
+         self.consumers_cached: Mapping[int, Consumer] = {}

      @staticmethod
      def streams() -> List[str]:

@@ -194,25 +200,17 @@ class KafkaSource(AbstractSource):
          global_id = self.parse_global_id_from_serialized_message(header_message)
          return self.get_parsed_avro_schema(global_id).to_json()

-     def read_partition(self, partition: int, topic_offsets: TopicOffsets) -> List[SourceRecord]:
+     def parse_encoded_messages(self, encoded_messages: list) -> List[SourceRecord]:
+
          records = []
-         encoded_messages = []

          # Set the source timestamp to now, otherwise it will be overwritten by the message timestamp
          source_timestamp = datetime.now(tz=timezone.utc)

-         # Set consumer offset params
-         consumer = Consumer(self.kafka_consumer_conf, logger=logger)
-         consumer.assign([TopicPartition(self.config.topic, partition, topic_offsets.get_partition_offset(partition))])
-         consumer.seek(TopicPartition(self.config.topic, partition, topic_offsets.get_partition_offset(partition)))
-
-         # Read messages
-         encoded_messages.extend(consumer.consume(self.config.batch_size, timeout=self.config.consumer_timeout))
-
          for message in encoded_messages:
              if not message.value():
                  logger.debug(
-                     f"Message for partition {partition} and offset {message.offset()} and topic {self.config.topic} is empty, skipping."
+                     f"Message for partition {message.partition()} and offset {message.offset()} and topic {self.config.topic} is empty, skipping."
                  )
                  continue

@@ -233,43 +231,44 @@ class KafkaSource(AbstractSource):
                          data[self.config.timestamp_ms_name] / 1000, tz=timezone.utc
                      )

+                 self.topic_offsets.set_partition_offset(message.partition(), message.offset() + 1)
+
                  records.append(
                      SourceRecord(
-                         id=f"part_{partition}_offset_{message.offset()}",
+                         id=f"part_{message.partition()}_offset_{message.offset()}",
                          timestamp=source_timestamp,
                          data=data,
                      )
                  )
+
              except Exception as e:
                  logger.error(
-                     f"Error while decoding message for partition {partition}: {e} at offset {message.offset()}"
+                     f"Error while decoding message for partition {message.partition()}: {e} at offset {message.offset()}"
                  )
                  continue

-         # Update the offset for the partition
-         if encoded_messages:
-             topic_offsets.set_partition_offset(partition, encoded_messages[-1].offset() + 1)
-         else:
-             logger.warning(f"No new messages found for partition {partition}")
-
-         consumer.close()
-
          return records

      def read_topic(self, pagination: dict = None) -> SourceIteration:
          nb_partitions = self.get_number_of_partitions()

          # Setup offset_pagination
-         topic_offsets = TopicOffsets.model_validate(pagination) if pagination else self.get_offset_partitions()
+         self.topic_offsets = TopicOffsets.model_validate(pagination) if pagination else self.get_offset_partitions()

-         # Use ThreadPoolExecutor to parallelize reading partitions
-         records = []
-         with ThreadPoolExecutor(max_workers=min(nb_partitions, self.config.max_consumer_threads)) as executor:
-             futures = {executor.submit(self.read_partition, i, topic_offsets): i for i in range(nb_partitions)}
-             for future in as_completed(futures):
-                 partition_records = future.result()
-                 records.extend(partition_records)
+         self.consumer.assign(
+             [
+                 TopicPartition(self.config.topic, partition, self.topic_offsets.get_partition_offset(partition))
+                 for partition in range(nb_partitions)
+             ]
+         )

+         t1 = datetime.now()
+         encoded_messages = self.consumer.consume(self.config.batch_size, timeout=self.config.consumer_timeout)
+         logger.info(f"Read Kafka: {len(encoded_messages)} messages in {datetime.now() - t1}")
+
+         records = self.parse_encoded_messages(encoded_messages)
+
+         # Update the offset for the partition
          if not records:
              logger.info("No new records found, stopping iteration")
              return SourceIteration(

@@ -278,7 +277,7 @@ class KafkaSource(AbstractSource):
              )

          return SourceIteration(
-             next_pagination=topic_offsets.model_dump(),
+             next_pagination=self.topic_offsets.model_dump(),
              records=records,
          )

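The refactor drops the per-partition consumers spawned in a ThreadPoolExecutor: read_topic now assigns the single consumer created in __init__ (with a silenced logger) to every partition at its stored offset, pulls one batch with consume(), and checkpoints offsets per message in parse_encoded_messages. A rough standalone sketch of that assign/consume pattern with confluent-kafka; broker, group, topic and offsets are placeholders:

    from confluent_kafka import Consumer, TopicPartition

    consumer = Consumer({"bootstrap.servers": "localhost:9092", "group.id": "bizon-example"})

    topic = "teams_users"
    offsets = {0: 100, 1: 250}  # partition -> next offset to read

    # Assign all partitions at their stored positions, then read one batch across them.
    consumer.assign([TopicPartition(topic, partition, offset) for partition, offset in offsets.items()])
    messages = consumer.consume(1000, timeout=10)

    for message in messages:
        if message.error() or not message.value():
            continue
        # Checkpoint per message, as parse_encoded_messages now does.
        offsets[message.partition()] = message.offset() + 1

    consumer.close()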
{bizon-0.0.10 → bizon-0.0.11}/bizon/sources/kafka/tests/kafka_pipeline.py

@@ -4,6 +4,6 @@ from bizon.engine.engine import RunnerFactory

  if __name__ == "__main__":
      runner = RunnerFactory.create_from_yaml(
-         filepath=os.path.abspath("bizon/sources/kafka/config/kafka_teams_users_eu_west1_c511.yml")
+         filepath=os.path.abspath("bizon/sources/kafka/config/kafka_teams_users.yml")
      )
      runner.run()
{bizon-0.0.10 → bizon-0.0.11}/pyproject.toml

@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "bizon"
- version = "0.0.10"
+ version = "0.0.11"
  description = "Extract and load your data reliably from API Clients with native fault-tolerant and checkpointing mechanism."
  authors = ["Antoine Balliet <antoine.balliet@gmail.com>", "Anas El Mhamdi <anas.elmhamdi@gmail.com>"]
  readme = "README.md"

@@ -44,6 +44,7 @@ python-dotenv = "^1.0.1"
  gspread = { version = "^6.1.2", optional = true }
  click = "^8.1.7"
  pytz = "^2024.2"
+ protobuf = "4.24.0"

  [tool.poetry.extras]
  postgres = ["psycopg2-binary"]