bizon 0.0.9__tar.gz → 0.0.11__tar.gz

This diff shows the changes between two package versions as they were published to their public registry. It is provided for informational purposes only.
Files changed (93)
  1. {bizon-0.0.9 → bizon-0.0.11}/PKG-INFO +2 -1
  2. {bizon-0.0.9 → bizon-0.0.11}/bizon/common/models.py +2 -0
  3. bizon-0.0.11/bizon/destinations/bigquery_streaming/src/config.py +55 -0
  4. bizon-0.0.11/bizon/destinations/bigquery_streaming/src/destination.py +148 -0
  5. bizon-0.0.11/bizon/destinations/bigquery_streaming/src/proto_utils.py +91 -0
  6. {bizon-0.0.9 → bizon-0.0.11}/bizon/destinations/config.py +1 -0
  7. {bizon-0.0.9 → bizon-0.0.11}/bizon/destinations/destination.py +18 -3
  8. {bizon-0.0.9 → bizon-0.0.11}/bizon/destinations/models.py +27 -0
  9. {bizon-0.0.9 → bizon-0.0.11}/bizon/engine/pipeline/producer.py +11 -0
  10. {bizon-0.0.9 → bizon-0.0.11}/bizon/engine/queue/adapters/kafka/queue.py +4 -0
  11. {bizon-0.0.9 → bizon-0.0.11}/bizon/engine/queue/adapters/python_queue/queue.py +6 -0
  12. {bizon-0.0.9 → bizon-0.0.11}/bizon/engine/queue/adapters/rabbitmq/queue.py +5 -0
  13. {bizon-0.0.9 → bizon-0.0.11}/bizon/engine/queue/queue.py +6 -1
  14. {bizon-0.0.9 → bizon-0.0.11}/bizon/engine/runner/adapters/thread.py +2 -0
  15. {bizon-0.0.9 → bizon-0.0.11}/bizon/source/discover.py +1 -1
  16. {bizon-0.0.9 → bizon-0.0.11}/bizon/sources/kafka/src/source.py +31 -32
  17. {bizon-0.0.9 → bizon-0.0.11}/bizon/sources/kafka/tests/kafka_pipeline.py +1 -1
  18. {bizon-0.0.9 → bizon-0.0.11}/pyproject.toml +2 -1
  19. {bizon-0.0.9 → bizon-0.0.11}/LICENSE +0 -0
  20. {bizon-0.0.9 → bizon-0.0.11}/README.md +0 -0
  21. {bizon-0.0.9 → bizon-0.0.11}/bizon/__main__.py +0 -0
  22. {bizon-0.0.9 → bizon-0.0.11}/bizon/cli/__init__.py +0 -0
  23. {bizon-0.0.9 → bizon-0.0.11}/bizon/cli/main.py +0 -0
  24. {bizon-0.0.9 → bizon-0.0.11}/bizon/cli/utils.py +0 -0
  25. {bizon-0.0.9 → bizon-0.0.11}/bizon/common/errors/backoff.py +0 -0
  26. {bizon-0.0.9 → bizon-0.0.11}/bizon/common/errors/errors.py +0 -0
  27. {bizon-0.0.9 → bizon-0.0.11}/bizon/destinations/bigquery/config/bigquery.example.yml +0 -0
  28. {bizon-0.0.9 → bizon-0.0.11}/bizon/destinations/bigquery/src/config.py +0 -0
  29. {bizon-0.0.9 → bizon-0.0.11}/bizon/destinations/bigquery/src/destination.py +0 -0
  30. {bizon-0.0.9 → bizon-0.0.11}/bizon/destinations/buffer.py +0 -0
  31. {bizon-0.0.9 → bizon-0.0.11}/bizon/destinations/file/src/config.py +0 -0
  32. {bizon-0.0.9 → bizon-0.0.11}/bizon/destinations/file/src/destination.py +0 -0
  33. {bizon-0.0.9 → bizon-0.0.11}/bizon/destinations/logger/src/config.py +0 -0
  34. {bizon-0.0.9 → bizon-0.0.11}/bizon/destinations/logger/src/destination.py +0 -0
  35. {bizon-0.0.9 → bizon-0.0.11}/bizon/engine/backend/adapters/sqlalchemy/backend.py +0 -0
  36. {bizon-0.0.9 → bizon-0.0.11}/bizon/engine/backend/adapters/sqlalchemy/config.py +0 -0
  37. {bizon-0.0.9 → bizon-0.0.11}/bizon/engine/backend/backend.py +0 -0
  38. {bizon-0.0.9 → bizon-0.0.11}/bizon/engine/backend/config.py +0 -0
  39. {bizon-0.0.9 → bizon-0.0.11}/bizon/engine/backend/models.py +0 -0
  40. {bizon-0.0.9 → bizon-0.0.11}/bizon/engine/config.py +0 -0
  41. {bizon-0.0.9 → bizon-0.0.11}/bizon/engine/engine.py +0 -0
  42. {bizon-0.0.9 → bizon-0.0.11}/bizon/engine/pipeline/consumer.py +0 -0
  43. {bizon-0.0.9 → bizon-0.0.11}/bizon/engine/pipeline/models.py +0 -0
  44. {bizon-0.0.9 → bizon-0.0.11}/bizon/engine/queue/adapters/kafka/config.py +0 -0
  45. {bizon-0.0.9 → bizon-0.0.11}/bizon/engine/queue/adapters/kafka/consumer.py +0 -0
  46. {bizon-0.0.9 → bizon-0.0.11}/bizon/engine/queue/adapters/python_queue/config.py +0 -0
  47. {bizon-0.0.9 → bizon-0.0.11}/bizon/engine/queue/adapters/python_queue/consumer.py +0 -0
  48. {bizon-0.0.9 → bizon-0.0.11}/bizon/engine/queue/adapters/rabbitmq/config.py +0 -0
  49. {bizon-0.0.9 → bizon-0.0.11}/bizon/engine/queue/adapters/rabbitmq/consumer.py +0 -0
  50. {bizon-0.0.9 → bizon-0.0.11}/bizon/engine/queue/config.py +0 -0
  51. {bizon-0.0.9 → bizon-0.0.11}/bizon/engine/runner/adapters/process.py +0 -0
  52. {bizon-0.0.9 → bizon-0.0.11}/bizon/engine/runner/config.py +0 -0
  53. {bizon-0.0.9 → bizon-0.0.11}/bizon/engine/runner/runner.py +0 -0
  54. {bizon-0.0.9 → bizon-0.0.11}/bizon/source/auth/authenticators/abstract_oauth.py +0 -0
  55. {bizon-0.0.9 → bizon-0.0.11}/bizon/source/auth/authenticators/abstract_token.py +0 -0
  56. {bizon-0.0.9 → bizon-0.0.11}/bizon/source/auth/authenticators/basic.py +0 -0
  57. {bizon-0.0.9 → bizon-0.0.11}/bizon/source/auth/authenticators/cookies.py +0 -0
  58. {bizon-0.0.9 → bizon-0.0.11}/bizon/source/auth/authenticators/oauth.py +0 -0
  59. {bizon-0.0.9 → bizon-0.0.11}/bizon/source/auth/authenticators/token.py +0 -0
  60. {bizon-0.0.9 → bizon-0.0.11}/bizon/source/auth/builder.py +0 -0
  61. {bizon-0.0.9 → bizon-0.0.11}/bizon/source/auth/config.py +0 -0
  62. {bizon-0.0.9 → bizon-0.0.11}/bizon/source/config.py +0 -0
  63. {bizon-0.0.9 → bizon-0.0.11}/bizon/source/cursor.py +0 -0
  64. {bizon-0.0.9 → bizon-0.0.11}/bizon/source/models.py +0 -0
  65. {bizon-0.0.9 → bizon-0.0.11}/bizon/source/session.py +0 -0
  66. {bizon-0.0.9 → bizon-0.0.11}/bizon/source/source.py +0 -0
  67. {bizon-0.0.9 → bizon-0.0.11}/bizon/sources/dummy/config/api_key.example.yml +0 -0
  68. {bizon-0.0.9 → bizon-0.0.11}/bizon/sources/dummy/config/api_key_kafka.example.yml +0 -0
  69. {bizon-0.0.9 → bizon-0.0.11}/bizon/sources/dummy/src/fake_api.py +0 -0
  70. {bizon-0.0.9 → bizon-0.0.11}/bizon/sources/dummy/src/source.py +0 -0
  71. {bizon-0.0.9 → bizon-0.0.11}/bizon/sources/dummy/tests/dummy_pipeline.py +0 -0
  72. {bizon-0.0.9 → bizon-0.0.11}/bizon/sources/dummy/tests/dummy_pipeline_bigquery_backend.py +0 -0
  73. {bizon-0.0.9 → bizon-0.0.11}/bizon/sources/dummy/tests/dummy_pipeline_kafka.py +0 -0
  74. {bizon-0.0.9 → bizon-0.0.11}/bizon/sources/dummy/tests/dummy_pipeline_rabbitmq.py +0 -0
  75. {bizon-0.0.9 → bizon-0.0.11}/bizon/sources/dummy/tests/dummy_pipeline_write_data_bigquery.py +0 -0
  76. {bizon-0.0.9 → bizon-0.0.11}/bizon/sources/dummy/tests/dummy_pipeline_write_data_bigquery_through_kafka.py +0 -0
  77. {bizon-0.0.9 → bizon-0.0.11}/bizon/sources/gsheets/config/default_auth.example.yml +0 -0
  78. {bizon-0.0.9 → bizon-0.0.11}/bizon/sources/gsheets/config/service_account.example.yml +0 -0
  79. {bizon-0.0.9 → bizon-0.0.11}/bizon/sources/gsheets/src/source.py +0 -0
  80. {bizon-0.0.9 → bizon-0.0.11}/bizon/sources/gsheets/tests/gsheets_pipeline.py +0 -0
  81. {bizon-0.0.9 → bizon-0.0.11}/bizon/sources/hubspot/config/api_key.example.yml +0 -0
  82. {bizon-0.0.9 → bizon-0.0.11}/bizon/sources/hubspot/config/oauth.example.yml +0 -0
  83. {bizon-0.0.9 → bizon-0.0.11}/bizon/sources/hubspot/src/hubspot_base.py +0 -0
  84. {bizon-0.0.9 → bizon-0.0.11}/bizon/sources/hubspot/src/hubspot_objects.py +0 -0
  85. {bizon-0.0.9 → bizon-0.0.11}/bizon/sources/hubspot/src/models/hs_object.py +0 -0
  86. {bizon-0.0.9 → bizon-0.0.11}/bizon/sources/hubspot/tests/hubspot_pipeline.py +0 -0
  87. {bizon-0.0.9 → bizon-0.0.11}/bizon/sources/kafka/config/kafka.example.yml +0 -0
  88. {bizon-0.0.9 → bizon-0.0.11}/bizon/sources/periscope/config/periscope_charts.example.yml +0 -0
  89. {bizon-0.0.9 → bizon-0.0.11}/bizon/sources/periscope/config/periscope_dashboards.example.yml +0 -0
  90. {bizon-0.0.9 → bizon-0.0.11}/bizon/sources/periscope/src/source.py +0 -0
  91. {bizon-0.0.9 → bizon-0.0.11}/bizon/sources/periscope/tests/periscope_pipeline_charts.py +0 -0
  92. {bizon-0.0.9 → bizon-0.0.11}/bizon/sources/periscope/tests/periscope_pipeline_dashboard.py +0 -0
  93. {bizon-0.0.9 → bizon-0.0.11}/bizon/utils.py +0 -0
{bizon-0.0.9 → bizon-0.0.11}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: bizon
- Version: 0.0.9
+ Version: 0.0.11
  Summary: Extract and load your data reliably from API Clients with native fault-tolerant and checkpointing mechanism.
  Author: Antoine Balliet
  Author-email: antoine.balliet@gmail.com
@@ -31,6 +31,7 @@ Requires-Dist: loguru (>=0.7.2,<0.8.0)
  Requires-Dist: pandas (>=2.2.2,<3.0.0) ; extra == "bigquery"
  Requires-Dist: pendulum (>=3.0.0,<4.0.0)
  Requires-Dist: pika (>=1.3.2,<2.0.0) ; extra == "rabbitmq"
+ Requires-Dist: protobuf (==4.24.0)
  Requires-Dist: psycopg2-binary (>=2.9.9,<3.0.0) ; extra == "postgres"
  Requires-Dist: pyarrow (>=16.1.0,<17.0.0)
  Requires-Dist: pydantic (>=2.8.2,<3.0.0)

{bizon-0.0.9 → bizon-0.0.11}/bizon/common/models.py
@@ -3,6 +3,7 @@ from typing import Union
  from pydantic import BaseModel, ConfigDict, Field
 
  from bizon.destinations.bigquery.src.config import BigQueryConfig
+ from bizon.destinations.bigquery_streaming.src.config import BigQueryStreamingConfig
  from bizon.destinations.file.src.config import FileDestinationConfig
  from bizon.destinations.logger.src.config import LoggerConfig
  from bizon.engine.config import EngineConfig
@@ -24,6 +25,7 @@ class BizonConfig(BaseModel):
 
      destination: Union[
          BigQueryConfig,
+         BigQueryStreamingConfig,
          LoggerConfig,
          FileDestinationConfig,
      ] = Field(

bizon-0.0.11/bizon/destinations/bigquery_streaming/src/config.py
@@ -0,0 +1,55 @@
+ from enum import Enum
+ from typing import Literal, Optional
+ 
+ from pydantic import BaseModel, Field, field_validator
+ 
+ from bizon.destinations.config import (
+     AbstractDestinationConfig,
+     AbstractDestinationDetailsConfig,
+     DestinationTypes,
+ )
+ 
+ 
+ class GCSBufferFormat(str, Enum):
+     PARQUET = "parquet"
+     CSV = "csv"
+ 
+ 
+ class TimePartitioning(str, Enum):
+     DAY = "DAY"
+     HOUR = "HOUR"
+     MONTH = "MONTH"
+     YEAR = "YEAR"
+ 
+ 
+ class BigQueryAuthentication(BaseModel):
+     service_account_key: str = Field(
+         description="Service Account Key JSON string. If empty it will be infered",
+         default="",
+     )
+ 
+ 
+ class BigQueryConfigDetails(AbstractDestinationDetailsConfig):
+     project_id: str
+     dataset_id: str
+     dataset_location: Optional[str] = "US"
+     table_id: Optional[str] = Field(
+         default=None, description="Table ID, if not provided it will be inferred from source name"
+     )
+     time_partitioning: Optional[TimePartitioning] = Field(
+         default=TimePartitioning.DAY, description="BigQuery Time partitioning type"
+     )
+     authentication: Optional[BigQueryAuthentication] = None
+ 
+     buffer_size: int = Field(default=0, description="Buffer size in MB")
+ 
+     @field_validator("buffer_size", mode="after")
+     def validate_buffer_size(cls, value: int) -> int:
+         if value != 0:
+             raise ValueError("Buffer size must be 0, we directly stream to BigQuery")
+         return value
+ 
+ 
+ class BigQueryStreamingConfig(AbstractDestinationConfig):
+     name: Literal[DestinationTypes.BIGQUERY_STREAMING]
+     config: BigQueryConfigDetails

bizon-0.0.11/bizon/destinations/bigquery_streaming/src/destination.py
@@ -0,0 +1,148 @@
+ import json
+ import os
+ import tempfile
+ from typing import List, Tuple
+ 
+ from google.api_core.exceptions import NotFound
+ from google.cloud import bigquery, bigquery_storage_v1, storage
+ from google.cloud.bigquery import DatasetReference, TimePartitioning
+ from google.cloud.bigquery_storage_v1.types import AppendRowsRequest, ProtoRows
+ from loguru import logger
+ 
+ from bizon.common.models import SyncMetadata
+ from bizon.destinations.config import NormalizationType
+ from bizon.destinations.destination import AbstractDestination
+ from bizon.destinations.models import DestinationRecord
+ from bizon.engine.backend.backend import AbstractBackend
+ 
+ from .config import BigQueryConfigDetails
+ from .proto_utils import get_proto_schema_and_class
+ 
+ 
+ class BigQueryStreamingDestination(AbstractDestination):
+ 
+     def __init__(self, sync_metadata: SyncMetadata, config: BigQueryConfigDetails, backend: AbstractBackend):
+         super().__init__(sync_metadata, config, backend)
+         self.config: BigQueryConfigDetails = config
+ 
+         if config.authentication and config.authentication.service_account_key:
+             with tempfile.NamedTemporaryFile(delete=False) as temp:
+                 temp.write(config.authentication.service_account_key.encode())
+                 temp_file_path = temp.name
+             os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = temp_file_path
+ 
+         self.project_id = config.project_id
+         self.bq_client = bigquery.Client(project=self.project_id)
+         self.bq_storage_client = bigquery_storage_v1.BigQueryWriteClient()
+         self.gcs_client = storage.Client(project=self.project_id)
+         self.dataset_id = config.dataset_id
+         self.dataset_location = config.dataset_location
+ 
+     @property
+     def table_id(self) -> str:
+         tabled_id = self.config.table_id or f"{self.sync_metadata.source_name}_{self.sync_metadata.stream_name}"
+         return f"{self.project_id}.{self.dataset_id}.{tabled_id}"
+ 
+     def get_bigquery_schema(self, destination_records: List[DestinationRecord]) -> List[bigquery.SchemaField]:
+ 
+         # we keep raw data in the column source_data
+         if self.config.normalization.type == NormalizationType.NONE:
+             return [
+                 bigquery.SchemaField("_source_record_id", "STRING", mode="REQUIRED"),
+                 bigquery.SchemaField("_source_timestamp", "TIMESTAMP", mode="REQUIRED"),
+                 bigquery.SchemaField("_source_data", "STRING", mode="NULLABLE"),
+                 bigquery.SchemaField("_bizon_extracted_at", "TIMESTAMP", mode="REQUIRED"),
+                 bigquery.SchemaField(
+                     "_bizon_loaded_at", "TIMESTAMP", mode="REQUIRED", default_value_expression="CURRENT_TIMESTAMP()"
+                 ),
+                 bigquery.SchemaField("_bizon_id", "STRING", mode="REQUIRED"),
+             ]
+ 
+         elif self.config.normalization.type == NormalizationType.DEBEZIUM:
+             assert (
+                 "_bizon_message_key" in destination_records[0].source_data
+             ), "Debezium records must have a '_bizon_message_key' key"
+             message_keys = json.loads(destination_records[0].source_data["_bizon_message_key"])
+             return [bigquery.SchemaField(key, "STRING", mode="NULLABLE") for key in message_keys] + [
+                 bigquery.SchemaField("_source_data", "STRING", mode="NULLABLE"),
+                 bigquery.SchemaField("_source_record_id", "STRING", mode="REQUIRED"),
+                 bigquery.SchemaField("_source_timestamp", "TIMESTAMP", mode="REQUIRED"),
+                 bigquery.SchemaField("_bizon_extracted_at", "TIMESTAMP", mode="REQUIRED"),
+                 bigquery.SchemaField(
+                     "_bizon_loaded_at", "TIMESTAMP", mode="REQUIRED", default_value_expression="CURRENT_TIMESTAMP()"
+                 ),
+                 bigquery.SchemaField("_bizon_id", "STRING", mode="REQUIRED"),
+             ]
+ 
+         # If normalization is tabular, we parse key / value pairs to columns
+         elif self.config.normalization.type == NormalizationType.TABULAR:
+             first_record_keys = destination_records[0].source_data.keys()
+             return [bigquery.SchemaField(key, "STRING", mode="NULLABLE") for key in first_record_keys] + [
+                 bigquery.SchemaField("_source_record_id", "STRING", mode="REQUIRED"),
+                 bigquery.SchemaField("_source_timestamp", "TIMESTAMP", mode="REQUIRED"),
+                 bigquery.SchemaField("_bizon_extracted_at", "TIMESTAMP", mode="REQUIRED"),
+                 bigquery.SchemaField(
+                     "_bizon_loaded_at", "TIMESTAMP", mode="REQUIRED", default_value_expression="CURRENT_TIMESTAMP()"
+                 ),
+                 bigquery.SchemaField("_bizon_id", "STRING", mode="REQUIRED"),
+             ]
+ 
+         raise NotImplementedError(f"Normalization type {self.config.normalization.type} is not supported")
+ 
+     def check_connection(self) -> bool:
+         dataset_ref = DatasetReference(self.project_id, self.dataset_id)
+ 
+         try:
+             self.bq_client.get_dataset(dataset_ref)
+         except NotFound:
+             dataset = bigquery.Dataset(dataset_ref)
+             dataset.location = self.dataset_location
+             dataset = self.bq_client.create_dataset(dataset)
+         return True
+ 
+     def load_to_bigquery_via_streaming(self, destination_records: List[DestinationRecord]) -> str:
+         clustering_keys = []
+ 
+         if self.config.normalization.type == NormalizationType.DEBEZIUM:
+             clustering_keys = list(json.loads(destination_records[0].source_data["_bizon_message_key"]).keys())
+ 
+         # Create table if it doesnt exist
+         schema = self.get_bigquery_schema(destination_records=destination_records)
+         table = bigquery.Table(self.table_id, schema=schema)
+         time_partitioning = TimePartitioning(field="_bizon_loaded_at", type_=self.config.time_partitioning)
+         table.time_partitioning = time_partitioning
+ 
+         if clustering_keys:
+             table.clustering_fields = clustering_keys
+ 
+         table = self.bq_client.create_table(table, exists_ok=True)
+ 
+         # Create the stream
+         write_client = self.bq_storage_client
+         tabled_id = self.config.table_id or f"{self.sync_metadata.source_name}_{self.sync_metadata.stream_name}"
+         parent = write_client.table_path(self.project_id, self.dataset_id, tabled_id)
+         stream_name = f"{parent}/_default"
+ 
+         # Generating the protocol buffer representation of the message descriptor.
+         proto_schema, TableRow = get_proto_schema_and_class(clustering_keys)
+ 
+         serialized_rows = [
+             record.to_protobuf_serialization(
+                 TableRow, debezium=self.config.normalization.type == NormalizationType.DEBEZIUM
+             )
+             for record in destination_records
+         ]
+ 
+         request = AppendRowsRequest(
+             write_stream=stream_name,
+             proto_rows=AppendRowsRequest.ProtoData(
+                 rows=ProtoRows(serialized_rows=serialized_rows),
+                 writer_schema=proto_schema,
+             ),
+         )
+         response = write_client.append_rows(iter([request]))
+         assert response.code().name == "OK"
+ 
+     def write_records(self, destination_records: List[DestinationRecord]) -> Tuple[bool, str]:
+         self.load_to_bigquery_via_streaming(destination_records=destination_records)
+         return True, ""

bizon-0.0.11/bizon/destinations/bigquery_streaming/src/proto_utils.py
@@ -0,0 +1,91 @@
+ from typing import List, Tuple, Type
+ 
+ from google.cloud.bigquery_storage_v1.types import ProtoSchema
+ from google.protobuf.descriptor_pb2 import (
+     DescriptorProto,
+     FieldDescriptorProto,
+     FileDescriptorProto,
+ )
+ from google.protobuf.descriptor_pool import DescriptorPool
+ from google.protobuf.message import Message
+ from google.protobuf.message_factory import GetMessageClassesForFiles
+ 
+ 
+ def get_proto_schema_and_class(clustering_keys: List[str] = None) -> Tuple[ProtoSchema, Type[Message]]:
+     # Define the FileDescriptorProto
+     file_descriptor_proto = FileDescriptorProto()
+     file_descriptor_proto.name = "dynamic.proto"
+     file_descriptor_proto.package = "dynamic_package"
+ 
+     # Define the TableRow message schema
+     message_descriptor = DescriptorProto()
+     message_descriptor.name = "TableRow"
+ 
+     # Add fields to the message, only use TYPE_STRING, BigQuery does not support other types
+     # It does not imapact data types in final table
+ 
+     # https://stackoverflow.com/questions/70489919/protobuf-type-for-bigquery-timestamp-field
+     fields = [
+         {"name": "_bizon_id", "type": FieldDescriptorProto.TYPE_STRING, "label": FieldDescriptorProto.LABEL_REQUIRED},
+         {
+             "name": "_bizon_extracted_at",
+             "type": FieldDescriptorProto.TYPE_STRING,
+             "label": FieldDescriptorProto.LABEL_REQUIRED,
+         },
+         {
+             "name": "_bizon_loaded_at",
+             "type": FieldDescriptorProto.TYPE_STRING,
+             "label": FieldDescriptorProto.LABEL_REQUIRED,
+         },
+         {
+             "name": "_source_record_id",
+             "type": FieldDescriptorProto.TYPE_STRING,
+             "label": FieldDescriptorProto.LABEL_REQUIRED,
+         },
+         {
+             "name": "_source_timestamp",
+             "type": FieldDescriptorProto.TYPE_STRING,
+             "label": FieldDescriptorProto.LABEL_REQUIRED,
+         },
+         {
+             "name": "_source_data",
+             "type": FieldDescriptorProto.TYPE_STRING,
+             "label": FieldDescriptorProto.LABEL_OPTIONAL,
+         },
+     ]
+ 
+     if clustering_keys:
+         for key in clustering_keys:
+             fields.append(
+                 {
+                     "name": key,
+                     "type": FieldDescriptorProto.TYPE_STRING,
+                     "label": FieldDescriptorProto.LABEL_OPTIONAL,
+                 }
+             )
+ 
+     for i, field in enumerate(fields, start=1):
+         field_descriptor = message_descriptor.field.add()
+         field_descriptor.name = field["name"]
+         field_descriptor.number = i
+         field_descriptor.type = field["type"]
+         field_descriptor.label = field["label"]
+ 
+     # Add the message to the file descriptor
+     file_descriptor_proto.message_type.add().CopyFrom(message_descriptor)
+ 
+     # Create a DescriptorPool and register the FileDescriptorProto
+     pool = DescriptorPool()
+     pool.Add(file_descriptor_proto)
+ 
+     # Use the registered file name to fetch the message classes
+     message_classes = GetMessageClassesForFiles(["dynamic.proto"], pool=pool)
+ 
+     # Fetch the TableRow class
+     table_row_class = message_classes["dynamic_package.TableRow"]
+ 
+     # Create the ProtoSchema
+     proto_schema = ProtoSchema()
+     proto_schema.proto_descriptor.CopyFrom(message_descriptor)
+ 
+     return proto_schema, table_row_class
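
The helper above pairs with the streaming destination: its return values feed the BigQuery Storage Write API default stream. A minimal standalone sketch of that hand-off (table path and field values are placeholders; the request is only built here, not sent):

    from google.cloud.bigquery_storage_v1.types import AppendRowsRequest, ProtoRows

    from bizon.destinations.bigquery_streaming.src.proto_utils import get_proto_schema_and_class

    # Build the dynamic descriptor and message class, with one extra clustering column "id"
    proto_schema, TableRow = get_proto_schema_and_class(clustering_keys=["id"])

    # Populate a row; every field is a string, matching the descriptor above
    row = TableRow()
    row._bizon_id = "00000000-0000-0000-0000-000000000000"
    row._bizon_extracted_at = "1700000000"
    row._bizon_loaded_at = "1700000000"
    row._source_record_id = "record-1"
    row._source_timestamp = "1700000000"
    row._source_data = '{"id": "record-1"}'
    row.id = "record-1"

    # Wrap the serialized row the same way load_to_bigquery_via_streaming does
    request = AppendRowsRequest(
        write_stream="projects/my-project/datasets/my_dataset/tables/my_table/_default",
        proto_rows=AppendRowsRequest.ProtoData(
            rows=ProtoRows(serialized_rows=[row.SerializeToString()]),
            writer_schema=proto_schema,
        ),
    )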

{bizon-0.0.9 → bizon-0.0.11}/bizon/destinations/config.py
@@ -6,6 +6,7 @@ from pydantic import BaseModel, ConfigDict, Field
 
  class DestinationTypes(str, Enum):
      BIGQUERY = "bigquery"
+     BIGQUERY_STREAMING = "bigquery_streaming"
      LOGGER = "logger"
      FILE = "file"
 

{bizon-0.0.9 → bizon-0.0.11}/bizon/destinations/destination.py
@@ -85,11 +85,16 @@ class AbstractDestination(ABC):
              pagination=self.buffer.pagination,
          )
 
+         logger.info(
+             f"Writing in destination from source iteration {self.buffer.from_iteration} to {self.buffer.to_iteration}"
+         )
+ 
          success, error_msg = self.write_records(destination_records=self.buffer.records)
 
          if success:
              # We wrote records to destination so we keep it
              destination_iteration.records_written = len(self.buffer.records)
+             logger.info(f"Successfully wrote {destination_iteration.records_written} records to destination")
 
          else:
              # We failed to write records to destination so we keep the error message
@@ -117,6 +122,11 @@ class AbstractDestination(ABC):
 
          # Last iteration, write all records to destination
          if last_iteration:
+ 
+             if len(self.buffer.records) == 0 and self.config.buffer_size == 0:
+                 logger.warning("No records to write to destination, already written, buffer is empty.")
+                 return DestinationBufferStatus.RECORDS_WRITTEN
+ 
              logger.debug("Writing last iteration records to destination")
              assert len(destination_records) == 0, "Last iteration should not have any records"
              destination_iteration = self.buffer_flush_handler(session=session)
@@ -147,9 +157,9 @@ class AbstractDestination(ABC):
              logger.warning("No records to write to destination. Check source and queue provider.")
              return DestinationBufferStatus.NO_RECORDS
 
-         # Write records to destination if buffer size is 0
+         # Write records to destination if buffer size is 0 or streaming
          if self.buffer.buffer_size == 0:
-             logger.info("Writing last iteration records to destination")
+             logger.info("Writing records to destination.")
              self.buffer.add_source_iteration_records_to_buffer(
                  iteration=iteration, records=destination_records, pagination=pagination
              )
@@ -160,7 +170,7 @@ class AbstractDestination(ABC):
          logger.debug(f"Buffer free space {self.buffer.buffer_free_space_pct}%")
          logger.debug(f"Buffer current size {self.buffer.current_size} bytes")
          logger.info(
-             f"Buffer ripeness {self.buffer.ripeness / 60} min. Max ripeness {self.buffer.buffer_flush_timeout / 60} min."  # noqa
+             f"Buffer ripeness {round(self.buffer.ripeness / 60, 2)} min. Max ripeness {round(self.buffer.buffer_flush_timeout / 60, 2)} min."  # noqa
          )
 
          # Write buffer to destination if buffer is ripe and create a new buffer for the new iteration
@@ -258,6 +268,11 @@ class DestinationFactory:
 
              return BigQueryDestination(sync_metadata=sync_metadata, config=config.config, backend=backend)
 
+         elif config.name == DestinationTypes.BIGQUERY_STREAMING:
+             from .bigquery_streaming.src.destination import BigQueryStreamingDestination
+ 
+             return BigQueryStreamingDestination(sync_metadata=sync_metadata, config=config.config, backend=backend)
+ 
          elif config.name == DestinationTypes.FILE:
              from .file.src.destination import FileDestination
 

{bizon-0.0.9 → bizon-0.0.11}/bizon/destinations/models.py
@@ -1,7 +1,9 @@
  import json
  from datetime import datetime
+ from typing import Type
  from uuid import uuid4
 
+ from google.protobuf.message import Message
  from pydantic import BaseModel, Field
  from pytz import UTC
 
@@ -81,3 +83,28 @@ class DestinationRecord(BaseModel):
              "_source_timestamp": self.source_timestamp,
              "_source_data": json.dumps(self.source_data),
          }
+ 
+     def to_protobuf_serialization(self, TableRowClass: Type[Message], debezium=False):
+ 
+         record = TableRowClass()
+         record._bizon_id = self.bizon_id
+         record._bizon_extracted_at = str(int(self.bizon_extracted_at.timestamp()))
+         record._bizon_loaded_at = str(int(self.bizon_loaded_at.timestamp()))
+         record._source_record_id = self.source_record_id
+         record._source_timestamp = str(int(self.source_timestamp.timestamp()))
+ 
+         if debezium:
+             parsed_debezium_keys = json.loads(self.source_data["_bizon_message_key"])
+             if parsed_debezium_keys:
+                 for _key in parsed_debezium_keys:
+                     setattr(record, _key, str(parsed_debezium_keys[_key]))
+             if self.source_data.get("op") == "d":
+                 source_data = {"__deleted": True, **self.source_data["before"]}
+             else:
+                 source_data = {"__deleted": False, **self.source_data["after"]}
+ 
+             record._source_data = json.dumps(source_data)
+         else:
+             record._source_data = json.dumps(self.source_data)
+ 
+         return record.SerializeToString()
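
This serialization is consumed together with the dynamic message class from proto_utils.py: the destination passes the generated TableRow class in and collects one serialized protobuf row per record. A rough sketch (here record is assumed to be a DestinationRecord already produced by the pipeline; the clustering key is a placeholder):

    from bizon.destinations.bigquery_streaming.src.proto_utils import get_proto_schema_and_class

    proto_schema, TableRow = get_proto_schema_and_class(clustering_keys=["id"])

    # One serialized row per record, ready to be wrapped in ProtoRows(serialized_rows=[...])
    serialized_row = record.to_protobuf_serialization(TableRow, debezium=False)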

{bizon-0.0.9 → bizon-0.0.11}/bizon/engine/pipeline/producer.py
@@ -104,6 +104,8 @@ class Producer:
 
          while not cursor.is_finished:
 
+             timestamp_start_iteration = datetime.now(tz=UTC)
+ 
              # Handle the case where last cursor already reach max_iterations
              terminate = self.handle_max_iterations(cursor)
              if terminate:
@@ -178,6 +180,15 @@ class Producer:
                  return_value = PipelineReturnStatus.SOURCE_ERROR
                  break
 
+             # Items in queue
+             items_in_queue = f"{self.queue.get_size()} items in queue." if self.queue.get_size() else ""
+ 
+             logger.info(
+                 (
+                     f"Iteration {cursor.iteration} finished in {datetime.now(tz=UTC) - timestamp_start_iteration}. {items_in_queue}"
+                 )
+             )
+ 
          logger.info("Terminating destination ...")
 
          try:

{bizon-0.0.9 → bizon-0.0.11}/bizon/engine/queue/adapters/kafka/queue.py
@@ -1,4 +1,5 @@
  import json
+ from typing import Union
 
  from kafka import KafkaProducer
  from loguru import logger
@@ -36,6 +37,9 @@ class KafkaQueue(AbstractQueue):
      def on_error(e):
          logger.error(f"Error sending message: {e}")
 
+     def get_size(self) -> Union[int, None]:
+         return None
+ 
      def put_queue_message(self, queue_message: QueueMessage):
          future = self.producer.send(
              topic=self.config.queue.topic,

{bizon-0.0.9 → bizon-0.0.11}/bizon/engine/queue/adapters/python_queue/queue.py
@@ -1,6 +1,7 @@
  import random
  import time
  from multiprocessing import Queue
+ from typing import Union
 
  from loguru import logger
 
@@ -52,6 +53,11 @@ class PythonQueue(AbstractQueue):
              time.sleep(random.random())
              return self.get()
 
+     def get_size(self) -> Union[int, None]:
+         if hasattr(self.queue, "qsize"):
+             return self.queue.qsize()
+         return None
+ 
      def terminate(self, iteration: int) -> bool:
          self.put(source_records=[], iteration=iteration, signal=QUEUE_TERMINATION)
          logger.info("Sent termination signal to destination.")

{bizon-0.0.9 → bizon-0.0.11}/bizon/engine/queue/adapters/rabbitmq/queue.py
@@ -1,3 +1,5 @@
+ from typing import Union
+ 
  import pika
  from loguru import logger
 
@@ -31,6 +33,9 @@ class RabbitMQ(AbstractQueue):
              body=queue_message.model_dump_json(),
          )
 
+     def get_size(self) -> Union[int, None]:
+         return None
+ 
      def get(self) -> QueueMessage:
          raise NotImplementedError(
              "RabbitMQ does not support getting messages from the queue, directly use callback in consumer."

{bizon-0.0.9 → bizon-0.0.11}/bizon/engine/queue/queue.py
@@ -1,6 +1,6 @@
  from abc import ABC, abstractmethod
  from datetime import datetime
- from typing import List, Optional
+ from typing import List, Optional, Union
 
  from pydantic import BaseModel
  from pytz import UTC
@@ -45,6 +45,11 @@ class AbstractQueue(ABC):
          """Get a QueueMessage object from the queue system"""
          pass
 
+     @abstractmethod
+     def get_size(self) -> Union[int, None]:
+         """If queue is compatible, return size of the queue"""
+         pass
+ 
      @abstractmethod
      def terminate(self, iteration: int) -> bool:
          """Send a termination signal in the queue system"""

{bizon-0.0.9 → bizon-0.0.11}/bizon/engine/runner/adapters/thread.py
@@ -1,5 +1,6 @@
  import concurrent.futures
  import time
+ import traceback
 
  from loguru import logger
 
@@ -75,5 +76,6 @@ class ThreadRunner(AbstractRunner):
              future_consumer.result()
          except Exception as e:
              logger.error(f"Consumer thread stopped running with error {e}")
+             logger.error(traceback.format_exc())
 
          return True

{bizon-0.0.9 → bizon-0.0.11}/bizon/source/discover.py
@@ -143,7 +143,7 @@ def parse_streams_from_filepath(source_name: str, filepath: str, skip_unavailabl
 
      # Transform the relative path to a python import path and import the module
      python_import_path = get_python_import_path(relative_path)
-     logger.info(f"Importing {python_import_path}")
+     logger.debug(f"Importing {python_import_path}")
 
      try:
          source_module = importlib.import_module(python_import_path, package="sources")

{bizon-0.0.9 → bizon-0.0.11}/bizon/sources/kafka/src/source.py
@@ -1,7 +1,7 @@
  import io
  import json
+ import logging
  import struct
- from concurrent.futures import ThreadPoolExecutor, as_completed
  from datetime import datetime, timezone
  from enum import Enum
  from functools import lru_cache
@@ -18,6 +18,9 @@ from bizon.source.config import SourceConfig
  from bizon.source.models import SourceIteration, SourceRecord
  from bizon.source.source import AbstractSource
 
+ silent_logger = logging.getLogger()
+ silent_logger.addHandler(logging.StreamHandler())
+ 
 
  class SchemaRegistryType(str, Enum):
      APICURIO = "apicurio"
@@ -98,7 +101,10 @@ class KafkaSource(AbstractSource):
          }
 
          # Consumer instance
-         self.consumer = Consumer(self.kafka_consumer_conf)
+         self.consumer = Consumer(self.kafka_consumer_conf, logger=silent_logger)
+ 
+         # Consumers for each worker thread
+         self.consumers_cached: Mapping[int, Consumer] = {}
 
      @staticmethod
      def streams() -> List[str]:
@@ -194,25 +200,17 @@ class KafkaSource(AbstractSource):
          global_id = self.parse_global_id_from_serialized_message(header_message)
          return self.get_parsed_avro_schema(global_id).to_json()
 
-     def read_partition(self, partition: int, topic_offsets: TopicOffsets) -> List[SourceRecord]:
+     def parse_encoded_messages(self, encoded_messages: list) -> List[SourceRecord]:
+ 
          records = []
-         encoded_messages = []
 
          # Set the source timestamp to now, otherwise it will be overwritten by the message timestamp
          source_timestamp = datetime.now(tz=timezone.utc)
 
-         # Set consumer offset params
-         consumer = Consumer(self.kafka_consumer_conf)
-         consumer.assign([TopicPartition(self.config.topic, partition, topic_offsets.get_partition_offset(partition))])
-         consumer.seek(TopicPartition(self.config.topic, partition, topic_offsets.get_partition_offset(partition)))
- 
-         # Read messages
-         encoded_messages.extend(consumer.consume(self.config.batch_size, timeout=self.config.consumer_timeout))
- 
          for message in encoded_messages:
              if not message.value():
                  logger.debug(
-                     f"Message for partition {partition} and offset {message.offset()} and topic {self.config.topic} is empty, skipping."
+                     f"Message for partition {message.partition()} and offset {message.offset()} and topic {self.config.topic} is empty, skipping."
                  )
                  continue
 
@@ -233,43 +231,44 @@ class KafkaSource(AbstractSource):
                      data[self.config.timestamp_ms_name] / 1000, tz=timezone.utc
                  )
 
+                 self.topic_offsets.set_partition_offset(message.partition(), message.offset() + 1)
+ 
                  records.append(
                      SourceRecord(
-                         id=f"part_{partition}_offset_{message.offset()}",
+                         id=f"part_{message.partition()}_offset_{message.offset()}",
                          timestamp=source_timestamp,
                          data=data,
                      )
                  )
+ 
              except Exception as e:
                  logger.error(
-                     f"Error while decoding message for partition {partition}: {e} at offset {message.offset()}"
+                     f"Error while decoding message for partition {message.partition()}: {e} at offset {message.offset()}"
                  )
                  continue
 
-         # Update the offset for the partition
-         if encoded_messages:
-             topic_offsets.set_partition_offset(partition, encoded_messages[-1].offset() + 1)
-         else:
-             logger.warning(f"No new messages found for partition {partition}")
- 
-         consumer.close()
- 
          return records
 
      def read_topic(self, pagination: dict = None) -> SourceIteration:
          nb_partitions = self.get_number_of_partitions()
 
          # Setup offset_pagination
-         topic_offsets = TopicOffsets.model_validate(pagination) if pagination else self.get_offset_partitions()
+         self.topic_offsets = TopicOffsets.model_validate(pagination) if pagination else self.get_offset_partitions()
 
-         # Use ThreadPoolExecutor to parallelize reading partitions
-         records = []
-         with ThreadPoolExecutor(max_workers=min(nb_partitions, self.config.max_consumer_threads)) as executor:
-             futures = {executor.submit(self.read_partition, i, topic_offsets): i for i in range(nb_partitions)}
-             for future in as_completed(futures):
-                 partition_records = future.result()
-                 records.extend(partition_records)
+         self.consumer.assign(
+             [
+                 TopicPartition(self.config.topic, partition, self.topic_offsets.get_partition_offset(partition))
+                 for partition in range(nb_partitions)
+             ]
+         )
 
+         t1 = datetime.now()
+         encoded_messages = self.consumer.consume(self.config.batch_size, timeout=self.config.consumer_timeout)
+         logger.info(f"Read Kafka: {len(encoded_messages)} messages in {datetime.now() - t1}")
+ 
+         records = self.parse_encoded_messages(encoded_messages)
+ 
+         # Update the offset for the partition
          if not records:
              logger.info("No new records found, stopping iteration")
              return SourceIteration(
@@ -278,7 +277,7 @@ class KafkaSource(AbstractSource):
              )
 
          return SourceIteration(
-             next_pagination=topic_offsets.model_dump(),
+             next_pagination=self.topic_offsets.model_dump(),
              records=records,
          )
 

{bizon-0.0.9 → bizon-0.0.11}/bizon/sources/kafka/tests/kafka_pipeline.py
@@ -4,6 +4,6 @@ from bizon.engine.engine import RunnerFactory
 
  if __name__ == "__main__":
      runner = RunnerFactory.create_from_yaml(
-         filepath=os.path.abspath("bizon/sources/kafka/config/kafka_teams_users_eu_west1_c511.yml")
+         filepath=os.path.abspath("bizon/sources/kafka/config/kafka_teams_users.yml")
      )
      runner.run()

{bizon-0.0.9 → bizon-0.0.11}/pyproject.toml
@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "bizon"
- version = "0.0.9"
+ version = "0.0.11"
  description = "Extract and load your data reliably from API Clients with native fault-tolerant and checkpointing mechanism."
  authors = ["Antoine Balliet <antoine.balliet@gmail.com>", "Anas El Mhamdi <anas.elmhamdi@gmail.com>"]
  readme = "README.md"
@@ -44,6 +44,7 @@ python-dotenv = "^1.0.1"
  gspread = { version = "^6.1.2", optional = true }
  click = "^8.1.7"
  pytz = "^2024.2"
+ protobuf = "4.24.0"
 
  [tool.poetry.extras]
  postgres = ["psycopg2-binary"]