bizon 0.0.10__tar.gz → 0.0.11__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {bizon-0.0.10 → bizon-0.0.11}/PKG-INFO +2 -1
- {bizon-0.0.10 → bizon-0.0.11}/bizon/common/models.py +2 -0
- bizon-0.0.11/bizon/destinations/bigquery_streaming/src/config.py +55 -0
- bizon-0.0.11/bizon/destinations/bigquery_streaming/src/destination.py +148 -0
- bizon-0.0.11/bizon/destinations/bigquery_streaming/src/proto_utils.py +91 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/destinations/config.py +1 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/destinations/destination.py +12 -2
- {bizon-0.0.10 → bizon-0.0.11}/bizon/destinations/models.py +27 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/engine/runner/adapters/thread.py +2 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/sources/kafka/src/source.py +31 -32
- {bizon-0.0.10 → bizon-0.0.11}/bizon/sources/kafka/tests/kafka_pipeline.py +1 -1
- {bizon-0.0.10 → bizon-0.0.11}/pyproject.toml +2 -1
- {bizon-0.0.10 → bizon-0.0.11}/LICENSE +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/README.md +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/__main__.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/cli/__init__.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/cli/main.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/cli/utils.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/common/errors/backoff.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/common/errors/errors.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/destinations/bigquery/config/bigquery.example.yml +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/destinations/bigquery/src/config.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/destinations/bigquery/src/destination.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/destinations/buffer.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/destinations/file/src/config.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/destinations/file/src/destination.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/destinations/logger/src/config.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/destinations/logger/src/destination.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/engine/backend/adapters/sqlalchemy/backend.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/engine/backend/adapters/sqlalchemy/config.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/engine/backend/backend.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/engine/backend/config.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/engine/backend/models.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/engine/config.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/engine/engine.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/engine/pipeline/consumer.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/engine/pipeline/models.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/engine/pipeline/producer.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/engine/queue/adapters/kafka/config.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/engine/queue/adapters/kafka/consumer.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/engine/queue/adapters/kafka/queue.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/engine/queue/adapters/python_queue/config.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/engine/queue/adapters/python_queue/consumer.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/engine/queue/adapters/python_queue/queue.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/engine/queue/adapters/rabbitmq/config.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/engine/queue/adapters/rabbitmq/consumer.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/engine/queue/adapters/rabbitmq/queue.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/engine/queue/config.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/engine/queue/queue.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/engine/runner/adapters/process.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/engine/runner/config.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/engine/runner/runner.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/source/auth/authenticators/abstract_oauth.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/source/auth/authenticators/abstract_token.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/source/auth/authenticators/basic.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/source/auth/authenticators/cookies.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/source/auth/authenticators/oauth.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/source/auth/authenticators/token.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/source/auth/builder.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/source/auth/config.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/source/config.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/source/cursor.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/source/discover.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/source/models.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/source/session.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/source/source.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/sources/dummy/config/api_key.example.yml +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/sources/dummy/config/api_key_kafka.example.yml +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/sources/dummy/src/fake_api.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/sources/dummy/src/source.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/sources/dummy/tests/dummy_pipeline.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/sources/dummy/tests/dummy_pipeline_bigquery_backend.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/sources/dummy/tests/dummy_pipeline_kafka.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/sources/dummy/tests/dummy_pipeline_rabbitmq.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/sources/dummy/tests/dummy_pipeline_write_data_bigquery.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/sources/dummy/tests/dummy_pipeline_write_data_bigquery_through_kafka.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/sources/gsheets/config/default_auth.example.yml +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/sources/gsheets/config/service_account.example.yml +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/sources/gsheets/src/source.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/sources/gsheets/tests/gsheets_pipeline.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/sources/hubspot/config/api_key.example.yml +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/sources/hubspot/config/oauth.example.yml +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/sources/hubspot/src/hubspot_base.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/sources/hubspot/src/hubspot_objects.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/sources/hubspot/src/models/hs_object.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/sources/hubspot/tests/hubspot_pipeline.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/sources/kafka/config/kafka.example.yml +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/sources/periscope/config/periscope_charts.example.yml +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/sources/periscope/config/periscope_dashboards.example.yml +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/sources/periscope/src/source.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/sources/periscope/tests/periscope_pipeline_charts.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/sources/periscope/tests/periscope_pipeline_dashboard.py +0 -0
- {bizon-0.0.10 → bizon-0.0.11}/bizon/utils.py +0 -0
{bizon-0.0.10 → bizon-0.0.11}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: bizon
-Version: 0.0.10
+Version: 0.0.11
 Summary: Extract and load your data reliably from API Clients with native fault-tolerant and checkpointing mechanism.
 Author: Antoine Balliet
 Author-email: antoine.balliet@gmail.com
@@ -31,6 +31,7 @@ Requires-Dist: loguru (>=0.7.2,<0.8.0)
 Requires-Dist: pandas (>=2.2.2,<3.0.0) ; extra == "bigquery"
 Requires-Dist: pendulum (>=3.0.0,<4.0.0)
 Requires-Dist: pika (>=1.3.2,<2.0.0) ; extra == "rabbitmq"
+Requires-Dist: protobuf (==4.24.0)
 Requires-Dist: psycopg2-binary (>=2.9.9,<3.0.0) ; extra == "postgres"
 Requires-Dist: pyarrow (>=16.1.0,<17.0.0)
 Requires-Dist: pydantic (>=2.8.2,<3.0.0)
{bizon-0.0.10 → bizon-0.0.11}/bizon/common/models.py

@@ -3,6 +3,7 @@ from typing import Union
 from pydantic import BaseModel, ConfigDict, Field
 
 from bizon.destinations.bigquery.src.config import BigQueryConfig
+from bizon.destinations.bigquery_streaming.src.config import BigQueryStreamingConfig
 from bizon.destinations.file.src.config import FileDestinationConfig
 from bizon.destinations.logger.src.config import LoggerConfig
 from bizon.engine.config import EngineConfig
@@ -24,6 +25,7 @@ class BizonConfig(BaseModel):
 
     destination: Union[
         BigQueryConfig,
+        BigQueryStreamingConfig,
         LoggerConfig,
         FileDestinationConfig,
     ] = Field(
bizon-0.0.11/bizon/destinations/bigquery_streaming/src/config.py (new file)

@@ -0,0 +1,55 @@
+from enum import Enum
+from typing import Literal, Optional
+
+from pydantic import BaseModel, Field, field_validator
+
+from bizon.destinations.config import (
+    AbstractDestinationConfig,
+    AbstractDestinationDetailsConfig,
+    DestinationTypes,
+)
+
+
+class GCSBufferFormat(str, Enum):
+    PARQUET = "parquet"
+    CSV = "csv"
+
+
+class TimePartitioning(str, Enum):
+    DAY = "DAY"
+    HOUR = "HOUR"
+    MONTH = "MONTH"
+    YEAR = "YEAR"
+
+
+class BigQueryAuthentication(BaseModel):
+    service_account_key: str = Field(
+        description="Service Account Key JSON string. If empty it will be infered",
+        default="",
+    )
+
+
+class BigQueryConfigDetails(AbstractDestinationDetailsConfig):
+    project_id: str
+    dataset_id: str
+    dataset_location: Optional[str] = "US"
+    table_id: Optional[str] = Field(
+        default=None, description="Table ID, if not provided it will be inferred from source name"
+    )
+    time_partitioning: Optional[TimePartitioning] = Field(
+        default=TimePartitioning.DAY, description="BigQuery Time partitioning type"
+    )
+    authentication: Optional[BigQueryAuthentication] = None
+
+    buffer_size: int = Field(default=0, description="Buffer size in MB")
+
+    @field_validator("buffer_size", mode="after")
+    def validate_buffer_size(cls, value: int) -> int:
+        if value != 0:
+            raise ValueError("Buffer size must be 0, we directly stream to BigQuery")
+        return value
+
+
+class BigQueryStreamingConfig(AbstractDestinationConfig):
+    name: Literal[DestinationTypes.BIGQUERY_STREAMING]
+    config: BigQueryConfigDetails
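The buffer_size validator above is the main behavioural difference from the batch BigQuery destination: the streaming destination refuses any buffering. A minimal, standalone sketch of that rule (not part of the package; the class name is invented for illustration and only reproduces the validator shown above):

from pydantic import BaseModel, Field, ValidationError, field_validator


class StreamingBufferSketch(BaseModel):
    buffer_size: int = Field(default=0, description="Buffer size in MB")

    @field_validator("buffer_size", mode="after")
    def validate_buffer_size(cls, value: int) -> int:
        if value != 0:
            raise ValueError("Buffer size must be 0, we directly stream to BigQuery")
        return value


StreamingBufferSketch()                   # accepted: defaults to 0
try:
    StreamingBufferSketch(buffer_size=50) # rejected: streaming does not buffer
except ValidationError as exc:
    print(exc)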
bizon-0.0.11/bizon/destinations/bigquery_streaming/src/destination.py (new file)

@@ -0,0 +1,148 @@
+import json
+import os
+import tempfile
+from typing import List, Tuple
+
+from google.api_core.exceptions import NotFound
+from google.cloud import bigquery, bigquery_storage_v1, storage
+from google.cloud.bigquery import DatasetReference, TimePartitioning
+from google.cloud.bigquery_storage_v1.types import AppendRowsRequest, ProtoRows
+from loguru import logger
+
+from bizon.common.models import SyncMetadata
+from bizon.destinations.config import NormalizationType
+from bizon.destinations.destination import AbstractDestination
+from bizon.destinations.models import DestinationRecord
+from bizon.engine.backend.backend import AbstractBackend
+
+from .config import BigQueryConfigDetails
+from .proto_utils import get_proto_schema_and_class
+
+
+class BigQueryStreamingDestination(AbstractDestination):
+
+    def __init__(self, sync_metadata: SyncMetadata, config: BigQueryConfigDetails, backend: AbstractBackend):
+        super().__init__(sync_metadata, config, backend)
+        self.config: BigQueryConfigDetails = config
+
+        if config.authentication and config.authentication.service_account_key:
+            with tempfile.NamedTemporaryFile(delete=False) as temp:
+                temp.write(config.authentication.service_account_key.encode())
+                temp_file_path = temp.name
+            os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = temp_file_path
+
+        self.project_id = config.project_id
+        self.bq_client = bigquery.Client(project=self.project_id)
+        self.bq_storage_client = bigquery_storage_v1.BigQueryWriteClient()
+        self.gcs_client = storage.Client(project=self.project_id)
+        self.dataset_id = config.dataset_id
+        self.dataset_location = config.dataset_location
+
+    @property
+    def table_id(self) -> str:
+        tabled_id = self.config.table_id or f"{self.sync_metadata.source_name}_{self.sync_metadata.stream_name}"
+        return f"{self.project_id}.{self.dataset_id}.{tabled_id}"
+
+    def get_bigquery_schema(self, destination_records: List[DestinationRecord]) -> List[bigquery.SchemaField]:
+
+        # we keep raw data in the column source_data
+        if self.config.normalization.type == NormalizationType.NONE:
+            return [
+                bigquery.SchemaField("_source_record_id", "STRING", mode="REQUIRED"),
+                bigquery.SchemaField("_source_timestamp", "TIMESTAMP", mode="REQUIRED"),
+                bigquery.SchemaField("_source_data", "STRING", mode="NULLABLE"),
+                bigquery.SchemaField("_bizon_extracted_at", "TIMESTAMP", mode="REQUIRED"),
+                bigquery.SchemaField(
+                    "_bizon_loaded_at", "TIMESTAMP", mode="REQUIRED", default_value_expression="CURRENT_TIMESTAMP()"
+                ),
+                bigquery.SchemaField("_bizon_id", "STRING", mode="REQUIRED"),
+            ]
+
+        elif self.config.normalization.type == NormalizationType.DEBEZIUM:
+            assert (
+                "_bizon_message_key" in destination_records[0].source_data
+            ), "Debezium records must have a '_bizon_message_key' key"
+            message_keys = json.loads(destination_records[0].source_data["_bizon_message_key"])
+            return [bigquery.SchemaField(key, "STRING", mode="NULLABLE") for key in message_keys] + [
+                bigquery.SchemaField("_source_data", "STRING", mode="NULLABLE"),
+                bigquery.SchemaField("_source_record_id", "STRING", mode="REQUIRED"),
+                bigquery.SchemaField("_source_timestamp", "TIMESTAMP", mode="REQUIRED"),
+                bigquery.SchemaField("_bizon_extracted_at", "TIMESTAMP", mode="REQUIRED"),
+                bigquery.SchemaField(
+                    "_bizon_loaded_at", "TIMESTAMP", mode="REQUIRED", default_value_expression="CURRENT_TIMESTAMP()"
+                ),
+                bigquery.SchemaField("_bizon_id", "STRING", mode="REQUIRED"),
+            ]
+
+        # If normalization is tabular, we parse key / value pairs to columns
+        elif self.config.normalization.type == NormalizationType.TABULAR:
+            first_record_keys = destination_records[0].source_data.keys()
+            return [bigquery.SchemaField(key, "STRING", mode="NULLABLE") for key in first_record_keys] + [
+                bigquery.SchemaField("_source_record_id", "STRING", mode="REQUIRED"),
+                bigquery.SchemaField("_source_timestamp", "TIMESTAMP", mode="REQUIRED"),
+                bigquery.SchemaField("_bizon_extracted_at", "TIMESTAMP", mode="REQUIRED"),
+                bigquery.SchemaField(
+                    "_bizon_loaded_at", "TIMESTAMP", mode="REQUIRED", default_value_expression="CURRENT_TIMESTAMP()"
+                ),
+                bigquery.SchemaField("_bizon_id", "STRING", mode="REQUIRED"),
+            ]
+
+        raise NotImplementedError(f"Normalization type {self.config.normalization.type} is not supported")
+
+    def check_connection(self) -> bool:
+        dataset_ref = DatasetReference(self.project_id, self.dataset_id)
+
+        try:
+            self.bq_client.get_dataset(dataset_ref)
+        except NotFound:
+            dataset = bigquery.Dataset(dataset_ref)
+            dataset.location = self.dataset_location
+            dataset = self.bq_client.create_dataset(dataset)
+        return True
+
+    def load_to_bigquery_via_streaming(self, destination_records: List[DestinationRecord]) -> str:
+        clustering_keys = []
+
+        if self.config.normalization.type == NormalizationType.DEBEZIUM:
+            clustering_keys = list(json.loads(destination_records[0].source_data["_bizon_message_key"]).keys())
+
+        # Create table if it doesnt exist
+        schema = self.get_bigquery_schema(destination_records=destination_records)
+        table = bigquery.Table(self.table_id, schema=schema)
+        time_partitioning = TimePartitioning(field="_bizon_loaded_at", type_=self.config.time_partitioning)
+        table.time_partitioning = time_partitioning
+
+        if clustering_keys:
+            table.clustering_fields = clustering_keys
+
+        table = self.bq_client.create_table(table, exists_ok=True)
+
+        # Create the stream
+        write_client = self.bq_storage_client
+        tabled_id = self.config.table_id or f"{self.sync_metadata.source_name}_{self.sync_metadata.stream_name}"
+        parent = write_client.table_path(self.project_id, self.dataset_id, tabled_id)
+        stream_name = f"{parent}/_default"
+
+        # Generating the protocol buffer representation of the message descriptor.
+        proto_schema, TableRow = get_proto_schema_and_class(clustering_keys)
+
+        serialized_rows = [
+            record.to_protobuf_serialization(
+                TableRow, debezium=self.config.normalization.type == NormalizationType.DEBEZIUM
+            )
+            for record in destination_records
+        ]
+
+        request = AppendRowsRequest(
+            write_stream=stream_name,
+            proto_rows=AppendRowsRequest.ProtoData(
+                rows=ProtoRows(serialized_rows=serialized_rows),
+                writer_schema=proto_schema,
+            ),
+        )
+        response = write_client.append_rows(iter([request]))
+        assert response.code().name == "OK"
+
+    def write_records(self, destination_records: List[DestinationRecord]) -> Tuple[bool, str]:
+        self.load_to_bigquery_via_streaming(destination_records=destination_records)
+        return True, ""
bizon-0.0.11/bizon/destinations/bigquery_streaming/src/proto_utils.py (new file)

@@ -0,0 +1,91 @@
+from typing import List, Tuple, Type
+
+from google.cloud.bigquery_storage_v1.types import ProtoSchema
+from google.protobuf.descriptor_pb2 import (
+    DescriptorProto,
+    FieldDescriptorProto,
+    FileDescriptorProto,
+)
+from google.protobuf.descriptor_pool import DescriptorPool
+from google.protobuf.message import Message
+from google.protobuf.message_factory import GetMessageClassesForFiles
+
+
+def get_proto_schema_and_class(clustering_keys: List[str] = None) -> Tuple[ProtoSchema, Type[Message]]:
+    # Define the FileDescriptorProto
+    file_descriptor_proto = FileDescriptorProto()
+    file_descriptor_proto.name = "dynamic.proto"
+    file_descriptor_proto.package = "dynamic_package"
+
+    # Define the TableRow message schema
+    message_descriptor = DescriptorProto()
+    message_descriptor.name = "TableRow"
+
+    # Add fields to the message, only use TYPE_STRING, BigQuery does not support other types
+    # It does not imapact data types in final table
+
+    # https://stackoverflow.com/questions/70489919/protobuf-type-for-bigquery-timestamp-field
+    fields = [
+        {"name": "_bizon_id", "type": FieldDescriptorProto.TYPE_STRING, "label": FieldDescriptorProto.LABEL_REQUIRED},
+        {
+            "name": "_bizon_extracted_at",
+            "type": FieldDescriptorProto.TYPE_STRING,
+            "label": FieldDescriptorProto.LABEL_REQUIRED,
+        },
+        {
+            "name": "_bizon_loaded_at",
+            "type": FieldDescriptorProto.TYPE_STRING,
+            "label": FieldDescriptorProto.LABEL_REQUIRED,
+        },
+        {
+            "name": "_source_record_id",
+            "type": FieldDescriptorProto.TYPE_STRING,
+            "label": FieldDescriptorProto.LABEL_REQUIRED,
+        },
+        {
+            "name": "_source_timestamp",
+            "type": FieldDescriptorProto.TYPE_STRING,
+            "label": FieldDescriptorProto.LABEL_REQUIRED,
+        },
+        {
+            "name": "_source_data",
+            "type": FieldDescriptorProto.TYPE_STRING,
+            "label": FieldDescriptorProto.LABEL_OPTIONAL,
+        },
+    ]
+
+    if clustering_keys:
+        for key in clustering_keys:
+            fields.append(
+                {
+                    "name": key,
+                    "type": FieldDescriptorProto.TYPE_STRING,
+                    "label": FieldDescriptorProto.LABEL_OPTIONAL,
+                }
+            )
+
+    for i, field in enumerate(fields, start=1):
+        field_descriptor = message_descriptor.field.add()
+        field_descriptor.name = field["name"]
+        field_descriptor.number = i
+        field_descriptor.type = field["type"]
+        field_descriptor.label = field["label"]
+
+    # Add the message to the file descriptor
+    file_descriptor_proto.message_type.add().CopyFrom(message_descriptor)
+
+    # Create a DescriptorPool and register the FileDescriptorProto
+    pool = DescriptorPool()
+    pool.Add(file_descriptor_proto)
+
+    # Use the registered file name to fetch the message classes
+    message_classes = GetMessageClassesForFiles(["dynamic.proto"], pool=pool)
+
+    # Fetch the TableRow class
+    table_row_class = message_classes["dynamic_package.TableRow"]
+
+    # Create the ProtoSchema
+    proto_schema = ProtoSchema()
+    proto_schema.proto_descriptor.CopyFrom(message_descriptor)
+
+    return proto_schema, table_row_class
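A minimal usage sketch (not part of the package) of the dynamic message class returned above, mirroring how destination.py and DestinationRecord.to_protobuf_serialization use it; the concrete field values and the write-stream path are invented for illustration:

from google.cloud.bigquery_storage_v1.types import AppendRowsRequest, ProtoRows

from bizon.destinations.bigquery_streaming.src.proto_utils import get_proto_schema_and_class

# Build the writer schema and the dynamic TableRow class, with one extra clustering column "id"
proto_schema, TableRow = get_proto_schema_and_class(clustering_keys=["id"])

row = TableRow()
row._bizon_id = "some-uuid"
row._bizon_extracted_at = "1714000000"   # epoch seconds as string, as in to_protobuf_serialization
row._bizon_loaded_at = "1714000001"
row._source_record_id = "part_0_offset_42"
row._source_timestamp = "1714000000"
row._source_data = '{"id": 1}'
row.id = "1"                             # dynamically added clustering-key column

# SerializeToString() yields the bytes that go into ProtoRows.serialized_rows
request = AppendRowsRequest(
    write_stream="projects/p/datasets/d/tables/t/_default",  # hypothetical path
    proto_rows=AppendRowsRequest.ProtoData(
        rows=ProtoRows(serialized_rows=[row.SerializeToString()]),
        writer_schema=proto_schema,
    ),
)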
{bizon-0.0.10 → bizon-0.0.11}/bizon/destinations/destination.py

@@ -122,6 +122,11 @@ class AbstractDestination(ABC):
 
         # Last iteration, write all records to destination
         if last_iteration:
+
+            if len(self.buffer.records) == 0 and self.config.buffer_size == 0:
+                logger.warning("No records to write to destination, already written, buffer is empty.")
+                return DestinationBufferStatus.RECORDS_WRITTEN
+
             logger.debug("Writing last iteration records to destination")
             assert len(destination_records) == 0, "Last iteration should not have any records"
             destination_iteration = self.buffer_flush_handler(session=session)
@@ -152,9 +157,9 @@ class AbstractDestination(ABC):
             logger.warning("No records to write to destination. Check source and queue provider.")
             return DestinationBufferStatus.NO_RECORDS
 
-        # Write records to destination if buffer size is 0
+        # Write records to destination if buffer size is 0 or streaming
         if self.buffer.buffer_size == 0:
-            logger.info("Writing
+            logger.info("Writing records to destination.")
             self.buffer.add_source_iteration_records_to_buffer(
                 iteration=iteration, records=destination_records, pagination=pagination
             )
@@ -263,6 +268,11 @@ class DestinationFactory:
 
             return BigQueryDestination(sync_metadata=sync_metadata, config=config.config, backend=backend)
 
+        elif config.name == DestinationTypes.BIGQUERY_STREAMING:
+            from .bigquery_streaming.src.destination import BigQueryStreamingDestination
+
+            return BigQueryStreamingDestination(sync_metadata=sync_metadata, config=config.config, backend=backend)
+
         elif config.name == DestinationTypes.FILE:
            from .file.src.destination import FileDestination
 
{bizon-0.0.10 → bizon-0.0.11}/bizon/destinations/models.py

@@ -1,7 +1,9 @@
 import json
 from datetime import datetime
+from typing import Type
 from uuid import uuid4
 
+from google.protobuf.message import Message
 from pydantic import BaseModel, Field
 from pytz import UTC
 
@@ -81,3 +83,28 @@ class DestinationRecord(BaseModel):
             "_source_timestamp": self.source_timestamp,
             "_source_data": json.dumps(self.source_data),
         }
+
+    def to_protobuf_serialization(self, TableRowClass: Type[Message], debezium=False):
+
+        record = TableRowClass()
+        record._bizon_id = self.bizon_id
+        record._bizon_extracted_at = str(int(self.bizon_extracted_at.timestamp()))
+        record._bizon_loaded_at = str(int(self.bizon_loaded_at.timestamp()))
+        record._source_record_id = self.source_record_id
+        record._source_timestamp = str(int(self.source_timestamp.timestamp()))
+
+        if debezium:
+            parsed_debezium_keys = json.loads(self.source_data["_bizon_message_key"])
+            if parsed_debezium_keys:
+                for _key in parsed_debezium_keys:
+                    setattr(record, _key, str(parsed_debezium_keys[_key]))
+            if self.source_data.get("op") == "d":
+                source_data = {"__deleted": True, **self.source_data["before"]}
+            else:
+                source_data = {"__deleted": False, **self.source_data["after"]}
+
+            record._source_data = json.dumps(source_data)
+        else:
+            record._source_data = json.dumps(self.source_data)
+
+        return record.SerializeToString()
{bizon-0.0.10 → bizon-0.0.11}/bizon/engine/runner/adapters/thread.py

@@ -1,5 +1,6 @@
 import concurrent.futures
 import time
+import traceback
 
 from loguru import logger
 
@@ -75,5 +76,6 @@ class ThreadRunner(AbstractRunner):
             future_consumer.result()
         except Exception as e:
             logger.error(f"Consumer thread stopped running with error {e}")
+            logger.error(traceback.format_exc())
 
         return True
{bizon-0.0.10 → bizon-0.0.11}/bizon/sources/kafka/src/source.py

@@ -1,7 +1,7 @@
 import io
 import json
+import logging
 import struct
-from concurrent.futures import ThreadPoolExecutor, as_completed
 from datetime import datetime, timezone
 from enum import Enum
 from functools import lru_cache
@@ -18,6 +18,9 @@ from bizon.source.config import SourceConfig
 from bizon.source.models import SourceIteration, SourceRecord
 from bizon.source.source import AbstractSource
 
+silent_logger = logging.getLogger()
+silent_logger.addHandler(logging.StreamHandler())
+
 
 class SchemaRegistryType(str, Enum):
     APICURIO = "apicurio"
@@ -98,7 +101,10 @@ class KafkaSource(AbstractSource):
         }
 
         # Consumer instance
-        self.consumer = Consumer(self.kafka_consumer_conf, logger=
+        self.consumer = Consumer(self.kafka_consumer_conf, logger=silent_logger)
+
+        # Consumers for each worker thread
+        self.consumers_cached: Mapping[int, Consumer] = {}
 
     @staticmethod
     def streams() -> List[str]:
@@ -194,25 +200,17 @@ class KafkaSource(AbstractSource):
         global_id = self.parse_global_id_from_serialized_message(header_message)
         return self.get_parsed_avro_schema(global_id).to_json()
 
-    def
+    def parse_encoded_messages(self, encoded_messages: list) -> List[SourceRecord]:
+
         records = []
-        encoded_messages = []
 
         # Set the source timestamp to now, otherwise it will be overwritten by the message timestamp
         source_timestamp = datetime.now(tz=timezone.utc)
 
-        # Set consumer offset params
-        consumer = Consumer(self.kafka_consumer_conf, logger=logger)
-        consumer.assign([TopicPartition(self.config.topic, partition, topic_offsets.get_partition_offset(partition))])
-        consumer.seek(TopicPartition(self.config.topic, partition, topic_offsets.get_partition_offset(partition)))
-
-        # Read messages
-        encoded_messages.extend(consumer.consume(self.config.batch_size, timeout=self.config.consumer_timeout))
-
         for message in encoded_messages:
             if not message.value():
                 logger.debug(
-                    f"Message for partition {partition} and offset {message.offset()} and topic {self.config.topic} is empty, skipping."
+                    f"Message for partition {message.partition()} and offset {message.offset()} and topic {self.config.topic} is empty, skipping."
                 )
                 continue
 
@@ -233,43 +231,44 @@ class KafkaSource(AbstractSource):
                     data[self.config.timestamp_ms_name] / 1000, tz=timezone.utc
                 )
 
+                self.topic_offsets.set_partition_offset(message.partition(), message.offset() + 1)
+
                 records.append(
                     SourceRecord(
-                        id=f"part_{partition}_offset_{message.offset()}",
+                        id=f"part_{message.partition()}_offset_{message.offset()}",
                         timestamp=source_timestamp,
                         data=data,
                     )
                 )
+
             except Exception as e:
                 logger.error(
-                    f"Error while decoding message for partition {partition}: {e} at offset {message.offset()}"
+                    f"Error while decoding message for partition {message.partition()}: {e} at offset {message.offset()}"
                 )
                 continue
 
-        # Update the offset for the partition
-        if encoded_messages:
-            topic_offsets.set_partition_offset(partition, encoded_messages[-1].offset() + 1)
-        else:
-            logger.warning(f"No new messages found for partition {partition}")
-
-        consumer.close()
-
         return records
 
     def read_topic(self, pagination: dict = None) -> SourceIteration:
         nb_partitions = self.get_number_of_partitions()
 
         # Setup offset_pagination
-        topic_offsets = TopicOffsets.model_validate(pagination) if pagination else self.get_offset_partitions()
+        self.topic_offsets = TopicOffsets.model_validate(pagination) if pagination else self.get_offset_partitions()
 
-
-
-
-
-
-
-        records.extend(partition_records)
+        self.consumer.assign(
+            [
+                TopicPartition(self.config.topic, partition, self.topic_offsets.get_partition_offset(partition))
+                for partition in range(nb_partitions)
+            ]
+        )
 
+        t1 = datetime.now()
+        encoded_messages = self.consumer.consume(self.config.batch_size, timeout=self.config.consumer_timeout)
+        logger.info(f"Read Kafka: {len(encoded_messages)} messages in {datetime.now() - t1}")
+
+        records = self.parse_encoded_messages(encoded_messages)
+
+        # Update the offset for the partition
         if not records:
             logger.info("No new records found, stopping iteration")
             return SourceIteration(
@@ -278,7 +277,7 @@ class KafkaSource(AbstractSource):
             )
 
         return SourceIteration(
-            next_pagination=topic_offsets.model_dump(),
+            next_pagination=self.topic_offsets.model_dump(),
             records=records,
         )
 
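A minimal, standalone sketch (not from the package) of the consumer pattern the refactor above moves to, assuming confluent_kafka as the Kafka client; the broker address, topic, partition count, and batch size are invented for illustration. One long-lived Consumer is assigned to every partition at the stored offsets, a single batched consume() replaces the per-partition worker consumers, and the next offset to resume from is tracked per message:

from confluent_kafka import Consumer, TopicPartition

conf = {"bootstrap.servers": "localhost:9092", "group.id": "bizon-sketch", "enable.auto.commit": False}
consumer = Consumer(conf)

topic = "my_topic"        # hypothetical
nb_partitions = 3         # hypothetical
offsets = {p: 0 for p in range(nb_partitions)}  # stands in for the TopicOffsets pagination

# Assign all partitions at their stored offsets, then read one batch across all of them
consumer.assign([TopicPartition(topic, p, offsets[p]) for p in range(nb_partitions)])
messages = consumer.consume(500, timeout=10)

for message in messages:
    if not message.value():
        continue                                        # skip empty messages, as the source does
    offsets[message.partition()] = message.offset() + 1  # next offset to resume from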
{bizon-0.0.10 → bizon-0.0.11}/bizon/sources/kafka/tests/kafka_pipeline.py

@@ -4,6 +4,6 @@ from bizon.engine.engine import RunnerFactory
 
 if __name__ == "__main__":
     runner = RunnerFactory.create_from_yaml(
-        filepath=os.path.abspath("bizon/sources/kafka/config/
+        filepath=os.path.abspath("bizon/sources/kafka/config/kafka_teams_users.yml")
     )
     runner.run()
{bizon-0.0.10 → bizon-0.0.11}/pyproject.toml

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "bizon"
-version = "0.0.10"
+version = "0.0.11"
 description = "Extract and load your data reliably from API Clients with native fault-tolerant and checkpointing mechanism."
 authors = ["Antoine Balliet <antoine.balliet@gmail.com>", "Anas El Mhamdi <anas.elmhamdi@gmail.com>"]
 readme = "README.md"
@@ -44,6 +44,7 @@ python-dotenv = "^1.0.1"
 gspread = { version = "^6.1.2", optional = true }
 click = "^8.1.7"
 pytz = "^2024.2"
+protobuf = "4.24.0"
 
 [tool.poetry.extras]
 postgres = ["psycopg2-binary"]