bizon 0.0.14__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bizon/alerting/__init__.py +0 -0
- bizon/alerting/alerts.py +23 -0
- bizon/alerting/models.py +28 -0
- bizon/alerting/slack/__init__.py +0 -0
- bizon/alerting/slack/config.py +5 -0
- bizon/alerting/slack/handler.py +39 -0
- bizon/cli/main.py +7 -3
- bizon/common/models.py +31 -7
- bizon/{destinations → connectors/destinations}/bigquery/config/bigquery.example.yml +3 -4
- bizon/connectors/destinations/bigquery/src/config.py +127 -0
- bizon/{destinations → connectors/destinations}/bigquery/src/destination.py +46 -25
- bizon/connectors/destinations/bigquery_streaming/src/config.py +56 -0
- bizon/connectors/destinations/bigquery_streaming/src/destination.py +372 -0
- bizon/connectors/destinations/bigquery_streaming_v2/src/config.py +52 -0
- bizon/connectors/destinations/bigquery_streaming_v2/src/destination.py +261 -0
- bizon/{destinations/bigquery_streaming → connectors/destinations/bigquery_streaming_v2}/src/proto_utils.py +32 -26
- bizon/{destinations → connectors/destinations}/file/src/config.py +8 -3
- bizon/connectors/destinations/file/src/destination.py +54 -0
- bizon/{destinations → connectors/destinations}/logger/src/config.py +1 -1
- bizon/{destinations → connectors/destinations}/logger/src/destination.py +15 -3
- bizon/connectors/sources/cycle/config/cycle.example.yml +15 -0
- bizon/connectors/sources/cycle/src/source.py +133 -0
- bizon/{sources/periscope/tests/periscope_pipeline_dashboard.py → connectors/sources/cycle/tests/cycle_customers.py} +1 -1
- bizon/connectors/sources/dummy/config/dummy.example.yml +22 -0
- bizon/{sources → connectors/sources}/dummy/src/fake_api.py +6 -1
- bizon/{sources → connectors/sources}/dummy/src/source.py +18 -5
- bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline.py +5 -14
- bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_bigquery_backend.py +2 -2
- bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_kafka.py +2 -2
- bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_rabbitmq.py +2 -2
- bizon/connectors/sources/dummy/tests/dummy_pipeline_unnest.py +29 -0
- bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_write_data_bigquery.py +3 -3
- bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_write_data_bigquery_through_kafka.py +2 -2
- bizon/{sources → connectors/sources}/gsheets/config/default_auth.example.yml +4 -2
- bizon/{sources → connectors/sources}/gsheets/config/service_account.example.yml +4 -2
- bizon/{sources → connectors/sources}/hubspot/config/api_key.example.yml +4 -2
- bizon/{sources → connectors/sources}/hubspot/config/oauth.example.yml +4 -2
- bizon/{sources → connectors/sources}/hubspot/src/hubspot_objects.py +1 -1
- bizon/connectors/sources/kafka/config/kafka.example.yml +50 -0
- bizon/connectors/sources/kafka/config/kafka_debezium.example.yml +112 -0
- bizon/connectors/sources/kafka/src/callback.py +18 -0
- bizon/connectors/sources/kafka/src/config.py +75 -0
- bizon/connectors/sources/kafka/src/decode.py +88 -0
- bizon/connectors/sources/kafka/src/source.py +361 -0
- bizon/connectors/sources/kafka/tests/kafka_pipeline.py +7 -0
- bizon/connectors/sources/periscope/config/periscope_charts.example.yml +20 -0
- bizon/connectors/sources/periscope/config/periscope_dashboards.example.yml +20 -0
- bizon/{sources → connectors/sources}/periscope/src/source.py +136 -13
- bizon/connectors/sources/periscope/tests/periscope_pipeline_dashboard.py +9 -0
- bizon/connectors/sources/pokeapi/config/pokeapi_pokemon_to_json.example.yml +19 -0
- bizon/connectors/sources/pokeapi/config/pokeapi_pokemon_to_logger.example.yml +10 -0
- bizon/connectors/sources/pokeapi/src/source.py +79 -0
- bizon/{destinations → destination}/buffer.py +5 -0
- bizon/destination/config.py +74 -0
- bizon/{destinations → destination}/destination.py +71 -15
- bizon/engine/backend/adapters/sqlalchemy/backend.py +14 -23
- bizon/engine/engine.py +20 -1
- bizon/engine/pipeline/consumer.py +73 -5
- bizon/engine/pipeline/models.py +8 -3
- bizon/engine/pipeline/producer.py +18 -9
- bizon/engine/queue/adapters/kafka/consumer.py +2 -2
- bizon/engine/queue/adapters/kafka/queue.py +3 -2
- bizon/engine/queue/adapters/python_queue/consumer.py +40 -23
- bizon/engine/queue/adapters/python_queue/queue.py +19 -9
- bizon/engine/queue/adapters/rabbitmq/consumer.py +3 -6
- bizon/engine/queue/adapters/rabbitmq/queue.py +3 -2
- bizon/engine/queue/config.py +16 -0
- bizon/engine/queue/queue.py +17 -16
- bizon/engine/runner/adapters/process.py +15 -2
- bizon/engine/runner/adapters/streaming.py +103 -0
- bizon/engine/runner/adapters/thread.py +32 -9
- bizon/engine/runner/config.py +28 -0
- bizon/engine/runner/runner.py +107 -25
- bizon/monitoring/__init__.py +0 -0
- bizon/monitoring/config.py +29 -0
- bizon/monitoring/datadog/__init__.py +0 -0
- bizon/monitoring/datadog/monitor.py +69 -0
- bizon/monitoring/monitor.py +42 -0
- bizon/monitoring/noop/__init__.py +0 -0
- bizon/monitoring/noop/monitor.py +11 -0
- bizon/source/callback.py +24 -0
- bizon/source/config.py +3 -3
- bizon/source/cursor.py +1 -1
- bizon/source/discover.py +4 -3
- bizon/source/models.py +4 -2
- bizon/source/source.py +10 -2
- bizon/transform/config.py +8 -0
- bizon/transform/transform.py +48 -0
- bizon-0.1.1.dist-info/LICENSE +674 -0
- {bizon-0.0.14.dist-info → bizon-0.1.1.dist-info}/METADATA +25 -7
- bizon-0.1.1.dist-info/RECORD +123 -0
- {bizon-0.0.14.dist-info → bizon-0.1.1.dist-info}/WHEEL +1 -1
- bizon/destinations/bigquery/src/config.py +0 -51
- bizon/destinations/bigquery_streaming/src/config.py +0 -43
- bizon/destinations/bigquery_streaming/src/destination.py +0 -154
- bizon/destinations/config.py +0 -47
- bizon/destinations/file/src/destination.py +0 -27
- bizon/sources/dummy/config/api_key.example.yml +0 -20
- bizon/sources/dummy/config/api_key_kafka.example.yml +0 -27
- bizon/sources/kafka/config/kafka.example.yml +0 -38
- bizon/sources/kafka/src/source.py +0 -357
- bizon/sources/kafka/tests/kafka_pipeline.py +0 -9
- bizon/sources/periscope/config/periscope_charts.example.yml +0 -26
- bizon/sources/periscope/config/periscope_dashboards.example.yml +0 -26
- bizon-0.0.14.dist-info/LICENSE +0 -21
- bizon-0.0.14.dist-info/RECORD +0 -94
- /bizon/{sources → connectors/sources}/gsheets/src/source.py +0 -0
- /bizon/{sources → connectors/sources}/gsheets/tests/gsheets_pipeline.py +0 -0
- /bizon/{sources → connectors/sources}/hubspot/src/hubspot_base.py +0 -0
- /bizon/{sources → connectors/sources}/hubspot/src/models/hs_object.py +0 -0
- /bizon/{sources → connectors/sources}/hubspot/tests/hubspot_pipeline.py +0 -0
- /bizon/{sources → connectors/sources}/periscope/tests/periscope_pipeline_charts.py +0 -0
- /bizon/{destinations → destination}/models.py +0 -0
- {bizon-0.0.14.dist-info → bizon-0.1.1.dist-info}/entry_points.txt +0 -0
bizon/alerting/__init__.py
File without changes
bizon/alerting/alerts.py
ADDED
@@ -0,0 +1,23 @@
+from abc import ABC, abstractmethod
+from typing import Dict, List
+
+from loguru import logger
+
+from bizon.alerting.models import AlertingConfig, AlertMethod, LogLevel
+
+
+class AbstractAlert(ABC):
+
+    def __init__(self, type: AlertMethod, config: AlertingConfig, log_levels: List[LogLevel] = [LogLevel.ERROR]):
+        self.type = type
+        self.config = config
+        self.log_levels = log_levels
+
+    @abstractmethod
+    def handler(self, message: Dict) -> None:
+        pass
+
+    def add_handlers(self) -> None:
+        levels = [level.value for level in self.log_levels]
+        for level in levels:
+            logger.add(self.handler, level=level, format="{message}")
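`add_handlers` registers the alert's `handler` as a loguru sink once per configured level, so a new alert channel only has to implement `handler`. A minimal sketch of a custom channel; the `PrintAlert` class and its wiring are illustrative, not part of bizon, and `config=None` is acceptable only because the base class merely stores the value:

```python
from typing import Dict, List

from loguru import logger

from bizon.alerting.alerts import AbstractAlert
from bizon.alerting.models import AlertMethod, LogLevel


class PrintAlert(AbstractAlert):
    """Illustrative channel: writes matching log records to stdout."""

    def handler(self, message: Dict) -> None:
        record = message.record  # loguru sinks receive a Message whose .record holds the structured data
        print(f"[{record['level'].name}] {record['message']}")


# Hypothetical wiring; SLACK is reused as the method tag since it is the only AlertMethod defined.
alert = PrintAlert(type=AlertMethod.SLACK, config=None, log_levels=[LogLevel.WARNING, LogLevel.ERROR])
alert.add_handlers()                 # one loguru sink per configured level
logger.warning("disk almost full")   # now also routed through PrintAlert.handler
```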
bizon/alerting/models.py
ADDED
@@ -0,0 +1,28 @@
+from enum import Enum
+from typing import List, Optional, Union
+
+from pydantic import BaseModel
+
+from bizon.alerting.slack.config import SlackConfig
+
+
+class LogLevel(str, Enum):
+    DEBUG = "DEBUG"
+    INFO = "INFO"
+    WARNING = "WARNING"
+    ERROR = "ERROR"
+    CRITICAL = "CRITICAL"
+
+
+class AlertMethod(str, Enum):
+    """Alerting methods"""
+
+    SLACK = "slack"
+
+
+class AlertingConfig(BaseModel):
+    """Alerting configuration model"""
+
+    type: AlertMethod
+    log_levels: Optional[List[LogLevel]] = [LogLevel.ERROR]
+    config: Union[SlackConfig]
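`AlertingConfig` is a plain pydantic model, so the `alerting` block of a pipeline configuration can be validated on its own. A sketch, assuming `SlackConfig` (defined in `bizon/alerting/slack/config.py`, not shown in this diff) only requires the `webhook_url` field that the handler reads; the URL is a placeholder:

```python
from bizon.alerting.models import AlertingConfig, AlertMethod

alerting = AlertingConfig.model_validate(
    {
        "type": "slack",
        "log_levels": ["ERROR", "CRITICAL"],
        "config": {"webhook_url": "https://hooks.slack.com/services/T000/B000/XXXX"},
    }
)
assert alerting.type is AlertMethod.SLACK          # string coerced into the enum
assert alerting.config.webhook_url.startswith("https://hooks.slack.com/")
```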
bizon/alerting/slack/__init__.py
File without changes

bizon/alerting/slack/handler.py
ADDED
@@ -0,0 +1,39 @@
+import os
+from typing import Dict, List
+
+import requests
+from loguru import logger
+
+from bizon.alerting.alerts import AbstractAlert, AlertMethod
+from bizon.alerting.models import LogLevel
+from bizon.alerting.slack.config import SlackConfig
+
+
+class SlackHandler(AbstractAlert):
+    def __init__(self, config: SlackConfig, log_levels: List[LogLevel] = [LogLevel.ERROR]):
+        super().__init__(type=AlertMethod.SLACK, config=config, log_levels=log_levels)
+        self.webhook_url = config.webhook_url
+
+    def handler(self, message: Dict) -> None:
+        """
+        Custom handler to send error logs to Slack, with additional context.
+        """
+        log_entry = message.record
+        error_message = (
+            f"*Sync*: `{os.environ.get('BIZON_SYNC_NAME', 'N/A')}`\n"
+            f"*Source*: `{os.environ.get('BIZON_SOURCE_NAME', 'N/A')}` - `{os.environ.get('BIZON_SOURCE_STREAM', 'N/A')}`\n"  # noqa
+            f"*Destination*: `{os.environ.get('BIZON_DESTINATION_NAME', 'N/A')}`\n\n"
+            f"*Message:*\n```{log_entry['message']}```\n"
+            f"*File:* `{log_entry['file'].path}:{log_entry['line']}`\n"
+            f"*Function:* `{log_entry['function']}`\n"
+            f"*Level:* `{log_entry['level'].name}`\n"
+        )
+
+        payload = {"text": f":rotating_light: *Bizon Pipeline Alert* :rotating_light:\n\n{error_message}"}
+
+        try:
+            response = requests.post(self.webhook_url, json=payload)
+            response.raise_for_status()
+        except requests.exceptions.RequestException as e:
+            logger.error(f"Failed to send log to Slack: {e}")
+        return None
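The Slack handler is attached to loguru the same way as any other alert channel. A minimal sketch of enabling it by hand, again assuming `SlackConfig` exposes only the `webhook_url` field used above (the URL is a placeholder):

```python
from loguru import logger

from bizon.alerting.models import LogLevel
from bizon.alerting.slack.config import SlackConfig
from bizon.alerting.slack.handler import SlackHandler

slack = SlackHandler(
    config=SlackConfig(webhook_url="https://hooks.slack.com/services/T000/B000/XXXX"),
    log_levels=[LogLevel.ERROR, LogLevel.CRITICAL],
)
slack.add_handlers()  # registers one loguru sink per configured level

logger.error("Destination write failed")  # also posts a formatted alert to the webhook
```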
bizon/cli/main.py
CHANGED
@@ -83,7 +83,7 @@ def destination():
 @click.option(
     "--runner",
     required=False,
-    type=click.Choice(["thread", "process"]),
+    type=click.Choice(["thread", "process", "stream"]),
     default="thread",
     show_default=True,
     help="Runner type to use. Thread or Process.",
@@ -117,9 +117,13 @@ def run(
     set_runner_in_config(config=config, runner=runner)

     runner = RunnerFactory.create_from_config_dict(config=config)
-    runner.run()
+    result = runner.run()

-
+    if result.is_success:
+        click.secho("Pipeline finished successfully.", fg="green")
+
+    else:
+        raise click.exceptions.ClickException(result.to_string())


 if __name__ == "__main__":
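The CLI now inspects the pipeline result instead of discarding it. The same pattern applies when driving bizon from Python; a sketch, assuming `RunnerFactory` is importable from `bizon.engine.engine` (the CLI's import is not shown in this diff) and that `pipeline.yml` is a valid pipeline configuration:

```python
import yaml

from bizon.engine.engine import RunnerFactory  # assumed import path

with open("pipeline.yml") as f:
    config = yaml.safe_load(f)

runner = RunnerFactory.create_from_config_dict(config=config)
result = runner.run()

if result.is_success:
    print("Pipeline finished successfully.")
else:
    raise RuntimeError(result.to_string())
```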
bizon/common/models.py
CHANGED
@@ -1,13 +1,21 @@
-from typing import Union
+from typing import Optional, Union

 from pydantic import BaseModel, ConfigDict, Field

-from bizon.
-from bizon.destinations.
-from bizon.destinations.
-
+from bizon.alerting.models import AlertingConfig
+from bizon.connectors.destinations.bigquery.src.config import BigQueryConfig
+from bizon.connectors.destinations.bigquery_streaming.src.config import (
+    BigQueryStreamingConfig,
+)
+from bizon.connectors.destinations.bigquery_streaming_v2.src.config import (
+    BigQueryStreamingV2Config,
+)
+from bizon.connectors.destinations.file.src.config import FileDestinationConfig
+from bizon.connectors.destinations.logger.src.config import LoggerConfig
 from bizon.engine.config import EngineConfig
+from bizon.monitoring.config import MonitoringConfig
 from bizon.source.config import SourceConfig, SourceSyncModes
+from bizon.transform.config import TransformModel


 class BizonConfig(BaseModel):
@@ -23,9 +31,15 @@ class BizonConfig(BaseModel):
         default=...,
     )

+    transforms: Optional[list[TransformModel]] = Field(
+        description="List of transformations to apply to the source data",
+        default=[],
+    )
+
     destination: Union[
         BigQueryConfig,
         BigQueryStreamingConfig,
+        BigQueryStreamingV2Config,
         LoggerConfig,
         FileDestinationConfig,
     ] = Field(
@@ -39,6 +53,16 @@ class BizonConfig(BaseModel):
         default=EngineConfig(),
     )

+    alerting: Optional[AlertingConfig] = Field(
+        description="Alerting configuration",
+        default=None,
+    )
+
+    monitoring: Optional[MonitoringConfig] = Field(
+        description="Monitoring configuration",
+        default=None,
+    )
+

 class SyncMetadata(BaseModel):
     """Model which stores general metadata around a sync.
@@ -57,8 +81,8 @@ class SyncMetadata(BaseModel):
         return cls(
             name=config.name,
             job_id=job_id,
-            source_name=config.source.
-            stream_name=config.source.
+            source_name=config.source.name,
+            stream_name=config.source.stream,
             sync_mode=config.source.sync_mode,
             destination_name=config.destination.name,
         )
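`BizonConfig` therefore gains three optional top-level blocks (`transforms`, `alerting`, `monitoring`) alongside the existing `source`, `destination`, and `engine` sections. A quick way to confirm they are part of the model surface, assuming pydantic v2 (which the `ConfigDict` import implies):

```python
from bizon.common.models import BizonConfig

# The new optional blocks are regular, defaulted model fields.
for key in ("transforms", "alerting", "monitoring"):
    assert key in BizonConfig.model_fields
```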
bizon/{destinations → connectors/destinations}/bigquery/config/bigquery.example.yml
CHANGED
@@ -1,6 +1,8 @@
+name: hubspot contacts to bigquery
+
 source:
   name: hubspot
-
+  stream: contacts
   properties:
     strategy: all
   authentication:
@@ -34,6 +36,3 @@ destination:
       "client_x509_cert_url": "",
       "universe_domain": "googleapis.com"
     }
-
-pipeline:
-  log_level: DEBUG
bizon/connectors/destinations/bigquery/src/config.py
ADDED
@@ -0,0 +1,127 @@
+from enum import Enum
+from typing import Literal, Optional
+
+import polars as pl
+from pydantic import BaseModel, Field
+
+from bizon.destination.config import (
+    AbstractDestinationConfig,
+    AbstractDestinationDetailsConfig,
+    DestinationColumn,
+    DestinationTypes,
+)
+
+
+class GCSBufferFormat(str, Enum):
+    PARQUET = "parquet"
+    CSV = "csv"
+
+
+class TimePartitioning(str, Enum):
+    DAY = "DAY"
+    HOUR = "HOUR"
+    MONTH = "MONTH"
+    YEAR = "YEAR"
+
+
+class BigQueryColumnType(str, Enum):
+    BOOLEAN = "BOOLEAN"
+    BYTES = "BYTES"
+    DATE = "DATE"
+    DATETIME = "DATETIME"
+    FLOAT = "FLOAT"
+    FLOAT64 = "FLOAT64"
+    GEOGRAPHY = "GEOGRAPHY"
+    INTEGER = "INTEGER"
+    INT64 = "INT64"
+    NUMERIC = "NUMERIC"
+    BIGNUMERIC = "BIGNUMERIC"
+    JSON = "JSON"
+    RECORD = "RECORD"
+    STRING = "STRING"
+    TIME = "TIME"
+    TIMESTAMP = "TIMESTAMP"
+
+
+class BigQueryColumnMode(str, Enum):
+    NULLABLE = "NULLABLE"
+    REQUIRED = "REQUIRED"
+    REPEATED = "REPEATED"
+
+
+BIGQUERY_TO_POLARS_TYPE_MAPPING = {
+    "STRING": pl.String,
+    "BYTES": pl.Binary,
+    "INTEGER": pl.Int64,
+    "INT64": pl.Int64,
+    "FLOAT": pl.Float64,
+    "FLOAT64": pl.Float64,
+    "NUMERIC": pl.Float64,  # Can be refined for precision with Decimal128 if needed
+    "BIGNUMERIC": pl.Float64,  # Similar to NUMERIC
+    "BOOLEAN": pl.Boolean,
+    "BOOL": pl.Boolean,
+    "TIMESTAMP": pl.String,  # We use BigQuery internal parsing to convert to datetime
+    "DATE": pl.String,  # We use BigQuery internal parsing to convert to datetime
+    "DATETIME": pl.String,  # We use BigQuery internal parsing to convert to datetime
+    "TIME": pl.Time,
+    "GEOGRAPHY": pl.Object,  # Polars doesn't natively support geography types
+    "ARRAY": pl.List,  # Requires additional handling for element types
+    "JSON": pl.String,
+}
+
+
+class BigQueryColumn(DestinationColumn):
+    name: str = Field(..., description="Name of the column")
+    type: BigQueryColumnType = Field(..., description="Type of the column")
+    mode: BigQueryColumnMode = Field(..., description="Mode of the column")
+    description: Optional[str] = Field(None, description="Description of the column")
+    default_value_expression: Optional[str] = Field(None, description="Default value expression")
+
+    @property
+    def polars_type(self):
+        return BIGQUERY_TO_POLARS_TYPE_MAPPING.get(self.type.upper())
+
+
+class BigQueryAuthentication(BaseModel):
+    service_account_key: str = Field(
+        description="Service Account Key JSON string. If empty it will be infered",
+        default="",
+    )
+
+
+class BigQueryRecordSchemaConfig(BaseModel):
+    destination_id: str = Field(..., description="Destination ID")
+    record_schema: list[BigQueryColumn] = Field(..., description="Record schema")
+
+    # BigQuery Clustering Keys
+    clustering_keys: Optional[list[str]] = Field(None, description="Clustering keys")
+
+
+class BigQueryConfigDetails(AbstractDestinationDetailsConfig):
+
+    # Table details
+    project_id: str = Field(..., description="BigQuery Project ID")
+    dataset_id: str = Field(..., description="BigQuery Dataset ID")
+    dataset_location: str = Field(default="US", description="BigQuery Dataset location")
+
+    # GCS Buffer
+    gcs_buffer_bucket: str = Field(..., description="GCS Buffer bucket")
+    gcs_buffer_format: GCSBufferFormat = Field(default=GCSBufferFormat.PARQUET, description="GCS Buffer format")
+
+    # Time partitioning
+    time_partitioning: TimePartitioning = Field(
+        default=TimePartitioning.DAY, description="BigQuery Time partitioning type"
+    )
+
+    # Schema for unnesting
+    record_schemas: Optional[list[BigQueryRecordSchemaConfig]] = Field(
+        default=None, description="Schema for the records. Required if unnest is set to true."
+    )
+
+    authentication: Optional[BigQueryAuthentication] = None
+
+
+class BigQueryConfig(AbstractDestinationConfig):
+    name: Literal[DestinationTypes.BIGQUERY]
+    buffer_size: Optional[int] = 400
+    config: BigQueryConfigDetails
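Each declared column carries its own Polars dtype through `BIGQUERY_TO_POLARS_TYPE_MAPPING`, which is what the unnesting path relies on. A small sketch; the column definition is illustrative, and it assumes the `DestinationColumn` base class (defined in `bizon/destination/config.py`, not shown in this diff) adds no further required fields:

```python
import polars as pl

from bizon.connectors.destinations.bigquery.src.config import (
    BigQueryColumn,
    BigQueryColumnMode,
    BigQueryColumnType,
)

col = BigQueryColumn(
    name="created_at",
    type=BigQueryColumnType.TIMESTAMP,
    mode=BigQueryColumnMode.NULLABLE,
    description="Creation timestamp, parsed by BigQuery at load time",
)

# TIMESTAMP values stay strings on the Polars side; BigQuery parses them at load time.
assert col.polars_type is pl.String
```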
bizon/{destinations → connectors/destinations}/bigquery/src/destination.py
CHANGED
@@ -1,5 +1,4 @@
 import io
-import json
 import os
 import tempfile
 import traceback
@@ -13,18 +12,24 @@ from google.cloud.bigquery import DatasetReference, TimePartitioning
 from loguru import logger

 from bizon.common.models import SyncMetadata
-from bizon.
-from bizon.destinations.destination import AbstractDestination
+from bizon.destination.destination import AbstractDestination
 from bizon.engine.backend.backend import AbstractBackend
 from bizon.source.config import SourceSyncModes
+from bizon.source.source import AbstractSourceCallback

-from .config import BigQueryConfigDetails
+from .config import BigQueryColumn, BigQueryConfigDetails


 class BigQueryDestination(AbstractDestination):

-    def __init__(
-
+    def __init__(
+        self,
+        sync_metadata: SyncMetadata,
+        config: BigQueryConfigDetails,
+        backend: AbstractBackend,
+        source_callback: AbstractSourceCallback,
+    ):
+        super().__init__(sync_metadata, config, backend, source_callback)
         self.config: BigQueryConfigDetails = config

         if config.authentication and config.authentication.service_account_key:
@@ -44,7 +49,7 @@ class BigQueryDestination(AbstractDestination):

     @property
     def table_id(self) -> str:
-        tabled_id = self.
+        tabled_id = self.destination_id or f"{self.sync_metadata.source_name}_{self.sync_metadata.stream_name}"
         return f"{self.project_id}.{self.dataset_id}.{tabled_id}"

     @property
@@ -61,28 +66,24 @@ class BigQueryDestination(AbstractDestination):

     def get_bigquery_schema(self, df_destination_records: pl.DataFrame) -> List[bigquery.SchemaField]:

-        # we
-        if self.config.
+        # Case we unnest the data
+        if self.config.unnest:
             return [
-                bigquery.SchemaField("_source_record_id", "STRING", mode="REQUIRED"),
-                bigquery.SchemaField("_source_timestamp", "TIMESTAMP", mode="REQUIRED"),
-                bigquery.SchemaField("_source_data", "STRING", mode="NULLABLE"),
-                bigquery.SchemaField("_bizon_extracted_at", "TIMESTAMP", mode="REQUIRED"),
                 bigquery.SchemaField(
-
-
-
+                    col.name,
+                    col.type,
+                    mode=col.mode,
+                    description=col.description,
+                )
+                for col in self.record_schemas[self.destination_id]
             ]

-        #
-
-
-        # We use the first record to infer the schema of tabular data (key / value pairs)
-        source_data_keys = list(json.loads(df_destination_records["source_data"][0]).keys())
-
-        return [bigquery.SchemaField(key, "STRING", mode="NULLABLE") for key in source_data_keys] + [
+        # Case we don't unnest the data
+        else:
+            return [
                 bigquery.SchemaField("_source_record_id", "STRING", mode="REQUIRED"),
                 bigquery.SchemaField("_source_timestamp", "TIMESTAMP", mode="REQUIRED"),
+                bigquery.SchemaField("_source_data", "STRING", mode="NULLABLE"),
                 bigquery.SchemaField("_bizon_extracted_at", "TIMESTAMP", mode="REQUIRED"),
                 bigquery.SchemaField(
                     "_bizon_loaded_at", "TIMESTAMP", mode="REQUIRED", default_value_expression="CURRENT_TIMESTAMP()"
@@ -90,8 +91,6 @@ class BigQueryDestination(AbstractDestination):
                 bigquery.SchemaField("_bizon_id", "STRING", mode="REQUIRED"),
             ]

-        raise NotImplementedError(f"Normalization type {self.config.normalization.type} is not supported")
-
     def check_connection(self) -> bool:
         dataset_ref = DatasetReference(self.project_id, self.dataset_id)

@@ -129,6 +128,28 @@ class BigQueryDestination(AbstractDestination):

         raise NotImplementedError(f"Buffer format {self.buffer_format} is not supported")

+    @staticmethod
+    def unnest_data(df_destination_records: pl.DataFrame, record_schema: list[BigQueryColumn]) -> pl.DataFrame:
+        """Unnest the source_data field into separate columns"""
+
+        # Check if the schema matches the expected schema
+        source_data_fields = (
+            pl.DataFrame(df_destination_records["source_data"].str.json_decode(infer_schema_length=None))
+            .schema["source_data"]
+            .fields
+        )
+
+        record_schema_fields = [col.name for col in record_schema]
+
+        for field in source_data_fields:
+            assert field.name in record_schema_fields, f"Column {field.name} not found in BigQuery schema"
+
+        # Parse the JSON and unnest the fields to polar type
+        return df_destination_records.select(
+            pl.col("source_data").str.json_path_match(f"$.{col.name}").cast(col.polars_type).alias(col.name)
+            for col in record_schema
+        )
+
     def load_to_bigquery(self, gcs_file: str, df_destination_records: pl.DataFrame):

         # We always partition by the loaded_at field
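The new `unnest_data` helper is a thin wrapper around Polars JSON extraction. A standalone sketch of the same idea, using plain Polars expressions and illustrative data rather than bizon's own types:

```python
import polars as pl

df = pl.DataFrame(
    {
        "source_data": [
            '{"id": "1", "amount": 12.5}',
            '{"id": "2", "amount": 7.0}',
        ]
    }
)

# Extract each declared field from the JSON payload and cast it to its target dtype,
# mirroring what unnest_data does with the configured record schema.
unnested = df.select(
    pl.col("source_data").str.json_path_match("$.id").cast(pl.String).alias("id"),
    pl.col("source_data").str.json_path_match("$.amount").cast(pl.Float64).alias("amount"),
)
print(unnested)  # two typed columns: id (str) and amount (f64)
```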
bizon/connectors/destinations/bigquery_streaming/src/config.py
ADDED
@@ -0,0 +1,56 @@
+from enum import Enum
+from typing import Literal, Optional
+
+from pydantic import BaseModel, Field
+
+from bizon.connectors.destinations.bigquery.src.config import BigQueryRecordSchemaConfig
+from bizon.destination.config import (
+    AbstractDestinationConfig,
+    AbstractDestinationDetailsConfig,
+    DestinationTypes,
+)
+
+
+class TimePartitioningWindow(str, Enum):
+    DAY = "DAY"
+    HOUR = "HOUR"
+    MONTH = "MONTH"
+    YEAR = "YEAR"
+
+
+class TimePartitioning(BaseModel):
+    type: TimePartitioningWindow = Field(default=TimePartitioningWindow.DAY, description="Time partitioning type")
+    field: Optional[str] = Field(
+        "_bizon_loaded_at", description="Field to partition by. You can use a transformation to create this field."
+    )
+
+
+class BigQueryAuthentication(BaseModel):
+    service_account_key: str = Field(
+        description="Service Account Key JSON string. If empty it will be infered",
+        default="",
+    )
+
+
+class BigQueryStreamingConfigDetails(AbstractDestinationDetailsConfig):
+    project_id: str
+    dataset_id: str
+    dataset_location: Optional[str] = "US"
+    time_partitioning: Optional[TimePartitioning] = Field(
+        default=TimePartitioning(type=TimePartitioningWindow.DAY, field="_bizon_loaded_at"),
+        description="BigQuery Time partitioning type",
+    )
+    authentication: Optional[BigQueryAuthentication] = None
+    bq_max_rows_per_request: Optional[int] = Field(30000, description="Max rows per buffer streaming request.")
+    record_schemas: Optional[list[BigQueryRecordSchemaConfig]] = Field(
+        default=None, description="Schema for the records. Required if unnest is set to true."
+    )
+    use_legacy_streaming_api: bool = Field(
+        default=False,
+        description="[DEPRECATED] Use the legacy streaming API. This is required for some older BigQuery versions.",
+    )
+
+
+class BigQueryStreamingConfig(AbstractDestinationConfig):
+    name: Literal[DestinationTypes.BIGQUERY_STREAMING]
+    config: BigQueryStreamingConfigDetails
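Time partitioning for the streaming destination is now a small model rather than a bare enum, so the partition field can be overridden. A sketch with illustrative values:

```python
from bizon.connectors.destinations.bigquery_streaming.src.config import (
    TimePartitioning,
    TimePartitioningWindow,
)

# Defaults: daily partitions on the _bizon_loaded_at column.
default_partitioning = TimePartitioning()
assert default_partitioning.type is TimePartitioningWindow.DAY
assert default_partitioning.field == "_bizon_loaded_at"

# Hourly partitions on a field produced by a transform (field name is illustrative).
hourly = TimePartitioning(type=TimePartitioningWindow.HOUR, field="event_timestamp")
```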