bizon 0.0.14__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (114)
  1. bizon/alerting/__init__.py +0 -0
  2. bizon/alerting/alerts.py +23 -0
  3. bizon/alerting/models.py +28 -0
  4. bizon/alerting/slack/__init__.py +0 -0
  5. bizon/alerting/slack/config.py +5 -0
  6. bizon/alerting/slack/handler.py +39 -0
  7. bizon/cli/main.py +7 -3
  8. bizon/common/models.py +31 -7
  9. bizon/{destinations → connectors/destinations}/bigquery/config/bigquery.example.yml +3 -4
  10. bizon/connectors/destinations/bigquery/src/config.py +127 -0
  11. bizon/{destinations → connectors/destinations}/bigquery/src/destination.py +46 -25
  12. bizon/connectors/destinations/bigquery_streaming/src/config.py +56 -0
  13. bizon/connectors/destinations/bigquery_streaming/src/destination.py +372 -0
  14. bizon/connectors/destinations/bigquery_streaming_v2/src/config.py +52 -0
  15. bizon/connectors/destinations/bigquery_streaming_v2/src/destination.py +261 -0
  16. bizon/{destinations/bigquery_streaming → connectors/destinations/bigquery_streaming_v2}/src/proto_utils.py +32 -26
  17. bizon/{destinations → connectors/destinations}/file/src/config.py +8 -3
  18. bizon/connectors/destinations/file/src/destination.py +54 -0
  19. bizon/{destinations → connectors/destinations}/logger/src/config.py +1 -1
  20. bizon/{destinations → connectors/destinations}/logger/src/destination.py +15 -3
  21. bizon/connectors/sources/cycle/config/cycle.example.yml +15 -0
  22. bizon/connectors/sources/cycle/src/source.py +133 -0
  23. bizon/{sources/periscope/tests/periscope_pipeline_dashboard.py → connectors/sources/cycle/tests/cycle_customers.py} +1 -1
  24. bizon/connectors/sources/dummy/config/dummy.example.yml +22 -0
  25. bizon/{sources → connectors/sources}/dummy/src/fake_api.py +6 -1
  26. bizon/{sources → connectors/sources}/dummy/src/source.py +18 -5
  27. bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline.py +5 -14
  28. bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_bigquery_backend.py +2 -2
  29. bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_kafka.py +2 -2
  30. bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_rabbitmq.py +2 -2
  31. bizon/connectors/sources/dummy/tests/dummy_pipeline_unnest.py +29 -0
  32. bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_write_data_bigquery.py +3 -3
  33. bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_write_data_bigquery_through_kafka.py +2 -2
  34. bizon/{sources → connectors/sources}/gsheets/config/default_auth.example.yml +4 -2
  35. bizon/{sources → connectors/sources}/gsheets/config/service_account.example.yml +4 -2
  36. bizon/{sources → connectors/sources}/hubspot/config/api_key.example.yml +4 -2
  37. bizon/{sources → connectors/sources}/hubspot/config/oauth.example.yml +4 -2
  38. bizon/{sources → connectors/sources}/hubspot/src/hubspot_objects.py +1 -1
  39. bizon/connectors/sources/kafka/config/kafka.example.yml +50 -0
  40. bizon/connectors/sources/kafka/config/kafka_debezium.example.yml +112 -0
  41. bizon/connectors/sources/kafka/src/callback.py +18 -0
  42. bizon/connectors/sources/kafka/src/config.py +75 -0
  43. bizon/connectors/sources/kafka/src/decode.py +88 -0
  44. bizon/connectors/sources/kafka/src/source.py +361 -0
  45. bizon/connectors/sources/kafka/tests/kafka_pipeline.py +7 -0
  46. bizon/connectors/sources/periscope/config/periscope_charts.example.yml +20 -0
  47. bizon/connectors/sources/periscope/config/periscope_dashboards.example.yml +20 -0
  48. bizon/{sources → connectors/sources}/periscope/src/source.py +136 -13
  49. bizon/connectors/sources/periscope/tests/periscope_pipeline_dashboard.py +9 -0
  50. bizon/connectors/sources/pokeapi/config/pokeapi_pokemon_to_json.example.yml +19 -0
  51. bizon/connectors/sources/pokeapi/config/pokeapi_pokemon_to_logger.example.yml +10 -0
  52. bizon/connectors/sources/pokeapi/src/source.py +79 -0
  53. bizon/{destinations → destination}/buffer.py +5 -0
  54. bizon/destination/config.py +74 -0
  55. bizon/{destinations → destination}/destination.py +71 -15
  56. bizon/engine/backend/adapters/sqlalchemy/backend.py +14 -23
  57. bizon/engine/engine.py +20 -1
  58. bizon/engine/pipeline/consumer.py +73 -5
  59. bizon/engine/pipeline/models.py +8 -3
  60. bizon/engine/pipeline/producer.py +18 -9
  61. bizon/engine/queue/adapters/kafka/consumer.py +2 -2
  62. bizon/engine/queue/adapters/kafka/queue.py +3 -2
  63. bizon/engine/queue/adapters/python_queue/consumer.py +40 -23
  64. bizon/engine/queue/adapters/python_queue/queue.py +19 -9
  65. bizon/engine/queue/adapters/rabbitmq/consumer.py +3 -6
  66. bizon/engine/queue/adapters/rabbitmq/queue.py +3 -2
  67. bizon/engine/queue/config.py +16 -0
  68. bizon/engine/queue/queue.py +17 -16
  69. bizon/engine/runner/adapters/process.py +15 -2
  70. bizon/engine/runner/adapters/streaming.py +103 -0
  71. bizon/engine/runner/adapters/thread.py +32 -9
  72. bizon/engine/runner/config.py +28 -0
  73. bizon/engine/runner/runner.py +107 -25
  74. bizon/monitoring/__init__.py +0 -0
  75. bizon/monitoring/config.py +29 -0
  76. bizon/monitoring/datadog/__init__.py +0 -0
  77. bizon/monitoring/datadog/monitor.py +69 -0
  78. bizon/monitoring/monitor.py +42 -0
  79. bizon/monitoring/noop/__init__.py +0 -0
  80. bizon/monitoring/noop/monitor.py +11 -0
  81. bizon/source/callback.py +24 -0
  82. bizon/source/config.py +3 -3
  83. bizon/source/cursor.py +1 -1
  84. bizon/source/discover.py +4 -3
  85. bizon/source/models.py +4 -2
  86. bizon/source/source.py +10 -2
  87. bizon/transform/config.py +8 -0
  88. bizon/transform/transform.py +48 -0
  89. bizon-0.1.1.dist-info/LICENSE +674 -0
  90. {bizon-0.0.14.dist-info → bizon-0.1.1.dist-info}/METADATA +25 -7
  91. bizon-0.1.1.dist-info/RECORD +123 -0
  92. {bizon-0.0.14.dist-info → bizon-0.1.1.dist-info}/WHEEL +1 -1
  93. bizon/destinations/bigquery/src/config.py +0 -51
  94. bizon/destinations/bigquery_streaming/src/config.py +0 -43
  95. bizon/destinations/bigquery_streaming/src/destination.py +0 -154
  96. bizon/destinations/config.py +0 -47
  97. bizon/destinations/file/src/destination.py +0 -27
  98. bizon/sources/dummy/config/api_key.example.yml +0 -20
  99. bizon/sources/dummy/config/api_key_kafka.example.yml +0 -27
  100. bizon/sources/kafka/config/kafka.example.yml +0 -38
  101. bizon/sources/kafka/src/source.py +0 -357
  102. bizon/sources/kafka/tests/kafka_pipeline.py +0 -9
  103. bizon/sources/periscope/config/periscope_charts.example.yml +0 -26
  104. bizon/sources/periscope/config/periscope_dashboards.example.yml +0 -26
  105. bizon-0.0.14.dist-info/LICENSE +0 -21
  106. bizon-0.0.14.dist-info/RECORD +0 -94
  107. /bizon/{sources → connectors/sources}/gsheets/src/source.py +0 -0
  108. /bizon/{sources → connectors/sources}/gsheets/tests/gsheets_pipeline.py +0 -0
  109. /bizon/{sources → connectors/sources}/hubspot/src/hubspot_base.py +0 -0
  110. /bizon/{sources → connectors/sources}/hubspot/src/models/hs_object.py +0 -0
  111. /bizon/{sources → connectors/sources}/hubspot/tests/hubspot_pipeline.py +0 -0
  112. /bizon/{sources → connectors/sources}/periscope/tests/periscope_pipeline_charts.py +0 -0
  113. /bizon/{destinations → destination}/models.py +0 -0
  114. {bizon-0.0.14.dist-info → bizon-0.1.1.dist-info}/entry_points.txt +0 -0

bizon/alerting/__init__.py ADDED
File without changes

bizon/alerting/alerts.py ADDED
@@ -0,0 +1,23 @@
+ from abc import ABC, abstractmethod
+ from typing import Dict, List
+
+ from loguru import logger
+
+ from bizon.alerting.models import AlertingConfig, AlertMethod, LogLevel
+
+
+ class AbstractAlert(ABC):
+
+     def __init__(self, type: AlertMethod, config: AlertingConfig, log_levels: List[LogLevel] = [LogLevel.ERROR]):
+         self.type = type
+         self.config = config
+         self.log_levels = log_levels
+
+     @abstractmethod
+     def handler(self, message: Dict) -> None:
+         pass
+
+     def add_handlers(self) -> None:
+         levels = [level.value for level in self.log_levels]
+         for level in levels:
+             logger.add(self.handler, level=level, format="{message}")

bizon/alerting/models.py ADDED
@@ -0,0 +1,28 @@
+ from enum import Enum
+ from typing import List, Optional, Union
+
+ from pydantic import BaseModel
+
+ from bizon.alerting.slack.config import SlackConfig
+
+
+ class LogLevel(str, Enum):
+     DEBUG = "DEBUG"
+     INFO = "INFO"
+     WARNING = "WARNING"
+     ERROR = "ERROR"
+     CRITICAL = "CRITICAL"
+
+
+ class AlertMethod(str, Enum):
+     """Alerting methods"""
+
+     SLACK = "slack"
+
+
+ class AlertingConfig(BaseModel):
+     """Alerting configuration model"""
+
+     type: AlertMethod
+     log_levels: Optional[List[LogLevel]] = [LogLevel.ERROR]
+     config: Union[SlackConfig]
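
For reference, a minimal sketch of how these alerting models could be instantiated once bizon 0.1.1 is installed (the webhook URL is a placeholder; the imports follow the modules added above):

from bizon.alerting.models import AlertingConfig, AlertMethod, LogLevel
from bizon.alerting.slack.config import SlackConfig

# Pydantic validates the alert method, the log levels and the nested Slack config
alerting = AlertingConfig(
    type=AlertMethod.SLACK,
    log_levels=[LogLevel.ERROR, LogLevel.CRITICAL],
    config=SlackConfig(webhook_url="https://hooks.slack.com/services/T000/B000/XXXX"),
)
print(alerting.model_dump())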

bizon/alerting/slack/__init__.py ADDED
File without changes

bizon/alerting/slack/config.py ADDED
@@ -0,0 +1,5 @@
+ from pydantic import BaseModel
+
+
+ class SlackConfig(BaseModel):
+     webhook_url: str

bizon/alerting/slack/handler.py ADDED
@@ -0,0 +1,39 @@
+ import os
+ from typing import Dict, List
+
+ import requests
+ from loguru import logger
+
+ from bizon.alerting.alerts import AbstractAlert, AlertMethod
+ from bizon.alerting.models import LogLevel
+ from bizon.alerting.slack.config import SlackConfig
+
+
+ class SlackHandler(AbstractAlert):
+     def __init__(self, config: SlackConfig, log_levels: List[LogLevel] = [LogLevel.ERROR]):
+         super().__init__(type=AlertMethod.SLACK, config=config, log_levels=log_levels)
+         self.webhook_url = config.webhook_url
+
+     def handler(self, message: Dict) -> None:
+         """
+         Custom handler to send error logs to Slack, with additional context.
+         """
+         log_entry = message.record
+         error_message = (
+             f"*Sync*: `{os.environ.get('BIZON_SYNC_NAME', 'N/A')}`\n"
+             f"*Source*: `{os.environ.get('BIZON_SOURCE_NAME', 'N/A')}` - `{os.environ.get('BIZON_SOURCE_STREAM', 'N/A')}`\n"  # noqa
+             f"*Destination*: `{os.environ.get('BIZON_DESTINATION_NAME', 'N/A')}`\n\n"
+             f"*Message:*\n```{log_entry['message']}```\n"
+             f"*File:* `{log_entry['file'].path}:{log_entry['line']}`\n"
+             f"*Function:* `{log_entry['function']}`\n"
+             f"*Level:* `{log_entry['level'].name}`\n"
+         )
+
+         payload = {"text": f":rotating_light: *Bizon Pipeline Alert* :rotating_light:\n\n{error_message}"}
+
+         try:
+             response = requests.post(self.webhook_url, json=payload)
+             response.raise_for_status()
+         except requests.exceptions.RequestException as e:
+             logger.error(f"Failed to send log to Slack: {e}")
+         return None
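
A short usage sketch for the handler above (hypothetical wiring; AbstractAlert.add_handlers registers one loguru sink per configured level, so matching log records are posted to the webhook):

from loguru import logger

from bizon.alerting.models import LogLevel
from bizon.alerting.slack.config import SlackConfig
from bizon.alerting.slack.handler import SlackHandler

slack = SlackHandler(
    config=SlackConfig(webhook_url="https://hooks.slack.com/services/T000/B000/XXXX"),
    log_levels=[LogLevel.ERROR],
)
slack.add_handlers()  # attaches SlackHandler.handler as a loguru sink for ERROR

logger.error("Backfill failed")  # also forwarded to the Slack webhook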
bizon/cli/main.py CHANGED
@@ -83,7 +83,7 @@ def destination():
  @click.option(
      "--runner",
      required=False,
-     type=click.Choice(["thread", "process"]),
+     type=click.Choice(["thread", "process", "stream"]),
      default="thread",
      show_default=True,
      help="Runner type to use. Thread or Process.",
@@ -117,9 +117,13 @@ def run(
      set_runner_in_config(config=config, runner=runner)

      runner = RunnerFactory.create_from_config_dict(config=config)
-     runner.run()
+     result = runner.run()

-     click.echo("Pipeline finished.")
+     if result.is_success:
+         click.secho("Pipeline finished successfully.", fg="green")
+
+     else:
+         raise click.exceptions.ClickException(result.to_string())


  if __name__ == "__main__":
bizon/common/models.py CHANGED
@@ -1,13 +1,21 @@
- from typing import Union
+ from typing import Optional, Union

  from pydantic import BaseModel, ConfigDict, Field

- from bizon.destinations.bigquery.src.config import BigQueryConfig
- from bizon.destinations.bigquery_streaming.src.config import BigQueryStreamingConfig
- from bizon.destinations.file.src.config import FileDestinationConfig
- from bizon.destinations.logger.src.config import LoggerConfig
+ from bizon.alerting.models import AlertingConfig
+ from bizon.connectors.destinations.bigquery.src.config import BigQueryConfig
+ from bizon.connectors.destinations.bigquery_streaming.src.config import (
+     BigQueryStreamingConfig,
+ )
+ from bizon.connectors.destinations.bigquery_streaming_v2.src.config import (
+     BigQueryStreamingV2Config,
+ )
+ from bizon.connectors.destinations.file.src.config import FileDestinationConfig
+ from bizon.connectors.destinations.logger.src.config import LoggerConfig
  from bizon.engine.config import EngineConfig
+ from bizon.monitoring.config import MonitoringConfig
  from bizon.source.config import SourceConfig, SourceSyncModes
+ from bizon.transform.config import TransformModel


  class BizonConfig(BaseModel):
@@ -23,9 +31,15 @@ class BizonConfig(BaseModel):
          default=...,
      )

+     transforms: Optional[list[TransformModel]] = Field(
+         description="List of transformations to apply to the source data",
+         default=[],
+     )
+
      destination: Union[
          BigQueryConfig,
          BigQueryStreamingConfig,
+         BigQueryStreamingV2Config,
          LoggerConfig,
          FileDestinationConfig,
      ] = Field(
@@ -39,6 +53,16 @@ class BizonConfig(BaseModel):
          default=EngineConfig(),
      )

+     alerting: Optional[AlertingConfig] = Field(
+         description="Alerting configuration",
+         default=None,
+     )
+
+     monitoring: Optional[MonitoringConfig] = Field(
+         description="Monitoring configuration",
+         default=None,
+     )
+

  class SyncMetadata(BaseModel):
      """Model which stores general metadata around a sync.
@@ -57,8 +81,8 @@ class SyncMetadata(BaseModel):
          return cls(
              name=config.name,
              job_id=job_id,
-             source_name=config.source.source_name,
-             stream_name=config.source.stream_name,
+             source_name=config.source.name,
+             stream_name=config.source.stream,
              sync_mode=config.source.sync_mode,
              destination_name=config.destination.name,
          )

bizon/{destinations → connectors/destinations}/bigquery/config/bigquery.example.yml RENAMED
@@ -1,6 +1,8 @@
+ name: hubspot contacts to bigquery
+
  source:
    name: hubspot
-   stream_name: contacts
+   stream: contacts
    properties:
      strategy: all
    authentication:
@@ -34,6 +36,3 @@ destination:
      "client_x509_cert_url": "",
      "universe_domain": "googleapis.com"
    }
-
- pipeline:
-   log_level: DEBUG

bizon/connectors/destinations/bigquery/src/config.py ADDED
@@ -0,0 +1,127 @@
+ from enum import Enum
+ from typing import Literal, Optional
+
+ import polars as pl
+ from pydantic import BaseModel, Field
+
+ from bizon.destination.config import (
+     AbstractDestinationConfig,
+     AbstractDestinationDetailsConfig,
+     DestinationColumn,
+     DestinationTypes,
+ )
+
+
+ class GCSBufferFormat(str, Enum):
+     PARQUET = "parquet"
+     CSV = "csv"
+
+
+ class TimePartitioning(str, Enum):
+     DAY = "DAY"
+     HOUR = "HOUR"
+     MONTH = "MONTH"
+     YEAR = "YEAR"
+
+
+ class BigQueryColumnType(str, Enum):
+     BOOLEAN = "BOOLEAN"
+     BYTES = "BYTES"
+     DATE = "DATE"
+     DATETIME = "DATETIME"
+     FLOAT = "FLOAT"
+     FLOAT64 = "FLOAT64"
+     GEOGRAPHY = "GEOGRAPHY"
+     INTEGER = "INTEGER"
+     INT64 = "INT64"
+     NUMERIC = "NUMERIC"
+     BIGNUMERIC = "BIGNUMERIC"
+     JSON = "JSON"
+     RECORD = "RECORD"
+     STRING = "STRING"
+     TIME = "TIME"
+     TIMESTAMP = "TIMESTAMP"
+
+
+ class BigQueryColumnMode(str, Enum):
+     NULLABLE = "NULLABLE"
+     REQUIRED = "REQUIRED"
+     REPEATED = "REPEATED"
+
+
+ BIGQUERY_TO_POLARS_TYPE_MAPPING = {
+     "STRING": pl.String,
+     "BYTES": pl.Binary,
+     "INTEGER": pl.Int64,
+     "INT64": pl.Int64,
+     "FLOAT": pl.Float64,
+     "FLOAT64": pl.Float64,
+     "NUMERIC": pl.Float64,  # Can be refined for precision with Decimal128 if needed
+     "BIGNUMERIC": pl.Float64,  # Similar to NUMERIC
+     "BOOLEAN": pl.Boolean,
+     "BOOL": pl.Boolean,
+     "TIMESTAMP": pl.String,  # We use BigQuery internal parsing to convert to datetime
+     "DATE": pl.String,  # We use BigQuery internal parsing to convert to datetime
+     "DATETIME": pl.String,  # We use BigQuery internal parsing to convert to datetime
+     "TIME": pl.Time,
+     "GEOGRAPHY": pl.Object,  # Polars doesn't natively support geography types
+     "ARRAY": pl.List,  # Requires additional handling for element types
+     "JSON": pl.String,
+ }
+
+
+ class BigQueryColumn(DestinationColumn):
+     name: str = Field(..., description="Name of the column")
+     type: BigQueryColumnType = Field(..., description="Type of the column")
+     mode: BigQueryColumnMode = Field(..., description="Mode of the column")
+     description: Optional[str] = Field(None, description="Description of the column")
+     default_value_expression: Optional[str] = Field(None, description="Default value expression")
+
+     @property
+     def polars_type(self):
+         return BIGQUERY_TO_POLARS_TYPE_MAPPING.get(self.type.upper())
+
+
+ class BigQueryAuthentication(BaseModel):
+     service_account_key: str = Field(
+         description="Service Account Key JSON string. If empty it will be infered",
+         default="",
+     )
+
+
+ class BigQueryRecordSchemaConfig(BaseModel):
+     destination_id: str = Field(..., description="Destination ID")
+     record_schema: list[BigQueryColumn] = Field(..., description="Record schema")
+
+     # BigQuery Clustering Keys
+     clustering_keys: Optional[list[str]] = Field(None, description="Clustering keys")
+
+
+ class BigQueryConfigDetails(AbstractDestinationDetailsConfig):
+
+     # Table details
+     project_id: str = Field(..., description="BigQuery Project ID")
+     dataset_id: str = Field(..., description="BigQuery Dataset ID")
+     dataset_location: str = Field(default="US", description="BigQuery Dataset location")
+
+     # GCS Buffer
+     gcs_buffer_bucket: str = Field(..., description="GCS Buffer bucket")
+     gcs_buffer_format: GCSBufferFormat = Field(default=GCSBufferFormat.PARQUET, description="GCS Buffer format")
+
+     # Time partitioning
+     time_partitioning: TimePartitioning = Field(
+         default=TimePartitioning.DAY, description="BigQuery Time partitioning type"
+     )
+
+     # Schema for unnesting
+     record_schemas: Optional[list[BigQueryRecordSchemaConfig]] = Field(
+         default=None, description="Schema for the records. Required if unnest is set to true."
+     )
+
+     authentication: Optional[BigQueryAuthentication] = None
+
+
+ class BigQueryConfig(AbstractDestinationConfig):
+     name: Literal[DestinationTypes.BIGQUERY]
+     buffer_size: Optional[int] = 400
+     config: BigQueryConfigDetails
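
A small illustrative sketch of the new column model and its Polars mapping (assumes the package is installed; the column values are made up):

import polars as pl

from bizon.connectors.destinations.bigquery.src.config import BigQueryColumn

col = BigQueryColumn(name="user_id", type="INTEGER", mode="REQUIRED")
assert col.polars_type is pl.Int64  # resolved through BIGQUERY_TO_POLARS_TYPE_MAPPING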

bizon/{destinations → connectors/destinations}/bigquery/src/destination.py RENAMED
@@ -1,5 +1,4 @@
  import io
- import json
  import os
  import tempfile
  import traceback
@@ -13,18 +12,24 @@ from google.cloud.bigquery import DatasetReference, TimePartitioning
  from loguru import logger

  from bizon.common.models import SyncMetadata
- from bizon.destinations.config import NormalizationType
- from bizon.destinations.destination import AbstractDestination
+ from bizon.destination.destination import AbstractDestination
  from bizon.engine.backend.backend import AbstractBackend
  from bizon.source.config import SourceSyncModes
+ from bizon.source.source import AbstractSourceCallback

- from .config import BigQueryConfigDetails
+ from .config import BigQueryColumn, BigQueryConfigDetails


  class BigQueryDestination(AbstractDestination):

-     def __init__(self, sync_metadata: SyncMetadata, config: BigQueryConfigDetails, backend: AbstractBackend):
-         super().__init__(sync_metadata, config, backend)
+     def __init__(
+         self,
+         sync_metadata: SyncMetadata,
+         config: BigQueryConfigDetails,
+         backend: AbstractBackend,
+         source_callback: AbstractSourceCallback,
+     ):
+         super().__init__(sync_metadata, config, backend, source_callback)
          self.config: BigQueryConfigDetails = config

          if config.authentication and config.authentication.service_account_key:
@@ -44,7 +49,7 @@ class BigQueryDestination(AbstractDestination):

      @property
      def table_id(self) -> str:
-         tabled_id = self.config.table_id or f"{self.sync_metadata.source_name}_{self.sync_metadata.stream_name}"
+         tabled_id = self.destination_id or f"{self.sync_metadata.source_name}_{self.sync_metadata.stream_name}"
          return f"{self.project_id}.{self.dataset_id}.{tabled_id}"

      @property
@@ -61,28 +66,24 @@ class BigQueryDestination(AbstractDestination):

      def get_bigquery_schema(self, df_destination_records: pl.DataFrame) -> List[bigquery.SchemaField]:

-         # we keep raw data in the column source_data
-         if self.config.normalization.type == NormalizationType.NONE:
+         # Case we unnest the data
+         if self.config.unnest:
              return [
-                 bigquery.SchemaField("_source_record_id", "STRING", mode="REQUIRED"),
-                 bigquery.SchemaField("_source_timestamp", "TIMESTAMP", mode="REQUIRED"),
-                 bigquery.SchemaField("_source_data", "STRING", mode="NULLABLE"),
-                 bigquery.SchemaField("_bizon_extracted_at", "TIMESTAMP", mode="REQUIRED"),
                  bigquery.SchemaField(
-                     "_bizon_loaded_at", "TIMESTAMP", mode="REQUIRED", default_value_expression="CURRENT_TIMESTAMP()"
-                 ),
-                 bigquery.SchemaField("_bizon_id", "STRING", mode="REQUIRED"),
+                     col.name,
+                     col.type,
+                     mode=col.mode,
+                     description=col.description,
+                 )
+                 for col in self.record_schemas[self.destination_id]
              ]

-         # If normalization is tabular, we parse key / value pairs to columns
-         elif self.config.normalization.type == NormalizationType.TABULAR:
-
-             # We use the first record to infer the schema of tabular data (key / value pairs)
-             source_data_keys = list(json.loads(df_destination_records["source_data"][0]).keys())
-
-             return [bigquery.SchemaField(key, "STRING", mode="NULLABLE") for key in source_data_keys] + [
+         # Case we don't unnest the data
+         else:
+             return [
                  bigquery.SchemaField("_source_record_id", "STRING", mode="REQUIRED"),
                  bigquery.SchemaField("_source_timestamp", "TIMESTAMP", mode="REQUIRED"),
+                 bigquery.SchemaField("_source_data", "STRING", mode="NULLABLE"),
                  bigquery.SchemaField("_bizon_extracted_at", "TIMESTAMP", mode="REQUIRED"),
                  bigquery.SchemaField(
                      "_bizon_loaded_at", "TIMESTAMP", mode="REQUIRED", default_value_expression="CURRENT_TIMESTAMP()"
@@ -90,8 +91,6 @@ class BigQueryDestination(AbstractDestination):
                  bigquery.SchemaField("_bizon_id", "STRING", mode="REQUIRED"),
              ]

-         raise NotImplementedError(f"Normalization type {self.config.normalization.type} is not supported")
-
      def check_connection(self) -> bool:
          dataset_ref = DatasetReference(self.project_id, self.dataset_id)

@@ -129,6 +128,28 @@ class BigQueryDestination(AbstractDestination):

          raise NotImplementedError(f"Buffer format {self.buffer_format} is not supported")

+     @staticmethod
+     def unnest_data(df_destination_records: pl.DataFrame, record_schema: list[BigQueryColumn]) -> pl.DataFrame:
+         """Unnest the source_data field into separate columns"""
+
+         # Check if the schema matches the expected schema
+         source_data_fields = (
+             pl.DataFrame(df_destination_records["source_data"].str.json_decode(infer_schema_length=None))
+             .schema["source_data"]
+             .fields
+         )
+
+         record_schema_fields = [col.name for col in record_schema]
+
+         for field in source_data_fields:
+             assert field.name in record_schema_fields, f"Column {field.name} not found in BigQuery schema"
+
+         # Parse the JSON and unnest the fields to polar type
+         return df_destination_records.select(
+             pl.col("source_data").str.json_path_match(f"$.{col.name}").cast(col.polars_type).alias(col.name)
+             for col in record_schema
+         )
+
      def load_to_bigquery(self, gcs_file: str, df_destination_records: pl.DataFrame):

          # We always partition by the loaded_at field
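
To illustrate what the new unnest_data helper does, a self-contained Polars sketch with toy data (column names are made up; in the destination the schema comes from record_schemas):

import polars as pl

# Toy buffer: raw JSON payloads stored as strings in the source_data column
df = pl.DataFrame(
    {"source_data": ['{"id": 1, "name": "Ada"}', '{"id": 2, "name": "Grace"}']}
)

# Same idea as unnest_data: one JSONPath extraction and cast per declared column
unnested = df.select(
    pl.col("source_data").str.json_path_match("$.id").cast(pl.Int64).alias("id"),
    pl.col("source_data").str.json_path_match("$.name").cast(pl.String).alias("name"),
)
print(unnested)  # two typed columns instead of a single JSON string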

bizon/connectors/destinations/bigquery_streaming/src/config.py ADDED
@@ -0,0 +1,56 @@
+ from enum import Enum
+ from typing import Literal, Optional
+
+ from pydantic import BaseModel, Field
+
+ from bizon.connectors.destinations.bigquery.src.config import BigQueryRecordSchemaConfig
+ from bizon.destination.config import (
+     AbstractDestinationConfig,
+     AbstractDestinationDetailsConfig,
+     DestinationTypes,
+ )
+
+
+ class TimePartitioningWindow(str, Enum):
+     DAY = "DAY"
+     HOUR = "HOUR"
+     MONTH = "MONTH"
+     YEAR = "YEAR"
+
+
+ class TimePartitioning(BaseModel):
+     type: TimePartitioningWindow = Field(default=TimePartitioningWindow.DAY, description="Time partitioning type")
+     field: Optional[str] = Field(
+         "_bizon_loaded_at", description="Field to partition by. You can use a transformation to create this field."
+     )
+
+
+ class BigQueryAuthentication(BaseModel):
+     service_account_key: str = Field(
+         description="Service Account Key JSON string. If empty it will be infered",
+         default="",
+     )
+
+
+ class BigQueryStreamingConfigDetails(AbstractDestinationDetailsConfig):
+     project_id: str
+     dataset_id: str
+     dataset_location: Optional[str] = "US"
+     time_partitioning: Optional[TimePartitioning] = Field(
+         default=TimePartitioning(type=TimePartitioningWindow.DAY, field="_bizon_loaded_at"),
+         description="BigQuery Time partitioning type",
+     )
+     authentication: Optional[BigQueryAuthentication] = None
+     bq_max_rows_per_request: Optional[int] = Field(30000, description="Max rows per buffer streaming request.")
+     record_schemas: Optional[list[BigQueryRecordSchemaConfig]] = Field(
+         default=None, description="Schema for the records. Required if unnest is set to true."
+     )
+     use_legacy_streaming_api: bool = Field(
+         default=False,
+         description="[DEPRECATED] Use the legacy streaming API. This is required for some older BigQuery versions.",
+     )
+
+
+ class BigQueryStreamingConfig(AbstractDestinationConfig):
+     name: Literal[DestinationTypes.BIGQUERY_STREAMING]
+     config: BigQueryStreamingConfigDetails