bizon 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. bizon-0.0.1/LICENSE +21 -0
  2. bizon-0.0.1/PKG-INFO +131 -0
  3. bizon-0.0.1/README.md +100 -0
  4. bizon-0.0.1/bizon/cli/parser.py +7 -0
  5. bizon-0.0.1/bizon/common/errors/backoff.py +14 -0
  6. bizon-0.0.1/bizon/common/errors/errors.py +18 -0
  7. bizon-0.0.1/bizon/common/models.py +40 -0
  8. bizon-0.0.1/bizon/destinations/bigquery/config/bigquery.example.yml +37 -0
  9. bizon-0.0.1/bizon/destinations/bigquery/src/config.py +22 -0
  10. bizon-0.0.1/bizon/destinations/bigquery/src/destination.py +114 -0
  11. bizon-0.0.1/bizon/destinations/bigquery/tests/test_bigquery_client.py +40 -0
  12. bizon-0.0.1/bizon/destinations/config.py +17 -0
  13. bizon-0.0.1/bizon/destinations/destination.py +49 -0
  14. bizon-0.0.1/bizon/destinations/logger/src/config.py +7 -0
  15. bizon-0.0.1/bizon/destinations/logger/src/destination.py +21 -0
  16. bizon-0.0.1/bizon/destinations/models.py +24 -0
  17. bizon-0.0.1/bizon/engine/backend/backend.py +253 -0
  18. bizon-0.0.1/bizon/engine/backend/config.py +16 -0
  19. bizon-0.0.1/bizon/engine/backend/models.py +62 -0
  20. bizon-0.0.1/bizon/engine/config.py +47 -0
  21. bizon-0.0.1/bizon/engine/producer.py +150 -0
  22. bizon-0.0.1/bizon/engine/queue/adapters/kafka/config.py +23 -0
  23. bizon-0.0.1/bizon/engine/queue/adapters/kafka/consumer.py +38 -0
  24. bizon-0.0.1/bizon/engine/queue/adapters/kafka/queue.py +55 -0
  25. bizon-0.0.1/bizon/engine/queue/adapters/python_queue/config.py +25 -0
  26. bizon-0.0.1/bizon/engine/queue/adapters/python_queue/consumer.py +27 -0
  27. bizon-0.0.1/bizon/engine/queue/adapters/python_queue/queue.py +52 -0
  28. bizon-0.0.1/bizon/engine/queue/config.py +29 -0
  29. bizon-0.0.1/bizon/engine/queue/config_details.py +8 -0
  30. bizon-0.0.1/bizon/engine/queue/queue.py +66 -0
  31. bizon-0.0.1/bizon/engine/runner.py +93 -0
  32. bizon-0.0.1/bizon/engine/runners/config.py +36 -0
  33. bizon-0.0.1/bizon/engine/runners/thread.py +42 -0
  34. bizon-0.0.1/bizon/source/auth/authenticators/abstract_oauth.py +149 -0
  35. bizon-0.0.1/bizon/source/auth/authenticators/abstract_token.py +29 -0
  36. bizon-0.0.1/bizon/source/auth/authenticators/basic.py +34 -0
  37. bizon-0.0.1/bizon/source/auth/authenticators/cookies.py +24 -0
  38. bizon-0.0.1/bizon/source/auth/authenticators/oauth.py +87 -0
  39. bizon-0.0.1/bizon/source/auth/authenticators/token.py +32 -0
  40. bizon-0.0.1/bizon/source/auth/builder.py +22 -0
  41. bizon-0.0.1/bizon/source/auth/config.py +27 -0
  42. bizon-0.0.1/bizon/source/config.py +31 -0
  43. bizon-0.0.1/bizon/source/cursor.py +120 -0
  44. bizon-0.0.1/bizon/source/models.py +21 -0
  45. bizon-0.0.1/bizon/source/session.py +63 -0
  46. bizon-0.0.1/bizon/source/source.py +51 -0
  47. bizon-0.0.1/bizon/sources/__init__.py +94 -0
  48. bizon-0.0.1/bizon/sources/dummy/config/api_key.example.yml +20 -0
  49. bizon-0.0.1/bizon/sources/dummy/config/api_key_kafka.example.yml +27 -0
  50. bizon-0.0.1/bizon/sources/dummy/src/config.py +25 -0
  51. bizon-0.0.1/bizon/sources/dummy/src/fake_api.py +73 -0
  52. bizon-0.0.1/bizon/sources/dummy/src/source.py +87 -0
  53. bizon-0.0.1/bizon/sources/dummy/tests/dummy_pipeline.py +21 -0
  54. bizon-0.0.1/bizon/sources/dummy/tests/dummy_pipeline_bigquery_backend.py +28 -0
  55. bizon-0.0.1/bizon/sources/dummy/tests/dummy_pipeline_kafka.py +27 -0
  56. bizon-0.0.1/bizon/sources/dummy/tests/dummy_pipeline_write_data_bigquery.py +28 -0
  57. bizon-0.0.1/bizon/sources/dummy/tests/dummy_pipeline_write_data_bigquery_through_kafka.py +31 -0
  58. bizon-0.0.1/bizon/sources/hubspot/config/api_key.example.yml +23 -0
  59. bizon-0.0.1/bizon/sources/hubspot/config/oauth.example.yml +24 -0
  60. bizon-0.0.1/bizon/sources/hubspot/src/errors.py +43 -0
  61. bizon-0.0.1/bizon/sources/hubspot/src/models/config.py +28 -0
  62. bizon-0.0.1/bizon/sources/hubspot/src/models/hs_object.py +54 -0
  63. bizon-0.0.1/bizon/sources/hubspot/src/source.py +238 -0
  64. bizon-0.0.1/bizon/sources/hubspot/tests/hubspot_benchmark.py +26 -0
  65. bizon-0.0.1/bizon/sources/hubspot/tests/hubspot_client.py +25 -0
  66. bizon-0.0.1/bizon/sources/hubspot/tests/hubspot_iteration.py +30 -0
  67. bizon-0.0.1/bizon/sources/hubspot/tests/hubspot_pipeline.py +9 -0
  68. bizon-0.0.1/bizon/sources/periscope/config/periscope_charts.example.yml +26 -0
  69. bizon-0.0.1/bizon/sources/periscope/config/periscope_dashboards.example.yml +26 -0
  70. bizon-0.0.1/bizon/sources/periscope/src/config.py +39 -0
  71. bizon-0.0.1/bizon/sources/periscope/src/source.py +180 -0
  72. bizon-0.0.1/bizon/sources/periscope/tests/periscope_pipeline_charts.py +9 -0
  73. bizon-0.0.1/bizon/sources/periscope/tests/periscope_pipeline_dashboard.py +9 -0
  74. bizon-0.0.1/pyproject.toml +39 -0
bizon-0.0.1/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 data-collective
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
bizon-0.0.1/PKG-INFO ADDED
@@ -0,0 +1,131 @@
+ Metadata-Version: 2.1
+ Name: bizon
+ Version: 0.0.1
+ Summary: Extract and load your data reliably from API clients with a native fault-tolerant checkpointing mechanism.
+ Author: Antoine Balliet
+ Author-email: antoine.balliet@gmail.com
+ Requires-Python: >=3.9,<3.13
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Requires-Dist: backoff (>=2.2.1,<3.0.0)
+ Requires-Dist: dpath (>=2.2.0,<3.0.0)
+ Requires-Dist: faker (>=26.0.0,<27.0.0)
+ Requires-Dist: google-cloud-bigquery (>=3.25.0,<4.0.0)
+ Requires-Dist: google-cloud-storage (>=2.17.0,<3.0.0)
+ Requires-Dist: kafka-python (>=2.0.2,<3.0.0)
+ Requires-Dist: loguru (>=0.7.2,<0.8.0)
+ Requires-Dist: pandas (>=2.2.2,<3.0.0)
+ Requires-Dist: pendulum (>=3.0.0,<4.0.0)
+ Requires-Dist: pyarrow (>=16.1.0,<17.0.0)
+ Requires-Dist: pydantic (>=2.8.2,<3.0.0)
+ Requires-Dist: pydantic-extra-types (>=2.9.0,<3.0.0)
+ Requires-Dist: pyyaml (>=6.0.1,<7.0.0)
+ Requires-Dist: requests (>=2.28.2,<3.0.0)
+ Requires-Dist: sqlalchemy (>=2.0.32,<3.0.0)
+ Requires-Dist: sqlalchemy-bigquery (>=1.11.0,<2.0.0)
+ Description-Content-Type: text/markdown
+
+ # bizon ⚡️
+ Extract and load your largest data streams with a framework you can trust for billions of records.
+
+ ## Features
+ - **Natively fault-tolerant**: Bizon uses a checkpointing mechanism to keep track of progress and recover from the last checkpoint.
+ - **Queue system agnostic**: Bizon is agnostic of the queuing system: you can use any queuing system such as Python Queue, Kafka, or Redpanda. Thanks to the `bizon.queue.Queue` interface, adapters can be written for any queuing system.
+ - **Pipeline metrics**: Bizon provides exhaustive pipeline metrics and implements OpenTelemetry for tracing. You can monitor:
+   - ETA for completion
+   - Number of records processed
+   - Completion percentage
+   - Source-to-destination latency
+ - **Lightweight & lean**: Bizon has a minimal codebase and only a few dependencies:
+   - `requests` for HTTP requests
+   - `pyyaml` for configuration
+   - `sqlalchemy` for database / warehouse connections
+   - `pyarrow` for the Parquet file format
+
+ ## Installation
+ ```bash
+ pip install bizon
+ ```
+
+ ## Usage
+ ```python
+ from yaml import safe_load
+ from bizon.engine.runner import RunnerFactory
+
+ yaml_config = """
+ source:
+   source_name: dummy
+   stream_name: creatures
+   authentication:
+     type: api_key
+     params:
+       token: dummy_key
+
+ destination:
+   name: logger
+   config:
+     dummy: dummy
+ """
+
+ config = safe_load(yaml_config)
+ runner = RunnerFactory.create_from_config_dict(config=config)
+ runner.run()
+ ```
+
+ ## Backend configuration
+
+ The backend is the interface Bizon uses to store its state. It is configured in the `backend` section of the configuration file. The following backends are supported:
+ - `sqlite`: In-memory SQLite database, useful for testing and development.
+ - `bigquery`: Google BigQuery backend, suited to light setups & production.
+ - `postgres`: PostgreSQL backend, for production use and frequent cursor updates.
+
+ ## Queue configuration
+
+ The queue is the interface Bizon uses to exchange data between `Source` and `Destination`. It is configured in the `queue` section of the configuration file. The following queues are supported:
+ - `python_queue`: Python Queue, useful for testing and development.
+ - `kafka`: Apache Kafka, for production use and high throughput.
+
+ ## Start syncing your data 🚀
+
+ ### Quick setup without any dependencies ✌️
+
+ Set the queue configuration to `python_queue` and the backend configuration to `sqlite`.
+ This allows you to test the pipeline without any external dependencies.
+
+
+ ### Local Kafka setup
+
+ To test the pipeline with Kafka, you can use `docker compose` to set up Kafka or Redpanda locally.
+
+ **Kafka**
+ ```bash
+ docker compose --file ./scripts/kafka-compose.yml up
+ ```
+
+ In your YAML configuration, set the `queue` configuration to Kafka under `engine`:
+ ```yaml
+ engine:
+   queue:
+     type: kafka
+     config:
+       bootstrap_servers: localhost:9092
+ ```
+
+ **Redpanda**
+ ```bash
+ docker compose --file ./scripts/redpanda-compose.yml up
+ ```
+
+ In your YAML configuration, set the `queue` type to `kafka` under `engine` (Redpanda is Kafka API-compatible):
+
+ ```yaml
+ engine:
+   queue:
+     type: kafka
+     config:
+       bootstrap_servers: localhost:19092
+ ```
+
bizon-0.0.1/README.md ADDED
@@ -0,0 +1,100 @@
+ # bizon ⚡️
+ Extract and load your largest data streams with a framework you can trust for billions of records.
+
+ ## Features
+ - **Natively fault-tolerant**: Bizon uses a checkpointing mechanism to keep track of progress and recover from the last checkpoint.
+ - **Queue system agnostic**: Bizon is agnostic of the queuing system: you can use any queuing system such as Python Queue, Kafka, or Redpanda. Thanks to the `bizon.queue.Queue` interface, adapters can be written for any queuing system.
+ - **Pipeline metrics**: Bizon provides exhaustive pipeline metrics and implements OpenTelemetry for tracing. You can monitor:
+   - ETA for completion
+   - Number of records processed
+   - Completion percentage
+   - Source-to-destination latency
+ - **Lightweight & lean**: Bizon has a minimal codebase and only a few dependencies:
+   - `requests` for HTTP requests
+   - `pyyaml` for configuration
+   - `sqlalchemy` for database / warehouse connections
+   - `pyarrow` for the Parquet file format
+
+ ## Installation
+ ```bash
+ pip install bizon
+ ```
+
+ ## Usage
+ ```python
+ from yaml import safe_load
+ from bizon.engine.runner import RunnerFactory
+
+ yaml_config = """
+ source:
+   source_name: dummy
+   stream_name: creatures
+   authentication:
+     type: api_key
+     params:
+       token: dummy_key
+
+ destination:
+   name: logger
+   config:
+     dummy: dummy
+ """
+
+ config = safe_load(yaml_config)
+ runner = RunnerFactory.create_from_config_dict(config=config)
+ runner.run()
+ ```
+
+ ## Backend configuration
+
+ The backend is the interface Bizon uses to store its state. It is configured in the `backend` section of the configuration file. The following backends are supported:
+ - `sqlite`: In-memory SQLite database, useful for testing and development.
+ - `bigquery`: Google BigQuery backend, suited to light setups & production.
+ - `postgres`: PostgreSQL backend, for production use and frequent cursor updates.
+
+ ## Queue configuration
+
+ The queue is the interface Bizon uses to exchange data between `Source` and `Destination`. It is configured in the `queue` section of the configuration file. The following queues are supported:
+ - `python_queue`: Python Queue, useful for testing and development.
+ - `kafka`: Apache Kafka, for production use and high throughput.
+
+ ## Start syncing your data 🚀
+
+ ### Quick setup without any dependencies ✌️
+
+ Set the queue configuration to `python_queue` and the backend configuration to `sqlite`.
+ This allows you to test the pipeline without any external dependencies.
+
+
+ ### Local Kafka setup
+
+ To test the pipeline with Kafka, you can use `docker compose` to set up Kafka or Redpanda locally.
+
+ **Kafka**
+ ```bash
+ docker compose --file ./scripts/kafka-compose.yml up
+ ```
+
+ In your YAML configuration, set the `queue` configuration to Kafka under `engine`:
+ ```yaml
+ engine:
+   queue:
+     type: kafka
+     config:
+       bootstrap_servers: localhost:9092
+ ```
+
+ **Redpanda**
+ ```bash
+ docker compose --file ./scripts/redpanda-compose.yml up
+ ```
+
+ In your YAML configuration, set the `queue` type to `kafka` under `engine` (Redpanda is Kafka API-compatible):
+
+ ```yaml
+ engine:
+   queue:
+     type: kafka
+     config:
+       bootstrap_servers: localhost:19092
+ ```
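For the dependency-free quick setup the README describes, a minimal `engine` section might look like the sketch below. The nesting mirrors the Kafka example above, but the exact `backend`/`queue` key names are assumptions not confirmed by this diff:

```yaml
# Hypothetical engine section combining the two dependency-free options
engine:
  backend:
    type: sqlite
  queue:
    type: python_queue
```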
bizon-0.0.1/bizon/cli/parser.py ADDED
@@ -0,0 +1,7 @@
+ import yaml
+
+
+ def parse_from_yaml(path_to_yaml) -> dict:
+     with open(path_to_yaml) as f:
+         config = yaml.safe_load(f)
+     return config
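A minimal usage sketch for `parse_from_yaml`; the path is hypothetical:

```python
from bizon.cli.parser import parse_from_yaml

# Any pipeline YAML, e.g. one adapted from the example configs in this package
config = parse_from_yaml("pipeline.yml")
print(config["source"])  # the parsed configuration is a plain dict
```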
bizon-0.0.1/bizon/common/errors/backoff.py ADDED
@@ -0,0 +1,14 @@
+ import requests
+
+
+ class BaseBackoffException(requests.exceptions.HTTPError):
+     def __init__(self, request: requests.PreparedRequest, response: requests.Response, error_message: str = ""):
+         error_message = (
+             error_message
+             or f"Request URL: {request.url}, Response Code: {response.status_code}, Response Text: {response.text}"
+         )
+         super().__init__(error_message, request=request, response=response)
+
+
+ class DefaultBackoffException(BaseBackoffException):
+     pass
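Given the `backoff (>=2.2.1)` dependency declared in PKG-INFO, these exceptions are presumably raised to trigger retries. A hedged sketch of wiring `DefaultBackoffException` into a `backoff` retry decorator (`fetch_page` is a hypothetical function, not part of the package):

```python
import backoff
import requests

from bizon.common.errors.backoff import DefaultBackoffException


# Retry with exponential backoff whenever a DefaultBackoffException is raised
@backoff.on_exception(backoff.expo, DefaultBackoffException, max_tries=5)
def fetch_page(session: requests.Session, url: str) -> dict:
    response = session.get(url)
    if response.status_code >= 500:
        # Wrap transient server errors so the decorator retries them
        raise DefaultBackoffException(request=response.request, response=response)
    response.raise_for_status()
    return response.json()
```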
bizon-0.0.1/bizon/common/errors/errors.py ADDED
@@ -0,0 +1,18 @@
+ from enum import Enum
+ from typing import Optional
+
+ from pydantic import BaseModel, ConfigDict, Field
+
+
+ class FailureType(str, Enum):
+     SYSTEM_ERROR = "SYSTEM_ERROR"
+     CONFIG_ERROR = "CONFIG_ERROR"
+
+
+ class ErrorTraceMessage(BaseModel):
+     model_config = ConfigDict(extra="forbid")
+
+     message: str = Field(..., description="A user-friendly message that indicates the cause of the error")
+     internal_message: Optional[str] = Field(None, description="The internal error that caused the failure")
+     stack_trace: Optional[str] = Field(None, description="The full stack trace of the error")
+     failure_type: Optional[FailureType] = Field(None, description="The type of error")
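A short sketch of populating `ErrorTraceMessage` from a caught exception; the field names come directly from the model above:

```python
import traceback

from bizon.common.errors.errors import ErrorTraceMessage, FailureType

try:
    raise RuntimeError("connection dropped")
except RuntimeError as e:
    trace = ErrorTraceMessage(
        message="The sync failed while reading from the source",
        internal_message=str(e),
        stack_trace=traceback.format_exc(),
        failure_type=FailureType.SYSTEM_ERROR,
    )
    print(trace.model_dump_json())
```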
bizon-0.0.1/bizon/common/models.py ADDED
@@ -0,0 +1,40 @@
+ import json
+ from datetime import datetime
+ from enum import Enum
+ from typing import Optional
+ from uuid import uuid4
+
+ from pydantic import BaseModel, ConfigDict, Field
+ from pytz import UTC
+
+ from bizon.destinations.config import DestinationConfig
+ from bizon.engine.config import EngineConfig
+ from bizon.source.config import SourceConfig
+
+
+ class JobStatus(str, Enum):
+     NOT_STARTED = "not_started"
+     RUNNING = "running"
+     SUCCESS = "success"
+     FAILED = "failed"
+
+
+ class CursorStatus(str, Enum):
+     NOT_STARTED = "not_started"
+     SUCCESS = "success"
+     FAILED = "failed"
+
+
+ class BizonConfig(BaseModel):
+
+     # Forbid extra keys in the model
+     model_config = ConfigDict(extra="forbid")
+
+     source: SourceConfig
+
+     destination: DestinationConfig
+
+     engine: Optional[EngineConfig] = Field(
+         description="Engine configuration",
+         default=EngineConfig(),
+     )
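Because `BizonConfig` sets `extra="forbid"`, validating a parsed YAML dict fails fast on misspelled keys, and `engine` falls back to a default `EngineConfig` when omitted. A sketch, assuming the dict follows the `source`/`destination` schema from the README's usage example:

```python
from bizon.cli.parser import parse_from_yaml
from bizon.common.models import BizonConfig

raw = parse_from_yaml("pipeline.yml")  # hypothetical pipeline file
config = BizonConfig.model_validate(raw)  # raises pydantic.ValidationError on unknown keys
print(config.engine)  # default EngineConfig() when the YAML has no engine section
```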
bizon-0.0.1/bizon/destinations/bigquery/config/bigquery.example.yml ADDED
@@ -0,0 +1,37 @@
+ source:
+   name: hubspot
+   stream_name: contacts
+   properties:
+     strategy: all
+   authentication:
+     type: api_key
+     api_key: <MY_API_KEY>
+
+ destination:
+   # Authentication: if empty, it will be inferred.
+   # Must have the bigquery.jobUser role.
+   # Must have bigquery.dataEditor and storage.objectUser on the supplied dataset and bucket.
+   name: bigquery
+   config:
+     dataset_id: bizon_test
+     dataset_location: US
+     project_id: my-gcp-project-id
+     gcs_buffer_bucket: bizon-buffer
+     gcs_buffer_format: parquet
+     service_account_key: >-
+       {
+         "type": "service_account",
+         "project_id": "my-gcp-project-id",
+         "private_key_id": "",
+         "private_key": "-----BEGIN PRIVATE KEY-----\n\n-----END PRIVATE KEY-----\n",
+         "client_email": "a-bizon-service-account@my-gcp-project-id.iam.gserviceaccount.com",
+         "client_id": "",
+         "auth_uri": "https://accounts.google.com/o/oauth2/auth",
+         "token_uri": "https://oauth2.googleapis.com/token",
+         "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
+         "client_x509_cert_url": "",
+         "universe_domain": "googleapis.com"
+       }
+
+ pipeline:
+   log_level: DEBUG
bizon-0.0.1/bizon/destinations/bigquery/src/config.py ADDED
@@ -0,0 +1,22 @@
+ from enum import Enum
+ from typing import Optional
+
+ from pydantic import BaseModel, Field
+
+
+ class GCSBufferFormat(str, Enum):
+     PARQUET = "parquet"
+     CSV = "csv"
+
+
+ class BigQueryConfig(BaseModel):
+     dataset_id: str
+     dataset_location: Optional[str] = "US"
+     project_id: str
+     gcs_buffer_bucket: str
+     gcs_buffer_format: Optional[GCSBufferFormat] = GCSBufferFormat.PARQUET
+
+     service_account_key: Optional[str] = Field(
+         description="Service Account Key JSON string. If empty, it will be inferred",
+         default="",
+     )
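A sketch validating the `destination.config` block of the example YAML above into a `BigQueryConfig`; the values are placeholders:

```python
from bizon.destinations.bigquery.src.config import BigQueryConfig, GCSBufferFormat

config = BigQueryConfig.model_validate(
    {
        "dataset_id": "bizon_test",
        "project_id": "my-gcp-project-id",
        "gcs_buffer_bucket": "bizon-buffer",
        "gcs_buffer_format": "parquet",  # coerced to GCSBufferFormat.PARQUET
    }
)
assert config.dataset_location == "US"  # optional fields keep their defaults
assert config.gcs_buffer_format is GCSBufferFormat.PARQUET
```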
bizon-0.0.1/bizon/destinations/bigquery/src/destination.py ADDED
@@ -0,0 +1,114 @@
+ import io
+ import os
+ import tempfile
+ from typing import List
+ from uuid import uuid4
+
+ import pandas as pd
+ import pyarrow as pa
+ import pyarrow.parquet as pq
+ from google.api_core.exceptions import NotFound
+ from google.cloud import bigquery, storage
+ from google.cloud.bigquery import DatasetReference
+ from loguru import logger
+
+ from bizon.destinations.destination import AbstractDestination
+ from bizon.destinations.models import DestinationRecord
+ from bizon.source.models import SourceRecord
+
+ from .config import BigQueryConfig
+
+
+ class BigQueryDestination(AbstractDestination):
+     def __init__(self, source_name: str, stream_name: str, config: BigQueryConfig):
+         super().__init__(source_name, stream_name, config)
+
+         if bool(config.service_account_key):
+             with tempfile.NamedTemporaryFile(delete=False) as temp:
+                 temp.write(config.service_account_key.encode())
+                 temp_file_path = temp.name
+             os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = temp_file_path
+
+         self.project_id = config.project_id
+         self.bq_client = bigquery.Client(project=self.project_id)
+         self.gcs_client = storage.Client(project=self.project_id)
+         self.buffer_bucket_name = config.gcs_buffer_bucket
+         self.buffer_bucket = self.gcs_client.bucket(config.gcs_buffer_bucket)
+         self.buffer_format = config.gcs_buffer_format
+         self.dataset_id = config.dataset_id
+         self.dataset_location = config.dataset_location
+
+     def convert_and_upload_to_buffer(self, source_records: List[SourceRecord]):
+
+         bizon_records = [
+             DestinationRecord.from_source_record(source_record).model_dump() for source_record in source_records
+         ]
+
+         logger.info(bizon_records[0])
+
+         df = pd.DataFrame(bizon_records)
+
+         # Convert DataFrame to Parquet in-memory
+         if self.buffer_format == "parquet":
+             table = pa.Table.from_pandas(df)
+             buffer = io.BytesIO()
+             pq.write_table(table, buffer)
+             buffer.seek(0)
+
+         # Upload the Parquet file to GCS
+         file_name = f"{self.source_name}/{self.stream_name}/{str(uuid4())}.parquet"
+         blob = self.buffer_bucket.blob(file_name)
+         blob.upload_from_file(buffer, content_type="application/octet-stream")
+         return file_name
+
+     def check_connection(self) -> bool:
+         dataset_ref = DatasetReference(self.project_id, self.dataset_id)
+
+         try:
+             self.bq_client.get_dataset(dataset_ref)
+         except NotFound:
+             dataset = bigquery.Dataset(dataset_ref)
+             dataset.location = self.dataset_location
+             dataset = self.bq_client.create_dataset(dataset)
+         return True
+
+     def cleanup(self, gcs_file: str):
+         blob = self.buffer_bucket.blob(gcs_file)
+         blob.delete()
+
+     # TODO: Add backoff to common exceptions => looks like most are handled by the client
+     # https://cloud.google.com/python/docs/reference/storage/latest/retry_timeout
+     # https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.dbapi.DataError
+     def load_to_bigquery(self, gcs_file: str):
+         job_config = bigquery.LoadJobConfig(
+             source_format=bigquery.SourceFormat.PARQUET,
+             write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
+             schema=[
+                 bigquery.SchemaField("bizon_id", "STRING", mode="REQUIRED"),
+                 bigquery.SchemaField("bizon_loaded_at", "STRING", mode="REQUIRED"),
+                 bigquery.SchemaField("source_record_id", "STRING", mode="REQUIRED"),
+                 bigquery.SchemaField("source_data", "STRING", mode="NULLABLE"),
+             ],
+         )
+
+         table_id = f"{self.project_id}.{self.dataset_id}.{self.source_name}_{self.stream_name}"
+
+         load_job = self.bq_client.load_table_from_uri(
+             f"gs://{self.buffer_bucket_name}/{gcs_file}", table_id, job_config=job_config
+         )
+         load_job.result()
+
+     def write_records(self, source_records: List[SourceRecord]):
+
+         # Here we can check if these IDs are already present in BigQuery
+         # Using SourceRecord.id values
+
+         gs_file_name = self.convert_and_upload_to_buffer(source_records=source_records)
+
+         try:
+             self.load_to_bigquery(gs_file_name)
+             self.cleanup(gs_file_name)
+         except Exception as e:
+             self.cleanup(gs_file_name)
+             raise e
+         return True
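A hedged end-to-end sketch of the write path above: records are buffered to GCS as Parquet, loaded into BigQuery, and the buffer file is deleted even on failure. The `SourceRecord(id=..., data=...)` constructor is an assumption based on how `DestinationRecord.from_source_record` reads those fields later in this diff, and running this requires real GCP credentials:

```python
from bizon.destinations.bigquery.src.config import BigQueryConfig
from bizon.destinations.bigquery.src.destination import BigQueryDestination
from bizon.source.models import SourceRecord  # assumed fields: id, data

config = BigQueryConfig(
    dataset_id="bizon_test",
    project_id="my-gcp-project-id",
    gcs_buffer_bucket="bizon-buffer",
)
destination = BigQueryDestination(source_name="dummy", stream_name="creatures", config=config)

assert destination.check_connection()  # creates the dataset if it does not exist

records = [SourceRecord(id="42", data={"name": "gorilla"})]
destination.write_records(source_records=records)  # GCS buffer -> BigQuery load -> cleanup
```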
bizon-0.0.1/bizon/destinations/bigquery/tests/test_bigquery_client.py ADDED
@@ -0,0 +1,40 @@
+ import logging
+ import os
+ import random
+ from random import randint
+
+ from faker import Faker
+
+ from bizon.cli.parser import parse_from_yaml
+ from bizon.destinations.bigquery.src.config import BigQueryConfig
+ from bizon.destinations.bigquery.src.destination import BigQueryDestination
+ from bizon.source.config import SourceConfig
+
+ logger = logging.getLogger(__name__)
+
+ fake = Faker("en_US")
+
+ raw_config = parse_from_yaml(os.path.abspath("bizon/destinations/bigquery/config/bigquery.test.yml"))
+
+
+ def test_config_models():
+     assert bool(SourceConfig.model_validate(raw_config["source"])) is True
+     logger.info("source validated...")
+
+     assert bool(BigQueryConfig.model_validate(raw_config["destination"])) is True
+     logger.info("destination validated...")
+
+
+ # def test_load_records_to_bigquery():
+ #     bigquery_config = BigQueryConfig.model_validate(raw_config["destination"])
+ #     source_config = SourceConfig.model_validate(raw_config["source"])
+
+ #     fake_records = [
+ #         {"foo": randint(0, 100), "bar": {"baz": fake.name(), "poo": float(random.randrange(155, 389)) / 100}}
+ #         for _ in range(100)
+ #     ]
+ #     client = BigQueryClient(config=bigquery_config.config, source_config=source_config)
+
+ #     success = client.load_records_to_bigquery(json_records=fake_records)
+
+ #     assert success is True
bizon-0.0.1/bizon/destinations/config.py ADDED
@@ -0,0 +1,17 @@
+ from enum import Enum
+ from typing import Union
+
+ from pydantic import BaseModel, Field
+
+ from .bigquery.src.config import BigQueryConfig
+ from .logger.src.config import LoggerDestinationConfig
+
+
+ class DestinationTypes(str, Enum):
+     BIGQUERY = "bigquery"
+     LOGGER = "logger"
+
+
+ class DestinationConfig(BaseModel):
+     name: DestinationTypes = Field(..., description="Name of the destination")
+     config: Union[BigQueryConfig, LoggerDestinationConfig] = Field(..., description="Configuration for the destination")
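Note that `config` is an untagged `Union`, so pydantic infers the member model from the payload's shape rather than from `name`. A small sketch:

```python
from bizon.destinations.config import DestinationConfig

dest = DestinationConfig.model_validate({"name": "logger", "config": {"dummy": "bizon"}})
# BigQueryConfig fails here (missing required fields), so the logger model is chosen
print(type(dest.config).__name__)  # LoggerDestinationConfig
```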
bizon-0.0.1/bizon/destinations/destination.py ADDED
@@ -0,0 +1,49 @@
+ from abc import ABC, abstractmethod
+ from typing import List
+
+ from bizon.source.models import SourceRecord
+
+ from .config import DestinationConfig, DestinationTypes
+
+
+ class AbstractDestination(ABC):
+
+     def __init__(self, source_name: str, stream_name: str, config: DestinationConfig):
+         self.source_name = source_name
+         self.stream_name = stream_name
+         self.config = config
+
+     @property
+     def name(self) -> str:
+         return self.config.name
+
+     @abstractmethod
+     def check_connection(self) -> bool:
+         pass
+
+     @abstractmethod
+     def write_records(self, source_records: List[SourceRecord]):
+         pass
+
+
+ class DestinationFactory:
+     @staticmethod
+     def get_destination(source_name: str, stream_name: str, destination_config_dict: dict) -> AbstractDestination:
+
+         if destination_config_dict.get("name") == DestinationTypes.LOGGER:
+             from .logger.src.config import LoggerDestinationConfig
+             from .logger.src.destination import LoggerDestination
+
+             config = LoggerDestinationConfig.model_validate(obj=destination_config_dict.get("config"))
+             return LoggerDestination(source_name=source_name, stream_name=stream_name, config=config)
+
+         if destination_config_dict.get("name") == DestinationTypes.BIGQUERY:
+             from .bigquery.src.config import BigQueryConfig
+             from .bigquery.src.destination import BigQueryDestination
+
+             config = BigQueryConfig.model_validate(obj=destination_config_dict.get("config"))
+             return BigQueryDestination(source_name=source_name, stream_name=stream_name, config=config)
+
+         raise ValueError(
+             f"Destination {destination_config_dict.get('name')} " f"with params {destination_config_dict} not found"
+         )
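A sketch of the factory with the logger destination, which needs no external services; the dict mirrors the `destination` block of the usage example, and `SourceRecord`'s fields are assumed:

```python
from bizon.destinations.destination import DestinationFactory
from bizon.source.models import SourceRecord  # assumed fields: id, data

destination = DestinationFactory.get_destination(
    source_name="dummy",
    stream_name="creatures",
    destination_config_dict={"name": "logger", "config": {"dummy": "dummy"}},
)
assert destination.check_connection()
destination.write_records([SourceRecord(id="1", data={"name": "dragon"})])  # logs each record
```

One caveat worth noting: the factory passes the destination-specific config (e.g. `LoggerDestinationConfig`) into `AbstractDestination.__init__`, whose parameter is annotated as `DestinationConfig`, so the `name` property would raise `AttributeError` for inner configs that lack a `name` field.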
bizon-0.0.1/bizon/destinations/logger/src/config.py ADDED
@@ -0,0 +1,7 @@
+ from typing import Optional
+
+ from pydantic import BaseModel
+
+
+ class LoggerDestinationConfig(BaseModel):
+     dummy: Optional[str] = "bizon"
bizon-0.0.1/bizon/destinations/logger/src/destination.py ADDED
@@ -0,0 +1,21 @@
+ from typing import List
+
+ from loguru import logger
+
+ from bizon.destinations.destination import AbstractDestination
+ from bizon.source.models import SourceRecord
+
+ from .config import LoggerDestinationConfig
+
+
+ class LoggerDestination(AbstractDestination):
+
+     def __init__(self, source_name: str, stream_name: str, config: LoggerDestinationConfig):
+         super().__init__(source_name, stream_name, config)
+
+     def check_connection(self) -> bool:
+         return True
+
+     def write_records(self, source_records: List[SourceRecord]):
+         for record in source_records:
+             logger.info(record.data)
bizon-0.0.1/bizon/destinations/models.py ADDED
@@ -0,0 +1,24 @@
+ import json
+ from datetime import datetime
+ from uuid import uuid4
+
+ from pydantic import BaseModel, Field
+ from pytz import UTC
+
+ from bizon.source.models import SourceRecord
+
+
+ class DestinationRecord(BaseModel):
+     bizon_id: str = Field(..., description="Bizon unique identifier of the record")
+     bizon_loaded_at: str = Field(..., description="Datetime when the record was loaded")
+     source_record_id: str = Field(..., description="Source record id")
+     source_data: str = Field(..., description="Source record JSON string data")
+
+     @classmethod
+     def from_source_record(cls, source_record: SourceRecord) -> "DestinationRecord":
+         return cls(
+             bizon_id=uuid4().hex,
+             bizon_loaded_at=datetime.now(tz=UTC).isoformat(),
+             source_record_id=source_record.id,
+             source_data=json.dumps(source_record.data),
+         )
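Finally, a short sketch of the envelope `from_source_record` builds around each record; the `SourceRecord` arguments are assumptions consistent with the `id`/`data` accesses above:

```python
from bizon.destinations.models import DestinationRecord
from bizon.source.models import SourceRecord  # assumed fields: id, data

source_record = SourceRecord(id="42", data={"name": "gorilla", "legs": 4})
record = DestinationRecord.from_source_record(source_record)

print(record.bizon_id)         # random 32-character hex identifier
print(record.bizon_loaded_at)  # UTC ISO-8601 timestamp
print(record.source_data)      # '{"name": "gorilla", "legs": 4}' as a JSON string
```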