bizon 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bizon-0.0.1/LICENSE +21 -0
- bizon-0.0.1/PKG-INFO +131 -0
- bizon-0.0.1/README.md +100 -0
- bizon-0.0.1/bizon/cli/parser.py +7 -0
- bizon-0.0.1/bizon/common/errors/backoff.py +14 -0
- bizon-0.0.1/bizon/common/errors/errors.py +18 -0
- bizon-0.0.1/bizon/common/models.py +40 -0
- bizon-0.0.1/bizon/destinations/bigquery/config/bigquery.example.yml +37 -0
- bizon-0.0.1/bizon/destinations/bigquery/src/config.py +22 -0
- bizon-0.0.1/bizon/destinations/bigquery/src/destination.py +114 -0
- bizon-0.0.1/bizon/destinations/bigquery/tests/test_bigquery_client.py +40 -0
- bizon-0.0.1/bizon/destinations/config.py +17 -0
- bizon-0.0.1/bizon/destinations/destination.py +49 -0
- bizon-0.0.1/bizon/destinations/logger/src/config.py +7 -0
- bizon-0.0.1/bizon/destinations/logger/src/destination.py +21 -0
- bizon-0.0.1/bizon/destinations/models.py +24 -0
- bizon-0.0.1/bizon/engine/backend/backend.py +253 -0
- bizon-0.0.1/bizon/engine/backend/config.py +16 -0
- bizon-0.0.1/bizon/engine/backend/models.py +62 -0
- bizon-0.0.1/bizon/engine/config.py +47 -0
- bizon-0.0.1/bizon/engine/producer.py +150 -0
- bizon-0.0.1/bizon/engine/queue/adapters/kafka/config.py +23 -0
- bizon-0.0.1/bizon/engine/queue/adapters/kafka/consumer.py +38 -0
- bizon-0.0.1/bizon/engine/queue/adapters/kafka/queue.py +55 -0
- bizon-0.0.1/bizon/engine/queue/adapters/python_queue/config.py +25 -0
- bizon-0.0.1/bizon/engine/queue/adapters/python_queue/consumer.py +27 -0
- bizon-0.0.1/bizon/engine/queue/adapters/python_queue/queue.py +52 -0
- bizon-0.0.1/bizon/engine/queue/config.py +29 -0
- bizon-0.0.1/bizon/engine/queue/config_details.py +8 -0
- bizon-0.0.1/bizon/engine/queue/queue.py +66 -0
- bizon-0.0.1/bizon/engine/runner.py +93 -0
- bizon-0.0.1/bizon/engine/runners/config.py +36 -0
- bizon-0.0.1/bizon/engine/runners/thread.py +42 -0
- bizon-0.0.1/bizon/source/auth/authenticators/abstract_oauth.py +149 -0
- bizon-0.0.1/bizon/source/auth/authenticators/abstract_token.py +29 -0
- bizon-0.0.1/bizon/source/auth/authenticators/basic.py +34 -0
- bizon-0.0.1/bizon/source/auth/authenticators/cookies.py +24 -0
- bizon-0.0.1/bizon/source/auth/authenticators/oauth.py +87 -0
- bizon-0.0.1/bizon/source/auth/authenticators/token.py +32 -0
- bizon-0.0.1/bizon/source/auth/builder.py +22 -0
- bizon-0.0.1/bizon/source/auth/config.py +27 -0
- bizon-0.0.1/bizon/source/config.py +31 -0
- bizon-0.0.1/bizon/source/cursor.py +120 -0
- bizon-0.0.1/bizon/source/models.py +21 -0
- bizon-0.0.1/bizon/source/session.py +63 -0
- bizon-0.0.1/bizon/source/source.py +51 -0
- bizon-0.0.1/bizon/sources/__init__.py +94 -0
- bizon-0.0.1/bizon/sources/dummy/config/api_key.example.yml +20 -0
- bizon-0.0.1/bizon/sources/dummy/config/api_key_kafka.example.yml +27 -0
- bizon-0.0.1/bizon/sources/dummy/src/config.py +25 -0
- bizon-0.0.1/bizon/sources/dummy/src/fake_api.py +73 -0
- bizon-0.0.1/bizon/sources/dummy/src/source.py +87 -0
- bizon-0.0.1/bizon/sources/dummy/tests/dummy_pipeline.py +21 -0
- bizon-0.0.1/bizon/sources/dummy/tests/dummy_pipeline_bigquery_backend.py +28 -0
- bizon-0.0.1/bizon/sources/dummy/tests/dummy_pipeline_kafka.py +27 -0
- bizon-0.0.1/bizon/sources/dummy/tests/dummy_pipeline_write_data_bigquery.py +28 -0
- bizon-0.0.1/bizon/sources/dummy/tests/dummy_pipeline_write_data_bigquery_through_kafka.py +31 -0
- bizon-0.0.1/bizon/sources/hubspot/config/api_key.example.yml +23 -0
- bizon-0.0.1/bizon/sources/hubspot/config/oauth.example.yml +24 -0
- bizon-0.0.1/bizon/sources/hubspot/src/errors.py +43 -0
- bizon-0.0.1/bizon/sources/hubspot/src/models/config.py +28 -0
- bizon-0.0.1/bizon/sources/hubspot/src/models/hs_object.py +54 -0
- bizon-0.0.1/bizon/sources/hubspot/src/source.py +238 -0
- bizon-0.0.1/bizon/sources/hubspot/tests/hubspot_benchmark.py +26 -0
- bizon-0.0.1/bizon/sources/hubspot/tests/hubspot_client.py +25 -0
- bizon-0.0.1/bizon/sources/hubspot/tests/hubspot_iteration.py +30 -0
- bizon-0.0.1/bizon/sources/hubspot/tests/hubspot_pipeline.py +9 -0
- bizon-0.0.1/bizon/sources/periscope/config/periscope_charts.example.yml +26 -0
- bizon-0.0.1/bizon/sources/periscope/config/periscope_dashboards.example.yml +26 -0
- bizon-0.0.1/bizon/sources/periscope/src/config.py +39 -0
- bizon-0.0.1/bizon/sources/periscope/src/source.py +180 -0
- bizon-0.0.1/bizon/sources/periscope/tests/periscope_pipeline_charts.py +9 -0
- bizon-0.0.1/bizon/sources/periscope/tests/periscope_pipeline_dashboard.py +9 -0
- bizon-0.0.1/pyproject.toml +39 -0
bizon-0.0.1/LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 data-collective
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
bizon-0.0.1/PKG-INFO
ADDED
@@ -0,0 +1,131 @@
+Metadata-Version: 2.1
+Name: bizon
+Version: 0.0.1
+Summary: Extract and load your data reliably from API clients, with native fault-tolerance and a checkpointing mechanism.
+Author: Antoine Balliet
+Author-email: antoine.balliet@gmail.com
+Requires-Python: >=3.9,<3.13
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Requires-Dist: backoff (>=2.2.1,<3.0.0)
+Requires-Dist: dpath (>=2.2.0,<3.0.0)
+Requires-Dist: faker (>=26.0.0,<27.0.0)
+Requires-Dist: google-cloud-bigquery (>=3.25.0,<4.0.0)
+Requires-Dist: google-cloud-storage (>=2.17.0,<3.0.0)
+Requires-Dist: kafka-python (>=2.0.2,<3.0.0)
+Requires-Dist: loguru (>=0.7.2,<0.8.0)
+Requires-Dist: pandas (>=2.2.2,<3.0.0)
+Requires-Dist: pendulum (>=3.0.0,<4.0.0)
+Requires-Dist: pyarrow (>=16.1.0,<17.0.0)
+Requires-Dist: pydantic (>=2.8.2,<3.0.0)
+Requires-Dist: pydantic-extra-types (>=2.9.0,<3.0.0)
+Requires-Dist: pyyaml (>=6.0.1,<7.0.0)
+Requires-Dist: requests (>=2.28.2,<3.0.0)
+Requires-Dist: sqlalchemy (>=2.0.32,<3.0.0)
+Requires-Dist: sqlalchemy-bigquery (>=1.11.0,<2.0.0)
+Description-Content-Type: text/markdown
+
+# bizon ⚡️
+Extract and load your largest data streams with a framework you can trust for billions of records.
+
+## Features
+- **Natively fault-tolerant**: Bizon uses a checkpointing mechanism to keep track of progress and recover from the last checkpoint.
+- **Queue system agnostic**: Bizon is agnostic of the queuing system; you can use any queuing system such as Python Queue, Kafka, or Redpanda. Thanks to the `bizon.queue.Queue` interface, adapters can be written for any queuing system.
+- **Pipeline metrics**: Bizon provides exhaustive pipeline metrics and implements OpenTelemetry for tracing. You can monitor:
+  - ETAs for completion
+  - Number of records processed
+  - Completion percentage
+  - Source-to-destination latency
+- **Lightweight & lean**: Bizon has a minimal codebase and only a few dependencies:
+  - `requests` for HTTP requests
+  - `pyyaml` for configuration
+  - `sqlalchemy` for database / warehouse connections
+  - `pyarrow` for the Parquet file format
+
+## Installation
+```bash
+pip install bizon
+```
+
+## Usage
+```python
+from yaml import safe_load
+from bizon.engine.runner import RunnerFactory
+
+yaml_config = """
+source:
+  source_name: dummy
+  stream_name: creatures
+  authentication:
+    type: api_key
+    params:
+      token: dummy_key
+
+destination:
+  name: logger
+  config:
+    dummy: dummy
+"""
+
+config = safe_load(yaml_config)
+runner = RunnerFactory.create_from_config_dict(config=config)
+runner.run()
+```
+
+## Backend configuration
+
+The backend is the interface Bizon uses to store its state. It is configured in the `backend` section of the configuration file. The following backends are supported:
+- `sqlite`: In-memory SQLite database, useful for testing and development.
+- `bigquery`: Google BigQuery backend, suited to light setups & production.
+- `postgres`: PostgreSQL backend, for production use and frequent cursor updates.
+
+## Queue configuration
+
+The queue is the interface Bizon uses to exchange data between `Source` and `Destination`. It is configured in the `queue` section of the configuration file. The following queues are supported:
+- `python_queue`: Python Queue, useful for testing and development.
+- `kafka`: Apache Kafka, for production use and high throughput.
+
+## Start syncing your data 🚀
+
+### Quick setup without any dependencies ✌️
+
+Set the queue configuration to `python_queue` and the backend configuration to `sqlite`.
+This lets you test the pipeline without any external dependencies.
+
+
+### Local Kafka setup
+
+To test the pipeline with Kafka, you can use `docker compose` to set up Kafka or Redpanda locally.
+
+**Kafka**
+```bash
+docker compose --file ./scripts/kafka-compose.yml up
+```
+
+In your YAML configuration, set the `queue` configuration to Kafka under `engine`:
+```yaml
+engine:
+  queue:
+    type: kafka
+    config:
+      bootstrap_servers: localhost:9092
+```
+
+**Redpanda**
+```bash
+docker compose --file ./scripts/redpanda-compose.yml up
+```
+
+Redpanda is Kafka-compatible, so keep the `queue` type set to `kafka` and point it at Redpanda's port:
+
+```yaml
+engine:
+  queue:
+    type: kafka
+    config:
+      bootstrap_servers: localhost:19092
+```
+
bizon-0.0.1/README.md
ADDED
@@ -0,0 +1,100 @@
+# bizon ⚡️
+Extract and load your largest data streams with a framework you can trust for billions of records.
+
+## Features
+- **Natively fault-tolerant**: Bizon uses a checkpointing mechanism to keep track of progress and recover from the last checkpoint.
+- **Queue system agnostic**: Bizon is agnostic of the queuing system; you can use any queuing system such as Python Queue, Kafka, or Redpanda. Thanks to the `bizon.queue.Queue` interface, adapters can be written for any queuing system.
+- **Pipeline metrics**: Bizon provides exhaustive pipeline metrics and implements OpenTelemetry for tracing. You can monitor:
+  - ETAs for completion
+  - Number of records processed
+  - Completion percentage
+  - Source-to-destination latency
+- **Lightweight & lean**: Bizon has a minimal codebase and only a few dependencies:
+  - `requests` for HTTP requests
+  - `pyyaml` for configuration
+  - `sqlalchemy` for database / warehouse connections
+  - `pyarrow` for the Parquet file format
+
+## Installation
+```bash
+pip install bizon
+```
+
+## Usage
+```python
+from yaml import safe_load
+from bizon.engine.runner import RunnerFactory
+
+yaml_config = """
+source:
+  source_name: dummy
+  stream_name: creatures
+  authentication:
+    type: api_key
+    params:
+      token: dummy_key
+
+destination:
+  name: logger
+  config:
+    dummy: dummy
+"""
+
+config = safe_load(yaml_config)
+runner = RunnerFactory.create_from_config_dict(config=config)
+runner.run()
+```
+
+## Backend configuration
+
+The backend is the interface Bizon uses to store its state. It is configured in the `backend` section of the configuration file. The following backends are supported:
+- `sqlite`: In-memory SQLite database, useful for testing and development.
+- `bigquery`: Google BigQuery backend, suited to light setups & production.
+- `postgres`: PostgreSQL backend, for production use and frequent cursor updates.
+
+## Queue configuration
+
+The queue is the interface Bizon uses to exchange data between `Source` and `Destination`. It is configured in the `queue` section of the configuration file. The following queues are supported:
+- `python_queue`: Python Queue, useful for testing and development.
+- `kafka`: Apache Kafka, for production use and high throughput.
+
+## Start syncing your data 🚀
+
+### Quick setup without any dependencies ✌️
+
+Set the queue configuration to `python_queue` and the backend configuration to `sqlite`.
+This lets you test the pipeline without any external dependencies.
+
+
+### Local Kafka setup
+
+To test the pipeline with Kafka, you can use `docker compose` to set up Kafka or Redpanda locally.
+
+**Kafka**
+```bash
+docker compose --file ./scripts/kafka-compose.yml up
+```
+
+In your YAML configuration, set the `queue` configuration to Kafka under `engine`:
+```yaml
+engine:
+  queue:
+    type: kafka
+    config:
+      bootstrap_servers: localhost:9092
+```
+
+**Redpanda**
+```bash
+docker compose --file ./scripts/redpanda-compose.yml up
+```
+
+Redpanda is Kafka-compatible, so keep the `queue` type set to `kafka` and point it at Redpanda's port:
+
+```yaml
+engine:
+  queue:
+    type: kafka
+    config:
+      bootstrap_servers: localhost:19092
+```
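For the dependency-free quick setup described in the README above, the `engine` block might look like the sketch below. This is an assumption: the exact `backend` keys are defined in bizon/engine/backend/config.py, which this diff lists but does not display, so the shape simply mirrors the documented `queue` section.

```yaml
# Hypothetical quick-setup engine block: python_queue + in-memory SQLite backend
engine:
  queue:
    type: python_queue
  backend:
    type: sqlite
```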
bizon-0.0.1/bizon/common/errors/backoff.py
ADDED
@@ -0,0 +1,14 @@
+import requests
+
+
+class BaseBackoffException(requests.exceptions.HTTPError):
+    def __init__(self, request: requests.PreparedRequest, response: requests.Response, error_message: str = ""):
+        error_message = (
+            error_message
+            or f"Request URL: {request.url}, Response Code: {response.status_code}, Response Text: {response.text}"
+        )
+        super().__init__(error_message, request=request, response=response)
+
+
+class DefaultBackoffException(BaseBackoffException):
+    pass
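The package pins `backoff` as a dependency, so these exception classes are presumably what sources raise to trigger retries. A minimal sketch of how a caller might wire them to the `backoff` library; `fetch_page` and the retry parameters are illustrative, not part of the package:

```python
import backoff
import requests

from bizon.common.errors.backoff import DefaultBackoffException


# Retry with exponential backoff whenever DefaultBackoffException is raised
@backoff.on_exception(backoff.expo, DefaultBackoffException, max_tries=5)
def fetch_page(url: str) -> dict:
    response = requests.get(url)
    if response.status_code >= 500:
        # Server errors are retryable; the exception builds its own message
        raise DefaultBackoffException(request=response.request, response=response)
    return response.json()
```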
bizon-0.0.1/bizon/common/errors/errors.py
ADDED
@@ -0,0 +1,18 @@
+from enum import Enum
+from typing import Optional
+
+from pydantic import BaseModel, ConfigDict, Field
+
+
+class FailureType(str, Enum):
+    SYSTEM_ERROR = "SYSTEM_ERROR"
+    CONFIG_ERROR = "CONFIG_ERROR"
+
+
+class ErrorTraceMessage(BaseModel):
+    model_config = ConfigDict(extra="forbid")
+
+    message: str = Field(..., description="A user-friendly message that indicates the cause of the error")
+    internal_message: Optional[str] = Field(None, description="The internal error that caused the failure")
+    stack_trace: Optional[str] = Field(None, description="The full stack trace of the error")
+    failure_type: Optional[FailureType] = Field(None, description="The type of error")
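A short usage sketch for the error model (values illustrative); note that `extra="forbid"` rejects unknown fields at construction time:

```python
import traceback

from bizon.common.errors.errors import ErrorTraceMessage, FailureType

try:
    raise ValueError("API token is missing")
except ValueError as exc:
    trace = ErrorTraceMessage(
        message="Pipeline failed to authenticate against the source",
        internal_message=str(exc),
        stack_trace=traceback.format_exc(),
        failure_type=FailureType.CONFIG_ERROR,
    )
    # Serialize the trace for logging or persistence
    print(trace.model_dump_json())
```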
bizon-0.0.1/bizon/common/models.py
ADDED
@@ -0,0 +1,40 @@
+import json
+from datetime import datetime
+from enum import Enum
+from typing import Optional
+from uuid import uuid4
+
+from pydantic import BaseModel, ConfigDict, Field
+from pytz import UTC
+
+from bizon.destinations.config import DestinationConfig
+from bizon.engine.config import EngineConfig
+from bizon.source.config import SourceConfig
+
+
+class JobStatus(str, Enum):
+    NOT_STARTED = "not_started"
+    RUNNING = "running"
+    SUCCESS = "success"
+    FAILED = "failed"
+
+
+class CursorStatus(str, Enum):
+    NOT_STARTED = "not_started"
+    SUCCESS = "success"
+    FAILED = "failed"
+
+
+class BizonConfig(BaseModel):
+
+    # Forbid extra keys in the model
+    model_config = ConfigDict(extra="forbid")
+
+    source: SourceConfig
+
+    destination: DestinationConfig
+
+    engine: Optional[EngineConfig] = Field(
+        description="Engine configuration",
+        default=EngineConfig(),
+    )
bizon-0.0.1/bizon/destinations/bigquery/config/bigquery.example.yml
ADDED
@@ -0,0 +1,37 @@
+source:
+  name: hubspot
+  stream_name: contacts
+  properties:
+    strategy: all
+  authentication:
+    type: api_key
+    api_key: <MY_API_KEY>
+
+destination:
+  # Authentication: if empty, it will be inferred.
+  # The service account must have the bigquery.jobUser role,
+  # plus bigquery.dataEditor and storage.objectUser on the supplied dataset and bucket.
+  name: bigquery
+  config:
+    dataset_id: bizon_test
+    dataset_location: US
+    project_id: my-gcp-project-id
+    gcs_buffer_bucket: bizon-buffer
+    gcs_buffer_format: parquet
+    service_account_key: >-
+      {
+        "type": "service_account",
+        "project_id": "my-gcp-project-id",
+        "private_key_id": "",
+        "private_key": "-----BEGIN PRIVATE KEY-----\n\n-----END PRIVATE KEY-----\n",
+        "client_email": "a-bizon-service-account@my-gcp-project-id.iam.gserviceaccount.com",
+        "client_id": "",
+        "auth_uri": "https://accounts.google.com/o/oauth2/auth",
+        "token_uri": "https://oauth2.googleapis.com/token",
+        "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
+        "client_x509_cert_url": "",
+        "universe_domain": "googleapis.com"
+      }
+
+pipeline:
+  log_level: DEBUG
bizon-0.0.1/bizon/destinations/bigquery/src/config.py
ADDED
@@ -0,0 +1,22 @@
+from enum import Enum
+from typing import Optional
+
+from pydantic import BaseModel, Field
+
+
+class GCSBufferFormat(str, Enum):
+    PARQUET = "parquet"
+    CSV = "csv"
+
+
+class BigQueryConfig(BaseModel):
+    dataset_id: str
+    dataset_location: Optional[str] = "US"
+    project_id: str
+    gcs_buffer_bucket: str
+    gcs_buffer_format: Optional[GCSBufferFormat] = GCSBufferFormat.PARQUET
+
+    service_account_key: Optional[str] = Field(
+        description="Service Account Key JSON string. If empty it will be inferred",
+        default="",
+    )
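Given this model, the `config` block of the BigQuery example YAML above validates directly; a small sketch (values illustrative):

```python
from bizon.destinations.bigquery.src.config import BigQueryConfig, GCSBufferFormat

config = BigQueryConfig.model_validate(
    {
        "dataset_id": "bizon_test",
        "project_id": "my-gcp-project-id",
        "gcs_buffer_bucket": "bizon-buffer",
        "gcs_buffer_format": "parquet",  # coerced to the str-enum member
    }
)
assert config.dataset_location == "US"  # default applies
assert config.gcs_buffer_format is GCSBufferFormat.PARQUET
```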
bizon-0.0.1/bizon/destinations/bigquery/src/destination.py
ADDED
@@ -0,0 +1,114 @@
+import io
+import os
+import tempfile
+from typing import List
+from uuid import uuid4
+
+import pandas as pd
+import pyarrow as pa
+import pyarrow.parquet as pq
+from google.api_core.exceptions import NotFound
+from google.cloud import bigquery, storage
+from google.cloud.bigquery import DatasetReference
+from loguru import logger
+
+from bizon.destinations.destination import AbstractDestination
+from bizon.destinations.models import DestinationRecord
+from bizon.source.models import SourceRecord
+
+from .config import BigQueryConfig
+
+
+class BigQueryDestination(AbstractDestination):
+    def __init__(self, source_name: str, stream_name: str, config: BigQueryConfig):
+        super().__init__(source_name, stream_name, config)
+
+        if bool(config.service_account_key):
+            with tempfile.NamedTemporaryFile(delete=False) as temp:
+                temp.write(config.service_account_key.encode())
+                temp_file_path = temp.name
+            os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = temp_file_path
+
+        self.project_id = config.project_id
+        self.bq_client = bigquery.Client(project=self.project_id)
+        self.gcs_client = storage.Client(project=self.project_id)
+        self.buffer_bucket_name = config.gcs_buffer_bucket
+        self.buffer_bucket = self.gcs_client.bucket(config.gcs_buffer_bucket)
+        self.buffer_format = config.gcs_buffer_format
+        self.dataset_id = config.dataset_id
+        self.dataset_location = config.dataset_location
+
+    def convert_and_upload_to_buffer(self, source_records: List[SourceRecord]):
+
+        bizon_records = [
+            DestinationRecord.from_source_record(source_record).model_dump() for source_record in source_records
+        ]
+
+        logger.info(bizon_records[0])
+
+        df = pd.DataFrame(bizon_records)
+
+        # Convert DataFrame to Parquet in-memory
+        if self.buffer_format == "parquet":
+            table = pa.Table.from_pandas(df)
+            buffer = io.BytesIO()
+            pq.write_table(table, buffer)
+            buffer.seek(0)
+
+        # Upload the Parquet file to GCS
+        file_name = f"{self.source_name}/{self.stream_name}/{str(uuid4())}.parquet"
+        blob = self.buffer_bucket.blob(file_name)
+        blob.upload_from_file(buffer, content_type="application/octet-stream")
+        return file_name
+
+    def check_connection(self) -> bool:
+        dataset_ref = DatasetReference(self.project_id, self.dataset_id)
+
+        try:
+            self.bq_client.get_dataset(dataset_ref)
+        except NotFound:
+            dataset = bigquery.Dataset(dataset_ref)
+            dataset.location = self.dataset_location
+            dataset = self.bq_client.create_dataset(dataset)
+        return True
+
+    def cleanup(self, gcs_file: str):
+        blob = self.buffer_bucket.blob(gcs_file)
+        blob.delete()
+
+    # TODO: Add backoff to common exceptions => looks like most are handled by the client
+    # https://cloud.google.com/python/docs/reference/storage/latest/retry_timeout
+    # https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.dbapi.DataError
+    def load_to_bigquery(self, gcs_file: str):
+        job_config = bigquery.LoadJobConfig(
+            source_format=bigquery.SourceFormat.PARQUET,
+            write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
+            schema=[
+                bigquery.SchemaField("bizon_id", "STRING", mode="REQUIRED"),
+                bigquery.SchemaField("bizon_loaded_at", "STRING", mode="REQUIRED"),
+                bigquery.SchemaField("source_record_id", "STRING", mode="REQUIRED"),
+                bigquery.SchemaField("source_data", "STRING", mode="NULLABLE"),
+            ],
+        )
+
+        table_id = f"{self.project_id}.{self.dataset_id}.{self.source_name}_{self.stream_name}"
+
+        load_job = self.bq_client.load_table_from_uri(
+            f"gs://{self.buffer_bucket_name}/{gcs_file}", table_id, job_config=job_config
+        )
+        load_job.result()
+
+    def write_records(self, source_records: List[SourceRecord]):
+
+        # Here we can check if these IDs are already present in BigQuery
+        # Using SourceRecord.id values
+
+        gs_file_name = self.convert_and_upload_to_buffer(source_records=source_records)
+
+        try:
+            self.load_to_bigquery(gs_file_name)
+            self.cleanup(gs_file_name)
+        except Exception as e:
+            self.cleanup(gs_file_name)
+            raise e
+        return True
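The write path above is buffer-first: records are serialized to Parquet, staged in GCS, loaded into BigQuery with a load job, and the staging file is deleted on both the success and failure paths. A hedged end-to-end sketch; it requires real GCP credentials, and the `SourceRecord(id=..., data=...)` constructor shape is an assumption based on the fields this diff accesses:

```python
from bizon.destinations.bigquery.src.config import BigQueryConfig
from bizon.destinations.bigquery.src.destination import BigQueryDestination
from bizon.source.models import SourceRecord

config = BigQueryConfig(
    dataset_id="bizon_test",
    project_id="my-gcp-project-id",
    gcs_buffer_bucket="bizon-buffer",
)
destination = BigQueryDestination(source_name="dummy", stream_name="creatures", config=config)

assert destination.check_connection()  # creates the dataset if it does not exist

records = [SourceRecord(id="1", data={"name": "kraken"})]  # assumed constructor shape
destination.write_records(source_records=records)  # buffer to GCS, load, then clean up
```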
bizon-0.0.1/bizon/destinations/bigquery/tests/test_bigquery_client.py
ADDED
@@ -0,0 +1,40 @@
+import logging
+import os
+import random
+from random import randint
+
+from faker import Faker
+
+from bizon.cli.parser import parse_from_yaml
+from bizon.destinations.bigquery.src.config import BigQueryConfig
+from bizon.destinations.bigquery.src.destination import BigQueryDestination
+from bizon.source.config import SourceConfig
+
+logger = logging.getLogger(__name__)
+
+fake = Faker("en_US")
+
+raw_config = parse_from_yaml(os.path.abspath("bizon/destinations/bigquery/config/bigquery.test.yml"))
+
+
+def test_config_models():
+    assert bool(SourceConfig.model_validate(raw_config["source"])) is True
+    logger.info("source validated...")
+
+    assert bool(BigQueryConfig.model_validate(raw_config["destination"])) is True
+    logger.info("destination validated...")
+
+
+# def test_load_records_to_bigquery():
+#     bigquery_config = BigQueryConfig.model_validate(raw_config["destination"])
+#     source_config = SourceConfig.model_validate(raw_config["source"])
+
+#     fake_records = [
+#         {"foo": randint(0, 100), "bar": {"baz": fake.name(), "poo": float(random.randrange(155, 389)) / 100}}
+#         for _ in range(100)
+#     ]
+#     client = BigQueryClient(config=bigquery_config.config, source_config=source_config)
+
+#     success = client.load_records_to_bigquery(json_records=fake_records)
+
+#     assert success is True
bizon-0.0.1/bizon/destinations/config.py
ADDED
@@ -0,0 +1,17 @@
+from enum import Enum
+from typing import Union
+
+from pydantic import BaseModel, Field
+
+from .bigquery.src.config import BigQueryConfig
+from .logger.src.config import LoggerDestinationConfig
+
+
+class DestinationTypes(str, Enum):
+    BIGQUERY = "bigquery"
+    LOGGER = "logger"
+
+
+class DestinationConfig(BaseModel):
+    name: DestinationTypes = Field(..., description="Name of the destination")
+    config: Union[BigQueryConfig, LoggerDestinationConfig] = Field(..., description="Configuration for the destination")
bizon-0.0.1/bizon/destinations/destination.py
ADDED
@@ -0,0 +1,49 @@
+from abc import ABC, abstractmethod
+from typing import List
+
+from bizon.source.models import SourceRecord
+
+from .config import DestinationConfig, DestinationTypes
+
+
+class AbstractDestination(ABC):
+
+    def __init__(self, source_name: str, stream_name: str, config: DestinationConfig):
+        self.source_name = source_name
+        self.stream_name = stream_name
+        self.config = config
+
+    @property
+    def name(self) -> str:
+        return self.config.name
+
+    @abstractmethod
+    def check_connection(self) -> bool:
+        pass
+
+    @abstractmethod
+    def write_records(self, source_records: List[SourceRecord]):
+        pass
+
+
+class DestinationFactory:
+    @staticmethod
+    def get_destination(source_name: str, stream_name: str, destination_config_dict: dict) -> AbstractDestination:
+
+        if destination_config_dict.get("name") == DestinationTypes.LOGGER:
+            from .logger.src.config import LoggerDestinationConfig
+            from .logger.src.destination import LoggerDestination
+
+            config = LoggerDestinationConfig.model_validate(obj=destination_config_dict.get("config"))
+            return LoggerDestination(source_name=source_name, stream_name=stream_name, config=config)
+
+        if destination_config_dict.get("name") == DestinationTypes.BIGQUERY:
+            from .bigquery.src.config import BigQueryConfig
+            from .bigquery.src.destination import BigQueryDestination
+
+            config = BigQueryConfig.model_validate(obj=destination_config_dict.get("config"))
+            return BigQueryDestination(source_name=source_name, stream_name=stream_name, config=config)
+
+        raise ValueError(
+            f"Destination {destination_config_dict.get('name')} " f"with params {destination_config_dict} not found"
+        )
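A usage sketch for the factory, reusing the logger destination from the README config; this assumes `LoggerDestinationConfig` (not shown in this diff) accepts the `dummy` key used there:

```python
from bizon.destinations.destination import DestinationFactory

destination = DestinationFactory.get_destination(
    source_name="dummy",
    stream_name="creatures",
    destination_config_dict={"name": "logger", "config": {"dummy": "dummy"}},
)
assert destination.check_connection()
```

The plain string `"logger"` matches `DestinationTypes.LOGGER` because the enum subclasses `str`.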
bizon-0.0.1/bizon/destinations/logger/src/destination.py
ADDED
@@ -0,0 +1,21 @@
+from typing import List
+
+from loguru import logger
+
+from bizon.destinations.destination import AbstractDestination
+from bizon.source.models import SourceRecord
+
+from .config import LoggerDestinationConfig
+
+
+class LoggerDestination(AbstractDestination):
+
+    def __init__(self, source_name: str, stream_name: str, config: LoggerDestinationConfig):
+        super().__init__(source_name, stream_name, config)
+
+    def check_connection(self) -> bool:
+        return True
+
+    def write_records(self, source_records: List[SourceRecord]):
+        for record in source_records:
+            logger.info(record.data)
bizon-0.0.1/bizon/destinations/models.py
ADDED
@@ -0,0 +1,24 @@
+import json
+from datetime import datetime
+from uuid import uuid4
+
+from pydantic import BaseModel, Field
+from pytz import UTC
+
+from bizon.source.models import SourceRecord
+
+
+class DestinationRecord(BaseModel):
+    bizon_id: str = Field(..., description="Bizon unique identifier of the record")
+    bizon_loaded_at: str = Field(..., description="Datetime when the record was loaded")
+    source_record_id: str = Field(..., description="Source record id")
+    source_data: str = Field(..., description="Source record JSON string data")
+
+    @classmethod
+    def from_source_record(cls, source_record: SourceRecord) -> "DestinationRecord":
+        return cls(
+            bizon_id=uuid4().hex,
+            bizon_loaded_at=datetime.now(tz=UTC).isoformat(),
+            source_record_id=source_record.id,
+            source_data=json.dumps(source_record.data),
+        )
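Finally, a sketch of the envelope conversion; again, the `SourceRecord(id=..., data=...)` shape is assumed from the fields used above:

```python
from bizon.destinations.models import DestinationRecord
from bizon.source.models import SourceRecord

record = SourceRecord(id="contact-42", data={"email": "ada@example.com"})  # assumed shape
dest_record = DestinationRecord.from_source_record(record)

# The destination stores the raw payload as a JSON string plus lineage metadata
print(dest_record.bizon_id, dest_record.bizon_loaded_at)
print(dest_record.source_data)  # '{"email": "ada@example.com"}'
```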