bizon 0.0.4.dev2__tar.gz → 0.0.4.dev4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/PKG-INFO +13 -3
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/README.md +12 -2
- bizon-0.0.4.dev4/bizon/__main__.py +4 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/cli/main.py +17 -10
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/cli/utils.py +11 -5
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/destinations/bigquery/config/bigquery.example.yml +2 -0
- bizon-0.0.4.dev4/bizon/destinations/buffer.py +76 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/destinations/config.py +8 -1
- bizon-0.0.4.dev4/bizon/destinations/destination.py +266 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/destinations/file/src/destination.py +1 -3
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/destinations/models.py +0 -9
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/engine/backend/adapters/sqlalchemy/backend.py +29 -35
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/engine/backend/backend.py +15 -24
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/engine/backend/models.py +5 -3
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/engine/config.py +1 -1
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/engine/engine.py +3 -3
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/engine/pipeline/producer.py +14 -11
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/engine/queue/adapters/kafka/consumer.py +2 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/engine/queue/adapters/python_queue/consumer.py +3 -2
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/engine/queue/adapters/rabbitmq/consumer.py +2 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/engine/queue/queue.py +8 -1
- {bizon-0.0.4.dev2/bizon/engine/runners → bizon-0.0.4.dev4/bizon/engine/runner}/adapters/process.py +3 -4
- {bizon-0.0.4.dev2/bizon/engine/runners → bizon-0.0.4.dev4/bizon/engine/runner}/adapters/thread.py +24 -6
- {bizon-0.0.4.dev2/bizon/engine/runners → bizon-0.0.4.dev4/bizon/engine/runner}/config.py +11 -2
- {bizon-0.0.4.dev2/bizon/engine/runners → bizon-0.0.4.dev4/bizon/engine/runner}/runner.py +16 -9
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/source/discover.py +92 -81
- bizon-0.0.4.dev4/bizon/sources/hubspot/src/hubspot_base.py +89 -0
- bizon-0.0.4.dev2/bizon/sources/hubspot/src/source_objects.py → bizon-0.0.4.dev4/bizon/sources/hubspot/src/hubspot_objects.py +4 -81
- bizon-0.0.4.dev4/bizon/sources/hubspot/tests/hubspot_pipeline.py +7 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/sources/periscope/src/source.py +8 -6
- bizon-0.0.4.dev4/bizon/utils.py +3 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/pyproject.toml +1 -1
- bizon-0.0.4.dev2/bizon/destinations/buffer.py +0 -49
- bizon-0.0.4.dev2/bizon/destinations/destination.py +0 -208
- bizon-0.0.4.dev2/bizon/sources/hubspot/tests/hubspot_benchmark.py +0 -26
- bizon-0.0.4.dev2/bizon/sources/hubspot/tests/hubspot_client.py +0 -25
- bizon-0.0.4.dev2/bizon/sources/hubspot/tests/hubspot_iteration.py +0 -30
- bizon-0.0.4.dev2/bizon/sources/hubspot/tests/hubspot_pipeline.py +0 -6
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/LICENSE +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/cli/__init__.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/common/errors/backoff.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/common/errors/errors.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/common/models.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/destinations/bigquery/src/config.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/destinations/bigquery/src/destination.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/destinations/bigquery/tests/test_bigquery_client.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/destinations/file/src/config.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/destinations/logger/src/config.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/destinations/logger/src/destination.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/engine/backend/adapters/sqlalchemy/config.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/engine/backend/config.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/engine/pipeline/consumer.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/engine/pipeline/models.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/engine/queue/adapters/kafka/config.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/engine/queue/adapters/kafka/queue.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/engine/queue/adapters/python_queue/config.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/engine/queue/adapters/python_queue/queue.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/engine/queue/adapters/rabbitmq/config.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/engine/queue/adapters/rabbitmq/queue.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/engine/queue/config.py +0 -0
- {bizon-0.0.4.dev2/bizon/engine/runners → bizon-0.0.4.dev4/bizon/engine/runner}/models.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/source/auth/authenticators/abstract_oauth.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/source/auth/authenticators/abstract_token.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/source/auth/authenticators/basic.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/source/auth/authenticators/cookies.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/source/auth/authenticators/oauth.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/source/auth/authenticators/token.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/source/auth/builder.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/source/auth/config.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/source/config.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/source/cursor.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/source/models.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/source/session.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/source/source.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/sources/dummy/config/api_key.example.yml +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/sources/dummy/config/api_key_kafka.example.yml +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/sources/dummy/src/fake_api.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/sources/dummy/src/source.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/sources/dummy/tests/dummy_pipeline.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/sources/dummy/tests/dummy_pipeline_bigquery_backend.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/sources/dummy/tests/dummy_pipeline_kafka.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/sources/dummy/tests/dummy_pipeline_rabbitmq.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/sources/dummy/tests/dummy_pipeline_write_data_bigquery.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/sources/dummy/tests/dummy_pipeline_write_data_bigquery_through_kafka.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/sources/gsheets/config/default_auth.example.yml +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/sources/gsheets/config/service_account.example.yml +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/sources/gsheets/src/source.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/sources/gsheets/tests/gsheets_pipeline.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/sources/hubspot/config/api_key.example.yml +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/sources/hubspot/config/oauth.example.yml +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/sources/hubspot/src/models/hs_object.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/sources/periscope/config/periscope_charts.example.yml +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/sources/periscope/config/periscope_dashboards.example.yml +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/sources/periscope/tests/periscope_pipeline_charts.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/sources/periscope/tests/periscope_pipeline_dashboard.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: bizon
|
|
3
|
-
Version: 0.0.4.dev2
|
|
3
|
+
Version: 0.0.4.dev4
|
|
4
4
|
Summary: Extract and load your data reliably from API Clients with native fault-tolerant and checkpointing mechanism.
|
|
5
5
|
Author: Antoine Balliet
|
|
6
6
|
Author-email: antoine.balliet@gmail.com
|
|
@@ -45,7 +45,8 @@ Extract and load your largest data streams with a framework you can trust for bi
|
|
|
45
45
|
|
|
46
46
|
## Features
|
|
47
47
|
- **Natively fault-tolerant**: Bizon uses a checkpointing mechanism to keep track of the progress and recover from the last checkpoint.
|
|
48
|
-
- **
|
|
48
|
+
- **High throughput**: Bizon is designed to handle high throughput and can process billions of records.
|
|
49
|
+
- **Queue system agnostic**: Bizon is agnostic of the queuing system: you can use any of Python Queue, RabbitMQ, Kafka or Redpanda. Thanks to the `bizon.engine.queue.Queue` interface, adapters can be written for any queuing system.
|
|
49
50
|
- **Pipeline metrics**: Bizon provides exhaustive pipeline metrics and implements OpenTelemetry for tracing. You can monitor:
|
|
50
51
|
- ETAs for completion
|
|
51
52
|
- Number of records processed
|
|
@@ -64,6 +65,14 @@ pip install bizon
|
|
|
64
65
|
|
|
65
66
|
## Usage
|
|
66
67
|
|
|
68
|
+
### List available sources and streams
|
|
69
|
+
```bash
|
|
70
|
+
bizon source list
|
|
71
|
+
bizon stream list <source_name>
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
### Create a pipeline
|
|
75
|
+
|
|
67
76
|
Create a file named `config.yml` in your working directory with the following content:
|
|
68
77
|
|
|
69
78
|
```yaml
|
|
@@ -97,7 +106,8 @@ Backend is the interface used by Bizon to store its state. It can be configured
|
|
|
97
106
|
|
|
98
107
|
Queue is the interface used by Bizon to exchange data between `Source` and `Destination`. It can be configured in the `queue` section of the configuration file. The following queues are supported:
|
|
99
108
|
- `python_queue`: Python Queue, useful for testing and development.
|
|
100
|
-
- `
|
|
109
|
+
- `rabbitmq`: RabbitMQ, for production use and high throughput.
|
|
110
|
+
- `kafka`: Apache Kafka, for production use, high throughput and strong persistence.
|
|
101
111
|
|
|
102
112
|
## Start syncing your data 🚀
|
|
103
113
|
|
|
@@ -3,7 +3,8 @@ Extract and load your largest data streams with a framework you can trust for bi
|
|
|
3
3
|
|
|
4
4
|
## Features
|
|
5
5
|
- **Natively fault-tolerant**: Bizon uses a checkpointing mechanism to keep track of the progress and recover from the last checkpoint.
|
|
6
|
-
- **
|
|
6
|
+
- **High throughput**: Bizon is designed to handle high throughput and can process billions of records.
|
|
7
|
+
- **Queue system agnostic**: Bizon is agnostic of the queuing system: you can use any of Python Queue, RabbitMQ, Kafka or Redpanda. Thanks to the `bizon.engine.queue.Queue` interface, adapters can be written for any queuing system.
|
|
7
8
|
- **Pipeline metrics**: Bizon provides exhaustive pipeline metrics and implements OpenTelemetry for tracing. You can monitor:
|
|
8
9
|
- ETAs for completion
|
|
9
10
|
- Number of records processed
|
|
@@ -22,6 +23,14 @@ pip install bizon
|
|
|
22
23
|
|
|
23
24
|
## Usage
|
|
24
25
|
|
|
26
|
+
### List available sources and streams
|
|
27
|
+
```bash
|
|
28
|
+
bizon source list
|
|
29
|
+
bizon stream list <source_name>
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
### Create a pipeline
|
|
33
|
+
|
|
25
34
|
Create a file named `config.yml` in your working directory with the following content:
|
|
26
35
|
|
|
27
36
|
```yaml
|
|
@@ -55,7 +64,8 @@ Backend is the interface used by Bizon to store its state. It can be configured
|
|
|
55
64
|
|
|
56
65
|
Queue is the interface used by Bizon to exchange data between `Source` and `Destination`. It can be configured in the `queue` section of the configuration file. The following queues are supported:
|
|
57
66
|
- `python_queue`: Python Queue, useful for testing and development.
|
|
58
|
-
- `
|
|
67
|
+
- `rabbitmq`: RabbitMQ, for production use and high throughput.
|
|
68
|
+
- `kafka`: Apache Kafka, for production use, high throughput and strong persistence.
|
|
59
69
|
|
|
60
70
|
## Start syncing your data 🚀
|
|
61
71
|
|
|
@@ -1,12 +1,13 @@
|
|
|
1
1
|
import click
|
|
2
2
|
|
|
3
3
|
from bizon.engine.engine import RunnerFactory
|
|
4
|
+
from bizon.engine.runner.config import LoggerLevel
|
|
4
5
|
from bizon.source.discover import discover_all_sources
|
|
5
6
|
|
|
6
7
|
from .utils import (
|
|
7
8
|
parse_from_yaml,
|
|
8
9
|
set_custom_source_path_in_config,
|
|
9
|
-
|
|
10
|
+
set_log_level,
|
|
10
11
|
set_runner_in_config,
|
|
11
12
|
)
|
|
12
13
|
|
|
@@ -50,7 +51,7 @@ def stream():
|
|
|
50
51
|
|
|
51
52
|
@stream.command()
|
|
52
53
|
@click.argument("source_name", type=click.STRING)
|
|
53
|
-
def list(source_name: str):
|
|
54
|
+
def list(source_name: str): # noqa
|
|
54
55
|
"""List available streams for a source."""
|
|
55
56
|
sources = discover_all_sources()
|
|
56
57
|
source_model = sources.get(source_name)
|
|
@@ -88,25 +89,29 @@ def destination():
|
|
|
88
89
|
help="Runner type to use. Thread or Process.",
|
|
89
90
|
)
|
|
90
91
|
@click.option(
|
|
91
|
-
"--
|
|
92
|
+
"--log-level",
|
|
92
93
|
required=False,
|
|
93
|
-
|
|
94
|
+
type=click.Choice([level.name for level in LoggerLevel]),
|
|
94
95
|
show_default=True,
|
|
95
|
-
|
|
96
|
-
help="Enable debug mode.",
|
|
96
|
+
help="Log level to use.",
|
|
97
97
|
)
|
|
98
|
-
def run(
|
|
98
|
+
def run(
|
|
99
|
+
filename: str,
|
|
100
|
+
custom_source: str,
|
|
101
|
+
runner: str,
|
|
102
|
+
log_level: LoggerLevel,
|
|
103
|
+
help="Run a bizon pipeline from a YAML file.",
|
|
104
|
+
):
|
|
99
105
|
"""Run a bizon pipeline from a YAML file."""
|
|
100
|
-
ctx = click.get_current_context()
|
|
101
106
|
|
|
102
107
|
# Parse config from YAML file as a dictionary
|
|
103
108
|
config = parse_from_yaml(filename)
|
|
104
109
|
|
|
105
110
|
# Set debug mode
|
|
106
|
-
|
|
111
|
+
set_log_level(config=config, level=log_level)
|
|
107
112
|
|
|
108
113
|
# Override source_file_path param in config
|
|
109
|
-
set_custom_source_path_in_config(config=config, custom_source=
|
|
114
|
+
set_custom_source_path_in_config(config=config, custom_source=custom_source)
|
|
110
115
|
|
|
111
116
|
# Override runner param in config
|
|
112
117
|
set_runner_in_config(config=config, runner=runner)
|
|
@@ -114,6 +119,8 @@ def run(filename: str, custom_source: str, runner: str, debug, help="Run a bizon
|
|
|
114
119
|
runner = RunnerFactory.create_from_config_dict(config=config)
|
|
115
120
|
runner.run()
|
|
116
121
|
|
|
122
|
+
click.echo("Pipeline finished.")
|
|
123
|
+
|
|
117
124
|
|
|
118
125
|
if __name__ == "__main__":
|
|
119
126
|
cli()
|
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
import os
|
|
2
|
-
|
|
3
1
|
import yaml
|
|
4
2
|
|
|
5
3
|
|
|
@@ -9,10 +7,17 @@ def parse_from_yaml(path_to_yaml) -> dict:
|
|
|
9
7
|
return config
|
|
10
8
|
|
|
11
9
|
|
|
12
|
-
|
|
10
|
+
# TODO: Refacto
|
|
11
|
+
def set_log_level(config: dict, level: str):
|
|
13
12
|
# Set Log Level to DEBUG
|
|
14
|
-
if
|
|
15
|
-
|
|
13
|
+
if level:
|
|
14
|
+
if "engine" not in config:
|
|
15
|
+
config["engine"] = {}
|
|
16
|
+
|
|
17
|
+
if "runner" not in config["engine"]:
|
|
18
|
+
config["engine"]["runner"] = {"log_level": level}
|
|
19
|
+
|
|
20
|
+
config["engine"]["runner"]["log_level"] = level
|
|
16
21
|
|
|
17
22
|
|
|
18
23
|
def set_custom_source_path_in_config(config: dict, custom_source: str):
|
|
@@ -20,6 +25,7 @@ def set_custom_source_path_in_config(config: dict, custom_source: str):
|
|
|
20
25
|
config["source"]["source_file_path"] = custom_source
|
|
21
26
|
|
|
22
27
|
|
|
28
|
+
# TODO: Refacto
|
|
23
29
|
def set_runner_in_config(config: dict, runner: str):
|
|
24
30
|
if runner:
|
|
25
31
|
if "engine" not in config:
|
{bizon-0.0.4.dev2 → bizon-0.0.4.dev4}/bizon/destinations/bigquery/config/bigquery.example.yml
RENAMED
|
@@ -13,6 +13,8 @@ destination:
|
|
|
13
13
|
# Must have the bigquery.dataEditor and storage.objectUser on the supplied dataset and bucket
|
|
14
14
|
name: bigquery
|
|
15
15
|
config:
|
|
16
|
+
buffer_size: 10 # in Mb
|
|
17
|
+
buffer_flush_timeout: 300 # in seconds
|
|
16
18
|
dataset_id: bizon_test
|
|
17
19
|
dataset_location: US
|
|
18
20
|
project_id: my-gcp-project-id
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
from datetime import datetime
|
|
3
|
+
from typing import List
|
|
4
|
+
|
|
5
|
+
from bizon.destinations.models import DestinationRecord
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class DestinationBuffer:
|
|
9
|
+
|
|
10
|
+
def __init__(self, buffer_size: int, buffer_flush_timeout: int) -> None:
|
|
11
|
+
self.buffer_size = buffer_size * 1024 * 1024 # Convert to bytes
|
|
12
|
+
self.buffer_flush_timeout = buffer_flush_timeout
|
|
13
|
+
self.records: List[DestinationRecord] = []
|
|
14
|
+
self._iterations: List[int] = []
|
|
15
|
+
self.pagination = {}
|
|
16
|
+
self.modified_at: List[datetime] = [datetime.utcnow()]
|
|
17
|
+
|
|
18
|
+
@property
|
|
19
|
+
def current_size(self) -> int:
|
|
20
|
+
"""Return buffer size"""
|
|
21
|
+
return sys.getsizeof(self.records)
|
|
22
|
+
|
|
23
|
+
@property
|
|
24
|
+
def buffer_free_space_pct(self) -> float:
|
|
25
|
+
"""Return free space in buffer in percentage"""
|
|
26
|
+
return round((self.buffer_free_space / self.buffer_size) * 100, 3)
|
|
27
|
+
|
|
28
|
+
@property
|
|
29
|
+
def from_iteration(self) -> int:
|
|
30
|
+
"""Return the smallest iteration in buffer"""
|
|
31
|
+
if not self._iterations:
|
|
32
|
+
raise ValueError("Buffer is empty")
|
|
33
|
+
return min(self._iterations)
|
|
34
|
+
|
|
35
|
+
@property
|
|
36
|
+
def to_iteration(self) -> int:
|
|
37
|
+
"""Return the largest iteration in buffer"""
|
|
38
|
+
if not self._iterations:
|
|
39
|
+
raise ValueError("Buffer is empty")
|
|
40
|
+
return max(self._iterations)
|
|
41
|
+
|
|
42
|
+
@property
|
|
43
|
+
def buffer_free_space(self) -> int:
|
|
44
|
+
"""Return free space for records in buffer"""
|
|
45
|
+
assert self.current_size <= self.buffer_size, "Buffer size exceeded"
|
|
46
|
+
return self.buffer_size - self.current_size
|
|
47
|
+
|
|
48
|
+
@property
|
|
49
|
+
def ripeness(self) -> float:
|
|
50
|
+
"""Return buffer ripeness"""
|
|
51
|
+
if self.buffer_flush_timeout == 0:
|
|
52
|
+
return 0
|
|
53
|
+
return round((max(self.modified_at) - min(self.modified_at)).seconds, 2)
|
|
54
|
+
|
|
55
|
+
@property
|
|
56
|
+
def is_ripe(self) -> bool:
|
|
57
|
+
"""Check if buffer is ripe for flushing based on the timeout"""
|
|
58
|
+
if self.buffer_flush_timeout == 0:
|
|
59
|
+
return False
|
|
60
|
+
return (max(self.modified_at) - min(self.modified_at)).seconds >= self.buffer_flush_timeout
|
|
61
|
+
|
|
62
|
+
def flush(self):
|
|
63
|
+
"""Flush buffer"""
|
|
64
|
+
self.records = []
|
|
65
|
+
self._iterations = []
|
|
66
|
+
self.pagination = {}
|
|
67
|
+
self.modified_at = []
|
|
68
|
+
|
|
69
|
+
def add_source_iteration_records_to_buffer(
|
|
70
|
+
self, iteration: int, records: List[DestinationRecord], pagination: dict = None
|
|
71
|
+
):
|
|
72
|
+
"""Add records for the given iteration to buffer"""
|
|
73
|
+
self.records.extend(records)
|
|
74
|
+
self._iterations.append(iteration)
|
|
75
|
+
self.pagination = pagination
|
|
76
|
+
self.modified_at.append(datetime.utcnow())
|
|
@@ -20,7 +20,14 @@ class NormalizationConfig(BaseModel):
|
|
|
20
20
|
|
|
21
21
|
|
|
22
22
|
class AbstractDestinationDetailsConfig(BaseModel):
|
|
23
|
-
buffer_size: int = Field(
|
|
23
|
+
buffer_size: int = Field(
|
|
24
|
+
default=50,
|
|
25
|
+
description="Buffer size in Mb for the destination. Set to 0 to disable and write directly to the destination.",
|
|
26
|
+
)
|
|
27
|
+
buffer_flush_timeout: int = Field(
|
|
28
|
+
default=600,
|
|
29
|
+
description="Maximum time in seconds for buffering after which the records will be written to the destination. Set to 0 to deactivate the timeout buffer check.",
|
|
30
|
+
)
|
|
24
31
|
normalization: Optional[NormalizationConfig] = Field(
|
|
25
32
|
description="Normalization configuration, by default no normalization is applied",
|
|
26
33
|
default=NormalizationConfig(type=NormalizationType.NONE),
|
|
@@ -0,0 +1,266 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
from abc import ABC, abstractmethod
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
from enum import Enum
|
|
5
|
+
from typing import List, Optional, Tuple, Union
|
|
6
|
+
|
|
7
|
+
from loguru import logger
|
|
8
|
+
from pydantic import BaseModel, Field
|
|
9
|
+
|
|
10
|
+
from bizon.common.models import SyncMetadata
|
|
11
|
+
from bizon.engine.backend.backend import AbstractBackend
|
|
12
|
+
from bizon.engine.backend.models import JobStatus
|
|
13
|
+
from bizon.source.config import SourceSyncModes
|
|
14
|
+
from bizon.source.models import SourceRecord
|
|
15
|
+
|
|
16
|
+
from .buffer import DestinationBuffer
|
|
17
|
+
from .config import (
|
|
18
|
+
AbstractDestinationConfig,
|
|
19
|
+
AbstractDestinationDetailsConfig,
|
|
20
|
+
DestinationTypes,
|
|
21
|
+
)
|
|
22
|
+
from .models import DestinationRecord
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class DestinationBufferStatus(str, Enum):
|
|
26
|
+
"""Destination buffer status"""
|
|
27
|
+
|
|
28
|
+
RECORDS_WRITTEN = "RECORDS_WRITTEN"
|
|
29
|
+
RECORDS_WRITTEN_THEN_BUFFERED = "RECORDS_WRITTEN_THEN_BUFFERED"
|
|
30
|
+
RECORDS_BUFFERED = "RECORDS_BUFFERED"
|
|
31
|
+
NO_RECORDS = "NO_RECORDS"
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class DestinationIteration(BaseModel):
|
|
35
|
+
success: bool = Field(..., description="Success status of the iteration")
|
|
36
|
+
error_message: Optional[str] = Field(None, description="Error message if iteration failed")
|
|
37
|
+
records_written: int = Field(0, description="Number of records written to the destination")
|
|
38
|
+
from_source_iteration: Optional[int] = Field(None, description="From source iteration identifier buffer starts")
|
|
39
|
+
to_source_iteration: Optional[int] = Field(
|
|
40
|
+
None, description="To source iteration identifier buffer ends, inclusive"
|
|
41
|
+
)
|
|
42
|
+
pagination: Optional[dict] = Field(None, description="Source pagination for next interation recovery purposes")
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class AbstractDestination(ABC):
|
|
46
|
+
|
|
47
|
+
def __init__(self, sync_metadata: SyncMetadata, config: AbstractDestinationDetailsConfig, backend: AbstractBackend):
|
|
48
|
+
self.sync_metadata = sync_metadata
|
|
49
|
+
self.config = config
|
|
50
|
+
self.backend = backend
|
|
51
|
+
self.buffer = DestinationBuffer(
|
|
52
|
+
buffer_size=self.config.buffer_size, buffer_flush_timeout=self.config.buffer_flush_timeout
|
|
53
|
+
)
|
|
54
|
+
|
|
55
|
+
@abstractmethod
|
|
56
|
+
def check_connection(self) -> bool:
|
|
57
|
+
pass
|
|
58
|
+
|
|
59
|
+
@abstractmethod
|
|
60
|
+
def delete_table(self) -> bool:
|
|
61
|
+
"""Delete table in destination"""
|
|
62
|
+
pass
|
|
63
|
+
|
|
64
|
+
@abstractmethod
|
|
65
|
+
def write_records(self, destination_records: List[DestinationRecord]) -> Tuple[bool, Union[str, None]]:
|
|
66
|
+
"""Write records to destination for the given iteration and return success status and error message"""
|
|
67
|
+
pass
|
|
68
|
+
|
|
69
|
+
@staticmethod
|
|
70
|
+
def convert_source_records_to_destination_records(
|
|
71
|
+
source_records: List[SourceRecord], extracted_at: datetime
|
|
72
|
+
) -> List[DestinationRecord]:
|
|
73
|
+
"""Convert source records to destination records"""
|
|
74
|
+
return [
|
|
75
|
+
DestinationRecord.from_source_record(source_record=source_record, extracted_at=extracted_at)
|
|
76
|
+
for source_record in source_records
|
|
77
|
+
]
|
|
78
|
+
|
|
79
|
+
def prepare_destination(self):
|
|
80
|
+
"""Prepare destination before writing records"""
|
|
81
|
+
# Delete table if full refresh
|
|
82
|
+
if self.sync_metadata.sync_mode == SourceSyncModes.FULL_REFRESH:
|
|
83
|
+
self.delete_table()
|
|
84
|
+
|
|
85
|
+
def buffer_flush_handler(self, session=None) -> DestinationIteration:
|
|
86
|
+
# TODO: Add retry strategy
|
|
87
|
+
|
|
88
|
+
# Initialize destination iteration
|
|
89
|
+
destination_iteration = DestinationIteration(
|
|
90
|
+
success=False,
|
|
91
|
+
records_written=0,
|
|
92
|
+
pagination=self.buffer.pagination,
|
|
93
|
+
)
|
|
94
|
+
|
|
95
|
+
success, error_msg = self.write_records(destination_records=self.buffer.records)
|
|
96
|
+
|
|
97
|
+
if success:
|
|
98
|
+
# We wrote records to destination so we keep it
|
|
99
|
+
destination_iteration.records_written = len(self.buffer.records)
|
|
100
|
+
|
|
101
|
+
else:
|
|
102
|
+
# We failed to write records to destination so we keep the error message
|
|
103
|
+
destination_iteration.error_message = error_msg
|
|
104
|
+
|
|
105
|
+
destination_iteration.success = success
|
|
106
|
+
destination_iteration.from_source_iteration = self.buffer.from_iteration
|
|
107
|
+
destination_iteration.to_source_iteration = self.buffer.to_iteration
|
|
108
|
+
destination_iteration.pagination = self.buffer.pagination
|
|
109
|
+
|
|
110
|
+
# Update destination cursor
|
|
111
|
+
self.create_cursors(destination_iteration=destination_iteration)
|
|
112
|
+
|
|
113
|
+
return destination_iteration
|
|
114
|
+
|
|
115
|
+
def write_or_buffer_records(
|
|
116
|
+
self,
|
|
117
|
+
destination_records: List[DestinationRecord],
|
|
118
|
+
iteration: int,
|
|
119
|
+
last_iteration: bool = False,
|
|
120
|
+
session=None,
|
|
121
|
+
pagination: dict = None,
|
|
122
|
+
) -> DestinationBufferStatus:
|
|
123
|
+
"""Write records to destination or buffer them for the given iteration"""
|
|
124
|
+
|
|
125
|
+
# Last iteration, write all records to destination
|
|
126
|
+
if last_iteration:
|
|
127
|
+
logger.debug("Writing last iteration records to destination")
|
|
128
|
+
assert len(destination_records) == 0, "Last iteration should not have any records"
|
|
129
|
+
destination_iteration = self.buffer_flush_handler(session=session)
|
|
130
|
+
|
|
131
|
+
if destination_iteration.success:
|
|
132
|
+
# Update job status to success
|
|
133
|
+
self.backend.update_stream_job_status(
|
|
134
|
+
job_id=self.sync_metadata.job_id,
|
|
135
|
+
job_status=JobStatus.SUCCEEDED,
|
|
136
|
+
session=session,
|
|
137
|
+
)
|
|
138
|
+
|
|
139
|
+
self.buffer.flush()
|
|
140
|
+
return DestinationBufferStatus.RECORDS_WRITTEN
|
|
141
|
+
|
|
142
|
+
# Don't write empty records to destination
|
|
143
|
+
if len(destination_records) == 0 and not last_iteration:
|
|
144
|
+
logger.warning("No records to write to destination. Check source and queue provider.")
|
|
145
|
+
return DestinationBufferStatus.NO_RECORDS
|
|
146
|
+
|
|
147
|
+
# Write records to destination if buffer size is 0
|
|
148
|
+
if self.buffer.buffer_size == 0:
|
|
149
|
+
logger.info("Writing last iteration records to destination")
|
|
150
|
+
self.buffer.add_source_iteration_records_to_buffer(
|
|
151
|
+
iteration=iteration, records=destination_records, pagination=pagination
|
|
152
|
+
)
|
|
153
|
+
self.buffer_flush_handler(session=session)
|
|
154
|
+
self.buffer.flush()
|
|
155
|
+
return DestinationBufferStatus.RECORDS_WRITTEN
|
|
156
|
+
|
|
157
|
+
logger.debug(f"Buffer free space {self.buffer.buffer_free_space_pct}%")
|
|
158
|
+
logger.debug(f"Buffer current size {self.buffer.current_size} bytes")
|
|
159
|
+
logger.info(
|
|
160
|
+
f"Buffer ripeness {self.buffer.ripeness / 60} min. Max ripeness {self.buffer.buffer_flush_timeout / 60} min." # noqa
|
|
161
|
+
)
|
|
162
|
+
|
|
163
|
+
# Write buffer to destination if buffer is ripe and create a new buffer for the new iteration
|
|
164
|
+
if self.buffer.is_ripe:
|
|
165
|
+
logger.info(
|
|
166
|
+
f"Buffer is ripe (buffering for longer than buffer_flush_timeout: {self.buffer.buffer_flush_timeout} seconds), writing buffer to destination" # noqa
|
|
167
|
+
)
|
|
168
|
+
self.buffer_flush_handler(session=session)
|
|
169
|
+
self.buffer.flush()
|
|
170
|
+
self.buffer.add_source_iteration_records_to_buffer(
|
|
171
|
+
iteration=iteration, records=destination_records, pagination=pagination
|
|
172
|
+
)
|
|
173
|
+
return DestinationBufferStatus.RECORDS_WRITTEN_THEN_BUFFERED
|
|
174
|
+
|
|
175
|
+
# Buffer can hold all records from this iteration
|
|
176
|
+
elif self.buffer.buffer_free_space >= sys.getsizeof(destination_records):
|
|
177
|
+
self.buffer.add_source_iteration_records_to_buffer(
|
|
178
|
+
iteration=iteration, records=destination_records, pagination=pagination
|
|
179
|
+
)
|
|
180
|
+
return DestinationBufferStatus.RECORDS_BUFFERED
|
|
181
|
+
|
|
182
|
+
# Buffer can contain some records from this iteration
|
|
183
|
+
# For now we will write all records to destination and then buffer the remaining records
|
|
184
|
+
else:
|
|
185
|
+
self.buffer_flush_handler(session=session)
|
|
186
|
+
self.buffer.flush()
|
|
187
|
+
self.buffer.add_source_iteration_records_to_buffer(
|
|
188
|
+
iteration=iteration, records=destination_records, pagination=pagination
|
|
189
|
+
)
|
|
190
|
+
return DestinationBufferStatus.RECORDS_WRITTEN_THEN_BUFFERED
|
|
191
|
+
|
|
192
|
+
def create_cursors(self, destination_iteration: DestinationIteration):
|
|
193
|
+
self.backend.create_destination_cursor(
|
|
194
|
+
job_id=self.sync_metadata.job_id,
|
|
195
|
+
source_name=self.sync_metadata.source_name,
|
|
196
|
+
stream_name=self.sync_metadata.stream_name,
|
|
197
|
+
destination_name=self.sync_metadata.destination_name,
|
|
198
|
+
from_source_iteration=destination_iteration.from_source_iteration,
|
|
199
|
+
to_source_iteration=destination_iteration.to_source_iteration,
|
|
200
|
+
rows_written=destination_iteration.records_written,
|
|
201
|
+
pagination=destination_iteration.pagination,
|
|
202
|
+
success=destination_iteration.success,
|
|
203
|
+
)
|
|
204
|
+
|
|
205
|
+
def write_records_and_update_cursor(
|
|
206
|
+
self,
|
|
207
|
+
source_records: List[SourceRecord],
|
|
208
|
+
iteration: int,
|
|
209
|
+
extracted_at: datetime,
|
|
210
|
+
last_iteration: bool = False,
|
|
211
|
+
pagination: dict = None,
|
|
212
|
+
) -> bool:
|
|
213
|
+
"""
|
|
214
|
+
Write records to destination and update the cursor for the given iteration.
|
|
215
|
+
Stores the source pagination for recovery purposes.
|
|
216
|
+
"""
|
|
217
|
+
|
|
218
|
+
# Case when producer failed to fetch data from first iteration
|
|
219
|
+
if iteration == 0 and len(source_records) == 0:
|
|
220
|
+
logger.warning("Source failed to fetch data from the first iteration, no records will be written.")
|
|
221
|
+
return False
|
|
222
|
+
|
|
223
|
+
# Prepare destination
|
|
224
|
+
if iteration == 0:
|
|
225
|
+
self.prepare_destination()
|
|
226
|
+
|
|
227
|
+
destination_records = self.convert_source_records_to_destination_records(
|
|
228
|
+
source_records=source_records,
|
|
229
|
+
extracted_at=extracted_at,
|
|
230
|
+
)
|
|
231
|
+
|
|
232
|
+
# Buffer records otherwise write to destination
|
|
233
|
+
self.write_or_buffer_records(
|
|
234
|
+
destination_records=destination_records,
|
|
235
|
+
iteration=iteration,
|
|
236
|
+
last_iteration=last_iteration,
|
|
237
|
+
pagination=pagination,
|
|
238
|
+
)
|
|
239
|
+
|
|
240
|
+
return True
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
class DestinationFactory:
|
|
244
|
+
@staticmethod
|
|
245
|
+
def get_destination(
|
|
246
|
+
sync_metadata: SyncMetadata,
|
|
247
|
+
config: AbstractDestinationConfig,
|
|
248
|
+
backend: AbstractBackend,
|
|
249
|
+
) -> AbstractDestination:
|
|
250
|
+
|
|
251
|
+
if config.name == DestinationTypes.LOGGER:
|
|
252
|
+
from .logger.src.destination import LoggerDestination
|
|
253
|
+
|
|
254
|
+
return LoggerDestination(sync_metadata=sync_metadata, config=config.config, backend=backend)
|
|
255
|
+
|
|
256
|
+
elif config.name == DestinationTypes.BIGQUERY:
|
|
257
|
+
from .bigquery.src.destination import BigQueryDestination
|
|
258
|
+
|
|
259
|
+
return BigQueryDestination(sync_metadata=sync_metadata, config=config.config, backend=backend)
|
|
260
|
+
|
|
261
|
+
elif config.name == DestinationTypes.FILE:
|
|
262
|
+
from .file.src.destination import FileDestination
|
|
263
|
+
|
|
264
|
+
return FileDestination(sync_metadata=sync_metadata, config=config.config, backend=backend)
|
|
265
|
+
|
|
266
|
+
raise ValueError(f"Destination {config.name}" f"with params {config} not found")
|
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
import json
|
|
2
1
|
from typing import List, Tuple
|
|
3
2
|
|
|
4
3
|
from loguru import logger
|
|
@@ -26,6 +25,5 @@ class FileDestination(AbstractDestination):
|
|
|
26
25
|
def write_records(self, destination_records: List[DestinationRecord]) -> Tuple[bool, str]:
|
|
27
26
|
with open(self.config.filepath, "a") as f:
|
|
28
27
|
for record in destination_records:
|
|
29
|
-
|
|
30
|
-
f.write("\n")
|
|
28
|
+
f.write(record.model_dump_json() + "\n")
|
|
31
29
|
return True, ""
|
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
import json
|
|
2
2
|
from datetime import datetime
|
|
3
|
-
from typing import Optional
|
|
4
3
|
from uuid import uuid4
|
|
5
4
|
|
|
6
5
|
from pydantic import BaseModel, Field
|
|
@@ -9,14 +8,6 @@ from pytz import UTC
|
|
|
9
8
|
from bizon.source.models import SourceRecord
|
|
10
9
|
|
|
11
10
|
|
|
12
|
-
class DestinationIteration(BaseModel):
|
|
13
|
-
success: bool = Field(..., description="Success status of the iteration")
|
|
14
|
-
error_message: Optional[str] = Field(None, description="Error message if iteration failed")
|
|
15
|
-
records_written: int = Field(0, description="Number of records written to the destination")
|
|
16
|
-
from_source_iteration: Optional[int] = Field(None, description="From source iteration identifier buffer starts")
|
|
17
|
-
to_source_iteration: Optional[int] = Field(None, description="To source iteration identifier buffer ends")
|
|
18
|
-
|
|
19
|
-
|
|
20
11
|
class DestinationRecord(BaseModel):
|
|
21
12
|
bizon_id: str = Field(..., description="Bizon unique identifier of the record")
|
|
22
13
|
bizon_extracted_at: datetime = Field(..., description="Datetime when the record was extracted")
|