bizon 0.0.4.dev2__tar.gz → 0.0.4.dev3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/PKG-INFO +13 -3
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/README.md +12 -2
- bizon-0.0.4.dev3/bizon/__main__.py +4 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/cli/main.py +3 -1
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/destinations/bigquery/config/bigquery.example.yml +2 -0
- bizon-0.0.4.dev3/bizon/destinations/buffer.py +76 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/destinations/config.py +8 -1
- bizon-0.0.4.dev3/bizon/destinations/destination.py +260 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/destinations/file/src/destination.py +1 -3
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/destinations/models.py +0 -9
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/engine/backend/adapters/sqlalchemy/backend.py +28 -34
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/engine/backend/backend.py +15 -24
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/engine/backend/models.py +5 -2
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/engine/config.py +1 -1
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/engine/engine.py +3 -3
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/engine/pipeline/producer.py +14 -12
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/engine/queue/adapters/kafka/consumer.py +2 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/engine/queue/adapters/python_queue/consumer.py +3 -2
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/engine/queue/adapters/rabbitmq/consumer.py +2 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/engine/queue/queue.py +8 -1
- {bizon-0.0.4.dev2/bizon/engine/runners → bizon-0.0.4.dev3/bizon/engine/runner}/adapters/process.py +3 -4
- {bizon-0.0.4.dev2/bizon/engine/runners → bizon-0.0.4.dev3/bizon/engine/runner}/adapters/thread.py +24 -6
- {bizon-0.0.4.dev2/bizon/engine/runners → bizon-0.0.4.dev3/bizon/engine/runner}/runner.py +6 -8
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/source/discover.py +92 -81
- bizon-0.0.4.dev3/bizon/sources/hubspot/src/hubspot_base.py +89 -0
- bizon-0.0.4.dev2/bizon/sources/hubspot/src/source_objects.py → bizon-0.0.4.dev3/bizon/sources/hubspot/src/hubspot_objects.py +4 -81
- bizon-0.0.4.dev3/bizon/sources/hubspot/tests/hubspot_pipeline.py +7 -0
- bizon-0.0.4.dev3/bizon/utils.py +3 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/pyproject.toml +1 -1
- bizon-0.0.4.dev2/bizon/destinations/buffer.py +0 -49
- bizon-0.0.4.dev2/bizon/destinations/destination.py +0 -208
- bizon-0.0.4.dev2/bizon/sources/hubspot/tests/hubspot_benchmark.py +0 -26
- bizon-0.0.4.dev2/bizon/sources/hubspot/tests/hubspot_client.py +0 -25
- bizon-0.0.4.dev2/bizon/sources/hubspot/tests/hubspot_iteration.py +0 -30
- bizon-0.0.4.dev2/bizon/sources/hubspot/tests/hubspot_pipeline.py +0 -6
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/LICENSE +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/cli/__init__.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/cli/utils.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/common/errors/backoff.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/common/errors/errors.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/common/models.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/destinations/bigquery/src/config.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/destinations/bigquery/src/destination.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/destinations/bigquery/tests/test_bigquery_client.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/destinations/file/src/config.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/destinations/logger/src/config.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/destinations/logger/src/destination.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/engine/backend/adapters/sqlalchemy/config.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/engine/backend/config.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/engine/pipeline/consumer.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/engine/pipeline/models.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/engine/queue/adapters/kafka/config.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/engine/queue/adapters/kafka/queue.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/engine/queue/adapters/python_queue/config.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/engine/queue/adapters/python_queue/queue.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/engine/queue/adapters/rabbitmq/config.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/engine/queue/adapters/rabbitmq/queue.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/engine/queue/config.py +0 -0
- {bizon-0.0.4.dev2/bizon/engine/runners → bizon-0.0.4.dev3/bizon/engine/runner}/config.py +0 -0
- {bizon-0.0.4.dev2/bizon/engine/runners → bizon-0.0.4.dev3/bizon/engine/runner}/models.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/source/auth/authenticators/abstract_oauth.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/source/auth/authenticators/abstract_token.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/source/auth/authenticators/basic.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/source/auth/authenticators/cookies.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/source/auth/authenticators/oauth.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/source/auth/authenticators/token.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/source/auth/builder.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/source/auth/config.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/source/config.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/source/cursor.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/source/models.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/source/session.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/source/source.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/sources/dummy/config/api_key.example.yml +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/sources/dummy/config/api_key_kafka.example.yml +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/sources/dummy/src/fake_api.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/sources/dummy/src/source.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/sources/dummy/tests/dummy_pipeline.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/sources/dummy/tests/dummy_pipeline_bigquery_backend.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/sources/dummy/tests/dummy_pipeline_kafka.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/sources/dummy/tests/dummy_pipeline_rabbitmq.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/sources/dummy/tests/dummy_pipeline_write_data_bigquery.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/sources/dummy/tests/dummy_pipeline_write_data_bigquery_through_kafka.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/sources/gsheets/config/default_auth.example.yml +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/sources/gsheets/config/service_account.example.yml +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/sources/gsheets/src/source.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/sources/gsheets/tests/gsheets_pipeline.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/sources/hubspot/config/api_key.example.yml +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/sources/hubspot/config/oauth.example.yml +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/sources/hubspot/src/models/hs_object.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/sources/periscope/config/periscope_charts.example.yml +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/sources/periscope/config/periscope_dashboards.example.yml +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/sources/periscope/src/source.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/sources/periscope/tests/periscope_pipeline_charts.py +0 -0
- {bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/sources/periscope/tests/periscope_pipeline_dashboard.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: bizon
|
|
3
|
-
Version: 0.0.4.dev2
|
|
3
|
+
Version: 0.0.4.dev3
|
|
4
4
|
Summary: Extract and load your data reliably from API Clients with native fault-tolerant and checkpointing mechanism.
|
|
5
5
|
Author: Antoine Balliet
|
|
6
6
|
Author-email: antoine.balliet@gmail.com
|
|
@@ -45,7 +45,8 @@ Extract and load your largest data streams with a framework you can trust for bi
|
|
|
45
45
|
|
|
46
46
|
## Features
|
|
47
47
|
- **Natively fault-tolerant**: Bizon uses a checkpointing mechanism to keep track of the progress and recover from the last checkpoint.
|
|
48
|
-
- **
|
|
48
|
+
- **High throughput**: Bizon is designed to handle high throughput and can process billions of records.
|
|
49
|
+
- **Queue system agnostic**: Bizon is agnostic of the queuing system, you can use any queuing system among Python Queue, RabbitMQ, Kafka or Redpanda. Thanks to the `bizon.engine.queue.Queue` interface, adapters can be written for any queuing system.
|
|
49
50
|
- **Pipeline metrics**: Bizon provides exhaustive pipeline metrics and implement OpenTelemetry for tracing. You can monitor:
|
|
50
51
|
- ETAs for completion
|
|
51
52
|
- Number of records processed
|
|
@@ -64,6 +65,14 @@ pip install bizon
|
|
|
64
65
|
|
|
65
66
|
## Usage
|
|
66
67
|
|
|
68
|
+
### List available sources and streams
|
|
69
|
+
```bash
|
|
70
|
+
bizon source list
|
|
71
|
+
bizon stream list <source_name>
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
### Create a pipeline
|
|
75
|
+
|
|
67
76
|
Create a file named `config.yml` in your working directory with the following content:
|
|
68
77
|
|
|
69
78
|
```yaml
|
|
@@ -97,7 +106,8 @@ Backend is the interface used by Bizon to store its state. It can be configured
|
|
|
97
106
|
|
|
98
107
|
Queue is the interface used by Bizon to exchange data between `Source` and `Destination`. It can be configured in the `queue` section of the configuration file. The following queues are supported:
|
|
99
108
|
- `python_queue`: Python Queue, useful for testing and development.
|
|
100
|
-
- `
|
|
109
|
+
- `rabbitmq`: RabbitMQ, for production use and high throughput.
|
|
110
|
+
- `kafka`: Apache Kafka, for production use and high throughput and strong persistence.
|
|
101
111
|
|
|
102
112
|
## Start syncing your data 🚀
|
|
103
113
|
|
|
@@ -3,7 +3,8 @@ Extract and load your largest data streams with a framework you can trust for bi
|
|
|
3
3
|
|
|
4
4
|
## Features
|
|
5
5
|
- **Natively fault-tolerant**: Bizon uses a checkpointing mechanism to keep track of the progress and recover from the last checkpoint.
|
|
6
|
-
- **
|
|
6
|
+
- **High throughput**: Bizon is designed to handle high throughput and can process billions of records.
|
|
7
|
+
- **Queue system agnostic**: Bizon is agnostic of the queuing system, you can use any queuing system among Python Queue, RabbitMQ, Kafka or Redpanda. Thanks to the `bizon.engine.queue.Queue` interface, adapters can be written for any queuing system.
|
|
7
8
|
- **Pipeline metrics**: Bizon provides exhaustive pipeline metrics and implement OpenTelemetry for tracing. You can monitor:
|
|
8
9
|
- ETAs for completion
|
|
9
10
|
- Number of records processed
|
|
@@ -22,6 +23,14 @@ pip install bizon
|
|
|
22
23
|
|
|
23
24
|
## Usage
|
|
24
25
|
|
|
26
|
+
### List available sources and streams
|
|
27
|
+
```bash
|
|
28
|
+
bizon source list
|
|
29
|
+
bizon stream list <source_name>
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
### Create a pipeline
|
|
33
|
+
|
|
25
34
|
Create a file named `config.yml` in your working directory with the following content:
|
|
26
35
|
|
|
27
36
|
```yaml
|
|
@@ -55,7 +64,8 @@ Backend is the interface used by Bizon to store its state. It can be configured
|
|
|
55
64
|
|
|
56
65
|
Queue is the interface used by Bizon to exchange data between `Source` and `Destination`. It can be configured in the `queue` section of the configuration file. The following queues are supported:
|
|
57
66
|
- `python_queue`: Python Queue, useful for testing and development.
|
|
58
|
-
- `
|
|
67
|
+
- `rabbitmq`: RabbitMQ, for production use and high throughput.
|
|
68
|
+
- `kafka`: Apache Kafka, for production use and high throughput and strong persistence.
|
|
59
69
|
|
|
60
70
|
## Start syncing your data 🚀
|
|
61
71
|
|
|
@@ -50,7 +50,7 @@ def stream():
|
|
|
50
50
|
|
|
51
51
|
@stream.command()
|
|
52
52
|
@click.argument("source_name", type=click.STRING)
|
|
53
|
-
def list(source_name: str):
|
|
53
|
+
def list(source_name: str): # noqa
|
|
54
54
|
"""List available streams for a source."""
|
|
55
55
|
sources = discover_all_sources()
|
|
56
56
|
source_model = sources.get(source_name)
|
|
@@ -114,6 +114,8 @@ def run(filename: str, custom_source: str, runner: str, debug, help="Run a bizon
|
|
|
114
114
|
runner = RunnerFactory.create_from_config_dict(config=config)
|
|
115
115
|
runner.run()
|
|
116
116
|
|
|
117
|
+
click.echo("Pipeline finished.")
|
|
118
|
+
|
|
117
119
|
|
|
118
120
|
if __name__ == "__main__":
|
|
119
121
|
cli()
|
{bizon-0.0.4.dev2 → bizon-0.0.4.dev3}/bizon/destinations/bigquery/config/bigquery.example.yml
RENAMED
|
@@ -13,6 +13,8 @@ destination:
|
|
|
13
13
|
# Must have the bigquery.dataEditor and storage.objectUser on the supplied dataset and bucket
|
|
14
14
|
name: bigquery
|
|
15
15
|
config:
|
|
16
|
+
buffer_size: 10 # in Mb
|
|
17
|
+
buffer_flush_timeout: 300 # in seconds
|
|
16
18
|
dataset_id: bizon_test
|
|
17
19
|
dataset_location: US
|
|
18
20
|
project_id: my-gcp-project-id
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
from datetime import datetime
|
|
3
|
+
from typing import List
|
|
4
|
+
|
|
5
|
+
from bizon.destinations.models import DestinationRecord
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class DestinationBuffer:
|
|
9
|
+
|
|
10
|
+
def __init__(self, buffer_size: int, buffer_flush_timeout: int) -> None:
|
|
11
|
+
self.buffer_size = buffer_size * 1024 * 1024 # Convert to bytes
|
|
12
|
+
self.buffer_flush_timeout = buffer_flush_timeout
|
|
13
|
+
self.records: List[DestinationRecord] = []
|
|
14
|
+
self._iterations: List[int] = []
|
|
15
|
+
self.pagination = {}
|
|
16
|
+
self.modified_at: List[datetime] = [datetime.utcnow()]
|
|
17
|
+
|
|
18
|
+
@property
|
|
19
|
+
def current_size(self) -> int:
|
|
20
|
+
"""Return buffer size"""
|
|
21
|
+
return sys.getsizeof(self.records)
|
|
22
|
+
|
|
23
|
+
@property
|
|
24
|
+
def buffer_free_space_pct(self) -> float:
|
|
25
|
+
"""Return free space in buffer in percentage"""
|
|
26
|
+
return round((self.buffer_free_space / self.buffer_size) * 100, 3)
|
|
27
|
+
|
|
28
|
+
@property
|
|
29
|
+
def from_iteration(self) -> int:
|
|
30
|
+
"""Return the smallest iteration in buffer"""
|
|
31
|
+
if not self._iterations:
|
|
32
|
+
raise ValueError("Buffer is empty")
|
|
33
|
+
return min(self._iterations)
|
|
34
|
+
|
|
35
|
+
@property
|
|
36
|
+
def to_iteration(self) -> int:
|
|
37
|
+
"""Return the largest iteration in buffer"""
|
|
38
|
+
if not self._iterations:
|
|
39
|
+
raise ValueError("Buffer is empty")
|
|
40
|
+
return max(self._iterations)
|
|
41
|
+
|
|
42
|
+
@property
|
|
43
|
+
def buffer_free_space(self) -> int:
|
|
44
|
+
"""Return free space for records in buffer"""
|
|
45
|
+
assert self.current_size <= self.buffer_size, "Buffer size exceeded"
|
|
46
|
+
return self.buffer_size - self.current_size
|
|
47
|
+
|
|
48
|
+
@property
|
|
49
|
+
def ripeness(self) -> float:
|
|
50
|
+
"""Return buffer ripeness"""
|
|
51
|
+
if self.buffer_flush_timeout == 0:
|
|
52
|
+
return 0
|
|
53
|
+
return round((max(self.modified_at) - min(self.modified_at)).seconds, 2)
|
|
54
|
+
|
|
55
|
+
@property
|
|
56
|
+
def is_ripe(self) -> bool:
|
|
57
|
+
"""Check if buffer is ripe for flushing based on the timeout"""
|
|
58
|
+
if self.buffer_flush_timeout == 0:
|
|
59
|
+
return False
|
|
60
|
+
return (max(self.modified_at) - min(self.modified_at)).seconds >= self.buffer_flush_timeout
|
|
61
|
+
|
|
62
|
+
def flush(self):
|
|
63
|
+
"""Flush buffer"""
|
|
64
|
+
self.records = []
|
|
65
|
+
self._iterations = []
|
|
66
|
+
self.pagination = {}
|
|
67
|
+
self.modified_at = []
|
|
68
|
+
|
|
69
|
+
def add_source_iteration_records_to_buffer(
|
|
70
|
+
self, iteration: int, records: List[DestinationRecord], pagination: dict = None
|
|
71
|
+
):
|
|
72
|
+
"""Add records for the given iteration to buffer"""
|
|
73
|
+
self.records.extend(records)
|
|
74
|
+
self._iterations.append(iteration)
|
|
75
|
+
self.pagination = pagination
|
|
76
|
+
self.modified_at.append(datetime.utcnow())
|
|
@@ -20,7 +20,14 @@ class NormalizationConfig(BaseModel):
|
|
|
20
20
|
|
|
21
21
|
|
|
22
22
|
class AbstractDestinationDetailsConfig(BaseModel):
|
|
23
|
-
buffer_size: int = Field(
|
|
23
|
+
buffer_size: int = Field(
|
|
24
|
+
default=50,
|
|
25
|
+
description="Buffer size in Mb for the destination. Set to 0 to disable and write directly to the destination.",
|
|
26
|
+
)
|
|
27
|
+
buffer_flush_timeout: int = Field(
|
|
28
|
+
default=600,
|
|
29
|
+
description="Maximum time in seconds for buffering after which the records will be written to the destination. Set to 0 to deactivate the timeout buffer check.",
|
|
30
|
+
)
|
|
24
31
|
normalization: Optional[NormalizationConfig] = Field(
|
|
25
32
|
description="Normalization configuration, by default no normalization is applied",
|
|
26
33
|
default=NormalizationConfig(type=NormalizationType.NONE),
|
|
@@ -0,0 +1,260 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
from abc import ABC, abstractmethod
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
from enum import Enum
|
|
5
|
+
from typing import List, Optional, Tuple, Union
|
|
6
|
+
|
|
7
|
+
from loguru import logger
|
|
8
|
+
from pydantic import BaseModel, Field
|
|
9
|
+
|
|
10
|
+
from bizon.common.models import SyncMetadata
|
|
11
|
+
from bizon.engine.backend.backend import AbstractBackend
|
|
12
|
+
from bizon.engine.backend.models import JobStatus
|
|
13
|
+
from bizon.source.config import SourceSyncModes
|
|
14
|
+
from bizon.source.models import SourceRecord
|
|
15
|
+
|
|
16
|
+
from .buffer import DestinationBuffer
|
|
17
|
+
from .config import (
|
|
18
|
+
AbstractDestinationConfig,
|
|
19
|
+
AbstractDestinationDetailsConfig,
|
|
20
|
+
DestinationTypes,
|
|
21
|
+
)
|
|
22
|
+
from .models import DestinationRecord
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class DestinationBufferStatus(str, Enum):
|
|
26
|
+
"""Destination buffer status"""
|
|
27
|
+
|
|
28
|
+
RECORDS_WRITTEN = "RECORDS_WRITTEN"
|
|
29
|
+
RECORDS_WRITTEN_THEN_BUFFERED = "RECORDS_WRITTEN_THEN_BUFFERED"
|
|
30
|
+
RECORDS_BUFFERED = "RECORDS_BUFFERED"
|
|
31
|
+
NO_RECORDS = "NO_RECORDS"
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class DestinationIteration(BaseModel):
|
|
35
|
+
success: bool = Field(..., description="Success status of the iteration")
|
|
36
|
+
error_message: Optional[str] = Field(None, description="Error message if iteration failed")
|
|
37
|
+
records_written: int = Field(0, description="Number of records written to the destination")
|
|
38
|
+
from_source_iteration: Optional[int] = Field(None, description="From source iteration identifier buffer starts")
|
|
39
|
+
to_source_iteration: Optional[int] = Field(
|
|
40
|
+
None, description="To source iteration identifier buffer ends, inclusive"
|
|
41
|
+
)
|
|
42
|
+
pagination: Optional[dict] = Field(None, description="Source pagination for next interation recovery purposes")
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class AbstractDestination(ABC):
|
|
46
|
+
|
|
47
|
+
def __init__(self, sync_metadata: SyncMetadata, config: AbstractDestinationDetailsConfig, backend: AbstractBackend):
|
|
48
|
+
self.sync_metadata = sync_metadata
|
|
49
|
+
self.config = config
|
|
50
|
+
self.backend = backend
|
|
51
|
+
self.buffer = DestinationBuffer(
|
|
52
|
+
buffer_size=self.config.buffer_size, buffer_flush_timeout=self.config.buffer_flush_timeout
|
|
53
|
+
)
|
|
54
|
+
|
|
55
|
+
@abstractmethod
|
|
56
|
+
def check_connection(self) -> bool:
|
|
57
|
+
pass
|
|
58
|
+
|
|
59
|
+
@abstractmethod
|
|
60
|
+
def delete_table(self) -> bool:
|
|
61
|
+
"""Delete table in destination"""
|
|
62
|
+
pass
|
|
63
|
+
|
|
64
|
+
@abstractmethod
|
|
65
|
+
def write_records(self, destination_records: List[DestinationRecord]) -> Tuple[bool, Union[str, None]]:
|
|
66
|
+
"""Write records to destination for the given iteration and return success status and error message"""
|
|
67
|
+
pass
|
|
68
|
+
|
|
69
|
+
@staticmethod
|
|
70
|
+
def convert_source_records_to_destination_records(
|
|
71
|
+
source_records: List[SourceRecord], extracted_at: datetime
|
|
72
|
+
) -> List[DestinationRecord]:
|
|
73
|
+
"""Convert source records to destination records"""
|
|
74
|
+
return [
|
|
75
|
+
DestinationRecord.from_source_record(source_record=source_record, extracted_at=extracted_at)
|
|
76
|
+
for source_record in source_records
|
|
77
|
+
]
|
|
78
|
+
|
|
79
|
+
def prepare_destination(self):
|
|
80
|
+
"""Prepare destination before writing records"""
|
|
81
|
+
# Delete table if full refresh
|
|
82
|
+
if self.sync_metadata.sync_mode == SourceSyncModes.FULL_REFRESH:
|
|
83
|
+
self.delete_table()
|
|
84
|
+
|
|
85
|
+
def buffer_flush_handler(self, session=None) -> DestinationIteration:
|
|
86
|
+
nb_records_to_write = len(self.buffer.records)
|
|
87
|
+
# Initialize destination iteration
|
|
88
|
+
destination_iteration = DestinationIteration(
|
|
89
|
+
success=False,
|
|
90
|
+
records_written=0,
|
|
91
|
+
pagination=self.buffer.pagination,
|
|
92
|
+
)
|
|
93
|
+
success, error_msg = self.write_records(destination_records=self.buffer.records)
|
|
94
|
+
if success:
|
|
95
|
+
destination_iteration.records_written = nb_records_to_write
|
|
96
|
+
self.backend.update_stream_job_status(
|
|
97
|
+
job_id=self.sync_metadata.job_id,
|
|
98
|
+
job_status=JobStatus.SUCCEEDED,
|
|
99
|
+
session=session,
|
|
100
|
+
)
|
|
101
|
+
|
|
102
|
+
else:
|
|
103
|
+
self.backend.update_stream_job_status(
|
|
104
|
+
job_id=self.sync_metadata.job_id,
|
|
105
|
+
job_status=JobStatus.FAILED,
|
|
106
|
+
error_message=error_msg,
|
|
107
|
+
session=session,
|
|
108
|
+
)
|
|
109
|
+
|
|
110
|
+
destination_iteration.success = success
|
|
111
|
+
destination_iteration.from_source_iteration = self.buffer.from_iteration
|
|
112
|
+
destination_iteration.to_source_iteration = self.buffer.to_iteration
|
|
113
|
+
destination_iteration.pagination = self.buffer.pagination
|
|
114
|
+
|
|
115
|
+
# Update destination cursor
|
|
116
|
+
self.create_cursors(destination_iteration=destination_iteration)
|
|
117
|
+
|
|
118
|
+
def write_or_buffer_records(
|
|
119
|
+
self,
|
|
120
|
+
destination_records: List[DestinationRecord],
|
|
121
|
+
iteration: int,
|
|
122
|
+
last_iteration: bool = False,
|
|
123
|
+
session=None,
|
|
124
|
+
pagination: dict = None,
|
|
125
|
+
) -> DestinationBufferStatus:
|
|
126
|
+
"""Write records to destination or buffer them for the given iteration"""
|
|
127
|
+
|
|
128
|
+
# Last iteration, write all records to destination
|
|
129
|
+
if last_iteration:
|
|
130
|
+
logger.debug("Writing last iteration records to destination")
|
|
131
|
+
assert len(destination_records) == 0, "Last iteration should not have any records"
|
|
132
|
+
self.buffer_flush_handler(session=session)
|
|
133
|
+
self.buffer.flush()
|
|
134
|
+
return DestinationBufferStatus.RECORDS_WRITTEN
|
|
135
|
+
|
|
136
|
+
# Don't write empty records to destination
|
|
137
|
+
if len(destination_records) == 0 and not last_iteration:
|
|
138
|
+
logger.warning("No records to write to destination. Check source and queue provider.")
|
|
139
|
+
return DestinationBufferStatus.NO_RECORDS
|
|
140
|
+
|
|
141
|
+
# Write records to destination if buffer size is 0
|
|
142
|
+
if self.buffer.buffer_size == 0:
|
|
143
|
+
logger.debug("Writing last iteration records to destination")
|
|
144
|
+
self.buffer.add_source_iteration_records_to_buffer(
|
|
145
|
+
iteration=iteration, records=destination_records, pagination=pagination
|
|
146
|
+
)
|
|
147
|
+
self.buffer_flush_handler(session=session)
|
|
148
|
+
self.buffer.flush()
|
|
149
|
+
return DestinationBufferStatus.RECORDS_WRITTEN
|
|
150
|
+
|
|
151
|
+
logger.info(f"Buffer free space {self.buffer.buffer_free_space_pct}%")
|
|
152
|
+
logger.info(f"Buffer current size {self.buffer.current_size} bytes")
|
|
153
|
+
logger.info(
|
|
154
|
+
f"Buffer ripeness {self.buffer.ripeness / 60} min. Max ripeness {self.buffer.buffer_flush_timeout / 60} min." # noqa
|
|
155
|
+
)
|
|
156
|
+
|
|
157
|
+
# Write buffer to destination if buffer is ripe and create a new buffer for the new iteration
|
|
158
|
+
if self.buffer.is_ripe:
|
|
159
|
+
logger.debug(
|
|
160
|
+
f"Buffer is ripe (buffering for longer than buffer_flush_timeout: {self.buffer.buffer_flush_timeout} seconds), writing buffer to destination" # noqa
|
|
161
|
+
)
|
|
162
|
+
self.buffer_flush_handler(session=session)
|
|
163
|
+
self.buffer.flush()
|
|
164
|
+
self.buffer.add_source_iteration_records_to_buffer(
|
|
165
|
+
iteration=iteration, records=destination_records, pagination=pagination
|
|
166
|
+
)
|
|
167
|
+
return DestinationBufferStatus.RECORDS_WRITTEN_THEN_BUFFERED
|
|
168
|
+
|
|
169
|
+
# Buffer can hold all records from this iteration
|
|
170
|
+
elif self.buffer.buffer_free_space >= sys.getsizeof(destination_records):
|
|
171
|
+
self.buffer.add_source_iteration_records_to_buffer(
|
|
172
|
+
iteration=iteration, records=destination_records, pagination=pagination
|
|
173
|
+
)
|
|
174
|
+
return DestinationBufferStatus.RECORDS_BUFFERED
|
|
175
|
+
|
|
176
|
+
# Buffer can contain some records from this iteration
|
|
177
|
+
# For now we will write all records to destination and then buffer the remaining records
|
|
178
|
+
else:
|
|
179
|
+
self.buffer_flush_handler(session=session)
|
|
180
|
+
self.buffer.flush()
|
|
181
|
+
self.buffer.add_source_iteration_records_to_buffer(
|
|
182
|
+
iteration=iteration, records=destination_records, pagination=pagination
|
|
183
|
+
)
|
|
184
|
+
return DestinationBufferStatus.RECORDS_WRITTEN_THEN_BUFFERED
|
|
185
|
+
|
|
186
|
+
def create_cursors(self, destination_iteration: DestinationIteration):
|
|
187
|
+
self.backend.create_destination_cursor(
|
|
188
|
+
job_id=self.sync_metadata.job_id,
|
|
189
|
+
source_name=self.sync_metadata.source_name,
|
|
190
|
+
stream_name=self.sync_metadata.stream_name,
|
|
191
|
+
destination_name=self.sync_metadata.destination_name,
|
|
192
|
+
from_source_iteration=destination_iteration.from_source_iteration,
|
|
193
|
+
to_source_iteration=destination_iteration.to_source_iteration,
|
|
194
|
+
rows_written=destination_iteration.records_written,
|
|
195
|
+
pagination=destination_iteration.pagination,
|
|
196
|
+
success=destination_iteration.success,
|
|
197
|
+
)
|
|
198
|
+
|
|
199
|
+
def write_records_and_update_cursor(
|
|
200
|
+
self,
|
|
201
|
+
source_records: List[SourceRecord],
|
|
202
|
+
iteration: int,
|
|
203
|
+
extracted_at: datetime,
|
|
204
|
+
last_iteration: bool = False,
|
|
205
|
+
pagination: dict = None,
|
|
206
|
+
) -> bool:
|
|
207
|
+
"""
|
|
208
|
+
Write records to destination and update the cursor for the given iteration.
|
|
209
|
+
Stores the source pagination for recovery purposes.
|
|
210
|
+
"""
|
|
211
|
+
|
|
212
|
+
# Case when producer failed to fetch data from first iteration
|
|
213
|
+
if iteration == 0 and len(source_records) == 0:
|
|
214
|
+
logger.warning("Source failed to fetch data from the first iteration, no records will be written.")
|
|
215
|
+
return False
|
|
216
|
+
|
|
217
|
+
# Prepare destination
|
|
218
|
+
if iteration == 0:
|
|
219
|
+
self.prepare_destination()
|
|
220
|
+
|
|
221
|
+
destination_records = self.convert_source_records_to_destination_records(
|
|
222
|
+
source_records=source_records,
|
|
223
|
+
extracted_at=extracted_at,
|
|
224
|
+
)
|
|
225
|
+
|
|
226
|
+
# Buffer records otherwise write to destination
|
|
227
|
+
self.write_or_buffer_records(
|
|
228
|
+
destination_records=destination_records,
|
|
229
|
+
iteration=iteration,
|
|
230
|
+
last_iteration=last_iteration,
|
|
231
|
+
pagination=pagination,
|
|
232
|
+
)
|
|
233
|
+
|
|
234
|
+
return True
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
class DestinationFactory:
|
|
238
|
+
@staticmethod
|
|
239
|
+
def get_destination(
|
|
240
|
+
sync_metadata: SyncMetadata,
|
|
241
|
+
config: AbstractDestinationConfig,
|
|
242
|
+
backend: AbstractBackend,
|
|
243
|
+
) -> AbstractDestination:
|
|
244
|
+
|
|
245
|
+
if config.name == DestinationTypes.LOGGER:
|
|
246
|
+
from .logger.src.destination import LoggerDestination
|
|
247
|
+
|
|
248
|
+
return LoggerDestination(sync_metadata=sync_metadata, config=config.config, backend=backend)
|
|
249
|
+
|
|
250
|
+
elif config.name == DestinationTypes.BIGQUERY:
|
|
251
|
+
from .bigquery.src.destination import BigQueryDestination
|
|
252
|
+
|
|
253
|
+
return BigQueryDestination(sync_metadata=sync_metadata, config=config.config, backend=backend)
|
|
254
|
+
|
|
255
|
+
elif config.name == DestinationTypes.FILE:
|
|
256
|
+
from .file.src.destination import FileDestination
|
|
257
|
+
|
|
258
|
+
return FileDestination(sync_metadata=sync_metadata, config=config.config, backend=backend)
|
|
259
|
+
|
|
260
|
+
raise ValueError(f"Destination {config.name}" f"with params {config} not found")
|
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
import json
|
|
2
1
|
from typing import List, Tuple
|
|
3
2
|
|
|
4
3
|
from loguru import logger
|
|
@@ -26,6 +25,5 @@ class FileDestination(AbstractDestination):
|
|
|
26
25
|
def write_records(self, destination_records: List[DestinationRecord]) -> Tuple[bool, str]:
|
|
27
26
|
with open(self.config.filepath, "a") as f:
|
|
28
27
|
for record in destination_records:
|
|
29
|
-
|
|
30
|
-
f.write("\n")
|
|
28
|
+
f.write(record.model_dump_json() + "\n")
|
|
31
29
|
return True, ""
|
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
import json
|
|
2
2
|
from datetime import datetime
|
|
3
|
-
from typing import Optional
|
|
4
3
|
from uuid import uuid4
|
|
5
4
|
|
|
6
5
|
from pydantic import BaseModel, Field
|
|
@@ -9,14 +8,6 @@ from pytz import UTC
|
|
|
9
8
|
from bizon.source.models import SourceRecord
|
|
10
9
|
|
|
11
10
|
|
|
12
|
-
class DestinationIteration(BaseModel):
|
|
13
|
-
success: bool = Field(..., description="Success status of the iteration")
|
|
14
|
-
error_message: Optional[str] = Field(None, description="Error message if iteration failed")
|
|
15
|
-
records_written: int = Field(0, description="Number of records written to the destination")
|
|
16
|
-
from_source_iteration: Optional[int] = Field(None, description="From source iteration identifier buffer starts")
|
|
17
|
-
to_source_iteration: Optional[int] = Field(None, description="To source iteration identifier buffer ends")
|
|
18
|
-
|
|
19
|
-
|
|
20
11
|
class DestinationRecord(BaseModel):
|
|
21
12
|
bizon_id: str = Field(..., description="Bizon unique identifier of the record")
|
|
22
13
|
bizon_extracted_at: datetime = Field(..., description="Datetime when the record was extracted")
|
|
@@ -4,7 +4,7 @@ from typing import Generator, Optional, Union
|
|
|
4
4
|
|
|
5
5
|
from loguru import logger
|
|
6
6
|
from pytz import UTC
|
|
7
|
-
from sqlalchemy import Result, Select, create_engine, inspect, select, update
|
|
7
|
+
from sqlalchemy import Result, Select, create_engine, func, inspect, select, update
|
|
8
8
|
from sqlalchemy.engine import Engine, create_engine
|
|
9
9
|
from sqlalchemy.orm import Session, scoped_session, sessionmaker
|
|
10
10
|
|
|
@@ -27,7 +27,7 @@ from .config import BigQueryConfigDetails, PostgresConfigDetails, SQLiteConfigDe
|
|
|
27
27
|
|
|
28
28
|
class SQLAlchemyBackend(AbstractBackend):
|
|
29
29
|
|
|
30
|
-
def __init__(self, config: Union[PostgresConfigDetails, SQLiteConfigDetails], type: BackendTypes):
|
|
30
|
+
def __init__(self, config: Union[PostgresConfigDetails, SQLiteConfigDetails], type: BackendTypes, **kwargs):
|
|
31
31
|
super().__init__(config, type)
|
|
32
32
|
|
|
33
33
|
self._engine = None
|
|
@@ -38,6 +38,8 @@ class SQLAlchemyBackend(AbstractBackend):
|
|
|
38
38
|
BigQueryConfigDetails,
|
|
39
39
|
] = config
|
|
40
40
|
|
|
41
|
+
self.kwargs = kwargs
|
|
42
|
+
|
|
41
43
|
@property
|
|
42
44
|
def session(self) -> Generator[Session, None, None]:
|
|
43
45
|
"""yields a SQLAlchemy connection"""
|
|
@@ -125,7 +127,8 @@ class SQLAlchemyBackend(AbstractBackend):
|
|
|
125
127
|
def get_engine(self) -> Engine:
|
|
126
128
|
"""Return the SQLAlchemy engine"""
|
|
127
129
|
if self.type == BackendTypes.SQLITE:
|
|
128
|
-
|
|
130
|
+
assert "sqlalchemy_engine" in self.kwargs
|
|
131
|
+
return self.kwargs["sqlalchemy_engine"]
|
|
129
132
|
|
|
130
133
|
if not self._engine:
|
|
131
134
|
self._engine = self._get_engine()
|
|
@@ -310,20 +313,16 @@ class SQLAlchemyBackend(AbstractBackend):
|
|
|
310
313
|
logger.warning(f"No cursor found for id={cursor_id}")
|
|
311
314
|
return None
|
|
312
315
|
|
|
313
|
-
def
|
|
314
|
-
|
|
315
|
-
) -> Optional[SourceCursor]:
|
|
316
|
-
"""Get the last cursor we should start source from for the given job id
|
|
317
|
-
The cursor should have status CursorStatus.WRITTEN_IN_DESTINATION
|
|
318
|
-
"""
|
|
316
|
+
def get_last_cursor_by_job_id(self, job_id: str, session: Optional[Session] = None) -> Optional[SourceCursor]:
|
|
317
|
+
"""Get the last cursor we should start source from for the given job id"""
|
|
319
318
|
|
|
320
319
|
smt = (
|
|
321
|
-
select(
|
|
320
|
+
select(DestinationCursor)
|
|
322
321
|
.filter(
|
|
323
|
-
|
|
324
|
-
|
|
322
|
+
DestinationCursor.job_id == job_id,
|
|
323
|
+
DestinationCursor.success == True, # noqa
|
|
325
324
|
)
|
|
326
|
-
.order_by(
|
|
325
|
+
.order_by(DestinationCursor.to_source_iteration.desc())
|
|
327
326
|
)
|
|
328
327
|
|
|
329
328
|
results = self._execute(smt, session=session).first()
|
|
@@ -355,8 +354,6 @@ class SQLAlchemyBackend(AbstractBackend):
|
|
|
355
354
|
logger.warning(f"No last cursor found for source={source_name} stream={stream_name} iteration={iteration}")
|
|
356
355
|
return None
|
|
357
356
|
|
|
358
|
-
### DESTINATION CURSOR ###
|
|
359
|
-
|
|
360
357
|
def create_destination_cursor(
|
|
361
358
|
self,
|
|
362
359
|
job_id: str,
|
|
@@ -366,6 +363,8 @@ class SQLAlchemyBackend(AbstractBackend):
|
|
|
366
363
|
from_source_iteration: int,
|
|
367
364
|
to_source_iteration: int,
|
|
368
365
|
rows_written: int,
|
|
366
|
+
success: bool,
|
|
367
|
+
pagination: Optional[dict] = None,
|
|
369
368
|
session: Session | None = None,
|
|
370
369
|
) -> DestinationCursor:
|
|
371
370
|
|
|
@@ -377,30 +376,13 @@ class SQLAlchemyBackend(AbstractBackend):
|
|
|
377
376
|
from_source_iteration=from_source_iteration,
|
|
378
377
|
to_source_iteration=to_source_iteration,
|
|
379
378
|
rows_written=rows_written,
|
|
379
|
+
pagination=json.dumps(pagination) if pagination else None,
|
|
380
|
+
success=success,
|
|
380
381
|
)
|
|
381
382
|
new_destination_cursor = self._add_and_commit(destination_cursor, session=session)
|
|
382
383
|
logger.debug(f"New Destination Cursor has been created: {new_destination_cursor}")
|
|
383
384
|
return new_destination_cursor
|
|
384
385
|
|
|
385
|
-
def update_destination_cursor(
|
|
386
|
-
self,
|
|
387
|
-
cursor_id: str,
|
|
388
|
-
rows_written: int,
|
|
389
|
-
status: CursorStatus,
|
|
390
|
-
error_message: Optional[str] = None,
|
|
391
|
-
session: Session | None = None,
|
|
392
|
-
):
|
|
393
|
-
"""Update the status of the stream job with the given id"""
|
|
394
|
-
stmt = (
|
|
395
|
-
update(DestinationCursor)
|
|
396
|
-
.where(DestinationCursor.id == cursor_id)
|
|
397
|
-
.values(
|
|
398
|
-
status=status, error_message=error_message, rows_written=rows_written, updated_at=datetime.now(tz=UTC)
|
|
399
|
-
)
|
|
400
|
-
.execution_options(synchronize_session="fetch")
|
|
401
|
-
)
|
|
402
|
-
self._execute(stmt, session=session)
|
|
403
|
-
|
|
404
386
|
def get_destination_cursor_by_id(
|
|
405
387
|
self, cursor_id: str, session: Optional[Session] = None
|
|
406
388
|
) -> Optional[DestinationCursor]:
|
|
@@ -414,3 +396,15 @@ class SQLAlchemyBackend(AbstractBackend):
|
|
|
414
396
|
return results[0]
|
|
415
397
|
logger.warning(f"No job found for id={cursor_id}")
|
|
416
398
|
return None
|
|
399
|
+
|
|
400
|
+
def get_number_of_written_rows_for_job(self, job_id: str, session: Optional[Session] = None) -> Optional[int]:
|
|
401
|
+
"""Get the number of written rows for the given job"""
|
|
402
|
+
smt = select(func.sum(DestinationCursor.rows_written)).filter(
|
|
403
|
+
DestinationCursor.job_id == job_id,
|
|
404
|
+
DestinationCursor.success == True, # noqa
|
|
405
|
+
)
|
|
406
|
+
results = self._execute(smt, session=session).one_or_none()
|
|
407
|
+
if results:
|
|
408
|
+
return results[0]
|
|
409
|
+
else:
|
|
410
|
+
raise Exception(f"No rows written found for job_id={job_id}")
|