bizon 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bizon/alerting/__init__.py +0 -0
- bizon/alerting/alerts.py +23 -0
- bizon/alerting/models.py +28 -0
- bizon/alerting/slack/__init__.py +0 -0
- bizon/alerting/slack/config.py +5 -0
- bizon/alerting/slack/handler.py +39 -0
- bizon/cli/main.py +7 -3
- bizon/common/models.py +33 -7
- bizon/{destinations → connectors/destinations}/bigquery/config/bigquery.example.yml +3 -4
- bizon/connectors/destinations/bigquery/src/config.py +128 -0
- bizon/{destinations → connectors/destinations}/bigquery/src/destination.py +48 -25
- bizon/connectors/destinations/bigquery_streaming/src/config.py +57 -0
- bizon/connectors/destinations/bigquery_streaming/src/destination.py +377 -0
- bizon/connectors/destinations/bigquery_streaming_v2/src/config.py +57 -0
- bizon/connectors/destinations/bigquery_streaming_v2/src/destination.py +446 -0
- bizon/{destinations/bigquery_streaming → connectors/destinations/bigquery_streaming_v2}/src/proto_utils.py +30 -36
- bizon/{destinations → connectors/destinations}/file/src/config.py +9 -3
- bizon/connectors/destinations/file/src/destination.py +56 -0
- bizon/{destinations → connectors/destinations}/logger/src/config.py +2 -1
- bizon/{destinations → connectors/destinations}/logger/src/destination.py +18 -3
- bizon/connectors/sources/cycle/config/cycle.example.yml +15 -0
- bizon/connectors/sources/cycle/src/source.py +133 -0
- bizon/{sources/periscope/tests/periscope_pipeline_dashboard.py → connectors/sources/cycle/tests/cycle_customers.py} +1 -1
- bizon/{sources → connectors/sources}/dummy/config/dummy.example.yml +2 -2
- bizon/{sources → connectors/sources}/dummy/src/fake_api.py +6 -1
- bizon/{sources → connectors/sources}/dummy/src/source.py +18 -5
- bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline.py +2 -2
- bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_bigquery_backend.py +2 -2
- bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_kafka.py +2 -2
- bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_rabbitmq.py +2 -2
- bizon/connectors/sources/dummy/tests/dummy_pipeline_unnest.py +29 -0
- bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_write_data_bigquery.py +2 -2
- bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_write_data_bigquery_through_kafka.py +2 -2
- bizon/{sources → connectors/sources}/gsheets/config/default_auth.example.yml +4 -2
- bizon/{sources → connectors/sources}/gsheets/config/service_account.example.yml +4 -2
- bizon/{sources → connectors/sources}/hubspot/config/api_key.example.yml +4 -2
- bizon/{sources → connectors/sources}/hubspot/config/oauth.example.yml +4 -2
- bizon/{sources → connectors/sources}/hubspot/src/hubspot_objects.py +1 -1
- bizon/{sources → connectors/sources}/kafka/config/kafka.example.yml +3 -5
- bizon/connectors/sources/kafka/config/kafka_debezium.example.yml +110 -0
- bizon/connectors/sources/kafka/src/callback.py +18 -0
- bizon/connectors/sources/kafka/src/config.py +69 -0
- bizon/connectors/sources/kafka/src/decode.py +93 -0
- bizon/connectors/sources/kafka/src/source.py +381 -0
- bizon/connectors/sources/kafka/tests/kafka_pipeline.py +7 -0
- bizon/connectors/sources/periscope/config/periscope_charts.example.yml +20 -0
- bizon/connectors/sources/periscope/config/periscope_dashboards.example.yml +20 -0
- bizon/{sources → connectors/sources}/periscope/src/source.py +137 -13
- bizon/connectors/sources/periscope/tests/periscope_pipeline_dashboard.py +9 -0
- bizon/connectors/sources/pokeapi/config/pokeapi_pokemon_to_json.example.yml +19 -0
- bizon/connectors/sources/pokeapi/config/pokeapi_pokemon_to_logger.example.yml +10 -0
- bizon/connectors/sources/pokeapi/src/source.py +79 -0
- bizon/{destinations → destination}/buffer.py +5 -0
- bizon/destination/config.py +83 -0
- bizon/{destinations → destination}/destination.py +103 -15
- bizon/engine/backend/adapters/sqlalchemy/backend.py +3 -1
- bizon/engine/engine.py +20 -1
- bizon/engine/pipeline/consumer.py +73 -5
- bizon/engine/pipeline/models.py +8 -3
- bizon/engine/pipeline/producer.py +18 -9
- bizon/engine/queue/adapters/kafka/consumer.py +2 -2
- bizon/engine/queue/adapters/kafka/queue.py +3 -2
- bizon/engine/queue/adapters/python_queue/consumer.py +40 -23
- bizon/engine/queue/adapters/python_queue/queue.py +19 -9
- bizon/engine/queue/adapters/rabbitmq/consumer.py +3 -6
- bizon/engine/queue/adapters/rabbitmq/queue.py +3 -2
- bizon/engine/queue/config.py +16 -0
- bizon/engine/queue/queue.py +17 -16
- bizon/engine/runner/adapters/process.py +15 -2
- bizon/engine/runner/adapters/streaming.py +121 -0
- bizon/engine/runner/adapters/thread.py +32 -9
- bizon/engine/runner/config.py +28 -0
- bizon/engine/runner/runner.py +113 -24
- bizon/monitoring/__init__.py +0 -0
- bizon/monitoring/config.py +39 -0
- bizon/monitoring/datadog/__init__.py +0 -0
- bizon/monitoring/datadog/monitor.py +153 -0
- bizon/monitoring/monitor.py +71 -0
- bizon/monitoring/noop/__init__.py +0 -0
- bizon/monitoring/noop/monitor.py +30 -0
- bizon/source/callback.py +24 -0
- bizon/source/config.py +3 -3
- bizon/source/cursor.py +1 -1
- bizon/source/discover.py +4 -3
- bizon/source/models.py +4 -2
- bizon/source/source.py +10 -2
- bizon/transform/config.py +8 -0
- bizon/transform/transform.py +48 -0
- {bizon-0.1.0.dist-info → bizon-0.1.2.dist-info}/METADATA +23 -6
- bizon-0.1.2.dist-info/RECORD +123 -0
- {bizon-0.1.0.dist-info → bizon-0.1.2.dist-info}/WHEEL +1 -1
- bizon/destinations/bigquery/src/config.py +0 -51
- bizon/destinations/bigquery_streaming/src/config.py +0 -43
- bizon/destinations/bigquery_streaming/src/destination.py +0 -154
- bizon/destinations/config.py +0 -47
- bizon/destinations/file/src/destination.py +0 -27
- bizon/sources/kafka/src/source.py +0 -357
- bizon/sources/kafka/tests/kafka_pipeline.py +0 -9
- bizon/sources/periscope/config/periscope_charts.example.yml +0 -26
- bizon/sources/periscope/config/periscope_dashboards.example.yml +0 -26
- bizon-0.1.0.dist-info/RECORD +0 -93
- /bizon/{sources → connectors/sources}/gsheets/src/source.py +0 -0
- /bizon/{sources → connectors/sources}/gsheets/tests/gsheets_pipeline.py +0 -0
- /bizon/{sources → connectors/sources}/hubspot/src/hubspot_base.py +0 -0
- /bizon/{sources → connectors/sources}/hubspot/src/models/hs_object.py +0 -0
- /bizon/{sources → connectors/sources}/hubspot/tests/hubspot_pipeline.py +0 -0
- /bizon/{sources → connectors/sources}/periscope/tests/periscope_pipeline_charts.py +0 -0
- /bizon/{destinations → destination}/models.py +0 -0
- {bizon-0.1.0.dist-info → bizon-0.1.2.dist-info}/LICENSE +0 -0
- {bizon-0.1.0.dist-info → bizon-0.1.2.dist-info}/entry_points.txt +0 -0
bizon/engine/engine.py
CHANGED
@@ -1,4 +1,4 @@
-from
+from os import getenv
 
 from bizon.cli.utils import parse_from_yaml
 from bizon.common.models import BizonConfig
@@ -7,10 +7,24 @@ from .config import RunnerTypes
 from .runner.runner import AbstractRunner
 
 
+def replace_env_variables_in_config(config: dict) -> dict:
+    """Replace templated secrets with actual values from environment variables"""
+    for key, value in config.items():
+        if isinstance(value, dict):
+            config[key] = replace_env_variables_in_config(value)
+        elif isinstance(value, str):
+            if value.startswith("BIZON_ENV_"):
+                config[key] = getenv(value)
+    return config
+
+
 class RunnerFactory:
     @staticmethod
     def create_from_config_dict(config: dict) -> AbstractRunner:
 
+        # Replace env variables in config
+        config = replace_env_variables_in_config(config=config)
+
         bizon_config = BizonConfig.model_validate(obj=config)
 
         if bizon_config.engine.runner.type == RunnerTypes.THREAD:
@@ -23,6 +37,11 @@ class RunnerFactory:
 
             return ProcessRunner(config=config)
 
+        if bizon_config.engine.runner.type == RunnerTypes.STREAM:
+            from .runner.adapters.streaming import StreamingRunner
+
+            return StreamingRunner(config=config)
+
         raise ValueError(f"Runner type {bizon_config.engine.runner.type} is not supported")
 
     @staticmethod
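A minimal sketch of how the new replace_env_variables_in_config substitution behaves, assuming a pipeline config that templates a secret; the environment-variable name and config keys below are hypothetical:

import os

from bizon.engine.engine import replace_env_variables_in_config

os.environ["BIZON_ENV_DB_PASSWORD"] = "s3cret"  # hypothetical secret exported by the operator

config = {
    "destination": {
        "config": {
            "password": "BIZON_ENV_DB_PASSWORD",  # templated value, substituted in place
            "host": "db.internal",  # untouched: no BIZON_ENV_ prefix
        }
    }
}

config = replace_env_variables_in_config(config)
assert config["destination"]["config"]["password"] == "s3cret"

Note the function recurses into nested dicts only; strings inside lists are left as-is, and an unset variable becomes None via getenv.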
bizon/engine/pipeline/consumer.py
CHANGED
@@ -1,15 +1,83 @@
+import multiprocessing
+import multiprocessing.synchronize
+import threading
+import traceback
 from abc import ABC, abstractmethod
-from
+from typing import Union
 
-from
-
+from loguru import logger
+
+from bizon.destination.destination import AbstractDestination
+from bizon.engine.pipeline.models import PipelineReturnStatus
+from bizon.engine.queue.config import (
+    QUEUE_TERMINATION,
+    AbstractQueueConfig,
+    QueueMessage,
+)
+from bizon.monitoring.monitor import AbstractMonitor
+from bizon.transform.transform import Transform
 
 
 class AbstractQueueConsumer(ABC):
-    def __init__(
+    def __init__(
+        self,
+        config: AbstractQueueConfig,
+        destination: AbstractDestination,
+        transform: Transform,
+        monitor: AbstractMonitor,
+    ):
         self.config = config
         self.destination = destination
+        self.transform = transform
+        self.monitor = monitor
 
     @abstractmethod
-    def run(self):
+    def run(self, stop_event: Union[multiprocessing.synchronize.Event, threading.Event]) -> PipelineReturnStatus:
         pass
+
+    def process_queue_message(self, queue_message: QueueMessage) -> PipelineReturnStatus:
+
+        # Apply the transformation
+        try:
+            df_source_records = self.transform.apply_transforms(df_source_records=queue_message.df_source_records)
+        except Exception as e:
+            logger.error(f"Error applying transformation: {e}")
+            logger.error(traceback.format_exc())
+            self.monitor.track_pipeline_status(PipelineReturnStatus.TRANSFORM_ERROR)
+            return PipelineReturnStatus.TRANSFORM_ERROR
+
+        # Handle last iteration
+        try:
+            if queue_message.signal == QUEUE_TERMINATION:
+                logger.info("Received termination signal, waiting for destination to close gracefully ...")
+                self.destination.write_records_and_update_cursor(
+                    df_source_records=df_source_records,
+                    iteration=queue_message.iteration,
+                    extracted_at=queue_message.extracted_at,
+                    pagination=queue_message.pagination,
+                    last_iteration=True,
+                )
+                self.monitor.track_pipeline_status(PipelineReturnStatus.SUCCESS)
+                return PipelineReturnStatus.SUCCESS
+
+        except Exception as e:
+            logger.error(f"Error writing records to destination: {e}")
+            self.monitor.track_pipeline_status(PipelineReturnStatus.DESTINATION_ERROR)
+            return PipelineReturnStatus.DESTINATION_ERROR
+
+        # Write the records to the destination
+        try:
+            self.destination.write_records_and_update_cursor(
+                df_source_records=df_source_records,
+                iteration=queue_message.iteration,
+                extracted_at=queue_message.extracted_at,
+                pagination=queue_message.pagination,
+            )
+            return PipelineReturnStatus.RUNNING
+
+        except Exception as e:
+            logger.error(f"Error writing records to destination: {e}")
+            self.monitor.track_pipeline_status(PipelineReturnStatus.DESTINATION_ERROR)
+            return PipelineReturnStatus.DESTINATION_ERROR
+
+        raise RuntimeError("Should not reach this point")
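The effect of the refactor above: queue adapters now only pull messages and delegate to the shared process_queue_message, which owns transform application, termination handling, and destination writes. A hedged sketch of a toy adapter built on the new base class (the in-memory queue is illustrative, not a bizon API):

import queue
import threading

from bizon.engine.pipeline.consumer import AbstractQueueConsumer
from bizon.engine.pipeline.models import PipelineReturnStatus


class InMemoryConsumer(AbstractQueueConsumer):
    """Toy adapter: drains a queue.Queue and defers to the shared base-class logic."""

    def __init__(self, config, messages: queue.Queue, destination, transform, monitor):
        super().__init__(config, destination=destination, transform=transform, monitor=monitor)
        self.messages = messages

    def run(self, stop_event: threading.Event) -> PipelineReturnStatus:
        while not stop_event.is_set():
            status = self.process_queue_message(self.messages.get())
            if status != PipelineReturnStatus.RUNNING:
                return status
        return PipelineReturnStatus.KILLED_BY_RUNNER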
bizon/engine/pipeline/models.py
CHANGED
@@ -1,10 +1,15 @@
 from enum import Enum
 
 
-class PipelineReturnStatus(Enum):
+class PipelineReturnStatus(str, Enum):
     """Producer error types"""
 
-
+    BACKEND_ERROR = "backend_error"
+    DESTINATION_ERROR = "destination_error"
+    KILLED_BY_RUNNER = "killed_by_runner"
     QUEUE_ERROR = "queue_error"
+    RUNNING = "running"
     SOURCE_ERROR = "source_error"
-
+    SUCCESS = "success"
+    TRANSFORM_ERROR = "transform_error"
+    STREAM_ERROR = "stream_error"
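Since PipelineReturnStatus now mixes in str, statuses compare directly against their raw values and serialize without custom encoders:

import json

from bizon.engine.pipeline.models import PipelineReturnStatus

status = PipelineReturnStatus.KILLED_BY_RUNNER
assert status == "killed_by_runner"  # str mixin: compares equal to the raw value
assert json.dumps(status) == '"killed_by_runner"'  # serializes as a plain string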
bizon/engine/pipeline/producer.py
CHANGED
@@ -1,8 +1,11 @@
 import ast
+import multiprocessing
+import multiprocessing.synchronize
+import threading
 import traceback
 from datetime import datetime
 from time import sleep
-from typing import Tuple
+from typing import Tuple, Union
 
 from loguru import logger
 from pytz import UTC
@@ -28,7 +31,7 @@ class Producer:
 
     @property
     def name(self) -> str:
-        return f"producer-{self.source.config.
+        return f"producer-{self.source.config.name}-{self.source.config.stream}"
 
     def get_or_create_cursor(self, job_id: str, session=None) -> Cursor:
         """Get or create a cursor for the current stream, return the cursor"""
@@ -47,8 +50,8 @@ class Producer:
 
         # Initialize the recovery from the DestinationCursor
         cursor = Cursor.from_db(
-            source_name=self.source.config.
-            stream_name=self.source.config.
+            source_name=self.source.config.name,
+            stream_name=self.source.config.stream,
             job_id=job_id,
             total_records=job.total_records_to_fetch,
             iteration=cursor_from_db.to_source_iteration + 1,
@@ -60,8 +63,8 @@ class Producer:
         total_records = self.source.get_total_records_count()
         # Initialize the cursor
         cursor = Cursor(
-            source_name=self.source.config.
-            stream_name=self.source.config.
+            source_name=self.source.config.name,
+            stream_name=self.source.config.stream,
             job_id=job_id,
             total_records=total_records,
         )
@@ -99,7 +102,9 @@ class Producer:
 
        return False, queue_size, approximate_nb_records_in_queue
 
-    def run(
+    def run(
+        self, job_id: int, stop_event: Union[multiprocessing.synchronize.Event, threading.Event]
+    ) -> PipelineReturnStatus:
 
        return_value: PipelineReturnStatus = PipelineReturnStatus.SUCCESS
 
@@ -128,6 +133,10 @@ class Producer:
 
         while not cursor.is_finished:
 
+            if stop_event.is_set():
+                logger.info("Stop event is set, terminating producer ...")
+                return PipelineReturnStatus.KILLED_BY_RUNNER
+
             timestamp_start_iteration = datetime.now(tz=UTC)
 
             # Handle the case where last cursor already reach max_iterations
@@ -142,8 +151,8 @@ class Producer:
             self.backend.create_source_cursor(
                 job_id=job_id,
                 name=self.bizon_config.name,
-                source_name=self.source.config.
-                stream_name=self.source.config.
+                source_name=self.source.config.name,
+                stream_name=self.source.config.stream,
                 iteration=cursor.iteration,
                 rows_fetched=cursor.rows_fetched,
                 next_pagination=cursor.pagination,
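The stop_event contract the producer now honors is plain cooperative cancellation; a self-contained sketch (toy function, not a bizon API) showing why shutdown latency is bounded by the duration of one iteration:

import threading
import time


def producer_loop(stop_event: threading.Event) -> str:
    # Stand-in for Producer.run: the event is checked once per iteration.
    while not stop_event.is_set():
        time.sleep(0.05)  # stands in for one source fetch
    return "killed_by_runner"


stop_event = threading.Event()
thread = threading.Thread(target=producer_loop, args=(stop_event,))
thread.start()
stop_event.set()  # the runner requests a cooperative shutdown
thread.join()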
bizon/engine/queue/adapters/kafka/consumer.py
CHANGED
@@ -3,9 +3,9 @@ import json
 from kafka import KafkaConsumer
 from loguru import logger
 
-from bizon.
+from bizon.destination.destination import AbstractDestination
 from bizon.engine.pipeline.consumer import AbstractQueueConsumer
-from bizon.engine.queue.
+from bizon.engine.queue.config import QUEUE_TERMINATION, QueueMessage
 
 from .config import KafkaConfigDetails
 
bizon/engine/queue/adapters/kafka/queue.py
CHANGED
@@ -4,8 +4,9 @@ from typing import Union
 from kafka import KafkaProducer
 from loguru import logger
 
-from bizon.
-from bizon.engine.queue.
+from bizon.destination.destination import AbstractDestination
+from bizon.engine.queue.config import QUEUE_TERMINATION, QueueMessage
+from bizon.engine.queue.queue import AbstractQueue
 
 from .config import KafkaConfigDetails
 from .consumer import KafkaConsumer_
bizon/engine/queue/adapters/python_queue/consumer.py
CHANGED
@@ -1,36 +1,53 @@
+import multiprocessing
+import multiprocessing.synchronize
+import threading
+from typing import Union
+
 from loguru import logger
 
-from bizon.
+from bizon.destination.destination import AbstractDestination
 from bizon.engine.pipeline.consumer import AbstractQueueConsumer
-from bizon.engine.
+from bizon.engine.pipeline.models import PipelineReturnStatus
+from bizon.engine.queue.config import QueueMessage
+from bizon.engine.queue.queue import AbstractQueue
+from bizon.monitoring.monitor import AbstractMonitor
+from bizon.transform.transform import Transform
 
 from .config import PythonQueueConfig
 
 
 class PythonQueueConsumer(AbstractQueueConsumer):
-    def __init__(
-
+    def __init__(
+        self,
+        config: PythonQueueConfig,
+        queue: AbstractQueue,
+        destination: AbstractDestination,
+        transform: Transform,
+        monitor: AbstractMonitor,
+    ):
+        super().__init__(
+            config,
+            destination=destination,
+            transform=transform,
+            monitor=monitor,
+        )
         self.queue = queue
+        self.monitor.track_pipeline_status(PipelineReturnStatus.RUNNING)
+
+    def run(self, stop_event: Union[threading.Event, multiprocessing.synchronize.Event]) -> PipelineReturnStatus:
 
-    def run(self) -> None:
         while True:
+            # Handle kill signal from the runner
+            if stop_event.is_set():
+                logger.info("Stop event is set, closing consumer ...")
+                self.monitor.track_pipeline_status(PipelineReturnStatus.KILLED_BY_RUNNER)
+                return PipelineReturnStatus.KILLED_BY_RUNNER
+
+            # Retrieve the message from the queue
             queue_message: QueueMessage = self.queue.get()
 
-
-
-
-
-
-                extracted_at=queue_message.extracted_at,
-                pagination=queue_message.pagination,
-                last_iteration=True,
-            )
-            break
-
-        self.destination.write_records_and_update_cursor(
-            df_source_records=queue_message.df_source_records,
-            iteration=queue_message.iteration,
-            extracted_at=queue_message.extracted_at,
-            pagination=queue_message.pagination,
-        )
-        self.queue.task_done()
+            status = self.process_queue_message(queue_message)
+
+            if status != PipelineReturnStatus.RUNNING:
+                self.queue.task_done()
+                return status
bizon/engine/queue/adapters/python_queue/queue.py
CHANGED
@@ -5,14 +5,13 @@ from typing import Union
 
 from loguru import logger
 
-from bizon.
-from bizon.engine.queue.
-
-
-
-    QueueMessage,
-)
+from bizon.destination.destination import AbstractDestination
+from bizon.engine.queue.config import QUEUE_TERMINATION, QueueMessage
+from bizon.engine.queue.queue import AbstractQueue, AbstractQueueConsumer
+from bizon.monitoring.monitor import AbstractMonitor
+from bizon.source.callback import AbstractSourceCallback
 from bizon.source.models import SourceIteration
+from bizon.transform.transform import Transform
 
 from .config import PythonQueueConfigDetails
 from .consumer import PythonQueueConsumer
@@ -31,8 +30,19 @@ class PythonQueue(AbstractQueue):
         # No connection to establish for PythonQueue
         pass
 
-    def get_consumer(
-
+    def get_consumer(
+        self,
+        destination: AbstractDestination,
+        transform: Transform,
+        monitor: AbstractMonitor,
+    ) -> AbstractQueueConsumer:
+        return PythonQueueConsumer(
+            config=self.config,
+            queue=self.queue,
+            destination=destination,
+            transform=transform,
+            monitor=monitor,
+        )
 
     def put_queue_message(self, queue_message: QueueMessage):
         if not self.queue.full():
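The get_consumer change inverts construction: the queue, not the runner, knows which consumer class pairs with it. A runnable toy of the same factory shape (stand-in classes, not bizon APIs):

import queue


class ToyConsumer:
    def __init__(self, q: queue.Queue, destination: list):
        self.q, self.destination = q, destination

    def run(self) -> str:
        while True:
            item = self.q.get()
            if item is None:  # stand-in for QUEUE_TERMINATION
                return "success"
            self.destination.append(item)


class ToyQueue:
    """The queue builds its own consumer, mirroring the new get_consumer signature."""

    def __init__(self):
        self.q = queue.Queue()

    def get_consumer(self, destination: list) -> ToyConsumer:
        return ToyConsumer(self.q, destination)


dest: list = []
tq = ToyQueue()
tq.q.put({"id": "1"})
tq.q.put(None)
assert tq.get_consumer(dest).run() == "success" and dest == [{"id": "1"}]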
bizon/engine/queue/adapters/rabbitmq/consumer.py
CHANGED
@@ -2,12 +2,9 @@ import pika
 import pika.connection
 from loguru import logger
 
-from bizon.
-from bizon.engine.queue.
-
-    AbstractQueueConsumer,
-    QueueMessage,
-)
+from bizon.destination.destination import AbstractDestination
+from bizon.engine.queue.config import QUEUE_TERMINATION
+from bizon.engine.queue.queue import AbstractQueueConsumer, QueueMessage
 
 from .config import RabbitMQConfigDetails
 
bizon/engine/queue/adapters/rabbitmq/queue.py
CHANGED
@@ -3,9 +3,10 @@ from typing import Union
 import pika
 from loguru import logger
 
-from bizon.
+from bizon.destination.destination import AbstractDestination
 from bizon.engine.pipeline.consumer import AbstractQueueConsumer
-from bizon.engine.queue.
+from bizon.engine.queue.config import QUEUE_TERMINATION, QueueMessage
+from bizon.engine.queue.queue import AbstractQueue
 
 from .config import RabbitMQConfigDetails
 from .consumer import RabbitMQConsumer
bizon/engine/queue/config.py
CHANGED
@@ -1,7 +1,23 @@
 from abc import ABC
+from dataclasses import dataclass
+from datetime import datetime
 from enum import Enum
+from typing import Optional
 
+import polars as pl
 from pydantic import BaseModel, ConfigDict, Field
+from pytz import UTC
+
+QUEUE_TERMINATION = "TERMINATION"
+
+
+@dataclass
+class QueueMessage:
+    iteration: int
+    df_source_records: pl.DataFrame
+    extracted_at: datetime = datetime.now(tz=UTC)
+    pagination: Optional[dict] = None
+    signal: Optional[str] = None
 
 
 class QueueTypes(str, Enum):
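A short sketch of constructing the relocated QueueMessage. One reading note: the extracted_at default is evaluated once at class-definition time (standard dataclass behavior for non-factory defaults), so callers wanting a fresh timestamp should pass it explicitly, as the producer does:

from datetime import datetime

import polars as pl
from pytz import UTC

from bizon.engine.queue.config import QUEUE_TERMINATION, QueueMessage

df = pl.DataFrame({"id": ["1"], "data": ['{"name": "pikachu"}']})

msg = QueueMessage(
    iteration=42,
    df_source_records=df,
    extracted_at=datetime.now(tz=UTC),  # pass explicitly; the default is frozen at import time
    signal=QUEUE_TERMINATION,  # tells the consumer to flush and close the destination
)
assert msg.pagination is None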
bizon/engine/queue/queue.py
CHANGED
@@ -1,28 +1,23 @@
 import json
 from abc import ABC, abstractmethod
-from dataclasses import dataclass
 from datetime import datetime
-from typing import
+from typing import Union
 
 import polars as pl
 from pytz import UTC
 
-from bizon.
+from bizon.destination.destination import AbstractDestination
 from bizon.engine.pipeline.consumer import AbstractQueueConsumer
+from bizon.monitoring.monitor import AbstractMonitor
 from bizon.source.models import SourceIteration, source_record_schema
+from bizon.transform.transform import Transform
 
-from .config import
-
-
-
-
-
-class QueueMessage:
-    iteration: int
-    df_source_records: pl.DataFrame
-    extracted_at: datetime = datetime.now(tz=UTC)
-    pagination: Optional[dict] = None
-    signal: Optional[str] = None
+from .config import (
+    AbastractQueueConfigDetails,
+    AbstractQueueConfig,
+    QueueMessage,
+    QueueTypes,
+)
 
 
 class AbstractQueue(ABC):
@@ -35,7 +30,12 @@ class AbstractQueue(ABC):
         pass
 
     @abstractmethod
-    def get_consumer(
+    def get_consumer(
+        self,
+        destination: AbstractDestination,
+        transform: Transform,
+        monitor: AbstractMonitor,
+    ) -> AbstractQueueConsumer:
         pass
 
     @abstractmethod
@@ -71,6 +71,7 @@ class AbstractQueue(ABC):
                 "id": [record.id for record in source_iteration.records],
                 "data": [json.dumps(record.data, ensure_ascii=False) for record in source_iteration.records],
                 "timestamp": [record.timestamp for record in source_iteration.records],
+                "destination_id": [record.destination_id for record in source_iteration.records],
             },
             schema=source_record_schema,
         )
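The new destination_id column lets each record carry its routing target through the queue. A self-contained illustration of the same polars construction, using partition_by to split a batch per destination the way the streaming runner does:

import json

import polars as pl

records = [
    {"id": "1", "data": {"name": "pikachu"}, "destination_id": "dataset.table_a"},
    {"id": "2", "data": {"name": "mew"}, "destination_id": "dataset.table_b"},
    {"id": "3", "data": {"name": "ditto"}, "destination_id": "dataset.table_a"},
]

df = pl.DataFrame(
    {
        "id": [r["id"] for r in records],
        "data": [json.dumps(r["data"], ensure_ascii=False) for r in records],
        "destination_id": [r["destination_id"] for r in records],  # new in 0.1.2
    }
)

for part in df.partition_by("destination_id"):  # one frame per destination
    print(part["destination_id"][0], part.height)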
bizon/engine/runner/adapters/process.py
CHANGED
@@ -1,5 +1,6 @@
 import concurrent.futures
 import time
+import traceback
 
 from loguru import logger
 
@@ -50,6 +51,7 @@ class ProcessRunner(AbstractRunner):
             future_consumer = executor.submit(
                 AbstractRunner.instanciate_and_run_consumer,
                 self.bizon_config,
+                self.config,
                 job.id,
                 **extra_kwargs,
             )
@@ -68,8 +70,19 @@ class ProcessRunner(AbstractRunner):
             result_producer = future_producer.result()
             logger.info(f"Producer process stopped running with result: {result_producer}")
 
+            if result_producer.SUCCESS:
+                logger.info("Producer thread has finished successfully, will wait for consumer to finish ...")
+            else:
+                logger.error("Producer thread failed, stopping consumer ...")
+                executor.shutdown(wait=False)
+
             if not future_consumer.running():
-
-
+                try:
+                    future_consumer.result()
+                except Exception as e:
+                    logger.error(f"Consumer thread stopped running with error {e}")
+                    logger.error(traceback.format_exc())
+                finally:
+                    executor.shutdown(wait=False)
 
         return True
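The orchestration above is standard concurrent.futures usage; a minimal runnable sketch (toy functions, not bizon APIs) of why the consumer's result is wrapped in try/except:

import concurrent.futures


def produce() -> str:
    return "success"


def consume() -> str:
    raise RuntimeError("destination unavailable")


with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
    future_producer = executor.submit(produce)
    future_consumer = executor.submit(consume)

    print("producer:", future_producer.result())
    try:
        future_consumer.result()  # result() re-raises the worker's exception
    except Exception as e:
        print("consumer failed:", e)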
bizon/engine/runner/adapters/streaming.py
ADDED
@@ -0,0 +1,121 @@
+import os
+import time
+from datetime import datetime
+from typing import List
+
+import polars as pl
+import simplejson as json
+from loguru import logger
+from pytz import UTC
+
+from bizon.common.models import BizonConfig, SyncMetadata
+from bizon.destination.models import transform_to_df_destination_records
+from bizon.engine.pipeline.models import PipelineReturnStatus
+from bizon.engine.runner.config import RunnerStatus
+from bizon.engine.runner.runner import AbstractRunner
+from bizon.source.models import SourceRecord, source_record_schema
+
+
+class StreamingRunner(AbstractRunner):
+    def __init__(self, config: BizonConfig):
+        super().__init__(config)
+
+    @staticmethod
+    def convert_source_records(records: List[SourceRecord]) -> pl.DataFrame:
+        return pl.DataFrame(
+            {
+                "id": [record.id for record in records],
+                "data": [json.dumps(record.data, ensure_ascii=False) for record in records],
+                "timestamp": [record.timestamp for record in records],
+                "destination_id": [record.destination_id for record in records],
+            },
+            schema=source_record_schema,
+        )
+
+    @staticmethod
+    def convert_to_destination_records(df_source_records: pl.DataFrame, extracted_at: datetime) -> pl.DataFrame:
+        return transform_to_df_destination_records(df_source_records=df_source_records, extracted_at=extracted_at)
+
+    def run(self) -> RunnerStatus:
+        job = self.init_job(bizon_config=self.bizon_config, config=self.config)
+        backend = self.get_backend(bizon_config=self.bizon_config)
+        source = self.get_source(bizon_config=self.bizon_config, config=self.config)
+
+        sync_metadata = SyncMetadata.from_bizon_config(job_id=job.id, config=self.bizon_config)
+        monitor = self.get_monitoring_client(sync_metadata=sync_metadata, bizon_config=self.bizon_config)
+
+        destination = self.get_destination(
+            bizon_config=self.bizon_config,
+            backend=backend,
+            job_id=job.id,
+            source_callback=None,
+            monitor=monitor,
+        )
+
+        transform = self.get_transform(bizon_config=self.bizon_config)
+
+        destination.buffer.buffer_size = 0  # force buffer to be flushed immediately
+        iteration = 0
+
+        while True:
+
+            if source.config.max_iterations and iteration > source.config.max_iterations:
+                logger.info(f"Max iterations {source.config.max_iterations} reached, terminating stream ...")
+                break
+
+            with monitor.trace(operation_name="bizon.stream.iteration"):
+                source_iteration = source.get()
+
+                destination_id_indexed_records = {}
+
+                if len(source_iteration.records) == 0:
+                    logger.info("No new records found, stopping iteration")
+                    time.sleep(2)
+                    monitor.track_pipeline_status(PipelineReturnStatus.SUCCESS)
+                    iteration += 1
+                    continue
+
+                for record in source_iteration.records:
+                    if destination_id_indexed_records.get(record.destination_id):
+                        destination_id_indexed_records[record.destination_id].append(record)
+                    else:
+                        destination_id_indexed_records[record.destination_id] = [record]
+
+                for destination_id, records in destination_id_indexed_records.items():
+                    df_source_records = StreamingRunner.convert_source_records(records)
+
+                    dsm_headers = monitor.track_source_iteration(records=records)
+
+                    # Apply transformation
+                    df_source_records = transform.apply_transforms(df_source_records=df_source_records)
+
+                    df_destination_records = StreamingRunner.convert_to_destination_records(
+                        df_source_records, datetime.now(tz=UTC)
+                    )
+                    # Override destination_id
+                    destination.destination_id = destination_id
+                    destination.write_or_buffer_records(
+                        df_destination_records=df_destination_records,
+                        iteration=iteration,
+                        pagination=None,
+                    )
+                    monitor.track_records_synced(
+                        num_records=len(df_destination_records),
+                        destination_id=destination_id,
+                        extra_tags={"destination_id": destination_id},
+                        headers=dsm_headers,
+                    )
+
+                if os.getenv("ENVIRONMENT") == "production":
+                    try:
+                        source.commit()
+                    except Exception as e:
+                        logger.error(f"Error committing source: {e}")
+                        monitor.track_pipeline_status(PipelineReturnStatus.ERROR)
+                        return RunnerStatus(stream=PipelineReturnStatus.ERROR)
+
+                iteration += 1
+
+        monitor.track_pipeline_status(PipelineReturnStatus.SUCCESS)
+
+        return RunnerStatus(stream=PipelineReturnStatus.SUCCESS)  # return when max iterations is reached