bizon 0.0.14__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bizon/alerting/__init__.py +0 -0
- bizon/alerting/alerts.py +23 -0
- bizon/alerting/models.py +28 -0
- bizon/alerting/slack/__init__.py +0 -0
- bizon/alerting/slack/config.py +5 -0
- bizon/alerting/slack/handler.py +39 -0
- bizon/cli/main.py +7 -3
- bizon/common/models.py +31 -7
- bizon/{destinations → connectors/destinations}/bigquery/config/bigquery.example.yml +3 -4
- bizon/connectors/destinations/bigquery/src/config.py +127 -0
- bizon/{destinations → connectors/destinations}/bigquery/src/destination.py +46 -25
- bizon/connectors/destinations/bigquery_streaming/src/config.py +56 -0
- bizon/connectors/destinations/bigquery_streaming/src/destination.py +372 -0
- bizon/connectors/destinations/bigquery_streaming_v2/src/config.py +52 -0
- bizon/connectors/destinations/bigquery_streaming_v2/src/destination.py +261 -0
- bizon/{destinations/bigquery_streaming → connectors/destinations/bigquery_streaming_v2}/src/proto_utils.py +32 -26
- bizon/{destinations → connectors/destinations}/file/src/config.py +8 -3
- bizon/connectors/destinations/file/src/destination.py +54 -0
- bizon/{destinations → connectors/destinations}/logger/src/config.py +1 -1
- bizon/{destinations → connectors/destinations}/logger/src/destination.py +15 -3
- bizon/connectors/sources/cycle/config/cycle.example.yml +15 -0
- bizon/connectors/sources/cycle/src/source.py +133 -0
- bizon/{sources/periscope/tests/periscope_pipeline_dashboard.py → connectors/sources/cycle/tests/cycle_customers.py} +1 -1
- bizon/connectors/sources/dummy/config/dummy.example.yml +22 -0
- bizon/{sources → connectors/sources}/dummy/src/fake_api.py +6 -1
- bizon/{sources → connectors/sources}/dummy/src/source.py +18 -5
- bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline.py +5 -14
- bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_bigquery_backend.py +2 -2
- bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_kafka.py +2 -2
- bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_rabbitmq.py +2 -2
- bizon/connectors/sources/dummy/tests/dummy_pipeline_unnest.py +29 -0
- bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_write_data_bigquery.py +3 -3
- bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_write_data_bigquery_through_kafka.py +2 -2
- bizon/{sources → connectors/sources}/gsheets/config/default_auth.example.yml +4 -2
- bizon/{sources → connectors/sources}/gsheets/config/service_account.example.yml +4 -2
- bizon/{sources → connectors/sources}/hubspot/config/api_key.example.yml +4 -2
- bizon/{sources → connectors/sources}/hubspot/config/oauth.example.yml +4 -2
- bizon/{sources → connectors/sources}/hubspot/src/hubspot_objects.py +1 -1
- bizon/connectors/sources/kafka/config/kafka.example.yml +50 -0
- bizon/connectors/sources/kafka/config/kafka_debezium.example.yml +112 -0
- bizon/connectors/sources/kafka/src/callback.py +18 -0
- bizon/connectors/sources/kafka/src/config.py +75 -0
- bizon/connectors/sources/kafka/src/decode.py +88 -0
- bizon/connectors/sources/kafka/src/source.py +361 -0
- bizon/connectors/sources/kafka/tests/kafka_pipeline.py +7 -0
- bizon/connectors/sources/periscope/config/periscope_charts.example.yml +20 -0
- bizon/connectors/sources/periscope/config/periscope_dashboards.example.yml +20 -0
- bizon/{sources → connectors/sources}/periscope/src/source.py +136 -13
- bizon/connectors/sources/periscope/tests/periscope_pipeline_dashboard.py +9 -0
- bizon/connectors/sources/pokeapi/config/pokeapi_pokemon_to_json.example.yml +19 -0
- bizon/connectors/sources/pokeapi/config/pokeapi_pokemon_to_logger.example.yml +10 -0
- bizon/connectors/sources/pokeapi/src/source.py +79 -0
- bizon/{destinations → destination}/buffer.py +5 -0
- bizon/destination/config.py +74 -0
- bizon/{destinations → destination}/destination.py +71 -15
- bizon/engine/backend/adapters/sqlalchemy/backend.py +14 -23
- bizon/engine/engine.py +20 -1
- bizon/engine/pipeline/consumer.py +73 -5
- bizon/engine/pipeline/models.py +8 -3
- bizon/engine/pipeline/producer.py +18 -9
- bizon/engine/queue/adapters/kafka/consumer.py +2 -2
- bizon/engine/queue/adapters/kafka/queue.py +3 -2
- bizon/engine/queue/adapters/python_queue/consumer.py +40 -23
- bizon/engine/queue/adapters/python_queue/queue.py +19 -9
- bizon/engine/queue/adapters/rabbitmq/consumer.py +3 -6
- bizon/engine/queue/adapters/rabbitmq/queue.py +3 -2
- bizon/engine/queue/config.py +16 -0
- bizon/engine/queue/queue.py +17 -16
- bizon/engine/runner/adapters/process.py +15 -2
- bizon/engine/runner/adapters/streaming.py +103 -0
- bizon/engine/runner/adapters/thread.py +32 -9
- bizon/engine/runner/config.py +28 -0
- bizon/engine/runner/runner.py +107 -25
- bizon/monitoring/__init__.py +0 -0
- bizon/monitoring/config.py +29 -0
- bizon/monitoring/datadog/__init__.py +0 -0
- bizon/monitoring/datadog/monitor.py +69 -0
- bizon/monitoring/monitor.py +42 -0
- bizon/monitoring/noop/__init__.py +0 -0
- bizon/monitoring/noop/monitor.py +11 -0
- bizon/source/callback.py +24 -0
- bizon/source/config.py +3 -3
- bizon/source/cursor.py +1 -1
- bizon/source/discover.py +4 -3
- bizon/source/models.py +4 -2
- bizon/source/source.py +10 -2
- bizon/transform/config.py +8 -0
- bizon/transform/transform.py +48 -0
- bizon-0.1.1.dist-info/LICENSE +674 -0
- {bizon-0.0.14.dist-info → bizon-0.1.1.dist-info}/METADATA +25 -7
- bizon-0.1.1.dist-info/RECORD +123 -0
- {bizon-0.0.14.dist-info → bizon-0.1.1.dist-info}/WHEEL +1 -1
- bizon/destinations/bigquery/src/config.py +0 -51
- bizon/destinations/bigquery_streaming/src/config.py +0 -43
- bizon/destinations/bigquery_streaming/src/destination.py +0 -154
- bizon/destinations/config.py +0 -47
- bizon/destinations/file/src/destination.py +0 -27
- bizon/sources/dummy/config/api_key.example.yml +0 -20
- bizon/sources/dummy/config/api_key_kafka.example.yml +0 -27
- bizon/sources/kafka/config/kafka.example.yml +0 -38
- bizon/sources/kafka/src/source.py +0 -357
- bizon/sources/kafka/tests/kafka_pipeline.py +0 -9
- bizon/sources/periscope/config/periscope_charts.example.yml +0 -26
- bizon/sources/periscope/config/periscope_dashboards.example.yml +0 -26
- bizon-0.0.14.dist-info/LICENSE +0 -21
- bizon-0.0.14.dist-info/RECORD +0 -94
- /bizon/{sources → connectors/sources}/gsheets/src/source.py +0 -0
- /bizon/{sources → connectors/sources}/gsheets/tests/gsheets_pipeline.py +0 -0
- /bizon/{sources → connectors/sources}/hubspot/src/hubspot_base.py +0 -0
- /bizon/{sources → connectors/sources}/hubspot/src/models/hs_object.py +0 -0
- /bizon/{sources → connectors/sources}/hubspot/tests/hubspot_pipeline.py +0 -0
- /bizon/{sources → connectors/sources}/periscope/tests/periscope_pipeline_charts.py +0 -0
- /bizon/{destinations → destination}/models.py +0 -0
- {bizon-0.0.14.dist-info → bizon-0.1.1.dist-info}/entry_points.txt +0 -0
bizon/connectors/sources/pokeapi/src/source.py
ADDED
@@ -0,0 +1,79 @@
+from enum import Enum
+from typing import Any, List, Tuple
+
+from requests.auth import AuthBase
+
+from bizon.source.config import SourceConfig
+from bizon.source.models import SourceIteration, SourceRecord
+from bizon.source.source import AbstractSource
+
+BASE_URL = "https://pokeapi.co/api/v2"
+
+
+# Define the streams that the source supports
+class PokeAPIStreams(str, Enum):
+    POKEMON = "pokemon"
+    BERRY = "berry"
+    ITEM = "item"
+
+
+# Define the config class for the source
+class PokeAPISourceConfig(SourceConfig):
+    stream: PokeAPIStreams
+
+
+class PeriscopeSource(AbstractSource):
+
+    def __init__(self, config: PokeAPISourceConfig):
+        super().__init__(config)
+        self.config: PokeAPISourceConfig = config
+
+    @property
+    def url_entity(self) -> str:
+        return f"{BASE_URL}/{self.config.stream}"
+
+    @staticmethod
+    def streams() -> List[str]:
+        return [item.value for item in PokeAPIStreams]
+
+    @staticmethod
+    def get_config_class() -> AbstractSource:
+        return PokeAPISourceConfig
+
+    def check_connection(self) -> Tuple[bool | Any | None]:
+        # Make a request to the base URL to check if the connection is successful
+        _ = self.session.get(self.url_entity)
+        return True, None
+
+    def get_authenticator(self) -> AuthBase:
+        # We return None because we don't need any authentication
+        return None
+
+    def get_total_records_count(self) -> int | None:
+        # Return the total number of records in the stream
+        response = self.session.get(self.url_entity)
+        return response.json().get("count")
+
+    def get_entity_list(self, pagination: dict = None) -> SourceIteration:
+        # If pagination is provided, use the next URL to get the next set of records
+        url = pagination.get("next") if pagination else self.url_entity
+        response = self.session.get(url)
+
+        data = response.json()
+
+        return SourceIteration(
+            next_pagination={"next": data.get("next")} if data.get("next") else {},
+            records=[
+                SourceRecord(
+                    id=record["name"],
+                    data=record,
+                )
+                for record in data["results"]
+            ],
+        )
+
+    def get(self, pagination: dict = None) -> SourceIteration:
+        if self.config.stream in [PokeAPIStreams.POKEMON, PokeAPIStreams.BERRY, PokeAPIStreams.ITEM]:
+            return self.get_entity_list(pagination)
+
+        raise NotImplementedError(f"Stream {self.config.stream} not implemented for PokeAPI source")
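For context, the pagination contract implemented by get_entity_list above can be exercised outside bizon with plain requests code; this is only an illustrative sketch, not part of the package:

import requests

# Follow PokeAPI's paginated listing the same way get_entity_list does:
# each page returns "results" (the records) and "next" (the URL of the following page, or null).
url = "https://pokeapi.co/api/v2/pokemon"
while url:
    data = requests.get(url, timeout=10).json()
    for record in data["results"]:
        print(record["name"])
    url = data.get("next")  # becomes None once the last page is reached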
bizon/{destinations → connectors/destinations}/../buffer.py → bizon/destination/buffer.py
@@ -18,6 +18,11 @@ class DestinationBuffer:
         self.pagination = {}
         self.modified_at: List[datetime] = [datetime.now(tz=UTC)]
 
+    @property
+    def is_empty(self) -> bool:
+        """Check if buffer is empty"""
+        return self.df_destination_records.height == 0
+
     @property
     def current_size(self) -> int:
         """Return buffer size"""
bizon/destination/config.py
ADDED
@@ -0,0 +1,74 @@
+from abc import ABC
+from enum import Enum
+from typing import Optional
+
+from pydantic import BaseModel, ConfigDict, Field, field_validator
+
+
+class DestinationTypes(str, Enum):
+    BIGQUERY = "bigquery"
+    BIGQUERY_STREAMING = "bigquery_streaming"
+    BIGQUERY_STREAMING_V2 = "bigquery_streaming_v2"
+    LOGGER = "logger"
+    FILE = "file"
+
+
+class DestinationColumn(BaseModel, ABC):
+    name: str = Field(..., description="Name of the column")
+    type: str = Field(..., description="Type of the column")
+    description: Optional[str] = Field(None, description="Description of the column")
+
+
+class RecordSchemaConfig(BaseModel):
+    # Forbid extra keys in the model
+    model_config = ConfigDict(extra="forbid")
+
+    destination_id: str = Field(..., description="Destination ID")
+    record_schema: list[DestinationColumn] = Field(..., description="Record schema")
+
+
+class AbstractDestinationDetailsConfig(BaseModel):
+
+    # Forbid extra keys in the model
+    model_config = ConfigDict(extra="forbid")
+
+    buffer_size: int = Field(
+        default=50,
+        description="Buffer size in Mb for the destination. Set to 0 to disable and write directly to the destination.",
+    )
+
+    buffer_flush_timeout: int = Field(
+        default=600,
+        description="Maximum time in seconds for buffering after which the records will be written to the destination. Set to 0 to deactivate the timeout buffer check.",  # noqa
+    )
+
+    record_schemas: Optional[list[RecordSchemaConfig]] = Field(
+        default=None, description="Schemas for the records. Required if unnest is set to true."
+    )
+
+    unnest: bool = Field(
+        default=False,
+        description="Unnest the data before writing to the destination. Schema should be provided in the model_config.",
+    )
+
+    authentication: Optional[BaseModel] = Field(
+        description="Authentication configuration for the destination, if needed", default=None
+    )
+
+    destination_id: Optional[str] = Field(
+        description="Destination ID, identifier to use to store the records in the destination", default=None
+    )
+
+    @field_validator("unnest", mode="before")
+    def validate_record_schema_if_unnest(cls, value, values):
+        if bool(value) and not values.data.get("record_schemas", []):
+            raise ValueError("At least one `record_schemas` must be provided if `unnest` is set to True.")
+        return value
+
+
+class AbstractDestinationConfig(BaseModel):
+    # Forbid extra keys in the model
+    model_config = ConfigDict(extra="forbid")
+
+    name: DestinationTypes = Field(..., description="Name of the destination")
+    config: AbstractDestinationDetailsConfig = Field(..., description="Configuration for the destination")
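The unnest validator above ties unnesting to the presence of at least one record schema. A hedged sketch of how that plays out when parsing a destination config (assuming the module is importable as bizon.destination.config; the destination_id and column values are illustrative only):

from pydantic import ValidationError

from bizon.destination.config import AbstractDestinationDetailsConfig

# unnest=True with no record_schemas is rejected at config-parse time.
try:
    AbstractDestinationDetailsConfig(unnest=True)
except ValidationError as err:
    print(err)  # mentions: At least one `record_schemas` must be provided if `unnest` is set to True.

# Supplying a schema (coerced into RecordSchemaConfig / DestinationColumn) satisfies the validator.
details = AbstractDestinationDetailsConfig(
    unnest=True,
    record_schemas=[
        {
            "destination_id": "pokemon",
            "record_schema": [{"name": "name", "type": "STRING", "description": "Pokemon name"}],
        }
    ],
)
print(details.unnest, details.buffer_size)  # True 50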
bizon/{destinations → destination}/destination.py
CHANGED
@@ -10,6 +10,7 @@ from pydantic import BaseModel, Field
 from bizon.common.models import SyncMetadata
 from bizon.engine.backend.backend import AbstractBackend
 from bizon.engine.backend.models import JobStatus
+from bizon.source.callback import AbstractSourceCallback
 from bizon.source.config import SourceSyncModes
 
 from .buffer import DestinationBuffer
@@ -43,13 +44,40 @@ class DestinationIteration(BaseModel):
 
 class AbstractDestination(ABC):
 
-    def __init__(
+    def __init__(
+        self,
+        sync_metadata: SyncMetadata,
+        config: AbstractDestinationDetailsConfig,
+        backend: AbstractBackend,
+        source_callback: AbstractSourceCallback,
+    ):
         self.sync_metadata = sync_metadata
         self.config = config
         self.backend = backend
         self.buffer = DestinationBuffer(
             buffer_size=self.config.buffer_size, buffer_flush_timeout=self.config.buffer_flush_timeout
         )
+        self.source_callback = source_callback
+        self.destination_id = config.destination_id
+
+        self._record_schemas = None
+        self._clustering_keys = None
+
+    @property
+    def record_schemas(self):
+        if self._record_schemas is None and self.config.record_schemas:
+            self._record_schemas = {
+                schema.destination_id: schema.record_schema for schema in self.config.record_schemas
+            }
+        return self._record_schemas
+
+    @property
+    def clustering_keys(self):
+        if self._clustering_keys is None and self.config.record_schemas:
+            self._clustering_keys = {
+                schema.destination_id: schema.clustering_keys for schema in self.config.record_schemas
+            }
+        return self._clustering_keys
 
     @abstractmethod
     def check_connection(self) -> bool:
@@ -75,7 +103,7 @@ class AbstractDestination(ABC):
         )
 
         logger.info(
-            f"Writing in destination from source iteration {self.buffer.from_iteration} to {self.buffer.to_iteration}"
+            f"Writing in destination {self.destination_id} from source iteration {self.buffer.from_iteration} to {self.buffer.to_iteration}"
         )
 
         success, error_msg = self.write_records(df_destination_records=self.buffer.df_destination_records)
@@ -83,7 +111,9 @@ class AbstractDestination(ABC):
         if success:
             # We wrote records to destination so we keep it
            destination_iteration.records_written = self.buffer.df_destination_records.height
-            logger.info(
+            logger.info(
+                f"Successfully wrote {destination_iteration.records_written} records to destination {self.destination_id}"
+            )
 
         else:
             # We failed to write records to destination so we keep the error message
@@ -112,8 +142,8 @@ class AbstractDestination(ABC):
         # Last iteration, write all records to destination
         if last_iteration:
 
-            if self.buffer.df_destination_records.height == 0 and self.
-                logger.
+            if self.buffer.df_destination_records.height == 0 and self.buffer.is_empty:
+                logger.info("No records to write to destination, already written, buffer is empty.")
                 return DestinationBufferStatus.RECORDS_WRITTEN
 
            logger.debug("Writing last iteration records to destination")
@@ -143,12 +173,12 @@ class AbstractDestination(ABC):
 
         # Don't write empty records to destination
         if df_destination_records.height == 0 and not last_iteration:
-            logger.
+            logger.info("No records to write to destination. Check source and queue provider.")
             return DestinationBufferStatus.NO_RECORDS
 
         # Write records to destination if buffer size is 0 or streaming
         if self.buffer.buffer_size == 0:
-            logger.info("Writing records to destination.")
+            logger.info(f"Writing records to destination {self.destination_id}.")
             self.buffer.add_source_iteration_records_to_buffer(
                 iteration=iteration, df_destination_records=df_destination_records, pagination=pagination
             )
@@ -245,26 +275,52 @@ class DestinationFactory:
         sync_metadata: SyncMetadata,
         config: AbstractDestinationConfig,
         backend: AbstractBackend,
+        source_callback: AbstractSourceCallback,
     ) -> AbstractDestination:
 
         if config.name == DestinationTypes.LOGGER:
-            from .logger.src.destination import
+            from bizon.connectors.destinations.logger.src.destination import (
+                LoggerDestination,
+            )
 
-            return LoggerDestination(
+            return LoggerDestination(
+                sync_metadata=sync_metadata, config=config.config, backend=backend, source_callback=source_callback
+            )
 
         elif config.name == DestinationTypes.BIGQUERY:
-            from .bigquery.src.destination import
+            from bizon.connectors.destinations.bigquery.src.destination import (
+                BigQueryDestination,
+            )
 
-            return BigQueryDestination(
+            return BigQueryDestination(
+                sync_metadata=sync_metadata, config=config.config, backend=backend, source_callback=source_callback
+            )
 
         elif config.name == DestinationTypes.BIGQUERY_STREAMING:
-            from .bigquery_streaming.src.destination import
+            from bizon.connectors.destinations.bigquery_streaming.src.destination import (
+                BigQueryStreamingDestination,
+            )
+
+            return BigQueryStreamingDestination(
+                sync_metadata=sync_metadata, config=config.config, backend=backend, source_callback=source_callback
+            )
+
+        elif config.name == DestinationTypes.BIGQUERY_STREAMING_V2:
+            from bizon.connectors.destinations.bigquery_streaming_v2.src.destination import (
+                BigQueryStreamingV2Destination,
+            )
 
-            return
+            return BigQueryStreamingV2Destination(
+                sync_metadata=sync_metadata, config=config.config, backend=backend, source_callback=source_callback
+            )
 
         elif config.name == DestinationTypes.FILE:
-            from .file.src.destination import
+            from bizon.connectors.destinations.file.src.destination import (
+                FileDestination,
+            )
 
-            return FileDestination(
+            return FileDestination(
+                sync_metadata=sync_metadata, config=config.config, backend=backend, source_callback=source_callback
+            )
 
         raise ValueError(f"Destination {config.name}" f"with params {config} not found")
bizon/engine/backend/adapters/sqlalchemy/backend.py
CHANGED
@@ -1,6 +1,6 @@
 import json
 from datetime import datetime
-from typing import
+from typing import Optional, Union
 
 from loguru import logger
 from pytz import UTC
@@ -39,9 +39,9 @@ class SQLAlchemyBackend(AbstractBackend):
         ] = config
 
         self.kwargs = kwargs
+        self.session = self.get_session()
 
-
-    def session(self) -> Generator[Session, None, None]:
+    def get_session(self) -> Session:
         """yields a SQLAlchemy connection"""
         engine = self.get_engine()
         session_ = scoped_session(
@@ -51,20 +51,17 @@ class SQLAlchemyBackend(AbstractBackend):
             )
         )
 
-
-
-        session_.close()
-
-        # For SQLite, we need to dispose the engine after each operation to prevent database lock
-        self.handle_dispose_sqlite(engine)
+        return session_
 
     def _get_engine_bigquery(self) -> Engine:
+        # If service account key is provided, use it
        if hasattr(self.config, "service_account_key") and self.config.service_account_key:
            return create_engine(
                f"bigquery://{self.config.database}/{self.config.schema_name}",
                echo=self.config.echoEngine,
                credentials_info=self.config.service_account_key,
            )
+        # Otherwise we will rely on the default Google Authentication mechanism (e.g. GOOGLE_APPLICATION_CREDENTIALS)
         return create_engine(
             f"bigquery://{self.config.database}/{self.config.schema_name}", echo=self.config.echoEngine
         )
@@ -78,7 +75,9 @@ class SQLAlchemyBackend(AbstractBackend):
     def _get_engine_sqlite(self) -> Engine:
         return create_engine(
             f"sqlite:///{self.config.database}.sqlite3",
-            connect_args={"check_same_thread": False},
+            connect_args={"check_same_thread": False, "timeout": 30},
+            pool_size=5,  # Adjust based on expected concurrency
+            pool_pre_ping=True,  # Ensures connections are alive
         )
 
     def _get_engine(self) -> Engine:
@@ -120,12 +119,6 @@ class SQLAlchemyBackend(AbstractBackend):
                 f"Schema or dataset {self.config.schema_name} does not exist in the database, you need to create it first."
             )
 
-        self.handle_dispose_sqlite(engine)
-
-    def handle_dispose_sqlite(self, engine: Engine):
-        if self.type == BackendTypes.SQLITE:
-            engine.dispose()
-
     def get_engine(self) -> Engine:
         """Return the SQLAlchemy engine"""
         if not self._engine:
@@ -138,12 +131,10 @@ class SQLAlchemyBackend(AbstractBackend):
     def create_all_tables(self):
         engine = self.get_engine()
         Base.metadata.create_all(engine)
-        self.handle_dispose_sqlite(engine)
 
     def drop_all_tables(self):
         engine = self.get_engine()
         Base.metadata.drop_all(engine)
-        self.handle_dispose_sqlite(engine)
 
     def check_prerequisites(self) -> bool:
         """Check if the database contains the necessary tables, return True if entities are present
@@ -171,20 +162,20 @@ class SQLAlchemyBackend(AbstractBackend):
             all_entities_exist = False
             logger.info(f"Table {TABLE_DESTINATION_CURSOR} does not exist in the database, we will create it")
 
-        self.handle_dispose_sqlite(engine)
-
         return all_entities_exist
 
     def _add_and_commit(self, obj, session: Optional[Session] = None):
         """Add the object to the session and commit it, return its ID"""
-        session = session or
+        session = session or self.session
         session.add(obj)
         session.commit()
         return obj
 
     def _execute(self, select: Select, session: Optional[Session] = None) -> Result:
-        session = session or
-
+        session = session or self.session
+        result = session.execute(select)
+        session.commit()
+        return result
 
     #### STREAM JOB ####
 
bizon/engine/engine.py
CHANGED
@@ -1,4 +1,4 @@
-from
+from os import getenv
 
 from bizon.cli.utils import parse_from_yaml
 from bizon.common.models import BizonConfig
@@ -7,10 +7,24 @@ from .config import RunnerTypes
 from .runner.runner import AbstractRunner
 
 
+def replace_env_variables_in_config(config: dict) -> dict:
+    """Replace templated secrets with actual values from environment variables"""
+    for key, value in config.items():
+        if isinstance(value, dict):
+            config[key] = replace_env_variables_in_config(value)
+        elif isinstance(value, str):
+            if value.startswith("BIZON_ENV_"):
+                config[key] = getenv(value)
+    return config
+
+
 class RunnerFactory:
     @staticmethod
     def create_from_config_dict(config: dict) -> AbstractRunner:
 
+        # Replace env variables in config
+        config = replace_env_variables_in_config(config=config)
+
         bizon_config = BizonConfig.model_validate(obj=config)
 
         if bizon_config.engine.runner.type == RunnerTypes.THREAD:
@@ -23,6 +37,11 @@ class RunnerFactory:
 
             return ProcessRunner(config=config)
 
+        if bizon_config.engine.runner.type == RunnerTypes.STREAM:
+            from .runner.adapters.streaming import StreamingRunner
+
+            return StreamingRunner(config=config)
+
         raise ValueError(f"Runner type {bizon_config.engine.runner.type} is not supported")
 
     @staticmethod
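The new replace_env_variables_in_config helper walks nested dictionaries and replaces any string value that starts with BIZON_ENV_ by the environment variable of that exact name, which is how secrets can be templated in pipeline configuration. A small hedged sketch (the key names below are illustrative, not a documented schema):

import os

from bizon.engine.engine import replace_env_variables_in_config

# The whole placeholder string is used verbatim as the environment variable name.
os.environ["BIZON_ENV_SLACK_WEBHOOK"] = "https://example.invalid/webhook"

config = {
    "name": "demo",
    "alerting": {"type": "slack", "config": {"webhook_url": "BIZON_ENV_SLACK_WEBHOOK"}},
}
config = replace_env_variables_in_config(config=config)
print(config["alerting"]["config"]["webhook_url"])  # value pulled from the environment

# Note: only nested dicts are recursed (strings inside lists are left as-is),
# and an unset variable resolves to None via os.getenv.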
bizon/engine/pipeline/consumer.py
CHANGED
@@ -1,15 +1,83 @@
+import multiprocessing
+import multiprocessing.synchronize
+import threading
+import traceback
 from abc import ABC, abstractmethod
-from
+from typing import Union
 
-from
-
+from loguru import logger
+
+from bizon.destination.destination import AbstractDestination
+from bizon.engine.pipeline.models import PipelineReturnStatus
+from bizon.engine.queue.config import (
+    QUEUE_TERMINATION,
+    AbstractQueueConfig,
+    QueueMessage,
+)
+from bizon.monitoring.monitor import AbstractMonitor
+from bizon.transform.transform import Transform
 
 
 class AbstractQueueConsumer(ABC):
-    def __init__(
+    def __init__(
+        self,
+        config: AbstractQueueConfig,
+        destination: AbstractDestination,
+        transform: Transform,
+        monitor: AbstractMonitor,
+    ):
         self.config = config
         self.destination = destination
+        self.transform = transform
+        self.monitor = monitor
 
     @abstractmethod
-    def run(self):
+    def run(self, stop_event: Union[multiprocessing.synchronize.Event, threading.Event]) -> PipelineReturnStatus:
         pass
+
+    def process_queue_message(self, queue_message: QueueMessage) -> PipelineReturnStatus:
+
+        # Apply the transformation
+        try:
+            df_source_records = self.transform.apply_transforms(df_source_records=queue_message.df_source_records)
+        except Exception as e:
+            logger.error(f"Error applying transformation: {e}")
+            logger.error(traceback.format_exc())
+            self.monitor.track_pipeline_status(PipelineReturnStatus.TRANSFORM_ERROR)
+            return PipelineReturnStatus.TRANSFORM_ERROR
+
+        # Handle last iteration
+        try:
+            if queue_message.signal == QUEUE_TERMINATION:
+                logger.info("Received termination signal, waiting for destination to close gracefully ...")
+                self.destination.write_records_and_update_cursor(
+                    df_source_records=df_source_records,
+                    iteration=queue_message.iteration,
+                    extracted_at=queue_message.extracted_at,
+                    pagination=queue_message.pagination,
+                    last_iteration=True,
+                )
+                self.monitor.track_pipeline_status(PipelineReturnStatus.SUCCESS)
+                return PipelineReturnStatus.SUCCESS
+
+        except Exception as e:
+            logger.error(f"Error writing records to destination: {e}")
+            self.monitor.track_pipeline_status(PipelineReturnStatus.DESTINATION_ERROR)
+            return PipelineReturnStatus.DESTINATION_ERROR
+
+        # Write the records to the destination
+        try:
+            self.destination.write_records_and_update_cursor(
+                df_source_records=df_source_records,
+                iteration=queue_message.iteration,
+                extracted_at=queue_message.extracted_at,
+                pagination=queue_message.pagination,
+            )
+            return PipelineReturnStatus.RUNNING
+
+        except Exception as e:
+            logger.error(f"Error writing records to destination: {e}")
+            self.monitor.track_pipeline_status(PipelineReturnStatus.DESTINATION_ERROR)
+            return PipelineReturnStatus.DESTINATION_ERROR
+
+        raise RuntimeError("Should not reach this point")
bizon/engine/pipeline/models.py
CHANGED
@@ -1,10 +1,15 @@
 from enum import Enum
 
 
-class PipelineReturnStatus(Enum):
+class PipelineReturnStatus(str, Enum):
     """Producer error types"""
 
-
+    BACKEND_ERROR = "backend_error"
+    DESTINATION_ERROR = "destination_error"
+    KILLED_BY_RUNNER = "killed_by_runner"
     QUEUE_ERROR = "queue_error"
+    RUNNING = "running"
     SOURCE_ERROR = "source_error"
-
+    SUCCESS = "success"
+    TRANSFORM_ERROR = "transform_error"
+    STREAM_ERROR = "stream_error"
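Because PipelineReturnStatus now mixes in str, its members compare equal to, and serialize as, their plain string values, which is convenient for the runner and monitoring code that reports these statuses. A quick hedged illustration (assuming the module imports as bizon.engine.pipeline.models):

from bizon.engine.pipeline.models import PipelineReturnStatus

# str-mixin Enum members behave like their raw values ...
assert PipelineReturnStatus.SUCCESS == "success"
assert PipelineReturnStatus("killed_by_runner") is PipelineReturnStatus.KILLED_BY_RUNNER

# ... so a status returned by a producer or consumer can be checked against plain strings.
status = PipelineReturnStatus.TRANSFORM_ERROR
if status != "success":
    print(f"pipeline ended with status: {status.value}")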
bizon/engine/pipeline/producer.py
CHANGED
@@ -1,8 +1,11 @@
 import ast
+import multiprocessing
+import multiprocessing.synchronize
+import threading
 import traceback
 from datetime import datetime
 from time import sleep
-from typing import Tuple
+from typing import Tuple, Union
 
 from loguru import logger
 from pytz import UTC
@@ -28,7 +31,7 @@ class Producer:
 
     @property
     def name(self) -> str:
-        return f"producer-{self.source.config.
+        return f"producer-{self.source.config.name}-{self.source.config.stream}"
 
     def get_or_create_cursor(self, job_id: str, session=None) -> Cursor:
         """Get or create a cursor for the current stream, return the cursor"""
@@ -47,8 +50,8 @@ class Producer:
 
             # Initialize the recovery from the DestinationCursor
             cursor = Cursor.from_db(
-                source_name=self.source.config.
-                stream_name=self.source.config.
+                source_name=self.source.config.name,
+                stream_name=self.source.config.stream,
                 job_id=job_id,
                 total_records=job.total_records_to_fetch,
                 iteration=cursor_from_db.to_source_iteration + 1,
@@ -60,8 +63,8 @@ class Producer:
             total_records = self.source.get_total_records_count()
             # Initialize the cursor
             cursor = Cursor(
-                source_name=self.source.config.
-                stream_name=self.source.config.
+                source_name=self.source.config.name,
+                stream_name=self.source.config.stream,
                 job_id=job_id,
                 total_records=total_records,
             )
@@ -99,7 +102,9 @@ class Producer:
 
        return False, queue_size, approximate_nb_records_in_queue
 
-    def run(
+    def run(
+        self, job_id: int, stop_event: Union[multiprocessing.synchronize.Event, threading.Event]
+    ) -> PipelineReturnStatus:
 
         return_value: PipelineReturnStatus = PipelineReturnStatus.SUCCESS
 
@@ -128,6 +133,10 @@ class Producer:
 
         while not cursor.is_finished:
 
+            if stop_event.is_set():
+                logger.info("Stop event is set, terminating producer ...")
+                return PipelineReturnStatus.KILLED_BY_RUNNER
+
             timestamp_start_iteration = datetime.now(tz=UTC)
 
             # Handle the case where last cursor already reach max_iterations
@@ -142,8 +151,8 @@ class Producer:
                 self.backend.create_source_cursor(
                     job_id=job_id,
                     name=self.bizon_config.name,
-                    source_name=self.source.config.
-                    stream_name=self.source.config.
+                    source_name=self.source.config.name,
+                    stream_name=self.source.config.stream,
                     iteration=cursor.iteration,
                     rows_fetched=cursor.rows_fetched,
                     next_pagination=cursor.pagination,
bizon/engine/queue/adapters/kafka/consumer.py
CHANGED
@@ -3,9 +3,9 @@ import json
 from kafka import KafkaConsumer
 from loguru import logger
 
-from bizon.
+from bizon.destination.destination import AbstractDestination
 from bizon.engine.pipeline.consumer import AbstractQueueConsumer
-from bizon.engine.queue.
+from bizon.engine.queue.config import QUEUE_TERMINATION, QueueMessage
 
 from .config import KafkaConfigDetails
 
bizon/engine/queue/adapters/kafka/queue.py
CHANGED
@@ -4,8 +4,9 @@ from typing import Union
 from kafka import KafkaProducer
 from loguru import logger
 
-from bizon.
-from bizon.engine.queue.
+from bizon.destination.destination import AbstractDestination
+from bizon.engine.queue.config import QUEUE_TERMINATION, QueueMessage
+from bizon.engine.queue.queue import AbstractQueue
 
 from .config import KafkaConfigDetails
 from .consumer import KafkaConsumer_