bizon 0.1.1__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bizon/alerting/alerts.py +0 -1
- bizon/common/models.py +184 -4
- bizon/connectors/destinations/bigquery/src/config.py +1 -1
- bizon/connectors/destinations/bigquery/src/destination.py +14 -9
- bizon/connectors/destinations/bigquery_streaming/config/bigquery_streaming.example.yml +74 -0
- bizon/connectors/destinations/bigquery_streaming/src/config.py +6 -5
- bizon/connectors/destinations/bigquery_streaming/src/destination.py +13 -9
- bizon/connectors/destinations/bigquery_streaming_v2/config/bigquery_streaming_v2.example.yml +79 -0
- bizon/connectors/destinations/bigquery_streaming_v2/src/config.py +6 -1
- bizon/connectors/destinations/bigquery_streaming_v2/src/destination.py +232 -49
- bizon/connectors/destinations/bigquery_streaming_v2/src/proto_utils.py +1 -13
- bizon/connectors/destinations/file/config/file.example.yml +40 -0
- bizon/connectors/destinations/file/src/config.py +2 -1
- bizon/connectors/destinations/file/src/destination.py +3 -6
- bizon/connectors/destinations/logger/config/logger.example.yml +30 -0
- bizon/connectors/destinations/logger/src/config.py +1 -2
- bizon/connectors/destinations/logger/src/destination.py +4 -2
- bizon/connectors/sources/cycle/src/source.py +2 -6
- bizon/connectors/sources/dummy/src/source.py +0 -4
- bizon/connectors/sources/gsheets/src/source.py +2 -3
- bizon/connectors/sources/hubspot/src/hubspot_base.py +0 -1
- bizon/connectors/sources/hubspot/src/hubspot_objects.py +3 -4
- bizon/connectors/sources/hubspot/src/models/hs_object.py +0 -1
- bizon/connectors/sources/kafka/config/kafka.example.yml +1 -3
- bizon/connectors/sources/kafka/config/kafka_debezium.example.yml +1 -3
- bizon/connectors/sources/kafka/config/kafka_streams.example.yml +124 -0
- bizon/connectors/sources/kafka/src/config.py +10 -12
- bizon/connectors/sources/kafka/src/decode.py +65 -60
- bizon/connectors/sources/kafka/src/source.py +182 -61
- bizon/connectors/sources/kafka/tests/kafka_pipeline.py +1 -1
- bizon/connectors/sources/notion/config/api_key.example.yml +35 -0
- bizon/connectors/sources/notion/src/__init__.py +0 -0
- bizon/connectors/sources/notion/src/config.py +59 -0
- bizon/connectors/sources/notion/src/source.py +1159 -0
- bizon/connectors/sources/notion/tests/notion_pipeline.py +7 -0
- bizon/connectors/sources/notion/tests/test_notion.py +113 -0
- bizon/connectors/sources/periscope/src/source.py +0 -6
- bizon/connectors/sources/pokeapi/src/source.py +0 -1
- bizon/connectors/sources/sana_ai/config/sana.example.yml +25 -0
- bizon/connectors/sources/sana_ai/src/source.py +85 -0
- bizon/destination/buffer.py +0 -1
- bizon/destination/config.py +9 -1
- bizon/destination/destination.py +38 -9
- bizon/engine/backend/adapters/sqlalchemy/backend.py +2 -5
- bizon/engine/backend/adapters/sqlalchemy/config.py +0 -1
- bizon/engine/config.py +0 -1
- bizon/engine/engine.py +0 -1
- bizon/engine/pipeline/consumer.py +0 -1
- bizon/engine/pipeline/producer.py +1 -5
- bizon/engine/queue/adapters/kafka/config.py +1 -1
- bizon/engine/queue/adapters/kafka/queue.py +0 -1
- bizon/engine/queue/adapters/python_queue/consumer.py +0 -1
- bizon/engine/queue/adapters/python_queue/queue.py +0 -2
- bizon/engine/queue/adapters/rabbitmq/consumer.py +0 -1
- bizon/engine/queue/adapters/rabbitmq/queue.py +0 -1
- bizon/engine/queue/config.py +0 -2
- bizon/engine/runner/adapters/process.py +0 -2
- bizon/engine/runner/adapters/streaming.py +114 -42
- bizon/engine/runner/adapters/thread.py +0 -2
- bizon/engine/runner/config.py +0 -1
- bizon/engine/runner/runner.py +14 -9
- bizon/monitoring/config.py +12 -2
- bizon/monitoring/datadog/monitor.py +100 -14
- bizon/monitoring/monitor.py +41 -12
- bizon/monitoring/noop/monitor.py +22 -3
- bizon/source/auth/authenticators/abstract_oauth.py +11 -3
- bizon/source/auth/authenticators/abstract_token.py +2 -1
- bizon/source/auth/authenticators/basic.py +1 -1
- bizon/source/auth/authenticators/cookies.py +2 -1
- bizon/source/auth/authenticators/oauth.py +8 -3
- bizon/source/config.py +0 -2
- bizon/source/cursor.py +8 -16
- bizon/source/discover.py +3 -6
- bizon/source/models.py +0 -1
- bizon/source/session.py +0 -1
- bizon/source/source.py +18 -3
- bizon/transform/config.py +0 -2
- bizon/transform/transform.py +0 -3
- {bizon-0.1.1.dist-info → bizon-0.2.0.dist-info}/METADATA +62 -41
- bizon-0.2.0.dist-info/RECORD +136 -0
- {bizon-0.1.1.dist-info → bizon-0.2.0.dist-info}/WHEEL +1 -1
- bizon-0.2.0.dist-info/entry_points.txt +2 -0
- bizon-0.1.1.dist-info/RECORD +0 -123
- bizon-0.1.1.dist-info/entry_points.txt +0 -3
- {bizon-0.1.1.dist-info → bizon-0.2.0.dist-info/licenses}/LICENSE +0 -0
bizon/connectors/sources/notion/tests/test_notion.py  ADDED
@@ -0,0 +1,113 @@
+"""
+Quick test script for Notion source intermediate functions.
+
+Usage:
+    # Set your token
+    export NOTION_TOKEN="your_notion_integration_token"
+
+    # Run interactively
+    python -i test_notion.py
+
+    # Then test functions:
+    >>> page = source.get_page("page-id-here")
+    >>> blocks = source.fetch_blocks_recursively("page-id-here")
+    >>> for b in blocks[:5]:
+    ...     print(source._block_to_markdown(b))
+"""
+
+import os
+
+from bizon.connectors.sources.notion.src.config import NotionSourceConfig, NotionStreams
+from bizon.connectors.sources.notion.src.source import NotionSource
+from bizon.source.auth.authenticators.token import TokenAuthParams
+from bizon.source.auth.config import AuthConfig, AuthType
+
+
+def create_notion_source(
+    token: str = None,
+    page_ids: list = None,
+    database_ids: list = None,
+    stream: NotionStreams = NotionStreams.BLOCKS,
+) -> NotionSource:
+    """Create a NotionSource instance for testing."""
+    token = token or os.environ.get("NOTION_TOKEN")
+    if not token:
+        raise ValueError("Provide token or set NOTION_TOKEN environment variable")
+
+    config = NotionSourceConfig(
+        name="notion",
+        stream=stream,
+        page_ids=page_ids or [],
+        database_ids=database_ids or [],
+        authentication=AuthConfig(
+            type=AuthType.BEARER,
+            params=TokenAuthParams(token=token),
+        ),
+        init_pipeline=False,
+        max_recursion_depth=30,
+    )
+    return NotionSource(config)
+
+
+# ==================== HELPER FUNCTIONS ====================
+
+
+def get_block(source: NotionSource, block_id: str) -> dict:
+    """Fetch a single block by ID."""
+    response = source.session.get(f"https://api.notion.com/v1/blocks/{block_id}")
+    response.raise_for_status()
+    return response.json()
+
+
+def get_page_markdown(source: NotionSource, page_id: str) -> str:
+    """Fetch all blocks from a page and return combined markdown."""
+    blocks = source.fetch_blocks_recursively(page_id, source_page_id=page_id)
+    lines = []
+    for block in blocks:
+        md = source._block_to_markdown(block)
+        if md:
+            # Add indentation based on depth
+            indent = "  " * block.get("depth", 0)
+            lines.append(f"{indent}{md}")
+    return "\n".join(lines)
+
+
+def inspect_blocks(source: NotionSource, page_id: str, max_blocks: int = 10):
+    """Fetch and print block details for inspection."""
+    blocks = source.fetch_blocks_recursively(page_id, source_page_id=page_id)
+    print(f"Found {len(blocks)} blocks")
+    for i, block in enumerate(blocks[:max_blocks]):
+        print(f"\n--- Block {i} ({block.get('type')}) ---")
+        print(f"ID: {block.get('id')}")
+        print(f"Depth: {block.get('depth')}, Order: {block.get('page_order')}")
+        print(f"Markdown: {source._block_to_markdown(block)}")
+
+
+def list_pages_in_database(source: NotionSource, database_id: str) -> list:
+    """List all page IDs in a database."""
+    return source.get_pages_from_database(database_id, apply_filter=False)
+
+
+# ==================== MAIN ====================
+
+if __name__ == "__main__":
+    # Create source if token is available
+    token = os.environ.get("NOTION_TOKEN")
+    if token:
+        source = create_notion_source(token=token)
+        print("NotionSource created and available as 'source'")
+        print("\nAvailable functions:")
+        print("  source.get_page(page_id)")
+        print("  source.get_database(database_id)")
+        print("  source.get_block_children(block_id)")
+        print("  source.fetch_blocks_recursively(page_id)")
+        print("  source._block_to_markdown(block)")
+        print("  source.search()")
+        print("\nHelper functions:")
+        print("  get_block(source, block_id)")
+        print("  get_page_markdown(source, page_id)")
+        print("  inspect_blocks(source, page_id)")
+        print("  list_pages_in_database(source, database_id)")
+    else:
+        print("Set NOTION_TOKEN env var or call:")
+        print("  source = create_notion_source(token='your_token')")
bizon/connectors/sources/periscope/src/source.py  CHANGED
@@ -41,7 +41,6 @@ class PeriscopeSourceConfig(SourceConfig):


 class PeriscopeSource(AbstractSource):
-
     def __init__(self, config: PeriscopeSourceConfig):
         super().__init__(config)
         self.config: PeriscopeSourceConfig = config
@@ -127,7 +126,6 @@ class PeriscopeSource(AbstractSource):
         return self.transform_response_to_source_iteration(records_json)

     def get_dashboards_metadata(self, pagination: dict = None) -> SourceIteration:
-
         params = {
             "client_site_id": self.config.client_site_id,
             "filters": [{"name": "typeFilter", "input": "Dashboard"}],
@@ -186,7 +184,6 @@ class PeriscopeSource(AbstractSource):
         dashboard_charts: List[dict] = []

         for iter_count in range(MAXIMUM_ITERATION):
-
             # Break the loop if no more charts are available
             if iter_count > 0 and len(iter_charts) == 0:
                 break
@@ -217,10 +214,8 @@ class PeriscopeSource(AbstractSource):
         iter_textboxes = response.json().get("TextBox")

         for chart in iter_charts:
-            # Only fetch charts connected to gorgias-growth-production
             if str(chart.get("database_id")) == str(self.config.database_id):
                 if chart.get("id") not in charts_list:
-
                     charts_list.add(chart.get("id"))

                     chart["raw_text"] = None
@@ -250,7 +245,6 @@ class PeriscopeSource(AbstractSource):
         return dashboard_charts

     def get_charts(self, pagination: dict = None) -> SourceIteration:
-
         BATCH_SIZE = 10

         if not pagination:
bizon/connectors/sources/sana_ai/config/sana.example.yml  ADDED
@@ -0,0 +1,25 @@
+name: sana to file
+
+source:
+  name: sana_ai
+  stream: insight_report
+  domain: my_domain
+  query: 'SELECT "user", "user_type", "user_role", "user_origin", "user_registration_step", "user_creation_date", "user_disabled_date", "user_completion_date", "user_status", "user_last_active_date", "user_attribute_evangelist" FROM "analytics"."users" ORDER BY "user" ASC'
+  authentication:
+    type: oauth
+    params:
+      token_refresh_endpoint: https://my_domain.sana.ai/api/token
+      client_id: <client_id>
+      client_secret: <client_secret>
+      grant_type: client_credentials
+      access_token_name: accessToken
+      expires_in_name: expiresIn
+      response_field_path: data
+      scopes:
+        - read
+        - write
+
+destination:
+  name: file
+  config:
+    destination_id: sana_ai_user_status
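The access_token_name, expires_in_name, and response_field_path parameters describe where the authenticator should look inside the token endpoint's JSON response. A hedged sketch of the response shape this example config implies (field values are illustrative):

# Token payload shape implied by the config above.
token_response = {
    "data": {                    # response_field_path: data
        "accessToken": "<jwt>",  # access_token_name: accessToken
        "expiresIn": 3600,       # expires_in_name: expiresIn
    }
}
payload = token_response["data"]
access_token = payload["accessToken"]
expires_in_seconds = payload["expiresIn"]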
bizon/connectors/sources/sana_ai/src/source.py  ADDED
@@ -0,0 +1,85 @@
+import csv
+import io
+import time
+from typing import Any, List, Tuple
+
+from loguru import logger
+from pydantic import Field
+from requests.auth import AuthBase
+
+from bizon.source.auth.builder import AuthBuilder
+from bizon.source.auth.config import AuthType
+from bizon.source.config import SourceConfig
+from bizon.source.models import SourceIteration, SourceRecord
+from bizon.source.source import AbstractSource
+
+
+class SanaSourceConfig(SourceConfig):
+    query: str = Field(..., description="Query to get the data from the Sana Insight API")
+    domain: str = Field(..., description="Domain of the Sana instance")
+
+
+class SanaSource(AbstractSource):
+    def __init__(self, config: SanaSourceConfig):
+        super().__init__(config)
+        self.config: SanaSourceConfig = config
+        self.base_url = f"https://{config.domain}.sana.ai/api/v1"
+
+    def get_authenticator(self) -> AuthBase:
+        if self.config.authentication.type.value == AuthType.OAUTH:
+            return AuthBuilder.oauth2(params=self.config.authentication.params)
+
+    @staticmethod
+    def streams() -> List[str]:
+        return ["insight_report"]
+
+    @staticmethod
+    def get_config_class() -> SourceConfig:
+        return SanaSourceConfig
+
+    def check_connection(self) -> Tuple[bool | Any | None]:
+        return True, None
+
+    def get_total_records_count(self) -> int | None:
+        return None
+
+    def create_insight_report_job(self, query: str) -> str:
+        """Create an insight report for the given query"""
+        response = self.session.post(f"{self.base_url}/reports/query", json={"query": query, "format": "csv"})
+        return response.json()["data"]["jobId"]
+
+    def get_insight_report_job(self, job_id: str) -> dict:
+        """Get an insight report job for the given job id"""
+        response = self.session.get(f"{self.base_url}/reports/jobs/{job_id}")
+        return response.json()
+
+    def get_insight_report(self, pagination: dict) -> SourceIteration:
+        """Return all insight report for the given query"""
+
+        job_id = self.create_insight_report_job(self.config.query)
+        logger.info(f"Created insight report job {job_id} for query {self.config.query}")
+
+        response = self.get_insight_report_job(job_id)
+        status = response["data"]["status"]
+        while status != "successful":
+            time.sleep(3)
+            response = self.get_insight_report_job(job_id)
+            status = response["data"]["status"]
+            logger.info(f"Insight report job {job_id} is {status}")
+
+        link = response["data"]["link"]["url"]
+        logger.info(f"Link for insight report job {job_id} is {link}")
+
+        csv_response = self.session.get(link)
+        csv_content = csv_response.content.decode("utf-8")
+
+        reader = csv.DictReader(io.StringIO(csv_content))
+        data = [SourceRecord(id=str(i), data=row) for i, row in enumerate(reader)]
+
+        return SourceIteration(records=data, next_pagination={})
+
+    def get(self, pagination: dict = None) -> SourceIteration:
+        if self.config.stream == "insight_report":
+            return self.get_insight_report(pagination)
+
+        raise NotImplementedError(f"Stream {self.config.stream} not implemented for Sana")
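Note that the polling loop in get_insight_report sleeps 3 s between checks and only exits on a "successful" status, so a job that ends in a failed state would spin forever. A hedged variant with a deadline (the names MAX_WAIT_SECONDS and wait_for_job are illustrative, not part of the connector):

import time

MAX_WAIT_SECONDS = 300  # illustrative cap, not a bizon setting

def wait_for_job(source, job_id: str) -> dict:
    """Poll the job endpoint like get_insight_report, but give up after a deadline."""
    deadline = time.monotonic() + MAX_WAIT_SECONDS
    response = source.get_insight_report_job(job_id)
    while response["data"]["status"] != "successful":
        if time.monotonic() > deadline:
            raise TimeoutError(f"Insight report job {job_id} did not finish in {MAX_WAIT_SECONDS}s")
        time.sleep(3)
        response = source.get_insight_report_job(job_id)
    return response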
bizon/destination/buffer.py  CHANGED
@@ -9,7 +9,6 @@ from .models import destination_record_schema


 class DestinationBuffer:
-
     def __init__(self, buffer_size: int, buffer_flush_timeout: int) -> None:
         self.buffer_size = buffer_size * 1024 * 1024  # Convert to bytes
         self.buffer_flush_timeout = buffer_flush_timeout
bizon/destination/config.py  CHANGED
@@ -28,7 +28,6 @@ class RecordSchemaConfig(BaseModel):


 class AbstractDestinationDetailsConfig(BaseModel):
-
     # Forbid extra keys in the model
     model_config = ConfigDict(extra="forbid")

@@ -42,6 +41,11 @@ class AbstractDestinationDetailsConfig(BaseModel):
         description="Maximum time in seconds for buffering after which the records will be written to the destination. Set to 0 to deactivate the timeout buffer check.",  # noqa
     )

+    max_concurrent_threads: int = Field(
+        default=10,
+        description="Maximum number of concurrent threads to use for writing to the destination.",
+    )
+
     record_schemas: Optional[list[RecordSchemaConfig]] = Field(
         default=None, description="Schemas for the records. Required if unnest is set to true."
     )
@@ -71,4 +75,8 @@ class AbstractDestinationConfig(BaseModel):
     model_config = ConfigDict(extra="forbid")

     name: DestinationTypes = Field(..., description="Name of the destination")
+    alias: str = Field(
+        ...,
+        description="Alias of the destination, used for tracking the system name (ie bigquery for bigquery_streaming)",
+    )
     config: AbstractDestinationDetailsConfig = Field(..., description="Configuration for the destination")
bizon/destination/destination.py  CHANGED
@@ -10,6 +10,7 @@ from pydantic import BaseModel, Field
 from bizon.common.models import SyncMetadata
 from bizon.engine.backend.backend import AbstractBackend
 from bizon.engine.backend.models import JobStatus
+from bizon.monitoring.monitor import AbstractMonitor
 from bizon.source.callback import AbstractSourceCallback
 from bizon.source.config import SourceSyncModes

@@ -43,17 +44,18 @@ class DestinationIteration(BaseModel):


 class AbstractDestination(ABC):
-
     def __init__(
         self,
         sync_metadata: SyncMetadata,
         config: AbstractDestinationDetailsConfig,
         backend: AbstractBackend,
         source_callback: AbstractSourceCallback,
+        monitor: AbstractMonitor,
     ):
         self.sync_metadata = sync_metadata
         self.config = config
         self.backend = backend
+        self.monitor = monitor
         self.buffer = DestinationBuffer(
             buffer_size=self.config.buffer_size, buffer_flush_timeout=self.config.buffer_flush_timeout
         )
@@ -141,7 +143,6 @@ class AbstractDestination(ABC):

         # Last iteration, write all records to destination
         if last_iteration:
-
             if self.buffer.df_destination_records.height == 0 and self.buffer.is_empty:
                 logger.info("No records to write to destination, already written, buffer is empty.")
                 return DestinationBufferStatus.RECORDS_WRITTEN
@@ -191,6 +192,14 @@ class AbstractDestination(ABC):
         logger.info(
             f"Buffer ripeness {round(self.buffer.ripeness / 60, 2)} min. Max ripeness {round(self.buffer.buffer_flush_timeout / 60, 2)} min."  # noqa
         )
+        logger.info(
+            f"Current records size to process: {round(df_destination_records.estimated_size(unit='b') / 1024 / 1024, 2)} Mb."
+        )
+
+        if df_destination_records.estimated_size(unit="b") > self.buffer.buffer_size:
+            raise ValueError(
+                f"Records size {round(df_destination_records.estimated_size(unit='b') / 1024 / 1024, 2)} Mb is greater than buffer size {round(self.buffer.buffer_size / 1024 / 1024, 2)} Mb. Please increase destination buffer_size or reduce batch_size from the source."
+            )

         # Write buffer to destination if buffer is ripe and create a new buffer for the new iteration
         if self.buffer.is_ripe:
@@ -276,15 +285,19 @@ class DestinationFactory:
         config: AbstractDestinationConfig,
         backend: AbstractBackend,
         source_callback: AbstractSourceCallback,
+        monitor: AbstractMonitor,
     ) -> AbstractDestination:
-
         if config.name == DestinationTypes.LOGGER:
             from bizon.connectors.destinations.logger.src.destination import (
                 LoggerDestination,
             )

             return LoggerDestination(
-                sync_metadata=sync_metadata,
+                sync_metadata=sync_metadata,
+                config=config.config,
+                backend=backend,
+                source_callback=source_callback,
+                monitor=monitor,
             )

         elif config.name == DestinationTypes.BIGQUERY:
@@ -293,7 +306,11 @@ class DestinationFactory:
             )

             return BigQueryDestination(
-                sync_metadata=sync_metadata,
+                sync_metadata=sync_metadata,
+                config=config.config,
+                backend=backend,
+                source_callback=source_callback,
+                monitor=monitor,
             )

         elif config.name == DestinationTypes.BIGQUERY_STREAMING:
@@ -302,7 +319,11 @@ class DestinationFactory:
             )

             return BigQueryStreamingDestination(
-                sync_metadata=sync_metadata,
+                sync_metadata=sync_metadata,
+                config=config.config,
+                backend=backend,
+                source_callback=source_callback,
+                monitor=monitor,
             )

         elif config.name == DestinationTypes.BIGQUERY_STREAMING_V2:
@@ -311,7 +332,11 @@ class DestinationFactory:
             )

             return BigQueryStreamingV2Destination(
-                sync_metadata=sync_metadata,
+                sync_metadata=sync_metadata,
+                config=config.config,
+                backend=backend,
+                source_callback=source_callback,
+                monitor=monitor,
             )

         elif config.name == DestinationTypes.FILE:
@@ -320,7 +345,11 @@ class DestinationFactory:
             )

             return FileDestination(
-                sync_metadata=sync_metadata,
+                sync_metadata=sync_metadata,
+                config=config.config,
+                backend=backend,
+                source_callback=source_callback,
+                monitor=monitor,
             )

-        raise ValueError(f"Destination {config.name}
+        raise ValueError(f"Destination {config.name} with params {config} not found")
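The new guard above fails fast when a single incoming batch is larger than the entire flush buffer, since such a batch could never drain through buffering. A standalone sketch of that check; the function name is illustrative, and buffer_size is configured in Mb and converted to bytes exactly as in DestinationBuffer.__init__:

import polars as pl

def check_batch_fits(df: pl.DataFrame, buffer_size_mb: int) -> None:
    """Raise if one batch can never fit in the destination buffer."""
    buffer_size_bytes = buffer_size_mb * 1024 * 1024  # same Mb -> bytes conversion as DestinationBuffer
    batch_bytes = df.estimated_size(unit="b")         # polars in-memory size estimate
    if batch_bytes > buffer_size_bytes:
        raise ValueError(
            f"Batch of {batch_bytes / 1024 / 1024:.2f} Mb exceeds buffer of {buffer_size_mb} Mb; "
            "increase destination buffer_size or reduce the source batch_size"
        )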
bizon/engine/backend/adapters/sqlalchemy/backend.py  CHANGED
@@ -5,7 +5,7 @@ from typing import Optional, Union
 from loguru import logger
 from pytz import UTC
 from sqlalchemy import Result, Select, create_engine, func, inspect, select, update
-from sqlalchemy.engine import Engine
+from sqlalchemy.engine import Engine
 from sqlalchemy.orm import Session, scoped_session, sessionmaker

 from bizon.engine.backend.backend import AbstractBackend
@@ -26,7 +26,6 @@ from .config import BigQueryConfigDetails, PostgresConfigDetails, SQLiteConfigDe


 class SQLAlchemyBackend(AbstractBackend):
-
     def __init__(self, config: Union[PostgresConfigDetails, SQLiteConfigDetails], type: BackendTypes, **kwargs):
         super().__init__(config, type)

@@ -81,7 +80,6 @@ class SQLAlchemyBackend(AbstractBackend):
         )

     def _get_engine(self) -> Engine:
-
         if self.type == BackendTypes.BIGQUERY:
             return self._get_engine_bigquery()

@@ -96,7 +94,7 @@ class SQLAlchemyBackend(AbstractBackend):
         # ONLY FOR UNIT TESTS: SQLite in memory
         if self.type == BackendTypes.SQLITE_IN_MEMORY:
             return create_engine(
-
+                "sqlite:///:memory:",
                 echo=self.config.echoEngine,
                 connect_args={"check_same_thread": False},
             )
@@ -388,7 +386,6 @@ class SQLAlchemyBackend(AbstractBackend):
         pagination: Optional[dict] = None,
         session: Session | None = None,
     ) -> DestinationCursor:
-
         destination_cursor = DestinationCursor(
             name=name,
             source_name=source_name,
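For context on the SQLite hunk: check_same_thread=False relaxes SQLite's default rule that a connection may only be used by its creating thread. With SQLAlchemy's default pooling for ":memory:" URLs each thread still gets its own connection (and thus its own database), so tests that need one database shared across threads typically also pass poolclass=StaticPool; that extra is shown here as an assumption, not part of this hunk:

from sqlalchemy import create_engine, text
from sqlalchemy.pool import StaticPool

# Hedged sketch of a thread-shareable in-memory engine; echo=False stands in
# for config.echoEngine.
engine = create_engine(
    "sqlite:///:memory:",
    echo=False,
    connect_args={"check_same_thread": False},  # allow use from other threads
    poolclass=StaticPool,  # one shared connection -> one shared in-memory DB
)

with engine.connect() as conn:
    assert conn.execute(text("SELECT 1")).scalar() == 1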
bizon/engine/config.py  CHANGED
bizon/engine/engine.py  CHANGED
@@ -21,7 +21,6 @@ def replace_env_variables_in_config(config: dict) -> dict:
 class RunnerFactory:
     @staticmethod
     def create_from_config_dict(config: dict) -> AbstractRunner:
-
         # Replace env variables in config
         config = replace_env_variables_in_config(config=config)

bizon/engine/pipeline/consumer.py  CHANGED
@@ -36,7 +36,6 @@ class AbstractQueueConsumer(ABC):
         pass

     def process_queue_message(self, queue_message: QueueMessage) -> PipelineReturnStatus:
-
         # Apply the transformation
         try:
             df_source_records = self.transform.apply_transforms(df_source_records=queue_message.df_source_records)
bizon/engine/pipeline/producer.py  CHANGED
@@ -105,7 +105,6 @@ class Producer:
     def run(
         self, job_id: int, stop_event: Union[multiprocessing.synchronize.Event, threading.Event]
     ) -> PipelineReturnStatus:
-
         return_value: PipelineReturnStatus = PipelineReturnStatus.SUCCESS

         # Init queue
@@ -132,7 +131,6 @@ class Producer:
             return PipelineReturnStatus.BACKEND_ERROR

         while not cursor.is_finished:
-
             if stop_event.is_set():
                 logger.info("Stop event is set, terminating producer ...")
                 return PipelineReturnStatus.KILLED_BY_RUNNER
@@ -226,9 +224,7 @@ class Producer:
         items_in_queue = f"{self.queue.get_size()} items in queue." if self.queue.get_size() else ""

         logger.info(
-            (
-                f"Iteration {cursor.iteration} finished in {datetime.now(tz=UTC) - timestamp_start_iteration}. {items_in_queue}"
-            )
+            f"Iteration {cursor.iteration} finished in {datetime.now(tz=UTC) - timestamp_start_iteration}. {items_in_queue}"
         )

         logger.info("Terminating destination ...")
bizon/engine/queue/adapters/python_queue/consumer.py  CHANGED
@@ -35,7 +35,6 @@ class PythonQueueConsumer(AbstractQueueConsumer):
         self.monitor.track_pipeline_status(PipelineReturnStatus.RUNNING)

     def run(self, stop_event: Union[threading.Event, multiprocessing.synchronize.Event]) -> PipelineReturnStatus:
-
         while True:
             # Handle kill signal from the runner
             if stop_event.is_set():
bizon/engine/queue/adapters/python_queue/queue.py  CHANGED
@@ -9,7 +9,6 @@ from bizon.destination.destination import AbstractDestination
 from bizon.engine.queue.config import QUEUE_TERMINATION, QueueMessage
 from bizon.engine.queue.queue import AbstractQueue, AbstractQueueConsumer
 from bizon.monitoring.monitor import AbstractMonitor
-from bizon.source.callback import AbstractSourceCallback
 from bizon.source.models import SourceIteration
 from bizon.transform.transform import Transform

@@ -18,7 +17,6 @@ from .consumer import PythonQueueConsumer


 class PythonQueue(AbstractQueue):
-
     def __init__(self, config: PythonQueueConfigDetails, **kwargs) -> None:
         super().__init__(config)
         self.config: PythonQueueConfigDetails = config
bizon/engine/queue/adapters/rabbitmq/consumer.py  CHANGED
@@ -24,7 +24,6 @@ class RabbitMQConsumer(AbstractQueueConsumer):
         channel.queue_declare(queue=self.config.queue.queue_name)

         for method_frame, properties, body in channel.consume(self.config.queue.queue_name):
-
             queue_message = QueueMessage.model_validate_json(body)
             if queue_message.signal == QUEUE_TERMINATION:
                 logger.info("Received termination signal, waiting for destination to close gracefully ...")
bizon/engine/queue/config.py  CHANGED
@@ -27,7 +27,6 @@ class QueueTypes(str, Enum):


 class AbastractQueueConfigDetails(BaseModel, ABC):
-
     # Forbid extra keys in the model
     model_config = ConfigDict(extra="forbid")

@@ -38,7 +37,6 @@ class AbastractQueueConfigDetails(BaseModel, ABC):


 class AbstractQueueConfig(BaseModel, ABC):
-
     # Forbid extra keys in the model
     model_config = ConfigDict(extra="forbid")

bizon/engine/runner/adapters/process.py  CHANGED
@@ -8,7 +8,6 @@ from bizon.engine.runner.runner import AbstractRunner


 class ProcessRunner(AbstractRunner):
-
     def __init__(self, config: dict):
         super().__init__(config)

@@ -36,7 +35,6 @@ class ProcessRunner(AbstractRunner):
         with concurrent.futures.ProcessPoolExecutor(
             max_workers=self.bizon_config.engine.runner.config.max_workers
         ) as executor:
-
             future_producer = executor.submit(
                 AbstractRunner.instanciate_and_run_producer,
                 self.bizon_config,
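The ProcessRunner hunk shows the producer being submitted to a ProcessPoolExecutor. A minimal self-contained sketch of that pattern; the function names are illustrative stand-ins (the real runner submits AbstractRunner.instanciate_and_run_producer and a consumer counterpart):

import concurrent.futures

def run_producer(job_id: int) -> str:
    # stand-in for AbstractRunner.instanciate_and_run_producer
    return f"producer for job {job_id} finished"

def run_consumer(job_id: int) -> str:
    # stand-in for the consumer side of the pipeline
    return f"consumer for job {job_id} finished"

if __name__ == "__main__":
    # Submitted callables must be picklable, hence module-level functions.
    with concurrent.futures.ProcessPoolExecutor(max_workers=2) as executor:
        futures = [executor.submit(run_producer, 1), executor.submit(run_consumer, 1)]
        for future in concurrent.futures.as_completed(futures):
            print(future.result())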
|