bizon 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bizon/alerting/__init__.py +0 -0
- bizon/alerting/alerts.py +23 -0
- bizon/alerting/models.py +28 -0
- bizon/alerting/slack/__init__.py +0 -0
- bizon/alerting/slack/config.py +5 -0
- bizon/alerting/slack/handler.py +39 -0
- bizon/cli/main.py +7 -3
- bizon/common/models.py +33 -7
- bizon/{destinations → connectors/destinations}/bigquery/config/bigquery.example.yml +3 -4
- bizon/connectors/destinations/bigquery/src/config.py +128 -0
- bizon/{destinations → connectors/destinations}/bigquery/src/destination.py +48 -25
- bizon/connectors/destinations/bigquery_streaming/src/config.py +57 -0
- bizon/connectors/destinations/bigquery_streaming/src/destination.py +377 -0
- bizon/connectors/destinations/bigquery_streaming_v2/src/config.py +57 -0
- bizon/connectors/destinations/bigquery_streaming_v2/src/destination.py +446 -0
- bizon/{destinations/bigquery_streaming → connectors/destinations/bigquery_streaming_v2}/src/proto_utils.py +30 -36
- bizon/{destinations → connectors/destinations}/file/src/config.py +9 -3
- bizon/connectors/destinations/file/src/destination.py +56 -0
- bizon/{destinations → connectors/destinations}/logger/src/config.py +2 -1
- bizon/{destinations → connectors/destinations}/logger/src/destination.py +18 -3
- bizon/connectors/sources/cycle/config/cycle.example.yml +15 -0
- bizon/connectors/sources/cycle/src/source.py +133 -0
- bizon/{sources/periscope/tests/periscope_pipeline_dashboard.py → connectors/sources/cycle/tests/cycle_customers.py} +1 -1
- bizon/{sources → connectors/sources}/dummy/config/dummy.example.yml +2 -2
- bizon/{sources → connectors/sources}/dummy/src/fake_api.py +6 -1
- bizon/{sources → connectors/sources}/dummy/src/source.py +18 -5
- bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline.py +2 -2
- bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_bigquery_backend.py +2 -2
- bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_kafka.py +2 -2
- bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_rabbitmq.py +2 -2
- bizon/connectors/sources/dummy/tests/dummy_pipeline_unnest.py +29 -0
- bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_write_data_bigquery.py +2 -2
- bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_write_data_bigquery_through_kafka.py +2 -2
- bizon/{sources → connectors/sources}/gsheets/config/default_auth.example.yml +4 -2
- bizon/{sources → connectors/sources}/gsheets/config/service_account.example.yml +4 -2
- bizon/{sources → connectors/sources}/hubspot/config/api_key.example.yml +4 -2
- bizon/{sources → connectors/sources}/hubspot/config/oauth.example.yml +4 -2
- bizon/{sources → connectors/sources}/hubspot/src/hubspot_objects.py +1 -1
- bizon/{sources → connectors/sources}/kafka/config/kafka.example.yml +3 -5
- bizon/connectors/sources/kafka/config/kafka_debezium.example.yml +110 -0
- bizon/connectors/sources/kafka/src/callback.py +18 -0
- bizon/connectors/sources/kafka/src/config.py +69 -0
- bizon/connectors/sources/kafka/src/decode.py +93 -0
- bizon/connectors/sources/kafka/src/source.py +381 -0
- bizon/connectors/sources/kafka/tests/kafka_pipeline.py +7 -0
- bizon/connectors/sources/periscope/config/periscope_charts.example.yml +20 -0
- bizon/connectors/sources/periscope/config/periscope_dashboards.example.yml +20 -0
- bizon/{sources → connectors/sources}/periscope/src/source.py +137 -13
- bizon/connectors/sources/periscope/tests/periscope_pipeline_dashboard.py +9 -0
- bizon/connectors/sources/pokeapi/config/pokeapi_pokemon_to_json.example.yml +19 -0
- bizon/connectors/sources/pokeapi/config/pokeapi_pokemon_to_logger.example.yml +10 -0
- bizon/connectors/sources/pokeapi/src/source.py +79 -0
- bizon/{destinations → destination}/buffer.py +5 -0
- bizon/destination/config.py +83 -0
- bizon/{destinations → destination}/destination.py +103 -15
- bizon/engine/backend/adapters/sqlalchemy/backend.py +3 -1
- bizon/engine/engine.py +20 -1
- bizon/engine/pipeline/consumer.py +73 -5
- bizon/engine/pipeline/models.py +8 -3
- bizon/engine/pipeline/producer.py +18 -9
- bizon/engine/queue/adapters/kafka/consumer.py +2 -2
- bizon/engine/queue/adapters/kafka/queue.py +3 -2
- bizon/engine/queue/adapters/python_queue/consumer.py +40 -23
- bizon/engine/queue/adapters/python_queue/queue.py +19 -9
- bizon/engine/queue/adapters/rabbitmq/consumer.py +3 -6
- bizon/engine/queue/adapters/rabbitmq/queue.py +3 -2
- bizon/engine/queue/config.py +16 -0
- bizon/engine/queue/queue.py +17 -16
- bizon/engine/runner/adapters/process.py +15 -2
- bizon/engine/runner/adapters/streaming.py +121 -0
- bizon/engine/runner/adapters/thread.py +32 -9
- bizon/engine/runner/config.py +28 -0
- bizon/engine/runner/runner.py +113 -24
- bizon/monitoring/__init__.py +0 -0
- bizon/monitoring/config.py +39 -0
- bizon/monitoring/datadog/__init__.py +0 -0
- bizon/monitoring/datadog/monitor.py +153 -0
- bizon/monitoring/monitor.py +71 -0
- bizon/monitoring/noop/__init__.py +0 -0
- bizon/monitoring/noop/monitor.py +30 -0
- bizon/source/callback.py +24 -0
- bizon/source/config.py +3 -3
- bizon/source/cursor.py +1 -1
- bizon/source/discover.py +4 -3
- bizon/source/models.py +4 -2
- bizon/source/source.py +10 -2
- bizon/transform/config.py +8 -0
- bizon/transform/transform.py +48 -0
- {bizon-0.1.0.dist-info → bizon-0.1.2.dist-info}/METADATA +23 -6
- bizon-0.1.2.dist-info/RECORD +123 -0
- {bizon-0.1.0.dist-info → bizon-0.1.2.dist-info}/WHEEL +1 -1
- bizon/destinations/bigquery/src/config.py +0 -51
- bizon/destinations/bigquery_streaming/src/config.py +0 -43
- bizon/destinations/bigquery_streaming/src/destination.py +0 -154
- bizon/destinations/config.py +0 -47
- bizon/destinations/file/src/destination.py +0 -27
- bizon/sources/kafka/src/source.py +0 -357
- bizon/sources/kafka/tests/kafka_pipeline.py +0 -9
- bizon/sources/periscope/config/periscope_charts.example.yml +0 -26
- bizon/sources/periscope/config/periscope_dashboards.example.yml +0 -26
- bizon-0.1.0.dist-info/RECORD +0 -93
- /bizon/{sources → connectors/sources}/gsheets/src/source.py +0 -0
- /bizon/{sources → connectors/sources}/gsheets/tests/gsheets_pipeline.py +0 -0
- /bizon/{sources → connectors/sources}/hubspot/src/hubspot_base.py +0 -0
- /bizon/{sources → connectors/sources}/hubspot/src/models/hs_object.py +0 -0
- /bizon/{sources → connectors/sources}/hubspot/tests/hubspot_pipeline.py +0 -0
- /bizon/{sources → connectors/sources}/periscope/tests/periscope_pipeline_charts.py +0 -0
- /bizon/{destinations → destination}/models.py +0 -0
- {bizon-0.1.0.dist-info → bizon-0.1.2.dist-info}/LICENSE +0 -0
- {bizon-0.1.0.dist-info → bizon-0.1.2.dist-info}/entry_points.txt +0 -0
bizon/{sources → connectors/sources}/periscope/src/source.py

```diff
@@ -15,8 +15,6 @@ from bizon.source.source import AbstractSource
 
 BASE_URL = "https://app.periscopedata.com"
 
-URL_OWNERS = f"{BASE_URL}/users/owners"
-URL_VIEWS = f"{BASE_URL}/login_state/sql_views"
 URL_DATABASES = f"{BASE_URL}/welcome/remaining_state/site_models"
 
 
```
```diff
@@ -39,6 +37,7 @@ class PeriscopeSourceConfig(SourceConfig):
     workspace_name: str = Field(..., description="Name of the workspace")
     client_site_id: int = Field(..., description="Client site ID")
     database_id: int = Field(..., description="Fetch charts connected to this Database ID")
+    x_csrf_token: str = Field(..., description="CSRF token for the requests")
 
 
 class PeriscopeSource(AbstractSource):
```
```diff
@@ -49,7 +48,14 @@ class PeriscopeSource(AbstractSource):
 
     @staticmethod
     def streams() -> List[str]:
-        return [
+        return [
+            "charts",
+            "dashboards_metadata",
+            "dashboards",
+            "databases",
+            "users",
+            "views",
+        ]
 
     @staticmethod
     def get_config_class() -> AbstractSource:
```
```diff
@@ -86,8 +92,9 @@ class PeriscopeSource(AbstractSource):
                     "sec-fetch-dest": "empty",
                     "sec-fetch-mode": "cors",
                     "sec-fetch-site": "same-origin",
-                    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
+                    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
                     "x-requested-with": "XMLHttpRequest",
+                    "x-csrf-token": self.config.x_csrf_token,
                 },
             )
         )
```
```diff
@@ -97,10 +104,8 @@ class PeriscopeSource(AbstractSource):
     def get_total_records_count(self) -> int | None:
         return None
 
-
-
-        records_json = response.json()["Dashboard"]
-
+    @staticmethod
+    def transform_response_to_source_iteration(records: List[dict]) -> SourceIteration:
         return SourceIteration(
             next_pagination=dict(),
             records=[
```
```diff
@@ -108,14 +113,69 @@ class PeriscopeSource(AbstractSource):
                     id=record["id"],
                     data=record,
                 )
-                for record in
+                for record in records
             ],
         )
 
+    def get_dashboards(self, pagination: dict = None) -> SourceIteration:
+        response = self.session.call(
+            method="GET",
+            url=f"{BASE_URL}/login_state/dashboards",
+            params=self.http_params,
+        )
+        records_json = response.json()["Dashboard"]
+        return self.transform_response_to_source_iteration(records_json)
+
+    def get_dashboards_metadata(self, pagination: dict = None) -> SourceIteration:
+
+        params = {
+            "client_site_id": self.config.client_site_id,
+            "filters": [{"name": "typeFilter", "input": "Dashboard"}],
+            "limit": 2000,
+            "query_plan": None,
+        }
+
+        response = self.session.call(
+            method="POST",
+            url=f"{BASE_URL}/global_search/search",
+            json=params,
+        )
+        records_json = response.json()["results"]["data"]
+        return self.transform_response_to_source_iteration(records_json)
+
     def get_dashboard_ids(self) -> List[int]:
         source_iteration = self.get_dashboards()
         return [record.id for record in source_iteration.records]
 
+    def _extract_raw_text_from_textbox(self, data: dict) -> str:
+        raw_text = []
+
+        def clean_text(text: str):
+            """Strip Byte Order Mark (BOM) and other unwanted whitespace."""
+            return text.replace("\ufeff", "").strip()
+
+        def traverse_nodes(nodes):
+            for node in nodes:
+                if node["object"] == "text":
+                    for leaf in node["leaves"]:
+                        raw_text.append(clean_text(leaf["text"]))
+                elif node["type"] == "link" and "data" in node and "url" in node["data"]:
+                    link_text = []
+                    for leaf in node["nodes"][0]["leaves"]:  # Assume a single text node in link
+                        link_text.append(clean_text(leaf["text"]))
+                    # Format as Markdown link
+                    raw_text.append(f"[{''.join(link_text)}]({node['data']['url']})")
+                elif "nodes" in node:  # If there are nested nodes
+                    traverse_nodes(node["nodes"])
+
+        if not data["text_data"]:
+            return ""
+
+        # Start traversal from the root nodes
+        traverse_nodes(data["text_data"]["document"]["nodes"])
+
+        return " ".join(raw_text)
+
     def _get_charts(self, dashboard_id: int) -> List[dict]:
         MAXIMUM_ITERATION: int = 1000
         iter_count: int = 0
```
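The `_extract_raw_text_from_textbox` helper added above walks a Slate-style rich-text tree: text leaves are collected directly, link nodes are rendered as Markdown links, and anything else recurses into its children. A quick illustrative check against a hand-written payload (the real Periscope `text_data` shape is assumed here, not documented; `self` is unused in the body, so the method can be called unbound):

```python
# Hypothetical Slate-style payload mimicking the node/leaves shape the
# traversal expects: one paragraph with a BOM-prefixed text leaf and a link.
sample = {
    "text_data": {
        "document": {
            "nodes": [
                {
                    "object": "block",
                    "type": "paragraph",
                    "nodes": [
                        {"object": "text", "leaves": [{"text": "\ufeffRevenue dashboard "}]},
                        {
                            "object": "inline",
                            "type": "link",
                            "data": {"url": "https://example.com/docs"},
                            "nodes": [{"object": "text", "leaves": [{"text": "docs"}]}],
                        },
                    ],
                }
            ]
        }
    }
}

# self is never touched by the method body, so None stands in for it here
print(PeriscopeSource._extract_raw_text_from_textbox(None, sample))
# Revenue dashboard [docs](https://example.com/docs)
```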
```diff
@@ -154,12 +214,37 @@ class PeriscopeSource(AbstractSource):
                 iter_count += 1
                 iter_charts = response.json().get("Widget")
 
+                iter_textboxes = response.json().get("TextBox")
+
                 for chart in iter_charts:
+                    # Only fetch charts connected to gorgias-growth-production
                     if str(chart.get("database_id")) == str(self.config.database_id):
                         if chart.get("id") not in charts_list:
+
                             charts_list.add(chart.get("id"))
+
+                            chart["raw_text"] = None
+
+                            # In case the chart is a textbox, we parse the raw text
+                            if chart.get("content_id"):
+                                text_box = list(
+                                    filter(
+                                        lambda x: x.get("id") == chart.get("content_id"),
+                                        iter_textboxes,
+                                    )
+                                )
+
+                                if not text_box:
+                                    logger.error(
+                                        f"Failed to fetch the textbox with id: {chart.get('content_id')} for chart with id: {chart.get('id')}"
+                                    )
+
+                                if text_box:
+                                    chart["raw_text"] = self._extract_raw_text_from_textbox(text_box[0])
+
                             dashboard_charts.append(chart)
-            except Exception:
+            except Exception as e:
+                logger.error(f"Failed to fetch the dashboard with id: {dashboard_id} with error: {e}")
                 continue
 
         return dashboard_charts
```
```diff
@@ -200,11 +285,50 @@ class PeriscopeSource(AbstractSource):
             ],
         )
 
+    def get_views(self, pagination: dict = None) -> SourceIteration:
+        response = self.session.call(
+            method="GET",
+            url=f"{BASE_URL}/login_state/sql_views",
+            params=self.http_params,
+        )
+        records_json = response.json()["SqlView"]
+        return self.transform_response_to_source_iteration(records_json)
+
+    def get_users(self, pagination: dict = None) -> SourceIteration:
+        response = self.session.call(
+            method="GET",
+            url=f"{BASE_URL}/users/owners",
+            params=self.http_params,
+        )
+        records_json = response.json()
+        return self.transform_response_to_source_iteration(records_json)
+
+    def get_databases(self, pagination: dict = None) -> SourceIteration:
+        response = self.session.call(
+            method="GET",
+            url=URL_DATABASES,
+            params=self.http_params,
+        )
+        records_json = response.json()["Database"]
+        return self.transform_response_to_source_iteration(records_json)
+
     def get(self, pagination: dict = None) -> SourceIteration:
-        if self.config.
+        if self.config.stream == "dashboards":
             return self.get_dashboards(pagination)
 
-
+        elif self.config.stream == "charts":
             return self.get_charts(pagination)
 
-
+        elif self.config.stream == "dashboards_metadata":
+            return self.get_dashboards_metadata(pagination)
+
+        elif self.config.stream == "views":
+            return self.get_views(pagination)
+
+        elif self.config.stream == "users":
+            return self.get_users(pagination)
+
+        elif self.config.stream == "databases":
+            return self.get_databases(pagination)
+
+        raise NotImplementedError(f"Stream {self.config.stream} not implemented for Periscope")
```
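All of the new endpoint readers funnel through the shared `transform_response_to_source_iteration` static method, which wraps a JSON list into a single, unpaginated `SourceIteration`. A quick check with a hand-written record list (string ids chosen to sidestep any type-coercion questions in `SourceRecord`):

```python
from bizon.source.models import SourceIteration

it = PeriscopeSource.transform_response_to_source_iteration(
    [{"id": "101", "name": "Revenue"}, {"id": "102", "name": "Churn"}]
)
assert isinstance(it, SourceIteration)
assert it.next_pagination == {}  # these endpoints return everything in one page
assert [r.data["name"] for r in it.records] == ["Revenue", "Churn"]
```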
bizon/connectors/sources/periscope/tests/periscope_pipeline_dashboard.py

```diff
@@ -0,0 +1,9 @@
+import os
+
+from bizon.cli.utils import parse_from_yaml
+from bizon.engine.engine import RunnerFactory
+
+config = parse_from_yaml(os.path.abspath("bizon/connectors/sources/periscope/config/periscope_dashboards.yml"))
+
+runner = RunnerFactory.create_from_config_dict(config=config)
+runner.run()
```
bizon/connectors/sources/pokeapi/config/pokeapi_pokemon_to_json.example.yml

```diff
@@ -0,0 +1,19 @@
+name: pokemon to json unnested
+
+source:
+  name: pokeapi
+  stream: pokemon
+
+destination:
+  name: file
+  config:
+    filepath: pokemon.json
+    buffer_flush_timeout: 2
+
+    unnest: true
+
+    record_schema:
+      - name: name
+        type: string
+      - name: url
+        type: string
```
bizon/connectors/sources/pokeapi/src/source.py

```diff
@@ -0,0 +1,79 @@
+from enum import Enum
+from typing import Any, List, Tuple
+
+from requests.auth import AuthBase
+
+from bizon.source.config import SourceConfig
+from bizon.source.models import SourceIteration, SourceRecord
+from bizon.source.source import AbstractSource
+
+BASE_URL = "https://pokeapi.co/api/v2"
+
+
+# Define the streams that the source supports
+class PokeAPIStreams(str, Enum):
+    POKEMON = "pokemon"
+    BERRY = "berry"
+    ITEM = "item"
+
+
+# Define the config class for the source
+class PokeAPISourceConfig(SourceConfig):
+    stream: PokeAPIStreams
+
+
+class PeriscopeSource(AbstractSource):
+
+    def __init__(self, config: PokeAPISourceConfig):
+        super().__init__(config)
+        self.config: PokeAPISourceConfig = config
+
+    @property
+    def url_entity(self) -> str:
+        return f"{BASE_URL}/{self.config.stream}"
+
+    @staticmethod
+    def streams() -> List[str]:
+        return [item.value for item in PokeAPIStreams]
+
+    @staticmethod
+    def get_config_class() -> AbstractSource:
+        return PokeAPISourceConfig
+
+    def check_connection(self) -> Tuple[bool | Any | None]:
+        # Make a request to the base URL to check if the connection is successful
+        _ = self.session.get(self.url_entity)
+        return True, None
+
+    def get_authenticator(self) -> AuthBase:
+        # We return None because we don't need any authentication
+        return None
+
+    def get_total_records_count(self) -> int | None:
+        # Return the total number of records in the stream
+        response = self.session.get(self.url_entity)
+        return response.json().get("count")
+
+    def get_entity_list(self, pagination: dict = None) -> SourceIteration:
+        # If pagination is provided, use the next URL to get the next set of records
+        url = pagination.get("next") if pagination else self.url_entity
+        response = self.session.get(url)
+
+        data = response.json()
+
+        return SourceIteration(
+            next_pagination={"next": data.get("next")} if data.get("next") else {},
+            records=[
+                SourceRecord(
+                    id=record["name"],
+                    data=record,
+                )
+                for record in data["results"]
+            ],
+        )
+
+    def get(self, pagination: dict = None) -> SourceIteration:
+        if self.config.stream in [PokeAPIStreams.POKEMON, PokeAPIStreams.BERRY, PokeAPIStreams.ITEM]:
+            return self.get_entity_list(pagination)
+
+        raise NotImplementedError(f"Stream {self.config.stream} not implemented for PokeAPI source")
```
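The pagination contract here is simple: PokeAPI returns a `next` URL, the source stores it as `next_pagination`, and the engine feeds it back into `get()` until it comes back empty. A minimal standalone sketch of the same loop, using `requests` directly instead of bizon's session wrapper:

```python
import requests

BASE_URL = "https://pokeapi.co/api/v2"


def iter_names(stream: str = "pokemon"):
    """Follow PokeAPI's `next` links until the last page."""
    url = f"{BASE_URL}/{stream}"
    while url:
        data = requests.get(url, timeout=30).json()
        for record in data["results"]:
            yield record["name"]  # the source above also uses "name" as the record id
        url = data.get("next")  # None on the last page ends the loop


print(sum(1 for _ in iter_names()))  # should match get_total_records_count()
```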
bizon/{destinations → destination}/buffer.py

```diff
@@ -18,6 +18,11 @@ class DestinationBuffer:
         self.pagination = {}
         self.modified_at: List[datetime] = [datetime.now(tz=UTC)]
 
+    @property
+    def is_empty(self) -> bool:
+        """Check if buffer is empty"""
+        return self.df_destination_records.height == 0
+
     @property
     def current_size(self) -> int:
         """Return buffer size"""
```
bizon/destination/config.py

```diff
@@ -0,0 +1,83 @@
+from abc import ABC
+from enum import Enum
+from typing import Optional
+
+from pydantic import BaseModel, ConfigDict, Field, field_validator
+
+
+class DestinationTypes(str, Enum):
+    BIGQUERY = "bigquery"
+    BIGQUERY_STREAMING = "bigquery_streaming"
+    BIGQUERY_STREAMING_V2 = "bigquery_streaming_v2"
+    LOGGER = "logger"
+    FILE = "file"
+
+
+class DestinationColumn(BaseModel, ABC):
+    name: str = Field(..., description="Name of the column")
+    type: str = Field(..., description="Type of the column")
+    description: Optional[str] = Field(None, description="Description of the column")
+
+
+class RecordSchemaConfig(BaseModel):
+    # Forbid extra keys in the model
+    model_config = ConfigDict(extra="forbid")
+
+    destination_id: str = Field(..., description="Destination ID")
+    record_schema: list[DestinationColumn] = Field(..., description="Record schema")
+
+
+class AbstractDestinationDetailsConfig(BaseModel):
+
+    # Forbid extra keys in the model
+    model_config = ConfigDict(extra="forbid")
+
+    buffer_size: int = Field(
+        default=50,
+        description="Buffer size in Mb for the destination. Set to 0 to disable and write directly to the destination.",
+    )
+
+    buffer_flush_timeout: int = Field(
+        default=600,
+        description="Maximum time in seconds for buffering after which the records will be written to the destination. Set to 0 to deactivate the timeout buffer check.",  # noqa
+    )
+
+    max_concurrent_threads: int = Field(
+        default=10,
+        description="Maximum number of concurrent threads to use for writing to the destination.",
+    )
+
+    record_schemas: Optional[list[RecordSchemaConfig]] = Field(
+        default=None, description="Schemas for the records. Required if unnest is set to true."
+    )
+
+    unnest: bool = Field(
+        default=False,
+        description="Unnest the data before writing to the destination. Schema should be provided in the model_config.",
+    )
+
+    authentication: Optional[BaseModel] = Field(
+        description="Authentication configuration for the destination, if needed", default=None
+    )
+
+    destination_id: Optional[str] = Field(
+        description="Destination ID, identifier to use to store the records in the destination", default=None
+    )
+
+    @field_validator("unnest", mode="before")
+    def validate_record_schema_if_unnest(cls, value, values):
+        if bool(value) and not values.data.get("record_schemas", []):
+            raise ValueError("At least one `record_schemas` must be provided if `unnest` is set to True.")
+        return value
+
+
+class AbstractDestinationConfig(BaseModel):
+    # Forbid extra keys in the model
+    model_config = ConfigDict(extra="forbid")
+
+    name: DestinationTypes = Field(..., description="Name of the destination")
+    alias: str = Field(
+        ...,
+        description="Alias of the destination, used for tracking the system name (ie bigquery for bigquery_streaming)",
+    )
+    config: AbstractDestinationDetailsConfig = Field(..., description="Configuration for the destination")
```
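One behavior worth noting: pydantic validates fields in declaration order, so by the time the `unnest` validator runs, `values.data` already holds `record_schemas`, and enabling unnesting without a schema fails fast. A small sketch, assuming the new module is importable as `bizon.destination.config`:

```python
from pydantic import ValidationError

from bizon.destination.config import (
    AbstractDestinationDetailsConfig,
    DestinationColumn,
    RecordSchemaConfig,
)

# unnest=True with no record_schemas -> rejected by the validator
try:
    AbstractDestinationDetailsConfig(unnest=True)
except ValidationError as e:
    print(e)

# Providing at least one schema satisfies it ("raw.pokemon" is a made-up id)
config = AbstractDestinationDetailsConfig(
    unnest=True,
    record_schemas=[
        RecordSchemaConfig(
            destination_id="raw.pokemon",
            record_schema=[DestinationColumn(name="name", type="string")],
        )
    ],
)
```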
bizon/{destinations → destination}/destination.py

```diff
@@ -10,6 +10,8 @@ from pydantic import BaseModel, Field
 from bizon.common.models import SyncMetadata
 from bizon.engine.backend.backend import AbstractBackend
 from bizon.engine.backend.models import JobStatus
+from bizon.monitoring.monitor import AbstractMonitor
+from bizon.source.callback import AbstractSourceCallback
 from bizon.source.config import SourceSyncModes
 
 from .buffer import DestinationBuffer
```
```diff
@@ -43,13 +45,42 @@ class DestinationIteration(BaseModel):
 
 class AbstractDestination(ABC):
 
-    def __init__(
+    def __init__(
+        self,
+        sync_metadata: SyncMetadata,
+        config: AbstractDestinationDetailsConfig,
+        backend: AbstractBackend,
+        source_callback: AbstractSourceCallback,
+        monitor: AbstractMonitor,
+    ):
         self.sync_metadata = sync_metadata
         self.config = config
         self.backend = backend
+        self.monitor = monitor
         self.buffer = DestinationBuffer(
             buffer_size=self.config.buffer_size, buffer_flush_timeout=self.config.buffer_flush_timeout
         )
+        self.source_callback = source_callback
+        self.destination_id = config.destination_id
+
+        self._record_schemas = None
+        self._clustering_keys = None
+
+    @property
+    def record_schemas(self):
+        if self._record_schemas is None and self.config.record_schemas:
+            self._record_schemas = {
+                schema.destination_id: schema.record_schema for schema in self.config.record_schemas
+            }
+        return self._record_schemas
+
+    @property
+    def clustering_keys(self):
+        if self._clustering_keys is None and self.config.record_schemas:
+            self._clustering_keys = {
+                schema.destination_id: schema.clustering_keys for schema in self.config.record_schemas
+            }
+        return self._clustering_keys
 
     @abstractmethod
     def check_connection(self) -> bool:
```
```diff
@@ -75,7 +106,7 @@ class AbstractDestination(ABC):
         )
 
         logger.info(
-            f"Writing in destination from source iteration {self.buffer.from_iteration} to {self.buffer.to_iteration}"
+            f"Writing in destination {self.destination_id} from source iteration {self.buffer.from_iteration} to {self.buffer.to_iteration}"
         )
 
         success, error_msg = self.write_records(df_destination_records=self.buffer.df_destination_records)
```
```diff
@@ -83,7 +114,9 @@ class AbstractDestination(ABC):
         if success:
             # We wrote records to destination so we keep it
             destination_iteration.records_written = self.buffer.df_destination_records.height
-            logger.info(
+            logger.info(
+                f"Successfully wrote {destination_iteration.records_written} records to destination {self.destination_id}"
+            )
 
         else:
             # We failed to write records to destination so we keep the error message
```
```diff
@@ -112,8 +145,8 @@ class AbstractDestination(ABC):
         # Last iteration, write all records to destination
         if last_iteration:
 
-            if self.buffer.df_destination_records.height == 0 and self.
-            logger.
+            if self.buffer.df_destination_records.height == 0 and self.buffer.is_empty:
+                logger.info("No records to write to destination, already written, buffer is empty.")
                 return DestinationBufferStatus.RECORDS_WRITTEN
 
             logger.debug("Writing last iteration records to destination")
```
```diff
@@ -143,12 +176,12 @@ class AbstractDestination(ABC):
 
         # Don't write empty records to destination
         if df_destination_records.height == 0 and not last_iteration:
-            logger.
+            logger.info("No records to write to destination. Check source and queue provider.")
             return DestinationBufferStatus.NO_RECORDS
 
         # Write records to destination if buffer size is 0 or streaming
         if self.buffer.buffer_size == 0:
-            logger.info("Writing records to destination.")
+            logger.info(f"Writing records to destination {self.destination_id}.")
             self.buffer.add_source_iteration_records_to_buffer(
                 iteration=iteration, df_destination_records=df_destination_records, pagination=pagination
             )
```
```diff
@@ -161,6 +194,14 @@ class AbstractDestination(ABC):
         logger.info(
             f"Buffer ripeness {round(self.buffer.ripeness / 60, 2)} min. Max ripeness {round(self.buffer.buffer_flush_timeout / 60, 2)} min."  # noqa
         )
+        logger.info(
+            f"Current records size to process: {round(df_destination_records.estimated_size(unit='b') / 1024 / 1024, 2)} Mb."
+        )
+
+        if df_destination_records.estimated_size(unit="b") > self.buffer.buffer_size:
+            raise ValueError(
+                f"Records size {round(df_destination_records.estimated_size(unit='b') / 1024 / 1024, 2)} Mb is greater than buffer size {round(self.buffer.buffer_size / 1024 / 1024, 2)} Mb. Please increase destination buffer_size or reduce batch_size from the source."
+            )
 
         # Write buffer to destination if buffer is ripe and create a new buffer for the new iteration
         if self.buffer.is_ripe:
```
```diff
@@ -245,26 +286,73 @@ class DestinationFactory:
         sync_metadata: SyncMetadata,
         config: AbstractDestinationConfig,
         backend: AbstractBackend,
+        source_callback: AbstractSourceCallback,
+        monitor: AbstractMonitor,
     ) -> AbstractDestination:
 
         if config.name == DestinationTypes.LOGGER:
-            from .logger.src.destination import
+            from bizon.connectors.destinations.logger.src.destination import (
+                LoggerDestination,
+            )
 
-            return LoggerDestination(
+            return LoggerDestination(
+                sync_metadata=sync_metadata,
+                config=config.config,
+                backend=backend,
+                source_callback=source_callback,
+                monitor=monitor,
+            )
 
         elif config.name == DestinationTypes.BIGQUERY:
-            from .bigquery.src.destination import
+            from bizon.connectors.destinations.bigquery.src.destination import (
+                BigQueryDestination,
+            )
 
-            return BigQueryDestination(
+            return BigQueryDestination(
+                sync_metadata=sync_metadata,
+                config=config.config,
+                backend=backend,
+                source_callback=source_callback,
+                monitor=monitor,
+            )
 
         elif config.name == DestinationTypes.BIGQUERY_STREAMING:
-            from .bigquery_streaming.src.destination import
+            from bizon.connectors.destinations.bigquery_streaming.src.destination import (
+                BigQueryStreamingDestination,
+            )
+
+            return BigQueryStreamingDestination(
+                sync_metadata=sync_metadata,
+                config=config.config,
+                backend=backend,
+                source_callback=source_callback,
+                monitor=monitor,
+            )
 
-
+        elif config.name == DestinationTypes.BIGQUERY_STREAMING_V2:
+            from bizon.connectors.destinations.bigquery_streaming_v2.src.destination import (
+                BigQueryStreamingV2Destination,
+            )
+
+            return BigQueryStreamingV2Destination(
+                sync_metadata=sync_metadata,
+                config=config.config,
+                backend=backend,
+                source_callback=source_callback,
+                monitor=monitor,
+            )
 
         elif config.name == DestinationTypes.FILE:
-            from .file.src.destination import
+            from bizon.connectors.destinations.file.src.destination import (
+                FileDestination,
+            )
 
-            return FileDestination(
+            return FileDestination(
+                sync_metadata=sync_metadata,
+                config=config.config,
+                backend=backend,
+                source_callback=source_callback,
+                monitor=monitor,
+            )
 
         raise ValueError(f"Destination {config.name}" f"with params {config} not found")
```
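Each branch imports its destination module lazily, so optional heavy dependencies (the BigQuery client, for instance) are only pulled in when that destination is actually configured. A generic sketch of the same dispatch with a registry, module paths taken from the branches above:

```python
from importlib import import_module

# Hypothetical registry mirroring the factory branches
_DESTINATIONS = {
    "logger": ("bizon.connectors.destinations.logger.src.destination", "LoggerDestination"),
    "bigquery": ("bizon.connectors.destinations.bigquery.src.destination", "BigQueryDestination"),
    "file": ("bizon.connectors.destinations.file.src.destination", "FileDestination"),
}


def load_destination_class(name: str):
    module_path, class_name = _DESTINATIONS[name]
    # import happens here, at dispatch time, not at package import time
    return getattr(import_module(module_path), class_name)
```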
bizon/engine/backend/adapters/sqlalchemy/backend.py

```diff
@@ -173,7 +173,9 @@ class SQLAlchemyBackend(AbstractBackend):
 
     def _execute(self, select: Select, session: Optional[Session] = None) -> Result:
         session = session or self.session
-
+        result = session.execute(select)
+        session.commit()
+        return result
 
     #### STREAM JOB ####
 
```
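The rewritten `_execute` now commits and returns the `Result`; committing after each read keeps the session from holding a transaction open between calls. A standalone sketch of the execute-then-commit pattern (in-memory SQLite and table names are illustrative):

```python
from sqlalchemy import create_engine, text
from sqlalchemy.orm import Session

engine = create_engine("sqlite:///:memory:")

with Session(engine) as session:
    session.execute(text("CREATE TABLE jobs (id INTEGER PRIMARY KEY, status TEXT)"))
    session.execute(text("INSERT INTO jobs (status) VALUES ('running')"))

    result = session.execute(text("SELECT id, status FROM jobs"))
    rows = result.all()  # consume before commit: commit releases the cursor
    session.commit()     # closes the transaction so the next read starts fresh

print(rows)  # [(1, 'running')]
```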