bizon 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110)
  1. bizon/alerting/__init__.py +0 -0
  2. bizon/alerting/alerts.py +23 -0
  3. bizon/alerting/models.py +28 -0
  4. bizon/alerting/slack/__init__.py +0 -0
  5. bizon/alerting/slack/config.py +5 -0
  6. bizon/alerting/slack/handler.py +39 -0
  7. bizon/cli/main.py +7 -3
  8. bizon/common/models.py +33 -7
  9. bizon/{destinations → connectors/destinations}/bigquery/config/bigquery.example.yml +3 -4
  10. bizon/connectors/destinations/bigquery/src/config.py +128 -0
  11. bizon/{destinations → connectors/destinations}/bigquery/src/destination.py +48 -25
  12. bizon/connectors/destinations/bigquery_streaming/src/config.py +57 -0
  13. bizon/connectors/destinations/bigquery_streaming/src/destination.py +377 -0
  14. bizon/connectors/destinations/bigquery_streaming_v2/src/config.py +57 -0
  15. bizon/connectors/destinations/bigquery_streaming_v2/src/destination.py +446 -0
  16. bizon/{destinations/bigquery_streaming → connectors/destinations/bigquery_streaming_v2}/src/proto_utils.py +30 -36
  17. bizon/{destinations → connectors/destinations}/file/src/config.py +9 -3
  18. bizon/connectors/destinations/file/src/destination.py +56 -0
  19. bizon/{destinations → connectors/destinations}/logger/src/config.py +2 -1
  20. bizon/{destinations → connectors/destinations}/logger/src/destination.py +18 -3
  21. bizon/connectors/sources/cycle/config/cycle.example.yml +15 -0
  22. bizon/connectors/sources/cycle/src/source.py +133 -0
  23. bizon/{sources/periscope/tests/periscope_pipeline_dashboard.py → connectors/sources/cycle/tests/cycle_customers.py} +1 -1
  24. bizon/{sources → connectors/sources}/dummy/config/dummy.example.yml +2 -2
  25. bizon/{sources → connectors/sources}/dummy/src/fake_api.py +6 -1
  26. bizon/{sources → connectors/sources}/dummy/src/source.py +18 -5
  27. bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline.py +2 -2
  28. bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_bigquery_backend.py +2 -2
  29. bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_kafka.py +2 -2
  30. bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_rabbitmq.py +2 -2
  31. bizon/connectors/sources/dummy/tests/dummy_pipeline_unnest.py +29 -0
  32. bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_write_data_bigquery.py +2 -2
  33. bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_write_data_bigquery_through_kafka.py +2 -2
  34. bizon/{sources → connectors/sources}/gsheets/config/default_auth.example.yml +4 -2
  35. bizon/{sources → connectors/sources}/gsheets/config/service_account.example.yml +4 -2
  36. bizon/{sources → connectors/sources}/hubspot/config/api_key.example.yml +4 -2
  37. bizon/{sources → connectors/sources}/hubspot/config/oauth.example.yml +4 -2
  38. bizon/{sources → connectors/sources}/hubspot/src/hubspot_objects.py +1 -1
  39. bizon/{sources → connectors/sources}/kafka/config/kafka.example.yml +3 -5
  40. bizon/connectors/sources/kafka/config/kafka_debezium.example.yml +110 -0
  41. bizon/connectors/sources/kafka/src/callback.py +18 -0
  42. bizon/connectors/sources/kafka/src/config.py +69 -0
  43. bizon/connectors/sources/kafka/src/decode.py +93 -0
  44. bizon/connectors/sources/kafka/src/source.py +381 -0
  45. bizon/connectors/sources/kafka/tests/kafka_pipeline.py +7 -0
  46. bizon/connectors/sources/periscope/config/periscope_charts.example.yml +20 -0
  47. bizon/connectors/sources/periscope/config/periscope_dashboards.example.yml +20 -0
  48. bizon/{sources → connectors/sources}/periscope/src/source.py +137 -13
  49. bizon/connectors/sources/periscope/tests/periscope_pipeline_dashboard.py +9 -0
  50. bizon/connectors/sources/pokeapi/config/pokeapi_pokemon_to_json.example.yml +19 -0
  51. bizon/connectors/sources/pokeapi/config/pokeapi_pokemon_to_logger.example.yml +10 -0
  52. bizon/connectors/sources/pokeapi/src/source.py +79 -0
  53. bizon/{destinations → destination}/buffer.py +5 -0
  54. bizon/destination/config.py +83 -0
  55. bizon/{destinations → destination}/destination.py +103 -15
  56. bizon/engine/backend/adapters/sqlalchemy/backend.py +3 -1
  57. bizon/engine/engine.py +20 -1
  58. bizon/engine/pipeline/consumer.py +73 -5
  59. bizon/engine/pipeline/models.py +8 -3
  60. bizon/engine/pipeline/producer.py +18 -9
  61. bizon/engine/queue/adapters/kafka/consumer.py +2 -2
  62. bizon/engine/queue/adapters/kafka/queue.py +3 -2
  63. bizon/engine/queue/adapters/python_queue/consumer.py +40 -23
  64. bizon/engine/queue/adapters/python_queue/queue.py +19 -9
  65. bizon/engine/queue/adapters/rabbitmq/consumer.py +3 -6
  66. bizon/engine/queue/adapters/rabbitmq/queue.py +3 -2
  67. bizon/engine/queue/config.py +16 -0
  68. bizon/engine/queue/queue.py +17 -16
  69. bizon/engine/runner/adapters/process.py +15 -2
  70. bizon/engine/runner/adapters/streaming.py +121 -0
  71. bizon/engine/runner/adapters/thread.py +32 -9
  72. bizon/engine/runner/config.py +28 -0
  73. bizon/engine/runner/runner.py +113 -24
  74. bizon/monitoring/__init__.py +0 -0
  75. bizon/monitoring/config.py +39 -0
  76. bizon/monitoring/datadog/__init__.py +0 -0
  77. bizon/monitoring/datadog/monitor.py +153 -0
  78. bizon/monitoring/monitor.py +71 -0
  79. bizon/monitoring/noop/__init__.py +0 -0
  80. bizon/monitoring/noop/monitor.py +30 -0
  81. bizon/source/callback.py +24 -0
  82. bizon/source/config.py +3 -3
  83. bizon/source/cursor.py +1 -1
  84. bizon/source/discover.py +4 -3
  85. bizon/source/models.py +4 -2
  86. bizon/source/source.py +10 -2
  87. bizon/transform/config.py +8 -0
  88. bizon/transform/transform.py +48 -0
  89. {bizon-0.1.0.dist-info → bizon-0.1.2.dist-info}/METADATA +23 -6
  90. bizon-0.1.2.dist-info/RECORD +123 -0
  91. {bizon-0.1.0.dist-info → bizon-0.1.2.dist-info}/WHEEL +1 -1
  92. bizon/destinations/bigquery/src/config.py +0 -51
  93. bizon/destinations/bigquery_streaming/src/config.py +0 -43
  94. bizon/destinations/bigquery_streaming/src/destination.py +0 -154
  95. bizon/destinations/config.py +0 -47
  96. bizon/destinations/file/src/destination.py +0 -27
  97. bizon/sources/kafka/src/source.py +0 -357
  98. bizon/sources/kafka/tests/kafka_pipeline.py +0 -9
  99. bizon/sources/periscope/config/periscope_charts.example.yml +0 -26
  100. bizon/sources/periscope/config/periscope_dashboards.example.yml +0 -26
  101. bizon-0.1.0.dist-info/RECORD +0 -93
  102. /bizon/{sources → connectors/sources}/gsheets/src/source.py +0 -0
  103. /bizon/{sources → connectors/sources}/gsheets/tests/gsheets_pipeline.py +0 -0
  104. /bizon/{sources → connectors/sources}/hubspot/src/hubspot_base.py +0 -0
  105. /bizon/{sources → connectors/sources}/hubspot/src/models/hs_object.py +0 -0
  106. /bizon/{sources → connectors/sources}/hubspot/tests/hubspot_pipeline.py +0 -0
  107. /bizon/{sources → connectors/sources}/periscope/tests/periscope_pipeline_charts.py +0 -0
  108. /bizon/{destinations → destination}/models.py +0 -0
  109. {bizon-0.1.0.dist-info → bizon-0.1.2.dist-info}/LICENSE +0 -0
  110. {bizon-0.1.0.dist-info → bizon-0.1.2.dist-info}/entry_points.txt +0 -0
bizon/{sources → connectors/sources}/periscope/src/source.py

@@ -15,8 +15,6 @@ from bizon.source.source import AbstractSource
 
 BASE_URL = "https://app.periscopedata.com"
 
-URL_OWNERS = f"{BASE_URL}/users/owners"
-URL_VIEWS = f"{BASE_URL}/login_state/sql_views"
 URL_DATABASES = f"{BASE_URL}/welcome/remaining_state/site_models"
 
 
@@ -39,6 +37,7 @@ class PeriscopeSourceConfig(SourceConfig):
     workspace_name: str = Field(..., description="Name of the workspace")
     client_site_id: int = Field(..., description="Client site ID")
     database_id: int = Field(..., description="Fetch charts connected to this Database ID")
+    x_csrf_token: str = Field(..., description="CSRF token for the requests")
 
 
 class PeriscopeSource(AbstractSource):
@@ -49,7 +48,14 @@ class PeriscopeSource(AbstractSource):
 
     @staticmethod
     def streams() -> List[str]:
-        return ["dashboards", "charts", "users", "databases", "views"]
+        return [
+            "charts",
+            "dashboards_metadata",
+            "dashboards",
+            "databases",
+            "users",
+            "views",
+        ]
 
     @staticmethod
     def get_config_class() -> AbstractSource:
@@ -86,8 +92,9 @@ class PeriscopeSource(AbstractSource):
                    "sec-fetch-dest": "empty",
                    "sec-fetch-mode": "cors",
                    "sec-fetch-site": "same-origin",
-                    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",  # noqa
+                    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
                    "x-requested-with": "XMLHttpRequest",
+                    "x-csrf-token": self.config.x_csrf_token,
                },
            )
        )
@@ -97,10 +104,8 @@ class PeriscopeSource(AbstractSource):
     def get_total_records_count(self) -> int | None:
         return None
 
-    def get_dashboards(self, pagination: dict = None) -> SourceIteration:
-        response = self.session.call(method="GET", url=f"{BASE_URL}/login_state/dashboards", params=self.http_params)
-        records_json = response.json()["Dashboard"]
-
+    @staticmethod
+    def transform_response_to_source_iteration(records: List[dict]) -> SourceIteration:
         return SourceIteration(
             next_pagination=dict(),
             records=[
@@ -108,14 +113,69 @@ class PeriscopeSource(AbstractSource):
                     id=record["id"],
                     data=record,
                 )
-                for record in records_json
+                for record in records
             ],
         )
 
+    def get_dashboards(self, pagination: dict = None) -> SourceIteration:
+        response = self.session.call(
+            method="GET",
+            url=f"{BASE_URL}/login_state/dashboards",
+            params=self.http_params,
+        )
+        records_json = response.json()["Dashboard"]
+        return self.transform_response_to_source_iteration(records_json)
+
+    def get_dashboards_metadata(self, pagination: dict = None) -> SourceIteration:
+
+        params = {
+            "client_site_id": self.config.client_site_id,
+            "filters": [{"name": "typeFilter", "input": "Dashboard"}],
+            "limit": 2000,
+            "query_plan": None,
+        }
+
+        response = self.session.call(
+            method="POST",
+            url=f"{BASE_URL}/global_search/search",
+            json=params,
+        )
+        records_json = response.json()["results"]["data"]
+        return self.transform_response_to_source_iteration(records_json)
+
     def get_dashboard_ids(self) -> List[int]:
         source_iteration = self.get_dashboards()
         return [record.id for record in source_iteration.records]
 
+    def _extract_raw_text_from_textbox(self, data: dict) -> str:
+        raw_text = []
+
+        def clean_text(text: str):
+            """Strip Byte Order Mark (BOM) and other unwanted whitespace."""
+            return text.replace("\ufeff", "").strip()
+
+        def traverse_nodes(nodes):
+            for node in nodes:
+                if node["object"] == "text":
+                    for leaf in node["leaves"]:
+                        raw_text.append(clean_text(leaf["text"]))
+                elif node["type"] == "link" and "data" in node and "url" in node["data"]:
+                    link_text = []
+                    for leaf in node["nodes"][0]["leaves"]:  # Assume a single text node in link
+                        link_text.append(clean_text(leaf["text"]))
+                    # Format as Markdown link
+                    raw_text.append(f"[{''.join(link_text)}]({node['data']['url']})")
+                elif "nodes" in node:  # If there are nested nodes
+                    traverse_nodes(node["nodes"])
+
+        if not data["text_data"]:
+            return ""
+
+        # Start traversal from the root nodes
+        traverse_nodes(data["text_data"]["document"]["nodes"])
+
+        return " ".join(raw_text)
+
     def _get_charts(self, dashboard_id: int) -> List[dict]:
         MAXIMUM_ITERATION: int = 1000
         iter_count: int = 0
@@ -154,12 +214,37 @@ class PeriscopeSource(AbstractSource):
                 iter_count += 1
                 iter_charts = response.json().get("Widget")
 
+                iter_textboxes = response.json().get("TextBox")
+
                 for chart in iter_charts:
+                    # Only fetch charts connected to gorgias-growth-production
                     if str(chart.get("database_id")) == str(self.config.database_id):
                         if chart.get("id") not in charts_list:
+
                             charts_list.add(chart.get("id"))
+
+                            chart["raw_text"] = None
+
+                            # In case the chart is a textbox, we parse the raw text
+                            if chart.get("content_id"):
+                                text_box = list(
+                                    filter(
+                                        lambda x: x.get("id") == chart.get("content_id"),
+                                        iter_textboxes,
+                                    )
+                                )
+
+                                if not text_box:
+                                    logger.error(
+                                        f"Failed to fetch the textbox with id: {chart.get('content_id')} for chart with id: {chart.get('id')}"
+                                    )
+
+                                if text_box:
+                                    chart["raw_text"] = self._extract_raw_text_from_textbox(text_box[0])
+
                             dashboard_charts.append(chart)
-            except Exception:
+            except Exception as e:
+                logger.error(f"Failed to fetch the dashboard with id: {dashboard_id} with error: {e}")
                 continue
 
         return dashboard_charts
@@ -200,11 +285,50 @@ class PeriscopeSource(AbstractSource):
             ],
         )
 
+    def get_views(self, pagination: dict = None) -> SourceIteration:
+        response = self.session.call(
+            method="GET",
+            url=f"{BASE_URL}/login_state/sql_views",
+            params=self.http_params,
+        )
+        records_json = response.json()["SqlView"]
+        return self.transform_response_to_source_iteration(records_json)
+
+    def get_users(self, pagination: dict = None) -> SourceIteration:
+        response = self.session.call(
+            method="GET",
+            url=f"{BASE_URL}/users/owners",
+            params=self.http_params,
+        )
+        records_json = response.json()
+        return self.transform_response_to_source_iteration(records_json)
+
+    def get_databases(self, pagination: dict = None) -> SourceIteration:
+        response = self.session.call(
+            method="GET",
+            url=URL_DATABASES,
+            params=self.http_params,
+        )
+        records_json = response.json()["Database"]
+        return self.transform_response_to_source_iteration(records_json)
+
     def get(self, pagination: dict = None) -> SourceIteration:
-        if self.config.stream_name == "dashboards":
+        if self.config.stream == "dashboards":
             return self.get_dashboards(pagination)
 
-        if self.config.stream_name == "charts":
+        elif self.config.stream == "charts":
             return self.get_charts(pagination)
 
-        raise NotImplementedError(f"Stream {self.config.stream_name} not implemented for Periscope")
+        elif self.config.stream == "dashboards_metadata":
+            return self.get_dashboards_metadata(pagination)
+
+        elif self.config.stream == "views":
+            return self.get_views(pagination)
+
+        elif self.config.stream == "users":
+            return self.get_users(pagination)
+
+        elif self.config.stream == "databases":
+            return self.get_databases(pagination)
+
+        raise NotImplementedError(f"Stream {self.config.stream} not implemented for Periscope")
bizon/connectors/sources/periscope/tests/periscope_pipeline_dashboard.py

@@ -0,0 +1,9 @@
+import os
+
+from bizon.cli.utils import parse_from_yaml
+from bizon.engine.engine import RunnerFactory
+
+config = parse_from_yaml(os.path.abspath("bizon/connectors/sources/periscope/config/periscope_dashboards.yml"))
+
+runner = RunnerFactory.create_from_config_dict(config=config)
+runner.run()
bizon/connectors/sources/pokeapi/config/pokeapi_pokemon_to_json.example.yml

@@ -0,0 +1,19 @@
+name: pokemon to json unnested
+
+source:
+  name: pokeapi
+  stream: pokemon
+
+destination:
+  name: file
+  config:
+    filepath: pokemon.json
+    buffer_flush_timeout: 2
+
+    unnest: true
+
+    record_schema:
+      - name: name
+        type: string
+      - name: url
+        type: string
bizon/connectors/sources/pokeapi/config/pokeapi_pokemon_to_logger.example.yml

@@ -0,0 +1,10 @@
+name: pokemon to logger
+
+source:
+  name: pokeapi
+  stream: pokemon
+
+destination:
+  name: logger
+  config:
+    buffer_flush_timeout: 2
bizon/connectors/sources/pokeapi/src/source.py

@@ -0,0 +1,79 @@
+from enum import Enum
+from typing import Any, List, Tuple
+
+from requests.auth import AuthBase
+
+from bizon.source.config import SourceConfig
+from bizon.source.models import SourceIteration, SourceRecord
+from bizon.source.source import AbstractSource
+
+BASE_URL = "https://pokeapi.co/api/v2"
+
+
+# Define the streams that the source supports
+class PokeAPIStreams(str, Enum):
+    POKEMON = "pokemon"
+    BERRY = "berry"
+    ITEM = "item"
+
+
+# Define the config class for the source
+class PokeAPISourceConfig(SourceConfig):
+    stream: PokeAPIStreams
+
+
+class PeriscopeSource(AbstractSource):
+
+    def __init__(self, config: PokeAPISourceConfig):
+        super().__init__(config)
+        self.config: PokeAPISourceConfig = config
+
+    @property
+    def url_entity(self) -> str:
+        return f"{BASE_URL}/{self.config.stream}"
+
+    @staticmethod
+    def streams() -> List[str]:
+        return [item.value for item in PokeAPIStreams]
+
+    @staticmethod
+    def get_config_class() -> AbstractSource:
+        return PokeAPISourceConfig
+
+    def check_connection(self) -> Tuple[bool | Any | None]:
+        # Make a request to the base URL to check if the connection is successful
+        _ = self.session.get(self.url_entity)
+        return True, None
+
+    def get_authenticator(self) -> AuthBase:
+        # We return None because we don't need any authentication
+        return None
+
+    def get_total_records_count(self) -> int | None:
+        # Return the total number of records in the stream
+        response = self.session.get(self.url_entity)
+        return response.json().get("count")
+
+    def get_entity_list(self, pagination: dict = None) -> SourceIteration:
+        # If pagination is provided, use the next URL to get the next set of records
+        url = pagination.get("next") if pagination else self.url_entity
+        response = self.session.get(url)
+
+        data = response.json()
+
+        return SourceIteration(
+            next_pagination={"next": data.get("next")} if data.get("next") else {},
+            records=[
+                SourceRecord(
+                    id=record["name"],
+                    data=record,
+                )
+                for record in data["results"]
+            ],
+        )
+
+    def get(self, pagination: dict = None) -> SourceIteration:
+        if self.config.stream in [PokeAPIStreams.POKEMON, PokeAPIStreams.BERRY, PokeAPIStreams.ITEM]:
+            return self.get_entity_list(pagination)
+
+        raise NotImplementedError(f"Stream {self.config.stream} not implemented for PokeAPI source")
bizon/{destinations → destination}/buffer.py

@@ -18,6 +18,11 @@ class DestinationBuffer:
         self.pagination = {}
         self.modified_at: List[datetime] = [datetime.now(tz=UTC)]
 
+    @property
+    def is_empty(self) -> bool:
+        """Check if buffer is empty"""
+        return self.df_destination_records.height == 0
+
     @property
     def current_size(self) -> int:
         """Return buffer size"""
bizon/destination/config.py

@@ -0,0 +1,83 @@
+from abc import ABC
+from enum import Enum
+from typing import Optional
+
+from pydantic import BaseModel, ConfigDict, Field, field_validator
+
+
+class DestinationTypes(str, Enum):
+    BIGQUERY = "bigquery"
+    BIGQUERY_STREAMING = "bigquery_streaming"
+    BIGQUERY_STREAMING_V2 = "bigquery_streaming_v2"
+    LOGGER = "logger"
+    FILE = "file"
+
+
+class DestinationColumn(BaseModel, ABC):
+    name: str = Field(..., description="Name of the column")
+    type: str = Field(..., description="Type of the column")
+    description: Optional[str] = Field(None, description="Description of the column")
+
+
+class RecordSchemaConfig(BaseModel):
+    # Forbid extra keys in the model
+    model_config = ConfigDict(extra="forbid")
+
+    destination_id: str = Field(..., description="Destination ID")
+    record_schema: list[DestinationColumn] = Field(..., description="Record schema")
+
+
+class AbstractDestinationDetailsConfig(BaseModel):
+
+    # Forbid extra keys in the model
+    model_config = ConfigDict(extra="forbid")
+
+    buffer_size: int = Field(
+        default=50,
+        description="Buffer size in Mb for the destination. Set to 0 to disable and write directly to the destination.",
+    )
+
+    buffer_flush_timeout: int = Field(
+        default=600,
+        description="Maximum time in seconds for buffering after which the records will be written to the destination. Set to 0 to deactivate the timeout buffer check.",  # noqa
+    )
+
+    max_concurrent_threads: int = Field(
+        default=10,
+        description="Maximum number of concurrent threads to use for writing to the destination.",
+    )
+
+    record_schemas: Optional[list[RecordSchemaConfig]] = Field(
+        default=None, description="Schemas for the records. Required if unnest is set to true."
+    )
+
+    unnest: bool = Field(
+        default=False,
+        description="Unnest the data before writing to the destination. Schema should be provided in the model_config.",
+    )
+
+    authentication: Optional[BaseModel] = Field(
+        description="Authentication configuration for the destination, if needed", default=None
+    )
+
+    destination_id: Optional[str] = Field(
+        description="Destination ID, identifier to use to store the records in the destination", default=None
+    )
+
+    @field_validator("unnest", mode="before")
+    def validate_record_schema_if_unnest(cls, value, values):
+        if bool(value) and not values.data.get("record_schemas", []):
+            raise ValueError("At least one `record_schemas` must be provided if `unnest` is set to True.")
+        return value
+
+
+class AbstractDestinationConfig(BaseModel):
+    # Forbid extra keys in the model
+    model_config = ConfigDict(extra="forbid")
+
+    name: DestinationTypes = Field(..., description="Name of the destination")
+    alias: str = Field(
+        ...,
+        description="Alias of the destination, used for tracking the system name (ie bigquery for bigquery_streaming)",
+    )
+    config: AbstractDestinationDetailsConfig = Field(..., description="Configuration for the destination")
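As a usage note on this new config model: the validate_record_schema_if_unnest validator ties `unnest` to `record_schemas`, so enabling unnesting without a schema is rejected when the config is parsed. A minimal sketch, assuming the module is importable as bizon.destination.config (per the new file path) and using a hypothetical destination_id:

from pydantic import ValidationError

from bizon.destination.config import (
    AbstractDestinationDetailsConfig,
    DestinationColumn,
    RecordSchemaConfig,
)

# Rejected: unnest=True without any record schema raises a validation error
try:
    AbstractDestinationDetailsConfig(unnest=True)
except ValidationError as exc:
    print(exc)

# Accepted: at least one record schema is provided for the destination
details = AbstractDestinationDetailsConfig(
    unnest=True,
    record_schemas=[
        RecordSchemaConfig(
            destination_id="my_dataset.my_table",  # hypothetical identifier
            record_schema=[
                DestinationColumn(name="name", type="string"),
                DestinationColumn(name="url", type="string"),
            ],
        )
    ],
)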
bizon/{destinations → destination}/destination.py

@@ -10,6 +10,8 @@ from pydantic import BaseModel, Field
 from bizon.common.models import SyncMetadata
 from bizon.engine.backend.backend import AbstractBackend
 from bizon.engine.backend.models import JobStatus
+from bizon.monitoring.monitor import AbstractMonitor
+from bizon.source.callback import AbstractSourceCallback
 from bizon.source.config import SourceSyncModes
 
 from .buffer import DestinationBuffer
@@ -43,13 +45,42 @@ class DestinationIteration(BaseModel):
 
 class AbstractDestination(ABC):
 
-    def __init__(self, sync_metadata: SyncMetadata, config: AbstractDestinationDetailsConfig, backend: AbstractBackend):
+    def __init__(
+        self,
+        sync_metadata: SyncMetadata,
+        config: AbstractDestinationDetailsConfig,
+        backend: AbstractBackend,
+        source_callback: AbstractSourceCallback,
+        monitor: AbstractMonitor,
+    ):
         self.sync_metadata = sync_metadata
         self.config = config
         self.backend = backend
+        self.monitor = monitor
         self.buffer = DestinationBuffer(
             buffer_size=self.config.buffer_size, buffer_flush_timeout=self.config.buffer_flush_timeout
         )
+        self.source_callback = source_callback
+        self.destination_id = config.destination_id
+
+        self._record_schemas = None
+        self._clustering_keys = None
+
+    @property
+    def record_schemas(self):
+        if self._record_schemas is None and self.config.record_schemas:
+            self._record_schemas = {
+                schema.destination_id: schema.record_schema for schema in self.config.record_schemas
+            }
+        return self._record_schemas
+
+    @property
+    def clustering_keys(self):
+        if self._clustering_keys is None and self.config.record_schemas:
+            self._clustering_keys = {
+                schema.destination_id: schema.clustering_keys for schema in self.config.record_schemas
+            }
+        return self._clustering_keys
 
     @abstractmethod
     def check_connection(self) -> bool:
@@ -75,7 +106,7 @@ class AbstractDestination(ABC):
         )
 
         logger.info(
-            f"Writing in destination from source iteration {self.buffer.from_iteration} to {self.buffer.to_iteration}"
+            f"Writing in destination {self.destination_id} from source iteration {self.buffer.from_iteration} to {self.buffer.to_iteration}"
         )
 
         success, error_msg = self.write_records(df_destination_records=self.buffer.df_destination_records)
@@ -83,7 +114,9 @@ class AbstractDestination(ABC):
         if success:
             # We wrote records to destination so we keep it
             destination_iteration.records_written = self.buffer.df_destination_records.height
-            logger.info(f"Successfully wrote {destination_iteration.records_written} records to destination")
+            logger.info(
+                f"Successfully wrote {destination_iteration.records_written} records to destination {self.destination_id}"
+            )
 
         else:
             # We failed to write records to destination so we keep the error message
@@ -112,8 +145,8 @@ class AbstractDestination(ABC):
         # Last iteration, write all records to destination
         if last_iteration:
 
-            if self.buffer.df_destination_records.height == 0 and self.config.buffer_size == 0:
-                logger.warning("No records to write to destination, already written, buffer is empty.")
+            if self.buffer.df_destination_records.height == 0 and self.buffer.is_empty:
+                logger.info("No records to write to destination, already written, buffer is empty.")
                 return DestinationBufferStatus.RECORDS_WRITTEN
 
            logger.debug("Writing last iteration records to destination")
@@ -143,12 +176,12 @@ class AbstractDestination(ABC):
 
         # Don't write empty records to destination
         if df_destination_records.height == 0 and not last_iteration:
-            logger.warning("No records to write to destination. Check source and queue provider.")
+            logger.info("No records to write to destination. Check source and queue provider.")
            return DestinationBufferStatus.NO_RECORDS
 
         # Write records to destination if buffer size is 0 or streaming
         if self.buffer.buffer_size == 0:
-            logger.info("Writing records to destination.")
+            logger.info(f"Writing records to destination {self.destination_id}.")
             self.buffer.add_source_iteration_records_to_buffer(
                 iteration=iteration, df_destination_records=df_destination_records, pagination=pagination
             )
@@ -161,6 +194,14 @@ class AbstractDestination(ABC):
         logger.info(
             f"Buffer ripeness {round(self.buffer.ripeness / 60, 2)} min. Max ripeness {round(self.buffer.buffer_flush_timeout / 60, 2)} min."  # noqa
         )
+        logger.info(
+            f"Current records size to process: {round(df_destination_records.estimated_size(unit='b') / 1024 / 1024, 2)} Mb."
+        )
+
+        if df_destination_records.estimated_size(unit="b") > self.buffer.buffer_size:
+            raise ValueError(
+                f"Records size {round(df_destination_records.estimated_size(unit='b') / 1024 / 1024, 2)} Mb is greater than buffer size {round(self.buffer.buffer_size / 1024 / 1024, 2)} Mb. Please increase destination buffer_size or reduce batch_size from the source."
+            )
 
         # Write buffer to destination if buffer is ripe and create a new buffer for the new iteration
         if self.buffer.is_ripe:
@@ -245,26 +286,73 @@ class DestinationFactory:
         sync_metadata: SyncMetadata,
         config: AbstractDestinationConfig,
         backend: AbstractBackend,
+        source_callback: AbstractSourceCallback,
+        monitor: AbstractMonitor,
     ) -> AbstractDestination:
 
         if config.name == DestinationTypes.LOGGER:
-            from .logger.src.destination import LoggerDestination
+            from bizon.connectors.destinations.logger.src.destination import (
+                LoggerDestination,
+            )
 
-            return LoggerDestination(sync_metadata=sync_metadata, config=config.config, backend=backend)
+            return LoggerDestination(
+                sync_metadata=sync_metadata,
+                config=config.config,
+                backend=backend,
+                source_callback=source_callback,
+                monitor=monitor,
+            )
 
         elif config.name == DestinationTypes.BIGQUERY:
-            from .bigquery.src.destination import BigQueryDestination
+            from bizon.connectors.destinations.bigquery.src.destination import (
+                BigQueryDestination,
+            )
 
-            return BigQueryDestination(sync_metadata=sync_metadata, config=config.config, backend=backend)
+            return BigQueryDestination(
+                sync_metadata=sync_metadata,
+                config=config.config,
+                backend=backend,
+                source_callback=source_callback,
+                monitor=monitor,
+            )
 
         elif config.name == DestinationTypes.BIGQUERY_STREAMING:
-            from .bigquery_streaming.src.destination import BigQueryStreamingDestination
+            from bizon.connectors.destinations.bigquery_streaming.src.destination import (
+                BigQueryStreamingDestination,
+            )
+
+            return BigQueryStreamingDestination(
+                sync_metadata=sync_metadata,
+                config=config.config,
+                backend=backend,
+                source_callback=source_callback,
+                monitor=monitor,
+            )
 
-            return BigQueryStreamingDestination(sync_metadata=sync_metadata, config=config.config, backend=backend)
+        elif config.name == DestinationTypes.BIGQUERY_STREAMING_V2:
+            from bizon.connectors.destinations.bigquery_streaming_v2.src.destination import (
+                BigQueryStreamingV2Destination,
+            )
+
+            return BigQueryStreamingV2Destination(
+                sync_metadata=sync_metadata,
+                config=config.config,
+                backend=backend,
+                source_callback=source_callback,
+                monitor=monitor,
+            )
 
         elif config.name == DestinationTypes.FILE:
-            from .file.src.destination import FileDestination
+            from bizon.connectors.destinations.file.src.destination import (
+                FileDestination,
+            )
 
-            return FileDestination(sync_metadata=sync_metadata, config=config.config, backend=backend)
+            return FileDestination(
+                sync_metadata=sync_metadata,
+                config=config.config,
+                backend=backend,
+                source_callback=source_callback,
+                monitor=monitor,
+            )
 
         raise ValueError(f"Destination {config.name}" f"with params {config} not found")
bizon/engine/backend/adapters/sqlalchemy/backend.py

@@ -173,7 +173,9 @@ class SQLAlchemyBackend(AbstractBackend):
 
     def _execute(self, select: Select, session: Optional[Session] = None) -> Result:
         session = session or self.session
-        return session.execute(select)
+        result = session.execute(select)
+        session.commit()
+        return result
 
     #### STREAM JOB ####
 