bizon 0.0.14__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (114)
  1. bizon/alerting/__init__.py +0 -0
  2. bizon/alerting/alerts.py +23 -0
  3. bizon/alerting/models.py +28 -0
  4. bizon/alerting/slack/__init__.py +0 -0
  5. bizon/alerting/slack/config.py +5 -0
  6. bizon/alerting/slack/handler.py +39 -0
  7. bizon/cli/main.py +7 -3
  8. bizon/common/models.py +31 -7
  9. bizon/{destinations → connectors/destinations}/bigquery/config/bigquery.example.yml +3 -4
  10. bizon/connectors/destinations/bigquery/src/config.py +127 -0
  11. bizon/{destinations → connectors/destinations}/bigquery/src/destination.py +46 -25
  12. bizon/connectors/destinations/bigquery_streaming/src/config.py +56 -0
  13. bizon/connectors/destinations/bigquery_streaming/src/destination.py +372 -0
  14. bizon/connectors/destinations/bigquery_streaming_v2/src/config.py +52 -0
  15. bizon/connectors/destinations/bigquery_streaming_v2/src/destination.py +261 -0
  16. bizon/{destinations/bigquery_streaming → connectors/destinations/bigquery_streaming_v2}/src/proto_utils.py +32 -26
  17. bizon/{destinations → connectors/destinations}/file/src/config.py +8 -3
  18. bizon/connectors/destinations/file/src/destination.py +54 -0
  19. bizon/{destinations → connectors/destinations}/logger/src/config.py +1 -1
  20. bizon/{destinations → connectors/destinations}/logger/src/destination.py +15 -3
  21. bizon/connectors/sources/cycle/config/cycle.example.yml +15 -0
  22. bizon/connectors/sources/cycle/src/source.py +133 -0
  23. bizon/{sources/periscope/tests/periscope_pipeline_dashboard.py → connectors/sources/cycle/tests/cycle_customers.py} +1 -1
  24. bizon/connectors/sources/dummy/config/dummy.example.yml +22 -0
  25. bizon/{sources → connectors/sources}/dummy/src/fake_api.py +6 -1
  26. bizon/{sources → connectors/sources}/dummy/src/source.py +18 -5
  27. bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline.py +5 -14
  28. bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_bigquery_backend.py +2 -2
  29. bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_kafka.py +2 -2
  30. bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_rabbitmq.py +2 -2
  31. bizon/connectors/sources/dummy/tests/dummy_pipeline_unnest.py +29 -0
  32. bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_write_data_bigquery.py +3 -3
  33. bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_write_data_bigquery_through_kafka.py +2 -2
  34. bizon/{sources → connectors/sources}/gsheets/config/default_auth.example.yml +4 -2
  35. bizon/{sources → connectors/sources}/gsheets/config/service_account.example.yml +4 -2
  36. bizon/{sources → connectors/sources}/hubspot/config/api_key.example.yml +4 -2
  37. bizon/{sources → connectors/sources}/hubspot/config/oauth.example.yml +4 -2
  38. bizon/{sources → connectors/sources}/hubspot/src/hubspot_objects.py +1 -1
  39. bizon/connectors/sources/kafka/config/kafka.example.yml +50 -0
  40. bizon/connectors/sources/kafka/config/kafka_debezium.example.yml +112 -0
  41. bizon/connectors/sources/kafka/src/callback.py +18 -0
  42. bizon/connectors/sources/kafka/src/config.py +75 -0
  43. bizon/connectors/sources/kafka/src/decode.py +88 -0
  44. bizon/connectors/sources/kafka/src/source.py +361 -0
  45. bizon/connectors/sources/kafka/tests/kafka_pipeline.py +7 -0
  46. bizon/connectors/sources/periscope/config/periscope_charts.example.yml +20 -0
  47. bizon/connectors/sources/periscope/config/periscope_dashboards.example.yml +20 -0
  48. bizon/{sources → connectors/sources}/periscope/src/source.py +136 -13
  49. bizon/connectors/sources/periscope/tests/periscope_pipeline_dashboard.py +9 -0
  50. bizon/connectors/sources/pokeapi/config/pokeapi_pokemon_to_json.example.yml +19 -0
  51. bizon/connectors/sources/pokeapi/config/pokeapi_pokemon_to_logger.example.yml +10 -0
  52. bizon/connectors/sources/pokeapi/src/source.py +79 -0
  53. bizon/{destinations → destination}/buffer.py +5 -0
  54. bizon/destination/config.py +74 -0
  55. bizon/{destinations → destination}/destination.py +71 -15
  56. bizon/engine/backend/adapters/sqlalchemy/backend.py +14 -23
  57. bizon/engine/engine.py +20 -1
  58. bizon/engine/pipeline/consumer.py +73 -5
  59. bizon/engine/pipeline/models.py +8 -3
  60. bizon/engine/pipeline/producer.py +18 -9
  61. bizon/engine/queue/adapters/kafka/consumer.py +2 -2
  62. bizon/engine/queue/adapters/kafka/queue.py +3 -2
  63. bizon/engine/queue/adapters/python_queue/consumer.py +40 -23
  64. bizon/engine/queue/adapters/python_queue/queue.py +19 -9
  65. bizon/engine/queue/adapters/rabbitmq/consumer.py +3 -6
  66. bizon/engine/queue/adapters/rabbitmq/queue.py +3 -2
  67. bizon/engine/queue/config.py +16 -0
  68. bizon/engine/queue/queue.py +17 -16
  69. bizon/engine/runner/adapters/process.py +15 -2
  70. bizon/engine/runner/adapters/streaming.py +103 -0
  71. bizon/engine/runner/adapters/thread.py +32 -9
  72. bizon/engine/runner/config.py +28 -0
  73. bizon/engine/runner/runner.py +107 -25
  74. bizon/monitoring/__init__.py +0 -0
  75. bizon/monitoring/config.py +29 -0
  76. bizon/monitoring/datadog/__init__.py +0 -0
  77. bizon/monitoring/datadog/monitor.py +69 -0
  78. bizon/monitoring/monitor.py +42 -0
  79. bizon/monitoring/noop/__init__.py +0 -0
  80. bizon/monitoring/noop/monitor.py +11 -0
  81. bizon/source/callback.py +24 -0
  82. bizon/source/config.py +3 -3
  83. bizon/source/cursor.py +1 -1
  84. bizon/source/discover.py +4 -3
  85. bizon/source/models.py +4 -2
  86. bizon/source/source.py +10 -2
  87. bizon/transform/config.py +8 -0
  88. bizon/transform/transform.py +48 -0
  89. bizon-0.1.1.dist-info/LICENSE +674 -0
  90. {bizon-0.0.14.dist-info → bizon-0.1.1.dist-info}/METADATA +25 -7
  91. bizon-0.1.1.dist-info/RECORD +123 -0
  92. {bizon-0.0.14.dist-info → bizon-0.1.1.dist-info}/WHEEL +1 -1
  93. bizon/destinations/bigquery/src/config.py +0 -51
  94. bizon/destinations/bigquery_streaming/src/config.py +0 -43
  95. bizon/destinations/bigquery_streaming/src/destination.py +0 -154
  96. bizon/destinations/config.py +0 -47
  97. bizon/destinations/file/src/destination.py +0 -27
  98. bizon/sources/dummy/config/api_key.example.yml +0 -20
  99. bizon/sources/dummy/config/api_key_kafka.example.yml +0 -27
  100. bizon/sources/kafka/config/kafka.example.yml +0 -38
  101. bizon/sources/kafka/src/source.py +0 -357
  102. bizon/sources/kafka/tests/kafka_pipeline.py +0 -9
  103. bizon/sources/periscope/config/periscope_charts.example.yml +0 -26
  104. bizon/sources/periscope/config/periscope_dashboards.example.yml +0 -26
  105. bizon-0.0.14.dist-info/LICENSE +0 -21
  106. bizon-0.0.14.dist-info/RECORD +0 -94
  107. /bizon/{sources → connectors/sources}/gsheets/src/source.py +0 -0
  108. /bizon/{sources → connectors/sources}/gsheets/tests/gsheets_pipeline.py +0 -0
  109. /bizon/{sources → connectors/sources}/hubspot/src/hubspot_base.py +0 -0
  110. /bizon/{sources → connectors/sources}/hubspot/src/models/hs_object.py +0 -0
  111. /bizon/{sources → connectors/sources}/hubspot/tests/hubspot_pipeline.py +0 -0
  112. /bizon/{sources → connectors/sources}/periscope/tests/periscope_pipeline_charts.py +0 -0
  113. /bizon/{destinations → destination}/models.py +0 -0
  114. {bizon-0.0.14.dist-info → bizon-0.1.1.dist-info}/entry_points.txt +0 -0
bizon/connectors/sources/pokeapi/src/source.py ADDED
@@ -0,0 +1,79 @@
+ from enum import Enum
+ from typing import Any, List, Tuple
+
+ from requests.auth import AuthBase
+
+ from bizon.source.config import SourceConfig
+ from bizon.source.models import SourceIteration, SourceRecord
+ from bizon.source.source import AbstractSource
+
+ BASE_URL = "https://pokeapi.co/api/v2"
+
+
+ # Define the streams that the source supports
+ class PokeAPIStreams(str, Enum):
+     POKEMON = "pokemon"
+     BERRY = "berry"
+     ITEM = "item"
+
+
+ # Define the config class for the source
+ class PokeAPISourceConfig(SourceConfig):
+     stream: PokeAPIStreams
+
+
+ class PeriscopeSource(AbstractSource):
+
+     def __init__(self, config: PokeAPISourceConfig):
+         super().__init__(config)
+         self.config: PokeAPISourceConfig = config
+
+     @property
+     def url_entity(self) -> str:
+         return f"{BASE_URL}/{self.config.stream}"
+
+     @staticmethod
+     def streams() -> List[str]:
+         return [item.value for item in PokeAPIStreams]
+
+     @staticmethod
+     def get_config_class() -> AbstractSource:
+         return PokeAPISourceConfig
+
+     def check_connection(self) -> Tuple[bool | Any | None]:
+         # Make a request to the base URL to check if the connection is successful
+         _ = self.session.get(self.url_entity)
+         return True, None
+
+     def get_authenticator(self) -> AuthBase:
+         # We return None because we don't need any authentication
+         return None
+
+     def get_total_records_count(self) -> int | None:
+         # Return the total number of records in the stream
+         response = self.session.get(self.url_entity)
+         return response.json().get("count")
+
+     def get_entity_list(self, pagination: dict = None) -> SourceIteration:
+         # If pagination is provided, use the next URL to get the next set of records
+         url = pagination.get("next") if pagination else self.url_entity
+         response = self.session.get(url)
+
+         data = response.json()
+
+         return SourceIteration(
+             next_pagination={"next": data.get("next")} if data.get("next") else {},
+             records=[
+                 SourceRecord(
+                     id=record["name"],
+                     data=record,
+                 )
+                 for record in data["results"]
+             ],
+         )
+
+     def get(self, pagination: dict = None) -> SourceIteration:
+         if self.config.stream in [PokeAPIStreams.POKEMON, PokeAPIStreams.BERRY, PokeAPIStreams.ITEM]:
+             return self.get_entity_list(pagination)
+
+         raise NotImplementedError(f"Stream {self.config.stream} not implemented for PokeAPI source")
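For context on the pagination pattern this new source relies on, here is a small standalone sketch (plain `requests`, no bizon imports, stream name chosen for illustration) that walks the same `count` / `next` / `results` fields that `get_entity_list` reads:

```python
# Standalone illustration of the PokeAPI list endpoints the new source consumes:
# each page returns {"count": ..., "next": <url or null>, "results": [{"name", "url"}, ...]}
# and the source keeps {"next": <url>} as its pagination token until "next" is null.
import requests

BASE_URL = "https://pokeapi.co/api/v2"


def iter_pokeapi(stream: str = "pokemon"):
    url = f"{BASE_URL}/{stream}"
    while url:
        data = requests.get(url, timeout=30).json()
        for record in data["results"]:
            # the source uses record["name"] as the record id and the dict as the payload
            yield record["name"], record
        url = data.get("next")  # None on the last page, which ends the loop


if __name__ == "__main__":
    total = sum(1 for _ in iter_pokeapi("berry"))
    print(f"fetched {total} berries")
```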
bizon/{destinations → destination}/buffer.py RENAMED
@@ -18,6 +18,11 @@ class DestinationBuffer:
          self.pagination = {}
          self.modified_at: List[datetime] = [datetime.now(tz=UTC)]
  
+     @property
+     def is_empty(self) -> bool:
+         """Check if buffer is empty"""
+         return self.df_destination_records.height == 0
+ 
      @property
      def current_size(self) -> int:
          """Return buffer size"""
bizon/destination/config.py ADDED
@@ -0,0 +1,74 @@
+ from abc import ABC
+ from enum import Enum
+ from typing import Optional
+
+ from pydantic import BaseModel, ConfigDict, Field, field_validator
+
+
+ class DestinationTypes(str, Enum):
+     BIGQUERY = "bigquery"
+     BIGQUERY_STREAMING = "bigquery_streaming"
+     BIGQUERY_STREAMING_V2 = "bigquery_streaming_v2"
+     LOGGER = "logger"
+     FILE = "file"
+
+
+ class DestinationColumn(BaseModel, ABC):
+     name: str = Field(..., description="Name of the column")
+     type: str = Field(..., description="Type of the column")
+     description: Optional[str] = Field(None, description="Description of the column")
+
+
+ class RecordSchemaConfig(BaseModel):
+     # Forbid extra keys in the model
+     model_config = ConfigDict(extra="forbid")
+
+     destination_id: str = Field(..., description="Destination ID")
+     record_schema: list[DestinationColumn] = Field(..., description="Record schema")
+
+
+ class AbstractDestinationDetailsConfig(BaseModel):
+
+     # Forbid extra keys in the model
+     model_config = ConfigDict(extra="forbid")
+
+     buffer_size: int = Field(
+         default=50,
+         description="Buffer size in Mb for the destination. Set to 0 to disable and write directly to the destination.",
+     )
+
+     buffer_flush_timeout: int = Field(
+         default=600,
+         description="Maximum time in seconds for buffering after which the records will be written to the destination. Set to 0 to deactivate the timeout buffer check.",  # noqa
+     )
+
+     record_schemas: Optional[list[RecordSchemaConfig]] = Field(
+         default=None, description="Schemas for the records. Required if unnest is set to true."
+     )
+
+     unnest: bool = Field(
+         default=False,
+         description="Unnest the data before writing to the destination. Schema should be provided in the model_config.",
+     )
+
+     authentication: Optional[BaseModel] = Field(
+         description="Authentication configuration for the destination, if needed", default=None
+     )
+
+     destination_id: Optional[str] = Field(
+         description="Destination ID, identifier to use to store the records in the destination", default=None
+     )
+
+     @field_validator("unnest", mode="before")
+     def validate_record_schema_if_unnest(cls, value, values):
+         if bool(value) and not values.data.get("record_schemas", []):
+             raise ValueError("At least one `record_schemas` must be provided if `unnest` is set to True.")
+         return value
+
+
+ class AbstractDestinationConfig(BaseModel):
+     # Forbid extra keys in the model
+     model_config = ConfigDict(extra="forbid")
+
+     name: DestinationTypes = Field(..., description="Name of the destination")
+     config: AbstractDestinationConfig = Field(..., description="Configuration for the destination")
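To make the new schema options concrete, the snippet below is a minimal sketch (the destination name and columns are invented for illustration) that validates a destination block enabling `unnest`, which the `validate_record_schema_if_unnest` validator only accepts when `record_schemas` is also provided:

```python
# Illustrative use of the new config models in bizon/destination/config.py.
from bizon.destination.config import AbstractDestinationConfig

config = AbstractDestinationConfig.model_validate(
    {
        "name": "logger",
        "config": {
            "buffer_size": 0,  # 0 = write straight through, no buffering
            "unnest": True,    # only valid because record_schemas is provided below
            "record_schemas": [
                {
                    "destination_id": "pokemon",
                    "record_schema": [
                        {"name": "name", "type": "STRING", "description": "Pokemon name"},
                        {"name": "url", "type": "STRING"},
                    ],
                }
            ],
        },
    }
)
print(config.config.record_schemas[0].destination_id)  # -> "pokemon"
```

Omitting `record_schemas` while setting `unnest: true` would raise a validation error, since `record_schemas` is declared before `unnest` and is therefore already available to the validator.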
bizon/{destinations → destination}/destination.py RENAMED
@@ -10,6 +10,7 @@ from pydantic import BaseModel, Field
  from bizon.common.models import SyncMetadata
  from bizon.engine.backend.backend import AbstractBackend
  from bizon.engine.backend.models import JobStatus
+ from bizon.source.callback import AbstractSourceCallback
  from bizon.source.config import SourceSyncModes
  
  from .buffer import DestinationBuffer
@@ -43,13 +44,40 @@ class DestinationIteration(BaseModel):
  
  class AbstractDestination(ABC):
  
-     def __init__(self, sync_metadata: SyncMetadata, config: AbstractDestinationDetailsConfig, backend: AbstractBackend):
+     def __init__(
+         self,
+         sync_metadata: SyncMetadata,
+         config: AbstractDestinationDetailsConfig,
+         backend: AbstractBackend,
+         source_callback: AbstractSourceCallback,
+     ):
          self.sync_metadata = sync_metadata
          self.config = config
          self.backend = backend
          self.buffer = DestinationBuffer(
              buffer_size=self.config.buffer_size, buffer_flush_timeout=self.config.buffer_flush_timeout
          )
+         self.source_callback = source_callback
+         self.destination_id = config.destination_id
+ 
+         self._record_schemas = None
+         self._clustering_keys = None
+ 
+     @property
+     def record_schemas(self):
+         if self._record_schemas is None and self.config.record_schemas:
+             self._record_schemas = {
+                 schema.destination_id: schema.record_schema for schema in self.config.record_schemas
+             }
+         return self._record_schemas
+ 
+     @property
+     def clustering_keys(self):
+         if self._clustering_keys is None and self.config.record_schemas:
+             self._clustering_keys = {
+                 schema.destination_id: schema.clustering_keys for schema in self.config.record_schemas
+             }
+         return self._clustering_keys
  
      @abstractmethod
      def check_connection(self) -> bool:
@@ -75,7 +103,7 @@ class AbstractDestination(ABC):
          )
  
          logger.info(
-             f"Writing in destination from source iteration {self.buffer.from_iteration} to {self.buffer.to_iteration}"
+             f"Writing in destination {self.destination_id} from source iteration {self.buffer.from_iteration} to {self.buffer.to_iteration}"
          )
  
          success, error_msg = self.write_records(df_destination_records=self.buffer.df_destination_records)
@@ -83,7 +111,9 @@ class AbstractDestination(ABC):
          if success:
              # We wrote records to destination so we keep it
              destination_iteration.records_written = self.buffer.df_destination_records.height
-             logger.info(f"Successfully wrote {destination_iteration.records_written} records to destination")
+             logger.info(
+                 f"Successfully wrote {destination_iteration.records_written} records to destination {self.destination_id}"
+             )
  
          else:
              # We failed to write records to destination so we keep the error message
@@ -112,8 +142,8 @@
          # Last iteration, write all records to destination
          if last_iteration:
  
-             if self.buffer.df_destination_records.height == 0 and self.config.buffer_size == 0:
-                 logger.warning("No records to write to destination, already written, buffer is empty.")
+             if self.buffer.df_destination_records.height == 0 and self.buffer.is_empty:
+                 logger.info("No records to write to destination, already written, buffer is empty.")
                  return DestinationBufferStatus.RECORDS_WRITTEN
  
              logger.debug("Writing last iteration records to destination")
@@ -143,12 +173,12 @@
  
          # Don't write empty records to destination
          if df_destination_records.height == 0 and not last_iteration:
-             logger.warning("No records to write to destination. Check source and queue provider.")
+             logger.info("No records to write to destination. Check source and queue provider.")
              return DestinationBufferStatus.NO_RECORDS
  
          # Write records to destination if buffer size is 0 or streaming
         if self.buffer.buffer_size == 0:
-             logger.info("Writing records to destination.")
+             logger.info(f"Writing records to destination {self.destination_id}.")
              self.buffer.add_source_iteration_records_to_buffer(
                  iteration=iteration, df_destination_records=df_destination_records, pagination=pagination
              )
@@ -245,26 +275,52 @@ class DestinationFactory:
          sync_metadata: SyncMetadata,
          config: AbstractDestinationConfig,
          backend: AbstractBackend,
+         source_callback: AbstractSourceCallback,
      ) -> AbstractDestination:
  
          if config.name == DestinationTypes.LOGGER:
-             from .logger.src.destination import LoggerDestination
+             from bizon.connectors.destinations.logger.src.destination import (
+                 LoggerDestination,
+             )
  
-             return LoggerDestination(sync_metadata=sync_metadata, config=config.config, backend=backend)
+             return LoggerDestination(
+                 sync_metadata=sync_metadata, config=config.config, backend=backend, source_callback=source_callback
+             )
  
          elif config.name == DestinationTypes.BIGQUERY:
-             from .bigquery.src.destination import BigQueryDestination
+             from bizon.connectors.destinations.bigquery.src.destination import (
+                 BigQueryDestination,
+             )
  
-             return BigQueryDestination(sync_metadata=sync_metadata, config=config.config, backend=backend)
+             return BigQueryDestination(
+                 sync_metadata=sync_metadata, config=config.config, backend=backend, source_callback=source_callback
+             )
  
          elif config.name == DestinationTypes.BIGQUERY_STREAMING:
-             from .bigquery_streaming.src.destination import BigQueryStreamingDestination
+             from bizon.connectors.destinations.bigquery_streaming.src.destination import (
+                 BigQueryStreamingDestination,
+             )
+ 
+             return BigQueryStreamingDestination(
+                 sync_metadata=sync_metadata, config=config.config, backend=backend, source_callback=source_callback
+             )
+ 
+         elif config.name == DestinationTypes.BIGQUERY_STREAMING_V2:
+             from bizon.connectors.destinations.bigquery_streaming_v2.src.destination import (
+                 BigQueryStreamingV2Destination,
+             )
  
-             return BigQueryStreamingDestination(sync_metadata=sync_metadata, config=config.config, backend=backend)
+             return BigQueryStreamingV2Destination(
+                 sync_metadata=sync_metadata, config=config.config, backend=backend, source_callback=source_callback
+             )
  
          elif config.name == DestinationTypes.FILE:
-             from .file.src.destination import FileDestination
+             from bizon.connectors.destinations.file.src.destination import (
+                 FileDestination,
+             )
  
-             return FileDestination(sync_metadata=sync_metadata, config=config.config, backend=backend)
+             return FileDestination(
+                 sync_metadata=sync_metadata, config=config.config, backend=backend, source_callback=source_callback
+             )
  
          raise ValueError(f"Destination {config.name}" f"with params {config} not found")
bizon/engine/backend/adapters/sqlalchemy/backend.py CHANGED
@@ -1,6 +1,6 @@
  import json
  from datetime import datetime
- from typing import Generator, Optional, Union
+ from typing import Optional, Union
  
  from loguru import logger
  from pytz import UTC
@@ -39,9 +39,9 @@ class SQLAlchemyBackend(AbstractBackend):
          ] = config
  
          self.kwargs = kwargs
+         self.session = self.get_session()
  
-     @property
-     def session(self) -> Generator[Session, None, None]:
+     def get_session(self) -> Session:
          """yields a SQLAlchemy connection"""
          engine = self.get_engine()
          session_ = scoped_session(
@@ -51,20 +51,17 @@
              )
          )
  
-         yield session_
- 
-         session_.close()
- 
-         # For SQLite, we need to dispose the engine after each operation to prevent database lock
-         self.handle_dispose_sqlite(engine)
+         return session_
  
      def _get_engine_bigquery(self) -> Engine:
+         # If service account key is provided, use it
          if hasattr(self.config, "service_account_key") and self.config.service_account_key:
              return create_engine(
                  f"bigquery://{self.config.database}/{self.config.schema_name}",
                  echo=self.config.echoEngine,
                  credentials_info=self.config.service_account_key,
              )
+         # Otherwise we will rely on the default Google Authentication mechanism (e.g. GOOGLE_APPLICATION_CREDENTIALS)
          return create_engine(
              f"bigquery://{self.config.database}/{self.config.schema_name}", echo=self.config.echoEngine
          )
@@ -78,7 +75,9 @@
      def _get_engine_sqlite(self) -> Engine:
          return create_engine(
              f"sqlite:///{self.config.database}.sqlite3",
-             connect_args={"check_same_thread": False},
+             connect_args={"check_same_thread": False, "timeout": 30},
+             pool_size=5,  # Adjust based on expected concurrency
+             pool_pre_ping=True,  # Ensures connections are alive
          )
  
      def _get_engine(self) -> Engine:
@@ -120,12 +119,6 @@
                  f"Schema or dataset {self.config.schema_name} does not exist in the database, you need to create it first."
              )
  
-         self.handle_dispose_sqlite(engine)
- 
-     def handle_dispose_sqlite(self, engine: Engine):
-         if self.type == BackendTypes.SQLITE:
-             engine.dispose()
- 
      def get_engine(self) -> Engine:
          """Return the SQLAlchemy engine"""
          if not self._engine:
@@ -138,12 +131,10 @@
      def create_all_tables(self):
          engine = self.get_engine()
          Base.metadata.create_all(engine)
-         self.handle_dispose_sqlite(engine)
  
      def drop_all_tables(self):
          engine = self.get_engine()
          Base.metadata.drop_all(engine)
-         self.handle_dispose_sqlite(engine)
  
      def check_prerequisites(self) -> bool:
          """Check if the database contains the necessary tables, return True if entities are present
@@ -171,20 +162,20 @@
              all_entities_exist = False
              logger.info(f"Table {TABLE_DESTINATION_CURSOR} does not exist in the database, we will create it")
  
-         self.handle_dispose_sqlite(engine)
- 
          return all_entities_exist
  
      def _add_and_commit(self, obj, session: Optional[Session] = None):
          """Add the object to the session and commit it, return its ID"""
-         session = session or next(self.session)
+         session = session or self.session
          session.add(obj)
          session.commit()
          return obj
  
      def _execute(self, select: Select, session: Optional[Session] = None) -> Result:
-         session = session or next(self.session)
-         return session.execute(select)
+         session = session or self.session
+         result = session.execute(select)
+         session.commit()
+         return result
  
      #### STREAM JOB ####
bizon/engine/engine.py CHANGED
@@ -1,4 +1,4 @@
- from loguru import logger
+ from os import getenv
  
  from bizon.cli.utils import parse_from_yaml
  from bizon.common.models import BizonConfig
@@ -7,10 +7,24 @@
  from .runner.runner import AbstractRunner
  
  
+ def replace_env_variables_in_config(config: dict) -> dict:
+     """Replace templated secrets with actual values from environment variables"""
+     for key, value in config.items():
+         if isinstance(value, dict):
+             config[key] = replace_env_variables_in_config(value)
+         elif isinstance(value, str):
+             if value.startswith("BIZON_ENV_"):
+                 config[key] = getenv(value)
+     return config
+ 
+ 
  class RunnerFactory:
      @staticmethod
      def create_from_config_dict(config: dict) -> AbstractRunner:
  
+         # Replace env variables in config
+         config = replace_env_variables_in_config(config=config)
+ 
          bizon_config = BizonConfig.model_validate(obj=config)
  
          if bizon_config.engine.runner.type == RunnerTypes.THREAD:
@@ -23,6 +37,11 @@
  
              return ProcessRunner(config=config)
  
+         if bizon_config.engine.runner.type == RunnerTypes.STREAM:
+             from .runner.adapters.streaming import StreamingRunner
+ 
+             return StreamingRunner(config=config)
+ 
          raise ValueError(f"Runner type {bizon_config.engine.runner.type} is not supported")
  
      @staticmethod
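A short sketch of how the new `replace_env_variables_in_config` helper behaves (the config keys used here, such as `alerting` and `webhook_url`, are invented for illustration; only the helper itself comes from the diff):

```python
# String values starting with "BIZON_ENV_" are looked up verbatim in the environment
# (the prefix is part of the variable name); nested dicts are walked recursively and
# everything else is left untouched. An unset variable would be replaced by None.
import os

from bizon.engine.engine import replace_env_variables_in_config

os.environ["BIZON_ENV_SLACK_WEBHOOK_URL"] = "https://hooks.slack.com/services/T000/B000/XXX"

config = {
    "name": "my_pipeline",
    "alerting": {"type": "slack", "config": {"webhook_url": "BIZON_ENV_SLACK_WEBHOOK_URL"}},
}

resolved = replace_env_variables_in_config(config)
assert resolved["alerting"]["config"]["webhook_url"].startswith("https://hooks.slack.com")
assert resolved["name"] == "my_pipeline"
```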
bizon/engine/pipeline/consumer.py CHANGED
@@ -1,15 +1,83 @@
+ import multiprocessing
+ import multiprocessing.synchronize
+ import threading
+ import traceback
  from abc import ABC, abstractmethod
- from enum import Enum
+ from typing import Union
  
- from bizon.destinations.destination import AbstractDestination
- from bizon.engine.queue.config import AbstractQueueConfig
+ from loguru import logger
+ 
+ from bizon.destination.destination import AbstractDestination
+ from bizon.engine.pipeline.models import PipelineReturnStatus
+ from bizon.engine.queue.config import (
+     QUEUE_TERMINATION,
+     AbstractQueueConfig,
+     QueueMessage,
+ )
+ from bizon.monitoring.monitor import AbstractMonitor
+ from bizon.transform.transform import Transform
  
  
  class AbstractQueueConsumer(ABC):
-     def __init__(self, config: AbstractQueueConfig, destination: AbstractDestination):
+     def __init__(
+         self,
+         config: AbstractQueueConfig,
+         destination: AbstractDestination,
+         transform: Transform,
+         monitor: AbstractMonitor,
+     ):
          self.config = config
          self.destination = destination
+         self.transform = transform
+         self.monitor = monitor
  
      @abstractmethod
-     def run(self):
+     def run(self, stop_event: Union[multiprocessing.synchronize.Event, threading.Event]) -> PipelineReturnStatus:
          pass
+ 
+     def process_queue_message(self, queue_message: QueueMessage) -> PipelineReturnStatus:
+ 
+         # Apply the transformation
+         try:
+             df_source_records = self.transform.apply_transforms(df_source_records=queue_message.df_source_records)
+         except Exception as e:
+             logger.error(f"Error applying transformation: {e}")
+             logger.error(traceback.format_exc())
+             self.monitor.track_pipeline_status(PipelineReturnStatus.TRANSFORM_ERROR)
+             return PipelineReturnStatus.TRANSFORM_ERROR
+ 
+         # Handle last iteration
+         try:
+             if queue_message.signal == QUEUE_TERMINATION:
+                 logger.info("Received termination signal, waiting for destination to close gracefully ...")
+                 self.destination.write_records_and_update_cursor(
+                     df_source_records=df_source_records,
+                     iteration=queue_message.iteration,
+                     extracted_at=queue_message.extracted_at,
+                     pagination=queue_message.pagination,
+                     last_iteration=True,
+                 )
+                 self.monitor.track_pipeline_status(PipelineReturnStatus.SUCCESS)
+                 return PipelineReturnStatus.SUCCESS
+ 
+         except Exception as e:
+             logger.error(f"Error writing records to destination: {e}")
+             self.monitor.track_pipeline_status(PipelineReturnStatus.DESTINATION_ERROR)
+             return PipelineReturnStatus.DESTINATION_ERROR
+ 
+         # Write the records to the destination
+         try:
+             self.destination.write_records_and_update_cursor(
+                 df_source_records=df_source_records,
+                 iteration=queue_message.iteration,
+                 extracted_at=queue_message.extracted_at,
+                 pagination=queue_message.pagination,
+             )
+             return PipelineReturnStatus.RUNNING
+ 
+         except Exception as e:
+             logger.error(f"Error writing records to destination: {e}")
+             self.monitor.track_pipeline_status(PipelineReturnStatus.DESTINATION_ERROR)
+             return PipelineReturnStatus.DESTINATION_ERROR
+ 
+         raise RuntimeError("Should not reach this point")
bizon/engine/pipeline/models.py CHANGED
@@ -1,10 +1,15 @@
  from enum import Enum
  
  
- class PipelineReturnStatus(Enum):
+ class PipelineReturnStatus(str, Enum):
      """Producer error types"""
  
-     SUCCESS = "success"
+     BACKEND_ERROR = "backend_error"
+     DESTINATION_ERROR = "destination_error"
+     KILLED_BY_RUNNER = "killed_by_runner"
      QUEUE_ERROR = "queue_error"
+     RUNNING = "running"
      SOURCE_ERROR = "source_error"
-     BACKEND_ERROR = "backend_error"
+     SUCCESS = "success"
+     TRANSFORM_ERROR = "transform_error"
+     STREAM_ERROR = "stream_error"
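A minor but useful consequence of inheriting from `str` as well as `Enum`: statuses now compare equal to their string values and serialize cleanly, which is convenient for the monitoring hooks that receive them. A quick illustration:

```python
# Assumes bizon >= 0.1.1 is installed; PipelineReturnStatus members now behave like plain strings.
from bizon.engine.pipeline.models import PipelineReturnStatus

status = PipelineReturnStatus.TRANSFORM_ERROR
assert status == "transform_error"  # str comparison now works
print(f"pipeline exited with status: {status.value}")
```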
bizon/engine/pipeline/producer.py CHANGED
@@ -1,8 +1,11 @@
  import ast
+ import multiprocessing
+ import multiprocessing.synchronize
+ import threading
  import traceback
  from datetime import datetime
  from time import sleep
- from typing import Tuple
+ from typing import Tuple, Union
  
  from loguru import logger
  from pytz import UTC
@@ -28,7 +31,7 @@ class Producer:
  
      @property
      def name(self) -> str:
-         return f"producer-{self.source.config.source_name}-{self.source.config.stream_name}"
+         return f"producer-{self.source.config.name}-{self.source.config.stream}"
  
      def get_or_create_cursor(self, job_id: str, session=None) -> Cursor:
          """Get or create a cursor for the current stream, return the cursor"""
@@ -47,8 +50,8 @@
  
              # Initialize the recovery from the DestinationCursor
              cursor = Cursor.from_db(
-                 source_name=self.source.config.source_name,
-                 stream_name=self.source.config.stream_name,
+                 source_name=self.source.config.name,
+                 stream_name=self.source.config.stream,
                  job_id=job_id,
                  total_records=job.total_records_to_fetch,
                  iteration=cursor_from_db.to_source_iteration + 1,
@@ -60,8 +63,8 @@
              total_records = self.source.get_total_records_count()
              # Initialize the cursor
              cursor = Cursor(
-                 source_name=self.source.config.source_name,
-                 stream_name=self.source.config.stream_name,
+                 source_name=self.source.config.name,
+                 stream_name=self.source.config.stream,
                  job_id=job_id,
                  total_records=total_records,
              )
@@ -99,7 +102,7 @@
  
          return False, queue_size, approximate_nb_records_in_queue
  
-     def run(self, job_id: int):
+     def run(
+         self, job_id: int, stop_event: Union[multiprocessing.synchronize.Event, threading.Event]
+     ) -> PipelineReturnStatus:
  
          return_value: PipelineReturnStatus = PipelineReturnStatus.SUCCESS
  
@@ -128,6 +133,10 @@
  
          while not cursor.is_finished:
  
+             if stop_event.is_set():
+                 logger.info("Stop event is set, terminating producer ...")
+                 return PipelineReturnStatus.KILLED_BY_RUNNER
+ 
              timestamp_start_iteration = datetime.now(tz=UTC)
  
              # Handle the case where last cursor already reach max_iterations
@@ -142,8 +151,8 @@
                  self.backend.create_source_cursor(
                      job_id=job_id,
                      name=self.bizon_config.name,
-                     source_name=self.source.config.source_name,
-                     stream_name=self.source.config.stream_name,
+                     source_name=self.source.config.name,
+                     stream_name=self.source.config.stream,
                      iteration=cursor.iteration,
                      rows_fetched=cursor.rows_fetched,
                      next_pagination=cursor.pagination,
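The producer's `run` now accepts a `stop_event` and returns `KILLED_BY_RUNNER` when it is set. The toy loop below (pure standard library, no bizon imports) mirrors that cooperative-shutdown contract as a runner might drive it from a thread; the worker body is a stand-in, not the actual producer:

```python
# Toy illustration of the stop_event contract: the runner owns the Event,
# the worker polls it between iterations and reports how it exited.
import threading
import time


def worker(stop_event: threading.Event) -> str:
    for _ in range(1_000):
        if stop_event.is_set():
            return "killed_by_runner"
        time.sleep(0.01)  # stand-in for one source iteration
    return "success"


stop_event = threading.Event()
result: dict = {}
thread = threading.Thread(target=lambda: result.setdefault("status", worker(stop_event)))
thread.start()
time.sleep(0.1)
stop_event.set()  # runner decides to shut the pipeline down
thread.join()
print(result["status"])  # -> "killed_by_runner"
```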
bizon/engine/queue/adapters/kafka/consumer.py CHANGED
@@ -3,9 +3,9 @@ import json
  from kafka import KafkaConsumer
  from loguru import logger
  
- from bizon.destinations.destination import AbstractDestination
+ from bizon.destination.destination import AbstractDestination
  from bizon.engine.pipeline.consumer import AbstractQueueConsumer
- from bizon.engine.queue.queue import QUEUE_TERMINATION, QueueMessage
+ from bizon.engine.queue.config import QUEUE_TERMINATION, QueueMessage
  
  from .config import KafkaConfigDetails
  
bizon/engine/queue/adapters/kafka/queue.py CHANGED
@@ -4,8 +4,9 @@ from typing import Union
  from kafka import KafkaProducer
  from loguru import logger
  
- from bizon.destinations.destination import AbstractDestination
- from bizon.engine.queue.queue import QUEUE_TERMINATION, AbstractQueue, QueueMessage
+ from bizon.destination.destination import AbstractDestination
+ from bizon.engine.queue.config import QUEUE_TERMINATION, QueueMessage
+ from bizon.engine.queue.queue import AbstractQueue
  
  from .config import KafkaConfigDetails
  from .consumer import KafkaConsumer_