bizon 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110)
  1. bizon/alerting/__init__.py +0 -0
  2. bizon/alerting/alerts.py +23 -0
  3. bizon/alerting/models.py +28 -0
  4. bizon/alerting/slack/__init__.py +0 -0
  5. bizon/alerting/slack/config.py +5 -0
  6. bizon/alerting/slack/handler.py +39 -0
  7. bizon/cli/main.py +7 -3
  8. bizon/common/models.py +33 -7
  9. bizon/{destinations → connectors/destinations}/bigquery/config/bigquery.example.yml +3 -4
  10. bizon/connectors/destinations/bigquery/src/config.py +128 -0
  11. bizon/{destinations → connectors/destinations}/bigquery/src/destination.py +48 -25
  12. bizon/connectors/destinations/bigquery_streaming/src/config.py +57 -0
  13. bizon/connectors/destinations/bigquery_streaming/src/destination.py +377 -0
  14. bizon/connectors/destinations/bigquery_streaming_v2/src/config.py +57 -0
  15. bizon/connectors/destinations/bigquery_streaming_v2/src/destination.py +446 -0
  16. bizon/{destinations/bigquery_streaming → connectors/destinations/bigquery_streaming_v2}/src/proto_utils.py +30 -36
  17. bizon/{destinations → connectors/destinations}/file/src/config.py +9 -3
  18. bizon/connectors/destinations/file/src/destination.py +56 -0
  19. bizon/{destinations → connectors/destinations}/logger/src/config.py +2 -1
  20. bizon/{destinations → connectors/destinations}/logger/src/destination.py +18 -3
  21. bizon/connectors/sources/cycle/config/cycle.example.yml +15 -0
  22. bizon/connectors/sources/cycle/src/source.py +133 -0
  23. bizon/{sources/periscope/tests/periscope_pipeline_dashboard.py → connectors/sources/cycle/tests/cycle_customers.py} +1 -1
  24. bizon/{sources → connectors/sources}/dummy/config/dummy.example.yml +2 -2
  25. bizon/{sources → connectors/sources}/dummy/src/fake_api.py +6 -1
  26. bizon/{sources → connectors/sources}/dummy/src/source.py +18 -5
  27. bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline.py +2 -2
  28. bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_bigquery_backend.py +2 -2
  29. bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_kafka.py +2 -2
  30. bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_rabbitmq.py +2 -2
  31. bizon/connectors/sources/dummy/tests/dummy_pipeline_unnest.py +29 -0
  32. bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_write_data_bigquery.py +2 -2
  33. bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_write_data_bigquery_through_kafka.py +2 -2
  34. bizon/{sources → connectors/sources}/gsheets/config/default_auth.example.yml +4 -2
  35. bizon/{sources → connectors/sources}/gsheets/config/service_account.example.yml +4 -2
  36. bizon/{sources → connectors/sources}/hubspot/config/api_key.example.yml +4 -2
  37. bizon/{sources → connectors/sources}/hubspot/config/oauth.example.yml +4 -2
  38. bizon/{sources → connectors/sources}/hubspot/src/hubspot_objects.py +1 -1
  39. bizon/{sources → connectors/sources}/kafka/config/kafka.example.yml +3 -5
  40. bizon/connectors/sources/kafka/config/kafka_debezium.example.yml +110 -0
  41. bizon/connectors/sources/kafka/src/callback.py +18 -0
  42. bizon/connectors/sources/kafka/src/config.py +69 -0
  43. bizon/connectors/sources/kafka/src/decode.py +93 -0
  44. bizon/connectors/sources/kafka/src/source.py +381 -0
  45. bizon/connectors/sources/kafka/tests/kafka_pipeline.py +7 -0
  46. bizon/connectors/sources/periscope/config/periscope_charts.example.yml +20 -0
  47. bizon/connectors/sources/periscope/config/periscope_dashboards.example.yml +20 -0
  48. bizon/{sources → connectors/sources}/periscope/src/source.py +137 -13
  49. bizon/connectors/sources/periscope/tests/periscope_pipeline_dashboard.py +9 -0
  50. bizon/connectors/sources/pokeapi/config/pokeapi_pokemon_to_json.example.yml +19 -0
  51. bizon/connectors/sources/pokeapi/config/pokeapi_pokemon_to_logger.example.yml +10 -0
  52. bizon/connectors/sources/pokeapi/src/source.py +79 -0
  53. bizon/{destinations → destination}/buffer.py +5 -0
  54. bizon/destination/config.py +83 -0
  55. bizon/{destinations → destination}/destination.py +103 -15
  56. bizon/engine/backend/adapters/sqlalchemy/backend.py +3 -1
  57. bizon/engine/engine.py +20 -1
  58. bizon/engine/pipeline/consumer.py +73 -5
  59. bizon/engine/pipeline/models.py +8 -3
  60. bizon/engine/pipeline/producer.py +18 -9
  61. bizon/engine/queue/adapters/kafka/consumer.py +2 -2
  62. bizon/engine/queue/adapters/kafka/queue.py +3 -2
  63. bizon/engine/queue/adapters/python_queue/consumer.py +40 -23
  64. bizon/engine/queue/adapters/python_queue/queue.py +19 -9
  65. bizon/engine/queue/adapters/rabbitmq/consumer.py +3 -6
  66. bizon/engine/queue/adapters/rabbitmq/queue.py +3 -2
  67. bizon/engine/queue/config.py +16 -0
  68. bizon/engine/queue/queue.py +17 -16
  69. bizon/engine/runner/adapters/process.py +15 -2
  70. bizon/engine/runner/adapters/streaming.py +121 -0
  71. bizon/engine/runner/adapters/thread.py +32 -9
  72. bizon/engine/runner/config.py +28 -0
  73. bizon/engine/runner/runner.py +113 -24
  74. bizon/monitoring/__init__.py +0 -0
  75. bizon/monitoring/config.py +39 -0
  76. bizon/monitoring/datadog/__init__.py +0 -0
  77. bizon/monitoring/datadog/monitor.py +153 -0
  78. bizon/monitoring/monitor.py +71 -0
  79. bizon/monitoring/noop/__init__.py +0 -0
  80. bizon/monitoring/noop/monitor.py +30 -0
  81. bizon/source/callback.py +24 -0
  82. bizon/source/config.py +3 -3
  83. bizon/source/cursor.py +1 -1
  84. bizon/source/discover.py +4 -3
  85. bizon/source/models.py +4 -2
  86. bizon/source/source.py +10 -2
  87. bizon/transform/config.py +8 -0
  88. bizon/transform/transform.py +48 -0
  89. {bizon-0.1.0.dist-info → bizon-0.1.2.dist-info}/METADATA +23 -6
  90. bizon-0.1.2.dist-info/RECORD +123 -0
  91. {bizon-0.1.0.dist-info → bizon-0.1.2.dist-info}/WHEEL +1 -1
  92. bizon/destinations/bigquery/src/config.py +0 -51
  93. bizon/destinations/bigquery_streaming/src/config.py +0 -43
  94. bizon/destinations/bigquery_streaming/src/destination.py +0 -154
  95. bizon/destinations/config.py +0 -47
  96. bizon/destinations/file/src/destination.py +0 -27
  97. bizon/sources/kafka/src/source.py +0 -357
  98. bizon/sources/kafka/tests/kafka_pipeline.py +0 -9
  99. bizon/sources/periscope/config/periscope_charts.example.yml +0 -26
  100. bizon/sources/periscope/config/periscope_dashboards.example.yml +0 -26
  101. bizon-0.1.0.dist-info/RECORD +0 -93
  102. /bizon/{sources → connectors/sources}/gsheets/src/source.py +0 -0
  103. /bizon/{sources → connectors/sources}/gsheets/tests/gsheets_pipeline.py +0 -0
  104. /bizon/{sources → connectors/sources}/hubspot/src/hubspot_base.py +0 -0
  105. /bizon/{sources → connectors/sources}/hubspot/src/models/hs_object.py +0 -0
  106. /bizon/{sources → connectors/sources}/hubspot/tests/hubspot_pipeline.py +0 -0
  107. /bizon/{sources → connectors/sources}/periscope/tests/periscope_pipeline_charts.py +0 -0
  108. /bizon/{destinations → destination}/models.py +0 -0
  109. {bizon-0.1.0.dist-info → bizon-0.1.2.dist-info}/LICENSE +0 -0
  110. {bizon-0.1.0.dist-info → bizon-0.1.2.dist-info}/entry_points.txt +0 -0
bizon/engine/engine.py CHANGED
@@ -1,4 +1,4 @@
-from loguru import logger
+from os import getenv
 
 from bizon.cli.utils import parse_from_yaml
 from bizon.common.models import BizonConfig
@@ -7,10 +7,24 @@ from .config import RunnerTypes
 from .runner.runner import AbstractRunner
 
 
+def replace_env_variables_in_config(config: dict) -> dict:
+    """Replace templated secrets with actual values from environment variables"""
+    for key, value in config.items():
+        if isinstance(value, dict):
+            config[key] = replace_env_variables_in_config(value)
+        elif isinstance(value, str):
+            if value.startswith("BIZON_ENV_"):
+                config[key] = getenv(value)
+    return config
+
+
 class RunnerFactory:
     @staticmethod
     def create_from_config_dict(config: dict) -> AbstractRunner:
 
+        # Replace env variables in config
+        config = replace_env_variables_in_config(config=config)
+
         bizon_config = BizonConfig.model_validate(obj=config)
 
         if bizon_config.engine.runner.type == RunnerTypes.THREAD:
@@ -23,6 +37,11 @@ class RunnerFactory:
 
             return ProcessRunner(config=config)
 
+        if bizon_config.engine.runner.type == RunnerTypes.STREAM:
+            from .runner.adapters.streaming import StreamingRunner
+
+            return StreamingRunner(config=config)
+
         raise ValueError(f"Runner type {bizon_config.engine.runner.type} is not supported")
 
     @staticmethod
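The new `replace_env_variables_in_config` helper walks the config dict recursively and, for any string value starting with `BIZON_ENV_`, substitutes the value of the environment variable with that exact name (an unset variable therefore becomes `None`). A minimal sketch of the behavior; the config keys below are illustrative, not from a real pipeline:

```python
import os

from bizon.engine.engine import replace_env_variables_in_config

os.environ["BIZON_ENV_API_KEY"] = "s3cr3t"

config = {
    "name": "demo-pipeline",
    "source": {
        "api_key": "BIZON_ENV_API_KEY",  # replaced with os.getenv("BIZON_ENV_API_KEY")
        "max_iterations": 3,  # non-string values are left untouched
    },
}

config = replace_env_variables_in_config(config)
assert config["source"]["api_key"] == "s3cr3t"
```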
bizon/engine/pipeline/consumer.py CHANGED
@@ -1,15 +1,83 @@
+import multiprocessing
+import multiprocessing.synchronize
+import threading
+import traceback
 from abc import ABC, abstractmethod
-from enum import Enum
+from typing import Union
 
-from bizon.destinations.destination import AbstractDestination
-from bizon.engine.queue.config import AbstractQueueConfig
+from loguru import logger
+
+from bizon.destination.destination import AbstractDestination
+from bizon.engine.pipeline.models import PipelineReturnStatus
+from bizon.engine.queue.config import (
+    QUEUE_TERMINATION,
+    AbstractQueueConfig,
+    QueueMessage,
+)
+from bizon.monitoring.monitor import AbstractMonitor
+from bizon.transform.transform import Transform
 
 
 class AbstractQueueConsumer(ABC):
-    def __init__(self, config: AbstractQueueConfig, destination: AbstractDestination):
+    def __init__(
+        self,
+        config: AbstractQueueConfig,
+        destination: AbstractDestination,
+        transform: Transform,
+        monitor: AbstractMonitor,
+    ):
         self.config = config
         self.destination = destination
+        self.transform = transform
+        self.monitor = monitor
 
     @abstractmethod
-    def run(self):
+    def run(self, stop_event: Union[multiprocessing.synchronize.Event, threading.Event]) -> PipelineReturnStatus:
         pass
+
+    def process_queue_message(self, queue_message: QueueMessage) -> PipelineReturnStatus:
+
+        # Apply the transformation
+        try:
+            df_source_records = self.transform.apply_transforms(df_source_records=queue_message.df_source_records)
+        except Exception as e:
+            logger.error(f"Error applying transformation: {e}")
+            logger.error(traceback.format_exc())
+            self.monitor.track_pipeline_status(PipelineReturnStatus.TRANSFORM_ERROR)
+            return PipelineReturnStatus.TRANSFORM_ERROR
+
+        # Handle last iteration
+        try:
+            if queue_message.signal == QUEUE_TERMINATION:
+                logger.info("Received termination signal, waiting for destination to close gracefully ...")
+                self.destination.write_records_and_update_cursor(
+                    df_source_records=df_source_records,
+                    iteration=queue_message.iteration,
+                    extracted_at=queue_message.extracted_at,
+                    pagination=queue_message.pagination,
+                    last_iteration=True,
+                )
+                self.monitor.track_pipeline_status(PipelineReturnStatus.SUCCESS)
+                return PipelineReturnStatus.SUCCESS
+
+        except Exception as e:
+            logger.error(f"Error writing records to destination: {e}")
+            self.monitor.track_pipeline_status(PipelineReturnStatus.DESTINATION_ERROR)
+            return PipelineReturnStatus.DESTINATION_ERROR
+
+        # Write the records to the destination
+        try:
+            self.destination.write_records_and_update_cursor(
+                df_source_records=df_source_records,
+                iteration=queue_message.iteration,
+                extracted_at=queue_message.extracted_at,
+                pagination=queue_message.pagination,
+            )
+            return PipelineReturnStatus.RUNNING
+
+        except Exception as e:
+            logger.error(f"Error writing records to destination: {e}")
+            self.monitor.track_pipeline_status(PipelineReturnStatus.DESTINATION_ERROR)
+            return PipelineReturnStatus.DESTINATION_ERROR
+
+        raise RuntimeError("Should not reach this point")
bizon/engine/pipeline/models.py CHANGED
@@ -1,10 +1,15 @@
 from enum import Enum
 
 
-class PipelineReturnStatus(Enum):
+class PipelineReturnStatus(str, Enum):
     """Producer error types"""
 
-    SUCCESS = "success"
+    BACKEND_ERROR = "backend_error"
+    DESTINATION_ERROR = "destination_error"
+    KILLED_BY_RUNNER = "killed_by_runner"
     QUEUE_ERROR = "queue_error"
+    RUNNING = "running"
     SOURCE_ERROR = "source_error"
-    BACKEND_ERROR = "backend_error"
+    SUCCESS = "success"
+    TRANSFORM_ERROR = "transform_error"
+    STREAM_ERROR = "stream_error"
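Since `PipelineReturnStatus` now mixes in `str`, its members compare equal to their string values, which keeps log tags and serialized payloads readable without calling `.value` everywhere. A quick illustration:

```python
from bizon.engine.pipeline.models import PipelineReturnStatus

# str mixin: members are also plain strings
assert PipelineReturnStatus.RUNNING == "running"
assert PipelineReturnStatus.KILLED_BY_RUNNER.value == "killed_by_runner"

# convenient when the status ends up in JSON payloads or log tags
status = PipelineReturnStatus.TRANSFORM_ERROR
print(f"pipeline stopped with status={status}")
```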
bizon/engine/pipeline/producer.py CHANGED
@@ -1,8 +1,11 @@
 import ast
+import multiprocessing
+import multiprocessing.synchronize
+import threading
 import traceback
 from datetime import datetime
 from time import sleep
-from typing import Tuple
+from typing import Tuple, Union
 
 from loguru import logger
 from pytz import UTC
@@ -28,7 +31,7 @@ class Producer:
 
     @property
     def name(self) -> str:
-        return f"producer-{self.source.config.source_name}-{self.source.config.stream_name}"
+        return f"producer-{self.source.config.name}-{self.source.config.stream}"
 
     def get_or_create_cursor(self, job_id: str, session=None) -> Cursor:
         """Get or create a cursor for the current stream, return the cursor"""
@@ -47,8 +50,8 @@
 
         # Initialize the recovery from the DestinationCursor
         cursor = Cursor.from_db(
-            source_name=self.source.config.source_name,
-            stream_name=self.source.config.stream_name,
+            source_name=self.source.config.name,
+            stream_name=self.source.config.stream,
             job_id=job_id,
             total_records=job.total_records_to_fetch,
             iteration=cursor_from_db.to_source_iteration + 1,
@@ -60,8 +63,8 @@
         total_records = self.source.get_total_records_count()
         # Initialize the cursor
         cursor = Cursor(
-            source_name=self.source.config.source_name,
-            stream_name=self.source.config.stream_name,
+            source_name=self.source.config.name,
+            stream_name=self.source.config.stream,
             job_id=job_id,
             total_records=total_records,
         )
@@ -99,7 +102,9 @@
 
         return False, queue_size, approximate_nb_records_in_queue
 
-    def run(self, job_id: int):
+    def run(
+        self, job_id: int, stop_event: Union[multiprocessing.synchronize.Event, threading.Event]
+    ) -> PipelineReturnStatus:
 
         return_value: PipelineReturnStatus = PipelineReturnStatus.SUCCESS
 
@@ -128,6 +133,10 @@
 
         while not cursor.is_finished:
 
+            if stop_event.is_set():
+                logger.info("Stop event is set, terminating producer ...")
+                return PipelineReturnStatus.KILLED_BY_RUNNER
+
             timestamp_start_iteration = datetime.now(tz=UTC)
 
             # Handle the case where last cursor already reach max_iterations
@@ -142,8 +151,8 @@
                 self.backend.create_source_cursor(
                     job_id=job_id,
                     name=self.bizon_config.name,
-                    source_name=self.source.config.source_name,
-                    stream_name=self.source.config.stream_name,
+                    source_name=self.source.config.name,
+                    stream_name=self.source.config.stream,
                     iteration=cursor.iteration,
                     rows_fetched=cursor.rows_fetched,
                     next_pagination=cursor.pagination,
bizon/engine/queue/adapters/kafka/consumer.py CHANGED
@@ -3,9 +3,9 @@ import json
 from kafka import KafkaConsumer
 from loguru import logger
 
-from bizon.destinations.destination import AbstractDestination
+from bizon.destination.destination import AbstractDestination
 from bizon.engine.pipeline.consumer import AbstractQueueConsumer
-from bizon.engine.queue.queue import QUEUE_TERMINATION, QueueMessage
+from bizon.engine.queue.config import QUEUE_TERMINATION, QueueMessage
 
 from .config import KafkaConfigDetails
 
bizon/engine/queue/adapters/kafka/queue.py CHANGED
@@ -4,8 +4,9 @@ from typing import Union
 from kafka import KafkaProducer
 from loguru import logger
 
-from bizon.destinations.destination import AbstractDestination
-from bizon.engine.queue.queue import QUEUE_TERMINATION, AbstractQueue, QueueMessage
+from bizon.destination.destination import AbstractDestination
+from bizon.engine.queue.config import QUEUE_TERMINATION, QueueMessage
+from bizon.engine.queue.queue import AbstractQueue
 
 from .config import KafkaConfigDetails
 from .consumer import KafkaConsumer_
bizon/engine/queue/adapters/python_queue/consumer.py CHANGED
@@ -1,36 +1,53 @@
+import multiprocessing
+import multiprocessing.synchronize
+import threading
+from typing import Union
+
 from loguru import logger
 
-from bizon.destinations.destination import AbstractDestination
+from bizon.destination.destination import AbstractDestination
 from bizon.engine.pipeline.consumer import AbstractQueueConsumer
-from bizon.engine.queue.queue import QUEUE_TERMINATION, AbstractQueue, QueueMessage
+from bizon.engine.pipeline.models import PipelineReturnStatus
+from bizon.engine.queue.config import QueueMessage
+from bizon.engine.queue.queue import AbstractQueue
+from bizon.monitoring.monitor import AbstractMonitor
+from bizon.transform.transform import Transform
 
 from .config import PythonQueueConfig
 
 
 class PythonQueueConsumer(AbstractQueueConsumer):
-    def __init__(self, config: PythonQueueConfig, queue: AbstractQueue, destination: AbstractDestination):
-        super().__init__(config, destination=destination)
+    def __init__(
+        self,
+        config: PythonQueueConfig,
+        queue: AbstractQueue,
+        destination: AbstractDestination,
+        transform: Transform,
+        monitor: AbstractMonitor,
+    ):
+        super().__init__(
+            config,
+            destination=destination,
+            transform=transform,
+            monitor=monitor,
+        )
         self.queue = queue
+        self.monitor.track_pipeline_status(PipelineReturnStatus.RUNNING)
+
+    def run(self, stop_event: Union[threading.Event, multiprocessing.synchronize.Event]) -> PipelineReturnStatus:
 
-    def run(self) -> None:
         while True:
+            # Handle kill signal from the runner
+            if stop_event.is_set():
+                logger.info("Stop event is set, closing consumer ...")
+                self.monitor.track_pipeline_status(PipelineReturnStatus.KILLED_BY_RUNNER)
+                return PipelineReturnStatus.KILLED_BY_RUNNER
+
+            # Retrieve the message from the queue
             queue_message: QueueMessage = self.queue.get()
 
-            if queue_message.signal == QUEUE_TERMINATION:
-                logger.info("Received termination signal, waiting for destination to close gracefully ...")
-                self.destination.write_records_and_update_cursor(
-                    df_source_records=queue_message.df_source_records,
-                    iteration=queue_message.iteration,
-                    extracted_at=queue_message.extracted_at,
-                    pagination=queue_message.pagination,
-                    last_iteration=True,
-                )
-                break
-
-            self.destination.write_records_and_update_cursor(
-                df_source_records=queue_message.df_source_records,
-                iteration=queue_message.iteration,
-                extracted_at=queue_message.extracted_at,
-                pagination=queue_message.pagination,
-            )
-            self.queue.task_done()
+            status = self.process_queue_message(queue_message)
+
+            if status != PipelineReturnStatus.RUNNING:
+                self.queue.task_done()
+                return status
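`run` now takes a `stop_event` (a `threading.Event` or `multiprocessing.Event`) and checks it on every loop turn, returning `KILLED_BY_RUNNER` when it is set. A sketch of the runner-side contract this enables; the timeout-based cancellation helper below is a hypothetical usage pattern, not code from the package:

```python
import threading

from bizon.engine.pipeline.models import PipelineReturnStatus


def run_consumer_with_timeout(consumer, timeout_s: float = 60.0) -> PipelineReturnStatus:
    """Run a fully wired PythonQueueConsumer in a thread, cancelling it cooperatively."""
    stop_event = threading.Event()
    result = {}

    def target():
        # Blocks until a terminal status is returned or stop_event is set
        result["status"] = consumer.run(stop_event)

    worker = threading.Thread(target=target, daemon=True)
    worker.start()
    worker.join(timeout=timeout_s)
    if worker.is_alive():
        stop_event.set()  # consumer returns KILLED_BY_RUNNER on its next loop turn
        worker.join()
    return result["status"]
```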
bizon/engine/queue/adapters/python_queue/queue.py CHANGED
@@ -5,14 +5,13 @@ from typing import Union
 
 from loguru import logger
 
-from bizon.destinations.destination import AbstractDestination
-from bizon.engine.queue.queue import (
-    QUEUE_TERMINATION,
-    AbstractQueue,
-    AbstractQueueConsumer,
-    QueueMessage,
-)
+from bizon.destination.destination import AbstractDestination
+from bizon.engine.queue.config import QUEUE_TERMINATION, QueueMessage
+from bizon.engine.queue.queue import AbstractQueue, AbstractQueueConsumer
+from bizon.monitoring.monitor import AbstractMonitor
+from bizon.source.callback import AbstractSourceCallback
 from bizon.source.models import SourceIteration
+from bizon.transform.transform import Transform
 
 from .config import PythonQueueConfigDetails
 from .consumer import PythonQueueConsumer
@@ -31,8 +30,19 @@ class PythonQueue(AbstractQueue):
         # No connection to establish for PythonQueue
         pass
 
-    def get_consumer(self, destination: AbstractDestination) -> AbstractQueueConsumer:
-        return PythonQueueConsumer(config=self.config, queue=self.queue, destination=destination)
+    def get_consumer(
+        self,
+        destination: AbstractDestination,
+        transform: Transform,
+        monitor: AbstractMonitor,
+    ) -> AbstractQueueConsumer:
+        return PythonQueueConsumer(
+            config=self.config,
+            queue=self.queue,
+            destination=destination,
+            transform=transform,
+            monitor=monitor,
+        )
 
     def put_queue_message(self, queue_message: QueueMessage):
         if not self.queue.full():
bizon/engine/queue/adapters/rabbitmq/consumer.py CHANGED
@@ -2,12 +2,9 @@ import pika
 import pika.connection
 from loguru import logger
 
-from bizon.destinations.destination import AbstractDestination
-from bizon.engine.queue.queue import (
-    QUEUE_TERMINATION,
-    AbstractQueueConsumer,
-    QueueMessage,
-)
+from bizon.destination.destination import AbstractDestination
+from bizon.engine.queue.config import QUEUE_TERMINATION
+from bizon.engine.queue.queue import AbstractQueueConsumer, QueueMessage
 
 from .config import RabbitMQConfigDetails
 
bizon/engine/queue/adapters/rabbitmq/queue.py CHANGED
@@ -3,9 +3,10 @@ from typing import Union
 import pika
 from loguru import logger
 
-from bizon.destinations.destination import AbstractDestination
+from bizon.destination.destination import AbstractDestination
 from bizon.engine.pipeline.consumer import AbstractQueueConsumer
-from bizon.engine.queue.queue import QUEUE_TERMINATION, AbstractQueue, QueueMessage
+from bizon.engine.queue.config import QUEUE_TERMINATION, QueueMessage
+from bizon.engine.queue.queue import AbstractQueue
 
 from .config import RabbitMQConfigDetails
 from .consumer import RabbitMQConsumer
bizon/engine/queue/config.py CHANGED
@@ -1,7 +1,23 @@
 from abc import ABC
+from dataclasses import dataclass
+from datetime import datetime
 from enum import Enum
+from typing import Optional
 
+import polars as pl
 from pydantic import BaseModel, ConfigDict, Field
+from pytz import UTC
+
+QUEUE_TERMINATION = "TERMINATION"
+
+
+@dataclass
+class QueueMessage:
+    iteration: int
+    df_source_records: pl.DataFrame
+    extracted_at: datetime = datetime.now(tz=UTC)
+    pagination: Optional[dict] = None
+    signal: Optional[str] = None
 
 
 class QueueTypes(str, Enum):
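`QUEUE_TERMINATION` and the `QueueMessage` dataclass now live in `config.py` rather than `queue.py`, so queue adapters can import them without depending on `AbstractQueue`. A small sketch of how messages are built, with a toy DataFrame standing in for real source records:

```python
from datetime import datetime

import polars as pl
from pytz import UTC

from bizon.engine.queue.config import QUEUE_TERMINATION, QueueMessage

# A regular batch: records plus the pagination state they were fetched with.
batch = QueueMessage(
    iteration=0,
    df_source_records=pl.DataFrame({"id": ["1"], "data": ['{"k": "v"}']}),
    extracted_at=datetime.now(tz=UTC),  # pass explicitly: the field default is evaluated once, at import time
    pagination={"cursor": "abc"},
)

# The producer's final message carries the termination signal.
last = QueueMessage(
    iteration=1,
    df_source_records=pl.DataFrame({"id": [], "data": []}),
    signal=QUEUE_TERMINATION,
)
assert last.signal == QUEUE_TERMINATION
```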
bizon/engine/queue/queue.py CHANGED
@@ -1,28 +1,23 @@
 import json
 from abc import ABC, abstractmethod
-from dataclasses import dataclass
 from datetime import datetime
-from typing import Optional, Union
+from typing import Union
 
 import polars as pl
 from pytz import UTC
 
-from bizon.destinations.destination import AbstractDestination
+from bizon.destination.destination import AbstractDestination
 from bizon.engine.pipeline.consumer import AbstractQueueConsumer
+from bizon.monitoring.monitor import AbstractMonitor
 from bizon.source.models import SourceIteration, source_record_schema
+from bizon.transform.transform import Transform
 
-from .config import AbastractQueueConfigDetails, AbstractQueueConfig, QueueTypes
-
-QUEUE_TERMINATION = "TERMINATION"
-
-
-@dataclass
-class QueueMessage:
-    iteration: int
-    df_source_records: pl.DataFrame
-    extracted_at: datetime = datetime.now(tz=UTC)
-    pagination: Optional[dict] = None
-    signal: Optional[str] = None
+from .config import (
+    AbastractQueueConfigDetails,
+    AbstractQueueConfig,
+    QueueMessage,
+    QueueTypes,
+)
 
 
 class AbstractQueue(ABC):
@@ -35,7 +30,12 @@ class AbstractQueue(ABC):
         pass
 
     @abstractmethod
-    def get_consumer(self, destination: AbstractDestination) -> AbstractQueueConsumer:
+    def get_consumer(
+        self,
+        destination: AbstractDestination,
+        transform: Transform,
+        monitor: AbstractMonitor,
+    ) -> AbstractQueueConsumer:
         pass
 
     @abstractmethod
@@ -71,6 +71,7 @@ class AbstractQueue(ABC):
                 "id": [record.id for record in source_iteration.records],
                 "data": [json.dumps(record.data, ensure_ascii=False) for record in source_iteration.records],
                 "timestamp": [record.timestamp for record in source_iteration.records],
+                "destination_id": [record.destination_id for record in source_iteration.records],
             },
             schema=source_record_schema,
         )
bizon/engine/runner/adapters/process.py CHANGED
@@ -1,5 +1,6 @@
 import concurrent.futures
 import time
+import traceback
 
 from loguru import logger
 
@@ -50,6 +51,7 @@ class ProcessRunner(AbstractRunner):
             future_consumer = executor.submit(
                 AbstractRunner.instanciate_and_run_consumer,
                 self.bizon_config,
+                self.config,
                 job.id,
                 **extra_kwargs,
             )
@@ -68,8 +70,19 @@ class ProcessRunner(AbstractRunner):
         result_producer = future_producer.result()
         logger.info(f"Producer process stopped running with result: {result_producer}")
 
+        if result_producer.SUCCESS:
+            logger.info("Producer thread has finished successfully, will wait for consumer to finish ...")
+        else:
+            logger.error("Producer thread failed, stopping consumer ...")
+            executor.shutdown(wait=False)
+
         if not future_consumer.running():
-            result_consumer = future_consumer.result()
-            logger.info(f"Consumer process stopped running with result: {result_consumer}")
+            try:
+                future_consumer.result()
+            except Exception as e:
+                logger.error(f"Consumer thread stopped running with error {e}")
+                logger.error(traceback.format_exc())
+            finally:
+                executor.shutdown(wait=False)
 
         return True
bizon/engine/runner/adapters/streaming.py ADDED
@@ -0,0 +1,121 @@
+import os
+import time
+from datetime import datetime
+from typing import List
+
+import polars as pl
+import simplejson as json
+from loguru import logger
+from pytz import UTC
+
+from bizon.common.models import BizonConfig, SyncMetadata
+from bizon.destination.models import transform_to_df_destination_records
+from bizon.engine.pipeline.models import PipelineReturnStatus
+from bizon.engine.runner.config import RunnerStatus
+from bizon.engine.runner.runner import AbstractRunner
+from bizon.source.models import SourceRecord, source_record_schema
+
+
+class StreamingRunner(AbstractRunner):
+    def __init__(self, config: BizonConfig):
+        super().__init__(config)
+
+    @staticmethod
+    def convert_source_records(records: List[SourceRecord]) -> pl.DataFrame:
+        return pl.DataFrame(
+            {
+                "id": [record.id for record in records],
+                "data": [json.dumps(record.data, ensure_ascii=False) for record in records],
+                "timestamp": [record.timestamp for record in records],
+                "destination_id": [record.destination_id for record in records],
+            },
+            schema=source_record_schema,
+        )
+
+    @staticmethod
+    def convert_to_destination_records(df_source_records: pl.DataFrame, extracted_at: datetime) -> pl.DataFrame:
+        return transform_to_df_destination_records(df_source_records=df_source_records, extracted_at=extracted_at)
+
+    def run(self) -> RunnerStatus:
+        job = self.init_job(bizon_config=self.bizon_config, config=self.config)
+        backend = self.get_backend(bizon_config=self.bizon_config)
+        source = self.get_source(bizon_config=self.bizon_config, config=self.config)
+
+        sync_metadata = SyncMetadata.from_bizon_config(job_id=job.id, config=self.bizon_config)
+        monitor = self.get_monitoring_client(sync_metadata=sync_metadata, bizon_config=self.bizon_config)
+
+        destination = self.get_destination(
+            bizon_config=self.bizon_config,
+            backend=backend,
+            job_id=job.id,
+            source_callback=None,
+            monitor=monitor,
+        )
+
+        transform = self.get_transform(bizon_config=self.bizon_config)
+
+        destination.buffer.buffer_size = 0  # force buffer to be flushed immediately
+        iteration = 0
+
+        while True:
+
+            if source.config.max_iterations and iteration > source.config.max_iterations:
+                logger.info(f"Max iterations {source.config.max_iterations} reached, terminating stream ...")
+                break
+
+            with monitor.trace(operation_name="bizon.stream.iteration"):
+                source_iteration = source.get()
+
+                destination_id_indexed_records = {}
+
+                if len(source_iteration.records) == 0:
+                    logger.info("No new records found, stopping iteration")
+                    time.sleep(2)
+                    monitor.track_pipeline_status(PipelineReturnStatus.SUCCESS)
+                    iteration += 1
+                    continue
+
+                for record in source_iteration.records:
+                    if destination_id_indexed_records.get(record.destination_id):
+                        destination_id_indexed_records[record.destination_id].append(record)
+                    else:
+                        destination_id_indexed_records[record.destination_id] = [record]
+
+                for destination_id, records in destination_id_indexed_records.items():
+                    df_source_records = StreamingRunner.convert_source_records(records)
+
+                    dsm_headers = monitor.track_source_iteration(records=records)
+
+                    # Apply transformation
+                    df_source_records = transform.apply_transforms(df_source_records=df_source_records)
+
+                    df_destination_records = StreamingRunner.convert_to_destination_records(
+                        df_source_records, datetime.now(tz=UTC)
+                    )
+                    # Override destination_id
+                    destination.destination_id = destination_id
+                    destination.write_or_buffer_records(
+                        df_destination_records=df_destination_records,
+                        iteration=iteration,
+                        pagination=None,
+                    )
+                    monitor.track_records_synced(
+                        num_records=len(df_destination_records),
+                        destination_id=destination_id,
+                        extra_tags={"destination_id": destination_id},
+                        headers=dsm_headers,
+                    )
+
+                if os.getenv("ENVIRONMENT") == "production":
+                    try:
+                        source.commit()
+                    except Exception as e:
+                        logger.error(f"Error committing source: {e}")
+                        monitor.track_pipeline_status(PipelineReturnStatus.ERROR)
+                        return RunnerStatus(stream=PipelineReturnStatus.ERROR)
+
+                iteration += 1
+
+        monitor.track_pipeline_status(PipelineReturnStatus.SUCCESS)
+
+        return RunnerStatus(stream=PipelineReturnStatus.SUCCESS)  # return when max iterations is reached
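The heart of `StreamingRunner.run` is the fan-out step: each polled batch is grouped by `record.destination_id` so a single iteration can write to several destinations. That grouping is plain Python and easy to isolate; below is a self-contained illustration using a stand-in record type instead of bizon's `SourceRecord` (the table names are made up):

```python
from dataclasses import dataclass
from typing import Dict, List


@dataclass
class FakeRecord:
    """Stand-in for bizon's SourceRecord; only destination_id matters here."""

    id: str
    destination_id: str


def group_by_destination(records: List[FakeRecord]) -> Dict[str, List[FakeRecord]]:
    # Same shape as the indexing loop in StreamingRunner.run
    indexed: Dict[str, List[FakeRecord]] = {}
    for record in records:
        if indexed.get(record.destination_id):
            indexed[record.destination_id].append(record)
        else:
            indexed[record.destination_id] = [record]
    return indexed


records = [
    FakeRecord("1", "project.dataset.users"),
    FakeRecord("2", "project.dataset.orders"),
    FakeRecord("3", "project.dataset.users"),
]
groups = group_by_destination(records)
assert sorted(groups) == ["project.dataset.orders", "project.dataset.users"]
assert len(groups["project.dataset.users"]) == 2
```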