bizon 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110)
  1. bizon/alerting/__init__.py +0 -0
  2. bizon/alerting/alerts.py +23 -0
  3. bizon/alerting/models.py +28 -0
  4. bizon/alerting/slack/__init__.py +0 -0
  5. bizon/alerting/slack/config.py +5 -0
  6. bizon/alerting/slack/handler.py +39 -0
  7. bizon/cli/main.py +7 -3
  8. bizon/common/models.py +33 -7
  9. bizon/{destinations → connectors/destinations}/bigquery/config/bigquery.example.yml +3 -4
  10. bizon/connectors/destinations/bigquery/src/config.py +128 -0
  11. bizon/{destinations → connectors/destinations}/bigquery/src/destination.py +48 -25
  12. bizon/connectors/destinations/bigquery_streaming/src/config.py +57 -0
  13. bizon/connectors/destinations/bigquery_streaming/src/destination.py +377 -0
  14. bizon/connectors/destinations/bigquery_streaming_v2/src/config.py +57 -0
  15. bizon/connectors/destinations/bigquery_streaming_v2/src/destination.py +446 -0
  16. bizon/{destinations/bigquery_streaming → connectors/destinations/bigquery_streaming_v2}/src/proto_utils.py +30 -36
  17. bizon/{destinations → connectors/destinations}/file/src/config.py +9 -3
  18. bizon/connectors/destinations/file/src/destination.py +56 -0
  19. bizon/{destinations → connectors/destinations}/logger/src/config.py +2 -1
  20. bizon/{destinations → connectors/destinations}/logger/src/destination.py +18 -3
  21. bizon/connectors/sources/cycle/config/cycle.example.yml +15 -0
  22. bizon/connectors/sources/cycle/src/source.py +133 -0
  23. bizon/{sources/periscope/tests/periscope_pipeline_dashboard.py → connectors/sources/cycle/tests/cycle_customers.py} +1 -1
  24. bizon/{sources → connectors/sources}/dummy/config/dummy.example.yml +2 -2
  25. bizon/{sources → connectors/sources}/dummy/src/fake_api.py +6 -1
  26. bizon/{sources → connectors/sources}/dummy/src/source.py +18 -5
  27. bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline.py +2 -2
  28. bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_bigquery_backend.py +2 -2
  29. bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_kafka.py +2 -2
  30. bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_rabbitmq.py +2 -2
  31. bizon/connectors/sources/dummy/tests/dummy_pipeline_unnest.py +29 -0
  32. bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_write_data_bigquery.py +2 -2
  33. bizon/{sources → connectors/sources}/dummy/tests/dummy_pipeline_write_data_bigquery_through_kafka.py +2 -2
  34. bizon/{sources → connectors/sources}/gsheets/config/default_auth.example.yml +4 -2
  35. bizon/{sources → connectors/sources}/gsheets/config/service_account.example.yml +4 -2
  36. bizon/{sources → connectors/sources}/hubspot/config/api_key.example.yml +4 -2
  37. bizon/{sources → connectors/sources}/hubspot/config/oauth.example.yml +4 -2
  38. bizon/{sources → connectors/sources}/hubspot/src/hubspot_objects.py +1 -1
  39. bizon/{sources → connectors/sources}/kafka/config/kafka.example.yml +3 -5
  40. bizon/connectors/sources/kafka/config/kafka_debezium.example.yml +110 -0
  41. bizon/connectors/sources/kafka/src/callback.py +18 -0
  42. bizon/connectors/sources/kafka/src/config.py +69 -0
  43. bizon/connectors/sources/kafka/src/decode.py +93 -0
  44. bizon/connectors/sources/kafka/src/source.py +381 -0
  45. bizon/connectors/sources/kafka/tests/kafka_pipeline.py +7 -0
  46. bizon/connectors/sources/periscope/config/periscope_charts.example.yml +20 -0
  47. bizon/connectors/sources/periscope/config/periscope_dashboards.example.yml +20 -0
  48. bizon/{sources → connectors/sources}/periscope/src/source.py +137 -13
  49. bizon/connectors/sources/periscope/tests/periscope_pipeline_dashboard.py +9 -0
  50. bizon/connectors/sources/pokeapi/config/pokeapi_pokemon_to_json.example.yml +19 -0
  51. bizon/connectors/sources/pokeapi/config/pokeapi_pokemon_to_logger.example.yml +10 -0
  52. bizon/connectors/sources/pokeapi/src/source.py +79 -0
  53. bizon/{destinations → destination}/buffer.py +5 -0
  54. bizon/destination/config.py +83 -0
  55. bizon/{destinations → destination}/destination.py +103 -15
  56. bizon/engine/backend/adapters/sqlalchemy/backend.py +3 -1
  57. bizon/engine/engine.py +20 -1
  58. bizon/engine/pipeline/consumer.py +73 -5
  59. bizon/engine/pipeline/models.py +8 -3
  60. bizon/engine/pipeline/producer.py +18 -9
  61. bizon/engine/queue/adapters/kafka/consumer.py +2 -2
  62. bizon/engine/queue/adapters/kafka/queue.py +3 -2
  63. bizon/engine/queue/adapters/python_queue/consumer.py +40 -23
  64. bizon/engine/queue/adapters/python_queue/queue.py +19 -9
  65. bizon/engine/queue/adapters/rabbitmq/consumer.py +3 -6
  66. bizon/engine/queue/adapters/rabbitmq/queue.py +3 -2
  67. bizon/engine/queue/config.py +16 -0
  68. bizon/engine/queue/queue.py +17 -16
  69. bizon/engine/runner/adapters/process.py +15 -2
  70. bizon/engine/runner/adapters/streaming.py +121 -0
  71. bizon/engine/runner/adapters/thread.py +32 -9
  72. bizon/engine/runner/config.py +28 -0
  73. bizon/engine/runner/runner.py +113 -24
  74. bizon/monitoring/__init__.py +0 -0
  75. bizon/monitoring/config.py +39 -0
  76. bizon/monitoring/datadog/__init__.py +0 -0
  77. bizon/monitoring/datadog/monitor.py +153 -0
  78. bizon/monitoring/monitor.py +71 -0
  79. bizon/monitoring/noop/__init__.py +0 -0
  80. bizon/monitoring/noop/monitor.py +30 -0
  81. bizon/source/callback.py +24 -0
  82. bizon/source/config.py +3 -3
  83. bizon/source/cursor.py +1 -1
  84. bizon/source/discover.py +4 -3
  85. bizon/source/models.py +4 -2
  86. bizon/source/source.py +10 -2
  87. bizon/transform/config.py +8 -0
  88. bizon/transform/transform.py +48 -0
  89. {bizon-0.1.0.dist-info → bizon-0.1.2.dist-info}/METADATA +23 -6
  90. bizon-0.1.2.dist-info/RECORD +123 -0
  91. {bizon-0.1.0.dist-info → bizon-0.1.2.dist-info}/WHEEL +1 -1
  92. bizon/destinations/bigquery/src/config.py +0 -51
  93. bizon/destinations/bigquery_streaming/src/config.py +0 -43
  94. bizon/destinations/bigquery_streaming/src/destination.py +0 -154
  95. bizon/destinations/config.py +0 -47
  96. bizon/destinations/file/src/destination.py +0 -27
  97. bizon/sources/kafka/src/source.py +0 -357
  98. bizon/sources/kafka/tests/kafka_pipeline.py +0 -9
  99. bizon/sources/periscope/config/periscope_charts.example.yml +0 -26
  100. bizon/sources/periscope/config/periscope_dashboards.example.yml +0 -26
  101. bizon-0.1.0.dist-info/RECORD +0 -93
  102. /bizon/{sources → connectors/sources}/gsheets/src/source.py +0 -0
  103. /bizon/{sources → connectors/sources}/gsheets/tests/gsheets_pipeline.py +0 -0
  104. /bizon/{sources → connectors/sources}/hubspot/src/hubspot_base.py +0 -0
  105. /bizon/{sources → connectors/sources}/hubspot/src/models/hs_object.py +0 -0
  106. /bizon/{sources → connectors/sources}/hubspot/tests/hubspot_pipeline.py +0 -0
  107. /bizon/{sources → connectors/sources}/periscope/tests/periscope_pipeline_charts.py +0 -0
  108. /bizon/{destinations → destination}/models.py +0 -0
  109. {bizon-0.1.0.dist-info → bizon-0.1.2.dist-info}/LICENSE +0 -0
  110. {bizon-0.1.0.dist-info → bizon-0.1.2.dist-info}/entry_points.txt +0 -0
bizon/engine/runner/adapters/thread.py
@@ -1,10 +1,12 @@
 import concurrent.futures
 import time
-import traceback
+from threading import Event
 
 from loguru import logger
 
 from bizon.common.models import BizonConfig
+from bizon.engine.pipeline.models import PipelineReturnStatus
+from bizon.engine.runner.config import RunnerStatus
 from bizon.engine.runner.runner import AbstractRunner
 
 
@@ -25,7 +27,7 @@ class ThreadRunner(AbstractRunner):
 
         return extra_kwargs
 
-    def run(self) -> bool:
+    def run(self) -> RunnerStatus:
         """Run the pipeline with dedicated threads for source and destination"""
 
         extra_kwargs = self.get_kwargs()
@@ -35,6 +37,10 @@ class ThreadRunner(AbstractRunner):
         result_producer = None
         result_consumer = None
 
+        # Start the producer and consumer events
+        producer_stop_event = Event()
+        consumer_stop_event = Event()
+
        extra_kwargs = self.get_kwargs()
 
        with concurrent.futures.ThreadPoolExecutor(
@@ -46,6 +52,7 @@ class ThreadRunner(AbstractRunner):
                self.bizon_config,
                self.config,
                job.id,
+                producer_stop_event,
                **extra_kwargs,
            )
            logger.info("Producer thread has started ...")
@@ -55,7 +62,9 @@ class ThreadRunner(AbstractRunner):
            future_consumer = executor.submit(
                AbstractRunner.instanciate_and_run_consumer,
                self.bizon_config,
+                self.config,
                job.id,
+                consumer_stop_event,
                **extra_kwargs,
            )
            logger.info("Consumer thread has started ...")
@@ -68,14 +77,28 @@ class ThreadRunner(AbstractRunner):
                    self._is_running = False
 
                if not future_producer.running():
-                    result_producer = future_producer.result()
+                    result_producer: PipelineReturnStatus = future_producer.result()
                    logger.info(f"Producer thread stopped running with result: {result_producer}")
 
+                    if result_producer.SUCCESS:
+                        logger.info("Producer thread has finished successfully, will wait for consumer to finish ...")
+                    else:
+                        logger.error("Producer thread failed, stopping consumer ...")
+                        consumer_stop_event.set()
+
                if not future_consumer.running():
-                    try:
-                        future_consumer.result()
-                    except Exception as e:
-                        logger.error(f"Consumer thread stopped running with error {e}")
-                        logger.error(traceback.format_exc())
+                    result_consumer = future_consumer.result()
+                    logger.info(f"Consumer thread stopped running with result: {result_consumer}")
+
+                    if result_consumer == PipelineReturnStatus.SUCCESS:
+                        logger.info("Consumer thread has finished successfully")
+                    else:
+                        logger.error("Consumer thread failed, stopping producer ...")
+                        producer_stop_event.set()
+
+            runner_status = RunnerStatus(producer=future_producer.result(), consumer=future_consumer.result())
+
+            if not runner_status.is_success:
+                logger.error(runner_status.to_string())
 
-        return True
+        return runner_status
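
The rewritten ThreadRunner replaces the old try/except around the consumer with explicit stop events: when one worker fails, the peer's Event is set so it can exit cleanly, and both results are folded into a RunnerStatus. Below is a minimal, self-contained sketch of that coordination pattern; the worker function and its return values are illustrative, not bizon's API.

# Illustrative sketch of the stop-event pattern used by ThreadRunner: two workers
# poll their own Event, and the parent sets the peer's Event once one worker fails.
import concurrent.futures
import time
from threading import Event
from typing import Optional


def worker(stop_event: Event, fail_after: Optional[float] = None) -> str:
    start = time.time()
    while time.time() - start < 1.0:
        if stop_event.is_set():
            return "stopped_by_runner"
        if fail_after is not None and time.time() - start > fail_after:
            return "error"
        time.sleep(0.05)
    return "success"


producer_stop_event, consumer_stop_event = Event(), Event()

with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
    future_producer = executor.submit(worker, producer_stop_event, 0.2)  # fails quickly
    future_consumer = executor.submit(worker, consumer_stop_event)

    while not (future_producer.done() and future_consumer.done()):
        if future_producer.done() and future_producer.result() != "success":
            consumer_stop_event.set()  # producer failed -> ask consumer to stop
        if future_consumer.done() and future_consumer.result() != "success":
            producer_stop_event.set()  # consumer failed -> ask producer to stop
        time.sleep(0.05)

print(future_producer.result(), future_consumer.result())  # error stopped_by_runner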
bizon/engine/runner/config.py
@@ -3,10 +3,13 @@ from typing import Optional
 
 from pydantic import BaseModel, Field
 
+from bizon.engine.pipeline.models import PipelineReturnStatus
+
 
 class RunnerTypes(str, Enum):
     THREAD = "thread"
     PROCESS = "process"
+    STREAM = "stream"
 
 
 class LoggerLevel(str, Enum):
@@ -49,3 +52,28 @@ class RunnerConfig(BaseModel):
         description="Logging level",
         default=LoggerLevel.INFO,
     )
+
+
+class RunnerStatus(BaseModel):
+    producer: Optional[PipelineReturnStatus] = None
+    consumer: Optional[PipelineReturnStatus] = None
+    stream: Optional[PipelineReturnStatus] = None
+
+    def __init__(self, **data):
+        super().__init__(**data)
+        if not ((self.producer is not None and self.consumer is not None) or self.stream is not None):
+            raise ValueError("Either both producer and consumer must be set, or stream must be set")
+
+    @property
+    def is_success(self):
+        if self.stream is not None:
+            return self.stream == PipelineReturnStatus.SUCCESS
+        return self.producer == PipelineReturnStatus.SUCCESS and self.consumer == PipelineReturnStatus.SUCCESS
+
+    def to_string(self):
+        if self.stream is not None:
+            return f"Pipeline finished with status {'Success' if self.is_success else 'Failure'} (Stream: {self.stream.value})"
+        return (
+            f"Pipeline finished with status {'Success' if self.is_success else 'Failure'} "
+            f"(Producer: {self.producer.value}, Consumer: {self.consumer.value})"
+        )
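
RunnerStatus gives every runner a single return object for both the classic producer/consumer pair and the new stream mode. A usage sketch based only on what the diff above shows; PipelineReturnStatus.SUCCESS appears in this diff, any other member name would be an assumption.

from bizon.engine.pipeline.models import PipelineReturnStatus
from bizon.engine.runner.config import RunnerStatus

# Thread/process runners report one status per worker.
status = RunnerStatus(producer=PipelineReturnStatus.SUCCESS, consumer=PipelineReturnStatus.SUCCESS)
assert status.is_success
print(status.to_string())  # "Pipeline finished with status Success (Producer: ..., Consumer: ...)"

# The new stream runner sets only the stream field.
stream_status = RunnerStatus(stream=PipelineReturnStatus.SUCCESS)
assert stream_status.is_success

# Setting neither pair is rejected by __init__:
# ValueError: Either both producer and consumer must be set, or stream must be set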
bizon/engine/runner/runner.py
@@ -1,18 +1,28 @@
+import multiprocessing
+import multiprocessing.synchronize
 import os
 import sys
+import threading
 from abc import ABC, abstractmethod
+from typing import Union
 
 from loguru import logger
 
+from bizon.alerting.models import AlertMethod
 from bizon.cli.utils import parse_from_yaml
 from bizon.common.models import BizonConfig, SyncMetadata
-from bizon.destinations.destination import AbstractDestination, DestinationFactory
+from bizon.destination.destination import AbstractDestination, DestinationFactory
 from bizon.engine.backend.backend import AbstractBackend, BackendFactory
 from bizon.engine.backend.models import JobStatus, StreamJob
 from bizon.engine.pipeline.producer import Producer
 from bizon.engine.queue.queue import AbstractQueue, QueueFactory
+from bizon.engine.runner.config import RunnerStatus
+from bizon.monitoring.monitor import AbstractMonitor, MonitorFactory
+from bizon.source.callback import AbstractSourceCallback
+from bizon.source.config import SourceSyncModes
 from bizon.source.discover import get_source_instance_by_source_and_stream
 from bizon.source.source import AbstractSource
+from bizon.transform.transform import Transform
 
 
 class AbstractRunner(ABC):
@@ -21,15 +31,31 @@ class AbstractRunner(ABC):
         # Internal state
         self._is_running: bool = False
 
-        # Attributes should be serializable for multiprocessing
-        self.config: dict = config
-        self.bizon_config = BizonConfig.model_validate(obj=config)
+        self.config = config
+        self.bizon_config = BizonConfig.model_validate(obj=self.config)
+
+        # Set pipeline information as environment variables
+        os.environ["BIZON_SYNC_NAME"] = self.bizon_config.name
+        os.environ["BIZON_SOURCE_NAME"] = self.bizon_config.source.name
+        os.environ["BIZON_SOURCE_STREAM"] = self.bizon_config.source.stream
+        os.environ["BIZON_DESTINATION_NAME"] = self.bizon_config.destination.name
 
         # Set log level
         logger.info(f"Setting log level to {self.bizon_config.engine.runner.log_level.name}")
         logger.remove()
         logger.add(sys.stderr, level=self.bizon_config.engine.runner.log_level)
 
+        if self.bizon_config.alerting:
+            logger.info(f"Setting up alerting method {self.bizon_config.alerting.type}")
+            if self.bizon_config.alerting.type == AlertMethod.SLACK:
+                from bizon.alerting.slack.handler import SlackHandler
+
+                alert = SlackHandler(
+                    config=self.bizon_config.alerting.config,
+                    log_levels=self.bizon_config.alerting.log_levels,
+                )
+                alert.add_handlers()
+
     @property
     def is_running(self) -> bool:
         """Return True if the pipeline is running"""
@@ -45,17 +71,23 @@
     def get_source(bizon_config: BizonConfig, config: dict) -> AbstractSource:
         """Get an instance of the source based on the source config dict"""
 
-        logger.info(f"Creating client for {bizon_config.source.source_name} - {bizon_config.source.stream_name} ...")
+        logger.info(f"Creating client for {bizon_config.source.name} - {bizon_config.source.stream} ...")
 
         # Get the client class, validate the config and return the client
         return get_source_instance_by_source_and_stream(
-            source_name=bizon_config.source.source_name,
-            stream_name=bizon_config.source.stream_name,
+            source_name=bizon_config.source.name,
+            stream_name=bizon_config.source.stream,
             source_config=config["source"], # We pass the raw config to have flexibility for custom sources
         )
 
     @staticmethod
-    def get_destination(bizon_config: BizonConfig, backend: AbstractBackend, job_id: str) -> AbstractDestination:
+    def get_destination(
+        bizon_config: BizonConfig,
+        backend: AbstractBackend,
+        job_id: str,
+        source_callback: AbstractSourceCallback,
+        monitor: AbstractMonitor,
+    ) -> AbstractDestination:
         """Get an instance of the destination based on the destination config dict"""
 
         sync_metadata = SyncMetadata.from_bizon_config(job_id=job_id, config=bizon_config)
@@ -64,6 +96,8 @@
             sync_metadata=sync_metadata,
             config=bizon_config.destination,
             backend=backend,
+            source_callback=source_callback,
+            monitor=monitor,
         )
 
     @staticmethod
@@ -89,6 +123,16 @@
             **kwargs,
         )
 
+    @staticmethod
+    def get_transform(bizon_config: BizonConfig) -> Transform:
+        """Return the transform instance to apply to the source records"""
+        return Transform(transforms=bizon_config.transforms)
+
+    @staticmethod
+    def get_monitoring_client(sync_metadata: SyncMetadata, bizon_config: BizonConfig) -> AbstractMonitor:
+        """Return the monitoring client instance"""
+        return MonitorFactory.get_monitor(sync_metadata, bizon_config.monitoring)
+
     @staticmethod
     def get_or_create_job(
         bizon_config: BizonConfig,
@@ -101,15 +145,15 @@
         # Retrieve the last job for this stream
         job = backend.get_running_stream_job(
             name=bizon_config.name,
-            source_name=bizon_config.source.source_name,
-            stream_name=bizon_config.source.stream_name,
+            source_name=bizon_config.source.name,
+            stream_name=bizon_config.source.stream,
             session=session,
         )
 
         if job:
             # If force_create and a job is already running, we cancel it and create a new one
             if force_create:
-                logger.info(f"Found an existing job, cancelling it...")
+                logger.info("Found an existing job, cancelling it...")
                 backend.update_stream_job_status(job_id=job.id, job_status=JobStatus.CANCELED)
                 logger.info(f"Job {job.id} canceled. Creating a new one...")
             # Otherwise we return the existing job
@@ -119,13 +163,16 @@
 
         # If no job is running, we create a new one:
         # Get the total number of records
-        total_records = source.get_total_records_count()
+        if bizon_config.source.sync_mode == SourceSyncModes.STREAM:
+            total_records = None  # Not available for stream mode
+        else:
+            total_records = source.get_total_records_count()
 
         # Create a new job
         job = backend.create_stream_job(
             name=bizon_config.name,
-            source_name=bizon_config.source.source_name,
-            stream_name=bizon_config.source.stream_name,
+            source_name=bizon_config.source.name,
+            stream_name=bizon_config.source.stream,
             sync_mode=bizon_config.source.sync_mode,
             total_records_to_fetch=total_records,
             session=session,
@@ -148,9 +195,7 @@
         source = AbstractRunner.get_source(bizon_config=bizon_config, config=config)
 
         check_connection, connection_error = source.check_connection()
-        logger.info(
-            f"Connection to source {bizon_config.source.source_name} - {bizon_config.source.stream_name} successful"
-        )
+        logger.info(f"Connection to source {bizon_config.source.name} - {bizon_config.source.stream} successful")
 
         if not check_connection:
             logger.error(f"Error while connecting to source: {connection_error}")
@@ -170,12 +215,24 @@
         return job
 
     @staticmethod
-    def instanciate_and_run_producer(bizon_config: BizonConfig, config: dict, job_id: str, **kwargs):
+    def instanciate_and_run_producer(
+        bizon_config: BizonConfig,
+        config: dict,
+        job_id: str,
+        stop_event: Union[multiprocessing.synchronize.Event, threading.Event],
+        **kwargs,
+    ):
 
+        # Get the source instance
         source = AbstractRunner.get_source(bizon_config=bizon_config, config=config)
+
+        # Get the queue instance
         queue = AbstractRunner.get_queue(bizon_config=bizon_config, **kwargs)
+
+        # Get the backend instance
         backend = AbstractRunner.get_backend(bizon_config=bizon_config, **kwargs)
 
+        # Create the producer instance
        producer = AbstractRunner.get_producer(
            bizon_config=bizon_config,
            source=source,
@@ -183,22 +240,54 @@
            backend=backend,
        )
 
-        status = producer.run(job_id)
+        # Run the producer
+        status = producer.run(job_id, stop_event)
         return status
 
     @staticmethod
-    def instanciate_and_run_consumer(bizon_config: BizonConfig, job_id: str, **kwargs):
+    def instanciate_and_run_consumer(
+        bizon_config: BizonConfig,
+        config: dict,
+        job_id: str,
+        stop_event: Union[multiprocessing.synchronize.Event, threading.Event],
+        **kwargs,
+    ):
+        # Get the source callback instance
+        source_callback = AbstractRunner.get_source(
+            bizon_config=bizon_config, config=config
+        ).get_source_callback_instance()
 
+        sync_metadata = SyncMetadata.from_bizon_config(job_id=job_id, config=bizon_config)
+
+        # Get the queue instance
         queue = AbstractRunner.get_queue(bizon_config=bizon_config, **kwargs)
+
+        # Get the backend instance
         backend = AbstractRunner.get_backend(bizon_config=bizon_config, **kwargs)
-        destination = AbstractRunner.get_destination(bizon_config=bizon_config, backend=backend, job_id=job_id)
 
-        consumer = queue.get_consumer(destination=destination)
+        # Get the monitor instance
+        monitor = AbstractRunner.get_monitoring_client(sync_metadata=sync_metadata, bizon_config=bizon_config)
+
+        # Get the destination instance
+        destination = AbstractRunner.get_destination(
+            bizon_config=bizon_config, backend=backend, job_id=job_id, source_callback=source_callback, monitor=monitor
+        )
+
+        # Get the transform instance
+        transform = AbstractRunner.get_transform(bizon_config=bizon_config)
+
+        # Create the consumer instance
+        consumer = queue.get_consumer(
+            destination=destination,
+            transform=transform,
+            monitor=monitor,
+        )
 
-        status = consumer.run()
+        # Run the consumer
+        status = consumer.run(stop_event)
         return status
 
     @abstractmethod
-    def run(self) -> bool:
+    def run(self) -> RunnerStatus:
         """Run the pipeline with dedicated adapter for source and destination"""
         pass
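
Besides returning a RunnerStatus, the constructor now exports the sync name, source, stream and destination as BIZON_* environment variables. A small sketch of how downstream code (for example a custom connector or log formatter) could read them; the values set here are placeholders standing in for what AbstractRunner sets at startup.

import os

# In a real run these are set by AbstractRunner.__init__ from the BizonConfig.
os.environ.setdefault("BIZON_SYNC_NAME", "demo-sync")
os.environ.setdefault("BIZON_SOURCE_NAME", "dummy")
os.environ.setdefault("BIZON_SOURCE_STREAM", "my_stream")
os.environ.setdefault("BIZON_DESTINATION_NAME", "logger")

context = {
    "sync": os.environ["BIZON_SYNC_NAME"],
    "source": f'{os.environ["BIZON_SOURCE_NAME"]}.{os.environ["BIZON_SOURCE_STREAM"]}',
    "destination": os.environ["BIZON_DESTINATION_NAME"],
}
print(context)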
bizon/monitoring/__init__.py — file without changes
bizon/monitoring/config.py
@@ -0,0 +1,39 @@
+from enum import Enum
+from typing import Dict, Optional
+
+from pydantic import BaseModel, Field
+
+
+class MonitorType(str, Enum):
+    DATADOG = "datadog"
+
+
+class BaseMonitoringConfig(BaseModel):
+    enable_tracing: bool = Field(default=False, description="Enable tracing for the monitor")
+
+
+class DatadogConfig(BaseMonitoringConfig):
+    datadog_agent_host: Optional[str] = None
+    datadog_host_env_var: Optional[str] = None
+    datadog_agent_port: int = 8125
+    tags: Optional[Dict[str, str]] = Field(default={}, description="Key-value pairs to add to the monitor as tags")
+
+    @property
+    def host_is_configured(self) -> bool:
+        return bool(self.datadog_agent_host or self.datadog_host_env_var)
+
+    def __init__(self, **data):
+        super().__init__(**data)
+        if not self.host_is_configured:
+            raise ValueError("Either datadog_agent_host or datadog_host_env_var must be specified")
+
+    class Config:
+        extra = "forbid"
+
+
+class MonitoringConfig(BaseMonitoringConfig):
+    type: MonitorType
+    config: Optional[DatadogConfig] = None
+
+    class Config:
+        extra = "forbid"
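
The new monitoring config is plain pydantic, so it can be exercised directly. A sketch using only the fields defined above (the values are placeholders); note that DatadogConfig refuses to validate unless a host or a host environment variable is given.

from bizon.monitoring.config import DatadogConfig, MonitoringConfig, MonitorType

monitoring = MonitoringConfig(
    type=MonitorType.DATADOG,
    config=DatadogConfig(
        datadog_host_env_var="DD_AGENT_HOST",  # resolved from the environment at runtime
        datadog_agent_port=8125,
        tags={"team": "data", "env": "dev"},
    ),
)
assert monitoring.config.host_is_configured

# With neither datadog_agent_host nor datadog_host_env_var set, __init__ raises:
# ValueError: Either datadog_agent_host or datadog_host_env_var must be specified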
bizon/monitoring/datadog/__init__.py — file without changes
bizon/monitoring/datadog/monitor.py
@@ -0,0 +1,153 @@
+import os
+from contextlib import contextmanager
+from typing import Dict, List, Union
+
+from datadog import initialize, statsd
+from loguru import logger
+
+from bizon.common.models import SyncMetadata
+from bizon.engine.pipeline.models import PipelineReturnStatus
+from bizon.monitoring.config import MonitoringConfig
+from bizon.monitoring.monitor import AbstractMonitor
+from bizon.source.models import SourceRecord
+
+
+class DatadogMonitor(AbstractMonitor):
+    def __init__(self, sync_metadata: SyncMetadata, monitoring_config: MonitoringConfig):
+        super().__init__(sync_metadata, monitoring_config)
+
+        # In Kubernetes, set the host dynamically
+        try:
+            datadog_host_from_env_var = os.getenv(monitoring_config.config.datadog_host_env_var)
+            if datadog_host_from_env_var:
+                initialize(
+                    statsd_host=datadog_host_from_env_var,
+                    statsd_port=monitoring_config.config.datadog_agent_port,
+                )
+            else:
+                initialize(
+                    statsd_host=monitoring_config.config.datadog_agent_host,
+                    statsd_port=monitoring_config.config.datadog_agent_port,
+                )
+        except Exception as e:
+            logger.info(f"Failed to initialize Datadog agent: {e}")
+
+        self.pipeline_monitor_status = "bizon_pipeline.status"
+        self.tags = [
+            f"pipeline_name:{self.sync_metadata.name}",
+            f"pipeline_stream:{self.sync_metadata.stream_name}",
+            f"pipeline_source:{self.sync_metadata.source_name}",
+            f"pipeline_destination:{self.sync_metadata.destination_name}",
+        ] + [f"{key}:{value}" for key, value in self.monitoring_config.config.tags.items()]
+
+        self.pipeline_active_pipelines = "bizon_pipeline.active_pipelines"
+        self.pipeline_records_synced = "bizon_pipeline.records_synced"
+        self.pipeline_large_records = "bizon_pipeline.large_records"
+
+    def track_pipeline_status(self, pipeline_status: PipelineReturnStatus, extra_tags: Dict[str, str] = {}) -> None:
+        """
+        Track the status of the pipeline.
+
+        Args:
+            status (str): The current status of the pipeline (e.g., 'running', 'failed', 'completed').
+        """
+
+        statsd.increment(
+            self.pipeline_monitor_status,
+            tags=self.tags
+            + [f"pipeline_status:{pipeline_status}"]
+            + [f"{key}:{value}" for key, value in extra_tags.items()],
+        )
+
+    def track_records_synced(
+        self, num_records: int, destination_id: str, extra_tags: Dict[str, str] = {}, headers: List[Dict[str, str]] = []
+    ) -> Union[List[Dict[str, str]], None]:
+        """
+        Track the number of records synced in the pipeline.
+
+        Args:
+            num_records (int): Number of records synced in this batch
+        """
+        statsd.increment(
+            self.pipeline_records_synced,
+            value=num_records,
+            tags=self.tags + [f"{key}:{value}" for key, value in extra_tags.items()],
+        )
+        if os.getenv("DD_DATA_STREAMS_ENABLED") == "true":
+            from ddtrace.data_streams import set_produce_checkpoint
+
+            destination_type = self.sync_metadata.destination_alias
+
+            for header in headers:
+                if "x-datadog-sampling-priority" in header:
+                    del header["x-datadog-sampling-priority"]
+                if "dd-pathway-ctx-base64" in header:
+                    del header["dd-pathway-ctx-base64"]
+                set_produce_checkpoint(destination_type, destination_id, header.setdefault)
+            return headers
+
+    def track_large_records_synced(self, num_records: int, extra_tags: Dict[str, str] = {}) -> None:
+        statsd.increment(
+            self.pipeline_large_records,
+            value=num_records,
+            tags=self.tags + [f"{key}:{value}" for key, value in extra_tags.items()],
+        )
+
+    def track_source_iteration(self, records: List[SourceRecord]) -> Union[List[Dict[str, str]], None]:
+        """
+        Track the number of records consumed from a Kafka topic.
+
+        Args:
+            kafka_topic (str): The Kafka topic name
+        """
+
+        if os.getenv("DD_DATA_STREAMS_ENABLED") == "true":
+            from ddtrace.data_streams import set_consume_checkpoint
+
+            headers_list = []
+            for record in records:
+                headers = record.data.get("headers", {})
+                set_consume_checkpoint("kafka", record.data["topic"], headers.get)
+                headers_list.append(headers)
+            return headers_list
+
+    @contextmanager
+    def trace(self, operation_name: str, resource: str = None, extra_tags: Dict[str, str] = None):
+        """
+        Create a trace span for monitoring using Datadog APM.
+
+        Args:
+            operation_name (str): The name of the operation being traced
+            resource (str): The resource being operated on (e.g., topic name, table name)
+            extra_tags (Dict[str, str]): Additional tags for the trace
+
+        Yields:
+            A span object that can be used to add additional metadata
+        """
+        if not self.monitoring_config.config.enable_tracing:
+            yield None
+            return
+
+        try:
+            from ddtrace import tracer
+
+            # Combine tags
+            all_tags = self.tags.copy()
+            if extra_tags:
+                all_tags.extend([f"{key}:{value}" for key, value in extra_tags.items()])
+
+            # Create the span
+            with tracer.trace(operation_name, resource=resource) as span:
+                # Add tags to the span
+                for tag in all_tags:
+                    if ":" in tag:
+                        key, value = tag.split(":", 1)
+                        span.set_tag(key, value)
+                span.set_tag("_sampling_priority_v1", 1)
+                yield span
+        except ImportError:
+            logger.warning("ddtrace not available, skipping tracing")
+            yield None
+        except Exception as e:
+            logger.warning(f"Failed to create trace: {e}")
+            yield None
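
The most involved part of DatadogMonitor is the optional Data Streams Monitoring path: a consume checkpoint is set when records leave the source and a produce checkpoint when they reach the destination, with the pathway context carried in per-record headers. A standalone sketch of that flow, guarded by DD_DATA_STREAMS_ENABLED exactly as in the code above; the topic and table names are placeholders.

import os

record_headers = {}  # per-record headers, as read from / written back into the queued record

if os.getenv("DD_DATA_STREAMS_ENABLED") == "true":
    from ddtrace.data_streams import set_consume_checkpoint, set_produce_checkpoint

    # Source side (track_source_iteration): read the pathway context from the headers.
    set_consume_checkpoint("kafka", "my_topic", record_headers.get)

    # Destination side (track_records_synced): write the new context back into the headers.
    set_produce_checkpoint("bigquery_streaming", "my_dataset.my_table", record_headers.setdefault)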
bizon/monitoring/monitor.py
@@ -0,0 +1,71 @@
+from abc import ABC, abstractmethod
+from typing import Dict, List, Union
+
+from bizon.common.models import SyncMetadata
+from bizon.engine.pipeline.models import PipelineReturnStatus
+from bizon.monitoring.config import MonitoringConfig, MonitorType
+from bizon.source.models import SourceRecord
+
+
+class AbstractMonitor(ABC):
+    def __init__(self, sync_metadata: SyncMetadata, monitoring_config: MonitoringConfig):
+        self.sync_metadata = sync_metadata
+        self.monitoring_config = monitoring_config
+
+    @abstractmethod
+    def track_pipeline_status(self, pipeline_status: PipelineReturnStatus, extra_tags: Dict[str, str] = {}) -> None:
+        """
+        Track the status of the pipeline.
+
+        Args:
+            status (str): The current status of the pipeline (e.g., 'running', 'failed', 'completed').
+        """
+        pass
+
+    def track_source_iteration(self, records: List[SourceRecord], headers: Dict[str, str] = {}) -> None:
+        """
+        Run a process that tracks the source iteration.
+        """
+        pass
+
+    def track_records_synced(
+        self, num_records: int, destination_id: str, extra_tags: Dict[str, str] = {}, headers: Dict[str, str] = {}
+    ) -> None:
+        """
+        Track the number of records synced in the pipeline.
+        """
+        pass
+
+    def trace(self, operation_name: str, resource: str = None, extra_tags: Dict[str, str] = None):
+        """
+        Create a trace span for monitoring.
+
+        Args:
+            operation_name (str): The name of the operation being traced
+            resource (str): The resource being operated on (e.g., topic name, table name)
+            extra_tags (Dict[str, str]): Additional tags for the trace
+
+        Returns:
+            A context manager that can be used with 'with' statement
+        """
+        pass
+
+    def track_large_records_synced(self, num_records: int, extra_tags: Dict[str, str] = {}) -> None:
+        """
+        Track the number of large records synced in the destination system. This aims at helping to identify the source of the large records.
+        """
+        pass
+
+
+class MonitorFactory:
+    @staticmethod
+    def get_monitor(sync_metadata: SyncMetadata, monitoring_config: Union[MonitoringConfig, None]) -> AbstractMonitor:
+        if monitoring_config is None:
+            from bizon.monitoring.noop.monitor import NoOpMonitor
+
+            return NoOpMonitor(sync_metadata, monitoring_config)
+
+        if monitoring_config.type == MonitorType.DATADOG:
+            from bizon.monitoring.datadog.monitor import DatadogMonitor
+
+            return DatadogMonitor(sync_metadata, monitoring_config)
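
MonitorFactory falls back to the no-op implementation when no monitoring block is configured, so pipeline code can call the monitor unconditionally. A sketch of that fallback; sync_metadata is passed as None here only to keep the example self-contained, whereas the runner passes a real SyncMetadata.

from bizon.monitoring.monitor import MonitorFactory

monitor = MonitorFactory.get_monitor(sync_metadata=None, monitoring_config=None)

with monitor.trace("destination.write", resource="example_table") as span:
    assert span is None  # the no-op monitor always yields None

monitor.track_records_synced(num_records=42, destination_id="example_table")  # no-op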
bizon/monitoring/noop/__init__.py — file without changes
bizon/monitoring/noop/monitor.py
@@ -0,0 +1,30 @@
+from contextlib import contextmanager
+from typing import Dict
+
+from bizon.common.models import BizonConfig, SyncMetadata
+from bizon.engine.pipeline.models import PipelineReturnStatus
+from bizon.monitoring.config import MonitoringConfig
+from bizon.monitoring.monitor import AbstractMonitor
+
+
+class NoOpMonitor(AbstractMonitor):
+    def __init__(self, sync_metadata: SyncMetadata, monitoring_config: MonitoringConfig):
+        super().__init__(sync_metadata, monitoring_config)
+
+    def track_pipeline_status(self, pipeline_status: PipelineReturnStatus) -> None:
+        pass
+
+    @contextmanager
+    def trace(self, operation_name: str, resource: str = None, extra_tags: Dict[str, str] = None):
+        """
+        No-op trace implementation.
+
+        Args:
+            operation_name (str): The name of the operation being traced
+            resource (str): The resource being operated on (e.g., topic name, table name)
+            extra_tags (Dict[str, str]): Additional tags for the trace
+
+        Yields:
+            None (no-op implementation)
+        """
+        yield None