bizon 0.1.1__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. bizon/alerting/alerts.py +0 -1
  2. bizon/common/models.py +184 -4
  3. bizon/connectors/destinations/bigquery/src/config.py +1 -1
  4. bizon/connectors/destinations/bigquery/src/destination.py +14 -9
  5. bizon/connectors/destinations/bigquery_streaming/config/bigquery_streaming.example.yml +74 -0
  6. bizon/connectors/destinations/bigquery_streaming/src/config.py +6 -5
  7. bizon/connectors/destinations/bigquery_streaming/src/destination.py +13 -9
  8. bizon/connectors/destinations/bigquery_streaming_v2/config/bigquery_streaming_v2.example.yml +79 -0
  9. bizon/connectors/destinations/bigquery_streaming_v2/src/config.py +6 -1
  10. bizon/connectors/destinations/bigquery_streaming_v2/src/destination.py +232 -49
  11. bizon/connectors/destinations/bigquery_streaming_v2/src/proto_utils.py +1 -13
  12. bizon/connectors/destinations/file/config/file.example.yml +40 -0
  13. bizon/connectors/destinations/file/src/config.py +2 -1
  14. bizon/connectors/destinations/file/src/destination.py +3 -6
  15. bizon/connectors/destinations/logger/config/logger.example.yml +30 -0
  16. bizon/connectors/destinations/logger/src/config.py +1 -2
  17. bizon/connectors/destinations/logger/src/destination.py +4 -2
  18. bizon/connectors/sources/cycle/src/source.py +2 -6
  19. bizon/connectors/sources/dummy/src/source.py +0 -4
  20. bizon/connectors/sources/gsheets/src/source.py +2 -3
  21. bizon/connectors/sources/hubspot/src/hubspot_base.py +0 -1
  22. bizon/connectors/sources/hubspot/src/hubspot_objects.py +3 -4
  23. bizon/connectors/sources/hubspot/src/models/hs_object.py +0 -1
  24. bizon/connectors/sources/kafka/config/kafka.example.yml +1 -3
  25. bizon/connectors/sources/kafka/config/kafka_debezium.example.yml +1 -3
  26. bizon/connectors/sources/kafka/config/kafka_streams.example.yml +124 -0
  27. bizon/connectors/sources/kafka/src/config.py +10 -12
  28. bizon/connectors/sources/kafka/src/decode.py +65 -60
  29. bizon/connectors/sources/kafka/src/source.py +182 -61
  30. bizon/connectors/sources/kafka/tests/kafka_pipeline.py +1 -1
  31. bizon/connectors/sources/notion/config/api_key.example.yml +35 -0
  32. bizon/connectors/sources/notion/src/__init__.py +0 -0
  33. bizon/connectors/sources/notion/src/config.py +59 -0
  34. bizon/connectors/sources/notion/src/source.py +1159 -0
  35. bizon/connectors/sources/notion/tests/notion_pipeline.py +7 -0
  36. bizon/connectors/sources/notion/tests/test_notion.py +113 -0
  37. bizon/connectors/sources/periscope/src/source.py +0 -6
  38. bizon/connectors/sources/pokeapi/src/source.py +0 -1
  39. bizon/connectors/sources/sana_ai/config/sana.example.yml +25 -0
  40. bizon/connectors/sources/sana_ai/src/source.py +85 -0
  41. bizon/destination/buffer.py +0 -1
  42. bizon/destination/config.py +9 -1
  43. bizon/destination/destination.py +38 -9
  44. bizon/engine/backend/adapters/sqlalchemy/backend.py +2 -5
  45. bizon/engine/backend/adapters/sqlalchemy/config.py +0 -1
  46. bizon/engine/config.py +0 -1
  47. bizon/engine/engine.py +0 -1
  48. bizon/engine/pipeline/consumer.py +0 -1
  49. bizon/engine/pipeline/producer.py +1 -5
  50. bizon/engine/queue/adapters/kafka/config.py +1 -1
  51. bizon/engine/queue/adapters/kafka/queue.py +0 -1
  52. bizon/engine/queue/adapters/python_queue/consumer.py +0 -1
  53. bizon/engine/queue/adapters/python_queue/queue.py +0 -2
  54. bizon/engine/queue/adapters/rabbitmq/consumer.py +0 -1
  55. bizon/engine/queue/adapters/rabbitmq/queue.py +0 -1
  56. bizon/engine/queue/config.py +0 -2
  57. bizon/engine/runner/adapters/process.py +0 -2
  58. bizon/engine/runner/adapters/streaming.py +114 -42
  59. bizon/engine/runner/adapters/thread.py +0 -2
  60. bizon/engine/runner/config.py +0 -1
  61. bizon/engine/runner/runner.py +14 -9
  62. bizon/monitoring/config.py +12 -2
  63. bizon/monitoring/datadog/monitor.py +100 -14
  64. bizon/monitoring/monitor.py +41 -12
  65. bizon/monitoring/noop/monitor.py +22 -3
  66. bizon/source/auth/authenticators/abstract_oauth.py +11 -3
  67. bizon/source/auth/authenticators/abstract_token.py +2 -1
  68. bizon/source/auth/authenticators/basic.py +1 -1
  69. bizon/source/auth/authenticators/cookies.py +2 -1
  70. bizon/source/auth/authenticators/oauth.py +8 -3
  71. bizon/source/config.py +0 -2
  72. bizon/source/cursor.py +8 -16
  73. bizon/source/discover.py +3 -6
  74. bizon/source/models.py +0 -1
  75. bizon/source/session.py +0 -1
  76. bizon/source/source.py +18 -3
  77. bizon/transform/config.py +0 -2
  78. bizon/transform/transform.py +0 -3
  79. {bizon-0.1.1.dist-info → bizon-0.2.0.dist-info}/METADATA +62 -41
  80. bizon-0.2.0.dist-info/RECORD +136 -0
  81. {bizon-0.1.1.dist-info → bizon-0.2.0.dist-info}/WHEEL +1 -1
  82. bizon-0.2.0.dist-info/entry_points.txt +2 -0
  83. bizon-0.1.1.dist-info/RECORD +0 -123
  84. bizon-0.1.1.dist-info/entry_points.txt +0 -3
  85. {bizon-0.1.1.dist-info → bizon-0.2.0.dist-info/licenses}/LICENSE +0 -0
bizon/engine/runner/adapters/streaming.py

@@ -8,12 +8,14 @@ import simplejson as json
 from loguru import logger
 from pytz import UTC
 
-from bizon.common.models import BizonConfig
+from bizon.common.models import BizonConfig, SyncMetadata
+from bizon.connectors.destinations.bigquery.src.config import BigQueryRecordSchemaConfig
 from bizon.destination.models import transform_to_df_destination_records
 from bizon.engine.pipeline.models import PipelineReturnStatus
 from bizon.engine.runner.config import RunnerStatus
 from bizon.engine.runner.runner import AbstractRunner
 from bizon.source.models import SourceRecord, source_record_schema
+from bizon.source.source import AbstractSource
 
 
 class StreamingRunner(AbstractRunner):
@@ -36,68 +38,138 @@ class StreamingRunner(AbstractRunner):
     def convert_to_destination_records(df_source_records: pl.DataFrame, extracted_at: datetime) -> pl.DataFrame:
         return transform_to_df_destination_records(df_source_records=df_source_records, extracted_at=extracted_at)
 
+    def _apply_streams_config(self, source: AbstractSource = None) -> None:
+        """Apply streams configuration to source and destination.
+
+        This method is completely source-agnostic. Each source connector is responsible
+        for handling streams config appropriately via set_streams_config().
+
+        When a top-level 'streams' configuration is present, this method:
+        1. Calls source.set_streams_config() to let the source enrich its own config
+        2. Builds destination record_schemas from streams config
+        3. Injects record_schemas into destination config for backward compatibility
+
+        The source is responsible for modifying self.config (which points to bizon_config.source)
+        so that subsequent source instantiations see the enriched config.
+        """
+        if not self.bizon_config.streams:
+            return
+
+        logger.info(f"Applying streams configuration: {len(self.bizon_config.streams)} streams defined")
+
+        # Let the source enrich its own config from streams
+        # Note: source modifies self.config, which is a reference to bizon_config.source
+        # This ensures init_job (which creates a new source) sees the enriched config
+        if source and hasattr(source, "set_streams_config") and callable(source.set_streams_config):
+            source.set_streams_config(self.bizon_config.streams)
+
+        # Build record_schemas list for destination from streams config
+        record_schemas = []
+        for stream in self.bizon_config.streams:
+            if stream.destination.record_schema:
+                record_schema_config = BigQueryRecordSchemaConfig(
+                    destination_id=stream.destination.table_id,
+                    record_schema=stream.destination.record_schema,
+                    clustering_keys=stream.destination.clustering_keys,
+                )
+                record_schemas.append(record_schema_config)
+                logger.info(
+                    f"Stream '{stream.name}': "
+                    f"{getattr(stream.source, 'topic', getattr(stream.source, 'name', 'N/A'))} "
+                    f"-> {stream.destination.table_id}"
+                )
+
+        # Inject into destination config
+        if record_schemas and hasattr(self.bizon_config.destination.config, "record_schemas"):
+            logger.info(f"Injecting {len(record_schemas)} record schemas into destination config")
+            self.bizon_config.destination.config.record_schemas = record_schemas
+
     def run(self) -> RunnerStatus:
+        # Create a temporary source to enrich bizon_config.source from streams
+        # The source's set_streams_config() modifies self.config (= bizon_config.source)
+        # This ensures subsequent source instantiations see the enriched config
+        temp_source = self.get_source(bizon_config=self.bizon_config, config=self.config)
+        self._apply_streams_config(temp_source)
+
+        # Now initialize job (check_connection will use enriched source config)
         job = self.init_job(bizon_config=self.bizon_config, config=self.config)
         backend = self.get_backend(bizon_config=self.bizon_config)
         source = self.get_source(bizon_config=self.bizon_config, config=self.config)
+
+        sync_metadata = SyncMetadata.from_bizon_config(job_id=job.id, config=self.bizon_config)
+        monitor = self.get_monitoring_client(sync_metadata=sync_metadata, bizon_config=self.bizon_config)
+
         destination = self.get_destination(
             bizon_config=self.bizon_config,
             backend=backend,
             job_id=job.id,
             source_callback=None,
+            monitor=monitor,
         )
+
         transform = self.get_transform(bizon_config=self.bizon_config)
-        monitor = self.get_monitoring_client(bizon_config=self.bizon_config)
+
         destination.buffer.buffer_size = 0  # force buffer to be flushed immediately
         iteration = 0
 
         while True:
-
             if source.config.max_iterations and iteration > source.config.max_iterations:
                 logger.info(f"Max iterations {source.config.max_iterations} reached, terminating stream ...")
                 break
 
-            source_iteration = source.get()
+            with monitor.trace(operation_name="bizon.stream.iteration"):
+                source_iteration = source.get()
+
+                destination_id_indexed_records = {}
+
+                if len(source_iteration.records) == 0:
+                    logger.info("No new records found, stopping iteration")
+                    time.sleep(2)
+                    monitor.track_pipeline_status(PipelineReturnStatus.SUCCESS)
+                    iteration += 1
+                    continue
+
+                for record in source_iteration.records:
+                    if destination_id_indexed_records.get(record.destination_id):
+                        destination_id_indexed_records[record.destination_id].append(record)
+                    else:
+                        destination_id_indexed_records[record.destination_id] = [record]
+
+                for destination_id, records in destination_id_indexed_records.items():
+                    df_source_records = StreamingRunner.convert_source_records(records)
+
+                    dsm_headers = monitor.track_source_iteration(records=records)
+
+                    # Apply transformation
+                    df_source_records = transform.apply_transforms(df_source_records=df_source_records)
+
+                    df_destination_records = StreamingRunner.convert_to_destination_records(
+                        df_source_records, datetime.now(tz=UTC)
+                    )
+                    # Override destination_id
+                    destination.destination_id = destination_id
+                    destination.write_or_buffer_records(
+                        df_destination_records=df_destination_records,
+                        iteration=iteration,
+                        pagination=None,
+                    )
+                    monitor.track_records_synced(
+                        num_records=len(df_destination_records),
+                        destination_id=destination_id,
+                        extra_tags={"destination_id": destination_id},
+                        headers=dsm_headers,
+                    )
+
+                if os.getenv("ENVIRONMENT") == "production":
+                    try:
+                        source.commit()
+                    except Exception as e:
+                        logger.error(f"Error committing source: {e}")
+                        monitor.track_pipeline_status(PipelineReturnStatus.ERROR)
+                        return RunnerStatus(stream=PipelineReturnStatus.ERROR)
 
-            destination_id_indexed_records = {}
-
-            if len(source_iteration.records) == 0:
-                logger.info("No new records found, stopping iteration")
-                time.sleep(2)
-                monitor.track_pipeline_status(PipelineReturnStatus.SUCCESS)
                 iteration += 1
-                continue
-
-            for record in source_iteration.records:
-                if destination_id_indexed_records.get(record.destination_id):
-                    destination_id_indexed_records[record.destination_id].append(record)
-                else:
-                    destination_id_indexed_records[record.destination_id] = [record]
-
-            for destination_id, records in destination_id_indexed_records.items():
-                df_source_records = StreamingRunner.convert_source_records(records)
-
-                # Apply transformation
-                df_source_records = transform.apply_transforms(df_source_records=df_source_records)
 
-                df_destination_records = StreamingRunner.convert_to_destination_records(
-                    df_source_records, datetime.now(tz=UTC)
-                )
-                # Override destination_id
-                destination.destination_id = destination_id
-                destination.write_or_buffer_records(
-                    df_destination_records=df_destination_records,
-                    iteration=iteration,
-                    pagination=None,
-                )
-                monitor.track_records_synced(
-                    num_records=len(df_destination_records),
-                    extra_tags={"destination_id": destination_id},
-                )
-            if os.getenv("ENVIRONMENT") == "production":
-                source.commit()
-
-            iteration += 1
+                monitor.track_pipeline_status(PipelineReturnStatus.SUCCESS)
 
-        monitor.track_pipeline_status(PipelineReturnStatus.SUCCESS)
         return RunnerStatus(stream=PipelineReturnStatus.SUCCESS)  # return when max iterations is reached
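
For readers wiring up the new top-level `streams` block: the runner only requires that a source expose a `set_streams_config()` method and mutate its own (shared) config in place. A rough standalone sketch of that contract follows; `KafkaLikeSource`, `SourceConfig`, and the stream namespaces are invented stand-ins, not bizon classes — only the attribute names mirror what the runner reads.

# Hypothetical sketch of the set_streams_config() contract the StreamingRunner
# relies on. KafkaLikeSource and SourceConfig are stand-ins, not bizon classes.
from dataclasses import dataclass, field
from types import SimpleNamespace
from typing import List


@dataclass
class SourceConfig:
    topics: List[str] = field(default_factory=list)


class KafkaLikeSource:
    def __init__(self, config: SourceConfig):
        # The runner relies on this being a shared reference to bizon_config.source
        self.config = config

    def set_streams_config(self, streams) -> None:
        # Enrich the shared config in place so any source created later
        # (e.g. by the connection check in init_job) sees the same topics.
        for stream in streams:
            if stream.source.topic not in self.config.topics:
                self.config.topics.append(stream.source.topic)


config = SourceConfig()
streams = [
    SimpleNamespace(
        name="users",
        source=SimpleNamespace(topic="app.users"),
        destination=SimpleNamespace(table_id="raw.users", record_schema=None, clustering_keys=None),
    )
]
KafkaLikeSource(config).set_streams_config(streams)
print(config.topics)  # ['app.users'] -- later instantiations reuse the enriched config
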
bizon/engine/runner/adapters/thread.py

@@ -16,7 +16,6 @@ class ThreadRunner(AbstractRunner):
 
     # TODO: refacto this
     def get_kwargs(self):
-
         extra_kwargs = {}
 
         if self.bizon_config.engine.queue.type == "python_queue":
@@ -46,7 +45,6 @@ class ThreadRunner(AbstractRunner):
         with concurrent.futures.ThreadPoolExecutor(
             max_workers=self.bizon_config.engine.runner.config.max_workers
         ) as executor:
-
             future_producer = executor.submit(
                 AbstractRunner.instanciate_and_run_producer,
                 self.bizon_config,

bizon/engine/runner/config.py

@@ -37,7 +37,6 @@ class RunnerFuturesConfig(BaseModel):
 
 
 class RunnerConfig(BaseModel):
-
     type: RunnerTypes = Field(
         description="Runner to use for the pipeline",
         default=RunnerTypes.THREAD,

bizon/engine/runner/runner.py

@@ -27,7 +27,6 @@ from bizon.transform.transform import Transform
 
 class AbstractRunner(ABC):
     def __init__(self, config: dict):
-
         # Internal state
         self._is_running: bool = False
 
@@ -82,7 +81,11 @@
 
     @staticmethod
     def get_destination(
-        bizon_config: BizonConfig, backend: AbstractBackend, job_id: str, source_callback: AbstractSourceCallback
+        bizon_config: BizonConfig,
+        backend: AbstractBackend,
+        job_id: str,
+        source_callback: AbstractSourceCallback,
+        monitor: AbstractMonitor,
     ) -> AbstractDestination:
         """Get an instance of the destination based on the destination config dict"""
 
@@ -93,6 +96,7 @@
             config=bizon_config.destination,
             backend=backend,
             source_callback=source_callback,
+            monitor=monitor,
         )
 
     @staticmethod
@@ -124,9 +128,9 @@
         return Transform(transforms=bizon_config.transforms)
 
     @staticmethod
-    def get_monitoring_client(bizon_config: BizonConfig) -> AbstractMonitor:
+    def get_monitoring_client(sync_metadata: SyncMetadata, bizon_config: BizonConfig) -> AbstractMonitor:
        """Return the monitoring client instance"""
-        return MonitorFactory.get_monitor(bizon_config)
+        return MonitorFactory.get_monitor(sync_metadata, bizon_config.monitoring)
 
     @staticmethod
     def get_or_create_job(
@@ -217,7 +221,6 @@
         stop_event: Union[multiprocessing.synchronize.Event, threading.Event],
         **kwargs,
     ):
-
         # Get the source instance
         source = AbstractRunner.get_source(bizon_config=bizon_config, config=config)
 
@@ -252,23 +255,25 @@
             bizon_config=bizon_config, config=config
         ).get_source_callback_instance()
 
+        sync_metadata = SyncMetadata.from_bizon_config(job_id=job_id, config=bizon_config)
+
         # Get the queue instance
         queue = AbstractRunner.get_queue(bizon_config=bizon_config, **kwargs)
 
         # Get the backend instance
         backend = AbstractRunner.get_backend(bizon_config=bizon_config, **kwargs)
 
+        # Get the monitor instance
+        monitor = AbstractRunner.get_monitoring_client(sync_metadata=sync_metadata, bizon_config=bizon_config)
+
         # Get the destination instance
         destination = AbstractRunner.get_destination(
-            bizon_config=bizon_config, backend=backend, job_id=job_id, source_callback=source_callback
+            bizon_config=bizon_config, backend=backend, job_id=job_id, source_callback=source_callback, monitor=monitor
        )
 
         # Get the transform instance
         transform = AbstractRunner.get_transform(bizon_config=bizon_config)
 
-        # Get the monitor instance
-        monitor = AbstractRunner.get_monitoring_client(bizon_config=bizon_config)
-
         # Create the consumer instance
         consumer = queue.get_consumer(
             destination=destination,

bizon/monitoring/config.py

@@ -8,7 +8,11 @@ class MonitorType(str, Enum):
     DATADOG = "datadog"
 
 
-class DatadogConfig(BaseModel):
+class BaseMonitoringConfig(BaseModel):
+    enable_tracing: bool = Field(default=False, description="Enable tracing for the monitor")
+
+
+class DatadogConfig(BaseMonitoringConfig):
     datadog_agent_host: Optional[str] = None
     datadog_host_env_var: Optional[str] = None
     datadog_agent_port: int = 8125
@@ -23,7 +27,13 @@ class DatadogConfig(BaseModel):
         if not self.host_is_configured:
             raise ValueError("Either datadog_agent_host or datadog_host_env_var must be specified")
 
+    class Config:
+        extra = "forbid"
 
-class MonitoringConfig(BaseModel):
+
+class MonitoringConfig(BaseMonitoringConfig):
     type: MonitorType
     config: Optional[DatadogConfig] = None
+
+    class Config:
+        extra = "forbid"
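
Both config classes now set `extra = "forbid"`, so typos in the monitoring section fail validation instead of being silently ignored, and `enable_tracing` (default `False`) is inherited from `BaseMonitoringConfig`. A standalone pydantic sketch of that behaviour, using stand-in models rather than bizon imports:

# Standalone sketch (stand-in models, not bizon imports) of what
# extra = "forbid" plus the inherited enable_tracing flag means in practice.
from typing import Optional

from pydantic import BaseModel, Field, ValidationError


class BaseMonitoringConfig(BaseModel):
    enable_tracing: bool = Field(default=False, description="Enable tracing for the monitor")


class DatadogConfig(BaseMonitoringConfig):
    datadog_agent_host: Optional[str] = None
    datadog_host_env_var: Optional[str] = None
    datadog_agent_port: int = 8125

    class Config:
        extra = "forbid"


ok = DatadogConfig(datadog_agent_host="localhost", enable_tracing=True)
print(ok.enable_tracing, ok.datadog_agent_port)  # True 8125

try:
    DatadogConfig(datadog_agent_host="localhost", datadgo_agent_host="typo")
except ValidationError:
    print("unknown keys are now rejected instead of being silently ignored")
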
bizon/monitoring/datadog/monitor.py

@@ -1,44 +1,48 @@
 import os
-from typing import Dict
+from contextlib import contextmanager
+from typing import Dict, List, Union
 
 from datadog import initialize, statsd
 from loguru import logger
 
-from bizon.common.models import BizonConfig
+from bizon.common.models import SyncMetadata
 from bizon.engine.pipeline.models import PipelineReturnStatus
+from bizon.monitoring.config import MonitoringConfig
 from bizon.monitoring.monitor import AbstractMonitor
+from bizon.source.models import SourceRecord
 
 
 class DatadogMonitor(AbstractMonitor):
-    def __init__(self, pipeline_config: BizonConfig):
-        super().__init__(pipeline_config)
+    def __init__(self, sync_metadata: SyncMetadata, monitoring_config: MonitoringConfig):
+        super().__init__(sync_metadata, monitoring_config)
 
         # In Kubernetes, set the host dynamically
         try:
-            datadog_host_from_env_var = os.getenv(pipeline_config.monitoring.config.datadog_host_env_var)
+            datadog_host_from_env_var = os.getenv(monitoring_config.config.datadog_host_env_var)
             if datadog_host_from_env_var:
                 initialize(
                     statsd_host=datadog_host_from_env_var,
-                    statsd_port=pipeline_config.monitoring.config.datadog_agent_port,
+                    statsd_port=monitoring_config.config.datadog_agent_port,
                 )
             else:
                 initialize(
-                    statsd_host=pipeline_config.monitoring.config.datadog_agent_host,
-                    statsd_port=pipeline_config.monitoring.config.datadog_agent_port,
+                    statsd_host=monitoring_config.config.datadog_agent_host,
+                    statsd_port=monitoring_config.config.datadog_agent_port,
                 )
         except Exception as e:
             logger.info(f"Failed to initialize Datadog agent: {e}")
 
         self.pipeline_monitor_status = "bizon_pipeline.status"
         self.tags = [
-            f"pipeline_name:{self.pipeline_config.name}",
-            f"pipeline_stream:{self.pipeline_config.source.stream}",
-            f"pipeline_source:{self.pipeline_config.source.name}",
-            f"pipeline_destination:{self.pipeline_config.destination.name}",
-        ] + [f"{key}:{value}" for key, value in self.pipeline_config.monitoring.config.tags.items()]
+            f"pipeline_name:{self.sync_metadata.name}",
+            f"pipeline_stream:{self.sync_metadata.stream_name}",
+            f"pipeline_source:{self.sync_metadata.source_name}",
+            f"pipeline_destination:{self.sync_metadata.destination_name}",
+        ] + [f"{key}:{value}" for key, value in self.monitoring_config.config.tags.items()]
 
         self.pipeline_active_pipelines = "bizon_pipeline.active_pipelines"
         self.pipeline_records_synced = "bizon_pipeline.records_synced"
+        self.pipeline_large_records = "bizon_pipeline.large_records"
 
     def track_pipeline_status(self, pipeline_status: PipelineReturnStatus, extra_tags: Dict[str, str] = {}) -> None:
         """
@@ -55,7 +59,9 @@ class DatadogMonitor(AbstractMonitor):
             + [f"{key}:{value}" for key, value in extra_tags.items()],
         )
 
-    def track_records_synced(self, num_records: int, extra_tags: Dict[str, str] = {}) -> None:
+    def track_records_synced(
+        self, num_records: int, destination_id: str, extra_tags: Dict[str, str] = {}, headers: List[Dict[str, str]] = []
+    ) -> Union[List[Dict[str, str]], None]:
         """
         Track the number of records synced in the pipeline.
 
@@ -67,3 +73,83 @@ class DatadogMonitor(AbstractMonitor):
             value=num_records,
             tags=self.tags + [f"{key}:{value}" for key, value in extra_tags.items()],
         )
+        if os.getenv("DD_DATA_STREAMS_ENABLED") == "true":
+            from ddtrace.data_streams import set_produce_checkpoint
+
+            destination_type = self.sync_metadata.destination_alias
+
+            for header in headers:
+                if "x-datadog-sampling-priority" in header:
+                    del header["x-datadog-sampling-priority"]
+                if "dd-pathway-ctx-base64" in header:
+                    del header["dd-pathway-ctx-base64"]
+                set_produce_checkpoint(destination_type, destination_id, header.setdefault)
+            return headers
+
+    def track_large_records_synced(self, num_records: int, extra_tags: Dict[str, str] = {}) -> None:
+        statsd.increment(
+            self.pipeline_large_records,
+            value=num_records,
+            tags=self.tags + [f"{key}:{value}" for key, value in extra_tags.items()],
+        )
+
+    def track_source_iteration(self, records: List[SourceRecord]) -> Union[List[Dict[str, str]], None]:
+        """
+        Track the number of records consumed from a Kafka topic.
+
+        Args:
+            kafka_topic (str): The Kafka topic name
+        """
+
+        if os.getenv("DD_DATA_STREAMS_ENABLED") == "true":
+            from ddtrace.data_streams import set_consume_checkpoint
+
+            headers_list = []
+            for record in records:
+                headers = record.data.get("headers", {})
+                set_consume_checkpoint("kafka", record.data["topic"], headers.get)
+                headers_list.append(headers)
+            return headers_list
+
+    @contextmanager
+    def trace(self, operation_name: str, resource: str = None, extra_tags: Dict[str, str] = None):
+        """
+        Create a trace span for monitoring using Datadog APM.
+
+        Args:
+            operation_name (str): The name of the operation being traced
+            resource (str): The resource being operated on (e.g., topic name, table name)
+            extra_tags (Dict[str, str]): Additional tags for the trace
+
+        Yields:
+            A span object that can be used to add additional metadata
+        """
+        if not self.monitoring_config.config.enable_tracing:
+            yield None
+            return
+
+        try:
+            from ddtrace import tracer
+        except ImportError:
+            logger.warning("ddtrace not available, skipping tracing")
+            yield None
+            return
+
+        try:
+            # Combine tags
+            all_tags = self.tags.copy()
+            if extra_tags:
+                all_tags.extend([f"{key}:{value}" for key, value in extra_tags.items()])
+
+            # Create the span
+            with tracer.trace(operation_name, resource=resource) as span:
+                # Add tags to the span
+                for tag in all_tags:
+                    if ":" in tag:
+                        key, value = tag.split(":", 1)
+                        span.set_tag(key, value)
+                span.set_tag("_sampling_priority_v1", 1)
+                yield span
+        except Exception as e:
+            logger.warning(f"Failed to create trace: {e}")
+            yield None
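
Because `trace()` is exposed as a generator-based context manager on every monitor, callers can always write `with monitor.trace(...) as span:` and simply receive `None` when tracing is disabled or `ddtrace` is absent. A minimal stand-in (not the bizon implementation) showing that pattern:

# Stand-in illustration (not the bizon implementation) of the contextmanager
# shape shared by DatadogMonitor.trace() and NoOpMonitor.trace().
from contextlib import contextmanager
from typing import Dict, Optional


class TracingSketch:
    def __init__(self, enable_tracing: bool):
        self.enable_tracing = enable_tracing

    @contextmanager
    def trace(self, operation_name: str, resource: str = None, extra_tags: Optional[Dict[str, str]] = None):
        if not self.enable_tracing:
            yield None  # callers still get a valid context manager
            return
        span = {"name": operation_name, "resource": resource, "tags": dict(extra_tags or {})}
        try:
            yield span
        finally:
            span["finished"] = True  # a real backend would close/flush the span here


monitor = TracingSketch(enable_tracing=False)
with monitor.trace("bizon.stream.iteration") as span:
    assert span is None  # tracing disabled: the block still runs, nothing is recorded
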
bizon/monitoring/monitor.py

@@ -1,15 +1,16 @@
 from abc import ABC, abstractmethod
-from typing import Dict
+from typing import Dict, List, Union
 
-from bizon.common.models import BizonConfig
+from bizon.common.models import SyncMetadata
 from bizon.engine.pipeline.models import PipelineReturnStatus
-from bizon.monitoring.config import MonitorType
+from bizon.monitoring.config import MonitoringConfig, MonitorType
+from bizon.source.models import SourceRecord
 
 
 class AbstractMonitor(ABC):
-    def __init__(self, pipeline_config: BizonConfig):
-        self.pipeline_config = pipeline_config
-        # Initialize the monitor
+    def __init__(self, sync_metadata: SyncMetadata, monitoring_config: MonitoringConfig):
+        self.sync_metadata = sync_metadata
+        self.monitoring_config = monitoring_config
 
     @abstractmethod
     def track_pipeline_status(self, pipeline_status: PipelineReturnStatus, extra_tags: Dict[str, str] = {}) -> None:
@@ -21,22 +22,50 @@ class AbstractMonitor(ABC):
         """
         pass
 
-    def track_records_synced(self, num_records: int, extra_tags: Dict[str, str] = {}) -> None:
+    def track_source_iteration(self, records: List[SourceRecord], headers: Dict[str, str] = {}) -> None:
+        """
+        Run a process that tracks the source iteration.
+        """
+        pass
+
+    def track_records_synced(
+        self, num_records: int, destination_id: str, extra_tags: Dict[str, str] = {}, headers: Dict[str, str] = {}
+    ) -> None:
         """
         Track the number of records synced in the pipeline.
         """
         pass
 
+    def trace(self, operation_name: str, resource: str = None, extra_tags: Dict[str, str] = None):
+        """
+        Create a trace span for monitoring.
+
+        Args:
+            operation_name (str): The name of the operation being traced
+            resource (str): The resource being operated on (e.g., topic name, table name)
+            extra_tags (Dict[str, str]): Additional tags for the trace
+
+        Returns:
+            A context manager that can be used with 'with' statement
+        """
+        pass
+
+    def track_large_records_synced(self, num_records: int, extra_tags: Dict[str, str] = {}) -> None:
+        """
+        Track the number of large records synced in the destination system. This aims at helping to identify the source of the large records.
+        """
+        pass
+
 
 class MonitorFactory:
     @staticmethod
-    def get_monitor(pipeline_config: BizonConfig) -> AbstractMonitor:
-        if pipeline_config.monitoring is None:
+    def get_monitor(sync_metadata: SyncMetadata, monitoring_config: Union[MonitoringConfig, None]) -> AbstractMonitor:
+        if monitoring_config is None:
             from bizon.monitoring.noop.monitor import NoOpMonitor
 
-            return NoOpMonitor(pipeline_config)
+            return NoOpMonitor(sync_metadata, monitoring_config)
 
-        if pipeline_config.monitoring.type == MonitorType.DATADOG:
+        if monitoring_config.type == MonitorType.DATADOG:
             from bizon.monitoring.datadog.monitor import DatadogMonitor
 
-            return DatadogMonitor(pipeline_config)
+            return DatadogMonitor(sync_metadata, monitoring_config)

bizon/monitoring/noop/monitor.py

@@ -1,11 +1,30 @@
-from bizon.common.models import BizonConfig
+from contextlib import contextmanager
+from typing import Dict
+
+from bizon.common.models import SyncMetadata
 from bizon.engine.pipeline.models import PipelineReturnStatus
+from bizon.monitoring.config import MonitoringConfig
 from bizon.monitoring.monitor import AbstractMonitor
 
 
 class NoOpMonitor(AbstractMonitor):
-    def __init__(self, pipeline_config: BizonConfig):
-        super().__init__(pipeline_config)
+    def __init__(self, sync_metadata: SyncMetadata, monitoring_config: MonitoringConfig):
+        super().__init__(sync_metadata, monitoring_config)
 
     def track_pipeline_status(self, pipeline_status: PipelineReturnStatus) -> None:
         pass
+
+    @contextmanager
+    def trace(self, operation_name: str, resource: str = None, extra_tags: Dict[str, str] = None):
+        """
+        No-op trace implementation.
+
+        Args:
+            operation_name (str): The name of the operation being traced
+            resource (str): The resource being operated on (e.g., topic name, table name)
+            extra_tags (Dict[str, str]): Additional tags for the trace
+
+        Yields:
+            None (no-op implementation)
+        """
+        yield None

bizon/source/auth/authenticators/abstract_oauth.py

@@ -1,7 +1,9 @@
 from abc import abstractmethod
-from typing import Any, List, Mapping, MutableMapping, Tuple, Union
+from collections.abc import Mapping, MutableMapping
+from typing import Any, List, Tuple, Union
 
 import backoff
+import dpath
 import pendulum
 import requests
 from loguru import logger
@@ -52,8 +54,8 @@ class AbstractOauth2Authenticator(AuthBase):
             "refresh_token": self.get_refresh_token(),
         }
 
-        if self.get_scopes:
-            payload["scopes"] = self.get_scopes()
+        if self.get_scopes():
+            payload["scope"] = ",".join(self.get_scopes())
 
         if self.get_refresh_request_body():
             for key, val in self.get_refresh_request_body().items():
@@ -92,6 +94,8 @@
         :return: a tuple of (access_token, token_lifespan_in_seconds)
         """
         response_json = self._get_refresh_access_token_response()
+        if self.get_response_field_path():
+            response_json = dpath.get(response_json, self.get_response_field_path())
         return response_json[self.get_access_token_name()], int(response_json[self.get_expires_in_name()])
 
     @abstractmethod
@@ -138,6 +142,10 @@
     def get_grant_type(self) -> str:
         """Returns grant_type specified for requesting access_token"""
 
+    @abstractmethod
+    def get_response_field_path(self) -> str:
+        """Returns the path to the response field"""
+
     @property
     @abstractmethod
     def access_token(self) -> str:
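
The new `get_response_field_path()` hook lets an authenticator point at a nested object inside the token refresh response; `dpath.get` then narrows the payload before `access_token`/`expires_in` are read, and scopes are now sent as a comma-joined `scope` field. A small illustration of the dpath step; the response shape and the "data/token" path here are made up:

# Illustration only: a nested refresh response narrowed with dpath before the
# token fields are read. The shape and the "data/token" path are invented.
import dpath

response_json = {"data": {"token": {"access_token": "abc123", "expires_in": 3600}}}

# What a connector's get_response_field_path() might return
token_payload = dpath.get(response_json, "data/token")

print(token_payload["access_token"], int(token_payload["expires_in"]))  # abc123 3600
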
bizon/source/auth/authenticators/abstract_token.py

@@ -1,5 +1,6 @@
 from abc import abstractmethod
-from typing import Any, Mapping
+from collections.abc import Mapping
+from typing import Any
 
 from requests.auth import AuthBase
 

bizon/source/auth/authenticators/basic.py

@@ -27,7 +27,7 @@ class BasicHttpAuthenticator(AbstractHeaderAuthenticator):
         return f"{self._auth_method} {self._token}"
 
     def __init__(self, params: BasicHttpAuthParams):
-        auth_string = f"{params.username}:{params.password}".encode("utf8")
+        auth_string = f"{params.username}:{params.password}".encode()
         b64_encoded = base64.b64encode(auth_string).decode("utf8")
         self._auth_header = params.auth_header
         self._auth_method = params.auth_method

bizon/source/auth/authenticators/cookies.py

@@ -1,4 +1,5 @@
-from typing import Any, Mapping, Optional
+from collections.abc import Mapping
+from typing import Any, Optional
 
 from pydantic import BaseModel, Field
 from requests import PreparedRequest