bizon 0.1.1__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bizon/alerting/alerts.py +0 -1
- bizon/common/models.py +184 -4
- bizon/connectors/destinations/bigquery/src/config.py +1 -1
- bizon/connectors/destinations/bigquery/src/destination.py +14 -9
- bizon/connectors/destinations/bigquery_streaming/config/bigquery_streaming.example.yml +74 -0
- bizon/connectors/destinations/bigquery_streaming/src/config.py +6 -5
- bizon/connectors/destinations/bigquery_streaming/src/destination.py +13 -9
- bizon/connectors/destinations/bigquery_streaming_v2/config/bigquery_streaming_v2.example.yml +79 -0
- bizon/connectors/destinations/bigquery_streaming_v2/src/config.py +6 -1
- bizon/connectors/destinations/bigquery_streaming_v2/src/destination.py +232 -49
- bizon/connectors/destinations/bigquery_streaming_v2/src/proto_utils.py +1 -13
- bizon/connectors/destinations/file/config/file.example.yml +40 -0
- bizon/connectors/destinations/file/src/config.py +2 -1
- bizon/connectors/destinations/file/src/destination.py +3 -6
- bizon/connectors/destinations/logger/config/logger.example.yml +30 -0
- bizon/connectors/destinations/logger/src/config.py +1 -2
- bizon/connectors/destinations/logger/src/destination.py +4 -2
- bizon/connectors/sources/cycle/src/source.py +2 -6
- bizon/connectors/sources/dummy/src/source.py +0 -4
- bizon/connectors/sources/gsheets/src/source.py +2 -3
- bizon/connectors/sources/hubspot/src/hubspot_base.py +0 -1
- bizon/connectors/sources/hubspot/src/hubspot_objects.py +3 -4
- bizon/connectors/sources/hubspot/src/models/hs_object.py +0 -1
- bizon/connectors/sources/kafka/config/kafka.example.yml +1 -3
- bizon/connectors/sources/kafka/config/kafka_debezium.example.yml +1 -3
- bizon/connectors/sources/kafka/config/kafka_streams.example.yml +124 -0
- bizon/connectors/sources/kafka/src/config.py +10 -12
- bizon/connectors/sources/kafka/src/decode.py +65 -60
- bizon/connectors/sources/kafka/src/source.py +182 -61
- bizon/connectors/sources/kafka/tests/kafka_pipeline.py +1 -1
- bizon/connectors/sources/notion/config/api_key.example.yml +35 -0
- bizon/connectors/sources/notion/src/__init__.py +0 -0
- bizon/connectors/sources/notion/src/config.py +59 -0
- bizon/connectors/sources/notion/src/source.py +1159 -0
- bizon/connectors/sources/notion/tests/notion_pipeline.py +7 -0
- bizon/connectors/sources/notion/tests/test_notion.py +113 -0
- bizon/connectors/sources/periscope/src/source.py +0 -6
- bizon/connectors/sources/pokeapi/src/source.py +0 -1
- bizon/connectors/sources/sana_ai/config/sana.example.yml +25 -0
- bizon/connectors/sources/sana_ai/src/source.py +85 -0
- bizon/destination/buffer.py +0 -1
- bizon/destination/config.py +9 -1
- bizon/destination/destination.py +38 -9
- bizon/engine/backend/adapters/sqlalchemy/backend.py +2 -5
- bizon/engine/backend/adapters/sqlalchemy/config.py +0 -1
- bizon/engine/config.py +0 -1
- bizon/engine/engine.py +0 -1
- bizon/engine/pipeline/consumer.py +0 -1
- bizon/engine/pipeline/producer.py +1 -5
- bizon/engine/queue/adapters/kafka/config.py +1 -1
- bizon/engine/queue/adapters/kafka/queue.py +0 -1
- bizon/engine/queue/adapters/python_queue/consumer.py +0 -1
- bizon/engine/queue/adapters/python_queue/queue.py +0 -2
- bizon/engine/queue/adapters/rabbitmq/consumer.py +0 -1
- bizon/engine/queue/adapters/rabbitmq/queue.py +0 -1
- bizon/engine/queue/config.py +0 -2
- bizon/engine/runner/adapters/process.py +0 -2
- bizon/engine/runner/adapters/streaming.py +114 -42
- bizon/engine/runner/adapters/thread.py +0 -2
- bizon/engine/runner/config.py +0 -1
- bizon/engine/runner/runner.py +14 -9
- bizon/monitoring/config.py +12 -2
- bizon/monitoring/datadog/monitor.py +100 -14
- bizon/monitoring/monitor.py +41 -12
- bizon/monitoring/noop/monitor.py +22 -3
- bizon/source/auth/authenticators/abstract_oauth.py +11 -3
- bizon/source/auth/authenticators/abstract_token.py +2 -1
- bizon/source/auth/authenticators/basic.py +1 -1
- bizon/source/auth/authenticators/cookies.py +2 -1
- bizon/source/auth/authenticators/oauth.py +8 -3
- bizon/source/config.py +0 -2
- bizon/source/cursor.py +8 -16
- bizon/source/discover.py +3 -6
- bizon/source/models.py +0 -1
- bizon/source/session.py +0 -1
- bizon/source/source.py +18 -3
- bizon/transform/config.py +0 -2
- bizon/transform/transform.py +0 -3
- {bizon-0.1.1.dist-info → bizon-0.2.0.dist-info}/METADATA +62 -41
- bizon-0.2.0.dist-info/RECORD +136 -0
- {bizon-0.1.1.dist-info → bizon-0.2.0.dist-info}/WHEEL +1 -1
- bizon-0.2.0.dist-info/entry_points.txt +2 -0
- bizon-0.1.1.dist-info/RECORD +0 -123
- bizon-0.1.1.dist-info/entry_points.txt +0 -3
- {bizon-0.1.1.dist-info → bizon-0.2.0.dist-info/licenses}/LICENSE +0 -0
bizon/engine/runner/adapters/streaming.py CHANGED

@@ -8,12 +8,14 @@ import simplejson as json
 from loguru import logger
 from pytz import UTC

-from bizon.common.models import BizonConfig
+from bizon.common.models import BizonConfig, SyncMetadata
+from bizon.connectors.destinations.bigquery.src.config import BigQueryRecordSchemaConfig
 from bizon.destination.models import transform_to_df_destination_records
 from bizon.engine.pipeline.models import PipelineReturnStatus
 from bizon.engine.runner.config import RunnerStatus
 from bizon.engine.runner.runner import AbstractRunner
 from bizon.source.models import SourceRecord, source_record_schema
+from bizon.source.source import AbstractSource


 class StreamingRunner(AbstractRunner):
@@ -36,68 +38,138 @@ class StreamingRunner(AbstractRunner):
     def convert_to_destination_records(df_source_records: pl.DataFrame, extracted_at: datetime) -> pl.DataFrame:
         return transform_to_df_destination_records(df_source_records=df_source_records, extracted_at=extracted_at)

+    def _apply_streams_config(self, source: AbstractSource = None) -> None:
+        """Apply streams configuration to source and destination.
+
+        This method is completely source-agnostic. Each source connector is responsible
+        for handling streams config appropriately via set_streams_config().
+
+        When a top-level 'streams' configuration is present, this method:
+        1. Calls source.set_streams_config() to let the source enrich its own config
+        2. Builds destination record_schemas from streams config
+        3. Injects record_schemas into destination config for backward compatibility
+
+        The source is responsible for modifying self.config (which points to bizon_config.source)
+        so that subsequent source instantiations see the enriched config.
+        """
+        if not self.bizon_config.streams:
+            return
+
+        logger.info(f"Applying streams configuration: {len(self.bizon_config.streams)} streams defined")
+
+        # Let the source enrich its own config from streams
+        # Note: source modifies self.config, which is a reference to bizon_config.source
+        # This ensures init_job (which creates a new source) sees the enriched config
+        if source and hasattr(source, "set_streams_config") and callable(source.set_streams_config):
+            source.set_streams_config(self.bizon_config.streams)
+
+        # Build record_schemas list for destination from streams config
+        record_schemas = []
+        for stream in self.bizon_config.streams:
+            if stream.destination.record_schema:
+                record_schema_config = BigQueryRecordSchemaConfig(
+                    destination_id=stream.destination.table_id,
+                    record_schema=stream.destination.record_schema,
+                    clustering_keys=stream.destination.clustering_keys,
+                )
+                record_schemas.append(record_schema_config)
+                logger.info(
+                    f"Stream '{stream.name}': "
+                    f"{getattr(stream.source, 'topic', getattr(stream.source, 'name', 'N/A'))} "
+                    f"-> {stream.destination.table_id}"
+                )
+
+        # Inject into destination config
+        if record_schemas and hasattr(self.bizon_config.destination.config, "record_schemas"):
+            logger.info(f"Injecting {len(record_schemas)} record schemas into destination config")
+            self.bizon_config.destination.config.record_schemas = record_schemas
+
     def run(self) -> RunnerStatus:
+        # Create a temporary source to enrich bizon_config.source from streams
+        # The source's set_streams_config() modifies self.config (= bizon_config.source)
+        # This ensures subsequent source instantiations see the enriched config
+        temp_source = self.get_source(bizon_config=self.bizon_config, config=self.config)
+        self._apply_streams_config(temp_source)
+
+        # Now initialize job (check_connection will use enriched source config)
         job = self.init_job(bizon_config=self.bizon_config, config=self.config)
         backend = self.get_backend(bizon_config=self.bizon_config)
         source = self.get_source(bizon_config=self.bizon_config, config=self.config)
+
+        sync_metadata = SyncMetadata.from_bizon_config(job_id=job.id, config=self.bizon_config)
+        monitor = self.get_monitoring_client(sync_metadata=sync_metadata, bizon_config=self.bizon_config)
+
         destination = self.get_destination(
             bizon_config=self.bizon_config,
             backend=backend,
             job_id=job.id,
             source_callback=None,
+            monitor=monitor,
         )
+
         transform = self.get_transform(bizon_config=self.bizon_config)
-
+
         destination.buffer.buffer_size = 0  # force buffer to be flushed immediately
         iteration = 0

         while True:
-
             if source.config.max_iterations and iteration > source.config.max_iterations:
                 logger.info(f"Max iterations {source.config.max_iterations} reached, terminating stream ...")
                 break

-            source_iteration = source.get()
+            with monitor.trace(operation_name="bizon.stream.iteration"):
+                source_iteration = source.get()
+
+                destination_id_indexed_records = {}
+
+                if len(source_iteration.records) == 0:
+                    logger.info("No new records found, stopping iteration")
+                    time.sleep(2)
+                    monitor.track_pipeline_status(PipelineReturnStatus.SUCCESS)
+                    iteration += 1
+                    continue
+
+                for record in source_iteration.records:
+                    if destination_id_indexed_records.get(record.destination_id):
+                        destination_id_indexed_records[record.destination_id].append(record)
+                    else:
+                        destination_id_indexed_records[record.destination_id] = [record]
+
+                for destination_id, records in destination_id_indexed_records.items():
+                    df_source_records = StreamingRunner.convert_source_records(records)
+
+                    dsm_headers = monitor.track_source_iteration(records=records)
+
+                    # Apply transformation
+                    df_source_records = transform.apply_transforms(df_source_records=df_source_records)
+
+                    df_destination_records = StreamingRunner.convert_to_destination_records(
+                        df_source_records, datetime.now(tz=UTC)
+                    )
+                    # Override destination_id
+                    destination.destination_id = destination_id
+                    destination.write_or_buffer_records(
+                        df_destination_records=df_destination_records,
+                        iteration=iteration,
+                        pagination=None,
+                    )
+                    monitor.track_records_synced(
+                        num_records=len(df_destination_records),
+                        destination_id=destination_id,
+                        extra_tags={"destination_id": destination_id},
+                        headers=dsm_headers,
+                    )
+
+                if os.getenv("ENVIRONMENT") == "production":
+                    try:
+                        source.commit()
+                    except Exception as e:
+                        logger.error(f"Error committing source: {e}")
+                        monitor.track_pipeline_status(PipelineReturnStatus.ERROR)
+                        return RunnerStatus(stream=PipelineReturnStatus.ERROR)

-            destination_id_indexed_records = {}
-
-            if len(source_iteration.records) == 0:
-                logger.info("No new records found, stopping iteration")
-                time.sleep(2)
-                monitor.track_pipeline_status(PipelineReturnStatus.SUCCESS)
                 iteration += 1
-                continue
-
-            for record in source_iteration.records:
-                if destination_id_indexed_records.get(record.destination_id):
-                    destination_id_indexed_records[record.destination_id].append(record)
-                else:
-                    destination_id_indexed_records[record.destination_id] = [record]
-
-            for destination_id, records in destination_id_indexed_records.items():
-                df_source_records = StreamingRunner.convert_source_records(records)
-
-                # Apply transformation
-                df_source_records = transform.apply_transforms(df_source_records=df_source_records)

-                df_destination_records = StreamingRunner.convert_to_destination_records(
-                    df_source_records, datetime.now(tz=UTC)
-                )
-                # Override destination_id
-                destination.destination_id = destination_id
-                destination.write_or_buffer_records(
-                    df_destination_records=df_destination_records,
-                    iteration=iteration,
-                    pagination=None,
-                )
-                monitor.track_records_synced(
-                    num_records=len(df_destination_records),
-                    extra_tags={"destination_id": destination_id},
-                )
-                if os.getenv("ENVIRONMENT") == "production":
-                    source.commit()
-
-                iteration += 1
+                monitor.track_pipeline_status(PipelineReturnStatus.SUCCESS)

-            monitor.track_pipeline_status(PipelineReturnStatus.SUCCESS)
         return RunnerStatus(stream=PipelineReturnStatus.SUCCESS)  # return when max iterations is reached
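The runner only duck-types on set_streams_config, so a source connector opts into multi-stream routing simply by exposing that method. A minimal sketch of a connector-side hook, assuming a hypothetical Kafka-style source; only stream.source.topic and stream.destination.table_id come from the diff above, and the topics field on self.config is invented:

    from bizon.source.source import AbstractSource

    class MyKafkaSource(AbstractSource):  # hypothetical connector
        def set_streams_config(self, streams) -> None:
            # self.config is a reference to bizon_config.source, so this
            # enrichment is visible to every source instantiated afterwards.
            self.config.topics = [  # 'topics' is an assumed config field
                {"topic": stream.source.topic, "destination_id": stream.destination.table_id}
                for stream in streams
            ]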
bizon/engine/runner/adapters/thread.py CHANGED

@@ -16,7 +16,6 @@ class ThreadRunner(AbstractRunner):

     # TODO: refacto this
     def get_kwargs(self):
-
         extra_kwargs = {}

         if self.bizon_config.engine.queue.type == "python_queue":
@@ -46,7 +45,6 @@ class ThreadRunner(AbstractRunner):
         with concurrent.futures.ThreadPoolExecutor(
             max_workers=self.bizon_config.engine.runner.config.max_workers
         ) as executor:
-
             future_producer = executor.submit(
                 AbstractRunner.instanciate_and_run_producer,
                 self.bizon_config,
bizon/engine/runner/config.py CHANGED
bizon/engine/runner/runner.py CHANGED

@@ -27,7 +27,6 @@ from bizon.transform.transform import Transform

 class AbstractRunner(ABC):
     def __init__(self, config: dict):
-
         # Internal state
         self._is_running: bool = False

@@ -82,7 +81,11 @@ class AbstractRunner(ABC):

     @staticmethod
     def get_destination(
-        bizon_config: BizonConfig, backend: AbstractBackend, job_id: str, source_callback: AbstractSourceCallback
+        bizon_config: BizonConfig,
+        backend: AbstractBackend,
+        job_id: str,
+        source_callback: AbstractSourceCallback,
+        monitor: AbstractMonitor,
     ) -> AbstractDestination:
         """Get an instance of the destination based on the destination config dict"""

@@ -93,6 +96,7 @@ class AbstractRunner(ABC):
             config=bizon_config.destination,
             backend=backend,
             source_callback=source_callback,
+            monitor=monitor,
         )

     @staticmethod
@@ -124,9 +128,9 @@ class AbstractRunner(ABC):
         return Transform(transforms=bizon_config.transforms)

     @staticmethod
-    def get_monitoring_client(bizon_config: BizonConfig) -> AbstractMonitor:
+    def get_monitoring_client(sync_metadata: SyncMetadata, bizon_config: BizonConfig) -> AbstractMonitor:
         """Return the monitoring client instance"""
-        return MonitorFactory.get_monitor(bizon_config)
+        return MonitorFactory.get_monitor(sync_metadata, bizon_config.monitoring)

     @staticmethod
     def get_or_create_job(
@@ -217,7 +221,6 @@ class AbstractRunner(ABC):
         stop_event: Union[multiprocessing.synchronize.Event, threading.Event],
         **kwargs,
     ):
-
         # Get the source instance
         source = AbstractRunner.get_source(bizon_config=bizon_config, config=config)

@@ -252,23 +255,25 @@ class AbstractRunner(ABC):
             bizon_config=bizon_config, config=config
         ).get_source_callback_instance()

+        sync_metadata = SyncMetadata.from_bizon_config(job_id=job_id, config=bizon_config)
+
         # Get the queue instance
         queue = AbstractRunner.get_queue(bizon_config=bizon_config, **kwargs)

         # Get the backend instance
         backend = AbstractRunner.get_backend(bizon_config=bizon_config, **kwargs)

+        # Get the monitor instance
+        monitor = AbstractRunner.get_monitoring_client(sync_metadata=sync_metadata, bizon_config=bizon_config)
+
         # Get the destination instance
         destination = AbstractRunner.get_destination(
-            bizon_config=bizon_config, backend=backend, job_id=job_id, source_callback=source_callback
+            bizon_config=bizon_config, backend=backend, job_id=job_id, source_callback=source_callback, monitor=monitor
         )

         # Get the transform instance
         transform = AbstractRunner.get_transform(bizon_config=bizon_config)

-        # Get the monitor instance
-        monitor = AbstractRunner.get_monitoring_client(bizon_config=bizon_config)
-
         # Create the consumer instance
         consumer = queue.get_consumer(
             destination=destination,
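For code that calls these static methods directly, the signature changes are breaking: get_destination gained a required monitor parameter and get_monitoring_client now needs the SyncMetadata built from the job. A caller sketch under the new signatures shown above:

    sync_metadata = SyncMetadata.from_bizon_config(job_id=job_id, config=bizon_config)
    monitor = AbstractRunner.get_monitoring_client(sync_metadata=sync_metadata, bizon_config=bizon_config)
    destination = AbstractRunner.get_destination(
        bizon_config=bizon_config,
        backend=backend,
        job_id=job_id,
        source_callback=source_callback,
        monitor=monitor,  # destinations can now report metrics themselves
    )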
bizon/monitoring/config.py CHANGED

@@ -8,7 +8,11 @@ class MonitorType(str, Enum):
     DATADOG = "datadog"


-class DatadogConfig(BaseModel):
+class BaseMonitoringConfig(BaseModel):
+    enable_tracing: bool = Field(default=False, description="Enable tracing for the monitor")
+
+
+class DatadogConfig(BaseMonitoringConfig):
     datadog_agent_host: Optional[str] = None
     datadog_host_env_var: Optional[str] = None
     datadog_agent_port: int = 8125
@@ -23,7 +27,13 @@ class DatadogConfig(BaseModel):
         if not self.host_is_configured:
             raise ValueError("Either datadog_agent_host or datadog_host_env_var must be specified")

+    class Config:
+        extra = "forbid"

-class MonitoringConfig(BaseModel):
+
+class MonitoringConfig(BaseMonitoringConfig):
     type: MonitorType
     config: Optional[DatadogConfig] = None
+
+    class Config:
+        extra = "forbid"
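Because both models now declare extra = "forbid", unknown keys in the monitoring block raise a validation error instead of being silently ignored. A sketch of constructing the new models directly; the values are illustrative:

    from bizon.monitoring.config import DatadogConfig, MonitoringConfig, MonitorType

    monitoring = MonitoringConfig(
        type=MonitorType.DATADOG,
        enable_tracing=True,  # inherited from BaseMonitoringConfig, defaults to False
        config=DatadogConfig(
            datadog_agent_host="localhost",  # or set datadog_host_env_var instead
            datadog_agent_port=8125,
        ),
    )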
bizon/monitoring/datadog/monitor.py CHANGED

@@ -1,44 +1,48 @@
 import os
-from typing import Dict
+from contextlib import contextmanager
+from typing import Dict, List, Union

 from datadog import initialize, statsd
 from loguru import logger

-from bizon.common.models import BizonConfig
+from bizon.common.models import SyncMetadata
 from bizon.engine.pipeline.models import PipelineReturnStatus
+from bizon.monitoring.config import MonitoringConfig
 from bizon.monitoring.monitor import AbstractMonitor
+from bizon.source.models import SourceRecord


 class DatadogMonitor(AbstractMonitor):
-    def __init__(self, bizon_config: BizonConfig):
-        super().__init__(bizon_config)
+    def __init__(self, sync_metadata: SyncMetadata, monitoring_config: MonitoringConfig):
+        super().__init__(sync_metadata, monitoring_config)

         # In Kubernetes, set the host dynamically
         try:
-            datadog_host_from_env_var = os.getenv(bizon_config.monitoring.config.datadog_host_env_var)
+            datadog_host_from_env_var = os.getenv(monitoring_config.config.datadog_host_env_var)
             if datadog_host_from_env_var:
                 initialize(
                     statsd_host=datadog_host_from_env_var,
-                    statsd_port=bizon_config.monitoring.config.datadog_agent_port,
+                    statsd_port=monitoring_config.config.datadog_agent_port,
                 )
             else:
                 initialize(
-                    statsd_host=bizon_config.monitoring.config.datadog_agent_host,
-                    statsd_port=bizon_config.monitoring.config.datadog_agent_port,
+                    statsd_host=monitoring_config.config.datadog_agent_host,
+                    statsd_port=monitoring_config.config.datadog_agent_port,
                 )
         except Exception as e:
             logger.info(f"Failed to initialize Datadog agent: {e}")

         self.pipeline_monitor_status = "bizon_pipeline.status"
         self.tags = [
-            f"pipeline_name:{self.bizon_config.name}",
-            f"pipeline_stream:{self.bizon_config.source.stream}",
-            f"pipeline_source:{self.bizon_config.source.name}",
-            f"pipeline_destination:{self.bizon_config.destination.name}",
-        ] + [f"{key}:{value}" for key, value in self.bizon_config.monitoring.config.tags.items()]
+            f"pipeline_name:{self.sync_metadata.name}",
+            f"pipeline_stream:{self.sync_metadata.stream_name}",
+            f"pipeline_source:{self.sync_metadata.source_name}",
+            f"pipeline_destination:{self.sync_metadata.destination_name}",
+        ] + [f"{key}:{value}" for key, value in self.monitoring_config.config.tags.items()]

         self.pipeline_active_pipelines = "bizon_pipeline.active_pipelines"
         self.pipeline_records_synced = "bizon_pipeline.records_synced"
+        self.pipeline_large_records = "bizon_pipeline.large_records"

     def track_pipeline_status(self, pipeline_status: PipelineReturnStatus, extra_tags: Dict[str, str] = {}) -> None:
         """
@@ -55,7 +59,9 @@ class DatadogMonitor(AbstractMonitor):
             + [f"{key}:{value}" for key, value in extra_tags.items()],
         )

-    def track_records_synced(self, num_records: int, extra_tags: Dict[str, str] = {}) -> None:
+    def track_records_synced(
+        self, num_records: int, destination_id: str, extra_tags: Dict[str, str] = {}, headers: List[Dict[str, str]] = []
+    ) -> Union[List[Dict[str, str]], None]:
         """
         Track the number of records synced in the pipeline.

@@ -67,3 +73,83 @@ class DatadogMonitor(AbstractMonitor):
             value=num_records,
             tags=self.tags + [f"{key}:{value}" for key, value in extra_tags.items()],
         )
+        if os.getenv("DD_DATA_STREAMS_ENABLED") == "true":
+            from ddtrace.data_streams import set_produce_checkpoint
+
+            destination_type = self.sync_metadata.destination_alias
+
+            for header in headers:
+                if "x-datadog-sampling-priority" in header:
+                    del header["x-datadog-sampling-priority"]
+                if "dd-pathway-ctx-base64" in header:
+                    del header["dd-pathway-ctx-base64"]
+                set_produce_checkpoint(destination_type, destination_id, header.setdefault)
+            return headers
+
+    def track_large_records_synced(self, num_records: int, extra_tags: Dict[str, str] = {}) -> None:
+        statsd.increment(
+            self.pipeline_large_records,
+            value=num_records,
+            tags=self.tags + [f"{key}:{value}" for key, value in extra_tags.items()],
+        )
+
+    def track_source_iteration(self, records: List[SourceRecord]) -> Union[List[Dict[str, str]], None]:
+        """
+        Track the number of records consumed from a Kafka topic.
+
+        Args:
+            kafka_topic (str): The Kafka topic name
+        """
+
+        if os.getenv("DD_DATA_STREAMS_ENABLED") == "true":
+            from ddtrace.data_streams import set_consume_checkpoint
+
+            headers_list = []
+            for record in records:
+                headers = record.data.get("headers", {})
+                set_consume_checkpoint("kafka", record.data["topic"], headers.get)
+                headers_list.append(headers)
+            return headers_list
+
+    @contextmanager
+    def trace(self, operation_name: str, resource: str = None, extra_tags: Dict[str, str] = None):
+        """
+        Create a trace span for monitoring using Datadog APM.
+
+        Args:
+            operation_name (str): The name of the operation being traced
+            resource (str): The resource being operated on (e.g., topic name, table name)
+            extra_tags (Dict[str, str]): Additional tags for the trace
+
+        Yields:
+            A span object that can be used to add additional metadata
+        """
+        if not self.monitoring_config.config.enable_tracing:
+            yield None
+            return
+
+        try:
+            from ddtrace import tracer
+        except ImportError:
+            logger.warning("ddtrace not available, skipping tracing")
+            yield None
+            return
+
+        try:
+            # Combine tags
+            all_tags = self.tags.copy()
+            if extra_tags:
+                all_tags.extend([f"{key}:{value}" for key, value in extra_tags.items()])
+
+            # Create the span
+            with tracer.trace(operation_name, resource=resource) as span:
+                # Add tags to the span
+                for tag in all_tags:
+                    if ":" in tag:
+                        key, value = tag.split(":", 1)
+                        span.set_tag(key, value)
+                span.set_tag("_sampling_priority_v1", 1)
+                yield span
+        except Exception as e:
+            logger.warning(f"Failed to create trace: {e}")
+            yield None
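This trace context manager is what StreamingRunner now wraps every iteration in. It yields None instead of raising when tracing is disabled or ddtrace is unavailable, so call sites need no guards of their own; only the extra tagging below is illustrative:

    with monitor.trace(operation_name="bizon.stream.iteration") as span:
        source_iteration = source.get()
        if span is not None:  # span is None when tracing is off
            span.set_tag("num_records", len(source_iteration.records))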
bizon/monitoring/monitor.py CHANGED

@@ -1,15 +1,16 @@
 from abc import ABC, abstractmethod
-from typing import Dict
+from typing import Dict, List, Union

-from bizon.common.models import BizonConfig
+from bizon.common.models import SyncMetadata
 from bizon.engine.pipeline.models import PipelineReturnStatus
-from bizon.monitoring.config import MonitorType
+from bizon.monitoring.config import MonitoringConfig, MonitorType
+from bizon.source.models import SourceRecord


 class AbstractMonitor(ABC):
-    def __init__(self, bizon_config: BizonConfig):
-        self.bizon_config = bizon_config
-
+    def __init__(self, sync_metadata: SyncMetadata, monitoring_config: MonitoringConfig):
+        self.sync_metadata = sync_metadata
+        self.monitoring_config = monitoring_config

     @abstractmethod
     def track_pipeline_status(self, pipeline_status: PipelineReturnStatus, extra_tags: Dict[str, str] = {}) -> None:
@@ -21,22 +22,50 @@ class AbstractMonitor(ABC):
         """
         pass

-    def track_records_synced(self, num_records: int, extra_tags: Dict[str, str] = {}) -> None:
+    def track_source_iteration(self, records: List[SourceRecord], headers: Dict[str, str] = {}) -> None:
+        """
+        Run a process that tracks the source iteration.
+        """
+        pass
+
+    def track_records_synced(
+        self, num_records: int, destination_id: str, extra_tags: Dict[str, str] = {}, headers: Dict[str, str] = {}
+    ) -> None:
         """
         Track the number of records synced in the pipeline.
         """
         pass

+    def trace(self, operation_name: str, resource: str = None, extra_tags: Dict[str, str] = None):
+        """
+        Create a trace span for monitoring.
+
+        Args:
+            operation_name (str): The name of the operation being traced
+            resource (str): The resource being operated on (e.g., topic name, table name)
+            extra_tags (Dict[str, str]): Additional tags for the trace
+
+        Returns:
+            A context manager that can be used with 'with' statement
+        """
+        pass
+
+    def track_large_records_synced(self, num_records: int, extra_tags: Dict[str, str] = {}) -> None:
+        """
+        Track the number of large records synced in the destination system. This aims at helping to identify the source of the large records.
+        """
+        pass
+

 class MonitorFactory:
     @staticmethod
-    def get_monitor(bizon_config: BizonConfig) -> AbstractMonitor:
-        if bizon_config.monitoring is None:
+    def get_monitor(sync_metadata: SyncMetadata, monitoring_config: Union[MonitoringConfig, None]) -> AbstractMonitor:
+        if monitoring_config is None:
             from bizon.monitoring.noop.monitor import NoOpMonitor

-            return NoOpMonitor(bizon_config)
+            return NoOpMonitor(sync_metadata, monitoring_config)

-        if bizon_config.monitoring.type == MonitorType.DATADOG:
+        if monitoring_config.type == MonitorType.DATADOG:
             from bizon.monitoring.datadog.monitor import DatadogMonitor

-            return DatadogMonitor(bizon_config)
+            return DatadogMonitor(sync_metadata, monitoring_config)
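The factory now receives the sync metadata plus only the monitoring sub-config rather than the whole BizonConfig. The dispatch, as exercised by the new runner code:

    sync_metadata = SyncMetadata.from_bizon_config(job_id=job.id, config=bizon_config)
    monitor = MonitorFactory.get_monitor(sync_metadata, bizon_config.monitoring)
    # NoOpMonitor when bizon_config.monitoring is None,
    # DatadogMonitor when monitoring.type == MonitorType.DATADOG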
bizon/monitoring/noop/monitor.py CHANGED

@@ -1,11 +1,30 @@
-from bizon.common.models import BizonConfig
+from contextlib import contextmanager
+from typing import Dict
+
+from bizon.common.models import SyncMetadata
 from bizon.engine.pipeline.models import PipelineReturnStatus
+from bizon.monitoring.config import MonitoringConfig
 from bizon.monitoring.monitor import AbstractMonitor


 class NoOpMonitor(AbstractMonitor):
-    def __init__(self, bizon_config: BizonConfig):
-        super().__init__(bizon_config)
+    def __init__(self, sync_metadata: SyncMetadata, monitoring_config: MonitoringConfig):
+        super().__init__(sync_metadata, monitoring_config)

     def track_pipeline_status(self, pipeline_status: PipelineReturnStatus) -> None:
         pass
+
+    @contextmanager
+    def trace(self, operation_name: str, resource: str = None, extra_tags: Dict[str, str] = None):
+        """
+        No-op trace implementation.
+
+        Args:
+            operation_name (str): The name of the operation being traced
+            resource (str): The resource being operated on (e.g., topic name, table name)
+            extra_tags (Dict[str, str]): Additional tags for the trace
+
+        Yields:
+            None (no-op implementation)
+        """
+        yield None
bizon/source/auth/authenticators/abstract_oauth.py CHANGED

@@ -1,7 +1,9 @@
 from abc import abstractmethod
-from typing import Any, List, Mapping, MutableMapping, Tuple, Union
+from collections.abc import Mapping, MutableMapping
+from typing import Any, List, Tuple, Union

 import backoff
+import dpath
 import pendulum
 import requests
 from loguru import logger
@@ -52,8 +54,8 @@ class AbstractOauth2Authenticator(AuthBase):
             "refresh_token": self.get_refresh_token(),
         }

-        if self.get_scopes:
-            payload["scopes"] = list(self.get_scopes())
+        if self.get_scopes():
+            payload["scope"] = ",".join(self.get_scopes())

         if self.get_refresh_request_body():
             for key, val in self.get_refresh_request_body().items():
@@ -92,6 +94,8 @@ class AbstractOauth2Authenticator(AuthBase):
         :return: a tuple of (access_token, token_lifespan_in_seconds)
         """
         response_json = self._get_refresh_access_token_response()
+        if self.get_response_field_path():
+            response_json = dpath.get(response_json, self.get_response_field_path())
         return response_json[self.get_access_token_name()], int(response_json[self.get_expires_in_name()])

     @abstractmethod
@@ -138,6 +142,10 @@ class AbstractOauth2Authenticator(AuthBase):
     def get_grant_type(self) -> str:
         """Returns grant_type specified for requesting access_token"""

+    @abstractmethod
+    def get_response_field_path(self) -> str:
+        """Returns the path to the response field"""
+
     @property
     @abstractmethod
     def access_token(self) -> str:
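Two behavior changes are worth calling out: the old "if self.get_scopes:" tested the bound method itself (always truthy), whereas "if self.get_scopes():" tests the returned scopes, which are now sent comma-joined under the standard scope key; and the new get_response_field_path hook lets subclasses unwrap token endpoints that nest their payload. A runnable sketch of what refresh_access_token now does with a nested response, assuming dpath 2.x where dpath.get is top-level (the response shape is invented):

    import dpath

    response_json = {"data": {"access_token": "abc", "expires_in": 3600}}
    # With get_response_field_path() returning "data":
    unwrapped = dpath.get(response_json, "data")
    assert unwrapped["access_token"] == "abc"
    assert int(unwrapped["expires_in"]) == 3600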
bizon/source/auth/authenticators/basic.py CHANGED

@@ -27,7 +27,7 @@ class BasicHttpAuthenticator(AbstractHeaderAuthenticator):
         return f"{self._auth_method} {self._token}"

     def __init__(self, params: BasicHttpAuthParams):
-        auth_string = f"{params.username}:{params.password}".encode("utf8")
+        auth_string = f"{params.username}:{params.password}".encode()
         b64_encoded = base64.b64encode(auth_string).decode("utf8")
         self._auth_header = params.auth_header
         self._auth_method = params.auth_method