databricks4py 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. databricks4py/__init__.py +56 -0
  2. databricks4py/catalog.py +65 -0
  3. databricks4py/config/__init__.py +6 -0
  4. databricks4py/config/base.py +119 -0
  5. databricks4py/config/unity.py +72 -0
  6. databricks4py/filters/__init__.py +17 -0
  7. databricks4py/filters/base.py +154 -0
  8. databricks4py/io/__init__.py +40 -0
  9. databricks4py/io/checkpoint.py +98 -0
  10. databricks4py/io/dbfs.py +91 -0
  11. databricks4py/io/delta.py +564 -0
  12. databricks4py/io/merge.py +176 -0
  13. databricks4py/io/streaming.py +281 -0
  14. databricks4py/logging.py +39 -0
  15. databricks4py/metrics/__init__.py +22 -0
  16. databricks4py/metrics/base.py +66 -0
  17. databricks4py/metrics/delta_sink.py +75 -0
  18. databricks4py/metrics/logging_sink.py +20 -0
  19. databricks4py/migrations/__init__.py +27 -0
  20. databricks4py/migrations/alter.py +114 -0
  21. databricks4py/migrations/runner.py +241 -0
  22. databricks4py/migrations/schema_diff.py +136 -0
  23. databricks4py/migrations/validators.py +195 -0
  24. databricks4py/observability/__init__.py +24 -0
  25. databricks4py/observability/_utils.py +24 -0
  26. databricks4py/observability/batch_context.py +134 -0
  27. databricks4py/observability/health.py +223 -0
  28. databricks4py/observability/query_listener.py +236 -0
  29. databricks4py/py.typed +0 -0
  30. databricks4py/quality/__init__.py +26 -0
  31. databricks4py/quality/base.py +54 -0
  32. databricks4py/quality/expectations.py +184 -0
  33. databricks4py/quality/gate.py +90 -0
  34. databricks4py/retry.py +102 -0
  35. databricks4py/secrets.py +69 -0
  36. databricks4py/spark_session.py +68 -0
  37. databricks4py/testing/__init__.py +35 -0
  38. databricks4py/testing/assertions.py +111 -0
  39. databricks4py/testing/builders.py +127 -0
  40. databricks4py/testing/fixtures.py +134 -0
  41. databricks4py/testing/mocks.py +106 -0
  42. databricks4py/testing/temp_table.py +73 -0
  43. databricks4py/workflow.py +219 -0
  44. databricks4py-0.2.0.dist-info/METADATA +589 -0
  45. databricks4py-0.2.0.dist-info/RECORD +48 -0
  46. databricks4py-0.2.0.dist-info/WHEEL +5 -0
  47. databricks4py-0.2.0.dist-info/licenses/LICENSE +21 -0
  48. databricks4py-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,176 @@
1
+ """Fluent MERGE INTO builder for Delta Lake tables."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from dataclasses import dataclass
7
+ from datetime import datetime, timezone
8
+ from typing import TYPE_CHECKING, Any
9
+
10
+ from pyspark.sql import DataFrame, SparkSession
11
+
12
+ from databricks4py.spark_session import active_fallback
13
+
14
+ if TYPE_CHECKING:
15
+ from databricks4py.metrics.base import MetricsSink
16
+
17
+ __all__ = ["MergeBuilder", "MergeResult"]
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+ _SOURCE_ALIAS = "source"
22
+ _TARGET_ALIAS = "target"
23
+
24
+
25
@dataclass(frozen=True)
class MergeResult:
    """Immutable row counts describing what a MERGE did to the target table.

    Instances are frozen: assigning to a field after construction raises.
    """

    # Rows newly added to the target.
    rows_inserted: int
    # Existing target rows that were rewritten.
    rows_updated: int
    # Target rows that were removed.
    rows_deleted: int
32
+
33
+
34
class MergeBuilder:
    """Fluent builder for Delta Lake MERGE INTO operations.

    Configure the match condition with :meth:`on` (key equality) or
    :meth:`on_condition` (free-form expression), queue one or more ``when_*``
    clauses, then call :meth:`execute`. Every configuration method returns
    ``self`` so calls can be chained. Clauses are applied in the order they
    were queued.

    Example::

        result = (
            MergeBuilder("catalog.schema.target", source_df, spark)
            .on("id")
            .when_matched_update()
            .when_not_matched_insert()
            .execute()
        )

    Args:
        target_table_name: Name of the Delta table to merge into.
        source: DataFrame holding the rows to merge.
        spark: Optional SparkSession; passed through ``active_fallback``.
        metrics_sink: Optional sink that receives a ``merge_complete`` event
            after each successful :meth:`execute`.
    """

    def __init__(
        self,
        target_table_name: str,
        source: DataFrame,
        spark: SparkSession | None = None,
        *,
        metrics_sink: MetricsSink | None = None,
    ) -> None:
        self._target_table_name = target_table_name
        self._source = source
        self._spark = active_fallback(spark)
        self._metrics_sink = metrics_sink

        self._join_keys: list[str] = []
        self._join_condition: str | None = None
        self._actions: list[dict[str, Any]] = []

    def on(self, *keys: str) -> MergeBuilder:
        """Set merge join keys (ANDed equality conditions).

        Replaces any keys set by an earlier call. Ignored at execution time
        if :meth:`on_condition` supplied a non-empty expression.
        """
        self._join_keys = list(keys)
        return self

    def on_condition(self, condition: str) -> MergeBuilder:
        """Set a custom merge condition expression instead of key-based equality.

        The expression can refer to the two sides as ``target`` and ``source``,
        the aliases applied in :meth:`execute`.
        """
        self._join_condition = condition
        return self

    def when_matched_update(self, columns: list[str] | None = None) -> MergeBuilder:
        """Update matched rows. If columns is None (or empty), updates all columns."""
        self._actions.append({"type": "matched_update", "columns": columns})
        return self

    def when_matched_delete(self, condition: str | None = None) -> MergeBuilder:
        """Delete matched rows, optionally filtered by condition."""
        self._actions.append({"type": "matched_delete", "condition": condition})
        return self

    def when_not_matched_insert(self, columns: list[str] | None = None) -> MergeBuilder:
        """Insert non-matched source rows. If columns is None (or empty), inserts all."""
        self._actions.append({"type": "not_matched_insert", "columns": columns})
        return self

    def when_not_matched_by_source_delete(self, condition: str | None = None) -> MergeBuilder:
        """Delete target rows not present in source, optionally filtered by condition."""
        self._actions.append({"type": "not_matched_by_source_delete", "condition": condition})
        return self

    def _build_condition(self) -> str:
        """Return the merge condition expression.

        A custom condition takes precedence over join keys.

        Raises:
            ValueError: If neither a custom condition nor any join key was set.
                Without this check an empty string would be handed to Delta as
                the merge condition.
        """
        if self._join_condition:
            return self._join_condition
        if not self._join_keys:
            msg = "No merge condition set: call on(...) or on_condition(...) before execute()"
            raise ValueError(msg)
        return " AND ".join(
            f"{_TARGET_ALIAS}.{key} = {_SOURCE_ALIAS}.{key}" for key in self._join_keys
        )

    def execute(self) -> MergeResult:
        """Execute the merge and return metrics.

        Returns:
            Row counts read from the target table's most recent history entry.

        Raises:
            ValueError: If no merge condition was configured, or a queued
                action has an unknown type.
        """
        from delta.tables import DeltaTable

        # Validate the condition first so a misconfigured builder fails
        # before any Spark work is started.
        condition = self._build_condition()
        target_dt = DeltaTable.forName(self._spark, self._target_table_name)

        merger = target_dt.alias(_TARGET_ALIAS).merge(self._source.alias(_SOURCE_ALIAS), condition)
        for action in self._actions:
            merger = self._apply_action(merger, action)
        merger.execute()

        result = self._read_metrics()
        if self._metrics_sink is not None:
            self._emit_metrics(result)
        return result

    def _apply_action(self, merger: Any, action: dict[str, Any]) -> Any:
        """Attach one queued clause to ``merger`` and return the updated merger.

        Raises:
            ValueError: If ``action["type"]`` is not a known clause type.
        """
        action_type = action["type"]

        if action_type == "matched_update":
            columns = action["columns"]
            if columns:
                # Each listed column is overwritten with the source value.
                update_map = {col: f"{_SOURCE_ALIAS}.{col}" for col in columns}
                return merger.whenMatchedUpdate(set=update_map)
            return merger.whenMatchedUpdateAll()

        if action_type == "matched_delete":
            cond = action.get("condition")
            return merger.whenMatchedDelete(condition=cond) if cond else merger.whenMatchedDelete()

        if action_type == "not_matched_insert":
            columns = action["columns"]
            if columns:
                insert_map = {col: f"{_SOURCE_ALIAS}.{col}" for col in columns}
                return merger.whenNotMatchedInsert(values=insert_map)
            return merger.whenNotMatchedInsertAll()

        if action_type == "not_matched_by_source_delete":
            cond = action.get("condition")
            if cond:
                return merger.whenNotMatchedBySourceDelete(condition=cond)
            return merger.whenNotMatchedBySourceDelete()

        msg = f"Unknown merge action: {action_type}"
        raise ValueError(msg)

    def _read_metrics(self) -> MergeResult:
        """Build a MergeResult from the newest history entry of the target table.

        Missing metrics (or an empty history) are reported as zero.
        """
        # NOTE(review): this reads whatever the latest history entry is; if
        # another writer commits between the merge and this query the counts
        # may belong to that commit — confirm whether that matters here.
        history = self._spark.sql(f"DESCRIBE HISTORY {self._target_table_name} LIMIT 1")
        rows = history.collect()
        metrics: dict[str, str] = (rows[0]["operationMetrics"] if rows else None) or {}

        return MergeResult(
            rows_inserted=int(metrics.get("numTargetRowsInserted", 0)),
            rows_updated=int(metrics.get("numTargetRowsUpdated", 0)),
            rows_deleted=int(metrics.get("numTargetRowsDeleted", 0)),
        )

    def _emit_metrics(self, result: MergeResult) -> None:
        """Send a ``merge_complete`` event for ``result`` to the metrics sink.

        ``row_count`` is inserted plus updated rows; deletions are reported
        only in the event metadata.
        """
        from databricks4py.metrics.base import MetricEvent

        event = MetricEvent(
            job_name="merge",
            event_type="merge_complete",
            timestamp=datetime.now(tz=timezone.utc),
            row_count=result.rows_inserted + result.rows_updated,
            table_name=self._target_table_name,
            metadata={
                "rows_inserted": result.rows_inserted,
                "rows_updated": result.rows_updated,
                "rows_deleted": result.rows_deleted,
            },
        )
        self._metrics_sink.emit(event)  # type: ignore[union-attr]
@@ -0,0 +1,281 @@
1
+ """Structured Streaming utilities."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ import time
7
+ from abc import ABC, abstractmethod
8
+ from enum import Enum
9
+ from typing import TYPE_CHECKING
10
+
11
+ from pyspark.sql import DataFrame, SparkSession
12
+ from pyspark.sql.streaming import StreamingQuery
13
+
14
+ from databricks4py.filters.base import Filter
15
+ from databricks4py.spark_session import active_fallback
16
+
17
+ if TYPE_CHECKING:
18
+ from databricks4py.io.checkpoint import CheckpointManager
19
+ from databricks4py.metrics.base import MetricsSink
20
+
21
+ __all__ = [
22
+ "StreamingTableReader",
23
+ "StreamingTriggerOptions",
24
+ ]
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
class StreamingTriggerOptions(Enum):
    """Preset trigger settings for streaming writes.

    Each member's value is a keyword-argument dict that can be unpacked
    straight into ``writeStream.trigger(**value)``.

    Example::

        trigger = StreamingTriggerOptions.PROCESSING_TIME_1M
        stream.writeStream.trigger(**trigger.value).start()
    """

    # Fixed-interval micro-batches.
    PROCESSING_TIME_10S = dict(processingTime="10 seconds")
    PROCESSING_TIME_30S = dict(processingTime="30 seconds")
    PROCESSING_TIME_1M = dict(processingTime="1 minute")
    PROCESSING_TIME_5M = dict(processingTime="5 minutes")
    PROCESSING_TIME_10M = dict(processingTime="10 minutes")
    # Passes ``availableNow=True`` to the trigger.
    AVAILABLE_NOW = dict(availableNow=True)
46
+
47
+
48
class StreamingTableReader(ABC):
    """Abstract base for streaming micro-batch processors.

    Subclasses implement :meth:`process_batch` to handle each micro-batch.
    The :meth:`start` method wires up ``foreachBatch`` and returns a
    ``StreamingQuery``.

    Example::

        class MyProcessor(StreamingTableReader):
            def process_batch(self, df, batch_id):
                df.write.format("delta").mode("append").saveAsTable("output")

        reader = MyProcessor(
            source_table="catalog.schema.input",
            trigger=StreamingTriggerOptions.PROCESSING_TIME_1M,
            checkpoint_location="/checkpoints/my_reader",
        )
        query = reader.start()
        query.awaitTermination()

    Args:
        source_table: Table name or path to read as a stream.
        trigger: Trigger configuration, a raw dict, or None for the default.
        checkpoint_location: Path for streaming checkpoints. Auto-generated
            when a ``checkpoint_manager`` is provided and this is None.
        source_format: Source format (default ``"delta"``).
        row_filter: Optional Filter to apply before processing each batch.
        skip_empty_batches: Skip batches with 0 rows (default True).
        read_options: Additional read options as key-value pairs.
        checkpoint_manager: Optional CheckpointManager for auto-generating
            checkpoint paths.
        metrics_sink: Optional MetricsSink for emitting batch metrics.
        dead_letter_table: Fully qualified table name to write failed batches
            to. When set and ``process_batch`` raises, the offending DataFrame
            is written here with ``_dlq_error_message``, ``_dlq_error_timestamp``,
            and ``_dlq_batch_id`` columns appended. Uses ``mergeSchema=true`` so
            the table is auto-created on first failure.
        spark: Optional SparkSession.

    Raises:
        ValueError: If neither ``checkpoint_location`` nor
            ``checkpoint_manager`` is provided.
    """

    _DEFAULT_TRIGGER: dict[str, str] = {"processingTime": "10 seconds"}

    def __init__(
        self,
        source_table: str,
        trigger: StreamingTriggerOptions | dict | None = None,
        checkpoint_location: str | None = None,
        *,
        source_format: str = "delta",
        row_filter: Filter | None = None,
        skip_empty_batches: bool = True,
        read_options: dict[str, str] | None = None,
        checkpoint_manager: CheckpointManager | None = None,
        metrics_sink: MetricsSink | None = None,
        dead_letter_table: str | None = None,
        spark: SparkSession | None = None,
    ) -> None:
        self._spark = active_fallback(spark)
        self._source_table = source_table
        self._metrics_sink = metrics_sink
        self._dead_letter_table = dead_letter_table
        self._query: StreamingQuery | None = None

        # Resolve trigger to a plain dict owned by this instance.
        self._trigger_dict = self._resolve_trigger(trigger)

        # Auto-generate checkpoint path when manager is provided
        if checkpoint_location is None and checkpoint_manager is not None:
            checkpoint_location = checkpoint_manager.path_for(source_table, self.__class__.__name__)
        if checkpoint_location is None:
            raise ValueError(
                "checkpoint_location is required when no checkpoint_manager is provided"
            )

        self._checkpoint_location = checkpoint_location
        self._source_format = source_format
        self._filter = row_filter
        self._skip_empty_batches = skip_empty_batches
        self._read_options = read_options or {}

    @classmethod
    def _resolve_trigger(cls, trigger: StreamingTriggerOptions | dict | None) -> dict:
        """Turn the ``trigger`` constructor argument into trigger keyword args.

        Dict inputs are copied so that the class-level default, the enum
        member values, and the caller's own dict are never shared with (and
        cannot be mutated through) an instance.
        """
        if trigger is None:
            return dict(cls._DEFAULT_TRIGGER)
        if isinstance(trigger, dict):
            return dict(trigger)
        if isinstance(trigger, StreamingTriggerOptions):
            return dict(trigger.value)
        # Anything else is passed through untouched, as before.
        return trigger

    @abstractmethod
    def process_batch(self, df: DataFrame, batch_id: int) -> None:
        """Process a single micro-batch.

        Args:
            df: The micro-batch DataFrame.
            batch_id: The batch identifier.
        """
        ...

    def _write_to_dlq(self, df: DataFrame, batch_id: int, error_msg: str) -> None:
        """Append a failed batch to the dead-letter table with error metadata.

        Args:
            df: The batch that failed processing.
            batch_id: The batch identifier, stored in ``_dlq_batch_id``.
            error_msg: Error text stored in ``_dlq_error_message``.
        """
        assert self._dead_letter_table is not None  # only called when dlq is configured
        from datetime import datetime, timezone

        from pyspark.sql import functions as F

        error_df = (
            df.withColumn("_dlq_error_message", F.lit(error_msg))
            .withColumn(
                "_dlq_error_timestamp",
                F.lit(datetime.now(tz=timezone.utc).isoformat()).cast("timestamp"),
            )
            .withColumn("_dlq_batch_id", F.lit(batch_id))
        )
        (
            error_df.write.format("delta")
            .mode("append")
            .option("mergeSchema", "true")
            .saveAsTable(self._dead_letter_table)
        )
        # The log line carries only the first 200 characters of the error.
        logger.warning(
            "Wrote batch %d to DLQ %s: %s",
            batch_id,
            self._dead_letter_table,
            error_msg[:200],
        )

    def _foreach_batch_wrapper(self, df: DataFrame, batch_id: int) -> None:
        """Internal wrapper handling empty batch detection, filtering, and metrics.

        Behaviour when ``process_batch`` raises:

        * no dead-letter table configured: the exception propagates;
        * dead-letter write succeeds: the exception is swallowed and no
          ``batch_complete`` metric is emitted;
        * dead-letter write fails: the dead-letter exception propagates and
          the original traceback is logged.
        """
        if self._skip_empty_batches and df.isEmpty():
            logger.debug("Skipping empty batch %d", batch_id)
            return

        if self._filter is not None:
            df = self._filter(df)
            if self._skip_empty_batches and df.isEmpty():
                logger.debug("Skipping batch %d (empty after filtering)", batch_id)
                return

        count = df.count()
        logger.info("Processing batch %d (%d rows)", batch_id, count)

        start = time.monotonic()
        try:
            self.process_batch(df, batch_id)
        except Exception:  # noqa: BLE001 — broad catch intentional: DLQ must capture all failures
            if self._dead_letter_table is not None:
                import traceback

                original_tb = traceback.format_exc()
                try:
                    self._write_to_dlq(df, batch_id, original_tb)
                except Exception:
                    logger.error(
                        "DLQ write failed for batch %d; original error: %s",
                        batch_id,
                        original_tb[:500],
                        exc_info=True,
                    )
                    raise
                return
            raise
        duration_ms = (time.monotonic() - start) * 1000

        if self._metrics_sink is not None:
            from datetime import datetime, timezone

            from databricks4py.metrics.base import MetricEvent

            self._metrics_sink.emit(
                MetricEvent(
                    job_name=self.__class__.__name__,
                    event_type="batch_complete",
                    timestamp=datetime.now(tz=timezone.utc),
                    duration_ms=duration_ms,
                    row_count=count,
                    batch_id=batch_id,
                    table_name=self._source_table,
                )
            )

    def _build_read_stream(self) -> DataFrame:
        """Build the readStream DataFrame.

        Delta sources are opened with ``.table(...)``; every other format is
        opened with ``.load(...)``.
        """
        # NOTE(review): the class docs allow a path as ``source_table``, but a
        # delta source is always opened via ``.table`` — confirm that plain
        # filesystem paths are expected to work with the delta format.
        reader = self._spark.readStream.format(self._source_format)
        for key, value in self._read_options.items():
            reader = reader.option(key, value)

        if self._source_format == "delta":
            return reader.table(self._source_table)
        return reader.load(self._source_table)

    def start(self) -> StreamingQuery:
        """Start the streaming query.

        Returns:
            The active StreamingQuery.
        """
        stream_df = self._build_read_stream()

        self._query = (
            stream_df.writeStream.foreachBatch(self._foreach_batch_wrapper)
            .trigger(**self._trigger_dict)
            .option("checkpointLocation", self._checkpoint_location)
            .start()
        )

        logger.info(
            "Started streaming query from %s with trigger %s",
            self._source_table,
            self._trigger_dict,
        )
        return self._query

    def stop(self, timeout_seconds: int = 30) -> None:
        """Stop the streaming query and wait for graceful termination.

        Args:
            timeout_seconds: Maximum seconds to wait after calling stop.

        Raises:
            ValueError: If the query has not been started yet.
        """
        if self._query is None:
            raise ValueError("No active query. Call start() first.")
        self._query.stop()
        # With a timeout, awaitTermination reports whether the query actually
        # terminated; do not claim success when the wait timed out.
        terminated = self._query.awaitTermination(timeout=timeout_seconds)
        if terminated is False:
            logger.warning(
                "Streaming query did not terminate within %d seconds",
                timeout_seconds,
            )
        else:
            logger.info("Streaming query stopped")

    @property
    def query(self) -> StreamingQuery | None:
        """The active StreamingQuery, or None if start() has not been called."""
        return self._query

    @property
    def is_active(self) -> bool:
        """True if the streaming query is currently running."""
        return self._query is not None and self._query.isActive
@@ -0,0 +1,39 @@
1
+ """Logging configuration for Spark applications."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ import os
7
+
8
+ __all__ = ["configure_logging", "get_logger"]
9
+
10
+
11
def configure_logging(level: int | str | None = None) -> None:
    """Configure root logger with standard formatting.

    Reads ``LOG_LEVEL`` environment variable if no level is provided.
    Silences noisy ``py4j`` logger. Existing root handlers are replaced
    (``force=True``).

    String levels are normalised before use: surrounding whitespace is
    stripped, names are upper-cased (so ``"debug"`` works), purely numeric
    strings such as ``"10"`` are converted to integers, and a blank value
    falls back to ``INFO``.

    Args:
        level: Log level (e.g. ``logging.INFO``, ``"DEBUG"``).
            Defaults to ``LOG_LEVEL`` env var or ``INFO``.

    Raises:
        ValueError: If a string level is not a known level name.
    """
    if level is None:
        level = os.getenv("LOG_LEVEL", "INFO")

    if isinstance(level, str):
        # The logging module only recognises upper-case level names, so an
        # environment value like "debug" would otherwise raise ValueError.
        normalized = level.strip().upper() or "INFO"
        level = int(normalized) if normalized.isdigit() else normalized

    formatting = "[%(levelname)s] [%(asctime)s] %(name)s - %(message)s"
    logging.basicConfig(level=level, format=formatting, force=True)

    # Silence py4j noise
    logging.getLogger("py4j").setLevel(logging.ERROR)
29
+
30
+
31
def get_logger(name: str) -> logging.Logger:
    """Return the logger registered under ``name``.

    Thin convenience wrapper that delegates to ``logging.getLogger``.

    Args:
        name: Logger name (typically ``__name__``).
    """
    named_logger = logging.getLogger(name)
    return named_logger
@@ -0,0 +1,22 @@
1
+ """Metrics collection with pluggable sinks."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from databricks4py.metrics.base import CompositeMetricsSink, MetricEvent, MetricsSink
6
+ from databricks4py.metrics.logging_sink import LoggingMetricsSink
7
+
8
+ __all__ = [
9
+ "CompositeMetricsSink",
10
+ "DeltaMetricsSink",
11
+ "LoggingMetricsSink",
12
+ "MetricEvent",
13
+ "MetricsSink",
14
+ ]
15
+
16
+
17
def __getattr__(name: str):
    """Resolve ``DeltaMetricsSink`` lazily on attribute access.

    The ``delta_sink`` module is imported only when ``DeltaMetricsSink`` is
    first requested from this package; any other missing name raises
    ``AttributeError`` as usual.
    """
    if name != "DeltaMetricsSink":
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    from databricks4py.metrics.delta_sink import DeltaMetricsSink

    return DeltaMetricsSink
@@ -0,0 +1,66 @@
1
+ """Core metric types and sink abstractions."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from abc import ABC, abstractmethod
6
+ from dataclasses import dataclass, field
7
+ from datetime import datetime
8
+ from typing import Any
9
+
10
+ __all__ = ["CompositeMetricsSink", "MetricEvent", "MetricsSink"]
11
+
12
+
13
@dataclass(frozen=True)
class MetricEvent:
    """A single metrics observation.

    Only ``job_name``, ``event_type`` and ``timestamp`` are required; the
    remaining fields default to ``None`` (or an empty dict for ``metadata``).
    Instances are frozen.
    """

    # Name of the job or component that produced the event.
    job_name: str
    # Kind of event, e.g. "merge_complete" or "batch_complete".
    event_type: str
    # When the event was recorded.
    timestamp: datetime
    # Elapsed time in milliseconds. Typed as float because the streaming
    # reader reports fractional milliseconds and the Delta sink stores the
    # value in a double column; plain ints remain acceptable.
    duration_ms: float | None = None
    # Number of rows the event relates to.
    row_count: int | None = None
    # Table the event relates to.
    table_name: str | None = None
    # Streaming micro-batch identifier, when applicable.
    batch_id: int | None = None
    # Free-form extra details; each event gets its own dict by default.
    metadata: dict[str, Any] = field(default_factory=dict)
25
+
26
+
27
class MetricsSink(ABC):
    """Interface every metrics destination implements.

    :meth:`emit` is mandatory. :meth:`flush` only needs overriding by sinks
    that buffer events (e.g.
    :class:`~databricks4py.metrics.delta_sink.DeltaMetricsSink`).
    """

    @abstractmethod
    def emit(self, event: MetricEvent) -> None:
        """Deliver one metric event to the destination."""

    def flush(self) -> None:  # noqa: B027
        """Write out buffered events, if any. Does nothing by default."""
        return None
41
+
42
+
43
class CompositeMetricsSink(MetricsSink):
    """Sink that forwards every call to a group of other sinks.

    Sinks are visited in the order they were passed to the constructor. An
    exception raised by one sink propagates immediately, so later sinks are
    not reached for that call.

    Example::

        sink = CompositeMetricsSink(
            DeltaMetricsSink("catalog.schema.metrics"),
            LoggingMetricsSink(),
        )

    Args:
        sinks: One or more MetricsSink instances to delegate to.
    """

    def __init__(self, *sinks: MetricsSink) -> None:
        self._sinks = sinks

    def emit(self, event: MetricEvent) -> None:
        """Pass ``event`` to each wrapped sink in turn."""
        for delegate in self._sinks:
            delegate.emit(event)

    def flush(self) -> None:
        """Flush each wrapped sink in turn."""
        for delegate in self._sinks:
            delegate.flush()
@@ -0,0 +1,75 @@
1
+ """Delta Lake metrics sink."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from dataclasses import asdict
7
+ from typing import TYPE_CHECKING
8
+
9
+ from databricks4py.metrics.base import MetricEvent, MetricsSink
10
+ from databricks4py.spark_session import active_fallback
11
+
12
+ if TYPE_CHECKING:
13
+ from pyspark.sql import SparkSession
14
+
15
+ __all__ = ["DeltaMetricsSink"]
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
class DeltaMetricsSink(MetricsSink):
    """Buffers metric events and writes them to a Delta table on flush or threshold.

    Args:
        table_name: Table the events are appended to.
        spark: Optional SparkSession; passed through ``active_fallback``.
        buffer_size: Number of buffered events that triggers an automatic
            flush from :meth:`emit`.
    """

    def __init__(
        self,
        table_name: str,
        *,
        spark: SparkSession | None = None,
        buffer_size: int = 100,
    ) -> None:
        self._table_name = table_name
        self._spark = active_fallback(spark)
        self._buffer_size = buffer_size
        self._buffer: list[MetricEvent] = []

    def emit(self, event: MetricEvent) -> None:
        """Buffer ``event``; flush once the buffer reaches ``buffer_size``."""
        self._buffer.append(event)
        if len(self._buffer) >= self._buffer_size:
            self.flush()

    @staticmethod
    def _event_to_row(event: MetricEvent) -> dict[str, object]:
        """Convert an event into a dict matching the table schema.

        Values that are not int/float/str/None (the timestamp and the
        metadata dict, for instance) are stored as their ``str()`` form.
        """
        row = {
            key: value if isinstance(value, (int, float, str, type(None))) else str(value)
            for key, value in asdict(event).items()
        }
        # The duration column is a double; PySpark's schema verification does
        # not accept a Python int for DoubleType, so promote ints to float.
        duration = row["duration_ms"]
        if isinstance(duration, int):
            row["duration_ms"] = float(duration)
        return row

    def flush(self) -> None:
        """Append all buffered events to the Delta table and clear the buffer.

        Does nothing when the buffer is empty. If the write raises, the
        buffer is left intact so the events are retried on the next flush.
        """
        if not self._buffer:
            return

        from pyspark.sql.types import (
            DoubleType,
            IntegerType,
            StringType,
            StructField,
            StructType,
        )

        # NOTE(review): row_count and batch_id are 32-bit integer columns;
        # confirm that larger values cannot occur before relying on this.
        schema = StructType(
            [
                StructField("job_name", StringType()),
                StructField("event_type", StringType()),
                StructField("timestamp", StringType()),
                StructField("duration_ms", DoubleType()),
                StructField("row_count", IntegerType()),
                StructField("batch_id", IntegerType()),
                StructField("table_name", StringType()),
                StructField("metadata", StringType()),
            ]
        )

        rows = [self._event_to_row(event) for event in self._buffer]
        df = self._spark.createDataFrame(rows, schema=schema)
        df.write.format("delta").mode("append").saveAsTable(self._table_name)
        logger.info("Flushed %d metric events to %s", len(self._buffer), self._table_name)
        self._buffer.clear()
@@ -0,0 +1,20 @@
1
+ """Logging-based metrics sink."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import logging
7
+ from dataclasses import asdict
8
+
9
+ from databricks4py.metrics.base import MetricEvent, MetricsSink
10
+
11
+ __all__ = ["LoggingMetricsSink"]
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
class LoggingMetricsSink(MetricsSink):
    """Sink that writes each metric event to the module logger as a JSON string."""

    def emit(self, event: MetricEvent) -> None:
        """Log ``event`` at INFO level.

        Field values that JSON cannot encode natively are rendered with
        ``str()``.
        """
        payload = asdict(event)
        serialized = json.dumps(payload, default=str)
        logger.info(serialized)