databricks4py 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- databricks4py/__init__.py +56 -0
- databricks4py/catalog.py +65 -0
- databricks4py/config/__init__.py +6 -0
- databricks4py/config/base.py +119 -0
- databricks4py/config/unity.py +72 -0
- databricks4py/filters/__init__.py +17 -0
- databricks4py/filters/base.py +154 -0
- databricks4py/io/__init__.py +40 -0
- databricks4py/io/checkpoint.py +98 -0
- databricks4py/io/dbfs.py +91 -0
- databricks4py/io/delta.py +564 -0
- databricks4py/io/merge.py +176 -0
- databricks4py/io/streaming.py +281 -0
- databricks4py/logging.py +39 -0
- databricks4py/metrics/__init__.py +22 -0
- databricks4py/metrics/base.py +66 -0
- databricks4py/metrics/delta_sink.py +75 -0
- databricks4py/metrics/logging_sink.py +20 -0
- databricks4py/migrations/__init__.py +27 -0
- databricks4py/migrations/alter.py +114 -0
- databricks4py/migrations/runner.py +241 -0
- databricks4py/migrations/schema_diff.py +136 -0
- databricks4py/migrations/validators.py +195 -0
- databricks4py/observability/__init__.py +24 -0
- databricks4py/observability/_utils.py +24 -0
- databricks4py/observability/batch_context.py +134 -0
- databricks4py/observability/health.py +223 -0
- databricks4py/observability/query_listener.py +236 -0
- databricks4py/py.typed +0 -0
- databricks4py/quality/__init__.py +26 -0
- databricks4py/quality/base.py +54 -0
- databricks4py/quality/expectations.py +184 -0
- databricks4py/quality/gate.py +90 -0
- databricks4py/retry.py +102 -0
- databricks4py/secrets.py +69 -0
- databricks4py/spark_session.py +68 -0
- databricks4py/testing/__init__.py +35 -0
- databricks4py/testing/assertions.py +111 -0
- databricks4py/testing/builders.py +127 -0
- databricks4py/testing/fixtures.py +134 -0
- databricks4py/testing/mocks.py +106 -0
- databricks4py/testing/temp_table.py +73 -0
- databricks4py/workflow.py +219 -0
- databricks4py-0.2.0.dist-info/METADATA +589 -0
- databricks4py-0.2.0.dist-info/RECORD +48 -0
- databricks4py-0.2.0.dist-info/WHEEL +5 -0
- databricks4py-0.2.0.dist-info/licenses/LICENSE +21 -0
- databricks4py-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
"""Fluent MERGE INTO builder for Delta Lake tables."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from datetime import datetime, timezone
|
|
8
|
+
from typing import TYPE_CHECKING, Any
|
|
9
|
+
|
|
10
|
+
from pyspark.sql import DataFrame, SparkSession
|
|
11
|
+
|
|
12
|
+
from databricks4py.spark_session import active_fallback
|
|
13
|
+
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
from databricks4py.metrics.base import MetricsSink
|
|
16
|
+
|
|
17
|
+
__all__ = ["MergeBuilder", "MergeResult"]
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
_SOURCE_ALIAS = "source"
|
|
22
|
+
_TARGET_ALIAS = "target"
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataclass(frozen=True)
class MergeResult:
    """Immutable row counts reported for a completed MERGE.

    Attributes:
        rows_inserted: Number of rows inserted into the target.
        rows_updated: Number of target rows updated.
        rows_deleted: Number of target rows deleted.
    """

    rows_inserted: int
    rows_updated: int
    rows_deleted: int
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class MergeBuilder:
    """Fluent builder for Delta Lake MERGE INTO operations.

    Configure the join with :meth:`on` (key equality) or
    :meth:`on_condition` (free-form expression), queue one or more
    ``when_*`` clauses, then call :meth:`execute`. Clauses are replayed
    against the Delta merge builder in the order they were added. When both
    :meth:`on` and :meth:`on_condition` have been called, the explicit
    condition takes precedence.

    Example::

        result = (
            MergeBuilder("catalog.schema.target", source_df, spark)
            .on("id")
            .when_matched_update()
            .when_not_matched_insert()
            .execute()
        )

    Args:
        target_table_name: Name of the Delta table to merge into.
        source: DataFrame holding the rows to merge.
        spark: Optional SparkSession; resolved through ``active_fallback``.
        metrics_sink: Optional sink that receives a ``merge_complete`` event
            after the merge has executed.
    """

    def __init__(
        self,
        target_table_name: str,
        source: DataFrame,
        spark: SparkSession | None = None,
        *,
        metrics_sink: MetricsSink | None = None,
    ) -> None:
        self._target_table_name = target_table_name
        self._source = source
        self._spark = active_fallback(spark)
        self._metrics_sink = metrics_sink

        self._join_keys: list[str] = []
        self._join_condition: str | None = None
        # Each queued clause is a dict holding a "type" plus its options.
        self._actions: list[dict[str, Any]] = []

    def on(self, *keys: str) -> MergeBuilder:
        """Set merge join keys (ANDed equality conditions)."""
        self._join_keys = list(keys)
        return self

    def on_condition(self, condition: str) -> MergeBuilder:
        """Set a custom merge condition expression instead of key-based equality."""
        self._join_condition = condition
        return self

    def when_matched_update(self, columns: list[str] | None = None) -> MergeBuilder:
        """Update matched rows. If columns is None or empty, updates all columns."""
        self._actions.append({"type": "matched_update", "columns": columns})
        return self

    def when_matched_delete(self, condition: str | None = None) -> MergeBuilder:
        """Delete matched rows, optionally filtered by condition."""
        self._actions.append({"type": "matched_delete", "condition": condition})
        return self

    def when_not_matched_insert(self, columns: list[str] | None = None) -> MergeBuilder:
        """Insert non-matched source rows. If columns is None or empty, inserts all."""
        self._actions.append({"type": "not_matched_insert", "columns": columns})
        return self

    def when_not_matched_by_source_delete(self, condition: str | None = None) -> MergeBuilder:
        """Delete target rows not present in source, optionally filtered by condition."""
        self._actions.append({"type": "not_matched_by_source_delete", "condition": condition})
        return self

    def _build_condition(self) -> str:
        """Return the merge condition expression.

        Raises:
            ValueError: If neither :meth:`on` nor :meth:`on_condition` supplied
                anything to join on.
        """
        if self._join_condition:
            return self._join_condition
        if not self._join_keys:
            # An empty condition string would only fail later, inside Delta's
            # expression parser, with a far less helpful error.
            msg = "No merge condition configured; call on() or on_condition() before execute()"
            raise ValueError(msg)
        return " AND ".join(
            f"{_TARGET_ALIAS}.{key} = {_SOURCE_ALIAS}.{key}" for key in self._join_keys
        )

    def execute(self) -> MergeResult:
        """Execute the merge and return metrics.

        Returns:
            Row counts read back from the target table's latest history entry.

        Raises:
            ValueError: If no merge condition was configured or a queued
                clause has an unknown type.
        """
        from delta.tables import DeltaTable

        # Validate the configuration before touching the target table.
        condition = self._build_condition()
        target_dt = DeltaTable.forName(self._spark, self._target_table_name)

        merger = target_dt.alias(_TARGET_ALIAS).merge(self._source.alias(_SOURCE_ALIAS), condition)
        for action in self._actions:
            merger = self._apply_action(merger, action)
        merger.execute()

        result = self._read_metrics()
        # Identity check: a sink object that happens to be falsy must still
        # receive the event.
        if self._metrics_sink is not None:
            self._emit_metrics(result)
        return result

    def _apply_action(self, merger: Any, action: dict[str, Any]) -> Any:
        """Apply one queued clause to *merger* and return the resulting builder.

        Raises:
            ValueError: If ``action["type"]`` is not a known clause type.
        """
        action_type = action["type"]

        if action_type == "matched_update":
            columns = action["columns"]
            if columns:
                update_map = {col: f"{_SOURCE_ALIAS}.{col}" for col in columns}
                return merger.whenMatchedUpdate(set=update_map)
            return merger.whenMatchedUpdateAll()

        if action_type == "matched_delete":
            cond = action.get("condition")
            return merger.whenMatchedDelete(condition=cond) if cond else merger.whenMatchedDelete()

        if action_type == "not_matched_insert":
            columns = action["columns"]
            if columns:
                insert_map = {col: f"{_SOURCE_ALIAS}.{col}" for col in columns}
                return merger.whenNotMatchedInsert(values=insert_map)
            return merger.whenNotMatchedInsertAll()

        if action_type == "not_matched_by_source_delete":
            cond = action.get("condition")
            if cond:
                return merger.whenNotMatchedBySourceDelete(condition=cond)
            return merger.whenNotMatchedBySourceDelete()

        msg = f"Unknown merge action: {action_type}"
        raise ValueError(msg)

    def _read_metrics(self) -> MergeResult:
        """Build a :class:`MergeResult` from the newest table history entry.

        Missing history, missing ``operationMetrics`` and missing or null
        individual counters are all reported as zero.
        """
        # NOTE(review): assumes the newest history entry belongs to the merge
        # that just ran; a concurrent writer could commit in between - confirm.
        history = self._spark.sql(f"DESCRIBE HISTORY {self._target_table_name} LIMIT 1")
        rows = history.collect()
        metrics: dict[str, str] = (rows[0]["operationMetrics"] or {}) if rows else {}

        def counter(key: str) -> int:
            return int(metrics.get(key) or 0)

        return MergeResult(
            rows_inserted=counter("numTargetRowsInserted"),
            rows_updated=counter("numTargetRowsUpdated"),
            rows_deleted=counter("numTargetRowsDeleted"),
        )

    def _emit_metrics(self, result: MergeResult) -> None:
        """Send a ``merge_complete`` event describing *result* to the sink.

        ``row_count`` is inserted plus updated rows; deletions are reported
        only through the ``metadata`` mapping.
        """
        from databricks4py.metrics.base import MetricEvent

        event = MetricEvent(
            job_name="merge",
            event_type="merge_complete",
            timestamp=datetime.now(tz=timezone.utc),
            row_count=result.rows_inserted + result.rows_updated,
            table_name=self._target_table_name,
            metadata={
                "rows_inserted": result.rows_inserted,
                "rows_updated": result.rows_updated,
                "rows_deleted": result.rows_deleted,
            },
        )
        self._metrics_sink.emit(event)  # type: ignore[union-attr]
|
|
@@ -0,0 +1,281 @@
|
|
|
1
|
+
"""Structured Streaming utilities."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import time
|
|
7
|
+
from abc import ABC, abstractmethod
|
|
8
|
+
from enum import Enum
|
|
9
|
+
from typing import TYPE_CHECKING
|
|
10
|
+
|
|
11
|
+
from pyspark.sql import DataFrame, SparkSession
|
|
12
|
+
from pyspark.sql.streaming import StreamingQuery
|
|
13
|
+
|
|
14
|
+
from databricks4py.filters.base import Filter
|
|
15
|
+
from databricks4py.spark_session import active_fallback
|
|
16
|
+
|
|
17
|
+
if TYPE_CHECKING:
|
|
18
|
+
from databricks4py.io.checkpoint import CheckpointManager
|
|
19
|
+
from databricks4py.metrics.base import MetricsSink
|
|
20
|
+
|
|
21
|
+
__all__ = [
|
|
22
|
+
"StreamingTableReader",
|
|
23
|
+
"StreamingTriggerOptions",
|
|
24
|
+
]
|
|
25
|
+
|
|
26
|
+
logger = logging.getLogger(__name__)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class StreamingTriggerOptions(Enum):
    """Preset trigger settings for Structured Streaming writes.

    Every member's value is a keyword-argument dict that can be unpacked
    straight into ``writeStream.trigger(**value)``.

    Example::

        trigger = StreamingTriggerOptions.PROCESSING_TIME_1M
        stream.writeStream.trigger(**trigger.value).start()
    """

    # Fixed-interval micro-batches.
    PROCESSING_TIME_10S = {"processingTime": "10 seconds"}
    PROCESSING_TIME_30S = {"processingTime": "30 seconds"}
    PROCESSING_TIME_1M = {"processingTime": "1 minute"}
    PROCESSING_TIME_5M = {"processingTime": "5 minutes"}
    PROCESSING_TIME_10M = {"processingTime": "10 minutes"}
    # Passes ``availableNow=True`` to the trigger.
    AVAILABLE_NOW = {"availableNow": True}
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class StreamingTableReader(ABC):
    """Abstract base for streaming micro-batch processors.

    Subclasses implement :meth:`process_batch` to handle each micro-batch.
    The :meth:`start` method wires up ``foreachBatch`` and returns a
    ``StreamingQuery``.

    Example::

        class MyProcessor(StreamingTableReader):
            def process_batch(self, df, batch_id):
                df.write.format("delta").mode("append").saveAsTable("output")

        reader = MyProcessor(
            source_table="catalog.schema.input",
            trigger=StreamingTriggerOptions.PROCESSING_TIME_1M,
            checkpoint_location="/checkpoints/my_reader",
        )
        query = reader.start()
        query.awaitTermination()

    Args:
        source_table: Table name or path to read as a stream.
        trigger: Trigger configuration, a raw dict, or None for the default.
            The configuration is copied, so later changes to the passed dict
            do not affect this reader.
        checkpoint_location: Path for streaming checkpoints. Auto-generated
            when a ``checkpoint_manager`` is provided and this is None.
        source_format: Source format (default ``"delta"``).
        row_filter: Optional Filter to apply before processing each batch.
        skip_empty_batches: Skip batches with 0 rows (default True).
        read_options: Additional read options as key-value pairs.
        checkpoint_manager: Optional CheckpointManager for auto-generating
            checkpoint paths.
        metrics_sink: Optional MetricsSink for emitting batch metrics.
        dead_letter_table: Fully qualified table name to write failed batches
            to. When set and ``process_batch`` raises, the offending DataFrame
            is written here with ``_dlq_error_message``, ``_dlq_error_timestamp``,
            and ``_dlq_batch_id`` columns appended. Uses ``mergeSchema=true`` so
            the table is auto-created on first failure.
        spark: Optional SparkSession.

    Raises:
        ValueError: If neither ``checkpoint_location`` nor
            ``checkpoint_manager`` is provided.
    """

    _DEFAULT_TRIGGER: dict[str, str] = {"processingTime": "10 seconds"}

    def __init__(
        self,
        source_table: str,
        trigger: StreamingTriggerOptions | dict | None = None,
        checkpoint_location: str | None = None,
        *,
        source_format: str = "delta",
        row_filter: Filter | None = None,
        skip_empty_batches: bool = True,
        read_options: dict[str, str] | None = None,
        checkpoint_manager: CheckpointManager | None = None,
        metrics_sink: MetricsSink | None = None,
        dead_letter_table: str | None = None,
        spark: SparkSession | None = None,
    ) -> None:
        self._spark = active_fallback(spark)
        self._source_table = source_table
        self._metrics_sink = metrics_sink
        self._dead_letter_table = dead_letter_table
        self._query: StreamingQuery | None = None

        # Resolve trigger to a plain dict owned by this instance
        self._trigger_dict = self._resolve_trigger(trigger)

        # Auto-generate checkpoint path when manager is provided
        if checkpoint_location is None and checkpoint_manager is not None:
            checkpoint_location = checkpoint_manager.path_for(source_table, self.__class__.__name__)
        if checkpoint_location is None:
            raise ValueError(
                "checkpoint_location is required when no checkpoint_manager is provided"
            )

        self._checkpoint_location = checkpoint_location
        self._source_format = source_format
        self._filter = row_filter
        self._skip_empty_batches = skip_empty_batches
        self._read_options = read_options or {}

    @classmethod
    def _resolve_trigger(cls, trigger: StreamingTriggerOptions | dict | None) -> dict:
        """Return a fresh trigger keyword dict for ``writeStream.trigger``.

        A copy is always returned so the class-level default, the enum
        member's value and the caller's dict are never shared with (or
        mutated through) an instance.
        """
        if trigger is None:
            return dict(cls._DEFAULT_TRIGGER)
        if isinstance(trigger, StreamingTriggerOptions):
            return dict(trigger.value)
        return dict(trigger)

    @abstractmethod
    def process_batch(self, df: DataFrame, batch_id: int) -> None:
        """Process a single micro-batch.

        Args:
            df: The micro-batch DataFrame.
            batch_id: The batch identifier.
        """
        ...

    def _write_to_dlq(self, df: DataFrame, batch_id: int, error_msg: str) -> None:
        """Append a failed batch to the dead-letter table with error metadata.

        Args:
            df: The batch that failed processing.
            batch_id: Identifier of the failed batch.
            error_msg: Error text stored in ``_dlq_error_message``.

        Raises:
            RuntimeError: If no dead-letter table is configured.
        """
        # Explicit check rather than ``assert`` so the guard survives ``python -O``.
        if self._dead_letter_table is None:
            raise RuntimeError("dead_letter_table is not configured")

        from datetime import datetime, timezone

        from pyspark.sql import functions as F

        error_df = (
            df.withColumn("_dlq_error_message", F.lit(error_msg))
            .withColumn(
                "_dlq_error_timestamp",
                F.lit(datetime.now(tz=timezone.utc).isoformat()).cast("timestamp"),
            )
            .withColumn("_dlq_batch_id", F.lit(batch_id))
        )
        (
            error_df.write.format("delta")
            .mode("append")
            .option("mergeSchema", "true")
            .saveAsTable(self._dead_letter_table)
        )
        logger.warning(
            "Wrote batch %d to DLQ %s: %s",
            batch_id,
            self._dead_letter_table,
            error_msg[:200],
        )

    def _foreach_batch_wrapper(self, df: DataFrame, batch_id: int) -> None:
        """Internal wrapper handling empty batch detection, filtering, and metrics.

        When ``process_batch`` raises and a dead-letter table is configured,
        the batch is written there and the error is swallowed (no metric is
        emitted for that batch). If the dead-letter write itself fails, that
        failure is raised. Without a dead-letter table the original error
        propagates.
        """
        if self._skip_empty_batches and df.isEmpty():
            logger.debug("Skipping empty batch %d", batch_id)
            return

        if self._filter is not None:
            df = self._filter(df)
            if self._skip_empty_batches and df.isEmpty():
                logger.debug("Skipping batch %d (empty after filtering)", batch_id)
                return

        count = df.count()
        logger.info("Processing batch %d (%d rows)", batch_id, count)

        start = time.monotonic()
        try:
            self.process_batch(df, batch_id)
        except Exception:  # noqa: BLE001 — broad catch intentional: DLQ must capture all failures
            if self._dead_letter_table is not None:
                import traceback

                original_tb = traceback.format_exc()
                try:
                    self._write_to_dlq(df, batch_id, original_tb)
                except Exception:
                    logger.error(
                        "DLQ write failed for batch %d; original error: %s",
                        batch_id,
                        original_tb[:500],
                        exc_info=True,
                    )
                    raise
                return
            raise
        # Kept as a float: fractional milliseconds are reported as measured.
        duration_ms = (time.monotonic() - start) * 1000

        if self._metrics_sink is not None:
            from datetime import datetime, timezone

            from databricks4py.metrics.base import MetricEvent

            self._metrics_sink.emit(
                MetricEvent(
                    job_name=self.__class__.__name__,
                    event_type="batch_complete",
                    timestamp=datetime.now(tz=timezone.utc),
                    duration_ms=duration_ms,
                    row_count=count,
                    batch_id=batch_id,
                    table_name=self._source_table,
                )
            )

    def _build_read_stream(self) -> DataFrame:
        """Build the readStream DataFrame.

        Delta sources are opened with ``reader.table(...)``; every other
        format goes through ``reader.load(...)``.
        """
        reader = self._spark.readStream.format(self._source_format)
        for key, value in self._read_options.items():
            reader = reader.option(key, value)

        if self._source_format == "delta":
            return reader.table(self._source_table)
        return reader.load(self._source_table)

    def start(self) -> StreamingQuery:
        """Start the streaming query.

        Returns:
            The active StreamingQuery.
        """
        stream_df = self._build_read_stream()

        self._query = (
            stream_df.writeStream.foreachBatch(self._foreach_batch_wrapper)
            .trigger(**self._trigger_dict)
            .option("checkpointLocation", self._checkpoint_location)
            .start()
        )

        logger.info(
            "Started streaming query from %s with trigger %s",
            self._source_table,
            self._trigger_dict,
        )
        return self._query

    def stop(self, timeout_seconds: int = 30) -> None:
        """Stop the streaming query and wait for graceful termination.

        Logs a warning instead of the usual "stopped" message when the query
        has not terminated once the timeout elapses.

        Args:
            timeout_seconds: Maximum seconds to wait after calling stop.

        Raises:
            ValueError: If the query has not been started yet.
        """
        if self._query is None:
            raise ValueError("No active query. Call start() first.")
        self._query.stop()
        terminated = self._query.awaitTermination(timeout=timeout_seconds)
        if terminated is False:
            logger.warning(
                "Streaming query did not terminate within %d seconds", timeout_seconds
            )
            return
        logger.info("Streaming query stopped")

    @property
    def query(self) -> StreamingQuery | None:
        """The active StreamingQuery, or None if start() has not been called."""
        return self._query

    @property
    def is_active(self) -> bool:
        """True if the streaming query is currently running."""
        return self._query is not None and self._query.isActive
|
databricks4py/logging.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""Logging configuration for Spark applications."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import os
|
|
7
|
+
|
|
8
|
+
__all__ = ["configure_logging", "get_logger"]
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def configure_logging(level: int | str | None = None) -> None:
    """Configure root logger with standard formatting.

    Reads ``LOG_LEVEL`` environment variable if no level is provided.
    Silences noisy ``py4j`` logger. Any existing root handlers are replaced
    (``force=True``).

    String levels are normalised before use: surrounding whitespace is
    ignored, names are matched case-insensitively (``"debug"`` works), purely
    numeric strings such as ``"10"`` are treated as integer levels, and an
    empty string falls back to ``INFO``.

    Args:
        level: Log level (e.g. ``logging.INFO``, ``"DEBUG"``).
            Defaults to ``LOG_LEVEL`` env var or ``INFO``.

    Raises:
        ValueError: If a string level is not a known level name.
    """
    if level is None:
        level = os.getenv("LOG_LEVEL", "INFO")

    if isinstance(level, str):
        # ``logging`` only recognises upper-case level names, and values that
        # come from the environment are always strings.
        text = level.strip()
        if not text:
            level = "INFO"
        elif text.isdigit():
            level = int(text)
        else:
            level = text.upper()

    formatting = "[%(levelname)s] [%(asctime)s] %(name)s - %(message)s"
    logging.basicConfig(level=level, format=formatting, force=True)

    # Silence py4j noise
    logging.getLogger("py4j").setLevel(logging.ERROR)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def get_logger(name: str) -> logging.Logger:
    """Return the logger registered under *name*.

    Thin convenience wrapper that delegates to ``logging.getLogger``.

    Args:
        name: Logger name (typically ``__name__``).
    """
    named_logger = logging.getLogger(name)
    return named_logger
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""Metrics collection with pluggable sinks."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from databricks4py.metrics.base import CompositeMetricsSink, MetricEvent, MetricsSink
|
|
6
|
+
from databricks4py.metrics.logging_sink import LoggingMetricsSink
|
|
7
|
+
|
|
8
|
+
__all__ = [
|
|
9
|
+
"CompositeMetricsSink",
|
|
10
|
+
"DeltaMetricsSink",
|
|
11
|
+
"LoggingMetricsSink",
|
|
12
|
+
"MetricEvent",
|
|
13
|
+
"MetricsSink",
|
|
14
|
+
]
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def __getattr__(name: str):
    """Resolve ``DeltaMetricsSink`` lazily on attribute access.

    The ``delta_sink`` module is imported only when ``DeltaMetricsSink`` is
    actually requested; every other unknown attribute raises
    ``AttributeError``.
    """
    if name != "DeltaMetricsSink":
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    from databricks4py.metrics.delta_sink import DeltaMetricsSink

    return DeltaMetricsSink
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
"""Core metric types and sink abstractions."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from abc import ABC, abstractmethod
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
from datetime import datetime
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
__all__ = ["CompositeMetricsSink", "MetricEvent", "MetricsSink"]
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass(frozen=True)
class MetricEvent:
    """A single, immutable metrics observation.

    Attributes:
        job_name: Name of the job or component reporting the event.
        event_type: Label describing what happened (e.g. ``"batch_complete"``).
        timestamp: When the event was recorded.
        duration_ms: Elapsed time in milliseconds, if measured. May be
            fractional.
        row_count: Number of rows associated with the event, if known.
        table_name: Table the event relates to, if any.
        batch_id: Streaming batch identifier, if any.
        metadata: Free-form extra key/value details.
    """

    job_name: str
    event_type: str
    timestamp: datetime
    # Float rather than int: the streaming reader reports fractional
    # milliseconds and the Delta sink persists this field as a double.
    duration_ms: float | None = None
    row_count: int | None = None
    table_name: str | None = None
    batch_id: int | None = None
    metadata: dict[str, Any] = field(default_factory=dict)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class MetricsSink(ABC):
    """Interface every metrics destination implements.

    Concrete sinks must provide :meth:`emit`. Sinks that buffer events
    (e.g. :class:`~databricks4py.metrics.delta_sink.DeltaMetricsSink`)
    should also override :meth:`flush`.
    """

    @abstractmethod
    def emit(self, event: MetricEvent) -> None:
        """Deliver one metric event to this sink's destination."""
        ...

    def flush(self) -> None:  # noqa: B027
        """Write out any buffered events; the base implementation does nothing."""
        return None
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class CompositeMetricsSink(MetricsSink):
    """Sink that forwards every call to a group of other sinks.

    Example::

        sink = CompositeMetricsSink(
            DeltaMetricsSink("catalog.schema.metrics"),
            LoggingMetricsSink(),
        )

    Args:
        sinks: One or more MetricsSink instances to delegate to.
    """

    def __init__(self, *sinks: MetricsSink) -> None:
        self._sinks = sinks

    def emit(self, event: MetricEvent) -> None:
        """Pass *event* to each delegate, in the order they were supplied."""
        for delegate in self._sinks:
            delegate.emit(event)

    def flush(self) -> None:
        """Flush each delegate, in the order they were supplied."""
        for delegate in self._sinks:
            delegate.flush()
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
"""Delta Lake metrics sink."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from dataclasses import asdict
|
|
7
|
+
from typing import TYPE_CHECKING
|
|
8
|
+
|
|
9
|
+
from databricks4py.metrics.base import MetricEvent, MetricsSink
|
|
10
|
+
from databricks4py.spark_session import active_fallback
|
|
11
|
+
|
|
12
|
+
if TYPE_CHECKING:
|
|
13
|
+
from pyspark.sql import SparkSession
|
|
14
|
+
|
|
15
|
+
__all__ = ["DeltaMetricsSink"]
|
|
16
|
+
|
|
17
|
+
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class DeltaMetricsSink(MetricsSink):
    """Buffers metric events and writes them to a Delta table on flush or threshold.

    Args:
        table_name: Delta table that events are appended to.
        spark: Optional SparkSession; resolved through ``active_fallback``.
        buffer_size: Number of buffered events that triggers an automatic
            flush from :meth:`emit`.
    """

    def __init__(
        self,
        table_name: str,
        *,
        spark: SparkSession | None = None,
        buffer_size: int = 100,
    ) -> None:
        self._table_name = table_name
        self._spark = active_fallback(spark)
        self._buffer_size = buffer_size
        self._buffer: list[MetricEvent] = []

    def emit(self, event: MetricEvent) -> None:
        """Buffer *event*, flushing once the buffer reaches ``buffer_size``."""
        self._buffer.append(event)
        if len(self._buffer) >= self._buffer_size:
            self.flush()

    @staticmethod
    def _to_row(event: MetricEvent) -> dict[str, object]:
        """Convert *event* into a plain dict matching the sink's table schema.

        Values that are not int, float, str or None (e.g. the timestamp and
        the metadata dict) are stored as ``str(value)``.
        """
        row: dict[str, object] = {
            key: value if isinstance(value, (int, float, str, type(None))) else str(value)
            for key, value in asdict(event).items()
        }
        # The column is a double; an integer duration would be rejected by
        # Spark's schema verification, so widen it to float first.
        duration = row["duration_ms"]
        if isinstance(duration, int):
            row["duration_ms"] = float(duration)
        return row

    def flush(self) -> None:
        """Append all buffered events to the Delta table and clear the buffer.

        Does nothing when the buffer is empty. The buffer is cleared only
        after the write has returned, so events are kept if the write raises.
        """
        if not self._buffer:
            return

        from pyspark.sql.types import (
            DoubleType,
            IntegerType,
            StringType,
            StructField,
            StructType,
        )

        schema = StructType(
            [
                StructField("job_name", StringType()),
                StructField("event_type", StringType()),
                StructField("timestamp", StringType()),
                StructField("duration_ms", DoubleType()),
                StructField("row_count", IntegerType()),
                StructField("batch_id", IntegerType()),
                StructField("table_name", StringType()),
                StructField("metadata", StringType()),
            ]
        )

        rows = [self._to_row(event) for event in self._buffer]
        df = self._spark.createDataFrame(rows, schema=schema)
        df.write.format("delta").mode("append").saveAsTable(self._table_name)
        logger.info("Flushed %d metric events to %s", len(self._buffer), self._table_name)
        self._buffer.clear()
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
"""Logging-based metrics sink."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import logging
|
|
7
|
+
from dataclasses import asdict
|
|
8
|
+
|
|
9
|
+
from databricks4py.metrics.base import MetricEvent, MetricsSink
|
|
10
|
+
|
|
11
|
+
__all__ = ["LoggingMetricsSink"]
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class LoggingMetricsSink(MetricsSink):
    """Sink that writes each metric event to the module logger as a JSON string."""

    def emit(self, event: MetricEvent) -> None:
        """Serialise *event* to JSON and log it at INFO level.

        Values JSON cannot encode natively are rendered with ``str``.
        """
        payload = asdict(event)
        encoded = json.dumps(payload, default=str)
        logger.info(encoded)
|