databricks4py 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- databricks4py/__init__.py +56 -0
- databricks4py/catalog.py +65 -0
- databricks4py/config/__init__.py +6 -0
- databricks4py/config/base.py +119 -0
- databricks4py/config/unity.py +72 -0
- databricks4py/filters/__init__.py +17 -0
- databricks4py/filters/base.py +154 -0
- databricks4py/io/__init__.py +40 -0
- databricks4py/io/checkpoint.py +98 -0
- databricks4py/io/dbfs.py +91 -0
- databricks4py/io/delta.py +564 -0
- databricks4py/io/merge.py +176 -0
- databricks4py/io/streaming.py +281 -0
- databricks4py/logging.py +39 -0
- databricks4py/metrics/__init__.py +22 -0
- databricks4py/metrics/base.py +66 -0
- databricks4py/metrics/delta_sink.py +75 -0
- databricks4py/metrics/logging_sink.py +20 -0
- databricks4py/migrations/__init__.py +27 -0
- databricks4py/migrations/alter.py +114 -0
- databricks4py/migrations/runner.py +241 -0
- databricks4py/migrations/schema_diff.py +136 -0
- databricks4py/migrations/validators.py +195 -0
- databricks4py/observability/__init__.py +24 -0
- databricks4py/observability/_utils.py +24 -0
- databricks4py/observability/batch_context.py +134 -0
- databricks4py/observability/health.py +223 -0
- databricks4py/observability/query_listener.py +236 -0
- databricks4py/py.typed +0 -0
- databricks4py/quality/__init__.py +26 -0
- databricks4py/quality/base.py +54 -0
- databricks4py/quality/expectations.py +184 -0
- databricks4py/quality/gate.py +90 -0
- databricks4py/retry.py +102 -0
- databricks4py/secrets.py +69 -0
- databricks4py/spark_session.py +68 -0
- databricks4py/testing/__init__.py +35 -0
- databricks4py/testing/assertions.py +111 -0
- databricks4py/testing/builders.py +127 -0
- databricks4py/testing/fixtures.py +134 -0
- databricks4py/testing/mocks.py +106 -0
- databricks4py/testing/temp_table.py +73 -0
- databricks4py/workflow.py +219 -0
- databricks4py-0.2.0.dist-info/METADATA +589 -0
- databricks4py-0.2.0.dist-info/RECORD +48 -0
- databricks4py-0.2.0.dist-info/WHEEL +5 -0
- databricks4py-0.2.0.dist-info/licenses/LICENSE +21 -0
- databricks4py-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
"""Temporary Delta table context manager for testing."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import uuid
|
|
6
|
+
from types import TracebackType
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from pyspark.sql import SparkSession
|
|
10
|
+
from pyspark.sql.types import StructField, StructType
|
|
11
|
+
|
|
12
|
+
from databricks4py.testing.builders import _resolve_type
|
|
13
|
+
|
|
14
|
+
__all__ = ["TempDeltaTable"]
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class TempDeltaTable:
    """Context manager that creates a temporary Delta table and drops it on exit.

    The table is named ``table_name`` when one is given; otherwise a random
    ``tmp_<12 hex chars>`` name is generated. Entering the context builds a
    ``StructType`` from ``schema``, constructs the ``DeltaTable`` wrapper,
    optionally writes ``data`` into it, and returns the wrapper. Leaving the
    context issues ``DROP TABLE IF EXISTS`` for the table.

    Example::

        with TempDeltaTable(spark, schema={"id": "int"}, data=[(1,), (2,)]) as table:
            df = table.dataframe()
            assert df.count() == 2

    Args:
        spark: SparkSession used to build the initial DataFrame and to drop
            the table.
        table_name: Explicit table name. A unique name is generated if omitted.
        schema: Mapping of column name to type string. Required by
            ``__enter__``.
        data: Optional rows written into the table on entry.
    """

    def __init__(
        self,
        spark: SparkSession,
        *,
        table_name: str | None = None,
        schema: dict[str, str] | None = None,
        data: list[tuple[Any, ...]] | None = None,
    ) -> None:
        self._spark = spark
        self._table_name = table_name or f"tmp_{uuid.uuid4().hex[:12]}"
        self._schema = schema
        self._data = data
        self._delta_table: Any = None

    @property
    def table_name(self) -> str:
        """Name of the table managed by this context manager."""
        return self._table_name

    def _drop_table(self) -> None:
        """Drop the managed table if it exists."""
        self._spark.sql(f"DROP TABLE IF EXISTS {self._table_name}")

    def __enter__(self) -> Any:
        """Create the table, load any initial rows, and return the wrapper.

        Returns:
            The ``DeltaTable`` wrapper for the temporary table.

        Raises:
            ValueError: If no schema was supplied.
        """
        # Validate before importing the Delta stack so a missing schema is
        # reported immediately as the argument error it is.
        if self._schema is None:
            raise ValueError("schema is required to create a TempDeltaTable")

        # Kept function-local as in the original (presumably to avoid an
        # import cycle with databricks4py.io -- TODO confirm).
        from databricks4py.io.delta import DeltaTable

        struct = StructType(
            [StructField(name, _resolve_type(t)) for name, t in self._schema.items()]
        )

        try:
            self._delta_table = DeltaTable(
                table_name=self._table_name,
                schema=struct,
                spark=self._spark,
            )

            if self._data:
                df = self._spark.createDataFrame(self._data, schema=struct)
                self._delta_table.write(df, mode="overwrite")
        except Exception:
            # ``with`` does not call __exit__ when __enter__ raises, so a
            # partially created table would otherwise be left behind.
            self._drop_table()
            raise

        return self._delta_table

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: TracebackType | None,
    ) -> None:
        """Drop the table; exceptions from the ``with`` body are not suppressed."""
        self._drop_table()
|
|
@@ -0,0 +1,219 @@
|
|
|
1
|
+
"""Workflow base class for Databricks job entry points."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from abc import ABC, abstractmethod
|
|
7
|
+
from datetime import datetime
|
|
8
|
+
from typing import TYPE_CHECKING, Any
|
|
9
|
+
|
|
10
|
+
from pyspark.sql import SparkSession
|
|
11
|
+
|
|
12
|
+
from databricks4py.logging import configure_logging, get_logger
|
|
13
|
+
from databricks4py.spark_session import active_fallback
|
|
14
|
+
|
|
15
|
+
if TYPE_CHECKING:
|
|
16
|
+
from databricks4py.config import JobConfig
|
|
17
|
+
from databricks4py.metrics import MetricsSink
|
|
18
|
+
from databricks4py.quality.gate import QualityGate
|
|
19
|
+
from databricks4py.retry import RetryConfig
|
|
20
|
+
|
|
21
|
+
__all__ = ["Workflow"]
|
|
22
|
+
|
|
23
|
+
logger = get_logger(__name__)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class Workflow(ABC):
    """Abstract base class for Databricks workflow entry points.

    Provides a structured pattern for job scripts that auto-initializes:

    - SparkSession (via :func:`~databricks4py.spark_session.active_fallback`)
    - Logging configuration
    - dbutils injection (optional)
    - Config, metrics, and retry integration (optional, v0.2+)

    Subclasses implement :meth:`run` with business logic.

    Example::

        class MyETL(Workflow):
            def run(self) -> None:
                df = self.spark.read.table("source")
                df.write.format("delta").saveAsTable("target")

        # As a CLI entry point (pyspark.dbutils only on Databricks Runtime):
        def main():
            import pyspark.dbutils
            MyETL(dbutils=pyspark.dbutils).execute()

    Args:
        spark: Optional SparkSession. Defaults to active session.
        dbutils: Optional dbutils module for secret/file operations.
            Only available on Databricks Runtime (``pyspark.dbutils``).
        log_level: Logging level (default INFO).
        config: Optional JobConfig for table lookups and spark configs.
        metrics: Optional MetricsSink for lifecycle and custom metrics.
        retry_config: Optional RetryConfig for retrying run_at_time on failure.
    """

    def __init__(
        self,
        *,
        spark: SparkSession | None = None,
        dbutils: Any = None,
        log_level: int = logging.INFO,
        config: JobConfig | None = None,
        metrics: MetricsSink | None = None,
        retry_config: RetryConfig | None = None,
    ) -> None:
        configure_logging(level=log_level)
        self._spark = active_fallback(spark)
        self._dbutils: Any | None = None
        # Seed with the construction time so ``execution_time`` honours its
        # documented contract (a datetime, never None) before run_at_time runs.
        self._execution_time: datetime = datetime.now()
        self._config = config
        self._metrics = metrics
        self._retry_config = retry_config

        if dbutils is not None:
            try:
                self._inject_dbutils(dbutils)
                # Only expose dbutils once injection has succeeded.
                self._dbutils = dbutils
            except (ImportError, AttributeError, TypeError) as exc:
                logger.warning("dbutils injection failed: %s (running outside Databricks?)", exc)

    @staticmethod
    def _inject_dbutils(dbutils_module: Any) -> None:
        """Register ``dbutils_module`` with the secrets and DBFS helpers.

        The secrets helper is always wired up. The DBFS helper is optional:
        an ImportError while loading or calling it is tolerated.
        """
        from databricks4py.secrets import SecretFetcher

        SecretFetcher.dbutils = dbutils_module

        try:
            from databricks4py.io.dbfs import _set_dbutils_module

            _set_dbutils_module(dbutils_module)
        except ImportError:
            # Secrets are already configured above; DBFS support is simply
            # unavailable in this environment.
            logger.debug("DBFS helpers unavailable; dbutils registered for secrets only")

    @property
    def spark(self) -> SparkSession:
        """The SparkSession for this workflow."""
        return self._spark

    @property
    def dbutils(self) -> Any:
        """The injected dbutils module, or None if none was given or injection failed."""
        return self._dbutils

    @property
    def execution_time(self) -> datetime:
        """The logical execution time (set by run_at_time, or defaults to init time)."""
        return self._execution_time

    @property
    def config(self) -> JobConfig | None:
        """The JobConfig for this workflow, or None."""
        return self._config

    @property
    def metrics(self) -> MetricsSink | None:
        """The MetricsSink for this workflow, or None."""
        return self._metrics

    @abstractmethod
    def run(self) -> None:
        """Execute the workflow business logic.

        Subclasses must implement this method.
        """
        ...

    def emit_metric(self, event_type: str, **kwargs: Any) -> None:
        """Emit a metric event. No-op if no metrics sink is configured.

        Args:
            event_type: Name of the event (e.g. ``"job_start"``).
            **kwargs: Extra fields forwarded to ``MetricEvent``.
        """
        if self._metrics is None:
            return
        from databricks4py.metrics.base import MetricEvent

        self._metrics.emit(
            MetricEvent(
                job_name=self.__class__.__name__,
                event_type=event_type,
                timestamp=datetime.now(),
                **kwargs,
            )
        )

    def quality_check(
        self,
        df: Any,
        gate: QualityGate,
        *,
        table_name: str | None = None,
    ) -> Any:
        """Run a quality gate on a DataFrame and enforce its policy.

        Emits a quality_check metric if a metrics sink is configured.

        Args:
            df: The DataFrame to check.
            gate: The quality gate whose ``check``/``enforce`` are applied.
            table_name: Optional table name recorded on the emitted metric.

        Returns:
            The original DataFrame if the checks pass, otherwise whatever
            ``gate.enforce(df)`` returns.
        """
        report = gate.check(df)
        self.emit_metric(
            "quality_check",
            table_name=table_name,
            metadata={"passed": report.passed, "checks": len(report.results)},
        )
        if not report.passed:
            return gate.enforce(df)
        return df

    def run_at_time(self, execution_time: datetime | None = None) -> None:
        """Execute the workflow with an explicit execution timestamp.

        Useful for backfill scenarios where the logical execution
        time differs from wall-clock time.

        Args:
            execution_time: The logical execution time. Defaults to now.
        """
        self._execution_time = execution_time or datetime.now()
        logger.info(
            "Running %s at execution_time=%s",
            self.__class__.__name__,
            self._execution_time.isoformat(),
        )
        self.run()

    def execute(self) -> None:
        """Standard entry point with logging, metrics, and error handling.

        Applies any configured Spark settings, emits ``job_start``, runs the
        workflow (through the retry wrapper when a retry config is set), and
        emits ``job_complete`` or ``job_failed`` with the elapsed time in
        milliseconds. The metrics sink is flushed in all cases.

        Call this from ``if __name__ == "__main__"``.

        Raises:
            Exception: Whatever the workflow run raised, re-raised after logging.
        """
        import time

        workflow_name = self.__class__.__name__

        if self._config and self._config.spark_configs:
            for key, value in self._config.spark_configs.items():
                self._spark.conf.set(key, value)

        self.emit_metric("job_start")
        logger.info("Starting workflow: %s", workflow_name)
        # A monotonic clock keeps the duration correct across wall-clock jumps.
        started = time.monotonic()

        try:
            if self._retry_config:
                from databricks4py.retry import retry

                retryable_run = retry(self._retry_config)(self.run_at_time)
                retryable_run()
            else:
                self.run_at_time()

            duration_ms = (time.monotonic() - started) * 1000
            self.emit_metric("job_complete", duration_ms=duration_ms)
            logger.info("Workflow %s completed successfully", workflow_name)
        except Exception:
            duration_ms = (time.monotonic() - started) * 1000
            # Log the traceback first so it is recorded even if the metrics
            # sink is itself broken.
            logger.exception("Workflow %s failed", workflow_name)
            try:
                self.emit_metric("job_failed", duration_ms=duration_ms)
            except Exception:
                # Best effort: a metrics failure must not replace the real error.
                logger.warning(
                    "Could not emit job_failed metric for %s", workflow_name, exc_info=True
                )
            raise
        finally:
            # Same None test as emit_metric, so both treat the sink alike.
            if self._metrics is not None:
                self._metrics.flush()
|