databricks4py 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- databricks4py/__init__.py +56 -0
- databricks4py/catalog.py +65 -0
- databricks4py/config/__init__.py +6 -0
- databricks4py/config/base.py +119 -0
- databricks4py/config/unity.py +72 -0
- databricks4py/filters/__init__.py +17 -0
- databricks4py/filters/base.py +154 -0
- databricks4py/io/__init__.py +40 -0
- databricks4py/io/checkpoint.py +98 -0
- databricks4py/io/dbfs.py +91 -0
- databricks4py/io/delta.py +564 -0
- databricks4py/io/merge.py +176 -0
- databricks4py/io/streaming.py +281 -0
- databricks4py/logging.py +39 -0
- databricks4py/metrics/__init__.py +22 -0
- databricks4py/metrics/base.py +66 -0
- databricks4py/metrics/delta_sink.py +75 -0
- databricks4py/metrics/logging_sink.py +20 -0
- databricks4py/migrations/__init__.py +27 -0
- databricks4py/migrations/alter.py +114 -0
- databricks4py/migrations/runner.py +241 -0
- databricks4py/migrations/schema_diff.py +136 -0
- databricks4py/migrations/validators.py +195 -0
- databricks4py/observability/__init__.py +24 -0
- databricks4py/observability/_utils.py +24 -0
- databricks4py/observability/batch_context.py +134 -0
- databricks4py/observability/health.py +223 -0
- databricks4py/observability/query_listener.py +236 -0
- databricks4py/py.typed +0 -0
- databricks4py/quality/__init__.py +26 -0
- databricks4py/quality/base.py +54 -0
- databricks4py/quality/expectations.py +184 -0
- databricks4py/quality/gate.py +90 -0
- databricks4py/retry.py +102 -0
- databricks4py/secrets.py +69 -0
- databricks4py/spark_session.py +68 -0
- databricks4py/testing/__init__.py +35 -0
- databricks4py/testing/assertions.py +111 -0
- databricks4py/testing/builders.py +127 -0
- databricks4py/testing/fixtures.py +134 -0
- databricks4py/testing/mocks.py +106 -0
- databricks4py/testing/temp_table.py +73 -0
- databricks4py/workflow.py +219 -0
- databricks4py-0.2.0.dist-info/METADATA +589 -0
- databricks4py-0.2.0.dist-info/RECORD +48 -0
- databricks4py-0.2.0.dist-info/WHEEL +5 -0
- databricks4py-0.2.0.dist-info/licenses/LICENSE +21 -0
- databricks4py-0.2.0.dist-info/top_level.txt +1 -0
databricks4py/secrets.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
"""Databricks secret management via dbutils."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
import pyspark.sql
|
|
9
|
+
|
|
10
|
+
from databricks4py.spark_session import active_fallback
|
|
11
|
+
|
|
12
|
+
__all__ = ["SecretFetcher", "inject_dbutils"]
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class SecretFetcher:
    """Read secrets from a Databricks secret scope through ``dbutils``.

    The class-level :attr:`dbutils` attribute holds the injected module-like
    object. It is ``None`` until :func:`inject_dbutils` is called or the
    attribute is assigned directly.

    Example::

        from databricks4py.secrets import SecretFetcher, inject_dbutils
        import pyspark.dbutils  # only available on Databricks

        inject_dbutils(pyspark.dbutils)
        value = SecretFetcher.fetch_secret("my-scope", "api-key")
    """

    # Injected object exposing ``DBUtils(spark)``; ``None`` means "not injected yet".
    dbutils: Any = None

    @staticmethod
    def fetch_secret(
        secret_scope: str,
        secret_key: str,
        *,
        spark: pyspark.sql.SparkSession | None = None,
    ) -> str:
        """Look up one secret in the Databricks secret vault.

        Args:
            secret_scope: Name of the secret scope.
            secret_key: Name of the key inside that scope.
            spark: SparkSession handed to ``DBUtils``; when ``None`` the
                session is resolved through ``active_fallback``.

        Returns:
            Whatever ``dbutils.secrets.get`` returns for the scope/key pair.

        Raises:
            RuntimeError: If no dbutils module has been injected.
        """
        injected = SecretFetcher.dbutils
        if injected is None:
            raise RuntimeError(
                "SecretFetcher.dbutils has not been set. "
                "Call inject_dbutils(pyspark.dbutils) first."
            )

        # Resolve the session only after the injection check has passed.
        session = active_fallback(spark)
        client = injected.DBUtils(session)
        return client.secrets.get(scope=secret_scope, key=secret_key)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def inject_dbutils(dbutils_module: Any) -> None:
    """Register the dbutils module that :class:`SecretFetcher` should use.

    Stores the object on ``SecretFetcher.dbutils`` and emits a debug log line.

    Args:
        dbutils_module: The ``pyspark.dbutils`` module (only available on
            Databricks), or any stand-in exposing a ``DBUtils`` callable.
    """
    injected = dbutils_module
    SecretFetcher.dbutils = injected
    logger.debug("Injected dbutils module: %s", injected)
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
"""SparkSession management utilities."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import pyspark.sql
|
|
6
|
+
|
|
7
|
+
__all__ = ["get_active", "active_fallback", "get_or_create_local_session"]
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def get_active() -> pyspark.sql.SparkSession:
    """Return the SparkSession reported by ``SparkSession.getActiveSession()``.

    Returns:
        The active SparkSession.

    Raises:
        RuntimeError: If ``getActiveSession()`` returns ``None``.
    """
    session = pyspark.sql.SparkSession.getActiveSession()
    if session is not None:
        return session

    raise RuntimeError(
        "No active SparkSession found. "
        "Create one with SparkSession.builder.getOrCreate() or "
        "use get_or_create_local_session()."
    )
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def active_fallback(spark: pyspark.sql.SparkSession | None = None) -> pyspark.sql.SparkSession:
    """Prefer an explicit SparkSession, otherwise use the active one.

    Intended for library functions that take an optional ``spark``
    keyword::

        def my_function(data, *, spark=None):
            spark = active_fallback(spark)
            ...

    Args:
        spark: An explicit SparkSession, or ``None`` to look up the active
            session via :func:`get_active`.

    Returns:
        ``spark`` itself when it is not ``None``, else the active session.

    Raises:
        RuntimeError: If ``spark`` is ``None`` and no active session exists.
    """
    if spark is None:
        return get_active()
    return spark
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def get_or_create_local_session() -> pyspark.sql.SparkSession:
    """Get or create a local SparkSession wired up for Delta Lake.

    Meant for local development and testing. The builder is configured with:

    - ``local[*]`` master
    - the Delta Lake SQL extension and ``DeltaCatalog`` as ``spark_catalog``
    - ``spark-warehouse`` (a relative path) as the SQL warehouse directory

    No metastore options are set here, so Spark's defaults apply.

    Returns:
        The session produced by ``getOrCreate()`` on the Delta-configured builder.
    """
    # Imported lazily so this module can be imported without the delta package.
    from delta import configure_spark_with_delta_pip

    delta_options = (
        ("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"),
        (
            "spark.sql.catalog.spark_catalog",
            "org.apache.spark.sql.delta.catalog.DeltaCatalog",
        ),
        ("spark.sql.warehouse.dir", "spark-warehouse"),
    )

    builder = pyspark.sql.SparkSession.builder.master("local[*]")
    for option, value in delta_options:
        builder = builder.config(option, value)

    return configure_spark_with_delta_pip(builder).getOrCreate()
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
"""Test utilities and fixtures for databricks4py.
|
|
2
|
+
|
|
3
|
+
Provides pytest fixtures and mock objects for testing Spark and
|
|
4
|
+
Databricks applications locally.
|
|
5
|
+
|
|
6
|
+
Usage in your ``conftest.py``::
|
|
7
|
+
|
|
8
|
+
from databricks4py.testing.fixtures import * # noqa: F401,F403
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from databricks4py.testing.assertions import assert_frame_equal, assert_schema_equal
|
|
12
|
+
from databricks4py.testing.builders import DataFrameBuilder
|
|
13
|
+
from databricks4py.testing.fixtures import (
|
|
14
|
+
clear_env,
|
|
15
|
+
df_builder,
|
|
16
|
+
spark_session,
|
|
17
|
+
spark_session_function,
|
|
18
|
+
temp_delta,
|
|
19
|
+
)
|
|
20
|
+
from databricks4py.testing.mocks import MockDBUtils, MockDBUtilsModule
|
|
21
|
+
from databricks4py.testing.temp_table import TempDeltaTable
|
|
22
|
+
|
|
23
|
+
__all__ = [
|
|
24
|
+
"DataFrameBuilder",
|
|
25
|
+
"MockDBUtils",
|
|
26
|
+
"MockDBUtilsModule",
|
|
27
|
+
"TempDeltaTable",
|
|
28
|
+
"assert_frame_equal",
|
|
29
|
+
"assert_schema_equal",
|
|
30
|
+
"clear_env",
|
|
31
|
+
"df_builder",
|
|
32
|
+
"spark_session",
|
|
33
|
+
"spark_session_function",
|
|
34
|
+
"temp_delta",
|
|
35
|
+
]
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
"""DataFrame and schema assertion helpers for testing."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pyspark.sql import DataFrame
|
|
6
|
+
from pyspark.sql.types import StructType
|
|
7
|
+
|
|
8
|
+
__all__ = ["assert_frame_equal", "assert_schema_equal"]
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def assert_schema_equal(
|
|
12
|
+
actual: StructType,
|
|
13
|
+
expected: StructType,
|
|
14
|
+
*,
|
|
15
|
+
check_nullable: bool = False,
|
|
16
|
+
) -> None:
|
|
17
|
+
"""Compare two StructTypes field-by-field with a clear diff on mismatch.
|
|
18
|
+
|
|
19
|
+
Args:
|
|
20
|
+
actual: The actual schema.
|
|
21
|
+
expected: The expected schema.
|
|
22
|
+
check_nullable: Whether to compare nullable flags (default False).
|
|
23
|
+
|
|
24
|
+
Raises:
|
|
25
|
+
AssertionError: If the schemas differ.
|
|
26
|
+
"""
|
|
27
|
+
actual_fields = actual.fields
|
|
28
|
+
expected_fields = expected.fields
|
|
29
|
+
|
|
30
|
+
if len(actual_fields) != len(expected_fields):
|
|
31
|
+
raise AssertionError(
|
|
32
|
+
f"Schema length mismatch: actual has {len(actual_fields)} fields, "
|
|
33
|
+
f"expected {len(expected_fields)}"
|
|
34
|
+
)
|
|
35
|
+
|
|
36
|
+
for i, (a, e) in enumerate(zip(actual_fields, expected_fields, strict=True)):
|
|
37
|
+
if a.name != e.name:
|
|
38
|
+
raise AssertionError(
|
|
39
|
+
f"Field {i}: name mismatch — actual={a.name!r}, expected={e.name!r}"
|
|
40
|
+
)
|
|
41
|
+
if a.dataType != e.dataType:
|
|
42
|
+
raise AssertionError(
|
|
43
|
+
f"Field {a.name!r}: type mismatch — actual={a.dataType}, expected={e.dataType}"
|
|
44
|
+
)
|
|
45
|
+
if check_nullable and a.nullable != e.nullable:
|
|
46
|
+
raise AssertionError(
|
|
47
|
+
f"Field {a.name!r}: nullable mismatch — actual={a.nullable}, expected={e.nullable}"
|
|
48
|
+
)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def assert_frame_equal(
    actual: DataFrame,
    expected: DataFrame,
    *,
    check_order: bool = False,
    check_schema: bool = True,
    check_nullable: bool = False,
) -> None:
    """Assert two DataFrames are equal.

    Uses PySpark's ``assertDataFrameEqual`` when it can be imported,
    otherwise falls back to a manual row-by-row comparison.

    Args:
        actual: The actual DataFrame.
        expected: The expected DataFrame.
        check_order: Whether row order matters (default False).
        check_schema: Whether to compare schemas with
            :func:`assert_schema_equal` before comparing rows (default True).
        check_nullable: Whether to compare nullable flags (default False).

    Raises:
        AssertionError: If the DataFrames differ.
    """
    if check_schema:
        assert_schema_equal(actual.schema, expected.schema, check_nullable=check_nullable)

    # Guard only the import. If the comparison call were inside the ``try``,
    # an ImportError raised while comparing would be mistaken for "helper not
    # available" and the frames would be silently compared a second time.
    try:
        from pyspark.testing.utils import assertDataFrameEqual
    except ImportError:
        _manual_frame_compare(actual, expected, check_order=check_order)
        return

    # NOTE(review): assertDataFrameEqual may run its own schema comparison
    # regardless of ``check_schema`` — confirm against the installed PySpark.
    assertDataFrameEqual(actual, expected, checkRowOrder=check_order)
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def _manual_frame_compare(
|
|
86
|
+
actual: DataFrame,
|
|
87
|
+
expected: DataFrame,
|
|
88
|
+
*,
|
|
89
|
+
check_order: bool,
|
|
90
|
+
) -> None:
|
|
91
|
+
actual_rows = actual.collect()
|
|
92
|
+
expected_rows = expected.collect()
|
|
93
|
+
|
|
94
|
+
if len(actual_rows) != len(expected_rows):
|
|
95
|
+
raise AssertionError(
|
|
96
|
+
f"Row count mismatch: actual={len(actual_rows)}, expected={len(expected_rows)}"
|
|
97
|
+
)
|
|
98
|
+
|
|
99
|
+
if check_order:
|
|
100
|
+
for i, (a, e) in enumerate(zip(actual_rows, expected_rows, strict=True)):
|
|
101
|
+
if a != e:
|
|
102
|
+
raise AssertionError(f"Row {i} mismatch:\n actual: {a}\n expected: {e}")
|
|
103
|
+
else:
|
|
104
|
+
actual_sorted = sorted(actual_rows, key=str)
|
|
105
|
+
expected_sorted = sorted(expected_rows, key=str)
|
|
106
|
+
if actual_sorted != expected_sorted:
|
|
107
|
+
raise AssertionError(
|
|
108
|
+
f"DataFrames differ (ignoring order):\n"
|
|
109
|
+
f" actual: {actual_sorted}\n"
|
|
110
|
+
f" expected: {expected_sorted}"
|
|
111
|
+
)
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
"""Fluent builder for test DataFrames."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import random
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from pyspark.sql import DataFrame, SparkSession
|
|
9
|
+
from pyspark.sql.types import (
|
|
10
|
+
BooleanType,
|
|
11
|
+
DataType,
|
|
12
|
+
DateType,
|
|
13
|
+
DoubleType,
|
|
14
|
+
FloatType,
|
|
15
|
+
IntegerType,
|
|
16
|
+
LongType,
|
|
17
|
+
ShortType,
|
|
18
|
+
StringType,
|
|
19
|
+
StructField,
|
|
20
|
+
StructType,
|
|
21
|
+
TimestampType,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
__all__ = ["DataFrameBuilder"]
|
|
25
|
+
|
|
26
|
+
# Lower-case type aliases accepted by ``_resolve_type`` mapped to Spark data
# types. Several aliases point at the same type (e.g. "int"/"integer",
# "long"/"bigint", "short"/"smallint").
_TYPE_MAP: dict[str, DataType] = {
    "string": StringType(),
    "int": IntegerType(),
    "integer": IntegerType(),
    "long": LongType(),
    "bigint": LongType(),
    "short": ShortType(),
    "smallint": ShortType(),
    "float": FloatType(),
    "double": DoubleType(),
    "boolean": BooleanType(),
    "date": DateType(),
    "timestamp": TimestampType(),
}
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _resolve_type(type_str: str) -> DataType:
    """Turn a type string into a Spark ``DataType``.

    The lower-cased string is looked up in ``_TYPE_MAP`` first. Anything not
    found there is handed, unmodified, to PySpark's ``_parse_datatype_string``.

    Args:
        type_str: A type alias such as ``"int"`` or any string PySpark's
            parser understands.

    Returns:
        The resolved ``DataType``.
    """
    known = _TYPE_MAP.get(type_str.lower())
    if known is None:
        # Private PySpark helper, imported lazily and only for unknown aliases.
        from pyspark.sql.types import _parse_datatype_string

        return _parse_datatype_string(type_str)
    return known
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class DataFrameBuilder:
    """Chainable helper for assembling small DataFrames in tests.

    Every ``with_*`` method returns the builder itself so calls can be
    chained; :meth:`build` produces the DataFrame.

    Example::

        df = (
            DataFrameBuilder(spark)
            .with_columns({"id": "int", "name": "string"})
            .with_rows((1, "alice"), (2, "bob"))
            .build()
        )
    """

    def __init__(self, spark: SparkSession) -> None:
        """Bind the builder to ``spark`` with no schema and no rows."""
        self._spark = spark
        self._schema: StructType | None = None
        self._rows: list[tuple] = []

    def with_columns(self, schema: dict[str, str]) -> DataFrameBuilder:
        """Set the schema from a ``{name: type_string}`` mapping."""
        self._schema = StructType(
            [StructField(col_name, _resolve_type(col_type)) for col_name, col_type in schema.items()]
        )
        return self

    def with_schema(self, schema: StructType) -> DataFrameBuilder:
        """Set the schema to the given StructType as-is."""
        self._schema = schema
        return self

    def with_rows(self, *rows: tuple) -> DataFrameBuilder:
        """Append the given rows to the rows collected so far."""
        self._rows += rows
        return self

    def with_sequential(self, column: str, start: int = 1, count: int = 10) -> DataFrameBuilder:
        """Replace all rows with single-value rows ``start .. start + count - 1``.

        ``column`` is only used when no schema has been defined yet, in which
        case a one-column integer schema with that name is created. Any rows
        added earlier are discarded.
        """
        if self._schema is None:
            self._schema = StructType([StructField(column, IntegerType())])
        # zip() over a single iterable yields one-element tuples.
        self._rows = list(zip(range(start, start + count)))
        return self

    def with_nulls(
        self, column: str, frequency: float = 0.1, *, seed: int | None = None
    ) -> DataFrameBuilder:
        """Replace values of ``column`` with ``None`` at roughly ``frequency``.

        Only rows already present are affected, so call this after the rows
        have been populated. One random draw is made per row, in row order,
        from a generator seeded with ``seed``.

        Raises:
            ValueError: If no schema is defined or ``column`` is not in it.
        """
        if self._schema is None:
            raise ValueError("Schema must be defined before injecting nulls")

        field_names = [field.name for field in self._schema.fields]
        if column not in field_names:
            raise ValueError(f"Column {column!r} not found in schema")
        target = field_names.index(column)

        rng = random.Random(seed)

        def _maybe_blank(row: tuple[Any, ...]) -> tuple[Any, ...]:
            # Copy the row, blank the target cell when the draw falls below
            # the requested frequency, and freeze it back into a tuple.
            cells = list(row)
            if rng.random() < frequency:
                cells[target] = None
            return tuple(cells)

        self._rows = [_maybe_blank(row) for row in self._rows]
        return self

    def build(self) -> DataFrame:
        """Create the DataFrame from the collected rows and schema.

        Raises:
            ValueError: If no schema has been defined.
        """
        schema = self._schema
        if schema is None:
            raise ValueError("No schema defined — call with_columns() or with_schema() first")
        return self._spark.createDataFrame(self._rows, schema=schema)
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
"""Pytest fixtures for Spark and Delta Lake testing."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
import shutil
|
|
7
|
+
from collections.abc import Callable, Generator
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
import pyspark.sql
|
|
11
|
+
import pytest
|
|
12
|
+
|
|
13
|
+
from databricks4py.testing.builders import DataFrameBuilder
|
|
14
|
+
from databricks4py.testing.temp_table import TempDeltaTable
|
|
15
|
+
|
|
16
|
+
__all__ = [
|
|
17
|
+
"clear_env",
|
|
18
|
+
"df_builder",
|
|
19
|
+
"spark_session",
|
|
20
|
+
"spark_session_function",
|
|
21
|
+
"temp_delta",
|
|
22
|
+
]
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@pytest.fixture(scope="session")
def spark_session(
    tmp_path_factory: pytest.TempPathFactory,
) -> Generator[pyspark.sql.SparkSession, None, None]:
    """Session-scoped SparkSession with Delta Lake support.

    One SparkSession is shared by the whole test session so the JVM is not
    started and stopped repeatedly. The Spark warehouse and the Derby
    metastore each live in their own temporary directory, and the session is
    stopped when the test session ends.
    """
    from delta import configure_spark_with_delta_pip

    warehouse_dir = str(tmp_path_factory.mktemp("spark-warehouse"))
    derby_dir = str(tmp_path_factory.mktemp("derby"))

    # Options applied in this order on top of the master/appName settings.
    options = {
        "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
        "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
        "spark.sql.warehouse.dir": warehouse_dir,
        "javax.jdo.option.ConnectionURL": f"jdbc:derby:{derby_dir}/metastore;create=true",
        "spark.driver.extraJavaOptions": f"-Dderby.system.home={derby_dir}",
        "spark.ui.enabled": "false",
        "spark.sql.shuffle.partitions": "2",
    }

    builder = pyspark.sql.SparkSession.builder.master("local[*]").appName("databricks4py-tests")
    for option, value in options.items():
        builder = builder.config(option, value)

    session = configure_spark_with_delta_pip(builder).getOrCreate()
    yield session
    session.stop()
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
@pytest.fixture()
def spark_session_function(
    spark_session: pyspark.sql.SparkSession,
) -> Generator[pyspark.sql.SparkSession, None, None]:
    """Function-scoped SparkSession that cleans up between tests.

    Yields the session-scoped SparkSession unchanged. After the test it drops
    every table and view listed in every database, clears the cache, and
    removes leftover directories from the warehouse directory.
    """

    def _quoted(identifier: str) -> str:
        # Backtick-quote an identifier (doubling embedded backticks) so names
        # with hyphens, spaces or reserved words do not break the DROP statement.
        return "`" + identifier.replace("`", "``") + "`"

    yield spark_session

    try:
        # Clean up catalog objects.
        for db in spark_session.catalog.listDatabases():
            for table in spark_session.catalog.listTables(db.name):
                # Persistent views must be removed with DROP VIEW; everything
                # else keeps the original DROP TABLE statement.
                kind = "VIEW" if table.tableType == "VIEW" else "TABLE"
                spark_session.sql(
                    f"DROP {kind} IF EXISTS {_quoted(db.name)}.{_quoted(table.name)}"
                )
    finally:
        # Run the remaining cleanup even when a DROP statement failed, so one
        # bad object does not leak cached data or files into the next test.
        spark_session.catalog.clearCache()

        # Clean up any leftover warehouse files.
        warehouse = spark_session.conf.get("spark.sql.warehouse.dir")
        if warehouse and os.path.exists(warehouse):
            for item in os.listdir(warehouse):
                item_path = os.path.join(warehouse, item)
                if os.path.isdir(item_path):
                    shutil.rmtree(item_path, ignore_errors=True)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
@pytest.fixture(autouse=True)
def clear_env() -> Generator[None, None, None]:
    """Auto-use fixture that puts ``os.environ`` back the way it was.

    A snapshot of the environment is taken before the test. Afterwards the
    environment is emptied and refilled from that snapshot, so variables the
    test added, changed or removed are all reverted.
    """
    snapshot = dict(os.environ)
    yield
    os.environ.clear()
    os.environ.update(snapshot)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
@pytest.fixture()
def df_builder(spark_session: pyspark.sql.SparkSession) -> DataFrameBuilder:
    """Provide a fresh DataFrameBuilder bound to the session-scoped SparkSession."""
    builder = DataFrameBuilder(spark_session)
    return builder
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
@pytest.fixture()
def temp_delta(
    spark_session_function: pyspark.sql.SparkSession,
) -> Generator[Callable[..., TempDeltaTable], None, None]:
    """Factory fixture that creates TempDeltaTables and cleans up on exit.

    The yielded factory accepts keyword-only ``table_name``, ``schema`` and
    ``data`` arguments and forwards them to ``TempDeltaTable`` together with
    the function-scoped SparkSession. Every table it creates is remembered and
    dropped by name when the test finishes.

    Usage::

        def test_something(temp_delta):
            with temp_delta(schema={"id": "int"}, data=[(1,)]) as table:
                assert table.dataframe().count() == 1
    """
    tables: list[TempDeltaTable] = []

    def _factory(
        *,
        table_name: str | None = None,
        schema: dict[str, str] | None = None,
        data: list[tuple[Any, ...]] | None = None,
    ) -> TempDeltaTable:
        t = TempDeltaTable(
            spark_session_function,
            table_name=table_name,
            schema=schema,
            data=data,
        )
        # Remember the table so teardown can drop it even if the test never
        # entered or exited it as a context manager.
        tables.append(t)
        return t

    yield _factory

    for t in tables:
        spark_session_function.sql(f"DROP TABLE IF EXISTS {t.table_name}")
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
"""Mock objects for Databricks dbutils in local testing."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
__all__ = ["MockDBUtils", "MockDBUtilsModule"]
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class _MockSecrets:
    """In-memory stand-in for ``dbutils.secrets``."""

    def __init__(self) -> None:
        # Secrets keyed by (scope, key).
        self._secrets: dict[tuple[str, str], str] = {}

    def get(self, scope: str, key: str) -> str:
        """Return the secret stored under ``scope`` / ``key``.

        Args:
            scope: The secret scope.
            key: The secret key.

        Returns:
            The value previously stored with :meth:`put`.

        Raises:
            KeyError: If the secret is not found.
        """
        lookup = (scope, key)
        try:
            value = self._secrets[lookup]
        except KeyError:
            # Re-raise with a readable message and without the raw tuple context.
            raise KeyError(f"Secret not found: scope={scope!r}, key={key!r}") from None
        return value

    def put(self, scope: str, key: str, value: str) -> None:
        """Store a secret value under ``scope`` / ``key`` (for test setup)."""
        self._secrets[(scope, key)] = value
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class _MockFS:
    """In-memory stand-in for ``dbutils.fs``.

    Mutating calls are not executed; their arguments are appended to per-call
    lists, and each call returns ``True``.
    """

    def __init__(self) -> None:
        # One call log per operation, in call order.
        self._copies: list[tuple[str, str, bool]] = []
        self._moves: list[tuple[str, str, bool]] = []
        self._removes: list[tuple[str, bool]] = []
        self._mkdirs: list[str] = []
        # Canned listings returned by ls(), keyed by path.
        self._ls_results: dict[str, list[Any]] = {}

    def cp(self, source: str, dest: str, recurse: bool = False) -> bool:
        """Log a copy request as ``(source, dest, recurse)``."""
        call = (source, dest, recurse)
        self._copies.append(call)
        return True

    def mv(self, source: str, dest: str, recurse: bool = False) -> bool:
        """Log a move request as ``(source, dest, recurse)``."""
        call = (source, dest, recurse)
        self._moves.append(call)
        return True

    def rm(self, path: str, recurse: bool = False) -> bool:
        """Log a remove request as ``(path, recurse)``."""
        call = (path, recurse)
        self._removes.append(call)
        return True

    def mkdirs(self, path: str) -> bool:
        """Log a mkdirs request for ``path``."""
        self._mkdirs.append(path)
        return True

    def ls(self, path: str) -> list[Any]:
        """Return the canned listing for ``path``, or an empty list if none is set."""
        if path in self._ls_results:
            return self._ls_results[path]
        return []
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
class MockDBUtils:
    """Stand-in for a Databricks ``DBUtils`` object in local tests.

    Exposes two attributes, ``secrets`` and ``fs``, each backed by an
    in-memory mock.

    Example::

        mock = MockDBUtils()
        mock.secrets.put("my-scope", "api-key", "secret-value")
        assert mock.secrets.get("my-scope", "api-key") == "secret-value"
    """

    def __init__(self) -> None:
        # Each instance gets its own independent secrets store and fs call log.
        self.secrets, self.fs = _MockSecrets(), _MockFS()
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
class MockDBUtilsModule:
    """Mock for the ``pyspark.dbutils`` module.

    Mimics the Databricks runtime's ``pyspark.dbutils`` module which
    provides a ``DBUtils`` class that accepts a SparkSession.

    Example::

        mock_module = MockDBUtilsModule()
        dbutils = mock_module.DBUtils(spark)
        dbutils.secrets.get("scope", "key")
    """

    def __init__(self, mock_dbutils: MockDBUtils | None = None) -> None:
        """Wrap ``mock_dbutils``, creating a fresh ``MockDBUtils`` when omitted.

        Args:
            mock_dbutils: The object every ``DBUtils(...)`` call should
                return, or ``None`` to create a new ``MockDBUtils``.
        """
        # Compare against None rather than relying on truthiness, so a
        # caller-supplied object that happens to be falsy is still honoured.
        self._mock_dbutils = MockDBUtils() if mock_dbutils is None else mock_dbutils

    def DBUtils(self, spark: Any = None) -> MockDBUtils:  # noqa: N802
        """Return the wrapped mock; the ``spark`` argument is ignored.

        The same object is returned on every call, so state set up on it
        (for example stored secrets) is visible to all callers.
        """
        return self._mock_dbutils
|