databricks4py 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. databricks4py/__init__.py +56 -0
  2. databricks4py/catalog.py +65 -0
  3. databricks4py/config/__init__.py +6 -0
  4. databricks4py/config/base.py +119 -0
  5. databricks4py/config/unity.py +72 -0
  6. databricks4py/filters/__init__.py +17 -0
  7. databricks4py/filters/base.py +154 -0
  8. databricks4py/io/__init__.py +40 -0
  9. databricks4py/io/checkpoint.py +98 -0
  10. databricks4py/io/dbfs.py +91 -0
  11. databricks4py/io/delta.py +564 -0
  12. databricks4py/io/merge.py +176 -0
  13. databricks4py/io/streaming.py +281 -0
  14. databricks4py/logging.py +39 -0
  15. databricks4py/metrics/__init__.py +22 -0
  16. databricks4py/metrics/base.py +66 -0
  17. databricks4py/metrics/delta_sink.py +75 -0
  18. databricks4py/metrics/logging_sink.py +20 -0
  19. databricks4py/migrations/__init__.py +27 -0
  20. databricks4py/migrations/alter.py +114 -0
  21. databricks4py/migrations/runner.py +241 -0
  22. databricks4py/migrations/schema_diff.py +136 -0
  23. databricks4py/migrations/validators.py +195 -0
  24. databricks4py/observability/__init__.py +24 -0
  25. databricks4py/observability/_utils.py +24 -0
  26. databricks4py/observability/batch_context.py +134 -0
  27. databricks4py/observability/health.py +223 -0
  28. databricks4py/observability/query_listener.py +236 -0
  29. databricks4py/py.typed +0 -0
  30. databricks4py/quality/__init__.py +26 -0
  31. databricks4py/quality/base.py +54 -0
  32. databricks4py/quality/expectations.py +184 -0
  33. databricks4py/quality/gate.py +90 -0
  34. databricks4py/retry.py +102 -0
  35. databricks4py/secrets.py +69 -0
  36. databricks4py/spark_session.py +68 -0
  37. databricks4py/testing/__init__.py +35 -0
  38. databricks4py/testing/assertions.py +111 -0
  39. databricks4py/testing/builders.py +127 -0
  40. databricks4py/testing/fixtures.py +134 -0
  41. databricks4py/testing/mocks.py +106 -0
  42. databricks4py/testing/temp_table.py +73 -0
  43. databricks4py/workflow.py +219 -0
  44. databricks4py-0.2.0.dist-info/METADATA +589 -0
  45. databricks4py-0.2.0.dist-info/RECORD +48 -0
  46. databricks4py-0.2.0.dist-info/WHEEL +5 -0
  47. databricks4py-0.2.0.dist-info/licenses/LICENSE +21 -0
  48. databricks4py-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,56 @@
1
+ """databricks4py: Spark, Delta Lake, and Databricks utility library.
2
+
3
+ A collection of reusable abstractions for building PySpark applications
4
+ on Databricks and locally.
5
+ """
6
+
7
+ __version__ = "0.2.0"
8
+
9
+ from databricks4py.catalog import CatalogSchema
10
+ from databricks4py.config import Environment, JobConfig, UnityConfig
11
+ from databricks4py.logging import configure_logging, get_logger
12
+ from databricks4py.metrics import CompositeMetricsSink, LoggingMetricsSink, MetricEvent, MetricsSink
13
+ from databricks4py.retry import RetryConfig, retry
14
+ from databricks4py.secrets import SecretFetcher
15
+ from databricks4py.spark_session import active_fallback, get_active, get_or_create_local_session
16
+ from databricks4py.workflow import Workflow
17
+
18
+
19
def inject_dbutils(dbutils_module):
    """Unified dbutils injection for secrets and DBFS operations.

    Stores the Databricks ``dbutils`` handle on :class:`SecretFetcher` and
    forwards it to the DBFS helpers so both subsystems share one injection
    point.

    Args:
        dbutils_module: The Databricks ``dbutils`` object available in
            notebook/job contexts.
    """
    # NOTE(review): databricks4py.io.__init__ exports a public
    # 'inject_dbutils_module' while this imports the private
    # '_set_dbutils_module' — verify the dbfs module provides both names.
    from databricks4py.io.dbfs import _set_dbutils_module

    SecretFetcher.dbutils = dbutils_module
    _set_dbutils_module(dbutils_module)
25
+
26
+
27
# Explicit public API for `from databricks4py import *`; grouped to
# mirror the import sections above.
__all__ = [
    "__version__",
    # SparkSession
    "get_active",
    "active_fallback",
    "get_or_create_local_session",
    # Catalog
    "CatalogSchema",
    # Config
    "Environment",
    "JobConfig",
    "UnityConfig",
    # Logging
    "configure_logging",
    "get_logger",
    # Metrics
    "CompositeMetricsSink",
    "LoggingMetricsSink",
    "MetricEvent",
    "MetricsSink",
    # Retry
    "RetryConfig",
    "retry",
    # Secrets
    "SecretFetcher",
    # Workflow
    "Workflow",
    # Utilities
    "inject_dbutils",
]
@@ -0,0 +1,65 @@
1
+ """Catalog and schema-qualified table name management."""
2
+
3
+ from __future__ import annotations
4
+
5
+ __all__ = ["CatalogSchema"]
6
+
7
+
8
class CatalogSchema:
    """Schema-qualified table name registry.

    Provides attribute-based access to fully qualified table names
    within a schema. Supports versioned table aliases.

    Example::

        sales = CatalogSchema(
            "sales",
            tables=["orders", "customers"],
            versioned_tables={"metrics": "metrics_v3"},
        )

        # Access table names
        sales.orders     # "sales.orders"
        sales.customers  # "sales.customers"
        sales.metrics    # "sales.metrics_v3"

    Args:
        name: The schema name.
        tables: List of table names in this schema.
        versioned_tables: Dict mapping logical names to versioned names.
    """

    def __init__(
        self,
        name: str,
        tables: list[str] | None = None,
        versioned_tables: dict[str, str] | None = None,
    ) -> None:
        self._name = name
        self._tables: dict[str, str] = {}

        for table in tables or []:
            self._tables[table] = f"{name}.{table}"

        # Versioned aliases map a logical name to the physical (versioned)
        # table; a versioned entry overrides a plain entry of the same name.
        for logical_name, versioned_name in (versioned_tables or {}).items():
            self._tables[logical_name] = f"{name}.{versioned_name}"

    @property
    def schema_name(self) -> str:
        """The schema name."""
        return self._name

    def __getattr__(self, name: str) -> str:
        """Resolve an attribute access to a qualified table name.

        Raises:
            AttributeError: For underscore names (internals must never
                resolve as tables) or unknown table names.
        """
        if name.startswith("_"):
            raise AttributeError(name)
        try:
            return self._tables[name]
        except KeyError:
            raise AttributeError(
                f"'{type(self).__name__}' has no table '{name}'. "
                f"Available: {sorted(self._tables.keys())}"
            ) from None

    def __repr__(self) -> str:
        # Fix: use the dynamic class name rather than a hard-coded
        # "CatalogSchema" literal so subclasses repr correctly —
        # consistent with the __getattr__ error message above.
        return f"{type(self).__name__}({self._name!r}, tables={sorted(self._tables.keys())})"
@@ -0,0 +1,6 @@
1
+ """Configuration classes for Databricks job entry points."""
2
+
3
+ from databricks4py.config.base import Environment, JobConfig
4
+ from databricks4py.config.unity import UnityConfig
5
+
6
+ __all__ = ["Environment", "JobConfig", "UnityConfig"]
@@ -0,0 +1,119 @@
1
+ """Base configuration for Databricks jobs."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ import os
7
+ from enum import Enum
8
+
9
+ __all__ = ["Environment", "JobConfig"]
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
class Environment(Enum):
    """Deployment environment, resolved from Databricks widgets or env vars."""

    DEV = "dev"
    STAGING = "staging"
    PROD = "prod"


class JobConfig:
    """Configuration container for Databricks job parameters.

    The deployment environment is resolved at construction time from,
    in priority order:

    1. the ``spark.databricks.widget.env`` Spark conf (Databricks widget),
    2. the ``ENV`` or ``ENVIRONMENT`` environment variable,
    3. a default of ``Environment.DEV``.

    Example::

        config = JobConfig(
            tables={"events": "catalog.bronze.events", "users": "catalog.silver.users"},
            secret_scope="my-scope",
            spark_configs={"spark.sql.shuffle.partitions": "8"},
        )
        table_name = config.table("events")  # "catalog.bronze.events"

    Args:
        tables: Mapping of logical names to fully qualified table names.
        secret_scope: Databricks secret scope for :meth:`secret` lookups.
        storage_root: Optional root path for storage operations.
        spark_configs: Spark configuration overrides applied by
            :meth:`~databricks4py.workflow.Workflow.execute`.
    """

    def __init__(
        self,
        tables: dict[str, str],
        *,
        secret_scope: str | None = None,
        storage_root: str | None = None,
        spark_configs: dict[str, str] | None = None,
    ) -> None:
        self.tables = tables
        self.secret_scope = secret_scope
        self.storage_root = storage_root
        self.spark_configs = spark_configs or {}
        self.env = self._resolve_env()

    @staticmethod
    def _widget_env() -> str | None:
        """Best-effort read of the 'env' widget conf from an active SparkSession."""
        try:
            from pyspark.sql import SparkSession

            session = SparkSession.getActiveSession()
            if session is not None:
                return session.conf.get("spark.databricks.widget.env", None)
        except Exception:
            # pyspark missing or conf unavailable — fall back to env vars.
            pass
        return None

    def _resolve_env(self) -> Environment:
        """Resolve the Environment: widget conf > ENV/ENVIRONMENT var > DEV."""
        candidate = self._widget_env()
        if candidate is None:
            candidate = os.getenv("ENV") or os.getenv("ENVIRONMENT")
        if candidate is None:
            return Environment.DEV
        try:
            return Environment(candidate.lower())
        except ValueError:
            logger.warning("Unknown environment '%s', defaulting to DEV", candidate)
            return Environment.DEV

    def table(self, name: str) -> str:
        """Look up a fully qualified table name by logical key.

        Raises:
            KeyError: If *name* is not in the configured tables.
        """
        if name not in self.tables:
            available = sorted(self.tables.keys())
            raise KeyError(f"Table '{name}' not configured. Available: {available}")
        return self.tables[name]

    def secret(self, key: str) -> str:
        """Fetch a secret from Databricks using the configured scope.

        Raises:
            ValueError: If no ``secret_scope`` was configured.
        """
        if self.secret_scope is None:
            raise ValueError("No secret_scope configured on this JobConfig")

        from databricks4py.secrets import SecretFetcher

        return SecretFetcher.fetch_secret(self.secret_scope, key)

    @classmethod
    def from_env(cls, **kwargs) -> JobConfig:
        """Alternate constructor; environment resolution happens in __init__."""
        return cls(**kwargs)

    def __repr__(self) -> str:
        return (
            f"JobConfig(env={self.env.value!r}, tables={len(self.tables)}, "
            f"secret_scope={self.secret_scope!r})"
        )
@@ -0,0 +1,72 @@
1
+ """Unity Catalog-aware configuration."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from databricks4py.config.base import JobConfig
6
+
7
+ __all__ = ["UnityConfig"]
8
+
9
+
10
class UnityConfig(JobConfig):
    """Environment-aware Unity Catalog configuration.

    The catalog name is derived as ``{catalog_prefix}_{env}`` (for example
    ``myapp_prod``), and :meth:`table` expands ``schema.table`` references
    into fully qualified three-part names.

    Example::

        config = UnityConfig(
            catalog_prefix="myapp",
            schemas=["bronze", "silver"],
        )
        # In production: config.catalog == "myapp_prod"
        config.table("bronze.events")  # "myapp_prod.bronze.events"

    Args:
        catalog_prefix: Base name prepended to the resolved environment.
        schemas: Allowed schema names. :meth:`table` rejects unknown schemas.
        secret_scope: Forwarded to :class:`JobConfig`.
        storage_root: Forwarded to :class:`JobConfig`.
        spark_configs: Forwarded to :class:`JobConfig`.
    """

    def __init__(
        self,
        catalog_prefix: str,
        schemas: list[str],
        *,
        secret_scope: str | None = None,
        storage_root: str | None = None,
        spark_configs: dict[str, str] | None = None,
    ) -> None:
        # Lookups go through table() below, so the base mapping stays empty.
        super().__init__(
            tables={},
            secret_scope=secret_scope,
            storage_root=storage_root,
            spark_configs=spark_configs,
        )
        self.catalog_prefix = catalog_prefix
        self.schemas = schemas
        # self.env was resolved by the base-class __init__ above.
        self.catalog = f"{catalog_prefix}_{self.env.value}"

    def table(self, name: str) -> str:
        """Resolve ``'schema.table'`` to ``'catalog.schema.table'``.

        Raises:
            ValueError: If *name* is not in ``schema.table`` format.
            KeyError: If the schema is not in the configured list.
        """
        schema, sep, table_name = name.partition(".")
        # Exactly one dot is allowed: no separator at all, or another dot
        # in the remainder, means the reference is not 'schema.table'.
        if not sep or "." in table_name:
            raise ValueError(f"Expected 'schema.table' format, got '{name}'")
        if schema not in self.schemas:
            raise KeyError(f"Schema '{schema}' not in configured schemas: {sorted(self.schemas)}")
        return f"{self.catalog}.{schema}.{table_name}"

    def __repr__(self) -> str:
        return (
            f"UnityConfig(catalog={self.catalog!r}, schemas={self.schemas}, "
            f"secret_scope={self.secret_scope!r})"
        )
@@ -0,0 +1,17 @@
1
+ """DataFrame filter pipeline."""
2
+
3
+ from databricks4py.filters.base import (
4
+ ColumnFilter,
5
+ DropDuplicates,
6
+ Filter,
7
+ FilterPipeline,
8
+ WhereFilter,
9
+ )
10
+
11
+ __all__ = [
12
+ "ColumnFilter",
13
+ "DropDuplicates",
14
+ "Filter",
15
+ "FilterPipeline",
16
+ "WhereFilter",
17
+ ]
@@ -0,0 +1,154 @@
1
+ """DataFrame filter abstractions."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from abc import ABC, abstractmethod
6
+ from collections.abc import Sequence
7
+
8
+ from pyspark.sql import DataFrame
9
+
10
+ __all__ = [
11
+ "ColumnFilter",
12
+ "DropDuplicates",
13
+ "Filter",
14
+ "FilterPipeline",
15
+ "WhereFilter",
16
+ ]
17
+
18
+
19
class Filter(ABC):
    """Base class for composable DataFrame transformations.

    Implementations override :meth:`apply`. Instances are callable,
    so ``f(df)`` and ``f.apply(df)`` are interchangeable.

    Example::

        class ActiveOnly(Filter):
            def apply(self, df: DataFrame) -> DataFrame:
                return df.where("is_active = true")

        df = ActiveOnly()(raw_df)
    """

    @abstractmethod
    def apply(self, df: DataFrame) -> DataFrame:
        """Transform *df* and return the result.

        Args:
            df: Input DataFrame.

        Returns:
            Filtered DataFrame (schema should be unchanged or a subset).
        """

    def __call__(self, df: DataFrame) -> DataFrame:
        """Alias for :meth:`apply` so filters compose as plain callables."""
        return self.apply(df)


class FilterPipeline(Filter):
    """Ordered composition of filters, itself usable as a filter.

    Example::

        pipeline = FilterPipeline([
            DropDuplicates(),
            WhereFilter("score > 50"),
            ColumnFilter(["id", "name"]),
        ])
        result = pipeline(df)
    """

    def __init__(self, filters: Sequence[Filter] | None = None) -> None:
        self._filters: list[Filter] = [] if filters is None else list(filters)

    def add(self, filter_: Filter) -> None:
        """Append *filter_* to the end of the pipeline.

        Args:
            filter_: The filter to add.
        """
        self._filters.append(filter_)

    def apply(self, df: DataFrame) -> DataFrame:
        """Run every filter in order, feeding each one's output to the next.

        Args:
            df: Input DataFrame.

        Returns:
            DataFrame after all filters have been applied.
        """
        result = df
        for step in self._filters:
            result = step(result)
        return result

    def __len__(self) -> int:
        return len(self._filters)

    def __repr__(self) -> str:
        return f"FilterPipeline({self._filters!r})"


class DropDuplicates(Filter):
    """Deduplicate rows, optionally over a subset of columns.

    Args:
        subset: Optional column names to consider for deduplication.
            If None (or empty), all columns are used.
    """

    def __init__(self, subset: Sequence[str] | None = None) -> None:
        # An empty sequence collapses to None, i.e. dedupe over all columns.
        self._subset = list(subset) if subset else None

    def apply(self, df: DataFrame) -> DataFrame:
        if self._subset is None:
            return df.dropDuplicates()
        return df.dropDuplicates(self._subset)

    def __repr__(self) -> str:
        return f"DropDuplicates(subset={self._subset!r})"


class ColumnFilter(Filter):
    """Project a DataFrame down to a fixed set of columns.

    Args:
        columns: Column names to keep.

    Raises:
        ValueError: If columns is empty.
    """

    def __init__(self, columns: Sequence[str]) -> None:
        if not columns:
            raise ValueError("ColumnFilter requires at least one column")
        self._columns = list(columns)

    def apply(self, df: DataFrame) -> DataFrame:
        return df.select(*self._columns)

    def __repr__(self) -> str:
        return f"ColumnFilter(columns={self._columns!r})"


class WhereFilter(Filter):
    """Keep only rows matching a SQL WHERE condition.

    Args:
        condition: SQL condition string (e.g. ``"age > 18"``).
    """

    def __init__(self, condition: str) -> None:
        self._condition = condition

    def apply(self, df: DataFrame) -> DataFrame:
        return df.where(self._condition)

    def __repr__(self) -> str:
        return f"WhereFilter({self._condition!r})"
@@ -0,0 +1,40 @@
1
+ """I/O utilities for Delta Lake, DBFS, and streaming."""
2
+
3
+ from databricks4py.io.checkpoint import CheckpointInfo, CheckpointManager
4
+ from databricks4py.io.dbfs import copy_from_remote, inject_dbutils_module, ls, mkdirs, mv, rm
5
+ from databricks4py.io.delta import (
6
+ DeltaTable,
7
+ DeltaTableAppender,
8
+ DeltaTableOverwriter,
9
+ GeneratedColumn,
10
+ optimize_table,
11
+ vacuum_table,
12
+ )
13
+ from databricks4py.io.merge import MergeBuilder, MergeResult
14
+ from databricks4py.io.streaming import StreamingTableReader, StreamingTriggerOptions
15
+
16
+ __all__ = [
17
+ # Checkpoint
18
+ "CheckpointInfo",
19
+ "CheckpointManager",
20
+ # DBFS
21
+ "copy_from_remote",
22
+ "inject_dbutils_module",
23
+ "ls",
24
+ "mkdirs",
25
+ "mv",
26
+ "rm",
27
+ # Delta
28
+ "DeltaTable",
29
+ "DeltaTableAppender",
30
+ "DeltaTableOverwriter",
31
+ "GeneratedColumn",
32
+ "optimize_table",
33
+ "vacuum_table",
34
+ # Merge
35
+ "MergeBuilder",
36
+ "MergeResult",
37
+ # Streaming
38
+ "StreamingTableReader",
39
+ "StreamingTriggerOptions",
40
+ ]
@@ -0,0 +1,98 @@
1
+ """Streaming checkpoint lifecycle management."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import logging
7
+ import os
8
+ import re
9
+ import shutil
10
+ from dataclasses import dataclass
11
+ from typing import Any
12
+
13
+ from pyspark.sql import SparkSession
14
+
15
+ __all__ = [
16
+ "CheckpointInfo",
17
+ "CheckpointManager",
18
+ ]
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
@dataclass(frozen=True)
class CheckpointInfo:
    """Immutable snapshot of a streaming checkpoint's state."""

    # Filesystem path of the checkpoint directory this snapshot describes.
    path: str
    # Highest batch id found in the offsets/ log, or None if no offsets exist.
    last_batch_id: int | None
    # JSON-parsed contents of the latest offset file, or None if unavailable.
    offsets: dict[str, Any] | None
    # Total size in bytes of all files under the checkpoint directory.
    size_bytes: int
31
+
32
+
33
+ def _dir_size(path: str) -> int:
34
+ total = 0
35
+ for dirpath, _dirnames, filenames in os.walk(path):
36
+ for fname in filenames:
37
+ total += os.path.getsize(os.path.join(dirpath, fname))
38
+ return total
39
+
40
+
41
+ def _sanitize(name: str) -> str:
42
+ return re.sub(r"[^a-zA-Z0-9]", "_", name)
43
+
44
+
45
class CheckpointManager:
    """Manages streaming checkpoint directories under a common root.

    Args:
        base_path: Root directory under which checkpoints are stored.
        spark: Optional SparkSession (reserved for future DBFS/cloud support).
    """

    def __init__(
        self,
        base_path: str,
        *,
        spark: SparkSession | None = None,
    ) -> None:
        self._base_path = base_path
        self._spark = spark

    def path_for(self, source: str, sink: str) -> str:
        """Build a deterministic checkpoint path for a source/sink pair."""
        return f"{self._base_path}/{_sanitize(source)}__{_sanitize(sink)}"

    def exists(self, path: str) -> bool:
        """Return True if *path* is an existing checkpoint directory."""
        return os.path.isdir(path)

    def reset(self, path: str) -> None:
        """Delete a checkpoint directory. No-op if it doesn't exist."""
        if not os.path.exists(path):
            return
        shutil.rmtree(path)
        logger.info("Deleted checkpoint at %s", path)

    def info(self, path: str) -> CheckpointInfo:
        """Read checkpoint metadata from the offset log.

        NOTE(review): this parses each offset file with ``json.load``,
        i.e. it assumes the file is a single JSON document — confirm
        against the actual offset log format written by the streaming
        engine before relying on ``offsets``.
        """
        offsets_dir = os.path.join(path, "offsets")
        last_batch_id: int | None = None
        offsets: dict | None = None

        if os.path.isdir(offsets_dir):
            # Offset files are named by batch id; ignore anything non-numeric
            # (e.g. temp or compacted artifacts).
            batch_ids = [int(entry) for entry in os.listdir(offsets_dir) if entry.isdigit()]
            if batch_ids:
                last_batch_id = max(batch_ids)
                with open(os.path.join(offsets_dir, str(last_batch_id))) as fh:
                    offsets = json.load(fh)

        return CheckpointInfo(
            path=path,
            last_batch_id=last_batch_id,
            offsets=offsets,
            size_bytes=_dir_size(path),
        )