databricks4py 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- databricks4py/__init__.py +56 -0
- databricks4py/catalog.py +65 -0
- databricks4py/config/__init__.py +6 -0
- databricks4py/config/base.py +119 -0
- databricks4py/config/unity.py +72 -0
- databricks4py/filters/__init__.py +17 -0
- databricks4py/filters/base.py +154 -0
- databricks4py/io/__init__.py +40 -0
- databricks4py/io/checkpoint.py +98 -0
- databricks4py/io/dbfs.py +91 -0
- databricks4py/io/delta.py +564 -0
- databricks4py/io/merge.py +176 -0
- databricks4py/io/streaming.py +281 -0
- databricks4py/logging.py +39 -0
- databricks4py/metrics/__init__.py +22 -0
- databricks4py/metrics/base.py +66 -0
- databricks4py/metrics/delta_sink.py +75 -0
- databricks4py/metrics/logging_sink.py +20 -0
- databricks4py/migrations/__init__.py +27 -0
- databricks4py/migrations/alter.py +114 -0
- databricks4py/migrations/runner.py +241 -0
- databricks4py/migrations/schema_diff.py +136 -0
- databricks4py/migrations/validators.py +195 -0
- databricks4py/observability/__init__.py +24 -0
- databricks4py/observability/_utils.py +24 -0
- databricks4py/observability/batch_context.py +134 -0
- databricks4py/observability/health.py +223 -0
- databricks4py/observability/query_listener.py +236 -0
- databricks4py/py.typed +0 -0
- databricks4py/quality/__init__.py +26 -0
- databricks4py/quality/base.py +54 -0
- databricks4py/quality/expectations.py +184 -0
- databricks4py/quality/gate.py +90 -0
- databricks4py/retry.py +102 -0
- databricks4py/secrets.py +69 -0
- databricks4py/spark_session.py +68 -0
- databricks4py/testing/__init__.py +35 -0
- databricks4py/testing/assertions.py +111 -0
- databricks4py/testing/builders.py +127 -0
- databricks4py/testing/fixtures.py +134 -0
- databricks4py/testing/mocks.py +106 -0
- databricks4py/testing/temp_table.py +73 -0
- databricks4py/workflow.py +219 -0
- databricks4py-0.2.0.dist-info/METADATA +589 -0
- databricks4py-0.2.0.dist-info/RECORD +48 -0
- databricks4py-0.2.0.dist-info/WHEEL +5 -0
- databricks4py-0.2.0.dist-info/licenses/LICENSE +21 -0
- databricks4py-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
"""databricks4py: Spark, Delta Lake, and Databricks utility library.
|
|
2
|
+
|
|
3
|
+
A collection of reusable abstractions for building PySpark applications
|
|
4
|
+
on Databricks and locally.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
__version__ = "0.2.0"
|
|
8
|
+
|
|
9
|
+
from databricks4py.catalog import CatalogSchema
|
|
10
|
+
from databricks4py.config import Environment, JobConfig, UnityConfig
|
|
11
|
+
from databricks4py.logging import configure_logging, get_logger
|
|
12
|
+
from databricks4py.metrics import CompositeMetricsSink, LoggingMetricsSink, MetricEvent, MetricsSink
|
|
13
|
+
from databricks4py.retry import RetryConfig, retry
|
|
14
|
+
from databricks4py.secrets import SecretFetcher
|
|
15
|
+
from databricks4py.spark_session import active_fallback, get_active, get_or_create_local_session
|
|
16
|
+
from databricks4py.workflow import Workflow
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def inject_dbutils(dbutils_module):
    """Register a ``dbutils`` handle for both secrets and DBFS helpers.

    Databricks notebooks expose ``dbutils`` implicitly; library code cannot
    import it directly. Call this once at startup to make the handle
    available to :class:`SecretFetcher` and the DBFS helper functions.

    Args:
        dbutils_module: The ``dbutils`` object provided by the Databricks
            runtime (or a test double).
    """
    # Imported lazily so importing databricks4py never requires dbfs extras.
    from databricks4py.io.dbfs import _set_dbutils_module

    _set_dbutils_module(dbutils_module)
    SecretFetcher.dbutils = dbutils_module
+
|
|
26
|
+
|
|
27
|
+
# Explicit public API; keeps `from databricks4py import *` and doc tools in sync.
__all__ = [
    "__version__",
    # SparkSession
    "get_active",
    "active_fallback",
    "get_or_create_local_session",
    # Catalog
    "CatalogSchema",
    # Config
    "Environment",
    "JobConfig",
    "UnityConfig",
    # Logging
    "configure_logging",
    "get_logger",
    # Metrics
    "CompositeMetricsSink",
    "LoggingMetricsSink",
    "MetricEvent",
    "MetricsSink",
    # Retry
    "RetryConfig",
    "retry",
    # Secrets
    "SecretFetcher",
    # Workflow
    "Workflow",
    # Utilities
    "inject_dbutils",
]
|
databricks4py/catalog.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
"""Catalog and schema-qualified table name management."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
__all__ = ["CatalogSchema"]
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class CatalogSchema:
    """Registry of fully qualified table names for one schema.

    Table names are exposed as attributes: ``schema.<table>`` returns
    ``"<schema_name>.<table>"``. A versioned table maps a stable logical
    attribute onto a physical, versioned table name.

    Example::

        sales = CatalogSchema(
            "sales",
            tables=["orders", "customers"],
            versioned_tables={"metrics": "metrics_v3"},
        )

        sales.orders     # "sales.orders"
        sales.customers  # "sales.customers"
        sales.metrics    # "sales.metrics_v3"

    Args:
        name: The schema name.
        tables: Table names registered under their own name.
        versioned_tables: Mapping of logical name -> physical (versioned) name.
    """

    def __init__(
        self,
        name: str,
        tables: list[str] | None = None,
        versioned_tables: dict[str, str] | None = None,
    ) -> None:
        self._name = name
        # Plain entries first; a versioned alias may override a plain entry
        # that shares the same logical name.
        plain = {t: f"{name}.{t}" for t in (tables or [])}
        aliased = {
            logical: f"{name}.{physical}"
            for logical, physical in (versioned_tables or {}).items()
        }
        self._tables: dict[str, str] = {**plain, **aliased}

    @property
    def schema_name(self) -> str:
        """The schema name."""
        return self._name

    def __getattr__(self, name: str) -> str:
        # Private/dunder lookups (copy, pickle, IPython probing) must fail
        # fast without touching the table registry.
        if name.startswith("_"):
            raise AttributeError(name)
        qualified = self._tables.get(name)
        if qualified is None:
            raise AttributeError(
                f"'{type(self).__name__}' has no table '{name}'. "
                f"Available: {sorted(self._tables.keys())}"
            ) from None
        return qualified

    def __repr__(self) -> str:
        return f"CatalogSchema({self._name!r}, tables={sorted(self._tables.keys())})"
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
"""Base configuration for Databricks jobs."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import os
|
|
7
|
+
from enum import Enum
|
|
8
|
+
|
|
9
|
+
# Public API of databricks4py.config.base.
__all__ = ["Environment", "JobConfig"]

# Module-level logger (configured by the host application / configure_logging).
logger = logging.getLogger(__name__)
13
|
+
|
|
14
|
+
class Environment(Enum):
    """Deployment environment. Resolved automatically from Databricks widgets or env vars."""

    # Values are the lowercase strings accepted by JobConfig._resolve_env()
    # (input is lowercased before matching).
    DEV = "dev"          # local / development; the fallback default
    STAGING = "staging"  # pre-production
    PROD = "prod"        # production
21
|
+
|
|
22
|
+
class JobConfig:
    """Configuration container for Databricks job parameters.

    Resolves the deployment environment from (in priority order):

    1. ``spark.databricks.widget.env`` conf (Databricks widget)
    2. ``ENV`` or ``ENVIRONMENT`` environment variable
    3. Defaults to ``DEV``

    Example::

        config = JobConfig(
            tables={"events": "catalog.bronze.events", "users": "catalog.silver.users"},
            secret_scope="my-scope",
            spark_configs={"spark.sql.shuffle.partitions": "8"},
        )
        table_name = config.table("events")  # "catalog.bronze.events"

    Args:
        tables: Mapping of logical names to fully qualified table names.
        secret_scope: Databricks secret scope for :meth:`secret` lookups.
        storage_root: Optional root path for storage operations.
        spark_configs: Spark configuration overrides applied by
            :meth:`~databricks4py.workflow.Workflow.execute`.
    """

    def __init__(
        self,
        tables: dict[str, str],
        *,
        secret_scope: str | None = None,
        storage_root: str | None = None,
        spark_configs: dict[str, str] | None = None,
    ) -> None:
        self.tables = tables
        self.secret_scope = secret_scope
        self.storage_root = storage_root
        self.spark_configs = spark_configs or {}
        self.env = self._resolve_env()

    @staticmethod
    def _widget_env() -> str | None:
        """Read the ``env`` widget from an active SparkSession, if any.

        Best-effort: pyspark missing, no active session, or any conf error
        simply yields ``None`` so resolution falls through to env vars.
        """
        try:
            from pyspark.sql import SparkSession

            session = SparkSession.getActiveSession()
            if session is None:
                return None
            return session.conf.get("spark.databricks.widget.env", None)
        except Exception:
            return None

    def _resolve_env(self) -> Environment:
        """Determine the deployment environment (widget > env var > DEV)."""
        raw = self._widget_env()
        if raw is None:
            raw = os.getenv("ENV") or os.getenv("ENVIRONMENT")
        if raw is None:
            return Environment.DEV
        try:
            return Environment(raw.lower())
        except ValueError:
            logger.warning("Unknown environment '%s', defaulting to DEV", raw)
            return Environment.DEV

    def table(self, name: str) -> str:
        """Look up a fully qualified table name by logical key.

        Raises:
            KeyError: If *name* is not in the configured tables.
        """
        if name not in self.tables:
            available = sorted(self.tables.keys())
            raise KeyError(f"Table '{name}' not configured. Available: {available}") from None
        return self.tables[name]

    def secret(self, key: str) -> str:
        """Fetch a secret from Databricks using the configured scope.

        Raises:
            ValueError: If no ``secret_scope`` was configured.
        """
        if self.secret_scope is None:
            raise ValueError("No secret_scope configured on this JobConfig")

        # Imported lazily to avoid a hard dependency at module import time.
        from databricks4py.secrets import SecretFetcher

        return SecretFetcher.fetch_secret(self.secret_scope, key)

    @classmethod
    def from_env(cls, **kwargs) -> JobConfig:
        """Alternate constructor; environment resolution happens in ``__init__``."""
        return cls(**kwargs)

    def __repr__(self) -> str:
        return (
            f"JobConfig(env={self.env.value!r}, tables={len(self.tables)}, "
            f"secret_scope={self.secret_scope!r})"
        )
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
"""Unity Catalog-aware configuration."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from databricks4py.config.base import JobConfig
|
|
6
|
+
|
|
7
|
+
__all__ = ["UnityConfig"]
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class UnityConfig(JobConfig):
    """Environment-aware Unity Catalog configuration.

    The catalog name is built as ``{catalog_prefix}_{env}`` (e.g.
    ``myapp_prod``), and ``schema.table`` references are resolved to fully
    qualified three-part names inside that catalog.

    Example::

        config = UnityConfig(
            catalog_prefix="myapp",
            schemas=["bronze", "silver"],
        )
        # In production: config.catalog == "myapp_prod"
        config.table("bronze.events")  # "myapp_prod.bronze.events"

    Args:
        catalog_prefix: Base name prepended to the resolved environment.
        schemas: Allowed schema names. :meth:`table` rejects unknown schemas.
        secret_scope: Forwarded to :class:`JobConfig`.
        storage_root: Forwarded to :class:`JobConfig`.
        spark_configs: Forwarded to :class:`JobConfig`.
    """

    def __init__(
        self,
        catalog_prefix: str,
        schemas: list[str],
        *,
        secret_scope: str | None = None,
        storage_root: str | None = None,
        spark_configs: dict[str, str] | None = None,
    ) -> None:
        # No static table mapping: names are resolved dynamically in table().
        super().__init__(
            tables={},
            secret_scope=secret_scope,
            storage_root=storage_root,
            spark_configs=spark_configs,
        )
        self.catalog_prefix = catalog_prefix
        self.schemas = schemas
        # self.env is resolved by JobConfig.__init__ before this line runs.
        self.catalog = f"{catalog_prefix}_{self.env.value}"

    def table(self, name: str) -> str:
        """Resolve ``'schema.table'`` to ``'catalog.schema.table'``.

        Raises:
            ValueError: If *name* is not in ``schema.table`` format.
            KeyError: If the schema is not in the configured list.
        """
        pieces = name.split(".")
        if len(pieces) != 2:
            raise ValueError(f"Expected 'schema.table' format, got '{name}'")
        schema, table_name = pieces
        if schema not in self.schemas:
            raise KeyError(f"Schema '{schema}' not in configured schemas: {sorted(self.schemas)}")
        return f"{self.catalog}.{schema}.{table_name}"

    def __repr__(self) -> str:
        return (
            f"UnityConfig(catalog={self.catalog!r}, schemas={self.schemas}, "
            f"secret_scope={self.secret_scope!r})"
        )
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""DataFrame filter pipeline."""
|
|
2
|
+
|
|
3
|
+
from databricks4py.filters.base import (
|
|
4
|
+
ColumnFilter,
|
|
5
|
+
DropDuplicates,
|
|
6
|
+
Filter,
|
|
7
|
+
FilterPipeline,
|
|
8
|
+
WhereFilter,
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
# Public API re-exported from databricks4py.filters.base.
__all__ = [
    "ColumnFilter",
    "DropDuplicates",
    "Filter",
    "FilterPipeline",
    "WhereFilter",
]
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
"""DataFrame filter abstractions."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from abc import ABC, abstractmethod
|
|
6
|
+
from collections.abc import Sequence
|
|
7
|
+
|
|
8
|
+
from pyspark.sql import DataFrame
|
|
9
|
+
|
|
10
|
+
# Public API of this module; mirrored by databricks4py.filters.__init__.
__all__ = [
    "ColumnFilter",
    "DropDuplicates",
    "Filter",
    "FilterPipeline",
    "WhereFilter",
]
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class Filter(ABC):
    """Abstract base class for DataFrame filters.

    Subclasses implement :meth:`apply` to transform a DataFrame.
    Instances are callable — ``filter(df)`` delegates to ``filter.apply(df)``.

    Example::

        class ActiveOnly(Filter):
            def apply(self, df: DataFrame) -> DataFrame:
                return df.where("is_active = true")

        df = ActiveOnly()(raw_df)
    """

    @abstractmethod
    def apply(self, df: DataFrame) -> DataFrame:
        """Transform *df* and return the result.

        Args:
            df: Input DataFrame.

        Returns:
            Filtered DataFrame (schema should be unchanged or a subset).
        """

    def __call__(self, df: DataFrame) -> DataFrame:
        """Callable interface; equivalent to :meth:`apply`."""
        return self.apply(df)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class FilterPipeline(Filter):
    """Chain of filters applied sequentially.

    Example::

        pipeline = FilterPipeline([
            DropDuplicates(),
            WhereFilter("score > 50"),
            ColumnFilter(["id", "name"]),
        ])
        result = pipeline(df)
    """

    def __init__(self, filters: Sequence[Filter] | None = None) -> None:
        # Copy into a private list so later mutation of the caller's
        # sequence cannot change the pipeline.
        self._filters: list[Filter] = [] if filters is None else list(filters)

    def add(self, filter_: Filter) -> None:
        """Append *filter_* to the end of the pipeline.

        Args:
            filter_: The filter to add.
        """
        self._filters.append(filter_)

    def apply(self, df: DataFrame) -> DataFrame:
        """Run every filter over *df* in insertion order.

        Args:
            df: Input DataFrame.

        Returns:
            DataFrame after all filters have been applied.
        """
        result = df
        for step in self._filters:
            result = step(result)
        return result

    def __len__(self) -> int:
        return len(self._filters)

    def __repr__(self) -> str:
        return f"FilterPipeline({self._filters!r})"
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
class DropDuplicates(Filter):
    """Remove duplicate rows.

    Args:
        subset: Optional column names to consider for deduplication.
            If None (or empty), all columns are used.
    """

    def __init__(self, subset: Sequence[str] | None = None) -> None:
        # A falsy subset (None or empty) is normalized to None, meaning
        # "deduplicate on every column".
        self._subset = list(subset) if subset else None

    def apply(self, df: DataFrame) -> DataFrame:
        if self._subset is None:
            return df.dropDuplicates()
        return df.dropDuplicates(self._subset)

    def __repr__(self) -> str:
        return f"DropDuplicates(subset={self._subset!r})"
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
class ColumnFilter(Filter):
    """Select specific columns from a DataFrame.

    Args:
        columns: Column names to keep (in the given order).

    Raises:
        ValueError: If *columns* is empty.
    """

    def __init__(self, columns: Sequence[str]) -> None:
        # Fail fast at construction time rather than at apply() time.
        if len(columns) == 0:
            raise ValueError("ColumnFilter requires at least one column")
        self._columns = [*columns]

    def apply(self, df: DataFrame) -> DataFrame:
        return df.select(*self._columns)

    def __repr__(self) -> str:
        return f"ColumnFilter(columns={self._columns!r})"
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
class WhereFilter(Filter):
    """Apply a SQL WHERE condition.

    Args:
        condition: SQL condition string (e.g. ``"age > 18"``).
    """

    def __init__(self, condition: str) -> None:
        # Stored verbatim; validation is deferred to Spark's SQL parser.
        self._condition = condition

    def apply(self, df: DataFrame) -> DataFrame:
        return df.where(self._condition)

    def __repr__(self) -> str:
        return f"WhereFilter({self._condition!r})"
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""I/O utilities for Delta Lake, DBFS, and streaming."""
|
|
2
|
+
|
|
3
|
+
from databricks4py.io.checkpoint import CheckpointInfo, CheckpointManager
|
|
4
|
+
from databricks4py.io.dbfs import copy_from_remote, inject_dbutils_module, ls, mkdirs, mv, rm
|
|
5
|
+
from databricks4py.io.delta import (
|
|
6
|
+
DeltaTable,
|
|
7
|
+
DeltaTableAppender,
|
|
8
|
+
DeltaTableOverwriter,
|
|
9
|
+
GeneratedColumn,
|
|
10
|
+
optimize_table,
|
|
11
|
+
vacuum_table,
|
|
12
|
+
)
|
|
13
|
+
from databricks4py.io.merge import MergeBuilder, MergeResult
|
|
14
|
+
from databricks4py.io.streaming import StreamingTableReader, StreamingTriggerOptions
|
|
15
|
+
|
|
16
|
+
# Public API re-exported from the io submodules, grouped by source module.
__all__ = [
    # Checkpoint
    "CheckpointInfo",
    "CheckpointManager",
    # DBFS
    "copy_from_remote",
    "inject_dbutils_module",
    "ls",
    "mkdirs",
    "mv",
    "rm",
    # Delta
    "DeltaTable",
    "DeltaTableAppender",
    "DeltaTableOverwriter",
    "GeneratedColumn",
    "optimize_table",
    "vacuum_table",
    # Merge
    "MergeBuilder",
    "MergeResult",
    # Streaming
    "StreamingTableReader",
    "StreamingTriggerOptions",
]
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
"""Streaming checkpoint lifecycle management."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import logging
|
|
7
|
+
import os
|
|
8
|
+
import re
|
|
9
|
+
import shutil
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
from pyspark.sql import SparkSession
|
|
14
|
+
|
|
15
|
+
# Public API of this module.
__all__ = [
    "CheckpointInfo",
    "CheckpointManager",
]

# Module-level logger for checkpoint lifecycle events.
logger = logging.getLogger(__name__)
|
+
|
|
23
|
+
@dataclass(frozen=True)
class CheckpointInfo:
    """Immutable snapshot of a streaming checkpoint's state."""

    # Filesystem path of the checkpoint directory.
    path: str
    # Highest batch id found in the offsets log, or None if no batches yet.
    last_batch_id: int | None
    # Parsed JSON content of the latest offset file, or None if unavailable.
    offsets: dict[str, Any] | None
    # Total size of all files under the checkpoint directory, in bytes.
    size_bytes: int
+
|
|
32
|
+
|
|
33
|
+
def _dir_size(path: str) -> int:
    """Return the total size in bytes of all regular files under *path*.

    Entries that vanish or become unreadable between the ``os.walk`` listing
    and the ``getsize`` call are skipped — streaming queries compact and
    rotate checkpoint files concurrently, so a strict stat would make
    :meth:`CheckpointManager.info` crash on a live checkpoint.

    Returns 0 if *path* does not exist (``os.walk`` yields nothing).
    """
    total = 0
    for dirpath, _dirnames, filenames in os.walk(path):
        for fname in filenames:
            try:
                total += os.path.getsize(os.path.join(dirpath, fname))
            except OSError:
                # File removed mid-walk or not stat-able; best-effort sizing.
                continue
    return total
|
|
40
|
+
|
|
41
|
+
def _sanitize(name: str) -> str:
    """Map *name* onto the filesystem-safe charset ``[A-Za-z0-9_]``.

    Every character outside ASCII letters/digits becomes an underscore.
    """
    unsafe = re.compile(r"[^a-zA-Z0-9]")
    return unsafe.sub("_", name)
|
+
|
|
44
|
+
|
|
45
|
+
class CheckpointManager:
    """Manages streaming checkpoint directories.

    Args:
        base_path: Root directory under which checkpoints are stored.
        spark: Optional SparkSession (reserved for future DBFS/cloud support).
    """

    def __init__(
        self,
        base_path: str,
        *,
        spark: SparkSession | None = None,
    ) -> None:
        self._base_path = base_path
        self._spark = spark

    def path_for(self, source: str, sink: str) -> str:
        """Generate a deterministic checkpoint path for a source/sink pair."""
        return f"{self._base_path}/{_sanitize(source)}__{_sanitize(sink)}"

    def exists(self, path: str) -> bool:
        """Whether *path* is an existing checkpoint directory."""
        return os.path.isdir(path)

    def reset(self, path: str) -> None:
        """Delete a checkpoint directory. No-op if it doesn't exist."""
        if not os.path.exists(path):
            return
        shutil.rmtree(path)
        logger.info("Deleted checkpoint at %s", path)

    @staticmethod
    def _read_latest_offset(offsets_dir: str) -> tuple[int | None, dict | None]:
        """Return ``(latest batch id, parsed offset file)`` from the offset log.

        Yields ``(None, None)`` when the directory or any batch files are
        absent. Offset files are named by their integer batch id.
        """
        if not os.path.isdir(offsets_dir):
            return None, None
        batch_ids = [int(entry) for entry in os.listdir(offsets_dir) if entry.isdigit()]
        if not batch_ids:
            return None, None
        last = max(batch_ids)
        with open(os.path.join(offsets_dir, str(last))) as f:
            return last, json.load(f)

    def info(self, path: str) -> CheckpointInfo:
        """Read checkpoint metadata from the offset log."""
        last_batch_id, offsets = self._read_latest_offset(os.path.join(path, "offsets"))
        return CheckpointInfo(
            path=path,
            last_batch_id=last_batch_id,
            offsets=offsets,
            size_bytes=_dir_size(path),
        )