databricks4py 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. databricks4py/__init__.py +56 -0
  2. databricks4py/catalog.py +65 -0
  3. databricks4py/config/__init__.py +6 -0
  4. databricks4py/config/base.py +119 -0
  5. databricks4py/config/unity.py +72 -0
  6. databricks4py/filters/__init__.py +17 -0
  7. databricks4py/filters/base.py +154 -0
  8. databricks4py/io/__init__.py +40 -0
  9. databricks4py/io/checkpoint.py +98 -0
  10. databricks4py/io/dbfs.py +91 -0
  11. databricks4py/io/delta.py +564 -0
  12. databricks4py/io/merge.py +176 -0
  13. databricks4py/io/streaming.py +281 -0
  14. databricks4py/logging.py +39 -0
  15. databricks4py/metrics/__init__.py +22 -0
  16. databricks4py/metrics/base.py +66 -0
  17. databricks4py/metrics/delta_sink.py +75 -0
  18. databricks4py/metrics/logging_sink.py +20 -0
  19. databricks4py/migrations/__init__.py +27 -0
  20. databricks4py/migrations/alter.py +114 -0
  21. databricks4py/migrations/runner.py +241 -0
  22. databricks4py/migrations/schema_diff.py +136 -0
  23. databricks4py/migrations/validators.py +195 -0
  24. databricks4py/observability/__init__.py +24 -0
  25. databricks4py/observability/_utils.py +24 -0
  26. databricks4py/observability/batch_context.py +134 -0
  27. databricks4py/observability/health.py +223 -0
  28. databricks4py/observability/query_listener.py +236 -0
  29. databricks4py/py.typed +0 -0
  30. databricks4py/quality/__init__.py +26 -0
  31. databricks4py/quality/base.py +54 -0
  32. databricks4py/quality/expectations.py +184 -0
  33. databricks4py/quality/gate.py +90 -0
  34. databricks4py/retry.py +102 -0
  35. databricks4py/secrets.py +69 -0
  36. databricks4py/spark_session.py +68 -0
  37. databricks4py/testing/__init__.py +35 -0
  38. databricks4py/testing/assertions.py +111 -0
  39. databricks4py/testing/builders.py +127 -0
  40. databricks4py/testing/fixtures.py +134 -0
  41. databricks4py/testing/mocks.py +106 -0
  42. databricks4py/testing/temp_table.py +73 -0
  43. databricks4py/workflow.py +219 -0
  44. databricks4py-0.2.0.dist-info/METADATA +589 -0
  45. databricks4py-0.2.0.dist-info/RECORD +48 -0
  46. databricks4py-0.2.0.dist-info/WHEEL +5 -0
  47. databricks4py-0.2.0.dist-info/licenses/LICENSE +21 -0
  48. databricks4py-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,73 @@
1
+ """Temporary Delta table context manager for testing."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import uuid
6
+ from types import TracebackType
7
+ from typing import Any
8
+
9
+ from pyspark.sql import SparkSession
10
+ from pyspark.sql.types import StructField, StructType
11
+
12
+ from databricks4py.testing.builders import _resolve_type
13
+
14
+ __all__ = ["TempDeltaTable"]
15
+
16
+
17
class TempDeltaTable:
    """Scoped Delta table for tests: built on ``with`` entry, dropped on exit.

    The table is created from a ``{column_name: type_string}`` mapping and may
    be seeded with rows. Whatever happens inside the ``with`` body, leaving it
    issues ``DROP TABLE IF EXISTS`` for the table.

    Example::

        with TempDeltaTable(spark, schema={"id": "int"}, data=[(1,), (2,)]) as table:
            df = table.dataframe()
            assert df.count() == 2

    Args:
        spark: Session used to build the seed DataFrame and to drop the table.
        table_name: Name to use. When omitted (or empty) a ``tmp_`` name with
            a 12-character random hex suffix is generated.
        schema: Mapping of column name to type string. Optional at
            construction time, but entering the context without it raises.
        data: Optional rows to write into the table on entry.
    """

    def __init__(
        self,
        spark: SparkSession,
        *,
        table_name: str | None = None,
        schema: dict[str, str] | None = None,
        data: list[tuple[Any, ...]] | None = None,
    ) -> None:
        # Only mint a random name when the caller did not supply a usable one.
        if not table_name:
            table_name = f"tmp_{uuid.uuid4().hex[:12]}"

        self._spark = spark
        self._table_name = table_name
        self._schema = schema
        self._data = data
        self._delta_table: Any = None

    @property
    def table_name(self) -> str:
        """Name of the table this context manager creates and drops."""
        return self._table_name

    def __enter__(self) -> Any:
        """Create the table, optionally seed it, and return the wrapper.

        Returns:
            The ``databricks4py.io.delta.DeltaTable`` built for this table.

        Raises:
            ValueError: If no schema was given to the constructor.
        """
        # Imported lazily, before any validation, exactly as the module does
        # it at call time rather than at import time.
        from databricks4py.io.delta import DeltaTable

        column_types = self._schema
        if column_types is None:
            raise ValueError("schema is required to create a TempDeltaTable")

        # Turn each type string into a Spark field, keeping mapping order.
        fields = []
        for column, type_name in column_types.items():
            fields.append(StructField(column, _resolve_type(type_name)))
        struct_type = StructType(fields)

        table = DeltaTable(
            table_name=self._table_name,
            schema=struct_type,
            spark=self._spark,
        )
        self._delta_table = table

        # An empty or missing row list means there is nothing to write.
        rows = self._data
        if rows:
            seed_df = self._spark.createDataFrame(rows, schema=struct_type)
            table.write(seed_df, mode="overwrite")

        return table

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: TracebackType | None,
    ) -> None:
        """Drop the table; exceptions from the ``with`` body are not suppressed."""
        drop_statement = f"DROP TABLE IF EXISTS {self._table_name}"
        self._spark.sql(drop_statement)
@@ -0,0 +1,219 @@
1
+ """Workflow base class for Databricks job entry points."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from abc import ABC, abstractmethod
7
+ from datetime import datetime
8
+ from typing import TYPE_CHECKING, Any
9
+
10
+ from pyspark.sql import SparkSession
11
+
12
+ from databricks4py.logging import configure_logging, get_logger
13
+ from databricks4py.spark_session import active_fallback
14
+
15
+ if TYPE_CHECKING:
16
+ from databricks4py.config import JobConfig
17
+ from databricks4py.metrics import MetricsSink
18
+ from databricks4py.quality.gate import QualityGate
19
+ from databricks4py.retry import RetryConfig
20
+
21
+ __all__ = ["Workflow"]
22
+
23
+ logger = get_logger(__name__)
24
+
25
+
26
class Workflow(ABC):
    """Abstract base class for Databricks workflow entry points.

    Provides a structured pattern for job scripts that auto-initializes:

    - SparkSession (via :func:`~databricks4py.spark_session.active_fallback`)
    - Logging configuration
    - dbutils injection (optional)
    - Config, metrics, and retry integration (optional, v0.2+)

    Subclasses implement :meth:`run` with business logic.

    Example::

        class MyETL(Workflow):
            def run(self) -> None:
                df = self.spark.read.table("source")
                df.write.format("delta").saveAsTable("target")

        # As a CLI entry point (pyspark.dbutils only on Databricks Runtime):
        def main():
            import pyspark.dbutils
            MyETL(dbutils=pyspark.dbutils).execute()

    Args:
        spark: Optional SparkSession. Defaults to active session.
        dbutils: Optional dbutils module for secret/file operations.
            Only available on Databricks Runtime (``pyspark.dbutils``).
        log_level: Logging level (default INFO).
        config: Optional JobConfig for table lookups and spark configs.
        metrics: Optional MetricsSink for lifecycle and custom metrics.
        retry_config: Optional RetryConfig for retrying run_at_time on failure.
    """

    def __init__(
        self,
        *,
        spark: SparkSession | None = None,
        dbutils: Any = None,
        log_level: int = logging.INFO,
        config: JobConfig | None = None,
        metrics: MetricsSink | None = None,
        retry_config: RetryConfig | None = None,
    ) -> None:
        configure_logging(level=log_level)
        self._spark = active_fallback(spark)
        self._dbutils: Any | None = None
        # Captured so that execution_time has a real value before run_at_time
        # is ever called (the property promises "defaults to init time").
        self._init_time: datetime = datetime.now()
        self._execution_time: datetime | None = None
        self._config = config
        self._metrics = metrics
        self._retry_config = retry_config

        if dbutils is not None:
            # Best effort: a failed injection is logged and the workflow is
            # still constructed, with self.dbutils left as None.
            try:
                self._inject_dbutils(dbutils)
                self._dbutils = dbutils
            except (ImportError, AttributeError, TypeError) as exc:
                logger.warning("dbutils injection failed: %s (running outside Databricks?)", exc)

    @staticmethod
    def _inject_dbutils(dbutils_module: Any) -> None:
        """Hand the dbutils module to the secrets and DBFS helpers.

        ``SecretFetcher.dbutils`` is always assigned. The DBFS helper is
        wired up only when it can be imported; an ImportError from that step
        is tolerated.
        """
        from databricks4py.secrets import SecretFetcher

        SecretFetcher.dbutils = dbutils_module
        try:
            from databricks4py.io.dbfs import _set_dbutils_module

            _set_dbutils_module(dbutils_module)
        except ImportError:
            logger.debug("DBFS helper unavailable; dbutils set on SecretFetcher only")

    @property
    def spark(self) -> SparkSession:
        """The SparkSession for this workflow."""
        return self._spark

    @property
    def dbutils(self) -> Any:
        """The dbutils module, or None if it was not provided or injection failed."""
        return self._dbutils

    @property
    def execution_time(self) -> datetime:
        """The logical execution time (set by run_at_time, or defaults to init time)."""
        if self._execution_time is not None:
            return self._execution_time
        return self._init_time

    @property
    def config(self) -> JobConfig | None:
        """The JobConfig for this workflow, or None."""
        return self._config

    @property
    def metrics(self) -> MetricsSink | None:
        """The MetricsSink for this workflow, or None."""
        return self._metrics

    @abstractmethod
    def run(self) -> None:
        """Execute the workflow business logic.

        Subclasses must implement this method.
        """
        ...

    def emit_metric(self, event_type: str, **kwargs: Any) -> None:
        """Emit a metric event. No-op if no metrics sink is configured.

        Args:
            event_type: Label stored on the event (e.g. ``"job_start"``).
            **kwargs: Extra fields forwarded to ``MetricEvent``.
        """
        if self._metrics is None:
            return
        from databricks4py.metrics.base import MetricEvent

        self._metrics.emit(
            MetricEvent(
                job_name=self.__class__.__name__,
                event_type=event_type,
                timestamp=datetime.now(),
                **kwargs,
            )
        )

    def quality_check(
        self,
        df: Any,
        gate: QualityGate,
        *,
        table_name: str | None = None,
    ) -> Any:
        """Run a quality gate on a DataFrame and enforce its policy.

        Emits a quality_check metric if a metrics sink is configured.
        Returns the original DataFrame if checks pass, or the enforced
        result (filtered/raised/warned) if they don't.

        Args:
            df: The DataFrame to check.
            gate: Gate whose ``check`` produces the report and whose
                ``enforce`` is applied when the report did not pass.
            table_name: Optional table name recorded on the metric.
        """
        report = gate.check(df)
        self.emit_metric(
            "quality_check",
            table_name=table_name,
            metadata={"passed": report.passed, "checks": len(report.results)},
        )
        if not report.passed:
            return gate.enforce(df)
        return df

    def run_at_time(self, execution_time: datetime | None = None) -> None:
        """Execute the workflow with an explicit execution timestamp.

        Useful for backfill scenarios where the logical execution
        time differs from wall-clock time.

        Args:
            execution_time: The logical execution time. Defaults to now.
        """
        self._execution_time = execution_time or datetime.now()
        logger.info(
            "Running %s at execution_time=%s",
            self.__class__.__name__,
            self._execution_time.isoformat(),
        )
        self.run()

    def execute(self) -> None:
        """Standard entry point with logging, metrics, and error handling.

        Call this from ``if __name__ == "__main__"``.

        Applies any ``spark_configs`` from the config, emits ``job_start``,
        runs the workflow (through the retry wrapper when a retry config was
        given), then emits ``job_complete`` or ``job_failed`` with the elapsed
        milliseconds. The metrics sink is flushed in every case.

        Raises:
            Exception: Whatever the run raised, re-raised after being logged.
        """
        workflow_name = self.__class__.__name__

        if self._config and self._config.spark_configs:
            for k, v in self._config.spark_configs.items():
                self._spark.conf.set(k, v)

        self.emit_metric("job_start")
        logger.info("Starting workflow: %s", workflow_name)
        start = datetime.now()

        try:
            if self._retry_config:
                from databricks4py.retry import retry

                # NOTE(review): run_at_time is wrapped and called with no
                # argument, so presumably each retry attempt re-stamps the
                # execution time with "now" - confirm that is intended.
                retryable_run = retry(self._retry_config)(self.run_at_time)
                retryable_run()
            else:
                self.run_at_time()

            duration_ms = (datetime.now() - start).total_seconds() * 1000
            self.emit_metric("job_complete", duration_ms=duration_ms)
            logger.info("Workflow %s completed successfully", workflow_name)
        except Exception:
            duration_ms = (datetime.now() - start).total_seconds() * 1000
            self.emit_metric("job_failed", duration_ms=duration_ms)
            logger.exception("Workflow %s failed", workflow_name)
            raise
        finally:
            # Flush on success and on failure so buffered events are not lost.
            if self._metrics:
                self._metrics.flush()