databricks4py 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. databricks4py/__init__.py +56 -0
  2. databricks4py/catalog.py +65 -0
  3. databricks4py/config/__init__.py +6 -0
  4. databricks4py/config/base.py +119 -0
  5. databricks4py/config/unity.py +72 -0
  6. databricks4py/filters/__init__.py +17 -0
  7. databricks4py/filters/base.py +154 -0
  8. databricks4py/io/__init__.py +40 -0
  9. databricks4py/io/checkpoint.py +98 -0
  10. databricks4py/io/dbfs.py +91 -0
  11. databricks4py/io/delta.py +564 -0
  12. databricks4py/io/merge.py +176 -0
  13. databricks4py/io/streaming.py +281 -0
  14. databricks4py/logging.py +39 -0
  15. databricks4py/metrics/__init__.py +22 -0
  16. databricks4py/metrics/base.py +66 -0
  17. databricks4py/metrics/delta_sink.py +75 -0
  18. databricks4py/metrics/logging_sink.py +20 -0
  19. databricks4py/migrations/__init__.py +27 -0
  20. databricks4py/migrations/alter.py +114 -0
  21. databricks4py/migrations/runner.py +241 -0
  22. databricks4py/migrations/schema_diff.py +136 -0
  23. databricks4py/migrations/validators.py +195 -0
  24. databricks4py/observability/__init__.py +24 -0
  25. databricks4py/observability/_utils.py +24 -0
  26. databricks4py/observability/batch_context.py +134 -0
  27. databricks4py/observability/health.py +223 -0
  28. databricks4py/observability/query_listener.py +236 -0
  29. databricks4py/py.typed +0 -0
  30. databricks4py/quality/__init__.py +26 -0
  31. databricks4py/quality/base.py +54 -0
  32. databricks4py/quality/expectations.py +184 -0
  33. databricks4py/quality/gate.py +90 -0
  34. databricks4py/retry.py +102 -0
  35. databricks4py/secrets.py +69 -0
  36. databricks4py/spark_session.py +68 -0
  37. databricks4py/testing/__init__.py +35 -0
  38. databricks4py/testing/assertions.py +111 -0
  39. databricks4py/testing/builders.py +127 -0
  40. databricks4py/testing/fixtures.py +134 -0
  41. databricks4py/testing/mocks.py +106 -0
  42. databricks4py/testing/temp_table.py +73 -0
  43. databricks4py/workflow.py +219 -0
  44. databricks4py-0.2.0.dist-info/METADATA +589 -0
  45. databricks4py-0.2.0.dist-info/RECORD +48 -0
  46. databricks4py-0.2.0.dist-info/WHEEL +5 -0
  47. databricks4py-0.2.0.dist-info/licenses/LICENSE +21 -0
  48. databricks4py-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,27 @@
1
+ """Migration framework for Delta Lake table evolution."""
2
+
3
+ from databricks4py.migrations.alter import TableAlter
4
+ from databricks4py.migrations.runner import MigrationRunner, MigrationRunResult, MigrationStep
5
+ from databricks4py.migrations.schema_diff import (
6
+ ColumnChange,
7
+ SchemaDiff,
8
+ SchemaEvolutionError,
9
+ )
10
+ from databricks4py.migrations.validators import (
11
+ MigrationError,
12
+ TableValidator,
13
+ ValidationResult,
14
+ )
15
+
16
+ __all__ = [
17
+ "ColumnChange",
18
+ "MigrationError",
19
+ "MigrationRunResult",
20
+ "MigrationRunner",
21
+ "MigrationStep",
22
+ "SchemaDiff",
23
+ "SchemaEvolutionError",
24
+ "TableAlter",
25
+ "TableValidator",
26
+ "ValidationResult",
27
+ ]
@@ -0,0 +1,114 @@
1
+ """Fluent DDL builder for Delta table schema changes."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+
7
+ from pyspark.sql import SparkSession
8
+
9
+ from databricks4py.spark_session import active_fallback
10
+
11
+ __all__ = ["TableAlter"]
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
def _sql_str(value: str) -> str:
    """Return ``value`` with every single quote doubled.

    The result is intended to be placed between single quotes in the SQL
    statements assembled by this module.
    """
    # Splitting on the quote and re-joining with two quotes doubles each one.
    return "''".join(value.split("'"))
19
+
20
+
21
class TableAlter:
    """Fluent interface for batching Delta table DDL operations.

    Operations are queued by the builder methods and executed one-by-one as
    separate ``ALTER TABLE`` statements when :meth:`apply` is called. Each
    statement is logged at INFO level before it is executed.

    Note:
        ``rename_column`` and ``drop_column`` require Delta column mapping to be
        enabled on the target table
        (``delta.columnMapping.mode = 'name'``). Use :meth:`set_property` to
        enable it before renaming or dropping.

    Example::

        TableAlter("catalog.schema.events", spark=spark) \\
            .add_column("region", "STRING", comment="ISO-3166 region code") \\
            .set_property("delta.enableChangeDataFeed", "true") \\
            .apply()

    Args:
        table_name: Fully qualified table name.
        spark: Optional SparkSession; resolved through ``active_fallback``.
    """

    def __init__(self, table_name: str, *, spark: SparkSession | None = None) -> None:
        self._table_name = table_name
        self._spark = active_fallback(spark)
        # Pending ALTER TABLE clauses, in the order they were queued.
        self._ops: list[str] = []

    def add_column(
        self,
        name: str,
        data_type: str,
        *,
        after: str | None = None,
        nullable: bool = True,
        comment: str | None = None,
    ) -> TableAlter:
        """Queue an ADD COLUMN operation.

        Args:
            name: Column name.
            data_type: Spark SQL type string (e.g. ``"STRING"``, ``"DOUBLE"``).
            after: If set, place the new column after this existing column.
            nullable: Whether the column allows nulls (default True).
            comment: Optional column comment. Single quotes are escaped; an
                empty string is treated the same as no comment.

        Returns:
            This instance, so calls can be chained.
        """
        not_null = "" if nullable else " NOT NULL"
        comment_clause = f" COMMENT '{_sql_str(comment)}'" if comment else ""
        after_clause = f" AFTER {after}" if after else ""
        self._ops.append(f"ADD COLUMN ({name} {data_type}{not_null}{comment_clause}{after_clause})")
        return self

    def rename_column(self, old_name: str, new_name: str) -> TableAlter:
        """Queue a RENAME COLUMN operation.

        Requires ``delta.columnMapping.mode = 'name'`` on the target table.

        Returns:
            This instance, so calls can be chained.
        """
        self._ops.append(f"RENAME COLUMN {old_name} TO {new_name}")
        return self

    def drop_column(self, name: str) -> TableAlter:
        """Queue a DROP COLUMN operation.

        Requires ``delta.columnMapping.mode = 'name'`` on the target table.

        Returns:
            This instance, so calls can be chained.
        """
        self._ops.append(f"DROP COLUMN {name}")
        return self

    def set_property(self, key: str, value: str) -> TableAlter:
        """Queue a SET TBLPROPERTIES operation.

        Args:
            key: Property key (e.g. ``"delta.enableChangeDataFeed"``).
            value: Property value string.

        Returns:
            This instance, so calls can be chained.
        """
        self._ops.append(f"SET TBLPROPERTIES ('{_sql_str(key)}' = '{_sql_str(value)}')")
        return self

    def apply(self) -> None:
        """Execute all queued DDL operations against the target table.

        Each operation runs as a separate ``ALTER TABLE`` statement, in the
        order it was queued. An operation is removed from the queue only after
        its statement has executed successfully. If a statement raises, the
        exception propagates, the failed operation and everything after it
        stay queued, and a subsequent ``apply()`` call resumes at the failed
        operation instead of re-running statements that already succeeded
        (re-running e.g. ``ADD COLUMN`` would fail because the column exists).

        Calling ``apply()`` with an empty queue is a no-op.
        """
        while self._ops:
            sql = f"ALTER TABLE {self._table_name} {self._ops[0]}"
            logger.info("Executing: %s", sql)
            self._spark.sql(sql)
            # Dequeue only after success so a failure leaves this op pending.
            del self._ops[0]
@@ -0,0 +1,241 @@
1
+ """Ordered, idempotent migration runner for Delta Lake tables."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from collections.abc import Callable
7
+ from dataclasses import dataclass, field
8
+ from datetime import datetime, timezone
9
+
10
+ from pyspark.sql import SparkSession
11
+ from pyspark.sql.types import BooleanType, StringType, StructField, StructType, TimestampType
12
+
13
+ from databricks4py.spark_session import active_fallback
14
+
15
+ __all__ = ["MigrationRunResult", "MigrationRunner", "MigrationStep"]
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
# Schema of the rows appended to the migration history table.
# Each entry below is (column name, Spark type, nullable).
_HISTORY_SCHEMA = StructType(
    [
        StructField(column, dtype, nullable)
        for column, dtype, nullable in (
            ("version", StringType(), False),
            ("description", StringType(), True),
            ("applied_at", TimestampType(), False),
            ("success", BooleanType(), False),
            ("error_message", StringType(), True),
        )
    ]
)
28
+
29
+
30
@dataclass(frozen=True)
class MigrationStep:
    """One versioned unit of migration work.

    The runner sorts steps by ``version`` as plain strings, so run order is
    lexicographic. A fixed-width prefix (``"V001"``, ``"V002"``, ...) keeps
    that order stable as more steps are added.

    Args:
        version: Unique version string; also the sort key for execution.
        description: Human-readable text stored in the history table.
        up: Callable that is handed a SparkSession and performs the
            migration. It may run any Spark SQL, Delta, or Python logic.
        pre_validate: Optional guard invoked before ``up``. Returning
            ``False`` aborts the step with a ``MigrationError``.
        post_validate: Optional check invoked after ``up``. Returning
            ``False`` marks the step as failed and stops the run.
    """

    # Identity and ordering.
    version: str
    description: str

    # The migration itself.
    up: Callable[[SparkSession], None]

    # Optional hooks around ``up``; ``None`` means "no check".
    pre_validate: Callable[[SparkSession], bool] | None = None
    post_validate: Callable[[SparkSession], bool] | None = None
54
+
55
+
56
@dataclass
class MigrationRunResult:
    """Outcome of one :meth:`MigrationRunner.run` call.

    Attributes:
        applied: Versions applied during this run, in the order they ran.
        skipped: Versions that were already recorded as applied and were
            therefore not run again.
        failed: Version of the step that failed, or ``None`` if none did.
        dry_run: ``True`` when the run was a dry-run (nothing was written).
    """

    # Each instance gets its own lists via default_factory.
    applied: list[str] = field(default_factory=list)
    skipped: list[str] = field(default_factory=list)
    failed: str | None = None
    dry_run: bool = False
71
+
72
+
73
class MigrationRunner:
    """Ordered, idempotent migration runner for Delta Lake.

    Every attempted step is appended to a Delta history table. Versions
    recorded there with ``success = true`` are skipped on later runs. Steps
    execute in lexicographic version order.

    Example::

        def add_audit_columns(spark: SparkSession) -> None:
            spark.sql(
                "ALTER TABLE catalog.schema.events "
                "ADD COLUMNS (created_at TIMESTAMP, updated_at TIMESTAMP)"
            )

        runner = MigrationRunner(
            history_table="catalog.schema._migration_history",
        )
        runner.register(
            MigrationStep(
                version="V001",
                description="Add audit columns to events",
                up=add_audit_columns,
            )
        )
        result = runner.run()

    Args:
        history_table: Fully qualified Delta table name used to record applied steps.
            Created automatically when the runner is constructed.
        spark: Optional SparkSession; resolved through ``active_fallback``.
    """

    def __init__(
        self,
        history_table: str,
        *,
        spark: SparkSession | None = None,
    ) -> None:
        self._spark = active_fallback(spark)
        self._history_table = history_table
        self._steps: list[MigrationStep] = []
        self._ensure_history_table()

    def _ensure_history_table(self) -> None:
        """Create the history table if it does not already exist."""
        self._spark.sql(
            f"""
            CREATE TABLE IF NOT EXISTS {self._history_table} (
                version STRING NOT NULL,
                description STRING,
                applied_at TIMESTAMP NOT NULL,
                success BOOLEAN NOT NULL,
                error_message STRING
            )
            USING DELTA
            """
        )

    def _applied_versions(self) -> set[str]:
        """Return the versions recorded in the history table as successful."""
        rows = self._spark.sql(
            f"SELECT version FROM {self._history_table} WHERE success = true"
        ).collect()
        return {row["version"] for row in rows}

    def _record(
        self,
        step: MigrationStep,
        *,
        success: bool,
        error_message: str | None = None,
    ) -> None:
        """Append one history row for ``step`` with a UTC timestamp.

        Args:
            step: The step that was attempted.
            success: Whether the attempt succeeded.
            error_message: Failure details, or ``None`` on success.
        """
        from pyspark.sql import Row

        row = Row(
            version=step.version,
            description=step.description,
            applied_at=datetime.now(tz=timezone.utc),
            success=success,
            error_message=error_message,
        )
        (
            self._spark.createDataFrame([row], schema=_HISTORY_SCHEMA)
            .write.format("delta")
            .mode("append")
            .saveAsTable(self._history_table)
        )

    def register(self, *steps: MigrationStep) -> MigrationRunner:
        """Add one or more migration steps. Returns self for chaining.

        Steps can be registered in any call order — execution order is always
        determined by ``version``, not registration order.

        Versions must be unique. Registering a step that is equal to one
        already registered is a no-op, so a repeated registration cannot make
        the same step run twice in one ``run()``. Registering a *different*
        step under an existing version is rejected.

        Raises:
            ValueError: If a step reuses the version of a different,
                already-registered step. No step from the call is added.
        """
        known = {existing.version: existing for existing in self._steps}
        accepted: list[MigrationStep] = []
        for step in steps:
            existing = known.get(step.version)
            if existing is None:
                known[step.version] = step
                accepted.append(step)
            elif existing != step:
                raise ValueError(
                    f"Migration version {step.version!r} is already registered "
                    "with a different step"
                )
        # Extend only after the whole batch validated, so a rejected call
        # leaves the registry untouched.
        self._steps.extend(accepted)
        return self

    def pending(self) -> list[MigrationStep]:
        """Return steps not yet applied, sorted by version."""
        applied = self._applied_versions()
        return sorted(
            (s for s in self._steps if s.version not in applied),
            key=lambda s: s.version,
        )

    def applied(self) -> list[str]:
        """Return successfully applied versions in sorted order."""
        return sorted(self._applied_versions())

    def run(self, *, dry_run: bool = False) -> MigrationRunResult:
        """Run all pending migration steps in version order.

        Steps already recorded as successful in the history table are skipped
        (idempotent). Execution halts on the first failure — the failed
        version is recorded with ``success=False`` and returned in
        :attr:`MigrationRunResult.failed`.

        Args:
            dry_run: Log what would run without executing or writing history.
                Pre-validation hooks are still called during a dry run.

        Returns:
            MigrationRunResult summarising applied, skipped, and failed steps.

        Raises:
            MigrationError: If a step's pre-validation returns False. Nothing
                is written to the history table for that step.
        """
        from databricks4py.migrations.validators import MigrationError

        result = MigrationRunResult(dry_run=dry_run)
        # The applied set is read once, before any step of this run executes.
        applied_set = self._applied_versions()
        all_sorted = sorted(self._steps, key=lambda s: s.version)

        for step in all_sorted:
            if step.version in applied_set:
                result.skipped.append(step.version)
                logger.debug("Skipping already-applied step %s", step.version)
                continue

            logger.info("Running migration %s: %s", step.version, step.description)

            if step.pre_validate is not None and not step.pre_validate(self._spark):
                raise MigrationError(
                    step.version,
                    [f"Pre-validation failed for step {step.version}: {step.description}"],
                )

            if dry_run:
                logger.info("[dry-run] Would apply %s: %s", step.version, step.description)
                result.applied.append(step.version)
                continue

            try:
                step.up(self._spark)
            except Exception as exc:
                error_msg = str(exc)
                # logger.exception logs at ERROR level and keeps the traceback.
                logger.exception("Migration %s failed: %s", step.version, error_msg)
                self._record(step, success=False, error_message=error_msg)
                result.failed = step.version
                return result

            if step.post_validate is not None and not step.post_validate(self._spark):
                error_msg = f"Post-validation failed for step {step.version}"
                logger.error(error_msg)
                self._record(step, success=False, error_message=error_msg)
                result.failed = step.version
                return result

            self._record(step, success=True)
            result.applied.append(step.version)
            logger.info("Applied migration %s", step.version)

        return result
@@ -0,0 +1,136 @@
1
+ """Schema diff detection for Delta Lake table evolution."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from typing import Literal
7
+
8
+ from pyspark.sql import DataFrame, SparkSession
9
+ from pyspark.sql.types import StructType
10
+
11
+ from databricks4py.spark_session import active_fallback
12
+
13
+ __all__ = ["ColumnChange", "SchemaDiff", "SchemaEvolutionError"]
14
+
15
+
16
class SchemaEvolutionError(Exception):
    """Signals a breaking schema change."""
18
+
19
+
20
@dataclass(frozen=True)
class ColumnChange:
    """Immutable record of one column-level difference between two schemas.

    Attributes:
        column: Name of the affected column.
        change_type: Kind of difference that was detected.
        old_value: String form of the previous value, when there is one.
        new_value: String form of the new value, when there is one.
        severity: Impact classification; defaults to ``"info"``.
    """

    # What changed, and how.
    column: str
    change_type: Literal["added", "removed", "type_changed", "nullable_changed"]

    # Before/after values; either may be absent.
    old_value: str | None = None
    new_value: str | None = None

    severity: Literal["info", "warning", "breaking"] = "info"
27
+
28
+
29
class SchemaDiff:
    """Column-by-column comparison of a current schema with an incoming one.

    Args:
        current: Schema that exists today.
        incoming: Schema being proposed.
    """

    def __init__(self, current: StructType, incoming: StructType) -> None:
        # Filled on the first changes() call and reused afterwards.
        self._changes: list[ColumnChange] | None = None
        self._current = current
        self._incoming = incoming

    @classmethod
    def from_tables(
        cls,
        table_name: str,
        incoming_df: DataFrame,
        *,
        spark: SparkSession | None = None,
    ) -> SchemaDiff:
        """Build a diff from an existing table and an incoming DataFrame.

        Args:
            table_name: Fully qualified table whose schema is the baseline.
            incoming_df: DataFrame whose schema is the proposed change.
            spark: Optional SparkSession; resolved through ``active_fallback``.

        Returns:
            A ``SchemaDiff`` of the table's schema against ``incoming_df.schema``.
        """
        session = active_fallback(spark)
        existing = session.read.table(table_name).schema
        return cls(current=existing, incoming=incoming_df.schema)

    def changes(self) -> list[ColumnChange]:
        """Return the column-level changes, computing them on first use.

        The result lists added columns first, then removed columns, then
        columns present in both whose type or nullability differs; each group
        is sorted by column name. For a column present in both schemas, a
        nullability difference is only reported when the type is unchanged.
        """
        cached = self._changes
        if cached is not None:
            return cached

        before = {f.name: f for f in self._current.fields}
        after = {f.name: f for f in self._incoming.fields}

        # Columns that only exist in the incoming schema.
        found: list[ColumnChange] = [
            ColumnChange(
                column=col,
                change_type="added",
                new_value=str(after[col].dataType),
                severity="info",
            )
            for col in sorted(after.keys() - before.keys())
        ]

        # Columns that only exist in the current schema.
        found.extend(
            ColumnChange(
                column=col,
                change_type="removed",
                old_value=str(before[col].dataType),
                severity="breaking",
            )
            for col in sorted(before.keys() - after.keys())
        )

        # Columns present on both sides: compare type first, then nullability.
        for col in sorted(before.keys() & after.keys()):
            old, new = before[col], after[col]
            if old.dataType != new.dataType:
                found.append(
                    ColumnChange(
                        column=col,
                        change_type="type_changed",
                        old_value=str(old.dataType),
                        new_value=str(new.dataType),
                        severity="breaking",
                    )
                )
                continue
            if old.nullable != new.nullable:
                found.append(
                    ColumnChange(
                        column=col,
                        change_type="nullable_changed",
                        old_value=str(old.nullable),
                        new_value=str(new.nullable),
                        severity="warning",
                    )
                )

        self._changes = found
        return found

    def has_breaking_changes(self) -> bool:
        """Return True when at least one change has ``severity='breaking'``."""
        for change in self.changes():
            if change.severity == "breaking":
                return True
        return False

    def summary(self) -> str:
        """Render all detected changes as a fixed-width text table."""
        rows = self.changes()
        if not rows:
            return "No schema changes detected."

        out = [
            f"{'Column':<30} {'Change':<20} {'Severity':<10} {'Details'}",
            "-" * 80,
        ]
        for change in rows:
            old, new = change.old_value, change.new_value
            if old and new:
                detail = f"{old} -> {new}"
            else:
                # At most one side is set here; show it, or nothing.
                detail = new or old or ""
            out.append(
                f"{change.column:<30} {change.change_type:<20} {change.severity:<10} {detail}"
            )
        return "\n".join(out)