databricks4py 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- databricks4py/__init__.py +56 -0
- databricks4py/catalog.py +65 -0
- databricks4py/config/__init__.py +6 -0
- databricks4py/config/base.py +119 -0
- databricks4py/config/unity.py +72 -0
- databricks4py/filters/__init__.py +17 -0
- databricks4py/filters/base.py +154 -0
- databricks4py/io/__init__.py +40 -0
- databricks4py/io/checkpoint.py +98 -0
- databricks4py/io/dbfs.py +91 -0
- databricks4py/io/delta.py +564 -0
- databricks4py/io/merge.py +176 -0
- databricks4py/io/streaming.py +281 -0
- databricks4py/logging.py +39 -0
- databricks4py/metrics/__init__.py +22 -0
- databricks4py/metrics/base.py +66 -0
- databricks4py/metrics/delta_sink.py +75 -0
- databricks4py/metrics/logging_sink.py +20 -0
- databricks4py/migrations/__init__.py +27 -0
- databricks4py/migrations/alter.py +114 -0
- databricks4py/migrations/runner.py +241 -0
- databricks4py/migrations/schema_diff.py +136 -0
- databricks4py/migrations/validators.py +195 -0
- databricks4py/observability/__init__.py +24 -0
- databricks4py/observability/_utils.py +24 -0
- databricks4py/observability/batch_context.py +134 -0
- databricks4py/observability/health.py +223 -0
- databricks4py/observability/query_listener.py +236 -0
- databricks4py/py.typed +0 -0
- databricks4py/quality/__init__.py +26 -0
- databricks4py/quality/base.py +54 -0
- databricks4py/quality/expectations.py +184 -0
- databricks4py/quality/gate.py +90 -0
- databricks4py/retry.py +102 -0
- databricks4py/secrets.py +69 -0
- databricks4py/spark_session.py +68 -0
- databricks4py/testing/__init__.py +35 -0
- databricks4py/testing/assertions.py +111 -0
- databricks4py/testing/builders.py +127 -0
- databricks4py/testing/fixtures.py +134 -0
- databricks4py/testing/mocks.py +106 -0
- databricks4py/testing/temp_table.py +73 -0
- databricks4py/workflow.py +219 -0
- databricks4py-0.2.0.dist-info/METADATA +589 -0
- databricks4py-0.2.0.dist-info/RECORD +48 -0
- databricks4py-0.2.0.dist-info/WHEEL +5 -0
- databricks4py-0.2.0.dist-info/licenses/LICENSE +21 -0
- databricks4py-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""Migration framework for Delta Lake table evolution."""
|
|
2
|
+
|
|
3
|
+
from databricks4py.migrations.alter import TableAlter
|
|
4
|
+
from databricks4py.migrations.runner import MigrationRunner, MigrationRunResult, MigrationStep
|
|
5
|
+
from databricks4py.migrations.schema_diff import (
|
|
6
|
+
ColumnChange,
|
|
7
|
+
SchemaDiff,
|
|
8
|
+
SchemaEvolutionError,
|
|
9
|
+
)
|
|
10
|
+
from databricks4py.migrations.validators import (
|
|
11
|
+
MigrationError,
|
|
12
|
+
TableValidator,
|
|
13
|
+
ValidationResult,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
__all__ = [
|
|
17
|
+
"ColumnChange",
|
|
18
|
+
"MigrationError",
|
|
19
|
+
"MigrationRunResult",
|
|
20
|
+
"MigrationRunner",
|
|
21
|
+
"MigrationStep",
|
|
22
|
+
"SchemaDiff",
|
|
23
|
+
"SchemaEvolutionError",
|
|
24
|
+
"TableAlter",
|
|
25
|
+
"TableValidator",
|
|
26
|
+
"ValidationResult",
|
|
27
|
+
]
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
"""Fluent DDL builder for Delta table schema changes."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
|
|
7
|
+
from pyspark.sql import SparkSession
|
|
8
|
+
|
|
9
|
+
from databricks4py.spark_session import active_fallback
|
|
10
|
+
|
|
11
|
+
__all__ = ["TableAlter"]
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _sql_str(value: str) -> str:
    """Escape text for embedding inside a single-quoted SQL string literal.

    Backslashes are doubled and single quotes are doubled. Handling the
    backslash matters because Spark SQL treats it as an escape character
    inside string literals under its default parser settings, so an unescaped
    ``\\`` would either be dropped from the value or swallow the closing quote.

    NOTE(review): assumes ``spark.sql.parser.escapedStringLiterals`` is left at
    its default (``false``) — confirm for clusters that override it.

    Args:
        value: Raw text to place between single quotes.

    Returns:
        The escaped text, without the surrounding quotes.
    """
    return value.replace("\\", "\\\\").replace("'", "''")
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class TableAlter:
    """Chainable builder that queues ``ALTER TABLE`` clauses for one table.

    Every builder method appends a clause to an internal queue and returns the
    builder itself, so calls can be chained. Nothing is sent to Spark until
    :meth:`apply` runs; at that point each clause is issued as its own
    ``ALTER TABLE`` statement and logged at INFO level.

    Note:
        ``rename_column`` and ``drop_column`` require Delta column mapping to be
        enabled on the target table
        (``delta.columnMapping.mode = 'name'``). Use :meth:`set_property` to
        enable it before renaming or dropping.

    Example::

        TableAlter("catalog.schema.events", spark=spark) \\
            .add_column("region", "STRING", comment="ISO-3166 region code") \\
            .set_property("delta.enableChangeDataFeed", "true") \\
            .apply()

    Args:
        table_name: Fully qualified table name.
        spark: Optional SparkSession; resolved through ``active_fallback``.
    """

    def __init__(self, table_name: str, *, spark: SparkSession | None = None) -> None:
        self._table_name = table_name
        self._spark = active_fallback(spark)
        # Clauses waiting to be executed, in the order they were queued.
        self._ops: list[str] = []

    def add_column(
        self,
        name: str,
        data_type: str,
        *,
        after: str | None = None,
        nullable: bool = True,
        comment: str | None = None,
    ) -> TableAlter:
        """Queue an ADD COLUMN clause.

        Args:
            name: Column name (interpolated as-is, not quoted).
            data_type: Spark SQL type string (e.g. ``"STRING"``, ``"DOUBLE"``).
            after: If truthy, position the new column after this existing one.
            nullable: When False, ``NOT NULL`` is appended (default True).
            comment: Optional column comment; skipped when empty or None.

        Returns:
            This builder, for chaining.
        """
        # Assemble the column spec in the order: name/type, NOT NULL, COMMENT, AFTER.
        pieces = [f"{name} {data_type}"]
        if not nullable:
            pieces.append(" NOT NULL")
        if comment:
            pieces.append(f" COMMENT '{_sql_str(comment)}'")
        if after:
            pieces.append(f" AFTER {after}")
        self._ops.append("ADD COLUMN (" + "".join(pieces) + ")")
        return self

    def rename_column(self, old_name: str, new_name: str) -> TableAlter:
        """Queue a RENAME COLUMN clause.

        Requires ``delta.columnMapping.mode = 'name'`` on the target table.

        Returns:
            This builder, for chaining.
        """
        clause = f"RENAME COLUMN {old_name} TO {new_name}"
        self._ops.append(clause)
        return self

    def drop_column(self, name: str) -> TableAlter:
        """Queue a DROP COLUMN clause.

        Requires ``delta.columnMapping.mode = 'name'`` on the target table.

        Returns:
            This builder, for chaining.
        """
        clause = f"DROP COLUMN {name}"
        self._ops.append(clause)
        return self

    def set_property(self, key: str, value: str) -> TableAlter:
        """Queue a SET TBLPROPERTIES clause.

        Both key and value are escaped with ``_sql_str`` and single-quoted.

        Args:
            key: Property key (e.g. ``"delta.enableChangeDataFeed"``).
            value: Property value string.

        Returns:
            This builder, for chaining.
        """
        quoted_key = _sql_str(key)
        quoted_value = _sql_str(value)
        self._ops.append(f"SET TBLPROPERTIES ('{quoted_key}' = '{quoted_value}')")
        return self

    def apply(self) -> None:
        """Run every queued clause against the target table, in queue order.

        Each clause is sent as a separate ``ALTER TABLE`` statement. The queue
        is emptied only after all statements succeed. If a statement raises,
        later clauses are not run and the queue is left untouched, so a
        subsequent ``apply()`` call starts again from the first clause.
        """
        queued = self._ops
        if not queued:
            return
        for clause in queued:
            statement = f"ALTER TABLE {self._table_name} {clause}"
            logger.info("Executing: %s", statement)
            self._spark.sql(statement)
        queued.clear()
|
|
@@ -0,0 +1,241 @@
|
|
|
1
|
+
"""Ordered, idempotent migration runner for Delta Lake tables."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from collections.abc import Callable
|
|
7
|
+
from dataclasses import dataclass, field
|
|
8
|
+
from datetime import datetime, timezone
|
|
9
|
+
|
|
10
|
+
from pyspark.sql import SparkSession
|
|
11
|
+
from pyspark.sql.types import BooleanType, StringType, StructField, StructType, TimestampType
|
|
12
|
+
|
|
13
|
+
from databricks4py.spark_session import active_fallback
|
|
14
|
+
|
|
15
|
+
__all__ = ["MigrationRunResult", "MigrationRunner", "MigrationStep"]
|
|
16
|
+
|
|
17
|
+
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
# Schema of a single history row: (column name, Spark type, nullable).
# The columns correspond to the CREATE TABLE statement issued by
# MigrationRunner._ensure_history_table.
_HISTORY_SCHEMA = StructType(
    [
        StructField(column_name, column_type, is_nullable)
        for column_name, column_type, is_nullable in (
            ("version", StringType(), False),
            ("description", StringType(), True),
            ("applied_at", TimestampType(), False),
            ("success", BooleanType(), False),
            ("error_message", StringType(), True),
        )
    ]
)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@dataclass(frozen=True)
class MigrationStep:
    """Immutable description of one versioned migration.

    The runner sorts steps by ``version`` using plain string comparison, so
    the run order is lexicographic. A fixed-width scheme such as ``"V001"``,
    ``"V002"`` keeps that order stable as more steps are added.

    Args:
        version: Unique version string; also the sort key for execution order.
        description: Human-readable text written to the history table.
        up: Callable invoked with the SparkSession to perform the migration.
            It may run any Spark SQL, Delta, or Python logic.
        pre_validate: Optional guard called with the SparkSession before
            ``up``. A falsy return makes the runner raise ``MigrationError``.
        post_validate: Optional check called with the SparkSession after
            ``up``. A falsy return marks the step as failed and stops the run.
    """

    # Required fields.
    version: str
    description: str
    up: Callable[[SparkSession], None]
    # Optional validation hooks; None means the hook is skipped.
    pre_validate: Callable[[SparkSession], bool] | None = None
    post_validate: Callable[[SparkSession], bool] | None = None
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
@dataclass
class MigrationRunResult:
    """Outcome of one :meth:`MigrationRunner.run` call.

    Attributes:
        applied: Versions applied during this run, in execution order. In a
            dry run these are the versions that would have been applied.
        skipped: Versions skipped because the history table already records
            them as successfully applied.
        failed: Version of the step that failed, or None if none failed.
        dry_run: True when the run was a dry run (nothing executed or recorded).
    """

    # Each instance gets its own fresh list via default_factory.
    applied: list[str] = field(default_factory=list)
    skipped: list[str] = field(default_factory=list)
    failed: str | None = None
    dry_run: bool = False
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
class MigrationRunner:
    """Ordered, idempotent migration runner for Delta Lake.

    Applied versions are tracked in a Delta history table. A version recorded
    there with ``success = true`` is skipped on later runs. Steps execute in
    lexicographic version order.

    Example::

        def add_audit_columns(spark: SparkSession) -> None:
            spark.sql(
                "ALTER TABLE catalog.schema.events "
                "ADD COLUMNS (created_at TIMESTAMP, updated_at TIMESTAMP)"
            )

        runner = MigrationRunner(
            history_table="catalog.schema._migration_history",
        )
        runner.register(
            MigrationStep(
                version="V001",
                description="Add audit columns to events",
                up=add_audit_columns,
            )
        )
        result = runner.run()

    Args:
        history_table: Fully qualified Delta table name used to record applied steps.
            Created automatically (``CREATE TABLE IF NOT EXISTS``) when the
            runner is constructed.
        spark: Optional SparkSession; resolved through ``active_fallback``.
    """

    def __init__(
        self,
        history_table: str,
        *,
        spark: SparkSession | None = None,
    ) -> None:
        self._spark = active_fallback(spark)
        self._history_table = history_table
        self._steps: list[MigrationStep] = []
        # Issues SQL immediately, so constructing a runner needs a working session.
        self._ensure_history_table()

    def _ensure_history_table(self) -> None:
        """Create the history table if it does not exist yet."""
        self._spark.sql(
            f"""
            CREATE TABLE IF NOT EXISTS {self._history_table} (
                version STRING NOT NULL,
                description STRING,
                applied_at TIMESTAMP NOT NULL,
                success BOOLEAN NOT NULL,
                error_message STRING
            )
            USING DELTA
            """
        )

    def _applied_versions(self) -> set[str]:
        """Return the versions recorded in the history table with ``success = true``."""
        rows = self._spark.sql(
            f"SELECT version FROM {self._history_table} WHERE success = true"
        ).collect()
        return {row["version"] for row in rows}

    def _record(
        self,
        step: MigrationStep,
        *,
        success: bool,
        error_message: str | None = None,
    ) -> None:
        """Append one history row for ``step`` with the current UTC timestamp.

        Args:
            step: The step whose version and description are recorded.
            success: Whether the step completed successfully.
            error_message: Failure detail, or None for successful steps.
        """
        from pyspark.sql import Row

        row = Row(
            version=step.version,
            description=step.description,
            applied_at=datetime.now(tz=timezone.utc),
            success=success,
            error_message=error_message,
        )
        (
            self._spark.createDataFrame([row], schema=_HISTORY_SCHEMA)
            .write.format("delta")
            .mode("append")
            .saveAsTable(self._history_table)
        )

    def register(self, *steps: MigrationStep) -> MigrationRunner:
        """Add one or more migration steps. Returns self for chaining.

        Steps can be registered in any call order — execution order is always
        determined by ``version``, not registration order.
        """
        self._steps.extend(steps)
        return self

    def pending(self) -> list[MigrationStep]:
        """Return steps not yet applied, sorted by version."""
        applied = self._applied_versions()
        return sorted(
            (s for s in self._steps if s.version not in applied),
            key=lambda s: s.version,
        )

    def applied(self) -> list[str]:
        """Return successfully applied versions in sorted order."""
        return sorted(self._applied_versions())

    def run(self, *, dry_run: bool = False) -> MigrationRunResult:
        """Run all pending migration steps in version order.

        Steps already recorded as successful in the history table are skipped
        (idempotent). Execution halts on the first failure — the failed
        version is recorded with ``success=False`` and returned in
        :attr:`MigrationRunResult.failed`.

        Args:
            dry_run: Log what would run without calling ``up`` or writing
                history. ``pre_validate`` hooks are still invoked.

        Returns:
            MigrationRunResult summarising applied, skipped, and failed steps.

        Raises:
            MigrationError: If a step's pre-validation returns False. Nothing
                is written to the history table for that step.
        """
        # Imported at call time rather than at module import time.
        from databricks4py.migrations.validators import MigrationError

        result = MigrationRunResult(dry_run=dry_run)
        # Snapshot of applied versions taken once, before any step runs.
        applied_set = self._applied_versions()
        all_sorted = sorted(self._steps, key=lambda s: s.version)

        for step in all_sorted:
            if step.version in applied_set:
                result.skipped.append(step.version)
                logger.debug("Skipping already-applied step %s", step.version)
                continue

            logger.info("Running migration %s: %s", step.version, step.description)

            if step.pre_validate is not None and not step.pre_validate(self._spark):
                raise MigrationError(
                    step.version,
                    [f"Pre-validation failed for step {step.version}: {step.description}"],
                )

            if dry_run:
                logger.info("[dry-run] Would apply %s: %s", step.version, step.description)
                result.applied.append(step.version)
                continue

            try:
                step.up(self._spark)
            except Exception as exc:
                error_msg = str(exc)
                # The exception is swallowed here (reported via the result), so
                # log it with its traceback; otherwise the stack is lost.
                logger.exception("Migration %s failed: %s", step.version, error_msg)
                self._record(step, success=False, error_message=error_msg)
                result.failed = step.version
                return result

            if step.post_validate is not None and not step.post_validate(self._spark):
                error_msg = f"Post-validation failed for step {step.version}"
                logger.error(error_msg)
                self._record(step, success=False, error_message=error_msg)
                result.failed = step.version
                return result

            self._record(step, success=True)
            result.applied.append(step.version)
            logger.info("Applied migration %s", step.version)

        return result
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
"""Schema diff detection for Delta Lake table evolution."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import Literal
|
|
7
|
+
|
|
8
|
+
from pyspark.sql import DataFrame, SparkSession
|
|
9
|
+
from pyspark.sql.types import StructType
|
|
10
|
+
|
|
11
|
+
from databricks4py.spark_session import active_fallback
|
|
12
|
+
|
|
13
|
+
__all__ = ["ColumnChange", "SchemaDiff", "SchemaEvolutionError"]
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class SchemaEvolutionError(Exception):
    """Exception type used to signal a breaking schema change."""
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass(frozen=True)
class ColumnChange:
    """Immutable record of one column-level difference between two schemas."""

    # Name of the affected column.
    column: str
    # Kind of difference detected for the column.
    change_type: Literal["added", "removed", "type_changed", "nullable_changed"]
    # String form of the value before the change, when there is one.
    old_value: str | None = None
    # String form of the value after the change, when there is one.
    new_value: str | None = None
    # Impact level assigned to the change; defaults to "info".
    severity: Literal["info", "warning", "breaking"] = "info"
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class SchemaDiff:
    """Column-level comparison of a current schema against an incoming one.

    Columns are matched by exact name among the top-level fields of each
    schema. The computed change list is cached after the first call to
    :meth:`changes`.
    """

    def __init__(self, current: StructType, incoming: StructType) -> None:
        self._current = current
        self._incoming = incoming
        # None until changes() has been computed once.
        self._changes: list[ColumnChange] | None = None

    @classmethod
    def from_tables(
        cls,
        table_name: str,
        incoming_df: DataFrame,
        *,
        spark: SparkSession | None = None,
    ) -> SchemaDiff:
        """Build a diff from an existing table and an incoming DataFrame.

        Args:
            table_name: Fully qualified table whose schema is the "current" side.
            incoming_df: DataFrame whose schema is the proposed "incoming" side.
            spark: Optional SparkSession; resolved through ``active_fallback``.

        Returns:
            A new diff over the table's schema and the DataFrame's schema.
        """
        session = active_fallback(spark)
        return cls(
            current=session.read.table(table_name).schema,
            incoming=incoming_df.schema,
        )

    def changes(self) -> list[ColumnChange]:
        """Return the column-level changes, computing them on first use.

        Changes are listed as additions, then removals, then modifications of
        shared columns, each group sorted by column name. For a shared column
        a type difference takes precedence: a nullability difference is only
        reported when the types are equal.
        """
        cached = self._changes
        if cached is not None:
            return cached

        before = {f.name: f for f in self._current.fields}
        after = {f.name: f for f in self._incoming.fields}

        # Columns that exist only in the incoming schema.
        detected: list[ColumnChange] = [
            ColumnChange(
                column=col,
                change_type="added",
                new_value=str(after[col].dataType),
                severity="info",
            )
            for col in sorted(after.keys() - before.keys())
        ]

        # Columns that exist only in the current schema.
        detected.extend(
            ColumnChange(
                column=col,
                change_type="removed",
                old_value=str(before[col].dataType),
                severity="breaking",
            )
            for col in sorted(before.keys() - after.keys())
        )

        # Columns present on both sides: compare type first, then nullability.
        for col in sorted(before.keys() & after.keys()):
            old_field = before[col]
            new_field = after[col]
            if old_field.dataType != new_field.dataType:
                detected.append(
                    ColumnChange(
                        column=col,
                        change_type="type_changed",
                        old_value=str(old_field.dataType),
                        new_value=str(new_field.dataType),
                        severity="breaking",
                    )
                )
                continue
            if old_field.nullable != new_field.nullable:
                detected.append(
                    ColumnChange(
                        column=col,
                        change_type="nullable_changed",
                        old_value=str(old_field.nullable),
                        new_value=str(new_field.nullable),
                        severity="warning",
                    )
                )

        self._changes = detected
        return detected

    def has_breaking_changes(self) -> bool:
        """True if any change has ``severity='breaking'`` (column removal or type change)."""
        for change in self.changes():
            if change.severity == "breaking":
                return True
        return False

    def summary(self) -> str:
        """Return a human-readable table of all detected changes."""
        found = self.changes()
        if not found:
            return "No schema changes detected."

        rows = [
            f"{'Column':<30} {'Change':<20} {'Severity':<10} {'Details'}",
            "-" * 80,
        ]
        for change in found:
            old, new = change.old_value, change.new_value
            if old and new:
                details = f"{old} -> {new}"
            else:
                # Whichever side is present, or empty when neither is.
                details = new or old or ""
            rows.append(
                f"{change.column:<30} {change.change_type:<20} {change.severity:<10} {details}"
            )
        return "\n".join(rows)
|