databricks4py-0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. databricks4py/__init__.py +56 -0
  2. databricks4py/catalog.py +65 -0
  3. databricks4py/config/__init__.py +6 -0
  4. databricks4py/config/base.py +119 -0
  5. databricks4py/config/unity.py +72 -0
  6. databricks4py/filters/__init__.py +17 -0
  7. databricks4py/filters/base.py +154 -0
  8. databricks4py/io/__init__.py +40 -0
  9. databricks4py/io/checkpoint.py +98 -0
  10. databricks4py/io/dbfs.py +91 -0
  11. databricks4py/io/delta.py +564 -0
  12. databricks4py/io/merge.py +176 -0
  13. databricks4py/io/streaming.py +281 -0
  14. databricks4py/logging.py +39 -0
  15. databricks4py/metrics/__init__.py +22 -0
  16. databricks4py/metrics/base.py +66 -0
  17. databricks4py/metrics/delta_sink.py +75 -0
  18. databricks4py/metrics/logging_sink.py +20 -0
  19. databricks4py/migrations/__init__.py +27 -0
  20. databricks4py/migrations/alter.py +114 -0
  21. databricks4py/migrations/runner.py +241 -0
  22. databricks4py/migrations/schema_diff.py +136 -0
  23. databricks4py/migrations/validators.py +195 -0
  24. databricks4py/observability/__init__.py +24 -0
  25. databricks4py/observability/_utils.py +24 -0
  26. databricks4py/observability/batch_context.py +134 -0
  27. databricks4py/observability/health.py +223 -0
  28. databricks4py/observability/query_listener.py +236 -0
  29. databricks4py/py.typed +0 -0
  30. databricks4py/quality/__init__.py +26 -0
  31. databricks4py/quality/base.py +54 -0
  32. databricks4py/quality/expectations.py +184 -0
  33. databricks4py/quality/gate.py +90 -0
  34. databricks4py/retry.py +102 -0
  35. databricks4py/secrets.py +69 -0
  36. databricks4py/spark_session.py +68 -0
  37. databricks4py/testing/__init__.py +35 -0
  38. databricks4py/testing/assertions.py +111 -0
  39. databricks4py/testing/builders.py +127 -0
  40. databricks4py/testing/fixtures.py +134 -0
  41. databricks4py/testing/mocks.py +106 -0
  42. databricks4py/testing/temp_table.py +73 -0
  43. databricks4py/workflow.py +219 -0
  44. databricks4py-0.2.0.dist-info/METADATA +589 -0
  45. databricks4py-0.2.0.dist-info/RECORD +48 -0
  46. databricks4py-0.2.0.dist-info/WHEEL +5 -0
  47. databricks4py-0.2.0.dist-info/licenses/LICENSE +21 -0
  48. databricks4py-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,91 @@
1
+ """DBFS file operations via dbutils."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from typing import Any
7
+
8
+ __all__ = [
9
+ "copy_from_remote",
10
+ "inject_dbutils_module",
11
+ "ls",
12
+ "mkdirs",
13
+ "mv",
14
+ "rm",
15
+ "_set_dbutils_module",
16
+ ]
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+ _dbutils_module: Any = None
21
+
22
+
23
def _set_dbutils_module(dbutils_module: Any) -> None:
    """Store the dbutils module used by every DBFS helper in this file.

    Internal hook; the stored object is later asked for ``.DBUtils(spark)``
    by ``_get_dbutils``.

    Args:
        dbutils_module: Module-like object exposing a ``DBUtils`` callable.
    """
    global _dbutils_module

    _dbutils_module = dbutils_module
    logger.debug("Injected dbutils module for DBFS: %s", dbutils_module)


# Public alias for the injector above.
inject_dbutils_module = _set_dbutils_module
31
+
32
+
33
def _get_dbutils() -> Any:
    """Build a ``DBUtils`` instance from the injected module.

    Returns:
        The result of ``<injected module>.DBUtils(<active SparkSession>)``.

    Raises:
        RuntimeError: If no dbutils module has been injected yet.
    """
    module = _dbutils_module
    if module is not None:
        # Imported lazily so the module can be imported without pyspark.
        from pyspark.sql import SparkSession

        session = SparkSession.getActiveSession()
        return module.DBUtils(session)

    raise RuntimeError("dbutils module not injected. Call inject_dbutils_module() first.")
42
+
43
+
44
def copy_from_remote(
    remote_path: str,
    local_path: str,
    recurse: bool = False,
) -> bool:
    """Copy ``remote_path`` to ``local_path`` using ``dbutils.fs.cp``.

    Args:
        remote_path: Source path (for example ``abfss://...`` or ``dbfs:/...``).
        local_path: Destination path.
        recurse: Forwarded to ``dbutils.fs.cp`` as the ``recurse`` keyword.

    Returns:
        Whatever ``dbutils.fs.cp`` returns (annotated as ``bool``).

    Raises:
        RuntimeError: If dbutils has not been injected.
    """
    handle = _get_dbutils()
    logger.info("Copying %s -> %s (recurse=%s)", remote_path, local_path, recurse)
    outcome = handle.fs.cp(remote_path, local_path, recurse=recurse)
    return outcome
65
+
66
+
67
def ls(path: str) -> list:
    """Return the ``dbutils.fs.ls`` listing for ``path``.

    Raises:
        RuntimeError: If dbutils has not been injected.
    """
    return _get_dbutils().fs.ls(path)
71
+
72
+
73
def mv(source: str, dest: str, *, recurse: bool = False) -> None:
    """Move ``source`` to ``dest`` through ``dbutils.fs.mv``.

    Args:
        source: Path to move from.
        dest: Path to move to.
        recurse: Passed positionally as the third argument of ``dbutils.fs.mv``.

    Raises:
        RuntimeError: If dbutils has not been injected.
    """
    handle = _get_dbutils()
    logger.info("Moving %s → %s (recurse=%s)", source, dest, recurse)
    handle.fs.mv(source, dest, recurse)
78
+
79
+
80
def rm(path: str, *, recurse: bool = False) -> None:
    """Delete ``path`` through ``dbutils.fs.rm``.

    Args:
        path: File or directory to remove.
        recurse: Passed positionally as the second argument of ``dbutils.fs.rm``.

    Raises:
        RuntimeError: If dbutils has not been injected.
    """
    handle = _get_dbutils()
    logger.info("Removing %s (recurse=%s)", path, recurse)
    handle.fs.rm(path, recurse)
85
+
86
+
87
def mkdirs(path: str) -> None:
    """Create ``path`` through ``dbutils.fs.mkdirs``.

    Raises:
        RuntimeError: If dbutils has not been injected.
    """
    handle = _get_dbutils()
    logger.info("Creating directory %s", path)
    handle.fs.mkdirs(path)
@@ -0,0 +1,564 @@
1
+ """Delta Lake table management utilities."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from collections.abc import Sequence
7
+ from dataclasses import dataclass
8
+ from typing import TYPE_CHECKING, Any
9
+
10
+ from pyspark.sql import DataFrame, SparkSession
11
+ from pyspark.sql.types import StructType
12
+
13
+ from databricks4py.spark_session import active_fallback
14
+
15
+ if TYPE_CHECKING:
16
+ from databricks4py.io.merge import MergeBuilder, MergeResult
17
+ from databricks4py.metrics.base import MetricsSink
18
+
19
+ __all__ = [
20
+ "DeltaTable",
21
+ "DeltaTableAppender",
22
+ "DeltaTableOverwriter",
23
+ "GeneratedColumn",
24
+ "optimize_table",
25
+ "vacuum_table",
26
+ ]
27
+
28
+ logger = logging.getLogger(__name__)
29
+
30
+
31
@dataclass(frozen=True)
class GeneratedColumn:
    """Immutable description of a Delta Lake generated column.

    ``DeltaTable`` looks these up by ``name`` while building a table and, for
    a matching schema field, passes ``data_type``, ``expression`` (as
    ``generatedAlwaysAs``) and ``comment`` to the Delta table builder.

    Args:
        name: Column name; matched against schema field names.
        data_type: Spark SQL type string (e.g. ``"DATE"``, ``"STRING"``).
        expression: SQL expression that produces the column value.
        comment: Optional column comment.
    """

    # Required fields, in positional order.
    name: str
    data_type: str
    expression: str
    # Optional free-text comment; omitted when None.
    comment: str | None = None
49
+
50
+
51
class DeltaTable:
    """Managed Delta Lake table with structured creation and access.

    Wraps the delta-spark API to provide:
    - Automatic table creation with schema, partitioning, and generated columns
    - Convenient read/write/metadata operations
    - Table replacement via rename for migrations

    When ``location`` is given, reads, writes and metadata lookups address the
    table by path; otherwise they address it by ``table_name``.

    Example::

        from pyspark.sql.types import StructType, StructField, StringType, IntegerType

        schema = StructType([
            StructField("id", IntegerType()),
            StructField("name", StringType()),
        ])

        table = DeltaTable(
            table_name="catalog.schema.users",
            schema=schema,
            location="/data/users",
            partition_by=["id"],
        )

        df = table.dataframe()
        table.write(df, mode="append")

    Args:
        table_name: Fully qualified table name.
        schema: PySpark StructType, or a ``{column: type string}`` mapping.
        location: Optional physical storage location.
        partition_by: Optional column(s) to partition by.
        generated_columns: Optional generated column definitions.
        spark: Optional SparkSession.
    """

    def __init__(
        self,
        table_name: str,
        schema: StructType | dict[str, str],
        *,
        location: str | None = None,
        partition_by: str | Sequence[str] | None = None,
        generated_columns: Sequence[GeneratedColumn] | None = None,
        spark: SparkSession | None = None,
    ) -> None:
        self._spark = active_fallback(spark)
        self._table_name = table_name
        self._schema = self._resolve_schema(schema)
        self._location = location
        self._generated_columns = list(generated_columns or [])

        # A bare string is a single partition column, not an iterable of chars.
        if isinstance(partition_by, str):
            self._partition_by = [partition_by]
        else:
            self._partition_by = list(partition_by or [])

        # Constructing the wrapper creates the table when it is missing.
        self._ensure_table_exists()

    @staticmethod
    def _resolve_schema(schema: StructType | dict[str, str]) -> StructType:
        """Return ``schema`` as a StructType.

        A dict is converted field by field (all fields nullable); anything
        else is returned unchanged.
        """
        if isinstance(schema, dict):
            from pyspark.sql import types as T

            fields = []
            for name, type_str in schema.items():
                spark_type = T._parse_datatype_string(type_str)
                fields.append(T.StructField(name, spark_type, nullable=True))
            return T.StructType(fields)
        return schema

    @property
    def table_name(self) -> str:
        """The fully qualified table name."""
        return self._table_name

    def _delta_table(self) -> Any:
        """Resolve the underlying delta-spark table handle.

        Uses ``forPath`` when a location was configured and ``forName``
        otherwise, so every operation addresses the table the same way.
        """
        from delta.tables import DeltaTable as _DeltaTable

        if self._location:
            return _DeltaTable.forPath(self._spark, self._location)
        return _DeltaTable.forName(self._spark, self._table_name)

    def _table_exists(self) -> bool:
        """Return True when the table can be resolved as a Delta table."""
        from pyspark.errors import AnalysisException

        try:
            self._delta_table()
        except AnalysisException:
            return False
        return True

    def _ensure_table_exists(self) -> None:
        """Create the table if it doesn't exist."""
        if self._table_exists():
            logger.debug("Table %s already exists", self._table_name)
        else:
            self._create_table()

    def _create_table(self) -> None:
        """Create the Delta table using the builder API."""
        from delta.tables import DeltaTable as _DeltaTable

        builder = _DeltaTable.createIfNotExists(self._spark).tableName(self._table_name)

        if self._location:
            builder = builder.location(self._location)

        # Generated columns are only applied to fields that appear in the
        # schema; surface any definition that would otherwise be dropped
        # without a trace.
        schema_names = {field.name for field in self._schema.fields}
        unmatched = [gc.name for gc in self._generated_columns if gc.name not in schema_names]
        if unmatched:
            logger.warning(
                "Generated columns not present in the schema of %s are ignored: %s",
                self._table_name,
                ", ".join(unmatched),
            )

        # Add columns from schema
        for field in self._schema.fields:
            gen_col = self._find_generated_column(field.name)
            if gen_col:
                builder = builder.addColumn(
                    field.name,
                    gen_col.data_type,
                    generatedAlwaysAs=gen_col.expression,
                    comment=gen_col.comment,
                )
            else:
                builder = builder.addColumn(
                    field.name,
                    field.dataType,
                    comment=field.metadata.get("comment") if field.metadata else None,
                )

        if self._partition_by:
            builder = builder.partitionedBy(*self._partition_by)

        builder.execute()
        logger.info("Created table %s", self._table_name)

    def _find_generated_column(self, name: str) -> GeneratedColumn | None:
        """Find a generated column definition by name (first match wins)."""
        return next((gc for gc in self._generated_columns if gc.name == name), None)

    def dataframe(self) -> DataFrame:
        """Read the table as a DataFrame.

        Returns:
            The table contents as a PySpark DataFrame.
        """
        if self._location:
            return self._spark.read.format("delta").load(self._location)
        return self._spark.read.table(self._table_name)

    def write(self, df: DataFrame, mode: str = "append", *, schema_check: bool = True) -> None:
        """Write a DataFrame to the table.

        Args:
            df: The DataFrame to write.
            mode: Write mode (``"append"`` or ``"overwrite"``).
            schema_check: If True, validates schema compatibility before writing.

        Raises:
            SchemaEvolutionError: If ``schema_check`` is enabled, the table
                exists, and the schema diff reports breaking changes.
        """
        if schema_check and self._table_exists():
            from databricks4py.migrations.schema_diff import SchemaDiff, SchemaEvolutionError

            diff = SchemaDiff.from_tables(self._table_name, df, spark=self._spark)
            if diff.has_breaking_changes():
                raise SchemaEvolutionError(
                    f"Breaking schema changes detected for {self._table_name}:\n{diff.summary()}"
                )

        writer = df.write.format("delta").mode(mode)

        if self._partition_by:
            writer = writer.partitionBy(*self._partition_by)

        if self._location:
            writer.save(self._location)
        else:
            writer.saveAsTable(self._table_name)

        logger.info("Wrote to %s (mode=%s)", self._table_name, mode)

    def detail(self) -> DataFrame:
        """Get Delta table metadata.

        Returns:
            A DataFrame with table detail (location, partitions, size, etc.).
        """
        return self._delta_table().detail()

    def location(self) -> str:
        """Get the physical storage location of the table ("" if unavailable)."""
        row = self.detail().select("location").first()
        return row["location"] if row else ""

    def size_in_bytes(self) -> int:
        """Get the table size in bytes (0 if unavailable)."""
        row = self.detail().select("sizeInBytes").first()
        return row["sizeInBytes"] if row else 0

    def partition_columns(self) -> list[str]:
        """Get the partition columns of the table (empty if unavailable)."""
        row = self.detail().select("partitionColumns").first()
        return list(row["partitionColumns"]) if row else []

    def merge(
        self,
        source: DataFrame,
        *,
        metrics_sink: MetricsSink | None = None,
    ) -> MergeBuilder:
        """Start a fluent MERGE INTO operation against this table.

        Args:
            source: Source DataFrame to merge from.
            metrics_sink: Optional sink for merge metrics.

        Returns:
            A MergeBuilder for chaining merge conditions.
        """
        from databricks4py.io.merge import MergeBuilder as _MergeBuilder

        self._ensure_table_exists()
        return _MergeBuilder(self._table_name, source, self._spark, metrics_sink=metrics_sink)

    def upsert(
        self,
        source: DataFrame,
        keys: list[str],
        *,
        update_columns: list[str] | None = None,
        metrics_sink: MetricsSink | None = None,
    ) -> MergeResult:
        """Upsert (update existing, insert new) rows by key columns.

        Args:
            source: Source DataFrame.
            keys: Columns to match on.
            update_columns: Specific columns to update on match. If None, updates all.
            metrics_sink: Optional sink for merge metrics.

        Returns:
            MergeResult with insert/update/delete counts.
        """
        return (
            self.merge(source, metrics_sink=metrics_sink)
            .on(*keys)
            .when_matched_update(update_columns)
            .when_not_matched_insert()
            .execute()
        )

    def scd_type2(
        self,
        source: DataFrame,
        keys: list[str],
        *,
        effective_date_col: str = "effective_date",
        end_date_col: str = "end_date",
        active_col: str = "is_active",
        metrics_sink: MetricsSink | None = None,
    ) -> MergeResult:
        """Apply an SCD Type 2 style merge against this table.

        The source is extended with ``effective_date_col`` (current
        timestamp), ``end_date_col`` (NULL) and ``active_col`` (True), then
        merged on the key columns restricted to active target rows:

        - every matched active target row gets ``end_date_col`` set to the
          current timestamp and ``active_col`` set to False;
        - source rows with no matching active target row are inserted.

        NOTE(review): a source row that matches an active target row only
        expires that row; its new version is not inserted by this single
        merge, and matched rows are expired whether or not their values
        changed. Confirm whether callers are expected to insert the new
        versions separately.

        Args:
            source: Incoming DataFrame (without SCD metadata columns).
            keys: Business key columns.
            effective_date_col: Column name for the effective date.
            end_date_col: Column name for the end date.
            active_col: Column name for the active flag.
            metrics_sink: Optional sink for merge metrics.

        Returns:
            MergeResult with insert/update/delete counts taken from the most
            recent table history entry; all zeros when no history is found.
        """
        from pyspark.sql import functions as F

        from databricks4py.io.merge import MergeResult as _MergeResult

        self._ensure_table_exists()

        staged = (
            source.withColumn(effective_date_col, F.current_timestamp())
            .withColumn(end_date_col, F.lit(None).cast("timestamp"))
            .withColumn(active_col, F.lit(True))
        )

        key_conds = [f"target.{k} = source.{k}" for k in keys]
        key_conds.append(f"target.{active_col} = true")
        condition = " AND ".join(key_conds)

        # Resolve by path or by name exactly like the other operations do.
        target_dt = self._delta_table()
        (
            target_dt.alias("target")
            .merge(staged.alias("source"), condition)
            .whenMatchedUpdate(
                set={
                    end_date_col: "current_timestamp()",
                    active_col: "false",
                }
            )
            .whenNotMatchedInsertAll()
            .execute()
        )

        # Latest history entry carries the operation metrics of the merge.
        rows = target_dt.history(1).collect()
        if not rows:
            # Honour the declared return type instead of returning None.
            return _MergeResult(rows_inserted=0, rows_updated=0, rows_deleted=0)
        metrics: dict[str, str] = rows[0]["operationMetrics"] or {}

        result = _MergeResult(
            rows_inserted=int(metrics.get("numTargetRowsInserted", 0)),
            rows_updated=int(metrics.get("numTargetRowsUpdated", 0)),
            rows_deleted=int(metrics.get("numTargetRowsDeleted", 0)),
        )

        if metrics_sink:
            from datetime import datetime, timezone

            from databricks4py.metrics.base import MetricEvent

            event = MetricEvent(
                job_name="scd_type2",
                event_type="merge_complete",
                timestamp=datetime.now(tz=timezone.utc),
                row_count=result.rows_inserted + result.rows_updated,
                table_name=self._table_name,
                metadata={
                    "rows_inserted": result.rows_inserted,
                    "rows_updated": result.rows_updated,
                    "rows_deleted": result.rows_deleted,
                },
            )
            metrics_sink.emit(event)

        return result

    def replace_data(
        self,
        replacement_table_name: str,
        recovery_table_name: str,
    ) -> None:
        """Replace this table with another table via two renames.

        Performs a two-step swap:
        1. Rename current table to recovery name (backup)
        2. Rename replacement table to current name

        The two renames are separate statements, so the swap as a whole is
        not atomic. If step 2 fails, the backup is renamed back to the
        original name (best effort) and the original error is re-raised.

        Args:
            replacement_table_name: The table containing new data.
            recovery_table_name: Name for the backup of current data.
        """
        logger.info(
            "Replacing %s with %s (recovery: %s)",
            self._table_name,
            replacement_table_name,
            recovery_table_name,
        )
        self._spark.sql(f"ALTER TABLE {self._table_name} RENAME TO {recovery_table_name}")
        try:
            self._spark.sql(f"ALTER TABLE {replacement_table_name} RENAME TO {self._table_name}")
        except Exception:
            # Step 1 already succeeded: without a rollback nothing would be
            # left under the primary table name.
            logger.exception(
                "Failed to rename %s to %s; restoring from %s",
                replacement_table_name,
                self._table_name,
                recovery_table_name,
            )
            try:
                self._spark.sql(
                    f"ALTER TABLE {recovery_table_name} RENAME TO {self._table_name}"
                )
            except Exception:
                logger.exception(
                    "Rollback failed; original data remains in %s", recovery_table_name
                )
            raise
        logger.info("Table replacement complete")

    @classmethod
    def from_parquet(
        cls,
        *paths: str,
        table_name: str,
        schema: StructType,
        location: str | None = None,
        partition_by: str | Sequence[str] | None = None,
        generated_columns: Sequence[GeneratedColumn] | None = None,
        spark: SparkSession | None = None,
    ) -> DeltaTable:
        """Create a DeltaTable by loading data from Parquet files.

        The table is created if needed and then overwritten with the data.

        Args:
            *paths: One or more Parquet file/directory paths.
            table_name: Target table name.
            schema: Table schema.
            location: Optional storage location.
            partition_by: Optional partition columns.
            generated_columns: Optional generated columns.
            spark: Optional SparkSession.

        Returns:
            A DeltaTable with the loaded data.
        """
        _spark = active_fallback(spark)
        df = _spark.read.schema(schema).parquet(*paths)

        table = cls(
            table_name=table_name,
            schema=schema,
            location=location,
            partition_by=partition_by,
            generated_columns=generated_columns,
            spark=_spark,
        )
        table.write(df, mode="overwrite")
        return table

    @classmethod
    def from_data(
        cls,
        data: list[dict[str, Any]] | list[tuple],
        *,
        table_name: str,
        schema: StructType,
        location: str | None = None,
        partition_by: str | Sequence[str] | None = None,
        generated_columns: Sequence[GeneratedColumn] | None = None,
        spark: SparkSession | None = None,
    ) -> DeltaTable:
        """Create a DeltaTable from in-memory data.

        The table is created if needed and then overwritten with the data.

        Args:
            data: List of dicts or tuples.
            table_name: Target table name.
            schema: Table schema.
            location: Optional storage location.
            partition_by: Optional partition columns.
            generated_columns: Optional generated columns.
            spark: Optional SparkSession.

        Returns:
            A DeltaTable with the loaded data.
        """
        _spark = active_fallback(spark)
        df = _spark.createDataFrame(data, schema=schema)

        table = cls(
            table_name=table_name,
            schema=schema,
            location=location,
            partition_by=partition_by,
            generated_columns=generated_columns,
            spark=_spark,
        )
        table.write(df, mode="overwrite")
        return table

    def __repr__(self) -> str:
        return f"DeltaTable({self._table_name!r})"
498
+
499
+
500
class DeltaTableAppender(DeltaTable):
    """DeltaTable variant exposing an ``append()`` shortcut."""

    def append(self, df: DataFrame) -> None:
        """Write ``df`` to the table in ``"append"`` mode.

        Equivalent to ``write(df, mode="append")``, including the default
        schema check performed by ``write``.

        Args:
            df: The DataFrame to append.
        """
        write_mode = "append"
        self.write(df, mode=write_mode)
510
+
511
+
512
class DeltaTableOverwriter(DeltaTable):
    """DeltaTable variant exposing an ``overwrite()`` shortcut."""

    def overwrite(self, df: DataFrame) -> None:
        """Write ``df`` to the table in ``"overwrite"`` mode.

        Equivalent to ``write(df, mode="overwrite")``, including the default
        schema check performed by ``write``.

        Args:
            df: The DataFrame to write.
        """
        write_mode = "overwrite"
        self.write(df, mode=write_mode)
522
+
523
+
524
def optimize_table(
    table_name: str,
    *,
    zorder_by: str | Sequence[str] | None = None,
    spark: SparkSession | None = None,
) -> None:
    """Issue an ``OPTIMIZE`` statement for a Delta table.

    The table and column names are interpolated into the SQL text as given.

    Args:
        table_name: The table to optimize.
        zorder_by: Optional column name, or sequence of names, for a
            ``ZORDER BY`` clause. Falsy values add no clause.
        spark: Optional SparkSession.
    """
    session = active_fallback(spark)
    clauses = [f"OPTIMIZE {table_name}"]

    if zorder_by:
        # A bare string is one column, not a sequence of characters.
        if isinstance(zorder_by, str):
            columns = [zorder_by]
        else:
            columns = list(zorder_by)
        clauses.append(f"ZORDER BY ({', '.join(columns)})")

    statement = " ".join(clauses)
    logger.info("Running: %s", statement)
    session.sql(statement)
546
+
547
+
548
def vacuum_table(
    table_name: str,
    *,
    retention_hours: int = 168,
    spark: SparkSession | None = None,
) -> None:
    """Issue a ``VACUUM ... RETAIN n HOURS`` statement for a Delta table.

    Args:
        table_name: The table to vacuum.
        retention_hours: Value for the ``RETAIN`` clause, in hours
            (default 168 = 7 days).
        spark: Optional SparkSession.
    """
    session = active_fallback(spark)
    statement = f"VACUUM {table_name} RETAIN {retention_hours} HOURS"
    logger.info("Running: %s", statement)
    session.sql(statement)