delta-explain 0.4.0__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- delta_explain/__init__.py +177 -0
- delta_explain-0.4.0.data/scripts/delta-explain.exe +0 -0
- delta_explain-0.4.0.dist-info/METADATA +437 -0
- delta_explain-0.4.0.dist-info/RECORD +7 -0
- delta_explain-0.4.0.dist-info/WHEEL +4 -0
- delta_explain-0.4.0.dist-info/licenses/LICENSE +21 -0
- delta_explain-0.4.0.dist-info/sboms/delta-explain.cyclonedx.json +10411 -0
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
"""Thin Python wrapper around the delta-explain CLI.
|
|
2
|
+
|
|
3
|
+
The wheel ships the compiled binary; this module invokes it and returns the
|
|
4
|
+
schema-versioned JSON report as a `Report`. One contract: everything the
|
|
5
|
+
module exposes is documented in docs/json-schema.md and guaranteed by
|
|
6
|
+
docs/semantics.md in the repository.
|
|
7
|
+
|
|
8
|
+
from delta_explain import explain
|
|
9
|
+
|
|
10
|
+
report = explain("s3://warehouse/events",
|
|
11
|
+
where="country = 'DE' AND age > 40",
|
|
12
|
+
min_pruning=80, env_creds=True)
|
|
13
|
+
report.passed # False also makes the CLI exit 1 in CI
|
|
14
|
+
report.total_pruning_pct
|
|
15
|
+
report["analysis"]["confidence"]
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import json
|
|
21
|
+
import shutil
|
|
22
|
+
import subprocess
|
|
23
|
+
import sysconfig
|
|
24
|
+
from pathlib import Path
|
|
25
|
+
from typing import Any, Mapping, Optional, Sequence, Union
|
|
26
|
+
|
|
27
|
+
__all__ = ["explain", "Report", "DeltaExplainError", "binary_path"]
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class DeltaExplainError(RuntimeError):
    """Raised when delta-explain itself fails at runtime.

    Typical causes are an unreadable table, a bad predicate, or a storage
    error.

    A failed gate (--min-pruning / --assert-stats) does NOT raise this
    exception. The caller instead receives a Report whose `passed` is
    False, which mirrors the CLI's exit-code contract (report on stdout,
    exit 1).
    """
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class Report(Mapping[str, Any]):
    """The JSON report; a read-only mapping plus convenience accessors.

    The instance wraps the decoded JSON document without copying it. Item
    access (``report["analysis"]["confidence"]``), iteration and ``len()``
    are delegated to that document; the properties below are shortcuts for
    the most commonly used top-level fields.
    """

    def __init__(self, raw: dict):
        """Wrap `raw`, the decoded JSON report (stored by reference)."""
        self._raw = raw

    # Mapping protocol: report["analysis"]["confidence"] etc.
    def __getitem__(self, key: str) -> Any:
        return self._raw[key]

    def __iter__(self):
        return iter(self._raw)

    def __len__(self) -> int:
        return len(self._raw)

    @property
    def raw(self) -> dict:
        """The underlying decoded JSON document (the same object, not a copy)."""
        return self._raw

    @property
    def schema_version(self) -> str:
        """The report's `schema_version` field."""
        return self._raw["schema_version"]

    @property
    def total_files(self) -> int:
        """The report's `total_files` field."""
        return self._raw["total_files"]

    @property
    def final_files(self) -> int:
        """The report's `final_files` field."""
        return self._raw["final_files"]

    @property
    def total_pruning_pct(self) -> float:
        """The report's `total_pruning_pct` field."""
        return self._raw["total_pruning_pct"]

    @property
    def result(self) -> Optional[str]:
        """"pass" / "fail" from the gates, or None when no gate ran.

        A report that omits the `result` key altogether is treated the
        same as one carrying an explicit null: no gate ran.
        """
        return self._raw.get("result")

    @property
    def passed(self) -> bool:
        """True unless a gate failed. No gates counts as passed."""
        # .get(): an absent `result` means "no gate ran", which must count
        # as passed rather than raise KeyError.
        return self._raw.get("result") != "fail"

    @property
    def files(self) -> Optional[list]:
        """Per-file outcomes; present only when verbose=True was requested."""
        return self._raw.get("files")

    def __repr__(self) -> str:  # pragma: no cover
        # A repr must never raise (debuggers and loggers call it on
        # whatever they are handed), so missing or non-numeric fields are
        # rendered as "?" instead of propagating KeyError / ValueError.
        get = self._raw.get
        pct = get("total_pruning_pct")
        pruned = f"{pct:.0f}" if isinstance(pct, (int, float)) else "?"
        return (
            f"Report(files={get('total_files', '?')}->{get('final_files', '?')}, "
            f"pruned={pruned}%, result={get('result')!r})"
        )
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def _scripts_dirs() -> list:
    """Return the scripts directories to probe for the bundled binary.

    The interpreter's default install scheme comes first, followed by the
    per-user scheme (where `pip install --user` places scripts) when it
    can be resolved and differs from the default.
    """
    import os  # function-scope import: only needed for the pre-3.10 fallback

    dirs = [sysconfig.get_path("scripts")]
    # sysconfig.get_preferred_scheme() exists on Python 3.10+; on 3.9 fall
    # back to the conventional "<os.name>_user" scheme name.
    preferred = getattr(sysconfig, "get_preferred_scheme", None)
    try:
        user_scheme = preferred("user") if preferred is not None else f"{os.name}_user"
        if user_scheme in sysconfig.get_scheme_names():
            user_dir = sysconfig.get_path("scripts", user_scheme)
            if user_dir and user_dir not in dirs:
                dirs.append(user_dir)
    except (AttributeError, KeyError, ValueError):
        # The user scheme cannot be resolved on this interpreter; the
        # default scheme and the PATH fallback still apply.
        pass
    return dirs


def binary_path() -> str:
    """The delta-explain binary this module will invoke.

    The one shipped inside this wheel (the scripts directory) wins; PATH is
    the fallback for source checkouts and custom setups. Both the default
    and the per-user scripts directories are probed, so a wheel installed
    with `pip install --user` is found even when the user scripts directory
    is not on PATH.

    Returns:
        The path of the first match, as a string.

    Raises:
        DeltaExplainError: if no binary is found in a scripts directory or
            on PATH.
    """
    for scripts_dir in _scripts_dirs():
        bundled = Path(scripts_dir) / "delta-explain"
        # The bare name is tried first, then the Windows ".exe" variant.
        for candidate in (bundled, bundled.with_suffix(".exe")):
            if candidate.is_file():
                return str(candidate)
    on_path = shutil.which("delta-explain")
    if on_path:
        return on_path
    raise DeltaExplainError(
        "delta-explain binary not found (neither bundled in this "
        "environment's scripts directory nor on PATH)"
    )
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def explain(
    table: str,
    *,
    where: Optional[str] = None,
    min_pruning: Optional[float] = None,
    assert_stats: bool = False,
    at_version: Optional[int] = None,
    verbose: bool = False,
    limit: Optional[int] = None,
    env_creds: bool = False,
    profile: Optional[str] = None,
    region: Optional[str] = None,
    public: bool = False,
    options: Optional[Mapping[str, str]] = None,
    binary: Optional[Union[str, Sequence[str]]] = None,
) -> Report:
    """Run delta-explain against `table` and return the JSON report.

    Keyword arguments mirror the CLI flags one to one; `options` becomes
    repeated `--option KEY=VALUE` pairs. Gate failures return a Report with
    `passed == False`; runtime errors raise DeltaExplainError with the
    CLI's stderr message.

    Args:
        table: Table location, passed to the CLI as its positional argument.
        where: Predicate for `--where`.
        min_pruning: Threshold for the `--min-pruning` gate.
        assert_stats: Adds the `--assert-stats` gate when true.
        at_version: Table version for `--at-version`.
        verbose: Adds `--verbose` when true.
        limit: Cap for `--limit`.
        env_creds: Adds `--env-creds` when true.
        profile: Value for `--profile`.
        region: Value for `--region`.
        public: Adds `--public` when true.
        options: Mapping emitted as one `--option KEY=VALUE` per item.
        binary: Launcher override: a single executable path, or a full
            argv prefix (e.g. ``["docker", "run", ...]``). Defaults to
            `binary_path()`.

    Returns:
        The parsed report, whenever the CLI wrote one to stdout.

    Raises:
        DeltaExplainError: if the CLI wrote no report, or if its stdout is
            not a JSON object.
        OSError: propagated unchanged from `subprocess.run` when the
            launcher itself cannot be started.
    """
    launcher: Sequence[str]
    if binary is None:
        launcher = [binary_path()]
    elif isinstance(binary, str):
        launcher = [binary]
    else:
        launcher = list(binary)

    argv: list[str] = [*launcher, table, "--format", "json"]
    if where is not None:
        argv += ["--where", where]
    if min_pruning is not None:
        argv += ["--min-pruning", str(min_pruning)]
    if assert_stats:
        argv += ["--assert-stats"]
    if at_version is not None:
        argv += ["--at-version", str(at_version)]
    if verbose:
        argv += ["--verbose"]
    if limit is not None:
        argv += ["--limit", str(limit)]
    if env_creds:
        argv += ["--env-creds"]
    if profile is not None:
        argv += ["--profile", profile]
    if region is not None:
        argv += ["--region", region]
    if public:
        argv += ["--public"]
    for key, value in (options or {}).items():
        argv += ["--option", f"{key}={value}"]

    # Decode explicitly as UTF-8 (assumed to be what the CLI emits —
    # TODO confirm): text=True alone would use the locale encoding (a
    # legacy code page on Windows), which can corrupt or reject
    # non-ASCII paths and predicates in the report.
    proc = subprocess.run(
        argv, capture_output=True, encoding="utf-8", errors="replace"
    )

    # The CLI contract (docs/semantics.md): stdout is a complete report or
    # empty. Exit 1 with a report is a gate failure; exit 1 with empty
    # stdout is a runtime error; exit 2 is a usage error.
    if not proc.stdout.strip():
        raise DeltaExplainError(proc.stderr.strip() or f"exit code {proc.returncode}")

    # Output that violates the contract (truncated, or not JSON at all)
    # is surfaced as the documented error type, not a JSONDecodeError.
    try:
        payload = json.loads(proc.stdout)
    except json.JSONDecodeError as err:
        detail = proc.stderr.strip() or str(err)
        raise DeltaExplainError(
            f"delta-explain produced unparseable output "
            f"(exit code {proc.returncode}): {detail}"
        ) from err
    if not isinstance(payload, dict):
        raise DeltaExplainError(
            f"delta-explain produced a JSON {type(payload).__name__}, "
            f"expected an object (exit code {proc.returncode})"
        )
    return Report(payload)
|
|
Binary file
|
|
@@ -0,0 +1,437 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: delta-explain
|
|
3
|
+
Version: 0.4.0
|
|
4
|
+
Classifier: Development Status :: 4 - Beta
|
|
5
|
+
Classifier: Intended Audience :: Developers
|
|
6
|
+
Classifier: Programming Language :: Rust
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Classifier: Topic :: Database
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Summary: Make Delta Lake pruning visible: partition pruning and data skipping diagnostics, as a CLI and a thin Python wrapper around it.
|
|
11
|
+
Keywords: delta-lake,pruning,data-skipping,observability
|
|
12
|
+
Home-Page: https://github.com/cdelmonte-zg/delta-explain
|
|
13
|
+
Author: Christian Del Monte
|
|
14
|
+
Requires-Python: >=3.9
|
|
15
|
+
Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
|
|
16
|
+
Project-URL: Documentation, https://github.com/cdelmonte-zg/delta-explain/tree/main/docs
|
|
17
|
+
Project-URL: Repository, https://github.com/cdelmonte-zg/delta-explain
|
|
18
|
+
|
|
19
|
+
# delta-explain
|
|
20
|
+
|
|
21
|
+
**Make Delta pruning visible.**
|
|
22
|
+
|
|
23
|
+
A CLI that shows how partition pruning and data skipping reduce the set of candidate files in a Delta table.
|
|
24
|
+
|
|
25
|
+
Production-usable as a conservative Delta metadata diagnostic and CI guardrail — not yet a fully production-grade general-purpose Delta observability product. That line is meant literally: what the tool guarantees, and what it deliberately does not, is written down in [docs/semantics.md](docs/semantics.md).
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
**Documentation**: [three-minute quickstart](examples/quickstart/) - [what delta-explain guarantees (and what it does not)](docs/semantics.md) - [the JSON report, field by field](docs/json-schema.md) - [what it is validated against](docs/validation.md) - [current limitations](#current-limitations)
|
|
29
|
+
|
|
30
|
+
## The problem
|
|
31
|
+
|
|
32
|
+
You run a query with a filter. The engine reads some files. But how many files were actually eliminated, and *why*?
|
|
33
|
+
|
|
34
|
+
Delta Lake uses two mechanisms to skip files before reading data:
|
|
35
|
+
|
|
36
|
+
- **Partition pruning** eliminates files at the directory level based on partition column values
|
|
37
|
+
- **Data skipping** eliminates files at the file level based on per-column min/max statistics
|
|
38
|
+
|
|
39
|
+
Both happen silently during scan planning, below the query. If partitioning is wrong or stats are missing, you won't know until performance degrades.
|
|
40
|
+
|
|
41
|
+
## What this tool does
|
|
42
|
+
|
|
43
|
+
`delta-explain` uses [delta-kernel-rs](https://github.com/delta-io/delta-kernel-rs) to read Delta metadata directly (no Spark, no DuckDB, no query execution engine) and shows, step by step, how a predicate narrows the set of candidate files.
|
|
44
|
+
|
|
45
|
+
```
|
|
46
|
+
$ delta-explain ./my-table -w "age > 40 AND country = 'DE'"
|
|
47
|
+
|
|
48
|
+
Delta table: ./my-table
|
|
49
|
+
Version: 5
|
|
50
|
+
Predicate: age > 40 AND country = 'DE'
|
|
51
|
+
|
|
52
|
+
Predicate Analysis:
|
|
53
|
+
partition-safe: country = 'DE'
|
|
54
|
+
stats-safe: age > 40
|
|
55
|
+
unsplittable: -
|
|
56
|
+
confidence: conservative
|
|
57
|
+
|
|
58
|
+
Files in snapshot: 6
|
|
59
|
+
|
|
60
|
+
Phase 1: Partition pruning [exact]
|
|
61
|
+
predicate: country = 'DE'
|
|
62
|
+
files remaining: 2 (-4, 67% pruned)
|
|
63
|
+
|
|
64
|
+
Phase 2: Data skipping (min/max statistics) [conservative]
|
|
65
|
+
predicate: age > 40
|
|
66
|
+
files remaining: 1 (-1, 50% pruned)
|
|
67
|
+
|
|
68
|
+
Total reduction: 6 -> 1 files (83% pruned)
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
The **Predicate Analysis** block shows how the predicate was split across the two pruning phases, and `confidence` labels how precisely the elimination can be explained (`exact` / `conservative` / `incomplete`). The precise definitions, the degradation rules, and what each label guarantees are in [docs/semantics.md](docs/semantics.md).
|
|
72
|
+
|
|
73
|
+
With `--verbose`, you see exactly *which* files are kept or dropped and *why*:
|
|
74
|
+
|
|
75
|
+
```
|
|
76
|
+
Phase 1: Partition pruning [exact]
|
|
77
|
+
predicate: country = 'DE'
|
|
78
|
+
files remaining: 2 (-4, 67% pruned)
|
|
79
|
+
|
|
80
|
+
[DROPPED] part-00000-48368dae.parquet (1.1 KB 3 records) partition(country=IT) stats(age: 41..65)
|
|
81
|
+
[DROPPED] part-00000-fcf95aac.parquet (1.1 KB 5 records) partition(country=IT) stats(age: 22..38)
|
|
82
|
+
[DROPPED] part-00000-eee5a3ec.parquet (1.1 KB 3 records) partition(country=US) stats(age: 31..55)
|
|
83
|
+
[DROPPED] part-00000-de2ffaef.parquet (1.1 KB 4 records) partition(country=US) stats(age: 18..29)
|
|
84
|
+
[KEPT ] part-00000-a35083c1.parquet (1.1 KB 4 records) partition(country=DE) stats(age: 40..60)
|
|
85
|
+
[KEPT ] part-00000-c34f1417.parquet (1.1 KB 5 records) partition(country=DE) stats(age: 20..35)
|
|
86
|
+
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
(Use `--limit` to cap the listing on large tables; in JSON mode `--verbose` emits the machine-readable `files[]` array instead.) Files without a `stats` payload appear as `[no stats]`; statistics come from the kernel's log replay, checkpoint Parquet included, so `[no stats]` means the writer really recorded none.
|
|
90
|
+
|
|
91
|
+
## Install
|
|
92
|
+
|
|
93
|
+
### Homebrew (macOS, Linux)
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
brew tap cdelmonte-zg/tap
|
|
97
|
+
brew install delta-explain
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
### Scoop (Windows)
|
|
101
|
+
|
|
102
|
+
```powershell
|
|
103
|
+
scoop bucket add cdelmonte-zg https://github.com/cdelmonte-zg/scoop-bucket
|
|
104
|
+
scoop install delta-explain
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
### Debian / Ubuntu (`.deb`)
|
|
108
|
+
|
|
109
|
+
Download the `.deb` for your architecture from the [latest release](https://github.com/cdelmonte-zg/delta-explain/releases/latest) and install with `dpkg`:
|
|
110
|
+
|
|
111
|
+
```bash
|
|
112
|
+
wget https://github.com/cdelmonte-zg/delta-explain/releases/download/v0.2.3/delta-explain_0.2.3-1_amd64.deb
|
|
113
|
+
sudo dpkg -i delta-explain_0.2.3-1_amd64.deb
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
Available for `amd64` and `arm64`. Uninstall with `sudo apt remove delta-explain`.
|
|
117
|
+
|
|
118
|
+
### Pre-built binary (any OS, no package manager)
|
|
119
|
+
|
|
120
|
+
Download the archive for your platform from the [latest release](https://github.com/cdelmonte-zg/delta-explain/releases/latest), extract, and place on `$PATH`:
|
|
121
|
+
|
|
122
|
+
| Platform | Archive |
|
|
123
|
+
|---|---|
|
|
124
|
+
| Linux x86_64 (glibc) | `delta-explain-x86_64-unknown-linux-gnu.tar.gz` |
|
|
125
|
+
| Linux x86_64 (static, musl) | `delta-explain-x86_64-unknown-linux-musl.tar.gz` |
|
|
126
|
+
| Linux ARM64 | `delta-explain-aarch64-unknown-linux-gnu.tar.gz` |
|
|
127
|
+
| macOS Intel | `delta-explain-x86_64-apple-darwin.tar.gz` |
|
|
128
|
+
| macOS Apple Silicon | `delta-explain-aarch64-apple-darwin.tar.gz` |
|
|
129
|
+
| Windows x86_64 | `delta-explain-x86_64-pc-windows-msvc.zip` |
|
|
130
|
+
|
|
131
|
+
Each archive ships with a `.sha256` checksum. The musl build is statically linked and runs on any Linux distribution without glibc dependencies.
|
|
132
|
+
|
|
133
|
+
### From PyPI (Python, no Rust needed)
|
|
134
|
+
|
|
135
|
+
```bash
|
|
136
|
+
pip install delta-explain
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
The wheel ships the compiled binary (the `delta-explain` command works from the same environment) plus a thin Python API around the JSON contract:
|
|
140
|
+
|
|
141
|
+
```python
|
|
142
|
+
from delta_explain import explain
|
|
143
|
+
|
|
144
|
+
report = explain("s3://warehouse/events",
|
|
145
|
+
where="country = 'DE' AND age > 40",
|
|
146
|
+
min_pruning=80, env_creds=True)
|
|
147
|
+
report.passed # gate outcome; False means the CLI would exit 1
|
|
148
|
+
report.total_pruning_pct
|
|
149
|
+
report["analysis"]["confidence"]
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
Gate failures come back as a report with `passed == False`; runtime errors raise `DeltaExplainError` with the CLI's message — the same exit-code contract as the command line, in Python types.
|
|
153
|
+
|
|
154
|
+
### From crates.io (requires Rust 1.88+)
|
|
155
|
+
|
|
156
|
+
```bash
|
|
157
|
+
cargo install delta-explain
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
### From Git (latest development version)
|
|
161
|
+
|
|
162
|
+
```bash
|
|
163
|
+
cargo install --git https://github.com/cdelmonte-zg/delta-explain
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
### Docker (amd64 + arm64)
|
|
167
|
+
|
|
168
|
+
```bash
|
|
169
|
+
docker pull ghcr.io/cdelmonte-zg/delta-explain
|
|
170
|
+
docker run --rm -v /path/to/table:/data ghcr.io/cdelmonte-zg/delta-explain /data -w "col > 10"
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
For pipelines, pin to a release tag (e.g., `:0.2.3`) or to a digest; `:latest` is for local exploration only.
|
|
174
|
+
|
|
175
|
+
## Usage
|
|
176
|
+
|
|
177
|
+
```
|
|
178
|
+
delta-explain <PATH> [OPTIONS]
|
|
179
|
+
|
|
180
|
+
Arguments:
|
|
181
|
+
<PATH> Path to the Delta table (local path, s3://, az://, gs://)
|
|
182
|
+
|
|
183
|
+
Options:
|
|
184
|
+
-w, --where <PREDICATE> Predicate (e.g. "age > 30 AND country = 'DE'")
|
|
185
|
+
-v, --verbose Show per-file details (kept/dropped with reason);
|
|
186
|
+
in JSON, adds the "files" array
|
|
187
|
+
--limit <N> Cap per-file listings at N entries
|
|
188
|
+
--format <FORMAT> Output format: text (default) or json
|
|
189
|
+
--min-pruning <PCT> Fail if total pruning is below this percentage
|
|
190
|
+
--assert-stats Fail if any file is missing statistics
|
|
191
|
+
--at-version <N> Analyze the table at this version (time travel)
|
|
192
|
+
--profile <NAME> Static AWS credentials from ~/.aws/credentials (S3)
|
|
193
|
+
--region <REGION> AWS region (S3 / S3-compatible)
|
|
194
|
+
--option <KEY=VALUE> Object store config (repeatable)
|
|
195
|
+
--env-creds Read cloud credentials from environment variables
|
|
196
|
+
--public Access a public bucket (skip auth)
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
### Local table
|
|
200
|
+
|
|
201
|
+
```bash
|
|
202
|
+
delta-explain ./my-table -w "country = 'DE'"
|
|
203
|
+
delta-explain ./my-table -w "age > 30 AND country = 'IT'" --verbose
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
### Cloud storage
|
|
207
|
+
|
|
208
|
+
**Credentials.** Three ways in, by environment:
|
|
209
|
+
|
|
210
|
+
- **On cloud infrastructure** (EC2/ECS, EKS, AKS, GKE): with no explicit credentials the storage layer falls back to the provider's ambient chain (instance profile, Managed Identity, Workload Identity) on its own; add `--env-creds` when the credentials live in environment variables instead (`AWS_ACCESS_KEY_ID`/`AWS_SECRET_ACCESS_KEY`/`AWS_SESSION_TOKEN`/`AWS_REGION`, `AZURE_STORAGE_ACCOUNT_NAME`/`AZURE_STORAGE_ACCOUNT_KEY`, `GOOGLE_APPLICATION_CREDENTIALS`).
|
|
211
|
+
- **On a developer laptop** (AWS): `--profile <name>` resolves static keys, session token, and region from `~/.aws/credentials` and `~/.aws/config`, the same files the AWS CLI reads (including the `AWS_SHARED_CREDENTIALS_FILE` / `AWS_CONFIG_FILE` overrides). Profiles that rely on SSO, `credential_process`, or role assumption are not resolved; export them first and use `--env-creds`:
|
|
212
|
+
```bash
|
|
213
|
+
eval $(aws configure export-credentials --profile corp --format env)
|
|
214
|
+
delta-explain --env-creds s3://bucket/table -w "..."
|
|
215
|
+
```
|
|
216
|
+
- **Static keys** (MinIO, local development): pass them via `--option`, expanding from environment variables to keep secrets out of argv. Valid `--option` keys are passed through to the [`object_store`](https://docs.rs/object_store/) builders; see upstream docs for the per-backend list.
|
|
217
|
+
|
|
218
|
+
```bash
|
|
219
|
+
# S3 with credentials from the environment
|
|
220
|
+
delta-explain --env-creds s3://bucket/path/to/table -w "date = '2024-01-01'"
|
|
221
|
+
|
|
222
|
+
# S3 public bucket
|
|
223
|
+
delta-explain --region us-east-1 --public s3://my-public-bucket/table -w "id > 100"
|
|
224
|
+
|
|
225
|
+
# Azure
|
|
226
|
+
delta-explain --env-creds az://container/table -w "region = 'eu-west-1'"
|
|
227
|
+
|
|
228
|
+
# GCS (Workload Identity on GKE, or service account JSON via env)
|
|
229
|
+
delta-explain --env-creds gs://bucket/table -w "date = '2024-01-01'"
|
|
230
|
+
|
|
231
|
+
# S3-compatible (MinIO, Akamai, etc.); endpoint via --option, key/secret expanded from env
|
|
232
|
+
delta-explain \
|
|
233
|
+
--option AWS_ENDPOINT=https://minio.local:9000 \
|
|
234
|
+
--option AWS_ACCESS_KEY_ID="$AWS_ACCESS_KEY_ID" \
|
|
235
|
+
--option AWS_SECRET_ACCESS_KEY="$AWS_SECRET_ACCESS_KEY" \
|
|
236
|
+
s3://bucket/table -w "col > 5"
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
## CI/CD mode
|
|
240
|
+
|
|
241
|
+
`delta-explain` doubles as an assertion tool in pipelines. After your ETL writes a Delta table, verify that the pruning layout is healthy.
|
|
242
|
+
|
|
243
|
+
`--min-pruning`, `--assert-stats`, `--format json`, and `--verbose` are independent. Without `--verbose` the JSON document is summary-only; with it, a per-file `files` array is included (cap it with `--limit` on large tables).
|
|
244
|
+
|
|
245
|
+
### GitHub Action
|
|
246
|
+
|
|
247
|
+
The repo doubles as a composite action, so the gate is one step. Pin the tag: the action downloads a released binary, so the ref you pin is the behavior you get.
|
|
248
|
+
|
|
249
|
+
```yaml
|
|
250
|
+
- uses: cdelmonte-zg/delta-explain@v0.4.0
|
|
251
|
+
with:
|
|
252
|
+
table: s3://warehouse/events
|
|
253
|
+
where: "country = 'DE' AND age > 40"
|
|
254
|
+
min-pruning: "60"
|
|
255
|
+
assert-stats: "true"
|
|
256
|
+
env-creds: "true"
|
|
257
|
+
```
|
|
258
|
+
|
|
259
|
+
Inputs mirror the CLI flags (`table`, `where`, `min-pruning`, `assert-stats`, `at-version`, `env-creds`, `profile`, plus `options` as one `KEY=VALUE` per line, and `version` to pin a release; default `latest`). The step fails when a gate fails, and exposes `pruning-pct`, `final-files`, and `result` as outputs for later steps:
|
|
260
|
+
|
|
261
|
+
```yaml
|
|
262
|
+
- name: Comment the pruning percentage
|
|
263
|
+
run: echo "Pruning ${{ steps.gate.outputs.pruning-pct }}%"
|
|
264
|
+
```
|
|
265
|
+
|
|
266
|
+
### Assert minimum pruning
|
|
267
|
+
|
|
268
|
+
Fail the pipeline if a predicate doesn't eliminate enough files:
|
|
269
|
+
|
|
270
|
+
```bash
|
|
271
|
+
delta-explain s3://warehouse/events -w "date = '2024-01-15'" --min-pruning 90
|
|
272
|
+
```
|
|
273
|
+
|
|
274
|
+
Exit code 1 if total pruning is below 90%.
|
|
275
|
+
|
|
276
|
+
The threshold is per-invocation, applied to the current predicate against the current snapshot. Calibrate it against a baseline pruning percentage in dev (set the gate a few points below it); a flat threshold across heterogeneous partitions will misfire. Note also that 100% pruning can signal a broken or unexpectedly empty predicate, so pair `--min-pruning` with a sanity check on `final_files > 0` when the workload is expected to read data.
|
|
277
|
+
|
|
278
|
+
### Assert statistics coverage
|
|
279
|
+
|
|
280
|
+
Fail if any file in the table is missing min/max statistics:
|
|
281
|
+
|
|
282
|
+
```bash
|
|
283
|
+
delta-explain s3://warehouse/events --assert-stats
|
|
284
|
+
```
|
|
285
|
+
|
|
286
|
+
Statistics are resolved through the kernel's log replay, checkpoint Parquet included, so a file is flagged only when its `add` action genuinely carries no statistics. Long-lived tables whose older commits have been consolidated into a checkpoint do not produce false positives.
|
|
287
|
+
|
|
288
|
+
### Predicate parity
|
|
289
|
+
|
|
290
|
+
The pruning percentage `delta-explain` reports reflects the predicate you pass to `-w`. If the runtime query wraps a column in `LOWER`, `CAST`, or a UDF, the engine may prune less than the gate suggests. Use a CI predicate that is semantically equivalent to the runtime predicate and explicitly track that equivalence: a gate on `country = 'DE'` does not automatically validate a production query using `LOWER(country) = 'de'`.
|
|
291
|
+
|
|
292
|
+
### JSON output for downstream processing
|
|
293
|
+
|
|
294
|
+
```bash
|
|
295
|
+
delta-explain ./my-table -w "country = 'DE'" --format json | jq '.total_pruning_pct'
|
|
296
|
+
```
|
|
297
|
+
|
|
298
|
+
The JSON output is versioned independently from the CLI binary (`schema_version: "0.2.0"`). The schema is pre-1.0: additive changes bump the minor version, breaking changes bump the major version. Consumers should branch on stable field names (e.g. assertion names), tolerate unknown fields, and check `schema_version`.
|
|
299
|
+
|
|
300
|
+
The contract is formal: [`schemas/report-v0.2.schema.json`](schemas/report-v0.2.schema.json) is a JSON Schema that the integration suite validates every emitted document against, and [`docs/json-schema.md`](docs/json-schema.md) explains each field, the stable note codes, and the meaning of `confidence`, `kept`, and `pruned_by`.
|
|
301
|
+
|
|
302
|
+
Exit code is `0` when all assertions pass and `1` if any fails; the JSON `result` field carries the per-assertion outcome.
|
|
303
|
+
|
|
304
|
+
See [CHANGELOG.md](CHANGELOG.md) for the full schema notes.
|
|
305
|
+
|
|
306
|
+
### Docker in a pipeline
|
|
307
|
+
|
|
308
|
+
```yaml
|
|
309
|
+
# GitHub Actions example
|
|
310
|
+
- name: Verify pruning after ETL
|
|
311
|
+
run: |
|
|
312
|
+
docker run --rm \
|
|
313
|
+
-e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY -e AWS_DEFAULT_REGION \
|
|
314
|
+
ghcr.io/cdelmonte-zg/delta-explain:0.2.3 \
|
|
315
|
+
--env-creds s3://warehouse/events \
|
|
316
|
+
-w "date = '2024-01-15'" \
|
|
317
|
+
--min-pruning 90 --assert-stats --format json
|
|
318
|
+
```
|
|
319
|
+
|
|
320
|
+
## How it works
|
|
321
|
+
|
|
322
|
+
`delta-explain` replays Delta metadata through [delta-kernel-rs](https://github.com/delta-io/delta-kernel-rs) and runs separate metadata scans (no predicate, partition-safe fragment, full predicate) to isolate each pruning phase's contribution. No query engine is involved, no data files are read: only metadata. The full pipeline, the soundness guarantee, and the attribution rules are in [docs/semantics.md](docs/semantics.md).
|
|
323
|
+
|
|
324
|
+
## Predicate syntax
|
|
325
|
+
|
|
326
|
+
`delta-explain` accepts standard SQL WHERE-clause syntax, parsed via [sqlparser-rs](https://github.com/sqlparser-rs/sqlparser-rs).
|
|
327
|
+
|
|
328
|
+
```sql
|
|
329
|
+
-- Comparisons
|
|
330
|
+
age > 30
|
|
331
|
+
country = 'DE'
|
|
332
|
+
score >= 90.5
|
|
333
|
+
|
|
334
|
+
-- Logical operators
|
|
335
|
+
age > 30 AND country = 'DE'
|
|
336
|
+
country = 'DE' OR country = 'IT'
|
|
337
|
+
NOT country = 'US'
|
|
338
|
+
|
|
339
|
+
-- IN lists
|
|
340
|
+
country IN ('DE', 'IT', 'US')
|
|
341
|
+
country NOT IN ('US')
|
|
342
|
+
|
|
343
|
+
-- BETWEEN
|
|
344
|
+
age BETWEEN 20 AND 40
|
|
345
|
+
|
|
346
|
+
-- NULL checks
|
|
347
|
+
name IS NOT NULL
|
|
348
|
+
age IS NULL
|
|
349
|
+
|
|
350
|
+
-- Parentheses
|
|
351
|
+
(country = 'DE' OR country = 'IT') AND age > 30
|
|
352
|
+
|
|
353
|
+
-- Nested columns
|
|
354
|
+
payload.age > 30
|
|
355
|
+
```
|
|
356
|
+
|
|
357
|
+
Also supported: `IS [NOT] DISTINCT FROM`, `DATE '...'` / `TIMESTAMP '...'` literal forms, and schema-driven coercion (a quoted `'2026-07-01'` against a `DATE` column just works, including `DECIMAL` and narrow integers). Subqueries, functions, and `LIKE` are outside the pruning language: they warn and keep files instead of failing (see [Current limitations](#current-limitations)).
|
|
358
|
+
|
|
359
|
+
## Performance notes
|
|
360
|
+
|
|
361
|
+
delta-explain reads only the Delta log, never the parquet data files, so its cost scales with the number of `add` actions, not with data volume. Measured at 200k files on Linux, local disk, in the three log shapes that matter (generate them yourself with `cargo run --release --example gen_scale_log`):
|
|
362
|
+
|
|
363
|
+
| Log shape (200k files) | Baseline | With predicate | Peak memory |
|
|
364
|
+
|---|---|---|---|
|
|
365
|
+
| single JSON commit | ~1.0 s | ~1.3 s | ~280 MB |
|
|
366
|
+
| 2000 JSON commits | ~1.4 s | ~2.2 s | ~320 MB |
|
|
367
|
+
| 2000 commits + parquet checkpoint | ~0.8 s | ~1.0 s | ~240 MB |
|
|
368
|
+
|
|
369
|
+
The most production-like shape (checkpointed) is also the fastest: the kernel reads one parquet checkpoint instead of replaying thousands of JSON commits. Scaling is linear at roughly 1.5 KB of resident memory per file, which extrapolates to ~1.5 GB at one million files; that is the current practical ceiling and it is a known limitation, not a hidden one. Predicate complexity is immaterial at this level: an `IN` list with 500 items over 200k files adds ~0.4 s.
|
|
370
|
+
|
|
371
|
+
Output is the dimension to manage on large tables: the compact JSON stays summary-only at any size, and per-file detail (`--verbose`, in both formats) should be capped with `--limit`.
|
|
372
|
+
|
|
373
|
+
## Current limitations
|
|
374
|
+
|
|
375
|
+
- **First N indexed leaf columns only.** Delta collects min/max statistics only for the first `delta.dataSkippingNumIndexedCols` leaf fields (default 32, configurable per-table; nested struct children count separately).
|
|
376
|
+
|
|
377
|
+
Predicates on columns past this index are still classified as `stats-safe` but contribute no pruning, because the column's min/max never appears in the log. (`stats.mode` reflects per-table coverage of the indexed columns, not per-predicate reachability, so it can read `exact` even when the predicate column is unreachable by stats.)
|
|
378
|
+
|
|
379
|
+
- **No query planner simulation.** This tool shows metadata-level file elimination only. It does not predict query execution time or replicate engine-specific optimizer behavior.
|
|
380
|
+
|
|
381
|
+
- **OR-mixed predicates.** Predicate classification operates on top-level AND conjuncts, after normalization: negations push down to the leaves (De Morgan) and conjuncts common to every OR branch factor out of the OR, so `NOT (country = 'DE' OR age > 30)` splits into two attributable phases and `(country = 'DE' AND x) OR (country = 'DE' AND y)` exposes `country = 'DE'` as partition-safe. What remains is the irreducibly mixed OR (`country = 'DE' OR age > 30`): it is flagged as `unsplittable` per the rule above, never silently downgraded.
|
|
382
|
+
|
|
383
|
+
- **Computed expressions keep all files.** Function calls, arithmetic, `LIKE`, subqueries, and column-to-column comparisons are outside the pruning language; such fragments are reported with an `UNSUPPORTED_EXPRESSION` warning and conservatively keep every file, while sibling AND conjuncts still prune. Most of these are file-level unskippable for any engine; the exception is prefix `LIKE 'abc%'`, which engines like delta-spark do skip on string min/max and delta-explain does not yet.
|
|
384
|
+
|
|
385
|
+
- **`IN` pruning strength varies by engine.** delta-explain expands `IN` lists into OR-of-equalities, the strongest sound form, with no size cap. Real engines differ: DataFusion-based engines (delta-rs) do the same expansion but stop skipping past 20 list items, and delta-spark evaluates an imprecise range test over the whole list (`min(values) <= col <= max(values)`), which keeps more files on sparse lists. On `IN`-heavy predicates a specific engine may therefore prune less than this report shows; the report reflects what the metadata makes possible, and it is always sound.
|
|
386
|
+
|
|
387
|
+
- **Protocol features are declared, not compensated.** Deletion vectors, column mapping, and liquid clustering are detected and reported in `table_features` with explicit warnings, but the numbers are not adjusted: record counts still include soft-deleted rows on files with deletion vectors, verbose statistics may show physical column names under column mapping, and clustering columns are informational. On a fully checkpointed log (no JSON commits) liquid clustering goes undetected, because delta-kernel exposes no public accessor for system metadata domains.
|
|
388
|
+
|
|
389
|
+
See [VISION.md](VISION.md) for planned improvements.
|
|
390
|
+
|
|
391
|
+
## Development
|
|
392
|
+
|
|
393
|
+
To build and test from a fresh clone:
|
|
394
|
+
|
|
395
|
+
```bash
|
|
396
|
+
git clone https://github.com/cdelmonte-zg/delta-explain
|
|
397
|
+
cd delta-explain
|
|
398
|
+
cargo build
|
|
399
|
+
cargo test
|
|
400
|
+
```
|
|
401
|
+
|
|
402
|
+
The integration tests under `tests/` rely on pre-built Delta tables checked into the repo under `fixtures/`. They are real Delta tables, not synthetic blobs, so the tests exercise the kernel's actual scan planner.
|
|
403
|
+
|
|
404
|
+
### Regenerating the fixtures
|
|
405
|
+
|
|
406
|
+
The fixtures only need to be regenerated when you change their schema or the data they contain; for ordinary development you can ignore this step entirely.
|
|
407
|
+
|
|
408
|
+
The generator is a small Python script (`fixtures/create_test_table.py`) that uses `pyarrow` and `deltalake` to write the tables. Set up a virtual environment and install the pinned dependencies:
|
|
409
|
+
|
|
410
|
+
```bash
|
|
411
|
+
python -m venv .venv
|
|
412
|
+
source .venv/bin/activate # Windows: .venv\Scripts\activate
|
|
413
|
+
pip install -r fixtures/requirements.txt
|
|
414
|
+
```
|
|
415
|
+
|
|
416
|
+
Then run the generator:
|
|
417
|
+
|
|
418
|
+
```bash
|
|
419
|
+
python fixtures/create_test_table.py
|
|
420
|
+
```
|
|
421
|
+
|
|
422
|
+
The script skips any fixture directory that already exists; delete the directory you want to regenerate first, then re-run.
|
|
423
|
+
|
|
424
|
+
## Deep dive
|
|
425
|
+
|
|
426
|
+
For a detailed walkthrough of the architecture, design decisions, and the reasoning behind the two-phase model, see the companion article: [delta-explain: Making Delta Lake Pruning Visible](https://cdelmonte.dev/deep-dives/delta-explain-making-delta-pruning-visible/).
|
|
427
|
+
|
|
428
|
+
## License
|
|
429
|
+
|
|
430
|
+
MIT
|
|
431
|
+
|
|
432
|
+
## Author
|
|
433
|
+
|
|
434
|
+
[Christian Del Monte](https://github.com/cdelmonte-zg)
|
|
435
|
+
|
|
436
|
+
`delta-explain` is built on [delta-kernel-rs](https://github.com/delta-io/delta-kernel-rs) and focuses on making Delta-level file elimination visible.
|
|
437
|
+
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
delta_explain/__init__.py,sha256=GwTe0twykW9ZBJvi3TGT3v9wEWjyAvCTnMVdSt6DdCo,5836
|
|
2
|
+
delta_explain-0.4.0.data/scripts/delta-explain.exe,sha256=g_9-WRWakge_pKJRLEu4jOdMeR0tPkad7wrdSQwH29U,38895616
|
|
3
|
+
delta_explain-0.4.0.dist-info/METADATA,sha256=bRr2jRsOCB_VMDdwtExpDZTvMjGZ7juROvkNahJNrYA,22318
|
|
4
|
+
delta_explain-0.4.0.dist-info/WHEEL,sha256=2zDlIYIdD4m4N3p5DVEG3iJhGLdhsBQgdH-FqVkAur8,94
|
|
5
|
+
delta_explain-0.4.0.dist-info/licenses/LICENSE,sha256=vvhvHBPooei2DnPz4OCTH7tJIt6TDD7ztTK_f486w2A,1097
|
|
6
|
+
delta_explain-0.4.0.dist-info/sboms/delta-explain.cyclonedx.json,sha256=jtVYdnjiWKOI7f1zurVOWaTEwNwMRWC_Xc8GL3vlYqU,338897
|
|
7
|
+
delta_explain-0.4.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Christian Del Monte
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|