delta-explain 0.4.0__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,177 @@
1
+ """Thin Python wrapper around the delta-explain CLI.
2
+
3
+ The wheel ships the compiled binary; this module invokes it and returns the
4
+ schema-versioned JSON report as a `Report`. One contract: everything the
5
+ module exposes is documented in docs/json-schema.md and guaranteed by
6
+ docs/semantics.md in the repository.
7
+
8
+ from delta_explain import explain
9
+
10
+ report = explain("s3://warehouse/events",
11
+ where="country = 'DE' AND age > 40",
12
+ min_pruning=80, env_creds=True)
13
+ report.passed # False also makes the CLI exit 1 in CI
14
+ report.total_pruning_pct
15
+ report["analysis"]["confidence"]
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import json
21
+ import shutil
22
+ import subprocess
23
+ import sysconfig
24
+ from pathlib import Path
25
+ from typing import Any, Mapping, Optional, Sequence, Union
26
+
27
+ __all__ = ["explain", "Report", "DeltaExplainError", "binary_path"]
28
+
29
+
30
class DeltaExplainError(RuntimeError):
    """Raised when a delta-explain run fails at runtime.

    Typical causes are an unreadable table, a bad predicate, or a storage
    error.

    A failed gate (--min-pruning / --assert-stats) is deliberately NOT
    signalled through this exception. It is returned as a Report whose
    `passed` is False, which mirrors the CLI's exit-code contract (report
    on stdout, exit 1).
    """
37
+
38
+
39
class Report(Mapping[str, Any]):
    """Read-only mapping over the JSON report, plus shortcut properties.

    The wrapped dictionary is stored as given (no copy) and is reachable
    through `raw`; the mapping protocol and every property read from it.
    """

    def __init__(self, raw: dict):
        self._raw = raw

    # --- Mapping protocol -------------------------------------------------
    # Makes report["analysis"]["confidence"], len(report), iteration and
    # the Mapping mixins (keys(), items(), get(), ...) work.

    def __len__(self) -> int:
        return len(self._raw)

    def __iter__(self):
        return iter(self._raw)

    def __getitem__(self, key: str) -> Any:
        value = self._raw[key]
        return value

    # --- Convenience accessors --------------------------------------------

    @property
    def raw(self) -> dict:
        """The underlying report dictionary itself."""
        return self._raw

    @property
    def schema_version(self) -> str:
        """Value of the report's "schema_version" field."""
        return self._raw["schema_version"]

    @property
    def total_files(self) -> int:
        """Value of the report's "total_files" field."""
        return self._raw["total_files"]

    @property
    def final_files(self) -> int:
        """Value of the report's "final_files" field."""
        return self._raw["final_files"]

    @property
    def total_pruning_pct(self) -> float:
        """Value of the report's "total_pruning_pct" field."""
        return self._raw["total_pruning_pct"]

    @property
    def result(self) -> Optional[str]:
        """Gate outcome: "pass" or "fail", or None when no gate ran."""
        return self._raw["result"]

    @property
    def passed(self) -> bool:
        """False only when a gate failed; a run without gates is a pass."""
        gate_failed = self._raw["result"] == "fail"
        return not gate_failed

    @property
    def files(self) -> Optional[list]:
        """Per-file outcomes, or None unless verbose=True was requested."""
        return self._raw.get("files", None)

    def __repr__(self) -> str:  # pragma: no cover
        head = f"Report(files={self.total_files}->{self.final_files}, "
        tail = f"pruned={self.total_pruning_pct:.0f}%, result={self.result!r})"
        return head + tail
95
+
96
+
97
def binary_path() -> str:
    """Locate the delta-explain executable this module will run.

    Lookup order:

    1. `delta-explain`, then `delta-explain.exe`, inside this environment's
       scripts directory (where the wheel installs the binary).
    2. `delta-explain` on PATH, the fallback for source checkouts and
       custom setups.

    Raises:
        DeltaExplainError: if neither location has the binary.
    """
    scripts_dir = Path(sysconfig.get_path("scripts"))
    for filename in ("delta-explain", "delta-explain.exe"):
        shipped = scripts_dir / filename
        if shipped.is_file():
            return str(shipped)

    fallback = shutil.which("delta-explain")
    if not fallback:
        raise DeltaExplainError(
            "delta-explain binary not found (neither bundled in this "
            "environment's scripts directory nor on PATH)"
        )
    return fallback
114
+
115
+
116
def explain(
    table: str,
    *,
    where: Optional[str] = None,
    min_pruning: Optional[float] = None,
    assert_stats: bool = False,
    at_version: Optional[int] = None,
    verbose: bool = False,
    limit: Optional[int] = None,
    env_creds: bool = False,
    profile: Optional[str] = None,
    region: Optional[str] = None,
    public: bool = False,
    options: Optional[Mapping[str, str]] = None,
    binary: Optional[Union[str, Sequence[str]]] = None,
) -> Report:
    """Run delta-explain against `table` and return the JSON report.

    Keyword arguments mirror the CLI flags one to one. Gate failures return
    a Report with `passed == False`; runtime errors raise DeltaExplainError
    with the CLI's stderr message.

    Args:
        table: Table location, passed as the CLI's positional argument.
        where: Predicate, forwarded as `--where`.
        min_pruning: Threshold forwarded as `--min-pruning`.
        assert_stats: Adds `--assert-stats` when true.
        at_version: Table version forwarded as `--at-version`.
        verbose: Adds `--verbose` when true.
        limit: Cap forwarded as `--limit`.
        env_creds: Adds `--env-creds` when true.
        profile: Profile name forwarded as `--profile`.
        region: Region forwarded as `--region`.
        public: Adds `--public` when true.
        options: Each item becomes one `--option KEY=VALUE` pair.
        binary: Executable to run instead of the one `binary_path()` finds.
            A string is used as the program; any other sequence is used as
            the leading argv elements (program plus its own arguments).

    Returns:
        The parsed report, whenever the CLI wrote anything to stdout.

    Raises:
        DeltaExplainError: if stdout is empty (carries stderr, or the exit
            code when stderr is empty too), or if `binary` is None and no
            binary can be located.
    """
    launcher: Sequence[str]
    if binary is None:
        launcher = [binary_path()]
    elif isinstance(binary, str):
        launcher = [binary]
    else:
        launcher = list(binary)

    argv: list[str] = [*launcher, table, "--format", "json"]
    if where is not None:
        argv += ["--where", where]
    if min_pruning is not None:
        argv += ["--min-pruning", str(min_pruning)]
    if assert_stats:
        argv += ["--assert-stats"]
    if at_version is not None:
        argv += ["--at-version", str(at_version)]
    if verbose:
        argv += ["--verbose"]
    if limit is not None:
        argv += ["--limit", str(limit)]
    if env_creds:
        argv += ["--env-creds"]
    if profile is not None:
        argv += ["--profile", profile]
    if region is not None:
        argv += ["--region", region]
    if public:
        argv += ["--public"]
    for key, value in (options or {}).items():
        argv += ["--option", f"{key}={value}"]

    # Decode the pipes explicitly as UTF-8. JSON text is UTF-8 by
    # specification, whereas text=True alone would decode with the locale's
    # preferred encoding (a legacy code page on many Windows setups) and
    # corrupt or reject non-ASCII table paths, predicates, and file names.
    # errors="replace" keeps a stray undecodable byte in the diagnostics
    # from masking the real failure behind a UnicodeDecodeError.
    proc = subprocess.run(
        argv,
        capture_output=True,
        encoding="utf-8",
        errors="replace",
    )

    # The CLI contract (docs/semantics.md): stdout is a complete report or
    # empty. Exit 1 with a report is a gate failure; exit 1 with empty
    # stdout is a runtime error; exit 2 is a usage error.
    if proc.stdout.strip():
        return Report(json.loads(proc.stdout))
    raise DeltaExplainError(proc.stderr.strip() or f"exit code {proc.returncode}")
@@ -0,0 +1,437 @@
1
+ Metadata-Version: 2.4
2
+ Name: delta-explain
3
+ Version: 0.4.0
4
+ Classifier: Development Status :: 4 - Beta
5
+ Classifier: Intended Audience :: Developers
6
+ Classifier: Programming Language :: Rust
7
+ Classifier: Programming Language :: Python :: 3
8
+ Classifier: Topic :: Database
9
+ License-File: LICENSE
10
+ Summary: Make Delta Lake pruning visible: partition pruning and data skipping diagnostics, as a CLI and a thin Python wrapper around it.
11
+ Keywords: delta-lake,pruning,data-skipping,observability
12
+ Home-Page: https://github.com/cdelmonte-zg/delta-explain
13
+ Author: Christian Del Monte
14
+ Requires-Python: >=3.9
15
+ Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
16
+ Project-URL: Documentation, https://github.com/cdelmonte-zg/delta-explain/tree/main/docs
17
+ Project-URL: Repository, https://github.com/cdelmonte-zg/delta-explain
18
+
19
+ # delta-explain
20
+
21
+ **Make Delta pruning visible.**
22
+
23
+ A CLI that shows how partition pruning and data skipping reduce the set of candidate files in a Delta table.
24
+
25
+ Production-usable as a conservative Delta metadata diagnostic and CI guardrail — not yet a fully production-grade general-purpose Delta observability product. That line is meant literally: what the tool guarantees, and what it deliberately does not, is written down in [docs/semantics.md](docs/semantics.md).
26
+
27
+
28
+ **Documentation**: [three-minute quickstart](examples/quickstart/) - [what delta-explain guarantees (and what it does not)](docs/semantics.md) - [the JSON report, field by field](docs/json-schema.md) - [what it is validated against](docs/validation.md) - [current limitations](#current-limitations)
29
+
30
+ ## The problem
31
+
32
+ You run a query with a filter. The engine reads some files. But how many files were actually eliminated, and *why*?
33
+
34
+ Delta Lake uses two mechanisms to skip files before reading data:
35
+
36
+ - **Partition pruning** eliminates files at the directory level based on partition column values
37
+ - **Data skipping** eliminates files at the file level based on per-column min/max statistics
38
+
39
+ Both happen silently during scan planning, below the query. If partitioning is wrong or stats are missing, you won't know until performance degrades.
40
+
41
+ ## What this tool does
42
+
43
+ `delta-explain` uses [delta-kernel-rs](https://github.com/delta-io/delta-kernel-rs) to read Delta metadata directly (no Spark, no DuckDB, no query execution engine) and shows, step by step, how a predicate narrows the set of candidate files.
44
+
45
+ ```
46
+ $ delta-explain ./my-table -w "age > 40 AND country = 'DE'"
47
+
48
+ Delta table: ./my-table
49
+ Version: 5
50
+ Predicate: age > 40 AND country = 'DE'
51
+
52
+ Predicate Analysis:
53
+ partition-safe: country = 'DE'
54
+ stats-safe: age > 40
55
+ unsplittable: -
56
+ confidence: conservative
57
+
58
+ Files in snapshot: 6
59
+
60
+ Phase 1: Partition pruning [exact]
61
+ predicate: country = 'DE'
62
+ files remaining: 2 (-4, 67% pruned)
63
+
64
+ Phase 2: Data skipping (min/max statistics) [conservative]
65
+ predicate: age > 40
66
+ files remaining: 1 (-1, 50% pruned)
67
+
68
+ Total reduction: 6 -> 1 files (83% pruned)
69
+ ```
70
+
71
+ The **Predicate Analysis** block shows how the predicate was split across the two pruning phases, and `confidence` labels how precisely the elimination can be explained (`exact` / `conservative` / `incomplete`). The precise definitions, the degradation rules, and what each label guarantees are in [docs/semantics.md](docs/semantics.md).
72
+
73
+ With `--verbose`, you see exactly *which* files are kept or dropped and *why*:
74
+
75
+ ```
76
+ Phase 1: Partition pruning [exact]
77
+ predicate: country = 'DE'
78
+ files remaining: 2 (-4, 67% pruned)
79
+
80
+ [DROPPED] part-00000-48368dae.parquet (1.1 KB 3 records) partition(country=IT) stats(age: 41..65)
81
+ [DROPPED] part-00000-fcf95aac.parquet (1.1 KB 5 records) partition(country=IT) stats(age: 22..38)
82
+ [DROPPED] part-00000-eee5a3ec.parquet (1.1 KB 3 records) partition(country=US) stats(age: 31..55)
83
+ [DROPPED] part-00000-de2ffaef.parquet (1.1 KB 4 records) partition(country=US) stats(age: 18..29)
84
+ [KEPT ] part-00000-a35083c1.parquet (1.1 KB 4 records) partition(country=DE) stats(age: 40..60)
85
+ [KEPT ] part-00000-c34f1417.parquet (1.1 KB 5 records) partition(country=DE) stats(age: 20..35)
86
+
87
+ ```
88
+
89
+ (Use `--limit` to cap the listing on large tables; in JSON mode `--verbose` emits the machine-readable `files[]` array instead.) Files without a `stats` payload appear as `[no stats]`; statistics come from the kernel's log replay, checkpoint Parquet included, so `[no stats]` means the writer really recorded none.
90
+
91
+ ## Install
92
+
93
+ ### Homebrew (macOS, Linux)
94
+
95
+ ```bash
96
+ brew tap cdelmonte-zg/tap
97
+ brew install delta-explain
98
+ ```
99
+
100
+ ### Scoop (Windows)
101
+
102
+ ```powershell
103
+ scoop bucket add cdelmonte-zg https://github.com/cdelmonte-zg/scoop-bucket
104
+ scoop install delta-explain
105
+ ```
106
+
107
+ ### Debian / Ubuntu (`.deb`)
108
+
109
+ Download the `.deb` for your architecture from the [latest release](https://github.com/cdelmonte-zg/delta-explain/releases/latest) and install with `dpkg`:
110
+
111
+ ```bash
112
+ wget https://github.com/cdelmonte-zg/delta-explain/releases/download/v0.2.3/delta-explain_0.2.3-1_amd64.deb
113
+ sudo dpkg -i delta-explain_0.2.3-1_amd64.deb
114
+ ```
115
+
116
+ Available for `amd64` and `arm64`. Uninstall with `sudo apt remove delta-explain`.
117
+
118
+ ### Pre-built binary (any OS, no package manager)
119
+
120
+ Download the archive for your platform from the [latest release](https://github.com/cdelmonte-zg/delta-explain/releases/latest), extract, and place on `$PATH`:
121
+
122
+ | Platform | Archive |
123
+ |---|---|
124
+ | Linux x86_64 (glibc) | `delta-explain-x86_64-unknown-linux-gnu.tar.gz` |
125
+ | Linux x86_64 (static, musl) | `delta-explain-x86_64-unknown-linux-musl.tar.gz` |
126
+ | Linux ARM64 | `delta-explain-aarch64-unknown-linux-gnu.tar.gz` |
127
+ | macOS Intel | `delta-explain-x86_64-apple-darwin.tar.gz` |
128
+ | macOS Apple Silicon | `delta-explain-aarch64-apple-darwin.tar.gz` |
129
+ | Windows x86_64 | `delta-explain-x86_64-pc-windows-msvc.zip` |
130
+
131
+ Each archive ships with a `.sha256` checksum. The musl build is statically linked and runs on any Linux distribution without glibc dependencies.
132
+
133
+ ### From PyPI (Python, no Rust needed)
134
+
135
+ ```bash
136
+ pip install delta-explain
137
+ ```
138
+
139
+ The wheel ships the compiled binary (the `delta-explain` command works from the same environment) plus a thin Python API around the JSON contract:
140
+
141
+ ```python
142
+ from delta_explain import explain
143
+
144
+ report = explain("s3://warehouse/events",
145
+ where="country = 'DE' AND age > 40",
146
+ min_pruning=80, env_creds=True)
147
+ report.passed # gate outcome; False means the CLI would exit 1
148
+ report.total_pruning_pct
149
+ report["analysis"]["confidence"]
150
+ ```
151
+
152
+ Gate failures come back as a report with `passed == False`; runtime errors raise `DeltaExplainError` with the CLI's message — the same exit-code contract as the command line, in Python types.
153
+
154
+ ### From crates.io (requires Rust 1.88+)
155
+
156
+ ```bash
157
+ cargo install delta-explain
158
+ ```
159
+
160
+ ### From Git (latest development version)
161
+
162
+ ```bash
163
+ cargo install --git https://github.com/cdelmonte-zg/delta-explain
164
+ ```
165
+
166
+ ### Docker (amd64 + arm64)
167
+
168
+ ```bash
169
+ docker pull ghcr.io/cdelmonte-zg/delta-explain
170
+ docker run --rm -v /path/to/table:/data ghcr.io/cdelmonte-zg/delta-explain /data -w "col > 10"
171
+ ```
172
+
173
+ For pipelines, pin to a release tag (e.g., `:0.2.3`) or to a digest; `:latest` is for local exploration only.
174
+
175
+ ## Usage
176
+
177
+ ```
178
+ delta-explain <PATH> [OPTIONS]
179
+
180
+ Arguments:
181
+ <PATH> Path to the Delta table (local path, s3://, az://, gs://)
182
+
183
+ Options:
184
+ -w, --where <PREDICATE> Predicate (e.g. "age > 30 AND country = 'DE'")
185
+ -v, --verbose Show per-file details (kept/dropped with reason);
186
+ in JSON, adds the "files" array
187
+ --limit <N> Cap per-file listings at N entries
188
+ --format <FORMAT> Output format: text (default) or json
189
+ --min-pruning <PCT> Fail if total pruning is below this percentage
190
+ --assert-stats Fail if any file is missing statistics
191
+ --at-version <N> Analyze the table at this version (time travel)
192
+ --profile <NAME> Static AWS credentials from ~/.aws/credentials (S3)
193
+ --region <REGION> AWS region (S3 / S3-compatible)
194
+ --option <KEY=VALUE> Object store config (repeatable)
195
+ --env-creds Read cloud credentials from environment variables
196
+ --public Access a public bucket (skip auth)
197
+ ```
198
+
199
+ ### Local table
200
+
201
+ ```bash
202
+ delta-explain ./my-table -w "country = 'DE'"
203
+ delta-explain ./my-table -w "age > 30 AND country = 'IT'" --verbose
204
+ ```
205
+
206
+ ### Cloud storage
207
+
208
+ **Credentials.** Three ways in, by environment:
209
+
210
+ - **On cloud infrastructure** (EC2/ECS, EKS, AKS, GKE): with no explicit credentials the storage layer falls back to the provider's ambient chain (instance profile, Managed Identity, Workload Identity) on its own; add `--env-creds` when the credentials live in environment variables instead (`AWS_ACCESS_KEY_ID`/`AWS_SECRET_ACCESS_KEY`/`AWS_SESSION_TOKEN`/`AWS_REGION`, `AZURE_STORAGE_ACCOUNT_NAME`/`AZURE_STORAGE_ACCOUNT_KEY`, `GOOGLE_APPLICATION_CREDENTIALS`).
211
+ - **On a developer laptop** (AWS): `--profile <name>` resolves static keys, session token, and region from `~/.aws/credentials` and `~/.aws/config`, the same files the AWS CLI reads (including the `AWS_SHARED_CREDENTIALS_FILE` / `AWS_CONFIG_FILE` overrides). Profiles that rely on SSO, `credential_process`, or role assumption are not resolved; export them first and use `--env-creds`:
212
+ ```bash
213
+ eval $(aws configure export-credentials --profile corp --format env)
214
+ delta-explain --env-creds s3://bucket/table -w "..."
215
+ ```
216
+ - **Static keys** (MinIO, local development): pass them via `--option`, expanding from environment variables to keep secrets out of argv. Valid `--option` keys are passed through to the [`object_store`](https://docs.rs/object_store/) builders; see upstream docs for the per-backend list.
217
+
218
+ ```bash
219
+ # S3 with credentials from the environment
220
+ delta-explain --env-creds s3://bucket/path/to/table -w "date = '2024-01-01'"
221
+
222
+ # S3 public bucket
223
+ delta-explain --region us-east-1 --public s3://my-public-bucket/table -w "id > 100"
224
+
225
+ # Azure
226
+ delta-explain --env-creds az://container/table -w "region = 'eu-west-1'"
227
+
228
+ # GCS (Workload Identity on GKE, or service account JSON via env)
229
+ delta-explain --env-creds gs://bucket/table -w "date = '2024-01-01'"
230
+
231
+ # S3-compatible (MinIO, Akamai, etc.); endpoint via --option, key/secret expanded from env
232
+ delta-explain \
233
+ --option AWS_ENDPOINT=https://minio.local:9000 \
234
+ --option AWS_ACCESS_KEY_ID="$AWS_ACCESS_KEY_ID" \
235
+ --option AWS_SECRET_ACCESS_KEY="$AWS_SECRET_ACCESS_KEY" \
236
+ s3://bucket/table -w "col > 5"
237
+ ```
238
+
239
+ ## CI/CD mode
240
+
241
+ `delta-explain` doubles as an assertion tool in pipelines. After your ETL writes a Delta table, verify that the pruning layout is healthy.
242
+
243
+ `--min-pruning`, `--assert-stats`, `--format json`, and `--verbose` are independent. Without `--verbose` the JSON document is summary-only; with it, a per-file `files` array is included (cap it with `--limit` on large tables).
244
+
245
+ ### GitHub Action
246
+
247
+ The repo doubles as a composite action, so the gate is one step. Pin the tag: the action downloads a released binary, so the ref you pin is the behavior you get.
248
+
249
+ ```yaml
250
+ - uses: cdelmonte-zg/delta-explain@v0.4.0
251
+ with:
252
+ table: s3://warehouse/events
253
+ where: "country = 'DE' AND age > 40"
254
+ min-pruning: "60"
255
+ assert-stats: "true"
256
+ env-creds: "true"
257
+ ```
258
+
259
+ Inputs mirror the CLI flags (`table`, `where`, `min-pruning`, `assert-stats`, `at-version`, `env-creds`, `profile`), plus `options` (one `KEY=VALUE` per line) and `version` to pin a release (default `latest`). The step fails when a gate fails, and exposes `pruning-pct`, `final-files`, and `result` as outputs for later steps. To read them, give the gate step an `id`; the snippet below assumes `id: gate`:
260
+
261
+ ```yaml
262
+ - name: Comment the pruning percentage
263
+ run: echo "Pruning ${{ steps.gate.outputs.pruning-pct }}%"
264
+ ```
265
+
266
+ ### Assert minimum pruning
267
+
268
+ Fail the pipeline if a predicate doesn't eliminate enough files:
269
+
270
+ ```bash
271
+ delta-explain s3://warehouse/events -w "date = '2024-01-15'" --min-pruning 90
272
+ ```
273
+
274
+ Exit code 1 if total pruning is below 90%.
275
+
276
+ The threshold is per-invocation, applied to the current predicate against the current snapshot. Calibrate it against a baseline pruning percentage in dev (set the gate a few points below it); a flat threshold across heterogeneous partitions will misfire. Note also that 100% pruning can signal a broken or unexpectedly empty predicate, so pair `--min-pruning` with a sanity check on `final_files > 0` when the workload is expected to read data.
277
+
278
+ ### Assert statistics coverage
279
+
280
+ Fail if any file in the table is missing min/max statistics:
281
+
282
+ ```bash
283
+ delta-explain s3://warehouse/events --assert-stats
284
+ ```
285
+
286
+ Statistics are resolved through the kernel's log replay, checkpoint Parquet included, so a file is flagged only when its `add` action genuinely carries no statistics. Long-lived tables whose older commits have been consolidated into a checkpoint do not produce false positives.
287
+
288
+ ### Predicate parity
289
+
290
+ The pruning percentage `delta-explain` reports reflects the predicate you pass to `-w`. If the runtime query wraps a column in `LOWER`, `CAST`, or a UDF, the engine may prune less than the gate suggests. Use a CI predicate that is semantically equivalent to the runtime predicate and explicitly track that equivalence: a gate on `country = 'DE'` does not automatically validate a production query using `LOWER(country) = 'de'`.
291
+
292
+ ### JSON output for downstream processing
293
+
294
+ ```bash
295
+ delta-explain ./my-table -w "country = 'DE'" --format json | jq '.total_pruning_pct'
296
+ ```
297
+
298
+ The JSON output is versioned independently from the CLI binary (`schema_version: "0.2.0"`). The schema is pre-1.0: additive changes bump the minor version, breaking changes bump the major version. Consumers should branch on stable field names (e.g. assertion names), tolerate unknown fields, and check `schema_version`.
299
+
300
+ The contract is formal: [`schemas/report-v0.2.schema.json`](schemas/report-v0.2.schema.json) is a JSON Schema that the integration suite validates every emitted document against, and [`docs/json-schema.md`](docs/json-schema.md) explains each field, the stable note codes, and the meaning of `confidence`, `kept`, and `pruned_by`.
301
+
302
+ Exit code is `0` when all assertions pass and `1` if any fails; the JSON `result` field carries the per-assertion outcome.
303
+
304
+ See [CHANGELOG.md](CHANGELOG.md) for the full schema notes.
305
+
306
+ ### Docker in a pipeline
307
+
308
+ ```yaml
309
+ # GitHub Actions example
310
+ - name: Verify pruning after ETL
311
+ run: |
312
+ docker run --rm \
313
+ -e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY -e AWS_DEFAULT_REGION \
314
+ ghcr.io/cdelmonte-zg/delta-explain:0.2.3 \
315
+ --env-creds s3://warehouse/events \
316
+ -w "date = '2024-01-15'" \
317
+ --min-pruning 90 --assert-stats --format json
318
+ ```
319
+
320
+ ## How it works
321
+
322
+ `delta-explain` replays Delta metadata through [delta-kernel-rs](https://github.com/delta-io/delta-kernel-rs) and runs separate metadata scans (no predicate, partition-safe fragment, full predicate) to isolate each pruning phase's contribution. No query engine is involved, no data files are read: only metadata. The full pipeline, the soundness guarantee, and the attribution rules are in [docs/semantics.md](docs/semantics.md).
323
+
324
+ ## Predicate syntax
325
+
326
+ `delta-explain` accepts standard SQL WHERE-clause syntax, parsed via [sqlparser-rs](https://github.com/sqlparser-rs/sqlparser-rs).
327
+
328
+ ```sql
329
+ -- Comparisons
330
+ age > 30
331
+ country = 'DE'
332
+ score >= 90.5
333
+
334
+ -- Logical operators
335
+ age > 30 AND country = 'DE'
336
+ country = 'DE' OR country = 'IT'
337
+ NOT country = 'US'
338
+
339
+ -- IN lists
340
+ country IN ('DE', 'IT', 'US')
341
+ country NOT IN ('US')
342
+
343
+ -- BETWEEN
344
+ age BETWEEN 20 AND 40
345
+
346
+ -- NULL checks
347
+ name IS NOT NULL
348
+ age IS NULL
349
+
350
+ -- Parentheses
351
+ (country = 'DE' OR country = 'IT') AND age > 30
352
+
353
+ -- Nested columns
354
+ payload.age > 30
355
+ ```
356
+
357
+ Also supported: `IS [NOT] DISTINCT FROM`, `DATE '...'` / `TIMESTAMP '...'` literal forms, and schema-driven coercion (a quoted `'2026-07-01'` against a `DATE` column just works, including `DECIMAL` and narrow integers). Subqueries, functions, and `LIKE` are outside the pruning language: they warn and keep files instead of failing (see [Current limitations](#current-limitations)).
358
+
359
+ ## Performance notes
360
+
361
+ delta-explain reads only the Delta log, never the parquet data files, so its cost scales with the number of `add` actions, not with data volume. Measured at 200k files on Linux, local disk, in the three log shapes that matter (generate them yourself with `cargo run --release --example gen_scale_log`):
362
+
363
+ | Log shape (200k files) | Baseline | With predicate | Peak memory |
364
+ |---|---|---|---|
365
+ | single JSON commit | ~1.0 s | ~1.3 s | ~280 MB |
366
+ | 2000 JSON commits | ~1.4 s | ~2.2 s | ~320 MB |
367
+ | 2000 commits + parquet checkpoint | ~0.8 s | ~1.0 s | ~240 MB |
368
+
369
+ The most production-like shape (checkpointed) is also the fastest: the kernel reads one parquet checkpoint instead of replaying thousands of JSON commits. Scaling is linear at roughly 1.5 KB of resident memory per file, which extrapolates to ~1.5 GB at one million files; that is the current practical ceiling and it is a known limitation, not a hidden one. Predicate complexity is immaterial at this level: an `IN` list with 500 items over 200k files adds ~0.4 s.
370
+
371
+ Output is the dimension to manage on large tables: the compact JSON stays summary-only at any size, and per-file detail (`--verbose`, in both formats) should be capped with `--limit`.
372
+
373
+ ## Current limitations
374
+
375
+ - **First N indexed leaf columns only.** Delta collects min/max statistics only for the first `delta.dataSkippingNumIndexedCols` leaf fields (default 32, configurable per-table; nested struct children count separately).
376
+
377
+ Predicates on columns past this index are still classified as `stats-safe` but contribute no pruning, because the column's min/max never appears in the log. (`stats.mode` reflects per-table coverage of the indexed columns, not per-predicate reachability, so it can read `exact` even when the predicate column is unreachable by stats.)
378
+
379
+ - **No query planner simulation.** This tool shows metadata-level file elimination only. It does not predict query execution time or replicate engine-specific optimizer behavior.
380
+
381
+ - **OR-mixed predicates.** Predicate classification operates on top-level AND conjuncts, after normalization: negations push down to the leaves (De Morgan) and conjuncts common to every OR branch factor out of the OR, so `NOT (country = 'DE' OR age > 30)` splits into two attributable phases and `(country = 'DE' AND x) OR (country = 'DE' AND y)` exposes `country = 'DE'` as partition-safe. What remains is the irreducibly mixed OR (`country = 'DE' OR age > 30`): it is flagged as `unsplittable` per the rule above, never silently downgraded.
382
+
383
+ - **Computed expressions keep all files.** Function calls, arithmetic, `LIKE`, subqueries, and column-to-column comparisons are outside the pruning language; such fragments are reported with an `UNSUPPORTED_EXPRESSION` warning and conservatively keep every file, while sibling AND conjuncts still prune. Most of these are file-level unskippable for any engine; the exception is prefix `LIKE 'abc%'`, which engines like delta-spark do skip on string min/max and delta-explain does not yet.
384
+
385
+ - **`IN` pruning strength varies by engine.** delta-explain expands `IN` lists into OR-of-equalities, the strongest sound form, with no size cap. Real engines differ: DataFusion-based engines (delta-rs) do the same expansion but stop skipping past 20 list items, and delta-spark evaluates an imprecise range test over the whole list (`min(values) <= col <= max(values)`), which keeps more files on sparse lists. On `IN`-heavy predicates a specific engine may therefore prune less than this report shows; the report reflects what the metadata makes possible, and it is always sound.
386
+
387
+ - **Protocol features are declared, not compensated.** Deletion vectors, column mapping, and liquid clustering are detected and reported in `table_features` with explicit warnings, but the numbers are not adjusted: record counts still include soft-deleted rows on files with deletion vectors, verbose statistics may show physical column names under column mapping, and clustering columns are informational. On a fully checkpointed log (no JSON commits) liquid clustering goes undetected, because delta-kernel exposes no public accessor for system metadata domains.
388
+
389
+ See [VISION.md](VISION.md) for planned improvements.
390
+
391
+ ## Development
392
+
393
+ To build and test from a fresh clone:
394
+
395
+ ```bash
396
+ git clone https://github.com/cdelmonte-zg/delta-explain
397
+ cd delta-explain
398
+ cargo build
399
+ cargo test
400
+ ```
401
+
402
+ The integration tests under `tests/` rely on pre-built Delta tables checked into the repo under `fixtures/`. They are real Delta tables, not synthetic blobs, so the tests exercise the kernel's actual scan planner.
403
+
404
+ ### Regenerating the fixtures
405
+
406
+ The fixtures only need to be regenerated when you change their schema or the data they contain; for ordinary development you can ignore this step entirely.
407
+
408
+ The generator is a small Python script (`fixtures/create_test_table.py`) that uses `pyarrow` and `deltalake` to write the tables. Set up a virtual environment and install the pinned dependencies:
409
+
410
+ ```bash
411
+ python -m venv .venv
412
+ source .venv/bin/activate # Windows: .venv\Scripts\activate
413
+ pip install -r fixtures/requirements.txt
414
+ ```
415
+
416
+ Then run the generator:
417
+
418
+ ```bash
419
+ python fixtures/create_test_table.py
420
+ ```
421
+
422
+ The script skips any fixture directory that already exists; delete the directory you want to regenerate first, then re-run.
423
+
424
+ ## Deep dive
425
+
426
+ For a detailed walkthrough of the architecture, design decisions, and the reasoning behind the two-phase model, see the companion article: [delta-explain: Making Delta Lake Pruning Visible](https://cdelmonte.dev/deep-dives/delta-explain-making-delta-pruning-visible/).
427
+
428
+ ## License
429
+
430
+ MIT
431
+
432
+ ## Author
433
+
434
+ [Christian Del Monte](https://github.com/cdelmonte-zg)
435
+
436
+ `delta-explain` is built on [delta-kernel-rs](https://github.com/delta-io/delta-kernel-rs) and focuses on making Delta-level file elimination visible.
437
+
@@ -0,0 +1,7 @@
1
+ delta_explain/__init__.py,sha256=GwTe0twykW9ZBJvi3TGT3v9wEWjyAvCTnMVdSt6DdCo,5836
2
+ delta_explain-0.4.0.data/scripts/delta-explain.exe,sha256=g_9-WRWakge_pKJRLEu4jOdMeR0tPkad7wrdSQwH29U,38895616
3
+ delta_explain-0.4.0.dist-info/METADATA,sha256=bRr2jRsOCB_VMDdwtExpDZTvMjGZ7juROvkNahJNrYA,22318
4
+ delta_explain-0.4.0.dist-info/WHEEL,sha256=2zDlIYIdD4m4N3p5DVEG3iJhGLdhsBQgdH-FqVkAur8,94
5
+ delta_explain-0.4.0.dist-info/licenses/LICENSE,sha256=vvhvHBPooei2DnPz4OCTH7tJIt6TDD7ztTK_f486w2A,1097
6
+ delta_explain-0.4.0.dist-info/sboms/delta-explain.cyclonedx.json,sha256=jtVYdnjiWKOI7f1zurVOWaTEwNwMRWC_Xc8GL3vlYqU,338897
7
+ delta_explain-0.4.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: maturin (1.14.1)
3
+ Root-Is-Purelib: false
4
+ Tag: py3-none-win_amd64
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Christian Del Monte
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.