dumpling-cli 0.3.0__tar.gz → 0.4.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/CHANGELOG.md +20 -0
  2. {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/Cargo.lock +1 -1
  3. {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/Cargo.toml +1 -1
  4. {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/PKG-INFO +66 -11
  5. {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/README.md +65 -10
  6. dumpling_cli-0.4.1/assets/logo.svg +33 -0
  7. {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/docs/src/configuration.md +23 -1
  8. {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/docs/src/getting-started.md +2 -0
  9. {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/docs/src/index.md +1 -1
  10. {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/pyproject.toml +1 -1
  11. {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/src/main.rs +177 -26
  12. {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/src/sql.rs +124 -12
  13. {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/.dumplingconf.example +0 -0
  14. {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/.github/workflows/ci.yml +0 -0
  15. {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/.github/workflows/docs-pr.yml +0 -0
  16. {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/.github/workflows/docs.yml +0 -0
  17. {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/.github/workflows/platform-compat-latest.yml +0 -0
  18. {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/.github/workflows/platform-compat-matrix.yml +0 -0
  19. {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/.github/workflows/policy-lint.yml +0 -0
  20. {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/.github/workflows/publish.yml +0 -0
  21. {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/.github/workflows/release.yml +0 -0
  22. {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/.github/workflows/tests.yml +0 -0
  23. {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/.gitignore +0 -0
  24. {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/AGENTS.md +0 -0
  25. {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/CONTRIBUTING.md +0 -0
  26. {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/MAINTENANCE.md +0 -0
  27. {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/book.toml +0 -0
  28. {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/datetime_out.sql +0 -0
  29. {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/datetime_sample.sql +0 -0
  30. {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/docs/src/SUMMARY.md +0 -0
  31. {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/docs/src/ci-guardrails.md +0 -0
  32. {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/docs/src/releasing.md +0 -0
  33. {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/rust-toolchain.toml +0 -0
  34. {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/scripts/setup-dev.sh +0 -0
  35. {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/src/faker_dispatch.rs +0 -0
  36. {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/src/filter.rs +0 -0
  37. {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/src/lint.rs +0 -0
  38. {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/src/report.rs +0 -0
  39. {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/src/scan.rs +0 -0
  40. {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/src/settings.rs +0 -0
  41. {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/src/transform.rs +0 -0
@@ -7,6 +7,24 @@ and this project follows [Semantic Versioning](https://semver.org/spec/v2.0.0.ht
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [0.4.1] - 2026-05-03
11
+
12
+ ### Fixed
13
+
14
+ - **INSERT row parsing with JSON casts**: Values such as `'{"k":1}'::jsonb` are parsed so the cell’s unescaped payload is valid JSON for JSON path rules and anonymization; trailing casts like `::jsonb` / `::text` are preserved on output.
15
+
16
+ ## [0.4.0] - 2026-05-02
17
+
18
+ ### Added
19
+
20
+ - **`--dump-decode` CLI**: Decode PostgreSQL **custom-format** (`pg_dump -Fc`) or **directory-format** archives by running **`pg_restore -f -`** (plain SQL to stdout, no database), then anonymize—built for workflows such as **`heroku pg:backups:download`**. Requires PostgreSQL client tools (`pg_restore` on `PATH`, or **`--pg-restore-path`**).
21
+ - **`--dump-decode-arg`** (repeatable): Extra arguments forwarded to `pg_restore`.
22
+ - **`--dump-decode-keep-input`**: Keep the archive after a successful run. **By default** the `--input` path is **removed** after success so only anonymized output remains. **`--check`** with **`--dump-decode`** requires **`--dump-decode-keep-input`** (otherwise the dump would be deleted before config iteration).
23
+
24
+ ### Changed
25
+
26
+ - README and mdBook documentation for PostgreSQL archive decoding and Heroku-style examples.
27
+
10
28
  ## [0.3.0] - 2026-05-02
11
29
 
12
30
  ### Added
@@ -43,5 +61,7 @@ and this project follows [Semantic Versioning](https://semver.org/spec/v2.0.0.ht
43
61
  - Configurable output scan severities and per-category thresholds via `[output_scan]`.
44
62
  - JSON report section for output scan findings including category, count, threshold, severity, and sample locations.
45
63
 
64
+ [0.4.1]: https://github.com/ababic/dumpling/compare/v0.4.0...v0.4.1
65
+ [0.4.0]: https://github.com/ababic/dumpling/compare/v0.3.0...v0.4.0
46
66
  [0.3.0]: https://github.com/ababic/dumpling/compare/v0.2.0...v0.3.0
47
67
  [0.2.0]: https://github.com/ababic/dumpling/compare/v0.1.0...v0.2.0
@@ -262,7 +262,7 @@ dependencies = [
262
262
 
263
263
  [[package]]
264
264
  name = "dumpling"
265
- version = "0.3.0"
265
+ version = "0.4.1"
266
266
  dependencies = [
267
267
  "anyhow",
268
268
  "chrono",
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "dumpling"
3
- version = "0.3.0"
3
+ version = "0.4.1"
4
4
  edition = "2021"
5
5
  readme = "README.md"
6
6
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dumpling-cli
3
- Version: 0.3.0
3
+ Version: 0.4.1
4
4
  Classifier: Development Status :: 4 - Beta
5
5
  Classifier: Environment :: Console
6
6
  Classifier: Intended Audience :: Developers
@@ -19,18 +19,48 @@ Keywords: postgres,sqlite,mssql,sql,anonymization,cli,rust
19
19
  Requires-Python: >=3.8
20
20
  Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
21
21
 
22
- # Dumpling
22
+ <p align="center">
23
+ <img src="assets/logo.svg" width="140" height="140" alt="Dumpling logo: a dumpling with steam" />
24
+ </p>
23
25
 
24
- **Dumpling** is a static anonymizer for plain SQL dumps. It supports PostgreSQL (`pg_dump` plain format), SQLite (`.dump`), and SQL Server / MSSQL (SSMS / mssql-scripter output). It lets you safely share, test with, or store database snapshots by replacing sensitive column data according to configurable rules — without ever touching a live database.
26
+ <h1 align="center">Dumpling</h1>
27
+
28
+ <p align="center">
29
+ <strong>Sanitize SQL dumps before they go anywhere.</strong><br />
30
+ Turn huge <code>pg_dump</code> / SQLite / SQL Server exports into shareable, test-friendly snapshots — no DB connection, no secrets left by accident.
31
+ </p>
32
+
33
+ <p align="center">
34
+ <a href="https://pypi.org/project/dumpling-cli/"><img src="https://img.shields.io/pypi/v/dumpling-cli.svg" alt="PyPI version" /></a>
35
+ <a href="https://pypi.org/project/dumpling-cli/"><img src="https://img.shields.io/pypi/pyversions/dumpling-cli.svg" alt="Python versions" /></a>
36
+ <a href="https://pypi.org/project/dumpling-cli/"><img src="https://img.shields.io/pypi/l/dumpling-cli.svg" alt="PyPI license" /></a>
37
+ <a href="https://github.com/ababic/dumpling/actions/workflows/tests.yml"><img src="https://github.com/ababic/dumpling/actions/workflows/tests.yml/badge.svg" alt="Tests" /></a>
38
+ <a href="https://github.com/ababic/dumpling/actions/workflows/ci.yml"><img src="https://github.com/ababic/dumpling/actions/workflows/ci.yml/badge.svg" alt="Lint" /></a>
39
+ <img src="https://img.shields.io/badge/rust-stable-orange?logo=rust" alt="Rust stable" />
40
+ </p>
41
+
42
+ <p align="center">
43
+ <a href="https://ababic.github.io/dumpling/"><strong>Documentation</strong></a>
44
+ &nbsp;·&nbsp;
45
+ <a href="https://github.com/ababic/dumpling"><strong>GitHub</strong></a>
46
+ </p>
47
+
48
+ <p align="center">
49
+ <sub><em>Disclaimer: This project is entirely vibe-coded, but with strong human guidance, review, and attention to quality and safety.</em></sub>
50
+ </p>
51
+
52
+ ---
53
+
54
+ **Dumpling** reads plain-text SQL dumps (PostgreSQL `pg_dump`, SQLite `.dump`, SQL Server / MSSQL scripts) and rewrites sensitive columns using rules you define in TOML. Everything runs offline on files — ideal for CI, staging share-outs, and compliance-minded workflows.
25
55
 
26
56
  ## Why Dumpling?
27
57
 
28
- - **No live database required.** Works entirely on dump files; nothing connects to your database.
29
- - **Streaming and memory-efficient.** Processes dumps line by line, so even multi-gigabyte files stay manageable.
30
- - **Fail-safe by default.** If no configuration is found, Dumpling exits non-zero and tells you exactly where it looked. Silence is never mistaken for success.
31
- - **Deterministic anonymization.** Domain mappings ensure the same source value always produces the same pseudonym, keeping foreign-key relationships intact across tables.
32
- - **CI/CD ready.** `--check` mode, strict-coverage enforcement, JSON reports, and residual-PII scan gates plug cleanly into any pipeline.
33
- - **Flexible configuration.** Rules live in a `.dumplingconf` file or directly in `pyproject.toml` — no extra tooling needed.
58
+ - **Offline by design** — works on dump files only; nothing connects to your database.
59
+ - **Streams giant files** — line-by-line processing keeps multi‑GB dumps reasonable on modest hardware.
60
+ - **Fails loud, not silent** — missing config exits nonzero and lists where Dumpling looked; use `--allow-noop` only when you mean it.
61
+ - **Stable pseudonyms** — optional domain mappings keep the same source value as the same fake value across tables (foreign keys stay consistent).
62
+ - **Pipeline-ready** — `--check`, strict coverage, JSON reports, and residual PII scans fit pre-merge gates and release automation.
63
+ - **Configure once** — `.dumplingconf` or `[tool.dumpling]` in `pyproject.toml`; install via **Rust** (`cargo`) or **`pip install dumpling-cli`**.
34
64
 
35
65
  ---
36
66
 
@@ -289,7 +319,16 @@ Produced by `pg_dump --format=plain`. Handles:
289
319
  - `"double-quoted"` identifiers
290
320
  - `''`-escaped string literals
291
321
 
292
- Binary, custom, and directory formats from `pg_dump` are not supported — use `--format=plain` when running `pg_dump`.
322
+ Binary, custom, and directory formats from `pg_dump` are not parsed directly — Dumpling’s SQL pipeline expects plain text. Use either:
323
+
324
+ - **`pg_dump --format=plain`** when you control capture, or
325
+ - **`dumpling --dump-decode`** with `--input` set to a **custom-format** (`.dump`) or **directory-format** folder: Dumpling runs `pg_restore -f -` and streams the resulting SQL (same as a manual `pg_restore` “script” output, no database required). Requires PostgreSQL client tools on `PATH` (`pg_restore`), or set `--pg-restore-path`. Use `--dump-decode-arg` to pass extra flags (e.g. `--no-owner --no-acl`). **By default** the archive is removed after a fully successful run; pass **`--dump-decode-keep-input`** to retain it. **`--check`** requires **`--dump-decode-keep-input`** so the archive still exists if changes would be detected.
326
+
327
+ Example (e.g. after `heroku pg:backups:download`):
328
+
329
+ ```bash
330
+ dumpling --dump-decode -i latest.dump -c .dumplingconf -o anonymized.sql
331
+ ```
293
332
 
294
333
  ### SQLite (`--format sqlite`)
295
334
 
@@ -335,6 +374,22 @@ Supported predicate operators:
335
374
 
336
375
  Predicates can target nested JSON values using dot notation (`payload.profile.tier`) or Django-style notation (`payload__profile__tier`). For JSON arrays, path segments are evaluated against each element, so list-of-dicts structures can be matched naturally.
337
376
 
377
+ ### JSON path list targeting
378
+
379
+ JSON list/array traversal is automatic once a path segment resolves to an array.
380
+
381
+ - **All elements in an array**: use the next field name directly.
382
+ - `payload.items.kind` or `payload__items__kind`
383
+ - Matches/rewrites `kind` for every object in `items`.
384
+ - **Specific array index**: use a numeric segment.
385
+ - `payload.items.0.kind` or `payload__items__0__kind`
386
+ - Targets only the first element.
387
+ - **Nested arrays**: combine field and index segments as needed.
388
+ - `payload.groups.members.email`
389
+ - `payload.groups.1.members.0.email`
390
+
391
+ This path behavior is shared by both `row_filters` predicates and JSON-path anonymization rules in `[rules]`.
392
+
338
393
  ```toml
339
394
  [row_filters."public.users"]
340
395
  retain = [
@@ -477,5 +532,5 @@ See the [CI guardrails documentation](docs/src/ci-guardrails.md) for full pipeli
477
532
 
478
533
  ## Full documentation
479
534
 
480
- Detailed docs, including the configuration reference and release process, are available at the project's [GitHub Pages site](https://github.com) (built from `docs/src/`).
535
+ Detailed docs, including the configuration reference and release process, are available at the project's [GitHub Pages site](https://ababic.github.io/dumpling/) (built from `docs/src/`).
481
536
 
@@ -1,15 +1,45 @@
1
- # Dumpling
1
+ <p align="center">
2
+ <img src="assets/logo.svg" width="140" height="140" alt="Dumpling logo: a dumpling with steam" />
3
+ </p>
4
+
5
+ <h1 align="center">Dumpling</h1>
6
+
7
+ <p align="center">
8
+ <strong>Sanitize SQL dumps before they go anywhere.</strong><br />
9
+ Turn huge <code>pg_dump</code> / SQLite / SQL Server exports into shareable, test-friendly snapshots — no DB connection, no secrets left by accident.
10
+ </p>
11
+
12
+ <p align="center">
13
+ <a href="https://pypi.org/project/dumpling-cli/"><img src="https://img.shields.io/pypi/v/dumpling-cli.svg" alt="PyPI version" /></a>
14
+ <a href="https://pypi.org/project/dumpling-cli/"><img src="https://img.shields.io/pypi/pyversions/dumpling-cli.svg" alt="Python versions" /></a>
15
+ <a href="https://pypi.org/project/dumpling-cli/"><img src="https://img.shields.io/pypi/l/dumpling-cli.svg" alt="PyPI license" /></a>
16
+ <a href="https://github.com/ababic/dumpling/actions/workflows/tests.yml"><img src="https://github.com/ababic/dumpling/actions/workflows/tests.yml/badge.svg" alt="Tests" /></a>
17
+ <a href="https://github.com/ababic/dumpling/actions/workflows/ci.yml"><img src="https://github.com/ababic/dumpling/actions/workflows/ci.yml/badge.svg" alt="Lint" /></a>
18
+ <img src="https://img.shields.io/badge/rust-stable-orange?logo=rust" alt="Rust stable" />
19
+ </p>
20
+
21
+ <p align="center">
22
+ <a href="https://ababic.github.io/dumpling/"><strong>Documentation</strong></a>
23
+ &nbsp;·&nbsp;
24
+ <a href="https://github.com/ababic/dumpling"><strong>GitHub</strong></a>
25
+ </p>
26
+
27
+ <p align="center">
28
+ <sub><em>Disclaimer: This project is entirely vibe-coded, but with strong human guidance, review, and attention to quality and safety.</em></sub>
29
+ </p>
2
30
 
3
- **Dumpling** is a static anonymizer for plain SQL dumps. It supports PostgreSQL (`pg_dump` plain format), SQLite (`.dump`), and SQL Server / MSSQL (SSMS / mssql-scripter output). It lets you safely share, test with, or store database snapshots by replacing sensitive column data according to configurable rules — without ever touching a live database.
31
+ ---
32
+
33
+ **Dumpling** reads plain-text SQL dumps (PostgreSQL `pg_dump`, SQLite `.dump`, SQL Server / MSSQL scripts) and rewrites sensitive columns using rules you define in TOML. Everything runs offline on files — ideal for CI, staging share-outs, and compliance-minded workflows.
4
34
 
5
35
  ## Why Dumpling?
6
36
 
7
- - **No live database required.** Works entirely on dump files; nothing connects to your database.
8
- - **Streaming and memory-efficient.** Processes dumps line by line, so even multi-gigabyte files stay manageable.
9
- - **Fail-safe by default.** If no configuration is found, Dumpling exits non-zero and tells you exactly where it looked. Silence is never mistaken for success.
10
- - **Deterministic anonymization.** Domain mappings ensure the same source value always produces the same pseudonym, keeping foreign-key relationships intact across tables.
11
- - **CI/CD ready.** `--check` mode, strict-coverage enforcement, JSON reports, and residual-PII scan gates plug cleanly into any pipeline.
12
- - **Flexible configuration.** Rules live in a `.dumplingconf` file or directly in `pyproject.toml` — no extra tooling needed.
37
+ - **Offline by design** — works on dump files only; nothing connects to your database.
38
+ - **Streams giant files** — line-by-line processing keeps multi‑GB dumps reasonable on modest hardware.
39
+ - **Fails loud, not silent** — missing config exits nonzero and lists where Dumpling looked; use `--allow-noop` only when you mean it.
40
+ - **Stable pseudonyms** — optional domain mappings keep the same source value as the same fake value across tables (foreign keys stay consistent).
41
+ - **Pipeline-ready** — `--check`, strict coverage, JSON reports, and residual PII scans fit pre-merge gates and release automation.
42
+ - **Configure once** — `.dumplingconf` or `[tool.dumpling]` in `pyproject.toml`; install via **Rust** (`cargo`) or **`pip install dumpling-cli`**.
13
43
 
14
44
  ---
15
45
 
@@ -268,7 +298,16 @@ Produced by `pg_dump --format=plain`. Handles:
268
298
  - `"double-quoted"` identifiers
269
299
  - `''`-escaped string literals
270
300
 
271
- Binary, custom, and directory formats from `pg_dump` are not supported — use `--format=plain` when running `pg_dump`.
301
+ Binary, custom, and directory formats from `pg_dump` are not parsed directly — Dumpling’s SQL pipeline expects plain text. Use either:
302
+
303
+ - **`pg_dump --format=plain`** when you control capture, or
304
+ - **`dumpling --dump-decode`** with `--input` set to a **custom-format** (`.dump`) or **directory-format** folder: Dumpling runs `pg_restore -f -` and streams the resulting SQL (same as a manual `pg_restore` “script” output, no database required). Requires PostgreSQL client tools on `PATH` (`pg_restore`), or set `--pg-restore-path`. Use `--dump-decode-arg` to pass extra flags (e.g. `--no-owner --no-acl`). **By default** the archive is removed after a fully successful run; pass **`--dump-decode-keep-input`** to retain it. **`--check`** requires **`--dump-decode-keep-input`** so the archive still exists if changes would be detected.
305
+
306
+ Example (e.g. after `heroku pg:backups:download`):
307
+
308
+ ```bash
309
+ dumpling --dump-decode -i latest.dump -c .dumplingconf -o anonymized.sql
310
+ ```
272
311
 
273
312
  ### SQLite (`--format sqlite`)
274
313
 
@@ -314,6 +353,22 @@ Supported predicate operators:
314
353
 
315
354
  Predicates can target nested JSON values using dot notation (`payload.profile.tier`) or Django-style notation (`payload__profile__tier`). For JSON arrays, path segments are evaluated against each element, so list-of-dicts structures can be matched naturally.
316
355
 
356
+ ### JSON path list targeting
357
+
358
+ JSON list/array traversal is automatic once a path segment resolves to an array.
359
+
360
+ - **All elements in an array**: use the next field name directly.
361
+ - `payload.items.kind` or `payload__items__kind`
362
+ - Matches/rewrites `kind` for every object in `items`.
363
+ - **Specific array index**: use a numeric segment.
364
+ - `payload.items.0.kind` or `payload__items__0__kind`
365
+ - Targets only the first element.
366
+ - **Nested arrays**: combine field and index segments as needed.
367
+ - `payload.groups.members.email`
368
+ - `payload.groups.1.members.0.email`
369
+
370
+ This path behavior is shared by both `row_filters` predicates and JSON-path anonymization rules in `[rules]`.
371
+
317
372
  ```toml
318
373
  [row_filters."public.users"]
319
374
  retain = [
@@ -456,4 +511,4 @@ See the [CI guardrails documentation](docs/src/ci-guardrails.md) for full pipeli
456
511
 
457
512
  ## Full documentation
458
513
 
459
- Detailed docs, including the configuration reference and release process, are available at the project's [GitHub Pages site](https://github.com) (built from `docs/src/`).
514
+ Detailed docs, including the configuration reference and release process, are available at the project's [GitHub Pages site](https://ababic.github.io/dumpling/) (built from `docs/src/`).
@@ -0,0 +1,33 @@
1
+ <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 128 128" role="img" aria-label="Dumpling logo">
2
+ <defs>
3
+ <linearGradient id="steam" x1="0%" y1="100%" x2="0%" y2="0%">
4
+ <stop offset="0%" stop-color="#e8f4fc" stop-opacity="0"/>
5
+ <stop offset="50%" stop-color="#cfe9fb" stop-opacity="0.85"/>
6
+ <stop offset="100%" stop-color="#b8dff8" stop-opacity="0"/>
7
+ </linearGradient>
8
+ <linearGradient id="dough" x1="0%" y1="0%" x2="100%" y2="100%">
9
+ <stop offset="0%" stop-color="#fff8ef"/>
10
+ <stop offset="45%" stop-color="#f4dcc4"/>
11
+ <stop offset="100%" stop-color="#e8b896"/>
12
+ </linearGradient>
13
+ <linearGradient id="shadow" x1="0%" y1="0%" x2="0%" y2="100%">
14
+ <stop offset="0%" stop-color="#c4865a" stop-opacity="0.35"/>
15
+ <stop offset="100%" stop-color="#8b5a3c" stop-opacity="0.15"/>
16
+ </linearGradient>
17
+ </defs>
18
+ <!-- Steam -->
19
+ <path fill="url(#steam)" d="M44 18c2-6 8-10 14-8s8 10 4 15c-3 4-2 9 2 12 5 4 5 12 0 16-6 5-16 4-20-3-2-4 0-9 4-11 3-2 3-6 0-9-4-4-4-10 0-12z"/>
20
+ <path fill="url(#steam)" opacity="0.75" d="M64 14c3-5 9-7 14-4 5 3 6 10 2 14-4 4-3 10 2 13 6 4 7 13 1 18-7 6-19 4-24-5-2-5 1-11 6-13 4-2 4-7 1-10-5-4-5-11 0-13z"/>
21
+ <path fill="url(#steam)" opacity="0.6" d="M82 20c2-5 8-8 13-5 5 3 7 10 3 15-3 4-2 9 3 12 5 3 7 11 2 16-6 6-17 5-22-3-2-4 0-9 5-11 3-2 4-6 1-9-4-4-4-10 0-13z"/>
22
+ <!-- Plate -->
23
+ <ellipse cx="64" cy="108" rx="52" ry="10" fill="#dfe8ef"/>
24
+ <ellipse cx="64" cy="106" rx="48" ry="8" fill="#eef4f8"/>
25
+ <!-- Dumpling body -->
26
+ <ellipse cx="64" cy="82" rx="42" ry="28" fill="url(#dough)" stroke="#d4a574" stroke-width="2"/>
27
+ <ellipse cx="64" cy="96" rx="38" ry="12" fill="url(#shadow)"/>
28
+ <!-- Pleats -->
29
+ <path fill="none" stroke="#c9956a" stroke-width="1.8" stroke-linecap="round" d="M34 58c6 10 14 16 30 16s24-6 30-16"/>
30
+ <path fill="none" stroke="#d9b08a" stroke-width="1.2" stroke-linecap="round" opacity="0.9" d="M42 54c5 8 13 13 22 13s17-5 22-13"/>
31
+ <!-- Highlight -->
32
+ <ellipse cx="48" cy="76" rx="10" ry="6" fill="#ffffff" opacity="0.35"/>
33
+ </svg>
@@ -6,7 +6,7 @@ Use `--format` to declare the SQL dialect of your input file:
6
6
 
7
7
  | Value | Description |
8
8
  |---|---|
9
- | `postgres` (default) | PostgreSQL `pg_dump` plain-text format. Supports `COPY … FROM stdin` blocks, `"double-quoted"` identifiers, `''`-escaped strings. |
9
+ | `postgres` (default) | PostgreSQL `pg_dump` plain-text format. Supports `COPY … FROM stdin` blocks, `"double-quoted"` identifiers, `''`-escaped strings. Custom-format (`-Fc`) or directory dumps can be decoded on the fly with `dumpling --dump-decode` (wraps `pg_restore -f -`; requires client tools). By default the archive is deleted after success; use `--dump-decode-keep-input` to retain it. |
10
10
  | `sqlite` | SQLite `.dump` format. Adds `INSERT OR REPLACE INTO` / `INSERT OR IGNORE INTO` support. No COPY blocks. |
11
11
  | `mssql` | SQL Server / MSSQL plain SQL. Adds `[bracket]` identifier quoting, `N'…'` Unicode string literals, and `nvarchar(n)` / `nchar(n)` length extraction. No COPY blocks. |
12
12
 
@@ -17,6 +17,28 @@ dumpling --format sqlite -i data.db.sql -o anonymized.sql
17
17
  dumpling --format mssql -i backup.sql -o anonymized.sql
18
18
  ```
19
19
 
20
+ ### PostgreSQL custom-format archives (`--dump-decode`)
21
+
22
+ Heroku PGBackups and many pipelines ship **`pg_dump` custom format** (`-Fc`) or **directory-format** dumps to save bandwidth. Dumpling’s SQL engine still expects **plain text**; use **`--dump-decode`** so Dumpling runs **`pg_restore -f -`** (script to stdout, no database) and pipes the result through the same anonymizer as a normal plain-SQL file.
23
+
24
+ **Requirements:** PostgreSQL client tools on `PATH` (`pg_restore`), or set **`--pg-restore-path`**. Use **`--dump-decode-arg`** (repeatable) for extra `pg_restore` flags, e.g. `--dump-decode-arg=--no-owner --dump-decode-arg=--no-acl`.
25
+
26
+ **Input deletion:** After a **fully successful** run, Dumpling **removes** the `--input` path (single file or directory-format folder) by default so only the anonymized output remains. Pass **`--dump-decode-keep-input`** to retain the archive.
27
+
28
+ **Check mode:** **`--check`** with **`--dump-decode`** requires **`--dump-decode-keep-input`**. Otherwise the default would delete the dump before you can iterate on config.
29
+
30
+ Example (e.g. after `heroku pg:backups:download`):
31
+
32
+ ```bash
33
+ dumpling --dump-decode -i latest.dump -c .dumplingconf -o anonymized.sql
34
+ ```
35
+
36
+ Dry run while keeping the downloaded file:
37
+
38
+ ```bash
39
+ dumpling --dump-decode --dump-decode-keep-input --check -i latest.dump -c .dumplingconf
40
+ ```
41
+
20
42
  ---
21
43
 
22
44
  ## Configuration sources
@@ -28,4 +28,6 @@ cargo test --all-targets --all-features
28
28
  dumpling -i dump.sql -o sanitized.sql
29
29
  ```
30
30
 
31
+ If your input is a PostgreSQL **custom-format** file (not plain SQL), decode and anonymize in one step with **`--dump-decode`** (needs `pg_restore` from PostgreSQL client tools). See [PostgreSQL custom-format archives](configuration.md#postgresql-custom-format-archives---dump-decode) in the configuration guide.
32
+
31
33
  For full command examples and strategy options, see the repository `README.md`.
@@ -1,6 +1,6 @@
1
1
  # Dumpling documentation
2
2
 
3
- Dumpling is a streaming anonymizer for plain SQL dumps. It supports PostgreSQL (`pg_dump` plain format), SQLite (`.dump`), and SQL Server / MSSQL (SSMS / mssql-scripter plain SQL output).
3
+ Dumpling is a streaming anonymizer for plain SQL dumps. It supports PostgreSQL (`pg_dump` plain format), SQLite (`.dump`), and SQL Server / MSSQL (SSMS / mssql-scripter plain SQL output). For PostgreSQL **custom-format** archives (e.g. Heroku `pg:backups:download`), use **`--dump-decode`** so Dumpling invokes `pg_restore` and streams plain SQL—see [Dump format](configuration.md#postgresql-custom-format-archives---dump-decode) in the configuration guide.
4
4
 
5
5
  This documentation covers the operating model for day-to-day use:
6
6
 
@@ -4,7 +4,7 @@ build-backend = "maturin"
4
4
 
5
5
  [project]
6
6
  name = "dumpling-cli"
7
- version = "0.3.0"
7
+ version = "0.4.1"
8
8
  description = "Static anonymizer for plain SQL dumps (PostgreSQL, SQLite, SQL Server)."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
@@ -1,6 +1,7 @@
1
1
  use std::fs::File;
2
2
  use std::io::{self, BufRead, BufReader, BufWriter, Write};
3
3
  use std::path::{Path, PathBuf};
4
+ use std::process::{Command, Stdio};
4
5
 
5
6
  use clap::{ArgAction, Parser, Subcommand};
6
7
 
@@ -13,6 +14,7 @@ mod settings;
13
14
  mod sql;
14
15
  mod transform;
15
16
 
17
+ use anyhow::Context;
16
18
  use regex::Regex;
17
19
  use report::Reporter;
18
20
  use scan::{OutputScanner, ScanningWriter};
@@ -105,6 +107,26 @@ struct Cli {
105
107
  #[arg(long = "security-profile", default_value = "standard")]
106
108
  security_profile: String,
107
109
 
110
+ /// Decode PostgreSQL custom-format or directory-format dumps via `pg_restore -f -` before anonymizing.
111
+ /// Requires `--input` pointing at the archive file or directory and `--format postgres`. Requires a
112
+ /// PostgreSQL client install (`pg_restore` on PATH unless overridden by `--pg-restore-path`).
113
+ #[arg(long = "dump-decode", action = ArgAction::SetTrue)]
114
+ dump_decode: bool,
115
+
116
+ /// Keep the input archive after `--dump-decode` (default: delete file or directory after a fully
117
+ /// successful run). Required together with `--check`, because the default would delete the archive on success.
118
+ #[arg(long = "dump-decode-keep-input", action = ArgAction::SetTrue)]
119
+ dump_decode_keep_input: bool,
120
+
121
+ /// `pg_restore` executable to use with `--dump-decode` (default: `pg_restore` on PATH).
122
+ #[arg(long = "pg-restore-path", default_value = "pg_restore")]
123
+ pg_restore_path: PathBuf,
124
+
125
+ /// Extra arguments forwarded to `pg_restore` before the archive path (repeatable). Example:
126
+ /// `--dump-decode-arg=--no-owner` `--dump-decode-arg=--no-acl`
127
+ #[arg(long = "dump-decode-arg")]
128
+ dump_decode_arg: Vec<String>,
129
+
108
130
  #[command(subcommand)]
109
131
  command: Option<Commands>,
110
132
  }
@@ -184,6 +206,14 @@ fn run_anonymize(cli: Cli) -> anyhow::Result<()> {
184
206
  if cli.check && (cli.in_place || cli.output.is_some()) {
185
207
  anyhow::bail!("--check cannot be used together with --output or --in-place");
186
208
  }
209
+ if cli.dump_decode && !cli.dump_decode_keep_input && cli.check {
210
+ anyhow::bail!(
211
+ "--dump-decode removes the input archive on success by default; use --dump-decode-keep-input with --check"
212
+ );
213
+ }
214
+ if cli.dump_decode && cli.in_place {
215
+ anyhow::bail!("--dump-decode cannot be used with --in-place");
216
+ }
187
217
 
188
218
  // Resolve config from provided path or discover in CWD
189
219
  let resolved_config: ResolvedConfig =
@@ -247,36 +277,97 @@ fn run_anonymize(cli: Cli) -> anyhow::Result<()> {
247
277
  other
248
278
  ),
249
279
  };
280
+ if cli.dump_decode && dump_format != DumpFormat::Postgres {
281
+ anyhow::bail!(
282
+ "--dump-decode only applies to PostgreSQL dumps; use --format postgres (default)"
283
+ );
284
+ }
250
285
 
251
286
  // Compile table include/exclude regex patterns
252
287
  let include_res = compile_patterns(&cli.include_table)?;
253
288
  let exclude_res = compile_patterns(&cli.exclude_table)?;
254
289
 
255
- // Determine IO
256
- let (mut reader, input_path_for_inplace): (Box<dyn BufRead>, Option<PathBuf>) = match &cli.input
290
+ // Determine IO (optional pg_restore child when --dump-decode)
291
+ let mut pg_restore_child: Option<std::process::Child> = None;
292
+ let (mut reader, input_path_for_inplace): (Box<dyn BufRead>, Option<PathBuf>) = if cli
293
+ .dump_decode
257
294
  {
258
- Some(path) => {
259
- // Enforce extension allowlist if provided
260
- if !cli.allow_ext.is_empty() && !has_allowed_extension(path, &cli.allow_ext) {
261
- let actual = path
262
- .extension()
263
- .and_then(|s| s.to_str())
264
- .unwrap_or("<none>")
265
- .to_string();
266
- anyhow::bail!(
267
- "input file extension '{}' is not in allowed set {:?}",
268
- actual,
269
- cli.allow_ext
270
- );
271
- }
272
- let f = File::open(path)?;
273
- (Box::new(BufReader::new(f)), Some(path.clone()))
295
+ let archive_path = cli
296
+ .input
297
+ .as_ref()
298
+ .ok_or_else(|| {
299
+ anyhow::anyhow!(
300
+ "--dump-decode requires --input pointing at a pg_dump custom-format file or directory-format directory"
301
+ )
302
+ })?;
303
+ if !cli.allow_ext.is_empty() && !has_allowed_extension(archive_path, &cli.allow_ext) {
304
+ let actual = archive_path
305
+ .extension()
306
+ .and_then(|s| s.to_str())
307
+ .unwrap_or("<none>")
308
+ .to_string();
309
+ anyhow::bail!(
310
+ "input file extension '{}' is not in allowed set {:?}",
311
+ actual,
312
+ cli.allow_ext
313
+ );
314
+ }
315
+ if !archive_path.exists() {
316
+ anyhow::bail!(
317
+ "--dump-decode input path does not exist: {}",
318
+ archive_path.display()
319
+ );
320
+ }
321
+ eprintln!(
322
+ "dumpling: decoding PostgreSQL archive via {} -f - {}",
323
+ cli.pg_restore_path.display(),
324
+ archive_path.display()
325
+ );
326
+ let mut cmd = Command::new(&cli.pg_restore_path);
327
+ for a in &cli.dump_decode_arg {
328
+ cmd.arg(a);
274
329
  }
275
- None => {
276
- if !cli.allow_ext.is_empty() {
277
- eprintln!("dumpling: --allow-ext provided but no --input file; extension check is ignored for stdin");
330
+ cmd.arg("-f")
331
+ .arg("-")
332
+ .arg(archive_path)
333
+ .stdout(Stdio::piped())
334
+ .stderr(Stdio::inherit());
335
+ let mut child = cmd.spawn().with_context(|| {
336
+ format!(
337
+ "failed to spawn `{}`; install PostgreSQL client tools or set --pg-restore-path",
338
+ cli.pg_restore_path.display()
339
+ )
340
+ })?;
341
+ let stdout = child
342
+ .stdout
343
+ .take()
344
+ .ok_or_else(|| anyhow::anyhow!("pg_restore stdout missing"))?;
345
+ pg_restore_child = Some(child);
346
+ (Box::new(BufReader::new(stdout)), Some(archive_path.clone()))
347
+ } else {
348
+ match &cli.input {
349
+ Some(path) => {
350
+ if !cli.allow_ext.is_empty() && !has_allowed_extension(path, &cli.allow_ext) {
351
+ let actual = path
352
+ .extension()
353
+ .and_then(|s| s.to_str())
354
+ .unwrap_or("<none>")
355
+ .to_string();
356
+ anyhow::bail!(
357
+ "input file extension '{}' is not in allowed set {:?}",
358
+ actual,
359
+ cli.allow_ext
360
+ );
361
+ }
362
+ let f = File::open(path)?;
363
+ (Box::new(BufReader::new(f)), Some(path.clone()))
364
+ }
365
+ None => {
366
+ if !cli.allow_ext.is_empty() {
367
+ eprintln!("dumpling: --allow-ext provided but no --input file; extension check is ignored for stdin");
368
+ }
369
+ (Box::new(BufReader::new(io::stdin())), None)
278
370
  }
279
- (Box::new(BufReader::new(io::stdin())), None)
280
371
  }
281
372
  };
282
373
 
@@ -330,12 +421,30 @@ fn run_anonymize(cli: Cli) -> anyhow::Result<()> {
330
421
  dump_format,
331
422
  );
332
423
  let mut writer = output;
333
- if let Some(scanner) = output_scanner.as_mut() {
424
+ let proc_res = if let Some(scanner) = output_scanner.as_mut() {
334
425
  let mut scanning_writer = ScanningWriter::new(&mut writer, scanner);
335
- processor.process(&mut reader, &mut scanning_writer)?;
426
+ processor.process(&mut reader, &mut scanning_writer)
336
427
  } else {
337
- processor.process(&mut reader, &mut writer)?;
428
+ processor.process(&mut reader, &mut writer)
429
+ };
430
+
431
+ if let Some(mut child) = pg_restore_child {
432
+ if proc_res.is_err() {
433
+ let _ = child.kill();
434
+ }
435
+ let status = child
436
+ .wait()
437
+ .with_context(|| format!("waiting for `{}`", cli.pg_restore_path.display()))?;
438
+ if proc_res.is_ok() && !status.success() {
439
+ anyhow::bail!(
440
+ "`{}` exited with status {}",
441
+ cli.pg_restore_path.display(),
442
+ status
443
+ );
444
+ }
338
445
  }
446
+
447
+ proc_res?;
339
448
  let coverage = processor.sensitive_coverage_summary();
340
449
  reporter.report.sensitive_columns_detected = coverage.detected.clone();
341
450
  reporter.report.sensitive_columns_covered = coverage.covered.clone();
@@ -363,7 +472,10 @@ fn run_anonymize(cli: Cli) -> anyhow::Result<()> {
363
472
 
364
473
  // If in-place, do the swap now
365
474
  if cli.in_place {
366
- let input_path = input_path_for_inplace.unwrap();
475
+ let input_path = input_path_for_inplace
476
+ .as_ref()
477
+ .ok_or_else(|| anyhow::anyhow!("--in-place requires an --input path"))?
478
+ .clone();
367
479
  let mut tmp = input_path.clone();
368
480
  tmp.set_extension("sql.dumpling.tmp");
369
481
  writer.flush()?;
@@ -405,9 +517,30 @@ fn run_anonymize(cli: Cli) -> anyhow::Result<()> {
405
517
  std::process::exit(1);
406
518
  }
407
519
 
520
+ if cli.dump_decode && !cli.dump_decode_keep_input {
521
+ if let Some(ref p) = input_path_for_inplace {
522
+ match remove_pg_archive(p) {
523
+ Ok(()) => eprintln!("dumpling: removed input archive {}", p.display()),
524
+ Err(e) => eprintln!(
525
+ "dumpling: warning: could not remove input archive {}: {}",
526
+ p.display(),
527
+ e
528
+ ),
529
+ }
530
+ }
531
+ }
532
+
408
533
  Ok(())
409
534
  }
410
535
 
536
+ fn remove_pg_archive(path: &Path) -> std::io::Result<()> {
537
+ if path.is_dir() {
538
+ std::fs::remove_dir_all(path)
539
+ } else {
540
+ std::fs::remove_file(path)
541
+ }
542
+ }
543
+
411
544
  fn compile_patterns(patterns: &[String]) -> anyhow::Result<Vec<Regex>> {
412
545
  let mut out = Vec::new();
413
546
  for p in patterns {
@@ -494,6 +627,24 @@ mod tests_main {
494
627
  }
495
628
  }
496
629
 
630
+ #[test]
631
+ fn test_dump_decode_flags_parse() {
632
+ let cli = Cli::parse_from([
633
+ "dumpling",
634
+ "--dump-decode",
635
+ "--dump-decode-keep-input",
636
+ "--pg-restore-path",
637
+ "/usr/bin/pg_restore",
638
+ "--dump-decode-arg=--no-owner",
639
+ "-i",
640
+ "/tmp/latest.dump",
641
+ ]);
642
+ assert!(cli.dump_decode);
643
+ assert!(cli.dump_decode_keep_input);
644
+ assert_eq!(cli.pg_restore_path, PathBuf::from("/usr/bin/pg_restore"));
645
+ assert_eq!(cli.dump_decode_arg, vec!["--no-owner"]);
646
+ }
647
+
497
648
  #[test]
498
649
  fn test_lint_policy_allow_noop_flag() {
499
650
  let cli = Cli::parse_from(["dumpling", "lint-policy", "--allow-noop"]);
@@ -1155,20 +1155,22 @@ struct Cell {
1155
1155
  original: Option<String>, // None for NULL
1156
1156
  was_quoted: bool,
1157
1157
  was_default: bool,
1158
+ trailing_expr: Option<String>,
1158
1159
  }
1159
1160
 
1160
1161
  impl Cell {
1161
1162
  fn render_original(&self) -> String {
1163
+ let trailing = self.trailing_expr.as_deref().unwrap_or("");
1162
1164
  if self.was_default {
1163
- return "DEFAULT".to_string();
1165
+ return format!("DEFAULT{trailing}");
1164
1166
  }
1165
1167
  match &self.original {
1166
- None => "NULL".to_string(),
1168
+ None => format!("NULL{trailing}"),
1167
1169
  Some(s) => {
1168
1170
  if self.was_quoted {
1169
- format!("'{}'", s.replace('\'', "''"))
1171
+ format!("'{}'{trailing}", s.replace('\'', "''"))
1170
1172
  } else {
1171
- s.clone()
1173
+ format!("{s}{trailing}")
1172
1174
  }
1173
1175
  }
1174
1176
  }
@@ -1176,14 +1178,15 @@ impl Cell {
1176
1178
  }
1177
1179
 
1178
1180
  fn render_cell(repl: &Replacement, original: &Cell) -> String {
1181
+ let trailing = original.trailing_expr.as_deref().unwrap_or("");
1179
1182
  if repl.is_null {
1180
- return "NULL".to_string();
1183
+ return format!("NULL{trailing}");
1181
1184
  }
1182
1185
  let should_quote = repl.force_quoted || original.was_quoted;
1183
1186
  if should_quote {
1184
- format!("'{}'", repl.value.replace('\'', "''"))
1187
+ format!("'{}'{trailing}", repl.value.replace('\'', "''"))
1185
1188
  } else {
1186
- repl.value.clone()
1189
+ format!("{}{trailing}", repl.value)
1187
1190
  }
1188
1191
  }
1189
1192
 
@@ -1243,7 +1246,9 @@ fn parse_parenthesized_values(s: &str) -> anyhow::Result<(Vec<Cell>, usize)> {
1243
1246
  let mut cells: Vec<Cell> = Vec::new();
1244
1247
  let mut in_single = false;
1245
1248
  let mut buf = String::new();
1249
+ let mut trailing_expr = String::new();
1246
1250
  let mut was_quoted = false;
1251
+ let mut closed_quoted_literal = false;
1247
1252
  while i < chs.len() {
1248
1253
  let c = chs[i];
1249
1254
  if in_single {
@@ -1255,6 +1260,7 @@ fn parse_parenthesized_values(s: &str) -> anyhow::Result<(Vec<Cell>, usize)> {
1255
1260
  continue;
1256
1261
  } else {
1257
1262
  in_single = false;
1263
+ closed_quoted_literal = true;
1258
1264
  i += 1;
1259
1265
  continue;
1260
1266
  }
@@ -1282,17 +1288,19 @@ fn parse_parenthesized_values(s: &str) -> anyhow::Result<(Vec<Cell>, usize)> {
1282
1288
  }
1283
1289
  ')' => {
1284
1290
  // end cell, end row
1285
- let cell = finalize_cell(&buf, was_quoted);
1291
+ let cell = finalize_cell(&buf, was_quoted, &trailing_expr);
1286
1292
  cells.push(cell);
1287
1293
  i += 1;
1288
1294
  return Ok((cells, i));
1289
1295
  }
1290
1296
  ',' => {
1291
1297
  // end cell
1292
- let cell = finalize_cell(&buf, was_quoted);
1298
+ let cell = finalize_cell(&buf, was_quoted, &trailing_expr);
1293
1299
  cells.push(cell);
1294
1300
  buf.clear();
1301
+ trailing_expr.clear();
1295
1302
  was_quoted = false;
1303
+ closed_quoted_literal = false;
1296
1304
  i += 1;
1297
1305
  // consume following spaces
1298
1306
  while i < chs.len() && chs[i].is_whitespace() {
@@ -1300,11 +1308,19 @@ fn parse_parenthesized_values(s: &str) -> anyhow::Result<(Vec<Cell>, usize)> {
1300
1308
  }
1301
1309
  }
1302
1310
  c if c.is_whitespace() => {
1303
- // skip insignificant whitespace between tokens when unquoted
1311
+ // Preserve whitespace after a quoted literal so explicit SQL casts stay intact.
1312
+ if was_quoted && closed_quoted_literal {
1313
+ trailing_expr.push(c);
1314
+ }
1315
+ // Skip insignificant whitespace between tokens when unquoted.
1304
1316
  i += 1;
1305
1317
  }
1306
1318
  other => {
1307
- buf.push(other);
1319
+ if was_quoted && closed_quoted_literal {
1320
+ trailing_expr.push(other);
1321
+ } else {
1322
+ buf.push(other);
1323
+ }
1308
1324
  i += 1;
1309
1325
  }
1310
1326
  }
@@ -1313,12 +1329,21 @@ fn parse_parenthesized_values(s: &str) -> anyhow::Result<(Vec<Cell>, usize)> {
1313
1329
  anyhow::bail!("unterminated values row")
1314
1330
  }
1315
1331
 
1316
- fn finalize_cell(buf: &str, was_quoted: bool) -> Cell {
1332
+ fn finalize_cell(buf: &str, was_quoted: bool, trailing_expr: &str) -> Cell {
1333
+ let trailing = {
1334
+ let t = trailing_expr.trim();
1335
+ if t.is_empty() {
1336
+ None
1337
+ } else {
1338
+ Some(t.to_string())
1339
+ }
1340
+ };
1317
1341
  if was_quoted {
1318
1342
  Cell {
1319
1343
  original: Some(buf.to_string()),
1320
1344
  was_quoted: true,
1321
1345
  was_default: false,
1346
+ trailing_expr: trailing,
1322
1347
  }
1323
1348
  } else {
1324
1349
  let t = buf.trim();
@@ -1327,18 +1352,21 @@ fn finalize_cell(buf: &str, was_quoted: bool) -> Cell {
1327
1352
  original: None,
1328
1353
  was_quoted: false,
1329
1354
  was_default: false,
1355
+ trailing_expr: None,
1330
1356
  }
1331
1357
  } else if t.eq_ignore_ascii_case("default") {
1332
1358
  Cell {
1333
1359
  original: None,
1334
1360
  was_quoted: false,
1335
1361
  was_default: true,
1362
+ trailing_expr: None,
1336
1363
  }
1337
1364
  } else {
1338
1365
  Cell {
1339
1366
  original: Some(t.to_string()),
1340
1367
  was_quoted: false,
1341
1368
  was_default: false,
1369
+ trailing_expr: None,
1342
1370
  }
1343
1371
  }
1344
1372
  }
@@ -2370,6 +2398,90 @@ COPY public.events (id, payload) FROM stdin;
2370
2398
  );
2371
2399
  }
2372
2400
 
2401
+ #[test]
2402
+ fn parse_values_rows_tracks_trailing_cast_for_quoted_literals() {
2403
+ let rows =
2404
+ parse_values_rows("(1, '{\"profile\":{\"secret\":\"alpha\"}}'::jsonb, 'note'::text)")
2405
+ .unwrap();
2406
+ assert_eq!(rows.len(), 1);
2407
+ assert_eq!(rows[0].len(), 3);
2408
+ assert_eq!(
2409
+ rows[0][1].original.as_deref(),
2410
+ Some("{\"profile\":{\"secret\":\"alpha\"}}")
2411
+ );
2412
+ assert_eq!(rows[0][1].trailing_expr.as_deref(), Some("::jsonb"));
2413
+ assert_eq!(rows[0][2].original.as_deref(), Some("note"));
2414
+ assert_eq!(rows[0][2].trailing_expr.as_deref(), Some("::text"));
2415
+ }
2416
+
2417
+ #[test]
2418
+ fn pipeline_anonymizes_nested_json_paths_for_jsonb_cast_insert_rows() {
2419
+ let mut rules: HashMap<String, HashMap<String, AnonymizerSpec>> = HashMap::new();
2420
+ let mut cols: HashMap<String, AnonymizerSpec> = HashMap::new();
2421
+ cols.insert(
2422
+ "payload.profile.secret".to_string(),
2423
+ AnonymizerSpec {
2424
+ strategy: "string".to_string(),
2425
+ salt: None,
2426
+ min: None,
2427
+ max: None,
2428
+ length: Some(8),
2429
+ min_days: None,
2430
+ max_days: None,
2431
+ min_seconds: None,
2432
+ max_seconds: None,
2433
+ domain: Some("secrets".to_string()),
2434
+ unique_within_domain: None,
2435
+ as_string: Some(true),
2436
+ locale: None,
2437
+ faker: None,
2438
+ format: None,
2439
+ },
2440
+ );
2441
+ rules.insert("public.events".to_string(), cols);
2442
+ let cfg = ResolvedConfig {
2443
+ salt: None,
2444
+ rules,
2445
+ row_filters: HashMap::new(),
2446
+ column_cases: HashMap::new(),
2447
+ sensitive_columns: HashMap::new(),
2448
+ output_scan: crate::settings::OutputScanConfig::default(),
2449
+ source_path: None,
2450
+ };
2451
+ let reg = AnonymizerRegistry::from_config(&cfg);
2452
+ let mut proc =
2453
+ SqlStreamProcessor::new(reg, cfg, Vec::new(), Vec::new(), None, DumpFormat::Postgres);
2454
+ let input = r#"
2455
+ CREATE TABLE public.events (id int, payload jsonb);
2456
+ INSERT INTO public.events (id, payload) VALUES
2457
+ (1, '{"profile":{"tier":"gold","secret":"alpha"}}'::jsonb),
2458
+ (2, '{"profile":{"tier":"gold","secret":"alpha"}}'::jsonb);
2459
+ "#;
2460
+ let mut reader = std::io::BufReader::new(input.as_bytes());
2461
+ let mut out = Vec::new();
2462
+ proc.process(&mut reader, &mut out).unwrap();
2463
+ let s = String::from_utf8(out).unwrap();
2464
+ assert!(!s.contains("alpha"), "nested secret should be anonymized");
2465
+ assert!(s.contains("::jsonb"), "jsonb cast should be preserved");
2466
+
2467
+ let insert_pos = s.find("INSERT INTO public.events").unwrap();
2468
+ let insert_tail = &s[insert_pos..];
2469
+ let insert_end = insert_tail.find(";\n").unwrap() + insert_pos;
2470
+ let ins_stmt = &s[insert_pos..=insert_end];
2471
+ let vals_idx = ins_stmt.to_uppercase().find("VALUES").unwrap();
2472
+ let ins_block = strip_trailing_semicolon(ins_stmt[vals_idx + "VALUES".len()..].trim());
2473
+ let ins_rows = parse_values_rows(ins_block).unwrap();
2474
+ assert_eq!(ins_rows[0][1].trailing_expr.as_deref(), Some("::jsonb"));
2475
+ assert_eq!(ins_rows[1][1].trailing_expr.as_deref(), Some("::jsonb"));
2476
+ let v0 =
2477
+ serde_json::from_str::<serde_json::Value>(ins_rows[0][1].original.as_ref().unwrap())
2478
+ .unwrap();
2479
+ let v1 =
2480
+ serde_json::from_str::<serde_json::Value>(ins_rows[1][1].original.as_ref().unwrap())
2481
+ .unwrap();
2482
+ assert_eq!(v0["profile"]["secret"], v1["profile"]["secret"]);
2483
+ }
2484
+
2373
2485
  #[test]
2374
2486
  fn generated_values_fit_length_restricted_columns_from_create_table() {
2375
2487
  let mut rules: HashMap<String, HashMap<String, AnonymizerSpec>> = HashMap::new();
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes