dumpling-cli 0.3.0__tar.gz → 0.4.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/CHANGELOG.md +20 -0
- {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/Cargo.lock +1 -1
- {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/Cargo.toml +1 -1
- {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/PKG-INFO +66 -11
- {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/README.md +65 -10
- dumpling_cli-0.4.1/assets/logo.svg +33 -0
- {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/docs/src/configuration.md +23 -1
- {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/docs/src/getting-started.md +2 -0
- {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/docs/src/index.md +1 -1
- {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/pyproject.toml +1 -1
- {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/src/main.rs +177 -26
- {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/src/sql.rs +124 -12
- {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/.dumplingconf.example +0 -0
- {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/.github/workflows/ci.yml +0 -0
- {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/.github/workflows/docs-pr.yml +0 -0
- {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/.github/workflows/docs.yml +0 -0
- {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/.github/workflows/platform-compat-latest.yml +0 -0
- {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/.github/workflows/platform-compat-matrix.yml +0 -0
- {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/.github/workflows/policy-lint.yml +0 -0
- {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/.github/workflows/publish.yml +0 -0
- {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/.github/workflows/release.yml +0 -0
- {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/.github/workflows/tests.yml +0 -0
- {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/.gitignore +0 -0
- {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/AGENTS.md +0 -0
- {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/CONTRIBUTING.md +0 -0
- {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/MAINTENANCE.md +0 -0
- {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/book.toml +0 -0
- {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/datetime_out.sql +0 -0
- {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/datetime_sample.sql +0 -0
- {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/docs/src/SUMMARY.md +0 -0
- {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/docs/src/ci-guardrails.md +0 -0
- {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/docs/src/releasing.md +0 -0
- {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/rust-toolchain.toml +0 -0
- {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/scripts/setup-dev.sh +0 -0
- {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/src/faker_dispatch.rs +0 -0
- {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/src/filter.rs +0 -0
- {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/src/lint.rs +0 -0
- {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/src/report.rs +0 -0
- {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/src/scan.rs +0 -0
- {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/src/settings.rs +0 -0
- {dumpling_cli-0.3.0 → dumpling_cli-0.4.1}/src/transform.rs +0 -0
|
@@ -7,6 +7,24 @@ and this project follows [Semantic Versioning](https://semver.org/spec/v2.0.0.ht
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
+
## [0.4.1] - 2026-05-03
|
|
11
|
+
|
|
12
|
+
### Fixed
|
|
13
|
+
|
|
14
|
+
- **INSERT row parsing with JSON casts**: Values such as `'{"k":1}'::jsonb` are parsed so the cell’s unescaped payload is valid JSON for JSON path rules and anonymization; trailing casts like `::jsonb` / `::text` are preserved on output.
|
|
15
|
+
|
|
16
|
+
## [0.4.0] - 2026-05-02
|
|
17
|
+
|
|
18
|
+
### Added
|
|
19
|
+
|
|
20
|
+
- **`--dump-decode` CLI**: Decode PostgreSQL **custom-format** (`pg_dump -Fc`) or **directory-format** archives by running **`pg_restore -f -`** (plain SQL to stdout, no database), then anonymize—built for workflows such as **`heroku pg:backups:download`**. Requires PostgreSQL client tools (`pg_restore` on `PATH`, or **`--pg-restore-path`**).
|
|
21
|
+
- **`--dump-decode-arg`** (repeatable): Extra arguments forwarded to `pg_restore`.
|
|
22
|
+
- **`--dump-decode-keep-input`**: Keep the archive after a successful run. **By default** the `--input` path is **removed** after success so only anonymized output remains. **`--check`** with **`--dump-decode`** requires **`--dump-decode-keep-input`** (otherwise the dump would be deleted before config iteration).
|
|
23
|
+
|
|
24
|
+
### Changed
|
|
25
|
+
|
|
26
|
+
- README and mdBook documentation for PostgreSQL archive decoding and Heroku-style examples.
|
|
27
|
+
|
|
10
28
|
## [0.3.0] - 2026-05-02
|
|
11
29
|
|
|
12
30
|
### Added
|
|
@@ -43,5 +61,7 @@ and this project follows [Semantic Versioning](https://semver.org/spec/v2.0.0.ht
|
|
|
43
61
|
- Configurable output scan severities and per-category thresholds via `[output_scan]`.
|
|
44
62
|
- JSON report section for output scan findings including category, count, threshold, severity, and sample locations.
|
|
45
63
|
|
|
64
|
+
[0.4.1]: https://github.com/ababic/dumpling/compare/v0.4.0...v0.4.1
|
|
65
|
+
[0.4.0]: https://github.com/ababic/dumpling/compare/v0.3.0...v0.4.0
|
|
46
66
|
[0.3.0]: https://github.com/ababic/dumpling/compare/v0.2.0...v0.3.0
|
|
47
67
|
[0.2.0]: https://github.com/ababic/dumpling/compare/v0.1.0...v0.2.0
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dumpling-cli
|
|
3
|
-
Version: 0.3.0
|
|
3
|
+
Version: 0.4.1
|
|
4
4
|
Classifier: Development Status :: 4 - Beta
|
|
5
5
|
Classifier: Environment :: Console
|
|
6
6
|
Classifier: Intended Audience :: Developers
|
|
@@ -19,18 +19,48 @@ Keywords: postgres,sqlite,mssql,sql,anonymization,cli,rust
|
|
|
19
19
|
Requires-Python: >=3.8
|
|
20
20
|
Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
|
|
21
21
|
|
|
22
|
-
|
|
22
|
+
<p align="center">
|
|
23
|
+
<img src="assets/logo.svg" width="140" height="140" alt="Dumpling logo: a dumpling with steam" />
|
|
24
|
+
</p>
|
|
23
25
|
|
|
24
|
-
|
|
26
|
+
<h1 align="center">Dumpling</h1>
|
|
27
|
+
|
|
28
|
+
<p align="center">
|
|
29
|
+
<strong>Sanitize SQL dumps before they go anywhere.</strong><br />
|
|
30
|
+
Turn huge <code>pg_dump</code> / SQLite / SQL Server exports into shareable, test-friendly snapshots — no DB connection, no secrets left by accident.
|
|
31
|
+
</p>
|
|
32
|
+
|
|
33
|
+
<p align="center">
|
|
34
|
+
<a href="https://pypi.org/project/dumpling-cli/"><img src="https://img.shields.io/pypi/v/dumpling-cli.svg" alt="PyPI version" /></a>
|
|
35
|
+
<a href="https://pypi.org/project/dumpling-cli/"><img src="https://img.shields.io/pypi/pyversions/dumpling-cli.svg" alt="Python versions" /></a>
|
|
36
|
+
<a href="https://pypi.org/project/dumpling-cli/"><img src="https://img.shields.io/pypi/l/dumpling-cli.svg" alt="PyPI license" /></a>
|
|
37
|
+
<a href="https://github.com/ababic/dumpling/actions/workflows/tests.yml"><img src="https://github.com/ababic/dumpling/actions/workflows/tests.yml/badge.svg" alt="Tests" /></a>
|
|
38
|
+
<a href="https://github.com/ababic/dumpling/actions/workflows/ci.yml"><img src="https://github.com/ababic/dumpling/actions/workflows/ci.yml/badge.svg" alt="Lint" /></a>
|
|
39
|
+
<img src="https://img.shields.io/badge/rust-stable-orange?logo=rust" alt="Rust stable" />
|
|
40
|
+
</p>
|
|
41
|
+
|
|
42
|
+
<p align="center">
|
|
43
|
+
<a href="https://ababic.github.io/dumpling/"><strong>Documentation</strong></a>
|
|
44
|
+
·
|
|
45
|
+
<a href="https://github.com/ababic/dumpling"><strong>GitHub</strong></a>
|
|
46
|
+
</p>
|
|
47
|
+
|
|
48
|
+
<p align="center">
|
|
49
|
+
<sub><em>Disclaimer: This project is entirely vibe-coded, but with strong human guidance, review, and attention to quality and safety.</em></sub>
|
|
50
|
+
</p>
|
|
51
|
+
|
|
52
|
+
---
|
|
53
|
+
|
|
54
|
+
**Dumpling** reads plain-text SQL dumps (PostgreSQL `pg_dump`, SQLite `.dump`, SQL Server / MSSQL scripts) and rewrites sensitive columns using rules you define in TOML. Everything runs offline on files — ideal for CI, staging share-outs, and compliance-minded workflows.
|
|
25
55
|
|
|
26
56
|
## Why Dumpling?
|
|
27
57
|
|
|
28
|
-
- **
|
|
29
|
-
- **
|
|
30
|
-
- **
|
|
31
|
-
- **
|
|
32
|
-
- **
|
|
33
|
-
- **
|
|
58
|
+
- **Offline by design** — works on dump files only; nothing connects to your database.
|
|
59
|
+
- **Streams giant files** — line-by-line processing keeps multi‑GB dumps reasonable on modest hardware.
|
|
60
|
+
- **Fails loud, not silent** — missing config exits non‑zero and lists where Dumpling looked; use `--allow-noop` only when you mean it.
|
|
61
|
+
- **Stable pseudonyms** — optional domain mappings keep the same source value as the same fake value across tables (foreign keys stay consistent).
|
|
62
|
+
- **Pipeline-ready** — `--check`, strict coverage, JSON reports, and residual PII scans fit pre-merge gates and release automation.
|
|
63
|
+
- **Configure once** — `.dumplingconf` or `[tool.dumpling]` in `pyproject.toml`; install via **Rust** (`cargo`) or **`pip install dumpling-cli`**.
|
|
34
64
|
|
|
35
65
|
---
|
|
36
66
|
|
|
@@ -289,7 +319,16 @@ Produced by `pg_dump --format=plain`. Handles:
|
|
|
289
319
|
- `"double-quoted"` identifiers
|
|
290
320
|
- `''`-escaped string literals
|
|
291
321
|
|
|
292
|
-
Binary, custom, and directory formats from `pg_dump` are not
|
|
322
|
+
Binary, custom, and directory formats from `pg_dump` are not parsed directly — Dumpling’s SQL pipeline expects plain text. Use either:
|
|
323
|
+
|
|
324
|
+
- **`pg_dump --format=plain`** when you control capture, or
|
|
325
|
+
- **`dumpling --dump-decode`** with `--input` set to a **custom-format** file (`.dump`) or **directory-format** folder: Dumpling runs `pg_restore -f -` and streams the resulting SQL (same as a manual `pg_restore` “script” output, no database required). Requires PostgreSQL client tools on `PATH` (`pg_restore`), or set `--pg-restore-path`. Use `--dump-decode-arg` once per extra `pg_restore` flag (e.g. `--dump-decode-arg=--no-owner --dump-decode-arg=--no-acl`). **By default** the archive is removed after a fully successful run; pass **`--dump-decode-keep-input`** to retain it. **`--check`** with **`--dump-decode`** requires **`--dump-decode-keep-input`**, so a check run never deletes the archive.
|
|
326
|
+
|
|
327
|
+
Example (e.g. after `heroku pg:backups:download`):
|
|
328
|
+
|
|
329
|
+
```bash
|
|
330
|
+
dumpling --dump-decode -i latest.dump -c .dumplingconf -o anonymized.sql
|
|
331
|
+
```
|
|
293
332
|
|
|
294
333
|
### SQLite (`--format sqlite`)
|
|
295
334
|
|
|
@@ -335,6 +374,22 @@ Supported predicate operators:
|
|
|
335
374
|
|
|
336
375
|
Predicates can target nested JSON values using dot notation (`payload.profile.tier`) or Django-style notation (`payload__profile__tier`). For JSON arrays, path segments are evaluated against each element, so list-of-dicts structures can be matched naturally.
|
|
337
376
|
|
|
377
|
+
### JSON path list targeting
|
|
378
|
+
|
|
379
|
+
JSON list/array traversal is automatic once a path segment resolves to an array.
|
|
380
|
+
|
|
381
|
+
- **All elements in an array**: use the next field name directly.
|
|
382
|
+
- `payload.items.kind` or `payload__items__kind`
|
|
383
|
+
- Matches/rewrites `kind` for every object in `items`.
|
|
384
|
+
- **Specific array index**: use a numeric segment.
|
|
385
|
+
- `payload.items.0.kind` or `payload__items__0__kind`
|
|
386
|
+
- Targets only the first element.
|
|
387
|
+
- **Nested arrays**: combine field and index segments as needed.
|
|
388
|
+
- `payload.groups.members.email`
|
|
389
|
+
- `payload.groups.1.members.0.email`
|
|
390
|
+
|
|
391
|
+
This path behavior is shared by both `row_filters` predicates and JSON-path anonymization rules in `[rules]`.
|
|
392
|
+
|
|
338
393
|
```toml
|
|
339
394
|
[row_filters."public.users"]
|
|
340
395
|
retain = [
|
|
@@ -477,5 +532,5 @@ See the [CI guardrails documentation](docs/src/ci-guardrails.md) for full pipeli
|
|
|
477
532
|
|
|
478
533
|
## Full documentation
|
|
479
534
|
|
|
480
|
-
Detailed docs, including the configuration reference and release process, are available at the project's [GitHub Pages site](https://github.
|
|
535
|
+
Detailed docs, including the configuration reference and release process, are available at the project's [GitHub Pages site](https://ababic.github.io/dumpling/) (built from `docs/src/`).
|
|
481
536
|
|
|
@@ -1,15 +1,45 @@
|
|
|
1
|
-
|
|
1
|
+
<p align="center">
|
|
2
|
+
<img src="assets/logo.svg" width="140" height="140" alt="Dumpling logo: a dumpling with steam" />
|
|
3
|
+
</p>
|
|
4
|
+
|
|
5
|
+
<h1 align="center">Dumpling</h1>
|
|
6
|
+
|
|
7
|
+
<p align="center">
|
|
8
|
+
<strong>Sanitize SQL dumps before they go anywhere.</strong><br />
|
|
9
|
+
Turn huge <code>pg_dump</code> / SQLite / SQL Server exports into shareable, test-friendly snapshots — no DB connection, no secrets left by accident.
|
|
10
|
+
</p>
|
|
11
|
+
|
|
12
|
+
<p align="center">
|
|
13
|
+
<a href="https://pypi.org/project/dumpling-cli/"><img src="https://img.shields.io/pypi/v/dumpling-cli.svg" alt="PyPI version" /></a>
|
|
14
|
+
<a href="https://pypi.org/project/dumpling-cli/"><img src="https://img.shields.io/pypi/pyversions/dumpling-cli.svg" alt="Python versions" /></a>
|
|
15
|
+
<a href="https://pypi.org/project/dumpling-cli/"><img src="https://img.shields.io/pypi/l/dumpling-cli.svg" alt="PyPI license" /></a>
|
|
16
|
+
<a href="https://github.com/ababic/dumpling/actions/workflows/tests.yml"><img src="https://github.com/ababic/dumpling/actions/workflows/tests.yml/badge.svg" alt="Tests" /></a>
|
|
17
|
+
<a href="https://github.com/ababic/dumpling/actions/workflows/ci.yml"><img src="https://github.com/ababic/dumpling/actions/workflows/ci.yml/badge.svg" alt="Lint" /></a>
|
|
18
|
+
<img src="https://img.shields.io/badge/rust-stable-orange?logo=rust" alt="Rust stable" />
|
|
19
|
+
</p>
|
|
20
|
+
|
|
21
|
+
<p align="center">
|
|
22
|
+
<a href="https://ababic.github.io/dumpling/"><strong>Documentation</strong></a>
|
|
23
|
+
·
|
|
24
|
+
<a href="https://github.com/ababic/dumpling"><strong>GitHub</strong></a>
|
|
25
|
+
</p>
|
|
26
|
+
|
|
27
|
+
<p align="center">
|
|
28
|
+
<sub><em>Disclaimer: This project is entirely vibe-coded, but with strong human guidance, review, and attention to quality and safety.</em></sub>
|
|
29
|
+
</p>
|
|
2
30
|
|
|
3
|
-
|
|
31
|
+
---
|
|
32
|
+
|
|
33
|
+
**Dumpling** reads plain-text SQL dumps (PostgreSQL `pg_dump`, SQLite `.dump`, SQL Server / MSSQL scripts) and rewrites sensitive columns using rules you define in TOML. Everything runs offline on files — ideal for CI, staging share-outs, and compliance-minded workflows.
|
|
4
34
|
|
|
5
35
|
## Why Dumpling?
|
|
6
36
|
|
|
7
|
-
- **
|
|
8
|
-
- **
|
|
9
|
-
- **
|
|
10
|
-
- **
|
|
11
|
-
- **
|
|
12
|
-
- **
|
|
37
|
+
- **Offline by design** — works on dump files only; nothing connects to your database.
|
|
38
|
+
- **Streams giant files** — line-by-line processing keeps multi‑GB dumps reasonable on modest hardware.
|
|
39
|
+
- **Fails loud, not silent** — missing config exits non‑zero and lists where Dumpling looked; use `--allow-noop` only when you mean it.
|
|
40
|
+
- **Stable pseudonyms** — optional domain mappings keep the same source value as the same fake value across tables (foreign keys stay consistent).
|
|
41
|
+
- **Pipeline-ready** — `--check`, strict coverage, JSON reports, and residual PII scans fit pre-merge gates and release automation.
|
|
42
|
+
- **Configure once** — `.dumplingconf` or `[tool.dumpling]` in `pyproject.toml`; install via **Rust** (`cargo`) or **`pip install dumpling-cli`**.
|
|
13
43
|
|
|
14
44
|
---
|
|
15
45
|
|
|
@@ -268,7 +298,16 @@ Produced by `pg_dump --format=plain`. Handles:
|
|
|
268
298
|
- `"double-quoted"` identifiers
|
|
269
299
|
- `''`-escaped string literals
|
|
270
300
|
|
|
271
|
-
Binary, custom, and directory formats from `pg_dump` are not
|
|
301
|
+
Binary, custom, and directory formats from `pg_dump` are not parsed directly — Dumpling’s SQL pipeline expects plain text. Use either:
|
|
302
|
+
|
|
303
|
+
- **`pg_dump --format=plain`** when you control capture, or
|
|
304
|
+
- **`dumpling --dump-decode`** with `--input` set to a **custom-format** file (`.dump`) or **directory-format** folder: Dumpling runs `pg_restore -f -` and streams the resulting SQL (same as a manual `pg_restore` “script” output, no database required). Requires PostgreSQL client tools on `PATH` (`pg_restore`), or set `--pg-restore-path`. Use `--dump-decode-arg` once per extra `pg_restore` flag (e.g. `--dump-decode-arg=--no-owner --dump-decode-arg=--no-acl`). **By default** the archive is removed after a fully successful run; pass **`--dump-decode-keep-input`** to retain it. **`--check`** with **`--dump-decode`** requires **`--dump-decode-keep-input`**, so a check run never deletes the archive.
|
|
305
|
+
|
|
306
|
+
Example (e.g. after `heroku pg:backups:download`):
|
|
307
|
+
|
|
308
|
+
```bash
|
|
309
|
+
dumpling --dump-decode -i latest.dump -c .dumplingconf -o anonymized.sql
|
|
310
|
+
```
|
|
272
311
|
|
|
273
312
|
### SQLite (`--format sqlite`)
|
|
274
313
|
|
|
@@ -314,6 +353,22 @@ Supported predicate operators:
|
|
|
314
353
|
|
|
315
354
|
Predicates can target nested JSON values using dot notation (`payload.profile.tier`) or Django-style notation (`payload__profile__tier`). For JSON arrays, path segments are evaluated against each element, so list-of-dicts structures can be matched naturally.
|
|
316
355
|
|
|
356
|
+
### JSON path list targeting
|
|
357
|
+
|
|
358
|
+
JSON list/array traversal is automatic once a path segment resolves to an array.
|
|
359
|
+
|
|
360
|
+
- **All elements in an array**: use the next field name directly.
|
|
361
|
+
- `payload.items.kind` or `payload__items__kind`
|
|
362
|
+
- Matches/rewrites `kind` for every object in `items`.
|
|
363
|
+
- **Specific array index**: use a numeric segment.
|
|
364
|
+
- `payload.items.0.kind` or `payload__items__0__kind`
|
|
365
|
+
- Targets only the first element.
|
|
366
|
+
- **Nested arrays**: combine field and index segments as needed.
|
|
367
|
+
- `payload.groups.members.email`
|
|
368
|
+
- `payload.groups.1.members.0.email`
|
|
369
|
+
|
|
370
|
+
This path behavior is shared by both `row_filters` predicates and JSON-path anonymization rules in `[rules]`.
|
|
371
|
+
|
|
317
372
|
```toml
|
|
318
373
|
[row_filters."public.users"]
|
|
319
374
|
retain = [
|
|
@@ -456,4 +511,4 @@ See the [CI guardrails documentation](docs/src/ci-guardrails.md) for full pipeli
|
|
|
456
511
|
|
|
457
512
|
## Full documentation
|
|
458
513
|
|
|
459
|
-
Detailed docs, including the configuration reference and release process, are available at the project's [GitHub Pages site](https://github.
|
|
514
|
+
Detailed docs, including the configuration reference and release process, are available at the project's [GitHub Pages site](https://ababic.github.io/dumpling/) (built from `docs/src/`).
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 128 128" role="img" aria-label="Dumpling logo">
|
|
2
|
+
<defs>
|
|
3
|
+
<linearGradient id="steam" x1="0%" y1="100%" x2="0%" y2="0%">
|
|
4
|
+
<stop offset="0%" stop-color="#e8f4fc" stop-opacity="0"/>
|
|
5
|
+
<stop offset="50%" stop-color="#cfe9fb" stop-opacity="0.85"/>
|
|
6
|
+
<stop offset="100%" stop-color="#b8dff8" stop-opacity="0"/>
|
|
7
|
+
</linearGradient>
|
|
8
|
+
<linearGradient id="dough" x1="0%" y1="0%" x2="100%" y2="100%">
|
|
9
|
+
<stop offset="0%" stop-color="#fff8ef"/>
|
|
10
|
+
<stop offset="45%" stop-color="#f4dcc4"/>
|
|
11
|
+
<stop offset="100%" stop-color="#e8b896"/>
|
|
12
|
+
</linearGradient>
|
|
13
|
+
<linearGradient id="shadow" x1="0%" y1="0%" x2="0%" y2="100%">
|
|
14
|
+
<stop offset="0%" stop-color="#c4865a" stop-opacity="0.35"/>
|
|
15
|
+
<stop offset="100%" stop-color="#8b5a3c" stop-opacity="0.15"/>
|
|
16
|
+
</linearGradient>
|
|
17
|
+
</defs>
|
|
18
|
+
<!-- Steam -->
|
|
19
|
+
<path fill="url(#steam)" d="M44 18c2-6 8-10 14-8s8 10 4 15c-3 4-2 9 2 12 5 4 5 12 0 16-6 5-16 4-20-3-2-4 0-9 4-11 3-2 3-6 0-9-4-4-4-10 0-12z"/>
|
|
20
|
+
<path fill="url(#steam)" opacity="0.75" d="M64 14c3-5 9-7 14-4 5 3 6 10 2 14-4 4-3 10 2 13 6 4 7 13 1 18-7 6-19 4-24-5-2-5 1-11 6-13 4-2 4-7 1-10-5-4-5-11 0-13z"/>
|
|
21
|
+
<path fill="url(#steam)" opacity="0.6" d="M82 20c2-5 8-8 13-5 5 3 7 10 3 15-3 4-2 9 3 12 5 3 7 11 2 16-6 6-17 5-22-3-2-4 0-9 5-11 3-2 4-6 1-9-4-4-4-10 0-13z"/>
|
|
22
|
+
<!-- Plate -->
|
|
23
|
+
<ellipse cx="64" cy="108" rx="52" ry="10" fill="#dfe8ef"/>
|
|
24
|
+
<ellipse cx="64" cy="106" rx="48" ry="8" fill="#eef4f8"/>
|
|
25
|
+
<!-- Dumpling body -->
|
|
26
|
+
<ellipse cx="64" cy="82" rx="42" ry="28" fill="url(#dough)" stroke="#d4a574" stroke-width="2"/>
|
|
27
|
+
<ellipse cx="64" cy="96" rx="38" ry="12" fill="url(#shadow)"/>
|
|
28
|
+
<!-- Pleats -->
|
|
29
|
+
<path fill="none" stroke="#c9956a" stroke-width="1.8" stroke-linecap="round" d="M34 58c6 10 14 16 30 16s24-6 30-16"/>
|
|
30
|
+
<path fill="none" stroke="#d9b08a" stroke-width="1.2" stroke-linecap="round" opacity="0.9" d="M42 54c5 8 13 13 22 13s17-5 22-13"/>
|
|
31
|
+
<!-- Highlight -->
|
|
32
|
+
<ellipse cx="48" cy="76" rx="10" ry="6" fill="#ffffff" opacity="0.35"/>
|
|
33
|
+
</svg>
|
|
@@ -6,7 +6,7 @@ Use `--format` to declare the SQL dialect of your input file:
|
|
|
6
6
|
|
|
7
7
|
| Value | Description |
|
|
8
8
|
|---|---|
|
|
9
|
-
| `postgres` (default) | PostgreSQL `pg_dump` plain-text format. Supports `COPY … FROM stdin` blocks, `"double-quoted"` identifiers, `''`-escaped strings. |
|
|
9
|
+
| `postgres` (default) | PostgreSQL `pg_dump` plain-text format. Supports `COPY … FROM stdin` blocks, `"double-quoted"` identifiers, `''`-escaped strings. Custom-format (`-Fc`) or directory dumps can be decoded on the fly with `dumpling --dump-decode` (wraps `pg_restore -f -`; requires client tools). By default the archive is deleted after success; use `--dump-decode-keep-input` to retain it. |
|
|
10
10
|
| `sqlite` | SQLite `.dump` format. Adds `INSERT OR REPLACE INTO` / `INSERT OR IGNORE INTO` support. No COPY blocks. |
|
|
11
11
|
| `mssql` | SQL Server / MSSQL plain SQL. Adds `[bracket]` identifier quoting, `N'…'` Unicode string literals, and `nvarchar(n)` / `nchar(n)` length extraction. No COPY blocks. |
|
|
12
12
|
|
|
@@ -17,6 +17,28 @@ dumpling --format sqlite -i data.db.sql -o anonymized.sql
|
|
|
17
17
|
dumpling --format mssql -i backup.sql -o anonymized.sql
|
|
18
18
|
```
|
|
19
19
|
|
|
20
|
+
### PostgreSQL custom-format archives (`--dump-decode`)
|
|
21
|
+
|
|
22
|
+
Heroku PGBackups and many pipelines ship **`pg_dump` custom format** (`-Fc`) or **directory-format** dumps to save bandwidth. Dumpling’s SQL engine still expects **plain text**; use **`--dump-decode`** so Dumpling runs **`pg_restore -f -`** (script to stdout, no database) and pipes the result through the same anonymizer as a normal plain-SQL file.
|
|
23
|
+
|
|
24
|
+
**Requirements:** PostgreSQL client tools on `PATH` (`pg_restore`), or set **`--pg-restore-path`**. Use **`--dump-decode-arg`** (repeatable) for extra `pg_restore` flags, e.g. `--dump-decode-arg=--no-owner --dump-decode-arg=--no-acl`.
|
|
25
|
+
|
|
26
|
+
**Input deletion:** After a **fully successful** run, Dumpling **removes** the `--input` path (single file or directory-format folder) by default so only the anonymized output remains. Pass **`--dump-decode-keep-input`** to retain the archive.
|
|
27
|
+
|
|
28
|
+
**Check mode:** **`--check`** with **`--dump-decode`** requires **`--dump-decode-keep-input`**. Otherwise the default would delete the dump before you can iterate on config.
|
|
29
|
+
|
|
30
|
+
Example (e.g. after `heroku pg:backups:download`):
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
dumpling --dump-decode -i latest.dump -c .dumplingconf -o anonymized.sql
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
Dry run while keeping the downloaded file:
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
dumpling --dump-decode --dump-decode-keep-input --check -i latest.dump -c .dumplingconf
|
|
40
|
+
```
|
|
41
|
+
|
|
20
42
|
---
|
|
21
43
|
|
|
22
44
|
## Configuration sources
|
|
@@ -28,4 +28,6 @@ cargo test --all-targets --all-features
|
|
|
28
28
|
dumpling -i dump.sql -o sanitized.sql
|
|
29
29
|
```
|
|
30
30
|
|
|
31
|
+
If your input is a PostgreSQL **custom-format** file (not plain SQL), decode and anonymize in one step with **`--dump-decode`** (needs `pg_restore` from PostgreSQL client tools). See [PostgreSQL custom-format archives](configuration.md#postgresql-custom-format-archives---dump-decode) in the configuration guide.
|
|
32
|
+
|
|
31
33
|
For full command examples and strategy options, see the repository `README.md`.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# Dumpling documentation
|
|
2
2
|
|
|
3
|
-
Dumpling is a streaming anonymizer for plain SQL dumps. It supports PostgreSQL (`pg_dump` plain format), SQLite (`.dump`), and SQL Server / MSSQL (SSMS / mssql-scripter plain SQL output).
|
|
3
|
+
Dumpling is a streaming anonymizer for plain SQL dumps. It supports PostgreSQL (`pg_dump` plain format), SQLite (`.dump`), and SQL Server / MSSQL (SSMS / mssql-scripter plain SQL output). For PostgreSQL **custom-format** archives (e.g. Heroku `pg:backups:download`), use **`--dump-decode`** so Dumpling invokes `pg_restore` and streams plain SQL—see [PostgreSQL custom-format archives](configuration.md#postgresql-custom-format-archives---dump-decode) in the configuration guide.
|
|
4
4
|
|
|
5
5
|
This documentation covers the operating model for day-to-day use:
|
|
6
6
|
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
use std::fs::File;
|
|
2
2
|
use std::io::{self, BufRead, BufReader, BufWriter, Write};
|
|
3
3
|
use std::path::{Path, PathBuf};
|
|
4
|
+
use std::process::{Command, Stdio};
|
|
4
5
|
|
|
5
6
|
use clap::{ArgAction, Parser, Subcommand};
|
|
6
7
|
|
|
@@ -13,6 +14,7 @@ mod settings;
|
|
|
13
14
|
mod sql;
|
|
14
15
|
mod transform;
|
|
15
16
|
|
|
17
|
+
use anyhow::Context;
|
|
16
18
|
use regex::Regex;
|
|
17
19
|
use report::Reporter;
|
|
18
20
|
use scan::{OutputScanner, ScanningWriter};
|
|
@@ -105,6 +107,26 @@ struct Cli {
|
|
|
105
107
|
#[arg(long = "security-profile", default_value = "standard")]
|
|
106
108
|
security_profile: String,
|
|
107
109
|
|
|
110
|
+
/// Decode PostgreSQL custom-format or directory-format dumps via `pg_restore -f -` before anonymizing.
|
|
111
|
+
/// Requires `--input` pointing at the archive file or directory and `--format postgres`. Requires a
|
|
112
|
+
/// PostgreSQL client install (`pg_restore` on PATH unless overridden by `--pg-restore-path`).
|
|
113
|
+
#[arg(long = "dump-decode", action = ArgAction::SetTrue)]
|
|
114
|
+
dump_decode: bool,
|
|
115
|
+
|
|
116
|
+
/// Keep the input archive after `--dump-decode` (default: delete file or directory after a fully
|
|
117
|
+
/// successful run). Required together with `--check`, which refuses to run if the archive would be deleted.
|
|
118
|
+
#[arg(long = "dump-decode-keep-input", action = ArgAction::SetTrue)]
|
|
119
|
+
dump_decode_keep_input: bool,
|
|
120
|
+
|
|
121
|
+
/// `pg_restore` executable to use with `--dump-decode` (default: `pg_restore` on PATH).
|
|
122
|
+
#[arg(long = "pg-restore-path", default_value = "pg_restore")]
|
|
123
|
+
pg_restore_path: PathBuf,
|
|
124
|
+
|
|
125
|
+
/// Extra arguments forwarded to `pg_restore` before the archive path (repeatable). Example:
|
|
126
|
+
/// `--dump-decode-arg=--no-owner` `--dump-decode-arg=--no-acl`
|
|
127
|
+
#[arg(long = "dump-decode-arg")]
|
|
128
|
+
dump_decode_arg: Vec<String>,
|
|
129
|
+
|
|
108
130
|
#[command(subcommand)]
|
|
109
131
|
command: Option<Commands>,
|
|
110
132
|
}
|
|
@@ -184,6 +206,14 @@ fn run_anonymize(cli: Cli) -> anyhow::Result<()> {
|
|
|
184
206
|
if cli.check && (cli.in_place || cli.output.is_some()) {
|
|
185
207
|
anyhow::bail!("--check cannot be used together with --output or --in-place");
|
|
186
208
|
}
|
|
209
|
+
if cli.dump_decode && !cli.dump_decode_keep_input && cli.check {
|
|
210
|
+
anyhow::bail!(
|
|
211
|
+
"--dump-decode removes the input archive on success by default; use --dump-decode-keep-input with --check"
|
|
212
|
+
);
|
|
213
|
+
}
|
|
214
|
+
if cli.dump_decode && cli.in_place {
|
|
215
|
+
anyhow::bail!("--dump-decode cannot be used with --in-place");
|
|
216
|
+
}
|
|
187
217
|
|
|
188
218
|
// Resolve config from provided path or discover in CWD
|
|
189
219
|
let resolved_config: ResolvedConfig =
|
|
@@ -247,36 +277,97 @@ fn run_anonymize(cli: Cli) -> anyhow::Result<()> {
|
|
|
247
277
|
other
|
|
248
278
|
),
|
|
249
279
|
};
|
|
280
|
+
if cli.dump_decode && dump_format != DumpFormat::Postgres {
|
|
281
|
+
anyhow::bail!(
|
|
282
|
+
"--dump-decode only applies to PostgreSQL dumps; use --format postgres (default)"
|
|
283
|
+
);
|
|
284
|
+
}
|
|
250
285
|
|
|
251
286
|
// Compile table include/exclude regex patterns
|
|
252
287
|
let include_res = compile_patterns(&cli.include_table)?;
|
|
253
288
|
let exclude_res = compile_patterns(&cli.exclude_table)?;
|
|
254
289
|
|
|
255
|
-
// Determine IO
|
|
256
|
-
let
|
|
290
|
+
// Determine IO (optional pg_restore child when --dump-decode)
|
|
291
|
+
let mut pg_restore_child: Option<std::process::Child> = None;
|
|
292
|
+
let (mut reader, input_path_for_inplace): (Box<dyn BufRead>, Option<PathBuf>) = if cli
|
|
293
|
+
.dump_decode
|
|
257
294
|
{
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
)
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
295
|
+
let archive_path = cli
|
|
296
|
+
.input
|
|
297
|
+
.as_ref()
|
|
298
|
+
.ok_or_else(|| {
|
|
299
|
+
anyhow::anyhow!(
|
|
300
|
+
"--dump-decode requires --input pointing at a pg_dump custom-format file or directory-format directory"
|
|
301
|
+
)
|
|
302
|
+
})?;
|
|
303
|
+
if !cli.allow_ext.is_empty() && !has_allowed_extension(archive_path, &cli.allow_ext) {
|
|
304
|
+
let actual = archive_path
|
|
305
|
+
.extension()
|
|
306
|
+
.and_then(|s| s.to_str())
|
|
307
|
+
.unwrap_or("<none>")
|
|
308
|
+
.to_string();
|
|
309
|
+
anyhow::bail!(
|
|
310
|
+
"input file extension '{}' is not in allowed set {:?}",
|
|
311
|
+
actual,
|
|
312
|
+
cli.allow_ext
|
|
313
|
+
);
|
|
314
|
+
}
|
|
315
|
+
if !archive_path.exists() {
|
|
316
|
+
anyhow::bail!(
|
|
317
|
+
"--dump-decode input path does not exist: {}",
|
|
318
|
+
archive_path.display()
|
|
319
|
+
);
|
|
320
|
+
}
|
|
321
|
+
eprintln!(
|
|
322
|
+
"dumpling: decoding PostgreSQL archive via {} -f - {}",
|
|
323
|
+
cli.pg_restore_path.display(),
|
|
324
|
+
archive_path.display()
|
|
325
|
+
);
|
|
326
|
+
let mut cmd = Command::new(&cli.pg_restore_path);
|
|
327
|
+
for a in &cli.dump_decode_arg {
|
|
328
|
+
cmd.arg(a);
|
|
274
329
|
}
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
330
|
+
cmd.arg("-f")
|
|
331
|
+
.arg("-")
|
|
332
|
+
.arg(archive_path)
|
|
333
|
+
.stdout(Stdio::piped())
|
|
334
|
+
.stderr(Stdio::inherit());
|
|
335
|
+
let mut child = cmd.spawn().with_context(|| {
|
|
336
|
+
format!(
|
|
337
|
+
"failed to spawn `{}`; install PostgreSQL client tools or set --pg-restore-path",
|
|
338
|
+
cli.pg_restore_path.display()
|
|
339
|
+
)
|
|
340
|
+
})?;
|
|
341
|
+
let stdout = child
|
|
342
|
+
.stdout
|
|
343
|
+
.take()
|
|
344
|
+
.ok_or_else(|| anyhow::anyhow!("pg_restore stdout missing"))?;
|
|
345
|
+
pg_restore_child = Some(child);
|
|
346
|
+
(Box::new(BufReader::new(stdout)), Some(archive_path.clone()))
|
|
347
|
+
} else {
|
|
348
|
+
match &cli.input {
|
|
349
|
+
Some(path) => {
|
|
350
|
+
if !cli.allow_ext.is_empty() && !has_allowed_extension(path, &cli.allow_ext) {
|
|
351
|
+
let actual = path
|
|
352
|
+
.extension()
|
|
353
|
+
.and_then(|s| s.to_str())
|
|
354
|
+
.unwrap_or("<none>")
|
|
355
|
+
.to_string();
|
|
356
|
+
anyhow::bail!(
|
|
357
|
+
"input file extension '{}' is not in allowed set {:?}",
|
|
358
|
+
actual,
|
|
359
|
+
cli.allow_ext
|
|
360
|
+
);
|
|
361
|
+
}
|
|
362
|
+
let f = File::open(path)?;
|
|
363
|
+
(Box::new(BufReader::new(f)), Some(path.clone()))
|
|
364
|
+
}
|
|
365
|
+
None => {
|
|
366
|
+
if !cli.allow_ext.is_empty() {
|
|
367
|
+
eprintln!("dumpling: --allow-ext provided but no --input file; extension check is ignored for stdin");
|
|
368
|
+
}
|
|
369
|
+
(Box::new(BufReader::new(io::stdin())), None)
|
|
278
370
|
}
|
|
279
|
-
(Box::new(BufReader::new(io::stdin())), None)
|
|
280
371
|
}
|
|
281
372
|
};
|
|
282
373
|
|
|
@@ -330,12 +421,30 @@ fn run_anonymize(cli: Cli) -> anyhow::Result<()> {
|
|
|
330
421
|
dump_format,
|
|
331
422
|
);
|
|
332
423
|
let mut writer = output;
|
|
333
|
-
if let Some(scanner) = output_scanner.as_mut() {
|
|
424
|
+
let proc_res = if let Some(scanner) = output_scanner.as_mut() {
|
|
334
425
|
let mut scanning_writer = ScanningWriter::new(&mut writer, scanner);
|
|
335
|
-
processor.process(&mut reader, &mut scanning_writer)
|
|
426
|
+
processor.process(&mut reader, &mut scanning_writer)
|
|
336
427
|
} else {
|
|
337
|
-
processor.process(&mut reader, &mut writer)
|
|
428
|
+
processor.process(&mut reader, &mut writer)
|
|
429
|
+
};
|
|
430
|
+
|
|
431
|
+
if let Some(mut child) = pg_restore_child {
|
|
432
|
+
if proc_res.is_err() {
|
|
433
|
+
let _ = child.kill();
|
|
434
|
+
}
|
|
435
|
+
let status = child
|
|
436
|
+
.wait()
|
|
437
|
+
.with_context(|| format!("waiting for `{}`", cli.pg_restore_path.display()))?;
|
|
438
|
+
if proc_res.is_ok() && !status.success() {
|
|
439
|
+
anyhow::bail!(
|
|
440
|
+
"`{}` exited with status {}",
|
|
441
|
+
cli.pg_restore_path.display(),
|
|
442
|
+
status
|
|
443
|
+
);
|
|
444
|
+
}
|
|
338
445
|
}
|
|
446
|
+
|
|
447
|
+
proc_res?;
|
|
339
448
|
let coverage = processor.sensitive_coverage_summary();
|
|
340
449
|
reporter.report.sensitive_columns_detected = coverage.detected.clone();
|
|
341
450
|
reporter.report.sensitive_columns_covered = coverage.covered.clone();
|
|
@@ -363,7 +472,10 @@ fn run_anonymize(cli: Cli) -> anyhow::Result<()> {
|
|
|
363
472
|
|
|
364
473
|
// If in-place, do the swap now
|
|
365
474
|
if cli.in_place {
|
|
366
|
-
let input_path = input_path_for_inplace
|
|
475
|
+
let input_path = input_path_for_inplace
|
|
476
|
+
.as_ref()
|
|
477
|
+
.ok_or_else(|| anyhow::anyhow!("--in-place requires an --input path"))?
|
|
478
|
+
.clone();
|
|
367
479
|
let mut tmp = input_path.clone();
|
|
368
480
|
tmp.set_extension("sql.dumpling.tmp");
|
|
369
481
|
writer.flush()?;
|
|
@@ -405,9 +517,30 @@ fn run_anonymize(cli: Cli) -> anyhow::Result<()> {
|
|
|
405
517
|
std::process::exit(1);
|
|
406
518
|
}
|
|
407
519
|
|
|
520
|
+
if cli.dump_decode && !cli.dump_decode_keep_input {
|
|
521
|
+
if let Some(ref p) = input_path_for_inplace {
|
|
522
|
+
match remove_pg_archive(p) {
|
|
523
|
+
Ok(()) => eprintln!("dumpling: removed input archive {}", p.display()),
|
|
524
|
+
Err(e) => eprintln!(
|
|
525
|
+
"dumpling: warning: could not remove input archive {}: {}",
|
|
526
|
+
p.display(),
|
|
527
|
+
e
|
|
528
|
+
),
|
|
529
|
+
}
|
|
530
|
+
}
|
|
531
|
+
}
|
|
532
|
+
|
|
408
533
|
Ok(())
|
|
409
534
|
}
|
|
410
535
|
|
|
536
|
+
fn remove_pg_archive(path: &Path) -> std::io::Result<()> {
|
|
537
|
+
if path.is_dir() {
|
|
538
|
+
std::fs::remove_dir_all(path)
|
|
539
|
+
} else {
|
|
540
|
+
std::fs::remove_file(path)
|
|
541
|
+
}
|
|
542
|
+
}
|
|
543
|
+
|
|
411
544
|
fn compile_patterns(patterns: &[String]) -> anyhow::Result<Vec<Regex>> {
|
|
412
545
|
let mut out = Vec::new();
|
|
413
546
|
for p in patterns {
|
|
@@ -494,6 +627,24 @@ mod tests_main {
|
|
|
494
627
|
}
|
|
495
628
|
}
|
|
496
629
|
|
|
630
|
+
#[test]
|
|
631
|
+
fn test_dump_decode_flags_parse() {
|
|
632
|
+
let cli = Cli::parse_from([
|
|
633
|
+
"dumpling",
|
|
634
|
+
"--dump-decode",
|
|
635
|
+
"--dump-decode-keep-input",
|
|
636
|
+
"--pg-restore-path",
|
|
637
|
+
"/usr/bin/pg_restore",
|
|
638
|
+
"--dump-decode-arg=--no-owner",
|
|
639
|
+
"-i",
|
|
640
|
+
"/tmp/latest.dump",
|
|
641
|
+
]);
|
|
642
|
+
assert!(cli.dump_decode);
|
|
643
|
+
assert!(cli.dump_decode_keep_input);
|
|
644
|
+
assert_eq!(cli.pg_restore_path, PathBuf::from("/usr/bin/pg_restore"));
|
|
645
|
+
assert_eq!(cli.dump_decode_arg, vec!["--no-owner"]);
|
|
646
|
+
}
|
|
647
|
+
|
|
497
648
|
#[test]
|
|
498
649
|
fn test_lint_policy_allow_noop_flag() {
|
|
499
650
|
let cli = Cli::parse_from(["dumpling", "lint-policy", "--allow-noop"]);
|
|
@@ -1155,20 +1155,22 @@ struct Cell {
|
|
|
1155
1155
|
original: Option<String>, // None for NULL
|
|
1156
1156
|
was_quoted: bool,
|
|
1157
1157
|
was_default: bool,
|
|
1158
|
+
trailing_expr: Option<String>,
|
|
1158
1159
|
}
|
|
1159
1160
|
|
|
1160
1161
|
impl Cell {
|
|
1161
1162
|
fn render_original(&self) -> String {
|
|
1163
|
+
let trailing = self.trailing_expr.as_deref().unwrap_or("");
|
|
1162
1164
|
if self.was_default {
|
|
1163
|
-
return "DEFAULT"
|
|
1165
|
+
return format!("DEFAULT{trailing}");
|
|
1164
1166
|
}
|
|
1165
1167
|
match &self.original {
|
|
1166
|
-
None => "NULL"
|
|
1168
|
+
None => format!("NULL{trailing}"),
|
|
1167
1169
|
Some(s) => {
|
|
1168
1170
|
if self.was_quoted {
|
|
1169
|
-
format!("'{}'", s.replace('\'', "''"))
|
|
1171
|
+
format!("'{}'{trailing}", s.replace('\'', "''"))
|
|
1170
1172
|
} else {
|
|
1171
|
-
s
|
|
1173
|
+
format!("{s}{trailing}")
|
|
1172
1174
|
}
|
|
1173
1175
|
}
|
|
1174
1176
|
}
|
|
@@ -1176,14 +1178,15 @@ impl Cell {
|
|
|
1176
1178
|
}
|
|
1177
1179
|
|
|
1178
1180
|
fn render_cell(repl: &Replacement, original: &Cell) -> String {
|
|
1181
|
+
let trailing = original.trailing_expr.as_deref().unwrap_or("");
|
|
1179
1182
|
if repl.is_null {
|
|
1180
|
-
return "NULL"
|
|
1183
|
+
return format!("NULL{trailing}");
|
|
1181
1184
|
}
|
|
1182
1185
|
let should_quote = repl.force_quoted || original.was_quoted;
|
|
1183
1186
|
if should_quote {
|
|
1184
|
-
format!("'{}'", repl.value.replace('\'', "''"))
|
|
1187
|
+
format!("'{}'{trailing}", repl.value.replace('\'', "''"))
|
|
1185
1188
|
} else {
|
|
1186
|
-
repl.value
|
|
1189
|
+
format!("{}{trailing}", repl.value)
|
|
1187
1190
|
}
|
|
1188
1191
|
}
|
|
1189
1192
|
|
|
@@ -1243,7 +1246,9 @@ fn parse_parenthesized_values(s: &str) -> anyhow::Result<(Vec<Cell>, usize)> {
|
|
|
1243
1246
|
let mut cells: Vec<Cell> = Vec::new();
|
|
1244
1247
|
let mut in_single = false;
|
|
1245
1248
|
let mut buf = String::new();
|
|
1249
|
+
let mut trailing_expr = String::new();
|
|
1246
1250
|
let mut was_quoted = false;
|
|
1251
|
+
let mut closed_quoted_literal = false;
|
|
1247
1252
|
while i < chs.len() {
|
|
1248
1253
|
let c = chs[i];
|
|
1249
1254
|
if in_single {
|
|
@@ -1255,6 +1260,7 @@ fn parse_parenthesized_values(s: &str) -> anyhow::Result<(Vec<Cell>, usize)> {
|
|
|
1255
1260
|
continue;
|
|
1256
1261
|
} else {
|
|
1257
1262
|
in_single = false;
|
|
1263
|
+
closed_quoted_literal = true;
|
|
1258
1264
|
i += 1;
|
|
1259
1265
|
continue;
|
|
1260
1266
|
}
|
|
@@ -1282,17 +1288,19 @@ fn parse_parenthesized_values(s: &str) -> anyhow::Result<(Vec<Cell>, usize)> {
|
|
|
1282
1288
|
}
|
|
1283
1289
|
')' => {
|
|
1284
1290
|
// end cell, end row
|
|
1285
|
-
let cell = finalize_cell(&buf, was_quoted);
|
|
1291
|
+
let cell = finalize_cell(&buf, was_quoted, &trailing_expr);
|
|
1286
1292
|
cells.push(cell);
|
|
1287
1293
|
i += 1;
|
|
1288
1294
|
return Ok((cells, i));
|
|
1289
1295
|
}
|
|
1290
1296
|
',' => {
|
|
1291
1297
|
// end cell
|
|
1292
|
-
let cell = finalize_cell(&buf, was_quoted);
|
|
1298
|
+
let cell = finalize_cell(&buf, was_quoted, &trailing_expr);
|
|
1293
1299
|
cells.push(cell);
|
|
1294
1300
|
buf.clear();
|
|
1301
|
+
trailing_expr.clear();
|
|
1295
1302
|
was_quoted = false;
|
|
1303
|
+
closed_quoted_literal = false;
|
|
1296
1304
|
i += 1;
|
|
1297
1305
|
// consume following spaces
|
|
1298
1306
|
while i < chs.len() && chs[i].is_whitespace() {
|
|
@@ -1300,11 +1308,19 @@ fn parse_parenthesized_values(s: &str) -> anyhow::Result<(Vec<Cell>, usize)> {
|
|
|
1300
1308
|
}
|
|
1301
1309
|
}
|
|
1302
1310
|
c if c.is_whitespace() => {
|
|
1303
|
-
//
|
|
1311
|
+
// Preserve whitespace after a quoted literal so explicit SQL casts stay intact.
|
|
1312
|
+
if was_quoted && closed_quoted_literal {
|
|
1313
|
+
trailing_expr.push(c);
|
|
1314
|
+
}
|
|
1315
|
+
// Skip insignificant whitespace between tokens when unquoted.
|
|
1304
1316
|
i += 1;
|
|
1305
1317
|
}
|
|
1306
1318
|
other => {
|
|
1307
|
-
|
|
1319
|
+
if was_quoted && closed_quoted_literal {
|
|
1320
|
+
trailing_expr.push(other);
|
|
1321
|
+
} else {
|
|
1322
|
+
buf.push(other);
|
|
1323
|
+
}
|
|
1308
1324
|
i += 1;
|
|
1309
1325
|
}
|
|
1310
1326
|
}
|
|
@@ -1313,12 +1329,21 @@ fn parse_parenthesized_values(s: &str) -> anyhow::Result<(Vec<Cell>, usize)> {
|
|
|
1313
1329
|
anyhow::bail!("unterminated values row")
|
|
1314
1330
|
}
|
|
1315
1331
|
|
|
1316
|
-
fn finalize_cell(buf: &str, was_quoted: bool) -> Cell {
|
|
1332
|
+
fn finalize_cell(buf: &str, was_quoted: bool, trailing_expr: &str) -> Cell {
|
|
1333
|
+
let trailing = {
|
|
1334
|
+
let t = trailing_expr.trim();
|
|
1335
|
+
if t.is_empty() {
|
|
1336
|
+
None
|
|
1337
|
+
} else {
|
|
1338
|
+
Some(t.to_string())
|
|
1339
|
+
}
|
|
1340
|
+
};
|
|
1317
1341
|
if was_quoted {
|
|
1318
1342
|
Cell {
|
|
1319
1343
|
original: Some(buf.to_string()),
|
|
1320
1344
|
was_quoted: true,
|
|
1321
1345
|
was_default: false,
|
|
1346
|
+
trailing_expr: trailing,
|
|
1322
1347
|
}
|
|
1323
1348
|
} else {
|
|
1324
1349
|
let t = buf.trim();
|
|
@@ -1327,18 +1352,21 @@ fn finalize_cell(buf: &str, was_quoted: bool) -> Cell {
|
|
|
1327
1352
|
original: None,
|
|
1328
1353
|
was_quoted: false,
|
|
1329
1354
|
was_default: false,
|
|
1355
|
+
trailing_expr: None,
|
|
1330
1356
|
}
|
|
1331
1357
|
} else if t.eq_ignore_ascii_case("default") {
|
|
1332
1358
|
Cell {
|
|
1333
1359
|
original: None,
|
|
1334
1360
|
was_quoted: false,
|
|
1335
1361
|
was_default: true,
|
|
1362
|
+
trailing_expr: None,
|
|
1336
1363
|
}
|
|
1337
1364
|
} else {
|
|
1338
1365
|
Cell {
|
|
1339
1366
|
original: Some(t.to_string()),
|
|
1340
1367
|
was_quoted: false,
|
|
1341
1368
|
was_default: false,
|
|
1369
|
+
trailing_expr: None,
|
|
1342
1370
|
}
|
|
1343
1371
|
}
|
|
1344
1372
|
}
|
|
@@ -2370,6 +2398,90 @@ COPY public.events (id, payload) FROM stdin;
|
|
|
2370
2398
|
);
|
|
2371
2399
|
}
|
|
2372
2400
|
|
|
2401
|
+
#[test]
|
|
2402
|
+
fn parse_values_rows_tracks_trailing_cast_for_quoted_literals() {
|
|
2403
|
+
let rows =
|
|
2404
|
+
parse_values_rows("(1, '{\"profile\":{\"secret\":\"alpha\"}}'::jsonb, 'note'::text)")
|
|
2405
|
+
.unwrap();
|
|
2406
|
+
assert_eq!(rows.len(), 1);
|
|
2407
|
+
assert_eq!(rows[0].len(), 3);
|
|
2408
|
+
assert_eq!(
|
|
2409
|
+
rows[0][1].original.as_deref(),
|
|
2410
|
+
Some("{\"profile\":{\"secret\":\"alpha\"}}")
|
|
2411
|
+
);
|
|
2412
|
+
assert_eq!(rows[0][1].trailing_expr.as_deref(), Some("::jsonb"));
|
|
2413
|
+
assert_eq!(rows[0][2].original.as_deref(), Some("note"));
|
|
2414
|
+
assert_eq!(rows[0][2].trailing_expr.as_deref(), Some("::text"));
|
|
2415
|
+
}
|
|
2416
|
+
|
|
2417
|
+
#[test]
|
|
2418
|
+
fn pipeline_anonymizes_nested_json_paths_for_jsonb_cast_insert_rows() {
|
|
2419
|
+
let mut rules: HashMap<String, HashMap<String, AnonymizerSpec>> = HashMap::new();
|
|
2420
|
+
let mut cols: HashMap<String, AnonymizerSpec> = HashMap::new();
|
|
2421
|
+
cols.insert(
|
|
2422
|
+
"payload.profile.secret".to_string(),
|
|
2423
|
+
AnonymizerSpec {
|
|
2424
|
+
strategy: "string".to_string(),
|
|
2425
|
+
salt: None,
|
|
2426
|
+
min: None,
|
|
2427
|
+
max: None,
|
|
2428
|
+
length: Some(8),
|
|
2429
|
+
min_days: None,
|
|
2430
|
+
max_days: None,
|
|
2431
|
+
min_seconds: None,
|
|
2432
|
+
max_seconds: None,
|
|
2433
|
+
domain: Some("secrets".to_string()),
|
|
2434
|
+
unique_within_domain: None,
|
|
2435
|
+
as_string: Some(true),
|
|
2436
|
+
locale: None,
|
|
2437
|
+
faker: None,
|
|
2438
|
+
format: None,
|
|
2439
|
+
},
|
|
2440
|
+
);
|
|
2441
|
+
rules.insert("public.events".to_string(), cols);
|
|
2442
|
+
let cfg = ResolvedConfig {
|
|
2443
|
+
salt: None,
|
|
2444
|
+
rules,
|
|
2445
|
+
row_filters: HashMap::new(),
|
|
2446
|
+
column_cases: HashMap::new(),
|
|
2447
|
+
sensitive_columns: HashMap::new(),
|
|
2448
|
+
output_scan: crate::settings::OutputScanConfig::default(),
|
|
2449
|
+
source_path: None,
|
|
2450
|
+
};
|
|
2451
|
+
let reg = AnonymizerRegistry::from_config(&cfg);
|
|
2452
|
+
let mut proc =
|
|
2453
|
+
SqlStreamProcessor::new(reg, cfg, Vec::new(), Vec::new(), None, DumpFormat::Postgres);
|
|
2454
|
+
let input = r#"
|
|
2455
|
+
CREATE TABLE public.events (id int, payload jsonb);
|
|
2456
|
+
INSERT INTO public.events (id, payload) VALUES
|
|
2457
|
+
(1, '{"profile":{"tier":"gold","secret":"alpha"}}'::jsonb),
|
|
2458
|
+
(2, '{"profile":{"tier":"gold","secret":"alpha"}}'::jsonb);
|
|
2459
|
+
"#;
|
|
2460
|
+
let mut reader = std::io::BufReader::new(input.as_bytes());
|
|
2461
|
+
let mut out = Vec::new();
|
|
2462
|
+
proc.process(&mut reader, &mut out).unwrap();
|
|
2463
|
+
let s = String::from_utf8(out).unwrap();
|
|
2464
|
+
assert!(!s.contains("alpha"), "nested secret should be anonymized");
|
|
2465
|
+
assert!(s.contains("::jsonb"), "jsonb cast should be preserved");
|
|
2466
|
+
|
|
2467
|
+
let insert_pos = s.find("INSERT INTO public.events").unwrap();
|
|
2468
|
+
let insert_tail = &s[insert_pos..];
|
|
2469
|
+
let insert_end = insert_tail.find(";\n").unwrap() + insert_pos;
|
|
2470
|
+
let ins_stmt = &s[insert_pos..=insert_end];
|
|
2471
|
+
let vals_idx = ins_stmt.to_uppercase().find("VALUES").unwrap();
|
|
2472
|
+
let ins_block = strip_trailing_semicolon(ins_stmt[vals_idx + "VALUES".len()..].trim());
|
|
2473
|
+
let ins_rows = parse_values_rows(ins_block).unwrap();
|
|
2474
|
+
assert_eq!(ins_rows[0][1].trailing_expr.as_deref(), Some("::jsonb"));
|
|
2475
|
+
assert_eq!(ins_rows[1][1].trailing_expr.as_deref(), Some("::jsonb"));
|
|
2476
|
+
let v0 =
|
|
2477
|
+
serde_json::from_str::<serde_json::Value>(ins_rows[0][1].original.as_ref().unwrap())
|
|
2478
|
+
.unwrap();
|
|
2479
|
+
let v1 =
|
|
2480
|
+
serde_json::from_str::<serde_json::Value>(ins_rows[1][1].original.as_ref().unwrap())
|
|
2481
|
+
.unwrap();
|
|
2482
|
+
assert_eq!(v0["profile"]["secret"], v1["profile"]["secret"]);
|
|
2483
|
+
}
|
|
2484
|
+
|
|
2373
2485
|
#[test]
|
|
2374
2486
|
fn generated_values_fit_length_restricted_columns_from_create_table() {
|
|
2375
2487
|
let mut rules: HashMap<String, HashMap<String, AnonymizerSpec>> = HashMap::new();
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|