dumpling-cli 0.4.0__tar.gz → 0.4.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dumpling_cli-0.4.0 → dumpling_cli-0.4.2}/CHANGELOG.md +15 -0
- {dumpling_cli-0.4.0 → dumpling_cli-0.4.2}/Cargo.lock +1 -1
- {dumpling_cli-0.4.0 → dumpling_cli-0.4.2}/Cargo.toml +1 -1
- {dumpling_cli-0.4.0 → dumpling_cli-0.4.2}/PKG-INFO +55 -9
- {dumpling_cli-0.4.0 → dumpling_cli-0.4.2}/README.md +54 -8
- dumpling_cli-0.4.2/assets/logo.svg +33 -0
- {dumpling_cli-0.4.0 → dumpling_cli-0.4.2}/pyproject.toml +1 -1
- {dumpling_cli-0.4.0 → dumpling_cli-0.4.2}/src/filter.rs +234 -5
- {dumpling_cli-0.4.0 → dumpling_cli-0.4.2}/src/sql.rs +279 -13
- {dumpling_cli-0.4.0 → dumpling_cli-0.4.2}/.dumplingconf.example +0 -0
- {dumpling_cli-0.4.0 → dumpling_cli-0.4.2}/.github/workflows/ci.yml +0 -0
- {dumpling_cli-0.4.0 → dumpling_cli-0.4.2}/.github/workflows/docs-pr.yml +0 -0
- {dumpling_cli-0.4.0 → dumpling_cli-0.4.2}/.github/workflows/docs.yml +0 -0
- {dumpling_cli-0.4.0 → dumpling_cli-0.4.2}/.github/workflows/platform-compat-latest.yml +0 -0
- {dumpling_cli-0.4.0 → dumpling_cli-0.4.2}/.github/workflows/platform-compat-matrix.yml +0 -0
- {dumpling_cli-0.4.0 → dumpling_cli-0.4.2}/.github/workflows/policy-lint.yml +0 -0
- {dumpling_cli-0.4.0 → dumpling_cli-0.4.2}/.github/workflows/publish.yml +0 -0
- {dumpling_cli-0.4.0 → dumpling_cli-0.4.2}/.github/workflows/release.yml +0 -0
- {dumpling_cli-0.4.0 → dumpling_cli-0.4.2}/.github/workflows/tests.yml +0 -0
- {dumpling_cli-0.4.0 → dumpling_cli-0.4.2}/.gitignore +0 -0
- {dumpling_cli-0.4.0 → dumpling_cli-0.4.2}/AGENTS.md +0 -0
- {dumpling_cli-0.4.0 → dumpling_cli-0.4.2}/CONTRIBUTING.md +0 -0
- {dumpling_cli-0.4.0 → dumpling_cli-0.4.2}/MAINTENANCE.md +0 -0
- {dumpling_cli-0.4.0 → dumpling_cli-0.4.2}/book.toml +0 -0
- {dumpling_cli-0.4.0 → dumpling_cli-0.4.2}/datetime_out.sql +0 -0
- {dumpling_cli-0.4.0 → dumpling_cli-0.4.2}/datetime_sample.sql +0 -0
- {dumpling_cli-0.4.0 → dumpling_cli-0.4.2}/docs/src/SUMMARY.md +0 -0
- {dumpling_cli-0.4.0 → dumpling_cli-0.4.2}/docs/src/ci-guardrails.md +0 -0
- {dumpling_cli-0.4.0 → dumpling_cli-0.4.2}/docs/src/configuration.md +0 -0
- {dumpling_cli-0.4.0 → dumpling_cli-0.4.2}/docs/src/getting-started.md +0 -0
- {dumpling_cli-0.4.0 → dumpling_cli-0.4.2}/docs/src/index.md +0 -0
- {dumpling_cli-0.4.0 → dumpling_cli-0.4.2}/docs/src/releasing.md +0 -0
- {dumpling_cli-0.4.0 → dumpling_cli-0.4.2}/rust-toolchain.toml +0 -0
- {dumpling_cli-0.4.0 → dumpling_cli-0.4.2}/scripts/setup-dev.sh +0 -0
- {dumpling_cli-0.4.0 → dumpling_cli-0.4.2}/src/faker_dispatch.rs +0 -0
- {dumpling_cli-0.4.0 → dumpling_cli-0.4.2}/src/lint.rs +0 -0
- {dumpling_cli-0.4.0 → dumpling_cli-0.4.2}/src/main.rs +0 -0
- {dumpling_cli-0.4.0 → dumpling_cli-0.4.2}/src/report.rs +0 -0
- {dumpling_cli-0.4.0 → dumpling_cli-0.4.2}/src/scan.rs +0 -0
- {dumpling_cli-0.4.0 → dumpling_cli-0.4.2}/src/settings.rs +0 -0
- {dumpling_cli-0.4.0 → dumpling_cli-0.4.2}/src/transform.rs +0 -0
|
@@ -7,6 +7,19 @@ and this project follows [Semantic Versioning](https://semver.org/spec/v2.0.0.ht
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
+
## [0.4.2] - 2026-05-03
|
|
11
|
+
|
|
12
|
+
### Fixed
|
|
13
|
+
|
|
14
|
+
- **JSON path rules on non-JSON cells**: Path-based `[rules]` anonymization is skipped when the cell is not strict JSON, leaving the original value unchanged (consistent with row-filter JSON path behavior).
|
|
15
|
+
- **JSON scalar types in path-based anonymization**: Replacements at JSON paths preserve number and boolean leaf types where possible (numeric and boolean coercion from generated text).
|
|
16
|
+
|
|
17
|
+
## [0.4.1] - 2026-05-03
|
|
18
|
+
|
|
19
|
+
### Fixed
|
|
20
|
+
|
|
21
|
+
- **INSERT row parsing with JSON casts**: Values such as `'{"k":1}'::jsonb` are parsed so the cell’s unescaped payload is valid JSON for JSON path rules and anonymization; trailing casts like `::jsonb` / `::text` are preserved on output.
|
|
22
|
+
|
|
10
23
|
## [0.4.0] - 2026-05-02
|
|
11
24
|
|
|
12
25
|
### Added
|
|
@@ -55,6 +68,8 @@ and this project follows [Semantic Versioning](https://semver.org/spec/v2.0.0.ht
|
|
|
55
68
|
- Configurable output scan severities and per-category thresholds via `[output_scan]`.
|
|
56
69
|
- JSON report section for output scan findings including category, count, threshold, severity, and sample locations.
|
|
57
70
|
|
|
71
|
+
[0.4.2]: https://github.com/ababic/dumpling/compare/v0.4.1...v0.4.2
|
|
72
|
+
[0.4.1]: https://github.com/ababic/dumpling/compare/v0.4.0...v0.4.1
|
|
58
73
|
[0.4.0]: https://github.com/ababic/dumpling/compare/v0.3.0...v0.4.0
|
|
59
74
|
[0.3.0]: https://github.com/ababic/dumpling/compare/v0.2.0...v0.3.0
|
|
60
75
|
[0.2.0]: https://github.com/ababic/dumpling/compare/v0.1.0...v0.2.0
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dumpling-cli
|
|
3
|
-
Version: 0.4.0
|
|
3
|
+
Version: 0.4.2
|
|
4
4
|
Classifier: Development Status :: 4 - Beta
|
|
5
5
|
Classifier: Environment :: Console
|
|
6
6
|
Classifier: Intended Audience :: Developers
|
|
@@ -19,18 +19,48 @@ Keywords: postgres,sqlite,mssql,sql,anonymization,cli,rust
|
|
|
19
19
|
Requires-Python: >=3.8
|
|
20
20
|
Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
|
|
21
21
|
|
|
22
|
-
|
|
22
|
+
<p align="center">
|
|
23
|
+
<img src="assets/logo.svg" width="140" height="140" alt="Dumpling logo: a dumpling with steam" />
|
|
24
|
+
</p>
|
|
23
25
|
|
|
24
|
-
|
|
26
|
+
<h1 align="center">Dumpling</h1>
|
|
27
|
+
|
|
28
|
+
<p align="center">
|
|
29
|
+
<strong>Sanitize SQL dumps before they go anywhere.</strong><br />
|
|
30
|
+
Turn huge <code>pg_dump</code> / SQLite / SQL Server exports into shareable, test-friendly snapshots — no DB connection, no secrets left by accident.
|
|
31
|
+
</p>
|
|
32
|
+
|
|
33
|
+
<p align="center">
|
|
34
|
+
<a href="https://pypi.org/project/dumpling-cli/"><img src="https://img.shields.io/pypi/v/dumpling-cli.svg" alt="PyPI version" /></a>
|
|
35
|
+
<a href="https://pypi.org/project/dumpling-cli/"><img src="https://img.shields.io/pypi/pyversions/dumpling-cli.svg" alt="Python versions" /></a>
|
|
36
|
+
<a href="https://pypi.org/project/dumpling-cli/"><img src="https://img.shields.io/pypi/l/dumpling-cli.svg" alt="PyPI license" /></a>
|
|
37
|
+
<a href="https://github.com/ababic/dumpling/actions/workflows/tests.yml"><img src="https://github.com/ababic/dumpling/actions/workflows/tests.yml/badge.svg" alt="Tests" /></a>
|
|
38
|
+
<a href="https://github.com/ababic/dumpling/actions/workflows/ci.yml"><img src="https://github.com/ababic/dumpling/actions/workflows/ci.yml/badge.svg" alt="Lint" /></a>
|
|
39
|
+
<img src="https://img.shields.io/badge/rust-stable-orange?logo=rust" alt="Rust stable" />
|
|
40
|
+
</p>
|
|
41
|
+
|
|
42
|
+
<p align="center">
|
|
43
|
+
<a href="https://ababic.github.io/dumpling/"><strong>Documentation</strong></a>
|
|
44
|
+
·
|
|
45
|
+
<a href="https://github.com/ababic/dumpling"><strong>GitHub</strong></a>
|
|
46
|
+
</p>
|
|
47
|
+
|
|
48
|
+
<p align="center">
|
|
49
|
+
<sub><em>Disclaimer: This project is entirely vibe-coded, but with strong human guidance, review, and attention to quality and safety.</em></sub>
|
|
50
|
+
</p>
|
|
51
|
+
|
|
52
|
+
---
|
|
53
|
+
|
|
54
|
+
**Dumpling** reads plain-text SQL dumps (PostgreSQL `pg_dump`, SQLite `.dump`, SQL Server / MSSQL scripts) and rewrites sensitive columns using rules you define in TOML. Everything runs offline on files — ideal for CI, staging share-outs, and compliance-minded workflows.
|
|
25
55
|
|
|
26
56
|
## Why Dumpling?
|
|
27
57
|
|
|
28
|
-
- **
|
|
29
|
-
- **
|
|
30
|
-
- **
|
|
31
|
-
- **
|
|
32
|
-
- **
|
|
33
|
-
- **
|
|
58
|
+
- **Offline by design** — works on dump files only; nothing connects to your database.
|
|
59
|
+
- **Streams giant files** — line-by-line processing keeps multi‑GB dumps reasonable on modest hardware.
|
|
60
|
+
- **Fails loud, not silent** — missing config exits non‑zero and lists where Dumpling looked; use `--allow-noop` only when you mean it.
|
|
61
|
+
- **Stable pseudonyms** — optional domain mappings keep the same source value as the same fake value across tables (foreign keys stay consistent).
|
|
62
|
+
- **Pipeline-ready** — `--check`, strict coverage, JSON reports, and residual PII scans fit pre-merge gates and release automation.
|
|
63
|
+
- **Configure once** — `.dumplingconf` or `[tool.dumpling]` in `pyproject.toml`; install via **Rust** (`cargo`) or **`pip install dumpling-cli`**.
|
|
34
64
|
|
|
35
65
|
---
|
|
36
66
|
|
|
@@ -344,6 +374,22 @@ Supported predicate operators:
|
|
|
344
374
|
|
|
345
375
|
Predicates can target nested JSON values using dot notation (`payload.profile.tier`) or Django-style notation (`payload__profile__tier`). For JSON arrays, path segments are evaluated against each element, so list-of-dicts structures can be matched naturally.
|
|
346
376
|
|
|
377
|
+
### JSON path list targeting
|
|
378
|
+
|
|
379
|
+
JSON list/array traversal is automatic once a path segment resolves to an array.
|
|
380
|
+
|
|
381
|
+
- **All elements in an array**: use the next field name directly.
|
|
382
|
+
- `payload.items.kind` or `payload__items__kind`
|
|
383
|
+
- Matches/rewrites `kind` for every object in `items`.
|
|
384
|
+
- **Specific array index**: use a numeric segment.
|
|
385
|
+
- `payload.items.0.kind` or `payload__items__0__kind`
|
|
386
|
+
- Targets only the first element.
|
|
387
|
+
- **Nested arrays**: combine field and index segments as needed.
|
|
388
|
+
- `payload.groups.members.email`
|
|
389
|
+
- `payload.groups.1.members.0.email`
|
|
390
|
+
|
|
391
|
+
This path behavior is shared by both `row_filters` predicates and JSON-path anonymization rules in `[rules]`.
|
|
392
|
+
|
|
347
393
|
```toml
|
|
348
394
|
[row_filters."public.users"]
|
|
349
395
|
retain = [
|
|
@@ -1,15 +1,45 @@
|
|
|
1
|
-
|
|
1
|
+
<p align="center">
|
|
2
|
+
<img src="assets/logo.svg" width="140" height="140" alt="Dumpling logo: a dumpling with steam" />
|
|
3
|
+
</p>
|
|
4
|
+
|
|
5
|
+
<h1 align="center">Dumpling</h1>
|
|
6
|
+
|
|
7
|
+
<p align="center">
|
|
8
|
+
<strong>Sanitize SQL dumps before they go anywhere.</strong><br />
|
|
9
|
+
Turn huge <code>pg_dump</code> / SQLite / SQL Server exports into shareable, test-friendly snapshots — no DB connection, no secrets left by accident.
|
|
10
|
+
</p>
|
|
11
|
+
|
|
12
|
+
<p align="center">
|
|
13
|
+
<a href="https://pypi.org/project/dumpling-cli/"><img src="https://img.shields.io/pypi/v/dumpling-cli.svg" alt="PyPI version" /></a>
|
|
14
|
+
<a href="https://pypi.org/project/dumpling-cli/"><img src="https://img.shields.io/pypi/pyversions/dumpling-cli.svg" alt="Python versions" /></a>
|
|
15
|
+
<a href="https://pypi.org/project/dumpling-cli/"><img src="https://img.shields.io/pypi/l/dumpling-cli.svg" alt="PyPI license" /></a>
|
|
16
|
+
<a href="https://github.com/ababic/dumpling/actions/workflows/tests.yml"><img src="https://github.com/ababic/dumpling/actions/workflows/tests.yml/badge.svg" alt="Tests" /></a>
|
|
17
|
+
<a href="https://github.com/ababic/dumpling/actions/workflows/ci.yml"><img src="https://github.com/ababic/dumpling/actions/workflows/ci.yml/badge.svg" alt="Lint" /></a>
|
|
18
|
+
<img src="https://img.shields.io/badge/rust-stable-orange?logo=rust" alt="Rust stable" />
|
|
19
|
+
</p>
|
|
20
|
+
|
|
21
|
+
<p align="center">
|
|
22
|
+
<a href="https://ababic.github.io/dumpling/"><strong>Documentation</strong></a>
|
|
23
|
+
·
|
|
24
|
+
<a href="https://github.com/ababic/dumpling"><strong>GitHub</strong></a>
|
|
25
|
+
</p>
|
|
26
|
+
|
|
27
|
+
<p align="center">
|
|
28
|
+
<sub><em>Disclaimer: This project is entirely vibe-coded, but with strong human guidance, review, and attention to quality and safety.</em></sub>
|
|
29
|
+
</p>
|
|
2
30
|
|
|
3
|
-
|
|
31
|
+
---
|
|
32
|
+
|
|
33
|
+
**Dumpling** reads plain-text SQL dumps (PostgreSQL `pg_dump`, SQLite `.dump`, SQL Server / MSSQL scripts) and rewrites sensitive columns using rules you define in TOML. Everything runs offline on files — ideal for CI, staging share-outs, and compliance-minded workflows.
|
|
4
34
|
|
|
5
35
|
## Why Dumpling?
|
|
6
36
|
|
|
7
|
-
- **
|
|
8
|
-
- **
|
|
9
|
-
- **
|
|
10
|
-
- **
|
|
11
|
-
- **
|
|
12
|
-
- **
|
|
37
|
+
- **Offline by design** — works on dump files only; nothing connects to your database.
|
|
38
|
+
- **Streams giant files** — line-by-line processing keeps multi‑GB dumps reasonable on modest hardware.
|
|
39
|
+
- **Fails loud, not silent** — missing config exits non‑zero and lists where Dumpling looked; use `--allow-noop` only when you mean it.
|
|
40
|
+
- **Stable pseudonyms** — optional domain mappings keep the same source value as the same fake value across tables (foreign keys stay consistent).
|
|
41
|
+
- **Pipeline-ready** — `--check`, strict coverage, JSON reports, and residual PII scans fit pre-merge gates and release automation.
|
|
42
|
+
- **Configure once** — `.dumplingconf` or `[tool.dumpling]` in `pyproject.toml`; install via **Rust** (`cargo`) or **`pip install dumpling-cli`**.
|
|
13
43
|
|
|
14
44
|
---
|
|
15
45
|
|
|
@@ -323,6 +353,22 @@ Supported predicate operators:
|
|
|
323
353
|
|
|
324
354
|
Predicates can target nested JSON values using dot notation (`payload.profile.tier`) or Django-style notation (`payload__profile__tier`). For JSON arrays, path segments are evaluated against each element, so list-of-dicts structures can be matched naturally.
|
|
325
355
|
|
|
356
|
+
### JSON path list targeting
|
|
357
|
+
|
|
358
|
+
JSON list/array traversal is automatic once a path segment resolves to an array.
|
|
359
|
+
|
|
360
|
+
- **All elements in an array**: use the next field name directly.
|
|
361
|
+
- `payload.items.kind` or `payload__items__kind`
|
|
362
|
+
- Matches/rewrites `kind` for every object in `items`.
|
|
363
|
+
- **Specific array index**: use a numeric segment.
|
|
364
|
+
- `payload.items.0.kind` or `payload__items__0__kind`
|
|
365
|
+
- Targets only the first element.
|
|
366
|
+
- **Nested arrays**: combine field and index segments as needed.
|
|
367
|
+
- `payload.groups.members.email`
|
|
368
|
+
- `payload.groups.1.members.0.email`
|
|
369
|
+
|
|
370
|
+
This path behavior is shared by both `row_filters` predicates and JSON-path anonymization rules in `[rules]`.
|
|
371
|
+
|
|
326
372
|
```toml
|
|
327
373
|
[row_filters."public.users"]
|
|
328
374
|
retain = [
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 128 128" role="img" aria-label="Dumpling logo">
|
|
2
|
+
<defs>
|
|
3
|
+
<linearGradient id="steam" x1="0%" y1="100%" x2="0%" y2="0%">
|
|
4
|
+
<stop offset="0%" stop-color="#e8f4fc" stop-opacity="0"/>
|
|
5
|
+
<stop offset="50%" stop-color="#cfe9fb" stop-opacity="0.85"/>
|
|
6
|
+
<stop offset="100%" stop-color="#b8dff8" stop-opacity="0"/>
|
|
7
|
+
</linearGradient>
|
|
8
|
+
<linearGradient id="dough" x1="0%" y1="0%" x2="100%" y2="100%">
|
|
9
|
+
<stop offset="0%" stop-color="#fff8ef"/>
|
|
10
|
+
<stop offset="45%" stop-color="#f4dcc4"/>
|
|
11
|
+
<stop offset="100%" stop-color="#e8b896"/>
|
|
12
|
+
</linearGradient>
|
|
13
|
+
<linearGradient id="shadow" x1="0%" y1="0%" x2="0%" y2="100%">
|
|
14
|
+
<stop offset="0%" stop-color="#c4865a" stop-opacity="0.35"/>
|
|
15
|
+
<stop offset="100%" stop-color="#8b5a3c" stop-opacity="0.15"/>
|
|
16
|
+
</linearGradient>
|
|
17
|
+
</defs>
|
|
18
|
+
<!-- Steam -->
|
|
19
|
+
<path fill="url(#steam)" d="M44 18c2-6 8-10 14-8s8 10 4 15c-3 4-2 9 2 12 5 4 5 12 0 16-6 5-16 4-20-3-2-4 0-9 4-11 3-2 3-6 0-9-4-4-4-10 0-12z"/>
|
|
20
|
+
<path fill="url(#steam)" opacity="0.75" d="M64 14c3-5 9-7 14-4 5 3 6 10 2 14-4 4-3 10 2 13 6 4 7 13 1 18-7 6-19 4-24-5-2-5 1-11 6-13 4-2 4-7 1-10-5-4-5-11 0-13z"/>
|
|
21
|
+
<path fill="url(#steam)" opacity="0.6" d="M82 20c2-5 8-8 13-5 5 3 7 10 3 15-3 4-2 9 3 12 5 3 7 11 2 16-6 6-17 5-22-3-2-4 0-9 5-11 3-2 4-6 1-9-4-4-4-10 0-13z"/>
|
|
22
|
+
<!-- Plate -->
|
|
23
|
+
<ellipse cx="64" cy="108" rx="52" ry="10" fill="#dfe8ef"/>
|
|
24
|
+
<ellipse cx="64" cy="106" rx="48" ry="8" fill="#eef4f8"/>
|
|
25
|
+
<!-- Dumpling body -->
|
|
26
|
+
<ellipse cx="64" cy="82" rx="42" ry="28" fill="url(#dough)" stroke="#d4a574" stroke-width="2"/>
|
|
27
|
+
<ellipse cx="64" cy="96" rx="38" ry="12" fill="url(#shadow)"/>
|
|
28
|
+
<!-- Pleats -->
|
|
29
|
+
<path fill="none" stroke="#c9956a" stroke-width="1.8" stroke-linecap="round" d="M34 58c6 10 14 16 30 16s24-6 30-16"/>
|
|
30
|
+
<path fill="none" stroke="#d9b08a" stroke-width="1.2" stroke-linecap="round" opacity="0.9" d="M42 54c5 8 13 13 22 13s17-5 22-13"/>
|
|
31
|
+
<!-- Highlight -->
|
|
32
|
+
<ellipse cx="48" cy="76" rx="10" ry="6" fill="#ffffff" opacity="0.35"/>
|
|
33
|
+
</svg>
|
|
@@ -224,25 +224,118 @@ fn replacement_to_json_value(repl: &Replacement) -> serde_json::Value {
|
|
|
224
224
|
.unwrap_or_else(|_| serde_json::Value::String(repl.value.clone()))
|
|
225
225
|
}
|
|
226
226
|
|
|
227
|
+
/// When rewriting JSON at a path, map `Replacement` back into [`serde_json::Value`] while keeping
|
|
228
|
+
/// the leaf's JSON type when the strategy still returns text (e.g. `Replacement::quoted` for
|
|
229
|
+
/// `string`, `hash`, etc.): numeric and boolean leaves stay JSON numbers/bools if the replacement
|
|
230
|
+
/// text parses as such.
|
|
231
|
+
fn coerce_json_path_replacement(
|
|
232
|
+
original: &serde_json::Value,
|
|
233
|
+
repl: &Replacement,
|
|
234
|
+
) -> serde_json::Value {
|
|
235
|
+
if repl.is_null {
|
|
236
|
+
return serde_json::Value::Null;
|
|
237
|
+
}
|
|
238
|
+
match original {
|
|
239
|
+
serde_json::Value::Bool(_) => {
|
|
240
|
+
if let Some(b) = parse_loose_json_bool(&repl.value) {
|
|
241
|
+
return serde_json::Value::Bool(b);
|
|
242
|
+
}
|
|
243
|
+
if !repl.force_quoted {
|
|
244
|
+
if let Ok(v) = serde_json::from_str::<serde_json::Value>(&repl.value) {
|
|
245
|
+
match v {
|
|
246
|
+
serde_json::Value::Bool(b) => return serde_json::Value::Bool(b),
|
|
247
|
+
serde_json::Value::Number(n) => {
|
|
248
|
+
if n.as_u64() == Some(0) || n.as_i64() == Some(0) {
|
|
249
|
+
return serde_json::Value::Bool(false);
|
|
250
|
+
}
|
|
251
|
+
if n.as_u64() == Some(1) || n.as_i64() == Some(1) {
|
|
252
|
+
return serde_json::Value::Bool(true);
|
|
253
|
+
}
|
|
254
|
+
}
|
|
255
|
+
_ => {}
|
|
256
|
+
}
|
|
257
|
+
}
|
|
258
|
+
}
|
|
259
|
+
serde_json::Value::String(repl.value.clone())
|
|
260
|
+
}
|
|
261
|
+
serde_json::Value::Number(_) => {
|
|
262
|
+
if let Some(n) = parse_loose_json_number(&repl.value) {
|
|
263
|
+
return serde_json::Value::Number(n);
|
|
264
|
+
}
|
|
265
|
+
if !repl.force_quoted {
|
|
266
|
+
if let Ok(serde_json::Value::Number(n)) =
|
|
267
|
+
serde_json::from_str::<serde_json::Value>(&repl.value)
|
|
268
|
+
{
|
|
269
|
+
return serde_json::Value::Number(n);
|
|
270
|
+
}
|
|
271
|
+
}
|
|
272
|
+
serde_json::Value::String(repl.value.clone())
|
|
273
|
+
}
|
|
274
|
+
serde_json::Value::String(_) => {
|
|
275
|
+
if repl.force_quoted {
|
|
276
|
+
serde_json::Value::String(repl.value.clone())
|
|
277
|
+
} else {
|
|
278
|
+
serde_json::from_str(&repl.value)
|
|
279
|
+
.unwrap_or_else(|_| serde_json::Value::String(repl.value.clone()))
|
|
280
|
+
}
|
|
281
|
+
}
|
|
282
|
+
serde_json::Value::Null => replacement_to_json_value(repl),
|
|
283
|
+
serde_json::Value::Array(_) | serde_json::Value::Object(_) => {
|
|
284
|
+
replacement_to_json_value(repl)
|
|
285
|
+
}
|
|
286
|
+
}
|
|
287
|
+
}
|
|
288
|
+
|
|
289
|
+
fn parse_loose_json_bool(s: &str) -> Option<bool> {
|
|
290
|
+
match s.trim().to_ascii_lowercase().as_str() {
|
|
291
|
+
"true" => Some(true),
|
|
292
|
+
"false" => Some(false),
|
|
293
|
+
_ => None,
|
|
294
|
+
}
|
|
295
|
+
}
|
|
296
|
+
|
|
297
|
+
fn parse_loose_json_number(s: &str) -> Option<serde_json::Number> {
|
|
298
|
+
let t = s.trim();
|
|
299
|
+
if t.is_empty() {
|
|
300
|
+
return None;
|
|
301
|
+
}
|
|
302
|
+
if let Ok(i) = t.parse::<i64>() {
|
|
303
|
+
return Some(i.into());
|
|
304
|
+
}
|
|
305
|
+
if let Ok(u) = t.parse::<u64>() {
|
|
306
|
+
return Some(u.into());
|
|
307
|
+
}
|
|
308
|
+
let f = t.parse::<f64>().ok()?;
|
|
309
|
+
serde_json::Number::from_f64(f)
|
|
310
|
+
}
|
|
311
|
+
|
|
227
312
|
fn apply_leaf_replacement(target: &mut serde_json::Value, repl: &Replacement) {
|
|
228
|
-
|
|
313
|
+
let original = target.clone();
|
|
314
|
+
*target = coerce_json_path_replacement(&original, repl);
|
|
229
315
|
}
|
|
230
316
|
|
|
231
317
|
/// Mutate JSON document strings at configured paths using the same path semantics as predicates.
|
|
318
|
+
///
|
|
319
|
+
/// Returns [`None`] when `raw_json` is not valid strict JSON (same tolerance as row-filter JSON
|
|
320
|
+
/// path extraction): path rules are skipped for that cell and callers should passthrough the
|
|
321
|
+
/// original value unchanged.
|
|
232
322
|
pub fn rewrite_json_paths_with_rules(
|
|
233
323
|
registry: &AnonymizerRegistry,
|
|
234
324
|
column_max_len: Option<usize>,
|
|
235
325
|
json_rules: &[(Vec<String>, AnonymizerSpec)],
|
|
236
326
|
raw_json: &str,
|
|
237
|
-
) -> anyhow::Result<String> {
|
|
238
|
-
let mut root = serde_json::from_str::<serde_json::Value>(raw_json)
|
|
327
|
+
) -> anyhow::Result<Option<String>> {
|
|
328
|
+
let mut root = match serde_json::from_str::<serde_json::Value>(raw_json) {
|
|
329
|
+
Ok(v) => v,
|
|
330
|
+
Err(_) => return Ok(None),
|
|
331
|
+
};
|
|
239
332
|
for (path, spec) in json_rules {
|
|
240
333
|
let mut apply = |original_cell: Option<String>| {
|
|
241
334
|
apply_anonymizer(registry, spec, original_cell.as_deref(), column_max_len)
|
|
242
335
|
};
|
|
243
336
|
mutate_json_at_path(&mut root, path, &mut apply)?;
|
|
244
337
|
}
|
|
245
|
-
Ok(root.to_string())
|
|
338
|
+
Ok(Some(root.to_string()))
|
|
246
339
|
}
|
|
247
340
|
|
|
248
341
|
fn mutate_json_at_path<F>(
|
|
@@ -457,7 +550,8 @@ fn get_cached_regex(pat: &str, case_insensitive: bool) -> regex::Regex {
|
|
|
457
550
|
#[cfg(test)]
|
|
458
551
|
mod tests {
|
|
459
552
|
use super::*;
|
|
460
|
-
use crate::settings::{ResolvedConfig, RowFilterSet};
|
|
553
|
+
use crate::settings::{AnonymizerSpec, ResolvedConfig, RowFilterSet};
|
|
554
|
+
use crate::transform::AnonymizerRegistry;
|
|
461
555
|
use std::collections::HashMap;
|
|
462
556
|
|
|
463
557
|
#[test]
|
|
@@ -630,4 +724,139 @@ mod tests {
|
|
|
630
724
|
&[Some(r#"{"items":[{"kind":"secondary"}]}"#.to_string())]
|
|
631
725
|
));
|
|
632
726
|
}
|
|
727
|
+
|
|
728
|
+
#[test]
|
|
729
|
+
fn rewrite_json_paths_skips_non_json_cells_like_row_filters() {
|
|
730
|
+
let mut rules: HashMap<String, HashMap<String, AnonymizerSpec>> = HashMap::new();
|
|
731
|
+
let spec = AnonymizerSpec {
|
|
732
|
+
strategy: "string".to_string(),
|
|
733
|
+
salt: None,
|
|
734
|
+
min: None,
|
|
735
|
+
max: None,
|
|
736
|
+
length: Some(4),
|
|
737
|
+
min_days: None,
|
|
738
|
+
max_days: None,
|
|
739
|
+
min_seconds: None,
|
|
740
|
+
max_seconds: None,
|
|
741
|
+
domain: None,
|
|
742
|
+
unique_within_domain: None,
|
|
743
|
+
as_string: Some(true),
|
|
744
|
+
locale: None,
|
|
745
|
+
faker: None,
|
|
746
|
+
format: None,
|
|
747
|
+
};
|
|
748
|
+
rules.insert("public.t".to_string(), HashMap::new());
|
|
749
|
+
let cfg = ResolvedConfig {
|
|
750
|
+
salt: None,
|
|
751
|
+
rules,
|
|
752
|
+
row_filters: HashMap::new(),
|
|
753
|
+
column_cases: HashMap::new(),
|
|
754
|
+
sensitive_columns: HashMap::new(),
|
|
755
|
+
output_scan: crate::settings::OutputScanConfig::default(),
|
|
756
|
+
source_path: None,
|
|
757
|
+
};
|
|
758
|
+
let registry = AnonymizerRegistry::from_config(&cfg);
|
|
759
|
+
let json_rules: Vec<(Vec<String>, AnonymizerSpec)> = vec![(
|
|
760
|
+
vec!["profile".to_string(), "secret".to_string()],
|
|
761
|
+
spec.clone(),
|
|
762
|
+
)];
|
|
763
|
+
assert!(
|
|
764
|
+
rewrite_json_paths_with_rules(&registry, None, &json_rules, "{not json")
|
|
765
|
+
.unwrap()
|
|
766
|
+
.is_none()
|
|
767
|
+
);
|
|
768
|
+
let out = rewrite_json_paths_with_rules(
|
|
769
|
+
&registry,
|
|
770
|
+
None,
|
|
771
|
+
&json_rules,
|
|
772
|
+
r#"{"profile":{"secret":"x"}}"#,
|
|
773
|
+
)
|
|
774
|
+
.unwrap()
|
|
775
|
+
.expect("valid JSON should rewrite");
|
|
776
|
+
let v: serde_json::Value = serde_json::from_str(&out).unwrap();
|
|
777
|
+
assert_ne!(v["profile"]["secret"], "x");
|
|
778
|
+
}
|
|
779
|
+
|
|
780
|
+
#[test]
|
|
781
|
+
fn rewrite_json_paths_preserves_number_and_bool_leaf_types_for_quoted_replacements() {
|
|
782
|
+
let mut rules: HashMap<String, HashMap<String, AnonymizerSpec>> = HashMap::new();
|
|
783
|
+
rules.insert("public.t".to_string(), HashMap::new());
|
|
784
|
+
let cfg = ResolvedConfig {
|
|
785
|
+
salt: None,
|
|
786
|
+
rules,
|
|
787
|
+
row_filters: HashMap::new(),
|
|
788
|
+
column_cases: HashMap::new(),
|
|
789
|
+
sensitive_columns: HashMap::new(),
|
|
790
|
+
output_scan: crate::settings::OutputScanConfig::default(),
|
|
791
|
+
source_path: None,
|
|
792
|
+
};
|
|
793
|
+
let registry = AnonymizerRegistry::from_config(&cfg);
|
|
794
|
+
|
|
795
|
+
let int_spec = AnonymizerSpec {
|
|
796
|
+
strategy: "int_range".to_string(),
|
|
797
|
+
salt: None,
|
|
798
|
+
min: Some(0),
|
|
799
|
+
max: Some(9),
|
|
800
|
+
length: None,
|
|
801
|
+
min_days: None,
|
|
802
|
+
max_days: None,
|
|
803
|
+
min_seconds: None,
|
|
804
|
+
max_seconds: None,
|
|
805
|
+
domain: Some("coerce_int_leaf".to_string()),
|
|
806
|
+
unique_within_domain: None,
|
|
807
|
+
as_string: None,
|
|
808
|
+
locale: None,
|
|
809
|
+
faker: None,
|
|
810
|
+
format: None,
|
|
811
|
+
};
|
|
812
|
+
let out = rewrite_json_paths_with_rules(
|
|
813
|
+
&registry,
|
|
814
|
+
None,
|
|
815
|
+
&[(vec!["n".to_string()], int_spec)],
|
|
816
|
+
r#"{"n":1,"b":true,"s":"x"}"#,
|
|
817
|
+
)
|
|
818
|
+
.unwrap()
|
|
819
|
+
.unwrap();
|
|
820
|
+
let v: serde_json::Value = serde_json::from_str(&out).unwrap();
|
|
821
|
+
assert!(
|
|
822
|
+
v["n"].is_number(),
|
|
823
|
+
"int_range replacement should stay JSON number, got {:?}",
|
|
824
|
+
v["n"]
|
|
825
|
+
);
|
|
826
|
+
assert_eq!(v["b"], true);
|
|
827
|
+
assert_eq!(v["s"], "x");
|
|
828
|
+
|
|
829
|
+
let string_spec = AnonymizerSpec {
|
|
830
|
+
strategy: "int_range".to_string(),
|
|
831
|
+
salt: None,
|
|
832
|
+
min: Some(0),
|
|
833
|
+
max: Some(0),
|
|
834
|
+
length: None,
|
|
835
|
+
min_days: None,
|
|
836
|
+
max_days: None,
|
|
837
|
+
min_seconds: None,
|
|
838
|
+
max_seconds: None,
|
|
839
|
+
domain: Some("coerce_bool_leaf".to_string()),
|
|
840
|
+
unique_within_domain: None,
|
|
841
|
+
as_string: None,
|
|
842
|
+
locale: None,
|
|
843
|
+
faker: None,
|
|
844
|
+
format: None,
|
|
845
|
+
};
|
|
846
|
+
let out2 = rewrite_json_paths_with_rules(
|
|
847
|
+
&registry,
|
|
848
|
+
None,
|
|
849
|
+
&[(vec!["b".to_string()], string_spec)],
|
|
850
|
+
r#"{"b":false}"#,
|
|
851
|
+
)
|
|
852
|
+
.unwrap()
|
|
853
|
+
.unwrap();
|
|
854
|
+
let v2: serde_json::Value = serde_json::from_str(&out2).unwrap();
|
|
855
|
+
assert!(
|
|
856
|
+
v2["b"].is_boolean(),
|
|
857
|
+
"unquoted 0 from int_range should coerce to bool at bool leaf, got {:?}",
|
|
858
|
+
v2["b"]
|
|
859
|
+
);
|
|
860
|
+
assert_eq!(v2["b"], false);
|
|
861
|
+
}
|
|
633
862
|
}
|
|
@@ -562,7 +562,11 @@ impl SqlStreamProcessor {
|
|
|
562
562
|
None => return Ok(None),
|
|
563
563
|
};
|
|
564
564
|
let specs: Vec<AnonymizerSpec> = json_owned.iter().map(|(_, s)| s.clone()).collect();
|
|
565
|
-
let out =
|
|
565
|
+
let Some(out) =
|
|
566
|
+
rewrite_json_paths_with_rules(&self.anonymizers, col_len, &json_owned, raw)?
|
|
567
|
+
else {
|
|
568
|
+
return Ok(None);
|
|
569
|
+
};
|
|
566
570
|
let repl = Replacement::quoted(out);
|
|
567
571
|
Ok(Some((repl, specs)))
|
|
568
572
|
}
|
|
@@ -1155,20 +1159,22 @@ struct Cell {
|
|
|
1155
1159
|
original: Option<String>, // None for NULL
|
|
1156
1160
|
was_quoted: bool,
|
|
1157
1161
|
was_default: bool,
|
|
1162
|
+
trailing_expr: Option<String>,
|
|
1158
1163
|
}
|
|
1159
1164
|
|
|
1160
1165
|
impl Cell {
|
|
1161
1166
|
fn render_original(&self) -> String {
|
|
1167
|
+
let trailing = self.trailing_expr.as_deref().unwrap_or("");
|
|
1162
1168
|
if self.was_default {
|
|
1163
|
-
return "DEFAULT"
|
|
1169
|
+
return format!("DEFAULT{trailing}");
|
|
1164
1170
|
}
|
|
1165
1171
|
match &self.original {
|
|
1166
|
-
None => "NULL"
|
|
1172
|
+
None => format!("NULL{trailing}"),
|
|
1167
1173
|
Some(s) => {
|
|
1168
1174
|
if self.was_quoted {
|
|
1169
|
-
format!("'{}'", s.replace('\'', "''"))
|
|
1175
|
+
format!("'{}'{trailing}", s.replace('\'', "''"))
|
|
1170
1176
|
} else {
|
|
1171
|
-
s
|
|
1177
|
+
format!("{s}{trailing}")
|
|
1172
1178
|
}
|
|
1173
1179
|
}
|
|
1174
1180
|
}
|
|
@@ -1176,14 +1182,15 @@ impl Cell {
|
|
|
1176
1182
|
}
|
|
1177
1183
|
|
|
1178
1184
|
fn render_cell(repl: &Replacement, original: &Cell) -> String {
|
|
1185
|
+
let trailing = original.trailing_expr.as_deref().unwrap_or("");
|
|
1179
1186
|
if repl.is_null {
|
|
1180
|
-
return "NULL"
|
|
1187
|
+
return format!("NULL{trailing}");
|
|
1181
1188
|
}
|
|
1182
1189
|
let should_quote = repl.force_quoted || original.was_quoted;
|
|
1183
1190
|
if should_quote {
|
|
1184
|
-
format!("'{}'", repl.value.replace('\'', "''"))
|
|
1191
|
+
format!("'{}'{trailing}", repl.value.replace('\'', "''"))
|
|
1185
1192
|
} else {
|
|
1186
|
-
repl.value
|
|
1193
|
+
format!("{}{trailing}", repl.value)
|
|
1187
1194
|
}
|
|
1188
1195
|
}
|
|
1189
1196
|
|
|
@@ -1243,7 +1250,9 @@ fn parse_parenthesized_values(s: &str) -> anyhow::Result<(Vec<Cell>, usize)> {
|
|
|
1243
1250
|
let mut cells: Vec<Cell> = Vec::new();
|
|
1244
1251
|
let mut in_single = false;
|
|
1245
1252
|
let mut buf = String::new();
|
|
1253
|
+
let mut trailing_expr = String::new();
|
|
1246
1254
|
let mut was_quoted = false;
|
|
1255
|
+
let mut closed_quoted_literal = false;
|
|
1247
1256
|
while i < chs.len() {
|
|
1248
1257
|
let c = chs[i];
|
|
1249
1258
|
if in_single {
|
|
@@ -1255,6 +1264,7 @@ fn parse_parenthesized_values(s: &str) -> anyhow::Result<(Vec<Cell>, usize)> {
|
|
|
1255
1264
|
continue;
|
|
1256
1265
|
} else {
|
|
1257
1266
|
in_single = false;
|
|
1267
|
+
closed_quoted_literal = true;
|
|
1258
1268
|
i += 1;
|
|
1259
1269
|
continue;
|
|
1260
1270
|
}
|
|
@@ -1282,17 +1292,19 @@ fn parse_parenthesized_values(s: &str) -> anyhow::Result<(Vec<Cell>, usize)> {
|
|
|
1282
1292
|
}
|
|
1283
1293
|
')' => {
|
|
1284
1294
|
// end cell, end row
|
|
1285
|
-
let cell = finalize_cell(&buf, was_quoted);
|
|
1295
|
+
let cell = finalize_cell(&buf, was_quoted, &trailing_expr);
|
|
1286
1296
|
cells.push(cell);
|
|
1287
1297
|
i += 1;
|
|
1288
1298
|
return Ok((cells, i));
|
|
1289
1299
|
}
|
|
1290
1300
|
',' => {
|
|
1291
1301
|
// end cell
|
|
1292
|
-
let cell = finalize_cell(&buf, was_quoted);
|
|
1302
|
+
let cell = finalize_cell(&buf, was_quoted, &trailing_expr);
|
|
1293
1303
|
cells.push(cell);
|
|
1294
1304
|
buf.clear();
|
|
1305
|
+
trailing_expr.clear();
|
|
1295
1306
|
was_quoted = false;
|
|
1307
|
+
closed_quoted_literal = false;
|
|
1296
1308
|
i += 1;
|
|
1297
1309
|
// consume following spaces
|
|
1298
1310
|
while i < chs.len() && chs[i].is_whitespace() {
|
|
@@ -1300,11 +1312,19 @@ fn parse_parenthesized_values(s: &str) -> anyhow::Result<(Vec<Cell>, usize)> {
|
|
|
1300
1312
|
}
|
|
1301
1313
|
}
|
|
1302
1314
|
c if c.is_whitespace() => {
|
|
1303
|
-
//
|
|
1315
|
+
// Preserve whitespace after a quoted literal so explicit SQL casts stay intact.
|
|
1316
|
+
if was_quoted && closed_quoted_literal {
|
|
1317
|
+
trailing_expr.push(c);
|
|
1318
|
+
}
|
|
1319
|
+
// Skip insignificant whitespace between tokens when unquoted.
|
|
1304
1320
|
i += 1;
|
|
1305
1321
|
}
|
|
1306
1322
|
other => {
|
|
1307
|
-
|
|
1323
|
+
if was_quoted && closed_quoted_literal {
|
|
1324
|
+
trailing_expr.push(other);
|
|
1325
|
+
} else {
|
|
1326
|
+
buf.push(other);
|
|
1327
|
+
}
|
|
1308
1328
|
i += 1;
|
|
1309
1329
|
}
|
|
1310
1330
|
}
|
|
@@ -1313,12 +1333,21 @@ fn parse_parenthesized_values(s: &str) -> anyhow::Result<(Vec<Cell>, usize)> {
|
|
|
1313
1333
|
anyhow::bail!("unterminated values row")
|
|
1314
1334
|
}
|
|
1315
1335
|
|
|
1316
|
-
fn finalize_cell(buf: &str, was_quoted: bool) -> Cell {
|
|
1336
|
+
fn finalize_cell(buf: &str, was_quoted: bool, trailing_expr: &str) -> Cell {
|
|
1337
|
+
let trailing = {
|
|
1338
|
+
let t = trailing_expr.trim();
|
|
1339
|
+
if t.is_empty() {
|
|
1340
|
+
None
|
|
1341
|
+
} else {
|
|
1342
|
+
Some(t.to_string())
|
|
1343
|
+
}
|
|
1344
|
+
};
|
|
1317
1345
|
if was_quoted {
|
|
1318
1346
|
Cell {
|
|
1319
1347
|
original: Some(buf.to_string()),
|
|
1320
1348
|
was_quoted: true,
|
|
1321
1349
|
was_default: false,
|
|
1350
|
+
trailing_expr: trailing,
|
|
1322
1351
|
}
|
|
1323
1352
|
} else {
|
|
1324
1353
|
let t = buf.trim();
|
|
@@ -1327,18 +1356,21 @@ fn finalize_cell(buf: &str, was_quoted: bool) -> Cell {
|
|
|
1327
1356
|
original: None,
|
|
1328
1357
|
was_quoted: false,
|
|
1329
1358
|
was_default: false,
|
|
1359
|
+
trailing_expr: None,
|
|
1330
1360
|
}
|
|
1331
1361
|
} else if t.eq_ignore_ascii_case("default") {
|
|
1332
1362
|
Cell {
|
|
1333
1363
|
original: None,
|
|
1334
1364
|
was_quoted: false,
|
|
1335
1365
|
was_default: true,
|
|
1366
|
+
trailing_expr: None,
|
|
1336
1367
|
}
|
|
1337
1368
|
} else {
|
|
1338
1369
|
Cell {
|
|
1339
1370
|
original: Some(t.to_string()),
|
|
1340
1371
|
was_quoted: false,
|
|
1341
1372
|
was_default: false,
|
|
1373
|
+
trailing_expr: None,
|
|
1342
1374
|
}
|
|
1343
1375
|
}
|
|
1344
1376
|
}
|
|
@@ -2370,6 +2402,240 @@ COPY public.events (id, payload) FROM stdin;
|
|
|
2370
2402
|
);
|
|
2371
2403
|
}
|
|
2372
2404
|
|
|
2405
|
+
#[test]
|
|
2406
|
+
fn pipeline_json_path_rules_passthrough_non_json_cells() {
|
|
2407
|
+
let mut rules: HashMap<String, HashMap<String, AnonymizerSpec>> = HashMap::new();
|
|
2408
|
+
let mut cols: HashMap<String, AnonymizerSpec> = HashMap::new();
|
|
2409
|
+
cols.insert(
|
|
2410
|
+
"payload.profile.secret".to_string(),
|
|
2411
|
+
AnonymizerSpec {
|
|
2412
|
+
strategy: "string".to_string(),
|
|
2413
|
+
salt: None,
|
|
2414
|
+
min: None,
|
|
2415
|
+
max: None,
|
|
2416
|
+
length: Some(8),
|
|
2417
|
+
min_days: None,
|
|
2418
|
+
max_days: None,
|
|
2419
|
+
min_seconds: None,
|
|
2420
|
+
max_seconds: None,
|
|
2421
|
+
domain: Some("secrets".to_string()),
|
|
2422
|
+
unique_within_domain: None,
|
|
2423
|
+
as_string: Some(true),
|
|
2424
|
+
locale: None,
|
|
2425
|
+
faker: None,
|
|
2426
|
+
format: None,
|
|
2427
|
+
},
|
|
2428
|
+
);
|
|
2429
|
+
rules.insert("public.events".to_string(), cols);
|
|
2430
|
+
let cfg = ResolvedConfig {
|
|
2431
|
+
salt: None,
|
|
2432
|
+
rules,
|
|
2433
|
+
row_filters: HashMap::new(),
|
|
2434
|
+
column_cases: HashMap::new(),
|
|
2435
|
+
sensitive_columns: HashMap::new(),
|
|
2436
|
+
output_scan: crate::settings::OutputScanConfig::default(),
|
|
2437
|
+
source_path: None,
|
|
2438
|
+
};
|
|
2439
|
+
let reg = AnonymizerRegistry::from_config(&cfg);
|
|
2440
|
+
let mut proc =
|
|
2441
|
+
SqlStreamProcessor::new(reg, cfg, Vec::new(), Vec::new(), None, DumpFormat::Postgres);
|
|
2442
|
+
let input = r#"
|
|
2443
|
+
CREATE TABLE public.events (id int, payload jsonb);
|
|
2444
|
+
INSERT INTO public.events (id, payload) VALUES
|
|
2445
|
+
(1, '{not strict json}'),
|
|
2446
|
+
(2, '{"profile":{"tier":"gold","secret":"alpha"}}');
|
|
2447
|
+
|
|
2448
|
+
COPY public.events (id, payload) FROM stdin;
|
|
2449
|
+
3 {not strict json}
|
|
2450
|
+
4 {"profile":{"tier":"gold","secret":"alpha"}}
|
|
2451
|
+
\.
|
|
2452
|
+
"#;
|
|
2453
|
+
let mut reader = std::io::BufReader::new(input.as_bytes());
|
|
2454
|
+
let mut out = Vec::new();
|
|
2455
|
+
proc.process(&mut reader, &mut out).unwrap();
|
|
2456
|
+
let s = String::from_utf8(out).unwrap();
|
|
2457
|
+
assert!(
|
|
2458
|
+
s.contains("(1, '{not strict json}')"),
|
|
2459
|
+
"non-JSON INSERT cell should passthrough unchanged, got:\n{s}"
|
|
2460
|
+
);
|
|
2461
|
+
assert!(
|
|
2462
|
+
!s.contains("alpha"),
|
|
2463
|
+
"valid JSON INSERT row should still anonymize nested paths, got:\n{s}"
|
|
2464
|
+
);
|
|
2465
|
+
assert!(
|
|
2466
|
+
s.contains("\n3\t{not strict json}\n"),
|
|
2467
|
+
"non-JSON COPY cell should passthrough unchanged, got:\n{s}"
|
|
2468
|
+
);
|
|
2469
|
+
assert!(
|
|
2470
|
+
!s.contains("\n4\t{\"profile\":{\"tier\":\"gold\",\"secret\":\"alpha\"}}\n"),
|
|
2471
|
+
"valid JSON COPY row should anonymize nested secret, got:\n{s}"
|
|
2472
|
+
);
|
|
2473
|
+
}
|
|
2474
|
+
|
|
2475
|
+
#[test]
|
|
2476
|
+
fn pipeline_json_path_int_range_preserves_json_number_type() {
|
|
2477
|
+
let mut rules: HashMap<String, HashMap<String, AnonymizerSpec>> = HashMap::new();
|
|
2478
|
+
let mut cols: HashMap<String, AnonymizerSpec> = HashMap::new();
|
|
2479
|
+
cols.insert(
|
|
2480
|
+
"payload.score".to_string(),
|
|
2481
|
+
AnonymizerSpec {
|
|
2482
|
+
strategy: "int_range".to_string(),
|
|
2483
|
+
salt: None,
|
|
2484
|
+
min: Some(0),
|
|
2485
|
+
max: Some(100),
|
|
2486
|
+
length: None,
|
|
2487
|
+
min_days: None,
|
|
2488
|
+
max_days: None,
|
|
2489
|
+
min_seconds: None,
|
|
2490
|
+
max_seconds: None,
|
|
2491
|
+
domain: Some("pipeline_json_num".to_string()),
|
|
2492
|
+
unique_within_domain: None,
|
|
2493
|
+
as_string: None,
|
|
2494
|
+
locale: None,
|
|
2495
|
+
faker: None,
|
|
2496
|
+
format: None,
|
|
2497
|
+
},
|
|
2498
|
+
);
|
|
2499
|
+
rules.insert("public.events".to_string(), cols);
|
|
2500
|
+
let cfg = ResolvedConfig {
|
|
2501
|
+
salt: None,
|
|
2502
|
+
rules,
|
|
2503
|
+
row_filters: HashMap::new(),
|
|
2504
|
+
column_cases: HashMap::new(),
|
|
2505
|
+
sensitive_columns: HashMap::new(),
|
|
2506
|
+
output_scan: crate::settings::OutputScanConfig::default(),
|
|
2507
|
+
source_path: None,
|
|
2508
|
+
};
|
|
2509
|
+
let reg = AnonymizerRegistry::from_config(&cfg);
|
|
2510
|
+
let mut proc =
|
|
2511
|
+
SqlStreamProcessor::new(reg, cfg, Vec::new(), Vec::new(), None, DumpFormat::Postgres);
|
|
2512
|
+
let input = r#"
|
|
2513
|
+
CREATE TABLE public.events (id int, payload jsonb);
|
|
2514
|
+
INSERT INTO public.events (id, payload) VALUES
|
|
2515
|
+
(1, '{"score":42,"label":"x"}');
|
|
2516
|
+
|
|
2517
|
+
COPY public.events (id, payload) FROM stdin;
|
|
2518
|
+
2 {"score":42,"label":"x"}
|
|
2519
|
+
\.
|
|
2520
|
+
"#;
|
|
2521
|
+
let mut reader = std::io::BufReader::new(input.as_bytes());
|
|
2522
|
+
let mut out = Vec::new();
|
|
2523
|
+
proc.process(&mut reader, &mut out).unwrap();
|
|
2524
|
+
let s = String::from_utf8(out).unwrap();
|
|
2525
|
+
let insert_pos = s.find("INSERT INTO public.events").unwrap();
|
|
2526
|
+
let insert_tail = &s[insert_pos..];
|
|
2527
|
+
let insert_end = insert_tail.find(";\n").unwrap() + insert_pos;
|
|
2528
|
+
let ins_stmt = &s[insert_pos..=insert_end];
|
|
2529
|
+
let vals_idx = ins_stmt.to_uppercase().find("VALUES").unwrap();
|
|
2530
|
+
let ins_block = strip_trailing_semicolon(ins_stmt[vals_idx + "VALUES".len()..].trim());
|
|
2531
|
+
let ins_rows = parse_values_rows(ins_block).unwrap();
|
|
2532
|
+
let copy_line = s
|
|
2533
|
+
.lines()
|
|
2534
|
+
.find(|l| l.starts_with("2\t{"))
|
|
2535
|
+
.expect("expected COPY data row");
|
|
2536
|
+
let copy_json = copy_line.split_once('\t').unwrap().1;
|
|
2537
|
+
let v_ins =
|
|
2538
|
+
serde_json::from_str::<serde_json::Value>(ins_rows[0][1].original.as_ref().unwrap())
|
|
2539
|
+
.unwrap();
|
|
2540
|
+
let v_copy = serde_json::from_str::<serde_json::Value>(copy_json).unwrap();
|
|
2541
|
+
assert!(
|
|
2542
|
+
v_ins["score"].is_number(),
|
|
2543
|
+
"INSERT payload.score should remain JSON number, got {:?}",
|
|
2544
|
+
v_ins["score"]
|
|
2545
|
+
);
|
|
2546
|
+
assert!(
|
|
2547
|
+
v_copy["score"].is_number(),
|
|
2548
|
+
"COPY payload.score should remain JSON number, got {:?}",
|
|
2549
|
+
v_copy["score"]
|
|
2550
|
+
);
|
|
2551
|
+
assert_eq!(v_ins["score"], v_copy["score"]);
|
|
2552
|
+
assert_eq!(v_ins["label"], "x");
|
|
2553
|
+
}
|
|
2554
|
+
|
|
2555
|
+
#[test]
|
|
2556
|
+
fn parse_values_rows_tracks_trailing_cast_for_quoted_literals() {
|
|
2557
|
+
let rows =
|
|
2558
|
+
parse_values_rows("(1, '{\"profile\":{\"secret\":\"alpha\"}}'::jsonb, 'note'::text)")
|
|
2559
|
+
.unwrap();
|
|
2560
|
+
assert_eq!(rows.len(), 1);
|
|
2561
|
+
assert_eq!(rows[0].len(), 3);
|
|
2562
|
+
assert_eq!(
|
|
2563
|
+
rows[0][1].original.as_deref(),
|
|
2564
|
+
Some("{\"profile\":{\"secret\":\"alpha\"}}")
|
|
2565
|
+
);
|
|
2566
|
+
assert_eq!(rows[0][1].trailing_expr.as_deref(), Some("::jsonb"));
|
|
2567
|
+
assert_eq!(rows[0][2].original.as_deref(), Some("note"));
|
|
2568
|
+
assert_eq!(rows[0][2].trailing_expr.as_deref(), Some("::text"));
|
|
2569
|
+
}
|
|
2570
|
+
|
|
2571
|
+
#[test]
|
|
2572
|
+
fn pipeline_anonymizes_nested_json_paths_for_jsonb_cast_insert_rows() {
|
|
2573
|
+
let mut rules: HashMap<String, HashMap<String, AnonymizerSpec>> = HashMap::new();
|
|
2574
|
+
let mut cols: HashMap<String, AnonymizerSpec> = HashMap::new();
|
|
2575
|
+
cols.insert(
|
|
2576
|
+
"payload.profile.secret".to_string(),
|
|
2577
|
+
AnonymizerSpec {
|
|
2578
|
+
strategy: "string".to_string(),
|
|
2579
|
+
salt: None,
|
|
2580
|
+
min: None,
|
|
2581
|
+
max: None,
|
|
2582
|
+
length: Some(8),
|
|
2583
|
+
min_days: None,
|
|
2584
|
+
max_days: None,
|
|
2585
|
+
min_seconds: None,
|
|
2586
|
+
max_seconds: None,
|
|
2587
|
+
domain: Some("secrets".to_string()),
|
|
2588
|
+
unique_within_domain: None,
|
|
2589
|
+
as_string: Some(true),
|
|
2590
|
+
locale: None,
|
|
2591
|
+
faker: None,
|
|
2592
|
+
format: None,
|
|
2593
|
+
},
|
|
2594
|
+
);
|
|
2595
|
+
rules.insert("public.events".to_string(), cols);
|
|
2596
|
+
let cfg = ResolvedConfig {
|
|
2597
|
+
salt: None,
|
|
2598
|
+
rules,
|
|
2599
|
+
row_filters: HashMap::new(),
|
|
2600
|
+
column_cases: HashMap::new(),
|
|
2601
|
+
sensitive_columns: HashMap::new(),
|
|
2602
|
+
output_scan: crate::settings::OutputScanConfig::default(),
|
|
2603
|
+
source_path: None,
|
|
2604
|
+
};
|
|
2605
|
+
let reg = AnonymizerRegistry::from_config(&cfg);
|
|
2606
|
+
let mut proc =
|
|
2607
|
+
SqlStreamProcessor::new(reg, cfg, Vec::new(), Vec::new(), None, DumpFormat::Postgres);
|
|
2608
|
+
let input = r#"
|
|
2609
|
+
CREATE TABLE public.events (id int, payload jsonb);
|
|
2610
|
+
INSERT INTO public.events (id, payload) VALUES
|
|
2611
|
+
(1, '{"profile":{"tier":"gold","secret":"alpha"}}'::jsonb),
|
|
2612
|
+
(2, '{"profile":{"tier":"gold","secret":"alpha"}}'::jsonb);
|
|
2613
|
+
"#;
|
|
2614
|
+
let mut reader = std::io::BufReader::new(input.as_bytes());
|
|
2615
|
+
let mut out = Vec::new();
|
|
2616
|
+
proc.process(&mut reader, &mut out).unwrap();
|
|
2617
|
+
let s = String::from_utf8(out).unwrap();
|
|
2618
|
+
assert!(!s.contains("alpha"), "nested secret should be anonymized");
|
|
2619
|
+
assert!(s.contains("::jsonb"), "jsonb cast should be preserved");
|
|
2620
|
+
|
|
2621
|
+
let insert_pos = s.find("INSERT INTO public.events").unwrap();
|
|
2622
|
+
let insert_tail = &s[insert_pos..];
|
|
2623
|
+
let insert_end = insert_tail.find(";\n").unwrap() + insert_pos;
|
|
2624
|
+
let ins_stmt = &s[insert_pos..=insert_end];
|
|
2625
|
+
let vals_idx = ins_stmt.to_uppercase().find("VALUES").unwrap();
|
|
2626
|
+
let ins_block = strip_trailing_semicolon(ins_stmt[vals_idx + "VALUES".len()..].trim());
|
|
2627
|
+
let ins_rows = parse_values_rows(ins_block).unwrap();
|
|
2628
|
+
assert_eq!(ins_rows[0][1].trailing_expr.as_deref(), Some("::jsonb"));
|
|
2629
|
+
assert_eq!(ins_rows[1][1].trailing_expr.as_deref(), Some("::jsonb"));
|
|
2630
|
+
let v0 =
|
|
2631
|
+
serde_json::from_str::<serde_json::Value>(ins_rows[0][1].original.as_ref().unwrap())
|
|
2632
|
+
.unwrap();
|
|
2633
|
+
let v1 =
|
|
2634
|
+
serde_json::from_str::<serde_json::Value>(ins_rows[1][1].original.as_ref().unwrap())
|
|
2635
|
+
.unwrap();
|
|
2636
|
+
assert_eq!(v0["profile"]["secret"], v1["profile"]["secret"]);
|
|
2637
|
+
}
|
|
2638
|
+
|
|
2373
2639
|
#[test]
|
|
2374
2640
|
fn generated_values_fit_length_restricted_columns_from_create_table() {
|
|
2375
2641
|
let mut rules: HashMap<String, HashMap<String, AnonymizerSpec>> = HashMap::new();
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|