dumpling-cli 0.4.2__tar.gz → 0.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dumpling_cli-0.4.2 → dumpling_cli-0.5.0}/.dumplingconf.example +6 -6
- {dumpling_cli-0.4.2 → dumpling_cli-0.5.0}/CHANGELOG.md +23 -0
- {dumpling_cli-0.4.2 → dumpling_cli-0.5.0}/Cargo.lock +1 -1
- {dumpling_cli-0.4.2 → dumpling_cli-0.5.0}/Cargo.toml +1 -1
- {dumpling_cli-0.4.2 → dumpling_cli-0.5.0}/PKG-INFO +11 -7
- {dumpling_cli-0.4.2 → dumpling_cli-0.5.0}/README.md +10 -6
- {dumpling_cli-0.4.2 → dumpling_cli-0.5.0}/docs/src/releasing.md +1 -1
- {dumpling_cli-0.4.2 → dumpling_cli-0.5.0}/pyproject.toml +1 -1
- {dumpling_cli-0.4.2 → dumpling_cli-0.5.0}/src/faker_dispatch.rs +66 -6
- {dumpling_cli-0.4.2 → dumpling_cli-0.5.0}/src/filter.rs +13 -24
- {dumpling_cli-0.4.2 → dumpling_cli-0.5.0}/src/main.rs +23 -5
- {dumpling_cli-0.4.2 → dumpling_cli-0.5.0}/src/settings.rs +26 -41
- {dumpling_cli-0.4.2 → dumpling_cli-0.5.0}/src/sql.rs +156 -137
- {dumpling_cli-0.4.2 → dumpling_cli-0.5.0}/src/transform.rs +49 -53
- {dumpling_cli-0.4.2 → dumpling_cli-0.5.0}/.github/workflows/ci.yml +0 -0
- {dumpling_cli-0.4.2 → dumpling_cli-0.5.0}/.github/workflows/docs-pr.yml +0 -0
- {dumpling_cli-0.4.2 → dumpling_cli-0.5.0}/.github/workflows/docs.yml +0 -0
- {dumpling_cli-0.4.2 → dumpling_cli-0.5.0}/.github/workflows/platform-compat-latest.yml +0 -0
- {dumpling_cli-0.4.2 → dumpling_cli-0.5.0}/.github/workflows/platform-compat-matrix.yml +0 -0
- {dumpling_cli-0.4.2 → dumpling_cli-0.5.0}/.github/workflows/policy-lint.yml +0 -0
- {dumpling_cli-0.4.2 → dumpling_cli-0.5.0}/.github/workflows/publish.yml +0 -0
- {dumpling_cli-0.4.2 → dumpling_cli-0.5.0}/.github/workflows/release.yml +0 -0
- {dumpling_cli-0.4.2 → dumpling_cli-0.5.0}/.github/workflows/tests.yml +0 -0
- {dumpling_cli-0.4.2 → dumpling_cli-0.5.0}/.gitignore +0 -0
- {dumpling_cli-0.4.2 → dumpling_cli-0.5.0}/AGENTS.md +0 -0
- {dumpling_cli-0.4.2 → dumpling_cli-0.5.0}/CONTRIBUTING.md +0 -0
- {dumpling_cli-0.4.2 → dumpling_cli-0.5.0}/MAINTENANCE.md +0 -0
- {dumpling_cli-0.4.2 → dumpling_cli-0.5.0}/assets/logo.svg +0 -0
- {dumpling_cli-0.4.2 → dumpling_cli-0.5.0}/book.toml +0 -0
- {dumpling_cli-0.4.2 → dumpling_cli-0.5.0}/datetime_out.sql +0 -0
- {dumpling_cli-0.4.2 → dumpling_cli-0.5.0}/datetime_sample.sql +0 -0
- {dumpling_cli-0.4.2 → dumpling_cli-0.5.0}/docs/src/SUMMARY.md +0 -0
- {dumpling_cli-0.4.2 → dumpling_cli-0.5.0}/docs/src/ci-guardrails.md +0 -0
- {dumpling_cli-0.4.2 → dumpling_cli-0.5.0}/docs/src/configuration.md +0 -0
- {dumpling_cli-0.4.2 → dumpling_cli-0.5.0}/docs/src/getting-started.md +0 -0
- {dumpling_cli-0.4.2 → dumpling_cli-0.5.0}/docs/src/index.md +0 -0
- {dumpling_cli-0.4.2 → dumpling_cli-0.5.0}/rust-toolchain.toml +0 -0
- {dumpling_cli-0.4.2 → dumpling_cli-0.5.0}/scripts/setup-dev.sh +0 -0
- {dumpling_cli-0.4.2 → dumpling_cli-0.5.0}/src/lint.rs +0 -0
- {dumpling_cli-0.4.2 → dumpling_cli-0.5.0}/src/report.rs +0 -0
- {dumpling_cli-0.4.2 → dumpling_cli-0.5.0}/src/scan.rs +0 -0
|
@@ -31,12 +31,12 @@ salt = "${DUMPLING_GLOBAL_SALT}"
|
|
|
31
31
|
# Faker modules: https://docs.rs/fake/latest/fake/faker/index.html
|
|
32
32
|
# Upstream repo: https://github.com/cksac/fake-rs
|
|
33
33
|
[rules."public.users"]
|
|
34
|
-
# email — fake email
|
|
35
|
-
email = { strategy = "
|
|
36
|
-
# name — locale-aware full name (see `locale`)
|
|
37
|
-
full_name = { strategy = "
|
|
38
|
-
first_name = { strategy = "
|
|
39
|
-
last_name = { strategy = "
|
|
34
|
+
# email — safe fake email (built-in); force quoted string output
|
|
35
|
+
email = { strategy = "email", domain = "customer_identity", unique_within_domain = true }
|
|
36
|
+
# name — locale-aware full name (see `locale`)
|
|
37
|
+
full_name = { strategy = "name" }
|
|
38
|
+
first_name = { strategy = "first_name" }
|
|
39
|
+
last_name = { strategy = "last_name" }
|
|
40
40
|
# phone — US-style (xxx) xxx-xxxx
|
|
41
41
|
phone = { strategy = "phone" }
|
|
42
42
|
# ssn — SHA-256 hex of original; use per-column salt for extra protection
|
|
@@ -7,6 +7,27 @@ and this project follows [Semantic Versioning](https://semver.org/spec/v2.0.0.ht
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
+
## [0.5.0] - 2026-05-03
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
|
|
14
|
+
- **First-class strategies** `email`, `name`, `first_name`, `last_name`, and `phone` in config (same generators as `faker = "internet::SafeEmail"`, `name::Name`, `name::FirstName`, `name::LastName`, and locale-aware phone). Strategy names are normalized to lowercase at load.
|
|
15
|
+
|
|
16
|
+
### Changed
|
|
17
|
+
|
|
18
|
+
- **Random-path faker/phone/PII**: one reused `StdRng` on `AnonymizerRegistry` instead of re-seeding per cell.
|
|
19
|
+
- **`faker` locale resolution**: `resolved_locale_key` avoids allocating a `String` per faker call when locale is `en` or absent.
|
|
20
|
+
|
|
21
|
+
### Performance
|
|
22
|
+
|
|
23
|
+
- Larger default I/O buffers; fewer per-line and per-row allocations on the SQL stream path (INSERT/COPY parsing and row filters).
|
|
24
|
+
|
|
25
|
+
## [0.4.3] - 2026-05-03
|
|
26
|
+
|
|
27
|
+
### Fixed
|
|
28
|
+
|
|
29
|
+
- **COPY row integrity after anonymization**: Control characters in anonymized COPY text fields are escaped so tab/newline/etc. cannot break column alignment or row boundaries ([#53](https://github.com/ababic/dumpling/pull/53)).
|
|
30
|
+
|
|
10
31
|
## [0.4.2] - 2026-05-03
|
|
11
32
|
|
|
12
33
|
### Fixed
|
|
@@ -68,6 +89,8 @@ and this project follows [Semantic Versioning](https://semver.org/spec/v2.0.0.ht
|
|
|
68
89
|
- Configurable output scan severities and per-category thresholds via `[output_scan]`.
|
|
69
90
|
- JSON report section for output scan findings including category, count, threshold, severity, and sample locations.
|
|
70
91
|
|
|
92
|
+
[0.5.0]: https://github.com/ababic/dumpling/compare/v0.4.3...v0.5.0
|
|
93
|
+
[0.4.3]: https://github.com/ababic/dumpling/compare/v0.4.2...v0.4.3
|
|
71
94
|
[0.4.2]: https://github.com/ababic/dumpling/compare/v0.4.1...v0.4.2
|
|
72
95
|
[0.4.1]: https://github.com/ababic/dumpling/compare/v0.4.0...v0.4.1
|
|
73
96
|
[0.4.0]: https://github.com/ababic/dumpling/compare/v0.3.0...v0.4.0
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dumpling-cli
|
|
3
|
-
Version: 0.4.2
|
|
3
|
+
Version: 0.5.0
|
|
4
4
|
Classifier: Development Status :: 4 - Beta
|
|
5
5
|
Classifier: Environment :: Console
|
|
6
6
|
Classifier: Intended Audience :: Developers
|
|
@@ -142,8 +142,8 @@ salt = "${DUMPLING_GLOBAL_SALT}"
|
|
|
142
142
|
|
|
143
143
|
# Rules are keyed by either "table" or "schema.table"
|
|
144
144
|
[rules."public.users"]
|
|
145
|
-
email = { strategy = "
|
|
146
|
-
name = { strategy = "
|
|
145
|
+
email = { strategy = "email", domain = "customer_identity", unique_within_domain = true }
|
|
146
|
+
name = { strategy = "name", locale = "de_de" } # German-locale name
|
|
147
147
|
ssn = { strategy = "hash", salt = "${env:DUMPLING_USERS_SSN_SALT}", as_string = true } # SHA-256 of original (salted)
|
|
148
148
|
age = { strategy = "int_range", min = 18, max = 90 }
|
|
149
149
|
|
|
@@ -183,8 +183,12 @@ token = "high"
|
|
|
183
183
|
| `redact` | Replace with `REDACTED` (string) |
|
|
184
184
|
| `uuid` | Random UUIDv4-like string |
|
|
185
185
|
| `hash` | SHA-256 hex of original value; supports per-column `salt` and global `salt` |
|
|
186
|
-
| `
|
|
186
|
+
| `email` | Safe email address (same generator as `faker = "internet::SafeEmail"`); supports `locale` |
|
|
187
|
+
| `name` | Full name (same as `faker = "name::Name"`); supports `locale` |
|
|
188
|
+
| `first_name` | First name (same as `faker = "name::FirstName"`); supports `locale` |
|
|
189
|
+
| `last_name` | Last name (same as `faker = "name::LastName"`); supports `locale` |
|
|
187
190
|
| `phone` | Locale-aware fake phone number (configurable via `locale`); defaults to English format |
|
|
191
|
+
| `faker` | Values from the Rust [`fake`](https://crates.io/crates/fake) crate ([docs.rs](https://docs.rs/fake/latest/fake/), [`faker` modules](https://docs.rs/fake/latest/fake/faker/index.html)), chosen by a **string identifier** only (`faker = "module::Type"`, e.g. `internet::SafeEmail`). Config is **data only**: nothing from TOML is compiled or executed as Rust at runtime. Use `locale` for locale-aware generators; optional `min`/`max`, `length`, `format` as documented. Unsupported targets fail at config load. New generators require a **new Dumpling release** (or your own fork), not config-side code. |
|
|
188
192
|
| `int_range` | Random integer in `[min, max]` |
|
|
189
193
|
| `string` | Random alphanumeric string (`length = 12` by default) |
|
|
190
194
|
| `date_fuzz` | Shifts a date by a random number of days in `[min_days, max_days]` (defaults: `-30..30`) |
|
|
@@ -244,7 +248,7 @@ dumpling --security-profile hardened --input dump.sql --check
|
|
|
244
248
|
- `unique_within_domain`: when true, different source values are assigned unique pseudonyms within the configured `domain`. NULL values are unaffected and always remain NULL.
|
|
245
249
|
- `min_days` / `max_days`: used by `date_fuzz`.
|
|
246
250
|
- `min_seconds` / `max_seconds`: used by `time_fuzz` and `datetime_fuzz`.
|
|
247
|
-
- `locale`: selects the language/regional format for
|
|
251
|
+
- `locale`: selects the language/regional format for `email`, `name`, `first_name`, `last_name`, `faker`, and `phone`. Supported values: `en`, `fr_fr`, `de_de`, `it_it`, `pt_br`, `pt_pt`, `ar_sa`, `zh_cn`, `zh_tw`, `ja_jp`, `cy_gb`. Defaults to `en` when not specified.
|
|
248
252
|
- `faker`: required when `strategy = "faker"`. A plain string `"module::Type"` (case-insensitive) that maps to a **built-in** generator compiled into Dumpling—not arbitrary Rust or expressions. Names follow [`fake::faker`](https://docs.rs/fake/latest/fake/faker/index.html) (e.g. `internet::SafeEmail` → `faker::internet::SafeEmail` in the crate).
|
|
249
253
|
- `format`: used with `faker = "number::NumberWithFormat"`; pattern uses `#` (0–9) and `^` (1–9) per the [`fake` crate docs](https://docs.rs/fake/latest/fake/).
|
|
250
254
|
|
|
@@ -415,7 +419,7 @@ Define default strategies in `rules."<table>"` and add ordered per-column cases
|
|
|
415
419
|
```toml
|
|
416
420
|
[rules."public.users"]
|
|
417
421
|
email = { strategy = "hash", as_string = true } # default
|
|
418
|
-
name = { strategy = "
|
|
422
|
+
name = { strategy = "name" }
|
|
419
423
|
|
|
420
424
|
[[column_cases."public.users".email]]
|
|
421
425
|
when.any = [{ column = "is_admin", op = "eq", value = "true" }]
|
|
@@ -466,7 +470,7 @@ salt = "${DUMPLING_HMAC_KEY}"
|
|
|
466
470
|
|
|
467
471
|
[rules."public.users"]
|
|
468
472
|
ssn = { strategy = "hash", as_string = true }
|
|
469
|
-
email = { strategy = "
|
|
473
|
+
email = { strategy = "email", domain = "users" }
|
|
470
474
|
```
|
|
471
475
|
|
|
472
476
|
```bash
|
|
@@ -121,8 +121,8 @@ salt = "${DUMPLING_GLOBAL_SALT}"
|
|
|
121
121
|
|
|
122
122
|
# Rules are keyed by either "table" or "schema.table"
|
|
123
123
|
[rules."public.users"]
|
|
124
|
-
email = { strategy = "
|
|
125
|
-
name = { strategy = "
|
|
124
|
+
email = { strategy = "email", domain = "customer_identity", unique_within_domain = true }
|
|
125
|
+
name = { strategy = "name", locale = "de_de" } # German-locale name
|
|
126
126
|
ssn = { strategy = "hash", salt = "${env:DUMPLING_USERS_SSN_SALT}", as_string = true } # SHA-256 of original (salted)
|
|
127
127
|
age = { strategy = "int_range", min = 18, max = 90 }
|
|
128
128
|
|
|
@@ -162,8 +162,12 @@ token = "high"
|
|
|
162
162
|
| `redact` | Replace with `REDACTED` (string) |
|
|
163
163
|
| `uuid` | Random UUIDv4-like string |
|
|
164
164
|
| `hash` | SHA-256 hex of original value; supports per-column `salt` and global `salt` |
|
|
165
|
-
| `
|
|
165
|
+
| `email` | Safe email address (same generator as `faker = "internet::SafeEmail"`); supports `locale` |
|
|
166
|
+
| `name` | Full name (same as `faker = "name::Name"`); supports `locale` |
|
|
167
|
+
| `first_name` | First name (same as `faker = "name::FirstName"`); supports `locale` |
|
|
168
|
+
| `last_name` | Last name (same as `faker = "name::LastName"`); supports `locale` |
|
|
166
169
|
| `phone` | Locale-aware fake phone number (configurable via `locale`); defaults to English format |
|
|
170
|
+
| `faker` | Values from the Rust [`fake`](https://crates.io/crates/fake) crate ([docs.rs](https://docs.rs/fake/latest/fake/), [`faker` modules](https://docs.rs/fake/latest/fake/faker/index.html)), chosen by a **string identifier** only (`faker = "module::Type"`, e.g. `internet::SafeEmail`). Config is **data only**: nothing from TOML is compiled or executed as Rust at runtime. Use `locale` for locale-aware generators; optional `min`/`max`, `length`, `format` as documented. Unsupported targets fail at config load. New generators require a **new Dumpling release** (or your own fork), not config-side code. |
|
|
167
171
|
| `int_range` | Random integer in `[min, max]` |
|
|
168
172
|
| `string` | Random alphanumeric string (`length = 12` by default) |
|
|
169
173
|
| `date_fuzz` | Shifts a date by a random number of days in `[min_days, max_days]` (defaults: `-30..30`) |
|
|
@@ -223,7 +227,7 @@ dumpling --security-profile hardened --input dump.sql --check
|
|
|
223
227
|
- `unique_within_domain`: when true, different source values are assigned unique pseudonyms within the configured `domain`. NULL values are unaffected and always remain NULL.
|
|
224
228
|
- `min_days` / `max_days`: used by `date_fuzz`.
|
|
225
229
|
- `min_seconds` / `max_seconds`: used by `time_fuzz` and `datetime_fuzz`.
|
|
226
|
-
- `locale`: selects the language/regional format for
|
|
230
|
+
- `locale`: selects the language/regional format for `email`, `name`, `first_name`, `last_name`, `faker`, and `phone`. Supported values: `en`, `fr_fr`, `de_de`, `it_it`, `pt_br`, `pt_pt`, `ar_sa`, `zh_cn`, `zh_tw`, `ja_jp`, `cy_gb`. Defaults to `en` when not specified.
|
|
227
231
|
- `faker`: required when `strategy = "faker"`. A plain string `"module::Type"` (case-insensitive) that maps to a **built-in** generator compiled into Dumpling—not arbitrary Rust or expressions. Names follow [`fake::faker`](https://docs.rs/fake/latest/fake/faker/index.html) (e.g. `internet::SafeEmail` → `faker::internet::SafeEmail` in the crate).
|
|
228
232
|
- `format`: used with `faker = "number::NumberWithFormat"`; pattern uses `#` (0–9) and `^` (1–9) per the [`fake` crate docs](https://docs.rs/fake/latest/fake/).
|
|
229
233
|
|
|
@@ -394,7 +398,7 @@ Define default strategies in `rules."<table>"` and add ordered per-column cases
|
|
|
394
398
|
```toml
|
|
395
399
|
[rules."public.users"]
|
|
396
400
|
email = { strategy = "hash", as_string = true } # default
|
|
397
|
-
name = { strategy = "
|
|
401
|
+
name = { strategy = "name" }
|
|
398
402
|
|
|
399
403
|
[[column_cases."public.users".email]]
|
|
400
404
|
when.any = [{ column = "is_admin", op = "eq", value = "true" }]
|
|
@@ -445,7 +449,7 @@ salt = "${DUMPLING_HMAC_KEY}"
|
|
|
445
449
|
|
|
446
450
|
[rules."public.users"]
|
|
447
451
|
ssn = { strategy = "hash", as_string = true }
|
|
448
|
-
email = { strategy = "
|
|
452
|
+
email = { strategy = "email", domain = "users" }
|
|
449
453
|
```
|
|
450
454
|
|
|
451
455
|
```bash
|
|
@@ -11,7 +11,7 @@ This project uses **tag-driven releases**.
|
|
|
11
11
|
## Maintainer checklist
|
|
12
12
|
|
|
13
13
|
1. Ensure `main` is green in CI.
|
|
14
|
-
2. Update `Cargo.toml`
|
|
14
|
+
2. Update `Cargo.toml` and `pyproject.toml` versions and `CHANGELOG.md`.
|
|
15
15
|
3. Open and merge a release preparation PR.
|
|
16
16
|
4. Create and push a tag from `main`:
|
|
17
17
|
|
|
@@ -89,6 +89,71 @@ pub fn parse_faker_path(faker: &str) -> Option<(&str, &str)> {
|
|
|
89
89
|
Some((module, typ))
|
|
90
90
|
}
|
|
91
91
|
|
|
92
|
+
/// Normalized locale key for `faker`, `phone`, and built-in PII strategies (`email`, `name`, …).
|
|
93
|
+
/// Uses ASCII case-insensitive matching without allocating.
|
|
94
|
+
pub fn resolved_locale_key(spec: &AnonymizerSpec) -> &'static str {
|
|
95
|
+
let s = spec.locale.as_deref().map(str::trim).unwrap_or("");
|
|
96
|
+
if s.is_empty() || s.eq_ignore_ascii_case("en") {
|
|
97
|
+
return "en";
|
|
98
|
+
}
|
|
99
|
+
if s.eq_ignore_ascii_case("fr_fr") {
|
|
100
|
+
return "fr_fr";
|
|
101
|
+
}
|
|
102
|
+
if s.eq_ignore_ascii_case("de_de") {
|
|
103
|
+
return "de_de";
|
|
104
|
+
}
|
|
105
|
+
if s.eq_ignore_ascii_case("it_it") {
|
|
106
|
+
return "it_it";
|
|
107
|
+
}
|
|
108
|
+
if s.eq_ignore_ascii_case("pt_br") {
|
|
109
|
+
return "pt_br";
|
|
110
|
+
}
|
|
111
|
+
if s.eq_ignore_ascii_case("pt_pt") {
|
|
112
|
+
return "pt_pt";
|
|
113
|
+
}
|
|
114
|
+
if s.eq_ignore_ascii_case("ar_sa") {
|
|
115
|
+
return "ar_sa";
|
|
116
|
+
}
|
|
117
|
+
if s.eq_ignore_ascii_case("zh_cn") {
|
|
118
|
+
return "zh_cn";
|
|
119
|
+
}
|
|
120
|
+
if s.eq_ignore_ascii_case("zh_tw") {
|
|
121
|
+
return "zh_tw";
|
|
122
|
+
}
|
|
123
|
+
if s.eq_ignore_ascii_case("ja_jp") {
|
|
124
|
+
return "ja_jp";
|
|
125
|
+
}
|
|
126
|
+
if s.eq_ignore_ascii_case("cy_gb") {
|
|
127
|
+
return "cy_gb";
|
|
128
|
+
}
|
|
129
|
+
"en"
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
/// Built-in `strategy = "email"` — same generator as `faker = "internet::SafeEmail"`.
|
|
133
|
+
pub fn pii_safe_email(loc: &str, rng: &mut StdRng) -> String {
|
|
134
|
+
fl!(loc, rng, SafeEmail)
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
/// Built-in `strategy = "name"` — full name.
|
|
138
|
+
pub fn pii_full_name(loc: &str, rng: &mut StdRng) -> String {
|
|
139
|
+
fl!(loc, rng, Name)
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
/// Built-in `strategy = "first_name"`.
|
|
143
|
+
pub fn pii_first_name(loc: &str, rng: &mut StdRng) -> String {
|
|
144
|
+
fl!(loc, rng, FirstName)
|
|
145
|
+
}
|
|
146
|
+
|
|
147
|
+
/// Built-in `strategy = "last_name"`.
|
|
148
|
+
pub fn pii_last_name(loc: &str, rng: &mut StdRng) -> String {
|
|
149
|
+
fl!(loc, rng, LastName)
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
/// Built-in `strategy = "phone"` — same generator as `faker` phone_number fakers.
|
|
153
|
+
pub fn pii_phone_number(loc: &str, rng: &mut StdRng) -> String {
|
|
154
|
+
fl!(loc, rng, PhoneNumber)
|
|
155
|
+
}
|
|
156
|
+
|
|
92
157
|
pub fn faker_string_with_rng(spec: &AnonymizerSpec, rng: &mut StdRng) -> Option<String> {
|
|
93
158
|
let faker = spec.faker.as_deref()?.trim();
|
|
94
159
|
if faker.is_empty() {
|
|
@@ -97,12 +162,7 @@ pub fn faker_string_with_rng(spec: &AnonymizerSpec, rng: &mut StdRng) -> Option<
|
|
|
97
162
|
let (module, typ) = parse_faker_path(faker)?;
|
|
98
163
|
let module_lc = module.to_ascii_lowercase();
|
|
99
164
|
let typ_lc = typ.to_ascii_lowercase();
|
|
100
|
-
let locale = spec
|
|
101
|
-
.locale
|
|
102
|
-
.as_deref()
|
|
103
|
-
.map(|l| l.trim().to_ascii_lowercase())
|
|
104
|
-
.unwrap_or_else(|| "en".to_string());
|
|
105
|
-
let loc = locale.as_str();
|
|
165
|
+
let loc = resolved_locale_key(spec);
|
|
106
166
|
|
|
107
167
|
let s: String = match (module_lc.as_str(), typ_lc.as_str()) {
|
|
108
168
|
("name", "firstname") => fl!(loc, rng, FirstName),
|
|
@@ -13,7 +13,7 @@ pub fn should_keep_row(
|
|
|
13
13
|
schema: Option<&str>,
|
|
14
14
|
table: &str,
|
|
15
15
|
columns: &[String],
|
|
16
|
-
cells: &[Option<String>], // unescaped strings; None for NULL
|
|
16
|
+
cells: &[Option<&str>], // unescaped strings; None for NULL
|
|
17
17
|
) -> bool {
|
|
18
18
|
let set = match lookup_row_filters(cfg, schema, table) {
|
|
19
19
|
Some(s) => s,
|
|
@@ -43,7 +43,7 @@ pub fn should_keep_row(
|
|
|
43
43
|
true
|
|
44
44
|
}
|
|
45
45
|
|
|
46
|
-
fn predicate_matches(pred: &Predicate, columns: &[String], cells: &[Option<String>]) -> bool {
|
|
46
|
+
fn predicate_matches(pred: &Predicate, columns: &[String], cells: &[Option<&str>]) -> bool {
|
|
47
47
|
let targets = match extract_predicate_targets(pred, columns, cells) {
|
|
48
48
|
Some(values) => values,
|
|
49
49
|
None => return false, // top-level column missing -> does not match
|
|
@@ -121,13 +121,14 @@ fn predicate_matches(pred: &Predicate, columns: &[String], cells: &[Option<Strin
|
|
|
121
121
|
fn extract_predicate_targets(
|
|
122
122
|
pred: &Predicate,
|
|
123
123
|
columns: &[String],
|
|
124
|
-
cells: &[Option<String>],
|
|
124
|
+
cells: &[Option<&str>],
|
|
125
125
|
) -> Option<Vec<Option<String>>> {
|
|
126
126
|
if let Some(i) = columns
|
|
127
127
|
.iter()
|
|
128
128
|
.position(|c| c.eq_ignore_ascii_case(&pred.column))
|
|
129
129
|
{
|
|
130
|
-
|
|
130
|
+
let cell = cells.get(i).copied().flatten();
|
|
131
|
+
return Some(vec![cell.map(|s| s.to_string())]);
|
|
131
132
|
}
|
|
132
133
|
|
|
133
134
|
let (base_column, path) = parse_json_column_key(&pred.column);
|
|
@@ -413,7 +414,7 @@ where
|
|
|
413
414
|
Ok(())
|
|
414
415
|
}
|
|
415
416
|
|
|
416
|
-
pub fn when_matches(when: &When, columns: &[String], cells: &[Option<String>]) -> bool {
|
|
417
|
+
pub fn when_matches(when: &When, columns: &[String], cells: &[Option<&str>]) -> bool {
|
|
417
418
|
// If any is non-empty, require at least one to match
|
|
418
419
|
if !when.any.is_empty() {
|
|
419
420
|
let mut matched_any = false;
|
|
@@ -603,11 +604,7 @@ mod tests {
|
|
|
603
604
|
Some("public"),
|
|
604
605
|
"users",
|
|
605
606
|
&cols,
|
|
606
|
-
&[
|
|
607
|
-
Some("1".to_string()),
|
|
608
|
-
Some("alice@myco.com".to_string()),
|
|
609
|
-
Some("US".to_string())
|
|
610
|
-
]
|
|
607
|
+
&[Some("1"), Some("alice@myco.com"), Some("US")]
|
|
611
608
|
));
|
|
612
609
|
// Case-insensitive keep (iregex)
|
|
613
610
|
assert!(should_keep_row(
|
|
@@ -615,11 +612,7 @@ mod tests {
|
|
|
615
612
|
Some("public"),
|
|
616
613
|
"users",
|
|
617
614
|
&cols,
|
|
618
|
-
&[
|
|
619
|
-
Some("2".to_string()),
|
|
620
|
-
Some("Carol@MYCO.COM".to_string()),
|
|
621
|
-
Some("GB".to_string())
|
|
622
|
-
]
|
|
615
|
+
&[Some("2"), Some("Carol@MYCO.COM"), Some("GB")]
|
|
623
616
|
));
|
|
624
617
|
// Delete example.com
|
|
625
618
|
assert!(!should_keep_row(
|
|
@@ -627,11 +620,7 @@ mod tests {
|
|
|
627
620
|
Some("public"),
|
|
628
621
|
"users",
|
|
629
622
|
&cols,
|
|
630
|
-
&[
|
|
631
|
-
Some("3".to_string()),
|
|
632
|
-
Some("bob@example.com".to_string()),
|
|
633
|
-
Some("US".to_string())
|
|
634
|
-
]
|
|
623
|
+
&[Some("3"), Some("bob@example.com"), Some("US")]
|
|
635
624
|
));
|
|
636
625
|
}
|
|
637
626
|
|
|
@@ -668,14 +657,14 @@ mod tests {
|
|
|
668
657
|
Some("public"),
|
|
669
658
|
"events",
|
|
670
659
|
&cols,
|
|
671
|
-
&[Some(r#"{"profile":{"tier":"gold"}}"#.to_string())]
|
|
660
|
+
&[Some(r#"{"profile":{"tier":"gold"}}"#)]
|
|
672
661
|
));
|
|
673
662
|
assert!(!should_keep_row(
|
|
674
663
|
&cfg,
|
|
675
664
|
Some("public"),
|
|
676
665
|
"events",
|
|
677
666
|
&cols,
|
|
678
|
-
&[Some(r#"{"profile":{"tier":"silver"}}"#.to_string())]
|
|
667
|
+
&[Some(r#"{"profile":{"tier":"silver"}}"#)]
|
|
679
668
|
));
|
|
680
669
|
}
|
|
681
670
|
|
|
@@ -713,7 +702,7 @@ mod tests {
|
|
|
713
702
|
"events",
|
|
714
703
|
&cols,
|
|
715
704
|
&[Some(
|
|
716
|
-
r#"{"items":[{"kind":"secondary"},{"kind":"primary"}]}"#.to_string()
|
|
705
|
+
r#"{"items":[{"kind":"secondary"},{"kind":"primary"}]}"#
|
|
717
706
|
)]
|
|
718
707
|
));
|
|
719
708
|
assert!(!should_keep_row(
|
|
@@ -721,7 +710,7 @@ mod tests {
|
|
|
721
710
|
Some("public"),
|
|
722
711
|
"events",
|
|
723
712
|
&cols,
|
|
724
|
-
&[Some(r#"{"items":[{"kind":"secondary"}]}"#.to_string())]
|
|
713
|
+
&[Some(r#"{"items":[{"kind":"secondary"}]}"#)]
|
|
725
714
|
));
|
|
726
715
|
}
|
|
727
716
|
|
|
@@ -1,5 +1,8 @@
|
|
|
1
1
|
use std::fs::File;
|
|
2
2
|
use std::io::{self, BufRead, BufReader, BufWriter, Write};
|
|
3
|
+
|
|
4
|
+
/// Larger than default 8 KiB to reduce syscall overhead on big dumps.
|
|
5
|
+
const IO_BUF_CAPACITY: usize = 256 * 1024;
|
|
3
6
|
use std::path::{Path, PathBuf};
|
|
4
7
|
use std::process::{Command, Stdio};
|
|
5
8
|
|
|
@@ -343,7 +346,10 @@ fn run_anonymize(cli: Cli) -> anyhow::Result<()> {
|
|
|
343
346
|
.take()
|
|
344
347
|
.ok_or_else(|| anyhow::anyhow!("pg_restore stdout missing"))?;
|
|
345
348
|
pg_restore_child = Some(child);
|
|
346
|
-
(
|
|
349
|
+
(
|
|
350
|
+
Box::new(BufReader::with_capacity(IO_BUF_CAPACITY, stdout)),
|
|
351
|
+
Some(archive_path.clone()),
|
|
352
|
+
)
|
|
347
353
|
} else {
|
|
348
354
|
match &cli.input {
|
|
349
355
|
Some(path) => {
|
|
@@ -360,13 +366,19 @@ fn run_anonymize(cli: Cli) -> anyhow::Result<()> {
|
|
|
360
366
|
);
|
|
361
367
|
}
|
|
362
368
|
let f = File::open(path)?;
|
|
363
|
-
(
|
|
369
|
+
(
|
|
370
|
+
Box::new(BufReader::with_capacity(IO_BUF_CAPACITY, f)),
|
|
371
|
+
Some(path.clone()),
|
|
372
|
+
)
|
|
364
373
|
}
|
|
365
374
|
None => {
|
|
366
375
|
if !cli.allow_ext.is_empty() {
|
|
367
376
|
eprintln!("dumpling: --allow-ext provided but no --input file; extension check is ignored for stdin");
|
|
368
377
|
}
|
|
369
|
-
(
|
|
378
|
+
(
|
|
379
|
+
Box::new(BufReader::with_capacity(IO_BUF_CAPACITY, io::stdin())),
|
|
380
|
+
None,
|
|
381
|
+
)
|
|
370
382
|
}
|
|
371
383
|
}
|
|
372
384
|
};
|
|
@@ -380,9 +392,15 @@ fn run_anonymize(cli: Cli) -> anyhow::Result<()> {
|
|
|
380
392
|
.ok_or_else(|| anyhow::anyhow!("--in-place requires an --input path"))?;
|
|
381
393
|
let mut tmp = input_path.clone();
|
|
382
394
|
tmp.set_extension("sql.dumpling.tmp");
|
|
383
|
-
Box::new(BufWriter::new(File::create(&tmp)?))
|
|
395
|
+
Box::new(BufWriter::with_capacity(
|
|
396
|
+
IO_BUF_CAPACITY,
|
|
397
|
+
File::create(&tmp)?,
|
|
398
|
+
))
|
|
384
399
|
} else if let Some(path) = &cli.output {
|
|
385
|
-
Box::new(BufWriter::new(File::create(path)?))
|
|
400
|
+
Box::new(BufWriter::with_capacity(
|
|
401
|
+
IO_BUF_CAPACITY,
|
|
402
|
+
File::create(path)?,
|
|
403
|
+
))
|
|
386
404
|
} else {
|
|
387
405
|
Box::new(BufWriter::new(io::stdout()))
|
|
388
406
|
};
|
|
@@ -32,7 +32,7 @@ pub struct RawConfig {
|
|
|
32
32
|
|
|
33
33
|
#[derive(Debug, Clone, Deserialize)]
|
|
34
34
|
pub struct AnonymizerSpec {
|
|
35
|
-
/// Strategy name: redact|null|uuid|hash|
|
|
35
|
+
/// Strategy name: redact|null|uuid|hash|email|name|first_name|last_name|phone|faker|int_range|string|date_fuzz|time_fuzz|datetime_fuzz
|
|
36
36
|
pub strategy: String,
|
|
37
37
|
/// if strategy=hash: optional per-column salt override; otherwise ignored
|
|
38
38
|
pub salt: Option<String>,
|
|
@@ -56,11 +56,12 @@ pub struct AnonymizerSpec {
|
|
|
56
56
|
/// Force the replacement to be rendered as a SQL string literal
|
|
57
57
|
/// If unset, we attempt to preserve the original quoting style.
|
|
58
58
|
pub as_string: Option<bool>,
|
|
59
|
-
/// Locale for locale-aware strategies:
|
|
59
|
+
/// Locale for locale-aware strategies: built-in PII (`email`, `name`, `first_name`, `last_name`), `faker`, and `phone`.
|
|
60
60
|
/// Supported values: en, fr_fr, de_de, it_it, pt_br, pt_pt, ar_sa, zh_cn, zh_tw, ja_jp, cy_gb.
|
|
61
61
|
/// Defaults to "en" when not specified.
|
|
62
62
|
pub locale: Option<String>,
|
|
63
63
|
/// When `strategy = "faker"`, selects the `fake` generator as `"module::Type"` (e.g. `internet::SafeEmail`, `name::FirstName`).
|
|
64
|
+
/// Only read when `strategy = "faker"` (other strategies must not set `faker`).
|
|
64
65
|
/// See [`fake::faker`](https://docs.rs/fake/latest/fake/faker/index.html) and the [crate docs](https://docs.rs/fake/latest/fake/).
|
|
65
66
|
#[serde(default)]
|
|
66
67
|
pub faker: Option<String>,
|
|
@@ -448,31 +449,6 @@ fn is_simple_key(key: &str) -> bool {
|
|
|
448
449
|
.all(|ch| ch.is_ascii_alphanumeric() || ch == '_')
|
|
449
450
|
}
|
|
450
451
|
|
|
451
|
-
/// Map removed built-in strategies to `faker` + `faker` path for backwards compatibility.
|
|
452
|
-
fn normalize_anonymizer_spec(mut spec: AnonymizerSpec) -> AnonymizerSpec {
|
|
453
|
-
let s = spec.strategy.to_ascii_lowercase();
|
|
454
|
-
match s.as_str() {
|
|
455
|
-
"email" => {
|
|
456
|
-
spec.strategy = "faker".to_string();
|
|
457
|
-
spec.faker = Some("internet::SafeEmail".to_string());
|
|
458
|
-
}
|
|
459
|
-
"name" => {
|
|
460
|
-
spec.strategy = "faker".to_string();
|
|
461
|
-
spec.faker = Some("name::Name".to_string());
|
|
462
|
-
}
|
|
463
|
-
"first_name" => {
|
|
464
|
-
spec.strategy = "faker".to_string();
|
|
465
|
-
spec.faker = Some("name::FirstName".to_string());
|
|
466
|
-
}
|
|
467
|
-
"last_name" => {
|
|
468
|
-
spec.strategy = "faker".to_string();
|
|
469
|
-
spec.faker = Some("name::LastName".to_string());
|
|
470
|
-
}
|
|
471
|
-
_ => {}
|
|
472
|
-
}
|
|
473
|
-
spec
|
|
474
|
-
}
|
|
475
|
-
|
|
476
452
|
fn resolve(raw: RawConfig, source_path: Option<PathBuf>) -> ResolvedConfig {
|
|
477
453
|
let RawConfig {
|
|
478
454
|
salt,
|
|
@@ -497,8 +473,9 @@ fn resolve(raw: RawConfig, source_path: Option<PathBuf>) -> ResolvedConfig {
|
|
|
497
473
|
for (table_key, cols) in rules.into_iter() {
|
|
498
474
|
let table_key_norm = table_key.to_lowercase();
|
|
499
475
|
let mut col_map: HashMap<String, AnonymizerSpec> = HashMap::new();
|
|
500
|
-
for (col, spec) in cols.into_iter() {
|
|
501
|
-
|
|
476
|
+
for (col, mut spec) in cols.into_iter() {
|
|
477
|
+
spec.strategy = spec.strategy.to_ascii_lowercase();
|
|
478
|
+
col_map.insert(col.to_lowercase(), spec);
|
|
502
479
|
}
|
|
503
480
|
normalized_rules.insert(table_key_norm, col_map);
|
|
504
481
|
}
|
|
@@ -514,7 +491,7 @@ fn resolve(raw: RawConfig, source_path: Option<PathBuf>) -> ResolvedConfig {
|
|
|
514
491
|
let cases: Vec<ColumnCase> = cases
|
|
515
492
|
.into_iter()
|
|
516
493
|
.map(|mut c| {
|
|
517
|
-
c.strategy = normalize_anonymizer_spec(c.strategy);
|
|
494
|
+
c.strategy.strategy = c.strategy.strategy.to_ascii_lowercase();
|
|
518
495
|
c
|
|
519
496
|
})
|
|
520
497
|
.collect();
|
|
@@ -570,8 +547,12 @@ const KNOWN_STRATEGIES: &[&str] = &[
|
|
|
570
547
|
"redact",
|
|
571
548
|
"uuid",
|
|
572
549
|
"hash",
|
|
573
|
-
"
|
|
550
|
+
"email",
|
|
551
|
+
"name",
|
|
552
|
+
"first_name",
|
|
553
|
+
"last_name",
|
|
574
554
|
"phone",
|
|
555
|
+
"faker",
|
|
575
556
|
"int_range",
|
|
576
557
|
"string",
|
|
577
558
|
"date_fuzz",
|
|
@@ -673,10 +654,8 @@ fn is_valid_severity(value: &str) -> bool {
|
|
|
673
654
|
}
|
|
674
655
|
|
|
675
656
|
fn validate_anonymizer_spec(spec: &AnonymizerSpec, path: &str) -> anyhow::Result<()> {
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
let spec = normalize_anonymizer_spec(spec.clone());
|
|
679
|
-
let strategy = spec.strategy.as_str();
|
|
657
|
+
let strategy = spec.strategy.to_ascii_lowercase();
|
|
658
|
+
let strategy = strategy.as_str();
|
|
680
659
|
if !KNOWN_STRATEGIES.contains(&strategy) {
|
|
681
660
|
anyhow::bail!(
|
|
682
661
|
"{}.strategy has unknown strategy '{}'; expected one of {}",
|
|
@@ -739,7 +718,12 @@ fn validate_anonymizer_spec(spec: &AnonymizerSpec, path: &str) -> anyhow::Result
|
|
|
739
718
|
unsupported.push("max_seconds");
|
|
740
719
|
}
|
|
741
720
|
}
|
|
742
|
-
if spec.locale.is_some()
|
|
721
|
+
if spec.locale.is_some()
|
|
722
|
+
&& !matches!(
|
|
723
|
+
strategy,
|
|
724
|
+
"faker" | "phone" | "email" | "name" | "first_name" | "last_name"
|
|
725
|
+
)
|
|
726
|
+
{
|
|
743
727
|
unsupported.push("locale");
|
|
744
728
|
}
|
|
745
729
|
|
|
@@ -843,7 +827,7 @@ fn validate_anonymizer_spec(spec: &AnonymizerSpec, path: &str) -> anyhow::Result
|
|
|
843
827
|
faker
|
|
844
828
|
);
|
|
845
829
|
}
|
|
846
|
-
if !crate::faker_dispatch::faker_path_supported(&spec) {
|
|
830
|
+
if !crate::faker_dispatch::faker_path_supported(spec) {
|
|
847
831
|
anyhow::bail!(
|
|
848
832
|
"{}.faker {:?} is not a supported generator; see README and \
|
|
849
833
|
https://docs.rs/fake/latest/fake/faker/index.html for upstream module names. \
|
|
@@ -853,6 +837,7 @@ fn validate_anonymizer_spec(spec: &AnonymizerSpec, path: &str) -> anyhow::Result
|
|
|
853
837
|
);
|
|
854
838
|
}
|
|
855
839
|
}
|
|
840
|
+
"email" | "name" | "first_name" | "last_name" | "phone" => {}
|
|
856
841
|
_ => {}
|
|
857
842
|
}
|
|
858
843
|
|
|
@@ -1633,7 +1618,7 @@ salt = "${vault:secret/dumpling#key}"
|
|
|
1633
1618
|
let path = write_temp_config(
|
|
1634
1619
|
r#"
|
|
1635
1620
|
[rules."public.users"]
|
|
1636
|
-
full_name = { strategy = "faker", faker = "name::Name", locale = "de_de" }
|
|
1621
|
+
full_name = { strategy = "name", locale = "de_de" }
|
|
1637
1622
|
"#,
|
|
1638
1623
|
);
|
|
1639
1624
|
let cfg = load_config(Some(&path), false).expect("locale=de_de should be valid");
|
|
@@ -1643,8 +1628,8 @@ full_name = { strategy = "faker", faker = "name::Name", locale = "de_de" }
|
|
|
1643
1628
|
.and_then(|c| c.get("full_name"))
|
|
1644
1629
|
.expect("expected full_name rule");
|
|
1645
1630
|
assert_eq!(spec.locale.as_deref(), Some("de_de"));
|
|
1646
|
-
assert_eq!(spec.strategy, "faker");
|
|
1647
|
-
|
|
1631
|
+
assert_eq!(spec.strategy, "name");
|
|
1632
|
+
assert!(spec.faker.is_none());
|
|
1648
1633
|
let _ = fs::remove_file(path);
|
|
1649
1634
|
}
|
|
1650
1635
|
|
|
@@ -1682,7 +1667,7 @@ full_name = { strategy = "faker", faker = "name::Name", locale = "klingon" }
|
|
|
1682
1667
|
}
|
|
1683
1668
|
|
|
1684
1669
|
#[test]
|
|
1685
|
-
fn
|
|
1670
|
+
fn locale_on_non_locale_strategy_fails_validation() {
|
|
1686
1671
|
let path = write_temp_config(
|
|
1687
1672
|
r#"
|
|
1688
1673
|
[rules."public.users"]
|