dumpling-cli 0.4.3__tar.gz → 0.5.0__tar.gz

This diff shows the changes between two publicly available package versions, as released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
Files changed (41)
  1. {dumpling_cli-0.4.3 → dumpling_cli-0.5.0}/.dumplingconf.example +6 -6
  2. {dumpling_cli-0.4.3 → dumpling_cli-0.5.0}/CHANGELOG.md +16 -0
  3. {dumpling_cli-0.4.3 → dumpling_cli-0.5.0}/Cargo.lock +1 -1
  4. {dumpling_cli-0.4.3 → dumpling_cli-0.5.0}/Cargo.toml +1 -1
  5. {dumpling_cli-0.4.3 → dumpling_cli-0.5.0}/PKG-INFO +11 -7
  6. {dumpling_cli-0.4.3 → dumpling_cli-0.5.0}/README.md +10 -6
  7. {dumpling_cli-0.4.3 → dumpling_cli-0.5.0}/docs/src/releasing.md +1 -1
  8. {dumpling_cli-0.4.3 → dumpling_cli-0.5.0}/pyproject.toml +1 -1
  9. {dumpling_cli-0.4.3 → dumpling_cli-0.5.0}/src/faker_dispatch.rs +66 -6
  10. {dumpling_cli-0.4.3 → dumpling_cli-0.5.0}/src/filter.rs +13 -24
  11. {dumpling_cli-0.4.3 → dumpling_cli-0.5.0}/src/main.rs +23 -5
  12. {dumpling_cli-0.4.3 → dumpling_cli-0.5.0}/src/settings.rs +26 -41
  13. {dumpling_cli-0.4.3 → dumpling_cli-0.5.0}/src/sql.rs +119 -136
  14. {dumpling_cli-0.4.3 → dumpling_cli-0.5.0}/src/transform.rs +49 -53
  15. {dumpling_cli-0.4.3 → dumpling_cli-0.5.0}/.github/workflows/ci.yml +0 -0
  16. {dumpling_cli-0.4.3 → dumpling_cli-0.5.0}/.github/workflows/docs-pr.yml +0 -0
  17. {dumpling_cli-0.4.3 → dumpling_cli-0.5.0}/.github/workflows/docs.yml +0 -0
  18. {dumpling_cli-0.4.3 → dumpling_cli-0.5.0}/.github/workflows/platform-compat-latest.yml +0 -0
  19. {dumpling_cli-0.4.3 → dumpling_cli-0.5.0}/.github/workflows/platform-compat-matrix.yml +0 -0
  20. {dumpling_cli-0.4.3 → dumpling_cli-0.5.0}/.github/workflows/policy-lint.yml +0 -0
  21. {dumpling_cli-0.4.3 → dumpling_cli-0.5.0}/.github/workflows/publish.yml +0 -0
  22. {dumpling_cli-0.4.3 → dumpling_cli-0.5.0}/.github/workflows/release.yml +0 -0
  23. {dumpling_cli-0.4.3 → dumpling_cli-0.5.0}/.github/workflows/tests.yml +0 -0
  24. {dumpling_cli-0.4.3 → dumpling_cli-0.5.0}/.gitignore +0 -0
  25. {dumpling_cli-0.4.3 → dumpling_cli-0.5.0}/AGENTS.md +0 -0
  26. {dumpling_cli-0.4.3 → dumpling_cli-0.5.0}/CONTRIBUTING.md +0 -0
  27. {dumpling_cli-0.4.3 → dumpling_cli-0.5.0}/MAINTENANCE.md +0 -0
  28. {dumpling_cli-0.4.3 → dumpling_cli-0.5.0}/assets/logo.svg +0 -0
  29. {dumpling_cli-0.4.3 → dumpling_cli-0.5.0}/book.toml +0 -0
  30. {dumpling_cli-0.4.3 → dumpling_cli-0.5.0}/datetime_out.sql +0 -0
  31. {dumpling_cli-0.4.3 → dumpling_cli-0.5.0}/datetime_sample.sql +0 -0
  32. {dumpling_cli-0.4.3 → dumpling_cli-0.5.0}/docs/src/SUMMARY.md +0 -0
  33. {dumpling_cli-0.4.3 → dumpling_cli-0.5.0}/docs/src/ci-guardrails.md +0 -0
  34. {dumpling_cli-0.4.3 → dumpling_cli-0.5.0}/docs/src/configuration.md +0 -0
  35. {dumpling_cli-0.4.3 → dumpling_cli-0.5.0}/docs/src/getting-started.md +0 -0
  36. {dumpling_cli-0.4.3 → dumpling_cli-0.5.0}/docs/src/index.md +0 -0
  37. {dumpling_cli-0.4.3 → dumpling_cli-0.5.0}/rust-toolchain.toml +0 -0
  38. {dumpling_cli-0.4.3 → dumpling_cli-0.5.0}/scripts/setup-dev.sh +0 -0
  39. {dumpling_cli-0.4.3 → dumpling_cli-0.5.0}/src/lint.rs +0 -0
  40. {dumpling_cli-0.4.3 → dumpling_cli-0.5.0}/src/report.rs +0 -0
  41. {dumpling_cli-0.4.3 → dumpling_cli-0.5.0}/src/scan.rs +0 -0
@@ -31,12 +31,12 @@ salt = "${DUMPLING_GLOBAL_SALT}"
31
31
  # Faker modules: https://docs.rs/fake/latest/fake/faker/index.html
32
32
  # Upstream repo: https://github.com/cksac/fake-rs
33
33
  [rules."public.users"]
34
- # email — fake email via Rust `fake` crate; force quoted string output
35
- email = { strategy = "faker", faker = "internet::SafeEmail", domain = "customer_identity", unique_within_domain = true }
36
- # name — locale-aware full name (see `locale`); other generators use `faker = "module::Type"`
37
- full_name = { strategy = "faker", faker = "name::Name" }
38
- first_name = { strategy = "faker", faker = "name::FirstName" }
39
- last_name = { strategy = "faker", faker = "name::LastName" }
34
+ # email — safe fake email (built-in); force quoted string output
35
+ email = { strategy = "email", domain = "customer_identity", unique_within_domain = true }
36
+ # name — locale-aware full name (see `locale`)
37
+ full_name = { strategy = "name" }
38
+ first_name = { strategy = "first_name" }
39
+ last_name = { strategy = "last_name" }
40
40
  # phone — US-style (xxx) xxx-xxxx
41
41
  phone = { strategy = "phone" }
42
42
  # ssn — SHA-256 hex of original; use per-column salt for extra protection
@@ -7,6 +7,21 @@ and this project follows [Semantic Versioning](https://semver.org/spec/v2.0.0.ht
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [0.5.0] - 2026-05-03
11
+
12
+ ### Added
13
+
14
+ - **First-class strategies** `email`, `name`, `first_name`, `last_name`, and `phone` in config (same generators as `faker = "internet::SafeEmail"`, `name::Name`, `name::FirstName`, `name::LastName`, and locale-aware phone). Strategy names are normalized to lowercase at load.
15
+
16
+ ### Changed
17
+
18
+ - **Random-path faker/phone/PII**: one reused `StdRng` on `AnonymizerRegistry` instead of re-seeding per cell.
19
+ - **`faker` locale resolution**: `resolved_locale_key` returns a `&'static str` for every supported locale (falling back to `en` when the locale is absent or unrecognized), so no `String` is allocated per faker call.
20
+
21
+ ### Performance
22
+
23
+ - Larger default I/O buffers; fewer per-line and per-row allocations on the SQL stream path (INSERT/COPY parsing and row filters).
24
+
10
25
  ## [0.4.3] - 2026-05-03
11
26
 
12
27
  ### Fixed
@@ -74,6 +89,7 @@ and this project follows [Semantic Versioning](https://semver.org/spec/v2.0.0.ht
74
89
  - Configurable output scan severities and per-category thresholds via `[output_scan]`.
75
90
  - JSON report section for output scan findings including category, count, threshold, severity, and sample locations.
76
91
 
92
+ [0.5.0]: https://github.com/ababic/dumpling/compare/v0.4.3...v0.5.0
77
93
  [0.4.3]: https://github.com/ababic/dumpling/compare/v0.4.2...v0.4.3
78
94
  [0.4.2]: https://github.com/ababic/dumpling/compare/v0.4.1...v0.4.2
79
95
  [0.4.1]: https://github.com/ababic/dumpling/compare/v0.4.0...v0.4.1
@@ -262,7 +262,7 @@ dependencies = [
262
262
 
263
263
  [[package]]
264
264
  name = "dumpling"
265
- version = "0.4.3"
265
+ version = "0.5.0"
266
266
  dependencies = [
267
267
  "anyhow",
268
268
  "chrono",
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "dumpling"
3
- version = "0.4.3"
3
+ version = "0.5.0"
4
4
  edition = "2021"
5
5
  readme = "README.md"
6
6
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dumpling-cli
3
- Version: 0.4.3
3
+ Version: 0.5.0
4
4
  Classifier: Development Status :: 4 - Beta
5
5
  Classifier: Environment :: Console
6
6
  Classifier: Intended Audience :: Developers
@@ -142,8 +142,8 @@ salt = "${DUMPLING_GLOBAL_SALT}"
142
142
 
143
143
  # Rules are keyed by either "table" or "schema.table"
144
144
  [rules."public.users"]
145
- email = { strategy = "faker", faker = "internet::SafeEmail", domain = "customer_identity", unique_within_domain = true }
146
- name = { strategy = "faker", faker = "name::Name", locale = "de_de" } # German-locale name
145
+ email = { strategy = "email", domain = "customer_identity", unique_within_domain = true }
146
+ name = { strategy = "name", locale = "de_de" } # German-locale name
147
147
  ssn = { strategy = "hash", salt = "${env:DUMPLING_USERS_SSN_SALT}", as_string = true } # SHA-256 of original (salted)
148
148
  age = { strategy = "int_range", min = 18, max = 90 }
149
149
 
@@ -183,8 +183,12 @@ token = "high"
183
183
  | `redact` | Replace with `REDACTED` (string) |
184
184
  | `uuid` | Random UUIDv4-like string |
185
185
  | `hash` | SHA-256 hex of original value; supports per-column `salt` and global `salt` |
186
- | `faker` | Values from the Rust [`fake`](https://crates.io/crates/fake) crate ([docs.rs](https://docs.rs/fake/latest/fake/), [`faker` modules](https://docs.rs/fake/latest/fake/faker/index.html)), chosen by a **string identifier** only (`faker = "module::Type"`, e.g. `internet::SafeEmail`). Config is **data only**: nothing from TOML is compiled or executed as Rust at runtime. Use `locale` for locale-aware generators; optional `min`/`max`, `length`, `format` as documented. Unsupported targets fail at config load. New generators require a **new Dumpling release** (or your own fork), not config-side code. |
186
+ | `email` | Safe email address (same generator as `faker = "internet::SafeEmail"`); supports `locale` |
187
+ | `name` | Full name (same as `faker = "name::Name"`); supports `locale` |
188
+ | `first_name` | First name (same as `faker = "name::FirstName"`); supports `locale` |
189
+ | `last_name` | Last name (same as `faker = "name::LastName"`); supports `locale` |
187
190
  | `phone` | Locale-aware fake phone number (configurable via `locale`); defaults to English format |
191
+ | `faker` | Values from the Rust [`fake`](https://crates.io/crates/fake) crate ([docs.rs](https://docs.rs/fake/latest/fake/), [`faker` modules](https://docs.rs/fake/latest/fake/faker/index.html)), chosen by a **string identifier** only (`faker = "module::Type"`, e.g. `internet::SafeEmail`). Config is **data only**: nothing from TOML is compiled or executed as Rust at runtime. Use `locale` for locale-aware generators; optional `min`/`max`, `length`, `format` as documented. Unsupported targets fail at config load. New generators require a **new Dumpling release** (or your own fork), not config-side code. |
188
192
  | `int_range` | Random integer in `[min, max]` |
189
193
  | `string` | Random alphanumeric string (`length = 12` by default) |
190
194
  | `date_fuzz` | Shifts a date by a random number of days in `[min_days, max_days]` (defaults: `-30..30`) |
@@ -244,7 +248,7 @@ dumpling --security-profile hardened --input dump.sql --check
244
248
  - `unique_within_domain`: when true, different source values are assigned unique pseudonyms within the configured `domain`. NULL values are unaffected and always remain NULL.
245
249
  - `min_days` / `max_days`: used by `date_fuzz`.
246
250
  - `min_seconds` / `max_seconds`: used by `time_fuzz` and `datetime_fuzz`.
247
- - `locale`: selects the language/regional format for the `faker` and `phone` strategies. Supported values: `en`, `fr_fr`, `de_de`, `it_it`, `pt_br`, `pt_pt`, `ar_sa`, `zh_cn`, `zh_tw`, `ja_jp`, `cy_gb`. Defaults to `en` when not specified.
251
+ - `locale`: selects the language/regional format for `email`, `name`, `first_name`, `last_name`, `faker`, and `phone`. Supported values: `en`, `fr_fr`, `de_de`, `it_it`, `pt_br`, `pt_pt`, `ar_sa`, `zh_cn`, `zh_tw`, `ja_jp`, `cy_gb`. Defaults to `en` when not specified.
248
252
  - `faker`: required when `strategy = "faker"`. A plain string `"module::Type"` (case-insensitive) that maps to a **built-in** generator compiled into Dumpling—not arbitrary Rust or expressions. Names follow [`fake::faker`](https://docs.rs/fake/latest/fake/faker/index.html) (e.g. `internet::SafeEmail` → `faker::internet::SafeEmail` in the crate).
249
253
  - `format`: used with `faker = "number::NumberWithFormat"`; pattern uses `#` (0–9) and `^` (1–9) per the [`fake` crate docs](https://docs.rs/fake/latest/fake/).
250
254
 
@@ -415,7 +419,7 @@ Define default strategies in `rules."<table>"` and add ordered per-column cases
415
419
  ```toml
416
420
  [rules."public.users"]
417
421
  email = { strategy = "hash", as_string = true } # default
418
- name = { strategy = "faker", faker = "name::Name" }
422
+ name = { strategy = "name" }
419
423
 
420
424
  [[column_cases."public.users".email]]
421
425
  when.any = [{ column = "is_admin", op = "eq", value = "true" }]
@@ -466,7 +470,7 @@ salt = "${DUMPLING_HMAC_KEY}"
466
470
 
467
471
  [rules."public.users"]
468
472
  ssn = { strategy = "hash", as_string = true }
469
- email = { strategy = "faker", faker = "internet::SafeEmail", domain = "users" }
473
+ email = { strategy = "email", domain = "users" }
470
474
  ```
471
475
 
472
476
  ```bash
@@ -121,8 +121,8 @@ salt = "${DUMPLING_GLOBAL_SALT}"
121
121
 
122
122
  # Rules are keyed by either "table" or "schema.table"
123
123
  [rules."public.users"]
124
- email = { strategy = "faker", faker = "internet::SafeEmail", domain = "customer_identity", unique_within_domain = true }
125
- name = { strategy = "faker", faker = "name::Name", locale = "de_de" } # German-locale name
124
+ email = { strategy = "email", domain = "customer_identity", unique_within_domain = true }
125
+ name = { strategy = "name", locale = "de_de" } # German-locale name
126
126
  ssn = { strategy = "hash", salt = "${env:DUMPLING_USERS_SSN_SALT}", as_string = true } # SHA-256 of original (salted)
127
127
  age = { strategy = "int_range", min = 18, max = 90 }
128
128
 
@@ -162,8 +162,12 @@ token = "high"
162
162
  | `redact` | Replace with `REDACTED` (string) |
163
163
  | `uuid` | Random UUIDv4-like string |
164
164
  | `hash` | SHA-256 hex of original value; supports per-column `salt` and global `salt` |
165
- | `faker` | Values from the Rust [`fake`](https://crates.io/crates/fake) crate ([docs.rs](https://docs.rs/fake/latest/fake/), [`faker` modules](https://docs.rs/fake/latest/fake/faker/index.html)), chosen by a **string identifier** only (`faker = "module::Type"`, e.g. `internet::SafeEmail`). Config is **data only**: nothing from TOML is compiled or executed as Rust at runtime. Use `locale` for locale-aware generators; optional `min`/`max`, `length`, `format` as documented. Unsupported targets fail at config load. New generators require a **new Dumpling release** (or your own fork), not config-side code. |
165
+ | `email` | Safe email address (same generator as `faker = "internet::SafeEmail"`); supports `locale` |
166
+ | `name` | Full name (same as `faker = "name::Name"`); supports `locale` |
167
+ | `first_name` | First name (same as `faker = "name::FirstName"`); supports `locale` |
168
+ | `last_name` | Last name (same as `faker = "name::LastName"`); supports `locale` |
166
169
  | `phone` | Locale-aware fake phone number (configurable via `locale`); defaults to English format |
170
+ | `faker` | Values from the Rust [`fake`](https://crates.io/crates/fake) crate ([docs.rs](https://docs.rs/fake/latest/fake/), [`faker` modules](https://docs.rs/fake/latest/fake/faker/index.html)), chosen by a **string identifier** only (`faker = "module::Type"`, e.g. `internet::SafeEmail`). Config is **data only**: nothing from TOML is compiled or executed as Rust at runtime. Use `locale` for locale-aware generators; optional `min`/`max`, `length`, `format` as documented. Unsupported targets fail at config load. New generators require a **new Dumpling release** (or your own fork), not config-side code. |
167
171
  | `int_range` | Random integer in `[min, max]` |
168
172
  | `string` | Random alphanumeric string (`length = 12` by default) |
169
173
  | `date_fuzz` | Shifts a date by a random number of days in `[min_days, max_days]` (defaults: `-30..30`) |
@@ -223,7 +227,7 @@ dumpling --security-profile hardened --input dump.sql --check
223
227
  - `unique_within_domain`: when true, different source values are assigned unique pseudonyms within the configured `domain`. NULL values are unaffected and always remain NULL.
224
228
  - `min_days` / `max_days`: used by `date_fuzz`.
225
229
  - `min_seconds` / `max_seconds`: used by `time_fuzz` and `datetime_fuzz`.
226
- - `locale`: selects the language/regional format for the `faker` and `phone` strategies. Supported values: `en`, `fr_fr`, `de_de`, `it_it`, `pt_br`, `pt_pt`, `ar_sa`, `zh_cn`, `zh_tw`, `ja_jp`, `cy_gb`. Defaults to `en` when not specified.
230
+ - `locale`: selects the language/regional format for `email`, `name`, `first_name`, `last_name`, `faker`, and `phone`. Supported values: `en`, `fr_fr`, `de_de`, `it_it`, `pt_br`, `pt_pt`, `ar_sa`, `zh_cn`, `zh_tw`, `ja_jp`, `cy_gb`. Defaults to `en` when not specified.
227
231
  - `faker`: required when `strategy = "faker"`. A plain string `"module::Type"` (case-insensitive) that maps to a **built-in** generator compiled into Dumpling—not arbitrary Rust or expressions. Names follow [`fake::faker`](https://docs.rs/fake/latest/fake/faker/index.html) (e.g. `internet::SafeEmail` → `faker::internet::SafeEmail` in the crate).
228
232
  - `format`: used with `faker = "number::NumberWithFormat"`; pattern uses `#` (0–9) and `^` (1–9) per the [`fake` crate docs](https://docs.rs/fake/latest/fake/).
229
233
 
@@ -394,7 +398,7 @@ Define default strategies in `rules."<table>"` and add ordered per-column cases
394
398
  ```toml
395
399
  [rules."public.users"]
396
400
  email = { strategy = "hash", as_string = true } # default
397
- name = { strategy = "faker", faker = "name::Name" }
401
+ name = { strategy = "name" }
398
402
 
399
403
  [[column_cases."public.users".email]]
400
404
  when.any = [{ column = "is_admin", op = "eq", value = "true" }]
@@ -445,7 +449,7 @@ salt = "${DUMPLING_HMAC_KEY}"
445
449
 
446
450
  [rules."public.users"]
447
451
  ssn = { strategy = "hash", as_string = true }
448
- email = { strategy = "faker", faker = "internet::SafeEmail", domain = "users" }
452
+ email = { strategy = "email", domain = "users" }
449
453
  ```
450
454
 
451
455
  ```bash
@@ -11,7 +11,7 @@ This project uses **tag-driven releases**.
11
11
  ## Maintainer checklist
12
12
 
13
13
  1. Ensure `main` is green in CI.
14
- 2. Update `Cargo.toml` version and `CHANGELOG.md`.
14
+ 2. Update `Cargo.toml` and `pyproject.toml` versions and `CHANGELOG.md`.
15
15
  3. Open and merge a release preparation PR.
16
16
  4. Create and push a tag from `main`:
17
17
 
@@ -4,7 +4,7 @@ build-backend = "maturin"
4
4
 
5
5
  [project]
6
6
  name = "dumpling-cli"
7
- version = "0.4.3"
7
+ version = "0.5.0"
8
8
  description = "Static anonymizer for plain SQL dumps (PostgreSQL, SQLite, SQL Server)."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
@@ -89,6 +89,71 @@ pub fn parse_faker_path(faker: &str) -> Option<(&str, &str)> {
89
89
  Some((module, typ))
90
90
  }
91
91
 
92
+ /// Normalized locale key for `faker`, `phone`, and built-in PII strategies (`email`, `name`, …).
93
+ /// Uses ASCII case-insensitive matching without allocating.
94
+ pub fn resolved_locale_key(spec: &AnonymizerSpec) -> &'static str {
95
+ let s = spec.locale.as_deref().map(str::trim).unwrap_or("");
96
+ if s.is_empty() || s.eq_ignore_ascii_case("en") {
97
+ return "en";
98
+ }
99
+ if s.eq_ignore_ascii_case("fr_fr") {
100
+ return "fr_fr";
101
+ }
102
+ if s.eq_ignore_ascii_case("de_de") {
103
+ return "de_de";
104
+ }
105
+ if s.eq_ignore_ascii_case("it_it") {
106
+ return "it_it";
107
+ }
108
+ if s.eq_ignore_ascii_case("pt_br") {
109
+ return "pt_br";
110
+ }
111
+ if s.eq_ignore_ascii_case("pt_pt") {
112
+ return "pt_pt";
113
+ }
114
+ if s.eq_ignore_ascii_case("ar_sa") {
115
+ return "ar_sa";
116
+ }
117
+ if s.eq_ignore_ascii_case("zh_cn") {
118
+ return "zh_cn";
119
+ }
120
+ if s.eq_ignore_ascii_case("zh_tw") {
121
+ return "zh_tw";
122
+ }
123
+ if s.eq_ignore_ascii_case("ja_jp") {
124
+ return "ja_jp";
125
+ }
126
+ if s.eq_ignore_ascii_case("cy_gb") {
127
+ return "cy_gb";
128
+ }
129
+ "en"
130
+ }
131
+
132
+ /// Built-in `strategy = "email"` — same generator as `faker = "internet::SafeEmail"`.
133
+ pub fn pii_safe_email(loc: &str, rng: &mut StdRng) -> String {
134
+ fl!(loc, rng, SafeEmail)
135
+ }
136
+
137
+ /// Built-in `strategy = "name"` — full name.
138
+ pub fn pii_full_name(loc: &str, rng: &mut StdRng) -> String {
139
+ fl!(loc, rng, Name)
140
+ }
141
+
142
+ /// Built-in `strategy = "first_name"`.
143
+ pub fn pii_first_name(loc: &str, rng: &mut StdRng) -> String {
144
+ fl!(loc, rng, FirstName)
145
+ }
146
+
147
+ /// Built-in `strategy = "last_name"`.
148
+ pub fn pii_last_name(loc: &str, rng: &mut StdRng) -> String {
149
+ fl!(loc, rng, LastName)
150
+ }
151
+
152
+ /// Built-in `strategy = "phone"` — same generator as `faker` phone_number fakers.
153
+ pub fn pii_phone_number(loc: &str, rng: &mut StdRng) -> String {
154
+ fl!(loc, rng, PhoneNumber)
155
+ }
156
+
92
157
  pub fn faker_string_with_rng(spec: &AnonymizerSpec, rng: &mut StdRng) -> Option<String> {
93
158
  let faker = spec.faker.as_deref()?.trim();
94
159
  if faker.is_empty() {
@@ -97,12 +162,7 @@ pub fn faker_string_with_rng(spec: &AnonymizerSpec, rng: &mut StdRng) -> Option<
97
162
  let (module, typ) = parse_faker_path(faker)?;
98
163
  let module_lc = module.to_ascii_lowercase();
99
164
  let typ_lc = typ.to_ascii_lowercase();
100
- let locale = spec
101
- .locale
102
- .as_deref()
103
- .map(|l| l.trim().to_ascii_lowercase())
104
- .unwrap_or_else(|| "en".to_string());
105
- let loc = locale.as_str();
165
+ let loc = resolved_locale_key(spec);
106
166
 
107
167
  let s: String = match (module_lc.as_str(), typ_lc.as_str()) {
108
168
  ("name", "firstname") => fl!(loc, rng, FirstName),
@@ -13,7 +13,7 @@ pub fn should_keep_row(
13
13
  schema: Option<&str>,
14
14
  table: &str,
15
15
  columns: &[String],
16
- cells: &[Option<String>], // unescaped strings; None for NULL
16
+ cells: &[Option<&str>], // unescaped strings; None for NULL
17
17
  ) -> bool {
18
18
  let set = match lookup_row_filters(cfg, schema, table) {
19
19
  Some(s) => s,
@@ -43,7 +43,7 @@ pub fn should_keep_row(
43
43
  true
44
44
  }
45
45
 
46
- fn predicate_matches(pred: &Predicate, columns: &[String], cells: &[Option<String>]) -> bool {
46
+ fn predicate_matches(pred: &Predicate, columns: &[String], cells: &[Option<&str>]) -> bool {
47
47
  let targets = match extract_predicate_targets(pred, columns, cells) {
48
48
  Some(values) => values,
49
49
  None => return false, // top-level column missing -> does not match
@@ -121,13 +121,14 @@ fn predicate_matches(pred: &Predicate, columns: &[String], cells: &[Option<Strin
121
121
  fn extract_predicate_targets(
122
122
  pred: &Predicate,
123
123
  columns: &[String],
124
- cells: &[Option<String>],
124
+ cells: &[Option<&str>],
125
125
  ) -> Option<Vec<Option<String>>> {
126
126
  if let Some(i) = columns
127
127
  .iter()
128
128
  .position(|c| c.eq_ignore_ascii_case(&pred.column))
129
129
  {
130
- return Some(vec![cells.get(i).cloned().unwrap_or(None)]);
130
+ let cell = cells.get(i).copied().flatten();
131
+ return Some(vec![cell.map(|s| s.to_string())]);
131
132
  }
132
133
 
133
134
  let (base_column, path) = parse_json_column_key(&pred.column);
@@ -413,7 +414,7 @@ where
413
414
  Ok(())
414
415
  }
415
416
 
416
- pub fn when_matches(when: &When, columns: &[String], cells: &[Option<String>]) -> bool {
417
+ pub fn when_matches(when: &When, columns: &[String], cells: &[Option<&str>]) -> bool {
417
418
  // If any is non-empty, require at least one to match
418
419
  if !when.any.is_empty() {
419
420
  let mut matched_any = false;
@@ -603,11 +604,7 @@ mod tests {
603
604
  Some("public"),
604
605
  "users",
605
606
  &cols,
606
- &[
607
- Some("1".to_string()),
608
- Some("alice@myco.com".to_string()),
609
- Some("US".to_string())
610
- ]
607
+ &[Some("1"), Some("alice@myco.com"), Some("US")]
611
608
  ));
612
609
  // Case-insensitive keep (iregex)
613
610
  assert!(should_keep_row(
@@ -615,11 +612,7 @@ mod tests {
615
612
  Some("public"),
616
613
  "users",
617
614
  &cols,
618
- &[
619
- Some("2".to_string()),
620
- Some("Carol@MYCO.COM".to_string()),
621
- Some("GB".to_string())
622
- ]
615
+ &[Some("2"), Some("Carol@MYCO.COM"), Some("GB")]
623
616
  ));
624
617
  // Delete example.com
625
618
  assert!(!should_keep_row(
@@ -627,11 +620,7 @@ mod tests {
627
620
  Some("public"),
628
621
  "users",
629
622
  &cols,
630
- &[
631
- Some("3".to_string()),
632
- Some("bob@example.com".to_string()),
633
- Some("US".to_string())
634
- ]
623
+ &[Some("3"), Some("bob@example.com"), Some("US")]
635
624
  ));
636
625
  }
637
626
 
@@ -668,14 +657,14 @@ mod tests {
668
657
  Some("public"),
669
658
  "events",
670
659
  &cols,
671
- &[Some(r#"{"profile":{"tier":"gold"}}"#.to_string())]
660
+ &[Some(r#"{"profile":{"tier":"gold"}}"#)]
672
661
  ));
673
662
  assert!(!should_keep_row(
674
663
  &cfg,
675
664
  Some("public"),
676
665
  "events",
677
666
  &cols,
678
- &[Some(r#"{"profile":{"tier":"silver"}}"#.to_string())]
667
+ &[Some(r#"{"profile":{"tier":"silver"}}"#)]
679
668
  ));
680
669
  }
681
670
 
@@ -713,7 +702,7 @@ mod tests {
713
702
  "events",
714
703
  &cols,
715
704
  &[Some(
716
- r#"{"items":[{"kind":"secondary"},{"kind":"primary"}]}"#.to_string()
705
+ r#"{"items":[{"kind":"secondary"},{"kind":"primary"}]}"#
717
706
  )]
718
707
  ));
719
708
  assert!(!should_keep_row(
@@ -721,7 +710,7 @@ mod tests {
721
710
  Some("public"),
722
711
  "events",
723
712
  &cols,
724
- &[Some(r#"{"items":[{"kind":"secondary"}]}"#.to_string())]
713
+ &[Some(r#"{"items":[{"kind":"secondary"}]}"#)]
725
714
  ));
726
715
  }
727
716
 
@@ -1,5 +1,8 @@
1
1
  use std::fs::File;
2
2
  use std::io::{self, BufRead, BufReader, BufWriter, Write};
3
+
4
+ /// Larger than default 8 KiB to reduce syscall overhead on big dumps.
5
+ const IO_BUF_CAPACITY: usize = 256 * 1024;
3
6
  use std::path::{Path, PathBuf};
4
7
  use std::process::{Command, Stdio};
5
8
 
@@ -343,7 +346,10 @@ fn run_anonymize(cli: Cli) -> anyhow::Result<()> {
343
346
  .take()
344
347
  .ok_or_else(|| anyhow::anyhow!("pg_restore stdout missing"))?;
345
348
  pg_restore_child = Some(child);
346
- (Box::new(BufReader::new(stdout)), Some(archive_path.clone()))
349
+ (
350
+ Box::new(BufReader::with_capacity(IO_BUF_CAPACITY, stdout)),
351
+ Some(archive_path.clone()),
352
+ )
347
353
  } else {
348
354
  match &cli.input {
349
355
  Some(path) => {
@@ -360,13 +366,19 @@ fn run_anonymize(cli: Cli) -> anyhow::Result<()> {
360
366
  );
361
367
  }
362
368
  let f = File::open(path)?;
363
- (Box::new(BufReader::new(f)), Some(path.clone()))
369
+ (
370
+ Box::new(BufReader::with_capacity(IO_BUF_CAPACITY, f)),
371
+ Some(path.clone()),
372
+ )
364
373
  }
365
374
  None => {
366
375
  if !cli.allow_ext.is_empty() {
367
376
  eprintln!("dumpling: --allow-ext provided but no --input file; extension check is ignored for stdin");
368
377
  }
369
- (Box::new(BufReader::new(io::stdin())), None)
378
+ (
379
+ Box::new(BufReader::with_capacity(IO_BUF_CAPACITY, io::stdin())),
380
+ None,
381
+ )
370
382
  }
371
383
  }
372
384
  };
@@ -380,9 +392,15 @@ fn run_anonymize(cli: Cli) -> anyhow::Result<()> {
380
392
  .ok_or_else(|| anyhow::anyhow!("--in-place requires an --input path"))?;
381
393
  let mut tmp = input_path.clone();
382
394
  tmp.set_extension("sql.dumpling.tmp");
383
- Box::new(BufWriter::new(File::create(&tmp)?))
395
+ Box::new(BufWriter::with_capacity(
396
+ IO_BUF_CAPACITY,
397
+ File::create(&tmp)?,
398
+ ))
384
399
  } else if let Some(path) = &cli.output {
385
- Box::new(BufWriter::new(File::create(path)?))
400
+ Box::new(BufWriter::with_capacity(
401
+ IO_BUF_CAPACITY,
402
+ File::create(path)?,
403
+ ))
386
404
  } else {
387
405
  Box::new(BufWriter::new(io::stdout()))
388
406
  };
@@ -32,7 +32,7 @@ pub struct RawConfig {
32
32
 
33
33
  #[derive(Debug, Clone, Deserialize)]
34
34
  pub struct AnonymizerSpec {
35
- /// Strategy name: redact|null|uuid|hash|faker|phone|int_range|string|date_fuzz|time_fuzz|datetime_fuzz
35
+ /// Strategy name: redact|null|uuid|hash|email|name|first_name|last_name|phone|faker|int_range|string|date_fuzz|time_fuzz|datetime_fuzz
36
36
  pub strategy: String,
37
37
  /// if strategy=hash: optional per-column salt override; otherwise ignored
38
38
  pub salt: Option<String>,
@@ -56,11 +56,12 @@ pub struct AnonymizerSpec {
56
56
  /// Force the replacement to be rendered as a SQL string literal
57
57
  /// If unset, we attempt to preserve the original quoting style.
58
58
  pub as_string: Option<bool>,
59
- /// Locale for locale-aware strategies: `faker` (passed to the `fake` crate) and `phone`.
59
+ /// Locale for locale-aware strategies: built-in PII (`email`, `name`, `first_name`, `last_name`), `faker`, and `phone`.
60
60
  /// Supported values: en, fr_fr, de_de, it_it, pt_br, pt_pt, ar_sa, zh_cn, zh_tw, ja_jp, cy_gb.
61
61
  /// Defaults to "en" when not specified.
62
62
  pub locale: Option<String>,
63
63
  /// When `strategy = "faker"`, selects the `fake` generator as `"module::Type"` (e.g. `internet::SafeEmail`, `name::FirstName`).
64
+ /// Only read when `strategy = "faker"` (other strategies must not set `faker`).
64
65
  /// See [`fake::faker`](https://docs.rs/fake/latest/fake/faker/index.html) and the [crate docs](https://docs.rs/fake/latest/fake/).
65
66
  #[serde(default)]
66
67
  pub faker: Option<String>,
@@ -448,31 +449,6 @@ fn is_simple_key(key: &str) -> bool {
448
449
  .all(|ch| ch.is_ascii_alphanumeric() || ch == '_')
449
450
  }
450
451
 
451
- /// Map removed built-in strategies to `faker` + `faker` path for backwards compatibility.
452
- fn normalize_anonymizer_spec(mut spec: AnonymizerSpec) -> AnonymizerSpec {
453
- let s = spec.strategy.to_ascii_lowercase();
454
- match s.as_str() {
455
- "email" => {
456
- spec.strategy = "faker".to_string();
457
- spec.faker = Some("internet::SafeEmail".to_string());
458
- }
459
- "name" => {
460
- spec.strategy = "faker".to_string();
461
- spec.faker = Some("name::Name".to_string());
462
- }
463
- "first_name" => {
464
- spec.strategy = "faker".to_string();
465
- spec.faker = Some("name::FirstName".to_string());
466
- }
467
- "last_name" => {
468
- spec.strategy = "faker".to_string();
469
- spec.faker = Some("name::LastName".to_string());
470
- }
471
- _ => {}
472
- }
473
- spec
474
- }
475
-
476
452
  fn resolve(raw: RawConfig, source_path: Option<PathBuf>) -> ResolvedConfig {
477
453
  let RawConfig {
478
454
  salt,
@@ -497,8 +473,9 @@ fn resolve(raw: RawConfig, source_path: Option<PathBuf>) -> ResolvedConfig {
497
473
  for (table_key, cols) in rules.into_iter() {
498
474
  let table_key_norm = table_key.to_lowercase();
499
475
  let mut col_map: HashMap<String, AnonymizerSpec> = HashMap::new();
500
- for (col, spec) in cols.into_iter() {
501
- col_map.insert(col.to_lowercase(), normalize_anonymizer_spec(spec));
476
+ for (col, mut spec) in cols.into_iter() {
477
+ spec.strategy = spec.strategy.to_ascii_lowercase();
478
+ col_map.insert(col.to_lowercase(), spec);
502
479
  }
503
480
  normalized_rules.insert(table_key_norm, col_map);
504
481
  }
@@ -514,7 +491,7 @@ fn resolve(raw: RawConfig, source_path: Option<PathBuf>) -> ResolvedConfig {
514
491
  let cases: Vec<ColumnCase> = cases
515
492
  .into_iter()
516
493
  .map(|mut c| {
517
- c.strategy = normalize_anonymizer_spec(c.strategy.clone());
494
+ c.strategy.strategy = c.strategy.strategy.to_ascii_lowercase();
518
495
  c
519
496
  })
520
497
  .collect();
@@ -570,8 +547,12 @@ const KNOWN_STRATEGIES: &[&str] = &[
570
547
  "redact",
571
548
  "uuid",
572
549
  "hash",
573
- "faker",
550
+ "email",
551
+ "name",
552
+ "first_name",
553
+ "last_name",
574
554
  "phone",
555
+ "faker",
575
556
  "int_range",
576
557
  "string",
577
558
  "date_fuzz",
@@ -673,10 +654,8 @@ fn is_valid_severity(value: &str) -> bool {
673
654
  }
674
655
 
675
656
  fn validate_anonymizer_spec(spec: &AnonymizerSpec, path: &str) -> anyhow::Result<()> {
676
- // Legacy strategy names (`email`, `name`, …) normalize to `faker` during `resolve()`; apply the
677
- // same mapping here so file validation matches runtime behavior.
678
- let spec = normalize_anonymizer_spec(spec.clone());
679
- let strategy = spec.strategy.as_str();
657
+ let strategy = spec.strategy.to_ascii_lowercase();
658
+ let strategy = strategy.as_str();
680
659
  if !KNOWN_STRATEGIES.contains(&strategy) {
681
660
  anyhow::bail!(
682
661
  "{}.strategy has unknown strategy '{}'; expected one of {}",
@@ -739,7 +718,12 @@ fn validate_anonymizer_spec(spec: &AnonymizerSpec, path: &str) -> anyhow::Result
739
718
  unsupported.push("max_seconds");
740
719
  }
741
720
  }
742
- if spec.locale.is_some() && !matches!(strategy, "faker" | "phone") {
721
+ if spec.locale.is_some()
722
+ && !matches!(
723
+ strategy,
724
+ "faker" | "phone" | "email" | "name" | "first_name" | "last_name"
725
+ )
726
+ {
743
727
  unsupported.push("locale");
744
728
  }
745
729
 
@@ -843,7 +827,7 @@ fn validate_anonymizer_spec(spec: &AnonymizerSpec, path: &str) -> anyhow::Result
843
827
  faker
844
828
  );
845
829
  }
846
- if !crate::faker_dispatch::faker_path_supported(&spec) {
830
+ if !crate::faker_dispatch::faker_path_supported(spec) {
847
831
  anyhow::bail!(
848
832
  "{}.faker {:?} is not a supported generator; see README and \
849
833
  https://docs.rs/fake/latest/fake/faker/index.html for upstream module names. \
@@ -853,6 +837,7 @@ fn validate_anonymizer_spec(spec: &AnonymizerSpec, path: &str) -> anyhow::Result
853
837
  );
854
838
  }
855
839
  }
840
+ "email" | "name" | "first_name" | "last_name" | "phone" => {}
856
841
  _ => {}
857
842
  }
858
843
 
@@ -1633,7 +1618,7 @@ salt = "${vault:secret/dumpling#key}"
1633
1618
  let path = write_temp_config(
1634
1619
  r#"
1635
1620
  [rules."public.users"]
1636
- full_name = { strategy = "faker", faker = "name::Name", locale = "de_de" }
1621
+ full_name = { strategy = "name", locale = "de_de" }
1637
1622
  "#,
1638
1623
  );
1639
1624
  let cfg = load_config(Some(&path), false).expect("locale=de_de should be valid");
@@ -1643,8 +1628,8 @@ full_name = { strategy = "faker", faker = "name::Name", locale = "de_de" }
1643
1628
  .and_then(|c| c.get("full_name"))
1644
1629
  .expect("expected full_name rule");
1645
1630
  assert_eq!(spec.locale.as_deref(), Some("de_de"));
1646
- assert_eq!(spec.strategy, "faker");
1647
- assert_eq!(spec.faker.as_deref(), Some("name::Name"));
1631
+ assert_eq!(spec.strategy, "name");
1632
+ assert!(spec.faker.is_none());
1648
1633
  let _ = fs::remove_file(path);
1649
1634
  }
1650
1635
 
@@ -1682,7 +1667,7 @@ full_name = { strategy = "faker", faker = "name::Name", locale = "klingon" }
1682
1667
  }
1683
1668
 
1684
1669
  #[test]
1685
- fn locale_on_non_name_phone_strategy_fails_validation() {
1670
+ fn locale_on_non_locale_strategy_fails_validation() {
1686
1671
  let path = write_temp_config(
1687
1672
  r#"
1688
1673
  [rules."public.users"]