dumpling-cli 0.2.0__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. {dumpling_cli-0.2.0 → dumpling_cli-0.3.0}/.dumplingconf.example +11 -7
  2. dumpling_cli-0.3.0/.github/workflows/docs-pr.yml +35 -0
  3. {dumpling_cli-0.2.0 → dumpling_cli-0.3.0}/.github/workflows/docs.yml +9 -11
  4. {dumpling_cli-0.2.0 → dumpling_cli-0.3.0}/.gitignore +1 -0
  5. {dumpling_cli-0.2.0 → dumpling_cli-0.3.0}/AGENTS.md +17 -1
  6. {dumpling_cli-0.2.0 → dumpling_cli-0.3.0}/CHANGELOG.md +14 -0
  7. {dumpling_cli-0.2.0 → dumpling_cli-0.3.0}/CONTRIBUTING.md +8 -0
  8. {dumpling_cli-0.2.0 → dumpling_cli-0.3.0}/Cargo.lock +1 -1
  9. {dumpling_cli-0.2.0 → dumpling_cli-0.3.0}/Cargo.toml +1 -1
  10. {dumpling_cli-0.2.0 → dumpling_cli-0.3.0}/PKG-INFO +15 -8
  11. {dumpling_cli-0.2.0 → dumpling_cli-0.3.0}/README.md +14 -7
  12. {dumpling_cli-0.2.0 → dumpling_cli-0.3.0}/docs/src/ci-guardrails.md +1 -1
  13. {dumpling_cli-0.2.0 → dumpling_cli-0.3.0}/docs/src/configuration.md +21 -2
  14. {dumpling_cli-0.2.0 → dumpling_cli-0.3.0}/docs/src/getting-started.md +3 -1
  15. {dumpling_cli-0.2.0 → dumpling_cli-0.3.0}/docs/src/index.md +3 -3
  16. {dumpling_cli-0.2.0 → dumpling_cli-0.3.0}/pyproject.toml +1 -1
  17. dumpling_cli-0.3.0/rust-toolchain.toml +3 -0
  18. dumpling_cli-0.3.0/scripts/setup-dev.sh +89 -0
  19. dumpling_cli-0.3.0/src/faker_dispatch.rs +521 -0
  20. {dumpling_cli-0.2.0 → dumpling_cli-0.3.0}/src/filter.rs +115 -41
  21. {dumpling_cli-0.2.0 → dumpling_cli-0.3.0}/src/lint.rs +6 -0
  22. {dumpling_cli-0.2.0 → dumpling_cli-0.3.0}/src/main.rs +1 -0
  23. {dumpling_cli-0.2.0 → dumpling_cli-0.3.0}/src/scan.rs +1 -0
  24. {dumpling_cli-0.2.0 → dumpling_cli-0.3.0}/src/settings.rs +245 -29
  25. {dumpling_cli-0.2.0 → dumpling_cli-0.3.0}/src/sql.rs +276 -95
  26. {dumpling_cli-0.2.0 → dumpling_cli-0.3.0}/src/transform.rs +48 -119
  27. {dumpling_cli-0.2.0 → dumpling_cli-0.3.0}/.github/workflows/ci.yml +0 -0
  28. {dumpling_cli-0.2.0 → dumpling_cli-0.3.0}/.github/workflows/platform-compat-latest.yml +0 -0
  29. {dumpling_cli-0.2.0 → dumpling_cli-0.3.0}/.github/workflows/platform-compat-matrix.yml +0 -0
  30. {dumpling_cli-0.2.0 → dumpling_cli-0.3.0}/.github/workflows/policy-lint.yml +0 -0
  31. {dumpling_cli-0.2.0 → dumpling_cli-0.3.0}/.github/workflows/publish.yml +0 -0
  32. {dumpling_cli-0.2.0 → dumpling_cli-0.3.0}/.github/workflows/release.yml +0 -0
  33. {dumpling_cli-0.2.0 → dumpling_cli-0.3.0}/.github/workflows/tests.yml +0 -0
  34. {dumpling_cli-0.2.0 → dumpling_cli-0.3.0}/MAINTENANCE.md +0 -0
  35. {dumpling_cli-0.2.0 → dumpling_cli-0.3.0}/book.toml +0 -0
  36. {dumpling_cli-0.2.0 → dumpling_cli-0.3.0}/datetime_out.sql +0 -0
  37. {dumpling_cli-0.2.0 → dumpling_cli-0.3.0}/datetime_sample.sql +0 -0
  38. {dumpling_cli-0.2.0 → dumpling_cli-0.3.0}/docs/src/SUMMARY.md +0 -0
  39. {dumpling_cli-0.2.0 → dumpling_cli-0.3.0}/docs/src/releasing.md +0 -0
  40. {dumpling_cli-0.2.0 → dumpling_cli-0.3.0}/src/report.rs +0 -0
@@ -26,13 +26,17 @@ salt = "${DUMPLING_GLOBAL_SALT}"
26
26
  #
27
27
  # Each column maps to an anonymizer spec: { strategy = "…", <options> }
28
28
  # ---------------------------------------------------------------------------
29
+ # Faker strategy: `faker = "module::Type"` matches the Rust `fake` crate layout.
30
+ # Crate docs: https://docs.rs/fake/latest/fake/
31
+ # Faker modules: https://docs.rs/fake/latest/fake/faker/index.html
32
+ # Upstream repo: https://github.com/cksac/fake-rs
29
33
  [rules."public.users"]
30
- # email — random-looking email at example.com; force quoted string output
31
- email = { strategy = "email", domain = "customer_identity", unique_within_domain = true }
32
- # name — random placeholder full name
33
- full_name = { strategy = "name" }
34
- first_name = { strategy = "first_name" }
35
- last_name = { strategy = "last_name" }
34
+ # email — fake email via Rust `fake` crate; force quoted string output
35
+ email = { strategy = "faker", faker = "internet::SafeEmail", domain = "customer_identity", unique_within_domain = true }
36
+ # name — locale-aware full name (see `locale`); other generators use `faker = "module::Type"`
37
+ full_name = { strategy = "faker", faker = "name::Name" }
38
+ first_name = { strategy = "faker", faker = "name::FirstName" }
39
+ last_name = { strategy = "faker", faker = "name::LastName" }
36
40
  # phone — US-style (xxx) xxx-xxxx
37
41
  phone = { strategy = "phone" }
38
42
  # ssn — SHA-256 hex of original; use per-column salt for extra protection
@@ -58,7 +62,7 @@ wake_time = { strategy = "time_fuzz", min_seconds = -3600, max_seconds = 360
58
62
  # credit card — redact entirely; force as quoted string
59
63
  credit_card = { strategy = "redact", as_string = true }
60
64
  # keep the same anonymized email as users table via shared domain
61
- customer_email = { strategy = "email", domain = "customer_identity" }
65
+ customer_email = { strategy = "faker", faker = "internet::SafeEmail", domain = "customer_identity" }
62
66
 
63
67
  [rules."public.audit_log"]
64
68
  # unqualified table name also works (matches any schema)
@@ -0,0 +1,35 @@
1
+ # mdBook verification on pull requests only (no GitHub Pages upload or deploy).
2
+ # Pages build + deploy live in docs.yml and run on pushes to main.
3
+ name: Docs (PR)
4
+
5
+ on:
6
+ pull_request:
7
+ paths:
8
+ - "README.md"
9
+ - "book.toml"
10
+ - "docs/**"
11
+ - ".github/workflows/docs.yml"
12
+ - ".github/workflows/docs-pr.yml"
13
+
14
+ permissions:
15
+ contents: read
16
+
17
+ concurrency:
18
+ group: docs-pr-${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
19
+ cancel-in-progress: true
20
+
21
+ jobs:
22
+ verify:
23
+ name: Build mdBook (verify)
24
+ runs-on: ubuntu-latest
25
+ steps:
26
+ - name: Checkout
27
+ uses: actions/checkout@v4
28
+
29
+ - name: Install mdBook
30
+ uses: peaceiris/actions-mdbook@v2
31
+ with:
32
+ mdbook-version: "0.4.52"
33
+
34
+ - name: Build documentation site
35
+ run: mdbook build
@@ -1,12 +1,8 @@
1
+ # Build and deploy the mdBook site to GitHub Pages (main branch only).
2
+ # Pull-request verification runs in docs-pr.yml — this workflow does not run on PRs.
1
3
  name: Docs
2
4
 
3
5
  on:
4
- pull_request:
5
- paths:
6
- - "README.md"
7
- - "book.toml"
8
- - "docs/**"
9
- - ".github/workflows/docs.yml"
10
6
  push:
11
7
  branches:
12
8
  - main
@@ -18,16 +14,16 @@ on:
18
14
 
19
15
  permissions:
20
16
  contents: read
21
- pages: write
22
- id-token: write
23
17
 
24
18
  concurrency:
25
- group: docs-${{ github.ref }}
19
+ group: docs-pages-${{ github.ref }}
26
20
  cancel-in-progress: true
27
21
 
28
22
  jobs:
29
23
  build:
30
24
  runs-on: ubuntu-latest
25
+ permissions:
26
+ contents: read
31
27
  steps:
32
28
  - name: Checkout
33
29
  uses: actions/checkout@v4
@@ -40,15 +36,17 @@ jobs:
40
36
  - name: Build documentation site
41
37
  run: mdbook build
42
38
 
43
- - name: Upload docs artifact
39
+ - name: Upload Pages deployment artifact
44
40
  uses: actions/upload-pages-artifact@v3
45
41
  with:
46
42
  path: docs/book
47
43
 
48
44
  deploy:
49
- if: github.event_name == 'push' && github.ref == 'refs/heads/main'
50
45
  needs: build
51
46
  runs-on: ubuntu-latest
47
+ permissions:
48
+ pages: write
49
+ id-token: write
52
50
  environment:
53
51
  name: github-pages
54
52
  url: ${{ steps.deployment.outputs.page_url }}
@@ -1,2 +1,3 @@
1
1
  /target/
2
2
  /docs/book/
3
+ /.tools/
@@ -223,6 +223,8 @@ Follow these steps in order. Do not skip any step.
223
223
 
224
224
  8. **`README.md`**: Add a row to the "Anonymization strategies" table.
225
225
 
226
+ **`faker` strategy:** Config only carries string identifiers; Dumpling never evaluates user Rust from config. To ship a new generator, add dispatch in `src/faker_dispatch.rs` and validation in `validate_anonymizer_spec` for the `faker` branch. Upstream reference: [`fake` on docs.rs](https://docs.rs/fake/latest/fake/), [`fake::faker` module index](https://docs.rs/fake/latest/fake/faker/index.html), [source on GitHub](https://github.com/cksac/fake-rs).
227
+
226
228
  ---
227
229
 
228
230
  ## How to Add a New Row Filter Predicate Operator
@@ -274,15 +276,29 @@ Follow these steps in order. Do not skip any step.
274
276
 
275
277
  This is a pure Rust CLI project with **no external services** (no database, Docker, or network dependencies). The Rust stable toolchain (rustc + cargo) is the only prerequisite.
276
278
 
279
+ ### One-shot environment (agents and humans)
280
+
281
+ From the repository root:
282
+
283
+ ```bash
284
+ ./scripts/setup-dev.sh
285
+ ```
286
+
287
+ This installs the **stable** toolchain with **rustfmt** and **clippy** (via `rustup` when available), runs **`cargo fetch`**, and installs a pinned **mdBook** binary under `.tools/` (same version as the Docs CI workflow) so you can run `mdbook build` without a global install. Add `.tools` to `PATH` for convenience, or invoke `.tools/mdbook build` directly.
288
+
289
+ The repo root **`rust-toolchain.toml`** pins **stable** and the **components** CI uses, so `cargo` automatically selects the right toolchain in fresh checkouts.
290
+
277
291
  ### Quick reference
278
292
 
279
293
  | Task | Command |
280
294
  |------|---------|
295
+ | Setup (toolchain + fetch + mdbook) | `./scripts/setup-dev.sh` |
281
296
  | Build | `cargo build` |
282
297
  | Test | `cargo test --all-targets --all-features` |
283
298
  | Lint | `cargo clippy --all-targets --all-features` |
284
299
  | Format check | `cargo fmt --all -- --check` |
285
300
  | Auto-format | `cargo fmt` |
301
+ | Docs site (mdBook) | `mdbook build` or `.tools/mdbook build` after setup |
286
302
  | Run CLI | `./target/debug/dumpling --help` |
287
303
 
288
304
  ### Running the CLI
@@ -295,6 +311,6 @@ Dumpling is fail-closed by default — it exits non-zero without a config file.
295
311
 
296
312
  ### Notes
297
313
 
298
- - All 94 tests are inline `#[cfg(test)]` modules; there are no separate test files or fixtures to manage.
314
+ - All tests are inline `#[cfg(test)]` modules; there are no separate test files or fixtures to manage.
299
315
  - The update script uses `cargo fetch` to pre-download crate dependencies. A full `cargo build` or `cargo test` will then compile from the local cache without network access.
300
316
  - No environment variables or secrets are required for building, testing, or running the CLI locally.
@@ -7,6 +7,19 @@ and this project follows [Semantic Versioning](https://semver.org/spec/v2.0.0.ht
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [0.3.0] - 2026-05-02
11
+
12
+ ### Added
13
+
14
+ - **`faker` anonymization strategy** backed by the Rust [`fake`](https://crates.io/crates/fake) crate: select generators with `faker = "module::Type"` (for example `internet::SafeEmail`, `name::Name`). Unsupported targets fail at config load with a clear error; extending the allowlist requires a Dumpling release (see `src/faker_dispatch.rs`).
15
+ - **JSON path rules in `[rules]`**: column keys such as `payload.profile.email` or `payload__profile__email` apply strategies to nested fields inside JSON text columns while preserving document structure. Conflicts between a whole-column rule and JSON path rules for the same base column are rejected at validation.
16
+ - **`format` on `AnonymizerSpec`** for pattern-based faker generators such as `number::NumberWithFormat`.
17
+
18
+ ### Changed
19
+
20
+ - **Legacy strategy names** `email`, `name`, `first_name`, and `last_name` in config are normalized at load time to `strategy = "faker"` with the same defaults as before (`internet::SafeEmail`, `name::Name`, `name::FirstName`, `name::LastName`), so existing configs keep working.
21
+ - **`locale`** applies to both `faker` and `phone` strategies.
22
+
10
23
  ## [0.2.0] - 2026-05-02
11
24
 
12
25
  ### Added
@@ -30,4 +43,5 @@ and this project follows [Semantic Versioning](https://semver.org/spec/v2.0.0.ht
30
43
  - Configurable output scan severities and per-category thresholds via `[output_scan]`.
31
44
  - JSON report section for output scan findings including category, count, threshold, severity, and sample locations.
32
45
 
46
+ [0.3.0]: https://github.com/ababic/dumpling/compare/v0.2.0...v0.3.0
33
47
  [0.2.0]: https://github.com/ababic/dumpling/compare/v0.1.0...v0.2.0
@@ -13,6 +13,14 @@ For AI coding agents: also read `AGENTS.md`, which contains more detailed techni
13
13
  - **Rust stable toolchain** — install via [rustup.rs](https://rustup.rs/).
14
14
  - No database, Docker, or external services are required. Dumpling is a pure CLI tool.
15
15
 
16
+ ### One-shot setup (recommended)
17
+
18
+ ```bash
19
+ ./scripts/setup-dev.sh
20
+ ```
21
+
22
+ Installs stable + `rustfmt` + `clippy`, prefetches crates, and downloads a pinned **mdBook** under `.tools/` (for `mdbook build`, same version as CI). Optional: `export PATH="$PWD/.tools:$PATH"`.
23
+
16
24
  ### Build and run
17
25
 
18
26
  ```bash
@@ -262,7 +262,7 @@ dependencies = [
262
262
 
263
263
  [[package]]
264
264
  name = "dumpling"
265
- version = "0.2.0"
265
+ version = "0.3.0"
266
266
  dependencies = [
267
267
  "anyhow",
268
268
  "chrono",
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "dumpling"
3
- version = "0.2.0"
3
+ version = "0.3.0"
4
4
  edition = "2021"
5
5
  readme = "README.md"
6
6
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dumpling-cli
3
- Version: 0.2.0
3
+ Version: 0.3.0
4
4
  Classifier: Development Status :: 4 - Beta
5
5
  Classifier: Environment :: Console
6
6
  Classifier: Intended Audience :: Developers
@@ -112,8 +112,8 @@ salt = "${DUMPLING_GLOBAL_SALT}"
112
112
 
113
113
  # Rules are keyed by either "table" or "schema.table"
114
114
  [rules."public.users"]
115
- email = { strategy = "email", domain = "customer_identity", unique_within_domain = true }
116
- name = { strategy = "name", locale = "de_de" } # German-locale name
115
+ email = { strategy = "faker", faker = "internet::SafeEmail", domain = "customer_identity", unique_within_domain = true }
116
+ name = { strategy = "faker", faker = "name::Name", locale = "de_de" } # German-locale name
117
117
  ssn = { strategy = "hash", salt = "${env:DUMPLING_USERS_SSN_SALT}", as_string = true } # SHA-256 of original (salted)
118
118
  age = { strategy = "int_range", min = 18, max = 90 }
119
119
 
@@ -153,8 +153,7 @@ token = "high"
153
153
  | `redact` | Replace with `REDACTED` (string) |
154
154
  | `uuid` | Random UUIDv4-like string |
155
155
  | `hash` | SHA-256 hex of original value; supports per-column `salt` and global `salt` |
156
- | `email` | Random-looking email at `example.com` |
157
- | `name` / `first_name` / `last_name` | Locale-aware fake name (configurable via `locale`); defaults to English |
156
+ | `faker` | Values from the Rust [`fake`](https://crates.io/crates/fake) crate ([docs.rs](https://docs.rs/fake/latest/fake/), [`faker` modules](https://docs.rs/fake/latest/fake/faker/index.html)), chosen by a **string identifier** only (`faker = "module::Type"`, e.g. `internet::SafeEmail`). Config is **data only**: nothing from TOML is compiled or executed as Rust at runtime. Use `locale` for locale-aware generators; optional `min`/`max`, `length`, `format` as documented. Unsupported targets fail at config load. New generators require a **new Dumpling release** (or your own fork), not config-side code. |
158
157
  | `phone` | Locale-aware fake phone number (configurable via `locale`); defaults to English format |
159
158
  | `int_range` | Random integer in `[min, max]` |
160
159
  | `string` | Random alphanumeric string (`length = 12` by default) |
@@ -162,6 +161,12 @@ token = "high"
162
161
  | `time_fuzz` | Shifts a time-of-day by a random number of seconds in `[min_seconds, max_seconds]` with 24h wraparound (defaults: `-300..300`) |
163
162
  | `datetime_fuzz` | Shifts a timestamp/timestamptz by a random number of seconds in `[min_seconds, max_seconds]` (defaults: `-86400..86400`) |
164
163
 
164
+ **`faker` reference (upstream `fake` crate):** Dumpling’s `faker = "module::Type"` strings mirror the Rust [`fake`](https://crates.io/crates/fake) crate’s [`faker`](https://docs.rs/fake/latest/fake/faker/index.html) module layout. Use these when picking or extending generators:
165
+
166
+ - [docs.rs — `fake` crate root](https://docs.rs/fake/latest/fake/) (overview, `Fake` / `Dummy` traits, locales)
167
+ - [docs.rs — `fake::faker` module index](https://docs.rs/fake/latest/fake/faker/index.html) (per-domain submodules: `address`, `internet`, `name`, …)
168
+ - [GitHub — `cksac/fake-rs`](https://github.com/cksac/fake-rs) (source, README with the CLI’s generator name list)
169
+
165
170
  ### Secret references
166
171
 
167
172
  Dumpling resolves secret references in string config fields so plaintext salts/keys
@@ -209,7 +214,9 @@ dumpling --security-profile hardened --input dump.sql --check
209
214
  - `unique_within_domain`: when true, different source values are assigned unique pseudonyms within the configured `domain`. NULL values are unaffected and always remain NULL.
210
215
  - `min_days` / `max_days`: used by `date_fuzz`.
211
216
  - `min_seconds` / `max_seconds`: used by `time_fuzz` and `datetime_fuzz`.
212
- - `locale`: selects the language/regional format for the `name`, `first_name`, `last_name`, and `phone` strategies. Supported values: `en`, `fr_fr`, `de_de`, `it_it`, `pt_br`, `pt_pt`, `ar_sa`, `zh_cn`, `zh_tw`, `ja_jp`, `cy_gb`. Defaults to `en` when not specified.
217
+ - `locale`: selects the language/regional format for the `faker` and `phone` strategies. Supported values: `en`, `fr_fr`, `de_de`, `it_it`, `pt_br`, `pt_pt`, `ar_sa`, `zh_cn`, `zh_tw`, `ja_jp`, `cy_gb`. Defaults to `en` when not specified.
218
+ - `faker`: required when `strategy = "faker"`. A plain string `"module::Type"` (case-insensitive) that maps to a **built-in** generator compiled into Dumpling—not arbitrary Rust or expressions. Names follow [`fake::faker`](https://docs.rs/fake/latest/fake/faker/index.html) (e.g. `internet::SafeEmail` → `faker::internet::SafeEmail` in the crate).
219
+ - `format`: used with `faker = "number::NumberWithFormat"`; pattern uses `#` (0–9) and `^` (1–9) per the [`fake` crate docs](https://docs.rs/fake/latest/fake/).
213
220
 
214
221
  > **Note:** `table_options` are no longer supported; use explicit `rules` and optional `column_cases`.
215
222
 
@@ -353,7 +360,7 @@ Define default strategies in `rules."<table>"` and add ordered per-column cases
353
360
  ```toml
354
361
  [rules."public.users"]
355
362
  email = { strategy = "hash", as_string = true } # default
356
- name = { strategy = "name" }
363
+ name = { strategy = "faker", faker = "name::Name" }
357
364
 
358
365
  [[column_cases."public.users".email]]
359
366
  when.any = [{ column = "is_admin", op = "eq", value = "true" }]
@@ -404,7 +411,7 @@ salt = "${DUMPLING_HMAC_KEY}"
404
411
 
405
412
  [rules."public.users"]
406
413
  ssn = { strategy = "hash", as_string = true }
407
- email = { strategy = "email", domain = "users" }
414
+ email = { strategy = "faker", faker = "internet::SafeEmail", domain = "users" }
408
415
  ```
409
416
 
410
417
  ```bash
@@ -91,8 +91,8 @@ salt = "${DUMPLING_GLOBAL_SALT}"
91
91
 
92
92
  # Rules are keyed by either "table" or "schema.table"
93
93
  [rules."public.users"]
94
- email = { strategy = "email", domain = "customer_identity", unique_within_domain = true }
95
- name = { strategy = "name", locale = "de_de" } # German-locale name
94
+ email = { strategy = "faker", faker = "internet::SafeEmail", domain = "customer_identity", unique_within_domain = true }
95
+ name = { strategy = "faker", faker = "name::Name", locale = "de_de" } # German-locale name
96
96
  ssn = { strategy = "hash", salt = "${env:DUMPLING_USERS_SSN_SALT}", as_string = true } # SHA-256 of original (salted)
97
97
  age = { strategy = "int_range", min = 18, max = 90 }
98
98
 
@@ -132,8 +132,7 @@ token = "high"
132
132
  | `redact` | Replace with `REDACTED` (string) |
133
133
  | `uuid` | Random UUIDv4-like string |
134
134
  | `hash` | SHA-256 hex of original value; supports per-column `salt` and global `salt` |
135
- | `email` | Random-looking email at `example.com` |
136
- | `name` / `first_name` / `last_name` | Locale-aware fake name (configurable via `locale`); defaults to English |
135
+ | `faker` | Values from the Rust [`fake`](https://crates.io/crates/fake) crate ([docs.rs](https://docs.rs/fake/latest/fake/), [`faker` modules](https://docs.rs/fake/latest/fake/faker/index.html)), chosen by a **string identifier** only (`faker = "module::Type"`, e.g. `internet::SafeEmail`). Config is **data only**: nothing from TOML is compiled or executed as Rust at runtime. Use `locale` for locale-aware generators; optional `min`/`max`, `length`, `format` as documented. Unsupported targets fail at config load. New generators require a **new Dumpling release** (or your own fork), not config-side code. |
137
136
  | `phone` | Locale-aware fake phone number (configurable via `locale`); defaults to English format |
138
137
  | `int_range` | Random integer in `[min, max]` |
139
138
  | `string` | Random alphanumeric string (`length = 12` by default) |
@@ -141,6 +140,12 @@ token = "high"
141
140
  | `time_fuzz` | Shifts a time-of-day by a random number of seconds in `[min_seconds, max_seconds]` with 24h wraparound (defaults: `-300..300`) |
142
141
  | `datetime_fuzz` | Shifts a timestamp/timestamptz by a random number of seconds in `[min_seconds, max_seconds]` (defaults: `-86400..86400`) |
143
142
 
143
+ **`faker` reference (upstream `fake` crate):** Dumpling’s `faker = "module::Type"` strings mirror the Rust [`fake`](https://crates.io/crates/fake) crate’s [`faker`](https://docs.rs/fake/latest/fake/faker/index.html) module layout. Use these when picking or extending generators:
144
+
145
+ - [docs.rs — `fake` crate root](https://docs.rs/fake/latest/fake/) (overview, `Fake` / `Dummy` traits, locales)
146
+ - [docs.rs — `fake::faker` module index](https://docs.rs/fake/latest/fake/faker/index.html) (per-domain submodules: `address`, `internet`, `name`, …)
147
+ - [GitHub — `cksac/fake-rs`](https://github.com/cksac/fake-rs) (source, README with the CLI’s generator name list)
148
+
144
149
  ### Secret references
145
150
 
146
151
  Dumpling resolves secret references in string config fields so plaintext salts/keys
@@ -188,7 +193,9 @@ dumpling --security-profile hardened --input dump.sql --check
188
193
  - `unique_within_domain`: when true, different source values are assigned unique pseudonyms within the configured `domain`. NULL values are unaffected and always remain NULL.
189
194
  - `min_days` / `max_days`: used by `date_fuzz`.
190
195
  - `min_seconds` / `max_seconds`: used by `time_fuzz` and `datetime_fuzz`.
191
- - `locale`: selects the language/regional format for the `name`, `first_name`, `last_name`, and `phone` strategies. Supported values: `en`, `fr_fr`, `de_de`, `it_it`, `pt_br`, `pt_pt`, `ar_sa`, `zh_cn`, `zh_tw`, `ja_jp`, `cy_gb`. Defaults to `en` when not specified.
196
+ - `locale`: selects the language/regional format for the `faker` and `phone` strategies. Supported values: `en`, `fr_fr`, `de_de`, `it_it`, `pt_br`, `pt_pt`, `ar_sa`, `zh_cn`, `zh_tw`, `ja_jp`, `cy_gb`. Defaults to `en` when not specified.
197
+ - `faker`: required when `strategy = "faker"`. A plain string `"module::Type"` (case-insensitive) that maps to a **built-in** generator compiled into Dumpling—not arbitrary Rust or expressions. Names follow [`fake::faker`](https://docs.rs/fake/latest/fake/faker/index.html) (e.g. `internet::SafeEmail` → `faker::internet::SafeEmail` in the crate).
198
+ - `format`: used with `faker = "number::NumberWithFormat"`; pattern uses `#` (0–9) and `^` (1–9) per the [`fake` crate docs](https://docs.rs/fake/latest/fake/).
192
199
 
193
200
  > **Note:** `table_options` are no longer supported; use explicit `rules` and optional `column_cases`.
194
201
 
@@ -332,7 +339,7 @@ Define default strategies in `rules."<table>"` and add ordered per-column cases
332
339
  ```toml
333
340
  [rules."public.users"]
334
341
  email = { strategy = "hash", as_string = true } # default
335
- name = { strategy = "name" }
342
+ name = { strategy = "faker", faker = "name::Name" }
336
343
 
337
344
  [[column_cases."public.users".email]]
338
345
  when.any = [{ column = "is_admin", op = "eq", value = "true" }]
@@ -383,7 +390,7 @@ salt = "${DUMPLING_HMAC_KEY}"
383
390
 
384
391
  [rules."public.users"]
385
392
  ssn = { strategy = "hash", as_string = true }
386
- email = { strategy = "email", domain = "users" }
393
+ email = { strategy = "faker", faker = "internet::SafeEmail", domain = "users" }
387
394
  ```
388
395
 
389
396
  ```bash
@@ -29,7 +29,7 @@ violations to stderr, and exits:
29
29
  | `empty-rules-table` | warning | A `[rules]` entry has no column rules. Likely a stale or incomplete config section. |
30
30
  | `empty-column-cases-table` | warning | A `[column_cases]` entry has no column cases. |
31
31
  | `unsalted-hash` | warning | A `hash` strategy is used with no salt (neither per-column `salt` nor global `salt`). Unsalted hashes are reversible via precomputed lookup tables for low-entropy inputs (names, emails, common IDs). |
32
- | `inconsistent-domain-strategy` | error | The same domain name is used with two or more different strategies. This breaks referential integrity: a domain shared between `email` and `name` would try to maintain a bidirectional map between incompatible pseudonym types. |
32
+ | `inconsistent-domain-strategy` | error | The same domain name is used with two or more different strategies. This breaks referential integrity: a domain shared between incompatible generators (for example `faker` with different `faker` targets, or `faker` vs `hash`) cannot maintain a single stable mapping. |
33
33
  | `uncovered-sensitive-column` | error | A column listed in `[sensitive_columns]` has no matching anonymization rule or case. The column will pass through unmodified, making the sensitive declaration misleading. |
34
34
 
35
35
  ---
@@ -31,6 +31,16 @@ If no configuration is found, Dumpling fails closed by default and exits non-zer
31
31
  Error output includes every checked location. If you intentionally want a no-op
32
32
  run, pass `--allow-noop`.
33
33
 
34
+ ## Faker strategy and the `fake` crate
35
+
36
+ When you use `strategy = "faker"` with `faker = "module::Type"`, those names align with the Rust [**`fake`**](https://crates.io/crates/fake) crate’s [`faker`](https://docs.rs/fake/latest/fake/faker/index.html) modules (for example `name::FirstName` ↔ `fake::faker::name::raw::FirstName`). Use the upstream docs to discover available generators and options:
37
+
38
+ - [docs.rs — `fake` (crate overview)](https://docs.rs/fake/latest/fake/)
39
+ - [docs.rs — `fake::faker` (all faker submodules)](https://docs.rs/fake/latest/fake/faker/index.html)
40
+ - [GitHub — `cksac/fake-rs` (source + README)](https://github.com/cksac/fake-rs)
41
+
42
+ Dumpling only exposes a **subset** wired in `src/faker_dispatch.rs`; unsupported `module::Type` pairs fail at config load.
43
+
34
44
  ## Baseline config template
35
45
 
36
46
  ```toml
@@ -38,7 +48,7 @@ salt = "${DUMPLING_GLOBAL_SALT}"
38
48
 
39
49
  [rules."public.users"]
40
50
  email = { strategy = "hash", salt = "${env:DUMPLING_USERS_EMAIL_SALT}", as_string = true }
41
- name = { strategy = "name" }
51
+ full_name = { strategy = "faker", faker = "name::Name" }
42
52
 
43
53
  [sensitive_columns]
44
54
  "public.users" = ["employee_number", "tax_id"]
@@ -184,6 +194,15 @@ Nested JSON targeting is supported in predicate `column` values via either:
184
194
  When a JSON path traverses an array, Dumpling checks each element (useful for
185
195
  list-of-dicts JSON structures).
186
196
 
197
+ ### JSON path rules (`json` / `jsonb` columns)
198
+
199
+ You can anonymise values **inside** a text column that holds JSON using the same path syntax as row-filter predicates, but on **`[rules]` keys**:
200
+
201
+ - Dot notation: `"payload.profile.email" = { strategy = "faker", faker = "internet::SafeEmail", domain = "orders_email", as_string = true }`
202
+ - Django-style: `"payload__profile__email" = { strategy = "hash", salt = "${env:ORDER_SECRET_SALT}", as_string = true }`
203
+
204
+ The part before the first dot or `__` is the **SQL column name**; the rest is the path inside the parsed JSON document. Use **quoted** keys in TOML when the name contains dots. For a given table, you can use **either** path-level rules for a column **or** one whole-column rule for that column’s base name, not both (Dumpling rejects the conflict at startup). If a path is missing in a given row, that rule is skipped for that row. When only path rules apply (no whole-column rule), the rest of the JSON is left unchanged. Path rules are applied in **longest-path-first** order. `column_cases` still match the SQL column name only; use `when` predicates with nested `column` paths to branch on JSON content.
205
+
187
206
  ## Safety recommendations
188
207
 
189
208
  - Prefer deterministic runs in CI by passing `--seed` (or `DUMPLING_SEED`).
@@ -202,7 +221,7 @@ list-of-dicts JSON structures).
202
221
  - Sensitive columns are detected by:
203
222
  1. built-in column-name patterns, and
204
223
  2. explicit per-table lists under `[sensitive_columns]`.
205
- - A sensitive column is considered covered only if it has an explicit `rules` or `column_cases` entry.
224
+ - A sensitive column is considered covered only if it has an explicit `rules` or `column_cases` entry (including JSON path rules whose base name is that column, e.g. `payload.x.y` covers `payload`).
206
225
  - If uncovered sensitive columns are found, Dumpling exits non-zero.
207
226
 
208
227
  When `--report` is enabled, coverage fields are added to JSON output:
@@ -2,9 +2,11 @@
2
2
 
3
3
  ## Prerequisites
4
4
 
5
- - Rust stable toolchain (edition 2021 compatible)
5
+ - Rust **stable** toolchain (`rustup` recommended). The repo includes `rust-toolchain.toml` (stable + `rustfmt` + `clippy`) so CI and local `cargo` stay aligned.
6
6
  - `cargo` on your `PATH`
7
7
 
8
+ Optional: run **`./scripts/setup-dev.sh`** once from the repo root — it installs toolchain components, **`cargo fetch`**, and a pinned **mdBook** under `.tools/` for the same docs build CI uses.
9
+
8
10
  ## Build
9
11
 
10
12
  ```bash
@@ -11,9 +11,9 @@ This documentation covers the operating model for day-to-day use:
11
11
 
12
12
  ## Documentation quality gate
13
13
 
14
- All documentation is built with `mdBook` in CI:
14
+ The mdBook site is built in CI as follows:
15
15
 
16
- - pull requests must pass the docs build job,
17
- - pushes to `main` automatically publish docs to GitHub Pages.
16
+ - **Pull requests:** the **Docs (PR)** workflow runs `mdbook build` when docs-related paths change (no deploy).
17
+ - **`main`:** the **Docs** workflow builds and deploys to GitHub Pages when docs-related paths change.
18
18
 
19
19
  This keeps the docs in a continuously deployable state instead of drifting from the codebase.
@@ -4,7 +4,7 @@ build-backend = "maturin"
4
4
 
5
5
  [project]
6
6
  name = "dumpling-cli"
7
- version = "0.2.0"
7
+ version = "0.3.0"
8
8
  description = "Static anonymizer for plain SQL dumps (PostgreSQL, SQLite, SQL Server)."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
@@ -0,0 +1,3 @@
1
+ [toolchain]
2
+ channel = "stable"
3
+ components = ["rustfmt", "clippy"]
@@ -0,0 +1,89 @@
1
+ #!/usr/bin/env bash
2
+ # One-shot dev environment for Dumpling (Rust CLI + optional mdBook for docs).
3
+ # Safe to re-run; skips work that is already done.
4
+ #
5
+ # Usage: from repo root — ./scripts/setup-dev.sh
6
+ #
7
+ # Environment:
8
+ # MDBOOK_VERSION — mdBook release version, without the leading "v" of the release tag (default: 0.4.52, matches CI docs workflow)
9
+
10
+ set -euo pipefail
11
+
12
+ ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
13
+ cd "$ROOT"
14
+
15
+ MDBOOK_VERSION="${MDBOOK_VERSION:-0.4.52}"
16
+ TOOLS_DIR="${ROOT}/.tools"
17
+ MDBOOK_BIN="${TOOLS_DIR}/mdbook"
18
+
19
+ require_rust() {
20
+ if ! command -v rustc >/dev/null 2>&1 || ! command -v cargo >/dev/null 2>&1; then
21
+ echo "error: rustc/cargo not found. Install Rust: https://rustup.rs/" >&2
22
+ exit 1
23
+ fi
24
+ }
25
+
26
+ mdbook_download_url() {
27
+ local arch
28
+ case "$(uname -sm)" in
29
+ Linux\ x86_64) arch="x86_64-unknown-linux-gnu" ;;
30
+ Darwin\ x86_64) arch="x86_64-apple-darwin" ;;
31
+ Darwin\ arm64) arch="aarch64-apple-darwin" ;;
32
+ *)
33
+ echo "error: unsupported OS/arch for prebuilt mdbook: $(uname -sm)" >&2
34
+ echo "Install mdbook yourself: https://github.com/rust-lang/mdBook/releases" >&2
35
+ exit 1
36
+ ;;
37
+ esac
38
+ echo "https://github.com/rust-lang/mdBook/releases/download/v${MDBOOK_VERSION}/mdbook-v${MDBOOK_VERSION}-${arch}.tar.gz"
39
+ }
40
+
41
+ ensure_mdbook() {
42
+ if [[ -x "${MDBOOK_BIN}" ]]; then
43
+ installed="$("${MDBOOK_BIN}" --version 2>/dev/null | awk '{print $2}' || true)"
44
+ if [[ "${installed}" == "${MDBOOK_VERSION}" ]]; then
45
+ return 0
46
+ fi
47
+ fi
48
+
49
+ mkdir -p "${TOOLS_DIR}"
50
+ local url tmp
51
+ url="$(mdbook_download_url)"
52
+ tmp="$(mktemp -d)"
53
+ trap 'rm -rf "${tmp}"' EXIT
54
+ echo "Downloading mdbook v${MDBOOK_VERSION}…"
55
+ curl -fsSL "${url}" | tar xz -C "${tmp}"
56
+ mv "${tmp}/mdbook" "${MDBOOK_BIN}"
57
+ chmod +x "${MDBOOK_BIN}"
58
+ trap - EXIT
59
+ rm -rf "${tmp}"
60
+ }
61
+
62
+ main() {
63
+ require_rust
64
+
65
+ if command -v rustup >/dev/null 2>&1; then
66
+ echo "Installing stable toolchain + rustfmt + clippy (rustup)…"
67
+ rustup toolchain install stable
68
+ rustup component add rustfmt clippy --toolchain stable
69
+ else
70
+ echo "warning: rustup not found; ensure rustfmt and clippy are installed for stable CI parity." >&2
71
+ fi
72
+
73
+ echo "Prefetching crates (cargo fetch)…"
74
+ cargo fetch
75
+
76
+ ensure_mdbook
77
+ echo "mdbook: ${MDBOOK_BIN} ($("${MDBOOK_BIN}" --version))"
78
+
79
+ echo
80
+ echo "Done. Typical checks:"
81
+ echo " cargo fmt --all -- --check"
82
+ echo " cargo clippy --all-targets --all-features"
83
+ echo " cargo test --all-targets --all-features"
84
+ echo " ${MDBOOK_BIN} build # same as Docs CI (book.toml → docs/book)"
85
+ echo
86
+ echo "Tip: add ${TOOLS_DIR} to PATH for this shell: export PATH=\"${TOOLS_DIR}:\${PATH}\""
87
+ }
88
+
89
+ main "$@"