dumpling-cli 0.2.0__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. {dumpling_cli-0.2.0 → dumpling_cli-0.4.0}/.dumplingconf.example +11 -7
  2. dumpling_cli-0.4.0/.github/workflows/docs-pr.yml +35 -0
  3. {dumpling_cli-0.2.0 → dumpling_cli-0.4.0}/.github/workflows/docs.yml +9 -11
  4. {dumpling_cli-0.2.0 → dumpling_cli-0.4.0}/.gitignore +1 -0
  5. {dumpling_cli-0.2.0 → dumpling_cli-0.4.0}/AGENTS.md +17 -1
  6. {dumpling_cli-0.2.0 → dumpling_cli-0.4.0}/CHANGELOG.md +27 -0
  7. {dumpling_cli-0.2.0 → dumpling_cli-0.4.0}/CONTRIBUTING.md +8 -0
  8. {dumpling_cli-0.2.0 → dumpling_cli-0.4.0}/Cargo.lock +1 -1
  9. {dumpling_cli-0.2.0 → dumpling_cli-0.4.0}/Cargo.toml +1 -1
  10. {dumpling_cli-0.2.0 → dumpling_cli-0.4.0}/PKG-INFO +26 -10
  11. {dumpling_cli-0.2.0 → dumpling_cli-0.4.0}/README.md +25 -9
  12. {dumpling_cli-0.2.0 → dumpling_cli-0.4.0}/docs/src/ci-guardrails.md +1 -1
  13. {dumpling_cli-0.2.0 → dumpling_cli-0.4.0}/docs/src/configuration.md +44 -3
  14. dumpling_cli-0.4.0/docs/src/getting-started.md +33 -0
  15. dumpling_cli-0.4.0/docs/src/index.md +19 -0
  16. {dumpling_cli-0.2.0 → dumpling_cli-0.4.0}/pyproject.toml +1 -1
  17. dumpling_cli-0.4.0/rust-toolchain.toml +3 -0
  18. dumpling_cli-0.4.0/scripts/setup-dev.sh +89 -0
  19. dumpling_cli-0.4.0/src/faker_dispatch.rs +521 -0
  20. {dumpling_cli-0.2.0 → dumpling_cli-0.4.0}/src/filter.rs +115 -41
  21. {dumpling_cli-0.2.0 → dumpling_cli-0.4.0}/src/lint.rs +6 -0
  22. {dumpling_cli-0.2.0 → dumpling_cli-0.4.0}/src/main.rs +178 -26
  23. {dumpling_cli-0.2.0 → dumpling_cli-0.4.0}/src/scan.rs +1 -0
  24. {dumpling_cli-0.2.0 → dumpling_cli-0.4.0}/src/settings.rs +245 -29
  25. {dumpling_cli-0.2.0 → dumpling_cli-0.4.0}/src/sql.rs +276 -95
  26. {dumpling_cli-0.2.0 → dumpling_cli-0.4.0}/src/transform.rs +48 -119
  27. dumpling_cli-0.2.0/docs/src/getting-started.md +0 -29
  28. dumpling_cli-0.2.0/docs/src/index.md +0 -19
  29. {dumpling_cli-0.2.0 → dumpling_cli-0.4.0}/.github/workflows/ci.yml +0 -0
  30. {dumpling_cli-0.2.0 → dumpling_cli-0.4.0}/.github/workflows/platform-compat-latest.yml +0 -0
  31. {dumpling_cli-0.2.0 → dumpling_cli-0.4.0}/.github/workflows/platform-compat-matrix.yml +0 -0
  32. {dumpling_cli-0.2.0 → dumpling_cli-0.4.0}/.github/workflows/policy-lint.yml +0 -0
  33. {dumpling_cli-0.2.0 → dumpling_cli-0.4.0}/.github/workflows/publish.yml +0 -0
  34. {dumpling_cli-0.2.0 → dumpling_cli-0.4.0}/.github/workflows/release.yml +0 -0
  35. {dumpling_cli-0.2.0 → dumpling_cli-0.4.0}/.github/workflows/tests.yml +0 -0
  36. {dumpling_cli-0.2.0 → dumpling_cli-0.4.0}/MAINTENANCE.md +0 -0
  37. {dumpling_cli-0.2.0 → dumpling_cli-0.4.0}/book.toml +0 -0
  38. {dumpling_cli-0.2.0 → dumpling_cli-0.4.0}/datetime_out.sql +0 -0
  39. {dumpling_cli-0.2.0 → dumpling_cli-0.4.0}/datetime_sample.sql +0 -0
  40. {dumpling_cli-0.2.0 → dumpling_cli-0.4.0}/docs/src/SUMMARY.md +0 -0
  41. {dumpling_cli-0.2.0 → dumpling_cli-0.4.0}/docs/src/releasing.md +0 -0
  42. {dumpling_cli-0.2.0 → dumpling_cli-0.4.0}/src/report.rs +0 -0
@@ -26,13 +26,17 @@ salt = "${DUMPLING_GLOBAL_SALT}"
26
26
  #
27
27
  # Each column maps to an anonymizer spec: { strategy = "…", <options> }
28
28
  # ---------------------------------------------------------------------------
29
+ # Faker strategy: `faker = "module::Type"` matches the Rust `fake` crate layout.
30
+ # Crate docs: https://docs.rs/fake/latest/fake/
31
+ # Faker modules: https://docs.rs/fake/latest/fake/faker/index.html
32
+ # Upstream repo: https://github.com/cksac/fake-rs
29
33
  [rules."public.users"]
30
- # email — random-looking email at example.com; force quoted string output
31
- email = { strategy = "email", domain = "customer_identity", unique_within_domain = true }
32
- # name — random placeholder full name
33
- full_name = { strategy = "name" }
34
- first_name = { strategy = "first_name" }
35
- last_name = { strategy = "last_name" }
34
+ # email — fake email via Rust `fake` crate; force quoted string output
35
+ email = { strategy = "faker", faker = "internet::SafeEmail", domain = "customer_identity", unique_within_domain = true }
36
+ # name — locale-aware full name (see `locale`); other generators use `faker = "module::Type"`
37
+ full_name = { strategy = "faker", faker = "name::Name" }
38
+ first_name = { strategy = "faker", faker = "name::FirstName" }
39
+ last_name = { strategy = "faker", faker = "name::LastName" }
36
40
  # phone — US-style (xxx) xxx-xxxx
37
41
  phone = { strategy = "phone" }
38
42
  # ssn — SHA-256 hex of original; use per-column salt for extra protection
@@ -58,7 +62,7 @@ wake_time = { strategy = "time_fuzz", min_seconds = -3600, max_seconds = 360
58
62
  # credit card — redact entirely; force as quoted string
59
63
  credit_card = { strategy = "redact", as_string = true }
60
64
  # keep the same anonymized email as users table via shared domain
61
- customer_email = { strategy = "email", domain = "customer_identity" }
65
+ customer_email = { strategy = "faker", faker = "internet::SafeEmail", domain = "customer_identity" }
62
66
 
63
67
  [rules."public.audit_log"]
64
68
  # unqualified table name also works (matches any schema)
@@ -0,0 +1,35 @@
1
+ # mdBook verification on pull requests only (no GitHub Pages upload or deploy).
2
+ # Pages build + deploy live in docs.yml and run on pushes to main.
3
+ name: Docs (PR)
4
+
5
+ on:
6
+ pull_request:
7
+ paths:
8
+ - "README.md"
9
+ - "book.toml"
10
+ - "docs/**"
11
+ - ".github/workflows/docs.yml"
12
+ - ".github/workflows/docs-pr.yml"
13
+
14
+ permissions:
15
+ contents: read
16
+
17
+ concurrency:
18
+ group: docs-pr-${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
19
+ cancel-in-progress: true
20
+
21
+ jobs:
22
+ verify:
23
+ name: Build mdBook (verify)
24
+ runs-on: ubuntu-latest
25
+ steps:
26
+ - name: Checkout
27
+ uses: actions/checkout@v4
28
+
29
+ - name: Install mdBook
30
+ uses: peaceiris/actions-mdbook@v2
31
+ with:
32
+ mdbook-version: "0.4.52"
33
+
34
+ - name: Build documentation site
35
+ run: mdbook build
@@ -1,12 +1,8 @@
1
+ # Build and deploy the mdBook site to GitHub Pages (main branch only).
2
+ # Pull-request verification runs in docs-pr.yml — this workflow does not run on PRs.
1
3
  name: Docs
2
4
 
3
5
  on:
4
- pull_request:
5
- paths:
6
- - "README.md"
7
- - "book.toml"
8
- - "docs/**"
9
- - ".github/workflows/docs.yml"
10
6
  push:
11
7
  branches:
12
8
  - main
@@ -18,16 +14,16 @@ on:
18
14
 
19
15
  permissions:
20
16
  contents: read
21
- pages: write
22
- id-token: write
23
17
 
24
18
  concurrency:
25
- group: docs-${{ github.ref }}
19
+ group: docs-pages-${{ github.ref }}
26
20
  cancel-in-progress: true
27
21
 
28
22
  jobs:
29
23
  build:
30
24
  runs-on: ubuntu-latest
25
+ permissions:
26
+ contents: read
31
27
  steps:
32
28
  - name: Checkout
33
29
  uses: actions/checkout@v4
@@ -40,15 +36,17 @@ jobs:
40
36
  - name: Build documentation site
41
37
  run: mdbook build
42
38
 
43
- - name: Upload docs artifact
39
+ - name: Upload Pages deployment artifact
44
40
  uses: actions/upload-pages-artifact@v3
45
41
  with:
46
42
  path: docs/book
47
43
 
48
44
  deploy:
49
- if: github.event_name == 'push' && github.ref == 'refs/heads/main'
50
45
  needs: build
51
46
  runs-on: ubuntu-latest
47
+ permissions:
48
+ pages: write
49
+ id-token: write
52
50
  environment:
53
51
  name: github-pages
54
52
  url: ${{ steps.deployment.outputs.page_url }}
@@ -1,2 +1,3 @@
1
1
  /target/
2
2
  /docs/book/
3
+ /.tools/
@@ -223,6 +223,8 @@ Follow these steps in order. Do not skip any step.
223
223
 
224
224
  8. **`README.md`**: Add a row to the "Anonymization strategies" table.
225
225
 
226
+ **`faker` strategy:** Config only carries string identifiers; Dumpling never evaluates user Rust from config. To ship a new generator, add dispatch in `src/faker_dispatch.rs` and validation in `validate_anonymizer_spec` for the `faker` branch. Upstream reference: [`fake` on docs.rs](https://docs.rs/fake/latest/fake/), [`fake::faker` module index](https://docs.rs/fake/latest/fake/faker/index.html), [source on GitHub](https://github.com/cksac/fake-rs).
227
+
226
228
  ---
227
229
 
228
230
  ## How to Add a New Row Filter Predicate Operator
@@ -274,15 +276,29 @@ Follow these steps in order. Do not skip any step.
274
276
 
275
277
  This is a pure Rust CLI project with **no external services** (no database, Docker, or network dependencies). The Rust stable toolchain (rustc + cargo) is the only prerequisite.
276
278
 
279
+ ### One-shot environment (agents and humans)
280
+
281
+ From the repository root:
282
+
283
+ ```bash
284
+ ./scripts/setup-dev.sh
285
+ ```
286
+
287
+ This installs the **stable** toolchain with **rustfmt** and **clippy** (via `rustup` when available), runs **`cargo fetch`**, and installs a pinned **mdBook** binary under `.tools/` (same version as the Docs CI workflow) so you can run `mdbook build` without a global install. Add `.tools` to `PATH` for convenience, or invoke `.tools/mdbook build` directly.
288
+
289
+ The repo root **`rust-toolchain.toml`** pins **stable** and the **components** CI uses, so `cargo` automatically selects the right toolchain in fresh checkouts.
290
+
277
291
  ### Quick reference
278
292
 
279
293
  | Task | Command |
280
294
  |------|---------|
295
+ | Setup (toolchain + fetch + mdbook) | `./scripts/setup-dev.sh` |
281
296
  | Build | `cargo build` |
282
297
  | Test | `cargo test --all-targets --all-features` |
283
298
  | Lint | `cargo clippy --all-targets --all-features` |
284
299
  | Format check | `cargo fmt --all -- --check` |
285
300
  | Auto-format | `cargo fmt` |
301
+ | Docs site (mdBook) | `mdbook build` or `.tools/mdbook build` after setup |
286
302
  | Run CLI | `./target/debug/dumpling --help` |
287
303
 
288
304
  ### Running the CLI
@@ -295,6 +311,6 @@ Dumpling is fail-closed by default — it exits non-zero without a config file.
295
311
 
296
312
  ### Notes
297
313
 
298
- - All 94 tests are inline `#[cfg(test)]` modules; there are no separate test files or fixtures to manage.
314
+ - All tests are inline `#[cfg(test)]` modules; there are no separate test files or fixtures to manage.
299
315
  - The update script uses `cargo fetch` to pre-download crate dependencies. A full `cargo build` or `cargo test` will then compile from the local cache without network access.
300
316
  - No environment variables or secrets are required for building, testing, or running the CLI locally.
@@ -7,6 +7,31 @@ and this project follows [Semantic Versioning](https://semver.org/spec/v2.0.0.ht
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [0.4.0] - 2026-05-02
11
+
12
+ ### Added
13
+
14
+ - **`--dump-decode` CLI**: Decode PostgreSQL **custom-format** (`pg_dump -Fc`) or **directory-format** archives by running **`pg_restore -f -`** (plain SQL to stdout, no database), then anonymize—built for workflows such as **`heroku pg:backups:download`**. Requires PostgreSQL client tools (`pg_restore` on `PATH`, or **`--pg-restore-path`**).
15
+ - **`--dump-decode-arg`** (repeatable): Extra arguments forwarded to `pg_restore`.
16
+ - **`--dump-decode-keep-input`**: Keep the archive after a successful run. **By default** the `--input` path is **removed** after success so only anonymized output remains. **`--check`** with **`--dump-decode`** requires **`--dump-decode-keep-input`** (otherwise the dump would be deleted before config iteration).
17
+
18
+ ### Changed
19
+
20
+ - README and mdBook documentation for PostgreSQL archive decoding and Heroku-style examples.
21
+
22
+ ## [0.3.0] - 2026-05-02
23
+
24
+ ### Added
25
+
26
+ - **`faker` anonymization strategy** backed by the Rust [`fake`](https://crates.io/crates/fake) crate: select generators with `faker = "module::Type"` (for example `internet::SafeEmail`, `name::Name`). Unsupported targets fail at config load with a clear error; extending the allowlist requires a Dumpling release (see `src/faker_dispatch.rs`).
27
+ - **JSON path rules in `[rules]`**: column keys such as `payload.profile.email` or `payload__profile__email` apply strategies to nested fields inside JSON text columns while preserving document structure. Conflicts between a whole-column rule and JSON path rules for the same base column are rejected at validation.
28
+ - **`format` on `AnonymizerSpec`** for pattern-based faker generators such as `number::NumberWithFormat`.
29
+
30
+ ### Changed
31
+
32
+ - **Legacy strategy names** `email`, `name`, `first_name`, and `last_name` in config are normalized at load time to `strategy = "faker"` with the same defaults as before (`internet::SafeEmail`, `name::Name`, `name::FirstName`, `name::LastName`), so existing configs keep working.
33
+ - **`locale`** applies to both `faker` and `phone` strategies.
34
+
10
35
  ## [0.2.0] - 2026-05-02
11
36
 
12
37
  ### Added
@@ -30,4 +55,6 @@ and this project follows [Semantic Versioning](https://semver.org/spec/v2.0.0.ht
30
55
  - Configurable output scan severities and per-category thresholds via `[output_scan]`.
31
56
  - JSON report section for output scan findings including category, count, threshold, severity, and sample locations.
32
57
 
58
+ [0.4.0]: https://github.com/ababic/dumpling/compare/v0.3.0...v0.4.0
59
+ [0.3.0]: https://github.com/ababic/dumpling/compare/v0.2.0...v0.3.0
33
60
  [0.2.0]: https://github.com/ababic/dumpling/compare/v0.1.0...v0.2.0
@@ -13,6 +13,14 @@ For AI coding agents: also read `AGENTS.md`, which contains more detailed techni
13
13
  - **Rust stable toolchain** — install via [rustup.rs](https://rustup.rs/).
14
14
  - No database, Docker, or external services are required. Dumpling is a pure CLI tool.
15
15
 
16
+ ### One-shot setup (recommended)
17
+
18
+ ```bash
19
+ ./scripts/setup-dev.sh
20
+ ```
21
+
22
+ Installs stable + `rustfmt` + `clippy`, prefetches crates, and downloads a pinned **mdBook** under `.tools/` (for `mdbook build`, same version as CI). Optional: `export PATH="$PWD/.tools:$PATH"`.
23
+
16
24
  ### Build and run
17
25
 
18
26
  ```bash
@@ -262,7 +262,7 @@ dependencies = [
262
262
 
263
263
  [[package]]
264
264
  name = "dumpling"
265
- version = "0.2.0"
265
+ version = "0.4.0"
266
266
  dependencies = [
267
267
  "anyhow",
268
268
  "chrono",
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "dumpling"
3
- version = "0.2.0"
3
+ version = "0.4.0"
4
4
  edition = "2021"
5
5
  readme = "README.md"
6
6
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dumpling-cli
3
- Version: 0.2.0
3
+ Version: 0.4.0
4
4
  Classifier: Development Status :: 4 - Beta
5
5
  Classifier: Environment :: Console
6
6
  Classifier: Intended Audience :: Developers
@@ -112,8 +112,8 @@ salt = "${DUMPLING_GLOBAL_SALT}"
112
112
 
113
113
  # Rules are keyed by either "table" or "schema.table"
114
114
  [rules."public.users"]
115
- email = { strategy = "email", domain = "customer_identity", unique_within_domain = true }
116
- name = { strategy = "name", locale = "de_de" } # German-locale name
115
+ email = { strategy = "faker", faker = "internet::SafeEmail", domain = "customer_identity", unique_within_domain = true }
116
+ name = { strategy = "faker", faker = "name::Name", locale = "de_de" } # German-locale name
117
117
  ssn = { strategy = "hash", salt = "${env:DUMPLING_USERS_SSN_SALT}", as_string = true } # SHA-256 of original (salted)
118
118
  age = { strategy = "int_range", min = 18, max = 90 }
119
119
 
@@ -153,8 +153,7 @@ token = "high"
153
153
  | `redact` | Replace with `REDACTED` (string) |
154
154
  | `uuid` | Random UUIDv4-like string |
155
155
  | `hash` | SHA-256 hex of original value; supports per-column `salt` and global `salt` |
156
- | `email` | Random-looking email at `example.com` |
157
- | `name` / `first_name` / `last_name` | Locale-aware fake name (configurable via `locale`); defaults to English |
156
+ | `faker` | Values from the Rust [`fake`](https://crates.io/crates/fake) crate ([docs.rs](https://docs.rs/fake/latest/fake/), [`faker` modules](https://docs.rs/fake/latest/fake/faker/index.html)), chosen by a **string identifier** only (`faker = "module::Type"`, e.g. `internet::SafeEmail`). Config is **data only**: nothing from TOML is compiled or executed as Rust at runtime. Use `locale` for locale-aware generators; optional `min`/`max`, `length`, `format` as documented. Unsupported targets fail at config load. New generators require a **new Dumpling release** (or your own fork), not config-side code. |
158
157
  | `phone` | Locale-aware fake phone number (configurable via `locale`); defaults to English format |
159
158
  | `int_range` | Random integer in `[min, max]` |
160
159
  | `string` | Random alphanumeric string (`length = 12` by default) |
@@ -162,6 +161,12 @@ token = "high"
162
161
  | `time_fuzz` | Shifts a time-of-day by a random number of seconds in `[min_seconds, max_seconds]` with 24h wraparound (defaults: `-300..300`) |
163
162
  | `datetime_fuzz` | Shifts a timestamp/timestamptz by a random number of seconds in `[min_seconds, max_seconds]` (defaults: `-86400..86400`) |
164
163
 
164
+ **`faker` reference (upstream `fake` crate):** Dumpling’s `faker = "module::Type"` strings mirror the Rust [`fake`](https://crates.io/crates/fake) crate’s [`faker`](https://docs.rs/fake/latest/fake/faker/index.html) module layout. Use these when picking or extending generators:
165
+
166
+ - [docs.rs — `fake` crate root](https://docs.rs/fake/latest/fake/) (overview, `Fake` / `Dummy` traits, locales)
167
+ - [docs.rs — `fake::faker` module index](https://docs.rs/fake/latest/fake/faker/index.html) (per-domain submodules: `address`, `internet`, `name`, …)
168
+ - [GitHub — `cksac/fake-rs`](https://github.com/cksac/fake-rs) (source, README with the CLI’s generator name list)
169
+
165
170
  ### Secret references
166
171
 
167
172
  Dumpling resolves secret references in string config fields so plaintext salts/keys
@@ -209,7 +214,9 @@ dumpling --security-profile hardened --input dump.sql --check
209
214
  - `unique_within_domain`: when true, different source values are assigned unique pseudonyms within the configured `domain`. NULL values are unaffected and always remain NULL.
210
215
  - `min_days` / `max_days`: used by `date_fuzz`.
211
216
  - `min_seconds` / `max_seconds`: used by `time_fuzz` and `datetime_fuzz`.
212
- - `locale`: selects the language/regional format for the `name`, `first_name`, `last_name`, and `phone` strategies. Supported values: `en`, `fr_fr`, `de_de`, `it_it`, `pt_br`, `pt_pt`, `ar_sa`, `zh_cn`, `zh_tw`, `ja_jp`, `cy_gb`. Defaults to `en` when not specified.
217
+ - `locale`: selects the language/regional format for the `faker` and `phone` strategies. Supported values: `en`, `fr_fr`, `de_de`, `it_it`, `pt_br`, `pt_pt`, `ar_sa`, `zh_cn`, `zh_tw`, `ja_jp`, `cy_gb`. Defaults to `en` when not specified.
218
+ - `faker`: required when `strategy = "faker"`. A plain string `"module::Type"` (case-insensitive) that maps to a **built-in** generator compiled into Dumpling—not arbitrary Rust or expressions. Names follow [`fake::faker`](https://docs.rs/fake/latest/fake/faker/index.html) (e.g. `internet::SafeEmail` → `faker::internet::SafeEmail` in the crate).
219
+ - `format`: used with `faker = "number::NumberWithFormat"`; pattern uses `#` (0–9) and `^` (1–9) per the [`fake` crate docs](https://docs.rs/fake/latest/fake/).
213
220
 
214
221
  > **Note:** `table_options` are no longer supported; use explicit `rules` and optional `column_cases`.
215
222
 
@@ -282,7 +289,16 @@ Produced by `pg_dump --format=plain`. Handles:
282
289
  - `"double-quoted"` identifiers
283
290
  - `''`-escaped string literals
284
291
 
285
- Binary, custom, and directory formats from `pg_dump` are not supported — use `--format=plain` when running `pg_dump`.
292
+ Binary, custom, and directory formats from `pg_dump` are not parsed directly — Dumpling’s SQL pipeline expects plain text. Use either:
293
+
294
+ - **`pg_dump --format=plain`** when you control capture, or
295
+ - **`dumpling --dump-decode`** with `--input` set to a **custom-format** (`.dump`) or **directory-format** folder: Dumpling runs `pg_restore -f -` and streams the resulting SQL (same as a manual `pg_restore` “script” output, no database required). Requires PostgreSQL client tools on `PATH` (`pg_restore`), or set `--pg-restore-path`. Use `--dump-decode-arg` to pass extra flags (e.g. `--no-owner --no-acl`). **By default** the archive is removed after a fully successful run; pass **`--dump-decode-keep-input`** to retain it. **`--check`** requires **`--dump-decode-keep-input`** so the archive still exists if changes would be detected.
296
+
297
+ Example (e.g. after `heroku pg:backups:download`):
298
+
299
+ ```bash
300
+ dumpling --dump-decode -i latest.dump -c .dumplingconf -o anonymized.sql
301
+ ```
286
302
 
287
303
  ### SQLite (`--format sqlite`)
288
304
 
@@ -353,7 +369,7 @@ Define default strategies in `rules."<table>"` and add ordered per-column cases
353
369
  ```toml
354
370
  [rules."public.users"]
355
371
  email = { strategy = "hash", as_string = true } # default
356
- name = { strategy = "name" }
372
+ name = { strategy = "faker", faker = "name::Name" }
357
373
 
358
374
  [[column_cases."public.users".email]]
359
375
  when.any = [{ column = "is_admin", op = "eq", value = "true" }]
@@ -404,7 +420,7 @@ salt = "${DUMPLING_HMAC_KEY}"
404
420
 
405
421
  [rules."public.users"]
406
422
  ssn = { strategy = "hash", as_string = true }
407
- email = { strategy = "email", domain = "users" }
423
+ email = { strategy = "faker", faker = "internet::SafeEmail", domain = "users" }
408
424
  ```
409
425
 
410
426
  ```bash
@@ -470,5 +486,5 @@ See the [CI guardrails documentation](docs/src/ci-guardrails.md) for full pipeli
470
486
 
471
487
  ## Full documentation
472
488
 
473
- Detailed docs, including the configuration reference and release process, are available at the project's [GitHub Pages site](https://github.com) (built from `docs/src/`).
489
+ Detailed docs, including the configuration reference and release process, are available at the project's [GitHub Pages site](https://ababic.github.io/dumpling/) (built from `docs/src/`).
474
490
 
@@ -91,8 +91,8 @@ salt = "${DUMPLING_GLOBAL_SALT}"
91
91
 
92
92
  # Rules are keyed by either "table" or "schema.table"
93
93
  [rules."public.users"]
94
- email = { strategy = "email", domain = "customer_identity", unique_within_domain = true }
95
- name = { strategy = "name", locale = "de_de" } # German-locale name
94
+ email = { strategy = "faker", faker = "internet::SafeEmail", domain = "customer_identity", unique_within_domain = true }
95
+ name = { strategy = "faker", faker = "name::Name", locale = "de_de" } # German-locale name
96
96
  ssn = { strategy = "hash", salt = "${env:DUMPLING_USERS_SSN_SALT}", as_string = true } # SHA-256 of original (salted)
97
97
  age = { strategy = "int_range", min = 18, max = 90 }
98
98
 
@@ -132,8 +132,7 @@ token = "high"
132
132
  | `redact` | Replace with `REDACTED` (string) |
133
133
  | `uuid` | Random UUIDv4-like string |
134
134
  | `hash` | SHA-256 hex of original value; supports per-column `salt` and global `salt` |
135
- | `email` | Random-looking email at `example.com` |
136
- | `name` / `first_name` / `last_name` | Locale-aware fake name (configurable via `locale`); defaults to English |
135
+ | `faker` | Values from the Rust [`fake`](https://crates.io/crates/fake) crate ([docs.rs](https://docs.rs/fake/latest/fake/), [`faker` modules](https://docs.rs/fake/latest/fake/faker/index.html)), chosen by a **string identifier** only (`faker = "module::Type"`, e.g. `internet::SafeEmail`). Config is **data only**: nothing from TOML is compiled or executed as Rust at runtime. Use `locale` for locale-aware generators; optional `min`/`max`, `length`, `format` as documented. Unsupported targets fail at config load. New generators require a **new Dumpling release** (or your own fork), not config-side code. |
137
136
  | `phone` | Locale-aware fake phone number (configurable via `locale`); defaults to English format |
138
137
  | `int_range` | Random integer in `[min, max]` |
139
138
  | `string` | Random alphanumeric string (`length = 12` by default) |
@@ -141,6 +140,12 @@ token = "high"
141
140
  | `time_fuzz` | Shifts a time-of-day by a random number of seconds in `[min_seconds, max_seconds]` with 24h wraparound (defaults: `-300..300`) |
142
141
  | `datetime_fuzz` | Shifts a timestamp/timestamptz by a random number of seconds in `[min_seconds, max_seconds]` (defaults: `-86400..86400`) |
143
142
 
143
+ **`faker` reference (upstream `fake` crate):** Dumpling’s `faker = "module::Type"` strings mirror the Rust [`fake`](https://crates.io/crates/fake) crate’s [`faker`](https://docs.rs/fake/latest/fake/faker/index.html) module layout. Use these when picking or extending generators:
144
+
145
+ - [docs.rs — `fake` crate root](https://docs.rs/fake/latest/fake/) (overview, `Fake` / `Dummy` traits, locales)
146
+ - [docs.rs — `fake::faker` module index](https://docs.rs/fake/latest/fake/faker/index.html) (per-domain submodules: `address`, `internet`, `name`, …)
147
+ - [GitHub — `cksac/fake-rs`](https://github.com/cksac/fake-rs) (source, README with the CLI’s generator name list)
148
+
144
149
  ### Secret references
145
150
 
146
151
  Dumpling resolves secret references in string config fields so plaintext salts/keys
@@ -188,7 +193,9 @@ dumpling --security-profile hardened --input dump.sql --check
188
193
  - `unique_within_domain`: when true, different source values are assigned unique pseudonyms within the configured `domain`. NULL values are unaffected and always remain NULL.
189
194
  - `min_days` / `max_days`: used by `date_fuzz`.
190
195
  - `min_seconds` / `max_seconds`: used by `time_fuzz` and `datetime_fuzz`.
191
- - `locale`: selects the language/regional format for the `name`, `first_name`, `last_name`, and `phone` strategies. Supported values: `en`, `fr_fr`, `de_de`, `it_it`, `pt_br`, `pt_pt`, `ar_sa`, `zh_cn`, `zh_tw`, `ja_jp`, `cy_gb`. Defaults to `en` when not specified.
196
+ - `locale`: selects the language/regional format for the `faker` and `phone` strategies. Supported values: `en`, `fr_fr`, `de_de`, `it_it`, `pt_br`, `pt_pt`, `ar_sa`, `zh_cn`, `zh_tw`, `ja_jp`, `cy_gb`. Defaults to `en` when not specified.
197
+ - `faker`: required when `strategy = "faker"`. A plain string `"module::Type"` (case-insensitive) that maps to a **built-in** generator compiled into Dumpling—not arbitrary Rust or expressions. Names follow [`fake::faker`](https://docs.rs/fake/latest/fake/faker/index.html) (e.g. `internet::SafeEmail` → `faker::internet::SafeEmail` in the crate).
198
+ - `format`: used with `faker = "number::NumberWithFormat"`; pattern uses `#` (0–9) and `^` (1–9) per the [`fake` crate docs](https://docs.rs/fake/latest/fake/).
192
199
 
193
200
  > **Note:** `table_options` are no longer supported; use explicit `rules` and optional `column_cases`.
194
201
 
@@ -261,7 +268,16 @@ Produced by `pg_dump --format=plain`. Handles:
261
268
  - `"double-quoted"` identifiers
262
269
  - `''`-escaped string literals
263
270
 
264
- Binary, custom, and directory formats from `pg_dump` are not supported — use `--format=plain` when running `pg_dump`.
271
+ Binary, custom, and directory formats from `pg_dump` are not parsed directly — Dumpling’s SQL pipeline expects plain text. Use either:
272
+
273
+ - **`pg_dump --format=plain`** when you control capture, or
274
+ - **`dumpling --dump-decode`** with `--input` set to a **custom-format** (`.dump`) or **directory-format** folder: Dumpling runs `pg_restore -f -` and streams the resulting SQL (same as a manual `pg_restore` “script” output, no database required). Requires PostgreSQL client tools on `PATH` (`pg_restore`), or set `--pg-restore-path`. Use `--dump-decode-arg` to pass extra flags (e.g. `--no-owner --no-acl`). **By default** the archive is removed after a fully successful run; pass **`--dump-decode-keep-input`** to retain it. **`--check`** requires **`--dump-decode-keep-input`** so the archive still exists if changes would be detected.
275
+
276
+ Example (e.g. after `heroku pg:backups:download`):
277
+
278
+ ```bash
279
+ dumpling --dump-decode -i latest.dump -c .dumplingconf -o anonymized.sql
280
+ ```
265
281
 
266
282
  ### SQLite (`--format sqlite`)
267
283
 
@@ -332,7 +348,7 @@ Define default strategies in `rules."<table>"` and add ordered per-column cases
332
348
  ```toml
333
349
  [rules."public.users"]
334
350
  email = { strategy = "hash", as_string = true } # default
335
- name = { strategy = "name" }
351
+ name = { strategy = "faker", faker = "name::Name" }
336
352
 
337
353
  [[column_cases."public.users".email]]
338
354
  when.any = [{ column = "is_admin", op = "eq", value = "true" }]
@@ -383,7 +399,7 @@ salt = "${DUMPLING_HMAC_KEY}"
383
399
 
384
400
  [rules."public.users"]
385
401
  ssn = { strategy = "hash", as_string = true }
386
- email = { strategy = "email", domain = "users" }
402
+ email = { strategy = "faker", faker = "internet::SafeEmail", domain = "users" }
387
403
  ```
388
404
 
389
405
  ```bash
@@ -449,4 +465,4 @@ See the [CI guardrails documentation](docs/src/ci-guardrails.md) for full pipeli
449
465
 
450
466
  ## Full documentation
451
467
 
452
- Detailed docs, including the configuration reference and release process, are available at the project's [GitHub Pages site](https://github.com) (built from `docs/src/`).
468
+ Detailed docs, including the configuration reference and release process, are available at the project's [GitHub Pages site](https://ababic.github.io/dumpling/) (built from `docs/src/`).
@@ -29,7 +29,7 @@ violations to stderr, and exits:
29
29
  | `empty-rules-table` | warning | A `[rules]` entry has no column rules. Likely a stale or incomplete config section. |
30
30
  | `empty-column-cases-table` | warning | A `[column_cases]` entry has no column cases. |
31
31
  | `unsalted-hash` | warning | A `hash` strategy is used with no salt (neither per-column `salt` nor global `salt`). Unsalted hashes are reversible via precomputed lookup tables for low-entropy inputs (names, emails, common IDs). |
32
- | `inconsistent-domain-strategy` | error | The same domain name is used with two or more different strategies. This breaks referential integrity: a domain shared between `email` and `name` would try to maintain a bidirectional map between incompatible pseudonym types. |
32
+ | `inconsistent-domain-strategy` | error | The same domain name is used with two or more different strategies. This breaks referential integrity: a domain shared between incompatible generators (for example `faker` with different `faker` targets, or `faker` vs `hash`) cannot maintain a single stable mapping. |
33
33
  | `uncovered-sensitive-column` | error | A column listed in `[sensitive_columns]` has no matching anonymization rule or case. The column will pass through unmodified, making the sensitive declaration misleading. |
34
34
 
35
35
  ---
@@ -6,7 +6,7 @@ Use `--format` to declare the SQL dialect of your input file:
6
6
 
7
7
  | Value | Description |
8
8
  |---|---|
9
- | `postgres` (default) | PostgreSQL `pg_dump` plain-text format. Supports `COPY … FROM stdin` blocks, `"double-quoted"` identifiers, `''`-escaped strings. |
9
+ | `postgres` (default) | PostgreSQL `pg_dump` plain-text format. Supports `COPY … FROM stdin` blocks, `"double-quoted"` identifiers, `''`-escaped strings. Custom-format (`-Fc`) or directory dumps can be decoded on the fly with `dumpling --dump-decode` (wraps `pg_restore -f -`; requires client tools). By default the archive is deleted after success; use `--dump-decode-keep-input` to retain it. |
10
10
  | `sqlite` | SQLite `.dump` format. Adds `INSERT OR REPLACE INTO` / `INSERT OR IGNORE INTO` support. No COPY blocks. |
11
11
  | `mssql` | SQL Server / MSSQL plain SQL. Adds `[bracket]` identifier quoting, `N'…'` Unicode string literals, and `nvarchar(n)` / `nchar(n)` length extraction. No COPY blocks. |
12
12
 
@@ -17,6 +17,28 @@ dumpling --format sqlite -i data.db.sql -o anonymized.sql
17
17
  dumpling --format mssql -i backup.sql -o anonymized.sql
18
18
  ```
19
19
 
20
+ ### PostgreSQL custom-format archives (`--dump-decode`)
21
+
22
+ Heroku PGBackups and many pipelines ship **`pg_dump` custom format** (`-Fc`) or **directory-format** dumps to save bandwidth. Dumpling’s SQL engine still expects **plain text**; use **`--dump-decode`** so Dumpling runs **`pg_restore -f -`** (script to stdout, no database) and pipes the result through the same anonymizer as a normal plain-SQL file.
23
+
24
+ **Requirements:** PostgreSQL client tools on `PATH` (`pg_restore`), or set **`--pg-restore-path`**. Use **`--dump-decode-arg`** (repeatable) for extra `pg_restore` flags, e.g. `--dump-decode-arg=--no-owner --dump-decode-arg=--no-acl`.
25
+
26
+ **Input deletion:** After a **fully successful** run, Dumpling **removes** the `--input` path (single file or directory-format folder) by default so only the anonymized output remains. Pass **`--dump-decode-keep-input`** to retain the archive.
27
+
28
+ **Check mode:** **`--check`** with **`--dump-decode`** requires **`--dump-decode-keep-input`**. Otherwise the default would delete the dump before you can iterate on config.
29
+
30
+ Example (e.g. after `heroku pg:backups:download`):
31
+
32
+ ```bash
33
+ dumpling --dump-decode -i latest.dump -c .dumplingconf -o anonymized.sql
34
+ ```
35
+
36
+ Dry run while keeping the downloaded file:
37
+
38
+ ```bash
39
+ dumpling --dump-decode --dump-decode-keep-input --check -i latest.dump -c .dumplingconf
40
+ ```
41
+
20
42
  ---
21
43
 
22
44
  ## Configuration sources
@@ -31,6 +53,16 @@ If no configuration is found, Dumpling fails closed by default and exits non-zer
31
53
  Error output includes every checked location. If you intentionally want a no-op
32
54
  run, pass `--allow-noop`.
33
55
 
56
+ ## Faker strategy and the `fake` crate
57
+
58
+ When you use `strategy = "faker"` with `faker = "module::Type"`, those names align with the Rust [**`fake`**](https://crates.io/crates/fake) crate’s [`faker`](https://docs.rs/fake/latest/fake/faker/index.html) modules (for example `name::FirstName` ↔ `fake::faker::name::raw::FirstName`). Use the upstream docs to discover available generators and options:
59
+
60
+ - [docs.rs — `fake` (crate overview)](https://docs.rs/fake/latest/fake/)
61
+ - [docs.rs — `fake::faker` (all faker submodules)](https://docs.rs/fake/latest/fake/faker/index.html)
62
+ - [GitHub — `cksac/fake-rs` (source + README)](https://github.com/cksac/fake-rs)
63
+
64
+ Dumpling only exposes a **subset** wired in `src/faker_dispatch.rs`; unsupported `module::Type` pairs fail at config load.
65
+
34
66
  ## Baseline config template
35
67
 
36
68
  ```toml
@@ -38,7 +70,7 @@ salt = "${DUMPLING_GLOBAL_SALT}"
38
70
 
39
71
  [rules."public.users"]
40
72
  email = { strategy = "hash", salt = "${env:DUMPLING_USERS_EMAIL_SALT}", as_string = true }
41
- name = { strategy = "name" }
73
+ full_name = { strategy = "faker", faker = "name::Name" }
42
74
 
43
75
  [sensitive_columns]
44
76
  "public.users" = ["employee_number", "tax_id"]
@@ -184,6 +216,15 @@ Nested JSON targeting is supported in predicate `column` values via either:
184
216
  When a JSON path traverses an array, Dumpling checks each element (useful for
185
217
  list-of-dicts JSON structures).
186
218
 
219
+ ### JSON path rules (`json` / `jsonb` columns)
220
+
221
+ You can anonymize values **inside** a text column that holds JSON using the same path syntax as row-filter predicates, but on **`[rules]` keys**:
222
+
223
+ - Dot notation: `"payload.profile.email" = { strategy = "email", domain = "orders_email", as_string = true }`
224
+ - Django-style: `"payload__profile__email" = { strategy = "hash", salt = "${env:ORDER_SECRET_SALT}", as_string = true }`
225
+
226
+ The part before the first dot or `__` is the **SQL column name**; the rest is the path inside the parsed JSON document. Use **quoted** keys in TOML when the name contains dots. For a given table, you can use **either** path-level rules for a column **or** one whole-column rule for that column’s base name, not both (Dumpling rejects the conflict at startup). If a path is missing in a given row, that rule is skipped for that row. When only path rules apply (no whole-column rule), the rest of the JSON is left unchanged. Path rules are applied in **longest-path-first** order. `column_cases` still match the SQL column name only; use `when` predicates with nested `column` paths to branch on JSON content.
227
+
187
228
  ## Safety recommendations
188
229
 
189
230
  - Prefer deterministic runs in CI by passing `--seed` (or `DUMPLING_SEED`).
@@ -202,7 +243,7 @@ list-of-dicts JSON structures).
202
243
  - Sensitive columns are detected by:
203
244
  1. built-in column-name patterns, and
204
245
  2. explicit per-table lists under `[sensitive_columns]`.
205
- - A sensitive column is considered covered only if it has an explicit `rules` or `column_cases` entry.
246
+ - A sensitive column is considered covered only if it has an explicit `rules` or `column_cases` entry (including JSON path rules whose base name is that column, e.g. `payload.x.y` covers `payload`).
206
247
  - If uncovered sensitive columns are found, Dumpling exits non-zero.
207
248
 
208
249
  When `--report` is enabled, coverage fields are added to JSON output:
@@ -0,0 +1,33 @@
1
+ # Getting started
2
+
3
+ ## Prerequisites
4
+
5
+ - Rust **stable** toolchain (`rustup` recommended). The repo includes `rust-toolchain.toml` (stable + `rustfmt` + `clippy`) so CI and local `cargo` stay aligned.
6
+ - `cargo` on your `PATH`
7
+
8
+ Optional: run **`./scripts/setup-dev.sh`** once from the repo root — it installs the toolchain components, runs **`cargo fetch`**, and installs a pinned **mdBook** under `.tools/` for the same docs build that CI uses.
9
+
10
+ ## Build
11
+
12
+ ```bash
13
+ cargo build --release
14
+ ./target/release/dumpling --help
15
+ ```
16
+
17
+ ## Test locally
18
+
19
+ ```bash
20
+ cargo fmt --all -- --check
21
+ cargo clippy --all-targets --all-features
22
+ cargo test --all-targets --all-features
23
+ ```
24
+
25
+ ## Run against a dump
26
+
27
+ ```bash
28
+ dumpling -i dump.sql -o sanitized.sql
29
+ ```
30
+
31
+ If your input is a PostgreSQL **custom-format** file (not plain SQL), decode and anonymize in one step with **`--dump-decode`** (needs `pg_restore` from PostgreSQL client tools). See [PostgreSQL custom-format archives](configuration.md#postgresql-custom-format-archives---dump-decode) in the configuration guide.
32
+
33
+ For full command examples and strategy options, see the repository `README.md`.
@@ -0,0 +1,19 @@
1
+ # Dumpling documentation
2
+
3
+ Dumpling is a streaming anonymizer for plain SQL dumps. It supports PostgreSQL (`pg_dump` plain format), SQLite (`.dump`), and SQL Server / MSSQL (SSMS / mssql-scripter plain SQL output). For PostgreSQL **custom-format** archives (e.g. Heroku `pg:backups:download`), use **`--dump-decode`** so Dumpling invokes `pg_restore` and streams plain SQL—see [Dump format](configuration.md#postgresql-custom-format-archives---dump-decode) in the configuration guide.
4
+
5
+ This documentation covers the operating model for day-to-day use:
6
+
7
+ - how to build and run Dumpling locally,
8
+ - how to configure transformation behavior safely,
9
+ - how CI validates quality before changes merge,
10
+ - and how maintainers produce tagged releases.
11
+
12
+ ## Documentation quality gate
13
+
14
+ The mdBook site is built in CI as follows:
15
+
16
+ - **Pull requests:** the **Docs (PR)** workflow runs `mdbook build` when docs-related paths change (no deploy).
17
+ - **`main`:** the **Docs** workflow builds and deploys to GitHub Pages when docs-related paths change.
18
+
19
+ This keeps the docs in a continuously deployable state instead of drifting from the codebase.
@@ -4,7 +4,7 @@ build-backend = "maturin"
4
4
 
5
5
  [project]
6
6
  name = "dumpling-cli"
7
- version = "0.2.0"
7
+ version = "0.4.0"
8
8
  description = "Static anonymizer for plain SQL dumps (PostgreSQL, SQLite, SQL Server)."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
@@ -0,0 +1,3 @@
1
+ [toolchain]
2
+ channel = "stable"
3
+ components = ["rustfmt", "clippy"]