dumpling-cli 0.6.0__tar.gz → 0.7.0a0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. {dumpling_cli-0.6.0 → dumpling_cli-0.7.0a0}/.dumplingconf.example +4 -2
  2. {dumpling_cli-0.6.0 → dumpling_cli-0.7.0a0}/.github/workflows/ci.yml +1 -1
  3. {dumpling_cli-0.6.0 → dumpling_cli-0.7.0a0}/.github/workflows/platform-compat-latest.yml +1 -1
  4. {dumpling_cli-0.6.0 → dumpling_cli-0.7.0a0}/.github/workflows/platform-compat-matrix.yml +1 -1
  5. {dumpling_cli-0.6.0 → dumpling_cli-0.7.0a0}/.github/workflows/policy-lint.yml +1 -1
  6. {dumpling_cli-0.6.0 → dumpling_cli-0.7.0a0}/.github/workflows/publish.yml +1 -1
  7. {dumpling_cli-0.6.0 → dumpling_cli-0.7.0a0}/.github/workflows/release.yml +1 -1
  8. {dumpling_cli-0.6.0 → dumpling_cli-0.7.0a0}/.github/workflows/tests.yml +1 -1
  9. {dumpling_cli-0.6.0 → dumpling_cli-0.7.0a0}/AGENTS.md +1 -1
  10. {dumpling_cli-0.6.0 → dumpling_cli-0.7.0a0}/CHANGELOG.md +13 -1
  11. {dumpling_cli-0.6.0 → dumpling_cli-0.7.0a0}/Cargo.lock +1 -1
  12. {dumpling_cli-0.6.0 → dumpling_cli-0.7.0a0}/Cargo.toml +1 -1
  13. {dumpling_cli-0.6.0 → dumpling_cli-0.7.0a0}/PKG-INFO +207 -188
  14. {dumpling_cli-0.6.0 → dumpling_cli-0.7.0a0}/README.md +206 -187
  15. {dumpling_cli-0.6.0 → dumpling_cli-0.7.0a0}/docs/src/configuration.md +77 -0
  16. dumpling_cli-0.7.0a0/docs/src/getting-started.md +45 -0
  17. {dumpling_cli-0.6.0 → dumpling_cli-0.7.0a0}/docs/src/index.md +2 -0
  18. {dumpling_cli-0.6.0 → dumpling_cli-0.7.0a0}/pyproject.toml +1 -1
  19. {dumpling_cli-0.6.0 → dumpling_cli-0.7.0a0}/src/filter.rs +81 -0
  20. {dumpling_cli-0.6.0 → dumpling_cli-0.7.0a0}/src/lint.rs +3 -0
  21. {dumpling_cli-0.6.0 → dumpling_cli-0.7.0a0}/src/main.rs +272 -55
  22. dumpling_cli-0.7.0a0/src/scaffold.rs +280 -0
  23. {dumpling_cli-0.6.0 → dumpling_cli-0.7.0a0}/src/scan.rs +2 -1
  24. {dumpling_cli-0.6.0 → dumpling_cli-0.7.0a0}/src/seal.rs +9 -25
  25. {dumpling_cli-0.6.0 → dumpling_cli-0.7.0a0}/src/settings.rs +83 -7
  26. {dumpling_cli-0.6.0 → dumpling_cli-0.7.0a0}/src/sql.rs +811 -121
  27. {dumpling_cli-0.6.0 → dumpling_cli-0.7.0a0}/src/transform.rs +323 -2
  28. dumpling_cli-0.6.0/docs/src/getting-started.md +0 -33
  29. {dumpling_cli-0.6.0 → dumpling_cli-0.7.0a0}/.github/workflows/docs-pr.yml +0 -0
  30. {dumpling_cli-0.6.0 → dumpling_cli-0.7.0a0}/.github/workflows/docs.yml +0 -0
  31. {dumpling_cli-0.6.0 → dumpling_cli-0.7.0a0}/.gitignore +0 -0
  32. {dumpling_cli-0.6.0 → dumpling_cli-0.7.0a0}/CONTRIBUTING.md +0 -0
  33. {dumpling_cli-0.6.0 → dumpling_cli-0.7.0a0}/CONTRIBUTORS.md +0 -0
  34. {dumpling_cli-0.6.0 → dumpling_cli-0.7.0a0}/MAINTENANCE.md +0 -0
  35. {dumpling_cli-0.6.0 → dumpling_cli-0.7.0a0}/assets/logo.svg +0 -0
  36. {dumpling_cli-0.6.0 → dumpling_cli-0.7.0a0}/book.toml +0 -0
  37. {dumpling_cli-0.6.0 → dumpling_cli-0.7.0a0}/datetime_out.sql +0 -0
  38. {dumpling_cli-0.6.0 → dumpling_cli-0.7.0a0}/datetime_sample.sql +0 -0
  39. {dumpling_cli-0.6.0 → dumpling_cli-0.7.0a0}/docs/src/SUMMARY.md +0 -0
  40. {dumpling_cli-0.6.0 → dumpling_cli-0.7.0a0}/docs/src/ci-guardrails.md +0 -0
  41. {dumpling_cli-0.6.0 → dumpling_cli-0.7.0a0}/docs/src/releasing.md +0 -0
  42. {dumpling_cli-0.6.0 → dumpling_cli-0.7.0a0}/rust-toolchain.toml +0 -0
  43. {dumpling_cli-0.6.0 → dumpling_cli-0.7.0a0}/scripts/setup-dev.sh +0 -0
  44. {dumpling_cli-0.6.0 → dumpling_cli-0.7.0a0}/src/faker_dispatch.rs +0 -0
  45. {dumpling_cli-0.6.0 → dumpling_cli-0.7.0a0}/src/report.rs +0 -0
@@ -59,8 +59,10 @@ last_login = { strategy = "datetime_fuzz" }
59
59
  wake_time = { strategy = "time_fuzz", min_seconds = -3600, max_seconds = 3600 }
60
60
 
61
61
  [rules."public.orders"]
62
- # credit card — redact entirely; force as quoted string
63
- credit_card = { strategy = "redact", as_string = true }
62
+ # credit card — Luhn-valid synthetic PAN (length 13–19); use domain for stable FKs across dumps
63
+ credit_card = { strategy = "payment_card", length = 16, domain = "order_pan" }
64
+ # monetary / numeric — random decimal in range with fixed fractional digits
65
+ order_total = { strategy = "decimal", min = 0, max = 99999, scale = 2, domain = "order_amount" }
64
66
  # keep the same anonymized email as users table via shared domain
65
67
  customer_email = { strategy = "faker", faker = "internet::SafeEmail", domain = "customer_identity" }
66
68
 
@@ -23,7 +23,7 @@ jobs:
23
23
  components: rustfmt, clippy
24
24
 
25
25
  - name: Cache Cargo build artifacts
26
- uses: Swatinem/rust-cache@v2
26
+ uses: Swatinem/rust-cache@v2.9.1
27
27
 
28
28
  - name: Check formatting
29
29
  run: cargo fmt --all -- --check
@@ -28,7 +28,7 @@ jobs:
28
28
  uses: dtolnay/rust-toolchain@stable
29
29
 
30
30
  - name: Cache Cargo build artifacts
31
- uses: Swatinem/rust-cache@v2
31
+ uses: Swatinem/rust-cache@v2.9.1
32
32
 
33
33
  - name: Build release binary
34
34
  run: cargo build --release --locked
@@ -26,7 +26,7 @@ jobs:
26
26
  uses: dtolnay/rust-toolchain@stable
27
27
 
28
28
  - name: Cache Cargo build artifacts
29
- uses: Swatinem/rust-cache@v2
29
+ uses: Swatinem/rust-cache@v2.9.1
30
30
 
31
31
  - name: Build release binary
32
32
  run: cargo build --release --locked
@@ -38,7 +38,7 @@ jobs:
38
38
  uses: dtolnay/rust-toolchain@stable
39
39
 
40
40
  - name: Cache Cargo build artifacts
41
- uses: Swatinem/rust-cache@v2
41
+ uses: Swatinem/rust-cache@v2.9.1
42
42
 
43
43
  - name: Build dumpling
44
44
  run: cargo build --release --locked
@@ -38,7 +38,7 @@ jobs:
38
38
  uses: dtolnay/rust-toolchain@stable
39
39
 
40
40
  - name: Cache Cargo build artifacts
41
- uses: Swatinem/rust-cache@v2
41
+ uses: Swatinem/rust-cache@v2.9.1
42
42
 
43
43
  - name: Set up Python
44
44
  uses: actions/setup-python@v6
@@ -21,7 +21,7 @@ jobs:
21
21
  components: rustfmt, clippy
22
22
 
23
23
  - name: Cache Cargo build artifacts
24
- uses: Swatinem/rust-cache@v2
24
+ uses: Swatinem/rust-cache@v2.9.1
25
25
 
26
26
  - name: Validate formatting
27
27
  run: cargo fmt --all -- --check
@@ -21,7 +21,7 @@ jobs:
21
21
  uses: dtolnay/rust-toolchain@stable
22
22
 
23
23
  - name: Cache Cargo build artifacts
24
- uses: Swatinem/rust-cache@v2
24
+ uses: Swatinem/rust-cache@v2.9.1
25
25
 
26
26
  - name: Run cargo tests
27
27
  run: cargo test --all-targets --all-features
@@ -221,7 +221,7 @@ Follow these steps in order. Do not skip any step.
221
221
 
222
222
  7. **Tests**: Add `#[test]` functions in `src/transform.rs` (unit-test strategy output values) and in `src/sql.rs` (end-to-end pipeline test). Use `set_random_seed(N)` for reproducibility.
223
223
 
224
- 8. **`README.md`**: Add a row to the "Anonymization strategies" table.
224
+ 8. **`README.md`**: Document the strategy in the *Anonymization strategies* section (add a per-strategy subsection listing its accepted options), and mention any new spec fields in `AnonymizerSpec`’s doc comment in `settings.rs`.
225
225
 
226
226
  **`faker` strategy:** Config only carries string identifiers; Dumpling never evaluates user Rust from config. To ship a new generator, add dispatch in `src/faker_dispatch.rs` and validation in `validate_anonymizer_spec` for the `faker` branch. Upstream reference: [`fake` on docs.rs](https://docs.rs/fake/latest/fake/), [`fake::faker` module index](https://docs.rs/fake/latest/fake/faker/index.html), [source on GitHub](https://github.com/cksac/fake-rs).
227
227
 
@@ -7,11 +7,23 @@ and this project follows [Semantic Versioning](https://semver.org/spec/v2.0.0.ht
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [0.7.0-alpha] - 2026-05-04
11
+
12
+ Pre-release toward **0.7.0** (stable **0.7.0** is not published yet; crates use the **0.7.0-alpha** prerelease identifier until then).
13
+
14
+ ### Removed
15
+
16
+ - **`--include-table` / `--exclude-table`**: these CLI flags and the associated per-table skip logic in the SQL stream processor are removed. Anonymize the full dump, or split/filter dumps outside Dumpling if you need a smaller input.
17
+
18
+ ### Changed
19
+
20
+ - **Dump seal (`v=3`):** the fingerprint JSON no longer includes table-filter fields. Seals produced by Dumpling **0.6.x** (`v=2`) will not match **0.7.x**; the first line is treated as stale and the dump is re-processed.
21
+
10
22
  ## [0.6.0] - 2026-05-03
11
23
 
12
24
  ### Added
13
25
 
14
- - **Dump seal** (leading `-- dumpling-seal:` SQL comment): records Dumpling version, security profile, a SHA-256 fingerprint of the resolved policy, and runtime CLI options that affect transforms (`--format`, sorted `--include-table` / `--exclude-table`, effective PRNG seed in standard profile). When the input already begins with a **matching** seal, the remainder is copied through unchanged; stale or unknown seal lines are stripped and the dump is re-processed. See README for full semantics ([#58](https://github.com/ababic/dumpling/pull/58)).
26
+ - **Dump seal** (leading `-- dumpling-seal:` SQL comment): records Dumpling version, security profile, a SHA-256 fingerprint of the resolved policy, and runtime CLI options that affect transforms (`--format` and the effective PRNG seed in standard profile; `null` in hardened, where seeds are ignored). When the input already begins with a **matching** seal, the remainder is copied through unchanged; stale or unknown seal lines are stripped and the dump is re-processed. See README for full semantics ([#58](https://github.com/ababic/dumpling/pull/58)).
15
27
  - **`--stats`**: prints `wall_ms` plus `domain_cache_hits` and `domain_cache_misses` for quick profiling of large runs ([#59](https://github.com/ababic/dumpling/pull/59)).
16
28
  - **`CONTRIBUTORS.md`** ([#59](https://github.com/ababic/dumpling/pull/59)).
17
29
 
@@ -262,7 +262,7 @@ dependencies = [
262
262
 
263
263
  [[package]]
264
264
  name = "dumpling"
265
- version = "0.6.0"
265
+ version = "0.7.0-alpha"
266
266
  dependencies = [
267
267
  "anyhow",
268
268
  "chrono",
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "dumpling"
3
- version = "0.6.0"
3
+ version = "0.7.0-alpha"
4
4
  edition = "2021"
5
5
  readme = "README.md"
6
6
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dumpling-cli
3
- Version: 0.6.0
3
+ Version: 0.7.0a0
4
4
  Classifier: Development Status :: 4 - Beta
5
5
  Classifier: Environment :: Console
6
6
  Classifier: Intended Audience :: Developers
@@ -26,7 +26,7 @@ Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
26
26
  <h1 align="center">Dumpling</h1>
27
27
 
28
28
  <p align="center">
29
- <strong>Sanitize SQL dumps before they go anywhere.</strong><br />
29
+ <strong>Sanitize database dumps before they go anywhere.</strong><br />
30
30
  Turn huge <code>pg_dump</code> / SQLite / SQL Server exports into shareable, test-friendly snapshots — no DB connection, no secrets left by accident.
31
31
  </p>
32
32
 
@@ -55,10 +55,12 @@ Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
55
55
 
56
56
  ## Why Dumpling?
57
57
 
58
+ - **Rich built-in strategies** — from fast clears (`null`, `redact`, `blank`, `empty_array` / `empty_object`) and bounded fakes (`int_range`, `decimal`, `string`) to realistic stand-ins (`email`, `name`, `payment_card`, `faker`, date/time fuzz), with optional **`domain`** so the same source value stays consistent across tables.
59
+ - **JSON inside columns** — target paths inside `json` / `jsonb` text with the same dot or `__` syntax you use elsewhere; pair with row filters on nested fields.
60
+ - **Row-level control** — **`retain`** and **`delete`** predicates (including nested JSON paths) drop or keep whole rows before transforms run.
58
61
  - **Offline by design** — works on dump files only; nothing connects to your database.
59
62
  - **Streams giant files** — line-by-line processing keeps multi‑GB dumps reasonable on modest hardware.
60
63
  - **Fails loud, not silent** — missing config exits non‑zero and lists where Dumpling looked; use `--allow-noop` only when you mean it.
61
- - **Stable pseudonyms** — optional domain mappings keep the same source value as the same fake value across tables (foreign keys stay consistent).
62
64
  - **Pipeline-ready** — `--check`, strict coverage, JSON reports, and residual PII scans fit pre-merge gates and release automation.
63
65
  - **Configure once** — `.dumplingconf` or `[tool.dumpling]` in `pyproject.toml`; install via **Rust** (`cargo`) or **`pip install dumpling-cli`**.
64
66
 
@@ -95,6 +97,21 @@ dumpling --help
95
97
 
96
98
  ---
97
99
 
100
+ ## Getting started
101
+
102
+ Follow these steps once; you will have a working path from “raw dump” to “first sanitized output,” then you can deepen coverage using the rest of this README and the [documentation site](https://ababic.github.io/dumpling/).
103
+
104
+ 1. **Start from the example policy** — Copy [`.dumplingconf.example`](.dumplingconf.example) to `.dumplingconf` in your project root (or merge the same keys under `[tool.dumpling]` in `pyproject.toml`). Set environment variables for `salt` and any `${…}` references so Dumpling can resolve secrets at startup.
105
+ 2. **Name your tables and columns** — Open your dump next to the config. `CREATE TABLE`, `COPY … (…)` and `INSERT INTO … (…)` lines list the identifiers you need for `[rules."table"]` or `[rules."schema.table"]` (see [Configuration (TOML)](#configuration-toml) below). Trim the example rules down to the tables you care about first, then add columns and strategies as you go.
106
+ 3. **Run Dumpling** — `dumpling -i dump.sql -o sanitized.sql` (add `-c path` if the config is not in the default search path). Use `dumpling --check -i dump.sql` when you only want to know whether anything would change.
107
+ 4. **Tighten the policy** — Run `dumpling lint-policy` on your config. When you are ready for stricter gates, add `[sensitive_columns]` and use `--strict-coverage` / `--report` / `--scan-output` as described under [Usage](#usage).
108
+
109
+ **Draft policy generation (planned)** — A future command will stream a dump and emit a **draft** starter TOML so you spend less time hunting for table and column names and for basic DDL hints (for example `varchar(N)` lengths). Output will be explicitly **draft**: always review and edit before production or compliance workflows; it is a time-saver, not a full policy.
110
+
111
+ The same flow is spelled out in the docs: [Getting started](https://ababic.github.io/dumpling/getting-started.html).
112
+
113
+ ---
114
+
98
115
  ## Usage
99
116
 
100
117
  ```bash
@@ -108,13 +125,10 @@ dumpling --report report.json -i dump.sql # write detailed JSON report of
108
125
  dumpling --strict-coverage --report report.json -i dump.sql --check # fail on uncovered sensitive columns
109
126
  dumpling --scan-output --report report.json -i dump.sql # scan transformed output for residual PII-like patterns
110
127
  dumpling --scan-output --fail-on-findings --report report.json -i dump.sql --check # fail if scan thresholds are exceeded
111
- dumpling --include-table '^public\\.' -i dump.sql -o out.sql
112
- dumpling --exclude-table '^audit\\.' -i dump.sql -o out.sql
113
128
  dumpling --allow-ext dmp -i data.dmp # restrict processing to specific extensions
114
129
  dumpling --allow-noop -i dump.sql -o out.sql # explicitly allow no-op when config is missing
115
130
  dumpling --format sqlite -i data.db.sql -o out.sql # process a SQLite .dump file
116
131
  dumpling --format mssql -i backup.sql -o out.sql # process a SQL Server plain-SQL dump
117
- dumpling --security-profile hardened -i dump.sql -o sanitized.sql # hardened CSPRNG + HMAC mode
118
132
  dumpling lint-policy # lint the anonymization policy config
119
133
  dumpling lint-policy --config .dumplingconf # lint with explicit config path
120
134
  ```
@@ -129,15 +143,194 @@ If no configuration is found, Dumpling fails closed by default and exits non-zer
129
143
  The error output lists every checked location. Use `--allow-noop` to explicitly
130
144
  permit no-op behavior.
131
145
 
132
- ### Dump seal (always on)
146
+ The **dump seal** comment prefixed to successful output and **`--security-profile hardened`** are documented in the [configuration guide](https://ababic.github.io/dumpling/configuration.html) (see *Dump seal* and *Hardened security profile*).
147
+
148
+ ---
149
+
150
+ ## Anonymization strategies
151
+
152
+ Column rules live under `[rules."schema.table"]` (or `[rules."table"]`) as inline tables: `{ strategy = "<name>", ... }`. **Strategy-specific keys** are documented next to the strategy that accepts them. A few keys apply across many strategies; see [Cross-cutting options](#cross-cutting-options) below.
153
+
154
+ #### Choosing a strategy (cheaper vs more realistic)
155
+
156
+ Prefer **lightweight** strategies when nothing downstream requires lifelike values: **`null`**, **`redact`**, **`blank`**, **`empty_array`**, **`empty_object`**, **`string`**, **`int_range`**, and **`decimal`** are cheap to generate (simple constants, random digits/alnum, or bounded numeric shapes). Use **`blank`** for NOT NULL text where you must clear content without SQL NULL; use **`empty_array`** / **`empty_object`** on JSON path rules (or text columns holding JSON) when the document must keep `[]` / `{}` instead of `null` or `""`.
157
+
158
+ Reach for **richer** strategies when realism matters for restores, demos, or tests that exercise parsers and validators: **`email`**, **`name`**, **`first_name`**, **`last_name`**, **`phone`**, **`faker`**, **`uuid`**, **`hash`**, **`payment_card`**, and the **`date_fuzz` / `time_fuzz` / `datetime_fuzz`** family do more work (formatting, parsing, digest, or upstream generators). If a cheap strategy would break **CHECK constraints**, **NOT NULL**, **foreign-key shape**, or **import tooling** that validates formats, switch to a strategy that emits compatible values—or keep **`domain`** on the heavier strategy so referential consistency is preserved where you need it.
159
+
160
+ #### `null`
161
+
162
+ - **Behavior:** emit SQL `NULL` for the cell.
163
+ - **Options:** none. (`domain` is rejected.)
164
+
165
+ #### `redact`
166
+
167
+ - **Behavior:** replace with the literal `REDACTED`.
168
+ - **`as_string`:** if `true`, the replacement is always a single-quoted SQL string; if `false`, it is emitted without quotes (still valid as an identifier-like token in many dumps). When the **original** cell was already a quoted string, Dumpling quotes the output even when `as_string` is omitted—see [Cross-cutting options](#cross-cutting-options).
169
+
170
+ #### `blank`
171
+
172
+ - **Behavior:** replace with an **empty string** (`''` in SQL when quoted). If the source cell is SQL **`NULL`**, the cell stays **`NULL`** (same as `null` / `redact` semantics for missing values).
173
+ - **Options:** none. (`domain` is rejected.) **`as_string`** is ignored; output is always the empty string literal when non-NULL.
174
+
175
+ #### `empty_array` / `empty_object`
176
+
177
+ - **Behavior:** replace with the JSON tokens **`[]`** and **`{}`** as **unquoted** SQL/COPY tokens (so they parse as JSON when the column holds JSON). If the source cell is SQL **`NULL`**, the cell stays **`NULL`**.
178
+ - **JSON path rules:** use these on leaves that are JSON **arrays** or **objects** when you need a typed empty container instead of `null` or `""`.
179
+ - **Options:** none. (`domain` is rejected.)
180
+
181
+ #### `uuid`
182
+
183
+ - **Behavior:** random UUIDv4-like hyphenated hex string.
184
+ - **`as_string`:** same meaning as for `redact` / `hash` (force quoted literal vs. unquoted token).
185
+
186
+ #### `hash`
187
+
188
+ - **Behavior:** salted digest of the original cell value (SHA-256 by default; see [configuration guide — Hardened security profile](https://ababic.github.io/dumpling/configuration.html#hardened-security-profile) for HMAC mode).
189
+ - **`salt`:** optional per-column salt; otherwise the top-level `salt` or registry default applies.
190
+ - **`as_string`:** if `true`, force a quoted string literal; if `false`, unquoted hex. Quoted **source** cells are still written quoted when `as_string` is omitted.
191
+
192
+ #### `email`, `name`, `first_name`, `last_name`, `phone`
193
+
194
+ - **Behavior:** locale-aware fake values (same underlying generators as the matching `faker` targets).
195
+ - **`locale`:** optional; one of `en`, `fr_fr`, `de_de`, `it_it`, `pt_br`, `pt_pt`, `ar_sa`, `zh_cn`, `zh_tw`, `ja_jp`, `cy_gb` (default `en`).
196
+ - **Output:** always emitted as a quoted string replacement.
197
+
198
+ #### `int_range`
199
+
200
+ - **Behavior:** random integer in the inclusive range `[min, max]` (defaults `min = 0`, `max = 1_000_000`).
201
+ - **`min` / `max`:** inclusive bounds; `min` must be ≤ `max`.
202
+ - **Output:** always unquoted digits (suitable for integer / JSON number columns).
133
203
 
134
- Every successful run that writes output prefixes the stream with a single-line SQL comment:
204
+ #### `decimal`
135
205
 
136
- `-- dumpling-seal: v=2 version=<semver> profile=<standard|hardened> sha256=<64 hex chars>`
206
+ - **Behavior:** random decimal with integer part in `[min, max]` and fractional part of **`scale`** digits (defaults `min = 0`, `max = 1_000_000`, `scale = 2`). Use `scale = 0` for a plain integer string in the same range.
207
+ - **`min` / `max`:** inclusive integer-part bounds.
208
+ - **`scale`:** number of digits after `.` (0–38).
209
+ - **`as_string`:** same as `hash` / `redact` for quoting the full literal.
137
210
 
138
- The `sha256` is over canonical JSON that includes the Dumpling version, the active security profile, a stable encoding of the resolved policy (rules, row filters, column cases, sensitive columns, output scan, global salt), and **runtime options** that affect transforms: `--format`, sorted `--include-table` / `--exclude-table` patterns, and the effective `--seed` / `DUMPLING_SEED` value in standard profile (`null` in hardened, where seeds are ignored).
211
+ #### `payment_card`
212
+
213
+ - **Behavior:** random digit string of length **`length`** (default **16**) with a **valid Luhn check digit**, so `--scan-output` PAN detection treats synthetic values like test cards, not arbitrary digit runs.
214
+ - **`length`:** total digit count including check digit; must be 13–19 (PAN lengths).
215
+ - **Output:** always a quoted string of digits (no separators).
216
+
217
+ #### `string`
218
+
219
+ - **Behavior:** random alphanumeric string.
220
+ - **`length`:** character count (default 12); must be ≥ 1 when set.
221
+
222
+ #### `faker`
223
+
224
+ - **Behavior:** values from the Rust [`fake`](https://crates.io/crates/fake) crate ([`faker` modules](https://docs.rs/fake/latest/fake/faker/index.html)), selected only by the string **`faker = "module::Type"`** (e.g. `internet::SafeEmail`). Config is **data only**—nothing from TOML is compiled as Rust. Unsupported pairs fail at config load; new generators require a **new Dumpling release** (or a fork), not config-side code.
225
+ - **`faker`:** required; maps to a built-in allowlist in `src/faker_dispatch.rs`.
226
+ - **`locale`:** optional; same set as the built-in PII strategies when the upstream generator is locale-aware.
227
+ - **`min` / `max` / `length` / `format`:** accepted only for `faker` targets whose upstream generator supports them (e.g. `number::NumberWithFormat` uses **`format`**: `#` = any digit, `^` = 1–9 per [`fake` docs](https://docs.rs/fake/latest/fake/)).
228
+
229
+ **Upstream reference:** [docs.rs — `fake`](https://docs.rs/fake/latest/fake/), [docs.rs — `fake::faker`](https://docs.rs/fake/latest/fake/faker/index.html), [GitHub — cksac/fake-rs](https://github.com/cksac/fake-rs).
230
+
231
+ #### `date_fuzz`, `time_fuzz`, `datetime_fuzz`
232
+
233
+ - **Behavior:** parse the existing value when possible and shift by a random offset; on parse failure the original string is kept.
234
+ - **`date_fuzz`:** **`min_days` / `max_days`** (defaults `-30` … `30`).
235
+ - **`time_fuzz` / `datetime_fuzz`:** **`min_seconds` / `max_seconds`** (`time_fuzz` defaults `-300` … `300`; `datetime_fuzz` defaults `-86400` … `86400`).
236
+ - **`as_string`:** force quoted literal vs. unquoted token for the emitted date/time/timestamp string.
237
+
238
+ ### Cross-cutting options
239
+
240
+ These keys are valid on **multiple** strategies (unless validation says otherwise):
241
+
242
+ - **`domain`:** deterministic mapping bucket. The same non-NULL source value maps to the same pseudonym for that strategy inside the domain (across tables/columns). **SQL `NULL` is always preserved**—no fabricated FK targets.
243
+ - **`unique_within_domain`:** when `true` (requires `domain`), different source values are assigned distinct pseudonyms within the domain.
244
+ - **`as_string`:** when `true`, force the replacement to render as a **single-quoted SQL string literal**. When `false` or omitted, Dumpling still quotes the output if the **original** cell was quoted (`render_cell` uses `force_quoted || original.was_quoted`). Set `as_string = true` when the source may be unquoted (numeric-looking literals, some `COPY` shapes) but you need a string literal in the dump.
245
+
246
+ ---
247
+
248
+ ## Conditional per-column cases
249
+
250
+ Define default strategies in `rules."<table>"` and add ordered per-column cases in `column_cases."<table>"."<column>"`. For each row and column, Dumpling applies the first matching case; if none match, it falls back to the default from `rules`.
251
+
252
+ ```toml
253
+ [rules."public.users"]
254
+ email = { strategy = "hash", as_string = true } # default
255
+ name = { strategy = "name" }
256
+
257
+ [[column_cases."public.users".email]]
258
+ when.any = [{ column = "is_admin", op = "eq", value = "true" }]
259
+ strategy = { strategy = "redact", as_string = true }
260
+
261
+ [[column_cases."public.users".email]]
262
+ when.any = [{ column = "country", op = "in", values = ["DE","FR","GB"] }]
263
+ strategy = { strategy = "hash", salt = "eu-salt", as_string = true }
264
+ ```
265
+
266
+ - `when.any` is OR, `when.all` is AND; you can use either or both. If both are empty, the case matches unconditionally.
267
+ - First-match-wins per column; there is no merge or fallthrough.
268
+ - Row filtering (`row_filters`) is evaluated before cases; deleted rows are not transformed.
269
+
270
+ ---
271
+
272
+ ## JSON path rules inside columns
273
+
274
+ When a column stores JSON as text (`json` / `jsonb` dumped as a string), you can target **fields inside the document** with the same path syntax as row filters — but as **keys under `[rules."<table>"]`**. Use **quoted** TOML keys when the path contains dots.
275
+
276
+ - Dot notation: `"payload.profile.email" = { strategy = "email", domain = "orders_email", as_string = true }`
277
+ - Django-style: `"payload__profile__email" = { strategy = "hash", salt = "${env:ORDER_SECRET_SALT}", as_string = true }`
278
+
279
+ The segment before the first `.` or `__` is the **SQL column name**; the rest is the path inside the parsed JSON. You can use **either** path-level rules for a column **or** one whole-column rule for that column’s base name, not both (Dumpling rejects the conflict at startup). If a path is missing in a row, that rule is skipped for that row. When only path rules apply, the rest of the JSON is left unchanged. Path rules run in **longest-path-first** order. `column_cases` still match the SQL column name only; use `when` predicates with nested `column` paths to branch on JSON content.
280
+
281
+ ---
282
+
283
+ ## Row filtering
284
+
285
+ You can retain or delete rows for specific tables using explicit predicate lists.
286
+
287
+ - If `retain` is non-empty, a row is kept only if it matches at least one predicate.
288
+ - Regardless of `retain`, a row is dropped if it matches any predicate in `delete`.
289
+
290
+ Supported predicate operators:
291
+
292
+ | Operator | Description |
293
+ |---|---|
294
+ | `eq` / `neq` | String compare (case-insensitive if `case_insensitive = true`) |
295
+ | `in` / `not_in` | List of values (string compare) |
296
+ | `like` / `ilike` | SQL-like patterns (`%` and `_`) |
297
+ | `regex` / `iregex` | Rust regex (`iregex` is case-insensitive) |
298
+ | `lt` / `lte` / `gt` / `gte` | Numeric compare (values parsed as numbers) |
299
+ | `is_null` / `not_null` | No value needed |
300
+
301
+ Predicates can target nested JSON values using dot notation (`payload.profile.tier`) or Django-style notation (`payload__profile__tier`). For JSON arrays, path segments are evaluated against each element, so list-of-dicts structures can be matched naturally.
302
+
303
+ ### JSON path list targeting
304
+
305
+ JSON list/array traversal is automatic once a path segment resolves to an array.
306
+
307
+ - **All elements in an array**: use the next field name directly.
308
+ - `payload.items.kind` or `payload__items__kind`
309
+ - Matches/rewrites `kind` for every object in `items`.
310
+ - **Specific array index**: use a numeric segment.
311
+ - `payload.items.0.kind` or `payload__items__0__kind`
312
+ - Targets only the first element.
313
+ - **Nested arrays**: combine field and index segments as needed.
314
+ - `payload.groups.members.email`
315
+ - `payload.groups.1.members.0.email`
316
+
317
+ This path behavior is shared by both `row_filters` predicates and JSON-path anonymization rules in `[rules]`.
318
+
319
+ ```toml
320
+ [row_filters."public.users"]
321
+ retain = [
322
+ { column = "country", op = "eq", value = "US" },
323
+ { column = "email", op = "ilike", value = "%@myco.com" },
324
+ { column = "profile.flags.plan", op = "eq", value = "gold" }
325
+ ]
326
+ delete = [
327
+ { column = "is_admin", op = "eq", value = "true" },
328
+ { column = "email", op = "ilike", value = "%@example.com" },
329
+ { column = "devices__platform", op = "eq", value = "android" }
330
+ ]
331
+ ```
139
332
 
140
- If the **input** already begins with a seal line and it **matches** the current run, Dumpling copies the rest of the file through unchanged. If the line looks like a seal but does **not** match (stale policy, different flags, or older `v=`), that line is **dropped** and the dump is re-processed so you do not end up with two seal lines. `--strict-coverage` cannot be combined with a matching seal (table definitions are not scanned in passthrough mode). `--check` writes no output and therefore emits no seal line.
333
+ Row filtering works for both `INSERT ... VALUES (...)` and `COPY ... FROM stdin` rows.
141
334
 
142
335
  ---
143
336
 
@@ -158,7 +351,8 @@ ssn = { strategy = "hash", salt = "${env:DUMPLING_USERS_SSN_SALT}", as_string
158
351
  age = { strategy = "int_range", min = 18, max = 90 }
159
352
 
160
353
  [rules."orders"]
161
- credit_card = { strategy = "redact", as_string = true }
354
+ credit_card = { strategy = "payment_card", length = 16, domain = "order_pan" }
355
+ amount = { strategy = "decimal", min = 0, max = 9999, scale = 2, domain = "order_amount" }
162
356
 
163
357
  # Optional explicit sensitive columns policy list (for strict coverage)
164
358
  [sensitive_columns]
@@ -185,32 +379,6 @@ pan = "critical"
185
379
  token = "high"
186
380
  ```
187
381
 
188
- ### Anonymization strategies
189
-
190
- | Strategy | Description |
191
- |---|---|
192
- | `null` | Set field to SQL `NULL` |
193
- | `redact` | Replace with `REDACTED` (string) |
194
- | `uuid` | Random UUIDv4-like string |
195
- | `hash` | SHA-256 hex of original value; supports per-column `salt` and global `salt` |
196
- | `email` | Safe email address (same generator as `faker = "internet::SafeEmail"`); supports `locale` |
197
- | `name` | Full name (same as `faker = "name::Name"`); supports `locale` |
198
- | `first_name` | First name (same as `faker = "name::FirstName"`); supports `locale` |
199
- | `last_name` | Last name (same as `faker = "name::LastName"`); supports `locale` |
200
- | `phone` | Locale-aware fake phone number (configurable via `locale`); defaults to English format |
201
- | `faker` | Values from the Rust [`fake`](https://crates.io/crates/fake) crate ([docs.rs](https://docs.rs/fake/latest/fake/), [`faker` modules](https://docs.rs/fake/latest/fake/faker/index.html)), chosen by a **string identifier** only (`faker = "module::Type"`, e.g. `internet::SafeEmail`). Config is **data only**: nothing from TOML is compiled or executed as Rust at runtime. Use `locale` for locale-aware generators; optional `min`/`max`, `length`, `format` as documented. Unsupported targets fail at config load. New generators require a **new Dumpling release** (or your own fork), not config-side code. |
202
- | `int_range` | Random integer in `[min, max]` |
203
- | `string` | Random alphanumeric string (`length = 12` by default) |
204
- | `date_fuzz` | Shifts a date by a random number of days in `[min_days, max_days]` (defaults: `-30..30`) |
205
- | `time_fuzz` | Shifts a time-of-day by a random number of seconds in `[min_seconds, max_seconds]` with 24h wraparound (defaults: `-300..300`) |
206
- | `datetime_fuzz` | Shifts a timestamp/timestamptz by a random number of seconds in `[min_seconds, max_seconds]` (defaults: `-86400..86400`) |
207
-
208
- **`faker` reference (upstream `fake` crate):** Dumpling’s `faker = "module::Type"` strings mirror the Rust [`fake`](https://crates.io/crates/fake) crate’s [`faker`](https://docs.rs/fake/latest/fake/faker/index.html) module layout. Use these when picking or extending generators:
209
-
210
- - [docs.rs — `fake` crate root](https://docs.rs/fake/latest/fake/) (overview, `Fake` / `Dummy` traits, locales)
211
- - [docs.rs — `fake::faker` module index](https://docs.rs/fake/latest/fake/faker/index.html) (per-domain submodules: `address`, `internet`, `name`, …)
212
- - [GitHub — `cksac/fake-rs`](https://github.com/cksac/fake-rs) (source, README with the CLI’s generator name list)
213
-
214
382
  ### Secret references
215
383
 
216
384
  Dumpling resolves secret references in string config fields so plaintext salts/keys
@@ -251,17 +419,6 @@ dumpling --input dump.sql --check --strict-coverage --report coverage.json
251
419
  dumpling --security-profile hardened --input dump.sql --check
252
420
  ```
253
421
 
254
- ### Common column options
255
-
256
- - `as_string`: if true, forces the anonymized value to be rendered as a quoted SQL string literal. By default Dumpling preserves the original quoting where possible.
257
- - `domain`: deterministic mapping domain. When set, the same source value always maps to the same pseudonym inside that domain (across tables/columns). **SQL `NULL` inputs are always preserved as `NULL`** — a null FK reference has no source value to map, so no pseudonym is fabricated.
258
- - `unique_within_domain`: when true, different source values are assigned unique pseudonyms within the configured `domain`. NULL values are unaffected and always remain NULL.
259
- - `min_days` / `max_days`: used by `date_fuzz`.
260
- - `min_seconds` / `max_seconds`: used by `time_fuzz` and `datetime_fuzz`.
261
- - `locale`: selects the language/regional format for `email`, `name`, `first_name`, `last_name`, `faker`, and `phone`. Supported values: `en`, `fr_fr`, `de_de`, `it_it`, `pt_br`, `pt_pt`, `ar_sa`, `zh_cn`, `zh_tw`, `ja_jp`, `cy_gb`. Defaults to `en` when not specified.
262
- - `faker`: required when `strategy = "faker"`. A plain string `"module::Type"` (case-insensitive) that maps to a **built-in** generator compiled into Dumpling—not arbitrary Rust or expressions. Names follow [`fake::faker`](https://docs.rs/fake/latest/fake/faker/index.html) (e.g. `internet::SafeEmail` → `faker::internet::SafeEmail` in the crate).
263
- - `format`: used with `faker = "number::NumberWithFormat"`; pattern uses `#` (0–9) and `^` (1–9) per the [`fake` crate docs](https://docs.rs/fake/latest/fake/).
264
-
265
422
  > **Note:** `table_options` are no longer supported; use explicit `rules` and optional `column_cases`.
266
423
 
267
424
  ---
@@ -368,144 +525,6 @@ Produced by SSMS "Script Table as → INSERT To", `mssql-scripter`, or similar t
368
525
 
369
526
  ---
370
527
 
371
- ## Row filtering
372
-
373
- You can retain or delete rows for specific tables using explicit predicate lists.
374
-
375
- - If `retain` is non-empty, a row is kept only if it matches at least one predicate.
376
- - Regardless of `retain`, a row is dropped if it matches any predicate in `delete`.
377
-
378
- Supported predicate operators:
379
-
380
- | Operator | Description |
381
- |---|---|
382
- | `eq` / `neq` | String compare (case-insensitive if `case_insensitive = true`) |
383
- | `in` / `not_in` | List of values (string compare) |
384
- | `like` / `ilike` | SQL-like patterns (`%` and `_`) |
385
- | `regex` / `iregex` | Rust regex (`iregex` is case-insensitive) |
386
- | `lt` / `lte` / `gt` / `gte` | Numeric compare (values parsed as numbers) |
387
- | `is_null` / `not_null` | No value needed |
388
-
389
- Predicates can target nested JSON values using dot notation (`payload.profile.tier`) or Django-style notation (`payload__profile__tier`). For JSON arrays, path segments are evaluated against each element, so list-of-dicts structures can be matched naturally.
390
-
391
- ### JSON path list targeting
392
-
393
- JSON list/array traversal is automatic once a path segment resolves to an array.
394
-
395
- - **All elements in an array**: use the next field name directly.
396
- - `payload.items.kind` or `payload__items__kind`
397
- - Matches/rewrites `kind` for every object in `items`.
398
- - **Specific array index**: use a numeric segment.
399
- - `payload.items.0.kind` or `payload__items__0__kind`
400
- - Targets only the first element.
401
- - **Nested arrays**: combine field and index segments as needed.
402
- - `payload.groups.members.email`
403
- - `payload.groups.1.members.0.email`
404
-
405
- This path behavior is shared by both `row_filters` predicates and JSON-path anonymization rules in `[rules]`.
406
-
407
- ```toml
408
- [row_filters."public.users"]
409
- retain = [
410
- { column = "country", op = "eq", value = "US" },
411
- { column = "email", op = "ilike", value = "%@myco.com" },
412
- { column = "profile.flags.plan", op = "eq", value = "gold" }
413
- ]
414
- delete = [
415
- { column = "is_admin", op = "eq", value = "true" },
416
- { column = "email", op = "ilike", value = "%@example.com" },
417
- { column = "devices__platform", op = "eq", value = "android" }
418
- ]
419
- ```
420
-
421
- Row filtering works for both `INSERT ... VALUES (...)` and `COPY ... FROM stdin` rows.
422
-
423
- ---
424
-
425
- ## Conditional per-column cases
426
-
427
- Define default strategies in `rules."<table>"` and add ordered per-column cases in `column_cases."<table>"."<column>"`. For each row and column, Dumpling applies the first matching case; if none match, it falls back to the default from `rules`.
428
-
429
- ```toml
430
- [rules."public.users"]
431
- email = { strategy = "hash", as_string = true } # default
432
- name = { strategy = "name" }
433
-
434
- [[column_cases."public.users".email]]
435
- when.any = [{ column = "is_admin", op = "eq", value = "true" }]
436
- strategy = { strategy = "redact", as_string = true }
437
-
438
- [[column_cases."public.users".email]]
439
- when.any = [{ column = "country", op = "in", values = ["DE","FR","GB"] }]
440
- strategy = { strategy = "hash", salt = "eu-salt", as_string = true }
441
- ```
442
-
443
- - `when.any` is OR, `when.all` is AND; you can use either or both. If both are empty, the case matches unconditionally.
444
- - First-match-wins per column; there is no merge or fallthrough.
445
- - Row filtering (`row_filters`) is evaluated before cases; deleted rows are not transformed.
446
-
447
- ---
448
-
449
- ## Hardened security profile
450
-
451
- For adversarial risk environments — where an internal or external actor may have partial auxiliary data — use `--security-profile hardened`:
452
-
453
- ```bash
454
- dumpling --security-profile hardened -i dump.sql -o sanitized.sql
455
- ```
456
-
457
- ### What changes in hardened mode
458
-
459
- | Aspect | Standard | Hardened |
460
- |---|---|---|
461
- | Random generation | xorshift64\* seeded from system time | OS CSPRNG (`getrandom`) — non-predictable |
462
- | `hash` strategy | SHA-256(salt \|\| input) | HMAC-SHA-256(key=salt, data=input) |
463
- | Deterministic domain byte stream | SHA-256 CTR-mode | HMAC-SHA-256 CTR-mode |
464
- | Report `security_profile` field | `"standard"` | `"hardened"` |
465
- | `--seed` / `DUMPLING_SEED` | Seeds the PRNG | Ignored (warning emitted) |
466
-
467
- ### Why this matters
468
-
469
- - **Non-predictable output**: xorshift64\* is seeded from system time, which is guessable. The OS CSPRNG cannot be predicted from timing alone.
470
- - **Proper keyed hashing**: `SHA-256(key || data)` is vulnerable to length-extension attacks and weak as a MAC. HMAC-SHA-256 uses the salt as a genuine cryptographic key, providing provable PRF security.
471
- - **Domain separation**: HMAC construction ensures outputs from one salt/key cannot be confused with another.
472
-
473
- ### Key management guidance
474
-
475
- Configure a per-environment secret via an env-backed reference to prevent key leakage:
476
-
477
- ```toml
478
- # .dumplingconf
479
- salt = "${env:DUMPLING_HMAC_KEY}"
480
-
481
- [rules."public.users"]
482
- ssn = { strategy = "hash", as_string = true }
483
- email = { strategy = "email", domain = "users" }
484
- ```
485
-
486
- ```bash
487
- export DUMPLING_HMAC_KEY="$(openssl rand -base64 32)"
488
- dumpling --security-profile hardened -i dump.sql -o sanitized.sql
489
- ```
490
-
491
- **Key rotation**: Changing `DUMPLING_HMAC_KEY` will produce entirely different pseudonyms for all salted/domain-mapped columns. If you rely on referential consistency across separately-processed dumps (e.g., snapshots over time), keep the same key or re-anonymize all related dumps together. Rotate keys when:
492
- - A key may have been compromised.
493
- - You intentionally want to break prior referential linkability.
494
-
495
- ### Report metadata
496
-
497
- The JSON report always includes the active security profile:
498
-
499
- ```json
500
- {
501
- "security_profile": "hardened",
502
- "total_rows_processed": 1000,
503
- ...
504
- }
505
- ```
506
-
507
- ---
508
-
509
528
  ## Policy linting
510
529
 
511
530
  The `lint-policy` subcommand statically analyses your configuration and flags common issues before they affect a production pipeline.
@@ -539,7 +558,7 @@ See the [CI guardrails documentation](docs/src/ci-guardrails.md) for full pipeli
539
558
  - For CI/CD and production-like workflows, prefer the default fail-closed mode and avoid `--allow-noop` unless a no-op run is intentional.
540
559
  - For best results, configure strategies compatible with column data types. If you hash an integer column, Dumpling will render a string; most databases can coerce this, but explicit `as_string = false` may help in some cases.
541
560
  - For length-restricted text columns (`varchar(n)`, `character varying(n)`, `char(n)`, `character(n)`), Dumpling reads `CREATE TABLE` definitions and truncates generated text values to fit within the declared limit.
542
- - Deterministic anonymization for tests: pass `--seed <u64>` or set env `DUMPLING_SEED` to make fuzz strategies reproducible across runs. Note: `--seed` has no effect in `--security-profile hardened`.
561
+ - Deterministic anonymization for tests: pass `--seed <u64>` or set env `DUMPLING_SEED` to make fuzz strategies reproducible across runs. In the hardened security profile, seeds are ignored; see the [configuration guide](https://ababic.github.io/dumpling/configuration.html#hardened-security-profile).
543
562
  - Domain mappings (`domain = "..."`) are deterministic by source value + domain (+ optional salt), so referential joins stay stable across tables within the same dump.
544
563
 
545
564
  ---