dumpling-cli 0.1.0__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. dumpling_cli-0.3.0/.dumplingconf.example +164 -0
  2. dumpling_cli-0.3.0/.github/workflows/docs-pr.yml +35 -0
  3. {dumpling_cli-0.1.0 → dumpling_cli-0.3.0}/.github/workflows/docs.yml +9 -11
  4. dumpling_cli-0.3.0/.github/workflows/policy-lint.yml +53 -0
  5. {dumpling_cli-0.1.0 → dumpling_cli-0.3.0}/.gitignore +1 -0
  6. dumpling_cli-0.3.0/AGENTS.md +316 -0
  7. dumpling_cli-0.3.0/CHANGELOG.md +47 -0
  8. dumpling_cli-0.3.0/CONTRIBUTING.md +107 -0
  9. {dumpling_cli-0.1.0 → dumpling_cli-0.3.0}/Cargo.lock +212 -2
  10. {dumpling_cli-0.1.0 → dumpling_cli-0.3.0}/Cargo.toml +5 -1
  11. dumpling_cli-0.3.0/MAINTENANCE.md +67 -0
  12. dumpling_cli-0.3.0/PKG-INFO +481 -0
  13. dumpling_cli-0.3.0/README.md +459 -0
  14. {dumpling_cli-0.1.0 → dumpling_cli-0.3.0}/book.toml +1 -1
  15. {dumpling_cli-0.1.0 → dumpling_cli-0.3.0}/docs/src/SUMMARY.md +1 -0
  16. dumpling_cli-0.3.0/docs/src/ci-guardrails.md +153 -0
  17. dumpling_cli-0.3.0/docs/src/configuration.md +266 -0
  18. {dumpling_cli-0.1.0 → dumpling_cli-0.3.0}/docs/src/getting-started.md +3 -1
  19. dumpling_cli-0.3.0/docs/src/index.md +19 -0
  20. {dumpling_cli-0.1.0 → dumpling_cli-0.3.0}/pyproject.toml +3 -3
  21. dumpling_cli-0.3.0/rust-toolchain.toml +3 -0
  22. dumpling_cli-0.3.0/scripts/setup-dev.sh +89 -0
  23. dumpling_cli-0.3.0/src/faker_dispatch.rs +521 -0
  24. dumpling_cli-0.3.0/src/filter.rs +633 -0
  25. dumpling_cli-0.3.0/src/lint.rs +551 -0
  26. dumpling_cli-0.3.0/src/main.rs +507 -0
  27. dumpling_cli-0.3.0/src/report.rs +198 -0
  28. dumpling_cli-0.3.0/src/scan.rs +392 -0
  29. dumpling_cli-0.3.0/src/settings.rs +1698 -0
  30. dumpling_cli-0.3.0/src/sql.rs +3092 -0
  31. dumpling_cli-0.3.0/src/transform.rs +1454 -0
  32. dumpling_cli-0.1.0/CHANGELOG.md +0 -20
  33. dumpling_cli-0.1.0/PKG-INFO +0 -207
  34. dumpling_cli-0.1.0/README.md +0 -185
  35. dumpling_cli-0.1.0/docs/src/configuration.md +0 -33
  36. dumpling_cli-0.1.0/docs/src/index.md +0 -19
  37. dumpling_cli-0.1.0/src/filter.rs +0 -310
  38. dumpling_cli-0.1.0/src/main.rs +0 -243
  39. dumpling_cli-0.1.0/src/report.rs +0 -102
  40. dumpling_cli-0.1.0/src/settings.rs +0 -266
  41. dumpling_cli-0.1.0/src/sql.rs +0 -896
  42. dumpling_cli-0.1.0/src/transform.rs +0 -395
  43. {dumpling_cli-0.1.0 → dumpling_cli-0.3.0}/.github/workflows/ci.yml +0 -0
  44. {dumpling_cli-0.1.0 → dumpling_cli-0.3.0}/.github/workflows/platform-compat-latest.yml +0 -0
  45. {dumpling_cli-0.1.0 → dumpling_cli-0.3.0}/.github/workflows/platform-compat-matrix.yml +0 -0
  46. {dumpling_cli-0.1.0 → dumpling_cli-0.3.0}/.github/workflows/publish.yml +0 -0
  47. {dumpling_cli-0.1.0 → dumpling_cli-0.3.0}/.github/workflows/release.yml +0 -0
  48. {dumpling_cli-0.1.0 → dumpling_cli-0.3.0}/.github/workflows/tests.yml +0 -0
  49. {dumpling_cli-0.1.0 → dumpling_cli-0.3.0}/datetime_out.sql +0 -0
  50. {dumpling_cli-0.1.0 → dumpling_cli-0.3.0}/datetime_sample.sql +0 -0
  51. {dumpling_cli-0.1.0 → dumpling_cli-0.3.0}/docs/src/releasing.md +0 -0
@@ -0,0 +1,164 @@
1
+ # .dumplingconf.example
2
+ #
3
+ # Rename this file to .dumplingconf (or place the [tool.dumpling] section in
4
+ # pyproject.toml) to activate it. Dumpling discovers config in this order:
5
+ #
6
+ # 1. --config <path> explicit CLI flag
7
+ # 2. ./.dumplingconf this file (place it in your project root)
8
+ # 3. ./pyproject.toml [tool.dumpling]
9
+ #
10
+ # Missing config is a hard error unless --allow-noop is passed.
11
+
12
+ # ---------------------------------------------------------------------------
13
+ # Global salt for strategies that support it (e.g. hash).
14
+ # Use an env-var reference instead of a plaintext value; Dumpling warns on
15
+ # plaintext salts. Supported forms:
16
+ # salt = "${MY_ENV_VAR}" # short form
17
+ # salt = "${env:MY_ENV_VAR}" # explicit env provider
18
+ # ---------------------------------------------------------------------------
19
+ salt = "${DUMPLING_GLOBAL_SALT}"
20
+
21
+ # ---------------------------------------------------------------------------
22
+ # [rules] — per-table anonymization rules
23
+ #
24
+ # Keys are either "table" or "schema.table" (schema-qualified takes priority).
25
+ # Column names and table names are matched case-insensitively.
26
+ #
27
+ # Each column maps to an anonymizer spec: { strategy = "…", <options> }
28
+ # ---------------------------------------------------------------------------
29
+ # Faker strategy: `faker = "module::Type"` matches the Rust `fake` crate layout.
30
+ # Crate docs: https://docs.rs/fake/latest/fake/
31
+ # Faker modules: https://docs.rs/fake/latest/fake/faker/index.html
32
+ # Upstream repo: https://github.com/cksac/fake-rs
33
+ [rules."public.users"]
34
+ # email — fake email via Rust `fake` crate; shared domain keeps the same source value mapped to the same pseudonym across tables
35
+ email = { strategy = "faker", faker = "internet::SafeEmail", domain = "customer_identity", unique_within_domain = true }
36
+ # name — locale-aware full name (see `locale`); other generators use `faker = "module::Type"`
37
+ full_name = { strategy = "faker", faker = "name::Name" }
38
+ first_name = { strategy = "faker", faker = "name::FirstName" }
39
+ last_name = { strategy = "faker", faker = "name::LastName" }
40
+ # phone — US-style (xxx) xxx-xxxx
41
+ phone = { strategy = "phone" }
42
+ # ssn — SHA-256 hex of original; use per-column salt for extra protection
43
+ ssn = { strategy = "hash", salt = "${DUMPLING_USERS_SSN_SALT}", as_string = true }
44
+ # password — hash without revealing the original; token — redact entirely
45
+ password_hash = { strategy = "hash", as_string = true }
46
+ api_token = { strategy = "redact", as_string = true }
47
+ # numeric fields
48
+ age = { strategy = "int_range", min = 18, max = 90 }
49
+ # random alphanumeric string of given length
50
+ ref_code = { strategy = "string", length = 12 }
51
+ # uuid v4-like string
52
+ external_id = { strategy = "uuid" }
53
+ # set to SQL NULL
54
+ legacy_field = { strategy = "null" }
55
+ # temporal fuzzing
56
+ date_of_birth = { strategy = "date_fuzz", min_days = -365, max_days = 365 }
57
+ created_at = { strategy = "datetime_fuzz", min_seconds = -86400, max_seconds = 86400 }
58
+ last_login = { strategy = "datetime_fuzz" }
59
+ wake_time = { strategy = "time_fuzz", min_seconds = -3600, max_seconds = 3600 }
60
+
61
+ [rules."public.orders"]
62
+ # credit card — redact entirely; force as quoted string
63
+ credit_card = { strategy = "redact", as_string = true }
64
+ # keep the same anonymized email as the users table via the shared domain
65
+ customer_email = { strategy = "faker", faker = "internet::SafeEmail", domain = "customer_identity" }
66
+
67
+ [rules."public.audit_log"]
68
+ # (an unqualified key such as [rules.audit_log] would also work and match any schema)
69
+ ip_address = { strategy = "string", length = 15 }
70
+
71
+ # ---------------------------------------------------------------------------
72
+ # [sensitive_columns] — explicit list of columns that must be covered when
73
+ # --strict-coverage is used. Supplements automatic name-based detection.
74
+ # ---------------------------------------------------------------------------
75
+ [sensitive_columns]
76
+ "public.users" = ["employee_number", "tax_id", "national_id"]
77
+ "public.orders" = ["bank_account"]
78
+
79
+ # ---------------------------------------------------------------------------
80
+ # [row_filters] — retain or delete rows based on column predicates
81
+ #
82
+ # retain: keep the row only if at least one predicate matches (OR semantics)
83
+ # delete: drop the row if any predicate matches (evaluated after retain)
84
+ #
85
+ # Row filtering works for both INSERT … VALUES and COPY … FROM stdin.
86
+ # ---------------------------------------------------------------------------
87
+ [row_filters."public.users"]
88
+ retain = [
89
+ # Keep internal employees
90
+ { column = "email", op = "ilike", value = "%@myco.com" },
91
+ # Keep gold-tier users
92
+ { column = "tier", op = "eq", value = "gold" },
93
+ # Keep users with active subscription flag in a JSONB column
94
+ { column = "profile.subscription.active", op = "eq", value = "true" },
95
+ ]
96
+ delete = [
97
+ # Always drop admin accounts from anonymized dumps
98
+ { column = "is_admin", op = "eq", value = "true" },
99
+ # Drop bot/test accounts
100
+ { column = "email", op = "ilike", value = "%@example.com" },
101
+ # Drop users outside the US
102
+ { column = "country", op = "not_in", values = ["US", "CA"] },
103
+ # Drop rows where age is below threshold (numeric compare)
104
+ { column = "age", op = "lt", value = 18 },
105
+ # Drop if nested JSON array contains a specific value (Django-style path)
106
+ { column = "devices__platform", op = "eq", value = "android" },
107
+ ]
108
+
109
+ # Supported operators:
110
+ # eq / neq — string equality (case_insensitive = true for case-insensitive)
111
+ # in / not_in — list membership (use `values = [...]`)
112
+ # like / ilike — SQL LIKE pattern (% and _); ilike is always case-insensitive
113
+ # regex / iregex — Rust regex; iregex is case-insensitive
114
+ # lt / lte / gt / gte — numeric compare (values parsed as f64)
115
+ # is_null / not_null — NULL check (no value needed)
116
+
117
+ # ---------------------------------------------------------------------------
118
+ # [[column_cases]] — conditional per-column strategy overrides
119
+ #
120
+ # For each row and column, Dumpling evaluates cases top-to-bottom and applies
121
+ # the FIRST matching case. Falls back to [rules] if none match.
122
+ #
123
+ # when.any = OR semantics; when.all = AND semantics; both empty = always match.
124
+ # ---------------------------------------------------------------------------
125
+ [[column_cases."public.users".email]]
126
+ # Admin emails → always redact (regardless of other conditions)
127
+ when.any = [{ column = "is_admin", op = "eq", value = "true" }]
128
+ strategy = { strategy = "redact", as_string = true }
129
+
130
+ [[column_cases."public.users".email]]
131
+ # EU users → hash with a region-specific salt for GDPR compliance
132
+ when.any = [{ column = "country", op = "in", values = ["DE", "FR", "GB", "IT", "ES"] }]
133
+ strategy = { strategy = "hash", salt = "${DUMPLING_EU_SALT}", as_string = true }
134
+
135
+ # (No matching case → falls back to [rules."public.users"].email)
136
+
137
+ # ---------------------------------------------------------------------------
138
+ # [output_scan] — post-transform residual PII detection
139
+ #
140
+ # Scan the anonymized output for leftover PII-like patterns. Categories:
141
+ # email, ssn, pan (payment card, Luhn-validated), token (JWT/AWS/GitHub/Slack)
142
+ #
143
+ # Use --scan-output to enable scanning and --fail-on-findings to gate on it.
144
+ # ---------------------------------------------------------------------------
145
+ [output_scan]
146
+ enabled_categories = ["email", "ssn", "pan", "token"]
147
+ default_threshold = 0 # allowed findings per category (0 = none)
148
+ default_severity = "high"
149
+ fail_on_severity = "low" # fail if any category at or above this severity exceeds threshold
150
+ sample_limit_per_category = 5 # max sample locations stored in the JSON report
151
+
152
+ [output_scan.thresholds]
153
+ # Override per-category threshold (0 = zero tolerance)
154
+ email = 0
155
+ ssn = 0
156
+ pan = 0
157
+ token = 0
158
+
159
+ [output_scan.severities]
160
+ # Override per-category severity level
161
+ email = "medium"
162
+ ssn = "high"
163
+ pan = "critical"
164
+ token = "high"
@@ -0,0 +1,35 @@
1
+ # mdBook verification on pull requests only (no GitHub Pages upload or deploy).
2
+ # Pages build + deploy live in docs.yml and run on pushes to main.
3
+ name: Docs (PR)
4
+
5
+ on:
6
+ pull_request:
7
+ paths:
8
+ - "README.md"
9
+ - "book.toml"
10
+ - "docs/**"
11
+ - ".github/workflows/docs.yml"
12
+ - ".github/workflows/docs-pr.yml"
13
+
14
+ permissions:
15
+ contents: read
16
+
17
+ concurrency:
18
+ group: docs-pr-${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
19
+ cancel-in-progress: true
20
+
21
+ jobs:
22
+ verify:
23
+ name: Build mdBook (verify)
24
+ runs-on: ubuntu-latest
25
+ steps:
26
+ - name: Checkout
27
+ uses: actions/checkout@v4
28
+
29
+ - name: Install mdBook
30
+ uses: peaceiris/actions-mdbook@v2
31
+ with:
32
+ mdbook-version: "0.4.52"
33
+
34
+ - name: Build documentation site
35
+ run: mdbook build
@@ -1,12 +1,8 @@
1
+ # Build and deploy the mdBook site to GitHub Pages (main branch only).
2
+ # Pull-request verification runs in docs-pr.yml — this workflow does not run on PRs.
1
3
  name: Docs
2
4
 
3
5
  on:
4
- pull_request:
5
- paths:
6
- - "README.md"
7
- - "book.toml"
8
- - "docs/**"
9
- - ".github/workflows/docs.yml"
10
6
  push:
11
7
  branches:
12
8
  - main
@@ -18,16 +14,16 @@ on:
18
14
 
19
15
  permissions:
20
16
  contents: read
21
- pages: write
22
- id-token: write
23
17
 
24
18
  concurrency:
25
- group: docs-${{ github.ref }}
19
+ group: docs-pages-${{ github.ref }}
26
20
  cancel-in-progress: true
27
21
 
28
22
  jobs:
29
23
  build:
30
24
  runs-on: ubuntu-latest
25
+ permissions:
26
+ contents: read
31
27
  steps:
32
28
  - name: Checkout
33
29
  uses: actions/checkout@v4
@@ -40,15 +36,17 @@ jobs:
40
36
  - name: Build documentation site
41
37
  run: mdbook build
42
38
 
43
- - name: Upload docs artifact
39
+ - name: Upload Pages deployment artifact
44
40
  uses: actions/upload-pages-artifact@v3
45
41
  with:
46
42
  path: docs/book
47
43
 
48
44
  deploy:
49
- if: github.event_name == 'push' && github.ref == 'refs/heads/main'
50
45
  needs: build
51
46
  runs-on: ubuntu-latest
47
+ permissions:
48
+ pages: write
49
+ id-token: write
52
50
  environment:
53
51
  name: github-pages
54
52
  url: ${{ steps.deployment.outputs.page_url }}
@@ -0,0 +1,53 @@
1
+ name: Policy Lint
2
+
3
+ # Reference CI guardrail template for Dumpling anonymization policy.
4
+ #
5
+ # This workflow runs `dumpling lint-policy` on every PR and push to main to
6
+ # catch policy regressions before they are merged.
7
+ #
8
+ # Checks performed:
9
+ # empty-rules-table — a [rules] entry has no column rules
10
+ # empty-column-cases-table — a [column_cases] entry has no column cases
11
+ # unsalted-hash — hash strategy used without any salt
12
+ # inconsistent-domain-strategy — same domain used with different strategies
13
+ # uncovered-sensitive-column — sensitive_columns entry with no matching rule
14
+ #
15
+ # Exit codes:
16
+ # 0 — no violations found
17
+ # 1 — one or more violations found (job fails)
18
+
19
+ on:
20
+ pull_request:
21
+ push:
22
+ branches:
23
+ - main
24
+
25
+ permissions:
26
+ contents: read
27
+
28
+ jobs:
29
+ policy-lint:
30
+ name: Anonymization policy lint
31
+ runs-on: ubuntu-latest
32
+
33
+ steps:
34
+ - name: Checkout
35
+ uses: actions/checkout@v4
36
+
37
+ - name: Install Rust toolchain
38
+ uses: dtolnay/rust-toolchain@stable
39
+
40
+ - name: Cache Cargo build artifacts
41
+ uses: Swatinem/rust-cache@v2
42
+
43
+ - name: Build dumpling
44
+ run: cargo build --release --locked
45
+
46
+ - name: Lint anonymization policy
47
+ # --allow-noop lets the command succeed when no config is present.
48
+ # In your own repo, remove --allow-noop so a missing config is a hard
49
+ # error (fail-closed). If your config lives at a non-default path, add:
50
+ # --config path/to/.dumplingconf
51
+ # Dumpling auto-discovers .dumplingconf or pyproject.toml [tool.dumpling]
52
+ # in the working directory when --config is omitted.
53
+ run: ./target/release/dumpling lint-policy --allow-noop
@@ -1,2 +1,3 @@
1
1
  /target/
2
2
  /docs/book/
3
+ /.tools/
@@ -0,0 +1,316 @@
1
+ # AGENTS.md
2
+
3
+ This file provides structured guidance for AI coding agents working on the **Dumpling** project. Read this before making any code changes.
4
+
5
+ ---
6
+
7
+ ## Project Overview
8
+
9
+ Dumpling is a streaming, static anonymizer for Postgres plain-SQL dumps produced by `pg_dump`. It reads dump files line by line, anonymizes sensitive column data according to TOML-based configuration rules, and writes sanitized output—without ever connecting to a live database.
10
+
11
+ Key design goals:
12
+ - **Fail-closed**: Missing config is a hard error by default (prevents accidental no-ops in pipelines)
13
+ - **Streaming**: Processes dumps line by line so even multi-gigabyte files use minimal memory
14
+ - **Deterministic**: Domain mappings ensure the same source value always produces the same pseudonym, preserving foreign-key consistency across tables
15
+ - **CI-ready**: `--check` mode, strict coverage enforcement, JSON reports, and a residual-PII scan gate plug into any pipeline
16
+
17
+ ---
18
+
19
+ ## Repository Layout
20
+
21
+ ```
22
+ src/
23
+ main.rs — CLI entrypoint (clap), IO routing, program orchestration
24
+ settings.rs — Config loading, TOML parsing, secret resolution, validation, normalization
25
+ transform.rs — All anonymization strategies, PRNG, deterministic domain mapping
26
+ sql.rs — SQL stream processor: INSERT + COPY parsing, column strategy selection,
27
+ CREATE TABLE length extraction, sensitive coverage tracking
28
+ filter.rs — Row-filter predicate evaluation (eq/neq/like/regex/JSON-path/…)
29
+ scan.rs — Post-transform residual PII scanner (email/SSN/PAN/token regex)
30
+ report.rs — JSON report data structures and Reporter helper
31
+ docs/src/ — mdBook documentation source
32
+ .github/ — CI/CD GitHub Actions workflows
33
+ Cargo.toml — Rust package manifest
34
+ pyproject.toml — Python packaging (maturin) for pip-installable CLI
35
+ ```
36
+
37
+ ---
38
+
39
+ ## Development Setup
40
+
41
+ ### Prerequisites
42
+
43
+ - **Rust stable toolchain**: install via [rustup.rs](https://rustup.rs/). No database, Docker, or external services required.
44
+
45
+ ### Build
46
+
47
+ ```bash
48
+ cargo build # debug build
49
+ cargo build --release # optimized release build
50
+ ./target/debug/dumpling --help
51
+ ```
52
+
53
+ ### Run locally
54
+
55
+ ```bash
56
+ # Anonymize a dump using a config found in CWD
57
+ dumpling -i dump.sql -o sanitized.sql
58
+
59
+ # Explicit config path
60
+ dumpling -i dump.sql -o sanitized.sql -c path/to/.dumplingconf
61
+
62
+ # Check mode: exit 1 if any changes would be made, write nothing
63
+ dumpling --check -i dump.sql
64
+
65
+ # Allow running with no config (explicitly opt into passthrough)
66
+ dumpling --allow-noop -i dump.sql -o out.sql
67
+ ```
68
+
69
+ Config is discovered in order: `--config` path → `./.dumplingconf` → `./pyproject.toml [tool.dumpling]`.
70
+
71
+ ---
72
+
73
+ ## Testing
74
+
75
+ Run the full test suite:
76
+
77
+ ```bash
78
+ cargo test --all-targets --all-features
79
+ ```
80
+
81
+ All tests are inline `#[cfg(test)]` modules at the bottom of each source file. There are no separate test files.
82
+
83
+ ### Test conventions
84
+
85
+ - Call `set_random_seed(N)` from `transform.rs` when a test exercises fuzz strategies (`date_fuzz`, `int_range`, etc.) to ensure reproducible, assertion-checkable output.
86
+ - Config-parsing tests use a `write_temp_config(contents)` helper that writes a TOML file to `std::env::temp_dir()`. Always clean up temp files after assertions with `let _ = fs::remove_file(path)`.
87
+ - Never hard-code absolute paths; always use `std::env::temp_dir()` or `std::env::current_dir()`.
88
+ - When testing with `ResolvedConfig` constructed directly (bypassing file loading), initialize all `HashMap` fields explicitly. Construct `OutputScanConfig::default()` for the `output_scan` field.
89
+ - Use `proc.process(&mut reader, &mut out)` and then `String::from_utf8(out).unwrap()` to get the full processed SQL output for assertion.
90
+
91
+ ---
92
+
93
+ ## CI Gates
94
+
95
+ **All three must pass.** Run these locally before pushing:
96
+
97
+ ```bash
98
+ cargo fmt --all -- --check # formatting
99
+ cargo clippy --all-targets --all-features # linting (zero warnings allowed)
100
+ cargo test --all-targets --all-features # tests
101
+ ```
102
+
103
+ Apply auto-formatting with `cargo fmt`. For clippy warnings, fix the code rather than adding `#[allow(...)]` attributes unless there is a specific, documented reason.
104
+
105
+ ---
106
+
107
+ ## Code Conventions
108
+
109
+ ### Imports
110
+
111
+ Imports **must** be at the top of each module, grouped by standard library / external crates / internal crates. Never place `use` statements inside function bodies unless strictly necessary to resolve a circular import. This is a hard rule.
112
+
113
+ ### Error handling
114
+
115
+ - Use `anyhow::Result<T>` for fallible public functions that propagate errors toward `main`.
116
+ - Attach context with `.with_context(|| format!("…"))` when propagating errors so callers see where the failure originated.
117
+ - Use `anyhow::bail!(…)` for early exits with a descriptive message.
118
+ - Define new typed errors with `thiserror` only when callers need to match on them (rare in this codebase).
119
+
120
+ ### Naming and normalization
121
+
122
+ - All config table and column keys are normalized to **lowercase** during `resolve()` in `settings.rs`. The lookup helpers (`lookup_column_rule`, `lookup_column_cases`, `lookup_row_filters`, `lookup_sensitive_columns`) always compare lowercase keys. Pass lowercase when calling them.
123
+ - Follow standard Rust naming: `snake_case` for variables, functions, and modules; `PascalCase` for types and enum variants.
124
+
125
+ ### SQL output quoting
126
+
127
+ The `Replacement` type in `transform.rs` carries a `force_quoted` flag that controls whether the value is wrapped in SQL single quotes on output.
128
+
129
+ - `Replacement::quoted(v)` — forces single-quoted output (use for strings, emails, names, etc.)
130
+ - `Replacement::unquoted(v)` — raw output (use for integers, hashes when not forced to string)
131
+ - `Replacement::null()` — renders as `NULL` / `\N` (COPY format)
132
+
133
+ `render_cell` in `sql.rs` applies: `force_quoted || original.was_quoted` → quoted output. New strategies should use `quoted` for text-like values and `unquoted` for numeric/raw values.
134
+
135
+ ---
136
+
137
+ ## Architecture Deep-Dive
138
+
139
+ ### Config Resolution Flow (`settings.rs`)
140
+
141
+ 1. Load raw TOML bytes from the selected source.
142
+ 2. Walk the TOML value tree with `resolve_secrets_in_value`: replace `${ENV_VAR}` and `${env:ENV_VAR}` references with environment variable values. Emit stderr warnings for plaintext `salt` values.
143
+ 3. Deserialize the resolved `toml::Value` into `RawConfig` via serde.
144
+ 4. Call `validate_raw_config`: verify all `strategy` names are in `KNOWN_STRATEGIES`, check strategy-option compatibility (e.g., `salt` is only valid for `hash`), and validate numeric bounds.
145
+ 5. Call `resolve()`: normalize all table/column keys to lowercase, producing `ResolvedConfig`.
146
+
147
+ **Lookup pattern** — all lookup helpers try `schema.table` first, then fall back to bare `table`:
148
+
149
+ ```rust
150
+ lookup_column_rule(&cfg, Some("public"), "users", "email")
151
+ // tries "public.users" then "users"
152
+ ```
153
+
154
+ ### SQL Stream Processing (`sql.rs`)
155
+
156
+ `SqlStreamProcessor::process()` drives a state machine over lines:
157
+
158
+ | Mode | Trigger | Behavior |
159
+ |------|---------|----------|
160
+ | `Pass` | Default | Passthrough; detects INSERT/COPY/CREATE TABLE starts |
161
+ | `InInsert` | `INSERT INTO …` without trailing `;` | Accumulate until `statement_complete()` detects `;` outside quotes/parens |
162
+ | `InCopy` | `COPY … FROM stdin;` | Process tab-delimited rows until `\.` |
163
+ | `InCreateTable` | `CREATE TABLE …` | Accumulate until `;`; parse column length limits |
164
+
165
+ **Per-row pipeline (INSERT and COPY)**:
166
+ 1. Parse cells (preserving quoting metadata into `Cell` structs).
167
+ 2. Call `should_keep_row` (row filters) — skip row if filtered.
168
+ 3. For each column, call `select_strategy_for_cell`:
169
+ - Check `column_cases` for this table+column; iterate in declared order; **first matching `when` wins** (first-match-wins, no fallthrough).
170
+ - Fall back to base `rules` entry for the table+column.
171
+ - Return `None` → cell passes through unchanged.
172
+ 4. Call `apply_anonymizer` → `Replacement`.
173
+ 5. Render back to SQL with `render_cell`.
174
+
175
+ `CREATE TABLE` statements are parsed to extract `varchar(N)` / `character varying(N)` / `char(N)` / `character(N)` / `bpchar(N)` length limits. These are stored in `column_length_limits` and passed to `apply_anonymizer` so generated strings are truncated to fit the column constraint.
176
+
177
+ ### Anonymization (`transform.rs`)
178
+
179
+ Two code paths:
180
+
181
+ - **Random path**: `apply_random_anonymizer` — uses a global xorshift64\* PRNG (seeded from system time, `--seed` flag, or `DUMPLING_SEED` env var). Used when `spec.domain` is absent.
182
+ - **Deterministic/domain path**: `apply_domain_anonymizer` → `apply_deterministic_anonymizer` — uses a `DeterministicByteStream` (SHA-256 CTR-mode) seeded from `(domain_key, original_value, strategy, salt, collision_index)`. Used when `spec.domain` is set. The `domain_mappings` cache ensures the same source value always maps to the same pseudonym within a domain across tables. When `unique_within_domain = true`, collision detection retries up to `MAX_DOMAIN_UNIQUENESS_ATTEMPTS` (4096) before giving up.
183
+
184
+ ### Row Filtering (`filter.rs`)
185
+
186
+ `should_keep_row` evaluates `RowFilterSet`:
187
+ - `retain` (OR): if non-empty, a row is kept only if at least one predicate matches.
188
+ - `delete` (any-match): if any predicate matches, the row is dropped (evaluated after `retain`).
189
+
190
+ JSON path traversal is supported: `payload.profile.tier` (dot) and `payload__profile__tier` (Django double-underscore). Array elements are traversed by evaluating each item's fields, so list-of-dicts structures work naturally.
191
+
192
+ ### Residual PII Scanning (`scan.rs`)
193
+
194
+ `ScanningWriter` wraps the output `Write` stream, intercepting bytes and passing them to `OutputScanner`. The scanner applies regex detectors line-by-line for:
195
+ - `email`: RFC-like email pattern
196
+ - `ssn`: US SSN-like `DDD-DD-DDDD` with invalid area/group/serial number rejection
197
+ - `pan`: Payment card numbers (13–19 digits, Luhn-validated)
198
+ - `token`: JWT, AWS access key IDs (`AKIA…`), GitHub PATs (`ghp_/gho_/…`), Slack tokens, labeled `key=value` patterns
199
+
200
+ Findings are aggregated per category and compared against configurable thresholds and severity gates.
201
+
202
+ ---
203
+
204
+ ## How to Add a New Anonymization Strategy
205
+
206
+ Follow these steps in order. Do not skip any step.
207
+
208
+ 1. **`src/settings.rs` — `KNOWN_STRATEGIES`**: Add the strategy name string to the `KNOWN_STRATEGIES` const slice.
209
+
210
+ 2. **`src/settings.rs` — `AnonymizerSpec`**: If the strategy needs new config fields (e.g., `min_length`, `charset`), add them as `pub field: Option<T>`. Keep them `Option` for backwards compatibility.
211
+
212
+ 3. **`src/settings.rs` — `validate_anonymizer_spec`**:
213
+ - Add the new strategy-specific fields to the `unsupported` list for all *other* strategies (so using them with the wrong strategy produces a clear error).
214
+ - Add bounds or range validation if applicable (see `int_range` and `string` examples).
215
+
216
+ 4. **`src/transform.rs` — `apply_random_anonymizer`**: Add a `match` arm with the random implementation. Return an appropriate `Replacement` variant.
217
+
218
+ 5. **`src/transform.rs` — `apply_deterministic_anonymizer`**: Add a matching `match` arm for domain-mapping support. Use the provided `DeterministicByteStream` for all randomness to ensure reproducibility.
219
+
220
+ 6. **`src/transform.rs` — `should_enforce_max_len`**: If the strategy generates string values that should be truncated to fit `varchar(N)` columns, make sure it is **not** in the exclusion list. Currently excluded: `null` and `int_range`.
221
+
222
+ 7. **Tests**: Add `#[test]` functions in `src/transform.rs` (unit-test strategy output values) and in `src/sql.rs` (end-to-end pipeline test). Use `set_random_seed(N)` for reproducibility.
223
+
224
+ 8. **`README.md`**: Add a row to the "Anonymization strategies" table.
225
+
226
+ **`faker` strategy:** Config only carries string identifiers; Dumpling never evaluates user Rust from config. To ship a new generator, add dispatch in `src/faker_dispatch.rs` and validation in `validate_anonymizer_spec` for the `faker` branch. Upstream reference: [`fake` on docs.rs](https://docs.rs/fake/latest/fake/), [`fake::faker` module index](https://docs.rs/fake/latest/fake/faker/index.html), [source on GitHub](https://github.com/cksac/fake-rs).
227
+
228
+ ---
229
+
230
+ ## How to Add a New Row Filter Predicate Operator
231
+
232
+ 1. **`src/settings.rs` — `Predicate.op` doc comment**: Update the doc comment listing supported operator names.
233
+
234
+ 2. **`src/filter.rs` — `predicate_matches`**: Add a match arm for the new operator string in the appropriate branch (`single-value` or `multi-value`).
235
+
236
+ 3. **Tests**: Add `#[test]` functions in `src/filter.rs`.
237
+
238
+ 4. **`README.md`**: Add a row to the predicate operators table.
239
+
240
+ ---
241
+
242
+ ## Exit Codes
243
+
244
+ | Code | Meaning |
245
+ |------|---------|
246
+ | `0` | Success |
247
+ | `1` | Config/startup error, or `--check` mode found changes |
248
+ | `2` | `--strict-coverage`: uncovered sensitive columns detected |
249
+ | `3` | `--fail-on-findings`: output scan thresholds exceeded |
250
+
251
+ ---
252
+
253
+ ## Important Gotchas
254
+
255
+ - **Fail-closed by default**: If no config is found, Dumpling exits non-zero. Tests that exercise the CLI without providing a config must either supply one or pass `--allow-noop`. Never assume a missing config is safe.
256
+
257
+ - **`table_options` is intentionally removed**: The `[table_options]` key was deprecated and now deliberately fails with a targeted error pointing to `[rules]` and `[column_cases]`. Do not re-introduce it.
258
+
259
+ - **`null` and `redact` reject `domain`**: These strategies produce constant outputs regardless of input, so deterministic domain mapping is meaningless. The validator rejects `domain` with these strategies.
260
+
261
+ - **COPY NULL representation**: In COPY format, `\N` (backslash-N) means NULL. This is different from the string `"NULL"` used in INSERT VALUES format. Handle both correctly.
262
+
263
+ - **Multi-line INSERT statements**: A single logical INSERT may span many lines (pg_dump does this for large `VALUES` lists). The state machine accumulates lines until `statement_complete()` detects a semicolon that is outside all single-quote and parenthesis context.
264
+
265
+ - **Column key normalization is mandatory**: All config lookups compare lowercase keys. A table `Public.Users` in the dump matches config key `"public.users"`. Always normalize before lookup; the lookup helpers do this for you if you use them.
266
+
267
+ - **`AnonymizerSpec` construction in tests**: When building `AnonymizerSpec` directly in tests, you must set all fields explicitly (there is no `Default` impl). Set unused fields to `None`.
268
+
269
+ - **Clippy is zero-tolerance**: CI runs clippy with all targets and features. Any warning fails the build. Always run `cargo clippy --all-targets --all-features` locally before pushing.
270
+
271
+ - **Do not use `unsafe` beyond the existing PRNG seed**: The only `unsafe` in the codebase is in `transform.rs` for the global PRNG seed override (`RNG_SEED_OVERRIDE`). Do not add new `unsafe` blocks without a compelling reason and a code comment.
272
+
273
+ ---
274
+
275
+ ## Cursor Cloud specific instructions
276
+
277
+ This is a pure Rust CLI project with **no external services** (no database, Docker, or network dependencies). The Rust stable toolchain (rustc + cargo) is the only prerequisite.
278
+
279
+ ### One-shot environment (agents and humans)
280
+
281
+ From the repository root:
282
+
283
+ ```bash
284
+ ./scripts/setup-dev.sh
285
+ ```
286
+
287
+ This installs the **stable** toolchain with **rustfmt** and **clippy** (via `rustup` when available), runs **`cargo fetch`**, and installs a pinned **mdBook** binary under `.tools/` (same version as the Docs CI workflow) so you can run `mdbook build` without a global install. Add `.tools` to `PATH` for convenience, or invoke `.tools/mdbook build` directly.
288
+
289
+ The repo root **`rust-toolchain.toml`** pins **stable** and the **components** CI uses, so `cargo` automatically selects the right toolchain in fresh checkouts.
290
+
291
+ ### Quick reference
292
+
293
+ | Task | Command |
294
+ |------|---------|
295
+ | Setup (toolchain + fetch + mdbook) | `./scripts/setup-dev.sh` |
296
+ | Build | `cargo build` |
297
+ | Test | `cargo test --all-targets --all-features` |
298
+ | Lint | `cargo clippy --all-targets --all-features` |
299
+ | Format check | `cargo fmt --all -- --check` |
300
+ | Auto-format | `cargo fmt` |
301
+ | Docs site (mdBook) | `mdbook build` or `.tools/mdbook build` after setup |
302
+ | Run CLI | `./target/debug/dumpling --help` |
303
+
304
+ ### Running the CLI
305
+
306
+ Dumpling is fail-closed by default — it exits non-zero without a config file. To run a quick smoke test, either provide a `.dumplingconf` via `-c` or pass `--allow-noop`. Example:
307
+
308
+ ```bash
309
+ ./target/debug/dumpling --allow-noop -i /tmp/some_dump.sql -o /tmp/out.sql
310
+ ```
311
+
312
+ ### Notes
313
+
314
+ - All tests are inline `#[cfg(test)]` modules; there are no separate test files or fixtures to manage.
315
+ - The setup script (`scripts/setup-dev.sh`) uses `cargo fetch` to pre-download crate dependencies. A full `cargo build` or `cargo test` will then compile from the local cache without network access.
316
+ - No environment variables or secrets are required for building, testing, or running the CLI locally.