dumpling-cli 0.1.0__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dumpling_cli-0.3.0/.dumplingconf.example +164 -0
- dumpling_cli-0.3.0/.github/workflows/docs-pr.yml +35 -0
- {dumpling_cli-0.1.0 → dumpling_cli-0.3.0}/.github/workflows/docs.yml +9 -11
- dumpling_cli-0.3.0/.github/workflows/policy-lint.yml +53 -0
- {dumpling_cli-0.1.0 → dumpling_cli-0.3.0}/.gitignore +1 -0
- dumpling_cli-0.3.0/AGENTS.md +316 -0
- dumpling_cli-0.3.0/CHANGELOG.md +47 -0
- dumpling_cli-0.3.0/CONTRIBUTING.md +107 -0
- {dumpling_cli-0.1.0 → dumpling_cli-0.3.0}/Cargo.lock +212 -2
- {dumpling_cli-0.1.0 → dumpling_cli-0.3.0}/Cargo.toml +5 -1
- dumpling_cli-0.3.0/MAINTENANCE.md +67 -0
- dumpling_cli-0.3.0/PKG-INFO +481 -0
- dumpling_cli-0.3.0/README.md +459 -0
- {dumpling_cli-0.1.0 → dumpling_cli-0.3.0}/book.toml +1 -1
- {dumpling_cli-0.1.0 → dumpling_cli-0.3.0}/docs/src/SUMMARY.md +1 -0
- dumpling_cli-0.3.0/docs/src/ci-guardrails.md +153 -0
- dumpling_cli-0.3.0/docs/src/configuration.md +266 -0
- {dumpling_cli-0.1.0 → dumpling_cli-0.3.0}/docs/src/getting-started.md +3 -1
- dumpling_cli-0.3.0/docs/src/index.md +19 -0
- {dumpling_cli-0.1.0 → dumpling_cli-0.3.0}/pyproject.toml +3 -3
- dumpling_cli-0.3.0/rust-toolchain.toml +3 -0
- dumpling_cli-0.3.0/scripts/setup-dev.sh +89 -0
- dumpling_cli-0.3.0/src/faker_dispatch.rs +521 -0
- dumpling_cli-0.3.0/src/filter.rs +633 -0
- dumpling_cli-0.3.0/src/lint.rs +551 -0
- dumpling_cli-0.3.0/src/main.rs +507 -0
- dumpling_cli-0.3.0/src/report.rs +198 -0
- dumpling_cli-0.3.0/src/scan.rs +392 -0
- dumpling_cli-0.3.0/src/settings.rs +1698 -0
- dumpling_cli-0.3.0/src/sql.rs +3092 -0
- dumpling_cli-0.3.0/src/transform.rs +1454 -0
- dumpling_cli-0.1.0/CHANGELOG.md +0 -20
- dumpling_cli-0.1.0/PKG-INFO +0 -207
- dumpling_cli-0.1.0/README.md +0 -185
- dumpling_cli-0.1.0/docs/src/configuration.md +0 -33
- dumpling_cli-0.1.0/docs/src/index.md +0 -19
- dumpling_cli-0.1.0/src/filter.rs +0 -310
- dumpling_cli-0.1.0/src/main.rs +0 -243
- dumpling_cli-0.1.0/src/report.rs +0 -102
- dumpling_cli-0.1.0/src/settings.rs +0 -266
- dumpling_cli-0.1.0/src/sql.rs +0 -896
- dumpling_cli-0.1.0/src/transform.rs +0 -395
- {dumpling_cli-0.1.0 → dumpling_cli-0.3.0}/.github/workflows/ci.yml +0 -0
- {dumpling_cli-0.1.0 → dumpling_cli-0.3.0}/.github/workflows/platform-compat-latest.yml +0 -0
- {dumpling_cli-0.1.0 → dumpling_cli-0.3.0}/.github/workflows/platform-compat-matrix.yml +0 -0
- {dumpling_cli-0.1.0 → dumpling_cli-0.3.0}/.github/workflows/publish.yml +0 -0
- {dumpling_cli-0.1.0 → dumpling_cli-0.3.0}/.github/workflows/release.yml +0 -0
- {dumpling_cli-0.1.0 → dumpling_cli-0.3.0}/.github/workflows/tests.yml +0 -0
- {dumpling_cli-0.1.0 → dumpling_cli-0.3.0}/datetime_out.sql +0 -0
- {dumpling_cli-0.1.0 → dumpling_cli-0.3.0}/datetime_sample.sql +0 -0
- {dumpling_cli-0.1.0 → dumpling_cli-0.3.0}/docs/src/releasing.md +0 -0
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
# .dumplingconf.example
|
|
2
|
+
#
|
|
3
|
+
# Rename this file to .dumplingconf (or place the [tool.dumpling] section in
|
|
4
|
+
# pyproject.toml) to activate it. Dumpling discovers config in this order:
|
|
5
|
+
#
|
|
6
|
+
# 1. --config <path> explicit CLI flag
|
|
7
|
+
# 2. ./.dumplingconf this file (place it in your project root)
|
|
8
|
+
# 3. ./pyproject.toml [tool.dumpling]
|
|
9
|
+
#
|
|
10
|
+
# Missing config is a hard error unless --allow-noop is passed.
|
|
11
|
+
|
|
12
|
+
# ---------------------------------------------------------------------------
|
|
13
|
+
# Global salt for strategies that support it (e.g. hash).
|
|
14
|
+
# Use an env-var reference instead of a plaintext value; Dumpling warns on
|
|
15
|
+
# plaintext salts. Supported forms:
|
|
16
|
+
# salt = "${MY_ENV_VAR}" # short form
|
|
17
|
+
# salt = "${env:MY_ENV_VAR}" # explicit env provider
|
|
18
|
+
# ---------------------------------------------------------------------------
|
|
19
|
+
salt = "${DUMPLING_GLOBAL_SALT}"
|
|
20
|
+
|
|
21
|
+
# ---------------------------------------------------------------------------
|
|
22
|
+
# [rules] — per-table anonymization rules
|
|
23
|
+
#
|
|
24
|
+
# Keys are either "table" or "schema.table" (schema-qualified takes priority).
|
|
25
|
+
# Column names and table names are matched case-insensitively.
|
|
26
|
+
#
|
|
27
|
+
# Each column maps to an anonymizer spec: { strategy = "…", <options> }
|
|
28
|
+
# ---------------------------------------------------------------------------
|
|
29
|
+
# Faker strategy: `faker = "module::Type"` matches the Rust `fake` crate layout.
|
|
30
|
+
# Crate docs: https://docs.rs/fake/latest/fake/
|
|
31
|
+
# Faker modules: https://docs.rs/fake/latest/fake/faker/index.html
|
|
32
|
+
# Upstream repo: https://github.com/cksac/fake-rs
|
|
33
|
+
[rules."public.users"]
|
|
34
|
+
# email — fake email via Rust `fake` crate; force quoted string output
|
|
35
|
+
email = { strategy = "faker", faker = "internet::SafeEmail", domain = "customer_identity", unique_within_domain = true }
|
|
36
|
+
# name — locale-aware full name (see `locale`); other generators use `faker = "module::Type"`
|
|
37
|
+
full_name = { strategy = "faker", faker = "name::Name" }
|
|
38
|
+
first_name = { strategy = "faker", faker = "name::FirstName" }
|
|
39
|
+
last_name = { strategy = "faker", faker = "name::LastName" }
|
|
40
|
+
# phone — US-style (xxx) xxx-xxxx
|
|
41
|
+
phone = { strategy = "phone" }
|
|
42
|
+
# ssn — SHA-256 hex of original; use per-column salt for extra protection
|
|
43
|
+
ssn = { strategy = "hash", salt = "${DUMPLING_USERS_SSN_SALT}", as_string = true }
|
|
44
|
+
# password/token — hash without revealing original
|
|
45
|
+
password_hash = { strategy = "hash", as_string = true }
|
|
46
|
+
api_token = { strategy = "redact", as_string = true }
|
|
47
|
+
# numeric fields
|
|
48
|
+
age = { strategy = "int_range", min = 18, max = 90 }
|
|
49
|
+
# random alphanumeric string of given length
|
|
50
|
+
ref_code = { strategy = "string", length = 12 }
|
|
51
|
+
# uuid v4-like string
|
|
52
|
+
external_id = { strategy = "uuid" }
|
|
53
|
+
# set to SQL NULL
|
|
54
|
+
legacy_field = { strategy = "null" }
|
|
55
|
+
# temporal fuzzing
|
|
56
|
+
date_of_birth = { strategy = "date_fuzz", min_days = -365, max_days = 365 }
|
|
57
|
+
created_at = { strategy = "datetime_fuzz", min_seconds = -86400, max_seconds = 86400 }
|
|
58
|
+
last_login = { strategy = "datetime_fuzz" }
|
|
59
|
+
wake_time = { strategy = "time_fuzz", min_seconds = -3600, max_seconds = 3600 }
|
|
60
|
+
|
|
61
|
+
[rules."public.orders"]
|
|
62
|
+
# credit card — redact entirely; force as quoted string
|
|
63
|
+
credit_card = { strategy = "redact", as_string = true }
|
|
64
|
+
# keep the same anonymized email as users table via shared domain
|
|
65
|
+
customer_email = { strategy = "faker", faker = "internet::SafeEmail", domain = "customer_identity" }
|
|
66
|
+
|
|
67
|
+
[rules."public.audit_log"]
|
|
68
|
+
# unqualified table name also works (matches any schema)
|
|
69
|
+
ip_address = { strategy = "string", length = 15 }
|
|
70
|
+
|
|
71
|
+
# ---------------------------------------------------------------------------
|
|
72
|
+
# [sensitive_columns] — explicit list of columns that must be covered when
|
|
73
|
+
# --strict-coverage is used. Supplements automatic name-based detection.
|
|
74
|
+
# ---------------------------------------------------------------------------
|
|
75
|
+
[sensitive_columns]
|
|
76
|
+
"public.users" = ["employee_number", "tax_id", "national_id"]
|
|
77
|
+
"public.orders" = ["bank_account"]
|
|
78
|
+
|
|
79
|
+
# ---------------------------------------------------------------------------
|
|
80
|
+
# [row_filters] — retain or delete rows based on column predicates
|
|
81
|
+
#
|
|
82
|
+
# retain: keep the row only if at least one predicate matches (OR semantics)
|
|
83
|
+
# delete: drop the row if any predicate matches (evaluated after retain)
|
|
84
|
+
#
|
|
85
|
+
# Row filtering works for both INSERT … VALUES and COPY … FROM stdin.
|
|
86
|
+
# ---------------------------------------------------------------------------
|
|
87
|
+
[row_filters."public.users"]
|
|
88
|
+
retain = [
|
|
89
|
+
# Keep internal employees
|
|
90
|
+
{ column = "email", op = "ilike", value = "%@myco.com" },
|
|
91
|
+
# Keep gold-tier users
|
|
92
|
+
{ column = "tier", op = "eq", value = "gold" },
|
|
93
|
+
# Keep users with active subscription flag in a JSONB column
|
|
94
|
+
{ column = "profile.subscription.active", op = "eq", value = "true" },
|
|
95
|
+
]
|
|
96
|
+
delete = [
|
|
97
|
+
# Always drop admin accounts from anonymized dumps
|
|
98
|
+
{ column = "is_admin", op = "eq", value = "true" },
|
|
99
|
+
# Drop bot/test accounts
|
|
100
|
+
{ column = "email", op = "ilike", value = "%@example.com" },
|
|
101
|
+
# Drop users outside the US
|
|
102
|
+
{ column = "country", op = "not_in", values = ["US", "CA"] },
|
|
103
|
+
# Drop rows where age is below threshold (numeric compare)
|
|
104
|
+
{ column = "age", op = "lt", value = 18 },
|
|
105
|
+
# Drop if nested JSON array contains a specific value (Django-style path)
|
|
106
|
+
{ column = "devices__platform", op = "eq", value = "android" },
|
|
107
|
+
]
|
|
108
|
+
|
|
109
|
+
# Supported operators:
|
|
110
|
+
# eq / neq — string equality (case_insensitive = true for case-insensitive)
|
|
111
|
+
# in / not_in — list membership (use `values = [...]`)
|
|
112
|
+
# like / ilike — SQL LIKE pattern (% and _); ilike is always case-insensitive
|
|
113
|
+
# regex / iregex — Rust regex; iregex is case-insensitive
|
|
114
|
+
# lt / lte / gt / gte — numeric compare (values parsed as f64)
|
|
115
|
+
# is_null / not_null — NULL check (no value needed)
|
|
116
|
+
|
|
117
|
+
# ---------------------------------------------------------------------------
|
|
118
|
+
# [[column_cases]] — conditional per-column strategy overrides
|
|
119
|
+
#
|
|
120
|
+
# For each row and column, Dumpling evaluates cases top-to-bottom and applies
|
|
121
|
+
# the FIRST matching case. Falls back to [rules] if none match.
|
|
122
|
+
#
|
|
123
|
+
# when.any = OR semantics; when.all = AND semantics; both empty = always match.
|
|
124
|
+
# ---------------------------------------------------------------------------
|
|
125
|
+
[[column_cases."public.users".email]]
|
|
126
|
+
# Admin emails → always redact (regardless of other conditions)
|
|
127
|
+
when.any = [{ column = "is_admin", op = "eq", value = "true" }]
|
|
128
|
+
strategy = { strategy = "redact", as_string = true }
|
|
129
|
+
|
|
130
|
+
[[column_cases."public.users".email]]
|
|
131
|
+
# EU users → hash with a region-specific salt for GDPR compliance
|
|
132
|
+
when.any = [{ column = "country", op = "in", values = ["DE", "FR", "GB", "IT", "ES"] }]
|
|
133
|
+
strategy = { strategy = "hash", salt = "${DUMPLING_EU_SALT}", as_string = true }
|
|
134
|
+
|
|
135
|
+
# (No matching case → falls back to [rules."public.users"].email)
|
|
136
|
+
|
|
137
|
+
# ---------------------------------------------------------------------------
|
|
138
|
+
# [output_scan] — post-transform residual PII detection
|
|
139
|
+
#
|
|
140
|
+
# Scan the anonymized output for leftover PII-like patterns. Categories:
|
|
141
|
+
# email, ssn, pan (payment card, Luhn-validated), token (JWT/AWS/GitHub/Slack)
|
|
142
|
+
#
|
|
143
|
+
# Use --scan-output to enable scanning and --fail-on-findings to gate on it.
|
|
144
|
+
# ---------------------------------------------------------------------------
|
|
145
|
+
[output_scan]
|
|
146
|
+
enabled_categories = ["email", "ssn", "pan", "token"]
|
|
147
|
+
default_threshold = 0 # allowed findings per category (0 = none)
|
|
148
|
+
default_severity = "high"
|
|
149
|
+
fail_on_severity = "low" # fail if any category at or above this severity exceeds threshold
|
|
150
|
+
sample_limit_per_category = 5 # max sample locations stored in the JSON report
|
|
151
|
+
|
|
152
|
+
[output_scan.thresholds]
|
|
153
|
+
# Override per-category threshold (0 = zero tolerance)
|
|
154
|
+
email = 0
|
|
155
|
+
ssn = 0
|
|
156
|
+
pan = 0
|
|
157
|
+
token = 0
|
|
158
|
+
|
|
159
|
+
[output_scan.severities]
|
|
160
|
+
# Override per-category severity level
|
|
161
|
+
email = "medium"
|
|
162
|
+
ssn = "high"
|
|
163
|
+
pan = "critical"
|
|
164
|
+
token = "high"
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# mdBook verification on pull requests only (no GitHub Pages upload or deploy).
|
|
2
|
+
# Pages build + deploy live in docs.yml and run on pushes to main.
|
|
3
|
+
name: Docs (PR)
|
|
4
|
+
|
|
5
|
+
on:
|
|
6
|
+
pull_request:
|
|
7
|
+
paths:
|
|
8
|
+
- "README.md"
|
|
9
|
+
- "book.toml"
|
|
10
|
+
- "docs/**"
|
|
11
|
+
- ".github/workflows/docs.yml"
|
|
12
|
+
- ".github/workflows/docs-pr.yml"
|
|
13
|
+
|
|
14
|
+
permissions:
|
|
15
|
+
contents: read
|
|
16
|
+
|
|
17
|
+
concurrency:
|
|
18
|
+
group: docs-pr-${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
|
|
19
|
+
cancel-in-progress: true
|
|
20
|
+
|
|
21
|
+
jobs:
|
|
22
|
+
verify:
|
|
23
|
+
name: Build mdBook (verify)
|
|
24
|
+
runs-on: ubuntu-latest
|
|
25
|
+
steps:
|
|
26
|
+
- name: Checkout
|
|
27
|
+
uses: actions/checkout@v4
|
|
28
|
+
|
|
29
|
+
- name: Install mdBook
|
|
30
|
+
uses: peaceiris/actions-mdbook@v2
|
|
31
|
+
with:
|
|
32
|
+
mdbook-version: "0.4.52"
|
|
33
|
+
|
|
34
|
+
- name: Build documentation site
|
|
35
|
+
run: mdbook build
|
|
@@ -1,12 +1,8 @@
|
|
|
1
|
+
# Build and deploy the mdBook site to GitHub Pages (main branch only).
|
|
2
|
+
# Pull-request verification runs in docs-pr.yml — this workflow does not run on PRs.
|
|
1
3
|
name: Docs
|
|
2
4
|
|
|
3
5
|
on:
|
|
4
|
-
pull_request:
|
|
5
|
-
paths:
|
|
6
|
-
- "README.md"
|
|
7
|
-
- "book.toml"
|
|
8
|
-
- "docs/**"
|
|
9
|
-
- ".github/workflows/docs.yml"
|
|
10
6
|
push:
|
|
11
7
|
branches:
|
|
12
8
|
- main
|
|
@@ -18,16 +14,16 @@ on:
|
|
|
18
14
|
|
|
19
15
|
permissions:
|
|
20
16
|
contents: read
|
|
21
|
-
pages: write
|
|
22
|
-
id-token: write
|
|
23
17
|
|
|
24
18
|
concurrency:
|
|
25
|
-
group: docs-${{ github.ref }}
|
|
19
|
+
group: docs-pages-${{ github.ref }}
|
|
26
20
|
cancel-in-progress: true
|
|
27
21
|
|
|
28
22
|
jobs:
|
|
29
23
|
build:
|
|
30
24
|
runs-on: ubuntu-latest
|
|
25
|
+
permissions:
|
|
26
|
+
contents: read
|
|
31
27
|
steps:
|
|
32
28
|
- name: Checkout
|
|
33
29
|
uses: actions/checkout@v4
|
|
@@ -40,15 +36,17 @@ jobs:
|
|
|
40
36
|
- name: Build documentation site
|
|
41
37
|
run: mdbook build
|
|
42
38
|
|
|
43
|
-
- name: Upload
|
|
39
|
+
- name: Upload Pages deployment artifact
|
|
44
40
|
uses: actions/upload-pages-artifact@v3
|
|
45
41
|
with:
|
|
46
42
|
path: docs/book
|
|
47
43
|
|
|
48
44
|
deploy:
|
|
49
|
-
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
|
|
50
45
|
needs: build
|
|
51
46
|
runs-on: ubuntu-latest
|
|
47
|
+
permissions:
|
|
48
|
+
pages: write
|
|
49
|
+
id-token: write
|
|
52
50
|
environment:
|
|
53
51
|
name: github-pages
|
|
54
52
|
url: ${{ steps.deployment.outputs.page_url }}
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
name: Policy Lint
|
|
2
|
+
|
|
3
|
+
# Reference CI guardrail template for Dumpling anonymization policy.
|
|
4
|
+
#
|
|
5
|
+
# This workflow runs `dumpling lint-policy` on every PR and push to main to
|
|
6
|
+
# catch policy regressions before they are merged.
|
|
7
|
+
#
|
|
8
|
+
# Checks performed:
|
|
9
|
+
# empty-rules-table — a [rules] entry has no column rules
|
|
10
|
+
# empty-column-cases-table — a [column_cases] entry has no column cases
|
|
11
|
+
# unsalted-hash — hash strategy used without any salt
|
|
12
|
+
# inconsistent-domain-strategy — same domain used with different strategies
|
|
13
|
+
# uncovered-sensitive-column — sensitive_columns entry with no matching rule
|
|
14
|
+
#
|
|
15
|
+
# Exit codes:
|
|
16
|
+
# 0 — no violations found
|
|
17
|
+
# 1 — one or more violations found (job fails)
|
|
18
|
+
|
|
19
|
+
on:
|
|
20
|
+
pull_request:
|
|
21
|
+
push:
|
|
22
|
+
branches:
|
|
23
|
+
- main
|
|
24
|
+
|
|
25
|
+
permissions:
|
|
26
|
+
contents: read
|
|
27
|
+
|
|
28
|
+
jobs:
|
|
29
|
+
policy-lint:
|
|
30
|
+
name: Anonymization policy lint
|
|
31
|
+
runs-on: ubuntu-latest
|
|
32
|
+
|
|
33
|
+
steps:
|
|
34
|
+
- name: Checkout
|
|
35
|
+
uses: actions/checkout@v4
|
|
36
|
+
|
|
37
|
+
- name: Install Rust toolchain
|
|
38
|
+
uses: dtolnay/rust-toolchain@stable
|
|
39
|
+
|
|
40
|
+
- name: Cache Cargo build artifacts
|
|
41
|
+
uses: Swatinem/rust-cache@v2
|
|
42
|
+
|
|
43
|
+
- name: Build dumpling
|
|
44
|
+
run: cargo build --release --locked
|
|
45
|
+
|
|
46
|
+
- name: Lint anonymization policy
|
|
47
|
+
# --allow-noop lets the command succeed when no config is present.
|
|
48
|
+
# In your own repo, remove --allow-noop so a missing config is a hard
|
|
49
|
+
# error (fail-closed). If your config lives at a non-default path, add:
|
|
50
|
+
# --config path/to/.dumplingconf
|
|
51
|
+
# Dumpling auto-discovers .dumplingconf or pyproject.toml [tool.dumpling]
|
|
52
|
+
# in the working directory when --config is omitted.
|
|
53
|
+
run: ./target/release/dumpling lint-policy --allow-noop
|
|
@@ -0,0 +1,316 @@
|
|
|
1
|
+
# AGENTS.md
|
|
2
|
+
|
|
3
|
+
This file provides structured guidance for AI coding agents working on the **Dumpling** project. Read this before making any code changes.
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## Project Overview
|
|
8
|
+
|
|
9
|
+
Dumpling is a streaming, static anonymizer for Postgres plain-SQL dumps produced by `pg_dump`. It reads dump files line by line, anonymizes sensitive column data according to TOML-based configuration rules, and writes sanitized output—without ever connecting to a live database.
|
|
10
|
+
|
|
11
|
+
Key design goals:
|
|
12
|
+
- **Fail-closed**: Missing config is a hard error by default (prevents accidental no-ops in pipelines)
|
|
13
|
+
- **Streaming**: Processes dumps line by line so even multi-gigabyte files use minimal memory
|
|
14
|
+
- **Deterministic**: Domain mappings ensure the same source value always produces the same pseudonym, preserving foreign-key consistency across tables
|
|
15
|
+
- **CI-ready**: `--check` mode, strict coverage enforcement, JSON reports, and a residual-PII scan gate plug into any pipeline
|
|
16
|
+
|
|
17
|
+
---
|
|
18
|
+
|
|
19
|
+
## Repository Layout
|
|
20
|
+
|
|
21
|
+
```
|
|
22
|
+
src/
|
|
23
|
+
main.rs — CLI entrypoint (clap), IO routing, program orchestration
|
|
24
|
+
settings.rs — Config loading, TOML parsing, secret resolution, validation, normalization
|
|
25
|
+
transform.rs — All anonymization strategies, PRNG, deterministic domain mapping
|
|
26
|
+
sql.rs — SQL stream processor: INSERT + COPY parsing, column strategy selection,
|
|
27
|
+
CREATE TABLE length extraction, sensitive coverage tracking
|
|
28
|
+
filter.rs — Row-filter predicate evaluation (eq/neq/like/regex/JSON-path/…)
|
|
29
|
+
scan.rs — Post-transform residual PII scanner (email/SSN/PAN/token regex)
|
|
30
|
+
report.rs — JSON report data structures and Reporter helper
|
|
31
|
+
docs/src/ — mdBook documentation source
|
|
32
|
+
.github/ — CI/CD GitHub Actions workflows
|
|
33
|
+
Cargo.toml — Rust package manifest
|
|
34
|
+
pyproject.toml — Python packaging (maturin) for pip-installable CLI
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
---
|
|
38
|
+
|
|
39
|
+
## Development Setup
|
|
40
|
+
|
|
41
|
+
### Prerequisites
|
|
42
|
+
|
|
43
|
+
- **Rust stable toolchain**: install via [rustup.rs](https://rustup.rs/). No database, Docker, or external services required.
|
|
44
|
+
|
|
45
|
+
### Build
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
cargo build # debug build
|
|
49
|
+
cargo build --release # optimized release build
|
|
50
|
+
./target/debug/dumpling --help
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
### Run locally
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
# Anonymize a dump using a config found in CWD
|
|
57
|
+
dumpling -i dump.sql -o sanitized.sql
|
|
58
|
+
|
|
59
|
+
# Explicit config path
|
|
60
|
+
dumpling -i dump.sql -o sanitized.sql -c path/to/.dumplingconf
|
|
61
|
+
|
|
62
|
+
# Check mode: exit 1 if any changes would be made, write nothing
|
|
63
|
+
dumpling --check -i dump.sql
|
|
64
|
+
|
|
65
|
+
# Allow running with no config (explicitly opt into passthrough)
|
|
66
|
+
dumpling --allow-noop -i dump.sql -o out.sql
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
Config is discovered in order: `--config` path → `./.dumplingconf` → `./pyproject.toml [tool.dumpling]`.
|
|
70
|
+
|
|
71
|
+
---
|
|
72
|
+
|
|
73
|
+
## Testing
|
|
74
|
+
|
|
75
|
+
Run the full test suite:
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
cargo test --all-targets --all-features
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
All tests are inline `#[cfg(test)]` modules at the bottom of each source file. There are no separate test files.
|
|
82
|
+
|
|
83
|
+
### Test conventions
|
|
84
|
+
|
|
85
|
+
- Call `set_random_seed(N)` from `transform.rs` when a test exercises fuzz strategies (`date_fuzz`, `int_range`, etc.) to ensure reproducible, assertion-checkable output.
|
|
86
|
+
- Config-parsing tests use a `write_temp_config(contents)` helper that writes a TOML file to `std::env::temp_dir()`. Always clean up temp files after assertions with `let _ = fs::remove_file(path)`.
|
|
87
|
+
- Never hard-code absolute paths; always use `std::env::temp_dir()` or `std::env::current_dir()`.
|
|
88
|
+
- When testing with `ResolvedConfig` constructed directly (bypassing file loading), initialize all `HashMap` fields explicitly. Construct `OutputScanConfig::default()` for the `output_scan` field.
|
|
89
|
+
- Use `proc.process(&mut reader, &mut out)` and then `String::from_utf8(out).unwrap()` to get the full processed SQL output for assertion.
|
|
90
|
+
|
|
91
|
+
---
|
|
92
|
+
|
|
93
|
+
## CI Gates
|
|
94
|
+
|
|
95
|
+
**All three must pass.** Run these locally before pushing:
|
|
96
|
+
|
|
97
|
+
```bash
|
|
98
|
+
cargo fmt --all -- --check # formatting
|
|
99
|
+
cargo clippy --all-targets --all-features # linting (zero warnings allowed)
|
|
100
|
+
cargo test --all-targets --all-features # tests
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
Apply auto-formatting with `cargo fmt`. For clippy warnings, fix the code rather than adding `#[allow(...)]` attributes unless there is a specific, documented reason.
|
|
104
|
+
|
|
105
|
+
---
|
|
106
|
+
|
|
107
|
+
## Code Conventions
|
|
108
|
+
|
|
109
|
+
### Imports
|
|
110
|
+
|
|
111
|
+
Imports **must** be at the top of each module, grouped by standard library / external crates / internal crates. Never place `use` statements inside function bodies unless strictly necessary to resolve a circular import. This is a hard rule.
|
|
112
|
+
|
|
113
|
+
### Error handling
|
|
114
|
+
|
|
115
|
+
- Use `anyhow::Result<T>` for fallible public functions that propagate errors toward `main`.
|
|
116
|
+
- Attach context with `.with_context(|| format!("…"))` when propagating errors so callers see where the failure originated.
|
|
117
|
+
- Use `anyhow::bail!(…)` for early exits with a descriptive message.
|
|
118
|
+
- Define new typed errors with `thiserror` only when callers need to match on them (rare in this codebase).
|
|
119
|
+
|
|
120
|
+
### Naming and normalization
|
|
121
|
+
|
|
122
|
+
- All config table and column keys are normalized to **lowercase** during `resolve()` in `settings.rs`. The lookup helpers (`lookup_column_rule`, `lookup_column_cases`, `lookup_row_filters`, `lookup_sensitive_columns`) always compare lowercase keys. Pass lowercase when calling them.
|
|
123
|
+
- Follow standard Rust naming: `snake_case` for variables, functions, and modules; `PascalCase` for types and enum variants.
|
|
124
|
+
|
|
125
|
+
### SQL output quoting
|
|
126
|
+
|
|
127
|
+
The `Replacement` type in `transform.rs` carries a `force_quoted` flag that controls whether the value is wrapped in SQL single quotes on output.
|
|
128
|
+
|
|
129
|
+
- `Replacement::quoted(v)` — forces single-quoted output (use for strings, emails, names, etc.)
|
|
130
|
+
- `Replacement::unquoted(v)` — raw output (use for integers, hashes when not forced to string)
|
|
131
|
+
- `Replacement::null()` — renders as `NULL` / `\N` (COPY format)
|
|
132
|
+
|
|
133
|
+
`render_cell` in `sql.rs` applies: `force_quoted || original.was_quoted` → quoted output. New strategies should use `quoted` for text-like values and `unquoted` for numeric/raw values.
|
|
134
|
+
|
|
135
|
+
---
|
|
136
|
+
|
|
137
|
+
## Architecture Deep-Dive
|
|
138
|
+
|
|
139
|
+
### Config Resolution Flow (`settings.rs`)
|
|
140
|
+
|
|
141
|
+
1. Load raw TOML bytes from the selected source.
|
|
142
|
+
2. Walk the TOML value tree with `resolve_secrets_in_value`: replace `${ENV_VAR}` and `${env:ENV_VAR}` references with environment variable values. Emit stderr warnings for plaintext `salt` values.
|
|
143
|
+
3. Deserialize the resolved `toml::Value` into `RawConfig` via serde.
|
|
144
|
+
4. Call `validate_raw_config`: verify all `strategy` names are in `KNOWN_STRATEGIES`, check strategy-option compatibility (e.g., `salt` is only valid for `hash`), and validate numeric bounds.
|
|
145
|
+
5. Call `resolve()`: normalize all table/column keys to lowercase, producing `ResolvedConfig`.
|
|
146
|
+
|
|
147
|
+
**Lookup pattern** — all lookup helpers try `schema.table` first, then fall back to bare `table`:
|
|
148
|
+
|
|
149
|
+
```rust
|
|
150
|
+
lookup_column_rule(&cfg, Some("public"), "users", "email")
|
|
151
|
+
// tries "public.users" then "users"
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
### SQL Stream Processing (`sql.rs`)
|
|
155
|
+
|
|
156
|
+
`SqlStreamProcessor::process()` drives a state machine over lines:
|
|
157
|
+
|
|
158
|
+
| Mode | Trigger | Behavior |
|
|
159
|
+
|------|---------|----------|
|
|
160
|
+
| `Pass` | Default | Passthrough; detects INSERT/COPY/CREATE TABLE starts |
|
|
161
|
+
| `InInsert` | `INSERT INTO …` without trailing `;` | Accumulate until `statement_complete()` detects `;` outside quotes/parens |
|
|
162
|
+
| `InCopy` | `COPY … FROM stdin;` | Process tab-delimited rows until `\.` |
|
|
163
|
+
| `InCreateTable` | `CREATE TABLE …` | Accumulate until `;`; parse column length limits |
|
|
164
|
+
|
|
165
|
+
**Per-row pipeline (INSERT and COPY)**:
|
|
166
|
+
1. Parse cells (preserving quoting metadata into `Cell` structs).
|
|
167
|
+
2. Call `should_keep_row` (row filters) — skip row if filtered.
|
|
168
|
+
3. For each column, call `select_strategy_for_cell`:
|
|
169
|
+
- Check `column_cases` for this table+column; iterate in declared order; **first matching `when` wins** (first-match-wins, no fallthrough).
|
|
170
|
+
- Fall back to base `rules` entry for the table+column.
|
|
171
|
+
- Return `None` → cell passes through unchanged.
|
|
172
|
+
4. Call `apply_anonymizer` → `Replacement`.
|
|
173
|
+
5. Render back to SQL with `render_cell`.
|
|
174
|
+
|
|
175
|
+
`CREATE TABLE` statements are parsed to extract `varchar(N)` / `character varying(N)` / `char(N)` / `character(N)` / `bpchar(N)` length limits. These are stored in `column_length_limits` and passed to `apply_anonymizer` so generated strings are truncated to fit the column constraint.
|
|
176
|
+
|
|
177
|
+
### Anonymization (`transform.rs`)
|
|
178
|
+
|
|
179
|
+
Two code paths:
|
|
180
|
+
|
|
181
|
+
- **Random path**: `apply_random_anonymizer` — uses a global xorshift64\* PRNG (seeded from system time, `--seed` flag, or `DUMPLING_SEED` env var). Used when `spec.domain` is absent.
|
|
182
|
+
- **Deterministic/domain path**: `apply_domain_anonymizer` → `apply_deterministic_anonymizer` — uses a `DeterministicByteStream` (SHA-256 CTR-mode) seeded from `(domain_key, original_value, strategy, salt, collision_index)`. Used when `spec.domain` is set. The `domain_mappings` cache ensures the same source value always maps to the same pseudonym within a domain across tables. When `unique_within_domain = true`, collision detection retries up to `MAX_DOMAIN_UNIQUENESS_ATTEMPTS` (4096) before giving up.
|
|
183
|
+
|
|
184
|
+
### Row Filtering (`filter.rs`)
|
|
185
|
+
|
|
186
|
+
`should_keep_row` evaluates `RowFilterSet`:
|
|
187
|
+
- `retain` (OR): if non-empty, a row is kept only if at least one predicate matches.
|
|
188
|
+
- `delete` (any-match): if any predicate matches, the row is dropped (evaluated after `retain`).
|
|
189
|
+
|
|
190
|
+
JSON path traversal is supported: `payload.profile.tier` (dot) and `payload__profile__tier` (Django double-underscore). Array elements are traversed by evaluating each item's fields, so list-of-dicts structures work naturally.
|
|
191
|
+
|
|
192
|
+
### Residual PII Scanning (`scan.rs`)
|
|
193
|
+
|
|
194
|
+
`ScanningWriter` wraps the output `Write` stream, intercepting bytes and passing them to `OutputScanner`. The scanner applies regex detectors line-by-line for:
|
|
195
|
+
- `email`: RFC-like email pattern
|
|
196
|
+
- `ssn`: US SSN-like `DDD-DD-DDDD` with invalid area/group/serial number rejection
|
|
197
|
+
- `pan`: Payment card numbers (13–19 digits, Luhn-validated)
|
|
198
|
+
- `token`: JWT, AWS access key IDs (`AKIA…`), GitHub PATs (`ghp_/gho_/…`), Slack tokens, labeled `key=value` patterns
|
|
199
|
+
|
|
200
|
+
Findings are aggregated per category and compared against configurable thresholds and severity gates.
|
|
201
|
+
|
|
202
|
+
---
|
|
203
|
+
|
|
204
|
+
## How to Add a New Anonymization Strategy
|
|
205
|
+
|
|
206
|
+
Follow these steps in order. Do not skip any step.
|
|
207
|
+
|
|
208
|
+
1. **`src/settings.rs` — `KNOWN_STRATEGIES`**: Add the strategy name string to the `KNOWN_STRATEGIES` const slice.
|
|
209
|
+
|
|
210
|
+
2. **`src/settings.rs` — `AnonymizerSpec`**: If the strategy needs new config fields (e.g., `min_length`, `charset`), add them as `pub field: Option<T>`. Keep them `Option` for backwards compatibility.
|
|
211
|
+
|
|
212
|
+
3. **`src/settings.rs` — `validate_anonymizer_spec`**:
|
|
213
|
+
- Add the new strategy-specific fields to the `unsupported` list for all *other* strategies (so using them with the wrong strategy produces a clear error).
|
|
214
|
+
- Add bounds or range validation if applicable (see `int_range` and `string` examples).
|
|
215
|
+
|
|
216
|
+
4. **`src/transform.rs` — `apply_random_anonymizer`**: Add a `match` arm with the random implementation. Return an appropriate `Replacement` variant.
|
|
217
|
+
|
|
218
|
+
5. **`src/transform.rs` — `apply_deterministic_anonymizer`**: Add a matching `match` arm for domain-mapping support. Use the provided `DeterministicByteStream` for all randomness to ensure reproducibility.
|
|
219
|
+
|
|
220
|
+
6. **`src/transform.rs` — `should_enforce_max_len`**: If the strategy generates string values that should be truncated to fit `varchar(N)` columns, make sure it is **not** in the exclusion list. Currently excluded: `null` and `int_range`.
|
|
221
|
+
|
|
222
|
+
7. **Tests**: Add `#[test]` functions in `src/transform.rs` (unit-test strategy output values) and in `src/sql.rs` (end-to-end pipeline test). Use `set_random_seed(N)` for reproducibility.
|
|
223
|
+
|
|
224
|
+
8. **`README.md`**: Add a row to the "Anonymization strategies" table.
|
|
225
|
+
|
|
226
|
+
**`faker` strategy:** Config only carries string identifiers; Dumpling never evaluates user Rust from config. To ship a new generator, add dispatch in `src/faker_dispatch.rs` and validation in `validate_anonymizer_spec` for the `faker` branch. Upstream reference: [`fake` on docs.rs](https://docs.rs/fake/latest/fake/), [`fake::faker` module index](https://docs.rs/fake/latest/fake/faker/index.html), [source on GitHub](https://github.com/cksac/fake-rs).
|
|
227
|
+
|
|
228
|
+
---
|
|
229
|
+
|
|
230
|
+
## How to Add a New Row Filter Predicate Operator
|
|
231
|
+
|
|
232
|
+
1. **`src/settings.rs` — `Predicate.op` doc comment**: Update the doc comment listing supported operator names.
|
|
233
|
+
|
|
234
|
+
2. **`src/filter.rs` — `predicate_matches`**: Add a match arm for the new operator string in the appropriate branch (`single-value` or `multi-value`).
|
|
235
|
+
|
|
236
|
+
3. **Tests**: Add `#[test]` functions in `src/filter.rs`.
|
|
237
|
+
|
|
238
|
+
4. **`README.md`**: Add a row to the predicate operators table.
|
|
239
|
+
|
|
240
|
+
---
|
|
241
|
+
|
|
242
|
+
## Exit Codes
|
|
243
|
+
|
|
244
|
+
| Code | Meaning |
|
|
245
|
+
|------|---------|
|
|
246
|
+
| `0` | Success |
|
|
247
|
+
| `1` | Config/startup error, or `--check` mode found changes |
|
|
248
|
+
| `2` | `--strict-coverage`: uncovered sensitive columns detected |
|
|
249
|
+
| `3` | `--fail-on-findings`: output scan thresholds exceeded |
|
|
250
|
+
|
|
251
|
+
---
|
|
252
|
+
|
|
253
|
+
## Important Gotchas
|
|
254
|
+
|
|
255
|
+
- **Fail-closed by default**: If no config is found, Dumpling exits non-zero. Tests that exercise the CLI without providing a config must either supply one or pass `--allow-noop`. Never assume a missing config is safe.
|
|
256
|
+
|
|
257
|
+
- **`table_options` is intentionally removed**: The `[table_options]` key was deprecated and now deliberately fails with a targeted error pointing to `[rules]` and `[column_cases]`. Do not re-introduce it.
|
|
258
|
+
|
|
259
|
+
- **`null` and `redact` reject `domain`**: These strategies produce constant outputs regardless of input, so deterministic domain mapping is meaningless. The validator rejects `domain` with these strategies.
|
|
260
|
+
|
|
261
|
+
- **COPY NULL representation**: In COPY format, `\N` (backslash-N) means NULL. This is different from the string `"NULL"` used in INSERT VALUES format. Handle both correctly.
|
|
262
|
+
|
|
263
|
+
- **Multi-line INSERT statements**: A single logical INSERT may span many lines (pg_dump does this for large `VALUES` lists). The state machine accumulates lines until `statement_complete()` detects a semicolon that is outside all single-quote and parenthesis context.
|
|
264
|
+
|
|
265
|
+
- **Column key normalization is mandatory**: All config lookups compare lowercase keys. A table `Public.Users` in the dump matches config key `"public.users"`. Always normalize before lookup; the lookup helpers do this for you if you use them.
|
|
266
|
+
|
|
267
|
+
- **`AnonymizerSpec` construction in tests**: When building `AnonymizerSpec` directly in tests, you must set all fields explicitly (there is no `Default` impl). Set unused fields to `None`.
|
|
268
|
+
|
|
269
|
+
- **Clippy is zero-tolerance**: CI runs clippy with all targets and features. Any warning fails the build. Always run `cargo clippy --all-targets --all-features` locally before pushing.
|
|
270
|
+
|
|
271
|
+
- **Do not use `unsafe` beyond the existing PRNG seed**: The only `unsafe` in the codebase is in `transform.rs` for the global PRNG seed override (`RNG_SEED_OVERRIDE`). Do not add new `unsafe` blocks without a compelling reason and a code comment.
|
|
272
|
+
|
|
273
|
+
---
|
|
274
|
+
|
|
275
|
+
## Cursor Cloud specific instructions
|
|
276
|
+
|
|
277
|
+
This is a pure Rust CLI project with **no external services** (no database, Docker, or network dependencies). The Rust stable toolchain (rustc + cargo) is the only prerequisite.
|
|
278
|
+
|
|
279
|
+
### One-shot environment (agents and humans)
|
|
280
|
+
|
|
281
|
+
From the repository root:
|
|
282
|
+
|
|
283
|
+
```bash
|
|
284
|
+
./scripts/setup-dev.sh
|
|
285
|
+
```
|
|
286
|
+
|
|
287
|
+
This installs the **stable** toolchain with **rustfmt** and **clippy** (via `rustup` when available), runs **`cargo fetch`**, and installs a pinned **mdBook** binary under `.tools/` (same version as the Docs CI workflow) so you can run `mdbook build` without a global install. Add `.tools` to `PATH` for convenience, or invoke `.tools/mdbook build` directly.
|
|
288
|
+
|
|
289
|
+
The repo root **`rust-toolchain.toml`** pins **stable** and the **components** CI uses, so `cargo` automatically selects the right toolchain in fresh checkouts.
|
|
290
|
+
|
|
291
|
+
### Quick reference
|
|
292
|
+
|
|
293
|
+
| Task | Command |
|
|
294
|
+
|------|---------|
|
|
295
|
+
| Setup (toolchain + fetch + mdbook) | `./scripts/setup-dev.sh` |
|
|
296
|
+
| Build | `cargo build` |
|
|
297
|
+
| Test | `cargo test --all-targets --all-features` |
|
|
298
|
+
| Lint | `cargo clippy --all-targets --all-features` |
|
|
299
|
+
| Format check | `cargo fmt --all -- --check` |
|
|
300
|
+
| Auto-format | `cargo fmt` |
|
|
301
|
+
| Docs site (mdBook) | `mdbook build` or `.tools/mdbook build` after setup |
|
|
302
|
+
| Run CLI | `./target/debug/dumpling --help` |
|
|
303
|
+
|
|
304
|
+
### Running the CLI
|
|
305
|
+
|
|
306
|
+
Dumpling is fail-closed by default — it exits non-zero without a config file. To run a quick smoke test, either provide a `.dumplingconf` via `-c` or pass `--allow-noop`. Example:
|
|
307
|
+
|
|
308
|
+
```bash
|
|
309
|
+
./target/debug/dumpling --allow-noop -i /tmp/some_dump.sql -o /tmp/out.sql
|
|
310
|
+
```
|
|
311
|
+
|
|
312
|
+
### Notes
|
|
313
|
+
|
|
314
|
+
- All tests are inline `#[cfg(test)]` modules; there are no separate test files or fixtures to manage.
|
|
315
|
+
- The setup script uses `cargo fetch` to pre-download crate dependencies. A full `cargo build` or `cargo test` will then compile from the local cache without network access.
|
|
316
|
+
- No environment variables or secrets are required for building, testing, or running the CLI locally.
|