datamasque-cli 1.1.0__tar.gz → 1.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/CHANGELOG.md +42 -0
  2. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/PKG-INFO +24 -1
  3. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/README.md +23 -0
  4. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/claude-skills/datamasque-cli/skills/datamasque-cli/SKILL.md +1 -1
  5. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/claude-skills/ruleset-builder/.claude-plugin/plugin.json +1 -1
  6. datamasque_cli-1.3.0/claude-skills/ruleset-builder/skills/ruleset-builder/SKILL.md +176 -0
  7. datamasque_cli-1.3.0/claude-skills/ruleset-builder/skills/ruleset-builder/references/fk-cascade.md +109 -0
  8. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/claude-skills/ruleset-builder/skills/ruleset-builder/references/hash-columns-guide.md +8 -8
  9. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/claude-skills/ruleset-builder/skills/ruleset-builder/references/mask-definitions-guide.md +7 -7
  10. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/claude-skills/ruleset-builder/skills/ruleset-builder/references/ruleset-libraries-guide.md +7 -7
  11. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/claude-skills/ruleset-builder/skills/ruleset-builder/references/ruleset-yaml-reference.md +24 -0
  12. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/pyproject.toml +1 -1
  13. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/src/datamasque_cli/client.py +63 -25
  14. datamasque_cli-1.3.0/src/datamasque_cli/commands/ifm.py +354 -0
  15. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/src/datamasque_cli/commands/system.py +32 -1
  16. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/src/datamasque_cli/main.py +2 -0
  17. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/src/datamasque_cli/output.py +26 -6
  18. datamasque_cli-1.3.0/tests/commands/test_ifm.py +577 -0
  19. datamasque_cli-1.3.0/tests/commands/test_system.py +82 -0
  20. datamasque_cli-1.3.0/tests/test_client_ifm.py +65 -0
  21. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/uv.lock +1 -1
  22. datamasque_cli-1.1.0/claude-skills/ruleset-builder/skills/ruleset-builder/SKILL.md +0 -175
  23. datamasque_cli-1.1.0/tests/commands/test_system.py +0 -38
  24. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/.claude-plugin/marketplace.json +0 -0
  25. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/.github/workflows/ci.yml +0 -0
  26. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/.github/workflows/release-testpypi.yml +0 -0
  27. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/.github/workflows/release.yml +0 -0
  28. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/.gitignore +0 -0
  29. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/CONTRIBUTING.md +0 -0
  30. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/LICENSE +0 -0
  31. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/Makefile +0 -0
  32. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/NOTICE +0 -0
  33. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/assets/demo.gif +0 -0
  34. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/claude-skills/README.md +0 -0
  35. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/claude-skills/datamasque-cli/.claude-plugin/plugin.json +0 -0
  36. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/claude-skills/ruleset-splitter/.claude-plugin/plugin.json +0 -0
  37. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/claude-skills/ruleset-splitter/skills/ruleset-splitter/SKILL.md +0 -0
  38. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/scripts/active_profile_env.py +0 -0
  39. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/scripts/bump_version.py +0 -0
  40. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/src/datamasque_cli/__init__.py +0 -0
  41. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/src/datamasque_cli/commands/__init__.py +0 -0
  42. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/src/datamasque_cli/commands/auth.py +0 -0
  43. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/src/datamasque_cli/commands/connections.py +0 -0
  44. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/src/datamasque_cli/commands/discovery.py +0 -0
  45. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/src/datamasque_cli/commands/files.py +0 -0
  46. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/src/datamasque_cli/commands/ruleset_libraries.py +0 -0
  47. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/src/datamasque_cli/commands/rulesets.py +0 -0
  48. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/src/datamasque_cli/commands/runs.py +0 -0
  49. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/src/datamasque_cli/commands/seeds.py +0 -0
  50. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/src/datamasque_cli/commands/users.py +0 -0
  51. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/src/datamasque_cli/config.py +0 -0
  52. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/src/datamasque_cli/py.typed +0 -0
  53. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/tests/__init__.py +0 -0
  54. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/tests/commands/__init__.py +0 -0
  55. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/tests/commands/test_auth.py +0 -0
  56. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/tests/commands/test_catalog.py +0 -0
  57. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/tests/commands/test_connections.py +0 -0
  58. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/tests/commands/test_discovery.py +0 -0
  59. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/tests/commands/test_files.py +0 -0
  60. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/tests/commands/test_ruleset_libraries.py +0 -0
  61. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/tests/commands/test_rulesets.py +0 -0
  62. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/tests/commands/test_runs.py +0 -0
  63. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/tests/commands/test_seeds.py +0 -0
  64. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/tests/commands/test_users.py +0 -0
  65. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/tests/conftest.py +0 -0
  66. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/tests/integration/README.md +0 -0
  67. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/tests/integration/__init__.py +0 -0
  68. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/tests/integration/conftest.py +0 -0
  69. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/tests/integration/test_connections.py +0 -0
  70. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/tests/integration/test_delete_safety.py +0 -0
  71. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/tests/integration/test_rulesets.py +0 -0
  72. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/tests/integration/test_runs.py +0 -0
  73. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/tests/test_client_auth.py +0 -0
  74. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/tests/test_client_env.py +0 -0
  75. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/tests/test_client_profile.py +0 -0
  76. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/tests/test_config.py +0 -0
  77. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/tests/test_output.py +0 -0
  78. {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/tests/test_version.py +0 -0
@@ -1,5 +1,47 @@
1
1
  # Changelog
2
2
 
3
+ ## v1.3.0
4
+
5
+ ### Added
6
+ - `dm system ai-engine show` and `dm system ai-engine set <URL>` — view and
7
+ configure the AI Engine URL.
8
+
9
+ ## v1.2.0
10
+
11
+ ### Added
12
+ - `dm ifm` command group
13
+ for managing in-flight masking ruleset plans
14
+ and running mask operations against the IFM service:
15
+ - `dm ifm list` —
16
+ list all IFM ruleset plans.
17
+ - `dm ifm get <name>` —
18
+ show plan metadata,
19
+ or the ruleset YAML with `--yaml`.
20
+ - `dm ifm create --name <name> --file <yaml>` —
21
+ create a plan from a YAML ruleset,
22
+ with optional `--enabled/--disabled` and `--log-level`.
23
+ - `dm ifm update <name>` —
24
+ update a plan;
25
+ pass any of `--file`, `--enabled/--disabled`, `--log-level`
26
+ and only those fields are sent.
27
+ - `dm ifm delete <name>` —
28
+ delete a plan
29
+ (interactive confirm,
30
+ or `--yes` to skip).
31
+ - `dm ifm mask <name> --data <file|->` —
32
+ mask a JSON list of records against a plan,
33
+ with `--disable-instance-secret`,
34
+ `--run-secret`,
35
+ `--log-level`,
36
+ `--request-id`,
37
+ and `--json/--no-json` (NDJSON) output.
38
+ - `dm ifm verify-token` —
39
+ verify the current IFM token and list its scopes.
40
+
41
+ Authentication reuses your existing `dm` profile credentials
42
+ via the SDK's `DataMasqueIfmClient`,
43
+ which transparently exchanges admin-server credentials for an IFM JWT.
44
+
3
45
  ## v1.1.0
4
46
 
5
47
  ### Added
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datamasque-cli
3
- Version: 1.1.0
3
+ Version: 1.3.0
4
4
  Summary: Official command-line interface for the DataMasque data-masking platform.
5
5
  Project-URL: Homepage, https://datamasque.com/
6
6
  Project-URL: Repository, https://github.com/datamasque/datamasque-cli
@@ -39,6 +39,7 @@ so teams can use production-shaped data in non-production environments without e
39
39
  DataMasque CLI `dm` covers:
40
40
 
41
41
  - connections, rulesets, ruleset libraries, and masking runs
42
+ - in-flight masking (IFM) ruleset plans and on-demand mask requests
42
43
  - schema discovery and sensitive-data discovery
43
44
  - users, files, and DataMasque instance administration
44
45
 
@@ -196,6 +197,26 @@ dm libraries validate <name> # Re-validate against current
196
197
  dm libraries usage <name> # Show rulesets using it
197
198
  ```
198
199
 
200
+ ### In-flight masking
201
+
202
+ The IFM service runs alongside the admin server,
203
+ reached at `<DataMasque URL>/ifm`.
204
+
205
+ ```console
206
+ dm ifm list # List ruleset plans
207
+ dm ifm get <name> # Show plan metadata
208
+ dm ifm get <name> --yaml # Print the ruleset YAML
209
+ dm ifm create --name myplan --file rules.yaml # Create (server suffixes a random string to the name)
210
+ dm ifm create --name myplan --file rules.yaml --disabled --log-level DEBUG
211
+ dm ifm update <name> --file rules.yaml # Replace the ruleset YAML
212
+ dm ifm update <name> --enabled # Toggle without re-sending the YAML
213
+ dm ifm update <name> --log-level INFO
214
+ dm ifm delete <name> --yes # Delete a plan
215
+ dm ifm mask <name> --data input.json # Mask a JSON list of records
216
+ dm ifm mask <name> --data - # Read records from stdin
217
+ dm ifm verify-token # Show scopes granted to the current IFM token
218
+ ```
219
+
199
220
  ### Masking runs
200
221
 
201
222
  ```console
@@ -257,6 +278,8 @@ dm system upload-licence ./licence.lic # Upload a licence file
257
278
  dm system logs -o logs.tar.gz # Download application logs
258
279
  dm system admin-install --email admin@co.com # Initial admin setup
259
280
  dm system set-locality AU # Set system locality
281
+ dm system ai-engine show # Show the configured AI Engine URL
282
+ dm system ai-engine set <URL> # Point DataMasque at an AI Engine
260
283
  ```
261
284
 
262
285
  ## JSON output
@@ -9,6 +9,7 @@ so teams can use production-shaped data in non-production environments without e
9
9
  DataMasque CLI `dm` covers:
10
10
 
11
11
  - connections, rulesets, ruleset libraries, and masking runs
12
+ - in-flight masking (IFM) ruleset plans and on-demand mask requests
12
13
  - schema discovery and sensitive-data discovery
13
14
  - users, files, and DataMasque instance administration
14
15
 
@@ -166,6 +167,26 @@ dm libraries validate <name> # Re-validate against current
166
167
  dm libraries usage <name> # Show rulesets using it
167
168
  ```
168
169
 
170
+ ### In-flight masking
171
+
172
+ The IFM service runs alongside the admin server,
173
+ reached at `<DataMasque URL>/ifm`.
174
+
175
+ ```console
176
+ dm ifm list # List ruleset plans
177
+ dm ifm get <name> # Show plan metadata
178
+ dm ifm get <name> --yaml # Print the ruleset YAML
179
+ dm ifm create --name myplan --file rules.yaml # Create (server suffixes a random string to the name)
180
+ dm ifm create --name myplan --file rules.yaml --disabled --log-level DEBUG
181
+ dm ifm update <name> --file rules.yaml # Replace the ruleset YAML
182
+ dm ifm update <name> --enabled # Toggle without re-sending the YAML
183
+ dm ifm update <name> --log-level INFO
184
+ dm ifm delete <name> --yes # Delete a plan
185
+ dm ifm mask <name> --data input.json # Mask a JSON list of records
186
+ dm ifm mask <name> --data - # Read records from stdin
187
+ dm ifm verify-token # Show scopes granted to the current IFM token
188
+ ```
189
+
169
190
  ### Masking runs
170
191
 
171
192
  ```console
@@ -227,6 +248,8 @@ dm system upload-licence ./licence.lic # Upload a licence file
227
248
  dm system logs -o logs.tar.gz # Download application logs
228
249
  dm system admin-install --email admin@co.com # Initial admin setup
229
250
  dm system set-locality AU # Set system locality
251
+ dm system ai-engine show # Show the configured AI Engine URL
252
+ dm system ai-engine set <URL> # Point DataMasque at an AI Engine
230
253
  ```
231
254
 
232
255
  ## JSON output
@@ -1,6 +1,6 @@
1
1
  ---
2
2
  name: datamasque-cli
3
- description: Use when the user wants to interact with a DataMasque instance — start masking runs, check run status, list connections or rulesets, manage seeds, manage ruleset libraries, check system health, or any task involving the DataMasque API. Triggers on "mask the data", "start a run", "check the run", "list connections", "list rulesets", "upload a seed", "check DataMasque health", "dm status", "ruleset library", or any request to operate DataMasque programmatically.
3
+ description: Use when the user wants to interact with a DataMasque instance — start masking runs, check run status, list connections or rulesets, manage seeds, manage ruleset libraries, check system health, configure the AI Engine, or any task involving the DataMasque API. Triggers on "mask the data", "start a run", "check the run", "list connections", "list rulesets", "upload a seed", "check DataMasque health", "dm status", "ruleset library", "configure the AI Engine", "set the AI Engine URL", or any request to operate DataMasque programmatically.
4
4
  argument-hint: e.g. "start a run with docx_masking on var_input_docx"
5
5
  user-invocable: true
6
6
  ---
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "ruleset-builder",
3
- "version": "1.0.0",
3
+ "version": "1.1.0",
4
4
  "description": "Convert auto-generated DataMasque rulesets into production-ready form. Validate and iterate.",
5
5
  "author": { "name": "DataMasque Ltd" },
6
6
  "repository": "https://github.com/datamasque/datamasque-cli",
@@ -0,0 +1,176 @@
1
+ ---
2
+ name: ruleset-builder
3
+ description: Use when the user wants to turn auto-generated DataMasque rulesets into production-ready ones — extract a `ruleset_library`, add `hash_columns`, refine a ruleset, or clean up generated YAML. Triggers on "ruleset builder", "build ruleset", "refine ruleset", "add hash columns", "add ruleset library", "production ruleset", "clean up ruleset".
4
+ argument-hint: e.g. "build a ruleset from these generated files"
5
+ user-invocable: true
6
+ ---
7
+
8
+ # Ruleset Builder
9
+
10
+ Transform auto-generated DataMasque rulesets into production-ready rulesets with three improvements:
11
+ 1. **`ruleset_library` references** — `$ref` links replacing every repeated inline mask
12
+ 2. **`hash_columns`** — on every applicable `mask_table` task for deterministic consistency
13
+ 3. **Clean structure** — `skip_defaults`, no doc blocks, validated
14
+
15
+ FK cascade is automatic: mask the parent PK with `imitate_unique` (or `imitate_uuid` / `imitate_nz_ird`) and the engine replicates the rule onto every FK column referencing it. **Do NOT add explicit rules for FK columns.** Avoid `from_unique_imitate` and `mask_unique_key` (both deprecated). Never skip IDs.
16
+
17
+ 5-step process (1–5). Use `TaskCreate` to track all 5; report after each step before proceeding. The prompt must include business domain and application type — ask if missing.
18
+
19
+ ---
20
+
21
+ ## Step 1: Report versions
22
+
23
+ Report the Ruleset Builder version (from `plugin.json`) and `dm version` so the operator can correlate output with releases.
24
+
25
+ ---
26
+
27
+ ## Step 2: Read reference docs
28
+
29
+ Canonical mask reference:
30
+ <https://portal.datamasque.com/portal/documentation/latest/masking-functions-overview.html>
31
+
32
+ Read all of these before any other work:
33
+ ```
34
+ ${CLAUDE_PLUGIN_ROOT}/skills/ruleset-builder/references/fk-cascade.md
35
+ ${CLAUDE_PLUGIN_ROOT}/skills/ruleset-builder/references/mask-definitions-guide.md
36
+ ${CLAUDE_PLUGIN_ROOT}/skills/ruleset-builder/references/hash-columns-guide.md
37
+ ${CLAUDE_PLUGIN_ROOT}/skills/ruleset-builder/references/ruleset-yaml-reference.md
38
+ ```
39
+
40
+ ---
41
+
42
+ ## Step 3: Extract ruleset_library
43
+
44
+ Write a Python script using `ruamel.yaml` (`uv pip install ruamel.yaml`).
45
+
46
+ Process the input YAML. For each `mask_table` task, replace every inline mask with a `$ref` to a rule in `ruleset_library.yaml`. Build the library progressively — read its current state at the start of each iteration, create it if absent.
47
+
48
+ The library `masks` section structure:
49
+ ```yaml
50
+ version: "1.0"
51
+ masks:
52
+ rule_name:
53
+ type: rule_type
54
+ ...params
55
+ ```
56
+
57
+ ### Classification rules (apply in order)
58
+
59
+ **1. ID columns** — any column ending in `_ID`, `_NO`, `_NR`, or `_NBR` is an entity identifier.
60
+ - **FK side: drop the rule entirely.** If an ID column is a foreign key (the table's `Foreign Keys` metadata in the discovery CSV has an entry for it), do NOT emit a rule for it. The engine cascades automatically from the parent PK rule. See `fk-cascade.md`.
61
+ - **PK side: use `imitate_unique` with `seed:`.** Strip adjective/verb prefixes before the noun: `PREVIOUS_`, `OLD_`, `TRANSFERRED_`, `PRIOR_`, `CURR_`, `NEW_`, `NEXT_`, `ALT_`, `PARENT_`, `CHILD_`, `SOURCE_`, `TARGET_`, `ORIG_`, `PENDING_`, `ARCHIVED_`, `DELETED_`. Extract the core entity (`PREVIOUS_INVOICE_ID` → `invoice`).
62
+ - Library entry name: `{entity}_id`. Reference it as `$ref: "Global/RuleLib#masks/{entity}_id"`.
63
+ - Library entry body: `type: imitate_unique`, `seed: "{entity}"`. The `seed` is optional but recommended: it namespaces by entity so unrelated IDs don't collide (e.g. `customer.id=42` doesn't mask to the same value as `product.id=42`). Doesn't affect FK cascade.
64
+ - This overrides whatever mask was originally generated (even `from_random_number`).
65
+
66
+ **2. Named patterns** — detect by mask structure:
67
+
68
+ | Pattern | Detection | Library rule |
69
+ |-----------------|-----------------------------------------------------------------------------------------------------------------------------------------------------|------------------------|
70
+ | Email | `chain(concat(concat(firstName+lastName, glue='.')+email_suffix)+transform_case(lower))` | `email_address` |
71
+ | Full name | `chain(concat(firstName+lastName, glue=' ')+take_substring)` OR plain `concat(firstName+lastName, glue=' ')` — column not containing USERNAME/LOGIN | `full_name` |
72
+ | Username | Same mask as full_name but column name contains USERNAME, USER_NAME, LOGIN, LOGON | `username` |
73
+ | First name only | `from_file` with firstNames seed | `name_first` |
74
+ | Last name only | `from_file` with lastNames seed | `name_last` |
75
+ | DOB | Column name contains DOB/BIRTH/DATE_OF_BIRTH — use `retain_age` regardless of original type | `dob` |
76
+ | Company | `chain(from_file(companies)+take_substring)` | `company_name` |
77
+ | Country name | `from_file(country_codes, seed_column=name)` | `country_name` |
78
+ | Country alpha-2 | `from_file(country_codes, seed_column=alpha_2)` | `country_code_2` |
79
+ | Country alpha-3 | `from_file(country_codes, seed_column=alpha_3)` | `country_code_3` |
80
+ | Phone/fax | `imitate` on column name containing PHONE, TEL, FAX, MOBILE, CELL | `phone` |
81
+ | Address line 1 | `from_file(addresses, seed_column=street_address)` on LINE_1/ADDRESS_LINE_1 columns | `address_line1` |
82
+ | Address line N | Same for LINE_2, LINE_3 etc. | `address_lineN` |
83
+ | Address full | `from_file(addresses, seed_column=street_address)` on non-line-numbered columns | `address_full` |
84
+ | Address expr | `concat(address+city+state+postcode, glue=', ')` | `network_address_expr` |
85
+ | City | `from_file(addresses, seed_column=city)` | `city` |
86
+ | Postcode | `from_file(addresses, seed_column=postcode)` | `post_code` |
87
+ | Suburb | `from_file(addresses, seed_column=suburb)` | `suburb` |
88
+ | Occupation | `from_file(occupations)` | `occupation` |
89
+
90
+ **3. Remaining** — group by column name concept. Where column names share a root (e.g., `RESULT3_VALUE`, `RESULT5_VALUE` → `result_value`; `GENERAL_2`, `GENERAL_6` → `general`), use one shared rule. Strip adjective prefixes. Use first occurrence's parameters.
91
+
92
+ - `imitate_unique` (non-ID cols) → `{col_group}: type: imitate_unique, seed: "{col_group}"` (seed recommended for namespacing; see ID columns section).
93
+ - `from_random_date` → `{col_group}: type: from_random_date, min/max from first occurrence`
94
+ - `from_random_number` → `{col_group}: type: from_random_number, min/max from first occurrence`
95
+ - String catch-all → `{col_group}: type: imitate_unique, seed: "{col_group}"` (use `imitate` only for types `imitate_unique` can't handle, e.g. datetime, bool).
96
+ - Complex chains → keep structure, group by column name
97
+
98
+ ### Output format
99
+
100
+ `Global/RuleLib` below is a placeholder for `<namespace>/<library_name>` — substitute the operator's real values, and create the library with `dm libraries create` before running the ruleset.
101
+
102
+ ```yaml
103
+ version: '1.0'
104
+ skip_defaults:
105
+ - ''
106
+ - null
107
+ imports:
108
+ - Global/RuleLib
109
+
110
+ tasks:
111
+ - type: mask_table
112
+ table: '"SCHEMA"."TABLE"'
113
+ key: '"ROWID"'
114
+ rules:
115
+ - column: '"FIRST_NAME"'
116
+ masks:
117
+ - $ref: "Global/RuleLib#masks/name_first"
118
+ ```
119
+
120
+ Do NOT write a custom YAML serializer. Use the `ruamel.yaml` round-trip dumper. Use `DoubleQuotedScalarString` for `$ref` values.
121
+
122
+ **Report:** "Step 3 done — extracted N rule library definitions: [list each name and usage count]."
123
+
124
+ ---
125
+
126
+ ## Step 4: Add hash_columns
127
+
128
+ Write a Python script that:
129
+
130
+ **Parse the discovery CSV** (comma-separated):
131
+ `Selected`, `Table schema`, `Table name`, `Column name`, `Data Type`, `Constraint`, `Foreign Keys`, `Max Length`, `Numeric Precision`, `Numeric Scale`, `Reason for flag`, `Flagged by`, `Data classifications`
132
+
133
+ Build a lookup of `(schema, table)` → columns with constraint and FK metadata:
134
+ - `Constraint` patterns: `Primary(COL)`, `Unique(COL)`, `Foreign(COL)`
135
+ - `Foreign Keys` JSON: `["FK_NAME", "SCHEMA.TABLE.COLUMN"]` — index 1 gives the referenced table
136
+
137
+ **For each `mask_table` task:**
138
+
139
+ 1. **Pick hash column** using this priority:
140
+ - **Parent-entity FK first**: find FK columns where the referenced table is the parent of the current table — i.e., the current table name *starts with* the referenced table name (e.g., `ACCOUNT_HISTORY` starts with `ACCOUNT` → use `ACCOUNT_ID`). This avoids choosing lookup-table FKs (e.g., don't choose `ACCOUNT_TYPE_ID` in `ACCOUNT` just because it has a FK).
141
+ - **PK fallback**: if no parent-entity FK found, use the Primary Key column (never `ROWID`)
142
+ - **Archive table fallback**: if no PK in the CSV (archive tables `_A`, `_A_R`, `_R` often lack explicit keys), strip the suffix and look up the base table recursively
143
+ - **Composite PKs**: prefer `*_ID` or `*_NO` columns; deduplicate derivatives (`ACCOUNT_ID` + `PREVIOUS_ACCOUNT_ID` → keep `ACCOUNT_ID`)
144
+ - **Skip** if no suitable column found
145
+
146
+ 2. Insert `hash_columns: ["COLUMN_NAME"]` after the `key:` field
147
+
148
+ 3. Verify all rules in output are `$ref` — fix any remaining inline masks
149
+
150
+ 4. Write to output file
151
+
152
+ **Report:** "Step 4 done — added hash_columns to N tables, skipped M (all-unique), skipped K (no suitable key). Top hash columns: [column → count]."
153
+
154
+ ---
155
+
156
+ ## Step 5: Validate and clean up
157
+
158
+ Remove any comment lines containing `ROWID`.
159
+
160
+ Run `dm rulesets validate --file <output_file> --type database`
161
+ (use `--type file` for file-masking rulesets).
162
+
163
+ Fix any errors and re-validate until passing.
164
+
165
+ ---
166
+
167
+ ## Summary
168
+
169
+ | Metric | Value |
170
+ |----------------------------|----------------|
171
+ | Total tables | N |
172
+ | Mask definitions extracted | N (list names) |
173
+ | Tables with hash_columns | N |
174
+ | Tables skipped (no key) | N |
175
+ | Validation | passed/failed |
176
+ | Output file | path |
@@ -0,0 +1,109 @@
1
+ # FK Cascade Invariant
2
+
3
+ This is the most important rule when refining a DataMasque ruleset that spans
4
+ related tables. Get this wrong and you either leak identity (by skipping
5
+ IDs entirely) or break the engine (by adding rules for FK columns).
6
+
7
+ ## The rule
8
+
9
+ **Mask only the parent PK column. The engine cascades the same masked value
10
+ to every FK column referencing it.**
11
+
12
+ Three masks support this cascade:
13
+
14
+ - `imitate_unique` — recommended for new work.
15
+ - `imitate_uuid` — for UUID-shaped IDs.
16
+ - `imitate_nz_ird` — for NZ IRD numbers.
17
+
18
+ (`from_unique_imitate` and `mask_unique_key` are deprecated; do not emit.)
19
+
20
+ When `mask_table` runs and a rule on a referenced column uses one of these
21
+ masks, the engine:
22
+
23
+ 1. Discovers child tables with FKs referencing this column.
24
+ 2. Auto-replicates the parent's rule onto every FK column.
25
+ 3. Same mask config → same masked output → joins survive.
26
+
27
+ This is documented at
28
+ <https://portal.datamasque.com/portal/documentation/latest/unique-masks.html>:
29
+
30
+ > "You can apply an `imitate_unique` mask to a primary key column or a
31
+ > column that is used as a foreign key in another table. References will be
32
+ > updated automatically. Composite primary keys are supported."
33
+
34
+ ## Worked example
35
+
36
+ Schema:
37
+ - `customers.id` (PK), `customers.email`
38
+ - `orders.id` (PK), `orders.customer_id` (FK → `customers.id`), `orders.tracking_number`
39
+
40
+ Correct ruleset:
41
+
42
+ ```yaml
43
+ - type: mask_table
44
+ table: customers
45
+ key: id
46
+ rules:
47
+ - column: id
48
+ masks:
49
+ - type: imitate_unique
50
+ seed: customer
51
+ - column: email
52
+ masks:
53
+ - type: from_file
54
+ seed_file: DataMasque_emails.csv
55
+ seed_column: email
56
+
57
+ - type: mask_table
58
+ table: orders
59
+ key: id
60
+ rules:
61
+ # customer_id is intentionally absent — the engine replicates the
62
+ # `customers.id` rule onto it automatically. Adding it here would
63
+ # be rejected by the runtime FK check.
64
+ - column: tracking_number
65
+ masks:
66
+ - type: imitate_unique
67
+ seed: tracking
68
+ ```
69
+
70
+ After the run, `orders.customer_id` holds the same masked values as
71
+ `customers.id`, joins remain intact, and `tracking_number` is independently
72
+ masked with its own seed.
73
+
74
+ ## Anti-patterns to refuse
75
+
76
+ - **Adding explicit FK rules** ("I'll mask both PK and FK with shared
77
+ `$ref` so the cascade works"). The runtime rejects this by default with
78
+ the error:
79
+ *"To preserve referential integrity, the following foreign key columns
80
+ cannot be directly masked by this task."*
81
+ The engine will replicate the rule for you; adding your own conflicts.
82
+ - **Skipping IDs to "preserve FK joins"**. Leaves identifiers in plain
83
+ sight. Mask the parent PK with `imitate_unique` — joins survive via
84
+ the auto-cascade.
85
+ - **Inventing linking parameters** (`source_table`, `source_column`,
86
+ `parent_column`, `link_to`). None of these exist on any DataMasque mask.
87
+ - **Inventing a hashing mask** (`hash_text`, `hash`, `link`, `match_id`).
88
+ None of these exist. `imitate_unique` is the deterministic mask.
89
+ - **Using `from_unique_imitate` or `mask_unique_key`**. Both deprecated.
90
+ `imitate_unique` replaces both.
91
+
92
+ ## Cross-run consistency requires `run_secret`
93
+
94
+ Within a single run, `imitate_unique` is deterministic via a per-run
95
+ `insecure_seed`. Across runs, masked values only stay consistent if the run is
96
+ invoked with a `run_secret`. Without it, the same input maps to a
97
+ different masked value next run. If cross-run consistency matters, flag
98
+ this in the final summary.
99
+
100
+ ## Self-check before finishing
101
+
102
+ For each FK relationship in the schema:
103
+
104
+ 1. Is the parent PK masked with `imitate_unique`, `imitate_uuid`, or
105
+ `imitate_nz_ird`?
106
+ 2. Is the FK column **absent** from your output (no explicit rule)?
107
+ 3. Are `from_unique_imitate` and `mask_unique_key` absent from your output?
108
+
109
+ If any answer is "no", fix it before validation.
@@ -71,14 +71,14 @@ hash_columns:
71
71
 
72
72
  Every table belongs to a domain entity. Find the column that identifies that entity:
73
73
 
74
- | Domain | Typical hash column | Examples |
75
- |--------|-------------------|----------|
76
- | Customer | `cust_id`, `customer_id`, `client_id` | CUST_MASTER, CUST_ADDRESS |
77
- | Account | `acc_id`, `account_id`, `account_no` | DEP_ACCOUNT, DEP_EMAIL_ALERT |
78
- | Card | `card_id`, `card_no` | CARD_MASTER, CARD_INSURANCE |
79
- | Loan | `loan_id`, `loan_no` | LOAN_COLLATERAL, LOAN_GUARANTOR |
80
- | Employee | `emp_id`, `emp_no`, `employee_id` | COM_EMPLOYEE, COM_EMP_ROLE |
81
- | Transaction | `tx_id`, `trf_id`, `fx_tx_id` | TRF_MASTER, FX_RECEIPT |
74
+ | Domain | Typical hash column | Examples |
75
+ |-------------|---------------------------------------|---------------------------------|
76
+ | Customer | `cust_id`, `customer_id`, `client_id` | CUST_MASTER, CUST_ADDRESS |
77
+ | Account | `acc_id`, `account_id`, `account_no` | DEP_ACCOUNT, DEP_EMAIL_ALERT |
78
+ | Card | `card_id`, `card_no` | CARD_MASTER, CARD_INSURANCE |
79
+ | Loan | `loan_id`, `loan_no` | LOAN_COLLATERAL, LOAN_GUARANTOR |
80
+ | Employee | `emp_id`, `emp_no`, `employee_id` | COM_EMPLOYEE, COM_EMP_ROLE |
81
+ | Transaction | `tx_id`, `trf_id`, `fx_tx_id` | TRF_MASTER, FX_RECEIPT |
82
82
 
83
83
  ### Step 2: Check foreign keys in the DDL
84
84
 
@@ -175,14 +175,14 @@ tasks:
175
175
 
176
176
  Common seed files for `from_file` masks:
177
177
 
178
- | Category | Files |
179
- |----------|-------|
180
- | Names | `DataMasque_firstNames_mixed.csv`, `DataMasque_lastNames_v2.csv` |
178
+ | Category | Files |
179
+ |-----------|-------------------------------------------------------------------------------------------------------|
180
+ | Names | `DataMasque_firstNames_mixed.csv`, `DataMasque_lastNames_v2.csv` |
181
181
  | Addresses | `DataMasque_US_addresses.csv`, `DataMasque_AU_addresses_real.csv`, `DataMasque_NZ_addresses_real.csv` |
182
- | Companies | `DataMasque_companies.csv`, `DataMasque_NZ_companies.csv`, `DataMasque_AU_companies.csv` |
183
- | Email | `DataMasque_fake_email_suffixes.csv`, `DataMasque_email_suffixes.csv` |
184
- | Reference | `DataMasque_country_codes.csv`, `DataMasque_occupations.csv` |
185
- | Cards | `DataMasque_credit_card_numbers.csv`, `DataMasque_credit_card_prefixes.csv` |
182
+ | Companies | `DataMasque_companies.csv`, `DataMasque_NZ_companies.csv`, `DataMasque_AU_companies.csv` |
183
+ | Email | `DataMasque_fake_email_suffixes.csv`, `DataMasque_email_suffixes.csv` |
184
+ | Reference | `DataMasque_country_codes.csv`, `DataMasque_occupations.csv` |
185
+ | Cards | `DataMasque_credit_card_numbers.csv`, `DataMasque_credit_card_prefixes.csv` |
186
186
 
187
187
  Regional variants exist for BR, IN, AU, NZ, US.
188
188
  Use `from_file` when there are more than ~50 distinct values;
@@ -143,13 +143,13 @@ tasks: [...]
143
143
 
144
144
  ## Libraries vs YAML Anchors
145
145
 
146
- | Feature | YAML Anchors (`&`/`*`) | Libraries (`$ref`) |
147
- |---------|----------------------|-------------------|
148
- | Scope | Within one ruleset | Across multiple rulesets |
149
- | Management | Inline in YAML | Managed via API/CLI, versioned |
150
- | Syntax | `<<: *anchor_name` | `$ref: "lib#path"` |
151
- | Override | `<<:` merge key | Not supported (use as-is) |
152
- | Best for | Single-ruleset reuse | Organisation-wide standards |
146
+ | Feature | YAML Anchors (`&`/`*`) | Libraries (`$ref`) |
147
+ |------------|------------------------|--------------------------------|
148
+ | Scope | Within one ruleset | Across multiple rulesets |
149
+ | Management | Inline in YAML | Managed via API/CLI, versioned |
150
+ | Syntax | `<<: *anchor_name` | `$ref: "lib#path"` |
151
+ | Override | `<<:` merge key | Not supported (use as-is) |
152
+ | Best for | Single-ruleset reuse | Organisation-wide standards |
153
153
 
154
154
  **Recommendation:**
155
155
  - Start with YAML anchors (`mask_definitions`) for within-ruleset deduplication
@@ -72,6 +72,16 @@ For PostgreSQL/MySQL, plain names work: `table: users`, `key: id`.
72
72
 
73
73
  ## Mask Types Quick Reference
74
74
 
75
+ This is the **closed list** of every `type:` value DataMasque accepts. Do
76
+ not invent mask types or parameters (no `source_table`, no `link_to`, no
77
+ `parent_column` — none exist). For per-mask parameter details, see the
78
+ canonical source:
79
+ <https://portal.datamasque.com/portal/documentation/latest/masking-functions-overview.html>.
80
+
81
+ For a deterministic hash, use `imitate_unique` (or `imitate_uuid` for UUIDs),
82
+ optionally with `seed:` to namespace. The cascade is automatic; no
83
+ cross-table reference parameter exists. See `fk-cascade.md`.
84
+
75
85
  ### Generic
76
86
  - `from_fixed` — fixed replacement value
77
87
  - `from_column` — copy from another column
@@ -124,6 +134,20 @@ For PostgreSQL/MySQL, plain names work: `table: users`, `key: id`.
124
134
  ### Document
125
135
  - `json` — mask JSON fields within a column
126
136
  - `xml` — mask XML elements within a column
137
+ - `unstructured_text` — mask entities inside free text
138
+
139
+ ### Commonly-hallucinated names that do NOT exist
140
+
141
+ These plausible-sounding names are not in DataMasque. Refuse to emit them:
142
+
143
+ | Hallucinated name | What was wanted | Use instead |
144
+ |-------------------------------------------------------------|-----------------------------------|---------------------------------------------------------------|
145
+ | `hash_text`, `hash` | deterministic hash of a value | `imitate_unique` (or `imitate_uuid` for UUIDs) |
146
+ | `link`, `match_id`, `link_to` | join two columns after masking | shared `imitate_unique` config on both sides (for declared FKs, mask only the parent PK — see `fk-cascade.md`) |
147
+ | `from_random_words` | random words / short text | `from_random_text` (random chars) or `from_file` |
148
+ | `from_random_string` | random string | `from_random_text` |
149
+ | `redact`, `mask_value` | constant placeholder | `from_fixed` with `value:` |
150
+ | `source_table`, `source_column`, `parent_column`, `link_to` | param to point a FK at its parent | does not exist — cascade is automatic with shared mask config |
127
151
 
128
152
  ## skip_defaults
129
153
 
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "datamasque-cli"
3
- version = "1.1.0"
3
+ version = "1.3.0"
4
4
  description = "Official command-line interface for the DataMasque data-masking platform."
5
5
  authors = [
6
6
  { name = "DataMasque Ltd" },