datamasque-cli 1.1.0__tar.gz → 1.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/CHANGELOG.md +42 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/PKG-INFO +24 -1
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/README.md +23 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/claude-skills/datamasque-cli/skills/datamasque-cli/SKILL.md +1 -1
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/claude-skills/ruleset-builder/.claude-plugin/plugin.json +1 -1
- datamasque_cli-1.3.0/claude-skills/ruleset-builder/skills/ruleset-builder/SKILL.md +176 -0
- datamasque_cli-1.3.0/claude-skills/ruleset-builder/skills/ruleset-builder/references/fk-cascade.md +109 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/claude-skills/ruleset-builder/skills/ruleset-builder/references/hash-columns-guide.md +8 -8
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/claude-skills/ruleset-builder/skills/ruleset-builder/references/mask-definitions-guide.md +7 -7
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/claude-skills/ruleset-builder/skills/ruleset-builder/references/ruleset-libraries-guide.md +7 -7
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/claude-skills/ruleset-builder/skills/ruleset-builder/references/ruleset-yaml-reference.md +24 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/pyproject.toml +1 -1
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/src/datamasque_cli/client.py +63 -25
- datamasque_cli-1.3.0/src/datamasque_cli/commands/ifm.py +354 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/src/datamasque_cli/commands/system.py +32 -1
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/src/datamasque_cli/main.py +2 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/src/datamasque_cli/output.py +26 -6
- datamasque_cli-1.3.0/tests/commands/test_ifm.py +577 -0
- datamasque_cli-1.3.0/tests/commands/test_system.py +82 -0
- datamasque_cli-1.3.0/tests/test_client_ifm.py +65 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/uv.lock +1 -1
- datamasque_cli-1.1.0/claude-skills/ruleset-builder/skills/ruleset-builder/SKILL.md +0 -175
- datamasque_cli-1.1.0/tests/commands/test_system.py +0 -38
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/.claude-plugin/marketplace.json +0 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/.github/workflows/ci.yml +0 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/.github/workflows/release-testpypi.yml +0 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/.github/workflows/release.yml +0 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/.gitignore +0 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/CONTRIBUTING.md +0 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/LICENSE +0 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/Makefile +0 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/NOTICE +0 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/assets/demo.gif +0 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/claude-skills/README.md +0 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/claude-skills/datamasque-cli/.claude-plugin/plugin.json +0 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/claude-skills/ruleset-splitter/.claude-plugin/plugin.json +0 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/claude-skills/ruleset-splitter/skills/ruleset-splitter/SKILL.md +0 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/scripts/active_profile_env.py +0 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/scripts/bump_version.py +0 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/src/datamasque_cli/__init__.py +0 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/src/datamasque_cli/commands/__init__.py +0 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/src/datamasque_cli/commands/auth.py +0 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/src/datamasque_cli/commands/connections.py +0 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/src/datamasque_cli/commands/discovery.py +0 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/src/datamasque_cli/commands/files.py +0 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/src/datamasque_cli/commands/ruleset_libraries.py +0 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/src/datamasque_cli/commands/rulesets.py +0 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/src/datamasque_cli/commands/runs.py +0 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/src/datamasque_cli/commands/seeds.py +0 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/src/datamasque_cli/commands/users.py +0 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/src/datamasque_cli/config.py +0 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/src/datamasque_cli/py.typed +0 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/tests/__init__.py +0 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/tests/commands/__init__.py +0 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/tests/commands/test_auth.py +0 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/tests/commands/test_catalog.py +0 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/tests/commands/test_connections.py +0 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/tests/commands/test_discovery.py +0 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/tests/commands/test_files.py +0 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/tests/commands/test_ruleset_libraries.py +0 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/tests/commands/test_rulesets.py +0 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/tests/commands/test_runs.py +0 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/tests/commands/test_seeds.py +0 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/tests/commands/test_users.py +0 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/tests/conftest.py +0 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/tests/integration/README.md +0 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/tests/integration/__init__.py +0 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/tests/integration/conftest.py +0 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/tests/integration/test_connections.py +0 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/tests/integration/test_delete_safety.py +0 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/tests/integration/test_rulesets.py +0 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/tests/integration/test_runs.py +0 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/tests/test_client_auth.py +0 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/tests/test_client_env.py +0 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/tests/test_client_profile.py +0 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/tests/test_config.py +0 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/tests/test_output.py +0 -0
- {datamasque_cli-1.1.0 → datamasque_cli-1.3.0}/tests/test_version.py +0 -0
|
@@ -1,5 +1,47 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## v1.3.0
|
|
4
|
+
|
|
5
|
+
### Added
|
|
6
|
+
- `dm system ai-engine show` and `dm system ai-engine set <URL>` — view and
|
|
7
|
+
configure the AI Engine URL.
|
|
8
|
+
|
|
9
|
+
## v1.2.0
|
|
10
|
+
|
|
11
|
+
### Added
|
|
12
|
+
- `dm ifm` command group
|
|
13
|
+
for managing in-flight masking ruleset plans
|
|
14
|
+
and running mask operations against the IFM service:
|
|
15
|
+
- `dm ifm list` —
|
|
16
|
+
list all IFM ruleset plans.
|
|
17
|
+
- `dm ifm get <name>` —
|
|
18
|
+
show plan metadata,
|
|
19
|
+
or the ruleset YAML with `--yaml`.
|
|
20
|
+
- `dm ifm create --name <name> --file <yaml>` —
|
|
21
|
+
create a plan from a YAML ruleset,
|
|
22
|
+
with optional `--enabled/--disabled` and `--log-level`.
|
|
23
|
+
- `dm ifm update <name>` —
|
|
24
|
+
update a plan;
|
|
25
|
+
pass any of `--file`, `--enabled/--disabled`, `--log-level`
|
|
26
|
+
and only those fields are sent.
|
|
27
|
+
- `dm ifm delete <name>` —
|
|
28
|
+
delete a plan
|
|
29
|
+
(interactive confirm,
|
|
30
|
+
or `--yes` to skip).
|
|
31
|
+
- `dm ifm mask <name> --data <file|->` —
|
|
32
|
+
mask a JSON list of records against a plan,
|
|
33
|
+
with `--disable-instance-secret`,
|
|
34
|
+
`--run-secret`,
|
|
35
|
+
`--log-level`,
|
|
36
|
+
`--request-id`,
|
|
37
|
+
and `--json/--no-json` (NDJSON) output.
|
|
38
|
+
- `dm ifm verify-token` —
|
|
39
|
+
verify the current IFM token and list its scopes.
|
|
40
|
+
|
|
41
|
+
Authentication reuses your existing `dm` profile credentials
|
|
42
|
+
via the SDK's `DataMasqueIfmClient`,
|
|
43
|
+
which transparently exchanges admin-server credentials for an IFM JWT.
|
|
44
|
+
|
|
3
45
|
## v1.1.0
|
|
4
46
|
|
|
5
47
|
### Added
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: datamasque-cli
|
|
3
|
-
Version: 1.1.0
|
|
3
|
+
Version: 1.3.0
|
|
4
4
|
Summary: Official command-line interface for the DataMasque data-masking platform.
|
|
5
5
|
Project-URL: Homepage, https://datamasque.com/
|
|
6
6
|
Project-URL: Repository, https://github.com/datamasque/datamasque-cli
|
|
@@ -39,6 +39,7 @@ so teams can use production-shaped data in non-production environments without e
|
|
|
39
39
|
DataMasque CLI `dm` covers:
|
|
40
40
|
|
|
41
41
|
- connections, rulesets, ruleset libraries, and masking runs
|
|
42
|
+
- in-flight masking (IFM) ruleset plans and on-demand mask requests
|
|
42
43
|
- schema discovery and sensitive-data discovery
|
|
43
44
|
- users, files, and DataMasque instance administration
|
|
44
45
|
|
|
@@ -196,6 +197,26 @@ dm libraries validate <name> # Re-validate against current
|
|
|
196
197
|
dm libraries usage <name> # Show rulesets using it
|
|
197
198
|
```
|
|
198
199
|
|
|
200
|
+
### In-flight masking
|
|
201
|
+
|
|
202
|
+
The IFM service runs alongside the admin server,
|
|
203
|
+
reached at `<DataMasque URL>/ifm`.
|
|
204
|
+
|
|
205
|
+
```console
|
|
206
|
+
dm ifm list # List ruleset plans
|
|
207
|
+
dm ifm get <name> # Show plan metadata
|
|
208
|
+
dm ifm get <name> --yaml # Print the ruleset YAML
|
|
209
|
+
dm ifm create --name myplan --file rules.yaml # Create (server appends a random suffix to the name)
|
|
210
|
+
dm ifm create --name myplan --file rules.yaml --disabled --log-level DEBUG
|
|
211
|
+
dm ifm update <name> --file rules.yaml # Replace the ruleset YAML
|
|
212
|
+
dm ifm update <name> --enabled # Toggle without re-sending the YAML
|
|
213
|
+
dm ifm update <name> --log-level INFO
|
|
214
|
+
dm ifm delete <name> --yes # Delete a plan
|
|
215
|
+
dm ifm mask <name> --data input.json # Mask a JSON list of records
|
|
216
|
+
dm ifm mask <name> --data - # Read records from stdin
|
|
217
|
+
dm ifm verify-token # Show scopes granted to the current IFM token
|
|
218
|
+
```
|
|
219
|
+
|
|
199
220
|
### Masking runs
|
|
200
221
|
|
|
201
222
|
```console
|
|
@@ -257,6 +278,8 @@ dm system upload-licence ./licence.lic # Upload a licence file
|
|
|
257
278
|
dm system logs -o logs.tar.gz # Download application logs
|
|
258
279
|
dm system admin-install --email admin@co.com # Initial admin setup
|
|
259
280
|
dm system set-locality AU # Set system locality
|
|
281
|
+
dm system ai-engine show # Show the configured AI Engine URL
|
|
282
|
+
dm system ai-engine set <URL> # Point DataMasque at an AI Engine
|
|
260
283
|
```
|
|
261
284
|
|
|
262
285
|
## JSON output
|
|
@@ -9,6 +9,7 @@ so teams can use production-shaped data in non-production environments without e
|
|
|
9
9
|
DataMasque CLI `dm` covers:
|
|
10
10
|
|
|
11
11
|
- connections, rulesets, ruleset libraries, and masking runs
|
|
12
|
+
- in-flight masking (IFM) ruleset plans and on-demand mask requests
|
|
12
13
|
- schema discovery and sensitive-data discovery
|
|
13
14
|
- users, files, and DataMasque instance administration
|
|
14
15
|
|
|
@@ -166,6 +167,26 @@ dm libraries validate <name> # Re-validate against current
|
|
|
166
167
|
dm libraries usage <name> # Show rulesets using it
|
|
167
168
|
```
|
|
168
169
|
|
|
170
|
+
### In-flight masking
|
|
171
|
+
|
|
172
|
+
The IFM service runs alongside the admin server,
|
|
173
|
+
reached at `<DataMasque URL>/ifm`.
|
|
174
|
+
|
|
175
|
+
```console
|
|
176
|
+
dm ifm list # List ruleset plans
|
|
177
|
+
dm ifm get <name> # Show plan metadata
|
|
178
|
+
dm ifm get <name> --yaml # Print the ruleset YAML
|
|
179
|
+
dm ifm create --name myplan --file rules.yaml # Create (server appends a random suffix to the name)
|
|
180
|
+
dm ifm create --name myplan --file rules.yaml --disabled --log-level DEBUG
|
|
181
|
+
dm ifm update <name> --file rules.yaml # Replace the ruleset YAML
|
|
182
|
+
dm ifm update <name> --enabled # Toggle without re-sending the YAML
|
|
183
|
+
dm ifm update <name> --log-level INFO
|
|
184
|
+
dm ifm delete <name> --yes # Delete a plan
|
|
185
|
+
dm ifm mask <name> --data input.json # Mask a JSON list of records
|
|
186
|
+
dm ifm mask <name> --data - # Read records from stdin
|
|
187
|
+
dm ifm verify-token # Show scopes granted to the current IFM token
|
|
188
|
+
```
|
|
189
|
+
|
|
169
190
|
### Masking runs
|
|
170
191
|
|
|
171
192
|
```console
|
|
@@ -227,6 +248,8 @@ dm system upload-licence ./licence.lic # Upload a licence file
|
|
|
227
248
|
dm system logs -o logs.tar.gz # Download application logs
|
|
228
249
|
dm system admin-install --email admin@co.com # Initial admin setup
|
|
229
250
|
dm system set-locality AU # Set system locality
|
|
251
|
+
dm system ai-engine show # Show the configured AI Engine URL
|
|
252
|
+
dm system ai-engine set <URL> # Point DataMasque at an AI Engine
|
|
230
253
|
```
|
|
231
254
|
|
|
232
255
|
## JSON output
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: datamasque-cli
|
|
3
|
-
description: Use when the user wants to interact with a DataMasque instance — start masking runs, check run status, list connections or rulesets, manage seeds, manage ruleset libraries, check system health, or any task involving the DataMasque API. Triggers on "mask the data", "start a run", "check the run", "list connections", "list rulesets", "upload a seed", "check DataMasque health", "dm status", "ruleset library", or any request to operate DataMasque programmatically.
|
|
3
|
+
description: Use when the user wants to interact with a DataMasque instance — start masking runs, check run status, list connections or rulesets, manage seeds, manage ruleset libraries, check system health, configure the AI Engine, or any task involving the DataMasque API. Triggers on "mask the data", "start a run", "check the run", "list connections", "list rulesets", "upload a seed", "check DataMasque health", "dm status", "ruleset library", "configure the AI Engine", "set the AI Engine URL", or any request to operate DataMasque programmatically.
|
|
4
4
|
argument-hint: e.g. "start a run with docx_masking on var_input_docx"
|
|
5
5
|
user-invocable: true
|
|
6
6
|
---
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "ruleset-builder",
|
|
3
|
-
"version": "1.
|
|
3
|
+
"version": "1.1.0",
|
|
4
4
|
"description": "Convert auto-generated DataMasque rulesets into production-ready form. Validate and iterate.",
|
|
5
5
|
"author": { "name": "DataMasque Ltd" },
|
|
6
6
|
"repository": "https://github.com/datamasque/datamasque-cli",
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: ruleset-builder
|
|
3
|
+
description: Use when the user wants to turn auto-generated DataMasque rulesets into production-ready ones — extract a `ruleset_library`, add `hash_columns`, refine a ruleset, or clean up generated YAML. Triggers on "ruleset builder", "build ruleset", "refine ruleset", "add hash columns", "add ruleset library", "production ruleset", "clean up ruleset".
|
|
4
|
+
argument-hint: e.g. "build a ruleset from these generated files"
|
|
5
|
+
user-invocable: true
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
# Ruleset Builder
|
|
9
|
+
|
|
10
|
+
Transform auto-generated DataMasque rulesets into production-ready rulesets with three improvements:
|
|
11
|
+
1. **`ruleset_library` references** — `$ref` links replacing every repeated inline mask
|
|
12
|
+
2. **`hash_columns`** — on every applicable `mask_table` task for deterministic consistency
|
|
13
|
+
3. **Clean structure** — `skip_defaults`, no doc blocks, validated
|
|
14
|
+
|
|
15
|
+
FK cascade is automatic: mask the parent PK with `imitate_unique` (or `imitate_uuid` / `imitate_nz_ird`) and the engine replicates the rule onto every FK column referencing it. **Do NOT add explicit rules for FK columns.** Avoid `from_unique_imitate` and `mask_unique_key` (both deprecated). Never skip IDs.
|
|
16
|
+
|
|
17
|
+
5-step process (1–5). Use `TaskCreate` to track all 5; report after each step before proceeding. The prompt must include business domain and application type — ask if missing.
|
|
18
|
+
|
|
19
|
+
---
|
|
20
|
+
|
|
21
|
+
## Step 1: Report versions
|
|
22
|
+
|
|
23
|
+
Report the Ruleset Builder version (from `plugin.json`) and `dm version` so the operator can correlate output with releases.
|
|
24
|
+
|
|
25
|
+
---
|
|
26
|
+
|
|
27
|
+
## Step 2: Read reference docs
|
|
28
|
+
|
|
29
|
+
Canonical mask reference:
|
|
30
|
+
<https://portal.datamasque.com/portal/documentation/latest/masking-functions-overview.html>
|
|
31
|
+
|
|
32
|
+
Read all of these before any other work:
|
|
33
|
+
```
|
|
34
|
+
${CLAUDE_PLUGIN_ROOT}/skills/ruleset-builder/references/fk-cascade.md
|
|
35
|
+
${CLAUDE_PLUGIN_ROOT}/skills/ruleset-builder/references/mask-definitions-guide.md
|
|
36
|
+
${CLAUDE_PLUGIN_ROOT}/skills/ruleset-builder/references/hash-columns-guide.md
|
|
37
|
+
${CLAUDE_PLUGIN_ROOT}/skills/ruleset-builder/references/ruleset-yaml-reference.md
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
---
|
|
41
|
+
|
|
42
|
+
## Step 3: Extract ruleset_library
|
|
43
|
+
|
|
44
|
+
Write a Python script using `ruamel.yaml` (`uv pip install ruamel.yaml`).
|
|
45
|
+
|
|
46
|
+
Process the input YAML. For each `mask_table` task, replace every inline mask with a `$ref` to a rule in `ruleset_library.yaml`. Build the library progressively — read its current state at the start of each iteration, create it if absent.
|
|
47
|
+
|
|
48
|
+
The library `masks` section structure:
|
|
49
|
+
```yaml
|
|
50
|
+
version: "1.0"
|
|
51
|
+
masks:
|
|
52
|
+
rule_name:
|
|
53
|
+
type: rule_type
|
|
54
|
+
...params
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
### Classification rules (apply in order)
|
|
58
|
+
|
|
59
|
+
**1. ID columns** — any column ending in `_ID`, `_NO`, `_NR`, `_NBR` is an entity identifier.
|
|
60
|
+
- **FK side: drop the rule entirely.** If an ID column is a foreign key (the table's `Foreign Keys` metadata in the discovery CSV has an entry for it), do NOT emit a rule for it. The engine cascades automatically from the parent PK rule. See `fk-cascade.md`.
|
|
61
|
+
- **PK side: use `imitate_unique` with `seed:`.** Strip adjective/verb prefixes before the noun: `PREVIOUS_`, `OLD_`, `TRANSFERRED_`, `PRIOR_`, `CURR_`, `NEW_`, `NEXT_`, `ALT_`, `PARENT_`, `CHILD_`, `SOURCE_`, `TARGET_`, `ORIG_`, `PENDING_`, `ARCHIVED_`, `DELETED_`. Extract the core entity (`PREVIOUS_INVOICE_ID` → `invoice`).
|
|
62
|
+
- Library entry name: `{entity}_id`. Reference it as `$ref: "Global/RuleLib#masks/{entity}_id"`.
|
|
63
|
+
- Library entry body: `type: imitate_unique`, `seed: "{entity}"`. The `seed` is optional but recommended: it namespaces by entity so unrelated IDs don't collide (e.g. `customer.id=42` doesn't mask to the same value as `product.id=42`). The `seed` does not affect FK cascade.
|
|
64
|
+
- This overrides whatever mask was originally generated (even `from_random_number`).
|
|
65
|
+
|
|
66
|
+
**2. Named patterns** — detect by mask structure:
|
|
67
|
+
|
|
68
|
+
| Pattern | Detection | Library rule |
|
|
69
|
+
|-----------------|-----------------------------------------------------------------------------------------------------------------------------------------------------|------------------------|
|
|
70
|
+
| Email | `chain(concat(concat(firstName+lastName, glue='.')+email_suffix)+transform_case(lower))` | `email_address` |
|
|
71
|
+
| Full name | `chain(concat(firstName+lastName, glue=' ')+take_substring)` OR plain `concat(firstName+lastName, glue=' ')` — column not containing USERNAME/LOGIN | `full_name` |
|
|
72
|
+
| Username | Same mask as full_name but column name contains USERNAME, USER_NAME, LOGIN, LOGON | `username` |
|
|
73
|
+
| First name only | `from_file` with firstNames seed | `name_first` |
|
|
74
|
+
| Last name only | `from_file` with lastNames seed | `name_last` |
|
|
75
|
+
| DOB | Column name contains DOB/BIRTH/DATE_OF_BIRTH — use `retain_age` regardless of original type | `dob` |
|
|
76
|
+
| Company | `chain(from_file(companies)+take_substring)` | `company_name` |
|
|
77
|
+
| Country name | `from_file(country_codes, seed_column=name)` | `country_name` |
|
|
78
|
+
| Country alpha-2 | `from_file(country_codes, seed_column=alpha_2)` | `country_code_2` |
|
|
79
|
+
| Country alpha-3 | `from_file(country_codes, seed_column=alpha_3)` | `country_code_3` |
|
|
80
|
+
| Phone/fax | `imitate` on column name containing PHONE, TEL, FAX, MOBILE, CELL | `phone` |
|
|
81
|
+
| Address line 1 | `from_file(addresses, seed_column=street_address)` on LINE_1/ADDRESS_LINE_1 columns | `address_line1` |
|
|
82
|
+
| Address line N | Same for LINE_2, LINE_3 etc. | `address_lineN` |
|
|
83
|
+
| Address full | `from_file(addresses, seed_column=street_address)` on non-line-numbered columns | `address_full` |
|
|
84
|
+
| Address expr | `concat(address+city+state+postcode, glue=', ')` | `network_address_expr` |
|
|
85
|
+
| City | `from_file(addresses, seed_column=city)` | `city` |
|
|
86
|
+
| Postcode | `from_file(addresses, seed_column=postcode)` | `post_code` |
|
|
87
|
+
| Suburb | `from_file(addresses, seed_column=suburb)` | `suburb` |
|
|
88
|
+
| Occupation | `from_file(occupations)` | `occupation` |
|
|
89
|
+
|
|
90
|
+
**3. Remaining** — group by column name concept. Where column names share a root (e.g., `RESULT3_VALUE`, `RESULT5_VALUE` → `result_value`; `GENERAL_2`, `GENERAL_6` → `general`), use one shared rule. Strip adjective prefixes. Use the first occurrence's parameters.
|
|
91
|
+
|
|
92
|
+
- `imitate_unique` (non-ID cols) → `{col_group}: type: imitate_unique, seed: "{col_group}"` (seed recommended for namespacing; see ID columns section).
|
|
93
|
+
- `from_random_date` → `{col_group}: type: from_random_date, min/max from first occurrence`
|
|
94
|
+
- `from_random_number` → `{col_group}: type: from_random_number, min/max from first occurrence`
|
|
95
|
+
- String catch-all → `{col_group}: type: imitate_unique, seed: "{col_group}"` (use `imitate` only for types `imitate_unique` can't handle, e.g. datetime, bool).
|
|
96
|
+
- Complex chains → keep structure, group by column name
|
|
97
|
+
|
|
98
|
+
### Output format
|
|
99
|
+
|
|
100
|
+
`Global/RuleLib` below is a placeholder for `<namespace>/<library_name>` — substitute the operator's real values, and create the library with `dm libraries create` before running the ruleset.
|
|
101
|
+
|
|
102
|
+
```yaml
|
|
103
|
+
version: '1.0'
|
|
104
|
+
skip_defaults:
|
|
105
|
+
- ''
|
|
106
|
+
- null
|
|
107
|
+
imports:
|
|
108
|
+
- Global/RuleLib
|
|
109
|
+
|
|
110
|
+
tasks:
|
|
111
|
+
- type: mask_table
|
|
112
|
+
table: '"SCHEMA"."TABLE"'
|
|
113
|
+
key: '"ROWID"'
|
|
114
|
+
rules:
|
|
115
|
+
- column: '"FIRST_NAME"'
|
|
116
|
+
masks:
|
|
117
|
+
- $ref: "Global/RuleLib#masks/name_first"
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
Do NOT write a custom YAML serializer. Use the `ruamel.yaml` round-trip dumper. Use `DoubleQuotedScalarString` for `$ref` values.
|
|
121
|
+
|
|
122
|
+
**Report:** "Step 3 done — extracted N rule library definitions: [list each name and usage count]."
|
|
123
|
+
|
|
124
|
+
---
|
|
125
|
+
|
|
126
|
+
## Step 4: Add hash_columns
|
|
127
|
+
|
|
128
|
+
Write a Python script that:
|
|
129
|
+
|
|
130
|
+
**Parse the discovery CSV** (comma-separated):
|
|
131
|
+
`Selected`, `Table schema`, `Table name`, `Column name`, `Data Type`, `Constraint`, `Foreign Keys`, `Max Length`, `Numeric Precision`, `Numeric Scale`, `Reason for flag`, `Flagged by`, `Data classifications`
|
|
132
|
+
|
|
133
|
+
Build a lookup of `(schema, table)` → columns with constraint and FK metadata:
|
|
134
|
+
- `Constraint` patterns: `Primary(COL)`, `Unique(COL)`, `Foreign(COL)`
|
|
135
|
+
- `Foreign Keys` JSON: `["FK_NAME", "SCHEMA.TABLE.COLUMN"]` — index 1 gives the referenced column as `SCHEMA.TABLE.COLUMN`; the referenced table is its `SCHEMA.TABLE` prefix
|
|
136
|
+
|
|
137
|
+
**For each `mask_table` task:**
|
|
138
|
+
|
|
139
|
+
1. **Pick hash column** using this priority:
|
|
140
|
+
- **Parent-entity FK first**: find FK columns where the referenced table is the parent of the current table — i.e., the current table name *starts with* the referenced table name (e.g., `ACCOUNT_HISTORY` starts with `ACCOUNT` → use `ACCOUNT_ID`). This avoids choosing lookup-table FKs (e.g., don't choose `ACCOUNT_TYPE_ID` in `ACCOUNT` just because it has a FK).
|
|
141
|
+
- **PK fallback**: if no parent-entity FK found, use the Primary Key column (never `ROWID`)
|
|
142
|
+
- **Archive table fallback**: if no PK in the CSV (archive tables `_A`, `_A_R`, `_R` often lack explicit keys), strip the suffix and look up the base table recursively
|
|
143
|
+
- **Composite PKs**: prefer `*_ID` or `*_NO` columns; deduplicate derivatives (`ACCOUNT_ID` + `PREVIOUS_ACCOUNT_ID` → keep `ACCOUNT_ID`)
|
|
144
|
+
- **Skip** if no suitable column found
|
|
145
|
+
|
|
146
|
+
2. Insert `hash_columns: ["COLUMN_NAME"]` after the `key:` field
|
|
147
|
+
|
|
148
|
+
3. Verify all rules in output are `$ref` — fix any remaining inline masks
|
|
149
|
+
|
|
150
|
+
4. Write to output file
|
|
151
|
+
|
|
152
|
+
**Report:** "Step 4 done — added hash_columns to N tables, skipped M (all-unique), skipped K (no suitable key). Top hash columns: [column → count]."
|
|
153
|
+
|
|
154
|
+
---
|
|
155
|
+
|
|
156
|
+
## Step 5: Validate and clean up
|
|
157
|
+
|
|
158
|
+
Remove any comment lines containing `ROWID`.
|
|
159
|
+
|
|
160
|
+
Run `dm rulesets validate --file <output_file> --type database`
|
|
161
|
+
(use `--type file` for file-masking rulesets).
|
|
162
|
+
|
|
163
|
+
Fix any errors and re-validate until passing.
|
|
164
|
+
|
|
165
|
+
---
|
|
166
|
+
|
|
167
|
+
## Summary
|
|
168
|
+
|
|
169
|
+
| Metric | Value |
|
|
170
|
+
|----------------------------|----------------|
|
|
171
|
+
| Total tables | N |
|
|
172
|
+
| Mask definitions extracted | N (list names) |
|
|
173
|
+
| Tables with hash_columns | N |
|
|
174
|
+
| Tables skipped (no key) | N |
|
|
175
|
+
| Validation | passed/failed |
|
|
176
|
+
| Output file | path |
|
datamasque_cli-1.3.0/claude-skills/ruleset-builder/skills/ruleset-builder/references/fk-cascade.md
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
# FK Cascade Invariant
|
|
2
|
+
|
|
3
|
+
The most important rule when refining a DataMasque ruleset that spans
|
|
4
|
+
related tables. Get this wrong and you either leak identity (by skipping
|
|
5
|
+
IDs entirely) or break the engine (by adding rules for FK columns).
|
|
6
|
+
|
|
7
|
+
## The rule
|
|
8
|
+
|
|
9
|
+
**Mask only the parent PK column. The engine cascades the same masked value
|
|
10
|
+
to every FK column referencing it.**
|
|
11
|
+
|
|
12
|
+
Three masks support this cascade:
|
|
13
|
+
|
|
14
|
+
- `imitate_unique` — recommended for new work.
|
|
15
|
+
- `imitate_uuid` — for UUID-shaped IDs.
|
|
16
|
+
- `imitate_nz_ird` — for NZ IRD numbers.
|
|
17
|
+
|
|
18
|
+
(`from_unique_imitate` and `mask_unique_key` are deprecated; do not emit.)
|
|
19
|
+
|
|
20
|
+
When `mask_table` runs and a rule on a referenced column uses one of these
|
|
21
|
+
masks, the engine:
|
|
22
|
+
|
|
23
|
+
1. Discovers child tables with FKs referencing this column.
|
|
24
|
+
2. Auto-replicates the parent's rule onto every FK column.
|
|
25
|
+
3. Applies the same mask config → same masked output → joins survive.
|
|
26
|
+
|
|
27
|
+
This is documented at
|
|
28
|
+
<https://portal.datamasque.com/portal/documentation/latest/unique-masks.html>:
|
|
29
|
+
|
|
30
|
+
> "You can apply an `imitate_unique` mask to a primary key column or a
|
|
31
|
+
> column that is used as a foreign key in another table. References will be
|
|
32
|
+
> updated automatically. Composite primary keys are supported."
|
|
33
|
+
|
|
34
|
+
## Worked example
|
|
35
|
+
|
|
36
|
+
Schema:
|
|
37
|
+
- `customers.id` (PK), `customers.email`
|
|
38
|
+
- `orders.id` (PK), `orders.customer_id` (FK → `customers.id`), `orders.tracking_number`
|
|
39
|
+
|
|
40
|
+
Correct ruleset:
|
|
41
|
+
|
|
42
|
+
```yaml
|
|
43
|
+
- type: mask_table
|
|
44
|
+
table: customers
|
|
45
|
+
key: id
|
|
46
|
+
rules:
|
|
47
|
+
- column: id
|
|
48
|
+
masks:
|
|
49
|
+
- type: imitate_unique
|
|
50
|
+
seed: customer
|
|
51
|
+
- column: email
|
|
52
|
+
masks:
|
|
53
|
+
- type: from_file
|
|
54
|
+
seed_file: DataMasque_emails.csv
|
|
55
|
+
seed_column: email
|
|
56
|
+
|
|
57
|
+
- type: mask_table
|
|
58
|
+
table: orders
|
|
59
|
+
key: id
|
|
60
|
+
rules:
|
|
61
|
+
# customer_id is intentionally absent — the engine replicates the
|
|
62
|
+
# `customers.id` rule onto it automatically. Adding it here would
|
|
63
|
+
# be rejected by the runtime FK check.
|
|
64
|
+
- column: tracking_number
|
|
65
|
+
masks:
|
|
66
|
+
- type: imitate_unique
|
|
67
|
+
seed: tracking
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
After the run, `orders.customer_id` holds the same masked values as
|
|
71
|
+
`customers.id`, joins remain intact, and `tracking_number` is independently
|
|
72
|
+
masked with its own seed.
|
|
73
|
+
|
|
74
|
+
## Anti-patterns to refuse
|
|
75
|
+
|
|
76
|
+
- **Adding explicit FK rules** ("I'll mask both PK and FK with shared
|
|
77
|
+
`$ref` so the cascade works"). The runtime rejects this by default with
|
|
78
|
+
the error:
|
|
79
|
+
*"To preserve referential integrity, the following foreign key columns
|
|
80
|
+
cannot be directly masked by this task."*
|
|
81
|
+
The engine will replicate the rule for you; adding your own conflicts.
|
|
82
|
+
- **Skipping IDs to "preserve FK joins"**. Leaves identifiers in plain
|
|
83
|
+
sight. Mask the parent PK with `imitate_unique` — joins survive via
|
|
84
|
+
the auto-cascade.
|
|
85
|
+
- **Inventing linking parameters** (`source_table`, `source_column`,
|
|
86
|
+
`parent_column`, `link_to`). None of these exist on any DataMasque mask.
|
|
87
|
+
- **Inventing a hashing mask** (`hash_text`, `hash`, `link`, `match_id`).
|
|
88
|
+
None of these exist. `imitate_unique` is the deterministic mask.
|
|
89
|
+
- **Using `from_unique_imitate` or `mask_unique_key`**. Both deprecated.
|
|
90
|
+
`imitate_unique` replaces both.
|
|
91
|
+
|
|
92
|
+
## Cross-run consistency requires `run_secret`
|
|
93
|
+
|
|
94
|
+
Within a single run, `imitate_unique` is deterministic via a per-run
|
|
95
|
+
`insecure_seed`. Across runs, the cascade only holds if the run is
|
|
96
|
+
invoked with a `run_secret`. Without it, the same input maps to a
|
|
97
|
+
different masked value next run. If cross-run consistency matters, flag
|
|
98
|
+
this in the final summary.
|
|
99
|
+
|
|
100
|
+
## Self-check before finishing
|
|
101
|
+
|
|
102
|
+
For each FK relationship in the schema:
|
|
103
|
+
|
|
104
|
+
1. Is the parent PK masked with `imitate_unique`, `imitate_uuid`, or
|
|
105
|
+
`imitate_nz_ird`?
|
|
106
|
+
2. Is the FK column **absent** from your output (no explicit rule)?
|
|
107
|
+
3. Are `from_unique_imitate` and `mask_unique_key` absent from your output?
|
|
108
|
+
|
|
109
|
+
If any answer is "no", fix it before validation.
|
|
@@ -71,14 +71,14 @@ hash_columns:
|
|
|
71
71
|
|
|
72
72
|
Every table belongs to a domain entity. Find the column that identifies that entity:
|
|
73
73
|
|
|
74
|
-
| Domain
|
|
75
|
-
|
|
76
|
-
| Customer
|
|
77
|
-
| Account
|
|
78
|
-
| Card
|
|
79
|
-
| Loan
|
|
80
|
-
| Employee
|
|
81
|
-
| Transaction | `tx_id`, `trf_id`, `fx_tx_id`
|
|
74
|
+
| Domain | Typical hash column | Examples |
|
|
75
|
+
|-------------|---------------------------------------|---------------------------------|
|
|
76
|
+
| Customer | `cust_id`, `customer_id`, `client_id` | CUST_MASTER, CUST_ADDRESS |
|
|
77
|
+
| Account | `acc_id`, `account_id`, `account_no` | DEP_ACCOUNT, DEP_EMAIL_ALERT |
|
|
78
|
+
| Card | `card_id`, `card_no` | CARD_MASTER, CARD_INSURANCE |
|
|
79
|
+
| Loan | `loan_id`, `loan_no` | LOAN_COLLATERAL, LOAN_GUARANTOR |
|
|
80
|
+
| Employee | `emp_id`, `emp_no`, `employee_id` | COM_EMPLOYEE, COM_EMP_ROLE |
|
|
81
|
+
| Transaction | `tx_id`, `trf_id`, `fx_tx_id` | TRF_MASTER, FX_RECEIPT |
|
|
82
82
|
|
|
83
83
|
### Step 2: Check foreign keys in the DDL
|
|
84
84
|
|
|
@@ -175,14 +175,14 @@ tasks:
|
|
|
175
175
|
|
|
176
176
|
Common seed files for `from_file` masks:
|
|
177
177
|
|
|
178
|
-
| Category
|
|
179
|
-
|
|
180
|
-
| Names
|
|
178
|
+
| Category | Files |
|
|
179
|
+
|-----------|-------------------------------------------------------------------------------------------------------|
|
|
180
|
+
| Names | `DataMasque_firstNames_mixed.csv`, `DataMasque_lastNames_v2.csv` |
|
|
181
181
|
| Addresses | `DataMasque_US_addresses.csv`, `DataMasque_AU_addresses_real.csv`, `DataMasque_NZ_addresses_real.csv` |
|
|
182
|
-
| Companies | `DataMasque_companies.csv`, `DataMasque_NZ_companies.csv`, `DataMasque_AU_companies.csv`
|
|
183
|
-
| Email
|
|
184
|
-
| Reference | `DataMasque_country_codes.csv`, `DataMasque_occupations.csv`
|
|
185
|
-
| Cards
|
|
182
|
+
| Companies | `DataMasque_companies.csv`, `DataMasque_NZ_companies.csv`, `DataMasque_AU_companies.csv` |
|
|
183
|
+
| Email | `DataMasque_fake_email_suffixes.csv`, `DataMasque_email_suffixes.csv` |
|
|
184
|
+
| Reference | `DataMasque_country_codes.csv`, `DataMasque_occupations.csv` |
|
|
185
|
+
| Cards | `DataMasque_credit_card_numbers.csv`, `DataMasque_credit_card_prefixes.csv` |
|
|
186
186
|
|
|
187
187
|
Regional variants exist for BR, IN, AU, NZ, US.
|
|
188
188
|
Use `from_file` when there are more than ~50 distinct values;
|
|
@@ -143,13 +143,13 @@ tasks: [...]
|
|
|
143
143
|
|
|
144
144
|
## Libraries vs YAML Anchors
|
|
145
145
|
|
|
146
|
-
| Feature
|
|
147
|
-
|
|
148
|
-
| Scope
|
|
149
|
-
| Management | Inline in YAML
|
|
150
|
-
| Syntax
|
|
151
|
-
| Override
|
|
152
|
-
| Best for
|
|
146
|
+
| Feature | YAML Anchors (`&`/`*`) | Libraries (`$ref`) |
|
|
147
|
+
|------------|------------------------|--------------------------------|
|
|
148
|
+
| Scope | Within one ruleset | Across multiple rulesets |
|
|
149
|
+
| Management | Inline in YAML | Managed via API/CLI, versioned |
|
|
150
|
+
| Syntax | `<<: *anchor_name` | `$ref: "lib#path"` |
|
|
151
|
+
| Override | `<<:` merge key | Not supported (use as-is) |
|
|
152
|
+
| Best for | Single-ruleset reuse | Organisation-wide standards |
|
|
153
153
|
|
|
154
154
|
**Recommendation:**
|
|
155
155
|
- Start with YAML anchors (`mask_definitions`) for within-ruleset deduplication
|
|
@@ -72,6 +72,16 @@ For PostgreSQL/MySQL, plain names work: `table: users`, `key: id`.
|
|
|
72
72
|
|
|
73
73
|
## Mask Types Quick Reference
|
|
74
74
|
|
|
75
|
+
This is the **closed list** of every `type:` value DataMasque accepts. Do
|
|
76
|
+
not invent mask types or parameters (no `source_table`, no `link_to`, no
|
|
77
|
+
`parent_column` — none exist). For per-mask parameter details, see the
|
|
78
|
+
canonical source:
|
|
79
|
+
<https://portal.datamasque.com/portal/documentation/latest/masking-functions-overview.html>.
|
|
80
|
+
|
|
81
|
+
For a deterministic hash, use `imitate_unique` (or `imitate_uuid` for UUIDs)
|
|
82
|
+
optionally with `seed:` to namespace. The cascade is automatic; no
|
|
83
|
+
cross-table reference parameter exists. See `fk-cascade.md`.
|
|
84
|
+
|
|
75
85
|
### Generic
|
|
76
86
|
- `from_fixed` — fixed replacement value
|
|
77
87
|
- `from_column` — copy from another column
|
|
@@ -124,6 +134,20 @@ For PostgreSQL/MySQL, plain names work: `table: users`, `key: id`.
|
|
|
124
134
|
### Document
|
|
125
135
|
- `json` — mask JSON fields within a column
|
|
126
136
|
- `xml` — mask XML elements within a column
|
|
137
|
+
- `unstructured_text` — mask entities inside free text
|
|
138
|
+
|
|
139
|
+
### Commonly-hallucinated names that do NOT exist
|
|
140
|
+
|
|
141
|
+
These plausible-sounding names are not in DataMasque. Refuse to emit them:
|
|
142
|
+
|
|
143
|
+
| Hallucinated name | What was wanted | Use instead |
|
|
144
|
+
|-------------------------------------------------------------|-----------------------------------|---------------------------------------------------------------|
|
|
145
|
+
| `hash_text`, `hash` | deterministic hash of a value | `imitate_unique` (or `imitate_uuid` for UUIDs) |
|
|
146
|
+
| `link`, `match_id`, `link_to` | join two columns after masking | shared `imitate_unique` config on both sides; for real FK columns, mask only the parent PK and omit the FK rule |
|
|
147
|
+
| `from_random_words` | random words / short text | `from_random_text` (random chars) or `from_file` |
|
|
148
|
+
| `from_random_string` | random string | `from_random_text` |
|
|
149
|
+
| `redact`, `mask_value` | constant placeholder | `from_fixed` with `value:` |
|
|
150
|
+
| `source_table`, `source_column`, `parent_column`, `link_to` | param to point a FK at its parent | does not exist — cascade is automatic with shared mask config |
|
|
127
151
|
|
|
128
152
|
## skip_defaults
|
|
129
153
|
|