@orderful/droid 0.45.0 → 0.46.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. package/.claude-plugin/marketplace.json +1 -1
  2. package/.claude-plugin/plugin.json +8 -2
  3. package/.github/workflows/claude-issue-agent.yml +1 -2
  4. package/CHANGELOG.md +14 -0
  5. package/dist/tools/droid/.claude-plugin/plugin.json +1 -1
  6. package/dist/tools/droid/TOOL.yaml +1 -1
  7. package/dist/tools/droid/skills/droid/SKILL.md +1 -0
  8. package/dist/tools/droid/skills/droid-bootstrap/SKILL.md +1 -0
  9. package/dist/tools/edi-schema/.claude-plugin/plugin.json +25 -0
  10. package/dist/tools/edi-schema/TOOL.yaml +29 -0
  11. package/dist/tools/edi-schema/agents/edi-schema-agent.md +97 -0
  12. package/dist/tools/edi-schema/commands/edi-schema.md +33 -0
  13. package/dist/tools/edi-schema/skills/edi-schema/SKILL.md +86 -0
  14. package/dist/tools/pii/.claude-plugin/plugin.json +25 -0
  15. package/dist/tools/pii/TOOL.yaml +22 -0
  16. package/dist/tools/pii/agents/pii-scanner.md +85 -0
  17. package/dist/tools/pii/commands/pii.md +33 -0
  18. package/dist/tools/pii/skills/pii/SKILL.md +97 -0
  19. package/dist/tools/pii/skills/pii/references/supported-entities.md +90 -0
  20. package/dist/tools/pii/skills/pii/scripts/presidio-analyze.d.ts +18 -0
  21. package/dist/tools/pii/skills/pii/scripts/presidio-analyze.d.ts.map +1 -0
  22. package/dist/tools/pii/skills/pii/scripts/presidio-analyze.ts +258 -0
  23. package/dist/tools/pii/skills/pii/scripts/presidio-init.d.ts +17 -0
  24. package/dist/tools/pii/skills/pii/scripts/presidio-init.d.ts.map +1 -0
  25. package/dist/tools/pii/skills/pii/scripts/presidio-init.ts +151 -0
  26. package/dist/tools/pii/skills/pii/scripts/presidio-redact.d.ts +21 -0
  27. package/dist/tools/pii/skills/pii/scripts/presidio-redact.d.ts.map +1 -0
  28. package/dist/tools/pii/skills/pii/scripts/presidio-redact.ts +294 -0
  29. package/dist/tools/pii/skills/pii/scripts/presidio.test.ts +444 -0
  30. package/package.json +1 -1
  31. package/src/tools/droid/.claude-plugin/plugin.json +1 -1
  32. package/src/tools/droid/TOOL.yaml +1 -1
  33. package/src/tools/droid/skills/droid/SKILL.md +1 -0
  34. package/src/tools/droid/skills/droid-bootstrap/SKILL.md +1 -0
  35. package/src/tools/edi-schema/.claude-plugin/plugin.json +25 -0
  36. package/src/tools/edi-schema/TOOL.yaml +29 -0
  37. package/src/tools/edi-schema/agents/edi-schema-agent.md +97 -0
  38. package/src/tools/edi-schema/commands/edi-schema.md +33 -0
  39. package/src/tools/edi-schema/skills/edi-schema/SKILL.md +86 -0
  40. package/src/tools/pii/.claude-plugin/plugin.json +25 -0
  41. package/src/tools/pii/TOOL.yaml +22 -0
  42. package/src/tools/pii/agents/pii-scanner.md +85 -0
  43. package/src/tools/pii/commands/pii.md +33 -0
  44. package/src/tools/pii/skills/pii/SKILL.md +97 -0
  45. package/src/tools/pii/skills/pii/references/supported-entities.md +90 -0
  46. package/src/tools/pii/skills/pii/scripts/presidio-analyze.ts +258 -0
  47. package/src/tools/pii/skills/pii/scripts/presidio-init.ts +151 -0
  48. package/src/tools/pii/skills/pii/scripts/presidio-redact.ts +294 -0
  49. package/src/tools/pii/skills/pii/scripts/presidio.test.ts +444 -0
@@ -0,0 +1,86 @@
1
+ ---
2
+ name: edi-schema
3
+ description: "Query EDI schemas without consuming parent context. Use when finding fields, loops, segments, requirements, or code-list values for X12/EDIFACT transaction types. User prompts like '/edi-schema 944_004010 freeFormDescription' or '/edi-schema 850 --codes BEG01'."
4
+ argument-hint: "{transactionType|types|versions {transactionType}} [{query|--segments|--loops|--codes {elementId}}]"
5
+ allowed-tools: [Read, Glob, Grep, Bash, Task]
6
+ ---
7
+
8
+ # EDI Schema Skill
9
+
10
+ Query EDI schemas through a sub-agent so large schema JSON files never enter parent context.
11
+
12
+ ## Commands
13
+
14
+ ```bash
15
+ /edi-schema {transactionType} {query}
16
+ /edi-schema {transactionType} --segments
17
+ /edi-schema {transactionType} --loops
18
+ /edi-schema {transactionType} --codes {elementId}
19
+ /edi-schema types
20
+ /edi-schema versions {transactionType}
21
+ ```
22
+
23
+ ## Input Rules
24
+
25
+ - `transactionType` accepts either full form (`850_004010`, `ORDERS_D96A`) or short form (`850`, `ORDERS`).
26
+ - If short form is used, resolve to the most common version and state which version was selected.
27
+ - If multiple plausible versions exist and no safe default is clear, show options and ask user to pick.
28
+
29
+ ## Data Sources
30
+
31
+ Expected paths in the target schemas repo:
32
+
33
+ - `libs/schemas/src/lib/data/transactionSchemas/x12/`
34
+ - `libs/schemas/src/lib/data/transactionSchemas/edifact/`
35
+ - `libs/schemas/src/lib/data/codeLists/`
36
+ - `libs/schemas/src/lib/data/transactionTypes.json`
37
+
38
+ ## Procedure
39
+
40
+ ### 1. Resolve schemas root
41
+
42
+ 1. Check `droid config --get tools.edi-schema.schemas_dir`.
43
+ 2. If configured and valid, use it.
44
+ 3. Otherwise auto-detect from current workspace by finding `libs/schemas/src/lib/data/`.
45
+ 4. If not found, ask user for the repo path or suggest setting `tools.edi-schema.schemas_dir`.
46
+
47
+ ### 2. Route command
48
+
49
+ - `types`:
50
+ - List transaction types from `transactionTypes.json` (type + description where available).
51
+ - `versions {transactionType}`:
52
+ - List matching schema versions from x12/edifact schema filenames.
53
+ - `{transactionType} --segments`:
54
+ - Ask sub-agent for unique segment list and requirement indicators.
55
+ - `{transactionType} --loops`:
56
+ - Ask sub-agent for loop hierarchy.
57
+ - `{transactionType} --codes {elementId}`:
58
+ - Ask sub-agent for code-list values for the requested element ID.
59
+ - `{transactionType} {query}`:
60
+ - Ask sub-agent to search by identifier/title/business term and return best matches.
61
+
62
+ ### 3. Sub-agent handoff
63
+
64
+ Use `edi-schema-agent` for all schema/content queries. Pass:
65
+
66
+ - Resolved schemas root
67
+ - Resolved schema file path + transaction type/version
68
+ - Query mode (`field-search`, `segments`, `loops`, `codes`, `versions`, `types`)
69
+ - User input (`query`, `elementId`)
70
+
71
+ ### 4. Response format
72
+
73
+ Keep responses compact and structured:
74
+
75
+ - Header with resolved schema (`850_004010`, etc.)
76
+ - Result table or bullets with identifiers and titles
77
+ - Requirement/type constraints where relevant
78
+ - Path hints (loop/segment path) when available
79
+ - Clear next step if no matches
80
+
81
+ ## Behaviour Rules
82
+
83
+ - Never dump the entire schema JSON.
84
+ - Prefer exact identifier match first (for example `BEG01`) before fuzzy title matching.
85
+ - When no exact match, return top fuzzy matches with confidence notes.
86
+ - If jq is unavailable, fall back to `rg`/targeted reads and state that fallback was used.
@@ -0,0 +1,25 @@
1
+ {
2
+ "name": "droid-pii",
3
+ "version": "0.1.0",
4
+ "description": "Detect and redact PII (personally identifiable information) in text files. Powered by Microsoft Presidio with a bundled Python venv — zero external dependencies after first run.",
5
+ "author": {
6
+ "name": "Orderful",
7
+ "url": "https://github.com/orderful"
8
+ },
9
+ "repository": "https://github.com/orderful/droid",
10
+ "license": "MIT",
11
+ "keywords": [
12
+ "droid",
13
+ "ai",
14
+ "pii"
15
+ ],
16
+ "skills": [
17
+ "./skills/pii/SKILL.md"
18
+ ],
19
+ "commands": [
20
+ "./commands/pii.md"
21
+ ],
22
+ "agents": [
23
+ "./agents/pii-scanner.md"
24
+ ]
25
+ }
@@ -0,0 +1,22 @@
1
+ name: pii
2
+ description: "Detect and redact PII (personally identifiable information) in text files. Powered by Microsoft Presidio with a bundled Python venv — zero external dependencies after first run."
3
+ version: 0.1.0
4
+ status: beta
5
+
6
+ includes:
7
+ skills:
8
+ - name: pii
9
+ required: true
10
+ commands:
11
+ - name: pii
12
+ is_alias: false
13
+ agents:
14
+ - pii-scanner
15
+
16
+ dependencies: []
17
+
18
+ prerequisites:
19
+ - name: python3
20
+ description: "Python 3.8+ required to run the Presidio venv"
21
+ check: "python3 --version"
22
+ install_hint: "Install Python 3 from python.org or via: brew install python3"
@@ -0,0 +1,85 @@
1
+ ---
2
+ name: pii-scanner
3
+ description: "Isolated PII analysis agent. Runs presidio-analyze.ts in a contained context so raw entity values never appear in the parent conversation. Use PROACTIVELY when the pii skill delegates scan operations."
4
+ tools:
5
+ - Bash
6
+ color: orange
7
+ ---
8
+
9
+ You are a PII scanning agent. Your sole job is to run `presidio-analyze.ts` on a file or text and return a structured summary — without leaking raw PII values into the conversation.
10
+
11
+ ## Inputs
12
+
13
+ You will receive:
14
+
15
+ 1. `file_path` — absolute path to the file to scan (preferred), OR
16
+ 2. `text_content` — inline text to scan (for small strings only)
17
+ 3. `entities` (optional) — comma-separated list of entity types to filter (e.g. `EMAIL_ADDRESS,PHONE_NUMBER`)
18
+ 4. `venv_path` (optional) — override for the Presidio venv path (default: `~/.droid/runtimes/presidio/`)
19
+
20
+ ## Rules
21
+
22
+ - **Never echo raw PII values** in your output — return entity types, counts, and line numbers only
23
+ - Make exactly one Bash call to `presidio-analyze.ts`
24
+ - Parse the JSON result and return only the structured summary
25
+ - If the script returns `init_required: true`, stop and tell the parent skill to run `presidio-init.ts` first
26
+ - If the file does not exist, return a clear error
27
+
28
+ ## Procedure
29
+
30
+ 1. Build the command:
31
+ ```bash
32
+ droid exec pii presidio-analyze --file <file_path> [--entities <types>]
33
+ ```
34
+ (Use `--text` only for inline strings under ~1000 characters)
35
+
36
+ 2. Parse the JSON output from the script
37
+
38
+ 3. From the `entities` array, compute:
39
+ - `total_entities`: total count
40
+ - `by_type`: entity type → count map
41
+ - `lines_affected`: sorted unique list of line numbers
42
+ - `sample_lines`: up to 5 line numbers with the entity types found on each line
43
+
44
+ 4. Return the structured summary (see Output Format below)
45
+
46
+ ## Output Format
47
+
48
+ Return JSON:
49
+
50
+ ```json
51
+ {
52
+ "file": "/path/to/file.md",
53
+ "total_entities": 3,
54
+ "by_type": {
55
+ "EMAIL_ADDRESS": 2,
56
+ "PHONE_NUMBER": 1
57
+ },
58
+ "lines_affected": [4, 8, 12],
59
+ "sample_lines": [
60
+ { "line": 4, "types": ["EMAIL_ADDRESS"] },
61
+ { "line": 8, "types": ["PHONE_NUMBER"] },
62
+ { "line": 12, "types": ["EMAIL_ADDRESS"] }
63
+ ]
64
+ }
65
+ ```
66
+
67
+ If no entities are found:
68
+ ```json
69
+ {
70
+ "file": "/path/to/file.md",
71
+ "total_entities": 0,
72
+ "by_type": {},
73
+ "lines_affected": [],
74
+ "sample_lines": []
75
+ }
76
+ ```
77
+
78
+ If an error occurs:
79
+ ```json
80
+ {
81
+ "file": "/path/to/file.md",
82
+ "error": "...",
83
+ "init_required": true
84
+ }
85
+ ```
@@ -0,0 +1,33 @@
1
+ ---
2
+ name: pii
3
+ description: "Detect and redact PII (personally identifiable information) in files and directories. Powered by Microsoft Presidio."
4
+ argument-hint: "[scan | redact] {file|dir} [--entities TYPES] [--output PATH] [--dry-run] [--mask]"
5
+ ---
6
+
7
+ # /pii
8
+
9
+ **User invoked:** `/pii $ARGUMENTS`
10
+
11
+ **Your task:** Invoke the **pii skill** with these arguments.
12
+
13
+ ## Examples
14
+
15
+ - `/pii scan transcript.md`
16
+ - `/pii scan ./meeting-notes/`
17
+ - `/pii redact transcript.md --output transcript-clean.md`
18
+ - `/pii redact transcript.md --dry-run`
19
+ - `/pii redact transcript.md --entities PHONE_NUMBER,EMAIL_ADDRESS`
20
+
21
+ ## Quick Reference
22
+
23
+ ```bash
24
+ /pii scan {file} # Scan file for PII
25
+ /pii scan {dir} # Scan directory recursively
26
+ /pii redact {file} # Redact PII (writes {file}-redacted.{ext})
27
+ /pii redact {file} --output {out} # Redact to specific output path
28
+ /pii redact {file} --dry-run # Preview redactions without writing
29
+ /pii redact {file} --entities TYPES # Redact only specific entity types
30
+ /pii redact {file} --mask # Use *** instead of <ENTITY_TYPE>
31
+ ```
32
+
33
+ See the **pii skill** for full behaviour, procedure, and supported entity types.
@@ -0,0 +1,97 @@
1
+ ---
2
+ name: pii
3
+ description: "Detect and redact PII in text files and directories. Use when scanning files for sensitive data before sharing, or redacting emails, phone numbers, credit cards, SSNs, and other PII. User prompts like '/pii scan transcript.md' or '/pii redact meeting-notes/ --dry-run'."
4
+ argument-hint: "[scan | redact] {file|dir} [--entities TYPES] [--output PATH] [--dry-run] [--mask]"
5
+ allowed-tools: [Read, Glob, Grep, Bash, Task]
6
+ ---
7
+
8
+ # PII Skill
9
+
10
+ Detect and redact personally identifiable information using Microsoft Presidio — deterministic pattern matching + NER, fully offline after first run.
11
+
12
+ ## Usage
13
+
14
+ ```bash
15
+ /pii scan transcript.md # Scan a file for PII
16
+ /pii scan ./meeting-notes/ # Scan a directory recursively
17
+ /pii redact transcript.md # Redact PII (writes to transcript-redacted.md)
18
+ /pii redact transcript.md --output clean.md # Redact to specific output file
19
+ /pii redact transcript.md --dry-run # Preview redactions without writing
20
+ /pii redact transcript.md --entities PHONE_NUMBER,EMAIL_ADDRESS # Redact specific types only
21
+ /pii redact transcript.md --mask # Use *** instead of <ENTITY_TYPE> placeholders
22
+ ```
23
+
24
+ ## Procedure
25
+
26
+ ### Step 0: Ensure Presidio venv is ready
27
+
28
+ Before any scan or redact operation, verify the venv exists:
29
+
30
+ ```bash
31
+ droid exec pii presidio-init
32
+ ```
33
+
34
+ On first run this takes ~2–3 min (downloads Presidio packages + spaCy en_core_web_lg model). Subsequent runs return immediately (`already_existed: true`).
35
+
36
+ If init fails, surface the error and stop — do not attempt to run analysis without the venv.
37
+
38
+ ### Step 1: Route on subcommand
39
+
40
+ Parse the first argument:
41
+ - `scan` → proceed to Step 2
42
+ - `redact` → proceed to Step 3
43
+ - anything else → show usage and stop
44
+
45
+ ### Step 2: Scan (delegate to pii-scanner agent)
46
+
47
+ The `pii-scanner` agent runs analysis in an isolated context so raw PII values never appear in the parent conversation.
48
+
49
+ **For a single file:**
50
+ Delegate to the `pii-scanner` agent with `file_path` set to the absolute path.
51
+
52
+ **For a directory:**
53
+ Use `Glob` to find text files matching: `**/*.{md,txt,ts,js,json,yaml,yml,csv,html,xml}`
54
+ Delegate each file to `pii-scanner` individually and aggregate results.
55
+
56
+ **Display results:**
57
+ Show entity types, counts, and affected line numbers only — never print raw PII values.
58
+
59
+ Example output:
60
+ ```
61
+ transcript.md: 3 PII entities found
62
+ EMAIL_ADDRESS × 2 (lines 4, 12)
63
+ PHONE_NUMBER × 1 (line 8)
64
+ ```
65
+
66
+ ### Step 3: Redact (run presidio-redact.ts directly)
67
+
68
+ ```bash
69
+ droid exec pii presidio-redact \
70
+ --file <path> \
71
+ [--output <path>] \
72
+ [--dry-run] \
73
+ [--entities <TYPES>] \
74
+ [--mask]
75
+ ```
76
+
77
+ Parse the JSON result and display:
78
+ - Number of entities found and redacted
79
+ - Output file path (if written)
80
+ - In `--dry-run` mode: show a brief diff preview (first 10 redacted lines), but do NOT echo the full `redacted_text` field by default
81
+
82
+ **Never echo `redacted_text` in full to the terminal** unless the user explicitly requests it.
83
+
84
+ ### Step 4: Format and present results
85
+
86
+ - Always show: entity type, count, affected lines
87
+ - On success for redact: confirm output path and entity counts
88
+ - On failure: surface the error from the script's `error` field with actionable next steps
89
+
90
+ ## Behaviour Rules
91
+
92
+ - **Never print raw PII values** to the terminal — always show types + line numbers only
93
+ - For `--dry-run` redact, show a preview diff (10 lines max) not the full document
94
+ - If `python3` is not found, show a clear install message from the prerequisite check
95
+ - Binary files and unknown extensions are skipped during directory scans
96
+ - The `pii-scanner` agent handles the isolation boundary for `scan`; `redact` operates directly since the redacted output is the goal
97
+ - Supported entity types are listed in `references/supported-entities.md`
@@ -0,0 +1,90 @@
1
+ # Supported PII Entity Types
2
+
3
+ Reference for all entity types detectable by Microsoft Presidio (used by the `/pii` skill).
4
+
5
+ Pass these names to `--entities` to filter detection:
6
+ ```bash
7
+ /pii redact notes.md --entities EMAIL_ADDRESS,PHONE_NUMBER
8
+ /pii scan transcript.md --entities US_SSN,CREDIT_CARD
9
+ ```
10
+
11
+ ---
12
+
13
+ ## Global Entities
14
+
15
+ These entity types are supported across all languages and locales.
16
+
17
+ | Entity | Description | Example | Detection Method |
18
+ |--------|-------------|---------|-----------------|
19
+ | `PERSON` | Full or partial personal names | John Smith, Jane | NER (spaCy) |
20
+ | `EMAIL_ADDRESS` | Email addresses | user@example.com | Regex + validation |
21
+ | `PHONE_NUMBER` | Phone numbers (various formats) | +1 555-123-4567, (555) 123-4567 | Regex |
22
+ | `CREDIT_CARD` | Credit card numbers (Luhn-validated) | 4111 1111 1111 1111 | Regex + Luhn algorithm |
23
+ | `IBAN_CODE` | International Bank Account Numbers | GB29 NWBK 6016 1331 9268 19 | Regex + checksum |
24
+ | `IP_ADDRESS` | IPv4 and IPv6 addresses | 192.168.1.1, 2001:db8::1 | Regex |
25
+ | `LOCATION` | Geographic locations and place names | New York, London, 123 Main St | NER (spaCy) |
26
+ | `DATE_TIME` | Dates, times, and datetime values | 2024-01-15, January 15, 3:00 PM | NER (spaCy) |
27
+ | `NRP` | Nationality, religion, political group | American, Christian | NER (spaCy) |
28
+ | `MEDICAL_LICENSE` | Medical licence numbers | MD12345 | Regex |
29
+ | `URL` | Web URLs | https://example.com/path | Regex |
30
+ | `CRYPTO` | Cryptocurrency wallet addresses | 1BvBMSEYstWetqTFn5Au4m4GFg7xJaNVN2 | Regex |
31
+
32
+ ---
33
+
34
+ ## US Entities
35
+
36
+ | Entity | Description | Example | Detection Method |
37
+ |--------|-------------|---------|-----------------|
38
+ | `US_SSN` | US Social Security Numbers | 123-45-6789 | Regex |
39
+ | `US_PASSPORT` | US passport numbers | A12345678 | Regex |
40
+ | `US_ITIN` | US Individual Taxpayer Identification Numbers | 912-00-0000 | Regex |
41
+ | `US_DRIVER_LICENSE` | US driver's licence numbers (state-specific) | A1234567 | Regex |
42
+ | `US_BANK_NUMBER` | US bank account numbers | 123456789012 | Regex |
43
+
44
+ ---
45
+
46
+ ## UK Entities
47
+
48
+ | Entity | Description | Example | Detection Method |
49
+ |--------|-------------|---------|-----------------|
50
+ | `UK_NHS` | UK National Health Service numbers | 123 456 7890 | Regex + checksum |
51
+
52
+ ---
53
+
54
+ ## European Entities
55
+
56
+ | Entity | Description | Example | Detection Method |
57
+ |--------|-------------|---------|-----------------|
58
+ | `ES_NIF` | Spanish NIF (tax ID) | 12345678A | Regex + checksum |
59
+ | `IT_FISCAL_CODE` | Italian fiscal code | RSSMRA85T10A562S | Regex + checksum |
60
+ | `IT_DRIVER_LICENSE` | Italian driver's licence | AA123456 | Regex |
61
+ | `IT_VAT_CODE` | Italian VAT code | IT12345678901 | Regex |
62
+ | `IT_PASSPORT` | Italian passport | AA1234567 | Regex |
63
+ | `IT_IDENTITY_CARD` | Italian identity card | CA12345AA | Regex |
64
+ | `PL_PESEL` | Polish PESEL national ID | 44051401458 | Regex + checksum |
65
+
66
+ ---
67
+
68
+ ## Asia-Pacific & Other Entities
69
+
70
+ | Entity | Description | Example | Detection Method |
71
+ |--------|-------------|---------|-----------------|
72
+ | `SG_NRIC_FIN` | Singapore NRIC/FIN | S1234567A | Regex + checksum |
73
+ | `AU_ABN` | Australian Business Number | 51 824 753 556 | Regex + checksum |
74
+ | `AU_ACN` | Australian Company Number | 004 085 616 | Regex + checksum |
75
+ | `AU_TFN` | Australian Tax File Number | 123 456 782 | Regex + checksum |
76
+ | `AU_MEDICARE` | Australian Medicare number | 2123456701 | Regex + checksum |
77
+
78
+ ---
79
+
80
+ ## Notes
81
+
82
+ - **NER-based** entities (PERSON, LOCATION, DATE_TIME, NRP) use the spaCy `en_core_web_lg` model. Accuracy depends on context — short, ambiguous names may be missed or misidentified.
83
+ - **Regex + checksum** entities are highly accurate — a CREDIT_CARD match always passes Luhn's algorithm.
84
+ - **Confidence scores** in `presidio-analyze.ts` output reflect detection certainty (0.0–1.0). Scores < 0.5 indicate uncertain matches.
85
+ - For custom recognizers (Orderful-specific patterns like API keys, account IDs), see the v2 roadmap in issue #292.
86
+
87
+ ## References
88
+
89
+ - [Presidio supported entities documentation](https://microsoft.github.io/presidio/supported_entities/)
90
+ - [Adding custom recognizers](https://microsoft.github.io/presidio/analyzer/adding_recognizers/)
@@ -0,0 +1,258 @@
1
+ #!/usr/bin/env bun
2
+ /**
3
+ * presidio-analyze
4
+ *
5
+ * Detect PII in a file or text string using Presidio.
6
+ * Shells out to the bundled Python venv.
7
+ *
8
+ * Usage:
9
+ * bun run presidio-analyze.ts --file transcript.md
10
+ * bun run presidio-analyze.ts --text "Call me at 555-1234"
11
+ * bun run presidio-analyze.ts --file notes.md --entities EMAIL_ADDRESS,PHONE_NUMBER
12
+ *
13
+ * Output (JSON):
14
+ * { "success": true, "entities": [{ "type": "EMAIL_ADDRESS", "start": 10, "end": 25, "score": 0.85, "line": 3 }] }
15
+ * { "success": false, "error": "...", "init_required": true }
16
+ */
17
+
18
+ import { execSync } from 'child_process';
19
+ import { existsSync, mkdirSync, writeFileSync, unlinkSync, readFileSync } from 'fs';
20
+ import { join } from 'path';
21
+ import { tmpdir } from 'os';
22
+
23
/** Root of the Presidio virtual environment: `$HOME/.droid/runtimes/presidio` (HOME falls back to ''). */
const VENV_PATH = join(process.env.HOME || '', '.droid', 'runtimes', 'presidio');

/** Python interpreter inside the Presidio venv. */
const VENV_PYTHON = join(VENV_PATH, 'bin', 'python3');

/** Cap on captured child-process output: 50 MiB. */
const MAX_BUFFER_BYTES = 50 * 1024 * 1024;

/** Entity type names may only contain upper-case letters, digits and underscores. */
const ENTITY_NAME_PATTERN = /^[A-Z0-9_]+$/;

/** Entity type names accepted by `--entities`. */
const SUPPORTED_ENTITIES = new Set([
  // Global
  'PERSON', 'EMAIL_ADDRESS', 'PHONE_NUMBER', 'CREDIT_CARD',
  'IBAN_CODE', 'IP_ADDRESS', 'LOCATION', 'DATE_TIME',
  'NRP', 'MEDICAL_LICENSE', 'URL', 'CRYPTO',
  // US
  'US_SSN', 'US_PASSPORT', 'US_ITIN', 'US_DRIVER_LICENSE', 'US_BANK_NUMBER',
  // UK
  'UK_NHS',
  // Europe
  'ES_NIF', 'IT_FISCAL_CODE', 'IT_DRIVER_LICENSE', 'IT_VAT_CODE',
  'IT_PASSPORT', 'IT_IDENTITY_CARD', 'PL_PESEL',
  // Asia-Pacific
  'SG_NRIC_FIN', 'AU_ABN', 'AU_ACN', 'AU_TFN', 'AU_MEDICARE',
]);
59
+
60
/** One detected PII span, as emitted in the script's JSON output. */
interface Entity {
  /** Presidio entity type, e.g. `EMAIL_ADDRESS`. */
  type: string;
  /** Start offset of the match as reported by Presidio. */
  start: number;
  /** End offset of the match as reported by Presidio. */
  end: number;
  /** Detection score, rounded to four decimals by the Python helper script. */
  score: number;
  /** 1-based line number of `start` within the analysed text. */
  line: number;
}

/** Result envelope printed to stdout as JSON. */
interface AnalyzeResult {
  success: boolean;
  /** Present when `success` is true. */
  entities?: Entity[];
  /** Human-readable failure reason; present when `success` is false. */
  error?: string;
  /** Set when the Presidio venv is missing and presidio-init must run first. */
  init_required?: boolean;
}

/** Command-line options recognised by this script. */
interface ParsedArgs {
  /** Value of `--file`: path of the file to analyse. */
  file?: string;
  /** Value of `--text`: inline text to analyse. */
  text?: string;
  /** Value of `--entities`, split on commas and trimmed. */
  entities?: string[];
}
80
+
81
/**
 * Extract `--file`, `--text` and `--entities` from the raw argument list.
 *
 * A flag is honoured only when it is followed by a non-empty value; that value
 * is consumed together with the flag. Unknown arguments and flags without a
 * value are skipped. When a flag is repeated, the last occurrence wins.
 *
 * @param args - Command-line arguments (without the runtime and script path).
 * @returns The recognised options; `entities` is split on commas and trimmed.
 */
function parseArgs(args: string[]): ParsedArgs {
  const options: ParsedArgs = {};
  let index = 0;

  while (index < args.length) {
    const flag = args[index];
    const value = args[index + 1];
    let consumed = 1;

    if (value) {
      switch (flag) {
        case '--file':
          options.file = value;
          consumed = 2;
          break;
        case '--text':
          options.text = value;
          consumed = 2;
          break;
        case '--entities':
          options.entities = value.split(',').map(name => name.trim());
          consumed = 2;
          break;
        default:
          break;
      }
    }

    index += consumed;
  }

  return options;
}
97
+
98
/**
 * Return the 1-based line number of the character at `offset` in `text`.
 *
 * `offset` is interpreted as a Unicode code-point offset, because the offsets
 * passed in come from the Python/Presidio side, where strings are indexed by
 * code point. JavaScript string indices count UTF-16 code units instead, so
 * slicing by `offset` directly would under-count whenever an astral character
 * (e.g. an emoji) precedes the match. Iterating with `for...of` walks the
 * text code point by code point and avoids allocating a split array.
 *
 * @param text - The full text that was analysed.
 * @param offset - Code-point offset into `text`; values past the end count every line.
 * @returns 1 plus the number of `\n` characters before `offset`.
 */
function computeLineNumber(text: string, offset: number): number {
  let line = 1;
  let position = 0;

  for (const ch of text) {
    if (position >= offset) {
      break;
    }
    if (ch === '\n') {
      line += 1;
    }
    position += 1;
  }

  return line;
}
102
+
103
/**
 * Check the requested entity type names before they are embedded in the
 * generated Python script.
 *
 * @param entities - Names supplied via `--entities`, or `undefined` when the flag was omitted.
 * @returns An error message for the first name that is malformed or not in
 *          `SUPPORTED_ENTITIES`; `undefined` when every name is acceptable or
 *          the list is missing/empty.
 */
function validateEntities(entities: string[] | undefined): string | undefined {
  const problemWith = (name: string): string | undefined => {
    if (!ENTITY_NAME_PATTERN.test(name)) {
      return `Invalid entity type: ${name}. Allowed pattern: ${ENTITY_NAME_PATTERN.source}`;
    }
    return SUPPORTED_ENTITIES.has(name) ? undefined : `Unsupported entity type: ${name}`;
  };

  for (const name of entities ?? []) {
    const problem = problemWith(name);
    if (problem !== undefined) {
      return problem;
    }
  }

  return undefined;
}
120
+
121
/**
 * Execute a shell command synchronously and capture its output as UTF-8 text.
 *
 * @param cmd - Full command line, passed to `execSync` as-is.
 * @returns `ok: true` with the captured stdout when `execSync` returns normally.
 *          When it throws, `ok: false` with whatever stdout/stderr the error
 *          carries; stderr falls back to the error message, then to
 *          `'Unknown error'`.
 */
function run(cmd: string): { ok: boolean; stdout: string; stderr: string } {
  let stdout: string;

  try {
    stdout = execSync(cmd, {
      encoding: 'utf-8',
      stdio: ['pipe', 'pipe', 'pipe'],
      maxBuffer: MAX_BUFFER_BYTES,
    });
  } catch (err: unknown) {
    const failure = err as { stdout?: string; stderr?: string; message?: string };
    const stderr = failure.stderr || failure.message || 'Unknown error';
    return { ok: false, stdout: failure.stdout || '', stderr };
  }

  return { ok: true, stdout, stderr: '' };
}
138
+
139
/**
 * Run Presidio over the requested input and report the detected entities.
 *
 * Steps: check that the venv interpreter exists, validate the arguments, load
 * the text, write a throw-away Python script that calls
 * `AnalyzerEngine.analyze`, execute it with the venv interpreter, and attach a
 * line number to every hit.
 *
 * @param parsed - Parsed CLI arguments. When both `file` and `text` are set, `file` is used.
 * @returns `{ success: true, entities }` on success; otherwise
 *          `{ success: false, error }`, with `init_required: true` when the
 *          venv interpreter is missing.
 */
function presidioAnalyze(parsed: ParsedArgs): AnalyzeResult {
  // The venv must have been created by presidio-init beforehand.
  if (!existsSync(VENV_PYTHON)) {
    return {
      success: false,
      error: 'Presidio venv not found. Run presidio-init.ts first.',
      init_required: true,
    };
  }

  if (!parsed.file && !parsed.text) {
    return {
      success: false,
      error: 'Either --file or --text is required.',
    };
  }

  const entitiesError = validateEntities(parsed.entities);
  if (entitiesError) {
    return {
      success: false,
      error: entitiesError,
    };
  }

  // Keep the analysed text around: it is needed again for line-number computation.
  let sourceText: string;
  if (parsed.file) {
    if (!existsSync(parsed.file)) {
      return { success: false, error: `File not found: ${parsed.file}` };
    }
    try {
      sourceText = readFileSync(parsed.file, 'utf-8');
    } catch (err: unknown) {
      const e = err as { message?: string };
      return { success: false, error: `Failed to read file: ${e.message}` };
    }
  } else {
    // The guard above guarantees text is non-empty here; the fallback only satisfies the type checker.
    sourceText = parsed.text ?? '';
  }

  // Entity names were validated above, so they can be quoted directly into the script.
  const entitiesArg = parsed.entities && parsed.entities.length > 0
    ? `entities=[${parsed.entities.map(e => `"${e}"`).join(', ')}]`
    : '';

  // The text is embedded as a JSON string literal, whose escapes (\n, \", \\, \uXXXX)
  // are also valid inside a Python string literal.
  const pythonScript = `
import sys, json
from presidio_analyzer import AnalyzerEngine

engine = AnalyzerEngine()
text = ${JSON.stringify(sourceText)}
results = engine.analyze(text=text, language='en'${entitiesArg ? ', ' + entitiesArg : ''})
output = []
for r in results:
    output.append({
        'type': r.entity_type,
        'start': r.start,
        'end': r.end,
        'score': round(r.score, 4)
    })
print(json.dumps(output))
`.trim();

  // The script contains the raw text being scanned, so give it a name that
  // concurrent runs cannot collide on, refuse to overwrite an existing path
  // ('wx'), and make it readable by the current user only.
  const tmpDir = tmpdir();
  const uniqueSuffix = `${Date.now()}-${process.pid}-${Math.random().toString(36).slice(2)}`;
  const tmpScript = join(tmpDir, `pii-analyze-${uniqueSuffix}.py`);

  try {
    mkdirSync(tmpDir, { recursive: true });
    writeFileSync(tmpScript, pythonScript, { encoding: 'utf-8', mode: 0o600, flag: 'wx' });
  } catch (err: unknown) {
    const e = err as { message?: string };
    return { success: false, error: `Failed to write temp script: ${e.message}` };
  }

  try {
    const result = run(`"${VENV_PYTHON}" "${tmpScript}"`);

    if (!result.ok) {
      return {
        success: false,
        error: `Presidio analysis failed: ${result.stderr}`,
      };
    }

    let rawEntities: Array<{ type: string; start: number; end: number; score: number }>;
    try {
      const decoded: unknown = JSON.parse(result.stdout.trim());
      // The helper script prints a JSON array; anything else is treated as unparseable output.
      if (!Array.isArray(decoded)) {
        throw new Error('Presidio output is not a JSON array');
      }
      rawEntities = decoded as Array<{ type: string; start: number; end: number; score: number }>;
    } catch {
      return { success: false, error: `Failed to parse Presidio output: ${result.stdout}` };
    }

    const entities: Entity[] = rawEntities.map(e => ({
      ...e,
      line: computeLineNumber(sourceText, e.start),
    }));

    return { success: true, entities };
  } finally {
    // Best-effort removal of the temp script, whether or not analysis succeeded.
    try {
      unlinkSync(tmpScript);
    } catch {
      // Ignore cleanup errors
    }
  }
}
249
+
250
// Entry point: parse the CLI arguments, run the analysis, print the result as
// pretty-printed JSON, and exit with status 1 when the analysis failed.
const result = presidioAnalyze(parseArgs(process.argv.slice(2)));

console.log(JSON.stringify(result, null, 2));

if (result.success === false) {
  process.exit(1);
}