npm - @orderful/droid - Versions diffs - 0.45.1 → 0.47.0 - Mend

@orderful/droid 0.45.1 → 0.47.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (46) hide show

package/src/tools/pii/skills/pii/scripts/presidio-redact.ts ADDED Viewed

@@ -0,0 +1,294 @@
+#!/usr/bin/env bun
+/**
+ * presidio-redact
+ *
+ * Redact PII in a file using Presidio.
+ * Shells out to the bundled Python venv.
+ *
+ * Usage:
+ *   bun run presidio-redact.ts --file transcript.md
+ *   bun run presidio-redact.ts --file transcript.md --output clean.md
+ *   bun run presidio-redact.ts --file transcript.md --dry-run
+ *   bun run presidio-redact.ts --file transcript.md --entities EMAIL_ADDRESS,PHONE_NUMBER
+ *   bun run presidio-redact.ts --file transcript.md --mask
+ *
+ * Output (JSON):
+ *   { "success": true, "dry_run": false, "original_path": "...", "output_path": "...", "entities_found": 3, "entities_redacted": 3 }
+ *   { "success": true, "dry_run": true, "original_path": "...", "entities_found": 3, "entities_redacted": 3, "redacted_text": "..." }
+ *   { "success": false, "error": "..." }
+ */
+import { execSync } from 'child_process';
+import { existsSync, mkdirSync, writeFileSync, unlinkSync, readFileSync } from 'fs';
+import { join, dirname, basename, extname } from 'path';
+import { tmpdir } from 'os';
/**
 * Directory expected to contain the Presidio Python virtualenv.
 * When `HOME` is unset the empty-string fallback makes this a path relative
 * to the current working directory.
 */
const VENV_PATH = join(
  process.env.HOME || '',
  '.droid',
  'runtimes',
  'presidio',
);

/** Python interpreter inside the virtualenv; its presence is checked before redacting. */
const VENV_PYTHON = join(VENV_PATH, 'bin', 'python3');

/** `maxBuffer` limit (50 MiB) handed to `execSync` for the child's output. */
const MAX_BUFFER_BYTES = 1024 * 1024 * 50;

/** Syntactic shape of an entity name: upper-case letters, digits and underscores only. */
const ENTITY_NAME_PATTERN = /^[A-Z0-9_]+$/;
/**
 * Entity type names this script accepts for `--entities`.
 * Anything outside this set is rejected before the Python script is built.
 */
const SUPPORTED_ENTITIES = new Set([
  // Names without a country prefix
  'PERSON', 'EMAIL_ADDRESS', 'PHONE_NUMBER', 'CREDIT_CARD', 'IBAN_CODE',
  'IP_ADDRESS', 'LOCATION', 'DATE_TIME', 'NRP', 'MEDICAL_LICENSE',
  'URL', 'CRYPTO',
  // US_
  'US_SSN', 'US_PASSPORT', 'US_ITIN', 'US_DRIVER_LICENSE', 'US_BANK_NUMBER',
  // UK_ / ES_
  'UK_NHS', 'ES_NIF',
  // IT_
  'IT_FISCAL_CODE', 'IT_DRIVER_LICENSE', 'IT_VAT_CODE', 'IT_PASSPORT', 'IT_IDENTITY_CARD',
  // PL_ / SG_
  'PL_PESEL', 'SG_NRIC_FIN',
  // AU_
  'AU_ABN', 'AU_ACN', 'AU_TFN', 'AU_MEDICARE',
]);
/** JSON document printed by the script describing the outcome of a run. */
interface RedactResult {
  /** `true` when redaction completed; `false` results carry `error`. */
  success: boolean;
  /** Echo of the `--dry-run` flag on successful runs. */
  dry_run?: boolean;
  /** Path of the input file as given on the command line. */
  original_path?: string;
  /** Path the redacted text was written to; left undefined for dry runs. */
  output_path?: string;
  /** Number of analyzer results reported by the Python script. */
  entities_found?: number;
  /** Redaction count reported by the Python script (it reports the same value as `entities_found`). */
  entities_redacted?: number;
  /** Redacted text, included only for dry runs. */
  redacted_text?: string;
  /** Human-readable failure description. */
  error?: string;
  /** Set to `true` when the Presidio virtualenv interpreter is missing. */
  init_required?: boolean;
}
/** Command-line options after parsing. */
interface ParsedArgs {
  /** Value of `--file`: the document to redact. */
  file?: string;
  /** Value of `--output`: explicit destination for the redacted text. */
  output?: string;
  /** `--dry-run`: return the redacted text in the result instead of writing a file. */
  dryRun: boolean;
  /** Value of `--entities`, split on commas and trimmed. */
  entities?: string[];
  /** `--mask`: overwrite matches with `*` characters instead of `<ENTITY_TYPE>` placeholders. */
  mask: boolean;
}
/**
 * Parse the command-line flags understood by this script.
 *
 * Value flags (`--file`, `--output`, `--entities`) take their value either
 * from the following argument (`--file a.md`) or inline (`--file=a.md`).
 * A value flag whose value is missing or empty is skipped, and unrecognised
 * arguments are ignored. Empty names in the `--entities` list (for example
 * from a trailing comma) are dropped.
 *
 * @param args Command-line arguments without the runtime and script path.
 * @returns The parsed options; `dryRun` and `mask` default to `false`.
 */
function parseArgs(args: string[]): ParsedArgs {
  const result: ParsedArgs = { dryRun: false, mask: false };
  for (let i = 0; i < args.length; i++) {
    const arg = args[i];
    if (arg === undefined) {
      continue;
    }
    if (arg === '--dry-run') {
      result.dryRun = true;
      continue;
    }
    if (arg === '--mask') {
      result.mask = true;
      continue;
    }

    // Split "--flag=value"; a bare "--flag" takes its value from the next argument.
    const eq = arg.indexOf('=');
    const flag = eq === -1 ? arg : arg.slice(0, eq);
    if (flag !== '--file' && flag !== '--output' && flag !== '--entities') {
      continue;
    }
    const value = eq === -1 ? args[i + 1] : arg.slice(eq + 1);
    if (!value) {
      // No usable value: leave the option unset and do not consume the next argument.
      continue;
    }
    if (eq === -1) {
      i++;
    }

    if (flag === '--file') {
      result.file = value;
    } else if (flag === '--output') {
      result.output = value;
    } else {
      result.entities = value
        .split(',')
        .map(e => e.trim())
        .filter(e => e !== '');
    }
  }
  return result;
}
/**
 * Derive the default destination for redacted output: the input path with
 * `-redacted` inserted before the extension, in the same directory.
 *
 * @param filePath Path of the input file.
 * @returns Sibling path such as `notes-redacted.md` for `notes.md`.
 */
function defaultOutputPath(filePath: string): string {
  const extension = extname(filePath);
  const stem = basename(filePath, extension);
  const redactedName = stem + '-redacted' + extension;
  return join(dirname(filePath), redactedName);
}
/**
 * Check the requested entity types against the allowed name pattern and the
 * supported set.
 *
 * @param entities Entity names from `--entities`, if any.
 * @returns An error message for the first unacceptable name, or `undefined`
 *   when the list is absent, empty, or entirely valid.
 */
function validateEntities(entities: string[] | undefined): string | undefined {
  if (!entities || entities.length === 0) {
    return undefined;
  }
  // First name, in input order, that fails either check.
  const offender = entities.find(
    name => !ENTITY_NAME_PATTERN.test(name) || !SUPPORTED_ENTITIES.has(name),
  );
  if (offender === undefined) {
    return undefined;
  }
  // The pattern check takes precedence, so a malformed name reports the pattern.
  return ENTITY_NAME_PATTERN.test(offender)
    ? `Unsupported entity type: ${offender}`
    : `Invalid entity type: ${offender}. Allowed pattern: ${ENTITY_NAME_PATTERN.source}`;
}
/**
 * Execute a shell command synchronously and capture its output.
 *
 * @param cmd Command line passed to `execSync`.
 * @returns `ok: true` with the captured stdout on success; otherwise
 *   `ok: false` with whatever stdout the failure carried and, as `stderr`,
 *   the failure's stderr, its message, or `'Unknown error'`.
 */
function run(cmd: string): { ok: boolean; stdout: string; stderr: string } {
  let captured: string;
  try {
    captured = execSync(cmd, {
      maxBuffer: MAX_BUFFER_BYTES,
      stdio: ['pipe', 'pipe', 'pipe'],
      encoding: 'utf-8',
    });
  } catch (err: unknown) {
    const failure = err as { stdout?: string; stderr?: string; message?: string };
    const diagnostics = failure.stderr || failure.message || 'Unknown error';
    return { ok: false, stdout: failure.stdout || '', stderr: diagnostics };
  }
  return { ok: true, stdout: captured, stderr: '' };
}
/** Shape of the JSON object printed by the generated Python script. */
interface PresidioOutput {
  redacted_text: string;
  entities_found: number;
  entities_redacted: number;
}

/** Runtime check that parsed script output has the fields this module reads. */
function isPresidioOutput(value: unknown): value is PresidioOutput {
  if (typeof value !== 'object' || value === null) {
    return false;
  }
  const candidate = value as Record<string, unknown>;
  return (
    typeof candidate.redacted_text === 'string' &&
    typeof candidate.entities_found === 'number' &&
    typeof candidate.entities_redacted === 'number'
  );
}

/** Turn a caught value into a printable message without assuming it is an Error. */
function describeError(err: unknown): string {
  return err instanceof Error ? err.message : String(err);
}

/** Render the one-off Python program that analyzes and redacts `sourceText`. */
function buildPresidioScript(sourceText: string, parsed: ParsedArgs): string {
  // Entity names are interpolated into Python source; the caller validates
  // them against the supported set before this function runs.
  const entitiesArg = parsed.entities && parsed.entities.length > 0
    ? `entities=[${parsed.entities.map(e => `"${e}"`).join(', ')}]`
    : '';
  // The input text is embedded as a double-quoted literal produced by JSON.stringify.
  return `
import sys, json
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import OperatorConfig
analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()
text = ${JSON.stringify(sourceText)}
# Analyze
results = analyzer.analyze(text=text, language='en'${entitiesArg ? ', ' + entitiesArg : ''})
entities_found = len(results)
# Redact
if ${parsed.mask ? 'True' : 'False'}:
    redacted_text = text
    for r in sorted(results, key=lambda e: e.start, reverse=True):
        replacement = '*' * (r.end - r.start)
        redacted_text = redacted_text[:r.start] + replacement + redacted_text[r.end:]
else:
    operators = {r.entity_type: OperatorConfig('replace', {'new_value': f'<{r.entity_type}>'}) for r in results}
    anonymized = anonymizer.anonymize(text=text, analyzer_results=results, operators=operators)
    redacted_text = anonymized.text
print(json.dumps({
    'redacted_text': redacted_text,
    'entities_found': entities_found,
    'entities_redacted': entities_found
}))
`.trim();
}

/**
 * Redact PII in the file named by `parsed.file` by running a generated Python
 * script with the interpreter from the Presidio virtualenv.
 *
 * The script is written to a temporary file, executed, and removed again.
 * Unless `parsed.dryRun` is set, the redacted text is written to
 * `parsed.output` or, by default, to a `-redacted` sibling of the input.
 *
 * @param parsed Parsed command-line options.
 * @returns A result object; failures are reported through `success: false`
 *   and `error` rather than by throwing. `init_required` is set when the
 *   virtualenv interpreter is missing.
 */
function presidioRedact(parsed: ParsedArgs): RedactResult {
  // Validate venv
  if (!existsSync(VENV_PYTHON)) {
    return {
      success: false,
      error: 'Presidio venv not found. Run presidio-init.ts first.',
      init_required: true,
    };
  }
  // Validate required args
  if (!parsed.file) {
    return { success: false, error: '--file is required.' };
  }
  if (!existsSync(parsed.file)) {
    return { success: false, error: `File not found: ${parsed.file}` };
  }
  const entitiesError = validateEntities(parsed.entities);
  if (entitiesError) {
    return { success: false, error: entitiesError };
  }

  let sourceText: string;
  try {
    sourceText = readFileSync(parsed.file, 'utf-8');
  } catch (err: unknown) {
    return { success: false, error: `Failed to read file: ${describeError(err)}` };
  }

  // The temp script embeds the full un-redacted input, so it must not be
  // readable by other users or collide with a concurrent run:
  //  - the name combines time, pid and a random suffix;
  //  - 'wx' creates the file exclusively and fails if the path already exists;
  //  - mode 0o600 restricts access to the current user.
  const tmpDir = tmpdir();
  const uniqueSuffix = `${Date.now()}-${process.pid}-${Math.random().toString(36).slice(2)}`;
  const tmpScript = join(tmpDir, `pii-redact-${uniqueSuffix}.py`);
  try {
    mkdirSync(tmpDir, { recursive: true });
    writeFileSync(tmpScript, buildPresidioScript(sourceText, parsed), {
      encoding: 'utf-8',
      mode: 0o600,
      flag: 'wx',
    });
  } catch (err: unknown) {
    return { success: false, error: `Failed to write temp script: ${describeError(err)}` };
  }

  try {
    const result = run(`"${VENV_PYTHON}" "${tmpScript}"`);
    if (!result.ok) {
      return {
        success: false,
        error: `Presidio redaction failed: ${result.stderr}`,
      };
    }

    // Parse, then verify the shape before trusting any field.
    let pyResult: unknown;
    try {
      pyResult = JSON.parse(result.stdout.trim());
    } catch {
      pyResult = undefined;
    }
    if (!isPresidioOutput(pyResult)) {
      return { success: false, error: `Failed to parse Presidio output: ${result.stdout}` };
    }

    const outputPath = parsed.dryRun
      ? undefined
      : (parsed.output || defaultOutputPath(parsed.file));
    // Write output file unless dry run
    if (!parsed.dryRun && outputPath) {
      try {
        writeFileSync(outputPath, pyResult.redacted_text, 'utf-8');
      } catch (err: unknown) {
        return { success: false, error: `Failed to write output file: ${describeError(err)}` };
      }
    }

    const baseResult: RedactResult = {
      success: true,
      dry_run: parsed.dryRun,
      original_path: parsed.file,
      output_path: outputPath,
      entities_found: pyResult.entities_found,
      entities_redacted: pyResult.entities_redacted,
    };
    if (parsed.dryRun) {
      baseResult.redacted_text = pyResult.redacted_text;
    }
    return baseResult;
  } finally {
    // Always remove the temp script; it holds the un-redacted text.
    try {
      unlinkSync(tmpScript);
    } catch {
      // Best-effort cleanup: a failure here must not mask the real result.
    }
  }
}
// Entry point: parse the CLI flags, run the redaction, and print the result
// as pretty-printed JSON. A failed run exits with status 1.
const cliOptions = parseArgs(process.argv.slice(2));
const outcome = presidioRedact(cliOptions);
console.log(JSON.stringify(outcome, null, 2));
if (!outcome.success) {
  process.exit(1);
}