n8n-nodes-redactor 3.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. package/LICENSE +42 -0
  2. package/README.dev.md +153 -0
  3. package/README.md +443 -0
  4. package/README.npm.md +443 -0
  5. package/dist/nodes/PiiRedactor/PiiRedactor.node.d.ts +5 -0
  6. package/dist/nodes/PiiRedactor/PiiRedactor.node.js +1093 -0
  7. package/dist/nodes/PiiRedactor/__tests__/encryption.test.d.ts +1 -0
  8. package/dist/nodes/PiiRedactor/__tests__/encryption.test.js +200 -0
  9. package/dist/nodes/PiiRedactor/__tests__/engine.test.d.ts +1 -0
  10. package/dist/nodes/PiiRedactor/__tests__/engine.test.js +524 -0
  11. package/dist/nodes/PiiRedactor/__tests__/operations.test.d.ts +1 -0
  12. package/dist/nodes/PiiRedactor/__tests__/operations.test.js +316 -0
  13. package/dist/nodes/PiiRedactor/__tests__/patterns-global.test.d.ts +1 -0
  14. package/dist/nodes/PiiRedactor/__tests__/patterns-global.test.js +427 -0
  15. package/dist/nodes/PiiRedactor/__tests__/patterns.test.d.ts +1 -0
  16. package/dist/nodes/PiiRedactor/__tests__/patterns.test.js +481 -0
  17. package/dist/nodes/PiiRedactor/__tests__/phase1.test.d.ts +1 -0
  18. package/dist/nodes/PiiRedactor/__tests__/phase1.test.js +343 -0
  19. package/dist/nodes/PiiRedactor/__tests__/phase3.test.d.ts +1 -0
  20. package/dist/nodes/PiiRedactor/__tests__/phase3.test.js +275 -0
  21. package/dist/nodes/PiiRedactor/__tests__/phase4.test.d.ts +1 -0
  22. package/dist/nodes/PiiRedactor/__tests__/phase4.test.js +184 -0
  23. package/dist/nodes/PiiRedactor/__tests__/presidio.test.d.ts +1 -0
  24. package/dist/nodes/PiiRedactor/__tests__/presidio.test.js +170 -0
  25. package/dist/nodes/PiiRedactor/__tests__/security.test.d.ts +1 -0
  26. package/dist/nodes/PiiRedactor/__tests__/security.test.js +178 -0
  27. package/dist/nodes/PiiRedactor/__tests__/semantic.test.d.ts +1 -0
  28. package/dist/nodes/PiiRedactor/__tests__/semantic.test.js +319 -0
  29. package/dist/nodes/PiiRedactor/__tests__/vault.test.d.ts +1 -0
  30. package/dist/nodes/PiiRedactor/__tests__/vault.test.js +247 -0
  31. package/dist/nodes/PiiRedactor/audit.d.ts +48 -0
  32. package/dist/nodes/PiiRedactor/audit.js +192 -0
  33. package/dist/nodes/PiiRedactor/classification.d.ts +33 -0
  34. package/dist/nodes/PiiRedactor/classification.js +118 -0
  35. package/dist/nodes/PiiRedactor/context.d.ts +57 -0
  36. package/dist/nodes/PiiRedactor/context.js +260 -0
  37. package/dist/nodes/PiiRedactor/encryption.d.ts +45 -0
  38. package/dist/nodes/PiiRedactor/encryption.js +158 -0
  39. package/dist/nodes/PiiRedactor/engine.d.ts +23 -0
  40. package/dist/nodes/PiiRedactor/engine.js +888 -0
  41. package/dist/nodes/PiiRedactor/injection.d.ts +46 -0
  42. package/dist/nodes/PiiRedactor/injection.js +425 -0
  43. package/dist/nodes/PiiRedactor/names.d.ts +25 -0
  44. package/dist/nodes/PiiRedactor/names.js +188 -0
  45. package/dist/nodes/PiiRedactor/patterns.d.ts +17 -0
  46. package/dist/nodes/PiiRedactor/patterns.js +1742 -0
  47. package/dist/nodes/PiiRedactor/presidio.d.ts +77 -0
  48. package/dist/nodes/PiiRedactor/presidio.js +264 -0
  49. package/dist/nodes/PiiRedactor/profiles.d.ts +47 -0
  50. package/dist/nodes/PiiRedactor/profiles.js +139 -0
  51. package/dist/nodes/PiiRedactor/pseudonymize.d.ts +20 -0
  52. package/dist/nodes/PiiRedactor/pseudonymize.js +203 -0
  53. package/dist/nodes/PiiRedactor/redact.png +0 -0
  54. package/dist/nodes/PiiRedactor/redact.svg +3 -0
  55. package/dist/nodes/PiiRedactor/ropa.d.ts +63 -0
  56. package/dist/nodes/PiiRedactor/ropa.js +70 -0
  57. package/dist/nodes/PiiRedactor/types.d.ts +82 -0
  58. package/dist/nodes/PiiRedactor/types.js +3 -0
  59. package/dist/nodes/PiiRedactor/vault.d.ts +61 -0
  60. package/dist/nodes/PiiRedactor/vault.js +352 -0
  61. package/package.json +87 -0
@@ -0,0 +1,192 @@
1
+ "use strict";
2
+ /**
3
+ * Persistent Audit Log
4
+ *
5
+ * JSONL (JSON Lines) audit logger for compliance recording.
6
+ * One JSON object per line, append-only, auto-rotating.
7
+ *
8
+ * Compliant with:
9
+ * - GDPR Article 30 (Records of Processing Activities)
10
+ * - HIPAA (6-year retention)
11
+ * - SOX (7-year retention)
12
+ * - PCI DSS (1-year minimum)
13
+ *
14
+ * Format: JSONL (compatible with Splunk, ELK, Datadog, CloudWatch)
15
+ * Location: ~/.n8n/pii-audit/pii-audit-YYYY-MM-DD.jsonl
16
+ * Rotation: Daily + 100MB size limit
17
+ */
18
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
19
+ if (k2 === undefined) k2 = k;
20
+ var desc = Object.getOwnPropertyDescriptor(m, k);
21
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
22
+ desc = { enumerable: true, get: function() { return m[k]; } };
23
+ }
24
+ Object.defineProperty(o, k2, desc);
25
+ }) : (function(o, m, k, k2) {
26
+ if (k2 === undefined) k2 = k;
27
+ o[k2] = m[k];
28
+ }));
29
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
30
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
31
+ }) : function(o, v) {
32
+ o["default"] = v;
33
+ });
34
+ var __importStar = (this && this.__importStar) || (function () {
35
+ var ownKeys = function(o) {
36
+ ownKeys = Object.getOwnPropertyNames || function (o) {
37
+ var ar = [];
38
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
39
+ return ar;
40
+ };
41
+ return ownKeys(o);
42
+ };
43
+ return function (mod) {
44
+ if (mod && mod.__esModule) return mod;
45
+ var result = {};
46
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
47
+ __setModuleDefault(result, mod);
48
+ return result;
49
+ };
50
+ })();
51
+ Object.defineProperty(exports, "__esModule", { value: true });
52
+ exports.writeAuditLog = writeAuditLog;
53
+ exports.readAuditLog = readAuditLog;
54
+ const fs = __importStar(require("fs"));
55
+ const path = __importStar(require("path"));
56
/** Size in bytes (100 MiB) at which writeAuditLog rotates the current log file. */
const MAX_FILE_SIZE = 100 * 1024 * 1024;
/**
 * Directory used when the caller does not pass `auditDir`.
 * HOME is preferred; USERPROFILE covers Windows, where HOME is usually unset;
 * '/tmp' remains the last-resort fallback.
 */
const DEFAULT_AUDIT_DIR = path.join(process.env.HOME || process.env.USERPROFILE || '/tmp', '.n8n', 'pii-audit');
58
/**
 * Generate a UUID v4 string to identify one audit event.
 *
 * Uses the runtime's `crypto.randomUUID()` when it is available so IDs come
 * from a cryptographically secure source. Falls back to the Math.random-based
 * generator on runtimes that do not expose a global `crypto`.
 *
 * @returns {string} Lower-case UUID in 8-4-4-4-12 hex form.
 */
function generateEventId() {
    const webCrypto = globalThis.crypto;
    if (webCrypto && typeof webCrypto.randomUUID === 'function') {
        return webCrypto.randomUUID();
    }
    // Fallback: 16 pseudo-random bytes with the version/variant bits forced.
    const bytes = Array.from({ length: 16 }, () => Math.floor(Math.random() * 256));
    bytes[6] = (bytes[6] & 0x0f) | 0x40; // version 4
    bytes[8] = (bytes[8] & 0x3f) | 0x80; // RFC 4122 variant
    const hex = bytes.map((b) => b.toString(16).padStart(2, '0')).join('');
    return `${hex.slice(0, 8)}-${hex.slice(8, 12)}-${hex.slice(12, 16)}-${hex.slice(16, 20)}-${hex.slice(20)}`;
}
68
/**
 * Build the path of the audit log file for the current UTC date.
 *
 * @param {string} auditDir Directory that holds the audit logs.
 * @returns {string} `<auditDir>/pii-audit-YYYY-MM-DD.jsonl`
 */
function getLogFilePath(auditDir) {
    // toISOString() is always UTC; the part before 'T' is YYYY-MM-DD.
    const [datePart] = new Date().toISOString().split('T');
    const fileName = `pii-audit-${datePart}.jsonl`;
    return path.join(auditDir, fileName);
}
75
/**
 * Map a classification level and hit count to an audit severity.
 *
 * Tiers are checked from most to least severe; a tier applies when either
 * its classification level matches or the hit count reaches its threshold.
 *
 * @param {string} [classificationLevel] Sensitivity level of the processed data.
 * @param {number} [totalHits=0] Number of detections.
 * @returns {'CRITICAL'|'HIGH'|'MEDIUM'|'LOW'}
 */
function determineSeverity(classificationLevel, totalHits = 0) {
    const tiers = [
        ['CRITICAL', 'RESTRICTED', 50],
        ['HIGH', 'CONFIDENTIAL', 20],
        ['MEDIUM', 'INTERNAL', 5],
    ];
    for (const [severity, level, minHits] of tiers) {
        if (classificationLevel === level || totalHits >= minHits) {
            return severity;
        }
    }
    return 'LOW';
}
87
/**
 * Append one audit record (a single JSON line) to today's audit log.
 *
 * Creates the audit directory when missing, tallies hits per category and
 * per pattern label, and renames the current file to `<file>.<n>` once it
 * has reached MAX_FILE_SIZE. Best-effort by design: every failure is
 * swallowed so audit logging can never break the calling workflow.
 *
 * @param action Recorded verbatim in the entry.
 * @param mode Recorded verbatim in the entry.
 * @param hits Detections; each contributes its `category` and `patternLabel`.
 * @param inputItemCount Recorded verbatim in the entry.
 * @param processingTimeMs Recorded verbatim in the entry.
 * @param patternsUsed Recorded verbatim in the entry.
 * @param presidioEnabled Recorded verbatim in the entry.
 * @param classification Optional; only its `level` is recorded.
 * @param sessionId Recorded verbatim in the entry.
 * @param auditDir Optional target directory; defaults to DEFAULT_AUDIT_DIR.
 */
function writeAuditLog(action, mode, hits, inputItemCount, processingTimeMs, patternsUsed, presidioEnabled, classification, sessionId, auditDir) {
    try {
        const targetDir = auditDir || DEFAULT_AUDIT_DIR;
        if (!fs.existsSync(targetDir)) {
            fs.mkdirSync(targetDir, { recursive: true });
        }
        // Tally detections by category and by pattern label.
        const hitsByCategory = {};
        const hitsByPattern = {};
        const bump = (table, key) => {
            table[key] = (table[key] || 0) + 1;
        };
        for (const hit of hits) {
            bump(hitsByCategory, hit.category);
            bump(hitsByPattern, hit.patternLabel);
        }
        const totalHits = hits.length;
        const classificationLevel = classification?.level;
        // Key order is kept stable because it is the on-disk field order.
        const entry = {
            version: '1.0',
            timestamp: new Date().toISOString(),
            eventId: generateEventId(),
            severity: determineSeverity(classificationLevel, totalHits),
            action,
            mode,
            status: 'SUCCESS',
            totalHits,
            hitsByCategory,
            hitsByPattern,
            classificationLevel,
            inputItemCount,
            processingTimeMs,
            patternsUsed,
            presidioEnabled,
            sessionId,
        };
        const logFile = getLogFilePath(targetDir);
        try {
            const needsRotation = fs.existsSync(logFile) && fs.statSync(logFile).size >= MAX_FILE_SIZE;
            if (needsRotation) {
                // Pick the first unused numeric suffix for the rotated file.
                let suffix = 1;
                while (fs.existsSync(`${logFile}.${suffix}`)) {
                    suffix += 1;
                }
                fs.renameSync(logFile, `${logFile}.${suffix}`);
            }
        }
        catch {
            // Rotation is optional: keep appending to the current file.
        }
        fs.appendFileSync(logFile, `${JSON.stringify(entry)}\n`, 'utf-8');
    }
    catch {
        // Deliberately silent: the redaction has already succeeded and audit
        // logging must never crash the workflow.
    }
}
151
/**
 * Read audit log entries, optionally restricted to a date range.
 *
 * Reads the live daily files (`pii-audit-<date>.jsonl`) and the size-rotated
 * siblings that writeAuditLog produces (`pii-audit-<date>.jsonl.<n>`).
 * Entries come back in chronological order: by date, then rotated files in
 * ascending sequence, then the live file for that date.
 *
 * @param {string} [startDate] Inclusive lower bound, compared as a string
 *   against the date part of the file name (YYYY-MM-DD).
 * @param {string} [endDate] Inclusive upper bound, compared the same way.
 * @param {string} [auditDir] Directory to read; defaults to DEFAULT_AUDIT_DIR.
 * @returns {Array} Parsed entries. Malformed lines and unreadable files are
 *   skipped; any other failure yields an empty array.
 */
function readAuditLog(startDate, endDate, auditDir) {
    try {
        const dir = auditDir || DEFAULT_AUDIT_DIR;
        if (!fs.existsSync(dir))
            return [];
        // Group 1: date part of the name. Group 2: rotation sequence, if any.
        const namePattern = /^pii-audit-(.*?)\.jsonl(?:\.(\d+))?$/;
        // The live file is the newest for its date, so it sorts after every rotated one.
        const LIVE_FILE_SEQ = Number.MAX_SAFE_INTEGER;
        const files = [];
        for (const name of fs.readdirSync(dir)) {
            const match = namePattern.exec(name);
            if (!match)
                continue;
            const date = match[1];
            if (startDate && date < startDate)
                continue;
            if (endDate && date > endDate)
                continue;
            files.push({
                name,
                date,
                seq: match[2] === undefined ? LIVE_FILE_SEQ : Number(match[2]),
            });
        }
        files.sort((a, b) => {
            if (a.date !== b.date)
                return a.date < b.date ? -1 : 1;
            if (a.seq !== b.seq)
                return a.seq - b.seq;
            return a.name < b.name ? -1 : a.name > b.name ? 1 : 0;
        });
        const entries = [];
        for (const file of files) {
            try {
                const content = fs.readFileSync(path.join(dir, file.name), 'utf-8');
                for (const line of content.split('\n')) {
                    if (line.trim().length === 0)
                        continue;
                    try {
                        entries.push(JSON.parse(line));
                    }
                    catch {
                        // Skip malformed lines
                    }
                }
            }
            catch {
                // Skip unreadable files
            }
        }
        return entries;
    }
    catch {
        return [];
    }
}
@@ -0,0 +1,33 @@
1
+ /**
2
+ * Data Classification Engine
3
+ *
4
+ * Assigns sensitivity labels to data based on what PII categories were found.
5
+ * Modeled on Microsoft Purview and Google Cloud DLP classification taxonomy.
6
+ *
7
+ * Levels:
8
+ * PUBLIC (0) - No PII detected
9
+ * INTERNAL (1) - Low-sensitivity PII (network, location, vehicle, temporal)
10
+ * CONFIDENTIAL (2) - Personal PII (names, emails, phones, addresses, identity docs)
11
+ * RESTRICTED (3) - High-sensitivity PII (financial, medical, biometric, enterprise secrets)
12
+ *
13
+ * Escalation rules (based on Purview instance-count thresholds):
14
+ * - Any RESTRICTED-category hit -> RESTRICTED
15
+ * - 10+ CONFIDENTIAL hits -> escalate to RESTRICTED
16
+ * - 5+ INTERNAL hits -> escalate to CONFIDENTIAL
17
+ */
18
+ import { PiiCategory, RedactionHit } from './types';
19
/** Sensitivity labels, listed from least (PUBLIC) to most (RESTRICTED) sensitive. */
export type SensitivityLevel = 'PUBLIC' | 'INTERNAL' | 'CONFIDENTIAL' | 'RESTRICTED';
/** Result of classifying a set of PII hits. */
export interface ClassificationLabel {
    /** Final sensitivity level, after any escalation. */
    level: SensitivityLevel;
    /** Numeric form of `level`: PUBLIC=0, INTERNAL=1, CONFIDENTIAL=2, RESTRICTED=3. */
    numericLevel: 0 | 1 | 2 | 3;
    /** Human-readable summary of the per-category hit counts. */
    reason: string;
    /** Category with the most hits, or 'none' when there were no hits. */
    topCategory: PiiCategory | 'none';
    /** Number of hits whose category maps to each base sensitivity level. */
    hitCountByLevel: Record<SensitivityLevel, number>;
    /** True when `level` was raised by a volume-based escalation rule. */
    escalated: boolean;
    /** Total number of hits that were classified. */
    totalHits: number;
    /** Mean hit confidence rounded to two decimals; 0 when there were no hits. */
    averageConfidence: number;
}
/**
 * Classify data sensitivity based on detected PII hits.
 *
 * @param hits Detections to classify; an empty array yields a PUBLIC label.
 * @returns The classification label for the given hits.
 */
export declare function classifyData(hits: RedactionHit[]): ClassificationLabel;
@@ -0,0 +1,118 @@
1
+ "use strict";
2
+ /**
3
+ * Data Classification Engine
4
+ *
5
+ * Assigns sensitivity labels to data based on what PII categories were found.
6
+ * Modeled on Microsoft Purview and Google Cloud DLP classification taxonomy.
7
+ *
8
+ * Levels:
9
+ * PUBLIC (0) - No PII detected
10
+ * INTERNAL (1) - Low-sensitivity PII (network, location, vehicle, temporal)
11
+ * CONFIDENTIAL (2) - Personal PII (names, emails, phones, addresses, identity docs)
12
+ * RESTRICTED (3) - High-sensitivity PII (financial, medical, biometric, enterprise secrets)
13
+ *
14
+ * Escalation rules (based on Purview instance-count thresholds):
15
+ * - Any RESTRICTED-category hit -> RESTRICTED
16
+ * - 10+ CONFIDENTIAL hits -> escalate to RESTRICTED
17
+ * - 5+ INTERNAL hits -> escalate to CONFIDENTIAL
18
+ */
19
+ Object.defineProperty(exports, "__esModule", { value: true });
20
+ exports.classifyData = classifyData;
21
/** Map PII categories to base sensitivity levels */
const CATEGORY_SENSITIVITY = {
    // RESTRICTED (3) - highest sensitivity
    financial: 'RESTRICTED',
    medical: 'RESTRICTED',
    biometric: 'RESTRICTED',
    enterprise: 'RESTRICTED',
    crypto: 'RESTRICTED',
    // CONFIDENTIAL (2) - personal data
    identity: 'CONFIDENTIAL',
    contact: 'CONFIDENTIAL',
    // INTERNAL (1) - low sensitivity
    network: 'INTERNAL',
    location: 'INTERNAL',
    vehicle: 'INTERNAL',
    temporal: 'INTERNAL',
    other: 'INTERNAL',
};
/** Numeric rank of each sensitivity level. */
const LEVEL_NUMERIC = {
    PUBLIC: 0,
    INTERNAL: 1,
    CONFIDENTIAL: 2,
    RESTRICTED: 3,
};
/**
 * Classify data sensitivity based on detected PII hits.
 *
 * The base level is the highest level among the hit categories. Volume
 * escalation then applies: 10+ CONFIDENTIAL hits raise CONFIDENTIAL to
 * RESTRICTED, and 5+ INTERNAL hits raise INTERNAL to CONFIDENTIAL.
 * Categories missing from CATEGORY_SENSITIVITY count as INTERNAL.
 *
 * @param hits Detections; each is read for `category` and `confidence`.
 *   A missing (falsy) confidence counts as 0.65 in the average.
 * @returns Label with level, numeric level, reason text, top category,
 *   per-level hit counts, escalation flag, total hits and the average
 *   confidence rounded to two decimals.
 */
function classifyData(hits) {
    if (hits.length === 0) {
        return {
            level: 'PUBLIC',
            numericLevel: 0,
            reason: 'No sensitive data detected.',
            topCategory: 'none',
            hitCountByLevel: { PUBLIC: 0, INTERNAL: 0, CONFIDENTIAL: 0, RESTRICTED: 0 },
            escalated: false,
            totalHits: 0,
            averageConfidence: 0,
        };
    }
    // Count hits by sensitivity level
    const hitCountByLevel = {
        PUBLIC: 0,
        INTERNAL: 0,
        CONFIDENTIAL: 0,
        RESTRICTED: 0,
    };
    // Null-prototype tally so a category named like an Object.prototype member
    // (e.g. "__proto__", "constructor") is counted like any other key.
    const categoryCounts = Object.create(null);
    let totalConfidence = 0;
    for (const hit of hits) {
        // Own-property lookup: inherited members must not pass as a level.
        const catLevel = Object.prototype.hasOwnProperty.call(CATEGORY_SENSITIVITY, hit.category)
            ? CATEGORY_SENSITIVITY[hit.category]
            : 'INTERNAL';
        hitCountByLevel[catLevel]++;
        categoryCounts[hit.category] = (categoryCounts[hit.category] || 0) + 1;
        totalConfidence += hit.confidence || 0.65;
    }
    // Determine base level from highest-sensitivity hit
    let level = 'PUBLIC';
    if (hitCountByLevel.RESTRICTED > 0)
        level = 'RESTRICTED';
    else if (hitCountByLevel.CONFIDENTIAL > 0)
        level = 'CONFIDENTIAL';
    else if (hitCountByLevel.INTERNAL > 0)
        level = 'INTERNAL';
    // Escalation rules
    let escalated = false;
    // 10+ CONFIDENTIAL hits -> RESTRICTED
    if (level === 'CONFIDENTIAL' && hitCountByLevel.CONFIDENTIAL >= 10) {
        level = 'RESTRICTED';
        escalated = true;
    }
    // 5+ INTERNAL hits -> CONFIDENTIAL
    if (level === 'INTERNAL' && hitCountByLevel.INTERNAL >= 5) {
        level = 'CONFIDENTIAL';
        escalated = true;
    }
    // Rank categories once, most frequent first; reused for top category and reason.
    const rankedCategories = Object.entries(categoryCounts)
        .sort((a, b) => b[1] - a[1]);
    const topCategory = rankedCategories[0][0];
    // Build reason string
    const reasonParts = rankedCategories.map(([cat, count]) => `${count} ${cat}`);
    let reason = `Contains: ${reasonParts.join(', ')}.`;
    if (escalated) {
        reason += ` Escalated due to high volume of detections.`;
    }
    return {
        level,
        numericLevel: LEVEL_NUMERIC[level],
        reason,
        topCategory,
        hitCountByLevel,
        escalated,
        totalHits: hits.length,
        averageConfidence: Math.round((totalConfidence / hits.length) * 100) / 100,
    };
}
@@ -0,0 +1,57 @@
1
+ /**
2
+ * Context words and confidence scoring for PII patterns.
3
+ *
4
+ * Context words are keywords that appear near a PII match and boost confidence.
5
+ * For example, "SSN" appearing near "123-45-6789" adds CONTEXT_BOOST (0.20) to the ssn base score of 0.85, capped at 1.0.
6
+ *
7
+ * Base confidence scores reflect how reliable a pattern is:
8
+ * - 0.95: Checksum validated (Luhn, IBAN, PESEL, etc.)
9
+ * - 0.90: Semantic field-name match
10
+ * - 0.85: Context-aware ambiguous field
11
+ * - 0.80: Regex match WITH context words nearby
12
+ * - 0.60: Regex match WITHOUT context words (bare pattern)
13
+ * - 0.70: Custom pattern match
14
+ */
15
/** Context words by pattern name. Case-insensitive matching. */
export declare const CONTEXT_WORDS: Record<string, string[]>;
/**
 * Base confidence scores by pattern name.
 * Patterns with checksum validation get higher scores.
 */
export declare const BASE_CONFIDENCE: Record<string, number>;
/**
 * Default base confidence for patterns not in the BASE_CONFIDENCE map.
 */
export declare const DEFAULT_BASE_CONFIDENCE = 0.65;
/**
 * Amount added to the confidence when context words are found nearby.
 */
export declare const CONTEXT_BOOST = 0.2;
/**
 * Confidence score for semantic field-name matches.
 */
export declare const SEMANTIC_CONFIDENCE = 0.9;
/**
 * Confidence score for context-aware ambiguous field matches.
 */
export declare const AMBIGUOUS_FIELD_CONFIDENCE = 0.85;
/**
 * Confidence score for custom pattern matches.
 */
export declare const CUSTOM_PATTERN_CONFIDENCE = 0.7;
/**
 * Confidence score for deny list matches.
 */
export declare const DENY_LIST_CONFIDENCE = 1;
/**
 * Window size (characters) to search for context words around a match.
 */
export declare const CONTEXT_WINDOW = 80;
/**
 * Check if any context words appear near a match position in the text.
 *
 * @param text Full text that contains the match.
 * @param matchStart Start offset of the match within `text`.
 * @param matchEnd End offset of the match within `text`.
 * @param words Context words to look for.
 * @returns True when at least one word is found near the match.
 */
export declare function hasContextWords(text: string, matchStart: number, matchEnd: number, words: string[]): boolean;
/**
 * Calculate confidence score for a pattern match.
 *
 * @param patternName Pattern key used to look up base confidence and context words.
 * @param hasValidator Whether the pattern has a validator.
 * @param validatorPassed Whether that validator accepted the match.
 * @param text Full text that contains the match.
 * @param matchStart Start offset of the match within `text`.
 * @param matchEnd End offset of the match within `text`.
 * @returns The confidence score for the match.
 */
export declare function calculateConfidence(patternName: string, hasValidator: boolean, validatorPassed: boolean, text: string, matchStart: number, matchEnd: number): number;
@@ -0,0 +1,260 @@
1
+ "use strict";
2
+ /**
3
+ * Context words and confidence scoring for PII patterns.
4
+ *
5
+ * Context words are keywords that appear near a PII match and boost confidence.
6
+ * For example, "SSN" appearing near "123-45-6789" adds CONTEXT_BOOST (0.20) to the ssn base score of 0.85, capped at 1.0.
7
+ *
8
+ * Base confidence scores reflect how reliable a pattern is:
9
+ * - 0.95: Checksum validated (Luhn, IBAN, PESEL, etc.)
10
+ * - 0.90: Semantic field-name match
11
+ * - 0.85: Context-aware ambiguous field
12
+ * - 0.80: Regex match WITH context words nearby
13
+ * - 0.60: Regex match WITHOUT context words (bare pattern)
14
+ * - 0.70: Custom pattern match
15
+ */
16
+ Object.defineProperty(exports, "__esModule", { value: true });
17
+ exports.CONTEXT_WINDOW = exports.DENY_LIST_CONFIDENCE = exports.CUSTOM_PATTERN_CONFIDENCE = exports.AMBIGUOUS_FIELD_CONFIDENCE = exports.SEMANTIC_CONFIDENCE = exports.CONTEXT_BOOST = exports.DEFAULT_BASE_CONFIDENCE = exports.BASE_CONFIDENCE = exports.CONTEXT_WORDS = void 0;
18
+ exports.hasContextWords = hasContextWords;
19
+ exports.calculateConfidence = calculateConfidence;
20
/**
 * Context words by pattern name. Case-insensitive matching.
 *
 * Keys are pattern names; calculateConfidence looks a pattern up here by name.
 * hasContextWords lower-cases both sides and uses a plain substring test, so
 * short entries such as 'IP', 'CF' or 'IC' also match inside longer words.
 */
exports.CONTEXT_WORDS = {
    // Contact
    email: ['email', 'e-mail', 'mail', 'contact', 'reach', 'send', 'Kontakt', 'correo', 'posta'],
    phone: ['phone', 'call', 'mobile', 'cell', 'tel', 'telephone', 'dial', 'ring', 'Telefon', 'anrufen', 'Handy', 'appeler', 'llamar'],
    phoneDE: ['Telefon', 'anrufen', 'Handy', 'Rufnummer', 'Festnetz', 'mobil'],
    phoneUK: ['phone', 'call', 'mobile', 'ring', 'dial', 'landline'],
    phoneAT: ['Telefon', 'anrufen', 'Handy'],
    phoneCH: ['Telefon', 'anrufen', 'Natel', 'Handy'],
    phoneFR: ['telephone', 'appeler', 'portable', 'fixe', 'numero'],
    phoneNL: ['telefoon', 'bellen', 'mobiel'],
    phoneES: ['telefono', 'llamar', 'movil', 'celular'],
    phoneIT: ['telefono', 'chiamare', 'cellulare'],
    phoneAU: ['phone', 'call', 'mobile'],
    phoneIN: ['phone', 'call', 'mobile'],
    phoneBR: ['telefone', 'ligar', 'celular'],
    phoneCN: ['phone', 'call', 'mobile'],
    phoneKR: ['phone', 'call'],
    phoneRU: ['telefon', 'pozvonit'],
    phoneMX: ['telefono', 'llamar', 'celular'],
    phoneZA: ['phone', 'call', 'mobile'],
    // Identity - person names
    personName: ['name', 'person', 'contact', 'customer', 'client', 'patient', 'employee', 'user', 'Mr', 'Mrs', 'Ms', 'Dr', 'Herr', 'Frau'],
    // Identity - government IDs
    ssn: ['social security', 'SSN', 'social', 'security number', 'Sozialversicherung'],
    itinUS: ['ITIN', 'taxpayer', 'individual taxpayer'],
    sinCA: ['SIN', 'social insurance', 'insurance number'],
    ninoUK: ['national insurance', 'NINO', 'NI number'],
    nhsNumber: ['NHS', 'national health', 'health service', 'patient'],
    passportUS: ['passport', 'travel document', 'Reisepass'],
    passportEU: ['passport', 'travel document', 'Reisepass', 'passeport'],
    passportDE: ['Reisepass', 'passport', 'Passnummer'],
    nationalIdDE: ['Personalausweis', 'Ausweis', 'ID card', 'identity card'],
    taxIdUS: ['EIN', 'employer identification', 'tax ID', 'federal tax'],
    taxIdDE: ['Steuer', 'Identifikationsnummer', 'Finanzamt'],
    sozialversicherungDE: ['Sozialversicherung', 'Rentenversicherung', 'SV-Nummer'],
    ahvCH: ['AHV', 'AVS', 'Sozialversicherung', 'assurance'],
    nationalIdFR: ['securite sociale', 'NIR', 'numero national'],
    codiceFiscaleIT: ['codice fiscale', 'fiscal code', 'CF'],
    dniES: ['DNI', 'documento nacional', 'identidad'],
    nieES: ['NIE', 'extranjero', 'foreigner'],
    peselPL: ['PESEL', 'numer identyfikacyjny'],
    bsnNL: ['BSN', 'burgerservicenummer', 'burger service'],
    ppsIE: ['PPS', 'personal public service'],
    tfnAU: ['TFN', 'tax file', 'tax number'],
    nricSG: ['NRIC', 'identity card', 'IC'],
    panIN: ['PAN', 'permanent account', 'income tax'],
    aadhaarIN: ['Aadhaar', 'UID', 'unique identification'],
    cpfBR: ['CPF', 'cadastro', 'pessoa fisica'],
    hetuFI: ['HETU', 'henkilotunnus', 'personal identity'],
    personnummerSE: ['personnummer', 'personal number'],
    fodselsnummerNO: ['fodselsnummer', 'personal number'],
    cprDK: ['CPR', 'personnummer'],
    nifPT: ['NIF', 'numero fiscal', 'contribuinte'],
    nationalIdCN: ['identity card', 'resident card', 'shenfenzheng'],
    nationalIdTR: ['TC Kimlik', 'kimlik no', 'identity'],
    curpMX: ['CURP', 'poblacion'],
    rfcMX: ['RFC', 'registro federal'],
    emiratesIdUAE: ['emirates ID', 'UAE ID'],
    // Financial
    creditCard: ['card', 'credit', 'debit', 'visa', 'mastercard', 'amex', 'payment', 'Kreditkarte', 'carte', 'tarjeta'],
    amex: ['amex', 'american express', 'card', 'payment'],
    iban: ['IBAN', 'bank account', 'Kontonummer', 'compte', 'cuenta', 'conto'],
    bic: ['BIC', 'SWIFT', 'bank', 'routing'],
    vatEU: ['VAT', 'MwSt', 'Umsatzsteuer', 'TVA', 'IVA', 'BTW'],
    abaRouting: ['routing', 'ABA', 'bank', 'transfer'],
    sortCodeUK: ['sort code', 'bank', 'branch'],
    cardExpiry: ['expiry', 'expiration', 'valid', 'exp', 'Gultig'],
    cvvCtx: ['CVV', 'CVC', 'security code', 'verification'],
    bankAccountCtx: ['account', 'bank', 'checking', 'savings', 'Konto'],
    insurancePolicyCtx: ['insurance', 'policy', 'Versicherung', 'assurance', 'polizza'],
    salaryCtx: ['salary', 'compensation', 'pay', 'wage', 'Gehalt', 'salaire'],
    // Network
    ipv4: ['IP', 'address', 'server', 'host', 'network'],
    ipv6: ['IP', 'address', 'server', 'IPv6'],
    macAddress: ['MAC', 'hardware', 'network', 'device', 'adapter'],
    url: ['URL', 'link', 'website', 'http', 'visit'],
    privateIp: ['internal', 'private', 'LAN', 'intranet', 'network'],
    // Location
    addressDE: ['Adresse', 'Anschrift', 'wohnt', 'Straße', 'address'],
    addressLabeledEN: ['address', 'lives', 'located', 'residing'],
    addressLabeledDE: ['Adresse', 'Anschrift', 'wohnt', 'Wohnort'],
    gpsCoordinates: ['GPS', 'coordinates', 'location', 'latitude', 'longitude', 'Standort'],
    // Medical
    medicalRecordNumber: ['MRN', 'medical record', 'patient', 'chart', 'Krankenakte'],
    deaNumber: ['DEA', 'drug enforcement', 'prescriber'],
    npiNumber: ['NPI', 'provider', 'physician', 'doctor'],
    rxNumber: ['prescription', 'Rx', 'pharmacy', 'medication', 'Rezept'],
    healthPlanCtx: ['health plan', 'insurance', 'member', 'subscriber', 'beneficiary'],
    bloodTypeCtx: ['blood', 'type', 'group', 'Blutgruppe'],
    // Enterprise
    awsAccessKey: ['AWS', 'amazon', 'access key', 'credential'],
    gcpApiKey: ['Google', 'GCP', 'API key', 'cloud'],
    stripeKey: ['Stripe', 'payment', 'API key'],
    openaiKey: ['OpenAI', 'API key', 'GPT'],
    githubToken: ['GitHub', 'token', 'personal access'],
    slackToken: ['Slack', 'token', 'bot'],
    jwtToken: ['JWT', 'token', 'auth', 'bearer', 'session'],
    pemPrivateKey: ['private key', 'RSA', 'certificate', 'SSL', 'TLS'],
    sshPublicKey: ['SSH', 'key', 'public key', 'authorized'],
    genericSecret: ['password', 'secret', 'credential', 'API key', 'token'],
    dbConnectionString: ['database', 'connection', 'JDBC', 'mongo', 'postgres', 'mysql'],
    internalHostname: ['server', 'host', 'internal', 'intranet'],
    // Vehicle
    vin: ['VIN', 'vehicle', 'chassis', 'Fahrgestell', 'vehicule'],
    // Crypto
    bitcoinAddress: ['bitcoin', 'BTC', 'wallet', 'crypto'],
    ethereumAddress: ['ethereum', 'ETH', 'wallet', 'crypto', 'contract'],
};
129
/**
 * Base confidence scores by pattern name.
 * Patterns with checksum validation get higher scores.
 * Patterns not listed here fall back to DEFAULT_BASE_CONFIDENCE.
 */
exports.BASE_CONFIDENCE = {
    // Checksum validated = 0.95
    creditCard: 0.95,
    amex: 0.95,
    iban: 0.95,
    nhsNumber: 0.95,
    sinCA: 0.95,
    peselPL: 0.95,
    bsnNL: 0.95,
    nifPT: 0.95,
    tfnAU: 0.95,
    abaRouting: 0.95,
    // Strong format (unique structure) = 0.85-0.90
    email: 0.90,
    codiceFiscaleIT: 0.90,
    hetuFI: 0.90,
    ahvCH: 0.90,
    nricSG: 0.90,
    panIN: 0.90,
    passportUS: 0.85,
    passportEU: 0.85,
    passportDE: 0.85,
    ssn: 0.85,
    ninoUK: 0.85,
    dniES: 0.85,
    nieES: 0.85,
    nationalIdCN: 0.85,
    nationalIdTH: 0.85,
    emiratesIdUAE: 0.85,
    curpMX: 0.85,
    bitcoinAddress: 0.85,
    ethereumAddress: 0.85,
    vin: 0.85,
    jwtToken: 0.85,
    awsAccessKey: 0.90,
    gcpApiKey: 0.90,
    stripeKey: 0.90,
    openaiKey: 0.90,
    githubToken: 0.90,
    pemPrivateKey: 0.90,
    sshPublicKey: 0.90,
    // Medium format = 0.65-0.80
    phone: 0.70,
    phoneDE: 0.75,
    phoneUK: 0.75,
    phoneAT: 0.75,
    phoneCH: 0.75,
    phoneFR: 0.75,
    personName: 0.80,
    ipv4: 0.70,
    ipv6: 0.80,
    macAddress: 0.80,
    url: 0.75,
    vatEU: 0.80,
    bic: 0.65,
    // Broad patterns (high false positive risk) = 0.40-0.60
    postalCodeDE: 0.40,
    postalCodeAT: 0.40,
    postalCodeCH: 0.40,
    postalCodeUS: 0.50,
    postalCodeIT: 0.40,
    postalCodeES: 0.50,
    postalCodeIN: 0.40,
    dateSlash: 0.50,
    dateDash: 0.50,
    dateDot: 0.50,
    sortCodeUK: 0.50,
    cardExpiry: 0.50,
    socialHandle: 0.60,
    licensePlateDE: 0.55,
    dnaSequence: 0.50,
};
205
/**
 * Default base confidence for patterns not in the BASE_CONFIDENCE map.
 */
exports.DEFAULT_BASE_CONFIDENCE = 0.65;
/**
 * Amount added to the confidence when context words are found nearby.
 * calculateConfidence caps the boosted score at 1.0.
 */
exports.CONTEXT_BOOST = 0.20;
/**
 * Confidence score for semantic field-name matches.
 */
exports.SEMANTIC_CONFIDENCE = 0.90;
/**
 * Confidence score for context-aware ambiguous field matches.
 */
exports.AMBIGUOUS_FIELD_CONFIDENCE = 0.85;
/**
 * Confidence score for custom pattern matches.
 */
exports.CUSTOM_PATTERN_CONFIDENCE = 0.70;
/**
 * Confidence score for deny list matches.
 */
exports.DENY_LIST_CONFIDENCE = 1.0;
/**
 * Window size (characters) to search for context words around a match.
 * hasContextWords extends the search this many characters before the match
 * start and after the match end.
 */
exports.CONTEXT_WINDOW = 80;
233
/**
 * Report whether any of the given context words occurs close to a match.
 *
 * The searched span runs from CONTEXT_WINDOW characters before the match
 * start to CONTEXT_WINDOW characters after the match end (clamped to the
 * text) and includes the matched text itself. Comparison is a
 * case-insensitive substring test.
 *
 * @param text Full text that contains the match.
 * @param matchStart Start offset of the match.
 * @param matchEnd End offset of the match.
 * @param words Context words to look for.
 * @returns True when at least one word is found; false for a missing or empty list.
 */
function hasContextWords(text, matchStart, matchEnd, words) {
    if (!words) {
        return false;
    }
    if (words.length === 0) {
        return false;
    }
    const radius = exports.CONTEXT_WINDOW;
    const from = Math.max(0, matchStart - radius);
    const to = Math.min(text.length, matchEnd + radius);
    const haystack = text.slice(from, to).toLowerCase();
    const occursNearby = (candidate) => haystack.includes(candidate.toLowerCase());
    return words.some(occursNearby);
}
244
/**
 * Compute the confidence score for one pattern match.
 *
 * Starts from the pattern's entry in BASE_CONFIDENCE (or
 * DEFAULT_BASE_CONFIDENCE when there is none), raises it to at least 0.95
 * when a validator exists and passed, adds CONTEXT_BOOST (capped at 1.0)
 * when the pattern's context words occur near the match, and rounds the
 * result to two decimals.
 *
 * @param patternName Pattern key used for both lookups.
 * @param hasValidator Whether the pattern has a validator.
 * @param validatorPassed Whether that validator accepted the match.
 * @param text Full text that contains the match.
 * @param matchStart Start offset of the match.
 * @param matchEnd End offset of the match.
 * @returns Confidence rounded to two decimals.
 */
function calculateConfidence(patternName, hasValidator, validatorPassed, text, matchStart, matchEnd) {
    const baseScore = exports.BASE_CONFIDENCE[patternName] ?? exports.DEFAULT_BASE_CONFIDENCE;
    const validated = hasValidator && validatorPassed;
    const afterValidation = validated ? Math.max(baseScore, 0.95) : baseScore;
    const nearbyWords = exports.CONTEXT_WORDS[patternName];
    const inContext = nearbyWords && hasContextWords(text, matchStart, matchEnd, nearbyWords);
    const finalScore = inContext
        ? Math.min(1.0, afterValidation + exports.CONTEXT_BOOST)
        : afterValidation;
    return Math.round(finalScore * 100) / 100;
}