@elanlanguages/bridge-anonymization 0.1.2 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +304 -71
- package/dist/crypto/pii-map-crypto.d.ts +50 -36
- package/dist/crypto/pii-map-crypto.d.ts.map +1 -1
- package/dist/crypto/pii-map-crypto.js +137 -72
- package/dist/crypto/pii-map-crypto.js.map +1 -1
- package/dist/index.d.ts +27 -20
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +145 -55
- package/dist/index.js.map +1 -1
- package/dist/ner/model-manager.d.ts +20 -11
- package/dist/ner/model-manager.d.ts.map +1 -1
- package/dist/ner/model-manager.js +154 -81
- package/dist/ner/model-manager.js.map +1 -1
- package/dist/ner/ner-model.d.ts +1 -1
- package/dist/ner/ner-model.d.ts.map +1 -1
- package/dist/ner/ner-model.js +49 -36
- package/dist/ner/ner-model.js.map +1 -1
- package/dist/ner/onnx-runtime.d.ts +8 -7
- package/dist/ner/onnx-runtime.d.ts.map +1 -1
- package/dist/ner/onnx-runtime.js +56 -25
- package/dist/ner/onnx-runtime.js.map +1 -1
- package/dist/ner/tokenizer.d.ts +5 -0
- package/dist/ner/tokenizer.d.ts.map +1 -1
- package/dist/ner/tokenizer.js +18 -5
- package/dist/ner/tokenizer.js.map +1 -1
- package/dist/pipeline/index.d.ts +7 -4
- package/dist/pipeline/index.d.ts.map +1 -1
- package/dist/pipeline/index.js +7 -4
- package/dist/pipeline/index.js.map +1 -1
- package/dist/pipeline/resolver.d.ts.map +1 -1
- package/dist/pipeline/resolver.js +3 -2
- package/dist/pipeline/resolver.js.map +1 -1
- package/dist/pipeline/semantic-data-loader.d.ts +165 -0
- package/dist/pipeline/semantic-data-loader.d.ts.map +1 -0
- package/dist/pipeline/semantic-data-loader.js +655 -0
- package/dist/pipeline/semantic-data-loader.js.map +1 -0
- package/dist/pipeline/semantic-enricher.d.ts +112 -0
- package/dist/pipeline/semantic-enricher.d.ts.map +1 -0
- package/dist/pipeline/semantic-enricher.js +318 -0
- package/dist/pipeline/semantic-enricher.js.map +1 -0
- package/dist/pipeline/tagger.d.ts +52 -12
- package/dist/pipeline/tagger.d.ts.map +1 -1
- package/dist/pipeline/tagger.js +226 -21
- package/dist/pipeline/tagger.js.map +1 -1
- package/dist/pipeline/title-extractor.d.ts +79 -0
- package/dist/pipeline/title-extractor.d.ts.map +1 -0
- package/dist/pipeline/title-extractor.js +801 -0
- package/dist/pipeline/title-extractor.js.map +1 -0
- package/dist/types/index.d.ts +66 -3
- package/dist/types/index.d.ts.map +1 -1
- package/dist/types/index.js +14 -3
- package/dist/types/index.js.map +1 -1
- package/dist/utils/index.d.ts +5 -3
- package/dist/utils/index.d.ts.map +1 -1
- package/dist/utils/index.js +5 -3
- package/dist/utils/index.js.map +1 -1
- package/dist/utils/path.d.ts +34 -0
- package/dist/utils/path.d.ts.map +1 -0
- package/dist/utils/path.js +96 -0
- package/dist/utils/path.js.map +1 -0
- package/dist/utils/storage-browser.d.ts +51 -0
- package/dist/utils/storage-browser.d.ts.map +1 -0
- package/dist/utils/storage-browser.js +381 -0
- package/dist/utils/storage-browser.js.map +1 -0
- package/dist/utils/storage-node.d.ts +43 -0
- package/dist/utils/storage-node.d.ts.map +1 -0
- package/dist/utils/storage-node.js +93 -0
- package/dist/utils/storage-node.js.map +1 -0
- package/dist/utils/storage.d.ts +70 -0
- package/dist/utils/storage.d.ts.map +1 -0
- package/dist/utils/storage.js +69 -0
- package/dist/utils/storage.js.map +1 -0
- package/package.json +8 -5
package/README.md
CHANGED
|
@@ -1,22 +1,47 @@
|
|
|
1
1
|
# Bridge Anonymization
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+

|
|
4
|
+

|
|
5
|
+
[Code coverage on Codecov](https://codecov.io/github/elanlanguages/bridge-anonymization)
|
|
6
|
+
|
|
7
|
+
On-device PII anonymization module for high-privacy AI workflows. Detects and replaces Personally Identifiable Information (PII) with placeholder tags while maintaining an encrypted mapping for later rehydration.
|
|
8
|
+
|
|
9
|
+
**Works in Node.js, Bun, and browsers** - zero server-side dependencies required.
|
|
4
10
|
|
|
5
11
|
## Features
|
|
6
12
|
|
|
7
13
|
- **Structured PII Detection**: Regex-based detection for emails, phones, IBANs, credit cards, IPs, URLs
|
|
8
|
-
- **Soft PII Detection**: ONNX-powered NER model for names, organizations, locations (auto-downloads on first use)
|
|
14
|
+
- **Soft PII Detection**: ONNX-powered NER model for names, organizations, locations (auto-downloads on first use if enabled)
|
|
15
|
+
- **Semantic Enrichment**: AI/MT-friendly tags with gender/location attributes for better translations
|
|
9
16
|
- **Secure PII Mapping**: AES-256-GCM encrypted storage of original PII values
|
|
17
|
+
- **Cross-Platform**: Works identically in Node.js, Bun, and browsers
|
|
10
18
|
- **Configurable Policies**: Customizable detection rules, thresholds, and allowlists
|
|
11
19
|
- **Validation & Leak Scanning**: Built-in validation and optional leak detection
|
|
12
20
|
|
|
13
21
|
## Installation
|
|
14
22
|
|
|
23
|
+
### Node.js / Bun
|
|
24
|
+
|
|
15
25
|
```bash
|
|
16
26
|
npm install @elanlanguages/bridge-anonymization
|
|
17
27
|
```
|
|
18
28
|
|
|
19
|
-
|
|
29
|
+
### Browser (with bundler)
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
npm install @elanlanguages/bridge-anonymization onnxruntime-web
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
### Browser (without bundler)
|
|
36
|
+
|
|
37
|
+
```html
|
|
38
|
+
<script type="module">
|
|
39
|
+
// Import directly from your dist folder or CDN
|
|
40
|
+
import { createAnonymizer } from './node_modules/@elanlanguages/bridge-anonymization/dist/index.js';
|
|
41
|
+
|
|
42
|
+
// onnxruntime-web is automatically loaded from CDN when needed
|
|
43
|
+
</script>
|
|
44
|
+
```
|
|
20
45
|
|
|
21
46
|
## Quick Start
|
|
22
47
|
|
|
@@ -45,7 +70,7 @@ import { createAnonymizer } from '@elanlanguages/bridge-anonymization';
|
|
|
45
70
|
const anonymizer = createAnonymizer({
|
|
46
71
|
ner: {
|
|
47
72
|
mode: 'quantized', // or 'standard' for full model (~1.1 GB)
|
|
48
|
-
onStatus: (status) => console.log(status),
|
|
73
|
+
onStatus: (status) => console.log(status),
|
|
49
74
|
}
|
|
50
75
|
});
|
|
51
76
|
|
|
@@ -57,19 +82,90 @@ const result = await anonymizer.anonymize(
|
|
|
57
82
|
|
|
58
83
|
console.log(result.anonymizedText);
|
|
59
84
|
// "Hello <PII type="PERSON" id="1"/> from <PII type="ORG" id="2"/> in <PII type="LOCATION" id="3"/>!"
|
|
85
|
+
|
|
86
|
+
// Clean up when done
|
|
87
|
+
await anonymizer.dispose();
|
|
60
88
|
```
|
|
61
89
|
|
|
62
|
-
###
|
|
90
|
+
### With Semantic Enrichment
|
|
91
|
+
|
|
92
|
+
Add gender and location scope for better machine translation:
|
|
63
93
|
|
|
64
94
|
```typescript
|
|
65
|
-
import {
|
|
95
|
+
import { createAnonymizer } from '@elanlanguages/bridge-anonymization';
|
|
66
96
|
|
|
67
|
-
const
|
|
68
|
-
|
|
69
|
-
|
|
97
|
+
const anonymizer = createAnonymizer({
|
|
98
|
+
ner: { mode: 'quantized' },
|
|
99
|
+
semantic: {
|
|
100
|
+
enabled: true, // Downloads ~12 MB of semantic data on first use
|
|
101
|
+
onStatus: (status) => console.log(status),
|
|
102
|
+
}
|
|
103
|
+
});
|
|
104
|
+
|
|
105
|
+
await anonymizer.initialize();
|
|
106
|
+
|
|
107
|
+
const result = await anonymizer.anonymize(
|
|
108
|
+
'Hello Maria Schmidt from Berlin!'
|
|
70
109
|
);
|
|
110
|
+
|
|
111
|
+
console.log(result.anonymizedText);
|
|
112
|
+
// "Hello <PII type="PERSON" gender="female" id="1"/> from <PII type="LOCATION" scope="city" id="2"/>!"
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
## Example: Translation Workflow (Anonymize → Translate → Rehydrate)
|
|
116
|
+
|
|
117
|
+
The full workflow for privacy-preserving translation:
|
|
118
|
+
|
|
119
|
+
```typescript
|
|
120
|
+
import {
|
|
121
|
+
createAnonymizer,
|
|
122
|
+
decryptPIIMap,
|
|
123
|
+
rehydrate,
|
|
124
|
+
InMemoryKeyProvider
|
|
125
|
+
} from '@elanlanguages/bridge-anonymization';
|
|
126
|
+
|
|
127
|
+
// 1. Create a key provider (required to decrypt later)
|
|
128
|
+
const keyProvider = new InMemoryKeyProvider();
|
|
129
|
+
|
|
130
|
+
// 2. Create anonymizer with key provider
|
|
131
|
+
const anonymizer = createAnonymizer({
|
|
132
|
+
ner: { mode: 'quantized' },
|
|
133
|
+
keyProvider: keyProvider
|
|
134
|
+
});
|
|
135
|
+
|
|
136
|
+
await anonymizer.initialize();
|
|
137
|
+
|
|
138
|
+
// 3. Anonymize before translation
|
|
139
|
+
const original = 'Hello John Smith from Acme Corp in Berlin!';
|
|
140
|
+
const result = await anonymizer.anonymize(original);
|
|
141
|
+
|
|
142
|
+
console.log(result.anonymizedText);
|
|
143
|
+
// "Hello <PII type="PERSON" id="1"/> from <PII type="ORG" id="2"/> in <PII type="LOCATION" id="3"/>!"
|
|
144
|
+
|
|
145
|
+
// 4. Translate (or do other AI workloads that preserve placeholders)
|
|
146
|
+
const translated = await yourTranslationService(result.anonymizedText, { from: 'en', to: 'de' });
|
|
147
|
+
// "Hallo <PII type="PERSON" id="1"/> von <PII type="ORG" id="2"/> in <PII type="LOCATION" id="3"/>!"
|
|
148
|
+
|
|
149
|
+
// 5. Decrypt the PII map using the same key
|
|
150
|
+
const encryptionKey = await keyProvider.getKey();
|
|
151
|
+
const piiMap = await decryptPIIMap(result.piiMap, encryptionKey);
|
|
152
|
+
|
|
153
|
+
// 6. Rehydrate - replace placeholders with original values
|
|
154
|
+
const rehydrated = rehydrate(translated, piiMap);
|
|
155
|
+
|
|
156
|
+
console.log(rehydrated);
|
|
157
|
+
// "Hallo John Smith von Acme Corp in Berlin!"
|
|
158
|
+
|
|
159
|
+
// 7. Clean up
|
|
160
|
+
await anonymizer.dispose();
|
|
71
161
|
```
|
|
72
162
|
|
|
163
|
+
### Key Points
|
|
164
|
+
|
|
165
|
+
- **Save the encryption key** - You need the same key to decrypt the PII map
|
|
166
|
+
- **Placeholders are XML-like** - Most translation services preserve them automatically
|
|
167
|
+
- **PII stays local** - Original values never leave your system during translation
|
|
168
|
+
|
|
73
169
|
## API Reference
|
|
74
170
|
|
|
75
171
|
### Configuration Options
|
|
@@ -80,21 +176,31 @@ import { createAnonymizer, InMemoryKeyProvider } from '@elanlanguages/bridge-ano
|
|
|
80
176
|
const anonymizer = createAnonymizer({
|
|
81
177
|
// NER configuration
|
|
82
178
|
ner: {
|
|
83
|
-
mode: 'quantized',
|
|
84
|
-
autoDownload: true,
|
|
85
|
-
onStatus: (
|
|
86
|
-
onDownloadProgress: (
|
|
179
|
+
mode: 'quantized', // 'standard' | 'quantized' | 'disabled' | 'custom'
|
|
180
|
+
autoDownload: true, // Auto-download model if not present
|
|
181
|
+
onStatus: (status) => {}, // Status messages callback
|
|
182
|
+
onDownloadProgress: (progress) => {
|
|
183
|
+
console.log(`${progress.file}: ${progress.percent}%`);
|
|
184
|
+
},
|
|
87
185
|
|
|
88
186
|
// For 'custom' mode only:
|
|
89
187
|
modelPath: './my-model.onnx',
|
|
90
188
|
vocabPath: './vocab.txt',
|
|
91
189
|
},
|
|
92
190
|
|
|
191
|
+
// Semantic enrichment (adds gender/scope attributes)
|
|
192
|
+
semantic: {
|
|
193
|
+
enabled: true, // Enable MT-friendly attributes
|
|
194
|
+
autoDownload: true, // Auto-download semantic data (~12 MB)
|
|
195
|
+
onStatus: (status) => {},
|
|
196
|
+
onDownloadProgress: (progress) => {},
|
|
197
|
+
},
|
|
198
|
+
|
|
93
199
|
// Encryption key provider
|
|
94
200
|
keyProvider: new InMemoryKeyProvider(),
|
|
95
201
|
|
|
96
|
-
// Custom policy
|
|
97
|
-
defaultPolicy: { /*
|
|
202
|
+
// Custom policy (optional)
|
|
203
|
+
defaultPolicy: { /* see Policy section */ },
|
|
98
204
|
});
|
|
99
205
|
|
|
100
206
|
await anonymizer.initialize();
|
|
@@ -113,7 +219,7 @@ await anonymizer.initialize();
|
|
|
113
219
|
|
|
114
220
|
#### `createAnonymizer(config?)`
|
|
115
221
|
|
|
116
|
-
Creates
|
|
222
|
+
Creates a reusable anonymizer instance:
|
|
117
223
|
|
|
118
224
|
```typescript
|
|
119
225
|
const anonymizer = createAnonymizer({
|
|
@@ -130,6 +236,8 @@ await anonymizer.dispose();
|
|
|
130
236
|
One-off anonymization (regex-only by default):
|
|
131
237
|
|
|
132
238
|
```typescript
|
|
239
|
+
import { anonymize } from '@elanlanguages/bridge-anonymization';
|
|
240
|
+
|
|
133
241
|
const result = await anonymize('Contact test@example.com');
|
|
134
242
|
```
|
|
135
243
|
|
|
@@ -138,6 +246,8 @@ const result = await anonymize('Contact test@example.com');
|
|
|
138
246
|
One-off anonymization with NER:
|
|
139
247
|
|
|
140
248
|
```typescript
|
|
249
|
+
import { anonymizeWithNER } from '@elanlanguages/bridge-anonymization';
|
|
250
|
+
|
|
141
251
|
const result = await anonymizeWithNER(
|
|
142
252
|
'Hello John Smith',
|
|
143
253
|
{ mode: 'quantized' }
|
|
@@ -149,9 +259,34 @@ const result = await anonymizeWithNER(
|
|
|
149
259
|
Fast regex-only anonymization:
|
|
150
260
|
|
|
151
261
|
```typescript
|
|
262
|
+
import { anonymizeRegexOnly } from '@elanlanguages/bridge-anonymization';
|
|
263
|
+
|
|
152
264
|
const result = await anonymizeRegexOnly('Card: 4111111111111111');
|
|
153
265
|
```
|
|
154
266
|
|
|
267
|
+
### Rehydration Functions
|
|
268
|
+
|
|
269
|
+
#### `decryptPIIMap(encryptedMap, key)`
|
|
270
|
+
|
|
271
|
+
Decrypts the PII map for rehydration:
|
|
272
|
+
|
|
273
|
+
```typescript
|
|
274
|
+
import { decryptPIIMap } from '@elanlanguages/bridge-anonymization';
|
|
275
|
+
|
|
276
|
+
const piiMap = await decryptPIIMap(result.piiMap, encryptionKey);
|
|
277
|
+
// Returns a Map<string, string>; for example, the key "PERSON:1" maps to the value "John Smith"
|
|
278
|
+
```
|
|
279
|
+
|
|
280
|
+
#### `rehydrate(text, piiMap)`
|
|
281
|
+
|
|
282
|
+
Replaces placeholders with original values:
|
|
283
|
+
|
|
284
|
+
```typescript
|
|
285
|
+
import { rehydrate } from '@elanlanguages/bridge-anonymization';
|
|
286
|
+
|
|
287
|
+
const original = rehydrate(translatedText, piiMap);
|
|
288
|
+
```
|
|
289
|
+
|
|
155
290
|
### Result Structure
|
|
156
291
|
|
|
157
292
|
```typescript
|
|
@@ -189,22 +324,22 @@ interface AnonymizationResult {
|
|
|
189
324
|
|
|
190
325
|
## Supported PII Types
|
|
191
326
|
|
|
192
|
-
| Type | Description | Detection
|
|
193
|
-
|
|
194
|
-
| `EMAIL` | Email addresses | Regex |
|
|
195
|
-
| `PHONE` | Phone numbers (international) | Regex |
|
|
196
|
-
| `IBAN` | International Bank Account Numbers | Regex + Checksum |
|
|
197
|
-
| `BIC_SWIFT` | Bank Identifier Codes | Regex |
|
|
198
|
-
| `CREDIT_CARD` | Credit card numbers | Regex + Luhn |
|
|
199
|
-
| `IP_ADDRESS` | IPv4 and IPv6 addresses | Regex |
|
|
200
|
-
| `URL` | Web URLs | Regex |
|
|
201
|
-
| `CASE_ID` | Case/ticket numbers | Regex (configurable) |
|
|
202
|
-
| `CUSTOMER_ID` | Customer identifiers | Regex (configurable) |
|
|
203
|
-
| `PERSON` | Person names | NER |
|
|
204
|
-
| `ORG` | Organization names | NER |
|
|
205
|
-
| `LOCATION` | Location/place names | NER |
|
|
206
|
-
| `ADDRESS` | Physical addresses | NER |
|
|
207
|
-
| `DATE_OF_BIRTH` | Dates of birth | NER |
|
|
327
|
+
| Type | Description | Detection | Semantic Attributes |
|
|
328
|
+
|------|-------------|-----------|---------------------|
|
|
329
|
+
| `EMAIL` | Email addresses | Regex | - |
|
|
330
|
+
| `PHONE` | Phone numbers (international) | Regex | - |
|
|
331
|
+
| `IBAN` | International Bank Account Numbers | Regex + Checksum | - |
|
|
332
|
+
| `BIC_SWIFT` | Bank Identifier Codes | Regex | - |
|
|
333
|
+
| `CREDIT_CARD` | Credit card numbers | Regex + Luhn | - |
|
|
334
|
+
| `IP_ADDRESS` | IPv4 and IPv6 addresses | Regex | - |
|
|
335
|
+
| `URL` | Web URLs | Regex | - |
|
|
336
|
+
| `CASE_ID` | Case/ticket numbers | Regex (configurable) | - |
|
|
337
|
+
| `CUSTOMER_ID` | Customer identifiers | Regex (configurable) | - |
|
|
338
|
+
| `PERSON` | Person names | NER | `gender` (male/female/neutral) |
|
|
339
|
+
| `ORG` | Organization names | NER | - |
|
|
340
|
+
| `LOCATION` | Location/place names | NER | `scope` (city/country/region) |
|
|
341
|
+
| `ADDRESS` | Physical addresses | NER | - |
|
|
342
|
+
| `DATE_OF_BIRTH` | Dates of birth | NER | - |
|
|
208
343
|
|
|
209
344
|
## Configuration
|
|
210
345
|
|
|
@@ -228,6 +363,9 @@ const anonymizer = createAnonymizer({
|
|
|
228
363
|
// Terms to never treat as PII
|
|
229
364
|
allowlistTerms: new Set(['Customer Service', 'Help Desk']),
|
|
230
365
|
|
|
366
|
+
// Enable semantic enrichment (gender/scope)
|
|
367
|
+
enableSemanticMasking: true,
|
|
368
|
+
|
|
231
369
|
// Enable leak scanning on output
|
|
232
370
|
enableLeakScan: true,
|
|
233
371
|
},
|
|
@@ -253,88 +391,175 @@ const anonymizer = createAnonymizer();
|
|
|
253
391
|
anonymizer.getRegistry().register(customRecognizer);
|
|
254
392
|
```
|
|
255
393
|
|
|
256
|
-
## Model
|
|
394
|
+
## Data & Model Storage
|
|
395
|
+
|
|
396
|
+
Models and semantic data are cached locally for offline use.
|
|
397
|
+
|
|
398
|
+
### Node.js Cache Locations
|
|
399
|
+
|
|
400
|
+
| Data | macOS | Linux | Windows |
|
|
401
|
+
|------|-------|-------|---------|
|
|
402
|
+
| NER Models | `~/Library/Caches/bridge-anonymization/models/` | `~/.cache/bridge-anonymization/models/` | `%LOCALAPPDATA%/bridge-anonymization/models/` |
|
|
403
|
+
| Semantic Data | `~/Library/Caches/bridge-anonymization/semantic-data/` | `~/.cache/bridge-anonymization/semantic-data/` | `%LOCALAPPDATA%/bridge-anonymization/semantic-data/` |
|
|
404
|
+
|
|
405
|
+
### Browser Cache
|
|
257
406
|
|
|
258
|
-
|
|
407
|
+
In browsers, data is stored using:
|
|
408
|
+
- **IndexedDB**: For semantic data and smaller files
|
|
409
|
+
- **Origin Private File System (OPFS)**: For large model files (~280 MB)
|
|
259
410
|
|
|
260
|
-
|
|
261
|
-
- **macOS**: `~/Library/Caches/bridge-anonymization/models/`
|
|
262
|
-
- **Linux**: `~/.cache/bridge-anonymization/models/`
|
|
263
|
-
- **Windows**: `%LOCALAPPDATA%/bridge-anonymization/models/`
|
|
411
|
+
Data persists across page reloads and browser sessions.
|
|
264
412
|
|
|
265
|
-
### Manual
|
|
413
|
+
### Manual Data Management
|
|
266
414
|
|
|
267
415
|
```typescript
|
|
268
416
|
import {
|
|
417
|
+
// Model management
|
|
269
418
|
isModelDownloaded,
|
|
270
419
|
downloadModel,
|
|
271
420
|
clearModelCache,
|
|
272
421
|
listDownloadedModels,
|
|
273
|
-
|
|
422
|
+
|
|
423
|
+
// Semantic data management
|
|
424
|
+
isSemanticDataDownloaded,
|
|
425
|
+
downloadSemanticData,
|
|
426
|
+
clearSemanticDataCache,
|
|
274
427
|
} from '@elanlanguages/bridge-anonymization';
|
|
275
428
|
|
|
276
429
|
// Check if model is downloaded
|
|
277
430
|
const hasModel = await isModelDownloaded('quantized');
|
|
278
431
|
|
|
279
|
-
// Manually download
|
|
432
|
+
// Manually download model with progress
|
|
280
433
|
await downloadModel('quantized', (progress) => {
|
|
281
434
|
console.log(`${progress.file}: ${progress.percent}%`);
|
|
282
435
|
});
|
|
283
436
|
|
|
437
|
+
// Check semantic data
|
|
438
|
+
const hasSemanticData = await isSemanticDataDownloaded();
|
|
439
|
+
|
|
284
440
|
// List downloaded models
|
|
285
441
|
const models = await listDownloadedModels();
|
|
286
442
|
|
|
287
|
-
// Clear
|
|
443
|
+
// Clear caches
|
|
288
444
|
await clearModelCache('quantized'); // or clearModelCache() for all
|
|
445
|
+
await clearSemanticDataCache();
|
|
289
446
|
```
|
|
290
447
|
|
|
291
448
|
## Encryption & Security
|
|
292
449
|
|
|
293
|
-
The PII map is encrypted using AES-256-GCM
|
|
450
|
+
The PII map is encrypted using **AES-256-GCM** via the Web Crypto API (works in both Node.js and browsers).
|
|
451
|
+
|
|
452
|
+
### Key Providers
|
|
294
453
|
|
|
295
454
|
```typescript
|
|
296
|
-
import {
|
|
455
|
+
import {
|
|
456
|
+
InMemoryKeyProvider, // For development/testing
|
|
457
|
+
ConfigKeyProvider, // For production with pre-configured key
|
|
458
|
+
KeyProvider, // Interface for custom implementations
|
|
459
|
+
generateKey,
|
|
460
|
+
} from '@elanlanguages/bridge-anonymization';
|
|
461
|
+
|
|
462
|
+
// Development: In-memory key (generates a random key; lost on page refresh or process exit)
|
|
463
|
+
const devKeyProvider = new InMemoryKeyProvider();
|
|
464
|
+
|
|
465
|
+
// Production: Pre-configured key
|
|
466
|
+
// Generate key: openssl rand -base64 32
|
|
467
|
+
const keyBase64 = process.env.PII_ENCRYPTION_KEY; // or read from config
|
|
468
|
+
const prodKeyProvider = new ConfigKeyProvider(keyBase64);
|
|
297
469
|
|
|
470
|
+
// Custom: Implement KeyProvider interface
|
|
298
471
|
class SecureKeyProvider implements KeyProvider {
|
|
299
|
-
async getKey(): Promise<
|
|
300
|
-
// Retrieve from
|
|
472
|
+
async getKey(): Promise<Uint8Array> {
|
|
473
|
+
// Retrieve from secure storage, HSM, keychain, etc.
|
|
301
474
|
return await getKeyFromSecureStorage();
|
|
302
475
|
}
|
|
303
476
|
}
|
|
304
|
-
|
|
305
|
-
const anonymizer = createAnonymizer({
|
|
306
|
-
keyProvider: new SecureKeyProvider(),
|
|
307
|
-
ner: { mode: 'quantized' },
|
|
308
|
-
});
|
|
309
477
|
```
|
|
310
478
|
|
|
311
479
|
### Security Best Practices
|
|
312
480
|
|
|
313
481
|
- **Never log the raw PII map** - Always use encrypted storage
|
|
482
|
+
- **Persist the encryption key securely** - Use platform keystores (iOS Keychain, Android Keystore, etc.)
|
|
314
483
|
- **Rotate keys** - Implement key rotation for long-running applications
|
|
315
|
-
- **Use platform keystores** - iOS Keychain, Android Keystore, or OS credential managers
|
|
316
484
|
- **Enable leak scanning** - Catch any missed PII in output
|
|
317
485
|
|
|
486
|
+
## Browser Usage
|
|
487
|
+
|
|
488
|
+
The library works seamlessly in browsers without any special configuration.
|
|
489
|
+
|
|
490
|
+
### Basic Browser Example
|
|
491
|
+
|
|
492
|
+
```html
|
|
493
|
+
<!DOCTYPE html>
|
|
494
|
+
<html>
|
|
495
|
+
<head>
|
|
496
|
+
<title>PII Anonymization</title>
|
|
497
|
+
</head>
|
|
498
|
+
<body>
|
|
499
|
+
<script type="module">
|
|
500
|
+
import {
|
|
501
|
+
createAnonymizer,
|
|
502
|
+
InMemoryKeyProvider,
|
|
503
|
+
decryptPIIMap,
|
|
504
|
+
rehydrate
|
|
505
|
+
} from './node_modules/@elanlanguages/bridge-anonymization/dist/index.js';
|
|
506
|
+
|
|
507
|
+
async function demo() {
|
|
508
|
+
// Create anonymizer
|
|
509
|
+
const keyProvider = new InMemoryKeyProvider();
|
|
510
|
+
const anonymizer = createAnonymizer({
|
|
511
|
+
ner: {
|
|
512
|
+
mode: 'quantized',
|
|
513
|
+
onStatus: (s) => console.log('NER:', s),
|
|
514
|
+
onDownloadProgress: (p) => console.log(`Download: ${p.percent}%`)
|
|
515
|
+
},
|
|
516
|
+
semantic: { enabled: true },
|
|
517
|
+
keyProvider
|
|
518
|
+
});
|
|
519
|
+
|
|
520
|
+
// Initialize (downloads models on first use)
|
|
521
|
+
await anonymizer.initialize();
|
|
522
|
+
|
|
523
|
+
// Anonymize
|
|
524
|
+
const result = await anonymizer.anonymize(
|
|
525
|
+
'Contact Maria Schmidt at maria@example.com in Berlin.'
|
|
526
|
+
);
|
|
527
|
+
|
|
528
|
+
console.log('Anonymized:', result.anonymizedText);
|
|
529
|
+
// "Contact <PII type="PERSON" gender="female" id="1"/> at <PII type="EMAIL" id="2"/> in <PII type="LOCATION" scope="city" id="3"/>."
|
|
530
|
+
|
|
531
|
+
// Rehydrate
|
|
532
|
+
const key = await keyProvider.getKey();
|
|
533
|
+
const piiMap = await decryptPIIMap(result.piiMap, key);
|
|
534
|
+
const original = rehydrate(result.anonymizedText, piiMap);
|
|
535
|
+
|
|
536
|
+
console.log('Rehydrated:', original);
|
|
537
|
+
|
|
538
|
+
await anonymizer.dispose();
|
|
539
|
+
}
|
|
540
|
+
|
|
541
|
+
demo().catch(console.error);
|
|
542
|
+
</script>
|
|
543
|
+
</body>
|
|
544
|
+
</html>
|
|
545
|
+
```
|
|
546
|
+
|
|
547
|
+
### Browser Notes
|
|
548
|
+
|
|
549
|
+
- **First-use downloads**: NER model (~280 MB) and semantic data (~12 MB) are downloaded on first use
|
|
550
|
+
- **ONNX runtime**: Automatically loaded from CDN if not bundled
|
|
551
|
+
- **Offline support**: After initial download, everything works offline
|
|
552
|
+
- **Storage**: Uses IndexedDB and OPFS - data persists across sessions
|
|
553
|
+
|
|
318
554
|
## Bun Support
|
|
319
555
|
|
|
320
|
-
This library works with [Bun](https://bun.sh). Since `onnxruntime-node` is a native Node.js addon, Bun
|
|
556
|
+
This library works with [Bun](https://bun.sh). Since `onnxruntime-node` is a native Node.js addon, the library uses `onnxruntime-web` under Bun instead, so install it alongside the package:
|
|
321
557
|
|
|
322
558
|
```bash
|
|
323
559
|
bun add @elanlanguages/bridge-anonymization onnxruntime-web
|
|
324
560
|
```
|
|
325
561
|
|
|
326
|
-
Usage is identical - the library auto-detects the runtime
|
|
327
|
-
|
|
328
|
-
```typescript
|
|
329
|
-
import { createAnonymizer } from '@elanlanguages/bridge-anonymization';
|
|
330
|
-
|
|
331
|
-
const anonymizer = createAnonymizer({
|
|
332
|
-
ner: { mode: 'quantized' }
|
|
333
|
-
});
|
|
334
|
-
|
|
335
|
-
await anonymizer.initialize();
|
|
336
|
-
const result = await anonymizer.anonymize('Hello John Smith');
|
|
337
|
-
```
|
|
562
|
+
Usage is identical - the library auto-detects the runtime.
|
|
338
563
|
|
|
339
564
|
## Performance
|
|
340
565
|
|
|
@@ -342,12 +567,22 @@ const result = await anonymizer.anonymize('Hello John Smith');
|
|
|
342
567
|
|-----------|-----------------|-------|
|
|
343
568
|
| Regex pass | ~5 ms | All regex recognizers |
|
|
344
569
|
| NER inference | ~100-150 ms | Quantized model |
|
|
570
|
+
| Semantic enrichment | ~1-2 ms | After data loaded |
|
|
345
571
|
| Total pipeline | ~150-200 ms | Full anonymization |
|
|
346
572
|
|
|
347
573
|
| Model | Size | First-Use Download |
|
|
348
574
|
|-------|------|-------------------|
|
|
349
575
|
| Quantized | ~280 MB | ~30s on fast connection |
|
|
350
576
|
| Standard | ~1.1 GB | ~2min on fast connection |
|
|
577
|
+
| Semantic Data | ~12 MB | ~5s on fast connection |
|
|
578
|
+
|
|
579
|
+
## Requirements
|
|
580
|
+
|
|
581
|
+
| Environment | Version | Notes |
|
|
582
|
+
|-------------|---------|-------|
|
|
583
|
+
| Node.js | >= 18.0.0 | Uses native `onnxruntime-node` |
|
|
584
|
+
| Bun | >= 1.0.0 | Requires `onnxruntime-web` |
|
|
585
|
+
| Browsers | Chrome 86+, Firefox 89+, Safari 15.4+, Edge 86+ | Uses OPFS for model storage |
|
|
351
586
|
|
|
352
587
|
## Development
|
|
353
588
|
|
|
@@ -360,11 +595,14 @@ npm test
|
|
|
360
595
|
|
|
361
596
|
# Build
|
|
362
597
|
npm run build
|
|
598
|
+
|
|
599
|
+
# Lint
|
|
600
|
+
npm run lint
|
|
363
601
|
```
|
|
364
602
|
|
|
365
603
|
### Building Custom Models
|
|
366
604
|
|
|
367
|
-
For development or custom models
|
|
605
|
+
For development or custom models:
|
|
368
606
|
|
|
369
607
|
```bash
|
|
370
608
|
# Requires Python 3.8+
|
|
@@ -372,11 +610,6 @@ npm run setup:ner # Standard model
|
|
|
372
610
|
npm run setup:ner:quantized # Quantized model
|
|
373
611
|
```
|
|
374
612
|
|
|
375
|
-
## Requirements
|
|
376
|
-
|
|
377
|
-
- Node.js >= 18.0.0 (ONNX runtime included automatically)
|
|
378
|
-
- Bun >= 1.0.0 (requires `onnxruntime-web`)
|
|
379
|
-
|
|
380
613
|
## License
|
|
381
614
|
|
|
382
615
|
MIT
|