rehydra 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +22 -0
- package/README.md +615 -0
- package/dist/crypto/index.d.ts +6 -0
- package/dist/crypto/index.d.ts.map +1 -0
- package/dist/crypto/index.js +6 -0
- package/dist/crypto/index.js.map +1 -0
- package/dist/crypto/pii-map-crypto.d.ts +114 -0
- package/dist/crypto/pii-map-crypto.d.ts.map +1 -0
- package/dist/crypto/pii-map-crypto.js +228 -0
- package/dist/crypto/pii-map-crypto.js.map +1 -0
- package/dist/index.d.ts +180 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +384 -0
- package/dist/index.js.map +1 -0
- package/dist/ner/bio-decoder.d.ts +64 -0
- package/dist/ner/bio-decoder.d.ts.map +1 -0
- package/dist/ner/bio-decoder.js +216 -0
- package/dist/ner/bio-decoder.js.map +1 -0
- package/dist/ner/index.d.ts +10 -0
- package/dist/ner/index.d.ts.map +1 -0
- package/dist/ner/index.js +10 -0
- package/dist/ner/index.js.map +1 -0
- package/dist/ner/model-manager.d.ts +111 -0
- package/dist/ner/model-manager.d.ts.map +1 -0
- package/dist/ner/model-manager.js +325 -0
- package/dist/ner/model-manager.js.map +1 -0
- package/dist/ner/ner-model.d.ts +114 -0
- package/dist/ner/ner-model.d.ts.map +1 -0
- package/dist/ner/ner-model.js +253 -0
- package/dist/ner/ner-model.js.map +1 -0
- package/dist/ner/onnx-runtime.d.ts +46 -0
- package/dist/ner/onnx-runtime.d.ts.map +1 -0
- package/dist/ner/onnx-runtime.js +130 -0
- package/dist/ner/onnx-runtime.js.map +1 -0
- package/dist/ner/tokenizer.d.ts +118 -0
- package/dist/ner/tokenizer.d.ts.map +1 -0
- package/dist/ner/tokenizer.js +332 -0
- package/dist/ner/tokenizer.js.map +1 -0
- package/dist/pipeline/index.d.ts +12 -0
- package/dist/pipeline/index.d.ts.map +1 -0
- package/dist/pipeline/index.js +12 -0
- package/dist/pipeline/index.js.map +1 -0
- package/dist/pipeline/prenormalize.d.ts +48 -0
- package/dist/pipeline/prenormalize.d.ts.map +1 -0
- package/dist/pipeline/prenormalize.js +94 -0
- package/dist/pipeline/prenormalize.js.map +1 -0
- package/dist/pipeline/resolver.d.ts +56 -0
- package/dist/pipeline/resolver.d.ts.map +1 -0
- package/dist/pipeline/resolver.js +239 -0
- package/dist/pipeline/resolver.js.map +1 -0
- package/dist/pipeline/semantic-data-loader.d.ts +165 -0
- package/dist/pipeline/semantic-data-loader.d.ts.map +1 -0
- package/dist/pipeline/semantic-data-loader.js +655 -0
- package/dist/pipeline/semantic-data-loader.js.map +1 -0
- package/dist/pipeline/semantic-enricher.d.ts +112 -0
- package/dist/pipeline/semantic-enricher.d.ts.map +1 -0
- package/dist/pipeline/semantic-enricher.js +318 -0
- package/dist/pipeline/semantic-enricher.js.map +1 -0
- package/dist/pipeline/tagger.d.ts +114 -0
- package/dist/pipeline/tagger.d.ts.map +1 -0
- package/dist/pipeline/tagger.js +374 -0
- package/dist/pipeline/tagger.js.map +1 -0
- package/dist/pipeline/title-extractor.d.ts +79 -0
- package/dist/pipeline/title-extractor.d.ts.map +1 -0
- package/dist/pipeline/title-extractor.js +801 -0
- package/dist/pipeline/title-extractor.js.map +1 -0
- package/dist/pipeline/validator.d.ts +65 -0
- package/dist/pipeline/validator.d.ts.map +1 -0
- package/dist/pipeline/validator.js +264 -0
- package/dist/pipeline/validator.js.map +1 -0
- package/dist/recognizers/base.d.ts +78 -0
- package/dist/recognizers/base.d.ts.map +1 -0
- package/dist/recognizers/base.js +100 -0
- package/dist/recognizers/base.js.map +1 -0
- package/dist/recognizers/bic-swift.d.ts +10 -0
- package/dist/recognizers/bic-swift.d.ts.map +1 -0
- package/dist/recognizers/bic-swift.js +107 -0
- package/dist/recognizers/bic-swift.js.map +1 -0
- package/dist/recognizers/credit-card.d.ts +32 -0
- package/dist/recognizers/credit-card.d.ts.map +1 -0
- package/dist/recognizers/credit-card.js +160 -0
- package/dist/recognizers/credit-card.js.map +1 -0
- package/dist/recognizers/custom-id.d.ts +28 -0
- package/dist/recognizers/custom-id.d.ts.map +1 -0
- package/dist/recognizers/custom-id.js +116 -0
- package/dist/recognizers/custom-id.js.map +1 -0
- package/dist/recognizers/email.d.ts +10 -0
- package/dist/recognizers/email.d.ts.map +1 -0
- package/dist/recognizers/email.js +75 -0
- package/dist/recognizers/email.js.map +1 -0
- package/dist/recognizers/iban.d.ts +14 -0
- package/dist/recognizers/iban.d.ts.map +1 -0
- package/dist/recognizers/iban.js +67 -0
- package/dist/recognizers/iban.js.map +1 -0
- package/dist/recognizers/index.d.ts +20 -0
- package/dist/recognizers/index.d.ts.map +1 -0
- package/dist/recognizers/index.js +42 -0
- package/dist/recognizers/index.js.map +1 -0
- package/dist/recognizers/ip-address.d.ts +14 -0
- package/dist/recognizers/ip-address.d.ts.map +1 -0
- package/dist/recognizers/ip-address.js +183 -0
- package/dist/recognizers/ip-address.js.map +1 -0
- package/dist/recognizers/phone.d.ts +10 -0
- package/dist/recognizers/phone.d.ts.map +1 -0
- package/dist/recognizers/phone.js +145 -0
- package/dist/recognizers/phone.js.map +1 -0
- package/dist/recognizers/registry.d.ts +59 -0
- package/dist/recognizers/registry.d.ts.map +1 -0
- package/dist/recognizers/registry.js +113 -0
- package/dist/recognizers/registry.js.map +1 -0
- package/dist/recognizers/url.d.ts +14 -0
- package/dist/recognizers/url.d.ts.map +1 -0
- package/dist/recognizers/url.js +121 -0
- package/dist/recognizers/url.js.map +1 -0
- package/dist/types/index.d.ts +197 -0
- package/dist/types/index.d.ts.map +1 -0
- package/dist/types/index.js +80 -0
- package/dist/types/index.js.map +1 -0
- package/dist/types/pii-types.d.ts +50 -0
- package/dist/types/pii-types.d.ts.map +1 -0
- package/dist/types/pii-types.js +114 -0
- package/dist/types/pii-types.js.map +1 -0
- package/dist/utils/iban-checksum.d.ts +23 -0
- package/dist/utils/iban-checksum.d.ts.map +1 -0
- package/dist/utils/iban-checksum.js +106 -0
- package/dist/utils/iban-checksum.js.map +1 -0
- package/dist/utils/index.d.ts +10 -0
- package/dist/utils/index.d.ts.map +1 -0
- package/dist/utils/index.js +10 -0
- package/dist/utils/index.js.map +1 -0
- package/dist/utils/luhn.d.ts +17 -0
- package/dist/utils/luhn.d.ts.map +1 -0
- package/dist/utils/luhn.js +55 -0
- package/dist/utils/luhn.js.map +1 -0
- package/dist/utils/offsets.d.ts +86 -0
- package/dist/utils/offsets.d.ts.map +1 -0
- package/dist/utils/offsets.js +124 -0
- package/dist/utils/offsets.js.map +1 -0
- package/dist/utils/path.d.ts +34 -0
- package/dist/utils/path.d.ts.map +1 -0
- package/dist/utils/path.js +96 -0
- package/dist/utils/path.js.map +1 -0
- package/dist/utils/storage-browser.d.ts +51 -0
- package/dist/utils/storage-browser.d.ts.map +1 -0
- package/dist/utils/storage-browser.js +381 -0
- package/dist/utils/storage-browser.js.map +1 -0
- package/dist/utils/storage-node.d.ts +43 -0
- package/dist/utils/storage-node.d.ts.map +1 -0
- package/dist/utils/storage-node.js +93 -0
- package/dist/utils/storage-node.js.map +1 -0
- package/dist/utils/storage.d.ts +70 -0
- package/dist/utils/storage.d.ts.map +1 -0
- package/dist/utils/storage.js +69 -0
- package/dist/utils/storage.js.map +1 -0
- package/package.json +66 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 ELAN Languages
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
22
|
+
|
package/README.md
ADDED
|
@@ -0,0 +1,615 @@
|
|
|
1
|
+
# Rehydra
|
|
2
|
+
|
|
3
|
+

|
|
4
|
+

|
|
5
|
+
[Code coverage on Codecov](https://codecov.io/github/rehydra-ai/rehydra)
|
|
6
|
+
|
|
7
|
+
On-device PII anonymization module for high-privacy AI workflows. Detects and replaces Personally Identifiable Information (PII) with placeholder tags while maintaining an encrypted mapping for later rehydration.
|
|
8
|
+
|
|
9
|
+
**Works in Node.js, Bun, and browsers** - zero server-side dependencies required.
|
|
10
|
+
|
|
11
|
+
## Features
|
|
12
|
+
|
|
13
|
+
- **Structured PII Detection**: Regex-based detection for emails, phones, IBANs, credit cards, IPs, URLs
|
|
14
|
+
- **Soft PII Detection**: ONNX-powered NER model for names, organizations, locations (auto-downloads on first use if enabled)
|
|
15
|
+
- **Semantic Enrichment**: AI/MT-friendly tags with gender/location attributes for better translations
|
|
16
|
+
- **Secure PII Mapping**: AES-256-GCM encrypted storage of original PII values
|
|
17
|
+
- **Cross-Platform**: Works identically in Node.js, Bun, and browsers
|
|
18
|
+
- **Configurable Policies**: Customizable detection rules, thresholds, and allowlists
|
|
19
|
+
- **Validation & Leak Scanning**: Built-in validation and optional leak detection
|
|
20
|
+
|
|
21
|
+
## Installation
|
|
22
|
+
|
|
23
|
+
### Node.js / Bun
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
npm install rehydra
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
### Browser (with bundler)
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
npm install rehydra onnxruntime-web
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
### Browser (without bundler)
|
|
36
|
+
|
|
37
|
+
```html
|
|
38
|
+
<script type="module">
|
|
39
|
+
// Import directly from your dist folder or CDN
|
|
40
|
+
import { createAnonymizer } from './node_modules/rehydra/dist/index.js';
|
|
41
|
+
|
|
42
|
+
// onnxruntime-web is automatically loaded from CDN when needed
|
|
43
|
+
</script>
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
## Quick Start
|
|
47
|
+
|
|
48
|
+
### Regex-Only Mode (No Downloads Required)
|
|
49
|
+
|
|
50
|
+
For structured PII like emails, phones, IBANs, credit cards:
|
|
51
|
+
|
|
52
|
+
```typescript
|
|
53
|
+
import { anonymizeRegexOnly } from 'rehydra';
|
|
54
|
+
|
|
55
|
+
const result = await anonymizeRegexOnly(
|
|
56
|
+
'Contact john@example.com or call +49 30 123456. IBAN: DE89370400440532013000'
|
|
57
|
+
);
|
|
58
|
+
|
|
59
|
+
console.log(result.anonymizedText);
|
|
60
|
+
// "Contact <PII type="EMAIL" id="1"/> or call <PII type="PHONE" id="2"/>. IBAN: <PII type="IBAN" id="3"/>"
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
### Full Mode with NER (Detects Names, Organizations, Locations)
|
|
64
|
+
|
|
65
|
+
The NER model is automatically downloaded on first use (~280 MB for the quantized model):
|
|
66
|
+
|
|
67
|
+
```typescript
|
|
68
|
+
import { createAnonymizer } from 'rehydra';
|
|
69
|
+
|
|
70
|
+
const anonymizer = createAnonymizer({
|
|
71
|
+
ner: {
|
|
72
|
+
mode: 'quantized', // or 'standard' for full model (~1.1 GB)
|
|
73
|
+
onStatus: (status) => console.log(status),
|
|
74
|
+
}
|
|
75
|
+
});
|
|
76
|
+
|
|
77
|
+
await anonymizer.initialize(); // Downloads model if needed
|
|
78
|
+
|
|
79
|
+
const result = await anonymizer.anonymize(
|
|
80
|
+
'Hello John Smith from Acme Corp in Berlin!'
|
|
81
|
+
);
|
|
82
|
+
|
|
83
|
+
console.log(result.anonymizedText);
|
|
84
|
+
// "Hello <PII type="PERSON" id="1"/> from <PII type="ORG" id="2"/> in <PII type="LOCATION" id="3"/>!"
|
|
85
|
+
|
|
86
|
+
// Clean up when done
|
|
87
|
+
await anonymizer.dispose();
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
### With Semantic Enrichment
|
|
91
|
+
|
|
92
|
+
Add gender and location scope for better machine translation:
|
|
93
|
+
|
|
94
|
+
```typescript
|
|
95
|
+
import { createAnonymizer } from 'rehydra';
|
|
96
|
+
|
|
97
|
+
const anonymizer = createAnonymizer({
|
|
98
|
+
ner: { mode: 'quantized' },
|
|
99
|
+
semantic: {
|
|
100
|
+
enabled: true, // Downloads ~12 MB of semantic data on first use
|
|
101
|
+
onStatus: (status) => console.log(status),
|
|
102
|
+
}
|
|
103
|
+
});
|
|
104
|
+
|
|
105
|
+
await anonymizer.initialize();
|
|
106
|
+
|
|
107
|
+
const result = await anonymizer.anonymize(
|
|
108
|
+
'Hello Maria Schmidt from Berlin!'
|
|
109
|
+
);
|
|
110
|
+
|
|
111
|
+
console.log(result.anonymizedText);
|
|
112
|
+
// "Hello <PII type="PERSON" gender="female" id="1"/> from <PII type="LOCATION" scope="city" id="2"/>!"
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
## Example: Translation Workflow (Anonymize → Translate → Rehydrate)
|
|
116
|
+
|
|
117
|
+
The full workflow for privacy-preserving translation:
|
|
118
|
+
|
|
119
|
+
```typescript
|
|
120
|
+
import {
|
|
121
|
+
createAnonymizer,
|
|
122
|
+
decryptPIIMap,
|
|
123
|
+
rehydrate,
|
|
124
|
+
InMemoryKeyProvider
|
|
125
|
+
} from 'rehydra';
|
|
126
|
+
|
|
127
|
+
// 1. Create a key provider (required to decrypt later)
|
|
128
|
+
const keyProvider = new InMemoryKeyProvider();
|
|
129
|
+
|
|
130
|
+
// 2. Create anonymizer with key provider
|
|
131
|
+
const anonymizer = createAnonymizer({
|
|
132
|
+
ner: { mode: 'quantized' },
|
|
133
|
+
keyProvider: keyProvider
|
|
134
|
+
});
|
|
135
|
+
|
|
136
|
+
await anonymizer.initialize();
|
|
137
|
+
|
|
138
|
+
// 3. Anonymize before translation
|
|
139
|
+
const original = 'Hello John Smith from Acme Corp in Berlin!';
|
|
140
|
+
const result = await anonymizer.anonymize(original);
|
|
141
|
+
|
|
142
|
+
console.log(result.anonymizedText);
|
|
143
|
+
// "Hello <PII type="PERSON" id="1"/> from <PII type="ORG" id="2"/> in <PII type="LOCATION" id="3"/>!"
|
|
144
|
+
|
|
145
|
+
// 4. Translate (or do other AI workloads that preserve placeholders)
|
|
146
|
+
const translated = await yourAIWorkflow(result.anonymizedText, { from: 'en', to: 'de' });
|
|
147
|
+
// "Hallo <PII type="PERSON" id="1"/> von <PII type="ORG" id="2"/> in <PII type="LOCATION" id="3"/>!"
|
|
148
|
+
|
|
149
|
+
// 5. Decrypt the PII map using the same key
|
|
150
|
+
const encryptionKey = await keyProvider.getKey();
|
|
151
|
+
const piiMap = await decryptPIIMap(result.piiMap, encryptionKey);
|
|
152
|
+
|
|
153
|
+
// 6. Rehydrate - replace placeholders with original values
|
|
154
|
+
const rehydrated = rehydrate(translated, piiMap);
|
|
155
|
+
|
|
156
|
+
console.log(rehydrated);
|
|
157
|
+
// "Hallo John Smith von Acme Corp in Berlin!"
|
|
158
|
+
|
|
159
|
+
// 7. Clean up
|
|
160
|
+
await anonymizer.dispose();
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
### Key Points
|
|
164
|
+
|
|
165
|
+
- **Save the encryption key** - You need the same key to decrypt the PII map
|
|
166
|
+
- **Placeholders are XML-like** - Most translation services preserve them automatically
|
|
167
|
+
- **PII stays local** - Original values never leave your system during translation
|
|
168
|
+
|
|
169
|
+
## API Reference
|
|
170
|
+
|
|
171
|
+
### Configuration Options
|
|
172
|
+
|
|
173
|
+
```typescript
|
|
174
|
+
import { createAnonymizer, InMemoryKeyProvider } from 'rehydra';
|
|
175
|
+
|
|
176
|
+
const anonymizer = createAnonymizer({
|
|
177
|
+
// NER configuration
|
|
178
|
+
ner: {
|
|
179
|
+
mode: 'quantized', // 'standard' | 'quantized' | 'disabled' | 'custom'
|
|
180
|
+
autoDownload: true, // Auto-download model if not present
|
|
181
|
+
onStatus: (status) => {}, // Status messages callback
|
|
182
|
+
onDownloadProgress: (progress) => {
|
|
183
|
+
console.log(`${progress.file}: ${progress.percent}%`);
|
|
184
|
+
},
|
|
185
|
+
|
|
186
|
+
// For 'custom' mode only:
|
|
187
|
+
modelPath: './my-model.onnx',
|
|
188
|
+
vocabPath: './vocab.txt',
|
|
189
|
+
},
|
|
190
|
+
|
|
191
|
+
// Semantic enrichment (adds gender/scope attributes)
|
|
192
|
+
semantic: {
|
|
193
|
+
enabled: true, // Enable MT-friendly attributes
|
|
194
|
+
autoDownload: true, // Auto-download semantic data (~12 MB)
|
|
195
|
+
onStatus: (status) => {},
|
|
196
|
+
onDownloadProgress: (progress) => {},
|
|
197
|
+
},
|
|
198
|
+
|
|
199
|
+
// Encryption key provider
|
|
200
|
+
keyProvider: new InMemoryKeyProvider(),
|
|
201
|
+
|
|
202
|
+
// Custom policy (optional)
|
|
203
|
+
defaultPolicy: { /* see Policy section */ },
|
|
204
|
+
});
|
|
205
|
+
|
|
206
|
+
await anonymizer.initialize();
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
### NER Modes
|
|
210
|
+
|
|
211
|
+
| Mode | Description | Size | Auto-Download |
|
|
212
|
+
|------|-------------|------|---------------|
|
|
213
|
+
| `'disabled'` | No NER, regex only | 0 | N/A |
|
|
214
|
+
| `'quantized'` | Smaller model, ~95% accuracy | ~280 MB | Yes |
|
|
215
|
+
| `'standard'` | Full model, best accuracy | ~1.1 GB | Yes |
|
|
216
|
+
| `'custom'` | Your own ONNX model | Varies | No |
|
|
217
|
+
|
|
218
|
+
### Main Functions
|
|
219
|
+
|
|
220
|
+
#### `createAnonymizer(config?)`
|
|
221
|
+
|
|
222
|
+
Creates a reusable anonymizer instance:
|
|
223
|
+
|
|
224
|
+
```typescript
|
|
225
|
+
const anonymizer = createAnonymizer({
|
|
226
|
+
ner: { mode: 'quantized' }
|
|
227
|
+
});
|
|
228
|
+
|
|
229
|
+
await anonymizer.initialize();
|
|
230
|
+
const result = await anonymizer.anonymize('text');
|
|
231
|
+
await anonymizer.dispose();
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
#### `anonymize(text, locale?, policy?)`
|
|
235
|
+
|
|
236
|
+
One-off anonymization (regex-only by default):
|
|
237
|
+
|
|
238
|
+
```typescript
|
|
239
|
+
import { anonymize } from 'rehydra';
|
|
240
|
+
|
|
241
|
+
const result = await anonymize('Contact test@example.com');
|
|
242
|
+
```
|
|
243
|
+
|
|
244
|
+
#### `anonymizeWithNER(text, nerConfig, policy?)`
|
|
245
|
+
|
|
246
|
+
One-off anonymization with NER:
|
|
247
|
+
|
|
248
|
+
```typescript
|
|
249
|
+
import { anonymizeWithNER } from 'rehydra';
|
|
250
|
+
|
|
251
|
+
const result = await anonymizeWithNER(
|
|
252
|
+
'Hello John Smith',
|
|
253
|
+
{ mode: 'quantized' }
|
|
254
|
+
);
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
#### `anonymizeRegexOnly(text, policy?)`
|
|
258
|
+
|
|
259
|
+
Fast regex-only anonymization:
|
|
260
|
+
|
|
261
|
+
```typescript
|
|
262
|
+
import { anonymizeRegexOnly } from 'rehydra';
|
|
263
|
+
|
|
264
|
+
const result = await anonymizeRegexOnly('Card: 4111111111111111');
|
|
265
|
+
```
|
|
266
|
+
|
|
267
|
+
### Rehydration Functions
|
|
268
|
+
|
|
269
|
+
#### `decryptPIIMap(encryptedMap, key)`
|
|
270
|
+
|
|
271
|
+
Decrypts the PII map for rehydration:
|
|
272
|
+
|
|
273
|
+
```typescript
|
|
274
|
+
import { decryptPIIMap } from 'rehydra';
|
|
275
|
+
|
|
276
|
+
const piiMap = await decryptPIIMap(result.piiMap, encryptionKey);
|
|
277
|
+
// Returns Map<string, string> where key is "PERSON:1" and value is "John Smith"
|
|
278
|
+
```
|
|
279
|
+
|
|
280
|
+
#### `rehydrate(text, piiMap)`
|
|
281
|
+
|
|
282
|
+
Replaces placeholders with original values:
|
|
283
|
+
|
|
284
|
+
```typescript
|
|
285
|
+
import { rehydrate } from 'rehydra';
|
|
286
|
+
|
|
287
|
+
const original = rehydrate(translatedText, piiMap);
|
|
288
|
+
```
|
|
289
|
+
|
|
290
|
+
### Result Structure
|
|
291
|
+
|
|
292
|
+
```typescript
|
|
293
|
+
interface AnonymizationResult {
|
|
294
|
+
// Text with PII replaced by placeholder tags
|
|
295
|
+
anonymizedText: string;
|
|
296
|
+
|
|
297
|
+
// Detected entities (without original text for safety)
|
|
298
|
+
entities: Array<{
|
|
299
|
+
type: PIIType;
|
|
300
|
+
id: number;
|
|
301
|
+
start: number;
|
|
302
|
+
end: number;
|
|
303
|
+
confidence: number;
|
|
304
|
+
source: 'REGEX' | 'NER';
|
|
305
|
+
}>;
|
|
306
|
+
|
|
307
|
+
// Encrypted PII mapping (for later rehydration)
|
|
308
|
+
piiMap: {
|
|
309
|
+
ciphertext: string; // Base64
|
|
310
|
+
iv: string; // Base64
|
|
311
|
+
authTag: string; // Base64
|
|
312
|
+
};
|
|
313
|
+
|
|
314
|
+
// Processing statistics
|
|
315
|
+
stats: {
|
|
316
|
+
countsByType: Record<PIIType, number>;
|
|
317
|
+
totalEntities: number;
|
|
318
|
+
processingTimeMs: number;
|
|
319
|
+
modelVersion: string;
|
|
320
|
+
leakScanPassed?: boolean;
|
|
321
|
+
};
|
|
322
|
+
}
|
|
323
|
+
```
|
|
324
|
+
|
|
325
|
+
## Supported PII Types
|
|
326
|
+
|
|
327
|
+
| Type | Description | Detection | Semantic Attributes |
|
|
328
|
+
|------|-------------|-----------|---------------------|
|
|
329
|
+
| `EMAIL` | Email addresses | Regex | - |
|
|
330
|
+
| `PHONE` | Phone numbers (international) | Regex | - |
|
|
331
|
+
| `IBAN` | International Bank Account Numbers | Regex + Checksum | - |
|
|
332
|
+
| `BIC_SWIFT` | Bank Identifier Codes | Regex | - |
|
|
333
|
+
| `CREDIT_CARD` | Credit card numbers | Regex + Luhn | - |
|
|
334
|
+
| `IP_ADDRESS` | IPv4 and IPv6 addresses | Regex | - |
|
|
335
|
+
| `URL` | Web URLs | Regex | - |
|
|
336
|
+
| `CASE_ID` | Case/ticket numbers | Regex (configurable) | - |
|
|
337
|
+
| `CUSTOMER_ID` | Customer identifiers | Regex (configurable) | - |
|
|
338
|
+
| `PERSON` | Person names | NER | `gender` (male/female/neutral) |
|
|
339
|
+
| `ORG` | Organization names | NER | - |
|
|
340
|
+
| `LOCATION` | Location/place names | NER | `scope` (city/country/region) |
|
|
341
|
+
| `ADDRESS` | Physical addresses | NER | - |
|
|
342
|
+
| `DATE_OF_BIRTH` | Dates of birth | NER | - |
|
|
343
|
+
|
|
344
|
+
## Configuration
|
|
345
|
+
|
|
346
|
+
### Anonymization Policy
|
|
347
|
+
|
|
348
|
+
```typescript
|
|
349
|
+
import { createAnonymizer, PIIType } from 'rehydra';
|
|
350
|
+
|
|
351
|
+
const anonymizer = createAnonymizer({
|
|
352
|
+
ner: { mode: 'quantized' },
|
|
353
|
+
defaultPolicy: {
|
|
354
|
+
// Which PII types to detect
|
|
355
|
+
enabledTypes: new Set([PIIType.EMAIL, PIIType.PHONE, PIIType.PERSON]),
|
|
356
|
+
|
|
357
|
+
// Confidence thresholds per type (0.0 - 1.0)
|
|
358
|
+
confidenceThresholds: new Map([
|
|
359
|
+
[PIIType.PERSON, 0.8],
|
|
360
|
+
[PIIType.EMAIL, 0.5],
|
|
361
|
+
]),
|
|
362
|
+
|
|
363
|
+
// Terms to never treat as PII
|
|
364
|
+
allowlistTerms: new Set(['Customer Service', 'Help Desk']),
|
|
365
|
+
|
|
366
|
+
// Enable semantic enrichment (gender/scope)
|
|
367
|
+
enableSemanticMasking: true,
|
|
368
|
+
|
|
369
|
+
// Enable leak scanning on output
|
|
370
|
+
enableLeakScan: true,
|
|
371
|
+
},
|
|
372
|
+
});
|
|
373
|
+
```
|
|
374
|
+
|
|
375
|
+
### Custom Recognizers
|
|
376
|
+
|
|
377
|
+
Add domain-specific patterns:
|
|
378
|
+
|
|
379
|
+
```typescript
|
|
380
|
+
import { createCustomIdRecognizer, PIIType, createAnonymizer } from 'rehydra';
|
|
381
|
+
|
|
382
|
+
const customRecognizer = createCustomIdRecognizer([
|
|
383
|
+
{
|
|
384
|
+
name: 'Order Number',
|
|
385
|
+
pattern: /\bORD-[A-Z0-9]{8}\b/g,
|
|
386
|
+
type: PIIType.CASE_ID,
|
|
387
|
+
},
|
|
388
|
+
]);
|
|
389
|
+
|
|
390
|
+
const anonymizer = createAnonymizer();
|
|
391
|
+
anonymizer.getRegistry().register(customRecognizer);
|
|
392
|
+
```
|
|
393
|
+
|
|
394
|
+
## Data & Model Storage
|
|
395
|
+
|
|
396
|
+
Models and semantic data are cached locally for offline use.
|
|
397
|
+
|
|
398
|
+
### Node.js Cache Locations
|
|
399
|
+
|
|
400
|
+
| Data | macOS | Linux | Windows |
|
|
401
|
+
|------|-------|-------|---------|
|
|
402
|
+
| NER Models | `~/Library/Caches/rehydra/models/` | `~/.cache/rehydra/models/` | `%LOCALAPPDATA%/rehydra/models/` |
|
|
403
|
+
| Semantic Data | `~/Library/Caches/rehydra/semantic-data/` | `~/.cache/rehydra/semantic-data/` | `%LOCALAPPDATA%/rehydra/semantic-data/` |
|
|
404
|
+
|
|
405
|
+
### Browser Cache
|
|
406
|
+
|
|
407
|
+
In browsers, data is stored using:
|
|
408
|
+
- **IndexedDB**: For semantic data and smaller files
|
|
409
|
+
- **Origin Private File System (OPFS)**: For large model files (~280 MB)
|
|
410
|
+
|
|
411
|
+
Data persists across page reloads and browser sessions.
|
|
412
|
+
|
|
413
|
+
### Manual Data Management
|
|
414
|
+
|
|
415
|
+
```typescript
|
|
416
|
+
import {
|
|
417
|
+
// Model management
|
|
418
|
+
isModelDownloaded,
|
|
419
|
+
downloadModel,
|
|
420
|
+
clearModelCache,
|
|
421
|
+
listDownloadedModels,
|
|
422
|
+
|
|
423
|
+
// Semantic data management
|
|
424
|
+
isSemanticDataDownloaded,
|
|
425
|
+
downloadSemanticData,
|
|
426
|
+
clearSemanticDataCache,
|
|
427
|
+
} from 'rehydra';
|
|
428
|
+
|
|
429
|
+
// Check if model is downloaded
|
|
430
|
+
const hasModel = await isModelDownloaded('quantized');
|
|
431
|
+
|
|
432
|
+
// Manually download model with progress
|
|
433
|
+
await downloadModel('quantized', (progress) => {
|
|
434
|
+
console.log(`${progress.file}: ${progress.percent}%`);
|
|
435
|
+
});
|
|
436
|
+
|
|
437
|
+
// Check semantic data
|
|
438
|
+
const hasSemanticData = await isSemanticDataDownloaded();
|
|
439
|
+
|
|
440
|
+
// List downloaded models
|
|
441
|
+
const models = await listDownloadedModels();
|
|
442
|
+
|
|
443
|
+
// Clear caches
|
|
444
|
+
await clearModelCache('quantized'); // or clearModelCache() for all
|
|
445
|
+
await clearSemanticDataCache();
|
|
446
|
+
```
|
|
447
|
+
|
|
448
|
+
## Encryption & Security
|
|
449
|
+
|
|
450
|
+
The PII map is encrypted using **AES-256-GCM** via the Web Crypto API (works in both Node.js and browsers).
|
|
451
|
+
|
|
452
|
+
### Key Providers
|
|
453
|
+
|
|
454
|
+
```typescript
|
|
455
|
+
import {
|
|
456
|
+
InMemoryKeyProvider, // For development/testing
|
|
457
|
+
ConfigKeyProvider, // For production with pre-configured key
|
|
458
|
+
KeyProvider, // Interface for custom implementations
|
|
459
|
+
generateKey,
|
|
460
|
+
} from 'rehydra';
|
|
461
|
+
|
|
462
|
+
// Development: In-memory key (generates random key, lost on page refresh)
|
|
463
|
+
const devKeyProvider = new InMemoryKeyProvider();
|
|
464
|
+
|
|
465
|
+
// Production: Pre-configured key
|
|
466
|
+
// Generate key: openssl rand -base64 32
|
|
467
|
+
const keyBase64 = process.env.PII_ENCRYPTION_KEY; // or read from config
|
|
468
|
+
const prodKeyProvider = new ConfigKeyProvider(keyBase64);
|
|
469
|
+
|
|
470
|
+
// Custom: Implement KeyProvider interface
|
|
471
|
+
class SecureKeyProvider implements KeyProvider {
|
|
472
|
+
async getKey(): Promise<Uint8Array> {
|
|
473
|
+
// Retrieve from secure storage, HSM, keychain, etc.
|
|
474
|
+
return await getKeyFromSecureStorage();
|
|
475
|
+
}
|
|
476
|
+
}
|
|
477
|
+
```
|
|
478
|
+
|
|
479
|
+
### Security Best Practices
|
|
480
|
+
|
|
481
|
+
- **Never log the raw PII map** - Always use encrypted storage
|
|
482
|
+
- **Persist the encryption key securely** - Use platform keystores (iOS Keychain, Android Keystore, etc.)
|
|
483
|
+
- **Rotate keys** - Implement key rotation for long-running applications
|
|
484
|
+
- **Enable leak scanning** - Catch any missed PII in output
|
|
485
|
+
|
|
486
|
+
## Browser Usage
|
|
487
|
+
|
|
488
|
+
The library works seamlessly in browsers without any special configuration.
|
|
489
|
+
|
|
490
|
+
### Basic Browser Example
|
|
491
|
+
|
|
492
|
+
```html
|
|
493
|
+
<!DOCTYPE html>
|
|
494
|
+
<html>
|
|
495
|
+
<head>
|
|
496
|
+
<title>PII Anonymization</title>
|
|
497
|
+
</head>
|
|
498
|
+
<body>
|
|
499
|
+
<script type="module">
|
|
500
|
+
import {
|
|
501
|
+
createAnonymizer,
|
|
502
|
+
InMemoryKeyProvider,
|
|
503
|
+
decryptPIIMap,
|
|
504
|
+
rehydrate
|
|
505
|
+
} from './node_modules/rehydra/dist/index.js';
|
|
506
|
+
|
|
507
|
+
async function demo() {
|
|
508
|
+
// Create anonymizer
|
|
509
|
+
const keyProvider = new InMemoryKeyProvider();
|
|
510
|
+
const anonymizer = createAnonymizer({
|
|
511
|
+
ner: {
|
|
512
|
+
mode: 'quantized',
|
|
513
|
+
onStatus: (s) => console.log('NER:', s),
|
|
514
|
+
onDownloadProgress: (p) => console.log(`Download: ${p.percent}%`)
|
|
515
|
+
},
|
|
516
|
+
semantic: { enabled: true },
|
|
517
|
+
keyProvider
|
|
518
|
+
});
|
|
519
|
+
|
|
520
|
+
// Initialize (downloads models on first use)
|
|
521
|
+
await anonymizer.initialize();
|
|
522
|
+
|
|
523
|
+
// Anonymize
|
|
524
|
+
const result = await anonymizer.anonymize(
|
|
525
|
+
'Contact Maria Schmidt at maria@example.com in Berlin.'
|
|
526
|
+
);
|
|
527
|
+
|
|
528
|
+
console.log('Anonymized:', result.anonymizedText);
|
|
529
|
+
// "Contact <PII type="PERSON" gender="female" id="1"/> at <PII type="EMAIL" id="2"/> in <PII type="LOCATION" scope="city" id="3"/>."
|
|
530
|
+
|
|
531
|
+
// Rehydrate
|
|
532
|
+
const key = await keyProvider.getKey();
|
|
533
|
+
const piiMap = await decryptPIIMap(result.piiMap, key);
|
|
534
|
+
const original = rehydrate(result.anonymizedText, piiMap);
|
|
535
|
+
|
|
536
|
+
console.log('Rehydrated:', original);
|
|
537
|
+
|
|
538
|
+
await anonymizer.dispose();
|
|
539
|
+
}
|
|
540
|
+
|
|
541
|
+
demo().catch(console.error);
|
|
542
|
+
</script>
|
|
543
|
+
</body>
|
|
544
|
+
</html>
|
|
545
|
+
```
|
|
546
|
+
|
|
547
|
+
### Browser Notes
|
|
548
|
+
|
|
549
|
+
- **First-use downloads**: NER model (~280 MB) and semantic data (~12 MB) are downloaded on first use
|
|
550
|
+
- **ONNX runtime**: Automatically loaded from CDN if not bundled
|
|
551
|
+
- **Offline support**: After initial download, everything works offline
|
|
552
|
+
- **Storage**: Uses IndexedDB and OPFS - data persists across sessions
|
|
553
|
+
|
|
554
|
+
## Bun Support
|
|
555
|
+
|
|
556
|
+
This library works with [Bun](https://bun.sh). Because `onnxruntime-node` is a native Node.js addon, the library uses `onnxruntime-web` when running under Bun, so install it alongside `rehydra`:
|
|
557
|
+
|
|
558
|
+
```bash
|
|
559
|
+
bun add rehydra onnxruntime-web
|
|
560
|
+
```
|
|
561
|
+
|
|
562
|
+
Usage is identical - the library auto-detects the runtime.
|
|
563
|
+
|
|
564
|
+
## Performance
|
|
565
|
+
|
|
566
|
+
| Component | Time (2K chars) | Notes |
|
|
567
|
+
|-----------|-----------------|-------|
|
|
568
|
+
| Regex pass | ~5 ms | All regex recognizers |
|
|
569
|
+
| NER inference | ~100-150 ms | Quantized model |
|
|
570
|
+
| Semantic enrichment | ~1-2 ms | After data loaded |
|
|
571
|
+
| Total pipeline | ~150-200 ms | Full anonymization |
|
|
572
|
+
|
|
573
|
+
| Model | Size | First-Use Download |
|
|
574
|
+
|-------|------|-------------------|
|
|
575
|
+
| Quantized | ~280 MB | ~30s on fast connection |
|
|
576
|
+
| Standard | ~1.1 GB | ~2min on fast connection |
|
|
577
|
+
| Semantic Data | ~12 MB | ~5s on fast connection |
|
|
578
|
+
|
|
579
|
+
## Requirements
|
|
580
|
+
|
|
581
|
+
| Environment | Version | Notes |
|
|
582
|
+
|-------------|---------|-------|
|
|
583
|
+
| Node.js | >= 18.0.0 | Uses native `onnxruntime-node` |
|
|
584
|
+
| Bun | >= 1.0.0 | Requires `onnxruntime-web` |
|
|
585
|
+
| Browsers | Chrome 86+, Firefox 89+, Safari 15.4+, Edge 86+ | Uses OPFS for model storage |
|
|
586
|
+
|
|
587
|
+
## Development
|
|
588
|
+
|
|
589
|
+
```bash
|
|
590
|
+
# Install dependencies
|
|
591
|
+
npm install
|
|
592
|
+
|
|
593
|
+
# Run tests
|
|
594
|
+
npm test
|
|
595
|
+
|
|
596
|
+
# Build
|
|
597
|
+
npm run build
|
|
598
|
+
|
|
599
|
+
# Lint
|
|
600
|
+
npm run lint
|
|
601
|
+
```
|
|
602
|
+
|
|
603
|
+
### Building Custom Models
|
|
604
|
+
|
|
605
|
+
For development or custom models:
|
|
606
|
+
|
|
607
|
+
```bash
|
|
608
|
+
# Requires Python 3.8+
|
|
609
|
+
npm run setup:ner # Standard model
|
|
610
|
+
npm run setup:ner:quantized # Quantized model
|
|
611
|
+
```
|
|
612
|
+
|
|
613
|
+
## License
|
|
614
|
+
|
|
615
|
+
MIT
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/crypto/index.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,cAAc,qBAAqB,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/crypto/index.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,cAAc,qBAAqB,CAAC"}
|