rehydra 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (155)
  1. package/LICENSE +22 -0
  2. package/README.md +615 -0
  3. package/dist/crypto/index.d.ts +6 -0
  4. package/dist/crypto/index.d.ts.map +1 -0
  5. package/dist/crypto/index.js +6 -0
  6. package/dist/crypto/index.js.map +1 -0
  7. package/dist/crypto/pii-map-crypto.d.ts +114 -0
  8. package/dist/crypto/pii-map-crypto.d.ts.map +1 -0
  9. package/dist/crypto/pii-map-crypto.js +228 -0
  10. package/dist/crypto/pii-map-crypto.js.map +1 -0
  11. package/dist/index.d.ts +180 -0
  12. package/dist/index.d.ts.map +1 -0
  13. package/dist/index.js +384 -0
  14. package/dist/index.js.map +1 -0
  15. package/dist/ner/bio-decoder.d.ts +64 -0
  16. package/dist/ner/bio-decoder.d.ts.map +1 -0
  17. package/dist/ner/bio-decoder.js +216 -0
  18. package/dist/ner/bio-decoder.js.map +1 -0
  19. package/dist/ner/index.d.ts +10 -0
  20. package/dist/ner/index.d.ts.map +1 -0
  21. package/dist/ner/index.js +10 -0
  22. package/dist/ner/index.js.map +1 -0
  23. package/dist/ner/model-manager.d.ts +111 -0
  24. package/dist/ner/model-manager.d.ts.map +1 -0
  25. package/dist/ner/model-manager.js +325 -0
  26. package/dist/ner/model-manager.js.map +1 -0
  27. package/dist/ner/ner-model.d.ts +114 -0
  28. package/dist/ner/ner-model.d.ts.map +1 -0
  29. package/dist/ner/ner-model.js +253 -0
  30. package/dist/ner/ner-model.js.map +1 -0
  31. package/dist/ner/onnx-runtime.d.ts +46 -0
  32. package/dist/ner/onnx-runtime.d.ts.map +1 -0
  33. package/dist/ner/onnx-runtime.js +130 -0
  34. package/dist/ner/onnx-runtime.js.map +1 -0
  35. package/dist/ner/tokenizer.d.ts +118 -0
  36. package/dist/ner/tokenizer.d.ts.map +1 -0
  37. package/dist/ner/tokenizer.js +332 -0
  38. package/dist/ner/tokenizer.js.map +1 -0
  39. package/dist/pipeline/index.d.ts +12 -0
  40. package/dist/pipeline/index.d.ts.map +1 -0
  41. package/dist/pipeline/index.js +12 -0
  42. package/dist/pipeline/index.js.map +1 -0
  43. package/dist/pipeline/prenormalize.d.ts +48 -0
  44. package/dist/pipeline/prenormalize.d.ts.map +1 -0
  45. package/dist/pipeline/prenormalize.js +94 -0
  46. package/dist/pipeline/prenormalize.js.map +1 -0
  47. package/dist/pipeline/resolver.d.ts +56 -0
  48. package/dist/pipeline/resolver.d.ts.map +1 -0
  49. package/dist/pipeline/resolver.js +239 -0
  50. package/dist/pipeline/resolver.js.map +1 -0
  51. package/dist/pipeline/semantic-data-loader.d.ts +165 -0
  52. package/dist/pipeline/semantic-data-loader.d.ts.map +1 -0
  53. package/dist/pipeline/semantic-data-loader.js +655 -0
  54. package/dist/pipeline/semantic-data-loader.js.map +1 -0
  55. package/dist/pipeline/semantic-enricher.d.ts +112 -0
  56. package/dist/pipeline/semantic-enricher.d.ts.map +1 -0
  57. package/dist/pipeline/semantic-enricher.js +318 -0
  58. package/dist/pipeline/semantic-enricher.js.map +1 -0
  59. package/dist/pipeline/tagger.d.ts +114 -0
  60. package/dist/pipeline/tagger.d.ts.map +1 -0
  61. package/dist/pipeline/tagger.js +374 -0
  62. package/dist/pipeline/tagger.js.map +1 -0
  63. package/dist/pipeline/title-extractor.d.ts +79 -0
  64. package/dist/pipeline/title-extractor.d.ts.map +1 -0
  65. package/dist/pipeline/title-extractor.js +801 -0
  66. package/dist/pipeline/title-extractor.js.map +1 -0
  67. package/dist/pipeline/validator.d.ts +65 -0
  68. package/dist/pipeline/validator.d.ts.map +1 -0
  69. package/dist/pipeline/validator.js +264 -0
  70. package/dist/pipeline/validator.js.map +1 -0
  71. package/dist/recognizers/base.d.ts +78 -0
  72. package/dist/recognizers/base.d.ts.map +1 -0
  73. package/dist/recognizers/base.js +100 -0
  74. package/dist/recognizers/base.js.map +1 -0
  75. package/dist/recognizers/bic-swift.d.ts +10 -0
  76. package/dist/recognizers/bic-swift.d.ts.map +1 -0
  77. package/dist/recognizers/bic-swift.js +107 -0
  78. package/dist/recognizers/bic-swift.js.map +1 -0
  79. package/dist/recognizers/credit-card.d.ts +32 -0
  80. package/dist/recognizers/credit-card.d.ts.map +1 -0
  81. package/dist/recognizers/credit-card.js +160 -0
  82. package/dist/recognizers/credit-card.js.map +1 -0
  83. package/dist/recognizers/custom-id.d.ts +28 -0
  84. package/dist/recognizers/custom-id.d.ts.map +1 -0
  85. package/dist/recognizers/custom-id.js +116 -0
  86. package/dist/recognizers/custom-id.js.map +1 -0
  87. package/dist/recognizers/email.d.ts +10 -0
  88. package/dist/recognizers/email.d.ts.map +1 -0
  89. package/dist/recognizers/email.js +75 -0
  90. package/dist/recognizers/email.js.map +1 -0
  91. package/dist/recognizers/iban.d.ts +14 -0
  92. package/dist/recognizers/iban.d.ts.map +1 -0
  93. package/dist/recognizers/iban.js +67 -0
  94. package/dist/recognizers/iban.js.map +1 -0
  95. package/dist/recognizers/index.d.ts +20 -0
  96. package/dist/recognizers/index.d.ts.map +1 -0
  97. package/dist/recognizers/index.js +42 -0
  98. package/dist/recognizers/index.js.map +1 -0
  99. package/dist/recognizers/ip-address.d.ts +14 -0
  100. package/dist/recognizers/ip-address.d.ts.map +1 -0
  101. package/dist/recognizers/ip-address.js +183 -0
  102. package/dist/recognizers/ip-address.js.map +1 -0
  103. package/dist/recognizers/phone.d.ts +10 -0
  104. package/dist/recognizers/phone.d.ts.map +1 -0
  105. package/dist/recognizers/phone.js +145 -0
  106. package/dist/recognizers/phone.js.map +1 -0
  107. package/dist/recognizers/registry.d.ts +59 -0
  108. package/dist/recognizers/registry.d.ts.map +1 -0
  109. package/dist/recognizers/registry.js +113 -0
  110. package/dist/recognizers/registry.js.map +1 -0
  111. package/dist/recognizers/url.d.ts +14 -0
  112. package/dist/recognizers/url.d.ts.map +1 -0
  113. package/dist/recognizers/url.js +121 -0
  114. package/dist/recognizers/url.js.map +1 -0
  115. package/dist/types/index.d.ts +197 -0
  116. package/dist/types/index.d.ts.map +1 -0
  117. package/dist/types/index.js +80 -0
  118. package/dist/types/index.js.map +1 -0
  119. package/dist/types/pii-types.d.ts +50 -0
  120. package/dist/types/pii-types.d.ts.map +1 -0
  121. package/dist/types/pii-types.js +114 -0
  122. package/dist/types/pii-types.js.map +1 -0
  123. package/dist/utils/iban-checksum.d.ts +23 -0
  124. package/dist/utils/iban-checksum.d.ts.map +1 -0
  125. package/dist/utils/iban-checksum.js +106 -0
  126. package/dist/utils/iban-checksum.js.map +1 -0
  127. package/dist/utils/index.d.ts +10 -0
  128. package/dist/utils/index.d.ts.map +1 -0
  129. package/dist/utils/index.js +10 -0
  130. package/dist/utils/index.js.map +1 -0
  131. package/dist/utils/luhn.d.ts +17 -0
  132. package/dist/utils/luhn.d.ts.map +1 -0
  133. package/dist/utils/luhn.js +55 -0
  134. package/dist/utils/luhn.js.map +1 -0
  135. package/dist/utils/offsets.d.ts +86 -0
  136. package/dist/utils/offsets.d.ts.map +1 -0
  137. package/dist/utils/offsets.js +124 -0
  138. package/dist/utils/offsets.js.map +1 -0
  139. package/dist/utils/path.d.ts +34 -0
  140. package/dist/utils/path.d.ts.map +1 -0
  141. package/dist/utils/path.js +96 -0
  142. package/dist/utils/path.js.map +1 -0
  143. package/dist/utils/storage-browser.d.ts +51 -0
  144. package/dist/utils/storage-browser.d.ts.map +1 -0
  145. package/dist/utils/storage-browser.js +381 -0
  146. package/dist/utils/storage-browser.js.map +1 -0
  147. package/dist/utils/storage-node.d.ts +43 -0
  148. package/dist/utils/storage-node.d.ts.map +1 -0
  149. package/dist/utils/storage-node.js +93 -0
  150. package/dist/utils/storage-node.js.map +1 -0
  151. package/dist/utils/storage.d.ts +70 -0
  152. package/dist/utils/storage.d.ts.map +1 -0
  153. package/dist/utils/storage.js +69 -0
  154. package/dist/utils/storage.js.map +1 -0
  155. package/package.json +66 -0
package/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 ELAN Languages
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
22
+
package/README.md ADDED
@@ -0,0 +1,615 @@
1
+ # Rehydra
2
+
3
+ ![License](https://img.shields.io/github/license/rehydra-ai/rehydra)
4
+ ![Issues](https://img.shields.io/github/issues/rehydra-ai/rehydra)
5
+ [![codecov](https://codecov.io/github/rehydra-ai/rehydra/graph/badge.svg?token=WX5RI0ZZJG)](https://codecov.io/github/rehydra-ai/rehydra)
6
+
7
+ On-device PII anonymization module for high-privacy AI workflows. Detects and replaces Personally Identifiable Information (PII) with placeholder tags while maintaining an encrypted mapping for later rehydration.
8
+
9
+ **Works in Node.js, Bun, and browsers** - zero server-side dependencies required.
10
+
11
+ ## Features
12
+
13
+ - **Structured PII Detection**: Regex-based detection for emails, phones, IBANs, credit cards, IPs, URLs
14
+ - **Soft PII Detection**: ONNX-powered NER model for names, organizations, locations (auto-downloads on first use if enabled)
15
+ - **Semantic Enrichment**: Tags friendly to AI and machine translation (MT), with gender/location attributes for better translations
16
+ - **Secure PII Mapping**: AES-256-GCM encrypted storage of original PII values
17
+ - **Cross-Platform**: Works identically in Node.js, Bun, and browsers
18
+ - **Configurable Policies**: Customizable detection rules, thresholds, and allowlists
19
+ - **Validation & Leak Scanning**: Built-in validation and optional leak detection
20
+
21
+ ## Installation
22
+
23
+ ### Node.js / Bun
24
+
25
+ ```bash
26
+ npm install rehydra  # Bun: also add onnxruntime-web (see "Bun Support" below)
27
+ ```
28
+
29
+ ### Browser (with bundler)
30
+
31
+ ```bash
32
+ npm install rehydra onnxruntime-web
33
+ ```
34
+
35
+ ### Browser (without bundler)
36
+
37
+ ```html
38
+ <script type="module">
39
+ // Import directly from your dist folder or CDN
40
+ import { createAnonymizer } from './node_modules/rehydra/dist/index.js';
41
+
42
+ // onnxruntime-web is automatically loaded from CDN when needed
43
+ </script>
44
+ ```
45
+
46
+ ## Quick Start
47
+
48
+ ### Regex-Only Mode (No Downloads Required)
49
+
50
+ For structured PII like emails, phones, IBANs, credit cards:
51
+
52
+ ```typescript
53
+ import { anonymizeRegexOnly } from 'rehydra';
54
+
55
+ const result = await anonymizeRegexOnly(
56
+ 'Contact john@example.com or call +49 30 123456. IBAN: DE89370400440532013000'
57
+ );
58
+
59
+ console.log(result.anonymizedText);
60
+ // "Contact <PII type="EMAIL" id="1"/> or call <PII type="PHONE" id="2"/>. IBAN: <PII type="IBAN" id="3"/>"
61
+ ```
62
+
63
+ ### Full Mode with NER (Detects Names, Organizations, Locations)
64
+
65
+ The NER model is automatically downloaded on first use (~280 MB for quantized):
66
+
67
+ ```typescript
68
+ import { createAnonymizer } from 'rehydra';
69
+
70
+ const anonymizer = createAnonymizer({
71
+ ner: {
72
+ mode: 'quantized', // or 'standard' for full model (~1.1 GB)
73
+ onStatus: (status) => console.log(status),
74
+ }
75
+ });
76
+
77
+ await anonymizer.initialize(); // Downloads model if needed
78
+
79
+ const result = await anonymizer.anonymize(
80
+ 'Hello John Smith from Acme Corp in Berlin!'
81
+ );
82
+
83
+ console.log(result.anonymizedText);
84
+ // "Hello <PII type="PERSON" id="1"/> from <PII type="ORG" id="2"/> in <PII type="LOCATION" id="3"/>!"
85
+
86
+ // Clean up when done
87
+ await anonymizer.dispose();
88
+ ```
89
+
90
+ ### With Semantic Enrichment
91
+
92
+ Add gender and location scope for better machine translation:
93
+
94
+ ```typescript
95
+ import { createAnonymizer } from 'rehydra';
96
+
97
+ const anonymizer = createAnonymizer({
98
+ ner: { mode: 'quantized' },
99
+ semantic: {
100
+ enabled: true, // Downloads ~12 MB of semantic data on first use
101
+ onStatus: (status) => console.log(status),
102
+ }
103
+ });
104
+
105
+ await anonymizer.initialize();
106
+
107
+ const result = await anonymizer.anonymize(
108
+ 'Hello Maria Schmidt from Berlin!'
109
+ );
110
+
111
+ console.log(result.anonymizedText);
112
+ // "Hello <PII type="PERSON" gender="female" id="1"/> from <PII type="LOCATION" scope="city" id="2"/>!"
113
+ ```
114
+
115
+ ## Example: Translation Workflow (Anonymize → Translate → Rehydrate)
116
+
117
+ The full workflow for privacy-preserving translation:
118
+
119
+ ```typescript
120
+ import {
121
+ createAnonymizer,
122
+ decryptPIIMap,
123
+ rehydrate,
124
+ InMemoryKeyProvider
125
+ } from 'rehydra';
126
+
127
+ // 1. Create a key provider (required to decrypt later)
128
+ const keyProvider = new InMemoryKeyProvider();
129
+
130
+ // 2. Create anonymizer with key provider
131
+ const anonymizer = createAnonymizer({
132
+ ner: { mode: 'quantized' },
133
+ keyProvider: keyProvider
134
+ });
135
+
136
+ await anonymizer.initialize();
137
+
138
+ // 3. Anonymize before translation
139
+ const original = 'Hello John Smith from Acme Corp in Berlin!';
140
+ const result = await anonymizer.anonymize(original);
141
+
142
+ console.log(result.anonymizedText);
143
+ // "Hello <PII type="PERSON" id="1"/> from <PII type="ORG" id="2"/> in <PII type="LOCATION" id="3"/>!"
144
+
145
+ // 4. Translate (or run any other AI workload that preserves placeholders)
146
+ const translated = await yourAIWorkflow(result.anonymizedText, { from: 'en', to: 'de' });
147
+ // "Hallo <PII type="PERSON" id="1"/> von <PII type="ORG" id="2"/> in <PII type="LOCATION" id="3"/>!"
148
+
149
+ // 5. Decrypt the PII map using the same key
150
+ const encryptionKey = await keyProvider.getKey();
151
+ const piiMap = await decryptPIIMap(result.piiMap, encryptionKey);
152
+
153
+ // 6. Rehydrate - replace placeholders with original values
154
+ const rehydrated = rehydrate(translated, piiMap);
155
+
156
+ console.log(rehydrated);
157
+ // "Hallo John Smith von Acme Corp in Berlin!"
158
+
159
+ // 7. Clean up
160
+ await anonymizer.dispose();
161
+ ```
162
+
163
+ ### Key Points
164
+
165
+ - **Save the encryption key** - You need the same key to decrypt the PII map
166
+ - **Placeholders are XML-like** - Most translation services preserve them automatically
167
+ - **PII stays local** - Original values never leave your system during translation
168
+
169
+ ## API Reference
170
+
171
+ ### Configuration Options
172
+
173
+ ```typescript
174
+ import { createAnonymizer, InMemoryKeyProvider } from 'rehydra';
175
+
176
+ const anonymizer = createAnonymizer({
177
+ // NER configuration
178
+ ner: {
179
+ mode: 'quantized', // 'standard' | 'quantized' | 'disabled' | 'custom'
180
+ autoDownload: true, // Auto-download model if not present
181
+ onStatus: (status) => {}, // Status messages callback
182
+ onDownloadProgress: (progress) => {
183
+ console.log(`${progress.file}: ${progress.percent}%`);
184
+ },
185
+
186
+ // For 'custom' mode only:
187
+ modelPath: './my-model.onnx',
188
+ vocabPath: './vocab.txt',
189
+ },
190
+
191
+ // Semantic enrichment (adds gender/scope attributes)
192
+ semantic: {
193
+ enabled: true, // Enable MT-friendly attributes
194
+ autoDownload: true, // Auto-download semantic data (~12 MB)
195
+ onStatus: (status) => {},
196
+ onDownloadProgress: (progress) => {},
197
+ },
198
+
199
+ // Encryption key provider
200
+ keyProvider: new InMemoryKeyProvider(),
201
+
202
+ // Custom policy (optional)
203
+ defaultPolicy: { /* see Policy section */ },
204
+ });
205
+
206
+ await anonymizer.initialize();
207
+ ```
208
+
209
+ ### NER Modes
210
+
211
+ | Mode | Description | Size | Auto-Download |
212
+ |------|-------------|------|---------------|
213
+ | `'disabled'` | No NER, regex only | 0 | N/A |
214
+ | `'quantized'` | Smaller model, ~95% accuracy | ~280 MB | Yes |
215
+ | `'standard'` | Full model, best accuracy | ~1.1 GB | Yes |
216
+ | `'custom'` | Your own ONNX model | Varies | No |
217
+
218
+ ### Main Functions
219
+
220
+ #### `createAnonymizer(config?)`
221
+
222
+ Creates a reusable anonymizer instance:
223
+
224
+ ```typescript
225
+ const anonymizer = createAnonymizer({
226
+ ner: { mode: 'quantized' }
227
+ });
228
+
229
+ await anonymizer.initialize();
230
+ const result = await anonymizer.anonymize('text');
231
+ await anonymizer.dispose();
232
+ ```
233
+
234
+ #### `anonymize(text, locale?, policy?)`
235
+
236
+ One-off anonymization (regex-only by default):
237
+
238
+ ```typescript
239
+ import { anonymize } from 'rehydra';
240
+
241
+ const result = await anonymize('Contact test@example.com');
242
+ ```
243
+
244
+ #### `anonymizeWithNER(text, nerConfig, policy?)`
245
+
246
+ One-off anonymization with NER:
247
+
248
+ ```typescript
249
+ import { anonymizeWithNER } from 'rehydra';
250
+
251
+ const result = await anonymizeWithNER(
252
+ 'Hello John Smith',
253
+ { mode: 'quantized' }
254
+ );
255
+ ```
256
+
257
+ #### `anonymizeRegexOnly(text, policy?)`
258
+
259
+ Fast regex-only anonymization:
260
+
261
+ ```typescript
262
+ import { anonymizeRegexOnly } from 'rehydra';
263
+
264
+ const result = await anonymizeRegexOnly('Card: 4111111111111111');
265
+ ```
266
+
267
+ ### Rehydration Functions
268
+
269
+ #### `decryptPIIMap(encryptedMap, key)`
270
+
271
+ Decrypts the PII map for rehydration:
272
+
273
+ ```typescript
274
+ import { decryptPIIMap } from 'rehydra';
275
+
276
+ const piiMap = await decryptPIIMap(result.piiMap, encryptionKey);
277
+ // Returns Map<string, string> where key is "PERSON:1" and value is "John Smith"
278
+ ```
279
+
280
+ #### `rehydrate(text, piiMap)`
281
+
282
+ Replaces placeholders with original values:
283
+
284
+ ```typescript
285
+ import { rehydrate } from 'rehydra';
286
+
287
+ const original = rehydrate(translatedText, piiMap);
288
+ ```
289
+
290
+ ### Result Structure
291
+
292
+ ```typescript
293
+ interface AnonymizationResult {
294
+ // Text with PII replaced by placeholder tags
295
+ anonymizedText: string;
296
+
297
+ // Detected entities (without original text for safety)
298
+ entities: Array<{
299
+ type: PIIType;
300
+ id: number;
301
+ start: number;
302
+ end: number;
303
+ confidence: number;
304
+ source: 'REGEX' | 'NER';
305
+ }>;
306
+
307
+ // Encrypted PII mapping (for later rehydration)
308
+ piiMap: {
309
+ ciphertext: string; // Base64
310
+ iv: string; // Base64
311
+ authTag: string; // Base64
312
+ };
313
+
314
+ // Processing statistics
315
+ stats: {
316
+ countsByType: Record<PIIType, number>;
317
+ totalEntities: number;
318
+ processingTimeMs: number;
319
+ modelVersion: string;
320
+ leakScanPassed?: boolean;
321
+ };
322
+ }
323
+ ```
324
+
325
+ ## Supported PII Types
326
+
327
+ | Type | Description | Detection | Semantic Attributes |
328
+ |------|-------------|-----------|---------------------|
329
+ | `EMAIL` | Email addresses | Regex | - |
330
+ | `PHONE` | Phone numbers (international) | Regex | - |
331
+ | `IBAN` | International Bank Account Numbers | Regex + Checksum | - |
332
+ | `BIC_SWIFT` | Bank Identifier Codes | Regex | - |
333
+ | `CREDIT_CARD` | Credit card numbers | Regex + Luhn | - |
334
+ | `IP_ADDRESS` | IPv4 and IPv6 addresses | Regex | - |
335
+ | `URL` | Web URLs | Regex | - |
336
+ | `CASE_ID` | Case/ticket numbers | Regex (configurable) | - |
337
+ | `CUSTOMER_ID` | Customer identifiers | Regex (configurable) | - |
338
+ | `PERSON` | Person names | NER | `gender` (male/female/neutral) |
339
+ | `ORG` | Organization names | NER | - |
340
+ | `LOCATION` | Location/place names | NER | `scope` (city/country/region) |
341
+ | `ADDRESS` | Physical addresses | NER | - |
342
+ | `DATE_OF_BIRTH` | Dates of birth | NER | - |
343
+
344
+ ## Configuration
345
+
346
+ ### Anonymization Policy
347
+
348
+ ```typescript
349
+ import { createAnonymizer, PIIType } from 'rehydra';
350
+
351
+ const anonymizer = createAnonymizer({
352
+ ner: { mode: 'quantized' },
353
+ defaultPolicy: {
354
+ // Which PII types to detect
355
+ enabledTypes: new Set([PIIType.EMAIL, PIIType.PHONE, PIIType.PERSON]),
356
+
357
+ // Confidence thresholds per type (0.0 - 1.0)
358
+ confidenceThresholds: new Map([
359
+ [PIIType.PERSON, 0.8],
360
+ [PIIType.EMAIL, 0.5],
361
+ ]),
362
+
363
+ // Terms to never treat as PII
364
+ allowlistTerms: new Set(['Customer Service', 'Help Desk']),
365
+
366
+ // Enable semantic enrichment (gender/scope)
367
+ enableSemanticMasking: true,
368
+
369
+ // Enable leak scanning on output
370
+ enableLeakScan: true,
371
+ },
372
+ });
373
+ ```
374
+
375
+ ### Custom Recognizers
376
+
377
+ Add domain-specific patterns:
378
+
379
+ ```typescript
380
+ import { createCustomIdRecognizer, PIIType, createAnonymizer } from 'rehydra';
381
+
382
+ const customRecognizer = createCustomIdRecognizer([
383
+ {
384
+ name: 'Order Number',
385
+ pattern: /\bORD-[A-Z0-9]{8}\b/g,
386
+ type: PIIType.CASE_ID,
387
+ },
388
+ ]);
389
+
390
+ const anonymizer = createAnonymizer();
391
+ anonymizer.getRegistry().register(customRecognizer);
392
+ ```
393
+
394
+ ## Data & Model Storage
395
+
396
+ Models and semantic data are cached locally for offline use.
397
+
398
+ ### Node.js Cache Locations
399
+
400
+ | Data | macOS | Linux | Windows |
401
+ |------|-------|-------|---------|
402
+ | NER Models | `~/Library/Caches/rehydra/models/` | `~/.cache/rehydra/models/` | `%LOCALAPPDATA%/rehydra/models/` |
403
+ | Semantic Data | `~/Library/Caches/rehydra/semantic-data/` | `~/.cache/rehydra/semantic-data/` | `%LOCALAPPDATA%/rehydra/semantic-data/` |
404
+
405
+ ### Browser Cache
406
+
407
+ In browsers, data is stored using:
408
+ - **IndexedDB**: For semantic data and smaller files
409
+ - **Origin Private File System (OPFS)**: For large model files (~280 MB)
410
+
411
+ Data persists across page reloads and browser sessions.
412
+
413
+ ### Manual Data Management
414
+
415
+ ```typescript
416
+ import {
417
+ // Model management
418
+ isModelDownloaded,
419
+ downloadModel,
420
+ clearModelCache,
421
+ listDownloadedModels,
422
+
423
+ // Semantic data management
424
+ isSemanticDataDownloaded,
425
+ downloadSemanticData,
426
+ clearSemanticDataCache,
427
+ } from 'rehydra';
428
+
429
+ // Check if model is downloaded
430
+ const hasModel = await isModelDownloaded('quantized');
431
+
432
+ // Manually download model with progress
433
+ await downloadModel('quantized', (progress) => {
434
+ console.log(`${progress.file}: ${progress.percent}%`);
435
+ });
436
+
437
+ // Check semantic data
438
+ const hasSemanticData = await isSemanticDataDownloaded();
439
+
440
+ // List downloaded models
441
+ const models = await listDownloadedModels();
442
+
443
+ // Clear caches
444
+ await clearModelCache('quantized'); // or clearModelCache() for all
445
+ await clearSemanticDataCache();
446
+ ```
447
+
448
+ ## Encryption & Security
449
+
450
+ The PII map is encrypted using **AES-256-GCM** via the Web Crypto API (works in both Node.js and browsers).
451
+
452
+ ### Key Providers
453
+
454
+ ```typescript
455
+ import {
456
+ InMemoryKeyProvider, // For development/testing
457
+ ConfigKeyProvider, // For production with pre-configured key
458
+ KeyProvider, // Interface for custom implementations
459
+ generateKey,
460
+ } from 'rehydra';
461
+
462
+ // Development: In-memory key (generates a random key; lost on page refresh or process exit)
463
+ const devKeyProvider = new InMemoryKeyProvider();
464
+
465
+ // Production: Pre-configured key
466
+ // Generate key: openssl rand -base64 32
467
+ const keyBase64 = process.env.PII_ENCRYPTION_KEY; // or read from config
468
+ const prodKeyProvider = new ConfigKeyProvider(keyBase64);
469
+
470
+ // Custom: Implement KeyProvider interface
471
+ class SecureKeyProvider implements KeyProvider {
472
+ async getKey(): Promise<Uint8Array> {
473
+ // Retrieve from secure storage, HSM, keychain, etc.
474
+ return await getKeyFromSecureStorage();
475
+ }
476
+ }
477
+ ```
478
+
479
+ ### Security Best Practices
480
+
481
+ - **Never log the raw PII map** - Always use encrypted storage
482
+ - **Persist the encryption key securely** - Use platform keystores (iOS Keychain, Android Keystore, etc.)
483
+ - **Rotate keys** - Implement key rotation for long-running applications
484
+ - **Enable leak scanning** - Catch any missed PII in output
485
+
486
+ ## Browser Usage
487
+
488
+ The library works seamlessly in browsers without any special configuration.
489
+
490
+ ### Basic Browser Example
491
+
492
+ ```html
493
+ <!DOCTYPE html>
494
+ <html>
495
+ <head>
496
+ <title>PII Anonymization</title>
497
+ </head>
498
+ <body>
499
+ <script type="module">
500
+ import {
501
+ createAnonymizer,
502
+ InMemoryKeyProvider,
503
+ decryptPIIMap,
504
+ rehydrate
505
+ } from './node_modules/rehydra/dist/index.js';
506
+
507
+ async function demo() {
508
+ // Create anonymizer
509
+ const keyProvider = new InMemoryKeyProvider();
510
+ const anonymizer = createAnonymizer({
511
+ ner: {
512
+ mode: 'quantized',
513
+ onStatus: (s) => console.log('NER:', s),
514
+ onDownloadProgress: (p) => console.log(`Download: ${p.percent}%`)
515
+ },
516
+ semantic: { enabled: true },
517
+ keyProvider
518
+ });
519
+
520
+ // Initialize (downloads models on first use)
521
+ await anonymizer.initialize();
522
+
523
+ // Anonymize
524
+ const result = await anonymizer.anonymize(
525
+ 'Contact Maria Schmidt at maria@example.com in Berlin.'
526
+ );
527
+
528
+ console.log('Anonymized:', result.anonymizedText);
529
+ // "Contact <PII type="PERSON" gender="female" id="1"/> at <PII type="EMAIL" id="2"/> in <PII type="LOCATION" scope="city" id="3"/>."
530
+
531
+ // Rehydrate
532
+ const key = await keyProvider.getKey();
533
+ const piiMap = await decryptPIIMap(result.piiMap, key);
534
+ const original = rehydrate(result.anonymizedText, piiMap);
535
+
536
+ console.log('Rehydrated:', original);
537
+
538
+ await anonymizer.dispose();
539
+ }
540
+
541
+ demo().catch(console.error);
542
+ </script>
543
+ </body>
544
+ </html>
545
+ ```
546
+
547
+ ### Browser Notes
548
+
549
+ - **First-use downloads**: NER model (~280 MB) and semantic data (~12 MB) are downloaded on first use
550
+ - **ONNX runtime**: Automatically loaded from CDN if not bundled
551
+ - **Offline support**: After initial download, everything works offline
552
+ - **Storage**: Uses IndexedDB and OPFS - data persists across sessions
553
+
554
+ ## Bun Support
555
+
556
+ This library works with [Bun](https://bun.sh). Since `onnxruntime-node` is a native Node.js addon, the library uses `onnxruntime-web` when running under Bun, so install it alongside `rehydra`:
557
+
558
+ ```bash
559
+ bun add rehydra onnxruntime-web
560
+ ```
561
+
562
+ Usage is identical - the library auto-detects the runtime.
563
+
564
+ ## Performance
565
+
566
+ | Component | Time (2K chars) | Notes |
567
+ |-----------|-----------------|-------|
568
+ | Regex pass | ~5 ms | All regex recognizers |
569
+ | NER inference | ~100-150 ms | Quantized model |
570
+ | Semantic enrichment | ~1-2 ms | After data loaded |
571
+ | Total pipeline | ~150-200 ms | Full anonymization |
572
+
573
+ | Model | Size | First-Use Download |
574
+ |-------|------|-------------------|
575
+ | Quantized | ~280 MB | ~30s on fast connection |
576
+ | Standard | ~1.1 GB | ~2min on fast connection |
577
+ | Semantic Data | ~12 MB | ~5s on fast connection |
578
+
579
+ ## Requirements
580
+
581
+ | Environment | Version | Notes |
582
+ |-------------|---------|-------|
583
+ | Node.js | >= 18.0.0 | Uses native `onnxruntime-node` |
584
+ | Bun | >= 1.0.0 | Requires `onnxruntime-web` |
585
+ | Browsers | Chrome 86+, Firefox 89+, Safari 15.4+, Edge 86+ | Uses OPFS for model storage |
586
+
587
+ ## Development
588
+
589
+ ```bash
590
+ # Install dependencies
591
+ npm install
592
+
593
+ # Run tests
594
+ npm test
595
+
596
+ # Build
597
+ npm run build
598
+
599
+ # Lint
600
+ npm run lint
601
+ ```
602
+
603
+ ### Building Custom Models
604
+
605
+ For development or custom models:
606
+
607
+ ```bash
608
+ # Requires Python 3.8+
609
+ npm run setup:ner # Standard model
610
+ npm run setup:ner:quantized # Quantized model
611
+ ```
612
+
613
+ ## License
614
+
615
+ MIT
package/dist/crypto/index.d.ts ADDED
@@ -0,0 +1,6 @@
1
+ /**
2
+ * Crypto Module
3
+ * Exports encryption utilities for PII map
4
+ */
5
+ export * from './pii-map-crypto.js';
6
+ //# sourceMappingURL=index.d.ts.map
package/dist/crypto/index.d.ts.map ADDED
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/crypto/index.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,cAAc,qBAAqB,CAAC"}
package/dist/crypto/index.js ADDED
@@ -0,0 +1,6 @@
1
+ /**
2
+ * Crypto Module
3
+ * Exports encryption utilities for PII map
4
+ */
5
+ export * from './pii-map-crypto.js';
6
+ //# sourceMappingURL=index.js.map
package/dist/crypto/index.js.map ADDED
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/crypto/index.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,cAAc,qBAAqB,CAAC"}