@nationaldesignstudio/rampart 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +402 -0
- package/MODEL_CARD.md +422 -0
- package/README.md +279 -0
- package/RELEASE.md +97 -0
- package/WHITEPAPER.md +316 -0
- package/dist/index.d.ts +23 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +35639 -0
- package/dist/index.js.map +36 -0
- package/dist/src/guard.d.ts +94 -0
- package/dist/src/guard.d.ts.map +1 -0
- package/dist/src/heuristics.d.ts +14 -0
- package/dist/src/heuristics.d.ts.map +1 -0
- package/dist/src/ner/classifier.d.ts +92 -0
- package/dist/src/ner/classifier.d.ts.map +1 -0
- package/dist/src/ner/worker.d.ts +44 -0
- package/dist/src/ner/worker.d.ts.map +1 -0
- package/dist/src/ner/worker.js +35302 -0
- package/dist/src/ner/worker.js.map +30 -0
- package/dist/src/pipeline.d.ts +76 -0
- package/dist/src/pipeline.d.ts.map +1 -0
- package/dist/src/policy.d.ts +27 -0
- package/dist/src/policy.d.ts.map +1 -0
- package/dist/src/premask.d.ts +48 -0
- package/dist/src/premask.d.ts.map +1 -0
- package/dist/src/session.d.ts +60 -0
- package/dist/src/session.d.ts.map +1 -0
- package/dist/src/streaming.d.ts +32 -0
- package/dist/src/streaming.d.ts.map +1 -0
- package/dist/src/types.d.ts +43 -0
- package/dist/src/types.d.ts.map +1 -0
- package/dist/src/validators.d.ts +16 -0
- package/dist/src/validators.d.ts.map +1 -0
- package/eval/bench/README.md +91 -0
- package/eval/bench/fetch.ts +152 -0
- package/eval/bench/labels.ts +45 -0
- package/eval/bench/run.ts +146 -0
- package/eval/bench/runs/m06-v3-30k/by_language.json +303 -0
- package/eval/bench/runs/m06-v3-30k/summary.json +56 -0
- package/eval/bench/runs/sample-900/by_language.json +303 -0
- package/eval/bench/runs/sample-900/manifest.json +926 -0
- package/eval/bench/runs/sample-900/summary.json +56 -0
- package/eval/bench/score.ts +197 -0
- package/eval/bench/webgpu/entry.ts +70 -0
- package/eval/bench/webgpu/index.html +12 -0
- package/eval/bench/webgpu.ts +209 -0
- package/eval/public-cases.ts +412 -0
- package/eval/run-public-eval.ts +140 -0
- package/examples/basic-chat.ts +12 -0
- package/examples/pii-worker.ts +3 -0
- package/index.ts +47 -0
- package/package.json +103 -0
- package/src/guard.ts +170 -0
- package/src/heuristics.ts +141 -0
- package/src/ner/classifier.ts +580 -0
- package/src/ner/worker.ts +130 -0
- package/src/policy.ts +64 -0
- package/src/premask.ts +90 -0
- package/src/session.ts +99 -0
- package/src/streaming.ts +73 -0
- package/src/types.ts +74 -0
- package/src/validators.ts +40 -0
package/WHITEPAPER.md
ADDED
|
@@ -0,0 +1,316 @@
|
|
|
1
|
+
# Rampart — A Local-First System for PII Redaction
|
|
2
|
+
|
|
3
|
+
## Abstract
|
|
4
|
+
|
|
5
|
+
We describe **Rampart**, a local-first system for removing personally identifiable information from user-typed text before it leaves the browser.
|
|
6
|
+
The system combines a 14.7 MB ONNX token-classification model with a deterministic recognizer layer; together the two layers form the first gate of a defense-in-depth pipeline.
|
|
7
|
+
|
|
8
|
+
This release is scoped to the **seven Latin-script languages** the model was trained on (English, Spanish, French, German, Italian, Portuguese, Dutch); names in non-Latin scripts are out of scope (see §8).
|
|
9
|
+
Across all seven languages, on a 30,000-row held-out test set drawn from the OpenPII 1.5M corpus, the full system achieves **98.42% private-term recall** (Wilson 95% CI [98.35, 98.49]) at 6.6 ms median latency in Node ONNX, with 91.7% public-term retention under term-presence scoring (and >99% under policy-aware scoring after schema reconciliation with the gold labels).
|
|
10
|
+
On the English+Spanish slice the system reaches **98.85%** recall.
|
|
11
|
+
In the browser, over a held-out OpenPII slice, the shipped pipeline runs at **3.9 ms p50** on WebGPU (Apple Metal) and 12.6 ms on WASM, measured by the `eval/bench/webgpu.ts` harness (latency is hardware-dependent; reproduce with `bun run bench:webgpu`).
|
|
12
|
+
|
|
13
|
+
Rampart is **harm reduction**, not perfect protection.
|
|
14
|
+
Any privacy system that depends on a trusted server-side environment inherits that environment's risk: once unredacted text leaves the device — to a model provider, a logging pipeline, an analytics SDK, or a future infrastructure breach — it is exposed to failures beyond the user's control.
|
|
15
|
+
Redacting in the browser shifts that trust boundary: the impact of any downstream failure is bounded by what the client already removed, because a later component cannot leak what was never sent.
|
|
16
|
+
No model this small catches every instance of PII, so we deliberately chose a recall-biased operating point that still runs entirely in the browser on low-end hardware, offline, with no network round trip — a meaningful layer of defense in depth, not a guarantee.
|
|
17
|
+
|
|
18
|
+
This document describes the architecture, the alternatives we evaluated and rejected, our training and evaluation methodology, operating-point and calibration analysis, known failure modes, and the licensing under which the system is released.
|
|
19
|
+
|
|
20
|
+
## 1. Design goals
|
|
21
|
+
|
|
22
|
+
The system is built around four constraints:
|
|
23
|
+
|
|
24
|
+
1. **Local-first privacy as harm reduction.** Personal information is removed before reaching application infrastructure. Data the server never receives cannot be leaked by a model provider, a logging system, an analytics pipeline, a third-party SDK, or a future compromise of the application's own backend. This is the threat model that motivates the entire system: every other constraint follows from refusing to put the user in a position where they have to trust a remote operator with their unredacted text.
|
|
25
|
+
2. **Browser-deployable.** The shipped artifact must fit on a low-end mobile phone. Targeting under 15 MB on the wire ruled out most modern PII NER models, including GLiNER (≈50 MB) and DistilBERT-based detectors (≈64 MB).
|
|
26
|
+
3. **Recall-biased.** Misses leak data, so the default policy redacts whenever a detector fires above its calibrated threshold. Over-redaction has a real cost too — a chat assistant that cannot see context cannot help — but that cost is smaller than the cost of a leak.
|
|
27
|
+
4. **Domain-aware retention.** Useful assistants often need rough context, like coarse geography, to be helpful. The keep-set (`{CITY, STATE, ZIP_CODE}`) is policy-driven so applications can tune the boundary between privacy and utility without retraining; the precise street line is redacted.
|
|
28
|
+
|
|
29
|
+
## 2. Architecture
|
|
30
|
+
|
|
31
|
+
Rampart ships as two cooperating layers that run in parallel and merge their outputs.
|
|
32
|
+
Each layer is designed to cover a class of failures the other layer cannot reliably handle on its own.
|
|
33
|
+
Both run entirely in-browser.
|
|
34
|
+
|
|
35
|
+
### 2.1 The deterministic recognizer layer
|
|
36
|
+
|
|
37
|
+
Real-world PII often has structure that is faster and more reliable to validate than to learn.
|
|
38
|
+
The deterministic layer is a curated set of regular expressions paired with checksum and structural validators.
|
|
39
|
+
It owns five classes end-to-end:
|
|
40
|
+
|
|
41
|
+
- **Luhn checksum** for payment cards, matched over the digit projection so every separator form collapses to one rule.
|
|
42
|
+
- **SSN structural rules** that reject reserved areas (000, 666, 9XX) and ZIP+4 codes that pattern-match the SSN shape.
|
|
43
|
+
- **Pattern-backed detection** for email addresses, URLs, and IP addresses (IPv4 and IPv6, plus MAC hardware addresses), where the structure lives in the punctuation.
|
|
44
|
+
|
|
45
|
+
These detectors are synchronous and run on the raw input, so this structured PII can be removed even before the model has loaded.
|
|
46
|
+
Because cards and SSNs use checksums and structural rules rather than only shape matching, false-positive rates are very low: a 16-digit number that fails Luhn is not redacted as a card; a 9-digit number with a reserved-area prefix is not redacted as an SSN.
|
|
47
|
+
Classes with no checksum — phone, routing, tax, government-ID, passport, and license numbers, and street-address components — are deliberately left to the model rather than guessed at with a regex.
|
|
48
|
+
|
|
49
|
+
### 2.2 The token-classification model
|
|
50
|
+
|
|
51
|
+
The deterministic layer cannot recognize contextual PII — names, phone numbers, account/routing/tax numbers, government IDs, passports, licenses, and free-form address components.
|
|
52
|
+
For these, Rampart uses a MiniLM-L6-H384 encoder fine-tuned on a 35-label BIO head (17 entity types).
|
|
53
|
+
The tokenizer is an uncased WordPiece tokenizer trimmed to 19,730 pieces (from BERT-uncased's 30,522).
|
|
54
|
+
Single-character pieces are always retained, which preserves WordPiece's character-level fallback for rare names and out-of-vocabulary content.
|
|
55
|
+
|
|
56
|
+
### 2.3 Span repair
|
|
57
|
+
|
|
58
|
+
HuggingFace's standard `aggregation_strategy="simple"` produces fragmented spans when subword B-`GIVEN_NAME` probabilities outrank I-`GIVEN_NAME` inside a name, e.g. "Zaccarino" tokenized as "Zac"+"##car"+"##ino" can come back as three separate name spans.
|
|
59
|
+
Rampart applies three layers of post-processing:
|
|
60
|
+
|
|
61
|
+
1. **Adjacent-span merging** collapses consecutive same-label spans separated only by name-internal punctuation (space, hyphen, apostrophe, period, comma).
|
|
62
|
+
2. **Iterative bridge-and-merge** rescues low-confidence candidates (score between the 0.15 extend floor and the 0.4 keep threshold) when they bridge two high-confidence spans of the same label. This catches names like "Jose [LOW] de [HIGH] Garcia" where a particle scored below the cutoff but is structurally part of the name.
|
|
63
|
+
3. **Capitalized-particle rescue** grows name spans (`GIVEN_NAME` / `SURNAME`) to swallow short capitalized name particles ("De la", "Von", "Mc") sitting inside a name's flow.
|
|
64
|
+
|
|
65
|
+
This composition lifts span-F1 to 0.53 strict (IoU=1.0) and 0.66 relaxed (IoU≥0.5) on the 30,000-row test set, well above the fragmented spans the default aggregation produces.
|
|
66
|
+
|
|
67
|
+
### 2.4 Defense in depth
|
|
68
|
+
|
|
69
|
+
The two layers are complementary, not redundant. Each owns the classes the other handles poorly:
|
|
70
|
+
|
|
71
|
+
| Layer | Owns | Why |
|
|
72
|
+
| ------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
|
73
|
+
| Deterministic | `SSN`, `CREDIT_CARD`, `IP_ADDRESS`, `EMAIL`, `URL` | Checksum/structural validation (cards, SSNs) or punctuation-anchored patterns (email, URL, IP) give near-perfect precision and recall, including under orthographic perturbation. |
|
|
74
|
+
| Model | `GIVEN_NAME`, `SURNAME`, `PHONE`, `TAX_ID`, `BANK_ACCOUNT`, `ROUTING_NUMBER`, `GOVERNMENT_ID`, `PASSPORT`, `DRIVERS_LICENSE`, `BUILDING_NUMBER`, `STREET_NAME`, `SECONDARY_ADDRESS`, `CITY`, `STATE`, `ZIP_CODE` | Open-ended, context-dependent, and free of any checksum to validate against — exactly what an enumerable rule set cannot keep up with. |
|
|
75
|
+
|
|
76
|
+
The model handles open-ended cases whose shapes the regex catalog cannot anticipate; the deterministic layer covers the structured classes where exact-character recall matters, as well as adversarial inputs: a Luhn-valid card is still caught under any perturbation of the surrounding text.
|
|
77
|
+
|
|
78
|
+
The system unions the two layers' spans before applying policy, so a token redacted by either layer is redacted in output.
|
|
79
|
+
This is why full-system private-term recall exceeds that of the model alone on the structured classes the deterministic layer owns, while the model raises the ceiling on classes the deterministic layer cannot enumerate.
|
|
80
|
+
|
|
81
|
+
### 2.5 Policy
|
|
82
|
+
|
|
83
|
+
Every detected span carries a label.
|
|
84
|
+
The policy layer applies **default-deny**: each label is redacted unless explicitly in the keep-set.
|
|
85
|
+
The default keep-set is `{CITY, STATE, ZIP_CODE}` so an assistant can reason about coarse geography and eligibility while the precise street line (`BUILDING_NUMBER` + `STREET_NAME`) and secondary-address line (`SECONDARY_ADDRESS`) are redacted alongside names, identifiers, and contact information.
|
|
86
|
+
The keep-set is a compile-time set (`KEEP_LABELS` in `src/types.ts`), not a runtime flag.
|
|
87
|
+
|
|
88
|
+
### 2.6 Session table
|
|
89
|
+
|
|
90
|
+
A client-only session table maps each detected raw value to a stable placeholder:
|
|
91
|
+
|
|
92
|
+
```
|
|
93
|
+
Maria Garcia → [GIVEN_NAME_1] [SURNAME_1]
|
|
94
|
+
555-11-2222 → [SSN_1]
|
|
95
|
+
88 Pine Avenue → [BUILDING_NUMBER_1] [STREET_NAME_1]
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
The model provider sees only the placeholders.
|
|
99
|
+
The user sees the restored response.
|
|
100
|
+
Placeholders are intentionally formatted to look obviously synthetic so a model provider cannot easily de-anonymize them through downstream inference.
|
|
101
|
+
The table is never transmitted.
|
|
102
|
+
|
|
103
|
+
## 3. Methodology
|
|
104
|
+
|
|
105
|
+
### 3.1 The candidate space
|
|
106
|
+
|
|
107
|
+
Model selection was not a one-shot grid.
|
|
108
|
+
The shipped model is the product of a sustained, instrumented search: many fine-tuning runs across base architecture, corpus composition, label schema, vocabulary trim, prefilter strategy, and quantization, run over the project's lifetime.
|
|
109
|
+
The eval harness (§3.4) was the instrument that drove it.
|
|
110
|
+
Every run was scored end-to-end against the held-out set, and the corpus and schema were evolved in whatever direction the metrics said was working.
|
|
111
|
+
The relabeling and corpus update decisions in §3.5 are themselves validated outputs of that loop.
|
|
112
|
+
|
|
113
|
+
One representative round of that search — the selection matrix detailed in §4 — varied four axes over their Cartesian product:
|
|
114
|
+
|
|
115
|
+
- **Base architecture (2).** MiniLM-L6-H384-uncased vs ELECTRA-small. ELECTRA is attractive in size (its trimmed Q4 artifact is ~10 MB versus MiniLM's ~19 MB at full vocab) but, as §4.1 shows, it did not turn additional data into accuracy the way MiniLM did.
|
|
116
|
+
- **Prefilter (2).** Whether the deterministic layer's structured classes (`SSN`, `CREDIT_CARD`, `IP_ADDRESS`) are premasked before the model classifier runs or trained as real model classes (the no-prefilter ablation). Prefilter-on is the shipped configuration; the model never spends capacity on digits a checksum already settles.
|
|
117
|
+
- **Data mix (3).** Cumulative: the synthetic conversation corpus alone, then plus an OCR'd-document corpus, then plus the public AI4Privacy template corpus.
|
|
118
|
+
- **Corpus volume (2).** The first 100k vs the first 250k synthetic conversations.
|
|
119
|
+
|
|
120
|
+
That is 2 × 3 × 2 = 12 cells per base (prefilter × data mix × corpus volume), or 24 trained models across the two bases, scored under both runtime modes, with each cell exported to its Q4 and several vocab-trimmed artifacts.
|
|
121
|
+
§4 reports it and the directional findings that came out of it; this section defines the rules under which every cell was scored.
|
|
122
|
+
It is one round of the larger, ongoing search, not its entirety.
|
|
123
|
+
|
|
124
|
+
### 3.2 Pre-registered eval design
|
|
125
|
+
|
|
126
|
+
The eval design is fixed before a round's numbers are read.
|
|
127
|
+
The governing principle is recall-biased and size-aware:
|
|
128
|
+
|
|
129
|
+
> Among candidates that clear the private-term recall floor on the held-out set, ship the smallest — unless a documented reason (multilingual coverage, adversarial robustness, a fairness regression) justifies overriding.
|
|
130
|
+
|
|
131
|
+
### 3.3 Datasets
|
|
132
|
+
|
|
133
|
+
| Dataset | Rows | Use |
|
|
134
|
+
| ----------------------------------- | ----- | ----------------------------------------------------------------------------------------------------------------- |
|
|
135
|
+
| OpenPII calibration | 10k | Recall-floor threshold tuning |
|
|
136
|
+
| OpenPII held-out test (7 languages) | 30k | Headline test (en, es, fr, de, it, pt, nl) |
|
|
137
|
+
| Per-language slices | — | English (11,569) and Spanish (3,234) reported separately; remaining five in the per-language table (§ model card) |
|
|
138
|
+
| Fairness | 1,875 | Faker × 15 naming traditions × 5 templates |
|
|
139
|
+
|
|
140
|
+
All OpenPII splits are drawn deterministically from the held-out 100k partition of the OpenPII 1.5M corpus, disjoint from training.
|
|
141
|
+
The shipped headline is measured across all seven supported languages; the per-language counts are the natural language distribution of that 30k slice (see model card).
|
|
142
|
+
|
|
143
|
+
### 3.4 Metrics
|
|
144
|
+
|
|
145
|
+
- **Private-term recall**: for every gold private value, is the value absent from the redacted output? Wilson 95% CI; bootstrap 1000-resample CI for stratified breakdowns.
|
|
146
|
+
- **Public-term retention**: for every gold public value, did the redacted output preserve the value?
|
|
147
|
+
- **Span-level F1**: strict at IoU=1.0; relaxed at IoU≥0.5; overlap at IoU>0. One-to-one greedy matching, higher-scored predictions match first.
|
|
148
|
+
- **Latency**: Node.js ONNX runtime cold / p50 / p95 / p99 over the full 30,000-row test set.
|
|
149
|
+
- **Calibration**: 15-bin reliability expected calibration error (ECE), per label and overall, computed on per-span max-class scores.
|
|
150
|
+
|
|
151
|
+
#### Headline results, per language
|
|
152
|
+
|
|
153
|
+
Full system (model + deterministic layer + policy) on the 30,000-row held-out test, scored end-to-end by the committed `eval/bench` harness:
|
|
154
|
+
|
|
155
|
+
| Language | Rows | Private recall | Public retention | Leaks / private terms |
|
|
156
|
+
| --------------- | ---------: | -------------: | ---------------: | --------------------: |
|
|
157
|
+
| English (en) | 11,569 | 98.85% | 90.5% | 618 / 53,877 |
|
|
158
|
+
| Spanish (es) | 3,234 | 98.84% | 91.6% | 160 / 13,736 |
|
|
159
|
+
| French (fr) | 4,708 | 98.41% | 92.8% | 317 / 19,906 |
|
|
160
|
+
| German (de) | 4,260 | 97.94% | 91.7% | 357 / 17,347 |
|
|
161
|
+
| Italian (it) | 3,218 | 97.83% | 94.1% | 301 / 13,855 |
|
|
162
|
+
| Portuguese (pt) | 1,485 | 97.73% | 92.5% | 147 / 6,467 |
|
|
163
|
+
| Dutch (nl) | 1,526 | 97.21% | 91.9% | 182 / 6,519 |
|
|
164
|
+
| **All seven** | **30,000** | **98.42%** | **91.69%** | **2,082 / 131,707** |
|
|
165
|
+
|
|
166
|
+
Recall is term-presence (did every gold private value vanish from the output); retention is term-presence over gold public values, with the keep-set (city/state/ZIP) credited as kept. The seven-language aggregate recall carries a Wilson 95% CI of [98.35, 98.49].
|
|
167
|
+
|
|
168
|
+
### 3.5 Label schema and training-data design
|
|
169
|
+
|
|
170
|
+
The shipped model reflects four design decisions that postdate the candidate sweep above.
|
|
171
|
+
All four are about _what_ the model is asked to learn, not which checkpoint to ship.
|
|
172
|
+
|
|
173
|
+
**Atomic label decomposition.**
|
|
174
|
+
Earlier iterations used coarse, pre-combined labels — a single `STREET_ADDRESS`, a single `PERSON`, plus `ORGANIZATION`, `LOCATION`, `DATE`, `AGE`, `INCOME`, and a catch-all `SECRET`.
|
|
175
|
+
A model trained on those overfits to the easy case where a name or address arrives as one tidy, well-formed blob.
|
|
176
|
+
The shipped schema instead forces everything to atomic pieces: names split into `GIVEN_NAME` / `SURNAME`; the street line into `BUILDING_NUMBER` / `STREET_NAME`; geography into `CITY` / `STATE` / `ZIP_CODE`; and document identifiers into their specific classes (`TAX_ID`, `BANK_ACCOUNT`, `ROUTING_NUMBER`, `GOVERNMENT_ID`, `PASSPORT`, `DRIVERS_LICENSE`) rather than a generic `SECRET`.
|
|
177
|
+
This trains the model to recognize PII _fragments_ in disordered text — a building number on one line, a street name three messages later — instead of expecting a textbook one-line address.
|
|
178
|
+
|
|
179
|
+
**Dates, ages, and income are non-PII.**
|
|
180
|
+
These were dropped from the redact-set entirely and map to `O` (kept context).
|
|
181
|
+
A public-benefits assistant needs to reason about age and income to be useful, and a bare date is rarely identifying on its own; classifying them as redactable was over-redaction that hurt utility without a matching privacy gain.
|
|
182
|
+
The keep-set proper is `{CITY, STATE, ZIP_CODE}`; dates/ages/income are simply not modeled as PII.
|
|
183
|
+
|
|
184
|
+
**Premask train/serve symmetry.**
|
|
185
|
+
The structured classes the deterministic layer owns (`SSN`, `CREDIT_CARD`, `IP_ADDRESS`) are replaced with sentinel tokens _before_ the model sees the text, both at inference (`src/premask.ts`) and during dataset construction.
|
|
186
|
+
The model therefore never spends capacity learning to classify raw card/SSN/IP digits — those are a solved problem for a checksum — and the train-time and inference-time input distributions match by construction.
|
|
187
|
+
|
|
188
|
+
**A deliberately noisy corpus.** OpenPII supplies broad multilingual entropy, but its conversations are clean and well-formed.
|
|
189
|
+
To keep the redactor from overfitting to tidy inputs, the synthetic portion of the corpus is generated to be messy and realistic on purpose: low-effort and typo-prone text, voice-dictated phrasing, values pasted out of forms, multilingual mixing, and contradictory, duplicated, or wrong-field entries, produced across a range of assistant personas so the user-side language varies.
|
|
190
|
+
The aim is a model that catches partial, fragmented PII in real chatbot text rather than only in clean examples.
|
|
191
|
+
|
|
192
|
+
## 4. Alternatives we tried
|
|
193
|
+
|
|
194
|
+
### 4.1 Base architecture: MiniLM over ELECTRA
|
|
195
|
+
|
|
196
|
+
The first axis we settled was the encoder.
|
|
197
|
+
ELECTRA-small was the strongest size contender — its trimmed, Q4 artifact is ~10 MB against MiniLM's ~19 MB at full vocab — which for a browser-deployed model is a real pull.
|
|
198
|
+
We ran the entire selection matrix (§4.2) twice, once on each base, on identical data and schema.
|
|
199
|
+
|
|
200
|
+
ELECTRA did not turn extra data into accuracy the way MiniLM did.
|
|
201
|
+
Its eval loss bottomed out early and then crept up as the corpus grew, and the matrix bears this out: every one of ELECTRA's strongest cells used the _smaller_ (100k-conversation) data slice, and adding the larger slice produced its _worst_ cells, not its best.
|
|
202
|
+
MiniLM, on the same axes, reached its top result on the larger slice.
|
|
203
|
+
For a project whose whole thesis is a corpus that keeps growing (§3.5), an encoder that stops improving — or regresses — as data grows has no headroom for the loop we rely on.
|
|
204
|
+
We shipped MiniLM-L6-H384-uncased and kept ELECTRA on the shelf as a size lever for a hypothetical future release willing to accept that ceiling.
|
|
205
|
+
|
|
206
|
+
### 4.2 The selection matrix
|
|
207
|
+
|
|
208
|
+
To choose the data recipe and the prefilter strategy, we ran a Cartesian ablation rather than tuning one variable at a time.
|
|
209
|
+
Twelve cells per base — **prefilter** {on, off} × **data mix** {synthetic-only, +OCR'd documents, +AI4Privacy templates} × **corpus volume** {100k, 250k synthetic conversations} — each fine-tuned, exported to its Q4 and vocab-trimmed artifacts, and scored _end-to-end through its matching runtime_ (prefilter-on cells run with the deterministic layer; prefilter-off cells run the model raw).
|
|
210
|
+
Across both bases that is 24 trained models, scored under both runtime modes.
|
|
211
|
+
|
|
212
|
+
Scoring used a deliberately hard internal development suite — 55 hand-written chat cases (66 private terms) skewed toward the failure modes we most wanted to catch: hyphenated and particled names, non-Latin-script names, address fragments, government identifiers, split and dotted contact details.
|
|
213
|
+
It is **not** the shipped held-out metric (the 98.42% headline is measured on 30k OpenPII rows, §3.3); these are the much harder, much smaller dev numbers we used to rank _directions_, and the shipped model is many training iterations beyond this round.
|
|
214
|
+
The matched-runtime results:
|
|
215
|
+
|
|
216
|
+
| Base | Best cell (data mix / volume / prefilter) | Dev recall | Worst cell |
|
|
217
|
+
| ------- | ----------------------------------------- | ---------: | -------------------------------------------------------------------- |
|
|
218
|
+
| MiniLM | +AI4Privacy / 250k / prefilter-on | **80.3%** | 21.2% (no-prefilter, 250k synthetic, full mix — a training collapse) |
|
|
219
|
+
| ELECTRA | +AI4Privacy / 100k / prefilter-on | 81.8% | 42.4% (no-prefilter, 250k, synthetic-only) |
|
|
220
|
+
|
|
221
|
+
Three findings drove the recipe:
|
|
222
|
+
|
|
223
|
+
1. **The full data mix produced every top cell.** On both bases the highest-recall configurations folded in all three sources; synthetic-only cells trailed. Breadth of corpus, not volume of one source, moved the needle.
|
|
224
|
+
2. **MiniLM scaled with data; ELECTRA did not** (§4.1). MiniLM's best cell used the larger slice; all of ELECTRA's best cells used the smaller one.
|
|
225
|
+
3. **One cell collapsed.** The MiniLM no-prefilter / 250k synthetic / full-mix cell scored 21.2% — a reminder that more data and more classes can destabilize training, and the reason every candidate is scored end-to-end before it is trusted, never assumed good from its loss curve.
|
|
226
|
+
|
|
227
|
+
The prefilter-on configuration won on its merits here and matches the runtime: the deterministic layer is a checksum away from perfect on the digit classes, so making the model relearn them only adds variance.
|
|
228
|
+
|
|
229
|
+
### 4.3 Prefolded normalization
|
|
230
|
+
|
|
231
|
+
All training rows pass through the same normalization the runtime applies before tokenization: lowercase, NFKD decomposition, and combining-mark stripping.
|
|
232
|
+
The combining-mark step is what folds accents — `José` becomes `jose`, `Müller` becomes `muller`, `François` becomes `francois` — so the model sees a single canonical form regardless of how the user typed the name.
|
|
233
|
+
|
|
234
|
+
We do this for two reasons.
|
|
235
|
+
First, BERT's BasicTokenizer already performs the same fold implicitly at inference time under `do_lower_case=True` with default accent-stripping, so prefolding the training data makes the train-time and runtime distributions identical by construction; without it, the model would learn token sequences containing combining marks that the runtime tokenizer would never emit.
|
|
236
|
+
Second, accent collapse is a robustness property we want: a user who types `Jose` and a user who types `José` should be redacted identically, and an attacker who substitutes one for the other to evade detection should fail.
|
|
237
|
+
Prefolding bakes that property into the training distribution rather than relying on the runtime to recover it after the fact.
|
|
238
|
+
|
|
239
|
+
A guard in the training pipeline fails the run if a future tokenizer change breaks this assumption, so retraining with a cased base model cannot silently desync the two normalizations.
|
|
240
|
+
Because prefolding already produces the canonical form, separate accent-augmentation (training on both `José` and `Jose` as distinct strings) is disabled — it would be a no-op against an already-folded corpus.
|
|
241
|
+
|
|
242
|
+
## 5. Comparison to public PII baselines
|
|
243
|
+
|
|
244
|
+
During model selection we ran a one-off comparison of Rampart against public PII systems — a community BERT-small detector, Microsoft Presidio + spaCy, GLiNER small v2.1, and AWS Bedrock Guardrails — on the English+Spanish slice under identical scoring rules.
|
|
245
|
+
On that slice Rampart reached 98.85% private-term recall; the open baselines trailed on retention in particular (GLiNER and BERT-small kept ~33.5% of public context), and the cloud incumbent trailed on both recall and latency.
|
|
246
|
+
Also of note, those systems are Python- or cloud-only (Presidio, GLiNER/torch, Bedrock) and cannot run in the shipped TypeScript form factor.
|
|
247
|
+
|
|
248
|
+
## 6. Calibration
|
|
249
|
+
|
|
250
|
+
The runtime applies a single recall-biased confidence floor (`minScore` = 0.4) uniformly across the model's labels, chosen against the 10,000-row OpenPII Latin calibration split (disjoint from test).
|
|
251
|
+
There is no per-label threshold table in the shipped runtime: classes the model alone is weak on — SSN, CREDIT_CARD — are not propped up with a tuned threshold but covered by the deterministic layer's checksum/structural validation, which is the system of record for them (with EMAIL, URL, and IP_ADDRESS covered by pattern match).
|
|
252
|
+
Phone, routing, government-ID, passport, and license numbers carry no checksum and are left to the model under the same floor.
|
|
253
|
+
Trading a miss (which leaks data) against the cheaper failure of over-redaction is the explicit policy choice here, not a model regression.
|
|
254
|
+
|
|
255
|
+
ECE on the 30,000-row test set is **0.018** for the model alone (well-calibrated, no post-hoc isotonic correction needed) and **0.291** for the full system.
|
|
256
|
+
The system-level ECE is higher because the deterministic layer always emits score 1.0 on its detections, making the score distribution bimodal — that is a score-distribution artifact of the union, not a calibration regression of the underlying model.
|
|
257
|
+
|
|
258
|
+
## 7. Schema reconciliation
|
|
259
|
+
|
|
260
|
+
The 91.69% retention number on the headline test is term-presence scoring that already credits the keep-set (city/state/ZIP) as kept, matching the Rampart policy. We analyzed the 7,244 remaining "over-redacted" public terms in the 30,000-row eval:
|
|
261
|
+
|
|
262
|
+
- The vast majority are policy-driven redactions of street-line components (street name, building number, secondary address line). OpenPII marks `STREET`, `BUILDINGNUM`, and `SECADDRESS` as `O` (public); the Rampart policy redacts the precise street line (`BUILDING_NUMBER` + `STREET_NAME`) and `SECONDARY_ADDRESS` while keeping `CITY`, `STATE`, and `ZIP_CODE`. These are not detector errors; they are the policy firing as designed.
|
|
263
|
+
- A smaller share are span-edge artifacts. The runtime's particle-rescue step grows name spans (`GIVEN_NAME` / `SURNAME`) to swallow capitalized particles ("De la", "Von", "Mc"). When an adjacent public token is itself capitalized, that token can be absorbed into the redacted span.
|
|
264
|
+
- A very small fraction are digit fragments inside longer correctly-redacted spans (e.g. "376" found inside a redacted 16-digit credit card, surfacing as a "kept token" because the gold schema marks individual digits separately from the card number).
|
|
265
|
+
|
|
266
|
+
We publish the term-presence number for like-for-like comparison with public PII benchmarks running the same scoring rules.
|
|
267
|
+
Under policy-aware scoring the retention exceeds 99%.
|
|
268
|
+
|
|
269
|
+
## 8. Systems we considered and did not adopt
|
|
270
|
+
|
|
271
|
+
Two widely-discussed alternatives are worth addressing directly because each is a plausible default for a team starting from scratch on this problem.
|
|
272
|
+
|
|
273
|
+
**OpenAI Privacy Filter.** OpenAI released an open-weight token-classification model under Apache 2.0 ([announcement](https://openai.com/index/introducing-openai-privacy-filter/)) with a similar shape to ours: bidirectional token classification with BIOES span decoding via constrained Viterbi, eight detection categories (person, address, email, phone, URL, date, account number, secret), and reported F1 of 97.4% on the corrected `pii-masking-300k` benchmark.
|
|
274
|
+
It is the closest peer to Rampart in design intent, and we evaluated it as a candidate.
|
|
275
|
+
|
|
276
|
+
We did not adopt it for two reasons.
|
|
277
|
+
(1) Size: the released model is 1.5B total parameters (50M active, MoE-style routing).
|
|
278
|
+
Even with aggressive quantization the on-disk footprint is two orders of magnitude beyond our 15 MB browser-deployment target, and the active-parameter count alone exceeds what we can ship to a low-end device over the wire.
|
|
279
|
+
(2) Inference shape: a 1.5B-parameter forward pass with a 128k-token context window is engineered for server-side or workstation-class throughput on long documents, not for sub-10 ms per-keystroke client-side redaction during a chat turn.
|
|
280
|
+
The two systems are solving overlapping problems with different deployment constraints, and the OpenAI model's strengths (long-context coherence, fine-tunability for domain adaptation) are orthogonal to ours (browser-deployable size, encoder-only latency, per-class checksum validation through the deterministic layer).
|
|
281
|
+
For applications that have a server they trust and documents that justify the round-trip, the OpenAI model is a strong choice; for an application whose contract is "the user's text never leaves the device," it is too large to deploy on the device.
|
|
282
|
+
|
|
283
|
+
**`ai4privacy/llama-ai4privacy-english-anonymiser-openpii`.**
|
|
284
|
+
A Llama-family fine-tune trained on the same OpenPII corpus we used.
|
|
285
|
+
The model is high quality on its native distribution and was a serious candidate.
|
|
286
|
+
We did not adopt it for three reasons.
|
|
287
|
+
(1) Size: the released artifact is multiple gigabytes, three orders of magnitude larger than our 15 MB target and incompatible with browser deployment on the low-end devices we need to support.
|
|
288
|
+
(2) Inference cost: a Llama-class generative anonymizer takes hundreds of milliseconds to seconds per turn even on accelerated hardware, versus 6.6 ms for an encoder pass; running it on the client is not viable, and running it on the server reintroduces the threat-model problem above.
|
|
289
|
+
(3) Generative outputs require a different correctness story — the model rewrites text rather than emitting spans, which makes calibration, span F1, and policy-driven keep-sets harder to define and audit.
|
|
290
|
+
A token classifier with a deterministic post-processing layer is a much smaller surface area to reason about for a system whose contract is "do not leak."
|
|
291
|
+
|
|
292
|
+
Both systems are good engineering for their intended deployment shape.
|
|
293
|
+
Neither is a substitute for client-side redaction when the goal is to prevent data from reaching any server in the first place.
|
|
294
|
+
|
|
295
|
+
## 9. Limitations
|
|
296
|
+
|
|
297
|
+
The model card enumerates each documented failure with statistics. The most consequential:
|
|
298
|
+
|
|
299
|
+
1. **Cross-locale name fairness.** Recall is measured on Faker-generated names spanning 15 naming traditions; recall on names in non-Latin scripts (Korean, Han Chinese, Japanese, Arabic, South Asian, Slavic) is below 50%. This is the most important gap to close in subsequent training cycles and is tracked by a stratified regression test in the eval suite.
|
|
300
|
+
2. **Adversarial robustness.** The system catches 86.4% of a 20-case adversarial suite. Combined attacks (homoglyph plus whitespace-split, deep zero-width injection inside checksum-valid identifiers) can still bypass the union of the two layers. The deterministic layer raises the floor on structured classes but does not close the gap on unstructured identifiers under composed perturbations. This limitation should be read against the threat model rather than treated as the primary use case: Rampart is designed to protect users who are entering their own information in good faith from incidental disclosure to downstream services, not to defeat a motivated user who is actively trying to smuggle their own PII past the filter. Adversarial cases are scored to characterize the failure surface and to surface regressions, not because circumventing one's own redactor is the threat model.
|
|
301
|
+
3. **Indirect identifiers.** Inferential leaks — e.g. a rare medical condition combined with a ZIP code — are out of scope. The system redacts terms, not statistical fingerprints.
|
|
302
|
+
|
|
303
|
+
## 10. Reproducibility
|
|
304
|
+
|
|
305
|
+
The training pipeline and the benchmark are released under CC BY 4.0.
|
|
306
|
+
The benchmark (`eval/bench`) runs the shipped TypeScript pipeline over a frozen OpenPII held-out slice and writes the per-run `summary.json` / `by_language.json` that the published numbers cite, so every figure traces to committed evidence produced by the artifact itself.
|
|
307
|
+
The held-out row `uid`s are pinned in a committed manifest; `bun run bench:fetch` regenerates the rows and `bun run bench` reproduces the numbers.
|
|
308
|
+
|
|
309
|
+
## 11. Conclusion
|
|
310
|
+
|
|
311
|
+
Rampart is harm reduction. No client-side redactor at this size will catch every leak, and we do not claim otherwise — §9 documents the classes where the system underperforms and the eval suite is structured so future regressions surface immediately.
|
|
312
|
+
What it provides is a defensible floor: text is filtered through two complementary layers and replaced with stable placeholders before any server sees it, so the worst case for a downstream leak is bounded by what the client failed to redact, not by the entire raw conversation.
|
|
313
|
+
|
|
314
|
+
We release the model, deterministic layer, and eval suite under CC BY 4.0 so any team building privacy-sensitive software can use, audit, fork, and improve it.
|
|
315
|
+
The constraints adopted here — browser-deployable size, recall-biased calibration, defense in depth, no network dependency at inference — are specific to the threat model in which the user's unredacted text must never leave the device.
|
|
316
|
+
Other deployment shapes warrant other tradeoffs; under this one, the system reported here is a deployable baseline against which future work can be measured.
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @nationaldesignstudio/rampart
|
|
3
|
+
*
|
|
4
|
+
* Client-side PII filter for LLM chat. Strips names, SSNs, card numbers, and
|
|
5
|
+
* every other PII class out of text before it leaves the device — keeping only
|
|
6
|
+
* {city, state, zip} — so raw PII never reaches our servers or logs.
|
|
7
|
+
*
|
|
8
|
+
* Layers: offset-preserving heuristics + validators (structured PII, any
|
|
9
|
+
* separator form), an optional small wasm token-classifier (contextual PII),
|
|
10
|
+
* a default-deny keep-set policy, and a reversible placeholder/rehydrate
|
|
11
|
+
* session table for coherent multi-turn chat.
|
|
12
|
+
*/
|
|
13
|
+
export { ChatGuard, createGuard, DEFAULT_ALIASES, type GuardOptions, type NerDetector, } from "./src/guard";
|
|
14
|
+
export { StreamingReveal, createRevealTransform, type PlaceholderResolver, } from "./src/streaming";
|
|
15
|
+
export { SessionEntityTable, PLACEHOLDER_PATTERN, type ScrubResult, type PlaceholderAliases, } from "./src/session";
|
|
16
|
+
export { detectHeuristics } from "./src/heuristics";
|
|
17
|
+
export { mergeSpans, applyPolicy } from "./src/policy";
|
|
18
|
+
export { premask, projectMaskedSpan, sentinelFor, type PremaskResult } from "./src/premask";
|
|
19
|
+
export { KEEP_LABELS, resolveKeepLabels, shouldRedact, type PiiLabel, type Span } from "./src/types";
|
|
20
|
+
export { detectNer, loadNerClassifier, RAMPART_MODEL_ID, NER_TOKEN_BUDGET, NER_TOKEN_OVERLAP, type NerOptions, type TokenClassifier, type TokenCounter, } from "./src/ner/classifier";
|
|
21
|
+
export { registerNerWorker, createWorkerClassifier } from "./src/ner/worker";
|
|
22
|
+
export { isLuhnValid, isValidSsn } from "./src/validators";
|
|
23
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAEH,OAAO,EACL,SAAS,EACT,WAAW,EACX,eAAe,EACf,KAAK,YAAY,EACjB,KAAK,WAAW,GACjB,MAAM,aAAa,CAAC;AACrB,OAAO,EACL,eAAe,EACf,qBAAqB,EACrB,KAAK,mBAAmB,GACzB,MAAM,iBAAiB,CAAC;AACzB,OAAO,EACL,kBAAkB,EAClB,mBAAmB,EACnB,KAAK,WAAW,EAChB,KAAK,kBAAkB,GACxB,MAAM,eAAe,CAAC;AACvB,OAAO,EAAE,gBAAgB,EAAE,MAAM,kBAAkB,CAAC;AACpD,OAAO,EAAE,UAAU,EAAE,WAAW,EAAE,MAAM,cAAc,CAAC;AACvD,OAAO,EAAE,OAAO,EAAE,iBAAiB,EAAE,WAAW,EAAE,KAAK,aAAa,EAAE,MAAM,eAAe,CAAC;AAC5F,OAAO,EAAE,WAAW,EAAE,iBAAiB,EAAE,YAAY,EAAE,KAAK,QAAQ,EAAE,KAAK,IAAI,EAAE,MAAM,aAAa,CAAC;AACrG,OAAO,EACL,SAAS,EACT,iBAAiB,EACjB,gBAAgB,EAChB,gBAAgB,EAChB,iBAAiB,EACjB,KAAK,UAAU,EACf,KAAK,eAAe,EACpB,KAAK,YAAY,GAClB,MAAM,sBAAsB,CAAC;AAC9B,OAAO,EAAE,iBAAiB,EAAE,sBAAsB,EAAE,MAAM,kBAAkB,CAAC;AAC7E,OAAO,EAAE,WAAW,EAAE,UAAU,EAAE,MAAM,kBAAkB,CAAC"}
|