@nationaldesignstudio/rampart 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +402 -0
- package/MODEL_CARD.md +422 -0
- package/README.md +279 -0
- package/RELEASE.md +97 -0
- package/WHITEPAPER.md +316 -0
- package/dist/index.d.ts +23 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +35639 -0
- package/dist/index.js.map +36 -0
- package/dist/src/guard.d.ts +94 -0
- package/dist/src/guard.d.ts.map +1 -0
- package/dist/src/heuristics.d.ts +14 -0
- package/dist/src/heuristics.d.ts.map +1 -0
- package/dist/src/ner/classifier.d.ts +92 -0
- package/dist/src/ner/classifier.d.ts.map +1 -0
- package/dist/src/ner/worker.d.ts +44 -0
- package/dist/src/ner/worker.d.ts.map +1 -0
- package/dist/src/ner/worker.js +35302 -0
- package/dist/src/ner/worker.js.map +30 -0
- package/dist/src/pipeline.d.ts +76 -0
- package/dist/src/pipeline.d.ts.map +1 -0
- package/dist/src/policy.d.ts +27 -0
- package/dist/src/policy.d.ts.map +1 -0
- package/dist/src/premask.d.ts +48 -0
- package/dist/src/premask.d.ts.map +1 -0
- package/dist/src/session.d.ts +60 -0
- package/dist/src/session.d.ts.map +1 -0
- package/dist/src/streaming.d.ts +32 -0
- package/dist/src/streaming.d.ts.map +1 -0
- package/dist/src/types.d.ts +43 -0
- package/dist/src/types.d.ts.map +1 -0
- package/dist/src/validators.d.ts +16 -0
- package/dist/src/validators.d.ts.map +1 -0
- package/eval/bench/README.md +91 -0
- package/eval/bench/fetch.ts +152 -0
- package/eval/bench/labels.ts +45 -0
- package/eval/bench/run.ts +146 -0
- package/eval/bench/runs/m06-v3-30k/by_language.json +303 -0
- package/eval/bench/runs/m06-v3-30k/summary.json +56 -0
- package/eval/bench/runs/sample-900/by_language.json +303 -0
- package/eval/bench/runs/sample-900/manifest.json +926 -0
- package/eval/bench/runs/sample-900/summary.json +56 -0
- package/eval/bench/score.ts +197 -0
- package/eval/bench/webgpu/entry.ts +70 -0
- package/eval/bench/webgpu/index.html +12 -0
- package/eval/bench/webgpu.ts +209 -0
- package/eval/public-cases.ts +412 -0
- package/eval/run-public-eval.ts +140 -0
- package/examples/basic-chat.ts +12 -0
- package/examples/pii-worker.ts +3 -0
- package/index.ts +47 -0
- package/package.json +103 -0
- package/src/guard.ts +170 -0
- package/src/heuristics.ts +141 -0
- package/src/ner/classifier.ts +580 -0
- package/src/ner/worker.ts +130 -0
- package/src/policy.ts +64 -0
- package/src/premask.ts +90 -0
- package/src/session.ts +99 -0
- package/src/streaming.ts +73 -0
- package/src/types.ts +74 -0
- package/src/validators.ts +40 -0
|
@@ -0,0 +1,580 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Contextual PII detection via a small token-classification model running in
|
|
3
|
+
* the browser (transformers.js → ONNX Runtime Web, wasm or WebGPU backend).
|
|
4
|
+
*
|
|
5
|
+
* This is the residual layer: it catches what the heuristics can't — people's
|
|
6
|
+
* names, organizations, and free-text identifiers — which is exactly the PII we
|
|
7
|
+
* never want in our logs. The model is intentionally tiny and q4-quantized so
|
|
8
|
+
* it loads once (cached in IndexedDB by the runtime) and runs on-device with no
|
|
9
|
+
* server round-trip and no shared queue to saturate.
|
|
10
|
+
*
|
|
11
|
+
* Label mapping: the fine-tuned model emits token-classification entity groups,
|
|
12
|
+
* which we map onto our {@link PiiLabel} set. CITY/STATE/ZIP_CODE are emitted
|
|
13
|
+
* too so the merge step can carry them through to the keep-set.
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
import { mergeSpans } from "../policy";
|
|
17
|
+
import type { PiiLabel, Span } from "../types";
|
|
18
|
+
|
|
19
|
+
/** Aggregation modes the classifier callable accepts in its options bag. */
type AggregationStrategy = "simple" | "first" | "max";

/**
 * One row of token-classification output, reduced to the fields this module
 * reads. A row names its class either through `entity_group` (aggregated
 * output) or through a BIO-tagged `entity` such as `B-GIVEN_NAME` (per-token
 * output). The consumer in this file re-checks `start`/`end`/`word` at runtime
 * before relying on them.
 */
interface RawEntity {
  readonly entity_group?: string;
  readonly entity?: string;
  readonly score: number;
  readonly start: number;
  readonly end: number;
  readonly word: string;
}

/** Returns how many model tokens `text` occupies, not counting [CLS]/[SEP]. */
export type TokenCounter = (text: string) => number;

/**
 * A token-classification pipeline as a plain async callable.
 *
 * `countTokens` is present when a real tokenizer backs the classifier (see
 * {@link loadNerClassifier}); {@link detectNer} needs it to size windows against
 * the model's token budget. A bare mock may leave it off, and detection then
 * treats the whole input as one window.
 */
export interface TokenClassifier {
  (text: string, options?: { aggregation_strategy?: AggregationStrategy }): Promise<RawEntity[]>;
  countTokens?: TokenCounter;
}
|
|
43
|
+
|
|
44
|
+
/**
 * Lookup from a model entity group (upper-cased by the caller) to the
 * {@link PiiLabel} it is reported as. Several groups have an alias spelling
 * that maps to the same label. A group with no entry here yields `undefined`,
 * which the caller treats as "drop this entity".
 */
const GROUP_TO_LABEL: Readonly<Record<string, PiiLabel>> = {
  // Person names. Given name and surname are kept apart because members of a
  // household may share a surname.
  GIVEN_NAME: "GIVEN_NAME",
  GIVENNAME: "GIVEN_NAME",
  SURNAME: "SURNAME",
  LASTNAME: "SURNAME",

  // Contact details and document / account identifiers.
  EMAIL: "EMAIL",
  PHONE: "PHONE",
  URL: "URL",
  TAX_ID: "TAX_ID",
  BANK_ACCOUNT: "BANK_ACCOUNT",
  ROUTING_NUMBER: "ROUTING_NUMBER",
  GOVERNMENT_ID: "GOVERNMENT_ID",
  PASSPORT: "PASSPORT",
  DRIVERS_LICENSE: "DRIVERS_LICENSE",

  // Pieces of a postal address.
  BUILDING_NUMBER: "BUILDING_NUMBER",
  STREET_NAME: "STREET_NAME",
  SECONDARY_ADDRESS: "SECONDARY_ADDRESS",
  SECADDRESS: "SECONDARY_ADDRESS",
  CITY: "CITY",
  STATE: "STATE",
  ZIP_CODE: "ZIP_CODE",
};
|
|
70
|
+
|
|
71
|
+
/** Model id used when the caller does not name one (q4 ONNX only). */
export const RAMPART_MODEL_ID = "nationaldesignstudio/rampart";

/** Caller-tunable settings for loading and running the NER layer. */
export interface NerOptions {
  /**
   * Hugging Face model id, or a local directory path. The model must be a
   * token-classification ONNX export whose labels match Rampart's schema.
   * When omitted, {@link RAMPART_MODEL_ID} is used.
   */
  readonly model?: string;
  /** Execution backend: `"wasm"` or `"webgpu"` in browsers, `"cpu"` for Node (ORT). */
  readonly device?: "wasm" | "webgpu" | "cpu";
  /**
   * Anchor threshold. Spans scoring at or above it are kept outright. Weaker
   * spans survive only if they score at least the internal extend floor and
   * attach to an already-kept span of the same label. The low default biases
   * detection toward recall.
   */
  readonly minScore?: number;
}

/** Values used for every option except `model` when the caller leaves it out. */
const DEFAULT_OPTIONS: Required<Omit<NerOptions, "model">> = {
  minScore: 0.4,
  device: "wasm",
};
|
|
91
|
+
|
|
92
|
+
/**
 * Hard context limit of the token classifier, in tokens. Windows are sized
 * against this using the model's own tokenizer rather than a character count,
 * so the amount of text per window tracks what actually fits. Input beyond the
 * limit would be truncated by the runtime and whatever followed would be lost.
 */
const MODEL_MAX_TOKENS = 512;
/** The [CLS] and [SEP] tokens the pipeline adds around every window. */
const SPECIAL_TOKENS = 2;
/** Slack held back from the budget to absorb tokenization edge effects. */
const SAFETY_MARGIN_TOKENS = 10;
/** Content tokens allowed per window: model max minus specials minus the margin. */
export const NER_TOKEN_BUDGET = MODEL_MAX_TOKENS - (SPECIAL_TOKENS + SAFETY_MARGIN_TOKENS);

/**
 * Number of tokens two consecutive NER windows are meant to have in common.
 *
 * Long input is scanned with a sliding window of {@link NER_TOKEN_BUDGET}
 * tokens. The overlap is chosen to be much longer than any entity the model
 * detects (names, street lines: a handful of tokens), so an entity that
 * straddles one window's edge lies entirely inside the neighbouring window,
 * with surrounding context for the classifier to label it confidently.
 */
export const NER_TOKEN_OVERLAP = 64;
|
|
117
|
+
|
|
118
|
+
/** Matches every Unicode combining mark; removed when folding text (José → jose). */
const COMBINING_MARKS_RE = /\p{M}/gu;

/** Lowest score at which a span is still considered as an extension candidate. */
const EXTEND_SCORE = 0.15;

/** A gap made only of whitespace, apostrophes, dots and hyphens (or empty). */
const CONNECTOR_RE = /^[\s'\u2019.-]*$/;

/** Labels that denote a part of a person's name. */
const PERSON_LABELS: ReadonlySet<PiiLabel> = new Set<PiiLabel>(["GIVEN_NAME", "SURNAME"]);

/**
 * At the end of a string: a capitalized particle of one to four characters
 * (group 1) followed by one to three connector characters (group 2).
 */
const LEFT_PARTICLE_RE = /([\p{Lu}][\p{L}\p{M}\u2019']{0,3})([\s'\u2019.-]{1,3})$/u;

/**
 * At the start of a string: one to three connector characters (group 1)
 * followed by a capitalized particle of one to four characters (group 2).
 */
const RIGHT_PARTICLE_RE = /^([\s'\u2019.-]{1,3})([\p{Lu}][\p{L}\p{M}\u2019']{0,3})/u;
|
|
126
|
+
|
|
127
|
+
/**
 * Build the token-classification pipeline on demand.
 *
 * transformers.js is a peer dependency and a heavy import, so it is imported
 * here on first use rather than at module load; the heuristic path therefore
 * stays dependency-free.
 *
 * @param options Model id / path and backend. Only `model` and `device` are
 *   read here. A missing *or explicitly `undefined`* value falls back to
 *   {@link RAMPART_MODEL_ID} and the default device respectively.
 * @returns A {@link TokenClassifier}. `countTokens` is attached when the
 *   loaded pipeline exposes a tokenizer with an `encode` function.
 */
export async function loadNerClassifier(options: NerOptions = {}): Promise<TokenClassifier> {
  const { pipeline } = await import("@huggingface/transformers");

  // Resolve each setting with `??` instead of spreading `options` over the
  // defaults: a spread lets an explicit `device: undefined` overwrite the
  // default and reach the runtime as `undefined`.
  const model = options.model ?? RAMPART_MODEL_ID;
  const device = options.device ?? DEFAULT_OPTIONS.device;

  const classifier = await pipeline("token-classification", model, {
    dtype: "q4",
    device,
  });

  // transformers.js's published types omit `aggregation_strategy` from
  // `TokenClassificationPipelineOptions` even though the runtime accepts it,
  // so the pipeline is wrapped in a typed adapter instead of pushing the union
  // return type through a double cast at every call site.
  const adapter: TokenClassifier = (text, opts) =>
    (classifier as (input: string, options?: unknown) => Promise<RawEntity[]>)(text, opts);

  // Expose the pipeline's tokenizer so detectNer can size windows by real
  // tokens. `encode` is called with `add_special_tokens: false`, so the length
  // of its result is the content-token count each window is budgeted against.
  // The presence check means a runtime without a usable tokenizer degrades to
  // the single-window path instead of throwing mid-detection.
  const tokenizer = (classifier as unknown as {
    tokenizer?: { encode?: (t: string, o?: { add_special_tokens?: boolean }) => number[] };
  }).tokenizer;
  const encode = tokenizer?.encode;
  if (tokenizer && typeof encode === "function") {
    // `.call` keeps `tokenizer` as the receiver, exactly as a method call would.
    adapter.countTokens = (text) => encode.call(tokenizer, text, { add_special_tokens: false }).length;
  }
  return adapter;
}
|
|
159
|
+
|
|
160
|
+
/**
 * Find contextual PII in `raw`, however long it is.
 *
 * When the classifier carries a tokenizer (`countTokens`), the text is planned
 * into windows of at most {@link NER_TOKEN_BUDGET} tokens that overlap by
 * {@link NER_TOKEN_OVERLAP} tokens. Each window is classified separately, its
 * spans are shifted from window-relative to whole-text offsets, and the
 * combined list goes through {@link mergeSpans} so an entity seen in two
 * overlapping windows is reported once.
 *
 * If the plan has fewer than two windows, or the classifier has no tokenizer
 * (for example a bare test mock), the whole text is classified in one call.
 *
 * @param raw Text to scan.
 * @param classifier Token classifier, optionally with `countTokens`.
 * @param minScore Anchor threshold passed through to per-window detection.
 * @returns Spans in whole-text coordinates.
 */
export async function detectNer(
  raw: string,
  classifier: TokenClassifier,
  minScore: number = DEFAULT_OPTIONS.minScore,
): Promise<Span[]> {
  const counter = classifier.countTokens;
  const plan: readonly CharWindow[] =
    counter === undefined ? [] : planTokenWindows(raw, counter, NER_TOKEN_BUDGET, NER_TOKEN_OVERLAP);

  // Nothing to stitch: scan the text directly.
  if (plan.length < 2) {
    return detectNerWindow(raw, classifier, minScore);
  }

  const collected: Span[] = [];
  // One window at a time: all windows share a single model/session, which is
  // not safe to drive with concurrent inference calls.
  for (const { start: offset, end: limit } of plan) {
    const local = await detectNerWindow(raw.slice(offset, limit), classifier, minScore);
    for (const hit of local) {
      collected.push({ ...hit, start: hit.start + offset, end: hit.end + offset });
    }
  }
  return mergeSpans(collected);
}
|
|
201
|
+
|
|
202
|
+
/** A half-open char window `[start, end)` into the raw text. */
interface CharWindow {
  readonly start: number;
  readonly end: number;
}

/**
 * Plan the sliding windows over `raw`.
 *
 * The text is first cut into word-aligned segments (see `toSegments`), each
 * tagged with its token count and each within `budget`. Windows are then packed
 * greedily: a window takes consecutive segments while their summed token count
 * stays within `budget` (always at least one segment). The next window starts
 * far enough back to share up to `overlap` tokens with the one just emitted.
 *
 * The back-off is capped so the next window still has room for the first
 * segment the current window could not hold. Without that cap, a large
 * following segment (for example a piece of a long unbroken blob) makes the
 * planner emit a run of windows that are strict subsets of the previous one,
 * each costing a model inference while covering no new text. Where the full
 * overlap cannot fit alongside the next segment, the window keeps as much
 * overlap as does fit.
 *
 * @param raw Text to window.
 * @param countTokens The model's token counter, forwarded to `toSegments`.
 * @param budget Maximum content tokens per window.
 * @param overlap Target number of tokens shared by consecutive windows.
 * @returns Windows in text order, together covering every segment. Empty when
 *   `raw` yields no segments.
 */
function planTokenWindows(
  raw: string,
  countTokens: TokenCounter,
  budget: number,
  overlap: number,
): CharWindow[] {
  const segments = toSegments(raw, countTokens, budget);
  const windows: CharWindow[] = [];

  let i = 0;
  while (i < segments.length) {
    // Grow [i, j) while it fits the budget, always taking at least one segment.
    let tokens = 0;
    let j = i;
    while (j < segments.length && (j === i || tokens + segments[j].tokens <= budget)) {
      tokens += segments[j].tokens;
      j++;
    }
    windows.push({ start: segments[i].start, end: segments[j - 1].end });
    if (j === segments.length) break;

    // Walk the next start back from j until `overlap` tokens are shared, but
    // (a) always start strictly after i so the plan makes progress, and
    // (b) never share so much that segment j no longer fits in the next window.
    const incoming = segments[j].tokens;
    let shared = 0;
    let next = j;
    while (
      next > i + 1 &&
      shared < overlap &&
      shared + segments[next - 1].tokens + incoming <= budget
    ) {
      next--;
      shared += segments[next].tokens;
    }
    i = next;
  }
  return windows;
}
|
|
251
|
+
|
|
252
|
+
/** A slice `[start, end)` of the raw text together with its model-token count. */
interface Segment {
  readonly start: number;
  readonly end: number;
  readonly tokens: number;
}

/**
 * Cut `raw` into consecutive segments, each tagged with its token count.
 *
 * A segment is normally one word plus the whitespace after it, as produced by
 * `wordSpans`. If the rest of a word counts more than `budget` tokens (a long
 * unbroken blob), `fitCharsToBudget` picks a character cut and the word is
 * emitted in several pieces, so the packer can always place each segment.
 *
 * @param raw Text to partition.
 * @param countTokens Token counter for a piece of text.
 * @param budget Maximum tokens a single segment should carry.
 * @returns Segments in text order, contiguous over `raw`.
 */
function toSegments(raw: string, countTokens: TokenCounter, budget: number): Segment[] {
  const out: Segment[] = [];
  for (const [wordStart, wordEnd] of wordSpans(raw)) {
    let cursor = wordStart;
    while (cursor < wordEnd) {
      const remaining = countTokens(raw.slice(cursor, wordEnd));
      const fits = remaining <= budget;
      // Either take the rest of the word, or the piece the budget allows.
      const stop = fits ? wordEnd : fitCharsToBudget(raw, cursor, wordEnd, budget, countTokens);
      const tokens = fits ? remaining : countTokens(raw.slice(cursor, stop));
      out.push({ start: cursor, end: stop, tokens });
      cursor = stop;
    }
  }
  return out;
}
|
|
282
|
+
|
|
283
|
+
/**
 * Lazily yield `[start, end)` offsets that tile `raw` from left to right. Each
 * span is a run of non-whitespace (possibly empty, when the text opens with
 * whitespace) followed by all the whitespace directly after it.
 */
function* wordSpans(raw: string): Generator<[number, number]> {
  // First alternative: optional word + its trailing whitespace. Second: a final
  // word with no whitespace after it. Together they consume every character.
  for (const match of raw.matchAll(/\S*\s+|\S+/g)) {
    const begin = match.index ?? 0;
    yield [begin, begin + match[0].length];
  }
}
|
|
295
|
+
|
|
296
|
+
/**
 * Choose where to cut `raw.slice(from, end)` so the leading piece fits `budget`
 * tokens.
 *
 * A binary search over char offsets finds the largest `cut` in `(from, end]`
 * whose slice counts at most `budget` tokens, treating the token count as
 * growing with length. The caller uses this only for a single over-budget
 * word. The result is at least `from + 1`, so the caller always advances.
 *
 * The cut is then snapped to a code-point boundary: if it would fall between
 * the two halves of a surrogate pair, it moves back one unit (or forward one,
 * when moving back would return `from`). This keeps both pieces well-formed
 * text instead of handing the tokenizer lone surrogates.
 *
 * @param raw Full text.
 * @param from Start offset of the piece (inclusive).
 * @param end Upper bound for the cut (inclusive).
 * @param budget Maximum tokens the piece may carry.
 * @param countTokens Token counter for a piece of text.
 * @returns The chosen cut offset, in `(from, end]`.
 */
function fitCharsToBudget(
  raw: string,
  from: number,
  end: number,
  budget: number,
  countTokens: TokenCounter,
): number {
  let lo = from + 1;
  let hi = end;
  let best = from + 1;
  while (lo <= hi) {
    const mid = (lo + hi) >> 1;
    if (countTokens(raw.slice(from, mid)) <= budget) {
      best = mid;
      lo = mid + 1;
    } else {
      hi = mid - 1;
    }
  }

  if (best < end) {
    const before = raw.charCodeAt(best - 1);
    const after = raw.charCodeAt(best);
    const splitsPair = before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff;
    if (splitsPair) {
      // Prefer the shorter piece (stays within budget); only when that would
      // make no progress take the whole code point instead.
      best = best - 1 > from ? best - 1 : best + 1;
    }
  }
  return best;
}
|
|
323
|
+
|
|
324
|
+
/**
 * Classify one window of text and return its {@link Span}s, with offsets
 * relative to that window.
 *
 * Steps:
 *  1. Hyphens are replaced by spaces before inference. The replacement is
 *     one-for-one, so offsets into the model input are also offsets into `raw`.
 *  2. The classifier is called with `aggregation_strategy: "simple"`.
 *  3. `mergeBioTokens` normalizes the rows, whether they arrive aggregated
 *     (`entity_group` with char offsets) or as per-token BIO rows whose folded
 *     `word` has to be located in a folded projection of the input and mapped
 *     back to raw offsets.
 *  4. Entities with an unmapped group, a score below the extend floor, or zero
 *     width are dropped. The rest go to `repairSpans`, which applies `minScore`.
 *
 * @param raw Window text.
 * @param classifier Token classifier to run.
 * @param minScore Anchor threshold handed to `repairSpans`.
 * @returns Repaired spans over `raw`.
 */
async function detectNerWindow(
  raw: string,
  classifier: TokenClassifier,
  minScore: number = DEFAULT_OPTIONS.minScore,
): Promise<Span[]> {
  const modelInput = raw.replaceAll("-", " ");
  const rows = await classifier(modelInput, { aggregation_strategy: "simple" });
  // Same length as `raw`, so offsets recovered through this projection
  // address `raw` as well.
  const projection = foldForModel(modelInput);

  const found: Span[] = [];
  for (const { group, score, start, end } of mergeBioTokens(rows, projection)) {
    const label = GROUP_TO_LABEL[group.toUpperCase()];
    const tooWeak = score < EXTEND_SCORE;
    const empty = end <= start;
    if (label === undefined || tooWeak || empty) continue;
    found.push({ start, end, label, score, source: "ner", text: raw.slice(start, end) });
  }
  return repairSpans(raw, found, minScore);
}
|
|
366
|
+
|
|
367
|
+
/** An entity after normalization: its group name, score, and raw `[start, end)` offsets. */
interface AggregatedEntity {
  group: string;
  score: number;
  start: number;
  end: number;
}

/**
 * Split a BIO-tagged label into its tag and base name, e.g. `B-GIVEN_NAME` →
 * `{ prefix: "B", base: "GIVEN_NAME" }`. A label without a `B-`/`I-` tag (or
 * with nothing after the dash) is returned whole with a `null` prefix.
 */
function stripBio(label: string): { prefix: "B" | "I" | null; base: string } {
  const parts = label.match(/^([BI])-(.+)$/);
  if (parts === null) {
    return { prefix: null, base: label };
  }
  const [, tag, rest] = parts;
  return { prefix: tag as "B" | "I", base: rest };
}
|
|
379
|
+
|
|
380
|
+
/** A folded copy of the model input plus maps from each folded UTF-16 unit back to raw offsets. */
interface FoldedProjection {
  /** Lowercased, NFKD-decomposed, combining-mark-stripped copy of the input. */
  readonly text: string;
  /** `rawStart[i]` is the raw offset of the code point that produced `text[i]`. */
  readonly rawStart: number[];
  /**
   * `rawEnd[i]` is the raw offset just past that source code point, including
   * any combining marks that directly follow it in the raw text.
   */
  readonly rawEnd: number[];
}

/**
 * Fold `raw` into the normalized form used to match token `word`s (lowercase,
 * NFKD, combining marks removed), recording for every UTF-16 unit of the folded
 * text the raw `[start, end)` range it came from. This lets a folded word be
 * found with `indexOf` in `text` and projected back to exact raw offsets.
 *
 * Two invariants the maps maintain:
 *  - `rawStart.length === rawEnd.length === text.length`. The maps are indexed
 *    by UTF-16 unit, the same unit `String.prototype.indexOf` reports, so a
 *    folded code point that occupies two units (an emoji) gets two entries.
 *    With one entry per code point, every later lookup would be shifted.
 *  - A raw code point that folds to nothing (a standalone combining mark, as in
 *    decomposed "e" + U+0301) extends the `rawEnd` of the character before it,
 *    so a span ending on that character also covers its accent.
 *
 * @param raw Text in model-input coordinates.
 * @returns The folded text and its offset maps.
 */
function foldForModel(raw: string): FoldedProjection {
  let text = "";
  const rawStart: number[] = [];
  const rawEnd: number[] = [];
  let offset = 0;
  // Iterating the string walks whole code points, so a surrogate pair is
  // folded and mapped as one character.
  for (const codePoint of raw) {
    const next = offset + codePoint.length;
    const folded = codePoint.toLowerCase().normalize("NFKD").replace(COMBINING_MARKS_RE, "");
    if (folded.length === 0) {
      // Stripped entirely: attach its raw range to the preceding character
      // (all entries that currently end exactly where this code point starts).
      for (let k = rawEnd.length - 1; k >= 0 && rawEnd[k] === offset; k--) {
        rawEnd[k] = next;
      }
    } else {
      text += folded;
      for (let unit = 0; unit < folded.length; unit++) {
        rawStart.push(offset);
        rawEnd.push(next);
      }
    }
    offset = next;
  }
  return { text, rawStart, rawEnd };
}
|
|
414
|
+
|
|
415
|
+
/**
 * Turn classifier rows of either shape into entities with raw offsets.
 *
 * - A row with `entity_group` and numeric `start`/`end` is already aggregated.
 *   Its offsets are taken as-is, after closing any BIO span in progress.
 * - Any other row is a single BIO token. Its `word` (leading `##` removed,
 *   lowercased) is searched for in the folded text starting at a cursor that
 *   only moves forward, so a repeated word resolves to successive occurrences.
 *   The match is mapped to raw offsets through the projection. A token extends
 *   the open span when it has the same group and is not tagged `B`; otherwise
 *   it closes the open span and starts a new one. A span's score is the mean of
 *   its tokens' scores.
 *
 * Rows with no label, an empty word, or a word not found from the cursor on
 * are skipped without touching the open span.
 *
 * @param entities Classifier output rows, in order.
 * @param folded Folded projection of the model input.
 * @returns Entities in the order their spans were closed.
 */
function mergeBioTokens(entities: RawEntity[], folded: FoldedProjection): AggregatedEntity[] {
  const result: AggregatedEntity[] = [];
  let searchFrom = 0;
  let open: { group: string; start: number; end: number; scoreSum: number; tokens: number } | null = null;

  const closeOpen = (): void => {
    if (open === null) return;
    result.push({ group: open.group, score: open.scoreSum / open.tokens, start: open.start, end: open.end });
    open = null;
  };

  for (const row of entities) {
    const aggregated =
      row.entity_group !== undefined && typeof row.start === "number" && typeof row.end === "number";
    if (aggregated) {
      closeOpen();
      result.push({ group: row.entity_group as string, score: row.score, start: row.start, end: row.end });
      continue;
    }

    const tagged = row.entity ?? row.entity_group;
    if (tagged === undefined) continue;
    const { prefix, base } = stripBio(tagged);

    const piece = (row.word ?? "").replace(/^##/, "").toLowerCase();
    if (!piece) continue;
    const hit = folded.text.indexOf(piece, searchFrom);
    if (hit < 0) continue;

    const lastUnit = hit + piece.length - 1;
    const start = folded.rawStart[hit];
    const end = folded.rawEnd[lastUnit];
    searchFrom = lastUnit + 1;

    if (open !== null && open.group === base && prefix !== "B") {
      open.end = end;
      open.scoreSum += row.score;
      open.tokens += 1;
    } else {
      closeOpen();
      open = { group: base, start, end, scoreSum: row.score, tokens: 1 };
    }
  }

  closeOpen();
  return result;
}
|
|
467
|
+
|
|
468
|
+
/**
 * Decide which candidate spans to keep and repair their boundaries.
 *
 * Spans scoring at least `anchorScore` are kept from the start. Spans between
 * the extend floor and `anchorScore` are held as candidates. Three steps then
 * repeat until a pass changes nothing:
 *  1. promote any candidate that `canBridge` to an already-kept span,
 *  2. fuse kept spans with `mergeAdjacentConnectors`,
 *  3. widen kept spans with `rescueCapitalizedParticles`.
 *
 * The loop is capped at 32 passes so input where steps 2 and 3 keep undoing
 * each other cannot spin. Candidates never promoted are discarded.
 *
 * @param raw Text the spans index into.
 * @param spans Candidate spans (not mutated).
 * @param anchorScore Score at or above which a span is kept unconditionally.
 * @returns Kept spans with `text` refreshed from `raw`, sorted by start then end.
 */
function repairSpans(raw: string, spans: readonly Span[], anchorScore: number): Span[] {
  const MAX_PASSES = 32;
  const byPosition = (a: Span, b: Span): number => a.start - b.start || a.end - b.end;

  let kept: Span[] = [];
  const pending: Span[] = [];
  for (const span of spans) {
    if (span.score >= anchorScore) {
      kept.push(copySpan(span));
    } else if (span.score >= EXTEND_SCORE && span.score < anchorScore) {
      pending.push(copySpan(span));
    }
  }

  for (let pass = 0; pass < MAX_PASSES; pass++) {
    let progressed = false;

    // 1. Promote weak spans that attach to a kept one. Walk backwards so
    //    removing an element does not disturb the indices still to visit;
    //    a span promoted here can in turn anchor the ones visited after it.
    let idx = pending.length;
    while (idx-- > 0) {
      const weak = pending[idx];
      if (kept.some((anchor) => canBridge(raw, weak, anchor))) {
        kept.push(weak);
        pending.splice(idx, 1);
        progressed = true;
      }
    }

    // 2. Fuse neighbours; any difference in count or bounds counts as progress.
    const fused = mergeAdjacentConnectors(raw, kept);
    const previous = kept;
    if (
      fused.length !== previous.length ||
      fused.some((span, at) => span.start !== previous[at]?.start || span.end !== previous[at]?.end)
    ) {
      progressed = true;
    }
    kept = fused;

    // 3. Pull adjacent capitalized particles into person-name spans.
    kept.forEach((span, at) => {
      const widened = rescueCapitalizedParticles(raw, span);
      if (widened.start !== span.start || widened.end !== span.end) {
        kept[at] = widened;
        progressed = true;
      }
    });

    if (!progressed) break;
  }

  return kept.map((span) => ({ ...span, text: raw.slice(span.start, span.end) })).sort(byPosition);
}
|
|
514
|
+
|
|
515
|
+
/** Shallow-clone a span so later edits do not touch the caller's object. */
function copySpan(span: Span): Span {
  return Object.assign({}, span);
}
|
|
518
|
+
|
|
519
|
+
/**
 * Report whether `raw[idx]` looks like the start of an initial: an uppercase
 * letter whose left neighbour is not a letter (or that opens the string), like
 * the "F" in "John F.". Used to tell a name-internal dot after an initial from
 * a sentence-ending dot after a full word. Only the character and its left
 * neighbour are inspected.
 */
function isInitialChar(raw: string, idx: number): boolean {
  const upper = /\p{Lu}/u;
  const letter = /\p{L}/u;

  const here = raw[idx];
  if (here === undefined) return false;
  if (!upper.test(here)) return false;

  const before = raw[idx - 1];
  return before === undefined ? true : !letter.test(before);
}
|
|
530
|
+
|
|
531
|
+
/**
 * Decide whether two spans of the same label may be joined across the text
 * between them. The gap must consist solely of connector characters
 * (whitespace, apostrophes, dots, hyphens). If it contains a dot, the
 * character just before the gap must be an initial: "J." + "R." bridges, while
 * "Garcia." + "I" is a sentence boundary and does not.
 *
 * @param raw Text both spans index into.
 * @param a One span.
 * @param b The other span; argument order does not matter.
 * @returns `true` when the spans may be treated as one entity.
 */
function canBridge(raw: string, a: Span, b: Span): boolean {
  if (a.label !== b.label) return false;

  const aFirst = a.start <= b.start;
  const left = aFirst ? a : b;
  const right = aFirst ? b : a;

  const between = raw.slice(left.end, right.start);
  if (!CONNECTOR_RE.test(between)) return false;

  return !between.includes(".") || isInitialChar(raw, left.end - 1);
}
|
|
541
|
+
|
|
542
|
+
/**
 * Fuse runs of spans that `canBridge` to one another. Spans are processed in
 * position order (start, then end), and each one is compared with the most
 * recent output span. When they bridge, that output span is extended to cover
 * both, takes the higher of the two scores, and has its `text` re-sliced from
 * `raw`; its other fields come from the earlier span. Otherwise the span is
 * copied to the output unchanged.
 *
 * @param raw Text the spans index into.
 * @param spans Spans to fuse (not mutated, any order).
 * @returns A new array of spans in position order.
 */
function mergeAdjacentConnectors(raw: string, spans: readonly Span[]): Span[] {
  const ordered = [...spans].sort((a, b) => a.start - b.start || a.end - b.end);

  return ordered.reduce<Span[]>((acc, span) => {
    const tail = acc[acc.length - 1];
    if (tail === undefined || !canBridge(raw, tail, span)) {
      acc.push(copySpan(span));
      return acc;
    }
    const end = Math.max(tail.end, span.end);
    acc[acc.length - 1] = {
      ...tail,
      end,
      score: Math.max(tail.score, span.score),
      text: raw.slice(tail.start, end),
    };
    return acc;
  }, []);
}
|
|
559
|
+
|
|
560
|
+
/**
 * Widen a person-name span to take in a short capitalized particle that sits
 * directly next to it, joined by connector characters.
 *
 * - Left: the text before the span must end with particle + connector. A dot
 *   in the connector is accepted only when the particle is a single letter
 *   ("J. Smith"), never after a longer word ("Dr. Smith").
 * - Right: the text after the span must begin with connector + particle, and
 *   the connector must not contain a dot, since a dot after the span is treated
 *   as a sentence boundary ("Garcia. I").
 *
 * Spans whose label is not a person label are returned as-is.
 *
 * @param raw Text the span indexes into.
 * @param span Span to widen.
 * @returns The same object when nothing changed, otherwise a copy with updated
 *   `start`, `end` and `text`.
 */
function rescueCapitalizedParticles(raw: string, span: Span): Span {
  if (!PERSON_LABELS.has(span.label)) return span;

  let start = span.start;
  const before = LEFT_PARTICLE_RE.exec(raw.slice(0, span.start));
  if (before !== null) {
    const [whole, particle, connector] = before;
    const dotAllowed = !connector.includes(".") || particle.length === 1;
    if (CONNECTOR_RE.test(connector) && dotAllowed) {
      start -= whole.length;
    }
  }

  let end = span.end;
  const after = RIGHT_PARTICLE_RE.exec(raw.slice(span.end));
  if (after !== null) {
    const [whole, connector] = after;
    if (!connector.includes(".")) {
      end += whole.length;
    }
  }

  const untouched = start === span.start && end === span.end;
  return untouched ? span : { ...span, start, end, text: raw.slice(start, end) };
}
|