@nationaldesignstudio/rampart 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. package/LICENSE +402 -0
  2. package/MODEL_CARD.md +422 -0
  3. package/README.md +279 -0
  4. package/RELEASE.md +97 -0
  5. package/WHITEPAPER.md +316 -0
  6. package/dist/index.d.ts +23 -0
  7. package/dist/index.d.ts.map +1 -0
  8. package/dist/index.js +35639 -0
  9. package/dist/index.js.map +36 -0
  10. package/dist/src/guard.d.ts +94 -0
  11. package/dist/src/guard.d.ts.map +1 -0
  12. package/dist/src/heuristics.d.ts +14 -0
  13. package/dist/src/heuristics.d.ts.map +1 -0
  14. package/dist/src/ner/classifier.d.ts +92 -0
  15. package/dist/src/ner/classifier.d.ts.map +1 -0
  16. package/dist/src/ner/worker.d.ts +44 -0
  17. package/dist/src/ner/worker.d.ts.map +1 -0
  18. package/dist/src/ner/worker.js +35302 -0
  19. package/dist/src/ner/worker.js.map +30 -0
  20. package/dist/src/pipeline.d.ts +76 -0
  21. package/dist/src/pipeline.d.ts.map +1 -0
  22. package/dist/src/policy.d.ts +27 -0
  23. package/dist/src/policy.d.ts.map +1 -0
  24. package/dist/src/premask.d.ts +48 -0
  25. package/dist/src/premask.d.ts.map +1 -0
  26. package/dist/src/session.d.ts +60 -0
  27. package/dist/src/session.d.ts.map +1 -0
  28. package/dist/src/streaming.d.ts +32 -0
  29. package/dist/src/streaming.d.ts.map +1 -0
  30. package/dist/src/types.d.ts +43 -0
  31. package/dist/src/types.d.ts.map +1 -0
  32. package/dist/src/validators.d.ts +16 -0
  33. package/dist/src/validators.d.ts.map +1 -0
  34. package/eval/bench/README.md +91 -0
  35. package/eval/bench/fetch.ts +152 -0
  36. package/eval/bench/labels.ts +45 -0
  37. package/eval/bench/run.ts +146 -0
  38. package/eval/bench/runs/m06-v3-30k/by_language.json +303 -0
  39. package/eval/bench/runs/m06-v3-30k/summary.json +56 -0
  40. package/eval/bench/runs/sample-900/by_language.json +303 -0
  41. package/eval/bench/runs/sample-900/manifest.json +926 -0
  42. package/eval/bench/runs/sample-900/summary.json +56 -0
  43. package/eval/bench/score.ts +197 -0
  44. package/eval/bench/webgpu/entry.ts +70 -0
  45. package/eval/bench/webgpu/index.html +12 -0
  46. package/eval/bench/webgpu.ts +209 -0
  47. package/eval/public-cases.ts +412 -0
  48. package/eval/run-public-eval.ts +140 -0
  49. package/examples/basic-chat.ts +12 -0
  50. package/examples/pii-worker.ts +3 -0
  51. package/index.ts +47 -0
  52. package/package.json +103 -0
  53. package/src/guard.ts +170 -0
  54. package/src/heuristics.ts +141 -0
  55. package/src/ner/classifier.ts +580 -0
  56. package/src/ner/worker.ts +130 -0
  57. package/src/policy.ts +64 -0
  58. package/src/premask.ts +90 -0
  59. package/src/session.ts +99 -0
  60. package/src/streaming.ts +73 -0
  61. package/src/types.ts +74 -0
  62. package/src/validators.ts +40 -0
@@ -0,0 +1,580 @@
1
+ /**
2
+ * Contextual PII detection via a small token-classification model running in
3
+ * the browser (transformers.js → ONNX Runtime Web, wasm or WebGPU backend).
4
+ *
5
+ * This is the residual layer: it catches what the heuristics can't — people's
6
+ * names, organizations, and free-text identifiers — which is exactly the PII we
7
+ * never want in our logs. The model is intentionally tiny and q4-quantized so
8
+ * it loads once (cached in IndexedDB by the runtime) and runs on-device with no
9
+ * server round-trip and no shared queue to saturate.
10
+ *
11
+ * Label mapping: the fine-tuned model emits token-classification entity groups,
12
+ * which we map onto our {@link PiiLabel} set. CITY/STATE/ZIP_CODE are emitted
13
+ * too so the merge step can carry them through to the keep-set.
14
+ */
15
+
16
+ import { mergeSpans } from "../policy";
17
+ import type { PiiLabel, Span } from "../types";
18
+
19
/**
 * Minimal shape of one row returned by a token-classification pipeline.
 *
 * Two row shapes are consumed by this module:
 * - aggregated rows carry `entity_group` plus character `start`/`end` offsets;
 * - per-token BIO rows carry `entity` (e.g. `B-GIVEN_NAME`) and the token
 *   `word`, but no offsets.
 *
 * `start`/`end` are therefore optional: typing them as always present would let
 * a caller read offsets that per-token rows never provide. Consumers must check
 * `typeof row.start === "number"` before using them.
 */
interface RawEntity {
  /** Label of an aggregated row (no BIO prefix expected). */
  readonly entity_group?: string;
  /** Label of a per-token row, usually BIO-prefixed (`B-…` / `I-…`). */
  readonly entity?: string;
  /** Model confidence for this row. */
  readonly score: number;
  /** Character offset where the entity starts; absent on per-token rows. */
  readonly start?: number;
  /** Character offset just past the entity; absent on per-token rows. */
  readonly end?: number;
  /** Token or entity text as emitted by the tokenizer. */
  readonly word: string;
}
28
+
29
/**
 * Counts the model tokens in a string, excluding the [CLS]/[SEP] specials.
 * Window planning uses the returned number as the cost of a text slice against
 * the per-window token budget.
 */
export type TokenCounter = (text: string) => number;
31
+
32
/**
 * The callable returned by a token-classification pipeline: given a text (and
 * optionally an aggregation strategy) it resolves to the detected entity rows.
 *
 * `countTokens` is attached when the classifier is backed by a real tokenizer
 * (see {@link loadNerClassifier}); {@link detectNer} uses it to size windows by
 * the model's token budget. Bare mocks may omit it, in which case detection runs
 * the whole input as a single window.
 */
export interface TokenClassifier {
  (text: string, options?: { aggregation_strategy?: "simple" | "first" | "max" }): Promise<RawEntity[]>;
  /** Optional tokenizer-backed counter; when absent no windowing is planned. */
  countTokens?: TokenCounter;
}
43
+
44
/**
 * Maps model entity groups to our labels. Unknown groups are dropped.
 *
 * Keys are upper-case because the lookup upper-cases the model's group name
 * first. Several keys are spelling variants of the same label (with and without
 * an underscore, or an abbreviated form) so either spelling resolves.
 */
const GROUP_TO_LABEL: Readonly<Record<string, PiiLabel>> = {
  // Split names (a household may share a surname, so they stay distinct).
  GIVEN_NAME: "GIVEN_NAME",
  GIVENNAME: "GIVEN_NAME",
  SURNAME: "SURNAME",
  LASTNAME: "SURNAME",
  // Contact / document identifiers.
  EMAIL: "EMAIL",
  PHONE: "PHONE",
  URL: "URL",
  TAX_ID: "TAX_ID",
  BANK_ACCOUNT: "BANK_ACCOUNT",
  ROUTING_NUMBER: "ROUTING_NUMBER",
  GOVERNMENT_ID: "GOVERNMENT_ID",
  PASSPORT: "PASSPORT",
  DRIVERS_LICENSE: "DRIVERS_LICENSE",
  // Address components.
  BUILDING_NUMBER: "BUILDING_NUMBER",
  STREET_NAME: "STREET_NAME",
  SECONDARY_ADDRESS: "SECONDARY_ADDRESS",
  SECADDRESS: "SECONDARY_ADDRESS",
  CITY: "CITY",
  STATE: "STATE",
  ZIP_CODE: "ZIP_CODE",
};
70
+
71
/**
 * The shipped Rampart token-classifier on Hugging Face (q4 ONNX only).
 * Used as the model id whenever the caller does not supply `NerOptions.model`.
 */
export const RAMPART_MODEL_ID = "nationaldesignstudio/rampart";
73
+
74
/** Caller-tunable settings for the NER layer; every field is optional. */
export interface NerOptions {
  /**
   * Hugging Face model id or local directory path. Must be a token-classification
   * ONNX export compatible with Rampart's label schema. Defaults to
   * {@link RAMPART_MODEL_ID}.
   */
  readonly model?: string;
  /** Backend. `"wasm"`/`"webgpu"` in browsers; `"cpu"` for Node (ORT). */
  readonly device?: "wasm" | "webgpu" | "cpu";
  /** Spans below this score are discarded. Low default → recall-biased. */
  readonly minScore?: number;
}
86
+
87
/**
 * Defaults for every option except `model` (the model id falls back to
 * {@link RAMPART_MODEL_ID} separately): the wasm backend and a 0.4 score floor.
 */
const DEFAULT_OPTIONS: Required<Omit<NerOptions, "model">> = {
  device: "wasm",
  minScore: 0.4,
};
91
+
92
/**
 * The MiniLM token classifier has a hard context window, so each NER window is
 * sized to the model's token budget — measured with the model's own tokenizer,
 * not a character proxy, so a window holds exactly as much text as actually fits
 * regardless of token density. Past this, ORT would silently truncate the
 * sequence and drop whatever followed.
 */
const MODEL_MAX_TOKENS = 512;
/** [CLS] + [SEP] the pipeline wraps every window in. */
const SPECIAL_TOKENS = 2;
/**
 * Per-window content-token budget: the model max less specials and a safety
 * margin of 10 tokens (512 - 2 - 10 = 500).
 */
export const NER_TOKEN_BUDGET = MODEL_MAX_TOKENS - SPECIAL_TOKENS - 10;
104
+
105
/**
 * Tokens shared by consecutive NER windows. Long input slides a window of
 * {@link NER_TOKEN_BUDGET} tokens; this overlap is what lets an entity landing
 * on a window seam still sit *wholly* inside a neighbouring window.
 *
 * The invariant: as long as the overlap is at least the longest entity we detect
 * (names, orgs, street lines — all a handful of tokens), no entity is split
 * across a boundary, so a window-edge name is not silently dropped. The generous
 * margin over the longest entity also means a seam entity reappears deep inside
 * its neighbour with ample context, which the classifier needs to label it
 * confidently.
 */
export const NER_TOKEN_OVERLAP = 64;
117
+
118
/** Unicode combining marks; stripped during model-space folding (José → jose). */
const COMBINING_MARKS_RE = /\p{M}/gu;

/**
 * Absolute score floor. Entities scoring below it are discarded outright; spans
 * between this floor and the caller's anchor score are kept only as candidates
 * that may be attached to a confident neighbour during span repair.
 */
const EXTEND_SCORE = 0.15;
/**
 * Matches a gap made up solely of whitespace, apostrophes (`'` or `’`), periods
 * and hyphens — including the empty gap.
 */
const CONNECTOR_RE = /^[\s'\u2019.-]*$/;
/** Labels whose spans are eligible for capitalized-particle rescue. */
const PERSON_LABELS: ReadonlySet<PiiLabel> = new Set(["GIVEN_NAME", "SURNAME"]);
/**
 * At the end of the text left of a span: an uppercase letter followed by up to
 * three more letters/marks/apostrophes (group 1), then one to three connector
 * characters (group 2).
 */
const LEFT_PARTICLE_RE = /([\p{Lu}][\p{L}\p{M}\u2019']{0,3})([\s'\u2019.-]{1,3})$/u;
/**
 * At the start of the text right of a span: one to three connector characters
 * (group 1), then an uppercase letter followed by up to three more
 * letters/marks/apostrophes (group 2).
 */
const RIGHT_PARTICLE_RE = /^([\s'\u2019.-]{1,3})([\p{Lu}][\p{L}\p{M}\u2019']{0,3})/u;
126
+
127
/**
 * Build the token-classification pipeline on demand and wrap it as a
 * {@link TokenClassifier}.
 *
 * transformers.js is a peer dependency and a heavy import, so it is pulled in
 * with a dynamic `import()` here rather than at module load, which keeps the
 * heuristic path dependency-free.
 *
 * @param options Optional model id and backend; unset fields fall back to
 *   {@link DEFAULT_OPTIONS} and {@link RAMPART_MODEL_ID}.
 * @returns An adapter around the pipeline, with `countTokens` attached when the
 *   pipeline exposes a tokenizer with an `encode` method.
 */
export async function loadNerClassifier(options: NerOptions = {}): Promise<TokenClassifier> {
  const { pipeline: createPipeline } = await import("@huggingface/transformers");
  const settings = { ...DEFAULT_OPTIONS, ...options };
  const modelId = settings.model ?? RAMPART_MODEL_ID;
  const pipe = await createPipeline("token-classification", modelId, {
    dtype: "q4",
    device: settings.device,
  });

  // The published transformers.js types leave `aggregation_strategy` out of
  // `TokenClassificationPipelineOptions` although the runtime accepts it, so the
  // pipeline is called through a loosely typed view inside a typed adapter
  // instead of double-casting at every call site.
  const adapter: TokenClassifier = (text, opts) => {
    const run = pipe as (input: string, options?: unknown) => Promise<RawEntity[]>;
    return run(text, opts);
  };

  // Surface the pipeline's tokenizer so detectNer can size windows in real
  // tokens. `encode` is asked for content ids only (no specials), so the array
  // length is the count each window is budgeted against. The tokenizer is
  // expected to be present; the guard just lets an unexpected runtime fall back
  // to the single-window path instead of throwing mid-detection.
  const { tokenizer } = pipe as unknown as {
    tokenizer?: { encode?: (t: string, o?: { add_special_tokens?: boolean }) => number[] };
  };
  if (tokenizer?.encode) {
    adapter.countTokens = (text) => {
      const ids = tokenizer.encode!(text, { add_special_tokens: false });
      return ids.length;
    };
  }
  return adapter;
}
159
+
160
/**
 * Detect contextual PII across the whole input, whatever its length.
 *
 * When the classifier exposes `countTokens`, the input is planned into sliding
 * windows of at most {@link NER_TOKEN_BUDGET} tokens that overlap by
 * {@link NER_TOKEN_OVERLAP} tokens. Each window is classified on its own, its
 * spans are shifted back into whole-text coordinates, and {@link mergeSpans}
 * combines the results from the overlapping regions.
 *
 * If planning yields at most one window — or the classifier has no tokenizer,
 * e.g. a bare test mock — the text is classified directly in a single call.
 *
 * @param raw The full input text.
 * @param classifier The token classifier to run.
 * @param minScore Score threshold forwarded to each window's detection.
 * @returns Spans in `raw` coordinates.
 */
export async function detectNer(
  raw: string,
  classifier: TokenClassifier,
  minScore: number = DEFAULT_OPTIONS.minScore,
): Promise<Span[]> {
  let windows: CharWindow[];
  if (classifier.countTokens === undefined) {
    windows = [{ start: 0, end: raw.length }];
  } else {
    windows = planTokenWindows(raw, classifier.countTokens, NER_TOKEN_BUDGET, NER_TOKEN_OVERLAP);
  }

  const needsWindowing = windows.length > 1;
  if (!needsWindowing) {
    return detectNerWindow(raw, classifier, minScore);
  }

  const collected: Span[] = [];
  for (const { start: offset, end: limit } of windows) {
    // One window at a time: all windows share a single model/session, which is
    // not safe to drive with concurrent inference calls.
    const local = await detectNerWindow(raw.slice(offset, limit), classifier, minScore);
    for (const span of local) {
      collected.push({ ...span, start: span.start + offset, end: span.end + offset });
    }
  }
  return mergeSpans(collected);
}
201
+
202
/** A half-open char window `[start, end)` into the raw text. */
interface CharWindow {
  /** Offset of the first character in the window. */
  readonly start: number;
  /** Offset just past the last character in the window. */
  readonly end: number;
}
207
+
208
/**
 * Plan the sliding windows over `raw`: each holds at most `budget` tokens and,
 * together, they cover the whole string. Consecutive windows overlap by at least
 * `overlap` tokens whenever there is room for that.
 *
 * Windows are built from the segments returned by `toSegments`, so they start
 * and end on segment boundaries; each segment is assumed to fit the budget on
 * its own.
 *
 * The overlap is capped so the next window can always take in the first segment
 * the current window did not reach. Without that cap, a large following segment
 * (for example a hard-split blob) would make the next window stop short of it,
 * yielding windows wholly contained in the previous one — each a wasted
 * inference call. On ordinary text, where segments are a few tokens each, the
 * cap never triggers.
 *
 * @param raw Text to cover.
 * @param countTokens The model's token counter.
 * @param budget Maximum tokens per window.
 * @param overlap Desired tokens shared between consecutive windows.
 * @returns Windows in ascending order; empty when there are no segments.
 */
function planTokenWindows(
  raw: string,
  countTokens: TokenCounter,
  budget: number,
  overlap: number,
): CharWindow[] {
  const segments = toSegments(raw, countTokens, budget);
  if (segments.length === 0) return [];

  const windows: CharWindow[] = [];
  let i = 0;
  while (i < segments.length) {
    // Grow [i, j) while it fits the budget, always taking at least one segment.
    let tokens = 0;
    let j = i;
    while (j < segments.length && (j === i || tokens + segments[j].tokens <= budget)) {
      tokens += segments[j].tokens;
      j++;
    }
    windows.push({ start: segments[i].start, end: segments[j - 1].end });
    if (j === segments.length) break;

    // Step back from j to build the overlap. Stop when the overlap is large
    // enough, when stepping further would not make progress (start must move
    // past i), or when the overlap plus the first unseen segment would no
    // longer fit one window.
    const unseen = segments[j].tokens;
    let shared = 0;
    let next = j;
    while (next > i + 1 && shared < overlap && shared + segments[next - 1].tokens + unseen <= budget) {
      next--;
      shared += segments[next].tokens;
    }
    i = next;
  }
  return windows;
}
251
+
252
/** A word-aligned slice of the raw text with its model-token count. */
interface Segment {
  /** Offset of the first character of the slice. */
  readonly start: number;
  /** Offset just past the last character of the slice. */
  readonly end: number;
  /** Token count of the slice as reported by the token counter. */
  readonly tokens: number;
}
258
+
259
/**
 * Split `raw` into consecutive segments, each tagged with its token count.
 *
 * A segment is normally one span from `wordSpans` (a word plus its trailing
 * whitespace). When such a span alone exceeds `budget` — pathological, e.g. a
 * long unbroken blob — it is cut by character into several pieces, each sized
 * by `fitCharsToBudget`, so the packer is never handed a span it measured as
 * over budget.
 *
 * @param raw Text to partition.
 * @param countTokens The model's token counter.
 * @param budget Maximum tokens a single segment may hold.
 * @returns Segments in text order, covering every span `wordSpans` yields.
 */
function toSegments(raw: string, countTokens: TokenCounter, budget: number): Segment[] {
  const out: Segment[] = [];
  for (const [wordStart, wordEnd] of wordSpans(raw)) {
    let cursor = wordStart;
    while (cursor < wordEnd) {
      const remaining = countTokens(raw.slice(cursor, wordEnd));
      if (remaining <= budget) {
        // The rest of the word fits: emit it whole and move to the next word.
        out.push({ start: cursor, end: wordEnd, tokens: remaining });
        break;
      }
      // Over budget: peel off the largest prefix that fits and continue.
      const stop = fitCharsToBudget(raw, cursor, wordEnd, budget, countTokens);
      const pieceTokens = countTokens(raw.slice(cursor, stop));
      out.push({ start: cursor, end: stop, tokens: pieceTokens });
      cursor = stop;
    }
  }
  return out;
}
282
+
283
/**
 * Yield contiguous `[start, end)` spans that tile `raw`: each is a run of
 * non-whitespace characters (possibly empty, when the text opens with
 * whitespace) followed by the whitespace run after it.
 */
function* wordSpans(raw: string): Generator<[number, number]> {
  // Either "optional word + at least one whitespace char" or, for a final word
  // with nothing after it, the bare word. Every match is non-empty, so the scan
  // always advances.
  const chunk = /\S*\s+|\S+/g;
  for (let hit = chunk.exec(raw); hit !== null; hit = chunk.exec(raw)) {
    yield [hit.index, hit.index + hit[0].length];
  }
}
295
+
296
/**
 * Largest char offset `cut` in `(from, end]` whose slice `raw.slice(from, cut)`
 * the binary search finds to fit `budget` tokens, adjusted so it never falls
 * inside a surrogate pair.
 *
 * Only reached for a single over-budget word, where token count grows with
 * length; the search relies on that, and the budget's safety margin absorbs
 * slight non-monotonicity of subword tokenization at the edge.
 *
 * A cut between a high and a low surrogate would split one character across two
 * windows, leaving a lone surrogate on each side. Such a cut is moved one unit
 * left — or one unit right when moving left would return `from` and stall the
 * caller.
 *
 * @param raw Full text.
 * @param from Start offset of the slice being measured.
 * @param end Upper bound for the cut (exclusive end of the word).
 * @param budget Maximum tokens the slice may hold.
 * @param countTokens The model's token counter.
 * @returns An offset strictly greater than `from` and at most `end`.
 */
function fitCharsToBudget(
  raw: string,
  from: number,
  end: number,
  budget: number,
  countTokens: TokenCounter,
): number {
  let lo = from + 1;
  let hi = end;
  let best = from + 1; // always advance by at least one char
  while (lo <= hi) {
    const mid = (lo + hi) >> 1;
    if (countTokens(raw.slice(from, mid)) <= budget) {
      best = mid;
      lo = mid + 1;
    } else {
      hi = mid - 1;
    }
  }

  if (best < end) {
    const before = raw.charCodeAt(best - 1);
    const after = raw.charCodeAt(best);
    const splitsPair = before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff;
    if (splitsPair) {
      best = best - 1 > from ? best - 1 : best + 1;
    }
  }
  return best;
}
323
+
324
/**
 * Classify one window of text and turn the model output into {@link Span}s
 * addressed in that window's own coordinates.
 *
 * Steps:
 * 1. Hyphens are replaced by spaces before inference. The replacement is 1:1,
 *    so offsets found in the model input also address `raw`.
 * 2. The classifier output — aggregated rows with offsets, or per-token BIO rows
 *    without — is normalized to offset-bearing entities by `mergeBioTokens`,
 *    using a folded projection of the model input to locate token words.
 * 3. Entities with an unmapped group, a score under `EXTEND_SCORE`, or zero
 *    width are dropped.
 * 4. The survivors go through `repairSpans`, with `minScore` as the anchor
 *    score.
 *
 * @param raw The window text.
 * @param classifier The token classifier to run.
 * @param minScore Anchor score passed to span repair.
 * @returns The repaired spans for this window.
 */
async function detectNerWindow(
  raw: string,
  classifier: TokenClassifier,
  minScore: number = DEFAULT_OPTIONS.minScore,
): Promise<Span[]> {
  const modelInput = raw.replaceAll("-", " ");
  const rows = await classifier(modelInput, { aggregation_strategy: "simple" });
  // Same length as `raw`, so offsets recovered through this projection are
  // valid `raw` offsets too.
  const projection = foldForModel(modelInput);

  const candidates: Span[] = [];
  for (const { group, score, start, end } of mergeBioTokens(rows, projection)) {
    const label = GROUP_TO_LABEL[group.toUpperCase()];
    if (label === undefined) continue;
    if (score < EXTEND_SCORE || end <= start) continue;
    candidates.push({ start, end, label, score, source: "ner", text: raw.slice(start, end) });
  }
  return repairSpans(raw, candidates, minScore);
}
366
+
367
/** An entity normalized to a label group, a score and raw-text offsets. */
interface AggregatedEntity {
  /** Label group without any BIO prefix. */
  group: string;
  /** Confidence; for merged token runs, the mean of the token scores. */
  score: number;
  /** Offset of the first character of the entity. */
  start: number;
  /** Offset just past the last character of the entity. */
  end: number;
}
373
+
374
/**
 * Split a BIO-tagged label into its prefix and base label.
 *
 * `B-GIVEN_NAME` → `{ prefix: "B", base: "GIVEN_NAME" }`; a label without a
 * `B-`/`I-` prefix comes back unchanged with `prefix: null`.
 */
function stripBio(label: string): { prefix: "B" | "I" | null; base: string } {
  const parts = /^([BI])-(.+)$/.exec(label);
  if (parts === null) {
    return { prefix: null, base: label };
  }
  const [, tag, rest] = parts;
  return { prefix: tag as "B" | "I", base: rest };
}
379
+
380
/**
 * A folded copy of the model input plus a map from each folded char to raw.
 * The two arrays are indexed by position in `text`.
 */
interface FoldedProjection {
  /** Lowercased, NFKD, combining-mark-stripped copy of the input. */
  readonly text: string;
  /** `rawStart[i]` is the raw offset of the source code point of `text[i]`. */
  readonly rawStart: number[];
  /** `rawEnd[i]` is the raw offset just past that source code point. */
  readonly rawEnd: number[];
}
389
+
390
/**
 * Fold `raw` into a normalized search space (lowercase + NFKD + combining-mark
 * strip) while recording, for every UTF-16 unit of the folded text, the
 * `[start, end)` raw offsets of the code point it came from. A folded token
 * `word` can then be found in the folded text with `indexOf` and projected back
 * to exact raw offsets.
 *
 * The input is walked by code point so a surrogate pair maps back to one
 * whole-character raw span. The offset maps, however, get one entry per UTF-16
 * unit of the folded output: `text` is indexed by UTF-16 unit, so the maps must
 * be too. Otherwise every folded astral character (an emoji, say) would leave
 * the maps one entry short and shift all later offsets.
 *
 * @param raw Text to fold.
 * @returns The folded text with `rawStart`/`rawEnd` arrays of the same length.
 */
function foldForModel(raw: string): FoldedProjection {
  let text = "";
  const rawStart: number[] = [];
  const rawEnd: number[] = [];
  let offset = 0;
  for (const codePoint of raw) {
    const next = offset + codePoint.length;
    const folded = codePoint.toLowerCase().normalize("NFKD").replace(COMBINING_MARKS_RE, "");
    text += folded;
    // One map entry per UTF-16 unit appended, all pointing at the same source.
    for (let unit = 0; unit < folded.length; unit++) {
      rawStart.push(offset);
      rawEnd.push(next);
    }
    offset = next;
  }
  return { text, rawStart, rawEnd };
}
414
+
415
/**
 * Normalize either classifier output shape into raw-offset entities.
 *
 * - Aggregated rows (`entity_group` with numeric `start`/`end`) already carry
 *   offsets in model-input coordinates and are passed through as they are.
 * - Per-token BIO rows are merged into spans. Each token's `word` is folded the
 *   same way as the projection (lowercase, NFKD, combining marks stripped),
 *   found in the folded text with a forward-only search — so repeated words map
 *   to successive occurrences — and projected back to raw offsets.
 *
 * A token extends the open span only when all of these hold: it has the same
 * label, it is not tagged `B-`, and no letter or digit lies between it and the
 * span's last token. The last condition matters because the token list may skip
 * unlabelled words: without it, two same-label tokens far apart would be fused
 * into one span covering everything in between. Gaps of whitespace or
 * punctuation still merge.
 *
 * @param entities Rows as returned by the classifier.
 * @param folded Folded projection of the text the classifier was run on.
 * @returns Entities in input order; merged runs carry the mean token score.
 */
function mergeBioTokens(entities: RawEntity[], folded: FoldedProjection): AggregatedEntity[] {
  const out: AggregatedEntity[] = [];
  let cursor = 0;
  let current: (AggregatedEntity & { count: number }) | null = null;

  const flush = (): void => {
    if (current !== null) {
      out.push({ group: current.group, score: current.score / current.count, start: current.start, end: current.end });
      current = null;
    }
  };

  for (const entity of entities) {
    // Already-aggregated shape: offsets are in unfolded model-input coordinates.
    if (entity.entity_group !== undefined && typeof entity.start === "number" && typeof entity.end === "number") {
      flush();
      out.push({ group: entity.entity_group, score: entity.score, start: entity.start, end: entity.end });
      continue;
    }
    const rawLabel = entity.entity ?? entity.entity_group;
    if (rawLabel === undefined) continue;
    const { prefix, base } = stripBio(rawLabel);

    // Fold the token exactly like the projection so it matches whether or not
    // the tokenizer already stripped accents.
    const word = (entity.word ?? "")
      .replace(/^##/, "")
      .toLowerCase()
      .normalize("NFKD")
      .replace(COMBINING_MARKS_RE, "");
    if (!word) continue;
    const at = folded.text.indexOf(word, cursor);
    if (at < 0) continue;
    const start = folded.rawStart[at];
    const end = folded.rawEnd[at + word.length - 1];
    // An incomplete offset map must not produce a span with undefined bounds.
    if (start === undefined || end === undefined) continue;

    // While a span is open, `cursor` is the folded end of its last token, so
    // this slice is exactly the text separating the two tokens.
    const gap = folded.text.slice(cursor, at);
    const adjacent = !/[\p{L}\p{N}]/u.test(gap);
    cursor = at + word.length;

    const continues = current !== null && current.group === base && prefix !== "B" && adjacent;
    if (continues && current !== null) {
      current.end = end;
      current.score += entity.score;
      current.count += 1;
    } else {
      flush();
      current = { group: base, score: entity.score, start, end, count: 1 };
    }
  }
  flush();
  return out;
}
467
+
468
/**
 * Repair a window's candidate spans: attach low-score fragments to confident
 * neighbours, merge spans separated only by connectors, and extend person-name
 * spans over adjacent capitalized particles.
 *
 * @param raw The text the spans address.
 * @param spans Candidate spans; the input array and its elements are not
 *   mutated (each span is copied first).
 * @param anchorScore Spans at or above this score are kept unconditionally.
 *   Spans between `EXTEND_SCORE` and this score are kept only if they can be
 *   bridged to a kept span.
 * @returns The kept spans with `text` recomputed from `raw`, sorted by start and
 *   then end.
 */
function repairSpans(raw: string, spans: readonly Span[], anchorScore: number): Span[] {
  // Confident spans anchor the result; weaker ones wait as candidates.
  let kept = spans.filter((span) => span.score >= anchorScore).map(copySpan);
  const candidates = spans.filter((span) => span.score >= EXTEND_SCORE && span.score < anchorScore).map(copySpan);

  // Hard cap on the convergence loop. Real text converges in O(maxLabelLen)
  // iterations; rows with many same-label single-token fragments can otherwise
  // make rescue + bridge keep flipping. 32 is well above any healthy run and
  // bounds the pathological case to a few milliseconds.
  const MAX_ITERS = 32;
  let iters = 0;
  let changed = true;
  while (changed && iters < MAX_ITERS) {
    changed = false;
    iters++;

    // Step 1: promote any candidate that can bridge to a kept span. Iterating
    // backwards keeps indices valid while splicing.
    for (let i = candidates.length - 1; i >= 0; i--) {
      const candidate = candidates[i];
      if (kept.some((span) => canBridge(raw, candidate, span))) {
        kept.push(candidate);
        candidates.splice(i, 1);
        changed = true;
      }
    }

    // Step 2: merge bridgeable neighbours. Any difference in count or position
    // between the merged list and `kept` counts as a change — this includes the
    // merged list merely being in sorted order when `kept` was not.
    const merged = mergeAdjacentConnectors(raw, kept);
    const didMerge =
      merged.length !== kept.length ||
      merged.some((span, index) => span.start !== kept[index]?.start || span.end !== kept[index]?.end);
    if (didMerge) {
      changed = true;
    }
    kept = merged;

    // Step 3: extend person-name spans over adjacent capitalized particles.
    for (let i = 0; i < kept.length; i++) {
      const repaired = rescueCapitalizedParticles(raw, kept[i]);
      if (repaired.start !== kept[i].start || repaired.end !== kept[i].end) {
        kept[i] = repaired;
        changed = true;
      }
    }
  }

  // Refresh `text` so it always matches the final offsets.
  return kept
    .map((span) => ({ ...span, text: raw.slice(span.start, span.end) }))
    .sort((a, b) => a.start - b.start || a.end - b.end);
}
514
+
515
/** Shallow-copy a span so later edits to the copy leave the original untouched. */
function copySpan(span: Span): Span {
  return { ...span };
}
518
+
519
/**
 * Report whether the character at `idx` is an uppercase letter that has no
 * letter immediately before it — the "F" in "John F.". Only the left neighbour
 * is inspected. Callers use this to decide whether a following dot belongs to a
 * name (after an initial) or ends a sentence (after a full word).
 *
 * Returns `false` when `idx` is outside the string.
 */
function isInitialChar(raw: string, idx: number): boolean {
  const here = raw[idx];
  const isCapital = here !== undefined && /\p{Lu}/u.test(here);
  if (!isCapital) {
    return false;
  }
  const before = raw[idx - 1];
  if (before === undefined) {
    return true; // start of text: nothing to its left
  }
  return !/\p{L}/u.test(before);
}
530
+
531
/**
 * Decide whether two spans may be joined into one.
 *
 * They must share a label, and the text between them (from the end of the
 * earlier span to the start of the later one) must consist only of connector
 * characters. A gap containing a period is accepted only when the earlier span
 * ends in an initial ("J." + "R."); a period after a full word
 * ("Garcia." + "I") is treated as a sentence boundary.
 *
 * The argument order does not matter.
 */
function canBridge(raw: string, a: Span, b: Span): boolean {
  if (a.label !== b.label) {
    return false;
  }
  const aComesFirst = a.start <= b.start;
  const earlier = aComesFirst ? a : b;
  const later = aComesFirst ? b : a;

  const between = raw.slice(earlier.end, later.start);
  if (!CONNECTOR_RE.test(between)) {
    return false;
  }
  return !between.includes(".") || isInitialChar(raw, earlier.end - 1);
}
541
+
542
/**
 * Sort the spans by position and fold each one into its predecessor whenever
 * the two can be bridged. A merged span keeps the predecessor's fields, ends at
 * the further of the two ends, takes the higher of the two scores, and has its
 * `text` re-sliced from `raw`. The input array and its spans are not mutated.
 */
function mergeAdjacentConnectors(raw: string, spans: readonly Span[]): Span[] {
  const ordered = [...spans].sort((a, b) => a.start - b.start || a.end - b.end);
  const out: Span[] = [];
  for (const span of ordered) {
    const tail = out[out.length - 1];
    if (tail === undefined || !canBridge(raw, tail, span)) {
      out.push(copySpan(span));
      continue;
    }
    const end = Math.max(tail.end, span.end);
    out[out.length - 1] = {
      ...tail,
      end,
      score: Math.max(tail.score, span.score),
      text: raw.slice(tail.start, end),
    };
  }
  return out;
}
559
+
560
/**
 * Extend a person-name span over a short capitalized particle directly to its
 * left and/or right (an initial such as "J", or a short token such as "De").
 * Spans with any other label are returned unchanged.
 *
 * A particle is only absorbed when it is a whole token: the character on its
 * outer side must not be a letter or combining mark. The particle patterns
 * match at most four characters and carry no word boundary of their own, so
 * without this check a span next to a longer word would swallow part of it
 * ("Maria Johnson" → "Maria John", "HELLO Smith" → "ELLO Smith").
 *
 * Period rules:
 * - leftwards, a connector containing a period is crossed only when the
 *   particle is a one-letter initial ("J.", "R."), never a word ("Dr.", "St.");
 * - rightwards, a period is never crossed, since that would run across a
 *   sentence boundary ("Garcia. I", "Chen. After").
 *
 * @param raw The text the span addresses.
 * @param span The span to extend.
 * @returns The same span object when nothing changed, otherwise a copy with
 *   updated `start`, `end` and `text`.
 */
function rescueCapitalizedParticles(raw: string, span: Span): Span {
  if (!PERSON_LABELS.has(span.label)) return span;

  // True when `idx` is inside the text and holds a letter or combining mark.
  const isWordChar = (idx: number): boolean => {
    const ch = raw[idx];
    return ch !== undefined && /[\p{L}\p{M}]/u.test(ch);
  };

  let start = span.start;
  let end = span.end;

  // The slice starts at 0, so `left.index` is also an offset into `raw`.
  const left = LEFT_PARTICLE_RE.exec(raw.slice(0, start));
  if (
    left !== null &&
    CONNECTOR_RE.test(left[2]) &&
    (!left[2].includes(".") || left[1].length === 1) &&
    !isWordChar(left.index - 1)
  ) {
    start -= left[0].length;
  }

  // The match is anchored at the slice start, so it ends at `end + length`.
  const right = RIGHT_PARTICLE_RE.exec(raw.slice(end));
  if (right !== null && !right[1].includes(".") && !isWordChar(end + right[0].length)) {
    end += right[0].length;
  }

  return start === span.start && end === span.end ? span : { ...span, start, end, text: raw.slice(start, end) };
}