@nationaldesignstudio/rampart 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +64 -17
- package/dist/index.js.map +3 -3
- package/dist/src/ner/classifier.d.ts +4 -0
- package/dist/src/ner/classifier.d.ts.map +1 -1
- package/dist/src/ner/worker.js +64 -17
- package/dist/src/ner/worker.js.map +3 -3
- package/eval/public-cases.ts +31 -0
- package/package.json +2 -2
- package/src/ner/classifier.ts +107 -24
package/eval/public-cases.ts
CHANGED
|
@@ -409,4 +409,35 @@ export const PUBLIC_E2E_CASES: readonly PublicEvalCase[] = [
|
|
|
409
409
|
],
|
|
410
410
|
publicTerms: ["age 29", "$52,000"],
|
|
411
411
|
},
|
|
412
|
+
{
|
|
413
|
+
id: "chat-accented-name-address",
|
|
414
|
+
input: "My name is Renée Müller and I live at 1234 Cárdenas Boulevard.",
|
|
415
|
+
privateTerms: [
|
|
416
|
+
{ text: "Renée", label: "GIVEN_NAME" },
|
|
417
|
+
{ text: "Müller", label: "SURNAME" },
|
|
418
|
+
{ text: "1234", label: "BUILDING_NUMBER" },
|
|
419
|
+
{ text: "Cárdenas Boulevard", label: "STREET_NAME" },
|
|
420
|
+
],
|
|
421
|
+
publicTerms: [],
|
|
422
|
+
},
|
|
423
|
+
{
|
|
424
|
+
id: "chat-accented-name-phone",
|
|
425
|
+
input: "Please contact José Ångström at 305-201-0143 about the appointment.",
|
|
426
|
+
privateTerms: [
|
|
427
|
+
{ text: "José", label: "GIVEN_NAME" },
|
|
428
|
+
{ text: "Ångström", label: "SURNAME" },
|
|
429
|
+
{ text: "305-201-0143", label: "PHONE" },
|
|
430
|
+
],
|
|
431
|
+
publicTerms: ["appointment"],
|
|
432
|
+
},
|
|
433
|
+
{
|
|
434
|
+
id: "chat-duplicate-digit-run-address",
|
|
435
|
+
input: "Call me at 646-555-0199. I live at 1234 Maple Street.",
|
|
436
|
+
privateTerms: [
|
|
437
|
+
{ text: "646-555-0199", label: "PHONE" },
|
|
438
|
+
{ text: "1234", label: "BUILDING_NUMBER" },
|
|
439
|
+
{ text: "Maple Street", label: "STREET_NAME" },
|
|
440
|
+
],
|
|
441
|
+
publicTerms: [],
|
|
442
|
+
},
|
|
412
443
|
];
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@nationaldesignstudio/rampart",
|
|
3
|
-
"version": "0.1.1",
|
|
3
|
+
"version": "0.1.2",
|
|
4
4
|
"description": "Rampart — client-side PII redaction for AI assistants: deterministic recognizers + a 14.7 MB ONNX classifier (transformers.js), default-deny policy, and reversible placeholders. Runs entirely in the browser.",
|
|
5
5
|
"license": "CC-BY-4.0",
|
|
6
6
|
"homepage": "https://github.com/nationaldesignstudio/rampart#readme",
|
|
@@ -100,4 +100,4 @@
|
|
|
100
100
|
"vitest": "4.1.8"
|
|
101
101
|
},
|
|
102
102
|
"packageManager": "bun@1.3.14"
|
|
103
|
-
}
|
|
103
|
+
}
|
package/src/ner/classifier.ts
CHANGED
|
@@ -24,6 +24,8 @@ interface RawEntity {
|
|
|
24
24
|
readonly start: number;
|
|
25
25
|
readonly end: number;
|
|
26
26
|
readonly word: string;
|
|
27
|
+
/** Index into `[CLS] + content + [SEP]`; used for tokenizer-backed offset recovery. */
|
|
28
|
+
readonly index?: number;
|
|
27
29
|
}
|
|
28
30
|
|
|
29
31
|
/** Counts the model tokens in a string, excluding the [CLS]/[SEP] specials. */
|
|
@@ -39,6 +41,8 @@ export type TokenCounter = (text: string) => number;
|
|
|
39
41
|
export interface TokenClassifier {
|
|
40
42
|
(text: string, options?: { aggregation_strategy?: "simple" | "first" | "max" }): Promise<RawEntity[]>;
|
|
41
43
|
countTokens?: TokenCounter;
|
|
44
|
+
/** WordPiece tokens for `text` (no specials); drives index→char offset recovery. */
|
|
45
|
+
tokenize?: (text: string) => readonly string[];
|
|
42
46
|
}
|
|
43
47
|
|
|
44
48
|
/** Maps model entity groups to our labels. Unknown groups are dropped. */
|
|
@@ -149,11 +153,17 @@ export async function loadNerClassifier(options: NerOptions = {}): Promise<Token
|
|
|
149
153
|
// always carries a tokenizer; guard anyway so an unexpected runtime degrades to
|
|
150
154
|
// the single-window path rather than throwing mid-detection.
|
|
151
155
|
const tokenizer = (classifier as unknown as {
|
|
152
|
-
tokenizer?: { encode?: (t: string, o?: { add_special_tokens?: boolean }) => number[] };
|
|
156
|
+
tokenizer?: {
|
|
157
|
+
encode?: (t: string, o?: { add_special_tokens?: boolean }) => number[];
|
|
158
|
+
tokenize?: (t: string) => string[];
|
|
159
|
+
};
|
|
153
160
|
}).tokenizer;
|
|
154
161
|
if (tokenizer?.encode) {
|
|
155
162
|
adapter.countTokens = (text) => tokenizer.encode!(text, { add_special_tokens: false }).length;
|
|
156
163
|
}
|
|
164
|
+
if (tokenizer?.tokenize) {
|
|
165
|
+
adapter.tokenize = (text) => tokenizer.tokenize!(text);
|
|
166
|
+
}
|
|
157
167
|
return adapter;
|
|
158
168
|
}
|
|
159
169
|
|
|
@@ -330,12 +340,10 @@ function fitCharsToBudget(
|
|
|
330
340
|
* Handles two pipeline output shapes:
|
|
331
341
|
* - aggregated (`entity_group` + char `start`/`end`) — when the runtime applied
|
|
332
342
|
* `simple` aggregation; offsets are used directly.
|
|
333
|
-
* - raw BIO tokens (`entity` = `B-GIVEN_NAME`/`I-GIVEN_NAME`, no offsets) —
|
|
334
|
-
* transformers.js emits here.
|
|
335
|
-
*
|
|
336
|
-
*
|
|
337
|
-
* fails on every accented character, which drops spans (leaks) and lets the
|
|
338
|
-
* offset cursor desync into multi-line, paragraph-swallowing spans.
|
|
343
|
+
* - raw BIO tokens (`entity` = `B-GIVEN_NAME`/`I-GIVEN_NAME`, no char offsets) —
|
|
344
|
+
* what transformers.js emits here. When the runtime supplies a token `index`,
|
|
345
|
+
* char offsets come from a tokenizer walk; otherwise the folded `word` is
|
|
346
|
+
* located in a matching folded projection (accent-safe fallback).
|
|
339
347
|
*/
|
|
340
348
|
async function detectNerWindow(
|
|
341
349
|
raw: string,
|
|
@@ -347,8 +355,10 @@ async function detectNerWindow(
|
|
|
347
355
|
// `inferText` and `raw` are the same length (hyphen→space is 1:1), so offsets
|
|
348
356
|
// recovered against this folded projection address `raw` too.
|
|
349
357
|
const folded = foldForModel(inferText);
|
|
358
|
+
const indexOffsets =
|
|
359
|
+
classifier.tokenize === undefined ? undefined : buildTokenIndexOffsets(folded, classifier.tokenize(inferText));
|
|
350
360
|
const candidates: Span[] = [];
|
|
351
|
-
for (const entity of mergeBioTokens(entities, folded)) {
|
|
361
|
+
for (const entity of mergeBioTokens(entities, folded, indexOffsets)) {
|
|
352
362
|
const label = GROUP_TO_LABEL[entity.group.toUpperCase()];
|
|
353
363
|
if (label === undefined) continue;
|
|
354
364
|
if (entity.score < EXTEND_SCORE || entity.end <= entity.start) continue;
|
|
@@ -412,15 +422,58 @@ function foldForModel(raw: string): FoldedProjection {
|
|
|
412
422
|
return { text, rawStart, rawEnd };
|
|
413
423
|
}
|
|
414
424
|
|
|
425
|
+
/**
|
|
426
|
+
* Map each pipeline token index (`[CLS]` = 0, content, `[SEP]` last) to char
|
|
427
|
+
* offsets in the raw input. The tokenizer emits WordPiece tokens in the model's
|
|
428
|
+
* *folded* space (lowercase + accent-stripped), so the walk is performed against
|
|
429
|
+
* the folded projection and projected back to raw offsets through its offset
|
|
430
|
+
* map. This both handles accented input (where folded/raw lengths differ) and
|
|
431
|
+
* resolves duplicate substrings — the second `123` lands on the address token,
|
|
432
|
+
* not the first digit run.
|
|
433
|
+
*/
|
|
434
|
+
function buildTokenIndexOffsets(
|
|
435
|
+
folded: FoldedProjection,
|
|
436
|
+
contentTokens: readonly string[],
|
|
437
|
+
): ReadonlyArray<readonly [number, number]> {
|
|
438
|
+
const offsets: [number, number][] = [[0, 0]];
|
|
439
|
+
let cursor = 0;
|
|
440
|
+
for (const token of contentTokens) {
|
|
441
|
+
const piece = token.startsWith("##") ? token.slice(2) : token;
|
|
442
|
+
const at = token.startsWith("##") ? cursor : folded.text.indexOf(piece, cursor);
|
|
443
|
+
if (piece.length === 0 || at < 0 || at + piece.length > folded.text.length) {
|
|
444
|
+
offsets.push([0, 0]);
|
|
445
|
+
continue;
|
|
446
|
+
}
|
|
447
|
+
offsets.push([folded.rawStart[at], folded.rawEnd[at + piece.length - 1]]);
|
|
448
|
+
cursor = at + piece.length;
|
|
449
|
+
}
|
|
450
|
+
offsets.push([0, 0]);
|
|
451
|
+
return offsets;
|
|
452
|
+
}
|
|
453
|
+
|
|
454
|
+
function offsetFromTokenIndex(
|
|
455
|
+
indexOffsets: ReadonlyArray<readonly [number, number]> | undefined,
|
|
456
|
+
index: number | undefined,
|
|
457
|
+
): readonly [number, number] | null {
|
|
458
|
+
if (indexOffsets === undefined || index === undefined) return null;
|
|
459
|
+
const pair = indexOffsets[index];
|
|
460
|
+
if (pair === undefined || pair[0] === pair[1]) return null;
|
|
461
|
+
return pair;
|
|
462
|
+
}
|
|
463
|
+
|
|
415
464
|
/**
|
|
416
465
|
* Normalize either output shape into raw-offset entities. Aggregated rows carry
|
|
417
466
|
* offsets in the model-input (unfolded) coordinate system, so they are used
|
|
418
467
|
* directly. Raw BIO tokens are merged (B starts a span, matching I extends it);
|
|
419
|
-
*
|
|
420
|
-
*
|
|
421
|
-
*
|
|
468
|
+
* when the runtime supplies a token `index`, char offsets come from the
|
|
469
|
+
* tokenizer walk. Otherwise each token's folded `word` is located in the folded
|
|
470
|
+
* projection via a forward-advancing search and projected back to raw.
|
|
422
471
|
*/
|
|
423
|
-
function mergeBioTokens(entities: RawEntity[], folded: FoldedProjection): AggregatedEntity[] {
|
|
472
|
+
function mergeBioTokens(
|
|
473
|
+
entities: RawEntity[],
|
|
474
|
+
folded: FoldedProjection,
|
|
475
|
+
indexOffsets?: ReadonlyArray<readonly [number, number]>,
|
|
476
|
+
): AggregatedEntity[] {
|
|
424
477
|
const out: AggregatedEntity[] = [];
|
|
425
478
|
let cursor = 0;
|
|
426
479
|
let current: (AggregatedEntity & { count: number }) | null = null;
|
|
@@ -443,15 +496,28 @@ function mergeBioTokens(entities: RawEntity[], folded: FoldedProjection): Aggreg
|
|
|
443
496
|
if (rawLabel === undefined) continue;
|
|
444
497
|
const { prefix, base } = stripBio(rawLabel);
|
|
445
498
|
|
|
446
|
-
const
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
if (
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
499
|
+
const indexed = offsetFromTokenIndex(indexOffsets, entity.index);
|
|
500
|
+
let start: number;
|
|
501
|
+
let end: number;
|
|
502
|
+
if (indexed !== null) {
|
|
503
|
+
[start, end] = indexed;
|
|
504
|
+
} else {
|
|
505
|
+
const word = (entity.word ?? "").replace(/^##/, "").toLowerCase();
|
|
506
|
+
if (!word) continue;
|
|
507
|
+
const at = folded.text.indexOf(word, cursor);
|
|
508
|
+
if (at < 0) continue;
|
|
509
|
+
start = folded.rawStart[at];
|
|
510
|
+
end = folded.rawEnd[at + word.length - 1];
|
|
511
|
+
cursor = at + word.length;
|
|
512
|
+
}
|
|
453
513
|
|
|
454
|
-
|
|
514
|
+
// A `##` piece is a WordPiece *continuation* of the previous token's word,
|
|
515
|
+
// so it always extends the current span when the base label matches — even
|
|
516
|
+
// when the model mislabels it `B-` (common on accented words: "Ångström" is
|
|
517
|
+
// emitted as `ang`(B-SURNAME) + `##strom`(B-SURNAME)). Treating that `B-` as
|
|
518
|
+
// a new span fractures one name into pieces that then drift through repair.
|
|
519
|
+
const isSubword = (entity.word ?? "").startsWith("##");
|
|
520
|
+
const continues = current !== null && current.group === base && (prefix !== "B" || isSubword);
|
|
455
521
|
if (continues && current !== null) {
|
|
456
522
|
current.end = end;
|
|
457
523
|
current.score += entity.score;
|
|
@@ -499,7 +565,7 @@ function repairSpans(raw: string, spans: readonly Span[], anchorScore: number):
|
|
|
499
565
|
kept = merged;
|
|
500
566
|
|
|
501
567
|
for (let i = 0; i < kept.length; i++) {
|
|
502
|
-
const repaired = rescueCapitalizedParticles(raw, kept[i]);
|
|
568
|
+
const repaired = rescueCapitalizedParticles(raw, kept[i], kept, i);
|
|
503
569
|
if (repaired.start !== kept[i].start || repaired.end !== kept[i].end) {
|
|
504
570
|
kept[i] = repaired;
|
|
505
571
|
changed = true;
|
|
@@ -557,22 +623,39 @@ function mergeAdjacentConnectors(raw: string, spans: readonly Span[]): Span[] {
|
|
|
557
623
|
return merged;
|
|
558
624
|
}
|
|
559
625
|
|
|
560
|
-
function rescueCapitalizedParticles(raw: string, span: Span): Span {
|
|
626
|
+
function rescueCapitalizedParticles(raw: string, span: Span, all: readonly Span[] = [], selfIndex = -1): Span {
|
|
561
627
|
if (!PERSON_LABELS.has(span.label)) return span;
|
|
562
628
|
|
|
629
|
+
// Don't extend into a region already claimed by another kept span — that
|
|
630
|
+
// territory is its own entity (e.g. a SURNAME beside a GIVEN_NAME). Rescue is
|
|
631
|
+
// only for *untagged* particles, so clamp to the nearest neighbor boundary.
|
|
632
|
+
let leftBound = 0;
|
|
633
|
+
let rightBound = raw.length;
|
|
634
|
+
for (let i = 0; i < all.length; i++) {
|
|
635
|
+
if (i === selfIndex) continue;
|
|
636
|
+
const other = all[i];
|
|
637
|
+
if (other.end <= span.start && other.end > leftBound) leftBound = other.end;
|
|
638
|
+
if (other.start >= span.end && other.start < rightBound) rightBound = other.start;
|
|
639
|
+
}
|
|
640
|
+
|
|
563
641
|
let start = span.start;
|
|
564
642
|
let end = span.end;
|
|
565
643
|
const left = LEFT_PARTICLE_RE.exec(raw.slice(0, start));
|
|
566
644
|
// Extend left across the connector, but let a period through only when the
|
|
567
645
|
// particle is a one-letter initial ("J.", "R."), never a word ("Dr.", "St.").
|
|
568
|
-
if (left !== null && CONNECTOR_RE.test(left[2]) && (!left[2].includes(".") || left[1].length === 1)) {
|
|
646
|
+
if (
|
|
647
|
+
left !== null &&
|
|
648
|
+
CONNECTOR_RE.test(left[2]) &&
|
|
649
|
+
(!left[2].includes(".") || left[1].length === 1) &&
|
|
650
|
+
start - left[0].length >= leftBound
|
|
651
|
+
) {
|
|
569
652
|
start -= left[0].length;
|
|
570
653
|
}
|
|
571
654
|
|
|
572
655
|
const right = RIGHT_PARTICLE_RE.exec(raw.slice(end));
|
|
573
656
|
// Never extend right across a period: it always crosses a sentence boundary
|
|
574
657
|
// ("Garcia. I", "Chen. After"). Trailing initials are reached via space.
|
|
575
|
-
if (right !== null && !right[1].includes(".")) {
|
|
658
|
+
if (right !== null && !right[1].includes(".") && end + right[0].length <= rightBound) {
|
|
576
659
|
end += right[0].length;
|
|
577
660
|
}
|
|
578
661
|
|