@nationaldesignstudio/rampart 0.1.1 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -409,4 +409,35 @@ export const PUBLIC_E2E_CASES: readonly PublicEvalCase[] = [
409
409
  ],
410
410
  publicTerms: ["age 29", "$52,000"],
411
411
  },
412
+ {
413
+ id: "chat-accented-name-address",
414
+ input: "My name is Renée Müller and I live at 1234 Cárdenas Boulevard.",
415
+ privateTerms: [
416
+ { text: "Renée", label: "GIVEN_NAME" },
417
+ { text: "Müller", label: "SURNAME" },
418
+ { text: "1234", label: "BUILDING_NUMBER" },
419
+ { text: "Cárdenas Boulevard", label: "STREET_NAME" },
420
+ ],
421
+ publicTerms: [],
422
+ },
423
+ {
424
+ id: "chat-accented-name-phone",
425
+ input: "Please contact José Ångström at 305-201-0143 about the appointment.",
426
+ privateTerms: [
427
+ { text: "José", label: "GIVEN_NAME" },
428
+ { text: "Ångström", label: "SURNAME" },
429
+ { text: "305-201-0143", label: "PHONE" },
430
+ ],
431
+ publicTerms: ["appointment"],
432
+ },
433
+ {
434
+ id: "chat-duplicate-digit-run-address",
435
+ input: "Call me at 646-555-0199. I live at 1234 Maple Street.",
436
+ privateTerms: [
437
+ { text: "646-555-0199", label: "PHONE" },
438
+ { text: "1234", label: "BUILDING_NUMBER" },
439
+ { text: "Maple Street", label: "STREET_NAME" },
440
+ ],
441
+ publicTerms: [],
442
+ },
412
443
  ];
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@nationaldesignstudio/rampart",
3
- "version": "0.1.1",
3
+ "version": "0.1.3",
4
4
  "description": "Rampart — client-side PII redaction for AI assistants: deterministic recognizers + a 14.7 MB ONNX classifier (transformers.js), default-deny policy, and reversible placeholders. Runs entirely in the browser.",
5
5
  "license": "CC-BY-4.0",
6
6
  "homepage": "https://github.com/nationaldesignstudio/rampart#readme",
@@ -66,7 +66,7 @@
66
66
  "!eval/bench/runs/*/latency.json"
67
67
  ],
68
68
  "scripts": {
69
- "build": "bun build ./index.ts ./src/ner/worker.ts --outdir ./dist --format esm --sourcemap --target browser && tsc -p tsconfig.build.json",
69
+ "build": "bun build ./index.ts ./src/ner/worker.ts --outdir ./dist --format esm --sourcemap --target browser --external @huggingface/transformers && tsc -p tsconfig.build.json",
70
70
  "eval:public": "bun eval/run-public-eval.ts",
71
71
  "eval:public:strict": "bun eval/run-public-eval.ts --strict",
72
72
  "bench:fetch": "bun eval/bench/fetch.ts",
@@ -100,4 +100,4 @@
100
100
  "vitest": "4.1.8"
101
101
  },
102
102
  "packageManager": "bun@1.3.14"
103
- }
103
+ }
@@ -24,6 +24,8 @@ interface RawEntity {
24
24
  readonly start: number;
25
25
  readonly end: number;
26
26
  readonly word: string;
27
+ /** Index into `[CLS] + content + [SEP]`; used for tokenizer-backed offset recovery. */
28
+ readonly index?: number;
27
29
  }
28
30
 
29
31
  /** Counts the model tokens in a string, excluding the [CLS]/[SEP] specials. */
@@ -39,6 +41,8 @@ export type TokenCounter = (text: string) => number;
export interface TokenClassifier {
  /**
   * Run token classification over `text` and resolve to the model's entity
   * rows. `aggregation_strategy` is passed through as an optional hint.
   */
  (
    text: string,
    options?: { aggregation_strategy?: "simple" | "first" | "max" },
  ): Promise<RawEntity[]>;

  /** Optional token counter; only present when the runtime exposes a tokenizer. */
  countTokens?: TokenCounter;

  /** WordPiece tokens for `text` (no specials); drives index→char offset recovery. */
  tokenize?: (text: string) => readonly string[];
}
43
47
 
44
48
  /** Maps model entity groups to our labels. Unknown groups are dropped. */
@@ -149,11 +153,17 @@ export async function loadNerClassifier(options: NerOptions = {}): Promise<Token
149
153
  // always carries a tokenizer; guard anyway so an unexpected runtime degrades to
150
154
  // the single-window path rather than throwing mid-detection.
151
155
  const tokenizer = (classifier as unknown as {
152
- tokenizer?: { encode?: (t: string, o?: { add_special_tokens?: boolean }) => number[] };
156
+ tokenizer?: {
157
+ encode?: (t: string, o?: { add_special_tokens?: boolean }) => number[];
158
+ tokenize?: (t: string) => string[];
159
+ };
153
160
  }).tokenizer;
154
161
  if (tokenizer?.encode) {
155
162
  adapter.countTokens = (text) => tokenizer.encode!(text, { add_special_tokens: false }).length;
156
163
  }
164
+ if (tokenizer?.tokenize) {
165
+ adapter.tokenize = (text) => tokenizer.tokenize!(text);
166
+ }
157
167
  return adapter;
158
168
  }
159
169
 
@@ -330,12 +340,10 @@ function fitCharsToBudget(
330
340
  * Handles two pipeline output shapes:
331
341
  * - aggregated (`entity_group` + char `start`/`end`) — when the runtime applied
332
342
  * `simple` aggregation; offsets are used directly.
333
- * - raw BIO tokens (`entity` = `B-GIVEN_NAME`/`I-GIVEN_NAME`, no offsets) — what
334
- * transformers.js emits here. The token `word` is accent-folded, so we locate
335
- * it in a matching folded projection of the input and project the span back to
336
- * raw offsets through an offset map. A naive search against the unfolded text
337
- * fails on every accented character, which drops spans (leaks) and lets the
338
- * offset cursor desync into multi-line, paragraph-swallowing spans.
343
+ * - raw BIO tokens (`entity` = `B-GIVEN_NAME`/`I-GIVEN_NAME`, no char offsets) —
344
+ * what transformers.js emits here. When the runtime supplies a token `index`,
345
+ * char offsets come from a tokenizer walk; otherwise the folded `word` is
346
+ * located in a matching folded projection (accent-safe fallback).
339
347
  */
340
348
  async function detectNerWindow(
341
349
  raw: string,
@@ -347,8 +355,10 @@ async function detectNerWindow(
347
355
  // `inferText` and `raw` are the same length (hyphen→space is 1:1), so offsets
348
356
  // recovered against this folded projection address `raw` too.
349
357
  const folded = foldForModel(inferText);
358
+ const indexOffsets =
359
+ classifier.tokenize === undefined ? undefined : buildTokenIndexOffsets(folded, classifier.tokenize(inferText));
350
360
  const candidates: Span[] = [];
351
- for (const entity of mergeBioTokens(entities, folded)) {
361
+ for (const entity of mergeBioTokens(entities, folded, indexOffsets)) {
352
362
  const label = GROUP_TO_LABEL[entity.group.toUpperCase()];
353
363
  if (label === undefined) continue;
354
364
  if (entity.score < EXTEND_SCORE || entity.end <= entity.start) continue;
@@ -412,15 +422,58 @@ function foldForModel(raw: string): FoldedProjection {
412
422
  return { text, rawStart, rawEnd };
413
423
  }
414
424
 
/**
 * Map each pipeline token index (`[CLS]` = 0, content tokens, `[SEP]` last) to
 * a `[start, end]` pair of char offsets in the raw input.
 *
 * The tokenizer emits WordPiece tokens in the model's *folded* space, so the
 * walk is performed against `folded.text` and each hit is projected back to raw
 * offsets through `folded.rawStart` / `folded.rawEnd`. Walking forward with a
 * cursor also disambiguates repeated substrings: a digit run that appears twice
 * resolves to its own occurrence rather than to the first one.
 *
 * Tokens that cannot be placed get the `[0, 0]` sentinel (an empty range), which
 * callers treat as "no offset available" and fall back to a word search.
 *
 * @param folded - Folded projection of the model input plus its raw-offset maps.
 * @param contentTokens - Tokenizer output for the input, without specials.
 * @returns One pair per pipeline index: `contentTokens.length + 2` entries.
 */
function buildTokenIndexOffsets(
  folded: FoldedProjection,
  contentTokens: readonly string[],
): ReadonlyArray<readonly [number, number]> {
  // Slot 0 stands for `[CLS]`, which has no text of its own.
  const offsets: [number, number][] = [[0, 0]];
  let cursor = 0;
  // True while `cursor` sits right after the previously placed token. Once a
  // token fails to place, the cursor is stale and must not anchor a `##` piece.
  let inSync = true;

  for (const token of contentTokens) {
    const isContinuation = token.startsWith("##");
    const piece = isContinuation ? token.slice(2) : token;

    let at = -1;
    if (piece.length > 0) {
      if (!isContinuation) {
        // Word-initial piece: search forward so whitespace is skipped.
        at = folded.text.indexOf(piece, cursor);
      } else if (inSync) {
        // Continuation piece: flush against the piece that precedes it.
        at = cursor;
      }
      // A continuation after an unplaced token stays unplaced: anchoring it at
      // the stale cursor would yield a plausible-looking but wrong range.
    }

    if (at < 0 || at + piece.length > folded.text.length) {
      offsets.push([0, 0]);
      inSync = false;
      continue;
    }

    offsets.push([folded.rawStart[at], folded.rawEnd[at + piece.length - 1]]);
    cursor = at + piece.length;
    inSync = true;
  }

  // Trailing slot stands for `[SEP]`.
  offsets.push([0, 0]);
  return offsets;
}
453
+
/**
 * Look up the char-offset pair recorded for a pipeline token index.
 *
 * Returns `null` when there is no offset table, no index, no entry at that
 * index, or the entry is an empty range (start equals end).
 */
function offsetFromTokenIndex(
  indexOffsets: ReadonlyArray<readonly [number, number]> | undefined,
  index: number | undefined,
): readonly [number, number] | null {
  const span = indexOffsets !== undefined && index !== undefined ? indexOffsets[index] : undefined;
  if (span === undefined) return null;

  const [from, to] = span;
  return from === to ? null : span;
}
463
+
415
464
  /**
416
465
  * Normalize either output shape into raw-offset entities. Aggregated rows carry
417
466
  * offsets in the model-input (unfolded) coordinate system, so they are used
418
467
  * directly. Raw BIO tokens are merged (B starts a span, matching I extends it);
419
- * each token's folded `word` is located in the folded projection via a
420
- * forward-advancing search so repeated words map to distinct offsets and the
421
- * folded span is projected back to raw through the projection's offset map.
468
+ * when the runtime supplies a token `index`, char offsets come from the
469
+ * tokenizer walk. Otherwise each token's folded `word` is located in the folded
470
+ * projection via a forward-advancing search and projected back to raw.
422
471
  */
423
- function mergeBioTokens(entities: RawEntity[], folded: FoldedProjection): AggregatedEntity[] {
472
+ function mergeBioTokens(
473
+ entities: RawEntity[],
474
+ folded: FoldedProjection,
475
+ indexOffsets?: ReadonlyArray<readonly [number, number]>,
476
+ ): AggregatedEntity[] {
424
477
  const out: AggregatedEntity[] = [];
425
478
  let cursor = 0;
426
479
  let current: (AggregatedEntity & { count: number }) | null = null;
@@ -443,15 +496,28 @@ function mergeBioTokens(entities: RawEntity[], folded: FoldedProjection): Aggreg
443
496
  if (rawLabel === undefined) continue;
444
497
  const { prefix, base } = stripBio(rawLabel);
445
498
 
446
- const word = (entity.word ?? "").replace(/^##/, "").toLowerCase();
447
- if (!word) continue;
448
- const at = folded.text.indexOf(word, cursor);
449
- if (at < 0) continue;
450
- const start = folded.rawStart[at];
451
- const end = folded.rawEnd[at + word.length - 1];
452
- cursor = at + word.length;
499
+ const indexed = offsetFromTokenIndex(indexOffsets, entity.index);
500
+ let start: number;
501
+ let end: number;
502
+ if (indexed !== null) {
503
+ [start, end] = indexed;
504
+ } else {
505
+ const word = (entity.word ?? "").replace(/^##/, "").toLowerCase();
506
+ if (!word) continue;
507
+ const at = folded.text.indexOf(word, cursor);
508
+ if (at < 0) continue;
509
+ start = folded.rawStart[at];
510
+ end = folded.rawEnd[at + word.length - 1];
511
+ cursor = at + word.length;
512
+ }
453
513
 
454
- const continues = current !== null && current.group === base && prefix !== "B";
514
+ // A `##` piece is a WordPiece *continuation* of the previous token's word,
515
+ // so it always extends the current span when the base label matches — even
516
+ // when the model mislabels it `B-` (common on accented words: "Ångström" is
517
+ // emitted as `ang`(B-SURNAME) + `##strom`(B-SURNAME)). Treating that `B-` as
518
+ // a new span fractures one name into pieces that then drift through repair.
519
+ const isSubword = (entity.word ?? "").startsWith("##");
520
+ const continues = current !== null && current.group === base && (prefix !== "B" || isSubword);
455
521
  if (continues && current !== null) {
456
522
  current.end = end;
457
523
  current.score += entity.score;
@@ -499,7 +565,7 @@ function repairSpans(raw: string, spans: readonly Span[], anchorScore: number):
499
565
  kept = merged;
500
566
 
501
567
  for (let i = 0; i < kept.length; i++) {
502
- const repaired = rescueCapitalizedParticles(raw, kept[i]);
568
+ const repaired = rescueCapitalizedParticles(raw, kept[i], kept, i);
503
569
  if (repaired.start !== kept[i].start || repaired.end !== kept[i].end) {
504
570
  kept[i] = repaired;
505
571
  changed = true;
@@ -557,22 +623,39 @@ function mergeAdjacentConnectors(raw: string, spans: readonly Span[]): Span[] {
557
623
  return merged;
558
624
  }
559
625
 
560
- function rescueCapitalizedParticles(raw: string, span: Span): Span {
626
+ function rescueCapitalizedParticles(raw: string, span: Span, all: readonly Span[] = [], selfIndex = -1): Span {
561
627
  if (!PERSON_LABELS.has(span.label)) return span;
562
628
 
629
+ // Don't extend into a region already claimed by another kept span — that
630
+ // territory is its own entity (e.g. a SURNAME beside a GIVEN_NAME). Rescue is
631
+ // only for *untagged* particles, so clamp to the nearest neighbor boundary.
632
+ let leftBound = 0;
633
+ let rightBound = raw.length;
634
+ for (let i = 0; i < all.length; i++) {
635
+ if (i === selfIndex) continue;
636
+ const other = all[i];
637
+ if (other.end <= span.start && other.end > leftBound) leftBound = other.end;
638
+ if (other.start >= span.end && other.start < rightBound) rightBound = other.start;
639
+ }
640
+
563
641
  let start = span.start;
564
642
  let end = span.end;
565
643
  const left = LEFT_PARTICLE_RE.exec(raw.slice(0, start));
566
644
  // Extend left across the connector, but let a period through only when the
567
645
  // particle is a one-letter initial ("J.", "R."), never a word ("Dr.", "St.").
568
- if (left !== null && CONNECTOR_RE.test(left[2]) && (!left[2].includes(".") || left[1].length === 1)) {
646
+ if (
647
+ left !== null &&
648
+ CONNECTOR_RE.test(left[2]) &&
649
+ (!left[2].includes(".") || left[1].length === 1) &&
650
+ start - left[0].length >= leftBound
651
+ ) {
569
652
  start -= left[0].length;
570
653
  }
571
654
 
572
655
  const right = RIGHT_PARTICLE_RE.exec(raw.slice(end));
573
656
  // Never extend right across a period: it always crosses a sentence boundary
574
657
  // ("Garcia. I", "Chen. After"). Trailing initials are reached via space.
575
- if (right !== null && !right[1].includes(".")) {
658
+ if (right !== null && !right[1].includes(".") && end + right[0].length <= rightBound) {
576
659
  end += right[0].length;
577
660
  }
578
661