@nationaldesignstudio/rampart 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. package/LICENSE +402 -0
  2. package/MODEL_CARD.md +422 -0
  3. package/README.md +279 -0
  4. package/RELEASE.md +97 -0
  5. package/WHITEPAPER.md +316 -0
  6. package/dist/index.d.ts +23 -0
  7. package/dist/index.d.ts.map +1 -0
  8. package/dist/index.js +35639 -0
  9. package/dist/index.js.map +36 -0
  10. package/dist/src/guard.d.ts +94 -0
  11. package/dist/src/guard.d.ts.map +1 -0
  12. package/dist/src/heuristics.d.ts +14 -0
  13. package/dist/src/heuristics.d.ts.map +1 -0
  14. package/dist/src/ner/classifier.d.ts +92 -0
  15. package/dist/src/ner/classifier.d.ts.map +1 -0
  16. package/dist/src/ner/worker.d.ts +44 -0
  17. package/dist/src/ner/worker.d.ts.map +1 -0
  18. package/dist/src/ner/worker.js +35302 -0
  19. package/dist/src/ner/worker.js.map +30 -0
  20. package/dist/src/pipeline.d.ts +76 -0
  21. package/dist/src/pipeline.d.ts.map +1 -0
  22. package/dist/src/policy.d.ts +27 -0
  23. package/dist/src/policy.d.ts.map +1 -0
  24. package/dist/src/premask.d.ts +48 -0
  25. package/dist/src/premask.d.ts.map +1 -0
  26. package/dist/src/session.d.ts +60 -0
  27. package/dist/src/session.d.ts.map +1 -0
  28. package/dist/src/streaming.d.ts +32 -0
  29. package/dist/src/streaming.d.ts.map +1 -0
  30. package/dist/src/types.d.ts +43 -0
  31. package/dist/src/types.d.ts.map +1 -0
  32. package/dist/src/validators.d.ts +16 -0
  33. package/dist/src/validators.d.ts.map +1 -0
  34. package/eval/bench/README.md +91 -0
  35. package/eval/bench/fetch.ts +152 -0
  36. package/eval/bench/labels.ts +45 -0
  37. package/eval/bench/run.ts +146 -0
  38. package/eval/bench/runs/m06-v3-30k/by_language.json +303 -0
  39. package/eval/bench/runs/m06-v3-30k/summary.json +56 -0
  40. package/eval/bench/runs/sample-900/by_language.json +303 -0
  41. package/eval/bench/runs/sample-900/manifest.json +926 -0
  42. package/eval/bench/runs/sample-900/summary.json +56 -0
  43. package/eval/bench/score.ts +197 -0
  44. package/eval/bench/webgpu/entry.ts +70 -0
  45. package/eval/bench/webgpu/index.html +12 -0
  46. package/eval/bench/webgpu.ts +209 -0
  47. package/eval/public-cases.ts +412 -0
  48. package/eval/run-public-eval.ts +140 -0
  49. package/examples/basic-chat.ts +12 -0
  50. package/examples/pii-worker.ts +3 -0
  51. package/index.ts +47 -0
  52. package/package.json +103 -0
  53. package/src/guard.ts +170 -0
  54. package/src/heuristics.ts +141 -0
  55. package/src/ner/classifier.ts +580 -0
  56. package/src/ner/worker.ts +130 -0
  57. package/src/policy.ts +64 -0
  58. package/src/premask.ts +90 -0
  59. package/src/session.ts +99 -0
  60. package/src/streaming.ts +73 -0
  61. package/src/types.ts +74 -0
  62. package/src/validators.ts +40 -0
@@ -0,0 +1,130 @@
1
+ /**
2
+ * Web Worker host for the NER classifier.
3
+ *
4
+ * Inference must not jank the chat UI, so the model lives on a worker thread.
5
+ * The main thread talks to it through {@link createWorkerClassifier}, which
6
+ * adapts the postMessage round-trip back into the {@link TokenClassifier}
7
+ * signature the pipeline expects — so the rest of the system is agnostic to
8
+ * whether detection runs on the main thread or off it.
9
+ *
10
+ * Bundle this file as the worker entry (it self-registers `onmessage`). The
11
+ * client is created on the main thread with `new Worker(new URL(...))`.
12
+ */
13
+
14
+ import { detectNer, loadNerClassifier, type NerOptions, type TokenClassifier } from "./classifier";
15
+
16
+ interface InitMessage {
17
+ readonly kind: "init";
18
+ readonly options: NerOptions;
19
+ }
20
+ interface DetectMessage {
21
+ readonly kind: "detect";
22
+ readonly id: number;
23
+ readonly text: string;
24
+ readonly minScore?: number;
25
+ }
26
+ type InboundMessage = InitMessage | DetectMessage;
27
+
28
+ type WorkerInboundEvent = {
29
+ data: InboundMessage;
30
+ };
31
+
32
+ type WorkerOutboundEvent = {
33
+ data: { kind: string; id?: number; spans?: unknown; message?: string };
34
+ };
35
+
36
+ export type WorkerMessagePort = {
37
+ onmessage: ((event: WorkerInboundEvent) => void) | null;
38
+ postMessage: (message: unknown) => void;
39
+ };
40
+
41
+ // --- Worker side (runs inside the worker thread) ---
42
+
43
/**
 * Install the NER message handler on a worker scope. Call this from the
 * worker entry module.
 *
 * Message protocol handled here:
 * - `init`: starts loading the classifier, then posts `{ kind: "ready" }`, or
 *   `{ kind: "error", message }` (no `id`) when loading fails.
 * - `detect`: runs detection with the loaded classifier, then posts
 *   `{ kind: "result", id, spans }`, or `{ kind: "error", id, message }`.
 *
 * @param scope Port whose `onmessage` is replaced and through which replies are posted.
 */
export function registerNerWorker(scope: WorkerMessagePort): void {
  // Shared between init and detect so a detect that arrives mid-load simply
  // awaits the same in-flight promise.
  let loading: Promise<TokenClassifier> | null = null;

  const handleInit = async (options: NerOptions): Promise<void> => {
    try {
      loading = loadNerClassifier(options);
      await loading;
      scope.postMessage({ kind: "ready" });
    } catch (error) {
      // Report the failure (without an id) so the main thread can fail closed
      // rather than wait forever for a `ready` that will never come.
      loading = null;
      scope.postMessage({ kind: "error", message: String(error) });
    }
  };

  const handleDetect = async (request: DetectMessage): Promise<void> => {
    try {
      if (loading === null) throw new Error("[pii-filter] worker not initialized");
      const classifier = await loading;
      const spans = await detectNer(request.text, classifier, request.minScore);
      scope.postMessage({ kind: "result", id: request.id, spans });
    } catch (error) {
      // Errors are tagged with the request id so only that call is rejected.
      scope.postMessage({ kind: "error", id: request.id, message: String(error) });
    }
  };

  scope.onmessage = async (event: WorkerInboundEvent) => {
    const message = event.data;
    if (message.kind === "init") {
      await handleInit(message.options);
    } else if (message.kind === "detect") {
      await handleDetect(message);
    }
  };
}
74
+
75
+ // --- Main-thread side ---
76
+
77
/**
 * Main-thread client for a worker running the NER classifier.
 *
 * Installs `worker.onmessage`, immediately posts the `init` message, and
 * returns a handle:
 * - `ready` resolves when the worker posts `ready`, and rejects when the
 *   worker posts an `error` that carries no request `id` (an init failure).
 * - `detect` posts a `detect` request tagged with a fresh id and settles when
 *   the reply with the same id arrives (`result` resolves with its `spans`,
 *   `error` rejects).
 *
 * @param worker Port used to talk to the worker; its `onmessage` is replaced.
 * @param options Classifier options forwarded verbatim in the `init` message.
 * @returns The `ready` promise and the `detect` function.
 */
export function createWorkerClassifier(
  worker: WorkerMessagePort,
  options: NerOptions,
): { ready: Promise<void>; detect: (text: string, minScore?: number) => Promise<unknown> } {
  let nextId = 0;
  // In-flight detect calls, keyed by request id.
  const pending = new Map<number, { resolve: (v: unknown) => void; reject: (e: unknown) => void }>();
  let resolveReady: () => void = () => {
    /* replaced synchronously by Promise constructor below */
  };
  let rejectReady: (error: unknown) => void = () => {
    /* replaced synchronously by Promise constructor below */
  };
  const ready = new Promise<void>((resolve, reject) => {
    resolveReady = resolve;
    rejectReady = reject;
  });

  worker.onmessage = (event: WorkerOutboundEvent) => {
    const data = event.data;
    if (data.kind === "ready") {
      resolveReady();
      return;
    }
    // An error without an id is an init failure: reject `ready` so callers
    // fail loudly instead of hanging on `await ready`.
    if (data.kind === "error" && data.id === undefined) {
      rejectReady(new Error(data.message ?? "[pii-filter] worker init failed"));
      return;
    }
    if (data.id === undefined) return;
    const entry = pending.get(data.id);
    if (entry === undefined) return; // unknown or already-settled request
    pending.delete(data.id);
    if (data.kind === "error") {
      // Mirror the init path: never reject with an empty, undiagnosable Error.
      entry.reject(new Error(data.message ?? "[pii-filter] worker detect failed"));
    } else {
      entry.resolve(data.spans);
    }
  };

  worker.postMessage({ kind: "init", options });

  function detect(text: string, minScore?: number): Promise<unknown> {
    const id = nextId++;
    return new Promise((resolve, reject) => {
      pending.set(id, { resolve, reject });
      try {
        worker.postMessage({ kind: "detect", id, text, minScore });
      } catch (error) {
        // The request never left this thread, so no reply will ever clear
        // the entry — drop it here to avoid leaking it.
        pending.delete(id);
        reject(error);
      }
    });
  }

  return { ready, detect };
}
package/src/policy.ts ADDED
@@ -0,0 +1,64 @@
1
+ /**
2
+ * Span reconciliation + default-deny policy.
3
+ *
4
+ * Detectors (heuristic + NER) emit overlapping, possibly conflicting spans.
5
+ * {@link mergeSpans} resolves them into a non-overlapping set, then
6
+ * {@link applyPolicy} drops anything in the keep-set so only redactable spans
7
+ * remain. This is Option A: redact detected entities whose label is not kept.
8
+ */
9
+
10
+ import { KEEP_LABELS, shouldRedact, type PiiLabel, type Span } from "./types";
11
+
12
/**
 * Reduce overlapping spans to a disjoint set, ordered by start offset.
 *
 * When two spans overlap, {@link preferred} picks the winner: higher score,
 * then longer span, then the heuristic source.
 *
 * - Full containment collapses to the winner unchanged.
 * - Partial overlap (neither span contains the other) produces the
 *   character-union of both ranges under the winner's label/score/source, so
 *   the loser's exclusive characters are not silently exposed. The union's
 *   `text` is stitched from both spans so it keeps matching `[start, end)`;
 *   callers rely on `text` being the exact substring for rehydration.
 *
 * @param spans Detected spans in any order; the input array is not mutated.
 * @returns A new array of non-overlapping spans sorted by `start`.
 */
export function mergeSpans(spans: readonly Span[]): Span[] {
  // Start ascending, then longest first, so a containing span precedes the
  // spans it contains.
  const sorted = [...spans].sort((a, b) => a.start - b.start || b.end - a.end);
  const merged: Span[] = [];
  for (const span of sorted) {
    const prev = merged[merged.length - 1];
    if (prev === undefined || span.start >= prev.end) {
      merged.push(span);
      continue;
    }
    const winner = preferred(prev, span);
    const prevContains = prev.start <= span.start && prev.end >= span.end;
    const spanContains = span.start <= prev.start && span.end >= prev.end;
    if (prevContains || spanContains) {
      merged[merged.length - 1] = winner;
      continue;
    }
    // Partial overlap: widen to the union and rebuild the covered text.
    merged[merged.length - 1] = {
      ...winner,
      start: Math.min(prev.start, span.start),
      end: Math.max(prev.end, span.end),
      text: stitchOverlap(prev, span, winner.text),
    };
  }
  return merged;
}

/**
 * Concatenate the text of two partially overlapping spans into the text of
 * their union. Falls back to `fallback` when either span's `text` length does
 * not match its range, since the overlap cannot then be located reliably.
 */
function stitchOverlap(a: Span, b: Span, fallback: string): string {
  const [left, right] = a.start <= b.start ? [a, b] : [b, a];
  const consistent =
    left.text.length === left.end - left.start && right.text.length === right.end - right.start;
  if (!consistent) return fallback;
  // Skip the characters of `right` that `left` already covers.
  return left.text + right.text.slice(left.end - right.start);
}

/** Pick the span to keep: higher score, then longer range, then heuristic source. */
function preferred(a: Span, b: Span): Span {
  if (a.score !== b.score) return a.score > b.score ? a : b;
  const aLen = a.end - a.start;
  const bLen = b.end - b.start;
  if (aLen !== bLen) return aLen > bLen ? a : b;
  return a.source === "heuristic" ? a : b;
}
55
+
56
/**
 * Merge the detected spans, then keep only those whose label is not in the
 * keep-set.
 *
 * @param spans Raw detector output; may overlap.
 * @param keepLabels Labels that are preserved in the text (defaults to `KEEP_LABELS`).
 * @returns The spans to redact, ordered from the highest start offset to the
 *   lowest so callers can splice from the end without shifting earlier offsets.
 */
export function applyPolicy(spans: readonly Span[], keepLabels: ReadonlySet<PiiLabel> = KEEP_LABELS): Span[] {
  const redactable: Span[] = [];
  for (const candidate of mergeSpans(spans)) {
    if (shouldRedact(candidate.label, keepLabels)) redactable.push(candidate);
  }
  redactable.sort((left, right) => right.start - left.start);
  return redactable;
}
package/src/premask.ts ADDED
@@ -0,0 +1,90 @@
1
+ /**
2
+ * Pre-mask harness: substitute deterministically-detected structured PII with
3
+ * neutral sentinels *before* the contextual model runs.
4
+ *
5
+ * The heuristic layer (SSN / CREDIT_CARD / IP_ADDRESS / EMAIL / URL) is
6
+ * validator- or pattern-backed and runs synchronously. Rather than show the
7
+ * model raw card/SSN/IP digits or email/URL strings — which it never needs to
8
+ * classify and which only add noise — we replace each heuristic span with a
9
+ * fixed sentinel token (`[SSN]`, `[CREDIT_CARD]`, `[IP_ADDRESS]`, ...). The model
10
+ * is trained on text masked the exact same way, so the train-time and
11
+ * inference-time input distributions match by construction.
12
+ *
13
+ * Offsets: the masked string is paired with a per-character map back to the raw
14
+ * input, so a span the model reports in masked coordinates can be projected to
15
+ * exact raw offsets. Sentinel characters map onto their source span's raw range,
16
+ * so if the model happens to label part of a sentinel the projection lands
17
+ * inside the heuristic span (which then wins the merge on its score-1 weight).
18
+ *
19
+ * The heuristic spans themselves are returned with their original raw offsets,
20
+ * so final redaction and the session table are unaffected: only the text handed
21
+ * to the model is masked, never the text that gets placeholdered.
22
+ */
23
+
24
+ import { mergeSpans } from "./policy";
25
+ import type { PiiLabel, Span } from "./types";
26
+
27
+ /** A masked copy of the input plus a map from each masked char to raw offsets. */
28
+ export interface PremaskResult {
29
+ /** Input with heuristic spans replaced by sentinel tokens. */
30
+ readonly masked: string;
31
+ /** `rawStart[i]` is the raw offset of the source of `masked[i]`. */
32
+ readonly rawStart: number[];
33
+ /** `rawEnd[i]` is the raw offset just past that source. */
34
+ readonly rawEnd: number[];
35
+ }
36
+
37
/**
 * Build the sentinel token that stands in for a premasked label: the label
 * name wrapped in square brackets (e.g. `SSN` becomes `[SSN]`).
 */
export function sentinelFor(label: PiiLabel): string {
  return "[" + label + "]";
}
41
+
42
/**
 * Produce a masked copy of `raw` in which every span is replaced by its label
 * sentinel, together with a per-character map back to raw offsets.
 *
 * Spans are merged and ordered by start first. Text between spans is copied
 * one character at a time and maps 1:1 onto raw; every character of a
 * sentinel maps onto the full `[start, end)` range of the span it replaces.
 * A span that starts before the current cursor is skipped.
 *
 * @param raw The original input text.
 * @param spans Spans to mask, expressed in raw offsets.
 * @returns The masked string plus parallel `rawStart` / `rawEnd` arrays.
 */
export function premask(raw: string, spans: readonly Span[]): PremaskResult {
  const disjoint = [...mergeSpans(spans)];
  disjoint.sort((left, right) => left.start - right.start);

  const pieces: string[] = [];
  const rawStart: number[] = [];
  const rawEnd: number[] = [];

  // Copy raw[from, to) unchanged, recording an identity mapping per character.
  const emitRaw = (from: number, to: number): void => {
    let at = from;
    while (at < to) {
      // Template coercion matches what string concatenation would produce.
      pieces.push(`${raw[at]}`);
      rawStart.push(at);
      rawEnd.push(at + 1);
      at += 1;
    }
  };

  let consumed = 0;
  for (const { start, end, label } of disjoint) {
    if (start < consumed) continue; // residual overlap: already covered
    emitRaw(consumed, start);
    for (const glyph of sentinelFor(label)) {
      pieces.push(glyph);
      rawStart.push(start);
      rawEnd.push(end);
    }
    consumed = end;
  }
  emitRaw(consumed, raw.length);

  return { masked: pieces.join(""), rawStart, rawEnd };
}
78
+
79
/**
 * Translate a span expressed in masked-string offsets into raw-input offsets.
 *
 * The raw start comes from the map entry of the span's first masked
 * character and the raw end from the entry of its last one. The returned
 * span copies every other field and carries `text` sliced from `raw`.
 *
 * @param span Span in masked coordinates.
 * @param raw The original, unmasked input.
 * @param map Offset map from the premask step.
 * @returns The projected span, or `null` when the span is empty, falls
 *   outside the map, or projects onto an empty raw range.
 */
export function projectMaskedSpan(span: Span, raw: string, map: PremaskResult): Span | null {
  const isEmpty = span.start >= span.end;
  if (isEmpty) return null;

  const projectedStart = map.rawStart[span.start];
  const projectedEnd = map.rawEnd[span.end - 1];
  if (projectedStart === undefined || projectedEnd === undefined) return null;
  if (projectedStart >= projectedEnd) return null;

  return {
    ...span,
    start: projectedStart,
    end: projectedEnd,
    text: raw.slice(projectedStart, projectedEnd),
  };
}
package/src/session.ts ADDED
@@ -0,0 +1,99 @@
1
+ /**
2
+ * Session entity table: reversible placeholders for coherent chat.
3
+ *
4
+ * Blanket `[REDACTED]` makes the assistant's replies nonsense. Instead each
5
+ * redacted value gets a stable, typed placeholder (`[GIVEN_NAME_1]`, `[SSN_2]`)
6
+ * that survives across turns: the same raw value always maps to the same token,
7
+ * so the model can reason about "GIVEN_NAME_1" and we {@link rehydrate} the real
8
+ * value back into its reply before display.
9
+ *
10
+ * The map lives only on the client. What leaves the device is placeholdered
11
+ * text; the table never crosses the wire.
12
+ */
13
+
14
+ import { applyPolicy } from "./policy";
15
+ import { KEEP_LABELS, type PiiLabel, type Span } from "./types";
16
+
17
+ /** Result of scrubbing one message. */
18
+ export interface ScrubResult {
19
+ /** Text with PII replaced by placeholders. Safe to send/log. */
20
+ readonly text: string;
21
+ /** Placeholders introduced or reused in this message. */
22
+ readonly placeholders: readonly string[];
23
+ }
24
+
25
+ /**
26
+ * Optional display aliases for placeholder tokens. By default a GIVEN_NAME span
27
+ * becomes `[GIVEN_NAME_1]`; pass `{ GIVEN_NAME: "NAME" }` to get `[NAME_1]` instead.
28
+ * Only the visible token changes — detection and policy are unaffected.
29
+ */
30
+ export type PlaceholderAliases = Partial<Record<PiiLabel, string>>;
31
+
32
+ /** Token shape minted by the table; also the regex used to find them on the
33
+ * way back. Kept in one place so scrub and rehydrate can never disagree. */
34
+ export const PLACEHOLDER_PATTERN = /\[[A-Z][A-Z_]*_\d+\]/g;
35
+
36
/**
 * A per-conversation store mapping raw PII values to stable placeholders.
 *
 * Entries are keyed by `label + normalized value` (lower-cased, whitespace
 * collapsed and trimmed), so "John" stays `[GIVEN_NAME_1]` on every turn and
 * casing/whitespace noise does not mint duplicate tokens. Counters are kept
 * per visible name, so two labels aliased to the same name never collide.
 */
export class SessionEntityTable {
  /** `label:normalized value` → placeholder token. */
  private readonly forward = new Map<string, string>();
  /** Placeholder token → the value as first seen (original casing). */
  private readonly reverse = new Map<string, string>();
  /** Visible name → last number handed out for that name. */
  private readonly counters = new Map<string, number>();

  /**
   * @param aliases Optional visible names per label (e.g. `{ GIVEN_NAME: "NAME" }`).
   * @param keepLabels Labels that `scrub` leaves in the text.
   */
  constructor(
    private readonly aliases: PlaceholderAliases = {},
    private readonly keepLabels: ReadonlySet<PiiLabel> = KEEP_LABELS,
  ) {}

  /** The visible name used in tokens for a label (alias or the label itself). */
  private displayName(label: PiiLabel): string {
    return this.aliases[label] ?? label;
  }

  /**
   * Get or mint the placeholder for a label + value pair. Repeated calls with
   * an equivalent value (ignoring case and whitespace runs) return the same
   * token; the value stored for rehydration is the first one seen.
   */
  placeholderFor(label: PiiLabel, value: string): string {
    const key = `${label}:${value.toLowerCase().replace(/\s+/g, " ").trim()}`;
    const existing = this.forward.get(key);
    if (existing !== undefined) return existing;
    const name = this.displayName(label);
    const next = (this.counters.get(name) ?? 0) + 1;
    this.counters.set(name, next);
    const token = `[${name}_${next}]`;
    this.forward.set(key, token);
    this.reverse.set(token, value);
    return token;
  }

  /**
   * Replace each redactable span in `raw` with its placeholder.
   *
   * Spans come back from `applyPolicy` ordered right-to-left, so splicing
   * never invalidates an offset that is still to be processed.
   *
   * @param raw The original message text.
   * @param spans Detected spans in raw offsets.
   * @returns The placeholdered text and the tokens used, in left-to-right order.
   */
  scrub(raw: string, spans: readonly Span[]): ScrubResult {
    const redactable = applyPolicy(spans, this.keepLabels);
    const placeholders: string[] = [];
    let text = raw;
    for (const span of redactable) {
      // Key the placeholder on the characters actually being removed rather
      // than on `span.text`: if the two ever disagree (e.g. a span widened
      // during merging), rehydration must restore exactly what was cut out.
      const value = raw.slice(span.start, span.end);
      const token = this.placeholderFor(span.label, value);
      placeholders.push(token);
      text = `${text.slice(0, span.start)}${token}${text.slice(span.end)}`;
    }
    return { text, placeholders: placeholders.reverse() };
  }

  /**
   * Restore real values in text that contains placeholders (typically the
   * assistant's reply before it is shown to the user). Tokens this table did
   * not mint are left intact.
   */
  rehydrate(text: string): string {
    return text.replace(PLACEHOLDER_PATTERN, (token) => this.reverse.get(token) ?? token);
  }

  /** True if `token` is a placeholder this table can resolve. */
  knows(token: string): boolean {
    return this.reverse.has(token);
  }
}
@@ -0,0 +1,73 @@
1
+ /**
2
+ * Streaming placeholder reveal.
3
+ *
4
+ * The assistant streams tokens, so a placeholder can be split across chunks:
5
+ * chunk A: "...thanks [NA"
6
+ * chunk B: "ME_1] for..."
7
+ * A per-chunk regex replace would emit the broken `[NA` and never match. This
8
+ * buffers the smallest suffix that could still become a placeholder, emits
9
+ * everything safe before it, and flushes the remainder at stream end.
10
+ */
11
+
12
+ import { PLACEHOLDER_PATTERN } from "./session";
13
+
14
+ /** Resolves a placeholder token to its real value, or null to leave it as-is. */
15
+ export type PlaceholderResolver = (token: string) => string | null;
16
+
17
+ // A run that *might* grow into a placeholder: '[' then [A-Z_] then optional
18
+ // '_' digits, not yet closed by ']'. If a suffix matches this, hold it.
19
+ const PARTIAL_TOKEN = /\[[A-Z_]*(?:_\d*)?$/;
20
+
21
/**
 * Incremental placeholder reveal for chunked text.
 *
 * Chunks are fed in order through `push`, which returns the part that is safe
 * to render immediately. A trailing run that could still turn into a
 * placeholder (an unclosed `[` followed only by token characters) is held
 * back until a later chunk completes or disproves it; `flush` releases
 * whatever is still held once the stream ends.
 */
export class StreamingReveal {
  private buffer = "";

  /** @param resolve Maps a complete placeholder to its value, or `null` to keep the token. */
  constructor(private readonly resolve: PlaceholderResolver) {}

  /** Append `chunk`, reveal every complete placeholder, and hold any partial tail. */
  push(chunk: string): string {
    this.buffer += chunk;
    const revealed = this.replaceComplete(this.buffer);

    // The pattern is anchored at the end, so a match always runs from its
    // index to the end of the text: that index is the hold-back boundary.
    const held = PARTIAL_TOKEN.exec(revealed);
    if (held === null) {
      this.buffer = "";
      return revealed;
    }
    const boundary = held.index;
    this.buffer = revealed.slice(boundary);
    return revealed.slice(0, boundary);
  }

  /** Release the held tail (for instance a lone `[` that never became a token). */
  flush(): string {
    const remainder = this.replaceComplete(this.buffer);
    this.buffer = "";
    return remainder;
  }

  /** Swap every complete placeholder for its resolved value, keeping unresolved ones. */
  private replaceComplete(text: string): string {
    return text.replace(PLACEHOLDER_PATTERN, (token) => {
      const value = this.resolve(token);
      return value ?? token;
    });
  }
}
56
+
57
/**
 * Wrap a {@link StreamingReveal} in a Web Streams `TransformStream` so
 * placeholders are revealed as string chunks flow through a pipeline.
 *
 * Each incoming chunk is pushed through the reveal and the safe part is
 * forwarded; the held tail is forwarded when the stream closes. Empty
 * results are not enqueued.
 *
 * @param resolve Maps a placeholder token to its value, or `null` to leave it.
 */
export function createRevealTransform(resolve: PlaceholderResolver): TransformStream<string, string> {
  const reveal = new StreamingReveal(resolve);

  // Forward text downstream, skipping empty strings.
  const forward = (controller: TransformStreamDefaultController<string>, text: string): void => {
    if (text) controller.enqueue(text);
  };

  return new TransformStream<string, string>({
    transform: (chunk, controller) => {
      forward(controller, reveal.push(chunk));
    },
    flush: (controller) => {
      forward(controller, reveal.flush());
    },
  });
}
package/src/types.ts ADDED
@@ -0,0 +1,74 @@
1
+ /**
2
+ * Shared vocabulary for the PII filter.
3
+ *
4
+ * The whole system is *default-deny*: every detected entity is redacted unless
5
+ * its label is in {@link KEEP_LABELS} (currently {CITY, STATE, ZIP_CODE}). An
6
+ * unrecognized span is dropped, never leaked.
7
+ *
8
+ * Two layers emit labels:
9
+ * - Heuristics (synchronous, validator-backed) detect and premask the
10
+ * structured numeric identifiers below before the model ever runs.
11
+ * - The NER model learns the contextual "fine set": split names, contact and
12
+ * document identifiers, and address components.
13
+ */
14
+
15
+ /** Every entity class the heuristics and the NER model can emit. */
16
+ export type PiiLabel =
17
+ // --- Structured (heuristic-detectable, premasked before the model) ---
18
+ | "SSN"
19
+ | "CREDIT_CARD"
20
+ | "IP_ADDRESS"
21
+ // --- Contextual (NER model fine set) ---
22
+ | "GIVEN_NAME"
23
+ | "SURNAME"
24
+ | "EMAIL"
25
+ | "PHONE"
26
+ | "URL"
27
+ | "TAX_ID"
28
+ | "BANK_ACCOUNT"
29
+ | "ROUTING_NUMBER"
30
+ | "GOVERNMENT_ID"
31
+ | "PASSPORT"
32
+ | "DRIVERS_LICENSE"
33
+ // --- Address components (NER) ---
34
+ | "BUILDING_NUMBER"
35
+ | "STREET_NAME"
36
+ | "SECONDARY_ADDRESS"
37
+ | "CITY"
38
+ | "STATE"
39
+ | "ZIP_CODE";
40
+
41
+ /**
42
+ * Labels that are intentionally preserved (classified but never redacted).
43
+ * The model still learns these; the JS policy layer simply keeps them in the
44
+ * text. Public-benefits assistants need broad geography (city / state / ZIP)
45
+ * for area-median-income (AMI) eligibility, so those are kept while the precise
46
+ * street line (BUILDING_NUMBER + STREET_NAME) is still redacted.
47
+ */
48
+ export const KEEP_LABELS: ReadonlySet<PiiLabel> = new Set<PiiLabel>(["CITY", "STATE", "ZIP_CODE"]);
49
+
50
+ /** A detected entity span over the *original* (raw) text. */
51
+ export interface Span {
52
+ /** Inclusive start offset into the raw input. */
53
+ readonly start: number;
54
+ /** Exclusive end offset into the raw input. */
55
+ readonly end: number;
56
+ /** The classified entity type. */
57
+ readonly label: PiiLabel;
58
+ /** Detector confidence in [0, 1]. Heuristics with validators report 1. */
59
+ readonly score: number;
60
+ /** Which layer produced the span; used for merge tie-breaks and audit. */
61
+ readonly source: "heuristic" | "ner";
62
+ /** The raw substring covered, retained for placeholder rehydration. */
63
+ readonly text: string;
64
+ }
65
+
66
/**
 * Turn an optional caller-supplied list of labels into a keep-set.
 *
 * @param keepLabels Labels to preserve; when omitted the shared default
 *   {@link KEEP_LABELS} set is returned as-is.
 * @returns A new set built from `keepLabels`, or the default set.
 */
export function resolveKeepLabels(keepLabels?: readonly PiiLabel[]): ReadonlySet<PiiLabel> {
  if (keepLabels !== undefined) return new Set(keepLabels);
  return KEEP_LABELS;
}
70
+
71
/**
 * Decide whether a label is redacted: everything is, except labels present in
 * the keep-set.
 *
 * @param label The entity label to check.
 * @param keepLabels Labels that are preserved (defaults to {@link KEEP_LABELS}).
 */
export function shouldRedact(label: PiiLabel, keepLabels: ReadonlySet<PiiLabel> = KEEP_LABELS): boolean {
  if (keepLabels.has(label)) return false;
  return true;
}
@@ -0,0 +1,40 @@
1
+ /**
2
+ * Validators that turn "a run of digits" into "a *valid* SSN / card".
3
+ *
4
+ * These exist to suppress false positives so the filter doesn't mangle numbers
5
+ * the assistant is allowed to keep (income figures, ages, years). A detector
6
+ * proposes a span; a validator decides whether it is really that entity.
7
+ */
8
+
9
/**
 * Luhn checksum — gates CREDIT_CARD so arbitrary digit runs don't match.
 *
 * Walking from the rightmost digit, every second digit is doubled (minus 9
 * when the result exceeds 9) and all digits are summed; the number is valid
 * when the sum is a multiple of 10.
 *
 * @param digits The candidate number as ASCII digits only (no spaces or dashes).
 * @returns `true` when `digits` is non-empty, all digits, and passes the checksum.
 */
export function isLuhnValid(digits: string): boolean {
  // An empty string would sum to 0 and "pass", and any non-digit character
  // would feed a meaningless value into the sum — reject both up front.
  if (!/^\d+$/.test(digits)) return false;
  let sum = 0;
  let double = false;
  for (let i = digits.length - 1; i >= 0; i--) {
    let d = digits.charCodeAt(i) - 0x30; // ASCII digit → numeric value
    if (double) {
      d *= 2;
      if (d > 9) d -= 9;
    }
    sum += d;
    double = !double;
  }
  return sum % 10 === 0;
}
24
+
25
/**
 * US SSN structural rules. Area (first 3) cannot be 000, 666, or 900-999;
 * group (middle 2) cannot be 00; serial (last 4) cannot be 0000.
 *
 * @param digits The candidate as exactly nine ASCII digits (separators removed).
 * @returns `true` when `digits` is nine digits and satisfies every rule above.
 */
export function isValidSsn(digits: string): boolean {
  // Require digits, not just nine characters: otherwise "abcdefghi" passes,
  // because a NaN area is never >= 900.
  if (!/^\d{9}$/.test(digits)) return false;
  const area = digits.slice(0, 3);
  const group = digits.slice(3, 5);
  const serial = digits.slice(5);
  if (area === "000" || area === "666") return false;
  if (Number(area) >= 900) return false;
  if (group === "00") return false;
  if (serial === "0000") return false;
  return true;
}