@nationaldesignstudio/rampart 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +402 -0
- package/MODEL_CARD.md +422 -0
- package/README.md +279 -0
- package/RELEASE.md +97 -0
- package/WHITEPAPER.md +316 -0
- package/dist/index.d.ts +23 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +35639 -0
- package/dist/index.js.map +36 -0
- package/dist/src/guard.d.ts +94 -0
- package/dist/src/guard.d.ts.map +1 -0
- package/dist/src/heuristics.d.ts +14 -0
- package/dist/src/heuristics.d.ts.map +1 -0
- package/dist/src/ner/classifier.d.ts +92 -0
- package/dist/src/ner/classifier.d.ts.map +1 -0
- package/dist/src/ner/worker.d.ts +44 -0
- package/dist/src/ner/worker.d.ts.map +1 -0
- package/dist/src/ner/worker.js +35302 -0
- package/dist/src/ner/worker.js.map +30 -0
- package/dist/src/pipeline.d.ts +76 -0
- package/dist/src/pipeline.d.ts.map +1 -0
- package/dist/src/policy.d.ts +27 -0
- package/dist/src/policy.d.ts.map +1 -0
- package/dist/src/premask.d.ts +48 -0
- package/dist/src/premask.d.ts.map +1 -0
- package/dist/src/session.d.ts +60 -0
- package/dist/src/session.d.ts.map +1 -0
- package/dist/src/streaming.d.ts +32 -0
- package/dist/src/streaming.d.ts.map +1 -0
- package/dist/src/types.d.ts +43 -0
- package/dist/src/types.d.ts.map +1 -0
- package/dist/src/validators.d.ts +16 -0
- package/dist/src/validators.d.ts.map +1 -0
- package/eval/bench/README.md +91 -0
- package/eval/bench/fetch.ts +152 -0
- package/eval/bench/labels.ts +45 -0
- package/eval/bench/run.ts +146 -0
- package/eval/bench/runs/m06-v3-30k/by_language.json +303 -0
- package/eval/bench/runs/m06-v3-30k/summary.json +56 -0
- package/eval/bench/runs/sample-900/by_language.json +303 -0
- package/eval/bench/runs/sample-900/manifest.json +926 -0
- package/eval/bench/runs/sample-900/summary.json +56 -0
- package/eval/bench/score.ts +197 -0
- package/eval/bench/webgpu/entry.ts +70 -0
- package/eval/bench/webgpu/index.html +12 -0
- package/eval/bench/webgpu.ts +209 -0
- package/eval/public-cases.ts +412 -0
- package/eval/run-public-eval.ts +140 -0
- package/examples/basic-chat.ts +12 -0
- package/examples/pii-worker.ts +3 -0
- package/index.ts +47 -0
- package/package.json +103 -0
- package/src/guard.ts +170 -0
- package/src/heuristics.ts +141 -0
- package/src/ner/classifier.ts +580 -0
- package/src/ner/worker.ts +130 -0
- package/src/policy.ts +64 -0
- package/src/premask.ts +90 -0
- package/src/session.ts +99 -0
- package/src/streaming.ts +73 -0
- package/src/types.ts +74 -0
- package/src/validators.ts +40 -0
package/package.json
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@nationaldesignstudio/rampart",
|
|
3
|
+
"version": "0.1.1",
|
|
4
|
+
"description": "Rampart — client-side PII redaction for AI assistants: deterministic recognizers + a 14.7 MB ONNX classifier (transformers.js), default-deny policy, and reversible placeholders. Runs entirely in the browser.",
|
|
5
|
+
"license": "CC-BY-4.0",
|
|
6
|
+
"homepage": "https://github.com/nationaldesignstudio/rampart#readme",
|
|
7
|
+
"repository": {
|
|
8
|
+
"type": "git",
|
|
9
|
+
"url": "git+https://github.com/nationaldesignstudio/rampart.git"
|
|
10
|
+
},
|
|
11
|
+
"bugs": {
|
|
12
|
+
"url": "https://github.com/nationaldesignstudio/rampart/issues"
|
|
13
|
+
},
|
|
14
|
+
"keywords": [
|
|
15
|
+
"pii",
|
|
16
|
+
"redaction",
|
|
17
|
+
"privacy",
|
|
18
|
+
"llm",
|
|
19
|
+
"browser",
|
|
20
|
+
"onnx",
|
|
21
|
+
"transformers.js"
|
|
22
|
+
],
|
|
23
|
+
"type": "module",
|
|
24
|
+
"main": "./dist/index.js",
|
|
25
|
+
"module": "./dist/index.js",
|
|
26
|
+
"types": "./dist/index.d.ts",
|
|
27
|
+
"exports": {
|
|
28
|
+
".": {
|
|
29
|
+
"types": "./dist/index.d.ts",
|
|
30
|
+
"import": "./dist/index.js",
|
|
31
|
+
"default": "./dist/index.js"
|
|
32
|
+
},
|
|
33
|
+
"./dist": {
|
|
34
|
+
"types": "./dist/index.d.ts",
|
|
35
|
+
"import": "./dist/index.js",
|
|
36
|
+
"default": "./dist/index.js"
|
|
37
|
+
},
|
|
38
|
+
"./worker": {
|
|
39
|
+
"types": "./dist/src/ner/worker.d.ts",
|
|
40
|
+
"import": "./dist/src/ner/worker.js",
|
|
41
|
+
"default": "./dist/src/ner/worker.js"
|
|
42
|
+
},
|
|
43
|
+
"./dist/worker": {
|
|
44
|
+
"types": "./dist/src/ner/worker.d.ts",
|
|
45
|
+
"import": "./dist/src/ner/worker.js",
|
|
46
|
+
"default": "./dist/src/ner/worker.js"
|
|
47
|
+
},
|
|
48
|
+
"./package.json": "./package.json"
|
|
49
|
+
},
|
|
50
|
+
"publishConfig": {
|
|
51
|
+
"access": "restricted"
|
|
52
|
+
},
|
|
53
|
+
"files": [
|
|
54
|
+
"dist",
|
|
55
|
+
"eval",
|
|
56
|
+
"examples",
|
|
57
|
+
"index.ts",
|
|
58
|
+
"LICENSE",
|
|
59
|
+
"src",
|
|
60
|
+
"MODEL_CARD.md",
|
|
61
|
+
"README.md",
|
|
62
|
+
"RELEASE.md",
|
|
63
|
+
"WHITEPAPER.md",
|
|
64
|
+
"!eval/bench/webgpu/bundle.js",
|
|
65
|
+
"!eval/bench/data",
|
|
66
|
+
"!eval/bench/runs/*/latency.json"
|
|
67
|
+
],
|
|
68
|
+
"scripts": {
|
|
69
|
+
"build": "bun build ./index.ts ./src/ner/worker.ts --outdir ./dist --format esm --sourcemap --target browser && tsc -p tsconfig.build.json",
|
|
70
|
+
"eval:public": "bun eval/run-public-eval.ts",
|
|
71
|
+
"eval:public:strict": "bun eval/run-public-eval.ts --strict",
|
|
72
|
+
"bench:fetch": "bun eval/bench/fetch.ts",
|
|
73
|
+
"bench": "bun eval/bench/run.ts",
|
|
74
|
+
"bench:webgpu": "bun eval/bench/webgpu.ts",
|
|
75
|
+
"bench:webgpu:wasm": "bun eval/bench/webgpu.ts --device wasm",
|
|
76
|
+
"export:huggingface": "bun scripts/export-huggingface.ts",
|
|
77
|
+
"export:huggingface:verify": "bun run export:huggingface && bun scripts/verify-huggingface-export.ts",
|
|
78
|
+
"publish:huggingface": "bun run export:huggingface:verify && hf repo create nationaldesignstudio/rampart --repo-type model --private --exist-ok && hf upload nationaldesignstudio/rampart hf-export . --repo-type model",
|
|
79
|
+
"prepack": "bun run build",
|
|
80
|
+
"prepublishOnly": "bun run verify:public",
|
|
81
|
+
"redact": "bun cli/redact.ts",
|
|
82
|
+
"test": "vitest run",
|
|
83
|
+
"test:watch": "vitest",
|
|
84
|
+
"type-check": "tsc -p tsconfig.json --noEmit",
|
|
85
|
+
"verify:public": "bun run build && bun test && bun run type-check && bun run eval:public:strict && bun run export:huggingface:verify"
|
|
86
|
+
},
|
|
87
|
+
"peerDependencies": {
|
|
88
|
+
"@huggingface/transformers": ">=3"
|
|
89
|
+
},
|
|
90
|
+
"peerDependenciesMeta": {
|
|
91
|
+
"@huggingface/transformers": {
|
|
92
|
+
"optional": true
|
|
93
|
+
}
|
|
94
|
+
},
|
|
95
|
+
"devDependencies": {
|
|
96
|
+
"@huggingface/transformers": "3.7.5",
|
|
97
|
+
"@types/bun": "1.3.14",
|
|
98
|
+
"playwright": "1.61.1",
|
|
99
|
+
"typescript": "5.9.2",
|
|
100
|
+
"vitest": "4.1.8"
|
|
101
|
+
},
|
|
102
|
+
"packageManager": "bun@1.3.14"
|
|
103
|
+
}
|
package/src/guard.ts
ADDED
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* ChatGuard: per-conversation PII filter for wiring into a chat app.
|
|
3
|
+
*
|
|
4
|
+
* const guard = await createGuard();
|
|
5
|
+
* const safe = await guard.protect(userInput);
|
|
6
|
+
* const reply = await llm(safe.text);
|
|
7
|
+
* guard.reveal(reply); // non-streaming
|
|
8
|
+
* stream.pipeThrough(guard.revealTransform()); // streaming
|
|
9
|
+
*
|
|
10
|
+
* The entity table lives only on the client; the real values never leave the
|
|
11
|
+
* device. Each guard keeps placeholder identity stable across every turn of a
|
|
12
|
+
* conversation.
|
|
13
|
+
*
|
|
14
|
+
* user message
|
|
15
|
+
* → heuristics (sync, structured PII)
|
|
16
|
+
* → NER (optional, async, contextual PII)
|
|
17
|
+
* → merge + default-deny policy (keep {city, state, zip})
|
|
18
|
+
* → session table: replace with stable placeholders
|
|
19
|
+
* assistant reply
|
|
20
|
+
* → rehydrate placeholders → render to user
|
|
21
|
+
*/
|
|
22
|
+
|
|
23
|
+
import { detectHeuristics } from "./heuristics";
|
|
24
|
+
import { loadNerClassifier, detectNer, RAMPART_MODEL_ID, type NerOptions } from "./ner/classifier";
|
|
25
|
+
import { createWorkerClassifier, type WorkerMessagePort } from "./ner/worker";
|
|
26
|
+
import { premask, projectMaskedSpan } from "./premask";
|
|
27
|
+
import { createRevealTransform } from "./streaming";
|
|
28
|
+
import { SessionEntityTable, type PlaceholderAliases, type ScrubResult } from "./session";
|
|
29
|
+
import { resolveKeepLabels, type PiiLabel, type Span } from "./types";
|
|
30
|
+
|
|
31
|
+
/** An async contextual detector. Matches both the in-process and worker forms. */
|
|
32
|
+
export type NerDetector = (text: string) => Promise<Span[]>;
|
|
33
|
+
|
|
34
|
+
/**
|
|
35
|
+
* Default placeholder aliases. Empty: names are split into GIVEN_NAME/SURNAME
|
|
36
|
+
* and a household may share a surname, so each keeps its own typed token
|
|
37
|
+
* (`[GIVEN_NAME_1]`, `[SURNAME_1]`) rather than collapsing to a single `NAME`.
|
|
38
|
+
*/
|
|
39
|
+
export const DEFAULT_ALIASES: PlaceholderAliases = {};
|
|
40
|
+
|
|
41
|
+
export interface GuardOptions {
  /**
   * Placeholder aliases. `createGuard` falls back to {@link DEFAULT_ALIASES}
   * (`{}`) when omitted — typed tokens like `[GIVEN_NAME_1]`.
   */
  readonly aliases?: PlaceholderAliases;
  /** Labels to preserve; defaults to `{CITY, STATE, ZIP_CODE}`. */
  readonly keepLabels?: readonly PiiLabel[];
  /**
   * When `true`, skip the structured-PII premask before the model. Required for
   * a model trained without prefilter (no-prefilter ablation) whose classes
   * include SSN / CREDIT_CARD / IP_ADDRESS. Heuristic spans for those types
   * still run as a safety net. Treated as `false` when omitted.
   */
  readonly noPrefilter?: boolean;
  /**
   * Pre-built NER detector. When set, no classifier is loaded, so `model`,
   * `worker`, `device` and `minScore` are ignored; the injected detector is
   * also used even if `heuristicsOnly` is `true`.
   */
  readonly ner?: NerDetector;
  /**
   * When `true`, skip the classifier and run heuristics only. Only takes
   * effect when `ner` is not supplied.
   */
  readonly heuristicsOnly?: boolean;
  /**
   * Hugging Face model id or local directory path (q4 ONNX). Defaults to
   * {@link RAMPART_MODEL_ID}.
   */
  readonly model?: string;
  /**
   * Worker script URL. When set, NER runs off the main thread in a module
   * worker (`new Worker(worker, { type: "module" })`).
   */
  readonly worker?: string | URL;
  /**
   * Backend. `"wasm"`/`"webgpu"` in browsers; `"cpu"` for Node. `createGuard`
   * uses `"wasm"` when this is omitted.
   */
  readonly device?: NerOptions["device"];
  /** Spans below this score are discarded. */
  readonly minScore?: number;
}
|
|
69
|
+
|
|
70
|
+
/** The slice of {@link GuardOptions} that `ChatGuard` itself reads once a detector has been resolved. */
type ChatGuardConfig = Pick<GuardOptions, "ner" | "aliases" | "keepLabels" | "noPrefilter">;
|
|
71
|
+
|
|
72
|
+
export class ChatGuard {
  /** Conversation-scoped placeholder table; every scrub and rehydrate goes through it. */
  private readonly table: SessionEntityTable;
  /** Contextual detector. When absent, detection is heuristics-only. */
  private readonly ner?: NerDetector;
  /** When `true`, the detector is fed the raw text rather than the premasked text. */
  private readonly noPrefilter: boolean;

  /**
   * @param config Detector, aliases, kept labels and prefilter switch. Every
   *   field is optional; `noPrefilter` is treated as `false` when missing.
   */
  constructor(config: ChatGuardConfig = {}) {
    const kept = resolveKeepLabels(config.keepLabels);
    this.table = new SessionEntityTable(config.aliases, kept);
    this.ner = config.ner;
    this.noPrefilter = config.noPrefilter ?? false;
  }

  /**
   * Collect every span found in `text`: heuristic spans first, then whatever
   * the contextual detector adds. The result is not de-duplicated here.
   */
  private async detect(text: string): Promise<Span[]> {
    const structured = detectHeuristics(text);

    // No detector configured: the heuristic pass is the whole answer.
    if (this.ner === undefined) return structured;

    // No-prefilter mode: the detector reads the raw text directly.
    if (this.noPrefilter) {
      const direct = await this.ner(text);
      return structured.concat(direct);
    }

    // Default mode: mask the heuristic hits, run the detector on the masked
    // text, then project each model span back onto the original text. Spans
    // that cannot be projected (null) are dropped.
    const masking = premask(text, structured);
    const onMasked = await this.ner(masking.masked);
    const contextual = onMasked
      .map((span) => projectMaskedSpan(span, text, masking))
      .filter((span): span is Span => span !== null);
    return structured.concat(contextual);
  }

  /**
   * Run this on the user's text *before* handing it to the AI SDK.
   *
   * @returns The placeholdered text to send, plus the placeholders introduced
   *   this turn.
   */
  async protect(text: string): Promise<ScrubResult> {
    const found = await this.detect(text);
    return this.table.scrub(text, found);
  }

  /** Swap placeholders back for real values in a complete (non-streaming) reply. */
  reveal(reply: string): string {
    return this.table.rehydrate(reply);
  }

  /**
   * Web Streams transform that reveals placeholders in a streamed reply,
   * including placeholders split across chunks. Pipe an AI SDK `textStream`
   * through it before rendering.
   */
  revealTransform(): TransformStream<string, string> {
    // `null` tells the transform the token is not a known placeholder.
    const lookup = (token: string): string | null => {
      const restored = this.table.rehydrate(token);
      return restored !== token ? restored : null;
    };
    return createRevealTransform(lookup);
  }

  /**
   * Defense in depth: scrub the model's *output* before logging or persisting
   * it, since a model can emit PII the user never typed.
   *
   * @returns The placeholdered reply.
   */
  async protectReply(reply: string): Promise<ScrubResult> {
    const found = await this.detect(reply);
    return this.table.scrub(reply, found);
  }
}
|
|
135
|
+
|
|
136
|
+
/** The slice of {@link GuardOptions} that `buildNer` reads when loading a classifier. */
type NerLoadOptions = Pick<GuardOptions, "model" | "worker" | "minScore" | "device">;
|
|
137
|
+
|
|
138
|
+
/**
 * Load the classifier and wrap it as a {@link NerDetector}.
 *
 * With `worker` set, the classifier runs in a module worker and this resolves
 * once the worker reports ready; otherwise the classifier is loaded in-process.
 * `device` defaults to `"wasm"`.
 *
 * @param options Model id/path, optional worker script URL, backend and score floor.
 * @returns A detector that resolves to the spans found in a piece of text.
 * @throws Whatever the loader or the worker start-up rejects with. If the
 *   worker path fails, the spawned worker is terminated before rethrowing.
 */
async function buildNer(options: NerLoadOptions): Promise<NerDetector> {
  const { model, worker, minScore, device = "wasm" } = options;
  const modelOptions = { model, device, minScore };

  if (worker === undefined) {
    const classifier = await loadNerClassifier(modelOptions);
    return (text) => detectNer(text, classifier, minScore);
  }

  // Keep the real `Worker` handle so a failed start-up can be cleaned up.
  const thread = new Worker(worker, { type: "module" });
  try {
    // The DOM `Worker` types `onmessage` with `MessageEvent`; `WorkerMessagePort`
    // is the deliberately DOM-free structural port the worker module also uses.
    // They are not mutually assignable, so cast at this single boundary.
    const port = thread as unknown as WorkerMessagePort;
    const classifier = createWorkerClassifier(port, modelOptions);
    await classifier.ready;
    return async (text) => (await classifier.detect(text)) as Span[];
  } catch (error) {
    // Without this the worker would stay alive after a failed init, and every
    // retried `createGuard` would leak another thread.
    thread.terminate();
    throw error;
  }
}
|
|
155
|
+
|
|
156
|
+
/**
 * Build a conversation guard.
 *
 * By default this loads the Rampart classifier (q4 ONNX). Pass `model` to load
 * a different Hugging Face id or local path, `heuristicsOnly: true` to skip
 * NER entirely, or `ner` to inject a ready-made detector (in which case
 * nothing is loaded).
 *
 * @param options See {@link GuardOptions}; every field is optional.
 * @returns A `ChatGuard` bound to a fresh entity table.
 */
export async function createGuard(options: GuardOptions = {}): Promise<ChatGuard> {
  const { aliases = DEFAULT_ALIASES, keepLabels, noPrefilter, ner, heuristicsOnly, ...nerLoad } = options;

  // Only load a classifier when none was injected and NER was not disabled.
  const mustLoad = ner === undefined && heuristicsOnly !== true;
  const detector = mustLoad ? await buildNer(nerLoad) : ner;

  return new ChatGuard({ ner: detector, aliases, keepLabels, noPrefilter });
}
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Heuristic detectors: the cheap, synchronous, zero-model first pass.
|
|
3
|
+
*
|
|
4
|
+
 * Digit-bearing PII (SSN, credit card) is found over the *digit
|
|
5
|
+
 * projection* (built per run by `extractRuns` below) so every separator variant collapses to one
|
|
6
|
+
* rule: `888-88-8888`, `888 88 8888`, `888.88.8888`, and `888888888` all match.
|
|
7
|
+
* Text-shaped PII (email, URL, IP) is matched on the raw string where the
|
|
8
|
+
* structure lives in the punctuation.
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import type { PiiLabel, Span } from "./types";
|
|
12
|
+
import { isLuhnValid, isValidSsn } from "./validators";
|
|
13
|
+
|
|
14
|
+
/** A fixed-length digit candidate paired with its label and validator. */
interface DigitRule {
  /** Label stamped on every span this rule produces. */
  readonly label: PiiLabel;
  /**
   * Exact digit counts this rule accepts. `detectDigitEntities` tests a run's
   * digit count with `includes`, so the order inside this list does not
   * affect matching.
   */
  readonly lengths: readonly number[];
  /** Returns true if the digit slice is a real instance of this entity. */
  readonly validate: (digits: string) => boolean;
}

// Order matters: rules are tried top to bottom and the first one that accepts
// a run wins it, so longer/stricter entities come first and a 16-digit card is
// never carved into a 9-digit "SSN". Overlap is also resolved later in merge.
const DIGIT_RULES: readonly DigitRule[] = [
  { label: "CREDIT_CARD", lengths: [16, 15, 14], validate: isLuhnValid },
  { label: "SSN", lengths: [9], validate: isValidSsn },
];
|
|
29
|
+
|
|
30
|
+
/**
 * One maximal digit run together with the separators inside it, e.g.
 * `888 12 3456`. A run ends at the first character that is not a digit or a
 * single space/dot/dash sitting directly between two digits, so digits from
 * different fields ("...3456 and income is 50000") stay in separate runs.
 */
interface DigitRun {
  /** The run's digits with every separator removed. */
  readonly digits: string;
  /** Parallel to `digits`: `rawIndex[i]` is the raw-input offset of `digits[i]`. */
  readonly rawIndex: number[];
}

// A digit, then any number of further digits each optionally preceded by
// exactly one inline separator (space, dot, dash).
const DIGIT_RUN = /\d(?:[ .-]?\d)*/g;

/**
 * Split `raw` into its digit runs.
 *
 * @param raw The untouched input text.
 * @returns Every run in left-to-right order, each with its separator-free
 *   digits and the raw offset of each digit. Empty when `raw` has no digits.
 */
function extractRuns(raw: string): DigitRun[] {
  const found: DigitRun[] = [];

  // The pattern is a shared global regex; always scan from the start.
  DIGIT_RUN.lastIndex = 0;

  let hit: RegExpExecArray | null;
  while ((hit = DIGIT_RUN.exec(raw)) !== null) {
    const chunk = hit[0];
    const base = hit.index;

    let digits = "";
    const rawIndex: number[] = [];
    for (let pos = 0; pos < chunk.length; pos += 1) {
      const ch = chunk.charAt(pos);
      // Keep ASCII digits only; separators are skipped but still advance `pos`.
      if (ch >= "0" && ch <= "9") {
        digits += ch;
        rawIndex.push(base + pos);
      }
    }

    found.push({ digits, rawIndex });
  }

  return found;
}
|
|
62
|
+
|
|
63
|
+
/**
 * Find fixed-length, validator-backed entities among the digit runs of `raw`.
 *
 * A rule is only considered when the run's *total* digit count is one of the
 * rule's lengths, so a 9-digit SSN can never fire inside a longer account
 * number. The emitted span runs from the first to the last digit of the run
 * and therefore includes the interior separators (`888-12-3456` is covered
 * whole).
 *
 * @param raw The untouched input text.
 * @returns At most one heuristic span per run, scored 1.
 */
function detectDigitEntities(raw: string): Span[] {
  const spans: Span[] = [];

  for (const { digits, rawIndex } of extractRuns(raw)) {
    // Rules are ordered strict→loose; the first one that accepts the run wins.
    const winner = DIGIT_RULES.find(
      (rule) => rule.lengths.includes(digits.length) && rule.validate(digits),
    );
    if (winner === undefined) continue;

    const start = rawIndex[0];
    const end = rawIndex[rawIndex.length - 1] + 1;
    spans.push({
      start,
      end,
      label: winner.label,
      score: 1,
      source: "heuristic",
      text: raw.slice(start, end),
    });
  }

  return spans;
}
|
|
84
|
+
|
|
85
|
+
// Text-shaped entities: structure lives in the punctuation, so match raw.
|
|
86
|
+
// EMAIL and URL are structured text-shaped PII: a regex catches them at near-
|
|
87
|
+
// 100% recall, far better than the model (URL recall is ~5% model-alone), so
|
|
88
|
+
// the deterministic layer owns them and premasks them to sentinels before the
|
|
89
|
+
// model — exactly as it does for SSN/CC/IP. The model's vestigial EMAIL/URL
|
|
90
|
+
// head no longer needs to fire.
|
|
91
|
+
/** One raw-text pattern and the label its matches receive. */
interface TextRule {
  label: PiiLabel;
  pattern: RegExp;
  /** Optional capture group to report instead of the whole match. */
  group?: number;
}

const TEXT_RULES: readonly TextRule[] = [
  // Email: local part (dots and +tags allowed) @ dotted domain, so
  // plus-addressing and sub-domains match, e.g. `alex+housing@sub.example.gov`.
  { label: "EMAIL", pattern: /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/g },
  // URL with an explicit scheme: runs until whitespace, a closing bracket, or
  // a quote. Trailing sentence punctuation stays inside the span (harmless —
  // the host+path is the sensitive part and the whole run is redacted).
  { label: "URL", pattern: /\bhttps?:\/\/[^\s<>"'\])}]+/g },
  // Schemeless web URL: a `www.` host followed by the rest of the URL.
  { label: "URL", pattern: /\bwww\.[A-Za-z0-9.-]+\.[A-Za-z]{2,}(?:\/[^\s<>"'\])}]*)?/g },
  // IPv4: four dot-separated octets, each limited to 0-255.
  { label: "IP_ADDRESS", pattern: /\b(?:(?:25[0-5]|2[0-4]\d|1?\d?\d)\.){3}(?:25[0-5]|2[0-4]\d|1?\d?\d)\b/g },
  // IPv6: full and `::`-compressed forms. Every alternative needs either 8
  // colon-separated groups or a `::`, so it never fires on times ("12:34") or
  // MAC addresses (handled below); the lookarounds keep it off larger tokens.
  {
    label: "IP_ADDRESS",
    pattern:
      /(?<![:.\w])(?:(?:[0-9A-Fa-f]{1,4}:){7}[0-9A-Fa-f]{1,4}|(?:[0-9A-Fa-f]{1,4}:){1,7}:|(?:[0-9A-Fa-f]{1,4}:){1,6}:[0-9A-Fa-f]{1,4}|(?:[0-9A-Fa-f]{1,4}:){1,5}(?::[0-9A-Fa-f]{1,4}){1,2}|(?:[0-9A-Fa-f]{1,4}:){1,4}(?::[0-9A-Fa-f]{1,4}){1,3}|(?:[0-9A-Fa-f]{1,4}:){1,3}(?::[0-9A-Fa-f]{1,4}){1,4}|(?:[0-9A-Fa-f]{1,4}:){1,2}(?::[0-9A-Fa-f]{1,4}){1,5}|[0-9A-Fa-f]{1,4}:(?::[0-9A-Fa-f]{1,4}){1,6}|::(?:[0-9A-Fa-f]{1,4}:){0,6}[0-9A-Fa-f]{1,4})(?![:.\w])/g,
  },
  // MAC address: six hex pairs joined by ":" or "-".
  { label: "IP_ADDRESS", pattern: /\b(?:[0-9A-Fa-f]{2}[:-]){5}[0-9A-Fa-f]{2}\b/g },
];

/**
 * Run every text rule over `raw` and emit one heuristic span per match.
 *
 * Spans come out grouped by rule (in `TEXT_RULES` order), not sorted by
 * position, and may overlap.
 *
 * @param raw The untouched input text.
 * @returns Spans scored 1 whose `text` equals `raw.slice(start, end)` for
 *   whole-match rules.
 */
function detectTextEntities(raw: string): Span[] {
  const found: Span[] = [];

  for (const rule of TEXT_RULES) {
    const { label, pattern, group } = rule;

    // Patterns are shared global regexes; always scan from the start.
    pattern.lastIndex = 0;

    let hit: RegExpExecArray | null;
    while ((hit = pattern.exec(raw)) !== null) {
      const whole = hit[0];
      const text = group === undefined ? whole : hit[group];
      // NOTE(review): for grouped rules the offset is the first occurrence of
      // the group's text inside the match, which is only exact when that text
      // does not also appear earlier in the match. No current rule sets `group`.
      const shift = group === undefined ? 0 : whole.indexOf(text);
      const start = hit.index + shift;
      found.push({
        start,
        end: start + text.length,
        label,
        score: 1,
        source: "heuristic",
        text,
      });
    }
  }

  return found;
}
|
|
133
|
+
|
|
134
|
+
/**
 * Run every heuristic detector over the raw input.
 *
 * Digit-entity spans come first, followed by text-entity spans. Spans may
 * overlap; the pipeline's merge step resolves conflicts before redaction.
 */
export function detectHeuristics(raw: string): Span[] {
  const digitSpans = detectDigitEntities(raw);
  const textSpans = detectTextEntities(raw);
  return digitSpans.concat(textSpans);
}
|