@nationaldesignstudio/rampart 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +402 -0
- package/MODEL_CARD.md +422 -0
- package/README.md +279 -0
- package/RELEASE.md +97 -0
- package/WHITEPAPER.md +316 -0
- package/dist/index.d.ts +23 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +35639 -0
- package/dist/index.js.map +36 -0
- package/dist/src/guard.d.ts +94 -0
- package/dist/src/guard.d.ts.map +1 -0
- package/dist/src/heuristics.d.ts +14 -0
- package/dist/src/heuristics.d.ts.map +1 -0
- package/dist/src/ner/classifier.d.ts +92 -0
- package/dist/src/ner/classifier.d.ts.map +1 -0
- package/dist/src/ner/worker.d.ts +44 -0
- package/dist/src/ner/worker.d.ts.map +1 -0
- package/dist/src/ner/worker.js +35302 -0
- package/dist/src/ner/worker.js.map +30 -0
- package/dist/src/pipeline.d.ts +76 -0
- package/dist/src/pipeline.d.ts.map +1 -0
- package/dist/src/policy.d.ts +27 -0
- package/dist/src/policy.d.ts.map +1 -0
- package/dist/src/premask.d.ts +48 -0
- package/dist/src/premask.d.ts.map +1 -0
- package/dist/src/session.d.ts +60 -0
- package/dist/src/session.d.ts.map +1 -0
- package/dist/src/streaming.d.ts +32 -0
- package/dist/src/streaming.d.ts.map +1 -0
- package/dist/src/types.d.ts +43 -0
- package/dist/src/types.d.ts.map +1 -0
- package/dist/src/validators.d.ts +16 -0
- package/dist/src/validators.d.ts.map +1 -0
- package/eval/bench/README.md +91 -0
- package/eval/bench/fetch.ts +152 -0
- package/eval/bench/labels.ts +45 -0
- package/eval/bench/run.ts +146 -0
- package/eval/bench/runs/m06-v3-30k/by_language.json +303 -0
- package/eval/bench/runs/m06-v3-30k/summary.json +56 -0
- package/eval/bench/runs/sample-900/by_language.json +303 -0
- package/eval/bench/runs/sample-900/manifest.json +926 -0
- package/eval/bench/runs/sample-900/summary.json +56 -0
- package/eval/bench/score.ts +197 -0
- package/eval/bench/webgpu/entry.ts +70 -0
- package/eval/bench/webgpu/index.html +12 -0
- package/eval/bench/webgpu.ts +209 -0
- package/eval/public-cases.ts +412 -0
- package/eval/run-public-eval.ts +140 -0
- package/examples/basic-chat.ts +12 -0
- package/examples/pii-worker.ts +3 -0
- package/index.ts +47 -0
- package/package.json +103 -0
- package/src/guard.ts +170 -0
- package/src/heuristics.ts +141 -0
- package/src/ner/classifier.ts +580 -0
- package/src/ner/worker.ts +130 -0
- package/src/policy.ts +64 -0
- package/src/premask.ts +90 -0
- package/src/session.ts +99 -0
- package/src/streaming.ts +73 -0
- package/src/types.ts +74 -0
- package/src/validators.ts +40 -0
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* The PII filter pipeline: the single entry point a chat app wires in.
|
|
3
|
+
*
|
|
4
|
+
* user message
|
|
5
|
+
* → heuristics (sync, structured PII)
|
|
6
|
+
* → NER (optional, async, contextual PII)
|
|
7
|
+
* → merge + default-deny policy (keep {city, state, zip})
|
|
8
|
+
* → session table: replace with stable placeholders
|
|
9
|
+
* → send placeholdered text to the LLM
|
|
10
|
+
* assistant reply
|
|
11
|
+
* → inbound scrub (model can echo/emit PII too)
|
|
12
|
+
* → rehydrate placeholders → render to user
|
|
13
|
+
*
|
|
14
|
+
* Heuristics run synchronously so the structured PII (SSNs in any separator
|
|
15
|
+
* form, cards, phones, emails) is gone immediately even before the model loads.
|
|
16
|
+
* The NER detector is injected, so callers can run it on a worker, skip it in
|
|
17
|
+
* tests, or swap models without touching this orchestration.
|
|
18
|
+
*/
|
|
19
|
+
import { type PlaceholderAliases, type ScrubResult } from "./session";
|
|
20
|
+
import type { Span } from "./types";
|
|
21
|
+
/** An async contextual detector. Matches both the in-process and worker forms. */
|
|
22
|
+
export type NerDetector = (text: string) => Promise<Span[]>;
|
|
23
|
+
export interface FilterOptions {
|
|
24
|
+
/** Optional contextual detector. Omit to run heuristics-only. */
|
|
25
|
+
readonly ner?: NerDetector;
|
|
26
|
+
/** Display aliases for placeholder tokens, e.g. `{ GIVEN_NAME: "NAME" }`. */
|
|
27
|
+
readonly aliases?: PlaceholderAliases;
|
|
28
|
+
/**
|
|
29
|
+
* When `true`, skip the structured-PII premask before the model. Use this with
|
|
30
|
+
* a model trained without prefilter (no-prefilter ablation), whose classes
|
|
31
|
+
* include SSN / CREDIT_CARD / IP_ADDRESS — the model expects raw digits, not
|
|
32
|
+
* sentinels. Heuristic spans for those types are still emitted and merged, so
|
|
33
|
+
* they act as belt-and-suspenders for the model's predictions.
|
|
34
|
+
*/
|
|
35
|
+
readonly noPrefilter?: boolean;
|
|
36
|
+
}
|
|
37
|
+
/**
|
|
38
|
+
* Stateful filter bound to one conversation. Holds the session entity table so
|
|
39
|
+
* placeholders stay consistent across turns and rehydration can reverse them.
|
|
40
|
+
*/
|
|
41
|
+
export declare class PiiFilter {
|
|
42
|
+
private readonly options;
|
|
43
|
+
private readonly table;
|
|
44
|
+
constructor(options?: FilterOptions);
|
|
45
|
+
/**
|
|
46
|
+
* Detect all PII in a message: heuristics always, NER when configured.
|
|
47
|
+
*
|
|
48
|
+
* The heuristic layer (structured PII: SSN / CREDIT_CARD / IP_ADDRESS) runs
|
|
49
|
+
* first and its spans are substituted with neutral sentinels before the text
|
|
50
|
+
* reaches the model — the same masking the model was trained on, so it never
|
|
51
|
+
* sees raw structured digits. The model's spans, reported in masked
|
|
52
|
+
* coordinates, are projected back to raw offsets and merged with the
|
|
53
|
+
* (raw-offset) heuristic spans for redaction.
|
|
54
|
+
*/
|
|
55
|
+
private detect;
|
|
56
|
+
/**
|
|
57
|
+
* Scrub an outgoing user message. The returned `text` is what may be sent to
|
|
58
|
+
* the LLM and written to logs — it contains no PII outside the keep-set.
|
|
59
|
+
*/
|
|
60
|
+
scrubOutbound(text: string): Promise<ScrubResult>;
|
|
61
|
+
/**
|
|
62
|
+
* Restore real values in an assistant reply before showing it to the user.
|
|
63
|
+
* Run on every model response: the model reasons over placeholders and may
|
|
64
|
+
* repeat them, and we must not surface a bare `[GIVEN_NAME_1]`.
|
|
65
|
+
*/
|
|
66
|
+
rehydrateInbound(reply: string): string;
|
|
67
|
+
/**
|
|
68
|
+
* Defense in depth: scrub the *model's* output too. A model can emit PII the
|
|
69
|
+
* user never typed (hallucinated or inferred), so re-run detection on the
|
|
70
|
+
* reply and placeholder anything new before it is logged or persisted.
|
|
71
|
+
*/
|
|
72
|
+
scrubInbound(reply: string): Promise<ScrubResult>;
|
|
73
|
+
}
|
|
74
|
+
/** Stateless one-shot scrub for non-conversational callers (e.g. log sinks). */
|
|
75
|
+
export declare function scrubOnce(text: string, options?: FilterOptions): Promise<string>;
|
|
76
|
+
//# sourceMappingURL=pipeline.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"pipeline.d.ts","sourceRoot":"","sources":["../../src/pipeline.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;GAiBG;AAIH,OAAO,EAAsB,KAAK,kBAAkB,EAAE,KAAK,WAAW,EAAE,MAAM,WAAW,CAAC;AAC1F,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,SAAS,CAAC;AAEpC,kFAAkF;AAClF,MAAM,MAAM,WAAW,GAAG,CAAC,IAAI,EAAE,MAAM,KAAK,OAAO,CAAC,IAAI,EAAE,CAAC,CAAC;AAE5D,MAAM,WAAW,aAAa;IAC5B,iEAAiE;IACjE,QAAQ,CAAC,GAAG,CAAC,EAAE,WAAW,CAAC;IAC3B,6EAA6E;IAC7E,QAAQ,CAAC,OAAO,CAAC,EAAE,kBAAkB,CAAC;IACtC;;;;;;OAMG;IACH,QAAQ,CAAC,WAAW,CAAC,EAAE,OAAO,CAAC;CAChC;AAED;;;GAGG;AACH,qBAAa,SAAS;IAGR,OAAO,CAAC,QAAQ,CAAC,OAAO;IAFpC,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAqB;gBAEd,OAAO,GAAE,aAAkB;IAIxD;;;;;;;;;OASG;YACW,MAAM;IAoBpB;;;OAGG;IACG,aAAa,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,WAAW,CAAC;IAKvD;;;;OAIG;IACH,gBAAgB,CAAC,KAAK,EAAE,MAAM,GAAG,MAAM;IAIvC;;;;OAIG;IACG,YAAY,CAAC,KAAK,EAAE,MAAM,GAAG,OAAO,CAAC,WAAW,CAAC;CAIxD;AAED,gFAAgF;AAChF,wBAAsB,SAAS,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,GAAE,aAAkB,GAAG,OAAO,CAAC,MAAM,CAAC,CAG1F"}
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Span reconciliation + default-deny policy.
|
|
3
|
+
*
|
|
4
|
+
* Detectors (heuristic + NER) emit overlapping, possibly conflicting spans.
|
|
5
|
+
* {@link mergeSpans} resolves them into a non-overlapping set, then
|
|
6
|
+
* {@link applyPolicy} drops anything in the keep-set so only redactable spans
|
|
7
|
+
* remain. This is Option A: redact detected entities whose label is not kept.
|
|
8
|
+
*/
|
|
9
|
+
import { type PiiLabel, type Span } from "./types";
|
|
10
|
+
/**
|
|
11
|
+
* Reduce overlapping spans to a disjoint set. When two spans overlap, the
|
|
12
|
+
* higher-confidence one wins; ties break toward the longer span, then toward
|
|
13
|
+
* heuristics (validator-backed, so most trustworthy). Biased to *keep* a
|
|
14
|
+
* redaction rather than drop one — recall over precision, per the threat model.
|
|
15
|
+
*
|
|
16
|
+
* On partial overlap (neither span contains the other) the previous span
|
|
17
|
+
* covers bytes the winner does not. Taking only the winner exposes those
|
|
18
|
+
* bytes; we take the byte-union under the preferred label so no detected
|
|
19
|
+
* bytes are silently dropped. Full containment still collapses to the winner.
|
|
20
|
+
*/
|
|
21
|
+
export declare function mergeSpans(spans: readonly Span[]): Span[];
|
|
22
|
+
/**
|
|
23
|
+
* Apply the keep-set. Returns only spans that must be redacted, sorted right to
|
|
24
|
+
* left so callers can splice from the end and keep earlier offsets valid.
|
|
25
|
+
*/
|
|
26
|
+
export declare function applyPolicy(spans: readonly Span[], keepLabels?: ReadonlySet<PiiLabel>): Span[];
|
|
27
|
+
//# sourceMappingURL=policy.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"policy.d.ts","sourceRoot":"","sources":["../../src/policy.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,OAAO,EAA6B,KAAK,QAAQ,EAAE,KAAK,IAAI,EAAE,MAAM,SAAS,CAAC;AAE9E;;;;;;;;;;GAUG;AACH,wBAAgB,UAAU,CAAC,KAAK,EAAE,SAAS,IAAI,EAAE,GAAG,IAAI,EAAE,CAuBzD;AAUD;;;GAGG;AACH,wBAAgB,WAAW,CAAC,KAAK,EAAE,SAAS,IAAI,EAAE,EAAE,UAAU,GAAE,WAAW,CAAC,QAAQ,CAAe,GAAG,IAAI,EAAE,CAI3G"}
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pre-mask harness: substitute deterministically-detected structured PII with
|
|
3
|
+
* neutral sentinels *before* the contextual model runs.
|
|
4
|
+
*
|
|
5
|
+
* The heuristic layer (SSN / CREDIT_CARD / IP_ADDRESS / EMAIL / URL) is
|
|
6
|
+
* validator- or pattern-backed and runs synchronously. Rather than show the
|
|
7
|
+
* model raw card/SSN/IP digits or email/URL strings — which it never needs to
|
|
8
|
+
* classify and which only add noise — we replace each heuristic span with a
|
|
9
|
+
* fixed sentinel token (`[SSN]`, `[CREDIT_CARD]`, `[IP_ADDRESS]`, ...). The model
|
|
10
|
+
* is trained on text masked the exact same way, so the train-time and
|
|
11
|
+
* inference-time input distributions match by construction.
|
|
12
|
+
*
|
|
13
|
+
* Offsets: the masked string is paired with a per-character map back to the raw
|
|
14
|
+
* input, so a span the model reports in masked coordinates can be projected to
|
|
15
|
+
* exact raw offsets. Sentinel characters map onto their source span's raw range,
|
|
16
|
+
* so if the model happens to label part of a sentinel the projection lands
|
|
17
|
+
* inside the heuristic span (which then wins the merge on its score-1 weight).
|
|
18
|
+
*
|
|
19
|
+
* The heuristic spans themselves are returned with their original raw offsets,
|
|
20
|
+
* so final redaction and the session table are unaffected: only the text handed
|
|
21
|
+
* to the model is masked, never the text that gets placeholdered.
|
|
22
|
+
*/
|
|
23
|
+
import type { PiiLabel, Span } from "./types";
|
|
24
|
+
/** A masked copy of the input plus a map from each masked char to raw offsets. */
|
|
25
|
+
export interface PremaskResult {
|
|
26
|
+
/** Input with heuristic spans replaced by sentinel tokens. */
|
|
27
|
+
readonly masked: string;
|
|
28
|
+
/** `rawStart[i]` is the raw offset of the source of `masked[i]`. */
|
|
29
|
+
readonly rawStart: number[];
|
|
30
|
+
/** `rawEnd[i]` is the raw offset just past that source. */
|
|
31
|
+
readonly rawEnd: number[];
|
|
32
|
+
}
|
|
33
|
+
/** The sentinel substituted for a premasked label. Stable across train/serve. */
|
|
34
|
+
export declare function sentinelFor(label: PiiLabel): string;
|
|
35
|
+
/**
|
|
36
|
+
* Replace each (disjoint) heuristic span in `raw` with its label sentinel,
|
|
37
|
+
* recording per masked-character raw offsets. Spans are merged first so they are
|
|
38
|
+
* non-overlapping; verbatim runs between spans map 1:1 to raw, and sentinel
|
|
39
|
+
* characters map onto the whole span they stand in for.
|
|
40
|
+
*/
|
|
41
|
+
export declare function premask(raw: string, spans: readonly Span[]): PremaskResult;
|
|
42
|
+
/**
|
|
43
|
+
* Project a span reported in masked coordinates back onto the raw input. Returns
|
|
44
|
+
* `null` for an empty/degenerate span. The raw `text` is sliced from `raw` so
|
|
45
|
+
* callers always carry the real substring for rehydration.
|
|
46
|
+
*/
|
|
47
|
+
export declare function projectMaskedSpan(span: Span, raw: string, map: PremaskResult): Span | null;
|
|
48
|
+
//# sourceMappingURL=premask.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"premask.d.ts","sourceRoot":"","sources":["../../src/premask.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;GAqBG;AAGH,OAAO,KAAK,EAAE,QAAQ,EAAE,IAAI,EAAE,MAAM,SAAS,CAAC;AAE9C,kFAAkF;AAClF,MAAM,WAAW,aAAa;IAC5B,8DAA8D;IAC9D,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,oEAAoE;IACpE,QAAQ,CAAC,QAAQ,EAAE,MAAM,EAAE,CAAC;IAC5B,2DAA2D;IAC3D,QAAQ,CAAC,MAAM,EAAE,MAAM,EAAE,CAAC;CAC3B;AAED,iFAAiF;AACjF,wBAAgB,WAAW,CAAC,KAAK,EAAE,QAAQ,GAAG,MAAM,CAEnD;AAED;;;;;GAKG;AACH,wBAAgB,OAAO,CAAC,GAAG,EAAE,MAAM,EAAE,KAAK,EAAE,SAAS,IAAI,EAAE,GAAG,aAAa,CA6B1E;AAED;;;;GAIG;AACH,wBAAgB,iBAAiB,CAAC,IAAI,EAAE,IAAI,EAAE,GAAG,EAAE,MAAM,EAAE,GAAG,EAAE,aAAa,GAAG,IAAI,GAAG,IAAI,CAM1F"}
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Session entity table: reversible placeholders for coherent chat.
|
|
3
|
+
*
|
|
4
|
+
* Blanket `[REDACTED]` makes the assistant's replies nonsense. Instead each
|
|
5
|
+
* redacted value gets a stable, typed placeholder (`[GIVEN_NAME_1]`, `[SSN_2]`)
|
|
6
|
+
* that survives across turns: the same raw value always maps to the same token,
|
|
7
|
+
* so the model can reason about "GIVEN_NAME_1" and we {@link rehydrate} the real
|
|
8
|
+
* value back into its reply before display.
|
|
9
|
+
*
|
|
10
|
+
* The map lives only on the client. What leaves the device is placeholdered
|
|
11
|
+
* text; the table never crosses the wire.
|
|
12
|
+
*/
|
|
13
|
+
import { type PiiLabel, type Span } from "./types";
|
|
14
|
+
/** Result of scrubbing one message. */
|
|
15
|
+
export interface ScrubResult {
|
|
16
|
+
/** Text with PII replaced by placeholders. Safe to send/log. */
|
|
17
|
+
readonly text: string;
|
|
18
|
+
/** Placeholders introduced or reused in this message. */
|
|
19
|
+
readonly placeholders: readonly string[];
|
|
20
|
+
}
|
|
21
|
+
/**
|
|
22
|
+
* Optional display aliases for placeholder tokens. By default a GIVEN_NAME span
|
|
23
|
+
* becomes `[GIVEN_NAME_1]`; pass `{ GIVEN_NAME: "NAME" }` to get `[NAME_1]` instead.
|
|
24
|
+
* Only the visible token changes — detection and policy are unaffected.
|
|
25
|
+
*/
|
|
26
|
+
export type PlaceholderAliases = Partial<Record<PiiLabel, string>>;
|
|
27
|
+
/** Token shape minted by the table; also the regex used to find them on the
|
|
28
|
+
* way back. Kept in one place so scrub and rehydrate can never disagree. */
|
|
29
|
+
export declare const PLACEHOLDER_PATTERN: RegExp;
|
|
30
|
+
/**
|
|
31
|
+
* A per-conversation store mapping raw PII values to stable placeholders.
|
|
32
|
+
* Keyed by `label + normalized value` so "John" stays `GIVEN_NAME_1` on every turn
|
|
33
|
+
* and casing/whitespace noise doesn't mint duplicate tokens.
|
|
34
|
+
*/
|
|
35
|
+
export declare class SessionEntityTable {
|
|
36
|
+
private readonly aliases;
|
|
37
|
+
private readonly keepLabels;
|
|
38
|
+
private readonly forward;
|
|
39
|
+
private readonly reverse;
|
|
40
|
+
private readonly counters;
|
|
41
|
+
constructor(aliases?: PlaceholderAliases, keepLabels?: ReadonlySet<PiiLabel>);
|
|
42
|
+
/** The visible name used in tokens for a label (alias or the label itself). */
|
|
43
|
+
private displayName;
|
|
44
|
+
/** Get or mint the placeholder for a given label+value. Idempotent. */
|
|
45
|
+
placeholderFor(label: PiiLabel, value: string): string;
|
|
46
|
+
/**
|
|
47
|
+
* Replace each redactable span with its placeholder. Spans are pre-sorted
|
|
48
|
+
* right-to-left by {@link applyPolicy}, so splicing never invalidates an
|
|
49
|
+
* earlier offset.
|
|
50
|
+
*/
|
|
51
|
+
scrub(raw: string, spans: readonly Span[]): ScrubResult;
|
|
52
|
+
/**
|
|
53
|
+
* Restore real values in an assistant reply. Used on the *inbound* response
|
|
54
|
+
* so the user sees "John", not "[NAME_1]". Unknown tokens are left intact.
|
|
55
|
+
*/
|
|
56
|
+
rehydrate(text: string): string;
|
|
57
|
+
/** True if `token` is a placeholder this table can resolve. */
|
|
58
|
+
knows(token: string): boolean;
|
|
59
|
+
}
|
|
60
|
+
//# sourceMappingURL=session.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"session.d.ts","sourceRoot":"","sources":["../../src/session.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAGH,OAAO,EAAe,KAAK,QAAQ,EAAE,KAAK,IAAI,EAAE,MAAM,SAAS,CAAC;AAEhE,uCAAuC;AACvC,MAAM,WAAW,WAAW;IAC1B,gEAAgE;IAChE,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,yDAAyD;IACzD,QAAQ,CAAC,YAAY,EAAE,SAAS,MAAM,EAAE,CAAC;CAC1C;AAED;;;;GAIG;AACH,MAAM,MAAM,kBAAkB,GAAG,OAAO,CAAC,MAAM,CAAC,QAAQ,EAAE,MAAM,CAAC,CAAC,CAAC;AAEnE;4EAC4E;AAC5E,eAAO,MAAM,mBAAmB,QAA0B,CAAC;AAE3D;;;;GAIG;AACH,qBAAa,kBAAkB;IAM3B,OAAO,CAAC,QAAQ,CAAC,OAAO;IACxB,OAAO,CAAC,QAAQ,CAAC,UAAU;IAN7B,OAAO,CAAC,QAAQ,CAAC,OAAO,CAA6B;IACrD,OAAO,CAAC,QAAQ,CAAC,OAAO,CAA6B;IACrD,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAA6B;gBAGnC,OAAO,GAAE,kBAAuB,EAChC,UAAU,GAAE,WAAW,CAAC,QAAQ,CAAe;IAGlE,+EAA+E;IAC/E,OAAO,CAAC,WAAW;IAInB,uEAAuE;IACvE,cAAc,CAAC,KAAK,EAAE,QAAQ,EAAE,KAAK,EAAE,MAAM,GAAG,MAAM;IAatD;;;;OAIG;IACH,KAAK,CAAC,GAAG,EAAE,MAAM,EAAE,KAAK,EAAE,SAAS,IAAI,EAAE,GAAG,WAAW;IAYvD;;;OAGG;IACH,SAAS,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM;IAI/B,+DAA+D;IAC/D,KAAK,CAAC,KAAK,EAAE,MAAM,GAAG,OAAO;CAG9B"}
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Streaming placeholder reveal.
|
|
3
|
+
*
|
|
4
|
+
* The assistant streams tokens, so a placeholder can be split across chunks:
|
|
5
|
+
* chunk A: "...thanks [NA"
|
|
6
|
+
* chunk B: "ME_1] for..."
|
|
7
|
+
* A per-chunk regex replace would emit the broken `[NA` and never match. This
|
|
8
|
+
* buffers the smallest suffix that could still become a placeholder, emits
|
|
9
|
+
* everything safe before it, and flushes the remainder at stream end.
|
|
10
|
+
*/
|
|
11
|
+
/** Resolves a placeholder token to its real value, or null to leave it as-is. */
|
|
12
|
+
export type PlaceholderResolver = (token: string) => string | null;
|
|
13
|
+
/**
|
|
14
|
+
* Stateful reveal for a token stream. Feed chunks in order; `push` returns the
|
|
15
|
+
* text safe to render now, `flush` returns whatever was held at the end.
|
|
16
|
+
*/
|
|
17
|
+
export declare class StreamingReveal {
|
|
18
|
+
private readonly resolve;
|
|
19
|
+
private buffer;
|
|
20
|
+
constructor(resolve: PlaceholderResolver);
|
|
21
|
+
/** Reveal complete placeholders in `chunk`, holding any partial tail. */
|
|
22
|
+
push(chunk: string): string;
|
|
23
|
+
/** Emit any buffered tail (e.g. a lone `[` that never became a token). */
|
|
24
|
+
flush(): string;
|
|
25
|
+
private replaceComplete;
|
|
26
|
+
}
|
|
27
|
+
/**
|
|
28
|
+
* A Web Streams transform that reveals placeholders in a string stream — drop
|
|
29
|
+
* it into an AI SDK text stream pipeline so the user never sees `[NAME_1]`.
|
|
30
|
+
*/
|
|
31
|
+
export declare function createRevealTransform(resolve: PlaceholderResolver): TransformStream<string, string>;
|
|
32
|
+
//# sourceMappingURL=streaming.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"streaming.d.ts","sourceRoot":"","sources":["../../src/streaming.ts"],"names":[],"mappings":"AAAA;;;;;;;;;GASG;AAIH,iFAAiF;AACjF,MAAM,MAAM,mBAAmB,GAAG,CAAC,KAAK,EAAE,MAAM,KAAK,MAAM,GAAG,IAAI,CAAC;AAMnE;;;GAGG;AACH,qBAAa,eAAe;IAGd,OAAO,CAAC,QAAQ,CAAC,OAAO;IAFpC,OAAO,CAAC,MAAM,CAAM;gBAES,OAAO,EAAE,mBAAmB;IAEzD,yEAAyE;IACzE,IAAI,CAAC,KAAK,EAAE,MAAM,GAAG,MAAM;IAc3B,0EAA0E;IAC1E,KAAK,IAAI,MAAM;IAMf,OAAO,CAAC,eAAe;CAGxB;AAED;;;GAGG;AACH,wBAAgB,qBAAqB,CAAC,OAAO,EAAE,mBAAmB,GAAG,eAAe,CAAC,MAAM,EAAE,MAAM,CAAC,CAYnG"}
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared vocabulary for the PII filter.
|
|
3
|
+
*
|
|
4
|
+
* The whole system is *default-deny*: every detected entity is redacted unless
|
|
5
|
+
* its label is in {@link KEEP_LABELS} (currently {CITY, STATE, ZIP_CODE}). An
|
|
6
|
+
* unrecognized span is dropped, never leaked.
|
|
7
|
+
*
|
|
8
|
+
* Two layers emit labels:
|
|
9
|
+
* - Heuristics (synchronous, validator-backed) detect and premask the
|
|
10
|
+
* structured numeric identifiers below before the model ever runs.
|
|
11
|
+
* - The NER model learns the contextual "fine set": split names, contact and
|
|
12
|
+
* document identifiers, and address components.
|
|
13
|
+
*/
|
|
14
|
+
/** Every entity class the heuristics and the NER model can emit. */
|
|
15
|
+
export type PiiLabel = "SSN" | "CREDIT_CARD" | "IP_ADDRESS" | "GIVEN_NAME" | "SURNAME" | "EMAIL" | "PHONE" | "URL" | "TAX_ID" | "BANK_ACCOUNT" | "ROUTING_NUMBER" | "GOVERNMENT_ID" | "PASSPORT" | "DRIVERS_LICENSE" | "BUILDING_NUMBER" | "STREET_NAME" | "SECONDARY_ADDRESS" | "CITY" | "STATE" | "ZIP_CODE";
|
|
16
|
+
/**
|
|
17
|
+
* Labels that are intentionally preserved (classified but never redacted).
|
|
18
|
+
* The model still learns these; the JS policy layer simply keeps them in the
|
|
19
|
+
* text. Public-benefits assistants need broad geography (city / state / ZIP)
|
|
20
|
+
* for area-median-income (AMI) eligibility, so those are kept while the precise
|
|
21
|
+
* street line (BUILDING_NUMBER + STREET_NAME) is still redacted.
|
|
22
|
+
*/
|
|
23
|
+
export declare const KEEP_LABELS: ReadonlySet<PiiLabel>;
|
|
24
|
+
/** A detected entity span over the *original* (raw) text. */
|
|
25
|
+
export interface Span {
|
|
26
|
+
/** Inclusive start offset into the raw input. */
|
|
27
|
+
readonly start: number;
|
|
28
|
+
/** Exclusive end offset into the raw input. */
|
|
29
|
+
readonly end: number;
|
|
30
|
+
/** The classified entity type. */
|
|
31
|
+
readonly label: PiiLabel;
|
|
32
|
+
/** Detector confidence in [0, 1]. Heuristics with validators report 1. */
|
|
33
|
+
readonly score: number;
|
|
34
|
+
/** Which layer produced the span; used for merge tie-breaks and audit. */
|
|
35
|
+
readonly source: "heuristic" | "ner";
|
|
36
|
+
/** The raw substring covered, retained for placeholder rehydration. */
|
|
37
|
+
readonly text: string;
|
|
38
|
+
}
|
|
39
|
+
/** Resolve a caller keep-set; omitted → {@link KEEP_LABELS}. */
|
|
40
|
+
export declare function resolveKeepLabels(keepLabels?: readonly PiiLabel[]): ReadonlySet<PiiLabel>;
|
|
41
|
+
/** True when a label must be redacted under the default-deny policy. */
|
|
42
|
+
export declare function shouldRedact(label: PiiLabel, keepLabels?: ReadonlySet<PiiLabel>): boolean;
|
|
43
|
+
//# sourceMappingURL=types.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/types.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;GAYG;AAEH,oEAAoE;AACpE,MAAM,MAAM,QAAQ,GAEhB,KAAK,GACL,aAAa,GACb,YAAY,GAEZ,YAAY,GACZ,SAAS,GACT,OAAO,GACP,OAAO,GACP,KAAK,GACL,QAAQ,GACR,cAAc,GACd,gBAAgB,GAChB,eAAe,GACf,UAAU,GACV,iBAAiB,GAEjB,iBAAiB,GACjB,aAAa,GACb,mBAAmB,GACnB,MAAM,GACN,OAAO,GACP,UAAU,CAAC;AAEf;;;;;;GAMG;AACH,eAAO,MAAM,WAAW,EAAE,WAAW,CAAC,QAAQ,CAAoD,CAAC;AAEnG,6DAA6D;AAC7D,MAAM,WAAW,IAAI;IACnB,iDAAiD;IACjD,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;IACvB,+CAA+C;IAC/C,QAAQ,CAAC,GAAG,EAAE,MAAM,CAAC;IACrB,kCAAkC;IAClC,QAAQ,CAAC,KAAK,EAAE,QAAQ,CAAC;IACzB,0EAA0E;IAC1E,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;IACvB,0EAA0E;IAC1E,QAAQ,CAAC,MAAM,EAAE,WAAW,GAAG,KAAK,CAAC;IACrC,uEAAuE;IACvE,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;CACvB;AAED,gEAAgE;AAChE,wBAAgB,iBAAiB,CAAC,UAAU,CAAC,EAAE,SAAS,QAAQ,EAAE,GAAG,WAAW,CAAC,QAAQ,CAAC,CAEzF;AAED,wEAAwE;AACxE,wBAAgB,YAAY,CAAC,KAAK,EAAE,QAAQ,EAAE,UAAU,GAAE,WAAW,CAAC,QAAQ,CAAe,GAAG,OAAO,CAEtG"}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Validators that turn "a run of digits" into "a *valid* SSN / card / phone".
|
|
3
|
+
*
|
|
4
|
+
* These exist to suppress false positives so the filter doesn't mangle numbers
|
|
5
|
+
* the assistant is allowed to keep (income figures, ages, years). A detector
|
|
6
|
+
* proposes a span; a validator decides whether it is really that entity.
|
|
7
|
+
*/
|
|
8
|
+
/** Luhn checksum — gates CREDIT_CARD so arbitrary 16-digit runs don't match. */
|
|
9
|
+
export declare function isLuhnValid(digits: string): boolean;
|
|
10
|
+
/**
|
|
11
|
+
* US SSN structural rules. Area (first 3) cannot be 000, 666, or 900-999;
|
|
12
|
+
* group (middle 2) cannot be 00; serial (last 4) cannot be 0000. Rejects
|
|
13
|
+
* obvious non-SSNs like phone numbers padded to nine digits.
|
|
14
|
+
*/
|
|
15
|
+
export declare function isValidSsn(digits: string): boolean;
|
|
16
|
+
//# sourceMappingURL=validators.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"validators.d.ts","sourceRoot":"","sources":["../../src/validators.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAEH,gFAAgF;AAChF,wBAAgB,WAAW,CAAC,MAAM,EAAE,MAAM,GAAG,OAAO,CAanD;AAED;;;;GAIG;AACH,wBAAgB,UAAU,CAAC,MAAM,EAAE,MAAM,GAAG,OAAO,CAUlD"}
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
# Native benchmark
|
|
2
|
+
|
|
3
|
+
The benchmark runs the **shipped TypeScript pipeline** — the exact
|
|
4
|
+
`@nationaldesignstudio/rampart` code that consumers run — over a frozen
|
|
5
|
+
held-out slice of [`ai4privacy/pii-masking-openpii-1.5m`](https://huggingface.co/datasets/ai4privacy/pii-masking-openpii-1.5m)
|
|
6
|
+
and scores it. The predictions come straight from `src/`, scored by
|
|
7
|
+
`eval/bench/score.ts`, so the benchmark measures the artifact it describes.
|
|
8
|
+
|
|
9
|
+
## Layout
|
|
10
|
+
|
|
11
|
+
| File | Role |
|
|
12
|
+
| --- | --- |
|
|
13
|
+
| `labels.ts` | OpenPII gold label → Rampart policy projection (default-deny, keep-set wins). |
|
|
14
|
+
| `score.ts` | Term-presence recall/retention (Wilson + bootstrap CI), span F1, ECE, latency. |
|
|
15
|
+
| `fetch.ts` | Pull the frozen held-out rows; writes the **committed manifest** (`heldout.manifest.json`) and the gitignored data. |
|
|
16
|
+
| `run.ts` | Run the shipped detectors + policy over the rows and emit `runs/<name>/summary.json` + `by_language.json`. |
|
|
17
|
+
|
|
18
|
+
## Reproduce
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
# 1. Materialise the held-out rows named in heldout.manifest.json.
|
|
22
|
+
# OpenPII is public (CC BY 4.0); HF_TOKEN is optional (higher rate limits).
|
|
23
|
+
bun run bench:fetch --n 30000 --seed 0
|
|
24
|
+
|
|
25
|
+
# 2. Run the bench (loads nationaldesignstudio/rampart q4 from Hugging Face).
|
|
26
|
+
bun run bench --out eval/bench/runs/native
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
`fetch.ts` selects rows by a seeded shuffle over the validation split and pins
|
|
30
|
+
the exact row `uid`s in `heldout.manifest.json`, so the held-out set is frozen
|
|
31
|
+
and any upstream dataset drift is caught on the next fetch. The row data itself
|
|
32
|
+
is regenerable and gitignored; only the manifest and the `summary.json` are
|
|
33
|
+
committed, so every published number traces to committed evidence.
|
|
34
|
+
|
|
35
|
+
## Committed sample
|
|
36
|
+
|
|
37
|
+
`runs/sample-900/` is a committed proof run over a 900-row held-out slice (all
|
|
38
|
+
seven languages, `manifest.json` pins the `uid`s). It shows the shipped runtime
|
|
39
|
+
at **99.70%** private-term recall on that small slice — a smoke test that the
|
|
40
|
+
benchmark measures the same artifact it describes. The published headline
|
|
41
|
+
(**98.42%**) comes from the full 30k run committed in `runs/m06-v3-30k/`; scale
|
|
42
|
+
the manifest to the full 30k with `bun run bench:fetch --n 30000` to regenerate it.
|
|
43
|
+
|
|
44
|
+
## Metrics
|
|
45
|
+
|
|
46
|
+
Identical definitions to the figures in `MODEL_CARD.md` / `WHITEPAPER.md`:
|
|
47
|
+
private-term recall and public-term retention (term-presence, Wilson 95% CI),
|
|
48
|
+
span-F1 at IoU ∈ {1.0, 0.5, 0.0}, ECE, and Node-ONNX latency percentiles —
|
|
49
|
+
now measured end-to-end through the form factor that ships.
|
|
50
|
+
|
|
51
|
+
## WebGPU latency (`webgpu.ts`)
|
|
52
|
+
|
|
53
|
+
`run.ts` measures ONNX-CPU latency under Node. `webgpu.ts` measures the form
|
|
54
|
+
factor that actually ships: the same shipped detection path (heuristics →
|
|
55
|
+
premask → NER → policy), bundled for the browser and run inside a real Chromium
|
|
56
|
+
tab with the NER model on the **WebGPU** backend.
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
bun run bench:webgpu # WebGPU, q4 (writes runs/webgpu-q4/latency.json)
|
|
60
|
+
bun run bench:webgpu:wasm # WASM backend baseline for contrast
|
|
61
|
+
bun eval/bench/webgpu.ts --headed --iters 400
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
By default it times one inference per row over the frozen held-out slice
|
|
65
|
+
(`eval/bench/data/heldout.jsonl`, materialised by `bench:fetch`), so browser
|
|
66
|
+
latency is measured over the same OpenPII rows as the Node bench. When that file
|
|
67
|
+
is absent it falls back to the committed `eval/public-cases.ts` chat strings;
|
|
68
|
+
override with `--data <path>`.
|
|
69
|
+
|
|
70
|
+
It serves the repo over `http://localhost` (a WebGPU secure context — `about:blank`
|
|
71
|
+
will not expose `navigator.gpu`) and drives **Playwright's bundled Chromium** — no
|
|
72
|
+
system Chrome required. The launch strips Playwright's GPU-disabling default args
|
|
73
|
+
(notably `--use-gl=swiftshader`) and forces ANGLE/Metal, so the headless browser
|
|
74
|
+
reaches the real GPU (`apple/metal-3`) instead of the SwiftShader software adapter
|
|
75
|
+
(which is ~400× slower and meaningless for latency). Pass `--chrome /path/to/chrome`
|
|
76
|
+
to use a system Chrome instead.
|
|
77
|
+
|
|
78
|
+
Latency is hardware-dependent and machine-specific, so latency runs are **not
|
|
79
|
+
committed** (`runs/*/latency.json` is gitignored — regenerate locally). For
|
|
80
|
+
reference, an 800-row held-out slice, q4, on Apple Metal:
|
|
81
|
+
|
|
82
|
+
| Backend | p50 | p95 | p99 | mean |
|
|
83
|
+
| --- | --: | --: | --: | --: |
|
|
84
|
+
| WebGPU | 3.9 ms | 9.3 ms | 13.4 ms | 4.8 ms |
|
|
85
|
+
| WASM | 12.6 ms | 35.5 ms | 53.7 ms | 15.8 ms |
|
|
86
|
+
|
|
87
|
+
The Node ONNX (CPU) bench is 6.6 ms p50 over the full 30k slice — so WebGPU beats
|
|
88
|
+
Node CPU and WASM is the no-GPU floor. Reproduce with `bun run bench:webgpu` on
|
|
89
|
+
your own hardware.
|
|
90
|
+
|
|
91
|
+
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Fetch a frozen, reproducible held-out sample from the OpenPII validation split.
|
|
3
|
+
*
|
|
4
|
+
* The row *data* is a regenerable local artifact (gitignored); the committed
|
|
5
|
+
* output is the manifest — dataset revision, seed, and the exact row `uid`s — so
|
|
6
|
+
* the held-out set is pinned and any drift in the upstream dataset is detected
|
|
7
|
+
* on the next fetch. Rows are restricted to the seven supported Latin-script
|
|
8
|
+
* languages.
|
|
9
|
+
*
|
|
10
|
+
* HF_TOKEN=... bun eval/bench/fetch.ts --n 5000 --seed 0
|
|
11
|
+
*
|
|
12
|
+
* The OpenPII dataset is public (CC BY 4.0), so HF_TOKEN is optional — it is
|
|
13
|
+
* used when present (higher rate limits) and the anonymous datasets-server
|
|
14
|
+
* endpoint is used otherwise.
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
import { appendFile, mkdir, writeFile } from "node:fs/promises";
|
|
18
|
+
import { dirname } from "node:path";
|
|
19
|
+
|
|
20
|
+
/** Hugging Face dataset id the held-out sample is drawn from. */
const DATASET = "ai4privacy/pii-masking-openpii-1.5m";

/** Dataset split that is sampled. */
const SPLIT = "validation";

/** Language codes a row must carry to be kept in the sample. */
const LATIN = new Set<string>([
  "en",
  "es",
  "fr",
  "de",
  "it",
  "pt",
  "nl",
]);

/** Number of rows requested per datasets-server page. */
const PAGE = 100;
|
|
24
|
+
|
|
25
|
+
/** The subset of an upstream dataset row that is written to the held-out JSONL file. */
interface Row {
  /** Row identifier; used to de-duplicate rows and listed in the manifest. */
  uid: number;
  /** Language code; rows are kept only when it is in the supported set. */
  language: string;
  /** Row text, copied through unchanged. */
  source_text: string;
  /** Labelled spans for the row, copied through unchanged. */
  privacy_mask: Array<{
    label: string;
    start: number;
    end: number;
    value: string;
  }>;
}
|
|
31
|
+
|
|
32
|
+
/**
 * Read a command-line option from `Bun.argv`.
 *
 * Both `--name=value` and `--name value` are accepted; the `=` form wins when
 * both are present.
 *
 * @param name - Option name without the leading dashes.
 * @param fallback - Value returned when the option is absent, is the last
 *   argument, or is followed by another `--flag` instead of a value.
 * @returns The option value, or `fallback`.
 */
function arg(name: string, fallback: string): string {
  const flag = `--${name}`;
  const prefix = `${flag}=`;
  const argv: readonly string[] = Bun.argv;

  // `--name=value`: strip the prefix rather than testing for a leading `--`,
  // which the whole token always has.
  const inline = argv.find((a) => a.startsWith(prefix));
  if (inline !== undefined) return inline.slice(prefix.length);

  // `--name value`: an absent flag must yield the fallback, not argv[0]
  // (indexOf returns -1, and -1 + 1 would index the executable path).
  const at = argv.indexOf(flag);
  if (at === -1) return fallback;

  const next = argv[at + 1];
  return next === undefined || next.startsWith("--") ? fallback : next;
}
|
|
36
|
+
|
|
37
|
+
/**
 * Build a seeded mulberry32 pseudo-random generator.
 *
 * @param seed - Seed, coerced to an unsigned 32-bit integer.
 * @returns A function yielding the next value in [0, 1) on each call; the
 *   sequence is fully determined by `seed`.
 */
function mulberry32(seed: number): () => number {
  let state = seed >>> 0;
  return function next(): number {
    // Advance the 32-bit state by the fixed increment.
    state = (state + 0x6d2b79f5) | 0;
    const mixed = Math.imul(state ^ (state >>> 15), state | 1);
    const scrambled = mixed ^ (mixed + Math.imul(mixed ^ (mixed >>> 7), mixed | 61));
    // Scale the unsigned 32-bit result into [0, 1).
    return ((scrambled ^ (scrambled >>> 14)) >>> 0) / 4294967296;
  };
}
|
|
46
|
+
|
|
47
|
+
/** Resolve after `ms` milliseconds. */
const sleep = (ms: number): Promise<void> =>
  new Promise<void>((resolve) => {
    setTimeout(resolve, ms);
  });
|
|
48
|
+
|
|
49
|
+
/**
 * Write `contents` to `path`, making up to four attempts in total.
 *
 * Any write error triggers a retry (the original note cites occasional
 * ETXTBSY/EBUSY from the sandbox). The waits between attempts are 500 ms,
 * 1000 ms and 1500 ms; the error from the fourth attempt is rethrown.
 */
async function safeWrite(path: string, contents: string): Promise<void> {
  const maxAttempts = 4;
  let failures = 0;
  for (;;) {
    try {
      await writeFile(path, contents);
      return;
    } catch (err) {
      failures += 1;
      if (failures >= maxAttempts) throw err;
      await sleep(500 * failures);
    }
  }
}
|
|
61
|
+
|
|
62
|
+
/**
 * GET `url` and parse the response body as JSON, retrying on failure.
 *
 * Up to eight attempts are made. A 429 waits for the larger of the
 * `Retry-After` header (seconds) and `2000 * 2^attempt` ms; any other non-OK
 * status, network error or JSON parse error is logged to stderr and waits
 * `1000 * 2^attempt` ms. No wait happens after the final attempt.
 *
 * @param url - Fully-formed request URL.
 * @param token - Optional bearer token sent in the `Authorization` header.
 * @returns The parsed JSON body of the first successful response.
 * @throws Error when every attempt fails; the message carries the last
 *   HTTP status or exception seen.
 */
async function getJson(url: string, token: string | undefined): Promise<any> {
  const maxAttempts = 8;
  const headers: Record<string, string> = token ? { Authorization: `Bearer ${token}` } : {};
  let lastFailure = "no attempt made";

  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    let delayMs = 1000 * 2 ** attempt;
    try {
      const res = await fetch(url, { headers });
      if (res.ok) return await res.json();
      lastFailure = `HTTP ${res.status}`;
      if (res.status === 429) {
        // A non-numeric Retry-After (e.g. an HTTP date) parses to NaN and is treated as 0.
        const retryAfter = Number(res.headers.get("retry-after")) || 0;
        delayMs = Math.max(retryAfter * 1000, 2000 * 2 ** attempt);
      } else {
        process.stderr.write(`\n[fetch ${res.status}] ${url}\n`);
      }
    } catch (e) {
      lastFailure = String(e);
      process.stderr.write(`\n[fetch error] ${String(e)}\n`);
    }
    // Back off only when another attempt follows; waiting before the final
    // throw would just delay the failure by minutes.
    if (attempt < maxAttempts - 1) await sleep(delayMs);
  }
  throw new Error(`failed to fetch ${url} (${lastFailure})`);
}
|
|
81
|
+
|
|
82
|
+
/**
 * Collect a seeded sample of supported-language rows from the dataset split.
 *
 * Reads `--n`, `--seed`, `--out` and `--manifest` (plus the optional
 * `HF_TOKEN` environment variable), walks the split's pages in a seeded
 * shuffled order, appends each kept row to the JSONL data file as it is
 * collected, and finally writes a manifest listing the collected `uid`s.
 *
 * @throws Error when `--n` or `--seed` is not numeric, when the split is
 *   missing from the size response, or when a request ultimately fails.
 */
async function main(): Promise<void> {
  const token = process.env.HF_TOKEN; // optional: public dataset, used for higher rate limits
  const n = Number(arg("n", "5000"));
  const seed = Number(arg("seed", "0"));
  const dataPath = arg("out", "eval/bench/data/heldout.jsonl");
  const manifestPath = arg("manifest", "eval/bench/heldout.manifest.json");

  // A NaN `n` makes every `selected.length >= n` check false, so the whole
  // split would be downloaded; a NaN seed is coerced to 0 by the generator
  // and serialised as null in the manifest. Reject both up front.
  if (Number.isNaN(n)) throw new Error("--n must be a number");
  if (Number.isNaN(seed)) throw new Error("--seed must be a number");

  const info = await getJson(`https://datasets-server.huggingface.co/size?dataset=${DATASET}`, token);
  const splitInfo = info?.size?.splits?.find((s: any) => s.split === SPLIT);
  if (typeof splitInfo?.num_rows !== "number") {
    throw new Error(`split "${SPLIT}" not found in size info for ${DATASET}`);
  }
  const total: number = splitInfo.num_rows;

  // Deterministic, shuffled offsets so the sample spans the split rather than
  // one contiguous (language-clustered) region.
  const rng = mulberry32(seed);
  const offsets = Array.from({ length: Math.ceil(total / PAGE) }, (_, i) => i * PAGE);
  for (let i = offsets.length - 1; i > 0; i--) {
    const j = Math.floor(rng() * (i + 1));
    [offsets[i], offsets[j]] = [offsets[j], offsets[i]];
  }

  await mkdir(dirname(dataPath), { recursive: true });
  await safeWrite(dataPath, ""); // truncate; rows are streamed in as collected

  const selected: Row[] = [];
  const seen = new Set<number>();
  for (const offset of offsets) {
    if (selected.length >= n) break;
    const page = await getJson(
      `https://datasets-server.huggingface.co/rows?dataset=${DATASET}&config=default&split=${SPLIT}&offset=${offset}&length=${PAGE}`,
      token,
    );
    let batch = "";
    for (const item of page.rows ?? []) {
      const r = item.row;
      if (!LATIN.has(r.language) || seen.has(r.uid) || selected.length >= n) continue;
      seen.add(r.uid);
      const row: Row = { uid: r.uid, language: r.language, source_text: r.source_text, privacy_mask: r.privacy_mask };
      selected.push(row);
      batch += JSON.stringify(row) + "\n";
    }
    if (batch) await appendFile(dataPath, batch); // persist incrementally
    process.stderr.write(`\rcollected ${selected.length}/${n}`);
    // Throttle to stay under the datasets-server rate limit, but only when
    // another page will actually be requested.
    if (selected.length < n) await sleep(700);
  }
  process.stderr.write("\n");
  if (selected.length < n) {
    process.stderr.write(`warning: only ${selected.length} of ${n} requested rows were available\n`);
  }

  const byLang: Record<string, number> = {};
  for (const r of selected) byLang[r.language] = (byLang[r.language] ?? 0) + 1;

  const manifest = {
    dataset: DATASET,
    split: SPLIT,
    // NOTE(review): this stores whatever the size endpoint returns under
    // `size.dataset`; confirm that value really identifies a dataset revision.
    revision: info.size?.dataset ?? null,
    seed,
    languages: [...LATIN],
    rows: selected.length,
    by_language: byLang,
    uids: selected.map((r) => r.uid),
  };
  const manifestJson = JSON.stringify(manifest, null, 2) + "\n";
  try {
    await safeWrite(manifestPath, manifestJson);
    console.log(`wrote ${selected.length} rows -> ${dataPath}\nmanifest -> ${manifestPath}\nby language: ${JSON.stringify(byLang)}`);
  } catch (e) {
    // Sandbox teardown can ETXTBSY the final write; emit the manifest so it can
    // be captured from stdout. The streamed data file is already complete.
    process.stderr.write(`\n[manifest write failed] ${String(e)}\n`);
    console.log(`MANIFEST_BEGIN\n${manifestJson}MANIFEST_END`);
  }
}
|
|
151
|
+
|
|
152
|
+
await main();
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* OpenPII (`ai4privacy/pii-masking-openpii-1.5m`) gold-label projection onto the
|
|
3
|
+
* Rampart policy: default-deny, keep-set wins, and any unknown label fails safe
|
|
4
|
+
* to a redacted class.
|
|
5
|
+
*
|
|
6
|
+
* This projection lives in TS alongside the runtime it scores, so the gold-label
|
|
7
|
+
* mapping the benchmark uses is the same mapping the shipped policy enforces.
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
/** Keep set / not-PII sentinel. */
export const KEEP = "O";

/** OpenPII gold label -> Rampart class name, or `KEEP` for labels that pass through. */
const OPENPII_TO_OURS: Readonly<Record<string, string>> = {
  // Split names (a household may share a surname, so they stay distinct).
  GIVENNAME: "GIVEN_NAME",
  SURNAME: "SURNAME",
  // Direct contact identifiers
  EMAIL: "EMAIL",
  TELEPHONENUM: "PHONE",
  // Government / financial strong identifiers (redact)
  SOCIALNUM: "SSN",
  CREDITCARDNUMBER: "CREDIT_CARD",
  IDCARDNUM: "GOVERNMENT_ID",
  PASSPORTNUM: "PASSPORT",
  DRIVERLICENSENUM: "DRIVERS_LICENSE",
  TAXNUM: "TAX_ID",
  // Address components: only the precise street line is redacted at runtime.
  // CITY / STATE / ZIPCODE are in KEEP_LABELS (see src/types.ts) — they ride
  // through the runtime untouched, so the bench must NOT count them as leaks.
  BUILDINGNUM: "BUILDING_NUMBER",
  STREET: "STREET_NAME",
  CITY: KEEP,
  STATE: KEEP,
  ZIPCODE: KEEP,
};

/**
 * Project an OpenPII label onto our schema; unknown → O (pass through).
 *
 * Only the table's own keys count: a label that happens to match an inherited
 * object property (e.g. "constructor", "toString") is treated as unknown
 * instead of returning that inherited value.
 *
 * NOTE(review): the file header says unknown labels "fail safe to a redacted
 * class", while this function passes them through as `KEEP` — confirm which
 * behaviour the benchmark is meant to have.
 *
 * @param label - Raw OpenPII label.
 * @returns The mapped class name, or `KEEP` when the label is not in the table.
 */
export function mapOpenPiiLabel(label: string): string {
  const known = Object.prototype.hasOwnProperty.call(OPENPII_TO_OURS, label);
  return (known ? OPENPII_TO_OURS[label] : undefined) ?? KEEP;
}
|
|
41
|
+
|
|
42
|
+
/** True when the OpenPII label projects onto a redacted class, i.e. anything other than the keep sentinel. */
export function isPrivateLabel(label: string): boolean {
  const projected = mapOpenPiiLabel(label);
  return !(projected === KEEP);
}
|