@nationaldesignstudio/rampart 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +402 -0
- package/MODEL_CARD.md +422 -0
- package/README.md +279 -0
- package/RELEASE.md +97 -0
- package/WHITEPAPER.md +316 -0
- package/dist/index.d.ts +23 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +35639 -0
- package/dist/index.js.map +36 -0
- package/dist/src/guard.d.ts +94 -0
- package/dist/src/guard.d.ts.map +1 -0
- package/dist/src/heuristics.d.ts +14 -0
- package/dist/src/heuristics.d.ts.map +1 -0
- package/dist/src/ner/classifier.d.ts +92 -0
- package/dist/src/ner/classifier.d.ts.map +1 -0
- package/dist/src/ner/worker.d.ts +44 -0
- package/dist/src/ner/worker.d.ts.map +1 -0
- package/dist/src/ner/worker.js +35302 -0
- package/dist/src/ner/worker.js.map +30 -0
- package/dist/src/pipeline.d.ts +76 -0
- package/dist/src/pipeline.d.ts.map +1 -0
- package/dist/src/policy.d.ts +27 -0
- package/dist/src/policy.d.ts.map +1 -0
- package/dist/src/premask.d.ts +48 -0
- package/dist/src/premask.d.ts.map +1 -0
- package/dist/src/session.d.ts +60 -0
- package/dist/src/session.d.ts.map +1 -0
- package/dist/src/streaming.d.ts +32 -0
- package/dist/src/streaming.d.ts.map +1 -0
- package/dist/src/types.d.ts +43 -0
- package/dist/src/types.d.ts.map +1 -0
- package/dist/src/validators.d.ts +16 -0
- package/dist/src/validators.d.ts.map +1 -0
- package/eval/bench/README.md +91 -0
- package/eval/bench/fetch.ts +152 -0
- package/eval/bench/labels.ts +45 -0
- package/eval/bench/run.ts +146 -0
- package/eval/bench/runs/m06-v3-30k/by_language.json +303 -0
- package/eval/bench/runs/m06-v3-30k/summary.json +56 -0
- package/eval/bench/runs/sample-900/by_language.json +303 -0
- package/eval/bench/runs/sample-900/manifest.json +926 -0
- package/eval/bench/runs/sample-900/summary.json +56 -0
- package/eval/bench/score.ts +197 -0
- package/eval/bench/webgpu/entry.ts +70 -0
- package/eval/bench/webgpu/index.html +12 -0
- package/eval/bench/webgpu.ts +209 -0
- package/eval/public-cases.ts +412 -0
- package/eval/run-public-eval.ts +140 -0
- package/examples/basic-chat.ts +12 -0
- package/examples/pii-worker.ts +3 -0
- package/index.ts +47 -0
- package/package.json +103 -0
- package/src/guard.ts +170 -0
- package/src/heuristics.ts +141 -0
- package/src/ner/classifier.ts +580 -0
- package/src/ner/worker.ts +130 -0
- package/src/policy.ts +64 -0
- package/src/premask.ts +90 -0
- package/src/session.ts +99 -0
- package/src/streaming.ts +73 -0
- package/src/types.ts +74 -0
- package/src/validators.ts +40 -0
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
{
|
|
2
|
+
"rows": 900,
|
|
3
|
+
"private_total": 3991,
|
|
4
|
+
"private_recall": 0.9969932347782511,
|
|
5
|
+
"private_recall_wilson95": [
|
|
6
|
+
0.994751461461101,
|
|
7
|
+
0.9982791508164921
|
|
8
|
+
],
|
|
9
|
+
"private_recall_bootstrap95": [
|
|
10
|
+
0.9952392883988975,
|
|
11
|
+
0.9984966173891255
|
|
12
|
+
],
|
|
13
|
+
"leaked": 12,
|
|
14
|
+
"public_total": 2405,
|
|
15
|
+
"public_retained": 0.8731808731808732,
|
|
16
|
+
"over_redacted": 305,
|
|
17
|
+
"span_f1": {
|
|
18
|
+
"iou_1.00": {
|
|
19
|
+
"tp": 2386,
|
|
20
|
+
"fp": 1404,
|
|
21
|
+
"fn": 1605,
|
|
22
|
+
"precision": 0.6295514511873351,
|
|
23
|
+
"recall": 0.59784515159108,
|
|
24
|
+
"f1": 0.6132887803624213
|
|
25
|
+
},
|
|
26
|
+
"iou_0.50": {
|
|
27
|
+
"tp": 2983,
|
|
28
|
+
"fp": 807,
|
|
29
|
+
"fn": 1008,
|
|
30
|
+
"precision": 0.7870712401055409,
|
|
31
|
+
"recall": 0.7474317213730894,
|
|
32
|
+
"f1": 0.7667394936383498
|
|
33
|
+
},
|
|
34
|
+
"iou_0.00": {
|
|
35
|
+
"tp": 3141,
|
|
36
|
+
"fp": 649,
|
|
37
|
+
"fn": 850,
|
|
38
|
+
"precision": 0.8287598944591029,
|
|
39
|
+
"recall": 0.7870207967927838,
|
|
40
|
+
"f1": 0.8073512402004883
|
|
41
|
+
}
|
|
42
|
+
},
|
|
43
|
+
"calibration": {
|
|
44
|
+
"ece": 0.21079228912807385,
|
|
45
|
+
"n_pairs": 3790
|
|
46
|
+
},
|
|
47
|
+
"latency_ms": {
|
|
48
|
+
"cold": 77.65675199999896,
|
|
49
|
+
"p50": 14.373870999999781,
|
|
50
|
+
"p95": 36.97866299999987,
|
|
51
|
+
"p99": 48.98503100000016,
|
|
52
|
+
"mean": 17.03590428111117
|
|
53
|
+
},
|
|
54
|
+
"model": "./model",
|
|
55
|
+
"rows_scored": 900
|
|
56
|
+
}
|
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Scoring for the native bench: term-presence recall/retention, span-level F1,
|
|
3
|
+
* calibration (ECE/Brier) and percentile latency. The predictions are produced
|
|
4
|
+
* by the shipped TypeScript pipeline, so the metrics describe the artifact that
|
|
5
|
+
* ships rather than a separate evaluation implementation.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
/** A labelled region of a text, addressed by `start`/`end` offsets. */
export interface GoldSpan {
  /** Entity label; `matchSpans` only pairs spans whose labels are equal. */
  readonly label: string;
  /** Offset of the first covered position (used as the end of the left `slice` in `redactText`). */
  readonly start: number;
  /** Offset just past the last covered position (used as the start of the right `slice` in `redactText`). */
  readonly end: number;
}
|
|
13
|
+
|
|
14
|
+
/** A predicted span: everything a `GoldSpan` carries plus the detector's confidence. */
export interface PredSpan extends GoldSpan {
  /** Detector confidence; deterministic spans are 1. */
  readonly score: number;
}
|
|
18
|
+
|
|
19
|
+
/**
 * Wilson score interval for a binomial proportion (95% at the default `z`).
 * Robust at small `n` and when the observed proportion is exactly 0 or 1.
 *
 * @param k - Number of successes.
 * @param n - Number of trials; `n === 0` yields the vacuous interval `[0, 1]`.
 * @param z - Normal quantile for the desired confidence level.
 * @returns `[lower, upper]`, each clamped into `[0, 1]`.
 */
export function wilsonCi(k: number, n: number, z = 1.96): [number, number] {
  if (n === 0) {
    return [0, 1];
  }
  const observed = k / n;
  const scale = 1 + (z * z) / n;
  const midpoint = (observed + (z * z) / (2 * n)) / scale;
  const spread = Math.sqrt((observed * (1 - observed)) / n + (z * z) / (4 * n * n));
  const margin = (z * spread) / scale;
  const lower = Math.max(0, midpoint - margin);
  const upper = Math.min(1, midpoint + margin);
  return [lower, upper];
}
|
|
28
|
+
|
|
29
|
+
/**
 * Deterministic PRNG so bootstrap CIs are reproducible run to run.
 *
 * @param seed - Coerced to an unsigned 32-bit integer and used as the initial state.
 * @returns A generator producing floats in `[0, 1)`.
 */
function mulberry32(seed: number): () => number {
  let a = seed >>> 0;
  return () => {
    a = (a + 0x6d2b79f5) | 0;
    let t = Math.imul(a ^ (a >>> 15), 1 | a);
    t = (t + Math.imul(t ^ (t >>> 7), 61 | t)) ^ t;
    // Unsigned 32-bit value divided by 2^32, so the result is always below 1.
    return ((t ^ (t >>> 14)) >>> 0) / 4294967296;
  };
}

/**
 * Percentile-bootstrap CI for the mean of 0/1 outcomes.
 *
 * @param successes - One boolean per trial.
 * @param iters - Number of bootstrap resamples.
 * @param alpha - Two-sided miscoverage; the interval spans the `alpha/2` and
 *   `1 - alpha/2` quantiles of the resampled means.
 * @param seed - PRNG seed; the same seed and inputs give the same interval.
 * @returns `[lower, upper]`; `[0, 1]` when there are no outcomes or no resamples.
 */
export function bootstrapCi(successes: readonly boolean[], iters = 1000, alpha = 0.05, seed = 0): [number, number] {
  const n = successes.length;
  // Nothing to resample (or no resamples requested): report the vacuous interval
  // rather than indexing into an empty array.
  if (n === 0 || !(iters > 0)) return [0, 1];
  const rng = mulberry32(seed);
  const means: number[] = [];
  for (let it = 0; it < iters; it++) {
    let s = 0;
    for (let i = 0; i < n; i++) s += successes[Math.floor(rng() * n)] ? 1 : 0;
    means.push(s / n);
  }
  means.sort((a, b) => a - b);
  // Clamp the rank: with alpha = 0 the upper rank equals `iters`, one past the end.
  const last = means.length - 1;
  const at = (rank: number): number => means[Math.min(last, Math.max(0, Math.floor(rank)))];
  return [at((iters * alpha) / 2), at(iters * (1 - alpha / 2))];
}
|
|
54
|
+
|
|
55
|
+
/**
 * Intersection-over-union of two spans along the offset axis. Labels are not
 * consulted here.
 *
 * @returns 0 when the spans share no extent; otherwise the overlap length
 *   divided by the distance from the earliest start to the latest end.
 */
export function iou(a: GoldSpan, b: GoldSpan): number {
  const overlapStart = Math.max(a.start, b.start);
  const overlapEnd = Math.min(a.end, b.end);
  if (overlapEnd <= overlapStart) {
    return 0;
  }
  const hullStart = Math.min(a.start, b.start);
  const hullEnd = Math.max(a.end, b.end);
  return (overlapEnd - overlapStart) / (hullEnd - hullStart);
}
|
|
60
|
+
|
|
61
|
+
/**
 * Counts (TP, FP, FN) under one-to-one greedy matching at the given IoU.
 *
 * Predictions are visited from highest to lowest score. Each one claims the
 * still-unclaimed gold span with the same label and the largest IoU, provided
 * that IoU is at least `threshold` and strictly positive. A prediction that
 * claims nothing is a false positive; gold spans never claimed are false negatives.
 */
export function matchSpans(gold: readonly GoldSpan[], pred: readonly PredSpan[], threshold = 1): { tp: number; fp: number; fn: number } {
  const claimed = new Set<number>();
  const byConfidence = pred.slice().sort((x, y) => y.score - x.score);
  let fp = 0;

  for (const candidate of byConfidence) {
    let winner = -1;
    let winnerIou = 0;
    for (const [idx, target] of gold.entries()) {
      if (claimed.has(idx)) continue;
      if (target.label !== candidate.label) continue;
      const overlap = iou(candidate, target);
      // Starting the best IoU at 0 means a zero-overlap pair never matches, even at threshold 0.
      if (!(overlap >= threshold && overlap > winnerIou)) continue;
      winner = idx;
      winnerIou = overlap;
    }
    if (winner < 0) {
      fp += 1;
    } else {
      claimed.add(winner);
    }
  }

  return { tp: claimed.size, fp, fn: gold.length - claimed.size };
}
|
|
87
|
+
|
|
88
|
+
/**
 * Precision, recall and their harmonic mean from raw match counts.
 * Any ratio whose denominator is zero is reported as 0.
 */
export function f1(tp: number, fp: number, fn: number): { precision: number; recall: number; f1: number } {
  const ratio = (numerator: number, denominator: number): number => (denominator ? numerator / denominator : 0);
  const precision = ratio(tp, tp + fp);
  const recall = ratio(tp, tp + fn);
  const harmonic = ratio(2 * precision * recall, precision + recall);
  return { precision, recall, f1: harmonic };
}
|
|
94
|
+
|
|
95
|
+
/**
 * Substitutes a `[LABEL]` placeholder for every span. Spans are applied from the
 * highest `start` to the lowest so earlier offsets stay valid while the string
 * changes length.
 */
export function redactText(raw: string, spans: readonly GoldSpan[]): string {
  const rightToLeft = spans.slice().sort((x, y) => y.start - x.start);
  return rightToLeft.reduce(
    (text, span) => `${text.slice(0, span.start)}[${span.label}]${text.slice(span.end)}`,
    raw,
  );
}
|
|
103
|
+
|
|
104
|
+
/** Per-row term-presence counts, as produced by `termPresence`. */
export interface TermResult {
  /** Private terms still found in the redacted text. */
  readonly leaked: number;
  /** Private terms not counted as leaked (`goldPrivate.length - leaked`). */
  readonly protectedCount: number;
  /** Public terms that no longer appear in the redacted text. */
  readonly over: number;
  /** Public terms not counted as over-redacted (`goldPublic.length - over`). */
  readonly retained: number;
}
|
|
110
|
+
|
|
111
|
+
/**
 * Scores one row by substring presence: every private term should be gone from
 * the redacted text and every public term should still be there.
 *
 * Empty-string terms are never counted as leaked or over-redacted, so they end
 * up in the protected / retained totals.
 */
export function termPresence(redacted: string, goldPrivate: readonly string[], goldPublic: readonly string[]): TermResult {
  let leaked = 0;
  for (const term of goldPrivate) {
    if (term && redacted.includes(term)) leaked += 1;
  }

  let over = 0;
  for (const term of goldPublic) {
    if (term && !redacted.includes(term)) over += 1;
  }

  return {
    leaked,
    protectedCount: goldPrivate.length - leaked,
    over,
    retained: goldPublic.length - over,
  };
}
|
|
117
|
+
|
|
118
|
+
/**
 * Expected calibration error over (predicted_score, was_correct) pairs, using
 * `nBins` equal-width reliability bins (15 by default).
 *
 * Each pair lands in bin `floor(score * nBins)`, with the top bin also taking
 * a score of exactly 1. The result is the sum over non-empty bins of
 * |accuracy - mean confidence|, weighted by the share of pairs in the bin.
 * Returns 0 for no pairs.
 */
export function expectedCalibrationError(pairs: readonly [number, boolean][], nBins = 15): number {
  const total = pairs.length;
  if (total === 0) return 0;

  const buckets = Array.from({ length: nBins }, (): [number, boolean][] => []);
  for (const [score, correct] of pairs) {
    const slot = Math.min(nBins - 1, Math.floor(score * nBins));
    buckets[slot].push([score, correct]);
  }

  let weightedGap = 0;
  for (const bucket of buckets) {
    const count = bucket.length;
    if (count === 0) continue;
    let hits = 0;
    let confidenceSum = 0;
    for (const [score, correct] of bucket) {
      if (correct) hits += 1;
      confidenceSum += score;
    }
    weightedGap += Math.abs(hits / count - confidenceSum / count) * (count / total);
  }
  return weightedGap;
}
|
|
132
|
+
|
|
133
|
+
/**
 * Brier score: mean squared difference between each predicted score and its
 * 0/1 outcome. Returns 0 for no pairs.
 */
export function brierScore(pairs: readonly [number, boolean][]): number {
  if (pairs.length === 0) return 0;
  let squaredError = 0;
  for (const [score, correct] of pairs) {
    const residual = score - (correct ? 1 : 0);
    squaredError += residual ** 2;
  }
  return squaredError / pairs.length;
}
|
|
137
|
+
|
|
138
|
+
/**
 * Picks the element at rank `floor(length * p)` from an already-sorted array,
 * capped at the last element. Returns 0 for an empty array.
 *
 * @param sorted - Values in ascending order; this function does not sort them.
 * @param p - Fraction such as 0.5 or 0.95.
 */
export function percentile(sorted: readonly number[], p: number): number {
  const count = sorted.length;
  if (count === 0) {
    return 0;
  }
  const rank = Math.floor(count * p);
  const capped = Math.min(count - 1, rank);
  return sorted[capped];
}
|
|
142
|
+
|
|
143
|
+
/**
 * One reporting bucket: per-row term and span results are folded in with
 * `addTerm` / `addSpans`, and `report()` emits the aggregate in the key order
 * used by summary.json.
 */
export class Stratum {
  /** Rows folded in through `addTerm`. */
  rows = 0;
  /** Private terms that survived redaction, summed over rows. */
  leaked = 0;
  /** Private terms seen (leaked + protected), summed over rows. */
  privateTotal = 0;
  /** Public terms lost to redaction, summed over rows. */
  over = 0;
  /** Public terms seen (over-redacted + retained), summed over rows. */
  publicTotal = 0;
  /** One entry per private term (true = protected); fed to the bootstrap CI. */
  private readonly outcomes: boolean[] = [];
  /** Span match counts, keyed by the threshold formatted with two decimals. */
  private readonly spanTp: Record<string, number> = {};
  private readonly spanFp: Record<string, number> = {};
  private readonly spanFn: Record<string, number> = {};

  /** Folds one row's term-presence result into the running totals. */
  addTerm(term: TermResult): void {
    const { leaked, protectedCount, over, retained } = term;
    this.rows += 1;
    this.leaked += leaked;
    this.privateTotal += leaked + protectedCount;
    this.over += over;
    this.publicTotal += over + retained;
    // Protected outcomes go in before leaked ones; the seeded bootstrap resamples by index.
    let remaining = protectedCount;
    while (remaining-- > 0) this.outcomes.push(true);
    remaining = leaked;
    while (remaining-- > 0) this.outcomes.push(false);
  }

  /** Matches one row's spans at every IoU threshold and accumulates TP/FP/FN per threshold. */
  addSpans(gold: readonly GoldSpan[], pred: readonly PredSpan[], thresholds: readonly number[]): void {
    for (const threshold of thresholds) {
      const bucket = threshold.toFixed(2);
      const counts = matchSpans(gold, pred, threshold);
      this.spanTp[bucket] = (this.spanTp[bucket] ?? 0) + counts.tp;
      this.spanFp[bucket] = (this.spanFp[bucket] ?? 0) + counts.fp;
      this.spanFn[bucket] = (this.spanFn[bucket] ?? 0) + counts.fn;
    }
  }

  /**
   * Builds the aggregate record. Recall and retention default to 1 when no
   * private / public terms were seen.
   */
  report(): Record<string, unknown> {
    const protectedTotal = this.privateTotal - this.leaked;
    const retainedTotal = this.publicTotal - this.over;

    const spanF1: Record<string, unknown> = Object.fromEntries(
      Object.keys(this.spanTp).map((key) => {
        const tp = this.spanTp[key];
        const fp = this.spanFp[key];
        const fn = this.spanFn[key];
        return [`iou_${key}`, { tp, fp, fn, ...f1(tp, fp, fn) }];
      }),
    );

    return {
      rows: this.rows,
      private_total: this.privateTotal,
      private_recall: this.privateTotal ? protectedTotal / this.privateTotal : 1,
      private_recall_wilson95: wilsonCi(protectedTotal, this.privateTotal),
      private_recall_bootstrap95: bootstrapCi(this.outcomes),
      leaked: this.leaked,
      public_total: this.publicTotal,
      public_retained: this.publicTotal ? retainedTotal / this.publicTotal : 1,
      over_redacted: this.over,
      span_f1: spanF1,
    };
  }
}
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Browser entry for the WebGPU latency harness.
|
|
3
|
+
*
|
|
4
|
+
* Runs the exact shipped detection path (heuristics → premask → NER → policy)
|
|
5
|
+
* the same way `eval/bench/run.ts` does, but inside a real Chromium tab with the
|
|
6
|
+
* NER model executing on the WebGPU backend. The Node bench measures ORT-CPU
|
|
7
|
+
* latency; this measures the form factor that actually ships to the browser.
|
|
8
|
+
*
|
|
9
|
+
* `bun eval/bench/webgpu.ts` bundles this for the browser, serves it, drives it
|
|
10
|
+
* with Playwright, and writes the latency summary.
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import { env } from "@huggingface/transformers";
|
|
14
|
+
import { detectHeuristics } from "../../../src/heuristics";
|
|
15
|
+
import { detectNer, loadNerClassifier, type TokenClassifier } from "../../../src/ner/classifier";
|
|
16
|
+
import { applyPolicy } from "../../../src/policy";
|
|
17
|
+
import { premask, projectMaskedSpan } from "../../../src/premask";
|
|
18
|
+
import type { Span } from "../../../src/types";
|
|
19
|
+
|
|
20
|
+
/** Options the Playwright driver passes to `init`. */
interface BenchOptions {
  /** Base URL that transformers.js resolves local model ids against. */
  readonly modelBaseUrl: string;
  /** Execution backend handed to `loadNerClassifier`. */
  readonly device: "webgpu" | "wasm";
}
|
|
24
|
+
|
|
25
|
+
/** Set by `init`; `scrubOnceTimed` throws while this is still null. */
let classifier: TokenClassifier | null = null;
|
|
26
|
+
|
|
27
|
+
/**
 * Points transformers.js at the locally served model: remote model loading is
 * switched off, local loading is switched on, and `localModelPath` is set to
 * `baseUrl` with exactly one trailing slash appended when it is missing.
 */
function configureEnv(baseUrl: string): void {
  env.allowRemoteModels = false;
  env.allowLocalModels = true;

  // transformers.js resolves a model id against localModelPath; "model" → /model.
  let modelRoot = baseUrl;
  if (!modelRoot.endsWith("/")) {
    modelRoot = `${modelRoot}/`;
  }
  env.localModelPath = modelRoot;
}
|
|
34
|
+
|
|
35
|
+
/**
 * Prepares the page for benchmarking: configures model resolution, loads the
 * NER classifier on the requested device, and reports which GPU adapter the
 * browser exposes.
 *
 * @returns `adapter` as "vendor/architecture" (with "?" for a missing field),
 *   or "n/a" when `navigator.gpu` is absent or yields no adapter info.
 */
async function init(opts: BenchOptions): Promise<{ ok: true; adapter: string }> {
  type AdapterInfo = { vendor?: string; architecture?: string };
  type GpuLike = { requestAdapter: () => Promise<{ info?: AdapterInfo } | null> };

  configureEnv(opts.modelBaseUrl);
  classifier = await loadNerClassifier({ device: opts.device });

  // `navigator.gpu` is read through a local structural type rather than the DOM typings.
  const { gpu } = navigator as unknown as { gpu?: GpuLike };
  const info = gpu ? (await gpu.requestAdapter())?.info : undefined;
  if (!info) {
    return { ok: true, adapter: "n/a" };
  }
  const vendor = info.vendor ?? "?";
  const architecture = info.architecture ?? "?";
  return { ok: true, adapter: `${vendor}/${architecture}` };
}
|
|
43
|
+
|
|
44
|
+
/**
 * Pushes one text through heuristics → premask → NER → policy and returns the
 * wall-clock milliseconds that took. The policy result is discarded; only the
 * elapsed time is reported.
 *
 * @throws Error when `init` has not loaded the classifier yet.
 */
async function scrubOnceTimed(text: string): Promise<number> {
  if (!classifier) throw new Error("classifier not initialised");

  const startedAt = performance.now();

  const heuristicSpans = detectHeuristics(text);
  const maskMap = premask(text, heuristicSpans);
  const nerSpans = await detectNer(maskMap.masked, classifier);

  // Map each NER span from masked-text coordinates back onto the original text;
  // spans that do not project (null) are dropped.
  const modelSpans: Span[] = [];
  for (const span of nerSpans) {
    const onOriginal = projectMaskedSpan(span, text, maskMap);
    if (onOriginal === null) continue;
    modelSpans.push(onOriginal);
  }

  applyPolicy([...heuristicSpans, ...modelSpans]);

  const finishedAt = performance.now();
  return finishedAt - startedAt;
}
|
|
59
|
+
|
|
60
|
+
/**
 * Benchmark loop: one cold inference, `warmup` untimed-for-reporting
 * inferences, then `iters` timed inferences. Inputs are taken from `texts`
 * round-robin.
 *
 * @param texts - Input strings; must contain at least one entry.
 * @param warmup - Number of warm-up inferences whose timings are discarded.
 * @param iters - Number of inferences whose timings are returned.
 * @returns The per-inference latencies in ms (in run order) and the cold-start latency.
 * @throws Error when `texts` is empty, instead of feeding `undefined` into the pipeline.
 */
async function run(texts: readonly string[], warmup: number, iters: number): Promise<{ latencies: number[]; cold: number }> {
  if (texts.length === 0) throw new Error("run: no input texts supplied");
  // Cold start: first inference pays model-graph compilation.
  const cold = await scrubOnceTimed(texts[0]);
  for (let i = 0; i < warmup; i++) await scrubOnceTimed(texts[i % texts.length]);
  const latencies: number[] = [];
  for (let i = 0; i < iters; i++) latencies.push(await scrubOnceTimed(texts[i % texts.length]));
  return { latencies, cold };
}
|
|
68
|
+
|
|
69
|
+
// Publish the harness on the global object so the Playwright driver can call
// `rampartBench.init` / `rampartBench.run` from `page.evaluate`.
Object.assign(globalThis, { rampartBench: { init, run } });
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
<!doctype html>
|
|
2
|
+
<html lang="en">
|
|
3
|
+
<head>
|
|
4
|
+
<meta charset="utf-8" />
|
|
5
|
+
<title>Rampart WebGPU bench</title>
|
|
6
|
+
</head>
|
|
7
|
+
<body>
|
|
8
|
+
<h1>Rampart WebGPU latency harness</h1>
|
|
9
|
+
<pre id="status">loading bundle…</pre>
|
|
10
|
+
<script type="module" src="/eval/bench/webgpu/bundle.js"></script>
|
|
11
|
+
</body>
|
|
12
|
+
</html>
|
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* WebGPU latency harness.
|
|
3
|
+
*
|
|
4
|
+
* Bundles the shipped detection path for the browser, serves the repo over
|
|
5
|
+
* http://localhost (a WebGPU secure context), drives Playwright's bundled
|
|
6
|
+
* Chromium, runs warmup + timed inferences with the NER model on the WebGPU
|
|
7
|
+
* backend, and writes percentile latency to `runs/<out>/latency.json`.
|
|
8
|
+
*
|
|
9
|
+
* bun eval/bench/webgpu.ts # WebGPU, q4
|
|
10
|
+
* bun eval/bench/webgpu.ts --device wasm # WASM baseline for contrast
|
|
11
|
+
* bun eval/bench/webgpu.ts --iters 400 --headed # more samples, visible window
|
|
12
|
+
*
|
|
13
|
+
* Inputs default to the frozen held-out slice (`eval/bench/data/heldout.jsonl`,
|
|
14
|
+
* materialised by `bun run bench:fetch`) so browser latency is measured over the
|
|
15
|
+
* same rows as the Node bench; if that file is absent it falls back to the
|
|
16
|
+
* committed `eval/public-cases.ts` chat strings. Override with `--data <path>`.
|
|
17
|
+
*
|
|
18
|
+
* Uses Playwright's bundled Chromium — no system Chrome required. The launch
|
|
19
|
+
* strips Playwright's GPU-disabling default args and forces ANGLE/Metal so the
|
|
20
|
+
* headless browser reaches the real GPU instead of the SwiftShader software
|
|
21
|
+
* adapter. Point at a system Chrome with --chrome /path/to/chrome if you prefer.
|
|
22
|
+
*/
|
|
23
|
+
|
|
24
|
+
import { mkdir, readFile, writeFile } from "node:fs/promises";
|
|
25
|
+
import { createServer } from "node:http";
|
|
26
|
+
import { extname, join } from "node:path";
|
|
27
|
+
import { chromium } from "playwright";
|
|
28
|
+
import { PUBLIC_E2E_CASES } from "../public-cases";
|
|
29
|
+
import { percentile } from "./score";
|
|
30
|
+
|
|
31
|
+
/** Two directories above this script; the directory `serveRoot` serves files from and the base of the default `--data` path. */
const ROOT = join(import.meta.dir, "..", "..");
|
|
32
|
+
|
|
33
|
+
/**
 * Reads a string CLI option from `Bun.argv`.
 *
 * `--name=value` wins if present anywhere. Otherwise the token following the
 * first `--name` is used, unless it is missing, empty, or itself starts with
 * `--`. In every other case `fallback` is returned.
 */
function arg(name: string, fallback: string): string {
  const flag = `--${name}`;
  const inlinePrefix = `${flag}=`;

  for (const token of Bun.argv) {
    if (token.startsWith(inlinePrefix)) return token.slice(inlinePrefix.length);
  }

  const at = Bun.argv.indexOf(flag);
  if (at < 0) return fallback;

  const next = Bun.argv[at + 1];
  if (!next || next.startsWith("--")) return fallback;
  return next;
}
|
|
39
|
+
|
|
40
|
+
/**
 * Load the benchmark input texts. Prefers the frozen held-out slice
 * (`eval/bench/data/heldout.jsonl`, gitignored, materialised by `bench:fetch`)
 * so browser latency is measured over the same rows as the Node bench; falls
 * back to the committed `public-cases.ts` chat strings when it is absent.
 *
 * Each line of the file is parsed as JSON and contributes its `source_text`
 * when that is a non-empty string. A missing file falls back silently; any
 * other failure (unreadable file, malformed line) also falls back but prints a
 * warning, so a run never switches input sets without saying why.
 *
 * @param dataPath - Path of the JSONL file to try first.
 * @returns The texts plus `source`: `dataPath` when the file was used,
 *   otherwise the literal "public-cases".
 */
async function loadTexts(dataPath: string): Promise<{ texts: string[]; source: string }> {
  try {
    const raw = await readFile(dataPath, "utf8");
    const texts: string[] = [];
    for (const line of raw.trim().split("\n")) {
      if (!line) continue;
      const row: unknown = JSON.parse(line);
      const text = typeof row === "object" && row !== null ? (row as { source_text?: unknown }).source_text : undefined;
      if (typeof text === "string" && text.length > 0) texts.push(text);
    }
    if (texts.length) return { texts, source: dataPath };
  } catch (err: unknown) {
    // fall through to public cases; only a missing file is an expected, quiet fallback
    const code = typeof err === "object" && err !== null && "code" in err ? (err as { code?: unknown }).code : undefined;
    if (code !== "ENOENT") {
      const reason = err instanceof Error ? err.message : String(err);
      console.warn(`could not load ${dataPath} (${reason}); falling back to public cases`);
    }
  }
  return { texts: PUBLIC_E2E_CASES.map((c) => c.input), source: "public-cases" };
}
|
|
61
|
+
|
|
62
|
+
/**
 * Browser executable to launch. Returns the `--chrome` path when one was given;
 * otherwise `undefined`, which leaves Playwright on its bundled Chromium.
 */
function resolveChrome(): string | undefined {
  const requested = arg("chrome", "");
  return requested === "" ? undefined : requested;
}
|
|
68
|
+
|
|
69
|
+
/** Content-Type by file extension for the static server; unlisted extensions are handled by the caller. */
const MIME: Record<string, string> = Object.fromEntries([
  [".html", "text/html"],
  [".js", "text/javascript"],
  [".json", "application/json"],
  [".onnx", "application/octet-stream"],
  [".txt", "text/plain"],
  [".wasm", "application/wasm"],
]);
|
|
77
|
+
|
|
78
|
+
/**
 * Bundles `webgpu/entry.ts` for the browser as an ES module and writes it next
 * to the entry as `bundle.js`.
 *
 * @throws Error("bundle failed") after printing every build log when the build does not succeed.
 */
async function bundleEntry(): Promise<void> {
  const webgpuDir = join(import.meta.dir, "webgpu");
  const { success, logs } = await Bun.build({
    entrypoints: [join(webgpuDir, "entry.ts")],
    target: "browser",
    format: "esm",
    outdir: webgpuDir,
    naming: "bundle.js",
  });
  if (success) return;

  logs.forEach((entry) => console.error(entry));
  throw new Error("bundle failed");
}
|
|
91
|
+
|
|
92
|
+
/**
 * Starts a static file server over `ROOT` on an OS-assigned port.
 *
 * `/` maps to the harness page. Every response carries the cross-origin
 * isolation headers. Requests are rejected when the path cannot be decoded
 * (400) or, once decoded, contains a NUL byte or a `..` segment (403), so an
 * encoded traversal such as `..%2f` cannot reach files outside `ROOT`.
 *
 * @returns The bound port and a `close` function that stops the server.
 *   The promise rejects if the server fails to start listening.
 */
function serveRoot(): Promise<{ port: number; close: () => void }> {
  const server = createServer(async (req, res) => {
    try {
      const url = new URL(req.url ?? "/", "http://localhost");
      const path = url.pathname === "/" ? "/eval/bench/webgpu/index.html" : url.pathname;

      let decoded: string;
      try {
        decoded = decodeURIComponent(path);
      } catch {
        // Malformed percent-escape: answer instead of leaving the request hanging.
        res.statusCode = 400;
        res.end("bad request");
        return;
      }
      // URL parsing collapses literal `..` segments, but `%2f`-encoded separators
      // only become slashes after decoding, so check the decoded form.
      if (decoded.includes("\0") || decoded.split(/[\\/]/).includes("..")) {
        res.statusCode = 403;
        res.end("forbidden");
        return;
      }

      const file = Bun.file(join(ROOT, decoded));
      if (!(await file.exists())) {
        res.statusCode = 404;
        res.end("not found");
        return;
      }
      res.setHeader("Content-Type", MIME[extname(path)] ?? "application/octet-stream");
      // WebGPU + threaded WASM want cross-origin isolation; harmless for single-thread.
      res.setHeader("Cross-Origin-Opener-Policy", "same-origin");
      res.setHeader("Cross-Origin-Embedder-Policy", "require-corp");
      res.end(Buffer.from(await file.arrayBuffer()));
    } catch (err) {
      // An async handler must not reject: report and close the response.
      console.error(err);
      if (!res.writableEnded) {
        if (!res.headersSent) res.statusCode = 500;
        res.end("internal error");
      }
    }
  });
  return new Promise((resolve, reject) => {
    server.once("error", reject);
    // NOTE(review): no host is given, so this is not restricted to loopback — confirm that is intended.
    server.listen(0, () => {
      server.off("error", reject);
      const addr = server.address();
      const port = typeof addr === "object" && addr ? addr.port : 0;
      resolve({ port, close: () => server.close() });
    });
  });
}
|
|
116
|
+
|
|
117
|
+
/**
 * Runs the whole harness: parse options, bundle the browser entry, load the
 * input texts, serve the files, launch Chromium, run the in-page benchmark,
 * and write `latency.json` under `runs/<out>/`.
 *
 * Options read from the command line: `--device` (webgpu | wasm), `--iters`,
 * `--warmup`, `--data`, `--out`, `--chrome`, `--headed`.
 *
 * @throws Error on an unknown `--device`, a non-numeric `--warmup`, a
 *   non-positive or non-numeric `--iters`, or when the page returns no latencies.
 */
async function main(): Promise<void> {
  const deviceArg = arg("device", "webgpu");
  if (deviceArg !== "webgpu" && deviceArg !== "wasm") {
    throw new Error(`--device must be "webgpu" or "wasm", got "${deviceArg}"`);
  }
  const device = deviceArg as "webgpu" | "wasm";
  const itersArg = arg("iters", "");
  const warmup = Number(arg("warmup", "20"));
  if (!Number.isFinite(warmup)) throw new Error("--warmup must be a number");
  const dataPath = arg("data", join(ROOT, "eval/bench/data/heldout.jsonl"));
  const headed = Bun.argv.includes("--headed");
  const outDir = join(import.meta.dir, "runs", arg("out", `${device}-q4`));

  const chrome = resolveChrome();

  await bundleEntry();
  const { texts, source } = await loadTexts(dataPath);
  // Default to one timed inference per input row (a full pass over the slice);
  // for the small public-cases set, run a few cycles so percentiles are stable.
  const iters = itersArg ? Number(itersArg) : Math.max(texts.length, source === "public-cases" ? 300 : texts.length);
  if (!Number.isFinite(iters) || iters <= 0) throw new Error("--iters must be a positive number");

  const { port, close } = await serveRoot();
  try {
    const browser = await chromium.launch({
      headless: !headed,
      executablePath: chrome,
      // Drop Playwright's GPU-disabling defaults so the bundled Chromium can reach
      // the real GPU. Without removing --use-gl=swiftshader, WebGPU silently falls
      // back to the SwiftShader software adapter (~800 ms/inference, useless here).
      ignoreDefaultArgs: ["--disable-gpu", "--use-gl=swiftshader", "--disable-gpu-compositing"],
      // --use-angle=metal is what gets the bundled headless Chromium onto Metal on
      // macOS; on Linux the Vulkan feature flag does the equivalent.
      args: ["--enable-unsafe-webgpu", "--use-angle=metal", "--enable-features=WebGPU,Vulkan,Metal", "--ignore-gpu-blocklist"],
    });
    try {
      const page = await browser.newPage();
      // Page-side errors are only surfaced if the run produces no latencies.
      const errors: string[] = [];
      page.on("pageerror", (e) => errors.push(e.message));
      page.on("console", (m) => {
        if (m.type() === "error") errors.push(m.text());
      });
      await page.goto(`http://localhost:${port}/`, { waitUntil: "load" });
      await page.waitForFunction(() => "rampartBench" in globalThis, { timeout: 30_000 });

      const initResult = await page.evaluate(
        async ([base, dev]) =>
          (globalThis as unknown as { rampartBench: { init: (o: unknown) => Promise<{ adapter: string }> } }).rampartBench.init({
            modelBaseUrl: base,
            device: dev,
          }),
        [`http://localhost:${port}`, device] as const,
      );

      const { latencies, cold } = await page.evaluate(
        async ([ts, w, it]) =>
          (globalThis as unknown as { rampartBench: { run: (t: string[], w: number, it: number) => Promise<{ latencies: number[]; cold: number }> } }).rampartBench.run(
            ts as string[],
            w as number,
            it as number,
          ),
        [texts, warmup, iters] as const,
      );

      if (!latencies.length) throw new Error(`no latencies collected. page errors: ${errors.join(" | ") || "none"}`);

      // `percentile` expects ascending input.
      latencies.sort((a, b) => a - b);
      const summary = {
        device,
        dtype: "q4",
        adapter: initResult.adapter,
        browser: browser.version(),
        inputs: source === "public-cases" ? "public-cases" : "heldout",
        input_texts: texts.length,
        rows_scored: latencies.length,
        warmup,
        latency_ms: {
          cold,
          p50: percentile(latencies, 0.5),
          p95: percentile(latencies, 0.95),
          p99: percentile(latencies, 0.99),
          mean: latencies.reduce((a, b) => a + b, 0) / latencies.length,
        },
      };

      await mkdir(outDir, { recursive: true });
      await writeFile(join(outDir, "latency.json"), JSON.stringify(summary, null, 2) + "\n");

      const l = summary.latency_ms;
      console.log(`\n${device}/q4 on ${summary.adapter} (${summary.browser})`);
      console.log(`inputs: ${summary.inputs} (${summary.input_texts} rows, ${summary.rows_scored} timed)`);
      console.log(`cold ${l.cold.toFixed(1)} ms · p50 ${l.p50.toFixed(2)} ms · p95 ${l.p95.toFixed(2)} ms · p99 ${l.p99.toFixed(2)} ms · mean ${l.mean.toFixed(2)} ms`);
      console.log(`wrote ${join(outDir, "latency.json")}`);
    } finally {
      await browser.close();
    }
  } finally {
    // Runs even when the launch or the browser shutdown throws, so the server never outlives the run.
    close();
  }
}
|
|
208
|
+
|
|
209
|
+
// Script entry point: run the harness via top-level await, so a rejection from main() propagates out of the module.
await main();
|