vskill 0.2.75 → 0.2.76
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/eval/benchmark-history.d.ts +31 -4
- package/dist/eval/benchmark-history.js +83 -4
- package/dist/eval/benchmark-history.js.map +1 -1
- package/dist/eval/benchmark.d.ts +32 -0
- package/dist/eval/benchmark.js.map +1 -1
- package/dist/eval-server/api-routes.js +165 -99
- package/dist/eval-server/api-routes.js.map +1 -1
- package/dist/eval-server/benchmark-runner.d.ts +16 -0
- package/dist/eval-server/benchmark-runner.js +114 -0
- package/dist/eval-server/benchmark-runner.js.map +1 -0
- package/dist/eval-ui/assets/index-BYv6znnG.css +1 -0
- package/dist/eval-ui/assets/index-Bp1HQKak.js +70 -0
- package/dist/eval-ui/index.html +2 -2
- package/dist/installer/canonical.js +1 -1
- package/dist/installer/canonical.js.map +1 -1
- package/dist/installer/canonical.test.js +8 -0
- package/dist/installer/canonical.test.js.map +1 -1
- package/package.json +1 -1
- package/dist/eval-ui/assets/index-BsNUxjb1.js +0 -70
- package/dist/eval-ui/assets/index-D5mEzX7i.css +0 -1
|
@@ -1,11 +1,34 @@
|
|
|
1
|
-
import type { BenchmarkResult } from "./benchmark.js";
|
|
1
|
+
import type { BenchmarkResult, BenchmarkAssertionResult } from "./benchmark.js";
|
|
2
2
|
export interface HistorySummary {
|
|
3
3
|
timestamp: string;
|
|
4
4
|
filename: string;
|
|
5
5
|
model: string;
|
|
6
6
|
skillName: string;
|
|
7
7
|
passRate: number;
|
|
8
|
-
type: "benchmark" | "comparison";
|
|
8
|
+
type: "benchmark" | "comparison" | "baseline";
|
|
9
|
+
caseCount: number;
|
|
10
|
+
totalDurationMs: number;
|
|
11
|
+
totalTokens: number | null;
|
|
12
|
+
provider?: string;
|
|
13
|
+
verdict?: string;
|
|
14
|
+
}
|
|
15
|
+
/** Optional criteria accepted by `listHistory` to narrow the returned runs. */
export interface HistoryFilter {
|
|
16
|
+
model?: string; // keep only runs whose recorded model equals this value
|
|
17
|
+
type?: "benchmark" | "comparison" | "baseline"; // keep only runs of this type; runs stored without a type are treated as "benchmark"
|
|
18
|
+
from?: string; // inclusive lower bound, compared against the timestamp encoded in the history file name
|
|
19
|
+
to?: string; // inclusive upper bound, compared against the timestamp encoded in the history file name
|
|
20
|
+
}
|
|
21
|
+
export interface CaseHistoryEntry {
|
|
22
|
+
timestamp: string;
|
|
23
|
+
model: string;
|
|
24
|
+
type: "benchmark" | "comparison" | "baseline";
|
|
25
|
+
provider?: string;
|
|
26
|
+
pass_rate: number;
|
|
27
|
+
durationMs?: number;
|
|
28
|
+
tokens?: number | null;
|
|
29
|
+
inputTokens?: number | null;
|
|
30
|
+
outputTokens?: number | null;
|
|
31
|
+
assertions: BenchmarkAssertionResult[];
|
|
9
32
|
}
|
|
10
33
|
export interface RegressionEntry {
|
|
11
34
|
assertionId: string;
|
|
@@ -16,8 +39,12 @@ export interface RegressionEntry {
|
|
|
16
39
|
change: "regression" | "improvement";
|
|
17
40
|
}
|
|
18
41
|
export declare function writeHistoryEntry(skillDir: string, result: BenchmarkResult & {
|
|
19
|
-
type?: "benchmark" | "comparison";
|
|
42
|
+
type?: "benchmark" | "comparison" | "baseline";
|
|
20
43
|
}): Promise<string>;
|
|
21
|
-
export declare function
|
|
44
|
+
export declare function deleteHistoryEntry(skillDir: string, timestamp: string): Promise<boolean>;
|
|
45
|
+
export declare function listHistory(skillDir: string, filter?: HistoryFilter): Promise<HistorySummary[]>;
|
|
22
46
|
export declare function readHistoryEntry(skillDir: string, timestamp: string): Promise<BenchmarkResult | null>;
|
|
47
|
+
export declare function getCaseHistory(skillDir: string, evalId: number, filter?: {
|
|
48
|
+
model?: string;
|
|
49
|
+
}): Promise<CaseHistoryEntry[]>;
|
|
23
50
|
export declare function computeRegressions(current: BenchmarkResult, previous: BenchmarkResult): RegressionEntry[];
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
// ---------------------------------------------------------------------------
|
|
2
2
|
// benchmark-history.ts -- timestamped benchmark history with regression diffing
|
|
3
3
|
// ---------------------------------------------------------------------------
|
|
4
|
-
import { readdir, readFile, mkdir, writeFile } from "node:fs/promises";
|
|
4
|
+
import { readdir, readFile, mkdir, writeFile, unlink } from "node:fs/promises";
|
|
5
5
|
import { join } from "node:path";
|
|
6
6
|
import { writeBenchmark } from "./benchmark.js";
|
|
7
7
|
function toFilesafeTimestamp(iso) {
|
|
@@ -24,7 +24,18 @@ export async function writeHistoryEntry(skillDir, result) {
|
|
|
24
24
|
await writeBenchmark(skillDir, result);
|
|
25
25
|
return filename;
|
|
26
26
|
}
|
|
27
|
-
export async function
|
|
27
|
+
/**
 * Delete the history file recorded for `timestamp` under `{skillDir}/evals/history`.
 *
 * @param skillDir  Skill root directory that contains `evals/history`.
 * @param timestamp Timestamp of the run; turned into a file name via `toFilesafeTimestamp`.
 * @returns `true` when the file was removed; `false` when the timestamp is rejected
 *          or `unlink` fails for any reason (for example the file does not exist).
 */
export async function deleteHistoryEntry(skillDir, timestamp) {
    const historyDir = join(skillDir, "evals", "history");
    const filename = `${toFilesafeTimestamp(timestamp)}.json`;
    // Callers may pass a value taken from a request URL. Refuse anything that could
    // resolve outside historyDir (path separators, parent references, NUL bytes)
    // so this function can never unlink a file elsewhere on disk.
    if (/[\\/\0]/.test(filename) || filename.includes("..")) {
        return false;
    }
    try {
        await unlink(join(historyDir, filename));
        return true;
    }
    catch {
        // Best-effort delete: every failure is reported to the caller as "not deleted".
        return false;
    }
}
|
|
38
|
+
export async function listHistory(skillDir, filter) {
|
|
28
39
|
const historyDir = join(skillDir, "evals", "history");
|
|
29
40
|
let files;
|
|
30
41
|
try {
|
|
@@ -33,20 +44,50 @@ export async function listHistory(skillDir) {
|
|
|
33
44
|
catch {
|
|
34
45
|
return [];
|
|
35
46
|
}
|
|
47
|
+
let jsonFiles = files.filter((f) => f.endsWith(".json")).sort().reverse();
|
|
48
|
+
// Pre-filter by date range using filename timestamps (fast, no JSON parse)
|
|
49
|
+
if (filter?.from || filter?.to) {
|
|
50
|
+
const fromSafe = filter.from ? toFilesafeTimestamp(filter.from) : undefined;
|
|
51
|
+
const toSafe = filter.to ? toFilesafeTimestamp(filter.to) : undefined;
|
|
52
|
+
jsonFiles = jsonFiles.filter((f) => {
|
|
53
|
+
const ts = f.replace(/\.json$/, "");
|
|
54
|
+
if (fromSafe && ts < fromSafe)
|
|
55
|
+
return false;
|
|
56
|
+
if (toSafe && ts > toSafe)
|
|
57
|
+
return false;
|
|
58
|
+
return true;
|
|
59
|
+
});
|
|
60
|
+
}
|
|
36
61
|
const entries = [];
|
|
37
|
-
for (const file of
|
|
62
|
+
for (const file of jsonFiles) {
|
|
38
63
|
try {
|
|
39
64
|
const content = await readFile(join(historyDir, file), "utf-8");
|
|
40
65
|
const data = JSON.parse(content);
|
|
66
|
+
const entryType = data.type || "benchmark";
|
|
67
|
+
// Post-filter by model and type
|
|
68
|
+
if (filter?.model && data.model !== filter.model)
|
|
69
|
+
continue;
|
|
70
|
+
if (filter?.type && entryType !== filter.type)
|
|
71
|
+
continue;
|
|
41
72
|
const totalAssertions = data.cases.reduce((sum, c) => sum + c.assertions.length, 0);
|
|
42
73
|
const passedAssertions = data.cases.reduce((sum, c) => sum + c.assertions.filter((a) => a.pass).length, 0);
|
|
74
|
+
const totalDurationMs = data.cases.reduce((s, c) => s + (c.durationMs ?? 0), 0);
|
|
75
|
+
const hasTokens = data.cases.some((c) => c.tokens != null);
|
|
76
|
+
const totalTokens = hasTokens
|
|
77
|
+
? data.cases.reduce((s, c) => s + (c.tokens ?? 0), 0)
|
|
78
|
+
: null;
|
|
43
79
|
entries.push({
|
|
44
80
|
timestamp: fromFilesafeTimestamp(file),
|
|
45
81
|
filename: file,
|
|
46
82
|
model: data.model,
|
|
47
83
|
skillName: data.skill_name,
|
|
48
84
|
passRate: totalAssertions > 0 ? passedAssertions / totalAssertions : 0,
|
|
49
|
-
type:
|
|
85
|
+
type: entryType,
|
|
86
|
+
caseCount: data.cases.length,
|
|
87
|
+
totalDurationMs,
|
|
88
|
+
totalTokens,
|
|
89
|
+
provider: data.provider,
|
|
90
|
+
verdict: data.verdict,
|
|
50
91
|
});
|
|
51
92
|
}
|
|
52
93
|
catch {
|
|
@@ -66,6 +107,44 @@ export async function readHistoryEntry(skillDir, timestamp) {
|
|
|
66
107
|
return null;
|
|
67
108
|
}
|
|
68
109
|
}
|
|
110
|
+
/**
 * Gather the stored results of a single eval case across every history file.
 *
 * Files in `{skillDir}/evals/history` ending in `.json` are visited in reverse
 * lexicographic file-name order. Files that cannot be read or parsed, or whose
 * content does not have the expected shape, are skipped.
 *
 * @param skillDir Skill root directory that contains `evals/history`.
 * @param evalId   `eval_id` of the case to look up in each run.
 * @param filter   Optional; when `filter.model` is set, only runs with that model are kept.
 * @returns One entry per run that contains the case; `[]` if the directory cannot be listed.
 */
export async function getCaseHistory(skillDir, evalId, filter) {
    const historyDir = join(skillDir, "evals", "history");
    let names;
    try {
        names = await readdir(historyDir);
    }
    catch {
        return [];
    }
    const orderedNames = names.filter((name) => name.endsWith(".json"));
    orderedNames.sort();
    orderedNames.reverse();
    const history = [];
    for (const name of orderedNames) {
        try {
            const raw = await readFile(join(historyDir, name), "utf-8");
            const run = JSON.parse(raw);
            const modelAccepted = !(filter?.model && run.model !== filter.model);
            if (modelAccepted) {
                const hit = run.cases.find((candidate) => candidate.eval_id === evalId);
                if (hit) {
                    history.push({
                        timestamp: fromFilesafeTimestamp(name),
                        model: run.model,
                        type: run.type || "benchmark",
                        provider: run.provider,
                        pass_rate: hit.pass_rate,
                        durationMs: hit.durationMs,
                        tokens: hit.tokens,
                        inputTokens: hit.inputTokens,
                        outputTokens: hit.outputTokens,
                        assertions: hit.assertions,
                    });
                }
            }
        }
        catch {
            // Skip malformed files
        }
    }
    return history;
}
|
|
69
148
|
export function computeRegressions(current, previous) {
|
|
70
149
|
const regressions = [];
|
|
71
150
|
// Build a map of previous assertion results by eval_id + assertion_id
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"benchmark-history.js","sourceRoot":"","sources":["../../src/eval/benchmark-history.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,gFAAgF;AAChF,8EAA8E;AAE9E,OAAO,EAAE,OAAO,EAAE,QAAQ,EAAE,KAAK,EAAE,SAAS,EAAE,MAAM,kBAAkB,CAAC;
|
|
1
|
+
{"version":3,"file":"benchmark-history.js","sourceRoot":"","sources":["../../src/eval/benchmark-history.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,gFAAgF;AAChF,8EAA8E;AAE9E,OAAO,EAAE,OAAO,EAAE,QAAQ,EAAE,KAAK,EAAE,SAAS,EAAE,MAAM,EAAE,MAAM,kBAAkB,CAAC;AAC/E,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AAEjC,OAAO,EAAE,cAAc,EAAE,MAAM,gBAAgB,CAAC;AA6ChD,SAAS,mBAAmB,CAAC,GAAW;IACtC,OAAO,GAAG,CAAC,OAAO,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;AAChC,CAAC;AAED,SAAS,qBAAqB,CAAC,QAAgB;IAC7C,0CAA0C;IAC1C,MAAM,EAAE,GAAG,QAAQ,CAAC,OAAO,CAAC,SAAS,EAAE,EAAE,CAAC,CAAC;IAC3C,yDAAyD;IACzD,OAAO,EAAE,CAAC,OAAO,CAAC,0BAA0B,EAAE,WAAW,CAAC,CAAC;AAC7D,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,iBAAiB,CACrC,QAAgB,EAChB,MAA4E;IAE5E,MAAM,UAAU,GAAG,IAAI,CAAC,QAAQ,EAAE,OAAO,EAAE,SAAS,CAAC,CAAC;IACtD,MAAM,KAAK,CAAC,UAAU,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAE7C,MAAM,SAAS,GAAG,MAAM,CAAC,SAAS,IAAI,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;IAC/D,MAAM,QAAQ,GAAG,GAAG,mBAAmB,CAAC,SAAS,CAAC,OAAO,CAAC;IAC1D,MAAM,QAAQ,GAAG,IAAI,CAAC,UAAU,EAAE,QAAQ,CAAC,CAAC;IAE5C,MAAM,SAAS,CAAC,QAAQ,EAAE,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;IAE3D,uDAAuD;IACvD,MAAM,cAAc,CAAC,QAAQ,EAAE,MAAM,CAAC,CAAC;IAEvC,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,kBAAkB,CACtC,QAAgB,EAChB,SAAiB;IAEjB,MAAM,UAAU,GAAG,IAAI,CAAC,QAAQ,EAAE,OAAO,EAAE,SAAS,CAAC,CAAC;IACtD,MAAM,QAAQ,GAAG,GAAG,mBAAmB,CAAC,SAAS,CAAC,OAAO,CAAC;IAC1D,IAAI,CAAC;QACH,MAAM,MAAM,CAAC,IAAI,CAAC,UAAU,EAAE,QAAQ,CAAC,CAAC,CAAC;QACzC,OAAO,IAAI,CAAC;IACd,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,KAAK,CAAC;IACf,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,WAAW,CAC/B,QAAgB,EAChB,MAAsB;IAEtB,MAAM,UAAU,GAAG,IAAI,CAAC,QAAQ,EAAE,OAAO,EAAE,SAAS,CAAC,CAAC;IACtD,IAAI,KAAe,CAAC;IACpB,IAAI,CAAC;QACH,KAAK,GAAG,MAAM,OAAO,CAAC,UAAU,CAAC,CAAC;IACpC,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,EAAE,CAAC;IACZ,CAAC;IAED,IAAI,SAAS,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,OAAO,EAAE,CAAC;IAE1E,2EAA2E;IAC3E,IAAI,MAAM,EAAE,IAAI,IAAI,MAAM,EAAE,EAAE,EAAE,CAAC;QAC/B,MAAM,QAA
Q,GAAG,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,mBAAmB,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;QAC5E,MAAM,MAAM,GAAG,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC,mBAAmB,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;QACtE,SAAS,GAAG,SAAS,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE;YACjC,MAAM,EAAE,GAAG,CAAC,CAAC,OAAO,CAAC,SAAS,EAAE,EAAE,CAAC,CAAC;YACpC,IAAI,QAAQ,IAAI,EAAE,GAAG,QAAQ;gBAAE,OAAO,KAAK,CAAC;YAC5C,IAAI,MAAM,IAAI,EAAE,GAAG,MAAM;gBAAE,OAAO,KAAK,CAAC;YACxC,OAAO,IAAI,CAAC;QACd,CAAC,CAAC,CAAC;IACL,CAAC;IAED,MAAM,OAAO,GAAqB,EAAE,CAAC;IACrC,KAAK,MAAM,IAAI,IAAI,SAAS,EAAE,CAAC;QAC7B,IAAI,CAAC;YACH,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,IAAI,CAAC,UAAU,EAAE,IAAI,CAAC,EAAE,OAAO,CAAC,CAAC;YAChE,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAwC,CAAC;YAExE,MAAM,SAAS,GAAI,IAAI,CAAC,IAA+B,IAAI,WAAW,CAAC;YAEvE,gCAAgC;YAChC,IAAI,MAAM,EAAE,KAAK,IAAI,IAAI,CAAC,KAAK,KAAK,MAAM,CAAC,KAAK;gBAAE,SAAS;YAC3D,IAAI,MAAM,EAAE,IAAI,IAAI,SAAS,KAAK,MAAM,CAAC,IAAI;gBAAE,SAAS;YAExD,MAAM,eAAe,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,UAAU,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;YACpF,MAAM,gBAAgB,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,CACxC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,MAAM,EAC3D,CAAC,CACF,CAAC;YACF,MAAM,eAAe,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,UAAU,IAAI,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;YAChF,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,IAAI,IAAI,CAAC,CAAC;YAC3D,MAAM,WAAW,GAAG,SAAS;gBAC3B,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,MAAM,IAAI,CAAC,CAAC,EAAE,CAAC,CAAC;gBACrD,CAAC,CAAC,IAAI,CAAC;YAET,OAAO,CAAC,IAAI,CAAC;gBACX,SAAS,EAAE,qBAAqB,CAAC,IAAI,CAAC;gBACtC,QAAQ,EAAE,IAAI;gBACd,KAAK,EAAE,IAAI,CAAC,KAAK;gBACjB,SAAS,EAAE,IAAI,CAAC,UAAU;gBAC1B,QAAQ,EAAE,eAAe,GAAG,CAAC,CAAC,CAAC,CAAC,gBAAgB,GAAG,eAAe,CAAC,CAAC,CAAC,CAAC;gBACtE,IAAI,EAAE,SAAS;gBACf,SAAS,EAAE,
IAAI,CAAC,KAAK,CAAC,MAAM;gBAC5B,eAAe;gBACf,WAAW;gBACX,QAAQ,EAAE,IAAI,CAAC,QAAQ;gBACvB,OAAO,EAAE,IAAI,CAAC,OAAO;aACtB,CAAC,CAAC;QACL,CAAC;QAAC,MAAM,CAAC;YACP,uBAAuB;QACzB,CAAC;IACH,CAAC;IACD,OAAO,OAAO,CAAC;AACjB,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,gBAAgB,CACpC,QAAgB,EAChB,SAAiB;IAEjB,MAAM,UAAU,GAAG,IAAI,CAAC,QAAQ,EAAE,OAAO,EAAE,SAAS,CAAC,CAAC;IACtD,MAAM,QAAQ,GAAG,GAAG,mBAAmB,CAAC,SAAS,CAAC,OAAO,CAAC;IAC1D,IAAI,CAAC;QACH,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,IAAI,CAAC,UAAU,EAAE,QAAQ,CAAC,EAAE,OAAO,CAAC,CAAC;QACpE,OAAO,IAAI,CAAC,KAAK,CAAC,OAAO,CAAoB,CAAC;IAChD,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,cAAc,CAClC,QAAgB,EAChB,MAAc,EACd,MAA2B;IAE3B,MAAM,UAAU,GAAG,IAAI,CAAC,QAAQ,EAAE,OAAO,EAAE,SAAS,CAAC,CAAC;IACtD,IAAI,KAAe,CAAC;IACpB,IAAI,CAAC;QACH,KAAK,GAAG,MAAM,OAAO,CAAC,UAAU,CAAC,CAAC;IACpC,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,EAAE,CAAC;IACZ,CAAC;IAED,MAAM,OAAO,GAAuB,EAAE,CAAC;IACvC,KAAK,MAAM,IAAI,IAAI,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,OAAO,EAAE,EAAE,CAAC;QAC7E,IAAI,CAAC;YACH,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,IAAI,CAAC,UAAU,EAAE,IAAI,CAAC,EAAE,OAAO,CAAC,CAAC;YAChE,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAwC,CAAC;YAExE,IAAI,MAAM,EAAE,KAAK,IAAI,IAAI,CAAC,KAAK,KAAK,MAAM,CAAC,KAAK;gBAAE,SAAS;YAE3D,MAAM,YAAY,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,KAAK,MAAM,CAAC,CAAC;YAClE,IAAI,CAAC,YAAY;gBAAE,SAAS;YAE5B,OAAO,CAAC,IAAI,CAAC;gBACX,SAAS,EAAE,qBAAqB,CAAC,IAAI,CAAC;gBACtC,KAAK,EAAE,IAAI,CAAC,KAAK;gBACjB,IAAI,EAAG,IAAI,CAAC,IAAiC,IAAI,WAAW;gBAC5D,QAAQ,EAAE,IAAI,CAAC,QAAQ;gBACvB,SAAS,EAAE,YAAY,CAAC,SAAS;gBACjC,UAAU,EAAE,YAAY,CAAC,UAAU;gBACnC,MAAM,EAAE,YAAY,CAAC,MAAM;gBAC3B,WAAW,EAAE,YAAY,CAAC,WAAW;gBACrC,YAAY,EAAE,YAAY,CAAC,YAAY;gBACvC,UAAU,EAAE,YAAY,CAAC,UAAU;aACpC,CAAC,CAAC;QACL,CAAC;QAAC,MAAM,CAAC;YACP,uBAAuB;QACzB,CAAC;IACH,CAAC;IACD,OAAO,OAAO,CAAC;AACjB,CAAC;AAED,MAAM,UAAU,kBAAkB,CAChC,OAAwB,EACxB,QAAyB;IAEzB,MAAM,WAAW,GAAsB,EAAE,CAAC;IAE1C,sEAAsE;IACtE,MA
AM,OAAO,GAAG,IAAI,GAAG,EAAmB,CAAC;IAC3C,KAAK,MAAM,CAAC,IAAI,QAAQ,CAAC,KAAK,EAAE,CAAC;QAC/B,KAAK,MAAM,CAAC,IAAI,CAAC,CAAC,UAAU,EAAE,CAAC;YAC7B,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,OAAO,IAAI,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC;QAC9C,CAAC;IACH,CAAC;IAED,KAAK,MAAM,CAAC,IAAI,OAAO,CAAC,KAAK,EAAE,CAAC;QAC9B,KAAK,MAAM,CAAC,IAAI,CAAC,CAAC,UAAU,EAAE,CAAC;YAC7B,MAAM,GAAG,GAAG,GAAG,CAAC,CAAC,OAAO,IAAI,CAAC,CAAC,EAAE,EAAE,CAAC;YACnC,MAAM,IAAI,GAAG,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;YAC9B,IAAI,IAAI,KAAK,SAAS;gBAAE,SAAS,CAAC,sBAAsB;YAExD,IAAI,IAAI,IAAI,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;gBACpB,WAAW,CAAC,IAAI,CAAC;oBACf,WAAW,EAAE,CAAC,CAAC,EAAE;oBACjB,MAAM,EAAE,CAAC,CAAC,OAAO;oBACjB,QAAQ,EAAE,CAAC,CAAC,SAAS;oBACrB,cAAc,EAAE,IAAI;oBACpB,aAAa,EAAE,KAAK;oBACpB,MAAM,EAAE,YAAY;iBACrB,CAAC,CAAC;YACL,CAAC;iBAAM,IAAI,CAAC,IAAI,IAAI,CAAC,CAAC,IAAI,EAAE,CAAC;gBAC3B,WAAW,CAAC,IAAI,CAAC;oBACf,WAAW,EAAE,CAAC,CAAC,EAAE;oBACjB,MAAM,EAAE,CAAC,CAAC,OAAO;oBACjB,QAAQ,EAAE,CAAC,CAAC,SAAS;oBACrB,cAAc,EAAE,KAAK;oBACrB,aAAa,EAAE,IAAI;oBACnB,MAAM,EAAE,aAAa;iBACtB,CAAC,CAAC;YACL,CAAC;QACH,CAAC;IACH,CAAC;IAED,OAAO,WAAW,CAAC;AACrB,CAAC"}
|
package/dist/eval/benchmark.d.ts
CHANGED
|
@@ -4,6 +4,21 @@ export interface BenchmarkAssertionResult {
|
|
|
4
4
|
pass: boolean;
|
|
5
5
|
reasoning: string;
|
|
6
6
|
}
|
|
7
|
+
/**
 * Side-by-side measurements for one eval case: the run with the skill versus
 * the baseline run. Attached to a case through `BenchmarkCase.comparisonDetail`.
 */
export interface ComparisonCaseDetail {
|
|
8
|
+
skillDurationMs: number; // duration of the skill run (milliseconds, going by the field name)
|
|
9
|
+
skillTokens: number | null; // token count of the skill run; null is allowed by the type (presumably "not reported" — confirm with the producer)
|
|
10
|
+
skillInputTokens?: number | null;
|
|
11
|
+
skillOutputTokens?: number | null;
|
|
12
|
+
baselineDurationMs: number; // duration of the baseline run (milliseconds, going by the field name)
|
|
13
|
+
baselineTokens: number | null; // token count of the baseline run; null is allowed by the type
|
|
14
|
+
baselineInputTokens?: number | null;
|
|
15
|
+
baselineOutputTokens?: number | null;
|
|
16
|
+
skillContentScore: number; // NOTE(review): score scale is not visible in this declaration — confirm with the comparator
|
|
17
|
+
skillStructureScore: number;
|
|
18
|
+
baselineContentScore: number;
|
|
19
|
+
baselineStructureScore: number;
|
|
20
|
+
winner: "skill" | "baseline" | "tie"; // which side won this case, or "tie"
|
|
21
|
+
}
|
|
7
22
|
export interface BenchmarkCase {
|
|
8
23
|
eval_id: number;
|
|
9
24
|
eval_name: string;
|
|
@@ -12,7 +27,11 @@ export interface BenchmarkCase {
|
|
|
12
27
|
pass_rate: number;
|
|
13
28
|
durationMs?: number;
|
|
14
29
|
tokens?: number | null;
|
|
30
|
+
inputTokens?: number | null;
|
|
31
|
+
outputTokens?: number | null;
|
|
32
|
+
output?: string;
|
|
15
33
|
assertions: BenchmarkAssertionResult[];
|
|
34
|
+
comparisonDetail?: ComparisonCaseDetail;
|
|
16
35
|
}
|
|
17
36
|
export interface BenchmarkResult {
|
|
18
37
|
timestamp: string;
|
|
@@ -20,6 +39,19 @@ export interface BenchmarkResult {
|
|
|
20
39
|
skill_name: string;
|
|
21
40
|
cases: BenchmarkCase[];
|
|
22
41
|
overall_pass_rate?: number;
|
|
42
|
+
type?: "benchmark" | "comparison" | "baseline";
|
|
43
|
+
provider?: string;
|
|
44
|
+
totalDurationMs?: number;
|
|
45
|
+
totalInputTokens?: number | null;
|
|
46
|
+
totalOutputTokens?: number | null;
|
|
47
|
+
verdict?: string;
|
|
48
|
+
comparison?: {
|
|
49
|
+
skillPassRate: number;
|
|
50
|
+
baselinePassRate: number;
|
|
51
|
+
skillRubricAvg: number;
|
|
52
|
+
baselineRubricAvg: number;
|
|
53
|
+
delta: number;
|
|
54
|
+
};
|
|
23
55
|
}
|
|
24
56
|
export declare function writeBenchmark(skillDir: string, result: BenchmarkResult): Promise<void>;
|
|
25
57
|
export declare function readBenchmark(skillDir: string): Promise<BenchmarkResult | null>;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"benchmark.js","sourceRoot":"","sources":["../../src/eval/benchmark.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,4BAA4B;AAC5B,8EAA8E;AAE9E,OAAO,EAAE,aAAa,EAAE,YAAY,EAAE,UAAU,EAAE,SAAS,EAAE,MAAM,SAAS,CAAC;AAC7E,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;
|
|
1
|
+
{"version":3,"file":"benchmark.js","sourceRoot":"","sources":["../../src/eval/benchmark.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,4BAA4B;AAC5B,8EAA8E;AAE9E,OAAO,EAAE,aAAa,EAAE,YAAY,EAAE,UAAU,EAAE,SAAS,EAAE,MAAM,SAAS,CAAC;AAC7E,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AA6DjC,MAAM,CAAC,KAAK,UAAU,cAAc,CAClC,QAAgB,EAChB,MAAuB;IAEvB,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;IACzC,SAAS,CAAC,QAAQ,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IACzC,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,EAAE,gBAAgB,CAAC,CAAC;IAClD,aAAa,CAAC,QAAQ,EAAE,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC;AACpE,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,QAAgB;IAEhB,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,EAAE,OAAO,EAAE,gBAAgB,CAAC,CAAC;IAC3D,IAAI,CAAC,UAAU,CAAC,QAAQ,CAAC;QAAE,OAAO,IAAI,CAAC;IAEvC,IAAI,CAAC;QACH,MAAM,GAAG,GAAG,YAAY,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;QAC5C,OAAO,IAAI,CAAC,KAAK,CAAC,GAAG,CAAoB,CAAC;IAC5C,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC"}
|
|
@@ -5,14 +5,16 @@ import { readFileSync, writeFileSync, mkdirSync, existsSync } from "node:fs";
|
|
|
5
5
|
import { join } from "node:path";
|
|
6
6
|
import { sendJson, readBody } from "./router.js";
|
|
7
7
|
import { initSSE, sendSSE, sendSSEDone } from "./sse-helpers.js";
|
|
8
|
+
import { runBenchmarkSSE } from "./benchmark-runner.js";
|
|
8
9
|
import { scanSkills } from "../eval/skill-scanner.js";
|
|
9
10
|
import { loadAndValidateEvals, EvalValidationError } from "../eval/schema.js";
|
|
10
11
|
import { readBenchmark } from "../eval/benchmark.js";
|
|
11
|
-
import { writeHistoryEntry, listHistory, readHistoryEntry } from "../eval/benchmark-history.js";
|
|
12
|
+
import { writeHistoryEntry, listHistory, readHistoryEntry, computeRegressions, deleteHistoryEntry, getCaseHistory } from "../eval/benchmark-history.js";
|
|
12
13
|
import { judgeAssertion } from "../eval/judge.js";
|
|
13
14
|
import { createLlmClient } from "../eval/llm.js";
|
|
14
15
|
import { runComparison } from "../eval/comparator.js";
|
|
15
16
|
import { computeVerdict } from "../eval/verdict.js";
|
|
17
|
+
import { buildEvalInitPrompt, parseGeneratedEvals } from "../eval/prompt-builder.js";
|
|
16
18
|
import { testActivation } from "../eval/activation-tester.js";
|
|
17
19
|
function resolveSkillDir(root, plugin, skill) {
|
|
18
20
|
// Try direct layout: {root}/{plugin}/skills/{skill}/
|
|
@@ -227,12 +229,31 @@ export function registerRoutes(router, root, projectName) {
|
|
|
227
229
|
writeFileSync(filePath, JSON.stringify(body, null, 2), "utf-8");
|
|
228
230
|
sendJson(res, body, 200, req);
|
|
229
231
|
});
|
|
232
|
+
// Generate evals using AI — reads SKILL.md and returns generated EvalsFile
|
|
233
|
+
// POST /api/skills/:plugin/:skill/generate-evals
// Reads the skill's SKILL.md, asks the LLM client to generate eval cases from it,
// and responds with the parsed result. Responds 400 when SKILL.md is missing and
// 500 when reading, generation, or parsing fails.
router.post("/api/skills/:plugin/:skill/generate-evals", async (req, res, params) => {
    const skillDir = resolveSkillDir(root, params.plugin, params.skill);
    const skillMdPath = join(skillDir, "SKILL.md");
    if (!existsSync(skillMdPath)) {
        sendJson(res, { error: "SKILL.md not found — cannot generate evals without skill content" }, 400, req);
        return;
    }
    try {
        const skillContent = readFileSync(skillMdPath, "utf-8");
        const prompt = buildEvalInitPrompt(skillContent);
        const client = getClient();
        const genResult = await client.generate("You generate eval test cases for AI skills. Output only valid JSON in a code fence.", prompt);
        const evalsFile = parseGeneratedEvals(genResult.text);
        sendJson(res, evalsFile, 200, req);
    }
    catch (err) {
        // A thrown value is not guaranteed to be an Error; normalise it the same way
        // the other route handlers do so the message never reads "undefined".
        const errorMsg = err instanceof Error ? err.message : String(err);
        sendJson(res, { error: `Eval generation failed: ${errorMsg}` }, 500, req);
    }
});
|
|
230
252
|
// Run benchmark (SSE) — optionally accepts { eval_ids: number[] } to run specific cases
|
|
231
253
|
router.post("/api/skills/:plugin/:skill/benchmark", async (req, res, params) => {
|
|
232
254
|
const skillDir = resolveSkillDir(root, params.plugin, params.skill);
|
|
233
255
|
let aborted = false;
|
|
234
256
|
res.on("close", () => { aborted = true; });
|
|
235
|
-
// Read body before switching to SSE mode
|
|
236
257
|
const body = await readBody(req).catch(() => ({}));
|
|
237
258
|
const filterIds = Array.isArray(body?.eval_ids) ? new Set(body.eval_ids) : null;
|
|
238
259
|
initSSE(res, req);
|
|
@@ -244,101 +265,34 @@ export function registerRoutes(router, root, projectName) {
|
|
|
244
265
|
const systemPrompt = skillContent
|
|
245
266
|
? `You are an AI assistant enhanced with the following skill:\n\n${skillContent}`
|
|
246
267
|
: "You are a helpful AI assistant.";
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
: evals.evals
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
break;
|
|
276
|
-
const result = await judgeAssertion(genResult.text, assertion, client);
|
|
277
|
-
assertionResults.push(result);
|
|
278
|
-
sendSSE(res, "assertion_result", {
|
|
279
|
-
eval_id: evalCase.id,
|
|
280
|
-
assertion_id: result.id,
|
|
281
|
-
text: result.text,
|
|
282
|
-
pass: result.pass,
|
|
283
|
-
reasoning: result.reasoning,
|
|
284
|
-
});
|
|
285
|
-
}
|
|
286
|
-
const passRate = assertionResults.length > 0
|
|
287
|
-
? assertionResults.filter((a) => a.pass).length / assertionResults.length
|
|
288
|
-
: 0;
|
|
289
|
-
const status = assertionResults.every((a) => a.pass) ? "pass" : "fail";
|
|
290
|
-
const benchCase = {
|
|
291
|
-
eval_id: evalCase.id,
|
|
292
|
-
eval_name: evalCase.name,
|
|
293
|
-
status: status,
|
|
294
|
-
error_message: null,
|
|
295
|
-
pass_rate: passRate,
|
|
296
|
-
durationMs: genResult.durationMs,
|
|
297
|
-
tokens: totalTokens,
|
|
298
|
-
assertions: assertionResults,
|
|
299
|
-
};
|
|
300
|
-
cases.push(benchCase);
|
|
301
|
-
sendSSE(res, "case_complete", {
|
|
302
|
-
eval_id: evalCase.id,
|
|
303
|
-
status,
|
|
304
|
-
pass_rate: passRate,
|
|
305
|
-
durationMs: genResult.durationMs,
|
|
306
|
-
tokens: totalTokens,
|
|
307
|
-
});
|
|
308
|
-
}
|
|
309
|
-
catch (err) {
|
|
310
|
-
const errorMsg = err instanceof Error ? err.message : String(err);
|
|
311
|
-
cases.push({
|
|
312
|
-
eval_id: evalCase.id,
|
|
313
|
-
eval_name: evalCase.name,
|
|
314
|
-
status: "error",
|
|
315
|
-
error_message: errorMsg,
|
|
316
|
-
pass_rate: 0,
|
|
317
|
-
assertions: [],
|
|
318
|
-
});
|
|
319
|
-
sendSSE(res, "case_complete", {
|
|
320
|
-
eval_id: evalCase.id,
|
|
321
|
-
status: "error",
|
|
322
|
-
error_message: errorMsg,
|
|
323
|
-
});
|
|
324
|
-
}
|
|
325
|
-
}
|
|
326
|
-
const totalAssertions = cases.reduce((s, c) => s + c.assertions.length, 0);
|
|
327
|
-
const passedAssertions = cases.reduce((s, c) => s + c.assertions.filter((a) => a.pass).length, 0);
|
|
328
|
-
const result = {
|
|
329
|
-
timestamp: new Date().toISOString(),
|
|
330
|
-
model: client.model,
|
|
331
|
-
skill_name: evals.skill_name,
|
|
332
|
-
cases,
|
|
333
|
-
overall_pass_rate: totalAssertions > 0 ? passedAssertions / totalAssertions : 0,
|
|
334
|
-
};
|
|
335
|
-
if (!aborted) {
|
|
336
|
-
// Only save to history for full benchmark runs (not single-case)
|
|
337
|
-
if (!filterIds) {
|
|
338
|
-
await writeHistoryEntry(skillDir, result);
|
|
339
|
-
}
|
|
340
|
-
sendSSEDone(res, result);
|
|
341
|
-
}
|
|
268
|
+
await runBenchmarkSSE({
|
|
269
|
+
res, skillDir, skillName: evals.skill_name, systemPrompt,
|
|
270
|
+
runType: "benchmark", provider: currentOverrides.provider || "claude-cli",
|
|
271
|
+
evalCases: evals.evals, filterIds, client, isAborted: () => aborted,
|
|
272
|
+
});
|
|
273
|
+
}
|
|
274
|
+
catch (err) {
|
|
275
|
+
const errorMsg = err instanceof Error ? err.message : String(err);
|
|
276
|
+
sendSSEDone(res, { error: errorMsg });
|
|
277
|
+
}
|
|
278
|
+
});
|
|
279
|
+
// Run baseline (SSE) — same as benchmark but without skill content
|
|
280
|
+
router.post("/api/skills/:plugin/:skill/baseline", async (req, res, params) => {
|
|
281
|
+
const skillDir = resolveSkillDir(root, params.plugin, params.skill);
|
|
282
|
+
let aborted = false;
|
|
283
|
+
res.on("close", () => { aborted = true; });
|
|
284
|
+
const body = await readBody(req).catch(() => ({}));
|
|
285
|
+
const filterIds = Array.isArray(body?.eval_ids) ? new Set(body.eval_ids) : null;
|
|
286
|
+
initSSE(res, req);
|
|
287
|
+
try {
|
|
288
|
+
const evals = loadAndValidateEvals(skillDir);
|
|
289
|
+
const client = getClient();
|
|
290
|
+
await runBenchmarkSSE({
|
|
291
|
+
res, skillDir, skillName: evals.skill_name,
|
|
292
|
+
systemPrompt: "You are a helpful AI assistant.",
|
|
293
|
+
runType: "baseline", provider: currentOverrides.provider || "claude-cli",
|
|
294
|
+
evalCases: evals.evals, filterIds, client, isAborted: () => aborted,
|
|
295
|
+
});
|
|
342
296
|
}
|
|
343
297
|
catch (err) {
|
|
344
298
|
const errorMsg = err instanceof Error ? err.message : String(err);
|
|
@@ -435,7 +389,20 @@ export function registerRoutes(router, root, projectName) {
|
|
|
435
389
|
pass_rate: r.assertionResults.length > 0
|
|
436
390
|
? r.assertionResults.filter((a) => a.pass).length / r.assertionResults.length
|
|
437
391
|
: 0,
|
|
392
|
+
durationMs: r.comparison.skillDurationMs,
|
|
393
|
+
tokens: r.comparison.skillTokens,
|
|
438
394
|
assertions: r.assertionResults,
|
|
395
|
+
comparisonDetail: {
|
|
396
|
+
skillDurationMs: r.comparison.skillDurationMs,
|
|
397
|
+
skillTokens: r.comparison.skillTokens,
|
|
398
|
+
baselineDurationMs: r.comparison.baselineDurationMs,
|
|
399
|
+
baselineTokens: r.comparison.baselineTokens,
|
|
400
|
+
skillContentScore: r.comparison.skillContentScore,
|
|
401
|
+
skillStructureScore: r.comparison.skillStructureScore,
|
|
402
|
+
baselineContentScore: r.comparison.baselineContentScore,
|
|
403
|
+
baselineStructureScore: r.comparison.baselineStructureScore,
|
|
404
|
+
winner: r.comparison.winner,
|
|
405
|
+
},
|
|
439
406
|
}));
|
|
440
407
|
const historyResult = {
|
|
441
408
|
timestamp: new Date().toISOString(),
|
|
@@ -444,6 +411,7 @@ export function registerRoutes(router, root, projectName) {
|
|
|
444
411
|
cases,
|
|
445
412
|
overall_pass_rate: passRate,
|
|
446
413
|
type: "comparison",
|
|
414
|
+
provider: currentOverrides.provider || "claude-cli",
|
|
447
415
|
verdict,
|
|
448
416
|
comparison: {
|
|
449
417
|
skillPassRate: passRate,
|
|
@@ -461,12 +429,100 @@ export function registerRoutes(router, root, projectName) {
|
|
|
461
429
|
sendSSEDone(res, { error: err instanceof Error ? err.message : String(err) });
|
|
462
430
|
}
|
|
463
431
|
});
|
|
464
|
-
// List benchmark history
|
|
432
|
+
// List benchmark history (with optional filters)
|
|
465
433
|
// GET /api/skills/:plugin/:skill/history
// Lists history summaries. Optional query parameters: model, type (one of
// "benchmark", "comparison", "baseline"; other values are ignored), from, to.
router.get("/api/skills/:plugin/:skill/history", async (req, res, params) => {
    const skillDir = resolveSkillDir(root, params.plugin, params.skill);
    const query = new URL(req.url, `http://localhost`).searchParams;
    const requestedModel = query.get("model");
    const requestedType = query.get("type");
    const requestedFrom = query.get("from");
    const requestedTo = query.get("to");
    const filter = {};
    if (requestedModel) {
        filter.model = requestedModel;
    }
    const typeIsKnown = ["benchmark", "comparison", "baseline"].includes(requestedType);
    if (requestedType && typeIsKnown) {
        filter.type = requestedType;
    }
    if (requestedFrom) {
        filter.from = requestedFrom;
    }
    if (requestedTo) {
        filter.to = requestedTo;
    }
    // Pass undefined rather than an empty object when no criterion was supplied.
    const effectiveFilter = Object.keys(filter).length > 0 ? filter : undefined;
    const history = await listHistory(skillDir, effectiveFilter);
    sendJson(res, history, 200, req);
});
|
|
454
|
+
// Compare two history runs
|
|
455
|
+
// GET /api/skills/:plugin/:skill/history-compare?a=<timestamp>&b=<timestamp>
// Compares two stored runs. Responds 400 when either timestamp is missing and
// 404 when either run cannot be read. On success the payload holds a summary of
// each run, the regressions of run B relative to run A, and a per-case diff.
router.get("/api/skills/:plugin/:skill/history-compare", async (req, res, params) => {
    const skillDir = resolveSkillDir(root, params.plugin, params.skill);
    const url = new URL(req.url, `http://localhost`);
    const tsA = url.searchParams.get("a");
    const tsB = url.searchParams.get("b");
    if (!tsA || !tsB) {
        sendJson(res, { error: "Both 'a' and 'b' timestamps are required" }, 400, req);
        return;
    }
    const [runA, runB] = await Promise.all([
        readHistoryEntry(skillDir, tsA),
        readHistoryEntry(skillDir, tsB),
    ]);
    if (!runA || !runB) {
        sendJson(res, { error: "One or both history entries not found" }, 404, req);
        return;
    }
    // Run B is treated as "current" and run A as "previous".
    const regressions = computeRegressions(runB, runA);
    // Index each run's cases by eval_id once instead of scanning both case lists
    // for every id. The first occurrence wins, matching a left-to-right search.
    const indexCases = (run) => {
        const byId = new Map();
        for (const c of run.cases) {
            if (!byId.has(c.eval_id)) {
                byId.set(c.eval_id, c);
            }
        }
        return byId;
    };
    // Share of passing assertions across all cases of a run; 0 when there are none.
    const overallPassRate = (run) => {
        let total = 0;
        let passed = 0;
        for (const c of run.cases) {
            total += c.assertions.length;
            passed += c.assertions.filter((a) => a.pass).length;
        }
        return total > 0 ? passed / total : 0;
    };
    const casesA = indexCases(runA);
    const casesB = indexCases(runB);
    // Ids from run A first (in their original order), then ids that only appear in run B.
    const allEvalIds = new Set([...casesA.keys(), ...casesB.keys()]);
    const caseDiffs = Array.from(allEvalIds, (evalId) => {
        const caseA = casesA.get(evalId);
        const caseB = casesB.get(evalId);
        return {
            eval_id: evalId,
            eval_name: caseA?.eval_name || caseB?.eval_name || `Eval #${evalId}`,
            statusA: caseA?.status ?? "missing",
            statusB: caseB?.status ?? "missing",
            passRateA: caseA?.pass_rate ?? null,
            passRateB: caseB?.pass_rate ?? null,
            durationMsA: caseA?.durationMs ?? null,
            durationMsB: caseB?.durationMs ?? null,
            tokensA: caseA?.tokens ?? null,
            tokensB: caseB?.tokens ?? null,
        };
    });
    sendJson(res, {
        runA: {
            timestamp: runA.timestamp, model: runA.model,
            passRate: overallPassRate(runA),
            type: runA.type || "benchmark",
        },
        runB: {
            timestamp: runB.timestamp, model: runB.model,
            passRate: overallPassRate(runB),
            type: runB.type || "benchmark",
        },
        regressions,
        caseDiffs,
    }, 200, req);
});
|
|
513
|
+
// Per-case history
|
|
514
|
+
// GET /api/skills/:plugin/:skill/history/case/:evalId[?model=<model>]
// Returns the stored results of one eval case across history runs.
// Responds 400 when :evalId is not a plain integer.
router.get("/api/skills/:plugin/:skill/history/case/:evalId", async (req, res, params) => {
    const skillDir = resolveSkillDir(root, params.plugin, params.skill);
    // parseInt alone accepts trailing junk ("12abc" -> 12), which would silently
    // query a different case than the one requested; require the whole segment
    // to be an integer before converting it.
    const rawEvalId = String(params.evalId);
    const evalId = /^-?\d+$/.test(rawEvalId) ? parseInt(rawEvalId, 10) : NaN;
    if (!Number.isSafeInteger(evalId)) {
        sendJson(res, { error: "Invalid eval ID" }, 400, req);
        return;
    }
    const url = new URL(req.url, `http://localhost`);
    const modelParam = url.searchParams.get("model") || undefined;
    const entries = await getCaseHistory(skillDir, evalId, modelParam ? { model: modelParam } : undefined);
    sendJson(res, entries, 200, req);
});
|
|
470
526
|
// Get specific history entry
|
|
471
527
|
router.get("/api/skills/:plugin/:skill/history/:timestamp", async (req, res, params) => {
|
|
472
528
|
const skillDir = resolveSkillDir(root, params.plugin, params.skill);
|
|
@@ -477,6 +533,16 @@ export function registerRoutes(router, root, projectName) {
|
|
|
477
533
|
}
|
|
478
534
|
sendJson(res, entry, 200, req);
|
|
479
535
|
});
|
|
536
|
+
// Delete history entry
|
|
537
|
+
// DELETE /api/skills/:plugin/:skill/history/:timestamp
// Removes one stored run; answers { ok: true } on success and 404 otherwise.
router.delete("/api/skills/:plugin/:skill/history/:timestamp", async (req, res, params) => {
    const skillDir = resolveSkillDir(root, params.plugin, params.skill);
    const wasRemoved = await deleteHistoryEntry(skillDir, params.timestamp);
    if (wasRemoved) {
        sendJson(res, { ok: true }, 200, req);
    }
    else {
        sendJson(res, { error: "History entry not found" }, 404, req);
    }
});
|
|
480
546
|
// Get latest benchmark
|
|
481
547
|
router.get("/api/skills/:plugin/:skill/benchmark/latest", async (req, res, params) => {
|
|
482
548
|
const skillDir = resolveSkillDir(root, params.plugin, params.skill);
|