vskill 0.2.75 → 0.2.76

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,11 +1,34 @@
1
- import type { BenchmarkResult } from "./benchmark.js";
1
+ import type { BenchmarkResult, BenchmarkAssertionResult } from "./benchmark.js";
2
2
  export interface HistorySummary {
3
3
  timestamp: string;
4
4
  filename: string;
5
5
  model: string;
6
6
  skillName: string;
7
7
  passRate: number;
8
- type: "benchmark" | "comparison";
8
+ type: "benchmark" | "comparison" | "baseline";
9
+ caseCount: number;
10
+ totalDurationMs: number;
11
+ totalTokens: number | null;
12
+ provider?: string;
13
+ verdict?: string;
14
+ }
15
+ export interface HistoryFilter {
16
+ model?: string;
17
+ type?: "benchmark" | "comparison" | "baseline";
18
+ from?: string;
19
+ to?: string;
20
+ }
21
+ export interface CaseHistoryEntry {
22
+ timestamp: string;
23
+ model: string;
24
+ type: "benchmark" | "comparison" | "baseline";
25
+ provider?: string;
26
+ pass_rate: number;
27
+ durationMs?: number;
28
+ tokens?: number | null;
29
+ inputTokens?: number | null;
30
+ outputTokens?: number | null;
31
+ assertions: BenchmarkAssertionResult[];
9
32
  }
10
33
  export interface RegressionEntry {
11
34
  assertionId: string;
@@ -16,8 +39,12 @@ export interface RegressionEntry {
16
39
  change: "regression" | "improvement";
17
40
  }
18
41
  export declare function writeHistoryEntry(skillDir: string, result: BenchmarkResult & {
19
- type?: "benchmark" | "comparison";
42
+ type?: "benchmark" | "comparison" | "baseline";
20
43
  }): Promise<string>;
21
- export declare function listHistory(skillDir: string): Promise<HistorySummary[]>;
44
+ export declare function deleteHistoryEntry(skillDir: string, timestamp: string): Promise<boolean>;
45
+ export declare function listHistory(skillDir: string, filter?: HistoryFilter): Promise<HistorySummary[]>;
22
46
  export declare function readHistoryEntry(skillDir: string, timestamp: string): Promise<BenchmarkResult | null>;
47
+ export declare function getCaseHistory(skillDir: string, evalId: number, filter?: {
48
+ model?: string;
49
+ }): Promise<CaseHistoryEntry[]>;
23
50
  export declare function computeRegressions(current: BenchmarkResult, previous: BenchmarkResult): RegressionEntry[];
@@ -1,7 +1,7 @@
1
1
  // ---------------------------------------------------------------------------
2
2
  // benchmark-history.ts -- timestamped benchmark history with regression diffing
3
3
  // ---------------------------------------------------------------------------
4
- import { readdir, readFile, mkdir, writeFile } from "node:fs/promises";
4
+ import { readdir, readFile, mkdir, writeFile, unlink } from "node:fs/promises";
5
5
  import { join } from "node:path";
6
6
  import { writeBenchmark } from "./benchmark.js";
7
7
  function toFilesafeTimestamp(iso) {
@@ -24,7 +24,18 @@ export async function writeHistoryEntry(skillDir, result) {
24
24
  await writeBenchmark(skillDir, result);
25
25
  return filename;
26
26
  }
27
- export async function listHistory(skillDir) {
27
+ export async function deleteHistoryEntry(skillDir, timestamp) {
28
+ const historyDir = join(skillDir, "evals", "history");
29
+ const filename = `${toFilesafeTimestamp(timestamp)}.json`;
30
+ try {
31
+ await unlink(join(historyDir, filename));
32
+ return true;
33
+ }
34
+ catch {
35
+ return false;
36
+ }
37
+ }
38
+ export async function listHistory(skillDir, filter) {
28
39
  const historyDir = join(skillDir, "evals", "history");
29
40
  let files;
30
41
  try {
@@ -33,20 +44,50 @@ export async function listHistory(skillDir) {
33
44
  catch {
34
45
  return [];
35
46
  }
47
+ let jsonFiles = files.filter((f) => f.endsWith(".json")).sort().reverse();
48
+ // Pre-filter by date range using filename timestamps (fast, no JSON parse)
49
+ if (filter?.from || filter?.to) {
50
+ const fromSafe = filter.from ? toFilesafeTimestamp(filter.from) : undefined;
51
+ const toSafe = filter.to ? toFilesafeTimestamp(filter.to) : undefined;
52
+ jsonFiles = jsonFiles.filter((f) => {
53
+ const ts = f.replace(/\.json$/, "");
54
+ if (fromSafe && ts < fromSafe)
55
+ return false;
56
+ if (toSafe && ts > toSafe)
57
+ return false;
58
+ return true;
59
+ });
60
+ }
36
61
  const entries = [];
37
- for (const file of files.filter((f) => f.endsWith(".json")).sort().reverse()) {
62
+ for (const file of jsonFiles) {
38
63
  try {
39
64
  const content = await readFile(join(historyDir, file), "utf-8");
40
65
  const data = JSON.parse(content);
66
+ const entryType = data.type || "benchmark";
67
+ // Post-filter by model and type
68
+ if (filter?.model && data.model !== filter.model)
69
+ continue;
70
+ if (filter?.type && entryType !== filter.type)
71
+ continue;
41
72
  const totalAssertions = data.cases.reduce((sum, c) => sum + c.assertions.length, 0);
42
73
  const passedAssertions = data.cases.reduce((sum, c) => sum + c.assertions.filter((a) => a.pass).length, 0);
74
+ const totalDurationMs = data.cases.reduce((s, c) => s + (c.durationMs ?? 0), 0);
75
+ const hasTokens = data.cases.some((c) => c.tokens != null);
76
+ const totalTokens = hasTokens
77
+ ? data.cases.reduce((s, c) => s + (c.tokens ?? 0), 0)
78
+ : null;
43
79
  entries.push({
44
80
  timestamp: fromFilesafeTimestamp(file),
45
81
  filename: file,
46
82
  model: data.model,
47
83
  skillName: data.skill_name,
48
84
  passRate: totalAssertions > 0 ? passedAssertions / totalAssertions : 0,
49
- type: data.type || "benchmark",
85
+ type: entryType,
86
+ caseCount: data.cases.length,
87
+ totalDurationMs,
88
+ totalTokens,
89
+ provider: data.provider,
90
+ verdict: data.verdict,
50
91
  });
51
92
  }
52
93
  catch {
@@ -66,6 +107,44 @@ export async function readHistoryEntry(skillDir, timestamp) {
66
107
  return null;
67
108
  }
68
109
  }
110
+ export async function getCaseHistory(skillDir, evalId, filter) {
111
+ const historyDir = join(skillDir, "evals", "history");
112
+ let files;
113
+ try {
114
+ files = await readdir(historyDir);
115
+ }
116
+ catch {
117
+ return [];
118
+ }
119
+ const entries = [];
120
+ for (const file of files.filter((f) => f.endsWith(".json")).sort().reverse()) {
121
+ try {
122
+ const content = await readFile(join(historyDir, file), "utf-8");
123
+ const data = JSON.parse(content);
124
+ if (filter?.model && data.model !== filter.model)
125
+ continue;
126
+ const matchingCase = data.cases.find((c) => c.eval_id === evalId);
127
+ if (!matchingCase)
128
+ continue;
129
+ entries.push({
130
+ timestamp: fromFilesafeTimestamp(file),
131
+ model: data.model,
132
+ type: data.type || "benchmark",
133
+ provider: data.provider,
134
+ pass_rate: matchingCase.pass_rate,
135
+ durationMs: matchingCase.durationMs,
136
+ tokens: matchingCase.tokens,
137
+ inputTokens: matchingCase.inputTokens,
138
+ outputTokens: matchingCase.outputTokens,
139
+ assertions: matchingCase.assertions,
140
+ });
141
+ }
142
+ catch {
143
+ // Skip malformed files
144
+ }
145
+ }
146
+ return entries;
147
+ }
69
148
  export function computeRegressions(current, previous) {
70
149
  const regressions = [];
71
150
  // Build a map of previous assertion results by eval_id + assertion_id
@@ -1 +1 @@
1
- {"version":3,"file":"benchmark-history.js","sourceRoot":"","sources":["../../src/eval/benchmark-history.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,gFAAgF;AAChF,8EAA8E;AAE9E,OAAO,EAAE,OAAO,EAAE,QAAQ,EAAE,KAAK,EAAE,SAAS,EAAE,MAAM,kBAAkB,CAAC;AACvE,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AAEjC,OAAO,EAAE,cAAc,EAAE,MAAM,gBAAgB,CAAC;AAoBhD,SAAS,mBAAmB,CAAC,GAAW;IACtC,OAAO,GAAG,CAAC,OAAO,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;AAChC,CAAC;AAED,SAAS,qBAAqB,CAAC,QAAgB;IAC7C,0CAA0C;IAC1C,MAAM,EAAE,GAAG,QAAQ,CAAC,OAAO,CAAC,SAAS,EAAE,EAAE,CAAC,CAAC;IAC3C,yDAAyD;IACzD,OAAO,EAAE,CAAC,OAAO,CAAC,0BAA0B,EAAE,WAAW,CAAC,CAAC;AAC7D,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,iBAAiB,CACrC,QAAgB,EAChB,MAA+D;IAE/D,MAAM,UAAU,GAAG,IAAI,CAAC,QAAQ,EAAE,OAAO,EAAE,SAAS,CAAC,CAAC;IACtD,MAAM,KAAK,CAAC,UAAU,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAE7C,MAAM,SAAS,GAAG,MAAM,CAAC,SAAS,IAAI,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;IAC/D,MAAM,QAAQ,GAAG,GAAG,mBAAmB,CAAC,SAAS,CAAC,OAAO,CAAC;IAC1D,MAAM,QAAQ,GAAG,IAAI,CAAC,UAAU,EAAE,QAAQ,CAAC,CAAC;IAE5C,MAAM,SAAS,CAAC,QAAQ,EAAE,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;IAE3D,uDAAuD;IACvD,MAAM,cAAc,CAAC,QAAQ,EAAE,MAAM,CAAC,CAAC;IAEvC,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,WAAW,CAC/B,QAAgB;IAEhB,MAAM,UAAU,GAAG,IAAI,CAAC,QAAQ,EAAE,OAAO,EAAE,SAAS,CAAC,CAAC;IACtD,IAAI,KAAe,CAAC;IACpB,IAAI,CAAC;QACH,KAAK,GAAG,MAAM,OAAO,CAAC,UAAU,CAAC,CAAC;IACpC,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,EAAE,CAAC;IACZ,CAAC;IAED,MAAM,OAAO,GAAqB,EAAE,CAAC;IACrC,KAAK,MAAM,IAAI,IAAI,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,OAAO,EAAE,EAAE,CAAC;QAC7E,IAAI,CAAC;YACH,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,IAAI,CAAC,UAAU,EAAE,IAAI,CAAC,EAAE,OAAO,CAAC,CAAC;YAChE,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAwC,CAAC;YACxE,MAAM,eAAe,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,UAAU,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;YACpF,MAAM,gBAAgB,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,CACxC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,MAAM,EAC3D,CAAC,CACF,CAAC;YACF,OAAO,CAAC,IAAI,CAAC;gBACX,SAAS,EAAE,qBAAqB,CAAC,IAAI,CAAC;gBACtC,QAAQ,EAAE,IAAI;gBACd,KAAK,EAAE,IAAI,CAAC,KAAK;gBACjB,SAAS,EAAE,IAAI,CAAC,UAAU;gBAC1B,QAAQ,EAAE,eAAe,GAAG,CAAC,CAAC,CAAC,CAAC,gBAAgB,GAAG,eAAe,CAAC,CAAC,CAAC,CAAC;gBACtE,IAAI,EAAG,IAAI,CAAC,IAAmC,IAAI,WAAW;aAC/D,CAAC,CAAC;QACL,CAAC;QAAC,MAAM,CAAC;YACP,uBAAuB;QACzB,CAAC;IACH,CAAC;IACD,OAAO,OAAO,CAAC;AACjB,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,gBAAgB,CACpC,QAAgB,EAChB,SAAiB;IAEjB,MAAM,UAAU,GAAG,IAAI,CAAC,QAAQ,EAAE,OAAO,EAAE,SAAS,CAAC,CAAC;IACtD,MAAM,QAAQ,GAAG,GAAG,mBAAmB,CAAC,SAAS,CAAC,OAAO,CAAC;IAC1D,IAAI,CAAC;QACH,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,IAAI,CAAC,UAAU,EAAE,QAAQ,CAAC,EAAE,OAAO,CAAC,CAAC;QACpE,OAAO,IAAI,CAAC,KAAK,CAAC,OAAO,CAAoB,CAAC;IAChD,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC;AAED,MAAM,UAAU,kBAAkB,CAChC,OAAwB,EACxB,QAAyB;IAEzB,MAAM,WAAW,GAAsB,EAAE,CAAC;IAE1C,sEAAsE;IACtE,MAAM,OAAO,GAAG,IAAI,GAAG,EAAmB,CAAC;IAC3C,KAAK,MAAM,CAAC,IAAI,QAAQ,CAAC,KAAK,EAAE,CAAC;QAC/B,KAAK,MAAM,CAAC,IAAI,CAAC,CAAC,UAAU,EAAE,CAAC;YAC7B,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,OAAO,IAAI,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC;QAC9C,CAAC;IACH,CAAC;IAED,KAAK,MAAM,CAAC,IAAI,OAAO,CAAC,KAAK,EAAE,CAAC;QAC9B,KAAK,MAAM,CAAC,IAAI,CAAC,CAAC,UAAU,EAAE,CAAC;YAC7B,MAAM,GAAG,GAAG,GAAG,CAAC,CAAC,OAAO,IAAI,CAAC,CAAC,EAAE,EAAE,CAAC;YACnC,MAAM,IAAI,GAAG,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;YAC9B,IAAI,IAAI,KAAK,SAAS;gBAAE,SAAS,CAAC,sBAAsB;YAExD,IAAI,IAAI,IAAI,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;gBACpB,WAAW,CAAC,IAAI,CAAC;oBACf,WAAW,EAAE,CAAC,CAAC,EAAE;oBACjB,MAAM,EAAE,CAAC,CAAC,OAAO;oBACjB,QAAQ,EAAE,CAAC,CAAC,SAAS;oBACrB,cAAc,EAAE,IAAI;oBACpB,aAAa,EAAE,KAAK;oBACpB,MAAM,EAAE,YAAY;iBACrB,CAAC,CAAC;YACL,CAAC;iBAAM,IAAI,CAAC,IAAI,IAAI,CAAC,CAAC,IAAI,EAAE,CAAC;gBAC3B,WAAW,CAAC,IAAI,CAAC;oBACf,WAAW,EAAE,CAAC,CAAC,EAAE;oBACjB,MAAM,EAAE,CAAC,CAAC,OAAO;oBACjB,QAAQ,EAAE,CAAC,CAAC,SAAS;oBACrB,cAAc,EAAE,KAAK;oBACrB,aAAa,EAAE,IAAI;oBACnB,MAAM,EAAE,aAAa;iBACtB,CAAC,CAAC;YACL,CAAC;QACH,CAAC;IACH,CAAC;IAED,OAAO,WAAW,CAAC;AACrB,CAAC"}
1
+ {"version":3,"file":"benchmark-history.js","sourceRoot":"","sources":["../../src/eval/benchmark-history.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,gFAAgF;AAChF,8EAA8E;AAE9E,OAAO,EAAE,OAAO,EAAE,QAAQ,EAAE,KAAK,EAAE,SAAS,EAAE,MAAM,EAAE,MAAM,kBAAkB,CAAC;AAC/E,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AAEjC,OAAO,EAAE,cAAc,EAAE,MAAM,gBAAgB,CAAC;AA6ChD,SAAS,mBAAmB,CAAC,GAAW;IACtC,OAAO,GAAG,CAAC,OAAO,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;AAChC,CAAC;AAED,SAAS,qBAAqB,CAAC,QAAgB;IAC7C,0CAA0C;IAC1C,MAAM,EAAE,GAAG,QAAQ,CAAC,OAAO,CAAC,SAAS,EAAE,EAAE,CAAC,CAAC;IAC3C,yDAAyD;IACzD,OAAO,EAAE,CAAC,OAAO,CAAC,0BAA0B,EAAE,WAAW,CAAC,CAAC;AAC7D,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,iBAAiB,CACrC,QAAgB,EAChB,MAA4E;IAE5E,MAAM,UAAU,GAAG,IAAI,CAAC,QAAQ,EAAE,OAAO,EAAE,SAAS,CAAC,CAAC;IACtD,MAAM,KAAK,CAAC,UAAU,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAE7C,MAAM,SAAS,GAAG,MAAM,CAAC,SAAS,IAAI,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;IAC/D,MAAM,QAAQ,GAAG,GAAG,mBAAmB,CAAC,SAAS,CAAC,OAAO,CAAC;IAC1D,MAAM,QAAQ,GAAG,IAAI,CAAC,UAAU,EAAE,QAAQ,CAAC,CAAC;IAE5C,MAAM,SAAS,CAAC,QAAQ,EAAE,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;IAE3D,uDAAuD;IACvD,MAAM,cAAc,CAAC,QAAQ,EAAE,MAAM,CAAC,CAAC;IAEvC,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,kBAAkB,CACtC,QAAgB,EAChB,SAAiB;IAEjB,MAAM,UAAU,GAAG,IAAI,CAAC,QAAQ,EAAE,OAAO,EAAE,SAAS,CAAC,CAAC;IACtD,MAAM,QAAQ,GAAG,GAAG,mBAAmB,CAAC,SAAS,CAAC,OAAO,CAAC;IAC1D,IAAI,CAAC;QACH,MAAM,MAAM,CAAC,IAAI,CAAC,UAAU,EAAE,QAAQ,CAAC,CAAC,CAAC;QACzC,OAAO,IAAI,CAAC;IACd,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,KAAK,CAAC;IACf,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,WAAW,CAC/B,QAAgB,EAChB,MAAsB;IAEtB,MAAM,UAAU,GAAG,IAAI,CAAC,QAAQ,EAAE,OAAO,EAAE,SAAS,CAAC,CAAC;IACtD,IAAI,KAAe,CAAC;IACpB,IAAI,CAAC;QACH,KAAK,GAAG,MAAM,OAAO,CAAC,UAAU,CAAC,CAAC;IACpC,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,EAAE,CAAC;IACZ,CAAC;IAED,IAAI,SAAS,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,OAAO,EAAE,CAAC;IAE1E,2EAA2E;IAC3E,IAAI,MAAM,EAAE,IAAI,IAAI,MAAM,EAAE,EAAE,EAAE,CAAC;QAC/B,MAAM,QAAQ,GAAG,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,mBAAmB,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;QAC5E,MAAM,MAAM,GAAG,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC,mBAAmB,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;QACtE,SAAS,GAAG,SAAS,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE;YACjC,MAAM,EAAE,GAAG,CAAC,CAAC,OAAO,CAAC,SAAS,EAAE,EAAE,CAAC,CAAC;YACpC,IAAI,QAAQ,IAAI,EAAE,GAAG,QAAQ;gBAAE,OAAO,KAAK,CAAC;YAC5C,IAAI,MAAM,IAAI,EAAE,GAAG,MAAM;gBAAE,OAAO,KAAK,CAAC;YACxC,OAAO,IAAI,CAAC;QACd,CAAC,CAAC,CAAC;IACL,CAAC;IAED,MAAM,OAAO,GAAqB,EAAE,CAAC;IACrC,KAAK,MAAM,IAAI,IAAI,SAAS,EAAE,CAAC;QAC7B,IAAI,CAAC;YACH,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,IAAI,CAAC,UAAU,EAAE,IAAI,CAAC,EAAE,OAAO,CAAC,CAAC;YAChE,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAwC,CAAC;YAExE,MAAM,SAAS,GAAI,IAAI,CAAC,IAA+B,IAAI,WAAW,CAAC;YAEvE,gCAAgC;YAChC,IAAI,MAAM,EAAE,KAAK,IAAI,IAAI,CAAC,KAAK,KAAK,MAAM,CAAC,KAAK;gBAAE,SAAS;YAC3D,IAAI,MAAM,EAAE,IAAI,IAAI,SAAS,KAAK,MAAM,CAAC,IAAI;gBAAE,SAAS;YAExD,MAAM,eAAe,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,UAAU,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;YACpF,MAAM,gBAAgB,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,CACxC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,MAAM,EAC3D,CAAC,CACF,CAAC;YACF,MAAM,eAAe,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,UAAU,IAAI,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;YAChF,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,IAAI,IAAI,CAAC,CAAC;YAC3D,MAAM,WAAW,GAAG,SAAS;gBAC3B,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,MAAM,IAAI,CAAC,CAAC,EAAE,CAAC,CAAC;gBACrD,CAAC,CAAC,IAAI,CAAC;YAET,OAAO,CAAC,IAAI,CAAC;gBACX,SAAS,EAAE,qBAAqB,CAAC,IAAI,CAAC;gBACtC,QAAQ,EAAE,IAAI;gBACd,KAAK,EAAE,IAAI,CAAC,KAAK;gBACjB,SAAS,EAAE,IAAI,CAAC,UAAU;gBAC1B,QAAQ,EAAE,eAAe,GAAG,CAAC,CAAC,CAAC,CAAC,gBAAgB,GAAG,eAAe,CAAC,CAAC,CAAC,CAAC;gBACtE,IAAI,EAAE,SAAS;gBACf,SAAS,EAAE,IAAI,CAAC,KAAK,CAAC,MAAM;gBAC5B,eAAe;gBACf,WAAW;gBACX,QAAQ,EAAE,IAAI,CAAC,QAAQ;gBACvB,OAAO,EAAE,IAAI,CAAC,OAAO;aACtB,CAAC,CAAC;QACL,CAAC;QAAC,MAAM,CAAC;YACP,uBAAuB;QACzB,CAAC;IACH,CAAC;IACD,OAAO,OAAO,CAAC;AACjB,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,gBAAgB,CACpC,QAAgB,EAChB,SAAiB;IAEjB,MAAM,UAAU,GAAG,IAAI,CAAC,QAAQ,EAAE,OAAO,EAAE,SAAS,CAAC,CAAC;IACtD,MAAM,QAAQ,GAAG,GAAG,mBAAmB,CAAC,SAAS,CAAC,OAAO,CAAC;IAC1D,IAAI,CAAC;QACH,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,IAAI,CAAC,UAAU,EAAE,QAAQ,CAAC,EAAE,OAAO,CAAC,CAAC;QACpE,OAAO,IAAI,CAAC,KAAK,CAAC,OAAO,CAAoB,CAAC;IAChD,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,cAAc,CAClC,QAAgB,EAChB,MAAc,EACd,MAA2B;IAE3B,MAAM,UAAU,GAAG,IAAI,CAAC,QAAQ,EAAE,OAAO,EAAE,SAAS,CAAC,CAAC;IACtD,IAAI,KAAe,CAAC;IACpB,IAAI,CAAC;QACH,KAAK,GAAG,MAAM,OAAO,CAAC,UAAU,CAAC,CAAC;IACpC,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,EAAE,CAAC;IACZ,CAAC;IAED,MAAM,OAAO,GAAuB,EAAE,CAAC;IACvC,KAAK,MAAM,IAAI,IAAI,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,OAAO,EAAE,EAAE,CAAC;QAC7E,IAAI,CAAC;YACH,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,IAAI,CAAC,UAAU,EAAE,IAAI,CAAC,EAAE,OAAO,CAAC,CAAC;YAChE,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAwC,CAAC;YAExE,IAAI,MAAM,EAAE,KAAK,IAAI,IAAI,CAAC,KAAK,KAAK,MAAM,CAAC,KAAK;gBAAE,SAAS;YAE3D,MAAM,YAAY,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,KAAK,MAAM,CAAC,CAAC;YAClE,IAAI,CAAC,YAAY;gBAAE,SAAS;YAE5B,OAAO,CAAC,IAAI,CAAC;gBACX,SAAS,EAAE,qBAAqB,CAAC,IAAI,CAAC;gBACtC,KAAK,EAAE,IAAI,CAAC,KAAK;gBACjB,IAAI,EAAG,IAAI,CAAC,IAAiC,IAAI,WAAW;gBAC5D,QAAQ,EAAE,IAAI,CAAC,QAAQ;gBACvB,SAAS,EAAE,YAAY,CAAC,SAAS;gBACjC,UAAU,EAAE,YAAY,CAAC,UAAU;gBACnC,MAAM,EAAE,YAAY,CAAC,MAAM;gBAC3B,WAAW,EAAE,YAAY,CAAC,WAAW;gBACrC,YAAY,EAAE,YAAY,CAAC,YAAY;gBACvC,UAAU,EAAE,YAAY,CAAC,UAAU;aACpC,CAAC,CAAC;QACL,CAAC;QAAC,MAAM,CAAC;YACP,uBAAuB;QACzB,CAAC;IACH,CAAC;IACD,OAAO,OAAO,CAAC;AACjB,CAAC;AAED,MAAM,UAAU,kBAAkB,CAChC,OAAwB,EACxB,QAAyB;IAEzB,MAAM,WAAW,GAAsB,EAAE,CAAC;IAE1C,sEAAsE;IACtE,MAAM,OAAO,GAAG,IAAI,GAAG,EAAmB,CAAC;IAC3C,KAAK,MAAM,CAAC,IAAI,QAAQ,CAAC,KAAK,EAAE,CAAC;QAC/B,KAAK,MAAM,CAAC,IAAI,CAAC,CAAC,UAAU,EAAE,CAAC;YAC7B,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,OAAO,IAAI,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC;QAC9C,CAAC;IACH,CAAC;IAED,KAAK,MAAM,CAAC,IAAI,OAAO,CAAC,KAAK,EAAE,CAAC;QAC9B,KAAK,MAAM,CAAC,IAAI,CAAC,CAAC,UAAU,EAAE,CAAC;YAC7B,MAAM,GAAG,GAAG,GAAG,CAAC,CAAC,OAAO,IAAI,CAAC,CAAC,EAAE,EAAE,CAAC;YACnC,MAAM,IAAI,GAAG,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;YAC9B,IAAI,IAAI,KAAK,SAAS;gBAAE,SAAS,CAAC,sBAAsB;YAExD,IAAI,IAAI,IAAI,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;gBACpB,WAAW,CAAC,IAAI,CAAC;oBACf,WAAW,EAAE,CAAC,CAAC,EAAE;oBACjB,MAAM,EAAE,CAAC,CAAC,OAAO;oBACjB,QAAQ,EAAE,CAAC,CAAC,SAAS;oBACrB,cAAc,EAAE,IAAI;oBACpB,aAAa,EAAE,KAAK;oBACpB,MAAM,EAAE,YAAY;iBACrB,CAAC,CAAC;YACL,CAAC;iBAAM,IAAI,CAAC,IAAI,IAAI,CAAC,CAAC,IAAI,EAAE,CAAC;gBAC3B,WAAW,CAAC,IAAI,CAAC;oBACf,WAAW,EAAE,CAAC,CAAC,EAAE;oBACjB,MAAM,EAAE,CAAC,CAAC,OAAO;oBACjB,QAAQ,EAAE,CAAC,CAAC,SAAS;oBACrB,cAAc,EAAE,KAAK;oBACrB,aAAa,EAAE,IAAI;oBACnB,MAAM,EAAE,aAAa;iBACtB,CAAC,CAAC;YACL,CAAC;QACH,CAAC;IACH,CAAC;IAED,OAAO,WAAW,CAAC;AACrB,CAAC"}
@@ -4,6 +4,21 @@ export interface BenchmarkAssertionResult {
4
4
  pass: boolean;
5
5
  reasoning: string;
6
6
  }
7
+ export interface ComparisonCaseDetail {
8
+ skillDurationMs: number;
9
+ skillTokens: number | null;
10
+ skillInputTokens?: number | null;
11
+ skillOutputTokens?: number | null;
12
+ baselineDurationMs: number;
13
+ baselineTokens: number | null;
14
+ baselineInputTokens?: number | null;
15
+ baselineOutputTokens?: number | null;
16
+ skillContentScore: number;
17
+ skillStructureScore: number;
18
+ baselineContentScore: number;
19
+ baselineStructureScore: number;
20
+ winner: "skill" | "baseline" | "tie";
21
+ }
7
22
  export interface BenchmarkCase {
8
23
  eval_id: number;
9
24
  eval_name: string;
@@ -12,7 +27,11 @@ export interface BenchmarkCase {
12
27
  pass_rate: number;
13
28
  durationMs?: number;
14
29
  tokens?: number | null;
30
+ inputTokens?: number | null;
31
+ outputTokens?: number | null;
32
+ output?: string;
15
33
  assertions: BenchmarkAssertionResult[];
34
+ comparisonDetail?: ComparisonCaseDetail;
16
35
  }
17
36
  export interface BenchmarkResult {
18
37
  timestamp: string;
@@ -20,6 +39,19 @@ export interface BenchmarkResult {
20
39
  skill_name: string;
21
40
  cases: BenchmarkCase[];
22
41
  overall_pass_rate?: number;
42
+ type?: "benchmark" | "comparison" | "baseline";
43
+ provider?: string;
44
+ totalDurationMs?: number;
45
+ totalInputTokens?: number | null;
46
+ totalOutputTokens?: number | null;
47
+ verdict?: string;
48
+ comparison?: {
49
+ skillPassRate: number;
50
+ baselinePassRate: number;
51
+ skillRubricAvg: number;
52
+ baselineRubricAvg: number;
53
+ delta: number;
54
+ };
23
55
  }
24
56
  export declare function writeBenchmark(skillDir: string, result: BenchmarkResult): Promise<void>;
25
57
  export declare function readBenchmark(skillDir: string): Promise<BenchmarkResult | null>;
@@ -1 +1 @@
1
- {"version":3,"file":"benchmark.js","sourceRoot":"","sources":["../../src/eval/benchmark.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,4BAA4B;AAC5B,8EAA8E;AAE9E,OAAO,EAAE,aAAa,EAAE,YAAY,EAAE,UAAU,EAAE,SAAS,EAAE,MAAM,SAAS,CAAC;AAC7E,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AA4BjC,MAAM,CAAC,KAAK,UAAU,cAAc,CAClC,QAAgB,EAChB,MAAuB;IAEvB,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;IACzC,SAAS,CAAC,QAAQ,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IACzC,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,EAAE,gBAAgB,CAAC,CAAC;IAClD,aAAa,CAAC,QAAQ,EAAE,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC;AACpE,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,QAAgB;IAEhB,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,EAAE,OAAO,EAAE,gBAAgB,CAAC,CAAC;IAC3D,IAAI,CAAC,UAAU,CAAC,QAAQ,CAAC;QAAE,OAAO,IAAI,CAAC;IAEvC,IAAI,CAAC;QACH,MAAM,GAAG,GAAG,YAAY,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;QAC5C,OAAO,IAAI,CAAC,KAAK,CAAC,GAAG,CAAoB,CAAC;IAC5C,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC"}
1
+ {"version":3,"file":"benchmark.js","sourceRoot":"","sources":["../../src/eval/benchmark.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,4BAA4B;AAC5B,8EAA8E;AAE9E,OAAO,EAAE,aAAa,EAAE,YAAY,EAAE,UAAU,EAAE,SAAS,EAAE,MAAM,SAAS,CAAC;AAC7E,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AA6DjC,MAAM,CAAC,KAAK,UAAU,cAAc,CAClC,QAAgB,EAChB,MAAuB;IAEvB,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;IACzC,SAAS,CAAC,QAAQ,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IACzC,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,EAAE,gBAAgB,CAAC,CAAC;IAClD,aAAa,CAAC,QAAQ,EAAE,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC;AACpE,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,QAAgB;IAEhB,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,EAAE,OAAO,EAAE,gBAAgB,CAAC,CAAC;IAC3D,IAAI,CAAC,UAAU,CAAC,QAAQ,CAAC;QAAE,OAAO,IAAI,CAAC;IAEvC,IAAI,CAAC;QACH,MAAM,GAAG,GAAG,YAAY,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;QAC5C,OAAO,IAAI,CAAC,KAAK,CAAC,GAAG,CAAoB,CAAC;IAC5C,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC"}
@@ -5,14 +5,16 @@ import { readFileSync, writeFileSync, mkdirSync, existsSync } from "node:fs";
5
5
  import { join } from "node:path";
6
6
  import { sendJson, readBody } from "./router.js";
7
7
  import { initSSE, sendSSE, sendSSEDone } from "./sse-helpers.js";
8
+ import { runBenchmarkSSE } from "./benchmark-runner.js";
8
9
  import { scanSkills } from "../eval/skill-scanner.js";
9
10
  import { loadAndValidateEvals, EvalValidationError } from "../eval/schema.js";
10
11
  import { readBenchmark } from "../eval/benchmark.js";
11
- import { writeHistoryEntry, listHistory, readHistoryEntry } from "../eval/benchmark-history.js";
12
+ import { writeHistoryEntry, listHistory, readHistoryEntry, computeRegressions, deleteHistoryEntry, getCaseHistory } from "../eval/benchmark-history.js";
12
13
  import { judgeAssertion } from "../eval/judge.js";
13
14
  import { createLlmClient } from "../eval/llm.js";
14
15
  import { runComparison } from "../eval/comparator.js";
15
16
  import { computeVerdict } from "../eval/verdict.js";
17
+ import { buildEvalInitPrompt, parseGeneratedEvals } from "../eval/prompt-builder.js";
16
18
  import { testActivation } from "../eval/activation-tester.js";
17
19
  function resolveSkillDir(root, plugin, skill) {
18
20
  // Try direct layout: {root}/{plugin}/skills/{skill}/
@@ -227,12 +229,31 @@ export function registerRoutes(router, root, projectName) {
227
229
  writeFileSync(filePath, JSON.stringify(body, null, 2), "utf-8");
228
230
  sendJson(res, body, 200, req);
229
231
  });
232
+ // Generate evals using AI — reads SKILL.md and returns generated EvalsFile
233
+ router.post("/api/skills/:plugin/:skill/generate-evals", async (req, res, params) => {
234
+ const skillDir = resolveSkillDir(root, params.plugin, params.skill);
235
+ const skillMdPath = join(skillDir, "SKILL.md");
236
+ if (!existsSync(skillMdPath)) {
237
+ sendJson(res, { error: "SKILL.md not found — cannot generate evals without skill content" }, 400, req);
238
+ return;
239
+ }
240
+ try {
241
+ const skillContent = readFileSync(skillMdPath, "utf-8");
242
+ const prompt = buildEvalInitPrompt(skillContent);
243
+ const client = getClient();
244
+ const genResult = await client.generate("You generate eval test cases for AI skills. Output only valid JSON in a code fence.", prompt);
245
+ const evalsFile = parseGeneratedEvals(genResult.text);
246
+ sendJson(res, evalsFile, 200, req);
247
+ }
248
+ catch (err) {
249
+ sendJson(res, { error: `Eval generation failed: ${err.message}` }, 500, req);
250
+ }
251
+ });
230
252
  // Run benchmark (SSE) — optionally accepts { eval_ids: number[] } to run specific cases
231
253
  router.post("/api/skills/:plugin/:skill/benchmark", async (req, res, params) => {
232
254
  const skillDir = resolveSkillDir(root, params.plugin, params.skill);
233
255
  let aborted = false;
234
256
  res.on("close", () => { aborted = true; });
235
- // Read body before switching to SSE mode
236
257
  const body = await readBody(req).catch(() => ({}));
237
258
  const filterIds = Array.isArray(body?.eval_ids) ? new Set(body.eval_ids) : null;
238
259
  initSSE(res, req);
@@ -244,101 +265,34 @@ export function registerRoutes(router, root, projectName) {
244
265
  const systemPrompt = skillContent
245
266
  ? `You are an AI assistant enhanced with the following skill:\n\n${skillContent}`
246
267
  : "You are a helpful AI assistant.";
247
- // Filter to specific eval cases if requested
248
- const evalCases = filterIds
249
- ? evals.evals.filter((e) => filterIds.has(e.id))
250
- : evals.evals;
251
- const cases = [];
252
- for (const evalCase of evalCases) {
253
- if (aborted)
254
- break;
255
- sendSSE(res, "case_start", {
256
- eval_id: evalCase.id,
257
- eval_name: evalCase.name,
258
- total: evalCases.length,
259
- });
260
- try {
261
- const genResult = await client.generate(systemPrompt, evalCase.prompt);
262
- const totalTokens = genResult.inputTokens != null && genResult.outputTokens != null
263
- ? genResult.inputTokens + genResult.outputTokens
264
- : null;
265
- // Stream the actual LLM output so the UI can display it as proof
266
- sendSSE(res, "output_ready", {
267
- eval_id: evalCase.id,
268
- output: genResult.text,
269
- durationMs: genResult.durationMs,
270
- tokens: totalTokens,
271
- });
272
- const assertionResults = [];
273
- for (const assertion of evalCase.assertions) {
274
- if (aborted)
275
- break;
276
- const result = await judgeAssertion(genResult.text, assertion, client);
277
- assertionResults.push(result);
278
- sendSSE(res, "assertion_result", {
279
- eval_id: evalCase.id,
280
- assertion_id: result.id,
281
- text: result.text,
282
- pass: result.pass,
283
- reasoning: result.reasoning,
284
- });
285
- }
286
- const passRate = assertionResults.length > 0
287
- ? assertionResults.filter((a) => a.pass).length / assertionResults.length
288
- : 0;
289
- const status = assertionResults.every((a) => a.pass) ? "pass" : "fail";
290
- const benchCase = {
291
- eval_id: evalCase.id,
292
- eval_name: evalCase.name,
293
- status: status,
294
- error_message: null,
295
- pass_rate: passRate,
296
- durationMs: genResult.durationMs,
297
- tokens: totalTokens,
298
- assertions: assertionResults,
299
- };
300
- cases.push(benchCase);
301
- sendSSE(res, "case_complete", {
302
- eval_id: evalCase.id,
303
- status,
304
- pass_rate: passRate,
305
- durationMs: genResult.durationMs,
306
- tokens: totalTokens,
307
- });
308
- }
309
- catch (err) {
310
- const errorMsg = err instanceof Error ? err.message : String(err);
311
- cases.push({
312
- eval_id: evalCase.id,
313
- eval_name: evalCase.name,
314
- status: "error",
315
- error_message: errorMsg,
316
- pass_rate: 0,
317
- assertions: [],
318
- });
319
- sendSSE(res, "case_complete", {
320
- eval_id: evalCase.id,
321
- status: "error",
322
- error_message: errorMsg,
323
- });
324
- }
325
- }
326
- const totalAssertions = cases.reduce((s, c) => s + c.assertions.length, 0);
327
- const passedAssertions = cases.reduce((s, c) => s + c.assertions.filter((a) => a.pass).length, 0);
328
- const result = {
329
- timestamp: new Date().toISOString(),
330
- model: client.model,
331
- skill_name: evals.skill_name,
332
- cases,
333
- overall_pass_rate: totalAssertions > 0 ? passedAssertions / totalAssertions : 0,
334
- };
335
- if (!aborted) {
336
- // Only save to history for full benchmark runs (not single-case)
337
- if (!filterIds) {
338
- await writeHistoryEntry(skillDir, result);
339
- }
340
- sendSSEDone(res, result);
341
- }
268
+ await runBenchmarkSSE({
269
+ res, skillDir, skillName: evals.skill_name, systemPrompt,
270
+ runType: "benchmark", provider: currentOverrides.provider || "claude-cli",
271
+ evalCases: evals.evals, filterIds, client, isAborted: () => aborted,
272
+ });
273
+ }
274
+ catch (err) {
275
+ const errorMsg = err instanceof Error ? err.message : String(err);
276
+ sendSSEDone(res, { error: errorMsg });
277
+ }
278
+ });
279
+ // Run baseline (SSE) — same as benchmark but without skill content
280
+ router.post("/api/skills/:plugin/:skill/baseline", async (req, res, params) => {
281
+ const skillDir = resolveSkillDir(root, params.plugin, params.skill);
282
+ let aborted = false;
283
+ res.on("close", () => { aborted = true; });
284
+ const body = await readBody(req).catch(() => ({}));
285
+ const filterIds = Array.isArray(body?.eval_ids) ? new Set(body.eval_ids) : null;
286
+ initSSE(res, req);
287
+ try {
288
+ const evals = loadAndValidateEvals(skillDir);
289
+ const client = getClient();
290
+ await runBenchmarkSSE({
291
+ res, skillDir, skillName: evals.skill_name,
292
+ systemPrompt: "You are a helpful AI assistant.",
293
+ runType: "baseline", provider: currentOverrides.provider || "claude-cli",
294
+ evalCases: evals.evals, filterIds, client, isAborted: () => aborted,
295
+ });
342
296
  }
343
297
  catch (err) {
344
298
  const errorMsg = err instanceof Error ? err.message : String(err);
@@ -435,7 +389,20 @@ export function registerRoutes(router, root, projectName) {
435
389
  pass_rate: r.assertionResults.length > 0
436
390
  ? r.assertionResults.filter((a) => a.pass).length / r.assertionResults.length
437
391
  : 0,
392
+ durationMs: r.comparison.skillDurationMs,
393
+ tokens: r.comparison.skillTokens,
438
394
  assertions: r.assertionResults,
395
+ comparisonDetail: {
396
+ skillDurationMs: r.comparison.skillDurationMs,
397
+ skillTokens: r.comparison.skillTokens,
398
+ baselineDurationMs: r.comparison.baselineDurationMs,
399
+ baselineTokens: r.comparison.baselineTokens,
400
+ skillContentScore: r.comparison.skillContentScore,
401
+ skillStructureScore: r.comparison.skillStructureScore,
402
+ baselineContentScore: r.comparison.baselineContentScore,
403
+ baselineStructureScore: r.comparison.baselineStructureScore,
404
+ winner: r.comparison.winner,
405
+ },
439
406
  }));
440
407
  const historyResult = {
441
408
  timestamp: new Date().toISOString(),
@@ -444,6 +411,7 @@ export function registerRoutes(router, root, projectName) {
444
411
  cases,
445
412
  overall_pass_rate: passRate,
446
413
  type: "comparison",
414
+ provider: currentOverrides.provider || "claude-cli",
447
415
  verdict,
448
416
  comparison: {
449
417
  skillPassRate: passRate,
@@ -461,12 +429,100 @@ export function registerRoutes(router, root, projectName) {
461
429
  sendSSEDone(res, { error: err instanceof Error ? err.message : String(err) });
462
430
  }
463
431
  });
464
- // List benchmark history
432
+ // List benchmark history (with optional filters)
465
433
  router.get("/api/skills/:plugin/:skill/history", async (req, res, params) => {
466
434
  const skillDir = resolveSkillDir(root, params.plugin, params.skill);
467
- const history = await listHistory(skillDir);
435
+ const url = new URL(req.url, `http://localhost`);
436
+ const filter = {};
437
+ const modelParam = url.searchParams.get("model");
438
+ const typeParam = url.searchParams.get("type");
439
+ const fromParam = url.searchParams.get("from");
440
+ const toParam = url.searchParams.get("to");
441
+ if (modelParam)
442
+ filter.model = modelParam;
443
+ if (typeParam && ["benchmark", "comparison", "baseline"].includes(typeParam)) {
444
+ filter.type = typeParam;
445
+ }
446
+ if (fromParam)
447
+ filter.from = fromParam;
448
+ if (toParam)
449
+ filter.to = toParam;
450
+ const hasFilter = Object.keys(filter).length > 0;
451
+ const history = await listHistory(skillDir, hasFilter ? filter : undefined);
468
452
  sendJson(res, history, 200, req);
469
453
  });
454
+ // Compare two history runs
455
+ router.get("/api/skills/:plugin/:skill/history-compare", async (req, res, params) => {
456
+ const skillDir = resolveSkillDir(root, params.plugin, params.skill);
457
+ const url = new URL(req.url, `http://localhost`);
458
+ const tsA = url.searchParams.get("a");
459
+ const tsB = url.searchParams.get("b");
460
+ if (!tsA || !tsB) {
461
+ sendJson(res, { error: "Both 'a' and 'b' timestamps are required" }, 400, req);
462
+ return;
463
+ }
464
+ const [runA, runB] = await Promise.all([
465
+ readHistoryEntry(skillDir, tsA),
466
+ readHistoryEntry(skillDir, tsB),
467
+ ]);
468
+ if (!runA || !runB) {
469
+ sendJson(res, { error: "One or both history entries not found" }, 404, req);
470
+ return;
471
+ }
472
+ const regressions = computeRegressions(runB, runA);
473
+ // Build case diffs
474
+ const allEvalIds = new Set([
475
+ ...runA.cases.map((c) => c.eval_id),
476
+ ...runB.cases.map((c) => c.eval_id),
477
+ ]);
478
+ const caseDiffs = Array.from(allEvalIds).map((evalId) => {
479
+ const caseA = runA.cases.find((c) => c.eval_id === evalId);
480
+ const caseB = runB.cases.find((c) => c.eval_id === evalId);
481
+ return {
482
+ eval_id: evalId,
483
+ eval_name: caseA?.eval_name || caseB?.eval_name || `Eval #${evalId}`,
484
+ statusA: caseA?.status ?? "missing",
485
+ statusB: caseB?.status ?? "missing",
486
+ passRateA: caseA?.pass_rate ?? null,
487
+ passRateB: caseB?.pass_rate ?? null,
488
+ durationMsA: caseA?.durationMs ?? null,
489
+ durationMsB: caseB?.durationMs ?? null,
490
+ tokensA: caseA?.tokens ?? null,
491
+ tokensB: caseB?.tokens ?? null,
492
+ };
493
+ });
494
+ const totalA = runA.cases.reduce((s, c) => s + c.assertions.length, 0);
495
+ const passedA = runA.cases.reduce((s, c) => s + c.assertions.filter((a) => a.pass).length, 0);
496
+ const totalB = runB.cases.reduce((s, c) => s + c.assertions.length, 0);
497
+ const passedB = runB.cases.reduce((s, c) => s + c.assertions.filter((a) => a.pass).length, 0);
498
+ sendJson(res, {
499
+ runA: {
500
+ timestamp: runA.timestamp, model: runA.model,
501
+ passRate: totalA > 0 ? passedA / totalA : 0,
502
+ type: runA.type || "benchmark",
503
+ },
504
+ runB: {
505
+ timestamp: runB.timestamp, model: runB.model,
506
+ passRate: totalB > 0 ? passedB / totalB : 0,
507
+ type: runB.type || "benchmark",
508
+ },
509
+ regressions,
510
+ caseDiffs,
511
+ }, 200, req);
512
+ });
513
+ // Per-case history
514
+ router.get("/api/skills/:plugin/:skill/history/case/:evalId", async (req, res, params) => {
515
+ const skillDir = resolveSkillDir(root, params.plugin, params.skill);
516
+ const evalId = parseInt(params.evalId, 10);
517
+ if (isNaN(evalId)) {
518
+ sendJson(res, { error: "Invalid eval ID" }, 400, req);
519
+ return;
520
+ }
521
+ const url = new URL(req.url, `http://localhost`);
522
+ const modelParam = url.searchParams.get("model") || undefined;
523
+ const entries = await getCaseHistory(skillDir, evalId, modelParam ? { model: modelParam } : undefined);
524
+ sendJson(res, entries, 200, req);
525
+ });
470
526
  // Get specific history entry
471
527
  router.get("/api/skills/:plugin/:skill/history/:timestamp", async (req, res, params) => {
472
528
  const skillDir = resolveSkillDir(root, params.plugin, params.skill);
@@ -477,6 +533,16 @@ export function registerRoutes(router, root, projectName) {
477
533
  }
478
534
  sendJson(res, entry, 200, req);
479
535
  });
536
+ // Delete history entry
537
+ router.delete("/api/skills/:plugin/:skill/history/:timestamp", async (req, res, params) => {
538
+ const skillDir = resolveSkillDir(root, params.plugin, params.skill);
539
+ const deleted = await deleteHistoryEntry(skillDir, params.timestamp);
540
+ if (!deleted) {
541
+ sendJson(res, { error: "History entry not found" }, 404, req);
542
+ return;
543
+ }
544
+ sendJson(res, { ok: true }, 200, req);
545
+ });
480
546
  // Get latest benchmark
481
547
  router.get("/api/skills/:plugin/:skill/benchmark/latest", async (req, res, params) => {
482
548
  const skillDir = resolveSkillDir(root, params.plugin, params.skill);