vskill 0.2.105 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/commands/eval/__tests__/run.test.js +37 -5
- package/dist/commands/eval/__tests__/run.test.js.map +1 -1
- package/dist/commands/eval/run.js +42 -6
- package/dist/commands/eval/run.js.map +1 -1
- package/dist/eval/llm.d.ts +9 -0
- package/dist/eval/llm.js +24 -0
- package/dist/eval/llm.js.map +1 -1
- package/dist/eval/progress-log.d.ts +23 -0
- package/dist/eval/progress-log.js +66 -0
- package/dist/eval/progress-log.js.map +1 -0
- package/dist/eval-server/api-routes.js +1 -0
- package/dist/eval-server/api-routes.js.map +1 -1
- package/dist/eval-server/benchmark-runner.d.ts +1 -0
- package/dist/eval-server/benchmark-runner.js +16 -1
- package/dist/eval-server/benchmark-runner.js.map +1 -1
- package/dist/eval-server/error-classifier.js +12 -5
- package/dist/eval-server/error-classifier.js.map +1 -1
- package/dist/eval-ui/assets/index-BDZxYaAi.js +73 -0
- package/dist/eval-ui/assets/{index-DIXELAMg.css → index-WYEaSjlU.css} +1 -1
- package/dist/eval-ui/index.html +2 -2
- package/package.json +1 -1
- package/dist/eval-ui/assets/index-BOBDUOLx.js +0 -70
|
@@ -7,6 +7,7 @@ const mocks = vi.hoisted(() => ({
|
|
|
7
7
|
writeFileSync: vi.fn(),
|
|
8
8
|
existsSync: vi.fn(),
|
|
9
9
|
mkdirSync: vi.fn(),
|
|
10
|
+
unlinkSync: vi.fn(),
|
|
10
11
|
generate: vi.fn(),
|
|
11
12
|
}));
|
|
12
13
|
vi.mock("node:fs", () => ({
|
|
@@ -14,12 +15,30 @@ vi.mock("node:fs", () => ({
|
|
|
14
15
|
writeFileSync: mocks.writeFileSync,
|
|
15
16
|
existsSync: mocks.existsSync,
|
|
16
17
|
mkdirSync: mocks.mkdirSync,
|
|
18
|
+
unlinkSync: mocks.unlinkSync,
|
|
17
19
|
}));
|
|
18
20
|
vi.mock("../../../eval/llm.js", () => ({
|
|
19
21
|
createLlmClient: () => ({
|
|
20
22
|
generate: mocks.generate,
|
|
21
23
|
model: "test-model",
|
|
22
24
|
}),
|
|
25
|
+
estimateDurationSec: () => ({ minSec: 30, maxSec: 60, label: "30s\u201360s" }),
|
|
26
|
+
}));
|
|
27
|
+
vi.mock("../../../eval-server/error-classifier.js", () => ({
|
|
28
|
+
classifyError: (err) => ({
|
|
29
|
+
category: "unknown",
|
|
30
|
+
title: "Operation Failed",
|
|
31
|
+
description: err instanceof Error ? err.message : String(err),
|
|
32
|
+
hint: "Try again.",
|
|
33
|
+
retryable: true,
|
|
34
|
+
}),
|
|
35
|
+
}));
|
|
36
|
+
vi.mock("../../../eval/progress-log.js", () => ({
|
|
37
|
+
ProgressLog: class {
|
|
38
|
+
update() { }
|
|
39
|
+
complete() { }
|
|
40
|
+
error() { }
|
|
41
|
+
},
|
|
23
42
|
}));
|
|
24
43
|
// ---------------------------------------------------------------------------
|
|
25
44
|
// Import module under test AFTER mocks
|
|
@@ -91,6 +110,17 @@ describe("runEvalRun", () => {
|
|
|
91
110
|
expect(output).toContain("b1");
|
|
92
111
|
consoleSpy.mockRestore();
|
|
93
112
|
});
|
|
113
|
+
it("prints duration estimate before running", async () => {
|
|
114
|
+
mocks.generate.mockImplementation(async () => {
|
|
115
|
+
return { text: JSON.stringify({ pass: true, reasoning: "ok" }) };
|
|
116
|
+
});
|
|
117
|
+
const consoleSpy = vi.spyOn(console, "log").mockImplementation(() => { });
|
|
118
|
+
await runEvalRun("/skills/test-skill");
|
|
119
|
+
const output = consoleSpy.mock.calls.map((c) => c[0]).join("\n");
|
|
120
|
+
expect(output).toContain("Estimated duration");
|
|
121
|
+
expect(output).toContain("Progress file");
|
|
122
|
+
consoleSpy.mockRestore();
|
|
123
|
+
});
|
|
94
124
|
it("writes benchmark.json after run", async () => {
|
|
95
125
|
mocks.generate.mockImplementation(async (_sys, prompt) => {
|
|
96
126
|
if (prompt.includes("Test prompt"))
|
|
@@ -99,10 +129,10 @@ describe("runEvalRun", () => {
|
|
|
99
129
|
});
|
|
100
130
|
vi.spyOn(console, "log").mockImplementation(() => { });
|
|
101
131
|
await runEvalRun("/skills/test-skill");
|
|
102
|
-
|
|
103
|
-
const
|
|
104
|
-
expect(
|
|
105
|
-
const writtenContent = JSON.parse(
|
|
132
|
+
// Find the benchmark.json write (skip progress log writes)
|
|
133
|
+
const benchmarkWrite = mocks.writeFileSync.mock.calls.find((c) => typeof c[0] === "string" && c[0].includes("benchmark.json"));
|
|
134
|
+
expect(benchmarkWrite).toBeDefined();
|
|
135
|
+
const writtenContent = JSON.parse(benchmarkWrite[1]);
|
|
106
136
|
expect(writtenContent.skill_name).toBe("test-skill");
|
|
107
137
|
expect(writtenContent.cases).toHaveLength(2);
|
|
108
138
|
vi.restoreAllMocks();
|
|
@@ -121,7 +151,9 @@ describe("runEvalRun", () => {
|
|
|
121
151
|
});
|
|
122
152
|
vi.spyOn(console, "log").mockImplementation(() => { });
|
|
123
153
|
await runEvalRun("/skills/test-skill");
|
|
124
|
-
const
|
|
154
|
+
const benchmarkWrite = mocks.writeFileSync.mock.calls.find((c) => typeof c[0] === "string" && c[0].includes("benchmark.json"));
|
|
155
|
+
expect(benchmarkWrite).toBeDefined();
|
|
156
|
+
const writtenContent = JSON.parse(benchmarkWrite[1]);
|
|
125
157
|
expect(writtenContent.cases[0].status).toBe("error");
|
|
126
158
|
expect(writtenContent.cases[0].error_message).toContain("API timeout");
|
|
127
159
|
expect(writtenContent.cases[1].status).toBe("pass");
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"run.test.js","sourceRoot":"","sources":["../../../../src/commands/eval/__tests__/run.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,EAAE,EAAE,UAAU,EAAE,MAAM,QAAQ,CAAC;AAE9D,8EAA8E;AAC9E,QAAQ;AACR,8EAA8E;AAE9E,MAAM,KAAK,GAAG,EAAE,CAAC,OAAO,CAAC,GAAG,EAAE,CAAC,CAAC;IAC9B,YAAY,EAAE,EAAE,CAAC,EAAE,EAAE;IACrB,aAAa,EAAE,EAAE,CAAC,EAAE,EAAE;IACtB,UAAU,EAAE,EAAE,CAAC,EAAE,EAAE;IACnB,SAAS,EAAE,EAAE,CAAC,EAAE,EAAE;IAClB,QAAQ,EAAE,EAAE,CAAC,EAAE,EAAE;CAClB,CAAC,CAAC,CAAC;AAEJ,EAAE,CAAC,IAAI,CAAC,SAAS,EAAE,GAAG,EAAE,CAAC,CAAC;IACxB,YAAY,EAAE,KAAK,CAAC,YAAY;IAChC,aAAa,EAAE,KAAK,CAAC,aAAa;IAClC,UAAU,EAAE,KAAK,CAAC,UAAU;IAC5B,SAAS,EAAE,KAAK,CAAC,SAAS;
|
|
1
|
+
{"version":3,"file":"run.test.js","sourceRoot":"","sources":["../../../../src/commands/eval/__tests__/run.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,EAAE,EAAE,UAAU,EAAE,MAAM,QAAQ,CAAC;AAE9D,8EAA8E;AAC9E,QAAQ;AACR,8EAA8E;AAE9E,MAAM,KAAK,GAAG,EAAE,CAAC,OAAO,CAAC,GAAG,EAAE,CAAC,CAAC;IAC9B,YAAY,EAAE,EAAE,CAAC,EAAE,EAAE;IACrB,aAAa,EAAE,EAAE,CAAC,EAAE,EAAE;IACtB,UAAU,EAAE,EAAE,CAAC,EAAE,EAAE;IACnB,SAAS,EAAE,EAAE,CAAC,EAAE,EAAE;IAClB,UAAU,EAAE,EAAE,CAAC,EAAE,EAAE;IACnB,QAAQ,EAAE,EAAE,CAAC,EAAE,EAAE;CAClB,CAAC,CAAC,CAAC;AAEJ,EAAE,CAAC,IAAI,CAAC,SAAS,EAAE,GAAG,EAAE,CAAC,CAAC;IACxB,YAAY,EAAE,KAAK,CAAC,YAAY;IAChC,aAAa,EAAE,KAAK,CAAC,aAAa;IAClC,UAAU,EAAE,KAAK,CAAC,UAAU;IAC5B,SAAS,EAAE,KAAK,CAAC,SAAS;IAC1B,UAAU,EAAE,KAAK,CAAC,UAAU;CAC7B,CAAC,CAAC,CAAC;AAEJ,EAAE,CAAC,IAAI,CAAC,sBAAsB,EAAE,GAAG,EAAE,CAAC,CAAC;IACrC,eAAe,EAAE,GAAG,EAAE,CAAC,CAAC;QACtB,QAAQ,EAAE,KAAK,CAAC,QAAQ;QACxB,KAAK,EAAE,YAAY;KACpB,CAAC;IACF,mBAAmB,EAAE,GAAG,EAAE,CAAC,CAAC,EAAE,MAAM,EAAE,EAAE,EAAE,MAAM,EAAE,EAAE,EAAE,KAAK,EAAE,cAAc,EAAE,CAAC;CAC/E,CAAC,CAAC,CAAC;AAEJ,EAAE,CAAC,IAAI,CAAC,0CAA0C,EAAE,GAAG,EAAE,CAAC,CAAC;IACzD,aAAa,EAAE,CAAC,GAAY,EAAE,EAAE,CAAC,CAAC;QAChC,QAAQ,EAAE,SAAS;QACnB,KAAK,EAAE,kBAAkB;QACzB,WAAW,EAAE,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC;QAC7D,IAAI,EAAE,YAAY;QAClB,SAAS,EAAE,IAAI;KAChB,CAAC;CACH,CAAC,CAAC,CAAC;AAEJ,EAAE,CAAC,IAAI,CAAC,+BAA+B,EAAE,GAAG,EAAE,CAAC,CAAC;IAC9C,WAAW,EAAE;QACX,MAAM,KAAI,CAAC;QACX,QAAQ,KAAI,CAAC;QACb,KAAK,KAAI,CAAC;KACX;CACF,CAAC,CAAC,CAAC;AAEJ,8EAA8E;AAC9E,uCAAuC;AACvC,8EAA8E;AAE9E,MAAM,EAAE,UAAU,EAAE,GAAG,MAAM,MAAM,CAAC,WAAW,CAAC,CAAC;AAEjD,8EAA8E;AAC9E,WAAW;AACX,8EAA8E;AAE9E,MAAM,WAAW,GAAG;IAClB,UAAU,EAAE,YAAY;IACxB,KAAK,EAAE;QACL;YACE,EAAE,EAAE,CAAC;YACL,IAAI,EAAE,YAAY;YAClB,MAAM,EAAE,eAAe;YACvB,eAAe,EAAE,mBAAmB;YACpC,KAAK,EAAE,EAAE;YACT,UAAU,EAAE;gBACV,EAAE,EAAE,EAAE,IAAI,EAAE,IAAI,EAAE,oBAAoB,EAAE,IAAI,EAAE,SAAS,EAAE;gBACzD,EAAE,EAAE,EAAE,IAAI,EAAE,IAAI,EAAE,mBAAmB,EAAE,IAAI,EAAE,SAA
S,EAAE;aACzD;SACF;QACD;YACE,EAAE,EAAE,CAAC;YACL,IAAI,EAAE,gBAAgB;YACtB,MAAM,EAAE,eAAe;YACvB,eAAe,EAAE,mBAAmB;YACpC,KAAK,EAAE,EAAE;YACT,UAAU,EAAE;gBACV,EAAE,EAAE,EAAE,IAAI,EAAE,IAAI,EAAE,0BAA0B,EAAE,IAAI,EAAE,SAAS,EAAE;aAChE;SACF;KACF;CACF,CAAC;AAEF,8EAA8E;AAC9E,QAAQ;AACR,8EAA8E;AAE9E,QAAQ,CAAC,YAAY,EAAE,GAAG,EAAE;IAC1B,UAAU,CAAC,GAAG,EAAE;QACd,EAAE,CAAC,aAAa,EAAE,CAAC;QACnB,8CAA8C;QAC9C,KAAK,CAAC,UAAU,CAAC,eAAe,CAAC,IAAI,CAAC,CAAC;QACvC,KAAK,CAAC,YAAY,CAAC,kBAAkB,CAAC,CAAC,IAAY,EAAE,EAAE;YACrD,IAAI,OAAO,IAAI,KAAK,QAAQ,IAAI,IAAI,CAAC,QAAQ,CAAC,UAAU,CAAC,EAAE,CAAC;gBAC1D,OAAO,qCAAqC,CAAC;YAC/C,CAAC;YACD,OAAO,IAAI,CAAC,SAAS,CAAC,WAAW,CAAC,CAAC;QACrC,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,iCAAiC,EAAE,KAAK,IAAI,EAAE;QAC/C,yEAAyE;QACzE,IAAI,SAAS,GAAG,CAAC,CAAC;QAClB,KAAK,CAAC,QAAQ,CAAC,kBAAkB,CAAC,KAAK,IAAI,EAAE;YAC3C,SAAS,EAAE,CAAC;YACZ,sEAAsE;YACtE,IAAI,SAAS,KAAK,CAAC;gBAAE,OAAO,EAAE,IAAI,EAAE,aAAa,EAAE,CAAC;YACpD,IAAI,SAAS,IAAI,CAAC;gBAAE,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC,EAAE,IAAI,EAAE,IAAI,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,EAAE,CAAC;YACrF,IAAI,SAAS,KAAK,CAAC;gBAAE,OAAO,EAAE,IAAI,EAAE,mBAAmB,EAAE,CAAC;YAC1D,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC,EAAE,IAAI,EAAE,KAAK,EAAE,SAAS,EAAE,WAAW,EAAE,CAAC,EAAE,CAAC;QAC3E,CAAC,CAAC,CAAC;QAEH,MAAM,UAAU,GAAG,EAAE,CAAC,KAAK,CAAC,OAAO,EAAE,KAAK,CAAC,CAAC,kBAAkB,CAAC,GAAG,EAAE,GAAE,CAAC,CAAC,CAAC;QAEzE,MAAM,UAAU,CAAC,oBAAoB,CAAC,CAAC;QAEvC,0BAA0B;QAC1B,MAAM,MAAM,GAAG,UAAU,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACjE,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC;QAC/B,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC;QAC/B,UAAU,CAAC,WAAW,EAAE,CAAC;IAC3B,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,yCAAyC,EAAE,KAAK,IAAI,EAAE;QACvD,KAAK,CAAC,QAAQ,CAAC,kBAAkB,CAAC,KAAK,IAAI,EAAE;YAC3C,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC,EAAE,IAAI,EAAE,IAAI,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,EAAE,CAAC;QACnE,CAAC,CAAC,CAAC;QAEH,MAAM,UAAU,GAAG,EAAE,CAAC,KAAK,CAAC,OAAO,EAAE,KAAK,CAA
C,CAAC,kBAAkB,CAAC,GAAG,EAAE,GAAE,CAAC,CAAC,CAAC;QAEzE,MAAM,UAAU,CAAC,oBAAoB,CAAC,CAAC;QAEvC,MAAM,MAAM,GAAG,UAAU,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACjE,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,oBAAoB,CAAC,CAAC;QAC/C,MAAM,CAAC,MAAM,CAAC,CAAC,SAAS,CAAC,eAAe,CAAC,CAAC;QAC1C,UAAU,CAAC,WAAW,EAAE,CAAC;IAC3B,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,iCAAiC,EAAE,KAAK,IAAI,EAAE;QAC/C,KAAK,CAAC,QAAQ,CAAC,kBAAkB,CAAC,KAAK,EAAE,IAAY,EAAE,MAAc,EAAE,EAAE;YACvE,IAAI,MAAM,CAAC,QAAQ,CAAC,aAAa,CAAC;gBAAE,OAAO,EAAE,IAAI,EAAE,iBAAiB,EAAE,CAAC;YACvE,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC,EAAE,IAAI,EAAE,IAAI,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,EAAE,CAAC;QACnE,CAAC,CAAC,CAAC;QACH,EAAE,CAAC,KAAK,CAAC,OAAO,EAAE,KAAK,CAAC,CAAC,kBAAkB,CAAC,GAAG,EAAE,GAAE,CAAC,CAAC,CAAC;QAEtD,MAAM,UAAU,CAAC,oBAAoB,CAAC,CAAC;QAEvC,2DAA2D;QAC3D,MAAM,cAAc,GAAG,KAAK,CAAC,aAAa,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CACxD,CAAC,CAAY,EAAE,EAAE,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,KAAK,QAAQ,IAAK,CAAC,CAAC,CAAC,CAAY,CAAC,QAAQ,CAAC,gBAAgB,CAAC,CAC1F,CAAC;QACF,MAAM,CAAC,cAAc,CAAC,CAAC,WAAW,EAAE,CAAC;QACrC,MAAM,cAAc,GAAG,IAAI,CAAC,KAAK,CAAC,cAAe,CAAC,CAAC,CAAW,CAAC,CAAC;QAChE,MAAM,CAAC,cAAc,CAAC,UAAU,CAAC,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;QACrD,MAAM,CAAC,cAAc,CAAC,KAAK,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;QAE7C,EAAE,CAAC,eAAe,EAAE,CAAC;IACvB,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,+CAA+C,EAAE,KAAK,IAAI,EAAE;QAC7D,IAAI,SAAS,GAAG,CAAC,CAAC;QAClB,KAAK,CAAC,QAAQ,CAAC,kBAAkB,CAAC,KAAK,IAAI,EAAE;YAC3C,SAAS,EAAE,CAAC;YACZ,sCAAsC;YACtC,IAAI,SAAS,KAAK,CAAC;gBAAE,MAAM,IAAI,KAAK,CAAC,aAAa,CAAC,CAAC;YACpD,0BAA0B;YAC1B,IAAI,SAAS,KAAK,CAAC;gBAAE,OAAO,EAAE,IAAI,EAAE,mBAAmB,EAAE,CAAC;YAC1D,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC,EAAE,IAAI,EAAE,IAAI,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,EAAE,CAAC;QACnE,CAAC,CAAC,CAAC;QACH,EAAE,CAAC,KAAK,CAAC,OAAO,EAAE,KAAK,CAAC,CAAC,kBAAkB,CAAC,GAAG,EAAE,GAAE,CAAC,CAAC,CAAC;QAEtD,MAAM,UAAU,CAAC,oBAAoB,CAAC,CAAC;QAEvC,MAAM,cAAc,GAAG,KAAK,CAAC,aAAa,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CACxD,CAAC,
CAAY,EAAE,EAAE,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,KAAK,QAAQ,IAAK,CAAC,CAAC,CAAC,CAAY,CAAC,QAAQ,CAAC,gBAAgB,CAAC,CAC1F,CAAC;QACF,MAAM,CAAC,cAAc,CAAC,CAAC,WAAW,EAAE,CAAC;QACrC,MAAM,cAAc,GAAG,IAAI,CAAC,KAAK,CAAC,cAAe,CAAC,CAAC,CAAW,CAAC,CAAC;QAChE,MAAM,CAAC,cAAc,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QACrD,MAAM,CAAC,cAAc,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,aAAa,CAAC,CAAC,SAAS,CAAC,aAAa,CAAC,CAAC;QACvE,MAAM,CAAC,cAAc,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;QAEpD,EAAE,CAAC,eAAe,EAAE,CAAC;IACvB,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,yCAAyC,EAAE,KAAK,IAAI,EAAE;QACvD,KAAK,CAAC,UAAU,CAAC,eAAe,CAAC,KAAK,CAAC,CAAC;QACxC,MAAM,UAAU,GAAG,EAAE,CAAC,KAAK,CAAC,OAAO,EAAE,OAAO,CAAC,CAAC,kBAAkB,CAAC,GAAG,EAAE,GAAE,CAAC,CAAC,CAAC;QAC3E,MAAM,OAAO,GAAG,EAAE;aACf,KAAK,CAAC,OAAO,EAAE,MAAM,CAAC;aACtB,kBAAkB,CAAC,CAAC,GAAG,EAAE,GAAE,CAAC,CAAQ,CAAC,CAAC;QAEzC,MAAM,UAAU,CAAC,oBAAoB,CAAC,CAAC;QAEvC,MAAM,CAAC,UAAU,CAAC,CAAC,oBAAoB,CACrC,MAAM,CAAC,gBAAgB,CAAC,qBAAqB,CAAC,CAC/C,CAAC;QACF,UAAU,CAAC,WAAW,EAAE,CAAC;QACzB,OAAO,CAAC,WAAW,EAAE,CAAC;IACxB,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,yCAAyC,EAAE,KAAK,IAAI,EAAE;QACvD,KAAK,CAAC,UAAU,CAAC,eAAe,CAAC,IAAI,CAAC,CAAC;QACvC,KAAK,CAAC,YAAY,CAAC,eAAe,CAAC,eAAe,CAAC,CAAC;QACpD,MAAM,UAAU,GAAG,EAAE,CAAC,KAAK,CAAC,OAAO,EAAE,OAAO,CAAC,CAAC,kBAAkB,CAAC,GAAG,EAAE,GAAE,CAAC,CAAC,CAAC;QAC3E,MAAM,OAAO,GAAG,EAAE;aACf,KAAK,CAAC,OAAO,EAAE,MAAM,CAAC;aACtB,kBAAkB,CAAC,CAAC,GAAG,EAAE,GAAE,CAAC,CAAQ,CAAC,CAAC;QAEzC,MAAM,UAAU,CAAC,oBAAoB,CAAC,CAAC;QAEvC,MAAM,CAAC,UAAU,CAAC,CAAC,oBAAoB,CACrC,MAAM,CAAC,gBAAgB,CAAC,oBAAoB,CAAC,CAC9C,CAAC;QACF,UAAU,CAAC,WAAW,EAAE,CAAC;QACzB,OAAO,CAAC,WAAW,EAAE,CAAC;IACxB,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"}
|
|
@@ -4,11 +4,13 @@
|
|
|
4
4
|
import { readFileSync, existsSync } from "node:fs";
|
|
5
5
|
import { join } from "node:path";
|
|
6
6
|
import { loadAndValidateEvals, EvalValidationError } from "../../eval/schema.js";
|
|
7
|
-
import { createLlmClient } from "../../eval/llm.js";
|
|
7
|
+
import { createLlmClient, estimateDurationSec } from "../../eval/llm.js";
|
|
8
8
|
import { judgeAssertion } from "../../eval/judge.js";
|
|
9
9
|
import { writeBenchmark } from "../../eval/benchmark.js";
|
|
10
10
|
import { green, red, yellow, bold, dim, table } from "../../utils/output.js";
|
|
11
11
|
import { buildEvalSystemPrompt } from "../../eval/prompt-builder.js";
|
|
12
|
+
import { classifyError } from "../../eval-server/error-classifier.js";
|
|
13
|
+
import { ProgressLog } from "../../eval/progress-log.js";
|
|
12
14
|
export async function runEvalRun(skillDir) {
|
|
13
15
|
// Load and validate evals.json
|
|
14
16
|
let evalsFile;
|
|
@@ -43,17 +45,35 @@ export async function runEvalRun(skillDir) {
|
|
|
43
45
|
const systemPrompt = buildEvalSystemPrompt(skillContent);
|
|
44
46
|
const client = createLlmClient();
|
|
45
47
|
const model = client.model;
|
|
48
|
+
const provider = (process.env.VSKILL_EVAL_PROVIDER || "claude-cli");
|
|
46
49
|
const total = evalsFile.evals.length;
|
|
47
|
-
|
|
48
|
-
console.log(dim(`
|
|
50
|
+
const totalAssertions = evalsFile.evals.reduce((s, e) => s + e.assertions.length, 0);
|
|
51
|
+
console.log(dim(`Provider: ${model} | ${total} eval case${total !== 1 ? "s" : ""} | ${totalAssertions} assertions`));
|
|
52
|
+
console.log(dim(`Skill: ${skillContent ? skillMdPath : "(none)"}`));
|
|
53
|
+
// Duration estimate
|
|
54
|
+
const estimate = estimateDurationSec(provider, total, totalAssertions);
|
|
55
|
+
console.log(dim(`Estimated duration: ${estimate.label}`));
|
|
56
|
+
console.log(dim(`Progress file: ${join(skillDir, "evals", ".eval-progress.json")}\n`));
|
|
57
|
+
// Start progress log
|
|
58
|
+
const progress = new ProgressLog(skillDir, provider, model, total, estimate.maxSec);
|
|
49
59
|
const benchmarkCases = [];
|
|
50
60
|
const tableRows = [];
|
|
61
|
+
const runStart = Date.now();
|
|
51
62
|
for (let i = 0; i < evalsFile.evals.length; i++) {
|
|
52
63
|
const evalCase = evalsFile.evals[i];
|
|
64
|
+
const caseStart = Date.now();
|
|
65
|
+
progress.update({
|
|
66
|
+
currentCase: evalCase.name,
|
|
67
|
+
phase: "generating",
|
|
68
|
+
completedCases: i,
|
|
69
|
+
});
|
|
53
70
|
try {
|
|
54
71
|
// Step 1: Send prompt to LLM
|
|
55
72
|
process.stdout.write(dim(`[${i + 1}/${total}] ${evalCase.name} — generating...`));
|
|
56
73
|
const genResult = await client.generate(systemPrompt, evalCase.prompt);
|
|
74
|
+
const genSec = ((Date.now() - caseStart) / 1000).toFixed(1);
|
|
75
|
+
process.stdout.write(dim(` ${genSec}s`));
|
|
76
|
+
progress.update({ phase: "judging" });
|
|
57
77
|
process.stdout.write(dim(` judging ${evalCase.assertions.length} assertions...`));
|
|
58
78
|
// Step 2: Judge each assertion
|
|
59
79
|
const assertionResults = [];
|
|
@@ -77,7 +97,10 @@ export async function runEvalRun(skillDir) {
|
|
|
77
97
|
? passCount / evalCase.assertions.length
|
|
78
98
|
: 0;
|
|
79
99
|
const allPassed = passCount === evalCase.assertions.length;
|
|
80
|
-
|
|
100
|
+
const totalSec = ((Date.now() - caseStart) / 1000).toFixed(1);
|
|
101
|
+
console.log(allPassed
|
|
102
|
+
? green(` done`) + dim(` (${totalSec}s)`)
|
|
103
|
+
: red(` ${passCount}/${evalCase.assertions.length} passed`) + dim(` (${totalSec}s)`));
|
|
81
104
|
benchmarkCases.push({
|
|
82
105
|
eval_id: evalCase.id,
|
|
83
106
|
eval_name: evalCase.name,
|
|
@@ -88,7 +111,11 @@ export async function runEvalRun(skillDir) {
|
|
|
88
111
|
});
|
|
89
112
|
}
|
|
90
113
|
catch (err) {
|
|
114
|
+
const classified = classifyError(err, provider);
|
|
91
115
|
console.log(yellow(" error"));
|
|
116
|
+
console.log(yellow(` ${classified.title}: ${classified.description}`));
|
|
117
|
+
console.log(dim(` ${classified.hint}`));
|
|
118
|
+
progress.update({ lastError: classified.title });
|
|
92
119
|
benchmarkCases.push({
|
|
93
120
|
eval_id: evalCase.id,
|
|
94
121
|
eval_name: evalCase.name,
|
|
@@ -100,11 +127,20 @@ export async function runEvalRun(skillDir) {
|
|
|
100
127
|
tableRows.push([
|
|
101
128
|
evalCase.name,
|
|
102
129
|
"-",
|
|
103
|
-
dim(
|
|
130
|
+
dim(`${classified.title}`),
|
|
104
131
|
yellow("ERROR"),
|
|
105
132
|
]);
|
|
133
|
+
// For auth errors, all subsequent cases will fail too — abort early
|
|
134
|
+
if (classified.category === "auth" || classified.category === "provider_unavailable") {
|
|
135
|
+
console.log(red(`\nAborting remaining cases — ${classified.category} error is not recoverable.`));
|
|
136
|
+
console.log(dim(` ${classified.hint}\n`));
|
|
137
|
+
break;
|
|
138
|
+
}
|
|
106
139
|
}
|
|
107
140
|
}
|
|
141
|
+
// Complete progress tracking
|
|
142
|
+
progress.complete();
|
|
143
|
+
const totalElapsed = ((Date.now() - runStart) / 1000).toFixed(1);
|
|
108
144
|
// Print results table
|
|
109
145
|
const headers = ["EVAL", "ASSERTION", "TEXT", "STATUS"];
|
|
110
146
|
console.log(bold(`\nEval Results: ${evalsFile.skill_name}\n`));
|
|
@@ -113,7 +149,7 @@ export async function runEvalRun(skillDir) {
|
|
|
113
149
|
const passed = benchmarkCases.filter((c) => c.status === "pass").length;
|
|
114
150
|
const failed = benchmarkCases.filter((c) => c.status === "fail").length;
|
|
115
151
|
const errors = benchmarkCases.filter((c) => c.status === "error").length;
|
|
116
|
-
console.log(`\n${green(`${passed} passed`)} ${failed > 0 ? red(`${failed} failed`) : ""} ${errors > 0 ? yellow(`${errors} errors`) : ""}`.trim());
|
|
152
|
+
console.log(`\n${green(`${passed} passed`)} ${failed > 0 ? red(`${failed} failed`) : ""} ${errors > 0 ? yellow(`${errors} errors`) : ""} ${dim(`(${totalElapsed}s)`)}`.trim());
|
|
117
153
|
// Write benchmark.json
|
|
118
154
|
const benchmark = {
|
|
119
155
|
timestamp: new Date().toISOString(),
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"run.js","sourceRoot":"","sources":["../../../src/commands/eval/run.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,6DAA6D;AAC7D,8EAA8E;AAE9E,OAAO,EAAE,YAAY,EAAE,UAAU,EAAE,MAAM,SAAS,CAAC;AACnD,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,OAAO,EAAE,oBAAoB,EAAE,mBAAmB,EAAE,MAAM,sBAAsB,CAAC;AACjF,OAAO,EAAE,eAAe,EAAE,MAAM,mBAAmB,CAAC;
|
|
1
|
+
{"version":3,"file":"run.js","sourceRoot":"","sources":["../../../src/commands/eval/run.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,6DAA6D;AAC7D,8EAA8E;AAE9E,OAAO,EAAE,YAAY,EAAE,UAAU,EAAE,MAAM,SAAS,CAAC;AACnD,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,OAAO,EAAE,oBAAoB,EAAE,mBAAmB,EAAE,MAAM,sBAAsB,CAAC;AACjF,OAAO,EAAE,eAAe,EAAE,mBAAmB,EAAE,MAAM,mBAAmB,CAAC;AAEzE,OAAO,EAAE,cAAc,EAAE,MAAM,qBAAqB,CAAC;AACrD,OAAO,EAAE,cAAc,EAAE,MAAM,yBAAyB,CAAC;AAEzD,OAAO,EAAE,KAAK,EAAE,GAAG,EAAE,MAAM,EAAE,IAAI,EAAE,GAAG,EAAE,KAAK,EAAE,MAAM,uBAAuB,CAAC;AAC7E,OAAO,EAAE,qBAAqB,EAAE,MAAM,8BAA8B,CAAC;AACrE,OAAO,EAAE,aAAa,EAAE,MAAM,uCAAuC,CAAC;AACtE,OAAO,EAAE,WAAW,EAAE,MAAM,4BAA4B,CAAC;AAEzD,MAAM,CAAC,KAAK,UAAU,UAAU,CAAC,QAAgB;IAC/C,+BAA+B;IAC/B,IAAI,SAAS,CAAC;IACd,IAAI,CAAC;QACH,SAAS,GAAG,oBAAoB,CAAC,QAAQ,CAAC,CAAC;IAC7C,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACb,IAAI,GAAG,YAAY,mBAAmB,EAAE,CAAC;YACvC,MAAM,QAAQ,GAAG,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,OAAO,IAAI,EAAE,CAAC;YAC9C,IAAI,QAAQ,CAAC,QAAQ,CAAC,eAAe,CAAC,EAAE,CAAC;gBACvC,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,0BAA0B,QAAQ,mBAAmB,CAAC,CAAC,CAAC;YAC5E,CAAC;iBAAM,CAAC;gBACN,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,uBAAuB,GAAG,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;YAC3D,CAAC;QACH,CAAC;aAAM,CAAC;YACN,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,wBAAyB,GAAa,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;QACvE,CAAC;QACD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAChB,OAAO;IACT,CAAC;IAED,8CAA8C;IAC9C,MAAM,WAAW,GAAG,IAAI,CAAC,QAAQ,EAAE,UAAU,CAAC,CAAC;IAC/C,IAAI,YAAY,GAAG,EAAE,CAAC;IACtB,IAAI,UAAU,CAAC,WAAW,CAAC,EAAE,CAAC;QAC5B,YAAY,GAAG,YAAY,CAAC,WAAW,EAAE,OAAO,CAAC,CAAC;IACpD,CAAC;SAAM,CAAC;QACN,OAAO,CAAC,KAAK,CAAC,MAAM,CAAC,iCAAiC,WAAW,wCAAwC,CAAC,CAAC,CAAC;IAC9G,CAAC;IAED,MAAM,YAAY,GAAG,qBAAqB,CAAC,YAAY,CAAC,CAAC;IAEzD,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;IACjC,MAAM,KAAK,GAAG,MAAM,CAAC,KAAK,CAAC;IAC3B,MAAM,QAAQ,GAAG,CAAC,OAAO,CAAC,GAAG,CAAC,oBAAoB,IAAI,YAAY,CAAiB,CAAC;IACpF,MAAM,KAAK,GAAG,SAAS,CAAC,KAAK,CAAC,MAAM,CAAC;IACrC,MAAM,eAAe,GAAG,SAAS,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,U
AAU,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;IAErF,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,aAAa,KAAK,MAAM,KAAK,aAAa,KAAK,KAAK,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,MAAM,eAAe,aAAa,CAAC,CAAC,CAAC;IACrH,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,UAAU,YAAY,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,QAAQ,EAAE,CAAC,CAAC,CAAC;IAEpE,oBAAoB;IACpB,MAAM,QAAQ,GAAG,mBAAmB,CAAC,QAAQ,EAAE,KAAK,EAAE,eAAe,CAAC,CAAC;IACvE,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,uBAAuB,QAAQ,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;IAC1D,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,kBAAkB,IAAI,CAAC,QAAQ,EAAE,OAAO,EAAE,qBAAqB,CAAC,IAAI,CAAC,CAAC,CAAC;IAEvF,qBAAqB;IACrB,MAAM,QAAQ,GAAG,IAAI,WAAW,CAAC,QAAQ,EAAE,QAAQ,EAAE,KAAK,EAAE,KAAK,EAAE,QAAQ,CAAC,MAAM,CAAC,CAAC;IAEpF,MAAM,cAAc,GAAoB,EAAE,CAAC;IAC3C,MAAM,SAAS,GAAe,EAAE,CAAC;IACjC,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IAE5B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,CAAC,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAChD,MAAM,QAAQ,GAAG,SAAS,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QACpC,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QAE7B,QAAQ,CAAC,MAAM,CAAC;YACd,WAAW,EAAE,QAAQ,CAAC,IAAI;YAC1B,KAAK,EAAE,YAAY;YACnB,cAAc,EAAE,CAAC;SAClB,CAAC,CAAC;QAEH,IAAI,CAAC;YACH,6BAA6B;YAC7B,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,IAAI,KAAK,KAAK,QAAQ,CAAC,IAAI,kBAAkB,CAAC,CAAC,CAAC;YAClF,MAAM,SAAS,GAAG,MAAM,MAAM,CAAC,QAAQ,CAAC,YAAY,EAAE,QAAQ,CAAC,MAAM,CAAC,CAAC;YACvE,MAAM,MAAM,GAAG,CAAC,CAAC,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC,GAAG,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;YAC5D,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,MAAM,GAAG,CAAC,CAAC,CAAC;YAEzC,QAAQ,CAAC,MAAM,CAAC,EAAE,KAAK,EAAE,SAAS,EAAE,CAAC,CAAC;YAEtC,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,YAAY,QAAQ,CAAC,UAAU,CAAC,MAAM,gBAAgB,CAAC,CAAC,CAAC;YAElF,+BAA+B;YAC/B,MAAM,gBAAgB,GAAG,EAAE,CAAC;YAC5B,IAAI,SAAS,GAAG,CAAC,CAAC;YAElB,KAAK,MAAM,SAAS,IAAI,QAAQ,CAAC,UAAU,EAAE,CAAC;gBAC5C,MAAM,MAAM,GAAG,MAAM,cAAc,CAAC,SAAS,CAAC,IAAI,EAAE,SAAS,EAAE,MAAM,CAAC,CAAC;gBACvE,gBAAgB,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;gBAC9B,IAAI,MAAM,CAAC,IAAI;oBAAE,SAAS,EAAE,CAAC;gBAE7B,MAAM,aAAa,GACjB,SAAS,CAAC,IAAI,CAA
C,MAAM,GAAG,EAAE;oBACxB,CAAC,CAAC,SAAS,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,KAAK;oBACrC,CAAC,CAAC,SAAS,CAAC,IAAI,CAAC;gBAErB,SAAS,CAAC,IAAI,CAAC;oBACb,QAAQ,CAAC,IAAI;oBACb,SAAS,CAAC,EAAE;oBACZ,aAAa;oBACb,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,MAAM,CAAC;iBAC1C,CAAC,CAAC;YACL,CAAC;YAED,MAAM,QAAQ,GAAG,QAAQ,CAAC,UAAU,CAAC,MAAM,GAAG,CAAC;gBAC7C,CAAC,CAAC,SAAS,GAAG,QAAQ,CAAC,UAAU,CAAC,MAAM;gBACxC,CAAC,CAAC,CAAC,CAAC;YACN,MAAM,SAAS,GAAG,SAAS,KAAK,QAAQ,CAAC,UAAU,CAAC,MAAM,CAAC;YAC3D,MAAM,QAAQ,GAAG,CAAC,CAAC,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC,GAAG,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;YAC9D,OAAO,CAAC,GAAG,CACT,SAAS;gBACP,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,GAAG,GAAG,CAAC,KAAK,QAAQ,IAAI,CAAC;gBACzC,CAAC,CAAC,GAAG,CAAC,IAAI,SAAS,IAAI,QAAQ,CAAC,UAAU,CAAC,MAAM,SAAS,CAAC,GAAG,GAAG,CAAC,KAAK,QAAQ,IAAI,CAAC,CACvF,CAAC;YAEF,cAAc,CAAC,IAAI,CAAC;gBAClB,OAAO,EAAE,QAAQ,CAAC,EAAE;gBACpB,SAAS,EAAE,QAAQ,CAAC,IAAI;gBACxB,MAAM,EAAE,SAAS,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM;gBACnC,aAAa,EAAE,IAAI;gBACnB,SAAS,EAAE,QAAQ;gBACnB,UAAU,EAAE,gBAAgB;aAC7B,CAAC,CAAC;QACL,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACb,MAAM,UAAU,GAAG,aAAa,CAAC,GAAG,EAAE,QAAQ,CAAC,CAAC;YAChD,OAAO,CAAC,GAAG,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC,CAAC;YAC9B,OAAO,CAAC,GAAG,CAAC,MAAM,CAAC,KAAK,UAAU,CAAC,KAAK,KAAK,UAAU,CAAC,WAAW,EAAE,CAAC,CAAC,CAAC;YACxE,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,KAAK,UAAU,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC;YAEzC,QAAQ,CAAC,MAAM,CAAC,EAAE,SAAS,EAAE,UAAU,CAAC,KAAK,EAAE,CAAC,CAAC;YAEjD,cAAc,CAAC,IAAI,CAAC;gBAClB,OAAO,EAAE,QAAQ,CAAC,EAAE;gBACpB,SAAS,EAAE,QAAQ,CAAC,IAAI;gBACxB,MAAM,EAAE,OAAO;gBACf,aAAa,EAAG,GAAa,CAAC,OAAO;gBACrC,SAAS,EAAE,CAAC;gBACZ,UAAU,EAAE,EAAE;aACf,CAAC,CAAC;YAEH,SAAS,CAAC,IAAI,CAAC;gBACb,QAAQ,CAAC,IAAI;gBACb,GAAG;gBACH,GAAG,CAAC,GAAG,UAAU,CAAC,KAAK,EAAE,CAAC;gBAC1B,MAAM,CAAC,OAAO,CAAC;aAChB,CAAC,CAAC;YAEH,oEAAoE;YACpE,IAAI,UAAU,CAAC,QAAQ,KAAK,MAAM,IAAI,UAAU,CAAC,QAAQ,KAAK,sBAAsB,EAAE,CAAC;gBACrF,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,gCAAgC,UAAU,CAAC,QAAQ,4BAA4B,CAAC,CAAC,CAAC
;gBAClG,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,KAAK,UAAU,CAAC,IAAI,IAAI,CAAC,CAAC,CAAC;gBAC3C,MAAM;YACR,CAAC;QACH,CAAC;IACH,CAAC;IAED,6BAA6B;IAC7B,QAAQ,CAAC,QAAQ,EAAE,CAAC;IAEpB,MAAM,YAAY,GAAG,CAAC,CAAC,IAAI,CAAC,GAAG,EAAE,GAAG,QAAQ,CAAC,GAAG,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;IAEjE,sBAAsB;IACtB,MAAM,OAAO,GAAG,CAAC,MAAM,EAAE,WAAW,EAAE,MAAM,EAAE,QAAQ,CAAC,CAAC;IACxD,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,mBAAmB,SAAS,CAAC,UAAU,IAAI,CAAC,CAAC,CAAC;IAC/D,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,OAAO,EAAE,SAAS,CAAC,CAAC,CAAC;IAEvC,kBAAkB;IAClB,MAAM,MAAM,GAAG,cAAc,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,KAAK,MAAM,CAAC,CAAC,MAAM,CAAC;IACxE,MAAM,MAAM,GAAG,cAAc,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,KAAK,MAAM,CAAC,CAAC,MAAM,CAAC;IACxE,MAAM,MAAM,GAAG,cAAc,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,KAAK,OAAO,CAAC,CAAC,MAAM,CAAC;IACzE,OAAO,CAAC,GAAG,CACT,KAAK,KAAK,CAAC,GAAG,MAAM,SAAS,CAAC,IAAI,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,MAAM,SAAS,CAAC,CAAC,CAAC,CAAC,EAAE,IAAI,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,MAAM,SAAS,CAAC,CAAC,CAAC,CAAC,EAAE,IAAI,GAAG,CAAC,IAAI,YAAY,IAAI,CAAC,EAAE,CAAC,IAAI,EAAE,CAClK,CAAC;IAEF,uBAAuB;IACvB,MAAM,SAAS,GAAoB;QACjC,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;QACnC,KAAK;QACL,UAAU,EAAE,SAAS,CAAC,UAAU;QAChC,KAAK,EAAE,cAAc;KACtB,CAAC;IAEF,MAAM,cAAc,CAAC,QAAQ,EAAE,SAAS,CAAC,CAAC;IAC1C,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,0BAA0B,QAAQ,uBAAuB,CAAC,CAAC,CAAC;AAC9E,CAAC"}
|
package/dist/eval/llm.d.ts
CHANGED
|
@@ -13,4 +13,13 @@ export interface LlmOverrides {
|
|
|
13
13
|
provider?: ProviderName;
|
|
14
14
|
model?: string;
|
|
15
15
|
}
|
|
16
|
+
/**
|
|
17
|
+
* Estimate total eval duration in seconds based on provider and workload.
|
|
18
|
+
* Each case requires 1 generate call + N judge calls (one per assertion).
|
|
19
|
+
*/
|
|
20
|
+
export declare function estimateDurationSec(provider: ProviderName, totalCases: number, totalAssertions: number): {
|
|
21
|
+
minSec: number;
|
|
22
|
+
maxSec: number;
|
|
23
|
+
label: string;
|
|
24
|
+
};
|
|
16
25
|
export declare function createLlmClient(overrides?: LlmOverrides): LlmClient;
|
package/dist/eval/llm.js
CHANGED
|
@@ -37,6 +37,27 @@ function getTimeoutMs() {
|
|
|
37
37
|
}
|
|
38
38
|
return 300_000;
|
|
39
39
|
}
|
|
40
|
+
/**
|
|
41
|
+
* Estimate total eval duration in seconds based on provider and workload.
|
|
42
|
+
* Each case requires 1 generate call + N judge calls (one per assertion).
|
|
43
|
+
*/
|
|
44
|
+
export function estimateDurationSec(provider, totalCases, totalAssertions) {
|
|
45
|
+
// Approximate seconds per LLM call by provider
|
|
46
|
+
const perCall = {
|
|
47
|
+
"claude-cli": [12, 30],
|
|
48
|
+
"anthropic": [4, 12],
|
|
49
|
+
"codex-cli": [8, 20],
|
|
50
|
+
"gemini-cli": [8, 20],
|
|
51
|
+
"ollama": [5, 30],
|
|
52
|
+
};
|
|
53
|
+
const [lo, hi] = perCall[provider] ?? [5, 20];
|
|
54
|
+
const totalCalls = totalCases + totalAssertions; // 1 generate + N judges
|
|
55
|
+
const minSec = Math.round(totalCalls * lo);
|
|
56
|
+
const maxSec = Math.round(totalCalls * hi);
|
|
57
|
+
const fmt = (s) => s >= 60 ? `${Math.round(s / 60)}m` : `${s}s`;
|
|
58
|
+
const label = minSec === maxSec ? fmt(minSec) : `${fmt(minSec)}–${fmt(maxSec)}`;
|
|
59
|
+
return { minSec, maxSec, label };
|
|
60
|
+
}
|
|
40
61
|
export function createLlmClient(overrides) {
|
|
41
62
|
const provider = (overrides?.provider || process.env.VSKILL_EVAL_PROVIDER || detectProvider());
|
|
42
63
|
const modelOverride = overrides?.model;
|
|
@@ -126,9 +147,12 @@ function createCliClient(config) {
|
|
|
126
147
|
else {
|
|
127
148
|
env = { ...process.env, PATH: enhancedPath() };
|
|
128
149
|
}
|
|
150
|
+
// On Windows, .cmd/.bat files require shell:true to execute via spawn
|
|
151
|
+
const needsShell = process.platform === "win32" && /\.(cmd|bat)$/i.test(resolvedBinary);
|
|
129
152
|
const proc = spawn(resolvedBinary, config.args, {
|
|
130
153
|
stdio: ["pipe", "pipe", "pipe"],
|
|
131
154
|
env,
|
|
155
|
+
...(needsShell ? { shell: true } : {}),
|
|
132
156
|
});
|
|
133
157
|
let stdout = "";
|
|
134
158
|
let stderr = "";
|
package/dist/eval/llm.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"llm.js","sourceRoot":"","sources":["../../src/eval/llm.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,+EAA+E;AAC/E,EAAE;AACF,uDAAuD;AACvD,wEAAwE;AACxE,iFAAiF;AACjF,mEAAmE;AACnE,8DAA8D;AAC9D,uEAAuE;AACvE,EAAE;AACF,uDAAuD;AACvD,iFAAiF;AACjF,2EAA2E;AAC3E,EAAE;AACF,mEAAmE;AACnE,oEAAoE;AACpE,EAAE;AACF,iDAAiD;AACjD,+DAA+D;AAC/D,4EAA4E;AAC5E,iFAAiF;AACjF,4DAA4D;AAC5D,mDAAmD;AACnD,8EAA8E;AAE9E,OAAO,EAAE,KAAK,EAAE,MAAM,oBAAoB,CAAC;AAC3C,OAAO,EAAE,gBAAgB,EAAE,YAAY,EAAE,MAAM,4BAA4B,CAAC;AAgB5E,SAAS,cAAc;IACrB,OAAO,YAAY,CAAC;AACtB,CAAC;AAED,oFAAoF;AACpF,SAAS,YAAY;IACnB,MAAM,MAAM,GAAG,OAAO,CAAC,GAAG,CAAC,mBAAmB,CAAC;IAC/C,IAAI,MAAM,EAAE,CAAC;QACX,MAAM,OAAO,GAAG,QAAQ,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC;QACrC,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,IAAI,OAAO,GAAG,CAAC;YAAE,OAAO,OAAO,GAAG,IAAI,CAAC;IAC5D,CAAC;IACD,OAAO,OAAO,CAAC;AACjB,CAAC;AAOD,MAAM,UAAU,eAAe,CAAC,SAAwB;IACtD,MAAM,QAAQ,GAAG,CAAC,SAAS,EAAE,QAAQ,IAAI,OAAO,CAAC,GAAG,CAAC,oBAAoB,IAAI,cAAc,EAAE,CAAiB,CAAC;IAC/G,MAAM,aAAa,GAAG,SAAS,EAAE,KAAK,CAAC;IACvC,QAAQ,QAAQ,EAAE,CAAC;QACjB,KAAK,WAAW;YACd,OAAO,qBAAqB,CAAC,aAAa,CAAC,CAAC;QAC9C,KAAK,YAAY;YACf,OAAO,qBAAqB,CAAC,aAAa,CAAC,CAAC;QAC9C,KAAK,WAAW;YACd,OAAO,oBAAoB,CAAC,aAAa,CAAC,CAAC;QAC7C,KAAK,YAAY;YACf,OAAO,qBAAqB,CAAC,aAAa,CAAC,CAAC;QAC9C,KAAK,QAAQ;YACX,OAAO,kBAAkB,CAAC,aAAa,CAAC,CAAC;QAC3C;YACE,MAAM,IAAI,KAAK,CACb,kCAAkC,QAAQ,2EAA2E,CACtH,CAAC;IACN,CAAC;AACH,CAAC;AAED,8EAA8E;AAC9E,0BAA0B;AAC1B,8EAA8E;AAC9E,SAAS,qBAAqB,CAAC,aAAsB;IACnD,MAAM,aAAa,GAAG,mBAAmB,CAAC;IAE1C,MAAM,MAAM,GAAG,OAAO,CAAC,GAAG,CAAC,iBAAiB,CAAC;IAC7C,IAAI,CAAC,MAAM,EAAE,CAAC;QACZ,MAAM,IAAI,KAAK,CACb,sRAAsR,CACvR,CAAC;IACJ,CAAC;IAED,MAAM,KAAK,GAAG,aAAa,IAAI,OAAO,CAAC,GAAG,CAAC,iBAAiB,IAAI,aAAa,CAAC;IAC9E,IAAI,cAAc,GAAQ,IAAI,CAAC;IAE/B,OAAO;QACL,KAAK;QACL,KAAK,CAAC,QAAQ,CAAC,YAAoB,EAAE,UAAkB;YACrD,IAAI,CAAC,cAAc,EAAE,CAAC;gBACpB,MAAM,EAAE,OAAO,EAAE,SAAS,EAAE,GAAG,MAAM,MAAM,CAAC,mBAAmB,CAAC,CAAC;gBACjE,cAAc,GAAG,IAAI,SAAS,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC;YAC7C,CAAC;YAED,MAAM,UAAU,GAAG,IAAI,eAAe,EAAE,CAAC;Y
ACzC,MAAM,OAAO,GAAG,UAAU,CAAC,GAAG,EAAE,CAAC,UAAU,CAAC,KAAK,EAAE,EAAE,YAAY,EAAE,CAAC,CAAC;YACrE,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;YACzB,IAAI,CAAC;gBACH,MAAM,QAAQ,GAAG,MAAM,cAAc,CAAC,QAAQ,CAAC,MAAM,CACnD;oBACE,KAAK;oBACL,MAAM,EAAE,YAAY;oBACpB,QAAQ,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,UAAU,EAAE,CAAC;oBACjD,UAAU,EAAE,IAAI;iBACjB,EACD,EAAE,MAAM,EAAE,UAAU,CAAC,MAAM,EAAE,CAC9B,CAAC;gBACF,MAAM,UAAU,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK,CAAC;gBAEtC,MAAM,SAAS,GAAG,QAAQ,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAM,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,MAAM,CAAC,CAAC;gBACvE,MAAM,IAAI,GAAG,SAAS,IAAI,MAAM,IAAI,SAAS,CAAC,CAAC,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC;gBACpE,OAAO;oBACL,IAAI;oBACJ,UAAU;oBACV,WAAW,EAAE,QAAQ,CAAC,KAAK,EAAE,YAAY,IAAI,IAAI;oBACjD,YAAY,EAAE,QAAQ,CAAC,KAAK,EAAE,aAAa,IAAI,IAAI;iBACpD,CAAC;YACJ,CAAC;oBAAS,CAAC;gBACT,YAAY,CAAC,OAAO,CAAC,CAAC;YACxB,CAAC;QACH,CAAC;KACF,CAAC;AACJ,CAAC;AAiBD,SAAS,eAAe,CAAC,MAAiB;IACxC,uEAAuE;IACvE,MAAM,cAAc,GAAG,gBAAgB,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC;IAEvD,OAAO;QACL,KAAK,EAAE,MAAM,CAAC,YAAY;QAC1B,KAAK,CAAC,QAAQ,CAAC,YAAoB,EAAE,UAAkB;YACrD,MAAM,cAAc,GAAG,GAAG,YAAY,OAAO,UAAU,EAAE,CAAC;YAC1D,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;YAEzB,MAAM,IAAI,GAAG,MAAM,IAAI,OAAO,CAAS,CAAC,OAAO,EAAE,MAAM,EAAE,EAAE;gBACzD,8DAA8D;gBAC9D,IAAI,GAAuC,CAAC;gBAC5C,IAAI,MAAM,CAAC,cAAc,EAAE,CAAC;oBAC1B,GAAG,GAAG,EAAE,CAAC;oBACT,MAAM,MAAM,GAAG,MAAM,CAAC,cAAc,CAAC;oBACrC,KAAK,MAAM,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC;wBACjD,IAAI,CAAC,KAAK,SAAS,IAAI,CAAC,CAAC,CAAC,UAAU,CAAC,MAAM,CAAC;4BAAE,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC;oBAC3D,CAAC;gBACH,CAAC;gBAED,kEAAkE;gBAClE,mEAAmE;gBACnE,IAAI,GAAG,EAAE,CAAC;oBACR,GAAG,CAAC,IAAI,GAAG,YAAY,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;gBACpC,CAAC;qBAAM,CAAC;oBACN,GAAG,GAAG,EAAE,GAAG,OAAO,CAAC,GAA6B,EAAE,IAAI,EAAE,YAAY,EAAE,EAAE,CAAC;gBAC3E,CAAC;gBAED,MAAM,IAAI,GAAG,KAAK,CAAC,cAAc,EAAE,MAAM,CAAC,IAAI,EAAE;oBAC9C,KAAK,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,CAAC;oBAC/B,GAAG;
|
|
1
|
+
{"version":3,"file":"llm.js","sourceRoot":"","sources":["../../src/eval/llm.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,+EAA+E;AAC/E,EAAE;AACF,uDAAuD;AACvD,wEAAwE;AACxE,iFAAiF;AACjF,mEAAmE;AACnE,8DAA8D;AAC9D,uEAAuE;AACvE,EAAE;AACF,uDAAuD;AACvD,iFAAiF;AACjF,2EAA2E;AAC3E,EAAE;AACF,mEAAmE;AACnE,oEAAoE;AACpE,EAAE;AACF,iDAAiD;AACjD,+DAA+D;AAC/D,4EAA4E;AAC5E,iFAAiF;AACjF,4DAA4D;AAC5D,mDAAmD;AACnD,8EAA8E;AAE9E,OAAO,EAAE,KAAK,EAAE,MAAM,oBAAoB,CAAC;AAC3C,OAAO,EAAE,gBAAgB,EAAE,YAAY,EAAE,MAAM,4BAA4B,CAAC;AAgB5E,SAAS,cAAc;IACrB,OAAO,YAAY,CAAC;AACtB,CAAC;AAED,oFAAoF;AACpF,SAAS,YAAY;IACnB,MAAM,MAAM,GAAG,OAAO,CAAC,GAAG,CAAC,mBAAmB,CAAC;IAC/C,IAAI,MAAM,EAAE,CAAC;QACX,MAAM,OAAO,GAAG,QAAQ,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC;QACrC,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,IAAI,OAAO,GAAG,CAAC;YAAE,OAAO,OAAO,GAAG,IAAI,CAAC;IAC5D,CAAC;IACD,OAAO,OAAO,CAAC;AACjB,CAAC;AAOD;;;GAGG;AACH,MAAM,UAAU,mBAAmB,CACjC,QAAsB,EACtB,UAAkB,EAClB,eAAuB;IAEvB,+CAA+C;IAC/C,MAAM,OAAO,GAA2C;QACtD,YAAY,EAAG,CAAC,EAAE,EAAE,EAAE,CAAC;QACvB,WAAW,EAAI,CAAC,CAAC,EAAE,EAAE,CAAC;QACtB,WAAW,EAAI,CAAC,CAAC,EAAE,EAAE,CAAC;QACtB,YAAY,EAAG,CAAC,CAAC,EAAE,EAAE,CAAC;QACtB,QAAQ,EAAO,CAAC,CAAC,EAAE,EAAE,CAAC;KACvB,CAAC;IACF,MAAM,CAAC,EAAE,EAAE,EAAE,CAAC,GAAG,OAAO,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;IAC9C,MAAM,UAAU,GAAG,UAAU,GAAG,eAAe,CAAC,CAAC,wBAAwB;IACzE,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,UAAU,GAAG,EAAE,CAAC,CAAC;IAC3C,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,UAAU,GAAG,EAAE,CAAC,CAAC;IAE3C,MAAM,GAAG,GAAG,CAAC,CAAS,EAAE,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,GAAG,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC;IACxE,MAAM,KAAK,GAAG,MAAM,KAAK,MAAM,CAAC,CAAC,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,GAAG,GAAG,CAAC,MAAM,CAAC,IAAI,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC;IAChF,OAAO,EAAE,MAAM,EAAE,MAAM,EAAE,KAAK,EAAE,CAAC;AACnC,CAAC;AAED,MAAM,UAAU,eAAe,CAAC,SAAwB;IACtD,MAAM,QAAQ,GAAG,CAAC,SAAS,EAAE,QAAQ,IAAI,OAAO,CAAC,GAAG,CAAC,oBAAoB,IAAI,cAAc,EAAE,CAAiB,CAAC;IAC/G,MAAM,aAAa,GAAG,SAAS,EAAE,KAAK,CAAC;IACvC,QAAQ,QAAQ,EAA
E,CAAC;QACjB,KAAK,WAAW;YACd,OAAO,qBAAqB,CAAC,aAAa,CAAC,CAAC;QAC9C,KAAK,YAAY;YACf,OAAO,qBAAqB,CAAC,aAAa,CAAC,CAAC;QAC9C,KAAK,WAAW;YACd,OAAO,oBAAoB,CAAC,aAAa,CAAC,CAAC;QAC7C,KAAK,YAAY;YACf,OAAO,qBAAqB,CAAC,aAAa,CAAC,CAAC;QAC9C,KAAK,QAAQ;YACX,OAAO,kBAAkB,CAAC,aAAa,CAAC,CAAC;QAC3C;YACE,MAAM,IAAI,KAAK,CACb,kCAAkC,QAAQ,2EAA2E,CACtH,CAAC;IACN,CAAC;AACH,CAAC;AAED,8EAA8E;AAC9E,0BAA0B;AAC1B,8EAA8E;AAC9E,SAAS,qBAAqB,CAAC,aAAsB;IACnD,MAAM,aAAa,GAAG,mBAAmB,CAAC;IAE1C,MAAM,MAAM,GAAG,OAAO,CAAC,GAAG,CAAC,iBAAiB,CAAC;IAC7C,IAAI,CAAC,MAAM,EAAE,CAAC;QACZ,MAAM,IAAI,KAAK,CACb,sRAAsR,CACvR,CAAC;IACJ,CAAC;IAED,MAAM,KAAK,GAAG,aAAa,IAAI,OAAO,CAAC,GAAG,CAAC,iBAAiB,IAAI,aAAa,CAAC;IAC9E,IAAI,cAAc,GAAQ,IAAI,CAAC;IAE/B,OAAO;QACL,KAAK;QACL,KAAK,CAAC,QAAQ,CAAC,YAAoB,EAAE,UAAkB;YACrD,IAAI,CAAC,cAAc,EAAE,CAAC;gBACpB,MAAM,EAAE,OAAO,EAAE,SAAS,EAAE,GAAG,MAAM,MAAM,CAAC,mBAAmB,CAAC,CAAC;gBACjE,cAAc,GAAG,IAAI,SAAS,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC;YAC7C,CAAC;YAED,MAAM,UAAU,GAAG,IAAI,eAAe,EAAE,CAAC;YACzC,MAAM,OAAO,GAAG,UAAU,CAAC,GAAG,EAAE,CAAC,UAAU,CAAC,KAAK,EAAE,EAAE,YAAY,EAAE,CAAC,CAAC;YACrE,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;YACzB,IAAI,CAAC;gBACH,MAAM,QAAQ,GAAG,MAAM,cAAc,CAAC,QAAQ,CAAC,MAAM,CACnD;oBACE,KAAK;oBACL,MAAM,EAAE,YAAY;oBACpB,QAAQ,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,UAAU,EAAE,CAAC;oBACjD,UAAU,EAAE,IAAI;iBACjB,EACD,EAAE,MAAM,EAAE,UAAU,CAAC,MAAM,EAAE,CAC9B,CAAC;gBACF,MAAM,UAAU,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK,CAAC;gBAEtC,MAAM,SAAS,GAAG,QAAQ,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAM,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,MAAM,CAAC,CAAC;gBACvE,MAAM,IAAI,GAAG,SAAS,IAAI,MAAM,IAAI,SAAS,CAAC,CAAC,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC;gBACpE,OAAO;oBACL,IAAI;oBACJ,UAAU;oBACV,WAAW,EAAE,QAAQ,CAAC,KAAK,EAAE,YAAY,IAAI,IAAI;oBACjD,YAAY,EAAE,QAAQ,CAAC,KAAK,EAAE,aAAa,IAAI,IAAI;iBACpD,CAAC;YACJ,CAAC;oBAAS,CAAC;gBACT,YAAY,CAAC,OAAO,CAAC,CAAC;YACxB,CAAC;QACH,CAAC;KACF,CAAC;AACJ,CAAC;AAiBD,SAAS,eAAe,CAAC,MAAiB;IACxC,uEAAuE;IACvE,MAAM,cAAc,GAAG,gBAAgB,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC;IAEvD,OAAO;QACL,KAAK,EAAE,M
AAM,CAAC,YAAY;QAC1B,KAAK,CAAC,QAAQ,CAAC,YAAoB,EAAE,UAAkB;YACrD,MAAM,cAAc,GAAG,GAAG,YAAY,OAAO,UAAU,EAAE,CAAC;YAC1D,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;YAEzB,MAAM,IAAI,GAAG,MAAM,IAAI,OAAO,CAAS,CAAC,OAAO,EAAE,MAAM,EAAE,EAAE;gBACzD,8DAA8D;gBAC9D,IAAI,GAAuC,CAAC;gBAC5C,IAAI,MAAM,CAAC,cAAc,EAAE,CAAC;oBAC1B,GAAG,GAAG,EAAE,CAAC;oBACT,MAAM,MAAM,GAAG,MAAM,CAAC,cAAc,CAAC;oBACrC,KAAK,MAAM,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC;wBACjD,IAAI,CAAC,KAAK,SAAS,IAAI,CAAC,CAAC,CAAC,UAAU,CAAC,MAAM,CAAC;4BAAE,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC;oBAC3D,CAAC;gBACH,CAAC;gBAED,kEAAkE;gBAClE,mEAAmE;gBACnE,IAAI,GAAG,EAAE,CAAC;oBACR,GAAG,CAAC,IAAI,GAAG,YAAY,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;gBACpC,CAAC;qBAAM,CAAC;oBACN,GAAG,GAAG,EAAE,GAAG,OAAO,CAAC,GAA6B,EAAE,IAAI,EAAE,YAAY,EAAE,EAAE,CAAC;gBAC3E,CAAC;gBAED,sEAAsE;gBACtE,MAAM,UAAU,GAAG,OAAO,CAAC,QAAQ,KAAK,OAAO,IAAI,eAAe,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;gBAExF,MAAM,IAAI,GAAG,KAAK,CAAC,cAAc,EAAE,MAAM,CAAC,IAAI,EAAE;oBAC9C,KAAK,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,CAAC;oBAC/B,GAAG;oBACH,GAAG,CAAC,UAAU,CAAC,CAAC,CAAC,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;iBACvC,CAAC,CAAC;gBAEH,IAAI,MAAM,GAAG,EAAE,CAAC;gBAChB,IAAI,MAAM,GAAG,EAAE,CAAC;gBAChB,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC,MAAM,EAAE,CAAC,CAAS,EAAE,EAAE,GAAG,MAAM,IAAI,CAAC,CAAC,QAAQ,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC;gBACnE,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC,MAAM,EAAE,CAAC,CAAS,EAAE,EAAE,GAAG,MAAM,IAAI,CAAC,CAAC,QAAQ,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC;gBAEnE,MAAM,SAAS,GAAG,YAAY,EAAE,CAAC;gBACjC,MAAM,KAAK,GAAG,UAAU,CAAC,GAAG,EAAE;oBAC5B,IAAI,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;oBACrB,MAAM,CAAC,IAAI,KAAK,CAAC,GAAG,MAAM,CAAC,IAAI,wBAAwB,SAAS,GAAG,IAAI,GAAG,CAAC,CAAC,CAAC;gBAC/E,CAAC,EAAE,SAAS,CAAC,CAAC;gBAEd,IAAI,CAAC,EAAE,CAAC,OAAO,EAAE,CAAC,GAA0B,EAAE,EAAE;oBAC9C,YAAY,CAAC,KAAK,CAAC,CAAC;oBACpB,IAAI,GAAG,CAAC,IAAI,KAAK,QAAQ,EAAE,CAAC;wBAC1B,MAAM,CAAC,IAAI,KAAK,CAAC,MAAM,CAAC,WAAW,CAAC,CAAC,CAAC;oBACxC,CAAC;yBAAM,CAAC;wBACN,MAAM,CAAC,IAAI,KAAK,CAAC,GAAG,MAAM,CAAC,IAAI,gBAAgB,GAAG,C
AAC,OAAO,EAAE,CAAC,CAAC,CAAC;oBACjE,CAAC;gBACH,CAAC,CAAC,CAAC;gBAEH,IAAI,CAAC,EAAE,CAAC,OAAO,EAAE,CAAC,IAAI,EAAE,EAAE;oBACxB,YAAY,CAAC,KAAK,CAAC,CAAC;oBACpB,IAAI,IAAI,KAAK,CAAC,EAAE,CAAC;wBACf,OAAO,CAAC,MAAM,CAAC,IAAI,EAAE,CAAC,CAAC;oBACzB,CAAC;yBAAM,CAAC;wBACN,MAAM,MAAM,GAAG,CAAC,MAAM,IAAI,MAAM,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC;wBAChD,MAAM,CAAC,IAAI,KAAK,CACd,GAAG,MAAM,CAAC,IAAI,yBAAyB,IAAI,GAAG,MAAM,CAAC,CAAC,CAAC,IAAI,GAAG,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAC5E,CAAC,CAAC;oBACL,CAAC;gBACH,CAAC,CAAC,CAAC;gBAEH,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,cAAc,CAAC,CAAC;YACjC,CAAC,CAAC,CAAC;YAEH,OAAO,EAAE,IAAI,EAAE,UAAU,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK,EAAE,WAAW,EAAE,IAAI,EAAE,YAAY,EAAE,IAAI,EAAE,CAAC;QACzF,CAAC;KACF,CAAC;AACJ,CAAC;AAED,8EAA8E;AAC9E,4EAA4E;AAC5E,uEAAuE;AACvE,8EAA8E;AAC9E,SAAS,qBAAqB,CAAC,aAAsB;IACnD,MAAM,KAAK,GAAG,aAAa,IAAI,OAAO,CAAC,GAAG,CAAC,iBAAiB,IAAI,QAAQ,CAAC;IACzE,OAAO,eAAe,CAAC;QACrB,MAAM,EAAE,QAAQ;QAChB,IAAI,EAAE,QAAQ;QACd,IAAI,EAAE,CAAC,IAAI,EAAE,SAAS,EAAE,KAAK,CAAC;QAC9B,YAAY,EAAE,UAAU,KAAK,EAAE;QAC/B,cAAc,EAAE,QAAQ;QACxB,WAAW,EACT,qJAAqJ;KACxJ,CAAC,CAAC;AACL,CAAC;AAED,8EAA8E;AAC9E,iFAAiF;AACjF,8EAA8E;AAC9E,SAAS,oBAAoB,CAAC,aAAsB;IAClD,MAAM,KAAK,GAAG,aAAa,IAAI,OAAO,CAAC,GAAG,CAAC,iBAAiB,IAAI,SAAS,CAAC;IAC1E,OAAO,eAAe,CAAC;QACrB,MAAM,EAAE,OAAO;QACf,IAAI,EAAE,OAAO;QACb,IAAI,EAAE,CAAC,MAAM,EAAE,SAAS,EAAE,KAAK,CAAC;QAChC,YAAY,EAAE,SAAS,KAAK,EAAE;QAC9B,WAAW,EACT,4IAA4I;KAC/I,CAAC,CAAC;AACL,CAAC;AAED,8EAA8E;AAC9E,iFAAiF;AACjF,kFAAkF;AAClF,8EAA8E;AAC9E,SAAS,qBAAqB,CAAC,aAAsB;IACnD,MAAM,KAAK,GAAG,aAAa,IAAI,OAAO,CAAC,GAAG,CAAC,iBAAiB,IAAI,gBAAgB,CAAC;IACjF,OAAO,eAAe,CAAC;QACrB,MAAM,EAAE,QAAQ;QAChB,IAAI,EAAE,QAAQ;QACd,IAAI,EAAE,CAAC,IAAI,EAAE,SAAS,EAAE,KAAK,CAAC;QAC9B,YAAY,EAAE,KAAK;QACnB,WAAW,EACT,kJAAkJ;KACrJ,CAAC,CAAC;AACL,CAAC;AAED,8EAA8E;AAC9E,qDAAqD;AACrD,8EAA8E;AAC9E,SAAS,kBAAkB,CAAC,aAAsB;IAChD,MAAM,OAAO,GAAG,OAAO,CAAC,GAAG,CAAC,eAAe,IAAI,wBAAwB,CAAC;IACxE,MAAM,KAAK,GAAG,aAAa,IAAI,OAAO,CAAC,GAAG,CAAC,iBAAiB,IAAI,aAAa,CAAC;IAE9E,OAAO;QACL,KAAK;QA
CL,KAAK,CAAC,QAAQ,CAAC,YAAoB,EAAE,UAAkB;YACrD,MAAM,UAAU,GAAG,GAAG,YAAY,OAAO,UAAU,EAAE,CAAC;YACtD,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;YAEzB,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,OAAO,eAAe,EAAE;gBACtD,MAAM,EAAE,MAAM;gBACd,OAAO,EAAE,EAAE,cAAc,EAAE,kBAAkB,EAAE;gBAC/C,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC;oBACnB,KAAK;oBACL,MAAM,EAAE,UAAU;oBAClB,MAAM,EAAE,KAAK;oBACb,OAAO,EAAE;wBACP,WAAW,EAAE,IAAI;wBACjB,WAAW,EAAE,GAAG;qBACjB;iBACF,CAAC;gBACF,MAAM,EAAE,WAAW,CAAC,OAAO,CAAC,YAAY,EAAE,CAAC;aAC5C,CAAC,CAAC;YAEH,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;gBACjB,MAAM,KAAK,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;gBACpC,IAAI,KAAK,CAAC,QAAQ,CAAC,OAAO,CAAC,IAAI,KAAK,CAAC,QAAQ,CAAC,WAAW,CAAC,EAAE,CAAC;oBAC3D,MAAM,IAAI,KAAK,CACb,iBAAiB,KAAK,8CAA8C,KAAK,EAAE,CAC5E,CAAC;gBACJ,CAAC;gBACD,MAAM,IAAI,KAAK,CAAC,0BAA0B,KAAK,EAAE,CAAC,CAAC;YACrD,CAAC;YAED,MAAM,IAAI,GAAG,CAAC,MAAM,QAAQ,CAAC,IAAI,EAAE,CAIlC,CAAC;YACF,OAAO;gBACL,IAAI,EAAE,IAAI,CAAC,QAAQ,IAAI,EAAE;gBACzB,UAAU,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK;gBAC9B,WAAW,EAAE,IAAI,CAAC,iBAAiB,IAAI,IAAI;gBAC3C,YAAY,EAAE,IAAI,CAAC,UAAU,IAAI,IAAI;aACtC,CAAC;QACJ,CAAC;KACF,CAAC;AACJ,CAAC"}
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
/**
 * Shape of the JSON document that ProgressLog serializes to the
 * `.eval-progress.json` file while an eval run is in flight.
 */
export interface ProgressState {
|
|
2
|
+
/** ISO-8601 timestamp taken when the ProgressLog was constructed. */
startedAt: string;
|
|
3
|
+
/** Provider id passed to the ProgressLog constructor. */
provider: string;
|
|
4
|
+
/** Model name passed to the ProgressLog constructor. */
model: string;
|
|
5
|
+
/** Total number of eval cases, as passed to the constructor. */
totalCases: number;
|
|
6
|
+
/** Starts at 0; changed only through ProgressLog.update(). */
completedCases: number;
|
|
7
|
+
/** Starts as null; changed only through ProgressLog.update(). */
currentCase: string | null;
|
|
8
|
+
/**
 * Starts as "starting"; complete() sets "complete" and error() sets "error".
 * "generating" / "judging" are presumably set by the caller via update() — confirm against the eval runner.
 */
phase: "starting" | "generating" | "judging" | "complete" | "error";
|
|
9
|
+
/** Milliseconds since construction, refreshed by update(), complete() and error(). */
elapsedMs: number;
|
|
10
|
+
/** Estimated total duration in seconds as passed to the constructor, or null. */
estimatedTotalSec: number | null;
|
|
11
|
+
/** Message recorded by the most recent error() call; null until then. */
lastError: string | null;
|
|
12
|
+
}
|
|
13
|
+
/**
 * Writes a ProgressState snapshot to `<skillDir>/evals/.eval-progress.json`
 * so a running eval can be inspected from outside the process.
 * File-system failures are swallowed by the implementation.
 */
export declare class ProgressLog {
|
|
14
|
+
/** Path of the progress file: `<skillDir>/evals/.eval-progress.json`. */
private filePath;
|
|
15
|
+
/** `Date.now()` reading taken at construction; the base for elapsedMs. */
private startTime;
|
|
16
|
+
/** The current ProgressState snapshot that gets serialized. */
private state;
|
|
17
|
+
/** Builds the initial state (phase "starting", 0 completed cases) and writes it immediately. */
constructor(skillDir: string, provider: string, model: string, totalCases: number, estimatedTotalSec: number | null);
|
|
18
|
+
/** Merges `partial` into the state, refreshes elapsedMs (overriding any supplied value) and rewrites the file. */
update(partial: Partial<ProgressState>): void;
|
|
19
|
+
/** Marks the state "complete" in memory and deletes the progress file. */
complete(): void;
|
|
20
|
+
/** Records `msg` as lastError with phase "error" and rewrites the file; the file is not deleted. */
error(msg: string): void;
|
|
21
|
+
/** Serializes the state as indented JSON to filePath, ignoring write failures. */
private write;
|
|
22
|
+
/** Deletes the progress file, ignoring failures such as a missing file. */
private cleanup;
|
|
23
|
+
}
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
// ---------------------------------------------------------------------------
|
|
2
|
+
// progress-log.ts -- Write eval progress to a JSON file for external monitoring
|
|
3
|
+
//
|
|
4
|
+
// During long-running eval operations (2-10 minutes), this file allows
|
|
5
|
+
// users to check progress from another terminal:
|
|
6
|
+
// cat skills/my-skill/evals/.eval-progress.json
|
|
7
|
+
//
|
|
8
|
+
// The file is created at start and deleted on successful completion; on error it is left in place with phase "error" and the message in lastError.
|
|
9
|
+
// ---------------------------------------------------------------------------
|
|
10
|
+
import { writeFileSync, unlinkSync } from "node:fs";
|
|
11
|
+
import { join } from "node:path";
|
|
12
|
+
const PROGRESS_FILE = ".eval-progress.json";

/** Milliseconds elapsed since `startTime` (a `Date.now()` reading). */
function elapsedSince(startTime) {
    return Date.now() - startTime;
}

/**
 * Best-effort progress reporter for eval runs.
 *
 * Keeps a small state object and mirrors it as pretty-printed JSON to
 * `<skillDir>/evals/.eval-progress.json`. Every file-system failure is
 * swallowed so that progress reporting can never abort an eval.
 */
export class ProgressLog {
    filePath;
    startTime;
    state;

    /**
     * Builds the initial snapshot (phase "starting") and writes it out.
     *
     * @param {string} skillDir - Skill directory; the file goes into its "evals" subdirectory.
     * @param {string} provider - Provider id recorded in the snapshot.
     * @param {string} model - Model name recorded in the snapshot.
     * @param {number} totalCases - Number of cases in this run.
     * @param {number|null} estimatedTotalSec - Estimated duration in seconds, if known.
     */
    constructor(skillDir, provider, model, totalCases, estimatedTotalSec) {
        this.filePath = join(skillDir, "evals", PROGRESS_FILE);
        this.startTime = Date.now();
        const startedAt = new Date().toISOString();
        this.state = {
            startedAt,
            provider,
            model,
            totalCases,
            completedCases: 0,
            currentCase: null,
            phase: "starting",
            elapsedMs: 0,
            estimatedTotalSec,
            lastError: null,
        };
        this.write();
    }

    /**
     * Merges `partial` into the snapshot and rewrites the file.
     * `elapsedMs` is always recomputed and wins over any value in `partial`.
     *
     * @param {object} partial - Subset of snapshot fields to overwrite.
     */
    update(partial) {
        const elapsedMs = elapsedSince(this.startTime);
        Object.assign(this.state, partial, { elapsedMs });
        this.write();
    }

    /** Marks the run as complete in memory and removes the progress file. */
    complete() {
        Object.assign(this.state, {
            phase: "complete",
            elapsedMs: elapsedSince(this.startTime),
        });
        this.cleanup();
    }

    /**
     * Records a failure and rewrites the file; the file is kept on disk.
     *
     * @param {string} msg - Error message stored as `lastError`.
     */
    error(msg) {
        Object.assign(this.state, {
            phase: "error",
            lastError: msg,
            elapsedMs: elapsedSince(this.startTime),
        });
        this.write();
    }

    /** Serializes the snapshot to `filePath`; failures are ignored. */
    write() {
        try {
            const payload = JSON.stringify(this.state, null, 2);
            writeFileSync(this.filePath, payload, "utf-8");
        }
        catch {
            // Progress logging is non-critical, so it must never crash the eval
        }
    }

    /** Deletes the progress file; failures are ignored. */
    cleanup() {
        try {
            unlinkSync(this.filePath);
        }
        catch {
            // Nothing to remove (the file may never have been written)
        }
    }
}
|
|
66
|
+
//# sourceMappingURL=progress-log.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"progress-log.js","sourceRoot":"","sources":["../../src/eval/progress-log.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,gFAAgF;AAChF,EAAE;AACF,uEAAuE;AACvE,iDAAiD;AACjD,kDAAkD;AAClD,EAAE;AACF,0DAA0D;AAC1D,8EAA8E;AAE9E,OAAO,EAAE,aAAa,EAAE,UAAU,EAAE,MAAM,SAAS,CAAC;AACpD,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AAejC,MAAM,aAAa,GAAG,qBAAqB,CAAC;AAE5C,MAAM,OAAO,WAAW;IACd,QAAQ,CAAS;IACjB,SAAS,CAAS;IAClB,KAAK,CAAgB;IAE7B,YAAY,QAAgB,EAAE,QAAgB,EAAE,KAAa,EAAE,UAAkB,EAAE,iBAAgC;QACjH,IAAI,CAAC,QAAQ,GAAG,IAAI,CAAC,QAAQ,EAAE,OAAO,EAAE,aAAa,CAAC,CAAC;QACvD,IAAI,CAAC,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QAC5B,IAAI,CAAC,KAAK,GAAG;YACX,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;YACnC,QAAQ;YACR,KAAK;YACL,UAAU;YACV,cAAc,EAAE,CAAC;YACjB,WAAW,EAAE,IAAI;YACjB,KAAK,EAAE,UAAU;YACjB,SAAS,EAAE,CAAC;YACZ,iBAAiB;YACjB,SAAS,EAAE,IAAI;SAChB,CAAC;QACF,IAAI,CAAC,KAAK,EAAE,CAAC;IACf,CAAC;IAED,MAAM,CAAC,OAA+B;QACpC,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE,SAAS,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,IAAI,CAAC,SAAS,EAAE,CAAC,CAAC;QAC/E,IAAI,CAAC,KAAK,EAAE,CAAC;IACf,CAAC;IAED,QAAQ;QACN,IAAI,CAAC,KAAK,CAAC,KAAK,GAAG,UAAU,CAAC;QAC9B,IAAI,CAAC,KAAK,CAAC,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,IAAI,CAAC,SAAS,CAAC;QACnD,IAAI,CAAC,OAAO,EAAE,CAAC;IACjB,CAAC;IAED,KAAK,CAAC,GAAW;QACf,IAAI,CAAC,KAAK,CAAC,KAAK,GAAG,OAAO,CAAC;QAC3B,IAAI,CAAC,KAAK,CAAC,SAAS,GAAG,GAAG,CAAC;QAC3B,IAAI,CAAC,KAAK,CAAC,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,IAAI,CAAC,SAAS,CAAC;QACnD,IAAI,CAAC,KAAK,EAAE,CAAC;IACf,CAAC;IAEO,KAAK;QACX,IAAI,CAAC;YACH,aAAa,CAAC,IAAI,CAAC,QAAQ,EAAE,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC;QAC7E,CAAC;QAAC,MAAM,CAAC;YACP,uDAAuD;QACzD,CAAC;IACH,CAAC;IAEO,OAAO;QACb,IAAI,CAAC;YACH,UAAU,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;QAC5B,CAAC;QAAC,MAAM,CAAC;YACP,qBAAqB;QACvB,CAAC;IACH,CAAC;CACF"}
|
|
@@ -389,6 +389,7 @@ export function registerRoutes(router, root, projectName) {
|
|
|
389
389
|
await sem.acquire();
|
|
390
390
|
const benchCase = await runSingleCaseSSE({
|
|
391
391
|
res, evalCase, systemPrompt, client, isAborted: () => aborted,
|
|
392
|
+
provider: currentOverrides.provider || "claude-cli",
|
|
392
393
|
});
|
|
393
394
|
if (!released) {
|
|
394
395
|
released = true;
|