vskill 0.5.0 → 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/eval-ui/assets/{index-CcnlpaWS.js → index-CxHCKEhf.js} +2 -2
- package/dist/eval-ui/index.html +1 -1
- package/package.json +1 -1
- package/dist/agents/agents-registry.test.d.ts +0 -1
- package/dist/agents/agents-registry.test.js +0 -248
- package/dist/agents/agents-registry.test.js.map +0 -1
- package/dist/api/client.test.d.ts +0 -1
- package/dist/api/client.test.js +0 -428
- package/dist/api/client.test.js.map +0 -1
- package/dist/audit/audit-integration.test.d.ts +0 -1
- package/dist/audit/audit-integration.test.js +0 -92
- package/dist/audit/audit-integration.test.js.map +0 -1
- package/dist/audit/audit-llm.test.d.ts +0 -1
- package/dist/audit/audit-llm.test.js +0 -110
- package/dist/audit/audit-llm.test.js.map +0 -1
- package/dist/audit/audit-patterns.test.d.ts +0 -1
- package/dist/audit/audit-patterns.test.js +0 -91
- package/dist/audit/audit-patterns.test.js.map +0 -1
- package/dist/audit/audit-scanner.test.d.ts +0 -1
- package/dist/audit/audit-scanner.test.js +0 -112
- package/dist/audit/audit-scanner.test.js.map +0 -1
- package/dist/audit/audit-types.test.d.ts +0 -1
- package/dist/audit/audit-types.test.js +0 -140
- package/dist/audit/audit-types.test.js.map +0 -1
- package/dist/audit/config.test.d.ts +0 -1
- package/dist/audit/config.test.js +0 -44
- package/dist/audit/config.test.js.map +0 -1
- package/dist/audit/file-discovery.test.d.ts +0 -1
- package/dist/audit/file-discovery.test.js +0 -120
- package/dist/audit/file-discovery.test.js.map +0 -1
- package/dist/audit/fix-suggestions.test.d.ts +0 -1
- package/dist/audit/fix-suggestions.test.js +0 -35
- package/dist/audit/fix-suggestions.test.js.map +0 -1
- package/dist/audit/formatters/json-formatter.test.d.ts +0 -1
- package/dist/audit/formatters/json-formatter.test.js +0 -49
- package/dist/audit/formatters/json-formatter.test.js.map +0 -1
- package/dist/audit/formatters/report-formatter.test.d.ts +0 -1
- package/dist/audit/formatters/report-formatter.test.js +0 -51
- package/dist/audit/formatters/report-formatter.test.js.map +0 -1
- package/dist/audit/formatters/sarif-formatter.test.d.ts +0 -1
- package/dist/audit/formatters/sarif-formatter.test.js +0 -71
- package/dist/audit/formatters/sarif-formatter.test.js.map +0 -1
- package/dist/audit/formatters/terminal-formatter.test.d.ts +0 -1
- package/dist/audit/formatters/terminal-formatter.test.js +0 -51
- package/dist/audit/formatters/terminal-formatter.test.js.map +0 -1
- package/dist/blocklist/blocklist-e2e.test.d.ts +0 -1
- package/dist/blocklist/blocklist-e2e.test.js +0 -346
- package/dist/blocklist/blocklist-e2e.test.js.map +0 -1
- package/dist/blocklist/blocklist.test.d.ts +0 -1
- package/dist/blocklist/blocklist.test.js +0 -259
- package/dist/blocklist/blocklist.test.js.map +0 -1
- package/dist/commands/__tests__/eval-router.test.d.ts +0 -1
- package/dist/commands/__tests__/eval-router.test.js +0 -60
- package/dist/commands/__tests__/eval-router.test.js.map +0 -1
- package/dist/commands/__tests__/eval-serve.test.d.ts +0 -1
- package/dist/commands/__tests__/eval-serve.test.js +0 -23
- package/dist/commands/__tests__/eval-serve.test.js.map +0 -1
- package/dist/commands/add-blocklist-e2e.test.d.ts +0 -1
- package/dist/commands/add-blocklist-e2e.test.js +0 -397
- package/dist/commands/add-blocklist-e2e.test.js.map +0 -1
- package/dist/commands/add-wizard.test.d.ts +0 -1
- package/dist/commands/add-wizard.test.js +0 -392
- package/dist/commands/add-wizard.test.js.map +0 -1
- package/dist/commands/add.test.d.ts +0 -1
- package/dist/commands/add.test.js +0 -2365
- package/dist/commands/add.test.js.map +0 -1
- package/dist/commands/audit.test.d.ts +0 -1
- package/dist/commands/audit.test.js +0 -79
- package/dist/commands/audit.test.js.map +0 -1
- package/dist/commands/blocklist.test.d.ts +0 -1
- package/dist/commands/blocklist.test.js +0 -158
- package/dist/commands/blocklist.test.js.map +0 -1
- package/dist/commands/eval/__tests__/coverage.test.d.ts +0 -1
- package/dist/commands/eval/__tests__/coverage.test.js +0 -122
- package/dist/commands/eval/__tests__/coverage.test.js.map +0 -1
- package/dist/commands/eval/__tests__/generate-all.test.d.ts +0 -1
- package/dist/commands/eval/__tests__/generate-all.test.js +0 -133
- package/dist/commands/eval/__tests__/generate-all.test.js.map +0 -1
- package/dist/commands/eval/__tests__/init.test.d.ts +0 -1
- package/dist/commands/eval/__tests__/init.test.js +0 -116
- package/dist/commands/eval/__tests__/init.test.js.map +0 -1
- package/dist/commands/eval/__tests__/run.test.d.ts +0 -1
- package/dist/commands/eval/__tests__/run.test.js +0 -186
- package/dist/commands/eval/__tests__/run.test.js.map +0 -1
- package/dist/commands/find.test.d.ts +0 -1
- package/dist/commands/find.test.js +0 -481
- package/dist/commands/find.test.js.map +0 -1
- package/dist/commands/marketplace.test.d.ts +0 -1
- package/dist/commands/marketplace.test.js +0 -129
- package/dist/commands/marketplace.test.js.map +0 -1
- package/dist/commands/remove.test.d.ts +0 -1
- package/dist/commands/remove.test.js +0 -164
- package/dist/commands/remove.test.js.map +0 -1
- package/dist/commands/should-skip.test.d.ts +0 -1
- package/dist/commands/should-skip.test.js +0 -56
- package/dist/commands/should-skip.test.js.map +0 -1
- package/dist/commands/submit.test.d.ts +0 -1
- package/dist/commands/submit.test.js +0 -83
- package/dist/commands/submit.test.js.map +0 -1
- package/dist/commands/update.test.d.ts +0 -1
- package/dist/commands/update.test.js +0 -250
- package/dist/commands/update.test.js.map +0 -1
- package/dist/discovery/github-tree.test.d.ts +0 -1
- package/dist/discovery/github-tree.test.js +0 -372
- package/dist/discovery/github-tree.test.js.map +0 -1
- package/dist/eval/__tests__/activation-tester.test.d.ts +0 -1
- package/dist/eval/__tests__/activation-tester.test.js +0 -203
- package/dist/eval/__tests__/activation-tester.test.js.map +0 -1
- package/dist/eval/__tests__/benchmark-history.test.d.ts +0 -1
- package/dist/eval/__tests__/benchmark-history.test.js +0 -422
- package/dist/eval/__tests__/benchmark-history.test.js.map +0 -1
- package/dist/eval/__tests__/benchmark.test.d.ts +0 -1
- package/dist/eval/__tests__/benchmark.test.js +0 -94
- package/dist/eval/__tests__/benchmark.test.js.map +0 -1
- package/dist/eval/__tests__/comparator.test.d.ts +0 -1
- package/dist/eval/__tests__/comparator.test.js +0 -282
- package/dist/eval/__tests__/comparator.test.js.map +0 -1
- package/dist/eval/__tests__/judge.test.d.ts +0 -1
- package/dist/eval/__tests__/judge.test.js +0 -122
- package/dist/eval/__tests__/judge.test.js.map +0 -1
- package/dist/eval/__tests__/llm.test.d.ts +0 -1
- package/dist/eval/__tests__/llm.test.js +0 -543
- package/dist/eval/__tests__/llm.test.js.map +0 -1
- package/dist/eval/__tests__/mcp-detector.test.d.ts +0 -1
- package/dist/eval/__tests__/mcp-detector.test.js +0 -180
- package/dist/eval/__tests__/mcp-detector.test.js.map +0 -1
- package/dist/eval/__tests__/prompt-builder.test.d.ts +0 -1
- package/dist/eval/__tests__/prompt-builder.test.js +0 -142
- package/dist/eval/__tests__/prompt-builder.test.js.map +0 -1
- package/dist/eval/__tests__/schema.test.d.ts +0 -1
- package/dist/eval/__tests__/schema.test.js +0 -247
- package/dist/eval/__tests__/schema.test.js.map +0 -1
- package/dist/eval/__tests__/skill-scanner.test.d.ts +0 -1
- package/dist/eval/__tests__/skill-scanner.test.js +0 -228
- package/dist/eval/__tests__/skill-scanner.test.js.map +0 -1
- package/dist/eval/__tests__/verdict.test.d.ts +0 -1
- package/dist/eval/__tests__/verdict.test.js +0 -47
- package/dist/eval/__tests__/verdict.test.js.map +0 -1
- package/dist/eval-server/__tests__/benchmark-runner.test.d.ts +0 -1
- package/dist/eval-server/__tests__/benchmark-runner.test.js +0 -301
- package/dist/eval-server/__tests__/benchmark-runner.test.js.map +0 -1
- package/dist/eval-server/__tests__/comparison-sse-events.test.d.ts +0 -1
- package/dist/eval-server/__tests__/comparison-sse-events.test.js +0 -278
- package/dist/eval-server/__tests__/comparison-sse-events.test.js.map +0 -1
- package/dist/eval-server/__tests__/sse-helpers.test.d.ts +0 -1
- package/dist/eval-server/__tests__/sse-helpers.test.js +0 -128
- package/dist/eval-server/__tests__/sse-helpers.test.js.map +0 -1
- package/dist/installer/canonical.test.d.ts +0 -1
- package/dist/installer/canonical.test.js +0 -264
- package/dist/installer/canonical.test.js.map +0 -1
- package/dist/lockfile/lockfile.test.d.ts +0 -1
- package/dist/lockfile/lockfile.test.js +0 -204
- package/dist/lockfile/lockfile.test.js.map +0 -1
- package/dist/lockfile/project-root.test.d.ts +0 -1
- package/dist/lockfile/project-root.test.js +0 -49
- package/dist/lockfile/project-root.test.js.map +0 -1
- package/dist/marketplace/marketplace.test.d.ts +0 -1
- package/dist/marketplace/marketplace.test.js +0 -312
- package/dist/marketplace/marketplace.test.js.map +0 -1
- package/dist/resolvers/source-resolver.test.d.ts +0 -1
- package/dist/resolvers/source-resolver.test.js +0 -104
- package/dist/resolvers/source-resolver.test.js.map +0 -1
- package/dist/resolvers/url-resolver.test.d.ts +0 -1
- package/dist/resolvers/url-resolver.test.js +0 -49
- package/dist/resolvers/url-resolver.test.js.map +0 -1
- package/dist/scanner/dci-integration.test.d.ts +0 -1
- package/dist/scanner/dci-integration.test.js +0 -83
- package/dist/scanner/dci-integration.test.js.map +0 -1
- package/dist/scanner/patterns.test.d.ts +0 -1
- package/dist/scanner/patterns.test.js +0 -832
- package/dist/scanner/patterns.test.js.map +0 -1
- package/dist/scanner/tier1.test.d.ts +0 -1
- package/dist/scanner/tier1.test.js +0 -305
- package/dist/scanner/tier1.test.js.map +0 -1
- package/dist/security/platform-security.test.d.ts +0 -1
- package/dist/security/platform-security.test.js +0 -92
- package/dist/security/platform-security.test.js.map +0 -1
- package/dist/settings/settings.test.d.ts +0 -1
- package/dist/settings/settings.test.js +0 -103
- package/dist/settings/settings.test.js.map +0 -1
- package/dist/updater/source-fetcher.test.d.ts +0 -1
- package/dist/updater/source-fetcher.test.js +0 -192
- package/dist/updater/source-fetcher.test.js.map +0 -1
- package/dist/utils/__tests__/paths.test.d.ts +0 -1
- package/dist/utils/__tests__/paths.test.js +0 -22
- package/dist/utils/__tests__/paths.test.js.map +0 -1
- package/dist/utils/__tests__/resolve-binary.integration.test.d.ts +0 -1
- package/dist/utils/__tests__/resolve-binary.integration.test.js +0 -138
- package/dist/utils/__tests__/resolve-binary.integration.test.js.map +0 -1
- package/dist/utils/__tests__/resolve-binary.test.d.ts +0 -1
- package/dist/utils/__tests__/resolve-binary.test.js +0 -175
- package/dist/utils/__tests__/resolve-binary.test.js.map +0 -1
- package/dist/utils/__tests__/validation.test.d.ts +0 -1
- package/dist/utils/__tests__/validation.test.js +0 -107
- package/dist/utils/__tests__/validation.test.js.map +0 -1
- package/dist/utils/agent-filter.test.d.ts +0 -1
- package/dist/utils/agent-filter.test.js +0 -75
- package/dist/utils/agent-filter.test.js.map +0 -1
- package/dist/utils/output.test.d.ts +0 -1
- package/dist/utils/output.test.js +0 -28
- package/dist/utils/output.test.js.map +0 -1
- package/dist/utils/project-root.test.d.ts +0 -1
- package/dist/utils/project-root.test.js +0 -74
- package/dist/utils/project-root.test.js.map +0 -1
- package/dist/utils/prompts.test.d.ts +0 -1
- package/dist/utils/prompts.test.js +0 -285
- package/dist/utils/prompts.test.js.map +0 -1
|
@@ -1,422 +0,0 @@
|
|
|
1
|
-
import { describe, it, expect, beforeEach, afterEach } from "vitest";
|
|
2
|
-
import { mkdirSync, rmSync, readFileSync } from "node:fs";
|
|
3
|
-
import { join } from "node:path";
|
|
4
|
-
import { tmpdir } from "node:os";
|
|
5
|
-
import { writeHistoryEntry, listHistory, readHistoryEntry, computeRegressions, getCaseHistory, } from "../benchmark-history.js";
|
|
6
|
-
let testDir;
|
|
7
|
-
const mkResult = (overrides = {}) => ({
|
|
8
|
-
timestamp: "2026-03-08T12:00:00.000Z",
|
|
9
|
-
model: "test-model",
|
|
10
|
-
skill_name: "test-skill",
|
|
11
|
-
cases: [
|
|
12
|
-
{
|
|
13
|
-
eval_id: 1,
|
|
14
|
-
eval_name: "test-case",
|
|
15
|
-
status: "pass",
|
|
16
|
-
error_message: null,
|
|
17
|
-
pass_rate: 1.0,
|
|
18
|
-
assertions: [
|
|
19
|
-
{ id: "a1", text: "Check 1", pass: true, reasoning: "OK" },
|
|
20
|
-
{ id: "a2", text: "Check 2", pass: true, reasoning: "OK" },
|
|
21
|
-
],
|
|
22
|
-
},
|
|
23
|
-
],
|
|
24
|
-
...overrides,
|
|
25
|
-
});
|
|
26
|
-
describe("benchmark-history", () => {
|
|
27
|
-
beforeEach(() => {
|
|
28
|
-
testDir = join(tmpdir(), `vskill-history-${Date.now()}`);
|
|
29
|
-
mkdirSync(join(testDir, "evals"), { recursive: true });
|
|
30
|
-
});
|
|
31
|
-
afterEach(() => {
|
|
32
|
-
rmSync(testDir, { recursive: true, force: true });
|
|
33
|
-
});
|
|
34
|
-
describe("writeHistoryEntry", () => {
|
|
35
|
-
it("writes history file with filesystem-safe timestamp", async () => {
|
|
36
|
-
const result = mkResult();
|
|
37
|
-
const filename = await writeHistoryEntry(testDir, result);
|
|
38
|
-
expect(filename).toBe("2026-03-08T12-00-00.000Z.json");
|
|
39
|
-
const content = readFileSync(join(testDir, "evals", "history", filename), "utf-8");
|
|
40
|
-
const parsed = JSON.parse(content);
|
|
41
|
-
expect(parsed.skill_name).toBe("test-skill");
|
|
42
|
-
});
|
|
43
|
-
it("also writes benchmark.json for backward compat", async () => {
|
|
44
|
-
await writeHistoryEntry(testDir, mkResult());
|
|
45
|
-
const bm = readFileSync(join(testDir, "evals", "benchmark.json"), "utf-8");
|
|
46
|
-
expect(JSON.parse(bm).skill_name).toBe("test-skill");
|
|
47
|
-
});
|
|
48
|
-
it("creates history directory if missing", async () => {
|
|
49
|
-
rmSync(join(testDir, "evals"), { recursive: true, force: true });
|
|
50
|
-
const result = mkResult();
|
|
51
|
-
const filename = await writeHistoryEntry(testDir, result);
|
|
52
|
-
expect(filename).toBeTruthy();
|
|
53
|
-
});
|
|
54
|
-
});
|
|
55
|
-
describe("listHistory", () => {
|
|
56
|
-
it("returns empty array when no history directory", async () => {
|
|
57
|
-
const list = await listHistory(join(testDir, "nonexistent"));
|
|
58
|
-
expect(list).toEqual([]);
|
|
59
|
-
});
|
|
60
|
-
it("lists entries sorted reverse-chronologically", async () => {
|
|
61
|
-
const r1 = mkResult({ timestamp: "2026-03-01T10:00:00.000Z" });
|
|
62
|
-
const r2 = mkResult({ timestamp: "2026-03-02T10:00:00.000Z" });
|
|
63
|
-
await writeHistoryEntry(testDir, r1);
|
|
64
|
-
await writeHistoryEntry(testDir, r2);
|
|
65
|
-
const list = await listHistory(testDir);
|
|
66
|
-
expect(list).toHaveLength(2);
|
|
67
|
-
expect(list[0].timestamp).toBe("2026-03-02T10:00:00.000Z");
|
|
68
|
-
expect(list[1].timestamp).toBe("2026-03-01T10:00:00.000Z");
|
|
69
|
-
});
|
|
70
|
-
it("computes pass rate from assertion results", async () => {
|
|
71
|
-
const result = mkResult({
|
|
72
|
-
cases: [
|
|
73
|
-
{
|
|
74
|
-
eval_id: 1,
|
|
75
|
-
eval_name: "test",
|
|
76
|
-
status: "fail",
|
|
77
|
-
error_message: null,
|
|
78
|
-
pass_rate: 0.5,
|
|
79
|
-
assertions: [
|
|
80
|
-
{ id: "a1", text: "Check 1", pass: true, reasoning: "OK" },
|
|
81
|
-
{ id: "a2", text: "Check 2", pass: false, reasoning: "Fail" },
|
|
82
|
-
],
|
|
83
|
-
},
|
|
84
|
-
],
|
|
85
|
-
});
|
|
86
|
-
await writeHistoryEntry(testDir, result);
|
|
87
|
-
const list = await listHistory(testDir);
|
|
88
|
-
expect(list[0].passRate).toBe(0.5);
|
|
89
|
-
});
|
|
90
|
-
});
|
|
91
|
-
describe("readHistoryEntry", () => {
|
|
92
|
-
it("reads a specific history entry by timestamp", async () => {
|
|
93
|
-
await writeHistoryEntry(testDir, mkResult());
|
|
94
|
-
const entry = await readHistoryEntry(testDir, "2026-03-08T12:00:00.000Z");
|
|
95
|
-
expect(entry).not.toBeNull();
|
|
96
|
-
expect(entry.skill_name).toBe("test-skill");
|
|
97
|
-
});
|
|
98
|
-
it("returns null for nonexistent entry", async () => {
|
|
99
|
-
const entry = await readHistoryEntry(testDir, "1999-01-01T00:00:00.000Z");
|
|
100
|
-
expect(entry).toBeNull();
|
|
101
|
-
});
|
|
102
|
-
});
|
|
103
|
-
describe("computeRegressions", () => {
|
|
104
|
-
it("detects regression (pass → fail)", () => {
|
|
105
|
-
const prev = mkResult({
|
|
106
|
-
cases: [
|
|
107
|
-
{
|
|
108
|
-
eval_id: 1,
|
|
109
|
-
eval_name: "test",
|
|
110
|
-
status: "pass",
|
|
111
|
-
error_message: null,
|
|
112
|
-
pass_rate: 1.0,
|
|
113
|
-
assertions: [{ id: "a1", text: "Check", pass: true, reasoning: "OK" }],
|
|
114
|
-
},
|
|
115
|
-
],
|
|
116
|
-
});
|
|
117
|
-
const curr = mkResult({
|
|
118
|
-
cases: [
|
|
119
|
-
{
|
|
120
|
-
eval_id: 1,
|
|
121
|
-
eval_name: "test",
|
|
122
|
-
status: "fail",
|
|
123
|
-
error_message: null,
|
|
124
|
-
pass_rate: 0,
|
|
125
|
-
assertions: [{ id: "a1", text: "Check", pass: false, reasoning: "Fail" }],
|
|
126
|
-
},
|
|
127
|
-
],
|
|
128
|
-
});
|
|
129
|
-
const regressions = computeRegressions(curr, prev);
|
|
130
|
-
expect(regressions).toHaveLength(1);
|
|
131
|
-
expect(regressions[0].change).toBe("regression");
|
|
132
|
-
expect(regressions[0].assertionId).toBe("a1");
|
|
133
|
-
});
|
|
134
|
-
it("detects improvement (fail → pass)", () => {
|
|
135
|
-
const prev = mkResult({
|
|
136
|
-
cases: [
|
|
137
|
-
{
|
|
138
|
-
eval_id: 1,
|
|
139
|
-
eval_name: "test",
|
|
140
|
-
status: "fail",
|
|
141
|
-
error_message: null,
|
|
142
|
-
pass_rate: 0,
|
|
143
|
-
assertions: [{ id: "a1", text: "Check", pass: false, reasoning: "Fail" }],
|
|
144
|
-
},
|
|
145
|
-
],
|
|
146
|
-
});
|
|
147
|
-
const curr = mkResult({
|
|
148
|
-
cases: [
|
|
149
|
-
{
|
|
150
|
-
eval_id: 1,
|
|
151
|
-
eval_name: "test",
|
|
152
|
-
status: "pass",
|
|
153
|
-
error_message: null,
|
|
154
|
-
pass_rate: 1,
|
|
155
|
-
assertions: [{ id: "a1", text: "Check", pass: true, reasoning: "OK" }],
|
|
156
|
-
},
|
|
157
|
-
],
|
|
158
|
-
});
|
|
159
|
-
const regressions = computeRegressions(curr, prev);
|
|
160
|
-
expect(regressions).toHaveLength(1);
|
|
161
|
-
expect(regressions[0].change).toBe("improvement");
|
|
162
|
-
});
|
|
163
|
-
it("returns empty array when no changes", () => {
|
|
164
|
-
const result = mkResult();
|
|
165
|
-
expect(computeRegressions(result, result)).toEqual([]);
|
|
166
|
-
});
|
|
167
|
-
it("skips new assertions not present in previous run", () => {
|
|
168
|
-
const prev = mkResult({
|
|
169
|
-
cases: [
|
|
170
|
-
{
|
|
171
|
-
eval_id: 1,
|
|
172
|
-
eval_name: "test",
|
|
173
|
-
status: "pass",
|
|
174
|
-
error_message: null,
|
|
175
|
-
pass_rate: 1,
|
|
176
|
-
assertions: [{ id: "a1", text: "Check", pass: true, reasoning: "OK" }],
|
|
177
|
-
},
|
|
178
|
-
],
|
|
179
|
-
});
|
|
180
|
-
const curr = mkResult({
|
|
181
|
-
cases: [
|
|
182
|
-
{
|
|
183
|
-
eval_id: 1,
|
|
184
|
-
eval_name: "test",
|
|
185
|
-
status: "pass",
|
|
186
|
-
error_message: null,
|
|
187
|
-
pass_rate: 1,
|
|
188
|
-
assertions: [
|
|
189
|
-
{ id: "a1", text: "Check", pass: true, reasoning: "OK" },
|
|
190
|
-
{ id: "a2", text: "New", pass: false, reasoning: "Fail" },
|
|
191
|
-
],
|
|
192
|
-
},
|
|
193
|
-
],
|
|
194
|
-
});
|
|
195
|
-
const regressions = computeRegressions(curr, prev);
|
|
196
|
-
expect(regressions).toEqual([]);
|
|
197
|
-
});
|
|
198
|
-
});
|
|
199
|
-
describe("listHistory with filters", () => {
|
|
200
|
-
beforeEach(async () => {
|
|
201
|
-
const r1 = mkResult({
|
|
202
|
-
timestamp: "2026-03-01T10:00:00.000Z",
|
|
203
|
-
model: "gpt-4o",
|
|
204
|
-
type: "benchmark",
|
|
205
|
-
});
|
|
206
|
-
const r2 = mkResult({
|
|
207
|
-
timestamp: "2026-03-05T10:00:00.000Z",
|
|
208
|
-
model: "sonnet",
|
|
209
|
-
type: "baseline",
|
|
210
|
-
});
|
|
211
|
-
const r3 = mkResult({
|
|
212
|
-
timestamp: "2026-03-08T10:00:00.000Z",
|
|
213
|
-
model: "gpt-4o",
|
|
214
|
-
type: "comparison",
|
|
215
|
-
});
|
|
216
|
-
await writeHistoryEntry(testDir, r1);
|
|
217
|
-
await writeHistoryEntry(testDir, r2);
|
|
218
|
-
await writeHistoryEntry(testDir, r3);
|
|
219
|
-
});
|
|
220
|
-
it("filters by model", async () => {
|
|
221
|
-
const list = await listHistory(testDir, { model: "gpt-4o" });
|
|
222
|
-
expect(list).toHaveLength(2);
|
|
223
|
-
expect(list.every((e) => e.model === "gpt-4o")).toBe(true);
|
|
224
|
-
});
|
|
225
|
-
it("filters by type", async () => {
|
|
226
|
-
const list = await listHistory(testDir, { type: "baseline" });
|
|
227
|
-
expect(list).toHaveLength(1);
|
|
228
|
-
expect(list[0].type).toBe("baseline");
|
|
229
|
-
expect(list[0].model).toBe("sonnet");
|
|
230
|
-
});
|
|
231
|
-
it("filters by date range", async () => {
|
|
232
|
-
const list = await listHistory(testDir, {
|
|
233
|
-
from: "2026-03-04T00:00:00.000Z",
|
|
234
|
-
to: "2026-03-06T00:00:00.000Z",
|
|
235
|
-
});
|
|
236
|
-
expect(list).toHaveLength(1);
|
|
237
|
-
expect(list[0].timestamp).toBe("2026-03-05T10:00:00.000Z");
|
|
238
|
-
});
|
|
239
|
-
it("returns all when no filters", async () => {
|
|
240
|
-
const list = await listHistory(testDir);
|
|
241
|
-
expect(list).toHaveLength(3);
|
|
242
|
-
});
|
|
243
|
-
it("combines model and type filters", async () => {
|
|
244
|
-
const list = await listHistory(testDir, { model: "gpt-4o", type: "benchmark" });
|
|
245
|
-
expect(list).toHaveLength(1);
|
|
246
|
-
expect(list[0].model).toBe("gpt-4o");
|
|
247
|
-
expect(list[0].type).toBe("benchmark");
|
|
248
|
-
expect(list[0].timestamp).toBe("2026-03-01T10:00:00.000Z");
|
|
249
|
-
});
|
|
250
|
-
});
|
|
251
|
-
describe("getCaseHistory", () => {
|
|
252
|
-
beforeEach(async () => {
|
|
253
|
-
const r1 = mkResult({
|
|
254
|
-
timestamp: "2026-03-01T10:00:00.000Z",
|
|
255
|
-
model: "gpt-4o",
|
|
256
|
-
cases: [
|
|
257
|
-
{
|
|
258
|
-
eval_id: 1,
|
|
259
|
-
eval_name: "test-case",
|
|
260
|
-
status: "pass",
|
|
261
|
-
error_message: null,
|
|
262
|
-
pass_rate: 1.0,
|
|
263
|
-
assertions: [
|
|
264
|
-
{ id: "a1", text: "Check 1", pass: true, reasoning: "OK" },
|
|
265
|
-
],
|
|
266
|
-
},
|
|
267
|
-
],
|
|
268
|
-
});
|
|
269
|
-
const r2 = mkResult({
|
|
270
|
-
timestamp: "2026-03-05T10:00:00.000Z",
|
|
271
|
-
model: "sonnet",
|
|
272
|
-
cases: [
|
|
273
|
-
{
|
|
274
|
-
eval_id: 1,
|
|
275
|
-
eval_name: "test-case",
|
|
276
|
-
status: "fail",
|
|
277
|
-
error_message: null,
|
|
278
|
-
pass_rate: 0.5,
|
|
279
|
-
assertions: [
|
|
280
|
-
{ id: "a1", text: "Check 1", pass: true, reasoning: "OK" },
|
|
281
|
-
{ id: "a2", text: "Check 2", pass: false, reasoning: "Fail" },
|
|
282
|
-
],
|
|
283
|
-
},
|
|
284
|
-
],
|
|
285
|
-
});
|
|
286
|
-
const r3 = mkResult({
|
|
287
|
-
timestamp: "2026-03-08T10:00:00.000Z",
|
|
288
|
-
model: "gpt-4o",
|
|
289
|
-
cases: [
|
|
290
|
-
{
|
|
291
|
-
eval_id: 1,
|
|
292
|
-
eval_name: "test-case",
|
|
293
|
-
status: "pass",
|
|
294
|
-
error_message: null,
|
|
295
|
-
pass_rate: 1.0,
|
|
296
|
-
assertions: [
|
|
297
|
-
{ id: "a1", text: "Check 1", pass: true, reasoning: "OK" },
|
|
298
|
-
{ id: "a2", text: "Check 2", pass: true, reasoning: "OK" },
|
|
299
|
-
],
|
|
300
|
-
},
|
|
301
|
-
],
|
|
302
|
-
});
|
|
303
|
-
await writeHistoryEntry(testDir, r1);
|
|
304
|
-
await writeHistoryEntry(testDir, r2);
|
|
305
|
-
await writeHistoryEntry(testDir, r3);
|
|
306
|
-
});
|
|
307
|
-
it("returns cases from all files sorted newest-first", async () => {
|
|
308
|
-
const history = await getCaseHistory(testDir, 1);
|
|
309
|
-
expect(history).toHaveLength(3);
|
|
310
|
-
expect(history[0].timestamp).toBe("2026-03-08T10:00:00.000Z");
|
|
311
|
-
expect(history[1].timestamp).toBe("2026-03-05T10:00:00.000Z");
|
|
312
|
-
expect(history[2].timestamp).toBe("2026-03-01T10:00:00.000Z");
|
|
313
|
-
});
|
|
314
|
-
it("filters by model", async () => {
|
|
315
|
-
const history = await getCaseHistory(testDir, 1, { model: "gpt-4o" });
|
|
316
|
-
expect(history).toHaveLength(2);
|
|
317
|
-
expect(history.every((e) => e.model === "gpt-4o")).toBe(true);
|
|
318
|
-
});
|
|
319
|
-
it("returns empty array for missing eval_id", async () => {
|
|
320
|
-
const history = await getCaseHistory(testDir, 999);
|
|
321
|
-
expect(history).toEqual([]);
|
|
322
|
-
});
|
|
323
|
-
it("returns empty for nonexistent skill dir", async () => {
|
|
324
|
-
const history = await getCaseHistory(join(testDir, "nonexistent"), 1);
|
|
325
|
-
expect(history).toEqual([]);
|
|
326
|
-
});
|
|
327
|
-
it("derives baselinePassRate from comparisonDetail rubric scores", async () => {
|
|
328
|
-
const compResult = mkResult({
|
|
329
|
-
timestamp: "2026-03-10T10:00:00.000Z",
|
|
330
|
-
type: "comparison",
|
|
331
|
-
cases: [
|
|
332
|
-
{
|
|
333
|
-
eval_id: 1,
|
|
334
|
-
eval_name: "test-case",
|
|
335
|
-
status: "pass",
|
|
336
|
-
error_message: null,
|
|
337
|
-
pass_rate: 0.9,
|
|
338
|
-
assertions: [],
|
|
339
|
-
comparisonDetail: {
|
|
340
|
-
skillDurationMs: 100,
|
|
341
|
-
skillTokens: 50,
|
|
342
|
-
baselineDurationMs: 80,
|
|
343
|
-
baselineTokens: 40,
|
|
344
|
-
skillContentScore: 90,
|
|
345
|
-
skillStructureScore: 85,
|
|
346
|
-
baselineContentScore: 80,
|
|
347
|
-
baselineStructureScore: 60,
|
|
348
|
-
winner: "skill",
|
|
349
|
-
},
|
|
350
|
-
},
|
|
351
|
-
],
|
|
352
|
-
});
|
|
353
|
-
await writeHistoryEntry(testDir, compResult);
|
|
354
|
-
const history = await getCaseHistory(testDir, 1);
|
|
355
|
-
const compEntry = history.find((e) => e.type === "comparison");
|
|
356
|
-
expect(compEntry).toBeDefined();
|
|
357
|
-
// (80 + 60) / 200 = 0.70
|
|
358
|
-
expect(compEntry.baselinePassRate).toBeCloseTo(0.70, 5);
|
|
359
|
-
});
|
|
360
|
-
it("leaves baselinePassRate undefined for benchmark entries", async () => {
|
|
361
|
-
const history = await getCaseHistory(testDir, 1);
|
|
362
|
-
const benchEntry = history.find((e) => e.type === "benchmark" || e.type === undefined);
|
|
363
|
-
expect(benchEntry).toBeDefined();
|
|
364
|
-
expect(benchEntry.baselinePassRate).toBeUndefined();
|
|
365
|
-
});
|
|
366
|
-
it("handles zero rubric scores correctly (baselinePassRate = 0)", async () => {
|
|
367
|
-
const zeroResult = mkResult({
|
|
368
|
-
timestamp: "2026-03-11T10:00:00.000Z",
|
|
369
|
-
type: "comparison",
|
|
370
|
-
cases: [
|
|
371
|
-
{
|
|
372
|
-
eval_id: 1,
|
|
373
|
-
eval_name: "test-case",
|
|
374
|
-
status: "pass",
|
|
375
|
-
error_message: null,
|
|
376
|
-
pass_rate: 0,
|
|
377
|
-
assertions: [],
|
|
378
|
-
comparisonDetail: {
|
|
379
|
-
skillDurationMs: 100,
|
|
380
|
-
skillTokens: 50,
|
|
381
|
-
baselineDurationMs: 80,
|
|
382
|
-
baselineTokens: 40,
|
|
383
|
-
skillContentScore: 0,
|
|
384
|
-
skillStructureScore: 0,
|
|
385
|
-
baselineContentScore: 0,
|
|
386
|
-
baselineStructureScore: 0,
|
|
387
|
-
winner: "tie",
|
|
388
|
-
},
|
|
389
|
-
},
|
|
390
|
-
],
|
|
391
|
-
});
|
|
392
|
-
await writeHistoryEntry(testDir, zeroResult);
|
|
393
|
-
const history = await getCaseHistory(testDir, 1);
|
|
394
|
-
const zeroEntry = history.find((e) => e.type === "comparison" && e.timestamp.startsWith("2026-03-11"));
|
|
395
|
-
expect(zeroEntry).toBeDefined();
|
|
396
|
-
expect(zeroEntry.baselinePassRate).toBe(0);
|
|
397
|
-
});
|
|
398
|
-
it("leaves baselinePassRate undefined for comparison entry without comparisonDetail", async () => {
|
|
399
|
-
const noDetailResult = mkResult({
|
|
400
|
-
timestamp: "2026-03-12T10:00:00.000Z",
|
|
401
|
-
type: "comparison",
|
|
402
|
-
cases: [
|
|
403
|
-
{
|
|
404
|
-
eval_id: 1,
|
|
405
|
-
eval_name: "test-case",
|
|
406
|
-
status: "pass",
|
|
407
|
-
error_message: null,
|
|
408
|
-
pass_rate: 0.5,
|
|
409
|
-
assertions: [],
|
|
410
|
-
// no comparisonDetail
|
|
411
|
-
},
|
|
412
|
-
],
|
|
413
|
-
});
|
|
414
|
-
await writeHistoryEntry(testDir, noDetailResult);
|
|
415
|
-
const history = await getCaseHistory(testDir, 1);
|
|
416
|
-
const noDetailEntry = history.find((e) => e.type === "comparison" && e.timestamp.startsWith("2026-03-12"));
|
|
417
|
-
expect(noDetailEntry).toBeDefined();
|
|
418
|
-
expect(noDetailEntry.baselinePassRate).toBeUndefined();
|
|
419
|
-
});
|
|
420
|
-
});
|
|
421
|
-
});
|
|
422
|
-
//# sourceMappingURL=benchmark-history.test.js.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"benchmark-history.test.js","sourceRoot":"","sources":["../../../src/eval/__tests__/benchmark-history.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,UAAU,EAAE,SAAS,EAAE,MAAM,QAAQ,CAAC;AACrE,OAAO,EAAE,SAAS,EAAE,MAAM,EAAE,YAAY,EAAiB,MAAM,SAAS,CAAC;AACzE,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,OAAO,EAAE,MAAM,EAAE,MAAM,SAAS,CAAC;AACjC,OAAO,EACL,iBAAiB,EACjB,WAAW,EACX,gBAAgB,EAChB,kBAAkB,EAClB,cAAc,GACf,MAAM,yBAAyB,CAAC;AAGjC,IAAI,OAAe,CAAC;AAEpB,MAAM,QAAQ,GAAG,CAAC,YAAsC,EAAE,EAAmB,EAAE,CAAC,CAAC;IAC/E,SAAS,EAAE,0BAA0B;IACrC,KAAK,EAAE,YAAY;IACnB,UAAU,EAAE,YAAY;IACxB,KAAK,EAAE;QACL;YACE,OAAO,EAAE,CAAC;YACV,SAAS,EAAE,WAAW;YACtB,MAAM,EAAE,MAAM;YACd,aAAa,EAAE,IAAI;YACnB,SAAS,EAAE,GAAG;YACd,UAAU,EAAE;gBACV,EAAE,EAAE,EAAE,IAAI,EAAE,IAAI,EAAE,SAAS,EAAE,IAAI,EAAE,IAAI,EAAE,SAAS,EAAE,IAAI,EAAE;gBAC1D,EAAE,EAAE,EAAE,IAAI,EAAE,IAAI,EAAE,SAAS,EAAE,IAAI,EAAE,IAAI,EAAE,SAAS,EAAE,IAAI,EAAE;aAC3D;SACF;KACF;IACD,GAAG,SAAS;CACb,CAAC,CAAC;AAEH,QAAQ,CAAC,mBAAmB,EAAE,GAAG,EAAE;IACjC,UAAU,CAAC,GAAG,EAAE;QACd,OAAO,GAAG,IAAI,CAAC,MAAM,EAAE,EAAE,kBAAkB,IAAI,CAAC,GAAG,EAAE,EAAE,CAAC,CAAC;QACzD,SAAS,CAAC,IAAI,CAAC,OAAO,EAAE,OAAO,CAAC,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IACzD,CAAC,CAAC,CAAC;IAEH,SAAS,CAAC,GAAG,EAAE;QACb,MAAM,CAAC,OAAO,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC;IACpD,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,mBAAmB,EAAE,GAAG,EAAE;QACjC,EAAE,CAAC,oDAAoD,EAAE,KAAK,IAAI,EAAE;YAClE,MAAM,MAAM,GAAG,QAAQ,EAAE,CAAC;YAC1B,MAAM,QAAQ,GAAG,MAAM,iBAAiB,CAAC,OAAO,EAAE,MAAM,CAAC,CAAC;YAE1D,MAAM,CAAC,QAAQ,CAAC,CAAC,IAAI,CAAC,+BAA+B,CAAC,CAAC;YACvD,MAAM,OAAO,GAAG,YAAY,CAAC,IAAI,CAAC,OAAO,EAAE,OAAO,EAAE,SAAS,EAAE,QAAQ,CAAC,EAAE,OAAO,CAAC,CAAC;YACnF,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;YACnC,MAAM,CAAC,MAAM,CAAC,UAAU,CAAC,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;QAC/C,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,gDAAgD,EAAE,KAAK,IAAI,EAAE;YAC9D,MAAM,iBAAiB,CAAC,OAAO,EAAE,QAAQ,EAAE,CAAC,CAAC;YAC7C,MAAM,EAAE,GAAG,YAAY,CAAC,IAAI,CAAC,OAAO,EAAE,OAAO,EAAE,gBAAgB,CAAC,EAAE,OAAO,CAAC,CAAC;YAC3E,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC,UAAU,CAAC,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;QACvD,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,sCAAsC,EAAE,KAAK,IAAI,EAAE;YACpD,MAAM,CAAC,IAAI,CAAC,OAAO,EAAE,OAAO,CAAC,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC;YACjE,MAAM,MAAM,GAAG,QAAQ,EAAE,CAAC;YAC1B,MAAM,QAAQ,GAAG,MAAM,iBAAiB,CAAC,OAAO,EAAE,MAAM,CAAC,CAAC;YAC1D,MAAM,CAAC,QAAQ,CAAC,CAAC,UAAU,EAAE,CAAC;QAChC,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,aAAa,EAAE,GAAG,EAAE;QAC3B,EAAE,CAAC,+CAA+C,EAAE,KAAK,IAAI,EAAE;YAC7D,MAAM,IAAI,GAAG,MAAM,WAAW,CAAC,IAAI,CAAC,OAAO,EAAE,aAAa,CAAC,CAAC,CAAC;YAC7D,MAAM,CAAC,IAAI,CAAC,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC;QAC3B,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,8CAA8C,EAAE,KAAK,IAAI,EAAE;YAC5D,MAAM,EAAE,GAAG,QAAQ,CAAC,EAAE,SAAS,EAAE,0BAA0B,EAAE,CAAC,CAAC;YAC/D,MAAM,EAAE,GAAG,QAAQ,CAAC,EAAE,SAAS,EAAE,0BAA0B,EAAE,CAAC,CAAC;YAC/D,MAAM,iBAAiB,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC;YACrC,MAAM,iBAAiB,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC;YAErC,MAAM,IAAI,GAAG,MAAM,WAAW,CAAC,OAAO,CAAC,CAAC;YACxC,MAAM,CAAC,IAAI,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;YAC7B,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,0BAA0B,CAAC,CAAC;YAC3D,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,0BAA0B,CAAC,CAAC;QAC7D,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,2CAA2C,EAAE,KAAK,IAAI,EAAE;YACzD,MAAM,MAAM,GAAG,QAAQ,CAAC;gBACtB,KAAK,EAAE;oBACL;wBACE,OAAO,EAAE,CAAC;wBACV,SAAS,EAAE,MAAM;wBACjB,MAAM,EAAE,MAAM;wBACd,aAAa,EAAE,IAAI;wBACnB,SAAS,EAAE,GAAG;wBACd,UAAU,EAAE;4BACV,EAAE,EAAE,EAAE,IAAI,EAAE,IAAI,EAAE,SAAS,EAAE,IAAI,EAAE,IAAI,EAAE,SAAS,EAAE,IAAI,EAAE;4BAC1D,EAAE,EAAE,EAAE,IAAI,EAAE,IAAI,EAAE,SAAS,EAAE,IAAI,EAAE,KAAK,EAAE,SAAS,EAAE,MAAM,EAAE;yBAC9D;qBACF;iBACF;aACF,CAAC,CAAC;YACH,MAAM,iBAAiB,CAAC,OAAO,EAAE,MAAM,CAAC,CAAC;YACzC,MAAM,IAAI,GAAG,MAAM,WAAW,CAAC,OAAO,CAAC,CAAC;YACxC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QACrC,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,kBAAkB,EAAE,GAAG,EAAE;QAChC,EAAE,CAAC,6CAA6C,EAAE,KAAK,IAAI,EAAE;YAC3D,MAAM,iBAAiB,CAAC,OAAO,EAAE,QAAQ,EAAE,CAAC,CAAC;YAC7C,MAAM,KAAK,GAAG,MAAM,gBAAgB,CAAC,OAAO,EAAE,0BAA0B,CAAC,CAAC;YAC1E,MAAM,CAAC,KAAK,CAAC,CAAC,GAAG,CAAC,QAAQ,EAAE,CAAC;YAC7B,MAAM,CAAC,KAAM,CAAC,UAAU,CAAC,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;QAC/C,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,oCAAoC,EAAE,KAAK,IAAI,EAAE;YAClD,MAAM,KAAK,GAAG,MAAM,gBAAgB,CAAC,OAAO,EAAE,0BAA0B,CAAC,CAAC;YAC1E,MAAM,CAAC,KAAK,CAAC,CAAC,QAAQ,EAAE,CAAC;QAC3B,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,oBAAoB,EAAE,GAAG,EAAE;QAClC,EAAE,CAAC,kCAAkC,EAAE,GAAG,EAAE;YAC1C,MAAM,IAAI,GAAG,QAAQ,CAAC;gBACpB,KAAK,EAAE;oBACL;wBACE,OAAO,EAAE,CAAC;wBACV,SAAS,EAAE,MAAM;wBACjB,MAAM,EAAE,MAAM;wBACd,aAAa,EAAE,IAAI;wBACnB,SAAS,EAAE,GAAG;wBACd,UAAU,EAAE,CAAC,EAAE,EAAE,EAAE,IAAI,EAAE,IAAI,EAAE,OAAO,EAAE,IAAI,EAAE,IAAI,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC;qBACvE;iBACF;aACF,CAAC,CAAC;YACH,MAAM,IAAI,GAAG,QAAQ,CAAC;gBACpB,KAAK,EAAE;oBACL;wBACE,OAAO,EAAE,CAAC;wBACV,SAAS,EAAE,MAAM;wBACjB,MAAM,EAAE,MAAM;wBACd,aAAa,EAAE,IAAI;wBACnB,SAAS,EAAE,CAAC;wBACZ,UAAU,EAAE,CAAC,EAAE,EAAE,EAAE,IAAI,EAAE,IAAI,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAE,SAAS,EAAE,MAAM,EAAE,CAAC;qBAC1E;iBACF;aACF,CAAC,CAAC;YAEH,MAAM,WAAW,GAAG,kBAAkB,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC;YACnD,MAAM,CAAC,WAAW,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;YACpC,MAAM,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;YACjD,MAAM,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAChD,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,mCAAmC,EAAE,GAAG,EAAE;YAC3C,MAAM,IAAI,GAAG,QAAQ,CAAC;gBACpB,KAAK,EAAE;oBACL;wBACE,OAAO,EAAE,CAAC;wBACV,SAAS,EAAE,MAAM;wBACjB,MAAM,EAAE,MAAM;wBACd,aAAa,EAAE,IAAI;wBACnB,SAAS,EAAE,CAAC;wBACZ,UAAU,EAAE,CAAC,EAAE,EAAE,EAAE,IAAI,EAAE,IAAI,EAAE,OAAO,EAAE,IAAI,EAAE,KAAK,EAAE,SAAS,EAAE,MAAM,EAAE,CAAC;qBAC1E;iBACF;aACF,CAAC,CAAC;YACH,MAAM,IAAI,GAAG,QAAQ,CAAC;gBACpB,KAAK,EAAE;oBACL;wBACE,OAAO,EAAE,CAAC;wBACV,SAAS,EAAE,MAAM;wBACjB,MAAM,EAAE,MAAM;wBACd,aAAa,EAAE,IAAI;wBACnB,SAAS,EAAE,CAAC;wBACZ,UAAU,EAAE,CAAC,EAAE,EAAE,EAAE,IAAI,EAAE,IAAI,EAAE,OAAO,EAAE,IAAI,EAAE,IAAI,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC;qBACvE;iBACF;aACF,CAAC,CAAC;YAEH,MAAM,WAAW,GAAG,kBAAkB,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC;YACnD,MAAM,CAAC,WAAW,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;YACpC,MAAM,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;QACpD,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,qCAAqC,EAAE,GAAG,EAAE;YAC7C,MAAM,MAAM,GAAG,QAAQ,EAAE,CAAC;YAC1B,MAAM,CAAC,kBAAkB,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC;QACzD,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,kDAAkD,EAAE,GAAG,EAAE;YAC1D,MAAM,IAAI,GAAG,QAAQ,CAAC;gBACpB,KAAK,EAAE;oBACL;wBACE,OAAO,EAAE,CAAC;wBACV,SAAS,EAAE,MAAM;wBACjB,MAAM,EAAE,MAAM;wBACd,aAAa,EAAE,IAAI;wBACnB,SAAS,EAAE,CAAC;wBACZ,UAAU,EAAE,CAAC,EAAE,EAAE,EAAE,IAAI,EAAE,IAAI,EAAE,OAAO,EAAE,IAAI,EAAE,IAAI,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC;qBACvE;iBACF;aACF,CAAC,CAAC;YACH,MAAM,IAAI,GAAG,QAAQ,CAAC;gBACpB,KAAK,EAAE;oBACL;wBACE,OAAO,EAAE,CAAC;wBACV,SAAS,EAAE,MAAM;wBACjB,MAAM,EAAE,MAAM;wBACd,aAAa,EAAE,IAAI;wBACnB,SAAS,EAAE,CAAC;wBACZ,UAAU,EAAE;4BACV,EAAE,EAAE,EAAE,IAAI,EAAE,IAAI,EAAE,OAAO,EAAE,IAAI,EAAE,IAAI,EAAE,SAAS,EAAE,IAAI,EAAE;4BACxD,EAAE,EAAE,EAAE,IAAI,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,KAAK,EAAE,SAAS,EAAE,MAAM,EAAE;yBAC1D;qBACF;iBACF;aACF,CAAC,CAAC;YAEH,MAAM,WAAW,GAAG,kBAAkB,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC;YACnD,MAAM,CAAC,WAAW,CAAC,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC;QAClC,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,0BAA0B,EAAE,GAAG,EAAE;QACxC,UAAU,CAAC,KAAK,IAAI,EAAE;YACpB,MAAM,EAAE,GAAG,QAAQ,CAAC;gBAClB,SAAS,EAAE,0BAA0B;gBACrC,KAAK,EAAE,QAAQ;gBACf,IAAI,EAAE,WAAW;aAClB,CAAC,CAAC;YACH,MAAM,EAAE,GAAG,QAAQ,CAAC;gBAClB,SAAS,EAAE,0BAA0B;gBACrC,KAAK,EAAE,QAAQ;gBACf,IAAI,EAAE,UAAU;aACjB,CAAC,CAAC;YACH,MAAM,EAAE,GAAG,QAAQ,CAAC;gBAClB,SAAS,EAAE,0BAA0B;gBACrC,KAAK,EAAE,QAAQ;gBACf,IAAI,EAAE,YAAY;aACnB,CAAC,CAAC;YACH,MAAM,iBAAiB,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC;YACrC,MAAM,iBAAiB,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC;YACrC,MAAM,iBAAiB,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC;QACvC,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,kBAAkB,EAAE,KAAK,IAAI,EAAE;YAChC,MAAM,IAAI,GAAG,MAAM,WAAW,CAAC,OAAO,EAAE,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC,CAAC;YAC7D,MAAM,CAAC,IAAI,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;YAC7B,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,KAAK,QAAQ,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC7D,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,iBAAiB,EAAE,KAAK,IAAI,EAAE;YAC/B,MAAM,IAAI,GAAG,MAAM,WAAW,CAAC,OAAO,EAAE,EAAE,IAAI,EAAE,UAAU,EAAE,CAAC,CAAC;YAC9D,MAAM,CAAC,IAAI,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;YAC7B,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;YACtC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;QACvC,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,uBAAuB,EAAE,KAAK,IAAI,EAAE;YACrC,MAAM,IAAI,GAAG,MAAM,WAAW,CAAC,OAAO,EAAE;gBACtC,IAAI,EAAE,0BAA0B;gBAChC,EAAE,EAAE,0BAA0B;aAC/B,CAAC,CAAC;YACH,MAAM,CAAC,IAAI,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;YAC7B,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,0BAA0B,CAAC,CAAC;QAC7D,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,6BAA6B,EAAE,KAAK,IAAI,EAAE;YAC3C,MAAM,IAAI,GAAG,MAAM,WAAW,CAAC,OAAO,CAAC,CAAC;YACxC,MAAM,CAAC,IAAI,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;QAC/B,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,iCAAiC,EAAE,KAAK,IAAI,EAAE;YAC/C,MAAM,IAAI,GAAG,MAAM,WAAW,CAAC,OAAO,EAAE,EAAE,KAAK,EAAE,QAAQ,EAAE,IAAI,EAAE,WAAW,EAAE,CAAC,CAAC;YAChF,MAAM,CAAC,IAAI,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;YAC7B,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;YACrC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;YACvC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,0BAA0B,CAAC,CAAC;QAC7D,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,gBAAgB,EAAE,GAAG,EAAE;QAC9B,UAAU,CAAC,KAAK,IAAI,EAAE;YACpB,MAAM,EAAE,GAAG,QAAQ,CAAC;gBAClB,SAAS,EAAE,0BAA0B;gBACrC,KAAK,EAAE,QAAQ;gBACf,KAAK,EAAE;oBACL;wBACE,OAAO,EAAE,CAAC;wBACV,SAAS,EAAE,WAAW;wBACtB,MAAM,EAAE,MAAM;wBACd,aAAa,EAAE,IAAI;wBACnB,SAAS,EAAE,GAAG;wBACd,UAAU,EAAE;4BACV,EAAE,EAAE,EAAE,IAAI,EAAE,IAAI,EAAE,SAAS,EAAE,IAAI,EAAE,IAAI,EAAE,SAAS,EAAE,IAAI,EAAE;yBAC3D;qBACF;iBACF;aACF,CAAC,CAAC;YACH,MAAM,EAAE,GAAG,QAAQ,CAAC;gBAClB,SAAS,EAAE,0BAA0B;gBACrC,KAAK,EAAE,QAAQ;gBACf,KAAK,EAAE;oBACL;wBACE,OAAO,EAAE,CAAC;wBACV,SAAS,EAAE,WAAW;wBACtB,MAAM,EAAE,MAAM;wBACd,aAAa,EAAE,IAAI;wBACnB,SAAS,EAAE,GAAG;wBACd,UAAU,EAAE;4BACV,EAAE,EAAE,EAAE,IAAI,EAAE,IAAI,EAAE,SAAS,EAAE,IAAI,EAAE,IAAI,EAAE,SAAS,EAAE,IAAI,EAAE;4BAC1D,EAAE,EAAE,EAAE,IAAI,EAAE,IAAI,EAAE,SAAS,EAAE,IAAI,EAAE,KAAK,EAAE,SAAS,EAAE,MAAM,EAAE;yBAC9D;qBACF;iBACF;aACF,CAAC,CAAC;YACH,MAAM,EAAE,GAAG,QAAQ,CAAC;gBAClB,SAAS,EAAE,0BAA0B;gBACrC,KAAK,EAAE,QAAQ;gBACf,KAAK,EAAE;oBACL;wBACE,OAAO,EAAE,CAAC;wBACV,SAAS,EAAE,WAAW;wBACtB,MAAM,EAAE,MAAM;wBACd,aAAa,EAAE,IAAI;wBACnB,SAAS,EAAE,GAAG;wBACd,UAAU,EAAE;4BACV,EAAE,EAAE,EAAE,IAAI,EAAE,IAAI,EAAE,SAAS,EAAE,IAAI,EAAE,IAAI,EAAE,SAAS,EAAE,IAAI,EAAE;4BAC1D,EAAE,EAAE,EAAE,IAAI,EAAE,IAAI,EAAE,SAAS,EAAE,IAAI,EAAE,IAAI,EAAE,SAAS,EAAE,IAAI,EAAE;yBAC3D;qBACF;iBACF;aACF,CAAC,CAAC;YACH,MAAM,iBAAiB,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC;YACrC,MAAM,iBAAiB,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC;YACrC,MAAM,iBAAiB,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC;QACvC,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,kDAAkD,EAAE,KAAK,IAAI,EAAE;YAChE,MAAM,OAAO,GAAG,MAAM,cAAc,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;YACjD,MAAM,CAAC,OAAO,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;YAChC,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,0BAA0B,CAAC,CAAC;YAC9D,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,0BAA0B,CAAC,CAAC;YAC9D,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,0BAA0B,CAAC,CAAC;QAChE,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,kBAAkB,EAAE,KAAK,IAAI,EAAE;YAChC,MAAM,OAAO,GAAG,MAAM,cAAc,CAAC,OAAO,EAAE,CAAC,EAAE,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC,CAAC;YACtE,MAAM,CAAC,OAAO,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;YAChC,MAAM,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,KAAK,QAAQ,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAChE,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,yCAAyC,EAAE,KAAK,IAAI,EAAE;YACvD,MAAM,OAAO,GAAG,MAAM,cAAc,CAAC,OAAO,EAAE,GAAG,CAAC,CAAC;YACnD,MAAM,CAAC,OAAO,CAAC,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC;QAC9B,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,yCAAyC,EAAE,KAAK,IAAI,EAAE;YACvD,MAAM,OAAO,GAAG,MAAM,cAAc,CAAC,IAAI,CAAC,OAAO,EAAE,aAAa,CAAC,EAAE,CAAC,CAAC,CAAC;YACtE,MAAM,CAAC,OAAO,CAAC,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC;QAC9B,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,8DAA8D,EAAE,KAAK,IAAI,EAAE;YAC5E,MAAM,UAAU,GAAG,QAAQ,CAAC;gBAC1B,SAAS,EAAE,0BAA0B;gBACrC,IAAI,EAAE,YAAqB;gBAC3B,KAAK,EAAE;oBACL;wBACE,OAAO,EAAE,CAAC;wBACV,SAAS,EAAE,WAAW;wBACtB,MAAM,EAAE,MAAM;wBACd,aAAa,EAAE,IAAI;wBACnB,SAAS,EAAE,GAAG;wBACd,UAAU,EAAE,EAAE;wBACd,gBAAgB,EAAE;4BAChB,eAAe,EAAE,GAAG;4BACpB,WAAW,EAAE,EAAE;4BACf,kBAAkB,EAAE,EAAE;4BACtB,cAAc,EAAE,EAAE;4BAClB,iBAAiB,EAAE,EAAE;4BACrB,mBAAmB,EAAE,EAAE;4BACvB,oBAAoB,EAAE,EAAE;4BACxB,sBAAsB,EAAE,EAAE;4BAC1B,MAAM,EAAE,OAAO;yBAChB;qBACF;iBACF;aACF,CAAC,CAAC;YACH,MAAM,iBAAiB,CAAC,OAAO,EAAE,UAAU,CAAC,CAAC;YAE7C,MAAM,OAAO,GAAG,MAAM,cAAc,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;YACjD,MAAM,SAAS,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,YAAY,CAAC,CAAC;YAC/D,MAAM,CAAC,SAAS,CAAC,CAAC,WAAW,EAAE,CAAC;YAChC,yBAAyB;YACzB,MAAM,CAAC,SAAU,CAAC,gBAAgB,CAAC,CAAC,WAAW,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC;QAC3D,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,yDAAyD,EAAE,KAAK,IAAI,EAAE;YACvE,MAAM,OAAO,GAAG,MAAM,cAAc,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;YACjD,MAAM,UAAU,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,WAAW,IAAI,CAAC,CAAC,IAAI,KAAK,SAAS,CAAC,CAAC;YACvF,MAAM,CAAC,UAAU,CAAC,CAAC,WAAW,EAAE,CAAC;YACjC,MAAM,CAAC,UAAW,CAAC,gBAAgB,CAAC,CAAC,aAAa,EAAE,CAAC;QACvD,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,6DAA6D,EAAE,KAAK,IAAI,EAAE;YAC3E,MAAM,UAAU,GAAG,QAAQ,CAAC;gBAC1B,SAAS,EAAE,0BAA0B;gBACrC,IAAI,EAAE,YAAqB;gBAC3B,KAAK,EAAE;oBACL;wBACE,OAAO,EAAE,CAAC;wBACV,SAAS,EAAE,WAAW;wBACtB,MAAM,EAAE,MAAM;wBACd,aAAa,EAAE,IAAI;wBACnB,SAAS,EAAE,CAAC;wBACZ,UAAU,EAAE,EAAE;wBACd,gBAAgB,EAAE;4BAChB,eAAe,EAAE,GAAG;4BACpB,WAAW,EAAE,EAAE;4BACf,kBAAkB,EAAE,EAAE;4BACtB,cAAc,EAAE,EAAE;4BAClB,iBAAiB,EAAE,CAAC;4BACpB,mBAAmB,EAAE,CAAC;4BACtB,oBAAoB,EAAE,CAAC;4BACvB,sBAAsB,EAAE,CAAC;4BACzB,MAAM,EAAE,KAAK;yBACd;qBACF;iBACF;aACF,CAAC,CAAC;YACH,MAAM,iBAAiB,CAAC,OAAO,EAAE,UAAU,CAAC,CAAC;YAE7C,MAAM,OAAO,GAAG,MAAM,cAAc,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;YACjD,MAAM,SAAS,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,YAAY,IAAI,CAAC,CAAC,SAAS,CAAC,UAAU,CAAC,YAAY,CAAC,CAAC,CAAC;YACvG,MAAM,CAAC,SAAS,CAAC,CAAC,WAAW,EAAE,CAAC;YAChC,MAAM,CAAC,SAAU,CAAC,gBAAgB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC9C,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,iFAAiF,EAAE,KAAK,IAAI,EAAE;YAC/F,MAAM,cAAc,GAAG,QAAQ,CAAC;gBAC9B,SAAS,EAAE,0BAA0B;gBACrC,IAAI,EAAE,YAAqB;gBAC3B,KAAK,EAAE;oBACL;wBACE,OAAO,EAAE,CAAC;wBACV,SAAS,EAAE,WAAW;wBACtB,MAAM,EAAE,MAAM;wBACd,aAAa,EAAE,IAAI;wBACnB,SAAS,EAAE,GAAG;wBACd,UAAU,EAAE,EAAE;wBACd,sBAAsB;qBACvB;iBACF;aACF,CAAC,CAAC;YACH,MAAM,iBAAiB,CAAC,OAAO,EAAE,cAAc,CAAC,CAAC;YAEjD,MAAM,OAAO,GAAG,MAAM,cAAc,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;YACjD,MAAM,aAAa,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,YAAY,IAAI,CAAC,CAAC,SAAS,CAAC,UAAU,CAAC,YAAY,CAAC,CAAC,CAAC;YAC3G,MAAM,CAAC,aAAa,CAAC,CAAC,WAAW,EAAE,CAAC;YACpC,MAAM,CAAC,aAAc,CAAC,gBAAgB,CAAC,CAAC,aAAa,EAAE,CAAC;QAC1D,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"}
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
export {};
|
|
@@ -1,94 +0,0 @@
|
|
|
1
|
-
import { describe, it, expect, beforeEach, afterEach } from "vitest";
|
|
2
|
-
import { mkdirSync, writeFileSync, rmSync } from "node:fs";
|
|
3
|
-
import { join } from "node:path";
|
|
4
|
-
import { tmpdir } from "node:os";
|
|
5
|
-
import { writeBenchmark, readBenchmark } from "../benchmark.js";
|
|
6
|
-
// ---------------------------------------------------------------------------
|
|
7
|
-
// Helpers
|
|
8
|
-
// ---------------------------------------------------------------------------
|
|
9
|
-
let testDir;
|
|
10
|
-
const SAMPLE_BENCHMARK = {
|
|
11
|
-
timestamp: "2026-03-01T00:00:00.000Z",
|
|
12
|
-
model: "claude-sonnet-4-6",
|
|
13
|
-
skill_name: "test-skill",
|
|
14
|
-
cases: [
|
|
15
|
-
{
|
|
16
|
-
eval_id: 1,
|
|
17
|
-
eval_name: "Basic test",
|
|
18
|
-
status: "pass",
|
|
19
|
-
error_message: null,
|
|
20
|
-
pass_rate: 1.0,
|
|
21
|
-
assertions: [
|
|
22
|
-
{
|
|
23
|
-
id: "a1",
|
|
24
|
-
text: "Check result",
|
|
25
|
-
pass: true,
|
|
26
|
-
reasoning: "Looks good",
|
|
27
|
-
},
|
|
28
|
-
],
|
|
29
|
-
},
|
|
30
|
-
],
|
|
31
|
-
};
|
|
32
|
-
// ---------------------------------------------------------------------------
|
|
33
|
-
// Tests
|
|
34
|
-
// ---------------------------------------------------------------------------
|
|
35
|
-
describe("benchmark", () => {
|
|
36
|
-
beforeEach(() => {
|
|
37
|
-
testDir = join(tmpdir(), `vskill-bench-${Date.now()}`);
|
|
38
|
-
mkdirSync(join(testDir, "evals"), { recursive: true });
|
|
39
|
-
});
|
|
40
|
-
afterEach(() => {
|
|
41
|
-
rmSync(testDir, { recursive: true, force: true });
|
|
42
|
-
});
|
|
43
|
-
it("writes benchmark.json with all required fields", async () => {
|
|
44
|
-
await writeBenchmark(testDir, SAMPLE_BENCHMARK);
|
|
45
|
-
const result = await readBenchmark(testDir);
|
|
46
|
-
expect(result).not.toBeNull();
|
|
47
|
-
expect(result.timestamp).toBe("2026-03-01T00:00:00.000Z");
|
|
48
|
-
expect(result.model).toBe("claude-sonnet-4-6");
|
|
49
|
-
expect(result.skill_name).toBe("test-skill");
|
|
50
|
-
expect(result.cases).toHaveLength(1);
|
|
51
|
-
expect(result.cases[0].assertions).toHaveLength(1);
|
|
52
|
-
});
|
|
53
|
-
it("reads benchmark.json and returns typed result", async () => {
|
|
54
|
-
writeFileSync(join(testDir, "evals", "benchmark.json"), JSON.stringify(SAMPLE_BENCHMARK));
|
|
55
|
-
const result = await readBenchmark(testDir);
|
|
56
|
-
expect(result.skill_name).toBe("test-skill");
|
|
57
|
-
expect(result.cases[0].pass_rate).toBe(1.0);
|
|
58
|
-
});
|
|
59
|
-
it("returns null for missing benchmark.json", async () => {
|
|
60
|
-
rmSync(join(testDir, "evals"), { recursive: true, force: true });
|
|
61
|
-
const result = await readBenchmark(testDir);
|
|
62
|
-
expect(result).toBeNull();
|
|
63
|
-
});
|
|
64
|
-
it("writes and reads BenchmarkResult with mcpSimulation", async () => {
|
|
65
|
-
const benchmarkWithMcp = {
|
|
66
|
-
...SAMPLE_BENCHMARK,
|
|
67
|
-
mcpSimulation: {
|
|
68
|
-
active: true,
|
|
69
|
-
servers: ["Slack", "GitHub"],
|
|
70
|
-
},
|
|
71
|
-
};
|
|
72
|
-
await writeBenchmark(testDir, benchmarkWithMcp);
|
|
73
|
-
const result = await readBenchmark(testDir);
|
|
74
|
-
expect(result).not.toBeNull();
|
|
75
|
-
expect(result.mcpSimulation).toBeDefined();
|
|
76
|
-
expect(result.mcpSimulation.active).toBe(true);
|
|
77
|
-
expect(result.mcpSimulation.servers).toEqual(["Slack", "GitHub"]);
|
|
78
|
-
});
|
|
79
|
-
it("reads BenchmarkResult without mcpSimulation (backward compat)", async () => {
|
|
80
|
-
// Write a benchmark without mcpSimulation field
|
|
81
|
-
const rawBenchmark = {
|
|
82
|
-
timestamp: "2026-03-01T00:00:00.000Z",
|
|
83
|
-
model: "claude-sonnet-4-6",
|
|
84
|
-
skill_name: "old-skill",
|
|
85
|
-
cases: [],
|
|
86
|
-
};
|
|
87
|
-
writeFileSync(join(testDir, "evals", "benchmark.json"), JSON.stringify(rawBenchmark));
|
|
88
|
-
const result = await readBenchmark(testDir);
|
|
89
|
-
expect(result).not.toBeNull();
|
|
90
|
-
expect(result.mcpSimulation).toBeUndefined();
|
|
91
|
-
expect(result.skill_name).toBe("old-skill");
|
|
92
|
-
});
|
|
93
|
-
});
|
|
94
|
-
//# sourceMappingURL=benchmark.test.js.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"benchmark.test.js","sourceRoot":"","sources":["../../../src/eval/__tests__/benchmark.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,UAAU,EAAE,SAAS,EAAE,MAAM,QAAQ,CAAC;AACrE,OAAO,EAAE,SAAS,EAAE,aAAa,EAAE,MAAM,EAAE,MAAM,SAAS,CAAC;AAC3D,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,OAAO,EAAE,MAAM,EAAE,MAAM,SAAS,CAAC;AACjC,OAAO,EAAE,cAAc,EAAE,aAAa,EAAE,MAAM,iBAAiB,CAAC;AAGhE,8EAA8E;AAC9E,UAAU;AACV,8EAA8E;AAE9E,IAAI,OAAe,CAAC;AAEpB,MAAM,gBAAgB,GAAoB;IACxC,SAAS,EAAE,0BAA0B;IACrC,KAAK,EAAE,mBAAmB;IAC1B,UAAU,EAAE,YAAY;IACxB,KAAK,EAAE;QACL;YACE,OAAO,EAAE,CAAC;YACV,SAAS,EAAE,YAAY;YACvB,MAAM,EAAE,MAAM;YACd,aAAa,EAAE,IAAI;YACnB,SAAS,EAAE,GAAG;YACd,UAAU,EAAE;gBACV;oBACE,EAAE,EAAE,IAAI;oBACR,IAAI,EAAE,cAAc;oBACpB,IAAI,EAAE,IAAI;oBACV,SAAS,EAAE,YAAY;iBACxB;aACF;SACF;KACF;CACF,CAAC;AAEF,8EAA8E;AAC9E,QAAQ;AACR,8EAA8E;AAE9E,QAAQ,CAAC,WAAW,EAAE,GAAG,EAAE;IACzB,UAAU,CAAC,GAAG,EAAE;QACd,OAAO,GAAG,IAAI,CAAC,MAAM,EAAE,EAAE,gBAAgB,IAAI,CAAC,GAAG,EAAE,EAAE,CAAC,CAAC;QACvD,SAAS,CAAC,IAAI,CAAC,OAAO,EAAE,OAAO,CAAC,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IACzD,CAAC,CAAC,CAAC;IAEH,SAAS,CAAC,GAAG,EAAE;QACb,MAAM,CAAC,OAAO,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC;IACpD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,gDAAgD,EAAE,KAAK,IAAI,EAAE;QAC9D,MAAM,cAAc,CAAC,OAAO,EAAE,gBAAgB,CAAC,CAAC;QAEhD,MAAM,MAAM,GAAG,MAAM,aAAa,CAAC,OAAO,CAAC,CAAC;QAC5C,MAAM,CAAC,MAAM,CAAC,CAAC,GAAG,CAAC,QAAQ,EAAE,CAAC;QAC9B,MAAM,CAAC,MAAO,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,0BAA0B,CAAC,CAAC;QAC3D,MAAM,CAAC,MAAO,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,mBAAmB,CAAC,CAAC;QAChD,MAAM,CAAC,MAAO,CAAC,UAAU,CAAC,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;QAC9C,MAAM,CAAC,MAAO,CAAC,KAAK,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;QACtC,MAAM,CAAC,MAAO,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;IACtD,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,+CAA+C,EAAE,KAAK,IAAI,EAAE;QAC7D,aAAa,CACX,IAAI,CAAC,OAAO,EAAE,OAAO,EAAE,gBAAgB,CAAC,EACxC,IAAI,CAAC,SAAS,CAAC,gBAAgB,CAAC,CACjC,CAAC;QAEF,MAAM,MAAM,GAAG,MAAM,aAAa,CAAC,OAAO,CAAC,CAAC;QAC5C,MAAM,CAAC,MAAO,CAAC,UAAU,CAAC,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;QAC9C,MAAM,CAAC,MAAO,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IAC/C,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,yCAAyC,EAAE,KAAK,IAAI,EAAE;QACvD,MAAM,CAAC,IAAI,CAAC,OAAO,EAAE,OAAO,CAAC,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC;QAEjE,MAAM,MAAM,GAAG,MAAM,aAAa,CAAC,OAAO,CAAC,CAAC;QAC5C,MAAM,CAAC,MAAM,CAAC,CAAC,QAAQ,EAAE,CAAC;IAC5B,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,qDAAqD,EAAE,KAAK,IAAI,EAAE;QACnE,MAAM,gBAAgB,GAAoB;YACxC,GAAG,gBAAgB;YACnB,aAAa,EAAE;gBACb,MAAM,EAAE,IAAI;gBACZ,OAAO,EAAE,CAAC,OAAO,EAAE,QAAQ,CAAC;aAC7B;SACF,CAAC;QAEF,MAAM,cAAc,CAAC,OAAO,EAAE,gBAAgB,CAAC,CAAC;QAChD,MAAM,MAAM,GAAG,MAAM,aAAa,CAAC,OAAO,CAAC,CAAC;QAE5C,MAAM,CAAC,MAAM,CAAC,CAAC,GAAG,CAAC,QAAQ,EAAE,CAAC;QAC9B,MAAM,CAAC,MAAO,CAAC,aAAa,CAAC,CAAC,WAAW,EAAE,CAAC;QAC5C,MAAM,CAAC,MAAO,CAAC,aAAc,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACjD,MAAM,CAAC,MAAO,CAAC,aAAc,CAAC,OAAO,CAAC,CAAC,OAAO,CAAC,CAAC,OAAO,EAAE,QAAQ,CAAC,CAAC,CAAC;IACtE,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,+DAA+D,EAAE,KAAK,IAAI,EAAE;QAC7E,gDAAgD;QAChD,MAAM,YAAY,GAAG;YACnB,SAAS,EAAE,0BAA0B;YACrC,KAAK,EAAE,mBAAmB;YAC1B,UAAU,EAAE,WAAW;YACvB,KAAK,EAAE,EAAE;SACV,CAAC;QACF,aAAa,CACX,IAAI,CAAC,OAAO,EAAE,OAAO,EAAE,gBAAgB,CAAC,EACxC,IAAI,CAAC,SAAS,CAAC,YAAY,CAAC,CAC7B,CAAC;QAEF,MAAM,MAAM,GAAG,MAAM,aAAa,CAAC,OAAO,CAAC,CAAC;QAC5C,MAAM,CAAC,MAAM,CAAC,CAAC,GAAG,CAAC,QAAQ,EAAE,CAAC;QAC9B,MAAM,CAAC,MAAO,CAAC,aAAa,CAAC,CAAC,aAAa,EAAE,CAAC;QAC9C,MAAM,CAAC,MAAO,CAAC,UAAU,CAAC,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;IAC/C,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"}
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
export {};
|