@jean.gnc/harness-kit 0.11.2 → 0.12.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agent/schema.d.ts +2 -2
- package/dist/cli.js +69 -0
- package/dist/cli.js.map +1 -1
- package/dist/eval/cases.d.ts +14 -0
- package/dist/eval/cases.d.ts.map +1 -0
- package/dist/eval/cases.js +84 -0
- package/dist/eval/cases.js.map +1 -0
- package/dist/eval/detect.d.ts +14 -0
- package/dist/eval/detect.d.ts.map +1 -0
- package/dist/eval/detect.js +105 -0
- package/dist/eval/detect.js.map +1 -0
- package/dist/eval/index.d.ts +20 -0
- package/dist/eval/index.d.ts.map +1 -0
- package/dist/eval/index.js +46 -0
- package/dist/eval/index.js.map +1 -0
- package/dist/eval/report.d.ts +15 -0
- package/dist/eval/report.d.ts.map +1 -0
- package/dist/eval/report.js +81 -0
- package/dist/eval/report.js.map +1 -0
- package/dist/eval/runner.d.ts +17 -0
- package/dist/eval/runner.d.ts.map +1 -0
- package/dist/eval/runner.js +89 -0
- package/dist/eval/runner.js.map +1 -0
- package/dist/eval/schema.d.ts +253 -0
- package/dist/eval/schema.d.ts.map +1 -0
- package/dist/eval/schema.js +50 -0
- package/dist/eval/schema.js.map +1 -0
- package/dist/eval/score.d.ts +13 -0
- package/dist/eval/score.d.ts.map +1 -0
- package/dist/eval/score.js +52 -0
- package/dist/eval/score.js.map +1 -0
- package/dist/index.d.ts +4 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +2 -0
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/agent/schema.d.ts
CHANGED
|
@@ -7,13 +7,13 @@ export declare const AgentSchema: z.ZodObject<{
|
|
|
7
7
|
}, "strict", z.ZodTypeAny, {
|
|
8
8
|
name: string;
|
|
9
9
|
description: string;
|
|
10
|
-
tools?: string | string[] | undefined;
|
|
11
10
|
model?: string | undefined;
|
|
11
|
+
tools?: string | string[] | undefined;
|
|
12
12
|
}, {
|
|
13
13
|
name: string;
|
|
14
14
|
description: string;
|
|
15
|
-
tools?: string | string[] | undefined;
|
|
16
15
|
model?: string | undefined;
|
|
16
|
+
tools?: string | string[] | undefined;
|
|
17
17
|
}>;
|
|
18
18
|
export type Agent = z.infer<typeof AgentSchema>;
|
|
19
19
|
export declare function defineAgent(agent: Agent): Agent;
|
package/dist/cli.js
CHANGED
|
@@ -3,8 +3,12 @@ import { readFileSync } from "node:fs";
|
|
|
3
3
|
import { fileURLToPath } from "node:url";
|
|
4
4
|
import { defineCommand, runMain } from "citty";
|
|
5
5
|
import { z } from "zod";
|
|
6
|
+
import { mkdir, writeFile } from "node:fs/promises";
|
|
7
|
+
import { dirname } from "node:path";
|
|
6
8
|
import { compile } from "./compile.js";
|
|
7
9
|
import { CHECK_MODES, check } from "./check/index.js";
|
|
10
|
+
import { formatConsole, runEval, toJson } from "./eval/index.js";
|
|
11
|
+
import { TIERS } from "./eval/schema.js";
|
|
8
12
|
import { initHarness } from "./init/index.js";
|
|
9
13
|
import { install, uninstall } from "./install/index.js";
|
|
10
14
|
import { parseInstallMode } from "./install/mode.js";
|
|
@@ -166,6 +170,70 @@ const lintCmd = defineCommand({
|
|
|
166
170
|
process.exit(1);
|
|
167
171
|
},
|
|
168
172
|
});
|
|
173
|
+
/**
 * Validate a `--tier` flag value.
 *
 * @param value raw flag value from the command line
 * @returns `value` unchanged when it is one of the entries of `TIERS`
 * @throws Error naming the rejected value and listing every valid tier
 */
function parseTier(value) {
    const isKnownTier = TIERS.includes(value);
    if (!isKnownTier) {
        throw new Error(`Unknown tier "${value}". Valid: ${TIERS.join(", ")}`);
    }
    return value;
}
|
|
178
|
+
/**
 * Parse a CLI flag value that must be a positive integer.
 *
 * Only plain decimal digit strings are accepted (an optional leading "+" and
 * surrounding whitespace are tolerated). Bare `Number(value)` coercion would
 * also admit hex ("0x10"), exponent ("1e3") and float-looking ("2.0")
 * spellings, as well as magnitudes beyond Number.MAX_SAFE_INTEGER that cannot
 * be represented exactly.
 *
 * @param value raw flag value as received from the command line
 * @param flag flag name without leading dashes; used in the error message
 * @returns the parsed integer, always >= 1
 * @throws Error when `value` is not a positive decimal integer
 */
function parsePositiveInt(value, flag) {
    const text = String(value).trim();
    // Gate on the textual form first so Number() only ever sees decimal digits.
    const parsed = /^\+?\d+$/.test(text) ? Number(text) : Number.NaN;
    if (!Number.isSafeInteger(parsed) || parsed < 1) {
        throw new Error(`--${flag} must be a positive integer, got "${value}"`);
    }
    return parsed;
}
|
|
185
|
+
/**
 * `eval` subcommand: runs the eval suite through `runEval`, prints the console
 * report, optionally writes the JSON report, and exits with status 1 when case
 * loading fails or at least one case failed.
 */
const evalCmd = defineCommand({
    meta: {
        name: "eval",
        description: "Run skill-routing evals against the installed harness",
    },
    args: {
        cases: {
            type: "string",
            default: "./evals/cases",
            description: "directory of eval case files",
        },
        cwd: {
            type: "string",
            default: ".",
            description: "working directory for claude -p sessions",
        },
        suite: {
            type: "string",
            description: "only run cases from this suite",
        },
        case: {
            type: "string",
            description: "only run the case with this id",
        },
        tier: {
            type: "string",
            description: `only run this tier: ${TIERS.join(" | ")}`,
        },
        runs: {
            type: "string",
            description: "runs per case (overrides per-case default)",
        },
        concurrency: {
            type: "string",
            default: "1",
            description: "max concurrent sessions (parallel sessions interfere with routing; default 1)",
        },
        model: {
            type: "string",
            description: "model for claude -p (default: user's configured model)",
        },
        json: {
            type: "string",
            description: "write machine-readable results to this path",
        },
    },
    run: async ({ args }) => {
        // Validate flags first, in the order concurrency -> tier -> runs, so the
        // first invalid flag is the one reported.
        const concurrency = parsePositiveInt(args.concurrency, "concurrency");
        const tier = args.tier === undefined ? undefined : parseTier(args.tier);
        const runs = args.runs === undefined ? undefined : parsePositiveInt(args.runs, "runs");
        // Optional filters are only passed to runEval when the flag was supplied.
        const outcome = await runEval({
            casesDir: args.cases,
            cwd: args.cwd,
            concurrency,
            ...(args.suite === undefined ? {} : { suite: args.suite }),
            ...(args.case === undefined ? {} : { caseId: args.case }),
            ...(tier === undefined ? {} : { tier }),
            ...(runs === undefined ? {} : { runs }),
            ...(args.model === undefined ? {} : { model: args.model }),
        });
        if (!outcome.ok) {
            for (const problem of outcome.error) {
                console.error(`${problem.file}: ${problem.message}`);
            }
            process.exit(1);
        }
        const report = outcome.value;
        console.log(formatConsole(report));
        if (args.json) {
            // Create the parent directory so a fresh output path works.
            await mkdir(dirname(args.json), { recursive: true });
            await writeFile(args.json, toJson(report) + "\n");
        }
        if (report.failed > 0) {
            process.exit(1);
        }
    },
});
|
|
169
237
|
const main = defineCommand({
|
|
170
238
|
meta: {
|
|
171
239
|
name: "harness",
|
|
@@ -175,6 +243,7 @@ const main = defineCommand({
|
|
|
175
243
|
subCommands: {
|
|
176
244
|
check: checkCmd,
|
|
177
245
|
compile: compileCmd,
|
|
246
|
+
eval: evalCmd,
|
|
178
247
|
init: initCmd,
|
|
179
248
|
install: installCmd,
|
|
180
249
|
lint: lintCmd,
|
package/dist/cli.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"cli.js","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":";AACA,OAAO,EAAE,YAAY,EAAE,MAAM,SAAS,CAAC;AACvC,OAAO,EAAE,aAAa,EAAE,MAAM,UAAU,CAAC;AAEzC,OAAO,EAAE,aAAa,EAAE,OAAO,EAAE,MAAM,OAAO,CAAC;AAC/C,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAExB,OAAO,EAAE,OAAO,EAAE,MAAM,cAAc,CAAC;AACvC,OAAO,EAAE,WAAW,EAAE,KAAK,EAA2C,MAAM,kBAAkB,CAAC;AAC/F,OAAO,EAAE,WAAW,EAAE,MAAM,iBAAiB,CAAC;AAC9C,OAAO,EAAE,OAAO,EAAE,SAAS,EAAE,MAAM,oBAAoB,CAAC;AACxD,OAAO,EAAE,gBAAgB,EAAE,MAAM,mBAAmB,CAAC;AACrD,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,cAAc,EAAE,qBAAqB,EAAE,MAAM,sBAAsB,CAAC;AAE7E,MAAM,iBAAiB,GAAG,CAAC,CAAC,MAAM,CAAC,EAAE,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;AAEnE,MAAM,OAAO,GAAG,aAAa,CAAC,IAAI,GAAG,CAAC,iBAAiB,EAAE,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC;AAC3E,MAAM,GAAG,GAAG,iBAAiB,CAAC,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,OAAO,EAAE,MAAM,CAAC,CAAC,CAAC,CAAC;AAE/E,SAAS,WAAW,CAAC,KAAa;IAChC,OAAQ,WAAiC,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC;AAC5D,CAAC;AAED,SAAS,cAAc,CAAC,KAAa;IACnC,IAAI,WAAW,CAAC,KAAK,CAAC;QAAE,OAAO,KAAK,CAAC;IACrC,MAAM,IAAI,KAAK,CAAC,uBAAuB,KAAK,aAAa,WAAW,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;AACrF,CAAC;AAED,MAAM,UAAU,GAAG,aAAa,CAAC;IAC/B,IAAI,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE,WAAW,EAAE,kCAAkC,EAAE;IAC1E,IAAI,EAAE;QACJ,GAAG,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,OAAO,EAAE,WAAW,EAAE,aAAa,EAAE;QACrE,GAAG,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,QAAQ,EAAE,WAAW,EAAE,aAAa,EAAE;QACtE,IAAI,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,GAAG,EAAE,WAAW,EAAE,sCAAsC,EAAE;QAC3F,MAAM,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE,OAAO,EAAE,KAAK,EAAE,WAAW,EAAE,sBAAsB,EAAE;KACjF;IACD,GAAG,EAAE,KAAK,EAAE,EAAE,IAAI,EAAE,EAAE,EAAE;QACtB,MAAM,OAAO,CAAC;YACZ,OAAO,EAAE,IAAI,CAAC,GAAG;YACjB,OAAO,EAAE,IAAI,CAAC,GAAG;YACjB,QAAQ,EAAE,IAAI,CAAC,IAAI;YACnB,MAAM,EAAE,IAAI,CAAC,MAAM;SACpB,CAAC,CAAC;IACL,CAAC;CACF,CAAC,CAAC;AAEH,MAAM,WAAW,GAAG;IAClB,IAAI,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,QAAQ,EAAE,WAAW,EAAE,WAAW
,EAAE;IACrE,IAAI,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,GAAG,EAAE,WAAW,EAAE,sCAAsC,EAAE;IAC3F,MAAM,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE,OAAO,EAAE,KAAK,EAAE,WAAW,EAAE,sBAAsB,EAAE;IAChF,SAAS,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE,OAAO,EAAE,KAAK,EAAE,WAAW,EAAE,6BAA6B,EAAE;CAClF,CAAC;AAEX,MAAM,UAAU,GAAG,aAAa,CAAC;IAC/B,IAAI,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE,WAAW,EAAE,qDAAqD,EAAE;IAC7F,IAAI,EAAE;QACJ,GAAG,WAAW;QACd,IAAI,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,OAAO,EAAE,WAAW,EAAE,gCAAgC,EAAE;KAC1F;IACD,GAAG,EAAE,KAAK,EAAE,EAAE,IAAI,EAAE,EAAE,EAAE;QACtB,MAAM,IAAI,GAAG,gBAAgB,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACzC,MAAM,OAAO,GAAG,MAAM,qBAAqB,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACvD,MAAM,OAAO,CAAC;YACZ,QAAQ,EAAE,IAAI,CAAC,IAAI;YACnB,QAAQ,EAAE,IAAI,CAAC,IAAI;YACnB,OAAO;YACP,IAAI;YACJ,MAAM,EAAE,IAAI,CAAC,MAAM;YACnB,MAAM,EAAE,IAAI,CAAC,SAAS,CAAC;SACxB,CAAC,CAAC;IACL,CAAC;CACF,CAAC,CAAC;AAEH,MAAM,YAAY,GAAG,aAAa,CAAC;IACjC,IAAI,EAAE,EAAE,IAAI,EAAE,WAAW,EAAE,WAAW,EAAE,8CAA8C,EAAE;IACxF,IAAI,EAAE,WAAW;IACjB,GAAG,EAAE,KAAK,EAAE,EAAE,IAAI,EAAE,EAAE,EAAE;QACtB,MAAM,OAAO,GAAG,MAAM,qBAAqB,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACvD,MAAM,SAAS,CAAC;YACd,QAAQ,EAAE,IAAI,CAAC,IAAI;YACnB,QAAQ,EAAE,IAAI,CAAC,IAAI;YACnB,OAAO;YACP,MAAM,EAAE,IAAI,CAAC,MAAM;YACnB,MAAM,EAAE,IAAI,CAAC,SAAS,CAAC;SACxB,CAAC,CAAC;IACL,CAAC;CACF,CAAC,CAAC;AAEH,MAAM,OAAO,GAAG,aAAa,CAAC;IAC5B,IAAI,EAAE;QACJ,IAAI,EAAE,MAAM;QACZ,WAAW,EAAE,2EAA2E;KACzF;IACD,IAAI,EAAE;QACJ,IAAI,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,GAAG,EAAE,WAAW,EAAE,4BAA4B,EAAE;QACjF,WAAW,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,QAAQ,EAAE,IAAI,EAAE,WAAW,EAAE,kBAAkB,EAAE;QAChF,OAAO,EAAE;YACP,IAAI,EAAE,QAAQ;YACd,QAAQ,EAAE,IAAI;YACd,WAAW,EAAE,mCAAmC,cAAc,EAAE;iBAC7D,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC;iBAClB,IAAI,CAAC,IAAI,CAAC,GAAG;SACjB;QACD,MAAM,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE,OAAO,EAAE,KAAK,EAAE,WAAW,EAAE,sBAAsB,EAAE;KACjF;IACD,GAAG,EAAE,KAAK,EAAE,EAAE,IAAI,EAAE,EAAE,EAAE;QACtB,MAAM,OAAO,GAAG,IAAI,CAAC,OAAO;aACzB,KAAK,CAAC,GAAG,CAAC;aACV,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,I
AAI,EAAE,CAAC;aACpB,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;QAC/B,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC;YAAE,MAAM,IAAI,KAAK,CAAC,yCAAyC,CAAC,CAAC;QACrF,cAAc,CAAC,OAAO,EAAE,cAAc,EAAE,CAAC,CAAC;QAC1C,MAAM,WAAW,CAAC;YAChB,QAAQ,EAAE,IAAI,CAAC,IAAI;YACnB,WAAW,EAAE,IAAI,CAAC,WAAW;YAC7B,OAAO;YACP,MAAM,EAAE,IAAI,CAAC,MAAM;SACpB,CAAC,CAAC;IACL,CAAC;CACF,CAAC,CAAC;AAEH,MAAM,QAAQ,GAAG,aAAa,CAAC;IAC7B,IAAI,EAAE;QACJ,IAAI,EAAE,OAAO;QACb,WAAW,EAAE,8DAA8D;KAC5E;IACD,IAAI,EAAE;QACJ,GAAG,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,OAAO,EAAE,WAAW,EAAE,aAAa,EAAE;QACrE,IAAI,EAAE;YACJ,IAAI,EAAE,QAAQ;YACd,OAAO,EAAE,WAAW;YACpB,WAAW,EAAE,2CAA2C;SACzD;QACD,MAAM,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE,OAAO,EAAE,KAAK,EAAE,WAAW,EAAE,2BAA2B,EAAE;KACtF;IACD,GAAG,EAAE,KAAK,EAAE,EAAE,IAAI,EAAE,EAAE,EAAE;QACtB,MAAM,IAAI,GAAG,cAAc,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACvC,MAAM,MAAM,GAAG,MAAM,KAAK,CAAC,EAAE,OAAO,EAAE,IAAI,CAAC,GAAG,EAAE,IAAI,EAAE,CAAC,CAAC;QACxD,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC;YACjB,MAAM,SAAS,GAAG,MAAM,CAAC,cAAc,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,CAAC,MAAM,IAAI,CAAC,CAAC,UAAU,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAC7F,MAAM,KAAK,GAAG,MAAM,CAAC,cAAc,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,UAAU,EAAE,CAAC,CAAC,CAAC;YAC9E,OAAO,CAAC,GAAG,CACT,WAAW,KAAK,kBAAkB,MAAM,CAAC,cAAc,CAAC,MAAM,aAAa,SAAS,GAAG,CACxF,CAAC;YACF,OAAO,CAAC,GAAG,CAAC,WAAW,MAAM,CAAC,YAAY,eAAe,CAAC,CAAC;YAC3D,IAAI,MAAM,CAAC,UAAU,CAAC,MAAM,GAAG,CAAC;gBAAE,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;QACpD,CAAC;QACD,KAAK,MAAM,CAAC,IAAI,MAAM,CAAC,UAAU,EAAE,CAAC;YAClC,OAAO,CAAC,GAAG,CAAC,eAAe,CAAC,CAAC,CAAC,CAAC,CAAC;QAClC,CAAC;QACD,IAAI,MAAM,CAAC,UAAU,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACjC,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC;gBACjB,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;gBAChB,OAAO,CAAC,GAAG,CAAC,GAAG,MAAM,CAAC,UAAU,CAAC,MAAM,aAAa,CAAC,CAAC;YACxD,CAAC;YACD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAClB,CAAC;IACH,CAAC;CACF,CAAC,CAAC;AAEH,SAAS,eAAe,CAAC,CAAqB;IAC5C,OAAO,GAAG,CAAC,CAAC,IAAI,IAAI,CAAC,C
AAC,IAAI,IAAI,CAAC,CAAC,MAAM,OAAO,CAAC,CAAC,KAAK,QAAQ,CAAC,CAAC,OAAO,EAAE,CAAC;AAC1E,CAAC;AAED,MAAM,OAAO,GAAG,aAAa,CAAC;IAC5B,IAAI,EAAE;QACJ,IAAI,EAAE,MAAM;QACZ,WAAW,EAAE,qEAAqE;KACnF;IACD,IAAI,EAAE;QACJ,GAAG,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,QAAQ,EAAE,WAAW,EAAE,aAAa,EAAE;QACtE,MAAM,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE,OAAO,EAAE,KAAK,EAAE,WAAW,EAAE,2BAA2B,EAAE;KACtF;IACD,GAAG,EAAE,KAAK,EAAE,EAAE,IAAI,EAAE,EAAE,EAAE;QACtB,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,EAAE,OAAO,EAAE,IAAI,CAAC,GAAG,EAAE,MAAM,EAAE,IAAI,CAAC,MAAM,EAAE,CAAC,CAAC;QACtE,IAAI,MAAM,CAAC,UAAU,GAAG,CAAC;YAAE,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAC7C,CAAC;CACF,CAAC,CAAC;AAEH,MAAM,IAAI,GAAG,aAAa,CAAC;IACzB,IAAI,EAAE;QACJ,IAAI,EAAE,SAAS;QACf,OAAO,EAAE,GAAG,CAAC,OAAO;QACpB,WAAW,EACT,yFAAyF;KAC5F;IACD,WAAW,EAAE;QACX,KAAK,EAAE,QAAQ;QACf,OAAO,EAAE,UAAU;QACnB,IAAI,EAAE,OAAO;QACb,OAAO,EAAE,UAAU;QACnB,IAAI,EAAE,OAAO;QACb,SAAS,EAAE,YAAY;KACxB;CACF,CAAC,CAAC;AAEH,MAAM,OAAO,CAAC,IAAI,CAAC,CAAC"}
|
|
1
|
+
{"version":3,"file":"cli.js","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":";AACA,OAAO,EAAE,YAAY,EAAE,MAAM,SAAS,CAAC;AACvC,OAAO,EAAE,aAAa,EAAE,MAAM,UAAU,CAAC;AAEzC,OAAO,EAAE,aAAa,EAAE,OAAO,EAAE,MAAM,OAAO,CAAC;AAC/C,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAExB,OAAO,EAAE,KAAK,EAAE,SAAS,EAAE,MAAM,kBAAkB,CAAC;AACpD,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAEpC,OAAO,EAAE,OAAO,EAAE,MAAM,cAAc,CAAC;AACvC,OAAO,EAAE,WAAW,EAAE,KAAK,EAA2C,MAAM,kBAAkB,CAAC;AAC/F,OAAO,EAAE,aAAa,EAAE,OAAO,EAAE,MAAM,EAAE,MAAM,iBAAiB,CAAC;AACjE,OAAO,EAAE,KAAK,EAAa,MAAM,kBAAkB,CAAC;AACpD,OAAO,EAAE,WAAW,EAAE,MAAM,iBAAiB,CAAC;AAC9C,OAAO,EAAE,OAAO,EAAE,SAAS,EAAE,MAAM,oBAAoB,CAAC;AACxD,OAAO,EAAE,gBAAgB,EAAE,MAAM,mBAAmB,CAAC;AACrD,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,cAAc,EAAE,qBAAqB,EAAE,MAAM,sBAAsB,CAAC;AAE7E,MAAM,iBAAiB,GAAG,CAAC,CAAC,MAAM,CAAC,EAAE,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;AAEnE,MAAM,OAAO,GAAG,aAAa,CAAC,IAAI,GAAG,CAAC,iBAAiB,EAAE,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC;AAC3E,MAAM,GAAG,GAAG,iBAAiB,CAAC,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,OAAO,EAAE,MAAM,CAAC,CAAC,CAAC,CAAC;AAE/E,SAAS,WAAW,CAAC,KAAa;IAChC,OAAQ,WAAiC,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC;AAC5D,CAAC;AAED,SAAS,cAAc,CAAC,KAAa;IACnC,IAAI,WAAW,CAAC,KAAK,CAAC;QAAE,OAAO,KAAK,CAAC;IACrC,MAAM,IAAI,KAAK,CAAC,uBAAuB,KAAK,aAAa,WAAW,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;AACrF,CAAC;AAED,MAAM,UAAU,GAAG,aAAa,CAAC;IAC/B,IAAI,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE,WAAW,EAAE,kCAAkC,EAAE;IAC1E,IAAI,EAAE;QACJ,GAAG,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,OAAO,EAAE,WAAW,EAAE,aAAa,EAAE;QACrE,GAAG,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,QAAQ,EAAE,WAAW,EAAE,aAAa,EAAE;QACtE,IAAI,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,GAAG,EAAE,WAAW,EAAE,sCAAsC,EAAE;QAC3F,MAAM,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE,OAAO,EAAE,KAAK,EAAE,WAAW,EAAE,sBAAsB,EAAE;KACjF;IACD,GAAG,EAAE,KAAK,EAAE,EAAE,IAAI,EAAE,EAAE,EAAE;QACtB,MAAM,OAAO,CAAC;YACZ,OAAO,EAAE,IAAI,CAAC,GAAG;YACjB,OAAO,EAAE,IAAI,CAAC,GAA
G;YACjB,QAAQ,EAAE,IAAI,CAAC,IAAI;YACnB,MAAM,EAAE,IAAI,CAAC,MAAM;SACpB,CAAC,CAAC;IACL,CAAC;CACF,CAAC,CAAC;AAEH,MAAM,WAAW,GAAG;IAClB,IAAI,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,QAAQ,EAAE,WAAW,EAAE,WAAW,EAAE;IACrE,IAAI,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,GAAG,EAAE,WAAW,EAAE,sCAAsC,EAAE;IAC3F,MAAM,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE,OAAO,EAAE,KAAK,EAAE,WAAW,EAAE,sBAAsB,EAAE;IAChF,SAAS,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE,OAAO,EAAE,KAAK,EAAE,WAAW,EAAE,6BAA6B,EAAE;CAClF,CAAC;AAEX,MAAM,UAAU,GAAG,aAAa,CAAC;IAC/B,IAAI,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE,WAAW,EAAE,qDAAqD,EAAE;IAC7F,IAAI,EAAE;QACJ,GAAG,WAAW;QACd,IAAI,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,OAAO,EAAE,WAAW,EAAE,gCAAgC,EAAE;KAC1F;IACD,GAAG,EAAE,KAAK,EAAE,EAAE,IAAI,EAAE,EAAE,EAAE;QACtB,MAAM,IAAI,GAAG,gBAAgB,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACzC,MAAM,OAAO,GAAG,MAAM,qBAAqB,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACvD,MAAM,OAAO,CAAC;YACZ,QAAQ,EAAE,IAAI,CAAC,IAAI;YACnB,QAAQ,EAAE,IAAI,CAAC,IAAI;YACnB,OAAO;YACP,IAAI;YACJ,MAAM,EAAE,IAAI,CAAC,MAAM;YACnB,MAAM,EAAE,IAAI,CAAC,SAAS,CAAC;SACxB,CAAC,CAAC;IACL,CAAC;CACF,CAAC,CAAC;AAEH,MAAM,YAAY,GAAG,aAAa,CAAC;IACjC,IAAI,EAAE,EAAE,IAAI,EAAE,WAAW,EAAE,WAAW,EAAE,8CAA8C,EAAE;IACxF,IAAI,EAAE,WAAW;IACjB,GAAG,EAAE,KAAK,EAAE,EAAE,IAAI,EAAE,EAAE,EAAE;QACtB,MAAM,OAAO,GAAG,MAAM,qBAAqB,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACvD,MAAM,SAAS,CAAC;YACd,QAAQ,EAAE,IAAI,CAAC,IAAI;YACnB,QAAQ,EAAE,IAAI,CAAC,IAAI;YACnB,OAAO;YACP,MAAM,EAAE,IAAI,CAAC,MAAM;YACnB,MAAM,EAAE,IAAI,CAAC,SAAS,CAAC;SACxB,CAAC,CAAC;IACL,CAAC;CACF,CAAC,CAAC;AAEH,MAAM,OAAO,GAAG,aAAa,CAAC;IAC5B,IAAI,EAAE;QACJ,IAAI,EAAE,MAAM;QACZ,WAAW,EAAE,2EAA2E;KACzF;IACD,IAAI,EAAE;QACJ,IAAI,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,GAAG,EAAE,WAAW,EAAE,4BAA4B,EAAE;QACjF,WAAW,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,QAAQ,EAAE,IAAI,EAAE,WAAW,EAAE,kBAAkB,EAAE;QAChF,OAAO,EAAE;YACP,IAAI,EAAE,QAAQ;YACd,QAAQ,EAAE,IAAI;YACd,WAAW,EAAE,mCAAmC,cAAc,EAAE;iBAC7D,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC;iBAClB,IAAI,CAAC,IAAI,CAAC,GAAG;SACjB;QACD,MAAM,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE,OAAO,EAAE,K
AAK,EAAE,WAAW,EAAE,sBAAsB,EAAE;KACjF;IACD,GAAG,EAAE,KAAK,EAAE,EAAE,IAAI,EAAE,EAAE,EAAE;QACtB,MAAM,OAAO,GAAG,IAAI,CAAC,OAAO;aACzB,KAAK,CAAC,GAAG,CAAC;aACV,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;aACpB,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;QAC/B,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC;YAAE,MAAM,IAAI,KAAK,CAAC,yCAAyC,CAAC,CAAC;QACrF,cAAc,CAAC,OAAO,EAAE,cAAc,EAAE,CAAC,CAAC;QAC1C,MAAM,WAAW,CAAC;YAChB,QAAQ,EAAE,IAAI,CAAC,IAAI;YACnB,WAAW,EAAE,IAAI,CAAC,WAAW;YAC7B,OAAO;YACP,MAAM,EAAE,IAAI,CAAC,MAAM;SACpB,CAAC,CAAC;IACL,CAAC;CACF,CAAC,CAAC;AAEH,MAAM,QAAQ,GAAG,aAAa,CAAC;IAC7B,IAAI,EAAE;QACJ,IAAI,EAAE,OAAO;QACb,WAAW,EAAE,8DAA8D;KAC5E;IACD,IAAI,EAAE;QACJ,GAAG,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,OAAO,EAAE,WAAW,EAAE,aAAa,EAAE;QACrE,IAAI,EAAE;YACJ,IAAI,EAAE,QAAQ;YACd,OAAO,EAAE,WAAW;YACpB,WAAW,EAAE,2CAA2C;SACzD;QACD,MAAM,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE,OAAO,EAAE,KAAK,EAAE,WAAW,EAAE,2BAA2B,EAAE;KACtF;IACD,GAAG,EAAE,KAAK,EAAE,EAAE,IAAI,EAAE,EAAE,EAAE;QACtB,MAAM,IAAI,GAAG,cAAc,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACvC,MAAM,MAAM,GAAG,MAAM,KAAK,CAAC,EAAE,OAAO,EAAE,IAAI,CAAC,GAAG,EAAE,IAAI,EAAE,CAAC,CAAC;QACxD,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC;YACjB,MAAM,SAAS,GAAG,MAAM,CAAC,cAAc,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,CAAC,MAAM,IAAI,CAAC,CAAC,UAAU,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAC7F,MAAM,KAAK,GAAG,MAAM,CAAC,cAAc,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,UAAU,EAAE,CAAC,CAAC,CAAC;YAC9E,OAAO,CAAC,GAAG,CACT,WAAW,KAAK,kBAAkB,MAAM,CAAC,cAAc,CAAC,MAAM,aAAa,SAAS,GAAG,CACxF,CAAC;YACF,OAAO,CAAC,GAAG,CAAC,WAAW,MAAM,CAAC,YAAY,eAAe,CAAC,CAAC;YAC3D,IAAI,MAAM,CAAC,UAAU,CAAC,MAAM,GAAG,CAAC;gBAAE,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;QACpD,CAAC;QACD,KAAK,MAAM,CAAC,IAAI,MAAM,CAAC,UAAU,EAAE,CAAC;YAClC,OAAO,CAAC,GAAG,CAAC,eAAe,CAAC,CAAC,CAAC,CAAC,CAAC;QAClC,CAAC;QACD,IAAI,MAAM,CAAC,UAAU,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACjC,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC;gBACjB,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;gBAChB,OAAO,CAAC,GAAG,CAAC,
GAAG,MAAM,CAAC,UAAU,CAAC,MAAM,aAAa,CAAC,CAAC;YACxD,CAAC;YACD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAClB,CAAC;IACH,CAAC;CACF,CAAC,CAAC;AAEH,SAAS,eAAe,CAAC,CAAqB;IAC5C,OAAO,GAAG,CAAC,CAAC,IAAI,IAAI,CAAC,CAAC,IAAI,IAAI,CAAC,CAAC,MAAM,OAAO,CAAC,CAAC,KAAK,QAAQ,CAAC,CAAC,OAAO,EAAE,CAAC;AAC1E,CAAC;AAED,MAAM,OAAO,GAAG,aAAa,CAAC;IAC5B,IAAI,EAAE;QACJ,IAAI,EAAE,MAAM;QACZ,WAAW,EAAE,qEAAqE;KACnF;IACD,IAAI,EAAE;QACJ,GAAG,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,QAAQ,EAAE,WAAW,EAAE,aAAa,EAAE;QACtE,MAAM,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE,OAAO,EAAE,KAAK,EAAE,WAAW,EAAE,2BAA2B,EAAE;KACtF;IACD,GAAG,EAAE,KAAK,EAAE,EAAE,IAAI,EAAE,EAAE,EAAE;QACtB,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,EAAE,OAAO,EAAE,IAAI,CAAC,GAAG,EAAE,MAAM,EAAE,IAAI,CAAC,MAAM,EAAE,CAAC,CAAC;QACtE,IAAI,MAAM,CAAC,UAAU,GAAG,CAAC;YAAE,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAC7C,CAAC;CACF,CAAC,CAAC;AAEH,SAAS,SAAS,CAAC,KAAa;IAC9B,IAAK,KAA2B,CAAC,QAAQ,CAAC,KAAK,CAAC;QAAE,OAAO,KAAa,CAAC;IACvE,MAAM,IAAI,KAAK,CAAC,iBAAiB,KAAK,aAAa,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;AACzE,CAAC;AAED,SAAS,gBAAgB,CAAC,KAAa,EAAE,IAAY;IACnD,MAAM,MAAM,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC;IAC7B,IAAI,CAAC,MAAM,CAAC,SAAS,CAAC,MAAM,CAAC,IAAI,MAAM,GAAG,CAAC,EAAE,CAAC;QAC5C,MAAM,IAAI,KAAK,CAAC,KAAK,IAAI,qCAAqC,KAAK,GAAG,CAAC,CAAC;IAC1E,CAAC;IACD,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,MAAM,OAAO,GAAG,aAAa,CAAC;IAC5B,IAAI,EAAE;QACJ,IAAI,EAAE,MAAM;QACZ,WAAW,EAAE,uDAAuD;KACrE;IACD,IAAI,EAAE;QACJ,KAAK,EAAE;YACL,IAAI,EAAE,QAAQ;YACd,OAAO,EAAE,eAAe;YACxB,WAAW,EAAE,8BAA8B;SAC5C;QACD,GAAG,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,GAAG,EAAE,WAAW,EAAE,0CAA0C,EAAE;QAC9F,KAAK,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,WAAW,EAAE,gCAAgC,EAAE;QACxE,IAAI,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,WAAW,EAAE,gCAAgC,EAAE;QACvE,IAAI,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,WAAW,EAAE,uBAAuB,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,EAAE,EAAE;QACjF,IAAI,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,WAAW,EAAE,4CAA4C,EAAE;QACnF,WAAW,EAAE;YACX,IAAI,EAAE,QAAQ;YACd,OAAO,EAAE,GAAG;YACZ,WAAW,EAAE,+EAA+E;SAC7F;QACD,KAAK,EAAE;YACL,IAAI,EAAE,QAAQ;YACd,WAAW,EAAE,wDAAwD;SACtE;QACD,IAAI,
EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,WAAW,EAAE,6CAA6C,EAAE;KACrF;IACD,GAAG,EAAE,KAAK,EAAE,EAAE,IAAI,EAAE,EAAE,EAAE;QACtB,MAAM,MAAM,GAAG,MAAM,OAAO,CAAC;YAC3B,QAAQ,EAAE,IAAI,CAAC,KAAK;YACpB,GAAG,EAAE,IAAI,CAAC,GAAG;YACb,WAAW,EAAE,gBAAgB,CAAC,IAAI,CAAC,WAAW,EAAE,aAAa,CAAC;YAC9D,GAAG,CAAC,IAAI,CAAC,KAAK,KAAK,SAAS,IAAI,EAAE,KAAK,EAAE,IAAI,CAAC,KAAK,EAAE,CAAC;YACtD,GAAG,CAAC,IAAI,CAAC,IAAI,KAAK,SAAS,IAAI,EAAE,MAAM,EAAE,IAAI,CAAC,IAAI,EAAE,CAAC;YACrD,GAAG,CAAC,IAAI,CAAC,IAAI,KAAK,SAAS,IAAI,EAAE,IAAI,EAAE,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;YAC9D,GAAG,CAAC,IAAI,CAAC,IAAI,KAAK,SAAS,IAAI,EAAE,IAAI,EAAE,gBAAgB,CAAC,IAAI,CAAC,IAAI,EAAE,MAAM,CAAC,EAAE,CAAC;YAC7E,GAAG,CAAC,IAAI,CAAC,KAAK,KAAK,SAAS,IAAI,EAAE,KAAK,EAAE,IAAI,CAAC,KAAK,EAAE,CAAC;SACvD,CAAC,CAAC;QAEH,IAAI,CAAC,MAAM,CAAC,EAAE,EAAE,CAAC;YACf,KAAK,MAAM,CAAC,IAAI,MAAM,CAAC,KAAK;gBAAE,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,KAAK,CAAC,CAAC,OAAO,EAAE,CAAC,CAAC;YACvE,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAClB,CAAC;QAED,OAAO,CAAC,GAAG,CAAC,aAAa,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC;QACzC,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC;YACd,MAAM,KAAK,CAAC,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;YACrD,MAAM,SAAS,CAAC,IAAI,CAAC,IAAI,EAAE,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,IAAI,CAAC,CAAC;QAC1D,CAAC;QACD,IAAI,MAAM,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC;YAAE,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAC/C,CAAC;CACF,CAAC,CAAC;AAEH,MAAM,IAAI,GAAG,aAAa,CAAC;IACzB,IAAI,EAAE;QACJ,IAAI,EAAE,SAAS;QACf,OAAO,EAAE,GAAG,CAAC,OAAO;QACpB,WAAW,EACT,yFAAyF;KAC5F;IACD,WAAW,EAAE;QACX,KAAK,EAAE,QAAQ;QACf,OAAO,EAAE,UAAU;QACnB,IAAI,EAAE,OAAO;QACb,IAAI,EAAE,OAAO;QACb,OAAO,EAAE,UAAU;QACnB,IAAI,EAAE,OAAO;QACb,SAAS,EAAE,YAAY;KACxB;CACF,CAAC,CAAC;AAEH,MAAM,OAAO,CAAC,IAAI,CAAC,CAAC"}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import { type Result } from "../result.js";
|
|
2
|
+
import { type EvalCase, type Tier } from "./schema.js";
|
|
3
|
+
/**
 * An eval case as returned by `loadCases`: the case's own fields plus the
 * `suite` and `tier` of the case file it came from and that file's path.
 */
export interface LoadedCase extends EvalCase {
|
|
4
|
+
readonly suite: string;
|
|
5
|
+
readonly tier: Tier;
|
|
6
|
+
readonly file: string;
|
|
7
|
+
}
|
|
8
|
+
/** A problem found while loading or checking eval cases, tied to the case file it concerns. */
export interface CaseLoadError {
|
|
9
|
+
readonly file: string;
|
|
10
|
+
readonly message: string;
|
|
11
|
+
}
|
|
12
|
+
/**
 * Load every eval case from the `.yaml`/`.yml` files under `casesDir` (searched recursively).
 * Resolves to an ok result with all loaded cases, or to an error result carrying every
 * load error collected (unreadable file, invalid YAML, schema violation, duplicate case id).
 */
export declare function loadCases(casesDir: string): Promise<Result<LoadedCase[], CaseLoadError[]>>;
|
|
13
|
+
/**
 * List the expected skills that are not installed: one error per (case, expected skill id)
 * pair whose id is absent from `installedIds`. Returns an empty array when nothing is missing.
 */
export declare function unresolvedSkills(cases: readonly LoadedCase[], installedIds: ReadonlySet<string>): CaseLoadError[];
|
|
14
|
+
//# sourceMappingURL=cases.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cases.d.ts","sourceRoot":"","sources":["../../src/eval/cases.ts"],"names":[],"mappings":"AAMA,OAAO,EAAW,KAAK,MAAM,EAAE,MAAM,cAAc,CAAC;AACpD,OAAO,EAIL,KAAK,QAAQ,EACb,KAAK,IAAI,EACV,MAAM,aAAa,CAAC;AAErB,MAAM,WAAW,UAAW,SAAQ,QAAQ;IAC1C,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;IACvB,QAAQ,CAAC,IAAI,EAAE,IAAI,CAAC;IACpB,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;CACvB;AAED,MAAM,WAAW,aAAa;IAC5B,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;CAC1B;AAED,wBAAsB,SAAS,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,UAAU,EAAE,EAAE,aAAa,EAAE,CAAC,CAAC,CAuBhG;AAED,wBAAgB,gBAAgB,CAC9B,KAAK,EAAE,SAAS,UAAU,EAAE,EAC5B,YAAY,EAAE,WAAW,CAAC,MAAM,CAAC,GAChC,aAAa,EAAE,CAajB"}
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
import { readdir, readFile } from "node:fs/promises";
|
|
2
|
+
import { join } from "node:path";
|
|
3
|
+
import yaml from "js-yaml";
|
|
4
|
+
import { formatZodIssues } from "../errors/zod.js";
|
|
5
|
+
import { err, ok } from "../result.js";
|
|
6
|
+
import { CaseFileSchema, expectedSkills, } from "./schema.js";
|
|
7
|
+
/**
 * Load every eval case from the YAML files found (recursively) under `casesDir`.
 *
 * Loading is all-or-nothing: if any file fails to read, parse or validate, or a
 * case id appears more than once, the result is an error carrying every problem
 * found; otherwise it is the full list of cases, each tagged with the `suite`,
 * `tier` and path of its case file.
 *
 * @param casesDir root directory to search for `.yaml` / `.yml` case files
 * @returns `ok(cases)` on success, `err(errors)` when at least one problem was found
 */
export async function loadCases(casesDir) {
    // Probe the root directory up front. collectYamlFiles() treats an unreadable
    // directory as empty, which would otherwise turn a missing or mistyped cases
    // path into a silent "zero cases" success.
    try {
        await readdir(casesDir);
    }
    catch (cause) {
        const reason = cause instanceof Error ? cause.message : String(cause);
        return err([{ file: casesDir, message: `cannot read cases directory: ${reason}` }]);
    }
    const files = await collectYamlFiles(casesDir);
    const loaded = [];
    const errors = [];
    // Case ids must be unique across all files, not just within one file.
    const seenIds = new Set();
    for (const file of files) {
        const parsed = await parseFile(file);
        if (!parsed.ok) {
            errors.push(parsed.error);
            continue;
        }
        for (const evalCase of parsed.value.cases) {
            if (seenIds.has(evalCase.id)) {
                // Reported against the file in which the repeated id was encountered.
                errors.push({ file, message: `duplicate case id "${evalCase.id}"` });
                continue;
            }
            seenIds.add(evalCase.id);
            loaded.push({ ...evalCase, suite: parsed.value.suite, tier: parsed.value.tier, file });
        }
    }
    return errors.length > 0 ? err(errors) : ok(loaded);
}
|
|
29
|
+
/**
 * Report every expected skill that is not installed.
 *
 * @param cases loaded eval cases to inspect
 * @param installedIds ids of the skills that are installed
 * @returns one entry per (case, expected skill id) pair missing from
 *   `installedIds`, in case order; empty when nothing is missing
 */
export function unresolvedSkills(cases, installedIds) {
    const missing = [];
    for (const current of cases) {
        for (const skillId of expectedSkills(current.expect)) {
            if (installedIds.has(skillId)) {
                continue;
            }
            const message = `case "${current.id}" expects skill "${skillId}", which is not installed`;
            missing.push({ file: current.file, message });
        }
    }
    return missing;
}
|
|
43
|
+
/**
 * Read one case file, parse it as YAML and validate it against `CaseFileSchema`.
 *
 * @param file path of the case file
 * @returns `ok(data)` with the validated document, or `err({ file, message })`
 *   describing the first stage that failed (read, YAML parse, or schema check)
 */
async function parseFile(file) {
    // Every failure is reported against the same file path.
    const fail = (message) => err({ file, message });
    let text;
    try {
        text = await readFile(file, "utf8");
    }
    catch (cause) {
        return fail(`cannot read file: ${cause.message}`);
    }
    let document;
    try {
        document = yaml.load(text);
    }
    catch (cause) {
        return fail(`invalid YAML: ${cause.message}`);
    }
    const validation = CaseFileSchema.safeParse(document);
    return validation.success
        ? ok(validation.data)
        : fail(formatZodIssues(validation.error).join("; "));
}
|
|
64
|
+
/**
 * Recursively gather the paths of all `.yaml` / `.yml` files under `dir`.
 *
 * A directory that cannot be listed contributes no files rather than raising.
 *
 * @param dir directory to search
 * @returns the matching file paths, sorted with the default string ordering
 */
async function collectYamlFiles(dir) {
    let listing;
    try {
        listing = await readdir(dir, { withFileTypes: true });
    }
    catch {
        // Unreadable or missing directory: treated as empty.
        return [];
    }
    const found = [];
    for (const item of listing) {
        const path = join(dir, item.name);
        if (item.isDirectory()) {
            const nested = await collectYamlFiles(path);
            found.push(...nested);
            continue;
        }
        if (item.isFile() && [".yaml", ".yml"].some((extension) => item.name.endsWith(extension))) {
            found.push(path);
        }
    }
    return found.sort();
}
|
|
84
|
+
//# sourceMappingURL=cases.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cases.js","sourceRoot":"","sources":["../../src/eval/cases.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,QAAQ,EAAE,MAAM,kBAAkB,CAAC;AACrD,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AAEjC,OAAO,IAAI,MAAM,SAAS,CAAC;AAE3B,OAAO,EAAE,eAAe,EAAE,MAAM,kBAAkB,CAAC;AACnD,OAAO,EAAE,GAAG,EAAE,EAAE,EAAe,MAAM,cAAc,CAAC;AACpD,OAAO,EACL,cAAc,EACd,cAAc,GAIf,MAAM,aAAa,CAAC;AAarB,MAAM,CAAC,KAAK,UAAU,SAAS,CAAC,QAAgB;IAC9C,MAAM,KAAK,GAAG,MAAM,gBAAgB,CAAC,QAAQ,CAAC,CAAC;IAC/C,MAAM,MAAM,GAAiB,EAAE,CAAC;IAChC,MAAM,MAAM,GAAoB,EAAE,CAAC;IACnC,MAAM,OAAO,GAAG,IAAI,GAAG,EAAU,CAAC;IAElC,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,IAAI,CAAC,CAAC;QACrC,IAAI,CAAC,MAAM,CAAC,EAAE,EAAE,CAAC;YACf,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;YAC1B,SAAS;QACX,CAAC;QACD,KAAK,MAAM,QAAQ,IAAI,MAAM,CAAC,KAAK,CAAC,KAAK,EAAE,CAAC;YAC1C,IAAI,OAAO,CAAC,GAAG,CAAC,QAAQ,CAAC,EAAE,CAAC,EAAE,CAAC;gBAC7B,MAAM,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,OAAO,EAAE,sBAAsB,QAAQ,CAAC,EAAE,GAAG,EAAE,CAAC,CAAC;gBACrE,SAAS;YACX,CAAC;YACD,OAAO,CAAC,GAAG,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAC;YACzB,MAAM,CAAC,IAAI,CAAC,EAAE,GAAG,QAAQ,EAAE,KAAK,EAAE,MAAM,CAAC,KAAK,CAAC,KAAK,EAAE,IAAI,EAAE,MAAM,CAAC,KAAK,CAAC,IAAI,EAAE,IAAI,EAAE,CAAC,CAAC;QACzF,CAAC;IACH,CAAC;IAED,OAAO,MAAM,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,MAAM,CAAC,CAAC;AACtD,CAAC;AAED,MAAM,UAAU,gBAAgB,CAC9B,KAA4B,EAC5B,YAAiC;IAEjC,MAAM,MAAM,GAAoB,EAAE,CAAC;IACnC,KAAK,MAAM,QAAQ,IAAI,KAAK,EAAE,CAAC;QAC7B,KAAK,MAAM,EAAE,IAAI,cAAc,CAAC,QAAQ,CAAC,MAAM,CAAC,EAAE,CAAC;YACjD,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,CAAC;gBAC1B,MAAM,CAAC,IAAI,CAAC;oBACV,IAAI,EAAE,QAAQ,CAAC,IAAI;oBACnB,OAAO,EAAE,SAAS,QAAQ,CAAC,EAAE,oBAAoB,EAAE,2BAA2B;iBAC/E,CAAC,CAAC;YACL,CAAC;QACH,CAAC;IACH,CAAC;IACD,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,KAAK,UAAU,SAAS,CAAC,IAAY;IACnC,IAAI,GAAW,CAAC;IAChB,IAAI,CAAC;QACH,GAAG,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;IACrC,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,OAAO,GAAG,CAAC,EAAE,IAAI,EAAE,OAAO,EAAE,qBAAsB,K
AAe,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC;IACjF,CAAC;IACD,IAAI,GAAY,CAAC;IACjB,IAAI,CAAC;QACH,GAAG,GAAG,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IACvB,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,OAAO,GAAG,CAAC,EAAE,IAAI,EAAE,OAAO,EAAE,iBAAkB,KAAe,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC;IAC7E,CAAC;IACD,MAAM,MAAM,GAAG,cAAc,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC;IAC7C,IAAI,CAAC,MAAM,CAAC,OAAO,EAAE,CAAC;QACpB,OAAO,GAAG,CAAC,EAAE,IAAI,EAAE,OAAO,EAAE,eAAe,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IAC1E,CAAC;IACD,OAAO,EAAE,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;AACzB,CAAC;AAED,KAAK,UAAU,gBAAgB,CAAC,GAAW;IACzC,MAAM,KAAK,GAAa,EAAE,CAAC;IAC3B,IAAI,OAAO,CAAC;IACZ,IAAI,CAAC;QACH,OAAO,GAAG,MAAM,OAAO,CAAC,GAAG,EAAE,EAAE,aAAa,EAAE,IAAI,EAAE,CAAC,CAAC;IACxD,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,KAAK,CAAC;IACf,CAAC;IACD,KAAK,MAAM,KAAK,IAAI,OAAO,EAAE,CAAC;QAC5B,MAAM,IAAI,GAAG,IAAI,CAAC,GAAG,EAAE,KAAK,CAAC,IAAI,CAAC,CAAC;QACnC,IAAI,KAAK,CAAC,WAAW,EAAE,EAAE,CAAC;YACxB,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,MAAM,gBAAgB,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAChD,CAAC;aAAM,IAAI,KAAK,CAAC,MAAM,EAAE,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,OAAO,CAAC,IAAI,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC,EAAE,CAAC;YAC3F,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACnB,CAAC;IACH,CAAC;IACD,OAAO,KAAK,CAAC,IAAI,EAAE,CAAC;AACtB,CAAC"}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
/**
 * Tag describing why a detection run ended; one of four string literals.
 * NOTE(review): the exact condition behind each tag is not visible in this declaration — confirm against the eval runner.
 */
export type ExitReason = "skill" | "no-skill" | "timeout" | "stream-end";
|
|
2
|
+
/**
 * Outcome of one detection run, as produced by `Detector.result`.
 * NOTE(review): `observed` presumably lists the skill ids seen in stream order and
 * `firstSkill` the first of them (null when none) — confirm against the implementation.
 */
export interface DetectionResult {
|
|
3
|
+
readonly observed: readonly string[];
|
|
4
|
+
readonly firstSkill: string | null;
|
|
5
|
+
readonly exitReason: ExitReason;
|
|
6
|
+
}
|
|
7
|
+
/**
 * Incremental detector returned by `createDetector`: feed it output lines one at a
 * time with `push`, poll `done` to learn when it has seen enough, then call `result`
 * with the reason the run ended to obtain the `DetectionResult`.
 */
interface Detector {
|
|
8
|
+
readonly push: (line: string) => void;
|
|
9
|
+
readonly done: boolean;
|
|
10
|
+
readonly result: (reason: ExitReason) => DetectionResult;
|
|
11
|
+
}
|
|
12
|
+
/**
 * Create a detector that watches a stream of JSON lines for Skill tool invocations.
 * `stopAfter` is the number of observed skill ids after which the detector reports
 * itself done; the implementation defaults it to 1.
 */
export declare function createDetector(stopAfter?: number): Detector;
|
|
13
|
+
export {};
|
|
14
|
+
//# sourceMappingURL=detect.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"detect.d.ts","sourceRoot":"","sources":["../../src/eval/detect.ts"],"names":[],"mappings":"AAEA,MAAM,MAAM,UAAU,GAAG,OAAO,GAAG,UAAU,GAAG,SAAS,GAAG,YAAY,CAAC;AAEzE,MAAM,WAAW,eAAe;IAC9B,QAAQ,CAAC,QAAQ,EAAE,SAAS,MAAM,EAAE,CAAC;IACrC,QAAQ,CAAC,UAAU,EAAE,MAAM,GAAG,IAAI,CAAC;IACnC,QAAQ,CAAC,UAAU,EAAE,UAAU,CAAC;CACjC;AAED,UAAU,QAAQ;IAChB,QAAQ,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,MAAM,KAAK,IAAI,CAAC;IACtC,QAAQ,CAAC,IAAI,EAAE,OAAO,CAAC;IACvB,QAAQ,CAAC,MAAM,EAAE,CAAC,MAAM,EAAE,UAAU,KAAK,eAAe,CAAC;CAC1D;AAwBD,wBAAgB,cAAc,CAAC,SAAS,SAAI,GAAG,QAAQ,CA6EtD"}
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
import { FQ_ID } from "../ids.js";
|
|
2
|
+
/**
 * Makes an arbitrary value safe for property lookups.
 *
 * Any non-null object (arrays included) is returned unchanged; every other
 * value is replaced by a fresh empty object.
 */
function asRecord(value) {
    if (value === null || typeof value !== "object") {
        return {};
    }
    return value;
}
|
|
5
|
+
/**
 * Tells whether a content block is a `tool_use` block for the tool named `Skill`.
 *
 * @param block Object to inspect; callers pass it through `asRecord` first.
 * @returns `true` only when both `type` and `name` match.
 */
function isSkillToolUse(block) {
    if (block["type"] !== "tool_use") {
        return false;
    }
    return block["name"] === "Skill";
}
|
|
8
|
+
/**
 * Extracts a skill id from a tool-use input object.
 *
 * Reads the `skill` property, falling back to `command` when `skill` is
 * null or undefined, and accepts the value only if it is a string that
 * matches `FQ_ID`.
 *
 * @returns The id, or `null` when no acceptable value is present.
 */
function skillIdOf(input) {
    const fields = asRecord(input);
    const candidate = fields["skill"] ?? fields["command"];
    if (typeof candidate !== "string") {
        return null;
    }
    return FQ_ID.test(candidate) ? candidate : null;
}
|
|
13
|
+
/**
 * Parses `buffer` as JSON and extracts a skill id from the result.
 *
 * @param buffer JSON text, possibly incomplete.
 * @returns The skill id, or `null` when the text does not parse or holds no id.
 */
function skillIdInJson(buffer) {
    try {
        const decoded = JSON.parse(buffer);
        return skillIdOf(decoded);
    }
    catch {
        // Incomplete or malformed JSON simply means "no id yet".
        return null;
    }
}
|
|
21
|
+
/**
 * Builds a detector that scans newline-delimited JSON events for `Skill` tool calls.
 *
 * Skill ids are picked up from two event shapes:
 *  - `stream_event` wrappers, where the tool input arrives as `input_json_delta`
 *    fragments that are accumulated until they parse to an object with an id;
 *  - `assistant` events, whose `message.content` blocks carry the full input.
 *
 * Detection finishes when `stopAfter` ids have been recorded, or when a
 * `message_stop` stream event or a top-level `result` event is seen.
 *
 * @param stopAfter Number of recorded ids that ends detection.
 * @returns An object exposing `push(line)`, a `done` getter and `result(reason)`.
 */
export function createDetector(stopAfter = 1) {
    const observed = [];
    // Accumulated input JSON of the Skill tool call currently streaming; null when none is open.
    let inputBuffer = null;
    let finished = false;

    // Stores a non-null id, re-evaluates the stop condition, and reports whether detection is over.
    const record = (id) => {
        if (id !== null) {
            observed.push(id);
        }
        if (observed.length >= stopAfter) {
            finished = true;
        }
        return finished;
    };

    // Closes the open tool call (if any), recording whatever id its buffered JSON yields.
    const flushPending = () => {
        if (inputBuffer === null) {
            return;
        }
        record(skillIdInJson(inputBuffer));
        inputBuffer = null;
    };

    // Handles the payload of a `stream_event` wrapper.
    const onStreamEvent = (streamEvent) => {
        switch (streamEvent["type"]) {
            case "content_block_start":
                if (isSkillToolUse(asRecord(streamEvent["content_block"]))) {
                    inputBuffer = "";
                }
                break;
            case "content_block_delta": {
                if (inputBuffer === null) {
                    break;
                }
                const delta = asRecord(streamEvent["delta"]);
                if (delta["type"] !== "input_json_delta") {
                    break;
                }
                const fragment = delta["partial_json"];
                if (typeof fragment === "string") {
                    inputBuffer += fragment;
                }
                // Flush as soon as the buffer parses to an id, without waiting for the block to end.
                if (skillIdInJson(inputBuffer) !== null) {
                    flushPending();
                }
                break;
            }
            case "content_block_stop":
                flushPending();
                break;
            case "message_stop":
                flushPending();
                finished = true;
                break;
            default:
                break;
        }
    };

    // Handles a complete `assistant` event by scanning its content blocks in order.
    const onAssistant = (event) => {
        const content = asRecord(event["message"])["content"];
        const blocks = Array.isArray(content) ? content : [];
        for (const item of blocks) {
            const block = asRecord(item);
            if (!isSkillToolUse(block)) {
                continue;
            }
            if (record(skillIdOf(block["input"]))) {
                return;
            }
        }
    };

    return {
        get done() {
            return finished;
        },
        push(line) {
            if (finished) {
                return;
            }
            const text = line.trim();
            if (!text) {
                return;
            }
            let decoded;
            try {
                decoded = JSON.parse(text);
            }
            catch {
                // Lines that are not JSON are ignored.
                return;
            }
            const event = asRecord(decoded);
            switch (event["type"]) {
                case "stream_event":
                    onStreamEvent(asRecord(event["event"]));
                    return;
                case "assistant":
                    onAssistant(event);
                    return;
                case "result":
                    finished = true;
                    return;
                default:
                    return;
            }
        },
        result(reason) {
            const sawSkill = observed.length > 0;
            return {
                observed,
                firstSkill: observed[0] ?? null,
                exitReason: sawSkill ? "skill" : reason,
            };
        },
    };
}
|
|
105
|
+
//# sourceMappingURL=detect.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"detect.js","sourceRoot":"","sources":["../../src/eval/detect.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,KAAK,EAAE,MAAM,WAAW,CAAC;AAgBlC,SAAS,QAAQ,CAAC,KAAc;IAC9B,OAAO,OAAO,KAAK,KAAK,QAAQ,IAAI,KAAK,KAAK,IAAI,CAAC,CAAC,CAAE,KAAiC,CAAC,CAAC,CAAC,EAAE,CAAC;AAC/F,CAAC;AAED,SAAS,cAAc,CAAC,KAA8B;IACpD,OAAO,KAAK,CAAC,MAAM,CAAC,KAAK,UAAU,IAAI,KAAK,CAAC,MAAM,CAAC,KAAK,OAAO,CAAC;AACnE,CAAC;AAED,SAAS,SAAS,CAAC,KAAc;IAC/B,MAAM,MAAM,GAAG,QAAQ,CAAC,KAAK,CAAC,CAAC;IAC/B,MAAM,KAAK,GAAG,MAAM,CAAC,OAAO,CAAC,IAAI,MAAM,CAAC,SAAS,CAAC,CAAC;IACnD,OAAO,OAAO,KAAK,KAAK,QAAQ,IAAI,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC;AACvE,CAAC;AAED,SAAS,aAAa,CAAC,MAAc;IACnC,IAAI,CAAC;QACH,OAAO,SAAS,CAAC,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC;IACvC,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC;AAED,MAAM,UAAU,cAAc,CAAC,SAAS,GAAG,CAAC;IAC1C,MAAM,QAAQ,GAAa,EAAE,CAAC;IAC9B,IAAI,gBAAgB,GAAkB,IAAI,CAAC;IAC3C,IAAI,QAAQ,GAAG,KAAK,CAAC;IAErB,SAAS,MAAM,CAAC,EAAiB;QAC/B,IAAI,EAAE,KAAK,IAAI;YAAE,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACnC,IAAI,QAAQ,CAAC,MAAM,IAAI,SAAS;YAAE,QAAQ,GAAG,IAAI,CAAC;QAClD,OAAO,QAAQ,CAAC;IAClB,CAAC;IAED,SAAS,YAAY;QACnB,IAAI,gBAAgB,KAAK,IAAI;YAAE,OAAO;QACtC,MAAM,CAAC,aAAa,CAAC,gBAAgB,CAAC,CAAC,CAAC;QACxC,gBAAgB,GAAG,IAAI,CAAC;IAC1B,CAAC;IAED,OAAO;QACL,IAAI,IAAI;YACN,OAAO,QAAQ,CAAC;QAClB,CAAC;QACD,IAAI,CAAC,IAAY;YACf,IAAI,QAAQ;gBAAE,OAAO;YACrB,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC;YAC5B,IAAI,CAAC,OAAO;gBAAE,OAAO;YACrB,IAAI,MAAe,CAAC;YACpB,IAAI,CAAC;gBACH,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;YAC/B,CAAC;YAAC,MAAM,CAAC;gBACP,OAAO;YACT,CAAC;YACD,MAAM,KAAK,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC;YAC/B,MAAM,IAAI,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC;YAE3B,IAAI,IAAI,KAAK,cAAc,EAAE,CAAC;gBAC5B,MAAM,EAAE,GAAG,QAAQ,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC;gBACpC,MAAM,MAAM,GAAG,EAAE,CAAC,MAAM,CAAC,CAAC;gBAE1B,IAAI,MAAM,KAAK,qBAAqB,EAAE,CAAC;oBACrC,IAAI,cAAc,CAAC,QAAQ,CAAC,EAAE,CAAC,eAAe,CAAC,CAAC,CAAC;wBAAE,gBAAgB,GAAG,EAAE,CAAC;gBAC3E,CAAC;qBAAM,IAAI,M
AAM,KAAK,qBAAqB,IAAI,gBAAgB,KAAK,IAAI,EAAE,CAAC;oBACzE,MAAM,KAAK,GAAG,QAAQ,CAAC,EAAE,CAAC,OAAO,CAAC,CAAC,CAAC;oBACpC,IAAI,KAAK,CAAC,MAAM,CAAC,KAAK,kBAAkB,EAAE,CAAC;wBACzC,MAAM,OAAO,GAAG,KAAK,CAAC,cAAc,CAAC,CAAC;wBACtC,IAAI,OAAO,OAAO,KAAK,QAAQ;4BAAE,gBAAgB,IAAI,OAAO,CAAC;wBAC7D,IAAI,aAAa,CAAC,gBAAgB,CAAC,KAAK,IAAI;4BAAE,YAAY,EAAE,CAAC;oBAC/D,CAAC;gBACH,CAAC;qBAAM,IAAI,MAAM,KAAK,oBAAoB,EAAE,CAAC;oBAC3C,YAAY,EAAE,CAAC;gBACjB,CAAC;qBAAM,IAAI,MAAM,KAAK,cAAc,EAAE,CAAC;oBACrC,YAAY,EAAE,CAAC;oBACf,QAAQ,GAAG,IAAI,CAAC;gBAClB,CAAC;gBACD,OAAO;YACT,CAAC;YAED,IAAI,IAAI,KAAK,WAAW,EAAE,CAAC;gBACzB,MAAM,OAAO,GAAG,QAAQ,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC;gBACtD,KAAK,MAAM,IAAI,IAAI,KAAK,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC;oBACzD,MAAM,KAAK,GAAG,QAAQ,CAAC,IAAI,CAAC,CAAC;oBAC7B,IAAI,cAAc,CAAC,KAAK,CAAC,IAAI,MAAM,CAAC,SAAS,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC;wBAAE,OAAO;gBACzE,CAAC;gBACD,OAAO;YACT,CAAC;YAED,IAAI,IAAI,KAAK,QAAQ,EAAE,CAAC;gBACtB,QAAQ,GAAG,IAAI,CAAC;YAClB,CAAC;QACH,CAAC;QACD,MAAM,CAAC,MAAkB;YACvB,OAAO;gBACL,QAAQ;gBACR,UAAU,EAAE,QAAQ,CAAC,CAAC,CAAC,IAAI,IAAI;gBAC/B,UAAU,EAAE,QAAQ,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM;aACnD,CAAC;QACJ,CAAC;KACF,CAAC;AACJ,CAAC"}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import { type Result } from "../result.js";
|
|
2
|
+
import { type CaseLoadError, type LoadedCase } from "./cases.js";
|
|
3
|
+
import { type EvalReport } from "./report.js";
|
|
4
|
+
import { type RunnerOptions } from "./runner.js";
|
|
5
|
+
/** Inputs accepted by {@link runEval}. */
export interface EvalOptions {
    /** Directory the eval cases are loaded from. */
    readonly casesDir: string;
    /** Working directory forwarded to the runner. */
    readonly cwd: string;
    /** When set, keeps only cases whose `suite` equals this value. */
    readonly suite?: string;
    /** When set, keeps only the case whose `id` equals this value. */
    readonly caseId?: string;
    /** When set, keeps only cases whose `tier` equals this value. */
    readonly tier?: LoadedCase["tier"];
    /** Forwarded to the runner when defined. */
    readonly runs?: number;
    /** Forwarded to the runner when defined. */
    readonly concurrency?: number;
    /** Forwarded to the runner when defined. */
    readonly model?: string;
    /** Forwarded to the runner when defined. */
    readonly onRun?: RunnerOptions["onRun"];
}

/**
 * Loads, filters, runs and scores eval cases.
 *
 * @returns `ok` with the report, or `err` with the list of case-load errors
 * (including "no cases matched" and unresolved-skill errors).
 */
export declare function runEval(
    options: EvalOptions,
): Promise<Result<EvalReport, CaseLoadError[]>>;
|
|
17
|
+
export type { EvalReport, CaseReport } from "./report.js";
|
|
18
|
+
export { formatConsole, toJson } from "./report.js";
|
|
19
|
+
export type { LoadedCase, CaseLoadError } from "./cases.js";
|
|
20
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/eval/index.ts"],"names":[],"mappings":"AACA,OAAO,EAAW,KAAK,MAAM,EAAE,MAAM,cAAc,CAAC;AACpD,OAAO,EAA+B,KAAK,aAAa,EAAE,KAAK,UAAU,EAAE,MAAM,YAAY,CAAC;AAC9F,OAAO,EAAgC,KAAK,UAAU,EAAE,MAAM,aAAa,CAAC;AAC5E,OAAO,EAAY,KAAK,aAAa,EAAE,MAAM,aAAa,CAAC;AAG3D,MAAM,WAAW,WAAW;IAC1B,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC;IAC1B,QAAQ,CAAC,GAAG,EAAE,MAAM,CAAC;IACrB,QAAQ,CAAC,KAAK,CAAC,EAAE,MAAM,CAAC;IACxB,QAAQ,CAAC,MAAM,CAAC,EAAE,MAAM,CAAC;IACzB,QAAQ,CAAC,IAAI,CAAC,EAAE,UAAU,CAAC,MAAM,CAAC,CAAC;IACnC,QAAQ,CAAC,IAAI,CAAC,EAAE,MAAM,CAAC;IACvB,QAAQ,CAAC,WAAW,CAAC,EAAE,MAAM,CAAC;IAC9B,QAAQ,CAAC,KAAK,CAAC,EAAE,MAAM,CAAC;IACxB,QAAQ,CAAC,KAAK,CAAC,EAAE,aAAa,CAAC,OAAO,CAAC,CAAC;CACzC;AAED,wBAAsB,OAAO,CAAC,OAAO,EAAE,WAAW,GAAG,OAAO,CAAC,MAAM,CAAC,UAAU,EAAE,aAAa,EAAE,CAAC,CAAC,CA6BhG;AAWD,YAAY,EAAE,UAAU,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AAC1D,OAAO,EAAE,aAAa,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AACpD,YAAY,EAAE,UAAU,EAAE,aAAa,EAAE,MAAM,YAAY,CAAC"}
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
import { defaultSources, discoverInstalled, indexInstalled } from "../installed.js";
|
|
2
|
+
import { err, ok } from "../result.js";
|
|
3
|
+
import { loadCases, unresolvedSkills } from "./cases.js";
|
|
4
|
+
import { buildReport } from "./report.js";
|
|
5
|
+
import { runCases } from "./runner.js";
|
|
6
|
+
import { scoreCase } from "./score.js";
|
|
7
|
+
/**
 * Runs an evaluation end to end: load cases, filter them, check that every
 * referenced skill is installed, execute the runs and score each case.
 *
 * @param options See `EvalOptions`.
 * @returns `ok(report)` on success; `err([...])` when loading fails, when the
 * filters match nothing, or when a case references an unresolved skill.
 */
export async function runEval(options) {
    const loaded = await loadCases(options.casesDir);
    if (!loaded.ok) {
        return err(loaded.error);
    }

    const selected = select(loaded.value, options);
    if (selected.length === 0) {
        const noMatch = { file: options.casesDir, message: "no cases matched the given filters" };
        return err([noMatch]);
    }

    // Refuse to spend runs on cases whose expected skills are not installed.
    const sources = defaultSources();
    const installed = indexInstalled(await discoverInstalled(sources));
    const installedIds = new Set(installed.skills.keys());
    const unresolved = unresolvedSkills(selected, installedIds);
    if (unresolved.length > 0) {
        return err(unresolved);
    }

    // Copy over only the optional settings the caller actually provided.
    const runnerOptions = { cwd: options.cwd };
    for (const key of ["runs", "concurrency", "model", "onRun"]) {
        if (options[key] !== undefined) {
            runnerOptions[key] = options[key];
        }
    }

    const results = await runCases(selected, runnerOptions);
    const reports = results.map((entry) => {
        const { evalCase, runs } = entry;
        const score = scoreCase(evalCase.expect, runs, evalCase.threshold);
        return { evalCase, score };
    });
    return ok(buildReport(reports));
}
|
|
34
|
+
/**
 * Filters cases by the optional `caseId`, `suite` and `tier` settings.
 * A filter that is unset (or otherwise falsy) matches every case.
 *
 * @returns A new array with the cases that satisfy all active filters.
 */
function select(cases, options) {
    return cases.filter((evalCase) => {
        const idMatches = !options.caseId || evalCase.id === options.caseId;
        const suiteMatches = !options.suite || evalCase.suite === options.suite;
        const tierMatches = !options.tier || evalCase.tier === options.tier;
        return idMatches && suiteMatches && tierMatches;
    });
}
|
|
45
|
+
export { formatConsole, toJson } from "./report.js";
|
|
46
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/eval/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,cAAc,EAAE,iBAAiB,EAAE,cAAc,EAAE,MAAM,iBAAiB,CAAC;AACpF,OAAO,EAAE,GAAG,EAAE,EAAE,EAAe,MAAM,cAAc,CAAC;AACpD,OAAO,EAAE,SAAS,EAAE,gBAAgB,EAAuC,MAAM,YAAY,CAAC;AAC9F,OAAO,EAAE,WAAW,EAAoC,MAAM,aAAa,CAAC;AAC5E,OAAO,EAAE,QAAQ,EAAsB,MAAM,aAAa,CAAC;AAC3D,OAAO,EAAE,SAAS,EAAE,MAAM,YAAY,CAAC;AAcvC,MAAM,CAAC,KAAK,UAAU,OAAO,CAAC,OAAoB;IAChD,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC;IACjD,IAAI,CAAC,MAAM,CAAC,EAAE;QAAE,OAAO,GAAG,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;IAEzC,MAAM,QAAQ,GAAG,MAAM,CAAC,MAAM,CAAC,KAAK,EAAE,OAAO,CAAC,CAAC;IAC/C,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC1B,OAAO,GAAG,CAAC,CAAC,EAAE,IAAI,EAAE,OAAO,CAAC,QAAQ,EAAE,OAAO,EAAE,oCAAoC,EAAE,CAAC,CAAC,CAAC;IAC1F,CAAC;IAED,MAAM,SAAS,GAAG,cAAc,CAAC,MAAM,iBAAiB,CAAC,cAAc,EAAE,CAAC,CAAC,CAAC;IAC5E,MAAM,YAAY,GAAG,IAAI,GAAG,CAAC,SAAS,CAAC,MAAM,CAAC,IAAI,EAAE,CAAC,CAAC;IACtD,MAAM,UAAU,GAAG,gBAAgB,CAAC,QAAQ,EAAE,YAAY,CAAC,CAAC;IAC5D,IAAI,UAAU,CAAC,MAAM,GAAG,CAAC;QAAE,OAAO,GAAG,CAAC,UAAU,CAAC,CAAC;IAElD,MAAM,aAAa,GAAkB;QACnC,GAAG,EAAE,OAAO,CAAC,GAAG;QAChB,GAAG,CAAC,OAAO,CAAC,IAAI,KAAK,SAAS,IAAI,EAAE,IAAI,EAAE,OAAO,CAAC,IAAI,EAAE,CAAC;QACzD,GAAG,CAAC,OAAO,CAAC,WAAW,KAAK,SAAS,IAAI,EAAE,WAAW,EAAE,OAAO,CAAC,WAAW,EAAE,CAAC;QAC9E,GAAG,CAAC,OAAO,CAAC,KAAK,KAAK,SAAS,IAAI,EAAE,KAAK,EAAE,OAAO,CAAC,KAAK,EAAE,CAAC;QAC5D,GAAG,CAAC,OAAO,CAAC,KAAK,KAAK,SAAS,IAAI,EAAE,KAAK,EAAE,OAAO,CAAC,KAAK,EAAE,CAAC;KAC7D,CAAC;IACF,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,QAAQ,EAAE,aAAa,CAAC,CAAC;IAExD,MAAM,OAAO,GAAiB,OAAO,CAAC,GAAG,CAAC,CAAC,EAAE,QAAQ,EAAE,IAAI,EAAE,EAAE,EAAE,CAAC,CAAC;QACjE,QAAQ;QACR,KAAK,EAAE,SAAS,CAAC,QAAQ,CAAC,MAAM,EAAE,IAAI,EAAE,QAAQ,CAAC,SAAS,CAAC;KAC5D,CAAC,CAAC,CAAC;IAEJ,OAAO,EAAE,CAAC,WAAW,CAAC,OAAO,CAAC,CAAC,CAAC;AAClC,CAAC;AAED,SAAS,MAAM,CAAC,KAA4B,EAAE,OAAoB;IAChE,OAAO,KAAK,CAAC,MAAM,CAAC,CAAC,QAAQ,EAAE,EAAE;QAC/B,IAAI,OAAO,CAAC,MAAM,IAAI,QAAQ,CAAC,EAAE,KAAK,OAAO,CAAC,MAAM;YAAE,OAAO,KAAK,CAAC;QACnE,IAAI,OAAO,CAA
C,KAAK,IAAI,QAAQ,CAAC,KAAK,KAAK,OAAO,CAAC,KAAK;YAAE,OAAO,KAAK,CAAC;QACpE,IAAI,OAAO,CAAC,IAAI,IAAI,QAAQ,CAAC,IAAI,KAAK,OAAO,CAAC,IAAI;YAAE,OAAO,KAAK,CAAC;QACjE,OAAO,IAAI,CAAC;IACd,CAAC,CAAC,CAAC;AACL,CAAC;AAGD,OAAO,EAAE,aAAa,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC"}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
import type { LoadedCase } from "./cases.js";
|
|
2
|
+
import type { CaseScore } from "./score.js";
|
|
3
|
+
/** A case paired with the score computed for it. */
export interface CaseReport {
    readonly evalCase: LoadedCase;
    readonly score: CaseScore;
}

/** Aggregate outcome of an eval run. */
export interface EvalReport {
    /** Every scored case, in the order given to `buildReport`. */
    readonly cases: readonly CaseReport[];
    /** Number of cases whose score passed. */
    readonly passed: number;
    /** Number of remaining cases (`cases.length - passed`). */
    readonly failed: number;
}

/** Counts passing cases and wraps them into a report. */
export declare function buildReport(cases: readonly CaseReport[]): EvalReport;

/** Renders the report as plain text, grouped by `tier/suite`, ending with a summary line. */
export declare function formatConsole(report: EvalReport): string;

/** Serializes the report as JSON indented with two spaces. */
export declare function toJson(report: EvalReport): string;
|
|
15
|
+
//# sourceMappingURL=report.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"report.d.ts","sourceRoot":"","sources":["../../src/eval/report.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,YAAY,CAAC;AAC7C,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,YAAY,CAAC;AAG5C,MAAM,WAAW,UAAU;IACzB,QAAQ,CAAC,QAAQ,EAAE,UAAU,CAAC;IAC9B,QAAQ,CAAC,KAAK,EAAE,SAAS,CAAC;CAC3B;AAED,MAAM,WAAW,UAAU;IACzB,QAAQ,CAAC,KAAK,EAAE,SAAS,UAAU,EAAE,CAAC;IACtC,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;CACzB;AAED,wBAAgB,WAAW,CAAC,KAAK,EAAE,SAAS,UAAU,EAAE,GAAG,UAAU,CAGpE;AAED,wBAAgB,aAAa,CAAC,MAAM,EAAE,UAAU,GAAG,MAAM,CAuBxD;AAED,wBAAgB,MAAM,CAAC,MAAM,EAAE,UAAU,GAAG,MAAM,CAyBjD"}
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
/**
 * Wraps scored cases into a report with pass/fail totals.
 *
 * @param cases Scored cases; each one is counted as passed when `score.pass` is truthy.
 * @returns `{ cases, passed, failed }`, where `cases` is the array that was passed in.
 */
export function buildReport(cases) {
    let passed = 0;
    for (const entry of cases) {
        if (entry.score.pass) {
            passed += 1;
        }
    }
    const failed = cases.length - passed;
    return { cases, passed, failed };
}
|
|
5
|
+
/**
 * Renders a report as text.
 *
 * Cases are grouped under a `tier/suite` heading. Each case gets one status
 * line; failing cases additionally list the observed histogram, the
 * (truncated) prompt and the case note when there is one. A blank line
 * follows every group and a summary line closes the output.
 *
 * @returns The lines joined with `\n`.
 */
export function formatConsole(report) {
    const out = [];
    const byGroup = groupBy(report.cases, (c) => `${c.evalCase.tier}/${c.evalCase.suite}`);

    byGroup.forEach((entries, heading) => {
        out.push(heading);
        entries.forEach(({ evalCase, score }) => {
            const tag = score.pass ? "PASS" : "FAIL";
            const tally = `${score.matched}/${score.runs}`;
            out.push(` ${tag} ${evalCase.id} ${tally} → ${describeExpectation(evalCase.expect)}`);
            if (score.pass) {
                return;
            }
            // Extra context is only printed for failures.
            out.push(` got: ${formatHistogram(score.histogram)}`);
            out.push(` prompt: ${truncate(evalCase.prompt)}`);
            if (evalCase.note) {
                out.push(` note: ${evalCase.note}`);
            }
        });
        out.push("");
    });

    const total = report.passed + report.failed;
    // Avoid dividing by zero for an empty report.
    const pct = total === 0 ? 0 : Math.round((report.passed / total) * 100);
    out.push(`Summary: ${report.passed}/${total} cases passed (${pct}%).`);
    return out.join("\n");
}
|
|
28
|
+
/**
 * Serializes a report as JSON with two-space indentation.
 *
 * The output has a `summary` object (`total`, `passed`, `failed`) and a
 * `cases` array; each case's histogram map is converted to a plain object.
 */
export function toJson(report) {
    const summary = {
        total: report.passed + report.failed,
        passed: report.passed,
        failed: report.failed,
    };
    const cases = report.cases.map((entry) => {
        const { evalCase, score } = entry;
        return {
            id: evalCase.id,
            suite: evalCase.suite,
            tier: evalCase.tier,
            prompt: evalCase.prompt,
            expect: evalCase.expect,
            pass: score.pass,
            matched: score.matched,
            runs: score.runs,
            triggerRate: score.triggerRate,
            threshold: score.threshold,
            histogram: Object.fromEntries(score.histogram),
        };
    });
    return JSON.stringify({ summary, cases }, null, 2);
}
|
|
50
|
+
/**
 * Produces a short human-readable description of a case expectation.
 *
 * The first matching key decides the form, checked in this order:
 * `noSkill`, `first`, `anyOf`, and finally `path`.
 */
function describeExpectation(expectation) {
    return "noSkill" in expectation
        ? "(no skill)"
        : "first" in expectation
            ? expectation.first
            : "anyOf" in expectation
                ? `one of [${expectation.anyOf.join(", ")}]`
                : expectation.path.join(" → ");
}
|
|
59
|
+
/**
 * Formats an id → count map as `id ×count` pairs, highest count first,
 * separated by `", "`. Entries with equal counts keep their map order.
 */
function formatHistogram(histogram) {
    const ranked = Array.from(histogram.entries());
    ranked.sort(([, left], [, right]) => right - left);
    const parts = [];
    for (const [id, count] of ranked) {
        parts.push(`${id} ×${count}`);
    }
    return parts.join(", ");
}
|
|
65
|
+
/**
 * Collapses every whitespace run to a single space, trims the ends, and
 * shortens the result to at most `max` characters, marking a cut with `…`.
 *
 * @param text Text to condense.
 * @param max Maximum length of the returned string, ellipsis included.
 */
function truncate(text, max = 80) {
    const single = text.split(/\s+/).join(" ").trim();
    if (single.length <= max) {
        return single;
    }
    const head = single.slice(0, max - 1);
    return `${head}…`;
}
|
|
69
|
+
/**
 * Buckets items by the value `key` returns for each of them.
 *
 * @returns A `Map` from key to the items sharing it; keys appear in
 * first-seen order and items keep their input order inside each bucket.
 */
function groupBy(items, key) {
    const groups = new Map();
    for (const item of items) {
        const bucketKey = key(item);
        if (!groups.has(bucketKey)) {
            groups.set(bucketKey, []);
        }
        groups.get(bucketKey).push(item);
    }
    return groups;
}
|
|
81
|
+
//# sourceMappingURL=report.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"report.js","sourceRoot":"","sources":["../../src/eval/report.ts"],"names":[],"mappings":"AAeA,MAAM,UAAU,WAAW,CAAC,KAA4B;IACtD,MAAM,MAAM,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,MAAM,CAAC;IACxD,OAAO,EAAE,KAAK,EAAE,MAAM,EAAE,MAAM,EAAE,KAAK,CAAC,MAAM,GAAG,MAAM,EAAE,CAAC;AAC1D,CAAC;AAED,MAAM,UAAU,aAAa,CAAC,MAAkB;IAC9C,MAAM,KAAK,GAAa,EAAE,CAAC;IAC3B,MAAM,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,KAAK,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC,IAAI,IAAI,CAAC,CAAC,QAAQ,CAAC,KAAK,EAAE,CAAC,CAAC;IAEtF,KAAK,MAAM,CAAC,KAAK,EAAE,OAAO,CAAC,IAAI,MAAM,EAAE,CAAC;QACtC,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QAClB,KAAK,MAAM,EAAE,QAAQ,EAAE,KAAK,EAAE,IAAI,OAAO,EAAE,CAAC;YAC1C,MAAM,GAAG,GAAG,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC;YACzC,MAAM,KAAK,GAAG,GAAG,KAAK,CAAC,OAAO,IAAI,KAAK,CAAC,IAAI,EAAE,CAAC;YAC/C,KAAK,CAAC,IAAI,CAAC,KAAK,GAAG,KAAK,QAAQ,CAAC,EAAE,KAAK,KAAK,OAAO,mBAAmB,CAAC,QAAQ,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC;YAC5F,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC;gBAChB,KAAK,CAAC,IAAI,CAAC,kBAAkB,eAAe,CAAC,KAAK,CAAC,SAAS,CAAC,EAAE,CAAC,CAAC;gBACjE,KAAK,CAAC,IAAI,CAAC,qBAAqB,QAAQ,CAAC,QAAQ,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC;gBAC7D,IAAI,QAAQ,CAAC,IAAI;oBAAE,KAAK,CAAC,IAAI,CAAC,mBAAmB,QAAQ,CAAC,IAAI,EAAE,CAAC,CAAC;YACpE,CAAC;QACH,CAAC;QACD,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IACjB,CAAC;IAED,MAAM,KAAK,GAAG,MAAM,CAAC,MAAM,GAAG,MAAM,CAAC,MAAM,CAAC;IAC5C,MAAM,GAAG,GAAG,KAAK,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,MAAM,GAAG,KAAK,CAAC,GAAG,GAAG,CAAC,CAAC;IACxE,KAAK,CAAC,IAAI,CAAC,YAAY,MAAM,CAAC,MAAM,IAAI,KAAK,kBAAkB,GAAG,KAAK,CAAC,CAAC;IACzE,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC1B,CAAC;AAED,MAAM,UAAU,MAAM,CAAC,MAAkB;IACvC,OAAO,IAAI,CAAC,SAAS,CACnB;QACE,OAAO,EAAE;YACP,KAAK,EAAE,MAAM,CAAC,MAAM,GAAG,MAAM,CAAC,MAAM;YACpC,MAAM,EAAE,MAAM,CAAC,MAAM;YACrB,MAAM,EAAE,MAAM,CAAC,MAAM;SACtB;QACD,KAAK,EAAE,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,EAAE,QAAQ,EAAE,KAAK,EAAE,EAAE,EAAE,CAAC,CAAC;YAChD,EAAE,E
AAE,QAAQ,CAAC,EAAE;YACf,KAAK,EAAE,QAAQ,CAAC,KAAK;YACrB,IAAI,EAAE,QAAQ,CAAC,IAAI;YACnB,MAAM,EAAE,QAAQ,CAAC,MAAM;YACvB,MAAM,EAAE,QAAQ,CAAC,MAAM;YACvB,IAAI,EAAE,KAAK,CAAC,IAAI;YAChB,OAAO,EAAE,KAAK,CAAC,OAAO;YACtB,IAAI,EAAE,KAAK,CAAC,IAAI;YAChB,WAAW,EAAE,KAAK,CAAC,WAAW;YAC9B,SAAS,EAAE,KAAK,CAAC,SAAS;YAC1B,SAAS,EAAE,MAAM,CAAC,WAAW,CAAC,KAAK,CAAC,SAAS,CAAC;SAC/C,CAAC,CAAC;KACJ,EACD,IAAI,EACJ,CAAC,CACF,CAAC;AACJ,CAAC;AAED,SAAS,mBAAmB,CAAC,WAAwB;IACnD,IAAI,SAAS,IAAI,WAAW;QAAE,OAAO,YAAY,CAAC;IAClD,IAAI,OAAO,IAAI,WAAW;QAAE,OAAO,WAAW,CAAC,KAAK,CAAC;IACrD,IAAI,OAAO,IAAI,WAAW;QAAE,OAAO,WAAW,WAAW,CAAC,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC;IAC9E,OAAO,WAAW,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;AACtC,CAAC;AAED,SAAS,eAAe,CAAC,SAAsC;IAC7D,OAAO,CAAC,GAAG,SAAS,CAAC,OAAO,EAAE,CAAC;SAC5B,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;SAC3B,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,KAAK,CAAC,EAAE,EAAE,CAAC,GAAG,EAAE,KAAK,KAAK,EAAE,CAAC;SACvC,IAAI,CAAC,IAAI,CAAC,CAAC;AAChB,CAAC;AAED,SAAS,QAAQ,CAAC,IAAY,EAAE,GAAG,GAAG,EAAE;IACtC,MAAM,MAAM,GAAG,IAAI,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;IAChD,OAAO,MAAM,CAAC,MAAM,GAAG,GAAG,CAAC,CAAC,CAAC,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,MAAM,CAAC;AACvE,CAAC;AAED,SAAS,OAAO,CAAI,KAAmB,EAAE,GAAwB;IAC/D,MAAM,MAAM,GAAG,IAAI,GAAG,EAAe,CAAC;IACtC,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,MAAM,CAAC,GAAG,GAAG,CAAC,IAAI,CAAC,CAAC;QACpB,MAAM,QAAQ,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;QAC/B,IAAI,QAAQ;YAAE,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;;YAC7B,MAAM,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC,IAAI,CAAC,CAAC,CAAC;IAC7B,CAAC;IACD,OAAO,MAAM,CAAC;AAChB,CAAC"}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import { type DetectionResult } from "./detect.js";
|
|
2
|
+
import type { LoadedCase } from "./cases.js";
|
|
3
|
+
/** Settings for {@link runCases}. */
export interface RunnerOptions {
    /** Base working directory; a case's own relative `cwd` is resolved against it. */
    readonly cwd: string;
    /** Runs per case when the case does not set its own count (runner default: 5). */
    readonly runs?: number;
    /** Maximum number of runs in flight (runner default: 1, never less than 1). */
    readonly concurrency?: number;
    /** Per-run time limit in milliseconds (runner default: 60000). */
    readonly timeoutMs?: number;
    /** When set, passed to the spawned command as `--model`. */
    readonly model?: string;
    /** Executable to spawn (runner default: `claude`). */
    readonly claudeBin?: string;
    /** Called after every individual run with the case id and its detection result. */
    readonly onRun?: (caseId: string, result: DetectionResult) => void;
}

/** A case together with the detection results of all its runs. */
export interface CaseRuns {
    readonly evalCase: LoadedCase;
    readonly runs: readonly DetectionResult[];
}

/** Executes every case the configured number of times and collects the results per case. */
export declare function runCases(
    cases: readonly LoadedCase[],
    options: RunnerOptions,
): Promise<CaseRuns[]>;
|
|
17
|
+
//# sourceMappingURL=runner.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"runner.d.ts","sourceRoot":"","sources":["../../src/eval/runner.ts"],"names":[],"mappings":"AAIA,OAAO,EAAkB,KAAK,eAAe,EAAE,MAAM,aAAa,CAAC;AACnE,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,YAAY,CAAC;AAM7C,MAAM,WAAW,aAAa;IAC5B,QAAQ,CAAC,GAAG,EAAE,MAAM,CAAC;IACrB,QAAQ,CAAC,IAAI,CAAC,EAAE,MAAM,CAAC;IACvB,QAAQ,CAAC,WAAW,CAAC,EAAE,MAAM,CAAC;IAC9B,QAAQ,CAAC,SAAS,CAAC,EAAE,MAAM,CAAC;IAC5B,QAAQ,CAAC,KAAK,CAAC,EAAE,MAAM,CAAC;IACxB,QAAQ,CAAC,SAAS,CAAC,EAAE,MAAM,CAAC;IAC5B,QAAQ,CAAC,KAAK,CAAC,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,eAAe,KAAK,IAAI,CAAC;CACpE;AAED,MAAM,WAAW,QAAQ;IACvB,QAAQ,CAAC,QAAQ,EAAE,UAAU,CAAC;IAC9B,QAAQ,CAAC,IAAI,EAAE,SAAS,eAAe,EAAE,CAAC;CAC3C;AAED,wBAAsB,QAAQ,CAC5B,KAAK,EAAE,SAAS,UAAU,EAAE,EAC5B,OAAO,EAAE,aAAa,GACrB,OAAO,CAAC,QAAQ,EAAE,CAAC,CAgBrB"}
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
import { spawn } from "node:child_process";
|
|
2
|
+
import { isAbsolute, resolve } from "node:path";
|
|
3
|
+
import { createInterface } from "node:readline";
|
|
4
|
+
import { createDetector } from "./detect.js";
|
|
5
|
+
// Runs per case when neither the case nor the options specify a count.
const DEFAULT_RUNS = 5;
// Runs in flight at once when the options do not specify a concurrency.
const DEFAULT_CONCURRENCY = 1;
// Per-run time limit in milliseconds when the options do not specify one.
const DEFAULT_TIMEOUT_MS = 60 * 1000;
|
|
8
|
+
/**
 * Runs every case the requested number of times, with bounded concurrency.
 *
 * The run count comes from the case (`evalCase.runs`), then `options.runs`,
 * then `DEFAULT_RUNS`. `options.onRun` is invoked after each individual run.
 *
 * @returns One `{ evalCase, runs }` entry per input case, in input order.
 */
export async function runCases(cases, options) {
    const byCase = new Map();
    const jobs = [];
    for (const evalCase of cases) {
        byCase.set(evalCase, []);
        const count = evalCase.runs ?? options.runs ?? DEFAULT_RUNS;
        // One job per run; all jobs of a case share the same case object.
        for (const job of Array.from({ length: count }, () => evalCase)) {
            jobs.push(job);
        }
    }

    const concurrency = Math.max(1, options.concurrency ?? DEFAULT_CONCURRENCY);
    const executeJob = async (evalCase) => {
        const result = await runOnce(evalCase, options);
        byCase.get(evalCase)?.push(result);
        options.onRun?.(evalCase.id, result);
    };
    await forEachLimit(jobs, concurrency, executeJob);

    return cases.map((evalCase) => {
        const runs = byCase.get(evalCase) ?? [];
        return { evalCase, runs };
    });
}
|
|
23
|
+
/**
 * Number of skill ids the detector must record before a run can stop:
 * the length of the expected `path` for path expectations, otherwise 1.
 */
function skillsToCollect(evalCase) {
    const expectation = evalCase.expect;
    if ("path" in expectation) {
        return expectation.path.length;
    }
    return 1;
}
|
|
26
|
+
/**
 * Executes one run of a case: spawns the CLI with the case prompt, feeds its
 * stdout to a detector line by line, and kills the child as soon as the
 * detector is done or the time limit expires.
 *
 * @param evalCase Case to run; its optional `cwd` is resolved against `options.cwd`.
 * @param options Runner settings (`claudeBin`, `model`, `timeoutMs`, `cwd`).
 * @returns The detector's result, with fallback reason `"timeout"` when the
 * time limit fired and `"no-skill"` otherwise.
 * @throws Error when the child process emits `error` before the output is drained.
 */
async function runOnce(evalCase, options) {
    const workingDir = evalCase.cwd ? resolveCwd(options.cwd, evalCase.cwd) : options.cwd;
    const cliArgs = [
        "-p",
        evalCase.prompt,
        "--output-format",
        "stream-json",
        "--verbose",
        "--include-partial-messages",
        ...(options.model ? ["--model", options.model] : []),
    ];

    // NOTE(review): CLAUDECODE is stripped from the child's environment, presumably so the
    // spawned CLI does not behave as if nested inside another session — confirm.
    const childEnv = { ...process.env };
    delete childEnv["CLAUDECODE"];

    const detector = createDetector(skillsToCollect(evalCase));
    const child = spawn(options.claudeBin ?? "claude", cliArgs, {
        cwd: workingDir,
        env: childEnv,
        stdio: ["ignore", "pipe", "ignore"],
    });
    const forceKill = () => child.kill("SIGKILL");

    // Never resolves; it only rejects, so that a spawn error ends the race below.
    const spawnFailure = new Promise((_resolve, reject) => {
        child.on("error", (cause) => {
            reject(new Error(`failed to spawn claude: ${cause.message}`));
        });
    });

    let timedOut = false;
    const timer = setTimeout(() => {
        timedOut = true;
        forceKill();
    }, options.timeoutMs ?? DEFAULT_TIMEOUT_MS);

    try {
        await Promise.race([drain(child.stdout, forceKill, detector), spawnFailure]);
    }
    finally {
        clearTimeout(timer);
        // Make sure no child outlives the run, whichever way the race ended.
        if (child.exitCode === null) {
            forceKill();
        }
    }
    return detector.result(timedOut ? "timeout" : "no-skill");
}
|
|
64
|
+
async function drain(stdout, stop, detector) {
|
|
65
|
+
const lines = createInterface({ input: stdout });
|
|
66
|
+
for await (const line of lines) {
|
|
67
|
+
detector.push(line);
|
|
68
|
+
if (detector.done) {
|
|
69
|
+
stop();
|
|
70
|
+
break;
|
|
71
|
+
}
|
|
72
|
+
}
|
|
73
|
+
}
|
|
74
|
+
/**
 * Returns the working directory for a case: an absolute `caseCwd` is used
 * as-is, a relative one is resolved against `base`.
 */
function resolveCwd(base, caseCwd) {
    if (isAbsolute(caseCwd)) {
        return caseCwd;
    }
    return resolve(base, caseCwd);
}
|
|
77
|
+
async function forEachLimit(items, limit, worker) {
|
|
78
|
+
let cursor = 0;
|
|
79
|
+
const runners = Array.from({ length: Math.min(limit, items.length) }, async () => {
|
|
80
|
+
while (cursor < items.length) {
|
|
81
|
+
const item = items[cursor];
|
|
82
|
+
cursor += 1;
|
|
83
|
+
if (item !== undefined)
|
|
84
|
+
await worker(item);
|
|
85
|
+
}
|
|
86
|
+
});
|
|
87
|
+
await Promise.all(runners);
|
|
88
|
+
}
|
|
89
|
+
//# sourceMappingURL=runner.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"runner.js","sourceRoot":"","sources":["../../src/eval/runner.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,KAAK,EAAE,MAAM,oBAAoB,CAAC;AAC3C,OAAO,EAAE,UAAU,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAChD,OAAO,EAAE,eAAe,EAAE,MAAM,eAAe,CAAC;AAEhD,OAAO,EAAE,cAAc,EAAwB,MAAM,aAAa,CAAC;AAGnE,MAAM,YAAY,GAAG,CAAC,CAAC;AACvB,MAAM,mBAAmB,GAAG,CAAC,CAAC;AAC9B,MAAM,kBAAkB,GAAG,MAAM,CAAC;AAiBlC,MAAM,CAAC,KAAK,UAAU,QAAQ,CAC5B,KAA4B,EAC5B,OAAsB;IAEtB,MAAM,MAAM,GAAG,IAAI,GAAG,EAAiC,CAAC;IACxD,MAAM,IAAI,GAAG,KAAK,CAAC,OAAO,CAAC,CAAC,QAAQ,EAAE,EAAE;QACtC,MAAM,CAAC,GAAG,CAAC,QAAQ,EAAE,EAAE,CAAC,CAAC;QACzB,MAAM,KAAK,GAAG,QAAQ,CAAC,IAAI,IAAI,OAAO,CAAC,IAAI,IAAI,YAAY,CAAC;QAC5D,OAAO,KAAK,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,KAAK,EAAE,EAAE,GAAG,EAAE,CAAC,QAAQ,CAAC,CAAC;IACvD,CAAC,CAAC,CAAC;IAEH,MAAM,WAAW,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,OAAO,CAAC,WAAW,IAAI,mBAAmB,CAAC,CAAC;IAC5E,MAAM,YAAY,CAAC,IAAI,EAAE,WAAW,EAAE,KAAK,EAAE,QAAQ,EAAE,EAAE;QACvD,MAAM,MAAM,GAAG,MAAM,OAAO,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;QAChD,MAAM,CAAC,GAAG,CAAC,QAAQ,CAAC,EAAE,IAAI,CAAC,MAAM,CAAC,CAAC;QACnC,OAAO,CAAC,KAAK,EAAE,CAAC,QAAQ,CAAC,EAAE,EAAE,MAAM,CAAC,CAAC;IACvC,CAAC,CAAC,CAAC;IAEH,OAAO,KAAK,CAAC,GAAG,CAAC,CAAC,QAAQ,EAAE,EAAE,CAAC,CAAC,EAAE,QAAQ,EAAE,IAAI,EAAE,MAAM,CAAC,GAAG,CAAC,QAAQ,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,CAAC;AACnF,CAAC;AAED,SAAS,eAAe,CAAC,QAAoB;IAC3C,OAAO,MAAM,IAAI,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,QAAQ,CAAC,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC;AACrE,CAAC;AAED,KAAK,UAAU,OAAO,CAAC,QAAoB,EAAE,OAAsB;IACjE,MAAM,GAAG,GAAG,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,UAAU,CAAC,OAAO,CAAC,GAAG,EAAE,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,GAAG,CAAC;IAC/E,MAAM,IAAI,GAAG;QACX,IAAI;QACJ,QAAQ,CAAC,MAAM;QACf,iBAAiB;QACjB,aAAa;QACb,WAAW;QACX,4BAA4B;KAC7B,CAAC;IACF,IAAI,OAAO,CAAC,KAAK;QAAE,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,OAAO,CAAC,KAAK,CAAC,CAAC;IAEvD,MAAM,GAAG,GAAG,EAAE,GAAG,OAAO,CAAC,GAAG,EAAE,CAAC;IAC/B,OAAO,GAAG,CAAC,YAAY,CAAC,CAAC;IAEzB,MAAM,QAAQ,GAAG,cAAc,CAAC,eAAe,CAAC,QAAQ,CAAC,CAAC,CAAC;IAC3D,MAAM,KAAK,GAAG,KAAK,CAAC
,OAAO,CAAC,SAAS,IAAI,QAAQ,EAAE,IAAI,EAAE;QACvD,GAAG;QACH,GAAG;QACH,KAAK,EAAE,CAAC,QAAQ,EAAE,MAAM,EAAE,QAAQ,CAAC;KACpC,CAAC,CAAC;IAEH,MAAM,YAAY,GAAG,IAAI,OAAO,CAAQ,CAAC,CAAC,EAAE,MAAM,EAAE,EAAE;QACpD,KAAK,CAAC,EAAE,CAAC,OAAO,EAAE,CAAC,KAAK,EAAE,EAAE,CAAC,MAAM,CAAC,IAAI,KAAK,CAAC,2BAA2B,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC,CAAC;IAC9F,CAAC,CAAC,CAAC;IAEH,MAAM,QAAQ,GAAG,EAAE,OAAO,EAAE,KAAK,EAAE,CAAC;IACpC,MAAM,KAAK,GAAG,UAAU,CAAC,GAAG,EAAE;QAC5B,QAAQ,CAAC,OAAO,GAAG,IAAI,CAAC;QACxB,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;IACxB,CAAC,EAAE,OAAO,CAAC,SAAS,IAAI,kBAAkB,CAAC,CAAC;IAE5C,IAAI,CAAC;QACH,MAAM,OAAO,CAAC,IAAI,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,MAAM,EAAE,GAAG,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,QAAQ,CAAC,EAAE,YAAY,CAAC,CAAC,CAAC;IACjG,CAAC;YAAS,CAAC;QACT,YAAY,CAAC,KAAK,CAAC,CAAC;QACpB,IAAI,KAAK,CAAC,QAAQ,KAAK,IAAI;YAAE,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;IACrD,CAAC;IAED,OAAO,QAAQ,CAAC,MAAM,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC;AACpE,CAAC;AAED,KAAK,UAAU,KAAK,CAClB,MAA6B,EAC7B,IAAgB,EAChB,QAA2C;IAE3C,MAAM,KAAK,GAAG,eAAe,CAAC,EAAE,KAAK,EAAE,MAAM,EAAE,CAAC,CAAC;IACjD,IAAI,KAAK,EAAE,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QAC/B,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACpB,IAAI,QAAQ,CAAC,IAAI,EAAE,CAAC;YAClB,IAAI,EAAE,CAAC;YACP,MAAM;QACR,CAAC;IACH,CAAC;AACH,CAAC;AAED,SAAS,UAAU,CAAC,IAAY,EAAE,OAAe;IAC/C,OAAO,UAAU,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,OAAO,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;AAChE,CAAC;AAED,KAAK,UAAU,YAAY,CACzB,KAAmB,EACnB,KAAa,EACb,MAAkC;IAElC,IAAI,MAAM,GAAG,CAAC,CAAC;IACf,MAAM,OAAO,GAAG,KAAK,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,IAAI,CAAC,GAAG,CAAC,KAAK,EAAE,KAAK,CAAC,MAAM,CAAC,EAAE,EAAE,KAAK,IAAI,EAAE;QAC/E,OAAO,MAAM,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC;YAC7B,MAAM,IAAI,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC;YAC3B,MAAM,IAAI,CAAC,CAAC;YACZ,IAAI,IAAI,KAAK,SAAS;gBAAE,MAAM,MAAM,CAAC,IAAI,CAAC,CAAC;QAC7C,CAAC;IACH,CAAC,CAAC,CAAC;IACH,MAAM,OAAO,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;AAC7B,CAAC"}
|
|
@@ -0,0 +1,253 @@
|
|
|
1
|
+
import { z } from "zod";
|
|
2
|
+
declare const Expectation: z.ZodUnion<[z.ZodObject<{
|
|
3
|
+
first: z.ZodString;
|
|
4
|
+
not: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
5
|
+
}, "strip", z.ZodTypeAny, {
|
|
6
|
+
first: string;
|
|
7
|
+
not?: string[] | undefined;
|
|
8
|
+
}, {
|
|
9
|
+
first: string;
|
|
10
|
+
not?: string[] | undefined;
|
|
11
|
+
}>, z.ZodObject<{
|
|
12
|
+
anyOf: z.ZodArray<z.ZodString, "many">;
|
|
13
|
+
not: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
14
|
+
}, "strip", z.ZodTypeAny, {
|
|
15
|
+
anyOf: string[];
|
|
16
|
+
not?: string[] | undefined;
|
|
17
|
+
}, {
|
|
18
|
+
anyOf: string[];
|
|
19
|
+
not?: string[] | undefined;
|
|
20
|
+
}>, z.ZodObject<{
|
|
21
|
+
path: z.ZodArray<z.ZodString, "many">;
|
|
22
|
+
not: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
23
|
+
}, "strip", z.ZodTypeAny, {
|
|
24
|
+
path: string[];
|
|
25
|
+
not?: string[] | undefined;
|
|
26
|
+
}, {
|
|
27
|
+
path: string[];
|
|
28
|
+
not?: string[] | undefined;
|
|
29
|
+
}>, z.ZodObject<{
|
|
30
|
+
noSkill: z.ZodLiteral<true>;
|
|
31
|
+
}, "strip", z.ZodTypeAny, {
|
|
32
|
+
noSkill: true;
|
|
33
|
+
}, {
|
|
34
|
+
noSkill: true;
|
|
35
|
+
}>]>;
|
|
36
|
+
declare const Case: z.ZodObject<{
|
|
37
|
+
id: z.ZodString;
|
|
38
|
+
prompt: z.ZodString;
|
|
39
|
+
expect: z.ZodUnion<[z.ZodObject<{
|
|
40
|
+
first: z.ZodString;
|
|
41
|
+
not: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
42
|
+
}, "strip", z.ZodTypeAny, {
|
|
43
|
+
first: string;
|
|
44
|
+
not?: string[] | undefined;
|
|
45
|
+
}, {
|
|
46
|
+
first: string;
|
|
47
|
+
not?: string[] | undefined;
|
|
48
|
+
}>, z.ZodObject<{
|
|
49
|
+
anyOf: z.ZodArray<z.ZodString, "many">;
|
|
50
|
+
not: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
51
|
+
}, "strip", z.ZodTypeAny, {
|
|
52
|
+
anyOf: string[];
|
|
53
|
+
not?: string[] | undefined;
|
|
54
|
+
}, {
|
|
55
|
+
anyOf: string[];
|
|
56
|
+
not?: string[] | undefined;
|
|
57
|
+
}>, z.ZodObject<{
|
|
58
|
+
path: z.ZodArray<z.ZodString, "many">;
|
|
59
|
+
not: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
60
|
+
}, "strip", z.ZodTypeAny, {
|
|
61
|
+
path: string[];
|
|
62
|
+
not?: string[] | undefined;
|
|
63
|
+
}, {
|
|
64
|
+
path: string[];
|
|
65
|
+
not?: string[] | undefined;
|
|
66
|
+
}>, z.ZodObject<{
|
|
67
|
+
noSkill: z.ZodLiteral<true>;
|
|
68
|
+
}, "strip", z.ZodTypeAny, {
|
|
69
|
+
noSkill: true;
|
|
70
|
+
}, {
|
|
71
|
+
noSkill: true;
|
|
72
|
+
}>]>;
|
|
73
|
+
cwd: z.ZodOptional<z.ZodString>;
|
|
74
|
+
runs: z.ZodOptional<z.ZodNumber>;
|
|
75
|
+
threshold: z.ZodOptional<z.ZodNumber>;
|
|
76
|
+
note: z.ZodOptional<z.ZodString>;
|
|
77
|
+
}, "strip", z.ZodTypeAny, {
|
|
78
|
+
id: string;
|
|
79
|
+
prompt: string;
|
|
80
|
+
expect: {
|
|
81
|
+
first: string;
|
|
82
|
+
not?: string[] | undefined;
|
|
83
|
+
} | {
|
|
84
|
+
anyOf: string[];
|
|
85
|
+
not?: string[] | undefined;
|
|
86
|
+
} | {
|
|
87
|
+
path: string[];
|
|
88
|
+
not?: string[] | undefined;
|
|
89
|
+
} | {
|
|
90
|
+
noSkill: true;
|
|
91
|
+
};
|
|
92
|
+
cwd?: string | undefined;
|
|
93
|
+
runs?: number | undefined;
|
|
94
|
+
threshold?: number | undefined;
|
|
95
|
+
note?: string | undefined;
|
|
96
|
+
}, {
|
|
97
|
+
id: string;
|
|
98
|
+
prompt: string;
|
|
99
|
+
expect: {
|
|
100
|
+
first: string;
|
|
101
|
+
not?: string[] | undefined;
|
|
102
|
+
} | {
|
|
103
|
+
anyOf: string[];
|
|
104
|
+
not?: string[] | undefined;
|
|
105
|
+
} | {
|
|
106
|
+
path: string[];
|
|
107
|
+
not?: string[] | undefined;
|
|
108
|
+
} | {
|
|
109
|
+
noSkill: true;
|
|
110
|
+
};
|
|
111
|
+
cwd?: string | undefined;
|
|
112
|
+
runs?: number | undefined;
|
|
113
|
+
threshold?: number | undefined;
|
|
114
|
+
note?: string | undefined;
|
|
115
|
+
}>;
|
|
116
|
+
export declare const TIERS: readonly ["routing", "solving"];
|
|
117
|
+
export declare const CaseFileSchema: z.ZodObject<{
|
|
118
|
+
suite: z.ZodString;
|
|
119
|
+
tier: z.ZodEnum<["routing", "solving"]>;
|
|
120
|
+
cases: z.ZodArray<z.ZodObject<{
|
|
121
|
+
id: z.ZodString;
|
|
122
|
+
prompt: z.ZodString;
|
|
123
|
+
expect: z.ZodUnion<[z.ZodObject<{
|
|
124
|
+
first: z.ZodString;
|
|
125
|
+
not: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
126
|
+
}, "strip", z.ZodTypeAny, {
|
|
127
|
+
first: string;
|
|
128
|
+
not?: string[] | undefined;
|
|
129
|
+
}, {
|
|
130
|
+
first: string;
|
|
131
|
+
not?: string[] | undefined;
|
|
132
|
+
}>, z.ZodObject<{
|
|
133
|
+
anyOf: z.ZodArray<z.ZodString, "many">;
|
|
134
|
+
not: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
135
|
+
}, "strip", z.ZodTypeAny, {
|
|
136
|
+
anyOf: string[];
|
|
137
|
+
not?: string[] | undefined;
|
|
138
|
+
}, {
|
|
139
|
+
anyOf: string[];
|
|
140
|
+
not?: string[] | undefined;
|
|
141
|
+
}>, z.ZodObject<{
|
|
142
|
+
path: z.ZodArray<z.ZodString, "many">;
|
|
143
|
+
not: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
144
|
+
}, "strip", z.ZodTypeAny, {
|
|
145
|
+
path: string[];
|
|
146
|
+
not?: string[] | undefined;
|
|
147
|
+
}, {
|
|
148
|
+
path: string[];
|
|
149
|
+
not?: string[] | undefined;
|
|
150
|
+
}>, z.ZodObject<{
|
|
151
|
+
noSkill: z.ZodLiteral<true>;
|
|
152
|
+
}, "strip", z.ZodTypeAny, {
|
|
153
|
+
noSkill: true;
|
|
154
|
+
}, {
|
|
155
|
+
noSkill: true;
|
|
156
|
+
}>]>;
|
|
157
|
+
cwd: z.ZodOptional<z.ZodString>;
|
|
158
|
+
runs: z.ZodOptional<z.ZodNumber>;
|
|
159
|
+
threshold: z.ZodOptional<z.ZodNumber>;
|
|
160
|
+
note: z.ZodOptional<z.ZodString>;
|
|
161
|
+
}, "strip", z.ZodTypeAny, {
|
|
162
|
+
id: string;
|
|
163
|
+
prompt: string;
|
|
164
|
+
expect: {
|
|
165
|
+
first: string;
|
|
166
|
+
not?: string[] | undefined;
|
|
167
|
+
} | {
|
|
168
|
+
anyOf: string[];
|
|
169
|
+
not?: string[] | undefined;
|
|
170
|
+
} | {
|
|
171
|
+
path: string[];
|
|
172
|
+
not?: string[] | undefined;
|
|
173
|
+
} | {
|
|
174
|
+
noSkill: true;
|
|
175
|
+
};
|
|
176
|
+
cwd?: string | undefined;
|
|
177
|
+
runs?: number | undefined;
|
|
178
|
+
threshold?: number | undefined;
|
|
179
|
+
note?: string | undefined;
|
|
180
|
+
}, {
|
|
181
|
+
id: string;
|
|
182
|
+
prompt: string;
|
|
183
|
+
expect: {
|
|
184
|
+
first: string;
|
|
185
|
+
not?: string[] | undefined;
|
|
186
|
+
} | {
|
|
187
|
+
anyOf: string[];
|
|
188
|
+
not?: string[] | undefined;
|
|
189
|
+
} | {
|
|
190
|
+
path: string[];
|
|
191
|
+
not?: string[] | undefined;
|
|
192
|
+
} | {
|
|
193
|
+
noSkill: true;
|
|
194
|
+
};
|
|
195
|
+
cwd?: string | undefined;
|
|
196
|
+
runs?: number | undefined;
|
|
197
|
+
threshold?: number | undefined;
|
|
198
|
+
note?: string | undefined;
|
|
199
|
+
}>, "many">;
|
|
200
|
+
}, "strip", z.ZodTypeAny, {
|
|
201
|
+
suite: string;
|
|
202
|
+
tier: "routing" | "solving";
|
|
203
|
+
cases: {
|
|
204
|
+
id: string;
|
|
205
|
+
prompt: string;
|
|
206
|
+
expect: {
|
|
207
|
+
first: string;
|
|
208
|
+
not?: string[] | undefined;
|
|
209
|
+
} | {
|
|
210
|
+
anyOf: string[];
|
|
211
|
+
not?: string[] | undefined;
|
|
212
|
+
} | {
|
|
213
|
+
path: string[];
|
|
214
|
+
not?: string[] | undefined;
|
|
215
|
+
} | {
|
|
216
|
+
noSkill: true;
|
|
217
|
+
};
|
|
218
|
+
cwd?: string | undefined;
|
|
219
|
+
runs?: number | undefined;
|
|
220
|
+
threshold?: number | undefined;
|
|
221
|
+
note?: string | undefined;
|
|
222
|
+
}[];
|
|
223
|
+
}, {
|
|
224
|
+
suite: string;
|
|
225
|
+
tier: "routing" | "solving";
|
|
226
|
+
cases: {
|
|
227
|
+
id: string;
|
|
228
|
+
prompt: string;
|
|
229
|
+
expect: {
|
|
230
|
+
first: string;
|
|
231
|
+
not?: string[] | undefined;
|
|
232
|
+
} | {
|
|
233
|
+
anyOf: string[];
|
|
234
|
+
not?: string[] | undefined;
|
|
235
|
+
} | {
|
|
236
|
+
path: string[];
|
|
237
|
+
not?: string[] | undefined;
|
|
238
|
+
} | {
|
|
239
|
+
noSkill: true;
|
|
240
|
+
};
|
|
241
|
+
cwd?: string | undefined;
|
|
242
|
+
runs?: number | undefined;
|
|
243
|
+
threshold?: number | undefined;
|
|
244
|
+
note?: string | undefined;
|
|
245
|
+
}[];
|
|
246
|
+
}>;
|
|
247
|
+
export type Tier = (typeof TIERS)[number];
|
|
248
|
+
export type Expectation = z.infer<typeof Expectation>;
|
|
249
|
+
export type EvalCase = z.infer<typeof Case>;
|
|
250
|
+
export type CaseFile = z.infer<typeof CaseFileSchema>;
|
|
251
|
+
export declare function expectedSkills(expectation: Expectation): readonly string[];
|
|
252
|
+
export {};
|
|
253
|
+
//# sourceMappingURL=schema.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"schema.d.ts","sourceRoot":"","sources":["../../src/eval/schema.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAyBxB,QAAA,MAAM,WAAW;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;IAKf,CAAC;AAEH,QAAA,MAAM,IAAI;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EAQR,CAAC;AAEH,eAAO,MAAM,KAAK,iCAAkC,CAAC;AAErD,eAAO,MAAM,cAAc;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EAIzB,CAAC;AAEH,MAAM,MAAM,IAAI,GAAG,CAAC,OAAO,KAAK,CAAC,CAAC,MAAM,CAAC,CAAC;AAC1C,MAAM,MAAM,WAAW,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,WAAW,CAAC,CAAC;AACtD,MAAM,MAAM,QAAQ,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,IAAI,CAAC,CAAC;AAC5C,MAAM,MAAM,QAAQ,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,cAAc,CAAC,CAAC;AAEtD,wBAAgB,cAAc,CAAC,WAAW,EAAE,WAAW,GAAG,SAAS,MAAM,EAAE,CAM1E"}
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
import { z } from "zod";
|
|
2
|
+
import { FQ_ID } from "../ids.js";
|
|
3
|
+
/** A fully-qualified skill id; strings that do not match `FQ_ID` are rejected with the message below. */
const FqId = z.string().regex(FQ_ID, "must be a `plugin:name` id");

/** Builds the optional `not` list (ids listed as forbidden) shared by the skill-bearing expectations. */
const forbiddenIds = () => z.array(FqId).optional();

/** Builds a list of skill ids that must contain at least `count` entries. */
const idsAtLeast = (count) => z.array(FqId).min(count);

/** `{ first }`: names a single skill id. */
const FirstExpectation = z.object({
    first: FqId,
    not: forbiddenIds(),
});

/** `{ anyOf }`: names one or more acceptable skill ids. */
const AnyOfExpectation = z.object({
    anyOf: idsAtLeast(1),
    not: forbiddenIds(),
});

/** `{ path }`: names a sequence of at least two skill ids. */
const PathExpectation = z.object({
    path: idsAtLeast(2),
    not: forbiddenIds(),
});

/** `{ noSkill: true }`: the only variant without a `not` list. */
const NoSkillExpectation = z.object({
    noSkill: z.literal(true),
});

// The member order below is kept exactly as declared; z.union evaluates its options in this order.
const Expectation = z.union([
    FirstExpectation,
    AnyOfExpectation,
    PathExpectation,
    NoSkillExpectation,
]);

/** One eval case: an id, a prompt, what is expected, and optional per-case overrides. */
const Case = z.object({
    id: z.string().min(1),
    prompt: z.string().min(1),
    expect: Expectation,
    cwd: z.string().optional(),
    // When present, must be a positive integer.
    runs: z.number().int().positive().optional(),
    // When present, must lie in the closed range [0, 1].
    threshold: z.number().min(0).max(1).optional(),
    note: z.string().optional(),
});

/** The tiers a case file may declare. */
export const TIERS = ["routing", "solving"];

/** Schema for a whole case file: a named suite, its tier, and at least one case. */
export const CaseFileSchema = z.object({
    suite: z.string().min(1),
    tier: z.enum(TIERS),
    cases: z.array(Case).min(1),
});
|
|
40
|
+
/**
 * Collects every skill id an expectation refers to.
 *
 * The result holds the ids named by `first`, `anyOf` or `path` (checked in
 * that order), followed by any ids listed under `not`. A `noSkill`
 * expectation names no ids at all.
 *
 * @param expectation A parsed expectation object.
 * @returns A new array of skill ids; empty for `noSkill`.
 */
export function expectedSkills(expectation) {
    if ("noSkill" in expectation) {
        return [];
    }
    let named;
    if ("first" in expectation) {
        named = [expectation.first];
    } else if ("anyOf" in expectation) {
        named = expectation.anyOf;
    } else {
        named = expectation.path;
    }
    // Ids from `not` are appended after the named ones; a missing list adds nothing.
    const excluded = expectation.not ?? [];
    return [...named, ...excluded];
}
|
|
50
|
+
//# sourceMappingURL=schema.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"schema.js","sourceRoot":"","sources":["../../src/eval/schema.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAExB,OAAO,EAAE,KAAK,EAAE,MAAM,WAAW,CAAC;AAElC,MAAM,IAAI,GAAG,CAAC,CAAC,MAAM,EAAE,CAAC,KAAK,CAAC,KAAK,EAAE,4BAA4B,CAAC,CAAC;AAEnE,MAAM,gBAAgB,GAAG,CAAC,CAAC,MAAM,CAAC;IAChC,KAAK,EAAE,IAAI;IACX,GAAG,EAAE,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,QAAQ,EAAE;CAC9B,CAAC,CAAC;AAEH,MAAM,gBAAgB,GAAG,CAAC,CAAC,MAAM,CAAC;IAChC,KAAK,EAAE,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IAC3B,GAAG,EAAE,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,QAAQ,EAAE;CAC9B,CAAC,CAAC;AAEH,MAAM,eAAe,GAAG,CAAC,CAAC,MAAM,CAAC;IAC/B,IAAI,EAAE,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IAC1B,GAAG,EAAE,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,QAAQ,EAAE;CAC9B,CAAC,CAAC;AAEH,MAAM,kBAAkB,GAAG,CAAC,CAAC,MAAM,CAAC;IAClC,OAAO,EAAE,CAAC,CAAC,OAAO,CAAC,IAAI,CAAC;CACzB,CAAC,CAAC;AAEH,MAAM,WAAW,GAAG,CAAC,CAAC,KAAK,CAAC;IAC1B,gBAAgB;IAChB,gBAAgB;IAChB,eAAe;IACf,kBAAkB;CACnB,CAAC,CAAC;AAEH,MAAM,IAAI,GAAG,CAAC,CAAC,MAAM,CAAC;IACpB,EAAE,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;IACrB,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;IACzB,MAAM,EAAE,WAAW;IACnB,GAAG,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAC1B,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,QAAQ,EAAE,CAAC,QAAQ,EAAE;IAC5C,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,EAAE;IAC9C,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;CAC5B,CAAC,CAAC;AAEH,MAAM,CAAC,MAAM,KAAK,GAAG,CAAC,SAAS,EAAE,SAAS,CAAU,CAAC;AAErD,MAAM,CAAC,MAAM,cAAc,GAAG,CAAC,CAAC,MAAM,CAAC;IACrC,KAAK,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;IACxB,IAAI,EAAE,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC;IACnB,KAAK,EAAE,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;CAC5B,CAAC,CAAC;AAOH,MAAM,UAAU,cAAc,CAAC,WAAwB;IACrD,IAAI,SAAS,IAAI,WAAW;QAAE,OAAO,EAAE,CAAC;IACxC,MAAM,SAAS,GAAG,WAAW,CAAC,GAAG,IAAI,EAAE,CAAC;IACxC,IAAI,OAAO,IAAI,WAAW;QAAE,OAAO,CAAC,WAAW,CAAC,KAAK,EAAE,GAAG,SAAS,CAAC,CAAC;IA
CrE,IAAI,OAAO,IAAI,WAAW;QAAE,OAAO,CAAC,GAAG,WAAW,CAAC,KAAK,EAAE,GAAG,SAAS,CAAC,CAAC;IACxE,OAAO,CAAC,GAAG,WAAW,CAAC,IAAI,EAAE,GAAG,SAAS,CAAC,CAAC;AAC7C,CAAC"}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import type { DetectionResult } from "./detect.js";
|
|
2
|
+
import type { Expectation } from "./schema.js";
|
|
3
|
+
export interface CaseScore {
|
|
4
|
+
readonly matched: number;
|
|
5
|
+
readonly runs: number;
|
|
6
|
+
readonly triggerRate: number;
|
|
7
|
+
readonly threshold: number;
|
|
8
|
+
readonly pass: boolean;
|
|
9
|
+
readonly histogram: ReadonlyMap<string, number>;
|
|
10
|
+
}
|
|
11
|
+
export declare function matchesExpectation(expectation: Expectation, run: DetectionResult): boolean;
|
|
12
|
+
export declare function scoreCase(expectation: Expectation, runs: readonly DetectionResult[], threshold?: number): CaseScore;
|
|
13
|
+
//# sourceMappingURL=score.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"score.d.ts","sourceRoot":"","sources":["../../src/eval/score.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,aAAa,CAAC;AACnD,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,aAAa,CAAC;AAI/C,MAAM,WAAW,SAAS;IACxB,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,WAAW,EAAE,MAAM,CAAC;IAC7B,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,IAAI,EAAE,OAAO,CAAC;IACvB,QAAQ,CAAC,SAAS,EAAE,WAAW,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;CACjD;AAED,wBAAgB,kBAAkB,CAAC,WAAW,EAAE,WAAW,EAAE,GAAG,EAAE,eAAe,GAAG,OAAO,CAa1F;AAED,wBAAgB,SAAS,CACvB,WAAW,EAAE,WAAW,EACxB,IAAI,EAAE,SAAS,eAAe,EAAE,EAChC,SAAS,SAAoB,GAC5B,SAAS,CAWX"}
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
// Threshold used by scoreCase when the caller passes none; at 1.0 a case passes only if every run matched.
const DEFAULT_THRESHOLD = 1.0;
|
|
2
|
+
/**
 * Decides whether a single run satisfies an expectation.
 *
 * A run that trips the `not` check never matches. Otherwise the variant is
 * chosen by which key is present, tested in the order `noSkill`, `first`,
 * `anyOf`, and finally `path`:
 *  - `noSkill`: `run.firstSkill` must be `null`;
 *  - `first`:   `run.firstSkill` must equal `expectation.first`;
 *  - `anyOf`:   `run.firstSkill` must be non-null and listed in `expectation.anyOf`;
 *  - `path`:    `expectation.path` must appear, in order, within `run.observed`.
 *
 * @param expectation A parsed expectation object.
 * @param run A detection result exposing `firstSkill` and `observed`.
 * @returns `true` when the run satisfies the expectation.
 */
export function matchesExpectation(expectation, run) {
    if (violatesNot(expectation, run)) {
        return false;
    }
    const { firstSkill } = run;
    if ("noSkill" in expectation) return firstSkill === null;
    if ("first" in expectation) return firstSkill === expectation.first;
    if ("anyOf" in expectation) {
        return firstSkill === null ? false : expectation.anyOf.includes(firstSkill);
    }
    return isOrderedSubsequence(expectation.path, run.observed);
}
|
|
16
|
+
/**
 * Scores one case across all of its runs.
 *
 * @param expectation A parsed expectation object.
 * @param runs The detection results collected for the case.
 * @param threshold Minimum trigger rate required to pass; defaults to `DEFAULT_THRESHOLD`.
 * @returns The match count, run count, trigger rate, the threshold applied,
 *          the pass flag, and a histogram built from the runs.
 */
export function scoreCase(expectation, runs, threshold = DEFAULT_THRESHOLD) {
    const total = runs.length;
    const matched = runs.reduce(
        (hits, run) => (matchesExpectation(expectation, run) ? hits + 1 : hits),
        0,
    );
    // With no runs the rate is pinned to 0 instead of dividing by zero.
    const triggerRate = total === 0 ? 0 : matched / total;
    const pass = triggerRate >= threshold;
    return {
        matched,
        runs: total,
        triggerRate,
        threshold,
        pass,
        histogram: histogramOf(runs),
    };
}
|
|
28
|
+
/**
 * Reports whether a run's first skill is one the expectation forbids.
 *
 * `noSkill` expectations are treated as having no forbidden list, and a
 * missing or empty `not` list can never be violated.
 *
 * NOTE(review): only `run.firstSkill` is compared against `not`; skills that
 * appear later in `run.observed` are not consulted here. Confirm that this is
 * the intended meaning of `not`.
 *
 * @param expectation A parsed expectation object.
 * @param run A detection result exposing `firstSkill`.
 * @returns `true` when `run.firstSkill` is non-null and listed in `not`.
 */
function violatesNot(expectation, run) {
    if ("noSkill" in expectation) {
        return false;
    }
    const banned = expectation.not;
    if (!banned || banned.length === 0) {
        return false;
    }
    const first = run.firstSkill;
    return first !== null && banned.includes(first);
}
|
|
34
|
+
/**
 * Checks whether the entries of `needle` occur within `haystack` in the same
 * relative order; other entries may sit between them.
 *
 * @param needle The ids that must be found, in order.
 * @param haystack The observed ids to scan.
 * @returns `true` once every entry of `needle` has been matched in order.
 */
function isOrderedSubsequence(needle, haystack) {
    const wanted = needle.length;
    let found = 0;
    for (const seen of haystack) {
        // Advance only when the current entry is the next one still being looked for.
        if (seen === needle[found]) {
            found++;
        }
        // Stop scanning as soon as the whole needle has been matched.
        if (found === wanted) {
            return true;
        }
    }
    return found === wanted;
}
|
|
44
|
+
/**
 * Tallies runs by their first skill.
 *
 * Runs whose `firstSkill` is `null` or `undefined` are grouped under the
 * label "(no skill)". Keys appear in the order they were first seen.
 *
 * @param runs The detection results to count.
 * @returns A `Map` from label to number of runs.
 */
function histogramOf(runs) {
    const tally = new Map();
    for (const { firstSkill } of runs) {
        const label = firstSkill ?? "(no skill)";
        const soFar = tally.has(label) ? tally.get(label) : 0;
        tally.set(label, soFar + 1);
    }
    return tally;
}
|
|
52
|
+
//# sourceMappingURL=score.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"score.js","sourceRoot":"","sources":["../../src/eval/score.ts"],"names":[],"mappings":"AAGA,MAAM,iBAAiB,GAAG,GAAG,CAAC;AAW9B,MAAM,UAAU,kBAAkB,CAAC,WAAwB,EAAE,GAAoB;IAC/E,IAAI,WAAW,CAAC,WAAW,EAAE,GAAG,CAAC;QAAE,OAAO,KAAK,CAAC;IAEhD,IAAI,SAAS,IAAI,WAAW,EAAE,CAAC;QAC7B,OAAO,GAAG,CAAC,UAAU,KAAK,IAAI,CAAC;IACjC,CAAC;IACD,IAAI,OAAO,IAAI,WAAW,EAAE,CAAC;QAC3B,OAAO,GAAG,CAAC,UAAU,KAAK,WAAW,CAAC,KAAK,CAAC;IAC9C,CAAC;IACD,IAAI,OAAO,IAAI,WAAW,EAAE,CAAC;QAC3B,OAAO,GAAG,CAAC,UAAU,KAAK,IAAI,IAAI,WAAW,CAAC,KAAK,CAAC,QAAQ,CAAC,GAAG,CAAC,UAAU,CAAC,CAAC;IAC/E,CAAC;IACD,OAAO,oBAAoB,CAAC,WAAW,CAAC,IAAI,EAAE,GAAG,CAAC,QAAQ,CAAC,CAAC;AAC9D,CAAC;AAED,MAAM,UAAU,SAAS,CACvB,WAAwB,EACxB,IAAgC,EAChC,SAAS,GAAG,iBAAiB;IAE7B,MAAM,OAAO,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,kBAAkB,CAAC,WAAW,EAAE,GAAG,CAAC,CAAC,CAAC,MAAM,CAAC;IAClF,MAAM,WAAW,GAAG,IAAI,CAAC,MAAM,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,GAAG,IAAI,CAAC,MAAM,CAAC;IAClE,OAAO;QACL,OAAO;QACP,IAAI,EAAE,IAAI,CAAC,MAAM;QACjB,WAAW;QACX,SAAS;QACT,IAAI,EAAE,WAAW,IAAI,SAAS;QAC9B,SAAS,EAAE,WAAW,CAAC,IAAI,CAAC;KAC7B,CAAC;AACJ,CAAC;AAED,SAAS,WAAW,CAAC,WAAwB,EAAE,GAAoB;IACjE,MAAM,SAAS,GAAG,SAAS,IAAI,WAAW,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,WAAW,CAAC,GAAG,CAAC;IACzE,IAAI,CAAC,SAAS,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,KAAK,CAAC;IACvD,OAAO,GAAG,CAAC,UAAU,KAAK,IAAI,IAAI,SAAS,CAAC,QAAQ,CAAC,GAAG,CAAC,UAAU,CAAC,CAAC;AACvE,CAAC;AAED,SAAS,oBAAoB,CAAC,MAAyB,EAAE,QAA2B;IAClF,IAAI,MAAM,GAAG,CAAC,CAAC;IACf,KAAK,MAAM,EAAE,IAAI,QAAQ,EAAE,CAAC;QAC1B,IAAI,EAAE,KAAK,MAAM,CAAC,MAAM,CAAC;YAAE,MAAM,IAAI,CAAC,CAAC;QACvC,IAAI,MAAM,KAAK,MAAM,CAAC,MAAM;YAAE,OAAO,IAAI,CAAC;IAC5C,CAAC;IACD,OAAO,MAAM,KAAK,MAAM,CAAC,MAAM,CAAC;AAClC,CAAC;AAED,SAAS,WAAW,CAAC,IAAgC;IACnD,MAAM,MAAM,GAAG,IAAI,GAAG,EAAkB,CAAC;IACzC,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QACvB,MAAM,GAAG,GAAG,GAAG,CAAC,UAAU,IAAI,YAAY,CAAC;QAC3C,MAAM,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,MAAM,CAAC,GAAG,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IAC9C,CAAC;IACD,OAAO,MAAM,CAAC;AAChB,C
AAC"}
|
package/dist/index.d.ts
CHANGED
|
@@ -27,6 +27,10 @@ export { resolveVendors } from "./vendor/registry.js";
|
|
|
27
27
|
export type { DiscoveredVendorPlugin, LinkedFile, Vendor, VendorEmitContext, VendorInstallContext, } from "./vendor/schema.js";
|
|
28
28
|
export { check } from "./check/index.js";
|
|
29
29
|
export type { CheckOptions, CheckResult, ReferenceViolation, ReferenceViolationKind, SourceSummary, } from "./check/index.js";
|
|
30
|
+
export { runEval, formatConsole, toJson } from "./eval/index.js";
|
|
31
|
+
export type { EvalOptions, EvalReport, CaseReport, LoadedCase, CaseLoadError, } from "./eval/index.js";
|
|
32
|
+
export { CaseFileSchema, TIERS } from "./eval/schema.js";
|
|
33
|
+
export type { CaseFile, EvalCase, Expectation, Tier } from "./eval/schema.js";
|
|
30
34
|
export { defaultSources, discoverInstalled, indexInstalled } from "./installed.js";
|
|
31
35
|
export type { InstalledAgent, InstalledArtifacts, InstalledCommand, InstalledIndex, InstalledSkill, PluginSource, } from "./installed.js";
|
|
32
36
|
//# sourceMappingURL=index.d.ts.map
|
package/dist/index.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EACL,eAAe,EACf,WAAW,EACX,mBAAmB,EACnB,WAAW,EACX,aAAa,EACb,oBAAoB,EACpB,2BAA2B,EAC3B,SAAS,GACV,MAAM,kBAAkB,CAAC;AAC1B,YAAY,EACV,SAAS,EACT,kBAAkB,EAClB,WAAW,EACX,cAAc,EACd,KAAK,EACL,SAAS,EACT,WAAW,GACZ,MAAM,kBAAkB,CAAC;AAE1B,OAAO,EAAE,YAAY,EAAE,YAAY,EAAE,MAAM,mBAAmB,CAAC;AAC/D,YAAY,EAAE,MAAM,EAAE,MAAM,mBAAmB,CAAC;AAEhD,OAAO,EAAE,sBAAsB,EAAE,gBAAgB,EAAE,MAAM,oBAAoB,CAAC;AAC9E,YAAY,EAAE,gBAAgB,EAAE,iBAAiB,EAAE,MAAM,oBAAoB,CAAC;AAE9E,OAAO,EAAE,GAAG,EAAE,EAAE,EAAE,MAAM,aAAa,CAAC;AACtC,YAAY,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AAE1C,OAAO,EAAE,iBAAiB,EAAE,UAAU,EAAE,MAAM,yBAAyB,CAAC;AACxE,YAAY,EACV,WAAW,EACX,gBAAgB,EAChB,SAAS,EACT,iBAAiB,EACjB,eAAe,GAChB,MAAM,yBAAyB,CAAC;AAEjC,OAAO,EAAE,cAAc,EAAE,MAAM,oBAAoB,CAAC;AACpD,YAAY,EAAE,qBAAqB,EAAE,WAAW,EAAE,MAAM,oBAAoB,CAAC;AAE7E,OAAO,EAAE,OAAO,EAAE,MAAM,cAAc,CAAC;AACvC,YAAY,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AAEnD,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,YAAY,EAAE,WAAW,EAAE,UAAU,EAAE,MAAM,WAAW,CAAC;AAEzD,OAAO,EAAE,OAAO,EAAE,SAAS,EAAE,MAAM,oBAAoB,CAAC;AACxD,YAAY,EAAE,cAAc,EAAE,MAAM,oBAAoB,CAAC;AACzD,OAAO,EAAE,aAAa,EAAE,aAAa,EAAE,gBAAgB,EAAE,MAAM,mBAAmB,CAAC;AACnF,YAAY,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAErD,OAAO,EAAE,WAAW,EAAE,MAAM,iBAAiB,CAAC;AAC9C,YAAY,EAAE,WAAW,EAAE,MAAM,iBAAiB,CAAC;AAEnD,OAAO,EAAE,iBAAiB,EAAE,MAAM,qBAAqB,CAAC;AACxD,YAAY,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AAEzD,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,YAAY,EACV,sBAAsB,EACtB,UAAU,EACV,MAAM,EACN,iBAAiB,EACjB,oBAAoB,GACrB,MAAM,oBAAoB,CAAC;AAE5B,OAAO,EAAE,KAAK,EAAE,MAAM,kBAAkB,CAAC;AACzC,YAAY,EACV,YAAY,EACZ,WAAW,EACX,kBAAkB,EAClB,sBAAsB,EACtB,aAAa,GACd,MAAM,kBAAkB,CAAC;AAE1B,OAAO,EAAE,cAAc,EAAE,iBAAiB,EAAE,cAAc,EAAE,MAAM,gBAAgB,CAAC;AACnF,YAAY,EACV,cAAc,EACd,kBAAkB,EAClB,gBAAgB,EAChB,cAAc,EACd,cAAc,EACd,YAAY,GACb,MAAM,gBAAgB,CAAC"}
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EACL,eAAe,EACf,WAAW,EACX,mBAAmB,EACnB,WAAW,EACX,aAAa,EACb,oBAAoB,EACpB,2BAA2B,EAC3B,SAAS,GACV,MAAM,kBAAkB,CAAC;AAC1B,YAAY,EACV,SAAS,EACT,kBAAkB,EAClB,WAAW,EACX,cAAc,EACd,KAAK,EACL,SAAS,EACT,WAAW,GACZ,MAAM,kBAAkB,CAAC;AAE1B,OAAO,EAAE,YAAY,EAAE,YAAY,EAAE,MAAM,mBAAmB,CAAC;AAC/D,YAAY,EAAE,MAAM,EAAE,MAAM,mBAAmB,CAAC;AAEhD,OAAO,EAAE,sBAAsB,EAAE,gBAAgB,EAAE,MAAM,oBAAoB,CAAC;AAC9E,YAAY,EAAE,gBAAgB,EAAE,iBAAiB,EAAE,MAAM,oBAAoB,CAAC;AAE9E,OAAO,EAAE,GAAG,EAAE,EAAE,EAAE,MAAM,aAAa,CAAC;AACtC,YAAY,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AAE1C,OAAO,EAAE,iBAAiB,EAAE,UAAU,EAAE,MAAM,yBAAyB,CAAC;AACxE,YAAY,EACV,WAAW,EACX,gBAAgB,EAChB,SAAS,EACT,iBAAiB,EACjB,eAAe,GAChB,MAAM,yBAAyB,CAAC;AAEjC,OAAO,EAAE,cAAc,EAAE,MAAM,oBAAoB,CAAC;AACpD,YAAY,EAAE,qBAAqB,EAAE,WAAW,EAAE,MAAM,oBAAoB,CAAC;AAE7E,OAAO,EAAE,OAAO,EAAE,MAAM,cAAc,CAAC;AACvC,YAAY,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AAEnD,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,YAAY,EAAE,WAAW,EAAE,UAAU,EAAE,MAAM,WAAW,CAAC;AAEzD,OAAO,EAAE,OAAO,EAAE,SAAS,EAAE,MAAM,oBAAoB,CAAC;AACxD,YAAY,EAAE,cAAc,EAAE,MAAM,oBAAoB,CAAC;AACzD,OAAO,EAAE,aAAa,EAAE,aAAa,EAAE,gBAAgB,EAAE,MAAM,mBAAmB,CAAC;AACnF,YAAY,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAErD,OAAO,EAAE,WAAW,EAAE,MAAM,iBAAiB,CAAC;AAC9C,YAAY,EAAE,WAAW,EAAE,MAAM,iBAAiB,CAAC;AAEnD,OAAO,EAAE,iBAAiB,EAAE,MAAM,qBAAqB,CAAC;AACxD,YAAY,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AAEzD,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,YAAY,EACV,sBAAsB,EACtB,UAAU,EACV,MAAM,EACN,iBAAiB,EACjB,oBAAoB,GACrB,MAAM,oBAAoB,CAAC;AAE5B,OAAO,EAAE,KAAK,EAAE,MAAM,kBAAkB,CAAC;AACzC,YAAY,EACV,YAAY,EACZ,WAAW,EACX,kBAAkB,EAClB,sBAAsB,EACtB,aAAa,GACd,MAAM,kBAAkB,CAAC;AAE1B,OAAO,EAAE,OAAO,EAAE,aAAa,EAAE,MAAM,EAAE,MAAM,iBAAiB,CAAC;AACjE,YAAY,EACV,WAAW,EACX,UAAU,EACV,UAAU,EACV,UAAU,EACV,aAAa,GACd,MAAM,iBAAiB,CAAC;AACzB,OAAO,EAAE,cAAc,EAAE,KAAK,EAAE,MAAM,kBAAkB,CAAC;AACzD,YAAY,EAAE,QAAQ,EAAE,QAAQ,EAAE,WAAW,EAAE,IAAI,EAAE,MAAM,kBAA
kB,CAAC;AAE9E,OAAO,EAAE,cAAc,EAAE,iBAAiB,EAAE,cAAc,EAAE,MAAM,gBAAgB,CAAC;AACnF,YAAY,EACV,cAAc,EACd,kBAAkB,EAClB,gBAAgB,EAChB,cAAc,EACd,cAAc,EACd,YAAY,GACb,MAAM,gBAAgB,CAAC"}
|
package/dist/index.js
CHANGED
|
@@ -13,5 +13,7 @@ export { loadHarnessConfig } from "./config/harness.js";
|
|
|
13
13
|
export { builtinVendors } from "./vendor/builtins.js";
|
|
14
14
|
export { resolveVendors } from "./vendor/registry.js";
|
|
15
15
|
export { check } from "./check/index.js";
|
|
16
|
+
export { runEval, formatConsole, toJson } from "./eval/index.js";
|
|
17
|
+
export { CaseFileSchema, TIERS } from "./eval/schema.js";
|
|
16
18
|
export { defaultSources, discoverInstalled, indexInstalled } from "./installed.js";
|
|
17
19
|
//# sourceMappingURL=index.js.map
|
package/dist/index.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EACL,eAAe,EACf,WAAW,EACX,mBAAmB,EACnB,WAAW,EACX,aAAa,EACb,oBAAoB,EACpB,2BAA2B,EAC3B,SAAS,GACV,MAAM,kBAAkB,CAAC;AAW1B,OAAO,EAAE,YAAY,EAAE,YAAY,EAAE,MAAM,mBAAmB,CAAC;AAG/D,OAAO,EAAE,sBAAsB,EAAE,gBAAgB,EAAE,MAAM,oBAAoB,CAAC;AAG9E,OAAO,EAAE,GAAG,EAAE,EAAE,EAAE,MAAM,aAAa,CAAC;AAGtC,OAAO,EAAE,iBAAiB,EAAE,UAAU,EAAE,MAAM,yBAAyB,CAAC;AASxE,OAAO,EAAE,cAAc,EAAE,MAAM,oBAAoB,CAAC;AAGpD,OAAO,EAAE,OAAO,EAAE,MAAM,cAAc,CAAC;AAGvC,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AAGjC,OAAO,EAAE,OAAO,EAAE,SAAS,EAAE,MAAM,oBAAoB,CAAC;AAExD,OAAO,EAAE,aAAa,EAAE,aAAa,EAAE,gBAAgB,EAAE,MAAM,mBAAmB,CAAC;AAGnF,OAAO,EAAE,WAAW,EAAE,MAAM,iBAAiB,CAAC;AAG9C,OAAO,EAAE,iBAAiB,EAAE,MAAM,qBAAqB,CAAC;AAGxD,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AAStD,OAAO,EAAE,KAAK,EAAE,MAAM,kBAAkB,CAAC;AASzC,OAAO,EAAE,cAAc,EAAE,iBAAiB,EAAE,cAAc,EAAE,MAAM,gBAAgB,CAAC"}
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EACL,eAAe,EACf,WAAW,EACX,mBAAmB,EACnB,WAAW,EACX,aAAa,EACb,oBAAoB,EACpB,2BAA2B,EAC3B,SAAS,GACV,MAAM,kBAAkB,CAAC;AAW1B,OAAO,EAAE,YAAY,EAAE,YAAY,EAAE,MAAM,mBAAmB,CAAC;AAG/D,OAAO,EAAE,sBAAsB,EAAE,gBAAgB,EAAE,MAAM,oBAAoB,CAAC;AAG9E,OAAO,EAAE,GAAG,EAAE,EAAE,EAAE,MAAM,aAAa,CAAC;AAGtC,OAAO,EAAE,iBAAiB,EAAE,UAAU,EAAE,MAAM,yBAAyB,CAAC;AASxE,OAAO,EAAE,cAAc,EAAE,MAAM,oBAAoB,CAAC;AAGpD,OAAO,EAAE,OAAO,EAAE,MAAM,cAAc,CAAC;AAGvC,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AAGjC,OAAO,EAAE,OAAO,EAAE,SAAS,EAAE,MAAM,oBAAoB,CAAC;AAExD,OAAO,EAAE,aAAa,EAAE,aAAa,EAAE,gBAAgB,EAAE,MAAM,mBAAmB,CAAC;AAGnF,OAAO,EAAE,WAAW,EAAE,MAAM,iBAAiB,CAAC;AAG9C,OAAO,EAAE,iBAAiB,EAAE,MAAM,qBAAqB,CAAC;AAGxD,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACtD,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AAStD,OAAO,EAAE,KAAK,EAAE,MAAM,kBAAkB,CAAC;AASzC,OAAO,EAAE,OAAO,EAAE,aAAa,EAAE,MAAM,EAAE,MAAM,iBAAiB,CAAC;AAQjE,OAAO,EAAE,cAAc,EAAE,KAAK,EAAE,MAAM,kBAAkB,CAAC;AAGzD,OAAO,EAAE,cAAc,EAAE,iBAAiB,EAAE,cAAc,EAAE,MAAM,gBAAgB,CAAC"}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@jean.gnc/harness-kit",
|
|
3
|
-
"version": "0.11.2",
|
|
3
|
+
"version": "0.12.1",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"description": "Build your own multi-agent harness: typed toolkit for authoring plugins (skills, agents, commands, hooks) and shipping them to Claude Code and Codex from a single source tree.",
|
|
6
6
|
"license": "MIT",
|