vskill 0.5.12 → 0.5.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/commands/eval/credentials.d.ts +12 -0
- package/dist/commands/eval/credentials.js +140 -0
- package/dist/commands/eval/credentials.js.map +1 -0
- package/dist/commands/eval/generate-all.d.ts +1 -1
- package/dist/commands/eval/generate-all.js +57 -12
- package/dist/commands/eval/generate-all.js.map +1 -1
- package/dist/commands/eval/init.d.ts +2 -1
- package/dist/commands/eval/init.js +76 -10
- package/dist/commands/eval/init.js.map +1 -1
- package/dist/commands/eval/run.d.ts +7 -1
- package/dist/commands/eval/run.js +207 -26
- package/dist/commands/eval/run.js.map +1 -1
- package/dist/commands/eval/sweep.d.ts +7 -0
- package/dist/commands/eval/sweep.js +99 -0
- package/dist/commands/eval/sweep.js.map +1 -0
- package/dist/commands/eval.d.ts +10 -0
- package/dist/commands/eval.js +62 -4
- package/dist/commands/eval.js.map +1 -1
- package/dist/eval/batch-judge.d.ts +27 -0
- package/dist/eval/batch-judge.js +242 -0
- package/dist/eval/batch-judge.js.map +1 -0
- package/dist/eval/chrome-profile.d.ts +16 -0
- package/dist/eval/chrome-profile.js +65 -0
- package/dist/eval/chrome-profile.js.map +1 -0
- package/dist/eval/comparator.d.ts +3 -1
- package/dist/eval/comparator.js +19 -3
- package/dist/eval/comparator.js.map +1 -1
- package/dist/eval/concurrency.d.ts +13 -0
- package/dist/eval/concurrency.js +53 -0
- package/dist/eval/concurrency.js.map +1 -0
- package/dist/eval/credential-resolver.d.ts +31 -0
- package/dist/eval/credential-resolver.js +111 -0
- package/dist/eval/credential-resolver.js.map +1 -0
- package/dist/eval/integration-runner.d.ts +12 -0
- package/dist/eval/integration-runner.js +303 -0
- package/dist/eval/integration-runner.js.map +1 -0
- package/dist/eval/integration-types.d.ts +65 -0
- package/dist/eval/integration-types.js +18 -0
- package/dist/eval/integration-types.js.map +1 -0
- package/dist/eval/judge-cache.d.ts +29 -0
- package/dist/eval/judge-cache.js +109 -0
- package/dist/eval/judge-cache.js.map +1 -0
- package/dist/eval/judge.d.ts +1 -1
- package/dist/eval/judge.js +20 -3
- package/dist/eval/judge.js.map +1 -1
- package/dist/eval/llm.d.ts +2 -1
- package/dist/eval/llm.js +54 -2
- package/dist/eval/llm.js.map +1 -1
- package/dist/eval/prompt-builder.d.ts +10 -0
- package/dist/eval/prompt-builder.js +167 -0
- package/dist/eval/prompt-builder.js.map +1 -1
- package/dist/eval/rate-limiter.d.ts +20 -0
- package/dist/eval/rate-limiter.js +62 -0
- package/dist/eval/rate-limiter.js.map +1 -0
- package/dist/eval/schema.d.ts +14 -0
- package/dist/eval/schema.js +55 -5
- package/dist/eval/schema.js.map +1 -1
- package/dist/eval-server/api-routes.js +71 -2
- package/dist/eval-server/api-routes.js.map +1 -1
- package/dist/eval-server/benchmark-runner.d.ts +7 -0
- package/dist/eval-server/benchmark-runner.js +158 -42
- package/dist/eval-server/benchmark-runner.js.map +1 -1
- package/dist/eval-server/concurrency.d.ts +1 -13
- package/dist/eval-server/concurrency.js +3 -49
- package/dist/eval-server/concurrency.js.map +1 -1
- package/dist/eval-server/eval-server.js +4 -0
- package/dist/eval-server/eval-server.js.map +1 -1
- package/dist/eval-server/integration-routes.d.ts +2 -0
- package/dist/eval-server/integration-routes.js +100 -0
- package/dist/eval-server/integration-routes.js.map +1 -0
- package/dist/eval-server/skill-create-routes.js +151 -22
- package/dist/eval-server/skill-create-routes.js.map +1 -1
- package/dist/eval-server/sweep-routes.d.ts +2 -0
- package/dist/eval-server/sweep-routes.js +93 -0
- package/dist/eval-server/sweep-routes.js.map +1 -0
- package/dist/eval-server/sweep-runner.d.ts +93 -0
- package/dist/eval-server/sweep-runner.js +275 -0
- package/dist/eval-server/sweep-runner.js.map +1 -0
- package/dist/eval-ui/assets/index-KfkLPyh3.js +74 -0
- package/dist/eval-ui/index.html +1 -1
- package/dist/index.js +8 -0
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/eval-ui/assets/index-C7MIPqI-.js +0 -74
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* vskill credentials set <KEY> -- prompt for value and store in .env.local
|
|
3
|
+
*/
|
|
4
|
+
export declare function runCredentialsSet(skillDir: string, key: string): Promise<void>;
|
|
5
|
+
/**
|
|
6
|
+
* vskill credentials list -- show all credentials referenced by integration tests
|
|
7
|
+
*/
|
|
8
|
+
export declare function runCredentialsList(skillDir: string): Promise<void>;
|
|
9
|
+
/**
|
|
10
|
+
* vskill credentials check -- resolve each credential and report source
|
|
11
|
+
*/
|
|
12
|
+
export declare function runCredentialsCheck(skillDir: string): Promise<void>;
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
// ---------------------------------------------------------------------------
|
|
2
|
+
// vskill credentials -- manage credentials for integration tests
|
|
3
|
+
// ---------------------------------------------------------------------------
|
|
4
|
+
import { createInterface } from "node:readline";
|
|
5
|
+
import { resolveCredential, resolveAllCredentials, writeCredential } from "../../eval/credential-resolver.js";
|
|
6
|
+
import { loadAndValidateEvals } from "../../eval/schema.js";
|
|
7
|
+
import { green, red, yellow, dim, bold, table } from "../../utils/output.js";
|
|
8
|
+
/**
 * vskill credentials set <KEY>
 *
 * Asks the user for the value of `key` (via promptHidden), hands it to
 * writeCredential, and then re-resolves the key to report which source it
 * is now served from.
 *
 * @param skillDir Skill directory passed through to writeCredential / resolveCredential.
 * @param key Name of the credential being set.
 * @returns A promise that settles once the prompt and the follow-up reporting are done.
 */
export async function runCredentialsSet(skillDir, key) {
    const secret = await promptHidden(`Enter value for ${key}: `);
    if (secret) {
        writeCredential(skillDir, key, secret);
        console.log(green(`${key} saved to .env.local`));
        // Read the credential back so the user sees where it resolves from.
        const resolved = resolveCredential(key, skillDir);
        if (!resolved) {
            return;
        }
        console.log(dim(`Verified: resolves from ${resolved.source}`));
    }
    else {
        // Empty answer (or an aborted prompt): nothing is written.
        console.error(red("No value provided. Aborted."));
    }
}
|
|
25
|
+
/**
 * vskill credentials list
 *
 * Prints a NAME / STATUS / SOURCE table for every credential that the
 * skill's integration evals declare in `requiredCredentials`.
 *
 * @param skillDir Skill directory whose evals are inspected.
 * @returns A promise that settles after the table (or the "nothing found" notice) is printed.
 */
export async function runCredentialsList(skillDir) {
    const required = collectRequiredCredentials(skillDir);
    if (required.length === 0) {
        console.log(dim("No integration test credentials found in evals."));
        return;
    }
    const tableRows = [];
    for (const entry of resolveAllCredentials(required, skillDir)) {
        // Anything that is not "ready" is displayed as Missing.
        const badge = entry.status === "ready" ? green("Ready") : red("Missing");
        tableRows.push([entry.name, badge, entry.source ?? "-"]);
    }
    console.log(bold("\nCredential Status\n"));
    console.log(table(["NAME", "STATUS", "SOURCE"], tableRows));
}
|
|
43
|
+
/**
 * vskill credentials check
 *
 * Resolves every credential required by the skill's integration evals,
 * prints a table with a human-readable source label, and then lists the
 * `vskill credentials set` command for each credential whose status is
 * "missing".
 *
 * @param skillDir Skill directory whose evals are inspected.
 * @returns A promise that settles after all output has been printed.
 */
export async function runCredentialsCheck(skillDir) {
    const required = collectRequiredCredentials(skillDir);
    if (required.length === 0) {
        console.log(dim("No integration test credentials found in evals."));
        return;
    }
    // Display labels for the two sources this command knows about.
    const sourceLabels = new Map([
        ["env", "Environment variable"],
        ["dotenv", ".env.local"],
    ]);
    const tableRows = [];
    const missingNames = [];
    for (const entry of resolveAllCredentials(required, skillDir)) {
        const badge = entry.status === "ready" ? green("Ready") : red("Missing");
        tableRows.push([entry.name, badge, sourceLabels.get(entry.source) ?? "-"]);
        if (entry.status === "missing") {
            missingNames.push(entry.name);
        }
    }
    console.log(bold("\nCredential Resolution Check\n"));
    console.log(table(["NAME", "STATUS", "SOURCE"], tableRows));
    if (missingNames.length === 0) {
        return;
    }
    console.log(yellow(`\n${missingNames.length} credential(s) missing. Set them with:`));
    for (const name of missingNames) {
        console.log(dim(`  vskill credentials set ${name}`));
    }
}
|
|
72
|
+
// ---------------------------------------------------------------------------
|
|
73
|
+
// Helpers
|
|
74
|
+
// ---------------------------------------------------------------------------
|
|
75
|
+
function collectRequiredCredentials(skillDir) {
|
|
76
|
+
try {
|
|
77
|
+
const evalsFile = loadAndValidateEvals(skillDir);
|
|
78
|
+
const allCreds = new Set();
|
|
79
|
+
for (const evalCase of evalsFile.evals) {
|
|
80
|
+
if (evalCase.testType === "integration" && evalCase.requiredCredentials) {
|
|
81
|
+
for (const cred of evalCase.requiredCredentials) {
|
|
82
|
+
allCreds.add(cred);
|
|
83
|
+
}
|
|
84
|
+
}
|
|
85
|
+
}
|
|
86
|
+
return [...allCreds].sort();
|
|
87
|
+
}
|
|
88
|
+
catch {
|
|
89
|
+
return [];
|
|
90
|
+
}
|
|
91
|
+
}
|
|
92
|
+
/**
 * Prints `question` and reads one line of input, without echoing it when
 * stdin is a TTY.
 *
 * TTY mode: stdin is switched to raw mode and read directly. No readline
 * interface is created in this branch, because a readline interface bound to
 * a terminal output echoes every keypress, which would reveal the secret.
 * Each `data` chunk is processed character by character so that pasted text
 * (several characters, possibly including the newline, in one chunk) is
 * handled correctly.
 *
 * Non-TTY mode (piped input): falls back to a normal readline question.
 *
 * @param question Prompt text written to stdout.
 * @returns Promise resolving to the entered text; "" when the user aborts
 *          with Ctrl+C or when stdin ends before any line is read.
 */
function promptHidden(question) {
    return new Promise((resolve) => {
        const stdin = process.stdin;
        if (!stdin.isTTY) {
            // Non-TTY: just use readline normally
            const rl = createInterface({
                input: stdin,
                output: process.stdout,
            });
            // If stdin hits EOF before a line arrives the question callback never
            // fires; settle with "" instead of leaving the promise pending.
            rl.on("close", () => resolve(""));
            rl.question(question, (answer) => {
                // Resolve first: closing emits "close", whose resolve("") is then a no-op.
                resolve(answer);
                rl.close();
            });
            return;
        }
        process.stdout.write(question);
        const wasRaw = stdin.isRaw;
        stdin.setRawMode(true);
        stdin.resume();
        // Collected as code points so Backspace removes one whole character.
        const typed = [];
        const finish = (value) => {
            stdin.setRawMode(wasRaw ?? false);
            stdin.removeListener("data", onData);
            // Stop reading so a pending stdin handle does not keep the process alive.
            stdin.pause();
            process.stdout.write("\n");
            resolve(value);
        };
        const onData = (chunk) => {
            for (const c of chunk.toString()) {
                if (c === "\n" || c === "\r" || c === "\u0004") {
                    // Enter, or Ctrl+D (end of input): submit what was typed.
                    finish(typed.join(""));
                    return;
                }
                if (c === "\u0003") {
                    // Ctrl+C: abort with an empty value.
                    finish("");
                    return;
                }
                if (c === "\u007F" || c === "\b") {
                    // Backspace
                    typed.pop();
                }
                else {
                    typed.push(c);
                }
            }
        };
        stdin.on("data", onData);
    });
}
|
|
140
|
+
//# sourceMappingURL=credentials.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"credentials.js","sourceRoot":"","sources":["../../../src/commands/eval/credentials.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,iEAAiE;AACjE,8EAA8E;AAE9E,OAAO,EAAE,eAAe,EAAE,MAAM,eAAe,CAAC;AAGhD,OAAO,EAAE,iBAAiB,EAAE,qBAAqB,EAAE,eAAe,EAAE,MAAM,mCAAmC,CAAC;AAC9G,OAAO,EAAE,oBAAoB,EAAE,MAAM,sBAAsB,CAAC;AAC5D,OAAO,EAAE,KAAK,EAAE,GAAG,EAAE,MAAM,EAAE,GAAG,EAAE,IAAI,EAAE,KAAK,EAAE,MAAM,uBAAuB,CAAC;AAE7E;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,iBAAiB,CAAC,QAAgB,EAAE,GAAW;IACnE,MAAM,KAAK,GAAG,MAAM,YAAY,CAAC,mBAAmB,GAAG,IAAI,CAAC,CAAC;IAC7D,IAAI,CAAC,KAAK,EAAE,CAAC;QACX,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,6BAA6B,CAAC,CAAC,CAAC;QAClD,OAAO;IACT,CAAC;IAED,eAAe,CAAC,QAAQ,EAAE,GAAG,EAAE,KAAK,CAAC,CAAC;IACtC,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,GAAG,GAAG,sBAAsB,CAAC,CAAC,CAAC;IAEjD,qBAAqB;IACrB,MAAM,MAAM,GAAG,iBAAiB,CAAC,GAAG,EAAE,QAAQ,CAAC,CAAC;IAChD,IAAI,MAAM,EAAE,CAAC;QACX,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,2BAA2B,MAAM,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;IAC/D,CAAC;AACH,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,kBAAkB,CAAC,QAAgB;IACvD,MAAM,KAAK,GAAG,0BAA0B,CAAC,QAAQ,CAAC,CAAC;IACnD,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACvB,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,iDAAiD,CAAC,CAAC,CAAC;QACpE,OAAO;IACT,CAAC;IAED,MAAM,QAAQ,GAAG,qBAAqB,CAAC,KAAK,EAAE,QAAQ,CAAC,CAAC;IACxD,MAAM,IAAI,GAAG,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC;QAC/B,CAAC,CAAC,IAAI;QACN,CAAC,CAAC,MAAM,KAAK,OAAO,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC;QACtD,CAAC,CAAC,MAAM,IAAI,GAAG;KAChB,CAAC,CAAC;IAEH,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,uBAAuB,CAAC,CAAC,CAAC;IAC3C,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC,MAAM,EAAE,QAAQ,EAAE,QAAQ,CAAC,EAAE,IAAI,CAAC,CAAC,CAAC;AACzD,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,mBAAmB,CAAC,QAAgB;IACxD,MAAM,KAAK,GAAG,0BAA0B,CAAC,QAAQ,CAAC,CAAC;IACnD,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACvB,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,iDAAiD,CAAC,CAAC,CAAC;QACpE,OAAO;IACT,CAAC;IAED,MAAM,QAAQ,GAAG,qBAAqB,CAAC,KAAK,EAAE,QAAQ,CAAC,CAAC;IACxD,MAAM,IAAI,GAAG,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC;QA
C/B,CAAC,CAAC,IAAI;QACN,CAAC,CAAC,MAAM,KAAK,OAAO,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC;QACtD,CAAC,CAAC,MAAM,KAAK,KAAK;YAChB,CAAC,CAAC,sBAAsB;YACxB,CAAC,CAAC,CAAC,CAAC,MAAM,KAAK,QAAQ;gBACrB,CAAC,CAAC,YAAY;gBACd,CAAC,CAAC,GAAG;KACV,CAAC,CAAC;IAEH,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,iCAAiC,CAAC,CAAC,CAAC;IACrD,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC,MAAM,EAAE,QAAQ,EAAE,QAAQ,CAAC,EAAE,IAAI,CAAC,CAAC,CAAC;IAEvD,MAAM,OAAO,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,KAAK,SAAS,CAAC,CAAC;IAC/D,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACvB,OAAO,CAAC,GAAG,CAAC,MAAM,CAAC,KAAK,OAAO,CAAC,MAAM,wCAAwC,CAAC,CAAC,CAAC;QACjF,KAAK,MAAM,CAAC,IAAI,OAAO,EAAE,CAAC;YACxB,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,4BAA4B,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC;QACzD,CAAC;IACH,CAAC;AACH,CAAC;AAED,8EAA8E;AAC9E,UAAU;AACV,8EAA8E;AAE9E,SAAS,0BAA0B,CAAC,QAAgB;IAClD,IAAI,CAAC;QACH,MAAM,SAAS,GAAG,oBAAoB,CAAC,QAAQ,CAAC,CAAC;QACjD,MAAM,QAAQ,GAAG,IAAI,GAAG,EAAU,CAAC;QACnC,KAAK,MAAM,QAAQ,IAAI,SAAS,CAAC,KAAK,EAAE,CAAC;YACvC,IAAI,QAAQ,CAAC,QAAQ,KAAK,aAAa,IAAI,QAAQ,CAAC,mBAAmB,EAAE,CAAC;gBACxE,KAAK,MAAM,IAAI,IAAI,QAAQ,CAAC,mBAAmB,EAAE,CAAC;oBAChD,QAAQ,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;gBACrB,CAAC;YACH,CAAC;QACH,CAAC;QACD,OAAO,CAAC,GAAG,QAAQ,CAAC,CAAC,IAAI,EAAE,CAAC;IAC9B,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,EAAE,CAAC;IACZ,CAAC;AACH,CAAC;AAED,SAAS,YAAY,CAAC,QAAgB;IACpC,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,EAAE;QAC7B,MAAM,EAAE,GAAG,eAAe,CAAC;YACzB,KAAK,EAAE,OAAO,CAAC,KAAK;YACpB,MAAM,EAAE,OAAO,CAAC,MAAM;SACvB,CAAC,CAAC;QAEH,4CAA4C;QAC5C,IAAI,OAAO,CAAC,KAAK,CAAC,KAAK,EAAE,CAAC;YACxB,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;YAC/B,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC;YAC5B,MAAM,MAAM,GAAG,KAAK,CAAC,KAAK,CAAC;YAC3B,KAAK,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC;YACvB,IAAI,KAAK,GAAG,EAAE,CAAC;YACf,MAAM,MAAM,GAAG,CAAC,IAAY,EAAE,EAAE;gBAC9B,MAAM,CAAC,GAAG,IAAI,CAAC,QAAQ,EAAE,CAAC;gBAC1B,IAAI,CAAC,KAAK,IAAI,IAAI,CAAC,KAAK,IAAI,EAAE,CAAC;oBAC7B,KAAK,CAAC,UAAU,CAAC,MAAM,IAAI,KAAK,CAAC,CAAC;oBAClC,KAAK,CAAC,cAAc,CAAC,MA
AM,EAAE,MAAM,CAAC,CAAC;oBACrC,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;oBAC3B,EAAE,CAAC,KAAK,EAAE,CAAC;oBACX,OAAO,CAAC,KAAK,CAAC,CAAC;gBACjB,CAAC;qBAAM,IAAI,CAAC,KAAK,QAAQ,EAAE,CAAC;oBAC1B,SAAS;oBACT,KAAK,CAAC,UAAU,CAAC,MAAM,IAAI,KAAK,CAAC,CAAC;oBAClC,KAAK,CAAC,cAAc,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;oBACrC,EAAE,CAAC,KAAK,EAAE,CAAC;oBACX,OAAO,CAAC,EAAE,CAAC,CAAC;gBACd,CAAC;qBAAM,IAAI,CAAC,KAAK,QAAQ,IAAI,CAAC,KAAK,IAAI,EAAE,CAAC;oBACxC,YAAY;oBACZ,KAAK,GAAG,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;gBAC7B,CAAC;qBAAM,CAAC;oBACN,KAAK,IAAI,CAAC,CAAC;gBACb,CAAC;YACH,CAAC,CAAC;YACF,KAAK,CAAC,EAAE,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;QAC3B,CAAC;aAAM,CAAC;YACN,sCAAsC;YACtC,EAAE,CAAC,QAAQ,CAAC,QAAQ,EAAE,CAAC,MAAM,EAAE,EAAE;gBAC/B,EAAE,CAAC,KAAK,EAAE,CAAC;gBACX,OAAO,CAAC,MAAM,CAAC,CAAC;YAClB,CAAC,CAAC,CAAC;QACL,CAAC;IACH,CAAC,CAAC,CAAC;AACL,CAAC"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
export declare function runEvalGenerateAll(root: string, force: boolean): Promise<void>;
|
|
1
|
+
export declare function runEvalGenerateAll(root: string, force: boolean, concurrency?: number): Promise<void>;
|
|
@@ -6,48 +6,93 @@ import { join } from "node:path";
|
|
|
6
6
|
import { scanSkills } from "../../eval/skill-scanner.js";
|
|
7
7
|
import { createLlmClient } from "../../eval/llm.js";
|
|
8
8
|
import { buildEvalInitPrompt, parseGeneratedEvals } from "../../eval/prompt-builder.js";
|
|
9
|
+
import { Semaphore } from "../../eval/concurrency.js";
|
|
9
10
|
import { green, red, yellow, bold, dim } from "../../utils/output.js";
|
|
10
|
-
|
|
11
|
+
/** Provider ids for which resolveConcurrency defaults to a concurrency of 1. */
const CLI_PROVIDERS = new Set(["claude-cli", "codex-cli", "gemini-cli"]);
/**
 * Chooses the LLM provider for a batch run.
 *
 * Precedence: the VSKILL_EVAL_PROVIDER environment variable, then
 * "anthropic" when ANTHROPIC_API_KEY is set, otherwise "claude-cli".
 *
 * @returns `{ provider, autoSelected }`; `autoSelected` is true only when
 *          "anthropic" was picked because an API key was present.
 */
function resolveProvider() {
    const explicit = process.env.VSKILL_EVAL_PROVIDER;
    if (explicit) {
        return { provider: explicit, autoSelected: false };
    }
    // Auto-select anthropic for batch ops when API key is available
    if (process.env.ANTHROPIC_API_KEY) {
        return { provider: "anthropic", autoSelected: true };
    }
    return { provider: "claude-cli", autoSelected: false };
}
/**
 * Determines how many generations may run at once.
 *
 * @param explicitConcurrency Caller-supplied value, or undefined to use the
 *        provider default (1 for CLI_PROVIDERS, 3 otherwise).
 * @param provider Provider id returned by resolveProvider.
 * @returns A whole number >= 1. A value that is not a number (NaN, e.g. an
 *          unparsable CLI flag) falls back to the provider default instead of
 *          leaking NaN into the semaphore; fractional values are floored.
 */
function resolveConcurrency(explicitConcurrency, provider) {
    const providerDefault = CLI_PROVIDERS.has(provider) ? 1 : 3;
    if (explicitConcurrency === undefined) {
        return providerDefault;
    }
    const requested = Number(explicitConcurrency);
    // Math.max(1, NaN) is NaN, so NaN must be rejected before clamping.
    if (Number.isNaN(requested)) {
        return providerDefault;
    }
    return Math.max(1, Math.floor(requested));
}
|
|
27
|
+
export async function runEvalGenerateAll(root, force, concurrency) {
|
|
11
28
|
const skills = await scanSkills(root);
|
|
12
29
|
if (skills.length === 0) {
|
|
13
30
|
console.log(dim("No skills found in " + root));
|
|
14
31
|
return;
|
|
15
32
|
}
|
|
16
|
-
const
|
|
33
|
+
const { provider, autoSelected } = resolveProvider();
|
|
34
|
+
const effectiveConcurrency = resolveConcurrency(concurrency, provider);
|
|
35
|
+
if (autoSelected) {
|
|
36
|
+
console.log(dim("Auto-selected anthropic provider for batch operation"));
|
|
37
|
+
}
|
|
38
|
+
console.log(dim(`Provider: ${provider} | Concurrency: ${effectiveConcurrency}`));
|
|
39
|
+
const client = createLlmClient({ provider });
|
|
40
|
+
const sem = new Semaphore(effectiveConcurrency);
|
|
17
41
|
let generated = 0;
|
|
18
42
|
let skipped = 0;
|
|
19
43
|
let failed = 0;
|
|
20
44
|
const failedPaths = [];
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
// Skip if evals already exist and not forcing
|
|
45
|
+
// Filter skills that need generation
|
|
46
|
+
const toGenerate = skills.filter((skill) => {
|
|
24
47
|
if (skill.hasEvals && !force) {
|
|
25
48
|
skipped++;
|
|
26
|
-
|
|
49
|
+
return false;
|
|
27
50
|
}
|
|
28
51
|
const skillMdPath = join(skill.dir, "SKILL.md");
|
|
29
52
|
if (!existsSync(skillMdPath)) {
|
|
30
53
|
failed++;
|
|
31
54
|
failedPaths.push(`${skill.plugin}/${skill.skill} (no SKILL.md)`);
|
|
32
|
-
|
|
55
|
+
return false;
|
|
33
56
|
}
|
|
57
|
+
return true;
|
|
58
|
+
});
|
|
59
|
+
// Process all skills concurrently with semaphore gating
|
|
60
|
+
const results = await Promise.allSettled(toGenerate.map(async (skill) => {
|
|
61
|
+
await sem.acquire();
|
|
34
62
|
try {
|
|
63
|
+
const skillMdPath = join(skill.dir, "SKILL.md");
|
|
64
|
+
const evalsPath = join(skill.dir, "evals", "evals.json");
|
|
35
65
|
const skillContent = readFileSync(skillMdPath, "utf-8");
|
|
36
66
|
const prompt = buildEvalInitPrompt(skillContent);
|
|
37
67
|
const genResult = await client.generate("You generate eval test cases for AI skills. Output only valid JSON in a code fence.", prompt);
|
|
38
68
|
const evalsFile = parseGeneratedEvals(genResult.text);
|
|
39
69
|
mkdirSync(join(skill.dir, "evals"), { recursive: true });
|
|
40
70
|
writeFileSync(evalsPath, JSON.stringify(evalsFile, null, 2), "utf-8");
|
|
41
|
-
generated++;
|
|
42
71
|
console.log(green(` Generated: ${skill.plugin}/${skill.skill}`));
|
|
43
|
-
|
|
44
|
-
await new Promise((r) => setTimeout(r, 2000));
|
|
72
|
+
return { skill, success: true };
|
|
45
73
|
}
|
|
46
74
|
catch (err) {
|
|
47
|
-
failed++;
|
|
48
|
-
failedPaths.push(`${skill.plugin}/${skill.skill}`);
|
|
49
75
|
console.error(red(` Failed: ${skill.plugin}/${skill.skill} - `) +
|
|
50
76
|
dim(err.message));
|
|
77
|
+
return { skill, success: false, error: err };
|
|
78
|
+
}
|
|
79
|
+
finally {
|
|
80
|
+
sem.release();
|
|
81
|
+
}
|
|
82
|
+
}));
|
|
83
|
+
// Tally results
|
|
84
|
+
for (const r of results) {
|
|
85
|
+
if (r.status === "fulfilled") {
|
|
86
|
+
if (r.value.success) {
|
|
87
|
+
generated++;
|
|
88
|
+
}
|
|
89
|
+
else {
|
|
90
|
+
failed++;
|
|
91
|
+
failedPaths.push(`${r.value.skill.plugin}/${r.value.skill.skill}`);
|
|
92
|
+
}
|
|
93
|
+
}
|
|
94
|
+
else {
|
|
95
|
+
failed++;
|
|
51
96
|
}
|
|
52
97
|
}
|
|
53
98
|
// Print summary
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"generate-all.js","sourceRoot":"","sources":["../../../src/commands/eval/generate-all.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,uEAAuE;AACvE,8EAA8E;AAE9E,OAAO,EAAE,YAAY,EAAE,aAAa,EAAE,UAAU,EAAE,SAAS,EAAE,MAAM,SAAS,CAAC;AAC7E,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,OAAO,EAAE,UAAU,EAAE,MAAM,6BAA6B,CAAC;AACzD,OAAO,EAAE,eAAe,EAAE,MAAM,mBAAmB,CAAC;
|
|
1
|
+
{"version":3,"file":"generate-all.js","sourceRoot":"","sources":["../../../src/commands/eval/generate-all.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,uEAAuE;AACvE,8EAA8E;AAE9E,OAAO,EAAE,YAAY,EAAE,aAAa,EAAE,UAAU,EAAE,SAAS,EAAE,MAAM,SAAS,CAAC;AAC7E,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,OAAO,EAAE,UAAU,EAAE,MAAM,6BAA6B,CAAC;AACzD,OAAO,EAAE,eAAe,EAAE,MAAM,mBAAmB,CAAC;AAEpD,OAAO,EAAE,mBAAmB,EAAE,mBAAmB,EAAE,MAAM,8BAA8B,CAAC;AACxF,OAAO,EAAE,SAAS,EAAE,MAAM,2BAA2B,CAAC;AACtD,OAAO,EAAE,KAAK,EAAE,GAAG,EAAE,MAAM,EAAE,IAAI,EAAE,GAAG,EAAE,MAAM,uBAAuB,CAAC;AAEtE,MAAM,aAAa,GAAG,IAAI,GAAG,CAAe,CAAC,YAAY,EAAE,WAAW,EAAE,YAAY,CAAC,CAAC,CAAC;AAEvF,SAAS,eAAe;IACtB,MAAM,QAAQ,GAAG,OAAO,CAAC,GAAG,CAAC,oBAAgD,CAAC;IAC9E,IAAI,QAAQ;QAAE,OAAO,EAAE,QAAQ,EAAE,QAAQ,EAAE,YAAY,EAAE,KAAK,EAAE,CAAC;IAEjE,gEAAgE;IAChE,IAAI,OAAO,CAAC,GAAG,CAAC,iBAAiB,EAAE,CAAC;QAClC,OAAO,EAAE,QAAQ,EAAE,WAAW,EAAE,YAAY,EAAE,IAAI,EAAE,CAAC;IACvD,CAAC;IAED,OAAO,EAAE,QAAQ,EAAE,YAAY,EAAE,YAAY,EAAE,KAAK,EAAE,CAAC;AACzD,CAAC;AAED,SAAS,kBAAkB,CACzB,mBAAuC,EACvC,QAAsB;IAEtB,IAAI,mBAAmB,KAAK,SAAS;QAAE,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,mBAAmB,CAAC,CAAC;IAC/E,OAAO,aAAa,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;AAC7C,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,kBAAkB,CACtC,IAAY,EACZ,KAAc,EACd,WAAoB;IAEpB,MAAM,MAAM,GAAG,MAAM,UAAU,CAAC,IAAI,CAAC,CAAC;IAEtC,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACxB,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,qBAAqB,GAAG,IAAI,CAAC,CAAC,CAAC;QAC/C,OAAO;IACT,CAAC;IAED,MAAM,EAAE,QAAQ,EAAE,YAAY,EAAE,GAAG,eAAe,EAAE,CAAC;IACrD,MAAM,oBAAoB,GAAG,kBAAkB,CAAC,WAAW,EAAE,QAAQ,CAAC,CAAC;IAEvE,IAAI,YAAY,EAAE,CAAC;QACjB,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,sDAAsD,CAAC,CAAC,CAAC;IAC3E,CAAC;IACD,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,aAAa,QAAQ,mBAAmB,oBAAoB,EAAE,CAAC,CAAC,CAAC;IAEjF,MAAM,MAAM,GAAG,eAAe,CAAC,EAAE,QAAQ,EAAE,CAAC,CAAC;IAC7C,MAAM,GAAG,GAAG,IAAI,SAAS,CAAC,oBAAoB,CAAC,CAAC;IAChD,IAAI,SAAS,GAAG,CAAC,CAAC;IAClB,IAAI,OAAO,GAAG,CAAC,CAAC;IAChB,IAAI,MAAM,GAAG,CAAC,CAAC;IACf,MAAM,WAAW,GAAa,EAAE,CAAC;IAEjC,qCAAqC;IACrC,MAA
M,UAAU,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,KAAK,EAAE,EAAE;QACzC,IAAI,KAAK,CAAC,QAAQ,IAAI,CAAC,KAAK,EAAE,CAAC;YAC7B,OAAO,EAAE,CAAC;YACV,OAAO,KAAK,CAAC;QACf,CAAC;QACD,MAAM,WAAW,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,EAAE,UAAU,CAAC,CAAC;QAChD,IAAI,CAAC,UAAU,CAAC,WAAW,CAAC,EAAE,CAAC;YAC7B,MAAM,EAAE,CAAC;YACT,WAAW,CAAC,IAAI,CAAC,GAAG,KAAK,CAAC,MAAM,IAAI,KAAK,CAAC,KAAK,gBAAgB,CAAC,CAAC;YACjE,OAAO,KAAK,CAAC;QACf,CAAC;QACD,OAAO,IAAI,CAAC;IACd,CAAC,CAAC,CAAC;IAEH,wDAAwD;IACxD,MAAM,OAAO,GAAG,MAAM,OAAO,CAAC,UAAU,CACtC,UAAU,CAAC,GAAG,CAAC,KAAK,EAAE,KAAK,EAAE,EAAE;QAC7B,MAAM,GAAG,CAAC,OAAO,EAAE,CAAC;QACpB,IAAI,CAAC;YACH,MAAM,WAAW,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,EAAE,UAAU,CAAC,CAAC;YAChD,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,EAAE,OAAO,EAAE,YAAY,CAAC,CAAC;YACzD,MAAM,YAAY,GAAG,YAAY,CAAC,WAAW,EAAE,OAAO,CAAC,CAAC;YACxD,MAAM,MAAM,GAAG,mBAAmB,CAAC,YAAY,CAAC,CAAC;YAEjD,MAAM,SAAS,GAAG,MAAM,MAAM,CAAC,QAAQ,CACrC,qFAAqF,EACrF,MAAM,CACP,CAAC;YAEF,MAAM,SAAS,GAAG,mBAAmB,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC;YAEtD,SAAS,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,EAAE,OAAO,CAAC,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;YACzD,aAAa,CAAC,SAAS,EAAE,IAAI,CAAC,SAAS,CAAC,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC;YAEtE,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,gBAAgB,KAAK,CAAC,MAAM,IAAI,KAAK,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;YAClE,OAAO,EAAE,KAAK,EAAE,OAAO,EAAE,IAAa,EAAE,CAAC;QAC3C,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACb,OAAO,CAAC,KAAK,CACX,GAAG,CAAC,aAAa,KAAK,CAAC,MAAM,IAAI,KAAK,CAAC,KAAK,KAAK,CAAC;gBAChD,GAAG,CAAE,GAAa,CAAC,OAAO,CAAC,CAC9B,CAAC;YACF,OAAO,EAAE,KAAK,EAAE,OAAO,EAAE,KAAc,EAAE,KAAK,EAAE,GAAY,EAAE,CAAC;QACjE,CAAC;gBAAS,CAAC;YACT,GAAG,CAAC,OAAO,EAAE,CAAC;QAChB,CAAC;IACH,CAAC,CAAC,CACH,CAAC;IAEF,gBAAgB;IAChB,KAAK,MAAM,CAAC,IAAI,OAAO,EAAE,CAAC;QACxB,IAAI,CAAC,CAAC,MAAM,KAAK,WAAW,EAAE,CAAC;YAC7B,IAAI,CAAC,CAAC,KAAK,CAAC,OAAO,EAAE,CAAC;gBACpB,SAAS,EAAE,CAAC;YACd,CAAC;iBAAM,CAAC;gBACN,MAAM,EAAE,CAAC;gBACT,WAAW,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,MAAM,IAAI,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,KAAK,EAAE,CAAC,CAAC;YACrE,CAAC;QACH,CAAC;aAAM,CAAC
;YACN,MAAM,EAAE,CAAC;QACX,CAAC;IACH,CAAC;IAED,gBAAgB;IAChB,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,4BAA4B,CAAC,CAAC,CAAC;IAChD,OAAO,CAAC,GAAG,CAAC,cAAc,MAAM,CAAC,MAAM,EAAE,CAAC,CAAC;IAC3C,OAAO,CAAC,GAAG,CAAC,KAAK,KAAK,CAAC,cAAc,SAAS,EAAE,CAAC,EAAE,CAAC,CAAC;IACrD,OAAO,CAAC,GAAG,CAAC,KAAK,MAAM,CAAC,YAAY,OAAO,EAAE,CAAC,EAAE,CAAC,CAAC;IAClD,OAAO,CAAC,GAAG,CAAC,KAAK,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,WAAW,MAAM,EAAE,CAAC,CAAC,CAAC,CAAC,WAAW,MAAM,EAAE,EAAE,CAAC,CAAC;IAEhF,IAAI,WAAW,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC3B,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,kBAAkB,CAAC,CAAC,CAAC;QACrC,KAAK,MAAM,IAAI,IAAI,WAAW,EAAE,CAAC;YAC/B,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,OAAO,IAAI,EAAE,CAAC,CAAC,CAAC;QAClC,CAAC;IACH,CAAC;AACH,CAAC"}
|
|
@@ -1 +1,2 @@
|
|
|
1
|
-
export
|
|
1
|
+
export type EvalInitType = "unit" | "integration" | "all";
|
|
2
|
+
export declare function runEvalInit(skillDir: string, force: boolean, type?: EvalInitType): Promise<void>;
|
|
@@ -4,9 +4,10 @@
|
|
|
4
4
|
import { readFileSync, writeFileSync, existsSync, mkdirSync } from "node:fs";
|
|
5
5
|
import { join } from "node:path";
|
|
6
6
|
import { createLlmClient } from "../../eval/llm.js";
|
|
7
|
-
import { buildEvalInitPrompt, parseGeneratedEvals } from "../../eval/prompt-builder.js";
|
|
7
|
+
import { buildEvalInitPrompt, buildIntegrationEvalPrompt, parseGeneratedEvals, parseGeneratedIntegrationEvals, detectBrowserRequirements, detectPlatformTargets, } from "../../eval/prompt-builder.js";
|
|
8
|
+
import { detectMcpDependencies } from "../../eval/mcp-detector.js";
|
|
8
9
|
import { green, red, dim, yellow } from "../../utils/output.js";
|
|
9
|
-
export async function runEvalInit(skillDir, force) {
|
|
10
|
+
export async function runEvalInit(skillDir, force, type = "unit") {
|
|
10
11
|
const skillMdPath = join(skillDir, "SKILL.md");
|
|
11
12
|
const evalsDir = join(skillDir, "evals");
|
|
12
13
|
const evalsPath = join(evalsDir, "evals.json");
|
|
@@ -21,15 +22,80 @@ export async function runEvalInit(skillDir, force) {
|
|
|
21
22
|
return;
|
|
22
23
|
}
|
|
23
24
|
const skillContent = readFileSync(skillMdPath, "utf-8");
|
|
24
|
-
|
|
25
|
+
// Detect integration capabilities
|
|
26
|
+
const mcpDeps = detectMcpDependencies(skillContent);
|
|
27
|
+
const browserReqs = detectBrowserRequirements(skillContent);
|
|
28
|
+
const platforms = detectPlatformTargets(skillContent);
|
|
29
|
+
const hasIntegrationTargets = mcpDeps.length > 0 || browserReqs.hasBrowser;
|
|
30
|
+
// AC-US3-05: No integration targets + --type integration → skip
|
|
31
|
+
if (type === "integration" && !hasIntegrationTargets) {
|
|
32
|
+
console.log(dim("No integration targets detected, generating unit tests only"));
|
|
33
|
+
return;
|
|
34
|
+
}
|
|
25
35
|
try {
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
36
|
+
if (type === "all" && hasIntegrationTargets) {
|
|
37
|
+
// AC-US3-04: Parallel dispatch — unit (Haiku) + integration (Sonnet)
|
|
38
|
+
const unitPrompt = buildEvalInitPrompt(skillContent);
|
|
39
|
+
const integrationPrompt = buildIntegrationEvalPrompt(skillContent, mcpDeps, browserReqs, platforms);
|
|
40
|
+
const unitClient = createLlmClient({ model: "haiku" });
|
|
41
|
+
const integrationClient = createLlmClient({ model: "sonnet" });
|
|
42
|
+
const [unitResult, integrationResult] = await Promise.allSettled([
|
|
43
|
+
unitClient.generate("You generate eval test cases for AI skills. Output only valid JSON in a code fence.", unitPrompt),
|
|
44
|
+
integrationClient.generate("You generate integration eval test cases for AI skills. Output only valid JSON in a code fence.", integrationPrompt),
|
|
45
|
+
]);
|
|
46
|
+
const unitCases = unitResult.status === "fulfilled"
|
|
47
|
+
? parseGeneratedEvals(unitResult.value.text).evals
|
|
48
|
+
: [];
|
|
49
|
+
const integrationCases = integrationResult.status === "fulfilled"
|
|
50
|
+
? parseGeneratedIntegrationEvals(integrationResult.value.text)
|
|
51
|
+
: [];
|
|
52
|
+
if (unitResult.status === "rejected") {
|
|
53
|
+
console.log(yellow(`Unit eval generation failed: ${unitResult.reason}`));
|
|
54
|
+
}
|
|
55
|
+
if (integrationResult.status === "rejected") {
|
|
56
|
+
console.log(yellow(`Integration eval generation failed: ${integrationResult.reason}`));
|
|
57
|
+
}
|
|
58
|
+
const allCases = [...unitCases, ...integrationCases];
|
|
59
|
+
if (allCases.length === 0) {
|
|
60
|
+
console.error(red("Both unit and integration eval generation failed"));
|
|
61
|
+
return;
|
|
62
|
+
}
|
|
63
|
+
// Extract skill name from unit result or derive from directory
|
|
64
|
+
const skillName = unitResult.status === "fulfilled"
|
|
65
|
+
? parseGeneratedEvals(unitResult.value.text).skill_name
|
|
66
|
+
: skillDir.split("/").pop() || "unknown";
|
|
67
|
+
const evalsFile = { skill_name: skillName, evals: allCases };
|
|
68
|
+
mkdirSync(evalsDir, { recursive: true });
|
|
69
|
+
writeFileSync(evalsPath, JSON.stringify(evalsFile, null, 2), "utf-8");
|
|
70
|
+
const unitCount = unitCases.length;
|
|
71
|
+
const intgCount = integrationCases.length;
|
|
72
|
+
console.log(green(`Created ${evalsPath}`));
|
|
73
|
+
console.log(dim(` ${unitCount} unit + ${intgCount} integration cases, ${allCases.reduce((sum, e) => sum + (e.assertions?.length || 0), 0)} assertions`));
|
|
74
|
+
}
|
|
75
|
+
else if (type === "integration") {
|
|
76
|
+
// Integration only
|
|
77
|
+
const integrationPrompt = buildIntegrationEvalPrompt(skillContent, mcpDeps, browserReqs, platforms);
|
|
78
|
+
const client = createLlmClient({ model: "sonnet" });
|
|
79
|
+
const genResult = await client.generate("You generate integration eval test cases for AI skills. Output only valid JSON in a code fence.", integrationPrompt);
|
|
80
|
+
const integrationCases = parseGeneratedIntegrationEvals(genResult.text);
|
|
81
|
+
const skillName = skillDir.split("/").pop() || "unknown";
|
|
82
|
+
const evalsFile = { skill_name: skillName, evals: integrationCases };
|
|
83
|
+
mkdirSync(evalsDir, { recursive: true });
|
|
84
|
+
writeFileSync(evalsPath, JSON.stringify(evalsFile, null, 2), "utf-8");
|
|
85
|
+
console.log(green(`Created ${evalsPath}`));
|
|
86
|
+
console.log(dim(` ${integrationCases.length} integration cases, ${integrationCases.reduce((sum, e) => sum + (e.assertions?.length || 0), 0)} assertions`));
|
|
87
|
+
}
|
|
88
|
+
else {
|
|
89
|
+
// Unit only (default, existing behavior)
|
|
90
|
+
const prompt = buildEvalInitPrompt(skillContent);
|
|
91
|
+
const client = createLlmClient();
|
|
92
|
+
const genResult = await client.generate("You generate eval test cases for AI skills. Output only valid JSON in a code fence.", prompt);
|
|
93
|
+
const evalsFile = parseGeneratedEvals(genResult.text);
|
|
94
|
+
mkdirSync(evalsDir, { recursive: true });
|
|
95
|
+
writeFileSync(evalsPath, JSON.stringify(evalsFile, null, 2), "utf-8");
|
|
96
|
+
console.log(green(`Created ${evalsPath}`));
|
|
97
|
+
console.log(dim(` ${evalsFile.evals.length} eval cases, ${evalsFile.evals.reduce((sum, e) => sum + e.assertions.length, 0)} assertions`));
|
|
98
|
+
}
|
|
33
99
|
}
|
|
34
100
|
catch (err) {
|
|
35
101
|
console.error(red("Failed to generate evals: ") + dim(err.message));
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"init.js","sourceRoot":"","sources":["../../../src/commands/eval/init.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,gEAAgE;AAChE,8EAA8E;AAE9E,OAAO,EAAE,YAAY,EAAE,aAAa,EAAE,UAAU,EAAE,SAAS,EAAE,MAAM,SAAS,CAAC;AAC7E,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,OAAO,EAAE,eAAe,EAAE,MAAM,mBAAmB,CAAC;AACpD,OAAO,
|
|
1
|
+
{"version":3,"file":"init.js","sourceRoot":"","sources":["../../../src/commands/eval/init.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,gEAAgE;AAChE,8EAA8E;AAE9E,OAAO,EAAE,YAAY,EAAE,aAAa,EAAE,UAAU,EAAE,SAAS,EAAE,MAAM,SAAS,CAAC;AAC7E,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,OAAO,EAAE,eAAe,EAAE,MAAM,mBAAmB,CAAC;AACpD,OAAO,EACL,mBAAmB,EACnB,0BAA0B,EAC1B,mBAAmB,EACnB,8BAA8B,EAC9B,yBAAyB,EACzB,qBAAqB,GACtB,MAAM,8BAA8B,CAAC;AACtC,OAAO,EAAE,qBAAqB,EAAE,MAAM,4BAA4B,CAAC;AACnE,OAAO,EAAE,KAAK,EAAE,GAAG,EAAE,GAAG,EAAE,MAAM,EAAE,MAAM,uBAAuB,CAAC;AAIhE,MAAM,CAAC,KAAK,UAAU,WAAW,CAC/B,QAAgB,EAChB,KAAc,EACd,OAAqB,MAAM;IAE3B,MAAM,WAAW,GAAG,IAAI,CAAC,QAAQ,EAAE,UAAU,CAAC,CAAC;IAC/C,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;IACzC,MAAM,SAAS,GAAG,IAAI,CAAC,QAAQ,EAAE,YAAY,CAAC,CAAC;IAE/C,wBAAwB;IACxB,IAAI,CAAC,UAAU,CAAC,WAAW,CAAC,EAAE,CAAC;QAC7B,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,yBAAyB,WAAW,EAAE,CAAC,CAAC,CAAC;QAC3D,OAAO;IACT,CAAC;IAED,4BAA4B;IAC5B,IAAI,UAAU,CAAC,SAAS,CAAC,IAAI,CAAC,KAAK,EAAE,CAAC;QACpC,OAAO,CAAC,GAAG,CACT,MAAM,CAAC,qDAAqD,CAAC,CAC9D,CAAC;QACF,OAAO;IACT,CAAC;IAED,MAAM,YAAY,GAAG,YAAY,CAAC,WAAW,EAAE,OAAO,CAAC,CAAC;IAExD,kCAAkC;IAClC,MAAM,OAAO,GAAG,qBAAqB,CAAC,YAAY,CAAC,CAAC;IACpD,MAAM,WAAW,GAAG,yBAAyB,CAAC,YAAY,CAAC,CAAC;IAC5D,MAAM,SAAS,GAAG,qBAAqB,CAAC,YAAY,CAAC,CAAC;IACtD,MAAM,qBAAqB,GAAG,OAAO,CAAC,MAAM,GAAG,CAAC,IAAI,WAAW,CAAC,UAAU,CAAC;IAE3E,gEAAgE;IAChE,IAAI,IAAI,KAAK,aAAa,IAAI,CAAC,qBAAqB,EAAE,CAAC;QACrD,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,6DAA6D,CAAC,CAAC,CAAC;QAChF,OAAO;IACT,CAAC;IAED,IAAI,CAAC;QACH,IAAI,IAAI,KAAK,KAAK,IAAI,qBAAqB,EAAE,CAAC;YAC5C,qEAAqE;YACrE,MAAM,UAAU,GAAG,mBAAmB,CAAC,YAAY,CAAC,CAAC;YACrD,MAAM,iBAAiB,GAAG,0BAA0B,CAAC,YAAY,EAAE,OAAO,EAAE,WAAW,EAAE,SAAS,CAAC,CAAC;YAEpG,MAAM,UAAU,GAAG,eAAe,CAAC,EAAE,KAAK,EAAE,OAAO,EAAE,CAAC,CAAC;YACvD,MAAM,iBAAiB,GAAG,eAAe,CAAC,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC,CAAC;YAE/D,MAAM,CAAC,UAAU,EAAE,iBAAiB,CAAC,GAAG,MAAM,OAAO,CAAC,UAAU,CAAC;gBAC/D,UAAU,CAAC,QAAQ,CACjB,qFAAqF,EACrF,UAAU,CACX;gBACD,iBAAiB,CAAC,QAAQ,CACxB,iGAAiG,EACjG
,iBAAiB,CAClB;aACF,CAAC,CAAC;YAEH,MAAM,SAAS,GAAG,UAAU,CAAC,MAAM,KAAK,WAAW;gBACjD,CAAC,CAAC,mBAAmB,CAAC,UAAU,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,KAAK;gBAClD,CAAC,CAAC,EAAE,CAAC;YACP,MAAM,gBAAgB,GAAG,iBAAiB,CAAC,MAAM,KAAK,WAAW;gBAC/D,CAAC,CAAC,8BAA8B,CAAC,iBAAiB,CAAC,KAAK,CAAC,IAAI,CAAC;gBAC9D,CAAC,CAAC,EAAE,CAAC;YAEP,IAAI,UAAU,CAAC,MAAM,KAAK,UAAU,EAAE,CAAC;gBACrC,OAAO,CAAC,GAAG,CAAC,MAAM,CAAC,gCAAgC,UAAU,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;YAC3E,CAAC;YACD,IAAI,iBAAiB,CAAC,MAAM,KAAK,UAAU,EAAE,CAAC;gBAC5C,OAAO,CAAC,GAAG,CAAC,MAAM,CAAC,uCAAuC,iBAAiB,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;YACzF,CAAC;YAED,MAAM,QAAQ,GAAG,CAAC,GAAG,SAAS,EAAE,GAAG,gBAAgB,CAAC,CAAC;YACrD,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBAC1B,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,kDAAkD,CAAC,CAAC,CAAC;gBACvE,OAAO;YACT,CAAC;YAED,+DAA+D;YAC/D,MAAM,SAAS,GAAG,UAAU,CAAC,MAAM,KAAK,WAAW;gBACjD,CAAC,CAAC,mBAAmB,CAAC,UAAU,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,UAAU;gBACvD,CAAC,CAAC,QAAQ,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,IAAI,SAAS,CAAC;YAE3C,MAAM,SAAS,GAAG,EAAE,UAAU,EAAE,SAAS,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC;YAC7D,SAAS,CAAC,QAAQ,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;YACzC,aAAa,CAAC,SAAS,EAAE,IAAI,CAAC,SAAS,CAAC,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC;YAEtE,MAAM,SAAS,GAAG,SAAS,CAAC,MAAM,CAAC;YACnC,MAAM,SAAS,GAAG,gBAAgB,CAAC,MAAM,CAAC;YAC1C,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,WAAW,SAAS,EAAE,CAAC,CAAC,CAAC;YAC3C,OAAO,CAAC,GAAG,CACT,GAAG,CAAC,KAAK,SAAS,WAAW,SAAS,uBAAuB,QAAQ,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,UAAU,EAAE,MAAM,IAAI,CAAC,CAAC,EAAE,CAAC,CAAC,aAAa,CAAC,CAC7I,CAAC;QACJ,CAAC;aAAM,IAAI,IAAI,KAAK,aAAa,EAAE,CAAC;YAClC,mBAAmB;YACnB,MAAM,iBAAiB,GAAG,0BAA0B,CAAC,YAAY,EAAE,OAAO,EAAE,WAAW,EAAE,SAAS,CAAC,CAAC;YACpG,MAAM,MAAM,GAAG,eAAe,CAAC,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC,CAAC;YACpD,MAAM,SAAS,GAAG,MAAM,MAAM,CAAC,QAAQ,CACrC,iGAAiG,EACjG,iBAAiB,CAClB,CAAC;YAEF,MAAM,gBAAgB,GAAG,8BAA8B,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC;YACxE,MAAM,SAAS,GAAG,QAAQ,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,IAAI,SAAS,CAAC;YACzD,MAAM,SAAS,G
AAG,EAAE,UAAU,EAAE,SAAS,EAAE,KAAK,EAAE,gBAAgB,EAAE,CAAC;YAErE,SAAS,CAAC,QAAQ,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;YACzC,aAAa,CAAC,SAAS,EAAE,IAAI,CAAC,SAAS,CAAC,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC;YAEtE,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,WAAW,SAAS,EAAE,CAAC,CAAC,CAAC;YAC3C,OAAO,CAAC,GAAG,CACT,GAAG,CAAC,KAAK,gBAAgB,CAAC,MAAM,uBAAuB,gBAAgB,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,UAAU,EAAE,MAAM,IAAI,CAAC,CAAC,EAAE,CAAC,CAAC,aAAa,CAAC,CAC/I,CAAC;QACJ,CAAC;aAAM,CAAC;YACN,yCAAyC;YACzC,MAAM,MAAM,GAAG,mBAAmB,CAAC,YAAY,CAAC,CAAC;YACjD,MAAM,MAAM,GAAG,eAAe,EAAE,CAAC;YACjC,MAAM,SAAS,GAAG,MAAM,MAAM,CAAC,QAAQ,CACrC,qFAAqF,EACrF,MAAM,CACP,CAAC;YAEF,MAAM,SAAS,GAAG,mBAAmB,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC;YAEtD,SAAS,CAAC,QAAQ,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;YACzC,aAAa,CAAC,SAAS,EAAE,IAAI,CAAC,SAAS,CAAC,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC;YAEtE,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,WAAW,SAAS,EAAE,CAAC,CAAC,CAAC;YAC3C,OAAO,CAAC,GAAG,CACT,GAAG,CAAC,KAAK,SAAS,CAAC,KAAK,CAAC,MAAM,gBAAgB,SAAS,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,UAAU,CAAC,MAAM,EAAE,CAAC,CAAC,aAAa,CAAC,CAC9H,CAAC;QACJ,CAAC;IACH,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACb,OAAO,CAAC,KAAK,CACX,GAAG,CAAC,4BAA4B,CAAC,GAAG,GAAG,CAAE,GAAa,CAAC,OAAO,CAAC,CAChE,CAAC;IACJ,CAAC;AACH,CAAC"}
|
|
@@ -1 +1,7 @@
|
|
|
1
|
-
export
|
|
1
|
+
export interface EvalRunOptions {
|
|
2
|
+
concurrency?: number;
|
|
3
|
+
judgeModel?: string;
|
|
4
|
+
noCache?: boolean;
|
|
5
|
+
batch?: boolean;
|
|
6
|
+
}
|
|
7
|
+
export declare function runEvalRun(skillDir: string, options?: EvalRunOptions): Promise<void>;
|