vskill 0.5.21 → 0.5.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/commands/eval/sweep.d.ts +1 -0
- package/dist/commands/eval/sweep.js +78 -18
- package/dist/commands/eval/sweep.js.map +1 -1
- package/dist/commands/eval.d.ts +1 -0
- package/dist/commands/eval.js +1 -0
- package/dist/commands/eval.js.map +1 -1
- package/dist/eval/activation-history.d.ts +21 -0
- package/dist/eval/activation-history.js +41 -0
- package/dist/eval/activation-history.js.map +1 -0
- package/dist/eval-server/api-routes.js +197 -26
- package/dist/eval-server/api-routes.js.map +1 -1
- package/dist/eval-server/integration-routes.js +70 -1
- package/dist/eval-server/integration-routes.js.map +1 -1
- package/dist/eval-server/skill-create-routes.d.ts +11 -0
- package/dist/eval-server/skill-create-routes.js +131 -4
- package/dist/eval-server/skill-create-routes.js.map +1 -1
- package/dist/eval-server/skill-resolver.js +11 -1
- package/dist/eval-server/skill-resolver.js.map +1 -1
- package/dist/eval-server/sweep-routes.js +1 -0
- package/dist/eval-server/sweep-routes.js.map +1 -1
- package/dist/eval-server/sweep-runner.d.ts +37 -0
- package/dist/eval-server/sweep-runner.js +239 -85
- package/dist/eval-server/sweep-runner.js.map +1 -1
- package/dist/eval-ui/assets/index-BJKnEy7-.css +1 -0
- package/dist/eval-ui/assets/index-T2Uxn2Me.js +73 -0
- package/dist/eval-ui/index.html +2 -2
- package/dist/index.js +1 -0
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/eval-ui/assets/index-C9_Pey9T.css +0 -1
- package/dist/eval-ui/assets/index-KfkLPyh3.js +0 -74
|
@@ -6,7 +6,7 @@ import { join } from "node:path";
|
|
|
6
6
|
import { loadAndValidateEvals, EvalValidationError } from "../../eval/schema.js";
|
|
7
7
|
import { buildEvalSystemPrompt } from "../../eval/prompt-builder.js";
|
|
8
8
|
import { runSweep } from "../../eval-server/sweep-runner.js";
|
|
9
|
-
import { green, red, bold, dim, table } from "../../utils/output.js";
|
|
9
|
+
import { green, red, yellow, bold, dim, table } from "../../utils/output.js";
|
|
10
10
|
export async function runEvalSweep(skillDir, options) {
|
|
11
11
|
// Load and validate evals.json
|
|
12
12
|
let evalsFile;
|
|
@@ -34,11 +34,19 @@ export async function runEvalSweep(skillDir, options) {
|
|
|
34
34
|
}
|
|
35
35
|
const runs = options.runs ?? 1;
|
|
36
36
|
const concurrency = options.concurrency ?? 5;
|
|
37
|
+
const baseline = options.baseline ?? false;
|
|
37
38
|
console.log(bold(`\nSweep: ${evalsFile.skill_name}`));
|
|
38
39
|
console.log(dim(`Models: ${modelList.join(", ")}`));
|
|
39
40
|
console.log(dim(`Judge: ${options.judge}`));
|
|
40
41
|
console.log(dim(`Runs per model: ${runs}`));
|
|
41
|
-
console.log(dim(`Cases: ${evalsFile.evals.length}
|
|
42
|
+
console.log(dim(`Cases: ${evalsFile.evals.length}`));
|
|
43
|
+
if (baseline)
|
|
44
|
+
console.log(dim(`Baseline: enabled (comparing with vs without skill)`));
|
|
45
|
+
console.log("");
|
|
46
|
+
// Warn about low run count
|
|
47
|
+
if (runs < 3) {
|
|
48
|
+
console.log(yellow(`Note: ${runs} run(s) may not produce statistically meaningful results. Use --runs 3+ for reliable ranking.\n`));
|
|
49
|
+
}
|
|
42
50
|
let sweepResult = null;
|
|
43
51
|
for await (const event of runSweep({
|
|
44
52
|
skillDir,
|
|
@@ -49,17 +57,33 @@ export async function runEvalSweep(skillDir, options) {
|
|
|
49
57
|
judge: options.judge,
|
|
50
58
|
runs,
|
|
51
59
|
concurrency,
|
|
60
|
+
baseline,
|
|
52
61
|
})) {
|
|
53
62
|
switch (event.type) {
|
|
63
|
+
case "sweep_judge_bias_warning":
|
|
64
|
+
console.log(yellow(`WARNING: ${event.data.warning}\n`));
|
|
65
|
+
break;
|
|
54
66
|
case "sweep_model_start":
|
|
55
67
|
process.stdout.write(dim(`[${event.data.modelIndex + 1}/${event.data.totalModels}] ${event.data.model} — `));
|
|
56
68
|
break;
|
|
57
|
-
case "sweep_model_progress":
|
|
58
|
-
|
|
69
|
+
case "sweep_model_progress": {
|
|
70
|
+
const phaseLabel = event.data.phase === "baseline" ? " [baseline]" : "";
|
|
71
|
+
process.stdout.write(dim(`\r[${event.data.model}${phaseLabel}] run ${event.data.run}/${event.data.totalRuns} case ${event.data.currentCase}/${event.data.totalCases} (${event.data.percentComplete}%)`));
|
|
59
72
|
break;
|
|
73
|
+
}
|
|
60
74
|
case "sweep_model_complete":
|
|
61
75
|
if (event.data.status === "complete" && event.data.passRate) {
|
|
62
|
-
|
|
76
|
+
let summary = ` done (pass rate: ${(event.data.passRate.mean * 100).toFixed(1)}%)`;
|
|
77
|
+
if (event.data.baselinePassRate && event.data.skillDelta) {
|
|
78
|
+
const delta = event.data.skillDelta.mean * 100;
|
|
79
|
+
const sign = delta >= 0 ? "+" : "";
|
|
80
|
+
summary += ` | baseline: ${(event.data.baselinePassRate.mean * 100).toFixed(1)}% | delta: ${sign}${delta.toFixed(1)}pp`;
|
|
81
|
+
if (event.data.amplificationPct != null && isFinite(event.data.amplificationPct)) {
|
|
82
|
+
const ampSign = event.data.amplificationPct >= 0 ? "+" : "";
|
|
83
|
+
summary += ` (${ampSign}${event.data.amplificationPct.toFixed(1)}%)`;
|
|
84
|
+
}
|
|
85
|
+
}
|
|
86
|
+
console.log(green(summary));
|
|
63
87
|
}
|
|
64
88
|
else {
|
|
65
89
|
console.log(red(` error: ${event.data.errorMessage || "unknown"}`));
|
|
@@ -75,19 +99,55 @@ export async function runEvalSweep(skillDir, options) {
|
|
|
75
99
|
process.exit(1);
|
|
76
100
|
return;
|
|
77
101
|
}
|
|
78
|
-
//
|
|
79
|
-
const
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
102
|
+
// Sort by composite score (if available) then by pass rate
|
|
103
|
+
const sorted = [...sweepResult.models].sort((a, b) => {
|
|
104
|
+
if (a.compositeScore != null && b.compositeScore != null) {
|
|
105
|
+
return b.compositeScore - a.compositeScore;
|
|
106
|
+
}
|
|
107
|
+
return b.passRate.mean - a.passRate.mean;
|
|
108
|
+
});
|
|
109
|
+
// Build table based on whether baseline was used
|
|
110
|
+
if (baseline) {
|
|
111
|
+
const headers = ["RANK", "MODEL", "WITH SKILL", "WITHOUT SKILL", "DELTA", "AMPLIFICATION", "STATUS"];
|
|
112
|
+
const rows = sorted.map((m, i) => [
|
|
113
|
+
String(i + 1),
|
|
114
|
+
`${m.provider}/${m.model}`,
|
|
115
|
+
m.status === "complete" ? formatStats(m.passRate, true) : "-",
|
|
116
|
+
m.status === "complete" && m.baselinePassRate ? formatStats(m.baselinePassRate, true) : "-",
|
|
117
|
+
m.status === "complete" && m.skillDelta
|
|
118
|
+
? `${m.skillDelta.mean >= 0 ? "+" : ""}${(m.skillDelta.mean * 100).toFixed(1)}pp`
|
|
119
|
+
: "-",
|
|
120
|
+
m.status === "complete" && m.amplificationPct != null && isFinite(m.amplificationPct)
|
|
121
|
+
? `${m.amplificationPct >= 0 ? "+" : ""}${m.amplificationPct.toFixed(1)}%`
|
|
122
|
+
: "-",
|
|
123
|
+
m.status === "complete" ? green("OK") : red("ERR"),
|
|
124
|
+
]);
|
|
125
|
+
console.log(bold("\nSweep Results (Skill Amplification)\n"));
|
|
126
|
+
console.log(table(headers, rows));
|
|
127
|
+
// Skill quality badge
|
|
128
|
+
if (sweepResult.skillQualityScore != null && sweepResult.skillQualityRating) {
|
|
129
|
+
const ratingColors = {
|
|
130
|
+
excellent: green, good: green, marginal: yellow, minimal: yellow, harmful: red,
|
|
131
|
+
};
|
|
132
|
+
const colorFn = ratingColors[sweepResult.skillQualityRating] ?? dim;
|
|
133
|
+
const sign = sweepResult.skillQualityScore >= 0 ? "+" : "";
|
|
134
|
+
const label = `${sign}${sweepResult.skillQualityScore.toFixed(1)}% (${sweepResult.skillQualityRating.toUpperCase()})`;
|
|
135
|
+
console.log(`\nSkill Quality: ${colorFn(label)}`);
|
|
136
|
+
}
|
|
137
|
+
}
|
|
138
|
+
else {
|
|
139
|
+
const headers = ["RANK", "MODEL", "PASS RATE", "DURATION", "COST", "STATUS"];
|
|
140
|
+
const rows = sorted.map((m, i) => [
|
|
141
|
+
String(i + 1),
|
|
142
|
+
`${m.provider}/${m.model}`,
|
|
143
|
+
m.status === "complete" ? formatStats(m.passRate, true) : "-",
|
|
144
|
+
m.status === "complete" ? formatStats(m.duration, false, "ms") : "-",
|
|
145
|
+
m.cost.total > 0 ? `$${m.cost.total.toFixed(4)}` : "-",
|
|
146
|
+
m.status === "complete" ? green("OK") : red("ERR"),
|
|
147
|
+
]);
|
|
148
|
+
console.log(bold("\nSweep Results\n"));
|
|
149
|
+
console.log(table(headers, rows));
|
|
150
|
+
}
|
|
91
151
|
console.log(dim(`\nLeaderboard saved to ${skillDir}/evals/leaderboard/`));
|
|
92
152
|
}
|
|
93
153
|
function formatStats(stats, asPercent, suffix = "") {
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"sweep.js","sourceRoot":"","sources":["../../../src/commands/eval/sweep.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,wDAAwD;AACxD,8EAA8E;AAE9E,OAAO,EAAE,UAAU,EAAE,YAAY,EAAE,MAAM,SAAS,CAAC;AACnD,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,OAAO,EAAE,oBAAoB,EAAE,mBAAmB,EAAE,MAAM,sBAAsB,CAAC;AACjF,OAAO,EAAE,qBAAqB,EAAE,MAAM,8BAA8B,CAAC;AACrE,OAAO,EAAE,QAAQ,EAAE,MAAM,mCAAmC,CAAC;AAE7D,OAAO,EAAE,KAAK,EAAE,GAAG,
|
|
1
|
+
{"version":3,"file":"sweep.js","sourceRoot":"","sources":["../../../src/commands/eval/sweep.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,wDAAwD;AACxD,8EAA8E;AAE9E,OAAO,EAAE,UAAU,EAAE,YAAY,EAAE,MAAM,SAAS,CAAC;AACnD,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,OAAO,EAAE,oBAAoB,EAAE,mBAAmB,EAAE,MAAM,sBAAsB,CAAC;AACjF,OAAO,EAAE,qBAAqB,EAAE,MAAM,8BAA8B,CAAC;AACrE,OAAO,EAAE,QAAQ,EAAE,MAAM,mCAAmC,CAAC;AAE7D,OAAO,EAAE,KAAK,EAAE,GAAG,EAAE,MAAM,EAAE,IAAI,EAAE,GAAG,EAAE,KAAK,EAAE,MAAM,uBAAuB,CAAC;AAU7E,MAAM,CAAC,KAAK,UAAU,YAAY,CAAC,QAAgB,EAAE,OAAqB;IACxE,+BAA+B;IAC/B,IAAI,SAAS,CAAC;IACd,IAAI,CAAC;QACH,SAAS,GAAG,oBAAoB,CAAC,QAAQ,CAAC,CAAC;IAC7C,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACb,IAAI,GAAG,YAAY,mBAAmB,EAAE,CAAC;YACvC,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,uBAAuB,GAAG,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;QAC3D,CAAC;aAAM,CAAC;YACN,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,wBAAyB,GAAa,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;QACvE,CAAC;QACD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAChB,OAAO;IACT,CAAC;IAED,MAAM,WAAW,GAAG,IAAI,CAAC,QAAQ,EAAE,UAAU,CAAC,CAAC;IAC/C,MAAM,YAAY,GAAG,UAAU,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,YAAY,CAAC,WAAW,EAAE,OAAO,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;IACvF,MAAM,YAAY,GAAG,qBAAqB,CAAC,YAAY,CAAC,CAAC;IACzD,MAAM,SAAS,GAAG,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;IAEjF,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC3B,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,mEAAmE,CAAC,CAAC,CAAC;QACxF,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAChB,OAAO;IACT,CAAC;IAED,MAAM,IAAI,GAAG,OAAO,CAAC,IAAI,IAAI,CAAC,CAAC;IAC/B,MAAM,WAAW,GAAG,OAAO,CAAC,WAAW,IAAI,CAAC,CAAC;IAC7C,MAAM,QAAQ,GAAG,OAAO,CAAC,QAAQ,IAAI,KAAK,CAAC;IAE3C,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,YAAY,SAAS,CAAC,UAAU,EAAE,CAAC,CAAC,CAAC;IACtD,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,WAAW,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC;IACpD,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,UAAU,OAAO,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;IAC5C,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,mBAAmB,IAAI,EAAE,CAAC,CAAC,CAAC;IAC5C,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,UAAU,SAAS,CA
AC,KAAK,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;IACrD,IAAI,QAAQ;QAAE,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,qDAAqD,CAAC,CAAC,CAAC;IACtF,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;IAEhB,2BAA2B;IAC3B,IAAI,IAAI,GAAG,CAAC,EAAE,CAAC;QACb,OAAO,CAAC,GAAG,CAAC,MAAM,CAAC,SAAS,IAAI,iGAAiG,CAAC,CAAC,CAAC;IACtI,CAAC;IAED,IAAI,WAAW,GAAuB,IAAI,CAAC;IAE3C,IAAI,KAAK,EAAE,MAAM,KAAK,IAAI,QAAQ,CAAC;QACjC,QAAQ;QACR,SAAS,EAAE,SAAS,CAAC,UAAU;QAC/B,YAAY;QACZ,SAAS,EAAE,SAAS,CAAC,KAAK;QAC1B,MAAM,EAAE,SAAS;QACjB,KAAK,EAAE,OAAO,CAAC,KAAK;QACpB,IAAI;QACJ,WAAW;QACX,QAAQ;KACT,CAAC,EAAE,CAAC;QACH,QAAQ,KAAK,CAAC,IAAI,EAAE,CAAC;YACnB,KAAK,0BAA0B;gBAC7B,OAAO,CAAC,GAAG,CAAC,MAAM,CAAC,YAAY,KAAK,CAAC,IAAI,CAAC,OAAO,IAAI,CAAC,CAAC,CAAC;gBACxD,MAAM;YAER,KAAK,mBAAmB;gBACtB,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,KAAK,CAAC,IAAI,CAAC,UAAU,GAAG,CAAC,IAAI,KAAK,CAAC,IAAI,CAAC,WAAW,KAAK,KAAK,CAAC,IAAI,CAAC,KAAK,KAAK,CAAC,CAAC,CAAC;gBAC7G,MAAM;YAER,KAAK,sBAAsB,CAAC,CAAC,CAAC;gBAC5B,MAAM,UAAU,GAAG,KAAK,CAAC,IAAI,CAAC,KAAK,KAAK,UAAU,CAAC,CAAC,CAAC,aAAa,CAAC,CAAC,CAAC,EAAE,CAAC;gBACxE,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,MAAM,KAAK,CAAC,IAAI,CAAC,KAAK,GAAG,UAAU,SAAS,KAAK,CAAC,IAAI,CAAC,GAAG,IAAI,KAAK,CAAC,IAAI,CAAC,SAAS,SAAS,KAAK,CAAC,IAAI,CAAC,WAAW,IAAI,KAAK,CAAC,IAAI,CAAC,UAAU,KAAK,KAAK,CAAC,IAAI,CAAC,eAAe,IAAI,CAAC,CAAC,CAAC;gBACzM,MAAM;YACR,CAAC;YAED,KAAK,sBAAsB;gBACzB,IAAI,KAAK,CAAC,IAAI,CAAC,MAAM,KAAK,UAAU,IAAI,KAAK,CAAC,IAAI,CAAC,QAAQ,EAAE,CAAC;oBAC5D,IAAI,OAAO,GAAG,qBAAqB,CAAC,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,IAAI,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC;oBACnF,IAAI,KAAK,CAAC,IAAI,CAAC,gBAAgB,IAAI,KAAK,CAAC,IAAI,CAAC,UAAU,EAAE,CAAC;wBACzD,MAAM,KAAK,GAAG,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,IAAI,GAAG,GAAG,CAAC;wBAC/C,MAAM,IAAI,GAAG,KAAK,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC;wBACnC,OAAO,IAAI,gBAAgB,CAAC,KAAK,CAAC,IAAI,CAAC,gBAAgB,CAAC,IAAI,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,cAAc,IAAI,GAAG,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC;wBACxH,IAAI,KAAK,CAAC,IAAI,CAAC,gBAAgB,IAAI,IAAI,IAAI,QAAQ,CAAC,KAAK,CAAC,IAAI,CA
AC,gBAAgB,CAAC,EAAE,CAAC;4BACjF,MAAM,OAAO,GAAG,KAAK,CAAC,IAAI,CAAC,gBAAgB,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC;4BAC5D,OAAO,IAAI,KAAK,OAAO,GAAG,KAAK,CAAC,IAAI,CAAC,gBAAgB,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC;wBACvE,CAAC;oBACH,CAAC;oBACD,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC;gBAC9B,CAAC;qBAAM,CAAC;oBACN,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,WAAW,KAAK,CAAC,IAAI,CAAC,YAAY,IAAI,SAAS,EAAE,CAAC,CAAC,CAAC;gBACtE,CAAC;gBACD,MAAM;YAER,KAAK,gBAAgB;gBACnB,WAAW,GAAG,KAAK,CAAC,IAAI,CAAC;gBACzB,MAAM;QACV,CAAC;IACH,CAAC;IAED,IAAI,CAAC,WAAW,EAAE,CAAC;QACjB,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,oCAAoC,CAAC,CAAC,CAAC;QACzD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAChB,OAAO;IACT,CAAC;IAED,2DAA2D;IAC3D,MAAM,MAAM,GAAG,CAAC,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;QACnD,IAAI,CAAC,CAAC,cAAc,IAAI,IAAI,IAAI,CAAC,CAAC,cAAc,IAAI,IAAI,EAAE,CAAC;YACzD,OAAO,CAAC,CAAC,cAAc,GAAG,CAAC,CAAC,cAAc,CAAC;QAC7C,CAAC;QACD,OAAO,CAAC,CAAC,QAAQ,CAAC,IAAI,GAAG,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC;IAC3C,CAAC,CAAC,CAAC;IAEH,iDAAiD;IACjD,IAAI,QAAQ,EAAE,CAAC;QACb,MAAM,OAAO,GAAG,CAAC,MAAM,EAAE,OAAO,EAAE,YAAY,EAAE,eAAe,EAAE,OAAO,EAAE,eAAe,EAAE,QAAQ,CAAC,CAAC;QACrG,MAAM,IAAI,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;YAChC,MAAM,CAAC,CAAC,GAAG,CAAC,CAAC;YACb,GAAG,CAAC,CAAC,QAAQ,IAAI,CAAC,CAAC,KAAK,EAAE;YAC1B,CAAC,CAAC,MAAM,KAAK,UAAU,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,QAAQ,EAAE,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG;YAC7D,CAAC,CAAC,MAAM,KAAK,UAAU,IAAI,CAAC,CAAC,gBAAgB,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,gBAAgB,EAAE,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG;YAC3F,CAAC,CAAC,MAAM,KAAK,UAAU,IAAI,CAAC,CAAC,UAAU;gBACrC,CAAC,CAAC,GAAG,CAAC,CAAC,UAAU,CAAC,IAAI,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,CAAC,UAAU,CAAC,IAAI,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI;gBACjF,CAAC,CAAC,GAAG;YACP,CAAC,CAAC,MAAM,KAAK,UAAU,IAAI,CAAC,CAAC,gBAAgB,IAAI,IAAI,IAAI,QAAQ,CAAC,CAAC,CAAC,gBAAgB,CAAC;gBACnF,CAAC,CAAC,GAAG,CAAC,CAAC,gBAAgB,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,GAAG,CAAC
,CAAC,gBAAgB,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG;gBAC1E,CAAC,CAAC,GAAG;YACP,CAAC,CAAC,MAAM,KAAK,UAAU,CAAC,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,KAAK,CAAC;SACnD,CAAC,CAAC;QAEH,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,yCAAyC,CAAC,CAAC,CAAC;QAC7D,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,OAAO,EAAE,IAAI,CAAC,CAAC,CAAC;QAElC,sBAAsB;QACtB,IAAI,WAAW,CAAC,iBAAiB,IAAI,IAAI,IAAI,WAAW,CAAC,kBAAkB,EAAE,CAAC;YAC5E,MAAM,YAAY,GAA0C;gBAC1D,SAAS,EAAE,KAAK,EAAE,IAAI,EAAE,KAAK,EAAE,QAAQ,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,OAAO,EAAE,GAAG;aAC/E,CAAC;YACF,MAAM,OAAO,GAAG,YAAY,CAAC,WAAW,CAAC,kBAAkB,CAAC,IAAI,GAAG,CAAC;YACpE,MAAM,IAAI,GAAG,WAAW,CAAC,iBAAiB,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC;YAC3D,MAAM,KAAK,GAAG,GAAG,IAAI,GAAG,WAAW,CAAC,iBAAiB,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,WAAW,CAAC,kBAAkB,CAAC,WAAW,EAAE,GAAG,CAAC;YACtH,OAAO,CAAC,GAAG,CAAC,oBAAoB,OAAO,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC;QACpD,CAAC;IACH,CAAC;SAAM,CAAC;QACN,MAAM,OAAO,GAAG,CAAC,MAAM,EAAE,OAAO,EAAE,WAAW,EAAE,UAAU,EAAE,MAAM,EAAE,QAAQ,CAAC,CAAC;QAC7E,MAAM,IAAI,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;YAChC,MAAM,CAAC,CAAC,GAAG,CAAC,CAAC;YACb,GAAG,CAAC,CAAC,QAAQ,IAAI,CAAC,CAAC,KAAK,EAAE;YAC1B,CAAC,CAAC,MAAM,KAAK,UAAU,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,QAAQ,EAAE,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG;YAC7D,CAAC,CAAC,MAAM,KAAK,UAAU,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,QAAQ,EAAE,KAAK,EAAE,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG;YACpE,CAAC,CAAC,IAAI,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,GAAG;YACtD,CAAC,CAAC,MAAM,KAAK,UAAU,CAAC,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,KAAK,CAAC;SACnD,CAAC,CAAC;QAEH,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,mBAAmB,CAAC,CAAC,CAAC;QACvC,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,OAAO,EAAE,IAAI,CAAC,CAAC,CAAC;IACpC,CAAC;IAED,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,0BAA0B,QAAQ,qBAAqB,CAAC,CAAC,CAAC;AAC5E,CAAC;AAED,SAAS,WAAW,CAAC,KAAiB,EAAE,SAAkB,EAAE,MAAM,GAAG,EAAE;IACrE,IAAI,SAAS,EAAE,CAAC;QACd,OAAO,GAAG,CAAC,KAAK,CAAC,IAAI,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,
CAAC,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,MAAM,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC;IACjH,CAAC;IACD,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,IAAI,CAAC,GAAG,MAAM,GAAG,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,MAAM,CAAC,GAAG,MAAM,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC;AACnH,CAAC"}
|
package/dist/commands/eval.d.ts
CHANGED
package/dist/commands/eval.js
CHANGED
|
@@ -67,6 +67,7 @@ export async function evalCommand(subcommand, target, opts = {}) {
|
|
|
67
67
|
judge: opts.judge,
|
|
68
68
|
runs: opts.runs ? parseInt(opts.runs, 10) : undefined,
|
|
69
69
|
concurrency: opts.concurrency ? parseInt(opts.concurrency, 10) : undefined,
|
|
70
|
+
baseline: opts.baseline,
|
|
70
71
|
});
|
|
71
72
|
}
|
|
72
73
|
case "credentials": {
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"eval.js","sourceRoot":"","sources":["../../src/commands/eval.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,mCAAmC;AACnC,8EAA8E;AAE9E,OAAO,EAAE,IAAI,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAC1C,OAAO,EAAE,UAAU,EAAE,MAAM,SAAS,CAAC;AACrC,OAAO,EAAE,GAAG,EAAE,GAAG,EAAE,MAAM,oBAAoB,CAAC;AAE9C,MAAM,CAAC,KAAK,UAAU,WAAW,CAC/B,UAAkB,EAClB,MAAe,EACf,
|
|
1
|
+
{"version":3,"file":"eval.js","sourceRoot":"","sources":["../../src/commands/eval.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,mCAAmC;AACnC,8EAA8E;AAE9E,OAAO,EAAE,IAAI,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAC1C,OAAO,EAAE,UAAU,EAAE,MAAM,SAAS,CAAC;AACrC,OAAO,EAAE,GAAG,EAAE,GAAG,EAAE,MAAM,oBAAoB,CAAC;AAE9C,MAAM,CAAC,KAAK,UAAU,WAAW,CAC/B,UAAkB,EAClB,MAAe,EACf,OAAqQ,EAAE;IAEvQ,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;IAE3D,QAAQ,UAAU,EAAE,CAAC;QACnB,KAAK,OAAO,CAAC,CAAC,CAAC;YACb,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;YACxD,MAAM,EAAE,YAAY,EAAE,GAAG,MAAM,MAAM,CAAC,iBAAiB,CAAC,CAAC;YACzD,OAAO,YAAY,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC;QAClC,CAAC;QAED,KAAK,MAAM,CAAC,CAAC,CAAC;YACZ,IAAI,CAAC,MAAM,EAAE,CAAC;gBACZ,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,0CAA0C,CAAC,CAAC,CAAC;gBAC/D,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAClB,CAAC;YACD,MAAM,QAAQ,GAAG,eAAe,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;YAC/C,MAAM,EAAE,WAAW,EAAE,GAAG,MAAM,MAAM,CAAC,gBAAgB,CAAC,CAAC;YACvD,MAAM,QAAQ,GAAG,CAAC,IAAI,CAAC,IAAI,KAAK,aAAa,IAAI,IAAI,CAAC,IAAI,KAAK,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC;YAC3F,OAAO,WAAW,CAAC,QAAQ,EAAE,CAAC,CAAC,IAAI,CAAC,KAAK,EAAE,QAAQ,CAAC,CAAC;QACvD,CAAC;QAED,KAAK,KAAK,CAAC,CAAC,CAAC;YACX,IAAI,CAAC,MAAM,EAAE,CAAC;gBACZ,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,yCAAyC,CAAC,CAAC,CAAC;gBAC9D,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAClB,CAAC;YACD,MAAM,QAAQ,GAAG,eAAe,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;YAC/C,MAAM,EAAE,UAAU,EAAE,GAAG,MAAM,MAAM,CAAC,eAAe,CAAC,CAAC;YACrD,sEAAsE;YACtE,MAAM,OAAO,GAAG,IAAI,CAAC,OAAO,KAAK,IAAI,IAAI,IAAI,CAAC,KAAK,KAAK,KAAK,CAAC;YAC9D,OAAO,UAAU,CAAC,QAAQ,EAAE;gBAC1B,WAAW,EAAE,IAAI,CAAC,WAAW,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,WAAW,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,SAAS;gBAC1E,UAAU,EAAE,IAAI,CAAC,UAAU;gBAC3B,OAAO;gBACP,KAAK,EAAE,IAAI,CAAC,KAAK;aAClB,CAAC,CAAC;QACL,CAAC;QAED,KAAK,UAAU,CAAC,CAAC,CAAC;YAChB,MAAM,EAAE,eAAe,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC
,CAAC;YAC/D,OAAO,eAAe,CAAC,IAAI,CAAC,CAAC;QAC/B,CAAC;QAED,KAAK,cAAc,CAAC,CAAC,CAAC;YACpB,MAAM,EAAE,kBAAkB,EAAE,GAAG,MAAM,MAAM,CAAC,wBAAwB,CAAC,CAAC;YACtE,MAAM,gBAAgB,GAAG,IAAI,CAAC,WAAW,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,WAAW,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;YACvF,OAAO,kBAAkB,CAAC,IAAI,EAAE,CAAC,CAAC,IAAI,CAAC,KAAK,EAAE,gBAAgB,CAAC,CAAC;QAClE,CAAC;QAED,KAAK,OAAO,CAAC,CAAC,CAAC;YACb,IAAI,CAAC,MAAM,EAAE,CAAC;gBACZ,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,wEAAwE,CAAC,CAAC,CAAC;gBAC7F,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAClB,CAAC;YACD,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC;gBACjB,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,oHAAoH,CAAC,CAAC,CAAC;gBACzI,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAClB,CAAC;YACD,IAAI,CAAC,IAAI,CAAC,KAAK,EAAE,CAAC;gBAChB,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,8DAA8D,CAAC,CAAC,CAAC;gBACnF,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAClB,CAAC;YACD,MAAM,QAAQ,GAAG,eAAe,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;YAC/C,MAAM,EAAE,YAAY,EAAE,GAAG,MAAM,MAAM,CAAC,iBAAiB,CAAC,CAAC;YACzD,OAAO,YAAY,CAAC,QAAQ,EAAE;gBAC5B,MAAM,EAAE,IAAI,CAAC,MAAM;gBACnB,KAAK,EAAE,IAAI,CAAC,KAAK;gBACjB,IAAI,EAAE,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,SAAS;gBACrD,WAAW,EAAE,IAAI,CAAC,WAAW,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,WAAW,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,SAAS;gBAC1E,QAAQ,EAAE,IAAI,CAAC,QAAQ;aACxB,CAAC,CAAC;QACL,CAAC;QAED,KAAK,aAAa,CAAC,CAAC,CAAC;YACnB,IAAI,CAAC,MAAM,EAAE,CAAC;gBACZ,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,iEAAiE,CAAC,CAAC,CAAC;gBACtF,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAClB,CAAC;YACD,iDAAiD;YACjD,sDAAsD;YACtD,MAAM,YAAY,GAAG,IAAI,CAAC;YAC1B,MAAM,EAAE,iBAAiB,EAAE,kBAAkB,EAAE,mBAAmB,EAAE,GAAG,MAAM,MAAM,CAAC,uBAAuB,CAAC,CAAC;YAC7G,QAAQ,MAAM,EAAE,CAAC;gBACf,KAAK,KAAK,CAAC,CAAC,CAAC;oBACX,MAAM,GAAG,GAAG,IAAI,CAAC,aAAa,CAAC;oBAC/B,IAAI,CAAC,GAAG,EAAE,CAAC;wBACT,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,qCAAqC,CAAC,CAAC,CAAC;wBAC1D,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;oBAClB,CAAC;oBACD,OAAO,iBAAiB,CAAC,YAAY,EAAE,GAAG,CAAC,CAAC;gBAC9C,CAAC;gBACD,KAAK,MAAM;oBACT,OAAO,kBAAkB,CAAC,YAAY,CAAC,CAAC;gBAC1C,KAA
K,OAAO;oBACV,OAAO,mBAAmB,CAAC,YAAY,CAAC,CAAC;gBAC3C;oBACE,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,oCAAoC,MAAM,KAAK,CAAC,GAAG,GAAG,CAAC,6BAA6B,CAAC,CAAC,CAAC;YAC7G,CAAC;YACD,MAAM;QACR,CAAC;QAED;YACE,OAAO,CAAC,KAAK,CACX,GAAG,CAAC,wBAAwB,UAAU,KAAK,CAAC;gBAC1C,GAAG,CAAC,yEAAyE,CAAC,CACjF,CAAC;IACN,CAAC;AACH,CAAC;AAED,SAAS,eAAe,CAAC,IAAY,EAAE,MAAc;IACnD,MAAM,KAAK,GAAG,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;IAChC,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACvB,OAAO,CAAC,KAAK,CACX,GAAG,CAAC,mBAAmB,MAAM,sCAAsC,CAAC,CACrE,CAAC;QACF,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAED,qDAAqD;IACrD,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,EAAE,KAAK,CAAC,CAAC,CAAC,EAAE,QAAQ,EAAE,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;IAC5D,IAAI,UAAU,CAAC,UAAU,CAAC;QAAE,OAAO,UAAU,CAAC;IAE9C,sEAAsE;IACtE,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,EAAE,SAAS,EAAE,KAAK,CAAC,CAAC,CAAC,EAAE,QAAQ,EAAE,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;IACvE,IAAI,UAAU,CAAC,UAAU,CAAC;QAAE,OAAO,UAAU,CAAC;IAE9C,0CAA0C;IAC1C,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,EAAE,QAAQ,EAAE,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;IAChD,IAAI,UAAU,CAAC,QAAQ,CAAC;QAAE,OAAO,QAAQ,CAAC;IAE1C,sEAAsE;IACtE,OAAO,UAAU,CAAC;AACpB,CAAC"}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import type { ActivationResult } from "./activation-tester.js";
|
|
2
|
+
/**
 * One activation test run as passed to `writeActivationRun` and returned by
 * `getActivationRun`. `listActivationRuns` returns the same shape with the
 * `results` field omitted.
 *
 * NOTE(review): `tp`/`tn`/`fp`/`fn` are presumably confusion-matrix counts
 * backing `precision`/`recall` — confirm against activation-tester.
 */
export interface ActivationHistoryRun {
|
|
3
|
+
id: string;
|
|
4
|
+
timestamp: string;
|
|
5
|
+
model: string;
|
|
6
|
+
provider: string;
|
|
7
|
+
promptCount: number;
|
|
8
|
+
summary: {
|
|
9
|
+
precision: number;
|
|
10
|
+
recall: number;
|
|
11
|
+
reliability: number;
|
|
12
|
+
tp: number;
|
|
13
|
+
tn: number;
|
|
14
|
+
fp: number;
|
|
15
|
+
fn: number;
|
|
16
|
+
};
|
|
17
|
+
results: ActivationResult[];
|
|
18
|
+
}
|
|
19
|
+
/**
 * Appends `run` to the `activation-history.json` file inside `skillDir`
 * (creating the directory if needed) and rewrites the file, keeping only the
 * 50 most recently appended runs.
 */
export declare function writeActivationRun(skillDir: string, run: ActivationHistoryRun): Promise<void>;
|
|
20
|
+
/**
 * Lists the runs stored for `skillDir` without their `results` arrays, in
 * reverse of stored order (the most recently appended run comes first).
 * Resolves to an empty array when the history file cannot be read or parsed.
 */
export declare function listActivationRuns(skillDir: string): Promise<Omit<ActivationHistoryRun, "results">[]>;
|
|
21
|
+
/**
 * Looks up the stored run whose `id` equals `runId` in the history of
 * `skillDir`. Resolves to `null` when no stored run has that id.
 */
export declare function getActivationRun(skillDir: string, runId: string): Promise<ActivationHistoryRun | null>;
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
// ---------------------------------------------------------------------------
|
|
2
|
+
// activation-history.ts -- persistent activation test history per skill
|
|
3
|
+
// ---------------------------------------------------------------------------
|
|
4
|
+
import { readFile, writeFile, mkdir } from "node:fs/promises";
|
|
5
|
+
import { join } from "node:path";
|
|
6
|
+
const HISTORY_FILENAME = "activation-history.json";
|
|
7
|
+
const MAX_RUNS = 50;
|
|
8
|
+
/**
 * Build the path of the activation history file for a skill.
 *
 * @param {string} skillDir - Directory of the skill.
 * @returns {string} `skillDir` joined with the history file name.
 */
function historyPath(skillDir) {
    const historyFile = join(skillDir, HISTORY_FILENAME);
    return historyFile;
}
|
|
11
|
+
/**
 * Load the stored activation history for a skill.
 *
 * Best-effort: any failure to read or parse the file, or a payload without a
 * `runs` array, yields a fresh empty history instead of an error.
 *
 * @param {string} skillDir - Directory of the skill.
 * @returns {Promise<{runs: Array}>} The parsed history object, or `{ runs: [] }`.
 */
async function readHistoryFile(skillDir) {
    let stored;
    try {
        const raw = await readFile(historyPath(skillDir), "utf-8");
        stored = JSON.parse(raw);
    }
    catch {
        // Missing, unreadable, or malformed file: behave as if no runs exist.
        return { runs: [] };
    }
    const hasRunList = Boolean(stored) && Array.isArray(stored.runs);
    return hasRunList ? stored : { runs: [] };
}
|
|
23
|
+
/**
 * Append a run to the skill's activation history and persist it.
 *
 * The skill directory is created when missing. Once the list grows beyond
 * `MAX_RUNS`, the entries at the front (the earliest appended) are dropped so
 * only the last `MAX_RUNS` remain.
 *
 * @param {string} skillDir - Directory of the skill.
 * @param {object} run - Run record to append.
 * @returns {Promise<void>}
 */
export async function writeActivationRun(skillDir, run) {
    await mkdir(skillDir, { recursive: true });
    const stored = await readHistoryFile(skillDir);
    stored.runs.push(run);
    // Number of entries beyond the cap; positive only when pruning is needed.
    const overflow = stored.runs.length - MAX_RUNS;
    if (overflow > 0) {
        stored.runs = stored.runs.slice(overflow);
    }
    const target = historyPath(skillDir);
    const serialized = JSON.stringify(stored, null, 2);
    await writeFile(target, serialized);
}
|
|
33
|
+
/**
 * List stored runs without their `results` payload.
 *
 * @param {string} skillDir - Directory of the skill.
 * @returns {Promise<object[]>} Run summaries in reverse of stored order, so
 *   the most recently appended run comes first.
 */
export async function listActivationRuns(skillDir) {
    const { runs } = await readHistoryFile(skillDir);
    const summaries = runs.map((entry) => {
        // Drop the per-prompt results; keep every other field.
        const { results: _omitted, ...summary } = entry;
        return summary;
    });
    summaries.reverse();
    return summaries;
}
|
|
37
|
+
/**
 * Fetch a single stored run by its id.
 *
 * @param {string} skillDir - Directory of the skill.
 * @param {string} runId - Id of the run to look up.
 * @returns {Promise<object|null>} The first stored run whose `id` strictly
 *   equals `runId`, or `null` when there is none.
 */
export async function getActivationRun(skillDir, runId) {
    const { runs } = await readHistoryFile(skillDir);
    for (const candidate of runs) {
        if (candidate.id === runId) {
            return candidate;
        }
    }
    return null;
}
|
|
41
|
+
//# sourceMappingURL=activation-history.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"activation-history.js","sourceRoot":"","sources":["../../src/eval/activation-history.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,wEAAwE;AACxE,8EAA8E;AAE9E,OAAO,EAAE,QAAQ,EAAE,SAAS,EAAE,KAAK,EAAE,MAAM,kBAAkB,CAAC;AAC9D,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AAyBjC,MAAM,gBAAgB,GAAG,yBAAyB,CAAC;AACnD,MAAM,QAAQ,GAAG,EAAE,CAAC;AAEpB,SAAS,WAAW,CAAC,QAAgB;IACnC,OAAO,IAAI,CAAC,QAAQ,EAAE,gBAAgB,CAAC,CAAC;AAC1C,CAAC;AAED,KAAK,UAAU,eAAe,CAAC,QAAgB;IAC7C,IAAI,CAAC;QACH,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,WAAW,CAAC,QAAQ,CAAC,EAAE,OAAO,CAAC,CAAC;QAC/D,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;QACnC,IAAI,MAAM,IAAI,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,IAAI,CAAC;YAAE,OAAO,MAA+B,CAAC;QACjF,OAAO,EAAE,IAAI,EAAE,EAAE,EAAE,CAAC;IACtB,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,EAAE,IAAI,EAAE,EAAE,EAAE,CAAC;IACtB,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,kBAAkB,CACtC,QAAgB,EAChB,GAAyB;IAEzB,MAAM,KAAK,CAAC,QAAQ,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAC3C,MAAM,OAAO,GAAG,MAAM,eAAe,CAAC,QAAQ,CAAC,CAAC;IAChD,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IACvB,2BAA2B;IAC3B,IAAI,OAAO,CAAC,IAAI,CAAC,MAAM,GAAG,QAAQ,EAAE,CAAC;QACnC,OAAO,CAAC,IAAI,GAAG,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,IAAI,CAAC,MAAM,GAAG,QAAQ,CAAC,CAAC;IACpE,CAAC;IACD,MAAM,SAAS,CAAC,WAAW,CAAC,QAAQ,CAAC,EAAE,IAAI,CAAC,SAAS,CAAC,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;AAC3E,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,kBAAkB,CACtC,QAAgB;IAEhB,MAAM,OAAO,GAAG,MAAM,eAAe,CAAC,QAAQ,CAAC,CAAC;IAChD,OAAO,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,OAAO,EAAE,QAAQ,EAAE,GAAG,IAAI,EAAE,EAAE,EAAE,CAAC,IAAI,CAAC,CAAC,OAAO,EAAE,CAAC;AAC9E,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,gBAAgB,CACpC,QAAgB,EAChB,KAAa;IAEb,MAAM,OAAO,GAAG,MAAM,eAAe,CAAC,QAAQ,CAAC,CAAC;IAChD,OAAO,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,KAAK,KAAK,CAAC,IAAI,IAAI,CAAC;AAC1D,CAAC"}
|
|
@@ -19,9 +19,10 @@ import { createLlmClient } from "../eval/llm.js";
|
|
|
19
19
|
import { runComparison } from "../eval/comparator.js";
|
|
20
20
|
import { computeVerdict } from "../eval/verdict.js";
|
|
21
21
|
import { generateActionItems } from "../eval/action-items.js";
|
|
22
|
-
import { buildEvalInitPrompt, parseGeneratedEvals } from "../eval/prompt-builder.js";
|
|
22
|
+
import { buildEvalInitPrompt, parseGeneratedEvals, buildIntegrationEvalPrompt, parseGeneratedIntegrationEvals, detectBrowserRequirements, detectPlatformTargets } from "../eval/prompt-builder.js";
|
|
23
23
|
import { testActivation } from "../eval/activation-tester.js";
|
|
24
24
|
import { detectMcpDependencies, detectSkillDependencies } from "../eval/mcp-detector.js";
|
|
25
|
+
import { writeActivationRun, listActivationRuns, getActivationRun } from "../eval/activation-history.js";
|
|
25
26
|
// ---------------------------------------------------------------------------
|
|
26
27
|
// In-memory config state — UI can change provider/model at runtime.
|
|
27
28
|
//
|
|
@@ -468,6 +469,7 @@ export function registerRoutes(router, root, projectName) {
|
|
|
468
469
|
sendJson(res, body, 200, req);
|
|
469
470
|
});
|
|
470
471
|
// Generate evals using AI — reads SKILL.md and returns generated EvalsFile
|
|
472
|
+
// Accepts optional { provider, model, testType } in request body
|
|
471
473
|
router.post("/api/skills/:plugin/:skill/generate-evals", async (req, res, params) => {
|
|
472
474
|
const skillDir = resolveSkillDir(root, params.plugin, params.skill);
|
|
473
475
|
const skillMdPath = join(skillDir, "SKILL.md");
|
|
@@ -481,47 +483,114 @@ export function registerRoutes(router, root, projectName) {
|
|
|
481
483
|
res.on("close", () => { aborted = true; });
|
|
482
484
|
if (wantsSSE)
|
|
483
485
|
initSSE(res, req);
|
|
486
|
+
// Read optional body params for model selection + test type
|
|
487
|
+
const body = await readBody(req).catch(() => ({}));
|
|
488
|
+
// Build per-request client: use body overrides if provided, else global
|
|
489
|
+
const overrides = { ...currentOverrides };
|
|
490
|
+
if (body.provider)
|
|
491
|
+
overrides.provider = body.provider;
|
|
492
|
+
if (body.model)
|
|
493
|
+
overrides.model = body.model;
|
|
494
|
+
const isIntegration = body.testType === "integration";
|
|
484
495
|
try {
|
|
485
496
|
if (wantsSSE && !aborted)
|
|
486
497
|
sendSSE(res, "progress", { phase: "preparing", message: "Reading skill content..." });
|
|
487
498
|
const skillContent = readFileSync(skillMdPath, "utf-8");
|
|
488
|
-
|
|
489
|
-
|
|
499
|
+
// Build prompt based on test type
|
|
500
|
+
let prompt;
|
|
501
|
+
if (isIntegration) {
|
|
502
|
+
const mcpDeps = detectMcpDependencies(skillContent);
|
|
503
|
+
const browserReqs = detectBrowserRequirements(skillContent);
|
|
504
|
+
const platforms = detectPlatformTargets(skillContent);
|
|
505
|
+
prompt = buildIntegrationEvalPrompt(skillContent, mcpDeps, browserReqs, platforms);
|
|
506
|
+
}
|
|
507
|
+
else {
|
|
508
|
+
prompt = buildEvalInitPrompt(skillContent);
|
|
509
|
+
}
|
|
510
|
+
const client = createLlmClient(overrides);
|
|
490
511
|
if (wantsSSE && !aborted)
|
|
491
|
-
sendSSE(res, "progress", {
|
|
512
|
+
sendSSE(res, "progress", {
|
|
513
|
+
phase: "generating",
|
|
514
|
+
message: `Generating ${isIntegration ? "integration" : "unit"} test cases...`,
|
|
515
|
+
});
|
|
492
516
|
const genResult = wantsSSE
|
|
493
|
-
? await withHeartbeat(res, undefined, "generating",
|
|
517
|
+
? await withHeartbeat(res, undefined, "generating", `Generating ${isIntegration ? "integration" : "unit"} test cases`, () => client.generate("You generate eval test cases for AI skills. Output only valid JSON in a code fence.", prompt))
|
|
494
518
|
: await client.generate("You generate eval test cases for AI skills. Output only valid JSON in a code fence.", prompt);
|
|
495
519
|
if (aborted)
|
|
496
520
|
return;
|
|
497
521
|
if (wantsSSE && !aborted)
|
|
498
522
|
sendSSE(res, "progress", { phase: "parsing", message: "Parsing generated evals..." });
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
523
|
+
// Parse based on test type
|
|
524
|
+
if (isIntegration) {
|
|
525
|
+
const integrationCases = parseGeneratedIntegrationEvals(genResult.text);
|
|
526
|
+
// Load existing evals to merge and avoid ID collisions
|
|
527
|
+
let existingEvals = null;
|
|
528
|
+
try {
|
|
529
|
+
existingEvals = loadAndValidateEvals(skillDir);
|
|
530
|
+
}
|
|
531
|
+
catch (e) {
|
|
532
|
+
if (e.code !== "ENOENT" &&
|
|
533
|
+
!(e instanceof Error && e.message.includes("ENOENT"))) {
|
|
534
|
+
throw e;
|
|
535
|
+
}
|
|
536
|
+
// File doesn't exist — no existing evals, proceed with empty
|
|
537
|
+
}
|
|
538
|
+
const existingIds = existingEvals?.evals.map((e) => e.id) ?? [];
|
|
539
|
+
const maxId = existingIds.length > 0 ? Math.max(...existingIds) : 0;
|
|
540
|
+
// Re-number integration cases to avoid collisions
|
|
541
|
+
const reNumbered = integrationCases.map((c, i) => ({ ...c, id: maxId + 1 + i }));
|
|
542
|
+
const mergedEvals = {
|
|
543
|
+
skill_name: existingEvals?.skill_name || params.skill,
|
|
544
|
+
evals: [...(existingEvals?.evals || []), ...reNumbered],
|
|
545
|
+
};
|
|
546
|
+
// Record history
|
|
547
|
+
try {
|
|
548
|
+
await writeHistoryEntry(skillDir, {
|
|
549
|
+
timestamp: new Date().toISOString(),
|
|
550
|
+
model: client.model,
|
|
551
|
+
skill_name: mergedEvals.skill_name,
|
|
552
|
+
cases: [],
|
|
553
|
+
overall_pass_rate: undefined,
|
|
554
|
+
type: "eval-generate",
|
|
555
|
+
provider: overrides.provider || "claude-cli",
|
|
556
|
+
generate: { prompt, result: JSON.stringify(mergedEvals) },
|
|
557
|
+
});
|
|
558
|
+
}
|
|
559
|
+
catch { /* history write failure should not break the main response */ }
|
|
560
|
+
if (wantsSSE && !aborted) {
|
|
561
|
+
sendSSEDone(res, mergedEvals);
|
|
562
|
+
}
|
|
563
|
+
else {
|
|
564
|
+
sendJson(res, mergedEvals, 200, req);
|
|
565
|
+
}
|
|
517
566
|
}
|
|
518
567
|
else {
|
|
519
|
-
|
|
568
|
+
const evalsFile = parseGeneratedEvals(genResult.text);
|
|
569
|
+
// Record history entry for eval generation
|
|
570
|
+
try {
|
|
571
|
+
await writeHistoryEntry(skillDir, {
|
|
572
|
+
timestamp: new Date().toISOString(),
|
|
573
|
+
model: client.model,
|
|
574
|
+
skill_name: evalsFile.skill_name || params.skill,
|
|
575
|
+
cases: [],
|
|
576
|
+
overall_pass_rate: undefined,
|
|
577
|
+
type: "eval-generate",
|
|
578
|
+
provider: overrides.provider || "claude-cli",
|
|
579
|
+
generate: { prompt, result: JSON.stringify(evalsFile) },
|
|
580
|
+
});
|
|
581
|
+
}
|
|
582
|
+
catch { /* history write failure should not break the main response */ }
|
|
583
|
+
if (wantsSSE && !aborted) {
|
|
584
|
+
sendSSEDone(res, evalsFile);
|
|
585
|
+
}
|
|
586
|
+
else {
|
|
587
|
+
sendJson(res, evalsFile, 200, req);
|
|
588
|
+
}
|
|
520
589
|
}
|
|
521
590
|
}
|
|
522
591
|
catch (err) {
|
|
523
592
|
if (wantsSSE && !aborted) {
|
|
524
|
-
sendSSE(res, "error", classifyError(err,
|
|
593
|
+
sendSSE(res, "error", classifyError(err, overrides.provider || "claude-cli"));
|
|
525
594
|
res.end();
|
|
526
595
|
}
|
|
527
596
|
else {
|
|
@@ -1033,13 +1102,40 @@ export function registerRoutes(router, root, projectName) {
|
|
|
1033
1102
|
name: nameMatch ? nameMatch[1] : params.skill,
|
|
1034
1103
|
tags: tagsMatch ? tagsMatch[1].split(",").map((t) => t.trim()).filter(Boolean) : [],
|
|
1035
1104
|
};
|
|
1036
|
-
|
|
1105
|
+
// Use per-request model overrides if provided, fall back to global config
|
|
1106
|
+
const client = body.provider || body.model
|
|
1107
|
+
? createLlmClient({ provider: body.provider, model: body.model })
|
|
1108
|
+
: getClient();
|
|
1037
1109
|
const summary = await testActivation(description, body.prompts, client, (result) => {
|
|
1038
1110
|
if (!aborted) {
|
|
1039
1111
|
sendSSE(res, "prompt_result", result);
|
|
1040
1112
|
}
|
|
1041
1113
|
}, meta);
|
|
1042
1114
|
if (!aborted) {
|
|
1115
|
+
// Write activation history entry
|
|
1116
|
+
const usedProvider = body.provider || currentOverrides.provider || "unknown";
|
|
1117
|
+
const usedModel = body.model || currentOverrides.model || "unknown";
|
|
1118
|
+
const run = {
|
|
1119
|
+
id: `run-${Date.now()}`,
|
|
1120
|
+
timestamp: new Date().toISOString(),
|
|
1121
|
+
model: usedModel,
|
|
1122
|
+
provider: usedProvider,
|
|
1123
|
+
promptCount: summary.total,
|
|
1124
|
+
summary: {
|
|
1125
|
+
precision: summary.precision,
|
|
1126
|
+
recall: summary.recall,
|
|
1127
|
+
reliability: summary.reliability,
|
|
1128
|
+
tp: summary.tp,
|
|
1129
|
+
tn: summary.tn,
|
|
1130
|
+
fp: summary.fp,
|
|
1131
|
+
fn: summary.fn,
|
|
1132
|
+
},
|
|
1133
|
+
results: summary.results,
|
|
1134
|
+
};
|
|
1135
|
+
try {
|
|
1136
|
+
await writeActivationRun(skillDir, run);
|
|
1137
|
+
}
|
|
1138
|
+
catch { /* non-blocking */ }
|
|
1043
1139
|
sendSSEDone(res, { ...summary, description });
|
|
1044
1140
|
}
|
|
1045
1141
|
}
|
|
@@ -1047,6 +1143,81 @@ export function registerRoutes(router, root, projectName) {
|
|
|
1047
1143
|
sendSSEDone(res, { error: err instanceof Error ? err.message : String(err) });
|
|
1048
1144
|
}
|
|
1049
1145
|
});
|
|
1146
|
+
// AI-generate activation test prompts (SSE).
//
// POST /api/skills/:plugin/:skill/activation-prompts
// Optional JSON body: { count, provider, model }.
//   - count:    number of prompts to request (positive integer, default 8);
//               half (rounded up) should activate the skill, the rest should not.
//   - provider / model: when either is present a per-request LLM client is
//               created, otherwise the shared client from getClient() is used.
//
// Responses:
//   - 404 JSON when SKILL.md is missing, 400 JSON when no description is found.
//   - Otherwise an SSE stream: one "prompt_generated" event per parsed prompt,
//     then a done event carrying { prompts } (or { error } on failure).
//   - Failures that happen before the stream is opened are reported as a
//     500 JSON response instead of an SSE frame.
router.post("/api/skills/:plugin/:skill/activation-prompts", async (req, res, params) => {
    const skillDir = resolveSkillDir(root, params.plugin, params.skill);
    let aborted = false;
    // Set once initSSE() has run; until then errors must go out as plain JSON.
    let sseStarted = false;
    res.on("close", () => { aborted = true; });
    try {
        // Every body field is optional, so an unreadable/missing body falls back
        // to defaults (same tolerance as the generate-evals route).
        const body = (await readBody(req).catch(() => ({}))) ?? {};
        const skillMdPath = join(skillDir, "SKILL.md");
        if (!existsSync(skillMdPath)) {
            sendJson(res, { error: "SKILL.md not found" }, 404, req);
            return;
        }
        const skillContent = readFileSync(skillMdPath, "utf-8");
        // Only a double-quoted `description: "..."` inside the leading `---` block is matched.
        const descMatch = skillContent.match(/^---[\s\S]*?description:\s*"([^"]+)"[\s\S]*?---/);
        const description = descMatch ? descMatch[1] : "";
        if (!description) {
            sendJson(res, { error: "No skill description available" }, 400, req);
            return;
        }
        initSSE(res, req);
        sseStarted = true;
        // Accept numbers and numeric strings; anything that is not a positive
        // integer (missing, NaN, fractional, <= 0) falls back to the default of 8.
        const requestedCount = Number(body.count);
        const count = Number.isInteger(requestedCount) && requestedCount > 0 ? requestedCount : 8;
        const half = Math.ceil(count / 2);
        const client = body.provider || body.model
            ? createLlmClient({ provider: body.provider, model: body.model })
            : getClient();
        const systemPrompt = `Given this skill description, generate test prompts to evaluate activation quality.
Generate ${count} prompts: ${half} that SHOULD activate this skill, ${count - half} that should NOT.
For "should not" prompts, make them plausible but clearly outside this skill's domain.
Return one JSON object per line: {"prompt": "...", "expected": "should_activate"|"should_not_activate"}
Return ONLY the JSON lines, no other text.`;
        const userPrompt = `Skill description: ${description}`;
        const { text } = await client.generate(systemPrompt, userPrompt);
        if (aborted)
            return;
        const allPrompts = [];
        const lines = text.split("\n").filter((l) => l.trim());
        for (const line of lines) {
            try {
                // Tolerate a code fence wrapped around (or sharing a line with) the JSON.
                const cleaned = line.replace(/^```(?:json)?\s*/i, "").replace(/```\s*$/i, "").trim();
                if (!cleaned.startsWith("{"))
                    continue;
                const parsed = JSON.parse(cleaned);
                if (parsed.prompt && parsed.expected) {
                    // Stream exactly what ends up in the final list, so per-event
                    // payloads and the done payload always agree.
                    const entry = { prompt: parsed.prompt, expected: parsed.expected };
                    allPrompts.push(entry);
                    if (!aborted)
                        sendSSE(res, "prompt_generated", entry);
                }
            }
            catch { /* skip malformed lines */ }
        }
        if (!aborted)
            sendSSEDone(res, { prompts: allPrompts });
    }
    catch (err) {
        if (aborted)
            return;
        const message = err instanceof Error ? err.message : String(err);
        if (sseStarted) {
            sendSSEDone(res, { error: message });
        }
        else {
            // The stream was never opened: answer with a regular JSON error.
            sendJson(res, { error: message }, 500, req);
        }
    }
});
|
|
1205
|
+
// GET /api/skills/:plugin/:skill/activation-history
// Responds 200 with { runs }, where `runs` is whatever listActivationRuns()
// yields for the resolved skill directory (presumably run summaries only —
// confirm against the activation-history module).
router.get("/api/skills/:plugin/:skill/activation-history", async (req, res, params) => {
    const { plugin, skill } = params;
    const runs = await listActivationRuns(resolveSkillDir(root, plugin, skill));
    sendJson(res, { runs }, 200, req);
});
|
|
1211
|
+
// GET /api/skills/:plugin/:skill/activation-history/:runId
// Looks up a single activation test run by its ID in the resolved skill
// directory. Responds 200 with the run, or 404 with an error payload when
// getActivationRun() yields a falsy value.
router.get("/api/skills/:plugin/:skill/activation-history/:runId", async (req, res, params) => {
    const { plugin, skill, runId } = params;
    const run = await getActivationRun(resolveSkillDir(root, plugin, skill), runId);
    if (run) {
        sendJson(res, run, 200, req);
        return;
    }
    sendJson(res, { error: "Run not found" }, 404, req);
});
|
|
1050
1221
|
// Get skill dependencies (MCP + skill-to-skill)
|
|
1051
1222
|
router.get("/api/skills/:plugin/:skill/dependencies", async (req, res, params) => {
|
|
1052
1223
|
const skillDir = resolveSkillDir(root, params.plugin, params.skill);
|