vskill 0.5.21 → 0.5.22
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/commands/eval/sweep.d.ts +1 -0
- package/dist/commands/eval/sweep.js +78 -18
- package/dist/commands/eval/sweep.js.map +1 -1
- package/dist/commands/eval.d.ts +1 -0
- package/dist/commands/eval.js +1 -0
- package/dist/commands/eval.js.map +1 -1
- package/dist/eval/activation-history.d.ts +21 -0
- package/dist/eval/activation-history.js +41 -0
- package/dist/eval/activation-history.js.map +1 -0
- package/dist/eval-server/api-routes.js +104 -1
- package/dist/eval-server/api-routes.js.map +1 -1
- package/dist/eval-server/skill-create-routes.d.ts +11 -0
- package/dist/eval-server/skill-create-routes.js +131 -4
- package/dist/eval-server/skill-create-routes.js.map +1 -1
- package/dist/eval-server/sweep-routes.js +1 -0
- package/dist/eval-server/sweep-routes.js.map +1 -1
- package/dist/eval-server/sweep-runner.d.ts +33 -0
- package/dist/eval-server/sweep-runner.js +233 -84
- package/dist/eval-server/sweep-runner.js.map +1 -1
- package/dist/eval-ui/assets/index-BM579QSw.js +73 -0
- package/dist/eval-ui/assets/index-DcbzllbY.css +1 -0
- package/dist/eval-ui/index.html +2 -2
- package/dist/index.js +1 -0
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/eval-ui/assets/index-C9_Pey9T.css +0 -1
- package/dist/eval-ui/assets/index-KfkLPyh3.js +0 -74
|
@@ -6,7 +6,7 @@ import { join } from "node:path";
|
|
|
6
6
|
import { loadAndValidateEvals, EvalValidationError } from "../../eval/schema.js";
|
|
7
7
|
import { buildEvalSystemPrompt } from "../../eval/prompt-builder.js";
|
|
8
8
|
import { runSweep } from "../../eval-server/sweep-runner.js";
|
|
9
|
-
import { green, red, bold, dim, table } from "../../utils/output.js";
|
|
9
|
+
import { green, red, yellow, bold, dim, table } from "../../utils/output.js";
|
|
10
10
|
export async function runEvalSweep(skillDir, options) {
|
|
11
11
|
// Load and validate evals.json
|
|
12
12
|
let evalsFile;
|
|
@@ -34,11 +34,19 @@ export async function runEvalSweep(skillDir, options) {
|
|
|
34
34
|
}
|
|
35
35
|
const runs = options.runs ?? 1;
|
|
36
36
|
const concurrency = options.concurrency ?? 5;
|
|
37
|
+
const baseline = options.baseline ?? false;
|
|
37
38
|
console.log(bold(`\nSweep: ${evalsFile.skill_name}`));
|
|
38
39
|
console.log(dim(`Models: ${modelList.join(", ")}`));
|
|
39
40
|
console.log(dim(`Judge: ${options.judge}`));
|
|
40
41
|
console.log(dim(`Runs per model: ${runs}`));
|
|
41
|
-
console.log(dim(`Cases: ${evalsFile.evals.length}
|
|
42
|
+
console.log(dim(`Cases: ${evalsFile.evals.length}`));
|
|
43
|
+
if (baseline)
|
|
44
|
+
console.log(dim(`Baseline: enabled (comparing with vs without skill)`));
|
|
45
|
+
console.log("");
|
|
46
|
+
// Warn about low run count
|
|
47
|
+
if (runs < 3) {
|
|
48
|
+
console.log(yellow(`Note: ${runs} run(s) may not produce statistically meaningful results. Use --runs 3+ for reliable ranking.\n`));
|
|
49
|
+
}
|
|
42
50
|
let sweepResult = null;
|
|
43
51
|
for await (const event of runSweep({
|
|
44
52
|
skillDir,
|
|
@@ -49,17 +57,33 @@ export async function runEvalSweep(skillDir, options) {
|
|
|
49
57
|
judge: options.judge,
|
|
50
58
|
runs,
|
|
51
59
|
concurrency,
|
|
60
|
+
baseline,
|
|
52
61
|
})) {
|
|
53
62
|
switch (event.type) {
|
|
63
|
+
case "sweep_judge_bias_warning":
|
|
64
|
+
console.log(yellow(`WARNING: ${event.data.warning}\n`));
|
|
65
|
+
break;
|
|
54
66
|
case "sweep_model_start":
|
|
55
67
|
process.stdout.write(dim(`[${event.data.modelIndex + 1}/${event.data.totalModels}] ${event.data.model} — `));
|
|
56
68
|
break;
|
|
57
|
-
case "sweep_model_progress":
|
|
58
|
-
|
|
69
|
+
case "sweep_model_progress": {
|
|
70
|
+
const phaseLabel = event.data.phase === "baseline" ? " [baseline]" : "";
|
|
71
|
+
process.stdout.write(dim(`\r[${event.data.model}${phaseLabel}] run ${event.data.run}/${event.data.totalRuns} case ${event.data.currentCase}/${event.data.totalCases} (${event.data.percentComplete}%)`));
|
|
59
72
|
break;
|
|
73
|
+
}
|
|
60
74
|
case "sweep_model_complete":
|
|
61
75
|
if (event.data.status === "complete" && event.data.passRate) {
|
|
62
|
-
|
|
76
|
+
let summary = ` done (pass rate: ${(event.data.passRate.mean * 100).toFixed(1)}%)`;
|
|
77
|
+
if (event.data.baselinePassRate && event.data.skillDelta) {
|
|
78
|
+
const delta = event.data.skillDelta.mean * 100;
|
|
79
|
+
const sign = delta >= 0 ? "+" : "";
|
|
80
|
+
summary += ` | baseline: ${(event.data.baselinePassRate.mean * 100).toFixed(1)}% | delta: ${sign}${delta.toFixed(1)}pp`;
|
|
81
|
+
if (event.data.amplificationPct != null && isFinite(event.data.amplificationPct)) {
|
|
82
|
+
const ampSign = event.data.amplificationPct >= 0 ? "+" : "";
|
|
83
|
+
summary += ` (${ampSign}${event.data.amplificationPct.toFixed(1)}%)`;
|
|
84
|
+
}
|
|
85
|
+
}
|
|
86
|
+
console.log(green(summary));
|
|
63
87
|
}
|
|
64
88
|
else {
|
|
65
89
|
console.log(red(` error: ${event.data.errorMessage || "unknown"}`));
|
|
@@ -75,19 +99,55 @@ export async function runEvalSweep(skillDir, options) {
|
|
|
75
99
|
process.exit(1);
|
|
76
100
|
return;
|
|
77
101
|
}
|
|
78
|
-
//
|
|
79
|
-
const
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
102
|
+
// Sort by composite score (if available) then by pass rate
|
|
103
|
+
const sorted = [...sweepResult.models].sort((a, b) => {
|
|
104
|
+
if (a.compositeScore != null && b.compositeScore != null) {
|
|
105
|
+
return b.compositeScore - a.compositeScore;
|
|
106
|
+
}
|
|
107
|
+
return b.passRate.mean - a.passRate.mean;
|
|
108
|
+
});
|
|
109
|
+
// Build table based on whether baseline was used
|
|
110
|
+
if (baseline) {
|
|
111
|
+
const headers = ["RANK", "MODEL", "WITH SKILL", "WITHOUT SKILL", "DELTA", "AMPLIFICATION", "STATUS"];
|
|
112
|
+
const rows = sorted.map((m, i) => [
|
|
113
|
+
String(i + 1),
|
|
114
|
+
`${m.provider}/${m.model}`,
|
|
115
|
+
m.status === "complete" ? formatStats(m.passRate, true) : "-",
|
|
116
|
+
m.status === "complete" && m.baselinePassRate ? formatStats(m.baselinePassRate, true) : "-",
|
|
117
|
+
m.status === "complete" && m.skillDelta
|
|
118
|
+
? `${m.skillDelta.mean >= 0 ? "+" : ""}${(m.skillDelta.mean * 100).toFixed(1)}pp`
|
|
119
|
+
: "-",
|
|
120
|
+
m.status === "complete" && m.amplificationPct != null && isFinite(m.amplificationPct)
|
|
121
|
+
? `${m.amplificationPct >= 0 ? "+" : ""}${m.amplificationPct.toFixed(1)}%`
|
|
122
|
+
: "-",
|
|
123
|
+
m.status === "complete" ? green("OK") : red("ERR"),
|
|
124
|
+
]);
|
|
125
|
+
console.log(bold("\nSweep Results (Skill Amplification)\n"));
|
|
126
|
+
console.log(table(headers, rows));
|
|
127
|
+
// Skill quality badge
|
|
128
|
+
if (sweepResult.skillQualityScore != null && sweepResult.skillQualityRating) {
|
|
129
|
+
const ratingColors = {
|
|
130
|
+
excellent: green, good: green, marginal: yellow, minimal: yellow, harmful: red,
|
|
131
|
+
};
|
|
132
|
+
const colorFn = ratingColors[sweepResult.skillQualityRating] ?? dim;
|
|
133
|
+
const sign = sweepResult.skillQualityScore >= 0 ? "+" : "";
|
|
134
|
+
const label = `${sign}${sweepResult.skillQualityScore.toFixed(1)}% (${sweepResult.skillQualityRating.toUpperCase()})`;
|
|
135
|
+
console.log(`\nSkill Quality: ${colorFn(label)}`);
|
|
136
|
+
}
|
|
137
|
+
}
|
|
138
|
+
else {
|
|
139
|
+
const headers = ["RANK", "MODEL", "PASS RATE", "DURATION", "COST", "STATUS"];
|
|
140
|
+
const rows = sorted.map((m, i) => [
|
|
141
|
+
String(i + 1),
|
|
142
|
+
`${m.provider}/${m.model}`,
|
|
143
|
+
m.status === "complete" ? formatStats(m.passRate, true) : "-",
|
|
144
|
+
m.status === "complete" ? formatStats(m.duration, false, "ms") : "-",
|
|
145
|
+
m.cost.total > 0 ? `$${m.cost.total.toFixed(4)}` : "-",
|
|
146
|
+
m.status === "complete" ? green("OK") : red("ERR"),
|
|
147
|
+
]);
|
|
148
|
+
console.log(bold("\nSweep Results\n"));
|
|
149
|
+
console.log(table(headers, rows));
|
|
150
|
+
}
|
|
91
151
|
console.log(dim(`\nLeaderboard saved to ${skillDir}/evals/leaderboard/`));
|
|
92
152
|
}
|
|
93
153
|
function formatStats(stats, asPercent, suffix = "") {
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"sweep.js","sourceRoot":"","sources":["../../../src/commands/eval/sweep.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,wDAAwD;AACxD,8EAA8E;AAE9E,OAAO,EAAE,UAAU,EAAE,YAAY,EAAE,MAAM,SAAS,CAAC;AACnD,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,OAAO,EAAE,oBAAoB,EAAE,mBAAmB,EAAE,MAAM,sBAAsB,CAAC;AACjF,OAAO,EAAE,qBAAqB,EAAE,MAAM,8BAA8B,CAAC;AACrE,OAAO,EAAE,QAAQ,EAAE,MAAM,mCAAmC,CAAC;AAE7D,OAAO,EAAE,KAAK,EAAE,GAAG,
|
|
1
|
+
{"version":3,"file":"sweep.js","sourceRoot":"","sources":["../../../src/commands/eval/sweep.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,wDAAwD;AACxD,8EAA8E;AAE9E,OAAO,EAAE,UAAU,EAAE,YAAY,EAAE,MAAM,SAAS,CAAC;AACnD,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,OAAO,EAAE,oBAAoB,EAAE,mBAAmB,EAAE,MAAM,sBAAsB,CAAC;AACjF,OAAO,EAAE,qBAAqB,EAAE,MAAM,8BAA8B,CAAC;AACrE,OAAO,EAAE,QAAQ,EAAE,MAAM,mCAAmC,CAAC;AAE7D,OAAO,EAAE,KAAK,EAAE,GAAG,EAAE,MAAM,EAAE,IAAI,EAAE,GAAG,EAAE,KAAK,EAAE,MAAM,uBAAuB,CAAC;AAU7E,MAAM,CAAC,KAAK,UAAU,YAAY,CAAC,QAAgB,EAAE,OAAqB;IACxE,+BAA+B;IAC/B,IAAI,SAAS,CAAC;IACd,IAAI,CAAC;QACH,SAAS,GAAG,oBAAoB,CAAC,QAAQ,CAAC,CAAC;IAC7C,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACb,IAAI,GAAG,YAAY,mBAAmB,EAAE,CAAC;YACvC,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,uBAAuB,GAAG,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;QAC3D,CAAC;aAAM,CAAC;YACN,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,wBAAyB,GAAa,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;QACvE,CAAC;QACD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAChB,OAAO;IACT,CAAC;IAED,MAAM,WAAW,GAAG,IAAI,CAAC,QAAQ,EAAE,UAAU,CAAC,CAAC;IAC/C,MAAM,YAAY,GAAG,UAAU,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,YAAY,CAAC,WAAW,EAAE,OAAO,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;IACvF,MAAM,YAAY,GAAG,qBAAqB,CAAC,YAAY,CAAC,CAAC;IACzD,MAAM,SAAS,GAAG,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;IAEjF,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC3B,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,mEAAmE,CAAC,CAAC,CAAC;QACxF,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAChB,OAAO;IACT,CAAC;IAED,MAAM,IAAI,GAAG,OAAO,CAAC,IAAI,IAAI,CAAC,CAAC;IAC/B,MAAM,WAAW,GAAG,OAAO,CAAC,WAAW,IAAI,CAAC,CAAC;IAC7C,MAAM,QAAQ,GAAG,OAAO,CAAC,QAAQ,IAAI,KAAK,CAAC;IAE3C,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,YAAY,SAAS,CAAC,UAAU,EAAE,CAAC,CAAC,CAAC;IACtD,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,WAAW,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC;IACpD,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,UAAU,OAAO,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;IAC5C,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,mBAAmB,IAAI,EAAE,CAAC,CAAC,CAAC;IAC5C,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,UAAU,SAAS,CA
AC,KAAK,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;IACrD,IAAI,QAAQ;QAAE,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,qDAAqD,CAAC,CAAC,CAAC;IACtF,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;IAEhB,2BAA2B;IAC3B,IAAI,IAAI,GAAG,CAAC,EAAE,CAAC;QACb,OAAO,CAAC,GAAG,CAAC,MAAM,CAAC,SAAS,IAAI,iGAAiG,CAAC,CAAC,CAAC;IACtI,CAAC;IAED,IAAI,WAAW,GAAuB,IAAI,CAAC;IAE3C,IAAI,KAAK,EAAE,MAAM,KAAK,IAAI,QAAQ,CAAC;QACjC,QAAQ;QACR,SAAS,EAAE,SAAS,CAAC,UAAU;QAC/B,YAAY;QACZ,SAAS,EAAE,SAAS,CAAC,KAAK;QAC1B,MAAM,EAAE,SAAS;QACjB,KAAK,EAAE,OAAO,CAAC,KAAK;QACpB,IAAI;QACJ,WAAW;QACX,QAAQ;KACT,CAAC,EAAE,CAAC;QACH,QAAQ,KAAK,CAAC,IAAI,EAAE,CAAC;YACnB,KAAK,0BAA0B;gBAC7B,OAAO,CAAC,GAAG,CAAC,MAAM,CAAC,YAAY,KAAK,CAAC,IAAI,CAAC,OAAO,IAAI,CAAC,CAAC,CAAC;gBACxD,MAAM;YAER,KAAK,mBAAmB;gBACtB,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,KAAK,CAAC,IAAI,CAAC,UAAU,GAAG,CAAC,IAAI,KAAK,CAAC,IAAI,CAAC,WAAW,KAAK,KAAK,CAAC,IAAI,CAAC,KAAK,KAAK,CAAC,CAAC,CAAC;gBAC7G,MAAM;YAER,KAAK,sBAAsB,CAAC,CAAC,CAAC;gBAC5B,MAAM,UAAU,GAAG,KAAK,CAAC,IAAI,CAAC,KAAK,KAAK,UAAU,CAAC,CAAC,CAAC,aAAa,CAAC,CAAC,CAAC,EAAE,CAAC;gBACxE,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,MAAM,KAAK,CAAC,IAAI,CAAC,KAAK,GAAG,UAAU,SAAS,KAAK,CAAC,IAAI,CAAC,GAAG,IAAI,KAAK,CAAC,IAAI,CAAC,SAAS,SAAS,KAAK,CAAC,IAAI,CAAC,WAAW,IAAI,KAAK,CAAC,IAAI,CAAC,UAAU,KAAK,KAAK,CAAC,IAAI,CAAC,eAAe,IAAI,CAAC,CAAC,CAAC;gBACzM,MAAM;YACR,CAAC;YAED,KAAK,sBAAsB;gBACzB,IAAI,KAAK,CAAC,IAAI,CAAC,MAAM,KAAK,UAAU,IAAI,KAAK,CAAC,IAAI,CAAC,QAAQ,EAAE,CAAC;oBAC5D,IAAI,OAAO,GAAG,qBAAqB,CAAC,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,IAAI,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC;oBACnF,IAAI,KAAK,CAAC,IAAI,CAAC,gBAAgB,IAAI,KAAK,CAAC,IAAI,CAAC,UAAU,EAAE,CAAC;wBACzD,MAAM,KAAK,GAAG,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,IAAI,GAAG,GAAG,CAAC;wBAC/C,MAAM,IAAI,GAAG,KAAK,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC;wBACnC,OAAO,IAAI,gBAAgB,CAAC,KAAK,CAAC,IAAI,CAAC,gBAAgB,CAAC,IAAI,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,cAAc,IAAI,GAAG,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC;wBACxH,IAAI,KAAK,CAAC,IAAI,CAAC,gBAAgB,IAAI,IAAI,IAAI,QAAQ,CAAC,KAAK,CAAC,IAAI,CA
AC,gBAAgB,CAAC,EAAE,CAAC;4BACjF,MAAM,OAAO,GAAG,KAAK,CAAC,IAAI,CAAC,gBAAgB,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC;4BAC5D,OAAO,IAAI,KAAK,OAAO,GAAG,KAAK,CAAC,IAAI,CAAC,gBAAgB,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC;wBACvE,CAAC;oBACH,CAAC;oBACD,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC;gBAC9B,CAAC;qBAAM,CAAC;oBACN,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,WAAW,KAAK,CAAC,IAAI,CAAC,YAAY,IAAI,SAAS,EAAE,CAAC,CAAC,CAAC;gBACtE,CAAC;gBACD,MAAM;YAER,KAAK,gBAAgB;gBACnB,WAAW,GAAG,KAAK,CAAC,IAAI,CAAC;gBACzB,MAAM;QACV,CAAC;IACH,CAAC;IAED,IAAI,CAAC,WAAW,EAAE,CAAC;QACjB,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,oCAAoC,CAAC,CAAC,CAAC;QACzD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAChB,OAAO;IACT,CAAC;IAED,2DAA2D;IAC3D,MAAM,MAAM,GAAG,CAAC,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;QACnD,IAAI,CAAC,CAAC,cAAc,IAAI,IAAI,IAAI,CAAC,CAAC,cAAc,IAAI,IAAI,EAAE,CAAC;YACzD,OAAO,CAAC,CAAC,cAAc,GAAG,CAAC,CAAC,cAAc,CAAC;QAC7C,CAAC;QACD,OAAO,CAAC,CAAC,QAAQ,CAAC,IAAI,GAAG,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC;IAC3C,CAAC,CAAC,CAAC;IAEH,iDAAiD;IACjD,IAAI,QAAQ,EAAE,CAAC;QACb,MAAM,OAAO,GAAG,CAAC,MAAM,EAAE,OAAO,EAAE,YAAY,EAAE,eAAe,EAAE,OAAO,EAAE,eAAe,EAAE,QAAQ,CAAC,CAAC;QACrG,MAAM,IAAI,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;YAChC,MAAM,CAAC,CAAC,GAAG,CAAC,CAAC;YACb,GAAG,CAAC,CAAC,QAAQ,IAAI,CAAC,CAAC,KAAK,EAAE;YAC1B,CAAC,CAAC,MAAM,KAAK,UAAU,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,QAAQ,EAAE,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG;YAC7D,CAAC,CAAC,MAAM,KAAK,UAAU,IAAI,CAAC,CAAC,gBAAgB,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,gBAAgB,EAAE,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG;YAC3F,CAAC,CAAC,MAAM,KAAK,UAAU,IAAI,CAAC,CAAC,UAAU;gBACrC,CAAC,CAAC,GAAG,CAAC,CAAC,UAAU,CAAC,IAAI,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,CAAC,UAAU,CAAC,IAAI,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI;gBACjF,CAAC,CAAC,GAAG;YACP,CAAC,CAAC,MAAM,KAAK,UAAU,IAAI,CAAC,CAAC,gBAAgB,IAAI,IAAI,IAAI,QAAQ,CAAC,CAAC,CAAC,gBAAgB,CAAC;gBACnF,CAAC,CAAC,GAAG,CAAC,CAAC,gBAAgB,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,GAAG,CAAC
,CAAC,gBAAgB,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG;gBAC1E,CAAC,CAAC,GAAG;YACP,CAAC,CAAC,MAAM,KAAK,UAAU,CAAC,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,KAAK,CAAC;SACnD,CAAC,CAAC;QAEH,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,yCAAyC,CAAC,CAAC,CAAC;QAC7D,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,OAAO,EAAE,IAAI,CAAC,CAAC,CAAC;QAElC,sBAAsB;QACtB,IAAI,WAAW,CAAC,iBAAiB,IAAI,IAAI,IAAI,WAAW,CAAC,kBAAkB,EAAE,CAAC;YAC5E,MAAM,YAAY,GAA0C;gBAC1D,SAAS,EAAE,KAAK,EAAE,IAAI,EAAE,KAAK,EAAE,QAAQ,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,OAAO,EAAE,GAAG;aAC/E,CAAC;YACF,MAAM,OAAO,GAAG,YAAY,CAAC,WAAW,CAAC,kBAAkB,CAAC,IAAI,GAAG,CAAC;YACpE,MAAM,IAAI,GAAG,WAAW,CAAC,iBAAiB,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC;YAC3D,MAAM,KAAK,GAAG,GAAG,IAAI,GAAG,WAAW,CAAC,iBAAiB,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,WAAW,CAAC,kBAAkB,CAAC,WAAW,EAAE,GAAG,CAAC;YACtH,OAAO,CAAC,GAAG,CAAC,oBAAoB,OAAO,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC;QACpD,CAAC;IACH,CAAC;SAAM,CAAC;QACN,MAAM,OAAO,GAAG,CAAC,MAAM,EAAE,OAAO,EAAE,WAAW,EAAE,UAAU,EAAE,MAAM,EAAE,QAAQ,CAAC,CAAC;QAC7E,MAAM,IAAI,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;YAChC,MAAM,CAAC,CAAC,GAAG,CAAC,CAAC;YACb,GAAG,CAAC,CAAC,QAAQ,IAAI,CAAC,CAAC,KAAK,EAAE;YAC1B,CAAC,CAAC,MAAM,KAAK,UAAU,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,QAAQ,EAAE,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG;YAC7D,CAAC,CAAC,MAAM,KAAK,UAAU,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,QAAQ,EAAE,KAAK,EAAE,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG;YACpE,CAAC,CAAC,IAAI,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,GAAG;YACtD,CAAC,CAAC,MAAM,KAAK,UAAU,CAAC,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,KAAK,CAAC;SACnD,CAAC,CAAC;QAEH,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,mBAAmB,CAAC,CAAC,CAAC;QACvC,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,OAAO,EAAE,IAAI,CAAC,CAAC,CAAC;IACpC,CAAC;IAED,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,0BAA0B,QAAQ,qBAAqB,CAAC,CAAC,CAAC;AAC5E,CAAC;AAED,SAAS,WAAW,CAAC,KAAiB,EAAE,SAAkB,EAAE,MAAM,GAAG,EAAE;IACrE,IAAI,SAAS,EAAE,CAAC;QACd,OAAO,GAAG,CAAC,KAAK,CAAC,IAAI,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,
CAAC,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,MAAM,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC;IACjH,CAAC;IACD,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,IAAI,CAAC,GAAG,MAAM,GAAG,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,MAAM,CAAC,GAAG,MAAM,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC;AACnH,CAAC"}
|
package/dist/commands/eval.d.ts
CHANGED
package/dist/commands/eval.js
CHANGED
|
@@ -67,6 +67,7 @@ export async function evalCommand(subcommand, target, opts = {}) {
|
|
|
67
67
|
judge: opts.judge,
|
|
68
68
|
runs: opts.runs ? parseInt(opts.runs, 10) : undefined,
|
|
69
69
|
concurrency: opts.concurrency ? parseInt(opts.concurrency, 10) : undefined,
|
|
70
|
+
baseline: opts.baseline,
|
|
70
71
|
});
|
|
71
72
|
}
|
|
72
73
|
case "credentials": {
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"eval.js","sourceRoot":"","sources":["../../src/commands/eval.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,mCAAmC;AACnC,8EAA8E;AAE9E,OAAO,EAAE,IAAI,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAC1C,OAAO,EAAE,UAAU,EAAE,MAAM,SAAS,CAAC;AACrC,OAAO,EAAE,GAAG,EAAE,GAAG,EAAE,MAAM,oBAAoB,CAAC;AAE9C,MAAM,CAAC,KAAK,UAAU,WAAW,CAC/B,UAAkB,EAClB,MAAe,EACf,
|
|
1
|
+
{"version":3,"file":"eval.js","sourceRoot":"","sources":["../../src/commands/eval.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,mCAAmC;AACnC,8EAA8E;AAE9E,OAAO,EAAE,IAAI,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAC1C,OAAO,EAAE,UAAU,EAAE,MAAM,SAAS,CAAC;AACrC,OAAO,EAAE,GAAG,EAAE,GAAG,EAAE,MAAM,oBAAoB,CAAC;AAE9C,MAAM,CAAC,KAAK,UAAU,WAAW,CAC/B,UAAkB,EAClB,MAAe,EACf,OAAqQ,EAAE;IAEvQ,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;IAE3D,QAAQ,UAAU,EAAE,CAAC;QACnB,KAAK,OAAO,CAAC,CAAC,CAAC;YACb,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;YACxD,MAAM,EAAE,YAAY,EAAE,GAAG,MAAM,MAAM,CAAC,iBAAiB,CAAC,CAAC;YACzD,OAAO,YAAY,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC;QAClC,CAAC;QAED,KAAK,MAAM,CAAC,CAAC,CAAC;YACZ,IAAI,CAAC,MAAM,EAAE,CAAC;gBACZ,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,0CAA0C,CAAC,CAAC,CAAC;gBAC/D,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAClB,CAAC;YACD,MAAM,QAAQ,GAAG,eAAe,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;YAC/C,MAAM,EAAE,WAAW,EAAE,GAAG,MAAM,MAAM,CAAC,gBAAgB,CAAC,CAAC;YACvD,MAAM,QAAQ,GAAG,CAAC,IAAI,CAAC,IAAI,KAAK,aAAa,IAAI,IAAI,CAAC,IAAI,KAAK,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC;YAC3F,OAAO,WAAW,CAAC,QAAQ,EAAE,CAAC,CAAC,IAAI,CAAC,KAAK,EAAE,QAAQ,CAAC,CAAC;QACvD,CAAC;QAED,KAAK,KAAK,CAAC,CAAC,CAAC;YACX,IAAI,CAAC,MAAM,EAAE,CAAC;gBACZ,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,yCAAyC,CAAC,CAAC,CAAC;gBAC9D,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAClB,CAAC;YACD,MAAM,QAAQ,GAAG,eAAe,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;YAC/C,MAAM,EAAE,UAAU,EAAE,GAAG,MAAM,MAAM,CAAC,eAAe,CAAC,CAAC;YACrD,sEAAsE;YACtE,MAAM,OAAO,GAAG,IAAI,CAAC,OAAO,KAAK,IAAI,IAAI,IAAI,CAAC,KAAK,KAAK,KAAK,CAAC;YAC9D,OAAO,UAAU,CAAC,QAAQ,EAAE;gBAC1B,WAAW,EAAE,IAAI,CAAC,WAAW,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,WAAW,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,SAAS;gBAC1E,UAAU,EAAE,IAAI,CAAC,UAAU;gBAC3B,OAAO;gBACP,KAAK,EAAE,IAAI,CAAC,KAAK;aAClB,CAAC,CAAC;QACL,CAAC;QAED,KAAK,UAAU,CAAC,CAAC,CAAC;YAChB,MAAM,EAAE,eAAe,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CAAC
,CAAC;YAC/D,OAAO,eAAe,CAAC,IAAI,CAAC,CAAC;QAC/B,CAAC;QAED,KAAK,cAAc,CAAC,CAAC,CAAC;YACpB,MAAM,EAAE,kBAAkB,EAAE,GAAG,MAAM,MAAM,CAAC,wBAAwB,CAAC,CAAC;YACtE,MAAM,gBAAgB,GAAG,IAAI,CAAC,WAAW,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,WAAW,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;YACvF,OAAO,kBAAkB,CAAC,IAAI,EAAE,CAAC,CAAC,IAAI,CAAC,KAAK,EAAE,gBAAgB,CAAC,CAAC;QAClE,CAAC;QAED,KAAK,OAAO,CAAC,CAAC,CAAC;YACb,IAAI,CAAC,MAAM,EAAE,CAAC;gBACZ,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,wEAAwE,CAAC,CAAC,CAAC;gBAC7F,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAClB,CAAC;YACD,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC;gBACjB,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,oHAAoH,CAAC,CAAC,CAAC;gBACzI,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAClB,CAAC;YACD,IAAI,CAAC,IAAI,CAAC,KAAK,EAAE,CAAC;gBAChB,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,8DAA8D,CAAC,CAAC,CAAC;gBACnF,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAClB,CAAC;YACD,MAAM,QAAQ,GAAG,eAAe,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;YAC/C,MAAM,EAAE,YAAY,EAAE,GAAG,MAAM,MAAM,CAAC,iBAAiB,CAAC,CAAC;YACzD,OAAO,YAAY,CAAC,QAAQ,EAAE;gBAC5B,MAAM,EAAE,IAAI,CAAC,MAAM;gBACnB,KAAK,EAAE,IAAI,CAAC,KAAK;gBACjB,IAAI,EAAE,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,SAAS;gBACrD,WAAW,EAAE,IAAI,CAAC,WAAW,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,WAAW,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,SAAS;gBAC1E,QAAQ,EAAE,IAAI,CAAC,QAAQ;aACxB,CAAC,CAAC;QACL,CAAC;QAED,KAAK,aAAa,CAAC,CAAC,CAAC;YACnB,IAAI,CAAC,MAAM,EAAE,CAAC;gBACZ,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,iEAAiE,CAAC,CAAC,CAAC;gBACtF,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAClB,CAAC;YACD,iDAAiD;YACjD,sDAAsD;YACtD,MAAM,YAAY,GAAG,IAAI,CAAC;YAC1B,MAAM,EAAE,iBAAiB,EAAE,kBAAkB,EAAE,mBAAmB,EAAE,GAAG,MAAM,MAAM,CAAC,uBAAuB,CAAC,CAAC;YAC7G,QAAQ,MAAM,EAAE,CAAC;gBACf,KAAK,KAAK,CAAC,CAAC,CAAC;oBACX,MAAM,GAAG,GAAG,IAAI,CAAC,aAAa,CAAC;oBAC/B,IAAI,CAAC,GAAG,EAAE,CAAC;wBACT,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,qCAAqC,CAAC,CAAC,CAAC;wBAC1D,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;oBAClB,CAAC;oBACD,OAAO,iBAAiB,CAAC,YAAY,EAAE,GAAG,CAAC,CAAC;gBAC9C,CAAC;gBACD,KAAK,MAAM;oBACT,OAAO,kBAAkB,CAAC,YAAY,CAAC,CAAC;gBAC1C,KAA
K,OAAO;oBACV,OAAO,mBAAmB,CAAC,YAAY,CAAC,CAAC;gBAC3C;oBACE,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,oCAAoC,MAAM,KAAK,CAAC,GAAG,GAAG,CAAC,6BAA6B,CAAC,CAAC,CAAC;YAC7G,CAAC;YACD,MAAM;QACR,CAAC;QAED;YACE,OAAO,CAAC,KAAK,CACX,GAAG,CAAC,wBAAwB,UAAU,KAAK,CAAC;gBAC1C,GAAG,CAAC,yEAAyE,CAAC,CACjF,CAAC;IACN,CAAC;AACH,CAAC;AAED,SAAS,eAAe,CAAC,IAAY,EAAE,MAAc;IACnD,MAAM,KAAK,GAAG,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;IAChC,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACvB,OAAO,CAAC,KAAK,CACX,GAAG,CAAC,mBAAmB,MAAM,sCAAsC,CAAC,CACrE,CAAC;QACF,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAED,qDAAqD;IACrD,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,EAAE,KAAK,CAAC,CAAC,CAAC,EAAE,QAAQ,EAAE,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;IAC5D,IAAI,UAAU,CAAC,UAAU,CAAC;QAAE,OAAO,UAAU,CAAC;IAE9C,sEAAsE;IACtE,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,EAAE,SAAS,EAAE,KAAK,CAAC,CAAC,CAAC,EAAE,QAAQ,EAAE,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;IACvE,IAAI,UAAU,CAAC,UAAU,CAAC;QAAE,OAAO,UAAU,CAAC;IAE9C,0CAA0C;IAC1C,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,EAAE,QAAQ,EAAE,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;IAChD,IAAI,UAAU,CAAC,QAAQ,CAAC;QAAE,OAAO,QAAQ,CAAC;IAE1C,sEAAsE;IACtE,OAAO,UAAU,CAAC;AACpB,CAAC"}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import type { ActivationResult } from "./activation-tester.js";
|
|
2
|
+
/**
 * One persisted activation-test run for a skill.
 *
 * Carries identifying metadata (`id`, `timestamp`, `model`, `provider`),
 * the number of prompts in the run, aggregate confusion-matrix style
 * counters and ratios in `summary`, and the per-prompt `results`.
 * `listActivationRuns` returns these records with `results` omitted;
 * `getActivationRun` returns the full record.
 */
export interface ActivationHistoryRun {
|
|
3
|
+
id: string;
|
|
4
|
+
timestamp: string;
|
|
5
|
+
model: string;
|
|
6
|
+
provider: string;
|
|
7
|
+
promptCount: number;
|
|
8
|
+
summary: {
|
|
9
|
+
precision: number;
|
|
10
|
+
recall: number;
|
|
11
|
+
reliability: number;
|
|
12
|
+
tp: number;
|
|
13
|
+
tn: number;
|
|
14
|
+
fp: number;
|
|
15
|
+
fn: number;
|
|
16
|
+
};
|
|
17
|
+
results: ActivationResult[];
|
|
18
|
+
}
|
|
19
|
+
/** Appends `run` to the history stored under `skillDir`; resolves once the history has been written. */
export declare function writeActivationRun(skillDir: string, run: ActivationHistoryRun): Promise<void>;
|
|
20
|
+
/** Resolves to the stored runs for `skillDir` with the `results` field omitted from each entry. */
export declare function listActivationRuns(skillDir: string): Promise<Omit<ActivationHistoryRun, "results">[]>;
|
|
21
|
+
/** Resolves to the full stored run whose `id` equals `runId`, or `null` when there is none. */
export declare function getActivationRun(skillDir: string, runId: string): Promise<ActivationHistoryRun | null>;
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
// ---------------------------------------------------------------------------
|
|
2
|
+
// activation-history.ts -- persistent activation test history per skill
|
|
3
|
+
// ---------------------------------------------------------------------------
|
|
4
|
+
import { readFile, writeFile, mkdir } from "node:fs/promises";
|
|
5
|
+
import { join } from "node:path";
|
|
6
|
+
/** Name of the JSON file, stored directly inside the skill directory, that holds the run history. */
const HISTORY_FILENAME = "activation-history.json";
/** Upper bound on retained runs; the oldest entries are dropped on write once it is exceeded. */
const MAX_RUNS = 50;

/**
 * Build the path of the history file for a skill.
 * @param {string} skillDir - Directory of the skill.
 * @returns {string} `<skillDir>/activation-history.json`.
 */
function historyPath(skillDir) {
    return join(skillDir, HISTORY_FILENAME);
}

/**
 * Tell whether a stored entry can be treated as a run record.
 * Only plain (non-null, non-array) objects qualify; anything else would make
 * the destructuring in `listActivationRuns` or the `r.id` lookup in
 * `getActivationRun` throw.
 * @param {unknown} entry
 * @returns {boolean}
 */
function isRunEntry(entry) {
    return entry !== null && typeof entry === "object" && !Array.isArray(entry);
}

/**
 * Load the history document for a skill.
 *
 * Reading is deliberately best-effort: a missing file, unreadable file,
 * invalid JSON, or a document without a `runs` array all yield an empty
 * history instead of an error. Entries of `runs` that are not objects are
 * discarded so callers can safely access fields on every run.
 * @param {string} skillDir - Directory of the skill.
 * @returns {Promise<{ runs: object[] }>} The parsed document (other top-level
 *   keys are preserved) with a sanitized `runs` array.
 */
async function readHistoryFile(skillDir) {
    try {
        const content = await readFile(historyPath(skillDir), "utf-8");
        const parsed = JSON.parse(content);
        if (parsed && Array.isArray(parsed.runs)) {
            return { ...parsed, runs: parsed.runs.filter(isRunEntry) };
        }
        return { runs: [] };
    }
    catch {
        // Best-effort read: treat any failure as "no history yet".
        return { runs: [] };
    }
}

/**
 * Append a run to the skill's history and persist it.
 *
 * Creates `skillDir` if needed, appends `run` to the end of the stored list,
 * keeps only the last `MAX_RUNS` entries, and rewrites the history file as
 * pretty-printed JSON.
 * @param {string} skillDir - Directory of the skill.
 * @param {object} run - Run record to store.
 * @returns {Promise<void>}
 */
export async function writeActivationRun(skillDir, run) {
    await mkdir(skillDir, { recursive: true });
    const history = await readHistoryFile(skillDir);
    history.runs.push(run);
    // Prune oldest if over cap
    if (history.runs.length > MAX_RUNS) {
        history.runs = history.runs.slice(-MAX_RUNS);
    }
    await writeFile(historyPath(skillDir), JSON.stringify(history, null, 2));
}

/**
 * List stored runs without their per-prompt `results`.
 * @param {string} skillDir - Directory of the skill.
 * @returns {Promise<object[]>} Run summaries in reverse stored order, so the
 *   most recently appended run comes first.
 */
export async function listActivationRuns(skillDir) {
    const history = await readHistoryFile(skillDir);
    return history.runs.map(({ results: _results, ...rest }) => rest).reverse();
}

/**
 * Fetch a single stored run, including its `results`.
 * @param {string} skillDir - Directory of the skill.
 * @param {string} runId - Value compared (strictly) against each run's `id`.
 * @returns {Promise<object | null>} The first matching run, or `null`.
 */
export async function getActivationRun(skillDir, runId) {
    const history = await readHistoryFile(skillDir);
    return history.runs.find((r) => r.id === runId) ?? null;
}
|
|
41
|
+
//# sourceMappingURL=activation-history.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"activation-history.js","sourceRoot":"","sources":["../../src/eval/activation-history.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,wEAAwE;AACxE,8EAA8E;AAE9E,OAAO,EAAE,QAAQ,EAAE,SAAS,EAAE,KAAK,EAAE,MAAM,kBAAkB,CAAC;AAC9D,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AAyBjC,MAAM,gBAAgB,GAAG,yBAAyB,CAAC;AACnD,MAAM,QAAQ,GAAG,EAAE,CAAC;AAEpB,SAAS,WAAW,CAAC,QAAgB;IACnC,OAAO,IAAI,CAAC,QAAQ,EAAE,gBAAgB,CAAC,CAAC;AAC1C,CAAC;AAED,KAAK,UAAU,eAAe,CAAC,QAAgB;IAC7C,IAAI,CAAC;QACH,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,WAAW,CAAC,QAAQ,CAAC,EAAE,OAAO,CAAC,CAAC;QAC/D,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;QACnC,IAAI,MAAM,IAAI,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,IAAI,CAAC;YAAE,OAAO,MAA+B,CAAC;QACjF,OAAO,EAAE,IAAI,EAAE,EAAE,EAAE,CAAC;IACtB,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,EAAE,IAAI,EAAE,EAAE,EAAE,CAAC;IACtB,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,kBAAkB,CACtC,QAAgB,EAChB,GAAyB;IAEzB,MAAM,KAAK,CAAC,QAAQ,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAC3C,MAAM,OAAO,GAAG,MAAM,eAAe,CAAC,QAAQ,CAAC,CAAC;IAChD,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IACvB,2BAA2B;IAC3B,IAAI,OAAO,CAAC,IAAI,CAAC,MAAM,GAAG,QAAQ,EAAE,CAAC;QACnC,OAAO,CAAC,IAAI,GAAG,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,IAAI,CAAC,MAAM,GAAG,QAAQ,CAAC,CAAC;IACpE,CAAC;IACD,MAAM,SAAS,CAAC,WAAW,CAAC,QAAQ,CAAC,EAAE,IAAI,CAAC,SAAS,CAAC,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;AAC3E,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,kBAAkB,CACtC,QAAgB;IAEhB,MAAM,OAAO,GAAG,MAAM,eAAe,CAAC,QAAQ,CAAC,CAAC;IAChD,OAAO,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,OAAO,EAAE,QAAQ,EAAE,GAAG,IAAI,EAAE,EAAE,EAAE,CAAC,IAAI,CAAC,CAAC,OAAO,EAAE,CAAC;AAC9E,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,gBAAgB,CACpC,QAAgB,EAChB,KAAa;IAEb,MAAM,OAAO,GAAG,MAAM,eAAe,CAAC,QAAQ,CAAC,CAAC;IAChD,OAAO,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,KAAK,KAAK,CAAC,IAAI,IAAI,CAAC;AAC1D,CAAC"}
|
|
@@ -22,6 +22,7 @@ import { generateActionItems } from "../eval/action-items.js";
|
|
|
22
22
|
import { buildEvalInitPrompt, parseGeneratedEvals } from "../eval/prompt-builder.js";
|
|
23
23
|
import { testActivation } from "../eval/activation-tester.js";
|
|
24
24
|
import { detectMcpDependencies, detectSkillDependencies } from "../eval/mcp-detector.js";
|
|
25
|
+
import { writeActivationRun, listActivationRuns, getActivationRun } from "../eval/activation-history.js";
|
|
25
26
|
// ---------------------------------------------------------------------------
|
|
26
27
|
// In-memory config state — UI can change provider/model at runtime.
|
|
27
28
|
//
|
|
@@ -1033,13 +1034,40 @@ export function registerRoutes(router, root, projectName) {
|
|
|
1033
1034
|
name: nameMatch ? nameMatch[1] : params.skill,
|
|
1034
1035
|
tags: tagsMatch ? tagsMatch[1].split(",").map((t) => t.trim()).filter(Boolean) : [],
|
|
1035
1036
|
};
|
|
1036
|
-
|
|
1037
|
+
// Use per-request model overrides if provided, fall back to global config
|
|
1038
|
+
const client = body.provider || body.model
|
|
1039
|
+
? createLlmClient({ provider: body.provider, model: body.model })
|
|
1040
|
+
: getClient();
|
|
1037
1041
|
const summary = await testActivation(description, body.prompts, client, (result) => {
|
|
1038
1042
|
if (!aborted) {
|
|
1039
1043
|
sendSSE(res, "prompt_result", result);
|
|
1040
1044
|
}
|
|
1041
1045
|
}, meta);
|
|
1042
1046
|
if (!aborted) {
|
|
1047
|
+
// Write activation history entry
|
|
1048
|
+
const usedProvider = body.provider || currentOverrides.provider || "unknown";
|
|
1049
|
+
const usedModel = body.model || currentOverrides.model || "unknown";
|
|
1050
|
+
const run = {
|
|
1051
|
+
id: `run-${Date.now()}`,
|
|
1052
|
+
timestamp: new Date().toISOString(),
|
|
1053
|
+
model: usedModel,
|
|
1054
|
+
provider: usedProvider,
|
|
1055
|
+
promptCount: summary.total,
|
|
1056
|
+
summary: {
|
|
1057
|
+
precision: summary.precision,
|
|
1058
|
+
recall: summary.recall,
|
|
1059
|
+
reliability: summary.reliability,
|
|
1060
|
+
tp: summary.tp,
|
|
1061
|
+
tn: summary.tn,
|
|
1062
|
+
fp: summary.fp,
|
|
1063
|
+
fn: summary.fn,
|
|
1064
|
+
},
|
|
1065
|
+
results: summary.results,
|
|
1066
|
+
};
|
|
1067
|
+
try {
|
|
1068
|
+
await writeActivationRun(skillDir, run);
|
|
1069
|
+
}
|
|
1070
|
+
catch { /* non-blocking */ }
|
|
1043
1071
|
sendSSEDone(res, { ...summary, description });
|
|
1044
1072
|
}
|
|
1045
1073
|
}
|
|
@@ -1047,6 +1075,81 @@ export function registerRoutes(router, root, projectName) {
|
|
|
1047
1075
|
sendSSEDone(res, { error: err instanceof Error ? err.message : String(err) });
|
|
1048
1076
|
}
|
|
1049
1077
|
});
|
|
1078
|
+
// AI-generate activation test prompts (SSE)
|
|
1079
|
+
router.post("/api/skills/:plugin/:skill/activation-prompts", async (req, res, params) => {
    // Asks the configured (or per-request) LLM client for activation test
    // prompts based on the description in the skill's SKILL.md front matter,
    // streaming each accepted prompt as a "prompt_generated" SSE event and
    // finishing with the full list.
    const VALID_EXPECTED = ["should_activate", "should_not_activate"];
    const DEFAULT_COUNT = 8;
    const skillDir = resolveSkillDir(root, params.plugin, params.skill);
    let aborted = false;
    // Set once initSSE has been called, so the error path knows which
    // response style (plain JSON vs. SSE "done" event) is still valid.
    let sseStarted = false;
    res.on("close", () => { aborted = true; });
    try {
        const body = (await readBody(req)) ?? {};
        const skillMdPath = join(skillDir, "SKILL.md");
        if (!existsSync(skillMdPath)) {
            sendJson(res, { error: "SKILL.md not found" }, 404, req);
            return;
        }
        const skillContent = readFileSync(skillMdPath, "utf-8");
        // Only a double-quoted `description: "..."` inside the front matter is recognized.
        const descMatch = skillContent.match(/^---[\s\S]*?description:\s*"([^"]+)"[\s\S]*?---/);
        const description = descMatch ? descMatch[1] : "";
        if (!description) {
            sendJson(res, { error: "No skill description available" }, 400, req);
            return;
        }
        initSSE(res, req);
        sseStarted = true;
        // Accept numbers and numeric strings; anything that is not a positive
        // integer falls back to the default so the prompt never contains NaN
        // or arbitrary request text.
        const requestedCount = Number(body.count);
        const count = Number.isInteger(requestedCount) && requestedCount > 0 ? requestedCount : DEFAULT_COUNT;
        const half = Math.ceil(count / 2);
        // Use per-request model overrides if provided, fall back to global config
        const client = body.provider || body.model
            ? createLlmClient({ provider: body.provider, model: body.model })
            : getClient();
        const systemPrompt = `Given this skill description, generate test prompts to evaluate activation quality.
Generate ${count} prompts: ${half} that SHOULD activate this skill, ${count - half} that should NOT.
For "should not" prompts, make them plausible but clearly outside this skill's domain.
Return one JSON object per line: {"prompt": "...", "expected": "should_activate"|"should_not_activate"}
Return ONLY the JSON lines, no other text.`;
        const userPrompt = `Skill description: ${description}`;
        const { text } = await client.generate(systemPrompt, userPrompt);
        if (aborted)
            return;
        const allPrompts = [];
        const lines = text.split("\n").filter((l) => l.trim());
        for (const line of lines) {
            try {
                // Strip markdown code fences and a trailing comma the model may add.
                const cleaned = line
                    .replace(/^```(?:json)?\s*/i, "")
                    .replace(/```\s*$/i, "")
                    .trim()
                    .replace(/,\s*$/, "");
                if (!cleaned.startsWith("{"))
                    continue;
                const parsed = JSON.parse(cleaned);
                const isUsable = typeof parsed.prompt === "string"
                    && parsed.prompt.trim() !== ""
                    && VALID_EXPECTED.includes(parsed.expected);
                if (isUsable) {
                    allPrompts.push({ prompt: parsed.prompt, expected: parsed.expected });
                    if (!aborted)
                        sendSSE(res, "prompt_generated", parsed);
                }
            }
            catch { /* skip malformed lines */ }
        }
        if (!aborted)
            sendSSEDone(res, { prompts: allPrompts });
    }
    catch (err) {
        if (aborted)
            return;
        const message = err instanceof Error ? err.message : String(err);
        if (sseStarted) {
            sendSSEDone(res, { error: message });
        }
        else {
            // Failure before the stream was opened (e.g. while reading the
            // body): answer with plain JSON like the other pre-SSE errors above.
            sendJson(res, { error: message }, 500, req);
        }
    }
});
|
|
1137
|
+
// List activation test history (summaries only)
|
|
1138
|
+
router.get("/api/skills/:plugin/:skill/activation-history", async (req, res, params) => {
    // Respond with every stored run for the skill, as returned by
    // listActivationRuns, wrapped in a `{ runs }` envelope.
    const historyDir = resolveSkillDir(root, params.plugin, params.skill);
    const payload = { runs: await listActivationRuns(historyDir) };
    sendJson(res, payload, 200, req);
});
|
|
1143
|
+
// Get full activation test run by ID
|
|
1144
|
+
router.get("/api/skills/:plugin/:skill/activation-history/:runId", async (req, res, params) => {
    // Look up one stored run by its id; 404 with an error body when absent.
    const historyDir = resolveSkillDir(root, params.plugin, params.skill);
    const match = await getActivationRun(historyDir, params.runId);
    if (match) {
        sendJson(res, match, 200, req);
    }
    else {
        sendJson(res, { error: "Run not found" }, 404, req);
    }
});
|
|
1050
1153
|
// Get skill dependencies (MCP + skill-to-skill)
|
|
1051
1154
|
router.get("/api/skills/:plugin/:skill/dependencies", async (req, res, params) => {
|
|
1052
1155
|
const skillDir = resolveSkillDir(root, params.plugin, params.skill);
|