vskill 0.5.21 → 0.5.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,5 +3,6 @@ export interface SweepOptions {
3
3
  judge: string;
4
4
  runs?: number;
5
5
  concurrency?: number;
6
+ baseline?: boolean;
6
7
  }
7
8
  export declare function runEvalSweep(skillDir: string, options: SweepOptions): Promise<void>;
@@ -6,7 +6,7 @@ import { join } from "node:path";
6
6
  import { loadAndValidateEvals, EvalValidationError } from "../../eval/schema.js";
7
7
  import { buildEvalSystemPrompt } from "../../eval/prompt-builder.js";
8
8
  import { runSweep } from "../../eval-server/sweep-runner.js";
9
- import { green, red, bold, dim, table } from "../../utils/output.js";
9
+ import { green, red, yellow, bold, dim, table } from "../../utils/output.js";
10
10
  export async function runEvalSweep(skillDir, options) {
11
11
  // Load and validate evals.json
12
12
  let evalsFile;
@@ -34,11 +34,19 @@ export async function runEvalSweep(skillDir, options) {
34
34
  }
35
35
  const runs = options.runs ?? 1;
36
36
  const concurrency = options.concurrency ?? 5;
37
+ const baseline = options.baseline ?? false;
37
38
  console.log(bold(`\nSweep: ${evalsFile.skill_name}`));
38
39
  console.log(dim(`Models: ${modelList.join(", ")}`));
39
40
  console.log(dim(`Judge: ${options.judge}`));
40
41
  console.log(dim(`Runs per model: ${runs}`));
41
- console.log(dim(`Cases: ${evalsFile.evals.length}\n`));
42
+ console.log(dim(`Cases: ${evalsFile.evals.length}`));
43
+ if (baseline)
44
+ console.log(dim(`Baseline: enabled (comparing with vs without skill)`));
45
+ console.log("");
46
+ // Warn about low run count
47
+ if (runs < 3) {
48
+ console.log(yellow(`Note: ${runs} run(s) may not produce statistically meaningful results. Use --runs 3+ for reliable ranking.\n`));
49
+ }
42
50
  let sweepResult = null;
43
51
  for await (const event of runSweep({
44
52
  skillDir,
@@ -49,17 +57,33 @@ export async function runEvalSweep(skillDir, options) {
49
57
  judge: options.judge,
50
58
  runs,
51
59
  concurrency,
60
+ baseline,
52
61
  })) {
53
62
  switch (event.type) {
63
+ case "sweep_judge_bias_warning":
64
+ console.log(yellow(`WARNING: ${event.data.warning}\n`));
65
+ break;
54
66
  case "sweep_model_start":
55
67
  process.stdout.write(dim(`[${event.data.modelIndex + 1}/${event.data.totalModels}] ${event.data.model} — `));
56
68
  break;
57
- case "sweep_model_progress":
58
- process.stdout.write(dim(`\r[${event.data.model}] run ${event.data.run}/${event.data.totalRuns} case ${event.data.currentCase}/${event.data.totalCases} (${event.data.percentComplete}%)`));
69
+ case "sweep_model_progress": {
70
+ const phaseLabel = event.data.phase === "baseline" ? " [baseline]" : "";
71
+ process.stdout.write(dim(`\r[${event.data.model}${phaseLabel}] run ${event.data.run}/${event.data.totalRuns} case ${event.data.currentCase}/${event.data.totalCases} (${event.data.percentComplete}%)`));
59
72
  break;
73
+ }
60
74
  case "sweep_model_complete":
61
75
  if (event.data.status === "complete" && event.data.passRate) {
62
- console.log(green(` done`) + dim(` (pass rate: ${(event.data.passRate.mean * 100).toFixed(1)}%)`));
76
+ let summary = ` done (pass rate: ${(event.data.passRate.mean * 100).toFixed(1)}%)`;
77
+ if (event.data.baselinePassRate && event.data.skillDelta) {
78
+ const delta = event.data.skillDelta.mean * 100;
79
+ const sign = delta >= 0 ? "+" : "";
80
+ summary += ` | baseline: ${(event.data.baselinePassRate.mean * 100).toFixed(1)}% | delta: ${sign}${delta.toFixed(1)}pp`;
81
+ if (event.data.amplificationPct != null && isFinite(event.data.amplificationPct)) {
82
+ const ampSign = event.data.amplificationPct >= 0 ? "+" : "";
83
+ summary += ` (${ampSign}${event.data.amplificationPct.toFixed(1)}%)`;
84
+ }
85
+ }
86
+ console.log(green(summary));
63
87
  }
64
88
  else {
65
89
  console.log(red(` error: ${event.data.errorMessage || "unknown"}`));
@@ -75,19 +99,55 @@ export async function runEvalSweep(skillDir, options) {
75
99
  process.exit(1);
76
100
  return;
77
101
  }
78
- // Print summary table
79
- const headers = ["RANK", "MODEL", "PASS RATE", "DURATION", "COST", "STATUS"];
80
- const sorted = [...sweepResult.models].sort((a, b) => b.passRate.mean - a.passRate.mean);
81
- const rows = sorted.map((m, i) => [
82
- String(i + 1),
83
- `${m.provider}/${m.model}`,
84
- m.status === "complete" ? formatStats(m.passRate, true) : "-",
85
- m.status === "complete" ? formatStats(m.duration, false, "ms") : "-",
86
- m.cost.total > 0 ? `$${m.cost.total.toFixed(4)}` : "-",
87
- m.status === "complete" ? green("OK") : red("ERR"),
88
- ]);
89
- console.log(bold("\nSweep Results\n"));
90
- console.log(table(headers, rows));
102
+ // Sort by composite score (if available) then by pass rate
103
+ const sorted = [...sweepResult.models].sort((a, b) => {
104
+ if (a.compositeScore != null && b.compositeScore != null) {
105
+ return b.compositeScore - a.compositeScore;
106
+ }
107
+ return b.passRate.mean - a.passRate.mean;
108
+ });
109
+ // Build table based on whether baseline was used
110
+ if (baseline) {
111
+ const headers = ["RANK", "MODEL", "WITH SKILL", "WITHOUT SKILL", "DELTA", "AMPLIFICATION", "STATUS"];
112
+ const rows = sorted.map((m, i) => [
113
+ String(i + 1),
114
+ `${m.provider}/${m.model}`,
115
+ m.status === "complete" ? formatStats(m.passRate, true) : "-",
116
+ m.status === "complete" && m.baselinePassRate ? formatStats(m.baselinePassRate, true) : "-",
117
+ m.status === "complete" && m.skillDelta
118
+ ? `${m.skillDelta.mean >= 0 ? "+" : ""}${(m.skillDelta.mean * 100).toFixed(1)}pp`
119
+ : "-",
120
+ m.status === "complete" && m.amplificationPct != null && isFinite(m.amplificationPct)
121
+ ? `${m.amplificationPct >= 0 ? "+" : ""}${m.amplificationPct.toFixed(1)}%`
122
+ : "-",
123
+ m.status === "complete" ? green("OK") : red("ERR"),
124
+ ]);
125
+ console.log(bold("\nSweep Results (Skill Amplification)\n"));
126
+ console.log(table(headers, rows));
127
+ // Skill quality badge
128
+ if (sweepResult.skillQualityScore != null && sweepResult.skillQualityRating) {
129
+ const ratingColors = {
130
+ excellent: green, good: green, marginal: yellow, minimal: yellow, harmful: red,
131
+ };
132
+ const colorFn = ratingColors[sweepResult.skillQualityRating] ?? dim;
133
+ const sign = sweepResult.skillQualityScore >= 0 ? "+" : "";
134
+ const label = `${sign}${sweepResult.skillQualityScore.toFixed(1)}% (${sweepResult.skillQualityRating.toUpperCase()})`;
135
+ console.log(`\nSkill Quality: ${colorFn(label)}`);
136
+ }
137
+ }
138
+ else {
139
+ const headers = ["RANK", "MODEL", "PASS RATE", "DURATION", "COST", "STATUS"];
140
+ const rows = sorted.map((m, i) => [
141
+ String(i + 1),
142
+ `${m.provider}/${m.model}`,
143
+ m.status === "complete" ? formatStats(m.passRate, true) : "-",
144
+ m.status === "complete" ? formatStats(m.duration, false, "ms") : "-",
145
+ m.cost.total > 0 ? `$${m.cost.total.toFixed(4)}` : "-",
146
+ m.status === "complete" ? green("OK") : red("ERR"),
147
+ ]);
148
+ console.log(bold("\nSweep Results\n"));
149
+ console.log(table(headers, rows));
150
+ }
91
151
  console.log(dim(`\nLeaderboard saved to ${skillDir}/evals/leaderboard/`));
92
152
  }
93
153
  function formatStats(stats, asPercent, suffix = "") {
@@ -1 +1 @@
1
- {"version":3,"file":"sweep.js","sourceRoot":"","sources":["../../../src/commands/eval/sweep.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,wDAAwD;AACxD,8EAA8E;AAE9E,OAAO,EAAE,UAAU,EAAE,YAAY,EAAE,MAAM,SAAS,CAAC;AACnD,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,OAAO,EAAE,oBAAoB,EAAE,mBAAmB,EAAE,MAAM,sBAAsB,CAAC;AACjF,OAAO,EAAE,qBAAqB,EAAE,MAAM,8BAA8B,CAAC;AACrE,OAAO,EAAE,QAAQ,EAAE,MAAM,mCAAmC,CAAC;AAE7D,OAAO,EAAE,KAAK,EAAE,GAAG,EAAU,IAAI,EAAE,GAAG,EAAE,KAAK,EAAE,MAAM,uBAAuB,CAAC;AAS7E,MAAM,CAAC,KAAK,UAAU,YAAY,CAAC,QAAgB,EAAE,OAAqB;IACxE,+BAA+B;IAC/B,IAAI,SAAS,CAAC;IACd,IAAI,CAAC;QACH,SAAS,GAAG,oBAAoB,CAAC,QAAQ,CAAC,CAAC;IAC7C,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACb,IAAI,GAAG,YAAY,mBAAmB,EAAE,CAAC;YACvC,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,uBAAuB,GAAG,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;QAC3D,CAAC;aAAM,CAAC;YACN,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,wBAAyB,GAAa,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;QACvE,CAAC;QACD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAChB,OAAO;IACT,CAAC;IAED,MAAM,WAAW,GAAG,IAAI,CAAC,QAAQ,EAAE,UAAU,CAAC,CAAC;IAC/C,MAAM,YAAY,GAAG,UAAU,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,YAAY,CAAC,WAAW,EAAE,OAAO,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;IACvF,MAAM,YAAY,GAAG,qBAAqB,CAAC,YAAY,CAAC,CAAC;IACzD,MAAM,SAAS,GAAG,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;IAEjF,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC3B,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,mEAAmE,CAAC,CAAC,CAAC;QACxF,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAChB,OAAO;IACT,CAAC;IAED,MAAM,IAAI,GAAG,OAAO,CAAC,IAAI,IAAI,CAAC,CAAC;IAC/B,MAAM,WAAW,GAAG,OAAO,CAAC,WAAW,IAAI,CAAC,CAAC;IAE7C,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,YAAY,SAAS,CAAC,UAAU,EAAE,CAAC,CAAC,CAAC;IACtD,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,WAAW,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC;IACpD,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,UAAU,OAAO,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;IAC5C,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,mBAAmB,IAAI,EAAE,CAAC,CAAC,CAAC;IAC5C,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,UAAU,SAAS,CAAC,KAAK,CAAC,MAAM,IAAI,CAAC,CAAC,CAAC;IAEvD,IAAI,WAAW,GAAuB
,IAAI,CAAC;IAE3C,IAAI,KAAK,EAAE,MAAM,KAAK,IAAI,QAAQ,CAAC;QACjC,QAAQ;QACR,SAAS,EAAE,SAAS,CAAC,UAAU;QAC/B,YAAY;QACZ,SAAS,EAAE,SAAS,CAAC,KAAK;QAC1B,MAAM,EAAE,SAAS;QACjB,KAAK,EAAE,OAAO,CAAC,KAAK;QACpB,IAAI;QACJ,WAAW;KACZ,CAAC,EAAE,CAAC;QACH,QAAQ,KAAK,CAAC,IAAI,EAAE,CAAC;YACnB,KAAK,mBAAmB;gBACtB,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,KAAK,CAAC,IAAI,CAAC,UAAU,GAAG,CAAC,IAAI,KAAK,CAAC,IAAI,CAAC,WAAW,KAAK,KAAK,CAAC,IAAI,CAAC,KAAK,KAAK,CAAC,CAAC,CAAC;gBAC7G,MAAM;YAER,KAAK,sBAAsB;gBACzB,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,MAAM,KAAK,CAAC,IAAI,CAAC,KAAK,SAAS,KAAK,CAAC,IAAI,CAAC,GAAG,IAAI,KAAK,CAAC,IAAI,CAAC,SAAS,SAAS,KAAK,CAAC,IAAI,CAAC,WAAW,IAAI,KAAK,CAAC,IAAI,CAAC,UAAU,KAAK,KAAK,CAAC,IAAI,CAAC,eAAe,IAAI,CAAC,CAAC,CAAC;gBAC5L,MAAM;YAER,KAAK,sBAAsB;gBACzB,IAAI,KAAK,CAAC,IAAI,CAAC,MAAM,KAAK,UAAU,IAAI,KAAK,CAAC,IAAI,CAAC,QAAQ,EAAE,CAAC;oBAC5D,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,OAAO,CAAC,GAAG,GAAG,CAAC,gBAAgB,CAAC,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,IAAI,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC;gBACrG,CAAC;qBAAM,CAAC;oBACN,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,WAAW,KAAK,CAAC,IAAI,CAAC,YAAY,IAAI,SAAS,EAAE,CAAC,CAAC,CAAC;gBACtE,CAAC;gBACD,MAAM;YAER,KAAK,gBAAgB;gBACnB,WAAW,GAAG,KAAK,CAAC,IAAI,CAAC;gBACzB,MAAM;QACV,CAAC;IACH,CAAC;IAED,IAAI,CAAC,WAAW,EAAE,CAAC;QACjB,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,oCAAoC,CAAC,CAAC,CAAC;QACzD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAChB,OAAO;IACT,CAAC;IAED,sBAAsB;IACtB,MAAM,OAAO,GAAG,CAAC,MAAM,EAAE,OAAO,EAAE,WAAW,EAAE,UAAU,EAAE,MAAM,EAAE,QAAQ,CAAC,CAAC;IAC7E,MAAM,MAAM,GAAG,CAAC,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,GAAG,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;IACzF,MAAM,IAAI,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QAChC,MAAM,CAAC,CAAC,GAAG,CAAC,CAAC;QACb,GAAG,CAAC,CAAC,QAAQ,IAAI,CAAC,CAAC,KAAK,EAAE;QAC1B,CAAC,CAAC,MAAM,KAAK,UAAU,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,QAAQ,EAAE,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG;QAC7D,CAAC,CAAC,MAAM,KAAK,UAAU,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,QAAQ,E
AAE,KAAK,EAAE,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG;QACpE,CAAC,CAAC,IAAI,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,GAAG;QACtD,CAAC,CAAC,MAAM,KAAK,UAAU,CAAC,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,KAAK,CAAC;KACnD,CAAC,CAAC;IAEH,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,mBAAmB,CAAC,CAAC,CAAC;IACvC,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,OAAO,EAAE,IAAI,CAAC,CAAC,CAAC;IAClC,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,0BAA0B,QAAQ,qBAAqB,CAAC,CAAC,CAAC;AAC5E,CAAC;AAED,SAAS,WAAW,CAAC,KAAiB,EAAE,SAAkB,EAAE,MAAM,GAAG,EAAE;IACrE,IAAI,SAAS,EAAE,CAAC;QACd,OAAO,GAAG,CAAC,KAAK,CAAC,IAAI,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,MAAM,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC;IACjH,CAAC;IACD,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,IAAI,CAAC,GAAG,MAAM,GAAG,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,MAAM,CAAC,GAAG,MAAM,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC;AACnH,CAAC"}
1
+ {"version":3,"file":"sweep.js","sourceRoot":"","sources":["../../../src/commands/eval/sweep.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,wDAAwD;AACxD,8EAA8E;AAE9E,OAAO,EAAE,UAAU,EAAE,YAAY,EAAE,MAAM,SAAS,CAAC;AACnD,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,OAAO,EAAE,oBAAoB,EAAE,mBAAmB,EAAE,MAAM,sBAAsB,CAAC;AACjF,OAAO,EAAE,qBAAqB,EAAE,MAAM,8BAA8B,CAAC;AACrE,OAAO,EAAE,QAAQ,EAAE,MAAM,mCAAmC,CAAC;AAE7D,OAAO,EAAE,KAAK,EAAE,GAAG,EAAE,MAAM,EAAE,IAAI,EAAE,GAAG,EAAE,KAAK,EAAE,MAAM,uBAAuB,CAAC;AAU7E,MAAM,CAAC,KAAK,UAAU,YAAY,CAAC,QAAgB,EAAE,OAAqB;IACxE,+BAA+B;IAC/B,IAAI,SAAS,CAAC;IACd,IAAI,CAAC;QACH,SAAS,GAAG,oBAAoB,CAAC,QAAQ,CAAC,CAAC;IAC7C,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACb,IAAI,GAAG,YAAY,mBAAmB,EAAE,CAAC;YACvC,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,uBAAuB,GAAG,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;QAC3D,CAAC;aAAM,CAAC;YACN,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,wBAAyB,GAAa,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;QACvE,CAAC;QACD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAChB,OAAO;IACT,CAAC;IAED,MAAM,WAAW,GAAG,IAAI,CAAC,QAAQ,EAAE,UAAU,CAAC,CAAC;IAC/C,MAAM,YAAY,GAAG,UAAU,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,YAAY,CAAC,WAAW,EAAE,OAAO,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;IACvF,MAAM,YAAY,GAAG,qBAAqB,CAAC,YAAY,CAAC,CAAC;IACzD,MAAM,SAAS,GAAG,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;IAEjF,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC3B,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,mEAAmE,CAAC,CAAC,CAAC;QACxF,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAChB,OAAO;IACT,CAAC;IAED,MAAM,IAAI,GAAG,OAAO,CAAC,IAAI,IAAI,CAAC,CAAC;IAC/B,MAAM,WAAW,GAAG,OAAO,CAAC,WAAW,IAAI,CAAC,CAAC;IAC7C,MAAM,QAAQ,GAAG,OAAO,CAAC,QAAQ,IAAI,KAAK,CAAC;IAE3C,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,YAAY,SAAS,CAAC,UAAU,EAAE,CAAC,CAAC,CAAC;IACtD,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,WAAW,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC;IACpD,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,UAAU,OAAO,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;IAC5C,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,mBAAmB,IAAI,EAAE,CAAC,CAAC,CAAC;IAC5C,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,UAAU,SAAS,
CAAC,KAAK,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;IACrD,IAAI,QAAQ;QAAE,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,qDAAqD,CAAC,CAAC,CAAC;IACtF,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;IAEhB,2BAA2B;IAC3B,IAAI,IAAI,GAAG,CAAC,EAAE,CAAC;QACb,OAAO,CAAC,GAAG,CAAC,MAAM,CAAC,SAAS,IAAI,iGAAiG,CAAC,CAAC,CAAC;IACtI,CAAC;IAED,IAAI,WAAW,GAAuB,IAAI,CAAC;IAE3C,IAAI,KAAK,EAAE,MAAM,KAAK,IAAI,QAAQ,CAAC;QACjC,QAAQ;QACR,SAAS,EAAE,SAAS,CAAC,UAAU;QAC/B,YAAY;QACZ,SAAS,EAAE,SAAS,CAAC,KAAK;QAC1B,MAAM,EAAE,SAAS;QACjB,KAAK,EAAE,OAAO,CAAC,KAAK;QACpB,IAAI;QACJ,WAAW;QACX,QAAQ;KACT,CAAC,EAAE,CAAC;QACH,QAAQ,KAAK,CAAC,IAAI,EAAE,CAAC;YACnB,KAAK,0BAA0B;gBAC7B,OAAO,CAAC,GAAG,CAAC,MAAM,CAAC,YAAY,KAAK,CAAC,IAAI,CAAC,OAAO,IAAI,CAAC,CAAC,CAAC;gBACxD,MAAM;YAER,KAAK,mBAAmB;gBACtB,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,KAAK,CAAC,IAAI,CAAC,UAAU,GAAG,CAAC,IAAI,KAAK,CAAC,IAAI,CAAC,WAAW,KAAK,KAAK,CAAC,IAAI,CAAC,KAAK,KAAK,CAAC,CAAC,CAAC;gBAC7G,MAAM;YAER,KAAK,sBAAsB,CAAC,CAAC,CAAC;gBAC5B,MAAM,UAAU,GAAG,KAAK,CAAC,IAAI,CAAC,KAAK,KAAK,UAAU,CAAC,CAAC,CAAC,aAAa,CAAC,CAAC,CAAC,EAAE,CAAC;gBACxE,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,MAAM,KAAK,CAAC,IAAI,CAAC,KAAK,GAAG,UAAU,SAAS,KAAK,CAAC,IAAI,CAAC,GAAG,IAAI,KAAK,CAAC,IAAI,CAAC,SAAS,SAAS,KAAK,CAAC,IAAI,CAAC,WAAW,IAAI,KAAK,CAAC,IAAI,CAAC,UAAU,KAAK,KAAK,CAAC,IAAI,CAAC,eAAe,IAAI,CAAC,CAAC,CAAC;gBACzM,MAAM;YACR,CAAC;YAED,KAAK,sBAAsB;gBACzB,IAAI,KAAK,CAAC,IAAI,CAAC,MAAM,KAAK,UAAU,IAAI,KAAK,CAAC,IAAI,CAAC,QAAQ,EAAE,CAAC;oBAC5D,IAAI,OAAO,GAAG,qBAAqB,CAAC,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,IAAI,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC;oBACnF,IAAI,KAAK,CAAC,IAAI,CAAC,gBAAgB,IAAI,KAAK,CAAC,IAAI,CAAC,UAAU,EAAE,CAAC;wBACzD,MAAM,KAAK,GAAG,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,IAAI,GAAG,GAAG,CAAC;wBAC/C,MAAM,IAAI,GAAG,KAAK,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC;wBACnC,OAAO,IAAI,gBAAgB,CAAC,KAAK,CAAC,IAAI,CAAC,gBAAgB,CAAC,IAAI,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,cAAc,IAAI,GAAG,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC;wBACxH,IAAI,KAAK,CAAC,IAAI,CAAC,gBAAgB,IAAI,IAAI,IAAI,QAAQ,CAAC,KAAK,CAAC,IAAI,
CAAC,gBAAgB,CAAC,EAAE,CAAC;4BACjF,MAAM,OAAO,GAAG,KAAK,CAAC,IAAI,CAAC,gBAAgB,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC;4BAC5D,OAAO,IAAI,KAAK,OAAO,GAAG,KAAK,CAAC,IAAI,CAAC,gBAAgB,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC;wBACvE,CAAC;oBACH,CAAC;oBACD,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC;gBAC9B,CAAC;qBAAM,CAAC;oBACN,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,WAAW,KAAK,CAAC,IAAI,CAAC,YAAY,IAAI,SAAS,EAAE,CAAC,CAAC,CAAC;gBACtE,CAAC;gBACD,MAAM;YAER,KAAK,gBAAgB;gBACnB,WAAW,GAAG,KAAK,CAAC,IAAI,CAAC;gBACzB,MAAM;QACV,CAAC;IACH,CAAC;IAED,IAAI,CAAC,WAAW,EAAE,CAAC;QACjB,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,oCAAoC,CAAC,CAAC,CAAC;QACzD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAChB,OAAO;IACT,CAAC;IAED,2DAA2D;IAC3D,MAAM,MAAM,GAAG,CAAC,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;QACnD,IAAI,CAAC,CAAC,cAAc,IAAI,IAAI,IAAI,CAAC,CAAC,cAAc,IAAI,IAAI,EAAE,CAAC;YACzD,OAAO,CAAC,CAAC,cAAc,GAAG,CAAC,CAAC,cAAc,CAAC;QAC7C,CAAC;QACD,OAAO,CAAC,CAAC,QAAQ,CAAC,IAAI,GAAG,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC;IAC3C,CAAC,CAAC,CAAC;IAEH,iDAAiD;IACjD,IAAI,QAAQ,EAAE,CAAC;QACb,MAAM,OAAO,GAAG,CAAC,MAAM,EAAE,OAAO,EAAE,YAAY,EAAE,eAAe,EAAE,OAAO,EAAE,eAAe,EAAE,QAAQ,CAAC,CAAC;QACrG,MAAM,IAAI,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;YAChC,MAAM,CAAC,CAAC,GAAG,CAAC,CAAC;YACb,GAAG,CAAC,CAAC,QAAQ,IAAI,CAAC,CAAC,KAAK,EAAE;YAC1B,CAAC,CAAC,MAAM,KAAK,UAAU,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,QAAQ,EAAE,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG;YAC7D,CAAC,CAAC,MAAM,KAAK,UAAU,IAAI,CAAC,CAAC,gBAAgB,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,gBAAgB,EAAE,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG;YAC3F,CAAC,CAAC,MAAM,KAAK,UAAU,IAAI,CAAC,CAAC,UAAU;gBACrC,CAAC,CAAC,GAAG,CAAC,CAAC,UAAU,CAAC,IAAI,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,CAAC,UAAU,CAAC,IAAI,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI;gBACjF,CAAC,CAAC,GAAG;YACP,CAAC,CAAC,MAAM,KAAK,UAAU,IAAI,CAAC,CAAC,gBAAgB,IAAI,IAAI,IAAI,QAAQ,CAAC,CAAC,CAAC,gBAAgB,CAAC;gBACnF,CAAC,CAAC,GAAG,CAAC,CAAC,gBAAgB,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,GAAG,CA
AC,CAAC,gBAAgB,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG;gBAC1E,CAAC,CAAC,GAAG;YACP,CAAC,CAAC,MAAM,KAAK,UAAU,CAAC,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,KAAK,CAAC;SACnD,CAAC,CAAC;QAEH,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,yCAAyC,CAAC,CAAC,CAAC;QAC7D,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,OAAO,EAAE,IAAI,CAAC,CAAC,CAAC;QAElC,sBAAsB;QACtB,IAAI,WAAW,CAAC,iBAAiB,IAAI,IAAI,IAAI,WAAW,CAAC,kBAAkB,EAAE,CAAC;YAC5E,MAAM,YAAY,GAA0C;gBAC1D,SAAS,EAAE,KAAK,EAAE,IAAI,EAAE,KAAK,EAAE,QAAQ,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,OAAO,EAAE,GAAG;aAC/E,CAAC;YACF,MAAM,OAAO,GAAG,YAAY,CAAC,WAAW,CAAC,kBAAkB,CAAC,IAAI,GAAG,CAAC;YACpE,MAAM,IAAI,GAAG,WAAW,CAAC,iBAAiB,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC;YAC3D,MAAM,KAAK,GAAG,GAAG,IAAI,GAAG,WAAW,CAAC,iBAAiB,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,WAAW,CAAC,kBAAkB,CAAC,WAAW,EAAE,GAAG,CAAC;YACtH,OAAO,CAAC,GAAG,CAAC,oBAAoB,OAAO,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC;QACpD,CAAC;IACH,CAAC;SAAM,CAAC;QACN,MAAM,OAAO,GAAG,CAAC,MAAM,EAAE,OAAO,EAAE,WAAW,EAAE,UAAU,EAAE,MAAM,EAAE,QAAQ,CAAC,CAAC;QAC7E,MAAM,IAAI,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;YAChC,MAAM,CAAC,CAAC,GAAG,CAAC,CAAC;YACb,GAAG,CAAC,CAAC,QAAQ,IAAI,CAAC,CAAC,KAAK,EAAE;YAC1B,CAAC,CAAC,MAAM,KAAK,UAAU,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,QAAQ,EAAE,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG;YAC7D,CAAC,CAAC,MAAM,KAAK,UAAU,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,QAAQ,EAAE,KAAK,EAAE,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG;YACpE,CAAC,CAAC,IAAI,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,GAAG;YACtD,CAAC,CAAC,MAAM,KAAK,UAAU,CAAC,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,KAAK,CAAC;SACnD,CAAC,CAAC;QAEH,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,mBAAmB,CAAC,CAAC,CAAC;QACvC,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,OAAO,EAAE,IAAI,CAAC,CAAC,CAAC;IACpC,CAAC;IAED,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,0BAA0B,QAAQ,qBAAqB,CAAC,CAAC,CAAC;AAC5E,CAAC;AAED,SAAS,WAAW,CAAC,KAAiB,EAAE,SAAkB,EAAE,MAAM,GAAG,EAAE;IACrE,IAAI,SAAS,EAAE,CAAC;QACd,OAAO,GAAG,CAAC,KAAK,CAAC,IAAI,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAA
C,CAAC,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,MAAM,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC;IACjH,CAAC;IACD,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,IAAI,CAAC,GAAG,MAAM,GAAG,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,MAAM,CAAC,GAAG,MAAM,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC;AACnH,CAAC"}
@@ -12,4 +12,5 @@ export declare function evalCommand(subcommand: string, target?: string, opts?:
12
12
  judge?: string;
13
13
  runs?: string;
14
14
  batch?: boolean;
15
+ baseline?: boolean;
15
16
  }): Promise<void>;
@@ -67,6 +67,7 @@ export async function evalCommand(subcommand, target, opts = {}) {
67
67
  judge: opts.judge,
68
68
  runs: opts.runs ? parseInt(opts.runs, 10) : undefined,
69
69
  concurrency: opts.concurrency ? parseInt(opts.concurrency, 10) : undefined,
70
+ baseline: opts.baseline,
70
71
  });
71
72
  }
72
73
  case "credentials": {
@@ -1 +1 @@
1
- {"version":3,"file":"eval.js","sourceRoot":"","sources":["../../src/commands/eval.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,mCAAmC;AACnC,8EAA8E;AAE9E,OAAO,EAAE,IAAI,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAC1C,OAAO,EAAE,UAAU,EAAE,MAAM,SAAS,CAAC;AACrC,OAAO,EAAE,GAAG,EAAE,GAAG,EAAE,MAAM,oBAAoB,CAAC;AAE9C,MAAM,CAAC,KAAK,UAAU,WAAW,CAC/B,UAAkB,EAClB,MAAe,EACf,OAAiP,EAAE;IAEnP,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;IAE3D,QAAQ,UAAU,EAAE,CAAC;QACnB,KAAK,OAAO,CAAC,CAAC,CAAC;YACb,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;YACxD,MAAM,EAAE,YAAY,EAAE,GAAG,MAAM,MAAM,CAAC,iBAAiB,CAAC,CAAC;YACzD,OAAO,YAAY,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC;QAClC,CAAC;QAED,KAAK,MAAM,CAAC,CAAC,CAAC;YACZ,IAAI,CAAC,MAAM,EAAE,CAAC;gBACZ,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,0CAA0C,CAAC,CAAC,CAAC;gBAC/D,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAClB,CAAC;YACD,MAAM,QAAQ,GAAG,eAAe,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;YAC/C,MAAM,EAAE,WAAW,EAAE,GAAG,MAAM,MAAM,CAAC,gBAAgB,CAAC,CAAC;YACvD,MAAM,QAAQ,GAAG,CAAC,IAAI,CAAC,IAAI,KAAK,aAAa,IAAI,IAAI,CAAC,IAAI,KAAK,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC;YAC3F,OAAO,WAAW,CAAC,QAAQ,EAAE,CAAC,CAAC,IAAI,CAAC,KAAK,EAAE,QAAQ,CAAC,CAAC;QACvD,CAAC;QAED,KAAK,KAAK,CAAC,CAAC,CAAC;YACX,IAAI,CAAC,MAAM,EAAE,CAAC;gBACZ,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,yCAAyC,CAAC,CAAC,CAAC;gBAC9D,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAClB,CAAC;YACD,MAAM,QAAQ,GAAG,eAAe,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;YAC/C,MAAM,EAAE,UAAU,EAAE,GAAG,MAAM,MAAM,CAAC,eAAe,CAAC,CAAC;YACrD,sEAAsE;YACtE,MAAM,OAAO,GAAG,IAAI,CAAC,OAAO,KAAK,IAAI,IAAI,IAAI,CAAC,KAAK,KAAK,KAAK,CAAC;YAC9D,OAAO,UAAU,CAAC,QAAQ,EAAE;gBAC1B,WAAW,EAAE,IAAI,CAAC,WAAW,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,WAAW,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,SAAS;gBAC1E,UAAU,EAAE,IAAI,CAAC,UAAU;gBAC3B,OAAO;gBACP,KAAK,EAAE,IAAI,CAAC,KAAK;aAClB,CAAC,CAAC;QACL,CAAC;QAED,KAAK,UAAU,CAAC,CAAC,CAAC;YAChB,MAAM,EAAE,eAAe,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CA
AC,CAAC;YAC/D,OAAO,eAAe,CAAC,IAAI,CAAC,CAAC;QAC/B,CAAC;QAED,KAAK,cAAc,CAAC,CAAC,CAAC;YACpB,MAAM,EAAE,kBAAkB,EAAE,GAAG,MAAM,MAAM,CAAC,wBAAwB,CAAC,CAAC;YACtE,MAAM,gBAAgB,GAAG,IAAI,CAAC,WAAW,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,WAAW,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;YACvF,OAAO,kBAAkB,CAAC,IAAI,EAAE,CAAC,CAAC,IAAI,CAAC,KAAK,EAAE,gBAAgB,CAAC,CAAC;QAClE,CAAC;QAED,KAAK,OAAO,CAAC,CAAC,CAAC;YACb,IAAI,CAAC,MAAM,EAAE,CAAC;gBACZ,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,wEAAwE,CAAC,CAAC,CAAC;gBAC7F,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAClB,CAAC;YACD,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC;gBACjB,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,oHAAoH,CAAC,CAAC,CAAC;gBACzI,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAClB,CAAC;YACD,IAAI,CAAC,IAAI,CAAC,KAAK,EAAE,CAAC;gBAChB,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,8DAA8D,CAAC,CAAC,CAAC;gBACnF,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAClB,CAAC;YACD,MAAM,QAAQ,GAAG,eAAe,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;YAC/C,MAAM,EAAE,YAAY,EAAE,GAAG,MAAM,MAAM,CAAC,iBAAiB,CAAC,CAAC;YACzD,OAAO,YAAY,CAAC,QAAQ,EAAE;gBAC5B,MAAM,EAAE,IAAI,CAAC,MAAM;gBACnB,KAAK,EAAE,IAAI,CAAC,KAAK;gBACjB,IAAI,EAAE,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,SAAS;gBACrD,WAAW,EAAE,IAAI,CAAC,WAAW,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,WAAW,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,SAAS;aAC3E,CAAC,CAAC;QACL,CAAC;QAED,KAAK,aAAa,CAAC,CAAC,CAAC;YACnB,IAAI,CAAC,MAAM,EAAE,CAAC;gBACZ,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,iEAAiE,CAAC,CAAC,CAAC;gBACtF,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAClB,CAAC;YACD,iDAAiD;YACjD,sDAAsD;YACtD,MAAM,YAAY,GAAG,IAAI,CAAC;YAC1B,MAAM,EAAE,iBAAiB,EAAE,kBAAkB,EAAE,mBAAmB,EAAE,GAAG,MAAM,MAAM,CAAC,uBAAuB,CAAC,CAAC;YAC7G,QAAQ,MAAM,EAAE,CAAC;gBACf,KAAK,KAAK,CAAC,CAAC,CAAC;oBACX,MAAM,GAAG,GAAG,IAAI,CAAC,aAAa,CAAC;oBAC/B,IAAI,CAAC,GAAG,EAAE,CAAC;wBACT,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,qCAAqC,CAAC,CAAC,CAAC;wBAC1D,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;oBAClB,CAAC;oBACD,OAAO,iBAAiB,CAAC,YAAY,EAAE,GAAG,CAAC,CAAC;gBAC9C,CAAC;gBACD,KAAK,MAAM;oBACT,OAAO,kBAAkB,CAAC,YAAY,CAAC,CAAC;gBAC1C,KAAK,OAAO;oBACV,OAAO,mBAAmB,CAAC,
YAAY,CAAC,CAAC;gBAC3C;oBACE,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,oCAAoC,MAAM,KAAK,CAAC,GAAG,GAAG,CAAC,6BAA6B,CAAC,CAAC,CAAC;YAC7G,CAAC;YACD,MAAM;QACR,CAAC;QAED;YACE,OAAO,CAAC,KAAK,CACX,GAAG,CAAC,wBAAwB,UAAU,KAAK,CAAC;gBAC1C,GAAG,CAAC,yEAAyE,CAAC,CACjF,CAAC;IACN,CAAC;AACH,CAAC;AAED,SAAS,eAAe,CAAC,IAAY,EAAE,MAAc;IACnD,MAAM,KAAK,GAAG,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;IAChC,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACvB,OAAO,CAAC,KAAK,CACX,GAAG,CAAC,mBAAmB,MAAM,sCAAsC,CAAC,CACrE,CAAC;QACF,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAED,qDAAqD;IACrD,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,EAAE,KAAK,CAAC,CAAC,CAAC,EAAE,QAAQ,EAAE,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;IAC5D,IAAI,UAAU,CAAC,UAAU,CAAC;QAAE,OAAO,UAAU,CAAC;IAE9C,sEAAsE;IACtE,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,EAAE,SAAS,EAAE,KAAK,CAAC,CAAC,CAAC,EAAE,QAAQ,EAAE,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;IACvE,IAAI,UAAU,CAAC,UAAU,CAAC;QAAE,OAAO,UAAU,CAAC;IAE9C,0CAA0C;IAC1C,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,EAAE,QAAQ,EAAE,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;IAChD,IAAI,UAAU,CAAC,QAAQ,CAAC;QAAE,OAAO,QAAQ,CAAC;IAE1C,sEAAsE;IACtE,OAAO,UAAU,CAAC;AACpB,CAAC"}
1
+ {"version":3,"file":"eval.js","sourceRoot":"","sources":["../../src/commands/eval.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,mCAAmC;AACnC,8EAA8E;AAE9E,OAAO,EAAE,IAAI,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAC1C,OAAO,EAAE,UAAU,EAAE,MAAM,SAAS,CAAC;AACrC,OAAO,EAAE,GAAG,EAAE,GAAG,EAAE,MAAM,oBAAoB,CAAC;AAE9C,MAAM,CAAC,KAAK,UAAU,WAAW,CAC/B,UAAkB,EAClB,MAAe,EACf,OAAqQ,EAAE;IAEvQ,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;IAE3D,QAAQ,UAAU,EAAE,CAAC;QACnB,KAAK,OAAO,CAAC,CAAC,CAAC;YACb,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;YACxD,MAAM,EAAE,YAAY,EAAE,GAAG,MAAM,MAAM,CAAC,iBAAiB,CAAC,CAAC;YACzD,OAAO,YAAY,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC;QAClC,CAAC;QAED,KAAK,MAAM,CAAC,CAAC,CAAC;YACZ,IAAI,CAAC,MAAM,EAAE,CAAC;gBACZ,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,0CAA0C,CAAC,CAAC,CAAC;gBAC/D,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAClB,CAAC;YACD,MAAM,QAAQ,GAAG,eAAe,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;YAC/C,MAAM,EAAE,WAAW,EAAE,GAAG,MAAM,MAAM,CAAC,gBAAgB,CAAC,CAAC;YACvD,MAAM,QAAQ,GAAG,CAAC,IAAI,CAAC,IAAI,KAAK,aAAa,IAAI,IAAI,CAAC,IAAI,KAAK,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC;YAC3F,OAAO,WAAW,CAAC,QAAQ,EAAE,CAAC,CAAC,IAAI,CAAC,KAAK,EAAE,QAAQ,CAAC,CAAC;QACvD,CAAC;QAED,KAAK,KAAK,CAAC,CAAC,CAAC;YACX,IAAI,CAAC,MAAM,EAAE,CAAC;gBACZ,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,yCAAyC,CAAC,CAAC,CAAC;gBAC9D,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAClB,CAAC;YACD,MAAM,QAAQ,GAAG,eAAe,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;YAC/C,MAAM,EAAE,UAAU,EAAE,GAAG,MAAM,MAAM,CAAC,eAAe,CAAC,CAAC;YACrD,sEAAsE;YACtE,MAAM,OAAO,GAAG,IAAI,CAAC,OAAO,KAAK,IAAI,IAAI,IAAI,CAAC,KAAK,KAAK,KAAK,CAAC;YAC9D,OAAO,UAAU,CAAC,QAAQ,EAAE;gBAC1B,WAAW,EAAE,IAAI,CAAC,WAAW,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,WAAW,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,SAAS;gBAC1E,UAAU,EAAE,IAAI,CAAC,UAAU;gBAC3B,OAAO;gBACP,KAAK,EAAE,IAAI,CAAC,KAAK;aAClB,CAAC,CAAC;QACL,CAAC;QAED,KAAK,UAAU,CAAC,CAAC,CAAC;YAChB,MAAM,EAAE,eAAe,EAAE,GAAG,MAAM,MAAM,CAAC,oBAAoB,CA
AC,CAAC;YAC/D,OAAO,eAAe,CAAC,IAAI,CAAC,CAAC;QAC/B,CAAC;QAED,KAAK,cAAc,CAAC,CAAC,CAAC;YACpB,MAAM,EAAE,kBAAkB,EAAE,GAAG,MAAM,MAAM,CAAC,wBAAwB,CAAC,CAAC;YACtE,MAAM,gBAAgB,GAAG,IAAI,CAAC,WAAW,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,WAAW,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;YACvF,OAAO,kBAAkB,CAAC,IAAI,EAAE,CAAC,CAAC,IAAI,CAAC,KAAK,EAAE,gBAAgB,CAAC,CAAC;QAClE,CAAC;QAED,KAAK,OAAO,CAAC,CAAC,CAAC;YACb,IAAI,CAAC,MAAM,EAAE,CAAC;gBACZ,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,wEAAwE,CAAC,CAAC,CAAC;gBAC7F,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAClB,CAAC;YACD,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC;gBACjB,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,oHAAoH,CAAC,CAAC,CAAC;gBACzI,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAClB,CAAC;YACD,IAAI,CAAC,IAAI,CAAC,KAAK,EAAE,CAAC;gBAChB,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,8DAA8D,CAAC,CAAC,CAAC;gBACnF,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAClB,CAAC;YACD,MAAM,QAAQ,GAAG,eAAe,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;YAC/C,MAAM,EAAE,YAAY,EAAE,GAAG,MAAM,MAAM,CAAC,iBAAiB,CAAC,CAAC;YACzD,OAAO,YAAY,CAAC,QAAQ,EAAE;gBAC5B,MAAM,EAAE,IAAI,CAAC,MAAM;gBACnB,KAAK,EAAE,IAAI,CAAC,KAAK;gBACjB,IAAI,EAAE,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,SAAS;gBACrD,WAAW,EAAE,IAAI,CAAC,WAAW,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,WAAW,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,SAAS;gBAC1E,QAAQ,EAAE,IAAI,CAAC,QAAQ;aACxB,CAAC,CAAC;QACL,CAAC;QAED,KAAK,aAAa,CAAC,CAAC,CAAC;YACnB,IAAI,CAAC,MAAM,EAAE,CAAC;gBACZ,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,iEAAiE,CAAC,CAAC,CAAC;gBACtF,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAClB,CAAC;YACD,iDAAiD;YACjD,sDAAsD;YACtD,MAAM,YAAY,GAAG,IAAI,CAAC;YAC1B,MAAM,EAAE,iBAAiB,EAAE,kBAAkB,EAAE,mBAAmB,EAAE,GAAG,MAAM,MAAM,CAAC,uBAAuB,CAAC,CAAC;YAC7G,QAAQ,MAAM,EAAE,CAAC;gBACf,KAAK,KAAK,CAAC,CAAC,CAAC;oBACX,MAAM,GAAG,GAAG,IAAI,CAAC,aAAa,CAAC;oBAC/B,IAAI,CAAC,GAAG,EAAE,CAAC;wBACT,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,qCAAqC,CAAC,CAAC,CAAC;wBAC1D,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;oBAClB,CAAC;oBACD,OAAO,iBAAiB,CAAC,YAAY,EAAE,GAAG,CAAC,CAAC;gBAC9C,CAAC;gBACD,KAAK,MAAM;oBACT,OAAO,kBAAkB,CAAC,YAAY,CAAC,CAAC;gBAC1C,K
AAK,OAAO;oBACV,OAAO,mBAAmB,CAAC,YAAY,CAAC,CAAC;gBAC3C;oBACE,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,oCAAoC,MAAM,KAAK,CAAC,GAAG,GAAG,CAAC,6BAA6B,CAAC,CAAC,CAAC;YAC7G,CAAC;YACD,MAAM;QACR,CAAC;QAED;YACE,OAAO,CAAC,KAAK,CACX,GAAG,CAAC,wBAAwB,UAAU,KAAK,CAAC;gBAC1C,GAAG,CAAC,yEAAyE,CAAC,CACjF,CAAC;IACN,CAAC;AACH,CAAC;AAED,SAAS,eAAe,CAAC,IAAY,EAAE,MAAc;IACnD,MAAM,KAAK,GAAG,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;IAChC,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACvB,OAAO,CAAC,KAAK,CACX,GAAG,CAAC,mBAAmB,MAAM,sCAAsC,CAAC,CACrE,CAAC;QACF,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAED,qDAAqD;IACrD,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,EAAE,KAAK,CAAC,CAAC,CAAC,EAAE,QAAQ,EAAE,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;IAC5D,IAAI,UAAU,CAAC,UAAU,CAAC;QAAE,OAAO,UAAU,CAAC;IAE9C,sEAAsE;IACtE,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,EAAE,SAAS,EAAE,KAAK,CAAC,CAAC,CAAC,EAAE,QAAQ,EAAE,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;IACvE,IAAI,UAAU,CAAC,UAAU,CAAC;QAAE,OAAO,UAAU,CAAC;IAE9C,0CAA0C;IAC1C,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,EAAE,QAAQ,EAAE,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;IAChD,IAAI,UAAU,CAAC,QAAQ,CAAC;QAAE,OAAO,QAAQ,CAAC;IAE1C,sEAAsE;IACtE,OAAO,UAAU,CAAC;AACpB,CAAC"}
@@ -0,0 +1,21 @@
1
+ import type { ActivationResult } from "./activation-tester.js";
2
+ export interface ActivationHistoryRun {
3
+ id: string;
4
+ timestamp: string;
5
+ model: string;
6
+ provider: string;
7
+ promptCount: number;
8
+ summary: {
9
+ precision: number;
10
+ recall: number;
11
+ reliability: number;
12
+ tp: number;
13
+ tn: number;
14
+ fp: number;
15
+ fn: number;
16
+ };
17
+ results: ActivationResult[];
18
+ }
19
+ export declare function writeActivationRun(skillDir: string, run: ActivationHistoryRun): Promise<void>;
20
+ export declare function listActivationRuns(skillDir: string): Promise<Omit<ActivationHistoryRun, "results">[]>;
21
+ export declare function getActivationRun(skillDir: string, runId: string): Promise<ActivationHistoryRun | null>;
@@ -0,0 +1,41 @@
1
+ // ---------------------------------------------------------------------------
2
+ // activation-history.ts -- persistent activation test history per skill
3
+ // ---------------------------------------------------------------------------
4
+ import { readFile, writeFile, mkdir } from "node:fs/promises";
5
+ import { join } from "node:path";
6
+ const HISTORY_FILENAME = "activation-history.json";
7
+ const MAX_RUNS = 50;
8
/**
 * Build the path of the activation history file that lives inside a skill
 * directory.
 *
 * @param {string} skillDir - Directory of the skill.
 * @returns {string} `skillDir` joined with the history filename.
 */
function historyPath(skillDir) {
    const location = join(skillDir, HISTORY_FILENAME);
    return location;
}
11
/**
 * Load the activation history stored for a skill.
 *
 * This is a best-effort read: a missing file, an unreadable file, invalid
 * JSON, or JSON without a `runs` array all yield a fresh empty history
 * instead of an error.
 *
 * @param {string} skillDir - Directory of the skill.
 * @returns {Promise<{ runs: Array<object> }>} The parsed history object, or
 *   `{ runs: [] }` when nothing usable is on disk.
 */
async function readHistoryFile(skillDir) {
    let data;
    try {
        const raw = await readFile(historyPath(skillDir), "utf-8");
        data = JSON.parse(raw);
    }
    catch {
        // Any read or parse failure is treated as "no history yet".
        return { runs: [] };
    }
    const hasRunList = Boolean(data) && Array.isArray(data.runs);
    return hasRunList ? data : { runs: [] };
}
23
/**
 * Persist one activation test run in the skill's history file.
 *
 * The skill directory is created when absent. The run is appended to the
 * existing history and, once the list exceeds `MAX_RUNS`, the oldest entries
 * are dropped so that only the newest `MAX_RUNS` remain. The file is rewritten
 * as pretty-printed JSON (2-space indent).
 *
 * @param {string} skillDir - Directory of the skill.
 * @param {object} run - The run record to append.
 * @returns {Promise<void>}
 */
export async function writeActivationRun(skillDir, run) {
    await mkdir(skillDir, { recursive: true });
    const history = await readHistoryFile(skillDir);
    const appended = history.runs.concat([run]);
    // Number of entries beyond the cap; they sit at the front (oldest first).
    const overflow = appended.length - MAX_RUNS;
    history.runs = overflow > 0 ? appended.slice(overflow) : appended;
    const serialized = JSON.stringify(history, null, 2);
    await writeFile(historyPath(skillDir), serialized);
}
33
/**
 * List the stored activation runs for a skill as lightweight summaries.
 *
 * Each entry is a shallow copy of the stored run without its `results`
 * property. The order is reversed relative to the file, so the most recently
 * appended run comes first.
 *
 * @param {string} skillDir - Directory of the skill.
 * @returns {Promise<Array<object>>} Run summaries, newest first.
 */
export async function listActivationRuns(skillDir) {
    const { runs } = await readHistoryFile(skillDir);
    const summaries = [];
    // Walk from the end so the output is newest-first.
    for (let index = runs.length - 1; index >= 0; index--) {
        const { results: _omitted, ...summary } = runs[index];
        summaries.push(summary);
    }
    return summaries;
}
37
/**
 * Look up a single stored activation run by its identifier.
 *
 * @param {string} skillDir - Directory of the skill.
 * @param {string} runId - Identifier of the run to fetch.
 * @returns {Promise<object | null>} The first stored run whose `id` strictly
 *   equals `runId` (including its `results`), or `null` when none matches.
 */
export async function getActivationRun(skillDir, runId) {
    const { runs } = await readHistoryFile(skillDir);
    for (const candidate of runs) {
        if (candidate.id === runId) {
            return candidate;
        }
    }
    return null;
}
41
+ //# sourceMappingURL=activation-history.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"activation-history.js","sourceRoot":"","sources":["../../src/eval/activation-history.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,wEAAwE;AACxE,8EAA8E;AAE9E,OAAO,EAAE,QAAQ,EAAE,SAAS,EAAE,KAAK,EAAE,MAAM,kBAAkB,CAAC;AAC9D,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AAyBjC,MAAM,gBAAgB,GAAG,yBAAyB,CAAC;AACnD,MAAM,QAAQ,GAAG,EAAE,CAAC;AAEpB,SAAS,WAAW,CAAC,QAAgB;IACnC,OAAO,IAAI,CAAC,QAAQ,EAAE,gBAAgB,CAAC,CAAC;AAC1C,CAAC;AAED,KAAK,UAAU,eAAe,CAAC,QAAgB;IAC7C,IAAI,CAAC;QACH,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,WAAW,CAAC,QAAQ,CAAC,EAAE,OAAO,CAAC,CAAC;QAC/D,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;QACnC,IAAI,MAAM,IAAI,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,IAAI,CAAC;YAAE,OAAO,MAA+B,CAAC;QACjF,OAAO,EAAE,IAAI,EAAE,EAAE,EAAE,CAAC;IACtB,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,EAAE,IAAI,EAAE,EAAE,EAAE,CAAC;IACtB,CAAC;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,kBAAkB,CACtC,QAAgB,EAChB,GAAyB;IAEzB,MAAM,KAAK,CAAC,QAAQ,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAC3C,MAAM,OAAO,GAAG,MAAM,eAAe,CAAC,QAAQ,CAAC,CAAC;IAChD,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IACvB,2BAA2B;IAC3B,IAAI,OAAO,CAAC,IAAI,CAAC,MAAM,GAAG,QAAQ,EAAE,CAAC;QACnC,OAAO,CAAC,IAAI,GAAG,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,IAAI,CAAC,MAAM,GAAG,QAAQ,CAAC,CAAC;IACpE,CAAC;IACD,MAAM,SAAS,CAAC,WAAW,CAAC,QAAQ,CAAC,EAAE,IAAI,CAAC,SAAS,CAAC,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;AAC3E,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,kBAAkB,CACtC,QAAgB;IAEhB,MAAM,OAAO,GAAG,MAAM,eAAe,CAAC,QAAQ,CAAC,CAAC;IAChD,OAAO,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,OAAO,EAAE,QAAQ,EAAE,GAAG,IAAI,EAAE,EAAE,EAAE,CAAC,IAAI,CAAC,CAAC,OAAO,EAAE,CAAC;AAC9E,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,gBAAgB,CACpC,QAAgB,EAChB,KAAa;IAEb,MAAM,OAAO,GAAG,MAAM,eAAe,CAAC,QAAQ,CAAC,CAAC;IAChD,OAAO,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,KAAK,KAAK,CAAC,IAAI,IAAI,CAAC;AAC1D,CAAC"}
@@ -19,9 +19,10 @@ import { createLlmClient } from "../eval/llm.js";
19
19
  import { runComparison } from "../eval/comparator.js";
20
20
  import { computeVerdict } from "../eval/verdict.js";
21
21
  import { generateActionItems } from "../eval/action-items.js";
22
- import { buildEvalInitPrompt, parseGeneratedEvals } from "../eval/prompt-builder.js";
22
+ import { buildEvalInitPrompt, parseGeneratedEvals, buildIntegrationEvalPrompt, parseGeneratedIntegrationEvals, detectBrowserRequirements, detectPlatformTargets } from "../eval/prompt-builder.js";
23
23
  import { testActivation } from "../eval/activation-tester.js";
24
24
  import { detectMcpDependencies, detectSkillDependencies } from "../eval/mcp-detector.js";
25
+ import { writeActivationRun, listActivationRuns, getActivationRun } from "../eval/activation-history.js";
25
26
  // ---------------------------------------------------------------------------
26
27
  // In-memory config state — UI can change provider/model at runtime.
27
28
  //
@@ -468,6 +469,7 @@ export function registerRoutes(router, root, projectName) {
468
469
  sendJson(res, body, 200, req);
469
470
  });
470
471
  // Generate evals using AI — reads SKILL.md and returns generated EvalsFile
472
+ // Accepts optional { provider, model, testType } in request body
471
473
  router.post("/api/skills/:plugin/:skill/generate-evals", async (req, res, params) => {
472
474
  const skillDir = resolveSkillDir(root, params.plugin, params.skill);
473
475
  const skillMdPath = join(skillDir, "SKILL.md");
@@ -481,47 +483,114 @@ export function registerRoutes(router, root, projectName) {
481
483
  res.on("close", () => { aborted = true; });
482
484
  if (wantsSSE)
483
485
  initSSE(res, req);
486
+ // Read optional body params for model selection + test type
487
+ const body = await readBody(req).catch(() => ({}));
488
+ // Build per-request client: use body overrides if provided, else global
489
+ const overrides = { ...currentOverrides };
490
+ if (body.provider)
491
+ overrides.provider = body.provider;
492
+ if (body.model)
493
+ overrides.model = body.model;
494
+ const isIntegration = body.testType === "integration";
484
495
  try {
485
496
  if (wantsSSE && !aborted)
486
497
  sendSSE(res, "progress", { phase: "preparing", message: "Reading skill content..." });
487
498
  const skillContent = readFileSync(skillMdPath, "utf-8");
488
- const prompt = buildEvalInitPrompt(skillContent);
489
- const client = getClient();
499
+ // Build prompt based on test type
500
+ let prompt;
501
+ if (isIntegration) {
502
+ const mcpDeps = detectMcpDependencies(skillContent);
503
+ const browserReqs = detectBrowserRequirements(skillContent);
504
+ const platforms = detectPlatformTargets(skillContent);
505
+ prompt = buildIntegrationEvalPrompt(skillContent, mcpDeps, browserReqs, platforms);
506
+ }
507
+ else {
508
+ prompt = buildEvalInitPrompt(skillContent);
509
+ }
510
+ const client = createLlmClient(overrides);
490
511
  if (wantsSSE && !aborted)
491
- sendSSE(res, "progress", { phase: "generating", message: "Generating test cases..." });
512
+ sendSSE(res, "progress", {
513
+ phase: "generating",
514
+ message: `Generating ${isIntegration ? "integration" : "unit"} test cases...`,
515
+ });
492
516
  const genResult = wantsSSE
493
- ? await withHeartbeat(res, undefined, "generating", "Generating test cases", () => client.generate("You generate eval test cases for AI skills. Output only valid JSON in a code fence.", prompt))
517
+ ? await withHeartbeat(res, undefined, "generating", `Generating ${isIntegration ? "integration" : "unit"} test cases`, () => client.generate("You generate eval test cases for AI skills. Output only valid JSON in a code fence.", prompt))
494
518
  : await client.generate("You generate eval test cases for AI skills. Output only valid JSON in a code fence.", prompt);
495
519
  if (aborted)
496
520
  return;
497
521
  if (wantsSSE && !aborted)
498
522
  sendSSE(res, "progress", { phase: "parsing", message: "Parsing generated evals..." });
499
- const evalsFile = parseGeneratedEvals(genResult.text);
500
- // Record history entry for eval generation
501
- try {
502
- const client = getClient();
503
- await writeHistoryEntry(skillDir, {
504
- timestamp: new Date().toISOString(),
505
- model: client.model,
506
- skill_name: evalsFile.skill_name || params.skill,
507
- cases: [],
508
- overall_pass_rate: undefined,
509
- type: "eval-generate",
510
- provider: currentOverrides.provider || "claude-cli",
511
- generate: { prompt: prompt, result: JSON.stringify(evalsFile) },
512
- });
513
- }
514
- catch { /* history write failure should not break the main response */ }
515
- if (wantsSSE && !aborted) {
516
- sendSSEDone(res, evalsFile);
523
+ // Parse based on test type
524
+ if (isIntegration) {
525
+ const integrationCases = parseGeneratedIntegrationEvals(genResult.text);
526
+ // Load existing evals to merge and avoid ID collisions
527
+ let existingEvals = null;
528
+ try {
529
+ existingEvals = loadAndValidateEvals(skillDir);
530
+ }
531
+ catch (e) {
532
+ if (e.code !== "ENOENT" &&
533
+ !(e instanceof Error && e.message.includes("ENOENT"))) {
534
+ throw e;
535
+ }
536
+ // File doesn't exist — no existing evals, proceed with empty
537
+ }
538
+ const existingIds = existingEvals?.evals.map((e) => e.id) ?? [];
539
+ const maxId = existingIds.length > 0 ? Math.max(...existingIds) : 0;
540
+ // Re-number integration cases to avoid collisions
541
+ const reNumbered = integrationCases.map((c, i) => ({ ...c, id: maxId + 1 + i }));
542
+ const mergedEvals = {
543
+ skill_name: existingEvals?.skill_name || params.skill,
544
+ evals: [...(existingEvals?.evals || []), ...reNumbered],
545
+ };
546
+ // Record history
547
+ try {
548
+ await writeHistoryEntry(skillDir, {
549
+ timestamp: new Date().toISOString(),
550
+ model: client.model,
551
+ skill_name: mergedEvals.skill_name,
552
+ cases: [],
553
+ overall_pass_rate: undefined,
554
+ type: "eval-generate",
555
+ provider: overrides.provider || "claude-cli",
556
+ generate: { prompt, result: JSON.stringify(mergedEvals) },
557
+ });
558
+ }
559
+ catch { /* history write failure should not break the main response */ }
560
+ if (wantsSSE && !aborted) {
561
+ sendSSEDone(res, mergedEvals);
562
+ }
563
+ else {
564
+ sendJson(res, mergedEvals, 200, req);
565
+ }
517
566
  }
518
567
  else {
519
- sendJson(res, evalsFile, 200, req);
568
+ const evalsFile = parseGeneratedEvals(genResult.text);
569
+ // Record history entry for eval generation
570
+ try {
571
+ await writeHistoryEntry(skillDir, {
572
+ timestamp: new Date().toISOString(),
573
+ model: client.model,
574
+ skill_name: evalsFile.skill_name || params.skill,
575
+ cases: [],
576
+ overall_pass_rate: undefined,
577
+ type: "eval-generate",
578
+ provider: overrides.provider || "claude-cli",
579
+ generate: { prompt, result: JSON.stringify(evalsFile) },
580
+ });
581
+ }
582
+ catch { /* history write failure should not break the main response */ }
583
+ if (wantsSSE && !aborted) {
584
+ sendSSEDone(res, evalsFile);
585
+ }
586
+ else {
587
+ sendJson(res, evalsFile, 200, req);
588
+ }
520
589
  }
521
590
  }
522
591
  catch (err) {
523
592
  if (wantsSSE && !aborted) {
524
- sendSSE(res, "error", classifyError(err, currentOverrides.provider || "claude-cli"));
593
+ sendSSE(res, "error", classifyError(err, overrides.provider || "claude-cli"));
525
594
  res.end();
526
595
  }
527
596
  else {
@@ -1033,13 +1102,40 @@ export function registerRoutes(router, root, projectName) {
1033
1102
  name: nameMatch ? nameMatch[1] : params.skill,
1034
1103
  tags: tagsMatch ? tagsMatch[1].split(",").map((t) => t.trim()).filter(Boolean) : [],
1035
1104
  };
1036
- const client = getClient();
1105
+ // Use per-request model overrides if provided, fall back to global config
1106
+ const client = body.provider || body.model
1107
+ ? createLlmClient({ provider: body.provider, model: body.model })
1108
+ : getClient();
1037
1109
  const summary = await testActivation(description, body.prompts, client, (result) => {
1038
1110
  if (!aborted) {
1039
1111
  sendSSE(res, "prompt_result", result);
1040
1112
  }
1041
1113
  }, meta);
1042
1114
  if (!aborted) {
1115
+ // Write activation history entry
1116
+ const usedProvider = body.provider || currentOverrides.provider || "unknown";
1117
+ const usedModel = body.model || currentOverrides.model || "unknown";
1118
+ const run = {
1119
+ id: `run-${Date.now()}`,
1120
+ timestamp: new Date().toISOString(),
1121
+ model: usedModel,
1122
+ provider: usedProvider,
1123
+ promptCount: summary.total,
1124
+ summary: {
1125
+ precision: summary.precision,
1126
+ recall: summary.recall,
1127
+ reliability: summary.reliability,
1128
+ tp: summary.tp,
1129
+ tn: summary.tn,
1130
+ fp: summary.fp,
1131
+ fn: summary.fn,
1132
+ },
1133
+ results: summary.results,
1134
+ };
1135
+ try {
1136
+ await writeActivationRun(skillDir, run);
1137
+ }
1138
+ catch { /* non-blocking */ }
1043
1139
  sendSSEDone(res, { ...summary, description });
1044
1140
  }
1045
1141
  }
@@ -1047,6 +1143,81 @@ export function registerRoutes(router, root, projectName) {
1047
1143
  sendSSEDone(res, { error: err instanceof Error ? err.message : String(err) });
1048
1144
  }
1049
1145
  });
1146
+ // AI-generate activation test prompts (SSE)
1147
// POST /api/skills/:plugin/:skill/activation-prompts
//
// Asks the LLM for activation test prompts derived from the `description`
// field of the skill's SKILL.md front matter and streams them over SSE.
//
// Optional request body: { count, provider, model }
//   - count: number of prompts to request; anything that is not a finite
//     number >= 1 falls back to 8, fractional values are floored.
//   - provider / model: per-request LLM override; otherwise the global client.
//
// Responses:
//   - 404 JSON when SKILL.md is missing, 400 JSON when it has no description.
//   - SSE "prompt_generated" per accepted prompt, then a done event carrying
//     { prompts } (or { error } when generation fails).
router.post("/api/skills/:plugin/:skill/activation-prompts", async (req, res, params) => {
    const skillDir = resolveSkillDir(root, params.plugin, params.skill);
    let aborted = false;
    res.on("close", () => { aborted = true; });
    try {
        // NOTE(review): readBody is not visible here; guard against a nullish
        // result so an empty request simply uses the defaults.
        const body = (await readBody(req)) ?? {};
        const skillMdPath = join(skillDir, "SKILL.md");
        if (!existsSync(skillMdPath)) {
            sendJson(res, { error: "SKILL.md not found" }, 404, req);
            return;
        }
        const skillContent = readFileSync(skillMdPath, "utf-8");
        const descMatch = skillContent.match(/^---[\s\S]*?description:\s*"([^"]+)"[\s\S]*?---/);
        const description = descMatch ? descMatch[1] : "";
        if (!description) {
            sendJson(res, { error: "No skill description available" }, 400, req);
            return;
        }
        initSSE(res, req);
        // Sanitize the requested count: it is interpolated into the LLM prompt,
        // so strings, NaN, negatives and fractions must not leak through.
        const requestedCount = Number(body.count);
        const count = Number.isFinite(requestedCount) && requestedCount >= 1
            ? Math.floor(requestedCount)
            : 8;
        const half = Math.ceil(count / 2);
        const client = body.provider || body.model
            ? createLlmClient({ provider: body.provider, model: body.model })
            : getClient();
        const systemPrompt = `Given this skill description, generate test prompts to evaluate activation quality.
Generate ${count} prompts: ${half} that SHOULD activate this skill, ${count - half} that should NOT.
For "should not" prompts, make them plausible but clearly outside this skill's domain.
Return one JSON object per line: {"prompt": "...", "expected": "should_activate"|"should_not_activate"}
Return ONLY the JSON lines, no other text.`;
        const userPrompt = `Skill description: ${description}`;
        const { text } = await client.generate(systemPrompt, userPrompt);
        if (aborted)
            return;
        // Only these two labels are requested from the model; anything else is
        // model noise and is dropped rather than forwarded to the client.
        const allowedExpectations = ["should_activate", "should_not_activate"];
        const allPrompts = [];
        const lines = text.split("\n").filter((l) => l.trim());
        for (const line of lines) {
            try {
                // Strip code-fence markers the model may have wrapped around a line.
                const cleaned = line.replace(/^```(?:json)?\s*/i, "").replace(/```\s*$/i, "").trim();
                if (!cleaned.startsWith("{"))
                    continue;
                const parsed = JSON.parse(cleaned);
                if (typeof parsed.prompt !== "string" || !parsed.prompt.trim())
                    continue;
                if (!allowedExpectations.includes(parsed.expected))
                    continue;
                // Emit exactly what is collected, without stray keys from the model.
                const entry = { prompt: parsed.prompt, expected: parsed.expected };
                allPrompts.push(entry);
                if (!aborted)
                    sendSSE(res, "prompt_generated", entry);
            }
            catch { /* skip malformed lines */ }
        }
        if (!aborted)
            sendSSEDone(res, { prompts: allPrompts });
    }
    catch (err) {
        if (!aborted) {
            sendSSEDone(res, { error: err instanceof Error ? err.message : String(err) });
        }
    }
});
1205
+ // List activation test history (summaries only)
1206
// GET /api/skills/:plugin/:skill/activation-history
// Responds 200 with { runs }: the stored run summaries for the skill,
// as returned by listActivationRuns.
router.get("/api/skills/:plugin/:skill/activation-history", async (req, res, params) => {
    const { plugin, skill } = params;
    const runs = await listActivationRuns(resolveSkillDir(root, plugin, skill));
    sendJson(res, { runs }, 200, req);
});
1211
+ // Get full activation test run by ID
1212
// GET /api/skills/:plugin/:skill/activation-history/:runId
// Responds 200 with the full stored run, or 404 { error: "Run not found" }
// when no run with that id exists for the skill.
router.get("/api/skills/:plugin/:skill/activation-history/:runId", async (req, res, params) => {
    const { plugin, skill, runId } = params;
    const run = await getActivationRun(resolveSkillDir(root, plugin, skill), runId);
    if (run) {
        sendJson(res, run, 200, req);
        return;
    }
    sendJson(res, { error: "Run not found" }, 404, req);
});
1050
1221
  // Get skill dependencies (MCP + skill-to-skill)
1051
1222
  router.get("/api/skills/:plugin/:skill/dependencies", async (req, res, params) => {
1052
1223
  const skillDir = resolveSkillDir(root, params.plugin, params.skill);