@skilljack/evals 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +327 -0
  3. package/action/action.yml +72 -0
  4. package/action/index.ts +78 -0
  5. package/dist/action/index.d.ts +8 -0
  6. package/dist/action/index.d.ts.map +1 -0
  7. package/dist/action/index.js +68 -0
  8. package/dist/action/index.js.map +1 -0
  9. package/dist/src/cli.d.ts +9 -0
  10. package/dist/src/cli.d.ts.map +1 -0
  11. package/dist/src/cli.js +264 -0
  12. package/dist/src/cli.js.map +1 -0
  13. package/dist/src/config.d.ts +52 -0
  14. package/dist/src/config.d.ts.map +1 -0
  15. package/dist/src/config.js +194 -0
  16. package/dist/src/config.js.map +1 -0
  17. package/dist/src/index.d.ts +24 -0
  18. package/dist/src/index.d.ts.map +1 -0
  19. package/dist/src/index.js +28 -0
  20. package/dist/src/index.js.map +1 -0
  21. package/dist/src/parser.d.ts +22 -0
  22. package/dist/src/parser.d.ts.map +1 -0
  23. package/dist/src/parser.js +205 -0
  24. package/dist/src/parser.js.map +1 -0
  25. package/dist/src/pipeline.d.ts +53 -0
  26. package/dist/src/pipeline.d.ts.map +1 -0
  27. package/dist/src/pipeline.js +185 -0
  28. package/dist/src/pipeline.js.map +1 -0
  29. package/dist/src/report/github-summary.d.ts +15 -0
  30. package/dist/src/report/github-summary.d.ts.map +1 -0
  31. package/dist/src/report/github-summary.js +77 -0
  32. package/dist/src/report/github-summary.js.map +1 -0
  33. package/dist/src/report/report.d.ts +23 -0
  34. package/dist/src/report/report.d.ts.map +1 -0
  35. package/dist/src/report/report.js +216 -0
  36. package/dist/src/report/report.js.map +1 -0
  37. package/dist/src/runner/runner.d.ts +29 -0
  38. package/dist/src/runner/runner.d.ts.map +1 -0
  39. package/dist/src/runner/runner.js +211 -0
  40. package/dist/src/runner/runner.js.map +1 -0
  41. package/dist/src/runner/security.d.ts +26 -0
  42. package/dist/src/runner/security.d.ts.map +1 -0
  43. package/dist/src/runner/security.js +34 -0
  44. package/dist/src/runner/security.js.map +1 -0
  45. package/dist/src/runner/skill-setup.d.ts +19 -0
  46. package/dist/src/runner/skill-setup.d.ts.map +1 -0
  47. package/dist/src/runner/skill-setup.js +72 -0
  48. package/dist/src/runner/skill-setup.js.map +1 -0
  49. package/dist/src/scorer/deterministic.d.ts +12 -0
  50. package/dist/src/scorer/deterministic.d.ts.map +1 -0
  51. package/dist/src/scorer/deterministic.js +149 -0
  52. package/dist/src/scorer/deterministic.js.map +1 -0
  53. package/dist/src/scorer/judge.d.ts +34 -0
  54. package/dist/src/scorer/judge.d.ts.map +1 -0
  55. package/dist/src/scorer/judge.js +226 -0
  56. package/dist/src/scorer/judge.js.map +1 -0
  57. package/dist/src/scorer/scorer.d.ts +25 -0
  58. package/dist/src/scorer/scorer.d.ts.map +1 -0
  59. package/dist/src/scorer/scorer.js +149 -0
  60. package/dist/src/scorer/scorer.js.map +1 -0
  61. package/dist/src/session/session-logger.d.ts +30 -0
  62. package/dist/src/session/session-logger.d.ts.map +1 -0
  63. package/dist/src/session/session-logger.js +157 -0
  64. package/dist/src/session/session-logger.js.map +1 -0
  65. package/dist/src/types.d.ts +227 -0
  66. package/dist/src/types.d.ts.map +1 -0
  67. package/dist/src/types.js +16 -0
  68. package/dist/src/types.js.map +1 -0
  69. package/package.json +44 -0
@@ -0,0 +1,216 @@
1
+ /**
2
+ * Report generation for skill evaluation results.
3
+ *
4
+ * Generates markdown and JSON reports from combined evaluation scores.
5
+ */
6
+ import * as fs from 'fs/promises';
7
+ import * as path from 'path';
8
+ import { loadConfigSync } from '../config.js';
9
/**
 * Generate a markdown report from evaluation results.
 *
 * The report contains an overall PASS/FAIL verdict, a summary table compared
 * against the configured thresholds, a failure-category breakdown and one
 * section per task (scores, optional deterministic-check details, judge
 * reasoning, a truncated copy of the agent output and run metrics).
 *
 * `results[i]` and `scores[i]` are read for every `evaluation.tasks[i]`, so
 * the three arrays must be index-aligned.
 *
 * @param {object} evaluation - Evaluation definition; `skillName` and `tasks` are read.
 * @param {object[]} results - Per-task run results (`output`, `skillLoads`, `durationMs`, `numTurns`, `costUsd`).
 * @param {object[]} scores - Per-task scores (`discovery`, `adherence`, `outputQuality`, `weightedScore`, ...).
 * @param {string} [outputPath] - When truthy, the report is also written to this path (parent directories are created).
 * @param {object} [metadata] - Optional run metadata rendered below the header.
 * @returns {Promise<string>} The generated markdown.
 */
export async function generateReport(evaluation, results, scores, outputPath, metadata) {
    const config = loadConfigSync();
    const totalTasks = evaluation.tasks.length;
    const summary = computeSummary(results, scores);
    const failureBreakdown = computeFailureBreakdown(scores);
    // Determine pass/fail
    const discoveryPassed = summary.discoveryAccuracy >= config.discoveryThreshold;
    const adherencePassed = summary.avgAdherence >= config.scoreThreshold;
    const qualityPassed = summary.avgOutputQuality >= config.scoreThreshold;
    const passed = discoveryPassed && adherencePassed && qualityPassed;
    // Build metadata section
    let metaSection = '';
    if (metadata) {
        const metaLines = [`**Skill Path:** \`${metadata.skillPath}\``];
        if (metadata.gitCommit) {
            // Only the commit is checked above; if no branch was recorded, print
            // the bare commit instead of a literal "undefined@<commit>".
            const gitRef = metadata.gitBranch
                ? `${metadata.gitBranch}@${metadata.gitCommit}`
                : metadata.gitCommit;
            metaLines.push(`**Git:** ${gitRef}`);
        }
        if (metadata.version) {
            metaLines.push(`**Version:** ${metadata.version}`);
        }
        metaLines.push(`**Agent Model:** ${metadata.agentModel}`);
        metaLines.push(`**Judge Model:** ${metadata.judgeModel}`);
        metaSection = metaLines.join('\n') + '\n';
    }
    // Build report
    let report = `# Skill Evaluation Report: ${evaluation.skillName}

**Generated:** ${new Date().toISOString()}
**Total Tasks:** ${totalTasks}
**Result:** ${passed ? 'PASS' : 'FAIL'}
${metaSection}
---

## Summary

| Metric | Value | Threshold | Status |
|--------|-------|-----------|--------|
| **Discovery Accuracy** | ${(summary.discoveryAccuracy * 100).toFixed(1)}% | ${(config.discoveryThreshold * 100).toFixed(0)}% | ${discoveryPassed ? 'PASS' : 'FAIL'} |
| **Avg Adherence Score** | ${summary.avgAdherence.toFixed(2)}/5.0 | ${config.scoreThreshold.toFixed(1)} | ${adherencePassed ? 'PASS' : 'FAIL'} |
| **Avg Output Quality** | ${summary.avgOutputQuality.toFixed(2)}/5.0 | ${config.scoreThreshold.toFixed(1)} | ${qualityPassed ? 'PASS' : 'FAIL'} |
| **Avg Weighted Score** | ${summary.avgWeightedScore.toFixed(2)} | | |
| **Total Duration** | ${(summary.totalDurationMs / 1000).toFixed(1)}s | | |
| **Total Cost** | $${summary.totalCostUsd.toFixed(4)} | | |

## Failure Analysis

| Category | Count | Percentage |
|----------|-------|------------|
`;
    for (const fb of failureBreakdown) {
        // formatCategory already renders 'none' as "No Failure".
        report += `| ${formatCategory(fb.category)} | ${fb.count} | ${fb.percentage.toFixed(1)}% |\n`;
    }
    report += `\n---\n\n## Task Details\n\n`;
    for (let i = 0; i < evaluation.tasks.length; i++) {
        const task = evaluation.tasks[i];
        const result = results[i];
        const score = scores[i];
        const loadedSkills = result.skillLoads.length > 0
            ? result.skillLoads.map((s) => `\`${s}\``).join(', ')
            : 'None';
        // Same fallback computeFailureBreakdown applies, so a score without a
        // category is reported as "No Failure" instead of crashing the report.
        const failureCategory = score.failureCategory || 'none';
        report += `### Task ${i + 1}: ${task.id}

**Prompt:** ${task.prompt}

**Expected Skill:** \`${task.expectedSkillLoad}\`
**Loaded Skills:** ${loadedSkills}

#### Scores

| Dimension | Score | Status |
|-----------|-------|--------|
| Discovery | ${Math.round(score.discovery)} | ${score.discovery >= 1 ? 'PASS' : 'FAIL'} |
| Adherence | ${score.adherence}/5 | ${score.adherence >= 4 ? 'PASS' : 'FAIL'} |
| Output Quality | ${score.outputQuality}/5 | ${score.outputQuality >= 4 ? 'PASS' : 'FAIL'} |
| **Weighted** | **${score.weightedScore.toFixed(2)}** | |

**Failure Category:** ${formatCategory(failureCategory)}
`;
        // Show deterministic results if available
        if (score.deterministic) {
            report += `\n**Deterministic Check:** ${score.deterministic.passed ? 'PASS' : 'FAIL'}\n`;
            for (const detail of score.deterministic.details) {
                report += `- ${detail}\n`;
            }
        }
        const outputExcerpt = result.output.slice(0, config.reportOutputTruncation) || '(no output)';
        // Agent output frequently contains its own ``` blocks; a longer outer
        // fence keeps them from terminating the report's code block early.
        const fence = fenceForOutput(outputExcerpt);
        report += `\n**Reasoning:** ${score.reasoning || 'No reasoning provided'}

<details>
<summary>Agent Output (click to expand)</summary>

${fence}
${outputExcerpt}
${fence}

</details>

**Metrics:** Duration: ${(result.durationMs / 1000).toFixed(1)}s | Turns: ${result.numTurns} | Cost: $${result.costUsd.toFixed(4)}

---

`;
    }
    if (outputPath) {
        await fs.mkdir(path.dirname(outputPath), { recursive: true });
        await fs.writeFile(outputPath, report);
        console.log(`Report saved to: ${outputPath}`);
    }
    return report;
}
/**
 * Pick a markdown code fence that cannot be closed by the text it wraps:
 * at least three backticks and one more than the longest backtick run in `text`.
 */
function fenceForOutput(text) {
    let longestRun = 0;
    for (const run of text.match(/`+/g) ?? []) {
        longestRun = Math.max(longestRun, run.length);
    }
    return '`'.repeat(Math.max(3, longestRun + 1));
}
120
/**
 * Build the machine-readable (JSON) counterpart of the evaluation report.
 *
 * The returned object carries the overall verdict, human-readable failure
 * reasons, the optional run metadata, the aggregate summary, the failure
 * breakdown and one `{ task, result, score }` entry per evaluation task.
 *
 * @param {object} evaluation - Evaluation definition; `skillName` and `tasks` are read.
 * @param {object[]} results - Per-task run results, index-aligned with `evaluation.tasks`.
 * @param {object[]} scores - Per-task scores, index-aligned with `evaluation.tasks`.
 * @param {string} [outputPath] - When truthy, the report is also written there as pretty-printed JSON.
 * @param {object} [metadata] - Optional run metadata copied into the report.
 * @returns {Promise<object>} The report object.
 */
export async function generateJsonResults(evaluation, results, scores, outputPath, metadata) {
    const config = loadConfigSync();
    const summary = computeSummary(results, scores);
    const failureBreakdown = computeFailureBreakdown(scores);
    const discoveryPassed = summary.discoveryAccuracy >= config.discoveryThreshold;
    const scorePassed = summary.avgAdherence >= config.scoreThreshold && summary.avgOutputQuality >= config.scoreThreshold;
    // Each candidate pairs "did this check fail?" with a lazily built message,
    // so a message is only formatted for checks that actually failed.
    const reasonCandidates = [
        [
            !discoveryPassed,
            () => `Discovery rate ${(summary.discoveryAccuracy * 100).toFixed(1)}% below threshold ${(config.discoveryThreshold * 100).toFixed(0)}%`,
        ],
        [
            summary.avgAdherence < config.scoreThreshold,
            () => `Avg adherence ${summary.avgAdherence.toFixed(2)} below threshold ${config.scoreThreshold}`,
        ],
        [
            summary.avgOutputQuality < config.scoreThreshold,
            () => `Avg output quality ${summary.avgOutputQuality.toFixed(2)} below threshold ${config.scoreThreshold}`,
        ],
    ];
    const failureReasons = reasonCandidates
        .filter(([failed]) => failed)
        .map(([, describe]) => describe());
    // Copy only the known metadata fields; stays undefined when no metadata was given.
    let metadataSnapshot;
    if (metadata) {
        metadataSnapshot = {
            skillPath: metadata.skillPath,
            gitCommit: metadata.gitCommit,
            gitBranch: metadata.gitBranch,
            version: metadata.version,
            agentModel: metadata.agentModel,
            judgeModel: metadata.judgeModel,
        };
    }
    const taskEntries = evaluation.tasks.map((task, index) => {
        return { task, result: results[index], score: scores[index] };
    });
    const report = {
        skillName: evaluation.skillName,
        timestamp: new Date().toISOString(),
        passed: discoveryPassed && scorePassed,
        failureReasons,
        metadata: metadataSnapshot,
        summary,
        failureBreakdown,
        tasks: taskEntries,
    };
    if (!outputPath) {
        return report;
    }
    await fs.mkdir(path.dirname(outputPath), { recursive: true });
    await fs.writeFile(outputPath, JSON.stringify(report, null, 2));
    console.log(`JSON results saved to: ${outputPath}`);
    return report;
}
168
/**
 * Aggregate per-task scores and run results into summary statistics.
 *
 * Averages are taken over `scores` (0 when there are no scores); duration and
 * cost are summed over `results`. A task counts as correctly discovered when
 * its `discovery` score is at least 1.
 *
 * @param {object[]} results - Run results providing `durationMs` and `costUsd`.
 * @param {object[]} scores - Scores providing `discovery`, `adherence`, `outputQuality` and `weightedScore`.
 * @returns {object} Summary with task count, discovery accuracy, averages and totals.
 */
export function computeSummary(results, scores) {
    const taskCount = scores.length;
    const sumOf = (items, field) => items.reduce((acc, item) => acc + item[field], 0);
    const averageOf = (field) => {
        if (taskCount > 0) {
            return sumOf(scores, field) / taskCount;
        }
        return 0;
    };
    const discovered = scores.reduce((hits, entry) => (entry.discovery >= 1 ? hits + 1 : hits), 0);
    return {
        totalTasks: taskCount,
        discoveryAccuracy: taskCount > 0 ? discovered / taskCount : 0,
        avgAdherence: averageOf('adherence'),
        avgOutputQuality: averageOf('outputQuality'),
        avgWeightedScore: averageOf('weightedScore'),
        totalDurationMs: sumOf(results, 'durationMs'),
        totalCostUsd: sumOf(results, 'costUsd'),
    };
}
190
/**
 * Count how often each failure category occurs across the scores.
 *
 * Scores without a (truthy) `failureCategory` are counted under 'none'.
 *
 * @param {object[]} scores - Per-task scores; only `failureCategory` is read.
 * @returns {{category: string, count: number, percentage: number}[]} One row
 *   per category, most frequent first; `percentage` is relative to `scores.length`.
 */
export function computeFailureBreakdown(scores) {
    const tally = new Map();
    for (const { failureCategory } of scores) {
        const key = failureCategory || 'none';
        const seen = tally.has(key) ? tally.get(key) : 0;
        tally.set(key, seen + 1);
    }
    const taskCount = scores.length;
    const rows = [];
    for (const [category, count] of tally) {
        const percentage = taskCount > 0 ? (count / taskCount) * 100 : 0;
        rows.push({ category, count, percentage });
    }
    // Descending by count; equal counts keep first-seen order.
    rows.sort((left, right) => right.count - left.count);
    return rows;
}
208
/**
 * Turn a snake_case failure category into a display label.
 *
 * 'none' and a missing/empty category both render as "No Failure", matching
 * computeFailureBreakdown, which counts scores without a category under 'none'.
 *
 * @param {string} [cat] - Failure category such as 'discovery_failure'.
 * @returns {string} Label with each underscore-separated word capitalised.
 */
function formatCategory(cat) {
    if (!cat || cat === 'none') {
        return 'No Failure';
    }
    return cat
        .split('_')
        .map((word) => word.charAt(0).toUpperCase() + word.slice(1))
        .join(' ');
}
216
+ //# sourceMappingURL=report.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"report.js","sourceRoot":"","sources":["../../../src/report/report.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,KAAK,EAAE,MAAM,aAAa,CAAC;AAClC,OAAO,KAAK,IAAI,MAAM,MAAM,CAAC;AAW7B,OAAO,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AAE9C;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,cAAc,CAClC,UAA2B,EAC3B,OAAqB,EACrB,MAAuB,EACvB,UAAmB,EACnB,QAAyB;IAEzB,MAAM,MAAM,GAAG,cAAc,EAAE,CAAC;IAChC,MAAM,UAAU,GAAG,UAAU,CAAC,KAAK,CAAC,MAAM,CAAC;IAC3C,MAAM,OAAO,GAAG,cAAc,CAAC,OAAO,EAAE,MAAM,CAAC,CAAC;IAChD,MAAM,gBAAgB,GAAG,uBAAuB,CAAC,MAAM,CAAC,CAAC;IAEzD,sBAAsB;IACtB,MAAM,eAAe,GAAG,OAAO,CAAC,iBAAiB,IAAI,MAAM,CAAC,kBAAkB,CAAC;IAC/E,MAAM,WAAW,GAAG,OAAO,CAAC,YAAY,IAAI,MAAM,CAAC,cAAc,IAAI,OAAO,CAAC,gBAAgB,IAAI,MAAM,CAAC,cAAc,CAAC;IACvH,MAAM,MAAM,GAAG,eAAe,IAAI,WAAW,CAAC;IAE9C,yBAAyB;IACzB,IAAI,WAAW,GAAG,EAAE,CAAC;IACrB,IAAI,QAAQ,EAAE,CAAC;QACb,MAAM,SAAS,GAAG,CAAC,qBAAqB,QAAQ,CAAC,SAAS,IAAI,CAAC,CAAC;QAChE,IAAI,QAAQ,CAAC,SAAS,EAAE,CAAC;YACvB,SAAS,CAAC,IAAI,CAAC,YAAY,QAAQ,CAAC,SAAS,IAAI,QAAQ,CAAC,SAAS,EAAE,CAAC,CAAC;QACzE,CAAC;QACD,IAAI,QAAQ,CAAC,OAAO;YAAE,SAAS,CAAC,IAAI,CAAC,gBAAgB,QAAQ,CAAC,OAAO,EAAE,CAAC,CAAC;QACzE,SAAS,CAAC,IAAI,CAAC,oBAAoB,QAAQ,CAAC,UAAU,EAAE,CAAC,CAAC;QAC1D,SAAS,CAAC,IAAI,CAAC,oBAAoB,QAAQ,CAAC,UAAU,EAAE,CAAC,CAAC;QAC1D,WAAW,GAAG,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC;IAC5C,CAAC;IAED,eAAe;IACf,IAAI,MAAM,GAAG,8BAA8B,UAAU,CAAC,SAAS;;iBAEhD,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;mBACtB,UAAU;cACf,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM;EACpC,WAAW;;;;;;;6BAOgB,CAAC,OAAO,CAAC,iBAAiB,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,OAAO,CAAC,MAAM,CAAC,kBAAkB,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,OAAO,eAAe,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM;8BACtI,OAAO,CAAC,YAAY,CAAC,OAAO,CAAC,CAAC,CAAC,UAAU,MAAM,CAAC,cAAc,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,OAAO,CAAC,YAAY,IAAI,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM;6BAC/I,OAAO,CAAC,gBAAgB,CAAC,OAAO,CAAC,CAAC,CAAC,UAAU,MAAM,CAAC,cAAc,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,OAAO,CAAC,gBAAgB,IAAI,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAA
C,MAAM;6BACtJ,OAAO,CAAC,gBAAgB,CAAC,OAAO,CAAC,CAAC,CAAC;yBACvC,CAAC,OAAO,CAAC,eAAe,GAAG,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC;sBAC9C,OAAO,CAAC,YAAY,CAAC,OAAO,CAAC,CAAC,CAAC;;;;;;CAMpD,CAAC;IAEA,KAAK,MAAM,EAAE,IAAI,gBAAgB,EAAE,CAAC;QAClC,MAAM,UAAU,GAAG,EAAE,CAAC,QAAQ,KAAK,MAAM,CAAC,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,cAAc,CAAC,EAAE,CAAC,QAAQ,CAAC,CAAC;QACvF,MAAM,IAAI,KAAK,UAAU,MAAM,EAAE,CAAC,KAAK,MAAM,EAAE,CAAC,UAAU,CAAC,OAAO,CAAC,CAAC,CAAC,OAAO,CAAC;IAC/E,CAAC;IAED,MAAM,IAAI,8BAA8B,CAAC;IAEzC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,UAAU,CAAC,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACjD,MAAM,IAAI,GAAG,UAAU,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QACjC,MAAM,MAAM,GAAG,OAAO,CAAC,CAAC,CAAC,CAAC;QAC1B,MAAM,KAAK,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC;QAExB,MAAM,YAAY,GAAG,MAAM,CAAC,UAAU,CAAC,MAAM,GAAG,CAAC;YAC/C,CAAC,CAAC,MAAM,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC;YACrD,CAAC,CAAC,MAAM,CAAC;QAEX,MAAM,IAAI,YAAY,CAAC,GAAG,CAAC,KAAK,IAAI,CAAC,EAAE;;cAE7B,IAAI,CAAC,MAAM;;wBAED,IAAI,CAAC,iBAAiB;qBACzB,YAAY;;;;;;gBAMjB,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,SAAS,CAAC,MAAM,KAAK,CAAC,SAAS,IAAI,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM;gBACvE,KAAK,CAAC,SAAS,QAAQ,KAAK,CAAC,SAAS,IAAI,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM;qBACxD,KAAK,CAAC,aAAa,QAAQ,KAAK,CAAC,aAAa,IAAI,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM;qBACrE,KAAK,CAAC,aAAa,CAAC,OAAO,CAAC,CAAC,CAAC;;wBAE3B,cAAc,CAAC,KAAK,CAAC,eAAe,CAAC;CAC5D,CAAC;QAEE,0CAA0C;QAC1C,IAAI,KAAK,CAAC,aAAa,EAAE,CAAC;YACxB,MAAM,IAAI,8BAA8B,KAAK,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,IAAI,CAAC;YACzF,KAAK,MAAM,MAAM,IAAI,KAAK,CAAC,aAAa,CAAC,OAAO,EAAE,CAAC;gBACjD,MAAM,IAAI,KAAK,MAAM,IAAI,CAAC;YAC5B,CAAC;QACH,CAAC;QAED,MAAM,IAAI,oBAAoB,KAAK,CAAC,SAAS,IAAI,uBAAuB;;;;;;EAM1E,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,MAAM,CAAC,sBAAsB,CAAC,IAAI,aAAa;;;;;yBAK/C,CAAC,MAAM,CAAC,UAAU,GAAG,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,cAAc,MAAM,CAAC,QAAQ,aAAa,MAAM,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC;;;;CAIhI,CAAC;IACA,CAAC;IAE
D,IAAI,UAAU,EAAE,CAAC;QACf,MAAM,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,UAAU,CAAC,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;QAC9D,MAAM,EAAE,CAAC,SAAS,CAAC,UAAU,EAAE,MAAM,CAAC,CAAC;QACvC,OAAO,CAAC,GAAG,CAAC,oBAAoB,UAAU,EAAE,CAAC,CAAC;IAChD,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,mBAAmB,CACvC,UAA2B,EAC3B,OAAqB,EACrB,MAAuB,EACvB,UAAmB,EACnB,QAAyB;IAEzB,MAAM,MAAM,GAAG,cAAc,EAAE,CAAC;IAChC,MAAM,OAAO,GAAG,cAAc,CAAC,OAAO,EAAE,MAAM,CAAC,CAAC;IAChD,MAAM,gBAAgB,GAAG,uBAAuB,CAAC,MAAM,CAAC,CAAC;IAEzD,MAAM,eAAe,GAAG,OAAO,CAAC,iBAAiB,IAAI,MAAM,CAAC,kBAAkB,CAAC;IAC/E,MAAM,WAAW,GAAG,OAAO,CAAC,YAAY,IAAI,MAAM,CAAC,cAAc,IAAI,OAAO,CAAC,gBAAgB,IAAI,MAAM,CAAC,cAAc,CAAC;IACvH,MAAM,MAAM,GAAG,eAAe,IAAI,WAAW,CAAC;IAE9C,MAAM,cAAc,GAAa,EAAE,CAAC;IACpC,IAAI,CAAC,eAAe,EAAE,CAAC;QACrB,cAAc,CAAC,IAAI,CACjB,kBAAkB,CAAC,OAAO,CAAC,iBAAiB,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,qBAAqB,CAAC,MAAM,CAAC,kBAAkB,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CACnI,CAAC;IACJ,CAAC;IACD,IAAI,OAAO,CAAC,YAAY,GAAG,MAAM,CAAC,cAAc,EAAE,CAAC;QACjD,cAAc,CAAC,IAAI,CACjB,iBAAiB,OAAO,CAAC,YAAY,CAAC,OAAO,CAAC,CAAC,CAAC,oBAAoB,MAAM,CAAC,cAAc,EAAE,CAC5F,CAAC;IACJ,CAAC;IACD,IAAI,OAAO,CAAC,gBAAgB,GAAG,MAAM,CAAC,cAAc,EAAE,CAAC;QACrD,cAAc,CAAC,IAAI,CACjB,sBAAsB,OAAO,CAAC,gBAAgB,CAAC,OAAO,CAAC,CAAC,CAAC,oBAAoB,MAAM,CAAC,cAAc,EAAE,CACrG,CAAC;IACJ,CAAC;IAED,MAAM,MAAM,GAAqB;QAC/B,SAAS,EAAE,UAAU,CAAC,SAAS;QAC/B,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;QACnC,MAAM;QACN,cAAc;QACd,QAAQ,EAAE,QAAQ,CAAC,CAAC,CAAC;YACnB,SAAS,EAAE,QAAQ,CAAC,SAAS;YAC7B,SAAS,EAAE,QAAQ,CAAC,SAAS;YAC7B,SAAS,EAAE,QAAQ,CAAC,SAAS;YAC7B,OAAO,EAAE,QAAQ,CAAC,OAAO;YACzB,UAAU,EAAE,QAAQ,CAAC,UAAU;YAC/B,UAAU,EAAE,QAAQ,CAAC,UAAU;SAChC,CAAC,CAAC,CAAC,SAAS;QACb,OAAO;QACP,gBAAgB;QAChB,KAAK,EAAE,UAAU,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;YACxC,IAAI;YACJ,MAAM,EAAE,OAAO,CAAC,CAAC,CAAC;YAClB,KAAK,EAAE,MAAM,CAAC,CAAC,CAAC;SACjB,CAAC,CAAC;KACJ,CAAC;IAEF,IAAI,UAAU,EAAE,CAAC;QACf,MAAM,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,UAAU,
CAAC,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;QAC9D,MAAM,EAAE,CAAC,SAAS,CAAC,UAAU,EAAE,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;QAChE,OAAO,CAAC,GAAG,CAAC,0BAA0B,UAAU,EAAE,CAAC,CAAC;IACtD,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,cAAc,CAC5B,OAAqB,EACrB,MAAuB;IAEvB,MAAM,UAAU,GAAG,MAAM,CAAC,MAAM,CAAC;IACjC,MAAM,gBAAgB,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,SAAS,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC;IAEvE,OAAO;QACL,UAAU;QACV,iBAAiB,EAAE,UAAU,GAAG,CAAC,CAAC,CAAC,CAAC,gBAAgB,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC;QACrE,YAAY,EAAE,UAAU,GAAG,CAAC;YAC1B,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,SAAS,EAAE,CAAC,CAAC,GAAG,UAAU;YAC9D,CAAC,CAAC,CAAC;QACL,gBAAgB,EAAE,UAAU,GAAG,CAAC;YAC9B,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,aAAa,EAAE,CAAC,CAAC,GAAG,UAAU;YAClE,CAAC,CAAC,CAAC;QACL,gBAAgB,EAAE,UAAU,GAAG,CAAC;YAC9B,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,aAAa,EAAE,CAAC,CAAC,GAAG,UAAU;YAClE,CAAC,CAAC,CAAC;QACL,eAAe,EAAE,OAAO,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,UAAU,EAAE,CAAC,CAAC;QAClE,YAAY,EAAE,OAAO,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,OAAO,EAAE,CAAC,CAAC;KAC7D,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,uBAAuB,CAAC,MAAuB;IAC7D,MAAM,MAAM,GAAG,IAAI,GAAG,EAAkB,CAAC;IACzC,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;QAC3B,MAAM,GAAG,GAAG,KAAK,CAAC,eAAe,IAAI,MAAM,CAAC;QAC5C,MAAM,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,MAAM,CAAC,GAAG,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IAC9C,CAAC;IAED,MAAM,KAAK,GAAG,MAAM,CAAC,MAAM,CAAC;IAC5B,OAAO,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,OAAO,EAAE,CAAC;SAChC,GAAG,CAAC,CAAC,CAAC,QAAQ,EAAE,KAAK,CAAC,EAAE,EAAE,CAAC,CAAC;QAC3B,QAAQ,EAAE,QAA2B;QACrC,KAAK;QACL,UAAU,EAAE,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,KAAK,GAAG,KAAK,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC;KAClD,CAAC,CAAC;SACF,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,C
AAC,KAAK,CAAC,CAAC;AACvC,CAAC;AAED,SAAS,cAAc,CAAC,GAAW;IACjC,IAAI,GAAG,KAAK,MAAM;QAAE,OAAO,YAAY,CAAC;IACxC,OAAO,GAAG;SACP,KAAK,CAAC,GAAG,CAAC;SACV,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,WAAW,EAAE,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;SAClD,IAAI,CAAC,GAAG,CAAC,CAAC;AACf,CAAC"}
@@ -0,0 +1,29 @@
1
+ /**
2
+ * Skill Evaluation Runner - Claude Agent SDK
3
+ *
4
+ * Runs evaluation tasks against an agent using the Claude Agent SDK.
5
+ * Supports local skill delivery (.claude/skills/) with both Anthropic API
6
+ * and Bedrock (via CLAUDE_CODE_USE_BEDROCK=1 env var).
7
+ *
8
+ * Security: Uses permissionMode 'bypassPermissions' for automated execution,
9
+ * with file writes restricted via canUseTool callback to allowedWriteDirs only.
10
+ */
11
+ import type { EvalTask, SkillEvaluation, TaskResult, RunnerOptions } from '../types.js';
12
+ import type { SessionLogger } from '../session/session-logger.js';
13
+ export declare class SkillEvalRunner {
14
+ private options;
15
+ constructor(options?: RunnerOptions);
16
+ /**
17
+ * Execute a single evaluation task, optionally recording the session via `logger`; errors are reported in the resolved TaskResult (isError/errorMessage).
18
+ */
19
+ runTask(task: EvalTask, logger?: SessionLogger): Promise<TaskResult>;
20
+ /**
21
+ * Execute a task with timeout protection; `timeoutMs` falls back to the configured taskTimeoutMs, and a timeout resolves to an error TaskResult.
22
+ */
23
+ runTaskWithTimeout(task: EvalTask, timeoutMs?: number, logger?: SessionLogger): Promise<TaskResult>;
24
+ /**
25
+ * Run all tasks in an evaluation suite (concurrently when the `parallel` option is set, otherwise one at a time), calling `createLogger` once per task.
26
+ */
27
+ runAll(evaluation: SkillEvaluation, createLogger?: (task: EvalTask) => SessionLogger): Promise<TaskResult[]>;
28
+ }
29
+ //# sourceMappingURL=runner.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"runner.d.ts","sourceRoot":"","sources":["../../../src/runner/runner.ts"],"names":[],"mappings":"AAAA;;;;;;;;;GASG;AAGH,OAAO,KAAK,EACV,QAAQ,EACR,eAAe,EAEf,UAAU,EACV,aAAa,EACd,MAAM,aAAa,CAAC;AASrB,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,8BAA8B,CAAC;AAElE,qBAAa,eAAe;IAC1B,OAAO,CAAC,OAAO,CAA0B;gBAE7B,OAAO,GAAE,aAAkB;IAavC;;OAEG;IACG,OAAO,CAAC,IAAI,EAAE,QAAQ,EAAE,MAAM,CAAC,EAAE,aAAa,GAAG,OAAO,CAAC,UAAU,CAAC;IA4H1E;;OAEG;IACG,kBAAkB,CACtB,IAAI,EAAE,QAAQ,EACd,SAAS,CAAC,EAAE,MAAM,EAClB,MAAM,CAAC,EAAE,aAAa,GACrB,OAAO,CAAC,UAAU,CAAC;IAgCtB;;OAEG;IACG,MAAM,CACV,UAAU,EAAE,eAAe,EAC3B,YAAY,CAAC,EAAE,CAAC,IAAI,EAAE,QAAQ,KAAK,aAAa,GAC/C,OAAO,CAAC,UAAU,EAAE,CAAC;CA6CzB"}
@@ -0,0 +1,211 @@
1
+ /**
2
+ * Skill Evaluation Runner - Claude Agent SDK
3
+ *
4
+ * Runs evaluation tasks against an agent using the Claude Agent SDK.
5
+ * Supports local skill delivery (.claude/skills/) with both Anthropic API
6
+ * and Bedrock (via CLAUDE_CODE_USE_BEDROCK=1 env var).
7
+ *
8
+ * Security: Uses permissionMode 'bypassPermissions' for automated execution,
9
+ * with file writes restricted via canUseTool callback to allowedWriteDirs only.
10
+ */
11
+ import { query } from '@anthropic-ai/claude-agent-sdk';
12
+ import { isAssistantMessage, isResultMessage, isTextBlock, isToolUseBlock, } from '../types.js';
13
+ import { createToolPolicy } from './security.js';
14
+ import { loadConfigSync } from '../config.js';
15
/**
 * Runs evaluation tasks through the Claude Agent SDK `query` call and turns
 * each run into a plain result object (output, metrics, loaded skills, tool
 * calls, error state).
 */
export class SkillEvalRunner {
    options;
    /**
     * @param {object} [options] - Runner options; every field is optional.
     *   Unset fields fall back to `process.cwd()`, `false`, `['project']` or
     *   the loaded config (`defaultAgentModel`, `allowedWriteDirs`).
     */
    constructor(options = {}) {
        const config = loadConfigSync();
        this.options = {
            cwd: options.cwd ?? process.cwd(),
            parallel: options.parallel ?? false,
            model: options.model ?? config.defaultAgentModel,
            settingSources: options.settingSources ?? ['project'],
            countReadAsFallback: options.countReadAsFallback ?? false,
            allowedWriteDirs: options.allowedWriteDirs ?? config.allowedWriteDirs,
        };
    }
    /**
     * Execute a single evaluation task.
     *
     * Never rejects: any error raised while running the task is caught and
     * returned as a result with `isError: true`.
     *
     * @param {object} task - Task to run; `id` and `prompt` are read.
     * @param {object} [logger] - Optional session logger notified of messages, tool uses and errors.
     * @returns {Promise<object>} The task result.
     */
    async runTask(task, logger) {
        const skillLoads = [];
        const toolCalls = [];
        const startTime = Date.now();
        try {
            let resultOutput = '';
            let resultDurationMs = 0;
            let resultNumTurns = 0;
            let resultCostUsd = 0;
            const toolPolicy = createToolPolicy(this.options.allowedWriteDirs, this.options.cwd);
            const q = query({
                prompt: task.prompt,
                options: {
                    cwd: this.options.cwd,
                    model: this.options.model,
                    systemPrompt: { type: 'preset', preset: 'claude_code' },
                    settingSources: this.options.settingSources,
                    allowedTools: [
                        'Read', 'Write', 'Edit',
                        'Glob', 'Grep', 'Bash',
                        'Skill', 'Task',
                    ],
                    permissionMode: 'bypassPermissions',
                    canUseTool: toolPolicy,
                },
            });
            for await (const message of q) {
                // Process assistant messages
                if (isAssistantMessage(message)) {
                    const content = message.message.content;
                    logger?.addAssistantMessage(content);
                    for (const block of content) {
                        if (isTextBlock(block)) {
                            resultOutput += block.text;
                            logger?.addTextMessage(block.text);
                        }
                        if (isToolUseBlock(block)) {
                            const toolName = block.name;
                            const toolInput = block.input;
                            toolCalls.push({
                                tool: toolName,
                                toolUseId: block.id,
                                timestamp: Date.now(),
                                input: toolInput,
                            });
                            logger?.addToolUse(toolName, toolInput);
                            // Detect skill loading via Skill tool
                            if (toolName === 'Skill') {
                                const skillName = toolInput.skill || '';
                                if (skillName) {
                                    skillLoads.push(skillName);
                                }
                            }
                            // Optionally detect via Read calls to SKILL.md
                            if (this.options.countReadAsFallback && toolName === 'Read') {
                                const filePath = toolInput.file_path || '';
                                if (filePath.includes('SKILL.md') || filePath.includes('/skills/')) {
                                    // The path segment right after "skills/" is taken as the skill name.
                                    const match = filePath.match(/skills\/([^/]+)/);
                                    if (match) {
                                        skillLoads.push(match[1]);
                                    }
                                }
                            }
                        }
                    }
                }
                // Capture final metrics from result message
                if (isResultMessage(message)) {
                    resultDurationMs = message.duration_ms ?? 0;
                    resultNumTurns = message.num_turns ?? 0;
                    resultCostUsd = message.total_cost_usd ?? 0;
                    // A non-empty final result replaces the text accumulated from assistant messages.
                    if (message.result) {
                        resultOutput = message.result;
                    }
                }
            }
            return {
                taskId: task.id,
                prompt: task.prompt,
                output: resultOutput,
                // Fall back to wall-clock time when no duration was reported.
                durationMs: resultDurationMs || (Date.now() - startTime),
                numTurns: resultNumTurns,
                costUsd: resultCostUsd,
                skillLoads: [...new Set(skillLoads)],
                toolCalls,
                isError: false,
                errorMessage: '',
            };
        }
        catch (error) {
            const errorMessage = error instanceof Error ? error.message : String(error);
            logger?.markAsError(errorMessage);
            return {
                taskId: task.id,
                prompt: task.prompt,
                output: '',
                durationMs: Date.now() - startTime,
                numTurns: 0,
                costUsd: 0,
                skillLoads: [],
                toolCalls: [],
                isError: true,
                errorMessage,
            };
        }
    }
    /**
     * Execute a task with timeout protection.
     *
     * Races `runTask` against a timer. On timeout the promise resolves (it does
     * not reject) to an error result whose `durationMs` is the timeout value.
     *
     * NOTE(review): the underlying `runTask` call is not cancelled on timeout
     * and keeps running in the background - confirm whether that is acceptable.
     *
     * @param {object} task - Task to run.
     * @param {number} [timeoutMs] - Timeout in milliseconds; defaults to the configured `taskTimeoutMs`.
     * @param {object} [logger] - Optional session logger.
     * @returns {Promise<object>} The task result.
     */
    async runTaskWithTimeout(task, timeoutMs, logger) {
        // The config is only needed when the caller did not pass a timeout.
        const timeout = timeoutMs ?? loadConfigSync().taskTimeoutMs;
        let timer;
        const timeoutPromise = new Promise((_, reject) => {
            timer = setTimeout(() => reject(new Error(`Task ${task.id} timed out after ${timeout}ms`)), timeout);
        });
        try {
            return await Promise.race([this.runTask(task, logger), timeoutPromise]);
        }
        catch (error) {
            const errorMessage = error instanceof Error ? error.message : String(error);
            logger?.markAsError(errorMessage);
            return {
                taskId: task.id,
                prompt: task.prompt,
                output: '',
                durationMs: timeout,
                numTurns: 0,
                costUsd: 0,
                skillLoads: [],
                toolCalls: [],
                isError: true,
                errorMessage,
            };
        }
        finally {
            // Without this, the pending timer keeps the event loop (and so the
            // process) alive for the full timeout after a fast task.
            clearTimeout(timer);
        }
    }
    /**
     * Run all tasks in an evaluation suite.
     *
     * With `options.parallel` all tasks start at once and a failure of one task
     * (including a throwing `createLogger`) yields an error result for that
     * task only. Otherwise tasks run one at a time with progress logged to the
     * console.
     *
     * @param {object} evaluation - Evaluation whose `tasks` are run.
     * @param {Function} [createLogger] - Called once per task to create its session logger.
     * @returns {Promise<object[]>} One result per task, in task order.
     */
    async runAll(evaluation, createLogger) {
        if (this.options.parallel) {
            // The async callback turns a synchronous throw from createLogger
            // into a rejection that allSettled records for that task alone.
            const settled = await Promise.allSettled(evaluation.tasks.map(async (task) => {
                const logger = createLogger?.(task);
                return this.runTaskWithTimeout(task, undefined, logger);
            }));
            return settled.map((outcome, i) => {
                if (outcome.status === 'fulfilled') {
                    return outcome.value;
                }
                const task = evaluation.tasks[i];
                return {
                    taskId: task.id,
                    prompt: task.prompt,
                    output: '',
                    durationMs: 0,
                    numTurns: 0,
                    costUsd: 0,
                    skillLoads: [],
                    toolCalls: [],
                    isError: true,
                    errorMessage: outcome.reason?.message || 'Unknown error',
                };
            });
        }
        const results = [];
        for (const task of evaluation.tasks) {
            console.log(`Running task ${task.id}: ${task.prompt.slice(0, 60)}...`);
            const logger = createLogger?.(task);
            const result = await this.runTaskWithTimeout(task, undefined, logger);
            results.push(result);
            if (result.isError) {
                console.error(` ERROR: ${result.errorMessage}`);
            }
            else {
                console.log(` Skills loaded: ${result.skillLoads.join(', ') || 'none'}`);
                console.log(` Duration: ${(result.durationMs / 1000).toFixed(1)}s | Cost: $${result.costUsd.toFixed(4)}`);
            }
        }
        return results;
    }
}
211
+ //# sourceMappingURL=runner.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"runner.js","sourceRoot":"","sources":["../../../src/runner/runner.ts"],"names":[],"mappings":"AAAA;;;;;;;;;GASG;AAEH,OAAO,EAAE,KAAK,EAAE,MAAM,gCAAgC,CAAC;AAQvD,OAAO,EACL,kBAAkB,EAClB,eAAe,EACf,WAAW,EACX,cAAc,GACf,MAAM,aAAa,CAAC;AACrB,OAAO,EAAE,gBAAgB,EAAE,MAAM,eAAe,CAAC;AACjD,OAAO,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AAG9C,MAAM,OAAO,eAAe;IAClB,OAAO,CAA0B;IAEzC,YAAY,UAAyB,EAAE;QACrC,MAAM,MAAM,GAAG,cAAc,EAAE,CAAC;QAEhC,IAAI,CAAC,OAAO,GAAG;YACb,GAAG,EAAE,OAAO,CAAC,GAAG,IAAI,OAAO,CAAC,GAAG,EAAE;YACjC,QAAQ,EAAE,OAAO,CAAC,QAAQ,IAAI,KAAK;YACnC,KAAK,EAAE,OAAO,CAAC,KAAK,IAAI,MAAM,CAAC,iBAAiB;YAChD,cAAc,EAAE,OAAO,CAAC,cAAc,IAAI,CAAC,SAAS,CAAC;YACrD,mBAAmB,EAAE,OAAO,CAAC,mBAAmB,IAAI,KAAK;YACzD,gBAAgB,EAAE,OAAO,CAAC,gBAAgB,IAAI,MAAM,CAAC,gBAAgB;SACtE,CAAC;IACJ,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,OAAO,CAAC,IAAc,EAAE,MAAsB;QAClD,MAAM,UAAU,GAAa,EAAE,CAAC;QAChC,MAAM,SAAS,GAAqB,EAAE,CAAC;QACvC,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QAE7B,IAAI,CAAC;YACH,IAAI,YAAY,GAAG,EAAE,CAAC;YACtB,IAAI,gBAAgB,GAAG,CAAC,CAAC;YACzB,IAAI,cAAc,GAAG,CAAC,CAAC;YACvB,IAAI,aAAa,GAAG,CAAC,CAAC;YAEtB,MAAM,UAAU,GAAG,gBAAgB,CACjC,IAAI,CAAC,OAAO,CAAC,gBAAgB,EAC7B,IAAI,CAAC,OAAO,CAAC,GAAG,CACjB,CAAC;YAEF,MAAM,CAAC,GAAG,KAAK,CAAC;gBACd,MAAM,EAAE,IAAI,CAAC,MAAM;gBACnB,OAAO,EAAE;oBACP,GAAG,EAAE,IAAI,CAAC,OAAO,CAAC,GAAG;oBACrB,KAAK,EAAE,IAAI,CAAC,OAAO,CAAC,KAAK;oBACzB,YAAY,EAAE,EAAE,IAAI,EAAE,QAAQ,EAAE,MAAM,EAAE,aAAa,EAAE;oBACvD,cAAc,EAAE,IAAI,CAAC,OAAO,CAAC,cAAc;oBAC3C,YAAY,EAAE;wBACZ,MAAM,EAAE,OAAO,EAAE,MAAM;wBACvB,MAAM,EAAE,MAAM,EAAE,MAAM;wBACtB,OAAO,EAAE,MAAM;qBAChB;oBACD,cAAc,EAAE,mBAAmB;oBACnC,UAAU,EAAE,UAAU;iBACvB;aACF,CAAC,CAAC;YAEH,IAAI,KAAK,EAAE,MAAM,OAAO,IAAI,CAAC,EAAE,CAAC;gBAC9B,6BAA6B;gBAC7B,IAAI,kBAAkB,CAAC,OAAO,CAAC,EAAE,CAAC;oBAChC,MAAM,OAAO,GAAG,OAAO,CAAC,OAAO,CAAC,OAAO,CAAC;oBAExC,MAAM,EAAE,mBAAmB,CAAC,OAAoB,CAAC,CAAC;oBAElD,KAAK,MAAM,KAAK,IAAI,OAAO,EAAE,CAAC;wBAC5B,IAAI,WAAW,CAAC,KAAK,CAAC,EAAE,CAAC;4BACvB,YAAY,IAAI,KAAK,CAAC,IAAI,CAAC;4BAC3B,MAAM,EAAE,cAAc,CAAC,KAAK,CAAC,IAAI,CAAC,C
AAC;wBACrC,CAAC;wBAED,IAAI,cAAc,CAAC,KAAK,CAAC,EAAE,CAAC;4BAC1B,MAAM,QAAQ,GAAG,KAAK,CAAC,IAAI,CAAC;4BAC5B,MAAM,SAAS,GAAG,KAAK,CAAC,KAAK,CAAC;4BAE9B,SAAS,CAAC,IAAI,CAAC;gCACb,IAAI,EAAE,QAAQ;gCACd,SAAS,EAAE,KAAK,CAAC,EAAE;gCACnB,SAAS,EAAE,IAAI,CAAC,GAAG,EAAE;gCACrB,KAAK,EAAE,SAAS;6BACjB,CAAC,CAAC;4BAEH,MAAM,EAAE,UAAU,CAAC,QAAQ,EAAE,SAAS,CAAC,CAAC;4BAExC,sCAAsC;4BACtC,IAAI,QAAQ,KAAK,OAAO,EAAE,CAAC;gCACzB,MAAM,SAAS,GAAI,SAAS,CAAC,KAAgB,IAAI,EAAE,CAAC;gCACpD,IAAI,SAAS,EAAE,CAAC;oCACd,UAAU,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;gCAC7B,CAAC;4BACH,CAAC;4BAED,+CAA+C;4BAC/C,IAAI,IAAI,CAAC,OAAO,CAAC,mBAAmB,IAAI,QAAQ,KAAK,MAAM,EAAE,CAAC;gCAC5D,MAAM,QAAQ,GAAI,SAAS,CAAC,SAAoB,IAAI,EAAE,CAAC;gCACvD,IAAI,QAAQ,CAAC,QAAQ,CAAC,UAAU,CAAC,IAAI,QAAQ,CAAC,QAAQ,CAAC,UAAU,CAAC,EAAE,CAAC;oCACnE,MAAM,KAAK,GAAG,QAAQ,CAAC,KAAK,CAAC,iBAAiB,CAAC,CAAC;oCAChD,IAAI,KAAK,EAAE,CAAC;wCACV,UAAU,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;oCAC5B,CAAC;gCACH,CAAC;4BACH,CAAC;wBACH,CAAC;oBACH,CAAC;gBACH,CAAC;gBAED,4CAA4C;gBAC5C,IAAI,eAAe,CAAC,OAAO,CAAC,EAAE,CAAC;oBAC7B,gBAAgB,GAAG,OAAO,CAAC,WAAW,IAAI,CAAC,CAAC;oBAC5C,cAAc,GAAG,OAAO,CAAC,SAAS,IAAI,CAAC,CAAC;oBACxC,aAAa,GAAG,OAAO,CAAC,cAAc,IAAI,CAAC,CAAC;oBAE5C,IAAI,OAAO,CAAC,MAAM,EAAE,CAAC;wBACnB,YAAY,GAAG,OAAO,CAAC,MAAM,CAAC;oBAChC,CAAC;gBACH,CAAC;YACH,CAAC;YAED,OAAO;gBACL,MAAM,EAAE,IAAI,CAAC,EAAE;gBACf,MAAM,EAAE,IAAI,CAAC,MAAM;gBACnB,MAAM,EAAE,YAAY;gBACpB,UAAU,EAAE,gBAAgB,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;gBACxD,QAAQ,EAAE,cAAc;gBACxB,OAAO,EAAE,aAAa;gBACtB,UAAU,EAAE,CAAC,GAAG,IAAI,GAAG,CAAC,UAAU,CAAC,CAAC;gBACpC,SAAS;gBACT,OAAO,EAAE,KAAK;gBACd,YAAY,EAAE,EAAE;aACjB,CAAC;QACJ,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,YAAY,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;YAC5E,MAAM,EAAE,WAAW,CAAC,YAAY,CAAC,CAAC;YAElC,OAAO;gBACL,MAAM,EAAE,IAAI,CAAC,EAAE;gBACf,MAAM,EAAE,IAAI,CAAC,MAAM;gBACnB,MAAM,EAAE,EAAE;gBACV,UAAU,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS;gBAClC,QAAQ,EAAE,CAAC;gBACX,OAAO,EAAE,CAAC;gBACV,UAAU,EAAE,EAAE;gBACd,S
AAS,EAAE,EAAE;gBACb,OAAO,EAAE,IAAI;gBACb,YAAY;aACb,CAAC;QACJ,CAAC;IACH,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,kBAAkB,CACtB,IAAc,EACd,SAAkB,EAClB,MAAsB;QAEtB,MAAM,MAAM,GAAG,cAAc,EAAE,CAAC;QAChC,MAAM,OAAO,GAAG,SAAS,IAAI,MAAM,CAAC,aAAa,CAAC;QAElD,MAAM,cAAc,GAAG,IAAI,OAAO,CAAa,CAAC,CAAC,EAAE,MAAM,EAAE,EAAE;YAC3D,UAAU,CACR,GAAG,EAAE,CAAC,MAAM,CAAC,IAAI,KAAK,CAAC,QAAQ,IAAI,CAAC,EAAE,oBAAoB,OAAO,IAAI,CAAC,CAAC,EACvE,OAAO,CACR,CAAC;QACJ,CAAC,CAAC,CAAC;QAEH,IAAI,CAAC;YACH,OAAO,MAAM,OAAO,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,EAAE,MAAM,CAAC,EAAE,cAAc,CAAC,CAAC,CAAC;QAC1E,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,YAAY,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;YAC5E,MAAM,EAAE,WAAW,CAAC,YAAY,CAAC,CAAC;YAElC,OAAO;gBACL,MAAM,EAAE,IAAI,CAAC,EAAE;gBACf,MAAM,EAAE,IAAI,CAAC,MAAM;gBACnB,MAAM,EAAE,EAAE;gBACV,UAAU,EAAE,OAAO;gBACnB,QAAQ,EAAE,CAAC;gBACX,OAAO,EAAE,CAAC;gBACV,UAAU,EAAE,EAAE;gBACd,SAAS,EAAE,EAAE;gBACb,OAAO,EAAE,IAAI;gBACb,YAAY;aACb,CAAC;QACJ,CAAC;IACH,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,MAAM,CACV,UAA2B,EAC3B,YAAgD;QAEhD,IAAI,IAAI,CAAC,OAAO,CAAC,QAAQ,EAAE,CAAC;YAC1B,MAAM,OAAO,GAAG,MAAM,OAAO,CAAC,UAAU,CACtC,UAAU,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE;gBAC5B,MAAM,MAAM,GAAG,YAAY,EAAE,CAAC,IAAI,CAAC,CAAC;gBACpC,OAAO,IAAI,CAAC,kBAAkB,CAAC,IAAI,EAAE,SAAS,EAAE,MAAM,CAAC,CAAC;YAC1D,CAAC,CAAC,CACH,CAAC;YAEF,OAAO,OAAO,CAAC,GAAG,CAAC,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE;gBAC/B,IAAI,MAAM,CAAC,MAAM,KAAK,WAAW,EAAE,CAAC;oBAClC,OAAO,MAAM,CAAC,KAAK,CAAC;gBACtB,CAAC;gBACD,MAAM,IAAI,GAAG,UAAU,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;gBACjC,OAAO;oBACL,MAAM,EAAE,IAAI,CAAC,EAAE;oBACf,MAAM,EAAE,IAAI,CAAC,MAAM;oBACnB,MAAM,EAAE,EAAE;oBACV,UAAU,EAAE,CAAC;oBACb,QAAQ,EAAE,CAAC;oBACX,OAAO,EAAE,CAAC;oBACV,UAAU,EAAE,EAAE;oBACd,SAAS,EAAE,EAAE;oBACb,OAAO,EAAE,IAAI;oBACb,YAAY,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,IAAI,eAAe;iBACxD,CAAC;YACJ,CAAC,CAAC,CAAC;QACL,CAAC;QAED,MAAM,OAAO,GAAiB,EAAE,CAAC;QACjC,KAAK,MAAM,IAAI,IAAI,UAAU,CAAC,KAAK,EAAE,CAAC;YACpC,OAAO,CAAC,GAAG,CAAC,gBAAgB,IAAI,CAAC,EAAE
,KAAK,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,KAAK,CAAC,CAAC;YACvE,MAAM,MAAM,GAAG,YAAY,EAAE,CAAC,IAAI,CAAC,CAAC;YACpC,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,kBAAkB,CAAC,IAAI,EAAE,SAAS,EAAE,MAAM,CAAC,CAAC;YACtE,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YAErB,IAAI,MAAM,CAAC,OAAO,EAAE,CAAC;gBACnB,OAAO,CAAC,KAAK,CAAC,YAAY,MAAM,CAAC,YAAY,EAAE,CAAC,CAAC;YACnD,CAAC;iBAAM,CAAC;gBACN,OAAO,CAAC,GAAG,CAAC,oBAAoB,MAAM,CAAC,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,MAAM,EAAE,CAAC,CAAC;gBAC1E,OAAO,CAAC,GAAG,CAAC,eAAe,CAAC,MAAM,CAAC,UAAU,GAAG,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,cAAc,MAAM,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;YAC7G,CAAC;QACH,CAAC;QACD,OAAO,OAAO,CAAC;IACjB,CAAC;CACF"}
@@ -0,0 +1,26 @@
1
+ /**
2
+ * Security policies for the evaluation runner.
3
+ *
4
+ * Restricts file writes to allowed directories via the Agent SDK's
5
+ * canUseTool callback.
6
+ */
7
+ /**
8
+ * Create a canUseTool callback that restricts Write/Edit to allowed directories.
9
+ *
10
+ * Matches the Agent SDK's CanUseTool signature:
11
+ * (toolName, input, options) => Promise<PermissionResult>
12
+ */
13
+ export declare function createToolPolicy(allowedWriteDirs: string[], cwd: string): (toolName: string, input: Record<string, unknown>, _options: {
14
+ signal: AbortSignal;
15
+ toolUseID: string;
16
+ [key: string]: unknown;
17
+ }) => Promise<{
18
+ behavior: "allow";
19
+ updatedInput: Record<string, unknown>;
20
+ message?: undefined;
21
+ } | {
22
+ behavior: "deny";
23
+ message: string;
24
+ updatedInput?: undefined;
25
+ }>;
26
+ //# sourceMappingURL=security.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"security.d.ts","sourceRoot":"","sources":["../../../src/runner/security.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAIH;;;;;GAKG;AACH,wBAAgB,gBAAgB,CAC9B,gBAAgB,EAAE,MAAM,EAAE,EAC1B,GAAG,EAAE,MAAM,IAOT,UAAU,MAAM,EAChB,OAAO,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,EAC9B,UAAU;IAAE,MAAM,EAAE,WAAW,CAAC;IAAC,SAAS,EAAE,MAAM,CAAC;IAAC,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAA;CAAE;;;;;;;;GAwB/E"}
@@ -0,0 +1,34 @@
1
+ /**
2
+ * Security policies for the evaluation runner.
3
+ *
4
+ * Restricts file writes to allowed directories via the Agent SDK's
5
+ * canUseTool callback.
6
+ */
7
+ import * as path from 'path';
8
/**
 * Create a canUseTool callback that restricts Write/Edit to allowed directories.
 *
 * Matches the Agent SDK's CanUseTool signature:
 *   (toolName, input, options) => Promise<PermissionResult>
 *
 * Containment is decided on whole path segments (via `path.relative`), not on a
 * raw string prefix, so a sibling such as `/work/out-evil` is NOT considered to
 * be inside `/work/out`. The check is purely lexical: symlinks are not resolved.
 *
 * @param {string[]} allowedWriteDirs - Directories (absolute, or relative to `cwd`)
 *   in which Write/Edit are permitted.
 * @param {string} cwd - Base directory used to resolve relative paths.
 * @returns {(toolName: string, input: Record<string, unknown>, _options: object) => Promise<object>}
 *   Async callback resolving to `{ behavior: 'allow', updatedInput }` or
 *   `{ behavior: 'deny', message }`.
 */
export function createToolPolicy(allowedWriteDirs, cwd) {
    const resolvedDirs = allowedWriteDirs.map((dir) => path.resolve(cwd, dir));

    // True when `target` is `dir` itself or lies somewhere beneath it.
    const isWithin = (dir, target) => {
        const rel = path.relative(dir, target);
        if (rel === '') {
            // Same path; the previous prefix check accepted this case as well.
            return true;
        }
        // `..` or `../x` climbs out of dir. A name like `..hidden` does not, so
        // compare against the full segment. An absolute result means there is
        // no relative route at all (e.g. a different drive on Windows).
        const escapes = rel === '..' || rel.startsWith(`..${path.sep}`);
        return !escapes && !path.isAbsolute(rel);
    };

    return async (toolName, input, _options) => {
        // Allow all non-write tools
        if (!['Write', 'Edit'].includes(toolName)) {
            return { behavior: 'allow', updatedInput: input };
        }

        // A write without a usable path cannot be validated; deny it rather than
        // letting it silently resolve to cwd (or throw inside path.resolve).
        const filePath = input.file_path;
        if (typeof filePath !== 'string' || filePath === '') {
            return {
                behavior: 'deny',
                message: `Write denied: ${toolName} did not provide a file_path`,
            };
        }

        // Check if file path is in allowed directories
        const resolvedPath = path.resolve(cwd, filePath);
        if (resolvedDirs.some((dir) => isWithin(dir, resolvedPath))) {
            return { behavior: 'allow', updatedInput: input };
        }

        return {
            behavior: 'deny',
            message: `Write denied: ${filePath} is outside allowed directories: ${allowedWriteDirs.join(', ')}`,
        };
    };
}
34
+ //# sourceMappingURL=security.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"security.js","sourceRoot":"","sources":["../../../src/runner/security.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,IAAI,MAAM,MAAM,CAAC;AAE7B;;;;;GAKG;AACH,MAAM,UAAU,gBAAgB,CAC9B,gBAA0B,EAC1B,GAAW;IAEX,MAAM,YAAY,GAAG,gBAAgB,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE,CAChD,IAAI,CAAC,OAAO,CAAC,GAAG,EAAE,GAAG,CAAC,CACvB,CAAC;IAEF,OAAO,KAAK,EACV,QAAgB,EAChB,KAA8B,EAC9B,QAA4E,EAC5E,EAAE;QACF,4BAA4B;QAC5B,IAAI,CAAC,CAAC,OAAO,EAAE,MAAM,CAAC,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAAE,CAAC;YAC1C,OAAO,EAAE,QAAQ,EAAE,OAAgB,EAAE,YAAY,EAAE,KAAK,EAAE,CAAC;QAC7D,CAAC;QAED,+CAA+C;QAC/C,MAAM,QAAQ,GAAI,KAAK,CAAC,SAAoB,IAAI,EAAE,CAAC;QACnD,MAAM,YAAY,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,EAAE,QAAQ,CAAC,CAAC;QAEjD,MAAM,SAAS,GAAG,YAAY,CAAC,IAAI,CAAC,CAAC,GAAG,EAAE,EAAE,CAC1C,YAAY,CAAC,UAAU,CAAC,GAAG,CAAC,CAC7B,CAAC;QAEF,IAAI,SAAS,EAAE,CAAC;YACd,OAAO,EAAE,QAAQ,EAAE,OAAgB,EAAE,YAAY,EAAE,KAAK,EAAE,CAAC;QAC7D,CAAC;QAED,OAAO;YACL,QAAQ,EAAE,MAAe;YACzB,OAAO,EAAE,iBAAiB,QAAQ,oCAAoC,gBAAgB,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE;SACpG,CAAC;IACJ,CAAC,CAAC;AACJ,CAAC"}
@@ -0,0 +1,19 @@
1
+ /**
2
+ * Local skill setup and cleanup for evaluation runs.
3
+ *
4
+ * Copies skill directories into .claude/skills/ within the working directory
5
+ * so the Agent SDK can discover them via settingSources: ['project'].
6
+ */
7
+ /**
8
+ * Copy skills from a source directory to .claude/skills/ in the working directory.
9
+ *
10
+ * @param skillsSourceDir - Directory containing skill folders (each with SKILL.md)
11
+ * @param cwd - Working directory where .claude/skills/ should be created
12
+ * @returns List of skill names that were set up
13
+ */
14
+ export declare function setupLocalSkills(skillsSourceDir: string, cwd: string): Promise<string[]>;
15
+ /**
16
+ * Remove .claude/skills/ from the working directory.
17
+ */
18
+ export declare function cleanupLocalSkills(cwd: string): Promise<void>;
19
+ //# sourceMappingURL=skill-setup.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"skill-setup.d.ts","sourceRoot":"","sources":["../../../src/runner/skill-setup.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAKH;;;;;;GAMG;AACH,wBAAsB,gBAAgB,CACpC,eAAe,EAAE,MAAM,EACvB,GAAG,EAAE,MAAM,GACV,OAAO,CAAC,MAAM,EAAE,CAAC,CAkCnB;AAED;;GAEG;AACH,wBAAsB,kBAAkB,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC,CAOnE"}
@@ -0,0 +1,72 @@
1
+ /**
2
+ * Local skill setup and cleanup for evaluation runs.
3
+ *
4
+ * Copies skill directories into .claude/skills/ within the working directory
5
+ * so the Agent SDK can discover them via settingSources: ['project'].
6
+ */
7
+ import * as fs from 'fs/promises';
8
+ import * as path from 'path';
9
/**
 * Copy skills from a source directory to .claude/skills/ in the working directory.
 *
 * Each sub-directory of `skillsSourceDir` is copied recursively as one skill.
 * A `SKILL.md` sitting directly in `skillsSourceDir` is treated as a single
 * skill named after the source directory itself.
 *
 * @param skillsSourceDir - Directory containing skill folders (each with SKILL.md)
 * @param cwd - Working directory where .claude/skills/ should be created
 * @returns List of skill names that were set up
 * @throws {Error} When the source cannot be read or copied; the underlying
 *   error is attached as `cause`.
 */
export async function setupLocalSkills(skillsSourceDir, cwd) {
    const targetDir = path.join(cwd, '.claude', 'skills');
    await fs.mkdir(targetDir, { recursive: true });
    const skillNames = [];
    try {
        const entries = await fs.readdir(skillsSourceDir, { withFileTypes: true });
        for (const entry of entries) {
            if (entry.isDirectory()) {
                const srcSkillDir = path.join(skillsSourceDir, entry.name);
                const destSkillDir = path.join(targetDir, entry.name);
                await copyDir(srcSkillDir, destSkillDir);
                skillNames.push(entry.name);
            }
            else if (entry.name === 'SKILL.md') {
                // Single skill file at root level. Resolve first so that a source
                // of '.' (or one ending in '/..') yields a real directory name
                // instead of '.', which would drop SKILL.md into the skills root.
                const skillName = path.basename(path.resolve(skillsSourceDir));
                const skillDir = path.join(targetDir, skillName);
                await fs.mkdir(skillDir, { recursive: true });
                await fs.copyFile(path.join(skillsSourceDir, entry.name), path.join(skillDir, 'SKILL.md'));
                skillNames.push(skillName);
            }
        }
    }
    catch (err) {
        const reason = err instanceof Error ? err.message : String(err);
        // Keep the original error reachable for callers that need code/stack.
        throw new Error(`Failed to setup local skills from ${skillsSourceDir}: ${reason}`, { cause: err });
    }
    return skillNames;
}
43
/**
 * Delete the .claude/skills/ directory under the given working directory.
 *
 * Best-effort: a missing directory is not an error (`force: true`), and any
 * other failure during removal is deliberately ignored.
 *
 * @param cwd - Working directory that contains .claude/skills/
 * @returns Promise that resolves once the removal attempt has finished
 */
export async function cleanupLocalSkills(cwd) {
    const target = path.join(cwd, '.claude', 'skills');
    const removeOptions = { recursive: true, force: true };
    await fs.rm(target, removeOptions).catch(() => {
        // Ignore cleanup errors
    });
}
55
/**
 * Copy the directory tree at `src` into `dest`, one entry at a time.
 *
 * `dest` (and any missing parents) is created first. Entries that report as
 * directories are descended into; every other entry is copied with copyFile.
 *
 * @param src - Directory to copy from
 * @param dest - Directory to copy into
 */
async function copyDir(src, dest) {
    await fs.mkdir(dest, { recursive: true });
    for (const child of await fs.readdir(src, { withFileTypes: true })) {
        const from = path.join(src, child.name);
        const to = path.join(dest, child.name);
        await (child.isDirectory() ? copyDir(from, to) : fs.copyFile(from, to));
    }
}
72
+ //# sourceMappingURL=skill-setup.js.map