@skilljack/evals 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +327 -0
- package/action/action.yml +72 -0
- package/action/index.ts +78 -0
- package/dist/action/index.d.ts +8 -0
- package/dist/action/index.d.ts.map +1 -0
- package/dist/action/index.js +68 -0
- package/dist/action/index.js.map +1 -0
- package/dist/src/cli.d.ts +9 -0
- package/dist/src/cli.d.ts.map +1 -0
- package/dist/src/cli.js +264 -0
- package/dist/src/cli.js.map +1 -0
- package/dist/src/config.d.ts +52 -0
- package/dist/src/config.d.ts.map +1 -0
- package/dist/src/config.js +194 -0
- package/dist/src/config.js.map +1 -0
- package/dist/src/index.d.ts +24 -0
- package/dist/src/index.d.ts.map +1 -0
- package/dist/src/index.js +28 -0
- package/dist/src/index.js.map +1 -0
- package/dist/src/parser.d.ts +22 -0
- package/dist/src/parser.d.ts.map +1 -0
- package/dist/src/parser.js +205 -0
- package/dist/src/parser.js.map +1 -0
- package/dist/src/pipeline.d.ts +53 -0
- package/dist/src/pipeline.d.ts.map +1 -0
- package/dist/src/pipeline.js +185 -0
- package/dist/src/pipeline.js.map +1 -0
- package/dist/src/report/github-summary.d.ts +15 -0
- package/dist/src/report/github-summary.d.ts.map +1 -0
- package/dist/src/report/github-summary.js +77 -0
- package/dist/src/report/github-summary.js.map +1 -0
- package/dist/src/report/report.d.ts +23 -0
- package/dist/src/report/report.d.ts.map +1 -0
- package/dist/src/report/report.js +216 -0
- package/dist/src/report/report.js.map +1 -0
- package/dist/src/runner/runner.d.ts +29 -0
- package/dist/src/runner/runner.d.ts.map +1 -0
- package/dist/src/runner/runner.js +211 -0
- package/dist/src/runner/runner.js.map +1 -0
- package/dist/src/runner/security.d.ts +26 -0
- package/dist/src/runner/security.d.ts.map +1 -0
- package/dist/src/runner/security.js +34 -0
- package/dist/src/runner/security.js.map +1 -0
- package/dist/src/runner/skill-setup.d.ts +19 -0
- package/dist/src/runner/skill-setup.d.ts.map +1 -0
- package/dist/src/runner/skill-setup.js +72 -0
- package/dist/src/runner/skill-setup.js.map +1 -0
- package/dist/src/scorer/deterministic.d.ts +12 -0
- package/dist/src/scorer/deterministic.d.ts.map +1 -0
- package/dist/src/scorer/deterministic.js +149 -0
- package/dist/src/scorer/deterministic.js.map +1 -0
- package/dist/src/scorer/judge.d.ts +34 -0
- package/dist/src/scorer/judge.d.ts.map +1 -0
- package/dist/src/scorer/judge.js +226 -0
- package/dist/src/scorer/judge.js.map +1 -0
- package/dist/src/scorer/scorer.d.ts +25 -0
- package/dist/src/scorer/scorer.d.ts.map +1 -0
- package/dist/src/scorer/scorer.js +149 -0
- package/dist/src/scorer/scorer.js.map +1 -0
- package/dist/src/session/session-logger.d.ts +30 -0
- package/dist/src/session/session-logger.d.ts.map +1 -0
- package/dist/src/session/session-logger.js +157 -0
- package/dist/src/session/session-logger.js.map +1 -0
- package/dist/src/types.d.ts +227 -0
- package/dist/src/types.d.ts.map +1 -0
- package/dist/src/types.js +16 -0
- package/dist/src/types.js.map +1 -0
- package/package.json +44 -0
package/dist/src/cli.js
ADDED
|
@@ -0,0 +1,264 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* CLI for skill evaluation framework.
|
|
4
|
+
*
|
|
5
|
+
* Primary command: `skilljack-evals run` — runs the full evaluation pipeline.
|
|
6
|
+
* Also supports: score, report, create-eval, parse, validate, judge.
|
|
7
|
+
*/
|
|
8
|
+
import 'dotenv/config';
|
|
9
|
+
import { Command } from 'commander';
|
|
10
|
+
import * as fs from 'fs/promises';
|
|
11
|
+
import * as path from 'path';
|
|
12
|
+
import { parseEvalFile, createEvalTemplate, validateEvalFile } from './parser.js';
|
|
13
|
+
import { runPipeline, scorePipeline } from './pipeline.js';
|
|
14
|
+
import { generateReport, generateJsonResults } from './report/report.js';
|
|
15
|
+
import { SkillJudge } from './scorer/judge.js';
|
|
16
|
+
// Root command object: carries the CLI name, the help description and the
// version string. Every subcommand in this file is registered on it.
const program = new Command()
    .name('skilljack-evals')
    .description('Skill evaluation CLI — run evaluations, score results, generate reports')
    .version('1.0.0');
|
|
21
|
+
// ============================================
|
|
22
|
+
// Primary command: run
|
|
23
|
+
// ============================================
|
|
24
|
+
/**
 * `run` — execute the tasks in a tasks YAML file, score them and report.
 *
 * CLI flags that were supplied are copied into `configOverrides`; the rest of
 * the options are forwarded to `runPipeline` unchanged. The process exits with
 * code 1 when the pipeline result is not `passed` or when anything throws.
 *
 * Numeric flags (`--timeout`, `--threshold-discovery`, `--threshold-score`)
 * are rejected with an error when they do not parse to a number, instead of
 * silently handing `NaN` to the pipeline.
 */
program
    .command('run')
    .description('Run the full evaluation pipeline: execute tasks → score → report')
    .argument('<tasks>', 'Path to tasks YAML file')
    .option('--model <model>', 'Agent model (default: sonnet)')
    .option('--judge-model <model>', 'Judge model (default: haiku)')
    .option('--config <path>', 'Path to eval.config.yaml')
    .option('--output-dir <dir>', 'Output directory for results')
    .option('--timeout <ms>', 'Per-task timeout in milliseconds')
    .option('--tasks <ids>', 'Comma-separated task IDs to run')
    .option('--skills-dir <path>', 'Path to skills directory for local setup')
    .option('--cwd <path>', 'Working directory for agent execution')
    .option('--threshold-discovery <rate>', 'Min discovery rate (0-1)')
    .option('--threshold-score <score>', 'Min avg score (1-5)')
    .option('--no-judge', 'Skip LLM judge scoring (deterministic only)')
    .option('--no-deterministic', 'Skip deterministic scoring (LLM judge only)')
    .option('--github-summary', 'Write GitHub Actions step summary')
    .option('--verbose', 'Enable verbose output')
    .action(async (tasksFile, options) => {
    // Parse a numeric flag; throw (caught below) when the text is not a number.
    const parseNumericFlag = (flag, raw, parse) => {
        const value = parse(raw);
        if (Number.isNaN(value)) {
            throw new Error(`Invalid value for ${flag}: '${raw}' is not a number`);
        }
        return value;
    };
    try {
        const configOverrides = {};
        if (options.model)
            configOverrides.defaultAgentModel = options.model;
        if (options.judgeModel)
            configOverrides.defaultJudgeModel = options.judgeModel;
        if (options.outputDir)
            configOverrides.outputDir = options.outputDir;
        if (options.timeout)
            configOverrides.taskTimeoutMs = parseNumericFlag('--timeout', options.timeout, (raw) => parseInt(raw, 10));
        if (options.thresholdDiscovery)
            configOverrides.discoveryThreshold = parseNumericFlag('--threshold-discovery', options.thresholdDiscovery, parseFloat);
        if (options.thresholdScore)
            configOverrides.scoreThreshold = parseNumericFlag('--threshold-score', options.thresholdScore, parseFloat);
        if (options.githubSummary)
            configOverrides.githubSummary = true;
        const result = await runPipeline({
            tasksFile,
            configPath: options.config,
            configOverrides,
            cwd: options.cwd,
            skillsDir: options.skillsDir,
            taskFilter: options.tasks,
            // `--no-judge` / `--no-deterministic` are negated flags: the
            // handler sees them as `judge` / `deterministic` set to false.
            noJudge: options.judge === false,
            noDeterministic: options.deterministic === false,
            verbose: options.verbose,
        });
        if (!result.passed) {
            process.exit(1);
        }
    }
    catch (error) {
        console.error(`Error: ${error instanceof Error ? error.message : String(error)}`);
        process.exit(1);
    }
});
|
|
79
|
+
// ============================================
|
|
80
|
+
// Score existing results
|
|
81
|
+
// ============================================
|
|
82
|
+
/**
 * `score` — score an existing results JSON file without running the agent.
 *
 * Forwards the config path, an optional judge-model override and the two
 * "skip" switches to `scorePipeline`. Exits with code 1 when the outcome is
 * not `passed` or when an error is thrown.
 */
program
    .command('score')
    .description('Score existing results JSON (no runner)')
    .argument('<results>', 'Path to results JSON file')
    .option('--judge-model <model>', 'Judge model')
    .option('--config <path>', 'Path to eval.config.yaml')
    .option('--no-judge', 'Skip LLM judge')
    .option('--no-deterministic', 'Skip deterministic checks')
    .action(async (resultsFile, options) => {
    const { judgeModel, config: configPath, judge, deterministic } = options;
    try {
        const outcome = await scorePipeline(resultsFile, {
            configPath,
            configOverrides: judgeModel ? { defaultJudgeModel: judgeModel } : {},
            noJudge: judge === false,
            noDeterministic: deterministic === false,
        });
        if (outcome.passed) {
            return;
        }
    }
    catch (error) {
        const reason = error instanceof Error ? error.message : String(error);
        console.error(`Error: ${reason}`);
    }
    // Reached only on a failed outcome or after an error was reported.
    process.exit(1);
});
|
|
110
|
+
// ============================================
|
|
111
|
+
// Report generation
|
|
112
|
+
// ============================================
|
|
113
|
+
/**
 * `report` — regenerate a markdown (and optionally JSON) report from a
 * results JSON file.
 *
 * The results file must contain a top-level `tasks` array whose entries carry
 * `task`, `result` and `score`; this is checked up front so a malformed file
 * produces a clear message. When no `--output` path is given the report is
 * printed to stdout. Any failure is reported on stderr and exits with code 1.
 */
program
    .command('report')
    .description('Generate report from results JSON')
    .requiredOption('-r, --results <path>', 'Path to results JSON file')
    .option('-o, --output <path>', 'Output markdown file')
    .option('--json <path>', 'Also output JSON report')
    .action(async (options) => {
    try {
        const resultsData = await fs.readFile(options.results, 'utf-8');
        const data = JSON.parse(resultsData);
        // Validate the shape before dereferencing it below.
        if (!data || !Array.isArray(data.tasks)) {
            throw new Error(`Invalid results file ${options.results}: expected a top-level "tasks" array`);
        }
        const evaluation = {
            skillName: data.skillName,
            tasks: data.tasks.map((t) => t.task),
        };
        const results = data.tasks.map((t) => t.result);
        const scores = data.tasks.map((t) => t.score);
        const report = await generateReport(evaluation, results, scores, options.output, data.metadata);
        if (!options.output) {
            console.log(report);
        }
        if (options.json) {
            await generateJsonResults(evaluation, results, scores, options.json, data.metadata);
        }
    }
    catch (error) {
        console.error(`Error: ${error instanceof Error ? error.message : String(error)}`);
        process.exit(1);
    }
});
|
|
142
|
+
// ============================================
|
|
143
|
+
// Create eval template
|
|
144
|
+
// ============================================
|
|
145
|
+
/**
 * `create-eval` — write a placeholder tasks YAML file for a skill.
 *
 * The template goes to `--output`, or to `evals/<skill_name>/tasks.yaml`
 * under the current working directory when no output path is given. Parent
 * directories are created as needed.
 *
 * `--num-tasks` must parse to a non-negative integer. Any failure is reported
 * on stderr and exits with code 1, matching the other commands.
 */
program
    .command('create-eval')
    .description('Create an evaluation template for a skill')
    .argument('<skill_name>', 'Name of the skill')
    .option('-o, --output <path>', 'Output path for template')
    .option('-n, --num-tasks <number>', 'Number of placeholder tasks', '10')
    .action(async (skillName, options) => {
    try {
        const outputPath = options.output || path.join(process.cwd(), 'evals', skillName, 'tasks.yaml');
        const numTasks = parseInt(options.numTasks, 10);
        if (Number.isNaN(numTasks) || numTasks < 0) {
            throw new Error(`Invalid value for --num-tasks: '${options.numTasks}' is not a non-negative integer`);
        }
        await fs.mkdir(path.dirname(outputPath), { recursive: true });
        const template = createEvalTemplate(skillName, numTasks);
        await fs.writeFile(outputPath, template);
        console.log(`Created evaluation template: ${outputPath}`);
        console.log();
        console.log('Next steps:');
        console.log(`1. Edit ${outputPath} to add real evaluation tasks`);
        console.log(`2. Run: skilljack-evals run ${outputPath}`);
    }
    catch (error) {
        console.error(`Error: ${error instanceof Error ? error.message : String(error)}`);
        process.exit(1);
    }
});
|
|
163
|
+
// ============================================
|
|
164
|
+
// Parse (legacy, for interop)
|
|
165
|
+
// ============================================
|
|
166
|
+
/**
 * `parse` — parse a tasks YAML file and emit the result as JSON.
 *
 * The file is `--eval-file`, or `evals/<skill_name>/tasks.yaml` under the
 * current working directory. JSON goes to `--output` when given, otherwise to
 * stdout; the status line goes to stderr so stdout stays pure JSON.
 *
 * A missing file, a parse failure or a write failure is reported on stderr
 * and exits with code 1.
 */
program
    .command('parse')
    .description('Parse tasks YAML and output JSON')
    .argument('<skill_name>', 'Name of the skill')
    .option('-f, --eval-file <path>', 'Path to evaluation YAML file')
    .option('-o, --output <path>', 'Output JSON file (default: stdout)')
    .action(async (skillName, options) => {
    const baseDir = path.join(process.cwd(), 'evals', skillName);
    const evalFile = options.evalFile || path.join(baseDir, 'tasks.yaml');
    // Checked separately so a missing file gets its own, more specific message.
    try {
        await fs.access(evalFile);
    }
    catch {
        console.error(`Error: Evaluation file not found: ${evalFile}`);
        process.exit(1);
    }
    try {
        const evaluation = await parseEvalFile(evalFile);
        const json = JSON.stringify(evaluation, null, 2);
        if (options.output) {
            await fs.writeFile(options.output, json);
            console.error(`Parsed ${evaluation.tasks.length} tasks to: ${options.output}`);
        }
        else {
            console.log(json);
        }
    }
    catch (error) {
        console.error(`Error: ${error instanceof Error ? error.message : String(error)}`);
        process.exit(1);
    }
});
|
|
192
|
+
// ============================================
|
|
193
|
+
// Validate
|
|
194
|
+
// ============================================
|
|
195
|
+
/**
 * `validate` — check a tasks YAML file and report any validation errors.
 *
 * With no errors, the file is parsed and a one-line summary (task count and
 * skill name) is printed. Otherwise each error is listed on stderr and the
 * process exits with code 1. An exception thrown while validating or parsing
 * is also reported on stderr and exits with code 1.
 */
program
    .command('validate')
    .description('Validate a tasks YAML file')
    .argument('<file>', 'Path to tasks YAML file')
    .action(async (file) => {
    try {
        const errors = await validateEvalFile(file);
        if (errors.length === 0) {
            const evaluation = await parseEvalFile(file);
            console.log(`Valid: ${evaluation.tasks.length} task(s) for skill '${evaluation.skillName}'`);
        }
        else {
            console.error(`Validation errors in ${file}:`);
            for (const error of errors) {
                console.error(`  - ${error}`);
            }
            process.exit(1);
        }
    }
    catch (error) {
        console.error(`Error: ${error instanceof Error ? error.message : String(error)}`);
        process.exit(1);
    }
});
|
|
213
|
+
// ============================================
|
|
214
|
+
// Judge (legacy single-task scoring)
|
|
215
|
+
// ============================================
|
|
216
|
+
/**
 * `judge` — score one task result with the LLM judge.
 *
 * Builds a task (with the fixed discovery / adherence / output criteria) and
 * a result record from the command-line options, passes both to
 * `SkillJudge.judgeResult`, and emits the score as JSON to `--output-file`
 * or stdout. Status lines go to stderr so stdout stays pure JSON.
 *
 * `--skill-loads` and `--checklist` are comma-separated; entries are trimmed
 * and empty entries (e.g. from a trailing comma) are dropped. Any failure is
 * reported on stderr and exits with code 1.
 */
program
    .command('judge')
    .description('Score a single task result using LLM judge')
    .requiredOption('--task-id <id>', 'Task ID')
    .requiredOption('--prompt <text>', 'Original task prompt')
    .requiredOption('--expected-skill <name>', 'Expected skill to be loaded')
    .requiredOption('--output <text>', 'Agent output to judge')
    .option('--skill-loads <skills>', 'Comma-separated list of skills loaded', '')
    .option('--checklist <items>', 'Comma-separated golden checklist items', '')
    .option('--model <model>', 'Judge model (default: haiku)')
    .option('-o, --output-file <path>', 'Output JSON file (default: stdout)')
    .action(async (options) => {
    // Split a comma-separated flag value into trimmed, non-empty entries.
    const splitList = (value) => value
        ? value.split(',').map((item) => item.trim()).filter((item) => item.length > 0)
        : [];
    try {
        const task = {
            id: options.taskId,
            prompt: options.prompt,
            expectedSkillLoad: options.expectedSkill,
            criteria: [
                { dimension: 'discovery', weight: 0.3, description: 'Skill discovery' },
                { dimension: 'adherence', weight: 0.4, description: 'Instruction adherence' },
                { dimension: 'output', weight: 0.3, description: 'Output quality' },
            ],
            goldenChecklist: splitList(options.checklist),
        };
        // No run took place here, so the run metrics are zeroed placeholders.
        const result = {
            taskId: options.taskId,
            prompt: options.prompt,
            output: options.output,
            durationMs: 0,
            numTurns: 0,
            costUsd: 0,
            skillLoads: splitList(options.skillLoads),
            toolCalls: [],
            isError: false,
            errorMessage: '',
        };
        const judge = new SkillJudge({ model: options.model });
        console.error(`Judging task ${options.taskId}...`);
        const score = await judge.judgeResult(task, result);
        const json = JSON.stringify(score, null, 2);
        if (options.outputFile) {
            await fs.writeFile(options.outputFile, json);
            console.error(`Score saved to: ${options.outputFile}`);
        }
        else {
            console.log(json);
        }
    }
    catch (error) {
        console.error(`Error: ${error instanceof Error ? error.message : String(error)}`);
        process.exit(1);
    }
});
|
|
263
|
+
// Every action handler in this file is async, so the program must be parsed
// with parseAsync(): parse() does not wait for the handlers, and a rejection
// from one of them would surface as an unhandled promise rejection.
program.parseAsync().catch((error) => {
    console.error(`Error: ${error instanceof Error ? error.message : String(error)}`);
    process.exit(1);
});
|
|
264
|
+
//# sourceMappingURL=cli.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cli.js","sourceRoot":"","sources":["../../src/cli.ts"],"names":[],"mappings":";AAEA;;;;;GAKG;AAEH,OAAO,eAAe,CAAC;AACvB,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AACpC,OAAO,KAAK,EAAE,MAAM,aAAa,CAAC;AAClC,OAAO,KAAK,IAAI,MAAM,MAAM,CAAC;AAC7B,OAAO,EAAE,aAAa,EAAE,kBAAkB,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC;AAClF,OAAO,EAAE,WAAW,EAAE,aAAa,EAAE,MAAM,eAAe,CAAC;AAC3D,OAAO,EAAE,cAAc,EAAE,mBAAmB,EAAE,MAAM,oBAAoB,CAAC;AACzE,OAAO,EAAE,UAAU,EAAE,MAAM,mBAAmB,CAAC;AAI/C,MAAM,OAAO,GAAG,IAAI,OAAO,EAAE,CAAC;AAE9B,OAAO;KACJ,IAAI,CAAC,iBAAiB,CAAC;KACvB,WAAW,CAAC,yEAAyE,CAAC;KACtF,OAAO,CAAC,OAAO,CAAC,CAAC;AAEpB,+CAA+C;AAC/C,uBAAuB;AACvB,+CAA+C;AAE/C,OAAO;KACJ,OAAO,CAAC,KAAK,CAAC;KACd,WAAW,CAAC,kEAAkE,CAAC;KAC/E,QAAQ,CAAC,SAAS,EAAE,yBAAyB,CAAC;KAC9C,MAAM,CAAC,iBAAiB,EAAE,+BAA+B,CAAC;KAC1D,MAAM,CAAC,uBAAuB,EAAE,8BAA8B,CAAC;KAC/D,MAAM,CAAC,iBAAiB,EAAE,0BAA0B,CAAC;KACrD,MAAM,CAAC,oBAAoB,EAAE,8BAA8B,CAAC;KAC5D,MAAM,CAAC,gBAAgB,EAAE,kCAAkC,CAAC;KAC5D,MAAM,CAAC,eAAe,EAAE,iCAAiC,CAAC;KAC1D,MAAM,CAAC,qBAAqB,EAAE,0CAA0C,CAAC;KACzE,MAAM,CAAC,cAAc,EAAE,uCAAuC,CAAC;KAC/D,MAAM,CAAC,8BAA8B,EAAE,0BAA0B,CAAC;KAClE,MAAM,CAAC,2BAA2B,EAAE,qBAAqB,CAAC;KAC1D,MAAM,CAAC,YAAY,EAAE,6CAA6C,CAAC;KACnE,MAAM,CAAC,oBAAoB,EAAE,6CAA6C,CAAC;KAC3E,MAAM,CAAC,kBAAkB,EAAE,mCAAmC,CAAC;KAC/D,MAAM,CAAC,WAAW,EAAE,uBAAuB,CAAC;KAC5C,MAAM,CAAC,KAAK,EAAE,SAAiB,EAAE,OAejC,EAAE,EAAE;IACH,IAAI,CAAC;QACH,MAAM,eAAe,GAAwB,EAAE,CAAC;QAChD,IAAI,OAAO,CAAC,KAAK;YAAE,eAAe,CAAC,iBAAiB,GAAG,OAAO,CAAC,KAAK,CAAC;QACrE,IAAI,OAAO,CAAC,UAAU;YAAE,eAAe,CAAC,iBAAiB,GAAG,OAAO,CAAC,UAAU,CAAC;QAC/E,IAAI,OAAO,CAAC,SAAS;YAAE,eAAe,CAAC,SAAS,GAAG,OAAO,CAAC,SAAS,CAAC;QACrE,IAAI,OAAO,CAAC,OAAO;YAAE,eAAe,CAAC,aAAa,GAAG,QAAQ,CAAC,OAAO,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC;QACnF,IAAI,OAAO,CAAC,kBAAkB;YAAE,eAAe,CAAC,kBAAkB,GAAG,UAAU,CAAC,OAAO,CAAC,kBAAkB,CAAC,CAAC;QAC5G,IAAI,OAAO,CAAC,cAAc;YAAE,eAAe,CAAC,cAAc,GAAG,UAAU,CAAC,OAAO,CAAC,cAAc,CAAC,CAAC;QAChG,IAAI,OAAO,CAAC,aAAa;YAAE,eAAe,CAAC,aAAa,GAAG,IAAI,CAAC;QAEhE,MAAM,MAAM,GAAG,MAAM,WAAW,CAAC;YAC/B,SAAS;YACT,UAAU,E
AAE,OAAO,CAAC,MAAM;YAC1B,eAAe;YACf,GAAG,EAAE,OAAO,CAAC,GAAG;YAChB,SAAS,EAAE,OAAO,CAAC,SAAS;YAC5B,UAAU,EAAE,OAAO,CAAC,KAAK;YACzB,OAAO,EAAE,OAAO,CAAC,KAAK,KAAK,KAAK;YAChC,eAAe,EAAE,OAAO,CAAC,aAAa,KAAK,KAAK;YAChD,OAAO,EAAE,OAAO,CAAC,OAAO;SACzB,CAAC,CAAC;QAEH,IAAI,CAAC,MAAM,CAAC,MAAM,EAAE,CAAC;YACnB,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAClB,CAAC;IACH,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,OAAO,CAAC,KAAK,CAAC,UAAU,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC;QAClF,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;AACH,CAAC,CAAC,CAAC;AAEL,+CAA+C;AAC/C,yBAAyB;AACzB,+CAA+C;AAE/C,OAAO;KACJ,OAAO,CAAC,OAAO,CAAC;KAChB,WAAW,CAAC,yCAAyC,CAAC;KACtD,QAAQ,CAAC,WAAW,EAAE,2BAA2B,CAAC;KAClD,MAAM,CAAC,uBAAuB,EAAE,aAAa,CAAC;KAC9C,MAAM,CAAC,iBAAiB,EAAE,0BAA0B,CAAC;KACrD,MAAM,CAAC,YAAY,EAAE,gBAAgB,CAAC;KACtC,MAAM,CAAC,oBAAoB,EAAE,2BAA2B,CAAC;KACzD,MAAM,CAAC,KAAK,EAAE,WAAmB,EAAE,OAKnC,EAAE,EAAE;IACH,IAAI,CAAC;QACH,MAAM,eAAe,GAAwB,EAAE,CAAC;QAChD,IAAI,OAAO,CAAC,UAAU;YAAE,eAAe,CAAC,iBAAiB,GAAG,OAAO,CAAC,UAAU,CAAC;QAE/E,MAAM,MAAM,GAAG,MAAM,aAAa,CAAC,WAAW,EAAE;YAC9C,UAAU,EAAE,OAAO,CAAC,MAAM;YAC1B,eAAe;YACf,OAAO,EAAE,OAAO,CAAC,KAAK,KAAK,KAAK;YAChC,eAAe,EAAE,OAAO,CAAC,aAAa,KAAK,KAAK;SACjD,CAAC,CAAC;QAEH,IAAI,CAAC,MAAM,CAAC,MAAM,EAAE,CAAC;YACnB,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAClB,CAAC;IACH,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,OAAO,CAAC,KAAK,CAAC,UAAU,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC;QAClF,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;AACH,CAAC,CAAC,CAAC;AAEL,+CAA+C;AAC/C,oBAAoB;AACpB,+CAA+C;AAE/C,OAAO;KACJ,OAAO,CAAC,QAAQ,CAAC;KACjB,WAAW,CAAC,mCAAmC,CAAC;KAChD,cAAc,CAAC,sBAAsB,EAAE,2BAA2B,CAAC;KACnE,MAAM,CAAC,qBAAqB,EAAE,sBAAsB,CAAC;KACrD,MAAM,CAAC,eAAe,EAAE,yBAAyB,CAAC;KAClD,MAAM,CAAC,KAAK,EAAE,OAId,EAAE,EAAE;IACH,IAAI,CAAC;QACH,MAAM,WAAW,GAAG,MAAM,EAAE,CAAC,QAAQ,CAAC,OAAO,CAAC,OAAO,EAAE,OAAO,CAAC,CAAC;QAChE,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,WAAW,CAIlC,CAAC;QAEF,MAAM,UAAU,GAAoB;YAClC,SAAS,EAA
E,IAAI,CAAC,SAAS;YACzB,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC;SACrC,CAAC;QACF,MAAM,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;QAChD,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC;QAE9C,MAAM,MAAM,GAAG,MAAM,cAAc,CACjC,UAAU,EAAE,OAAO,EAAE,MAAM,EAAE,OAAO,CAAC,MAAM,EAAE,IAAI,CAAC,QAAQ,CAC3D,CAAC;QAEF,IAAI,CAAC,OAAO,CAAC,MAAM,EAAE,CAAC;YACpB,OAAO,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC;QACtB,CAAC;QAED,IAAI,OAAO,CAAC,IAAI,EAAE,CAAC;YACjB,MAAM,mBAAmB,CACvB,UAAU,EAAE,OAAO,EAAE,MAAM,EAAE,OAAO,CAAC,IAAI,EAAE,IAAI,CAAC,QAAQ,CACzD,CAAC;QACJ,CAAC;IACH,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,OAAO,CAAC,KAAK,CAAC,UAAU,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC;QAClF,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;AACH,CAAC,CAAC,CAAC;AAEL,+CAA+C;AAC/C,uBAAuB;AACvB,+CAA+C;AAE/C,OAAO;KACJ,OAAO,CAAC,aAAa,CAAC;KACtB,WAAW,CAAC,2CAA2C,CAAC;KACxD,QAAQ,CAAC,cAAc,EAAE,mBAAmB,CAAC;KAC7C,MAAM,CAAC,qBAAqB,EAAE,0BAA0B,CAAC;KACzD,MAAM,CAAC,0BAA0B,EAAE,6BAA6B,EAAE,IAAI,CAAC;KACvE,MAAM,CAAC,KAAK,EAAE,SAAiB,EAAE,OAGjC,EAAE,EAAE;IACH,MAAM,UAAU,GAAG,OAAO,CAAC,MAAM,IAAI,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,GAAG,EAAE,EAAE,OAAO,EAAE,SAAS,EAAE,YAAY,CAAC,CAAC;IAChG,MAAM,QAAQ,GAAG,QAAQ,CAAC,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC,CAAC;IAEhD,MAAM,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,UAAU,CAAC,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAE9D,MAAM,QAAQ,GAAG,kBAAkB,CAAC,SAAS,EAAE,QAAQ,CAAC,CAAC;IACzD,MAAM,EAAE,CAAC,SAAS,CAAC,UAAU,EAAE,QAAQ,CAAC,CAAC;IAEzC,OAAO,CAAC,GAAG,CAAC,gCAAgC,UAAU,EAAE,CAAC,CAAC;IAC1D,OAAO,CAAC,GAAG,EAAE,CAAC;IACd,OAAO,CAAC,GAAG,CAAC,aAAa,CAAC,CAAC;IAC3B,OAAO,CAAC,GAAG,CAAC,WAAW,UAAU,+BAA+B,CAAC,CAAC;IAClE,OAAO,CAAC,GAAG,CAAC,+BAA+B,UAAU,EAAE,CAAC,CAAC;AAC3D,CAAC,CAAC,CAAC;AAEL,+CAA+C;AAC/C,8BAA8B;AAC9B,+CAA+C;AAE/C,OAAO;KACJ,OAAO,CAAC,OAAO,CAAC;KAChB,WAAW,CAAC,kCAAkC,CAAC;KAC/C,QAAQ,CAAC,cAAc,EAAE,mBAAmB,CAAC;KAC7C,MAAM,CAAC,wBAAwB,
EAAE,8BAA8B,CAAC;KAChE,MAAM,CAAC,qBAAqB,EAAE,oCAAoC,CAAC;KACnE,MAAM,CAAC,KAAK,EAAE,SAAiB,EAAE,OAGjC,EAAE,EAAE;IACH,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,GAAG,EAAE,EAAE,OAAO,EAAE,SAAS,CAAC,CAAC;IAC7D,MAAM,QAAQ,GAAG,OAAO,CAAC,QAAQ,IAAI,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,YAAY,CAAC,CAAC;IAEtE,IAAI,CAAC;QACH,MAAM,EAAE,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC;IAC5B,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,CAAC,KAAK,CAAC,qCAAqC,QAAQ,EAAE,CAAC,CAAC;QAC/D,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAED,MAAM,UAAU,GAAG,MAAM,aAAa,CAAC,QAAQ,CAAC,CAAC;IACjD,MAAM,IAAI,GAAG,IAAI,CAAC,SAAS,CAAC,UAAU,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC;IAEjD,IAAI,OAAO,CAAC,MAAM,EAAE,CAAC;QACnB,MAAM,EAAE,CAAC,SAAS,CAAC,OAAO,CAAC,MAAM,EAAE,IAAI,CAAC,CAAC;QACzC,OAAO,CAAC,KAAK,CAAC,UAAU,UAAU,CAAC,KAAK,CAAC,MAAM,cAAc,OAAO,CAAC,MAAM,EAAE,CAAC,CAAC;IACjF,CAAC;SAAM,CAAC;QACN,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;IACpB,CAAC;AACH,CAAC,CAAC,CAAC;AAEL,+CAA+C;AAC/C,WAAW;AACX,+CAA+C;AAE/C,OAAO;KACJ,OAAO,CAAC,UAAU,CAAC;KACnB,WAAW,CAAC,4BAA4B,CAAC;KACzC,QAAQ,CAAC,QAAQ,EAAE,yBAAyB,CAAC;KAC7C,MAAM,CAAC,KAAK,EAAE,IAAY,EAAE,EAAE;IAC7B,MAAM,MAAM,GAAG,MAAM,gBAAgB,CAAC,IAAI,CAAC,CAAC;IAE5C,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACxB,MAAM,UAAU,GAAG,MAAM,aAAa,CAAC,IAAI,CAAC,CAAC;QAC7C,OAAO,CAAC,GAAG,CAAC,UAAU,UAAU,CAAC,KAAK,CAAC,MAAM,uBAAuB,UAAU,CAAC,SAAS,GAAG,CAAC,CAAC;IAC/F,CAAC;SAAM,CAAC;QACN,OAAO,CAAC,KAAK,CAAC,wBAAwB,IAAI,GAAG,CAAC,CAAC;QAC/C,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;YAC3B,OAAO,CAAC,KAAK,CAAC,OAAO,KAAK,EAAE,CAAC,CAAC;QAChC,CAAC;QACD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;AACH,CAAC,CAAC,CAAC;AAEL,+CAA+C;AAC/C,qCAAqC;AACrC,+CAA+C;AAE/C,OAAO;KACJ,OAAO,CAAC,OAAO,CAAC;KAChB,WAAW,CAAC,4CAA4C,CAAC;KACzD,cAAc,CAAC,gBAAgB,EAAE,SAAS,CAAC;KAC3C,cAAc,CAAC,iBAAiB,EAAE,sBAAsB,CAAC;KACzD,cAAc,CAAC,yBAAyB,EAAE,6BAA6B,CAAC;KACxE,cAAc,CAAC,iBAAiB,EAAE,uBAAuB,CAAC;KAC1D,MAAM,CAAC,wBAAwB,EAAE,uCAAuC,EAAE,EAAE,CAAC;KAC7E,MAAM,CAAC,qBAAqB,EAAE,wCAAwC,EAAE,EAAE,CAAC;KAC3E,MAAM,CAAC,iBAAiB,EAAE,8BAA8B,CAAC;KACzD,MAAM,CAAC,0BAA0B,EAAE,oCAAoC,CAAC;KACx
E,MAAM,CAAC,KAAK,EAAE,OASd,EAAE,EAAE;IACH,MAAM,IAAI,GAAa;QACrB,EAAE,EAAE,OAAO,CAAC,MAAM;QAClB,MAAM,EAAE,OAAO,CAAC,MAAM;QACtB,iBAAiB,EAAE,OAAO,CAAC,aAAa;QACxC,QAAQ,EAAE;YACR,EAAE,SAAS,EAAE,WAAW,EAAE,MAAM,EAAE,GAAG,EAAE,WAAW,EAAE,iBAAiB,EAAE;YACvE,EAAE,SAAS,EAAE,WAAW,EAAE,MAAM,EAAE,GAAG,EAAE,WAAW,EAAE,uBAAuB,EAAE;YAC7E,EAAE,SAAS,EAAE,QAAQ,EAAE,MAAM,EAAE,GAAG,EAAE,WAAW,EAAE,gBAAgB,EAAE;SACpE;QACD,eAAe,EAAE,OAAO,CAAC,SAAS,CAAC,CAAC,CAAC,OAAO,CAAC,SAAS,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC,EAAE;KAC1F,CAAC;IAEF,MAAM,MAAM,GAAe;QACzB,MAAM,EAAE,OAAO,CAAC,MAAM;QACtB,MAAM,EAAE,OAAO,CAAC,MAAM;QACtB,MAAM,EAAE,OAAO,CAAC,MAAM;QACtB,UAAU,EAAE,CAAC;QACb,QAAQ,EAAE,CAAC;QACX,OAAO,EAAE,CAAC;QACV,UAAU,EAAE,OAAO,CAAC,UAAU,CAAC,CAAC,CAAC,OAAO,CAAC,UAAU,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC,EAAE;QACtF,SAAS,EAAE,EAAE;QACb,OAAO,EAAE,KAAK;QACd,YAAY,EAAE,EAAE;KACjB,CAAC;IAEF,MAAM,KAAK,GAAG,IAAI,UAAU,CAAC,EAAE,KAAK,EAAE,OAAO,CAAC,KAAK,EAAE,CAAC,CAAC;IACvD,OAAO,CAAC,KAAK,CAAC,gBAAgB,OAAO,CAAC,MAAM,KAAK,CAAC,CAAC;IACnD,MAAM,KAAK,GAAG,MAAM,KAAK,CAAC,WAAW,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;IAEpD,MAAM,IAAI,GAAG,IAAI,CAAC,SAAS,CAAC,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC;IAC5C,IAAI,OAAO,CAAC,UAAU,EAAE,CAAC;QACvB,MAAM,EAAE,CAAC,SAAS,CAAC,OAAO,CAAC,UAAU,EAAE,IAAI,CAAC,CAAC;QAC7C,OAAO,CAAC,KAAK,CAAC,mBAAmB,OAAO,CAAC,UAAU,EAAE,CAAC,CAAC;IACzD,CAAC;SAAM,CAAC;QACN,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;IACpB,CAAC;AACH,CAAC,CAAC,CAAC;AAEL,OAAO,CAAC,KAAK,EAAE,CAAC"}
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Centralized configuration for the skill evaluation framework.
|
|
3
|
+
*
|
|
4
|
+
* Configuration is loaded with the following precedence (lowest to highest):
|
|
5
|
+
* 1. Built-in defaults
|
|
6
|
+
* 2. Config file (eval.config.yaml or custom path)
|
|
7
|
+
* 3. Environment variables (EVAL_* prefix)
|
|
8
|
+
* 4. Programmatic overrides (CLI flags or API)
|
|
9
|
+
*
|
|
10
|
+
* Supports both Anthropic API and Bedrock via Agent SDK env vars:
|
|
11
|
+
* - Anthropic: Set ANTHROPIC_API_KEY
|
|
12
|
+
* - Bedrock: Set CLAUDE_CODE_USE_BEDROCK=1, AWS_REGION, AWS_PROFILE
|
|
13
|
+
*/
|
|
14
|
+
/**
 * Fully-resolved configuration for an evaluation run.
 */
export interface EvalConfig {
    /** Model used for task execution. */
    defaultAgentModel: string;
    /** Model used for scoring. */
    defaultJudgeModel: string;
    /** Weight given to each scoring dimension. */
    defaultWeights: Record<'discovery' | 'adherence' | 'output', number>;
    /** Maximum number of characters of output shown to the judge. */
    judgeOutputTruncation: number;
    /** Maximum number of characters of output included in reports. */
    reportOutputTruncation: number;
    /** Per-task timeout, in milliseconds. */
    taskTimeoutMs: number;
    /** Whether failures should exit with code 1. */
    exitOnFailure: boolean;
    /** Directory that results are written to. */
    outputDir: string;
    /** Whether to write a GitHub Actions step summary. */
    githubSummary: boolean;
    /** Minimum discovery rate (0-1). */
    discoveryThreshold: number;
    /** Minimum average score (1-5). */
    scoreThreshold: number;
    /**
     * Directory list taken from `runner.allowed_write_dirs` in the config file.
     * NOTE(review): presumably the directories the agent may write to —
     * confirm against the runner.
     */
    allowedWriteDirs: string[];
}
|
|
32
|
+
/**
 * Built-in default configuration values. These are the lowest-precedence
 * source: config-file, environment-variable and programmatic settings are
 * layered on top of them.
 */
|
|
35
|
+
export declare const DEFAULT_CONFIG: EvalConfig;
|
|
36
|
+
/**
 * Load the full configuration with all sources merged.
 *
 * Sources, from lowest to highest precedence: built-in defaults, the config
 * file, `EVAL_*` environment variables, then `overrides`.
 *
 * @param configPath - Optional path to eval.config.yaml
 * @param overrides - Optional programmatic overrides (CLI flags)
 * @returns A promise resolving to the merged configuration.
 */
|
|
42
|
+
export declare function loadConfig(configPath?: string, overrides?: Partial<EvalConfig>): Promise<EvalConfig>;
|
|
43
|
+
/**
 * Load configuration synchronously (env vars + defaults only, no file).
 * Useful when you can't await — e.g., in constructors.
 *
 * @param overrides - Optional programmatic overrides.
 * @returns The resolved configuration; config-file settings are not included.
 */
|
|
47
|
+
export declare function loadConfigSync(overrides?: Partial<EvalConfig>): EvalConfig;
|
|
48
|
+
/**
 * Get default weights for scoring dimensions.
 *
 * @param config - Optional configuration to take the weights from.
 * @returns A map of weights. NOTE(review): presumably keyed by dimension name
 *   (discovery, adherence, output) — confirm against the implementation.
 */
|
|
51
|
+
export declare function getDefaultWeights(config?: EvalConfig): Map<string, number>;
|
|
52
|
+
//# sourceMappingURL=config.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"config.d.ts","sourceRoot":"","sources":["../../src/config.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;GAYG;AAMH,MAAM,WAAW,UAAU;IAEzB,iBAAiB,EAAE,MAAM,CAAC;IAC1B,iBAAiB,EAAE,MAAM,CAAC;IAG1B,cAAc,EAAE;QACd,SAAS,EAAE,MAAM,CAAC;QAClB,SAAS,EAAE,MAAM,CAAC;QAClB,MAAM,EAAE,MAAM,CAAC;KAChB,CAAC;IAGF,qBAAqB,EAAE,MAAM,CAAC;IAC9B,sBAAsB,EAAE,MAAM,CAAC;IAG/B,aAAa,EAAE,MAAM,CAAC;IAGtB,aAAa,EAAE,OAAO,CAAC;IACvB,SAAS,EAAE,MAAM,CAAC;IAClB,aAAa,EAAE,OAAO,CAAC;IAGvB,kBAAkB,EAAE,MAAM,CAAC;IAC3B,cAAc,EAAE,MAAM,CAAC;IAGvB,gBAAgB,EAAE,MAAM,EAAE,CAAC;CAC5B;AAED;;GAEG;AACH,eAAO,MAAM,cAAc,EAAE,UAiB5B,CAAC;AAyJF;;;;;GAKG;AACH,wBAAsB,UAAU,CAC9B,UAAU,CAAC,EAAE,MAAM,EACnB,SAAS,CAAC,EAAE,OAAO,CAAC,UAAU,CAAC,GAC9B,OAAO,CAAC,UAAU,CAAC,CAKrB;AAED;;;GAGG;AACH,wBAAgB,cAAc,CAAC,SAAS,CAAC,EAAE,OAAO,CAAC,UAAU,CAAC,GAAG,UAAU,CAG1E;AAED;;GAEG;AACH,wBAAgB,iBAAiB,CAAC,MAAM,CAAC,EAAE,UAAU,GAAG,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAO1E"}
|
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Centralized configuration for the skill evaluation framework.
|
|
3
|
+
*
|
|
4
|
+
* Configuration is loaded with the following precedence (lowest to highest):
|
|
5
|
+
* 1. Built-in defaults
|
|
6
|
+
* 2. Config file (eval.config.yaml or custom path)
|
|
7
|
+
* 3. Environment variables (EVAL_* prefix)
|
|
8
|
+
* 4. Programmatic overrides (CLI flags or API)
|
|
9
|
+
*
|
|
10
|
+
* Supports both Anthropic API and Bedrock via Agent SDK env vars:
|
|
11
|
+
* - Anthropic: Set ANTHROPIC_API_KEY
|
|
12
|
+
* - Bedrock: Set CLAUDE_CODE_USE_BEDROCK=1, AWS_REGION, AWS_PROFILE
|
|
13
|
+
*/
|
|
14
|
+
import yaml from 'js-yaml';
|
|
15
|
+
import * as fs from 'fs/promises';
|
|
16
|
+
import * as path from 'path';
|
|
17
|
+
/**
 * Built-in configuration defaults — the lowest-precedence source, used for
 * every setting that no config file, environment variable or override supplies.
 */
export const DEFAULT_CONFIG = {
    // Models: agent executes tasks, judge scores them.
    defaultAgentModel: 'sonnet',
    defaultJudgeModel: 'haiku',
    // Weight of each scoring dimension.
    defaultWeights: { discovery: 0.3, adherence: 0.4, output: 0.3 },
    // Output truncation limits, in characters.
    judgeOutputTruncation: 5000,
    reportOutputTruncation: 2000,
    // Per-task timeout: 5 minutes, in milliseconds.
    taskTimeoutMs: 5 * 60 * 1000,
    exitOnFailure: true,
    outputDir: './results',
    githubSummary: false,
    // Pass thresholds: discovery rate (0-1) and average score (1-5).
    discoveryThreshold: 0.8,
    scoreThreshold: 4,
    allowedWriteDirs: ['./results/', './fixtures/'],
};
|
|
38
|
+
/**
 * Load a YAML config file and translate it into a partial configuration.
 *
 * Only settings present in the file appear in the result, so it can be merged
 * over the defaults. Missing weight entries fall back to the defaults.
 *
 * Loading stays best-effort: the function never throws and returns `{}` when
 * the file cannot be used. A missing file at the default location is silent.
 * A warning is written to stderr when an explicitly requested `configPath`
 * cannot be read, or when a file exists but cannot be read or parsed, so a
 * mistyped path or broken YAML does not go unnoticed.
 *
 * @param configPath - Optional explicit path; when omitted, `eval.config.yaml`
 *   in the current working directory is used.
 * @returns The settings found in the file, or `{}` when the file is missing,
 *   empty or unusable.
 */
async function loadConfigFile(configPath) {
    const filePath = configPath || path.join(process.cwd(), 'eval.config.yaml');
    let raw;
    try {
        const content = await fs.readFile(filePath, 'utf-8');
        raw = yaml.load(content);
    }
    catch (error) {
        const isMissing = error instanceof Error && 'code' in error && error.code === 'ENOENT';
        // Stay quiet only for the common case: no default config file present.
        if (configPath || !isMissing) {
            const reason = error instanceof Error ? error.message : String(error);
            console.warn(`Warning: ignoring config file ${filePath}: ${reason}`);
        }
        return {};
    }
    // Empty documents and scalar documents carry no settings.
    if (!raw || typeof raw !== 'object')
        return {};
    const config = {};
    if (raw.models?.agent)
        config.defaultAgentModel = raw.models.agent;
    if (raw.models?.judge)
        config.defaultJudgeModel = raw.models.judge;
    if (raw.scoring?.weights) {
        config.defaultWeights = {
            discovery: raw.scoring.weights.discovery ?? DEFAULT_CONFIG.defaultWeights.discovery,
            adherence: raw.scoring.weights.adherence ?? DEFAULT_CONFIG.defaultWeights.adherence,
            output: raw.scoring.weights.output ?? DEFAULT_CONFIG.defaultWeights.output,
        };
    }
    // `!== undefined` (not truthiness) so that 0 and false in the file are kept.
    if (raw.thresholds?.discovery_rate !== undefined)
        config.discoveryThreshold = raw.thresholds.discovery_rate;
    if (raw.thresholds?.avg_score !== undefined)
        config.scoreThreshold = raw.thresholds.avg_score;
    if (raw.runner?.timeout_ms !== undefined)
        config.taskTimeoutMs = raw.runner.timeout_ms;
    if (raw.runner?.allowed_write_dirs)
        config.allowedWriteDirs = raw.runner.allowed_write_dirs;
    if (raw.output?.dir)
        config.outputDir = raw.output.dir;
    if (raw.output?.judge_truncation !== undefined)
        config.judgeOutputTruncation = raw.output.judge_truncation;
    if (raw.output?.report_truncation !== undefined)
        config.reportOutputTruncation = raw.output.report_truncation;
    if (raw.ci?.exit_on_failure !== undefined)
        config.exitOnFailure = raw.ci.exit_on_failure;
    if (raw.ci?.github_summary !== undefined)
        config.githubSummary = raw.ci.github_summary;
    return config;
}
|
|
85
|
+
/**
 * Load configuration from environment variables.
 *
 * Only variables that are set to a usable value contribute a key, so the
 * result can be layered over file settings and defaults. Unset, empty, or
 * unparseable values are ignored.
 *
 * Supported variables:
 * - EVAL_AGENT_MODEL: Model for task execution (default: 'sonnet')
 * - EVAL_JUDGE_MODEL: Model for scoring (default: 'haiku')
 * - EVAL_OUTPUT_TRUNCATION: Max chars to show judge (default: 5000)
 * - EVAL_REPORT_TRUNCATION: Max chars in reports (default: 2000)
 * - EVAL_TASK_TIMEOUT_MS: Per-task timeout in ms (default: 300000)
 * - EVAL_EXIT_ON_FAILURE: Exit with code 1 on failures (default: true)
 * - EVAL_OUTPUT_DIR: Directory for results (default: './results')
 * - EVAL_DISCOVERY_THRESHOLD: Min discovery rate 0-1 (default: 0.8)
 * - EVAL_SCORE_THRESHOLD: Min avg score 1-5 (default: 4.0)
 * - EVAL_GITHUB_SUMMARY: Write GitHub Actions summary (default: false)
 *
 * Boolean variables accept `true` / `false` in any letter case, with
 * surrounding whitespace ignored. Any other non-blank value keeps the
 * historical per-variable meaning: enabled for EVAL_EXIT_ON_FAILURE,
 * disabled for EVAL_GITHUB_SUMMARY.
 *
 * @returns A partial config containing only the keys supplied via the environment.
 */
function loadEnvConfig() {
    const env = process.env;
    // Each reader yields `undefined` when the variable supplies no usable value.
    const readString = (name) => env[name] || undefined;
    const readInt = (name) => {
        const parsed = parseInt(env[name] || '', 10);
        return isNaN(parsed) ? undefined : parsed;
    };
    const readFloat = (name) => {
        const parsed = parseFloat(env[name] || '');
        return isNaN(parsed) ? undefined : parsed;
    };
    const readBool = (name, unrecognised) => {
        const normalized = (env[name] ?? '').trim().toLowerCase();
        // A blank variable (common when CI forwards an empty input) counts as
        // unset, matching how every other variable here is handled.
        if (normalized === '')
            return undefined;
        if (normalized === 'true')
            return true;
        if (normalized === 'false')
            return false;
        return unrecognised;
    };
    const candidates = {
        defaultAgentModel: readString('EVAL_AGENT_MODEL'),
        defaultJudgeModel: readString('EVAL_JUDGE_MODEL'),
        judgeOutputTruncation: readInt('EVAL_OUTPUT_TRUNCATION'),
        reportOutputTruncation: readInt('EVAL_REPORT_TRUNCATION'),
        taskTimeoutMs: readInt('EVAL_TASK_TIMEOUT_MS'),
        exitOnFailure: readBool('EVAL_EXIT_ON_FAILURE', true),
        outputDir: readString('EVAL_OUTPUT_DIR'),
        discoveryThreshold: readFloat('EVAL_DISCOVERY_THRESHOLD'),
        scoreThreshold: readFloat('EVAL_SCORE_THRESHOLD'),
        githubSummary: readBool('EVAL_GITHUB_SUMMARY', false),
    };
    // Drop everything the environment did not supply so lower-priority
    // sources are not overwritten with `undefined`.
    return Object.fromEntries(Object.entries(candidates).filter(([, value]) => value !== undefined));
}
|
|
131
|
+
/**
|
|
132
|
+
* Deep merge multiple partial configs into a full config.
|
|
133
|
+
*/
|
|
134
|
+
function mergeConfigs(...configs) {
|
|
135
|
+
const result = { ...DEFAULT_CONFIG };
|
|
136
|
+
for (const config of configs) {
|
|
137
|
+
if (config.defaultAgentModel !== undefined)
|
|
138
|
+
result.defaultAgentModel = config.defaultAgentModel;
|
|
139
|
+
if (config.defaultJudgeModel !== undefined)
|
|
140
|
+
result.defaultJudgeModel = config.defaultJudgeModel;
|
|
141
|
+
if (config.defaultWeights !== undefined)
|
|
142
|
+
result.defaultWeights = { ...result.defaultWeights, ...config.defaultWeights };
|
|
143
|
+
if (config.judgeOutputTruncation !== undefined)
|
|
144
|
+
result.judgeOutputTruncation = config.judgeOutputTruncation;
|
|
145
|
+
if (config.reportOutputTruncation !== undefined)
|
|
146
|
+
result.reportOutputTruncation = config.reportOutputTruncation;
|
|
147
|
+
if (config.taskTimeoutMs !== undefined)
|
|
148
|
+
result.taskTimeoutMs = config.taskTimeoutMs;
|
|
149
|
+
if (config.exitOnFailure !== undefined)
|
|
150
|
+
result.exitOnFailure = config.exitOnFailure;
|
|
151
|
+
if (config.outputDir !== undefined)
|
|
152
|
+
result.outputDir = config.outputDir;
|
|
153
|
+
if (config.githubSummary !== undefined)
|
|
154
|
+
result.githubSummary = config.githubSummary;
|
|
155
|
+
if (config.discoveryThreshold !== undefined)
|
|
156
|
+
result.discoveryThreshold = config.discoveryThreshold;
|
|
157
|
+
if (config.scoreThreshold !== undefined)
|
|
158
|
+
result.scoreThreshold = config.scoreThreshold;
|
|
159
|
+
if (config.allowedWriteDirs !== undefined)
|
|
160
|
+
result.allowedWriteDirs = config.allowedWriteDirs;
|
|
161
|
+
}
|
|
162
|
+
return result;
|
|
163
|
+
}
|
|
164
|
+
/**
 * Build the effective configuration from every available source.
 *
 * Sources are layered from lowest to highest priority: the YAML config
 * file, then environment variables, then the supplied overrides.
 *
 * @param configPath - Optional path to eval.config.yaml
 * @param overrides - Optional programmatic overrides (CLI flags)
 */
export async function loadConfig(configPath, overrides) {
    // Array elements evaluate left to right, so the file is read before the
    // environment is sampled.
    const layers = [
        await loadConfigFile(configPath),
        loadEnvConfig(),
        overrides ?? {},
    ];
    return mergeConfigs(...layers);
}
|
|
175
|
+
/**
 * Synchronous variant of config loading: environment variables and
 * overrides only, with no config file read.
 * Intended for call sites that cannot await, such as constructors.
 */
export function loadConfigSync(overrides) {
    return mergeConfigs(loadEnvConfig(), overrides ?? {});
}
|
|
183
|
+
/**
|
|
184
|
+
* Get default weights for scoring dimensions.
|
|
185
|
+
*/
|
|
186
|
+
export function getDefaultWeights(config) {
|
|
187
|
+
const c = config ?? DEFAULT_CONFIG;
|
|
188
|
+
return new Map([
|
|
189
|
+
['discovery', c.defaultWeights.discovery],
|
|
190
|
+
['adherence', c.defaultWeights.adherence],
|
|
191
|
+
['output', c.defaultWeights.output],
|
|
192
|
+
]);
|
|
193
|
+
}
|
|
194
|
+
//# sourceMappingURL=config.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"config.js","sourceRoot":"","sources":["../../src/config.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;GAYG;AAEH,OAAO,IAAI,MAAM,SAAS,CAAC;AAC3B,OAAO,KAAK,EAAE,MAAM,aAAa,CAAC;AAClC,OAAO,KAAK,IAAI,MAAM,MAAM,CAAC;AAkC7B;;GAEG;AACH,MAAM,CAAC,MAAM,cAAc,GAAe;IACxC,iBAAiB,EAAE,QAAQ;IAC3B,iBAAiB,EAAE,OAAO;IAC1B,cAAc,EAAE;QACd,SAAS,EAAE,GAAG;QACd,SAAS,EAAE,GAAG;QACd,MAAM,EAAE,GAAG;KACZ;IACD,qBAAqB,EAAE,IAAI;IAC3B,sBAAsB,EAAE,IAAI;IAC5B,aAAa,EAAE,MAAM,EAAE,YAAY;IACnC,aAAa,EAAE,IAAI;IACnB,SAAS,EAAE,WAAW;IACtB,aAAa,EAAE,KAAK;IACpB,kBAAkB,EAAE,GAAG;IACvB,cAAc,EAAE,GAAG;IACnB,gBAAgB,EAAE,CAAC,YAAY,EAAE,aAAa,CAAC;CAChD,CAAC;AAoCF;;GAEG;AACH,KAAK,UAAU,cAAc,CAAC,UAAmB;IAC/C,MAAM,QAAQ,GAAG,UAAU,IAAI,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,GAAG,EAAE,EAAE,kBAAkB,CAAC,CAAC;IAE5E,IAAI,CAAC;QACH,MAAM,OAAO,GAAG,MAAM,EAAE,CAAC,QAAQ,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;QACrD,MAAM,GAAG,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,CAAkB,CAAC;QAChD,IAAI,CAAC,GAAG;YAAE,OAAO,EAAE,CAAC;QAEpB,MAAM,MAAM,GAAwB,EAAE,CAAC;QAEvC,IAAI,GAAG,CAAC,MAAM,EAAE,KAAK;YAAE,MAAM,CAAC,iBAAiB,GAAG,GAAG,CAAC,MAAM,CAAC,KAAK,CAAC;QACnE,IAAI,GAAG,CAAC,MAAM,EAAE,KAAK;YAAE,MAAM,CAAC,iBAAiB,GAAG,GAAG,CAAC,MAAM,CAAC,KAAK,CAAC;QAEnE,IAAI,GAAG,CAAC,OAAO,EAAE,OAAO,EAAE,CAAC;YACzB,MAAM,CAAC,cAAc,GAAG;gBACtB,SAAS,EAAE,GAAG,CAAC,OAAO,CAAC,OAAO,CAAC,SAAS,IAAI,cAAc,CAAC,cAAc,CAAC,SAAS;gBACnF,SAAS,EAAE,GAAG,CAAC,OAAO,CAAC,OAAO,CAAC,SAAS,IAAI,cAAc,CAAC,cAAc,CAAC,SAAS;gBACnF,MAAM,EAAE,GAAG,CAAC,OAAO,CAAC,OAAO,CAAC,MAAM,IAAI,cAAc,CAAC,cAAc,CAAC,MAAM;aAC3E,CAAC;QACJ,CAAC;QAED,IAAI,GAAG,CAAC,UAAU,EAAE,cAAc,KAAK,SAAS;YAAE,MAAM,CAAC,kBAAkB,GAAG,GAAG,CAAC,UAAU,CAAC,cAAc,CAAC;QAC5G,IAAI,GAAG,CAAC,UAAU,EAAE,SAAS,KAAK,SAAS;YAAE,MAAM,CAAC,cAAc,GAAG,GAAG,CAAC,UAAU,CAAC,SAAS,CAAC;QAE9F,IAAI,GAAG,CAAC,MAAM,EAAE,UAAU,KAAK,SAAS;YAAE,MAAM,CAAC,aAAa,GAAG,GAAG,CAAC,MAAM,CAAC,UAAU,CAAC;QACvF,IAAI,GAAG,CAAC,MAAM,EAAE,kBAAkB;YAAE,MAAM,CAAC,gBAAgB,GAAG,GAAG,CAAC,MAAM,CAAC,kBAAkB,CAAC;QAE5F,IAAI,GAAG,CAAC,MAAM,EAAE,GAAG;YAAE,MAAM,CAAC,SAAS,GAAG,GAAG,CAAC,MAAM,CAAC,GAAG,C
AAC;QACvD,IAAI,GAAG,CAAC,MAAM,EAAE,gBAAgB,KAAK,SAAS;YAAE,MAAM,CAAC,qBAAqB,GAAG,GAAG,CAAC,MAAM,CAAC,gBAAgB,CAAC;QAC3G,IAAI,GAAG,CAAC,MAAM,EAAE,iBAAiB,KAAK,SAAS;YAAE,MAAM,CAAC,sBAAsB,GAAG,GAAG,CAAC,MAAM,CAAC,iBAAiB,CAAC;QAE9G,IAAI,GAAG,CAAC,EAAE,EAAE,eAAe,KAAK,SAAS;YAAE,MAAM,CAAC,aAAa,GAAG,GAAG,CAAC,EAAE,CAAC,eAAe,CAAC;QACzF,IAAI,GAAG,CAAC,EAAE,EAAE,cAAc,KAAK,SAAS;YAAE,MAAM,CAAC,aAAa,GAAG,GAAG,CAAC,EAAE,CAAC,cAAc,CAAC;QAEvF,OAAO,MAAM,CAAC;IAChB,CAAC;IAAC,MAAM,CAAC;QACP,iDAAiD;QACjD,OAAO,EAAE,CAAC;IACZ,CAAC;AACH,CAAC;AAED;;;;;;;;;;;;;;GAcG;AACH,SAAS,aAAa;IACpB,MAAM,MAAM,GAAwB,EAAE,CAAC;IAEvC,IAAI,OAAO,CAAC,GAAG,CAAC,gBAAgB;QAAE,MAAM,CAAC,iBAAiB,GAAG,OAAO,CAAC,GAAG,CAAC,gBAAgB,CAAC;IAC1F,IAAI,OAAO,CAAC,GAAG,CAAC,gBAAgB;QAAE,MAAM,CAAC,iBAAiB,GAAG,OAAO,CAAC,GAAG,CAAC,gBAAgB,CAAC;IAE1F,MAAM,UAAU,GAAG,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,sBAAsB,IAAI,EAAE,EAAE,EAAE,CAAC,CAAC;IAC1E,IAAI,CAAC,KAAK,CAAC,UAAU,CAAC;QAAE,MAAM,CAAC,qBAAqB,GAAG,UAAU,CAAC;IAElE,MAAM,gBAAgB,GAAG,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,sBAAsB,IAAI,EAAE,EAAE,EAAE,CAAC,CAAC;IAChF,IAAI,CAAC,KAAK,CAAC,gBAAgB,CAAC;QAAE,MAAM,CAAC,sBAAsB,GAAG,gBAAgB,CAAC;IAE/E,MAAM,OAAO,GAAG,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,oBAAoB,IAAI,EAAE,EAAE,EAAE,CAAC,CAAC;IACrE,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC;QAAE,MAAM,CAAC,aAAa,GAAG,OAAO,CAAC;IAEpD,IAAI,OAAO,CAAC,GAAG,CAAC,oBAAoB,KAAK,SAAS,EAAE,CAAC;QACnD,MAAM,CAAC,aAAa,GAAG,OAAO,CAAC,GAAG,CAAC,oBAAoB,KAAK,OAAO,CAAC;IACtE,CAAC;IAED,IAAI,OAAO,CAAC,GAAG,CAAC,eAAe;QAAE,MAAM,CAAC,SAAS,GAAG,OAAO,CAAC,GAAG,CAAC,eAAe,CAAC;IAEhF,MAAM,kBAAkB,GAAG,UAAU,CAAC,OAAO,CAAC,GAAG,CAAC,wBAAwB,IAAI,EAAE,CAAC,CAAC;IAClF,IAAI,CAAC,KAAK,CAAC,kBAAkB,CAAC;QAAE,MAAM,CAAC,kBAAkB,GAAG,kBAAkB,CAAC;IAE/E,MAAM,cAAc,GAAG,UAAU,CAAC,OAAO,CAAC,GAAG,CAAC,oBAAoB,IAAI,EAAE,CAAC,CAAC;IAC1E,IAAI,CAAC,KAAK,CAAC,cAAc,CAAC;QAAE,MAAM,CAAC,cAAc,GAAG,cAAc,CAAC;IAEnE,IAAI,OAAO,CAAC,GAAG,CAAC,mBAAmB,KAAK,SAAS,EAAE,CAAC;QAClD,MAAM,CAAC,aAAa,GAAG,OAAO,CAAC,GAAG,CAAC,mBAAmB,KAAK,MAAM,CAAC;IACpE,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;GAEG;AACH,SAAS,YAAY,CA
AC,GAAG,OAA8B;IACrD,MAAM,MAAM,GAAG,EAAE,GAAG,cAAc,EAAE,CAAC;IAErC,KAAK,MAAM,MAAM,IAAI,OAAO,EAAE,CAAC;QAC7B,IAAI,MAAM,CAAC,iBAAiB,KAAK,SAAS;YAAE,MAAM,CAAC,iBAAiB,GAAG,MAAM,CAAC,iBAAiB,CAAC;QAChG,IAAI,MAAM,CAAC,iBAAiB,KAAK,SAAS;YAAE,MAAM,CAAC,iBAAiB,GAAG,MAAM,CAAC,iBAAiB,CAAC;QAChG,IAAI,MAAM,CAAC,cAAc,KAAK,SAAS;YAAE,MAAM,CAAC,cAAc,GAAG,EAAE,GAAG,MAAM,CAAC,cAAc,EAAE,GAAG,MAAM,CAAC,cAAc,EAAE,CAAC;QACxH,IAAI,MAAM,CAAC,qBAAqB,KAAK,SAAS;YAAE,MAAM,CAAC,qBAAqB,GAAG,MAAM,CAAC,qBAAqB,CAAC;QAC5G,IAAI,MAAM,CAAC,sBAAsB,KAAK,SAAS;YAAE,MAAM,CAAC,sBAAsB,GAAG,MAAM,CAAC,sBAAsB,CAAC;QAC/G,IAAI,MAAM,CAAC,aAAa,KAAK,SAAS;YAAE,MAAM,CAAC,aAAa,GAAG,MAAM,CAAC,aAAa,CAAC;QACpF,IAAI,MAAM,CAAC,aAAa,KAAK,SAAS;YAAE,MAAM,CAAC,aAAa,GAAG,MAAM,CAAC,aAAa,CAAC;QACpF,IAAI,MAAM,CAAC,SAAS,KAAK,SAAS;YAAE,MAAM,CAAC,SAAS,GAAG,MAAM,CAAC,SAAS,CAAC;QACxE,IAAI,MAAM,CAAC,aAAa,KAAK,SAAS;YAAE,MAAM,CAAC,aAAa,GAAG,MAAM,CAAC,aAAa,CAAC;QACpF,IAAI,MAAM,CAAC,kBAAkB,KAAK,SAAS;YAAE,MAAM,CAAC,kBAAkB,GAAG,MAAM,CAAC,kBAAkB,CAAC;QACnG,IAAI,MAAM,CAAC,cAAc,KAAK,SAAS;YAAE,MAAM,CAAC,cAAc,GAAG,MAAM,CAAC,cAAc,CAAC;QACvF,IAAI,MAAM,CAAC,gBAAgB,KAAK,SAAS;YAAE,MAAM,CAAC,gBAAgB,GAAG,MAAM,CAAC,gBAAgB,CAAC;IAC/F,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;;;;GAKG;AACH,MAAM,CAAC,KAAK,UAAU,UAAU,CAC9B,UAAmB,EACnB,SAA+B;IAE/B,MAAM,UAAU,GAAG,MAAM,cAAc,CAAC,UAAU,CAAC,CAAC;IACpD,MAAM,SAAS,GAAG,aAAa,EAAE,CAAC;IAElC,OAAO,YAAY,CAAC,UAAU,EAAE,SAAS,EAAE,SAAS,IAAI,EAAE,CAAC,CAAC;AAC9D,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,cAAc,CAAC,SAA+B;IAC5D,MAAM,SAAS,GAAG,aAAa,EAAE,CAAC;IAClC,OAAO,YAAY,CAAC,SAAS,EAAE,SAAS,IAAI,EAAE,CAAC,CAAC;AAClD,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,iBAAiB,CAAC,MAAmB;IACnD,MAAM,CAAC,GAAG,MAAM,IAAI,cAAc,CAAC;IACnC,OAAO,IAAI,GAAG,CAAC;QACb,CAAC,WAAW,EAAE,CAAC,CAAC,cAAc,CAAC,SAAS,CAAC;QACzC,CAAC,WAAW,EAAE,CAAC,CAAC,cAAc,CAAC,SAAS,CAAC;QACzC,CAAC,QAAQ,EAAE,CAAC,CAAC,cAAc,CAAC,MAAM,CAAC;KACpC,CAAC,CAAC;AACL,CAAC"}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Skill evaluation framework.
|
|
3
|
+
*
|
|
4
|
+
* Provides tools for evaluating AI agent skill discoverability,
|
|
5
|
+
* adherence, and output quality.
|
|
6
|
+
*
|
|
7
|
+
* @packageDocumentation
|
|
8
|
+
*/
|
|
9
|
+
export type { EvalCriteria, EvalTask, EvalDefaults, SkillEvaluation, DeterministicCheck, DeterministicResult, FixtureConfig, ToolCallRecord, TaskResult, RunnerOptions, FailureCategory, JudgeScore, JudgeOptions, CombinedScore, SessionLogEntry, MetricsData, SessionLog, EvaluationSummary, FailureBreakdown, ReportMetadata, EvaluationReport, } from './types.js';
|
|
10
|
+
export { loadConfig, loadConfigSync, getDefaultWeights, DEFAULT_CONFIG } from './config.js';
|
|
11
|
+
export type { EvalConfig } from './config.js';
|
|
12
|
+
export { parseEvalFile, createEvalTemplate, validateEvalFile } from './parser.js';
|
|
13
|
+
export { SkillEvalRunner } from './runner/runner.js';
|
|
14
|
+
export { setupLocalSkills, cleanupLocalSkills } from './runner/skill-setup.js';
|
|
15
|
+
export { createToolPolicy } from './runner/security.js';
|
|
16
|
+
export { scoreTask, scoreAll } from './scorer/scorer.js';
|
|
17
|
+
export { scoreDeterministic } from './scorer/deterministic.js';
|
|
18
|
+
export { SkillJudge } from './scorer/judge.js';
|
|
19
|
+
export { SessionLogger } from './session/session-logger.js';
|
|
20
|
+
export { generateReport, generateJsonResults, computeSummary, computeFailureBreakdown } from './report/report.js';
|
|
21
|
+
export { generateGitHubSummary, writeGitHubSummary } from './report/github-summary.js';
|
|
22
|
+
export { runPipeline, scorePipeline } from './pipeline.js';
|
|
23
|
+
export type { PipelineOptions, PipelineResult } from './pipeline.js';
|
|
24
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/index.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAGH,YAAY,EACV,YAAY,EACZ,QAAQ,EACR,YAAY,EACZ,eAAe,EACf,kBAAkB,EAClB,mBAAmB,EACnB,aAAa,EACb,cAAc,EACd,UAAU,EACV,aAAa,EACb,eAAe,EACf,UAAU,EACV,YAAY,EACZ,aAAa,EACb,eAAe,EACf,WAAW,EACX,UAAU,EACV,iBAAiB,EACjB,gBAAgB,EAChB,cAAc,EACd,gBAAgB,GACjB,MAAM,YAAY,CAAC;AAGpB,OAAO,EAAE,UAAU,EAAE,cAAc,EAAE,iBAAiB,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAC5F,YAAY,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AAG9C,OAAO,EAAE,aAAa,EAAE,kBAAkB,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC;AAGlF,OAAO,EAAE,eAAe,EAAE,MAAM,oBAAoB,CAAC;AACrD,OAAO,EAAE,gBAAgB,EAAE,kBAAkB,EAAE,MAAM,yBAAyB,CAAC;AAC/E,OAAO,EAAE,gBAAgB,EAAE,MAAM,sBAAsB,CAAC;AAGxD,OAAO,EAAE,SAAS,EAAE,QAAQ,EAAE,MAAM,oBAAoB,CAAC;AACzD,OAAO,EAAE,kBAAkB,EAAE,MAAM,2BAA2B,CAAC;AAC/D,OAAO,EAAE,UAAU,EAAE,MAAM,mBAAmB,CAAC;AAG/C,OAAO,EAAE,aAAa,EAAE,MAAM,6BAA6B,CAAC;AAG5D,OAAO,EAAE,cAAc,EAAE,mBAAmB,EAAE,cAAc,EAAE,uBAAuB,EAAE,MAAM,oBAAoB,CAAC;AAClH,OAAO,EAAE,qBAAqB,EAAE,kBAAkB,EAAE,MAAM,4BAA4B,CAAC;AAGvF,OAAO,EAAE,WAAW,EAAE,aAAa,EAAE,MAAM,eAAe,CAAC;AAC3D,YAAY,EAAE,eAAe,EAAE,cAAc,EAAE,MAAM,eAAe,CAAC"}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Skill evaluation framework.
|
|
3
|
+
*
|
|
4
|
+
* Provides tools for evaluating AI agent skill discoverability,
|
|
5
|
+
* adherence, and output quality.
|
|
6
|
+
*
|
|
7
|
+
* @packageDocumentation
|
|
8
|
+
*/
|
|
9
|
+
// Config
|
|
10
|
+
export { loadConfig, loadConfigSync, getDefaultWeights, DEFAULT_CONFIG } from './config.js';
|
|
11
|
+
// Parser
|
|
12
|
+
export { parseEvalFile, createEvalTemplate, validateEvalFile } from './parser.js';
|
|
13
|
+
// Runner
|
|
14
|
+
export { SkillEvalRunner } from './runner/runner.js';
|
|
15
|
+
export { setupLocalSkills, cleanupLocalSkills } from './runner/skill-setup.js';
|
|
16
|
+
export { createToolPolicy } from './runner/security.js';
|
|
17
|
+
// Scorer
|
|
18
|
+
export { scoreTask, scoreAll } from './scorer/scorer.js';
|
|
19
|
+
export { scoreDeterministic } from './scorer/deterministic.js';
|
|
20
|
+
export { SkillJudge } from './scorer/judge.js';
|
|
21
|
+
// Session
|
|
22
|
+
export { SessionLogger } from './session/session-logger.js';
|
|
23
|
+
// Report
|
|
24
|
+
export { generateReport, generateJsonResults, computeSummary, computeFailureBreakdown } from './report/report.js';
|
|
25
|
+
export { generateGitHubSummary, writeGitHubSummary } from './report/github-summary.js';
|
|
26
|
+
// Pipeline
|
|
27
|
+
export { runPipeline, scorePipeline } from './pipeline.js';
|
|
28
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/index.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AA2BH,SAAS;AACT,OAAO,EAAE,UAAU,EAAE,cAAc,EAAE,iBAAiB,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAG5F,SAAS;AACT,OAAO,EAAE,aAAa,EAAE,kBAAkB,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC;AAElF,SAAS;AACT,OAAO,EAAE,eAAe,EAAE,MAAM,oBAAoB,CAAC;AACrD,OAAO,EAAE,gBAAgB,EAAE,kBAAkB,EAAE,MAAM,yBAAyB,CAAC;AAC/E,OAAO,EAAE,gBAAgB,EAAE,MAAM,sBAAsB,CAAC;AAExD,SAAS;AACT,OAAO,EAAE,SAAS,EAAE,QAAQ,EAAE,MAAM,oBAAoB,CAAC;AACzD,OAAO,EAAE,kBAAkB,EAAE,MAAM,2BAA2B,CAAC;AAC/D,OAAO,EAAE,UAAU,EAAE,MAAM,mBAAmB,CAAC;AAE/C,UAAU;AACV,OAAO,EAAE,aAAa,EAAE,MAAM,6BAA6B,CAAC;AAE5D,SAAS;AACT,OAAO,EAAE,cAAc,EAAE,mBAAmB,EAAE,cAAc,EAAE,uBAAuB,EAAE,MAAM,oBAAoB,CAAC;AAClH,OAAO,EAAE,qBAAqB,EAAE,kBAAkB,EAAE,MAAM,4BAA4B,CAAC;AAEvF,WAAW;AACX,OAAO,EAAE,WAAW,EAAE,aAAa,EAAE,MAAM,eAAe,CAAC"}
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* YAML parser for skill evaluation task files.
|
|
3
|
+
*
|
|
4
|
+
* Supports enriched YAML schema with:
|
|
5
|
+
* - defaults block (shared criteria, expected_skill_load)
|
|
6
|
+
* - deterministic block (marker-based checks, tool call expectations)
|
|
7
|
+
* - fixture block (setup/teardown scripts per task)
|
|
8
|
+
*/
|
|
9
|
+
import type { SkillEvaluation } from './types.js';
|
|
10
|
+
/**
|
|
11
|
+
* Parse a YAML evaluation file into a SkillEvaluation object.
|
|
12
|
+
*/
|
|
13
|
+
export declare function parseEvalFile(filePath: string): Promise<SkillEvaluation>;
|
|
14
|
+
/**
|
|
15
|
+
* Validate a YAML evaluation file and return any errors.
|
|
16
|
+
*/
|
|
17
|
+
export declare function validateEvalFile(filePath: string): Promise<string[]>;
|
|
18
|
+
/**
|
|
19
|
+
* Generate a YAML template for a new skill evaluation.
|
|
20
|
+
*/
|
|
21
|
+
export declare function createEvalTemplate(skillName: string, numTasks?: number): string;
|
|
22
|
+
//# sourceMappingURL=parser.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"parser.d.ts","sourceRoot":"","sources":["../../src/parser.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAIH,OAAO,KAAK,EAGV,eAAe,EAIhB,MAAM,YAAY,CAAC;AA0DpB;;GAEG;AACH,wBAAsB,aAAa,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,eAAe,CAAC,CAiB9E;AA0FD;;GAEG;AACH,wBAAsB,gBAAgB,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC,CA4D1E;AAMD;;GAEG;AACH,wBAAgB,kBAAkB,CAAC,SAAS,EAAE,MAAM,EAAE,QAAQ,SAAI,GAAG,MAAM,CAkC1E"}
|