coding-agent-benchmarks 0.1.3 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +61 -18
- package/dist/adapters/claudeCodeCLI.d.ts +4 -0
- package/dist/adapters/claudeCodeCLI.d.ts.map +1 -1
- package/dist/adapters/claudeCodeCLI.js +6 -0
- package/dist/adapters/claudeCodeCLI.js.map +1 -1
- package/dist/adapters/copilotCLI.d.ts +4 -0
- package/dist/adapters/copilotCLI.d.ts.map +1 -1
- package/dist/adapters/copilotCLI.js +6 -0
- package/dist/adapters/copilotCLI.js.map +1 -1
- package/dist/evaluator.d.ts +20 -2
- package/dist/evaluator.d.ts.map +1 -1
- package/dist/evaluator.js +50 -51
- package/dist/evaluator.js.map +1 -1
- package/dist/reporter.d.ts +43 -0
- package/dist/reporter.d.ts.map +1 -0
- package/dist/reporter.js +281 -0
- package/dist/reporter.js.map +1 -0
- package/dist/runner.js +80 -65
- package/dist/runner.js.map +1 -1
- package/dist/types.d.ts +12 -0
- package/dist/types.d.ts.map +1 -1
- package/dist/utils/baselineManager.d.ts +2 -2
- package/dist/utils/baselineManager.d.ts.map +1 -1
- package/dist/utils/baselineManager.js +2 -2
- package/dist/utils/baselineManager.js.map +1 -1
- package/package.json +21 -4
package/dist/reporter.js
ADDED
|
@@ -0,0 +1,281 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
3
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
4
|
+
};
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.ProgressReporter = void 0;
|
|
7
|
+
const chalk_1 = __importDefault(require("chalk"));
|
|
8
|
+
const log_update_1 = __importDefault(require("log-update"));
|
|
9
|
+
const SPINNER_FRAMES = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"];
|
|
10
|
+
/**
 * Reports evaluation progress on the console.
 *
 * When stdout is a TTY the reporter keeps a live, in-place frame (one line per
 * scenario, redrawn through log-update and animated by a spinner). Otherwise it
 * falls back to plain sequential `console.log` output.
 *
 * Typical call order: `start()` -> (`onScenarioStart()` ->
 * `onScenarioValidating()` -> `onScenarioComplete()`) per scenario -> `finish()`.
 */
class ProgressReporter {
    /**
     * @param {object} [options]
     * @param {boolean} [options.verbose] Print per-phase messages in non-interactive mode.
     * @param {boolean} [options.saveBaseline] Append a "baseline saved" note to each completed scenario.
     * @param {boolean} [options.compareBaseline] Append the baseline comparison note when a result carries one.
     * @param {string} [options.adapter] Adapter name used in the "baseline saved" note.
     */
    constructor(options = {}) {
        this.scenarios = new Map();
        this.scenarioOrder = [];
        this.currentIndex = 0;
        this.totalScenarios = 0;
        this.spinnerFrame = 0;
        this.startTime = 0;
        this.verboseBuffer = [];
        this.isInteractive = process.stdout.isTTY === true;
        this.verbose = options.verbose ?? false;
        this.saveBaseline = options.saveBaseline ?? false;
        this.compareBaseline = options.compareBaseline ?? false;
        this.adapter = options.adapter ?? 'unknown';
    }
    /**
     * Registers the scenarios of a run and prints the opening banner.
     * In interactive mode this also starts the spinner.
     *
     * @param {Array<{id: string}>} scenarios Scenarios in display order.
     */
    start(scenarios) {
        this.startTime = Date.now();
        this.totalScenarios = scenarios.length;
        this.scenarioOrder = scenarios.map((s) => s.id);
        this.currentIndex = 0;
        // Drop states left over from a previous run on the same reporter.
        this.scenarios.clear();
        for (const scenario of scenarios) {
            this.scenarios.set(scenario.id, {
                scenario,
                phase: "pending",
                status: "runs",
                notes: [],
            });
        }
        if (this.isInteractive) {
            console.log(`\nEvaluating ${scenarios.length} scenario(s)...\n`);
            this.startSpinner();
        }
        else {
            console.log(`Evaluating ${scenarios.length} scenario(s)...`);
        }
    }
    /**
     * Marks a scenario as generating and records its start time.
     * Unknown scenario ids are ignored.
     *
     * @param {string} scenarioId
     */
    onScenarioStart(scenarioId) {
        const state = this.scenarios.get(scenarioId);
        if (!state)
            return;
        state.phase = "generating";
        state.status = "runs";
        state.startTime = Date.now();
        state.notes = [];
        if (this.isInteractive) {
            this.render();
        }
        else {
            console.log(`\n[${this.getScenarioIndex(scenarioId)}/${this.totalScenarios}] ${scenarioId}`);
            if (this.verbose) {
                console.log(` Generating code...`);
            }
        }
    }
    /**
     * Marks a scenario as validating. Unknown scenario ids are ignored.
     *
     * @param {string} scenarioId
     */
    onScenarioValidating(scenarioId) {
        const state = this.scenarios.get(scenarioId);
        if (!state)
            return;
        state.phase = "validating";
        if (this.isInteractive) {
            this.render();
        }
        else if (this.verbose) {
            console.log(` Validating...`);
        }
    }
    /**
     * Records the result of a scenario and reports it.
     * A result with `error` counts as "skip"; otherwise `passed` decides
     * between "pass" and "fail". Unknown scenario ids are ignored.
     *
     * @param {string} scenarioId
     * @param {object} result Evaluation result (reads `passed`, `score`, `duration`,
     *   `error`, `violations`, `baselineComparison`).
     * @param {string} [model] Model name used in the "baseline saved" note.
     */
    onScenarioComplete(scenarioId, result, model) {
        const state = this.scenarios.get(scenarioId);
        if (!state)
            return;
        state.phase = "complete";
        state.result = result;
        state.status = result.error ? "skip" : result.passed ? "pass" : "fail";
        if (this.isInteractive) {
            // Baseline notes are drawn as part of the live frame. Writing them with
            // console.log while the frame is active would make the next redraw
            // erase them and leave a stale frame line behind.
            state.notes = this.collectBaselineNotes(result, model);
            this.render();
        }
        else {
            this.printScenarioResult(scenarioId, result, model);
        }
        this.flushVerboseBuffer();
    }
    /**
     * Emits a verbose message. In interactive mode the message is buffered until
     * the next flush so that it does not corrupt the live frame.
     *
     * @param {string} message
     */
    log(message) {
        if (this.isInteractive) {
            this.verboseBuffer.push(message);
        }
        else {
            console.log(message);
        }
    }
    /**
     * Stops the spinner, persists the final frame and prints the summary.
     *
     * @param {object} report Evaluation report (reads `summary`, `results`, `totalDuration`).
     */
    finish(report) {
        this.stopSpinner();
        if (this.isInteractive) {
            // Messages buffered after the last completed scenario would otherwise be lost.
            this.flushVerboseBuffer();
            log_update_1.default.done();
        }
        this.printSummary(report);
    }
    /**
     * @param {string} scenarioId
     * @returns {number} 1-based position of the scenario, or 0 when it is unknown.
     */
    getScenarioIndex(scenarioId) {
        return this.scenarioOrder.indexOf(scenarioId) + 1;
    }
    /** Starts the spinner animation (interactive mode only). */
    startSpinner() {
        if (!this.isInteractive)
            return;
        // Never keep two timers alive when start() is called more than once.
        this.stopSpinner();
        this.spinnerInterval = setInterval(() => {
            this.spinnerFrame = (this.spinnerFrame + 1) % SPINNER_FRAMES.length;
            this.render();
        }, 80);
    }
    /** Stops the spinner animation if it is running. */
    stopSpinner() {
        if (this.spinnerInterval) {
            clearInterval(this.spinnerInterval);
            this.spinnerInterval = undefined;
        }
    }
    /** Redraws the live frame: one line per scenario followed by its notes. */
    render() {
        if (!this.isInteractive)
            return;
        const lines = [];
        for (const scenarioId of this.scenarioOrder) {
            const state = this.scenarios.get(scenarioId);
            if (!state)
                continue;
            lines.push(this.formatScenarioLine(state));
            if (state.notes && state.notes.length > 0) {
                lines.push(...state.notes);
            }
        }
        (0, log_update_1.default)(lines.join("\n"));
    }
    /**
     * Builds the frame line of one scenario.
     *
     * @param {object} state Scenario state created by `start()`.
     * @returns {string}
     */
    formatScenarioLine(state) {
        const chalk = chalk_1.default;
        const index = this.getScenarioIndex(state.scenario.id);
        const prefix = `[${index}/${this.totalScenarios}]`;
        const id = state.scenario.id;
        switch (state.status) {
            case "runs": {
                if (state.phase === "pending") {
                    // Not started yet: no spinner and no misleading phase label.
                    return `${chalk.dim("·")} ${prefix} ${id} ${chalk.dim("pending")}`;
                }
                const spinner = SPINNER_FRAMES[this.spinnerFrame];
                const phaseText = state.phase === "generating" ? "generating..." : "validating...";
                const elapsed = state.startTime
                    ? this.formatDuration(Date.now() - state.startTime)
                    : "";
                return `${chalk.yellow(spinner)} ${prefix} ${id} ${chalk.dim(phaseText)} ${chalk.dim(elapsed)}`;
            }
            case "pass":
            case "fail": {
                const passed = state.status === "pass";
                const color = passed ? "green" : "red";
                const score = state.result?.score?.toFixed(2) ?? "0.00";
                const duration = state.result
                    ? this.formatDuration(state.result.duration)
                    : "";
                return `${chalk[color](passed ? "✓" : "✗")} ${prefix} ${id} ${chalk[color](passed ? "PASS" : "FAIL")} ${chalk.dim(`(score: ${score})`)} ${chalk.dim(duration)}`;
            }
            case "skip": {
                return `${chalk.yellow("○")} ${prefix} ${id} ${chalk.yellow("SKIP")} ${chalk.dim("(error)")}`;
            }
            default:
                return `${prefix} ${id}`;
        }
    }
    /**
     * Prints the result of one scenario in non-interactive mode.
     * Errored results are reported as skipped, matching the status assigned in
     * `onScenarioComplete()` and the interactive rendering.
     *
     * @param {string} scenarioId
     * @param {object} result
     * @param {string} [model]
     */
    printScenarioResult(scenarioId, result, model) {
        const score = typeof result.score === "number" ? result.score.toFixed(2) : "0.00";
        if (result.error) {
            console.log(` ○ SKIPPED (error)`);
            this.printViolations(result.violations);
            console.log(` Error: ${result.error}`);
        }
        else if (result.passed) {
            console.log(` ✓ PASSED (score: ${score})`);
        }
        else {
            console.log(` ✗ FAILED (score: ${score})`);
            this.printViolations(result.violations);
        }
        for (const note of this.collectBaselineNotes(result, model)) {
            console.log(note);
        }
    }
    /**
     * Prints a numbered list of violations; prints nothing when there are none.
     *
     * @param {Array<object>} [violations] Entries read: `type`, `message`, `file`, `line`, `details`.
     */
    printViolations(violations) {
        if (!violations || violations.length === 0)
            return;
        console.log(` ${violations.length} violation(s):\n`);
        violations.forEach((v, idx) => {
            console.log(` ${idx + 1}. [${v.type}] ${v.message}`);
            if (v.file) {
                console.log(` File: ${v.file}${v.line ? `:${v.line}` : ""}`);
            }
            if (v.details) {
                console.log(` Details: ${v.details}`);
            }
        });
    }
    /**
     * Writes buffered verbose messages above the live frame, then redraws it.
     * No-op in non-interactive mode or when nothing is buffered.
     */
    flushVerboseBuffer() {
        if (!this.isInteractive || this.verboseBuffer.length === 0)
            return;
        log_update_1.default.clear();
        this.verboseBuffer.forEach((message) => console.log(message));
        this.verboseBuffer = [];
        this.render();
    }
    /**
     * Prints the final summary. In interactive mode the details of every
     * scenario that did not pass are printed first, because the live frame only
     * shows one line per scenario.
     *
     * @param {object} report
     */
    printSummary(report) {
        const { summary, totalDuration } = report;
        if (this.isInteractive) {
            for (const result of report.results) {
                if (result.passed && !result.error)
                    continue;
                const marker = result.error
                    ? chalk_1.default.yellow("○")
                    : chalk_1.default.red("✗");
                console.log(`\n${marker} ${result.scenario.id}`);
                this.printViolations(result.violations);
                if (result.error) {
                    console.log(` Error: ${result.error}`);
                }
            }
        }
        const rule = "=".repeat(60);
        console.log("\n" + rule);
        console.log("EVALUATION SUMMARY");
        console.log(rule);
        console.log(`Total scenarios: ${summary.total}`);
        console.log(`Passed: ${this.colorize("green", summary.passed.toString())}`);
        console.log(`Failed: ${this.colorize("red", summary.failed.toString())}`);
        console.log(`Skipped: ${this.colorize("yellow", summary.skipped.toString())}`);
        console.log(`Average score: ${summary.averageScore.toFixed(2)}`);
        console.log(`Total violations: ${summary.totalViolations}`);
        console.log(`Total duration: ${(totalDuration / 1000).toFixed(1)}s`);
        console.log(rule);
    }
    /**
     * Formats a duration given in milliseconds as "Nms", "N.Ns" or "Nm Ns".
     *
     * @param {number} ms
     * @returns {string} Formatted duration, or "" when `ms` is not a finite number.
     */
    formatDuration(ms) {
        if (!Number.isFinite(ms))
            return "";
        if (ms < 1000)
            return `${ms}ms`;
        if (ms < 60000)
            return `${(ms / 1000).toFixed(1)}s`;
        // Round once, then split. Rounding the seconds remainder on its own can
        // produce "1m 60s" (e.g. for 119600 ms).
        const totalSeconds = Math.round(ms / 1000);
        const minutes = Math.floor(totalSeconds / 60);
        const seconds = totalSeconds % 60;
        return `${minutes}m ${seconds}s`;
    }
    /**
     * Formats the comparison of a score against its baseline.
     * A baseline of 0 is reported as an absolute delta because a percentage
     * would divide by zero.
     *
     * @param {{baselineScore: number, delta: number, isImprovement: boolean}} comparison
     * @returns {string}
     */
    formatBaselineComparison(comparison) {
        if (comparison.baselineScore === 0) {
            const up = comparison.delta >= 0;
            const color = up ? "green" : "red";
            const sign = up ? '+' : '';
            const arrow = this.colorize(color, up ? '↑' : '↓');
            const text = this.colorize(color, `${sign}${comparison.delta.toFixed(2)} from baseline (0.00)`);
            return ` ${arrow} ${text}`;
        }
        const percentage = (comparison.delta / comparison.baselineScore) * 100;
        const percentStr = Math.abs(percentage).toFixed(1);
        if (comparison.isImprovement) {
            return ` ${this.colorize("green", '↑')} ${this.colorize("green", `+${percentStr}% improvement from baseline`)}`;
        }
        return ` ${this.colorize("red", '↓')} ${this.colorize("red", `-${percentStr}% regression from baseline`)}`;
    }
    /**
     * Formats the "baseline saved" note.
     *
     * @param {string} [model] Omitted from the note when not provided.
     * @returns {string}
     */
    formatBaselineSave(model) {
        const path = model == null ? `${this.adapter}` : `${this.adapter}/${model}`;
        return ` → Baseline saved (${path})`;
    }
    /**
     * Collects the baseline notes that apply to a result, in display order.
     *
     * @param {object} result
     * @param {string} [model]
     * @returns {string[]}
     */
    collectBaselineNotes(result, model) {
        const notes = [];
        if (this.compareBaseline && result.baselineComparison) {
            notes.push(this.formatBaselineComparison(result.baselineComparison));
        }
        if (this.saveBaseline) {
            notes.push(this.formatBaselineSave(model));
        }
        return notes;
    }
    /**
     * Applies a chalk colour in interactive mode and returns the text unchanged otherwise.
     *
     * @param {string} color Name of a chalk colour function, e.g. "green".
     * @param {string} text
     * @returns {string}
     */
    colorize(color, text) {
        return this.isInteractive ? chalk_1.default[color](text) : text;
    }
}
|
|
280
|
+
exports.ProgressReporter = ProgressReporter;
|
|
281
|
+
//# sourceMappingURL=reporter.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"reporter.js","sourceRoot":"","sources":["../src/reporter.ts"],"names":[],"mappings":";;;;;;AAAA,kDAA0B;AAC1B,4DAAmC;AAwBnC,MAAM,cAAc,GAAG,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC;AAE1E,MAAa,gBAAgB;IAe3B,YAAY,UAKR,EAAE;QAjBE,cAAS,GAA+B,IAAI,GAAG,EAAE,CAAC;QAClD,kBAAa,GAAa,EAAE,CAAC;QAC7B,iBAAY,GAAW,CAAC,CAAC;QACzB,mBAAc,GAAW,CAAC,CAAC;QAC3B,iBAAY,GAAW,CAAC,CAAC;QAEzB,cAAS,GAAW,CAAC,CAAC;QACtB,kBAAa,GAAa,EAAE,CAAC;QAWnC,IAAI,CAAC,aAAa,GAAG,OAAO,CAAC,MAAM,CAAC,KAAK,KAAK,IAAI,CAAC;QACnD,IAAI,CAAC,OAAO,GAAG,OAAO,CAAC,OAAO,IAAI,KAAK,CAAC;QACxC,IAAI,CAAC,YAAY,GAAG,OAAO,CAAC,YAAY,IAAI,KAAK,CAAC;QAClD,IAAI,CAAC,eAAe,GAAG,OAAO,CAAC,eAAe,IAAI,KAAK,CAAC;QACxD,IAAI,CAAC,OAAO,GAAG,OAAO,CAAC,OAAO,IAAI,SAAS,CAAC;IAC9C,CAAC;IAED,KAAK,CAAC,SAAyB;QAC7B,IAAI,CAAC,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QAC5B,IAAI,CAAC,cAAc,GAAG,SAAS,CAAC,MAAM,CAAC;QACvC,IAAI,CAAC,aAAa,GAAG,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;QAChD,IAAI,CAAC,YAAY,GAAG,CAAC,CAAC;QAEtB,KAAK,MAAM,QAAQ,IAAI,SAAS,EAAE,CAAC;YACjC,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,QAAQ,CAAC,EAAE,EAAE;gBAC9B,QAAQ;gBACR,KAAK,EAAE,SAAS;gBAChB,MAAM,EAAE,MAAM;aACf,CAAC,CAAC;QACL,CAAC;QAED,IAAI,IAAI,CAAC,aAAa,EAAE,CAAC;YACvB,OAAO,CAAC,GAAG,CAAC,gBAAgB,SAAS,CAAC,MAAM,mBAAmB,CAAC,CAAC;YACjE,IAAI,CAAC,YAAY,EAAE,CAAC;QACtB,CAAC;aAAM,CAAC;YACN,OAAO,CAAC,GAAG,CAAC,cAAc,SAAS,CAAC,MAAM,iBAAiB,CAAC,CAAC;QAC/D,CAAC;IACH,CAAC;IAED,eAAe,CAAC,UAAkB;QAChC,MAAM,KAAK,GAAG,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,UAAU,CAAC,CAAC;QAC7C,IAAI,CAAC,KAAK;YAAE,OAAO;QAEnB,KAAK,CAAC,KAAK,GAAG,YAAY,CAAC;QAC3B,KAAK,CAAC,MAAM,GAAG,MAAM,CAAC;QACtB,KAAK,CAAC,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QAE7B,IAAI,IAAI,CAAC,aAAa,EAAE,CAAC;YACvB,IAAI,CAAC,MAAM,EAAE,CAAC;QAChB,CAAC;aAAM,CAAC;YACN,OAAO,CAAC,GAAG,CACT,MAAM,IAAI,CAAC,gBAAgB,CAAC,UAAU,CAAC,IAAI,IAAI,CAAC,cAAc,KAAK,UAAU,EAAE,CAChF,CAAC;YACF,IAAI,IAAI,CAAC,OAAO,EAAE,CAAC;gBACjB,OAAO,CAAC,GAAG,CAAC,sBAAsB,CAAC,CAAC;YACtC,CAAC;QACH
,CAAC;IACH,CAAC;IAED,oBAAoB,CAAC,UAAkB;QACrC,MAAM,KAAK,GAAG,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,UAAU,CAAC,CAAC;QAC7C,IAAI,CAAC,KAAK;YAAE,OAAO;QAEnB,KAAK,CAAC,KAAK,GAAG,YAAY,CAAC;QAE3B,IAAI,IAAI,CAAC,aAAa,EAAE,CAAC;YACvB,IAAI,CAAC,MAAM,EAAE,CAAC;QAChB,CAAC;aAAM,IAAI,IAAI,CAAC,OAAO,EAAE,CAAC;YACxB,OAAO,CAAC,GAAG,CAAC,iBAAiB,CAAC,CAAC;QACjC,CAAC;IACH,CAAC;IAED,kBAAkB,CAAC,UAAkB,EAAE,MAAwB,EAAE,KAAa;QAC5E,MAAM,KAAK,GAAG,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,UAAU,CAAC,CAAC;QAC7C,IAAI,CAAC,KAAK;YAAE,OAAO;QAEnB,KAAK,CAAC,KAAK,GAAG,UAAU,CAAC;QACzB,KAAK,CAAC,MAAM,GAAG,MAAM,CAAC;QACtB,KAAK,CAAC,MAAM,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC;QAEvE,IAAI,IAAI,CAAC,aAAa,EAAE,CAAC;YACvB,IAAI,CAAC,MAAM,EAAE,CAAC;YAEd,IAAI,IAAI,CAAC,eAAe,IAAI,MAAM,CAAC,kBAAkB,EAAE,CAAC;gBACtD,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,wBAAwB,CAAC,MAAM,CAAC,kBAAkB,CAAC,CAAC,CAAC;YACxE,CAAC;YACD,IAAI,IAAI,CAAC,YAAY,EAAE,CAAC;gBACtB,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,kBAAkB,CAAC,KAAK,CAAC,CAAC,CAAC;YAC9C,CAAC;QACH,CAAC;aAAM,CAAC;YACN,IAAI,CAAC,mBAAmB,CAAC,UAAU,EAAE,MAAM,EAAE,KAAK,CAAC,CAAC;QACtD,CAAC;QAED,IAAI,CAAC,kBAAkB,EAAE,CAAC;IAC5B,CAAC;IAED,GAAG,CAAC,OAAe;QACjB,IAAI,IAAI,CAAC,aAAa,EAAE,CAAC;YACvB,IAAI,CAAC,aAAa,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QACnC,CAAC;aAAM,CAAC;YACN,OAAO,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;QACvB,CAAC;IACH,CAAC;IAED,MAAM,CAAC,MAAwB;QAC7B,IAAI,CAAC,WAAW,EAAE,CAAC;QAEnB,IAAI,IAAI,CAAC,aAAa,EAAE,CAAC;YACvB,oBAAS,CAAC,IAAI,EAAE,CAAC;QACnB,CAAC;QAED,IAAI,CAAC,YAAY,CAAC,MAAM,CAAC,CAAC;IAC5B,CAAC;IAEO,gBAAgB,CAAC,UAAkB;QACzC,OAAO,IAAI,CAAC,aAAa,CAAC,OAAO,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC;IACpD,CAAC;IAEO,YAAY;QAClB,IAAI,CAAC,IAAI,CAAC,aAAa;YAAE,OAAO;QAEhC,IAAI,CAAC,eAAe,GAAG,WAAW,CAAC,GAAG,EAAE;YACtC,IAAI,CAAC,YAAY,GAAG,CAAC,IAAI,CAAC,YAAY,GAAG,CAAC,CAAC,GAAG,cAAc,CAAC,MAAM,CAAC;YACpE,IAAI,CAAC,MAAM,EAAE,CAAC;QAChB,CAAC,EAAE,EAAE,CAAC,CAAC;IACT,CAAC;IAEO,WAAW;QACjB,IAAI,IAAI,CAAC,eAAe,EAAE,CAAC;YACzB,aAAa,CAAC,IAAI,CAAC,eAAe,CAAC,CAAC;YACpC,IAAI,CAAC,eAAe,G
AAG,SAAS,CAAC;QACnC,CAAC;IACH,CAAC;IAEO,MAAM;QACZ,IAAI,CAAC,IAAI,CAAC,aAAa;YAAE,OAAO;QAEhC,MAAM,KAAK,GAAa,EAAE,CAAC;QAE3B,KAAK,MAAM,UAAU,IAAI,IAAI,CAAC,aAAa,EAAE,CAAC;YAC5C,MAAM,KAAK,GAAG,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,UAAU,CAAC,CAAC;YAC7C,IAAI,CAAC,KAAK;gBAAE,SAAS;YAErB,MAAM,IAAI,GAAG,IAAI,CAAC,kBAAkB,CAAC,KAAK,CAAC,CAAC;YAC5C,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACnB,CAAC;QAED,IAAA,oBAAS,EAAC,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC;IAC9B,CAAC;IAEO,kBAAkB,CAAC,KAAoB;QAC7C,MAAM,KAAK,GAAG,IAAI,CAAC,gBAAgB,CAAC,KAAK,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAC;QACvD,MAAM,MAAM,GAAG,IAAI,KAAK,IAAI,IAAI,CAAC,cAAc,GAAG,CAAC;QAEnD,QAAQ,KAAK,CAAC,MAAM,EAAE,CAAC;YACrB,KAAK,MAAM,CAAC,CAAC,CAAC;gBACZ,MAAM,OAAO,GAAG,cAAc,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;gBAClD,MAAM,SAAS,GACb,KAAK,CAAC,KAAK,KAAK,YAAY,CAAC,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,eAAe,CAAC;gBACnE,MAAM,OAAO,GAAG,KAAK,CAAC,SAAS;oBAC7B,CAAC,CAAC,IAAI,CAAC,cAAc,CAAC,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK,CAAC,SAAS,CAAC;oBACnD,CAAC,CAAC,EAAE,CAAC;gBACP,OAAO,GAAG,eAAK,CAAC,MAAM,CAAC,OAAO,CAAC,IAAI,MAAM,IAAI,KAAK,CAAC,QAAQ,CAAC,EAAE,IAAI,eAAK,CAAC,GAAG,CAAC,SAAS,CAAC,IAAI,eAAK,CAAC,GAAG,CAAC,OAAO,CAAC,EAAE,CAAC;YACjH,CAAC;YACD,KAAK,MAAM,CAAC,CAAC,CAAC;gBACZ,MAAM,KAAK,GAAG,KAAK,CAAC,MAAM,EAAE,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,MAAM,CAAC;gBACvD,MAAM,QAAQ,GAAG,KAAK,CAAC,MAAM;oBAC3B,CAAC,CAAC,IAAI,CAAC,cAAc,CAAC,KAAK,CAAC,MAAM,CAAC,QAAQ,CAAC;oBAC5C,CAAC,CAAC,EAAE,CAAC;gBACP,OAAO,GAAG,eAAK,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,MAAM,IAAI,KAAK,CAAC,QAAQ,CAAC,EAAE,IAAI,eAAK,CAAC,KAAK,CAAC,MAAM,CAAC,IAAI,eAAK,CAAC,GAAG,CAAC,WAAW,KAAK,GAAG,CAAC,IAAI,eAAK,CAAC,GAAG,CAAC,QAAQ,CAAC,EAAE,CAAC;YAC9I,CAAC;YACD,KAAK,MAAM,CAAC,CAAC,CAAC;gBACZ,MAAM,KAAK,GAAG,KAAK,CAAC,MAAM,EAAE,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,MAAM,CAAC;gBACvD,MAAM,QAAQ,GAAG,KAAK,CAAC,MAAM;oBAC3B,CAAC,CAAC,IAAI,CAAC,cAAc,CAAC,KAAK,CAAC,MAAM,CAAC,QAAQ,CAAC;oBAC5C,CAAC,CAAC,EAAE,CAAC;gBACP,OAAO,GAAG,eAAK,CAAC,GAAG,CAAC,GAAG,CAAC,IAAI,MAAM,IAAI,KAAK,CAAC,QAAQ,CAAC,EAAE,IAAI,eAAK,CAAC,GAAG,CAAC,MAAM,CAAC,IAAI,eAAK,CAAC,G
AAG,CAAC,WAAW,KAAK,GAAG,CAAC,IAAI,eAAK,CAAC,GAAG,CAAC,QAAQ,CAAC,EAAE,CAAC;YAC1I,CAAC;YACD,KAAK,MAAM,CAAC,CAAC,CAAC;gBACZ,OAAO,GAAG,eAAK,CAAC,MAAM,CAAC,GAAG,CAAC,IAAI,MAAM,IAAI,KAAK,CAAC,QAAQ,CAAC,EAAE,IAAI,eAAK,CAAC,MAAM,CAAC,MAAM,CAAC,IAAI,eAAK,CAAC,GAAG,CAAC,SAAS,CAAC,EAAE,CAAC;YAC/G,CAAC;QACH,CAAC;IACH,CAAC;IAEO,mBAAmB,CACzB,UAAkB,EAClB,MAAwB,EACxB,KAAa;QAEb,IAAI,MAAM,CAAC,MAAM,EAAE,CAAC;YAClB,OAAO,CAAC,GAAG,CAAC,sBAAsB,MAAM,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC;QAChE,CAAC;aAAM,CAAC;YACN,OAAO,CAAC,GAAG,CAAC,sBAAsB,MAAM,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC;YAC9D,IAAI,CAAC,eAAe,CAAC,MAAM,CAAC,UAAU,CAAC,CAAC;YACxC,IAAI,MAAM,CAAC,KAAK,EAAE,CAAC;gBACjB,OAAO,CAAC,GAAG,CAAC,cAAc,MAAM,CAAC,KAAK,EAAE,CAAC,CAAC;YAC5C,CAAC;QACH,CAAC;QAED,IAAI,IAAI,CAAC,eAAe,IAAI,MAAM,CAAC,kBAAkB,EAAE,CAAC;YACtD,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,wBAAwB,CAAC,MAAM,CAAC,kBAAkB,CAAC,CAAC,CAAC;QACxE,CAAC;QACD,IAAI,IAAI,CAAC,YAAY,EAAE,CAAC;YACtB,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,kBAAkB,CAAC,KAAK,CAAC,CAAC,CAAC;QAC9C,CAAC;IACH,CAAC;IAEO,eAAe,CAAC,UAAuB;QAC7C,IAAI,UAAU,CAAC,MAAM,KAAK,CAAC;YAAE,OAAO;QAEpC,OAAO,CAAC,GAAG,CAAC,OAAO,UAAU,CAAC,MAAM,kBAAkB,CAAC,CAAC;QACxD,UAAU,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,GAAG,EAAE,EAAE;YAC5B,OAAO,CAAC,GAAG,CAAC,OAAO,GAAG,GAAG,CAAC,MAAM,CAAC,CAAC,IAAI,KAAK,CAAC,CAAC,OAAO,EAAE,CAAC,CAAC;YACxD,IAAI,CAAC,CAAC,IAAI,EAAE,CAAC;gBACX,OAAO,CAAC,GAAG,CAAC,gBAAgB,CAAC,CAAC,IAAI,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;YACrE,CAAC;YACD,IAAI,CAAC,CAAC,OAAO,EAAE,CAAC;gBACd,OAAO,CAAC,GAAG,CAAC,mBAAmB,CAAC,CAAC,OAAO,EAAE,CAAC,CAAC;YAC9C,CAAC;QACH,CAAC,CAAC,CAAC;IACL,CAAC;IAEO,kBAAkB;QACxB,IAAI,CAAC,IAAI,CAAC,aAAa,IAAI,IAAI,CAAC,aAAa,CAAC,MAAM,KAAK,CAAC;YAAE,OAAO;QAEnE,oBAAS,CAAC,KAAK,EAAE,CAAC;QAElB,IAAI,CAAC,aAAa,CAAC,OAAO,CAAC,CAAC,OAAO,EAAE,EAAE,CAAC,OAAO,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC;QAC9D,IAAI,CAAC,aAAa,GAAG,EAAE,CAAC;QAExB,IAAI,CAAC,MAAM,EAAE,CAAC;IAChB,CAAC;IAEO,YAAY,CAAC,MAAwB;QAC3C,MAAM,EAAE,OAAO,EAAE,a
AAa,EAAE,GAAG,MAAM,CAAC;QAE1C,IAAI,IAAI,CAAC,aAAa,EAAE,CAAC;YACvB,MAAM,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,MAAM,EAAE,EAAE;gBAChC,IAAI,CAAC,MAAM,CAAC,MAAM,EAAE,CAAC;oBACnB,OAAO,CAAC,GAAG,CAAC,KAAK,eAAK,CAAC,GAAG,CAAC,GAAG,CAAC,IAAI,MAAM,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC,CAAC;oBACzD,IAAI,CAAC,eAAe,CAAC,MAAM,CAAC,UAAU,CAAC,CAAC;oBACxC,IAAI,MAAM,CAAC,KAAK,EAAE,CAAC;wBACjB,OAAO,CAAC,GAAG,CAAC,YAAY,MAAM,CAAC,KAAK,EAAE,CAAC,CAAC;oBAC1C,CAAC;gBACH,CAAC;YACH,CAAC,CAAC,CAAC;QACL,CAAC;QAED,OAAO,CAAC,GAAG,CAAC,IAAI,GAAG,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC;QACnC,OAAO,CAAC,GAAG,CAAC,oBAAoB,CAAC,CAAC;QAClC,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC;QAC5B,OAAO,CAAC,GAAG,CAAC,oBAAoB,OAAO,CAAC,KAAK,EAAE,CAAC,CAAC;QACjD,OAAO,CAAC,GAAG,CACT,WAAW,IAAI,CAAC,aAAa,CAAC,CAAC,CAAC,eAAK,CAAC,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,QAAQ,EAAE,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,MAAM,EAAE,CAC1F,CAAC;QACF,OAAO,CAAC,GAAG,CACT,WAAW,IAAI,CAAC,aAAa,CAAC,CAAC,CAAC,eAAK,CAAC,GAAG,CAAC,OAAO,CAAC,MAAM,CAAC,QAAQ,EAAE,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,MAAM,EAAE,CACxF,CAAC;QACF,OAAO,CAAC,GAAG,CACT,YAAY,IAAI,CAAC,aAAa,CAAC,CAAC,CAAC,eAAK,CAAC,MAAM,CAAC,OAAO,CAAC,OAAO,CAAC,QAAQ,EAAE,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO,EAAE,CAC9F,CAAC;QACF,OAAO,CAAC,GAAG,CAAC,kBAAkB,OAAO,CAAC,YAAY,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;QACjE,OAAO,CAAC,GAAG,CAAC,qBAAqB,OAAO,CAAC,eAAe,EAAE,CAAC,CAAC;QAC5D,OAAO,CAAC,GAAG,CAAC,mBAAmB,CAAC,aAAa,GAAG,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC;QACrE,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC;IAC9B,CAAC;IAEO,cAAc,CAAC,EAAU;QAC/B,IAAI,EAAE,GAAG,IAAI;YAAE,OAAO,GAAG,EAAE,IAAI,CAAC;QAChC,IAAI,EAAE,GAAG,KAAK;YAAE,OAAO,GAAG,CAAC,EAAE,GAAG,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC;QACpD,MAAM,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,EAAE,GAAG,KAAK,CAAC,CAAC;QACvC,MAAM,OAAO,GAAG,CAAC,CAAC,EAAE,GAAG,KAAK,CAAC,GAAG,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;QACjD,OAAO,GAAG,OAAO,KAAK,OAAO,GAAG,CAAC;IACnC,CAAC;IAEO,wBAAwB,CAAC,UAIhC;QACC,IAAI,UAAU,CAAC,aAAa,KAAK,CAAC,EAAE,CAAC;YACnC,MAAM,IAAI,
GAAG,UAAU,CAAC,KAAK,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC;YAC9C,MAAM,KAAK,GAAG,UAAU,CAAC,KAAK,IAAI,CAAC;gBACjC,CAAC,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC,CAAC,eAAK,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC;gBAC/C,CAAC,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC,CAAC,eAAK,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC;YAChD,MAAM,IAAI,GAAG,IAAI,CAAC,aAAa;gBAC7B,CAAC,CAAC,CAAC,UAAU,CAAC,KAAK,IAAI,CAAC;oBACtB,CAAC,CAAC,eAAK,CAAC,KAAK,CAAC,GAAG,IAAI,GAAG,UAAU,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,uBAAuB,CAAC;oBAC3E,CAAC,CAAC,eAAK,CAAC,GAAG,CAAC,GAAG,IAAI,GAAG,UAAU,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,uBAAuB,CAAC,CAAC;gBAC5E,CAAC,CAAC,GAAG,IAAI,GAAG,UAAU,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,uBAAuB,CAAC;YACjE,OAAO,OAAO,KAAK,IAAI,IAAI,EAAE,CAAC;QAChC,CAAC;QAED,MAAM,UAAU,GAAG,CAAC,UAAU,CAAC,KAAK,GAAG,UAAU,CAAC,aAAa,CAAC,GAAG,GAAG,CAAC;QACvE,MAAM,UAAU,GAAG,IAAI,CAAC,GAAG,CAAC,UAAU,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;QAEnD,IAAI,UAAU,CAAC,aAAa,EAAE,CAAC;YAC7B,MAAM,KAAK,GAAG,IAAI,CAAC,aAAa,CAAC,CAAC,CAAC,eAAK,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC;YAC1D,MAAM,IAAI,GAAG,IAAI,CAAC,aAAa;gBAC7B,CAAC,CAAC,eAAK,CAAC,KAAK,CAAC,IAAI,UAAU,6BAA6B,CAAC;gBAC1D,CAAC,CAAC,IAAI,UAAU,6BAA6B,CAAC;YAChD,OAAO,OAAO,KAAK,IAAI,IAAI,EAAE,CAAC;QAChC,CAAC;aAAM,CAAC;YACN,MAAM,KAAK,GAAG,IAAI,CAAC,aAAa,CAAC,CAAC,CAAC,eAAK,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC;YACxD,MAAM,IAAI,GAAG,IAAI,CAAC,aAAa;gBAC7B,CAAC,CAAC,eAAK,CAAC,GAAG,CAAC,IAAI,UAAU,4BAA4B,CAAC;gBACvD,CAAC,CAAC,IAAI,UAAU,4BAA4B,CAAC;YAC/C,OAAO,OAAO,KAAK,IAAI,IAAI,EAAE,CAAC;QAChC,CAAC;IACH,CAAC;IAEO,kBAAkB,CAAC,KAAa;QACtC,MAAM,IAAI,GAAG,GAAG,IAAI,CAAC,OAAO,IAAI,KAAK,EAAE,CAAC;QACxC,OAAO,yBAAyB,IAAI,GAAG,CAAC;IAC1C,CAAC;CACF;AAtUD,4CAsUC"}
|
package/dist/runner.js
CHANGED
|
@@ -45,41 +45,62 @@ const copilotCLI_1 = require("./adapters/copilotCLI");
|
|
|
45
45
|
const claudeCodeCLI_1 = require("./adapters/claudeCodeCLI");
|
|
46
46
|
const llmJudge_1 = require("./validators/llmJudge");
|
|
47
47
|
const githubAuth_1 = require("./utils/githubAuth");
|
|
48
|
+
const reporter_1 = require("./reporter");
|
|
48
49
|
const program = new commander_1.Command();
|
|
49
50
|
program
|
|
50
|
-
.name(
|
|
51
|
-
.description(
|
|
52
|
-
.version(
|
|
51
|
+
.name("coding-agent-benchmarks")
|
|
52
|
+
.description("Evaluate coding agents against coding standards and best practices")
|
|
53
|
+
.version("0.1.0");
|
|
53
54
|
/**
|
|
54
55
|
* Evaluate command
|
|
55
56
|
*/
|
|
56
57
|
program
|
|
57
|
-
.command(
|
|
58
|
-
.description(
|
|
59
|
-
.option(
|
|
60
|
-
.option(
|
|
61
|
-
.option(
|
|
62
|
-
.option(
|
|
63
|
-
.option(
|
|
64
|
-
.option(
|
|
65
|
-
.option(
|
|
66
|
-
.option(
|
|
67
|
-
.option(
|
|
68
|
-
.option('--compare-baseline', 'Compare results with baseline')
|
|
69
|
-
.option('--workspace-root <path>', 'Workspace root directory')
|
|
58
|
+
.command("evaluate")
|
|
59
|
+
.description("Run benchmark evaluations")
|
|
60
|
+
.option("--scenario <pattern>", "Filter scenarios by ID pattern (supports wildcards)")
|
|
61
|
+
.option("--category <categories>", "Filter by category (comma-separated)")
|
|
62
|
+
.option("--tag <tags>", "Filter by tags (comma-separated)")
|
|
63
|
+
.option("--adapter <type>", "Code generation adapter (copilot or claude-code)", "copilot")
|
|
64
|
+
.option("--model <model>", "LLM model for judge (default: openai/gpt-4.1)")
|
|
65
|
+
.option("--threshold <number>", "Minimum passing score", "0.8")
|
|
66
|
+
.option("--verbose", "Show detailed output")
|
|
67
|
+
.option("--output <file>", "Export JSON report to file")
|
|
68
|
+
.option("--workspace-root <path>", "Workspace root directory")
|
|
70
69
|
.action(async (options) => {
|
|
71
70
|
try {
|
|
72
|
-
// Load configuration
|
|
73
71
|
const { config, scenarios } = await (0, loader_1.loadConfig)(options.workspaceRoot || process.cwd());
|
|
74
|
-
// Create evaluator
|
|
75
72
|
const evaluator = new evaluator_1.Evaluator({
|
|
76
73
|
adapter: options.adapter,
|
|
77
74
|
model: options.model,
|
|
78
75
|
workspaceRoot: options.workspaceRoot,
|
|
79
76
|
defaultTimeout: config.defaultTimeout,
|
|
80
77
|
verbose: options.verbose,
|
|
81
|
-
saveBaseline:
|
|
82
|
-
compareBaseline:
|
|
78
|
+
saveBaseline: config.saveBaseline,
|
|
79
|
+
compareBaseline: config.compareBaseline,
|
|
80
|
+
});
|
|
81
|
+
const reporter = new reporter_1.ProgressReporter({
|
|
82
|
+
verbose: options.verbose,
|
|
83
|
+
saveBaseline: config.saveBaseline,
|
|
84
|
+
compareBaseline: config.compareBaseline,
|
|
85
|
+
adapter: options.adapter,
|
|
86
|
+
});
|
|
87
|
+
evaluator.on("evaluation:start", (scenarioList) => {
|
|
88
|
+
reporter.start(scenarioList);
|
|
89
|
+
});
|
|
90
|
+
evaluator.on("scenario:start", (scenarioId) => {
|
|
91
|
+
reporter.onScenarioStart(scenarioId);
|
|
92
|
+
});
|
|
93
|
+
evaluator.on("scenario:generating", (scenarioId) => {
|
|
94
|
+
// Phase already set in onScenarioStart, but could be used for more granular updates
|
|
95
|
+
});
|
|
96
|
+
evaluator.on("scenario:validating", (scenarioId) => {
|
|
97
|
+
reporter.onScenarioValidating(scenarioId);
|
|
98
|
+
});
|
|
99
|
+
evaluator.on("scenario:complete", (scenarioId, result, model) => {
|
|
100
|
+
reporter.onScenarioComplete(scenarioId, result, model);
|
|
101
|
+
});
|
|
102
|
+
evaluator.on("log", (message) => {
|
|
103
|
+
reporter.log(message);
|
|
83
104
|
});
|
|
84
105
|
// Check adapter availability
|
|
85
106
|
const isAvailable = await evaluator.checkAdapterAvailability();
|
|
@@ -92,29 +113,21 @@ program
|
|
|
92
113
|
const filteredScenarios = evaluator.filterScenarios(scenarios, {
|
|
93
114
|
scenarioPattern: options.scenario,
|
|
94
115
|
category: options.category,
|
|
95
|
-
tags: options.tag
|
|
116
|
+
tags: options.tag
|
|
117
|
+
? options.tag.split(",").map((t) => t.trim())
|
|
118
|
+
: undefined,
|
|
96
119
|
});
|
|
97
120
|
if (filteredScenarios.length === 0) {
|
|
98
|
-
console.log(
|
|
121
|
+
console.log("No scenarios match the specified filters");
|
|
99
122
|
return;
|
|
100
123
|
}
|
|
101
124
|
// Run evaluation
|
|
102
125
|
const report = await evaluator.evaluate(filteredScenarios);
|
|
103
|
-
//
|
|
104
|
-
|
|
105
|
-
console.log('EVALUATION SUMMARY');
|
|
106
|
-
console.log('='.repeat(60));
|
|
107
|
-
console.log(`Total scenarios: ${report.summary.total}`);
|
|
108
|
-
console.log(`Passed: ${report.summary.passed}`);
|
|
109
|
-
console.log(`Failed: ${report.summary.failed}`);
|
|
110
|
-
console.log(`Skipped: ${report.summary.skipped}`);
|
|
111
|
-
console.log(`Average score: ${report.summary.averageScore.toFixed(2)}`);
|
|
112
|
-
console.log(`Total violations: ${report.summary.totalViolations}`);
|
|
113
|
-
console.log(`Total duration: ${(report.totalDuration / 1000).toFixed(1)}s`);
|
|
114
|
-
console.log('='.repeat(60));
|
|
126
|
+
// Finish reporter and display summary
|
|
127
|
+
reporter.finish(report);
|
|
115
128
|
// Export JSON report if requested
|
|
116
129
|
if (options.output) {
|
|
117
|
-
fs.writeFileSync(options.output, JSON.stringify(report, null, 2),
|
|
130
|
+
fs.writeFileSync(options.output, JSON.stringify(report, null, 2), "utf-8");
|
|
118
131
|
console.log(`\nReport exported to: ${options.output}`);
|
|
119
132
|
}
|
|
120
133
|
// Exit with error code if any scenarios failed
|
|
@@ -131,30 +144,32 @@ program
|
|
|
131
144
|
* List command
|
|
132
145
|
*/
|
|
133
146
|
program
|
|
134
|
-
.command(
|
|
135
|
-
.description(
|
|
136
|
-
.option(
|
|
137
|
-
.option(
|
|
147
|
+
.command("list")
|
|
148
|
+
.description("List available test scenarios")
|
|
149
|
+
.option("--category <categories>", "Filter by category")
|
|
150
|
+
.option("--tag <tags>", "Filter by tags (comma-separated)")
|
|
138
151
|
.action(async (options) => {
|
|
139
152
|
try {
|
|
140
153
|
const { scenarios } = await (0, loader_1.loadConfig)();
|
|
141
154
|
let filtered = scenarios;
|
|
142
155
|
// Filter by category
|
|
143
156
|
if (options.category) {
|
|
144
|
-
const categories = options.category
|
|
145
|
-
|
|
157
|
+
const categories = options.category
|
|
158
|
+
.split(",")
|
|
159
|
+
.map((c) => c.trim());
|
|
160
|
+
filtered = filtered.filter((s) => categories.includes(s.category));
|
|
146
161
|
}
|
|
147
162
|
// Filter by tags
|
|
148
163
|
if (options.tag) {
|
|
149
|
-
const tags = options.tag.split(
|
|
150
|
-
filtered = filtered.filter(s => tags.some((tag) => s.tags.includes(tag)));
|
|
164
|
+
const tags = options.tag.split(",").map((t) => t.trim());
|
|
165
|
+
filtered = filtered.filter((s) => tags.some((tag) => s.tags.includes(tag)));
|
|
151
166
|
}
|
|
152
167
|
console.log(`\nAvailable scenarios (${filtered.length}):\n`);
|
|
153
168
|
for (const scenario of filtered) {
|
|
154
169
|
console.log(` ${scenario.id}`);
|
|
155
170
|
console.log(` Category: ${scenario.category}`);
|
|
156
171
|
console.log(` Severity: ${scenario.severity}`);
|
|
157
|
-
console.log(` Tags: ${scenario.tags.join(
|
|
172
|
+
console.log(` Tags: ${scenario.tags.join(", ")}`);
|
|
158
173
|
console.log(` Description: ${scenario.description}`);
|
|
159
174
|
console.log();
|
|
160
175
|
}
|
|
@@ -168,34 +183,34 @@ program
|
|
|
168
183
|
* Check command
|
|
169
184
|
*/
|
|
170
185
|
program
|
|
171
|
-
.command(
|
|
172
|
-
.description(
|
|
186
|
+
.command("check")
|
|
187
|
+
.description("Check if coding agent CLIs and GitHub auth are available")
|
|
173
188
|
.action(async () => {
|
|
174
|
-
console.log(
|
|
189
|
+
console.log("Checking adapter availability...\n");
|
|
175
190
|
const adapters = [
|
|
176
|
-
{ name:
|
|
177
|
-
{ name:
|
|
191
|
+
{ name: "GitHub Copilot CLI", type: "copilot" },
|
|
192
|
+
{ name: "Claude Code CLI", type: "claude-code" },
|
|
178
193
|
];
|
|
179
194
|
for (const { name, type } of adapters) {
|
|
180
195
|
let adapter;
|
|
181
|
-
if (type ===
|
|
196
|
+
if (type === "copilot") {
|
|
182
197
|
adapter = new copilotCLI_1.CopilotCLIAdapter();
|
|
183
198
|
}
|
|
184
199
|
else {
|
|
185
200
|
adapter = new claudeCodeCLI_1.ClaudeCodeCLIAdapter();
|
|
186
201
|
}
|
|
187
202
|
const available = await adapter.checkAvailability();
|
|
188
|
-
const status = available ?
|
|
203
|
+
const status = available ? "✓ Available" : "✗ Not found";
|
|
189
204
|
console.log(` ${name}: ${status}`);
|
|
190
205
|
}
|
|
191
|
-
console.log(
|
|
206
|
+
console.log("\nChecking GitHub authentication...\n");
|
|
192
207
|
const authStatus = (0, githubAuth_1.checkGitHubAuth)();
|
|
193
|
-
const authIcon = authStatus.available ?
|
|
208
|
+
const authIcon = authStatus.available ? "✓" : "✗";
|
|
194
209
|
console.log(` ${authIcon} ${authStatus.message}`);
|
|
195
210
|
if (!authStatus.available) {
|
|
196
|
-
console.log(
|
|
197
|
-
console.log(
|
|
198
|
-
console.log(
|
|
211
|
+
console.log("\n 💡 GitHub token is required for LLM-as-judge validation");
|
|
212
|
+
console.log(" Setup: https://github.com/settings/tokens (scope: models:read)");
|
|
213
|
+
console.log(" Or install GitHub CLI: brew install gh && gh auth login");
|
|
199
214
|
}
|
|
200
215
|
console.log();
|
|
201
216
|
});
|
|
@@ -203,23 +218,23 @@ program
|
|
|
203
218
|
* Test LLM command
|
|
204
219
|
*/
|
|
205
220
|
program
|
|
206
|
-
.command(
|
|
207
|
-
.description(
|
|
208
|
-
.option(
|
|
221
|
+
.command("test-llm")
|
|
222
|
+
.description("Test LLM judge with a custom prompt")
|
|
223
|
+
.option("--model <model>", "LLM model to use (default: openai/gpt-4.1)")
|
|
209
224
|
.action(async (options) => {
|
|
210
225
|
try {
|
|
211
|
-
console.log(
|
|
212
|
-
console.log(
|
|
226
|
+
console.log("Testing LLM judge...\n");
|
|
227
|
+
console.log("Enter your prompt (Ctrl+D when done):\n");
|
|
213
228
|
// Read prompt from stdin
|
|
214
229
|
const chunks = [];
|
|
215
|
-
process.stdin.on(
|
|
230
|
+
process.stdin.on("data", (chunk) => {
|
|
216
231
|
chunks.push(chunk.toString());
|
|
217
232
|
});
|
|
218
|
-
process.stdin.on(
|
|
219
|
-
const prompt = chunks.join(
|
|
233
|
+
process.stdin.on("end", async () => {
|
|
234
|
+
const prompt = chunks.join("");
|
|
220
235
|
const validator = new llmJudge_1.LLMJudgeValidator(undefined, options.model);
|
|
221
236
|
const result = await validator.testJudge(prompt, options.model);
|
|
222
|
-
console.log(
|
|
237
|
+
console.log("\nLLM Response:\n");
|
|
223
238
|
console.log(result);
|
|
224
239
|
});
|
|
225
240
|
}
|
package/dist/runner.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"runner.js","sourceRoot":"","sources":["../src/runner.ts"],"names":[],"mappings":";;AAEA;;GAEG;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAEH,yCAAoC;AACpC,uCAAyB;AACzB,4CAA6C;AAC7C,2CAAwC;AAExC,sDAA0D;AAC1D,4DAAgE;AAChE,oDAA0D;AAC1D,mDAAqD;
|
|
1
|
+
{"version":3,"file":"runner.js","sourceRoot":"","sources":["../src/runner.ts"],"names":[],"mappings":";;AAEA;;GAEG;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAEH,yCAAoC;AACpC,uCAAyB;AACzB,4CAA6C;AAC7C,2CAAwC;AAExC,sDAA0D;AAC1D,4DAAgE;AAChE,oDAA0D;AAC1D,mDAAqD;AACrD,yCAA8C;AAE9C,MAAM,OAAO,GAAG,IAAI,mBAAO,EAAE,CAAC;AAE9B,OAAO;KACJ,IAAI,CAAC,yBAAyB,CAAC;KAC/B,WAAW,CACV,oEAAoE,CACrE;KACA,OAAO,CAAC,OAAO,CAAC,CAAC;AAEpB;;GAEG;AACH,OAAO;KACJ,OAAO,CAAC,UAAU,CAAC;KACnB,WAAW,CAAC,2BAA2B,CAAC;KACxC,MAAM,CACL,sBAAsB,EACtB,qDAAqD,CACtD;KACA,MAAM,CAAC,yBAAyB,EAAE,sCAAsC,CAAC;KACzE,MAAM,CAAC,cAAc,EAAE,kCAAkC,CAAC;KAC1D,MAAM,CACL,kBAAkB,EAClB,kDAAkD,EAClD,SAAS,CACV;KACA,MAAM,CAAC,iBAAiB,EAAE,+CAA+C,CAAC;KAC1E,MAAM,CAAC,sBAAsB,EAAE,uBAAuB,EAAE,KAAK,CAAC;KAC9D,MAAM,CAAC,WAAW,EAAE,sBAAsB,CAAC;KAC3C,MAAM,CAAC,iBAAiB,EAAE,4BAA4B,CAAC;KACvD,MAAM,CAAC,yBAAyB,EAAE,0BAA0B,CAAC;KAC7D,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;IACxB,IAAI,CAAC;QACH,MAAM,EAAE,MAAM,EAAE,SAAS,EAAE,GAAG,MAAM,IAAA,mBAAU,EAC5C,OAAO,CAAC,aAAa,IAAI,OAAO,CAAC,GAAG,EAAE,CACvC,CAAC;QAEF,MAAM,SAAS,GAAG,IAAI,qBAAS,CAAC;YAC9B,OAAO,EAAE,OAAO,CAAC,OAAsB;YACvC,KAAK,EAAE,OAAO,CAAC,KAAK;YACpB,aAAa,EAAE,OAAO,CAAC,aAAa;YACpC,cAAc,EAAE,MAAM,CAAC,cAAc;YACrC,OAAO,EAAE,OAAO,CAAC,OAAO;YACxB,YAAY,EAAE,MAAM,CAAC,YAAY;YACjC,eAAe,EAAE,MAAM,CAAC,eAAe;SACxC,CAAC,CAAC;QAEH,MAAM,QAAQ,GAAG,IAAI,2BAAgB,CAAC;YACpC,OAAO,EAAE,OAAO,CAAC,OAAO;YACxB,YAAY,EAAE,MAAM,CAAC,YAAY;YACjC,eAAe,EAAE,MAAM,CAAC,eAAe;YACvC,OAAO,EAAE,OAAO,CAAC,OAAO;SACzB,CAAC,CAAC;QAEH,SAAS,CAAC,EAAE,CAAC,kBAAkB,EAAE,CAAC,YAAY,EAAE,EAAE;YAChD,QAAQ,CAAC,KAAK,CAAC,YAAY,CAAC,CAAC;QAC/B,CAAC,CAAC,CAAC;QAEH,SAAS,CAAC,EAAE,CAAC,gBAAgB,EAAE,CAAC,UAAU,EAAE,EAAE;YAC5C,QAAQ,CAAC,eAAe,CAAC,UAAU,CAAC,CAAC;QACvC,CAAC,CAAC,CAAC;QAEH,SAAS,CAAC,EAAE,CAAC,qBAAqB,EAAE,CAAC,UAAU,EAAE,EAAE;YACjD,oFAAoF;QACtF,CAAC,CAAC,CAAC;QAEH,SAAS,CAAC,EAAE,CAAC,qBAAqB,EAAE,CAAC,UAAU,EAAE,EAAE;YACjD,QAAQ,CAAC,oBAAoB,CAAC,UAAU,CAAC,CAAC;QAC5C,CAAC,CAAC,CAAC;QAEH,SAAS,CAAC,EAAE,CAAC,mBAAmB,EAAE,CAAC,UAAU,EAAE,MAAM,EAAE,KAAK,EAAE,EAAE
;YAC9D,QAAQ,CAAC,kBAAkB,CAAC,UAAU,EAAE,MAAM,EAAE,KAAK,CAAC,CAAC;QACzD,CAAC,CAAC,CAAC;QAEH,SAAS,CAAC,EAAE,CAAC,KAAK,EAAE,CAAC,OAAO,EAAE,EAAE;YAC9B,QAAQ,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;QACxB,CAAC,CAAC,CAAC;QAEH,6BAA6B;QAC7B,MAAM,WAAW,GAAG,MAAM,SAAS,CAAC,wBAAwB,EAAE,CAAC;QAC/D,IAAI,CAAC,WAAW,EAAE,CAAC;YACjB,OAAO,CAAC,KAAK,CAAC,UAAU,OAAO,CAAC,OAAO,gBAAgB,CAAC,CAAC;YACzD,OAAO,CAAC,KAAK,CACX,kBAAkB,OAAO,CAAC,OAAO,0BAA0B,CAC5D,CAAC;YACF,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAClB,CAAC;QAED,mBAAmB;QACnB,MAAM,iBAAiB,GAAG,SAAS,CAAC,eAAe,CAAC,SAAS,EAAE;YAC7D,eAAe,EAAE,OAAO,CAAC,QAAQ;YACjC,QAAQ,EAAE,OAAO,CAAC,QAAQ;YAC1B,IAAI,EAAE,OAAO,CAAC,GAAG;gBACf,CAAC,CAAC,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAS,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;gBACrD,CAAC,CAAC,SAAS;SACd,CAAC,CAAC;QAEH,IAAI,iBAAiB,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACnC,OAAO,CAAC,GAAG,CAAC,0CAA0C,CAAC,CAAC;YACxD,OAAO;QACT,CAAC;QAED,iBAAiB;QACjB,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,QAAQ,CAAC,iBAAiB,CAAC,CAAC;QAE3D,sCAAsC;QACtC,QAAQ,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC;QAExB,kCAAkC;QAClC,IAAI,OAAO,CAAC,MAAM,EAAE,CAAC;YACnB,EAAE,CAAC,aAAa,CACd,OAAO,CAAC,MAAM,EACd,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,EAC/B,OAAO,CACR,CAAC;YACF,OAAO,CAAC,GAAG,CAAC,yBAAyB,OAAO,CAAC,MAAM,EAAE,CAAC,CAAC;QACzD,CAAC;QAED,+CAA+C;QAC/C,IAAI,MAAM,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,OAAO,GAAG,CAAC,EAAE,CAAC;YAC5D,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAClB,CAAC;IACH,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,OAAO,CAAC,KAAK,CAAC,UAAU,KAAK,EAAE,CAAC,CAAC;QACjC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;AACH,CAAC,CAAC,CAAC;AAEL;;GAEG;AACH,OAAO;KACJ,OAAO,CAAC,MAAM,CAAC;KACf,WAAW,CAAC,+BAA+B,CAAC;KAC5C,MAAM,CAAC,yBAAyB,EAAE,oBAAoB,CAAC;KACvD,MAAM,CAAC,cAAc,EAAE,kCAAkC,CAAC;KAC1D,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;IACxB,IAAI,CAAC;QACH,MAAM,EAAE,SAAS,EAAE,GAAG,MAAM,IAAA,mBAAU,GAAE,CAAC;QAEzC,IAAI,QAAQ,GAAG,SAAS,CAAC;QAEzB,qBAAqB;QACrB,IAAI,OAAO,CAAC,QAAQ,EAAE,CAAC;YACrB,MAAM,UAAU,GAAG,OAAO,CAAC,QAAQ;iBAChC,KAAK,CAAC,GAAG,CAAC;iBA
CV,GAAG,CAAC,CAAC,CAAS,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;YAChC,QAAQ,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,UAAU,CAAC,QAAQ,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC;QACrE,CAAC;QAED,iBAAiB;QACjB,IAAI,OAAO,CAAC,GAAG,EAAE,CAAC;YAChB,MAAM,IAAI,GAAG,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAS,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;YACjE,QAAQ,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAC/B,IAAI,CAAC,IAAI,CAAC,CAAC,GAAW,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC,CACjD,CAAC;QACJ,CAAC;QAED,OAAO,CAAC,GAAG,CAAC,0BAA0B,QAAQ,CAAC,MAAM,MAAM,CAAC,CAAC;QAE7D,KAAK,MAAM,QAAQ,IAAI,QAAQ,EAAE,CAAC;YAChC,OAAO,CAAC,GAAG,CAAC,KAAK,QAAQ,CAAC,EAAE,EAAE,CAAC,CAAC;YAChC,OAAO,CAAC,GAAG,CAAC,iBAAiB,QAAQ,CAAC,QAAQ,EAAE,CAAC,CAAC;YAClD,OAAO,CAAC,GAAG,CAAC,iBAAiB,QAAQ,CAAC,QAAQ,EAAE,CAAC,CAAC;YAClD,OAAO,CAAC,GAAG,CAAC,aAAa,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YACrD,OAAO,CAAC,GAAG,CAAC,oBAAoB,QAAQ,CAAC,WAAW,EAAE,CAAC,CAAC;YACxD,OAAO,CAAC,GAAG,EAAE,CAAC;QAChB,CAAC;IACH,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,OAAO,CAAC,KAAK,CAAC,UAAU,KAAK,EAAE,CAAC,CAAC;QACjC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;AACH,CAAC,CAAC,CAAC;AAEL;;GAEG;AACH,OAAO;KACJ,OAAO,CAAC,OAAO,CAAC;KAChB,WAAW,CAAC,0DAA0D,CAAC;KACvE,MAAM,CAAC,KAAK,IAAI,EAAE;IACjB,OAAO,CAAC,GAAG,CAAC,oCAAoC,CAAC,CAAC;IAElD,MAAM,QAAQ,GAA+C;QAC3D,EAAE,IAAI,EAAE,oBAAoB,EAAE,IAAI,EAAE,SAAS,EAAE;QAC/C,EAAE,IAAI,EAAE,iBAAiB,EAAE,IAAI,EAAE,aAAa,EAAE;KACjD,CAAC;IAEF,KAAK,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,QAAQ,EAAE,CAAC;QACtC,IAAI,OAAO,CAAC;QACZ,IAAI,IAAI,KAAK,SAAS,EAAE,CAAC;YACvB,OAAO,GAAG,IAAI,8BAAiB,EAAE,CAAC;QACpC,CAAC;aAAM,CAAC;YACN,OAAO,GAAG,IAAI,oCAAoB,EAAE,CAAC;QACvC,CAAC;QAED,MAAM,SAAS,GAAG,MAAM,OAAO,CAAC,iBAAiB,EAAE,CAAC;QACpD,MAAM,MAAM,GAAG,SAAS,CAAC,CAAC,CAAC,aAAa,CAAC,CAAC,CAAC,aAAa,CAAC;QACzD,OAAO,CAAC,GAAG,CAAC,KAAK,IAAI,KAAK,MAAM,EAAE,CAAC,CAAC;IACtC,CAAC;IAED,OAAO,CAAC,GAAG,CAAC,uCAAuC,CAAC,CAAC;IACrD,MAAM,UAAU,GAAG,IAAA,4BAAe,GAAE,CAAC;IACrC,MAAM,QAAQ,GAAG,UAAU,CAAC,
SAAS,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC;IAClD,OAAO,CAAC,GAAG,CAAC,KAAK,QAAQ,IAAI,UAAU,CAAC,OAAO,EAAE,CAAC,CAAC;IAEnD,IAAI,CAAC,UAAU,CAAC,SAAS,EAAE,CAAC;QAC1B,OAAO,CAAC,GAAG,CACT,6DAA6D,CAC9D,CAAC;QACF,OAAO,CAAC,GAAG,CACT,kEAAkE,CACnE,CAAC;QACF,OAAO,CAAC,GAAG,CAAC,2DAA2D,CAAC,CAAC;IAC3E,CAAC;IAED,OAAO,CAAC,GAAG,EAAE,CAAC;AAChB,CAAC,CAAC,CAAC;AAEL;;GAEG;AACH,OAAO;KACJ,OAAO,CAAC,UAAU,CAAC;KACnB,WAAW,CAAC,qCAAqC,CAAC;KAClD,MAAM,CAAC,iBAAiB,EAAE,4CAA4C,CAAC;KACvE,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;IACxB,IAAI,CAAC;QACH,OAAO,CAAC,GAAG,CAAC,wBAAwB,CAAC,CAAC;QACtC,OAAO,CAAC,GAAG,CAAC,yCAAyC,CAAC,CAAC;QAEvD,yBAAyB;QACzB,MAAM,MAAM,GAAa,EAAE,CAAC;QAC5B,OAAO,CAAC,KAAK,CAAC,EAAE,CAAC,MAAM,EAAE,CAAC,KAAK,EAAE,EAAE;YACjC,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,QAAQ,EAAE,CAAC,CAAC;QAChC,CAAC,CAAC,CAAC;QAEH,OAAO,CAAC,KAAK,CAAC,EAAE,CAAC,KAAK,EAAE,KAAK,IAAI,EAAE;YACjC,MAAM,MAAM,GAAG,MAAM,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YAE/B,MAAM,SAAS,GAAG,IAAI,4BAAiB,CAAC,SAAS,EAAE,OAAO,CAAC,KAAK,CAAC,CAAC;YAClE,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,SAAS,CAAC,MAAM,EAAE,OAAO,CAAC,KAAK,CAAC,CAAC;YAEhE,OAAO,CAAC,GAAG,CAAC,mBAAmB,CAAC,CAAC;YACjC,OAAO,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC;QACtB,CAAC,CAAC,CAAC;IACL,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,OAAO,CAAC,KAAK,CAAC,UAAU,KAAK,EAAE,CAAC,CAAC;QACjC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;AACH,CAAC,CAAC,CAAC;AAEL,kBAAkB;AAClB,OAAO,CAAC,KAAK,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC"}
|
package/dist/types.d.ts
CHANGED
|
@@ -302,6 +302,10 @@ export interface CodeGenerationAdapter {
|
|
|
302
302
|
* Check if the adapter's CLI tool is available
|
|
303
303
|
*/
|
|
304
304
|
checkAvailability(): Promise<boolean>;
|
|
305
|
+
/**
|
|
306
|
+
* Get the model being used by this adapter
|
|
307
|
+
*/
|
|
308
|
+
getModel(): string;
|
|
305
309
|
/**
|
|
306
310
|
* Generate code based on a prompt
|
|
307
311
|
* @param prompt The instruction/prompt for the coding agent
|
|
@@ -355,6 +359,14 @@ export interface BenchmarkConfig {
|
|
|
355
359
|
* - undefined: Use built-in default of 120000ms
|
|
356
360
|
*/
|
|
357
361
|
defaultTimeout?: number | null;
|
|
362
|
+
/**
|
|
363
|
+
* Enable automatic baseline saving for all evaluations
|
|
364
|
+
*/
|
|
365
|
+
saveBaseline?: boolean;
|
|
366
|
+
/**
|
|
367
|
+
* Enable automatic baseline comparison for all evaluations
|
|
368
|
+
*/
|
|
369
|
+
compareBaseline?: boolean;
|
|
358
370
|
/**
|
|
359
371
|
* Output directory for reports
|
|
360
372
|
*/
|
package/dist/types.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH;;GAEG;AACH,MAAM,WAAW,IAAI;IACnB,EAAE,EAAE,MAAM,CAAC;IACX,IAAI,EAAE,MAAM,CAAC;IACb,KAAK,EAAE,MAAM,CAAC;IACd,QAAQ,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;CACnC;AAED;;GAEG;AACH,MAAM,MAAM,YAAY,GAAG,YAAY,GAAG,OAAO,GAAG,SAAS,GAAG,cAAc,GAAG,aAAa,GAAG,SAAS,CAAC;AAE3G;;GAEG;AACH,MAAM,MAAM,QAAQ,GAAG,UAAU,GAAG,OAAO,GAAG,OAAO,CAAC;AAEtD;;GAEG;AACH,MAAM,MAAM,WAAW,GAAG,SAAS,GAAG,aAAa,CAAC;AAEpD;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAChC;;OAEG;IACH,iBAAiB,CAAC,EAAE,SAAS,MAAM,EAAE,CAAC;IAEtC;;OAEG;IACH,gBAAgB,CAAC,EAAE,SAAS,MAAM,EAAE,CAAC;IAErC;;;OAGG;IACH,gBAAgB,CAAC,EAAE,SAAS,MAAM,EAAE,CAAC;IAErC;;OAEG;IACH,eAAe,CAAC,EAAE,SAAS,MAAM,EAAE,CAAC;IAEpC;;OAEG;IACH,yBAAyB,CAAC,EAAE,SAAS,MAAM,EAAE,CAAC;IAE9C;;OAEG;IACH,wBAAwB,CAAC,EAAE,SAAS,MAAM,EAAE,CAAC;CAC9C;AAED;;GAEG;AACH,MAAM,WAAW,kBAAkB;IACjC;;OAEG;IACH,OAAO,EAAE,OAAO,CAAC;IAEjB;;;OAGG;IACH,cAAc,CAAC,EAAE,MAAM,CAAC;IAExB;;OAEG;IACH,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAED;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B;;OAEG;IACH,OAAO,EAAE,OAAO,CAAC;IAEjB;;OAEG;IACH,UAAU,CAAC,EAAE,MAAM,CAAC;CACrB;AAED;;GAEG;AACH,MAAM,WAAW,kBAAkB;IACjC;;OAEG;IACH,QAAQ,CAAC,EAAE,iBAAiB,CAAC;IAE7B;;OAEG;IACH,QAAQ,CAAC,EAAE,kBAAkB,CAAC;IAE9B;;OAEG;IACH,MAAM,CAAC,EAAE,gBAAgB,CAAC;CAC3B;AAED;;GAEG;AACH,MAAM,WAAW,YAAY;IAC3B;;OAEG;IACH,EAAE,EAAE,MAAM,CAAC;IAEX;;OAEG;IACH,QAAQ,EAAE,YAAY,CAAC;IAEvB;;OAEG;IACH,QAAQ,EAAE,QAAQ,CAAC;IAEnB;;OAEG;IACH,IAAI,EAAE,SAAS,MAAM,EAAE,CAAC;IAExB;;OAEG;IACH,WAAW,EAAE,MAAM,CAAC;IAEpB;;OAEG;IACH,MAAM,EAAE,MAAM,CAAC;IAEf;;OAEG;IACH,OAAO,CAAC,EAAE,MAAM,CAAC;IAEjB;;;OAGG;IACH,YAAY,CAAC,EAAE,SAAS,MAAM,EAAE,CAAC;IAEjC;;OAEG;IACH,kBAAkB,EAAE,kBAAkB,CAAC;IAEvC;;;;;OAKG;IACH,OAAO,CAAC,EAAE,MAAM,GAAG,IAAI,CAAC;CACzB;AAED;;GAEG;AACH,MAAM,WAAW,SAAS;IACxB;;OAEG;IACH,IAAI,EAAE,SAAS,GAAG,WAAW,GAAG,QAAQ,CAAC;IAEzC;;OAEG;IACH,OAAO,EAAE,MAAM,CAAC;IAEhB;;OAEG;IACH,IAAI,CAAC,EAAE,MAAM,CAAC;IAEd;;OAEG;IACH,IAAI,CAAC,EAAE,MAAM,CAAC;IAEd;;OAEG;IACH,QAAQ,EAAE
,QAAQ,CAAC;IAEnB;;OAEG;IACH,OAAO,CAAC,EAAE,MAAM,CAAC;CAClB;AAED;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B;;OAEG;IACH,MAAM,EAAE,OAAO,CAAC;IAEhB;;;OAGG;IACH,KAAK,EAAE,MAAM,CAAC;IAEd;;OAEG;IACH,UAAU,EAAE,SAAS,EAAE,CAAC;IAExB;;OAEG;IACH,aAAa,EAAE,SAAS,GAAG,WAAW,GAAG,QAAQ,CAAC;IAElD;;OAEG;IACH,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAED;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B;;OAEG;IACH,QAAQ,EAAE,YAAY,CAAC;IAEvB;;OAEG;IACH,MAAM,EAAE,OAAO,CAAC;IAEhB;;OAEG;IACH,KAAK,EAAE,MAAM,CAAC;IAEd;;OAEG;IACH,iBAAiB,EAAE,gBAAgB,EAAE,CAAC;IAEtC;;OAEG;IACH,UAAU,EAAE,SAAS,EAAE,CAAC;IAExB;;OAEG;IACH,aAAa,CAAC,EAAE;QACd,KAAK,EAAE;YACL,IAAI,EAAE,MAAM,CAAC;YACb,OAAO,EAAE,MAAM,CAAC;SACjB,EAAE,CAAC;KACL,CAAC;IAEF;;OAEG;IACH,QAAQ,EAAE,MAAM,CAAC;IAEjB;;OAEG;IACH,KAAK,CAAC,EAAE,MAAM,CAAC;IAEf;;OAEG;IACH,kBAAkB,CAAC,EAAE;QACnB,aAAa,EAAE,MAAM,CAAC;QACtB,KAAK,EAAE,MAAM,CAAC;QACd,aAAa,EAAE,OAAO,CAAC;KACxB,CAAC;CACH;AAED;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B;;OAEG;IACH,OAAO,EAAE,WAAW,CAAC;IAErB;;OAEG;IACH,KAAK,CAAC,EAAE,MAAM,CAAC;IAEf;;OAEG;IACH,SAAS,EAAE,MAAM,CAAC;IAElB;;OAEG;IACH,OAAO,EAAE,gBAAgB,EAAE,CAAC;IAE5B;;OAEG;IACH,OAAO,EAAE;QACP,KAAK,EAAE,MAAM,CAAC;QACd,MAAM,EAAE,MAAM,CAAC;QACf,MAAM,EAAE,MAAM,CAAC;QACf,OAAO,EAAE,MAAM,CAAC;QAChB,YAAY,EAAE,MAAM,CAAC;QACrB,eAAe,EAAE,MAAM,CAAC;KACzB,CAAC;IAEF;;OAEG;IACH,aAAa,EAAE,MAAM,CAAC;CACvB;AAED;;GAEG;AACH,MAAM,WAAW,qBAAqB;IACpC;;OAEG;IACH,IAAI,EAAE,WAAW,CAAC;IAElB;;OAEG;IACH,iBAAiB,IAAI,OAAO,CAAC,OAAO,CAAC,CAAC;IAEtC;;;;;;OAMG;IACH,QAAQ,CACN,MAAM,EAAE,MAAM,EACd,YAAY,CAAC,EAAE,SAAS,MAAM,EAAE,EAChC,OAAO,CAAC,EAAE,MAAM,GAAG,IAAI,GACtB,OAAO,CAAC,MAAM,EAAE,CAAC,CAAC;CACtB;AAED;;GAEG;AACH,MAAM,WAAW,aAAa;IAC5B;;OAEG;IACH,IAAI,EAAE,SAAS,GAAG,WAAW,GAAG,QAAQ,CAAC;IAEzC;;;;;OAKG;IACH,QAAQ,CACN,KAAK,EAAE,SAAS,MAAM,EAAE,EACxB,QAAQ,EAAE,YAAY,GACrB,OAAO,CAAC,gBAAgB,CAAC,CAAC;CAC9B;AAED;;GAEG;AACH,MAAM,WAAW,eAAe;IAC9B;;OAEG;IACH,SAAS,CAAC,EAAE,YAAY,EAAE,CAAC;IAE3B;;OAEG;IACH,cAAc,CAAC,EAAE,WAAW,CAAC;IAE7B;;OAEG;IACH,YAAY,CAAC,EAAE,MAAM,CAAC;IAEtB;;OAEG;IACH,aAAa,CAAC,EAAE,MAAM,CAAC;IAEvB;;;;;;OAMG
;IACH,cAAc,CAAC,EAAE,MAAM,GAAG,IAAI,CAAC;IAE/B;;OAEG;IACH,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB"}
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH;;GAEG;AACH,MAAM,WAAW,IAAI;IACnB,EAAE,EAAE,MAAM,CAAC;IACX,IAAI,EAAE,MAAM,CAAC;IACb,KAAK,EAAE,MAAM,CAAC;IACd,QAAQ,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;CACnC;AAED;;GAEG;AACH,MAAM,MAAM,YAAY,GAAG,YAAY,GAAG,OAAO,GAAG,SAAS,GAAG,cAAc,GAAG,aAAa,GAAG,SAAS,CAAC;AAE3G;;GAEG;AACH,MAAM,MAAM,QAAQ,GAAG,UAAU,GAAG,OAAO,GAAG,OAAO,CAAC;AAEtD;;GAEG;AACH,MAAM,MAAM,WAAW,GAAG,SAAS,GAAG,aAAa,CAAC;AAEpD;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAChC;;OAEG;IACH,iBAAiB,CAAC,EAAE,SAAS,MAAM,EAAE,CAAC;IAEtC;;OAEG;IACH,gBAAgB,CAAC,EAAE,SAAS,MAAM,EAAE,CAAC;IAErC;;;OAGG;IACH,gBAAgB,CAAC,EAAE,SAAS,MAAM,EAAE,CAAC;IAErC;;OAEG;IACH,eAAe,CAAC,EAAE,SAAS,MAAM,EAAE,CAAC;IAEpC;;OAEG;IACH,yBAAyB,CAAC,EAAE,SAAS,MAAM,EAAE,CAAC;IAE9C;;OAEG;IACH,wBAAwB,CAAC,EAAE,SAAS,MAAM,EAAE,CAAC;CAC9C;AAED;;GAEG;AACH,MAAM,WAAW,kBAAkB;IACjC;;OAEG;IACH,OAAO,EAAE,OAAO,CAAC;IAEjB;;;OAGG;IACH,cAAc,CAAC,EAAE,MAAM,CAAC;IAExB;;OAEG;IACH,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAED;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B;;OAEG;IACH,OAAO,EAAE,OAAO,CAAC;IAEjB;;OAEG;IACH,UAAU,CAAC,EAAE,MAAM,CAAC;CACrB;AAED;;GAEG;AACH,MAAM,WAAW,kBAAkB;IACjC;;OAEG;IACH,QAAQ,CAAC,EAAE,iBAAiB,CAAC;IAE7B;;OAEG;IACH,QAAQ,CAAC,EAAE,kBAAkB,CAAC;IAE9B;;OAEG;IACH,MAAM,CAAC,EAAE,gBAAgB,CAAC;CAC3B;AAED;;GAEG;AACH,MAAM,WAAW,YAAY;IAC3B;;OAEG;IACH,EAAE,EAAE,MAAM,CAAC;IAEX;;OAEG;IACH,QAAQ,EAAE,YAAY,CAAC;IAEvB;;OAEG;IACH,QAAQ,EAAE,QAAQ,CAAC;IAEnB;;OAEG;IACH,IAAI,EAAE,SAAS,MAAM,EAAE,CAAC;IAExB;;OAEG;IACH,WAAW,EAAE,MAAM,CAAC;IAEpB;;OAEG;IACH,MAAM,EAAE,MAAM,CAAC;IAEf;;OAEG;IACH,OAAO,CAAC,EAAE,MAAM,CAAC;IAEjB;;;OAGG;IACH,YAAY,CAAC,EAAE,SAAS,MAAM,EAAE,CAAC;IAEjC;;OAEG;IACH,kBAAkB,EAAE,kBAAkB,CAAC;IAEvC;;;;;OAKG;IACH,OAAO,CAAC,EAAE,MAAM,GAAG,IAAI,CAAC;CACzB;AAED;;GAEG;AACH,MAAM,WAAW,SAAS;IACxB;;OAEG;IACH,IAAI,EAAE,SAAS,GAAG,WAAW,GAAG,QAAQ,CAAC;IAEzC;;OAEG;IACH,OAAO,EAAE,MAAM,CAAC;IAEhB;;OAEG;IACH,IAAI,CAAC,EAAE,MAAM,CAAC;IAEd;;OAEG;IACH,IAAI,CAAC,EAAE,MAAM,CAAC;IAEd;;OAEG;IACH,QAAQ,EAAE
,QAAQ,CAAC;IAEnB;;OAEG;IACH,OAAO,CAAC,EAAE,MAAM,CAAC;CAClB;AAED;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B;;OAEG;IACH,MAAM,EAAE,OAAO,CAAC;IAEhB;;;OAGG;IACH,KAAK,EAAE,MAAM,CAAC;IAEd;;OAEG;IACH,UAAU,EAAE,SAAS,EAAE,CAAC;IAExB;;OAEG;IACH,aAAa,EAAE,SAAS,GAAG,WAAW,GAAG,QAAQ,CAAC;IAElD;;OAEG;IACH,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAED;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B;;OAEG;IACH,QAAQ,EAAE,YAAY,CAAC;IAEvB;;OAEG;IACH,MAAM,EAAE,OAAO,CAAC;IAEhB;;OAEG;IACH,KAAK,EAAE,MAAM,CAAC;IAEd;;OAEG;IACH,iBAAiB,EAAE,gBAAgB,EAAE,CAAC;IAEtC;;OAEG;IACH,UAAU,EAAE,SAAS,EAAE,CAAC;IAExB;;OAEG;IACH,aAAa,CAAC,EAAE;QACd,KAAK,EAAE;YACL,IAAI,EAAE,MAAM,CAAC;YACb,OAAO,EAAE,MAAM,CAAC;SACjB,EAAE,CAAC;KACL,CAAC;IAEF;;OAEG;IACH,QAAQ,EAAE,MAAM,CAAC;IAEjB;;OAEG;IACH,KAAK,CAAC,EAAE,MAAM,CAAC;IAEf;;OAEG;IACH,kBAAkB,CAAC,EAAE;QACnB,aAAa,EAAE,MAAM,CAAC;QACtB,KAAK,EAAE,MAAM,CAAC;QACd,aAAa,EAAE,OAAO,CAAC;KACxB,CAAC;CACH;AAED;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B;;OAEG;IACH,OAAO,EAAE,WAAW,CAAC;IAErB;;OAEG;IACH,KAAK,CAAC,EAAE,MAAM,CAAC;IAEf;;OAEG;IACH,SAAS,EAAE,MAAM,CAAC;IAElB;;OAEG;IACH,OAAO,EAAE,gBAAgB,EAAE,CAAC;IAE5B;;OAEG;IACH,OAAO,EAAE;QACP,KAAK,EAAE,MAAM,CAAC;QACd,MAAM,EAAE,MAAM,CAAC;QACf,MAAM,EAAE,MAAM,CAAC;QACf,OAAO,EAAE,MAAM,CAAC;QAChB,YAAY,EAAE,MAAM,CAAC;QACrB,eAAe,EAAE,MAAM,CAAC;KACzB,CAAC;IAEF;;OAEG;IACH,aAAa,EAAE,MAAM,CAAC;CACvB;AAED;;GAEG;AACH,MAAM,WAAW,qBAAqB;IACpC;;OAEG;IACH,IAAI,EAAE,WAAW,CAAC;IAElB;;OAEG;IACH,iBAAiB,IAAI,OAAO,CAAC,OAAO,CAAC,CAAC;IAEtC;;OAEG;IACH,QAAQ,IAAI,MAAM,CAAC;IAEnB;;;;;;OAMG;IACH,QAAQ,CACN,MAAM,EAAE,MAAM,EACd,YAAY,CAAC,EAAE,SAAS,MAAM,EAAE,EAChC,OAAO,CAAC,EAAE,MAAM,GAAG,IAAI,GACtB,OAAO,CAAC,MAAM,EAAE,CAAC,CAAC;CACtB;AAED;;GAEG;AACH,MAAM,WAAW,aAAa;IAC5B;;OAEG;IACH,IAAI,EAAE,SAAS,GAAG,WAAW,GAAG,QAAQ,CAAC;IAEzC;;;;;OAKG;IACH,QAAQ,CACN,KAAK,EAAE,SAAS,MAAM,EAAE,EACxB,QAAQ,EAAE,YAAY,GACrB,OAAO,CAAC,gBAAgB,CAAC,CAAC;CAC9B;AAED;;GAEG;AACH,MAAM,WAAW,eAAe;IAC9B;;OAEG;IACH,SAAS,CAAC,EAAE,YAAY,EAAE,CAAC;IAE3B;;OAEG;IACH,cAAc,CAAC,EAAE,WAAW,CAAC;IAE7B;;OAEG;IACH,YAAY,CAAC,EAAE,MAAM,CAAC;IAEtB;;OAEG;IACH,aAA
a,CAAC,EAAE,MAAM,CAAC;IAEvB;;;;;;OAMG;IACH,cAAc,CAAC,EAAE,MAAM,GAAG,IAAI,CAAC;IAE/B;;OAEG;IACH,YAAY,CAAC,EAAE,OAAO,CAAC;IAEvB;;OAEG;IACH,eAAe,CAAC,EAAE,OAAO,CAAC;IAE1B;;OAEG;IACH,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB"}
|