coding-agent-benchmarks 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +474 -0
- package/dist/adapters/claudeCodeCLI.d.ts +19 -0
- package/dist/adapters/claudeCodeCLI.d.ts.map +1 -0
- package/dist/adapters/claudeCodeCLI.js +106 -0
- package/dist/adapters/claudeCodeCLI.js.map +1 -0
- package/dist/adapters/copilotCLI.d.ts +19 -0
- package/dist/adapters/copilotCLI.d.ts.map +1 -0
- package/dist/adapters/copilotCLI.js +104 -0
- package/dist/adapters/copilotCLI.js.map +1 -0
- package/dist/config/defaultScenarios.d.ts +6 -0
- package/dist/config/defaultScenarios.d.ts.map +1 -0
- package/dist/config/defaultScenarios.js +209 -0
- package/dist/config/defaultScenarios.js.map +1 -0
- package/dist/config/loader.d.ts +13 -0
- package/dist/config/loader.d.ts.map +1 -0
- package/dist/config/loader.js +153 -0
- package/dist/config/loader.js.map +1 -0
- package/dist/evaluator.d.ts +45 -0
- package/dist/evaluator.d.ts.map +1 -0
- package/dist/evaluator.js +226 -0
- package/dist/evaluator.js.map +1 -0
- package/dist/index.d.ts +13 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +38 -0
- package/dist/index.js.map +1 -0
- package/dist/runner.d.ts +6 -0
- package/dist/runner.d.ts.map +1 -0
- package/dist/runner.js +233 -0
- package/dist/runner.js.map +1 -0
- package/dist/types.d.ts +354 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +6 -0
- package/dist/types.js.map +1 -0
- package/dist/utils/baselineManager.d.ts +53 -0
- package/dist/utils/baselineManager.d.ts.map +1 -0
- package/dist/utils/baselineManager.js +220 -0
- package/dist/utils/baselineManager.js.map +1 -0
- package/dist/utils/gitUtils.d.ts +39 -0
- package/dist/utils/gitUtils.d.ts.map +1 -0
- package/dist/utils/gitUtils.js +121 -0
- package/dist/utils/gitUtils.js.map +1 -0
- package/dist/utils/githubAuth.d.ts +22 -0
- package/dist/utils/githubAuth.d.ts.map +1 -0
- package/dist/utils/githubAuth.js +79 -0
- package/dist/utils/githubAuth.js.map +1 -0
- package/dist/utils/workspaceUtils.d.ts +32 -0
- package/dist/utils/workspaceUtils.d.ts.map +1 -0
- package/dist/utils/workspaceUtils.js +121 -0
- package/dist/utils/workspaceUtils.js.map +1 -0
- package/dist/validators/eslintValidator.d.ts +22 -0
- package/dist/validators/eslintValidator.d.ts.map +1 -0
- package/dist/validators/eslintValidator.js +217 -0
- package/dist/validators/eslintValidator.js.map +1 -0
- package/dist/validators/llmJudge.d.ts +28 -0
- package/dist/validators/llmJudge.d.ts.map +1 -0
- package/dist/validators/llmJudge.js +241 -0
- package/dist/validators/llmJudge.js.map +1 -0
- package/dist/validators/patternValidator.d.ts +27 -0
- package/dist/validators/patternValidator.d.ts.map +1 -0
- package/dist/validators/patternValidator.js +233 -0
- package/dist/validators/patternValidator.js.map +1 -0
- package/package.json +50 -0
|
@@ -0,0 +1,226 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Main evaluation engine
|
|
4
|
+
*/
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.Evaluator = void 0;
|
|
7
|
+
const copilotCLI_1 = require("./adapters/copilotCLI");
|
|
8
|
+
const claudeCodeCLI_1 = require("./adapters/claudeCodeCLI");
|
|
9
|
+
const patternValidator_1 = require("./validators/patternValidator");
|
|
10
|
+
const llmJudge_1 = require("./validators/llmJudge");
|
|
11
|
+
const eslintValidator_1 = require("./validators/eslintValidator");
|
|
12
|
+
const workspaceUtils_1 = require("./utils/workspaceUtils");
|
|
13
|
+
const baselineManager_1 = require("./utils/baselineManager");
|
|
14
|
+
/**
 * Main evaluation engine: generates code for a scenario through the selected
 * adapter, runs the pattern / LLM-judge / ESLint validators on the generated
 * files, and aggregates the outcomes into per-scenario results and a report.
 */
class Evaluator {
    /**
     * @param {object} options - Evaluator settings. This class reads
     *   `adapter`, `workspaceRoot`, `model`, `defaultTimeout`, `verbose`,
     *   `compareBaseline`, `saveBaseline` and the optional numeric
     *   `threshold` (minimum passing score; 0.8 when absent or not a number).
     * @throws {Error} When `options.adapter` is not a known adapter type.
     */
    constructor(options) {
        this.options = options;
        this.workspaceRoot = (0, workspaceUtils_1.resolveWorkspaceRoot)(options.workspaceRoot);
        this.baselineManager = new baselineManager_1.BaselineManager(this.workspaceRoot);
        // Create adapter based on type
        this.adapter = this.createAdapter(options.adapter);
    }
    /**
     * Create adapter instance based on type.
     *
     * @param {string} type - 'copilot' or 'claude-code'.
     * @returns {object} Adapter bound to the resolved workspace root.
     * @throws {Error} For any other adapter type.
     */
    createAdapter(type) {
        switch (type) {
            case 'copilot':
                return new copilotCLI_1.CopilotCLIAdapter(this.workspaceRoot);
            case 'claude-code':
                return new claudeCodeCLI_1.ClaudeCodeCLIAdapter(this.workspaceRoot);
            default:
                throw new Error(`Unknown adapter type: ${type}`);
        }
    }
    /**
     * Check if adapter is available.
     *
     * @returns {Promise<*>} Whatever the adapter's `checkAvailability()` resolves to.
     */
    async checkAdapterAvailability() {
        return this.adapter.checkAvailability();
    }
    /**
     * Filter scenarios based on criteria. Filters are cumulative; a filter
     * that is absent (or empty) is not applied.
     *
     * @param {Array<object>} scenarios - Scenarios exposing `id`, `category` and `tags`.
     * @param {object} filters
     * @param {string} [filters.scenarioPattern] - Text matched anywhere inside
     *   the scenario ID; `*` matches any run of characters and every other
     *   character is matched literally.
     * @param {string} [filters.category] - Comma-separated category names.
     * @param {string[]} [filters.tags] - A scenario is kept when it carries at
     *   least one of these tags.
     * @returns {Array<object>} The scenarios that satisfy every supplied filter.
     */
    filterScenarios(scenarios, filters) {
        let filtered = scenarios;
        // Filter by scenario ID pattern
        if (filters.scenarioPattern) {
            // Only `*` is a wildcard. Everything else is escaped so IDs that
            // contain regex metacharacters (".", "(", "+", ...) match literally
            // and a stray metacharacter cannot make `new RegExp` throw.
            const pattern = filters.scenarioPattern
                .split('*')
                .map(literal => literal.replace(/[.+?^${}()|[\]\\]/g, '\\$&'))
                .join('.*');
            const regex = new RegExp(pattern);
            filtered = filtered.filter(s => regex.test(s.id));
        }
        // Filter by category
        if (filters.category) {
            const categories = filters.category.split(',').map(c => c.trim());
            filtered = filtered.filter(s => categories.includes(s.category));
        }
        // Filter by tags
        if (filters.tags && filters.tags.length > 0) {
            // A scenario without a tags array simply matches no tag.
            filtered = filtered.filter(s => Array.isArray(s.tags) && filters.tags.some(tag => s.tags.includes(tag)));
        }
        return filtered;
    }
    /**
     * Evaluate a single scenario.
     *
     * Never rejects: any error thrown while generating or validating is turned
     * into a failed result whose `error` field carries the message.
     *
     * @param {object} scenario - Scenario to run; `id`, `description`,
     *   `prompt`, `contextFiles`, `timeout` and `severity` are read here.
     * @returns {Promise<object>} Result with `scenario`, `passed`, `score`,
     *   `validationResults`, `violations`, `duration` (ms) and, when
     *   applicable, `baselineComparison` or `error`.
     */
    async evaluateScenario(scenario) {
        const startTime = Date.now();
        const verbose = this.options.verbose;
        try {
            if (verbose) {
                console.log(`\nEvaluating scenario: ${scenario.id}`);
                console.log(` Description: ${scenario.description}`);
                console.log(' Generating code...');
            }
            // Resolve timeout (null = no timeout, undefined = use defaults).
            // Precedence: scenario value, then configured default, then the
            // built-in default of 2 minutes.
            let timeout;
            if (scenario.timeout !== undefined) {
                timeout = scenario.timeout;
            }
            else if (this.options.defaultTimeout !== undefined) {
                timeout = this.options.defaultTimeout;
            }
            else {
                timeout = 120000;
            }
            // Generate code using adapter
            const generatedFiles = await this.adapter.generate(scenario.prompt, scenario.contextFiles, timeout);
            if (verbose) {
                console.log(` Generated ${generatedFiles.length} file(s)`);
            }
            // Run validators one after another. Each validator is constructed
            // right before it runs, in this fixed order.
            const validatorSteps = [
                ['Pattern validation', () => new patternValidator_1.PatternValidator(this.workspaceRoot)],
                ['LLM judge', () => new llmJudge_1.LLMJudgeValidator(this.workspaceRoot, this.options.model)],
                ['ESLint', () => new eslintValidator_1.ESLintValidator(this.workspaceRoot)],
            ];
            const validationResults = [];
            for (const [label, createValidator] of validatorSteps) {
                const stepResult = await createValidator().validate(generatedFiles, scenario);
                validationResults.push(stepResult);
                // A negative score marks a skipped validator; nothing to report.
                if (verbose && stepResult.score >= 0) {
                    console.log(` ${label}: ${stepResult.score.toFixed(2)}`);
                }
            }
            // Calculate overall score (average of non-skipped validators)
            const activeResults = validationResults.filter(r => r.score >= 0);
            const overallScore = activeResults.length > 0
                ? activeResults.reduce((sum, r) => sum + r.score, 0) / activeResults.length
                : 0;
            // Collect all violations
            const allViolations = validationResults.flatMap(r => r.violations);
            // Check if passed (score at or above threshold and no violations).
            // The threshold is configurable through options.threshold.
            const configuredThreshold = this.options.threshold;
            const passThreshold = typeof configuredThreshold === 'number' && Number.isFinite(configuredThreshold)
                ? configuredThreshold
                : 0.8;
            const passed = overallScore >= passThreshold && allViolations.length === 0;
            const result = {
                scenario,
                passed,
                score: overallScore,
                validationResults,
                violations: allViolations,
                duration: Date.now() - startTime,
            };
            // Baselines are keyed by adapter and model ('default' when no model is set).
            const modelKey = this.options.model || 'default';
            // Compare with baseline if requested
            if (this.options.compareBaseline) {
                const comparison = this.baselineManager.compareWithBaseline(result, this.options.adapter, modelKey);
                if (comparison) {
                    result.baselineComparison = comparison;
                }
            }
            // Save baseline if requested
            if (this.options.saveBaseline) {
                this.baselineManager.saveBaseline(result, this.options.adapter, modelKey);
            }
            return result;
        }
        catch (error) {
            const errorMessage = String(error);
            // Timeouts are recognised by their message text.
            const isTimeout = errorMessage.includes('timed out');
            // Create a violation for timeout errors
            const violations = isTimeout
                ? [
                    {
                        type: 'pattern',
                        message: 'Code generation timed out',
                        severity: scenario.severity,
                        details: errorMessage,
                    },
                ]
                : [];
            return {
                scenario,
                passed: false,
                score: 0,
                validationResults: [],
                violations,
                duration: Date.now() - startTime,
                error: `Evaluation failed: ${error}`,
            };
        }
    }
    /**
     * Evaluate multiple scenarios sequentially and build the report.
     *
     * @param {Array<object>} scenarios - Scenarios to run, in order.
     * @returns {Promise<object>} Report with `adapter`, `model`, `timestamp`,
     *   `results`, `summary` and `totalDuration` (ms). In the summary,
     *   `failed` counts results that neither passed nor errored and `skipped`
     *   counts results that carry an `error`.
     */
    async evaluate(scenarios) {
        const startTime = Date.now();
        const results = [];
        console.log(`Evaluating ${scenarios.length} scenario(s)...`);
        for (let i = 0; i < scenarios.length; i++) {
            const scenario = scenarios[i];
            console.log(`\n[${i + 1}/${scenarios.length}] ${scenario.id}`);
            const result = await this.evaluateScenario(scenario);
            results.push(result);
            // Show result summary
            if (result.passed) {
                console.log(` ✓ PASSED (score: ${result.score.toFixed(2)})`);
                continue;
            }
            console.log(` ✗ FAILED (score: ${result.score.toFixed(2)})`);
            if (result.violations.length > 0) {
                console.log(` ${result.violations.length} violation(s)`);
            }
            if (result.error) {
                console.log(` Error: ${result.error}`);
            }
        }
        // Calculate summary statistics
        const passed = results.filter(r => r.passed).length;
        const failed = results.filter(r => !r.passed && !r.error).length;
        const skipped = results.filter(r => r.error).length;
        const totalViolations = results.reduce((sum, r) => sum + r.violations.length, 0);
        const averageScore = results.length > 0
            ? results.reduce((sum, r) => sum + r.score, 0) / results.length
            : 0;
        return {
            adapter: this.options.adapter,
            model: this.options.model,
            timestamp: new Date().toISOString(),
            results,
            summary: {
                total: scenarios.length,
                passed,
                failed,
                skipped,
                averageScore,
                totalViolations,
            },
            totalDuration: Date.now() - startTime,
        };
    }
}
|
|
225
|
+
exports.Evaluator = Evaluator;
|
|
226
|
+
//# sourceMappingURL=evaluator.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"evaluator.js","sourceRoot":"","sources":["../src/evaluator.ts"],"names":[],"mappings":";AAAA;;GAEG;;;AAUH,sDAA0D;AAC1D,4DAAgE;AAChE,oEAAiE;AACjE,oDAA0D;AAC1D,kEAA+D;AAC/D,2DAA8D;AAC9D,6DAA0D;AAY1D,MAAa,SAAS;IAMpB,YAAY,OAAyB;QACnC,IAAI,CAAC,OAAO,GAAG,OAAO,CAAC;QACvB,IAAI,CAAC,aAAa,GAAG,IAAA,qCAAoB,EAAC,OAAO,CAAC,aAAa,CAAC,CAAC;QACjE,IAAI,CAAC,eAAe,GAAG,IAAI,iCAAe,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;QAE/D,+BAA+B;QAC/B,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC,aAAa,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;IACrD,CAAC;IAED;;OAEG;IACK,aAAa,CAAC,IAAiB;QACrC,QAAQ,IAAI,EAAE,CAAC;YACb,KAAK,SAAS;gBACZ,OAAO,IAAI,8BAAiB,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;YACnD,KAAK,aAAa;gBAChB,OAAO,IAAI,oCAAoB,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;YACtD;gBACE,MAAM,IAAI,KAAK,CAAC,yBAAyB,IAAI,EAAE,CAAC,CAAC;QACrD,CAAC;IACH,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,wBAAwB;QAC5B,OAAO,IAAI,CAAC,OAAO,CAAC,iBAAiB,EAAE,CAAC;IAC1C,CAAC;IAED;;OAEG;IACH,eAAe,CACb,SAAyB,EACzB,OAIC;QAED,IAAI,QAAQ,GAAG,SAAS,CAAC;QAEzB,gCAAgC;QAChC,IAAI,OAAO,CAAC,eAAe,EAAE,CAAC;YAC5B,MAAM,OAAO,GAAG,OAAO,CAAC,eAAe,CAAC,OAAO,CAAC,KAAK,EAAE,IAAI,CAAC,CAAC;YAC7D,MAAM,KAAK,GAAG,IAAI,MAAM,CAAC,OAAO,CAAC,CAAC;YAClC,QAAQ,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;QACpD,CAAC;QAED,qBAAqB;QACrB,IAAI,OAAO,CAAC,QAAQ,EAAE,CAAC;YACrB,MAAM,UAAU,GAAG,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;YAClE,QAAQ,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,UAAU,CAAC,QAAQ,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC;QACnE,CAAC;QAED,iBAAiB;QACjB,IAAI,OAAO,CAAC,IAAI,IAAI,OAAO,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC5C,QAAQ,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAC7B,OAAO,CAAC,IAAK,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC,CAChD,CAAC;QACJ,CAAC;QAED,OAAO,QAAQ,CAAC;IAClB,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,gBAAgB,CAAC,QAAsB;QAC3C,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QAE7B,IAAI,CAAC;YACH,IAAI,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC;gBACzB,OAAO,CAAC,GAAG
,CAAC,0BAA0B,QAAQ,CAAC,EAAE,EAAE,CAAC,CAAC;gBACrD,OAAO,CAAC,GAAG,CAAC,kBAAkB,QAAQ,CAAC,WAAW,EAAE,CAAC,CAAC;YACxD,CAAC;YAED,8BAA8B;YAC9B,IAAI,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC;gBACzB,OAAO,CAAC,GAAG,CAAC,sBAAsB,CAAC,CAAC;YACtC,CAAC;YAED,gEAAgE;YAChE,IAAI,OAAsB,CAAC;YAC3B,IAAI,QAAQ,CAAC,OAAO,KAAK,SAAS,EAAE,CAAC;gBACnC,6DAA6D;gBAC7D,OAAO,GAAG,QAAQ,CAAC,OAAO,CAAC;YAC7B,CAAC;iBAAM,IAAI,IAAI,CAAC,OAAO,CAAC,cAAc,KAAK,SAAS,EAAE,CAAC;gBACrD,wDAAwD;gBACxD,OAAO,GAAG,IAAI,CAAC,OAAO,CAAC,cAAc,CAAC;YACxC,CAAC;iBAAM,CAAC;gBACN,8BAA8B;gBAC9B,OAAO,GAAG,MAAM,CAAC;YACnB,CAAC;YAED,MAAM,cAAc,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,QAAQ,CAChD,QAAQ,CAAC,MAAM,EACf,QAAQ,CAAC,YAAY,EACrB,OAAO,CACR,CAAC;YAEF,IAAI,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC;gBACzB,OAAO,CAAC,GAAG,CAAC,eAAe,cAAc,CAAC,MAAM,UAAU,CAAC,CAAC;YAC9D,CAAC;YAED,iBAAiB;YACjB,MAAM,iBAAiB,GAAuB,EAAE,CAAC;YAEjD,oBAAoB;YACpB,MAAM,gBAAgB,GAAG,IAAI,mCAAgB,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;YAClE,MAAM,aAAa,GAAG,MAAM,gBAAgB,CAAC,QAAQ,CAAC,cAAc,EAAE,QAAQ,CAAC,CAAC;YAChF,iBAAiB,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;YAEtC,IAAI,IAAI,CAAC,OAAO,CAAC,OAAO,IAAI,aAAa,CAAC,KAAK,IAAI,CAAC,EAAE,CAAC;gBACrD,OAAO,CAAC,GAAG,CAAC,yBAAyB,aAAa,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;YACzE,CAAC;YAED,sBAAsB;YACtB,MAAM,YAAY,GAAG,IAAI,4BAAiB,CACxC,IAAI,CAAC,aAAa,EAClB,IAAI,CAAC,OAAO,CAAC,KAAK,CACnB,CAAC;YACF,MAAM,SAAS,GAAG,MAAM,YAAY,CAAC,QAAQ,CAAC,cAAc,EAAE,QAAQ,CAAC,CAAC;YACxE,iBAAiB,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;YAElC,IAAI,IAAI,CAAC,OAAO,CAAC,OAAO,IAAI,SAAS,CAAC,KAAK,IAAI,CAAC,EAAE,CAAC;gBACjD,OAAO,CAAC,GAAG,CAAC,gBAAgB,SAAS,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;YAC5D,CAAC;YAED,mBAAmB;YACnB,MAAM,eAAe,GAAG,IAAI,iCAAe,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;YAChE,MAAM,YAAY,GAAG,MAAM,eAAe,CAAC,QAAQ,CAAC,cAAc,EAAE,QAAQ,CAAC,CAAC;YAC9E,iBAAiB,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;YAErC,IAAI,IAAI,CAAC,OAAO,CAAC,OAAO,IAAI,YAAY,CAAC,KAAK,IAAI,CAAC,EAAE,CAAC;gBACpD,OAAO,CAAC,GAAG,CAAC,aAAa,YAAY,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;YAC5D,CAAC;YAED,8DAA8D;YAC9D,MAAM,aAAa,GAAG
,iBAAiB,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,KAAK,IAAI,CAAC,CAAC,CAAC;YAClE,MAAM,YAAY,GAChB,aAAa,CAAC,MAAM,GAAG,CAAC;gBACtB,CAAC,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,KAAK,EAAE,CAAC,CAAC,GAAG,aAAa,CAAC,MAAM;gBAC3E,CAAC,CAAC,CAAC,CAAC;YAER,yBAAyB;YACzB,MAAM,aAAa,GAAG,iBAAiB,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC;YAEnE,4DAA4D;YAC5D,MAAM,MAAM,GAAG,YAAY,IAAI,GAAG,IAAI,aAAa,CAAC,MAAM,KAAK,CAAC,CAAC;YAEjE,MAAM,MAAM,GAAqB;gBAC/B,QAAQ;gBACR,MAAM;gBACN,KAAK,EAAE,YAAY;gBACnB,iBAAiB;gBACjB,UAAU,EAAE,aAAa;gBACzB,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS;aACjC,CAAC;YAEF,qCAAqC;YACrC,IAAI,IAAI,CAAC,OAAO,CAAC,eAAe,EAAE,CAAC;gBACjC,MAAM,UAAU,GAAG,IAAI,CAAC,eAAe,CAAC,mBAAmB,CACzD,MAAM,EACN,IAAI,CAAC,OAAO,CAAC,OAAO,EACpB,IAAI,CAAC,OAAO,CAAC,KAAK,IAAI,SAAS,CAChC,CAAC;gBACF,IAAI,UAAU,EAAE,CAAC;oBACf,MAAM,CAAC,kBAAkB,GAAG,UAAU,CAAC;gBACzC,CAAC;YACH,CAAC;YAED,6BAA6B;YAC7B,IAAI,IAAI,CAAC,OAAO,CAAC,YAAY,EAAE,CAAC;gBAC9B,IAAI,CAAC,eAAe,CAAC,YAAY,CAC/B,MAAM,EACN,IAAI,CAAC,OAAO,CAAC,OAAO,EACpB,IAAI,CAAC,OAAO,CAAC,KAAK,IAAI,SAAS,CAChC,CAAC;YACJ,CAAC;YAED,OAAO,MAAM,CAAC;QAChB,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,YAAY,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC;YACnC,MAAM,SAAS,GAAG,YAAY,CAAC,QAAQ,CAAC,WAAW,CAAC,CAAC;YAErD,wCAAwC;YACxC,MAAM,UAAU,GAAG,SAAS;gBAC1B,CAAC,CAAC;oBACE;wBACE,IAAI,EAAE,SAAkB;wBACxB,OAAO,EAAE,2BAA2B;wBACpC,QAAQ,EAAE,QAAQ,CAAC,QAAQ;wBAC3B,OAAO,EAAE,YAAY;qBACtB;iBACF;gBACH,CAAC,CAAC,EAAE,CAAC;YAEP,OAAO;gBACL,QAAQ;gBACR,MAAM,EAAE,KAAK;gBACb,KAAK,EAAE,CAAC;gBACR,iBAAiB,EAAE,EAAE;gBACrB,UAAU;gBACV,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS;gBAChC,KAAK,EAAE,sBAAsB,KAAK,EAAE;aACrC,CAAC;QACJ,CAAC;IACH,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,QAAQ,CAAC,SAAyB;QACtC,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QAC7B,MAAM,OAAO,GAAuB,EAAE,CAAC;QAEvC,OAAO,CAAC,GAAG,CAAC,cAAc,SAAS,CAAC,MAAM,iBAAiB,CAAC,CAAC;QAE7D,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAC1C,MAAM,QAAQ,GAAG,SAAS,CAAC,CAAC,CAAC,CAAC;YAC9B,OAAO,
CAAC,GAAG,CAAC,MAAM,CAAC,GAAG,CAAC,IAAI,SAAS,CAAC,MAAM,KAAK,QAAQ,CAAC,EAAE,EAAE,CAAC,CAAC;YAE/D,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,gBAAgB,CAAC,QAAQ,CAAC,CAAC;YACrD,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;YAErB,sBAAsB;YACtB,IAAI,MAAM,CAAC,MAAM,EAAE,CAAC;gBAClB,OAAO,CAAC,GAAG,CAAC,sBAAsB,MAAM,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC;YAChE,CAAC;iBAAM,CAAC;gBACN,OAAO,CAAC,GAAG,CAAC,sBAAsB,MAAM,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC;gBAC9D,IAAI,MAAM,CAAC,UAAU,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;oBACjC,OAAO,CAAC,GAAG,CAAC,OAAO,MAAM,CAAC,UAAU,CAAC,MAAM,eAAe,CAAC,CAAC;gBAC9D,CAAC;gBACD,IAAI,MAAM,CAAC,KAAK,EAAE,CAAC;oBACjB,OAAO,CAAC,GAAG,CAAC,cAAc,MAAM,CAAC,KAAK,EAAE,CAAC,CAAC;gBAC5C,CAAC;YACH,CAAC;QACH,CAAC;QAED,+BAA+B;QAC/B,MAAM,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC;QACpD,MAAM,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,IAAI,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC;QACjE,MAAM,OAAO,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC;QACpD,MAAM,eAAe,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,UAAU,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;QACjF,MAAM,YAAY,GAChB,OAAO,CAAC,MAAM,GAAG,CAAC;YAChB,CAAC,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,KAAK,EAAE,CAAC,CAAC,GAAG,OAAO,CAAC,MAAM;YAC/D,CAAC,CAAC,CAAC,CAAC;QAER,MAAM,MAAM,GAAqB;YAC/B,OAAO,EAAE,IAAI,CAAC,OAAO,CAAC,OAAO;YAC7B,KAAK,EAAE,IAAI,CAAC,OAAO,CAAC,KAAK;YACzB,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;YACnC,OAAO;YACP,OAAO,EAAE;gBACP,KAAK,EAAE,SAAS,CAAC,MAAM;gBACvB,MAAM;gBACN,MAAM;gBACN,OAAO;gBACP,YAAY;gBACZ,eAAe;aAChB;YACD,aAAa,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS;SACtC,CAAC;QAEF,OAAO,MAAM,CAAC;IAChB,CAAC;CACF;AAnRD,8BAmRC"}
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
export * from './types';
|
|
2
|
+
export { Evaluator, EvaluatorOptions } from './evaluator';
|
|
3
|
+
export { CopilotCLIAdapter } from './adapters/copilotCLI';
|
|
4
|
+
export { ClaudeCodeCLIAdapter } from './adapters/claudeCodeCLI';
|
|
5
|
+
export { PatternValidator } from './validators/patternValidator';
|
|
6
|
+
export { LLMJudgeValidator } from './validators/llmJudge';
|
|
7
|
+
export { ESLintValidator } from './validators/eslintValidator';
|
|
8
|
+
export { loadConfig } from './config/loader';
|
|
9
|
+
export { BaselineManager } from './utils/baselineManager';
|
|
10
|
+
export * from './utils/gitUtils';
|
|
11
|
+
export * from './utils/workspaceUtils';
|
|
12
|
+
export * from './utils/githubAuth';
|
|
13
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AACA,cAAc,SAAS,CAAC;AACxB,OAAO,EAAE,SAAS,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC;AAC1D,OAAO,EAAE,iBAAiB,EAAE,MAAM,uBAAuB,CAAC;AAC1D,OAAO,EAAE,oBAAoB,EAAE,MAAM,0BAA0B,CAAC;AAChE,OAAO,EAAE,gBAAgB,EAAE,MAAM,+BAA+B,CAAC;AACjE,OAAO,EAAE,iBAAiB,EAAE,MAAM,uBAAuB,CAAC;AAC1D,OAAO,EAAE,eAAe,EAAE,MAAM,8BAA8B,CAAC;AAC/D,OAAO,EAAE,UAAU,EAAE,MAAM,iBAAiB,CAAC;AAC7C,OAAO,EAAE,eAAe,EAAE,MAAM,yBAAyB,CAAC;AAC1D,cAAc,kBAAkB,CAAC;AACjC,cAAc,wBAAwB,CAAC;AACvC,cAAc,oBAAoB,CAAC"}
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// Module-interop helper: exposes property `k` of module object `m` on `o`
// under the name `k2` (defaults to `k`). An already-installed helper on `this`
// takes precedence.
var __createBinding = (this && this.__createBinding) || (function () {
    // Used when Object.create exists: defines the property via a descriptor.
    function bindWithDescriptor(o, m, k, k2) {
        var exportName = k2 === undefined ? k : k2;
        var desc = Object.getOwnPropertyDescriptor(m, k);
        var needsGetter;
        if (!desc) {
            needsGetter = true;
        }
        else if ("get" in desc) {
            // Accessor descriptors are reused as-is only for __esModule sources.
            needsGetter = !m.__esModule;
        }
        else {
            // Data property that can still change: read it through a getter.
            needsGetter = desc.writable || desc.configurable;
        }
        if (needsGetter) {
            desc = { enumerable: true, get: function () { return m[k]; } };
        }
        Object.defineProperty(o, exportName, desc);
    }
    // Fallback without Object.create: plain value copy.
    function bindByAssignment(o, m, k, k2) {
        var exportName = k2 === undefined ? k : k2;
        o[exportName] = m[k];
    }
    return Object.create ? bindWithDescriptor : bindByAssignment;
})();
|
|
13
|
+
// Re-exports every enumerable property of `m` onto `exports`, skipping
// "default" and any name `exports` already owns. An already-installed helper
// on `this` takes precedence.
var __exportStar = (this && this.__exportStar) || function (m, exports) {
    var ownsProperty = Object.prototype.hasOwnProperty;
    for (var name in m) {
        if (name === "default") {
            continue;
        }
        if (ownsProperty.call(exports, name)) {
            continue;
        }
        __createBinding(exports, m, name);
    }
};
|
|
16
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
17
|
+
exports.BaselineManager = exports.loadConfig = exports.ESLintValidator = exports.LLMJudgeValidator = exports.PatternValidator = exports.ClaudeCodeCLIAdapter = exports.CopilotCLIAdapter = exports.Evaluator = void 0;
|
|
18
|
+
__exportStar(require("./types"), exports);
|
|
19
|
+
var evaluator_1 = require("./evaluator");
|
|
20
|
+
Object.defineProperty(exports, "Evaluator", { enumerable: true, get: function () { return evaluator_1.Evaluator; } });
|
|
21
|
+
var copilotCLI_1 = require("./adapters/copilotCLI");
|
|
22
|
+
Object.defineProperty(exports, "CopilotCLIAdapter", { enumerable: true, get: function () { return copilotCLI_1.CopilotCLIAdapter; } });
|
|
23
|
+
var claudeCodeCLI_1 = require("./adapters/claudeCodeCLI");
|
|
24
|
+
Object.defineProperty(exports, "ClaudeCodeCLIAdapter", { enumerable: true, get: function () { return claudeCodeCLI_1.ClaudeCodeCLIAdapter; } });
|
|
25
|
+
var patternValidator_1 = require("./validators/patternValidator");
|
|
26
|
+
Object.defineProperty(exports, "PatternValidator", { enumerable: true, get: function () { return patternValidator_1.PatternValidator; } });
|
|
27
|
+
var llmJudge_1 = require("./validators/llmJudge");
|
|
28
|
+
Object.defineProperty(exports, "LLMJudgeValidator", { enumerable: true, get: function () { return llmJudge_1.LLMJudgeValidator; } });
|
|
29
|
+
var eslintValidator_1 = require("./validators/eslintValidator");
|
|
30
|
+
Object.defineProperty(exports, "ESLintValidator", { enumerable: true, get: function () { return eslintValidator_1.ESLintValidator; } });
|
|
31
|
+
var loader_1 = require("./config/loader");
|
|
32
|
+
Object.defineProperty(exports, "loadConfig", { enumerable: true, get: function () { return loader_1.loadConfig; } });
|
|
33
|
+
var baselineManager_1 = require("./utils/baselineManager");
|
|
34
|
+
Object.defineProperty(exports, "BaselineManager", { enumerable: true, get: function () { return baselineManager_1.BaselineManager; } });
|
|
35
|
+
__exportStar(require("./utils/gitUtils"), exports);
|
|
36
|
+
__exportStar(require("./utils/workspaceUtils"), exports);
|
|
37
|
+
__exportStar(require("./utils/githubAuth"), exports);
|
|
38
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;AACA,0CAAwB;AACxB,yCAA0D;AAAjD,sGAAA,SAAS,OAAA;AAClB,oDAA0D;AAAjD,+GAAA,iBAAiB,OAAA;AAC1B,0DAAgE;AAAvD,qHAAA,oBAAoB,OAAA;AAC7B,kEAAiE;AAAxD,oHAAA,gBAAgB,OAAA;AACzB,kDAA0D;AAAjD,6GAAA,iBAAiB,OAAA;AAC1B,gEAA+D;AAAtD,kHAAA,eAAe,OAAA;AACxB,0CAA6C;AAApC,oGAAA,UAAU,OAAA;AACnB,2DAA0D;AAAjD,kHAAA,eAAe,OAAA;AACxB,mDAAiC;AACjC,yDAAuC;AACvC,qDAAmC"}
|
package/dist/runner.d.ts
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"runner.d.ts","sourceRoot":"","sources":["../src/runner.ts"],"names":[],"mappings":";AAEA;;GAEG"}
|
package/dist/runner.js
ADDED
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
"use strict";
|
|
3
|
+
/**
|
|
4
|
+
* CLI interface for coding-agent-benchmarks
|
|
5
|
+
*/
|
|
6
|
+
// Module-interop helper: exposes property `k` of module object `m` on `o`
// under the name `k2` (defaults to `k`). An already-installed helper on `this`
// takes precedence.
var __createBinding = (this && this.__createBinding) || (function () {
    // Used when Object.create exists: defines the property via a descriptor.
    function bindWithDescriptor(o, m, k, k2) {
        var exportName = k2 === undefined ? k : k2;
        var desc = Object.getOwnPropertyDescriptor(m, k);
        var needsGetter;
        if (!desc) {
            needsGetter = true;
        }
        else if ("get" in desc) {
            // Accessor descriptors are reused as-is only for __esModule sources.
            needsGetter = !m.__esModule;
        }
        else {
            // Data property that can still change: read it through a getter.
            needsGetter = desc.writable || desc.configurable;
        }
        if (needsGetter) {
            desc = { enumerable: true, get: function () { return m[k]; } };
        }
        Object.defineProperty(o, exportName, desc);
    }
    // Fallback without Object.create: plain value copy.
    function bindByAssignment(o, m, k, k2) {
        var exportName = k2 === undefined ? k : k2;
        o[exportName] = m[k];
    }
    return Object.create ? bindWithDescriptor : bindByAssignment;
})();
|
|
17
|
+
// Stores `v` as the "default" property of `o`: through an enumerable property
// descriptor when Object.create exists, by plain assignment otherwise. An
// already-installed helper on `this` takes precedence.
var __setModuleDefault = (this && this.__setModuleDefault) || (function () {
    function defineDefault(o, v) {
        var descriptor = { enumerable: true, value: v };
        Object.defineProperty(o, "default", descriptor);
    }
    function assignDefault(o, v) {
        o["default"] = v;
    }
    return Object.create ? defineDefault : assignDefault;
})();
|
|
22
|
+
// Wraps a required module in a namespace-like object: modules flagged
// __esModule are returned untouched; otherwise every own property except
// "default" is bound onto a fresh object whose "default" is the module itself.
// An already-installed helper on `this` takes precedence.
var __importStar = (this && this.__importStar) || (function () {
    // Key-listing function, chosen on first use and then reused.
    var listOwnKeys = null;
    function hasOwnLoop(obj) {
        var names = [];
        for (var key in obj) {
            if (Object.prototype.hasOwnProperty.call(obj, key)) {
                names[names.length] = key;
            }
        }
        return names;
    }
    function ownKeys(obj) {
        if (listOwnKeys === null) {
            listOwnKeys = Object.getOwnPropertyNames || hasOwnLoop;
        }
        return listOwnKeys(obj);
    }
    return function (mod) {
        if (mod && mod.__esModule) {
            return mod;
        }
        var result = {};
        if (mod != null) {
            var names = ownKeys(mod);
            for (var i = 0; i < names.length; i++) {
                if (names[i] !== "default") {
                    __createBinding(result, mod, names[i]);
                }
            }
        }
        __setModuleDefault(result, mod);
        return result;
    };
})();
|
|
39
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
40
|
+
const commander_1 = require("commander");
|
|
41
|
+
const fs = __importStar(require("fs"));
|
|
42
|
+
const loader_1 = require("./config/loader");
|
|
43
|
+
const evaluator_1 = require("./evaluator");
|
|
44
|
+
const copilotCLI_1 = require("./adapters/copilotCLI");
|
|
45
|
+
const claudeCodeCLI_1 = require("./adapters/claudeCodeCLI");
|
|
46
|
+
const llmJudge_1 = require("./validators/llmJudge");
|
|
47
|
+
const githubAuth_1 = require("./utils/githubAuth");
|
|
48
|
+
// Root commander program carrying the CLI's name, description and version;
// the sub-commands below are registered on it.
const program = new commander_1.Command()
    .name('coding-agent-benchmarks')
    .description('Evaluate coding agents against coding standards and best practices')
    .version('0.1.0');
|
|
53
|
+
/**
 * Evaluate command
 *
 * Loads the configuration and scenarios, builds an Evaluator from the CLI
 * options, checks that the adapter is available, applies the scenario /
 * category / tag filters, runs the evaluation and prints a summary.
 * With --output the report is also written to a file as JSON.
 *
 * Exit status is 1 when --threshold is not a number, the adapter is
 * unavailable, any scenario failed or errored, or an unexpected error is
 * thrown.
 */
program
    .command('evaluate')
    .description('Run benchmark evaluations')
    .option('--scenario <pattern>', 'Filter scenarios by ID pattern (supports wildcards)')
    .option('--category <categories>', 'Filter by category (comma-separated)')
    .option('--tag <tags>', 'Filter by tags (comma-separated)')
    .option('--adapter <type>', 'Code generation adapter (copilot or claude-code)', 'copilot')
    .option('--model <model>', 'LLM model for judge (default: openai/gpt-4.1)')
    .option('--threshold <number>', 'Minimum passing score', '0.8')
    .option('--verbose', 'Show detailed output')
    .option('--output <file>', 'Export JSON report to file')
    .option('--save-baseline', 'Save results as baseline')
    .option('--compare-baseline', 'Compare results with baseline')
    .option('--workspace-root <path>', 'Workspace root directory')
    .action(async (options) => {
    try {
        // --threshold arrives as a string (default '0.8'). Parse it up front
        // so a typo is reported instead of being silently ignored.
        const threshold = Number.parseFloat(options.threshold);
        if (!Number.isFinite(threshold)) {
            console.error(`Error: --threshold must be a number, got "${options.threshold}"`);
            process.exit(1);
        }
        // Load configuration
        const { config, scenarios } = await (0, loader_1.loadConfig)(options.workspaceRoot || process.cwd());
        // Create evaluator
        const evaluator = new evaluator_1.Evaluator({
            adapter: options.adapter,
            model: options.model,
            workspaceRoot: options.workspaceRoot,
            defaultTimeout: config.defaultTimeout,
            verbose: options.verbose,
            saveBaseline: options.saveBaseline,
            compareBaseline: options.compareBaseline,
            // Forward the parsed pass threshold in the evaluator options
            // (previously the flag was declared but never read).
            threshold,
        });
        // Check adapter availability
        const isAvailable = await evaluator.checkAdapterAvailability();
        if (!isAvailable) {
            console.error(`Error: ${options.adapter} CLI not found`);
            console.error(`Please install ${options.adapter} CLI to use this adapter`);
            process.exit(1);
        }
        // Filter scenarios
        const filteredScenarios = evaluator.filterScenarios(scenarios, {
            scenarioPattern: options.scenario,
            category: options.category,
            tags: options.tag ? options.tag.split(',').map((t) => t.trim()) : undefined,
        });
        if (filteredScenarios.length === 0) {
            console.log('No scenarios match the specified filters');
            return;
        }
        // Run evaluation
        const report = await evaluator.evaluate(filteredScenarios);
        const { summary } = report;
        // Display summary
        console.log('\n' + '='.repeat(60));
        console.log('EVALUATION SUMMARY');
        console.log('='.repeat(60));
        console.log(`Total scenarios: ${summary.total}`);
        console.log(`Passed: ${summary.passed}`);
        console.log(`Failed: ${summary.failed}`);
        console.log(`Skipped: ${summary.skipped}`);
        console.log(`Average score: ${summary.averageScore.toFixed(2)}`);
        console.log(`Total violations: ${summary.totalViolations}`);
        console.log(`Total duration: ${(report.totalDuration / 1000).toFixed(1)}s`);
        console.log('='.repeat(60));
        // Export JSON report if requested
        if (options.output) {
            fs.writeFileSync(options.output, JSON.stringify(report, null, 2), 'utf-8');
            console.log(`\nReport exported to: ${options.output}`);
        }
        // Exit with error code if any scenarios failed or errored
        // (errored scenarios are counted under `skipped`).
        if (summary.failed > 0 || summary.skipped > 0) {
            process.exit(1);
        }
    }
    catch (error) {
        console.error(`Error: ${error}`);
        process.exit(1);
    }
});
|
|
130
|
+
/**
 * `list` command.
 *
 * Loads the configured scenarios, optionally narrows them by category and/or
 * tag, and prints a short summary entry for each remaining scenario.
 *
 * Options:
 *   --category <categories>  Comma-separated category names; a scenario is
 *                            kept when its category equals one of them.
 *   --tag <tags>             Comma-separated tags; a scenario is kept when it
 *                            carries at least one of them.
 *
 * Any error raised while loading or printing is written to stderr and the
 * process exits with status 1.
 */
program
    .command('list')
    .description('List available test scenarios')
    .option('--category <categories>', 'Filter by category')
    .option('--tag <tags>', 'Filter by tags (comma-separated)')
    .action(async (options) => {
    // Turns "a, b ,c" into ['a', 'b', 'c'].
    const splitList = (raw) => raw.split(',').map((item) => item.trim());
    try {
        const { scenarios: allScenarios } = await (0, loader_1.loadConfig)();
        let selected = allScenarios;
        // Narrow by category when requested.
        if (options.category) {
            const wantedCategories = splitList(options.category);
            selected = selected.filter((scenario) => wantedCategories.includes(scenario.category));
        }
        // Narrow by tag when requested: any single matching tag is enough.
        if (options.tag) {
            const wantedTags = splitList(options.tag);
            selected = selected.filter((scenario) => wantedTags.some((wanted) => scenario.tags.includes(wanted)));
        }
        console.log(`\nAvailable scenarios (${selected.length}):\n`);
        for (const { id, category, severity, tags, description } of selected) {
            console.log(` ${id}`);
            console.log(` Category: ${category}`);
            console.log(` Severity: ${severity}`);
            console.log(` Tags: ${tags.join(', ')}`);
            console.log(` Description: ${description}`);
            // Blank separator line between entries.
            console.log();
        }
    }
    catch (error) {
        console.error(`Error: ${error}`);
        process.exit(1);
    }
});
|
|
167
|
+
/**
 * `check` command.
 *
 * Reports, for each supported coding-agent CLI adapter, whether its
 * `checkAvailability()` call resolves truthy, then prints the result of
 * `checkGitHubAuth()` together with setup hints when auth is unavailable.
 *
 * Error handling mirrors the other commands in this file: anything thrown or
 * rejected while checking is written to stderr as `Error: …` and the process
 * exits with status 1, instead of surfacing as an unhandled promise rejection.
 */
program
    .command('check')
    .description('Check if coding agent CLIs and GitHub auth are available')
    .action(async () => {
    try {
        console.log('Checking adapter availability...\n');
        // Each entry carries its own factory, so adding an adapter is a single
        // line and no unknown "type" can silently fall through to another one.
        const adapters = [
            { name: 'GitHub Copilot CLI', create: () => new copilotCLI_1.CopilotCLIAdapter() },
            { name: 'Claude Code CLI', create: () => new claudeCodeCLI_1.ClaudeCodeCLIAdapter() },
        ];
        // Checked one after another on purpose so output order is stable.
        for (const { name, create } of adapters) {
            const available = await create().checkAvailability();
            const status = available ? '✓ Available' : '✗ Not found';
            console.log(` ${name}: ${status}`);
        }
        console.log('\nChecking GitHub authentication...\n');
        const authStatus = (0, githubAuth_1.checkGitHubAuth)();
        const authIcon = authStatus.available ? '✓' : '✗';
        console.log(` ${authIcon} ${authStatus.message}`);
        if (!authStatus.available) {
            console.log('\n 💡 GitHub token is required for LLM-as-judge validation');
            console.log(' Setup: https://github.com/settings/tokens (scope: models:read)');
            console.log(' Or install GitHub CLI: brew install gh && gh auth login');
        }
        console.log();
    }
    catch (error) {
        console.error(`Error: ${error}`);
        process.exit(1);
    }
});
|
|
202
|
+
/**
 * `test-llm` command.
 *
 * Reads a prompt from stdin until end-of-input, passes it to
 * `LLMJudgeValidator#testJudge`, and prints the returned value.
 *
 * Options:
 *   --model <model>  Forwarded unchanged to both the `LLMJudgeValidator`
 *                    constructor and `testJudge`.
 *
 * The whole flow (reading stdin and calling the judge) is awaited inside the
 * `try` block, so a stdin error or a rejected `testJudge` call is reported as
 * `Error: …` with exit status 1 rather than escaping as an unhandled rejection
 * from a detached event listener.
 */
program
    .command('test-llm')
    .description('Test LLM judge with a custom prompt')
    .option('--model <model>', 'LLM model to use (default: openai/gpt-4.1)')
    .action(async (options) => {
    try {
        console.log('Testing LLM judge...\n');
        console.log('Enter your prompt (Ctrl+D when done):\n');
        // Adapt the stream's callback API to a promise so it can be awaited here.
        const prompt = await new Promise((resolve, reject) => {
            const chunks = [];
            // Decode at the stream level: multi-byte UTF-8 characters that straddle
            // a chunk boundary are then reassembled instead of being corrupted by a
            // per-chunk toString().
            process.stdin.setEncoding('utf8');
            process.stdin.on('data', (chunk) => {
                chunks.push(chunk);
            });
            process.stdin.once('end', () => resolve(chunks.join('')));
            process.stdin.once('error', reject);
        });
        const validator = new llmJudge_1.LLMJudgeValidator(undefined, options.model);
        const result = await validator.testJudge(prompt, options.model);
        console.log('\nLLM Response:\n');
        console.log(result);
    }
    catch (error) {
        console.error(`Error: ${error}`);
        process.exit(1);
    }
});
|
|
231
|
+
// Parse arguments.
// Every command registered on `program` uses an async action handler, so use
// parseAsync(): it returns a promise that settles once the selected handler
// has finished. The catch keeps that promise from floating and reports a
// rejected handler the same way the commands themselves do.
program.parseAsync(process.argv).catch((error) => {
    console.error(`Error: ${error}`);
    process.exit(1);
});
|
|
233
|
+
//# sourceMappingURL=runner.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"runner.js","sourceRoot":"","sources":["../src/runner.ts"],"names":[],"mappings":";;AAEA;;GAEG;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAEH,yCAAoC;AACpC,uCAAyB;AACzB,4CAA6C;AAC7C,2CAAwC;AAExC,sDAA0D;AAC1D,4DAAgE;AAChE,oDAA0D;AAC1D,mDAAqD;AAErD,MAAM,OAAO,GAAG,IAAI,mBAAO,EAAE,CAAC;AAE9B,OAAO;KACJ,IAAI,CAAC,yBAAyB,CAAC;KAC/B,WAAW,CAAC,oEAAoE,CAAC;KACjF,OAAO,CAAC,OAAO,CAAC,CAAC;AAEpB;;GAEG;AACH,OAAO;KACJ,OAAO,CAAC,UAAU,CAAC;KACnB,WAAW,CAAC,2BAA2B,CAAC;KACxC,MAAM,CAAC,sBAAsB,EAAE,qDAAqD,CAAC;KACrF,MAAM,CAAC,yBAAyB,EAAE,sCAAsC,CAAC;KACzE,MAAM,CAAC,cAAc,EAAE,kCAAkC,CAAC;KAC1D,MAAM,CAAC,kBAAkB,EAAE,kDAAkD,EAAE,SAAS,CAAC;KACzF,MAAM,CAAC,iBAAiB,EAAE,+CAA+C,CAAC;KAC1E,MAAM,CAAC,sBAAsB,EAAE,uBAAuB,EAAE,KAAK,CAAC;KAC9D,MAAM,CAAC,WAAW,EAAE,sBAAsB,CAAC;KAC3C,MAAM,CAAC,iBAAiB,EAAE,4BAA4B,CAAC;KACvD,MAAM,CAAC,iBAAiB,EAAE,0BAA0B,CAAC;KACrD,MAAM,CAAC,oBAAoB,EAAE,+BAA+B,CAAC;KAC7D,MAAM,CAAC,yBAAyB,EAAE,0BAA0B,CAAC;KAC7D,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;IACxB,IAAI,CAAC;QACH,qBAAqB;QACrB,MAAM,EAAE,MAAM,EAAE,SAAS,EAAE,GAAG,MAAM,IAAA,mBAAU,EAAC,OAAO,CAAC,aAAa,IAAI,OAAO,CAAC,GAAG,EAAE,CAAC,CAAC;QAEvF,mBAAmB;QACnB,MAAM,SAAS,GAAG,IAAI,qBAAS,CAAC;YAC9B,OAAO,EAAE,OAAO,CAAC,OAAsB;YACvC,KAAK,EAAE,OAAO,CAAC,KAAK;YACpB,aAAa,EAAE,OAAO,CAAC,aAAa;YACpC,cAAc,EAAE,MAAM,CAAC,cAAc;YACrC,OAAO,EAAE,OAAO,CAAC,OAAO;YACxB,YAAY,EAAE,OAAO,CAAC,YAAY;YAClC,eAAe,EAAE,OAAO,CAAC,eAAe;SACzC,CAAC,CAAC;QAEH,6BAA6B;QAC7B,MAAM,WAAW,GAAG,MAAM,SAAS,CAAC,wBAAwB,EAAE,CAAC;QAC/D,IAAI,CAAC,WAAW,EAAE,CAAC;YACjB,OAAO,CAAC,KAAK,CAAC,UAAU,OAAO,CAAC,OAAO,gBAAgB,CAAC,CAAC;YACzD,OAAO,CAAC,KAAK,CAAC,kBAAkB,OAAO,CAAC,OAAO,0BAA0B,CAAC,CAAC;YAC3E,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAClB,CAAC;QAED,mBAAmB;QACnB,MAAM,iBAAiB,GAAG,SAAS,CAAC,eAAe,CAAC,SAAS,EAAE;YAC7D,eAAe,EAAE,OAAO,CAAC,QAAQ;YACjC,QAAQ,EAAE,OAAO,CAAC,QAAQ;YAC1B,IAAI,EAAE,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAS,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC,SAAS;SACpF,CAAC,CAAC;QAEH,IAAI,iBAAiB,CAAC,M
AAM,KAAK,CAAC,EAAE,CAAC;YACnC,OAAO,CAAC,GAAG,CAAC,0CAA0C,CAAC,CAAC;YACxD,OAAO;QACT,CAAC;QAED,iBAAiB;QACjB,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,QAAQ,CAAC,iBAAiB,CAAC,CAAC;QAE3D,kBAAkB;QAClB,OAAO,CAAC,GAAG,CAAC,IAAI,GAAG,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC;QACnC,OAAO,CAAC,GAAG,CAAC,oBAAoB,CAAC,CAAC;QAClC,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC;QAC5B,OAAO,CAAC,GAAG,CAAC,oBAAoB,MAAM,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC,CAAC;QACxD,OAAO,CAAC,GAAG,CAAC,WAAW,MAAM,CAAC,OAAO,CAAC,MAAM,EAAE,CAAC,CAAC;QAChD,OAAO,CAAC,GAAG,CAAC,WAAW,MAAM,CAAC,OAAO,CAAC,MAAM,EAAE,CAAC,CAAC;QAChD,OAAO,CAAC,GAAG,CAAC,YAAY,MAAM,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC,CAAC;QAClD,OAAO,CAAC,GAAG,CAAC,kBAAkB,MAAM,CAAC,OAAO,CAAC,YAAY,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;QACxE,OAAO,CAAC,GAAG,CAAC,qBAAqB,MAAM,CAAC,OAAO,CAAC,eAAe,EAAE,CAAC,CAAC;QACnE,OAAO,CAAC,GAAG,CAAC,mBAAmB,CAAC,MAAM,CAAC,aAAa,GAAG,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC;QAC5E,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC;QAE5B,kCAAkC;QAClC,IAAI,OAAO,CAAC,MAAM,EAAE,CAAC;YACnB,EAAE,CAAC,aAAa,CAAC,OAAO,CAAC,MAAM,EAAE,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC;YAC3E,OAAO,CAAC,GAAG,CAAC,yBAAyB,OAAO,CAAC,MAAM,EAAE,CAAC,CAAC;QACzD,CAAC;QAED,+CAA+C;QAC/C,IAAI,MAAM,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,OAAO,GAAG,CAAC,EAAE,CAAC;YAC5D,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAClB,CAAC;IACH,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,OAAO,CAAC,KAAK,CAAC,UAAU,KAAK,EAAE,CAAC,CAAC;QACjC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;AACH,CAAC,CAAC,CAAC;AAEL;;GAEG;AACH,OAAO;KACJ,OAAO,CAAC,MAAM,CAAC;KACf,WAAW,CAAC,+BAA+B,CAAC;KAC5C,MAAM,CAAC,yBAAyB,EAAE,oBAAoB,CAAC;KACvD,MAAM,CAAC,cAAc,EAAE,kCAAkC,CAAC;KAC1D,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;IACxB,IAAI,CAAC;QACH,MAAM,EAAE,SAAS,EAAE,GAAG,MAAM,IAAA,mBAAU,GAAE,CAAC;QAEzC,IAAI,QAAQ,GAAG,SAAS,CAAC;QAEzB,qBAAqB;QACrB,IAAI,OAAO,CAAC,QAAQ,EAAE,CAAC;YACrB,MAAM,UAAU,GAAG,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAS,EAAE,EAAE,CAAC,CAAC,CA
AC,IAAI,EAAE,CAAC,CAAC;YAC5E,QAAQ,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,UAAU,CAAC,QAAQ,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC;QACnE,CAAC;QAED,iBAAiB;QACjB,IAAI,OAAO,CAAC,GAAG,EAAE,CAAC;YAChB,MAAM,IAAI,GAAG,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAS,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;YACjE,QAAQ,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,GAAW,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;QACpF,CAAC;QAED,OAAO,CAAC,GAAG,CAAC,0BAA0B,QAAQ,CAAC,MAAM,MAAM,CAAC,CAAC;QAE7D,KAAK,MAAM,QAAQ,IAAI,QAAQ,EAAE,CAAC;YAChC,OAAO,CAAC,GAAG,CAAC,KAAK,QAAQ,CAAC,EAAE,EAAE,CAAC,CAAC;YAChC,OAAO,CAAC,GAAG,CAAC,iBAAiB,QAAQ,CAAC,QAAQ,EAAE,CAAC,CAAC;YAClD,OAAO,CAAC,GAAG,CAAC,iBAAiB,QAAQ,CAAC,QAAQ,EAAE,CAAC,CAAC;YAClD,OAAO,CAAC,GAAG,CAAC,aAAa,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YACrD,OAAO,CAAC,GAAG,CAAC,oBAAoB,QAAQ,CAAC,WAAW,EAAE,CAAC,CAAC;YACxD,OAAO,CAAC,GAAG,EAAE,CAAC;QAChB,CAAC;IACH,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,OAAO,CAAC,KAAK,CAAC,UAAU,KAAK,EAAE,CAAC,CAAC;QACjC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;AACH,CAAC,CAAC,CAAC;AAEL;;GAEG;AACH,OAAO;KACJ,OAAO,CAAC,OAAO,CAAC;KAChB,WAAW,CAAC,0DAA0D,CAAC;KACvE,MAAM,CAAC,KAAK,IAAI,EAAE;IACjB,OAAO,CAAC,GAAG,CAAC,oCAAoC,CAAC,CAAC;IAElD,MAAM,QAAQ,GAA+C;QAC3D,EAAE,IAAI,EAAE,oBAAoB,EAAE,IAAI,EAAE,SAAS,EAAE;QAC/C,EAAE,IAAI,EAAE,iBAAiB,EAAE,IAAI,EAAE,aAAa,EAAE;KACjD,CAAC;IAEF,KAAK,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,QAAQ,EAAE,CAAC;QACtC,IAAI,OAAO,CAAC;QACZ,IAAI,IAAI,KAAK,SAAS,EAAE,CAAC;YACvB,OAAO,GAAG,IAAI,8BAAiB,EAAE,CAAC;QACpC,CAAC;aAAM,CAAC;YACN,OAAO,GAAG,IAAI,oCAAoB,EAAE,CAAC;QACvC,CAAC;QAED,MAAM,SAAS,GAAG,MAAM,OAAO,CAAC,iBAAiB,EAAE,CAAC;QACpD,MAAM,MAAM,GAAG,SAAS,CAAC,CAAC,CAAC,aAAa,CAAC,CAAC,CAAC,aAAa,CAAC;QACzD,OAAO,CAAC,GAAG,CAAC,KAAK,IAAI,KAAK,MAAM,EAAE,CAAC,CAAC;IACtC,CAAC;IAED,OAAO,CAAC,GAAG,CAAC,uCAAuC,CAAC,CAAC;IACrD,MAAM,UAAU,GAAG,IAAA,4BAAe,GAAE,CAAC;IACrC,MAAM,QAAQ,GAAG,UAAU,CAAC,SAAS,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC;IAClD,
OAAO,CAAC,GAAG,CAAC,KAAK,QAAQ,IAAI,UAAU,CAAC,OAAO,EAAE,CAAC,CAAC;IAEnD,IAAI,CAAC,UAAU,CAAC,SAAS,EAAE,CAAC;QAC1B,OAAO,CAAC,GAAG,CAAC,6DAA6D,CAAC,CAAC;QAC3E,OAAO,CAAC,GAAG,CAAC,kEAAkE,CAAC,CAAC;QAChF,OAAO,CAAC,GAAG,CAAC,2DAA2D,CAAC,CAAC;IAC3E,CAAC;IAED,OAAO,CAAC,GAAG,EAAE,CAAC;AAChB,CAAC,CAAC,CAAC;AAEL;;GAEG;AACH,OAAO;KACJ,OAAO,CAAC,UAAU,CAAC;KACnB,WAAW,CAAC,qCAAqC,CAAC;KAClD,MAAM,CAAC,iBAAiB,EAAE,4CAA4C,CAAC;KACvE,MAAM,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE;IACxB,IAAI,CAAC;QACH,OAAO,CAAC,GAAG,CAAC,wBAAwB,CAAC,CAAC;QACtC,OAAO,CAAC,GAAG,CAAC,yCAAyC,CAAC,CAAC;QAEvD,yBAAyB;QACzB,MAAM,MAAM,GAAa,EAAE,CAAC;QAC5B,OAAO,CAAC,KAAK,CAAC,EAAE,CAAC,MAAM,EAAE,CAAC,KAAK,EAAE,EAAE;YACjC,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,QAAQ,EAAE,CAAC,CAAC;QAChC,CAAC,CAAC,CAAC;QAEH,OAAO,CAAC,KAAK,CAAC,EAAE,CAAC,KAAK,EAAE,KAAK,IAAI,EAAE;YACjC,MAAM,MAAM,GAAG,MAAM,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YAE/B,MAAM,SAAS,GAAG,IAAI,4BAAiB,CAAC,SAAS,EAAE,OAAO,CAAC,KAAK,CAAC,CAAC;YAClE,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,SAAS,CAAC,MAAM,EAAE,OAAO,CAAC,KAAK,CAAC,CAAC;YAEhE,OAAO,CAAC,GAAG,CAAC,mBAAmB,CAAC,CAAC;YACjC,OAAO,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC;QACtB,CAAC,CAAC,CAAC;IACL,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,OAAO,CAAC,KAAK,CAAC,UAAU,KAAK,EAAE,CAAC,CAAC;QACjC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;AACH,CAAC,CAAC,CAAC;AAEL,kBAAkB;AAClB,OAAO,CAAC,KAAK,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC"}
|