coding-agent-benchmarks 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +474 -0
- package/dist/adapters/claudeCodeCLI.d.ts +19 -0
- package/dist/adapters/claudeCodeCLI.d.ts.map +1 -0
- package/dist/adapters/claudeCodeCLI.js +106 -0
- package/dist/adapters/claudeCodeCLI.js.map +1 -0
- package/dist/adapters/copilotCLI.d.ts +19 -0
- package/dist/adapters/copilotCLI.d.ts.map +1 -0
- package/dist/adapters/copilotCLI.js +104 -0
- package/dist/adapters/copilotCLI.js.map +1 -0
- package/dist/config/defaultScenarios.d.ts +6 -0
- package/dist/config/defaultScenarios.d.ts.map +1 -0
- package/dist/config/defaultScenarios.js +209 -0
- package/dist/config/defaultScenarios.js.map +1 -0
- package/dist/config/loader.d.ts +13 -0
- package/dist/config/loader.d.ts.map +1 -0
- package/dist/config/loader.js +153 -0
- package/dist/config/loader.js.map +1 -0
- package/dist/evaluator.d.ts +45 -0
- package/dist/evaluator.d.ts.map +1 -0
- package/dist/evaluator.js +226 -0
- package/dist/evaluator.js.map +1 -0
- package/dist/index.d.ts +13 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +38 -0
- package/dist/index.js.map +1 -0
- package/dist/runner.d.ts +6 -0
- package/dist/runner.d.ts.map +1 -0
- package/dist/runner.js +233 -0
- package/dist/runner.js.map +1 -0
- package/dist/types.d.ts +354 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +6 -0
- package/dist/types.js.map +1 -0
- package/dist/utils/baselineManager.d.ts +53 -0
- package/dist/utils/baselineManager.d.ts.map +1 -0
- package/dist/utils/baselineManager.js +220 -0
- package/dist/utils/baselineManager.js.map +1 -0
- package/dist/utils/gitUtils.d.ts +39 -0
- package/dist/utils/gitUtils.d.ts.map +1 -0
- package/dist/utils/gitUtils.js +121 -0
- package/dist/utils/gitUtils.js.map +1 -0
- package/dist/utils/githubAuth.d.ts +22 -0
- package/dist/utils/githubAuth.d.ts.map +1 -0
- package/dist/utils/githubAuth.js +79 -0
- package/dist/utils/githubAuth.js.map +1 -0
- package/dist/utils/workspaceUtils.d.ts +32 -0
- package/dist/utils/workspaceUtils.d.ts.map +1 -0
- package/dist/utils/workspaceUtils.js +121 -0
- package/dist/utils/workspaceUtils.js.map +1 -0
- package/dist/validators/eslintValidator.d.ts +22 -0
- package/dist/validators/eslintValidator.d.ts.map +1 -0
- package/dist/validators/eslintValidator.js +217 -0
- package/dist/validators/eslintValidator.js.map +1 -0
- package/dist/validators/llmJudge.d.ts +28 -0
- package/dist/validators/llmJudge.d.ts.map +1 -0
- package/dist/validators/llmJudge.js +241 -0
- package/dist/validators/llmJudge.js.map +1 -0
- package/dist/validators/patternValidator.d.ts +27 -0
- package/dist/validators/patternValidator.d.ts.map +1 -0
- package/dist/validators/patternValidator.js +233 -0
- package/dist/validators/patternValidator.js.map +1 -0
- package/package.json +50 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 coding-agent-benchmarks contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,474 @@
|
|
|
1
|
+
# coding-agent-benchmarks
|
|
2
|
+
|
|
3
|
+
Open-source framework for evaluating how well AI coding assistants (like GitHub Copilot CLI or Claude Code) follow your coding standards. Here's the workflow:
|
|
4
|
+
1. You give it a prompt → 2. AI generates code → 3. Library validates → 4. You get a score
|
|
5
|
+
|
|
6
|
+
## Features
|
|
7
|
+
|
|
8
|
+
- **Multiple Adapters**: Built-in support for GitHub Copilot CLI and Claude Code CLI
|
|
9
|
+
- **Flexible Validation**: Pattern-based validation, LLM-as-judge semantic evaluation, and ESLint integration
|
|
10
|
+
- **Baseline Tracking**: Save and compare evaluation results over time
|
|
11
|
+
- **Extensible**: Easy to add custom scenarios, validators, and adapters
|
|
12
|
+
- **CLI & Programmatic API**: Use as a command-line tool or integrate into your workflow
|
|
13
|
+
|
|
14
|
+
## Installation
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
npm install --save-dev coding-agent-benchmarks
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
## Quick Start
|
|
21
|
+
|
|
22
|
+
### 1. Create a Configuration File
|
|
23
|
+
|
|
24
|
+
Create a `benchmarks.config.js` file in your project root with your scenarios (see Configuration section below).
|
|
25
|
+
|
|
26
|
+
### 2. Check Adapter Availability
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
npx coding-agent-benchmarks check
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
This will show which coding agent CLIs are installed on your system.
|
|
33
|
+
|
|
34
|
+
### 3. List Your Scenarios
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
npx coding-agent-benchmarks list
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
### 4. Run Evaluations
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
# Run all scenarios with default adapter (Copilot)
|
|
44
|
+
npx coding-agent-benchmarks evaluate
|
|
45
|
+
|
|
46
|
+
# Run with Claude Code
|
|
47
|
+
npx coding-agent-benchmarks evaluate --adapter claude-code
|
|
48
|
+
|
|
49
|
+
# Run specific scenarios
|
|
50
|
+
npx coding-agent-benchmarks evaluate --scenario typescript-no-any
|
|
51
|
+
npx coding-agent-benchmarks evaluate --category typescript
|
|
52
|
+
npx coding-agent-benchmarks evaluate --tag best-practices
|
|
53
|
+
|
|
54
|
+
# Save as baseline for future comparison
|
|
55
|
+
npx coding-agent-benchmarks evaluate --save-baseline
|
|
56
|
+
|
|
57
|
+
# Compare with baseline
|
|
58
|
+
npx coding-agent-benchmarks evaluate --compare-baseline
|
|
59
|
+
|
|
60
|
+
# Export report as JSON
|
|
61
|
+
npx coding-agent-benchmarks evaluate --output report.json
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
## Configuration
|
|
65
|
+
|
|
66
|
+
**Configuration is required.** Create a `benchmarks.config.js` (or `.ts`) file in your project root with your test scenarios:
|
|
67
|
+
|
|
68
|
+
```javascript
|
|
69
|
+
module.exports = {
|
|
70
|
+
// Default adapter to use
|
|
71
|
+
defaultAdapter: 'copilot',
|
|
72
|
+
|
|
73
|
+
// Default LLM model for judge
|
|
74
|
+
defaultModel: 'openai/gpt-4.1',
|
|
75
|
+
|
|
76
|
+
// Default timeout for code generation (milliseconds)
|
|
77
|
+
// Individual scenarios can override this
|
|
78
|
+
// Default: 120000 (2 minutes)
|
|
79
|
+
defaultTimeout: 180000, // 3 minutes
|
|
80
|
+
|
|
81
|
+
// Workspace root (auto-detected if not specified)
|
|
82
|
+
workspaceRoot: process.cwd(),
|
|
83
|
+
|
|
84
|
+
// Define your test scenarios
|
|
85
|
+
scenarios: [
|
|
86
|
+
{
|
|
87
|
+
id: 'typescript-no-any',
|
|
88
|
+
category: 'typescript',
|
|
89
|
+
severity: 'critical',
|
|
90
|
+
tags: ['typescript', 'types', 'safety'],
|
|
91
|
+
description: 'Ensure TypeScript interfaces use explicit types instead of "any"',
|
|
92
|
+
prompt: 'Create a TypeScript interface called User with fields: id (number), name (string), email (string), and metadata (object with key-value pairs)',
|
|
93
|
+
validationStrategy: {
|
|
94
|
+
patterns: {
|
|
95
|
+
forbiddenPatterns: [/:\s*any\b/],
|
|
96
|
+
requiredPatterns: [/interface\s+User/],
|
|
97
|
+
},
|
|
98
|
+
},
|
|
99
|
+
timeout: 120000,
|
|
100
|
+
},
|
|
101
|
+
{
|
|
102
|
+
id: 'react-no-inline-styles',
|
|
103
|
+
category: 'react',
|
|
104
|
+
severity: 'major',
|
|
105
|
+
tags: ['react', 'styling', 'best-practices'],
|
|
106
|
+
description: 'Forbid inline style objects in React components',
|
|
107
|
+
prompt: 'Create a React functional component called Button that accepts a "label" prop and renders a styled button. Use CSS classes instead of inline styles.',
|
|
108
|
+
validationStrategy: {
|
|
109
|
+
patterns: {
|
|
110
|
+
forbiddenPatterns: [/style\s*=\s*\{\{/, /style\s*=\s*\{[^}]*\}/],
|
|
111
|
+
requiredPatterns: [/className/],
|
|
112
|
+
},
|
|
113
|
+
},
|
|
114
|
+
},
|
|
115
|
+
{
|
|
116
|
+
id: 'async-error-handling',
|
|
117
|
+
category: 'general',
|
|
118
|
+
severity: 'critical',
|
|
119
|
+
tags: ['async', 'error-handling', 'robustness'],
|
|
120
|
+
description: 'Ensure async functions have proper error handling',
|
|
121
|
+
prompt: 'Create an async function called fetchUserData that takes a userId parameter, makes an HTTP request to fetch user data, and returns the user object. Handle errors appropriately.',
|
|
122
|
+
validationStrategy: {
|
|
123
|
+
patterns: {
|
|
124
|
+
requiredPatterns: [/async\s+function\s+fetchUserData|const\s+fetchUserData.*async/, /try|catch|\.catch\(/],
|
|
125
|
+
},
|
|
126
|
+
llmJudge: {
|
|
127
|
+
enabled: true,
|
|
128
|
+
judgmentPrompt: 'Evaluate the error handling in this async function. Does it use try/catch or .catch()? Are errors logged or re-thrown appropriately?',
|
|
129
|
+
},
|
|
130
|
+
},
|
|
131
|
+
},
|
|
132
|
+
],
|
|
133
|
+
};
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
## Timeout Configuration
|
|
137
|
+
|
|
138
|
+
You can configure timeouts at three levels (in order of precedence):
|
|
139
|
+
|
|
140
|
+
1. **Per-scenario timeout**: Set `timeout` on individual scenarios (highest priority)
|
|
141
|
+
2. **Global default**: Set `defaultTimeout` in your config file
|
|
142
|
+
3. **Built-in default**: 120000ms (2 minutes) if nothing else is specified
|
|
143
|
+
|
|
144
|
+
```javascript
|
|
145
|
+
// In benchmarks.config.js
|
|
146
|
+
module.exports = {
|
|
147
|
+
// Global default applies to all scenarios
|
|
148
|
+
defaultTimeout: 180000, // 3 minutes
|
|
149
|
+
|
|
150
|
+
scenarios: [
|
|
151
|
+
{
|
|
152
|
+
id: 'quick-check',
|
|
153
|
+
prompt: '...',
|
|
154
|
+
timeout: 60000, // Override: 1 minute for this scenario
|
|
155
|
+
// ...
|
|
156
|
+
},
|
|
157
|
+
{
|
|
158
|
+
id: 'complex-task',
|
|
159
|
+
prompt: '...',
|
|
160
|
+
// Will use defaultTimeout (3 minutes)
|
|
161
|
+
// ...
|
|
162
|
+
},
|
|
163
|
+
],
|
|
164
|
+
};
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
**Why configure timeouts?**
|
|
168
|
+
- Complex code generation tasks may need more time
|
|
169
|
+
- Simple checks can complete faster with shorter timeouts
|
|
170
|
+
- Different AI models may have different response times
|
|
171
|
+
|
|
172
|
+
## Validation Strategies
|
|
173
|
+
|
|
174
|
+
### Pattern Validation
|
|
175
|
+
|
|
176
|
+
Regex-based validation for forbidden/required patterns:
|
|
177
|
+
|
|
178
|
+
```javascript
|
|
179
|
+
validationStrategy: {
|
|
180
|
+
patterns: {
|
|
181
|
+
// Patterns that should NOT appear
|
|
182
|
+
forbiddenPatterns: [/:\s*any\b/, /console\.log/],
|
|
183
|
+
|
|
184
|
+
// Patterns that MUST appear
|
|
185
|
+
requiredPatterns: [/interface\s+User/],
|
|
186
|
+
|
|
187
|
+
// Import statements that should NOT be present
|
|
188
|
+
forbiddenImports: ['from "lodash"'],
|
|
189
|
+
|
|
190
|
+
// Import statements that MUST be present
|
|
191
|
+
requiredImports: ['import React'],
|
|
192
|
+
|
|
193
|
+
// File name patterns that should NOT be created
|
|
194
|
+
forbiddenFileNamePatterns: [/\.test\.js$/],
|
|
195
|
+
|
|
196
|
+
// File name patterns that MUST be created
|
|
197
|
+
requiredFileNamePatterns: [/\.tsx?$/],
|
|
198
|
+
},
|
|
199
|
+
}
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
### LLM-as-Judge
|
|
203
|
+
|
|
204
|
+
Semantic evaluation using AI:
|
|
205
|
+
|
|
206
|
+
```javascript
|
|
207
|
+
validationStrategy: {
|
|
208
|
+
llmJudge: {
|
|
209
|
+
enabled: true,
|
|
210
|
+
model: 'openai/gpt-4.1', // or 'gpt-4o'
|
|
211
|
+
judgmentPrompt: `Evaluate if the code follows best practices...`,
|
|
212
|
+
},
|
|
213
|
+
}
|
|
214
|
+
```
|
|
215
|
+
|
|
216
|
+
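The LLM judge requires GitHub authentication with access to the GitHub Models API: either a `GITHUB_TOKEN` environment variable or a GitHub CLI login (see "GitHub Authentication (for LLM Judge)" below).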
|
|
217
|
+
|
|
218
|
+
### ESLint Integration
|
|
219
|
+
|
|
220
|
+
Run ESLint on generated code:
|
|
221
|
+
|
|
222
|
+
```javascript
|
|
223
|
+
validationStrategy: {
|
|
224
|
+
eslint: {
|
|
225
|
+
enabled: true,
|
|
226
|
+
configPath: '.eslintrc.js', // optional
|
|
227
|
+
},
|
|
228
|
+
}
|
|
229
|
+
```
|
|
230
|
+
|
|
231
|
+
## Scoring
|
|
232
|
+
|
|
233
|
+
Each scenario receives a score from 0.0 to 1.0:
|
|
234
|
+
|
|
235
|
+
- **1.0**: Perfect, no violations
|
|
236
|
+
- **0.8-0.99**: Minor issues
|
|
237
|
+
- **0.5-0.79**: Moderate issues
|
|
238
|
+
- **0.0-0.49**: Major issues or failed
|
|
239
|
+
|
|
240
|
+
Violations are weighted by severity:
|
|
241
|
+
- **Critical**: 1.0 weight
|
|
242
|
+
- **Major**: 0.7 weight
|
|
243
|
+
- **Minor**: 0.3 weight
|
|
244
|
+
|
|
245
|
+
## Baseline Tracking
|
|
246
|
+
|
|
247
|
+
Save current results as a baseline:
|
|
248
|
+
|
|
249
|
+
```bash
|
|
250
|
+
npx coding-agent-benchmarks evaluate --save-baseline
|
|
251
|
+
```
|
|
252
|
+
|
|
253
|
+
Baselines are stored in `.benchmarks/baselines/{adapter}/{model}/{scenario-id}.json`
|
|
254
|
+
|
|
255
|
+
Compare future runs against the baseline:
|
|
256
|
+
|
|
257
|
+
```bash
|
|
258
|
+
npx coding-agent-benchmarks evaluate --compare-baseline
|
|
259
|
+
```
|
|
260
|
+
|
|
261
|
+
The report will show score deltas and whether results improved or regressed.
|
|
262
|
+
|
|
263
|
+
## CLI Commands
|
|
264
|
+
|
|
265
|
+
### `evaluate`
|
|
266
|
+
|
|
267
|
+
Run benchmark evaluations.
|
|
268
|
+
|
|
269
|
+
**Options:**
|
|
270
|
+
- `--scenario <pattern>`: Filter by scenario ID (supports wildcards like `typescript-*`)
|
|
271
|
+
- `--category <categories>`: Filter by category (comma-separated)
|
|
272
|
+
- `--tag <tags>`: Filter by tags (comma-separated)
|
|
273
|
+
- `--adapter <type>`: Adapter to use (`copilot` or `claude-code`)
|
|
274
|
+
- `--model <model>`: LLM model for judge (default: `openai/gpt-4.1`)
|
|
275
|
+
- `--threshold <number>`: Minimum passing score (default: `0.8`)
|
|
276
|
+
- `--verbose`: Show detailed output
|
|
277
|
+
- `--output <file>`: Export JSON report
|
|
278
|
+
- `--save-baseline`: Save results as baseline
|
|
279
|
+
- `--compare-baseline`: Compare with baseline
|
|
280
|
+
- `--workspace-root <path>`: Workspace root directory
|
|
281
|
+
|
|
282
|
+
### `list`
|
|
283
|
+
|
|
284
|
+
List available test scenarios.
|
|
285
|
+
|
|
286
|
+
**Options:**
|
|
287
|
+
- `--category <categories>`: Filter by category
|
|
288
|
+
- `--tag <tags>`: Filter by tags
|
|
289
|
+
|
|
290
|
+
### `check`
|
|
291
|
+
|
|
292
|
+
Check if coding agent CLIs are available.
|
|
293
|
+
|
|
294
|
+
### `test-llm`
|
|
295
|
+
|
|
296
|
+
Test LLM judge with a custom prompt (for debugging).
|
|
297
|
+
|
|
298
|
+
**Options:**
|
|
299
|
+
- `--model <model>`: LLM model to use
|
|
300
|
+
|
|
301
|
+
## Programmatic Usage
|
|
302
|
+
|
|
303
|
+
You can also use the framework programmatically:
|
|
304
|
+
|
|
305
|
+
```typescript
|
|
306
|
+
import { Evaluator, loadConfig } from 'coding-agent-benchmarks';
|
|
307
|
+
|
|
308
|
+
async function runEvaluation() {
|
|
309
|
+
// Load configuration
|
|
310
|
+
const { scenarios } = await loadConfig();
|
|
311
|
+
|
|
312
|
+
// Create evaluator
|
|
313
|
+
const evaluator = new Evaluator({
|
|
314
|
+
adapter: 'copilot',
|
|
315
|
+
model: 'openai/gpt-4.1',
|
|
316
|
+
verbose: true,
|
|
317
|
+
});
|
|
318
|
+
|
|
319
|
+
// Check adapter availability
|
|
320
|
+
const available = await evaluator.checkAdapterAvailability();
|
|
321
|
+
if (!available) {
|
|
322
|
+
throw new Error('Adapter not available');
|
|
323
|
+
}
|
|
324
|
+
|
|
325
|
+
// Run evaluation
|
|
326
|
+
const report = await evaluator.evaluate(scenarios);
|
|
327
|
+
|
|
328
|
+
console.log(`Passed: ${report.summary.passed}/${report.summary.total}`);
|
|
329
|
+
console.log(`Average score: ${report.summary.averageScore.toFixed(2)}`);
|
|
330
|
+
}
|
|
331
|
+
|
|
332
|
+
runEvaluation();
|
|
333
|
+
```
|
|
334
|
+
|
|
335
|
+
## Creating Custom Validators
|
|
336
|
+
|
|
337
|
+
Implement the `CodeValidator` interface:
|
|
338
|
+
|
|
339
|
+
```typescript
|
|
340
|
+
import { CodeValidator, ValidationResult, TestScenario } from 'coding-agent-benchmarks';
|
|
341
|
+
|
|
342
|
+
export class CustomValidator implements CodeValidator {
|
|
343
|
+
public readonly type = 'custom';
|
|
344
|
+
|
|
345
|
+
async validate(
|
|
346
|
+
files: readonly string[],
|
|
347
|
+
scenario: TestScenario
|
|
348
|
+
): Promise<ValidationResult> {
|
|
349
|
+
// Your validation logic here
|
|
350
|
+
return {
|
|
351
|
+
passed: true,
|
|
352
|
+
score: 1.0,
|
|
353
|
+
violations: [],
|
|
354
|
+
validatorType: 'custom',
|
|
355
|
+
};
|
|
356
|
+
}
|
|
357
|
+
}
|
|
358
|
+
```
|
|
359
|
+
|
|
360
|
+
## Creating Custom Adapters
|
|
361
|
+
|
|
362
|
+
Implement the `CodeGenerationAdapter` interface:
|
|
363
|
+
|
|
364
|
+
```typescript
|
|
365
|
+
import { CodeGenerationAdapter, AdapterType } from 'coding-agent-benchmarks';
|
|
366
|
+
|
|
367
|
+
export class CustomAdapter implements CodeGenerationAdapter {
|
|
368
|
+
public readonly type: AdapterType = 'copilot'; // or extend the type
|
|
369
|
+
|
|
370
|
+
async checkAvailability(): Promise<boolean> {
|
|
371
|
+
// Check if CLI is available
|
|
372
|
+
return true;
|
|
373
|
+
}
|
|
374
|
+
|
|
375
|
+
async generate(
|
|
376
|
+
prompt: string,
|
|
377
|
+
contextFiles?: readonly string[],
|
|
378
|
+
timeout?: number
|
|
379
|
+
): Promise<string[]> {
|
|
380
|
+
// Generate code and return changed files
|
|
381
|
+
return ['path/to/generated/file.ts'];
|
|
382
|
+
}
|
|
383
|
+
}
|
|
384
|
+
```
|
|
385
|
+
|
|
386
|
+
## GitHub Authentication (for LLM Judge)
|
|
387
|
+
|
|
388
|
+
LLM-as-judge validation requires GitHub authentication to access the GitHub Models API. There are **two easy options** - no OAuth registration needed!
|
|
389
|
+
|
|
390
|
+
### Option 1: Personal Access Token (Recommended)
|
|
391
|
+
|
|
392
|
+
1. Create token at https://github.com/settings/tokens
|
|
393
|
+
2. Click "Generate new token (classic)"
|
|
394
|
+
3. Give it a name (e.g., "coding-agent-benchmarks")
|
|
395
|
+
4. Select scope: **`models:read`**
|
|
396
|
+
5. Generate and copy the token
|
|
397
|
+
6. Set environment variable:
|
|
398
|
+
```bash
|
|
399
|
+
export GITHUB_TOKEN=ghp_xxxxxxxxxxxxxxxxxxxx
|
|
400
|
+
```
|
|
401
|
+
|
|
402
|
+
### Option 2: GitHub CLI (Automatic)
|
|
403
|
+
|
|
404
|
+
If you have GitHub CLI installed, tokens are auto-detected:
|
|
405
|
+
|
|
406
|
+
```bash
|
|
407
|
+
# Install GitHub CLI
|
|
408
|
+
brew install gh # macOS
|
|
409
|
+
# or download from https://cli.github.com
|
|
410
|
+
|
|
411
|
+
# Authenticate (one time)
|
|
412
|
+
gh auth login
|
|
413
|
+
|
|
414
|
+
# Token will be used automatically - no GITHUB_TOKEN needed!
|
|
415
|
+
```
|
|
416
|
+
|
|
417
|
+
### Check Authentication Status
|
|
418
|
+
|
|
419
|
+
```bash
|
|
420
|
+
npx coding-agent-benchmarks check
|
|
421
|
+
```
|
|
422
|
+
|
|
423
|
+
Output:
|
|
424
|
+
```
|
|
425
|
+
Checking adapter availability...
|
|
426
|
+
GitHub Copilot CLI: ✓ Available
|
|
427
|
+
Claude Code CLI: ✗ Not found
|
|
428
|
+
|
|
429
|
+
Checking GitHub authentication...
|
|
430
|
+
✓ Using token from GitHub CLI (gh auth token)
|
|
431
|
+
```
|
|
432
|
+
|
|
433
|
+
## How It Works
|
|
434
|
+
|
|
435
|
+
1. **Code Generation**: The adapter spawns a coding agent CLI with a prompt
|
|
436
|
+
2. **File Tracking**: Git is used to detect which files were created/modified
|
|
437
|
+
3. **Validation**: Multiple validators check the generated code
|
|
438
|
+
4. **Scoring**: Results are aggregated and compared against thresholds
|
|
439
|
+
5. **Reporting**: Results are displayed in terminal and optionally exported as JSON
|
|
440
|
+
|
|
441
|
+
## Requirements
|
|
442
|
+
|
|
443
|
+
- Node.js >= 18.0.0
|
|
444
|
+
- Git repository (for file change tracking)
|
|
445
|
+
- At least one coding agent CLI installed (Copilot or Claude Code)
|
|
446
|
+
- (Optional) `GITHUB_TOKEN` or an authenticated GitHub CLI (`gh auth login`) for LLM judge validation
|
|
447
|
+
|
|
448
|
+
## Contributing
|
|
449
|
+
|
|
450
|
+
Contributions are welcome! Please:
|
|
451
|
+
|
|
452
|
+
1. Fork the repository
|
|
453
|
+
2. Create a feature branch
|
|
454
|
+
3. Make your changes
|
|
455
|
+
4. Add tests if applicable
|
|
456
|
+
5. Submit a pull request
|
|
457
|
+
|
|
458
|
+
## License
|
|
459
|
+
|
|
460
|
+
MIT License - see LICENSE file for details
|
|
461
|
+
|
|
462
|
+
## Acknowledgments
|
|
463
|
+
|
|
464
|
+
Inspired by the need for systematic evaluation of AI coding assistants. Built to help teams ensure their AI tools follow coding standards and best practices.
|
|
465
|
+
|
|
466
|
+
## Support
|
|
467
|
+
|
|
468
|
+
- Report issues: [GitHub Issues](https://github.com/yourusername/coding-agent-benchmarks/issues)
|
|
469
|
+
- Documentation: This README and inline JSDoc comments
|
|
470
|
+
- Examples: See `examples/` directory (coming soon)
|
|
471
|
+
|
|
472
|
+
---
|
|
473
|
+
|
|
474
|
+
**Happy benchmarking!** 🚀
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Claude Code CLI Adapter
|
|
3
|
+
*/
|
|
4
|
+
import { CodeGenerationAdapter } from '../types';
|
|
5
|
+
export declare class ClaudeCodeCLIAdapter implements CodeGenerationAdapter {
    /** Adapter discriminator; always the literal "claude-code". */
    readonly type: "claude-code";
    /** Workspace directory the adapter operates in. */
    private workspaceRoot;
    /**
     * @param workspaceRoot Optional workspace directory for the adapter.
     */
    constructor(workspaceRoot?: string);
    /**
     * Report whether the Claude Code CLI is available.
     * @returns A promise that resolves to `true` when the CLI is available.
     */
    checkAvailability(): Promise<boolean>;
    /**
     * Generate code with the Claude Code CLI.
     * @param prompt Instruction text handed to the CLI.
     * @param contextFiles Optional files whose contents accompany the prompt.
     * @param timeout Timeout in milliseconds; `null` (or omitted) means no timeout.
     * @returns A promise resolving to a list of file paths.
     */
    generate(prompt: string, contextFiles?: readonly string[], timeout?: number | null): Promise<string[]>;
}
|
|
19
|
+
//# sourceMappingURL=claudeCodeCLI.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"claudeCodeCLI.d.ts","sourceRoot":"","sources":["../../src/adapters/claudeCodeCLI.ts"],"names":[],"mappings":"AAAA;;GAEG;AAGH,OAAO,EAAE,qBAAqB,EAAE,MAAM,UAAU,CAAC;AAIjD,qBAAa,oBAAqB,YAAW,qBAAqB;IAChE,SAAgB,IAAI,EAAG,aAAa,CAAU;IAC9C,OAAO,CAAC,aAAa,CAAS;gBAElB,aAAa,CAAC,EAAE,MAAM;IAIlC;;OAEG;IACG,iBAAiB,IAAI,OAAO,CAAC,OAAO,CAAC;IAgB3C;;;OAGG;IACG,QAAQ,CACZ,MAAM,EAAE,MAAM,EACd,YAAY,CAAC,EAAE,SAAS,MAAM,EAAE,EAChC,OAAO,CAAC,EAAE,MAAM,GAAG,IAAI,GACtB,OAAO,CAAC,MAAM,EAAE,CAAC;CA8ErB"}
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Claude Code CLI Adapter
|
|
4
|
+
*/
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.ClaudeCodeCLIAdapter = void 0;
|
|
7
|
+
const child_process_1 = require("child_process");
|
|
8
|
+
const gitUtils_1 = require("../utils/gitUtils");
|
|
9
|
+
const workspaceUtils_1 = require("../utils/workspaceUtils");
|
|
10
|
+
class ClaudeCodeCLIAdapter {
    /**
     * @param workspaceRoot Optional workspace directory; it is passed through
     *   `resolveWorkspaceRoot` and the result is used as the CLI's working
     *   directory and as the git working tree that is reset and inspected.
     */
    constructor(workspaceRoot) {
        this.type = 'claude-code';
        this.workspaceRoot = (0, workspaceUtils_1.resolveWorkspaceRoot)(workspaceRoot);
    }
    /**
     * Check if Claude Code CLI is available by running `which claude`.
     *
     * @returns Promise resolving to `true` when `which` exits with code 0,
     *   and to `false` on a non-zero exit or when `which` itself cannot be
     *   spawned. Never rejects.
     */
    async checkAvailability() {
        return new Promise((resolve) => {
            const proc = (0, child_process_1.spawn)('which', ['claude'], {
                stdio: 'pipe',
            });
            proc.on('close', (code) => {
                resolve(code === 0);
            });
            proc.on('error', () => {
                resolve(false);
            });
        });
    }
    /**
     * Generate code using Claude Code CLI
     *
     * Steps: reset the git working directory (best effort), append any
     * readable context files to the prompt, run `claude --non-interactive`
     * in the workspace, then report the files git sees as changed.
     *
     * @param prompt Instruction text for the CLI.
     * @param contextFiles Optional files (read via `readContextFiles`) whose
     *   contents are appended to the prompt inside fenced sections.
     * @param timeout Timeout in milliseconds, or null for no timeout
     * @returns Promise resolving to the result of `getChangedFiles` for the
     *   workspace.
     * @throws (rejects) when the CLI cannot be spawned, exits non-zero, times
     *   out, or when the changed-file lookup fails.
     */
    async generate(prompt, contextFiles, timeout) {
        // Reset workspace to clean state before generation. A failure here is
        // deliberately non-fatal: it is logged and generation still proceeds.
        try {
            (0, gitUtils_1.resetGitWorkingDirectory)(this.workspaceRoot);
        }
        catch (error) {
            console.warn('Warning: Could not reset git working directory:', error);
        }
        // Build the full prompt with context
        let fullPrompt = prompt;
        if (contextFiles && contextFiles.length > 0) {
            const contexts = (0, workspaceUtils_1.readContextFiles)(this.workspaceRoot, contextFiles);
            if (contexts.length > 0) {
                const contextSection = contexts
                    .map(ctx => `\n\n### Context from ${ctx.path}:\n\`\`\`\n${ctx.content}\n\`\`\``)
                    .join('\n');
                fullPrompt = `${prompt}${contextSection}`;
            }
        }
        // Spawn the claude CLI process
        // Note: Claude Code CLI may require different flags or approach
        // This is a basic implementation that may need adjustment
        return new Promise((resolve, reject) => {
            // Deliberately NOT using `shell: true`. With a shell, Node joins the
            // arguments into a single command line, so the prompt (caller text
            // plus file contents wrapped in backtick fences) would be
            // word-split and subject to `$` expansion and backtick command
            // substitution. Passing the argument vector directly delivers the
            // prompt verbatim as one argument and means the SIGTERM below
            // reaches the CLI process itself rather than a wrapper shell.
            const proc = (0, child_process_1.spawn)('claude', ['--non-interactive', fullPrompt], {
                cwd: this.workspaceRoot,
                stdio: ['pipe', 'pipe', 'pipe'],
            });
            let stderr = '';
            // stdout content is not used, but the pipe must be drained so the
            // child cannot block on a full buffer.
            proc.stdout?.resume();
            proc.stderr?.on('data', (data) => {
                stderr += data.toString();
            });
            // Set timeout only if specified (null/undefined = no timeout)
            let timeoutHandle = null;
            if (timeout !== null && timeout !== undefined) {
                timeoutHandle = setTimeout(() => {
                    proc.kill('SIGTERM');
                    reject(new Error(`Claude Code CLI timed out after ${timeout}ms`));
                }, timeout);
            }
            proc.on('close', (code) => {
                if (timeoutHandle) {
                    clearTimeout(timeoutHandle);
                }
                // After a timeout kill this branch also runs (code is not 0);
                // the extra reject is a no-op because the promise is settled.
                if (code !== 0) {
                    reject(new Error(`Claude Code CLI exited with code ${code}\nStderr: ${stderr}`));
                    return;
                }
                // Get the list of changed files
                try {
                    const changedFiles = (0, gitUtils_1.getChangedFiles)(this.workspaceRoot);
                    resolve(changedFiles);
                }
                catch (error) {
                    reject(new Error(`Failed to get changed files: ${error}`));
                }
            });
            proc.on('error', (error) => {
                if (timeoutHandle) {
                    clearTimeout(timeoutHandle);
                }
                reject(new Error(`Failed to spawn Claude Code CLI: ${error}`));
            });
        });
    }
}
|
|
105
|
+
exports.ClaudeCodeCLIAdapter = ClaudeCodeCLIAdapter;
|
|
106
|
+
//# sourceMappingURL=claudeCodeCLI.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"claudeCodeCLI.js","sourceRoot":"","sources":["../../src/adapters/claudeCodeCLI.ts"],"names":[],"mappings":";AAAA;;GAEG;;;AAEH,iDAAsC;AAEtC,gDAA8E;AAC9E,4DAAiF;AAEjF,MAAa,oBAAoB;IAI/B,YAAY,aAAsB;QAHlB,SAAI,GAAG,aAAsB,CAAC;QAI5C,IAAI,CAAC,aAAa,GAAG,IAAA,qCAAoB,EAAC,aAAa,CAAC,CAAC;IAC3D,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,iBAAiB;QACrB,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,EAAE;YAC7B,MAAM,IAAI,GAAG,IAAA,qBAAK,EAAC,OAAO,EAAE,CAAC,QAAQ,CAAC,EAAE;gBACtC,KAAK,EAAE,MAAM;aACd,CAAC,CAAC;YAEH,IAAI,CAAC,EAAE,CAAC,OAAO,EAAE,CAAC,IAAI,EAAE,EAAE;gBACxB,OAAO,CAAC,IAAI,KAAK,CAAC,CAAC,CAAC;YACtB,CAAC,CAAC,CAAC;YAEH,IAAI,CAAC,EAAE,CAAC,OAAO,EAAE,GAAG,EAAE;gBACpB,OAAO,CAAC,KAAK,CAAC,CAAC;YACjB,CAAC,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;IACL,CAAC;IAED;;;OAGG;IACH,KAAK,CAAC,QAAQ,CACZ,MAAc,EACd,YAAgC,EAChC,OAAuB;QAEvB,mDAAmD;QACnD,IAAI,CAAC;YACH,IAAA,mCAAwB,EAAC,IAAI,CAAC,aAAa,CAAC,CAAC;QAC/C,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,IAAI,CAAC,iDAAiD,EAAE,KAAK,CAAC,CAAC;QACzE,CAAC;QAED,qCAAqC;QACrC,IAAI,UAAU,GAAG,MAAM,CAAC;QAExB,IAAI,YAAY,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC5C,MAAM,QAAQ,GAAG,IAAA,iCAAgB,EAAC,IAAI,CAAC,aAAa,EAAE,YAAY,CAAC,CAAC;YACpE,IAAI,QAAQ,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACxB,MAAM,cAAc,GAAG,QAAQ;qBAC5B,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,wBAAwB,GAAG,CAAC,IAAI,cAAc,GAAG,CAAC,OAAO,UAAU,CAAC;qBAC/E,IAAI,CAAC,IAAI,CAAC,CAAC;gBACd,UAAU,GAAG,GAAG,MAAM,GAAG,cAAc,EAAE,CAAC;YAC5C,CAAC;QACH,CAAC;QAED,+BAA+B;QAC/B,gEAAgE;QAChE,0DAA0D;QAC1D,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,MAAM,EAAE,EAAE;YACrC,MAAM,IAAI,GAAG,IAAA,qBAAK,EAAC,QAAQ,EAAE,CAAC,mBAAmB,EAAE,UAAU,CAAC,EAAE;gBAC9D,GAAG,EAAE,IAAI,CAAC,aAAa;gBACvB,KAAK,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,CAAC;gBAC/B,KAAK,EAAE,IAAI;aACZ,CAAC,CAAC;YAEH,IAAI,MAAM,GAAG,EAAE,CAAC;YAChB,IAAI,MAAM,GAAG,EAAE,CAAC;YAEhB,IAAI,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,EAAE;gBAC/B,MAAM,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC;YAC5B,CAAC,CAAC,CAAC;YAEH,IAAI,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,EAAE;gBAC/B,MAAM,IAAI,IAAI,CAAC,QAAQ,EAAE,C
AAC;YAC5B,CAAC,CAAC,CAAC;YAEH,8DAA8D;YAC9D,IAAI,aAAa,GAA0B,IAAI,CAAC;YAChD,IAAI,OAAO,KAAK,IAAI,IAAI,OAAO,KAAK,SAAS,EAAE,CAAC;gBAC9C,aAAa,GAAG,UAAU,CAAC,GAAG,EAAE;oBAC9B,IAAI,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;oBACrB,MAAM,CAAC,IAAI,KAAK,CAAC,mCAAmC,OAAO,IAAI,CAAC,CAAC,CAAC;gBACpE,CAAC,EAAE,OAAO,CAAC,CAAC;YACd,CAAC;YAED,IAAI,CAAC,EAAE,CAAC,OAAO,EAAE,CAAC,IAAI,EAAE,EAAE;gBACxB,IAAI,aAAa,EAAE,CAAC;oBAClB,YAAY,CAAC,aAAa,CAAC,CAAC;gBAC9B,CAAC;gBAED,IAAI,IAAI,KAAK,CAAC,EAAE,CAAC;oBACf,MAAM,CAAC,IAAI,KAAK,CAAC,oCAAoC,IAAI,aAAa,MAAM,EAAE,CAAC,CAAC,CAAC;oBACjF,OAAO;gBACT,CAAC;gBAED,gCAAgC;gBAChC,IAAI,CAAC;oBACH,MAAM,YAAY,GAAG,IAAA,0BAAe,EAAC,IAAI,CAAC,aAAa,CAAC,CAAC;oBACzD,OAAO,CAAC,YAAY,CAAC,CAAC;gBACxB,CAAC;gBAAC,OAAO,KAAK,EAAE,CAAC;oBACf,MAAM,CAAC,IAAI,KAAK,CAAC,gCAAgC,KAAK,EAAE,CAAC,CAAC,CAAC;gBAC7D,CAAC;YACH,CAAC,CAAC,CAAC;YAEH,IAAI,CAAC,EAAE,CAAC,OAAO,EAAE,CAAC,KAAK,EAAE,EAAE;gBACzB,IAAI,aAAa,EAAE,CAAC;oBAClB,YAAY,CAAC,aAAa,CAAC,CAAC;gBAC9B,CAAC;gBACD,MAAM,CAAC,IAAI,KAAK,CAAC,oCAAoC,KAAK,EAAE,CAAC,CAAC,CAAC;YACjE,CAAC,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;IACL,CAAC;CACF;AAjHD,oDAiHC"}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* GitHub Copilot CLI Adapter
|
|
3
|
+
*/
|
|
4
|
+
import { CodeGenerationAdapter } from '../types';
|
|
5
|
+
export declare class CopilotCLIAdapter implements CodeGenerationAdapter {
    /** Adapter discriminator; always the literal "copilot". */
    readonly type: "copilot";
    /** Workspace directory the adapter operates in. */
    private workspaceRoot;
    /**
     * @param workspaceRoot Optional workspace directory for the adapter.
     */
    constructor(workspaceRoot?: string);
    /**
     * Report whether the GitHub Copilot CLI is available.
     * @returns A promise that resolves to `true` when the CLI is available.
     */
    checkAvailability(): Promise<boolean>;
    /**
     * Generate code with the GitHub Copilot CLI.
     * @param prompt Instruction text handed to the CLI.
     * @param contextFiles Optional files whose contents accompany the prompt.
     * @param timeout Timeout in milliseconds; `null` (or omitted) means no timeout.
     * @returns A promise resolving to a list of file paths.
     */
    generate(prompt: string, contextFiles?: readonly string[], timeout?: number | null): Promise<string[]>;
}
|
|
19
|
+
//# sourceMappingURL=copilotCLI.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"copilotCLI.d.ts","sourceRoot":"","sources":["../../src/adapters/copilotCLI.ts"],"names":[],"mappings":"AAAA;;GAEG;AAGH,OAAO,EAAE,qBAAqB,EAAE,MAAM,UAAU,CAAC;AAIjD,qBAAa,iBAAkB,YAAW,qBAAqB;IAC7D,SAAgB,IAAI,EAAG,SAAS,CAAU;IAC1C,OAAO,CAAC,aAAa,CAAS;gBAElB,aAAa,CAAC,EAAE,MAAM;IAIlC;;OAEG;IACG,iBAAiB,IAAI,OAAO,CAAC,OAAO,CAAC;IAgB3C;;;OAGG;IACG,QAAQ,CACZ,MAAM,EAAE,MAAM,EACd,YAAY,CAAC,EAAE,SAAS,MAAM,EAAE,EAChC,OAAO,CAAC,EAAE,MAAM,GAAG,IAAI,GACtB,OAAO,CAAC,MAAM,EAAE,CAAC;CA4ErB"}
|