coding-agent-benchmarks 0.3.3 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +80 -257
- package/dist/adapters/claudeCodeCLI.d.ts.map +1 -1
- package/dist/adapters/claudeCodeCLI.js +0 -6
- package/dist/adapters/claudeCodeCLI.js.map +1 -1
- package/dist/adapters/copilotCLI.d.ts.map +1 -1
- package/dist/adapters/copilotCLI.js +0 -6
- package/dist/adapters/copilotCLI.js.map +1 -1
- package/dist/config/loader.d.ts.map +1 -1
- package/dist/config/loader.js +0 -7
- package/dist/config/loader.js.map +1 -1
- package/dist/evaluator.d.ts +0 -12
- package/dist/evaluator.d.ts.map +1 -1
- package/dist/evaluator.js +0 -36
- package/dist/evaluator.js.map +1 -1
- package/dist/index.d.ts +2 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +4 -1
- package/dist/index.js.map +1 -1
- package/dist/runner.js +12 -14
- package/dist/runner.js.map +1 -1
- package/dist/types.d.ts +1 -1
- package/dist/utils/baselineManager.d.ts.map +1 -1
- package/dist/utils/baselineManager.js +0 -4
- package/dist/utils/baselineManager.js.map +1 -1
- package/dist/utils/gitUtils.d.ts.map +1 -1
- package/dist/utils/gitUtils.js +0 -6
- package/dist/utils/gitUtils.js.map +1 -1
- package/dist/utils/updateChecker.d.ts +9 -0
- package/dist/utils/updateChecker.d.ts.map +1 -0
- package/dist/utils/updateChecker.js +100 -0
- package/dist/utils/updateChecker.js.map +1 -0
- package/dist/utils/updateNotifier.d.ts +3 -0
- package/dist/utils/updateNotifier.d.ts.map +1 -0
- package/dist/utils/updateNotifier.js +39 -0
- package/dist/utils/updateNotifier.js.map +1 -0
- package/dist/utils/workspaceUtils.d.ts.map +1 -1
- package/dist/utils/workspaceUtils.js +0 -5
- package/dist/utils/workspaceUtils.js.map +1 -1
- package/dist/validators/llmJudge.d.ts.map +1 -1
- package/dist/validators/llmJudge.js +3 -3
- package/dist/validators/llmJudge.js.map +1 -1
- package/package.json +2 -1
package/README.md
CHANGED
|
@@ -1,12 +1,18 @@
|
|
|
1
1
|
# coding-agent-benchmarks
|
|
2
2
|
|
|
3
|
+
**[📖 Documentation](https://chiItepin.github.io/coding-agent-benchmarks)** | **[📦 npm](https://www.npmjs.com/package/coding-agent-benchmarks)** | **[⭐ GitHub](https://github.com/chiItepin/coding-agent-benchmarks)**
|
|
4
|
+
|
|
3
5
|
Open-source framework for evaluating how well AI coding assistants (like GitHub Copilot CLI or Claude Code) follow your coding standards. Here's the workflow:
|
|
4
6
|
1. You give it a prompt → 2. AI generates code → 3. Library validates → 4. You get a score
|
|
5
7
|
|
|
6
8
|

|
|
7
9
|
|
|
10
|
+
*Figure 1: Evaluation workflow - prompt → generate → validate → score*
|
|
11
|
+
|
|
8
12
|

|
|
9
13
|
|
|
14
|
+
*Figure 2: Example terminal output showing scenario evaluation results*
|
|
15
|
+
|
|
10
16
|
|
|
11
17
|
## Features
|
|
12
18
|
|
|
@@ -135,7 +141,7 @@ module.exports = {
|
|
|
135
141
|
defaultAdapter: 'copilot',
|
|
136
142
|
|
|
137
143
|
// Default LLM model for judge
|
|
138
|
-
defaultModel: 'openai/gpt-
|
|
144
|
+
defaultModel: 'openai/gpt-5',
|
|
139
145
|
|
|
140
146
|
// Default timeout for code generation (milliseconds)
|
|
141
147
|
// Individual scenarios can override this
|
|
@@ -172,33 +178,15 @@ module.exports = {
|
|
|
172
178
|
id: 'react-no-inline-styles',
|
|
173
179
|
category: 'react',
|
|
174
180
|
severity: 'major',
|
|
175
|
-
tags: ['react', 'styling', 'best-practices'],
|
|
176
181
|
description: 'Forbid inline style objects in React components',
|
|
177
|
-
prompt: 'Create a React functional component called Button that accepts a "label" prop and renders a styled button. Use CSS classes instead of inline styles.',
|
|
178
182
|
validationStrategy: {
|
|
179
183
|
patterns: {
|
|
180
|
-
forbiddenPatterns: [/style\s*=\s*\{\{
|
|
184
|
+
forbiddenPatterns: [/style\s*=\s*\{\{/],
|
|
181
185
|
requiredPatterns: [/className/],
|
|
182
186
|
},
|
|
183
187
|
},
|
|
184
188
|
},
|
|
185
|
-
|
|
186
|
-
id: 'async-error-handling',
|
|
187
|
-
category: 'general',
|
|
188
|
-
severity: 'critical',
|
|
189
|
-
tags: ['async', 'error-handling', 'robustness'],
|
|
190
|
-
description: 'Ensure async functions have proper error handling',
|
|
191
|
-
prompt: 'Create an async function called fetchUserData that takes a userId parameter, makes an HTTP request to fetch user data, and returns the user object. Handle errors appropriately.',
|
|
192
|
-
validationStrategy: {
|
|
193
|
-
patterns: {
|
|
194
|
-
requiredPatterns: [/async\s+function\s+fetchUserData|const\s+fetchUserData.*async/, /try|catch|\.catch\(/],
|
|
195
|
-
},
|
|
196
|
-
llmJudge: {
|
|
197
|
-
enabled: true,
|
|
198
|
-
judgmentPrompt: 'Evaluate the error handling in this async function. Does it use try/catch or .catch()? Are errors logged or re-thrown appropriately?',
|
|
199
|
-
},
|
|
200
|
-
},
|
|
201
|
-
},
|
|
189
|
+
// Add more scenarios...
|
|
202
190
|
],
|
|
203
191
|
};
|
|
204
192
|
```
|
|
@@ -211,29 +199,6 @@ You can configure timeouts at three levels (in order of precedence):
|
|
|
211
199
|
2. **Global default**: Set `defaultTimeout` in your config file
|
|
212
200
|
3. **Built-in default**: 120000ms (2 minutes) if nothing else is specified
|
|
213
201
|
|
|
214
|
-
```javascript
|
|
215
|
-
// In benchmarks.config.js
|
|
216
|
-
module.exports = {
|
|
217
|
-
// Global default applies to all scenarios
|
|
218
|
-
defaultTimeout: 180000, // 3 minutes
|
|
219
|
-
|
|
220
|
-
scenarios: [
|
|
221
|
-
{
|
|
222
|
-
id: 'quick-check',
|
|
223
|
-
prompt: '...',
|
|
224
|
-
timeout: 60000, // Override: 1 minute for this scenario
|
|
225
|
-
// ...
|
|
226
|
-
},
|
|
227
|
-
{
|
|
228
|
-
id: 'complex-task',
|
|
229
|
-
prompt: '...',
|
|
230
|
-
// Will use defaultTimeout (3 minutes)
|
|
231
|
-
// ...
|
|
232
|
-
},
|
|
233
|
-
],
|
|
234
|
-
};
|
|
235
|
-
```
|
|
236
|
-
|
|
237
202
|
**Why configure timeouts?**
|
|
238
203
|
- Complex code generation tasks may need more time
|
|
239
204
|
- Simple checks can complete faster with shorter timeouts
|
|
@@ -271,133 +236,73 @@ validationStrategy: {
|
|
|
271
236
|
|
|
272
237
|
### LLM-as-Judge
|
|
273
238
|
|
|
274
|
-
Semantic evaluation using AI:
|
|
239
|
+
Semantic evaluation using AI (requires `GITHUB_TOKEN`):
|
|
275
240
|
|
|
276
241
|
```javascript
|
|
277
242
|
validationStrategy: {
|
|
278
243
|
llmJudge: {
|
|
279
244
|
enabled: true,
|
|
280
|
-
model: 'openai/gpt-
|
|
245
|
+
model: 'openai/gpt-5',
|
|
281
246
|
judgmentPrompt: `Evaluate if the code follows best practices...`,
|
|
282
247
|
},
|
|
283
248
|
}
|
|
284
249
|
```
|
|
285
250
|
|
|
286
|
-
The LLM judge requires a `GITHUB_TOKEN` environment variable with access to GitHub Models API.
|
|
287
|
-
|
|
288
251
|
### ESLint Integration
|
|
289
252
|
|
|
290
253
|

|
|
291
254
|
|
|
255
|
+
*Figure 3: ESLint validator detecting code quality issues in generated code*
|
|
292
256
|
|
|
293
257
|
Run ESLint on generated code:
|
|
294
258
|
|
|
295
259
|
```javascript
|
|
296
260
|
validationStrategy: {
|
|
297
|
-
eslint: {
|
|
298
|
-
enabled: true,
|
|
299
|
-
configPath: '.eslintrc.js', // optional
|
|
300
|
-
},
|
|
261
|
+
eslint: { enabled: true, configPath: '.eslintrc.js' },
|
|
301
262
|
}
|
|
302
263
|
```
|
|
303
264
|
|
|
304
265
|
## Scoring System
|
|
305
266
|
|
|
306
|
-
The scoring system operates at three levels: per-validator scoring, per-scenario scoring, and summary scoring.
|
|
307
|
-
|
|
308
267
|
### Per-Validator Scoring
|
|
309
268
|
|
|
310
|
-
Each validator
|
|
311
|
-
|
|
312
|
-
#### Pattern Validator
|
|
313
|
-
|
|
314
|
-
Uses exponential decay based on weighted violations:
|
|
315
|
-
|
|
316
|
-
```
|
|
317
|
-
score = e^(-totalWeight)
|
|
318
|
-
```
|
|
319
|
-
|
|
320
|
-
Where `totalWeight` is the sum of violation weights:
|
|
321
|
-
- **Critical violations**: 1.0 weight each
|
|
322
|
-
- **Major violations**: 0.7 weight each
|
|
323
|
-
- **Minor violations**: 0.3 weight each
|
|
324
|
-
|
|
325
|
-
**Examples**:
|
|
326
|
-
- 0 violations → score = 1.0 (perfect)
|
|
327
|
-
- 1 critical violation → score ≈ 0.37
|
|
328
|
-
- 1 major violation → score ≈ 0.50
|
|
329
|
-
- 2 minor violations → score ≈ 0.55
|
|
330
|
-
|
|
331
|
-
#### LLM Judge Validator
|
|
332
|
-
|
|
333
|
-
The LLM (GPT-4 or other model) evaluates the code semantically and returns:
|
|
334
|
-
- An `overallScore` from 0.0 to 1.0
|
|
335
|
-
- A list of violations with explanations
|
|
336
|
-
- Passed if: score ≥ 0.7 AND no violations
|
|
337
|
-
|
|
338
|
-
The LLM judge provides semantic understanding beyond pattern matching, evaluating whether the code actually solves the problem correctly and follows best practices.
|
|
339
|
-
|
|
340
|
-
#### ESLint Validator
|
|
341
|
-
|
|
342
|
-
This validator runs ESLint on the generated code and scores based on the number and severity of linting violations. Note that ESLint must be installed and configured in your project for this validator to work. If you don't have ESLint set up globally, disable this validator or provide a custom validator.
|
|
343
|
-
|
|
344
|
-
Uses exponential decay with a dampening factor:
|
|
345
|
-
|
|
346
|
-
```
|
|
347
|
-
score = e^(-totalWeight / 2)
|
|
348
|
-
```
|
|
269
|
+
Each validator independently evaluates generated code and produces a score from 0.0 to 1.0:
|
|
349
270
|
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
271
|
+
| Validator | Scoring Method | Notes |
|
|
272
|
+
|-----------|----------------|-------|
|
|
273
|
+
| **Pattern** | Uses exponential decay based on weighted violations | Critical: 1.0 weight, Major: 0.7 weight, Minor: 0.3 weight |
|
|
274
|
+
| **LLM Judge** | AI evaluates semantically, returns 0.0-1.0 score | Passes if score ≥ 0.7 AND no violations |
|
|
275
|
+
| **ESLint** | Exponential decay with dampening factor (÷2) | ESLint errors → Major (0.7), warnings → Minor (0.3) |
|
|
355
276
|
|
|
356
277
|
### Per-Scenario Scoring
|
|
357
278
|
|
|
358
|
-
Each scenario receives an **overall score**
|
|
359
|
-
|
|
360
|
-
```
|
|
361
|
-
overallScore = average of all active validator scores
|
|
362
|
-
```
|
|
279
|
+
Each scenario receives an **overall score** = **average of all active validator scores**
|
|
363
280
|
|
|
364
|
-
**Active validators** are those that
|
|
365
|
-
- Are configured in the scenario's `validationStrategy`
|
|
366
|
-
- Successfully ran (did not return score = -1)
|
|
281
|
+
**Active validators** are those configured in `validationStrategy` that successfully ran (score ≠ -1).
|
|
367
282
|
|
|
368
283
|
**Pass/Fail Criteria**:
|
|
369
284
|
- ✅ **PASS**: `overallScore ≥ 0.8` AND `violations.length === 0`
|
|
370
285
|
- ❌ **FAIL**: `overallScore < 0.8` OR `violations.length > 0`
|
|
371
286
|
- ⚠️ **SKIP**: An error occurred during evaluation (timeout, adapter failure, etc.)
|
|
372
287
|
|
|
373
|
-
**Example**:
|
|
374
|
-
```
|
|
375
|
-
overallScore = (0.9 + 0.8) / 2 = 0.85
|
|
376
|
-
```
|
|
288
|
+
**Example**: Pattern (0.9) + LLM Judge (0.8) + ESLint (skipped) → `overallScore = (0.9 + 0.8) / 2 = 0.85`
|
|
377
289
|
|
|
378
290
|
### Summary Scoring
|
|
379
291
|
|
|
380
|
-
After evaluating all scenarios, the framework calculates
|
|
292
|
+
After evaluating all scenarios, the framework calculates:
|
|
381
293
|
|
|
382
294
|
```javascript
|
|
383
295
|
{
|
|
384
|
-
total: 10, // Total
|
|
385
|
-
passed: 7, //
|
|
386
|
-
failed: 2, //
|
|
387
|
-
skipped: 1, //
|
|
388
|
-
averageScore: 0.78, // Average of all scenario
|
|
296
|
+
total: 10, // Total scenarios
|
|
297
|
+
passed: 7, // overallScore ≥ 0.8 and no violations
|
|
298
|
+
failed: 2, // Evaluated but didn't pass
|
|
299
|
+
skipped: 1, // Encountered errors
|
|
300
|
+
averageScore: 0.78, // Average of all scenario scores
|
|
389
301
|
totalViolations: 8 // Sum of violations across all scenarios
|
|
390
302
|
}
|
|
391
303
|
```
|
|
392
304
|
|
|
393
|
-
**
|
|
394
|
-
```
|
|
395
|
-
averageScore = (sum of all scenario scores) / total scenarios
|
|
396
|
-
```
|
|
397
|
-
|
|
398
|
-
This includes scores from failed scenarios, providing an overall quality metric across your entire test suite.
|
|
399
|
-
|
|
400
|
-
**Transparency**: When baselines are saved, the per-validator breakdown is included in the baseline file, allowing you to trace exactly which validator contributed what score. See [Baseline File Format](#baseline-file-format) for details.
|
|
305
|
+
**Transparency**: Baselines include per-validator breakdowns. See [Baseline File Format](#baseline-file-format) for details.
|
|
401
306
|
|
|
402
307
|
### Score Interpretation
|
|
403
308
|
|
|
@@ -417,11 +322,6 @@ When baseline tracking is enabled, you'll see delta metrics:
|
|
|
417
322
|
↑ +18.5% improvement from baseline
|
|
418
323
|
```
|
|
419
324
|
|
|
420
|
-
The percentage is calculated as:
|
|
421
|
-
```
|
|
422
|
-
percentage = (currentScore - baselineScore) / baselineScore * 100
|
|
423
|
-
```
|
|
424
|
-
|
|
425
325
|
## Baseline Tracking
|
|
426
326
|
|
|
427
327
|
Track evaluation results over time by enabling baseline management in your config file:
|
|
@@ -449,42 +349,21 @@ When `compareBaseline` is enabled, the report will show score deltas and whether
|
|
|
449
349
|
|
|
450
350
|
### Baseline File Format
|
|
451
351
|
|
|
452
|
-
|
|
352
|
+
Path: `.benchmarks/baselines/{adapter}/{model}/{scenario-id}.json`
|
|
353
|
+
|
|
354
|
+
Each baseline file provides complete score traceability:
|
|
453
355
|
|
|
454
356
|
```json
|
|
455
357
|
{
|
|
456
358
|
"scenarioId": "typescript-no-any",
|
|
457
359
|
"score": 0.685,
|
|
458
360
|
"violations": [
|
|
459
|
-
{
|
|
460
|
-
"type": "pattern",
|
|
461
|
-
"message": "Forbidden pattern found: :\\s*any\\b",
|
|
462
|
-
"file": "src/types.ts",
|
|
463
|
-
"line": 12,
|
|
464
|
-
"severity": "critical",
|
|
465
|
-
"details": "Matched: \"metadata: any\""
|
|
466
|
-
}
|
|
361
|
+
{ "type": "pattern", "message": "Forbidden pattern found: :\\s*any\\b", "file": "src/types.ts", ... }
|
|
467
362
|
],
|
|
468
363
|
"validationResults": [
|
|
469
|
-
{
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
"violations": [...],
|
|
473
|
-
"validatorType": "pattern"
|
|
474
|
-
},
|
|
475
|
-
{
|
|
476
|
-
"passed": true,
|
|
477
|
-
"score": 1.0,
|
|
478
|
-
"violations": [],
|
|
479
|
-
"validatorType": "llm-judge"
|
|
480
|
-
},
|
|
481
|
-
{
|
|
482
|
-
"passed": true,
|
|
483
|
-
"score": -1,
|
|
484
|
-
"violations": [],
|
|
485
|
-
"validatorType": "eslint",
|
|
486
|
-
"error": "ESLint not found"
|
|
487
|
-
}
|
|
364
|
+
{ "passed": false, "score": 0.37, "validatorType": "pattern", "violations": [...] },
|
|
365
|
+
{ "passed": true, "score": 1.0, "validatorType": "llm-judge", "violations": [] },
|
|
366
|
+
{ "passed": true, "score": -1, "validatorType": "eslint", "error": "ESLint not found" }
|
|
488
367
|
],
|
|
489
368
|
"timestamp": "2026-01-23T22:28:32.216Z",
|
|
490
369
|
"adapter": "copilot",
|
|
@@ -493,15 +372,11 @@ Each baseline file contains complete transparency into how the score was calcula
|
|
|
493
372
|
```
|
|
494
373
|
|
|
495
374
|
**Key fields**:
|
|
496
|
-
- `score
|
|
497
|
-
- `violations
|
|
498
|
-
- `validationResults
|
|
499
|
-
- Individual validator score
|
|
500
|
-
- Whether that validator passed
|
|
501
|
-
- Violations specific to that validator
|
|
502
|
-
- Any errors that occurred (`score: -1` means skipped)
|
|
375
|
+
- `score` - Overall scenario score (average of active validators)
|
|
376
|
+
- `violations` - All violations from all validators combined
|
|
377
|
+
- `validationResults` - Per-validator breakdown (score, passed, violations, errors)
|
|
503
378
|
|
|
504
|
-
**
|
|
379
|
+
**Traceability**: You can always trace the overall score back to individual validator scores (e.g., `score: 0.067` → check `validationResults` for Pattern: 0.135, LLM Judge: 0.00).
|
|
505
380
|
|
|
506
381
|
## CLI Commands
|
|
507
382
|
|
|
@@ -509,24 +384,26 @@ Each baseline file contains complete transparency into how the score was calcula
|
|
|
509
384
|
|
|
510
385
|
Run benchmark evaluations.
|
|
511
386
|
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
387
|
+
| Option | Description | Default/Example |
|
|
388
|
+
|--------|-------------|-----------------|
|
|
389
|
+
| `--scenario <pattern>` | Filter by scenario ID (supports wildcards) | `typescript-*` |
|
|
390
|
+
| `--category <categories>` | Filter by category (comma-separated) | `typescript,react` |
|
|
391
|
+
| `--tag <tags>` | Filter by tags (comma-separated) | `safety,types` |
|
|
392
|
+
| `--adapter <type>` | Adapter to use | `copilot` or `claude-code` |
|
|
393
|
+
| `--model <model>` | LLM model for judge | `openai/gpt-5` |
|
|
394
|
+
| `--threshold <number>` | Minimum passing score | `0.8` |
|
|
395
|
+
| `--verbose` | Show detailed output | - |
|
|
396
|
+
| `--output <file>` | Export JSON report | `report.json` |
|
|
397
|
+
| `--workspace-root <path>` | Workspace root directory | Current directory |
|
|
522
398
|
|
|
523
399
|
### `list`
|
|
524
400
|
|
|
525
401
|
List available test scenarios.
|
|
526
402
|
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
403
|
+
| Option | Description |
|
|
404
|
+
|--------|-------------|
|
|
405
|
+
| `--category <categories>` | Filter by category (comma-separated) |
|
|
406
|
+
| `--tag <tags>` | Filter by tags (comma-separated) |
|
|
530
407
|
|
|
531
408
|
### `check`
|
|
532
409
|
|
|
@@ -536,8 +413,9 @@ Check if coding agent CLIs are available.
|
|
|
536
413
|
|
|
537
414
|
Test LLM judge with a custom prompt (for debugging).
|
|
538
415
|
|
|
539
|
-
|
|
540
|
-
|
|
416
|
+
| Option | Description |
|
|
417
|
+
|--------|-------------|
|
|
418
|
+
| `--model <model>` | LLM model to use |
|
|
541
419
|
|
|
542
420
|
## Understanding Output
|
|
543
421
|
|
|
@@ -586,26 +464,15 @@ import { Evaluator, loadConfig } from 'coding-agent-benchmarks';
|
|
|
586
464
|
async function runEvaluation() {
|
|
587
465
|
const { config, scenarios } = await loadConfig();
|
|
588
466
|
|
|
589
|
-
// Create evaluator
|
|
590
467
|
const evaluator = new Evaluator({
|
|
591
468
|
adapter: 'copilot',
|
|
592
|
-
model: 'openai/gpt-
|
|
469
|
+
model: 'openai/gpt-5',
|
|
593
470
|
verbose: true,
|
|
594
471
|
saveBaseline: config.saveBaseline,
|
|
595
472
|
compareBaseline: config.compareBaseline,
|
|
596
473
|
});
|
|
597
474
|
|
|
598
|
-
// Check adapter availability
|
|
599
|
-
const available = await evaluator.checkAdapterAvailability();
|
|
600
|
-
if (!available) {
|
|
601
|
-
throw new Error('Adapter not available');
|
|
602
|
-
}
|
|
603
|
-
|
|
604
|
-
// Run evaluation
|
|
605
475
|
const report = await evaluator.evaluate(scenarios);
|
|
606
|
-
|
|
607
|
-
console.log(`Passed: ${report.summary.passed}/${report.summary.total}`);
|
|
608
|
-
console.log(`Average score: ${report.summary.averageScore.toFixed(2)}`);
|
|
609
476
|
}
|
|
610
477
|
|
|
611
478
|
runEvaluation();
|
|
@@ -616,106 +483,62 @@ runEvaluation();
|
|
|
616
483
|
Implement the `CodeValidator` interface:
|
|
617
484
|
|
|
618
485
|
```typescript
|
|
619
|
-
import { CodeValidator, ValidationResult
|
|
486
|
+
import { CodeValidator, ValidationResult } from 'coding-agent-benchmarks';
|
|
620
487
|
|
|
621
488
|
export class CustomValidator implements CodeValidator {
|
|
622
489
|
public readonly type = 'custom';
|
|
623
490
|
|
|
624
|
-
async validate(
|
|
625
|
-
files: readonly string[],
|
|
626
|
-
scenario: TestScenario
|
|
627
|
-
): Promise<ValidationResult> {
|
|
491
|
+
async validate(files: readonly string[], scenario: TestScenario): Promise<ValidationResult> {
|
|
628
492
|
// Your validation logic here
|
|
629
|
-
return {
|
|
630
|
-
passed: true,
|
|
631
|
-
score: 1.0,
|
|
632
|
-
violations: [],
|
|
633
|
-
validatorType: 'custom',
|
|
634
|
-
};
|
|
493
|
+
return { passed: true, score: 1.0, violations: [], validatorType: 'custom' };
|
|
635
494
|
}
|
|
636
495
|
}
|
|
637
496
|
```
|
|
638
497
|
|
|
498
|
+
See CONTRIBUTING.md for complete examples.
|
|
499
|
+
|
|
639
500
|
## Creating Custom Adapters
|
|
640
501
|
|
|
641
502
|
Implement the `CodeGenerationAdapter` interface:
|
|
642
503
|
|
|
643
504
|
```typescript
|
|
644
|
-
import { CodeGenerationAdapter
|
|
505
|
+
import { CodeGenerationAdapter } from 'coding-agent-benchmarks';
|
|
645
506
|
|
|
646
507
|
export class CustomAdapter implements CodeGenerationAdapter {
|
|
647
|
-
public readonly type
|
|
648
|
-
|
|
649
|
-
async checkAvailability(): Promise<boolean> {
|
|
650
|
-
// Check if CLI is available
|
|
651
|
-
return true;
|
|
652
|
-
}
|
|
508
|
+
public readonly type = 'custom-adapter';
|
|
653
509
|
|
|
654
|
-
async
|
|
655
|
-
|
|
656
|
-
contextFiles?: readonly string[],
|
|
657
|
-
timeout?: number
|
|
658
|
-
): Promise<string[]> {
|
|
659
|
-
// Generate code and return changed files
|
|
660
|
-
return ['path/to/generated/file.ts'];
|
|
661
|
-
}
|
|
510
|
+
async checkAvailability(): Promise<boolean> { /* ... */ }
|
|
511
|
+
async generate(prompt: string, contextFiles?: readonly string[], timeout?: number): Promise<string[]> { /* ... */ }
|
|
662
512
|
}
|
|
663
513
|
```
|
|
664
514
|
|
|
515
|
+
See CONTRIBUTING.md for complete examples.
|
|
516
|
+
|
|
665
517
|
## GitHub Authentication (for LLM Judge)
|
|
666
518
|
|
|
667
|
-
LLM-as-judge validation requires GitHub authentication to access GitHub Models API.
|
|
519
|
+
LLM-as-judge validation requires GitHub authentication to access GitHub Models API. Choose one option:
|
|
668
520
|
|
|
669
521
|
### Option 1: Personal Access Token (Recommended)
|
|
670
522
|
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
6. Set environment variable:
|
|
677
|
-
```bash
|
|
678
|
-
export GITHUB_TOKEN=ghp_xxxxxxxxxxxxxxxxxxxx
|
|
679
|
-
```
|
|
523
|
+
Create a token at https://github.com/settings/tokens with the **`models:read`** scope, then set it as an environment variable:
|
|
524
|
+
|
|
525
|
+
```bash
|
|
526
|
+
export GITHUB_TOKEN=ghp_xxxxxxxxxxxxxxxxxxxx
|
|
527
|
+
```
|
|
680
528
|
|
|
681
529
|
### Option 2: GitHub CLI (Automatic)
|
|
682
530
|
|
|
683
|
-
If
|
|
531
|
+
If GitHub CLI is installed, tokens are auto-detected:
|
|
684
532
|
|
|
685
533
|
```bash
|
|
686
|
-
#
|
|
687
|
-
brew install gh # macOS
|
|
688
|
-
# or download from https://cli.github.com
|
|
689
|
-
|
|
690
|
-
# Authenticate (one time)
|
|
534
|
+
brew install gh # or download from https://cli.github.com
|
|
691
535
|
gh auth login
|
|
692
|
-
|
|
693
536
|
# Token will be used automatically - no GITHUB_TOKEN needed!
|
|
694
537
|
```
|
|
695
538
|
|
|
696
|
-
### Check Authentication
|
|
697
|
-
|
|
698
|
-
```bash
|
|
699
|
-
npx coding-agent-benchmarks check
|
|
700
|
-
```
|
|
701
|
-
|
|
702
|
-
Output:
|
|
703
|
-
```
|
|
704
|
-
Checking adapter availability...
|
|
705
|
-
GitHub Copilot CLI: ✓ Available
|
|
706
|
-
Claude Code CLI: ✗ Not found
|
|
707
|
-
|
|
708
|
-
Checking GitHub authentication...
|
|
709
|
-
✓ Using token from GitHub CLI (gh auth token)
|
|
710
|
-
```
|
|
711
|
-
|
|
712
|
-
## How It Works
|
|
539
|
+
### Check Authentication
|
|
713
540
|
|
|
714
|
-
|
|
715
|
-
2. **File Tracking**: Git is used to detect which files were created/modified
|
|
716
|
-
3. **Validation**: Multiple validators check the generated code
|
|
717
|
-
4. **Scoring**: Results are aggregated and compared against thresholds
|
|
718
|
-
5. **Reporting**: Results are displayed in terminal and optionally exported as JSON
|
|
541
|
+
Run `npx coding-agent-benchmarks check` to verify authentication status.
|
|
719
542
|
|
|
720
543
|
## Requirements
|
|
721
544
|
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"claudeCodeCLI.d.ts","sourceRoot":"","sources":["../../src/adapters/claudeCodeCLI.ts"],"names":[],"mappings":"AAAA;;GAEG;AAKH,OAAO,EAAE,qBAAqB,EAAE,MAAM,UAAU,CAAC;AAOjD,MAAM,WAAW,2BAA2B;IAC1C,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAID,qBAAa,oBAAqB,YAAW,qBAAqB;IAChE,SAAgB,IAAI,EAAG,aAAa,CAAU;IAC9C,OAAO,CAAC,aAAa,CAAS;IAC9B,OAAO,CAAC,KAAK,CAAS;gBAEV,OAAO,CAAC,EAAE,2BAA2B;IAKjD;;OAEG;IACG,iBAAiB,IAAI,OAAO,CAAC,OAAO,CAAC;IAgB3C;;OAEG;IACH,QAAQ,IAAI,MAAM;IAIlB;;OAEG;IACH,OAAO,CAAC,WAAW;IA8BnB;;;OAGG;IACG,QAAQ,CACZ,MAAM,EAAE,MAAM,EACd,YAAY,CAAC,EAAE,SAAS,MAAM,EAAE,EAChC,OAAO,CAAC,EAAE,MAAM,GAAG,IAAI,GACtB,OAAO,CAAC,MAAM,EAAE,CAAC;
|
|
1
|
+
{"version":3,"file":"claudeCodeCLI.d.ts","sourceRoot":"","sources":["../../src/adapters/claudeCodeCLI.ts"],"names":[],"mappings":"AAAA;;GAEG;AAKH,OAAO,EAAE,qBAAqB,EAAE,MAAM,UAAU,CAAC;AAOjD,MAAM,WAAW,2BAA2B;IAC1C,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAID,qBAAa,oBAAqB,YAAW,qBAAqB;IAChE,SAAgB,IAAI,EAAG,aAAa,CAAU;IAC9C,OAAO,CAAC,aAAa,CAAS;IAC9B,OAAO,CAAC,KAAK,CAAS;gBAEV,OAAO,CAAC,EAAE,2BAA2B;IAKjD;;OAEG;IACG,iBAAiB,IAAI,OAAO,CAAC,OAAO,CAAC;IAgB3C;;OAEG;IACH,QAAQ,IAAI,MAAM;IAIlB;;OAEG;IACH,OAAO,CAAC,WAAW;IA8BnB;;;OAGG;IACG,QAAQ,CACZ,MAAM,EAAE,MAAM,EACd,YAAY,CAAC,EAAE,SAAS,MAAM,EAAE,EAChC,OAAO,CAAC,EAAE,MAAM,GAAG,IAAI,GACtB,OAAO,CAAC,MAAM,EAAE,CAAC;CAuGrB"}
|
|
@@ -105,13 +105,10 @@ class ClaudeCodeCLIAdapter {
|
|
|
105
105
|
throw new Error("Claude Code CLI is not available. Please install it first: https://docs.anthropic.com/en/docs/build-with-claude/claude-code");
|
|
106
106
|
}
|
|
107
107
|
const fullPrompt = this.buildPrompt(prompt, contextFiles);
|
|
108
|
-
// Capture git status before generation
|
|
109
108
|
const statusBefore = (0, gitUtils_1.getGitStatusPorcelain)(this.workspaceRoot);
|
|
110
|
-
// Write prompt to temp file and pipe via stdin (matches @copilot-evals pattern)
|
|
111
109
|
return new Promise((resolve, reject) => {
|
|
112
110
|
const tempFile = path.join(this.workspaceRoot, ".claude-eval-prompt.txt");
|
|
113
111
|
fs.writeFileSync(tempFile, fullPrompt, "utf8");
|
|
114
|
-
// Cleanup function
|
|
115
112
|
const cleanup = () => {
|
|
116
113
|
try {
|
|
117
114
|
if (fs.existsSync(tempFile)) {
|
|
@@ -122,7 +119,6 @@ class ClaudeCodeCLIAdapter {
|
|
|
122
119
|
// Ignore cleanup errors
|
|
123
120
|
}
|
|
124
121
|
};
|
|
125
|
-
// Register cleanup on process termination
|
|
126
122
|
const cleanupOnExit = () => {
|
|
127
123
|
cleanup();
|
|
128
124
|
};
|
|
@@ -142,7 +138,6 @@ class ClaudeCodeCLIAdapter {
|
|
|
142
138
|
proc.stderr?.on("data", (data) => {
|
|
143
139
|
stderr += data.toString();
|
|
144
140
|
});
|
|
145
|
-
// Set timeout only if specified (null/undefined = no timeout)
|
|
146
141
|
let timeoutHandle = null;
|
|
147
142
|
if (timeout !== null && timeout !== undefined) {
|
|
148
143
|
timeoutHandle = setTimeout(() => {
|
|
@@ -164,7 +159,6 @@ class ClaudeCodeCLIAdapter {
|
|
|
164
159
|
reject(new Error(`Claude Code CLI exited with code ${code}\nStderr: ${stderr}`));
|
|
165
160
|
return;
|
|
166
161
|
}
|
|
167
|
-
// Get files changed during generation (diff before/after)
|
|
168
162
|
try {
|
|
169
163
|
const statusAfter = (0, gitUtils_1.getGitStatusPorcelain)(this.workspaceRoot);
|
|
170
164
|
const changedFiles = (0, gitUtils_1.getChangedFilesDiff)(statusBefore, statusAfter, this.workspaceRoot);
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"claudeCodeCLI.js","sourceRoot":"","sources":["../../src/adapters/claudeCodeCLI.ts"],"names":[],"mappings":";AAAA;;GAEG;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAEH,iDAAsC;AACtC,uCAAyB;AACzB,2CAA6B;AAE7B,gDAA+E;AAC/E,4DAGiC;AAOjC,MAAM,aAAa,GAAG,QAAQ,CAAC;AAE/B,MAAa,oBAAoB;IAK/B,YAAY,OAAqC;QAJjC,SAAI,GAAG,aAAsB,CAAC;QAK5C,IAAI,CAAC,aAAa,GAAG,IAAA,qCAAoB,EAAC,OAAO,EAAE,aAAa,CAAC,CAAC;QAClE,IAAI,CAAC,KAAK,GAAG,OAAO,EAAE,KAAK,IAAI,aAAa,CAAC;IAC/C,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,iBAAiB;QACrB,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,EAAE;YAC7B,MAAM,IAAI,GAAG,IAAA,qBAAK,EAAC,OAAO,EAAE,CAAC,QAAQ,CAAC,EAAE;gBACtC,KAAK,EAAE,MAAM;aACd,CAAC,CAAC;YAEH,IAAI,CAAC,EAAE,CAAC,OAAO,EAAE,CAAC,IAAI,EAAE,EAAE;gBACxB,OAAO,CAAC,IAAI,KAAK,CAAC,CAAC,CAAC;YACtB,CAAC,CAAC,CAAC;YAEH,IAAI,CAAC,EAAE,CAAC,OAAO,EAAE,GAAG,EAAE;gBACpB,OAAO,CAAC,KAAK,CAAC,CAAC;YACjB,CAAC,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;IACL,CAAC;IAED;;OAEG;IACH,QAAQ;QACN,OAAO,IAAI,CAAC,KAAK,CAAC;IACpB,CAAC;IAED;;OAEG;IACK,WAAW,CACjB,MAAc,EACd,YAAgC;QAEhC,MAAM,KAAK,GAAa,EAAE,CAAC;QAE3B,IAAI,YAAY,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC5C,MAAM,QAAQ,GAAG,IAAA,iCAAgB,EAAC,IAAI,CAAC,aAAa,EAAE,YAAY,CAAC,CAAC;YACpE,IAAI,QAAQ,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACxB,KAAK,CAAC,IAAI,CAAC,qBAAqB,CAAC,CAAC;gBAClC,MAAM,cAAc,GAAG,QAAQ;qBAC5B,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE;oBACX,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,YAAY,CAAC;oBAC5D,OAAO,OAAO,GAAG,CAAC,IAAI,WAAW,GAAG,KAAK,GAAG,CAAC,OAAO,UAAU,CAAC;gBACjE,CAAC,CAAC;qBACD,IAAI,CAAC,MAAM,CAAC,CAAC;gBAChB,KAAK,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;gBAC3B,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;YACxB,CAAC;QACH,CAAC;QAED,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;QACvB,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;QACnB,KAAK,CAAC,IAAI,CACR,0GAA0G,CAC3G,CAAC;QAEF,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAC1B,CAAC;IAED;;;OAGG;IACH,KAAK,CAAC,QAAQ,CACZ,MAAc,EACd,YAAgC,EAChC,OAAuB;QAEvB,MAAM,WAAW,GAAG,MAAM,IAAI,CAAC,iBAAiB,EAAE,CAAC;QACnD,IAAI,CAAC,WAAW,EAAE,CAAC;YACjB,MAAM,IAAI,KAAK,CACb,6HA
A6H,CAC9H,CAAC;QACJ,CAAC;QAED,MAAM,UAAU,GAAG,IAAI,CAAC,WAAW,CAAC,MAAM,EAAE,YAAY,CAAC,CAAC;QAE1D,
|
|
1
|
+
{"version":3,"file":"claudeCodeCLI.js","sourceRoot":"","sources":["../../src/adapters/claudeCodeCLI.ts"],"names":[],"mappings":";AAAA;;GAEG;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAEH,iDAAsC;AACtC,uCAAyB;AACzB,2CAA6B;AAE7B,gDAA+E;AAC/E,4DAGiC;AAOjC,MAAM,aAAa,GAAG,QAAQ,CAAC;AAE/B,MAAa,oBAAoB;IAK/B,YAAY,OAAqC;QAJjC,SAAI,GAAG,aAAsB,CAAC;QAK5C,IAAI,CAAC,aAAa,GAAG,IAAA,qCAAoB,EAAC,OAAO,EAAE,aAAa,CAAC,CAAC;QAClE,IAAI,CAAC,KAAK,GAAG,OAAO,EAAE,KAAK,IAAI,aAAa,CAAC;IAC/C,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,iBAAiB;QACrB,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,EAAE;YAC7B,MAAM,IAAI,GAAG,IAAA,qBAAK,EAAC,OAAO,EAAE,CAAC,QAAQ,CAAC,EAAE;gBACtC,KAAK,EAAE,MAAM;aACd,CAAC,CAAC;YAEH,IAAI,CAAC,EAAE,CAAC,OAAO,EAAE,CAAC,IAAI,EAAE,EAAE;gBACxB,OAAO,CAAC,IAAI,KAAK,CAAC,CAAC,CAAC;YACtB,CAAC,CAAC,CAAC;YAEH,IAAI,CAAC,EAAE,CAAC,OAAO,EAAE,GAAG,EAAE;gBACpB,OAAO,CAAC,KAAK,CAAC,CAAC;YACjB,CAAC,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;IACL,CAAC;IAED;;OAEG;IACH,QAAQ;QACN,OAAO,IAAI,CAAC,KAAK,CAAC;IACpB,CAAC;IAED;;OAEG;IACK,WAAW,CACjB,MAAc,EACd,YAAgC;QAEhC,MAAM,KAAK,GAAa,EAAE,CAAC;QAE3B,IAAI,YAAY,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC5C,MAAM,QAAQ,GAAG,IAAA,iCAAgB,EAAC,IAAI,CAAC,aAAa,EAAE,YAAY,CAAC,CAAC;YACpE,IAAI,QAAQ,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACxB,KAAK,CAAC,IAAI,CAAC,qBAAqB,CAAC,CAAC;gBAClC,MAAM,cAAc,GAAG,QAAQ;qBAC5B,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE;oBACX,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,YAAY,CAAC;oBAC5D,OAAO,OAAO,GAAG,CAAC,IAAI,WAAW,GAAG,KAAK,GAAG,CAAC,OAAO,UAAU,CAAC;gBACjE,CAAC,CAAC;qBACD,IAAI,CAAC,MAAM,CAAC,CAAC;gBAChB,KAAK,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;gBAC3B,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;YACxB,CAAC;QACH,CAAC;QAED,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;QACvB,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;QACnB,KAAK,CAAC,IAAI,CACR,0GAA0G,CAC3G,CAAC;QAEF,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAC1B,CAAC;IAED;;;OAGG;IACH,KAAK,CAAC,QAAQ,CACZ,MAAc,EACd,YAAgC,EAChC,OAAuB;QAEvB,MAAM,WAAW,GAAG,MAAM,IAAI,CAAC,iBAAiB,EAAE,CAAC;QACnD,IAAI,CAAC,WAAW,EAAE,CAAC;YACjB,MAAM,IAAI,KAAK,CACb,6HA
A6H,CAC9H,CAAC;QACJ,CAAC;QAED,MAAM,UAAU,GAAG,IAAI,CAAC,WAAW,CAAC,MAAM,EAAE,YAAY,CAAC,CAAC;QAE1D,MAAM,YAAY,GAAG,IAAA,gCAAqB,EAAC,IAAI,CAAC,aAAa,CAAC,CAAC;QAE/D,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,MAAM,EAAE,EAAE;YACrC,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,aAAa,EAAE,yBAAyB,CAAC,CAAC;YAC1E,EAAE,CAAC,aAAa,CAAC,QAAQ,EAAE,UAAU,EAAE,MAAM,CAAC,CAAC;YAE/C,MAAM,OAAO,GAAG,GAAS,EAAE;gBACzB,IAAI,CAAC;oBACH,IAAI,EAAE,CAAC,UAAU,CAAC,QAAQ,CAAC,EAAE,CAAC;wBAC5B,EAAE,CAAC,UAAU,CAAC,QAAQ,CAAC,CAAC;oBAC1B,CAAC;gBACH,CAAC;gBAAC,MAAM,CAAC;oBACP,wBAAwB;gBAC1B,CAAC;YACH,CAAC,CAAC;YAEF,MAAM,aAAa,GAAG,GAAS,EAAE;gBAC/B,OAAO,EAAE,CAAC;YACZ,CAAC,CAAC;YACF,OAAO,CAAC,IAAI,CAAC,QAAQ,EAAE,aAAa,CAAC,CAAC;YACtC,OAAO,CAAC,IAAI,CAAC,SAAS,EAAE,aAAa,CAAC,CAAC;YAEvC,MAAM,OAAO,GAAG,QAAQ,QAAQ,sBAAsB,IAAI,CAAC,KAAK,yIAAyI,CAAC;YAC1M,MAAM,IAAI,GAAG,IAAA,qBAAK,EAAC,IAAI,EAAE,CAAC,IAAI,EAAE,OAAO,CAAC,EAAE;gBACxC,GAAG,EAAE,IAAI,CAAC,aAAa;gBACvB,KAAK,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,CAAC;aAChC,CAAC,CAAC;YAEH,6DAA6D;YAC7D,IAAI,MAAM,GAAG,EAAE,CAAC;YAChB,IAAI,MAAM,GAAG,EAAE,CAAC;YAEhB,IAAI,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,EAAE;gBAC/B,MAAM,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC;YAC5B,CAAC,CAAC,CAAC;YAEH,IAAI,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,EAAE;gBAC/B,MAAM,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC;YAC5B,CAAC,CAAC,CAAC;YAEH,IAAI,aAAa,GAA0B,IAAI,CAAC;YAChD,IAAI,OAAO,KAAK,IAAI,IAAI,OAAO,KAAK,SAAS,EAAE,CAAC;gBAC9C,aAAa,GAAG,UAAU,CAAC,GAAG,EAAE;oBAC9B,IAAI,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;oBACrB,OAAO,EAAE,CAAC;oBACV,OAAO,CAAC,cAAc,CAAC,QAAQ,EAAE,aAAa,CAAC,CAAC;oBAChD,OAAO,CAAC,cAAc,CAAC,SAAS,EAAE,aAAa,CAAC,CAAC;oBACjD,MAAM,CAAC,IAAI,KAAK,CAAC,mCAAmC,OAAO,IAAI,CAAC,CAAC,CAAC;gBACpE,CAAC,EAAE,OAAO,CAAC,CAAC;YACd,CAAC;YAED,IAAI,CAAC,EAAE,CAAC,OAAO,EAAE,CAAC,IAAI,EAAE,EAAE;gBACxB,IAAI,aAAa,EAAE,CAAC;oBAClB,YAAY,CAAC,aAAa,CAAC,CAAC;gBAC9B,CAAC;gBAED,OAAO,EAAE,CAAC;gBACV,OAAO,CAAC,cAAc,CAAC,QAAQ,EAAE,aAAa,CAAC,CAAC;gBAChD,OAAO,CAAC,cAAc,CAAC,SAAS,EAAE,aAAa,CAAC,CAAC;gBAEjD,IAAI,IAAI,KAAK,CAAC,EAAE,CAAC;oBACf,M
AAM,CACJ,IAAI,KAAK,CACP,oCAAoC,IAAI,aAAa,MAAM,EAAE,CAC9D,CACF,CAAC;oBACF,OAAO;gBACT,CAAC;gBAED,IAAI,CAAC;oBACH,MAAM,WAAW,GAAG,IAAA,gCAAqB,EAAC,IAAI,CAAC,aAAa,CAAC,CAAC;oBAC9D,MAAM,YAAY,GAAG,IAAA,8BAAmB,EACtC,YAAY,EACZ,WAAW,EACX,IAAI,CAAC,aAAa,CACnB,CAAC;oBACF,OAAO,CAAC,YAAY,CAAC,CAAC;gBACxB,CAAC;gBAAC,OAAO,KAAK,EAAE,CAAC;oBACf,MAAM,CAAC,IAAI,KAAK,CAAC,gCAAgC,KAAK,EAAE,CAAC,CAAC,CAAC;gBAC7D,CAAC;YACH,CAAC,CAAC,CAAC;YAEH,IAAI,CAAC,EAAE,CAAC,OAAO,EAAE,CAAC,KAAK,EAAE,EAAE;gBACzB,IAAI,aAAa,EAAE,CAAC;oBAClB,YAAY,CAAC,aAAa,CAAC,CAAC;gBAC9B,CAAC;gBACD,OAAO,EAAE,CAAC;gBACV,OAAO,CAAC,cAAc,CAAC,QAAQ,EAAE,aAAa,CAAC,CAAC;gBAChD,OAAO,CAAC,cAAc,CAAC,SAAS,EAAE,aAAa,CAAC,CAAC;gBACjD,MAAM,CAAC,IAAI,KAAK,CAAC,oCAAoC,KAAK,EAAE,CAAC,CAAC,CAAC;YACjE,CAAC,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;IACL,CAAC;CACF;AApLD,oDAoLC"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"copilotCLI.d.ts","sourceRoot":"","sources":["../../src/adapters/copilotCLI.ts"],"names":[],"mappings":"AAAA;;GAEG;AAKH,OAAO,EAAE,qBAAqB,EAAE,MAAM,UAAU,CAAC;AAOjD,MAAM,WAAW,wBAAwB;IACvC,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAID,qBAAa,iBAAkB,YAAW,qBAAqB;IAC7D,SAAgB,IAAI,EAAG,SAAS,CAAU;IAC1C,OAAO,CAAC,aAAa,CAAS;IAC9B,OAAO,CAAC,KAAK,CAAS;gBAEV,OAAO,CAAC,EAAE,wBAAwB;IAKxC,iBAAiB,IAAI,OAAO,CAAC,OAAO,CAAC;IAgB3C;;OAEG;IACH,QAAQ,IAAI,MAAM;IAIlB;;OAEG;IACH,OAAO,CAAC,WAAW;IA8BnB;;;OAGG;IACG,QAAQ,CACZ,MAAM,EAAE,MAAM,EACd,YAAY,CAAC,EAAE,SAAS,MAAM,EAAE,EAChC,OAAO,CAAC,EAAE,MAAM,GAAG,IAAI,GACtB,OAAO,CAAC,MAAM,EAAE,CAAC;
|
|
1
|
+
{"version":3,"file":"copilotCLI.d.ts","sourceRoot":"","sources":["../../src/adapters/copilotCLI.ts"],"names":[],"mappings":"AAAA;;GAEG;AAKH,OAAO,EAAE,qBAAqB,EAAE,MAAM,UAAU,CAAC;AAOjD,MAAM,WAAW,wBAAwB;IACvC,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAID,qBAAa,iBAAkB,YAAW,qBAAqB;IAC7D,SAAgB,IAAI,EAAG,SAAS,CAAU;IAC1C,OAAO,CAAC,aAAa,CAAS;IAC9B,OAAO,CAAC,KAAK,CAAS;gBAEV,OAAO,CAAC,EAAE,wBAAwB;IAKxC,iBAAiB,IAAI,OAAO,CAAC,OAAO,CAAC;IAgB3C;;OAEG;IACH,QAAQ,IAAI,MAAM;IAIlB;;OAEG;IACH,OAAO,CAAC,WAAW;IA8BnB;;;OAGG;IACG,QAAQ,CACZ,MAAM,EAAE,MAAM,EACd,YAAY,CAAC,EAAE,SAAS,MAAM,EAAE,EAChC,OAAO,CAAC,EAAE,MAAM,GAAG,IAAI,GACtB,OAAO,CAAC,MAAM,EAAE,CAAC;CAyGrB"}
|
|
@@ -102,13 +102,10 @@ class CopilotCLIAdapter {
|
|
|
102
102
|
throw new Error("GitHub Copilot CLI is not available. Please install it first: https://docs.github.com/en/copilot/how-tos/set-up/install-copilot-cli");
|
|
103
103
|
}
|
|
104
104
|
const fullPrompt = this.buildPrompt(prompt, contextFiles);
|
|
105
|
-
// Capture git status before generation
|
|
106
105
|
const statusBefore = (0, gitUtils_1.getGitStatusPorcelain)(this.workspaceRoot);
|
|
107
|
-
// Write prompt to temp file and pipe via stdin (matches @copilot-evals pattern)
|
|
108
106
|
return new Promise((resolve, reject) => {
|
|
109
107
|
const tempFile = path.join(this.workspaceRoot, ".copilot-eval-prompt.txt");
|
|
110
108
|
fs.writeFileSync(tempFile, fullPrompt, "utf8");
|
|
111
|
-
// Cleanup function
|
|
112
109
|
const cleanup = () => {
|
|
113
110
|
try {
|
|
114
111
|
if (fs.existsSync(tempFile)) {
|
|
@@ -119,7 +116,6 @@ class CopilotCLIAdapter {
|
|
|
119
116
|
// Ignore cleanup errors
|
|
120
117
|
}
|
|
121
118
|
};
|
|
122
|
-
// Register cleanup on process termination
|
|
123
119
|
const cleanupOnExit = () => {
|
|
124
120
|
cleanup();
|
|
125
121
|
};
|
|
@@ -138,7 +134,6 @@ class CopilotCLIAdapter {
|
|
|
138
134
|
proc.stderr?.on("data", (data) => {
|
|
139
135
|
stderr += data.toString();
|
|
140
136
|
});
|
|
141
|
-
// Set timeout only if specified (null/undefined = no timeout)
|
|
142
137
|
let timeoutHandle = null;
|
|
143
138
|
if (timeout !== null && timeout !== undefined) {
|
|
144
139
|
timeoutHandle = setTimeout(() => {
|
|
@@ -160,7 +155,6 @@ class CopilotCLIAdapter {
|
|
|
160
155
|
reject(new Error(`Copilot CLI exited with code ${code}\nStderr: ${stderr}`));
|
|
161
156
|
return;
|
|
162
157
|
}
|
|
163
|
-
// Get files changed during generation (diff before/after)
|
|
164
158
|
try {
|
|
165
159
|
const statusAfter = (0, gitUtils_1.getGitStatusPorcelain)(this.workspaceRoot);
|
|
166
160
|
const changedFiles = (0, gitUtils_1.getChangedFilesDiff)(statusBefore, statusAfter, this.workspaceRoot);
|