coding-agent-benchmarks 0.3.3 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +78 -257
- package/dist/adapters/claudeCodeCLI.d.ts.map +1 -1
- package/dist/adapters/claudeCodeCLI.js +0 -6
- package/dist/adapters/claudeCodeCLI.js.map +1 -1
- package/dist/adapters/copilotCLI.d.ts.map +1 -1
- package/dist/adapters/copilotCLI.js +0 -6
- package/dist/adapters/copilotCLI.js.map +1 -1
- package/dist/config/loader.d.ts.map +1 -1
- package/dist/config/loader.js +0 -7
- package/dist/config/loader.js.map +1 -1
- package/dist/evaluator.d.ts +0 -12
- package/dist/evaluator.d.ts.map +1 -1
- package/dist/evaluator.js +0 -36
- package/dist/evaluator.js.map +1 -1
- package/dist/runner.js +2 -11
- package/dist/runner.js.map +1 -1
- package/dist/types.d.ts +1 -1
- package/dist/utils/baselineManager.d.ts.map +1 -1
- package/dist/utils/baselineManager.js +0 -4
- package/dist/utils/baselineManager.js.map +1 -1
- package/dist/utils/gitUtils.d.ts.map +1 -1
- package/dist/utils/gitUtils.js +0 -6
- package/dist/utils/gitUtils.js.map +1 -1
- package/dist/utils/workspaceUtils.d.ts.map +1 -1
- package/dist/utils/workspaceUtils.js +0 -5
- package/dist/utils/workspaceUtils.js.map +1 -1
- package/dist/validators/llmJudge.d.ts.map +1 -1
- package/dist/validators/llmJudge.js +3 -3
- package/dist/validators/llmJudge.js.map +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -5,8 +5,12 @@ Open-source framework for evaluating AI coding assistants (like GitHub Copilot C
|
|
|
5
5
|
|
|
6
6
|

|
|
7
7
|
|
|
8
|
+
*Figure 1: Evaluation workflow - prompt → generate → validate → score*
|
|
9
|
+
|
|
8
10
|

|
|
9
11
|
|
|
12
|
+
*Figure 2: Example terminal output showing scenario evaluation results*
|
|
13
|
+
|
|
10
14
|
|
|
11
15
|
## Features
|
|
12
16
|
|
|
@@ -135,7 +139,7 @@ module.exports = {
|
|
|
135
139
|
defaultAdapter: 'copilot',
|
|
136
140
|
|
|
137
141
|
// Default LLM model for judge
|
|
138
|
-
defaultModel: 'openai/gpt-
|
|
142
|
+
defaultModel: 'openai/gpt-5',
|
|
139
143
|
|
|
140
144
|
// Default timeout for code generation (milliseconds)
|
|
141
145
|
// Individual scenarios can override this
|
|
@@ -172,33 +176,15 @@ module.exports = {
|
|
|
172
176
|
id: 'react-no-inline-styles',
|
|
173
177
|
category: 'react',
|
|
174
178
|
severity: 'major',
|
|
175
|
-
tags: ['react', 'styling', 'best-practices'],
|
|
176
179
|
description: 'Forbid inline style objects in React components',
|
|
177
|
-
prompt: 'Create a React functional component called Button that accepts a "label" prop and renders a styled button. Use CSS classes instead of inline styles.',
|
|
178
180
|
validationStrategy: {
|
|
179
181
|
patterns: {
|
|
180
|
-
forbiddenPatterns: [/style\s*=\s*\{\{
|
|
182
|
+
forbiddenPatterns: [/style\s*=\s*\{\{/],
|
|
181
183
|
requiredPatterns: [/className/],
|
|
182
184
|
},
|
|
183
185
|
},
|
|
184
186
|
},
|
|
185
|
-
|
|
186
|
-
id: 'async-error-handling',
|
|
187
|
-
category: 'general',
|
|
188
|
-
severity: 'critical',
|
|
189
|
-
tags: ['async', 'error-handling', 'robustness'],
|
|
190
|
-
description: 'Ensure async functions have proper error handling',
|
|
191
|
-
prompt: 'Create an async function called fetchUserData that takes a userId parameter, makes an HTTP request to fetch user data, and returns the user object. Handle errors appropriately.',
|
|
192
|
-
validationStrategy: {
|
|
193
|
-
patterns: {
|
|
194
|
-
requiredPatterns: [/async\s+function\s+fetchUserData|const\s+fetchUserData.*async/, /try|catch|\.catch\(/],
|
|
195
|
-
},
|
|
196
|
-
llmJudge: {
|
|
197
|
-
enabled: true,
|
|
198
|
-
judgmentPrompt: 'Evaluate the error handling in this async function. Does it use try/catch or .catch()? Are errors logged or re-thrown appropriately?',
|
|
199
|
-
},
|
|
200
|
-
},
|
|
201
|
-
},
|
|
187
|
+
// Add more scenarios...
|
|
202
188
|
],
|
|
203
189
|
};
|
|
204
190
|
```
|
|
@@ -211,29 +197,6 @@ You can configure timeouts at three levels (in order of precedence):
|
|
|
211
197
|
2. **Global default**: Set `defaultTimeout` in your config file
|
|
212
198
|
3. **Built-in default**: 120000ms (2 minutes) if nothing else is specified
|
|
213
199
|
|
|
214
|
-
```javascript
|
|
215
|
-
// In benchmarks.config.js
|
|
216
|
-
module.exports = {
|
|
217
|
-
// Global default applies to all scenarios
|
|
218
|
-
defaultTimeout: 180000, // 3 minutes
|
|
219
|
-
|
|
220
|
-
scenarios: [
|
|
221
|
-
{
|
|
222
|
-
id: 'quick-check',
|
|
223
|
-
prompt: '...',
|
|
224
|
-
timeout: 60000, // Override: 1 minute for this scenario
|
|
225
|
-
// ...
|
|
226
|
-
},
|
|
227
|
-
{
|
|
228
|
-
id: 'complex-task',
|
|
229
|
-
prompt: '...',
|
|
230
|
-
// Will use defaultTimeout (3 minutes)
|
|
231
|
-
// ...
|
|
232
|
-
},
|
|
233
|
-
],
|
|
234
|
-
};
|
|
235
|
-
```
|
|
236
|
-
|
|
237
200
|
**Why configure timeouts?**
|
|
238
201
|
- Complex code generation tasks may need more time
|
|
239
202
|
- Simple checks can complete faster with shorter timeouts
|
|
@@ -271,133 +234,73 @@ validationStrategy: {
|
|
|
271
234
|
|
|
272
235
|
### LLM-as-Judge
|
|
273
236
|
|
|
274
|
-
Semantic evaluation using AI:
|
|
237
|
+
Semantic evaluation using AI (requires `GITHUB_TOKEN`):
|
|
275
238
|
|
|
276
239
|
```javascript
|
|
277
240
|
validationStrategy: {
|
|
278
241
|
llmJudge: {
|
|
279
242
|
enabled: true,
|
|
280
|
-
model: 'openai/gpt-
|
|
243
|
+
model: 'openai/gpt-5',
|
|
281
244
|
judgmentPrompt: `Evaluate if the code follows best practices...`,
|
|
282
245
|
},
|
|
283
246
|
}
|
|
284
247
|
```
|
|
285
248
|
|
|
286
|
-
The LLM judge requires a `GITHUB_TOKEN` environment variable with access to GitHub Models API.
|
|
287
|
-
|
|
288
249
|
### ESLint Integration
|
|
289
250
|
|
|
290
251
|

|
|
291
252
|
|
|
253
|
+
*Figure 3: ESLint validator detecting code quality issues in generated code*
|
|
292
254
|
|
|
293
255
|
Run ESLint on generated code:
|
|
294
256
|
|
|
295
257
|
```javascript
|
|
296
258
|
validationStrategy: {
|
|
297
|
-
eslint: {
|
|
298
|
-
enabled: true,
|
|
299
|
-
configPath: '.eslintrc.js', // optional
|
|
300
|
-
},
|
|
259
|
+
eslint: { enabled: true, configPath: '.eslintrc.js' },
|
|
301
260
|
}
|
|
302
261
|
```
|
|
303
262
|
|
|
304
263
|
## Scoring System
|
|
305
264
|
|
|
306
|
-
The scoring system operates at three levels: per-validator scoring, per-scenario scoring, and summary scoring.
|
|
307
|
-
|
|
308
265
|
### Per-Validator Scoring
|
|
309
266
|
|
|
310
|
-
Each validator
|
|
311
|
-
|
|
312
|
-
#### Pattern Validator
|
|
313
|
-
|
|
314
|
-
Uses exponential decay based on weighted violations:
|
|
315
|
-
|
|
316
|
-
```
|
|
317
|
-
score = e^(-totalWeight)
|
|
318
|
-
```
|
|
319
|
-
|
|
320
|
-
Where `totalWeight` is the sum of violation weights:
|
|
321
|
-
- **Critical violations**: 1.0 weight each
|
|
322
|
-
- **Major violations**: 0.7 weight each
|
|
323
|
-
- **Minor violations**: 0.3 weight each
|
|
324
|
-
|
|
325
|
-
**Examples**:
|
|
326
|
-
- 0 violations → score = 1.0 (perfect)
|
|
327
|
-
- 1 critical violation → score ≈ 0.37
|
|
328
|
-
- 1 major violation → score ≈ 0.50
|
|
329
|
-
- 2 minor violations → score ≈ 0.55
|
|
330
|
-
|
|
331
|
-
#### LLM Judge Validator
|
|
332
|
-
|
|
333
|
-
The LLM (GPT-4 or other model) evaluates the code semantically and returns:
|
|
334
|
-
- An `overallScore` from 0.0 to 1.0
|
|
335
|
-
- A list of violations with explanations
|
|
336
|
-
- Passed if: score ≥ 0.7 AND no violations
|
|
337
|
-
|
|
338
|
-
The LLM judge provides semantic understanding beyond pattern matching, evaluating whether the code actually solves the problem correctly and follows best practices.
|
|
339
|
-
|
|
340
|
-
#### ESLint Validator
|
|
341
|
-
|
|
342
|
-
This validator runs ESLint on the generated code and scores based on the number and severity of linting violations. Note that ESLint must be installed and configured in your project for this validator to work. If you don't have ESLint set up globally, disable this validator or provide a custom validator.
|
|
343
|
-
|
|
344
|
-
Uses exponential decay with a dampening factor:
|
|
345
|
-
|
|
346
|
-
```
|
|
347
|
-
score = e^(-totalWeight / 2)
|
|
348
|
-
```
|
|
349
|
-
|
|
350
|
-
ESLint violations are mapped to severity:
|
|
351
|
-
- ESLint error (severity 2) → **Major** violation (0.7 weight)
|
|
352
|
-
- ESLint warning (severity 1) → **Minor** violation (0.3 weight)
|
|
267
|
+
Each validator independently evaluates generated code and produces a score from 0.0 to 1.0:
|
|
353
268
|
|
|
354
|
-
|
|
269
|
+
| Validator | Scoring Method | Notes |
|
|
270
|
+
|-----------|----------------|-------|
|
|
271
|
+
| **Pattern** | Uses exponential decay based on weighted violations | Critical: 1.0 weight, Major: 0.7 weight, Minor: 0.3 weight |
|
|
272
|
+
| **LLM Judge** | AI evaluates semantically, returns 0.0-1.0 score | Passes if score ≥ 0.7 AND no violations |
|
|
273
|
+
| **ESLint** | Exponential decay with dampening factor (÷2) | ESLint errors → Major (0.7), warnings → Minor (0.3) |
|
|
355
274
|
|
|
356
275
|
### Per-Scenario Scoring
|
|
357
276
|
|
|
358
|
-
Each scenario receives an **overall score**
|
|
359
|
-
|
|
360
|
-
```
|
|
361
|
-
overallScore = average of all active validator scores
|
|
362
|
-
```
|
|
277
|
+
Each scenario receives an **overall score** = **average of all active validator scores**
|
|
363
278
|
|
|
364
|
-
**Active validators** are those that
|
|
365
|
-
- Are configured in the scenario's `validationStrategy`
|
|
366
|
-
- Successfully ran (did not return score = -1)
|
|
279
|
+
**Active validators** are those configured in `validationStrategy` that successfully ran (score ≠ -1).
|
|
367
280
|
|
|
368
281
|
**Pass/Fail Criteria**:
|
|
369
282
|
- ✅ **PASS**: `overallScore ≥ 0.8` AND `violations.length === 0`
|
|
370
283
|
- ❌ **FAIL**: `overallScore < 0.8` OR `violations.length > 0`
|
|
371
284
|
- ⚠️ **SKIP**: An error occurred during evaluation (timeout, adapter failure, etc.)
|
|
372
285
|
|
|
373
|
-
**Example**:
|
|
374
|
-
```
|
|
375
|
-
overallScore = (0.9 + 0.8) / 2 = 0.85
|
|
376
|
-
```
|
|
286
|
+
**Example**: Pattern (0.9) + LLM Judge (0.8) + ESLint (skipped) → `overallScore = (0.9 + 0.8) / 2 = 0.85`
|
|
377
287
|
|
|
378
288
|
### Summary Scoring
|
|
379
289
|
|
|
380
|
-
After evaluating all scenarios, the framework calculates
|
|
290
|
+
After evaluating all scenarios, the framework calculates:
|
|
381
291
|
|
|
382
292
|
```javascript
|
|
383
293
|
{
|
|
384
|
-
total: 10, // Total
|
|
385
|
-
passed: 7, //
|
|
386
|
-
failed: 2, //
|
|
387
|
-
skipped: 1, //
|
|
388
|
-
averageScore: 0.78, // Average of all scenario
|
|
294
|
+
total: 10, // Total scenarios
|
|
295
|
+
passed: 7, // overallScore ≥ 0.8 and no violations
|
|
296
|
+
failed: 2, // Evaluated but didn't pass
|
|
297
|
+
skipped: 1, // Encountered errors
|
|
298
|
+
averageScore: 0.78, // Average of all scenario scores
|
|
389
299
|
totalViolations: 8 // Sum of violations across all scenarios
|
|
390
300
|
}
|
|
391
301
|
```
|
|
392
302
|
|
|
393
|
-
**
|
|
394
|
-
```
|
|
395
|
-
averageScore = (sum of all scenario scores) / total scenarios
|
|
396
|
-
```
|
|
397
|
-
|
|
398
|
-
This includes scores from failed scenarios, providing an overall quality metric across your entire test suite.
|
|
399
|
-
|
|
400
|
-
**Transparency**: When baselines are saved, the per-validator breakdown is included in the baseline file, allowing you to trace exactly which validator contributed what score. See [Baseline File Format](#baseline-file-format) for details.
|
|
303
|
+
**Transparency**: Baselines include per-validator breakdowns. See [Baseline File Format](#baseline-file-format) for details.
|
|
401
304
|
|
|
402
305
|
### Score Interpretation
|
|
403
306
|
|
|
@@ -417,11 +320,6 @@ When baseline tracking is enabled, you'll see delta metrics:
|
|
|
417
320
|
↑ +18.5% improvement from baseline
|
|
418
321
|
```
|
|
419
322
|
|
|
420
|
-
The percentage is calculated as:
|
|
421
|
-
```
|
|
422
|
-
percentage = (currentScore - baselineScore) / baselineScore * 100
|
|
423
|
-
```
|
|
424
|
-
|
|
425
323
|
## Baseline Tracking
|
|
426
324
|
|
|
427
325
|
Track evaluation results over time by enabling baseline management in your config file:
|
|
@@ -449,42 +347,21 @@ When `compareBaseline` is enabled, the report will show score deltas and whether
|
|
|
449
347
|
|
|
450
348
|
### Baseline File Format
|
|
451
349
|
|
|
452
|
-
|
|
350
|
+
Path: `.benchmarks/baselines/{adapter}/{model}/{scenario-id}.json`
|
|
351
|
+
|
|
352
|
+
Each baseline file provides complete score traceability:
|
|
453
353
|
|
|
454
354
|
```json
|
|
455
355
|
{
|
|
456
356
|
"scenarioId": "typescript-no-any",
|
|
457
357
|
"score": 0.685,
|
|
458
358
|
"violations": [
|
|
459
|
-
{
|
|
460
|
-
"type": "pattern",
|
|
461
|
-
"message": "Forbidden pattern found: :\\s*any\\b",
|
|
462
|
-
"file": "src/types.ts",
|
|
463
|
-
"line": 12,
|
|
464
|
-
"severity": "critical",
|
|
465
|
-
"details": "Matched: \"metadata: any\""
|
|
466
|
-
}
|
|
359
|
+
{ "type": "pattern", "message": "Forbidden pattern found: :\\s*any\\b", "file": "src/types.ts", ... }
|
|
467
360
|
],
|
|
468
361
|
"validationResults": [
|
|
469
|
-
{
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
"violations": [...],
|
|
473
|
-
"validatorType": "pattern"
|
|
474
|
-
},
|
|
475
|
-
{
|
|
476
|
-
"passed": true,
|
|
477
|
-
"score": 1.0,
|
|
478
|
-
"violations": [],
|
|
479
|
-
"validatorType": "llm-judge"
|
|
480
|
-
},
|
|
481
|
-
{
|
|
482
|
-
"passed": true,
|
|
483
|
-
"score": -1,
|
|
484
|
-
"violations": [],
|
|
485
|
-
"validatorType": "eslint",
|
|
486
|
-
"error": "ESLint not found"
|
|
487
|
-
}
|
|
362
|
+
{ "passed": false, "score": 0.37, "validatorType": "pattern", "violations": [...] },
|
|
363
|
+
{ "passed": true, "score": 1.0, "validatorType": "llm-judge", "violations": [] },
|
|
364
|
+
{ "passed": true, "score": -1, "validatorType": "eslint", "error": "ESLint not found" }
|
|
488
365
|
],
|
|
489
366
|
"timestamp": "2026-01-23T22:28:32.216Z",
|
|
490
367
|
"adapter": "copilot",
|
|
@@ -493,15 +370,11 @@ Each baseline file contains complete transparency into how the score was calcula
|
|
|
493
370
|
```
|
|
494
371
|
|
|
495
372
|
**Key fields**:
|
|
496
|
-
- `score
|
|
497
|
-
- `violations
|
|
498
|
-
- `validationResults
|
|
499
|
-
- Individual validator score
|
|
500
|
-
- Whether that validator passed
|
|
501
|
-
- Violations specific to that validator
|
|
502
|
-
- Any errors that occurred (`score: -1` means skipped)
|
|
373
|
+
- `score` - Overall scenario score (average of active validators)
|
|
374
|
+
- `violations` - All violations from all validators combined
|
|
375
|
+
- `validationResults` - Per-validator breakdown (score, passed, violations, errors)
|
|
503
376
|
|
|
504
|
-
**
|
|
377
|
+
**Traceability**: You can always trace the overall score back to individual validator scores (e.g., `score: 0.067` → check `validationResults` for Pattern: 0.135, LLM Judge: 0.00).
|
|
505
378
|
|
|
506
379
|
## CLI Commands
|
|
507
380
|
|
|
@@ -509,24 +382,26 @@ Each baseline file contains complete transparency into how the score was calcula
|
|
|
509
382
|
|
|
510
383
|
Run benchmark evaluations.
|
|
511
384
|
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
385
|
+
| Option | Description | Default/Example |
|
|
386
|
+
|--------|-------------|-----------------|
|
|
387
|
+
| `--scenario <pattern>` | Filter by scenario ID (supports wildcards) | `typescript-*` |
|
|
388
|
+
| `--category <categories>` | Filter by category (comma-separated) | `typescript,react` |
|
|
389
|
+
| `--tag <tags>` | Filter by tags (comma-separated) | `safety,types` |
|
|
390
|
+
| `--adapter <type>` | Adapter to use | `copilot` or `claude-code` |
|
|
391
|
+
| `--model <model>` | LLM model for judge | `openai/gpt-5` |
|
|
392
|
+
| `--threshold <number>` | Minimum passing score | `0.8` |
|
|
393
|
+
| `--verbose` | Show detailed output | - |
|
|
394
|
+
| `--output <file>` | Export JSON report | `report.json` |
|
|
395
|
+
| `--workspace-root <path>` | Workspace root directory | Current directory |
|
|
522
396
|
|
|
523
397
|
### `list`
|
|
524
398
|
|
|
525
399
|
List available test scenarios.
|
|
526
400
|
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
401
|
+
| Option | Description |
|
|
402
|
+
|--------|-------------|
|
|
403
|
+
| `--category <categories>` | Filter by category (comma-separated) |
|
|
404
|
+
| `--tag <tags>` | Filter by tags (comma-separated) |
|
|
530
405
|
|
|
531
406
|
### `check`
|
|
532
407
|
|
|
@@ -536,8 +411,9 @@ Check if coding agent CLIs are available.
|
|
|
536
411
|
|
|
537
412
|
Test LLM judge with a custom prompt (for debugging).
|
|
538
413
|
|
|
539
|
-
|
|
540
|
-
|
|
414
|
+
| Option | Description |
|
|
415
|
+
|--------|-------------|
|
|
416
|
+
| `--model <model>` | LLM model to use |
|
|
541
417
|
|
|
542
418
|
## Understanding Output
|
|
543
419
|
|
|
@@ -586,26 +462,15 @@ import { Evaluator, loadConfig } from 'coding-agent-benchmarks';
|
|
|
586
462
|
async function runEvaluation() {
|
|
587
463
|
const { config, scenarios } = await loadConfig();
|
|
588
464
|
|
|
589
|
-
// Create evaluator
|
|
590
465
|
const evaluator = new Evaluator({
|
|
591
466
|
adapter: 'copilot',
|
|
592
|
-
model: 'openai/gpt-
|
|
467
|
+
model: 'openai/gpt-5',
|
|
593
468
|
verbose: true,
|
|
594
469
|
saveBaseline: config.saveBaseline,
|
|
595
470
|
compareBaseline: config.compareBaseline,
|
|
596
471
|
});
|
|
597
472
|
|
|
598
|
-
// Check adapter availability
|
|
599
|
-
const available = await evaluator.checkAdapterAvailability();
|
|
600
|
-
if (!available) {
|
|
601
|
-
throw new Error('Adapter not available');
|
|
602
|
-
}
|
|
603
|
-
|
|
604
|
-
// Run evaluation
|
|
605
473
|
const report = await evaluator.evaluate(scenarios);
|
|
606
|
-
|
|
607
|
-
console.log(`Passed: ${report.summary.passed}/${report.summary.total}`);
|
|
608
|
-
console.log(`Average score: ${report.summary.averageScore.toFixed(2)}`);
|
|
609
474
|
}
|
|
610
475
|
|
|
611
476
|
runEvaluation();
|
|
@@ -616,106 +481,62 @@ runEvaluation();
|
|
|
616
481
|
Implement the `CodeValidator` interface:
|
|
617
482
|
|
|
618
483
|
```typescript
|
|
619
|
-
import { CodeValidator, ValidationResult
|
|
484
|
+
import { CodeValidator, ValidationResult, TestScenario } from 'coding-agent-benchmarks';
|
|
620
485
|
|
|
621
486
|
export class CustomValidator implements CodeValidator {
|
|
622
487
|
public readonly type = 'custom';
|
|
623
488
|
|
|
624
|
-
async validate(
|
|
625
|
-
files: readonly string[],
|
|
626
|
-
scenario: TestScenario
|
|
627
|
-
): Promise<ValidationResult> {
|
|
489
|
+
async validate(files: readonly string[], scenario: TestScenario): Promise<ValidationResult> {
|
|
628
490
|
// Your validation logic here
|
|
629
|
-
return {
|
|
630
|
-
passed: true,
|
|
631
|
-
score: 1.0,
|
|
632
|
-
violations: [],
|
|
633
|
-
validatorType: 'custom',
|
|
634
|
-
};
|
|
491
|
+
return { passed: true, score: 1.0, violations: [], validatorType: 'custom' };
|
|
635
492
|
}
|
|
636
493
|
}
|
|
637
494
|
```
|
|
638
495
|
|
|
496
|
+
See CONTRIBUTING.md for complete examples.
|
|
497
|
+
|
|
639
498
|
## Creating Custom Adapters
|
|
640
499
|
|
|
641
500
|
Implement the `CodeGenerationAdapter` interface:
|
|
642
501
|
|
|
643
502
|
```typescript
|
|
644
|
-
import { CodeGenerationAdapter
|
|
503
|
+
import { CodeGenerationAdapter } from 'coding-agent-benchmarks';
|
|
645
504
|
|
|
646
505
|
export class CustomAdapter implements CodeGenerationAdapter {
|
|
647
|
-
public readonly type
|
|
648
|
-
|
|
649
|
-
async checkAvailability(): Promise<boolean> {
|
|
650
|
-
// Check if CLI is available
|
|
651
|
-
return true;
|
|
652
|
-
}
|
|
506
|
+
public readonly type = 'custom-adapter';
|
|
653
507
|
|
|
654
|
-
async
|
|
655
|
-
|
|
656
|
-
contextFiles?: readonly string[],
|
|
657
|
-
timeout?: number
|
|
658
|
-
): Promise<string[]> {
|
|
659
|
-
// Generate code and return changed files
|
|
660
|
-
return ['path/to/generated/file.ts'];
|
|
661
|
-
}
|
|
508
|
+
async checkAvailability(): Promise<boolean> { /* ... */ }
|
|
509
|
+
async generate(prompt: string, contextFiles?: readonly string[], timeout?: number): Promise<string[]> { /* ... */ }
|
|
662
510
|
}
|
|
663
511
|
```
|
|
664
512
|
|
|
513
|
+
See CONTRIBUTING.md for complete examples.
|
|
514
|
+
|
|
665
515
|
## GitHub Authentication (for LLM Judge)
|
|
666
516
|
|
|
667
|
-
LLM-as-judge validation requires GitHub authentication to access GitHub Models API.
|
|
517
|
+
LLM-as-judge validation requires GitHub authentication to access GitHub Models API. Choose one option:
|
|
668
518
|
|
|
669
519
|
### Option 1: Personal Access Token (Recommended)
|
|
670
520
|
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
6. Set environment variable:
|
|
677
|
-
```bash
|
|
678
|
-
export GITHUB_TOKEN=ghp_xxxxxxxxxxxxxxxxxxxx
|
|
679
|
-
```
|
|
521
|
+
Create a token at https://github.com/settings/tokens with the **`models:read`** scope, then set it as an environment variable:
|
|
522
|
+
|
|
523
|
+
```bash
|
|
524
|
+
export GITHUB_TOKEN=ghp_xxxxxxxxxxxxxxxxxxxx
|
|
525
|
+
```
|
|
680
526
|
|
|
681
527
|
### Option 2: GitHub CLI (Automatic)
|
|
682
528
|
|
|
683
|
-
If
|
|
529
|
+
If GitHub CLI is installed, tokens are auto-detected:
|
|
684
530
|
|
|
685
531
|
```bash
|
|
686
|
-
#
|
|
687
|
-
brew install gh # macOS
|
|
688
|
-
# or download from https://cli.github.com
|
|
689
|
-
|
|
690
|
-
# Authenticate (one time)
|
|
532
|
+
brew install gh # or download from https://cli.github.com
|
|
691
533
|
gh auth login
|
|
692
|
-
|
|
693
534
|
# Token will be used automatically - no GITHUB_TOKEN needed!
|
|
694
535
|
```
|
|
695
536
|
|
|
696
|
-
### Check Authentication
|
|
697
|
-
|
|
698
|
-
```bash
|
|
699
|
-
npx coding-agent-benchmarks check
|
|
700
|
-
```
|
|
701
|
-
|
|
702
|
-
Output:
|
|
703
|
-
```
|
|
704
|
-
Checking adapter availability...
|
|
705
|
-
GitHub Copilot CLI: ✓ Available
|
|
706
|
-
Claude Code CLI: ✗ Not found
|
|
707
|
-
|
|
708
|
-
Checking GitHub authentication...
|
|
709
|
-
✓ Using token from GitHub CLI (gh auth token)
|
|
710
|
-
```
|
|
711
|
-
|
|
712
|
-
## How It Works
|
|
537
|
+
### Check Authentication
|
|
713
538
|
|
|
714
|
-
|
|
715
|
-
2. **File Tracking**: Git is used to detect which files were created/modified
|
|
716
|
-
3. **Validation**: Multiple validators check the generated code
|
|
717
|
-
4. **Scoring**: Results are aggregated and compared against thresholds
|
|
718
|
-
5. **Reporting**: Results are displayed in terminal and optionally exported as JSON
|
|
539
|
+
Run `npx coding-agent-benchmarks check` to verify authentication status.
|
|
719
540
|
|
|
720
541
|
## Requirements
|
|
721
542
|
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"claudeCodeCLI.d.ts","sourceRoot":"","sources":["../../src/adapters/claudeCodeCLI.ts"],"names":[],"mappings":"AAAA;;GAEG;AAKH,OAAO,EAAE,qBAAqB,EAAE,MAAM,UAAU,CAAC;AAOjD,MAAM,WAAW,2BAA2B;IAC1C,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAID,qBAAa,oBAAqB,YAAW,qBAAqB;IAChE,SAAgB,IAAI,EAAG,aAAa,CAAU;IAC9C,OAAO,CAAC,aAAa,CAAS;IAC9B,OAAO,CAAC,KAAK,CAAS;gBAEV,OAAO,CAAC,EAAE,2BAA2B;IAKjD;;OAEG;IACG,iBAAiB,IAAI,OAAO,CAAC,OAAO,CAAC;IAgB3C;;OAEG;IACH,QAAQ,IAAI,MAAM;IAIlB;;OAEG;IACH,OAAO,CAAC,WAAW;IA8BnB;;;OAGG;IACG,QAAQ,CACZ,MAAM,EAAE,MAAM,EACd,YAAY,CAAC,EAAE,SAAS,MAAM,EAAE,EAChC,OAAO,CAAC,EAAE,MAAM,GAAG,IAAI,GACtB,OAAO,CAAC,MAAM,EAAE,CAAC;
|
|
1
|
+
{"version":3,"file":"claudeCodeCLI.d.ts","sourceRoot":"","sources":["../../src/adapters/claudeCodeCLI.ts"],"names":[],"mappings":"AAAA;;GAEG;AAKH,OAAO,EAAE,qBAAqB,EAAE,MAAM,UAAU,CAAC;AAOjD,MAAM,WAAW,2BAA2B;IAC1C,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAID,qBAAa,oBAAqB,YAAW,qBAAqB;IAChE,SAAgB,IAAI,EAAG,aAAa,CAAU;IAC9C,OAAO,CAAC,aAAa,CAAS;IAC9B,OAAO,CAAC,KAAK,CAAS;gBAEV,OAAO,CAAC,EAAE,2BAA2B;IAKjD;;OAEG;IACG,iBAAiB,IAAI,OAAO,CAAC,OAAO,CAAC;IAgB3C;;OAEG;IACH,QAAQ,IAAI,MAAM;IAIlB;;OAEG;IACH,OAAO,CAAC,WAAW;IA8BnB;;;OAGG;IACG,QAAQ,CACZ,MAAM,EAAE,MAAM,EACd,YAAY,CAAC,EAAE,SAAS,MAAM,EAAE,EAChC,OAAO,CAAC,EAAE,MAAM,GAAG,IAAI,GACtB,OAAO,CAAC,MAAM,EAAE,CAAC;CAuGrB"}
|
|
@@ -105,13 +105,10 @@ class ClaudeCodeCLIAdapter {
|
|
|
105
105
|
throw new Error("Claude Code CLI is not available. Please install it first: https://docs.anthropic.com/en/docs/build-with-claude/claude-code");
|
|
106
106
|
}
|
|
107
107
|
const fullPrompt = this.buildPrompt(prompt, contextFiles);
|
|
108
|
-
// Capture git status before generation
|
|
109
108
|
const statusBefore = (0, gitUtils_1.getGitStatusPorcelain)(this.workspaceRoot);
|
|
110
|
-
// Write prompt to temp file and pipe via stdin (matches @copilot-evals pattern)
|
|
111
109
|
return new Promise((resolve, reject) => {
|
|
112
110
|
const tempFile = path.join(this.workspaceRoot, ".claude-eval-prompt.txt");
|
|
113
111
|
fs.writeFileSync(tempFile, fullPrompt, "utf8");
|
|
114
|
-
// Cleanup function
|
|
115
112
|
const cleanup = () => {
|
|
116
113
|
try {
|
|
117
114
|
if (fs.existsSync(tempFile)) {
|
|
@@ -122,7 +119,6 @@ class ClaudeCodeCLIAdapter {
|
|
|
122
119
|
// Ignore cleanup errors
|
|
123
120
|
}
|
|
124
121
|
};
|
|
125
|
-
// Register cleanup on process termination
|
|
126
122
|
const cleanupOnExit = () => {
|
|
127
123
|
cleanup();
|
|
128
124
|
};
|
|
@@ -142,7 +138,6 @@ class ClaudeCodeCLIAdapter {
|
|
|
142
138
|
proc.stderr?.on("data", (data) => {
|
|
143
139
|
stderr += data.toString();
|
|
144
140
|
});
|
|
145
|
-
// Set timeout only if specified (null/undefined = no timeout)
|
|
146
141
|
let timeoutHandle = null;
|
|
147
142
|
if (timeout !== null && timeout !== undefined) {
|
|
148
143
|
timeoutHandle = setTimeout(() => {
|
|
@@ -164,7 +159,6 @@ class ClaudeCodeCLIAdapter {
|
|
|
164
159
|
reject(new Error(`Claude Code CLI exited with code ${code}\nStderr: ${stderr}`));
|
|
165
160
|
return;
|
|
166
161
|
}
|
|
167
|
-
// Get files changed during generation (diff before/after)
|
|
168
162
|
try {
|
|
169
163
|
const statusAfter = (0, gitUtils_1.getGitStatusPorcelain)(this.workspaceRoot);
|
|
170
164
|
const changedFiles = (0, gitUtils_1.getChangedFilesDiff)(statusBefore, statusAfter, this.workspaceRoot);
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"claudeCodeCLI.js","sourceRoot":"","sources":["../../src/adapters/claudeCodeCLI.ts"],"names":[],"mappings":";AAAA;;GAEG;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAEH,iDAAsC;AACtC,uCAAyB;AACzB,2CAA6B;AAE7B,gDAA+E;AAC/E,4DAGiC;AAOjC,MAAM,aAAa,GAAG,QAAQ,CAAC;AAE/B,MAAa,oBAAoB;IAK/B,YAAY,OAAqC;QAJjC,SAAI,GAAG,aAAsB,CAAC;QAK5C,IAAI,CAAC,aAAa,GAAG,IAAA,qCAAoB,EAAC,OAAO,EAAE,aAAa,CAAC,CAAC;QAClE,IAAI,CAAC,KAAK,GAAG,OAAO,EAAE,KAAK,IAAI,aAAa,CAAC;IAC/C,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,iBAAiB;QACrB,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,EAAE;YAC7B,MAAM,IAAI,GAAG,IAAA,qBAAK,EAAC,OAAO,EAAE,CAAC,QAAQ,CAAC,EAAE;gBACtC,KAAK,EAAE,MAAM;aACd,CAAC,CAAC;YAEH,IAAI,CAAC,EAAE,CAAC,OAAO,EAAE,CAAC,IAAI,EAAE,EAAE;gBACxB,OAAO,CAAC,IAAI,KAAK,CAAC,CAAC,CAAC;YACtB,CAAC,CAAC,CAAC;YAEH,IAAI,CAAC,EAAE,CAAC,OAAO,EAAE,GAAG,EAAE;gBACpB,OAAO,CAAC,KAAK,CAAC,CAAC;YACjB,CAAC,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;IACL,CAAC;IAED;;OAEG;IACH,QAAQ;QACN,OAAO,IAAI,CAAC,KAAK,CAAC;IACpB,CAAC;IAED;;OAEG;IACK,WAAW,CACjB,MAAc,EACd,YAAgC;QAEhC,MAAM,KAAK,GAAa,EAAE,CAAC;QAE3B,IAAI,YAAY,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC5C,MAAM,QAAQ,GAAG,IAAA,iCAAgB,EAAC,IAAI,CAAC,aAAa,EAAE,YAAY,CAAC,CAAC;YACpE,IAAI,QAAQ,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACxB,KAAK,CAAC,IAAI,CAAC,qBAAqB,CAAC,CAAC;gBAClC,MAAM,cAAc,GAAG,QAAQ;qBAC5B,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE;oBACX,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,YAAY,CAAC;oBAC5D,OAAO,OAAO,GAAG,CAAC,IAAI,WAAW,GAAG,KAAK,GAAG,CAAC,OAAO,UAAU,CAAC;gBACjE,CAAC,CAAC;qBACD,IAAI,CAAC,MAAM,CAAC,CAAC;gBAChB,KAAK,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;gBAC3B,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;YACxB,CAAC;QACH,CAAC;QAED,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;QACvB,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;QACnB,KAAK,CAAC,IAAI,CACR,0GAA0G,CAC3G,CAAC;QAEF,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAC1B,CAAC;IAED;;;OAGG;IACH,KAAK,CAAC,QAAQ,CACZ,MAAc,EACd,YAAgC,EAChC,OAAuB;QAEvB,MAAM,WAAW,GAAG,MAAM,IAAI,CAAC,iBAAiB,EAAE,CAAC;QACnD,IAAI,CAAC,WAAW,EAAE,CAAC;YACjB,MAAM,IAAI,KAAK,CACb,6HA
A6H,CAC9H,CAAC;QACJ,CAAC;QAED,MAAM,UAAU,GAAG,IAAI,CAAC,WAAW,CAAC,MAAM,EAAE,YAAY,CAAC,CAAC;QAE1D,
|
|
1
|
+
{"version":3,"file":"claudeCodeCLI.js","sourceRoot":"","sources":["../../src/adapters/claudeCodeCLI.ts"],"names":[],"mappings":";AAAA;;GAEG;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAEH,iDAAsC;AACtC,uCAAyB;AACzB,2CAA6B;AAE7B,gDAA+E;AAC/E,4DAGiC;AAOjC,MAAM,aAAa,GAAG,QAAQ,CAAC;AAE/B,MAAa,oBAAoB;IAK/B,YAAY,OAAqC;QAJjC,SAAI,GAAG,aAAsB,CAAC;QAK5C,IAAI,CAAC,aAAa,GAAG,IAAA,qCAAoB,EAAC,OAAO,EAAE,aAAa,CAAC,CAAC;QAClE,IAAI,CAAC,KAAK,GAAG,OAAO,EAAE,KAAK,IAAI,aAAa,CAAC;IAC/C,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,iBAAiB;QACrB,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,EAAE;YAC7B,MAAM,IAAI,GAAG,IAAA,qBAAK,EAAC,OAAO,EAAE,CAAC,QAAQ,CAAC,EAAE;gBACtC,KAAK,EAAE,MAAM;aACd,CAAC,CAAC;YAEH,IAAI,CAAC,EAAE,CAAC,OAAO,EAAE,CAAC,IAAI,EAAE,EAAE;gBACxB,OAAO,CAAC,IAAI,KAAK,CAAC,CAAC,CAAC;YACtB,CAAC,CAAC,CAAC;YAEH,IAAI,CAAC,EAAE,CAAC,OAAO,EAAE,GAAG,EAAE;gBACpB,OAAO,CAAC,KAAK,CAAC,CAAC;YACjB,CAAC,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;IACL,CAAC;IAED;;OAEG;IACH,QAAQ;QACN,OAAO,IAAI,CAAC,KAAK,CAAC;IACpB,CAAC;IAED;;OAEG;IACK,WAAW,CACjB,MAAc,EACd,YAAgC;QAEhC,MAAM,KAAK,GAAa,EAAE,CAAC;QAE3B,IAAI,YAAY,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC5C,MAAM,QAAQ,GAAG,IAAA,iCAAgB,EAAC,IAAI,CAAC,aAAa,EAAE,YAAY,CAAC,CAAC;YACpE,IAAI,QAAQ,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACxB,KAAK,CAAC,IAAI,CAAC,qBAAqB,CAAC,CAAC;gBAClC,MAAM,cAAc,GAAG,QAAQ;qBAC5B,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE;oBACX,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,YAAY,CAAC;oBAC5D,OAAO,OAAO,GAAG,CAAC,IAAI,WAAW,GAAG,KAAK,GAAG,CAAC,OAAO,UAAU,CAAC;gBACjE,CAAC,CAAC;qBACD,IAAI,CAAC,MAAM,CAAC,CAAC;gBAChB,KAAK,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;gBAC3B,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;YACxB,CAAC;QACH,CAAC;QAED,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;QACvB,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;QACnB,KAAK,CAAC,IAAI,CACR,0GAA0G,CAC3G,CAAC;QAEF,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAC1B,CAAC;IAED;;;OAGG;IACH,KAAK,CAAC,QAAQ,CACZ,MAAc,EACd,YAAgC,EAChC,OAAuB;QAEvB,MAAM,WAAW,GAAG,MAAM,IAAI,CAAC,iBAAiB,EAAE,CAAC;QACnD,IAAI,CAAC,WAAW,EAAE,CAAC;YACjB,MAAM,IAAI,KAAK,CACb,6HA
A6H,CAC9H,CAAC;QACJ,CAAC;QAED,MAAM,UAAU,GAAG,IAAI,CAAC,WAAW,CAAC,MAAM,EAAE,YAAY,CAAC,CAAC;QAE1D,MAAM,YAAY,GAAG,IAAA,gCAAqB,EAAC,IAAI,CAAC,aAAa,CAAC,CAAC;QAE/D,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,MAAM,EAAE,EAAE;YACrC,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,aAAa,EAAE,yBAAyB,CAAC,CAAC;YAC1E,EAAE,CAAC,aAAa,CAAC,QAAQ,EAAE,UAAU,EAAE,MAAM,CAAC,CAAC;YAE/C,MAAM,OAAO,GAAG,GAAS,EAAE;gBACzB,IAAI,CAAC;oBACH,IAAI,EAAE,CAAC,UAAU,CAAC,QAAQ,CAAC,EAAE,CAAC;wBAC5B,EAAE,CAAC,UAAU,CAAC,QAAQ,CAAC,CAAC;oBAC1B,CAAC;gBACH,CAAC;gBAAC,MAAM,CAAC;oBACP,wBAAwB;gBAC1B,CAAC;YACH,CAAC,CAAC;YAEF,MAAM,aAAa,GAAG,GAAS,EAAE;gBAC/B,OAAO,EAAE,CAAC;YACZ,CAAC,CAAC;YACF,OAAO,CAAC,IAAI,CAAC,QAAQ,EAAE,aAAa,CAAC,CAAC;YACtC,OAAO,CAAC,IAAI,CAAC,SAAS,EAAE,aAAa,CAAC,CAAC;YAEvC,MAAM,OAAO,GAAG,QAAQ,QAAQ,sBAAsB,IAAI,CAAC,KAAK,yIAAyI,CAAC;YAC1M,MAAM,IAAI,GAAG,IAAA,qBAAK,EAAC,IAAI,EAAE,CAAC,IAAI,EAAE,OAAO,CAAC,EAAE;gBACxC,GAAG,EAAE,IAAI,CAAC,aAAa;gBACvB,KAAK,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,CAAC;aAChC,CAAC,CAAC;YAEH,6DAA6D;YAC7D,IAAI,MAAM,GAAG,EAAE,CAAC;YAChB,IAAI,MAAM,GAAG,EAAE,CAAC;YAEhB,IAAI,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,EAAE;gBAC/B,MAAM,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC;YAC5B,CAAC,CAAC,CAAC;YAEH,IAAI,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,EAAE;gBAC/B,MAAM,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC;YAC5B,CAAC,CAAC,CAAC;YAEH,IAAI,aAAa,GAA0B,IAAI,CAAC;YAChD,IAAI,OAAO,KAAK,IAAI,IAAI,OAAO,KAAK,SAAS,EAAE,CAAC;gBAC9C,aAAa,GAAG,UAAU,CAAC,GAAG,EAAE;oBAC9B,IAAI,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;oBACrB,OAAO,EAAE,CAAC;oBACV,OAAO,CAAC,cAAc,CAAC,QAAQ,EAAE,aAAa,CAAC,CAAC;oBAChD,OAAO,CAAC,cAAc,CAAC,SAAS,EAAE,aAAa,CAAC,CAAC;oBACjD,MAAM,CAAC,IAAI,KAAK,CAAC,mCAAmC,OAAO,IAAI,CAAC,CAAC,CAAC;gBACpE,CAAC,EAAE,OAAO,CAAC,CAAC;YACd,CAAC;YAED,IAAI,CAAC,EAAE,CAAC,OAAO,EAAE,CAAC,IAAI,EAAE,EAAE;gBACxB,IAAI,aAAa,EAAE,CAAC;oBAClB,YAAY,CAAC,aAAa,CAAC,CAAC;gBAC9B,CAAC;gBAED,OAAO,EAAE,CAAC;gBACV,OAAO,CAAC,cAAc,CAAC,QAAQ,EAAE,aAAa,CAAC,CAAC;gBAChD,OAAO,CAAC,cAAc,CAAC,SAAS,EAAE,aAAa,CAAC,CAAC;gBAEjD,IAAI,IAAI,KAAK,CAAC,EAAE,CAAC;oBACf,M
AAM,CACJ,IAAI,KAAK,CACP,oCAAoC,IAAI,aAAa,MAAM,EAAE,CAC9D,CACF,CAAC;oBACF,OAAO;gBACT,CAAC;gBAED,IAAI,CAAC;oBACH,MAAM,WAAW,GAAG,IAAA,gCAAqB,EAAC,IAAI,CAAC,aAAa,CAAC,CAAC;oBAC9D,MAAM,YAAY,GAAG,IAAA,8BAAmB,EACtC,YAAY,EACZ,WAAW,EACX,IAAI,CAAC,aAAa,CACnB,CAAC;oBACF,OAAO,CAAC,YAAY,CAAC,CAAC;gBACxB,CAAC;gBAAC,OAAO,KAAK,EAAE,CAAC;oBACf,MAAM,CAAC,IAAI,KAAK,CAAC,gCAAgC,KAAK,EAAE,CAAC,CAAC,CAAC;gBAC7D,CAAC;YACH,CAAC,CAAC,CAAC;YAEH,IAAI,CAAC,EAAE,CAAC,OAAO,EAAE,CAAC,KAAK,EAAE,EAAE;gBACzB,IAAI,aAAa,EAAE,CAAC;oBAClB,YAAY,CAAC,aAAa,CAAC,CAAC;gBAC9B,CAAC;gBACD,OAAO,EAAE,CAAC;gBACV,OAAO,CAAC,cAAc,CAAC,QAAQ,EAAE,aAAa,CAAC,CAAC;gBAChD,OAAO,CAAC,cAAc,CAAC,SAAS,EAAE,aAAa,CAAC,CAAC;gBACjD,MAAM,CAAC,IAAI,KAAK,CAAC,oCAAoC,KAAK,EAAE,CAAC,CAAC,CAAC;YACjE,CAAC,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;IACL,CAAC;CACF;AApLD,oDAoLC"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"copilotCLI.d.ts","sourceRoot":"","sources":["../../src/adapters/copilotCLI.ts"],"names":[],"mappings":"AAAA;;GAEG;AAKH,OAAO,EAAE,qBAAqB,EAAE,MAAM,UAAU,CAAC;AAOjD,MAAM,WAAW,wBAAwB;IACvC,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAID,qBAAa,iBAAkB,YAAW,qBAAqB;IAC7D,SAAgB,IAAI,EAAG,SAAS,CAAU;IAC1C,OAAO,CAAC,aAAa,CAAS;IAC9B,OAAO,CAAC,KAAK,CAAS;gBAEV,OAAO,CAAC,EAAE,wBAAwB;IAKxC,iBAAiB,IAAI,OAAO,CAAC,OAAO,CAAC;IAgB3C;;OAEG;IACH,QAAQ,IAAI,MAAM;IAIlB;;OAEG;IACH,OAAO,CAAC,WAAW;IA8BnB;;;OAGG;IACG,QAAQ,CACZ,MAAM,EAAE,MAAM,EACd,YAAY,CAAC,EAAE,SAAS,MAAM,EAAE,EAChC,OAAO,CAAC,EAAE,MAAM,GAAG,IAAI,GACtB,OAAO,CAAC,MAAM,EAAE,CAAC;
|
|
1
|
+
{"version":3,"file":"copilotCLI.d.ts","sourceRoot":"","sources":["../../src/adapters/copilotCLI.ts"],"names":[],"mappings":"AAAA;;GAEG;AAKH,OAAO,EAAE,qBAAqB,EAAE,MAAM,UAAU,CAAC;AAOjD,MAAM,WAAW,wBAAwB;IACvC,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAID,qBAAa,iBAAkB,YAAW,qBAAqB;IAC7D,SAAgB,IAAI,EAAG,SAAS,CAAU;IAC1C,OAAO,CAAC,aAAa,CAAS;IAC9B,OAAO,CAAC,KAAK,CAAS;gBAEV,OAAO,CAAC,EAAE,wBAAwB;IAKxC,iBAAiB,IAAI,OAAO,CAAC,OAAO,CAAC;IAgB3C;;OAEG;IACH,QAAQ,IAAI,MAAM;IAIlB;;OAEG;IACH,OAAO,CAAC,WAAW;IA8BnB;;;OAGG;IACG,QAAQ,CACZ,MAAM,EAAE,MAAM,EACd,YAAY,CAAC,EAAE,SAAS,MAAM,EAAE,EAChC,OAAO,CAAC,EAAE,MAAM,GAAG,IAAI,GACtB,OAAO,CAAC,MAAM,EAAE,CAAC;CAyGrB"}
|
|
@@ -102,13 +102,10 @@ class CopilotCLIAdapter {
|
|
|
102
102
|
throw new Error("GitHub Copilot CLI is not available. Please install it first: https://docs.github.com/en/copilot/how-tos/set-up/install-copilot-cli");
|
|
103
103
|
}
|
|
104
104
|
const fullPrompt = this.buildPrompt(prompt, contextFiles);
|
|
105
|
-
// Capture git status before generation
|
|
106
105
|
const statusBefore = (0, gitUtils_1.getGitStatusPorcelain)(this.workspaceRoot);
|
|
107
|
-
// Write prompt to temp file and pipe via stdin (matches @copilot-evals pattern)
|
|
108
106
|
return new Promise((resolve, reject) => {
|
|
109
107
|
const tempFile = path.join(this.workspaceRoot, ".copilot-eval-prompt.txt");
|
|
110
108
|
fs.writeFileSync(tempFile, fullPrompt, "utf8");
|
|
111
|
-
// Cleanup function
|
|
112
109
|
const cleanup = () => {
|
|
113
110
|
try {
|
|
114
111
|
if (fs.existsSync(tempFile)) {
|
|
@@ -119,7 +116,6 @@ class CopilotCLIAdapter {
|
|
|
119
116
|
// Ignore cleanup errors
|
|
120
117
|
}
|
|
121
118
|
};
|
|
122
|
-
// Register cleanup on process termination
|
|
123
119
|
const cleanupOnExit = () => {
|
|
124
120
|
cleanup();
|
|
125
121
|
};
|
|
@@ -138,7 +134,6 @@ class CopilotCLIAdapter {
|
|
|
138
134
|
proc.stderr?.on("data", (data) => {
|
|
139
135
|
stderr += data.toString();
|
|
140
136
|
});
|
|
141
|
-
// Set timeout only if specified (null/undefined = no timeout)
|
|
142
137
|
let timeoutHandle = null;
|
|
143
138
|
if (timeout !== null && timeout !== undefined) {
|
|
144
139
|
timeoutHandle = setTimeout(() => {
|
|
@@ -160,7 +155,6 @@ class CopilotCLIAdapter {
|
|
|
160
155
|
reject(new Error(`Copilot CLI exited with code ${code}\nStderr: ${stderr}`));
|
|
161
156
|
return;
|
|
162
157
|
}
|
|
163
|
-
// Get files changed during generation (diff before/after)
|
|
164
158
|
try {
|
|
165
159
|
const statusAfter = (0, gitUtils_1.getGitStatusPorcelain)(this.workspaceRoot);
|
|
166
160
|
const changedFiles = (0, gitUtils_1.getChangedFilesDiff)(statusBefore, statusAfter, this.workspaceRoot);
|