coding-agent-benchmarks 0.3.3 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. package/README.md +80 -257
  2. package/dist/adapters/claudeCodeCLI.d.ts.map +1 -1
  3. package/dist/adapters/claudeCodeCLI.js +0 -6
  4. package/dist/adapters/claudeCodeCLI.js.map +1 -1
  5. package/dist/adapters/copilotCLI.d.ts.map +1 -1
  6. package/dist/adapters/copilotCLI.js +0 -6
  7. package/dist/adapters/copilotCLI.js.map +1 -1
  8. package/dist/config/loader.d.ts.map +1 -1
  9. package/dist/config/loader.js +0 -7
  10. package/dist/config/loader.js.map +1 -1
  11. package/dist/evaluator.d.ts +0 -12
  12. package/dist/evaluator.d.ts.map +1 -1
  13. package/dist/evaluator.js +0 -36
  14. package/dist/evaluator.js.map +1 -1
  15. package/dist/index.d.ts +2 -0
  16. package/dist/index.d.ts.map +1 -1
  17. package/dist/index.js +4 -1
  18. package/dist/index.js.map +1 -1
  19. package/dist/runner.js +12 -14
  20. package/dist/runner.js.map +1 -1
  21. package/dist/types.d.ts +1 -1
  22. package/dist/utils/baselineManager.d.ts.map +1 -1
  23. package/dist/utils/baselineManager.js +0 -4
  24. package/dist/utils/baselineManager.js.map +1 -1
  25. package/dist/utils/gitUtils.d.ts.map +1 -1
  26. package/dist/utils/gitUtils.js +0 -6
  27. package/dist/utils/gitUtils.js.map +1 -1
  28. package/dist/utils/updateChecker.d.ts +9 -0
  29. package/dist/utils/updateChecker.d.ts.map +1 -0
  30. package/dist/utils/updateChecker.js +100 -0
  31. package/dist/utils/updateChecker.js.map +1 -0
  32. package/dist/utils/updateNotifier.d.ts +3 -0
  33. package/dist/utils/updateNotifier.d.ts.map +1 -0
  34. package/dist/utils/updateNotifier.js +39 -0
  35. package/dist/utils/updateNotifier.js.map +1 -0
  36. package/dist/utils/workspaceUtils.d.ts.map +1 -1
  37. package/dist/utils/workspaceUtils.js +0 -5
  38. package/dist/utils/workspaceUtils.js.map +1 -1
  39. package/dist/validators/llmJudge.d.ts.map +1 -1
  40. package/dist/validators/llmJudge.js +3 -3
  41. package/dist/validators/llmJudge.js.map +1 -1
  42. package/package.json +2 -1
package/README.md CHANGED
@@ -1,12 +1,18 @@
1
1
  # coding-agent-benchmarks
2
2
 
3
+ **[📖 Documentation](https://chiItepin.github.io/coding-agent-benchmarks)** | **[📦 npm](https://www.npmjs.com/package/coding-agent-benchmarks)** | **[⭐ GitHub](https://github.com/chiItepin/coding-agent-benchmarks)**
4
+
3
5
  Open-source framework for evaluating how well AI coding assistants (like GitHub Copilot CLI or Claude Code) follow your coding standards. Here's the workflow:
4
6
  1. You give it a prompt → 2. AI generates code → 3. Library validates → 4. You get a score
5
7
 
6
8
  ![WhatsApp Image 2026-01-23 at 9 04 49 AM](https://github.com/user-attachments/assets/3544d04f-37a5-47b0-a013-669c6015d26f)
7
9
 
10
+ *Figure 1: Evaluation workflow - prompt → generate → validate → score*
11
+
8
12
  ![WhatsApp Image 2026-01-24 at 1 58 31 PM](https://github.com/user-attachments/assets/f93ea3e0-74f8-4789-ab43-97245acc91b6)
9
13
 
14
+ *Figure 2: Example terminal output showing scenario evaluation results*
15
+
10
16
 
11
17
  ## Features
12
18
 
@@ -135,7 +141,7 @@ module.exports = {
135
141
  defaultAdapter: 'copilot',
136
142
 
137
143
  // Default LLM model for judge
138
- defaultModel: 'openai/gpt-4.1',
144
+ defaultModel: 'openai/gpt-5',
139
145
 
140
146
  // Default timeout for code generation (milliseconds)
141
147
  // Individual scenarios can override this
@@ -172,33 +178,15 @@ module.exports = {
172
178
  id: 'react-no-inline-styles',
173
179
  category: 'react',
174
180
  severity: 'major',
175
- tags: ['react', 'styling', 'best-practices'],
176
181
  description: 'Forbid inline style objects in React components',
177
- prompt: 'Create a React functional component called Button that accepts a "label" prop and renders a styled button. Use CSS classes instead of inline styles.',
178
182
  validationStrategy: {
179
183
  patterns: {
180
- forbiddenPatterns: [/style\s*=\s*\{\{/, /style\s*=\s*\{[^}]*\}/],
184
+ forbiddenPatterns: [/style\s*=\s*\{\{/],
181
185
  requiredPatterns: [/className/],
182
186
  },
183
187
  },
184
188
  },
185
- {
186
- id: 'async-error-handling',
187
- category: 'general',
188
- severity: 'critical',
189
- tags: ['async', 'error-handling', 'robustness'],
190
- description: 'Ensure async functions have proper error handling',
191
- prompt: 'Create an async function called fetchUserData that takes a userId parameter, makes an HTTP request to fetch user data, and returns the user object. Handle errors appropriately.',
192
- validationStrategy: {
193
- patterns: {
194
- requiredPatterns: [/async\s+function\s+fetchUserData|const\s+fetchUserData.*async/, /try|catch|\.catch\(/],
195
- },
196
- llmJudge: {
197
- enabled: true,
198
- judgmentPrompt: 'Evaluate the error handling in this async function. Does it use try/catch or .catch()? Are errors logged or re-thrown appropriately?',
199
- },
200
- },
201
- },
189
+ // Add more scenarios...
202
190
  ],
203
191
  };
204
192
  ```
@@ -211,29 +199,6 @@ You can configure timeouts at three levels (in order of precedence):
211
199
  2. **Global default**: Set `defaultTimeout` in your config file
212
200
  3. **Built-in default**: 120000ms (2 minutes) if nothing else is specified
213
201
 
214
- ```javascript
215
- // In benchmarks.config.js
216
- module.exports = {
217
- // Global default applies to all scenarios
218
- defaultTimeout: 180000, // 3 minutes
219
-
220
- scenarios: [
221
- {
222
- id: 'quick-check',
223
- prompt: '...',
224
- timeout: 60000, // Override: 1 minute for this scenario
225
- // ...
226
- },
227
- {
228
- id: 'complex-task',
229
- prompt: '...',
230
- // Will use defaultTimeout (3 minutes)
231
- // ...
232
- },
233
- ],
234
- };
235
- ```
236
-
237
202
  **Why configure timeouts?**
238
203
  - Complex code generation tasks may need more time
239
204
  - Simple checks can complete faster with shorter timeouts
@@ -271,133 +236,73 @@ validationStrategy: {
271
236
 
272
237
  ### LLM-as-Judge
273
238
 
274
- Semantic evaluation using AI:
239
+ Semantic evaluation using AI (requires `GITHUB_TOKEN`):
275
240
 
276
241
  ```javascript
277
242
  validationStrategy: {
278
243
  llmJudge: {
279
244
  enabled: true,
280
- model: 'openai/gpt-4.1', // or 'gpt-4o'
245
+ model: 'openai/gpt-5',
281
246
  judgmentPrompt: `Evaluate if the code follows best practices...`,
282
247
  },
283
248
  }
284
249
  ```
285
250
 
286
- The LLM judge requires a `GITHUB_TOKEN` environment variable with access to GitHub Models API.
287
-
288
251
  ### ESLint Integration
289
252
 
290
253
  ![WhatsApp Image 2026-01-24 at 2 09 11 PM](https://github.com/user-attachments/assets/12af93e8-ed7c-4153-a183-20601a925965)
291
254
 
255
+ *Figure 3: ESLint validator detecting code quality issues in generated code*
292
256
 
293
257
  Run ESLint on generated code:
294
258
 
295
259
  ```javascript
296
260
  validationStrategy: {
297
- eslint: {
298
- enabled: true,
299
- configPath: '.eslintrc.js', // optional
300
- },
261
+ eslint: { enabled: true, configPath: '.eslintrc.js' },
301
262
  }
302
263
  ```
303
264
 
304
265
  ## Scoring System
305
266
 
306
- The scoring system operates at three levels: per-validator scoring, per-scenario scoring, and summary scoring.
307
-
308
267
  ### Per-Validator Scoring
309
268
 
310
- Each validator (Pattern, LLM Judge, ESLint) independently evaluates the generated code and produces a score from 0.0 to 1.0:
311
-
312
- #### Pattern Validator
313
-
314
- Uses exponential decay based on weighted violations:
315
-
316
- ```
317
- score = e^(-totalWeight)
318
- ```
319
-
320
- Where `totalWeight` is the sum of violation weights:
321
- - **Critical violations**: 1.0 weight each
322
- - **Major violations**: 0.7 weight each
323
- - **Minor violations**: 0.3 weight each
324
-
325
- **Examples**:
326
- - 0 violations → score = 1.0 (perfect)
327
- - 1 critical violation → score ≈ 0.37
328
- - 1 major violation → score ≈ 0.50
329
- - 2 minor violations → score ≈ 0.55
330
-
331
- #### LLM Judge Validator
332
-
333
- The LLM (GPT-4 or other model) evaluates the code semantically and returns:
334
- - An `overallScore` from 0.0 to 1.0
335
- - A list of violations with explanations
336
- - Passed if: score ≥ 0.7 AND no violations
337
-
338
- The LLM judge provides semantic understanding beyond pattern matching, evaluating whether the code actually solves the problem correctly and follows best practices.
339
-
340
- #### ESLint Validator
341
-
342
- This validator runs ESLint on the generated code and scores based on the number and severity of linting violations. Note that ESLint must be installed and configured in your project for this validator to work. If you don't have ESLint set up globally, disable this validator or provide a custom validator.
343
-
344
- Uses exponential decay with a dampening factor:
345
-
346
- ```
347
- score = e^(-totalWeight / 2)
348
- ```
269
+ Each validator independently evaluates generated code and produces a score from 0.0 to 1.0:
349
270
 
350
- ESLint violations are mapped to severity:
351
- - ESLint error (severity 2) → **Major** violation (0.7 weight)
352
- ESLint warning (severity 1) → **Minor** violation (0.3 weight)
353
-
354
- The `/2` dampening factor makes ESLint less punitive since projects often have many minor linting issues.
271
+ | Validator | Scoring Method | Notes |
272
+ |-----------|----------------|-------|
273
+ | **Pattern** | Uses exponential decay based on weighted violations | Critical: 1.0 weight, Major: 0.7 weight, Minor: 0.3 weight |
274
+ | **LLM Judge** | AI evaluates semantically, returns 0.0-1.0 score | Passes if score ≥ 0.7 AND no violations |
275
+ | **ESLint** | Exponential decay with dampening factor (÷2) | ESLint errors → Major (0.7), warnings → Minor (0.3) |
355
276
 
356
277
  ### Per-Scenario Scoring
357
278
 
358
- Each scenario receives an **overall score** calculated as:
359
-
360
- ```
361
- overallScore = average of all active validator scores
362
- ```
279
+ Each scenario receives an **overall score** = **average of all active validator scores**
363
280
 
364
- **Active validators** are those that:
365
- - Are configured in the scenario's `validationStrategy`
366
- - Successfully ran (did not return score = -1)
281
+ **Active validators** are those configured in `validationStrategy` that successfully ran (score ≠ -1).
367
282
 
368
283
  **Pass/Fail Criteria**:
369
284
  - ✅ **PASS**: `overallScore ≥ 0.8` AND `violations.length === 0`
370
285
  - ❌ **FAIL**: `overallScore < 0.8` OR `violations.length > 0`
371
286
  - ⚠️ **SKIP**: An error occurred during evaluation (timeout, adapter failure, etc.)
372
287
 
373
- **Example**: If Pattern validator returns 0.9, LLM Judge returns 0.8, and ESLint is skipped:
374
- ```
375
- overallScore = (0.9 + 0.8) / 2 = 0.85
376
- ```
288
+ **Example**: Pattern (0.9) + LLM Judge (0.8) + ESLint (skipped) → `overallScore = (0.9 + 0.8) / 2 = 0.85`
377
289
 
378
290
  ### Summary Scoring
379
291
 
380
- After evaluating all scenarios, the framework calculates summary statistics:
292
+ After evaluating all scenarios, the framework calculates:
381
293
 
382
294
  ```javascript
383
295
  {
384
- total: 10, // Total number of scenarios
385
- passed: 7, // Scenarios with overallScore ≥ 0.8 and no violations
386
- failed: 2, // Scenarios evaluated but didn't pass
387
- skipped: 1, // Scenarios that encountered errors
388
- averageScore: 0.78, // Average of all scenario overallScores
296
+ total: 10, // Total scenarios
297
+ passed: 7, // overallScore ≥ 0.8 and no violations
298
+ failed: 2, // Evaluated but didn't pass
299
+ skipped: 1, // Encountered errors
300
+ averageScore: 0.78, // Average of all scenario scores
389
301
  totalViolations: 8 // Sum of violations across all scenarios
390
302
  }
391
303
  ```
392
304
 
393
- **Average Score Calculation**:
394
- ```
395
- averageScore = (sum of all scenario scores) / total scenarios
396
- ```
397
-
398
- This includes scores from failed scenarios, providing an overall quality metric across your entire test suite.
399
-
400
- **Transparency**: When baselines are saved, the per-validator breakdown is included in the baseline file, allowing you to trace exactly which validator contributed what score. See [Baseline File Format](#baseline-file-format) for details.
305
+ **Transparency**: Baselines include per-validator breakdowns. See [Baseline File Format](#baseline-file-format) for details.
401
306
 
402
307
  ### Score Interpretation
403
308
 
@@ -417,11 +322,6 @@ When baseline tracking is enabled, you'll see delta metrics:
417
322
  ↑ +18.5% improvement from baseline
418
323
  ```
419
324
 
420
- The percentage is calculated as:
421
- ```
422
- percentage = (currentScore - baselineScore) / baselineScore * 100
423
- ```
424
-
425
325
  ## Baseline Tracking
426
326
 
427
327
  Track evaluation results over time by enabling baseline management in your config file:
@@ -449,42 +349,21 @@ When `compareBaseline` is enabled, the report will show score deltas and whether
449
349
 
450
350
  ### Baseline File Format
451
351
 
452
- Each baseline file contains complete transparency into how the score was calculated:
352
+ Path: `.benchmarks/baselines/{adapter}/{model}/{scenario-id}.json`
353
+
354
+ Each baseline file provides complete score traceability:
453
355
 
454
356
  ```json
455
357
  {
456
358
  "scenarioId": "typescript-no-any",
457
359
  "score": 0.85,
458
360
  "violations": [
459
- {
460
- "type": "pattern",
461
- "message": "Forbidden pattern found: :\\s*any\\b",
462
- "file": "src/types.ts",
463
- "line": 12,
464
- "severity": "critical",
465
- "details": "Matched: \"metadata: any\""
466
- }
361
+ { "type": "pattern", "message": "Forbidden pattern found: :\\s*any\\b", "file": "src/types.ts", ... }
467
362
  ],
468
363
  "validationResults": [
469
- {
470
- "passed": false,
471
- "score": 0.37,
472
- "violations": [...],
473
- "validatorType": "pattern"
474
- },
475
- {
476
- "passed": true,
477
- "score": 1.0,
478
- "violations": [],
479
- "validatorType": "llm-judge"
480
- },
481
- {
482
- "passed": true,
483
- "score": -1,
484
- "violations": [],
485
- "validatorType": "eslint",
486
- "error": "ESLint not found"
487
- }
364
+ { "passed": false, "score": 0.37, "validatorType": "pattern", "violations": [...] },
365
+ { "passed": true, "score": 1.0, "validatorType": "llm-judge", "violations": [] },
366
+ { "passed": true, "score": -1, "validatorType": "eslint", "error": "ESLint not found" }
488
367
  ],
489
368
  "timestamp": "2026-01-23T22:28:32.216Z",
490
369
  "adapter": "copilot",
@@ -493,15 +372,11 @@ Each baseline file contains complete transparency into how the score was calcula
493
372
  ```
494
373
 
495
374
  **Key fields**:
496
- - `score`: Overall scenario score (average of active validators)
497
- - `violations`: All violations from all validators combined
498
- - `validationResults`: Per-validator breakdown showing:
499
- - Individual validator score
500
- - Whether that validator passed
501
- - Violations specific to that validator
502
- - Any errors that occurred (`score: -1` means skipped)
375
+ - `score` - Overall scenario score (average of active validators)
376
+ - `violations` - All violations from all validators combined
377
+ - `validationResults` - Per-validator breakdown (score, passed, violations, errors)
503
378
 
504
- **Score Traceability**: With this format, you can always trace the overall score back to individual validator scores. For example, if you see `score: 0.067`, you can look at `validationResults` to see which validators contributed what scores (e.g., Pattern: 0.135, LLM Judge: 0.00).
379
+ **Traceability**: You can always trace the overall score back to individual validator scores (e.g., for `score: 0.067`, check `validationResults` to see Pattern: 0.135, LLM Judge: 0.00).
505
380
 
506
381
  ## CLI Commands
507
382
 
@@ -509,24 +384,26 @@ Each baseline file contains complete transparency into how the score was calcula
509
384
 
510
385
  Run benchmark evaluations.
511
386
 
512
- **Options:**
513
- - `--scenario <pattern>`: Filter by scenario ID (supports wildcards like `typescript-*`)
514
- - `--category <categories>`: Filter by category (comma-separated)
515
- - `--tag <tags>`: Filter by tags (comma-separated)
516
- - `--adapter <type>`: Adapter to use (`copilot` or `claude-code`)
517
- - `--model <model>`: LLM model for judge (default: `openai/gpt-4.1`)
518
- - `--threshold <number>`: Minimum passing score (default: `0.8`)
519
- - `--verbose`: Show detailed output
520
- - `--output <file>`: Export JSON report
521
- - `--workspace-root <path>`: Workspace root directory
387
+ | Option | Description | Default/Example |
388
+ |--------|-------------|-----------------|
389
+ | `--scenario <pattern>` | Filter by scenario ID (supports wildcards) | `typescript-*` |
390
+ | `--category <categories>` | Filter by category (comma-separated) | `typescript,react` |
391
+ | `--tag <tags>` | Filter by tags (comma-separated) | `safety,types` |
392
+ | `--adapter <type>` | Adapter to use | `copilot` or `claude-code` |
393
+ | `--model <model>` | LLM model for judge | `openai/gpt-5` |
394
+ | `--threshold <number>` | Minimum passing score | `0.8` |
395
+ | `--verbose` | Show detailed output | - |
396
+ | `--output <file>` | Export JSON report | `report.json` |
397
+ | `--workspace-root <path>` | Workspace root directory | Current directory |
522
398
 
523
399
  ### `list`
524
400
 
525
401
  List available test scenarios.
526
402
 
527
- **Options:**
528
- - `--category <categories>`: Filter by category
529
- - `--tag <tags>`: Filter by tags
403
+ | Option | Description |
404
+ |--------|-------------|
405
+ | `--category <categories>` | Filter by category (comma-separated) |
406
+ | `--tag <tags>` | Filter by tags (comma-separated) |
530
407
 
531
408
  ### `check`
532
409
 
@@ -536,8 +413,9 @@ Check if coding agent CLIs are available.
536
413
 
537
414
  Test LLM judge with a custom prompt (for debugging).
538
415
 
539
- **Options:**
540
- - `--model <model>`: LLM model to use
416
+ | Option | Description |
417
+ |--------|-------------|
418
+ | `--model <model>` | LLM model to use |
541
419
 
542
420
  ## Understanding Output
543
421
 
@@ -586,26 +464,15 @@ import { Evaluator, loadConfig } from 'coding-agent-benchmarks';
586
464
  async function runEvaluation() {
587
465
  const { config, scenarios } = await loadConfig();
588
466
 
589
- // Create evaluator
590
467
  const evaluator = new Evaluator({
591
468
  adapter: 'copilot',
592
- model: 'openai/gpt-4.1',
469
+ model: 'openai/gpt-5',
593
470
  verbose: true,
594
471
  saveBaseline: config.saveBaseline,
595
472
  compareBaseline: config.compareBaseline,
596
473
  });
597
474
 
598
- // Check adapter availability
599
- const available = await evaluator.checkAdapterAvailability();
600
- if (!available) {
601
- throw new Error('Adapter not available');
602
- }
603
-
604
- // Run evaluation
605
475
  const report = await evaluator.evaluate(scenarios);
606
-
607
- console.log(`Passed: ${report.summary.passed}/${report.summary.total}`);
608
- console.log(`Average score: ${report.summary.averageScore.toFixed(2)}`);
609
476
  }
610
477
 
611
478
  runEvaluation();
@@ -616,106 +483,62 @@ runEvaluation();
616
483
  Implement the `CodeValidator` interface:
617
484
 
618
485
  ```typescript
619
- import { CodeValidator, ValidationResult, TestScenario } from 'coding-agent-benchmarks';
486
+ import { CodeValidator, ValidationResult } from 'coding-agent-benchmarks';
620
487
 
621
488
  export class CustomValidator implements CodeValidator {
622
489
  public readonly type = 'custom';
623
490
 
624
- async validate(
625
- files: readonly string[],
626
- scenario: TestScenario
627
- ): Promise<ValidationResult> {
491
+ async validate(files: readonly string[], scenario: TestScenario): Promise<ValidationResult> {
628
492
  // Your validation logic here
629
- return {
630
- passed: true,
631
- score: 1.0,
632
- violations: [],
633
- validatorType: 'custom',
634
- };
493
+ return { passed: true, score: 1.0, violations: [], validatorType: 'custom' };
635
494
  }
636
495
  }
637
496
  ```
638
497
 
498
+ See CONTRIBUTING.md for complete examples.
499
+
639
500
  ## Creating Custom Adapters
640
501
 
641
502
  Implement the `CodeGenerationAdapter` interface:
642
503
 
643
504
  ```typescript
644
- import { CodeGenerationAdapter, AdapterType } from 'coding-agent-benchmarks';
505
+ import { CodeGenerationAdapter } from 'coding-agent-benchmarks';
645
506
 
646
507
  export class CustomAdapter implements CodeGenerationAdapter {
647
- public readonly type: AdapterType = 'copilot'; // or extend the type
648
-
649
- async checkAvailability(): Promise<boolean> {
650
- // Check if CLI is available
651
- return true;
652
- }
508
+ public readonly type = 'custom-adapter';
653
509
 
654
- async generate(
655
- prompt: string,
656
- contextFiles?: readonly string[],
657
- timeout?: number
658
- ): Promise<string[]> {
659
- // Generate code and return changed files
660
- return ['path/to/generated/file.ts'];
661
- }
510
+ async checkAvailability(): Promise<boolean> { /* ... */ }
511
+ async generate(prompt: string, contextFiles?: readonly string[], timeout?: number): Promise<string[]> { /* ... */ }
662
512
  }
663
513
  ```
664
514
 
515
+ See CONTRIBUTING.md for complete examples.
516
+
665
517
  ## GitHub Authentication (for LLM Judge)
666
518
 
667
- LLM-as-judge validation requires GitHub authentication to access GitHub Models API. There are **two easy options** - no OAuth registration needed!
519
+ LLM-as-judge validation requires GitHub authentication to access GitHub Models API. Choose one option:
668
520
 
669
521
  ### Option 1: Personal Access Token (Recommended)
670
522
 
671
- 1. Create token at https://github.com/settings/tokens
672
- 2. Click "Generate new token (classic)"
673
- 3. Give it a name (e.g., "coding-agent-benchmarks")
674
- 4. Select scope: **`models:read`**
675
- 5. Generate and copy the token
676
- 6. Set environment variable:
677
- ```bash
678
- export GITHUB_TOKEN=ghp_xxxxxxxxxxxxxxxxxxxx
679
- ```
523
+ Create a token at https://github.com/settings/tokens with the **`models:read`** scope, then set it as an environment variable:
524
+
525
+ ```bash
526
+ export GITHUB_TOKEN=ghp_xxxxxxxxxxxxxxxxxxxx
527
+ ```
680
528
 
681
529
  ### Option 2: GitHub CLI (Automatic)
682
530
 
683
- If you have GitHub CLI installed, tokens are auto-detected:
531
+ If GitHub CLI is installed, tokens are auto-detected:
684
532
 
685
533
  ```bash
686
- # Install GitHub CLI
687
- brew install gh # macOS
688
- # or download from https://cli.github.com
689
-
690
- # Authenticate (one time)
534
+ brew install gh # or download from https://cli.github.com
691
535
  gh auth login
692
-
693
536
  # Token will be used automatically - no GITHUB_TOKEN needed!
694
537
  ```
695
538
 
696
- ### Check Authentication Status
697
-
698
- ```bash
699
- npx coding-agent-benchmarks check
700
- ```
701
-
702
- Output:
703
- ```
704
- Checking adapter availability...
705
- GitHub Copilot CLI: ✓ Available
706
- Claude Code CLI: ✗ Not found
707
-
708
- Checking GitHub authentication...
709
- ✓ Using token from GitHub CLI (gh auth token)
710
- ```
711
-
712
- ## How It Works
539
+ ### Check Authentication
713
540
 
714
- 1. **Code Generation**: The adapter spawns a coding agent CLI with a prompt
715
- 2. **File Tracking**: Git is used to detect which files were created/modified
716
- 3. **Validation**: Multiple validators check the generated code
717
- 4. **Scoring**: Results are aggregated and compared against thresholds
718
- 5. **Reporting**: Results are displayed in terminal and optionally exported as JSON
541
+ Run `npx coding-agent-benchmarks check` to verify authentication status.
719
542
 
720
543
  ## Requirements
721
544
 
@@ -1 +1 @@
1
- {"version":3,"file":"claudeCodeCLI.d.ts","sourceRoot":"","sources":["../../src/adapters/claudeCodeCLI.ts"],"names":[],"mappings":"AAAA;;GAEG;AAKH,OAAO,EAAE,qBAAqB,EAAE,MAAM,UAAU,CAAC;AAOjD,MAAM,WAAW,2BAA2B;IAC1C,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAID,qBAAa,oBAAqB,YAAW,qBAAqB;IAChE,SAAgB,IAAI,EAAG,aAAa,CAAU;IAC9C,OAAO,CAAC,aAAa,CAAS;IAC9B,OAAO,CAAC,KAAK,CAAS;gBAEV,OAAO,CAAC,EAAE,2BAA2B;IAKjD;;OAEG;IACG,iBAAiB,IAAI,OAAO,CAAC,OAAO,CAAC;IAgB3C;;OAEG;IACH,QAAQ,IAAI,MAAM;IAIlB;;OAEG;IACH,OAAO,CAAC,WAAW;IA8BnB;;;OAGG;IACG,QAAQ,CACZ,MAAM,EAAE,MAAM,EACd,YAAY,CAAC,EAAE,SAAS,MAAM,EAAE,EAChC,OAAO,CAAC,EAAE,MAAM,GAAG,IAAI,GACtB,OAAO,CAAC,MAAM,EAAE,CAAC;CA6GrB"}
1
+ {"version":3,"file":"claudeCodeCLI.d.ts","sourceRoot":"","sources":["../../src/adapters/claudeCodeCLI.ts"],"names":[],"mappings":"AAAA;;GAEG;AAKH,OAAO,EAAE,qBAAqB,EAAE,MAAM,UAAU,CAAC;AAOjD,MAAM,WAAW,2BAA2B;IAC1C,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAID,qBAAa,oBAAqB,YAAW,qBAAqB;IAChE,SAAgB,IAAI,EAAG,aAAa,CAAU;IAC9C,OAAO,CAAC,aAAa,CAAS;IAC9B,OAAO,CAAC,KAAK,CAAS;gBAEV,OAAO,CAAC,EAAE,2BAA2B;IAKjD;;OAEG;IACG,iBAAiB,IAAI,OAAO,CAAC,OAAO,CAAC;IAgB3C;;OAEG;IACH,QAAQ,IAAI,MAAM;IAIlB;;OAEG;IACH,OAAO,CAAC,WAAW;IA8BnB;;;OAGG;IACG,QAAQ,CACZ,MAAM,EAAE,MAAM,EACd,YAAY,CAAC,EAAE,SAAS,MAAM,EAAE,EAChC,OAAO,CAAC,EAAE,MAAM,GAAG,IAAI,GACtB,OAAO,CAAC,MAAM,EAAE,CAAC;CAuGrB"}
@@ -105,13 +105,10 @@ class ClaudeCodeCLIAdapter {
105
105
  throw new Error("Claude Code CLI is not available. Please install it first: https://docs.anthropic.com/en/docs/build-with-claude/claude-code");
106
106
  }
107
107
  const fullPrompt = this.buildPrompt(prompt, contextFiles);
108
- // Capture git status before generation
109
108
  const statusBefore = (0, gitUtils_1.getGitStatusPorcelain)(this.workspaceRoot);
110
- // Write prompt to temp file and pipe via stdin (matches @copilot-evals pattern)
111
109
  return new Promise((resolve, reject) => {
112
110
  const tempFile = path.join(this.workspaceRoot, ".claude-eval-prompt.txt");
113
111
  fs.writeFileSync(tempFile, fullPrompt, "utf8");
114
- // Cleanup function
115
112
  const cleanup = () => {
116
113
  try {
117
114
  if (fs.existsSync(tempFile)) {
@@ -122,7 +119,6 @@ class ClaudeCodeCLIAdapter {
122
119
  // Ignore cleanup errors
123
120
  }
124
121
  };
125
- // Register cleanup on process termination
126
122
  const cleanupOnExit = () => {
127
123
  cleanup();
128
124
  };
@@ -142,7 +138,6 @@ class ClaudeCodeCLIAdapter {
142
138
  proc.stderr?.on("data", (data) => {
143
139
  stderr += data.toString();
144
140
  });
145
- // Set timeout only if specified (null/undefined = no timeout)
146
141
  let timeoutHandle = null;
147
142
  if (timeout !== null && timeout !== undefined) {
148
143
  timeoutHandle = setTimeout(() => {
@@ -164,7 +159,6 @@ class ClaudeCodeCLIAdapter {
164
159
  reject(new Error(`Claude Code CLI exited with code ${code}\nStderr: ${stderr}`));
165
160
  return;
166
161
  }
167
- // Get files changed during generation (diff before/after)
168
162
  try {
169
163
  const statusAfter = (0, gitUtils_1.getGitStatusPorcelain)(this.workspaceRoot);
170
164
  const changedFiles = (0, gitUtils_1.getChangedFilesDiff)(statusBefore, statusAfter, this.workspaceRoot);
@@ -1 +1 @@
1
- {"version":3,"file":"claudeCodeCLI.js","sourceRoot":"","sources":["../../src/adapters/claudeCodeCLI.ts"],"names":[],"mappings":";AAAA;;GAEG;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAEH,iDAAsC;AACtC,uCAAyB;AACzB,2CAA6B;AAE7B,gDAA+E;AAC/E,4DAGiC;AAOjC,MAAM,aAAa,GAAG,QAAQ,CAAC;AAE/B,MAAa,oBAAoB;IAK/B,YAAY,OAAqC;QAJjC,SAAI,GAAG,aAAsB,CAAC;QAK5C,IAAI,CAAC,aAAa,GAAG,IAAA,qCAAoB,EAAC,OAAO,EAAE,aAAa,CAAC,CAAC;QAClE,IAAI,CAAC,KAAK,GAAG,OAAO,EAAE,KAAK,IAAI,aAAa,CAAC;IAC/C,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,iBAAiB;QACrB,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,EAAE;YAC7B,MAAM,IAAI,GAAG,IAAA,qBAAK,EAAC,OAAO,EAAE,CAAC,QAAQ,CAAC,EAAE;gBACtC,KAAK,EAAE,MAAM;aACd,CAAC,CAAC;YAEH,IAAI,CAAC,EAAE,CAAC,OAAO,EAAE,CAAC,IAAI,EAAE,EAAE;gBACxB,OAAO,CAAC,IAAI,KAAK,CAAC,CAAC,CAAC;YACtB,CAAC,CAAC,CAAC;YAEH,IAAI,CAAC,EAAE,CAAC,OAAO,EAAE,GAAG,EAAE;gBACpB,OAAO,CAAC,KAAK,CAAC,CAAC;YACjB,CAAC,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;IACL,CAAC;IAED;;OAEG;IACH,QAAQ;QACN,OAAO,IAAI,CAAC,KAAK,CAAC;IACpB,CAAC;IAED;;OAEG;IACK,WAAW,CACjB,MAAc,EACd,YAAgC;QAEhC,MAAM,KAAK,GAAa,EAAE,CAAC;QAE3B,IAAI,YAAY,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC5C,MAAM,QAAQ,GAAG,IAAA,iCAAgB,EAAC,IAAI,CAAC,aAAa,EAAE,YAAY,CAAC,CAAC;YACpE,IAAI,QAAQ,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACxB,KAAK,CAAC,IAAI,CAAC,qBAAqB,CAAC,CAAC;gBAClC,MAAM,cAAc,GAAG,QAAQ;qBAC5B,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE;oBACX,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,YAAY,CAAC;oBAC5D,OAAO,OAAO,GAAG,CAAC,IAAI,WAAW,GAAG,KAAK,GAAG,CAAC,OAAO,UAAU,CAAC;gBACjE,CAAC,CAAC;qBACD,IAAI,CAAC,MAAM,CAAC,CAAC;gBAChB,KAAK,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;gBAC3B,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;YACxB,CAAC;QACH,CAAC;QAED,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;QACvB,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;QACnB,KAAK,CAAC,IAAI,CACR,0GAA0G,CAC3G,CAAC;QAEF,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAC1B,CAAC;IAED;;;OAGG;IACH,KAAK,CAAC,QAAQ,CACZ,MAAc,EACd,YAAgC,EAChC,OAAuB;QAEvB,MAAM,WAAW,GAAG,MAAM,IAAI,CAAC,iBAAiB,EAAE,CAAC;QACnD,IAAI,CAAC,WAAW,EAAE,CAAC;YACjB,MAAM,IAAI,KAAK,CACb,6
HAA6H,CAC9H,CAAC;QACJ,CAAC;QAED,MAAM,UAAU,GAAG,IAAI,CAAC,WAAW,CAAC,MAAM,EAAE,YAAY,CAAC,CAAC;QAE1D,uCAAuC;QACvC,MAAM,YAAY,GAAG,IAAA,gCAAqB,EAAC,IAAI,CAAC,aAAa,CAAC,CAAC;QAE/D,gFAAgF;QAChF,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,MAAM,EAAE,EAAE;YACrC,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,aAAa,EAAE,yBAAyB,CAAC,CAAC;YAC1E,EAAE,CAAC,aAAa,CAAC,QAAQ,EAAE,UAAU,EAAE,MAAM,CAAC,CAAC;YAE/C,mBAAmB;YACnB,MAAM,OAAO,GAAG,GAAS,EAAE;gBACzB,IAAI,CAAC;oBACH,IAAI,EAAE,CAAC,UAAU,CAAC,QAAQ,CAAC,EAAE,CAAC;wBAC5B,EAAE,CAAC,UAAU,CAAC,QAAQ,CAAC,CAAC;oBAC1B,CAAC;gBACH,CAAC;gBAAC,MAAM,CAAC;oBACP,wBAAwB;gBAC1B,CAAC;YACH,CAAC,CAAC;YAEF,0CAA0C;YAC1C,MAAM,aAAa,GAAG,GAAS,EAAE;gBAC/B,OAAO,EAAE,CAAC;YACZ,CAAC,CAAC;YACF,OAAO,CAAC,IAAI,CAAC,QAAQ,EAAE,aAAa,CAAC,CAAC;YACtC,OAAO,CAAC,IAAI,CAAC,SAAS,EAAE,aAAa,CAAC,CAAC;YAEvC,MAAM,OAAO,GAAG,QAAQ,QAAQ,sBAAsB,IAAI,CAAC,KAAK,yIAAyI,CAAC;YAC1M,MAAM,IAAI,GAAG,IAAA,qBAAK,EAAC,IAAI,EAAE,CAAC,IAAI,EAAE,OAAO,CAAC,EAAE;gBACxC,GAAG,EAAE,IAAI,CAAC,aAAa;gBACvB,KAAK,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,CAAC;aAChC,CAAC,CAAC;YAEH,6DAA6D;YAC7D,IAAI,MAAM,GAAG,EAAE,CAAC;YAChB,IAAI,MAAM,GAAG,EAAE,CAAC;YAEhB,IAAI,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,EAAE;gBAC/B,MAAM,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC;YAC5B,CAAC,CAAC,CAAC;YAEH,IAAI,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,EAAE;gBAC/B,MAAM,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC;YAC5B,CAAC,CAAC,CAAC;YAEH,8DAA8D;YAC9D,IAAI,aAAa,GAA0B,IAAI,CAAC;YAChD,IAAI,OAAO,KAAK,IAAI,IAAI,OAAO,KAAK,SAAS,EAAE,CAAC;gBAC9C,aAAa,GAAG,UAAU,CAAC,GAAG,EAAE;oBAC9B,IAAI,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;oBACrB,OAAO,EAAE,CAAC;oBACV,OAAO,CAAC,cAAc,CAAC,QAAQ,EAAE,aAAa,CAAC,CAAC;oBAChD,OAAO,CAAC,cAAc,CAAC,SAAS,EAAE,aAAa,CAAC,CAAC;oBACjD,MAAM,CAAC,IAAI,KAAK,CAAC,mCAAmC,OAAO,IAAI,CAAC,CAAC,CAAC;gBACpE,CAAC,EAAE,OAAO,CAAC,CAAC;YACd,CAAC;YAED,IAAI,CAAC,EAAE,CAAC,OAAO,EAAE,CAAC,IAAI,EAAE,EAAE;gBACxB,IAAI,aAAa,EAAE,CAAC;oBAClB,YAAY,CAAC,aAAa,CAAC,CAAC;gBAC9B,CAAC;gBAED,OAAO,EAAE,CAAC;gBACV,OAAO,CAAC,cAAc,CAAC,QAAQ,EAAE,aAAa,CAAC,CAAC;gBAChD,OAAO,CAAC,cAAc,CAAC,SA
AS,EAAE,aAAa,CAAC,CAAC;gBAEjD,IAAI,IAAI,KAAK,CAAC,EAAE,CAAC;oBACf,MAAM,CACJ,IAAI,KAAK,CACP,oCAAoC,IAAI,aAAa,MAAM,EAAE,CAC9D,CACF,CAAC;oBACF,OAAO;gBACT,CAAC;gBAED,0DAA0D;gBAC1D,IAAI,CAAC;oBACH,MAAM,WAAW,GAAG,IAAA,gCAAqB,EAAC,IAAI,CAAC,aAAa,CAAC,CAAC;oBAC9D,MAAM,YAAY,GAAG,IAAA,8BAAmB,EACtC,YAAY,EACZ,WAAW,EACX,IAAI,CAAC,aAAa,CACnB,CAAC;oBACF,OAAO,CAAC,YAAY,CAAC,CAAC;gBACxB,CAAC;gBAAC,OAAO,KAAK,EAAE,CAAC;oBACf,MAAM,CAAC,IAAI,KAAK,CAAC,gCAAgC,KAAK,EAAE,CAAC,CAAC,CAAC;gBAC7D,CAAC;YACH,CAAC,CAAC,CAAC;YAEH,IAAI,CAAC,EAAE,CAAC,OAAO,EAAE,CAAC,KAAK,EAAE,EAAE;gBACzB,IAAI,aAAa,EAAE,CAAC;oBAClB,YAAY,CAAC,aAAa,CAAC,CAAC;gBAC9B,CAAC;gBACD,OAAO,EAAE,CAAC;gBACV,OAAO,CAAC,cAAc,CAAC,QAAQ,EAAE,aAAa,CAAC,CAAC;gBAChD,OAAO,CAAC,cAAc,CAAC,SAAS,EAAE,aAAa,CAAC,CAAC;gBACjD,MAAM,CAAC,IAAI,KAAK,CAAC,oCAAoC,KAAK,EAAE,CAAC,CAAC,CAAC;YACjE,CAAC,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;IACL,CAAC;CACF;AA1LD,oDA0LC"}
1
+ {"version":3,"file":"claudeCodeCLI.js","sourceRoot":"","sources":["../../src/adapters/claudeCodeCLI.ts"],"names":[],"mappings":";AAAA;;GAEG;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAEH,iDAAsC;AACtC,uCAAyB;AACzB,2CAA6B;AAE7B,gDAA+E;AAC/E,4DAGiC;AAOjC,MAAM,aAAa,GAAG,QAAQ,CAAC;AAE/B,MAAa,oBAAoB;IAK/B,YAAY,OAAqC;QAJjC,SAAI,GAAG,aAAsB,CAAC;QAK5C,IAAI,CAAC,aAAa,GAAG,IAAA,qCAAoB,EAAC,OAAO,EAAE,aAAa,CAAC,CAAC;QAClE,IAAI,CAAC,KAAK,GAAG,OAAO,EAAE,KAAK,IAAI,aAAa,CAAC;IAC/C,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,iBAAiB;QACrB,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,EAAE;YAC7B,MAAM,IAAI,GAAG,IAAA,qBAAK,EAAC,OAAO,EAAE,CAAC,QAAQ,CAAC,EAAE;gBACtC,KAAK,EAAE,MAAM;aACd,CAAC,CAAC;YAEH,IAAI,CAAC,EAAE,CAAC,OAAO,EAAE,CAAC,IAAI,EAAE,EAAE;gBACxB,OAAO,CAAC,IAAI,KAAK,CAAC,CAAC,CAAC;YACtB,CAAC,CAAC,CAAC;YAEH,IAAI,CAAC,EAAE,CAAC,OAAO,EAAE,GAAG,EAAE;gBACpB,OAAO,CAAC,KAAK,CAAC,CAAC;YACjB,CAAC,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;IACL,CAAC;IAED;;OAEG;IACH,QAAQ;QACN,OAAO,IAAI,CAAC,KAAK,CAAC;IACpB,CAAC;IAED;;OAEG;IACK,WAAW,CACjB,MAAc,EACd,YAAgC;QAEhC,MAAM,KAAK,GAAa,EAAE,CAAC;QAE3B,IAAI,YAAY,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC5C,MAAM,QAAQ,GAAG,IAAA,iCAAgB,EAAC,IAAI,CAAC,aAAa,EAAE,YAAY,CAAC,CAAC;YACpE,IAAI,QAAQ,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACxB,KAAK,CAAC,IAAI,CAAC,qBAAqB,CAAC,CAAC;gBAClC,MAAM,cAAc,GAAG,QAAQ;qBAC5B,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE;oBACX,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,YAAY,CAAC;oBAC5D,OAAO,OAAO,GAAG,CAAC,IAAI,WAAW,GAAG,KAAK,GAAG,CAAC,OAAO,UAAU,CAAC;gBACjE,CAAC,CAAC;qBACD,IAAI,CAAC,MAAM,CAAC,CAAC;gBAChB,KAAK,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;gBAC3B,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;YACxB,CAAC;QACH,CAAC;QAED,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;QACvB,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;QACnB,KAAK,CAAC,IAAI,CACR,0GAA0G,CAC3G,CAAC;QAEF,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAC1B,CAAC;IAED;;;OAGG;IACH,KAAK,CAAC,QAAQ,CACZ,MAAc,EACd,YAAgC,EAChC,OAAuB;QAEvB,MAAM,WAAW,GAAG,MAAM,IAAI,CAAC,iBAAiB,EAAE,CAAC;QACnD,IAAI,CAAC,WAAW,EAAE,CAAC;YACjB,MAAM,IAAI,KAAK,CACb,6
HAA6H,CAC9H,CAAC;QACJ,CAAC;QAED,MAAM,UAAU,GAAG,IAAI,CAAC,WAAW,CAAC,MAAM,EAAE,YAAY,CAAC,CAAC;QAE1D,MAAM,YAAY,GAAG,IAAA,gCAAqB,EAAC,IAAI,CAAC,aAAa,CAAC,CAAC;QAE/D,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,MAAM,EAAE,EAAE;YACrC,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,aAAa,EAAE,yBAAyB,CAAC,CAAC;YAC1E,EAAE,CAAC,aAAa,CAAC,QAAQ,EAAE,UAAU,EAAE,MAAM,CAAC,CAAC;YAE/C,MAAM,OAAO,GAAG,GAAS,EAAE;gBACzB,IAAI,CAAC;oBACH,IAAI,EAAE,CAAC,UAAU,CAAC,QAAQ,CAAC,EAAE,CAAC;wBAC5B,EAAE,CAAC,UAAU,CAAC,QAAQ,CAAC,CAAC;oBAC1B,CAAC;gBACH,CAAC;gBAAC,MAAM,CAAC;oBACP,wBAAwB;gBAC1B,CAAC;YACH,CAAC,CAAC;YAEF,MAAM,aAAa,GAAG,GAAS,EAAE;gBAC/B,OAAO,EAAE,CAAC;YACZ,CAAC,CAAC;YACF,OAAO,CAAC,IAAI,CAAC,QAAQ,EAAE,aAAa,CAAC,CAAC;YACtC,OAAO,CAAC,IAAI,CAAC,SAAS,EAAE,aAAa,CAAC,CAAC;YAEvC,MAAM,OAAO,GAAG,QAAQ,QAAQ,sBAAsB,IAAI,CAAC,KAAK,yIAAyI,CAAC;YAC1M,MAAM,IAAI,GAAG,IAAA,qBAAK,EAAC,IAAI,EAAE,CAAC,IAAI,EAAE,OAAO,CAAC,EAAE;gBACxC,GAAG,EAAE,IAAI,CAAC,aAAa;gBACvB,KAAK,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,CAAC;aAChC,CAAC,CAAC;YAEH,6DAA6D;YAC7D,IAAI,MAAM,GAAG,EAAE,CAAC;YAChB,IAAI,MAAM,GAAG,EAAE,CAAC;YAEhB,IAAI,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,EAAE;gBAC/B,MAAM,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC;YAC5B,CAAC,CAAC,CAAC;YAEH,IAAI,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,EAAE;gBAC/B,MAAM,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC;YAC5B,CAAC,CAAC,CAAC;YAEH,IAAI,aAAa,GAA0B,IAAI,CAAC;YAChD,IAAI,OAAO,KAAK,IAAI,IAAI,OAAO,KAAK,SAAS,EAAE,CAAC;gBAC9C,aAAa,GAAG,UAAU,CAAC,GAAG,EAAE;oBAC9B,IAAI,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;oBACrB,OAAO,EAAE,CAAC;oBACV,OAAO,CAAC,cAAc,CAAC,QAAQ,EAAE,aAAa,CAAC,CAAC;oBAChD,OAAO,CAAC,cAAc,CAAC,SAAS,EAAE,aAAa,CAAC,CAAC;oBACjD,MAAM,CAAC,IAAI,KAAK,CAAC,mCAAmC,OAAO,IAAI,CAAC,CAAC,CAAC;gBACpE,CAAC,EAAE,OAAO,CAAC,CAAC;YACd,CAAC;YAED,IAAI,CAAC,EAAE,CAAC,OAAO,EAAE,CAAC,IAAI,EAAE,EAAE;gBACxB,IAAI,aAAa,EAAE,CAAC;oBAClB,YAAY,CAAC,aAAa,CAAC,CAAC;gBAC9B,CAAC;gBAED,OAAO,EAAE,CAAC;gBACV,OAAO,CAAC,cAAc,CAAC,QAAQ,EAAE,aAAa,CAAC,CAAC;gBAChD,OAAO,CAAC,cAAc,CAAC,SAAS,EAAE,aAAa,CAAC,CAAC;gBAEjD,IAAI,IAAI,KAAK,CAAC,EAAE,CAAC;oBACf
,MAAM,CACJ,IAAI,KAAK,CACP,oCAAoC,IAAI,aAAa,MAAM,EAAE,CAC9D,CACF,CAAC;oBACF,OAAO;gBACT,CAAC;gBAED,IAAI,CAAC;oBACH,MAAM,WAAW,GAAG,IAAA,gCAAqB,EAAC,IAAI,CAAC,aAAa,CAAC,CAAC;oBAC9D,MAAM,YAAY,GAAG,IAAA,8BAAmB,EACtC,YAAY,EACZ,WAAW,EACX,IAAI,CAAC,aAAa,CACnB,CAAC;oBACF,OAAO,CAAC,YAAY,CAAC,CAAC;gBACxB,CAAC;gBAAC,OAAO,KAAK,EAAE,CAAC;oBACf,MAAM,CAAC,IAAI,KAAK,CAAC,gCAAgC,KAAK,EAAE,CAAC,CAAC,CAAC;gBAC7D,CAAC;YACH,CAAC,CAAC,CAAC;YAEH,IAAI,CAAC,EAAE,CAAC,OAAO,EAAE,CAAC,KAAK,EAAE,EAAE;gBACzB,IAAI,aAAa,EAAE,CAAC;oBAClB,YAAY,CAAC,aAAa,CAAC,CAAC;gBAC9B,CAAC;gBACD,OAAO,EAAE,CAAC;gBACV,OAAO,CAAC,cAAc,CAAC,QAAQ,EAAE,aAAa,CAAC,CAAC;gBAChD,OAAO,CAAC,cAAc,CAAC,SAAS,EAAE,aAAa,CAAC,CAAC;gBACjD,MAAM,CAAC,IAAI,KAAK,CAAC,oCAAoC,KAAK,EAAE,CAAC,CAAC,CAAC;YACjE,CAAC,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;IACL,CAAC;CACF;AApLD,oDAoLC"}
@@ -1 +1 @@
1
- {"version":3,"file":"copilotCLI.d.ts","sourceRoot":"","sources":["../../src/adapters/copilotCLI.ts"],"names":[],"mappings":"AAAA;;GAEG;AAKH,OAAO,EAAE,qBAAqB,EAAE,MAAM,UAAU,CAAC;AAOjD,MAAM,WAAW,wBAAwB;IACvC,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAID,qBAAa,iBAAkB,YAAW,qBAAqB;IAC7D,SAAgB,IAAI,EAAG,SAAS,CAAU;IAC1C,OAAO,CAAC,aAAa,CAAS;IAC9B,OAAO,CAAC,KAAK,CAAS;gBAEV,OAAO,CAAC,EAAE,wBAAwB;IAKxC,iBAAiB,IAAI,OAAO,CAAC,OAAO,CAAC;IAgB3C;;OAEG;IACH,QAAQ,IAAI,MAAM;IAIlB;;OAEG;IACH,OAAO,CAAC,WAAW;IA8BnB;;;OAGG;IACG,QAAQ,CACZ,MAAM,EAAE,MAAM,EACd,YAAY,CAAC,EAAE,SAAS,MAAM,EAAE,EAChC,OAAO,CAAC,EAAE,MAAM,GAAG,IAAI,GACtB,OAAO,CAAC,MAAM,EAAE,CAAC;CA+GrB"}
1
+ {"version":3,"file":"copilotCLI.d.ts","sourceRoot":"","sources":["../../src/adapters/copilotCLI.ts"],"names":[],"mappings":"AAAA;;GAEG;AAKH,OAAO,EAAE,qBAAqB,EAAE,MAAM,UAAU,CAAC;AAOjD,MAAM,WAAW,wBAAwB;IACvC,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAID,qBAAa,iBAAkB,YAAW,qBAAqB;IAC7D,SAAgB,IAAI,EAAG,SAAS,CAAU;IAC1C,OAAO,CAAC,aAAa,CAAS;IAC9B,OAAO,CAAC,KAAK,CAAS;gBAEV,OAAO,CAAC,EAAE,wBAAwB;IAKxC,iBAAiB,IAAI,OAAO,CAAC,OAAO,CAAC;IAgB3C;;OAEG;IACH,QAAQ,IAAI,MAAM;IAIlB;;OAEG;IACH,OAAO,CAAC,WAAW;IA8BnB;;;OAGG;IACG,QAAQ,CACZ,MAAM,EAAE,MAAM,EACd,YAAY,CAAC,EAAE,SAAS,MAAM,EAAE,EAChC,OAAO,CAAC,EAAE,MAAM,GAAG,IAAI,GACtB,OAAO,CAAC,MAAM,EAAE,CAAC;CAyGrB"}
@@ -102,13 +102,10 @@ class CopilotCLIAdapter {
102
102
  throw new Error("GitHub Copilot CLI is not available. Please install it first: https://docs.github.com/en/copilot/how-tos/set-up/install-copilot-cli");
103
103
  }
104
104
  const fullPrompt = this.buildPrompt(prompt, contextFiles);
105
- // Capture git status before generation
106
105
  const statusBefore = (0, gitUtils_1.getGitStatusPorcelain)(this.workspaceRoot);
107
- // Write prompt to temp file and pipe via stdin (matches @copilot-evals pattern)
108
106
  return new Promise((resolve, reject) => {
109
107
  const tempFile = path.join(this.workspaceRoot, ".copilot-eval-prompt.txt");
110
108
  fs.writeFileSync(tempFile, fullPrompt, "utf8");
111
- // Cleanup function
112
109
  const cleanup = () => {
113
110
  try {
114
111
  if (fs.existsSync(tempFile)) {
@@ -119,7 +116,6 @@ class CopilotCLIAdapter {
119
116
  // Ignore cleanup errors
120
117
  }
121
118
  };
122
- // Register cleanup on process termination
123
119
  const cleanupOnExit = () => {
124
120
  cleanup();
125
121
  };
@@ -138,7 +134,6 @@ class CopilotCLIAdapter {
138
134
  proc.stderr?.on("data", (data) => {
139
135
  stderr += data.toString();
140
136
  });
141
- // Set timeout only if specified (null/undefined = no timeout)
142
137
  let timeoutHandle = null;
143
138
  if (timeout !== null && timeout !== undefined) {
144
139
  timeoutHandle = setTimeout(() => {
@@ -160,7 +155,6 @@ class CopilotCLIAdapter {
160
155
  reject(new Error(`Copilot CLI exited with code ${code}\nStderr: ${stderr}`));
161
156
  return;
162
157
  }
163
- // Get files changed during generation (diff before/after)
164
158
  try {
165
159
  const statusAfter = (0, gitUtils_1.getGitStatusPorcelain)(this.workspaceRoot);
166
160
  const changedFiles = (0, gitUtils_1.getChangedFilesDiff)(statusBefore, statusAfter, this.workspaceRoot);