aiforcecli-chat 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. package/License.MD +49 -0
  2. package/README.md +642 -0
  3. package/aiforcecli.config.example.json +66 -0
  4. package/assets/README.md +14 -0
  5. package/dist/cli.js +2 -0
  6. package/dist/index.js +2 -0
  7. package/package.json +62 -0
  8. package/tools/scorecard/README.md +92 -0
  9. package/tools/scorecard/config.json +134 -0
  10. package/tools/scorecard/fetch.mjs +335 -0
  11. package/tools/scorecard/generate.mjs +289 -0
  12. package/tools/scorecard/generated/example/invalid-rows.json +1 -0
  13. package/tools/scorecard/generated/example/scorecard-report.md +147 -0
  14. package/tools/scorecard/generated/example/scorecard.compact.json +61 -0
  15. package/tools/scorecard/generated/example/scorecard.json +1492 -0
  16. package/tools/scorecard/generated/example/unmapped-models.json +1492 -0
  17. package/tools/scorecard/generated/raw/aider_polyglot.html +21071 -0
  18. package/tools/scorecard/generated/raw/terminal_bench_2_1.html +2 -0
  19. package/tools/scorecard/generated/scorecard/invalid-rows.json +1 -0
  20. package/tools/scorecard/generated/scorecard/scorecard-report.md +133 -0
  21. package/tools/scorecard/generated/scorecard/scorecard.compact.json +51 -0
  22. package/tools/scorecard/generated/scorecard/scorecard.json +1181 -0
  23. package/tools/scorecard/generated/scorecard/unmapped-models.json +1492 -0
  24. package/tools/scorecard/generated/scorecard-example/invalid-rows.json +1 -0
  25. package/tools/scorecard/generated/scorecard-example/scorecard-report.md +40 -0
  26. package/tools/scorecard/generated/scorecard-example/scorecard.compact.json +22 -0
  27. package/tools/scorecard/generated/scorecard-example/scorecard.json +389 -0
  28. package/tools/scorecard/generated/scorecard-example/unmapped-models.json +1 -0
  29. package/tools/scorecard/generated/scorecard-fetch/raw/aider_polyglot.html +21071 -0
  30. package/tools/scorecard/generated/scorecard-fetch/raw/terminal_bench_2_1.html +2 -0
  31. package/tools/scorecard/snapshots/example.normalized.example.json +38 -0
  32. package/tools/scorecard/snapshots/live.aider_polyglot.json +1318 -0
  33. package/tools/scorecard/snapshots/live.terminal_bench_2_1.json +294 -0
@@ -0,0 +1,1492 @@
1
+ {
2
+ "version": "manual.2026.06.16",
3
+ "generatedAt": "2026-06-16T20:38:21.186Z",
4
+ "taskTypes": [
5
+ "bugfix",
6
+ "feature",
7
+ "refactor",
8
+ "test",
9
+ "docs",
10
+ "security",
11
+ "perf",
12
+ "general"
13
+ ],
14
+ "notes": [
15
+ "Generated scorecard artifact. It is not used by the application unless explicitly wired in later.",
16
+ "Scores are normalized public benchmark priors, not private repo outcomes."
17
+ ],
18
+ "scores": {
19
+ "antigravity:gemini-3.1-pro": {
20
+ "bugfix": {
21
+ "score": 0.7331,
22
+ "confidence": 0.2143,
23
+ "evidenceWeight": 0.5456,
24
+ "sources": [
25
+ {
26
+ "source": "terminal_bench",
27
+ "benchmark": "terminal_bench",
28
+ "metric": "accuracy",
29
+ "score": 0.7066,
30
+ "weight": 0.1574,
31
+ "date": "2026-05-05",
32
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
33
+ "modelRaw": "Gemini CLI Gemini 3.1 Pro"
34
+ },
35
+ {
36
+ "source": "terminal_bench",
37
+ "benchmark": "terminal_bench",
38
+ "metric": "accuracy",
39
+ "score": 0.6629,
40
+ "weight": 0.1556,
41
+ "date": "2026-05-02",
42
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
43
+ "modelRaw": "Gemini CLI Gemini 3 Pro"
44
+ },
45
+ {
46
+ "source": "aider_polyglot",
47
+ "benchmark": "aider_polyglot",
48
+ "metric": "pass_rate_2",
49
+ "score": 0.831,
50
+ "weight": 0.0805,
51
+ "sampleSize": 225,
52
+ "date": "2025-06-06",
53
+ "url": "https://aider.chat/docs/leaderboards/",
54
+ "modelRaw": "gemini-2.5-pro-preview-06-05 32k think"
55
+ },
56
+ {
57
+ "source": "aider_polyglot",
58
+ "benchmark": "aider_polyglot",
59
+ "metric": "pass_rate_2",
60
+ "score": 0.791,
61
+ "weight": 0.0805,
62
+ "sampleSize": 225,
63
+ "date": "2025-06-06",
64
+ "url": "https://aider.chat/docs/leaderboards/",
65
+ "modelRaw": "gemini-2.5-pro-preview-06-05 default think"
66
+ },
67
+ {
68
+ "source": "aider_polyglot",
69
+ "benchmark": "aider_polyglot",
70
+ "metric": "pass_rate_2",
71
+ "score": 0.769,
72
+ "weight": 0.0717,
73
+ "sampleSize": 225,
74
+ "date": "2025-05-07",
75
+ "url": "https://aider.chat/docs/leaderboards/",
76
+ "modelRaw": "Gemini 2.5 Pro Preview 05-06"
77
+ }
78
+ ]
79
+ },
80
+ "feature": {
81
+ "score": 0.7424,
82
+ "confidence": 0.1985,
83
+ "evidenceWeight": 0.4954,
84
+ "sources": [
85
+ {
86
+ "source": "terminal_bench",
87
+ "benchmark": "terminal_bench",
88
+ "metric": "accuracy",
89
+ "score": 0.7066,
90
+ "weight": 0.1224,
91
+ "date": "2026-05-05",
92
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
93
+ "modelRaw": "Gemini CLI Gemini 3.1 Pro"
94
+ },
95
+ {
96
+ "source": "terminal_bench",
97
+ "benchmark": "terminal_bench",
98
+ "metric": "accuracy",
99
+ "score": 0.6629,
100
+ "weight": 0.121,
101
+ "date": "2026-05-02",
102
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
103
+ "modelRaw": "Gemini CLI Gemini 3 Pro"
104
+ },
105
+ {
106
+ "source": "aider_polyglot",
107
+ "benchmark": "aider_polyglot",
108
+ "metric": "pass_rate_2",
109
+ "score": 0.831,
110
+ "weight": 0.0872,
111
+ "sampleSize": 225,
112
+ "date": "2025-06-06",
113
+ "url": "https://aider.chat/docs/leaderboards/",
114
+ "modelRaw": "gemini-2.5-pro-preview-06-05 32k think"
115
+ },
116
+ {
117
+ "source": "aider_polyglot",
118
+ "benchmark": "aider_polyglot",
119
+ "metric": "pass_rate_2",
120
+ "score": 0.791,
121
+ "weight": 0.0872,
122
+ "sampleSize": 225,
123
+ "date": "2025-06-06",
124
+ "url": "https://aider.chat/docs/leaderboards/",
125
+ "modelRaw": "gemini-2.5-pro-preview-06-05 default think"
126
+ },
127
+ {
128
+ "source": "aider_polyglot",
129
+ "benchmark": "aider_polyglot",
130
+ "metric": "pass_rate_2",
131
+ "score": 0.769,
132
+ "weight": 0.0777,
133
+ "sampleSize": 225,
134
+ "date": "2025-05-07",
135
+ "url": "https://aider.chat/docs/leaderboards/",
136
+ "modelRaw": "Gemini 2.5 Pro Preview 05-06"
137
+ }
138
+ ]
139
+ },
140
+ "refactor": {
141
+ "score": 0.7574,
142
+ "confidence": 0.1949,
143
+ "evidenceWeight": 0.484,
144
+ "sources": [
145
+ {
146
+ "source": "aider_polyglot",
147
+ "benchmark": "aider_polyglot",
148
+ "metric": "pass_rate_2",
149
+ "score": 0.831,
150
+ "weight": 0.1073,
151
+ "sampleSize": 225,
152
+ "date": "2025-06-06",
153
+ "url": "https://aider.chat/docs/leaderboards/",
154
+ "modelRaw": "gemini-2.5-pro-preview-06-05 32k think"
155
+ },
156
+ {
157
+ "source": "aider_polyglot",
158
+ "benchmark": "aider_polyglot",
159
+ "metric": "pass_rate_2",
160
+ "score": 0.791,
161
+ "weight": 0.1073,
162
+ "sampleSize": 225,
163
+ "date": "2025-06-06",
164
+ "url": "https://aider.chat/docs/leaderboards/",
165
+ "modelRaw": "gemini-2.5-pro-preview-06-05 default think"
166
+ },
167
+ {
168
+ "source": "aider_polyglot",
169
+ "benchmark": "aider_polyglot",
170
+ "metric": "pass_rate_2",
171
+ "score": 0.769,
172
+ "weight": 0.0956,
173
+ "sampleSize": 225,
174
+ "date": "2025-05-07",
175
+ "url": "https://aider.chat/docs/leaderboards/",
176
+ "modelRaw": "Gemini 2.5 Pro Preview 05-06"
177
+ },
178
+ {
179
+ "source": "terminal_bench",
180
+ "benchmark": "terminal_bench",
181
+ "metric": "accuracy",
182
+ "score": 0.7066,
183
+ "weight": 0.0874,
184
+ "date": "2026-05-05",
185
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
186
+ "modelRaw": "Gemini CLI Gemini 3.1 Pro"
187
+ },
188
+ {
189
+ "source": "terminal_bench",
190
+ "benchmark": "terminal_bench",
191
+ "metric": "accuracy",
192
+ "score": 0.6629,
193
+ "weight": 0.0864,
194
+ "date": "2026-05-02",
195
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
196
+ "modelRaw": "Gemini CLI Gemini 3 Pro"
197
+ }
198
+ ]
199
+ },
200
+ "test": {
201
+ "score": 0.7364,
202
+ "confidence": 0.1608,
203
+ "evidenceWeight": 0.3831,
204
+ "sources": [
205
+ {
206
+ "source": "terminal_bench",
207
+ "benchmark": "terminal_bench",
208
+ "metric": "accuracy",
209
+ "score": 0.7066,
210
+ "weight": 0.1049,
211
+ "date": "2026-05-05",
212
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
213
+ "modelRaw": "Gemini CLI Gemini 3.1 Pro"
214
+ },
215
+ {
216
+ "source": "terminal_bench",
217
+ "benchmark": "terminal_bench",
218
+ "metric": "accuracy",
219
+ "score": 0.6629,
220
+ "weight": 0.1037,
221
+ "date": "2026-05-02",
222
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
223
+ "modelRaw": "Gemini CLI Gemini 3 Pro"
224
+ },
225
+ {
226
+ "source": "aider_polyglot",
227
+ "benchmark": "aider_polyglot",
228
+ "metric": "pass_rate_2",
229
+ "score": 0.831,
230
+ "weight": 0.0603,
231
+ "sampleSize": 225,
232
+ "date": "2025-06-06",
233
+ "url": "https://aider.chat/docs/leaderboards/",
234
+ "modelRaw": "gemini-2.5-pro-preview-06-05 32k think"
235
+ },
236
+ {
237
+ "source": "aider_polyglot",
238
+ "benchmark": "aider_polyglot",
239
+ "metric": "pass_rate_2",
240
+ "score": 0.791,
241
+ "weight": 0.0603,
242
+ "sampleSize": 225,
243
+ "date": "2025-06-06",
244
+ "url": "https://aider.chat/docs/leaderboards/",
245
+ "modelRaw": "gemini-2.5-pro-preview-06-05 default think"
246
+ },
247
+ {
248
+ "source": "aider_polyglot",
249
+ "benchmark": "aider_polyglot",
250
+ "metric": "pass_rate_2",
251
+ "score": 0.769,
252
+ "weight": 0.0538,
253
+ "sampleSize": 225,
254
+ "date": "2025-05-07",
255
+ "url": "https://aider.chat/docs/leaderboards/",
256
+ "modelRaw": "Gemini 2.5 Pro Preview 05-06"
257
+ }
258
+ ]
259
+ },
260
+ "docs": {
261
+ "score": 0.7981,
262
+ "confidence": 0.0462,
263
+ "evidenceWeight": 0.0969,
264
+ "sources": [
265
+ {
266
+ "source": "aider_polyglot",
267
+ "benchmark": "aider_polyglot",
268
+ "metric": "pass_rate_2",
269
+ "score": 0.831,
270
+ "weight": 0.0335,
271
+ "sampleSize": 225,
272
+ "date": "2025-06-06",
273
+ "url": "https://aider.chat/docs/leaderboards/",
274
+ "modelRaw": "gemini-2.5-pro-preview-06-05 32k think"
275
+ },
276
+ {
277
+ "source": "aider_polyglot",
278
+ "benchmark": "aider_polyglot",
279
+ "metric": "pass_rate_2",
280
+ "score": 0.791,
281
+ "weight": 0.0335,
282
+ "sampleSize": 225,
283
+ "date": "2025-06-06",
284
+ "url": "https://aider.chat/docs/leaderboards/",
285
+ "modelRaw": "gemini-2.5-pro-preview-06-05 default think"
286
+ },
287
+ {
288
+ "source": "aider_polyglot",
289
+ "benchmark": "aider_polyglot",
290
+ "metric": "pass_rate_2",
291
+ "score": 0.769,
292
+ "weight": 0.0299,
293
+ "sampleSize": 225,
294
+ "date": "2025-05-07",
295
+ "url": "https://aider.chat/docs/leaderboards/",
296
+ "modelRaw": "Gemini 2.5 Pro Preview 05-06"
297
+ }
298
+ ]
299
+ },
300
+ "security": {
301
+ "score": 0.7208,
302
+ "confidence": 0.1325,
303
+ "evidenceWeight": 0.3056,
304
+ "sources": [
305
+ {
306
+ "source": "terminal_bench",
307
+ "benchmark": "terminal_bench",
308
+ "metric": "accuracy",
309
+ "score": 0.7066,
310
+ "weight": 0.1049,
311
+ "date": "2026-05-05",
312
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
313
+ "modelRaw": "Gemini CLI Gemini 3.1 Pro"
314
+ },
315
+ {
316
+ "source": "terminal_bench",
317
+ "benchmark": "terminal_bench",
318
+ "metric": "accuracy",
319
+ "score": 0.6629,
320
+ "weight": 0.1037,
321
+ "date": "2026-05-02",
322
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
323
+ "modelRaw": "Gemini CLI Gemini 3 Pro"
324
+ },
325
+ {
326
+ "source": "aider_polyglot",
327
+ "benchmark": "aider_polyglot",
328
+ "metric": "pass_rate_2",
329
+ "score": 0.831,
330
+ "weight": 0.0335,
331
+ "sampleSize": 225,
332
+ "date": "2025-06-06",
333
+ "url": "https://aider.chat/docs/leaderboards/",
334
+ "modelRaw": "gemini-2.5-pro-preview-06-05 32k think"
335
+ },
336
+ {
337
+ "source": "aider_polyglot",
338
+ "benchmark": "aider_polyglot",
339
+ "metric": "pass_rate_2",
340
+ "score": 0.791,
341
+ "weight": 0.0335,
342
+ "sampleSize": 225,
343
+ "date": "2025-06-06",
344
+ "url": "https://aider.chat/docs/leaderboards/",
345
+ "modelRaw": "gemini-2.5-pro-preview-06-05 default think"
346
+ },
347
+ {
348
+ "source": "aider_polyglot",
349
+ "benchmark": "aider_polyglot",
350
+ "metric": "pass_rate_2",
351
+ "score": 0.769,
352
+ "weight": 0.0299,
353
+ "sampleSize": 225,
354
+ "date": "2025-05-07",
355
+ "url": "https://aider.chat/docs/leaderboards/",
356
+ "modelRaw": "Gemini 2.5 Pro Preview 05-06"
357
+ }
358
+ ]
359
+ },
360
+ "perf": {
361
+ "score": 0.7254,
362
+ "confidence": 0.1593,
363
+ "evidenceWeight": 0.3791,
364
+ "sources": [
365
+ {
366
+ "source": "terminal_bench",
367
+ "benchmark": "terminal_bench",
368
+ "metric": "accuracy",
369
+ "score": 0.7066,
370
+ "weight": 0.1224,
371
+ "date": "2026-05-05",
372
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
373
+ "modelRaw": "Gemini CLI Gemini 3.1 Pro"
374
+ },
375
+ {
376
+ "source": "terminal_bench",
377
+ "benchmark": "terminal_bench",
378
+ "metric": "accuracy",
379
+ "score": 0.6629,
380
+ "weight": 0.121,
381
+ "date": "2026-05-02",
382
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
383
+ "modelRaw": "Gemini CLI Gemini 3 Pro"
384
+ },
385
+ {
386
+ "source": "aider_polyglot",
387
+ "benchmark": "aider_polyglot",
388
+ "metric": "pass_rate_2",
389
+ "score": 0.831,
390
+ "weight": 0.0469,
391
+ "sampleSize": 225,
392
+ "date": "2025-06-06",
393
+ "url": "https://aider.chat/docs/leaderboards/",
394
+ "modelRaw": "gemini-2.5-pro-preview-06-05 32k think"
395
+ },
396
+ {
397
+ "source": "aider_polyglot",
398
+ "benchmark": "aider_polyglot",
399
+ "metric": "pass_rate_2",
400
+ "score": 0.791,
401
+ "weight": 0.0469,
402
+ "sampleSize": 225,
403
+ "date": "2025-06-06",
404
+ "url": "https://aider.chat/docs/leaderboards/",
405
+ "modelRaw": "gemini-2.5-pro-preview-06-05 default think"
406
+ },
407
+ {
408
+ "source": "aider_polyglot",
409
+ "benchmark": "aider_polyglot",
410
+ "metric": "pass_rate_2",
411
+ "score": 0.769,
412
+ "weight": 0.0418,
413
+ "sampleSize": 225,
414
+ "date": "2025-05-07",
415
+ "url": "https://aider.chat/docs/leaderboards/",
416
+ "modelRaw": "Gemini 2.5 Pro Preview 05-06"
417
+ }
418
+ ]
419
+ },
420
+ "general": {
421
+ "score": 0.7231,
422
+ "confidence": 0.2397,
423
+ "evidenceWeight": 0.6305,
424
+ "sources": [
425
+ {
426
+ "source": "terminal_bench",
427
+ "benchmark": "terminal_bench",
428
+ "metric": "accuracy",
429
+ "score": 0.7066,
430
+ "weight": 0.2098,
431
+ "date": "2026-05-05",
432
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
433
+ "modelRaw": "Gemini CLI Gemini 3.1 Pro"
434
+ },
435
+ {
436
+ "source": "terminal_bench",
437
+ "benchmark": "terminal_bench",
438
+ "metric": "accuracy",
439
+ "score": 0.6629,
440
+ "weight": 0.2074,
441
+ "date": "2026-05-02",
442
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
443
+ "modelRaw": "Gemini CLI Gemini 3 Pro"
444
+ },
445
+ {
446
+ "source": "aider_polyglot",
447
+ "benchmark": "aider_polyglot",
448
+ "metric": "pass_rate_2",
449
+ "score": 0.831,
450
+ "weight": 0.0738,
451
+ "sampleSize": 225,
452
+ "date": "2025-06-06",
453
+ "url": "https://aider.chat/docs/leaderboards/",
454
+ "modelRaw": "gemini-2.5-pro-preview-06-05 32k think"
455
+ },
456
+ {
457
+ "source": "aider_polyglot",
458
+ "benchmark": "aider_polyglot",
459
+ "metric": "pass_rate_2",
460
+ "score": 0.791,
461
+ "weight": 0.0738,
462
+ "sampleSize": 225,
463
+ "date": "2025-06-06",
464
+ "url": "https://aider.chat/docs/leaderboards/",
465
+ "modelRaw": "gemini-2.5-pro-preview-06-05 default think"
466
+ },
467
+ {
468
+ "source": "aider_polyglot",
469
+ "benchmark": "aider_polyglot",
470
+ "metric": "pass_rate_2",
471
+ "score": 0.769,
472
+ "weight": 0.0657,
473
+ "sampleSize": 225,
474
+ "date": "2025-05-07",
475
+ "url": "https://aider.chat/docs/leaderboards/",
476
+ "modelRaw": "Gemini 2.5 Pro Preview 05-06"
477
+ }
478
+ ]
479
+ }
480
+ },
481
+ "claude-code:opus": {
482
+ "bugfix": {
483
+ "score": 0.7454,
484
+ "confidence": 0.1407,
485
+ "evidenceWeight": 0.3276,
486
+ "sources": [
487
+ {
488
+ "source": "terminal_bench",
489
+ "benchmark": "terminal_bench",
490
+ "metric": "accuracy",
491
+ "score": 0.7888,
492
+ "weight": 0.1726,
493
+ "date": "2026-05-29",
494
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
495
+ "modelRaw": "Claude Code Claude Opus 4.8"
496
+ },
497
+ {
498
+ "source": "terminal_bench",
499
+ "benchmark": "terminal_bench",
500
+ "metric": "accuracy",
501
+ "score": 0.6972,
502
+ "weight": 0.155,
503
+ "date": "2026-05-01",
504
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
505
+ "modelRaw": "Claude Code Claude Opus 4.7"
506
+ }
507
+ ]
508
+ },
509
+ "feature": {
510
+ "score": 0.7454,
511
+ "confidence": 0.113,
512
+ "evidenceWeight": 0.2548,
513
+ "sources": [
514
+ {
515
+ "source": "terminal_bench",
516
+ "benchmark": "terminal_bench",
517
+ "metric": "accuracy",
518
+ "score": 0.7888,
519
+ "weight": 0.1343,
520
+ "date": "2026-05-29",
521
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
522
+ "modelRaw": "Claude Code Claude Opus 4.8"
523
+ },
524
+ {
525
+ "source": "terminal_bench",
526
+ "benchmark": "terminal_bench",
527
+ "metric": "accuracy",
528
+ "score": 0.6972,
529
+ "weight": 0.1205,
530
+ "date": "2026-05-01",
531
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
532
+ "modelRaw": "Claude Code Claude Opus 4.7"
533
+ }
534
+ ]
535
+ },
536
+ "refactor": {
537
+ "score": 0.7454,
538
+ "confidence": 0.0834,
539
+ "evidenceWeight": 0.182,
540
+ "sources": [
541
+ {
542
+ "source": "terminal_bench",
543
+ "benchmark": "terminal_bench",
544
+ "metric": "accuracy",
545
+ "score": 0.7888,
546
+ "weight": 0.0959,
547
+ "date": "2026-05-29",
548
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
549
+ "modelRaw": "Claude Code Claude Opus 4.8"
550
+ },
551
+ {
552
+ "source": "terminal_bench",
553
+ "benchmark": "terminal_bench",
554
+ "metric": "accuracy",
555
+ "score": 0.6972,
556
+ "weight": 0.0861,
557
+ "date": "2026-05-01",
558
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
559
+ "modelRaw": "Claude Code Claude Opus 4.7"
560
+ }
561
+ ]
562
+ },
563
+ "test": {
564
+ "score": 0.7454,
565
+ "confidence": 0.0984,
566
+ "evidenceWeight": 0.2184,
567
+ "sources": [
568
+ {
569
+ "source": "terminal_bench",
570
+ "benchmark": "terminal_bench",
571
+ "metric": "accuracy",
572
+ "score": 0.7888,
573
+ "weight": 0.1151,
574
+ "date": "2026-05-29",
575
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
576
+ "modelRaw": "Claude Code Claude Opus 4.8"
577
+ },
578
+ {
579
+ "source": "terminal_bench",
580
+ "benchmark": "terminal_bench",
581
+ "metric": "accuracy",
582
+ "score": 0.6972,
583
+ "weight": 0.1033,
584
+ "date": "2026-05-01",
585
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
586
+ "modelRaw": "Claude Code Claude Opus 4.7"
587
+ }
588
+ ]
589
+ },
590
+ "security": {
591
+ "score": 0.7454,
592
+ "confidence": 0.0984,
593
+ "evidenceWeight": 0.2184,
594
+ "sources": [
595
+ {
596
+ "source": "terminal_bench",
597
+ "benchmark": "terminal_bench",
598
+ "metric": "accuracy",
599
+ "score": 0.7888,
600
+ "weight": 0.1151,
601
+ "date": "2026-05-29",
602
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
603
+ "modelRaw": "Claude Code Claude Opus 4.8"
604
+ },
605
+ {
606
+ "source": "terminal_bench",
607
+ "benchmark": "terminal_bench",
608
+ "metric": "accuracy",
609
+ "score": 0.6972,
610
+ "weight": 0.1033,
611
+ "date": "2026-05-01",
612
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
613
+ "modelRaw": "Claude Code Claude Opus 4.7"
614
+ }
615
+ ]
616
+ },
617
+ "perf": {
618
+ "score": 0.7454,
619
+ "confidence": 0.113,
620
+ "evidenceWeight": 0.2548,
621
+ "sources": [
622
+ {
623
+ "source": "terminal_bench",
624
+ "benchmark": "terminal_bench",
625
+ "metric": "accuracy",
626
+ "score": 0.7888,
627
+ "weight": 0.1343,
628
+ "date": "2026-05-29",
629
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
630
+ "modelRaw": "Claude Code Claude Opus 4.8"
631
+ },
632
+ {
633
+ "source": "terminal_bench",
634
+ "benchmark": "terminal_bench",
635
+ "metric": "accuracy",
636
+ "score": 0.6972,
637
+ "weight": 0.1205,
638
+ "date": "2026-05-01",
639
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
640
+ "modelRaw": "Claude Code Claude Opus 4.7"
641
+ }
642
+ ]
643
+ },
644
+ "general": {
645
+ "score": 0.7454,
646
+ "confidence": 0.1793,
647
+ "evidenceWeight": 0.4368,
648
+ "sources": [
649
+ {
650
+ "source": "terminal_bench",
651
+ "benchmark": "terminal_bench",
652
+ "metric": "accuracy",
653
+ "score": 0.7888,
654
+ "weight": 0.2302,
655
+ "date": "2026-05-29",
656
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
657
+ "modelRaw": "Claude Code Claude Opus 4.8"
658
+ },
659
+ {
660
+ "source": "terminal_bench",
661
+ "benchmark": "terminal_bench",
662
+ "metric": "accuracy",
663
+ "score": 0.6972,
664
+ "weight": 0.2066,
665
+ "date": "2026-05-01",
666
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
667
+ "modelRaw": "Claude Code Claude Opus 4.7"
668
+ }
669
+ ]
670
+ }
671
+ },
672
+ "claude-code:sonnet": {
673
+ "bugfix": {
674
+ "score": 0.7098,
675
+ "confidence": 0.387,
676
+ "evidenceWeight": 1.2626,
677
+ "sources": [
678
+ {
679
+ "source": "swebench_verified",
680
+ "benchmark": "swebench_verified",
681
+ "metric": "resolved",
682
+ "score": 0.72,
683
+ "weight": 0.9408,
684
+ "sampleSize": 500,
685
+ "date": "2026-06-01",
686
+ "url": "https://www.swebench.com/",
687
+ "modelRaw": "Claude Sonnet"
688
+ },
689
+ {
690
+ "source": "aider_polyglot",
691
+ "benchmark": "aider_polyglot",
692
+ "metric": "percent_correct",
693
+ "score": 0.68,
694
+ "weight": 0.3218,
695
+ "sampleSize": 225,
696
+ "date": "2026-06-01",
697
+ "url": "https://aider.chat/docs/leaderboards/",
698
+ "modelRaw": "Claude Sonnet"
699
+ }
700
+ ]
701
+ },
702
+ "feature": {
703
+ "score": 0.6994,
704
+ "confidence": 0.2532,
705
+ "evidenceWeight": 0.6779,
706
+ "sources": [
707
+ {
708
+ "source": "aider_polyglot",
709
+ "benchmark": "aider_polyglot",
710
+ "metric": "percent_correct",
711
+ "score": 0.68,
712
+ "weight": 0.3487,
713
+ "sampleSize": 225,
714
+ "date": "2026-06-01",
715
+ "url": "https://aider.chat/docs/leaderboards/",
716
+ "modelRaw": "Claude Sonnet"
717
+ },
718
+ {
719
+ "source": "swebench_verified",
720
+ "benchmark": "swebench_verified",
721
+ "metric": "resolved",
722
+ "score": 0.72,
723
+ "weight": 0.3293,
724
+ "sampleSize": 500,
725
+ "date": "2026-06-01",
726
+ "url": "https://www.swebench.com/",
727
+ "modelRaw": "Claude Sonnet"
728
+ }
729
+ ]
730
+ },
731
+ "refactor": {
732
+ "score": 0.7019,
733
+ "confidence": 0.3212,
734
+ "evidenceWeight": 0.9465,
735
+ "sources": [
736
+ {
737
+ "source": "swebench_verified",
738
+ "benchmark": "swebench_verified",
739
+ "metric": "resolved",
740
+ "score": 0.72,
741
+ "weight": 0.5174,
742
+ "sampleSize": 500,
743
+ "date": "2026-06-01",
744
+ "url": "https://www.swebench.com/",
745
+ "modelRaw": "Claude Sonnet"
746
+ },
747
+ {
748
+ "source": "aider_polyglot",
749
+ "benchmark": "aider_polyglot",
750
+ "metric": "percent_correct",
751
+ "score": 0.68,
752
+ "weight": 0.4291,
753
+ "sampleSize": 225,
754
+ "date": "2026-06-01",
755
+ "url": "https://aider.chat/docs/leaderboards/",
756
+ "modelRaw": "Claude Sonnet"
757
+ }
758
+ ]
759
+ },
760
+ "test": {
761
+ "score": 0.6997,
762
+ "confidence": 0.1924,
763
+ "evidenceWeight": 0.4766,
764
+ "sources": [
765
+ {
766
+ "source": "aider_polyglot",
767
+ "benchmark": "aider_polyglot",
768
+ "metric": "percent_correct",
769
+ "score": 0.68,
770
+ "weight": 0.2414,
771
+ "sampleSize": 225,
772
+ "date": "2026-06-01",
773
+ "url": "https://aider.chat/docs/leaderboards/",
774
+ "modelRaw": "Claude Sonnet"
775
+ },
776
+ {
777
+ "source": "swebench_verified",
778
+ "benchmark": "swebench_verified",
779
+ "metric": "resolved",
780
+ "score": 0.72,
781
+ "weight": 0.2352,
782
+ "sampleSize": 500,
783
+ "date": "2026-06-01",
784
+ "url": "https://www.swebench.com/",
785
+ "modelRaw": "Claude Sonnet"
786
+ }
787
+ ]
788
+ },
789
+ "docs": {
790
+ "score": 0.68,
791
+ "confidence": 0.0628,
792
+ "evidenceWeight": 0.1341,
793
+ "sources": [
794
+ {
795
+ "source": "aider_polyglot",
796
+ "benchmark": "aider_polyglot",
797
+ "metric": "percent_correct",
798
+ "score": 0.68,
799
+ "weight": 0.1341,
800
+ "sampleSize": 225,
801
+ "date": "2026-06-01",
802
+ "url": "https://aider.chat/docs/leaderboards/",
803
+ "modelRaw": "Claude Sonnet"
804
+ }
805
+ ]
806
+ },
807
+ "security": {
808
+ "score": 0.7055,
809
+ "confidence": 0.1559,
810
+ "evidenceWeight": 0.3693,
811
+ "sources": [
812
+ {
813
+ "source": "swebench_verified",
814
+ "benchmark": "swebench_verified",
815
+ "metric": "resolved",
816
+ "score": 0.72,
817
+ "weight": 0.2352,
818
+ "sampleSize": 500,
819
+ "date": "2026-06-01",
820
+ "url": "https://www.swebench.com/",
821
+ "modelRaw": "Claude Sonnet"
822
+ },
823
+ {
824
+ "source": "aider_polyglot",
825
+ "benchmark": "aider_polyglot",
826
+ "metric": "percent_correct",
827
+ "score": 0.68,
828
+ "weight": 0.1341,
829
+ "sampleSize": 225,
830
+ "date": "2026-06-01",
831
+ "url": "https://aider.chat/docs/leaderboards/",
832
+ "modelRaw": "Claude Sonnet"
833
+ }
834
+ ]
835
+ },
836
+ "perf": {
837
+ "score": 0.7022,
838
+ "confidence": 0.1746,
839
+ "evidenceWeight": 0.4229,
840
+ "sources": [
841
+ {
842
+ "source": "swebench_verified",
843
+ "benchmark": "swebench_verified",
844
+ "metric": "resolved",
845
+ "score": 0.72,
846
+ "weight": 0.2352,
847
+ "sampleSize": 500,
848
+ "date": "2026-06-01",
849
+ "url": "https://www.swebench.com/",
850
+ "modelRaw": "Claude Sonnet"
851
+ },
852
+ {
853
+ "source": "aider_polyglot",
854
+ "benchmark": "aider_polyglot",
855
+ "metric": "percent_correct",
856
+ "score": 0.68,
857
+ "weight": 0.1877,
858
+ "sampleSize": 225,
859
+ "date": "2026-06-01",
860
+ "url": "https://aider.chat/docs/leaderboards/",
861
+ "modelRaw": "Claude Sonnet"
862
+ }
863
+ ]
864
+ },
865
+ "general": {
866
+ "score": 0.7036,
867
+ "confidence": 0.2643,
868
+ "evidenceWeight": 0.7184,
869
+ "sources": [
870
+ {
871
+ "source": "swebench_verified",
872
+ "benchmark": "swebench_verified",
873
+ "metric": "resolved",
874
+ "score": 0.72,
875
+ "weight": 0.4233,
876
+ "sampleSize": 500,
877
+ "date": "2026-06-01",
878
+ "url": "https://www.swebench.com/",
879
+ "modelRaw": "Claude Sonnet"
880
+ },
881
+ {
882
+ "source": "aider_polyglot",
883
+ "benchmark": "aider_polyglot",
884
+ "metric": "percent_correct",
885
+ "score": 0.68,
886
+ "weight": 0.295,
887
+ "sampleSize": 225,
888
+ "date": "2026-06-01",
889
+ "url": "https://aider.chat/docs/leaderboards/",
890
+ "modelRaw": "Claude Sonnet"
891
+ }
892
+ ]
893
+ }
894
+ },
895
+ "codex:gpt-5.4": {
896
+ "bugfix": {
897
+ "score": 0.867,
898
+ "confidence": 0.0519,
899
+ "evidenceWeight": 0.1095,
900
+ "sources": [
901
+ {
902
+ "source": "aider_polyglot",
903
+ "benchmark": "aider_polyglot",
904
+ "metric": "pass_rate_2",
905
+ "score": 0.867,
906
+ "weight": 0.1095,
907
+ "sampleSize": 225,
908
+ "date": "2025-08-25",
909
+ "url": "https://aider.chat/docs/leaderboards/",
910
+ "modelRaw": "gpt-5 medium"
911
+ }
912
+ ]
913
+ },
914
+ "feature": {
915
+ "score": 0.867,
916
+ "confidence": 0.056,
917
+ "evidenceWeight": 0.1186,
918
+ "sources": [
919
+ {
920
+ "source": "aider_polyglot",
921
+ "benchmark": "aider_polyglot",
922
+ "metric": "pass_rate_2",
923
+ "score": 0.867,
924
+ "weight": 0.1186,
925
+ "sampleSize": 225,
926
+ "date": "2025-08-25",
927
+ "url": "https://aider.chat/docs/leaderboards/",
928
+ "modelRaw": "gpt-5 medium"
929
+ }
930
+ ]
931
+ },
932
+ "refactor": {
933
+ "score": 0.867,
934
+ "confidence": 0.068,
935
+ "evidenceWeight": 0.146,
936
+ "sources": [
937
+ {
938
+ "source": "aider_polyglot",
939
+ "benchmark": "aider_polyglot",
940
+ "metric": "pass_rate_2",
941
+ "score": 0.867,
942
+ "weight": 0.146,
943
+ "sampleSize": 225,
944
+ "date": "2025-08-25",
945
+ "url": "https://aider.chat/docs/leaderboards/",
946
+ "modelRaw": "gpt-5 medium"
947
+ }
948
+ ]
949
+ },
950
+ "test": {
951
+ "score": 0.867,
952
+ "confidence": 0.0394,
953
+ "evidenceWeight": 0.0821,
954
+ "sources": [
955
+ {
956
+ "source": "aider_polyglot",
957
+ "benchmark": "aider_polyglot",
958
+ "metric": "pass_rate_2",
959
+ "score": 0.867,
960
+ "weight": 0.0821,
961
+ "sampleSize": 225,
962
+ "date": "2025-08-25",
963
+ "url": "https://aider.chat/docs/leaderboards/",
964
+ "modelRaw": "gpt-5 medium"
965
+ }
966
+ ]
967
+ },
968
+ "docs": {
969
+ "score": 0.867,
970
+ "confidence": 0.0223,
971
+ "evidenceWeight": 0.0456,
972
+ "sources": [
973
+ {
974
+ "source": "aider_polyglot",
975
+ "benchmark": "aider_polyglot",
976
+ "metric": "pass_rate_2",
977
+ "score": 0.867,
978
+ "weight": 0.0456,
979
+ "sampleSize": 225,
980
+ "date": "2025-08-25",
981
+ "url": "https://aider.chat/docs/leaderboards/",
982
+ "modelRaw": "gpt-5 medium"
983
+ }
984
+ ]
985
+ },
986
+ "security": {
987
+ "score": 0.867,
988
+ "confidence": 0.0223,
989
+ "evidenceWeight": 0.0456,
990
+ "sources": [
991
+ {
992
+ "source": "aider_polyglot",
993
+ "benchmark": "aider_polyglot",
994
+ "metric": "pass_rate_2",
995
+ "score": 0.867,
996
+ "weight": 0.0456,
997
+ "sampleSize": 225,
998
+ "date": "2025-08-25",
999
+ "url": "https://aider.chat/docs/leaderboards/",
1000
+ "modelRaw": "gpt-5 medium"
1001
+ }
1002
+ ]
1003
+ },
1004
+ "perf": {
1005
+ "score": 0.867,
1006
+ "confidence": 0.0309,
1007
+ "evidenceWeight": 0.0639,
1008
+ "sources": [
1009
+ {
1010
+ "source": "aider_polyglot",
1011
+ "benchmark": "aider_polyglot",
1012
+ "metric": "pass_rate_2",
1013
+ "score": 0.867,
1014
+ "weight": 0.0639,
1015
+ "sampleSize": 225,
1016
+ "date": "2025-08-25",
1017
+ "url": "https://aider.chat/docs/leaderboards/",
1018
+ "modelRaw": "gpt-5 medium"
1019
+ }
1020
+ ]
1021
+ },
1022
+ "general": {
1023
+ "score": 0.867,
1024
+ "confidence": 0.0478,
1025
+ "evidenceWeight": 0.1004,
1026
+ "sources": [
1027
+ {
1028
+ "source": "aider_polyglot",
1029
+ "benchmark": "aider_polyglot",
1030
+ "metric": "pass_rate_2",
1031
+ "score": 0.867,
1032
+ "weight": 0.1004,
1033
+ "sampleSize": 225,
1034
+ "date": "2025-08-25",
1035
+ "url": "https://aider.chat/docs/leaderboards/",
1036
+ "modelRaw": "gpt-5 medium"
1037
+ }
1038
+ ]
1039
+ }
1040
+ },
1041
+ "codex:gpt-5.4-mini": {
1042
+ "bugfix": {
1043
+ "score": 0.7738,
1044
+ "confidence": 0.1057,
1045
+ "evidenceWeight": 0.2364,
1046
+ "sources": [
1047
+ {
1048
+ "source": "evalplus",
1049
+ "benchmark": "evalplus",
1050
+ "metric": "pass_at_1",
1051
+ "score": 0.74,
1052
+ "weight": 0.1269,
1053
+ "sampleSize": 378,
1054
+ "date": "2026-05-20",
1055
+ "url": "https://github.com/evalplus/evalplus",
1056
+ "modelRaw": "gpt-5.4-mini"
1057
+ },
1058
+ {
1059
+ "source": "aider_polyglot",
1060
+ "benchmark": "aider_polyglot",
1061
+ "metric": "pass_rate_2",
1062
+ "score": 0.813,
1063
+ "weight": 0.1095,
1064
+ "sampleSize": 225,
1065
+ "date": "2025-08-25",
1066
+ "url": "https://aider.chat/docs/leaderboards/",
1067
+ "modelRaw": "gpt-5 low"
1068
+ }
1069
+ ]
1070
+ },
1071
+ "feature": {
1072
+ "score": 0.7618,
1073
+ "confidence": 0.1659,
1074
+ "evidenceWeight": 0.3978,
1075
+ "sources": [
1076
+ {
1077
+ "source": "evalplus",
1078
+ "benchmark": "evalplus",
1079
+ "metric": "pass_at_1",
1080
+ "score": 0.74,
1081
+ "weight": 0.2792,
1082
+ "sampleSize": 378,
1083
+ "date": "2026-05-20",
1084
+ "url": "https://github.com/evalplus/evalplus",
1085
+ "modelRaw": "gpt-5.4-mini"
1086
+ },
1087
+ {
1088
+ "source": "aider_polyglot",
1089
+ "benchmark": "aider_polyglot",
1090
+ "metric": "pass_rate_2",
1091
+ "score": 0.813,
1092
+ "weight": 0.1186,
1093
+ "sampleSize": 225,
1094
+ "date": "2025-08-25",
1095
+ "url": "https://aider.chat/docs/leaderboards/",
1096
+ "modelRaw": "gpt-5 low"
1097
+ }
1098
+ ]
1099
+ },
1100
+ "refactor": {
1101
+ "score": 0.7791,
1102
+ "confidence": 0.1201,
1103
+ "evidenceWeight": 0.2729,
1104
+ "sources": [
1105
+ {
1106
+ "source": "aider_polyglot",
1107
+ "benchmark": "aider_polyglot",
1108
+ "metric": "pass_rate_2",
1109
+ "score": 0.813,
1110
+ "weight": 0.146,
1111
+ "sampleSize": 225,
1112
+ "date": "2025-08-25",
1113
+ "url": "https://aider.chat/docs/leaderboards/",
1114
+ "modelRaw": "gpt-5 low"
1115
+ },
1116
+ {
1117
+ "source": "evalplus",
1118
+ "benchmark": "evalplus",
1119
+ "metric": "pass_at_1",
1120
+ "score": 0.74,
1121
+ "weight": 0.1269,
1122
+ "sampleSize": 378,
1123
+ "date": "2026-05-20",
1124
+ "url": "https://github.com/evalplus/evalplus",
1125
+ "modelRaw": "gpt-5.4-mini"
1126
+ }
1127
+ ]
1128
+ },
1129
+ "test": {
1130
+ "score": 0.7537,
1131
+ "confidence": 0.1795,
1132
+ "evidenceWeight": 0.4375,
1133
+ "sources": [
1134
+ {
1135
+ "source": "evalplus",
1136
+ "benchmark": "evalplus",
1137
+ "metric": "pass_at_1",
1138
+ "score": 0.74,
1139
+ "weight": 0.3554,
1140
+ "sampleSize": 378,
1141
+ "date": "2026-05-20",
1142
+ "url": "https://github.com/evalplus/evalplus",
1143
+ "modelRaw": "gpt-5.4-mini"
1144
+ },
1145
+ {
1146
+ "source": "aider_polyglot",
1147
+ "benchmark": "aider_polyglot",
1148
+ "metric": "pass_rate_2",
1149
+ "score": 0.813,
1150
+ "weight": 0.0821,
1151
+ "sampleSize": 225,
1152
+ "date": "2025-08-25",
1153
+ "url": "https://aider.chat/docs/leaderboards/",
1154
+ "modelRaw": "gpt-5 low"
1155
+ }
1156
+ ]
1157
+ },
1158
+ "docs": {
1159
+ "score": 0.7746,
1160
+ "confidence": 0.046,
1161
+ "evidenceWeight": 0.0964,
1162
+ "sources": [
1163
+ {
1164
+ "source": "evalplus",
1165
+ "benchmark": "evalplus",
1166
+ "metric": "pass_at_1",
1167
+ "score": 0.74,
1168
+ "weight": 0.0508,
1169
+ "sampleSize": 378,
1170
+ "date": "2026-05-20",
1171
+ "url": "https://github.com/evalplus/evalplus",
1172
+ "modelRaw": "gpt-5.4-mini"
1173
+ },
1174
+ {
1175
+ "source": "aider_polyglot",
1176
+ "benchmark": "aider_polyglot",
1177
+ "metric": "pass_rate_2",
1178
+ "score": 0.813,
1179
+ "weight": 0.0456,
1180
+ "sampleSize": 225,
1181
+ "date": "2025-08-25",
1182
+ "url": "https://aider.chat/docs/leaderboards/",
1183
+ "modelRaw": "gpt-5 low"
1184
+ }
1185
+ ]
1186
+ },
1187
+ "security": {
1188
+ "score": 0.7673,
1189
+ "confidence": 0.0574,
1190
+ "evidenceWeight": 0.1218,
1191
+ "sources": [
1192
+ {
1193
+ "source": "evalplus",
1194
+ "benchmark": "evalplus",
1195
+ "metric": "pass_at_1",
1196
+ "score": 0.74,
1197
+ "weight": 0.0762,
1198
+ "sampleSize": 378,
1199
+ "date": "2026-05-20",
1200
+ "url": "https://github.com/evalplus/evalplus",
1201
+ "modelRaw": "gpt-5.4-mini"
1202
+ },
1203
+ {
1204
+ "source": "aider_polyglot",
1205
+ "benchmark": "aider_polyglot",
1206
+ "metric": "pass_rate_2",
1207
+ "score": 0.813,
1208
+ "weight": 0.0456,
1209
+ "sampleSize": 225,
1210
+ "date": "2025-08-25",
1211
+ "url": "https://aider.chat/docs/leaderboards/",
1212
+ "modelRaw": "gpt-5 low"
1213
+ }
1214
+ ]
1215
+ },
1216
+ "perf": {
1217
+ "score": 0.7682,
1218
+ "confidence": 0.0764,
1219
+ "evidenceWeight": 0.1654,
1220
+ "sources": [
1221
+ {
1222
+ "source": "evalplus",
1223
+ "benchmark": "evalplus",
1224
+ "metric": "pass_at_1",
1225
+ "score": 0.74,
1226
+ "weight": 0.1015,
1227
+ "sampleSize": 378,
1228
+ "date": "2026-05-20",
1229
+ "url": "https://github.com/evalplus/evalplus",
1230
+ "modelRaw": "gpt-5.4-mini"
1231
+ },
1232
+ {
1233
+ "source": "aider_polyglot",
1234
+ "benchmark": "aider_polyglot",
1235
+ "metric": "pass_rate_2",
1236
+ "score": 0.813,
1237
+ "weight": 0.0639,
1238
+ "sampleSize": 225,
1239
+ "date": "2025-08-25",
1240
+ "url": "https://aider.chat/docs/leaderboards/",
1241
+ "modelRaw": "gpt-5 low"
1242
+ }
1243
+ ]
1244
+ },
1245
+ "general": {
1246
+ "score": 0.7623,
1247
+ "confidence": 0.1412,
1248
+ "evidenceWeight": 0.3288,
1249
+ "sources": [
1250
+ {
1251
+ "source": "evalplus",
1252
+ "benchmark": "evalplus",
1253
+ "metric": "pass_at_1",
1254
+ "score": 0.74,
1255
+ "weight": 0.2285,
1256
+ "sampleSize": 378,
1257
+ "date": "2026-05-20",
1258
+ "url": "https://github.com/evalplus/evalplus",
1259
+ "modelRaw": "gpt-5.4-mini"
1260
+ },
1261
+ {
1262
+ "source": "aider_polyglot",
1263
+ "benchmark": "aider_polyglot",
1264
+ "metric": "pass_rate_2",
1265
+ "score": 0.813,
1266
+ "weight": 0.1004,
1267
+ "sampleSize": 225,
1268
+ "date": "2025-08-25",
1269
+ "url": "https://aider.chat/docs/leaderboards/",
1270
+ "modelRaw": "gpt-5 low"
1271
+ }
1272
+ ]
1273
+ }
1274
+ },
1275
+ "codex:gpt-5.5": {
1276
+ "bugfix": {
1277
+ "score": 0.8528,
1278
+ "confidence": 0.1165,
1279
+ "evidenceWeight": 0.2636,
1280
+ "sources": [
1281
+ {
1282
+ "source": "terminal_bench",
1283
+ "benchmark": "terminal_bench",
1284
+ "metric": "accuracy",
1285
+ "score": 0.8337,
1286
+ "weight": 0.155,
1287
+ "date": "2026-05-01",
1288
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
1289
+ "modelRaw": "Codex CLI GPT-5.5"
1290
+ },
1291
+ {
1292
+ "source": "aider_polyglot",
1293
+ "benchmark": "aider_polyglot",
1294
+ "metric": "pass_rate_2",
1295
+ "score": 0.88,
1296
+ "weight": 0.1087,
1297
+ "sampleSize": 225,
1298
+ "date": "2025-08-23",
1299
+ "url": "https://aider.chat/docs/leaderboards/",
1300
+ "modelRaw": "gpt-5 high"
1301
+ }
1302
+ ]
1303
+ },
1304
+ "feature": {
1305
+ "score": 0.8566,
1306
+ "confidence": 0.1064,
1307
+ "evidenceWeight": 0.2382,
1308
+ "sources": [
1309
+ {
1310
+ "source": "terminal_bench",
1311
+ "benchmark": "terminal_bench",
1312
+ "metric": "accuracy",
1313
+ "score": 0.8337,
1314
+ "weight": 0.1205,
1315
+ "date": "2026-05-01",
1316
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
1317
+ "modelRaw": "Codex CLI GPT-5.5"
1318
+ },
1319
+ {
1320
+ "source": "aider_polyglot",
1321
+ "benchmark": "aider_polyglot",
1322
+ "metric": "pass_rate_2",
1323
+ "score": 0.88,
1324
+ "weight": 0.1177,
1325
+ "sampleSize": 225,
1326
+ "date": "2025-08-23",
1327
+ "url": "https://aider.chat/docs/leaderboards/",
1328
+ "modelRaw": "gpt-5 high"
1329
+ }
1330
+ ]
1331
+ },
1332
+ "refactor": {
1333
+ "score": 0.8627,
1334
+ "confidence": 0.1035,
1335
+ "evidenceWeight": 0.231,
1336
+ "sources": [
1337
+ {
1338
+ "source": "aider_polyglot",
1339
+ "benchmark": "aider_polyglot",
1340
+ "metric": "pass_rate_2",
1341
+ "score": 0.88,
1342
+ "weight": 0.1449,
1343
+ "sampleSize": 225,
1344
+ "date": "2025-08-23",
1345
+ "url": "https://aider.chat/docs/leaderboards/",
1346
+ "modelRaw": "gpt-5 high"
1347
+ },
1348
+ {
1349
+ "source": "terminal_bench",
1350
+ "benchmark": "terminal_bench",
1351
+ "metric": "accuracy",
1352
+ "score": 0.8337,
1353
+ "weight": 0.0861,
1354
+ "date": "2026-05-01",
1355
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
1356
+ "modelRaw": "Codex CLI GPT-5.5"
1357
+ }
1358
+ ]
1359
+ },
1360
+ "test": {
1361
+ "score": 0.8541,
1362
+ "confidence": 0.0846,
1363
+ "evidenceWeight": 0.1848,
1364
+ "sources": [
1365
+ {
1366
+ "source": "terminal_bench",
1367
+ "benchmark": "terminal_bench",
1368
+ "metric": "accuracy",
1369
+ "score": 0.8337,
1370
+ "weight": 0.1033,
1371
+ "date": "2026-05-01",
1372
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
1373
+ "modelRaw": "Codex CLI GPT-5.5"
1374
+ },
1375
+ {
1376
+ "source": "aider_polyglot",
1377
+ "benchmark": "aider_polyglot",
1378
+ "metric": "pass_rate_2",
1379
+ "score": 0.88,
1380
+ "weight": 0.0815,
1381
+ "sampleSize": 225,
1382
+ "date": "2025-08-23",
1383
+ "url": "https://aider.chat/docs/leaderboards/",
1384
+ "modelRaw": "gpt-5 high"
1385
+ }
1386
+ ]
1387
+ },
1388
+ "docs": {
1389
+ "score": 0.88,
1390
+ "confidence": 0.0221,
1391
+ "evidenceWeight": 0.0453,
1392
+ "sources": [
1393
+ {
1394
+ "source": "aider_polyglot",
1395
+ "benchmark": "aider_polyglot",
1396
+ "metric": "pass_rate_2",
1397
+ "score": 0.88,
1398
+ "weight": 0.0453,
1399
+ "sampleSize": 225,
1400
+ "date": "2025-08-23",
1401
+ "url": "https://aider.chat/docs/leaderboards/",
1402
+ "modelRaw": "gpt-5 high"
1403
+ }
1404
+ ]
1405
+ },
1406
+ "security": {
1407
+ "score": 0.8478,
1408
+ "confidence": 0.0692,
1409
+ "evidenceWeight": 0.1486,
1410
+ "sources": [
1411
+ {
1412
+ "source": "terminal_bench",
1413
+ "benchmark": "terminal_bench",
1414
+ "metric": "accuracy",
1415
+ "score": 0.8337,
1416
+ "weight": 0.1033,
1417
+ "date": "2026-05-01",
1418
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
1419
+ "modelRaw": "Codex CLI GPT-5.5"
1420
+ },
1421
+ {
1422
+ "source": "aider_polyglot",
1423
+ "benchmark": "aider_polyglot",
1424
+ "metric": "pass_rate_2",
1425
+ "score": 0.88,
1426
+ "weight": 0.0453,
1427
+ "sampleSize": 225,
1428
+ "date": "2025-08-23",
1429
+ "url": "https://aider.chat/docs/leaderboards/",
1430
+ "modelRaw": "gpt-5 high"
1431
+ }
1432
+ ]
1433
+ },
1434
+ "perf": {
1435
+ "score": 0.8497,
1436
+ "confidence": 0.0842,
1437
+ "evidenceWeight": 0.1839,
1438
+ "sources": [
1439
+ {
1440
+ "source": "terminal_bench",
1441
+ "benchmark": "terminal_bench",
1442
+ "metric": "accuracy",
1443
+ "score": 0.8337,
1444
+ "weight": 0.1205,
1445
+ "date": "2026-05-01",
1446
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
1447
+ "modelRaw": "Codex CLI GPT-5.5"
1448
+ },
1449
+ {
1450
+ "source": "aider_polyglot",
1451
+ "benchmark": "aider_polyglot",
1452
+ "metric": "pass_rate_2",
1453
+ "score": 0.88,
1454
+ "weight": 0.0634,
1455
+ "sampleSize": 225,
1456
+ "date": "2025-08-23",
1457
+ "url": "https://aider.chat/docs/leaderboards/",
1458
+ "modelRaw": "gpt-5 high"
1459
+ }
1460
+ ]
1461
+ },
1462
+ "general": {
1463
+ "score": 0.8488,
1464
+ "confidence": 0.1328,
1465
+ "evidenceWeight": 0.3062,
1466
+ "sources": [
1467
+ {
1468
+ "source": "terminal_bench",
1469
+ "benchmark": "terminal_bench",
1470
+ "metric": "accuracy",
1471
+ "score": 0.8337,
1472
+ "weight": 0.2066,
1473
+ "date": "2026-05-01",
1474
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
1475
+ "modelRaw": "Codex CLI GPT-5.5"
1476
+ },
1477
+ {
1478
+ "source": "aider_polyglot",
1479
+ "benchmark": "aider_polyglot",
1480
+ "metric": "pass_rate_2",
1481
+ "score": 0.88,
1482
+ "weight": 0.0996,
1483
+ "sampleSize": 225,
1484
+ "date": "2025-08-23",
1485
+ "url": "https://aider.chat/docs/leaderboards/",
1486
+ "modelRaw": "gpt-5 high"
1487
+ }
1488
+ ]
1489
+ }
1490
+ }
1491
+ }
1492
+ }