aiforcecli-chat 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. package/License.MD +49 -0
  2. package/README.md +642 -0
  3. package/aiforcecli.config.example.json +66 -0
  4. package/assets/README.md +14 -0
  5. package/dist/cli.js +2 -0
  6. package/dist/index.js +2 -0
  7. package/package.json +62 -0
  8. package/tools/scorecard/README.md +92 -0
  9. package/tools/scorecard/config.json +134 -0
  10. package/tools/scorecard/fetch.mjs +335 -0
  11. package/tools/scorecard/generate.mjs +289 -0
  12. package/tools/scorecard/generated/example/invalid-rows.json +1 -0
  13. package/tools/scorecard/generated/example/scorecard-report.md +147 -0
  14. package/tools/scorecard/generated/example/scorecard.compact.json +61 -0
  15. package/tools/scorecard/generated/example/scorecard.json +1492 -0
  16. package/tools/scorecard/generated/example/unmapped-models.json +1492 -0
  17. package/tools/scorecard/generated/raw/aider_polyglot.html +21071 -0
  18. package/tools/scorecard/generated/raw/terminal_bench_2_1.html +2 -0
  19. package/tools/scorecard/generated/scorecard/invalid-rows.json +1 -0
  20. package/tools/scorecard/generated/scorecard/scorecard-report.md +133 -0
  21. package/tools/scorecard/generated/scorecard/scorecard.compact.json +51 -0
  22. package/tools/scorecard/generated/scorecard/scorecard.json +1181 -0
  23. package/tools/scorecard/generated/scorecard/unmapped-models.json +1492 -0
  24. package/tools/scorecard/generated/scorecard-example/invalid-rows.json +1 -0
  25. package/tools/scorecard/generated/scorecard-example/scorecard-report.md +40 -0
  26. package/tools/scorecard/generated/scorecard-example/scorecard.compact.json +22 -0
  27. package/tools/scorecard/generated/scorecard-example/scorecard.json +389 -0
  28. package/tools/scorecard/generated/scorecard-example/unmapped-models.json +1 -0
  29. package/tools/scorecard/generated/scorecard-fetch/raw/aider_polyglot.html +21071 -0
  30. package/tools/scorecard/generated/scorecard-fetch/raw/terminal_bench_2_1.html +2 -0
  31. package/tools/scorecard/snapshots/example.normalized.example.json +38 -0
  32. package/tools/scorecard/snapshots/live.aider_polyglot.json +1318 -0
  33. package/tools/scorecard/snapshots/live.terminal_bench_2_1.json +294 -0
@@ -0,0 +1,1181 @@
1
+ {
2
+ "version": "manual.2026.06.16",
3
+ "generatedAt": "2026-06-16T21:10:57.926Z",
4
+ "taskTypes": [
5
+ "bugfix",
6
+ "feature",
7
+ "refactor",
8
+ "test",
9
+ "docs",
10
+ "security",
11
+ "perf",
12
+ "general"
13
+ ],
14
+ "notes": [
15
+ "Generated scorecard artifact. It is not used by the application unless explicitly wired in later.",
16
+ "Scores are normalized public benchmark priors, not private repo outcomes."
17
+ ],
18
+ "scores": {
19
+ "antigravity:gemini-3.1-pro": {
20
+ "bugfix": {
21
+ "score": 0.7331,
22
+ "confidence": 0.2143,
23
+ "evidenceWeight": 0.5455,
24
+ "sources": [
25
+ {
26
+ "source": "terminal_bench",
27
+ "benchmark": "terminal_bench",
28
+ "metric": "accuracy",
29
+ "score": 0.7066,
30
+ "weight": 0.1574,
31
+ "date": "2026-05-05",
32
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
33
+ "modelRaw": "Gemini CLI Gemini 3.1 Pro"
34
+ },
35
+ {
36
+ "source": "terminal_bench",
37
+ "benchmark": "terminal_bench",
38
+ "metric": "accuracy",
39
+ "score": 0.6629,
40
+ "weight": 0.1556,
41
+ "date": "2026-05-02",
42
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
43
+ "modelRaw": "Gemini CLI Gemini 3 Pro"
44
+ },
45
+ {
46
+ "source": "aider_polyglot",
47
+ "benchmark": "aider_polyglot",
48
+ "metric": "pass_rate_2",
49
+ "score": 0.831,
50
+ "weight": 0.0805,
51
+ "sampleSize": 225,
52
+ "date": "2025-06-06",
53
+ "url": "https://aider.chat/docs/leaderboards/",
54
+ "modelRaw": "gemini-2.5-pro-preview-06-05 32k think"
55
+ },
56
+ {
57
+ "source": "aider_polyglot",
58
+ "benchmark": "aider_polyglot",
59
+ "metric": "pass_rate_2",
60
+ "score": 0.791,
61
+ "weight": 0.0805,
62
+ "sampleSize": 225,
63
+ "date": "2025-06-06",
64
+ "url": "https://aider.chat/docs/leaderboards/",
65
+ "modelRaw": "gemini-2.5-pro-preview-06-05 default think"
66
+ },
67
+ {
68
+ "source": "aider_polyglot",
69
+ "benchmark": "aider_polyglot",
70
+ "metric": "pass_rate_2",
71
+ "score": 0.769,
72
+ "weight": 0.0717,
73
+ "sampleSize": 225,
74
+ "date": "2025-05-07",
75
+ "url": "https://aider.chat/docs/leaderboards/",
76
+ "modelRaw": "Gemini 2.5 Pro Preview 05-06"
77
+ }
78
+ ]
79
+ },
80
+ "feature": {
81
+ "score": 0.7424,
82
+ "confidence": 0.1985,
83
+ "evidenceWeight": 0.4954,
84
+ "sources": [
85
+ {
86
+ "source": "terminal_bench",
87
+ "benchmark": "terminal_bench",
88
+ "metric": "accuracy",
89
+ "score": 0.7066,
90
+ "weight": 0.1224,
91
+ "date": "2026-05-05",
92
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
93
+ "modelRaw": "Gemini CLI Gemini 3.1 Pro"
94
+ },
95
+ {
96
+ "source": "terminal_bench",
97
+ "benchmark": "terminal_bench",
98
+ "metric": "accuracy",
99
+ "score": 0.6629,
100
+ "weight": 0.121,
101
+ "date": "2026-05-02",
102
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
103
+ "modelRaw": "Gemini CLI Gemini 3 Pro"
104
+ },
105
+ {
106
+ "source": "aider_polyglot",
107
+ "benchmark": "aider_polyglot",
108
+ "metric": "pass_rate_2",
109
+ "score": 0.831,
110
+ "weight": 0.0872,
111
+ "sampleSize": 225,
112
+ "date": "2025-06-06",
113
+ "url": "https://aider.chat/docs/leaderboards/",
114
+ "modelRaw": "gemini-2.5-pro-preview-06-05 32k think"
115
+ },
116
+ {
117
+ "source": "aider_polyglot",
118
+ "benchmark": "aider_polyglot",
119
+ "metric": "pass_rate_2",
120
+ "score": 0.791,
121
+ "weight": 0.0872,
122
+ "sampleSize": 225,
123
+ "date": "2025-06-06",
124
+ "url": "https://aider.chat/docs/leaderboards/",
125
+ "modelRaw": "gemini-2.5-pro-preview-06-05 default think"
126
+ },
127
+ {
128
+ "source": "aider_polyglot",
129
+ "benchmark": "aider_polyglot",
130
+ "metric": "pass_rate_2",
131
+ "score": 0.769,
132
+ "weight": 0.0777,
133
+ "sampleSize": 225,
134
+ "date": "2025-05-07",
135
+ "url": "https://aider.chat/docs/leaderboards/",
136
+ "modelRaw": "Gemini 2.5 Pro Preview 05-06"
137
+ }
138
+ ]
139
+ },
140
+ "refactor": {
141
+ "score": 0.7574,
142
+ "confidence": 0.1948,
143
+ "evidenceWeight": 0.484,
144
+ "sources": [
145
+ {
146
+ "source": "aider_polyglot",
147
+ "benchmark": "aider_polyglot",
148
+ "metric": "pass_rate_2",
149
+ "score": 0.831,
150
+ "weight": 0.1073,
151
+ "sampleSize": 225,
152
+ "date": "2025-06-06",
153
+ "url": "https://aider.chat/docs/leaderboards/",
154
+ "modelRaw": "gemini-2.5-pro-preview-06-05 32k think"
155
+ },
156
+ {
157
+ "source": "aider_polyglot",
158
+ "benchmark": "aider_polyglot",
159
+ "metric": "pass_rate_2",
160
+ "score": 0.791,
161
+ "weight": 0.1073,
162
+ "sampleSize": 225,
163
+ "date": "2025-06-06",
164
+ "url": "https://aider.chat/docs/leaderboards/",
165
+ "modelRaw": "gemini-2.5-pro-preview-06-05 default think"
166
+ },
167
+ {
168
+ "source": "aider_polyglot",
169
+ "benchmark": "aider_polyglot",
170
+ "metric": "pass_rate_2",
171
+ "score": 0.769,
172
+ "weight": 0.0956,
173
+ "sampleSize": 225,
174
+ "date": "2025-05-07",
175
+ "url": "https://aider.chat/docs/leaderboards/",
176
+ "modelRaw": "Gemini 2.5 Pro Preview 05-06"
177
+ },
178
+ {
179
+ "source": "terminal_bench",
180
+ "benchmark": "terminal_bench",
181
+ "metric": "accuracy",
182
+ "score": 0.7066,
183
+ "weight": 0.0874,
184
+ "date": "2026-05-05",
185
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
186
+ "modelRaw": "Gemini CLI Gemini 3.1 Pro"
187
+ },
188
+ {
189
+ "source": "terminal_bench",
190
+ "benchmark": "terminal_bench",
191
+ "metric": "accuracy",
192
+ "score": 0.6629,
193
+ "weight": 0.0864,
194
+ "date": "2026-05-02",
195
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
196
+ "modelRaw": "Gemini CLI Gemini 3 Pro"
197
+ }
198
+ ]
199
+ },
200
+ "test": {
201
+ "score": 0.7364,
202
+ "confidence": 0.1607,
203
+ "evidenceWeight": 0.3831,
204
+ "sources": [
205
+ {
206
+ "source": "terminal_bench",
207
+ "benchmark": "terminal_bench",
208
+ "metric": "accuracy",
209
+ "score": 0.7066,
210
+ "weight": 0.1049,
211
+ "date": "2026-05-05",
212
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
213
+ "modelRaw": "Gemini CLI Gemini 3.1 Pro"
214
+ },
215
+ {
216
+ "source": "terminal_bench",
217
+ "benchmark": "terminal_bench",
218
+ "metric": "accuracy",
219
+ "score": 0.6629,
220
+ "weight": 0.1037,
221
+ "date": "2026-05-02",
222
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
223
+ "modelRaw": "Gemini CLI Gemini 3 Pro"
224
+ },
225
+ {
226
+ "source": "aider_polyglot",
227
+ "benchmark": "aider_polyglot",
228
+ "metric": "pass_rate_2",
229
+ "score": 0.831,
230
+ "weight": 0.0603,
231
+ "sampleSize": 225,
232
+ "date": "2025-06-06",
233
+ "url": "https://aider.chat/docs/leaderboards/",
234
+ "modelRaw": "gemini-2.5-pro-preview-06-05 32k think"
235
+ },
236
+ {
237
+ "source": "aider_polyglot",
238
+ "benchmark": "aider_polyglot",
239
+ "metric": "pass_rate_2",
240
+ "score": 0.791,
241
+ "weight": 0.0603,
242
+ "sampleSize": 225,
243
+ "date": "2025-06-06",
244
+ "url": "https://aider.chat/docs/leaderboards/",
245
+ "modelRaw": "gemini-2.5-pro-preview-06-05 default think"
246
+ },
247
+ {
248
+ "source": "aider_polyglot",
249
+ "benchmark": "aider_polyglot",
250
+ "metric": "pass_rate_2",
251
+ "score": 0.769,
252
+ "weight": 0.0538,
253
+ "sampleSize": 225,
254
+ "date": "2025-05-07",
255
+ "url": "https://aider.chat/docs/leaderboards/",
256
+ "modelRaw": "Gemini 2.5 Pro Preview 05-06"
257
+ }
258
+ ]
259
+ },
260
+ "docs": {
261
+ "score": 0.7981,
262
+ "confidence": 0.0462,
263
+ "evidenceWeight": 0.0969,
264
+ "sources": [
265
+ {
266
+ "source": "aider_polyglot",
267
+ "benchmark": "aider_polyglot",
268
+ "metric": "pass_rate_2",
269
+ "score": 0.831,
270
+ "weight": 0.0335,
271
+ "sampleSize": 225,
272
+ "date": "2025-06-06",
273
+ "url": "https://aider.chat/docs/leaderboards/",
274
+ "modelRaw": "gemini-2.5-pro-preview-06-05 32k think"
275
+ },
276
+ {
277
+ "source": "aider_polyglot",
278
+ "benchmark": "aider_polyglot",
279
+ "metric": "pass_rate_2",
280
+ "score": 0.791,
281
+ "weight": 0.0335,
282
+ "sampleSize": 225,
283
+ "date": "2025-06-06",
284
+ "url": "https://aider.chat/docs/leaderboards/",
285
+ "modelRaw": "gemini-2.5-pro-preview-06-05 default think"
286
+ },
287
+ {
288
+ "source": "aider_polyglot",
289
+ "benchmark": "aider_polyglot",
290
+ "metric": "pass_rate_2",
291
+ "score": 0.769,
292
+ "weight": 0.0299,
293
+ "sampleSize": 225,
294
+ "date": "2025-05-07",
295
+ "url": "https://aider.chat/docs/leaderboards/",
296
+ "modelRaw": "Gemini 2.5 Pro Preview 05-06"
297
+ }
298
+ ]
299
+ },
300
+ "security": {
301
+ "score": 0.7208,
302
+ "confidence": 0.1325,
303
+ "evidenceWeight": 0.3055,
304
+ "sources": [
305
+ {
306
+ "source": "terminal_bench",
307
+ "benchmark": "terminal_bench",
308
+ "metric": "accuracy",
309
+ "score": 0.7066,
310
+ "weight": 0.1049,
311
+ "date": "2026-05-05",
312
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
313
+ "modelRaw": "Gemini CLI Gemini 3.1 Pro"
314
+ },
315
+ {
316
+ "source": "terminal_bench",
317
+ "benchmark": "terminal_bench",
318
+ "metric": "accuracy",
319
+ "score": 0.6629,
320
+ "weight": 0.1037,
321
+ "date": "2026-05-02",
322
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
323
+ "modelRaw": "Gemini CLI Gemini 3 Pro"
324
+ },
325
+ {
326
+ "source": "aider_polyglot",
327
+ "benchmark": "aider_polyglot",
328
+ "metric": "pass_rate_2",
329
+ "score": 0.831,
330
+ "weight": 0.0335,
331
+ "sampleSize": 225,
332
+ "date": "2025-06-06",
333
+ "url": "https://aider.chat/docs/leaderboards/",
334
+ "modelRaw": "gemini-2.5-pro-preview-06-05 32k think"
335
+ },
336
+ {
337
+ "source": "aider_polyglot",
338
+ "benchmark": "aider_polyglot",
339
+ "metric": "pass_rate_2",
340
+ "score": 0.791,
341
+ "weight": 0.0335,
342
+ "sampleSize": 225,
343
+ "date": "2025-06-06",
344
+ "url": "https://aider.chat/docs/leaderboards/",
345
+ "modelRaw": "gemini-2.5-pro-preview-06-05 default think"
346
+ },
347
+ {
348
+ "source": "aider_polyglot",
349
+ "benchmark": "aider_polyglot",
350
+ "metric": "pass_rate_2",
351
+ "score": 0.769,
352
+ "weight": 0.0299,
353
+ "sampleSize": 225,
354
+ "date": "2025-05-07",
355
+ "url": "https://aider.chat/docs/leaderboards/",
356
+ "modelRaw": "Gemini 2.5 Pro Preview 05-06"
357
+ }
358
+ ]
359
+ },
360
+ "perf": {
361
+ "score": 0.7254,
362
+ "confidence": 0.1593,
363
+ "evidenceWeight": 0.3791,
364
+ "sources": [
365
+ {
366
+ "source": "terminal_bench",
367
+ "benchmark": "terminal_bench",
368
+ "metric": "accuracy",
369
+ "score": 0.7066,
370
+ "weight": 0.1224,
371
+ "date": "2026-05-05",
372
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
373
+ "modelRaw": "Gemini CLI Gemini 3.1 Pro"
374
+ },
375
+ {
376
+ "source": "terminal_bench",
377
+ "benchmark": "terminal_bench",
378
+ "metric": "accuracy",
379
+ "score": 0.6629,
380
+ "weight": 0.121,
381
+ "date": "2026-05-02",
382
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
383
+ "modelRaw": "Gemini CLI Gemini 3 Pro"
384
+ },
385
+ {
386
+ "source": "aider_polyglot",
387
+ "benchmark": "aider_polyglot",
388
+ "metric": "pass_rate_2",
389
+ "score": 0.831,
390
+ "weight": 0.0469,
391
+ "sampleSize": 225,
392
+ "date": "2025-06-06",
393
+ "url": "https://aider.chat/docs/leaderboards/",
394
+ "modelRaw": "gemini-2.5-pro-preview-06-05 32k think"
395
+ },
396
+ {
397
+ "source": "aider_polyglot",
398
+ "benchmark": "aider_polyglot",
399
+ "metric": "pass_rate_2",
400
+ "score": 0.791,
401
+ "weight": 0.0469,
402
+ "sampleSize": 225,
403
+ "date": "2025-06-06",
404
+ "url": "https://aider.chat/docs/leaderboards/",
405
+ "modelRaw": "gemini-2.5-pro-preview-06-05 default think"
406
+ },
407
+ {
408
+ "source": "aider_polyglot",
409
+ "benchmark": "aider_polyglot",
410
+ "metric": "pass_rate_2",
411
+ "score": 0.769,
412
+ "weight": 0.0418,
413
+ "sampleSize": 225,
414
+ "date": "2025-05-07",
415
+ "url": "https://aider.chat/docs/leaderboards/",
416
+ "modelRaw": "Gemini 2.5 Pro Preview 05-06"
417
+ }
418
+ ]
419
+ },
420
+ "general": {
421
+ "score": 0.7231,
422
+ "confidence": 0.2397,
423
+ "evidenceWeight": 0.6304,
424
+ "sources": [
425
+ {
426
+ "source": "terminal_bench",
427
+ "benchmark": "terminal_bench",
428
+ "metric": "accuracy",
429
+ "score": 0.7066,
430
+ "weight": 0.2098,
431
+ "date": "2026-05-05",
432
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
433
+ "modelRaw": "Gemini CLI Gemini 3.1 Pro"
434
+ },
435
+ {
436
+ "source": "terminal_bench",
437
+ "benchmark": "terminal_bench",
438
+ "metric": "accuracy",
439
+ "score": 0.6629,
440
+ "weight": 0.2074,
441
+ "date": "2026-05-02",
442
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
443
+ "modelRaw": "Gemini CLI Gemini 3 Pro"
444
+ },
445
+ {
446
+ "source": "aider_polyglot",
447
+ "benchmark": "aider_polyglot",
448
+ "metric": "pass_rate_2",
449
+ "score": 0.831,
450
+ "weight": 0.0738,
451
+ "sampleSize": 225,
452
+ "date": "2025-06-06",
453
+ "url": "https://aider.chat/docs/leaderboards/",
454
+ "modelRaw": "gemini-2.5-pro-preview-06-05 32k think"
455
+ },
456
+ {
457
+ "source": "aider_polyglot",
458
+ "benchmark": "aider_polyglot",
459
+ "metric": "pass_rate_2",
460
+ "score": 0.791,
461
+ "weight": 0.0738,
462
+ "sampleSize": 225,
463
+ "date": "2025-06-06",
464
+ "url": "https://aider.chat/docs/leaderboards/",
465
+ "modelRaw": "gemini-2.5-pro-preview-06-05 default think"
466
+ },
467
+ {
468
+ "source": "aider_polyglot",
469
+ "benchmark": "aider_polyglot",
470
+ "metric": "pass_rate_2",
471
+ "score": 0.769,
472
+ "weight": 0.0657,
473
+ "sampleSize": 225,
474
+ "date": "2025-05-07",
475
+ "url": "https://aider.chat/docs/leaderboards/",
476
+ "modelRaw": "Gemini 2.5 Pro Preview 05-06"
477
+ }
478
+ ]
479
+ }
480
+ },
481
+ "claude-code:opus": {
482
+ "bugfix": {
483
+ "score": 0.7454,
484
+ "confidence": 0.1407,
485
+ "evidenceWeight": 0.3276,
486
+ "sources": [
487
+ {
488
+ "source": "terminal_bench",
489
+ "benchmark": "terminal_bench",
490
+ "metric": "accuracy",
491
+ "score": 0.7888,
492
+ "weight": 0.1726,
493
+ "date": "2026-05-29",
494
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
495
+ "modelRaw": "Claude Code Claude Opus 4.8"
496
+ },
497
+ {
498
+ "source": "terminal_bench",
499
+ "benchmark": "terminal_bench",
500
+ "metric": "accuracy",
501
+ "score": 0.6972,
502
+ "weight": 0.155,
503
+ "date": "2026-05-01",
504
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
505
+ "modelRaw": "Claude Code Claude Opus 4.7"
506
+ }
507
+ ]
508
+ },
509
+ "feature": {
510
+ "score": 0.7454,
511
+ "confidence": 0.113,
512
+ "evidenceWeight": 0.2548,
513
+ "sources": [
514
+ {
515
+ "source": "terminal_bench",
516
+ "benchmark": "terminal_bench",
517
+ "metric": "accuracy",
518
+ "score": 0.7888,
519
+ "weight": 0.1342,
520
+ "date": "2026-05-29",
521
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
522
+ "modelRaw": "Claude Code Claude Opus 4.8"
523
+ },
524
+ {
525
+ "source": "terminal_bench",
526
+ "benchmark": "terminal_bench",
527
+ "metric": "accuracy",
528
+ "score": 0.6972,
529
+ "weight": 0.1205,
530
+ "date": "2026-05-01",
531
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
532
+ "modelRaw": "Claude Code Claude Opus 4.7"
533
+ }
534
+ ]
535
+ },
536
+ "refactor": {
537
+ "score": 0.7454,
538
+ "confidence": 0.0834,
539
+ "evidenceWeight": 0.182,
540
+ "sources": [
541
+ {
542
+ "source": "terminal_bench",
543
+ "benchmark": "terminal_bench",
544
+ "metric": "accuracy",
545
+ "score": 0.7888,
546
+ "weight": 0.0959,
547
+ "date": "2026-05-29",
548
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
549
+ "modelRaw": "Claude Code Claude Opus 4.8"
550
+ },
551
+ {
552
+ "source": "terminal_bench",
553
+ "benchmark": "terminal_bench",
554
+ "metric": "accuracy",
555
+ "score": 0.6972,
556
+ "weight": 0.0861,
557
+ "date": "2026-05-01",
558
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
559
+ "modelRaw": "Claude Code Claude Opus 4.7"
560
+ }
561
+ ]
562
+ },
563
+ "test": {
564
+ "score": 0.7454,
565
+ "confidence": 0.0984,
566
+ "evidenceWeight": 0.2184,
567
+ "sources": [
568
+ {
569
+ "source": "terminal_bench",
570
+ "benchmark": "terminal_bench",
571
+ "metric": "accuracy",
572
+ "score": 0.7888,
573
+ "weight": 0.1151,
574
+ "date": "2026-05-29",
575
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
576
+ "modelRaw": "Claude Code Claude Opus 4.8"
577
+ },
578
+ {
579
+ "source": "terminal_bench",
580
+ "benchmark": "terminal_bench",
581
+ "metric": "accuracy",
582
+ "score": 0.6972,
583
+ "weight": 0.1033,
584
+ "date": "2026-05-01",
585
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
586
+ "modelRaw": "Claude Code Claude Opus 4.7"
587
+ }
588
+ ]
589
+ },
590
+ "security": {
591
+ "score": 0.7454,
592
+ "confidence": 0.0984,
593
+ "evidenceWeight": 0.2184,
594
+ "sources": [
595
+ {
596
+ "source": "terminal_bench",
597
+ "benchmark": "terminal_bench",
598
+ "metric": "accuracy",
599
+ "score": 0.7888,
600
+ "weight": 0.1151,
601
+ "date": "2026-05-29",
602
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
603
+ "modelRaw": "Claude Code Claude Opus 4.8"
604
+ },
605
+ {
606
+ "source": "terminal_bench",
607
+ "benchmark": "terminal_bench",
608
+ "metric": "accuracy",
609
+ "score": 0.6972,
610
+ "weight": 0.1033,
611
+ "date": "2026-05-01",
612
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
613
+ "modelRaw": "Claude Code Claude Opus 4.7"
614
+ }
615
+ ]
616
+ },
617
+ "perf": {
618
+ "score": 0.7454,
619
+ "confidence": 0.113,
620
+ "evidenceWeight": 0.2548,
621
+ "sources": [
622
+ {
623
+ "source": "terminal_bench",
624
+ "benchmark": "terminal_bench",
625
+ "metric": "accuracy",
626
+ "score": 0.7888,
627
+ "weight": 0.1342,
628
+ "date": "2026-05-29",
629
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
630
+ "modelRaw": "Claude Code Claude Opus 4.8"
631
+ },
632
+ {
633
+ "source": "terminal_bench",
634
+ "benchmark": "terminal_bench",
635
+ "metric": "accuracy",
636
+ "score": 0.6972,
637
+ "weight": 0.1205,
638
+ "date": "2026-05-01",
639
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
640
+ "modelRaw": "Claude Code Claude Opus 4.7"
641
+ }
642
+ ]
643
+ },
644
+ "general": {
645
+ "score": 0.7454,
646
+ "confidence": 0.1792,
647
+ "evidenceWeight": 0.4368,
648
+ "sources": [
649
+ {
650
+ "source": "terminal_bench",
651
+ "benchmark": "terminal_bench",
652
+ "metric": "accuracy",
653
+ "score": 0.7888,
654
+ "weight": 0.2301,
655
+ "date": "2026-05-29",
656
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
657
+ "modelRaw": "Claude Code Claude Opus 4.8"
658
+ },
659
+ {
660
+ "source": "terminal_bench",
661
+ "benchmark": "terminal_bench",
662
+ "metric": "accuracy",
663
+ "score": 0.6972,
664
+ "weight": 0.2066,
665
+ "date": "2026-05-01",
666
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
667
+ "modelRaw": "Claude Code Claude Opus 4.7"
668
+ }
669
+ ]
670
+ }
671
+ },
672
+ "codex:gpt-5.4": {
673
+ "bugfix": {
674
+ "score": 0.867,
675
+ "confidence": 0.0519,
676
+ "evidenceWeight": 0.1095,
677
+ "sources": [
678
+ {
679
+ "source": "aider_polyglot",
680
+ "benchmark": "aider_polyglot",
681
+ "metric": "pass_rate_2",
682
+ "score": 0.867,
683
+ "weight": 0.1095,
684
+ "sampleSize": 225,
685
+ "date": "2025-08-25",
686
+ "url": "https://aider.chat/docs/leaderboards/",
687
+ "modelRaw": "gpt-5 medium"
688
+ }
689
+ ]
690
+ },
691
+ "feature": {
692
+ "score": 0.867,
693
+ "confidence": 0.056,
694
+ "evidenceWeight": 0.1186,
695
+ "sources": [
696
+ {
697
+ "source": "aider_polyglot",
698
+ "benchmark": "aider_polyglot",
699
+ "metric": "pass_rate_2",
700
+ "score": 0.867,
701
+ "weight": 0.1186,
702
+ "sampleSize": 225,
703
+ "date": "2025-08-25",
704
+ "url": "https://aider.chat/docs/leaderboards/",
705
+ "modelRaw": "gpt-5 medium"
706
+ }
707
+ ]
708
+ },
709
+ "refactor": {
710
+ "score": 0.867,
711
+ "confidence": 0.068,
712
+ "evidenceWeight": 0.146,
713
+ "sources": [
714
+ {
715
+ "source": "aider_polyglot",
716
+ "benchmark": "aider_polyglot",
717
+ "metric": "pass_rate_2",
718
+ "score": 0.867,
719
+ "weight": 0.146,
720
+ "sampleSize": 225,
721
+ "date": "2025-08-25",
722
+ "url": "https://aider.chat/docs/leaderboards/",
723
+ "modelRaw": "gpt-5 medium"
724
+ }
725
+ ]
726
+ },
727
+ "test": {
728
+ "score": 0.867,
729
+ "confidence": 0.0394,
730
+ "evidenceWeight": 0.0821,
731
+ "sources": [
732
+ {
733
+ "source": "aider_polyglot",
734
+ "benchmark": "aider_polyglot",
735
+ "metric": "pass_rate_2",
736
+ "score": 0.867,
737
+ "weight": 0.0821,
738
+ "sampleSize": 225,
739
+ "date": "2025-08-25",
740
+ "url": "https://aider.chat/docs/leaderboards/",
741
+ "modelRaw": "gpt-5 medium"
742
+ }
743
+ ]
744
+ },
745
+ "docs": {
746
+ "score": 0.867,
747
+ "confidence": 0.0223,
748
+ "evidenceWeight": 0.0456,
749
+ "sources": [
750
+ {
751
+ "source": "aider_polyglot",
752
+ "benchmark": "aider_polyglot",
753
+ "metric": "pass_rate_2",
754
+ "score": 0.867,
755
+ "weight": 0.0456,
756
+ "sampleSize": 225,
757
+ "date": "2025-08-25",
758
+ "url": "https://aider.chat/docs/leaderboards/",
759
+ "modelRaw": "gpt-5 medium"
760
+ }
761
+ ]
762
+ },
763
+ "security": {
764
+ "score": 0.867,
765
+ "confidence": 0.0223,
766
+ "evidenceWeight": 0.0456,
767
+ "sources": [
768
+ {
769
+ "source": "aider_polyglot",
770
+ "benchmark": "aider_polyglot",
771
+ "metric": "pass_rate_2",
772
+ "score": 0.867,
773
+ "weight": 0.0456,
774
+ "sampleSize": 225,
775
+ "date": "2025-08-25",
776
+ "url": "https://aider.chat/docs/leaderboards/",
777
+ "modelRaw": "gpt-5 medium"
778
+ }
779
+ ]
780
+ },
781
+ "perf": {
782
+ "score": 0.867,
783
+ "confidence": 0.0309,
784
+ "evidenceWeight": 0.0639,
785
+ "sources": [
786
+ {
787
+ "source": "aider_polyglot",
788
+ "benchmark": "aider_polyglot",
789
+ "metric": "pass_rate_2",
790
+ "score": 0.867,
791
+ "weight": 0.0639,
792
+ "sampleSize": 225,
793
+ "date": "2025-08-25",
794
+ "url": "https://aider.chat/docs/leaderboards/",
795
+ "modelRaw": "gpt-5 medium"
796
+ }
797
+ ]
798
+ },
799
+ "general": {
800
+ "score": 0.867,
801
+ "confidence": 0.0478,
802
+ "evidenceWeight": 0.1004,
803
+ "sources": [
804
+ {
805
+ "source": "aider_polyglot",
806
+ "benchmark": "aider_polyglot",
807
+ "metric": "pass_rate_2",
808
+ "score": 0.867,
809
+ "weight": 0.1004,
810
+ "sampleSize": 225,
811
+ "date": "2025-08-25",
812
+ "url": "https://aider.chat/docs/leaderboards/",
813
+ "modelRaw": "gpt-5 medium"
814
+ }
815
+ ]
816
+ }
817
+ },
818
+ "codex:gpt-5.4-mini": {
819
+ "bugfix": {
820
+ "score": 0.813,
821
+ "confidence": 0.0519,
822
+ "evidenceWeight": 0.1095,
823
+ "sources": [
824
+ {
825
+ "source": "aider_polyglot",
826
+ "benchmark": "aider_polyglot",
827
+ "metric": "pass_rate_2",
828
+ "score": 0.813,
829
+ "weight": 0.1095,
830
+ "sampleSize": 225,
831
+ "date": "2025-08-25",
832
+ "url": "https://aider.chat/docs/leaderboards/",
833
+ "modelRaw": "gpt-5 low"
834
+ }
835
+ ]
836
+ },
837
+ "feature": {
838
+ "score": 0.813,
839
+ "confidence": 0.056,
840
+ "evidenceWeight": 0.1186,
841
+ "sources": [
842
+ {
843
+ "source": "aider_polyglot",
844
+ "benchmark": "aider_polyglot",
845
+ "metric": "pass_rate_2",
846
+ "score": 0.813,
847
+ "weight": 0.1186,
848
+ "sampleSize": 225,
849
+ "date": "2025-08-25",
850
+ "url": "https://aider.chat/docs/leaderboards/",
851
+ "modelRaw": "gpt-5 low"
852
+ }
853
+ ]
854
+ },
855
+ "refactor": {
856
+ "score": 0.813,
857
+ "confidence": 0.068,
858
+ "evidenceWeight": 0.146,
859
+ "sources": [
860
+ {
861
+ "source": "aider_polyglot",
862
+ "benchmark": "aider_polyglot",
863
+ "metric": "pass_rate_2",
864
+ "score": 0.813,
865
+ "weight": 0.146,
866
+ "sampleSize": 225,
867
+ "date": "2025-08-25",
868
+ "url": "https://aider.chat/docs/leaderboards/",
869
+ "modelRaw": "gpt-5 low"
870
+ }
871
+ ]
872
+ },
873
+ "test": {
874
+ "score": 0.813,
875
+ "confidence": 0.0394,
876
+ "evidenceWeight": 0.0821,
877
+ "sources": [
878
+ {
879
+ "source": "aider_polyglot",
880
+ "benchmark": "aider_polyglot",
881
+ "metric": "pass_rate_2",
882
+ "score": 0.813,
883
+ "weight": 0.0821,
884
+ "sampleSize": 225,
885
+ "date": "2025-08-25",
886
+ "url": "https://aider.chat/docs/leaderboards/",
887
+ "modelRaw": "gpt-5 low"
888
+ }
889
+ ]
890
+ },
891
+ "docs": {
892
+ "score": 0.813,
893
+ "confidence": 0.0223,
894
+ "evidenceWeight": 0.0456,
895
+ "sources": [
896
+ {
897
+ "source": "aider_polyglot",
898
+ "benchmark": "aider_polyglot",
899
+ "metric": "pass_rate_2",
900
+ "score": 0.813,
901
+ "weight": 0.0456,
902
+ "sampleSize": 225,
903
+ "date": "2025-08-25",
904
+ "url": "https://aider.chat/docs/leaderboards/",
905
+ "modelRaw": "gpt-5 low"
906
+ }
907
+ ]
908
+ },
909
+ "security": {
910
+ "score": 0.813,
911
+ "confidence": 0.0223,
912
+ "evidenceWeight": 0.0456,
913
+ "sources": [
914
+ {
915
+ "source": "aider_polyglot",
916
+ "benchmark": "aider_polyglot",
917
+ "metric": "pass_rate_2",
918
+ "score": 0.813,
919
+ "weight": 0.0456,
920
+ "sampleSize": 225,
921
+ "date": "2025-08-25",
922
+ "url": "https://aider.chat/docs/leaderboards/",
923
+ "modelRaw": "gpt-5 low"
924
+ }
925
+ ]
926
+ },
927
+ "perf": {
928
+ "score": 0.813,
929
+ "confidence": 0.0309,
930
+ "evidenceWeight": 0.0639,
931
+ "sources": [
932
+ {
933
+ "source": "aider_polyglot",
934
+ "benchmark": "aider_polyglot",
935
+ "metric": "pass_rate_2",
936
+ "score": 0.813,
937
+ "weight": 0.0639,
938
+ "sampleSize": 225,
939
+ "date": "2025-08-25",
940
+ "url": "https://aider.chat/docs/leaderboards/",
941
+ "modelRaw": "gpt-5 low"
942
+ }
943
+ ]
944
+ },
945
+ "general": {
946
+ "score": 0.813,
947
+ "confidence": 0.0478,
948
+ "evidenceWeight": 0.1004,
949
+ "sources": [
950
+ {
951
+ "source": "aider_polyglot",
952
+ "benchmark": "aider_polyglot",
953
+ "metric": "pass_rate_2",
954
+ "score": 0.813,
955
+ "weight": 0.1004,
956
+ "sampleSize": 225,
957
+ "date": "2025-08-25",
958
+ "url": "https://aider.chat/docs/leaderboards/",
959
+ "modelRaw": "gpt-5 low"
960
+ }
961
+ ]
962
+ }
963
+ },
964
+ "codex:gpt-5.5": {
965
+ "bugfix": {
966
+ "score": 0.8528,
967
+ "confidence": 0.1165,
968
+ "evidenceWeight": 0.2636,
969
+ "sources": [
970
+ {
971
+ "source": "terminal_bench",
972
+ "benchmark": "terminal_bench",
973
+ "metric": "accuracy",
974
+ "score": 0.8337,
975
+ "weight": 0.155,
976
+ "date": "2026-05-01",
977
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
978
+ "modelRaw": "Codex CLI GPT-5.5"
979
+ },
980
+ {
981
+ "source": "aider_polyglot",
982
+ "benchmark": "aider_polyglot",
983
+ "metric": "pass_rate_2",
984
+ "score": 0.88,
985
+ "weight": 0.1086,
986
+ "sampleSize": 225,
987
+ "date": "2025-08-23",
988
+ "url": "https://aider.chat/docs/leaderboards/",
989
+ "modelRaw": "gpt-5 high"
990
+ }
991
+ ]
992
+ },
993
+ "feature": {
994
+ "score": 0.8566,
995
+ "confidence": 0.1064,
996
+ "evidenceWeight": 0.2382,
997
+ "sources": [
998
+ {
999
+ "source": "terminal_bench",
1000
+ "benchmark": "terminal_bench",
1001
+ "metric": "accuracy",
1002
+ "score": 0.8337,
1003
+ "weight": 0.1205,
1004
+ "date": "2026-05-01",
1005
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
1006
+ "modelRaw": "Codex CLI GPT-5.5"
1007
+ },
1008
+ {
1009
+ "source": "aider_polyglot",
1010
+ "benchmark": "aider_polyglot",
1011
+ "metric": "pass_rate_2",
1012
+ "score": 0.88,
1013
+ "weight": 0.1177,
1014
+ "sampleSize": 225,
1015
+ "date": "2025-08-23",
1016
+ "url": "https://aider.chat/docs/leaderboards/",
1017
+ "modelRaw": "gpt-5 high"
1018
+ }
1019
+ ]
1020
+ },
1021
+ "refactor": {
1022
+ "score": 0.8627,
1023
+ "confidence": 0.1035,
1024
+ "evidenceWeight": 0.2309,
1025
+ "sources": [
1026
+ {
1027
+ "source": "aider_polyglot",
1028
+ "benchmark": "aider_polyglot",
1029
+ "metric": "pass_rate_2",
1030
+ "score": 0.88,
1031
+ "weight": 0.1449,
1032
+ "sampleSize": 225,
1033
+ "date": "2025-08-23",
1034
+ "url": "https://aider.chat/docs/leaderboards/",
1035
+ "modelRaw": "gpt-5 high"
1036
+ },
1037
+ {
1038
+ "source": "terminal_bench",
1039
+ "benchmark": "terminal_bench",
1040
+ "metric": "accuracy",
1041
+ "score": 0.8337,
1042
+ "weight": 0.0861,
1043
+ "date": "2026-05-01",
1044
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
1045
+ "modelRaw": "Codex CLI GPT-5.5"
1046
+ }
1047
+ ]
1048
+ },
1049
+ "test": {
1050
+ "score": 0.8541,
1051
+ "confidence": 0.0846,
1052
+ "evidenceWeight": 0.1848,
1053
+ "sources": [
1054
+ {
1055
+ "source": "terminal_bench",
1056
+ "benchmark": "terminal_bench",
1057
+ "metric": "accuracy",
1058
+ "score": 0.8337,
1059
+ "weight": 0.1033,
1060
+ "date": "2026-05-01",
1061
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
1062
+ "modelRaw": "Codex CLI GPT-5.5"
1063
+ },
1064
+ {
1065
+ "source": "aider_polyglot",
1066
+ "benchmark": "aider_polyglot",
1067
+ "metric": "pass_rate_2",
1068
+ "score": 0.88,
1069
+ "weight": 0.0815,
1070
+ "sampleSize": 225,
1071
+ "date": "2025-08-23",
1072
+ "url": "https://aider.chat/docs/leaderboards/",
1073
+ "modelRaw": "gpt-5 high"
1074
+ }
1075
+ ]
1076
+ },
1077
+ "docs": {
1078
+ "score": 0.88,
1079
+ "confidence": 0.0221,
1080
+ "evidenceWeight": 0.0453,
1081
+ "sources": [
1082
+ {
1083
+ "source": "aider_polyglot",
1084
+ "benchmark": "aider_polyglot",
1085
+ "metric": "pass_rate_2",
1086
+ "score": 0.88,
1087
+ "weight": 0.0453,
1088
+ "sampleSize": 225,
1089
+ "date": "2025-08-23",
1090
+ "url": "https://aider.chat/docs/leaderboards/",
1091
+ "modelRaw": "gpt-5 high"
1092
+ }
1093
+ ]
1094
+ },
1095
+ "security": {
1096
+ "score": 0.8478,
1097
+ "confidence": 0.0692,
1098
+ "evidenceWeight": 0.1486,
1099
+ "sources": [
1100
+ {
1101
+ "source": "terminal_bench",
1102
+ "benchmark": "terminal_bench",
1103
+ "metric": "accuracy",
1104
+ "score": 0.8337,
1105
+ "weight": 0.1033,
1106
+ "date": "2026-05-01",
1107
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
1108
+ "modelRaw": "Codex CLI GPT-5.5"
1109
+ },
1110
+ {
1111
+ "source": "aider_polyglot",
1112
+ "benchmark": "aider_polyglot",
1113
+ "metric": "pass_rate_2",
1114
+ "score": 0.88,
1115
+ "weight": 0.0453,
1116
+ "sampleSize": 225,
1117
+ "date": "2025-08-23",
1118
+ "url": "https://aider.chat/docs/leaderboards/",
1119
+ "modelRaw": "gpt-5 high"
1120
+ }
1121
+ ]
1122
+ },
1123
+ "perf": {
1124
+ "score": 0.8497,
1125
+ "confidence": 0.0842,
1126
+ "evidenceWeight": 0.1839,
1127
+ "sources": [
1128
+ {
1129
+ "source": "terminal_bench",
1130
+ "benchmark": "terminal_bench",
1131
+ "metric": "accuracy",
1132
+ "score": 0.8337,
1133
+ "weight": 0.1205,
1134
+ "date": "2026-05-01",
1135
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
1136
+ "modelRaw": "Codex CLI GPT-5.5"
1137
+ },
1138
+ {
1139
+ "source": "aider_polyglot",
1140
+ "benchmark": "aider_polyglot",
1141
+ "metric": "pass_rate_2",
1142
+ "score": 0.88,
1143
+ "weight": 0.0634,
1144
+ "sampleSize": 225,
1145
+ "date": "2025-08-23",
1146
+ "url": "https://aider.chat/docs/leaderboards/",
1147
+ "modelRaw": "gpt-5 high"
1148
+ }
1149
+ ]
1150
+ },
1151
+ "general": {
1152
+ "score": 0.8488,
1153
+ "confidence": 0.1328,
1154
+ "evidenceWeight": 0.3062,
1155
+ "sources": [
1156
+ {
1157
+ "source": "terminal_bench",
1158
+ "benchmark": "terminal_bench",
1159
+ "metric": "accuracy",
1160
+ "score": 0.8337,
1161
+ "weight": 0.2066,
1162
+ "date": "2026-05-01",
1163
+ "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.1",
1164
+ "modelRaw": "Codex CLI GPT-5.5"
1165
+ },
1166
+ {
1167
+ "source": "aider_polyglot",
1168
+ "benchmark": "aider_polyglot",
1169
+ "metric": "pass_rate_2",
1170
+ "score": 0.88,
1171
+ "weight": 0.0996,
1172
+ "sampleSize": 225,
1173
+ "date": "2025-08-23",
1174
+ "url": "https://aider.chat/docs/leaderboards/",
1175
+ "modelRaw": "gpt-5 high"
1176
+ }
1177
+ ]
1178
+ }
1179
+ }
1180
+ }
1181
+ }