aiforcecli-chat 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/License.MD +49 -0
- package/README.md +642 -0
- package/aiforcecli.config.example.json +66 -0
- package/assets/README.md +14 -0
- package/dist/cli.js +2 -0
- package/dist/index.js +2 -0
- package/package.json +62 -0
- package/tools/scorecard/README.md +92 -0
- package/tools/scorecard/config.json +134 -0
- package/tools/scorecard/fetch.mjs +335 -0
- package/tools/scorecard/generate.mjs +289 -0
- package/tools/scorecard/generated/example/invalid-rows.json +1 -0
- package/tools/scorecard/generated/example/scorecard-report.md +147 -0
- package/tools/scorecard/generated/example/scorecard.compact.json +61 -0
- package/tools/scorecard/generated/example/scorecard.json +1492 -0
- package/tools/scorecard/generated/example/unmapped-models.json +1492 -0
- package/tools/scorecard/generated/raw/aider_polyglot.html +21071 -0
- package/tools/scorecard/generated/raw/terminal_bench_2_1.html +2 -0
- package/tools/scorecard/generated/scorecard/invalid-rows.json +1 -0
- package/tools/scorecard/generated/scorecard/scorecard-report.md +133 -0
- package/tools/scorecard/generated/scorecard/scorecard.compact.json +51 -0
- package/tools/scorecard/generated/scorecard/scorecard.json +1181 -0
- package/tools/scorecard/generated/scorecard/unmapped-models.json +1492 -0
- package/tools/scorecard/generated/scorecard-example/invalid-rows.json +1 -0
- package/tools/scorecard/generated/scorecard-example/scorecard-report.md +40 -0
- package/tools/scorecard/generated/scorecard-example/scorecard.compact.json +22 -0
- package/tools/scorecard/generated/scorecard-example/scorecard.json +389 -0
- package/tools/scorecard/generated/scorecard-example/unmapped-models.json +1 -0
- package/tools/scorecard/generated/scorecard-fetch/raw/aider_polyglot.html +21071 -0
- package/tools/scorecard/generated/scorecard-fetch/raw/terminal_bench_2_1.html +2 -0
- package/tools/scorecard/snapshots/example.normalized.example.json +38 -0
- package/tools/scorecard/snapshots/live.aider_polyglot.json +1318 -0
- package/tools/scorecard/snapshots/live.terminal_bench_2_1.json +294 -0
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
# Generated Scorecard Report
|
|
2
|
+
|
|
3
|
+
Generated: 2026-06-16T20:38:21.186Z
|
|
4
|
+
Snapshots: 3
|
|
5
|
+
Mapped models: 6
|
|
6
|
+
Unmapped rows: 69
|
|
7
|
+
Invalid rows: 0
|
|
8
|
+
|
|
9
|
+
## Snapshot Files
|
|
10
|
+
|
|
11
|
+
- tools\scorecard\snapshots\example.normalized.example.json
|
|
12
|
+
- tools\scorecard\snapshots\live.aider_polyglot.json
|
|
13
|
+
- tools\scorecard\snapshots\live.terminal_bench_2_1.json
|
|
14
|
+
|
|
15
|
+
## Scores
|
|
16
|
+
|
|
17
|
+
### antigravity:gemini-3.1-pro
|
|
18
|
+
|
|
19
|
+
| Task | Score | Confidence | Evidence Weight | Top Evidence |
|
|
20
|
+
| --- | ---: | ---: | ---: | --- |
|
|
21
|
+
| bugfix | 73.3% | 21.4% | 0.5456 | terminal_bench/accuracy (Gemini CLI Gemini 3.1 Pro) |
|
|
22
|
+
| feature | 74.2% | 19.9% | 0.4954 | terminal_bench/accuracy (Gemini CLI Gemini 3.1 Pro) |
|
|
23
|
+
| refactor | 75.7% | 19.5% | 0.484 | aider_polyglot/pass_rate_2 (gemini-2.5-pro-preview-06-05 32k think) |
|
|
24
|
+
| test | 73.6% | 16.1% | 0.3831 | terminal_bench/accuracy (Gemini CLI Gemini 3.1 Pro) |
|
|
25
|
+
| docs | 79.8% | 4.6% | 0.0969 | aider_polyglot/pass_rate_2 (gemini-2.5-pro-preview-06-05 32k think) |
|
|
26
|
+
| security | 72.1% | 13.3% | 0.3056 | terminal_bench/accuracy (Gemini CLI Gemini 3.1 Pro) |
|
|
27
|
+
| perf | 72.5% | 15.9% | 0.3791 | terminal_bench/accuracy (Gemini CLI Gemini 3.1 Pro) |
|
|
28
|
+
| general | 72.3% | 24% | 0.6305 | terminal_bench/accuracy (Gemini CLI Gemini 3.1 Pro) |
|
|
29
|
+
|
|
30
|
+
### claude-code:opus
|
|
31
|
+
|
|
32
|
+
| Task | Score | Confidence | Evidence Weight | Top Evidence |
|
|
33
|
+
| --- | ---: | ---: | ---: | --- |
|
|
34
|
+
| bugfix | 74.5% | 14.1% | 0.3276 | terminal_bench/accuracy (Claude Code Claude Opus 4.8) |
|
|
35
|
+
| feature | 74.5% | 11.3% | 0.2548 | terminal_bench/accuracy (Claude Code Claude Opus 4.8) |
|
|
36
|
+
| refactor | 74.5% | 8.3% | 0.182 | terminal_bench/accuracy (Claude Code Claude Opus 4.8) |
|
|
37
|
+
| test | 74.5% | 9.8% | 0.2184 | terminal_bench/accuracy (Claude Code Claude Opus 4.8) |
|
|
38
|
+
| security | 74.5% | 9.8% | 0.2184 | terminal_bench/accuracy (Claude Code Claude Opus 4.8) |
|
|
39
|
+
| perf | 74.5% | 11.3% | 0.2548 | terminal_bench/accuracy (Claude Code Claude Opus 4.8) |
|
|
40
|
+
| general | 74.5% | 17.9% | 0.4368 | terminal_bench/accuracy (Claude Code Claude Opus 4.8) |
|
|
41
|
+
|
|
42
|
+
### claude-code:sonnet
|
|
43
|
+
|
|
44
|
+
| Task | Score | Confidence | Evidence Weight | Top Evidence |
|
|
45
|
+
| --- | ---: | ---: | ---: | --- |
|
|
46
|
+
| bugfix | 71% | 38.7% | 1.2626 | swebench_verified/resolved (Claude Sonnet) |
|
|
47
|
+
| feature | 69.9% | 25.3% | 0.6779 | aider_polyglot/percent_correct (Claude Sonnet) |
|
|
48
|
+
| refactor | 70.2% | 32.1% | 0.9465 | swebench_verified/resolved (Claude Sonnet) |
|
|
49
|
+
| test | 70% | 19.2% | 0.4766 | aider_polyglot/percent_correct (Claude Sonnet) |
|
|
50
|
+
| docs | 68% | 6.3% | 0.1341 | aider_polyglot/percent_correct (Claude Sonnet) |
|
|
51
|
+
| security | 70.6% | 15.6% | 0.3693 | swebench_verified/resolved (Claude Sonnet) |
|
|
52
|
+
| perf | 70.2% | 17.5% | 0.4229 | swebench_verified/resolved (Claude Sonnet) |
|
|
53
|
+
| general | 70.4% | 26.4% | 0.7184 | swebench_verified/resolved (Claude Sonnet) |
|
|
54
|
+
|
|
55
|
+
### codex:gpt-5.4
|
|
56
|
+
|
|
57
|
+
| Task | Score | Confidence | Evidence Weight | Top Evidence |
|
|
58
|
+
| --- | ---: | ---: | ---: | --- |
|
|
59
|
+
| bugfix | 86.7% | 5.2% | 0.1095 | aider_polyglot/pass_rate_2 (gpt-5 medium) |
|
|
60
|
+
| feature | 86.7% | 5.6% | 0.1186 | aider_polyglot/pass_rate_2 (gpt-5 medium) |
|
|
61
|
+
| refactor | 86.7% | 6.8% | 0.146 | aider_polyglot/pass_rate_2 (gpt-5 medium) |
|
|
62
|
+
| test | 86.7% | 3.9% | 0.0821 | aider_polyglot/pass_rate_2 (gpt-5 medium) |
|
|
63
|
+
| docs | 86.7% | 2.2% | 0.0456 | aider_polyglot/pass_rate_2 (gpt-5 medium) |
|
|
64
|
+
| security | 86.7% | 2.2% | 0.0456 | aider_polyglot/pass_rate_2 (gpt-5 medium) |
|
|
65
|
+
| perf | 86.7% | 3.1% | 0.0639 | aider_polyglot/pass_rate_2 (gpt-5 medium) |
|
|
66
|
+
| general | 86.7% | 4.8% | 0.1004 | aider_polyglot/pass_rate_2 (gpt-5 medium) |
|
|
67
|
+
|
|
68
|
+
### codex:gpt-5.4-mini
|
|
69
|
+
|
|
70
|
+
| Task | Score | Confidence | Evidence Weight | Top Evidence |
|
|
71
|
+
| --- | ---: | ---: | ---: | --- |
|
|
72
|
+
| bugfix | 77.4% | 10.6% | 0.2364 | evalplus/pass_at_1 (gpt-5.4-mini) |
|
|
73
|
+
| feature | 76.2% | 16.6% | 0.3978 | evalplus/pass_at_1 (gpt-5.4-mini) |
|
|
74
|
+
| refactor | 77.9% | 12% | 0.2729 | aider_polyglot/pass_rate_2 (gpt-5 low) |
|
|
75
|
+
| test | 75.4% | 18% | 0.4375 | evalplus/pass_at_1 (gpt-5.4-mini) |
|
|
76
|
+
| docs | 77.5% | 4.6% | 0.0964 | evalplus/pass_at_1 (gpt-5.4-mini) |
|
|
77
|
+
| security | 76.7% | 5.7% | 0.1218 | evalplus/pass_at_1 (gpt-5.4-mini) |
|
|
78
|
+
| perf | 76.8% | 7.6% | 0.1654 | evalplus/pass_at_1 (gpt-5.4-mini) |
|
|
79
|
+
| general | 76.2% | 14.1% | 0.3288 | evalplus/pass_at_1 (gpt-5.4-mini) |
|
|
80
|
+
|
|
81
|
+
### codex:gpt-5.5
|
|
82
|
+
|
|
83
|
+
| Task | Score | Confidence | Evidence Weight | Top Evidence |
|
|
84
|
+
| --- | ---: | ---: | ---: | --- |
|
|
85
|
+
| bugfix | 85.3% | 11.7% | 0.2636 | terminal_bench/accuracy (Codex CLI GPT-5.5) |
|
|
86
|
+
| feature | 85.7% | 10.6% | 0.2382 | terminal_bench/accuracy (Codex CLI GPT-5.5) |
|
|
87
|
+
| refactor | 86.3% | 10.4% | 0.231 | aider_polyglot/pass_rate_2 (gpt-5 high) |
|
|
88
|
+
| test | 85.4% | 8.5% | 0.1848 | terminal_bench/accuracy (Codex CLI GPT-5.5) |
|
|
89
|
+
| docs | 88% | 2.2% | 0.0453 | aider_polyglot/pass_rate_2 (gpt-5 high) |
|
|
90
|
+
| security | 84.8% | 6.9% | 0.1486 | terminal_bench/accuracy (Codex CLI GPT-5.5) |
|
|
91
|
+
| perf | 85% | 8.4% | 0.1839 | terminal_bench/accuracy (Codex CLI GPT-5.5) |
|
|
92
|
+
| general | 84.9% | 13.3% | 0.3062 | terminal_bench/accuracy (Codex CLI GPT-5.5) |
|
|
93
|
+
|
|
94
|
+
## Unmapped Models
|
|
95
|
+
|
|
96
|
+
- o3-pro high from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
|
|
97
|
+
- o3 high from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
|
|
98
|
+
- grok-4 high from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
|
|
99
|
+
- o3 high + gpt-4.1 from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
|
|
100
|
+
- o3 from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
|
|
101
|
+
- DeepSeek-V3.2-Exp Reasoner from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
|
|
102
|
+
- Gemini 2.5 Pro Preview 03-25 from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
|
|
103
|
+
- claude-opus-4-20250514 32k thinking from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
|
|
104
|
+
- o4-mini high from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
|
|
105
|
+
- DeepSeek R1 0528 from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
|
|
106
|
+
- claude-opus-4-20250514 no think from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
|
|
107
|
+
- DeepSeek-V3.2-Exp Chat from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
|
|
108
|
+
- claude-3-7-sonnet-20250219 32k thinking tokens from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
|
|
109
|
+
- DeepSeek R1 + claude-3-5-sonnet-20241022 from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
|
|
110
|
+
- o1-2024-12-17 high from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
|
|
111
|
+
- claude-sonnet-4-20250514 32k thinking from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
|
|
112
|
+
- claude-3-7-sonnet-20250219 no thinking from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
|
|
113
|
+
- o3-mini high from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
|
|
114
|
+
- Qwen3 235B A22B diff, no think, Alibaba API from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
|
|
115
|
+
- Kimi K2 from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
|
|
116
|
+
- DeepSeek R1 from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
|
|
117
|
+
- claude-sonnet-4-20250514 no thinking from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
|
|
118
|
+
- gemini-2.5-flash-preview-05-20 24k think from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
|
|
119
|
+
- DeepSeek V3 0324 from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
|
|
120
|
+
- Quasar Alpha from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
|
|
121
|
+
- o3-mini medium from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
|
|
122
|
+
- Grok 3 Beta from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
|
|
123
|
+
- Optimus Alpha from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
|
|
124
|
+
- gpt-4.1 from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
|
|
125
|
+
- claude-3-5-sonnet-20241022 from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
|
|
126
|
+
- Grok 3 Mini Beta high from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
|
|
127
|
+
- DeepSeek Chat V3 prev from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
|
|
128
|
+
- gemini-2.5-flash-preview-04-17 default from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
|
|
129
|
+
- chatgpt-4o-latest 2025-03-29 from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
|
|
130
|
+
- gpt-4.5-preview from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
|
|
131
|
+
- gemini-2.5-flash-preview-05-20 no think from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
|
|
132
|
+
- gpt-oss-120b high from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
|
|
133
|
+
- Qwen3 32B from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
|
|
134
|
+
- gemini-exp-1206 from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
|
|
135
|
+
- Gemini 2.0 Pro exp-02-05 from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
|
|
136
|
+
- Grok 3 Mini Beta low from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
|
|
137
|
+
- o1-mini-2024-09-12 from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
|
|
138
|
+
- gpt-4.1-mini from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
|
|
139
|
+
- claude-3-5-haiku-20241022 from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
|
|
140
|
+
- chatgpt-4o-latest 2025-02-15 from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
|
|
141
|
+
- QwQ-32B + Qwen 2.5 Coder Instruct from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
|
|
142
|
+
- gpt-4o-2024-08-06 from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
|
|
143
|
+
- gemini-2.0-flash-exp from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
|
|
144
|
+
- qwen-max-2025-01-25 from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
|
|
145
|
+
- QwQ-32B from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
|
|
146
|
+
- ...19 more
|
|
147
|
+
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
{
|
|
2
|
+
"antigravity:gemini-3.1-pro": {
|
|
3
|
+
"bugfix": 0.7331,
|
|
4
|
+
"feature": 0.7424,
|
|
5
|
+
"refactor": 0.7574,
|
|
6
|
+
"test": 0.7364,
|
|
7
|
+
"docs": 0.7981,
|
|
8
|
+
"security": 0.7208,
|
|
9
|
+
"perf": 0.7254,
|
|
10
|
+
"general": 0.7231
|
|
11
|
+
},
|
|
12
|
+
"claude-code:opus": {
|
|
13
|
+
"bugfix": 0.7454,
|
|
14
|
+
"feature": 0.7454,
|
|
15
|
+
"refactor": 0.7454,
|
|
16
|
+
"test": 0.7454,
|
|
17
|
+
"security": 0.7454,
|
|
18
|
+
"perf": 0.7454,
|
|
19
|
+
"general": 0.7454
|
|
20
|
+
},
|
|
21
|
+
"claude-code:sonnet": {
|
|
22
|
+
"bugfix": 0.7098,
|
|
23
|
+
"feature": 0.6994,
|
|
24
|
+
"refactor": 0.7019,
|
|
25
|
+
"test": 0.6997,
|
|
26
|
+
"docs": 0.68,
|
|
27
|
+
"security": 0.7055,
|
|
28
|
+
"perf": 0.7022,
|
|
29
|
+
"general": 0.7036
|
|
30
|
+
},
|
|
31
|
+
"codex:gpt-5.4": {
|
|
32
|
+
"bugfix": 0.867,
|
|
33
|
+
"feature": 0.867,
|
|
34
|
+
"refactor": 0.867,
|
|
35
|
+
"test": 0.867,
|
|
36
|
+
"docs": 0.867,
|
|
37
|
+
"security": 0.867,
|
|
38
|
+
"perf": 0.867,
|
|
39
|
+
"general": 0.867
|
|
40
|
+
},
|
|
41
|
+
"codex:gpt-5.4-mini": {
|
|
42
|
+
"bugfix": 0.7738,
|
|
43
|
+
"feature": 0.7618,
|
|
44
|
+
"refactor": 0.7791,
|
|
45
|
+
"test": 0.7537,
|
|
46
|
+
"docs": 0.7746,
|
|
47
|
+
"security": 0.7673,
|
|
48
|
+
"perf": 0.7682,
|
|
49
|
+
"general": 0.7623
|
|
50
|
+
},
|
|
51
|
+
"codex:gpt-5.5": {
|
|
52
|
+
"bugfix": 0.8528,
|
|
53
|
+
"feature": 0.8566,
|
|
54
|
+
"refactor": 0.8627,
|
|
55
|
+
"test": 0.8541,
|
|
56
|
+
"docs": 0.88,
|
|
57
|
+
"security": 0.8478,
|
|
58
|
+
"perf": 0.8497,
|
|
59
|
+
"general": 0.8488
|
|
60
|
+
}
|
|
61
|
+
}
|