aiforcecli-chat 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. package/License.MD +49 -0
  2. package/README.md +642 -0
  3. package/aiforcecli.config.example.json +66 -0
  4. package/assets/README.md +14 -0
  5. package/dist/cli.js +2 -0
  6. package/dist/index.js +2 -0
  7. package/package.json +62 -0
  8. package/tools/scorecard/README.md +92 -0
  9. package/tools/scorecard/config.json +134 -0
  10. package/tools/scorecard/fetch.mjs +335 -0
  11. package/tools/scorecard/generate.mjs +289 -0
  12. package/tools/scorecard/generated/example/invalid-rows.json +1 -0
  13. package/tools/scorecard/generated/example/scorecard-report.md +147 -0
  14. package/tools/scorecard/generated/example/scorecard.compact.json +61 -0
  15. package/tools/scorecard/generated/example/scorecard.json +1492 -0
  16. package/tools/scorecard/generated/example/unmapped-models.json +1492 -0
  17. package/tools/scorecard/generated/raw/aider_polyglot.html +21071 -0
  18. package/tools/scorecard/generated/raw/terminal_bench_2_1.html +2 -0
  19. package/tools/scorecard/generated/scorecard/invalid-rows.json +1 -0
  20. package/tools/scorecard/generated/scorecard/scorecard-report.md +133 -0
  21. package/tools/scorecard/generated/scorecard/scorecard.compact.json +51 -0
  22. package/tools/scorecard/generated/scorecard/scorecard.json +1181 -0
  23. package/tools/scorecard/generated/scorecard/unmapped-models.json +1492 -0
  24. package/tools/scorecard/generated/scorecard-example/invalid-rows.json +1 -0
  25. package/tools/scorecard/generated/scorecard-example/scorecard-report.md +40 -0
  26. package/tools/scorecard/generated/scorecard-example/scorecard.compact.json +22 -0
  27. package/tools/scorecard/generated/scorecard-example/scorecard.json +389 -0
  28. package/tools/scorecard/generated/scorecard-example/unmapped-models.json +1 -0
  29. package/tools/scorecard/generated/scorecard-fetch/raw/aider_polyglot.html +21071 -0
  30. package/tools/scorecard/generated/scorecard-fetch/raw/terminal_bench_2_1.html +2 -0
  31. package/tools/scorecard/snapshots/example.normalized.example.json +38 -0
  32. package/tools/scorecard/snapshots/live.aider_polyglot.json +1318 -0
  33. package/tools/scorecard/snapshots/live.terminal_bench_2_1.json +294 -0
@@ -0,0 +1,147 @@
1
+ # Generated Scorecard Report
2
+
3
+ Generated: 2026-06-16T20:38:21.186Z
4
+ Snapshots: 3
5
+ Mapped models: 6
6
+ Unmapped rows: 69
7
+ Invalid rows: 0
8
+
9
+ ## Snapshot Files
10
+
11
+ - tools\scorecard\snapshots\example.normalized.example.json
12
+ - tools\scorecard\snapshots\live.aider_polyglot.json
13
+ - tools\scorecard\snapshots\live.terminal_bench_2_1.json
14
+
15
+ ## Scores
16
+
17
+ ### antigravity:gemini-3.1-pro
18
+
19
+ | Task | Score | Confidence | Evidence Weight | Top Evidence |
20
+ | --- | ---: | ---: | ---: | --- |
21
+ | bugfix | 73.3% | 21.4% | 0.5456 | terminal_bench/accuracy (Gemini CLI Gemini 3.1 Pro) |
22
+ | feature | 74.2% | 19.9% | 0.4954 | terminal_bench/accuracy (Gemini CLI Gemini 3.1 Pro) |
23
+ | refactor | 75.7% | 19.5% | 0.484 | aider_polyglot/pass_rate_2 (gemini-2.5-pro-preview-06-05 32k think) |
24
+ | test | 73.6% | 16.1% | 0.3831 | terminal_bench/accuracy (Gemini CLI Gemini 3.1 Pro) |
25
+ | docs | 79.8% | 4.6% | 0.0969 | aider_polyglot/pass_rate_2 (gemini-2.5-pro-preview-06-05 32k think) |
26
+ | security | 72.1% | 13.3% | 0.3056 | terminal_bench/accuracy (Gemini CLI Gemini 3.1 Pro) |
27
+ | perf | 72.5% | 15.9% | 0.3791 | terminal_bench/accuracy (Gemini CLI Gemini 3.1 Pro) |
28
+ | general | 72.3% | 24% | 0.6305 | terminal_bench/accuracy (Gemini CLI Gemini 3.1 Pro) |
29
+
30
+ ### claude-code:opus
31
+
32
+ | Task | Score | Confidence | Evidence Weight | Top Evidence |
33
+ | --- | ---: | ---: | ---: | --- |
34
+ | bugfix | 74.5% | 14.1% | 0.3276 | terminal_bench/accuracy (Claude Code Claude Opus 4.8) |
35
+ | feature | 74.5% | 11.3% | 0.2548 | terminal_bench/accuracy (Claude Code Claude Opus 4.8) |
36
+ | refactor | 74.5% | 8.3% | 0.182 | terminal_bench/accuracy (Claude Code Claude Opus 4.8) |
37
+ | test | 74.5% | 9.8% | 0.2184 | terminal_bench/accuracy (Claude Code Claude Opus 4.8) |
38
+ | security | 74.5% | 9.8% | 0.2184 | terminal_bench/accuracy (Claude Code Claude Opus 4.8) |
39
+ | perf | 74.5% | 11.3% | 0.2548 | terminal_bench/accuracy (Claude Code Claude Opus 4.8) |
40
+ | general | 74.5% | 17.9% | 0.4368 | terminal_bench/accuracy (Claude Code Claude Opus 4.8) |
41
+
42
+ ### claude-code:sonnet
43
+
44
+ | Task | Score | Confidence | Evidence Weight | Top Evidence |
45
+ | --- | ---: | ---: | ---: | --- |
46
+ | bugfix | 71% | 38.7% | 1.2626 | swebench_verified/resolved (Claude Sonnet) |
47
+ | feature | 69.9% | 25.3% | 0.6779 | aider_polyglot/percent_correct (Claude Sonnet) |
48
+ | refactor | 70.2% | 32.1% | 0.9465 | swebench_verified/resolved (Claude Sonnet) |
49
+ | test | 70% | 19.2% | 0.4766 | aider_polyglot/percent_correct (Claude Sonnet) |
50
+ | docs | 68% | 6.3% | 0.1341 | aider_polyglot/percent_correct (Claude Sonnet) |
51
+ | security | 70.6% | 15.6% | 0.3693 | swebench_verified/resolved (Claude Sonnet) |
52
+ | perf | 70.2% | 17.5% | 0.4229 | swebench_verified/resolved (Claude Sonnet) |
53
+ | general | 70.4% | 26.4% | 0.7184 | swebench_verified/resolved (Claude Sonnet) |
54
+
55
+ ### codex:gpt-5.4
56
+
57
+ | Task | Score | Confidence | Evidence Weight | Top Evidence |
58
+ | --- | ---: | ---: | ---: | --- |
59
+ | bugfix | 86.7% | 5.2% | 0.1095 | aider_polyglot/pass_rate_2 (gpt-5 medium) |
60
+ | feature | 86.7% | 5.6% | 0.1186 | aider_polyglot/pass_rate_2 (gpt-5 medium) |
61
+ | refactor | 86.7% | 6.8% | 0.146 | aider_polyglot/pass_rate_2 (gpt-5 medium) |
62
+ | test | 86.7% | 3.9% | 0.0821 | aider_polyglot/pass_rate_2 (gpt-5 medium) |
63
+ | docs | 86.7% | 2.2% | 0.0456 | aider_polyglot/pass_rate_2 (gpt-5 medium) |
64
+ | security | 86.7% | 2.2% | 0.0456 | aider_polyglot/pass_rate_2 (gpt-5 medium) |
65
+ | perf | 86.7% | 3.1% | 0.0639 | aider_polyglot/pass_rate_2 (gpt-5 medium) |
66
+ | general | 86.7% | 4.8% | 0.1004 | aider_polyglot/pass_rate_2 (gpt-5 medium) |
67
+
68
+ ### codex:gpt-5.4-mini
69
+
70
+ | Task | Score | Confidence | Evidence Weight | Top Evidence |
71
+ | --- | ---: | ---: | ---: | --- |
72
+ | bugfix | 77.4% | 10.6% | 0.2364 | evalplus/pass_at_1 (gpt-5.4-mini) |
73
+ | feature | 76.2% | 16.6% | 0.3978 | evalplus/pass_at_1 (gpt-5.4-mini) |
74
+ | refactor | 77.9% | 12% | 0.2729 | aider_polyglot/pass_rate_2 (gpt-5 low) |
75
+ | test | 75.4% | 18% | 0.4375 | evalplus/pass_at_1 (gpt-5.4-mini) |
76
+ | docs | 77.5% | 4.6% | 0.0964 | evalplus/pass_at_1 (gpt-5.4-mini) |
77
+ | security | 76.7% | 5.7% | 0.1218 | evalplus/pass_at_1 (gpt-5.4-mini) |
78
+ | perf | 76.8% | 7.6% | 0.1654 | evalplus/pass_at_1 (gpt-5.4-mini) |
79
+ | general | 76.2% | 14.1% | 0.3288 | evalplus/pass_at_1 (gpt-5.4-mini) |
80
+
81
+ ### codex:gpt-5.5
82
+
83
+ | Task | Score | Confidence | Evidence Weight | Top Evidence |
84
+ | --- | ---: | ---: | ---: | --- |
85
+ | bugfix | 85.3% | 11.7% | 0.2636 | terminal_bench/accuracy (Codex CLI GPT-5.5) |
86
+ | feature | 85.7% | 10.6% | 0.2382 | terminal_bench/accuracy (Codex CLI GPT-5.5) |
87
+ | refactor | 86.3% | 10.4% | 0.231 | aider_polyglot/pass_rate_2 (gpt-5 high) |
88
+ | test | 85.4% | 8.5% | 0.1848 | terminal_bench/accuracy (Codex CLI GPT-5.5) |
89
+ | docs | 88% | 2.2% | 0.0453 | aider_polyglot/pass_rate_2 (gpt-5 high) |
90
+ | security | 84.8% | 6.9% | 0.1486 | terminal_bench/accuracy (Codex CLI GPT-5.5) |
91
+ | perf | 85% | 8.4% | 0.1839 | terminal_bench/accuracy (Codex CLI GPT-5.5) |
92
+ | general | 84.9% | 13.3% | 0.3062 | terminal_bench/accuracy (Codex CLI GPT-5.5) |
93
+
94
+ ## Unmapped Models
95
+
96
+ - o3-pro high from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
97
+ - o3 high from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
98
+ - grok-4 high from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
99
+ - o3 high + gpt-4.1 from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
100
+ - o3 from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
101
+ - DeepSeek-V3.2-Exp Reasoner from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
102
+ - Gemini 2.5 Pro Preview 03-25 from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
103
+ - claude-opus-4-20250514 32k thinking from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
104
+ - o4-mini high from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
105
+ - DeepSeek R1 0528 from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
106
+ - claude-opus-4-20250514 no think from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
107
+ - DeepSeek-V3.2-Exp Chat from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
108
+ - claude-3-7-sonnet-20250219 32k thinking tokens from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
109
+ - DeepSeek R1 + claude-3-5-sonnet-20241022 from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
110
+ - o1-2024-12-17 high from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
111
+ - claude-sonnet-4-20250514 32k thinking from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
112
+ - claude-3-7-sonnet-20250219 no thinking from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
113
+ - o3-mini high from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
114
+ - Qwen3 235B A22B diff, no think, Alibaba API from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
115
+ - Kimi K2 from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
116
+ - DeepSeek R1 from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
117
+ - claude-sonnet-4-20250514 no thinking from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
118
+ - gemini-2.5-flash-preview-05-20 24k think from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
119
+ - DeepSeek V3 0324 from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
120
+ - Quasar Alpha from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
121
+ - o3-mini medium from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
122
+ - Grok 3 Beta from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
123
+ - Optimus Alpha from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
124
+ - gpt-4.1 from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
125
+ - claude-3-5-sonnet-20241022 from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
126
+ - Grok 3 Mini Beta high from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
127
+ - DeepSeek Chat V3 prev from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
128
+ - gemini-2.5-flash-preview-04-17 default from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
129
+ - chatgpt-4o-latest 2025-03-29 from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
130
+ - gpt-4.5-preview from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
131
+ - gemini-2.5-flash-preview-05-20 no think from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
132
+ - gpt-oss-120b high from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
133
+ - Qwen3 32B from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
134
+ - gemini-exp-1206 from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
135
+ - Gemini 2.0 Pro exp-02-05 from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
136
+ - Grok 3 Mini Beta low from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
137
+ - o1-mini-2024-09-12 from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
138
+ - gpt-4.1-mini from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
139
+ - claude-3-5-haiku-20241022 from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
140
+ - chatgpt-4o-latest 2025-02-15 from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
141
+ - QwQ-32B + Qwen 2.5 Coder Instruct from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
142
+ - gpt-4o-2024-08-06 from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
143
+ - gemini-2.0-flash-exp from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
144
+ - qwen-max-2025-01-25 from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
145
+ - QwQ-32B from aider_polyglot in tools\scorecard\snapshots\live.aider_polyglot.json
146
+ - ...19 more
147
+
@@ -0,0 +1,61 @@
1
+ {
2
+ "antigravity:gemini-3.1-pro": {
3
+ "bugfix": 0.7331,
4
+ "feature": 0.7424,
5
+ "refactor": 0.7574,
6
+ "test": 0.7364,
7
+ "docs": 0.7981,
8
+ "security": 0.7208,
9
+ "perf": 0.7254,
10
+ "general": 0.7231
11
+ },
12
+ "claude-code:opus": {
13
+ "bugfix": 0.7454,
14
+ "feature": 0.7454,
15
+ "refactor": 0.7454,
16
+ "test": 0.7454,
17
+ "security": 0.7454,
18
+ "perf": 0.7454,
19
+ "general": 0.7454
20
+ },
21
+ "claude-code:sonnet": {
22
+ "bugfix": 0.7098,
23
+ "feature": 0.6994,
24
+ "refactor": 0.7019,
25
+ "test": 0.6997,
26
+ "docs": 0.68,
27
+ "security": 0.7055,
28
+ "perf": 0.7022,
29
+ "general": 0.7036
30
+ },
31
+ "codex:gpt-5.4": {
32
+ "bugfix": 0.867,
33
+ "feature": 0.867,
34
+ "refactor": 0.867,
35
+ "test": 0.867,
36
+ "docs": 0.867,
37
+ "security": 0.867,
38
+ "perf": 0.867,
39
+ "general": 0.867
40
+ },
41
+ "codex:gpt-5.4-mini": {
42
+ "bugfix": 0.7738,
43
+ "feature": 0.7618,
44
+ "refactor": 0.7791,
45
+ "test": 0.7537,
46
+ "docs": 0.7746,
47
+ "security": 0.7673,
48
+ "perf": 0.7682,
49
+ "general": 0.7623
50
+ },
51
+ "codex:gpt-5.5": {
52
+ "bugfix": 0.8528,
53
+ "feature": 0.8566,
54
+ "refactor": 0.8627,
55
+ "test": 0.8541,
56
+ "docs": 0.88,
57
+ "security": 0.8478,
58
+ "perf": 0.8497,
59
+ "general": 0.8488
60
+ }
61
+ }