aiforcecli-chat 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. package/License.MD +49 -0
  2. package/README.md +642 -0
  3. package/aiforcecli.config.example.json +66 -0
  4. package/assets/README.md +14 -0
  5. package/dist/cli.js +2 -0
  6. package/dist/index.js +2 -0
  7. package/package.json +62 -0
  8. package/tools/scorecard/README.md +92 -0
  9. package/tools/scorecard/config.json +134 -0
  10. package/tools/scorecard/fetch.mjs +335 -0
  11. package/tools/scorecard/generate.mjs +289 -0
  12. package/tools/scorecard/generated/example/invalid-rows.json +1 -0
  13. package/tools/scorecard/generated/example/scorecard-report.md +147 -0
  14. package/tools/scorecard/generated/example/scorecard.compact.json +61 -0
  15. package/tools/scorecard/generated/example/scorecard.json +1492 -0
  16. package/tools/scorecard/generated/example/unmapped-models.json +1492 -0
  17. package/tools/scorecard/generated/raw/aider_polyglot.html +21071 -0
  18. package/tools/scorecard/generated/raw/terminal_bench_2_1.html +2 -0
  19. package/tools/scorecard/generated/scorecard/invalid-rows.json +1 -0
  20. package/tools/scorecard/generated/scorecard/scorecard-report.md +133 -0
  21. package/tools/scorecard/generated/scorecard/scorecard.compact.json +51 -0
  22. package/tools/scorecard/generated/scorecard/scorecard.json +1181 -0
  23. package/tools/scorecard/generated/scorecard/unmapped-models.json +1492 -0
  24. package/tools/scorecard/generated/scorecard-example/invalid-rows.json +1 -0
  25. package/tools/scorecard/generated/scorecard-example/scorecard-report.md +40 -0
  26. package/tools/scorecard/generated/scorecard-example/scorecard.compact.json +22 -0
  27. package/tools/scorecard/generated/scorecard-example/scorecard.json +389 -0
  28. package/tools/scorecard/generated/scorecard-example/unmapped-models.json +1 -0
  29. package/tools/scorecard/generated/scorecard-fetch/raw/aider_polyglot.html +21071 -0
  30. package/tools/scorecard/generated/scorecard-fetch/raw/terminal_bench_2_1.html +2 -0
  31. package/tools/scorecard/snapshots/example.normalized.example.json +38 -0
  32. package/tools/scorecard/snapshots/live.aider_polyglot.json +1318 -0
  33. package/tools/scorecard/snapshots/live.terminal_bench_2_1.json +294 -0
@@ -0,0 +1,40 @@
1
+ # Generated Scorecard Report
2
+
3
+ Generated: 2026-06-16T20:24:53.386Z
4
+ Snapshots: 1
5
+ Mapped models: 2
6
+ Unmapped rows: 0
7
+ Invalid rows: 0
8
+
9
+ ## Snapshot Files
10
+
11
+ - tools\scorecard\snapshots\example.normalized.example.json
12
+
13
+ ## Scores
14
+
15
+ ### claude-code:sonnet
16
+
17
+ | Task | Score | Confidence | Evidence Weight | Top Evidence |
18
+ | --- | ---: | ---: | ---: | --- |
19
+ | bugfix | 71% | 38.7% | 1.2626 | swebench_verified/resolved (Claude Sonnet) |
20
+ | feature | 69.9% | 25.3% | 0.678 | aider_polyglot/percent_correct (Claude Sonnet) |
21
+ | refactor | 70.2% | 32.1% | 0.9466 | swebench_verified/resolved (Claude Sonnet) |
22
+ | test | 70% | 19.2% | 0.4766 | aider_polyglot/percent_correct (Claude Sonnet) |
23
+ | docs | 68% | 6.3% | 0.1341 | aider_polyglot/percent_correct (Claude Sonnet) |
24
+ | security | 70.6% | 15.6% | 0.3693 | swebench_verified/resolved (Claude Sonnet) |
25
+ | perf | 70.2% | 17.5% | 0.4229 | swebench_verified/resolved (Claude Sonnet) |
26
+ | general | 70.4% | 26.4% | 0.7184 | swebench_verified/resolved (Claude Sonnet) |
27
+
28
+ ### codex:gpt-5.4-mini
29
+
30
+ | Task | Score | Confidence | Evidence Weight | Top Evidence |
31
+ | --- | ---: | ---: | ---: | --- |
32
+ | bugfix | 74% | 6% | 0.1269 | evalplus/pass_at_1 (gpt-5.4-mini) |
33
+ | feature | 74% | 12.3% | 0.2792 | evalplus/pass_at_1 (gpt-5.4-mini) |
34
+ | refactor | 74% | 6% | 0.1269 | evalplus/pass_at_1 (gpt-5.4-mini) |
35
+ | test | 74% | 15.1% | 0.3554 | evalplus/pass_at_1 (gpt-5.4-mini) |
36
+ | docs | 74% | 2.5% | 0.0508 | evalplus/pass_at_1 (gpt-5.4-mini) |
37
+ | security | 74% | 3.7% | 0.0762 | evalplus/pass_at_1 (gpt-5.4-mini) |
38
+ | perf | 74% | 4.8% | 0.1015 | evalplus/pass_at_1 (gpt-5.4-mini) |
39
+ | general | 74% | 10.3% | 0.2285 | evalplus/pass_at_1 (gpt-5.4-mini) |
40
+
@@ -0,0 +1,22 @@
1
+ {
2
+ "claude-code:sonnet": {
3
+ "bugfix": 0.7098,
4
+ "feature": 0.6994,
5
+ "refactor": 0.7019,
6
+ "test": 0.6997,
7
+ "docs": 0.68,
8
+ "security": 0.7055,
9
+ "perf": 0.7022,
10
+ "general": 0.7036
11
+ },
12
+ "codex:gpt-5.4-mini": {
13
+ "bugfix": 0.74,
14
+ "feature": 0.74,
15
+ "refactor": 0.74,
16
+ "test": 0.74,
17
+ "docs": 0.74,
18
+ "security": 0.74,
19
+ "perf": 0.74,
20
+ "general": 0.74
21
+ }
22
+ }
@@ -0,0 +1,389 @@
1
+ {
2
+ "version": "manual.2026.06.16",
3
+ "generatedAt": "2026-06-16T20:24:53.386Z",
4
+ "taskTypes": [
5
+ "bugfix",
6
+ "feature",
7
+ "refactor",
8
+ "test",
9
+ "docs",
10
+ "security",
11
+ "perf",
12
+ "general"
13
+ ],
14
+ "notes": [
15
+ "Generated scorecard artifact. It is not used by the application unless explicitly wired in later.",
16
+ "Scores are normalized public benchmark priors, not private repo outcomes."
17
+ ],
18
+ "scores": {
19
+ "claude-code:sonnet": {
20
+ "bugfix": {
21
+ "score": 0.7098,
22
+ "confidence": 0.387,
23
+ "evidenceWeight": 1.2626,
24
+ "sources": [
25
+ {
26
+ "source": "swebench_verified",
27
+ "benchmark": "swebench_verified",
28
+ "metric": "resolved",
29
+ "score": 0.72,
30
+ "weight": 0.9408,
31
+ "sampleSize": 500,
32
+ "date": "2026-06-01",
33
+ "url": "https://www.swebench.com/",
34
+ "modelRaw": "Claude Sonnet"
35
+ },
36
+ {
37
+ "source": "aider_polyglot",
38
+ "benchmark": "aider_polyglot",
39
+ "metric": "percent_correct",
40
+ "score": 0.68,
41
+ "weight": 0.3219,
42
+ "sampleSize": 225,
43
+ "date": "2026-06-01",
44
+ "url": "https://aider.chat/docs/leaderboards/",
45
+ "modelRaw": "Claude Sonnet"
46
+ }
47
+ ]
48
+ },
49
+ "feature": {
50
+ "score": 0.6994,
51
+ "confidence": 0.2532,
52
+ "evidenceWeight": 0.678,
53
+ "sources": [
54
+ {
55
+ "source": "aider_polyglot",
56
+ "benchmark": "aider_polyglot",
57
+ "metric": "percent_correct",
58
+ "score": 0.68,
59
+ "weight": 0.3487,
60
+ "sampleSize": 225,
61
+ "date": "2026-06-01",
62
+ "url": "https://aider.chat/docs/leaderboards/",
63
+ "modelRaw": "Claude Sonnet"
64
+ },
65
+ {
66
+ "source": "swebench_verified",
67
+ "benchmark": "swebench_verified",
68
+ "metric": "resolved",
69
+ "score": 0.72,
70
+ "weight": 0.3293,
71
+ "sampleSize": 500,
72
+ "date": "2026-06-01",
73
+ "url": "https://www.swebench.com/",
74
+ "modelRaw": "Claude Sonnet"
75
+ }
76
+ ]
77
+ },
78
+ "refactor": {
79
+ "score": 0.7019,
80
+ "confidence": 0.3212,
81
+ "evidenceWeight": 0.9466,
82
+ "sources": [
83
+ {
84
+ "source": "swebench_verified",
85
+ "benchmark": "swebench_verified",
86
+ "metric": "resolved",
87
+ "score": 0.72,
88
+ "weight": 0.5174,
89
+ "sampleSize": 500,
90
+ "date": "2026-06-01",
91
+ "url": "https://www.swebench.com/",
92
+ "modelRaw": "Claude Sonnet"
93
+ },
94
+ {
95
+ "source": "aider_polyglot",
96
+ "benchmark": "aider_polyglot",
97
+ "metric": "percent_correct",
98
+ "score": 0.68,
99
+ "weight": 0.4291,
100
+ "sampleSize": 225,
101
+ "date": "2026-06-01",
102
+ "url": "https://aider.chat/docs/leaderboards/",
103
+ "modelRaw": "Claude Sonnet"
104
+ }
105
+ ]
106
+ },
107
+ "test": {
108
+ "score": 0.6997,
109
+ "confidence": 0.1924,
110
+ "evidenceWeight": 0.4766,
111
+ "sources": [
112
+ {
113
+ "source": "aider_polyglot",
114
+ "benchmark": "aider_polyglot",
115
+ "metric": "percent_correct",
116
+ "score": 0.68,
117
+ "weight": 0.2414,
118
+ "sampleSize": 225,
119
+ "date": "2026-06-01",
120
+ "url": "https://aider.chat/docs/leaderboards/",
121
+ "modelRaw": "Claude Sonnet"
122
+ },
123
+ {
124
+ "source": "swebench_verified",
125
+ "benchmark": "swebench_verified",
126
+ "metric": "resolved",
127
+ "score": 0.72,
128
+ "weight": 0.2352,
129
+ "sampleSize": 500,
130
+ "date": "2026-06-01",
131
+ "url": "https://www.swebench.com/",
132
+ "modelRaw": "Claude Sonnet"
133
+ }
134
+ ]
135
+ },
136
+ "docs": {
137
+ "score": 0.68,
138
+ "confidence": 0.0628,
139
+ "evidenceWeight": 0.1341,
140
+ "sources": [
141
+ {
142
+ "source": "aider_polyglot",
143
+ "benchmark": "aider_polyglot",
144
+ "metric": "percent_correct",
145
+ "score": 0.68,
146
+ "weight": 0.1341,
147
+ "sampleSize": 225,
148
+ "date": "2026-06-01",
149
+ "url": "https://aider.chat/docs/leaderboards/",
150
+ "modelRaw": "Claude Sonnet"
151
+ }
152
+ ]
153
+ },
154
+ "security": {
155
+ "score": 0.7055,
156
+ "confidence": 0.1559,
157
+ "evidenceWeight": 0.3693,
158
+ "sources": [
159
+ {
160
+ "source": "swebench_verified",
161
+ "benchmark": "swebench_verified",
162
+ "metric": "resolved",
163
+ "score": 0.72,
164
+ "weight": 0.2352,
165
+ "sampleSize": 500,
166
+ "date": "2026-06-01",
167
+ "url": "https://www.swebench.com/",
168
+ "modelRaw": "Claude Sonnet"
169
+ },
170
+ {
171
+ "source": "aider_polyglot",
172
+ "benchmark": "aider_polyglot",
173
+ "metric": "percent_correct",
174
+ "score": 0.68,
175
+ "weight": 0.1341,
176
+ "sampleSize": 225,
177
+ "date": "2026-06-01",
178
+ "url": "https://aider.chat/docs/leaderboards/",
179
+ "modelRaw": "Claude Sonnet"
180
+ }
181
+ ]
182
+ },
183
+ "perf": {
184
+ "score": 0.7022,
185
+ "confidence": 0.1746,
186
+ "evidenceWeight": 0.4229,
187
+ "sources": [
188
+ {
189
+ "source": "swebench_verified",
190
+ "benchmark": "swebench_verified",
191
+ "metric": "resolved",
192
+ "score": 0.72,
193
+ "weight": 0.2352,
194
+ "sampleSize": 500,
195
+ "date": "2026-06-01",
196
+ "url": "https://www.swebench.com/",
197
+ "modelRaw": "Claude Sonnet"
198
+ },
199
+ {
200
+ "source": "aider_polyglot",
201
+ "benchmark": "aider_polyglot",
202
+ "metric": "percent_correct",
203
+ "score": 0.68,
204
+ "weight": 0.1878,
205
+ "sampleSize": 225,
206
+ "date": "2026-06-01",
207
+ "url": "https://aider.chat/docs/leaderboards/",
208
+ "modelRaw": "Claude Sonnet"
209
+ }
210
+ ]
211
+ },
212
+ "general": {
213
+ "score": 0.7036,
214
+ "confidence": 0.2643,
215
+ "evidenceWeight": 0.7184,
216
+ "sources": [
217
+ {
218
+ "source": "swebench_verified",
219
+ "benchmark": "swebench_verified",
220
+ "metric": "resolved",
221
+ "score": 0.72,
222
+ "weight": 0.4234,
223
+ "sampleSize": 500,
224
+ "date": "2026-06-01",
225
+ "url": "https://www.swebench.com/",
226
+ "modelRaw": "Claude Sonnet"
227
+ },
228
+ {
229
+ "source": "aider_polyglot",
230
+ "benchmark": "aider_polyglot",
231
+ "metric": "percent_correct",
232
+ "score": 0.68,
233
+ "weight": 0.295,
234
+ "sampleSize": 225,
235
+ "date": "2026-06-01",
236
+ "url": "https://aider.chat/docs/leaderboards/",
237
+ "modelRaw": "Claude Sonnet"
238
+ }
239
+ ]
240
+ }
241
+ },
242
+ "codex:gpt-5.4-mini": {
243
+ "bugfix": {
244
+ "score": 0.74,
245
+ "confidence": 0.0597,
246
+ "evidenceWeight": 0.1269,
247
+ "sources": [
248
+ {
249
+ "source": "evalplus",
250
+ "benchmark": "evalplus",
251
+ "metric": "pass_at_1",
252
+ "score": 0.74,
253
+ "weight": 0.1269,
254
+ "sampleSize": 378,
255
+ "date": "2026-05-20",
256
+ "url": "https://github.com/evalplus/evalplus",
257
+ "modelRaw": "gpt-5.4-mini"
258
+ }
259
+ ]
260
+ },
261
+ "feature": {
262
+ "score": 0.74,
263
+ "confidence": 0.1225,
264
+ "evidenceWeight": 0.2792,
265
+ "sources": [
266
+ {
267
+ "source": "evalplus",
268
+ "benchmark": "evalplus",
269
+ "metric": "pass_at_1",
270
+ "score": 0.74,
271
+ "weight": 0.2792,
272
+ "sampleSize": 378,
273
+ "date": "2026-05-20",
274
+ "url": "https://github.com/evalplus/evalplus",
275
+ "modelRaw": "gpt-5.4-mini"
276
+ }
277
+ ]
278
+ },
279
+ "refactor": {
280
+ "score": 0.74,
281
+ "confidence": 0.0597,
282
+ "evidenceWeight": 0.1269,
283
+ "sources": [
284
+ {
285
+ "source": "evalplus",
286
+ "benchmark": "evalplus",
287
+ "metric": "pass_at_1",
288
+ "score": 0.74,
289
+ "weight": 0.1269,
290
+ "sampleSize": 378,
291
+ "date": "2026-05-20",
292
+ "url": "https://github.com/evalplus/evalplus",
293
+ "modelRaw": "gpt-5.4-mini"
294
+ }
295
+ ]
296
+ },
297
+ "test": {
298
+ "score": 0.74,
299
+ "confidence": 0.1509,
300
+ "evidenceWeight": 0.3554,
301
+ "sources": [
302
+ {
303
+ "source": "evalplus",
304
+ "benchmark": "evalplus",
305
+ "metric": "pass_at_1",
306
+ "score": 0.74,
307
+ "weight": 0.3554,
308
+ "sampleSize": 378,
309
+ "date": "2026-05-20",
310
+ "url": "https://github.com/evalplus/evalplus",
311
+ "modelRaw": "gpt-5.4-mini"
312
+ }
313
+ ]
314
+ },
315
+ "docs": {
316
+ "score": 0.74,
317
+ "confidence": 0.0248,
318
+ "evidenceWeight": 0.0508,
319
+ "sources": [
320
+ {
321
+ "source": "evalplus",
322
+ "benchmark": "evalplus",
323
+ "metric": "pass_at_1",
324
+ "score": 0.74,
325
+ "weight": 0.0508,
326
+ "sampleSize": 378,
327
+ "date": "2026-05-20",
328
+ "url": "https://github.com/evalplus/evalplus",
329
+ "modelRaw": "gpt-5.4-mini"
330
+ }
331
+ ]
332
+ },
333
+ "security": {
334
+ "score": 0.74,
335
+ "confidence": 0.0367,
336
+ "evidenceWeight": 0.0762,
337
+ "sources": [
338
+ {
339
+ "source": "evalplus",
340
+ "benchmark": "evalplus",
341
+ "metric": "pass_at_1",
342
+ "score": 0.74,
343
+ "weight": 0.0762,
344
+ "sampleSize": 378,
345
+ "date": "2026-05-20",
346
+ "url": "https://github.com/evalplus/evalplus",
347
+ "modelRaw": "gpt-5.4-mini"
348
+ }
349
+ ]
350
+ },
351
+ "perf": {
352
+ "score": 0.74,
353
+ "confidence": 0.0483,
354
+ "evidenceWeight": 0.1015,
355
+ "sources": [
356
+ {
357
+ "source": "evalplus",
358
+ "benchmark": "evalplus",
359
+ "metric": "pass_at_1",
360
+ "score": 0.74,
361
+ "weight": 0.1015,
362
+ "sampleSize": 378,
363
+ "date": "2026-05-20",
364
+ "url": "https://github.com/evalplus/evalplus",
365
+ "modelRaw": "gpt-5.4-mini"
366
+ }
367
+ ]
368
+ },
369
+ "general": {
370
+ "score": 0.74,
371
+ "confidence": 0.1025,
372
+ "evidenceWeight": 0.2285,
373
+ "sources": [
374
+ {
375
+ "source": "evalplus",
376
+ "benchmark": "evalplus",
377
+ "metric": "pass_at_1",
378
+ "score": 0.74,
379
+ "weight": 0.2285,
380
+ "sampleSize": 378,
381
+ "date": "2026-05-20",
382
+ "url": "https://github.com/evalplus/evalplus",
383
+ "modelRaw": "gpt-5.4-mini"
384
+ }
385
+ ]
386
+ }
387
+ }
388
+ }
389
+ }