aiforcecli-chat 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/License.MD +49 -0
- package/README.md +642 -0
- package/aiforcecli.config.example.json +66 -0
- package/assets/README.md +14 -0
- package/dist/cli.js +2 -0
- package/dist/index.js +2 -0
- package/package.json +62 -0
- package/tools/scorecard/README.md +92 -0
- package/tools/scorecard/config.json +134 -0
- package/tools/scorecard/fetch.mjs +335 -0
- package/tools/scorecard/generate.mjs +289 -0
- package/tools/scorecard/generated/example/invalid-rows.json +1 -0
- package/tools/scorecard/generated/example/scorecard-report.md +147 -0
- package/tools/scorecard/generated/example/scorecard.compact.json +61 -0
- package/tools/scorecard/generated/example/scorecard.json +1492 -0
- package/tools/scorecard/generated/example/unmapped-models.json +1492 -0
- package/tools/scorecard/generated/raw/aider_polyglot.html +21071 -0
- package/tools/scorecard/generated/raw/terminal_bench_2_1.html +2 -0
- package/tools/scorecard/generated/scorecard/invalid-rows.json +1 -0
- package/tools/scorecard/generated/scorecard/scorecard-report.md +133 -0
- package/tools/scorecard/generated/scorecard/scorecard.compact.json +51 -0
- package/tools/scorecard/generated/scorecard/scorecard.json +1181 -0
- package/tools/scorecard/generated/scorecard/unmapped-models.json +1492 -0
- package/tools/scorecard/generated/scorecard-example/invalid-rows.json +1 -0
- package/tools/scorecard/generated/scorecard-example/scorecard-report.md +40 -0
- package/tools/scorecard/generated/scorecard-example/scorecard.compact.json +22 -0
- package/tools/scorecard/generated/scorecard-example/scorecard.json +389 -0
- package/tools/scorecard/generated/scorecard-example/unmapped-models.json +1 -0
- package/tools/scorecard/generated/scorecard-fetch/raw/aider_polyglot.html +21071 -0
- package/tools/scorecard/generated/scorecard-fetch/raw/terminal_bench_2_1.html +2 -0
- package/tools/scorecard/snapshots/example.normalized.example.json +38 -0
- package/tools/scorecard/snapshots/live.aider_polyglot.json +1318 -0
- package/tools/scorecard/snapshots/live.terminal_bench_2_1.json +294 -0
|
@@ -0,0 +1,335 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import fs from 'node:fs';
|
|
3
|
+
import path from 'node:path';
|
|
4
|
+
|
|
5
|
+
/**
 * Parse the command-line flags of the fetch script.
 *
 * Recognised flags: `--config <path>`, `--source <id[,id]>` (may be repeated;
 * IDs accumulate), `--raw-dir <dir>` and `--help` / `-h` (prints usage and
 * exits with status 0).
 *
 * @param {string[]} argv - Arguments following the script name.
 * @returns {{config: string, sourceIds: (string[]|undefined), rawDir: string}}
 *   Parsed options; `sourceIds` stays `undefined` when `--source` is absent.
 * @throws {Error} On an unknown flag, or when a flag is missing its value.
 */
function parseArgs(argv) {
  const args = {
    config: path.join('tools', 'scorecard', 'config.json'),
    sourceIds: undefined,
    rawDir: path.join('generated', 'scorecard-fetch', 'raw'),
  };

  let i = 0;
  while (i < argv.length) {
    const flag = argv[i];
    switch (flag) {
      case '--config':
        i += 1;
        args.config = needValue(argv, i, flag);
        break;
      case '--source': {
        i += 1;
        const ids = needValue(argv, i, flag)
          .split(',')
          .map((s) => s.trim())
          .filter(Boolean);
        args.sourceIds = [...(args.sourceIds ?? []), ...ids];
        break;
      }
      case '--raw-dir':
        i += 1;
        args.rawDir = needValue(argv, i, flag);
        break;
      case '--help':
      case '-h':
        printHelp();
        process.exit(0);
        break;
      default:
        throw new Error(`Unknown argument: ${flag}`);
    }
    i += 1;
  }
  return args;
}
|
|
27
|
+
|
|
28
|
+
/**
 * Return the value that follows a flag on the command line.
 *
 * @param {string[]} argv - Full argument list.
 * @param {number} i - Index at which the value is expected.
 * @param {string} flag - Flag name, used only in the error message.
 * @returns {string} The value at `argv[i]`.
 * @throws {Error} When the slot is empty/absent or holds another `--` flag.
 */
function needValue(argv, i, flag) {
  const value = argv[i];
  const missing = !value || value.startsWith('--');
  if (missing) {
    throw new Error(`${flag} requires a value`);
  }
  return value;
}
|
|
33
|
+
|
|
34
|
+
// Print the usage text of the fetch script to stdout.
function printHelp() {
  console.log(`Usage: node tools/scorecard/fetch.mjs [options]

Options:
--config <path> Generator config JSON
--source <id[,id]> Fetch only selected source IDs
--raw-dir <dir> Directory for raw fetched HTML snapshots
`);
}
|
|
43
|
+
|
|
44
|
+
/**
 * Read a UTF-8 file and parse it as JSON.
 *
 * @param {string} file - Path of the JSON file.
 * @returns {*} The parsed JSON value.
 * @throws {Error} File-system errors from `fs.readFileSync` are propagated
 *   unchanged; a parse failure is rethrown with the file path in the message
 *   and the original error attached as `cause`.
 */
function readJson(file) {
  const text = fs.readFileSync(file, 'utf8');
  try {
    return JSON.parse(text);
  } catch (err) {
    // JSON.parse errors do not mention which file was being read.
    const detail = err instanceof Error ? err.message : String(err);
    throw new Error(`Invalid JSON in ${file}: ${detail}`, { cause: err });
  }
}
|
|
47
|
+
|
|
48
|
+
/**
 * GET a URL and return the response body as text.
 *
 * @param {string} url - URL to fetch.
 * @param {number} [timeoutMs=30000] - Abort the request after this many
 *   milliseconds so a stalled server cannot hang the command.
 * @returns {Promise<string>} The response body.
 * @throws {Error} When the response status is not OK; the request is also
 *   rejected by `fetch` itself on network failure or timeout.
 */
async function fetchText(url, timeoutMs = 30_000) {
  const res = await fetch(url, {
    headers: {
      'user-agent': 'aiforcecli-scorecard-fetch/0.1 (+manual update command)',
      accept: 'text/html,application/json;q=0.9,text/plain;q=0.8,*/*;q=0.5',
    },
    signal: AbortSignal.timeout(timeoutMs),
  });
  if (!res.ok) throw new Error(`GET ${url} failed: ${res.status} ${res.statusText}`);
  return await res.text();
}
|
|
58
|
+
|
|
59
|
+
/**
 * Reduce an HTML string to plain text.
 *
 * `<script>` / `<style>` elements, `<br>` and the closing tags of block-level
 * elements become newlines, every remaining tag becomes a space, a small set
 * of character entities is decoded and carriage returns become newlines.
 *
 * @param {string} html - HTML markup.
 * @returns {string} Text with tags removed and common entities decoded.
 */
function htmlToText(html) {
  return html
    .replace(/<script\b[^>]*>[\s\S]*?<\/script>/gi, '\n')
    .replace(/<style\b[^>]*>[\s\S]*?<\/style>/gi, '\n')
    .replace(/<br\s*\/?>/gi, '\n')
    .replace(/<\/(p|div|li|tr|td|th|h\d|details|summary)>/gi, '\n')
    .replace(/<[^>]+>/g, ' ')
    .replace(/&nbsp;/g, ' ')
    .replace(/&lt;/g, '<')
    .replace(/&gt;/g, '>')
    .replace(/&(?:#39|#x27|apos);/gi, "'")
    .replace(/&quot;/g, '"')
    // Decode &amp; last so "&amp;lt;" yields the literal text "&lt;", not "<".
    .replace(/&amp;/g, '&')
    .replace(/\r/g, '\n');
}
|
|
74
|
+
|
|
75
|
+
/**
 * Convert HTML to a list of non-empty text lines.
 *
 * The markup is passed through `htmlToText`, split on newlines, and each line
 * has its whitespace runs collapsed to a single space and is trimmed; lines
 * that end up empty are dropped.
 *
 * @param {string} html - HTML markup.
 * @returns {string[]} Cleaned, non-empty lines in document order.
 */
function textLines(html) {
  const kept = [];
  for (const rawLine of htmlToText(html).split('\n')) {
    const line = rawLine.replace(/\s+/g, ' ').trim();
    if (line) kept.push(line);
  }
  return kept;
}
|
|
81
|
+
|
|
82
|
+
/**
 * Tidy a model name taken from leaderboard text.
 *
 * Removes a leading "▶" marker, replaces parentheses with spaces, collapses
 * whitespace runs and trims the result.
 *
 * @param {*} name - Raw name; `null` / `undefined` are treated as "".
 * @returns {string} The cleaned name.
 */
function compactModelName(name) {
  const text = String(name ?? '');
  const withoutMarker = text.replace(/^▶\s*/, '');
  const withoutParens = withoutMarker.replace(/[()]/g, ' ');
  return withoutParens.replace(/\s+/g, ' ').trim();
}
|
|
89
|
+
|
|
90
|
+
/**
 * Convert an HTML fragment to a single line of text.
 *
 * @param {string} s - HTML fragment.
 * @returns {string} `htmlToText(s)` with whitespace runs collapsed and the
 *   ends trimmed.
 */
function stripTags(s) {
  const text = htmlToText(s);
  return text.replace(/\s+/g, ' ').trim();
}
|
|
93
|
+
|
|
94
|
+
/**
 * Extract the text that follows a labelled `</strong>` in an HTML fragment.
 *
 * Searches case-insensitively for `label`, skips lazily to the next
 * `</strong>`, and captures the text after it up to the next `<` or newline.
 *
 * @param {string} html - HTML fragment to search.
 * @param {string} label - Literal label text (regex metacharacters are escaped).
 * @returns {string|undefined} The captured text passed through `stripTags`,
 *   or `undefined` when the pattern does not match.
 */
function detailValue(html, label) {
  const escapedLabel = label.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const pattern = new RegExp(`${escapedLabel}[\\s\\S]*?<\\/strong>\\s*([^<\\n]+)`, 'i');
  const found = html.match(pattern);
  if (!found) return undefined;
  return stripTags(found[1]);
}
|
|
99
|
+
|
|
100
|
+
/**
 * Parse an Aider polyglot leaderboard page into normalized rows.
 *
 * The structured parser (`parseAiderHtmlRows`) is tried first. Only when it
 * yields nothing is the page reduced to text lines and scanned: a line that
 * starts with "▶ " opens a new entry, and the "Pass rate 2", "Test cases",
 * "Date" and "Total cost" lines that follow fill it in. Entries that never
 * received a score are discarded.
 *
 * @param {string} html - Raw page HTML.
 * @param {object} source - Fetch-source entry; `source`, `id`, `benchmark`
 *   and `url` are read from it.
 * @returns {object[]} Rows with `metric: 'pass_rate_2'` and
 *   `scoreScale: 'percent'`.
 */
function parseAiderPolyglot(html, source) {
  const structured = parseAiderHtmlRows(html, source);
  if (structured.length > 0) return structured;

  const rows = [];
  let current;

  // Emit the entry being built, provided it has a score.
  const flush = () => {
    if (!current || current.score == null) return;
    const { modelRaw, score, sampleSize, date, extra } = current;
    rows.push({
      source: source.source ?? source.id,
      benchmark: source.benchmark ?? 'aider_polyglot',
      url: source.url,
      modelRaw,
      metric: 'pass_rate_2',
      score,
      scoreScale: 'percent',
      sampleSize,
      date,
      extra,
    });
  };

  // Each line is tested against these in order; the first match wins.
  const fields = [
    [/^Pass rate 2\s*:\s*([\d.]+)/i, (entry, v) => { entry.score = Number(v); }],
    [/^Test cases\s*:\s*(\d+)/i, (entry, v) => { entry.sampleSize = Number(v); }],
    [/^Date\s*:\s*(\d{4}-\d{2}-\d{2})/i, (entry, v) => { entry.date = v; }],
    [/^Total cost\s*:\s*([\d.]+)/i, (entry, v) => { entry.extra.totalCostUsd = Number(v); }],
  ];

  for (const line of textLines(html)) {
    if (line.startsWith('▶ ')) {
      flush();
      current = { modelRaw: compactModelName(line), extra: {} };
      continue;
    }
    if (!current) continue;
    for (const [pattern, assign] of fields) {
      const m = line.match(pattern);
      if (m) {
        assign(current, m[1]);
        break;
      }
    }
  }
  flush();
  return rows;
}
|
|
152
|
+
|
|
153
|
+
/**
 * Parse the Aider leaderboard table markup into normalized rows.
 *
 * Each result is a `<tr id="main-row-N">` immediately followed by its
 * `<tr class="details-row" id="details-N">`. The model name comes from the
 * main row's `<span>` (falling back to the "Model" detail) and the metrics
 * from the labelled values read with `detailValue`.
 *
 * The score is parsed with `maybeNumber`, like every other numeric field, so
 * a value formatted as e.g. "61.3%" is accepted rather than dropping the row.
 *
 * @param {string} html - Raw page HTML.
 * @param {object} source - Fetch-source entry; `source`, `id`, `benchmark`
 *   and `url` are read from it.
 * @returns {object[]} One row per table entry that has both a model name and
 *   a numeric "Pass rate 2"; empty when the markup does not match.
 */
function parseAiderHtmlRows(html, source) {
  const rows = [];
  const rowRe = /<tr id="main-row-(\d+)"[\s\S]*?<\/tr>\s*<tr class="details-row" id="details-\1"[\s\S]*?<\/tr>/gi;
  for (const match of html.matchAll(rowRe)) {
    const block = match[0];
    const modelMatch = block.match(/<td style="padding:\s*8px;"><span>([\s\S]*?)<\/span><\/td>/i);
    const modelRaw = modelMatch ? compactModelName(stripTags(modelMatch[1])) : detailValue(block, 'Model');
    const score = maybeNumber(detailValue(block, 'Pass rate 2'));
    if (!modelRaw || score === undefined) continue;
    rows.push({
      source: source.source ?? source.id,
      benchmark: source.benchmark ?? 'aider_polyglot',
      url: source.url,
      modelRaw,
      metric: 'pass_rate_2',
      score,
      scoreScale: 'percent',
      sampleSize: maybeNumber(detailValue(block, 'Test cases')),
      date: detailValue(block, 'Date'),
      extra: {
        passRate1: maybeNumber(detailValue(block, 'Pass rate 1')),
        passNum1: maybeNumber(detailValue(block, 'Pass num 1')),
        passNum2: maybeNumber(detailValue(block, 'Pass num 2')),
        totalCostUsd: maybeNumber(detailValue(block, 'Total cost')),
        secondsPerCase: maybeNumber(detailValue(block, 'Seconds per case')),
        editFormat: detailValue(block, 'Edit format'),
      },
    });
  }
  return rows;
}
|
|
185
|
+
|
|
186
|
+
/**
 * Leniently convert a scraped value to a number.
 *
 * `$`, `,` and `%` characters are stripped before conversion.
 *
 * @param {*} v - Value to convert.
 * @returns {number|undefined} The finite number, or `undefined` when the
 *   value is nullish, blank, or not numeric.
 */
function maybeNumber(v) {
  if (v == null) return undefined;
  const cleaned = String(v).replace(/[$,%]/g, '').trim();
  // Number('') is 0, which would report a blank field as a real zero.
  if (cleaned === '') return undefined;
  const n = Number(cleaned);
  return Number.isFinite(n) ? n : undefined;
}
|
|
191
|
+
|
|
192
|
+
/**
 * Return the text of the first bracketed array that follows `key`.
 *
 * Scanning starts at the first `[` after the first occurrence of `key` and
 * tracks nesting depth while skipping brackets inside double-quoted strings
 * (backslash escapes are honoured).
 *
 * @param {string} text - Text to search.
 * @param {string} key - Literal marker that precedes the array.
 * @returns {string|undefined} The substring from the opening `[` through its
 *   matching `]`, or `undefined` when the key, the bracket, or the matching
 *   close is missing.
 */
function findJsonArrayAfter(text, key) {
  const keyAt = text.indexOf(key);
  if (keyAt === -1) return undefined;
  const open = text.indexOf('[', keyAt + key.length);
  if (open === -1) return undefined;

  let depth = 0;
  let inString = false;
  let escaped = false;
  for (let pos = open; pos < text.length; pos += 1) {
    const ch = text[pos];
    if (inString) {
      if (escaped) {
        escaped = false;
      } else if (ch === '\\') {
        escaped = true;
      } else if (ch === '"') {
        inString = false;
      }
      continue;
    }
    switch (ch) {
      case '"':
        inString = true;
        break;
      case '[':
        depth += 1;
        break;
      case ']':
        depth -= 1;
        if (depth === 0) return text.slice(open, pos + 1);
        break;
      default:
        break;
    }
  }
  return undefined;
}
|
|
217
|
+
|
|
218
|
+
/**
 * Parse a Terminal-Bench leaderboard page into normalized rows.
 *
 * Two strategies are tried in order:
 *  1. Embedded JSON: `\"` and `\u0026` are unescaped and the first array
 *     after `"rows"` is parsed. Entries with a string `agent`, an array
 *     `model` and a numeric `accuracy` become rows with `scoreScale: '0-1'`.
 *  2. Text fallback: a line such as "45.5% ± 2.1" is paired with the line
 *     before it, which must read "<agent prefix> <model> <YYYY-MM-DD>".
 *     These rows use `scoreScale: 'percent'`.
 *
 * The fallback runs whenever the JSON path produces no usable rows: no array,
 * unparseable JSON, or an array in which nothing passes the filter. Null or
 * non-object entries are skipped instead of aborting the JSON path.
 *
 * @param {string} html - Raw page HTML.
 * @param {object} source - Fetch-source entry; `source`, `benchmark`, `url`
 *   and optional `agentPrefixes` are read from it.
 * @returns {object[]} Normalized rows with `metric: 'accuracy'`.
 */
function parseTerminalBench(html, source) {
  const serialized = html.replace(/\\"/g, '"').replace(/\\u0026/g, '&');
  const arr = findJsonArrayAfter(serialized, '"rows"');
  if (arr) {
    try {
      const parsed = JSON.parse(arr);
      if (Array.isArray(parsed)) {
        const jsonRows = parsed
          .filter(
            (r) =>
              r != null &&
              typeof r.agent === 'string' &&
              Array.isArray(r.model) &&
              typeof r.accuracy === 'number',
          )
          .map((r) => ({
            source: source.source ?? 'terminal_bench',
            benchmark: source.benchmark ?? 'terminal_bench',
            url: source.url,
            modelRaw: `${r.agent} ${r.model.join(' + ')}`,
            metric: 'accuracy',
            score: r.accuracy,
            scoreScale: '0-1',
            date: r.date,
            extra: {
              agent: r.agent,
              model: r.model,
              stderr: r.stderr,
              verified: r.verified,
              agentName: r.agentName,
              agentVersion: r.agentVersion,
              modelNames: r.modelNames,
              modelProviders: r.modelProviders,
            },
          }));
        // An array with no usable entries is treated like "not found".
        if (jsonRows.length > 0) return jsonRows;
      }
    } catch {
      // Deliberate best-effort: fall through to the text parser below.
    }
  }

  const lines = textLines(html);
  const rows = [];
  const prefixes = source.agentPrefixes ?? ['Codex CLI', 'Claude Code', 'Gemini CLI', 'Terminus 2'];

  // Start at 1: every accuracy line needs the metadata line before it.
  for (let i = 1; i < lines.length; i += 1) {
    const acc = lines[i].match(/^([\d.]+)%\s*±\s*([\d.]+)/);
    if (!acc) continue;
    const meta = lines[i - 1];
    const dateMatch = meta.match(/\b\d{4}-\d{2}-\d{2}\b/);
    if (!dateMatch || dateMatch.index == null) continue;
    const beforeDate = meta.slice(0, dateMatch.index).trim();
    const date = dateMatch[0];
    const prefix = prefixes.find((p) => beforeDate === p || beforeDate.startsWith(`${p} `));
    if (!prefix) continue;
    const model = beforeDate.slice(prefix.length).trim();
    if (!model) continue;
    rows.push({
      source: source.source ?? 'terminal_bench',
      benchmark: source.benchmark ?? 'terminal_bench',
      url: source.url,
      modelRaw: `${prefix} ${model}`,
      metric: 'accuracy',
      score: Number(acc[1]),
      scoreScale: 'percent',
      date,
      extra: {
        agent: prefix,
        model,
        stderrPct: Number(acc[2]),
      },
    });
  }
  return rows;
}
|
|
287
|
+
|
|
288
|
+
/**
 * Dispatch a fetched page to the parser named by `source.parser`.
 *
 * @param {string} html - Raw page HTML.
 * @param {object} source - Fetch-source entry; `parser` selects the parser
 *   and `id` is used in the error message.
 * @returns {object[]} Rows produced by the selected parser.
 * @throws {Error} When `source.parser` is neither 'aiderPolyglot' nor
 *   'terminalBench'.
 */
function parseRows(html, source) {
  switch (source.parser) {
    case 'aiderPolyglot':
      return parseAiderPolyglot(html, source);
    case 'terminalBench':
      return parseTerminalBench(html, source);
    default:
      throw new Error(`Unsupported parser for ${source.id}: ${source.parser}`);
  }
}
|
|
293
|
+
|
|
294
|
+
/**
 * Build the snapshot object that is written for one fetched source.
 *
 * @param {object} source - Fetch-source entry; `id`, `url` and `parser` are
 *   copied into the snapshot.
 * @param {object[]} rows - Normalized rows parsed from the page.
 * @param {string} fetchedAt - Timestamp recorded for the fetch.
 * @returns {{source: *, fetchedAt: string, url: *, parser: *, rows: object[]}}
 */
function outputSnapshot(source, rows, fetchedAt) {
  const { id, url, parser } = source;
  return { source: id, fetchedAt, url, parser, rows };
}
|
|
303
|
+
|
|
304
|
+
/**
 * Fetch every selected source, save the raw page, and write its snapshot.
 *
 * Sources come from `config.fetchSources`; when `--source` IDs were given,
 * only the matching entries are used. For each source, in order, the page is
 * downloaded, written to `<rawDir>/<id>.html`, parsed, and the resulting
 * snapshot is written as pretty-printed JSON to `source.output`.
 *
 * @returns {Promise<void>}
 * @throws {Error} When no source is selected; errors from reading the config,
 *   fetching, parsing, or writing files propagate to the caller.
 */
async function main() {
  const { config: configPath, sourceIds, rawDir } = parseArgs(process.argv.slice(2));
  const config = readJson(configPath);
  const wanted = new Set(sourceIds ?? []);
  const available = config.fetchSources ?? [];
  const sources = available.filter((s) => wanted.size === 0 || wanted.has(s.id));
  if (sources.length === 0) throw new Error('No fetch sources selected.');

  fs.mkdirSync(rawDir, { recursive: true });

  let totalRows = 0;
  for (const source of sources) {
    const { id, url, output } = source;
    console.log(`fetching ${id} from ${url}`);
    const html = await fetchText(url);
    const fetchedAt = new Date().toISOString();
    // Keep the raw page next to the snapshot so parsing can be re-checked.
    fs.writeFileSync(path.join(rawDir, `${id}.html`), html);

    const rows = parseRows(html, source);
    totalRows += rows.length;
    const snapshot = outputSnapshot(source, rows, fetchedAt);
    fs.mkdirSync(path.dirname(output), { recursive: true });
    fs.writeFileSync(output, `${JSON.stringify(snapshot, null, 2)}\n`);
    console.log(`wrote ${output} (${rows.length} row(s))`);
  }
  console.log(`fetched ${sources.length} source(s), ${totalRows} normalized row(s)`);
}
|
|
329
|
+
|
|
330
|
+
// Script entry point: run main() and, if it throws or rejects, print the
// error message to stderr and exit with status 1.
try {
  await main();
} catch (err) {
  console.error(err instanceof Error ? err.message : String(err));
  process.exit(1);
}
|
|
@@ -0,0 +1,289 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import fs from 'node:fs';
|
|
3
|
+
import path from 'node:path';
|
|
4
|
+
|
|
5
|
+
const TASK_TYPES = ['bugfix', 'feature', 'refactor', 'test', 'docs', 'security', 'perf', 'general'];
|
|
6
|
+
|
|
7
|
+
/**
 * Parse the command-line flags of the scorecard generator.
 *
 * Recognised flags: `--config <path>`, `--input <path>` (may be repeated;
 * each value is appended after the default snapshots directory),
 * `--out <dir>`, `--include-examples`, and `--help` / `-h` (prints usage and
 * exits with status 0).
 *
 * @param {string[]} argv - Arguments following the script name.
 * @returns {{config: string, input: string[], out: string, includeExamples: boolean}}
 * @throws {Error} On an unknown flag, or when a flag is missing its value.
 */
function parseArgs(argv) {
  const args = {
    config: path.join('tools', 'scorecard', 'config.json'),
    input: [path.join('tools', 'scorecard', 'snapshots')],
    out: path.join('generated', 'scorecard'),
    includeExamples: false,
  };

  let i = 0;
  while (i < argv.length) {
    const flag = argv[i];
    switch (flag) {
      case '--config':
        i += 1;
        args.config = needValue(argv, i, flag);
        break;
      case '--input':
        i += 1;
        args.input.push(needValue(argv, i, flag));
        break;
      case '--out':
        i += 1;
        args.out = needValue(argv, i, flag);
        break;
      case '--include-examples':
        args.includeExamples = true;
        break;
      case '--help':
      case '-h':
        printHelp();
        process.exit(0);
        break;
      default:
        throw new Error(`Unknown argument: ${flag}`);
    }
    i += 1;
  }
  return args;
}
|
|
29
|
+
|
|
30
|
+
/**
 * Return the value that follows a flag on the command line.
 *
 * @param {string[]} argv - Full argument list.
 * @param {number} i - Index at which the value is expected.
 * @param {string} flag - Flag name, used only in the error message.
 * @returns {string} The value at `argv[i]`.
 * @throws {Error} When the slot is empty/absent or holds another `--` flag.
 */
function needValue(argv, i, flag) {
  const value = argv[i];
  const missing = !value || value.startsWith('--');
  if (missing) {
    throw new Error(`${flag} requires a value`);
  }
  return value;
}
|
|
35
|
+
|
|
36
|
+
// Print the usage text of the scorecard generator to stdout.
function printHelp() {
  console.log(`Usage: node tools/scorecard/generate.mjs [options]

Options:
--config <path> Generator config JSON
--input <path> Snapshot file or directory; can be repeated
--out <dir> Output directory
--include-examples Include *.example.json files
`);
}
|
|
46
|
+
|
|
47
|
+
/**
 * Read a UTF-8 file and parse it as JSON.
 *
 * @param {string} file - Path of the JSON file.
 * @returns {*} The parsed JSON value.
 * @throws {Error} File-system errors from `fs.readFileSync` are propagated
 *   unchanged; a parse failure is rethrown with the file path in the message
 *   and the original error attached as `cause`.
 */
function readJson(file) {
  const text = fs.readFileSync(file, 'utf8');
  try {
    return JSON.parse(text);
  } catch (err) {
    // JSON.parse errors do not mention which file was being read.
    const detail = err instanceof Error ? err.message : String(err);
    throw new Error(`Invalid JSON in ${file}: ${detail}`, { cause: err });
  }
}
|
|
50
|
+
|
|
51
|
+
/**
 * Expand input paths into a sorted, de-duplicated list of JSON files.
 *
 * A directory contributes the `.json` files directly inside it
 * (sub-directories are not descended into); a file path contributes itself
 * when it ends in `.json`. Paths that do not exist are skipped. Files ending
 * in `.example.json` are left out unless `includeExamples` is true.
 *
 * @param {string[]} inputs - Files and/or directories to scan.
 * @param {boolean} includeExamples - Whether to keep `*.example.json` files.
 * @returns {string[]} Unique file paths in default sort order.
 */
function listJsonFiles(inputs, includeExamples) {
  const isJson = (name) => name.endsWith('.json');
  const isSkippedExample = (name) => !includeExamples && name.endsWith('.example.json');
  const found = new Set();

  for (const input of inputs) {
    if (!fs.existsSync(input)) continue;

    if (fs.statSync(input).isDirectory()) {
      for (const name of fs.readdirSync(input)) {
        const full = path.join(input, name);
        if (fs.statSync(full).isDirectory()) continue;
        if (isJson(name) && !isSkippedExample(name)) found.add(full);
      }
      continue;
    }

    if (isJson(input) && !isSkippedExample(path.basename(input))) found.add(input);
  }
  return [...found].sort();
}
|
|
71
|
+
|
|
72
|
+
/**
 * Load the rows of one snapshot file.
 *
 * The file may be a bare array of rows or an object with a `rows` array.
 * Each returned row is a copy tagged with `snapshotFile` and `snapshotIndex`.
 *
 * @param {string} file - Path of the snapshot JSON file.
 * @returns {object[]} Tagged copies of the rows.
 * @throws {Error} When the file holds neither an array nor an object with a
 *   `rows` array (including a top-level `null`).
 */
function extractRows(file) {
  const data = readJson(file);
  // `?.` so a top-level null reaches the descriptive error below.
  const rows = Array.isArray(data) ? data : data?.rows;
  if (!Array.isArray(rows)) throw new Error(`${file} must be an array or an object with rows[]`);
  return rows.map((row, index) => ({ ...row, snapshotFile: file, snapshotIndex: index }));
}
|
|
78
|
+
|
|
79
|
+
/**
 * Normalize a string into a lookup key.
 *
 * Trims the input, lower-cases it, turns runs of `_` or `/` into a single
 * space, and collapses whitespace runs.
 *
 * @param {*} s - Value to normalize; `null` / `undefined` become "".
 * @returns {string} The normalized key.
 */
function normalizeKey(s) {
  const lowered = String(s ?? '').trim().toLowerCase();
  const spaced = lowered.replace(/[_/]+/g, ' ');
  return spaced.replace(/\s+/g, ' ');
}
|
|
86
|
+
|
|
87
|
+
/**
 * Resolve the canonical model name for a row.
 *
 * A truthy `row.modelCanonical` wins; otherwise `row.modelRaw` is normalized
 * with `normalizeKey` and looked up in `aliases`.
 *
 * @param {object} row - Snapshot row.
 * @param {Object<string, string>} aliases - Map of normalized raw name to
 *   canonical name.
 * @returns {string|undefined} The canonical name, or `undefined` when the
 *   row has no alias.
 */
function canonicalModel(row, aliases) {
  if (row.modelCanonical) return row.modelCanonical;
  const raw = normalizeKey(row.modelRaw);
  // Own keys only: a raw name such as "constructor" must not resolve to a
  // member inherited from Object.prototype.
  return Object.hasOwn(aliases, raw) ? aliases[raw] : undefined;
}
|
|
92
|
+
|
|
93
|
+
/**
 * Restrict a number to the range [lo, hi].
 *
 * @param {number} n - Value to clamp.
 * @param {number} lo - Lower bound.
 * @param {number} hi - Upper bound.
 * @returns {number} `n` limited to the bounds.
 */
function clamp(n, lo, hi) {
  const capped = Math.min(hi, n);
  return Math.max(lo, capped);
}
|
|
96
|
+
|
|
97
|
+
/**
 * Convert a row's score to the 0..1 range.
 *
 * `scoreScale: 'percent'` divides by 100 and `scoreScale: '0-1'` is used
 * as-is. Without a recognised scale, values above 1 are treated as
 * percentages. The result is always clamped to [0, 1].
 *
 * @param {object} row - Snapshot row; `score` and `scoreScale` are read.
 * @returns {number|undefined} The normalized score, or `undefined` when the
 *   score is missing, blank, not a number / numeric string, or not finite.
 */
function normalizeScore(row) {
  const value = row.score;
  // Number(null), Number('') and Number(false) are all 0, which would turn a
  // missing score into a real zero instead of an invalid row.
  if (typeof value !== 'number' && typeof value !== 'string') return undefined;
  if (typeof value === 'string' && value.trim() === '') return undefined;
  const raw = Number(value);
  if (!Number.isFinite(raw)) return undefined;
  if (row.scoreScale === 'percent') return clamp(raw / 100, 0, 1);
  if (row.scoreScale === '0-1') return clamp(raw, 0, 1);
  return raw > 1 ? clamp(raw / 100, 0, 1) : clamp(raw, 0, 1);
}
|
|
104
|
+
|
|
105
|
+
/**
 * Compute how many days lie between a date and `now`.
 *
 * @param {*} dateLike - Anything accepted by the `Date` constructor.
 * @param {Date} now - Reference time.
 * @returns {number|undefined} Non-negative (possibly fractional) number of
 *   days; 0 for dates after `now`; `undefined` when `dateLike` is falsy or
 *   cannot be parsed.
 */
function ageDays(dateLike, now) {
  if (!dateLike) return undefined;
  const then = new Date(dateLike).getTime();
  if (!Number.isFinite(then)) return undefined;
  const elapsedMs = now.getTime() - then;
  return Math.max(0, elapsedMs / 86_400_000);
}
|
|
111
|
+
|
|
112
|
+
/**
 * Weight a row by how recent it is, using exponential decay.
 *
 * The weight halves every `config.halfLifeDays` days (default 180) and is
 * clamped to [0.05, 1]. Rows without a usable date get a fixed 0.7.
 *
 * @param {object} row - Snapshot row; `date` is read.
 * @param {object} config - Generator config; `halfLifeDays` is read.
 * @param {Date} now - Reference time.
 * @returns {number} Weight in [0.05, 1].
 */
function freshnessWeight(row, config, now) {
  const age = ageDays(row.date, now);
  if (age == null) return 0.7;
  const configured = Number(config.halfLifeDays ?? 180);
  // A zero, negative or non-numeric half-life would yield NaN (0 / 0) or a
  // meaningless weight that then poisons every weighted average; use the
  // default instead.
  const halfLife = Number.isFinite(configured) && configured > 0 ? configured : 180;
  return clamp(Math.exp((-Math.log(2) * age) / halfLife), 0.05, 1);
}
|
|
118
|
+
|
|
119
|
+
/**
 * Estimate how much to trust a row, based on its sample size.
 *
 * An explicit `row.confidence` is used directly (clamped to [0, 1]).
 * Otherwise the confidence is `sqrt(sampleSize / sampleSizeFullConfidence)`
 * clamped to [0.15, 1], with `sampleSizeFullConfidence` defaulting to 500.
 * Rows without a positive, finite sample size get
 * `config.unknownSampleConfidence` (default 0.55).
 *
 * @param {object} row - Snapshot row; `confidence` and `sampleSize` are read.
 * @param {object} config - Generator config.
 * @returns {number} Confidence factor.
 */
function sampleConfidence(row, config) {
  if (row.confidence != null) {
    return clamp(Number(row.confidence), 0, 1);
  }
  const size = Number(row.sampleSize);
  const sizeKnown = Number.isFinite(size) && size > 0;
  if (!sizeKnown) {
    return Number(config.unknownSampleConfidence ?? 0.55);
  }
  const fullAt = Number(config.sampleSizeFullConfidence ?? 500);
  return clamp(Math.sqrt(size / fullAt), 0.15, 1);
}
|
|
126
|
+
|
|
127
|
+
/**
 * Compute the weight one row contributes to a task-type score.
 *
 * The weight is the product of the benchmark's relevance to the task
 * (`config.benchmarkTaskWeights`), the source's reliability
 * (`config.sourceReliability`, default 0.5), the freshness weight and the
 * sample confidence.
 *
 * @param {object} row - Snapshot row; `benchmark` and `source` are read.
 * @param {string} task - Task type being scored.
 * @param {object} config - Generator config.
 * @param {Date} now - Reference time for freshness.
 * @returns {number} The weight; 0 when the benchmark has no positive
 *   relevance for the task.
 */
function rowWeight(row, task, config, now) {
  const benchmark = row.benchmark ?? row.source;
  const perTask = config.benchmarkTaskWeights?.[benchmark] ?? {};
  const relevance = Number(perTask[task] ?? 0);
  if (relevance <= 0) return 0;

  const sourceName = row.source ?? benchmark;
  const reliability = Number(config.sourceReliability?.[sourceName] ?? 0.5);
  const freshness = freshnessWeight(row, config, now);
  const confidence = sampleConfidence(row, config);
  return relevance * reliability * freshness * confidence;
}
|
|
136
|
+
|
|
137
|
+
/**
 * Aggregate snapshot rows into per-model, per-task scores.
 *
 * Rows are first grouped by canonical model: rows without a canonical model
 * go to `unmapped`, and mapped rows without a usable score go to `invalid`.
 * For each model (sorted by name) and each entry of `TASK_TYPES`, the score
 * is the weighted mean of the rows' normalized scores using `rowWeight`, and
 * the confidence is `totalWeight / (totalWeight + 2)`. Tasks with no
 * contributing rows, and models with no scored task, are omitted.
 *
 * @param {object[]} rows - Snapshot rows.
 * @param {object} config - Generator config; `modelAliases` is read here and
 *   the whole object is passed on to `rowWeight`.
 * @param {Date} now - Reference time for freshness weighting.
 * @returns {{scores: object, compact: object, unmapped: object[], invalid: object[]}}
 *   `scores` holds the detailed result with evidence; `compact` holds only
 *   the rounded score per model and task.
 */
function scoreRows(rows, config, now) {
  const aliases = config.modelAliases ?? {};
  const byArm = new Map();
  const unmapped = [];
  const invalid = [];

  // Pass 1: bucket usable rows by canonical model.
  for (const row of rows) {
    const model = canonicalModel(row, aliases);
    const score = normalizeScore(row);
    if (!model) {
      unmapped.push(row);
      continue;
    }
    if (score == null) {
      invalid.push({ ...row, reason: 'score is missing or invalid' });
      continue;
    }
    let bucket = byArm.get(model);
    if (!bucket) {
      bucket = [];
      byArm.set(model, bucket);
    }
    bucket.push({ ...row, modelCanonical: model, normalizedScore: score });
  }

  // Pass 2: weighted average per model and task type.
  const scores = {};
  const compact = {};
  const modelNames = [...byArm.keys()].sort((a, b) => a.localeCompare(b));
  for (const model of modelNames) {
    const modelRows = byArm.get(model);
    const detailed = {};
    const brief = {};

    for (const task of TASK_TYPES) {
      const evidence = [];
      let weighted = 0;
      let totalWeight = 0;
      for (const row of modelRows) {
        const weight = rowWeight(row, task, config, now);
        if (weight <= 0) continue;
        weighted += row.normalizedScore * weight;
        totalWeight += weight;
        evidence.push({
          source: row.source,
          benchmark: row.benchmark ?? row.source,
          metric: row.metric,
          score: round(row.normalizedScore, 4),
          weight: round(weight, 4),
          sampleSize: row.sampleSize,
          date: row.date,
          url: row.url,
          modelRaw: row.modelRaw,
        });
      }
      if (totalWeight <= 0) continue;

      const score = weighted / totalWeight;
      const confidence = totalWeight / (totalWeight + 2);
      evidence.sort((a, b) => b.weight - a.weight);
      detailed[task] = {
        score: round(score, 4),
        confidence: round(confidence, 4),
        evidenceWeight: round(totalWeight, 4),
        sources: evidence,
      };
      brief[task] = round(score, 4);
    }

    if (Object.keys(detailed).length > 0) scores[model] = detailed;
    if (Object.keys(brief).length > 0) compact[model] = brief;
  }

  return { scores, compact, unmapped, invalid };
}
|
|
200
|
+
|
|
201
|
+
/**
 * Round a number to a fixed number of decimal places.
 *
 * @param {number} n - Value to round.
 * @param {number} digits - Number of decimal places to keep.
 * @returns {number} The rounded value.
 */
function round(n, digits) {
  const factor = 10 ** digits;
  const scaled = Math.round(n * factor);
  return scaled / factor;
}
|
|
205
|
+
|
|
206
|
+
/**
 * Render the Markdown report for a scoring run.
 *
 * The report has a summary header, the list of snapshot files, one table per
 * model (a row for each entry of `TASK_TYPES` that has a value, showing its
 * first evidence entry), and, when present, up to 50 unmapped rows followed
 * by a count of the remainder.
 *
 * @param {{scores: object, unmapped: object[], invalid: object[]}} result -
 *   Output of the scoring step.
 * @param {string[]} files - Snapshot files that were read.
 * @param {Date} now - Generation time shown in the header.
 * @returns {string} Markdown text ending with a newline.
 */
function makeReport(result, files, now) {
  const { scores, unmapped, invalid } = result;
  const shownUnmapped = 50;

  const lines = [
    '# Generated Scorecard Report',
    '',
    `Generated: ${now.toISOString()}`,
    `Snapshots: ${files.length}`,
    `Mapped models: ${Object.keys(scores).length}`,
    `Unmapped rows: ${unmapped.length}`,
    `Invalid rows: ${invalid.length}`,
    '',
    '## Snapshot Files',
    '',
    ...files.map((file) => `- ${file}`),
    '',
    '## Scores',
    '',
  ];

  for (const [model, tasks] of Object.entries(scores)) {
    lines.push(
      `### ${model}`,
      '',
      '| Task | Score | Confidence | Evidence Weight | Top Evidence |',
      '| --- | ---: | ---: | ---: | --- |',
    );
    for (const task of TASK_TYPES) {
      const value = tasks[task];
      if (!value) continue;
      const top = value.sources[0];
      const label = top ? `${top.benchmark}/${top.metric} (${top.modelRaw})` : '';
      lines.push(`| ${task} | ${pct(value.score)} | ${pct(value.confidence)} | ${value.evidenceWeight} | ${label} |`);
    }
    lines.push('');
  }

  if (unmapped.length > 0) {
    lines.push('## Unmapped Models', '');
    for (const row of unmapped.slice(0, shownUnmapped)) {
      lines.push(`- ${row.modelRaw ?? '(missing modelRaw)'} from ${row.source ?? row.benchmark ?? '(unknown source)'} in ${row.snapshotFile}`);
    }
    if (unmapped.length > shownUnmapped) lines.push(`- ...${unmapped.length - shownUnmapped} more`);
    lines.push('');
  }

  return `${lines.join('\n')}\n`;
}
|
|
247
|
+
|
|
248
|
+
/**
 * Format a 0..1 fraction as a percentage with at most one decimal place.
 *
 * @param {number} n - Fraction to format.
 * @returns {string} For example "45.7%".
 */
function pct(n) {
  const tenths = Math.round(n * 1000);
  return `${tenths / 10}%`;
}
|
|
251
|
+
|
|
252
|
+
/**
 * Generate the scorecard artifacts from the snapshot files.
 *
 * Reads the config and all input snapshots, scores the rows, and writes five
 * files into the output directory: `scorecard.json`,
 * `scorecard.compact.json`, `unmapped-models.json`, `invalid-rows.json` and
 * `scorecard-report.md`. A short summary is printed to stdout.
 *
 * @returns {void}
 * @throws {Error} Errors from argument parsing, reading inputs, or writing
 *   outputs propagate to the caller.
 */
function main() {
  const { config: configPath, input, includeExamples, out } = parseArgs(process.argv.slice(2));
  const now = new Date();
  const config = readJson(configPath);
  const files = listJsonFiles(input, includeExamples);
  const rows = files.flatMap(extractRows);
  const result = scoreRows(rows, config, now);

  const generatedAt = now.toISOString();
  // The version suffix is the generation date written as YYYY.MM.DD.
  const versionDate = generatedAt.slice(0, 10).replaceAll('-', '.');
  const output = {
    version: `${config.versionPrefix ?? 'manual'}.${versionDate}`,
    generatedAt,
    taskTypes: TASK_TYPES,
    notes: [
      'Generated scorecard artifact. It is not used by the application unless explicitly wired in later.',
      'Scores are normalized public benchmark priors, not private repo outcomes.',
    ],
    scores: result.scores,
  };

  const writeJson = (name, value) => {
    fs.writeFileSync(path.join(out, name), `${JSON.stringify(value, null, 2)}\n`);
  };

  fs.mkdirSync(out, { recursive: true });
  writeJson('scorecard.json', output);
  writeJson('scorecard.compact.json', result.compact);
  writeJson('unmapped-models.json', result.unmapped);
  writeJson('invalid-rows.json', result.invalid);
  fs.writeFileSync(path.join(out, 'scorecard-report.md'), makeReport(result, files, now));

  console.log(`scorecard generator read ${rows.length} row(s) from ${files.length} file(s)`);
  console.log(`wrote ${out}`);
  if (result.unmapped.length > 0) console.log(`unmapped rows: ${result.unmapped.length}`);
  if (result.invalid.length > 0) console.log(`invalid rows: ${result.invalid.length}`);
}
|
|
283
|
+
|
|
284
|
+
// Script entry point: run main() and, if it throws, print the error message
// to stderr and exit with status 1.
try {
  main();
} catch (err) {
  console.error(err instanceof Error ? err.message : String(err));
  process.exit(1);
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
[]
|