@machinespirits/eval 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +91 -9
- package/config/eval-settings.yaml +3 -3
- package/config/paper-manifest.json +486 -0
- package/config/providers.yaml +9 -6
- package/config/tutor-agents.yaml +2261 -0
- package/content/README.md +23 -0
- package/content/courses/479/course.md +53 -0
- package/content/courses/479/lecture-1.md +361 -0
- package/content/courses/479/lecture-2.md +360 -0
- package/content/courses/479/lecture-3.md +655 -0
- package/content/courses/479/lecture-4.md +530 -0
- package/content/courses/479/lecture-5.md +326 -0
- package/content/courses/479/lecture-6.md +346 -0
- package/content/courses/479/lecture-7.md +326 -0
- package/content/courses/479/lecture-8.md +273 -0
- package/content/courses/479/roadmap-slides.md +656 -0
- package/content/manifest.yaml +8 -0
- package/docs/research/build.sh +44 -20
- package/docs/research/figures/figure10.png +0 -0
- package/docs/research/figures/figure11.png +0 -0
- package/docs/research/figures/figure3.png +0 -0
- package/docs/research/figures/figure4.png +0 -0
- package/docs/research/figures/figure5.png +0 -0
- package/docs/research/figures/figure6.png +0 -0
- package/docs/research/figures/figure7.png +0 -0
- package/docs/research/figures/figure8.png +0 -0
- package/docs/research/figures/figure9.png +0 -0
- package/docs/research/header.tex +23 -2
- package/docs/research/paper-full.md +941 -285
- package/docs/research/paper-short.md +216 -585
- package/docs/research/references.bib +132 -0
- package/docs/research/slides-header.tex +188 -0
- package/docs/research/slides-pptx.md +363 -0
- package/docs/research/slides.md +531 -0
- package/docs/research/style-reference-pptx.py +199 -0
- package/package.json +6 -5
- package/scripts/analyze-eval-results.js +69 -17
- package/scripts/analyze-mechanism-traces.js +763 -0
- package/scripts/analyze-modulation-learning.js +498 -0
- package/scripts/analyze-prosthesis.js +144 -0
- package/scripts/analyze-run.js +264 -79
- package/scripts/assess-transcripts.js +853 -0
- package/scripts/browse-transcripts.js +854 -0
- package/scripts/check-parse-failures.js +73 -0
- package/scripts/code-dialectical-modulation.js +1320 -0
- package/scripts/download-data.sh +55 -0
- package/scripts/eval-cli.js +106 -18
- package/scripts/generate-paper-figures.js +663 -0
- package/scripts/generate-paper-figures.py +577 -76
- package/scripts/generate-paper-tables.js +299 -0
- package/scripts/qualitative-analysis-ai.js +3 -3
- package/scripts/render-sequence-diagram.js +694 -0
- package/scripts/test-latency.js +210 -0
- package/scripts/test-rate-limit.js +95 -0
- package/scripts/test-token-budget.js +332 -0
- package/scripts/validate-paper-manifest.js +670 -0
- package/services/__tests__/evalConfigLoader.test.js +2 -2
- package/services/__tests__/learnerRubricEvaluator.test.js +361 -0
- package/services/__tests__/learnerTutorInteractionEngine.test.js +326 -0
- package/services/evaluationRunner.js +975 -98
- package/services/evaluationStore.js +12 -4
- package/services/learnerTutorInteractionEngine.js +27 -2
- package/services/mockProvider.js +133 -0
- package/services/promptRewriter.js +1471 -5
- package/services/rubricEvaluator.js +55 -2
- package/services/transcriptFormatter.js +675 -0
- package/docs/EVALUATION-VARIABLES.md +0 -589
- package/docs/REPLICATION-PLAN.md +0 -577
- package/scripts/analyze-run.mjs +0 -282
- package/scripts/compare-runs.js +0 -44
- package/scripts/compare-suggestions.js +0 -80
- package/scripts/dig-into-run.js +0 -158
- package/scripts/show-failed-suggestions.js +0 -64
- /package/scripts/{check-run.mjs → check-run.js} +0 -0
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* Lightweight latency test for all configured models.
|
|
4
|
+
*
|
|
5
|
+
* Sends a single short prompt ("Say hello in one sentence.") to each model
|
|
6
|
+
* and reports latency, token counts, and cost in a compact table.
|
|
7
|
+
*
|
|
8
|
+
* Usage:
|
|
9
|
+
* node scripts/test-latency.js # all openrouter models (default)
|
|
10
|
+
* node scripts/test-latency.js --provider all # all providers
|
|
11
|
+
* node scripts/test-latency.js --models nemotron,glm5,kimi-k2.5
|
|
12
|
+
* node scripts/test-latency.js --serial # one at a time
|
|
13
|
+
* node scripts/test-latency.js --prompt "Explain Hegel in one sentence."
|
|
14
|
+
* node scripts/test-latency.js --max-tokens 500 # override max output tokens
|
|
15
|
+
* node scripts/test-latency.js --input "I don't understand why Hegel matters"
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
import 'dotenv/config';
|
|
19
|
+
import { unifiedAIProvider } from '@machinespirits/tutor-core';
|
|
20
|
+
import * as evalConfigLoader from '../services/evalConfigLoader.js';
|
|
21
|
+
|
|
22
|
+
// ── CLI args ────────────────────────────────────────────────────────────────

const argv = process.argv.slice(2);

/** Return the value following `flag` on the command line, or null when the flag is absent or last. */
const readFlagValue = (flag) => {
  const pos = argv.indexOf(flag);
  if (pos === -1 || pos === argv.length - 1) return null;
  return argv[pos + 1];
};

/** True when the bare `flag` appears anywhere on the command line. */
const hasFlag = (flag) => argv.indexOf(flag) !== -1;

const serial = hasFlag('--serial');
const maxTokens = parseInt(readFlagValue('--max-tokens') || '200', 10);
const providerFilter = readFlagValue('--provider') || 'openrouter';
const modelsFilter = readFlagValue('--models')?.split(',').map((s) => s.trim()) || null;

// Default learner turn: a short, realistic philosophy question.
const defaultInput = 'I keep reading about Hegel\'s master-slave dialectic but I don\'t really get why it matters. Can you explain it simply?';
// --input and --prompt are synonyms; --input wins when both are given.
const learnerInput = readFlagValue('--input') || readFlagValue('--prompt') || defaultInput;
const systemPrompt = 'You are a philosophy tutor. Respond helpfully and concisely to the learner.';
|
40
|
+
// ── Discover models ─────────────────────────────────────────────────────────

/**
 * Build the list of { provider, alias, modelId } targets from providers.yaml,
 * honouring the --provider and --models CLI filters. The "local" provider is
 * always excluded. Exits the process when no provider config is present.
 */
function discoverModels() {
  const config = evalConfigLoader.loadProviders();
  if (!config?.providers) {
    console.error('No providers found in config/providers.yaml');
    process.exit(1);
  }

  const providerWanted = (name) =>
    name !== 'local' && (providerFilter === 'all' || name === providerFilter);
  const aliasWanted = (alias) => !modelsFilter || modelsFilter.includes(alias);

  const targets = [];
  for (const [provider, provConfig] of Object.entries(config.providers)) {
    if (!providerWanted(provider)) continue;
    for (const [alias, modelId] of Object.entries(provConfig.models || {})) {
      if (aliasWanted(alias)) {
        targets.push({ provider, alias, modelId });
      }
    }
  }
  return targets;
}
|
65
|
+
// ── Test a single model ─────────────────────────────────────────────────────

/**
 * Probe one model with the configured learner input and return a result record:
 * { status: 'ok' | 'skip' | 'error', label, modelId, ... }. Never throws —
 * provider errors are folded into a status:'error' record.
 */
async function testModel({ provider, alias, modelId }) {
  // Qualify the label with the provider name only when probing all providers.
  const label = providerFilter === 'all' ? `${provider}.${alias}` : alias;
  try {
    const resolved = evalConfigLoader.resolveModel({ provider, model: alias });
    if (!resolved.isConfigured) {
      return { label, modelId, status: 'skip', reason: 'no API key' };
    }

    const startedAt = Date.now();
    const response = await unifiedAIProvider.call({
      provider,
      model: resolved.model,
      systemPrompt,
      messages: [{ role: 'user', content: learnerInput }],
      config: { temperature: 0.3, maxTokens },
    });
    const wallMs = Date.now() - startedAt;

    const usage = response.usage;
    return {
      label,
      modelId: resolved.model,
      status: 'ok',
      // Prefer the provider-reported latency; fall back to our wall clock.
      latencyMs: response.latencyMs || wallMs,
      wallMs,
      inputTokens: usage?.inputTokens || 0,
      outputTokens: usage?.outputTokens || 0,
      cost: usage?.cost || 0,
      // Collapse whitespace so the response fits a one-line table cell.
      content: (response.content || '').replace(/\s+/g, ' ').trim(),
    };
  } catch (error) {
    return { label, modelId, status: 'error', reason: error.message.substring(0, 100) };
  }
}
|
103
|
+
|
|
104
|
+
// ── Formatting helpers ──────────────────────────────────────────────────────

/** Render a millisecond duration compactly: "850ms" below one second, "1.3s" above. */
function formatLatency(ms) {
  if (ms >= 1000) {
    return `${(ms / 1000).toFixed(1)}s`;
  }
  return `${ms}ms`;
}
109
|
+
|
|
110
|
+
/** Render a dollar amount; "--" when zero/unknown, extra precision for sub-$0.001 costs. */
function formatCost(cost) {
  if (!cost) return ' --';
  const decimals = cost < 0.001 ? 6 : 4;
  return `$${cost.toFixed(decimals)}`;
}
115
|
+
|
|
116
|
+
/**
 * Render a 20-character horizontal bar proportional to ms / maxMs.
 *
 * Fix: the ratio is now clamped to [0, 1] and a non-positive maxMs maps to an
 * empty bar. Previously, ms > maxMs produced `'░'.repeat(negative)` which
 * throws a RangeError, and maxMs === 0 produced NaN repeat counts (a blank,
 * zero-width bar) — both possible with degenerate latency data.
 *
 * @param {number} ms    - value to plot
 * @param {number} maxMs - scale maximum (the slowest latency in the table)
 * @returns {string} fixed-width bar of '█' (filled) and '░' (empty) cells
 */
function bar(ms, maxMs) {
  const width = 20;
  const ratio = maxMs > 0 ? Math.min(Math.max(ms / maxMs, 0), 1) : 0;
  const filled = Math.round(ratio * width);
  return '█'.repeat(filled) + '░'.repeat(width - filled);
}
121
|
+
|
|
122
|
+
// ── Main ────────────────────────────────────────────────────────────────────

// Discover the models to probe; bail out early when the filters match nothing.
const targets = discoverModels();
if (targets.length === 0) {
  console.error('No models matched. Check --provider / --models flags.');
  process.exit(1);
}

console.log(`\nTesting ${targets.length} model(s) ${serial ? 'sequentially' : 'in parallel'} (max ${maxTokens} tokens)...`);
console.log(`Input: "${learnerInput}"\n`);

// Run the probes. Serial mode streams per-model progress as each finishes;
// parallel mode fires every request at once (top-level await: this is an ESM file).
let results;
if (serial) {
  results = [];
  for (const t of targets) {
    process.stdout.write(` ${t.alias} ... `);
    const r = await testModel(t);
    if (r.status === 'ok') {
      process.stdout.write(`${formatLatency(r.latencyMs)} (${r.inputTokens}→${r.outputTokens} tok)\n`);
    } else {
      process.stdout.write(`${r.status}: ${r.reason || ''}\n`);
    }
    results.push(r);
  }
} else {
  results = await Promise.all(targets.map(t => testModel(t)));
}

// ── Table output ────────────────────────────────────────────────────────────

// Successful probes sorted fastest-first; skips and errors go to the failure list.
const ok = results.filter(r => r.status === 'ok').sort((a, b) => a.latencyMs - b.latencyMs);
const failed = results.filter(r => r.status !== 'ok');

if (ok.length > 0) {
  // Column widths adapt to the longest alias / model ID; the latency bar is
  // scaled against the slowest successful probe (last element after the sort).
  const maxMs = ok[ok.length - 1].latencyMs;
  const labelW = Math.max(12, ...ok.map(r => r.label.length));
  const modelW = Math.max(15, ...ok.map(r => r.modelId.length));
  const sep = '─'.repeat(labelW + modelW + 68);

  console.log(`\n${sep}`);
  console.log(
    ' ' + 'Alias'.padEnd(labelW) +
    ' ' + 'Model'.padEnd(modelW) +
    ' ' + 'Latency'.padStart(7) +
    ' ' + 'In'.padStart(4) +
    ' ' + 'Out'.padStart(4) +
    ' ' + 'Cost'.padStart(9) +
    ' ' + 'Bar'.padEnd(20) +
    ' Response'
  );
  console.log(sep);

  for (const r of ok) {
    console.log(
      ' ' + r.label.padEnd(labelW) +
      ' ' + r.modelId.padEnd(modelW) +
      ' ' + formatLatency(r.latencyMs).padStart(7) +
      ' ' + String(r.inputTokens).padStart(4) +
      ' ' + String(r.outputTokens).padStart(4) +
      ' ' + formatCost(r.cost).padStart(9) +
      ' ' + bar(r.latencyMs, maxMs) +
      // Truncate the response preview so each row stays on one line.
      ' ' + r.content.substring(0, 35)
    );
  }
  console.log(sep);
}

if (failed.length > 0) {
  console.log('\nFailed/Skipped:');
  for (const r of failed) {
    console.log(` ${r.label}: ${r.reason || r.status}`);
  }
}

// ── Summary ─────────────────────────────────────────────────────────────────

if (ok.length > 0) {
  // `ok` is latency-sorted, so first/last are fastest/slowest; for an even
  // count this picks the upper of the two middle elements as "median".
  const fastest = ok[0];
  const slowest = ok[ok.length - 1];
  const median = ok[Math.floor(ok.length / 2)];
  const totalCost = ok.reduce((s, r) => s + r.cost, 0);
  const avgLatency = Math.round(ok.reduce((s, r) => s + r.latencyMs, 0) / ok.length);
  console.log(`\n${ok.length} succeeded, ${failed.length} failed`);
  console.log(` Fastest: ${fastest.label} (${formatLatency(fastest.latencyMs)})`);
  console.log(` Median: ${median.label} (${formatLatency(median.latencyMs)})`);
  console.log(` Slowest: ${slowest.label} (${formatLatency(slowest.latencyMs)})`);
  console.log(` Average: ${formatLatency(avgLatency)}`);
  if (totalCost > 0) console.log(` Total cost: ${formatCost(totalCost)}`);
}
@@ -0,0 +1,95 @@
|
|
|
1
|
+
#!/usr/bin/env node
import 'dotenv/config';
/**
 * Quick rate-limit probe for OpenRouter models.
 * Usage: node scripts/test-rate-limit.js [model-alias]
 * Default: nemotron
 */

// Short aliases for commonly probed OpenRouter model IDs.
const MODEL_MAP = {
  nemotron: 'nvidia/nemotron-3-nano-30b-a3b:free',
  glm47: 'z-ai/glm-4.7',
  'kimi-k2.5': 'moonshotai/kimi-k2.5',
  deepseek: 'deepseek/deepseek-v3.2',
  haiku: 'anthropic/claude-haiku-4.5',
};

const alias = process.argv[2] || 'nemotron';
// An unknown alias is passed through verbatim as a raw OpenRouter model ID.
const model = MODEL_MAP[alias] || alias;
const apiKey = process.env.OPENROUTER_API_KEY;

if (!apiKey) {
  console.error('OPENROUTER_API_KEY not set');
  process.exit(1);
}
|
|
26
|
+
/**
 * Format a rate-limit reset value (epoch milliseconds, per the header format
 * this script reads — confirm against OpenRouter docs) as a Melbourne-local
 * timestamp plus a relative countdown, e.g. "25/12/2025, 10:30:00 AEDT (in 2h 5m)".
 * Non-numeric or zero values (raw header strings, '?') are returned unchanged.
 *
 * Fix: the zone label was hardcoded as "AEDT", which is wrong roughly half the
 * year — Melbourne observes AEST outside daylight saving. The abbreviation now
 * comes from Intl via the `timeZoneName: 'short'` option instead.
 *
 * @param {string|number} resetValue - raw x-ratelimit-reset header value
 * @returns {string|number} formatted string, or the input unchanged when unparseable
 */
function formatReset(resetValue) {
  const ts = Number(resetValue);
  // Number('') === 0 and Number('abc') is NaN; both are falsy, so this single
  // check passes every unparseable header value through untouched.
  if (!ts) return resetValue;
  const resetDate = new Date(ts);
  const diffMs = resetDate - new Date();
  const local = resetDate.toLocaleString('en-AU', {
    timeZone: 'Australia/Melbourne',
    timeZoneName: 'short',
  });
  if (diffMs <= 0) return `${local} (already passed)`;
  const mins = Math.ceil(diffMs / 60000);
  if (mins < 60) return `${local} (in ${mins}m)`;
  const hrs = Math.floor(mins / 60);
  const remMins = mins % 60;
  return `${local} (in ${hrs}h ${remMins}m)`;
}
|
|
41
|
+
/**
 * Send one tiny completion request to OpenRouter and report HTTP status,
 * wall-clock latency, x-ratelimit-* headers, and token usage.
 * Exits 2 on HTTP 429 and 1 on any other non-200 status.
 */
async function probe() {
  console.log(`Probing ${alias} (${model})...\n`);
  const startedAt = Date.now();

  const response = await fetch('https://openrouter.ai/api/v1/chat/completions', {
    method: 'POST',
    headers: {
      'Authorization': `Bearer ${apiKey}`,
      'Content-Type': 'application/json',
    },
    body: JSON.stringify({
      model,
      messages: [{ role: 'user', content: 'Say "hello" and nothing else.' }],
      max_tokens: 10,
    }),
  });

  const elapsed = Date.now() - startedAt;
  const headerMap = Object.fromEntries(response.headers.entries());

  // Some deployments use the "-requests" suffixed header names; fall back to
  // the bare names, and to '?' when neither is present.
  const pickLimitHeader = (name) =>
    headerMap[`x-ratelimit-${name}-requests`] || headerMap[`x-ratelimit-${name}`] || '?';
  const rl = {
    limit: pickLimitHeader('limit'),
    remaining: pickLimitHeader('remaining'),
    reset: pickLimitHeader('reset'),
  };

  const body = await response.json();

  console.log(`Status: ${response.status} (${elapsed}ms)`);
  console.log(`Rate limit: ${rl.remaining}/${rl.limit} remaining`);
  console.log(`Resets: ${formatReset(rl.reset)}`);

  if (response.status === 429) {
    console.log('\n*** RATE LIMITED ***');
    console.log('Error:', body.error?.message || JSON.stringify(body));
    process.exit(2);
  }

  if (response.status !== 200) {
    console.log('\nError:', body.error?.message || JSON.stringify(body));
    process.exit(1);
  }

  const reply = body.choices?.[0]?.message?.content || '(empty)';
  const usage = body.usage || {};
  console.log(`Reply: "${reply.trim()}"`);
  console.log(`Tokens: ${usage.prompt_tokens || '?'} in / ${usage.completion_tokens || '?'} out`);
  if (body.id) console.log(`Request ID: ${body.id}`);
}

probe().catch((err) => {
  console.error('Fetch error:', err.message);
  process.exit(1);
});
|
@@ -0,0 +1,332 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import 'dotenv/config';
|
|
3
|
+
/**
|
|
4
|
+
* Token Budget Sensitivity Test
|
|
5
|
+
*
|
|
6
|
+
* Runs a dose-response curve measuring how constraining max_tokens affects
|
|
7
|
+
* evaluation scores. Useful for optimizing cost/latency without sacrificing quality.
|
|
8
|
+
*
|
|
9
|
+
* Usage:
|
|
10
|
+
* node scripts/test-token-budget.js [options]
|
|
11
|
+
*
|
|
12
|
+
* Options:
|
|
13
|
+
* --model <model> Ego model (default: openrouter.haiku)
|
|
14
|
+
* --levels <csv> Comma-separated max_tokens levels (default: 256,512,1024,2048,4000)
|
|
15
|
+
* --runs <n> Runs per level×cell (default: 4)
|
|
16
|
+
* --profiles <csv> Cell profiles (default: cell_1_base_single_unified,cell_5_recog_single_unified)
|
|
17
|
+
* --skip-judge Skip rubric evaluation (generate only, judge later)
|
|
18
|
+
* --parallelism <n> Parallelism per run (default: 2)
|
|
19
|
+
* --report-only <csv> Skip generation, just build report from existing run IDs
|
|
20
|
+
*/
|
|
21
|
+
|
|
22
|
+
import { execSync, execFileSync } from 'child_process';
|
|
23
|
+
import path from 'path';
|
|
24
|
+
import { fileURLToPath } from 'url';
|
|
25
|
+
import fs from 'fs';
|
|
26
|
+
import * as evaluationStore from '../services/evaluationStore.js';
|
|
27
|
+
|
|
28
|
+
// Resolve paths relative to this script (ES modules have no built-in __dirname).
const __dirname = path.dirname(fileURLToPath(import.meta.url));
// The eval CLI lives next to this script; exports/ sits at the package root.
const CLI_PATH = path.join(__dirname, 'eval-cli.js');
const EXPORTS_DIR = path.resolve(__dirname, '..', 'exports');
|
|
32
|
+
// Parse CLI arguments
/** Value following `--name` on the command line, or null when absent or in last position. */
function getOption(name) {
  const flagIndex = process.argv.indexOf(`--${name}`);
  if (flagIndex === -1) return null;
  const valueIndex = flagIndex + 1;
  return valueIndex < process.argv.length ? process.argv[valueIndex] : null;
}
37
|
+
/** True when the bare `--name` switch is present on the command line. */
function getFlag(name) {
  return process.argv.indexOf(`--${name}`) !== -1;
}
40
|
+
|
|
41
|
+
// CLI options — every one is optional; the defaults reproduce the standard
// base-vs-recognition two-cell comparison used by the paper.
const model = getOption('model') || 'openrouter.haiku';
const levels = (getOption('levels') || '256,512,1024,2048,4000').split(',').map(s => parseInt(s.trim(), 10));
const runsPerLevel = parseInt(getOption('runs') || '4', 10);
const profiles = (getOption('profiles') || 'cell_1_base_single_unified,cell_5_recog_single_unified').split(',').map(s => s.trim());
const skipJudge = getFlag('skip-judge');
// Kept as a string: it is forwarded verbatim to the eval CLI's --parallelism flag.
const parallelism = getOption('parallelism') || '2';
const reportOnly = getOption('report-only');

// Summary
// NOTE(review): the ×3 multiplier assumes three scenarios per profile run —
// confirm against eval-cli defaults; "Total evals" is printed as an estimate (~).
const totalEvals = levels.length * profiles.length * runsPerLevel * 3; // 3 scenarios
console.log('\n╔══════════════════════════════════════════════════╗');
console.log('║ Token Budget Sensitivity Test ║');
console.log('╚══════════════════════════════════════════════════╝');
console.log(` Model: ${model}`);
console.log(` Levels: ${levels.join(', ')}`);
console.log(` Profiles: ${profiles.join(', ')}`);
console.log(` Runs/level: ${runsPerLevel}`);
console.log(` Total evals: ~${totalEvals}`);
console.log(` Skip judge: ${skipJudge}`);
console.log('');
61
|
+
|
|
62
|
+
/**
 * Run evaluations for each token-budget level and collect the resulting run IDs.
 * In --report-only mode no generation happens: the supplied IDs are reused.
 * A level whose CLI invocation fails (or whose run ID cannot be parsed) is
 * logged and skipped rather than aborting the whole sweep.
 */
async function runAllLevels() {
  if (reportOnly) {
    const ids = reportOnly.split(',').map((s) => s.trim());
    console.log(`Report-only mode: using ${ids.length} existing run IDs\n`);
    return ids;
  }

  // Pull the run ID out of the CLI's stdout. Primary format is
  // "Run ID: eval-YYYY-MM-DD-XXXXXXXX"; fall back to a bare ID anywhere in the text.
  const extractRunId = (output) => {
    const primary = output.match(/Run ID:\s*(eval-[\w-]+)/);
    if (primary) return primary[1];
    const fallback = output.match(/(eval-\d{4}-\d{2}-\d{2}-[a-f0-9]+)/);
    return fallback ? fallback[1] : null;
  };

  const runIds = [];
  for (const level of levels) {
    const rule = '─'.repeat(60);
    console.log(`\n${rule}`);
    console.log(`Running max_tokens=${level}...`);
    console.log(`${rule}`);

    const cliArgs = [
      CLI_PATH,
      'run',
      '--profiles', profiles.join(','),
      '--runs', String(runsPerLevel),
      '--max-tokens', String(level),
      '--model', model,
      '--parallelism', parallelism,
      '--description', `Token budget test: max_tokens=${level}`,
    ];
    if (skipJudge) cliArgs.push('--skip-rubric');

    try {
      // stdout is captured for run-ID parsing; stdin/stderr pass through so
      // the child's progress output stays visible.
      const output = execFileSync('node', cliArgs, {
        encoding: 'utf-8',
        stdio: ['inherit', 'pipe', 'inherit'],
        timeout: 600_000, // 10 min per level
      });

      const runId = extractRunId(output);
      if (runId) {
        runIds.push(runId);
        console.log(` ✓ Completed: ${runId}`);
      } else {
        console.error(` ✗ Could not extract run ID for max_tokens=${level}`);
        console.error(' Output:', output.slice(-200));
      }
    } catch (err) {
      console.error(` ✗ Failed for max_tokens=${level}:`, err.message);
    }
  }

  return runIds;
}
121
|
+
|
|
122
|
+
/**
 * Build the dose-response report from completed run IDs.
 *
 * Aggregates per-(budget × profile) score statistics from the evaluation
 * store, detects likely truncation, prints a fixed-width dose-response table,
 * Cohen's-d effect sizes between the lowest and highest budget, and a raw
 * markdown data table. The report is echoed to the console and written to
 * exports/token-budget-sensitivity-<timestamp>.md.
 *
 * @param {string[]} runIds - evaluation run IDs to aggregate
 */
function buildReport(runIds) {
  console.log(`\n${'═'.repeat(60)}`);
  console.log(' BUILDING DOSE-RESPONSE REPORT');
  console.log(`${'═'.repeat(60)}\n`);

  // Collect data per level × profile
  const data = new Map(); // key: `${level}|${profileName}` → { scores, outputTokens, budget }

  for (const runId of runIds) {
    const results = evaluationStore.getResults(runId);
    if (results.length === 0) {
      console.warn(`  Warning: no results for ${runId}`);
      continue;
    }

    // Extract max_tokens from hyperparameters of first result.
    // The store may return hyperparameters as a JSON string or an object.
    const firstHyper = typeof results[0].hyperparameters === 'string'
      ? JSON.parse(results[0].hyperparameters || '{}')
      : (results[0].hyperparameters || {});
    const budget = firstHyper.max_tokens || null;

    if (!budget) {
      console.warn(`  Warning: no max_tokens in hyperparameters for ${runId}`);
      continue;
    }

    for (const r of results) {
      // Result rows may be snake_case (DB) or camelCase (in-memory) — accept both.
      const profile = r.profile_name || r.profileName;
      const score = r.overall_score;
      const outTokens = r.output_tokens || r.outputTokens || 0;
      const apiCalls = r.api_calls || r.apiCalls || 1;

      if (score == null) continue; // unjudged

      const key = `${budget}|${profile}`;
      if (!data.has(key)) {
        data.set(key, { budget, profile, scores: [], outputTokens: [], apiCalls: [] });
      }
      const entry = data.get(key);
      entry.scores.push(score);
      entry.outputTokens.push(outTokens);
      entry.apiCalls.push(apiCalls);
    }
  }

  if (data.size === 0) {
    console.log('No scored data found. Run without --skip-judge or judge the runs first.');
    return;
  }

  // Compute statistics (sample standard deviation, n-1 denominator).
  const stats = (arr) => {
    if (arr.length === 0) return { mean: 0, sd: 0, n: 0 };
    const n = arr.length;
    const mean = arr.reduce((a, b) => a + b, 0) / n;
    const sd = n > 1
      ? Math.sqrt(arr.reduce((sum, v) => sum + (v - mean) ** 2, 0) / (n - 1))
      : 0;
    return { mean, sd, n };
  };

  // Extract model alias from the model string ("provider.alias" → "alias").
  const modelAlias = model.includes('.') ? model.split('.').slice(1).join('.') : model;

  // Build table rows grouped by profile; budgets sorted numerically ascending.
  const profileNames = [...new Set([...data.values()].map(d => d.profile))].sort();
  const budgetLevels = [...new Set([...data.values()].map(d => d.budget))].sort((a, b) => a - b);

  // Format the report
  const lines = [];
  // Filesystem-safe timestamp for the output filename (colons/dots → dashes).
  const timestamp = new Date().toISOString().replace(/[:.]/g, '-').slice(0, 19);

  lines.push(`# Token Budget Sensitivity Test`);
  lines.push('');
  lines.push(`- **Date:** ${new Date().toISOString().slice(0, 10)}`);
  lines.push(`- **Model:** ${modelAlias}`);
  lines.push(`- **Runs per level×cell:** ${runsPerLevel}`);
  lines.push(`- **Run IDs:** ${runIds.join(', ')}`);
  lines.push('');

  // Build the table: human-friendly column labels per profile.
  const profileLabels = profileNames.map(p => {
    if (p.includes('base')) return `Base (${p})`;
    if (p.includes('recog')) return `Recognition (${p})`;
    return p;
  });

  // Header
  const colWidth = 28;
  let header = ' Budget |';
  let divider = '---------|';
  for (const label of profileLabels) {
    // Ellipsize labels that would overflow the fixed column width.
    const shortLabel = label.length > colWidth - 2 ? label.slice(0, colWidth - 5) + '...' : label;
    header += ` ${shortLabel.padEnd(colWidth)}|`;
    divider += `${'-'.repeat(colWidth + 1)}|`;
  }
  lines.push('## Dose-Response Table');
  lines.push('');
  lines.push('```');
  lines.push(header);
  lines.push(` | ${profileLabels.map(() => 'Mean SD N Trunc%'.padEnd(colWidth)).join('| ')}|`);
  lines.push(divider);

  for (const budget of budgetLevels) {
    let row = ` ${String(budget).padStart(5)} |`;
    for (const profile of profileNames) {
      const key = `${budget}|${profile}`;
      const entry = data.get(key);
      if (!entry) {
        // No data for this budget × profile cell — render an em-dash placeholder.
        row += ` ${'—'.padEnd(colWidth)}|`;
        continue;
      }

      const s = stats(entry.scores);
      // Truncation: per-row check whether output tokens >= budget × api_calls (within 95%).
      // output_tokens is cumulative across all API calls (including inner retries),
      // so we scale the threshold by api_calls to avoid false positives.
      const truncCount = entry.outputTokens.filter((t, i) => {
        const calls = entry.apiCalls[i] || 1;
        return t >= Math.floor(budget * calls * 0.95);
      }).length;
      const truncPct = entry.outputTokens.length > 0
        ? Math.round(100 * truncCount / entry.outputTokens.length)
        : 0;

      const cell = `${s.mean.toFixed(1).padStart(5)} ${s.sd.toFixed(1).padStart(5)} ${String(s.n).padStart(3)} ${String(truncPct).padStart(3)}%`;
      row += ` ${cell.padEnd(colWidth)}|`;
    }
    lines.push(row);
  }
  lines.push('```');
  lines.push('');

  // Effect size summary: highest vs lowest budget per profile (Cohen's d with
  // a pooled SD assuming equal group sizes).
  if (profileNames.length >= 2 && budgetLevels.length >= 2) {
    lines.push('## Key Observations');
    lines.push('');

    const highBudget = budgetLevels[budgetLevels.length - 1];
    const lowBudget = budgetLevels[0];

    for (const profile of profileNames) {
      const highKey = `${highBudget}|${profile}`;
      const lowKey = `${lowBudget}|${profile}`;
      const highEntry = data.get(highKey);
      const lowEntry = data.get(lowKey);

      if (highEntry && lowEntry) {
        const highStats = stats(highEntry.scores);
        const lowStats = stats(lowEntry.scores);
        const delta = highStats.mean - lowStats.mean;
        const pooledSD = Math.sqrt(((highStats.sd ** 2) + (lowStats.sd ** 2)) / 2);
        // Guard the zero-variance case so d never divides by zero.
        const d = pooledSD > 0 ? delta / pooledSD : 0;

        lines.push(`- **${profile}**: ${highBudget} vs ${lowBudget} tokens → Δ=${delta.toFixed(1)} pts (d=${d.toFixed(2)})`);
      }
    }
    lines.push('');
  }

  // Raw data table (same aggregates as above, in plain markdown form).
  lines.push('## Raw Data');
  lines.push('');
  lines.push('| Budget | Profile | N | Mean | SD | Trunc% |');
  lines.push('|--------|---------|---|------|-----|--------|');
  for (const budget of budgetLevels) {
    for (const profile of profileNames) {
      const key = `${budget}|${profile}`;
      const entry = data.get(key);
      if (!entry) continue;
      const s = stats(entry.scores);
      // Same truncation heuristic as in the dose-response table above.
      const truncCount = entry.outputTokens.filter((t, i) => {
        const calls = entry.apiCalls[i] || 1;
        return t >= Math.floor(budget * calls * 0.95);
      }).length;
      const truncPct = entry.outputTokens.length > 0
        ? Math.round(100 * truncCount / entry.outputTokens.length)
        : 0;
      lines.push(`| ${budget} | ${profile} | ${s.n} | ${s.mean.toFixed(1)} | ${s.sd.toFixed(1)} | ${truncPct}% |`);
    }
  }

  const report = lines.join('\n');

  // Print to console
  console.log(report);

  // Write to file
  if (!fs.existsSync(EXPORTS_DIR)) fs.mkdirSync(EXPORTS_DIR, { recursive: true });
  const outPath = path.join(EXPORTS_DIR, `token-budget-sensitivity-${timestamp}.md`);
  fs.writeFileSync(outPath, report + '\n');
  console.log(`\nReport written to: ${outPath}`);
}
318
|
+
|
|
319
|
+
// Main entry point: generate (or reuse) the runs, then emit the report.
(async () => {
  try {
    const runIds = await runAllLevels();
    if (runIds.length === 0) {
      console.log('\nNo runs completed. Nothing to report.');
      return;
    }
    buildReport(runIds);
  } catch (err) {
    console.error('Fatal error:', err);
    process.exit(1);
  }
})();