@machinespirits/eval 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. package/README.md +91 -9
  2. package/config/eval-settings.yaml +3 -3
  3. package/config/paper-manifest.json +486 -0
  4. package/config/providers.yaml +9 -6
  5. package/config/tutor-agents.yaml +2261 -0
  6. package/content/README.md +23 -0
  7. package/content/courses/479/course.md +53 -0
  8. package/content/courses/479/lecture-1.md +361 -0
  9. package/content/courses/479/lecture-2.md +360 -0
  10. package/content/courses/479/lecture-3.md +655 -0
  11. package/content/courses/479/lecture-4.md +530 -0
  12. package/content/courses/479/lecture-5.md +326 -0
  13. package/content/courses/479/lecture-6.md +346 -0
  14. package/content/courses/479/lecture-7.md +326 -0
  15. package/content/courses/479/lecture-8.md +273 -0
  16. package/content/courses/479/roadmap-slides.md +656 -0
  17. package/content/manifest.yaml +8 -0
  18. package/docs/research/build.sh +44 -20
  19. package/docs/research/figures/figure10.png +0 -0
  20. package/docs/research/figures/figure11.png +0 -0
  21. package/docs/research/figures/figure3.png +0 -0
  22. package/docs/research/figures/figure4.png +0 -0
  23. package/docs/research/figures/figure5.png +0 -0
  24. package/docs/research/figures/figure6.png +0 -0
  25. package/docs/research/figures/figure7.png +0 -0
  26. package/docs/research/figures/figure8.png +0 -0
  27. package/docs/research/figures/figure9.png +0 -0
  28. package/docs/research/header.tex +23 -2
  29. package/docs/research/paper-full.md +941 -285
  30. package/docs/research/paper-short.md +216 -585
  31. package/docs/research/references.bib +132 -0
  32. package/docs/research/slides-header.tex +188 -0
  33. package/docs/research/slides-pptx.md +363 -0
  34. package/docs/research/slides.md +531 -0
  35. package/docs/research/style-reference-pptx.py +199 -0
  36. package/package.json +6 -5
  37. package/scripts/analyze-eval-results.js +69 -17
  38. package/scripts/analyze-mechanism-traces.js +763 -0
  39. package/scripts/analyze-modulation-learning.js +498 -0
  40. package/scripts/analyze-prosthesis.js +144 -0
  41. package/scripts/analyze-run.js +264 -79
  42. package/scripts/assess-transcripts.js +853 -0
  43. package/scripts/browse-transcripts.js +854 -0
  44. package/scripts/check-parse-failures.js +73 -0
  45. package/scripts/code-dialectical-modulation.js +1320 -0
  46. package/scripts/download-data.sh +55 -0
  47. package/scripts/eval-cli.js +106 -18
  48. package/scripts/generate-paper-figures.js +663 -0
  49. package/scripts/generate-paper-figures.py +577 -76
  50. package/scripts/generate-paper-tables.js +299 -0
  51. package/scripts/qualitative-analysis-ai.js +3 -3
  52. package/scripts/render-sequence-diagram.js +694 -0
  53. package/scripts/test-latency.js +210 -0
  54. package/scripts/test-rate-limit.js +95 -0
  55. package/scripts/test-token-budget.js +332 -0
  56. package/scripts/validate-paper-manifest.js +670 -0
  57. package/services/__tests__/evalConfigLoader.test.js +2 -2
  58. package/services/__tests__/learnerRubricEvaluator.test.js +361 -0
  59. package/services/__tests__/learnerTutorInteractionEngine.test.js +326 -0
  60. package/services/evaluationRunner.js +975 -98
  61. package/services/evaluationStore.js +12 -4
  62. package/services/learnerTutorInteractionEngine.js +27 -2
  63. package/services/mockProvider.js +133 -0
  64. package/services/promptRewriter.js +1471 -5
  65. package/services/rubricEvaluator.js +55 -2
  66. package/services/transcriptFormatter.js +675 -0
  67. package/docs/EVALUATION-VARIABLES.md +0 -589
  68. package/docs/REPLICATION-PLAN.md +0 -577
  69. package/scripts/analyze-run.mjs +0 -282
  70. package/scripts/compare-runs.js +0 -44
  71. package/scripts/compare-suggestions.js +0 -80
  72. package/scripts/dig-into-run.js +0 -158
  73. package/scripts/show-failed-suggestions.js +0 -64
  74. package/scripts/{check-run.mjs → check-run.js} +0 -0
@@ -0,0 +1,210 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * Lightweight latency test for all configured models.
4
+ *
5
+ * Sends a single short prompt ("Say hello in one sentence.") to each model
6
+ * and reports latency, token counts, and cost in a compact table.
7
+ *
8
+ * Usage:
9
+ * node scripts/test-latency.js # all openrouter models (default)
10
+ * node scripts/test-latency.js --provider all # all providers
11
+ * node scripts/test-latency.js --models nemotron,glm5,kimi-k2.5
12
+ * node scripts/test-latency.js --serial # one at a time
13
+ * node scripts/test-latency.js --prompt "Explain Hegel in one sentence."
14
+ * node scripts/test-latency.js --max-tokens 500 # override max output tokens
15
+ * node scripts/test-latency.js --input "I don't understand why Hegel matters"
16
+ */
17
+
18
+ import 'dotenv/config';
19
+ import { unifiedAIProvider } from '@machinespirits/tutor-core';
20
+ import * as evalConfigLoader from '../services/evalConfigLoader.js';
21
+
// ── CLI args ────────────────────────────────────────────────────────────────

const args = process.argv.slice(2);

/**
 * Value following a flag, e.g. `--models a,b` → 'a,b'.
 * Returns null when the flag is absent or has no trailing value.
 */
const getArg = (flag) => {
  const position = args.indexOf(flag);
  if (position === -1) return null;
  const value = args[position + 1];
  return value === undefined ? null : value;
};

/** True when a bare boolean flag (e.g. `--serial`) is present. */
const hasFlag = (flag) => args.includes(flag);

const serial = hasFlag('--serial');
const maxTokens = parseInt(getArg('--max-tokens') || '200', 10);
const providerFilter = getArg('--provider') || 'openrouter';

// Optional comma-separated alias filter; null means "test every alias".
const rawModels = getArg('--models');
const modelsFilter = rawModels == null ? null : rawModels.split(',').map((name) => name.trim());

const defaultInput = "I keep reading about Hegel's master-slave dialectic but I don't really get why it matters. Can you explain it simply?";
const learnerInput = getArg('--input') || getArg('--prompt') || defaultInput;
const systemPrompt = 'You are a philosophy tutor. Respond helpfully and concisely to the learner.';
// ── Discover models ─────────────────────────────────────────────────────────

/**
 * Build the list of { provider, alias, modelId } targets from the providers
 * config, honouring the --provider and --models CLI filters. The 'local'
 * provider is always skipped. Exits the process when no provider config
 * can be loaded.
 */
function discoverModels() {
  const config = evalConfigLoader.loadProviders();
  if (!config?.providers) {
    console.error('No providers found in config/providers.yaml');
    process.exit(1);
  }

  // Predicates mirroring the CLI filters.
  const wantProvider = (name) =>
    name !== 'local' && (providerFilter === 'all' || name === providerFilter);
  const wantAlias = (alias) => !modelsFilter || modelsFilter.includes(alias);

  return Object.entries(config.providers)
    .filter(([name]) => wantProvider(name))
    .flatMap(([name, providerConfig]) =>
      Object.entries(providerConfig.models || {})
        .filter(([alias]) => wantAlias(alias))
        .map(([alias, modelId]) => ({ provider: name, alias, modelId }))
    );
}
// ── Test a single model ─────────────────────────────────────────────────────

/**
 * Send one prompt to a single model and time the round trip.
 *
 * Resolves to a result record { label, modelId, status, ... } where status is
 * 'ok' (with latency/token/cost/content fields), 'skip' (no API key for the
 * provider), or 'error' (message truncated to 100 chars). Never rejects —
 * failures are folded into the returned record so Promise.all can be used.
 */
async function testModel({ provider, alias, modelId }) {
  // Disambiguate aliases with the provider name only in --provider all mode.
  const label = providerFilter === 'all' ? `${provider}.${alias}` : alias;
  try {
    const resolved = evalConfigLoader.resolveModel({ provider, model: alias });
    if (!resolved.isConfigured) {
      return { label, modelId, status: 'skip', reason: 'no API key' };
    }

    const startedAt = Date.now();
    const response = await unifiedAIProvider.call({
      provider,
      model: resolved.model,
      systemPrompt,
      messages: [{ role: 'user', content: learnerInput }],
      config: { temperature: 0.3, maxTokens },
    });
    const wallMs = Date.now() - startedAt;

    const usage = response.usage ?? {};
    return {
      label,
      modelId: resolved.model,
      status: 'ok',
      // Prefer the provider-reported latency; fall back to our wall clock.
      latencyMs: response.latencyMs || wallMs,
      wallMs,
      inputTokens: usage.inputTokens || 0,
      outputTokens: usage.outputTokens || 0,
      cost: usage.cost || 0,
      // Collapse whitespace so the response fits on one table row.
      content: (response.content || '').replace(/\s+/g, ' ').trim(),
    };
  } catch (error) {
    return { label, modelId, status: 'error', reason: error.message.substring(0, 100) };
  }
}
// ── Formatting helpers ──────────────────────────────────────────────────────

/** Render a millisecond duration as e.g. "850ms" or "1.5s". */
function formatLatency(ms) {
  if (ms >= 1000) {
    const seconds = (ms / 1000).toFixed(1);
    return `${seconds}s`;
  }
  return `${ms}ms`;
}
/**
 * Render a dollar cost: " --" for zero/unknown, six decimals for amounts
 * under a tenth of a cent, otherwise four decimals.
 */
function formatCost(cost) {
  if (!cost) return ' --';
  const decimals = cost < 0.001 ? 6 : 4;
  return `$${cost.toFixed(decimals)}`;
}
/** Render a 20-cell horizontal bar scaled so that maxMs fills the bar. */
function bar(ms, maxMs) {
  const cells = 20;
  const filledCells = Math.round((ms / maxMs) * cells);
  return '█'.repeat(filledCells) + '░'.repeat(cells - filledCells);
}
// ── Main ────────────────────────────────────────────────────────────────────

// Resolve targets from config + CLI filters; abort if nothing matched.
const targets = discoverModels();
if (targets.length === 0) {
  console.error('No models matched. Check --provider / --models flags.');
  process.exit(1);
}

console.log(`\nTesting ${targets.length} model(s) ${serial ? 'sequentially' : 'in parallel'} (max ${maxTokens} tokens)...`);
console.log(`Input: "${learnerInput}"\n`);

// --serial probes one model at a time with per-model progress output;
// the default fires all requests concurrently.
let results;
if (serial) {
  results = [];
  for (const t of targets) {
    process.stdout.write(` ${t.alias} ... `);
    const r = await testModel(t);
    if (r.status === 'ok') {
      process.stdout.write(`${formatLatency(r.latencyMs)} (${r.inputTokens}→${r.outputTokens} tok)\n`);
    } else {
      process.stdout.write(`${r.status}: ${r.reason || ''}\n`);
    }
    results.push(r);
  }
} else {
  // testModel never rejects, so Promise.all cannot fail here.
  results = await Promise.all(targets.map(t => testModel(t)));
}

// ── Table output ────────────────────────────────────────────────────────────

// Successful probes sorted fastest-first; everything else goes to a
// separate "Failed/Skipped" list below.
const ok = results.filter(r => r.status === 'ok').sort((a, b) => a.latencyMs - b.latencyMs);
const failed = results.filter(r => r.status !== 'ok');

if (ok.length > 0) {
  // Slowest latency anchors the bar chart; column widths adapt to content.
  const maxMs = ok[ok.length - 1].latencyMs;
  const labelW = Math.max(12, ...ok.map(r => r.label.length));
  const modelW = Math.max(15, ...ok.map(r => r.modelId.length));
  const sep = '─'.repeat(labelW + modelW + 68);

  console.log(`\n${sep}`);
  console.log(
    ' ' + 'Alias'.padEnd(labelW) +
    ' ' + 'Model'.padEnd(modelW) +
    ' ' + 'Latency'.padStart(7) +
    ' ' + 'In'.padStart(4) +
    ' ' + 'Out'.padStart(4) +
    ' ' + 'Cost'.padStart(9) +
    ' ' + 'Bar'.padEnd(20) +
    ' Response'
  );
  console.log(sep);

  for (const r of ok) {
    console.log(
      ' ' + r.label.padEnd(labelW) +
      ' ' + r.modelId.padEnd(modelW) +
      ' ' + formatLatency(r.latencyMs).padStart(7) +
      ' ' + String(r.inputTokens).padStart(4) +
      ' ' + String(r.outputTokens).padStart(4) +
      ' ' + formatCost(r.cost).padStart(9) +
      ' ' + bar(r.latencyMs, maxMs) +
      ' ' + r.content.substring(0, 35)
    );
  }
  console.log(sep);
}

if (failed.length > 0) {
  console.log('\nFailed/Skipped:');
  for (const r of failed) {
    console.log(` ${r.label}: ${r.reason || r.status}`);
  }
}

// ── Summary ─────────────────────────────────────────────────────────────────

if (ok.length > 0) {
  const fastest = ok[0];
  const slowest = ok[ok.length - 1];
  // Middle element of the latency-sorted list (upper-middle for even counts).
  const median = ok[Math.floor(ok.length / 2)];
  const totalCost = ok.reduce((s, r) => s + r.cost, 0);
  const avgLatency = Math.round(ok.reduce((s, r) => s + r.latencyMs, 0) / ok.length);
  console.log(`\n${ok.length} succeeded, ${failed.length} failed`);
  console.log(` Fastest: ${fastest.label} (${formatLatency(fastest.latencyMs)})`);
  console.log(` Median: ${median.label} (${formatLatency(median.latencyMs)})`);
  console.log(` Slowest: ${slowest.label} (${formatLatency(slowest.latencyMs)})`);
  console.log(` Average: ${formatLatency(avgLatency)}`);
  // Free-tier models report zero cost, so only print a total when non-zero.
  if (totalCost > 0) console.log(` Total cost: ${formatCost(totalCost)}`);
}
@@ -0,0 +1,95 @@
1
+ #!/usr/bin/env node
2
+ import 'dotenv/config';
3
+ /**
4
+ * Quick rate-limit probe for OpenRouter models.
5
+ * Usage: node scripts/test-rate-limit.js [model-alias]
6
+ * Default: nemotron
7
+ */
8
+
// Alias → OpenRouter model ID shortcuts; unknown aliases pass through verbatim.
const MODEL_MAP = {
  nemotron: 'nvidia/nemotron-3-nano-30b-a3b:free',
  glm47: 'z-ai/glm-4.7',
  'kimi-k2.5': 'moonshotai/kimi-k2.5',
  deepseek: 'deepseek/deepseek-v3.2',
  haiku: 'anthropic/claude-haiku-4.5',
};

// First positional argument selects the model (default: nemotron).
const [, , aliasArg] = process.argv;
const alias = aliasArg || 'nemotron';
const model = MODEL_MAP[alias] || alias;

// Fail fast when the OpenRouter credential is missing.
const apiKey = process.env.OPENROUTER_API_KEY;
if (!apiKey) {
  console.error('OPENROUTER_API_KEY not set');
  process.exit(1);
}
/**
 * Render a rate-limit reset header value as a Melbourne-local timestamp plus
 * a relative countdown, e.g. "... AEDT (in 42m)" or "... AEDT (in 2h 5m)".
 * Values that do not parse to a usable number are returned unchanged.
 */
function formatReset(resetValue) {
  const ts = Number(resetValue);
  if (!ts || isNaN(ts)) return resetValue;

  const resetDate = new Date(ts);
  const local = resetDate.toLocaleString('en-AU', { timeZone: 'Australia/Melbourne' });
  const diffMs = resetDate - new Date();
  if (diffMs <= 0) return `${local} AEDT (already passed)`;

  const mins = Math.ceil(diffMs / 60000);
  if (mins < 60) return `${local} AEDT (in ${mins}m)`;
  return `${local} AEDT (in ${Math.floor(mins / 60)}h ${mins % 60}m)`;
}
/**
 * Send one minimal chat completion to OpenRouter and report the response
 * status, x-ratelimit-* headers, reply text, and token usage.
 * Exit codes: 2 = rate limited (HTTP 429), 1 = other non-200 response.
 */
async function probe() {
  console.log(`Probing ${alias} (${model})...\n`);
  const start = Date.now();

  const res = await fetch('https://openrouter.ai/api/v1/chat/completions', {
    method: 'POST',
    headers: {
      'Authorization': `Bearer ${apiKey}`,
      'Content-Type': 'application/json',
    },
    body: JSON.stringify({
      model,
      messages: [{ role: 'user', content: 'Say "hello" and nothing else.' }],
      // Keep the probe cheap — we only care about headers and status.
      max_tokens: 10,
    }),
  });

  const elapsed = Date.now() - start;
  // fetch's Headers iterator yields lowercase names, so lookups below
  // use lowercase keys.
  const headers = Object.fromEntries(res.headers.entries());

  // Rate limit headers — prefer the per-request variants, fall back to the
  // generic ones, then '?' when the server sent neither.
  const rl = {
    limit: headers['x-ratelimit-limit-requests'] || headers['x-ratelimit-limit'] || '?',
    remaining: headers['x-ratelimit-remaining-requests'] || headers['x-ratelimit-remaining'] || '?',
    reset: headers['x-ratelimit-reset-requests'] || headers['x-ratelimit-reset'] || '?',
  };

  const body = await res.json();

  console.log(`Status: ${res.status} (${elapsed}ms)`);
  console.log(`Rate limit: ${rl.remaining}/${rl.limit} remaining`);
  console.log(`Resets: ${formatReset(rl.reset)}`);

  // Distinct exit code for rate limiting so callers/scripts can branch on it.
  if (res.status === 429) {
    console.log('\n*** RATE LIMITED ***');
    console.log('Error:', body.error?.message || JSON.stringify(body));
    process.exit(2);
  }

  if (res.status !== 200) {
    console.log('\nError:', body.error?.message || JSON.stringify(body));
    process.exit(1);
  }

  const reply = body.choices?.[0]?.message?.content || '(empty)';
  const usage = body.usage || {};
  console.log(`Reply: "${reply.trim()}"`);
  console.log(`Tokens: ${usage.prompt_tokens || '?'} in / ${usage.completion_tokens || '?'} out`);
  if (body.id) console.log(`Request ID: ${body.id}`);
}

// Network-level failures (DNS, timeouts) land here rather than in probe().
probe().catch(err => {
  console.error('Fetch error:', err.message);
  process.exit(1);
});
@@ -0,0 +1,332 @@
1
+ #!/usr/bin/env node
2
+ import 'dotenv/config';
3
+ /**
4
+ * Token Budget Sensitivity Test
5
+ *
6
+ * Runs a dose-response curve measuring how constraining max_tokens affects
7
+ * evaluation scores. Useful for optimizing cost/latency without sacrificing quality.
8
+ *
9
+ * Usage:
10
+ * node scripts/test-token-budget.js [options]
11
+ *
12
+ * Options:
13
+ * --model <model> Ego model (default: openrouter.haiku)
14
+ * --levels <csv> Comma-separated max_tokens levels (default: 256,512,1024,2048,4000)
15
+ * --runs <n> Runs per level×cell (default: 4)
16
+ * --profiles <csv> Cell profiles (default: cell_1_base_single_unified,cell_5_recog_single_unified)
17
+ * --skip-judge Skip rubric evaluation (generate only, judge later)
18
+ * --parallelism <n> Parallelism per run (default: 2)
19
+ * --report-only <csv> Skip generation, just build report from existing run IDs
20
+ */
21
+
22
+ import { execSync, execFileSync } from 'child_process';
23
+ import path from 'path';
24
+ import { fileURLToPath } from 'url';
25
+ import fs from 'fs';
26
+ import * as evaluationStore from '../services/evaluationStore.js';
27
+
const __dirname = path.dirname(fileURLToPath(import.meta.url));
const CLI_PATH = path.join(__dirname, 'eval-cli.js');
const EXPORTS_DIR = path.join(__dirname, '..', 'exports');

// Parse CLI arguments
/** Value of `--name <value>`, or null when the option is absent. */
function getOption(name) {
  const at = process.argv.indexOf(`--${name}`);
  if (at === -1 || at + 1 >= process.argv.length) return null;
  return process.argv[at + 1];
}

/** True when the bare flag `--name` is present. */
function getFlag(name) {
  return process.argv.includes(`--${name}`);
}

const model = getOption('model') || 'openrouter.haiku';
const levels = (getOption('levels') || '256,512,1024,2048,4000')
  .split(',')
  .map((raw) => parseInt(raw.trim(), 10));
const runsPerLevel = parseInt(getOption('runs') || '4', 10);
const profiles = (getOption('profiles') || 'cell_1_base_single_unified,cell_5_recog_single_unified')
  .split(',')
  .map((raw) => raw.trim());
const skipJudge = getFlag('skip-judge');
const parallelism = getOption('parallelism') || '2';
const reportOnly = getOption('report-only');

// Summary
const totalEvals = levels.length * profiles.length * runsPerLevel * 3; // 3 scenarios
const banner = [
  '\n╔══════════════════════════════════════════════════╗',
  '║ Token Budget Sensitivity Test ║',
  '╚══════════════════════════════════════════════════╝',
  ` Model: ${model}`,
  ` Levels: ${levels.join(', ')}`,
  ` Profiles: ${profiles.join(', ')}`,
  ` Runs/level: ${runsPerLevel}`,
  ` Total evals: ~${totalEvals}`,
  ` Skip judge: ${skipJudge}`,
  '',
];
for (const line of banner) console.log(line);
/**
 * Run evaluations for each token budget level and collect run IDs.
 * In --report-only mode, generation is skipped and the supplied IDs are
 * returned as-is. Individual level failures are logged and skipped.
 */
async function runAllLevels() {
  if (reportOnly) {
    const ids = reportOnly.split(',').map(s => s.trim());
    console.log(`Report-only mode: using ${ids.length} existing run IDs\n`);
    return ids;
  }

  const runIds = [];
  const rule = '─'.repeat(60);

  for (const level of levels) {
    console.log(`\n${rule}`);
    console.log(`Running max_tokens=${level}...`);
    console.log(rule);

    const cliArgs = [
      CLI_PATH,
      'run',
      '--profiles', profiles.join(','),
      '--runs', String(runsPerLevel),
      '--max-tokens', String(level),
      '--model', model,
      '--parallelism', parallelism,
      '--description', `Token budget test: max_tokens=${level}`,
    ];
    if (skipJudge) cliArgs.push('--skip-rubric');

    try {
      // Capture stdout to scrape the run ID; stderr/stdin stay on the console.
      const output = execFileSync('node', cliArgs, {
        encoding: 'utf-8',
        stdio: ['inherit', 'pipe', 'inherit'],
        timeout: 600_000, // 10 min per level
      });

      // The CLI prints "Run ID: eval-…"; fall back to a bare eval-… token.
      const id =
        output.match(/Run ID:\s*(eval-[\w-]+)/)?.[1] ??
        output.match(/(eval-\d{4}-\d{2}-\d{2}-[a-f0-9]+)/)?.[1] ??
        null;

      if (id) {
        runIds.push(id);
        console.log(` ✓ Completed: ${id}`);
      } else {
        console.error(` ✗ Could not extract run ID for max_tokens=${level}`);
        console.error(' Output:', output.slice(-200));
      }
    } catch (err) {
      console.error(` ✗ Failed for max_tokens=${level}:`, err.message);
    }
  }

  return runIds;
}
121
+
/**
 * Build the dose-response report from completed run IDs.
 *
 * Aggregates per-result scores, output-token counts, and API-call counts by
 * max_tokens budget × profile, then emits a fixed-width dose-response table,
 * per-profile effect sizes (Cohen's d, high vs low budget), and a raw-data
 * markdown table. The report is printed to the console and written to
 * exports/token-budget-sensitivity-<timestamp>.md.
 *
 * Fix: the truncation-percentage computation was duplicated verbatim in the
 * dose-response and raw-data sections; it is now a single helper (truncPct).
 */
function buildReport(runIds) {
  console.log(`\n${'═'.repeat(60)}`);
  console.log(' BUILDING DOSE-RESPONSE REPORT');
  console.log(`${'═'.repeat(60)}\n`);

  // Collect data per level × profile
  const data = new Map(); // key: `${level}|${profileName}` → { scores, outputTokens, budget }

  for (const runId of runIds) {
    const results = evaluationStore.getResults(runId);
    if (results.length === 0) {
      console.warn(` Warning: no results for ${runId}`);
      continue;
    }

    // Extract max_tokens from hyperparameters of first result
    // (stored either as a JSON string or an already-parsed object).
    const firstHyper = typeof results[0].hyperparameters === 'string'
      ? JSON.parse(results[0].hyperparameters || '{}')
      : (results[0].hyperparameters || {});
    const budget = firstHyper.max_tokens || null;

    if (!budget) {
      console.warn(` Warning: no max_tokens in hyperparameters for ${runId}`);
      continue;
    }

    for (const r of results) {
      // Field names vary between snake_case (DB rows) and camelCase.
      const profile = r.profile_name || r.profileName;
      const score = r.overall_score;
      const outTokens = r.output_tokens || r.outputTokens || 0;
      const apiCalls = r.api_calls || r.apiCalls || 1;

      if (score == null) continue; // unjudged

      const key = `${budget}|${profile}`;
      if (!data.has(key)) {
        data.set(key, { budget, profile, scores: [], outputTokens: [], apiCalls: [] });
      }
      const entry = data.get(key);
      entry.scores.push(score);
      entry.outputTokens.push(outTokens);
      entry.apiCalls.push(apiCalls);
    }
  }

  if (data.size === 0) {
    console.log('No scored data found. Run without --skip-judge or judge the runs first.');
    return;
  }

  // Sample mean and (n−1) standard deviation.
  const stats = (arr) => {
    if (arr.length === 0) return { mean: 0, sd: 0, n: 0 };
    const n = arr.length;
    const mean = arr.reduce((a, b) => a + b, 0) / n;
    const sd = n > 1
      ? Math.sqrt(arr.reduce((sum, v) => sum + (v - mean) ** 2, 0) / (n - 1))
      : 0;
    return { mean, sd, n };
  };

  // Truncation rate: percentage of rows whose cumulative output tokens
  // reached ≥95% of budget × api_calls. output_tokens is cumulative across
  // all API calls (including inner retries), so the threshold is scaled by
  // api_calls to avoid false positives.
  const truncPct = (entry, budget) => {
    if (entry.outputTokens.length === 0) return 0;
    const truncCount = entry.outputTokens.filter((t, i) => {
      const calls = entry.apiCalls[i] || 1;
      return t >= Math.floor(budget * calls * 0.95);
    }).length;
    return Math.round(100 * truncCount / entry.outputTokens.length);
  };

  // Extract model alias from the model string (e.g. "openrouter.haiku" → "haiku")
  const modelAlias = model.includes('.') ? model.split('.').slice(1).join('.') : model;

  // Build table rows grouped by profile
  const profileNames = [...new Set([...data.values()].map(d => d.profile))].sort();
  const budgetLevels = [...new Set([...data.values()].map(d => d.budget))].sort((a, b) => a - b);

  // Format the report
  const lines = [];
  const timestamp = new Date().toISOString().replace(/[:.]/g, '-').slice(0, 19);

  lines.push(`# Token Budget Sensitivity Test`);
  lines.push('');
  lines.push(`- **Date:** ${new Date().toISOString().slice(0, 10)}`);
  lines.push(`- **Model:** ${modelAlias}`);
  lines.push(`- **Runs per level×cell:** ${runsPerLevel}`);
  lines.push(`- **Run IDs:** ${runIds.join(', ')}`);
  lines.push('');

  // Human-friendly column labels for the two standard cell profiles.
  const profileLabels = profileNames.map(p => {
    if (p.includes('base')) return `Base (${p})`;
    if (p.includes('recog')) return `Recognition (${p})`;
    return p;
  });

  // Header
  const colWidth = 28;
  let header = ' Budget |';
  let divider = '---------|';
  for (const label of profileLabels) {
    const shortLabel = label.length > colWidth - 2 ? label.slice(0, colWidth - 5) + '...' : label;
    header += ` ${shortLabel.padEnd(colWidth)}|`;
    divider += `${'-'.repeat(colWidth + 1)}|`;
  }
  lines.push('## Dose-Response Table');
  lines.push('');
  lines.push('```');
  lines.push(header);
  lines.push(` | ${profileLabels.map(() => 'Mean SD N Trunc%'.padEnd(colWidth)).join('| ')}|`);
  lines.push(divider);

  for (const budget of budgetLevels) {
    let row = ` ${String(budget).padStart(5)} |`;
    for (const profile of profileNames) {
      const entry = data.get(`${budget}|${profile}`);
      if (!entry) {
        // Em-dash cell when this level × profile combination has no data.
        row += ` ${'—'.padEnd(colWidth)}|`;
        continue;
      }

      const s = stats(entry.scores);
      const cell = `${s.mean.toFixed(1).padStart(5)} ${s.sd.toFixed(1).padStart(5)} ${String(s.n).padStart(3)} ${String(truncPct(entry, budget)).padStart(3)}%`;
      row += ` ${cell.padEnd(colWidth)}|`;
    }
    lines.push(row);
  }
  lines.push('```');
  lines.push('');

  // Effect size summary: Δ mean and Cohen's d (pooled SD) between the
  // highest and lowest budget levels, per profile.
  if (profileNames.length >= 2 && budgetLevels.length >= 2) {
    lines.push('## Key Observations');
    lines.push('');

    const highBudget = budgetLevels[budgetLevels.length - 1];
    const lowBudget = budgetLevels[0];

    for (const profile of profileNames) {
      const highEntry = data.get(`${highBudget}|${profile}`);
      const lowEntry = data.get(`${lowBudget}|${profile}`);

      if (highEntry && lowEntry) {
        const highStats = stats(highEntry.scores);
        const lowStats = stats(lowEntry.scores);
        const delta = highStats.mean - lowStats.mean;
        const pooledSD = Math.sqrt(((highStats.sd ** 2) + (lowStats.sd ** 2)) / 2);
        const d = pooledSD > 0 ? delta / pooledSD : 0;

        lines.push(`- **${profile}**: ${highBudget} vs ${lowBudget} tokens → Δ=${delta.toFixed(1)} pts (d=${d.toFixed(2)})`);
      }
    }
    lines.push('');
  }

  // Raw data table
  lines.push('## Raw Data');
  lines.push('');
  lines.push('| Budget | Profile | N | Mean | SD | Trunc% |');
  lines.push('|--------|---------|---|------|-----|--------|');
  for (const budget of budgetLevels) {
    for (const profile of profileNames) {
      const entry = data.get(`${budget}|${profile}`);
      if (!entry) continue;
      const s = stats(entry.scores);
      lines.push(`| ${budget} | ${profile} | ${s.n} | ${s.mean.toFixed(1)} | ${s.sd.toFixed(1)} | ${truncPct(entry, budget)}% |`);
    }
  }

  const report = lines.join('\n');

  // Print to console
  console.log(report);

  // Write to file
  if (!fs.existsSync(EXPORTS_DIR)) fs.mkdirSync(EXPORTS_DIR, { recursive: true });
  const outPath = path.join(EXPORTS_DIR, `token-budget-sensitivity-${timestamp}.md`);
  fs.writeFileSync(outPath, report + '\n');
  console.log(`\nReport written to: ${outPath}`);
}
// Main entry point: generate (or reuse) the runs, then report on them.
// Any error escaping the pipeline is fatal and sets a non-zero exit code.
(async () => {
  const runIds = await runAllLevels();
  if (runIds.length === 0) {
    console.log('\nNo runs completed. Nothing to report.');
    return;
  }
  buildReport(runIds);
})().catch((err) => {
  console.error('Fatal error:', err);
  process.exit(1);
});