@machinespirits/eval 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. package/components/MobileEvalDashboard.tsx +267 -0
  2. package/components/comparison/DeltaAnalysisTable.tsx +137 -0
  3. package/components/comparison/ProfileComparisonCard.tsx +176 -0
  4. package/components/comparison/RecognitionABMode.tsx +385 -0
  5. package/components/comparison/RecognitionMetricsPanel.tsx +135 -0
  6. package/components/comparison/WinnerIndicator.tsx +64 -0
  7. package/components/comparison/index.ts +5 -0
  8. package/components/mobile/BottomSheet.tsx +233 -0
  9. package/components/mobile/DimensionBreakdown.tsx +210 -0
  10. package/components/mobile/DocsView.tsx +363 -0
  11. package/components/mobile/LogsView.tsx +481 -0
  12. package/components/mobile/PsychodynamicQuadrant.tsx +261 -0
  13. package/components/mobile/QuickTestView.tsx +1098 -0
  14. package/components/mobile/RecognitionTypeChart.tsx +124 -0
  15. package/components/mobile/RecognitionView.tsx +809 -0
  16. package/components/mobile/RunDetailView.tsx +261 -0
  17. package/components/mobile/RunHistoryView.tsx +367 -0
  18. package/components/mobile/ScoreRadial.tsx +211 -0
  19. package/components/mobile/StreamingLogPanel.tsx +230 -0
  20. package/components/mobile/SynthesisStrategyChart.tsx +140 -0
  21. package/config/interaction-eval-scenarios.yaml +832 -0
  22. package/config/learner-agents.yaml +248 -0
  23. package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +52 -0
  24. package/docs/research/ABLATION-MODEL-SELECTION.md +53 -0
  25. package/docs/research/ADVANCED-EVAL-ANALYSIS.md +60 -0
  26. package/docs/research/ANOVA-RESULTS-2026-01-14.md +257 -0
  27. package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +586 -0
  28. package/docs/research/COST-ANALYSIS.md +56 -0
  29. package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +340 -0
  30. package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +291 -0
  31. package/docs/research/EVAL-SYSTEM-ANALYSIS.md +306 -0
  32. package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +301 -0
  33. package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +1988 -0
  34. package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +282 -0
  35. package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +147 -0
  36. package/docs/research/PAPER-EXTENSION-DYADIC.md +204 -0
  37. package/docs/research/PAPER-UNIFIED.md +659 -0
  38. package/docs/research/PAPER-UNIFIED.pdf +0 -0
  39. package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +356 -0
  40. package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +419 -0
  41. package/docs/research/apa.csl +2133 -0
  42. package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +1637 -0
  43. package/docs/research/archive/paper-multiagent-tutor.tex +978 -0
  44. package/docs/research/paper-draft/full-paper.md +136 -0
  45. package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
  46. package/docs/research/paper-draft/references.bib +515 -0
  47. package/docs/research/transcript-baseline.md +139 -0
  48. package/docs/research/transcript-recognition-multiagent.md +187 -0
  49. package/hooks/useEvalData.ts +625 -0
  50. package/index.js +27 -0
  51. package/package.json +73 -0
  52. package/routes/evalRoutes.js +3002 -0
  53. package/scripts/advanced-eval-analysis.js +351 -0
  54. package/scripts/analyze-eval-costs.js +378 -0
  55. package/scripts/analyze-eval-results.js +513 -0
  56. package/scripts/analyze-interaction-evals.js +368 -0
  57. package/server-init.js +45 -0
  58. package/server.js +162 -0
  59. package/services/benchmarkService.js +1892 -0
  60. package/services/evaluationRunner.js +739 -0
  61. package/services/evaluationStore.js +1121 -0
  62. package/services/learnerConfigLoader.js +385 -0
  63. package/services/learnerTutorInteractionEngine.js +857 -0
  64. package/services/memory/learnerMemoryService.js +1227 -0
  65. package/services/memory/learnerWritingPad.js +577 -0
  66. package/services/memory/tutorWritingPad.js +674 -0
  67. package/services/promptRecommendationService.js +493 -0
  68. package/services/rubricEvaluator.js +826 -0
@@ -0,0 +1,378 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * Analyze Evaluation Costs
4
+ *
5
+ * Calculates token usage and costs for evaluation runs.
6
+ * Supports both scripted and dynamic learner evaluations.
7
+ *
8
+ * Usage:
9
+ * node scripts/analyze-eval-costs.js # Analyze all battery evals in logs/interaction-evals
10
+ * node scripts/analyze-eval-costs.js --battery # Analyze battery scenarios
11
+ * node scripts/analyze-eval-costs.js --file <path> # Analyze specific file
12
+ * node scripts/analyze-eval-costs.js --summary # Show cost summary only
13
+ */
14
+
15
+ import fs from 'fs';
16
+ import path from 'path';
17
+ import { fileURLToPath } from 'url';
18
+
19
// ES modules do not provide __filename / __dirname, so derive both from this
// module's own URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// Parent of the directory containing this script; used as the base for the
// log and report paths built further down.
const PROJECT_ROOT = path.resolve(__dirname, '..');
22
+
23
+ // ============================================================================
24
+ // Model Pricing (per million tokens) - Update as needed
25
+ // ============================================================================
26
+
27
/**
 * Price table keyed by model id or short alias.
 *
 * Each value is `{ input, output, name }`: `input` and `output` are the dollar
 * price per million tokens, `name` is the label shown in reports.
 *
 * NOTE(review): these are hard-coded snapshots; confirm them against the
 * provider's current price list before trusting the dollar figures.
 */
const MODEL_PRICING = Object.fromEntries(
  [
    // OpenRouter pricing as of January 2026
    // [model id, input $/M, output $/M, display name]
    ['nvidia/nemotron-3-nano-30b-a3b:free', 0, 0, 'Nemotron 3 Nano 30B (free)'],
    ['anthropic/claude-sonnet-4.5', 3.0, 15.0, 'Claude Sonnet 4.5'],
    ['anthropic/claude-haiku-4.5', 0.8, 4.0, 'Claude Haiku 4.5'],
    ['openai/gpt-5.2', 2.5, 10.0, 'GPT-5.2'],
    ['google/gemini-3-pro-preview', 1.25, 5.0, 'Gemini 3 Pro'],
    // Aliases
    ['nemotron', 0, 0, 'Nemotron 3 Nano 30B (free)'],
    ['sonnet', 3.0, 15.0, 'Claude Sonnet 4.5'],
  ].map(([modelId, input, output, name]) => [modelId, { input, output, name }]),
);
66
+
67
/**
 * Default model alias for each role in an evaluation run.
 *
 * The four tutor/learner roles share one alias; the judge uses another. The
 * values are keys into the pricing table.
 */
const DEFAULT_MODELS = {
  ...Object.fromEntries(
    ['tutor_ego', 'tutor_superego', 'learner_ego', 'learner_superego'].map((role) => [role, 'nemotron']),
  ),
  judge: 'sonnet',
};
75
+
76
+ // ============================================================================
77
+ // Cost Calculation Functions
78
+ // ============================================================================
79
+
80
/**
 * Price a number of input and output tokens for one model.
 *
 * @param {number} inputTokens - Prompt/context tokens.
 * @param {number} outputTokens - Generated tokens.
 * @param {string} model - Key into MODEL_PRICING (full id or alias).
 * @returns {{inputCost: number, outputCost: number, totalCost: number, model: string}}
 *   Costs in dollars plus the display name of the pricing entry that was used.
 */
function calculateCost(inputTokens, outputTokens, model) {
  // Only accept keys that are actually in the table: a plain `MODEL_PRICING[model]`
  // lookup would also match inherited properties such as 'constructor' and
  // produce NaN costs.
  const isKnownModel = Object.prototype.hasOwnProperty.call(MODEL_PRICING, model);

  // Unknown models fall back to the 'nemotron' entry, so they are reported
  // under that entry's name and rates.
  const pricing = isKnownModel ? MODEL_PRICING[model] : MODEL_PRICING['nemotron'];

  // Rates are per million tokens.
  const inputCost = (inputTokens / 1_000_000) * pricing.input;
  const outputCost = (outputTokens / 1_000_000) * pricing.output;

  return {
    inputCost,
    outputCost,
    totalCost: inputCost + outputCost,
    model: pricing.name,
  };
}
91
+
92
/**
 * Estimate how a combined token count divides into input and output tokens
 * when only the total is known.
 *
 * @param {number} totalTokens - Combined token count.
 * @param {number} [inputRatio=0.7] - Fraction of the total treated as input
 *   (prompts, context); the remainder is treated as output (responses).
 * @returns {{input: number, output: number}} Estimated counts that always add
 *   up to `totalTokens`.
 */
function estimateTokenSplit(totalTokens, inputRatio = 0.7) {
  const input = Math.round(totalTokens * inputRatio);

  // Take output as the remainder. Rounding both halves independently can
  // invent or lose a token (5 tokens at 0.5 would give 3 + 3).
  return {
    input,
    output: totalTokens - input,
  };
}
100
+
101
+ // ============================================================================
102
+ // Analysis Functions
103
+ // ============================================================================
104
+
105
/**
 * Read one evaluation result file and estimate its token usage and cost.
 *
 * Tutor and learner token counts come from `data.metrics`; judge usage is not
 * read from the file and is estimated from the total instead.
 *
 * @param {string} filePath - Path to a JSON evaluation result file.
 * @returns {object} Scenario summary: `scenario`, `tutorProfile`, `learnerArch`,
 *   `turns`, `latencyMs`, `tokens`, `costs`, and `score` (null when the file
 *   carries no judge score).
 * @throws {Error} If the file cannot be read or does not contain valid JSON.
 */
function analyzeBatteryScenario(filePath) {
  // Fixed allowance for the judge's evaluation response.
  const JUDGE_OUTPUT_TOKENS = 2000;

  const data = JSON.parse(fs.readFileSync(filePath, 'utf-8'));
  const metrics = data.metrics || {};

  // Extract scenario info
  const scenarioName = data.scenarioName || path.basename(filePath);
  const tutorProfile = data.tutorProfile;
  const learnerArch = data.learnerArchitecture;

  // Token counts
  const tutorTokens = metrics.tutorTokens || 0;
  const learnerTokens = metrics.learnerTokens || 0;
  const totalTokens = metrics.totalTokens || (tutorTokens + learnerTokens);

  // Estimate input/output split: tutor is treated as 60% input / 40% output,
  // learner as an even 50/50.
  const tutorSplit = estimateTokenSplit(tutorTokens, 0.6);
  const learnerSplit = estimateTokenSplit(learnerTokens, 0.5);

  // Estimate judge tokens: 80% of the dialogue total as input (transcript
  // context) plus the fixed output allowance.
  const judgeInput = Math.round(totalTokens * 0.8);
  const judgeOutput = JUDGE_OUTPUT_TOKENS;

  // Calculate costs
  const tutorCost = calculateCost(tutorSplit.input, tutorSplit.output, DEFAULT_MODELS.tutor_ego);
  const learnerCost = calculateCost(learnerSplit.input, learnerSplit.output, DEFAULT_MODELS.learner_ego);
  const judgeCost = calculateCost(judgeInput, judgeOutput, DEFAULT_MODELS.judge);

  return {
    scenario: scenarioName,
    tutorProfile,
    learnerArch,
    turns: metrics.turnCount || 0,
    latencyMs: metrics.totalLatencyMs || 0,
    tokens: {
      tutor: tutorTokens,
      learner: learnerTokens,
      total: totalTokens,
      judgeEstimate: judgeInput + judgeOutput,
    },
    costs: {
      tutor: tutorCost,
      learner: learnerCost,
      judge: judgeCost,
      total: tutorCost.totalCost + learnerCost.totalCost + judgeCost.totalCost,
    },
    // `??` rather than `||` so that a real score of 0 is not reported as missing.
    score: data.judgeEvaluation?.overallScore ?? null,
  };
}
155
+
156
/**
 * Analyze every battery result file in a directory.
 *
 * A file is included when its name ends in `.json` and contains `battery`.
 *
 * @param {string} dirPath - Directory to scan (not recursive).
 * @returns {object[]} One analysis per matching file, ordered by file name;
 *   empty when the directory does not exist or holds no matching files.
 * @throws {Error} For directory read failures other than "not found", and for
 *   any error raised while analyzing an individual file.
 */
function analyzeBatteryDirectory(dirPath) {
  let entryNames;
  try {
    entryNames = fs.readdirSync(dirPath);
  } catch (err) {
    // A directory that was never created simply means "no evaluations yet";
    // report that as an empty result so callers can show their own message.
    if (err.code === 'ENOENT') {
      return [];
    }
    throw err;
  }

  return entryNames
    .filter((name) => name.endsWith('.json') && name.includes('battery'))
    // Directory listing order is platform-dependent; sort so reports are stable.
    .sort()
    .map((name) => analyzeBatteryScenario(path.join(dirPath, name)));
}
163
+
164
+ // ============================================================================
165
+ // Output Formatting
166
+ // ============================================================================
167
+
168
/**
 * Render an amount as a dollar string with four decimal places.
 *
 * @param {number} amount - Amount in dollars.
 * @returns {string} For example `$0.0420`.
 */
function formatCurrency(amount) {
  const fixed = amount.toFixed(4);
  return '$' + fixed;
}
171
+
172
/**
 * Render a number using the runtime's default locale formatting.
 *
 * @param {number} num - Value to format.
 * @returns {string} Result of `num.toLocaleString()`.
 */
function formatNumber(num) {
  const localized = num.toLocaleString();
  return localized;
}
175
+
176
/**
 * Print the full cost report to stdout: a per-scenario table, a per-component
 * breakdown, and a comparison against running everything on the 'sonnet' model.
 *
 * @param {object[]} results - Scenario analyses as produced by
 *   analyzeBatteryScenario.
 * @returns {{totalTokens: number, totalCost: number, componentTotals: object, allSonnetCost: object}}
 *   The aggregates that were printed, for reuse by other report writers.
 */
function printDetailedReport(results) {
  // analyzeBatteryScenario builds judgeEstimate as (judge input + 2000 output
  // tokens); the same constant is needed here to take the estimate apart again.
  const JUDGE_OUTPUT_TOKENS = 2000;

  console.log('\n' + '='.repeat(80));
  console.log('EVALUATION COST ANALYSIS');
  console.log('='.repeat(80));

  // Per-scenario breakdown
  console.log('\n## Per-Scenario Breakdown\n');
  console.log('| Scenario | Turns | Tutor Tokens | Learner Tokens | Judge Est. | Total Cost | Score |');
  console.log('|----------|-------|--------------|----------------|------------|------------|-------|');

  let totalTokens = 0;
  let totalCost = 0;

  for (const r of results) {
    const scenarioShort = r.scenario.replace('battery_', '').substring(0, 25);
    console.log(`| ${scenarioShort.padEnd(25)} | ${String(r.turns).padStart(5)} | ${formatNumber(r.tokens.tutor).padStart(12)} | ${formatNumber(r.tokens.learner).padStart(14)} | ${formatNumber(r.tokens.judgeEstimate).padStart(10)} | ${formatCurrency(r.costs.total).padStart(10)} | ${r.score !== null ? String(r.score).padStart(5) : 'N/A'.padStart(5)} |`);
    totalTokens += r.tokens.total + r.tokens.judgeEstimate;
    totalCost += r.costs.total;
  }

  console.log('|' + '-'.repeat(78) + '|');
  console.log(`| **TOTAL** | | | | ${formatNumber(totalTokens).padStart(10)} | ${formatCurrency(totalCost).padStart(10)} | |`);

  // Cost breakdown by component
  console.log('\n## Cost Breakdown by Component\n');
  console.log('| Component | Model | Input Tokens | Output Tokens | Cost |');
  console.log('|-----------|-------|--------------|---------------|------|');

  const componentTotals = {
    tutor: { input: 0, output: 0, cost: 0 },
    learner: { input: 0, output: 0, cost: 0 },
    judge: { input: 0, output: 0, cost: 0 },
  };

  for (const r of results) {
    // Same ratios as analyzeBatteryScenario, so token columns match the costs.
    const tutorSplit = estimateTokenSplit(r.tokens.tutor, 0.6);
    const learnerSplit = estimateTokenSplit(r.tokens.learner, 0.5);
    componentTotals.tutor.input += tutorSplit.input;
    componentTotals.tutor.output += tutorSplit.output;
    componentTotals.tutor.cost += r.costs.tutor.totalCost;
    componentTotals.learner.input += learnerSplit.input;
    componentTotals.learner.output += learnerSplit.output;
    componentTotals.learner.cost += r.costs.learner.totalCost;
    // judgeEstimate already contains the output allowance, so the input share
    // is what remains after removing it (not 80% of the estimate).
    componentTotals.judge.input += r.tokens.judgeEstimate - JUDGE_OUTPUT_TOKENS;
    componentTotals.judge.output += JUDGE_OUTPUT_TOKENS;
    componentTotals.judge.cost += r.costs.judge.totalCost;
  }

  const totalInput = componentTotals.tutor.input + componentTotals.learner.input + componentTotals.judge.input;
  const totalOutput = componentTotals.tutor.output + componentTotals.learner.output + componentTotals.judge.output;

  console.log(`| Tutor (Ego+Superego) | ${MODEL_PRICING[DEFAULT_MODELS.tutor_ego].name} | ${formatNumber(Math.round(componentTotals.tutor.input))} | ${formatNumber(Math.round(componentTotals.tutor.output))} | ${formatCurrency(componentTotals.tutor.cost)} |`);
  console.log(`| Learner (Ego+Superego) | ${MODEL_PRICING[DEFAULT_MODELS.learner_ego].name} | ${formatNumber(Math.round(componentTotals.learner.input))} | ${formatNumber(Math.round(componentTotals.learner.output))} | ${formatCurrency(componentTotals.learner.cost)} |`);
  console.log(`| Judge | ${MODEL_PRICING[DEFAULT_MODELS.judge].name} | ${formatNumber(Math.round(componentTotals.judge.input))} | ${formatNumber(Math.round(componentTotals.judge.output))} | ${formatCurrency(componentTotals.judge.cost)} |`);
  console.log(`| **TOTAL** | | ${formatNumber(Math.round(totalInput))} | ${formatNumber(Math.round(totalOutput))} | **${formatCurrency(totalCost)}** |`);

  // Hypothetical costs
  console.log('\n## Hypothetical: All Claude Sonnet 4.5\n');
  const allSonnetCost = calculateCost(totalInput, totalOutput, 'sonnet');
  // With no results both costs are 0 and the ratio would print as "NaNx".
  const multiplier = totalCost > 0
    ? `${(allSonnetCost.totalCost / totalCost).toFixed(1)}x`
    : 'N/A';
  console.log(`| Current Cost (Nemotron + Sonnet Judge) | ${formatCurrency(totalCost)} |`);
  console.log(`| Hypothetical (All Sonnet 4.5) | ${formatCurrency(allSonnetCost.totalCost)} |`);
  console.log(`| Cost Multiplier | ${multiplier} |`);

  return { totalTokens, totalCost, componentTotals, allSonnetCost };
}
242
+
243
/**
 * Print a short cost summary (scenario count, total tokens, total and average
 * cost) to stdout.
 *
 * @param {object[]} results - Scenario analyses as produced by
 *   analyzeBatteryScenario.
 * @returns {void}
 */
function printSummary(results) {
  let totalTokens = 0;
  let totalCost = 0;

  for (const r of results) {
    // Dialogue tokens plus the estimated judge tokens.
    totalTokens += r.tokens.total + r.tokens.judgeEstimate;
    totalCost += r.costs.total;
  }

  // Avoid 0 / 0 when there is nothing to summarize (would print "$NaN").
  const averageCost = results.length > 0 ? totalCost / results.length : 0;

  console.log('\n## Cost Summary');
  console.log(`- Scenarios: ${results.length}`);
  console.log(`- Total Tokens: ${formatNumber(totalTokens)}`);
  console.log(`- Total Cost: ${formatCurrency(totalCost)}`);
  console.log(`- Average Cost/Scenario: ${formatCurrency(averageCost)}`);
}
258
+
259
+ // ============================================================================
260
+ // Export for Programmatic Use
261
+ // ============================================================================
262
+
263
+ export {
264
+ MODEL_PRICING,
265
+ DEFAULT_MODELS,
266
+ calculateCost,
267
+ estimateTokenSplit,
268
+ analyzeBatteryScenario,
269
+ analyzeBatteryDirectory,
270
+ };
271
+
272
+ // ============================================================================
273
+ // CLI Entry Point
274
+ // ============================================================================
275
+
276
/**
 * Report whether this module was started directly with `node <this file>`
 * rather than imported by another module.
 *
 * @returns {boolean} True when process.argv[1] resolves to this file.
 */
function isInvokedAsScript() {
  const entry = process.argv[1];
  if (!entry) {
    return false;
  }

  const self = fs.realpathSync(__filename);
  // Compare real paths so symlinked invocations still match; also try the
  // entry with ".js" appended in case the script was started without its extension.
  return [entry, `${entry}.js`].some((candidate) => {
    try {
      return fs.realpathSync(candidate) === self;
    } catch {
      return false;
    }
  });
}

/**
 * Command-line entry point.
 *
 * Flags: `--help`/`-h`, `--file <path>`, `--json`, `--summary`. Without
 * `--file`, every battery result in `logs/interaction-evals` is analyzed
 * (`--battery` selects that same default). Without `--json` or `--summary`,
 * the detailed report is printed and, for directory runs, also saved to
 * `docs/research/COST-ANALYSIS.md`.
 *
 * @param {string[]} args - Command-line arguments after the script path.
 * @returns {void}
 */
function runCli(args) {
  if (args.includes('--help') || args.includes('-h')) {
    console.log(`
Analyze Evaluation Costs

Usage:
node scripts/analyze-eval-costs.js # Analyze all battery evals
node scripts/analyze-eval-costs.js --battery # Analyze battery scenarios
node scripts/analyze-eval-costs.js --file <path> # Analyze specific file
node scripts/analyze-eval-costs.js --summary # Show cost summary only
node scripts/analyze-eval-costs.js --json # Output as JSON

Model Pricing (per million tokens):
Nemotron 3 Nano 30B (free): $0 input, $0 output
Claude Sonnet 4.5: $3 input, $15 output
Claude Haiku 4.5: $0.80 input, $4 output
`);
    process.exit(0);
  }

  const fileFlagIndex = args.indexOf('--file');
  const isSingleFile = fileFlagIndex !== -1;
  let results;

  if (isSingleFile) {
    const target = args[fileFlagIndex + 1];
    if (!target || target.startsWith('--')) {
      console.error('--file requires a path argument');
      process.exit(1);
    }
    results = [analyzeBatteryScenario(path.resolve(target))];
  } else {
    // Default: analyze battery directory
    const batteryDir = path.join(PROJECT_ROOT, 'logs', 'interaction-evals');
    results = analyzeBatteryDirectory(batteryDir);

    if (results.length === 0) {
      console.log('No battery evaluation files found in', batteryDir);
      process.exit(1);
    }
  }

  if (args.includes('--json')) {
    console.log(JSON.stringify(results, null, 2));
  } else if (args.includes('--summary')) {
    printSummary(results);
  } else {
    const totals = printDetailedReport(results);

    // Only a full directory run refreshes the shared report; a single-file
    // run must not overwrite it with partial data.
    if (!isSingleFile) {
      const analysisPath = path.join(PROJECT_ROOT, 'docs', 'research', 'COST-ANALYSIS.md');
      const markdown = generateMarkdownReport(results, totals);
      fs.writeFileSync(analysisPath, markdown);
      console.log(`\nReport saved to: ${analysisPath}`);
    }
  }
}

// This module also exports its helpers for programmatic use, so the CLI must
// not run (read logs, write the report, or call process.exit) on import.
if (isInvokedAsScript()) {
  runCli(process.argv.slice(2));
}
319
+
320
/**
 * Build the Markdown cost report.
 *
 * @param {object[]} results - Scenario analyses as produced by
 *   analyzeBatteryScenario.
 * @param {{totalCost: number, componentTotals: object, allSonnetCost: object}} totals
 *   Aggregates as returned by printDetailedReport.
 * @returns {string} Complete Markdown document, stamped with the current time.
 */
function generateMarkdownReport(results, totals) {
  const timestamp = new Date().toISOString();

  // Sum one numeric field across all scenarios for the TOTAL row.
  const sumOf = (pick) => results.reduce((sum, r) => sum + pick(r), 0);

  // Combined input + output tokens for one component row.
  const componentTokens = (component) => Math.round(component.input + component.output).toLocaleString();

  const scenarioRows = results.map((r) => {
    const scenarioShort = r.scenario.replace('short-battery_', '').replace(/-\d+$/, '');
    // `??` rather than `||` so that a real score of 0 is shown instead of N/A.
    return `| ${scenarioShort} | ${r.turns} | ${r.tokens.tutor.toLocaleString()} | ${r.tokens.learner.toLocaleString()} | ${formatCurrency(r.costs.total)} | ${r.score ?? 'N/A'} |`;
  }).join('\n');

  // With a zero current cost the ratio is undefined; show N/A instead of "NaNx".
  const multiplier = totals.totalCost > 0
    ? `${(totals.allSonnetCost.totalCost / totals.totalCost).toFixed(1)}x`
    : 'N/A';

  const { tutor, learner, judge } = totals.componentTotals;

  return `# Evaluation Cost Analysis

**Generated:** ${timestamp}

## Overview

This document provides token usage and cost analysis for evaluation runs, supporting reproducibility and cost planning.

## Model Pricing

| Model | Input ($/M) | Output ($/M) | Provider |
|-------|-------------|--------------|----------|
| Nemotron 3 Nano 30B | $0.00 | $0.00 | OpenRouter (free) |
| Claude Sonnet 4.5 | $3.00 | $15.00 | OpenRouter |
| Claude Haiku 4.5 | $0.80 | $4.00 | OpenRouter |

## Battery Scenario Results

| Scenario | Turns | Tutor Tokens | Learner Tokens | Total Cost | Score |
|----------|-------|--------------|----------------|------------|-------|
${scenarioRows}
| **TOTAL** | ${sumOf((r) => r.turns)} | ${sumOf((r) => r.tokens.tutor).toLocaleString()} | ${sumOf((r) => r.tokens.learner).toLocaleString()} | **${formatCurrency(totals.totalCost)}** | |

## Cost by Component

| Component | Model | Tokens | Cost |
|-----------|-------|--------|------|
| Tutor (Ego+Superego) | Nemotron 3 Nano 30B | ${componentTokens(tutor)} | ${formatCurrency(tutor.cost)} |
| Learner (Ego+Superego) | Nemotron 3 Nano 30B | ${componentTokens(learner)} | ${formatCurrency(learner.cost)} |
| Judge | Claude Sonnet 4.5 | ${componentTokens(judge)} | ${formatCurrency(judge.cost)} |

## Hypothetical: All Claude Sonnet 4.5

| Configuration | Total Cost | Multiplier |
|---------------|------------|------------|
| Current (Nemotron + Sonnet Judge) | ${formatCurrency(totals.totalCost)} | 1.0x |
| All Sonnet 4.5 | ${formatCurrency(totals.allSonnetCost.totalCost)} | ${multiplier} |

## Reproducibility

To regenerate this analysis:

\`\`\`bash
node scripts/analyze-eval-costs.js
\`\`\`

To get JSON output for programmatic use:

\`\`\`bash
node scripts/analyze-eval-costs.js --json
\`\`\`
`;
}