@machinespirits/eval 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. package/components/MobileEvalDashboard.tsx +267 -0
  2. package/components/comparison/DeltaAnalysisTable.tsx +137 -0
  3. package/components/comparison/ProfileComparisonCard.tsx +176 -0
  4. package/components/comparison/RecognitionABMode.tsx +385 -0
  5. package/components/comparison/RecognitionMetricsPanel.tsx +135 -0
  6. package/components/comparison/WinnerIndicator.tsx +64 -0
  7. package/components/comparison/index.ts +5 -0
  8. package/components/mobile/BottomSheet.tsx +233 -0
  9. package/components/mobile/DimensionBreakdown.tsx +210 -0
  10. package/components/mobile/DocsView.tsx +363 -0
  11. package/components/mobile/LogsView.tsx +481 -0
  12. package/components/mobile/PsychodynamicQuadrant.tsx +261 -0
  13. package/components/mobile/QuickTestView.tsx +1098 -0
  14. package/components/mobile/RecognitionTypeChart.tsx +124 -0
  15. package/components/mobile/RecognitionView.tsx +809 -0
  16. package/components/mobile/RunDetailView.tsx +261 -0
  17. package/components/mobile/RunHistoryView.tsx +367 -0
  18. package/components/mobile/ScoreRadial.tsx +211 -0
  19. package/components/mobile/StreamingLogPanel.tsx +230 -0
  20. package/components/mobile/SynthesisStrategyChart.tsx +140 -0
  21. package/config/interaction-eval-scenarios.yaml +832 -0
  22. package/config/learner-agents.yaml +248 -0
  23. package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +52 -0
  24. package/docs/research/ABLATION-MODEL-SELECTION.md +53 -0
  25. package/docs/research/ADVANCED-EVAL-ANALYSIS.md +60 -0
  26. package/docs/research/ANOVA-RESULTS-2026-01-14.md +257 -0
  27. package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +586 -0
  28. package/docs/research/COST-ANALYSIS.md +56 -0
  29. package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +340 -0
  30. package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +291 -0
  31. package/docs/research/EVAL-SYSTEM-ANALYSIS.md +306 -0
  32. package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +301 -0
  33. package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +1988 -0
  34. package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +282 -0
  35. package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +147 -0
  36. package/docs/research/PAPER-EXTENSION-DYADIC.md +204 -0
  37. package/docs/research/PAPER-UNIFIED.md +659 -0
  38. package/docs/research/PAPER-UNIFIED.pdf +0 -0
  39. package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +356 -0
  40. package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +419 -0
  41. package/docs/research/apa.csl +2133 -0
  42. package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +1637 -0
  43. package/docs/research/archive/paper-multiagent-tutor.tex +978 -0
  44. package/docs/research/paper-draft/full-paper.md +136 -0
  45. package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
  46. package/docs/research/paper-draft/references.bib +515 -0
  47. package/docs/research/transcript-baseline.md +139 -0
  48. package/docs/research/transcript-recognition-multiagent.md +187 -0
  49. package/hooks/useEvalData.ts +625 -0
  50. package/index.js +27 -0
  51. package/package.json +73 -0
  52. package/routes/evalRoutes.js +3002 -0
  53. package/scripts/advanced-eval-analysis.js +351 -0
  54. package/scripts/analyze-eval-costs.js +378 -0
  55. package/scripts/analyze-eval-results.js +513 -0
  56. package/scripts/analyze-interaction-evals.js +368 -0
  57. package/server-init.js +45 -0
  58. package/server.js +162 -0
  59. package/services/benchmarkService.js +1892 -0
  60. package/services/evaluationRunner.js +739 -0
  61. package/services/evaluationStore.js +1121 -0
  62. package/services/learnerConfigLoader.js +385 -0
  63. package/services/learnerTutorInteractionEngine.js +857 -0
  64. package/services/memory/learnerMemoryService.js +1227 -0
  65. package/services/memory/learnerWritingPad.js +577 -0
  66. package/services/memory/tutorWritingPad.js +674 -0
  67. package/services/promptRecommendationService.js +493 -0
  68. package/services/rubricEvaluator.js +826 -0
package/services/promptRecommendationService.js
@@ -0,0 +1,493 @@
/**
 * Prompt Recommendation Service
 *
 * Analyzes evaluation results and generates recommendations to improve
 * tutor prompts. Uses a powerful evaluator model to analyze failures
 * and weaknesses from weaker tutor models.
 *
 * Evaluator configuration is loaded from config/evaluation-rubric.yaml
 * Provider details are resolved from config/providers.yaml
 */

import fs from 'fs';
import path from 'path';
import { fileURLToPath } from 'url';
import Anthropic from '@anthropic-ai/sdk';
import { tutorApiService as tutorApi, tutorConfigLoader as configLoader } from '@machinespirits/tutor-core';

const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
const ROOT_DIR = path.resolve(__dirname, '..');
const PROMPTS_DIR = path.join(ROOT_DIR, 'prompts');

/**
 * Get recommender config, resolving model references via providers.yaml
 * Uses 'recommender' config from evaluation-rubric.yaml (falls back to 'evaluator')
 */
function getEvaluatorConfig() {
  const rubric = tutorApi.loadRubric();
  // Prefer 'recommender' for prompt analysis, fall back to legacy 'evaluator'
  const evalConfig = rubric?.recommender || rubric?.evaluator;

  if (!evalConfig?.model) {
    console.warn('[promptRecommendation] No recommender in evaluation-rubric.yaml, using defaults');
    return {
      provider: 'openrouter',
      model: 'deepseek/deepseek-chat-v3-0324',
      hyperparameters: { temperature: 0.4, max_tokens: 6000 },
    };
  }

  // Try to resolve primary model
  try {
    const resolved = configLoader.resolveModel(evalConfig.model);
    if (resolved.isConfigured) {
      return {
        provider: resolved.provider,
        model: resolved.model,
        apiKey: resolved.apiKey,
        baseUrl: resolved.baseUrl,
        hyperparameters: evalConfig.hyperparameters || {},
      };
    }
  } catch (e) {
    console.warn(`[promptRecommendation] Failed to resolve model: ${e.message}`);
  }

  // Try fallback
  if (evalConfig.fallback?.model) {
    try {
      const fallback = configLoader.resolveModel(evalConfig.fallback.model);
      if (fallback.isConfigured) {
        console.log(`[promptRecommendation] Using fallback: ${fallback.provider}/${fallback.model}`);
        return {
          provider: fallback.provider,
          model: fallback.model,
          apiKey: fallback.apiKey,
          baseUrl: fallback.baseUrl,
          hyperparameters: evalConfig.fallback.hyperparameters || evalConfig.hyperparameters || {},
        };
      }
    } catch (e) {
      console.warn(`[promptRecommendation] Failed to resolve fallback: ${e.message}`);
    }
  }

  // Return primary anyway - will fail with helpful error
  const resolved = configLoader.resolveModel(evalConfig.model);
  return {
    provider: resolved.provider,
    model: resolved.model,
    hyperparameters: evalConfig.hyperparameters || {},
  };
}
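
// --- Illustrative example (editor's note; not part of the published file) ---
// A rough sketch of the `recommender` block getEvaluatorConfig() expects in
// config/evaluation-rubric.yaml. The shipped config is not included in this
// diff, so the key names below are inferred from the lookups above and the
// value formats are assumptions:
//
//   recommender:
//     model: <model reference resolved via providers.yaml>
//     hyperparameters:
//       temperature: 0.4
//       max_tokens: 6000
//     fallback:
//       model: <fallback model reference>
//
// When the reference resolves, the function returns an object shaped like
// { provider, model, apiKey, baseUrl, hyperparameters }.
// ----------------------------------------------------------------------------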

/**
 * Read a prompt file
 */
function readPromptFile(filename) {
  const filePath = path.join(PROMPTS_DIR, filename);
  if (!fs.existsSync(filePath)) {
    return null;
  }
  return fs.readFileSync(filePath, 'utf-8');
}

/**
 * Analyze evaluation results to find patterns and issues
 */
function analyzeResults(results) {
  const analysis = {
    totalResults: results.length,
    successCount: results.filter(r => r.success).length,
    failureCount: results.filter(r => !r.success).length,
    avgScore: 0,
    lowScoreResults: [],
    validationFailures: [],
    dimensionWeaknesses: {},
    commonIssues: [],
  };

  // Calculate average and find low scores
  const scores = results.filter(r => r.overallScore != null).map(r => r.overallScore);
  if (scores.length > 0) {
    analysis.avgScore = scores.reduce((a, b) => a + b, 0) / scores.length;
    analysis.lowScoreResults = results
      .filter(r => r.overallScore != null && r.overallScore < 70)
      .slice(0, 10); // Top 10 low scorers
  }

  // Find validation failures
  analysis.validationFailures = results
    .filter(r => !r.passesRequired || !r.passesForbidden)
    .map(r => ({
      scenarioId: r.scenarioId,
      scenarioName: r.scenarioName,
      requiredMissing: r.requiredMissing || [],
      forbiddenFound: r.forbiddenFound || [],
      suggestion: r.suggestions?.[0],
    }))
    .slice(0, 10);

  // Aggregate dimension scores
  const dimensionScores = {};
  for (const result of results) {
    if (result.scores) {
      for (const [dim, score] of Object.entries(result.scores)) {
        if (score != null) {
          if (!dimensionScores[dim]) dimensionScores[dim] = [];
          dimensionScores[dim].push(score);
        }
      }
    }
  }

  // Find weak dimensions (avg < 3.5 out of 5)
  for (const [dim, scores] of Object.entries(dimensionScores)) {
    const avg = scores.reduce((a, b) => a + b, 0) / scores.length;
    if (avg < 3.5) {
      analysis.dimensionWeaknesses[dim] = {
        avgScore: avg,
        sampleCount: scores.length,
      };
    }
  }

  return analysis;
}
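
// --- Illustrative example (editor's note; not part of the published file) ---
// analyzeResults() only reads the fields referenced above. A minimal result
// object that exercises the low-score and weak-dimension paths might look like
// the sketch below (the dimension names are placeholders, not the package's
// actual rubric dimensions):
//
//   const analysis = analyzeResults([{
//     success: true,
//     scenarioId: 'demo-1',
//     scenarioName: 'Demo scenario',
//     overallScore: 62,
//     passesRequired: true,
//     passesForbidden: true,
//     scores: { specificity: 2, personalization: 3 },
//     suggestions: [{ title: 'Try spaced review', message: '...' }],
//   }]);
//   // => avgScore 62, one lowScoreResult, and both dimensions flagged as weak
//   //    because their averages fall below the 3.5/5 threshold
// ----------------------------------------------------------------------------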

/**
 * Build the analysis prompt for the evaluator
 */
function buildAnalysisPrompt(analysis, egoPrompt, superegoPrompt, profileName) {
  const sections = [];

  sections.push(`# Prompt Improvement Analysis Request

You are an expert in LLM prompt engineering and educational AI systems. Your task is to analyze evaluation results from an AI tutoring system and recommend specific, actionable improvements to the prompts.

## Context

This tutoring system uses an Ego/Superego dialogue architecture:
- **Ego Agent**: The student-facing tutor that generates learning suggestions
- **Superego Agent**: The critic that reviews and refines suggestions

The evaluation tested profile: **${profileName || 'unknown'}**

## Evaluation Summary

- Total tests: ${analysis.totalResults}
- Successes: ${analysis.successCount}
- Failures: ${analysis.failureCount}
- Average score: ${analysis.avgScore.toFixed(1)}/100
`);

  // Dimension weaknesses
  if (Object.keys(analysis.dimensionWeaknesses).length > 0) {
    sections.push(`
## Weak Dimensions

The following rubric dimensions scored below 3.5/5 on average:

${Object.entries(analysis.dimensionWeaknesses)
  .map(([dim, data]) => `- **${dim}**: ${data.avgScore.toFixed(2)}/5 (${data.sampleCount} samples)`)
  .join('\n')}
`);
  }

  // Validation failures
  if (analysis.validationFailures.length > 0) {
    sections.push(`
## Validation Failures

These tests failed required/forbidden element checks:

${analysis.validationFailures.slice(0, 5).map(f => `
### ${f.scenarioName} (${f.scenarioId})
- Required elements missing: ${f.requiredMissing.length > 0 ? f.requiredMissing.join(', ') : 'none'}
- Forbidden elements found: ${f.forbiddenFound.length > 0 ? f.forbiddenFound.join(', ') : 'none'}
- Generated suggestion: "${f.suggestion?.title || 'N/A'}" - ${f.suggestion?.message?.substring(0, 100) || 'N/A'}...
`).join('\n')}
`);
  }

  // Low score examples
  if (analysis.lowScoreResults.length > 0) {
    sections.push(`
## Low-Scoring Examples

These tests scored below 70/100:

${analysis.lowScoreResults.slice(0, 5).map(r => `
### ${r.scenarioName} (score: ${r.overallScore?.toFixed(1)})
- Suggestion: "${r.suggestions?.[0]?.title || 'N/A'}"
- Message: ${r.suggestions?.[0]?.message?.substring(0, 150) || 'N/A'}...
- Evaluation reasoning: ${r.evaluationReasoning?.substring(0, 200) || 'N/A'}...
`).join('\n')}
`);
  }

  // Current prompts
  sections.push(`
## Current Ego Prompt

\`\`\`markdown
${egoPrompt?.substring(0, 3000) || 'Not available'}
${egoPrompt?.length > 3000 ? '\n... (truncated)' : ''}
\`\`\`

## Current Superego Prompt

\`\`\`markdown
${superegoPrompt?.substring(0, 3000) || 'Not available'}
${superegoPrompt?.length > 3000 ? '\n... (truncated)' : ''}
\`\`\`
`);

  // Request
  sections.push(`
## Your Task

Based on this analysis, provide specific recommendations to improve the prompts. Structure your response as:

1. **Key Issues Identified**: What patterns in the failures suggest prompt weaknesses?

2. **Ego Prompt Recommendations**:
   - Specific sections to modify
   - Exact text changes or additions
   - Rationale for each change

3. **Superego Prompt Recommendations**:
   - Specific sections to modify
   - Exact text changes or additions
   - Rationale for each change

4. **New Examples or Constraints**: Any new examples, edge cases, or constraints to add

5. **Priority Ranking**: Rank your recommendations by expected impact

Be specific and actionable. Quote exact text to change when possible.
`);

  return sections.join('\n');
}

/**
 * Call the evaluator model to generate recommendations
 * Uses config from evaluation-rubric.yaml
 */
async function callEvaluator(prompt, options = {}) {
  const { budget = false } = options;

  // Get config from yaml (handles fallbacks automatically)
  const config = getEvaluatorConfig();
  const { provider, model, hyperparameters } = config;
  const maxTokens = hyperparameters?.max_tokens ?? 4000;
  const temperature = hyperparameters?.temperature ?? 0.3;

  if (provider === 'openrouter') {
    return callOpenRouterEvaluator(prompt, model, { maxTokens, temperature });
  }

  if (provider !== 'anthropic') {
    throw new Error(`Provider ${provider} not yet supported for recommendations`);
  }

  const apiKey = process.env.ANTHROPIC_API_KEY;
  if (!apiKey) {
    throw new Error('ANTHROPIC_API_KEY not set');
  }

  const client = new Anthropic({ apiKey });

  const response = await client.messages.create({
    model,
    max_tokens: maxTokens,
    temperature,
    messages: [
      {
        role: 'user',
        content: prompt,
      },
    ],
  });

  return {
    content: response.content[0]?.text || '',
    model: response.model,
    inputTokens: response.usage?.input_tokens,
    outputTokens: response.usage?.output_tokens,
  };
}

/**
 * Call OpenRouter for evaluation
 */
async function callOpenRouterEvaluator(prompt, model, options = {}) {
  const { maxTokens = 4000, temperature = 0.3 } = options;

  const apiKey = process.env.OPENROUTER_API_KEY;
  if (!apiKey) {
    throw new Error('OPENROUTER_API_KEY not set');
  }

  const response = await fetch('https://openrouter.ai/api/v1/chat/completions', {
    method: 'POST',
    headers: {
      'Authorization': `Bearer ${apiKey}`,
      'Content-Type': 'application/json',
      'HTTP-Referer': 'https://machinespirits.org',
      'X-Title': 'Machine Spirits Tutor Eval',
    },
    body: JSON.stringify({
      model,
      max_tokens: maxTokens,
      temperature,
      messages: [
        {
          role: 'user',
          content: prompt,
        },
      ],
    }),
  });

  if (!response.ok) {
    const errorText = await response.text();
    throw new Error(`OpenRouter error: ${response.status} ${errorText}`);
  }

  const data = await response.json();
  const content = data.choices?.[0]?.message?.content || '';

  return {
    content,
    model: data.model || model,
    inputTokens: data.usage?.prompt_tokens,
    outputTokens: data.usage?.completion_tokens,
  };
}

/**
 * Generate prompt improvement recommendations based on evaluation results
 *
 * @param {Object} options - Options
 * @param {string} options.runId - Evaluation run ID to analyze (currently unused; pass results instead)
 * @param {Object[]} options.results - Or pass results directly
 * @param {string} options.profileName - Profile that was evaluated
 * @param {string} options.egoPromptFile - Ego prompt file to analyze
 * @param {string} options.superegoPromptFile - Superego prompt file to analyze
 * @param {string} options.evaluatorModel - Model to use for analysis (currently unused; the evaluator is configured via evaluation-rubric.yaml)
 * @param {string} options.evaluatorProvider - Provider: 'anthropic' or 'openrouter' (currently unused)
 * @param {boolean} options.budget - Use budget evaluator model (currently unused)
 * @returns {Promise<Object>} Recommendations
 */
export async function generateRecommendations(options = {}) {
  const {
    results = [],
    profileName = 'unknown',
    egoPromptFile = 'tutor-ego.md',
    superegoPromptFile = 'tutor-superego.md',
    evaluatorModel = null,
    evaluatorProvider = 'anthropic',
    budget = false,
  } = options;

  if (results.length === 0) {
    throw new Error('No evaluation results provided');
  }

  // Read prompts
  const egoPrompt = readPromptFile(egoPromptFile);
  const superegoPrompt = readPromptFile(superegoPromptFile);

  if (!egoPrompt && !superegoPrompt) {
    throw new Error('Could not read any prompt files');
  }

  // Analyze results
  const analysis = analyzeResults(results);

  // Check if there's enough signal for recommendations
  if (analysis.avgScore > 90 && analysis.validationFailures.length === 0) {
    return {
      success: true,
      needsImprovement: false,
      message: 'Prompts are performing well. Average score is above 90 with no validation failures.',
      analysis,
    };
  }

  // Build analysis prompt
  const analysisPrompt = buildAnalysisPrompt(analysis, egoPrompt, superegoPrompt, profileName);

  // Get evaluator config from yaml
  const evalConfig = getEvaluatorConfig();
  console.log(`\nGenerating recommendations using ${evalConfig.provider}/${evalConfig.model}...`);

  const evalResult = await callEvaluator(analysisPrompt);

  return {
    success: true,
    needsImprovement: true,
    analysis,
    recommendations: evalResult.content,
    evaluatorModel: evalResult.model,
    usage: {
      inputTokens: evalResult.inputTokens,
      outputTokens: evalResult.outputTokens,
    },
  };
}

/**
 * Format recommendations for CLI display
 */
export function formatRecommendations(result) {
  const lines = [];

  lines.push('');
  lines.push('═'.repeat(80));
  lines.push('PROMPT IMPROVEMENT RECOMMENDATIONS');
  lines.push('═'.repeat(80));
  lines.push('');

  if (!result.needsImprovement) {
    lines.push('✓ ' + result.message);
    lines.push('');
    return lines.join('\n');
  }

  // Analysis summary
  lines.push('ANALYSIS SUMMARY');
  lines.push('─'.repeat(40));
  lines.push(`Total tests analyzed: ${result.analysis.totalResults}`);
  lines.push(`Average score: ${result.analysis.avgScore.toFixed(1)}/100`);
  lines.push(`Validation failures: ${result.analysis.validationFailures.length}`);

  if (Object.keys(result.analysis.dimensionWeaknesses).length > 0) {
    lines.push('');
    lines.push('Weak dimensions:');
    for (const [dim, data] of Object.entries(result.analysis.dimensionWeaknesses)) {
      lines.push(`  • ${dim}: ${data.avgScore.toFixed(2)}/5`);
    }
  }

  lines.push('');
  lines.push('─'.repeat(80));
  lines.push('');
  lines.push(result.recommendations);
  lines.push('');
  lines.push('─'.repeat(80));
  lines.push(`Evaluator: ${result.evaluatorModel}`);
  lines.push(`Tokens: ${result.usage.inputTokens} in / ${result.usage.outputTokens} out`);
  lines.push('═'.repeat(80));

  return lines.join('\n');
}

export default {
  generateRecommendations,
  formatRecommendations,
  analyzeResults,
};
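
// --- Illustrative usage (editor's note; not part of the published file) -----
// A minimal sketch of driving this service from a Node script, assuming
// per-scenario evaluation results have already been loaded into an array of
// objects shaped like the ones analyzeResults() reads. The profile name and
// the result-loading step are placeholders:
//
//   import { generateRecommendations, formatRecommendations } from './promptRecommendationService.js';
//
//   const results = loadResultsSomehow(); // e.g. from evaluationStore or a saved run JSON
//   const result = await generateRecommendations({
//     results,
//     profileName: 'baseline',
//     egoPromptFile: 'tutor-ego.md',
//     superegoPromptFile: 'tutor-superego.md',
//   });
//   console.log(formatRecommendations(result));
//
// Requires ANTHROPIC_API_KEY or OPENROUTER_API_KEY in the environment,
// depending on which provider the rubric config resolves to.
// ----------------------------------------------------------------------------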