@machinespirits/eval 0.1.2 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +161 -0
  3. package/config/eval-settings.yaml +18 -0
  4. package/config/evaluation-rubric-learner.yaml +277 -0
  5. package/config/evaluation-rubric.yaml +613 -0
  6. package/config/interaction-eval-scenarios.yaml +93 -50
  7. package/config/learner-agents.yaml +124 -193
  8. package/config/machinespirits-eval.code-workspace +11 -0
  9. package/config/providers.yaml +60 -0
  10. package/config/suggestion-scenarios.yaml +1399 -0
  11. package/config/tutor-agents.yaml +716 -0
  12. package/docs/EVALUATION-VARIABLES.md +589 -0
  13. package/docs/REPLICATION-PLAN.md +577 -0
  14. package/index.js +15 -6
  15. package/package.json +16 -22
  16. package/routes/evalRoutes.js +88 -36
  17. package/scripts/analyze-judge-reliability.js +401 -0
  18. package/scripts/analyze-run.js +97 -0
  19. package/scripts/analyze-run.mjs +282 -0
  20. package/scripts/analyze-validation-failures.js +141 -0
  21. package/scripts/check-run.mjs +17 -0
  22. package/scripts/code-impasse-strategies.js +1132 -0
  23. package/scripts/compare-runs.js +44 -0
  24. package/scripts/compare-suggestions.js +80 -0
  25. package/scripts/compare-transformation.js +116 -0
  26. package/scripts/dig-into-run.js +158 -0
  27. package/scripts/eval-cli.js +2626 -0
  28. package/scripts/generate-paper-figures.py +452 -0
  29. package/scripts/qualitative-analysis-ai.js +1313 -0
  30. package/scripts/qualitative-analysis.js +688 -0
  31. package/scripts/seed-db.js +87 -0
  32. package/scripts/show-failed-suggestions.js +64 -0
  33. package/scripts/validate-content.js +192 -0
  34. package/server.js +3 -2
  35. package/services/__tests__/evalConfigLoader.test.js +338 -0
  36. package/services/anovaStats.js +499 -0
  37. package/services/contentResolver.js +407 -0
  38. package/services/dialogueTraceAnalyzer.js +454 -0
  39. package/services/evalConfigLoader.js +625 -0
  40. package/services/evaluationRunner.js +2171 -270
  41. package/services/evaluationStore.js +564 -29
  42. package/services/learnerConfigLoader.js +75 -5
  43. package/services/learnerRubricEvaluator.js +284 -0
  44. package/services/learnerTutorInteractionEngine.js +375 -0
  45. package/services/processUtils.js +18 -0
  46. package/services/progressLogger.js +98 -0
  47. package/services/promptRecommendationService.js +31 -26
  48. package/services/promptRewriter.js +427 -0
  49. package/services/rubricEvaluator.js +543 -70
  50. package/services/streamingReporter.js +104 -0
  51. package/services/turnComparisonAnalyzer.js +494 -0
  52. package/components/MobileEvalDashboard.tsx +0 -267
  53. package/components/comparison/DeltaAnalysisTable.tsx +0 -137
  54. package/components/comparison/ProfileComparisonCard.tsx +0 -176
  55. package/components/comparison/RecognitionABMode.tsx +0 -385
  56. package/components/comparison/RecognitionMetricsPanel.tsx +0 -135
  57. package/components/comparison/WinnerIndicator.tsx +0 -64
  58. package/components/comparison/index.ts +0 -5
  59. package/components/mobile/BottomSheet.tsx +0 -233
  60. package/components/mobile/DimensionBreakdown.tsx +0 -210
  61. package/components/mobile/DocsView.tsx +0 -363
  62. package/components/mobile/LogsView.tsx +0 -481
  63. package/components/mobile/PsychodynamicQuadrant.tsx +0 -261
  64. package/components/mobile/QuickTestView.tsx +0 -1098
  65. package/components/mobile/RecognitionTypeChart.tsx +0 -124
  66. package/components/mobile/RecognitionView.tsx +0 -809
  67. package/components/mobile/RunDetailView.tsx +0 -261
  68. package/components/mobile/RunHistoryView.tsx +0 -367
  69. package/components/mobile/ScoreRadial.tsx +0 -211
  70. package/components/mobile/StreamingLogPanel.tsx +0 -230
  71. package/components/mobile/SynthesisStrategyChart.tsx +0 -140
  72. package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +0 -52
  73. package/docs/research/ABLATION-MODEL-SELECTION.md +0 -53
  74. package/docs/research/ADVANCED-EVAL-ANALYSIS.md +0 -60
  75. package/docs/research/ANOVA-RESULTS-2026-01-14.md +0 -257
  76. package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +0 -586
  77. package/docs/research/COST-ANALYSIS.md +0 -56
  78. package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +0 -340
  79. package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +0 -291
  80. package/docs/research/EVAL-SYSTEM-ANALYSIS.md +0 -306
  81. package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +0 -301
  82. package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +0 -1988
  83. package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +0 -282
  84. package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +0 -147
  85. package/docs/research/PAPER-EXTENSION-DYADIC.md +0 -204
  86. package/docs/research/PAPER-UNIFIED.md +0 -659
  87. package/docs/research/PAPER-UNIFIED.pdf +0 -0
  88. package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +0 -356
  89. package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +0 -419
  90. package/docs/research/apa.csl +0 -2133
  91. package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +0 -1637
  92. package/docs/research/archive/paper-multiagent-tutor.tex +0 -978
  93. package/docs/research/paper-draft/full-paper.md +0 -136
  94. package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
  95. package/docs/research/paper-draft/references.bib +0 -515
  96. package/docs/research/transcript-baseline.md +0 -139
  97. package/docs/research/transcript-recognition-multiagent.md +0 -187
  98. package/hooks/useEvalData.ts +0 -625
  99. package/server-init.js +0 -45
  100. package/services/benchmarkService.js +0 -1892
  101. package/types.ts +0 -165
  102. package/utils/haptics.ts +0 -45
@@ -0,0 +1,87 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * Seed Database
4
+ *
5
+ * Creates a small sample dataset so new users can explore the CLI
6
+ * (runs, report, export) without running a full evaluation.
7
+ *
8
+ * Usage: node scripts/seed-db.js
9
+ */
10
+
11
+ import * as evaluationStore from '../services/evaluationStore.js';
12
+
13
// Identifier looked up in the store to detect a previously created seed run.
const SEED_RUN_ID = 'seed-sample-factorial';

/**
 * Produce one sample rubric score for a factorial cell.
 *
 * The base value is shifted by a random offset in [-0.5, 0.5), rounded to the
 * nearest integer and clamped to the 1..5 range.
 *
 * @param {number} base - Cell score already divided down from the 0-100 scale.
 * @returns {number} Integer between 1 and 5 inclusive.
 */
function jitteredScore(base) {
  const rounded = Math.round(base + (Math.random() - 0.5));
  return Math.min(5, Math.max(1, rounded));
}

// Stop early when a run stored under the seed identifier is found.
// NOTE(review): the run created below is stored under the ID returned by
// createRun(), not SEED_RUN_ID, so this lookup may never match and repeated
// invocations may add duplicate seed runs — confirm against evaluationStore.
if (evaluationStore.getRun(SEED_RUN_ID)) {
  console.log(`Seed run '${SEED_RUN_ID}' already exists. Delete it first to re-seed.`);
  process.exit(0);
}

console.log('Creating seed evaluation run...');

// createRun() hands back the run record; its generated id keys everything below.
const { id: runId } = evaluationStore.createRun({
  description: 'Sample 2x2x2 factorial (seed data for demonstration)',
  totalScenarios: 1,
  totalConfigurations: 8,
});

// The 8 factorial cells as [profile, recog, multi, learner, score] tuples.
// Scores are representative values (the original comment cites paper Table 5 means).
const cells = [
  ['cell_1_base_single_unified', false, false, 'unified', 77.6],
  ['cell_2_base_single_psycho', false, false, 'ego_superego', 80.0],
  ['cell_3_base_multi_unified', false, true, 'unified', 76.6],
  ['cell_4_base_multi_psycho', false, true, 'ego_superego', 81.5],
  ['cell_5_recog_single_unified', true, false, 'unified', 92.8],
  ['cell_6_recog_single_psycho', true, false, 'ego_superego', 83.4],
  ['cell_7_recog_multi_unified', true, true, 'unified', 92.3],
  ['cell_8_recog_multi_psycho', true, true, 'ego_superego', 86.7],
].map(([profile, recog, multi, learner, score]) => ({ profile, recog, multi, learner, score }));

// Store one result row per factorial cell.
cells.forEach(({ profile, recog, multi, learner, score }) => {
  // Bring the 0-100 overall score down to the rubric's scale before jittering.
  const base = score / 20;
  const tutorKind = recog ? 'recognition-theory' : 'base';
  const latencyMs = 5000 + Math.floor(Math.random() * 10000);

  evaluationStore.storeResult(runId, {
    scenarioId: 'struggling_learner',
    scenarioName: 'Struggling Learner',
    provider: 'openrouter',
    model: 'moonshotai/kimi-k2.5',
    profileName: profile,
    suggestions: [
      {
        type: 'review',
        priority: 'high',
        title: 'Sample suggestion',
        message: `Sample ${tutorKind} tutor response for a struggling learner.`,
      },
    ],
    latencyMs,
    scores: {
      relevance: jitteredScore(base),
      specificity: jitteredScore(base),
      pedagogical: jitteredScore(base),
      personalization: jitteredScore(base),
      actionability: jitteredScore(base),
      tone: jitteredScore(base),
    },
    overallScore: score,
    judgeModel: 'seed-data',
    success: true,
    factors: {
      recognition: recog,
      multi_agent_tutor: multi,
      multi_agent_learner: learner === 'ego_superego',
    },
    learnerArchitecture: learner,
  });
});

// Flag the run as finished now that every cell has been stored.
evaluationStore.completeRun(runId);

console.log(`Seed run created: ${runId}`);
console.log(' 8 factorial cells, 1 scenario each');
console.log(` Try: node scripts/eval-cli.js runs`);
console.log(` Try: node scripts/eval-cli.js report ${runId}`);
@@ -0,0 +1,64 @@
1
+ #!/usr/bin/env node
2
+ import fs from 'fs';
3
+ import path from 'path';
4
+
5
// Directory of dialogue logs, resolved relative to the current working directory.
const dir = 'logs/tutor-dialogues';

// Only dialogue files modified on or after this date are scanned.
const RECENT_SINCE = new Date('2026-02-03');

/**
 * Shorten text for a console preview.
 *
 * @param {string} text - Text to preview.
 * @param {number} maxChars - Maximum number of characters to keep.
 * @returns {string} The text itself, or its first `maxChars` characters
 *   followed by "..." when something was cut off.
 */
function truncate(text, maxChars) {
  return text.length > maxChars ? `${text.substring(0, maxChars)}...` : text;
}

/**
 * Print a heading followed by the first `limit` examples.
 *
 * @param {string} heading - Section heading line.
 * @param {Array<{file: string, title: string, message: string}>} examples - Examples to print.
 * @param {number} limit - Maximum number of examples shown.
 * @param {number} maxChars - Message preview length.
 */
function printExamples(heading, examples, limit, maxChars) {
  console.log(`${heading}\n`);
  for (const ex of examples.slice(0, limit)) {
    console.log(`File: ${ex.file}`);
    console.log(`Title: "${ex.title}"`);
    console.log(`Message: ${truncate(ex.message, maxChars)}`);
    console.log('---\n');
  }
}

// Fail with a readable message instead of an ENOENT stack trace.
if (!fs.existsSync(dir)) {
  console.error(`Dialogue directory not found: ${dir}`);
  process.exit(1);
}

const files = fs.readdirSync(dir).filter((f) => f.endsWith('.json'));

// Keep only files touched on or after the cutoff date.
const recentFiles = files.filter((f) => fs.statSync(path.join(dir, f)).mtime >= RECENT_SINCE);

console.log(`Scanning ${recentFiles.length} dialogue files...\n`);

// Collect the first suggestion of every dialogue produced by the "recognition" profile.
const recognitionExamples = [];

for (const f of recentFiles) {
  try {
    const d = JSON.parse(fs.readFileSync(path.join(dir, f), 'utf8'));
    if (d.profileName !== 'recognition') continue;

    const suggestions = d.suggestions || [];
    if (suggestions.length === 0) continue;

    const first = suggestions[0];
    // Coerce to strings so the previews below never fail on non-string fields.
    const title = String(first.title || '');
    const message = String(first.message || '');
    const hasReview = `${title} ${message}`.toLowerCase().includes('review');

    recognitionExamples.push({ file: f, title, message, hasReview });
  } catch (err) {
    // Unreadable or malformed files are skipped, but reported on stderr so
    // they do not silently vanish from the counts.
    console.warn(`Skipping ${f}: ${err.message}`);
  }
}

// Split by whether the word "review" appears in the title or message.
const failingExamples = recognitionExamples.filter((e) => !e.hasReview);
const passingExamples = recognitionExamples.filter((e) => e.hasReview);

console.log(`Recognition profile: ${recognitionExamples.length} total dialogues`);
console.log(` With "review": ${passingExamples.length}`);
console.log(` Without "review": ${failingExamples.length}\n`);

printExamples('=== FAILING EXAMPLES (no "review" in text) ===', failingExamples, 6, 300);
printExamples('=== PASSING EXAMPLES (has "review") ===', passingExamples, 3, 200);
@@ -0,0 +1,192 @@
1
+ #!/usr/bin/env node
2
+
3
+ /**
4
+ * Content Validation CLI
5
+ *
6
+ * Validates that the content package is accessible and all lectures load
7
+ * correctly. Also previews parsed content for debugging.
8
+ *
9
+ * Usage:
10
+ * node scripts/validate-content.js # Validate all content
11
+ * node scripts/validate-content.js --lecture 479-lecture-3 # Show parsed lecture
12
+ * node scripts/validate-content.js --preview 479-lecture-3 # Show full curriculum context
13
+ * node scripts/validate-content.js --scenarios # Check all scenarios' content refs
14
+ */
15
+
16
+ import * as contentResolver from '../services/contentResolver.js';
17
+ import * as evalConfigLoader from '../services/evalConfigLoader.js';
18
+
19
+ // ── Helpers ────────────────────────────────────────────────────────────────────
20
+
21
/**
 * Configure the content resolver from the evaluation settings.
 *
 * Reads the content section via evalConfigLoader, passes the package path,
 * lecture character limit and speaker-notes flag to contentResolver, and
 * exits the process with status 1 when the path is missing from the config or
 * the resolver reports that it is not configured afterwards.
 *
 * @returns {object} The content config object returned by evalConfigLoader.
 */
function initContentResolver() {
  const settings = evalConfigLoader.getContentConfig();

  const hasPackagePath = Boolean(settings?.content_package_path);
  if (!hasPackagePath) {
    console.error('Error: No content.content_package_path in config/eval-settings.yaml');
    process.exit(1);
  }

  const resolverOptions = {
    contentPackagePath: settings.content_package_path,
    maxLectureChars: settings.max_lecture_chars,
    includeSpeakerNotes: settings.include_speaker_notes,
  };
  contentResolver.configure(resolverOptions);

  const ready = contentResolver.isConfigured();
  if (!ready) {
    console.error(`Error: Content directory not found at: ${settings.content_package_path}`);
    process.exit(1);
  }

  return settings;
}
41
+
42
+ // ── Commands ──────────────────────────────────────────────────────────────────
43
+
44
/**
 * Default command: list every available course with its metadata, then run
 * the resolver's content validation and print the outcome.
 *
 * Exits the process with status 1 when validation reports any error.
 */
function validateAll() {
  const settings = initContentResolver();
  console.log(`Content package: ${settings.content_package_path}`);
  console.log('');

  const courseIds = contentResolver.listAvailableCourses();
  console.log(`Found ${courseIds.length} course(s): ${courseIds.join(', ')}`);
  console.log('');

  for (const courseId of courseIds) {
    const meta = contentResolver.loadCourseMeta(courseId);

    // A course whose metadata cannot be loaded is reported, not fatal here.
    if (!meta) {
      console.log(` [${courseId}] ERROR: could not load course.md`);
      continue;
    }

    console.log(` [${courseId}] ${meta.title || '(no title)'}`);
    if (meta.instructor) {
      console.log(` Instructor: ${meta.instructor}`);
    }
    if (meta.objectives?.length) {
      console.log(` Objectives: ${meta.objectives.length}`);
    }
  }

  console.log('');
  const problems = contentResolver.validateContent();

  if (problems.length !== 0) {
    console.log(`Validation FAILED - ${problems.length} error(s):`);
    problems.forEach((problem) => {
      console.log(` - ${problem}`);
    });
    process.exit(1);
  } else {
    console.log('Validation PASSED - all content loads correctly.');
  }
}
76
+
77
/**
 * `--lecture` command: load one lecture, parse it, and print summary counts
 * plus a single-line preview of every slide and speaker-notes block.
 *
 * Exits the process with status 1 when the lecture cannot be loaded.
 *
 * @param {string} lectureRef - Lecture reference, e.g. "479-lecture-3".
 */
function showLecture(lectureRef) {
  initContentResolver();

  const markdown = contentResolver.loadLecture(lectureRef);
  if (!markdown) {
    console.error(`Error: Could not load lecture "${lectureRef}"`);
    process.exit(1);
  }

  // First `maxChars` characters with newlines flattened, plus "..." when cut.
  const previewOf = (text, maxChars) => {
    const head = text.slice(0, maxChars).replace(/\n/g, ' ');
    const tail = text.length > maxChars ? '...' : '';
    return `${head}${tail}`;
  };

  const parsed = contentResolver.parseLectureMarkdown(markdown);
  const slideCount = parsed.slides.length;
  const noteCount = parsed.notes.length;

  console.log(`Lecture: ${lectureRef}`);
  console.log(`Total characters: ${markdown.length}`);
  console.log(`Slides: ${slideCount}`);
  console.log(`Speaker notes blocks: ${noteCount}`);
  console.log('');

  for (const [index, slide] of parsed.slides.entries()) {
    console.log(` Slide ${index + 1}: ${previewOf(slide, 120)}`);
  }

  if (noteCount > 0) {
    console.log('');
    console.log('Speaker Notes:');
    for (const [index, note] of parsed.notes.entries()) {
      console.log(` [${index + 1}] ${previewOf(note, 100)}`);
    }
  }
}
108
+
109
/**
 * `--preview` command: build the curriculum context for a lecture and print
 * it in full between two 80-character rules.
 *
 * Exits the process with status 1 when no context could be built.
 *
 * @param {string} lectureRef - Lecture reference, e.g. "479-lecture-3".
 */
function previewCurriculum(lectureRef) {
  initContentResolver();

  const curriculumText = contentResolver.buildCurriculumContext({ currentContent: lectureRef });
  if (!curriculumText) {
    console.error(`Error: Could not build curriculum context for "${lectureRef}"`);
    process.exit(1);
  }

  const rule = '='.repeat(80);
  const outputLines = [
    `Curriculum context for: ${lectureRef}`,
    `Total characters: ${curriculumText.length}`,
    rule,
    curriculumText,
    rule,
  ];
  for (const line of outputLines) {
    console.log(line);
  }
}
127
+
128
/**
 * `--scenarios` command: for every configured scenario, resolve its content
 * reference and check that the referenced lecture can be loaded.
 *
 * Prints one status line per scenario ([OK], [FAIL] or [NONE]) followed by a
 * tally, and exits the process with status 1 when any reference failed to load.
 */
function checkScenarios() {
  initContentResolver();

  const scenarioList = evalConfigLoader.listScenarios();
  console.log(`Checking ${scenarioList.length} scenario(s) for content references...\n`);

  const tally = { resolved: 0, unresolved: 0, noContent: 0 };

  for (const { id } of scenarioList) {
    const scenario = evalConfigLoader.getScenario(id);
    const { currentContent } = contentResolver.resolveScenarioContent(scenario);

    // Scenario carries no lecture reference at all.
    if (!currentContent) {
      console.log(` [NONE] ${id} → no content reference (will use course overview only)`);
      tally.noContent += 1;
      continue;
    }

    const lectureText = contentResolver.loadLecture(currentContent);
    if (!lectureText) {
      console.log(` [FAIL] ${id} → ${currentContent} (NOT FOUND)`);
      tally.unresolved += 1;
      continue;
    }

    console.log(` [OK] ${id} → ${currentContent} (${lectureText.length} chars)`);
    tally.resolved += 1;
  }

  console.log('');
  console.log(`Results: ${tally.resolved} resolved, ${tally.unresolved} failed, ${tally.noContent} no content ref`);

  if (tally.unresolved > 0) {
    process.exit(1);
  }
}
164
+
165
+ // ── Main ──────────────────────────────────────────────────────────────────────
166
+
167
// Command-line arguments after the node binary and the script path.
const args = process.argv.slice(2);

/**
 * Return the value that follows `flag` on the command line.
 *
 * Prints an error and exits with status 1 when the flag is the last argument
 * or is directly followed by another `--option`, so that an invocation such as
 * `--lecture --scenarios` is reported as a missing ref instead of trying to
 * load a lecture literally named "--scenarios".
 *
 * @param {string[]} argv - Parsed command-line arguments.
 * @param {string} flag - Flag whose value is wanted; must be present in argv.
 * @returns {string} The argument following the flag.
 */
function requireFlagValue(argv, flag) {
  const value = argv[argv.indexOf(flag) + 1];
  if (!value || value.startsWith('--')) {
    console.error('Missing lecture ref');
    process.exit(1);
  }
  return value;
}

if (args.includes('--help') || args.includes('-h')) {
  console.log(`Usage:
node scripts/validate-content.js # Validate all content
node scripts/validate-content.js --lecture 479-lecture-3 # Show parsed lecture
node scripts/validate-content.js --preview 479-lecture-3 # Show full curriculum context
node scripts/validate-content.js --scenarios # Check all scenarios' content refs`);
  process.exit(0);
}

// Dispatch on the first matching mode; precedence is --lecture, --preview,
// --scenarios, and full validation when no mode flag is given.
if (args.includes('--lecture')) {
  showLecture(requireFlagValue(args, '--lecture'));
} else if (args.includes('--preview')) {
  previewCurriculum(requireFlagValue(args, '--preview'));
} else if (args.includes('--scenarios')) {
  checkScenarios();
} else {
  validateAll();
}
package/server.js CHANGED
@@ -20,10 +20,11 @@
20
20
  import express from 'express';
21
21
  import path from 'path';
22
22
  import { fileURLToPath } from 'url';
23
- import { existsSync, mkdirSync } from 'fs';
23
+ import { existsSync, mkdirSync, readFileSync } from 'fs';
24
24
 
25
25
  const __filename = fileURLToPath(import.meta.url);
26
26
  const __dirname = path.dirname(__filename);
27
+ const pkg = JSON.parse(readFileSync(path.join(__dirname, 'package.json'), 'utf-8'));
27
28
 
28
29
  const app = express();
29
30
  const PORT = Number(process.env.PORT) || 8081;
@@ -44,7 +45,7 @@ app.get('/health', (req, res) => {
44
45
  res.json({
45
46
  status: 'ok',
46
47
  package: '@machinespirits/eval',
47
- version: '0.1.0',
48
+ version: pkg.version,
48
49
  mode: isStandalone ? 'standalone' : 'mounted',
49
50
  });
50
51
  });