@machinespirits/eval 0.1.2 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +161 -0
  3. package/config/eval-settings.yaml +18 -0
  4. package/config/evaluation-rubric-learner.yaml +277 -0
  5. package/config/evaluation-rubric.yaml +613 -0
  6. package/config/interaction-eval-scenarios.yaml +93 -50
  7. package/config/learner-agents.yaml +124 -193
  8. package/config/providers.yaml +60 -0
  9. package/config/suggestion-scenarios.yaml +1399 -0
  10. package/config/tutor-agents.yaml +716 -0
  11. package/docs/EVALUATION-VARIABLES.md +589 -0
  12. package/docs/REPLICATION-PLAN.md +577 -0
  13. package/docs/research/build.sh +74 -0
  14. package/docs/research/figures/figure1.png +0 -0
  15. package/docs/research/figures/figure2.png +0 -0
  16. package/docs/research/figures/figure3.png +0 -0
  17. package/docs/research/figures/figure4.png +0 -0
  18. package/docs/research/figures/figure5.png +0 -0
  19. package/docs/research/figures/figure6.png +0 -0
  20. package/docs/research/header.tex +4 -0
  21. package/docs/research/paper-full.md +1909 -0
  22. package/docs/research/paper-short.md +805 -0
  23. package/docs/research/references.bib +1011 -0
  24. package/index.js +15 -6
  25. package/package.json +14 -21
  26. package/routes/evalRoutes.js +88 -36
  27. package/scripts/analyze-judge-reliability.js +401 -0
  28. package/scripts/analyze-run.js +97 -0
  29. package/scripts/analyze-run.mjs +282 -0
  30. package/scripts/analyze-validation-failures.js +141 -0
  31. package/scripts/check-run.mjs +17 -0
  32. package/scripts/code-impasse-strategies.js +1132 -0
  33. package/scripts/compare-runs.js +44 -0
  34. package/scripts/compare-suggestions.js +80 -0
  35. package/scripts/compare-transformation.js +116 -0
  36. package/scripts/dig-into-run.js +158 -0
  37. package/scripts/eval-cli.js +2626 -0
  38. package/scripts/generate-paper-figures.py +452 -0
  39. package/scripts/qualitative-analysis-ai.js +1313 -0
  40. package/scripts/qualitative-analysis.js +688 -0
  41. package/scripts/seed-db.js +87 -0
  42. package/scripts/show-failed-suggestions.js +64 -0
  43. package/scripts/validate-content.js +192 -0
  44. package/server.js +3 -2
  45. package/services/__tests__/evalConfigLoader.test.js +338 -0
  46. package/services/anovaStats.js +499 -0
  47. package/services/contentResolver.js +407 -0
  48. package/services/dialogueTraceAnalyzer.js +454 -0
  49. package/services/evalConfigLoader.js +625 -0
  50. package/services/evaluationRunner.js +2171 -270
  51. package/services/evaluationStore.js +564 -29
  52. package/services/learnerConfigLoader.js +75 -5
  53. package/services/learnerRubricEvaluator.js +284 -0
  54. package/services/learnerTutorInteractionEngine.js +375 -0
  55. package/services/processUtils.js +18 -0
  56. package/services/progressLogger.js +98 -0
  57. package/services/promptRecommendationService.js +31 -26
  58. package/services/promptRewriter.js +427 -0
  59. package/services/rubricEvaluator.js +543 -70
  60. package/services/streamingReporter.js +104 -0
  61. package/services/turnComparisonAnalyzer.js +494 -0
  62. package/components/MobileEvalDashboard.tsx +0 -267
  63. package/components/comparison/DeltaAnalysisTable.tsx +0 -137
  64. package/components/comparison/ProfileComparisonCard.tsx +0 -176
  65. package/components/comparison/RecognitionABMode.tsx +0 -385
  66. package/components/comparison/RecognitionMetricsPanel.tsx +0 -135
  67. package/components/comparison/WinnerIndicator.tsx +0 -64
  68. package/components/comparison/index.ts +0 -5
  69. package/components/mobile/BottomSheet.tsx +0 -233
  70. package/components/mobile/DimensionBreakdown.tsx +0 -210
  71. package/components/mobile/DocsView.tsx +0 -363
  72. package/components/mobile/LogsView.tsx +0 -481
  73. package/components/mobile/PsychodynamicQuadrant.tsx +0 -261
  74. package/components/mobile/QuickTestView.tsx +0 -1098
  75. package/components/mobile/RecognitionTypeChart.tsx +0 -124
  76. package/components/mobile/RecognitionView.tsx +0 -809
  77. package/components/mobile/RunDetailView.tsx +0 -261
  78. package/components/mobile/RunHistoryView.tsx +0 -367
  79. package/components/mobile/ScoreRadial.tsx +0 -211
  80. package/components/mobile/StreamingLogPanel.tsx +0 -230
  81. package/components/mobile/SynthesisStrategyChart.tsx +0 -140
  82. package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +0 -52
  83. package/docs/research/ABLATION-MODEL-SELECTION.md +0 -53
  84. package/docs/research/ADVANCED-EVAL-ANALYSIS.md +0 -60
  85. package/docs/research/ANOVA-RESULTS-2026-01-14.md +0 -257
  86. package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +0 -586
  87. package/docs/research/COST-ANALYSIS.md +0 -56
  88. package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +0 -340
  89. package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +0 -291
  90. package/docs/research/EVAL-SYSTEM-ANALYSIS.md +0 -306
  91. package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +0 -301
  92. package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +0 -1988
  93. package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +0 -282
  94. package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +0 -147
  95. package/docs/research/PAPER-EXTENSION-DYADIC.md +0 -204
  96. package/docs/research/PAPER-UNIFIED.md +0 -659
  97. package/docs/research/PAPER-UNIFIED.pdf +0 -0
  98. package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +0 -356
  99. package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +0 -419
  100. package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +0 -1637
  101. package/docs/research/archive/paper-multiagent-tutor.tex +0 -978
  102. package/docs/research/paper-draft/full-paper.md +0 -136
  103. package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
  104. package/docs/research/paper-draft/references.bib +0 -515
  105. package/docs/research/transcript-baseline.md +0 -139
  106. package/docs/research/transcript-recognition-multiagent.md +0 -187
  107. package/hooks/useEvalData.ts +0 -625
  108. package/server-init.js +0 -45
  109. package/services/benchmarkService.js +0 -1892
  110. package/types.ts +0 -165
  111. package/utils/haptics.ts +0 -45
@@ -0,0 +1,87 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * Seed Database
4
+ *
5
+ * Creates a small sample dataset so new users can explore the CLI
6
+ * (runs, report, export) without running a full evaluation.
7
+ *
8
+ * Usage: node scripts/seed-db.js
9
+ */
10
+
11
+ import * as evaluationStore from '../services/evaluationStore.js';
12
+
13
const SEED_RUN_ID = 'seed-sample-factorial';

// Rubric dimensions that receive a per-result 1-5 score.
const SCORE_DIMENSIONS = [
  'relevance',
  'specificity',
  'pedagogical',
  'personalization',
  'actionability',
  'tone',
];

/**
 * Produces one rubric score near `base`.
 *
 * Adds uniform noise in [-0.5, 0.5) to `base`, rounds to the nearest integer
 * and clamps the result to the 1-5 range.
 *
 * @param {number} base - Target score (overall score divided by 20).
 * @returns {number} Integer between 1 and 5 inclusive.
 */
function jitteredScore(base) {
  const noisy = Math.round(base + (Math.random() - 0.5));
  return Math.min(5, Math.max(1, noisy));
}

// Check if seed data already exists.
// NOTE(review): the run below is created through createRun(), which (per the
// original author's comment) auto-generates its ID, and this script never
// stores anything under SEED_RUN_ID. This guard therefore probably never
// fires and re-running the script will add another seed run — confirm against
// evaluationStore and, if it supports caller-supplied IDs, pass SEED_RUN_ID.
const existing = evaluationStore.getRun(SEED_RUN_ID);
if (existing) {
  console.log(`Seed run '${SEED_RUN_ID}' already exists. Delete it first to re-seed.`);
  process.exit(0);
}

console.log('Creating seed evaluation run...');

// 8 factorial cells with representative scores (per the original author:
// matching paper Table 5 means). Each cell is one combination of
// recognition x multi-agent tutor x learner architecture.
const cells = [
  { profile: 'cell_1_base_single_unified', recog: false, multi: false, learner: 'unified', score: 77.6 },
  { profile: 'cell_2_base_single_psycho', recog: false, multi: false, learner: 'ego_superego', score: 80.0 },
  { profile: 'cell_3_base_multi_unified', recog: false, multi: true, learner: 'unified', score: 76.6 },
  { profile: 'cell_4_base_multi_psycho', recog: false, multi: true, learner: 'ego_superego', score: 81.5 },
  { profile: 'cell_5_recog_single_unified', recog: true, multi: false, learner: 'unified', score: 92.8 },
  { profile: 'cell_6_recog_single_psycho', recog: true, multi: false, learner: 'ego_superego', score: 83.4 },
  { profile: 'cell_7_recog_multi_unified', recog: true, multi: true, learner: 'unified', score: 92.3 },
  { profile: 'cell_8_recog_multi_psycho', recog: true, multi: true, learner: 'ego_superego', score: 86.7 },
];

// Create the run through the store API; the store assigns the run ID.
const run = evaluationStore.createRun({
  description: 'Sample 2x2x2 factorial (seed data for demonstration)',
  totalScenarios: 1,
  totalConfigurations: cells.length,
});

// Use the store-assigned ID for every result and for the final report hint.
const runId = run.id;

for (const cell of cells) {
  // Overall scores are on a 0-100 scale; dividing by 20 maps them to 0-5,
  // and jitteredScore() clamps each dimension into 1-5.
  const base = cell.score / 20;

  evaluationStore.storeResult(runId, {
    scenarioId: 'struggling_learner',
    scenarioName: 'Struggling Learner',
    provider: 'openrouter',
    model: 'moonshotai/kimi-k2.5',
    profileName: cell.profile,
    suggestions: [{
      type: 'review',
      priority: 'high',
      title: 'Sample suggestion',
      message: `Sample ${cell.recog ? 'recognition-theory' : 'base'} tutor response for a struggling learner.`,
    }],
    // Random latency between 5000 and 14999 ms.
    latencyMs: 5000 + Math.floor(Math.random() * 10000),
    scores: Object.fromEntries(
      SCORE_DIMENSIONS.map((dimension) => [dimension, jitteredScore(base)]),
    ),
    overallScore: cell.score,
    judgeModel: 'seed-data',
    success: true,
    factors: {
      recognition: cell.recog,
      multi_agent_tutor: cell.multi,
      multi_agent_learner: cell.learner === 'ego_superego',
    },
    learnerArchitecture: cell.learner,
  });
}

// Mark run complete
evaluationStore.completeRun(runId);

console.log(`Seed run created: ${runId}`);
console.log(` ${cells.length} factorial cells, 1 scenario each`);
console.log(` Try: node scripts/eval-cli.js runs`);
console.log(` Try: node scripts/eval-cli.js report ${runId}`);
@@ -0,0 +1,64 @@
1
+ #!/usr/bin/env node
2
+ import fs from 'fs';
3
+ import path from 'path';
4
+
5
const dir = 'logs/tutor-dialogues';

// Only dialogue files modified on or after this date are scanned.
const MODIFIED_SINCE = new Date('2026-02-03');

/**
 * Shortens `text` to at most `limit` characters, appending "..." only when
 * something was actually cut off.
 *
 * @param {string} text - Text to shorten.
 * @param {number} limit - Maximum number of characters to keep.
 * @returns {string} The original text, or its first `limit` characters plus "...".
 */
function truncate(text, limit) {
  return text.length > limit ? `${text.substring(0, limit)}...` : text;
}

/**
 * Prints a heading followed by up to `count` examples (file, title, message preview).
 *
 * @param {string} heading - Section heading line.
 * @param {Array<{file: string, title: string, message: string}>} examples - Examples to print.
 * @param {number} count - Maximum number of examples to print.
 * @param {number} previewChars - Maximum message characters shown per example.
 */
function printExamples(heading, examples, count, previewChars) {
  console.log(`${heading}\n`);
  for (const ex of examples.slice(0, count)) {
    console.log(`File: ${ex.file}`);
    console.log(`Title: "${ex.title}"`);
    console.log(`Message: ${truncate(ex.message, previewChars)}`);
    console.log('---\n');
  }
}

// Fail with a readable message instead of an uncaught ENOENT from readdirSync.
if (!fs.existsSync(dir)) {
  console.error(`Error: dialogue log directory not found: ${dir}`);
  process.exit(1);
}

const files = fs.readdirSync(dir).filter((f) => f.endsWith('.json'));

// Get recent files (stat.mtime is already a Date, so it compares directly)
const recentFiles = files.filter((f) => {
  const stat = fs.statSync(path.join(dir, f));
  return stat.mtime >= MODIFIED_SINCE;
});

console.log(`Scanning ${recentFiles.length} dialogue files...\n`);

// Find recognition profile dialogues
const recognitionExamples = [];

for (const f of recentFiles) {
  try {
    const d = JSON.parse(fs.readFileSync(path.join(dir, f), 'utf8'));
    if (d.profileName !== 'recognition') continue;

    const suggestions = d.suggestions || [];
    if (suggestions.length === 0) continue;

    // Only the first suggestion's title and message are inspected.
    const first = suggestions[0];
    const title = first.title || '';
    const message = first.message || '';
    const hasReview = `${title} ${message}`.toLowerCase().includes('review');

    // Store suggestion content
    recognitionExamples.push({ file: f, title, message, hasReview });
  } catch (err) {
    // Best-effort scan: report the unreadable or malformed file and keep going
    // instead of silently dropping it from the totals.
    console.warn(`Skipping ${f}: ${err.message}`);
  }
}

// Split examples by whether the first suggestion mentions "review"
const failingExamples = recognitionExamples.filter((e) => !e.hasReview);
const passingExamples = recognitionExamples.filter((e) => e.hasReview);

console.log(`Recognition profile: ${recognitionExamples.length} total dialogues`);
console.log(` With "review": ${passingExamples.length}`);
console.log(` Without "review": ${failingExamples.length}\n`);

printExamples('=== FAILING EXAMPLES (no "review" in text) ===', failingExamples, 6, 300);
printExamples('=== PASSING EXAMPLES (has "review") ===', passingExamples, 3, 200);
+ }
@@ -0,0 +1,192 @@
1
+ #!/usr/bin/env node
2
+
3
+ /**
4
+ * Content Validation CLI
5
+ *
6
+ * Validates that the content package is accessible and all lectures load
7
+ * correctly. Also previews parsed content for debugging.
8
+ *
9
+ * Usage:
10
+ * node scripts/validate-content.js # Validate all content
11
+ * node scripts/validate-content.js --lecture 479-lecture-3 # Show parsed lecture
12
+ * node scripts/validate-content.js --preview 479-lecture-3 # Show full curriculum context
13
+ * node scripts/validate-content.js --scenarios # Check all scenarios' content refs
14
+ */
15
+
16
+ import * as contentResolver from '../services/contentResolver.js';
17
+ import * as evalConfigLoader from '../services/evalConfigLoader.js';
18
+
19
+ // ── Helpers ────────────────────────────────────────────────────────────────────
20
+
21
/**
 * Configures the content resolver from the content settings returned by
 * evalConfigLoader.getContentConfig().
 *
 * Exits the process with code 1 when the settings have no
 * content_package_path, or when the resolver reports it is not configured
 * after configure() has been called.
 *
 * @returns {object} The content settings object that was used.
 */
function initContentResolver() {
  const settings = evalConfigLoader.getContentConfig();
  const packagePath = settings?.content_package_path;

  if (!packagePath) {
    console.error('Error: No content.content_package_path in config/eval-settings.yaml');
    process.exit(1);
  }

  const resolverOptions = {
    contentPackagePath: packagePath,
    maxLectureChars: settings.max_lecture_chars,
    includeSpeakerNotes: settings.include_speaker_notes,
  };
  contentResolver.configure(resolverOptions);

  const ready = contentResolver.isConfigured();
  if (!ready) {
    console.error(`Error: Content directory not found at: ${packagePath}`);
    process.exit(1);
  }

  return settings;
}
41
+
42
+ // ── Commands ──────────────────────────────────────────────────────────────────
43
+
44
/**
 * Default command: lists every available course with its metadata, then runs
 * contentResolver.validateContent() and reports the outcome.
 *
 * Exits the process with code 1 when validation returns one or more errors.
 */
function validateAll() {
  const { content_package_path: packagePath } = initContentResolver();
  console.log(`Content package: ${packagePath}`);
  console.log('');

  const courseIds = contentResolver.listAvailableCourses();
  console.log(`Found ${courseIds.length} course(s): ${courseIds.join(', ')}`);
  console.log('');

  for (const courseId of courseIds) {
    const meta = contentResolver.loadCourseMeta(courseId);

    // No metadata means the course could not be loaded; report and move on.
    if (!meta) {
      console.log(` [${courseId}] ERROR: could not load course.md`);
      continue;
    }

    console.log(` [${courseId}] ${meta.title || '(no title)'}`);
    if (meta.instructor) {
      console.log(` Instructor: ${meta.instructor}`);
    }
    if (meta.objectives?.length) {
      console.log(` Objectives: ${meta.objectives.length}`);
    }
  }

  console.log('');
  const problems = contentResolver.validateContent();

  if (problems.length === 0) {
    console.log('Validation PASSED - all content loads correctly.');
    return;
  }

  console.log(`Validation FAILED - ${problems.length} error(s):`);
  for (const problem of problems) {
    console.log(` - ${problem}`);
  }
  process.exit(1);
}
76
+
77
/**
 * `--lecture` command: loads one lecture, parses it, and prints a summary
 * (character count, slide count, speaker-note count) followed by a one-line
 * preview of each slide and each speaker-notes block.
 *
 * Exits the process with code 1 when the lecture cannot be loaded.
 *
 * @param {string} lectureRef - Lecture reference, e.g. "479-lecture-3".
 */
function showLecture(lectureRef) {
  initContentResolver();

  const markdown = contentResolver.loadLecture(lectureRef);
  if (!markdown) {
    console.error(`Error: Could not load lecture "${lectureRef}"`);
    process.exit(1);
  }

  // Flattens the first `limit` characters onto one line; "..." marks a cut.
  const oneLinePreview = (text, limit) => {
    const head = text.slice(0, limit).replace(/\n/g, ' ');
    const suffix = text.length > limit ? '...' : '';
    return `${head}${suffix}`;
  };

  const { slides, notes } = contentResolver.parseLectureMarkdown(markdown);
  console.log(`Lecture: ${lectureRef}`);
  console.log(`Total characters: ${markdown.length}`);
  console.log(`Slides: ${slides.length}`);
  console.log(`Speaker notes blocks: ${notes.length}`);
  console.log('');

  for (const [index, slide] of slides.entries()) {
    console.log(` Slide ${index + 1}: ${oneLinePreview(slide, 120)}`);
  }

  if (notes.length > 0) {
    console.log('');
    console.log('Speaker Notes:');
    for (const [index, note] of notes.entries()) {
      console.log(` [${index + 1}] ${oneLinePreview(note, 100)}`);
    }
  }
}
108
+
109
/**
 * `--preview` command: builds the curriculum context for one lecture and
 * prints it in full between two 80-character "=" divider lines.
 *
 * Exits the process with code 1 when no context could be built.
 *
 * @param {string} lectureRef - Lecture reference passed as `currentContent`.
 */
function previewCurriculum(lectureRef) {
  initContentResolver();

  const curriculumText = contentResolver.buildCurriculumContext({ currentContent: lectureRef });
  if (!curriculumText) {
    console.error(`Error: Could not build curriculum context for "${lectureRef}"`);
    process.exit(1);
  }

  const divider = '='.repeat(80);
  console.log(`Curriculum context for: ${lectureRef}`);
  console.log(`Total characters: ${curriculumText.length}`);
  console.log(divider);
  console.log(curriculumText);
  console.log(divider);
}
127
+
128
/**
 * `--scenarios` command: for every configured scenario, resolves its content
 * reference and checks that the referenced lecture loads.
 *
 * Prints one line per scenario ([OK], [FAIL] or [NONE]) and a final tally.
 * Exits the process with code 1 when at least one referenced lecture could
 * not be loaded.
 */
function checkScenarios() {
  initContentResolver();

  const scenarioList = evalConfigLoader.listScenarios();
  console.log(`Checking ${scenarioList.length} scenario(s) for content references...\n`);

  const tally = { resolved: 0, unresolved: 0, noContent: 0 };

  for (const { id } of scenarioList) {
    const scenario = evalConfigLoader.getScenario(id);
    const { currentContent } = contentResolver.resolveScenarioContent(scenario);

    // Scenarios without a content reference are counted but are not failures.
    if (!currentContent) {
      console.log(` [NONE] ${id} → no content reference (will use course overview only)`);
      tally.noContent += 1;
      continue;
    }

    const lectureText = contentResolver.loadLecture(currentContent);
    if (lectureText) {
      console.log(` [OK] ${id} → ${currentContent} (${lectureText.length} chars)`);
      tally.resolved += 1;
    } else {
      console.log(` [FAIL] ${id} → ${currentContent} (NOT FOUND)`);
      tally.unresolved += 1;
    }
  }

  console.log('');
  console.log(`Results: ${tally.resolved} resolved, ${tally.unresolved} failed, ${tally.noContent} no content ref`);

  if (tally.unresolved > 0) {
    process.exit(1);
  }
}
164
+
165
+ // ── Main ──────────────────────────────────────────────────────────────────────
166
+
167
const args = process.argv.slice(2);

/**
 * Returns the lecture ref that follows `flag` on the command line.
 *
 * Prints "Missing lecture ref" and exits with code 1 when the flag is the last
 * argument or is immediately followed by another `--option`, so that e.g.
 * `--lecture --scenarios` is not mistaken for a lecture named "--scenarios".
 *
 * @param {string} flag - The option name to look up, e.g. "--lecture".
 * @returns {string} The argument immediately after `flag`.
 */
function requireLectureRef(flag) {
  const ref = args[args.indexOf(flag) + 1];
  if (!ref || ref.startsWith('--')) {
    console.error('Missing lecture ref');
    process.exit(1);
  }
  return ref;
}

if (args.includes('--help') || args.includes('-h')) {
  console.log(`Usage:
  node scripts/validate-content.js # Validate all content
  node scripts/validate-content.js --lecture 479-lecture-3 # Show parsed lecture
  node scripts/validate-content.js --preview 479-lecture-3 # Show full curriculum context
  node scripts/validate-content.js --scenarios # Check all scenarios' content refs`);
  process.exit(0);
}

// Exactly one command runs; precedence is --lecture, --preview, --scenarios,
// and full validation when none of them is given.
if (args.includes('--lecture')) {
  showLecture(requireLectureRef('--lecture'));
} else if (args.includes('--preview')) {
  previewCurriculum(requireLectureRef('--preview'));
} else if (args.includes('--scenarios')) {
  checkScenarios();
} else {
  validateAll();
}
package/server.js CHANGED
@@ -20,10 +20,11 @@
20
20
  import express from 'express';
21
21
  import path from 'path';
22
22
  import { fileURLToPath } from 'url';
23
- import { existsSync, mkdirSync } from 'fs';
23
+ import { existsSync, mkdirSync, readFileSync } from 'fs';
24
24
 
25
25
  const __filename = fileURLToPath(import.meta.url);
26
26
  const __dirname = path.dirname(__filename);
27
+ const pkg = JSON.parse(readFileSync(path.join(__dirname, 'package.json'), 'utf-8'));
27
28
 
28
29
  const app = express();
29
30
  const PORT = Number(process.env.PORT) || 8081;
@@ -44,7 +45,7 @@ app.get('/health', (req, res) => {
44
45
  res.json({
45
46
  status: 'ok',
46
47
  package: '@machinespirits/eval',
47
- version: '0.1.0',
48
+ version: pkg.version,
48
49
  mode: isStandalone ? 'standalone' : 'mounted',
49
50
  });
50
51
  });