@machinespirits/eval 0.1.2 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +161 -0
  3. package/config/eval-settings.yaml +18 -0
  4. package/config/evaluation-rubric-learner.yaml +277 -0
  5. package/config/evaluation-rubric.yaml +613 -0
  6. package/config/interaction-eval-scenarios.yaml +93 -50
  7. package/config/learner-agents.yaml +124 -193
  8. package/config/machinespirits-eval.code-workspace +11 -0
  9. package/config/providers.yaml +60 -0
  10. package/config/suggestion-scenarios.yaml +1399 -0
  11. package/config/tutor-agents.yaml +716 -0
  12. package/docs/EVALUATION-VARIABLES.md +589 -0
  13. package/docs/REPLICATION-PLAN.md +577 -0
  14. package/index.js +15 -6
  15. package/package.json +16 -22
  16. package/routes/evalRoutes.js +88 -36
  17. package/scripts/analyze-judge-reliability.js +401 -0
  18. package/scripts/analyze-run.js +97 -0
  19. package/scripts/analyze-run.mjs +282 -0
  20. package/scripts/analyze-validation-failures.js +141 -0
  21. package/scripts/check-run.mjs +17 -0
  22. package/scripts/code-impasse-strategies.js +1132 -0
  23. package/scripts/compare-runs.js +44 -0
  24. package/scripts/compare-suggestions.js +80 -0
  25. package/scripts/compare-transformation.js +116 -0
  26. package/scripts/dig-into-run.js +158 -0
  27. package/scripts/eval-cli.js +2626 -0
  28. package/scripts/generate-paper-figures.py +452 -0
  29. package/scripts/qualitative-analysis-ai.js +1313 -0
  30. package/scripts/qualitative-analysis.js +688 -0
  31. package/scripts/seed-db.js +87 -0
  32. package/scripts/show-failed-suggestions.js +64 -0
  33. package/scripts/validate-content.js +192 -0
  34. package/server.js +3 -2
  35. package/services/__tests__/evalConfigLoader.test.js +338 -0
  36. package/services/anovaStats.js +499 -0
  37. package/services/contentResolver.js +407 -0
  38. package/services/dialogueTraceAnalyzer.js +454 -0
  39. package/services/evalConfigLoader.js +625 -0
  40. package/services/evaluationRunner.js +2171 -270
  41. package/services/evaluationStore.js +564 -29
  42. package/services/learnerConfigLoader.js +75 -5
  43. package/services/learnerRubricEvaluator.js +284 -0
  44. package/services/learnerTutorInteractionEngine.js +375 -0
  45. package/services/processUtils.js +18 -0
  46. package/services/progressLogger.js +98 -0
  47. package/services/promptRecommendationService.js +31 -26
  48. package/services/promptRewriter.js +427 -0
  49. package/services/rubricEvaluator.js +543 -70
  50. package/services/streamingReporter.js +104 -0
  51. package/services/turnComparisonAnalyzer.js +494 -0
  52. package/components/MobileEvalDashboard.tsx +0 -267
  53. package/components/comparison/DeltaAnalysisTable.tsx +0 -137
  54. package/components/comparison/ProfileComparisonCard.tsx +0 -176
  55. package/components/comparison/RecognitionABMode.tsx +0 -385
  56. package/components/comparison/RecognitionMetricsPanel.tsx +0 -135
  57. package/components/comparison/WinnerIndicator.tsx +0 -64
  58. package/components/comparison/index.ts +0 -5
  59. package/components/mobile/BottomSheet.tsx +0 -233
  60. package/components/mobile/DimensionBreakdown.tsx +0 -210
  61. package/components/mobile/DocsView.tsx +0 -363
  62. package/components/mobile/LogsView.tsx +0 -481
  63. package/components/mobile/PsychodynamicQuadrant.tsx +0 -261
  64. package/components/mobile/QuickTestView.tsx +0 -1098
  65. package/components/mobile/RecognitionTypeChart.tsx +0 -124
  66. package/components/mobile/RecognitionView.tsx +0 -809
  67. package/components/mobile/RunDetailView.tsx +0 -261
  68. package/components/mobile/RunHistoryView.tsx +0 -367
  69. package/components/mobile/ScoreRadial.tsx +0 -211
  70. package/components/mobile/StreamingLogPanel.tsx +0 -230
  71. package/components/mobile/SynthesisStrategyChart.tsx +0 -140
  72. package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +0 -52
  73. package/docs/research/ABLATION-MODEL-SELECTION.md +0 -53
  74. package/docs/research/ADVANCED-EVAL-ANALYSIS.md +0 -60
  75. package/docs/research/ANOVA-RESULTS-2026-01-14.md +0 -257
  76. package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +0 -586
  77. package/docs/research/COST-ANALYSIS.md +0 -56
  78. package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +0 -340
  79. package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +0 -291
  80. package/docs/research/EVAL-SYSTEM-ANALYSIS.md +0 -306
  81. package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +0 -301
  82. package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +0 -1988
  83. package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +0 -282
  84. package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +0 -147
  85. package/docs/research/PAPER-EXTENSION-DYADIC.md +0 -204
  86. package/docs/research/PAPER-UNIFIED.md +0 -659
  87. package/docs/research/PAPER-UNIFIED.pdf +0 -0
  88. package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +0 -356
  89. package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +0 -419
  90. package/docs/research/apa.csl +0 -2133
  91. package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +0 -1637
  92. package/docs/research/archive/paper-multiagent-tutor.tex +0 -978
  93. package/docs/research/paper-draft/full-paper.md +0 -136
  94. package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
  95. package/docs/research/paper-draft/references.bib +0 -515
  96. package/docs/research/transcript-baseline.md +0 -139
  97. package/docs/research/transcript-recognition-multiagent.md +0 -187
  98. package/hooks/useEvalData.ts +0 -625
  99. package/server-init.js +0 -45
  100. package/services/benchmarkService.js +0 -1892
  101. package/types.ts +0 -165
  102. package/utils/haptics.ts +0 -45
@@ -0,0 +1,407 @@
1
+ /**
2
+ * Content Resolver
3
+ *
4
+ * Loads actual course content (markdown lectures) from the content package
5
+ * repo on disk, builds structured curriculum context strings, and provides
6
+ * them to tutorApi.buildContext() so the tutor can give content-specific
7
+ * responses during evaluations.
8
+ *
9
+ * Uses mtime-based caching (same pattern as evalConfigLoader).
10
+ */
11
+
12
+ import fs from 'fs';
13
+ import path from 'path';
14
+
15
+ // ── Configuration ──────────────────────────────────────────────────────────────
16
+
17
+ let contentPackagePath = null;
18
+ let maxLectureChars = 50000;
19
+ let includeSpeakerNotes = true;
20
+
21
+ // ── Caches (mtime-based) ──────────────────────────────────────────────────────
22
+
23
+ /** @type {Map<string, {data: any, mtime: number}>} */
24
+ const courseMetaCache = new Map();
25
+ /** @type {Map<string, {data: string, mtime: number}>} */
26
+ const lectureRawCache = new Map();
27
+
28
+ // ── Public API ────────────────────────────────────────────────────────────────
29
+
30
/**
 * Configure the content resolver.
 *
 * Only options that are actually supplied are applied; options that are
 * omitted (or null/undefined) keep their current value. Calling with no
 * argument, or with null, is a no-op.
 *
 * When the content package path changes, both caches are cleared. Their keys
 * (course ID / lecture ref) do not include the root path, so entries from a
 * previous root could otherwise be served if the new files happen to carry
 * the same mtime (e.g. a copy made with preserved timestamps).
 *
 * @param {Object} [opts]
 * @param {string} [opts.contentPackagePath] - Absolute or eval-relative path
 * @param {number} [opts.maxLectureChars]
 * @param {boolean} [opts.includeSpeakerNotes]
 */
export function configure(opts = {}) {
  const {
    contentPackagePath: nextPath,
    maxLectureChars: nextMaxChars,
    includeSpeakerNotes: nextIncludeNotes,
  } = opts ?? {};

  if (nextPath) {
    if (nextPath !== contentPackagePath) {
      courseMetaCache.clear();
      lectureRawCache.clear();
    }
    contentPackagePath = nextPath;
  }
  // `!= null` (not truthiness) so that 0 and false are accepted as values.
  if (nextMaxChars != null) {
    maxLectureChars = nextMaxChars;
  }
  if (nextIncludeNotes != null) {
    includeSpeakerNotes = nextIncludeNotes;
  }
}
49
+
50
/**
 * Report whether the resolver is usable: a content package path has been set
 * and a `courses` directory exists directly beneath it.
 *
 * @returns {boolean} false when no path is set, when the stat call fails, or
 *   when `courses` is not a directory
 */
export function isConfigured() {
  if (contentPackagePath) {
    try {
      const coursesDir = path.join(contentPackagePath, 'courses');
      const coursesStats = fs.statSync(coursesDir);
      return coursesStats.isDirectory();
    } catch {
      // Any failure while locating the directory counts as "not configured".
    }
  }
  return false;
}
61
+
62
+ // ── Course Metadata ───────────────────────────────────────────────────────────
63
+
64
/**
 * Load the parsed YAML frontmatter of `courses/<courseId>/course.md`.
 *
 * Results are cached per course ID and reused for as long as the file's
 * mtime is unchanged.
 *
 * @param {string|number} courseId - e.g. "479". Coerced to a string, because
 *   path.join() throws a TypeError on non-string segments and callers may
 *   hand over a numeric ID (presumably from unquoted config values — verify
 *   against callers).
 * @returns {Object|null} Parsed frontmatter, or null when the resolver has no
 *   content path, the ID is missing, or the file cannot be read or parsed
 */
export function loadCourseMeta(courseId) {
  if (!contentPackagePath || courseId == null) return null;

  const id = String(courseId);

  try {
    // Path construction lives inside the try so that any failure here is
    // reported the same way as a missing file: by returning null.
    const filePath = path.join(contentPackagePath, 'courses', id, 'course.md');
    const { mtimeMs } = fs.statSync(filePath);

    const cached = courseMetaCache.get(id);
    if (cached && cached.mtime === mtimeMs) {
      return cached.data;
    }

    const raw = fs.readFileSync(filePath, 'utf-8');
    const meta = parseFrontmatter(raw);
    courseMetaCache.set(id, { data: meta, mtime: mtimeMs });
    return meta;
  } catch {
    return null;
  }
}
90
+
91
+ // ── Lecture Loading ───────────────────────────────────────────────────────────
92
+
93
/**
 * Load a lecture's raw markdown content from
 * `courses/<courseId>/lecture-<n>.md`.
 *
 * Results are cached per lecture ref and reused for as long as the file's
 * mtime is unchanged.
 *
 * @param {string} lectureRef - e.g. "479-lecture-3"
 * @returns {string|null} Raw markdown text, or null when the resolver has no
 *   content path, the ref is not a well-formed string, or the file cannot be
 *   read
 */
export function loadLecture(lectureRef) {
  if (!contentPackagePath) return null;

  // The ref parser pattern-matches on a string; anything else (null, number,
  // object) is simply "no such lecture" rather than a thrown TypeError.
  if (typeof lectureRef !== 'string') return null;

  const parsed = parseLectureRef(lectureRef);
  if (!parsed) return null;

  const { courseId, lectureNum } = parsed;
  const filePath = path.join(
    contentPackagePath, 'courses', courseId, `lecture-${lectureNum}.md`
  );

  try {
    const { mtimeMs } = fs.statSync(filePath);

    const cached = lectureRawCache.get(lectureRef);
    if (cached && cached.mtime === mtimeMs) {
      return cached.data;
    }

    const raw = fs.readFileSync(filePath, 'utf-8');
    lectureRawCache.set(lectureRef, { data: raw, mtime: mtimeMs });
    return raw;
  } catch {
    return null;
  }
}
123
+
124
/**
 * Parse lecture markdown into slides and speaker notes.
 *
 * Slides are separated by a line consisting of `---` (LF or CRLF line
 * endings; trailing spaces/tabs on the delimiter line are tolerated). Each
 * slide is trimmed and empty slides are dropped.
 *
 * Speaker notes are the bodies of ```notes fenced blocks. Every notes block
 * in every slide is collected, in document order. The notes blocks are NOT
 * removed from the slide text, and `notes` is not index-aligned with
 * `slides`.
 *
 * @param {string} raw - Raw markdown content
 * @returns {{ slides: string[], notes: string[] }} Empty arrays when `raw` is
 *   not a string or contains no slide text
 */
export function parseLectureMarkdown(raw) {
  if (typeof raw !== 'string') return { slides: [], notes: [] };

  const slides = raw
    .split(/\r?\n---[ \t]*\r?\n/)
    .map((slide) => slide.trim())
    .filter(Boolean);

  // `\s*\n` after the fence tag already absorbs a CR, so the same pattern
  // serves LF and CRLF content.
  const notes = slides.flatMap((slide) =>
    Array.from(slide.matchAll(/```notes\s*\n([\s\S]*?)```/g), (m) => m[1].trim())
  );

  return { slides, notes };
}
148
+
149
+ // ── Curriculum Context Builder ────────────────────────────────────────────────
150
+
151
/**
 * Build the formatted curriculum context string that gets passed to
 * tutorApi.buildContext() as the second argument.
 *
 * For each requested course the output contains a header (title, optional
 * instructor/semester, description, objectives) and a "Lecture Overview"
 * list. When `currentContent` resolves to a lecture, its markdown is appended
 * after a `---` separator, with speaker notes stripped if the resolver is
 * configured to exclude them and truncated to the configured character limit.
 *
 * Lecture refs in the overview are derived from the lecture FILE NAMES, not
 * from their position in the listing, so gaps in numbering (lecture-1,
 * lecture-3, ...) still yield refs that point at real files.
 *
 * @param {Object} opts
 * @param {string|null} opts.currentContent - Lecture ref, e.g. "479-lecture-3"
 * @param {string[]} [opts.courseIds] - Course IDs to include (derived from
 *   currentContent if omitted). Values are coerced to strings and
 *   de-duplicated.
 * @returns {string|null} The context string, or null when the resolver is not
 *   configured, no course can be determined, or nothing could be assembled
 */
export function buildCurriculumContext(opts = {}) {
  if (!isConfigured()) return null;

  const { currentContent = null, courseIds: explicitCourseIds } = opts ?? {};

  // Determine course IDs to include
  let requestedIds = explicitCourseIds;
  if (!requestedIds && currentContent) {
    const parsed = parseLectureRef(currentContent);
    if (parsed) requestedIds = [parsed.courseId];
  }
  if (!requestedIds || requestedIds.length === 0) {
    console.warn('[contentResolver] No course hint provided (missing current_content or course_ids on scenario) — skipping curriculum context');
    return null;
  }

  // Normalize: accept a single scalar, coerce to strings (path helpers need
  // strings), and drop duplicates such as 479 and "479".
  const idList = Array.isArray(requestedIds) ? requestedIds : [requestedIds];
  const courseIds = [...new Set(idList.map(String))];

  const parts = [];

  for (const courseId of courseIds) {
    const meta = loadCourseMeta(courseId);
    if (!meta) continue;

    // Course overview
    parts.push(`## Course: EPOL ${courseId} - ${meta.title || courseId}`);
    if (meta.instructor) parts.push(`Instructor: ${meta.instructor}${meta.semester ? ` | Semester: ${meta.semester}` : ''}`);
    if (meta.description) parts.push(`Description: ${String(meta.description).trim()}`);
    // Only iterate real arrays; a scalar string would otherwise be listed
    // one character per bullet.
    if (Array.isArray(meta.objectives) && meta.objectives.length > 0) {
      parts.push('Objectives:');
      for (const objective of meta.objectives) {
        parts.push(`- ${objective}`);
      }
    }

    // Lecture listing
    const lectureFiles = listCourseLectures(courseId);
    if (lectureFiles.length > 0) {
      parts.push('');
      parts.push('### Lecture Overview');
      for (const fileName of lectureFiles) {
        const lectureNum = /^lecture-(\d+)\.md$/.exec(fileName)?.[1];
        if (!lectureNum) continue;

        // Keep the digits exactly as they appear in the file name so the ref
        // resolves back to this file.
        const ref = `${courseId}-lecture-${lectureNum}`;
        const displayNum = Number(lectureNum);
        const title = getLectureTitle(courseId, lectureNum) || `Lecture ${displayNum}`;
        const marker = ref === currentContent ? ' **[CURRENT]**' : '';
        parts.push(`${displayNum}. ${title} (${ref})${marker}`);
      }
    }
  }

  // Current lecture full content
  if (currentContent) {
    const raw = loadLecture(currentContent);
    if (raw) {
      parts.push('');
      parts.push('---');
      parts.push('');
      parts.push(`## Current Lecture Content: ${currentContent}`);
      parts.push('');

      let lectureText = raw;

      // Optionally strip speaker notes
      if (!includeSpeakerNotes) {
        lectureText = lectureText.replace(/```notes\s*\n[\s\S]*?```/g, '');
      }

      // Apply character limit
      if (lectureText.length > maxLectureChars) {
        lectureText = lectureText.slice(0, maxLectureChars) + '\n\n[... truncated for token budget ...]';
      }

      parts.push(lectureText);
    }
  }

  const result = parts.join('\n');
  return result || null;
}
236
+
237
+ // ── Scenario Content Resolution ───────────────────────────────────────────────
238
+
239
/**
 * Extract the content reference and course IDs for a scenario.
 *
 * The current lecture ref is taken from:
 *   1. `scenario.current_content` (explicit field)
 *   2. otherwise, the first "Currently viewing: XXX-lecture-N" match in
 *      `scenario.learner_context` (only when that field is a string)
 *
 * Course IDs come from `scenario.course_ids` (an array, or a single scalar),
 * coerced to strings and de-duplicated; the course of the current lecture is
 * appended if it is not already present. Coercing matters because a numeric
 * ID such as 479 would never compare equal to the string "479" parsed from a
 * lecture ref.
 *
 * @param {Object} scenario
 * @returns {{ currentContent: string|null, courseIds: string[] }}
 */
export function resolveScenarioContent(scenario) {
  let currentContent = scenario?.current_content || null;

  // Fallback: extract from learner_context text
  const learnerContext = scenario?.learner_context;
  if (!currentContent && typeof learnerContext === 'string') {
    const match = learnerContext.match(/Currently viewing[:\s]*(\d+-lecture-\d+)/i);
    if (match) {
      currentContent = match[1];
    }
  }

  // Explicit scenario IDs first, normalized to unique strings.
  const declared = scenario?.course_ids;
  let declaredList = [];
  if (Array.isArray(declared)) {
    declaredList = declared;
  } else if (declared != null) {
    // A bare scalar (e.g. `course_ids: 479`) is treated as a one-item list.
    declaredList = [declared];
  }
  const courseIds = [...new Set(declaredList.map(String))];

  // Then the course implied by the current lecture ref, if any.
  if (typeof currentContent === 'string') {
    const parsed = parseLectureRef(currentContent);
    if (parsed && !courseIds.includes(parsed.courseId)) {
      courseIds.push(parsed.courseId);
    }
  }

  return { currentContent, courseIds };
}
271
+
272
+ // ── Discovery ─────────────────────────────────────────────────────────────────
273
+
274
/**
 * List all available course IDs by scanning the courses/ directory.
 *
 * An entry counts as a course when it is a directory (symlinks are followed)
 * that contains a `course.md` file. Entries that cannot be inspected — for
 * example a dangling symlink — are skipped individually instead of causing
 * the whole listing to come back empty.
 *
 * @returns {string[]} Course directory names, in the order the directory
 *   listing returns them; empty when no content path is set or the courses/
 *   directory cannot be read
 */
export function listAvailableCourses() {
  if (!contentPackagePath) return [];

  const coursesDir = path.join(contentPackagePath, 'courses');

  let entries;
  try {
    entries = fs.readdirSync(coursesDir);
  } catch {
    return [];
  }

  return entries.filter((name) => {
    const courseDir = path.join(coursesDir, name);
    try {
      return fs.statSync(courseDir).isDirectory() &&
        fs.existsSync(path.join(courseDir, 'course.md'));
    } catch {
      // Unreadable entry: not a course, but no reason to discard the others.
      return false;
    }
  });
}
293
+
294
/**
 * Validate all content can load. Returns errors (empty array = OK).
 *
 * Checks, in order: the resolver is configured; at least one course exists;
 * each course has loadable frontmatter with a title; each lecture file of the
 * course loads and is at least 50 characters long.
 *
 * Lecture refs are derived from the lecture file names rather than from
 * their position in the listing, so a course whose numbering has gaps is
 * validated against the files that actually exist.
 *
 * @returns {string[]} Array of error messages
 */
export function validateContent() {
  const errors = [];

  if (!isConfigured()) {
    errors.push(`Content package not configured or not found at: ${contentPackagePath || '(not set)'}`);
    return errors;
  }

  const courses = listAvailableCourses();
  if (courses.length === 0) {
    errors.push('No courses found in content package');
    return errors;
  }

  for (const courseId of courses) {
    const meta = loadCourseMeta(courseId);
    if (!meta) {
      errors.push(`Course ${courseId}: failed to load course.md frontmatter`);
      continue;
    }
    if (!meta.title) {
      errors.push(`Course ${courseId}: missing title in frontmatter`);
    }

    for (const fileName of listCourseLectures(courseId)) {
      // "lecture-3.md" -> "<courseId>-lecture-3"
      const ref = `${courseId}-${fileName.replace(/\.md$/, '')}`;
      const raw = loadLecture(ref);
      if (raw == null) {
        errors.push(`Lecture ${ref}: failed to load`);
      } else if (raw.length < 50) {
        // Includes empty files, which load successfully as ''.
        errors.push(`Lecture ${ref}: suspiciously short (${raw.length} chars)`);
      }
    }
  }

  return errors;
}
337
+
338
+ // ── Internal Helpers ──────────────────────────────────────────────────────────
339
+
340
+ import yaml from 'yaml';
341
+
342
/**
 * Parse YAML frontmatter delimited by --- from a markdown file.
 *
 * The frontmatter must start on the very first line. LF and CRLF line
 * endings are both accepted, as is a leading UTF-8 byte-order mark (which
 * reading a file as 'utf-8' leaves in the string).
 *
 * @param {string} content - Full markdown text
 * @returns {*} Whatever the YAML parser yields for the frontmatter body, or
 *   null when the input is not a string, has no frontmatter block, or the
 *   YAML fails to parse
 */
function parseFrontmatter(content) {
  if (typeof content !== 'string') return null;

  const match = content.match(/^\uFEFF?---[ \t]*\r?\n([\s\S]*?)\r?\n---/);
  if (!match) return null;

  try {
    return yaml.parse(match[1]);
  } catch {
    return null;
  }
}
355
+
356
/**
 * Parse a lecture reference like "479-lecture-3" into components.
 *
 * The whole string must match `<digits>-lecture-<digits>`. Both components
 * are returned as the digit strings exactly as written (no numeric
 * conversion).
 *
 * @param {string} ref
 * @returns {{ courseId: string, lectureNum: string }|null} null when `ref`
 *   is not a string or does not match the expected shape
 */
function parseLectureRef(ref) {
  if (typeof ref !== 'string') return null;

  const match = /^(\d+)-lecture-(\d+)$/.exec(ref);
  if (!match) return null;

  const [, courseId, lectureNum] = match;
  return { courseId, lectureNum };
}
364
+
365
/**
 * List lecture files for a course (sorted numerically).
 *
 * Only file names of the form `lecture-<digits>.md` are returned; ordering is
 * by the numeric value of the digits, so lecture-10.md follows lecture-2.md.
 *
 * @param {string|number} courseId - Coerced to a string, because path.join()
 *   throws a TypeError on non-string segments.
 * @returns {string[]} File names (not paths); empty when no content path is
 *   set, the ID is missing, or the course directory cannot be read
 */
function listCourseLectures(courseId) {
  if (!contentPackagePath || courseId == null) return [];

  const lectureNumber = (fileName) => Number.parseInt(fileName.match(/\d+/)[0], 10);

  try {
    const courseDir = path.join(contentPackagePath, 'courses', String(courseId));
    return fs.readdirSync(courseDir)
      .filter((fileName) => /^lecture-\d+\.md$/.test(fileName))
      .sort((a, b) => lectureNumber(a) - lectureNumber(b));
  } catch {
    return [];
  }
}
383
+
384
/**
 * Get the title of a lecture by reading its first heading.
 *
 * The title is the text of the first line that starts with `#` or `##`
 * followed by at least one space or tab. The separator is restricted to
 * spaces/tabs on purpose: a generic whitespace match could cross a line
 * break, so a bare `##` line would make the NEXT line look like the title.
 *
 * @param {string} courseId
 * @param {string|number} lectureNum
 * @returns {string|null} Trimmed heading text, or null when the lecture
 *   cannot be loaded or has no such heading
 */
function getLectureTitle(courseId, lectureNum) {
  const raw = loadLecture(`${courseId}-lecture-${lectureNum}`);
  if (!raw) return null;

  // Look for first ## or # heading
  const heading = /^#{1,2}[ \t]+(.+)$/m.exec(raw);
  return heading ? heading[1].trim() : null;
}
396
+
397
// Default export: the public functions of this module gathered on one
// object, for callers that prefer `contentResolver.fn()` over named imports.
const contentResolver = {
  configure,
  isConfigured,
  loadCourseMeta,
  loadLecture,
  parseLectureMarkdown,
  buildCurriculumContext,
  resolveScenarioContent,
  listAvailableCourses,
  validateContent,
};

export default contentResolver;