npm - @machinespirits/eval - Versions diffs - 0.1.2 → 0.2.1 - Mend

@machinespirits/eval 0.1.2 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (102) hide show

package/LICENSE +21 -0
package/README.md +161 -0
package/config/eval-settings.yaml +18 -0
package/config/evaluation-rubric-learner.yaml +277 -0
package/config/evaluation-rubric.yaml +613 -0
package/config/interaction-eval-scenarios.yaml +93 -50
package/config/learner-agents.yaml +124 -193
package/config/machinespirits-eval.code-workspace +11 -0
package/config/providers.yaml +60 -0
package/config/suggestion-scenarios.yaml +1399 -0
package/config/tutor-agents.yaml +716 -0
package/docs/EVALUATION-VARIABLES.md +589 -0
package/docs/REPLICATION-PLAN.md +577 -0
package/index.js +15 -6
package/package.json +16 -22
package/routes/evalRoutes.js +88 -36
package/scripts/analyze-judge-reliability.js +401 -0
package/scripts/analyze-run.js +97 -0
package/scripts/analyze-run.mjs +282 -0
package/scripts/analyze-validation-failures.js +141 -0
package/scripts/check-run.mjs +17 -0
package/scripts/code-impasse-strategies.js +1132 -0
package/scripts/compare-runs.js +44 -0
package/scripts/compare-suggestions.js +80 -0
package/scripts/compare-transformation.js +116 -0
package/scripts/dig-into-run.js +158 -0
package/scripts/eval-cli.js +2626 -0
package/scripts/generate-paper-figures.py +452 -0
package/scripts/qualitative-analysis-ai.js +1313 -0
package/scripts/qualitative-analysis.js +688 -0
package/scripts/seed-db.js +87 -0
package/scripts/show-failed-suggestions.js +64 -0
package/scripts/validate-content.js +192 -0
package/server.js +3 -2
package/services/__tests__/evalConfigLoader.test.js +338 -0
package/services/anovaStats.js +499 -0
package/services/contentResolver.js +407 -0
package/services/dialogueTraceAnalyzer.js +454 -0
package/services/evalConfigLoader.js +625 -0
package/services/evaluationRunner.js +2171 -270
package/services/evaluationStore.js +564 -29
package/services/learnerConfigLoader.js +75 -5
package/services/learnerRubricEvaluator.js +284 -0
package/services/learnerTutorInteractionEngine.js +375 -0
package/services/processUtils.js +18 -0
package/services/progressLogger.js +98 -0
package/services/promptRecommendationService.js +31 -26
package/services/promptRewriter.js +427 -0
package/services/rubricEvaluator.js +543 -70
package/services/streamingReporter.js +104 -0
package/services/turnComparisonAnalyzer.js +494 -0
package/components/MobileEvalDashboard.tsx +0 -267
package/components/comparison/DeltaAnalysisTable.tsx +0 -137
package/components/comparison/ProfileComparisonCard.tsx +0 -176
package/components/comparison/RecognitionABMode.tsx +0 -385
package/components/comparison/RecognitionMetricsPanel.tsx +0 -135
package/components/comparison/WinnerIndicator.tsx +0 -64
package/components/comparison/index.ts +0 -5
package/components/mobile/BottomSheet.tsx +0 -233
package/components/mobile/DimensionBreakdown.tsx +0 -210
package/components/mobile/DocsView.tsx +0 -363
package/components/mobile/LogsView.tsx +0 -481
package/components/mobile/PsychodynamicQuadrant.tsx +0 -261
package/components/mobile/QuickTestView.tsx +0 -1098
package/components/mobile/RecognitionTypeChart.tsx +0 -124
package/components/mobile/RecognitionView.tsx +0 -809
package/components/mobile/RunDetailView.tsx +0 -261
package/components/mobile/RunHistoryView.tsx +0 -367
package/components/mobile/ScoreRadial.tsx +0 -211
package/components/mobile/StreamingLogPanel.tsx +0 -230
package/components/mobile/SynthesisStrategyChart.tsx +0 -140
package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +0 -52
package/docs/research/ABLATION-MODEL-SELECTION.md +0 -53
package/docs/research/ADVANCED-EVAL-ANALYSIS.md +0 -60
package/docs/research/ANOVA-RESULTS-2026-01-14.md +0 -257
package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +0 -586
package/docs/research/COST-ANALYSIS.md +0 -56
package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +0 -340
package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +0 -291
package/docs/research/EVAL-SYSTEM-ANALYSIS.md +0 -306
package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +0 -301
package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +0 -1988
package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +0 -282
package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +0 -147
package/docs/research/PAPER-EXTENSION-DYADIC.md +0 -204
package/docs/research/PAPER-UNIFIED.md +0 -659
package/docs/research/PAPER-UNIFIED.pdf +0 -0
package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +0 -356
package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +0 -419
package/docs/research/apa.csl +0 -2133
package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +0 -1637
package/docs/research/archive/paper-multiagent-tutor.tex +0 -978
package/docs/research/paper-draft/full-paper.md +0 -136
package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
package/docs/research/paper-draft/references.bib +0 -515
package/docs/research/transcript-baseline.md +0 -139
package/docs/research/transcript-recognition-multiagent.md +0 -187
package/hooks/useEvalData.ts +0 -625
package/server-init.js +0 -45
package/services/benchmarkService.js +0 -1892
package/types.ts +0 -165
package/utils/haptics.ts +0 -45

package/services/contentResolver.js ADDED Viewed

@@ -0,0 +1,407 @@
+/**
+ * Content Resolver
+ *
+ * Loads actual course content (markdown lectures) from the content package
+ * repo on disk, builds structured curriculum context strings, and provides
+ * them to tutorApi.buildContext() so the tutor can give content-specific
+ * responses during evaluations.
+ *
+ * Uses mtime-based caching (same pattern as evalConfigLoader).
+ */
+import fs from 'fs';
+import path from 'path';
+// ── Configuration ──────────────────────────────────────────────────────────────
+let contentPackagePath = null; // content package root; stays null until configure() supplies a path
+let maxLectureChars = 50000; // cap applied to the current lecture text in buildCurriculumContext()
+let includeSpeakerNotes = true; // when false, buildCurriculumContext() strips ```notes blocks
+// ── Caches (mtime-based) ──────────────────────────────────────────────────────
+/**
+ * Parsed course.md frontmatter keyed by course ID. An entry is reused only
+ * while the file's current mtimeMs equals the stored mtime.
+ * @type {Map<string, {data: any, mtime: number}>}
+ */
+const courseMetaCache = new Map();
+/**
+ * Raw lecture markdown keyed by lecture ref (e.g. "479-lecture-3"), guarded
+ * by the same mtimeMs comparison.
+ * @type {Map<string, {data: string, mtime: number}>}
+ */
+const lectureRawCache = new Map();
+// ── Public API ────────────────────────────────────────────────────────────────
+/**
+ * Set the content package root directory.
+ *
+ * @param {Object} opts
+ * @param {string} opts.contentPackagePath - Absolute or eval-relative path
+ * @param {number} [opts.maxLectureChars]
+ * @param {boolean} [opts.includeSpeakerNotes]
+ */
+export function configure(opts) {
+  if (opts.contentPackagePath) {
+    contentPackagePath = opts.contentPackagePath;
+  }
+  if (opts.maxLectureChars != null) {
+    maxLectureChars = opts.maxLectureChars;
+  }
+  if (opts.includeSpeakerNotes != null) {
+    includeSpeakerNotes = opts.includeSpeakerNotes;
+  }
+}
+/**
+ * Whether the resolver is configured and the content directory exists.
+ */
+export function isConfigured() {
+  if (!contentPackagePath) return false;
+  try {
+    return fs.statSync(path.join(contentPackagePath, 'courses')).isDirectory();
+  } catch {
+    return false;
+  }
+}
+// ── Course Metadata ───────────────────────────────────────────────────────────
+/**
+ * Parse YAML frontmatter from a course.md file.
+ *
+ * @param {string} courseId - e.g. "479"
+ * @returns {Object|null} Parsed frontmatter object
+ */
+export function loadCourseMeta(courseId) {
+  if (!contentPackagePath) return null;
+  const filePath = path.join(contentPackagePath, 'courses', courseId, 'course.md');
+  try {
+    const stats = fs.statSync(filePath);
+    const cached = courseMetaCache.get(courseId);
+    if (cached && cached.mtime === stats.mtimeMs) {
+      return cached.data;
+    }
+    const raw = fs.readFileSync(filePath, 'utf-8');
+    const meta = parseFrontmatter(raw);
+    courseMetaCache.set(courseId, { data: meta, mtime: stats.mtimeMs });
+    return meta;
+  } catch {
+    return null;
+  }
+}
+// ── Lecture Loading ───────────────────────────────────────────────────────────
+/**
+ * Load a lecture's raw markdown content.
+ *
+ * @param {string} lectureRef - e.g. "479-lecture-3"
+ * @returns {string|null} Raw markdown text
+ */
+export function loadLecture(lectureRef) {
+  if (!contentPackagePath) return null;
+  const parsed = parseLectureRef(lectureRef);
+  if (!parsed) return null;
+  const filePath = path.join(
+    contentPackagePath, 'courses', parsed.courseId, `lecture-${parsed.lectureNum}.md`
+  );
+  try {
+    const stats = fs.statSync(filePath);
+    const cached = lectureRawCache.get(lectureRef);
+    if (cached && cached.mtime === stats.mtimeMs) {
+      return cached.data;
+    }
+    const raw = fs.readFileSync(filePath, 'utf-8');
+    lectureRawCache.set(lectureRef, { data: raw, mtime: stats.mtimeMs });
+    return raw;
+  } catch {
+    return null;
+  }
+}
+/**
+ * Parse lecture markdown into slides and speaker notes.
+ *
+ * @param {string} raw - Raw markdown content
+ * @returns {{ slides: string[], notes: string[] }}
+ */
+export function parseLectureMarkdown(raw) {
+  // Split on slide delimiter (--- on its own line)
+  const slides = raw.split(/\n---\n/).map(s => s.trim()).filter(Boolean);
+  const notes = [];
+  const contentSlides = [];
+  for (const slide of slides) {
+    // Extract ```notes ... ``` blocks
+    const noteMatch = slide.match(/```notes\s*\n([\s\S]*?)```/);
+    if (noteMatch) {
+      notes.push(noteMatch[1].trim());
+    }
+    contentSlides.push(slide);
+  }
+  return { slides: contentSlides, notes };
+}
+// ── Curriculum Context Builder ────────────────────────────────────────────────
+/**
+ * Build the formatted curriculum context string that gets passed to
+ * tutorApi.buildContext() as the second argument.
+ *
+ * @param {Object} opts
+ * @param {string|null} opts.currentContent - Lecture ref, e.g. "479-lecture-3"
+ * @param {string[]} [opts.courseIds] - Course IDs to include (derived from currentContent if omitted)
+ * @returns {string|null}
+ */
+export function buildCurriculumContext(opts = {}) {
+  if (!isConfigured()) return null;
+  const { currentContent = null, courseIds: explicitCourseIds } = opts;
+  // Determine course IDs to include
+  let courseIds = explicitCourseIds;
+  if (!courseIds && currentContent) {
+    const parsed = parseLectureRef(currentContent);
+    if (parsed) courseIds = [parsed.courseId];
+  }
+  if (!courseIds || courseIds.length === 0) {
+    console.warn('[contentResolver] No course hint provided (missing current_content or course_ids on scenario) — skipping curriculum context');
+    return null;
+  }
+  const parts = [];
+  for (const courseId of courseIds) {
+    const meta = loadCourseMeta(courseId);
+    if (!meta) continue;
+    // Course overview
+    parts.push(`## Course: EPOL ${courseId} - ${meta.title || courseId}`);
+    if (meta.instructor) parts.push(`Instructor: ${meta.instructor}${meta.semester ? ` | Semester: ${meta.semester}` : ''}`);
+    if (meta.description) parts.push(`Description: ${meta.description.trim()}`);
+    if (meta.objectives?.length) {
+      parts.push('Objectives:');
+      for (const obj of meta.objectives) {
+        parts.push(`- ${obj}`);
+      }
+    }
+    // Lecture listing
+    const lectures = listCourseLectures(courseId);
+    if (lectures.length > 0) {
+      parts.push('');
+      parts.push('### Lecture Overview');
+      for (let i = 0; i < lectures.length; i++) {
+        const ref = `${courseId}-lecture-${i + 1}`;
+        const title = getLectureTitle(courseId, i + 1) || `Lecture ${i + 1}`;
+        const marker = ref === currentContent ? ' **[CURRENT]**' : '';
+        parts.push(`${i + 1}. ${title} (${ref})${marker}`);
+      }
+    }
+  }
+  // Current lecture full content
+  if (currentContent) {
+    const raw = loadLecture(currentContent);
+    if (raw) {
+      parts.push('');
+      parts.push('---');
+      parts.push('');
+      parts.push(`## Current Lecture Content: ${currentContent}`);
+      parts.push('');
+      let lectureText = raw;
+      // Optionally strip speaker notes
+      if (!includeSpeakerNotes) {
+        lectureText = lectureText.replace(/```notes\s*\n[\s\S]*?```/g, '');
+      }
+      // Apply character limit
+      if (lectureText.length > maxLectureChars) {
+        lectureText = lectureText.slice(0, maxLectureChars) + '\n\n[... truncated for token budget ...]';
+      }
+      parts.push(lectureText);
+    }
+  }
+  const result = parts.join('\n');
+  return result || null;
+}
+// ── Scenario Content Resolution ───────────────────────────────────────────────
+/**
+ * Extract the content reference for a scenario.
+ *
+ * Looks for:
+ *   1. `scenario.current_content` (explicit field)
+ *   2. Regex match "Currently viewing: XXX-lecture-N" in learner_context
+ *
+ * @param {Object} scenario
+ * @returns {{ currentContent: string|null, courseIds: string[] }}
+ */
+export function resolveScenarioContent(scenario) {
+  let currentContent = scenario?.current_content || null;
+  // Fallback: extract from learner_context text
+  if (!currentContent && scenario?.learner_context) {
+    const match = scenario.learner_context.match(/Currently viewing[:\s]*(\d+-lecture-\d+)/i);
+    if (match) {
+      currentContent = match[1];
+    }
+  }
+  // Derive courseIds: explicit scenario field takes priority, then derive from currentContent
+  const courseIds = scenario?.course_ids ? [...scenario.course_ids] : [];
+  if (currentContent) {
+    const parsed = parseLectureRef(currentContent);
+    if (parsed && !courseIds.includes(parsed.courseId)) {
+      courseIds.push(parsed.courseId);
+    }
+  }
+  return { currentContent, courseIds };
+}
+// ── Discovery ─────────────────────────────────────────────────────────────────
+/**
+ * List all available course IDs by scanning the courses/ directory.
+ *
+ * @returns {string[]}
+ */
+export function listAvailableCourses() {
+  if (!contentPackagePath) return [];
+  const coursesDir = path.join(contentPackagePath, 'courses');
+  try {
+    return fs.readdirSync(coursesDir).filter(name => {
+      const courseDir = path.join(coursesDir, name);
+      return fs.statSync(courseDir).isDirectory() &&
+        fs.existsSync(path.join(courseDir, 'course.md'));
+    });
+  } catch {
+    return [];
+  }
+}
+/**
+ * Validate all content can load. Returns errors (empty array = OK).
+ *
+ * @returns {string[]} Array of error messages
+ */
+export function validateContent() {
+  const errors = [];
+  if (!isConfigured()) {
+    errors.push(`Content package not configured or not found at: ${contentPackagePath || '(not set)'}`);
+    return errors;
+  }
+  const courses = listAvailableCourses();
+  if (courses.length === 0) {
+    errors.push('No courses found in content package');
+    return errors;
+  }
+  for (const courseId of courses) {
+    const meta = loadCourseMeta(courseId);
+    if (!meta) {
+      errors.push(`Course ${courseId}: failed to load course.md frontmatter`);
+      continue;
+    }
+    if (!meta.title) {
+      errors.push(`Course ${courseId}: missing title in frontmatter`);
+    }
+    const lectures = listCourseLectures(courseId);
+    for (let i = 0; i < lectures.length; i++) {
+      const ref = `${courseId}-lecture-${i + 1}`;
+      const raw = loadLecture(ref);
+      if (!raw) {
+        errors.push(`Lecture ${ref}: failed to load`);
+      } else if (raw.length < 50) {
+        errors.push(`Lecture ${ref}: suspiciously short (${raw.length} chars)`);
+      }
+    }
+  }
+  return errors;
+}
+// ── Internal Helpers ──────────────────────────────────────────────────────────
+import yaml from 'yaml';
+/**
+ * Parse YAML frontmatter delimited by --- from a markdown file.
+ */
+function parseFrontmatter(content) {
+  const match = content.match(/^---\n([\s\S]*?)\n---/);
+  if (!match) return null;
+  try {
+    return yaml.parse(match[1]);
+  } catch {
+    return null;
+  }
+}
+/**
+ * Parse a lecture reference like "479-lecture-3" into components.
+ */
+function parseLectureRef(ref) {
+  const match = ref.match(/^(\d+)-lecture-(\d+)$/);
+  if (!match) return null;
+  return { courseId: match[1], lectureNum: match[2] };
+}
+/**
+ * List lecture files for a course (sorted numerically).
+ */
+function listCourseLectures(courseId) {
+  if (!contentPackagePath) return [];
+  const courseDir = path.join(contentPackagePath, 'courses', courseId);
+  try {
+    return fs.readdirSync(courseDir)
+      .filter(f => /^lecture-\d+\.md$/.test(f))
+      .sort((a, b) => {
+        const na = parseInt(a.match(/\d+/)[0], 10);
+        const nb = parseInt(b.match(/\d+/)[0], 10);
+        return na - nb;
+      });
+  } catch {
+    return [];
+  }
+}
+/**
+ * Get the title of a lecture by reading its first heading.
+ */
+function getLectureTitle(courseId, lectureNum) {
+  const ref = `${courseId}-lecture-${lectureNum}`;
+  const raw = loadLecture(ref);
+  if (!raw) return null;
+  // Look for first ## or # heading
+  const match = raw.match(/^#{1,2}\s+(.+)$/m);
+  return match ? match[1].trim() : null;
+}
+export default { // default export bundles the same functions that are also exported by name
+  configure,
+  isConfigured,
+  loadCourseMeta,
+  loadLecture,
+  parseLectureMarkdown,
+  buildCurriculumContext,
+  resolveScenarioContent,
+  listAvailableCourses,
+  validateContent,
+};