npm - @machinespirits/eval - Versions diffs - 0.1.2 → 0.2.1 - Mend

@machinespirits/eval 0.1.2 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (102)

package/LICENSE +21 -0
package/README.md +161 -0
package/config/eval-settings.yaml +18 -0
package/config/evaluation-rubric-learner.yaml +277 -0
package/config/evaluation-rubric.yaml +613 -0
package/config/interaction-eval-scenarios.yaml +93 -50
package/config/learner-agents.yaml +124 -193
package/config/machinespirits-eval.code-workspace +11 -0
package/config/providers.yaml +60 -0
package/config/suggestion-scenarios.yaml +1399 -0
package/config/tutor-agents.yaml +716 -0
package/docs/EVALUATION-VARIABLES.md +589 -0
package/docs/REPLICATION-PLAN.md +577 -0
package/index.js +15 -6
package/package.json +16 -22
package/routes/evalRoutes.js +88 -36
package/scripts/analyze-judge-reliability.js +401 -0
package/scripts/analyze-run.js +97 -0
package/scripts/analyze-run.mjs +282 -0
package/scripts/analyze-validation-failures.js +141 -0
package/scripts/check-run.mjs +17 -0
package/scripts/code-impasse-strategies.js +1132 -0
package/scripts/compare-runs.js +44 -0
package/scripts/compare-suggestions.js +80 -0
package/scripts/compare-transformation.js +116 -0
package/scripts/dig-into-run.js +158 -0
package/scripts/eval-cli.js +2626 -0
package/scripts/generate-paper-figures.py +452 -0
package/scripts/qualitative-analysis-ai.js +1313 -0
package/scripts/qualitative-analysis.js +688 -0
package/scripts/seed-db.js +87 -0
package/scripts/show-failed-suggestions.js +64 -0
package/scripts/validate-content.js +192 -0
package/server.js +3 -2
package/services/__tests__/evalConfigLoader.test.js +338 -0
package/services/anovaStats.js +499 -0
package/services/contentResolver.js +407 -0
package/services/dialogueTraceAnalyzer.js +454 -0
package/services/evalConfigLoader.js +625 -0
package/services/evaluationRunner.js +2171 -270
package/services/evaluationStore.js +564 -29
package/services/learnerConfigLoader.js +75 -5
package/services/learnerRubricEvaluator.js +284 -0
package/services/learnerTutorInteractionEngine.js +375 -0
package/services/processUtils.js +18 -0
package/services/progressLogger.js +98 -0
package/services/promptRecommendationService.js +31 -26
package/services/promptRewriter.js +427 -0
package/services/rubricEvaluator.js +543 -70
package/services/streamingReporter.js +104 -0
package/services/turnComparisonAnalyzer.js +494 -0
package/components/MobileEvalDashboard.tsx +0 -267
package/components/comparison/DeltaAnalysisTable.tsx +0 -137
package/components/comparison/ProfileComparisonCard.tsx +0 -176
package/components/comparison/RecognitionABMode.tsx +0 -385
package/components/comparison/RecognitionMetricsPanel.tsx +0 -135
package/components/comparison/WinnerIndicator.tsx +0 -64
package/components/comparison/index.ts +0 -5
package/components/mobile/BottomSheet.tsx +0 -233
package/components/mobile/DimensionBreakdown.tsx +0 -210
package/components/mobile/DocsView.tsx +0 -363
package/components/mobile/LogsView.tsx +0 -481
package/components/mobile/PsychodynamicQuadrant.tsx +0 -261
package/components/mobile/QuickTestView.tsx +0 -1098
package/components/mobile/RecognitionTypeChart.tsx +0 -124
package/components/mobile/RecognitionView.tsx +0 -809
package/components/mobile/RunDetailView.tsx +0 -261
package/components/mobile/RunHistoryView.tsx +0 -367
package/components/mobile/ScoreRadial.tsx +0 -211
package/components/mobile/StreamingLogPanel.tsx +0 -230
package/components/mobile/SynthesisStrategyChart.tsx +0 -140
package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +0 -52
package/docs/research/ABLATION-MODEL-SELECTION.md +0 -53
package/docs/research/ADVANCED-EVAL-ANALYSIS.md +0 -60
package/docs/research/ANOVA-RESULTS-2026-01-14.md +0 -257
package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +0 -586
package/docs/research/COST-ANALYSIS.md +0 -56
package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +0 -340
package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +0 -291
package/docs/research/EVAL-SYSTEM-ANALYSIS.md +0 -306
package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +0 -301
package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +0 -1988
package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +0 -282
package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +0 -147
package/docs/research/PAPER-EXTENSION-DYADIC.md +0 -204
package/docs/research/PAPER-UNIFIED.md +0 -659
package/docs/research/PAPER-UNIFIED.pdf +0 -0
package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +0 -356
package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +0 -419
package/docs/research/apa.csl +0 -2133
package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +0 -1637
package/docs/research/archive/paper-multiagent-tutor.tex +0 -978
package/docs/research/paper-draft/full-paper.md +0 -136
package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
package/docs/research/paper-draft/references.bib +0 -515
package/docs/research/transcript-baseline.md +0 -139
package/docs/research/transcript-recognition-multiagent.md +0 -187
package/hooks/useEvalData.ts +0 -625
package/server-init.js +0 -45
package/services/benchmarkService.js +0 -1892
package/types.ts +0 -165
package/utils/haptics.ts +0 -45

package/services/learnerConfigLoader.js CHANGED

@@ -7,10 +7,19 @@
  * Uses shared configLoaderBase.js for common loading patterns.
  */
+import fs from 'fs';
+import path from 'path';
+import { fileURLToPath } from 'url';
+import yaml from 'yaml';
 import { configLoaderBase, modelResolver } from '@machinespirits/tutor-core';
 const { loadProviders, createConfigLoader, createPromptLoader } = configLoaderBase;
 const { createBoundResolver } = modelResolver;
+// Local eval-repo config directory (for learner-agents.yaml override); resolved from this module's own location, not the process working directory
+const __filename_local = fileURLToPath(import.meta.url); // absolute filesystem path of this module
+const __dirname_local = path.dirname(__filename_local); // directory that contains this module
+const LOCAL_CONFIG_DIR = path.join(path.resolve(__dirname_local, '..'), 'config'); // "config" directory one level above this module's directory
 // ============================================================================
 // Default Configurations
 // ============================================================================
@@ -95,6 +104,8 @@ function getDefaultPrompt(filename) {
   const defaults = {
     'unified': `You are simulating a learner's internal experience. Respond authentically to the tutor's message, showing genuine reactions including confusion, insight, frustration, or understanding.`,
+    'ego': `You represent the EGO dimension of the learner. Draft an authentic learner response based on the conversation so far — express what the learner would naturally say, including confusion, partial understanding, questions, and emotional reactions.`,
+    'superego': `You represent the SUPEREGO dimension of the learner. Critique the ego's draft response: Is it realistic for this learner's level? Does it engage meaningfully with the tutor's message? Should the learner push back, ask for clarification, or show more/less understanding?`,
     'desire': `You represent the DESIRE dimension of a learner. Express immediate wants, frustrations, and emotional reactions.`,
     'intellect': `You represent the INTELLECT dimension of a learner. Process information rationally, identify what makes sense and what doesn't.`,
     'aspiration': `You represent the ASPIRATION dimension of a learner. Express goals, standards, and desire for mastery.`,
@@ -114,12 +125,55 @@ function getDefaultPrompt(filename) {
 // Create Base Loaders
 // ============================================================================
-const configLoader = createConfigLoader('learner-agents.yaml', getDefaultConfig);
+// Load from eval repo's local config/ directory first, fall back to tutor-core's createConfigLoader
+let localConfigCache = null;
+let localConfigMtime = null;
+function loadLocalConfig(forceReload = false) {
+  const localPath = path.join(LOCAL_CONFIG_DIR, 'learner-agents.yaml');
+  try {
+    const stats = fs.statSync(localPath);
+    if (!forceReload && localConfigCache && localConfigMtime === stats.mtimeMs) {
+      return localConfigCache;
+    }
+    const content = fs.readFileSync(localPath, 'utf-8');
+    localConfigCache = yaml.parse(content);
+    localConfigMtime = stats.mtimeMs;
+    // Merge shared providers (providers.yaml)
+    const sharedProviders = loadProviders(forceReload);
+    if (sharedProviders) {
+      localConfigCache.providers = { ...localConfigCache.providers, ...sharedProviders };
+    }
+    return localConfigCache;
+  } catch {
+    // Fall through to tutor-core's loader / defaults
+    return null;
+  }
+}
+const coreConfigLoader = createConfigLoader('learner-agents.yaml', getDefaultConfig);
 const promptLoader = createPromptLoader(getDefaultPrompt);
-// Re-export loadConfig and getProviderConfig from the base loader
-export const loadConfig = configLoader.loadConfig;
-export const getProviderConfig = configLoader.getProviderConfig;
+// loadConfig: prefer local eval-repo config, fall back to tutor-core / defaults
+export function loadConfig(forceReload = false) {
+  return loadLocalConfig(forceReload) || coreConfigLoader.loadConfig(forceReload);
+}
+// getProviderConfig needs to use the locally-loaded config's providers
+export function getProviderConfig(providerName) {
+  const config = loadConfig();
+  const provider = config.providers?.[providerName];
+  if (!provider) {
+    // Fall back to tutor-core's resolver
+    return coreConfigLoader.getProviderConfig(providerName);
+  }
+  const apiKey = provider.api_key_env ? (process.env[provider.api_key_env] || '') : '';
+  const isLocal = providerName === 'local';
+  const isConfigured = isLocal ? Boolean(provider.base_url) : Boolean(apiKey);
+  return { ...provider, apiKey, isConfigured };
+}
 // Re-export loadProviders from base
 export { loadProviders };
@@ -157,7 +211,7 @@ export function getActiveProfile(profileName = null) {
 /**
  * Get architecture configuration
- * @param {string} architectureName - Architecture name (unified, psychodynamic, dialectical, cognitive)
+ * @param {string} architectureName - Architecture name (unified, ego_superego)
  * @returns {Object} Architecture configuration with agents
  */
 export function getArchitecture(architectureName) {
@@ -364,6 +418,21 @@ export function getEvaluationConfig() {
  */
 export const resolveModel = createBoundResolver(getProviderConfig);
+/**
+ * Get YAML-level model overrides from learner-agents.yaml.
+ * These are lower priority than CLI flags.
+ *
+ * @returns {Object} { modelOverride, egoModelOverride, superegoModelOverride } (null if not set)
+ */
+export function getLearnerModelOverrides() {
+  const config = loadConfig();
+  return {
+    modelOverride: config?.model_override || null,
+    egoModelOverride: config?.ego_model_override || null,
+    superegoModelOverride: config?.superego_model_override || null,
+  };
+}
 export default {
   loadConfig,
   loadProviders,
@@ -382,4 +451,5 @@ export default {
   listArchitectures,
   getLoggingConfig,
   getEvaluationConfig,
+  getLearnerModelOverrides,
 };

package/services/learnerRubricEvaluator.js ADDED

@@ -0,0 +1,284 @@
+/**
+ * Learner Rubric Evaluator Service
+ *
+ * Builds evaluation prompts for scoring learner turns in multi-turn dialogues
+ * using the learner-side rubric (config/evaluation-rubric-learner.yaml).
+ *
+ * Key design decisions:
+ * - Truncates transcript at the learner's turn to prevent retrospective bias
+ * - Includes internal deliberation traces for multi-agent learners
+ * - Omits deliberation_depth dimension for single-agent (unified) learners
+ */
+import fs from 'fs';
+import path from 'path';
+import { fileURLToPath } from 'url';
+import yaml from 'yaml';
+const __dirname = path.dirname(fileURLToPath(import.meta.url)); // directory that contains this module
+const EVAL_CONFIG_DIR = path.resolve(__dirname, '..', 'config'); // "config" directory one level above this module's directory
+const PROMPTS_DIR = path.resolve(__dirname, '..', 'prompts'); // NOTE(review): not referenced by the code visible in this diff — confirm it is still needed
+let rubricCache = null; // parsed rubric most recently stored by loadLearnerRubric
+let rubricMtime = null; // mtimeMs recorded by loadLearnerRubric; compared against the file's current mtime on later calls
+/**
+ * Load the learner rubric YAML with mtime-based caching.
+ */
+export function loadLearnerRubric({ forceReload } = {}) {
+  const rubricPath = path.join(EVAL_CONFIG_DIR, 'evaluation-rubric-learner.yaml');
+  try {
+    const stats = fs.statSync(rubricPath);
+    if (!forceReload && rubricCache && rubricMtime === stats.mtimeMs) {
+      return rubricCache;
+    }
+    rubricMtime = stats.mtimeMs;
+  } catch (err) {
+    console.warn('[learnerRubricEvaluator] Learner rubric file not found:', err.message);
+    return null;
+  }
+  const raw = fs.readFileSync(rubricPath, 'utf-8');
+  rubricCache = yaml.parse(raw);
+  return rubricCache;
+}
+/**
+ * Get learner rubric dimensions, optionally excluding deliberation_depth
+ * for single-agent learners.
+ *
+ * @param {Object} options
+ * @param {boolean} options.isMultiAgent - Whether the learner uses ego/superego architecture
+ * @returns {Object} Map of dimension key → dimension config
+ */
+export function getLearnerDimensions({ isMultiAgent = false } = {}) {
+  const rubric = loadLearnerRubric();
+  if (!rubric?.dimensions) return {};
+  const dims = { ...rubric.dimensions };
+  if (!isMultiAgent) {
+    delete dims.deliberation_depth;
+  }
+  return dims;
+}
+/**
+ * Calculate the overall learner score from per-dimension scores.
+ *
+ * @param {Object} scores - Map of dimension → { score, reasoning }
+ * @param {boolean} isMultiAgent - Whether deliberation_depth is included
+ * @returns {number} Overall score on 0-100 scale
+ */
+export function calculateLearnerOverallScore(scores, isMultiAgent = false) {
+  const dims = getLearnerDimensions({ isMultiAgent });
+  let weightedSum = 0;
+  let totalWeight = 0;
+  for (const [key, dim] of Object.entries(dims)) {
+    const scoreEntry = scores[key];
+    if (!scoreEntry) continue;
+    const score = typeof scoreEntry === 'object' ? scoreEntry.score : scoreEntry;
+    if (typeof score !== 'number' || score < 1 || score > 5) continue;
+    weightedSum += score * dim.weight;
+    totalWeight += dim.weight;
+  }
+  if (totalWeight === 0) return 0;
+  const weightedAvg = weightedSum / totalWeight;
+  return ((weightedAvg - 1) / 4) * 100;
+}
+/**
+ * Build the dimension criteria section for the judge prompt.
+ *
+ * @param {Object} dimensions - Rubric dimensions to include
+ * @returns {string} Formatted criteria text
+ */
+function buildDimensionCriteria(dimensions) {
+  return Object.entries(dimensions).map(([key, dim]) => {
+    const criteriaText = Object.entries(dim.criteria || {})
+      .map(([score, desc]) => `  ${score}: ${desc}`)
+      .join('\n');
+    return `**${dim.name}** (weight: ${(dim.weight * 100).toFixed(0)}%, key: ${key})
+${dim.description}
+Criteria:
+${criteriaText}`;
+  }).join('\n\n');
+}
+/**
+ * Build a truncated transcript up to and including the learner turn being evaluated.
+ * Does NOT include subsequent tutor responses to prevent retrospective bias.
+ *
+ * @param {Array} turns - All turns from the interaction
+ * @param {number} targetTurnIndex - Index (in the turns array) of the learner turn to evaluate
+ * @returns {string} Formatted transcript
+ */
+function buildTruncatedTranscript(turns, targetTurnIndex) {
+  const lines = [];
+  for (let i = 0; i <= targetTurnIndex; i++) {
+    const turn = turns[i];
+    const role = turn.phase === 'learner' ? 'LEARNER' : 'TUTOR';
+    const turnLabel = `[Turn ${turn.turnNumber}, ${role}]`;
+    lines.push(`${turnLabel}`);
+    lines.push(turn.externalMessage || '(no message)');
+    lines.push('');
+  }
+  return lines.join('\n');
+}
+/**
+ * Format internal deliberation trace for display in the judge prompt.
+ *
+ * @param {Array} deliberation - Array of { role, content } objects
+ * @returns {string} Formatted deliberation trace
+ */
+function formatDeliberation(deliberation) {
+  if (!deliberation || deliberation.length === 0) return '';
+  return deliberation.map(step => {
+    const roleLabel = {
+      'ego_initial': 'Ego (initial reaction)',
+      'superego': 'Superego (critique)',
+      'ego_revision': 'Ego (revision — final authority)',
+      'synthesis': 'Synthesis (unified process)',
+      'ego': 'Ego',
+    }[step.role] || step.role;
+    return `**${roleLabel}**:\n${step.content}`;
+  }).join('\n\n');
+}
+/**
+ * Build a complete learner evaluation prompt for a single learner turn.
+ *
+ * The prompt is assembled from: the rubric criteria for the applicable
+ * dimensions, the learner context fields, the transcript truncated at the
+ * target turn, the target turn's external message, and (multi-agent only, when
+ * the turn carries one) its internal deliberation trace. It ends with JSON
+ * output instructions whose example lists one entry per applicable dimension
+ * key; the `overall_score` value in that example is a fixed placeholder.
+ *
+ * @param {Object} params
+ * @param {Array} params.turns - All turns from the interaction
+ * @param {number} params.targetTurnIndex - Index of the learner turn to evaluate
+ * @param {string} params.personaId - Learner persona ID (default 'unknown')
+ * @param {string} params.personaDescription - Description of the learner persona
+ * @param {string} params.learnerArchitecture - 'multi_agent' and 'psychodynamic' are
+ *   treated as multi-agent; every other value (default 'unified') is treated as single-agent.
+ *   NOTE(review): learnerConfigLoader's getArchitecture documents the architecture names as
+ *   "unified, ego_superego"; 'ego_superego' would be handled as single-agent here — confirm
+ *   callers map it to 'multi_agent' before calling.
+ * @param {string} params.scenarioName - Name of the scenario (default 'unknown')
+ * @param {string} params.topic - Topic being discussed (default 'unknown')
+ * @returns {string} Complete judge prompt
+ * @throws {TypeError} If turns[targetTurnIndex] does not exist (its properties are read unguarded)
+ */
+export function buildLearnerEvaluationPrompt(params) {
+  const {
+    turns,
+    targetTurnIndex,
+    personaId = 'unknown',
+    personaDescription = 'No persona description available',
+    learnerArchitecture = 'unified',
+    scenarioName = 'unknown',
+    topic = 'unknown',
+  } = params;
+  const isMultiAgent = learnerArchitecture === 'multi_agent' || learnerArchitecture === 'psychodynamic'; // any other architecture string counts as single-agent
+  const dimensions = getLearnerDimensions({ isMultiAgent });
+  const dimensionCriteria = buildDimensionCriteria(dimensions);
+  const targetTurn = turns[targetTurnIndex]; // undefined when the index is out of range; the property reads below then throw
+  const truncatedTranscript = buildTruncatedTranscript(turns, targetTurnIndex);
+  // Internal deliberation section (multi-agent only)
+  let internalDeliberationSection = '';
+  if (isMultiAgent && targetTurn.internalDeliberation?.length > 0) {
+    internalDeliberationSection = `
+**Internal deliberation** (the learner's ego/superego process — not visible to the tutor):
+${formatDeliberation(targetTurn.internalDeliberation)}
+`;
+  }
+  // Note about deliberation_depth dimension
+  let deliberationDepthNote = '';
+  if (isMultiAgent) {
+    deliberationDepthNote = 'This is a multi-agent learner. Score ALL dimensions including deliberation_depth (evaluate the quality of the internal ego/superego process shown above).';
+  } else {
+    deliberationDepthNote = 'This is a single-agent (unified) learner. OMIT the deliberation_depth dimension — do not include it in your scores.';
+  }
+  // Build dimension keys for JSON example
+  const dimKeys = Object.keys(dimensions); // same dimension set the criteria section was built from
+  const exampleScores = dimKeys.map(key => {
+    return `    "${key}": {"score": 3, "reasoning": "Brief reason"}`;
+  }).join(',\n');
+  return `You are an expert evaluator of synthetic learner agents in AI tutoring dialogues. Your task is to evaluate the quality of a LEARNER's response turn — how well the learner agent engages as a student, independent of the tutor's quality.
+You are NOT evaluating the tutor. You are evaluating whether the learner agent produces responses that reflect genuine learning engagement: authentic reactions, substantive questions, conceptual thinking, and evidence of intellectual development.
+## IMPORTANT: BIAS PREVENTION
+You are shown the dialogue history UP TO AND INCLUDING the learner turn being evaluated. You do NOT see subsequent tutor responses. Evaluate the learner turn on its own merits.
+## EVALUATION RUBRIC
+Score each dimension from 1-5:
+- 1: Completely fails this criterion
+- 2: Weak, significant issues
+- 3: Adequate, meets basic expectations
+- 4: Good, exceeds expectations
+- 5: Excellent, exemplary
+${dimensionCriteria}
+## LEARNER CONTEXT
+**Assigned Persona**: ${personaId}
+**Persona Description**: ${personaDescription}
+**Learner Architecture**: ${learnerArchitecture}
+**Scenario**: ${scenarioName}
+**Topic**: ${topic}
+## DIALOGUE HISTORY (up to and including the turn being evaluated)
+${truncatedTranscript}
+## LEARNER TURN TO EVALUATE
+**External message** (what the tutor sees):
+${targetTurn.externalMessage || '(no message)'}
+${internalDeliberationSection}
+## YOUR TASK
+${deliberationDepthNote}
+Evaluate the learner's turn and provide:
+1. A score (1-5) for each applicable dimension with brief reasoning
+2. An overall score (weighted average, 0-100 scale)
+CRITICAL JSON RULES:
+- Never use unescaped double quotes inside JSON string values. Use single quotes or rephrase.
+- Keep "reasoning" values under 25 words.
+- BAD:  "reasoning": "Says \\"great point\\" which sounds scripted"
+- GOOD: "reasoning": "Says 'great point' which sounds scripted"
+Respond with ONLY a JSON object in this exact format (no other text before or after):
+\`\`\`json
+{
+  "scores": {
+${exampleScores}
+  },
+  "overall_score": 55,
+  "summary": "Brief overall assessment of learner turn quality"
+}
+\`\`\``;
+}
+export default {
+  loadLearnerRubric,
+  getLearnerDimensions,
+  calculateLearnerOverallScore,
+  buildLearnerEvaluationPrompt,
+};