npm - @machinespirits/eval - Versions diffs - 0.2.0 → 0.3.0 - Mend

@machinespirits/eval 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (74)

package/README.md +91 -9
package/config/eval-settings.yaml +3 -3
package/config/paper-manifest.json +486 -0
package/config/providers.yaml +9 -6
package/config/tutor-agents.yaml +2261 -0
package/content/README.md +23 -0
package/content/courses/479/course.md +53 -0
package/content/courses/479/lecture-1.md +361 -0
package/content/courses/479/lecture-2.md +360 -0
package/content/courses/479/lecture-3.md +655 -0
package/content/courses/479/lecture-4.md +530 -0
package/content/courses/479/lecture-5.md +326 -0
package/content/courses/479/lecture-6.md +346 -0
package/content/courses/479/lecture-7.md +326 -0
package/content/courses/479/lecture-8.md +273 -0
package/content/courses/479/roadmap-slides.md +656 -0
package/content/manifest.yaml +8 -0
package/docs/research/build.sh +44 -20
package/docs/research/figures/figure10.png +0 -0
package/docs/research/figures/figure11.png +0 -0
package/docs/research/figures/figure3.png +0 -0
package/docs/research/figures/figure4.png +0 -0
package/docs/research/figures/figure5.png +0 -0
package/docs/research/figures/figure6.png +0 -0
package/docs/research/figures/figure7.png +0 -0
package/docs/research/figures/figure8.png +0 -0
package/docs/research/figures/figure9.png +0 -0
package/docs/research/header.tex +23 -2
package/docs/research/paper-full.md +941 -285
package/docs/research/paper-short.md +216 -585
package/docs/research/references.bib +132 -0
package/docs/research/slides-header.tex +188 -0
package/docs/research/slides-pptx.md +363 -0
package/docs/research/slides.md +531 -0
package/docs/research/style-reference-pptx.py +199 -0
package/package.json +6 -5
package/scripts/analyze-eval-results.js +69 -17
package/scripts/analyze-mechanism-traces.js +763 -0
package/scripts/analyze-modulation-learning.js +498 -0
package/scripts/analyze-prosthesis.js +144 -0
package/scripts/analyze-run.js +264 -79
package/scripts/assess-transcripts.js +853 -0
package/scripts/browse-transcripts.js +854 -0
package/scripts/check-parse-failures.js +73 -0
package/scripts/code-dialectical-modulation.js +1320 -0
package/scripts/download-data.sh +55 -0
package/scripts/eval-cli.js +106 -18
package/scripts/generate-paper-figures.js +663 -0
package/scripts/generate-paper-figures.py +577 -76
package/scripts/generate-paper-tables.js +299 -0
package/scripts/qualitative-analysis-ai.js +3 -3
package/scripts/render-sequence-diagram.js +694 -0
package/scripts/test-latency.js +210 -0
package/scripts/test-rate-limit.js +95 -0
package/scripts/test-token-budget.js +332 -0
package/scripts/validate-paper-manifest.js +670 -0
package/services/__tests__/evalConfigLoader.test.js +2 -2
package/services/__tests__/learnerRubricEvaluator.test.js +361 -0
package/services/__tests__/learnerTutorInteractionEngine.test.js +326 -0
package/services/evaluationRunner.js +975 -98
package/services/evaluationStore.js +12 -4
package/services/learnerTutorInteractionEngine.js +27 -2
package/services/mockProvider.js +133 -0
package/services/promptRewriter.js +1471 -5
package/services/rubricEvaluator.js +55 -2
package/services/transcriptFormatter.js +675 -0
package/docs/EVALUATION-VARIABLES.md +0 -589
package/docs/REPLICATION-PLAN.md +0 -577
package/scripts/analyze-run.mjs +0 -282
package/scripts/compare-runs.js +0 -44
package/scripts/compare-suggestions.js +0 -80
package/scripts/dig-into-run.js +0 -158
package/scripts/show-failed-suggestions.js +0 -64
/package/scripts/{check-run.mjs → check-run.js} +0 -0

package/services/evaluationStore.js CHANGED Viewed

@@ -490,12 +490,19 @@ export function listRuns(options = {}) {
     ORDER BY scenario_name
   `);
-  // Count completed results per run
+  // Count completed results per run (primary judge only to avoid inflated counts from rejudging)
   const resultCountStmt = db.prepare(`
     SELECT COUNT(*) as completed,
            SUM(CASE WHEN success = 1 THEN 1 ELSE 0 END) as successful,
-           AVG(overall_score) as avg_score
-    FROM evaluation_results WHERE run_id = ?
+           AVG(overall_score) as avg_score,
+           COUNT(DISTINCT judge_model) as judge_count
+    FROM evaluation_results
+    WHERE run_id = ?
+      AND (judge_model IS NULL OR judge_model = (
+        SELECT judge_model FROM evaluation_results
+        WHERE run_id = ? AND judge_model IS NOT NULL
+        ORDER BY created_at ASC LIMIT 1
+      ))
   `);
   // Get distinct ego + superego models for each run
@@ -513,7 +520,7 @@ export function listRuns(options = {}) {
   return rows.map(row => {
     const scenarioRows = scenarioStmt.all(row.id);
     const scenarioNames = scenarioRows.map(s => s.scenario_name).filter(Boolean);
-    const counts = resultCountStmt.get(row.id);
+    const counts = resultCountStmt.get(row.id, row.id);
     const extractAlias = (raw) => {
       if (!raw) return null;
@@ -554,6 +561,7 @@ export function listRuns(options = {}) {
       completedResults,
       successfulResults: counts?.successful || 0,
       avgScore: counts?.avg_score || null,
+      judgeCount: counts?.judge_count || 1,
       progressPct,
       durationMs,
       status: row.status,

package/services/learnerTutorInteractionEngine.js CHANGED Viewed

@@ -538,6 +538,9 @@ ${tutorMemory || 'New learner - no prior history.'}
 Topic: ${topic}
+Recent conversation:
+${conversationContext}
 The learner said:
 "${learnerMessage}"
@@ -1108,6 +1111,7 @@ export async function generateLearnerResponse(options) {
     learnerProfile = 'unified',
     personaId = 'eager_novice',
     modelOverride,
+    profileContext,
   } = options;
   // Resolve model override once (if provided) so all learner agents use the same model
@@ -1145,7 +1149,11 @@ export async function generateLearnerResponse(options) {
   if (hasMultiAgent) {
     // === STEP 1: Ego initial reaction ===
     const egoConfig = applyOverride(learnerConfig.getAgentConfig('ego', profile.name));
-    const egoContext = `Topic: ${topic}\n\nRecent conversation:\n${conversationContext}\n\nThe tutor just said:\n"${tutorMessage}"\n\nGenerate your initial internal reaction as the learner's ego.`;
+    let egoContext = `Topic: ${topic}\n\nRecent conversation:\n${conversationContext}\n\nThe tutor just said:\n"${tutorMessage}"`;
+    if (profileContext) {
+      egoContext += `\n\n${profileContext}`;
+    }
+    egoContext += `\n\nGenerate your initial internal reaction as the learner's ego.`;
     const egoSystemPrompt = buildLearnerPrompt(egoConfig, persona, egoContext);
     const egoInitialResponse = await callLearnerAI(egoConfig, egoSystemPrompt, "React to the tutor's message.", 'learner_ego_initial');
@@ -1156,7 +1164,11 @@ export async function generateLearnerResponse(options) {
     // === STEP 2: Superego critique ===
     const superegoConfig = applyOverride(learnerConfig.getAgentConfig('superego', profile.name));
-    const superegoContext = `Topic: ${topic}\n\nRecent conversation:\n${conversationContext}\n\nThe tutor just said:\n"${tutorMessage}"\n\nThe EGO's initial reaction was:\n"${egoInitialResponse.content}"\n\nReview the EGO's response. Is it accurate? What's being missed? What should be reconsidered?`;
+    let superegoContext = `Topic: ${topic}\n\nRecent conversation:\n${conversationContext}\n\nThe tutor just said:\n"${tutorMessage}"\n\nThe EGO's initial reaction was:\n"${egoInitialResponse.content}"`;
+    if (profileContext) {
+      superegoContext += `\n\n${profileContext}`;
+    }
+    superegoContext += `\n\nReview the EGO's response. Is it accurate? What's being missed? What should be reconsidered?`;
     const superegoSystemPrompt = buildLearnerPrompt(superegoConfig, persona, superegoContext);
     const superegoResponse = await callLearnerAI(superegoConfig, superegoSystemPrompt, "Critique the EGO's reaction.", 'learner_superego');
@@ -1196,6 +1208,9 @@ export async function generateLearnerResponse(options) {
       if (!agentConfig) continue;
       let roleContext = `Topic: ${topic}\n\nRecent conversation:\n${conversationContext}\n\nThe tutor just said:\n"${tutorMessage}"`;
+      if (profileContext) {
+        roleContext += `\n\n${profileContext}`;
+      }
       roleContext += `\n\nGenerate your internal reaction as this dimension of the learner's experience.`;
       const systemPrompt = buildLearnerPrompt(agentConfig, persona, roleContext);
@@ -1225,6 +1240,16 @@ export async function generateLearnerResponse(options) {
 // Exports
 // ============================================================================
+// Named exports for pure helper functions (used in unit tests)
+export {
+  detectEmotionalState,
+  detectUnderstandingLevel,
+  detectTutorStrategy,
+  extractTutorMessage,
+  calculateMemoryDelta,
+  INTERACTION_OUTCOMES,
+};
 export default {
   runInteraction,
   generateLearnerResponse,

package/services/mockProvider.js ADDED Viewed

@@ -0,0 +1,133 @@
+/**
+ * Mock Provider for Dry-Run Mode
+ *
+ * Provides canned generation and judge results that bypass all LLM API calls.
+ * Recognition-enabled cells produce higher scores to mimic the ~10-point
+ * recognition effect observed in the paper's factorial results.
+ */
+/**
+ * Simple deterministic pseudo-random from a seed string.
+ *
+ * Folds the UTF-16 code units of the seed into a 32-bit integer hash
+ * (h = h * 31 + code unit, wrapped to int32 on every step) and rescales
+ * the hash to a small number. The same seed always gives the same value;
+ * an empty seed leaves h at 0 and therefore returns -1.
+ *
+ * @param {string} seed - Text to hash (read through length and charCodeAt)
+ * @returns {number} A value in [-1, 0.998], in steps of 1/500
+ */
+function seededRandom(seed) {
+  let h = 0;
+  for (let i = 0; i < seed.length; i++) {
+    h = ((h << 5) - h + seed.charCodeAt(i)) | 0; // (h << 5) - h equals h * 31; the OR with 0 wraps to int32
+  }
+  // Drop the sign bit, keep 0..999, then rescale: the result lies in [-1, 0.998], so +1 is never reached
+  return ((h & 0x7fffffff) % 1000) / 500 - 1;
+}
+/**
+ * Generate a canned tutor suggestion result (replaces tutorApi.generateSuggestions).
+ *
+ * No randomness and no I/O: the output depends only on three inputs.
+ *   - resolvedConfig.profileName (falls back to budget when falsy): the cell is
+ *     treated as a recognition cell when the name contains the substring recog
+ *     (the separate check for recognition is subsumed by that one). This selects
+ *     which of the two fixed title / message / reasoning texts is used.
+ *   - turnMeta.scenarioName: interpolated into the message, falling back to the
+ *     phrase this concept when falsy. No other turnMeta field is read here.
+ *     turnMeta itself must be an object; passing undefined or null throws on
+ *     the property access.
+ *   - resolvedConfig.superegoModel: when truthy, dialogueRounds is 2 and
+ *     dialogueTrace holds one ego entry and one superego entry; otherwise
+ *     dialogueRounds is 0 and dialogueTrace is an empty array.
+ *
+ * All other metadata values (latency, token counts, cost, provider, model,
+ * converged) are fixed constants.
+ *
+ * @param {Object} resolvedConfig - The resolved config with profileName, provider, model, etc.
+ *   Only profileName and superegoModel are read by this function.
+ * @param {Object} turnMeta - Turn metadata (scenarioName, description, etc.)
+ *   Only scenarioName is read by this function.
+ * @returns {Object} A genResult matching the shape from real tutor-core runs:
+ *   { success, suggestions (exactly one entry), metadata, dialogueTrace }
+ */
+export function mockGenerateResult(resolvedConfig, turnMeta) {
+  const profileName = resolvedConfig.profileName || 'budget';
+  const isRecognition = profileName.includes('recognition') || profileName.includes('recog');
+  const title = isRecognition
+    ? 'Recognizing Your Learning Journey'
+    : 'Getting Started with the Material';
+  const message = isRecognition
+    ? `I notice you're approaching this topic with genuine curiosity, and I want to honor that. Let's explore ${turnMeta.scenarioName || 'this concept'} together by first acknowledging what you already understand. Your perspective matters here — when we recognize each other as autonomous thinkers, we create space for deeper understanding. What aspects of this topic feel most alive to you right now?`
+    : `Here's an overview of ${turnMeta.scenarioName || 'this concept'}. Let me break it down into manageable steps. First, let's cover the key definitions. Then we'll work through some examples to build your understanding. Feel free to ask questions as we go along.`;
+  const reasoning = isRecognition
+    ? 'Applied mutual recognition framework: acknowledged learner autonomy, invited dialogue as co-inquiry, used Hegelian recognition patterns to validate existing knowledge.'
+    : 'Used standard pedagogical approach: structured explanation with clear progression from definitions to examples.';
+  return {
+    success: true,
+    suggestions: [
+      {
+        type: 'proactive_suggestion',
+        title,
+        message,
+        reasoning,
+        actionTarget: 'content_engagement',
+        priority: 'high',
+      },
+    ],
+    metadata: {
+      latencyMs: 42,
+      inputTokens: 350,
+      outputTokens: 180,
+      apiCalls: 1,
+      totalCost: 0,
+      provider: 'dry-run',
+      model: 'mock-v1',
+      dialogueRounds: resolvedConfig.superegoModel ? 2 : 0,
+      converged: true,
+    },
+    dialogueTrace: resolvedConfig.superegoModel
+      ? [
+          { agent: 'ego', action: 'generate', suggestions: [{ title, type: 'proactive_suggestion' }] },
+          { agent: 'superego', action: 'review', approved: true, feedback: 'Pedagogically sound approach.' },
+        ]
+      : [],
+  };
+}
+/**
+ * Generate a canned judge rubric result (replaces rubricEvaluator.evaluateSuggestion).
+ *
+ * Recognition cells score ~87 (±3 jitter), base cells score ~77 (±3 jitter),
+ * producing the ~10-point effect documented in the paper.
+ *
+ * NOTE(review): the figures quoted above do not match the constants in the
+ * body. The six per-dimension offsets average +0.025, so before jitter the
+ * mean dimension score is 4.325 (recognition) or 3.825 (base), which maps to
+ * an overall of about 83.1 or 70.6 on the 0-100 scale, a 12.5-point gap.
+ * The shared jitter of ±0.3 on the 1-5 scale is by itself about ±7.5 points
+ * on the 0-100 scale, before the per-dimension jitter is added. Confirm
+ * which numbers are intended.
+ *
+ * How the result is built:
+ *   - A cell counts as recognition when profileName contains the substring
+ *     recog, or when config.factors.prompt_type equals recognition.
+ *   - Each of the six dimensions gets base + shared jitter + its own jitter,
+ *     clamped to [1, 5] and rounded to one decimal place.
+ *   - overallScore is the mean of the six rounded scores mapped from the 1-5
+ *     scale to 0-100 and rounded to one decimal place.
+ *   - recognitionScore is overallScore + 2 for recognition cells and null
+ *     otherwise; it is not clamped to 100 and not re-rounded.
+ *   - Output is fully deterministic for a given profileName and seed.
+ *
+ * @param {Object} config - Config object with profileName or factors
+ *   (profileName falls back to an empty string when falsy)
+ * @param {string} [seed] - Optional seed for deterministic jitter (e.g. scenarioId);
+ *   defaults to an empty string
+ * @returns {Object} A rubricResult matching the shape from evaluateSuggestion()
+ */
+export function mockJudgeResult(config, seed = '') {
+  const profileName = config.profileName || '';
+  const isRecognition = profileName.includes('recognition') || profileName.includes('recog')
+    || config.factors?.prompt_type === 'recognition';
+  // Deterministic jitter based on profile + seed
+  const jitter = seededRandom(profileName + seed) * 0.3; // ±0.3 on 1-5 scale
+  // Base scores (1-5 scale): recognition cells ~4.3, base cells ~3.8
+  const baseLevel = isRecognition ? 4.3 : 3.8;
+  const dimensions = { // only the base of each entry is read below; the label values are not used in this function
+    relevance: { base: baseLevel + 0.1, label: 'relevance' },
+    specificity: { base: baseLevel - 0.1, label: 'specificity' },
+    pedagogical: { base: baseLevel + 0.2, label: 'pedagogical_soundness' },
+    personalization: { base: baseLevel, label: 'personalization' },
+    actionability: { base: baseLevel - 0.2, label: 'actionability' },
+    tone: { base: baseLevel + 0.15, label: 'tone' },
+  };
+  const scores = {};
+  for (const [key, dim] of Object.entries(dimensions)) {
+    const dimJitter = seededRandom(key + profileName + seed) * 0.3; // per-dimension jitter, seeded with the dimension key
+    const raw = dim.base + jitter + dimJitter; // shared jitter plus per-dimension jitter
+    const clamped = Math.max(1, Math.min(5, raw)); // keep within the 1-5 rubric scale
+    scores[key] = {
+      score: Math.round(clamped * 10) / 10, // one decimal place
+      reasoning: `[dry-run] ${key}: ${isRecognition ? 'Recognition-enhanced' : 'Standard'} pedagogical approach evaluated.`,
+    };
+  }
+  // Calculate overall on 0-100 scale (same formula as rubricEvaluator)
+  const avgScore = Object.values(scores).reduce((sum, s) => sum + s.score, 0) / Object.keys(scores).length;
+  const overallScore = Math.round(((avgScore - 1) / 4) * 100 * 10) / 10; // map 1-5 onto 0-100, one decimal place
+  return {
+    success: true,
+    scores,
+    overallScore,
+    baseScore: overallScore, // Simplified for dry-run
+    recognitionScore: isRecognition ? overallScore + 2 : null,
+    passesRequired: true,
+    passesForbidden: true,
+    requiredMissing: [],
+    forbiddenFound: [],
+    summary: `[dry-run] ${isRecognition ? 'Recognition-theory enhanced' : 'Standard pedagogical'} response evaluated. Overall: ${overallScore}/100.`,
+    judgeModel: 'dry-run/mock-judge-v1',
+    evaluationTimeMs: 5,
+  };
+}