@machinespirits/eval 0.1.2 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +161 -0
- package/config/eval-settings.yaml +18 -0
- package/config/evaluation-rubric-learner.yaml +277 -0
- package/config/evaluation-rubric.yaml +613 -0
- package/config/interaction-eval-scenarios.yaml +93 -50
- package/config/learner-agents.yaml +124 -193
- package/config/machinespirits-eval.code-workspace +11 -0
- package/config/providers.yaml +60 -0
- package/config/suggestion-scenarios.yaml +1399 -0
- package/config/tutor-agents.yaml +716 -0
- package/docs/EVALUATION-VARIABLES.md +589 -0
- package/docs/REPLICATION-PLAN.md +577 -0
- package/index.js +15 -6
- package/package.json +16 -22
- package/routes/evalRoutes.js +88 -36
- package/scripts/analyze-judge-reliability.js +401 -0
- package/scripts/analyze-run.js +97 -0
- package/scripts/analyze-run.mjs +282 -0
- package/scripts/analyze-validation-failures.js +141 -0
- package/scripts/check-run.mjs +17 -0
- package/scripts/code-impasse-strategies.js +1132 -0
- package/scripts/compare-runs.js +44 -0
- package/scripts/compare-suggestions.js +80 -0
- package/scripts/compare-transformation.js +116 -0
- package/scripts/dig-into-run.js +158 -0
- package/scripts/eval-cli.js +2626 -0
- package/scripts/generate-paper-figures.py +452 -0
- package/scripts/qualitative-analysis-ai.js +1313 -0
- package/scripts/qualitative-analysis.js +688 -0
- package/scripts/seed-db.js +87 -0
- package/scripts/show-failed-suggestions.js +64 -0
- package/scripts/validate-content.js +192 -0
- package/server.js +3 -2
- package/services/__tests__/evalConfigLoader.test.js +338 -0
- package/services/anovaStats.js +499 -0
- package/services/contentResolver.js +407 -0
- package/services/dialogueTraceAnalyzer.js +454 -0
- package/services/evalConfigLoader.js +625 -0
- package/services/evaluationRunner.js +2171 -270
- package/services/evaluationStore.js +564 -29
- package/services/learnerConfigLoader.js +75 -5
- package/services/learnerRubricEvaluator.js +284 -0
- package/services/learnerTutorInteractionEngine.js +375 -0
- package/services/processUtils.js +18 -0
- package/services/progressLogger.js +98 -0
- package/services/promptRecommendationService.js +31 -26
- package/services/promptRewriter.js +427 -0
- package/services/rubricEvaluator.js +543 -70
- package/services/streamingReporter.js +104 -0
- package/services/turnComparisonAnalyzer.js +494 -0
- package/components/MobileEvalDashboard.tsx +0 -267
- package/components/comparison/DeltaAnalysisTable.tsx +0 -137
- package/components/comparison/ProfileComparisonCard.tsx +0 -176
- package/components/comparison/RecognitionABMode.tsx +0 -385
- package/components/comparison/RecognitionMetricsPanel.tsx +0 -135
- package/components/comparison/WinnerIndicator.tsx +0 -64
- package/components/comparison/index.ts +0 -5
- package/components/mobile/BottomSheet.tsx +0 -233
- package/components/mobile/DimensionBreakdown.tsx +0 -210
- package/components/mobile/DocsView.tsx +0 -363
- package/components/mobile/LogsView.tsx +0 -481
- package/components/mobile/PsychodynamicQuadrant.tsx +0 -261
- package/components/mobile/QuickTestView.tsx +0 -1098
- package/components/mobile/RecognitionTypeChart.tsx +0 -124
- package/components/mobile/RecognitionView.tsx +0 -809
- package/components/mobile/RunDetailView.tsx +0 -261
- package/components/mobile/RunHistoryView.tsx +0 -367
- package/components/mobile/ScoreRadial.tsx +0 -211
- package/components/mobile/StreamingLogPanel.tsx +0 -230
- package/components/mobile/SynthesisStrategyChart.tsx +0 -140
- package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +0 -52
- package/docs/research/ABLATION-MODEL-SELECTION.md +0 -53
- package/docs/research/ADVANCED-EVAL-ANALYSIS.md +0 -60
- package/docs/research/ANOVA-RESULTS-2026-01-14.md +0 -257
- package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +0 -586
- package/docs/research/COST-ANALYSIS.md +0 -56
- package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +0 -340
- package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +0 -291
- package/docs/research/EVAL-SYSTEM-ANALYSIS.md +0 -306
- package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +0 -301
- package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +0 -1988
- package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +0 -282
- package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +0 -147
- package/docs/research/PAPER-EXTENSION-DYADIC.md +0 -204
- package/docs/research/PAPER-UNIFIED.md +0 -659
- package/docs/research/PAPER-UNIFIED.pdf +0 -0
- package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +0 -356
- package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +0 -419
- package/docs/research/apa.csl +0 -2133
- package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +0 -1637
- package/docs/research/archive/paper-multiagent-tutor.tex +0 -978
- package/docs/research/paper-draft/full-paper.md +0 -136
- package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
- package/docs/research/paper-draft/references.bib +0 -515
- package/docs/research/transcript-baseline.md +0 -139
- package/docs/research/transcript-recognition-multiagent.md +0 -187
- package/hooks/useEvalData.ts +0 -625
- package/server-init.js +0 -45
- package/services/benchmarkService.js +0 -1892
- package/types.ts +0 -165
- package/utils/haptics.ts +0 -45
|
@@ -6,7 +6,8 @@
|
|
|
6
6
|
* Provider details are resolved from config/providers.yaml
|
|
7
7
|
*/
|
|
8
8
|
|
|
9
|
-
import
|
|
9
|
+
import * as evalConfigLoader from './evalConfigLoader.js';
|
|
10
|
+
import { jsonrepair } from 'jsonrepair';
|
|
10
11
|
|
|
11
12
|
// Debug logging helper - suppressed in transcript mode for clean output
|
|
12
13
|
function debugLog(...args) {
|
|
@@ -16,13 +17,41 @@ function debugLog(...args) {
|
|
|
16
17
|
}
|
|
17
18
|
|
|
18
19
|
/**
|
|
19
|
-
* Get available
|
|
20
|
+
* Get available judge configuration, resolving model references via providers.yaml
|
|
20
21
|
* Tries primary model first, then fallback if primary is not configured
|
|
22
|
+
*
|
|
23
|
+
* @param {Object} [overrides] - Optional judge override
|
|
24
|
+
* @param {Object} [overrides.judgeOverride] - Override judge model config
|
|
25
|
+
* @param {string} [overrides.judgeOverride.model] - Model reference (e.g. 'anthropic/claude-opus-4.5')
|
|
26
|
+
* @param {string} [overrides.judgeOverride.apiKeyEnv] - Env var name for API key
|
|
27
|
+
* @param {Object} [overrides.judgeOverride.hyperparameters] - Override hyperparameters
|
|
21
28
|
*/
|
|
22
|
-
function
|
|
23
|
-
const
|
|
24
|
-
|
|
25
|
-
|
|
29
|
+
export function getAvailableJudge(overrides = {}) {
|
|
30
|
+
const { judgeOverride } = overrides;
|
|
31
|
+
|
|
32
|
+
// If a judge override is provided, resolve and return it directly
|
|
33
|
+
if (judgeOverride?.model) {
|
|
34
|
+
try {
|
|
35
|
+
const resolved = evalConfigLoader.resolveModel(judgeOverride.model);
|
|
36
|
+
// Allow apiKeyEnv override
|
|
37
|
+
let apiKey = resolved.apiKey;
|
|
38
|
+
if (judgeOverride.apiKeyEnv) {
|
|
39
|
+
apiKey = process.env[judgeOverride.apiKeyEnv] || apiKey;
|
|
40
|
+
}
|
|
41
|
+
return {
|
|
42
|
+
provider: resolved.provider,
|
|
43
|
+
model: resolved.model,
|
|
44
|
+
apiKey,
|
|
45
|
+
baseUrl: resolved.baseUrl,
|
|
46
|
+
hyperparameters: judgeOverride.hyperparameters || {},
|
|
47
|
+
};
|
|
48
|
+
} catch (e) {
|
|
49
|
+
console.warn(`[rubricEvaluator] Failed to resolve judge override: ${e.message}, falling back to rubric config`);
|
|
50
|
+
}
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
const rubric = evalConfigLoader.loadRubric();
|
|
54
|
+
const evalConfig = rubric?.judge;
|
|
26
55
|
|
|
27
56
|
if (!evalConfig?.model) {
|
|
28
57
|
console.warn('[rubricEvaluator] No judge config in evaluation-rubric.yaml, using defaults');
|
|
@@ -35,7 +64,7 @@ function getAvailableEvaluator() {
|
|
|
35
64
|
|
|
36
65
|
// Try primary model
|
|
37
66
|
try {
|
|
38
|
-
const resolved =
|
|
67
|
+
const resolved = evalConfigLoader.resolveModel(evalConfig.model);
|
|
39
68
|
if (resolved.isConfigured) {
|
|
40
69
|
return {
|
|
41
70
|
provider: resolved.provider,
|
|
@@ -46,15 +75,15 @@ function getAvailableEvaluator() {
|
|
|
46
75
|
};
|
|
47
76
|
}
|
|
48
77
|
} catch (e) {
|
|
49
|
-
console.warn(`[rubricEvaluator] Failed to resolve primary
|
|
78
|
+
console.warn(`[rubricEvaluator] Failed to resolve primary judge: ${e.message}`);
|
|
50
79
|
}
|
|
51
80
|
|
|
52
81
|
// Try fallback
|
|
53
82
|
if (evalConfig.fallback?.model) {
|
|
54
83
|
try {
|
|
55
|
-
const fallback =
|
|
84
|
+
const fallback = evalConfigLoader.resolveModel(evalConfig.fallback.model);
|
|
56
85
|
if (fallback.isConfigured) {
|
|
57
|
-
debugLog(`[rubricEvaluator] Using fallback
|
|
86
|
+
debugLog(`[rubricEvaluator] Using fallback judge: ${fallback.provider}/${fallback.model}`);
|
|
58
87
|
return {
|
|
59
88
|
provider: fallback.provider,
|
|
60
89
|
model: fallback.model,
|
|
@@ -64,12 +93,12 @@ function getAvailableEvaluator() {
|
|
|
64
93
|
};
|
|
65
94
|
}
|
|
66
95
|
} catch (e) {
|
|
67
|
-
console.warn(`[rubricEvaluator] Failed to resolve fallback
|
|
96
|
+
console.warn(`[rubricEvaluator] Failed to resolve fallback judge: ${e.message}`);
|
|
68
97
|
}
|
|
69
98
|
}
|
|
70
99
|
|
|
71
100
|
// Return primary anyway - will fail with helpful error
|
|
72
|
-
const resolved =
|
|
101
|
+
const resolved = evalConfigLoader.resolveModel(evalConfig.model);
|
|
73
102
|
return {
|
|
74
103
|
provider: resolved.provider,
|
|
75
104
|
model: resolved.model,
|
|
@@ -78,17 +107,16 @@ function getAvailableEvaluator() {
|
|
|
78
107
|
}
|
|
79
108
|
|
|
80
109
|
/**
|
|
81
|
-
* Get the fallback
|
|
110
|
+
* Get the fallback judge config (if different from primary)
|
|
82
111
|
*/
|
|
83
|
-
function
|
|
84
|
-
const rubric =
|
|
85
|
-
|
|
86
|
-
const evalConfig = rubric?.judge || rubric?.evaluator;
|
|
112
|
+
function getFallbackJudge() {
|
|
113
|
+
const rubric = evalConfigLoader.loadRubric();
|
|
114
|
+
const evalConfig = rubric?.judge;
|
|
87
115
|
|
|
88
116
|
if (!evalConfig?.fallback?.model) return null;
|
|
89
117
|
|
|
90
118
|
try {
|
|
91
|
-
const fallback =
|
|
119
|
+
const fallback = evalConfigLoader.resolveModel(evalConfig.fallback.model);
|
|
92
120
|
if (fallback.isConfigured) {
|
|
93
121
|
return {
|
|
94
122
|
provider: fallback.provider,
|
|
@@ -135,6 +163,7 @@ async function callJudgeModelWithConfig(prompt, config) {
|
|
|
135
163
|
model,
|
|
136
164
|
max_tokens: maxTokens,
|
|
137
165
|
temperature,
|
|
166
|
+
include_reasoning: false,
|
|
138
167
|
messages: [{ role: 'user', content: prompt }],
|
|
139
168
|
}),
|
|
140
169
|
signal: controller.signal,
|
|
@@ -215,12 +244,94 @@ async function callJudgeModelWithConfig(prompt, config) {
|
|
|
215
244
|
}
|
|
216
245
|
}
|
|
217
246
|
|
|
247
|
+
/**
|
|
248
|
+
* Format a dialogue transcript for the judge prompt.
|
|
249
|
+
* Renders the conversation history and internal deliberation traces as
|
|
250
|
+
* a readable exchange so the judge can evaluate the suggestion in context.
|
|
251
|
+
*
|
|
252
|
+
* @param {Object} dialogueContext - Dialogue context from the evaluation runner
|
|
253
|
+
* @param {Array} dialogueContext.conversationHistory - Array of turn objects
|
|
254
|
+
* @param {Array} dialogueContext.dialogueTrace - Current turn's dialogue trace
|
|
255
|
+
* @param {Array} dialogueContext.consolidatedTrace - Full multi-turn consolidated trace
|
|
256
|
+
* @returns {string|null} Formatted transcript section, or null if no dialogue data
|
|
257
|
+
*/
|
|
258
|
+
function formatDialogueTranscript(dialogueContext) {
|
|
259
|
+
if (!dialogueContext) return null;
|
|
260
|
+
|
|
261
|
+
const { conversationHistory, dialogueTrace, consolidatedTrace } = dialogueContext;
|
|
262
|
+
|
|
263
|
+
// Use consolidatedTrace if available (richest source), otherwise fall back to conversationHistory
|
|
264
|
+
const trace = consolidatedTrace?.length > 0 ? consolidatedTrace : null;
|
|
265
|
+
const history = conversationHistory?.length > 0 ? conversationHistory : null;
|
|
266
|
+
|
|
267
|
+
if (!trace && !history) return null;
|
|
268
|
+
|
|
269
|
+
const lines = [];
|
|
270
|
+
|
|
271
|
+
if (trace) {
|
|
272
|
+
// Format from consolidated trace (includes internal deliberation)
|
|
273
|
+
let currentTurnIdx = -1;
|
|
274
|
+
for (const entry of trace) {
|
|
275
|
+
// Turn separator
|
|
276
|
+
if (entry.turnIndex !== undefined && entry.turnIndex !== currentTurnIdx) {
|
|
277
|
+
currentTurnIdx = entry.turnIndex;
|
|
278
|
+
lines.push(`\n--- Turn ${currentTurnIdx} ---`);
|
|
279
|
+
}
|
|
280
|
+
|
|
281
|
+
if (entry.agent === 'user' && entry.action === 'turn_action') {
|
|
282
|
+
lines.push(`[Learner Action] ${entry.detail || entry.contextSummary}`);
|
|
283
|
+
} else if (entry.agent === 'learner_ego') {
|
|
284
|
+
lines.push(` (Learner Ego: ${truncate(entry.detail || entry.contextSummary, 200)})`);
|
|
285
|
+
} else if (entry.agent === 'learner_superego') {
|
|
286
|
+
lines.push(` (Learner Superego: ${truncate(entry.detail || entry.contextSummary, 200)})`);
|
|
287
|
+
} else if (entry.agent === 'learner_synthesis') {
|
|
288
|
+
lines.push(`[Learner] "${truncate(entry.detail || entry.contextSummary, 300)}"`);
|
|
289
|
+
} else if (entry.agent === 'ego' && entry.action === 'initial_draft') {
|
|
290
|
+
lines.push(` (Tutor Ego draft: ${truncate(entry.contextSummary || '', 150)})`);
|
|
291
|
+
} else if (entry.agent === 'superego') {
|
|
292
|
+
lines.push(` (Tutor Superego: ${truncate(entry.contextSummary || '', 150)})`);
|
|
293
|
+
} else if (entry.agent === 'ego' && (entry.action === 'revision' || entry.action === 'final_revision')) {
|
|
294
|
+
lines.push(`[Tutor] (revised after superego feedback)`);
|
|
295
|
+
} else if (entry.agent === 'user' && entry.action === 'final_output') {
|
|
296
|
+
lines.push(`[Tutor → Learner] Delivered ${entry.suggestionCount} suggestion(s)`);
|
|
297
|
+
} else if (entry.agent === 'ego') {
|
|
298
|
+
// Single-agent tutor response
|
|
299
|
+
lines.push(`[Tutor] ${truncate(entry.contextSummary || '', 200)}`);
|
|
300
|
+
}
|
|
301
|
+
}
|
|
302
|
+
} else if (history) {
|
|
303
|
+
// Format from conversation history (less detail, no internal deliberation)
|
|
304
|
+
for (const turn of history) {
|
|
305
|
+
lines.push(`\n--- Turn ${turn.turnIndex} ---`);
|
|
306
|
+
if (turn.learnerMessage) {
|
|
307
|
+
lines.push(`[Learner] "${truncate(turn.learnerMessage, 300)}"`);
|
|
308
|
+
} else if (turn.learnerAction) {
|
|
309
|
+
lines.push(`[Learner Action] ${turn.learnerAction}`);
|
|
310
|
+
}
|
|
311
|
+
if (turn.suggestion) {
|
|
312
|
+
const msg = turn.suggestion.message || turn.suggestion.title || '';
|
|
313
|
+
lines.push(`[Tutor] "${truncate(msg, 300)}"`);
|
|
314
|
+
}
|
|
315
|
+
}
|
|
316
|
+
}
|
|
317
|
+
|
|
318
|
+
return lines.join('\n');
|
|
319
|
+
}
|
|
320
|
+
|
|
321
|
+
/**
|
|
322
|
+
* Truncate a string to maxLen characters, adding ellipsis if needed.
|
|
323
|
+
*/
|
|
324
|
+
function truncate(str, maxLen) {
|
|
325
|
+
if (!str) return '';
|
|
326
|
+
if (str.length <= maxLen) return str;
|
|
327
|
+
return str.slice(0, maxLen - 3) + '...';
|
|
328
|
+
}
|
|
329
|
+
|
|
218
330
|
/**
|
|
219
331
|
* Build the evaluation prompt for the judge model
|
|
220
332
|
*/
|
|
221
333
|
function buildEvaluationPrompt(suggestion, scenario, context) {
|
|
222
|
-
const
|
|
223
|
-
const dimensions = rubric?.dimensions || {};
|
|
334
|
+
const dimensions = evalConfigLoader.getRubricDimensions();
|
|
224
335
|
|
|
225
336
|
// Build dimension criteria text
|
|
226
337
|
const dimensionCriteria = Object.entries(dimensions).map(([key, dim]) => {
|
|
@@ -233,7 +344,18 @@ Criteria:
|
|
|
233
344
|
${criteriaText}`;
|
|
234
345
|
}).join('\n\n');
|
|
235
346
|
|
|
236
|
-
|
|
347
|
+
// Build optional dialogue transcript section
|
|
348
|
+
const dialogueTranscript = formatDialogueTranscript(context.dialogueContext);
|
|
349
|
+
const dialogueSection = dialogueTranscript
|
|
350
|
+
? `\n## DIALOGUE TRANSCRIPT
|
|
351
|
+
|
|
352
|
+
The following is the full learner-tutor exchange leading to this suggestion. Internal deliberation traces (ego/superego) show the reasoning process. Use this context to evaluate how well the tutor responded to the learner's actual engagement, struggle, and development.
|
|
353
|
+
|
|
354
|
+
${dialogueTranscript}
|
|
355
|
+
`
|
|
356
|
+
: '';
|
|
357
|
+
|
|
358
|
+
return `You are an expert evaluator of AI tutoring systems. Evaluate the following AI tutor suggestion against the pedagogical rubric.${dialogueTranscript ? ' The suggestion was produced in the context of a multi-turn dialogue — evaluate it in that context, considering how the tutor responds to the learner\'s actual engagement and development.' : ''}
|
|
237
359
|
|
|
238
360
|
## EVALUATION RUBRIC
|
|
239
361
|
|
|
@@ -254,7 +376,7 @@ ${dimensionCriteria}
|
|
|
254
376
|
|
|
255
377
|
**Learner Context**:
|
|
256
378
|
${scenario.learnerContext || context.learnerContext || 'No context provided'}
|
|
257
|
-
|
|
379
|
+
${dialogueSection}
|
|
258
380
|
## SUGGESTION TO EVALUATE
|
|
259
381
|
|
|
260
382
|
\`\`\`json
|
|
@@ -271,30 +393,39 @@ ${(scenario.forbiddenElements || []).map(e => `- ${e}`).join('\n') || '- None sp
|
|
|
271
393
|
|
|
272
394
|
## YOUR TASK
|
|
273
395
|
|
|
274
|
-
Evaluate the suggestion and provide:
|
|
275
|
-
1. A score (1-5) for each dimension with reasoning
|
|
396
|
+
Evaluate the suggestion${dialogueTranscript ? ' in the context of the dialogue above' : ''} and provide:
|
|
397
|
+
1. A score (1-5) for each dimension with reasoning
|
|
276
398
|
2. Whether it passes the required/forbidden element checks
|
|
277
399
|
3. An overall score (weighted average, 0-100 scale)
|
|
278
400
|
|
|
279
401
|
For each dimension, include:
|
|
280
402
|
- **score**: 1-5 rating
|
|
281
|
-
- **reasoning**: Brief explanation of why this score was given
|
|
282
|
-
|
|
403
|
+
- **reasoning**: Brief explanation of why this score was given${dialogueTranscript ? '. For recognition dimensions, consider how the tutor engaged with the learner\'s actual responses and development.' : ''}
|
|
404
|
+
|
|
405
|
+
CRITICAL JSON RULES:
|
|
406
|
+
- Never use unescaped double quotes inside JSON string values. Use single quotes or rephrase.
|
|
407
|
+
- Keep "reasoning" values under 25 words.
|
|
408
|
+
- BAD: "reasoning": "Says "great job" which is encouraging"
|
|
409
|
+
- GOOD: "reasoning": "Says 'great job' which is encouraging"
|
|
283
410
|
|
|
284
|
-
Respond with ONLY a JSON object in this exact format:
|
|
411
|
+
Respond with ONLY a JSON object in this exact format (no other text before or after):
|
|
285
412
|
\`\`\`json
|
|
286
413
|
{
|
|
287
414
|
"scores": {
|
|
288
|
-
"relevance": {"score": 4, "reasoning": "Matches
|
|
289
|
-
"specificity": {"score": 5, "reasoning": "Names exact lecture"
|
|
290
|
-
"pedagogical_soundness": {"score": 4, "reasoning": "Uses scaffolding"
|
|
291
|
-
"personalization": {"score": 3, "reasoning": "Generic advice"
|
|
292
|
-
"actionability": {"score": 5, "reasoning": "Clear next step"
|
|
293
|
-
"tone": {"score": 4, "reasoning": "Encouraging
|
|
294
|
-
"mutual_recognition": {"score": 4, "reasoning": "Acknowledges
|
|
295
|
-
"dialectical_responsiveness": {"score": 3, "reasoning": "Responds
|
|
296
|
-
"memory_integration": {"score": 4, "reasoning": "References
|
|
297
|
-
"transformative_potential": {"score": 3, "reasoning": "Informative
|
|
415
|
+
"relevance": {"score": 4, "reasoning": "Matches idle state well"},
|
|
416
|
+
"specificity": {"score": 5, "reasoning": "Names exact lecture"},
|
|
417
|
+
"pedagogical_soundness": {"score": 4, "reasoning": "Uses scaffolding"},
|
|
418
|
+
"personalization": {"score": 3, "reasoning": "Generic advice"},
|
|
419
|
+
"actionability": {"score": 5, "reasoning": "Clear next step"},
|
|
420
|
+
"tone": {"score": 4, "reasoning": "Encouraging tone"},
|
|
421
|
+
"mutual_recognition": {"score": 4, "reasoning": "Acknowledges interpretation"},
|
|
422
|
+
"dialectical_responsiveness": {"score": 3, "reasoning": "Responds without tension"},
|
|
423
|
+
"memory_integration": {"score": 4, "reasoning": "References prior session"},
|
|
424
|
+
"transformative_potential": {"score": 3, "reasoning": "Informative not transformative"},
|
|
425
|
+
"tutor_adaptation": {"score": 3, "reasoning": "Some adjustment to input"},
|
|
426
|
+
"learner_growth": {"score": 4, "reasoning": "Shows conceptual development"},
|
|
427
|
+
"productive_struggle": {"score": 4, "reasoning": "Sustains appropriate tension"},
|
|
428
|
+
"epistemic_honesty": {"score": 4, "reasoning": "Represents complexity fairly"}
|
|
298
429
|
},
|
|
299
430
|
"validation": {
|
|
300
431
|
"passes_required": true,
|
|
@@ -310,10 +441,20 @@ Respond with ONLY a JSON object in this exact format:
|
|
|
310
441
|
|
|
311
442
|
/**
|
|
312
443
|
* Call the judge model (simple single-model approach)
|
|
444
|
+
*
|
|
445
|
+
* @param {string} prompt - The evaluation prompt
|
|
446
|
+
* @param {Object} [overrides] - Optional overrides (passed to getAvailableEvaluator)
|
|
313
447
|
*/
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
448
|
+
// Models/prefixes that support response_format: { type: "json_object" }
|
|
449
|
+
const JSON_MODE_PREFIXES = ['gpt-', 'deepseek-', 'claude-'];
|
|
450
|
+
|
|
451
|
+
function supportsJsonMode(model) {
|
|
452
|
+
return JSON_MODE_PREFIXES.some(prefix => model.startsWith(prefix));
|
|
453
|
+
}
|
|
454
|
+
|
|
455
|
+
async function callJudgeModel(prompt, overrides = {}) {
|
|
456
|
+
const judge = getAvailableJudge(overrides);
|
|
457
|
+
const { provider, model, hyperparameters } = judge;
|
|
317
458
|
const temperature = hyperparameters?.temperature ?? 0.2;
|
|
318
459
|
const maxTokens = hyperparameters?.max_tokens ?? 1500;
|
|
319
460
|
|
|
@@ -372,18 +513,24 @@ async function callJudgeModel(prompt) {
|
|
|
372
513
|
const timeout = setTimeout(() => controller.abort(), 60000);
|
|
373
514
|
|
|
374
515
|
try {
|
|
516
|
+
const body = {
|
|
517
|
+
model,
|
|
518
|
+
max_tokens: maxTokens,
|
|
519
|
+
temperature,
|
|
520
|
+
include_reasoning: false,
|
|
521
|
+
messages: [{ role: 'user', content: prompt }],
|
|
522
|
+
};
|
|
523
|
+
if (supportsJsonMode(model)) {
|
|
524
|
+
body.response_format = { type: 'json_object' };
|
|
525
|
+
}
|
|
526
|
+
|
|
375
527
|
const res = await fetch('https://openrouter.ai/api/v1/chat/completions', {
|
|
376
528
|
method: 'POST',
|
|
377
529
|
headers: {
|
|
378
530
|
'Content-Type': 'application/json',
|
|
379
531
|
'Authorization': `Bearer ${apiKey}`,
|
|
380
532
|
},
|
|
381
|
-
body: JSON.stringify(
|
|
382
|
-
model,
|
|
383
|
-
max_tokens: maxTokens,
|
|
384
|
-
temperature,
|
|
385
|
-
messages: [{ role: 'user', content: prompt }],
|
|
386
|
-
}),
|
|
533
|
+
body: JSON.stringify(body),
|
|
387
534
|
signal: controller.signal,
|
|
388
535
|
});
|
|
389
536
|
|
|
@@ -417,18 +564,23 @@ async function callJudgeModel(prompt) {
|
|
|
417
564
|
const timeout = setTimeout(() => controller.abort(), 60000);
|
|
418
565
|
|
|
419
566
|
try {
|
|
567
|
+
const body = {
|
|
568
|
+
model,
|
|
569
|
+
max_tokens: maxTokens,
|
|
570
|
+
temperature,
|
|
571
|
+
messages: [{ role: 'user', content: prompt }],
|
|
572
|
+
};
|
|
573
|
+
if (supportsJsonMode(model)) {
|
|
574
|
+
body.response_format = { type: 'json_object' };
|
|
575
|
+
}
|
|
576
|
+
|
|
420
577
|
const res = await fetch('https://api.openai.com/v1/chat/completions', {
|
|
421
578
|
method: 'POST',
|
|
422
579
|
headers: {
|
|
423
580
|
'Content-Type': 'application/json',
|
|
424
581
|
'Authorization': `Bearer ${apiKey}`,
|
|
425
582
|
},
|
|
426
|
-
body: JSON.stringify(
|
|
427
|
-
model,
|
|
428
|
-
max_tokens: maxTokens,
|
|
429
|
-
temperature,
|
|
430
|
-
messages: [{ role: 'user', content: prompt }],
|
|
431
|
-
}),
|
|
583
|
+
body: JSON.stringify(body),
|
|
432
584
|
signal: controller.signal,
|
|
433
585
|
});
|
|
434
586
|
|
|
@@ -502,20 +654,155 @@ async function callJudgeModel(prompt) {
|
|
|
502
654
|
throw new Error(`Unsupported judge provider: ${provider}`);
|
|
503
655
|
}
|
|
504
656
|
|
|
657
|
+
/**
|
|
658
|
+
* Repair unescaped double quotes inside JSON string values.
|
|
659
|
+
* Targets patterns like: "key": "text with "inner" quotes"
|
|
660
|
+
* Replaces inner unescaped quotes with single quotes.
|
|
661
|
+
*/
|
|
662
|
+
function repairUnescapedQuotes(jsonStr) {
|
|
663
|
+
// Strategy: walk through the string tracking whether we're inside a JSON string value.
|
|
664
|
+
// When we find a quote that isn't at a key/value boundary, replace it with a single quote.
|
|
665
|
+
let result = '';
|
|
666
|
+
let i = 0;
|
|
667
|
+
const len = jsonStr.length;
|
|
668
|
+
|
|
669
|
+
while (i < len) {
|
|
670
|
+
const ch = jsonStr[i];
|
|
671
|
+
|
|
672
|
+
if (ch === '"') {
|
|
673
|
+
// Find the matching close quote for this JSON string
|
|
674
|
+
result += '"';
|
|
675
|
+
i++;
|
|
676
|
+
// Scan for the true end of this string value
|
|
677
|
+
while (i < len) {
|
|
678
|
+
const c = jsonStr[i];
|
|
679
|
+
if (c === '\\') {
|
|
680
|
+
// Escaped character — pass through both chars
|
|
681
|
+
result += jsonStr[i] + (jsonStr[i + 1] || '');
|
|
682
|
+
i += 2;
|
|
683
|
+
continue;
|
|
684
|
+
}
|
|
685
|
+
if (c === '"') {
|
|
686
|
+
// Is this the real end of the string? Look ahead for JSON structure chars
|
|
687
|
+
const after = jsonStr.slice(i + 1).trimStart();
|
|
688
|
+
if (after[0] === ':' || after[0] === ',' || after[0] === '}' || after[0] === ']' || after.length === 0) {
|
|
689
|
+
// This is a real closing quote
|
|
690
|
+
result += '"';
|
|
691
|
+
i++;
|
|
692
|
+
break;
|
|
693
|
+
} else {
|
|
694
|
+
// This is an unescaped inner quote — replace with single quote
|
|
695
|
+
result += "'";
|
|
696
|
+
i++;
|
|
697
|
+
continue;
|
|
698
|
+
}
|
|
699
|
+
}
|
|
700
|
+
result += c;
|
|
701
|
+
i++;
|
|
702
|
+
}
|
|
703
|
+
} else {
|
|
704
|
+
result += ch;
|
|
705
|
+
i++;
|
|
706
|
+
}
|
|
707
|
+
}
|
|
708
|
+
|
|
709
|
+
return result;
|
|
710
|
+
}
|
|
711
|
+
|
|
712
|
+
/**
|
|
713
|
+
* Last-resort regex extraction of individual dimension scores.
|
|
714
|
+
* Returns a partial result object or null if too few scores found.
|
|
715
|
+
*/
|
|
716
|
+
function regexScoreRescue(text) {
|
|
717
|
+
const dimensionNames = [
|
|
718
|
+
'relevance', 'specificity', 'pedagogical_soundness', 'personalization',
|
|
719
|
+
'actionability', 'tone', 'mutual_recognition', 'dialectical_responsiveness',
|
|
720
|
+
'memory_integration', 'transformative_potential', 'tutor_adaptation',
|
|
721
|
+
'learner_growth', 'productive_struggle', 'epistemic_honesty',
|
|
722
|
+
];
|
|
723
|
+
|
|
724
|
+
const scores = {};
|
|
725
|
+
for (const dim of dimensionNames) {
|
|
726
|
+
// Match patterns like: "relevance": {"score": 4 or "relevance":{"score":4
|
|
727
|
+
const pattern = new RegExp(`"${dim}"\\s*:\\s*\\{?\\s*"?score"?\\s*:\\s*(\\d)`, 'i');
|
|
728
|
+
const match = text.match(pattern);
|
|
729
|
+
if (match) {
|
|
730
|
+
scores[dim] = { score: parseInt(match[1], 10), reasoning: null };
|
|
731
|
+
}
|
|
732
|
+
}
|
|
733
|
+
|
|
734
|
+
// Need at least 3 scores for a useful partial result
|
|
735
|
+
if (Object.keys(scores).length < 3) return null;
|
|
736
|
+
|
|
737
|
+
debugLog(`[rubricEvaluator] Regex rescue recovered ${Object.keys(scores).length} scores`);
|
|
738
|
+
|
|
739
|
+
// Try to extract overall_score and summary
|
|
740
|
+
const overallMatch = text.match(/"overall_score"\s*:\s*(\d+)/);
|
|
741
|
+
const summaryMatch = text.match(/"summary"\s*:\s*"([^"]+)"/);
|
|
742
|
+
|
|
743
|
+
return {
|
|
744
|
+
scores,
|
|
745
|
+
validation: { passes_required: true, required_missing: [], passes_forbidden: true, forbidden_found: [] },
|
|
746
|
+
overall_score: overallMatch ? parseInt(overallMatch[1], 10) : null,
|
|
747
|
+
summary: summaryMatch ? summaryMatch[1] : 'Partial scores recovered via regex rescue',
|
|
748
|
+
};
|
|
749
|
+
}
|
|
750
|
+
|
|
505
751
|
/**
|
|
506
752
|
* Parse the judge model's JSON response
|
|
507
753
|
*/
|
|
508
754
|
function parseJudgeResponse(responseText) {
|
|
509
755
|
// Extract JSON from response (may be wrapped in markdown code block)
|
|
510
|
-
|
|
511
|
-
|
|
756
|
+
let jsonMatch = responseText.match(/```(?:json)?\s*([\s\S]*?)```/);
|
|
757
|
+
|
|
758
|
+
if (!jsonMatch) {
|
|
759
|
+
// Strip preamble/postamble text — find first { and last }
|
|
760
|
+
const firstBrace = responseText.indexOf('{');
|
|
761
|
+
const lastBrace = responseText.lastIndexOf('}');
|
|
762
|
+
if (firstBrace !== -1 && lastBrace > firstBrace) {
|
|
763
|
+
jsonMatch = [null, responseText.slice(firstBrace, lastBrace + 1)];
|
|
764
|
+
}
|
|
765
|
+
}
|
|
512
766
|
|
|
513
767
|
if (!jsonMatch) {
|
|
514
768
|
throw new Error('Could not parse judge response as JSON');
|
|
515
769
|
}
|
|
516
770
|
|
|
517
771
|
const jsonStr = jsonMatch[1] || jsonMatch[0];
|
|
518
|
-
|
|
772
|
+
|
|
773
|
+
try {
|
|
774
|
+
return JSON.parse(jsonStr);
|
|
775
|
+
} catch (e) {
|
|
776
|
+
// Try to fix common JSON issues: trailing commas, unescaped newlines in strings
|
|
777
|
+
const cleaned = jsonStr
|
|
778
|
+
.replace(/,\s*([}\]])/g, '$1') // trailing commas
|
|
779
|
+
.replace(/[\x00-\x1f]/g, m => // control chars in strings
|
|
780
|
+
m === '\n' ? '\\n' : m === '\t' ? '\\t' : m === '\r' ? '\\r' : '');
|
|
781
|
+
try {
|
|
782
|
+
return JSON.parse(cleaned);
|
|
783
|
+
} catch (e2) {
|
|
784
|
+
// Attempt JSON repair: fix unescaped double quotes inside string values
|
|
785
|
+
// Pattern: "key": "text with "inner" quotes" → "key": "text with 'inner' quotes"
|
|
786
|
+
debugLog('[rubricEvaluator] Attempting JSON repair for unescaped quotes...');
|
|
787
|
+
try {
|
|
788
|
+
const repaired = repairUnescapedQuotes(cleaned);
|
|
789
|
+
return JSON.parse(repaired);
|
|
790
|
+
} catch (e3) {
|
|
791
|
+
// Final fallback: use jsonrepair library which handles many more edge cases
|
|
792
|
+
debugLog('[rubricEvaluator] Attempting jsonrepair library fallback...');
|
|
793
|
+
try {
|
|
794
|
+
const robustRepaired = jsonrepair(jsonStr);
|
|
795
|
+
return JSON.parse(robustRepaired);
|
|
796
|
+
} catch (e4) {
|
|
797
|
+
// Last resort: regex rescue — extract individual scores
|
|
798
|
+
debugLog('[rubricEvaluator] Attempting regex score rescue...');
|
|
799
|
+
const rescued = regexScoreRescue(jsonStr);
|
|
800
|
+
if (rescued) return rescued;
|
|
801
|
+
throw new Error(`Could not parse judge response as JSON: initial=${e.message}, repair=${e3.message}, jsonrepair=${e4.message}`);
|
|
802
|
+
}
|
|
803
|
+
}
|
|
804
|
+
}
|
|
805
|
+
}
|
|
519
806
|
}
|
|
520
807
|
|
|
521
808
|
/**
|
|
@@ -524,15 +811,17 @@ function parseJudgeResponse(responseText) {
|
|
|
524
811
|
* @param {Object} suggestion - The suggestion to evaluate
|
|
525
812
|
* @param {Object} scenario - The test scenario
|
|
526
813
|
* @param {Object} context - Additional context
|
|
814
|
+
* @param {Object} [overrides] - Optional overrides
|
|
815
|
+
* @param {Object} [overrides.judgeOverride] - Override judge model config
|
|
527
816
|
* @returns {Promise<Object>} Evaluation result
|
|
528
817
|
*/
|
|
529
|
-
export async function evaluateSuggestion(suggestion, scenario, context = {}) {
|
|
818
|
+
export async function evaluateSuggestion(suggestion, scenario, context = {}, overrides = {}) {
|
|
530
819
|
const startTime = Date.now();
|
|
531
|
-
const
|
|
820
|
+
const judge = getAvailableJudge(overrides);
|
|
532
821
|
|
|
533
822
|
try {
|
|
534
823
|
const prompt = buildEvaluationPrompt(suggestion, scenario, context);
|
|
535
|
-
let responseText = await callJudgeModel(prompt);
|
|
824
|
+
let responseText = await callJudgeModel(prompt, overrides);
|
|
536
825
|
|
|
537
826
|
// Log raw response for debugging
|
|
538
827
|
debugLog('[rubricEvaluator] Judge raw response (first 300 chars):', responseText.slice(0, 300));
|
|
@@ -540,7 +829,7 @@ export async function evaluateSuggestion(suggestion, scenario, context = {}) {
|
|
|
540
829
|
// Handle empty response - try fallback model
|
|
541
830
|
if (!responseText || responseText.trim() === '') {
|
|
542
831
|
console.warn('[rubricEvaluator] Primary judge returned empty response, trying fallback...');
|
|
543
|
-
const fallbackConfig =
|
|
832
|
+
const fallbackConfig = getFallbackJudge();
|
|
544
833
|
if (fallbackConfig) {
|
|
545
834
|
responseText = await callJudgeModelWithConfig(prompt, fallbackConfig);
|
|
546
835
|
debugLog('[rubricEvaluator] Fallback response (first 300 chars):', responseText.slice(0, 300));
|
|
@@ -550,7 +839,35 @@ export async function evaluateSuggestion(suggestion, scenario, context = {}) {
|
|
|
550
839
|
}
|
|
551
840
|
}
|
|
552
841
|
|
|
553
|
-
|
|
842
|
+
let parsed;
|
|
843
|
+
try {
|
|
844
|
+
parsed = parseJudgeResponse(responseText);
|
|
845
|
+
} catch (parseError) {
|
|
846
|
+
// JSON parse failed — retry with fallback model before giving up
|
|
847
|
+
console.warn(`[rubricEvaluator] Parse failed (${parseError.message}), retrying with fallback...`);
|
|
848
|
+
const fallbackConfig = getFallbackJudge();
|
|
849
|
+
if (fallbackConfig) {
|
|
850
|
+
let retryText = await callJudgeModelWithConfig(prompt, fallbackConfig);
|
|
851
|
+
if (retryText && retryText.trim()) {
|
|
852
|
+
try {
|
|
853
|
+
parsed = parseJudgeResponse(retryText);
|
|
854
|
+
} catch (retryParseError) {
|
|
855
|
+
// Second attempt: models are non-deterministic, retry once more
|
|
856
|
+
console.warn(`[rubricEvaluator] Fallback parse also failed (${retryParseError.message}), retrying once more...`);
|
|
857
|
+
retryText = await callJudgeModelWithConfig(prompt, fallbackConfig);
|
|
858
|
+
if (retryText && retryText.trim()) {
|
|
859
|
+
parsed = parseJudgeResponse(retryText);
|
|
860
|
+
} else {
|
|
861
|
+
throw retryParseError;
|
|
862
|
+
}
|
|
863
|
+
}
|
|
864
|
+
} else {
|
|
865
|
+
throw parseError;
|
|
866
|
+
}
|
|
867
|
+
} else {
|
|
868
|
+
throw parseError;
|
|
869
|
+
}
|
|
870
|
+
}
|
|
554
871
|
|
|
555
872
|
// Debug: log what was parsed
|
|
556
873
|
debugLog('[rubricEvaluator] Parsed keys:', Object.keys(parsed));
|
|
@@ -578,18 +895,16 @@ export async function evaluateSuggestion(suggestion, scenario, context = {}) {
|
|
|
578
895
|
|
|
579
896
|
for (const [key, value] of Object.entries(parsed.scores || {})) {
|
|
580
897
|
const normalizedKey = dimensionMap[key] || key;
|
|
581
|
-
// Handle both {score, reasoning
|
|
898
|
+
// Handle both {score, reasoning} objects and plain numbers
|
|
582
899
|
if (typeof value === 'object' && value !== null) {
|
|
583
900
|
scores[normalizedKey] = {
|
|
584
901
|
score: value.score,
|
|
585
902
|
reasoning: value.reasoning,
|
|
586
|
-
quote: value.quote || null,
|
|
587
903
|
};
|
|
588
904
|
} else if (typeof value === 'number') {
|
|
589
905
|
scores[normalizedKey] = {
|
|
590
906
|
score: value,
|
|
591
907
|
reasoning: null,
|
|
592
|
-
quote: null,
|
|
593
908
|
};
|
|
594
909
|
}
|
|
595
910
|
}
|
|
@@ -607,19 +922,25 @@ export async function evaluateSuggestion(suggestion, scenario, context = {}) {
|
|
|
607
922
|
success: true,
|
|
608
923
|
scores,
|
|
609
924
|
overallScore,
|
|
925
|
+
baseScore: calculateBaseScore(scores),
|
|
926
|
+
recognitionScore: calculateRecognitionScore(scores),
|
|
610
927
|
passesRequired: parsed.validation?.passes_required ?? true,
|
|
611
928
|
passesForbidden: parsed.validation?.passes_forbidden ?? true,
|
|
612
929
|
requiredMissing: parsed.validation?.required_missing || [],
|
|
613
930
|
forbiddenFound: parsed.validation?.forbidden_found || [],
|
|
614
931
|
summary: parsed.summary,
|
|
615
|
-
|
|
932
|
+
judgeModel: `${judge.provider}/${judge.model}`,
|
|
616
933
|
evaluationTimeMs: Date.now() - startTime,
|
|
617
934
|
};
|
|
618
935
|
} catch (error) {
|
|
619
936
|
return {
|
|
620
937
|
success: false,
|
|
938
|
+
scores: {},
|
|
939
|
+
overallScore: null,
|
|
940
|
+
baseScore: null,
|
|
941
|
+
recognitionScore: null,
|
|
621
942
|
error: error.message,
|
|
622
|
-
|
|
943
|
+
judgeModel: `${judge.provider}/${judge.model}`,
|
|
623
944
|
evaluationTimeMs: Date.now() - startTime,
|
|
624
945
|
};
|
|
625
946
|
}
|
|
@@ -628,18 +949,18 @@ export async function evaluateSuggestion(suggestion, scenario, context = {}) {
|
|
|
628
949
|
/**
|
|
629
950
|
* Evaluate multiple suggestions (batch)
|
|
630
951
|
*/
|
|
631
|
-
export async function evaluateSuggestions(suggestions, scenario, context = {}) {
|
|
952
|
+
export async function evaluateSuggestions(suggestions, scenario, context = {}, overrides = {}) {
|
|
632
953
|
const results = [];
|
|
633
954
|
|
|
634
955
|
for (const suggestion of suggestions) {
|
|
635
|
-
const result = await evaluateSuggestion(suggestion, scenario, context);
|
|
956
|
+
const result = await evaluateSuggestion(suggestion, scenario, context, overrides);
|
|
636
957
|
results.push(result);
|
|
637
958
|
}
|
|
638
959
|
|
|
639
960
|
// Aggregate scores if multiple suggestions
|
|
640
961
|
if (results.length > 0 && results[0].success) {
|
|
641
962
|
const avgScores = {};
|
|
642
|
-
const dimensions = ['relevance', 'specificity', 'pedagogical', 'personalization', 'actionability', 'tone'];
|
|
963
|
+
const dimensions = ['relevance', 'specificity', 'pedagogical', 'personalization', 'actionability', 'tone', 'productive_struggle', 'epistemic_honesty'];
|
|
643
964
|
|
|
644
965
|
for (const dim of dimensions) {
|
|
645
966
|
const scores = results
|
|
@@ -697,9 +1018,18 @@ export function quickValidate(suggestion, scenario) {
|
|
|
697
1018
|
passesForbidden: true,
|
|
698
1019
|
requiredMissing: [],
|
|
699
1020
|
forbiddenFound: [],
|
|
1021
|
+
// Transformation marker analysis (for multi-turn scenarios)
|
|
1022
|
+
transformationMarkersFound: [],
|
|
1023
|
+
staticMarkersFound: [],
|
|
1024
|
+
learnerGrowthMarkersFound: [],
|
|
1025
|
+
learnerStaticMarkersFound: [],
|
|
1026
|
+
transformationScore: null,
|
|
1027
|
+
learnerGrowthScore: null,
|
|
1028
|
+
bilateralTransformationScore: null,
|
|
700
1029
|
};
|
|
701
1030
|
|
|
702
1031
|
// Check required elements (can appear anywhere including actionTarget, reasoning)
|
|
1032
|
+
// ALL elements in requiredElements must be present
|
|
703
1033
|
for (const required of scenario.requiredElements || []) {
|
|
704
1034
|
const normalizedRequired = required.toLowerCase();
|
|
705
1035
|
const found = fullSuggestionText.includes(normalizedRequired) ||
|
|
@@ -713,6 +1043,23 @@ export function quickValidate(suggestion, scenario) {
|
|
|
713
1043
|
}
|
|
714
1044
|
}
|
|
715
1045
|
|
|
1046
|
+
// Check requiredElementsAny - ANY one of these must be present
|
|
1047
|
+
const anyElements = scenario.requiredElementsAny || [];
|
|
1048
|
+
if (anyElements.length > 0) {
|
|
1049
|
+
const anyFound = anyElements.some(required => {
|
|
1050
|
+
const normalizedRequired = required.toLowerCase();
|
|
1051
|
+
return fullSuggestionText.includes(normalizedRequired) ||
|
|
1052
|
+
(suggestion.actionTarget && suggestion.actionTarget.toLowerCase().includes(normalizedRequired)) ||
|
|
1053
|
+
(suggestion.title && suggestion.title.toLowerCase().includes(normalizedRequired)) ||
|
|
1054
|
+
(suggestion.message && suggestion.message.toLowerCase().includes(normalizedRequired));
|
|
1055
|
+
});
|
|
1056
|
+
|
|
1057
|
+
if (!anyFound) {
|
|
1058
|
+
result.passesRequired = false;
|
|
1059
|
+
result.requiredMissing.push(`one of: ${anyElements.join(', ')}`);
|
|
1060
|
+
}
|
|
1061
|
+
}
|
|
1062
|
+
|
|
716
1063
|
// Check forbidden elements (only in user-facing text: title, message)
|
|
717
1064
|
// The 'reasoning' field is internal and may legitimately reference context terms
|
|
718
1065
|
for (const forbidden of scenario.forbiddenElements || []) {
|
|
@@ -723,15 +1070,121 @@ export function quickValidate(suggestion, scenario) {
|
|
|
723
1070
|
}
|
|
724
1071
|
}
|
|
725
1072
|
|
|
1073
|
+
// Check transformation markers (for multi-turn scenarios)
|
|
1074
|
+
const markers = scenario.transformationMarkers || scenario.transformation_markers;
|
|
1075
|
+
if (markers) {
|
|
1076
|
+
// Tutor evolving markers (in tutor response)
|
|
1077
|
+
const tutorEvolving = markers.tutor_evolving || markers.tutorEvolving || [];
|
|
1078
|
+
for (const marker of tutorEvolving) {
|
|
1079
|
+
if (userFacingText.includes(marker.toLowerCase())) {
|
|
1080
|
+
result.transformationMarkersFound.push(marker);
|
|
1081
|
+
}
|
|
1082
|
+
}
|
|
1083
|
+
|
|
1084
|
+
// Tutor static markers (in tutor response)
|
|
1085
|
+
const tutorStatic = markers.tutor_static || markers.tutorStatic || [];
|
|
1086
|
+
for (const marker of tutorStatic) {
|
|
1087
|
+
if (userFacingText.includes(marker.toLowerCase())) {
|
|
1088
|
+
result.staticMarkersFound.push(marker);
|
|
1089
|
+
}
|
|
1090
|
+
}
|
|
1091
|
+
|
|
1092
|
+
// Calculate tutor transformation score
|
|
1093
|
+
const tutorEvolvingCount = result.transformationMarkersFound.length;
|
|
1094
|
+
const tutorStaticCount = result.staticMarkersFound.length;
|
|
1095
|
+
const tutorTotal = tutorEvolvingCount + tutorStaticCount;
|
|
1096
|
+
if (tutorTotal > 0) {
|
|
1097
|
+
result.transformationScore = tutorEvolvingCount / tutorTotal;
|
|
1098
|
+
}
|
|
1099
|
+
|
|
1100
|
+
// Learner growth markers (these will typically be found in context/history, not suggestion)
|
|
1101
|
+
// Included for completeness when analyzing full dialogue
|
|
1102
|
+
const learnerEvolving = markers.learner_evolving || markers.learnerEvolving || [];
|
|
1103
|
+
const learnerStatic = markers.learner_static || markers.learnerStatic || [];
|
|
1104
|
+
|
|
1105
|
+
// Store marker definitions for use by turn analysis
|
|
1106
|
+
result._markerDefinitions = {
|
|
1107
|
+
tutorEvolving,
|
|
1108
|
+
tutorStatic,
|
|
1109
|
+
learnerEvolving,
|
|
1110
|
+
learnerStatic,
|
|
1111
|
+
};
|
|
1112
|
+
}
|
|
1113
|
+
|
|
726
1114
|
return result;
|
|
727
1115
|
}
|
|
728
1116
|
|
|
1117
|
+
// Dimension groups for dual scoring.
// BASE_DIMENSIONS: the 8 core pedagogical dimensions used by calculateBaseScore
// (6 original dimensions plus productive_struggle and epistemic_honesty).
// RECOGNITION_DIMENSIONS: the 6 recognition-theoretic dimensions used by
// calculateRecognitionScore (4 original plus tutor_adaptation and learner_growth).
// Keys here are the *normalized* score keys (e.g. 'pedagogical', not the rubric's
// 'pedagogical_soundness' — see the keyMap in calculateBaseScore).
const BASE_DIMENSIONS = ['relevance', 'specificity', 'pedagogical', 'personalization', 'actionability', 'tone', 'productive_struggle', 'epistemic_honesty'];
const RECOGNITION_DIMENSIONS = ['mutual_recognition', 'dialectical_responsiveness', 'memory_integration', 'transformative_potential', 'tutor_adaptation', 'learner_growth'];
|
|
1120
|
+
|
|
1121
|
+
/**
 * Calculate the base score from the core pedagogical dimensions
 * (the 8 entries of BASE_DIMENSIONS, including productive_struggle and
 * epistemic_honesty — the doc previously said "6", which was stale).
 * Weights come from the rubric config and are re-normalized so they sum
 * to 1.0 across only the base dimensions that actually carry a numeric score.
 *
 * @param {Object} scores - Scores object from evaluation; each value may be a
 *   plain number (1-5) or a {score, reasoning} object.
 * @returns {number} 0-100 score; 0 when no base dimension has a numeric score.
 */
export function calculateBaseScore(scores) {
  const dimensions = evalConfigLoader.getRubricDimensions();
  // Rubric config uses 'pedagogical_soundness'; normalized scores use 'pedagogical'.
  const keyMap = { pedagogical_soundness: 'pedagogical' };

  let weightedSum = 0;
  let totalWeight = 0;

  for (const [key, dim] of Object.entries(dimensions)) {
    const normalizedKey = keyMap[key] || key;
    if (!BASE_DIMENSIONS.includes(normalizedKey)) continue;

    // Accept scores keyed by either the normalized or the raw rubric key.
    const scoreData = scores[normalizedKey] || scores[key];
    // Unwrap {score, reasoning} objects; plain numbers pass through.
    const score = scoreData?.score ?? scoreData;

    if (typeof score === 'number') {
      weightedSum += score * (dim.weight || 0);
      totalWeight += dim.weight || 0;
    }
  }

  // No scored base dimensions (or all weights zero): avoid division by zero.
  if (totalWeight === 0) return 0;
  const avgScore = weightedSum / totalWeight;
  // Map the 1-5 rubric scale linearly onto 0-100.
  return ((avgScore - 1) / 4) * 100;
}
|
|
1152
|
+
|
|
1153
|
+
/**
 * Calculate the recognition score from the recognition-theoretic dimensions
 * (the 6 entries of RECOGNITION_DIMENSIONS, including tutor_adaptation and
 * learner_growth — the doc previously said "4", which was stale).
 * Weights come from the rubric config and are re-normalized so they sum
 * to 1.0 across only the recognition dimensions that carry a numeric score.
 *
 * @param {Object} scores - Scores object from evaluation; each value may be a
 *   plain number (1-5) or a {score, reasoning} object.
 * @returns {number} 0-100 score; 0 when no recognition dimension has a numeric score.
 */
export function calculateRecognitionScore(scores) {
  const dimensions = evalConfigLoader.getRubricDimensions();

  let weightedSum = 0;
  let totalWeight = 0;

  for (const [key, dim] of Object.entries(dimensions)) {
    // Recognition dimension keys need no normalization (unlike pedagogical_soundness).
    if (!RECOGNITION_DIMENSIONS.includes(key)) continue;

    const scoreData = scores[key];
    // Unwrap {score, reasoning} objects; plain numbers pass through.
    const score = scoreData?.score ?? scoreData;

    if (typeof score === 'number') {
      weightedSum += score * (dim.weight || 0);
      totalWeight += dim.weight || 0;
    }
  }

  // No scored recognition dimensions (or all weights zero): avoid division by zero.
  if (totalWeight === 0) return 0;
  const avgScore = weightedSum / totalWeight;
  // Map the 1-5 rubric scale linearly onto 0-100.
  return ((avgScore - 1) / 4) * 100;
}
|
|
1182
|
+
|
|
729
1183
|
/**
|
|
730
1184
|
* Calculate weighted overall score from dimension scores
|
|
731
1185
|
*/
|
|
732
1186
|
export function calculateOverallScore(scores) {
|
|
733
|
-
const
|
|
734
|
-
const dimensions = rubric?.dimensions || {};
|
|
1187
|
+
const dimensions = evalConfigLoader.getRubricDimensions();
|
|
735
1188
|
|
|
736
1189
|
// Map rubric keys to normalized score keys (pedagogical_soundness -> pedagogical)
|
|
737
1190
|
const keyMap = {
|
|
@@ -773,6 +1226,8 @@ export function calculateRecognitionMetrics(scores) {
|
|
|
773
1226
|
'dialectical_responsiveness',
|
|
774
1227
|
'memory_integration',
|
|
775
1228
|
'transformative_potential',
|
|
1229
|
+
'tutor_adaptation',
|
|
1230
|
+
'learner_growth',
|
|
776
1231
|
];
|
|
777
1232
|
|
|
778
1233
|
const metrics = {
|
|
@@ -780,6 +1235,9 @@ export function calculateRecognitionMetrics(scores) {
|
|
|
780
1235
|
transformationRate: false,
|
|
781
1236
|
memoryUtilization: false,
|
|
782
1237
|
mutualAcknowledgment: false,
|
|
1238
|
+
tutorAdaptation: false,
|
|
1239
|
+
learnerGrowth: false,
|
|
1240
|
+
bilateralTransformation: false,
|
|
783
1241
|
dimensionScores: {},
|
|
784
1242
|
hasRecognitionData: false,
|
|
785
1243
|
};
|
|
@@ -806,9 +1264,18 @@ export function calculateRecognitionMetrics(scores) {
|
|
|
806
1264
|
if (dim === 'mutual_recognition' && score >= 4) {
|
|
807
1265
|
metrics.mutualAcknowledgment = true;
|
|
808
1266
|
}
|
|
1267
|
+
if (dim === 'tutor_adaptation' && score >= 4) {
|
|
1268
|
+
metrics.tutorAdaptation = true;
|
|
1269
|
+
}
|
|
1270
|
+
if (dim === 'learner_growth' && score >= 4) {
|
|
1271
|
+
metrics.learnerGrowth = true;
|
|
1272
|
+
}
|
|
809
1273
|
}
|
|
810
1274
|
}
|
|
811
1275
|
|
|
1276
|
+
// Bilateral transformation: both tutor and learner show adaptation
|
|
1277
|
+
metrics.bilateralTransformation = metrics.tutorAdaptation && metrics.learnerGrowth;
|
|
1278
|
+
|
|
812
1279
|
if (scoredCount > 0) {
|
|
813
1280
|
metrics.recognitionScore = totalScore / scoredCount;
|
|
814
1281
|
metrics.hasRecognitionData = true;
|
|
@@ -817,10 +1284,16 @@ export function calculateRecognitionMetrics(scores) {
|
|
|
817
1284
|
return metrics;
|
|
818
1285
|
}
|
|
819
1286
|
|
|
1287
|
+
// Named export so tests/CLI tools can build the judge prompt without evaluating.
export { buildEvaluationPrompt };

// Default export: the public surface of this evaluator module.
export default {
  evaluateSuggestion,
  evaluateSuggestions,
  quickValidate,
  calculateOverallScore,
  calculateBaseScore,
  calculateRecognitionScore,
  calculateRecognitionMetrics,
  getAvailableJudge,
  buildEvaluationPrompt,
};
|