npm - @machinespirits/eval - Versions diffs - 0.1.2 → 0.2.1 - Mend

@machinespirits/eval 0.1.2 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (102) hide show

package/LICENSE +21 -0
package/README.md +161 -0
package/config/eval-settings.yaml +18 -0
package/config/evaluation-rubric-learner.yaml +277 -0
package/config/evaluation-rubric.yaml +613 -0
package/config/interaction-eval-scenarios.yaml +93 -50
package/config/learner-agents.yaml +124 -193
package/config/machinespirits-eval.code-workspace +11 -0
package/config/providers.yaml +60 -0
package/config/suggestion-scenarios.yaml +1399 -0
package/config/tutor-agents.yaml +716 -0
package/docs/EVALUATION-VARIABLES.md +589 -0
package/docs/REPLICATION-PLAN.md +577 -0
package/index.js +15 -6
package/package.json +16 -22
package/routes/evalRoutes.js +88 -36
package/scripts/analyze-judge-reliability.js +401 -0
package/scripts/analyze-run.js +97 -0
package/scripts/analyze-run.mjs +282 -0
package/scripts/analyze-validation-failures.js +141 -0
package/scripts/check-run.mjs +17 -0
package/scripts/code-impasse-strategies.js +1132 -0
package/scripts/compare-runs.js +44 -0
package/scripts/compare-suggestions.js +80 -0
package/scripts/compare-transformation.js +116 -0
package/scripts/dig-into-run.js +158 -0
package/scripts/eval-cli.js +2626 -0
package/scripts/generate-paper-figures.py +452 -0
package/scripts/qualitative-analysis-ai.js +1313 -0
package/scripts/qualitative-analysis.js +688 -0
package/scripts/seed-db.js +87 -0
package/scripts/show-failed-suggestions.js +64 -0
package/scripts/validate-content.js +192 -0
package/server.js +3 -2
package/services/__tests__/evalConfigLoader.test.js +338 -0
package/services/anovaStats.js +499 -0
package/services/contentResolver.js +407 -0
package/services/dialogueTraceAnalyzer.js +454 -0
package/services/evalConfigLoader.js +625 -0
package/services/evaluationRunner.js +2171 -270
package/services/evaluationStore.js +564 -29
package/services/learnerConfigLoader.js +75 -5
package/services/learnerRubricEvaluator.js +284 -0
package/services/learnerTutorInteractionEngine.js +375 -0
package/services/processUtils.js +18 -0
package/services/progressLogger.js +98 -0
package/services/promptRecommendationService.js +31 -26
package/services/promptRewriter.js +427 -0
package/services/rubricEvaluator.js +543 -70
package/services/streamingReporter.js +104 -0
package/services/turnComparisonAnalyzer.js +494 -0
package/components/MobileEvalDashboard.tsx +0 -267
package/components/comparison/DeltaAnalysisTable.tsx +0 -137
package/components/comparison/ProfileComparisonCard.tsx +0 -176
package/components/comparison/RecognitionABMode.tsx +0 -385
package/components/comparison/RecognitionMetricsPanel.tsx +0 -135
package/components/comparison/WinnerIndicator.tsx +0 -64
package/components/comparison/index.ts +0 -5
package/components/mobile/BottomSheet.tsx +0 -233
package/components/mobile/DimensionBreakdown.tsx +0 -210
package/components/mobile/DocsView.tsx +0 -363
package/components/mobile/LogsView.tsx +0 -481
package/components/mobile/PsychodynamicQuadrant.tsx +0 -261
package/components/mobile/QuickTestView.tsx +0 -1098
package/components/mobile/RecognitionTypeChart.tsx +0 -124
package/components/mobile/RecognitionView.tsx +0 -809
package/components/mobile/RunDetailView.tsx +0 -261
package/components/mobile/RunHistoryView.tsx +0 -367
package/components/mobile/ScoreRadial.tsx +0 -211
package/components/mobile/StreamingLogPanel.tsx +0 -230
package/components/mobile/SynthesisStrategyChart.tsx +0 -140
package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +0 -52
package/docs/research/ABLATION-MODEL-SELECTION.md +0 -53
package/docs/research/ADVANCED-EVAL-ANALYSIS.md +0 -60
package/docs/research/ANOVA-RESULTS-2026-01-14.md +0 -257
package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +0 -586
package/docs/research/COST-ANALYSIS.md +0 -56
package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +0 -340
package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +0 -291
package/docs/research/EVAL-SYSTEM-ANALYSIS.md +0 -306
package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +0 -301
package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +0 -1988
package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +0 -282
package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +0 -147
package/docs/research/PAPER-EXTENSION-DYADIC.md +0 -204
package/docs/research/PAPER-UNIFIED.md +0 -659
package/docs/research/PAPER-UNIFIED.pdf +0 -0
package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +0 -356
package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +0 -419
package/docs/research/apa.csl +0 -2133
package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +0 -1637
package/docs/research/archive/paper-multiagent-tutor.tex +0 -978
package/docs/research/paper-draft/full-paper.md +0 -136
package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
package/docs/research/paper-draft/references.bib +0 -515
package/docs/research/transcript-baseline.md +0 -139
package/docs/research/transcript-recognition-multiagent.md +0 -187
package/hooks/useEvalData.ts +0 -625
package/server-init.js +0 -45
package/services/benchmarkService.js +0 -1892
package/types.ts +0 -165
package/utils/haptics.ts +0 -45

package/services/evaluationStore.js CHANGED Viewed

@@ -9,19 +9,30 @@ import Database from 'better-sqlite3';
 import path from 'path';
 import { fileURLToPath } from 'url';
 import { randomBytes } from 'crypto';
+import { isPidAlive } from './processUtils.js';
 const __filename = fileURLToPath(import.meta.url);
 const __dirname = path.dirname(__filename);
 const ROOT_DIR = path.resolve(__dirname, '..');
 const DATA_DIR = path.join(ROOT_DIR, 'data');
-// Initialize database
-const dbPath = path.join(DATA_DIR, 'evaluations.db');
+// Initialize database — override with EVAL_DB_PATH env var for test isolation
+const dbPath = process.env.EVAL_DB_PATH || path.join(DATA_DIR, 'evaluations.db');
 const db = new Database(dbPath);
 // Enable WAL mode for better concurrent access
 db.pragma('journal_mode = WAL');
+// Migrate: rename evaluator_model → judge_model if the old column exists
+try {
+  const cols = db.prepare('PRAGMA table_info(evaluation_results)').all().map(c => c.name);
+  if (cols.includes('evaluator_model') && !cols.includes('judge_model')) {
+    db.exec('ALTER TABLE evaluation_results RENAME COLUMN evaluator_model TO judge_model');
+  }
+} catch (e) {
+  // Table may not exist yet (first run)
+}
 // Create tables
 db.exec(`
   -- Evaluation runs (batches of tests)
@@ -81,7 +92,7 @@ db.exec(`
     -- Metadata
     created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
-    evaluator_model TEXT,
+    judge_model TEXT,
     evaluation_reasoning TEXT,
     success BOOLEAN DEFAULT 1,
     error_message TEXT
@@ -103,6 +114,13 @@ try {
 }
 db.exec(`CREATE INDEX IF NOT EXISTS idx_results_dialogue ON evaluation_results(dialogue_id)`);
+// Migration: Add scenario_type column if it doesn't exist
+try {
+  db.exec(`ALTER TABLE evaluation_results ADD COLUMN scenario_type TEXT DEFAULT 'suggestion'`);
+} catch (e) {
+  // Column already exists, ignore
+}
 // Migration: Add scores_with_reasoning column if it doesn't exist
 try {
   db.exec(`ALTER TABLE evaluation_results ADD COLUMN scores_with_reasoning TEXT`);
@@ -117,6 +135,66 @@ try {
   // Column already exists, ignore
 }
+// Migration: Add dual scoring columns if they don't exist
+try {
+  db.exec(`ALTER TABLE evaluation_results ADD COLUMN base_score REAL`);
+} catch (e) {
+  // Column already exists, ignore
+}
+try {
+  db.exec(`ALTER TABLE evaluation_results ADD COLUMN recognition_score REAL`);
+} catch (e) {
+  // Column already exists, ignore
+}
+// Migration: Add ego_model and superego_model columns
+try {
+  db.exec(`ALTER TABLE evaluation_results ADD COLUMN ego_model TEXT`);
+} catch (e) {
+  // Column already exists, ignore
+}
+try {
+  db.exec(`ALTER TABLE evaluation_results ADD COLUMN superego_model TEXT`);
+} catch (e) {
+  // Column already exists, ignore
+}
+// Migration: Add factorial factor columns
+try {
+  db.exec(`ALTER TABLE evaluation_results ADD COLUMN factor_recognition BOOLEAN`);
+} catch (e) { /* Column already exists */ }
+try {
+  db.exec(`ALTER TABLE evaluation_results ADD COLUMN factor_multi_agent_tutor BOOLEAN`);
+} catch (e) { /* Column already exists */ }
+try {
+  db.exec(`ALTER TABLE evaluation_results ADD COLUMN factor_multi_agent_learner BOOLEAN`);
+} catch (e) { /* Column already exists */ }
+try {
+  db.exec(`ALTER TABLE evaluation_results ADD COLUMN learner_architecture TEXT`);
+} catch (e) { /* Column already exists */ }
+try {
+  db.exec(`ALTER TABLE evaluation_results ADD COLUMN scoring_method TEXT`);
+} catch (e) { /* Column already exists */ }
+// Migration: Add learner-side evaluation columns to evaluation_results
+try {
+  db.exec(`ALTER TABLE evaluation_results ADD COLUMN learner_scores TEXT`);
+} catch (e) { /* Column already exists */ }
+try {
+  db.exec(`ALTER TABLE evaluation_results ADD COLUMN learner_overall_score REAL`);
+} catch (e) { /* Column already exists */ }
+try {
+  db.exec(`ALTER TABLE evaluation_results ADD COLUMN learner_judge_model TEXT`);
+} catch (e) { /* Column already exists */ }
+// Migration: Add reproducibility metadata columns to evaluation_runs
+try {
+  db.exec(`ALTER TABLE evaluation_runs ADD COLUMN git_commit TEXT`);
+} catch (e) { /* Column already exists */ }
+try {
+  db.exec(`ALTER TABLE evaluation_runs ADD COLUMN package_version TEXT`);
+} catch (e) { /* Column already exists */ }
 // Migration: Revert any accidental renames (batch→matrix, interact→interaction)
 try {
   const revertRuns = db.prepare(`
@@ -181,6 +259,17 @@ db.exec(`
   CREATE INDEX IF NOT EXISTS idx_interaction_created ON interaction_evaluations(created_at);
 `);
+// Migration: Add learner-side evaluation columns to interaction_evaluations
+try {
+  db.exec(`ALTER TABLE interaction_evaluations ADD COLUMN learner_scores TEXT`);
+} catch (e) { /* Column already exists */ }
+try {
+  db.exec(`ALTER TABLE interaction_evaluations ADD COLUMN learner_overall_score REAL`);
+} catch (e) { /* Column already exists */ }
+try {
+  db.exec(`ALTER TABLE interaction_evaluations ADD COLUMN learner_judge_model TEXT`);
+} catch (e) { /* Column already exists */ }
 /**
  * Generate a unique run ID
  */
@@ -205,13 +294,14 @@ export function createRun(options = {}) {
   } = options;
   const id = generateRunId();
+  const now = new Date().toISOString();
   const stmt = db.prepare(`
-    INSERT INTO evaluation_runs (id, description, total_scenarios, total_configurations, metadata)
-    VALUES (?, ?, ?, ?, ?)
+    INSERT INTO evaluation_runs (id, created_at, description, total_scenarios, total_configurations, metadata, git_commit, package_version)
+    VALUES (?, ?, ?, ?, ?, ?, ?, ?)
   `);
-  stmt.run(id, description, totalScenarios, totalConfigurations, JSON.stringify(metadata));
+  stmt.run(id, now, description, totalScenarios, totalConfigurations, JSON.stringify(metadata), metadata.gitCommit || null, metadata.packageVersion || null);
   return {
     id,
@@ -227,16 +317,29 @@ export function createRun(options = {}) {
  * Update a run's status
  */
 export function updateRun(runId, updates) {
-  const { status, totalTests, completedAt } = updates;
+  const { status, totalTests, completedAt, metadata } = updates;
+  // If metadata provided, merge with existing
+  if (metadata) {
+    const existing = getRun(runId);
+    const mergedMetadata = { ...(existing?.metadata || {}), ...metadata };
+    const stmt = db.prepare(`UPDATE evaluation_runs SET metadata = ? WHERE id = ?`);
+    stmt.run(JSON.stringify(mergedMetadata), runId);
+  }
   if (status === 'completed') {
     const stmt = db.prepare(`
       UPDATE evaluation_runs
-      SET status = ?, total_tests = ?, completed_at = ?
+      SET status = ?, completed_at = ?
       WHERE id = ?
     `);
-    stmt.run(status, totalTests || 0, completedAt || new Date().toISOString(), runId);
-  } else {
+    stmt.run(status, completedAt || new Date().toISOString(), runId);
+  } else if (status && totalTests != null) {
+    const stmt = db.prepare(`
+      UPDATE evaluation_runs SET status = ?, total_tests = ? WHERE id = ?
+    `);
+    stmt.run(status, totalTests, runId);
+  } else if (status) {
     const stmt = db.prepare(`
       UPDATE evaluation_runs SET status = ? WHERE id = ?
     `);
@@ -254,23 +357,33 @@ export function updateRun(runId, updates) {
 export function storeResult(runId, result) {
   const stmt = db.prepare(`
     INSERT INTO evaluation_results (
-      run_id, scenario_id, scenario_name,
+      run_id, scenario_id, scenario_name, scenario_type,
       provider, model, profile_name, hyperparameters, prompt_id,
+      ego_model, superego_model,
       suggestions, raw_response,
       latency_ms, input_tokens, output_tokens, cost, dialogue_rounds, api_calls, dialogue_id,
       score_relevance, score_specificity, score_pedagogical,
       score_personalization, score_actionability, score_tone, overall_score,
+      base_score, recognition_score,
       passes_required, passes_forbidden, required_missing, forbidden_found,
-      evaluator_model, evaluation_reasoning, scores_with_reasoning, success, error_message
+      judge_model, evaluation_reasoning, scores_with_reasoning, success, error_message,
+      factor_recognition, factor_multi_agent_tutor, factor_multi_agent_learner, learner_architecture,
+      scoring_method,
+      created_at
     ) VALUES (
-      ?, ?, ?,
+      ?, ?, ?, ?,
       ?, ?, ?, ?, ?,
       ?, ?,
+      ?, ?,
       ?, ?, ?, ?, ?, ?, ?,
       ?, ?, ?,
       ?, ?, ?, ?,
+      ?, ?,
+      ?, ?, ?, ?,
+      ?, ?, ?, ?, ?,
       ?, ?, ?, ?,
-      ?, ?, ?, ?, ?
+      ?,
+      ?
     )
   `);
@@ -278,11 +391,14 @@ export function storeResult(runId, result) {
     runId,
     result.scenarioId,
     result.scenarioName,
+    result.scenarioType || 'suggestion',
     result.provider,
     result.model,
     result.profileName,
     JSON.stringify(result.hyperparameters || {}),
     result.promptId,
+    result.egoModel || null,
+    result.superegoModel || null,
     JSON.stringify(result.suggestions || []),
     result.rawResponse,
     result.latencyMs,
@@ -299,15 +415,23 @@ export function storeResult(runId, result) {
     result.scores?.actionability,
     result.scores?.tone,
     result.overallScore,
+    result.baseScore,
+    result.recognitionScore,
     result.passesRequired ? 1 : 0,
     result.passesForbidden ? 1 : 0,
     JSON.stringify(result.requiredMissing || []),
     JSON.stringify(result.forbiddenFound || []),
-    result.evaluatorModel,
+    result.judgeModel,
     result.evaluationReasoning,
     result.scoresWithReasoning ? JSON.stringify(result.scoresWithReasoning) : null,
     result.success ? 1 : 0,
-    result.errorMessage
+    result.errorMessage,
+    result.factors?.recognition != null ? (result.factors.recognition ? 1 : 0) : null,
+    result.factors?.multi_agent_tutor != null ? (result.factors.multi_agent_tutor ? 1 : 0) : null,
+    result.factors?.multi_agent_learner != null ? (result.factors.multi_agent_learner ? 1 : 0) : null,
+    result.learnerArchitecture || null,
+    result.scoringMethod || null,
+    new Date().toISOString()
   );
   return info.lastInsertRowid;
@@ -331,6 +455,8 @@ export function getRun(runId) {
     status: row.status,
     completedAt: row.completed_at,
     metadata: JSON.parse(row.metadata || '{}'),
+    gitCommit: row.git_commit,
+    packageVersion: row.package_version,
   };
 }
@@ -338,7 +464,7 @@ export function getRun(runId) {
  * List all runs with scenario names
  */
 export function listRuns(options = {}) {
-  const { limit = 20, status = null } = options;
+  const { limit = null, status = null } = options;
   let query = 'SELECT * FROM evaluation_runs';
   const params = [];
@@ -348,8 +474,11 @@ export function listRuns(options = {}) {
     params.push(status);
   }
-  query += ' ORDER BY created_at DESC LIMIT ?';
-  params.push(limit);
+  query += ' ORDER BY created_at ASC';
+  if (limit) {
+    query += ' LIMIT ?';
+    params.push(limit);
+  }
   const stmt = db.prepare(query);
   const rows = stmt.all(...params);
@@ -361,9 +490,59 @@ export function listRuns(options = {}) {
     ORDER BY scenario_name
   `);
+  // Count completed results per run
+  const resultCountStmt = db.prepare(`
+    SELECT COUNT(*) as completed,
+           SUM(CASE WHEN success = 1 THEN 1 ELSE 0 END) as successful,
+           AVG(overall_score) as avg_score
+    FROM evaluation_results WHERE run_id = ?
+  `);
+  // Get distinct ego + superego models for each run
+  const modelStmt = db.prepare(`
+    SELECT DISTINCT ego_model FROM evaluation_results
+    WHERE run_id = ? AND ego_model IS NOT NULL
+    ORDER BY ego_model
+  `);
+  const superegoModelStmt = db.prepare(`
+    SELECT DISTINCT superego_model FROM evaluation_results
+    WHERE run_id = ? AND superego_model IS NOT NULL
+    ORDER BY superego_model
+  `);
   return rows.map(row => {
     const scenarioRows = scenarioStmt.all(row.id);
     const scenarioNames = scenarioRows.map(s => s.scenario_name).filter(Boolean);
+    const counts = resultCountStmt.get(row.id);
+    const extractAlias = (raw) => {
+      if (!raw) return null;
+      const dotIdx = raw.indexOf('.');
+      return dotIdx !== -1 ? raw.slice(dotIdx + 1) : raw;
+    };
+    const modelRows = modelStmt.all(row.id);
+    const superegoRows = superegoModelStmt.all(row.id);
+    const models = [...new Set([
+      ...modelRows.map(m => extractAlias(m.ego_model)),
+      ...superegoRows.map(m => extractAlias(m.superego_model)),
+    ].filter(Boolean))];
+    const completedResults = counts?.completed || 0;
+    const totalTests = row.total_tests || 0;
+    const progressPct = totalTests > 0 ? Math.min(100, Math.round((completedResults / totalTests) * 100)) : null;
+    // Compute duration: for completed runs use completed_at - created_at;
+    // for running runs compute elapsed from now.
+    let durationMs = null;
+    if (row.created_at) {
+      const start = new Date(row.created_at).getTime();
+      if (row.completed_at) {
+        durationMs = new Date(row.completed_at).getTime() - start;
+      } else if (row.status === 'running') {
+        durationMs = Date.now() - start;
+      }
+    }
     return {
       id: row.id,
@@ -371,10 +550,16 @@ export function listRuns(options = {}) {
       description: row.description,
       totalScenarios: row.total_scenarios,
       totalConfigurations: row.total_configurations,
-      totalTests: row.total_tests,
+      totalTests,
+      completedResults,
+      successfulResults: counts?.successful || 0,
+      avgScore: counts?.avg_score || null,
+      progressPct,
+      durationMs,
       status: row.status,
       completedAt: row.completed_at,
       scenarioNames, // Scenario names from results
+      models, // Distinct ego and superego model aliases used
       metadata: JSON.parse(row.metadata || '{}'), // Structured metadata
     };
   });
@@ -384,7 +569,7 @@ export function listRuns(options = {}) {
  * Get results for a run
  */
 export function getResults(runId, options = {}) {
-  const { scenarioId = null, provider = null, model = null } = options;
+  const { scenarioId = null, provider = null, model = null, profileName = null } = options;
   let query = 'SELECT * FROM evaluation_results WHERE run_id = ?';
   const params = [runId];
@@ -404,6 +589,11 @@ export function getResults(runId, options = {}) {
     params.push(model);
   }
+  if (profileName) {
+    query += ' AND profile_name = ?';
+    params.push(profileName);
+  }
   query += ' ORDER BY created_at';
   const stmt = db.prepare(query);
@@ -420,6 +610,9 @@ export function getRunStats(runId) {
     SELECT
       provider,
       model,
+      profile_name,
+      ego_model,
+      superego_model,
       COUNT(*) as total_tests,
       SUM(CASE WHEN success = 1 THEN 1 ELSE 0 END) as successful_tests,
       AVG(overall_score) as avg_score,
@@ -429,6 +622,8 @@ export function getRunStats(runId) {
       AVG(score_personalization) as avg_personalization,
       AVG(score_actionability) as avg_actionability,
       AVG(score_tone) as avg_tone,
+      AVG(base_score) as avg_base_score,
+      AVG(recognition_score) as avg_recognition_score,
       AVG(latency_ms) as avg_latency,
       SUM(input_tokens) as total_input_tokens,
       SUM(output_tokens) as total_output_tokens,
@@ -436,7 +631,7 @@ export function getRunStats(runId) {
       SUM(CASE WHEN passes_forbidden = 1 THEN 1 ELSE 0 END) as passes_forbidden
     FROM evaluation_results
     WHERE run_id = ?
-    GROUP BY provider, model
+    GROUP BY provider, model, profile_name
     ORDER BY avg_score DESC
   `);
@@ -445,10 +640,15 @@ export function getRunStats(runId) {
   return rows.map(row => ({
     provider: row.provider,
     model: row.model,
+    profileName: row.profile_name,
+    egoModel: row.ego_model,
+    superegoModel: row.superego_model,
     totalTests: row.total_tests,
     successfulTests: row.successful_tests,
     successRate: row.total_tests > 0 ? row.successful_tests / row.total_tests : 0,
     avgScore: row.avg_score,
+    avgBaseScore: row.avg_base_score,
+    avgRecognitionScore: row.avg_recognition_score,
     dimensions: {
       relevance: row.avg_relevance,
       specificity: row.avg_specificity,
@@ -478,13 +678,18 @@ export function getScenarioStats(runId) {
       scenario_name,
       provider,
       model,
+      profile_name,
+      ego_model,
+      superego_model,
       AVG(overall_score) as avg_score,
+      AVG(base_score) as avg_base_score,
+      AVG(recognition_score) as avg_recognition_score,
       AVG(latency_ms) as avg_latency,
       SUM(CASE WHEN passes_required = 1 AND passes_forbidden = 1 THEN 1 ELSE 0 END) as passes_validation,
       COUNT(*) as runs
     FROM evaluation_results
     WHERE run_id = ?
-    GROUP BY scenario_id, provider, model
+    GROUP BY scenario_id, provider, model, profile_name
     ORDER BY scenario_id, avg_score DESC
   `);
@@ -503,7 +708,12 @@ export function getScenarioStats(runId) {
     grouped[row.scenario_id].configurations.push({
       provider: row.provider,
       model: row.model,
+      profileName: row.profile_name,
+      egoModel: row.ego_model,
+      superegoModel: row.superego_model,
       avgScore: row.avg_score,
+      avgBaseScore: row.avg_base_score,
+      avgRecognitionScore: row.avg_recognition_score,
       avgLatencyMs: row.avg_latency,
       passesValidation: row.passes_validation === row.runs,
       runs: row.runs,
@@ -623,7 +833,16 @@ export function exportToCsv(runId) {
     r.success ? 1 : 0,
   ]);
-  return [headers.join(','), ...rows.map(row => row.join(','))].join('\n');
+  const escapeCsvField = (value) => {
+    if (value == null) return '';
+    const str = String(value);
+    if (str.includes(',') || str.includes('"') || str.includes('\n')) {
+      return '"' + str.replace(/"/g, '""') + '"';
+    }
+    return str;
+  };
+  return [headers.join(','), ...rows.map(row => row.map(escapeCsvField).join(','))].join('\n');
 }
 /**
@@ -734,6 +953,8 @@ export function findIncompleteRuns(options = {}) {
   return rows.map(row => {
     const resultsStmt = db.prepare('SELECT COUNT(*) as count FROM evaluation_results WHERE run_id = ?');
     const resultsCount = resultsStmt.get(row.id).count;
+    const metadata = JSON.parse(row.metadata || '{}');
+    const pid = metadata?.pid;
     return {
       id: row.id,
@@ -744,7 +965,9 @@ export function findIncompleteRuns(options = {}) {
       expectedTests: row.total_scenarios * row.total_configurations,
       resultsFound: resultsCount,
       ageMinutes: Math.round((Date.now() - new Date(row.created_at).getTime()) / 60000),
-      metadata: JSON.parse(row.metadata || '{}'),
+      metadata,
+      pid,
+      pidAlive: isPidAlive(pid),
     };
   });
 }
@@ -762,16 +985,28 @@ export function autoCompleteStaleRuns(options = {}) {
   const incompleteRuns = findIncompleteRuns({ olderThanMinutes });
+  // Filter out runs whose PID is still alive
+  const staleRuns = incompleteRuns.filter(run => {
+    const pid = run.metadata?.pid;
+    const isAlive = isPidAlive(pid);
+    if (isAlive) {
+      console.log(`  Skipping ${run.id}: pid ${pid} still running`);
+    }
+    return !isAlive;
+  });
   if (dryRun) {
     return {
       dryRun: true,
       found: incompleteRuns.length,
-      runs: incompleteRuns,
+      stale: staleRuns.length,
+      skippedAlive: incompleteRuns.length - staleRuns.length,
+      runs: staleRuns,
     };
   }
   const completed = [];
-  for (const run of incompleteRuns) {
+  for (const run of staleRuns) {
     try {
       const result = completeRun(run.id);
       completed.push(result);
@@ -786,6 +1021,8 @@ export function autoCompleteStaleRuns(options = {}) {
   return {
     found: incompleteRuns.length,
+    stale: staleRuns.length,
+    skippedAlive: incompleteRuns.length - staleRuns.length,
     completed: completed.length,
     runs: completed,
   };
@@ -827,8 +1064,9 @@ export function getIncompleteTests(runId, profiles, scenarios) {
   const results = getResults(runId);
   const completedSet = new Set();
-  // Build set of completed (profile, scenarioId) pairs
+  // Build set of completed (profile, scenarioId) pairs — only count successes
   for (const result of results) {
+    if (result.success === false || result.success === 0) continue;
     const key = `${result.profileName}:${result.scenarioId}`;
     completedSet.add(key);
   }
@@ -899,9 +1137,12 @@ function parseResultRow(row) {
     runId: row.run_id,
     scenarioId: row.scenario_id,
     scenarioName: row.scenario_name,
+    scenarioType: row.scenario_type || 'suggestion',
     provider: row.provider,
     model: row.model,
     profileName: row.profile_name,
+    egoModel: row.ego_model,
+    superegoModel: row.superego_model,
     hyperparameters: JSON.parse(row.hyperparameters || '{}'),
     promptId: row.prompt_id,
     suggestions: JSON.parse(row.suggestions || '[]'),
@@ -914,15 +1155,29 @@ function parseResultRow(row) {
     dialogueId: row.dialogue_id,
     scores,
     overallScore: row.overall_score,
+    scoringMethod: row.scoring_method || null,
+    baseScore: row.base_score,
+    recognitionScore: row.recognition_score,
     passesRequired: Boolean(row.passes_required),
     passesForbidden: Boolean(row.passes_forbidden),
     requiredMissing: JSON.parse(row.required_missing || '[]'),
     forbiddenFound: JSON.parse(row.forbidden_found || '[]'),
-    evaluatorModel: row.evaluator_model,
+    judgeModel: row.judge_model,
     evaluationReasoning: row.evaluation_reasoning,
     success: Boolean(row.success),
     errorMessage: row.error_message,
     createdAt: row.created_at,
+    factors: (row.factor_recognition != null || row.factor_multi_agent_tutor != null || row.factor_multi_agent_learner != null)
+      ? {
+          recognition: Boolean(row.factor_recognition),
+          multi_agent_tutor: Boolean(row.factor_multi_agent_tutor),
+          multi_agent_learner: Boolean(row.factor_multi_agent_learner),
+        }
+      : null,
+    learnerArchitecture: row.learner_architecture || null,
+    learnerScores: row.learner_scores ? JSON.parse(row.learner_scores) : null,
+    learnerOverallScore: row.learner_overall_score != null ? row.learner_overall_score : null,
+    learnerJudgeModel: row.learner_judge_model || null,
   };
 }
@@ -1052,6 +1307,9 @@ export function getInteractionEval(evalId) {
     uniqueOutcomes: JSON.parse(row.unique_outcomes || '[]'),
     judgeOverallScore: row.judge_overall_score,
     judgeEvaluation: JSON.parse(row.judge_evaluation || 'null'),
+    learnerScores: JSON.parse(row.learner_scores || 'null'),
+    learnerOverallScore: row.learner_overall_score,
+    learnerJudgeModel: row.learner_judge_model,
     createdAt: row.created_at,
   };
 }
@@ -1096,10 +1354,284 @@ export function getInteractionEvalByRunId(runId) {
   };
 }
+/**
+ * Collect per-cell score samples of one run for a factorial (ANOVA) analysis.
+ *
+ * Every successful result row of the run that has a non-NULL
+ * `factor_recognition` tag and a non-NULL value in the selected score column
+ * contributes its score to the cell named `r<recognition>_t<tutor>_l<learner>`
+ * (e.g. "r0_t1_l0"). Only `factor_recognition` is NULL-checked by the query.
+ *
+ * @param {string} runId - The run ID
+ * @param {Object} [options] - Options
+ * @param {string} [options.scoreColumn='overall_score'] - Score column to read:
+ *   'overall_score', 'base_score' or 'recognition_score'; any other value
+ *   falls back to 'overall_score'
+ * @returns {Object} Map of cellKey → [score, ...]
+ */
+export function getFactorialCellData(runId, options = {}) {
+  const requestedColumn = options.scoreColumn === undefined ? 'overall_score' : options.scoreColumn;
+  // The column name is interpolated into the SQL text, so it must come from this fixed list.
+  const allowedColumns = ['overall_score', 'base_score', 'recognition_score'];
+  const scoreExpr = allowedColumns.indexOf(requestedColumn) !== -1 ? requestedColumn : 'overall_score';
+  const resultRows = db.prepare(`
+    SELECT factor_recognition, factor_multi_agent_tutor, factor_multi_agent_learner, ${scoreExpr} as score
+    FROM evaluation_results
+    WHERE run_id = ? AND factor_recognition IS NOT NULL AND ${scoreExpr} IS NOT NULL AND success = 1
+  `).all(runId);
+  const grouped = {};
+  resultRows.forEach((resultRow) => {
+    const {
+      factor_recognition: recognition,
+      factor_multi_agent_tutor: tutor,
+      factor_multi_agent_learner: learner,
+      score,
+    } = resultRow;
+    const cellKey = `r${recognition}_t${tutor}_l${learner}`;
+    // Create the bucket the first time a cell is seen.
+    const bucket = grouped[cellKey] || (grouped[cellKey] = []);
+    bucket.push(score);
+  });
+  return grouped;
+}
+/**
+ * Store a new judgment row for an existing result (preserves judgment history).
+ *
+ * Inserts a fresh `evaluation_results` row that copies the original result's
+ * response data (run/scenario identity, model info, suggestions, raw response,
+ * usage metrics, factor tags) and attaches the scores from `evaluation`.
+ * The original row is not modified, which enables inter-judge reliability analysis.
+ *
+ * Factor tags are read from the flat `factorRecognition` / `factorMultiAgentTutor` /
+ * `factorMultiAgentLearner` fields when present, otherwise from the nested
+ * `factors` object (`{ recognition, multi_agent_tutor, multi_agent_learner }`)
+ * that parsed result rows carry, so a rejudged row keeps its factorial cell.
+ *
+ * @param {Object} originalResult - The original result row (from getResults)
+ * @param {Object} evaluation - The new evaluation scores
+ * @param {Object} [evaluation.scores] - Per-dimension scores, each a number or a `{ score, ... }` object
+ * @param {number} [evaluation.overallScore] - Overall score from the new judge
+ * @param {number} [evaluation.baseScore] - Base score from the new judge
+ * @param {number} [evaluation.recognitionScore] - Recognition score from the new judge
+ * @param {boolean} [evaluation.passesRequired] - Stored as 1/0
+ * @param {boolean} [evaluation.passesForbidden] - Stored as 1/0
+ * @param {Array} [evaluation.requiredMissing] - Stored as JSON, defaults to []
+ * @param {Array} [evaluation.forbiddenFound] - Stored as JSON, defaults to []
+ * @param {string} [evaluation.judgeModel] - Model used for judging
+ * @param {string} [evaluation.summary] - Stored in `evaluation_reasoning`
+ * @returns {number} The new row ID
+ */
+export function storeRejudgment(originalResult, evaluation) {
+  const stmt = db.prepare(`
+    INSERT INTO evaluation_results (
+      run_id, scenario_id, scenario_name, scenario_type,
+      provider, model, profile_name, hyperparameters, prompt_id,
+      ego_model, superego_model,
+      suggestions, raw_response,
+      latency_ms, input_tokens, output_tokens, cost, dialogue_rounds, api_calls, dialogue_id,
+      score_relevance, score_specificity, score_pedagogical,
+      score_personalization, score_actionability, score_tone, overall_score,
+      base_score, recognition_score,
+      passes_required, passes_forbidden, required_missing, forbidden_found,
+      judge_model, evaluation_reasoning, scores_with_reasoning, success, error_message,
+      factor_recognition, factor_multi_agent_tutor, factor_multi_agent_learner, learner_architecture,
+      scoring_method,
+      created_at
+    ) VALUES (
+      ?, ?, ?, ?,
+      ?, ?, ?, ?, ?,
+      ?, ?,
+      ?, ?,
+      ?, ?, ?, ?, ?, ?, ?,
+      ?, ?, ?,
+      ?, ?, ?, ?,
+      ?, ?,
+      ?, ?, ?, ?,
+      ?, ?, ?, ?, ?,
+      ?, ?, ?, ?,
+      ?,
+      ?
+    )
+  `);
+  const scores = evaluation.scores || {};
+  // A dimension may be a bare number or { score, reasoning }; never bind the object itself.
+  const dimensionScore = (name) => {
+    const entry = scores[name];
+    if (entry !== null && typeof entry === 'object') return entry.score ?? null;
+    return entry ?? null;
+  };
+  // Resolve a factor tag to a bindable value: prefer the flat field, fall back to
+  // the nested `factors` entry, and turn booleans into 1/0.
+  const nestedFactors = originalResult.factors || {};
+  const factorTag = (flatValue, nestedValue) => {
+    const value = flatValue ?? nestedValue ?? null;
+    return typeof value === 'boolean' ? Number(value) : value;
+  };
+  // NOTE(review): `?? null` on copied fields assumes the driver rejects `undefined`
+  // bind values (better-sqlite3 does) — confirm against the `db` import.
+  const info = stmt.run(
+    originalResult.runId ?? null,
+    originalResult.scenarioId ?? null,
+    originalResult.scenarioName ?? null,
+    originalResult.scenarioType || 'suggestion',
+    originalResult.provider ?? null,
+    originalResult.model ?? null,
+    originalResult.profileName ?? null,
+    typeof originalResult.hyperparameters === 'string'
+      ? originalResult.hyperparameters
+      : JSON.stringify(originalResult.hyperparameters || {}),
+    originalResult.promptId ?? null,
+    originalResult.egoModel || null,
+    originalResult.superegoModel || null,
+    typeof originalResult.suggestions === 'string'
+      ? originalResult.suggestions
+      : JSON.stringify(originalResult.suggestions || []),
+    originalResult.rawResponse ?? null,
+    originalResult.latencyMs ?? null,
+    originalResult.inputTokens ?? null,
+    originalResult.outputTokens ?? null,
+    originalResult.cost ?? null,
+    originalResult.dialogueRounds ?? null,
+    originalResult.apiCalls ?? null,
+    originalResult.dialogueId ?? null,
+    // New scores from the new judge
+    dimensionScore('relevance'),
+    dimensionScore('specificity'),
+    dimensionScore('pedagogical'),
+    dimensionScore('personalization'),
+    dimensionScore('actionability'),
+    dimensionScore('tone'),
+    evaluation.overallScore ?? null,
+    evaluation.baseScore ?? null,
+    evaluation.recognitionScore ?? null,
+    evaluation.passesRequired ? 1 : 0,
+    evaluation.passesForbidden ? 1 : 0,
+    JSON.stringify(evaluation.requiredMissing || []),
+    JSON.stringify(evaluation.forbiddenFound || []),
+    evaluation.judgeModel || null,
+    evaluation.summary || null,
+    evaluation.scores ? JSON.stringify(evaluation.scores) : null,
+    1, // success
+    null, // error_message
+    factorTag(originalResult.factorRecognition, nestedFactors.recognition),
+    factorTag(originalResult.factorMultiAgentTutor, nestedFactors.multi_agent_tutor),
+    factorTag(originalResult.factorMultiAgentLearner, nestedFactors.multi_agent_learner),
+    originalResult.learnerArchitecture || null,
+    'rubric',  // Rejudgments only store successful rubric evaluations
+    new Date().toISOString()
+  );
+  return info.lastInsertRowid;
+}
+/**
+ * Overwrite the judge-side score columns of an existing result row in place.
+ * Used for rejudging; the previous scores on that row are lost.
+ *
+ * @deprecated Use storeRejudgment() to preserve judgment history for reliability analysis
+ * @param {string|number} resultId - ID of the evaluation_results row to update
+ * @param {Object} evaluation - New evaluation; `scores` entries may be numbers or `{ score }` objects
+ */
+export function updateResultScores(resultId, evaluation) {
+  const updateStmt = db.prepare(`
+    UPDATE evaluation_results SET
+      score_relevance = ?,
+      score_specificity = ?,
+      score_pedagogical = ?,
+      score_personalization = ?,
+      score_actionability = ?,
+      score_tone = ?,
+      overall_score = ?,
+      base_score = ?,
+      recognition_score = ?,
+      passes_required = ?,
+      passes_forbidden = ?,
+      required_missing = ?,
+      forbidden_found = ?,
+      judge_model = ?,
+      evaluation_reasoning = ?,
+      scores_with_reasoning = ?,
+      scoring_method = ?
+    WHERE id = ?
+  `);
+  const byDimension = evaluation.scores || {};
+  // Prefer the nested `.score`, then the bare value, then null.
+  const scoreFor = (dimension) => byDimension[dimension]?.score ?? byDimension[dimension] ?? null;
+  const dimensionValues = [
+    'relevance',
+    'specificity',
+    'pedagogical',
+    'personalization',
+    'actionability',
+    'tone',
+  ].map((dimension) => scoreFor(dimension));
+  const passFlag = (flag) => (flag ? 1 : 0);
+  const bindValues = [
+    ...dimensionValues,
+    evaluation.overallScore ?? null,
+    evaluation.baseScore ?? null,
+    evaluation.recognitionScore ?? null,
+    passFlag(evaluation.passesRequired),
+    passFlag(evaluation.passesForbidden),
+    JSON.stringify(evaluation.requiredMissing || []),
+    JSON.stringify(evaluation.forbiddenFound || []),
+    evaluation.judgeModel || null,
+    evaluation.summary || null,
+    evaluation.scores ? JSON.stringify(evaluation.scores) : null,
+    'rubric',  // Only called on successful evaluations
+    resultId,
+  ];
+  updateStmt.run(...bindValues);
+}
+/**
+ * Update learner-side evaluation scores on an evaluation_results row.
+ *
+ * Missing (`undefined`) fields are normalized before binding: a missing
+ * `scores` is stored as the JSON text `null`, a missing `overallScore` or
+ * `judgeModel` as SQL NULL.
+ *
+ * @param {string} resultId - The evaluation result ID
+ * @param {Object} evaluation - Learner evaluation data
+ * @param {Object} evaluation.scores - Per-turn learner scores (JSON-serializable)
+ * @param {number} evaluation.overallScore - Weighted average learner score (0-100)
+ * @param {string} evaluation.judgeModel - Model used for judging
+ */
+export function updateResultLearnerScores(resultId, evaluation) {
+  const stmt = db.prepare(`
+    UPDATE evaluation_results SET
+      learner_scores = ?,
+      learner_overall_score = ?,
+      learner_judge_model = ?
+    WHERE id = ?
+  `);
+  stmt.run(
+    // JSON.stringify(undefined) returns undefined, which is not a bindable value;
+    // NOTE(review): assumes the driver rejects `undefined` (better-sqlite3 does) — confirm.
+    JSON.stringify(evaluation.scores ?? null),
+    evaluation.overallScore ?? null,
+    evaluation.judgeModel || null,
+    resultId
+  );
+}
+/**
+ * Fetch every interaction evaluation recorded for a run, ordered by creation time.
+ *
+ * JSON columns (`learner_agents`, `turns`, `learner_scores`) are parsed;
+ * empty `learner_agents` / `turns` become [] and empty `learner_scores` becomes null.
+ *
+ * @param {string} runId - The run ID
+ * @returns {Array} Array of interaction evaluation objects
+ */
+export function listInteractionEvalsByRunId(runId) {
+  const dbRows = db
+    .prepare('SELECT * FROM interaction_evaluations WHERE run_id = ? ORDER BY created_at')
+    .all(runId);
+  // Parse a JSON column, substituting the given JSON text when the column is empty.
+  const parseColumn = (text, fallbackJson) => JSON.parse(text || fallbackJson);
+  const interactionEvals = [];
+  for (const dbRow of dbRows) {
+    interactionEvals.push({
+      evalId: dbRow.id,
+      runId: dbRow.run_id,
+      scenarioId: dbRow.scenario_id,
+      scenarioName: dbRow.scenario_name,
+      evalType: dbRow.eval_type,
+      learnerProfile: dbRow.learner_profile,
+      tutorProfile: dbRow.tutor_profile,
+      personaId: dbRow.persona_id,
+      learnerAgents: parseColumn(dbRow.learner_agents, '[]'),
+      turnCount: dbRow.turn_count,
+      turns: parseColumn(dbRow.turns, '[]'),
+      formattedTranscript: dbRow.formatted_transcript,
+      totalTokens: dbRow.total_tokens,
+      finalLearnerState: dbRow.final_learner_state,
+      finalUnderstanding: dbRow.final_understanding,
+      judgeOverallScore: dbRow.judge_overall_score,
+      learnerScores: parseColumn(dbRow.learner_scores, 'null'),
+      learnerOverallScore: dbRow.learner_overall_score,
+      learnerJudgeModel: dbRow.learner_judge_model,
+      createdAt: dbRow.created_at,
+    });
+  }
+  return interactionEvals;
+}
+/**
+ * Update learner-side evaluation scores for an interaction evaluation.
+ *
+ * Missing (`undefined`) fields are normalized before binding: a missing
+ * `scores` is stored as the JSON text `null`, a missing `overallScore` or
+ * `judgeModel` as SQL NULL.
+ *
+ * @param {string} evalId - The interaction evaluation ID
+ * @param {Object} evaluation - Learner evaluation data
+ * @param {Object} evaluation.scores - Per-turn scores: { turnIndex: { dimension: {score, reasoning} } }
+ * @param {number} evaluation.overallScore - Weighted average learner score (0-100)
+ * @param {string} evaluation.judgeModel - Model used for judging
+ */
+export function updateInteractionLearnerScores(evalId, evaluation) {
+  const stmt = db.prepare(`
+    UPDATE interaction_evaluations
+    SET learner_scores = ?,
+        learner_overall_score = ?,
+        learner_judge_model = ?
+    WHERE id = ?
+  `);
+  stmt.run(
+    // JSON.stringify(undefined) returns undefined, which is not a bindable value;
+    // NOTE(review): assumes the driver rejects `undefined` (better-sqlite3 does) — confirm.
+    JSON.stringify(evaluation.scores ?? null),
+    evaluation.overallScore ?? null,
+    evaluation.judgeModel || null,
+    evalId
+  );
+}
 export default {
   createRun,
   updateRun,
   storeResult,
+  storeRejudgment,
+  updateResultScores,
+  updateResultLearnerScores,
   getRun,
   listRuns,
   getResults,
@@ -1113,9 +1645,12 @@ export default {
   findIncompleteRuns,
   autoCompleteStaleRuns,
   getIncompleteTests,
+  getFactorialCellData,
   // Interaction evaluations
   storeInteractionEval,
   listInteractionEvals,
+  listInteractionEvalsByRunId,
   getInteractionEval,
   getInteractionEvalByRunId,
+  updateInteractionLearnerScores,
 };