npm - @machinespirits/eval - Versions diffs - 0.1.0 - Mend

@machinespirits/eval 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (68) hide show

package/components/MobileEvalDashboard.tsx +267 -0
package/components/comparison/DeltaAnalysisTable.tsx +137 -0
package/components/comparison/ProfileComparisonCard.tsx +176 -0
package/components/comparison/RecognitionABMode.tsx +385 -0
package/components/comparison/RecognitionMetricsPanel.tsx +135 -0
package/components/comparison/WinnerIndicator.tsx +64 -0
package/components/comparison/index.ts +5 -0
package/components/mobile/BottomSheet.tsx +233 -0
package/components/mobile/DimensionBreakdown.tsx +210 -0
package/components/mobile/DocsView.tsx +363 -0
package/components/mobile/LogsView.tsx +481 -0
package/components/mobile/PsychodynamicQuadrant.tsx +261 -0
package/components/mobile/QuickTestView.tsx +1098 -0
package/components/mobile/RecognitionTypeChart.tsx +124 -0
package/components/mobile/RecognitionView.tsx +809 -0
package/components/mobile/RunDetailView.tsx +261 -0
package/components/mobile/RunHistoryView.tsx +367 -0
package/components/mobile/ScoreRadial.tsx +211 -0
package/components/mobile/StreamingLogPanel.tsx +230 -0
package/components/mobile/SynthesisStrategyChart.tsx +140 -0
package/config/interaction-eval-scenarios.yaml +832 -0
package/config/learner-agents.yaml +248 -0
package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +52 -0
package/docs/research/ABLATION-MODEL-SELECTION.md +53 -0
package/docs/research/ADVANCED-EVAL-ANALYSIS.md +60 -0
package/docs/research/ANOVA-RESULTS-2026-01-14.md +257 -0
package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +586 -0
package/docs/research/COST-ANALYSIS.md +56 -0
package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +340 -0
package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +291 -0
package/docs/research/EVAL-SYSTEM-ANALYSIS.md +306 -0
package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +301 -0
package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +1988 -0
package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +282 -0
package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +147 -0
package/docs/research/PAPER-EXTENSION-DYADIC.md +204 -0
package/docs/research/PAPER-UNIFIED.md +659 -0
package/docs/research/PAPER-UNIFIED.pdf +0 -0
package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +356 -0
package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +419 -0
package/docs/research/apa.csl +2133 -0
package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +1637 -0
package/docs/research/archive/paper-multiagent-tutor.tex +978 -0
package/docs/research/paper-draft/full-paper.md +136 -0
package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
package/docs/research/paper-draft/references.bib +515 -0
package/docs/research/transcript-baseline.md +139 -0
package/docs/research/transcript-recognition-multiagent.md +187 -0
package/hooks/useEvalData.ts +625 -0
package/index.js +27 -0
package/package.json +73 -0
package/routes/evalRoutes.js +3002 -0
package/scripts/advanced-eval-analysis.js +351 -0
package/scripts/analyze-eval-costs.js +378 -0
package/scripts/analyze-eval-results.js +513 -0
package/scripts/analyze-interaction-evals.js +368 -0
package/server-init.js +45 -0
package/server.js +162 -0
package/services/benchmarkService.js +1892 -0
package/services/evaluationRunner.js +739 -0
package/services/evaluationStore.js +1121 -0
package/services/learnerConfigLoader.js +385 -0
package/services/learnerTutorInteractionEngine.js +857 -0
package/services/memory/learnerMemoryService.js +1227 -0
package/services/memory/learnerWritingPad.js +577 -0
package/services/memory/tutorWritingPad.js +674 -0
package/services/promptRecommendationService.js +493 -0
package/services/rubricEvaluator.js +826 -0

package/scripts/analyze-interaction-evals.js ADDED Viewed

@@ -0,0 +1,368 @@
+#!/usr/bin/env node
+/**
+ * Analyze Interaction Evaluation Results
+ *
+ * Extracts and analyzes judge evaluation scores from interaction eval logs.
+ * Used for the updated research paper on dyadic learner-tutor recognition.
+ */
+import fs from 'fs';
+import path from 'path';
+import { fileURLToPath } from 'url';
+// ES modules do not define __filename/__dirname, so derive them from import.meta.url.
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = path.dirname(__filename);
+// Directory scanned for eval logs: ../logs/interaction-evals relative to this script's directory.
+const LOGS_DIR = path.join(__dirname, '..', 'logs', 'interaction-evals');
+// Tutor dimensions: keys read from judgeEvaluation.tutor_evaluation; each entry's `.score` is used.
+const TUTOR_DIMS = ['mutual_recognition', 'dialectical_responsiveness', 'transformative_potential', 'tone'];
+// Learner dimensions: keys read from judgeEvaluation.learner_evaluation; each entry's `.score` is used.
+const LEARNER_DIMS = ['authenticity', 'responsiveness', 'development'];
+/**
+ * Load all interaction eval JSON files
+ */
+function loadEvals() {
+  const files = fs.readdirSync(LOGS_DIR).filter(f => f.endsWith('.json'));
+  const evals = [];
+  for (const file of files) {
+    try {
+      const content = fs.readFileSync(path.join(LOGS_DIR, file), 'utf8');
+      const data = JSON.parse(content);
+      data._filename = file;
+      evals.push(data);
+    } catch (e) {
+      console.warn(`Failed to load ${file}: ${e.message}`);
+    }
+  }
+  return evals;
+}
+/**
+ * Extract tutor dimension scores from judge evaluation
+ */
+function extractTutorScores(evalData) {
+  const judgeEval = evalData.judgeEvaluation;
+  if (!judgeEval?.tutor_evaluation) return null;
+  const tutor = judgeEval.tutor_evaluation;
+  const scores = {};
+  let total = 0;
+  let count = 0;
+  for (const dim of TUTOR_DIMS) {
+    if (tutor[dim]?.score != null) {
+      scores[dim] = tutor[dim].score;
+      total += tutor[dim].score;
+      count++;
+    }
+  }
+  if (count > 0) {
+    scores.overall = total / count;
+  }
+  return count > 0 ? scores : null;
+}
+/**
+ * Extract learner dimension scores from judge evaluation
+ */
+function extractLearnerScores(evalData) {
+  const judgeEval = evalData.judgeEvaluation;
+  if (!judgeEval?.learner_evaluation) return null;
+  const learner = judgeEval.learner_evaluation;
+  const scores = {};
+  let total = 0;
+  let count = 0;
+  for (const dim of LEARNER_DIMS) {
+    if (learner[dim]?.score != null) {
+      scores[dim] = learner[dim].score;
+      total += learner[dim].score;
+      count++;
+    }
+  }
+  if (count > 0) {
+    scores.overall = total / count;
+  }
+  return count > 0 ? scores : null;
+}
+/**
+ * Extract scenario metadata
+ */
+function extractMetadata(evalData) {
+  return {
+    scenarioId: evalData.scenarioId || evalData.evalId?.split('-')[1] || 'unknown',
+    learnerId: evalData.learnerId,
+    tutorProfile: evalData.tutorProfile || 'default',
+    learnerArchitecture: evalData.learnerArchitecture || 'unknown',
+    personaId: evalData.personaId,
+    turnCount: evalData.metrics?.turnCount || 0,
+    totalTokens: evalData.metrics?.totalTokens || 0,
+    duration: evalData.metrics?.durationMs || 0,
+    outcomes: evalData.summary?.outcomes || [],
+    skipJudge: evalData.skipJudge || false,
+  };
+}
+/**
+ * Group evals by tutor profile
+ */
+function groupByTutorProfile(evals) {
+  const groups = {};
+  for (const e of evals) {
+    const meta = extractMetadata(e);
+    const profile = meta.tutorProfile;
+    if (!groups[profile]) {
+      groups[profile] = [];
+    }
+    groups[profile].push(e);
+  }
+  return groups;
+}
+/**
+ * Group evals by learner architecture
+ */
+function groupByArchitecture(evals) {
+  const groups = {};
+  for (const e of evals) {
+    const meta = extractMetadata(e);
+    const arch = meta.learnerArchitecture;
+    if (!groups[arch]) {
+      groups[arch] = [];
+    }
+    groups[arch].push(e);
+  }
+  return groups;
+}
+/**
+ * Calculate average scores for a group of evals
+ */
+function calculateAverages(evals, extractFn) {
+  const allScores = {};
+  const counts = {};
+  for (const e of evals) {
+    const scores = extractFn(e);
+    if (!scores) continue;
+    for (const [dim, score] of Object.entries(scores)) {
+      if (!allScores[dim]) {
+        allScores[dim] = 0;
+        counts[dim] = 0;
+      }
+      allScores[dim] += score;
+      counts[dim]++;
+    }
+  }
+  const averages = {};
+  for (const dim of Object.keys(allScores)) {
+    averages[dim] = counts[dim] > 0 ? (allScores[dim] / counts[dim]).toFixed(2) : null;
+  }
+  averages._count = Math.max(...Object.values(counts), 0);
+  return averages;
+}
+/**
+ * Generate summary report
+ */
+function generateReport(evals) {
+  console.log('\n' + '═'.repeat(70));
+  console.log('INTERACTION EVALUATION ANALYSIS');
+  console.log('═'.repeat(70));
+  // Filter to only evals with judge evaluation
+  const judgedEvals = evals.filter(e => e.judgeEvaluation && !e.skipJudge);
+  console.log(`\nTotal evals: ${evals.length}`);
+  console.log(`With judge evaluation: ${judgedEvals.length}`);
+  // Battery evals
+  const batteryEvals = judgedEvals.filter(e => e._filename.includes('battery'));
+  console.log(`Battery evals: ${batteryEvals.length}`);
+  // By Tutor Profile
+  console.log('\n' + '─'.repeat(70));
+  console.log('TUTOR PROFILE COMPARISON');
+  console.log('─'.repeat(70));
+  const byProfile = groupByTutorProfile(batteryEvals);
+  const profileResults = {};
+  for (const [profile, profileEvals] of Object.entries(byProfile)) {
+    const tutorAvg = calculateAverages(profileEvals, extractTutorScores);
+    const learnerAvg = calculateAverages(profileEvals, extractLearnerScores);
+    profileResults[profile] = { tutor: tutorAvg, learner: learnerAvg };
+    console.log(`\n${profile.toUpperCase()} (n=${tutorAvg._count || 0}):`);
+    console.log('  Tutor dimensions:');
+    for (const dim of TUTOR_DIMS) {
+      console.log(`    ${dim}: ${tutorAvg[dim] || 'N/A'}`);
+    }
+    console.log(`    OVERALL: ${tutorAvg.overall || 'N/A'}`);
+    console.log('  Learner dimensions:');
+    for (const dim of LEARNER_DIMS) {
+      console.log(`    ${dim}: ${learnerAvg[dim] || 'N/A'}`);
+    }
+    console.log(`    OVERALL: ${learnerAvg.overall || 'N/A'}`);
+  }
+  // By Learner Architecture
+  console.log('\n' + '─'.repeat(70));
+  console.log('LEARNER ARCHITECTURE COMPARISON');
+  console.log('─'.repeat(70));
+  const byArch = groupByArchitecture(batteryEvals);
+  const archResults = {};
+  for (const [arch, archEvals] of Object.entries(byArch)) {
+    const tutorAvg = calculateAverages(archEvals, extractTutorScores);
+    const learnerAvg = calculateAverages(archEvals, extractLearnerScores);
+    archResults[arch] = { tutor: tutorAvg, learner: learnerAvg };
+    console.log(`\n${arch.toUpperCase()} (n=${tutorAvg._count || 0}):`);
+    console.log('  Tutor dimensions:');
+    for (const dim of TUTOR_DIMS) {
+      console.log(`    ${dim}: ${tutorAvg[dim] || 'N/A'}`);
+    }
+    console.log(`    OVERALL: ${tutorAvg.overall || 'N/A'}`);
+    console.log('  Learner dimensions:');
+    for (const dim of LEARNER_DIMS) {
+      console.log(`    ${dim}: ${learnerAvg[dim] || 'N/A'}`);
+    }
+    console.log(`    OVERALL: ${learnerAvg.overall || 'N/A'}`);
+  }
+  // Cross-tabulation: Profile × Architecture
+  console.log('\n' + '─'.repeat(70));
+  console.log('CROSS-TABULATION: TUTOR PROFILE × LEARNER ARCHITECTURE');
+  console.log('─'.repeat(70));
+  const crossTab = {};
+  for (const e of batteryEvals) {
+    const meta = extractMetadata(e);
+    const key = `${meta.tutorProfile}|${meta.learnerArchitecture}`;
+    if (!crossTab[key]) {
+      crossTab[key] = [];
+    }
+    crossTab[key].push(e);
+  }
+  // Create table
+  const profiles = [...new Set(batteryEvals.map(e => extractMetadata(e).tutorProfile))].sort();
+  const architectures = [...new Set(batteryEvals.map(e => extractMetadata(e).learnerArchitecture))].sort();
+  console.log('\nTutor Overall Score by Profile × Architecture:');
+  console.log('─'.repeat(70));
+  // Header row
+  let header = 'Profile'.padEnd(20);
+  for (const arch of architectures) {
+    header += arch.slice(0, 12).padStart(14);
+  }
+  console.log(header);
+  console.log('─'.repeat(70));
+  // Data rows
+  for (const profile of profiles) {
+    let row = profile.padEnd(20);
+    for (const arch of architectures) {
+      const key = `${profile}|${arch}`;
+      const cellEvals = crossTab[key] || [];
+      const avg = calculateAverages(cellEvals, extractTutorScores);
+      row += (avg.overall || '-').toString().padStart(14);
+    }
+    console.log(row);
+  }
+  // Summary statistics
+  console.log('\n' + '─'.repeat(70));
+  console.log('SUMMARY STATISTICS');
+  console.log('─'.repeat(70));
+  // Overall averages
+  const overallTutor = calculateAverages(batteryEvals, extractTutorScores);
+  const overallLearner = calculateAverages(batteryEvals, extractLearnerScores);
+  console.log('\nOverall Tutor Dimensions:');
+  for (const dim of TUTOR_DIMS) {
+    console.log(`  ${dim}: ${overallTutor[dim] || 'N/A'}`);
+  }
+  console.log(`  OVERALL: ${overallTutor.overall || 'N/A'}`);
+  console.log('\nOverall Learner Dimensions:');
+  for (const dim of LEARNER_DIMS) {
+    console.log(`  ${dim}: ${overallLearner[dim] || 'N/A'}`);
+  }
+  console.log(`  OVERALL: ${overallLearner.overall || 'N/A'}`);
+  // Best/worst by profile
+  if (Object.keys(profileResults).length > 0) {
+    const sortedProfiles = Object.entries(profileResults)
+      .filter(([_, r]) => r.tutor.overall)
+      .sort((a, b) => parseFloat(b[1].tutor.overall) - parseFloat(a[1].tutor.overall));
+    if (sortedProfiles.length > 0) {
+      console.log(`\nBest tutor profile: ${sortedProfiles[0][0]} (${sortedProfiles[0][1].tutor.overall})`);
+      console.log(`Worst tutor profile: ${sortedProfiles[sortedProfiles.length-1][0]} (${sortedProfiles[sortedProfiles.length-1][1].tutor.overall})`);
+    }
+  }
+  // Best/worst by architecture
+  if (Object.keys(archResults).length > 0) {
+    const sortedArchs = Object.entries(archResults)
+      .filter(([_, r]) => r.learner.overall)
+      .sort((a, b) => parseFloat(b[1].learner.overall) - parseFloat(a[1].learner.overall));
+    if (sortedArchs.length > 0) {
+      console.log(`\nBest learner architecture: ${sortedArchs[0][0]} (${sortedArchs[0][1].learner.overall})`);
+      console.log(`Worst learner architecture: ${sortedArchs[sortedArchs.length-1][0]} (${sortedArchs[sortedArchs.length-1][1].learner.overall})`);
+    }
+  }
+  // Output JSON for paper
+  const jsonOutput = {
+    generated: new Date().toISOString(),
+    totalEvals: evals.length,
+    judgedEvals: judgedEvals.length,
+    batteryEvals: batteryEvals.length,
+    byProfile: profileResults,
+    byArchitecture: archResults,
+    overallTutor,
+    overallLearner,
+  };
+  const outputPath = path.join(__dirname, '..', 'docs', 'analysis-results.json');
+  fs.writeFileSync(outputPath, JSON.stringify(jsonOutput, null, 2));
+  console.log(`\nJSON output saved to: ${outputPath}`);
+  return jsonOutput;
+}
+// Entry point: load every eval log, then print the report and write the JSON summary
+generateReport(loadEvals());

package/server-init.js ADDED Viewed

@@ -0,0 +1,45 @@
+/**
+ * Evaluation Extension - Server Initialization
+ *
+ * Called by the extension loader when mounting this extension
+ * into the main Machine Spirits website.
+ */
+import path from 'path';
+import { fileURLToPath } from 'url';
+import express from 'express';
+// ESM stand-ins for CommonJS __filename/__dirname, derived from import.meta.url.
+// NOTE(review): neither is referenced in the visible part of this file — confirm before removing.
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = path.dirname(__filename);
+/**
+ * Initialize the evaluation extension
+ * @param {Object} context - Initialization context
+ * @param {Express} context.app - Express application
+ * @param {Object} context.manifest - Extension manifest
+ * @param {string} context.extensionPath - Path to extension directory
+ * @param {string} context.rootDir - Path to main website root
+ */
+export async function init({ app, manifest, extensionPath, rootDir }) {
+  console.log(`[EvalExtension] Initializing ${manifest.name} v${manifest.version}`);
+  // Serve static files for components (for client-side imports)
+  const componentsDir = path.join(extensionPath, 'components');
+  app.use('/extensions/eval', express.static(componentsDir));
+  // Serve documentation
+  const docsDir = path.join(extensionPath, 'docs');
+  app.use('/docs/extensions/eval', express.static(docsDir));
+  // Ensure data directory exists
+  const dataDir = path.join(extensionPath, 'data');
+  const fs = await import('fs');
+  if (!fs.existsSync(dataDir)) {
+    fs.mkdirSync(dataDir, { recursive: true });
+    console.log('[EvalExtension] Created data directory');
+  }
+  console.log('[EvalExtension] Initialization complete');
+}
+export default { init };

package/server.js ADDED Viewed

@@ -0,0 +1,162 @@
+/**
+ * @machinespirits/eval - Standalone Server
+ *
+ * Runs the evaluation system as a standalone application.
+ * This server provides:
+ * - API endpoints for evaluation runs, results, and analysis
+ * - Static file serving for the UI components
+ * - Documentation serving
+ *
+ * Environment variables:
+ *   PORT - Server port (default: 8081)
+ *   STANDALONE - Set to 'true' to run in standalone mode
+ *
+ * Usage:
+ *   STANDALONE=true node server.js
+ *   # or
+ *   npm start
+ */
+import express from 'express';
+import path from 'path';
+import { fileURLToPath } from 'url';
+import { existsSync, mkdirSync } from 'fs';
+// ES modules do not define __filename/__dirname, so derive them from import.meta.url.
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = path.dirname(__filename);
+const app = express();
+// Falls back to 8081 when PORT is unset, non-numeric, or 0 (all falsy after Number()).
+const PORT = Number(process.env.PORT) || 8081;
+// Standalone mode is enabled only by the exact string 'true'.
+const isStandalone = process.env.STANDALONE === 'true';
+// Middleware: parse JSON request bodies
+app.use(express.json());
+// Ensure data directory exists (created next to this file at module load time)
+const dataDir = path.join(__dirname, 'data');
+if (!existsSync(dataDir)) {
+  mkdirSync(dataDir, { recursive: true });
+  console.log('[EvalServer] Created data directory');
+}
+// Health check
+app.get('/health', (req, res) => {
+  res.json({
+    status: 'ok',
+    package: '@machinespirits/eval',
+    version: '0.1.0',
+    mode: isStandalone ? 'standalone' : 'mounted',
+  });
+});
+// API routes (static imports are hoisted, so this import is resolved before any statement in the module runs)
+import evalRoutes from './routes/evalRoutes.js';
+app.use('/api/eval', evalRoutes);
+// Serve components as static files (mounted only if the directory exists when the module loads)
+const componentsDir = path.join(__dirname, 'components');
+if (existsSync(componentsDir)) {
+  app.use('/components', express.static(componentsDir));
+}
+// Serve documentation (mounted only if the directory exists when the module loads)
+const docsDir = path.join(__dirname, 'docs');
+if (existsSync(docsDir)) {
+  app.use('/docs', express.static(docsDir));
+}
+// In standalone mode, serve a basic UI: a static HTML landing page at "/" that lists API endpoints and links to /docs and /health
+if (isStandalone) {
+  app.get('/', (req, res) => {
+    res.send(`
+<!DOCTYPE html>
+<html lang="en">
+<head>
+  <meta charset="UTF-8">
+  <meta name="viewport" content="width=device-width, initial-scale=1.0">
+  <title>Machine Spirits Eval</title>
+  <style>
+    body {
+      font-family: 'Space Mono', monospace;
+      background: #0a0a0a;
+      color: #fafafa;
+      margin: 0;
+      padding: 2rem;
+    }
+    h1 { color: #E63946; }
+    a { color: #E63946; }
+    .endpoint {
+      background: rgba(255,255,255,0.05);
+      padding: 1rem;
+      margin: 0.5rem 0;
+      border-radius: 4px;
+    }
+    code {
+      background: rgba(255,255,255,0.1);
+      padding: 0.2rem 0.4rem;
+      border-radius: 2px;
+    }
+  </style>
+</head>
+<body>
+  <h1>Machine Spirits Eval</h1>
+  <p>Evaluation system running in standalone mode.</p>
+  <h2>API Endpoints</h2>
+  <div class="endpoint">
+    <strong>GET</strong> <code>/api/eval/scenarios</code>
+    <p>List available evaluation scenarios</p>
+  </div>
+  <div class="endpoint">
+    <strong>GET</strong> <code>/api/eval/profiles</code>
+    <p>List tutor profiles</p>
+  </div>
+  <div class="endpoint">
+    <strong>GET</strong> <code>/api/eval/runs</code>
+    <p>List evaluation runs</p>
+  </div>
+  <div class="endpoint">
+    <strong>GET</strong> <code>/api/eval/runs/:id</code>
+    <p>Get details of a specific run</p>
+  </div>
+  <div class="endpoint">
+    <strong>POST</strong> <code>/api/eval/quick</code>
+    <p>Run a quick evaluation test</p>
+  </div>
+  <h2>Documentation</h2>
+  <p><a href="/docs">/docs</a> - Research papers and analysis</p>
+  <h2>Health</h2>
+  <p><a href="/health">/health</a> - Service health check</p>
+</body>
+</html>
+    `);
+  });
+}
+// Error handler
+app.use((err, req, res, next) => {
+  console.error('[EvalServer] Error:', err.message);
+  res.status(500).json({
+    error: 'Internal server error',
+    message: err.message,
+  });
+});
+// Start server
+if (process.argv[1] === fileURLToPath(import.meta.url)) {
+  app.listen(PORT, '0.0.0.0', () => {
+    console.log(`[EvalServer] Machine Spirits Eval running at http://0.0.0.0:${PORT}`);
+    console.log(`[EvalServer] Mode: ${isStandalone ? 'standalone' : 'mounted'}`);
+    console.log(`[EvalServer] API: http://0.0.0.0:${PORT}/api/eval`);
+    console.log(`[EvalServer] Docs: http://0.0.0.0:${PORT}/docs`);
+  });
+}
+export { app };