npm - @machinespirits/eval - Versions diffs - 0.1.2 → 0.2.1 - Mend

@machinespirits/eval 0.1.2 → 0.2.1

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (102)

package/LICENSE +21 -0
package/README.md +161 -0
package/config/eval-settings.yaml +18 -0
package/config/evaluation-rubric-learner.yaml +277 -0
package/config/evaluation-rubric.yaml +613 -0
package/config/interaction-eval-scenarios.yaml +93 -50
package/config/learner-agents.yaml +124 -193
package/config/machinespirits-eval.code-workspace +11 -0
package/config/providers.yaml +60 -0
package/config/suggestion-scenarios.yaml +1399 -0
package/config/tutor-agents.yaml +716 -0
package/docs/EVALUATION-VARIABLES.md +589 -0
package/docs/REPLICATION-PLAN.md +577 -0
package/index.js +15 -6
package/package.json +16 -22
package/routes/evalRoutes.js +88 -36
package/scripts/analyze-judge-reliability.js +401 -0
package/scripts/analyze-run.js +97 -0
package/scripts/analyze-run.mjs +282 -0
package/scripts/analyze-validation-failures.js +141 -0
package/scripts/check-run.mjs +17 -0
package/scripts/code-impasse-strategies.js +1132 -0
package/scripts/compare-runs.js +44 -0
package/scripts/compare-suggestions.js +80 -0
package/scripts/compare-transformation.js +116 -0
package/scripts/dig-into-run.js +158 -0
package/scripts/eval-cli.js +2626 -0
package/scripts/generate-paper-figures.py +452 -0
package/scripts/qualitative-analysis-ai.js +1313 -0
package/scripts/qualitative-analysis.js +688 -0
package/scripts/seed-db.js +87 -0
package/scripts/show-failed-suggestions.js +64 -0
package/scripts/validate-content.js +192 -0
package/server.js +3 -2
package/services/__tests__/evalConfigLoader.test.js +338 -0
package/services/anovaStats.js +499 -0
package/services/contentResolver.js +407 -0
package/services/dialogueTraceAnalyzer.js +454 -0
package/services/evalConfigLoader.js +625 -0
package/services/evaluationRunner.js +2171 -270
package/services/evaluationStore.js +564 -29
package/services/learnerConfigLoader.js +75 -5
package/services/learnerRubricEvaluator.js +284 -0
package/services/learnerTutorInteractionEngine.js +375 -0
package/services/processUtils.js +18 -0
package/services/progressLogger.js +98 -0
package/services/promptRecommendationService.js +31 -26
package/services/promptRewriter.js +427 -0
package/services/rubricEvaluator.js +543 -70
package/services/streamingReporter.js +104 -0
package/services/turnComparisonAnalyzer.js +494 -0
package/components/MobileEvalDashboard.tsx +0 -267
package/components/comparison/DeltaAnalysisTable.tsx +0 -137
package/components/comparison/ProfileComparisonCard.tsx +0 -176
package/components/comparison/RecognitionABMode.tsx +0 -385
package/components/comparison/RecognitionMetricsPanel.tsx +0 -135
package/components/comparison/WinnerIndicator.tsx +0 -64
package/components/comparison/index.ts +0 -5
package/components/mobile/BottomSheet.tsx +0 -233
package/components/mobile/DimensionBreakdown.tsx +0 -210
package/components/mobile/DocsView.tsx +0 -363
package/components/mobile/LogsView.tsx +0 -481
package/components/mobile/PsychodynamicQuadrant.tsx +0 -261
package/components/mobile/QuickTestView.tsx +0 -1098
package/components/mobile/RecognitionTypeChart.tsx +0 -124
package/components/mobile/RecognitionView.tsx +0 -809
package/components/mobile/RunDetailView.tsx +0 -261
package/components/mobile/RunHistoryView.tsx +0 -367
package/components/mobile/ScoreRadial.tsx +0 -211
package/components/mobile/StreamingLogPanel.tsx +0 -230
package/components/mobile/SynthesisStrategyChart.tsx +0 -140
package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +0 -52
package/docs/research/ABLATION-MODEL-SELECTION.md +0 -53
package/docs/research/ADVANCED-EVAL-ANALYSIS.md +0 -60
package/docs/research/ANOVA-RESULTS-2026-01-14.md +0 -257
package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +0 -586
package/docs/research/COST-ANALYSIS.md +0 -56
package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +0 -340
package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +0 -291
package/docs/research/EVAL-SYSTEM-ANALYSIS.md +0 -306
package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +0 -301
package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +0 -1988
package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +0 -282
package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +0 -147
package/docs/research/PAPER-EXTENSION-DYADIC.md +0 -204
package/docs/research/PAPER-UNIFIED.md +0 -659
package/docs/research/PAPER-UNIFIED.pdf +0 -0
package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +0 -356
package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +0 -419
package/docs/research/apa.csl +0 -2133
package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +0 -1637
package/docs/research/archive/paper-multiagent-tutor.tex +0 -978
package/docs/research/paper-draft/full-paper.md +0 -136
package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
package/docs/research/paper-draft/references.bib +0 -515
package/docs/research/transcript-baseline.md +0 -139
package/docs/research/transcript-recognition-multiagent.md +0 -187
package/hooks/useEvalData.ts +0 -625
package/server-init.js +0 -45
package/services/benchmarkService.js +0 -1892
package/types.ts +0 -165
package/utils/haptics.ts +0 -45

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@machinespirits/eval",
-  "version": "0.1.2",
+  "version": "0.2.1",
   "description": "Evaluation system for Machine Spirits tutor - benchmarking, rubric evaluation, and analysis tools",
   "type": "module",
   "main": "index.js",
@@ -8,35 +8,27 @@
     ".": "./index.js",
     "./services/*": "./services/*.js",
     "./routes/*": "./routes/*.js",
-    "./config/*": "./config/*",
-    "./components/*": "./components/*.tsx",
-    "./components/mobile/*": "./components/mobile/*.tsx",
-    "./components/comparison": "./components/comparison/index.ts",
-    "./components/comparison/*": "./components/comparison/*.tsx",
-    "./hooks/*": "./hooks/*.ts",
-    "./types": "./types.ts",
-    "./utils/*": "./utils/*.ts"
+    "./config/*": "./config/*"
   },
   "files": [
     "index.js",
     "server.js",
-    "server-init.js",
-    "types.ts",
     "routes/",
     "services/",
-    "components/",
-    "hooks/",
     "config/",
     "scripts/",
-    "utils/",
-    "docs/"
+    "docs/EVALUATION-VARIABLES.md",
+    "docs/REPLICATION-PLAN.md"
   ],
   "scripts": {
     "start": "STANDALONE=true node server.js",
     "dev": "STANDALONE=true node server.js",
     "eval": "node scripts/eval-cli.js",
     "eval:quick": "node scripts/eval-cli.js quick",
-    "eval:test": "node scripts/eval-cli.js test"
+    "eval:test": "node scripts/eval-cli.js test",
+    "seed": "node scripts/seed-db.js",
+    "test": "node --test --test-force-exit 'services/__tests__/*.test.js' 'tests/*.test.js'",
+    "content:validate": "node scripts/validate-content.js"
   },
   "keywords": [
     "evaluation",
@@ -52,8 +44,8 @@
     "url": "https://github.com/liammagee/machinespirits-eval"
   },
   "peerDependencies": {
-    "@machinespirits/tutor-core": ">=0.1.0",
-    "@anthropic-ai/sdk": ">=0.71.0"
+    "@anthropic-ai/sdk": "0.71.2",
+    "@machinespirits/tutor-core": ">=0.3.1"
   },
   "peerDependenciesMeta": {
     "@anthropic-ai/sdk": {
@@ -61,12 +53,14 @@
     }
   },
   "dependencies": {
-    "express": "^4.19.2",
-    "yaml": "^2.8.2",
-    "better-sqlite3": "^12.5.0"
+    "better-sqlite3": "12.5.0",
+    "dotenv": "17.2.3",
+    "express": "4.19.2",
+    "jsonrepair": "3.13.2",
+    "yaml": "2.8.2"
   },
   "devDependencies": {
-    "@types/node": "^22.14.0"
+    "@types/node": "22.14.0"
   },
   "engines": {
     "node": ">=18.0.0"

package/routes/evalRoutes.js CHANGED Viewed

@@ -14,21 +14,44 @@ import * as evaluationStore from '../services/evaluationStore.js';
 import * as learnerConfigLoader from '../services/learnerConfigLoader.js';
 import * as promptRecommendationService from '../services/promptRecommendationService.js';
 import interactionEngine from '../services/learnerTutorInteractionEngine.js';
-// Import core tutor services from @machinespirits/tutor-core
-import {
-  tutorApiService as tutorApi,
-  dialogueLogService,
-  monitoringService,
-  aiConfigService,
-  writingPadService
-} from '@machinespirits/tutor-core';
-const { getApiKey, getDefaultModel } = aiConfigService;
-const { clearConscious, getWritingPad } = writingPadService;
+import * as evalConfigLoader from '../services/evalConfigLoader.js';
+// Lazy-loaded tutor-core services — resolved on first request so this module
+// can be imported without tutor-core installed at parse time.
+// Module-scoped vars are populated by the middleware below; existing handler
+// code references them unchanged.
+let tutorApi, tutorConfigLoader, dialogueLogService, monitoringService;
+let getApiKey, getDefaultModel, clearConscious, getWritingPad;
+let _tutorCoreLoaded = false;
+async function ensureTutorCore() {
+  if (_tutorCoreLoaded) return;
+  const mod = await import('@machinespirits/tutor-core');
+  tutorApi = mod.tutorApiService;
+  tutorConfigLoader = mod.tutorConfigLoader;
+  dialogueLogService = mod.dialogueLogService;
+  monitoringService = mod.monitoringService;
+  getApiKey = mod.aiConfigService.getApiKey;
+  getDefaultModel = mod.aiConfigService.getDefaultModel;
+  clearConscious = mod.writingPadService.clearConscious;
+  getWritingPad = mod.writingPadService.getWritingPad;
+  _tutorCoreLoaded = true;
+}
 import fs from 'fs';
 import path from 'path';
 const router = Router();
+// Resolve tutor-core on first request
+router.use(async (req, res, next) => {
+  try {
+    await ensureTutorCore();
+    next();
+  } catch (err) {
+    res.status(503).json({ error: 'tutor-core not available', message: err.message });
+  }
+});
 // ============================================================================
 // CRASH PROTECTION: Track active evaluation streams
 // ============================================================================
@@ -139,7 +162,7 @@ const PROMPTS_DIR = path.join(process.cwd(), 'prompts');
  */
 router.get('/scenarios', (req, res) => {
   try {
-    const scenarios = tutorApi.listScenarios();
+    const scenarios = evalConfigLoader.listScenarios();
     res.json({ success: true, scenarios });
   } catch (error) {
     console.error('[EvalRoutes] List scenarios error:', error);
@@ -153,7 +176,7 @@ router.get('/scenarios', (req, res) => {
  */
 router.get('/scenarios/:id', (req, res) => {
   try {
-    const scenario = tutorApi.getScenario(req.params.id);
+    const scenario = evalConfigLoader.getScenario(req.params.id);
     if (!scenario) {
       return res.status(404).json({ error: 'Scenario not found' });
     }
@@ -170,7 +193,7 @@ router.get('/scenarios/:id', (req, res) => {
  */
 router.get('/profiles', (req, res) => {
   try {
-    const profiles = tutorApi.listProfiles();
+    const profiles = tutorConfigLoader.listProfiles();
     res.json({ success: true, profiles });
   } catch (error) {
     console.error('[EvalRoutes] List profiles error:', error);
@@ -199,7 +222,7 @@ router.get('/learner-profiles', (req, res) => {
  */
 router.get('/configurations', (req, res) => {
   try {
-    const configurations = tutorApi.listConfigurations();
+    const configurations = evalConfigLoader.listConfigurations();
     res.json({ success: true, configurations });
   } catch (error) {
     console.error('[EvalRoutes] List configurations error:', error);
@@ -218,18 +241,40 @@ router.get('/configurations', (req, res) => {
  * Body: {
  *   profile: "budget",           // Profile name or config string
  *   scenario: "new_user_first_visit",  // Scenario ID (optional)
- *   skipRubric: true            // Skip AI judge evaluation (optional)
+ *   skipRubric: true,            // Skip AI judge evaluation (optional)
+ *   judgeOverride: null,         // Override judge model (optional)
+ *   provider: null,              // Override tutor provider (optional)
+ *   model: null,                 // Override tutor model (optional)
+ *   egoModel: null,              // Override ego model (optional)
+ *   superegoStrategy: null,      // Superego intervention strategy (optional)
+ *   hyperparameters: null        // Override hyperparameters (optional)
  * }
  */
 router.post('/quick', async (req, res) => {
   try {
-    const { profile = 'budget', scenario = 'new_user_first_visit', skipRubric = false } = req.body;
+    const {
+      profile = 'budget',
+      scenario = 'new_user_first_visit',
+      skipRubric = false,
+      judgeOverride = null,
+      provider,
+      model,
+      egoModel,
+      superegoStrategy,
+      hyperparameters,
+    } = req.body;
-    // Build config
-    const config = { profileName: profile };
+    // Build config with optional tutor overrides
+    const config = {
+      profileName: profile,
+      ...(provider && { provider }),
+      ...(model && { model }),
+      ...(egoModel && { egoModel }),
+      ...(hyperparameters && { hyperparameters }),
+    };
     // Get scenario name for description
-    const scenarioDetails = tutorApi.getScenario(scenario);
+    const scenarioDetails = evalConfigLoader.getScenario(scenario);
     const scenarioName = scenarioDetails?.name || scenario;
     // Create a run to persist result to history
@@ -242,6 +287,11 @@ router.post('/quick', async (req, res) => {
         profiles: [profile],
         scenarios: [scenario],
         scenarioNames: [scenarioName],
+        judgeOverride: judgeOverride || undefined,
+        ...(provider && { provider }),
+        ...(model && { model }),
+        ...(egoModel && { egoModel }),
+        ...(superegoStrategy && { superegoStrategy }),
       },
     });
@@ -249,6 +299,8 @@ router.post('/quick', async (req, res) => {
       scenarioId: scenario,
       skipRubricEval: skipRubric,
       verbose: false,
+      judgeOverride,
+      superegoStrategy,
     });
     // Store result to history
@@ -288,9 +340,9 @@ router.post('/quick', async (req, res) => {
         totalTokens: (result.inputTokens || 0) + (result.outputTokens || 0),
         apiCalls: result.apiCalls,
         dialogueRounds: result.dialogueRounds,
-        // Evaluator reasoning
+        // Judge reasoning
         evaluationReasoning: result.evaluationReasoning,
-        evaluatorModel: result.evaluatorModel,
+        judgeModel: result.judgeModel,
         // Scenario context for display (original user request)
         scenarioContext: scenarioDetails ? {
           description: scenarioDetails.description,
@@ -344,7 +396,7 @@ router.get('/stream/quick', async (req, res) => {
     const outputSize = req.query.outputSize || 'normal'; // compact, normal, expanded
     // Get scenario name for description
-    const scenarioDetails = tutorApi.getScenario(scenario);
+    const scenarioDetails = evalConfigLoader.getScenario(scenario);
     const scenarioName = scenarioDetails?.name || scenario;
     // Create a run to persist result to history (status: 'running')
@@ -431,7 +483,7 @@ router.get('/stream/quick', async (req, res) => {
       dialogueId: result.dialogueId,
       // Evaluator reasoning
       evaluationReasoning: result.evaluationReasoning,
-      evaluatorModel: result.evaluatorModel,
+      judgeModel: result.judgeModel,
       // Scenario context for display (original user request)
       scenarioContext: scenarioDetails ? {
         description: scenarioDetails.description,
@@ -557,7 +609,7 @@ router.post('/matrix', async (req, res) => {
     let { profiles = [], scenarios = 'all', skipRubric = false } = req.body;
     // Default profiles if none specified
-    const allProfiles = tutorApi.listProfiles();
+    const allProfiles = tutorConfigLoader.listProfiles();
     if (profiles.length === 0) {
       profiles = ['budget', 'experimental', 'default', 'fast'].filter(p =>
         allProfiles.some(ap => ap.name === p)
@@ -576,7 +628,7 @@ router.post('/matrix', async (req, res) => {
     }
     // Get scenarios
-    const allScenarios = tutorApi.listScenarios();
+    const allScenarios = evalConfigLoader.listScenarios();
     const scenariosToRun = scenarios === 'all'
       ? allScenarios
       : allScenarios.filter(s => scenarios.includes(s.id));
@@ -752,7 +804,7 @@ router.get('/stream/matrix', async (req, res) => {
     const outputSize = req.query.outputSize || 'normal';
     // Get all available profiles
-    const allProfiles = tutorApi.listProfiles();
+    const allProfiles = tutorConfigLoader.listProfiles();
     if (profiles.length === 0) {
       profiles = ['budget', 'experimental', 'default', 'fast'].filter(p =>
         allProfiles.some(ap => ap.name === p)
@@ -767,7 +819,7 @@ router.get('/stream/matrix', async (req, res) => {
     }
     // Get scenarios
-    const allScenarios = tutorApi.listScenarios();
+    const allScenarios = evalConfigLoader.listScenarios();
     const scenariosToRun = scenarios === 'all'
       ? allScenarios
       : allScenarios.filter(s => scenarios.includes(s.id));
@@ -1163,7 +1215,7 @@ router.get('/stream/interact', async (req, res) => {
         learnerId,
         personaId: persona,
         tutorProfile,
-        learnerProfile: dialogueEnabled ? 'psychodynamic' : 'unified',
+        learnerProfile: dialogueEnabled ? 'ego_superego' : 'unified',
         topic,
         scenario: {
           name: `Interactive Evaluation - ${persona}`,
@@ -1216,8 +1268,8 @@ router.get('/stream/interact', async (req, res) => {
       learnerId,
       personaId: persona,
       tutorProfile,
-      learnerArchitecture: dialogueEnabled ? 'psychodynamic' : 'unified',
-      learnerProfile: dialogueEnabled ? 'psychodynamic' : 'unified',
+      learnerArchitecture: dialogueEnabled ? 'ego_superego' : 'unified',
+      learnerProfile: dialogueEnabled ? 'ego_superego' : 'unified',
       topic,
       interaction: interactionTrace,
       turnCount: interactionTrace.turns.length,
@@ -1253,7 +1305,7 @@ router.get('/stream/interact', async (req, res) => {
           runType: 'interaction',
           profiles: [tutorProfile],
           personaId: persona,
-          learnerArchitecture: dialogueEnabled ? 'psychodynamic' : 'unified',
+          learnerArchitecture: dialogueEnabled ? 'ego_superego' : 'unified',
           topic,
           fastMode: !runJudge,
         },
@@ -1852,7 +1904,7 @@ router.post('/prompts/recommend', async (req, res) => {
       profileName = runResults[0]?.profileName || profileName;
     } else if (profile) {
       // Run fresh evaluations
-      const allScenarios = tutorApi.listScenarios();
+      const allScenarios = evalConfigLoader.listScenarios();
       const scenariosToRun = scenarios === 'all'
         ? allScenarios
         : allScenarios.filter(s => scenarios.includes(s.id));
@@ -1944,7 +1996,7 @@ router.get('/stream/run', async (req, res) => {
     const outputSize = req.query.outputSize || 'normal';
     // Get all scenarios to run
-    const allScenarios = tutorApi.listScenarios();
+    const allScenarios = evalConfigLoader.listScenarios();
     const scenariosToRun = scenarios === 'all'
       ? allScenarios
       : allScenarios.filter(s => scenarios.includes(s.id));
@@ -2513,7 +2565,7 @@ router.get('/runs/:runId/resume-status', (req, res) => {
     }
     // Get scenarios
-    const allScenarios = tutorApi.listScenarios();
+    const allScenarios = evalConfigLoader.listScenarios();
     const scenarios = scenariosParam === 'all'
       ? allScenarios
       : allScenarios.filter(s => scenariosParam.includes(s.id));
@@ -2669,7 +2721,7 @@ router.get('/stream/recognition-ab', async (req, res) => {
     const outputSize = req.query.outputSize || 'normal';
     // Validate profiles exist
-    const allProfiles = tutorApi.listProfiles();
+    const allProfiles = tutorConfigLoader.listProfiles();
     const validProfiles = profiles.filter(p => allProfiles.some(ap => ap.name === p));
     if (validProfiles.length !== 2) {
@@ -2682,7 +2734,7 @@ router.get('/stream/recognition-ab', async (req, res) => {
     }
     // Get only recognition_test scenarios
-    const allScenarios = tutorApi.listScenarios();
+    const allScenarios = evalConfigLoader.listScenarios();
     const recognitionScenarios = allScenarios.filter(s => s.recognition_test === true);
     if (recognitionScenarios.length === 0) {