npm - @machinespirits/eval - Versions diffs - 0.1.2 → 0.2.1 - Mend

@machinespirits/eval 0.1.2 → 0.2.1

This diff compares the contents of two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (102)

package/LICENSE +21 -0
package/README.md +161 -0
package/config/eval-settings.yaml +18 -0
package/config/evaluation-rubric-learner.yaml +277 -0
package/config/evaluation-rubric.yaml +613 -0
package/config/interaction-eval-scenarios.yaml +93 -50
package/config/learner-agents.yaml +124 -193
package/config/machinespirits-eval.code-workspace +11 -0
package/config/providers.yaml +60 -0
package/config/suggestion-scenarios.yaml +1399 -0
package/config/tutor-agents.yaml +716 -0
package/docs/EVALUATION-VARIABLES.md +589 -0
package/docs/REPLICATION-PLAN.md +577 -0
package/index.js +15 -6
package/package.json +16 -22
package/routes/evalRoutes.js +88 -36
package/scripts/analyze-judge-reliability.js +401 -0
package/scripts/analyze-run.js +97 -0
package/scripts/analyze-run.mjs +282 -0
package/scripts/analyze-validation-failures.js +141 -0
package/scripts/check-run.mjs +17 -0
package/scripts/code-impasse-strategies.js +1132 -0
package/scripts/compare-runs.js +44 -0
package/scripts/compare-suggestions.js +80 -0
package/scripts/compare-transformation.js +116 -0
package/scripts/dig-into-run.js +158 -0
package/scripts/eval-cli.js +2626 -0
package/scripts/generate-paper-figures.py +452 -0
package/scripts/qualitative-analysis-ai.js +1313 -0
package/scripts/qualitative-analysis.js +688 -0
package/scripts/seed-db.js +87 -0
package/scripts/show-failed-suggestions.js +64 -0
package/scripts/validate-content.js +192 -0
package/server.js +3 -2
package/services/__tests__/evalConfigLoader.test.js +338 -0
package/services/anovaStats.js +499 -0
package/services/contentResolver.js +407 -0
package/services/dialogueTraceAnalyzer.js +454 -0
package/services/evalConfigLoader.js +625 -0
package/services/evaluationRunner.js +2171 -270
package/services/evaluationStore.js +564 -29
package/services/learnerConfigLoader.js +75 -5
package/services/learnerRubricEvaluator.js +284 -0
package/services/learnerTutorInteractionEngine.js +375 -0
package/services/processUtils.js +18 -0
package/services/progressLogger.js +98 -0
package/services/promptRecommendationService.js +31 -26
package/services/promptRewriter.js +427 -0
package/services/rubricEvaluator.js +543 -70
package/services/streamingReporter.js +104 -0
package/services/turnComparisonAnalyzer.js +494 -0
package/components/MobileEvalDashboard.tsx +0 -267
package/components/comparison/DeltaAnalysisTable.tsx +0 -137
package/components/comparison/ProfileComparisonCard.tsx +0 -176
package/components/comparison/RecognitionABMode.tsx +0 -385
package/components/comparison/RecognitionMetricsPanel.tsx +0 -135
package/components/comparison/WinnerIndicator.tsx +0 -64
package/components/comparison/index.ts +0 -5
package/components/mobile/BottomSheet.tsx +0 -233
package/components/mobile/DimensionBreakdown.tsx +0 -210
package/components/mobile/DocsView.tsx +0 -363
package/components/mobile/LogsView.tsx +0 -481
package/components/mobile/PsychodynamicQuadrant.tsx +0 -261
package/components/mobile/QuickTestView.tsx +0 -1098
package/components/mobile/RecognitionTypeChart.tsx +0 -124
package/components/mobile/RecognitionView.tsx +0 -809
package/components/mobile/RunDetailView.tsx +0 -261
package/components/mobile/RunHistoryView.tsx +0 -367
package/components/mobile/ScoreRadial.tsx +0 -211
package/components/mobile/StreamingLogPanel.tsx +0 -230
package/components/mobile/SynthesisStrategyChart.tsx +0 -140
package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +0 -52
package/docs/research/ABLATION-MODEL-SELECTION.md +0 -53
package/docs/research/ADVANCED-EVAL-ANALYSIS.md +0 -60
package/docs/research/ANOVA-RESULTS-2026-01-14.md +0 -257
package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +0 -586
package/docs/research/COST-ANALYSIS.md +0 -56
package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +0 -340
package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +0 -291
package/docs/research/EVAL-SYSTEM-ANALYSIS.md +0 -306
package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +0 -301
package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +0 -1988
package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +0 -282
package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +0 -147
package/docs/research/PAPER-EXTENSION-DYADIC.md +0 -204
package/docs/research/PAPER-UNIFIED.md +0 -659
package/docs/research/PAPER-UNIFIED.pdf +0 -0
package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +0 -356
package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +0 -419
package/docs/research/apa.csl +0 -2133
package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +0 -1637
package/docs/research/archive/paper-multiagent-tutor.tex +0 -978
package/docs/research/paper-draft/full-paper.md +0 -136
package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
package/docs/research/paper-draft/references.bib +0 -515
package/docs/research/transcript-baseline.md +0 -139
package/docs/research/transcript-recognition-multiagent.md +0 -187
package/hooks/useEvalData.ts +0 -625
package/server-init.js +0 -45
package/services/benchmarkService.js +0 -1892
package/types.ts +0 -165
package/utils/haptics.ts +0 -45

package/config/interaction-eval-scenarios.yaml CHANGED Viewed

@@ -30,9 +30,10 @@ short_term_scenarios:
   recognition_request:
     id: "recognition_request"
+    type: interaction
     name: "Learner Seeks Recognition"
     description: "Learner shares their understanding, seeking validation and engagement"
-    turns: 4
+    turn_count: 4
     topic: "Hegel's recognition dialectic"
     learner:
@@ -78,9 +79,10 @@ short_term_scenarios:
   frustration_moment:
     id: "frustration_moment"
+    type: interaction
     name: "Learner Expresses Frustration"
     description: "Learner is stuck and becoming frustrated"
-    turns: 5
+    turn_count: 5
     topic: "Aufhebung (sublation)"
     learner:
@@ -126,9 +128,10 @@ short_term_scenarios:
   misconception_surface:
     id: "misconception_surface"
+    type: interaction
     name: "Misconception Revealed"
     description: "Learner reveals a misconception that needs gentle correction"
-    turns: 4
+    turn_count: 4
     topic: "Thesis-Antithesis-Synthesis"
     learner:
@@ -174,9 +177,10 @@ short_term_scenarios:
   breakthrough_moment:
     id: "breakthrough_moment"
+    type: interaction
     name: "Learner Shows Insight"
     description: "Learner demonstrates genuine understanding"
-    turns: 3
+    turn_count: 3
     topic: "Self-consciousness"
     learner:
@@ -222,9 +226,10 @@ short_term_scenarios:
   resistant_engagement:
     id: "resistant_engagement"
+    type: interaction
     name: "Resistant but Capable Learner"
     description: "Intelligent learner pushes back on claims"
-    turns: 6
+    turn_count: 6
     topic: "Hegel's relevance today"
     learner:
@@ -276,6 +281,7 @@ long_term_scenarios:
   novice_to_practitioner:
     id: "novice_to_practitioner"
+    type: interaction
     name: "Learning Arc: Novice to Practitioner"
     description: "Track learner development across multiple sessions"
     sessions: 5
@@ -334,6 +340,7 @@ long_term_scenarios:
   stranger_to_recognized:
     id: "stranger_to_recognized"
+    type: interaction
     name: "Relationship Arc: Developing Trust"
     description: "Track relationship development across sessions"
     sessions: 4
@@ -374,6 +381,7 @@ long_term_scenarios:
   tutor_adaptation:
     id: "tutor_adaptation"
+    type: interaction
     name: "Tutor Learning Arc"
     description: "Track tutor's accumulated knowledge about learner"
     sessions: 4
@@ -412,90 +420,119 @@ long_term_scenarios:
 evaluation_dimensions:
-  # Learner dimensions
+  # Learner dimensions (total weight: 0.40)
   learner:
     authenticity:
+      weight: 0.10
       description: "Internal dynamics reflect persona realistically"
       scoring:
-        5: "Internal voices perfectly calibrated to persona"
-        3: "Generally authentic but some inconsistency"
-        1: "Feels performative or mismatched to persona"
+        5: "Internal voices perfectly calibrated to persona; feels like a real learner"
+        4: "Mostly authentic with occasional minor inconsistencies"
+        3: "Generally authentic but some noticeable gaps in persona"
+        2: "Frequently inconsistent; persona breaks character"
+        1: "Feels performative or completely mismatched to persona"
     responsiveness:
+      weight: 0.10
       description: "Genuine reaction to tutor's engagement"
       scoring:
-        5: "Clearly processing and responding to tutor input"
-        3: "Some response but not deeply engaged"
-        1: "Ignores or dismisses tutor's contributions"
+        5: "Clearly processing and responding to tutor input; reactions feel earned"
+        4: "Responsive with minor gaps; mostly engages with tutor's points"
+        3: "Some response but not deeply engaged with specifics"
+        2: "Superficial reactions; largely ignores tutor's actual content"
+        1: "Ignores or dismisses tutor's contributions entirely"
     development:
+      weight: 0.10
       description: "Shows movement in understanding"
       scoring:
-        5: "Clear trajectory of understanding change"
-        3: "Some development visible"
-        1: "No discernible change"
+        5: "Clear trajectory of understanding change; visible learning arc"
+        4: "Noticeable development with minor plateaus"
+        3: "Some development visible but uneven"
+        2: "Minimal change; understanding mostly static"
+        1: "No discernible change across turns"
     emotional_trajectory:
+      weight: 0.05
       description: "Emotional state changes appropriately"
       scoring:
-        5: "Emotions shift naturally with interaction"
-        3: "Some emotional movement"
-        1: "Emotional state static regardless of input"
+        5: "Emotions shift naturally with interaction; affective arc feels real"
+        4: "Mostly natural emotional shifts with minor flat spots"
+        3: "Some emotional movement but transitions feel mechanical"
+        2: "Emotional state largely static; shifts feel forced"
+        1: "Emotional state completely static regardless of input"
     knowledge_retention:
+      weight: 0.05
       description: "Concepts persist across sessions"
       scoring:
-        5: "Strong retention with appropriate decay patterns"
-        3: "Moderate retention"
-        1: "No retention between sessions"
+        5: "Strong retention with appropriate decay patterns; references prior learning"
+        4: "Good retention; most concepts persist with minor gaps"
+        3: "Moderate retention; some concepts lost between sessions"
+        2: "Weak retention; frequently forgets prior material"
+        1: "No retention between sessions; starts fresh each time"
-  # Tutor dimensions
+  # Tutor dimensions (total weight: 0.40)
   tutor:
     strategy_adaptation:
+      weight: 0.15
       description: "Modifies approach based on effectiveness"
       scoring:
-        5: "Clearly learns and adapts strategies"
-        3: "Some adaptation visible"
-        1: "Same approach regardless of results"
+        5: "Clearly learns and adapts strategies; abandoned approaches don't recur"
+        4: "Good adaptation with occasional repetition of ineffective strategies"
+        3: "Some adaptation visible but slow to change approach"
+        2: "Minimal adaptation; mostly repeats same strategies"
+        1: "Same approach regardless of results; no learning"
     scaffolding_reduction:
+      weight: 0.15
       description: "Fades support as learner grows"
       scoring:
-        5: "Perfect calibration of support level"
-        3: "Some appropriate fading"
+        5: "Perfect calibration of support level; fading tracks learner growth"
+        4: "Good fading with minor over- or under-support"
+        3: "Some appropriate fading but inconsistent calibration"
+        2: "Poor calibration; support level mismatched to learner needs"
         1: "Constant support level regardless of growth"
     memory_utilization:
+      weight: 0.10
       description: "Effectively uses accumulated knowledge"
       scoring:
-        5: "Seamlessly integrates prior knowledge"
-        3: "Some reference to history"
-        1: "Treats each session as isolated"
+        5: "Seamlessly integrates prior knowledge; references feel natural"
+        4: "Good use of history with occasional missed opportunities"
+        3: "Some reference to history but doesn't fully leverage it"
+        2: "Rare references to prior interactions; mostly treats as new"
+        1: "Treats each session as isolated; no accumulated understanding"
-  # Relationship dimensions
+  # Relationship dimensions (total weight: 0.20)
   relationship:
     trust_trajectory:
+      weight: 0.10
       description: "Trust develops appropriately over time"
       scoring:
-        5: "Natural, earned trust development"
-        3: "Some trust building visible"
-        1: "No trust development"
+        5: "Natural, earned trust development; vulnerability emerges organically"
+        4: "Good trust trajectory with minor pacing issues"
+        3: "Some trust building visible but feels scripted"
+        2: "Minimal trust development; interaction stays surface-level"
+        1: "No trust development; purely transactional throughout"
     mutual_recognition_depth:
+      weight: 0.10
       description: "Both parties show understanding of other"
       scoring:
-        5: "Genuine mutual recognition achieved"
-        3: "Some recognition present"
-        1: "Purely transactional interaction"
+        5: "Genuine mutual recognition achieved; both parties transformed"
+        4: "Good recognition with minor asymmetries"
+        3: "Some recognition present but largely one-directional"
+        2: "Minimal recognition; interaction remains functional"
+        1: "Purely transactional interaction; no recognition dynamics"
 # ══════════════════════════════════════════════════════════════════════════════
 # JUDGE CONFIGURATION
 # ══════════════════════════════════════════════════════════════════════════════
 judge:
-  # Use OpenRouter model IDs when running via OpenRouter
-  model: "anthropic/claude-sonnet-4.5"
-  fallback_model: "openai/gpt-4o"
+  # Model config defined in evaluation-rubric.yaml → interaction_judge
+  # (single source of truth for all judge models)
   system_prompt: |
     You are evaluating a learner-tutor interaction from the perspective of pedagogical quality and authentic learning dynamics.
@@ -542,11 +579,12 @@ battery_scenarios:
   # ------------------------------------------------------------------------------
   battery_unified_baseline:
     id: "battery_unified_baseline"
+    type: interaction
     name: "Battery: Unified Learner + Baseline Tutor"
     description: "Single-agent learner with baseline tutor configuration"
     architecture: "unified"
     tutor_profile: "baseline"
-    turns: 3
+    turn_count: 3
     topic: "Recognition and self-consciousness"
     learner:
@@ -591,11 +629,12 @@ battery_scenarios:
   # ------------------------------------------------------------------------------
   battery_ego_superego_recognition:
     id: "battery_ego_superego_recognition"
+    type: interaction
     name: "Battery: Ego/Superego Learner + Recognition Tutor"
     description: "Two-agent learner with recognition-focused tutor"
     architecture: "ego_superego"
     tutor_profile: "recognition"
-    turns: 4
+    turn_count: 4
     topic: "The master-slave dialectic"
     learner:
@@ -640,11 +679,12 @@ battery_scenarios:
   # ------------------------------------------------------------------------------
   battery_dialectical_budget:
     id: "battery_dialectical_budget"
+    type: interaction
     name: "Battery: Dialectical Learner + Budget Tutor"
     description: "Thesis-antithesis learner with budget (minimal) tutor"
     architecture: "dialectical"
     tutor_profile: "budget"
-    turns: 3
+    turn_count: 3
     topic: "Sublation and the unity of opposites"
     learner:
@@ -685,15 +725,16 @@ battery_scenarios:
           weight: 0.4
   # ------------------------------------------------------------------------------
-  # Psychodynamic Learner + Recognition Plus Tutor
+  # Ego/Superego Learner + Recognition Plus Tutor
   # ------------------------------------------------------------------------------
   battery_psychodynamic_recognition_plus:
     id: "battery_psychodynamic_recognition_plus"
-    name: "Battery: Psychodynamic Learner + Recognition Plus Tutor"
-    description: "Id/Ego/Superego learner with enhanced recognition tutor"
-    architecture: "psychodynamic"
+    type: interaction
+    name: "Battery: Ego/Superego Learner + Recognition Plus Tutor"
+    description: "Ego/superego learner with enhanced recognition tutor"
+    architecture: "ego_superego"
     tutor_profile: "recognition_plus"
-    turns: 4
+    turn_count: 4
     topic: "Desire and the self in Hegel"
     learner:
@@ -738,11 +779,12 @@ battery_scenarios:
   # ------------------------------------------------------------------------------
   battery_cognitive_quality:
     id: "battery_cognitive_quality"
+    type: interaction
     name: "Battery: Cognitive Learner + Quality Tutor"
     description: "Memory/reasoning/meta cognitive learner with quality tutor"
     architecture: "cognitive"
     tutor_profile: "quality"
-    turns: 4
+    turn_count: 4
     topic: "Spirit and collective consciousness"
     learner:
@@ -787,11 +829,12 @@ battery_scenarios:
   # ------------------------------------------------------------------------------
   battery_extended_dialogue:
     id: "battery_extended_dialogue"
+    type: interaction
     name: "Battery: Extended Multi-Turn Dialogue"
     description: "Longer dialogue to test sustained interaction quality"
     architecture: "ego_superego"
     tutor_profile: "recognition"
-    turns: 8
+    turn_count: 8
     topic: "The stages of consciousness in the Phenomenology"
     learner: