npm - @machinespirits/eval - Versions diffs - 0.1.2 → 0.2.1 - Mend

@machinespirits/eval 0.1.2 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (102) hide show

package/LICENSE +21 -0
package/README.md +161 -0
package/config/eval-settings.yaml +18 -0
package/config/evaluation-rubric-learner.yaml +277 -0
package/config/evaluation-rubric.yaml +613 -0
package/config/interaction-eval-scenarios.yaml +93 -50
package/config/learner-agents.yaml +124 -193
package/config/machinespirits-eval.code-workspace +11 -0
package/config/providers.yaml +60 -0
package/config/suggestion-scenarios.yaml +1399 -0
package/config/tutor-agents.yaml +716 -0
package/docs/EVALUATION-VARIABLES.md +589 -0
package/docs/REPLICATION-PLAN.md +577 -0
package/index.js +15 -6
package/package.json +16 -22
package/routes/evalRoutes.js +88 -36
package/scripts/analyze-judge-reliability.js +401 -0
package/scripts/analyze-run.js +97 -0
package/scripts/analyze-run.mjs +282 -0
package/scripts/analyze-validation-failures.js +141 -0
package/scripts/check-run.mjs +17 -0
package/scripts/code-impasse-strategies.js +1132 -0
package/scripts/compare-runs.js +44 -0
package/scripts/compare-suggestions.js +80 -0
package/scripts/compare-transformation.js +116 -0
package/scripts/dig-into-run.js +158 -0
package/scripts/eval-cli.js +2626 -0
package/scripts/generate-paper-figures.py +452 -0
package/scripts/qualitative-analysis-ai.js +1313 -0
package/scripts/qualitative-analysis.js +688 -0
package/scripts/seed-db.js +87 -0
package/scripts/show-failed-suggestions.js +64 -0
package/scripts/validate-content.js +192 -0
package/server.js +3 -2
package/services/__tests__/evalConfigLoader.test.js +338 -0
package/services/anovaStats.js +499 -0
package/services/contentResolver.js +407 -0
package/services/dialogueTraceAnalyzer.js +454 -0
package/services/evalConfigLoader.js +625 -0
package/services/evaluationRunner.js +2171 -270
package/services/evaluationStore.js +564 -29
package/services/learnerConfigLoader.js +75 -5
package/services/learnerRubricEvaluator.js +284 -0
package/services/learnerTutorInteractionEngine.js +375 -0
package/services/processUtils.js +18 -0
package/services/progressLogger.js +98 -0
package/services/promptRecommendationService.js +31 -26
package/services/promptRewriter.js +427 -0
package/services/rubricEvaluator.js +543 -70
package/services/streamingReporter.js +104 -0
package/services/turnComparisonAnalyzer.js +494 -0
package/components/MobileEvalDashboard.tsx +0 -267
package/components/comparison/DeltaAnalysisTable.tsx +0 -137
package/components/comparison/ProfileComparisonCard.tsx +0 -176
package/components/comparison/RecognitionABMode.tsx +0 -385
package/components/comparison/RecognitionMetricsPanel.tsx +0 -135
package/components/comparison/WinnerIndicator.tsx +0 -64
package/components/comparison/index.ts +0 -5
package/components/mobile/BottomSheet.tsx +0 -233
package/components/mobile/DimensionBreakdown.tsx +0 -210
package/components/mobile/DocsView.tsx +0 -363
package/components/mobile/LogsView.tsx +0 -481
package/components/mobile/PsychodynamicQuadrant.tsx +0 -261
package/components/mobile/QuickTestView.tsx +0 -1098
package/components/mobile/RecognitionTypeChart.tsx +0 -124
package/components/mobile/RecognitionView.tsx +0 -809
package/components/mobile/RunDetailView.tsx +0 -261
package/components/mobile/RunHistoryView.tsx +0 -367
package/components/mobile/ScoreRadial.tsx +0 -211
package/components/mobile/StreamingLogPanel.tsx +0 -230
package/components/mobile/SynthesisStrategyChart.tsx +0 -140
package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +0 -52
package/docs/research/ABLATION-MODEL-SELECTION.md +0 -53
package/docs/research/ADVANCED-EVAL-ANALYSIS.md +0 -60
package/docs/research/ANOVA-RESULTS-2026-01-14.md +0 -257
package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +0 -586
package/docs/research/COST-ANALYSIS.md +0 -56
package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +0 -340
package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +0 -291
package/docs/research/EVAL-SYSTEM-ANALYSIS.md +0 -306
package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +0 -301
package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +0 -1988
package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +0 -282
package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +0 -147
package/docs/research/PAPER-EXTENSION-DYADIC.md +0 -204
package/docs/research/PAPER-UNIFIED.md +0 -659
package/docs/research/PAPER-UNIFIED.pdf +0 -0
package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +0 -356
package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +0 -419
package/docs/research/apa.csl +0 -2133
package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +0 -1637
package/docs/research/archive/paper-multiagent-tutor.tex +0 -978
package/docs/research/paper-draft/full-paper.md +0 -136
package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
package/docs/research/paper-draft/references.bib +0 -515
package/docs/research/transcript-baseline.md +0 -139
package/docs/research/transcript-recognition-multiagent.md +0 -187
package/hooks/useEvalData.ts +0 -625
package/server-init.js +0 -45
package/services/benchmarkService.js +0 -1892
package/types.ts +0 -165
package/utils/haptics.ts +0 -45

package/config/learner-agents.yaml CHANGED Viewed

@@ -2,212 +2,166 @@
 # Defines how the simulated learner generates responses during evaluation
 #
 # ============================================================================
-# LEARNER ARCHITECTURES
+# MODEL OVERRIDES (optional)
 # ============================================================================
+# These override ALL profile models when uncommented. Useful for quick testing.
+# CLI flags (--model, --ego-model, --superego-model) take precedence over these.
 #
-# The learner architecture determines whether the simulated learner has
+# model_override: openrouter.haiku        # Override ALL models (ego + superego + synthesis)
+# ego_model_override: openrouter.nemotron # Override only ego model
+# superego_model_override: openrouter.kimi-k2.5  # Override only superego model
+#
+# ============================================================================
+# LEARNER PROFILES
+# ============================================================================
+#
+# The learner profile determines whether the simulated learner has
 # internal deliberation before generating responses. This enables testing
 # whether multi-agent learner simulation improves evaluation validity.
 #
-# Architectures:
+# Profiles:
 # 1. unified: Single learner agent (no internal dialogue)
-# 2. psychodynamic: Freudian desire/intellect/aspiration deliberation
-# 3. dialectical: Hegelian thesis/antithesis/synthesis process
+# 2. ego_superego: Two-agent ego/superego deliberation (mirrors tutor architecture)
+# 3. psychodynamic: Legacy alias → resolves to ego_superego
 #
 # ============================================================================
-# Active architecture (can be overridden by tutor profile)
+# Active profile (can be overridden by tutor profile's learner_architecture)
 active_architecture: unified
 # ============================================================================
-# ARCHITECTURE DEFINITIONS
+# PROFILES
 # ============================================================================
-architectures:
-  # Unified: Single agent with no internal deliberation
-  # Simple, fast, good for baseline comparisons
+profiles:
+  # Single-agent: no internal deliberation
   unified:
-    name: "Unified Learner"
-    description: "Single learner agent without internal deliberation"
-    deliberation:
+    description: "Single unified learner agent"
+    architecture: unified
+    unified_learner:
+      provider: openrouter
+      model: kimi-k2.5
+      prompt_file: learner-unified.md
+      hyperparameters:
+        temperature: 0.7
+        max_tokens: 500
+    dialogue:
       enabled: false
       max_rounds: 0
-    agent:
+  # Two-agent: mirrors tutor ego/superego pattern
+  ego_superego:
+    description: "Ego/superego learner — mirrors tutor architecture"
+    architecture: ego_superego
+    ego:
       provider: openrouter
       model: nemotron
-      prompt_file: learner-unified.md
+      prompt_file: learner-ego.md
       hyperparameters:
         temperature: 0.7
+        max_tokens: 400
+    superego:
+      provider: openrouter
+      model: kimi-k2.5
+      prompt_file: learner-superego.md
+      hyperparameters:
+        temperature: 0.5
+        max_tokens: 400
+    synthesis:
+      provider: openrouter
+      model: kimi-k2.5
+      prompt_file: learner-synthesis.md
+      hyperparameters:
+        temperature: 0.6
         max_tokens: 500
-  # Psychodynamic: Freudian-inspired desire/intellect/aspiration
-  # Internal deliberation between:
-  # - Desire (Id-like): What the learner wants emotionally
-  # - Intellect (Ego-like): Rational analysis of the situation
-  # - Aspiration (Superego-like): Idealized learning goals
-  psychodynamic:
-    name: "Psychodynamic Learner"
-    description: "Freudian-inspired internal deliberation between desire, intellect, and aspiration"
-    deliberation:
+    dialogue:
       enabled: true
       max_rounds: 2
-      convergence_threshold: 0.7
-    agents:
-      desire:
-        role: "id"
-        description: "Emotional/affective responses - what the learner WANTS"
-        provider: openrouter
-        model: nemotron
-        prompt_file: learner-desire.md
-        hyperparameters:
-          temperature: 0.8  # Higher temp for more emotional/varied responses
-          max_tokens: 400
-      intellect:
-        role: "ego"
-        description: "Rational analysis - what the learner THINKS"
-        provider: openrouter
-        model: nemotron
-        prompt_file: learner-intellect.md
-        hyperparameters:
-          temperature: 0.5  # Lower temp for more analytical responses
-          max_tokens: 400
-      aspiration:
-        role: "superego"
-        description: "Idealized goals - what the learner SHOULD want"
-        provider: openrouter
-        model: nemotron
-        prompt_file: learner-aspiration.md
-        hyperparameters:
-          temperature: 0.6
-          max_tokens: 400
-      synthesizer:
-        description: "Integrates the three voices into a coherent response"
-        provider: openrouter
-        model: nemotron
-        prompt_file: learner-synthesizer.md
-        hyperparameters:
-          temperature: 0.6
-          max_tokens: 500
-    deliberation_process: |
-      The psychodynamic learner simulates internal conflict:
-      1. DESIRE (Id): "I want to skip ahead / I'm bored / This is frustrating"
-         - Immediate emotional reactions
-         - Avoidance tendencies
-         - Curiosity and excitement
-      2. INTELLECT (Ego): "This doesn't make sense yet / I need more examples"
-         - Rational assessment of understanding
-         - Strategic thinking about learning path
-         - Reality testing
-      3. ASPIRATION (Superego): "I should master this / I want to be an expert"
-         - Long-term learning goals
-         - Internalized expectations
-         - Self-improvement drives
-      4. SYNTHESIS: Integration into coherent learner response
-         - Balances immediate desires with long-term goals
-         - Produces realistic learner behavior
+  # Recognition-enhanced: single unified learner with recognition prompts
+  unified_recognition:
+    description: "Single unified learner with recognition-aware prompt"
+    architecture: unified
+    unified_learner:
+      provider: openrouter
+      model: kimi-k2.5
+      prompt_file: learner-unified.md
+      hyperparameters:
+        temperature: 0.7
+        max_tokens: 600
+    dialogue:
+      enabled: false
+      max_rounds: 0
-  # Dialectical: Hegelian thesis/antithesis/synthesis
-  # Internal deliberation where:
-  # - Thesis: Initial position/understanding
-  # - Antithesis: Challenge/complication to that position
-  # - Synthesis: New integrated understanding
-  dialectical:
-    name: "Dialectical Learner"
-    description: "Hegelian-inspired internal dialectic between thesis, antithesis, and synthesis"
-    deliberation:
+  # Recognition-enhanced: ego/superego with recognition-specific prompts
+  ego_superego_recognition:
+    description: "Ego/superego learner with recognition-aware prompts and memory"
+    architecture: ego_superego
+    ego:
+      provider: openrouter
+      model: nemotron
+      prompt_file: learner-ego-recognition.md
+      hyperparameters:
+        temperature: 0.7
+        max_tokens: 600
+    superego:
+      provider: openrouter
+      model: kimi-k2.5
+      prompt_file: learner-superego-recognition.md
+      hyperparameters:
+        temperature: 0.5
+        max_tokens: 600
+    synthesis:
+      provider: openrouter
+      model: kimi-k2.5
+      prompt_file: learner-synthesis-recognition.md
+      hyperparameters:
+        temperature: 0.6
+        max_tokens: 700
+    dialogue:
       enabled: true
       max_rounds: 2
-      convergence_threshold: 0.7
-    agents:
-      thesis:
-        role: "thesis"
-        description: "Initial understanding or position"
-        provider: openrouter
-        model: nemotron
-        prompt_file: learner-thesis.md
-        hyperparameters:
-          temperature: 0.6
-          max_tokens: 400
-      antithesis:
-        role: "antithesis"
-        description: "Challenge or complication to the thesis"
-        provider: openrouter
-        model: nemotron
-        prompt_file: learner-antithesis.md
-        hyperparameters:
-          temperature: 0.7
-          max_tokens: 400
-      synthesis:
-        role: "synthesis"
-        description: "Integration that preserves and overcomes the tension"
-        provider: openrouter
-        model: nemotron
-        prompt_file: learner-synthesis.md
-        hyperparameters:
-          temperature: 0.6
-          max_tokens: 500
-    deliberation_process: |
-      The dialectical learner simulates Hegelian movement:
-      1. THESIS: "I understand X as..."
-         - Initial grasp of the concept
-         - Current mental model
-         - Working hypothesis
-      2. ANTITHESIS: "But wait, what about Y? That complicates things..."
-         - Internal contradiction discovered
-         - New information that doesn't fit
-         - Productive confusion
-      3. SYNTHESIS: "So actually, X and Y together mean..."
-         - New understanding that integrates both
-         - Aufhebung: preserves while overcoming
-         - Readiness for next dialectical cycle
+  # Legacy alias so existing 'psychodynamic' references resolve to ego_superego
+  psychodynamic:
+    description: "Legacy alias — uses ego/superego architecture"
+    architecture: ego_superego
+    ego:
+      provider: openrouter
+      model: nemotron
+      prompt_file: learner-ego.md
+      hyperparameters:
+        temperature: 0.7
+        max_tokens: 400
+    superego:
+      provider: openrouter
+      model: kimi-k2.5
+      prompt_file: learner-superego.md
+      hyperparameters:
+        temperature: 0.5
+        max_tokens: 400
+    synthesis:
+      provider: openrouter
+      model: kimi-k2.5
+      prompt_file: learner-synthesis.md
+      hyperparameters:
+        temperature: 0.6
+        max_tokens: 500
+    dialogue:
+      enabled: true
+      max_rounds: 2
 # ============================================================================
 # PERSONA MODIFIERS
 # ============================================================================
-# These modifiers adjust the base architecture based on learner persona
+# These modifiers adjust learner behavior based on persona
 persona_modifiers:
-  confused_novice:
-    desire_weight: 0.4      # Strong emotional reactions to confusion
-    intellect_weight: 0.3   # Struggles with analysis
-    aspiration_weight: 0.3  # Wants to succeed but uncertain
-  eager_explorer:
-    desire_weight: 0.5      # Curiosity-driven
-    intellect_weight: 0.3   # Quick but sometimes shallow analysis
-    aspiration_weight: 0.2  # Less concerned with "should"
-  focused_achiever:
-    desire_weight: 0.2      # Controlled emotional reactions
-    intellect_weight: 0.4   # Strong analytical focus
-    aspiration_weight: 0.4  # Clear goals
-  struggling_anxious:
-    desire_weight: 0.5      # Strong anxiety-driven responses
-    intellect_weight: 0.2   # Anxiety impairs analysis
-    aspiration_weight: 0.3  # High expectations create pressure
-  adversarial_tester:
-    desire_weight: 0.3      # Enjoys challenging
-    intellect_weight: 0.4   # Analytical about finding weaknesses
-    aspiration_weight: 0.3  # Wants to be thorough
+  confused_novice: {}
+  eager_explorer: {}
+  focused_achiever: {}
+  struggling_anxious: {}
+  adversarial_tester: {}
 # ============================================================================
 # EVALUATION SETTINGS
@@ -222,27 +176,4 @@ evaluation:
   metrics:
     - deliberation_rounds
     - internal_coherence
-    - desire_intellect_tension
-    - aspiration_alignment
     - response_authenticity
-# ============================================================================
-# ABLATION STUDY SUPPORT
-# ============================================================================
-ablation:
-  # Mapping of ablation profiles to learner architectures
-  profile_architectures:
-    ablation_baseline_unified: unified
-    ablation_baseline_multilearner: psychodynamic
-    ablation_multiagent_unified: unified
-    ablation_multiagent_multilearner: psychodynamic
-    ablation_recognition_unified: unified
-    ablation_recognition_multilearner: psychodynamic
-    ablation_recognition_multiagent_unified: unified
-    ablation_recognition_multiagent_multilearner: psychodynamic
-  # Which architectures to compare in ablation studies
-  architectures_to_compare:
-    - unified
-    - psychodynamic

package/config/machinespirits-eval.code-workspace ADDED Viewed

@@ -0,0 +1,11 @@
+{
+	"folders": [
+		{
+			"path": ".."
+		},
+		{
+			"path": "../../machinespirits-tutor-core"
+		}
+	],
+	"settings": {}
+}

package/config/providers.yaml ADDED Viewed

@@ -0,0 +1,60 @@
+# Shared AI Provider Configuration
+# Used by both tutor-agents.yaml and evaluation-rubric.yaml
+#
+# Model IDs are current as of January 2026. Update when new models release.
+providers:
+  anthropic:
+    api_key_env: ANTHROPIC_API_KEY
+    base_url: https://api.anthropic.com/v1/messages
+    default_model: claude-sonnet-4-5
+    models:
+      haiku: claude-haiku-4-5
+      sonnet: claude-sonnet-4-5
+      opus: claude-opus-4-5
+  openai:
+    api_key_env: OPENAI_API_KEY
+    base_url: https://api.openai.com/v1/chat/completions
+    default_model: gpt-5-mini
+    models:
+      mini: gpt-5-mini
+      standard: gpt-5.2
+  openrouter:
+    api_key_env: OPENROUTER_API_KEY
+    base_url: https://openrouter.ai/api/v1/chat/completions
+    default_model: nvidia/nemotron-3-nano-30b-a3b:free
+    models:
+      # Budget-friendly options (only IDs with the :free suffix are free tier)
+      nemotron: nvidia/nemotron-3-nano-30b-a3b:free
+      glm47: z-ai/glm-4.7
+      kimi-k2: moonshotai/kimi-k2-thinking
+      "kimi-k2.5": moonshotai/kimi-k2.5
+      deepseek: deepseek/deepseek-v3.2
+      minimax: minimax/minimax-m2.1
+      haiku: anthropic/claude-haiku-4.5
+      gpt-oss: openai/gpt-oss-120b
+      # Mid-tier options
+      sonnet: anthropic/claude-sonnet-4.5
+      gpt-mini: openai/gpt-5-mini
+      gemini-flash: google/gemini-3-flash-preview
+      # Premium options
+      opus: anthropic/claude-opus-4.5
+      gpt: openai/gpt-5.2
+      gemini-pro: google/gemini-3-pro-preview
+  gemini:
+    api_key_env: GEMINI_API_KEY
+    base_url: https://generativelanguage.googleapis.com/v1beta/models
+    default_model: gemini-3-flash-preview
+    models:
+      flash: gemini-3-flash-preview
+      pro: gemini-3-pro-preview
+  local:
+    base_url: http://localhost:1234/v1/chat/completions
+    format: openai
+    default_model: local-model
+    models:
+      default: local-model