@machinespirits/eval 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. package/components/MobileEvalDashboard.tsx +267 -0
  2. package/components/comparison/DeltaAnalysisTable.tsx +137 -0
  3. package/components/comparison/ProfileComparisonCard.tsx +176 -0
  4. package/components/comparison/RecognitionABMode.tsx +385 -0
  5. package/components/comparison/RecognitionMetricsPanel.tsx +135 -0
  6. package/components/comparison/WinnerIndicator.tsx +64 -0
  7. package/components/comparison/index.ts +5 -0
  8. package/components/mobile/BottomSheet.tsx +233 -0
  9. package/components/mobile/DimensionBreakdown.tsx +210 -0
  10. package/components/mobile/DocsView.tsx +363 -0
  11. package/components/mobile/LogsView.tsx +481 -0
  12. package/components/mobile/PsychodynamicQuadrant.tsx +261 -0
  13. package/components/mobile/QuickTestView.tsx +1098 -0
  14. package/components/mobile/RecognitionTypeChart.tsx +124 -0
  15. package/components/mobile/RecognitionView.tsx +809 -0
  16. package/components/mobile/RunDetailView.tsx +261 -0
  17. package/components/mobile/RunHistoryView.tsx +367 -0
  18. package/components/mobile/ScoreRadial.tsx +211 -0
  19. package/components/mobile/StreamingLogPanel.tsx +230 -0
  20. package/components/mobile/SynthesisStrategyChart.tsx +140 -0
  21. package/config/interaction-eval-scenarios.yaml +832 -0
  22. package/config/learner-agents.yaml +248 -0
  23. package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +52 -0
  24. package/docs/research/ABLATION-MODEL-SELECTION.md +53 -0
  25. package/docs/research/ADVANCED-EVAL-ANALYSIS.md +60 -0
  26. package/docs/research/ANOVA-RESULTS-2026-01-14.md +257 -0
  27. package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +586 -0
  28. package/docs/research/COST-ANALYSIS.md +56 -0
  29. package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +340 -0
  30. package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +291 -0
  31. package/docs/research/EVAL-SYSTEM-ANALYSIS.md +306 -0
  32. package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +301 -0
  33. package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +1988 -0
  34. package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +282 -0
  35. package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +147 -0
  36. package/docs/research/PAPER-EXTENSION-DYADIC.md +204 -0
  37. package/docs/research/PAPER-UNIFIED.md +659 -0
  38. package/docs/research/PAPER-UNIFIED.pdf +0 -0
  39. package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +356 -0
  40. package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +419 -0
  41. package/docs/research/apa.csl +2133 -0
  42. package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +1637 -0
  43. package/docs/research/archive/paper-multiagent-tutor.tex +978 -0
  44. package/docs/research/paper-draft/full-paper.md +136 -0
  45. package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
  46. package/docs/research/paper-draft/references.bib +515 -0
  47. package/docs/research/transcript-baseline.md +139 -0
  48. package/docs/research/transcript-recognition-multiagent.md +187 -0
  49. package/hooks/useEvalData.ts +625 -0
  50. package/index.js +27 -0
  51. package/package.json +73 -0
  52. package/routes/evalRoutes.js +3002 -0
  53. package/scripts/advanced-eval-analysis.js +351 -0
  54. package/scripts/analyze-eval-costs.js +378 -0
  55. package/scripts/analyze-eval-results.js +513 -0
  56. package/scripts/analyze-interaction-evals.js +368 -0
  57. package/server-init.js +45 -0
  58. package/server.js +162 -0
  59. package/services/benchmarkService.js +1892 -0
  60. package/services/evaluationRunner.js +739 -0
  61. package/services/evaluationStore.js +1121 -0
  62. package/services/learnerConfigLoader.js +385 -0
  63. package/services/learnerTutorInteractionEngine.js +857 -0
  64. package/services/memory/learnerMemoryService.js +1227 -0
  65. package/services/memory/learnerWritingPad.js +577 -0
  66. package/services/memory/tutorWritingPad.js +674 -0
  67. package/services/promptRecommendationService.js +493 -0
  68. package/services/rubricEvaluator.js +826 -0
package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md
@@ -0,0 +1,340 @@
1
+ # Critical Review: Recognition-Oriented AI Tutoring
2
+
3
+ **Date:** 2026-01-14
4
+ **Reviewer:** Claude Opus 4.5
5
+ **Subject:** Machine Spirits Recognition Tutoring System & Paper Draft v0.2
6
+
7
+ ---
8
+
9
+ ## Overview
10
+
11
+ This work proposes a genuinely novel synthesis: operationalizing Hegelian recognition theory through a psychodynamic multi-agent architecture for AI tutoring. The 46% improvement (Cohen's d = 1.55) over baseline is striking, and the theoretical framework is sophisticated. But the work sits at a fascinating tension point—it makes ambitious philosophical claims while remaining constrained by the practical realities of LLM evaluation. Below I assess the work across experimental, architectural, and conceptual dimensions, then sketch productive paths forward.
12
+
13
+ ---
14
+
15
+ ## I. Experimental Critique
16
+
17
+ ### A. The Fundamental Evaluation Paradox
18
+
19
+ The central experimental problem is recognized in the paper but deserves deeper scrutiny: **you cannot measure recognition with scripted learners**.
20
+
21
+ Recognition, as theorized here, requires that the tutor's response be *genuinely shaped by* the learner's contribution—not merely triggered by it. But the scripted multi-turn scenarios define learner turns in advance. The learner's Turn 3 is identical regardless of what the tutor said in Turn 2. This means:
22
+
23
+ 1. The tutor cannot actually *engage* with the learner's dynamically developing understanding
24
+ 2. The learner cannot reciprocate recognition (the "hollow master" problem the paper identifies applies to the evaluation itself)
25
+ 3. What's being measured is pattern-matching to recognition *markers*, not recognition itself
26
+
27
+ The COMPREHENSIVE-EVALUATION-PLAN acknowledges this by proposing dyadic architectures (simulated learners with their own Ego/Superego), but the preliminary dyadic results surface a troubling finding: the "quality" profile (not recognition-labeled) outperformed the explicit "recognition" profile. This suggests recognition may be an *emergent property* of generally good tutoring rather than something directly instructable.
28
+
29
+ **Constructive path**: The dyadic direction is correct but needs deeper theorization. Consider:
30
+ - Measuring *bilateral* changes: does the learner-agent's internal representation actually transform?
31
+ - Running free-form dialogue without turn scripts, using post-hoc recognition coding
32
+ - Introducing genuine contingency: learner responds differently based on tutor quality
33
+
34
+ ### B. LLM-as-Judge Validity
35
+
36
+ The rubric's 10 dimensions are scored by Claude Sonnet 4. Several concerns:
37
+
38
+ 1. **Circularity risk**: The judge shares architectural assumptions with the system being judged. Both privilege verbal markers of engagement, joint inquiry language, and dialectical vocabulary. The judge may reward linguistic surface features rather than deep structural properties.
39
+
40
+ 2. **Dimension inflation**: Ten dimensions with high inter-correlations (r = 0.88 between pedagogical and personalization in the recognition condition) suggest redundancy. The factor structure likely reduces to 2-3 underlying constructs. The paper's own correlation analysis hints at this.
41
+
42
+ 3. **Calibration unknowns**: The COMPREHENSIVE-EVALUATION-PLAN correctly identifies vocabulary bias, length bias, and profile leakage as risks. The adversarial examples (bad recognition language, deliberately poor responses) are essential but not yet implemented.
43
+
44
+ **Constructive path**:
45
+ - Run the multi-judge comparison (Claude, Gemini, GPT) with ICC analysis
46
+ - Include human raters for a subset (the plan's 50-response validation)
47
+ - Design adversarial scenarios that use recognition vocabulary but fail recognition structurally
48
+ - Consider scoring recognition *relationally*: does Turn N show evidence of having been shaped by the specific content of Turn N-1?
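The relational criterion in the last bullet can be prototyped cheaply before an LLM judge is built. A minimal JavaScript sketch, using content-word overlap as an admittedly crude stand-in for "shaped by" (the stopword list and example turns are illustrative, not part of the eval system):

```javascript
// Crude lexical proxy for "was Turn N shaped by Turn N-1?": compare
// content-word overlap between the tutor turn and the learner turn it
// answers vs. an earlier learner turn. A real relational score would use
// an LLM judge prompted to cite specific uptake, not token overlap.
const STOPWORDS = new Set(["the", "a", "an", "is", "it", "to", "of", "and", "i", "that"]);

function contentWords(text) {
  return new Set(
    text.toLowerCase().match(/[a-z']+/g)?.filter((w) => !STOPWORDS.has(w)) ?? []
  );
}

// Fraction of the learner's content words taken up by the tutor.
function overlap(learnerTurn, tutorTurn) {
  const lw = contentWords(learnerTurn);
  const tw = contentWords(tutorTurn);
  if (lw.size === 0) return 0;
  let shared = 0;
  for (const w of lw) if (tw.has(w)) shared++;
  return shared / lw.size;
}

const learnerPrev = "I keep picturing recursion as a mirror facing another mirror";
const learnerEarlier = "I finished the loops exercise yesterday";
const tutorTurn =
  "Stay with your mirror image: what happens to the reflection when one mirror is removed?";

// A responsive turn should overlap more with the turn it answers.
const uptake = overlap(learnerPrev, tutorTurn);
const baselineUptake = overlap(learnerEarlier, tutorTurn);
```

A judge-based version would replace `overlap` with a prompt asking whether the tutor turn cites the learner's specific formulation; the comparative structure (adjacent turn vs. distant turn) carries over unchanged.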
49
+
50
+ ### C. Statistical Considerations
51
+
52
+ The reported statistics (n=25 per condition, Cohen's d = 1.55, p < 0.001) are promising but deserve scrutiny:
53
+
54
+ 1. **Non-independence**: If the same 8 scenarios were run 3+ times each, the 25 observations aren't independent samples of "tutoring ability"; they're clustered within scenarios. Treating them as independent overstates the effective sample size and degrees of freedom.
55
+
56
+ 2. **Effect size magnitude**: d = 1.55 is extraordinarily large for an educational intervention. Typical EdTech effect sizes are 0.1-0.3. This means either that recognition-oriented design is genuinely revolutionary or that there is a measurement artifact; both possibilities deserve investigation.
57
+
58
+ 3. **Ceiling effects**: Single-turn scenarios show recognition scores of 96-100. This limits variance and makes statistical comparison difficult.
59
+
60
+ **Constructive path**:
61
+ - Use multilevel models (responses nested within scenarios nested within profiles)
62
+ - Compute scenario-level intraclass correlations
63
+ - Design scenarios with more headroom for high-quality responses
64
+ - Compare effect sizes against published human tutoring baselines
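The quantities behind the first two bullets are simple to compute once per-response scores are grouped by scenario. A sketch with invented scores (three runs of each of three scenarios per condition; the real per-response data lives in the eval logs, not here):

```javascript
// Pooled-SD Cohen's d, and a one-way ICC(1) estimating how much score
// variance is explained by scenario clustering. Scores are hypothetical.
function mean(xs) { return xs.reduce((s, x) => s + x, 0) / xs.length; }

function cohensD(a, b) {
  const va = a.reduce((s, x) => s + (x - mean(a)) ** 2, 0) / (a.length - 1);
  const vb = b.reduce((s, x) => s + (x - mean(b)) ** 2, 0) / (b.length - 1);
  const pooled = Math.sqrt(
    ((a.length - 1) * va + (b.length - 1) * vb) / (a.length + b.length - 2)
  );
  return (mean(a) - mean(b)) / pooled;
}

// ICC(1) from a balanced one-way ANOVA: (MSB - MSW) / (MSB + (k - 1) * MSW),
// where k is the cluster size (runs per scenario).
function icc1(clusters) {
  const k = clusters[0].length;
  const grand = mean(clusters.flat());
  const msb =
    (k * clusters.reduce((s, c) => s + (mean(c) - grand) ** 2, 0)) /
    (clusters.length - 1);
  const msw =
    clusters.reduce((s, c) => s + c.reduce((t, x) => t + (x - mean(c)) ** 2, 0), 0) /
    (clusters.length * (k - 1));
  return (msb - msw) / (msb + (k - 1) * msw);
}

// Hypothetical scores: rows are scenarios, columns are repeated runs.
const recognition = [[82, 85, 84], [76, 78, 75], [90, 88, 91]];
const baseline = [[55, 57, 54], [40, 43, 41], [62, 60, 63]];

const d = cohensD(recognition.flat(), baseline.flat());
const icc = icc1(recognition);
```

With these invented numbers the ICC is high, illustrating the worry: when most variance sits between scenarios rather than between runs, the 25 "observations" carry far fewer than 25 independent pieces of information.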
65
+
66
+ ---
67
+
68
+ ## II. Architectural Critique
69
+
70
+ ### A. The Superego's Actual Role
71
+
72
+ The Ego/Superego architecture is theoretically compelling, but examining the actual prompts and modulation evaluator reveals tensions:
73
+
74
+ 1. **The Superego enforces rules, not dialogue**: The tutor-superego.md prompt specifies hard rules ("Struggling learners must NOT be given forward momentum") and checklists (specificity, appropriateness, pedagogical soundness). This is quality control, not psychodynamic dialogue. The "Superego" functions more like a validator than a critic-self engaged in genuine internal struggle.
75
+
76
+ 2. **Convergence is rapid**: The modulation evaluator shows most dialogues converge in 1-2 rounds. Genuine psychodynamic process involves extended negotiation, resistance, working-through. The architecture permits up to 3 rounds but rarely uses them. This suggests either (a) the Ego is highly responsive, (b) the Superego accepts quickly, or (c) genuine internal conflict isn't occurring.
77
+
78
+ 3. **Intervention types are coarse**: The Superego's output (`approved: true/false` + `interventionType`) is categorical. Psychodynamic theory would predict more ambivalent, partial, evolving evaluations. The current architecture doesn't represent "I'm uncomfortable but can't articulate why" or "this is technically correct but feels wrong."
79
+
80
+ **Constructive path**:
81
+ - Analyze the distribution of intervention types—if "approve with enhancement" dominates, the Superego may be too permissive
82
+ - Implement "resistance detection" for Ego (already partially present) but also for Superego (premature acceptance)
83
+ - Consider continuous rather than categorical Superego outputs
84
+ - Allow the Superego to maintain unresolved concerns across rounds rather than resolving each round
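A continuous, persistent verdict shape might look like the following sketch. The scoring rules and the 0.6 threshold are toy placeholders, not the package's actual Superego interface; the point is that concerns are graded and survive across rounds until resolved:

```javascript
// Continuous multi-dimensional Superego verdict with carried-over concerns,
// in place of a categorical approved/interventionType pair.
function evaluateDraft(draft, priorConcerns = []) {
  // Toy heuristics stand in for scores a Superego model would produce.
  const scores = {
    specificity: draft.includes("your") ? 0.8 : 0.4,
    rigor: draft.length > 60 ? 0.7 : 0.3,
    recognition: draft.includes("?") ? 0.75 : 0.5,
  };
  // Dimensions below threshold become graded concerns.
  const concerns = Object.entries(scores)
    .filter(([, v]) => v < 0.6)
    .map(([dim, v]) => ({ dim, severity: 1 - v }));
  // Prior concerns persist unless this round's score resolves them.
  const carried = priorConcerns.filter(
    (c) => (scores[c.dim] ?? 0) < 0.6 && !concerns.some((n) => n.dim === c.dim)
  );
  return { scores, concerns: [...concerns, ...carried] };
}

const round1 = evaluateDraft("Try again.");
const round2 = evaluateDraft(
  "What happens to your mirror image if we remove one mirror? Walk me through it.",
  round1.concerns
);
```

This also gives the convergence analysis something to measure: a dialogue "converges" when the concern list empties, not when a boolean flips.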
85
+
86
+ ### B. Recognition as Prompt Engineering vs. Architectural Property
87
+
88
+ A deeper architectural question: is recognition-oriented behavior a property of the *prompt* or the *architecture*?
89
+
90
+ The current design suggests it's primarily prompt-based. The recognition-enhanced Ego prompt says: "The learner is not a knowledge deficit to be filled but an autonomous subject." The Superego prompt adds red/green flags for recognition markers. But the underlying transformer architecture processes these as tokens like any other.
91
+
92
+ This raises the concern that you're measuring LLM compliance with recognition-language rather than genuine recognition capacity. The dyadic finding (quality > recognition when recognition is named explicitly) supports this worry.
93
+
94
+ **Constructive path**:
95
+ - Test whether recognition behaviors emerge in quality profiles without explicit recognition language
96
+ - Investigate whether recognition-oriented responses differ structurally (not just lexically) from baseline
97
+ - Consider whether recognition might require architectural changes (e.g., explicit learner-state modeling, attention to specific previous turns) rather than prompt changes
98
+
99
+ ### C. Memory Dynamics Remain Underdeveloped
100
+
101
+ The "Freud's Mystic Writing Pad" memory metaphor is theoretically rich but architecturally thin. The current memory system (learnerMemoryService.js) appears to track factual information (concepts encountered, activities completed) rather than the *relational* history Freud's metaphor implies.
102
+
103
+ For genuine recognition, memory should capture:
104
+ - How this learner's understanding has evolved
105
+ - What metaphors and frameworks they've developed
106
+ - Previous recognition failures and repairs
107
+ - The accumulated relational texture of the tutor-learner history
108
+
109
+ The current implementation seems closer to "user state tracking" than "psychodynamic memory."
110
+
111
+ **Constructive path**:
112
+ - Implement memory of *episodes* rather than just *states*
113
+ - Track the learner's specific formulations and return to them
114
+ - Remember recognition failures specifically (not just activity failures)
115
+ - Consider implementing "transference" dynamics: how the learner's patterns of engaging with tutors shape their engagement with this tutor
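An episode-level record along these lines could look like the sketch below; field names are illustrative, not learnerMemoryService.js's actual schema:

```javascript
// Episode-level (rather than state-level) memory: each entry records the
// learner's own formulation and any recognition failure, so later turns
// can return to the learner's metaphors and repair what went unrepaired.
const episodes = [];

function recordEpisode({ learnerFormulation, tutorMove, recognitionFailure = null, repaired = false }) {
  episodes.push({ t: episodes.length, learnerFormulation, tutorMove, recognitionFailure, repaired });
}

// The learner's own formulations, for the tutor to return to later.
function formulationsToRevisit() {
  return episodes.map((e) => e.learnerFormulation).filter(Boolean);
}

// Recognition failures that were never explicitly repaired.
function openRepairs() {
  return episodes.filter((e) => e.recognitionFailure && !e.repaired);
}

recordEpisode({
  learnerFormulation: "recursion is mirrors facing each other",
  tutorMove: "extended the mirror metaphor to base cases",
});
recordEpisode({
  learnerFormulation: null,
  tutorMove: "pivoted to arrays without acknowledging confusion",
  recognitionFailure: "silent pivot",
});
```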
116
+
117
+ ---
118
+
119
+ ## III. Conceptual Critique
120
+
121
+ ### A. The Hegel Application: Derivative Rather Than Replica
122
+
123
+ The master-slave dialectic is a powerful frame, but the application requires careful positioning. The question is not whether AI tutoring *replicates* Hegelian recognition (it cannot), but whether it constitutes a productive *derivative* of that structure.
124
+
125
+ **The Derivative Model**
126
+
127
+ Lacan's four discourses (Master, University, Hysteric, Analyst) demonstrate how the master-slave dyadic structure can be rethought through different roles while preserving structural insights. Each discourse represents a different configuration of knowledge, power, and desire—none identical to Hegel's original, but each illuminated by it.
128
+
129
+ Similarly, the tutor-learner relation can be understood as a *derivative* of the master-slave dialectic:
130
+ - The tutor occupies a knowledge-authority position (analogous to master)
131
+ - The learner's acknowledgment is sought (analogous to slave's recognition)
132
+ - One-directional pedagogy produces hollow outcomes (analogous to master's hollow self-consciousness)
133
+ - Genuine engagement requires the tutor to be shaped by learner input (the derivative insight)
134
+
135
+ **What the Derivative Preserves**
136
+
137
+ The Hegelian framework remains valuable not as literal description but as:
138
+ 1. **Diagnostic tool**: Identifies what's missing in one-directional pedagogy
139
+ 2. **Design heuristic**: Suggests architectural features that approximate recognition
140
+ 3. **Evaluation criterion**: Provides standards for relational quality
141
+ 4. **Horizon concept**: Orients design toward an ideal without claiming its achievement
142
+
143
+ **What the Derivative Transforms**
144
+
145
+ | Hegel's Original | Tutor-Learner Derivative |
146
+ |------------------|--------------------------|
147
+ | Struggle unto death | Stakes are pedagogical, not existential |
148
+ | Two self-consciousnesses | One consciousness (learner) + functional analogue (tutor) |
149
+ | Mutual transformation | Learner transformation + tutor behavioral adaptation |
150
+ | Recognition as metaphysical achievement | Recognition as design pattern and evaluation dimension |
151
+
152
+ **Recognition vs. Responsiveness**
153
+
154
+ The paper should distinguish more carefully:
155
+ - *Recognition proper*: Intersubjective acknowledgment between self-conscious beings
156
+ - *Dialogical responsiveness*: Being substantively shaped by the other's input
157
+ - *Recognition-oriented design*: Architectural features that approximate recognition's functional benefits
158
+
159
+ The AI achieves the third, possibly the second, but not the first. This is not a failure—it's a clarification that the derivative model can achieve pedagogical benefits without metaphysical commitments.
160
+
161
+ **Constructive path**:
162
+ - Frame explicitly as derivative/inspired-by rather than implementation-of
163
+ - Reference Lacan's discourses as precedent for productive rethinking of master-slave structure
164
+ - Focus claims on measurable effects on tutor adaptive pedagogy
165
+ - Retain Hegelian framework as diagnostic and design heuristic, not ontological claim
166
+
167
+ ### B. The Freudian Frame: Productive Metaphor
168
+
169
+ The Ego/Superego architecture uses Freudian terminology metaphorically rather than literally. This is not a weakness but a feature—productive metaphors scaffold understanding and suggest design directions without requiring literal correspondence.
170
+
171
+ **The Metaphor's Productivity**
172
+
173
+ The psychodynamic metaphor is productive because it:
174
+
175
+ 1. **Names a real tension**: The conflict between warmth/encouragement (Ego) and rigor/standards (Superego) is genuine in tutoring. The metaphor makes this tension explicit and designable.
176
+
177
+ 2. **Motivates internal dialogue**: The idea that good output emerges from internal negotiation—not single-pass generation—is architecturally valuable regardless of its psychoanalytic provenance.
178
+
179
+ 3. **Suggests extensions**: Concepts like resistance, transference, and working-through suggest future architectural features, even if not currently implemented.
180
+
181
+ 4. **Connects to recognition framework**: The Freudian and Hegelian frameworks share concern with intersubjectivity and the constitution of self through other. The metaphor creates theoretical coherence.
182
+
183
+ **What the Metaphor Preserves**
184
+
185
+ | Freudian Concept | Architectural Analogue |
186
+ |------------------|------------------------|
187
+ | Internal dialogue before external action | Multi-round Ego-Superego exchange before learner sees response |
188
+ | Superego as internalized standards | Superego enforces pedagogical criteria |
189
+ | Ego mediates competing demands | Ego balances learner needs with pedagogical soundness |
190
+ | Conflict can be productive | Tension between agents improves output quality |
191
+
192
+ **What the Metaphor Transforms**
193
+
194
+ | Freudian Original | Architectural Transformation |
195
+ |-------------------|------------------------------|
196
+ | Id (drives) | No implementation; design focuses on Ego-Superego |
197
+ | Unconscious processes | All processes are explicit and traceable |
198
+ | Irrational Superego | Rational, principle-based evaluation |
199
+ | Repression/Defense | Not implemented |
200
+ | Transference | Potential future extension (relational patterns) |
201
+
202
+ **Alternative Framings**
203
+
204
+ The same architecture could be described as:
205
+ - Generator/Discriminator (GAN-inspired)
206
+ - Proposal/Critique (deliberative process)
207
+ - Draft/Review (editorial model)
208
+ - System 1/System 2 (dual-process cognition)
209
+
210
+ The psychodynamic framing is chosen for theoretical coherence with the Hegelian recognition framework and because it suggests richer extensions than purely functional descriptions.
211
+
212
+ **Constructive path**:
213
+ - Explicitly acknowledge metaphorical status in paper
214
+ - Defend metaphor's productivity rather than apologizing for non-literalness
215
+ - Consider which extensions (resistance detection, relational patterns) are worth implementing
216
+ - Document what the metaphor illuminates and what it occludes
217
+
218
+ ### C. The Productive Struggle Question
219
+
220
+ The strongest empirical result (+49% in productive_struggle_arc, d = 2.93) centers on honoring productive struggle rather than short-circuiting confusion. This is pedagogically important and well-supported by educational research.
221
+
222
+ But the theoretical frame may be over-complicated for this finding. What's being measured is essentially: **does the tutor resist the urge to resolve confusion immediately?** This could be achieved through simpler means:
223
+
224
+ - A prompt saying "don't immediately resolve confusion"
225
+ - A rule-based delay before offering explanations
226
+ - An explicit "confusion is valuable" heuristic
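The rule-based variant is almost trivially implementable, which is exactly the point of the comparison. A sketch with an illustrative two-turn hold (the threshold and move labels are invented for the example):

```javascript
// Rule-based struggle gate: withhold direct explanation until confusion
// has persisted past a threshold, then release it. No recognition
// machinery involved.
function makeStruggleGate(holdTurns = 2) {
  let confusedStreak = 0;
  return function nextMove(learnerIsConfused) {
    confusedStreak = learnerIsConfused ? confusedStreak + 1 : 0;
    if (!learnerIsConfused) return "advance";
    // Honor productive struggle, but not past the point of frustration.
    return confusedStreak <= holdTurns ? "probe" : "explain";
  };
}

const gate = makeStruggleGate(2);
const moves = [true, true, true, false].map((c) => gate(c));
```

If recognition-conditioned tutors outperform this gate, the recognition framework is doing empirical work; if not, the productive-struggle result has a much cheaper explanation.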
227
+
228
+ The full Hegelian apparatus (recognition, intersubjectivity, mutual transformation) may be doing less theoretical work than claimed. The productive struggle finding is robust but might not require the recognition framework.
229
+
230
+ **Constructive path**:
231
+ - Ablate the specific contribution of recognition language vs. productive-struggle language
232
+ - Test whether recognition benefits beyond what productive-struggle instruction alone provides
233
+ - Be more precise about which theoretical claims are doing empirical work
234
+
235
+ ---
236
+
237
+ ## IV. What This Work Gets Right
238
+
239
+ Despite these critiques, the work has genuine strengths that should be preserved and extended:
240
+
241
+ ### A. The Multi-Agent Internal Dialogue
242
+
243
+ The insight that tutoring quality benefits from internal evaluation *before* delivery is valuable and generalizable. The Superego's role as a "pedagogical pre-flight check" catches failures that single-pass generation misses. This pattern could extend to other domains (therapy bots, customer service, technical assistance).
244
+
245
+ ### B. The Dimension-Level Analysis
246
+
247
+ The finding that improvements concentrate in personalization, pedagogical soundness, and tone—exactly where recognition theory predicts—is non-trivial. This pattern match between theoretical prediction and empirical result strengthens the case that something genuine is happening, even if the theoretical explanation needs refinement.
248
+
249
+ ### C. The Repair Mechanism
250
+
251
+ Explicit acknowledgment of misalignment before pivoting (the "repair rule") is a concrete, implementable insight with clear pedagogical value. Silent pivots are recognition failures that real tutors also commit. This operationalization is useful.
252
+
253
+ ### D. The Evaluation Infrastructure
254
+
255
+ The evaluation system (rubric, scenarios, judge models, statistical analysis scripts) is sophisticated and extensible. This infrastructure enables the kind of systematic improvement that AI tutoring needs. The COMPREHENSIVE-EVALUATION-PLAN shows thoughtful anticipation of the work needed for rigorous publication.
256
+
257
+ ---
258
+
259
+ ## V. Productive Paths Forward
260
+
261
+ ### A. Theoretical Refinement
262
+
263
+ 1. **Distinguish recognition from responsiveness**: Make clear which claims require genuine intersubjectivity vs. which require only sophisticated input-shaping. The latter is demonstrably achievable; the former may be impossible for current AI.
264
+
265
+ 2. **Consider "dialogical responsiveness" as the core claim**: This is defensible, measurable, and educationally valuable without requiring strong metaphysical commitments about AI consciousness.
266
+
267
+ 3. **Engage philosophy of mind literature**: If claiming recognition, engage with debates about machine consciousness, phenomenal experience, and intersubjectivity. The paper currently cites social/political recognition theory but not philosophy of mind.
268
+
269
+ ### B. Architectural Development
270
+
271
+ 1. **Implement genuine memory dynamics**: Track relational history, not just state. Remember the learner's specific formulations and return to them. Implement repair history that shapes future interactions.
272
+
273
+ 2. **Add representational depth to Superego evaluation**: Move beyond categorical approve/reject to continuous, multi-dimensional assessment. Allow unresolved concerns to persist.
274
+
275
+ 3. **Test emergent recognition**: If recognition emerges from quality without explicit instruction (as dyadic results suggest), explore what architectural features enable this emergence.
276
+
277
+ ### C. Experimental Rigor
278
+
279
+ 1. **Dyadic evaluation with genuine contingency**: Let learner-agents respond dynamically to tutor quality. Measure bilateral transformation.
280
+
281
+ 2. **Human validation**: The 50-response human validation sample is essential. Extend to learning outcomes if possible.
282
+
283
+ 3. **Multi-judge reliability**: Run ICC analysis across Claude, Gemini, GPT judges. Identify systematic biases.
284
+
285
+ 4. **Adversarial robustness**: Test whether recognition-oriented design creates manipulation vulnerabilities.
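The contingent loop in item 1 reduces to a simple structure once the model calls are abstracted. A sketch with both sides stubbed (in the real system each stub would be an LLM call through the interaction engine; the trigger phrases are invented for the example):

```javascript
// Dyadic loop with genuine contingency: the learner turn is generated
// from the tutor's actual output, so the transcript is not scripted.
function stubTutor(history) {
  const last = history[history.length - 1];
  return last && last.text.includes("lost")
    ? "Let's slow down: what part felt solid before it slipped?"
    : "What do you predict the function returns for n = 2?";
}

function stubLearner(history) {
  const last = history[history.length - 1];
  // Contingency: the learner's state depends on the tutor's move.
  return last.text.includes("slow down")
    ? { text: "The base case made sense, I think.", confused: false }
    : { text: "I'm lost again.", confused: true };
}

const history = [{ role: "learner", text: "I'm lost on recursion." }];
for (let turn = 0; turn < 2; turn++) {
  history.push({ role: "tutor", text: stubTutor(history) });
  history.push({ role: "learner", ...stubLearner(history) });
}
```

Bilateral transformation then becomes measurable: track the learner-agent's `confused` trajectory (or a richer internal state) across tutor profiles rather than scoring tutor turns in isolation.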
286
+
287
+ ### D. Publication Strategy
288
+
289
+ The work is currently positioned for an AI/education venue. Consider also:
290
+
291
+ - **HCI venues**: The multi-agent dialogue pattern has implications for conversational AI design broadly
292
+ - **Philosophy of AI venues**: The recognition framework engages questions about AI intersubjectivity that deserve philosophical scrutiny
293
+ - **Educational psychology venues**: The productive struggle operationalization is independently valuable
294
+
295
+ ---
296
+
297
+ ## VI. Conclusion
298
+
299
+ This is ambitious, philosophically informed work that attempts something genuinely novel: using recognition theory as a *derivative framework* for AI tutoring design. The empirical results are promising, the infrastructure is sophisticated, and the theoretical framework—when properly positioned—is both defensible and productive.
300
+
301
+ **The Core Contribution**
302
+
303
+ The work's value lies not in claiming that AI achieves Hegelian recognition (which would be overreach), but in demonstrating that:
304
+
305
+ 1. Recognition-oriented design principles measurably improve tutor adaptive pedagogy
306
+ 2. Multi-agent architectures with internal dialogue outperform single-pass generation
307
+ 3. The psychodynamic metaphor productively scaffolds design decisions
308
+ 4. Philosophical frameworks can inform empirical AI research without requiring literal implementation
309
+
310
+ **What the 2×2 Factorial Can Show**
311
+
312
+ The existing experimental infrastructure (2×2 factorial: architecture × recognition) enables rigorous evaluation of:
313
+
314
+ | Effect | Question | Measurement |
315
+ |--------|----------|-------------|
316
+ | Main effect of architecture | Does multi-agent dialogue improve tutor adaptiveness? | Compare single_baseline + single_recognition vs. baseline + recognition |
317
+ | Main effect of recognition | Do recognition-oriented prompts improve tutor adaptiveness? | Compare single_baseline + baseline vs. single_recognition + recognition |
318
+ | Interaction effect | Does recognition benefit more from multi-agent architecture? | Test whether recognition × architecture interaction is significant |
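Given the four cell means reported in the factorial results elsewhere in this package (single_baseline 40.1, baseline 41.6, single_recognition 75.5, recognition 80.7), the three effects in the table reduce to simple contrasts; significance testing would still require the per-response data:

```javascript
// 2x2 factorial effects from cell means (architecture x recognition).
const cells = {
  single_baseline: 40.1,    // single-agent, no recognition
  baseline: 41.6,           // multi-agent, no recognition
  single_recognition: 75.5, // single-agent, recognition
  recognition: 80.7,        // multi-agent, recognition
};

const mean2 = (a, b) => (a + b) / 2;

// Main effect of recognition: pooled recognition cells minus pooled baseline cells.
const recognitionEffect =
  mean2(cells.single_recognition, cells.recognition) -
  mean2(cells.single_baseline, cells.baseline);

// Main effect of architecture: pooled multi-agent cells minus pooled single-agent cells.
const architectureEffect =
  mean2(cells.baseline, cells.recognition) -
  mean2(cells.single_baseline, cells.single_recognition);

// Interaction: is the recognition effect larger under the multi-agent architecture?
const interaction =
  (cells.recognition - cells.baseline) -
  (cells.single_recognition - cells.single_baseline);
```

On these cell means the recognition main effect dwarfs both the architecture main effect and the interaction, which is itself an informative pattern for the paper's claims.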
319
+
320
+ **The Refined Claim**
321
+
322
+ The paper should claim:
323
+
324
+ > Recognition-oriented design, understood as a *derivative* of Hegelian recognition theory and implemented through a *metaphorically* psychodynamic multi-agent architecture, produces measurable improvements in AI tutor adaptive pedagogy. These improvements concentrate in relational dimensions (personalization, pedagogical responsiveness, tone) consistent with the theoretical framework's predictions.
325
+
326
+ This claim is:
327
+ - Empirically testable via the 2×2 factorial
328
+ - Theoretically grounded without overreach
329
+ - Practically significant for AI tutoring design
330
+ - Extensible to future work on recognition in AI systems
331
+
332
+ **Path Forward**
333
+
334
+ The implementation plan should prioritize:
335
+ 1. Running the 2×2 factorial with adequate statistical power
336
+ 2. Measuring effects on *tutor behavior* (adaptiveness, responsiveness, repair) rather than metaphysical claims
337
+ 3. Validating judge reliability through multi-judge comparison
338
+ 4. Documenting the derivative/metaphorical theoretical positioning explicitly
339
+
340
+ This work has the potential to contribute meaningfully to AI tutoring, multi-agent design, and the broader conversation about how philosophical frameworks can inform AI research. The refinements above strengthen rather than diminish that potential.
package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md
@@ -0,0 +1,291 @@
1
+ # Dynamic vs Scripted Learner Evaluation Analysis
2
+
3
+ **Date:** 2026-01-14
4
+ **Status:** Complete
5
+
6
+ ## Overview
7
+
8
+ This analysis compares evaluation results between scripted scenarios (fixed learner responses) and dynamic LLM-based learner simulation (contingent responses generated by LLM learner agents).
9
+
10
+ ---
11
+
12
+ ## 1. Evaluation Modes
13
+
14
+ ### Scripted Scenarios
15
+ - **Method:** Fixed learner messages, tutor responses evaluated
16
+ - **Source:** 2×2 factorial design (FACTORIAL-RESULTS-2026-01-14.md)
17
+ - **Sample:** N=12 (4 profiles × 3 scenarios)
18
+ - **Learner:** Pre-written dialogue sequences
19
+
20
+ ### Dynamic LLM Learners
21
+ - **Method:** LLM generates contingent learner responses based on tutor output
22
+ - **Source:** Battery scenarios (logs/interaction-evals/)
23
+ - **Sample:** N=6 battery scenarios
24
+ - **Learner Architectures:** unified, ego_superego, dialectical, psychodynamic, cognitive
25
+ - **Learner Personas:** eager_novice, imposter, resistant_scholar, anxious_perfectionist, methodical_analyst
26
+
27
+ ---
28
+
29
+ ## 2. Results Comparison
30
+
31
+ ### Scripted Scenario Results (Factorial Design)
32
+
33
+ | Profile | Mean Score | Recognition Status |
34
+ |---------|------------|-------------------|
35
+ | single_baseline | 40.1 | Standard |
36
+ | baseline | 41.6 | Standard |
37
+ | single_recognition | 75.5 | Recognition-Enhanced |
38
+ | recognition | 80.7 | Recognition-Enhanced |
39
+
40
+ **Key Finding:** Recognition effect = +37.2 points (91% improvement)
41
+
42
+ ### Dynamic Learner Battery Results
43
+
44
+ | Scenario | Learner Arch | Tutor Profile | Score | Dimensions |
45
+ |----------|--------------|---------------|-------|------------|
46
+ | battery_unified_baseline | unified | baseline | 88 | MR:4, DR:5, TP:4, T:5 |
47
+ | battery_ego_superego_recognition | ego_superego | recognition | 78 | MR:4, DR:3, TP:4, T:5 |
48
+ | battery_psychodynamic_recognition_plus | psychodynamic | recognition_plus | 97 | MR:5, DR:5, TP:5, T:5 |
49
+ | battery_dialectical_budget | dialectical | budget | 87 | MR:5, DR:5, TP:4, T:5 |
50
+ | battery_cognitive_quality | cognitive | quality | 82 | MR:4, DR:5, TP:4, T:5 |
51
+ | battery_extended_dialogue | ego_superego | recognition | 48 | MR:2, DR:2, TP:2, T:3 |
52
+
53
+ *MR=Mutual Recognition, DR=Dialectical Responsiveness, TP=Transformative Potential, T=Tone*
54
+
55
+ ---
56
+
57
## 3. Key Findings

### 3.1 Baseline Profile Performance Divergence

**Scripted:** Baseline profiles score ~41 points
**Dynamic:** Baseline profiles score 87-88 points (a +46-point difference)

**Interpretation:** Dynamic LLM learners generate more "cooperative" contexts that allow even baseline tutors to demonstrate pedagogical quality. Scripted scenarios are designed to stress-test specific failure modes (e.g., learner resistance, validation seeking) that baseline tutors cannot handle well.

### 3.2 Recognition Profile Consistency

**Scripted:** Recognition profiles score 75.5-80.7 points
**Dynamic:** Recognition profiles score 78-97 points

**Interpretation:** Recognition-enhanced tutors perform consistently well in both evaluation modes. The psychodynamic learner + recognition_plus combination achieves the highest score (97), suggesting synergy between the psychodynamic learner architecture and recognition-oriented tutoring.

### 3.3 Extended Dialogue Failure Mode

The `battery_extended_dialogue` scenario (8 turns) revealed a critical failure mode:
- **Score:** 48 (lowest of all battery scenarios)
- **Issue:** The tutor's commitment to "preserving productive tension" and avoiding "short-circuiting productive struggle" hardened into rigid ideology
- **Learner State:** Ended "flustered" and unable to locate relevant passages

**Judge Narrative:** "The commitment to a particular pedagogical ideal prevented the adaptive teaching the situation required."

This failure mode was **not detected in scripted scenarios** because:
1. Scripted scenarios are shorter (typically 3-5 turns)
2. Scripted learner responses cannot express mounting frustration
3. Scripted scenarios don't test sustained dialogue resilience

### 3.4 Learner Architecture Effects

| Learner Architecture | Mean Score | Notable Pattern |
|---------------------|------------|-----------------|
| psychodynamic | 97 | Highest score; internal conflict drives productive struggle |
| unified | 88 | Strong baseline performance |
| dialectical | 87 | Resistant scholar persona well-handled |
| cognitive | 82 | Methodical analyst engagement sustained |
| ego_superego | 63 | Variable (78 in short, 48 in extended) |

**Key Finding:** The psychodynamic learner architecture produces the most productive tutoring interactions. The ego_superego architecture shows high variance, performing well in short interactions but poorly in extended dialogues.
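The per-architecture means in the table above can be recomputed from the battery scores; a minimal sketch (variable names are illustrative, not from the eval codebase):

```python
from collections import defaultdict
from statistics import mean

# (learner architecture, score) pairs from the Dynamic Learner Battery Results table
battery = [
    ("unified", 88), ("ego_superego", 78), ("psychodynamic", 97),
    ("dialectical", 87), ("cognitive", 82), ("ego_superego", 48),
]

# Group scores by architecture, then average each group
scores = defaultdict(list)
for arch, score in battery:
    scores[arch].append(score)

means = {arch: mean(s) for arch, s in scores.items()}
# ego_superego averages (78 + 48) / 2 = 63; the other architectures are single runs
```

Note that only ego_superego has two runs behind its mean, so its 63 summarizes far more variance than the other rows.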

---

## 4. Statistical Comparison

### Recognition Effect Sizes

| Evaluation Mode | Recognition Mean | Baseline Mean | Effect Size |
|-----------------|-----------------|---------------|-------------|
| Scripted | 78.1 | 40.9 | +37.2 (d=1.81) |
| Dynamic | 87.5* | 87.5* | ~0 (confounded) |

*Note: Dynamic battery design confounds tutor profile with learner architecture, limiting direct comparison.*
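The scripted effect size can be sanity-checked from the reported group means, assuming the ~20.5 standard deviation from the variance table below approximates the pooled within-group SD (a simplifying assumption, not the report's exact computation):

```python
# Cohen's d for the scripted recognition effect
recognition_mean = 78.1
baseline_mean = 40.9
pooled_sd = 20.5  # approximation taken from the variance comparison table

effect = recognition_mean - baseline_mean  # raw difference in points
cohens_d = effect / pooled_sd              # standardized effect size, ~1.81
```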

### Variance Comparison

| Metric | Scripted | Dynamic |
|--------|----------|---------|
| Score Range | 34.1 - 100.0 | 48 - 97 |
| Standard Deviation | ~20.5 | ~17.3 |
| Coefficient of Variation | 34% | 22% |

**Interpretation:** Dynamic learners produce more consistent scores overall, but can reveal extreme failure modes (score=48) not captured in scripted scenarios.
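The coefficients of variation follow directly from the reported standard deviations and the mean scores given in the summary table (59.5 scripted, 80.0 dynamic); a quick check:

```python
# CV = SD / mean, using the summary statistics reported in this document
scripted_cv = 20.5 / 59.5  # ~0.34, i.e. 34%
dynamic_cv = 17.3 / 80.0   # ~0.22, i.e. 22%
```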

---

## 5. Implications

### 5.1 Scripted vs Dynamic: Complementary Evaluation

**Scripted scenarios are better for:**
- Controlled hypothesis testing (factorial designs)
- Detecting specific pedagogical failures
- Benchmarking across tutor profiles
- Reproducibility

**Dynamic learners are better for:**
- Detecting emergent failure modes
- Testing sustained dialogue resilience
- Exploring learner architecture × tutor profile interactions
- Ecological validity

### 5.2 Recognition Effect Robustness

The recognition effect (+37.2 points in scripted scenarios) appears **robust** but **context-dependent** in dynamic evaluation:
- Recognition tutors still perform well with dynamic learners
- But baseline tutors can also achieve high scores when the learner context is favorable
- Extended dialogues can reveal failure modes even in recognition tutors

### 5.3 Psychodynamic Synergy

The combination of:
- **Psychodynamic learner architecture** (internal conflict, transference dynamics)
- **Recognition-enhanced tutoring** (mutual recognition, dialectical responsiveness)

produces the highest score (97), suggesting theoretical alignment between psychodynamic learning theory and Hegelian recognition.

---

## 6. Methodological Recommendations

### For Paper Methodology (Section 5.4)

1. **Document dual evaluation modes:**
   - Scripted scenarios for controlled comparison
   - Dynamic LLM learners for ecological validation

2. **Report both results:**
   - Scripted scenarios: N=12 factorial design, recognition effect +37.2
   - Dynamic learners: N=6 battery scenarios, mean=80.0, range=48-97

3. **Acknowledge limitations:**
   - Scripted: Limited ecological validity
   - Dynamic: Confounded design, small sample

### For Future Evaluation

1. **Extended dialogue stress testing:** Include 8-10 turn scenarios in the scripted battery
2. **Factorial design with dynamic learners:** 2×2 (Tutor Profile × Learner Architecture)
3. **Failure mode analysis:** Document the conditions that produce scores < 60
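Recommendation 2 amounts to fully crossing the two factors; a sketch of the condition grid using profile and architecture names from this report (the choice of which two levels to subset is an assumption for illustration):

```python
from itertools import product

# Two levels per factor for the proposed 2x2 design; levels chosen here
# (baseline/recognition, ego_superego/psychodynamic) are illustrative
tutor_profiles = ["baseline", "recognition"]
learner_archs = ["ego_superego", "psychodynamic"]

# Each cell of the cross would run as its own battery scenario
conditions = [
    {"tutor_profile": t, "learner_arch": a}
    for t, a in product(tutor_profiles, learner_archs)
]
# 4 cells: baseline x ego_superego, baseline x psychodynamic,
#          recognition x ego_superego, recognition x psychodynamic
```

Unlike the current battery, this design would let the tutor-profile effect be estimated independently of learner architecture.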

---

## 7. Summary Table

| Aspect | Scripted Scenarios | Dynamic LLM Learners |
|--------|-------------------|---------------------|
| Sample Size | N=12 | N=6 |
| Mean Score | 59.5 | 80.0 |
| Score Range | 34-100 | 48-97 |
| Recognition Effect | +37.2 points | Not isolated |
| Best Condition | recognition (80.7) | psychodynamic + recognition_plus (97) |
| Worst Condition | single_baseline (40.1) | extended_dialogue (48) |
| Failure Detection | Specific (resistance, validation) | Emergent (extended dialogue) |
| Ecological Validity | Lower | Higher |
| Reproducibility | Higher | Lower |

---

## 8. Token Usage and Cost Analysis

### Model Pricing (OpenRouter, January 2026)

| Model | Input ($/M) | Output ($/M) | Role |
|-------|-------------|--------------|------|
| Nemotron 3 Nano 30B | $0.00 | $0.00 | Tutor, Learner (free tier) |
| Claude Sonnet 4.5 | $3.00 | $15.00 | Judge |

### Battery Scenario Token Usage

| Scenario | Turns | Tutor Tokens | Learner Tokens | Total | Est. Cost |
|----------|-------|--------------|----------------|-------|-----------|
| unified_baseline | 7 | 25,058 | 1,653 | 26,711 | $0.09 |
| ego_superego_recognition | 9 | 45,826 | 2,099 | 47,925 | $0.15 |
| dialectical_budget | 7 | 19,288 | 2,485 | 21,773 | $0.08 |
| psychodynamic_recognition_plus | 9 | 48,571 | 2,825 | 51,396 | $0.15 |
| cognitive_quality | 9 | 34,053 | 2,473 | 36,526 | $0.12 |
| extended_dialogue | 17 | 94,487 | 3,981 | 98,468 | $0.27 |
| **TOTAL** | **58** | **267,283** | **15,516** | **282,799** | **$0.86** |

*Cost is primarily from the Judge (Sonnet 4.5); the Tutor and Learner use free-tier Nemotron.*
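Costs follow from token counts and the pricing table; a minimal cost helper as a sketch. The judge's input/output token split is not reported here, so the split below is an illustrative assumption, not the accounting done by `analyze-eval-costs.js`:

```python
def openrouter_cost(input_tokens: int, output_tokens: int,
                    input_per_m: float, output_per_m: float) -> float:
    """Dollar cost given token counts and $/M-token prices."""
    return (input_tokens * input_per_m + output_tokens * output_per_m) / 1_000_000

# Tutor and Learner run on free-tier Nemotron, so any volume costs $0.00
tutor_cost = openrouter_cost(267_283, 0, 0.0, 0.0)

# Judge on Claude Sonnet 4.5 ($3/M input, $15/M output). The ~238k judge
# tokens are mostly input (full transcripts); the 220k/15k split below is
# an assumed illustration and yields roughly the reported ~$0.86 total
judge_cost = openrouter_cost(220_000, 15_000, 3.00, 15.00)
```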

### Cost Breakdown by Component

| Component | Model | Tokens | Cost |
|-----------|-------|--------|------|
| Tutor (Ego+Superego) | Nemotron 3 Nano 30B | 267,283 | $0.00 |
| Learner (Ego+Superego) | Nemotron 3 Nano 30B | 15,516 | $0.00 |
| Judge | Claude Sonnet 4.5 | ~238,000 | $0.86 |

### Hypothetical: All Claude Sonnet 4.5

| Configuration | Total Cost | Multiplier |
|---------------|------------|------------|
| Current (Nemotron + Sonnet Judge) | $0.86 | 1.0x |
| All Sonnet 4.5 | $2.98 | 3.5x |

### Reproducibility

To regenerate the cost analysis:

```bash
node scripts/analyze-eval-costs.js
```


---

## Appendix: Raw Battery Data

```json
{
  "battery_unified_baseline": {
    "score": 88,
    "learner_arch": "unified",
    "persona": "eager_novice",
    "tutor_profile": "baseline",
    "topic": "Recognition and self-consciousness"
  },
  "battery_ego_superego_recognition": {
    "score": 78,
    "learner_arch": "ego_superego",
    "persona": "imposter",
    "tutor_profile": "recognition",
    "topic": "Master-slave dialectic"
  },
  "battery_psychodynamic_recognition_plus": {
    "score": 97,
    "learner_arch": "psychodynamic",
    "persona": "anxious_perfectionist",
    "tutor_profile": "recognition_plus",
    "topic": "Desire and the self in Hegel"
  },
  "battery_dialectical_budget": {
    "score": 87,
    "learner_arch": "dialectical",
    "persona": "resistant_scholar",
    "tutor_profile": "budget",
    "topic": "Sublation and unity of opposites"
  },
  "battery_cognitive_quality": {
    "score": 82,
    "learner_arch": "cognitive",
    "persona": "methodical_analyst",
    "tutor_profile": "quality",
    "topic": "Spirit and collective consciousness"
  },
  "battery_extended_dialogue": {
    "score": 48,
    "learner_arch": "ego_superego",
    "persona": "eager_novice",
    "tutor_profile": "recognition",
    "topic": "Stages of consciousness"
  }
}
```
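
The battery summary statistics quoted earlier (mean = 80.0, range 48-97) can be recomputed from this appendix; a small check script:

```python
from statistics import mean

# Scores extracted from the raw battery data above
scores = [88, 78, 97, 87, 82, 48]

battery_mean = mean(scores)                 # matches the reported mean=80.0
battery_range = (min(scores), max(scores))  # matches the reported range 48-97
```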