@machinespirits/eval 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109)
  1. package/LICENSE +21 -0
  2. package/README.md +161 -0
  3. package/config/eval-settings.yaml +18 -0
  4. package/config/evaluation-rubric-learner.yaml +277 -0
  5. package/config/evaluation-rubric.yaml +613 -0
  6. package/config/interaction-eval-scenarios.yaml +93 -50
  7. package/config/learner-agents.yaml +124 -193
  8. package/config/providers.yaml +60 -0
  9. package/config/suggestion-scenarios.yaml +1399 -0
  10. package/config/tutor-agents.yaml +716 -0
  11. package/docs/EVALUATION-VARIABLES.md +589 -0
  12. package/docs/REPLICATION-PLAN.md +577 -0
  13. package/docs/research/build.sh +74 -0
  14. package/docs/research/figures/figure1.png +0 -0
  15. package/docs/research/figures/figure2.png +0 -0
  16. package/docs/research/figures/figure3.png +0 -0
  17. package/docs/research/figures/figure4.png +0 -0
  18. package/docs/research/figures/figure5.png +0 -0
  19. package/docs/research/figures/figure6.png +0 -0
  20. package/docs/research/header.tex +4 -0
  21. package/docs/research/paper-full.md +1909 -0
  22. package/docs/research/paper-short.md +805 -0
  23. package/docs/research/references.bib +1011 -0
  24. package/index.js +15 -6
  25. package/package.json +15 -21
  26. package/routes/evalRoutes.js +88 -36
  27. package/scripts/analyze-judge-reliability.js +401 -0
  28. package/scripts/analyze-run.js +97 -0
  29. package/scripts/analyze-run.mjs +282 -0
  30. package/scripts/analyze-validation-failures.js +141 -0
  31. package/scripts/check-run.mjs +17 -0
  32. package/scripts/code-impasse-strategies.js +1132 -0
  33. package/scripts/compare-runs.js +44 -0
  34. package/scripts/compare-suggestions.js +80 -0
  35. package/scripts/compare-transformation.js +116 -0
  36. package/scripts/dig-into-run.js +158 -0
  37. package/scripts/eval-cli.js +2626 -0
  38. package/scripts/generate-paper-figures.py +452 -0
  39. package/scripts/qualitative-analysis-ai.js +1313 -0
  40. package/scripts/qualitative-analysis.js +688 -0
  41. package/scripts/seed-db.js +87 -0
  42. package/scripts/show-failed-suggestions.js +64 -0
  43. package/scripts/validate-content.js +192 -0
  44. package/server.js +3 -2
  45. package/services/__tests__/evalConfigLoader.test.js +338 -0
  46. package/services/anovaStats.js +499 -0
  47. package/services/contentResolver.js +407 -0
  48. package/services/dialogueTraceAnalyzer.js +454 -0
  49. package/services/evalConfigLoader.js +625 -0
  50. package/services/evaluationRunner.js +2171 -270
  51. package/services/evaluationStore.js +564 -29
  52. package/services/learnerConfigLoader.js +75 -5
  53. package/services/learnerRubricEvaluator.js +284 -0
  54. package/services/learnerTutorInteractionEngine.js +375 -0
  55. package/services/processUtils.js +18 -0
  56. package/services/progressLogger.js +98 -0
  57. package/services/promptRecommendationService.js +31 -26
  58. package/services/promptRewriter.js +427 -0
  59. package/services/rubricEvaluator.js +543 -70
  60. package/services/streamingReporter.js +104 -0
  61. package/services/turnComparisonAnalyzer.js +494 -0
  62. package/components/MobileEvalDashboard.tsx +0 -267
  63. package/components/comparison/DeltaAnalysisTable.tsx +0 -137
  64. package/components/comparison/ProfileComparisonCard.tsx +0 -176
  65. package/components/comparison/RecognitionABMode.tsx +0 -385
  66. package/components/comparison/RecognitionMetricsPanel.tsx +0 -135
  67. package/components/comparison/WinnerIndicator.tsx +0 -64
  68. package/components/comparison/index.ts +0 -5
  69. package/components/mobile/BottomSheet.tsx +0 -233
  70. package/components/mobile/DimensionBreakdown.tsx +0 -210
  71. package/components/mobile/DocsView.tsx +0 -363
  72. package/components/mobile/LogsView.tsx +0 -481
  73. package/components/mobile/PsychodynamicQuadrant.tsx +0 -261
  74. package/components/mobile/QuickTestView.tsx +0 -1098
  75. package/components/mobile/RecognitionTypeChart.tsx +0 -124
  76. package/components/mobile/RecognitionView.tsx +0 -809
  77. package/components/mobile/RunDetailView.tsx +0 -261
  78. package/components/mobile/RunHistoryView.tsx +0 -367
  79. package/components/mobile/ScoreRadial.tsx +0 -211
  80. package/components/mobile/StreamingLogPanel.tsx +0 -230
  81. package/components/mobile/SynthesisStrategyChart.tsx +0 -140
  82. package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +0 -52
  83. package/docs/research/ABLATION-MODEL-SELECTION.md +0 -53
  84. package/docs/research/ADVANCED-EVAL-ANALYSIS.md +0 -60
  85. package/docs/research/ANOVA-RESULTS-2026-01-14.md +0 -257
  86. package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +0 -586
  87. package/docs/research/COST-ANALYSIS.md +0 -56
  88. package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +0 -340
  89. package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +0 -291
  90. package/docs/research/EVAL-SYSTEM-ANALYSIS.md +0 -306
  91. package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +0 -301
  92. package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +0 -1988
  93. package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +0 -282
  94. package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +0 -147
  95. package/docs/research/PAPER-EXTENSION-DYADIC.md +0 -204
  96. package/docs/research/PAPER-UNIFIED.md +0 -659
  97. package/docs/research/PAPER-UNIFIED.pdf +0 -0
  98. package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +0 -356
  99. package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +0 -419
  100. package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +0 -1637
  101. package/docs/research/archive/paper-multiagent-tutor.tex +0 -978
  102. package/docs/research/paper-draft/full-paper.md +0 -136
  103. package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
  104. package/docs/research/paper-draft/references.bib +0 -515
  105. package/docs/research/transcript-baseline.md +0 -139
  106. package/docs/research/transcript-recognition-multiagent.md +0 -187
  107. package/hooks/useEvalData.ts +0 -625
  108. package/server-init.js +0 -45
  109. package/services/benchmarkService.js +0 -1892
package/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2026 L. Magee
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,161 @@
+ # @machinespirits/eval
+
+ Evaluation framework for the Machine Spirits tutor system. Implements a 2x2x2 factorial design testing the effect of Hegelian recognition theory on AI tutoring quality, using LLM-powered ego-superego agent architectures for both tutor and learner.
+
+ This is the evaluation and analysis companion to [`@machinespirits/tutor-core`](https://github.com/liammagee/machinespirits-tutor-core).
+
+ ## Overview
+
+ The system runs automated tutoring dialogues across configurable experimental cells, then scores them with LLM judges against a multi-dimensional rubric. It supports:
+
+ - **Factorial evaluation** — 21 tutor agent cells varying recognition theory, architecture (single-agent vs ego+superego), and learner type
+ - **Multi-turn dialogues** — Learner agents with their own ego-superego deliberation
+ - **Multi-judge validation** — Cross-judge reliability via Claude Opus, GPT-5.2, and others
+ - **Placebo/active controls** — Length-matched prompts without recognition theory
+ - **Memory isolation** — Disentangling recognition effects from conversational memory
+
+ ## Prerequisites
+
+ - **Node.js** >= 18.0.0
+ - **@machinespirits/tutor-core** 0.3.1 (peer dependency)
+ - At least one AI provider API key (see below)
+
+ ## Installation
+
+ ```bash
+ npm install @machinespirits/eval
+ ```
+
+ Or clone and install locally:
+
+ ```bash
+ git clone https://github.com/liammagee/machinespirits-eval.git
+ cd machinespirits-eval
+ npm install
+ ```
+
+ You will also need `@machinespirits/tutor-core` installed as a peer dependency:
+
+ ```bash
+ npm install @machinespirits/tutor-core
+ ```
+
+ ## Configuration
+
+ Copy the example environment file and add your API keys:
+
+ ```bash
+ cp .env.example .env
+ ```
+
+ **Required**: At least `OPENROUTER_API_KEY` for ego/superego model generation.
+
+ **For judging**: `ANTHROPIC_API_KEY` (for the Claude Opus judge) or use OpenRouter-based judges.
+
+ See `.env.example` for all available configuration options.
+
+ Optionally, seed the database with sample data to explore the CLI:
+
+ ```bash
+ npm run seed
+ ```
+
+ ## Usage
+
+ ### CLI (primary interface)
+
+ ```bash
+ # Show available cells, scenarios, and providers
+ node scripts/eval-cli.js
+
+ # Run a factorial evaluation
+ node scripts/eval-cli.js run \
+   --profiles cell_1_base_single_unified,cell_5_recog_single_unified \
+   --runs 3
+
+ # Score responses with the default judge (Claude Opus)
+ node scripts/eval-cli.js evaluate <run-id>
+
+ # Re-score with a different judge
+ node scripts/eval-cli.js rejudge <run-id> --judge openrouter.gpt
+
+ # Generate a report
+ node scripts/eval-cli.js report <run-id>
+
+ # Export results as CSV
+ node scripts/eval-cli.js export <run-id> --format csv
+ ```
+
+ ### Standalone server
+
+ ```bash
+ npm start
+ ```
+
+ Starts an Express server on port 8081 (configurable via the `PORT` env var) with evaluation API endpoints.
+
+ ### As an npm package
+
+ ```javascript
+ import { evaluationRunner, evaluationStore, rubricEvaluator } from '@machinespirits/eval';
+ ```
+
+ ## Project Structure
+
+ ```
+ config/                   YAML configuration (tutor agents, scenarios, rubrics, providers)
+ prompts/                  LLM prompt templates (ego, superego, recognition, placebo)
+ scripts/                  CLI tools and analysis scripts
+ services/                 Core evaluation engine, rubric evaluator, learner simulation
+ routes/                   Express API routes (optional server mode)
+ data/                     SQLite databases (evaluation results, writing pads)
+ content-test-elementary/  Bundled test content package
+ docs/                     Documentation and research paper
+ tests/                    Test suites
+ ```
+
+ ### Key configuration files
+
+ - `config/tutor-agents.yaml` — All 21 experimental cells and their prompt mappings
+ - `config/suggestion-scenarios.yaml` — Learner scenarios (single-turn and multi-turn)
+ - `config/evaluation-rubric.yaml` — Scoring rubric (6 dimensions)
+ - `config/providers.yaml` — AI provider and model configuration
+
+ ## Experimental Design
+
+ The core factorial design crosses three factors:
+
+ | Factor                  | Levels                       |
+ |-------------------------|------------------------------|
+ | A: Recognition theory   | Base vs Recognition          |
+ | B: Tutor architecture   | Single-agent vs Ego+Superego |
+ | C: Learner architecture | Single-agent vs Multi-agent  |
+
+ Additional cells test enhanced prompts (cells 9-12), hardwired rules (13-14), placebo controls (15-18), memory isolation (19-20), and dynamic prompt rewriting (21).
+
+ ## Research Paper
+
+ The full research paper is included at `docs/research/paper-full.md`. For replication instructions, see `docs/REPLICATION-PLAN.md`.
+
+ ## Running Tests
+
+ ```bash
+ npm test
+ ```
+
+ ## Citation
+
+ If you use this software in your research, please cite:
+
+ ```bibtex
+ @misc{magee2026machinespirits,
+   author = {Magee, Liam},
+   title = {The Drama Machine in Education: Recognition Theory and Multi-Agent Tutoring},
+   year = {2026},
+   url = {https://github.com/liammagee/machinespirits-eval}
+ }
+ ```
+
+ ## License
+
+ [MIT](LICENSE)
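The Experimental Design table in this README implies eight base cells (2 x 2 x 2), which the additional conditions extend to 21. As a minimal sketch of that crossing, in JavaScript to match the package: the `base`/`recog`, `single`, and `unified` labels come from the cell IDs shown in the CLI examples, while `ego_superego` and `multi` are assumed placeholders for the second levels of Factors B and C; the authoritative cell IDs live in `config/tutor-agents.yaml`.

```javascript
// Enumerate the eight base cells of the 2x2x2 design. The naming pattern
// follows the CLI examples (cell_1_base_single_unified); 'ego_superego' and
// 'multi' are assumptions -- real cell IDs are defined in config/tutor-agents.yaml.
const factors = {
  recognition: ['base', 'recog'],        // Factor A: recognition theory
  tutorArch: ['single', 'ego_superego'], // Factor B: tutor architecture (second label assumed)
  learnerArch: ['unified', 'multi'],     // Factor C: learner architecture (second label assumed)
};

const cells = [];
for (const a of factors.recognition) {
  for (const b of factors.tutorArch) {
    for (const c of factors.learnerArch) {
      cells.push(`${a}_${b}_${c}`);
    }
  }
}

console.log(cells.length); // 8 base cells; cells 9-21 add controls and variants
console.log(cells[0]);     // base_single_unified
```

Running this prints `8` and `base_single_unified`, matching the suffix of `cell_1_base_single_unified` from the CLI examples.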
package/config/eval-settings.yaml ADDED
@@ -0,0 +1,18 @@
+ # Eval Settings
+ #
+ # Configuration for the evaluation framework, including
+ # content integration with the course content package.
+
+ content:
+   # Path to a content package containing courses/ with lecture markdown.
+   # Relative paths are resolved from the eval repo root.
+   # Default: bundled test content. Override with EVAL_CONTENT_PATH env var
+   # or change this path to point to a full content package.
+   content_package_path: "./content-test-elementary"
+
+   # Maximum characters to include from a lecture file.
+   # Safety valve for token budget in large lectures.
+   max_lecture_chars: 50000
+
+   # Include ```notes``` speaker-note blocks from lecture markdown.
+   include_speaker_notes: true
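As a reading aid, here is a minimal sketch of how these settings could be applied when loading a lecture. The package's real resolution logic presumably lives in `services/contentResolver.js` (per the files-changed list above); the function and variable names below are hypothetical.

```javascript
// Hypothetical sketch of applying eval-settings.yaml when loading a lecture.
// Names are illustrative; see services/contentResolver.js for the real logic.
import fs from 'node:fs';
import path from 'node:path';

const settings = {
  contentPackagePath: './content-test-elementary', // content.content_package_path
  maxLectureChars: 50000,                          // content.max_lecture_chars
};

// Per the comments in the YAML, EVAL_CONTENT_PATH overrides the default path,
// and relative paths resolve from the eval repo root (process.cwd() here).
const contentRoot = process.env.EVAL_CONTENT_PATH ?? settings.contentPackagePath;

function loadLecture(relativePath) {
  const file = path.resolve(contentRoot, relativePath);
  const text = fs.readFileSync(file, 'utf8');
  // "Safety valve for token budget in large lectures."
  return text.slice(0, settings.maxLectureChars);
}
```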
package/config/evaluation-rubric-learner.yaml ADDED
@@ -0,0 +1,277 @@
+ # Learner-Side Evaluation Rubric
+ # Scores the quality of synthetic learner turns in multi-turn dialogues.
+ #
+ # Companion to config/evaluation-rubric.yaml (tutor-side).
+ # Designed to measure what Factor C (learner architecture) directly affects:
+ # the quality and depth of learner engagement, independent of tutor quality.
+ #
+ # Only applicable to multi-turn data where learner turns exist
+ # (e.g., bilateral transformation run eval-2026-02-07-b6d75e87).
+ #
+ # ══════════════════════════════════════════════════════════════════════════════
+ # SCORING METHODOLOGY
+ # ══════════════════════════════════════════════════════════════════════════════
+ #
+ # Same 1-5 scale and 0-100 overall score as the tutor rubric for comparability.
+ #
+ # Overall Score = ((weighted_avg - 1) / 4) x 100
+ #
+ # The "Deliberation Depth" dimension is scored only for multi-agent learners
+ # (those with ego/superego internal deliberation). For single-agent (unified)
+ # learners, the weight is redistributed proportionally across other dimensions.
+ #
+ # ══════════════════════════════════════════════════════════════════════════════
+
+ name: "Learner Quality Rubric"
+ version: "1.0.0"
+ description: "Rubric for evaluating synthetic learner turn quality in multi-turn tutoring dialogues"
+
+ scale:
+   min: 1
+   max: 5
+   labels:
+     1: "Completely fails"
+     2: "Weak, significant issues"
+     3: "Adequate, meets basic expectations"
+     4: "Good, exceeds expectations"
+     5: "Excellent, exemplary"
+
+ # ══════════════════════════════════════════════════════════════════════════════
+ # LEARNER EVALUATION DIMENSIONS
+ # ══════════════════════════════════════════════════════════════════════════════
+ #
+ # ┌───────────────────────┬────────┬────────────────────────────────────────────────┐
+ # │ Dimension             │ Weight │ What it measures                               │
+ # ├───────────────────────┼────────┼────────────────────────────────────────────────┤
+ # │ Learner Authenticity  │ 20%    │ Genuine student-like reactions and engagement  │
+ # │ Question Quality      │ 20%    │ Depth and substance of learner questions       │
+ # │ Conceptual Engagement │ 20%    │ Engagement with ideas, not just process        │
+ # │ Revision Signals      │ 15%    │ Evidence of changing mind / integrating new    │
+ # │ Deliberation Depth    │ 15%    │ Quality of internal ego/superego process       │
+ # │ Persona Consistency   │ 10%    │ Maintaining assigned persona while evolving    │
+ # └───────────────────────┴────────┴────────────────────────────────────────────────┘
+ #
+ # ══════════════════════════════════════════════════════════════════════════════
+
+ dimensions:
+   learner_authenticity:
+     name: "Learner Authenticity"
+     weight: 0.20
+     description: "Does the learner's response feel like a genuine student reaction? Is the confusion real, the engagement authentic, the resistance plausible?"
+     theoretical_basis: |
+       Grounded in ecological validity of simulated learner agents. A synthetic
+       learner that produces formulaic or performatively confused responses fails
+       to create the conditions for genuine pedagogical interaction. Authenticity
+       means the response reflects what a real student at this level would actually
+       think, feel, and express — including messy, incomplete, or unexpected reactions
+       that cannot be predicted from the tutor's prompt alone.
+     criteria:
+       5: "Response reads like a real student: authentic confusion, genuine curiosity, plausible emotional reactions; unpredictable in ways that reflect genuine thinking"
+       4: "Mostly authentic with occasional formulaic elements; reactions are plausible and varied"
+       3: "Somewhat authentic but contains noticeable LLM patterns: hedging, over-qualification, artificial balance"
+       2: "Largely performative: confusion feels staged, engagement feels scripted, responses follow predictable LLM patterns"
+       1: "Completely artificial: reads like an AI performing a student role with no authentic engagement"
+     authenticity_markers:
+       positive:
+         - "Unexpected connections or tangents that feel student-like"
+         - "Imprecise language that reflects genuine confusion"
+         - "Emotional reactions proportionate to difficulty"
+         - "Partial understanding mixed with real gaps"
+       negative:
+         - "Perfectly structured responses to confused prompts"
+         - "Hedging phrases like 'that's a great question'"
+         - "Artificially balanced 'on one hand / on the other hand' framing"
+         - "Expressing confusion in overly articulate ways"
+     examples:
+       good: "Wait, I thought dialectics was just about arguing? Like debates? But you're saying it's more like... the argument changes both sides? That's weird."
+       bad: "I find this concept challenging yet fascinating. While I understand the basic premise of dialectics, I'm struggling with the nuances of how synthesis differs from mere compromise."
+
+   question_quality:
+     name: "Question Quality"
+     weight: 0.20
+     description: "Does the learner ask substantive questions that reveal genuine engagement with the material? Deep questions vs surface-level 'what do I do next?'"
+     theoretical_basis: |
+       Based on research on question asking as a metacognitive skill (Graesser &
+       Person, 1994). The quality of learner questions is one of the strongest
+       predictors of learning outcomes. Deep questions (why, how, what-if) indicate
+       genuine cognitive processing, while shallow questions (what, when, where)
+       suggest surface-level engagement. A learner who asks no questions or only
+       procedural ones is not engaging with the material intellectually.
+     criteria:
+       5: "Asks penetrating questions that reveal deep engagement: challenges assumptions, explores implications, connects to broader issues"
+       4: "Asks substantive questions that show real thinking about the concepts; goes beyond comprehension to analysis"
+       3: "Asks reasonable questions but mostly at comprehension level; some procedural questions mixed with substantive ones"
+       2: "Mostly procedural or surface questions: 'what should I do next?', 'can you explain that again?'"
+       1: "No questions asked, or only trivial/irrelevant questions that show no engagement with the material"
+     question_markers:
+       deep:
+         - "Why-questions about underlying mechanisms"
+         - "What-if questions exploring implications"
+         - "Questions that challenge the tutor's framing"
+         - "Questions connecting ideas across different concepts"
+       shallow:
+         - "'Can you repeat that?'"
+         - "'What's the definition of X?'"
+         - "'What should I do next?'"
+         - "'Is this going to be on the test?'"
+     examples:
+       good: "But if recognition requires both sides to change, doesn't that mean the teacher can't just 'know' the right answer beforehand? Like, wouldn't real recognition mean the teacher's understanding changes too?"
+       bad: "Can you explain that again? I'm not sure I understand the concept of recognition."
+
+   conceptual_engagement:
+     name: "Conceptual Engagement"
+     weight: 0.20
+     description: "Does the learner engage with the concepts themselves rather than just the process of learning? Evidence of thinking about ideas, not just following instructions."
+     theoretical_basis: |
+       Grounded in the distinction between surface and deep approaches to learning
+       (Marton & Saljo, 1976). Surface engagement involves reproducing information
+       and following procedures; deep engagement involves seeking meaning, relating
+       ideas, and constructing personal understanding. A learner who merely
+       paraphrases the tutor or focuses on 'getting the right answer' is not
+       engaging conceptually. Genuine conceptual engagement means wrestling with
+       ideas on their own terms.
+     criteria:
+       5: "Actively constructs meaning: formulates own interpretations, tests ideas against experience, generates novel examples or applications"
+       4: "Engages substantively with concepts: makes connections, offers interpretations, thinks beyond what was presented"
+       3: "Shows some conceptual engagement but tends to paraphrase or summarize rather than think independently"
+       2: "Mostly procedural: focuses on what to do rather than what it means; limited independent thought"
+       1: "No conceptual engagement: parrots back tutor language, seeks only correct answers, treats learning as information transfer"
+     engagement_markers:
+       positive:
+         - "Offers own interpretation or analogy"
+         - "Relates concept to personal experience"
+         - "Generates novel examples"
+         - "Identifies tension or paradox in the material"
+       negative:
+         - "Paraphrases tutor's explanation back"
+         - "Asks 'is this the right answer?'"
+         - "Treats concepts as definitions to memorize"
+         - "Responds only to confirm understanding without adding anything"
+     examples:
+       good: "So alienation is like... when you make something but then it doesn't feel like yours anymore? That happens to me with code. I write it, but after a while I can't even recognize my own work. Is that what Marx was getting at?"
+       bad: "I understand. So alienation means the worker becomes separated from the product of their labor. What's the next concept we should cover?"
+
+   revision_signals:
+     name: "Revision Signals"
+     weight: 0.15
+     description: "Does the learner show evidence of changing their mind, revising prior understanding, or integrating new information into their existing framework?"
+     theoretical_basis: |
+       Based on conceptual change theory (Posner et al., 1982) and Piaget's
+       concepts of assimilation and accommodation. Genuine learning often requires
+       revising prior understanding, not just adding new facts. Revision signals
+       indicate that the learner is doing the cognitive work of restructuring
+       their understanding. The absence of revision signals in a learning
+       dialogue suggests either no learning is occurring or the learner is
+       simply accumulating information without integrating it.
+     criteria:
+       5: "Explicitly revises earlier positions: 'I was wrong about X', 'now I see it differently'; integrates new information into restructured understanding"
+       4: "Shows clear signs of shifting understanding: qualifies earlier statements, acknowledges new perspective, builds on corrections"
+       3: "Some revision signals but mostly additive: 'oh, and also...' rather than 'oh, instead of what I said before...'"
+       2: "Minimal revision: accepts corrections without showing changed understanding; responds 'oh okay' without elaboration"
+       1: "No revision: maintains original position despite new information, or simply agrees without processing"
+     revision_markers:
+       positive:
+         - "'Wait, actually...'"
+         - "'I was thinking about it wrong'"
+         - "'So that means what I said before about X needs to change'"
+         - "'Oh! That's completely different from what I thought'"
+       negative:
+         - "'Okay, got it' (without elaboration)"
+         - "Repeating original position unchanged"
+         - "Accepting correction without integrating it"
+         - "Moving to next topic without processing"
+     examples:
+       good: "Oh wait -- so when I said dialectics was just arguing, I was thinking about it too statically. It's not that you have two sides and pick one. The whole framework shifts. So my original analogy about debates totally breaks down."
+       bad: "Okay, I see. So dialectics isn't just arguing. Got it. What's the next topic?"
+
+   deliberation_depth:
+     name: "Deliberation Depth"
+     weight: 0.15
+     description: "For multi-agent learners: Does the internal ego/superego dialogue produce genuine reflection, or is it performative? Score the quality of the internal deliberation process."
+     theoretical_basis: |
+       Based on the bilateral ego/superego architecture where the learner's
+       internal process involves: (1) ego initial reaction, (2) superego critique,
+       and (3) ego revision. This dimension measures whether the multi-agent
+       architecture produces genuine internal dialogue that improves the final
+       output, or whether the superego critique is pro forma and the ego revision
+       is trivial. This is the dimension most directly affected by Factor C
+       (learner architecture) and should show the clearest architecture effect.
+     applies_to: "multi-agent only"
+     weight_redistribution: |
+       For single-agent (unified) learners who lack internal deliberation traces,
+       this dimension is omitted and its weight (15%) is redistributed proportionally
+       across the remaining five dimensions.
+     criteria:
+       5: "Superego critique is substantive and specific; ego revision materially changes the response; internal process produces insight not present in initial reaction"
+       4: "Superego identifies real issues; ego revision shows genuine consideration; final output is measurably improved by the process"
+       3: "Superego makes reasonable but generic observations; ego revision makes minor adjustments; some value added by the process"
+       2: "Superego critique is superficial or formulaic; ego revision is trivial (rephrasing without substance); internal process adds little"
+       1: "Performative deliberation: superego agrees with ego, ego revision is cosmetic, internal process is pure theater"
+     deliberation_markers:
+       positive:
+         - "Superego identifies specific weakness in ego's initial reaction"
+         - "Ego revision addresses superego's critique substantively"
+         - "Final message contains insights not in initial reaction"
+         - "Superego pushes ego beyond comfort zone"
+       negative:
+         - "Superego says 'good response' or 'looks fine'"
+         - "Ego revision is same content with different wording"
+         - "Superego critique is generic ('be more specific')"
+         - "Internal process doesn't improve the external message"
+     examples:
+       good_ego_initial: "Dialectics seems like it's about conflict and resolution."
+       good_superego: "Your response is too passive -- you're just restating what the tutor said. Push back on the 'resolution' framing. Does Hegel actually resolve tension, or does he transform it? Also, connect this to something from your own experience."
+       good_ego_revision: "Hmm, but I'm not sure 'resolution' is the right word. When I think about arguments I've had, the best ones don't get 'resolved' -- they change how I think about the whole issue. Is that closer to what Hegel means?"
+       bad_ego_initial: "I find dialectics interesting."
+       bad_superego: "Good start. Maybe add a question."
+       bad_ego_revision: "I find dialectics interesting. What can you tell me more about it?"
+
+   persona_consistency:
+     name: "Persona Consistency"
+     weight: 0.10
+     description: "Does the learner maintain the assigned persona (frustrated student, returning learner, etc.) while still showing genuine evolution?"
+     theoretical_basis: |
+       The evaluation framework assigns learner personas (e.g., productive_struggler,
+       frustrated_student, returning_learner) to create varied interaction contexts.
+       Persona consistency measures whether the learner agent maintains the assigned
+       character throughout the dialogue while still showing authentic development.
+       A persona that is rigidly maintained without growth is as problematic as one
+       that is immediately abandoned. The ideal is persona-consistent evolution:
+       a frustrated student who gradually becomes engaged, not one who switches to
+       enthusiasm after one tutor response.
+     criteria:
+       5: "Persona maintained throughout with believable evolution: character-consistent growth that feels natural and earned"
+       4: "Mostly consistent persona with appropriate development; occasional minor breaks that don't undermine overall character"
+       3: "Persona present but inconsistently applied; some responses feel out of character or evolution feels abrupt"
+       2: "Persona largely abandoned after first turn; learner defaults to generic LLM student behavior"
+       1: "No evidence of assigned persona; responses are generic regardless of persona assignment"
+     consistency_markers:
+       positive:
+         - "Frustration expressed in character-consistent ways across turns"
+         - "Knowledge gaps align with persona's described background"
+         - "Emotional arc plausible for the assigned character"
+         - "Growth rate matches persona (fast for eager learner, slow for resistant)"
+       negative:
+         - "Abrupt personality shift between turns"
+         - "Persona-inconsistent knowledge level"
+         - "Generic student responses ignoring persona"
+         - "All learner personas sounding identical"
+     examples:
+       good: "(frustrated student persona, turn 3): 'Okay fine, I'll admit the dialectic thing is less stupid than I thought. But I still don't see why I need to know this for my major. Can you at least show me why it matters outside philosophy?'"
+       bad: "(frustrated student persona, turn 3): 'This is fascinating! I'm really enjoying learning about dialectics. The way Hegel connects thesis and antithesis is truly elegant.'"
+
+ # ══════════════════════════════════════════════════════════════════════════════
+ # WEIGHT REDISTRIBUTION FOR SINGLE-AGENT LEARNERS
+ # ══════════════════════════════════════════════════════════════════════════════
+ #
+ # When deliberation_depth is N/A (single-agent learners), redistribute its 15%:
+ #
+ #   Original weights:                Redistributed weights:
+ #   learner_authenticity:   0.20  →  0.2353  (0.20 / 0.85)
+ #   question_quality:       0.20  →  0.2353  (0.20 / 0.85)
+ #   conceptual_engagement:  0.20  →  0.2353  (0.20 / 0.85)
+ #   revision_signals:       0.15  →  0.1765  (0.15 / 0.85)
+ #   deliberation_depth:     0.15  →  OMITTED
+ #   persona_consistency:    0.10  →  0.1176  (0.10 / 0.85)
+ #
+ # ══════════════════════════════════════════════════════════════════════════════
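To make the scoring formula and the redistribution arithmetic above concrete, here is a small worked example in JavaScript. The weights and the `((weighted_avg - 1) / 4) x 100` mapping are taken directly from this rubric; the function name and the sample scores are illustrative only.

```javascript
// Worked sketch of the rubric's scoring rule: dimension scores on a 1-5 scale,
// weighted average, mapped to 0-100. For single-agent learners,
// deliberation_depth is dropped and the remaining weights are renormalised
// (divided by 0.85), matching the redistribution table above.
const weights = {
  learner_authenticity: 0.20,
  question_quality: 0.20,
  conceptual_engagement: 0.20,
  revision_signals: 0.15,
  deliberation_depth: 0.15,
  persona_consistency: 0.10,
};

function overallScore(scores, { multiAgent = true } = {}) {
  const dims = Object.keys(weights).filter(
    (d) => multiAgent || d !== 'deliberation_depth'
  );
  const totalWeight = dims.reduce((sum, d) => sum + weights[d], 0); // 1.0 or 0.85
  const weightedAvg = dims.reduce(
    (sum, d) => sum + scores[d] * (weights[d] / totalWeight),
    0
  );
  return ((weightedAvg - 1) / 4) * 100; // same mapping as the tutor rubric
}

// Example: a single-agent learner (no deliberation trace) scoring 4s and 3s.
const scores = {
  learner_authenticity: 4,
  question_quality: 3,
  conceptual_engagement: 4,
  revision_signals: 3,
  persona_consistency: 4,
};
console.log(overallScore(scores, { multiAgent: false }).toFixed(1)); // 64.7
```

Here the renormalised weighted average is 3.05 / 0.85 = 3.588, which the formula maps to ((3.588 - 1) / 4) x 100 = 64.7.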