@machinespirits/eval 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109)
  1. package/LICENSE +21 -0
  2. package/README.md +161 -0
  3. package/config/eval-settings.yaml +18 -0
  4. package/config/evaluation-rubric-learner.yaml +277 -0
  5. package/config/evaluation-rubric.yaml +613 -0
  6. package/config/interaction-eval-scenarios.yaml +93 -50
  7. package/config/learner-agents.yaml +124 -193
  8. package/config/providers.yaml +60 -0
  9. package/config/suggestion-scenarios.yaml +1399 -0
  10. package/config/tutor-agents.yaml +716 -0
  11. package/docs/EVALUATION-VARIABLES.md +589 -0
  12. package/docs/REPLICATION-PLAN.md +577 -0
  13. package/docs/research/build.sh +74 -0
  14. package/docs/research/figures/figure1.png +0 -0
  15. package/docs/research/figures/figure2.png +0 -0
  16. package/docs/research/figures/figure3.png +0 -0
  17. package/docs/research/figures/figure4.png +0 -0
  18. package/docs/research/figures/figure5.png +0 -0
  19. package/docs/research/figures/figure6.png +0 -0
  20. package/docs/research/header.tex +4 -0
  21. package/docs/research/paper-full.md +1909 -0
  22. package/docs/research/paper-short.md +805 -0
  23. package/docs/research/references.bib +1011 -0
  24. package/index.js +15 -6
  25. package/package.json +15 -21
  26. package/routes/evalRoutes.js +88 -36
  27. package/scripts/analyze-judge-reliability.js +401 -0
  28. package/scripts/analyze-run.js +97 -0
  29. package/scripts/analyze-run.mjs +282 -0
  30. package/scripts/analyze-validation-failures.js +141 -0
  31. package/scripts/check-run.mjs +17 -0
  32. package/scripts/code-impasse-strategies.js +1132 -0
  33. package/scripts/compare-runs.js +44 -0
  34. package/scripts/compare-suggestions.js +80 -0
  35. package/scripts/compare-transformation.js +116 -0
  36. package/scripts/dig-into-run.js +158 -0
  37. package/scripts/eval-cli.js +2626 -0
  38. package/scripts/generate-paper-figures.py +452 -0
  39. package/scripts/qualitative-analysis-ai.js +1313 -0
  40. package/scripts/qualitative-analysis.js +688 -0
  41. package/scripts/seed-db.js +87 -0
  42. package/scripts/show-failed-suggestions.js +64 -0
  43. package/scripts/validate-content.js +192 -0
  44. package/server.js +3 -2
  45. package/services/__tests__/evalConfigLoader.test.js +338 -0
  46. package/services/anovaStats.js +499 -0
  47. package/services/contentResolver.js +407 -0
  48. package/services/dialogueTraceAnalyzer.js +454 -0
  49. package/services/evalConfigLoader.js +625 -0
  50. package/services/evaluationRunner.js +2171 -270
  51. package/services/evaluationStore.js +564 -29
  52. package/services/learnerConfigLoader.js +75 -5
  53. package/services/learnerRubricEvaluator.js +284 -0
  54. package/services/learnerTutorInteractionEngine.js +375 -0
  55. package/services/processUtils.js +18 -0
  56. package/services/progressLogger.js +98 -0
  57. package/services/promptRecommendationService.js +31 -26
  58. package/services/promptRewriter.js +427 -0
  59. package/services/rubricEvaluator.js +543 -70
  60. package/services/streamingReporter.js +104 -0
  61. package/services/turnComparisonAnalyzer.js +494 -0
  62. package/components/MobileEvalDashboard.tsx +0 -267
  63. package/components/comparison/DeltaAnalysisTable.tsx +0 -137
  64. package/components/comparison/ProfileComparisonCard.tsx +0 -176
  65. package/components/comparison/RecognitionABMode.tsx +0 -385
  66. package/components/comparison/RecognitionMetricsPanel.tsx +0 -135
  67. package/components/comparison/WinnerIndicator.tsx +0 -64
  68. package/components/comparison/index.ts +0 -5
  69. package/components/mobile/BottomSheet.tsx +0 -233
  70. package/components/mobile/DimensionBreakdown.tsx +0 -210
  71. package/components/mobile/DocsView.tsx +0 -363
  72. package/components/mobile/LogsView.tsx +0 -481
  73. package/components/mobile/PsychodynamicQuadrant.tsx +0 -261
  74. package/components/mobile/QuickTestView.tsx +0 -1098
  75. package/components/mobile/RecognitionTypeChart.tsx +0 -124
  76. package/components/mobile/RecognitionView.tsx +0 -809
  77. package/components/mobile/RunDetailView.tsx +0 -261
  78. package/components/mobile/RunHistoryView.tsx +0 -367
  79. package/components/mobile/ScoreRadial.tsx +0 -211
  80. package/components/mobile/StreamingLogPanel.tsx +0 -230
  81. package/components/mobile/SynthesisStrategyChart.tsx +0 -140
  82. package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +0 -52
  83. package/docs/research/ABLATION-MODEL-SELECTION.md +0 -53
  84. package/docs/research/ADVANCED-EVAL-ANALYSIS.md +0 -60
  85. package/docs/research/ANOVA-RESULTS-2026-01-14.md +0 -257
  86. package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +0 -586
  87. package/docs/research/COST-ANALYSIS.md +0 -56
  88. package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +0 -340
  89. package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +0 -291
  90. package/docs/research/EVAL-SYSTEM-ANALYSIS.md +0 -306
  91. package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +0 -301
  92. package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +0 -1988
  93. package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +0 -282
  94. package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +0 -147
  95. package/docs/research/PAPER-EXTENSION-DYADIC.md +0 -204
  96. package/docs/research/PAPER-UNIFIED.md +0 -659
  97. package/docs/research/PAPER-UNIFIED.pdf +0 -0
  98. package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +0 -356
  99. package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +0 -419
  100. package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +0 -1637
  101. package/docs/research/archive/paper-multiagent-tutor.tex +0 -978
  102. package/docs/research/paper-draft/full-paper.md +0 -136
  103. package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
  104. package/docs/research/paper-draft/references.bib +0 -515
  105. package/docs/research/transcript-baseline.md +0 -139
  106. package/docs/research/transcript-recognition-multiagent.md +0 -187
  107. package/hooks/useEvalData.ts +0 -625
  108. package/server-init.js +0 -45
  109. package/services/benchmarkService.js +0 -1892
package/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2026 L. Magee
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,161 @@
+ # @machinespirits/eval
+
+ Evaluation framework for the Machine Spirits tutor system. Implements a 2x2x2 factorial design testing the effect of Hegelian recognition theory on AI tutoring quality, using LLM-powered ego-superego agent architectures for both tutor and learner.
+
+ This is the evaluation and analysis companion to [`@machinespirits/tutor-core`](https://github.com/liammagee/machinespirits-tutor-core).
+
+ ## Overview
+
+ The system runs automated tutoring dialogues across configurable experimental cells, then scores them with LLM judges against a multi-dimensional rubric. It supports:
+
+ - **Factorial evaluation** — 21 tutor agent cells varying recognition theory, architecture (single-agent vs ego+superego), and learner type
+ - **Multi-turn dialogues** — Learner agents with their own ego-superego deliberation
+ - **Multi-judge validation** — Cross-judge reliability via Claude Opus, GPT-5.2, and others
+ - **Placebo/active controls** — Length-matched prompts without recognition theory
+ - **Memory isolation** — Disentangling recognition effects from conversational memory
+
+ ## Prerequisites
+
+ - **Node.js** >= 18.0.0
+ - **@machinespirits/tutor-core** 0.3.1 (peer dependency)
+ - At least one AI provider API key (see below)
+
+ ## Installation
+
+ ```bash
+ npm install @machinespirits/eval
+ ```
+
+ Or clone and install locally:
+
+ ```bash
+ git clone https://github.com/liammagee/machinespirits-eval.git
+ cd machinespirits-eval
+ npm install
+ ```
+
+ You will also need `@machinespirits/tutor-core` installed as a peer dependency:
+
+ ```bash
+ npm install @machinespirits/tutor-core
+ ```
+
+ ## Configuration
+
+ Copy the example environment file and add your API keys:
+
+ ```bash
+ cp .env.example .env
+ ```
+
+ **Required**: At least `OPENROUTER_API_KEY` for ego/superego model generation.
+
+ **For judging**: `ANTHROPIC_API_KEY` (for the Claude Opus judge) or use OpenRouter-based judges.
+
+ See `.env.example` for all available configuration options.
+
+ Optionally, seed the database with sample data to explore the CLI:
+
+ ```bash
+ npm run seed
+ ```
+
+ ## Usage
+
+ ### CLI (primary interface)
+
+ ```bash
+ # Show available cells, scenarios, and providers
+ node scripts/eval-cli.js
+
+ # Run a factorial evaluation
+ node scripts/eval-cli.js run \
+   --profiles cell_1_base_single_unified,cell_5_recog_single_unified \
+   --runs 3
+
+ # Score responses with the default judge (Claude Opus)
+ node scripts/eval-cli.js evaluate <run-id>
+
+ # Re-score with a different judge
+ node scripts/eval-cli.js rejudge <run-id> --judge openrouter.gpt
+
+ # Generate a report
+ node scripts/eval-cli.js report <run-id>
+
+ # Export results as CSV
+ node scripts/eval-cli.js export <run-id> --format csv
+ ```
+
+ ### Standalone server
+
+ ```bash
+ npm start
+ ```
+
+ Starts an Express server on port 8081 (configurable via the `PORT` env var) with evaluation API endpoints.
+
+ ### As an npm package
+
+ ```javascript
+ import { evaluationRunner, evaluationStore, rubricEvaluator } from '@machinespirits/eval';
+ ```
+
+ ## Project Structure
+
+ ```
+ config/                   YAML configuration (tutor agents, scenarios, rubrics, providers)
+ prompts/                  LLM prompt templates (ego, superego, recognition, placebo)
+ scripts/                  CLI tools and analysis scripts
+ services/                 Core evaluation engine, rubric evaluator, learner simulation
+ routes/                   Express API routes (optional server mode)
+ data/                     SQLite databases (evaluation results, writing pads)
+ content-test-elementary/  Bundled test content package
+ docs/                     Documentation and research paper
+ tests/                    Test suites
+ ```
+
+ ### Key configuration files
+
+ - `config/tutor-agents.yaml` — All 21 experimental cells and their prompt mappings
+ - `config/suggestion-scenarios.yaml` — Learner scenarios (single-turn and multi-turn)
+ - `config/evaluation-rubric.yaml` — Scoring rubric (6 dimensions)
+ - `config/providers.yaml` — AI provider and model configuration
+
+ ## Experimental Design
+
+ The core factorial design crosses three factors:
+
+ | Factor                  | Levels                       |
+ |-------------------------|------------------------------|
+ | A: Recognition theory   | Base vs Recognition          |
+ | B: Tutor architecture   | Single-agent vs Ego+Superego |
+ | C: Learner architecture | Single-agent vs Multi-agent  |
+
+ Additional cells test enhanced prompts (cells 9-12), hardwired rules (13-14), placebo controls (15-18), memory isolation (19-20), and dynamic prompt rewriting (21).
+
+ ## Research Paper
+
+ The full research paper is included at `docs/research/paper-full.md`. For replication instructions, see `docs/REPLICATION-PLAN.md`.
+
+ ## Running Tests
+
+ ```bash
+ npm test
+ ```
+
+ ## Citation
+
+ If you use this software in your research, please cite:
+
+ ```bibtex
+ @misc{magee2026machinespirits,
+   author = {Magee, Liam},
+   title = {The Drama Machine in Education: Recognition Theory and Multi-Agent Tutoring},
+   year = {2026},
+   url = {https://github.com/liammagee/machinespirits-eval}
+ }
+ ```
+
+ ## License
+
+ [MIT](LICENSE)
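The Experimental Design table in this README implies eight base cells (2 x 2 x 2), which the additional conditions extend to 21. As a minimal sketch of that crossing, in JavaScript to match the package: the `base`/`recog`, `single`, and `unified` labels come from the cell IDs shown in the CLI examples, while `ego_superego` and `multi` are assumed placeholders for the second levels of Factors B and C; the authoritative cell IDs live in `config/tutor-agents.yaml`.

```javascript
// Enumerate the eight base cells of the 2x2x2 design. The naming pattern
// follows the CLI examples (cell_1_base_single_unified); 'ego_superego' and
// 'multi' are assumptions -- real cell IDs are defined in config/tutor-agents.yaml.
const factors = {
  recognition: ['base', 'recog'],        // Factor A: recognition theory
  tutorArch: ['single', 'ego_superego'], // Factor B: tutor architecture (second label assumed)
  learnerArch: ['unified', 'multi'],     // Factor C: learner architecture (second label assumed)
};

const cells = [];
for (const a of factors.recognition) {
  for (const b of factors.tutorArch) {
    for (const c of factors.learnerArch) {
      cells.push(`${a}_${b}_${c}`);
    }
  }
}

console.log(cells.length); // 8 base cells; cells 9-21 add controls and variants
console.log(cells[0]);     // base_single_unified
```

Running this prints `8` and `base_single_unified`, matching the suffix of `cell_1_base_single_unified` from the CLI examples.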
package/config/eval-settings.yaml ADDED
@@ -0,0 +1,18 @@
+ # Eval Settings
+ #
+ # Configuration for the evaluation framework, including
+ # content integration with the course content package.
+
+ content:
+   # Path to a content package containing courses/ with lecture markdown.
+   # Relative paths are resolved from the eval repo root.
+   # Default: bundled test content. Override with EVAL_CONTENT_PATH env var
+   # or change this path to point to a full content package.
+   content_package_path: "./content-test-elementary"
+
+   # Maximum characters to include from a lecture file.
+   # Safety valve for token budget in large lectures.
+   max_lecture_chars: 50000
+
+   # Include ```notes``` speaker-note blocks from lecture markdown.
+   include_speaker_notes: true
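As a reading aid, here is a minimal sketch of how these settings could be applied when loading a lecture. The package's real resolution logic presumably lives in `services/contentResolver.js` (per the files-changed list above); the function and variable names below are hypothetical.

```javascript
// Hypothetical sketch of applying eval-settings.yaml when loading a lecture.
// Names are illustrative; see services/contentResolver.js for the real logic.
import fs from 'node:fs';
import path from 'node:path';

const settings = {
  contentPackagePath: './content-test-elementary', // content.content_package_path
  maxLectureChars: 50000,                          // content.max_lecture_chars
};

// Per the comments in the YAML, EVAL_CONTENT_PATH overrides the default path,
// and relative paths resolve from the eval repo root (process.cwd() here).
const contentRoot = process.env.EVAL_CONTENT_PATH ?? settings.contentPackagePath;

function loadLecture(relativePath) {
  const file = path.resolve(contentRoot, relativePath);
  const text = fs.readFileSync(file, 'utf8');
  // "Safety valve for token budget in large lectures."
  return text.slice(0, settings.maxLectureChars);
}
```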
package/config/evaluation-rubric-learner.yaml ADDED
@@ -0,0 +1,277 @@
+ # Learner-Side Evaluation Rubric
+ # Scores the quality of synthetic learner turns in multi-turn dialogues.
+ #
+ # Companion to config/evaluation-rubric.yaml (tutor-side).
+ # Designed to measure what Factor C (learner architecture) directly affects:
+ # the quality and depth of learner engagement, independent of tutor quality.
+ #
+ # Only applicable to multi-turn data where learner turns exist
+ # (e.g., bilateral transformation run eval-2026-02-07-b6d75e87).
+ #
+ # ══════════════════════════════════════════════════════════════════════════════
+ # SCORING METHODOLOGY
+ # ══════════════════════════════════════════════════════════════════════════════
+ #
+ # Same 1-5 scale and 0-100 overall score as the tutor rubric for comparability.
+ #
+ # Overall Score = ((weighted_avg - 1) / 4) x 100
+ #
+ # The "Deliberation Depth" dimension is scored only for multi-agent learners
+ # (those with ego/superego internal deliberation). For single-agent (unified)
+ # learners, the weight is redistributed proportionally across other dimensions.
+ #
+ # ══════════════════════════════════════════════════════════════════════════════
+
+ name: "Learner Quality Rubric"
+ version: "1.0.0"
+ description: "Rubric for evaluating synthetic learner turn quality in multi-turn tutoring dialogues"
+
+ scale:
+   min: 1
+   max: 5
+   labels:
+     1: "Completely fails"
+     2: "Weak, significant issues"
+     3: "Adequate, meets basic expectations"
+     4: "Good, exceeds expectations"
+     5: "Excellent, exemplary"
+
+ # ══════════════════════════════════════════════════════════════════════════════
+ # LEARNER EVALUATION DIMENSIONS
+ # ══════════════════════════════════════════════════════════════════════════════
+ #
+ # ┌───────────────────────┬────────┬────────────────────────────────────────────────┐
+ # │ Dimension             │ Weight │ What it measures                               │
+ # ├───────────────────────┼────────┼────────────────────────────────────────────────┤
+ # │ Learner Authenticity  │ 20%    │ Genuine student-like reactions and engagement  │
+ # │ Question Quality      │ 20%    │ Depth and substance of learner questions       │
+ # │ Conceptual Engagement │ 20%    │ Engagement with ideas, not just process        │
+ # │ Revision Signals      │ 15%    │ Evidence of changing mind / integrating new    │
+ # │ Deliberation Depth    │ 15%    │ Quality of internal ego/superego process       │
+ # │ Persona Consistency   │ 10%    │ Maintaining assigned persona while evolving    │
+ # └───────────────────────┴────────┴────────────────────────────────────────────────┘
+ #
+ # ══════════════════════════════════════════════════════════════════════════════
+
+ dimensions:
+   learner_authenticity:
+     name: "Learner Authenticity"
+     weight: 0.20
+     description: "Does the learner's response feel like a genuine student reaction? Is the confusion real, the engagement authentic, the resistance plausible?"
+     theoretical_basis: |
+       Grounded in ecological validity of simulated learner agents. A synthetic
+       learner that produces formulaic or performatively confused responses fails
+       to create the conditions for genuine pedagogical interaction. Authenticity
+       means the response reflects what a real student at this level would actually
+       think, feel, and express — including messy, incomplete, or unexpected reactions
+       that cannot be predicted from the tutor's prompt alone.
+     criteria:
+       5: "Response reads like a real student: authentic confusion, genuine curiosity, plausible emotional reactions; unpredictable in ways that reflect genuine thinking"
+       4: "Mostly authentic with occasional formulaic elements; reactions are plausible and varied"
+       3: "Somewhat authentic but contains noticeable LLM patterns: hedging, over-qualification, artificial balance"
+       2: "Largely performative: confusion feels staged, engagement feels scripted, responses follow predictable LLM patterns"
+       1: "Completely artificial: reads like an AI performing a student role with no authentic engagement"
+     authenticity_markers:
+       positive:
+         - "Unexpected connections or tangents that feel student-like"
+         - "Imprecise language that reflects genuine confusion"
+         - "Emotional reactions proportionate to difficulty"
+         - "Partial understanding mixed with real gaps"
+       negative:
+         - "Perfectly structured responses to confused prompts"
+         - "Hedging phrases like 'that's a great question'"
+         - "Artificially balanced 'on one hand / on the other hand' framing"
+         - "Expressing confusion in overly articulate ways"
+     examples:
+       good: "Wait, I thought dialectics was just about arguing? Like debates? But you're saying it's more like... the argument changes both sides? That's weird."
+       bad: "I find this concept challenging yet fascinating. While I understand the basic premise of dialectics, I'm struggling with the nuances of how synthesis differs from mere compromise."
+
+   question_quality:
+     name: "Question Quality"
+     weight: 0.20
+     description: "Does the learner ask substantive questions that reveal genuine engagement with the material? Deep questions vs surface-level 'what do I do next?'"
+     theoretical_basis: |
+       Based on research on question asking as a metacognitive skill (Graesser &
+       Person, 1994). The quality of learner questions is one of the strongest
+       predictors of learning outcomes. Deep questions (why, how, what-if) indicate
+       genuine cognitive processing, while shallow questions (what, when, where)
+       suggest surface-level engagement. A learner who asks no questions or only
+       procedural ones is not engaging with the material intellectually.
+     criteria:
+       5: "Asks penetrating questions that reveal deep engagement: challenges assumptions, explores implications, connects to broader issues"
+       4: "Asks substantive questions that show real thinking about the concepts; goes beyond comprehension to analysis"
+       3: "Asks reasonable questions but mostly at comprehension level; some procedural questions mixed with substantive ones"
+       2: "Mostly procedural or surface questions: 'what should I do next?', 'can you explain that again?'"
+       1: "No questions asked, or only trivial/irrelevant questions that show no engagement with the material"
+     question_markers:
+       deep:
+         - "Why-questions about underlying mechanisms"
+         - "What-if questions exploring implications"
+         - "Questions that challenge the tutor's framing"
+         - "Questions connecting ideas across different concepts"
+       shallow:
+         - "'Can you repeat that?'"
+         - "'What's the definition of X?'"
+         - "'What should I do next?'"
+         - "'Is this going to be on the test?'"
+     examples:
+       good: "But if recognition requires both sides to change, doesn't that mean the teacher can't just 'know' the right answer beforehand? Like, wouldn't real recognition mean the teacher's understanding changes too?"
+       bad: "Can you explain that again? I'm not sure I understand the concept of recognition."
+
+   conceptual_engagement:
+     name: "Conceptual Engagement"
+     weight: 0.20
+     description: "Does the learner engage with the concepts themselves rather than just the process of learning? Evidence of thinking about ideas, not just following instructions."
+     theoretical_basis: |
+       Grounded in the distinction between surface and deep approaches to learning
+       (Marton & Saljo, 1976). Surface engagement involves reproducing information
+       and following procedures; deep engagement involves seeking meaning, relating
+       ideas, and constructing personal understanding. A learner who merely
+       paraphrases the tutor or focuses on 'getting the right answer' is not
+       engaging conceptually. Genuine conceptual engagement means wrestling with
+       ideas on their own terms.
+     criteria:
+       5: "Actively constructs meaning: formulates own interpretations, tests ideas against experience, generates novel examples or applications"
+       4: "Engages substantively with concepts: makes connections, offers interpretations, thinks beyond what was presented"
+       3: "Shows some conceptual engagement but tends to paraphrase or summarize rather than think independently"
+       2: "Mostly procedural: focuses on what to do rather than what it means; limited independent thought"
+       1: "No conceptual engagement: parrots back tutor language, seeks only correct answers, treats learning as information transfer"
+     engagement_markers:
+       positive:
+         - "Offers own interpretation or analogy"
+         - "Relates concept to personal experience"
+         - "Generates novel examples"
+         - "Identifies tension or paradox in the material"
+       negative:
+         - "Paraphrases tutor's explanation back"
+         - "Asks 'is this the right answer?'"
+         - "Treats concepts as definitions to memorize"
+         - "Responds only to confirm understanding without adding anything"
+     examples:
+       good: "So alienation is like... when you make something but then it doesn't feel like yours anymore? That happens to me with code. I write it, but after a while I can't even recognize my own work. Is that what Marx was getting at?"
+       bad: "I understand. So alienation means the worker becomes separated from the product of their labor. What's the next concept we should cover?"
+
+   revision_signals:
+     name: "Revision Signals"
+     weight: 0.15
+     description: "Does the learner show evidence of changing their mind, revising prior understanding, or integrating new information into their existing framework?"
+     theoretical_basis: |
+       Based on conceptual change theory (Posner et al., 1982) and Piaget's
+       concepts of assimilation and accommodation. Genuine learning often requires
+       revising prior understanding, not just adding new facts. Revision signals
+       indicate that the learner is doing the cognitive work of restructuring
+       their understanding. The absence of revision signals in a learning
+       dialogue suggests either no learning is occurring or the learner is
+       simply accumulating information without integrating it.
+     criteria:
+       5: "Explicitly revises earlier positions: 'I was wrong about X', 'now I see it differently'; integrates new information into restructured understanding"
+       4: "Shows clear signs of shifting understanding: qualifies earlier statements, acknowledges new perspective, builds on corrections"
+       3: "Some revision signals but mostly additive: 'oh, and also...' rather than 'oh, instead of what I said before...'"
+       2: "Minimal revision: accepts corrections without showing changed understanding; responds 'oh okay' without elaboration"
+       1: "No revision: maintains original position despite new information, or simply agrees without processing"
+     revision_markers:
+       positive:
+         - "'Wait, actually...'"
+         - "'I was thinking about it wrong'"
+         - "'So that means what I said before about X needs to change'"
+         - "'Oh! That's completely different from what I thought'"
+       negative:
+         - "'Okay, got it' (without elaboration)"
+         - "Repeating original position unchanged"
+         - "Accepting correction without integrating it"
+         - "Moving to next topic without processing"
+     examples:
+       good: "Oh wait -- so when I said dialectics was just arguing, I was thinking about it too statically. It's not that you have two sides and pick one. The whole framework shifts. So my original analogy about debates totally breaks down."
+       bad: "Okay, I see. So dialectics isn't just arguing. Got it. What's the next topic?"
+
+   deliberation_depth:
+     name: "Deliberation Depth"
+     weight: 0.15
+     description: "For multi-agent learners: Does the internal ego/superego dialogue produce genuine reflection, or is it performative? Score the quality of the internal deliberation process."
+     theoretical_basis: |
+       Based on the bilateral ego/superego architecture where the learner's
+       internal process involves: (1) ego initial reaction, (2) superego critique,
+       and (3) ego revision. This dimension measures whether the multi-agent
+       architecture produces genuine internal dialogue that improves the final
+       output, or whether the superego critique is pro forma and the ego revision
+       is trivial. This is the dimension most directly affected by Factor C
+       (learner architecture) and should show the clearest architecture effect.
+     applies_to: "multi-agent only"
+     weight_redistribution: |
+       For single-agent (unified) learners who lack internal deliberation traces,
+       this dimension is omitted and its weight (15%) is redistributed proportionally
+       across the remaining five dimensions.
+     criteria:
+       5: "Superego critique is substantive and specific; ego revision materially changes the response; internal process produces insight not present in initial reaction"
+       4: "Superego identifies real issues; ego revision shows genuine consideration; final output is measurably improved by the process"
+       3: "Superego makes reasonable but generic observations; ego revision makes minor adjustments; some value added by the process"
+       2: "Superego critique is superficial or formulaic; ego revision is trivial (rephrasing without substance); internal process adds little"
+       1: "Performative deliberation: superego agrees with ego, ego revision is cosmetic, internal process is pure theater"
+     deliberation_markers:
+       positive:
+         - "Superego identifies specific weakness in ego's initial reaction"
+         - "Ego revision addresses superego's critique substantively"
+         - "Final message contains insights not in initial reaction"
+         - "Superego pushes ego beyond comfort zone"
+       negative:
+         - "Superego says 'good response' or 'looks fine'"
+         - "Ego revision is same content with different wording"
+         - "Superego critique is generic ('be more specific')"
+         - "Internal process doesn't improve the external message"
+     examples:
+       good_ego_initial: "Dialectics seems like it's about conflict and resolution."
+       good_superego: "Your response is too passive -- you're just restating what the tutor said. Push back on the 'resolution' framing. Does Hegel actually resolve tension, or does he transform it? Also, connect this to something from your own experience."
+       good_ego_revision: "Hmm, but I'm not sure 'resolution' is the right word. When I think about arguments I've had, the best ones don't get 'resolved' -- they change how I think about the whole issue. Is that closer to what Hegel means?"
+       bad_ego_initial: "I find dialectics interesting."
+       bad_superego: "Good start. Maybe add a question."
+       bad_ego_revision: "I find dialectics interesting. What can you tell me more about it?"
+
+   persona_consistency:
+     name: "Persona Consistency"
+     weight: 0.10
+     description: "Does the learner maintain the assigned persona (frustrated student, returning learner, etc.) while still showing genuine evolution?"
+     theoretical_basis: |
+       The evaluation framework assigns learner personas (e.g., productive_struggler,
+       frustrated_student, returning_learner) to create varied interaction contexts.
+       Persona consistency measures whether the learner agent maintains the assigned
+       character throughout the dialogue while still showing authentic development.
+       A persona that is rigidly maintained without growth is as problematic as one
+       that is immediately abandoned. The ideal is persona-consistent evolution:
+       a frustrated student who gradually becomes engaged, not one who switches to
+       enthusiasm after one tutor response.
+     criteria:
+       5: "Persona maintained throughout with believable evolution: character-consistent growth that feels natural and earned"
+       4: "Mostly consistent persona with appropriate development; occasional minor breaks that don't undermine overall character"
+       3: "Persona present but inconsistently applied; some responses feel out of character or evolution feels abrupt"
+       2: "Persona largely abandoned after first turn; learner defaults to generic LLM student behavior"
+       1: "No evidence of assigned persona; responses are generic regardless of persona assignment"
+     consistency_markers:
+       positive:
+         - "Frustration expressed in character-consistent ways across turns"
+         - "Knowledge gaps align with persona's described background"
+         - "Emotional arc plausible for the assigned character"
+         - "Growth rate matches persona (fast for eager learner, slow for resistant)"
+       negative:
+         - "Abrupt personality shift between turns"
+         - "Persona-inconsistent knowledge level"
+         - "Generic student responses ignoring persona"
+         - "All learner personas sounding identical"
+     examples:
+       good: "(frustrated student persona, turn 3): 'Okay fine, I'll admit the dialectic thing is less stupid than I thought. But I still don't see why I need to know this for my major. Can you at least show me why it matters outside philosophy?'"
+       bad: "(frustrated student persona, turn 3): 'This is fascinating! I'm really enjoying learning about dialectics. The way Hegel connects thesis and antithesis is truly elegant.'"
+
+ # ══════════════════════════════════════════════════════════════════════════════
+ # WEIGHT REDISTRIBUTION FOR SINGLE-AGENT LEARNERS
+ # ══════════════════════════════════════════════════════════════════════════════
+ #
+ # When deliberation_depth is N/A (single-agent learners), redistribute its 15%:
+ #
+ #   Original weights:                Redistributed weights:
+ #   learner_authenticity:   0.20  →  0.2353  (0.20 / 0.85)
+ #   question_quality:       0.20  →  0.2353  (0.20 / 0.85)
+ #   conceptual_engagement:  0.20  →  0.2353  (0.20 / 0.85)
+ #   revision_signals:       0.15  →  0.1765  (0.15 / 0.85)
+ #   deliberation_depth:     0.15  →  OMITTED
+ #   persona_consistency:    0.10  →  0.1176  (0.10 / 0.85)
+ #
+ # ══════════════════════════════════════════════════════════════════════════════
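To make the scoring formula and the redistribution arithmetic above concrete, here is a small worked example in JavaScript. The weights and the `((weighted_avg - 1) / 4) x 100` mapping are taken directly from this rubric; the function name and the sample scores are illustrative only.

```javascript
// Worked sketch of the rubric's scoring rule: dimension scores on a 1-5 scale,
// weighted average, mapped to 0-100. For single-agent learners,
// deliberation_depth is dropped and the remaining weights are renormalised
// (divided by 0.85), matching the redistribution table above.
const weights = {
  learner_authenticity: 0.20,
  question_quality: 0.20,
  conceptual_engagement: 0.20,
  revision_signals: 0.15,
  deliberation_depth: 0.15,
  persona_consistency: 0.10,
};

function overallScore(scores, { multiAgent = true } = {}) {
  const dims = Object.keys(weights).filter(
    (d) => multiAgent || d !== 'deliberation_depth'
  );
  const totalWeight = dims.reduce((sum, d) => sum + weights[d], 0); // 1.0 or 0.85
  const weightedAvg = dims.reduce(
    (sum, d) => sum + scores[d] * (weights[d] / totalWeight),
    0
  );
  return ((weightedAvg - 1) / 4) * 100; // same mapping as the tutor rubric
}

// Example: a single-agent learner (no deliberation trace) scoring 4s and 3s.
const scores = {
  learner_authenticity: 4,
  question_quality: 3,
  conceptual_engagement: 4,
  revision_signals: 3,
  persona_consistency: 4,
};
console.log(overallScore(scores, { multiAgent: false }).toFixed(1)); // 64.7
```

Here the renormalised weighted average is 3.05 / 0.85 = 3.588, which the formula maps to ((3.588 - 1) / 4) x 100 = 64.7.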