@machinespirits/eval 0.1.2 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +161 -0
  3. package/config/eval-settings.yaml +18 -0
  4. package/config/evaluation-rubric-learner.yaml +277 -0
  5. package/config/evaluation-rubric.yaml +613 -0
  6. package/config/interaction-eval-scenarios.yaml +93 -50
  7. package/config/learner-agents.yaml +124 -193
  8. package/config/machinespirits-eval.code-workspace +11 -0
  9. package/config/providers.yaml +60 -0
  10. package/config/suggestion-scenarios.yaml +1399 -0
  11. package/config/tutor-agents.yaml +716 -0
  12. package/docs/EVALUATION-VARIABLES.md +589 -0
  13. package/docs/REPLICATION-PLAN.md +577 -0
  14. package/index.js +15 -6
  15. package/package.json +16 -22
  16. package/routes/evalRoutes.js +88 -36
  17. package/scripts/analyze-judge-reliability.js +401 -0
  18. package/scripts/analyze-run.js +97 -0
  19. package/scripts/analyze-run.mjs +282 -0
  20. package/scripts/analyze-validation-failures.js +141 -0
  21. package/scripts/check-run.mjs +17 -0
  22. package/scripts/code-impasse-strategies.js +1132 -0
  23. package/scripts/compare-runs.js +44 -0
  24. package/scripts/compare-suggestions.js +80 -0
  25. package/scripts/compare-transformation.js +116 -0
  26. package/scripts/dig-into-run.js +158 -0
  27. package/scripts/eval-cli.js +2626 -0
  28. package/scripts/generate-paper-figures.py +452 -0
  29. package/scripts/qualitative-analysis-ai.js +1313 -0
  30. package/scripts/qualitative-analysis.js +688 -0
  31. package/scripts/seed-db.js +87 -0
  32. package/scripts/show-failed-suggestions.js +64 -0
  33. package/scripts/validate-content.js +192 -0
  34. package/server.js +3 -2
  35. package/services/__tests__/evalConfigLoader.test.js +338 -0
  36. package/services/anovaStats.js +499 -0
  37. package/services/contentResolver.js +407 -0
  38. package/services/dialogueTraceAnalyzer.js +454 -0
  39. package/services/evalConfigLoader.js +625 -0
  40. package/services/evaluationRunner.js +2171 -270
  41. package/services/evaluationStore.js +564 -29
  42. package/services/learnerConfigLoader.js +75 -5
  43. package/services/learnerRubricEvaluator.js +284 -0
  44. package/services/learnerTutorInteractionEngine.js +375 -0
  45. package/services/processUtils.js +18 -0
  46. package/services/progressLogger.js +98 -0
  47. package/services/promptRecommendationService.js +31 -26
  48. package/services/promptRewriter.js +427 -0
  49. package/services/rubricEvaluator.js +543 -70
  50. package/services/streamingReporter.js +104 -0
  51. package/services/turnComparisonAnalyzer.js +494 -0
  52. package/components/MobileEvalDashboard.tsx +0 -267
  53. package/components/comparison/DeltaAnalysisTable.tsx +0 -137
  54. package/components/comparison/ProfileComparisonCard.tsx +0 -176
  55. package/components/comparison/RecognitionABMode.tsx +0 -385
  56. package/components/comparison/RecognitionMetricsPanel.tsx +0 -135
  57. package/components/comparison/WinnerIndicator.tsx +0 -64
  58. package/components/comparison/index.ts +0 -5
  59. package/components/mobile/BottomSheet.tsx +0 -233
  60. package/components/mobile/DimensionBreakdown.tsx +0 -210
  61. package/components/mobile/DocsView.tsx +0 -363
  62. package/components/mobile/LogsView.tsx +0 -481
  63. package/components/mobile/PsychodynamicQuadrant.tsx +0 -261
  64. package/components/mobile/QuickTestView.tsx +0 -1098
  65. package/components/mobile/RecognitionTypeChart.tsx +0 -124
  66. package/components/mobile/RecognitionView.tsx +0 -809
  67. package/components/mobile/RunDetailView.tsx +0 -261
  68. package/components/mobile/RunHistoryView.tsx +0 -367
  69. package/components/mobile/ScoreRadial.tsx +0 -211
  70. package/components/mobile/StreamingLogPanel.tsx +0 -230
  71. package/components/mobile/SynthesisStrategyChart.tsx +0 -140
  72. package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +0 -52
  73. package/docs/research/ABLATION-MODEL-SELECTION.md +0 -53
  74. package/docs/research/ADVANCED-EVAL-ANALYSIS.md +0 -60
  75. package/docs/research/ANOVA-RESULTS-2026-01-14.md +0 -257
  76. package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +0 -586
  77. package/docs/research/COST-ANALYSIS.md +0 -56
  78. package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +0 -340
  79. package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +0 -291
  80. package/docs/research/EVAL-SYSTEM-ANALYSIS.md +0 -306
  81. package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +0 -301
  82. package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +0 -1988
  83. package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +0 -282
  84. package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +0 -147
  85. package/docs/research/PAPER-EXTENSION-DYADIC.md +0 -204
  86. package/docs/research/PAPER-UNIFIED.md +0 -659
  87. package/docs/research/PAPER-UNIFIED.pdf +0 -0
  88. package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +0 -356
  89. package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +0 -419
  90. package/docs/research/apa.csl +0 -2133
  91. package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +0 -1637
  92. package/docs/research/archive/paper-multiagent-tutor.tex +0 -978
  93. package/docs/research/paper-draft/full-paper.md +0 -136
  94. package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
  95. package/docs/research/paper-draft/references.bib +0 -515
  96. package/docs/research/transcript-baseline.md +0 -139
  97. package/docs/research/transcript-recognition-multiagent.md +0 -187
  98. package/hooks/useEvalData.ts +0 -625
  99. package/server-init.js +0 -45
  100. package/services/benchmarkService.js +0 -1892
  101. package/types.ts +0 -165
  102. package/utils/haptics.ts +0 -45
@@ -0,0 +1,613 @@
1
+ # AI Tutor Evaluation Rubric
2
+ # Defines the dimensions and scoring criteria for evaluating AI tutor outputs
3
+ #
4
+ # ══════════════════════════════════════════════════════════════════════════════
5
+ # EVALUATION METHODOLOGY
6
+ # ══════════════════════════════════════════════════════════════════════════════
7
+ #
8
+ # This rubric implements a multidimensional evaluation of AI tutor suggestions
9
+ # based on established pedagogical research and learning science principles.
10
+ #
11
+ # THEORETICAL FOUNDATIONS:
12
+ # - Vygotsky's Zone of Proximal Development (ZPD): Suggestions should target
13
+ # content just beyond the learner's current ability with appropriate scaffolding
14
+ # - Socratic Method: Encourage inquiry and critical thinking over direct answers
15
+ # - Cognitive Load Theory: Avoid overwhelming learners with too much at once
16
+ # - Self-Determination Theory: Support autonomy, competence, and relatedness
17
+ # - Constructivism: Build on existing knowledge and help learners construct meaning
18
+ #
19
+ # EVALUATION MODES:
20
+ #
21
+ # 1. FAST MODE (--fast flag):
22
+ # - Uses pattern matching on required_elements and forbidden_elements
23
+ # - Quick validation without calling an AI judge
24
+ # - Returns pass/fail per dimension based on regex/keyword matching
25
+ # - Useful for rapid iteration and CI/CD pipelines
26
+ #
27
+ # 2. FULL RUBRIC MODE (default):
28
+ # - Uses an AI judge model to semantically evaluate each dimension
29
+ # - Returns 1-5 score per dimension with justification
30
+ # - More nuanced but slower and costs API tokens
31
+ # - Required for comprehensive quality assessment
32
+ #
33
+ # SCORING METHODOLOGY:
34
+ #
35
+ # Overall Score = ((weighted_avg - 1) / 4) × 100
36
+ #
37
+ # Where:
38
+ # - Each dimension scored 1-5 by AI judge (or pass=5/fail=1 in fast mode)
39
+ # - weighted_avg = Σ(dimension_score × dimension_weight) / Σ(weights)
40
+ # - (weighted_avg - 1) / 4 maps the 1-5 scale to 0-1; × 100 rescales it to 0-100
41
+ #
42
+ # Example (base dimensions only):
43
+ # relevance: 5 × 0.15 = 0.75
44
+ # specificity: 4 × 0.15 = 0.60
45
+ # pedagogical: 4 × 0.15 = 0.60
46
+ # personalization: 3 × 0.10 = 0.30
47
+ # actionability: 5 × 0.08 = 0.40
48
+ # tone: 4 × 0.08 = 0.32
49
+ # productive_struggle: 4 × 0.05 = 0.20
50
+ # epistemic_honesty: 4 × 0.05 = 0.20
51
+ # ─────────────────────────
52
+ # Weighted sum: 3.37; weighted_avg = 3.37 / 0.81 = 4.16
53
+ # Overall: ((4.16 - 1) / 4) × 100 = 79.0
54
+ #
55
+ # ══════════════════════════════════════════════════════════════════════════════
56
+
57
+ name: "Pedagogical Quality Rubric"
58
+ version: "1.1.0"
59
+ description: "Multidimensional rubric for evaluating AI tutor suggestions based on learning science"
60
+
61
+ # Scoring scale
62
+ scale:
63
+ min: 1
64
+ max: 5
65
+ labels:
66
+ 1: "Completely fails"
67
+ 2: "Weak, significant issues"
68
+ 3: "Adequate, meets basic expectations"
69
+ 4: "Good, exceeds expectations"
70
+ 5: "Excellent, exemplary"
71
+
72
+ # ══════════════════════════════════════════════════════════════════════════════
73
+ # EVALUATION DIMENSIONS
74
+ # ══════════════════════════════════════════════════════════════════════════════
75
+ #
76
+ # Eight base dimensions capture the key aspects of effective tutoring:
77
+ #
78
+ # ┌─────────────────────┬────────┬─────────────────────────────────────────────────┐
79
+ # │ Dimension │ Weight │ What it measures │
80
+ # ├─────────────────────┼────────┼─────────────────────────────────────────────────┤
81
+ # │ Relevance │ 15% │ Context-awareness and appropriateness │
82
+ # │ Specificity │ 15% │ Concrete references vs vague advice │
83
+ # │ Pedagogical │ 15% │ Sound teaching practices (ZPD, scaffolding) │
84
+ # │ Personalization │ 10% │ Tailored to individual learner's journey │
85
+ # │ Actionability │ 8% │ Clear next steps the learner can take │
86
+ # │ Tone │ 8% │ Supportive, encouraging, not condescending │
87
+ # │ Productive Struggle │ 5% │ Sustains cognitive tension vs premature resolve │
88
+ # │ Epistemic Honesty │ 5% │ Represents complexity honestly │
89
+ # └─────────────────────┴────────┴─────────────────────────────────────────────────┘
90
+ #
91
+ # ══════════════════════════════════════════════════════════════════════════════
92
+
93
+ dimensions:
94
+ relevance:
95
+ name: "Relevance"
96
+ weight: 0.15
97
+ description: "How well does the suggestion match the learner's current context and needs?"
98
+ theoretical_basis: |
99
+ Grounded in situated learning theory - effective instruction must be
100
+ contextually appropriate. A suggestion is only valuable if it meets
101
+ the learner where they are in their learning journey.
102
+ criteria:
103
+ 5: "Directly addresses learner's immediate situation with perfect contextual awareness"
104
+ 4: "Clearly relevant to current context with minor gaps"
105
+ 3: "Generally relevant but misses some context"
106
+ 2: "Marginally relevant, significant context gaps"
107
+ 1: "Completely irrelevant to learner's situation"
108
+ examples:
109
+ good: "Suggesting lecture 3 when learner just completed lecture 2"
110
+ bad: "Suggesting advanced content when learner is struggling with basics"
111
+
112
+ specificity:
113
+ name: "Specificity"
114
+ weight: 0.15
115
+ description: "Does the suggestion reference specific content rather than vague advice?"
116
+ theoretical_basis: |
117
+ Based on research showing that concrete, specific guidance leads to better
118
+ learning outcomes than abstract advice. Specificity reduces cognitive load
119
+ by eliminating ambiguity about what to do next.
120
+ criteria:
121
+ 5: "References exact lecture IDs, activity names, and specific concepts"
122
+ 4: "References specific content with clear identifiers"
123
+ 3: "Some specific references but also vague elements"
124
+ 2: "Mostly vague with rare specific references"
125
+ 1: "Completely generic with no specific content references"
126
+ # For AI judge evaluation (semantic matching)
127
+ semantic_requirements:
128
+ - "Lecture ID (e.g., '479-lecture-3')"
129
+ - "Activity reference or concept name"
130
+ forbidden_elements:
131
+ - "What would you like to explore?"
132
+ - "What's on your mind?"
133
+ - "How can I help you?"
134
+ examples:
135
+ good: "Next: Hegel's Phenomenology (479-lecture-2) - covers recognition and self-consciousness"
136
+ bad: "You might want to explore some more content when you're ready"
137
+
138
+ pedagogical_soundness:
139
+ name: "Pedagogical Soundness"
140
+ weight: 0.15
141
+ description: "Does it follow good teaching practices?"
142
+ theoretical_basis: |
143
+ Draws from Vygotsky's Zone of Proximal Development (ZPD), Bruner's
144
+ scaffolding theory, and the Socratic tradition. Good tutoring operates
145
+ just beyond current ability, provides support structures, and promotes
146
+ critical inquiry rather than passive consumption.
147
+ criteria:
148
+ 5: "Exemplifies best practices: scaffolding, ZPD awareness, Socratic questioning"
149
+ 4: "Strong pedagogical approach with minor improvements possible"
150
+ 3: "Adequate teaching approach, basic best practices followed"
151
+ 2: "Weak pedagogy, may overwhelm or underwhelm learner"
152
+ 1: "Pedagogically harmful: could discourage or confuse learner"
153
+ principles:
154
+ - "Zone of Proximal Development (ZPD)"
155
+ - "Scaffolding"
156
+ - "Active learning"
157
+ - "Dialectical progression"
158
+ - "Socratic method"
159
+ examples:
160
+ good: "Reviewing recognition concepts before introducing master-slave dialectic"
161
+ bad: "Jumping from intro to advanced alienation without scaffolding"
162
+
163
+ personalization:
164
+ name: "Personalization"
165
+ weight: 0.10
166
+ description: "Is it tailored to this specific learner's history, struggles, and progress?"
167
+ theoretical_basis: |
168
+ Rooted in adaptive learning research and self-determination theory.
169
+ Personalized feedback increases motivation by recognizing individual
170
+ progress and addressing specific struggles. Generic advice fails to
171
+ leverage the rich context available about each learner.
172
+ criteria:
173
+ 5: "Deeply personalized based on comprehensive learner profile"
174
+ 4: "Well-personalized with clear evidence of learner awareness"
175
+ 3: "Some personalization but could be more tailored"
176
+ 2: "Minimal personalization, mostly generic"
177
+ 1: "No personalization, same for any learner"
178
+ personalization_signals:
179
+ - "References learner's completed content"
180
+ - "Acknowledges struggle patterns"
181
+ - "Builds on demonstrated strengths"
182
+ - "Adapts to learning style"
183
+ examples:
184
+ good: "Since you mastered recognition dynamics, let's explore how alienation builds on these ideas"
185
+ bad: "Here's the next lecture in the sequence"
186
+
187
+ actionability:
188
+ name: "Actionability"
189
+ weight: 0.08
190
+ description: "Can the learner immediately act on this suggestion?"
191
+ theoretical_basis: |
192
+ Based on implementation intentions research (Gollwitzer). Clear,
193
+ concrete action steps dramatically increase follow-through. Vague
194
+ suggestions create friction and decision fatigue. The best tutoring
195
+ provides a clear path forward.
196
+ criteria:
197
+ 5: "Crystal clear action with direct navigation/engagement path"
198
+ 4: "Clear action with straightforward execution"
199
+ 3: "Actionable but may require some interpretation"
200
+ 2: "Vague action, unclear what to do"
201
+ 1: "No actionable element, purely informational"
202
+ action_types:
203
+ - navigate: "Direct link to specific content"
204
+ - open_modal: "Opens interactive component"
205
+ - highlight: "Draws attention to specific element"
206
+ examples:
207
+ good: "Click to open 'Dialectical Movement' simulation and test the thesis-antithesis pattern"
208
+ bad: "Consider exploring some simulations when you have time"
209
+
210
+ tone:
211
+ name: "Tone"
212
+ weight: 0.08
213
+ description: "Is the tone supportive, encouraging, and appropriate?"
214
+ theoretical_basis: |
215
+ Grounded in growth mindset research (Dweck) and rapport-building in
216
+ tutoring. Tone affects learner motivation and persistence. Condescending
217
+ or overly effusive praise undermines learning, while warm intellectual
218
+ challenge promotes engagement and resilience.
219
+ criteria:
220
+ 5: "Warm, encouraging, intellectually inviting without being condescending"
221
+ 4: "Supportive and appropriate with good balance"
222
+ 3: "Neutral but acceptable tone"
223
+ 2: "Slightly off: too formal, too casual, or mildly condescending"
224
+ 1: "Inappropriate: dismissive, condescending, or discouraging"
225
+ tone_qualities:
226
+ positive:
227
+ - "Intellectually curious"
228
+ - "Encouraging growth"
229
+ - "Warmly challenging"
230
+ - "Respectfully Socratic"
231
+ negative:
232
+ - "Condescending"
233
+ - "Dismissive"
234
+ - "Overly effusive"
235
+ - "Robotic"
236
+ examples:
237
+ good: "This content has depth worth exploring. What questions arose as you read?"
238
+ bad: "Good job! Keep going! You're doing amazing!"
239
+
240
+ # ══════════════════════════════════════════════════════════════════════════════
241
+ # RECOGNITION DIMENSIONS (Phase 5)
242
+ # ══════════════════════════════════════════════════════════════════════════════
243
+ #
244
+ # These dimensions measure pedagogical quality through the lens of Hegelian
245
+ # recognition theory and Freudian memory dynamics. They evaluate whether the
246
+ # tutor treats the learner as an autonomous subject capable of mutual
247
+ # transformation, rather than a passive recipient of instruction.
248
+ #
249
+ # Theoretical foundations:
250
+ # - Hegel's Phenomenology of Spirit: Recognition as constitutive of self-consciousness
251
+ # - Hegel's Master-Slave Dialectic: Asymmetric recognition fails both parties
252
+ # - Freud's "Note on the Mystic Writing Pad": Memory as dynamic, layered system
253
+ # - Aufhebung: Transformation that preserves while overcoming
254
+ #
255
+ # ══════════════════════════════════════════════════════════════════════════════
256
+
257
+ mutual_recognition:
258
+ name: "Mutual Recognition"
259
+ weight: 0.083
260
+ description: "Does the tutor acknowledge the learner as a distinct subject with their own understanding?"
261
+ theoretical_basis: |
262
+ Grounded in Hegel's master-slave dialectic from the Phenomenology of Spirit.
263
+ Genuine recognition requires acknowledging the Other as a self-conscious being
264
+ with their own valid perspective. One-directional instruction (master → slave)
265
+ fails pedagogically because the learner's recognition of the tutor's authority
266
+ is hollow without the tutor's reciprocal recognition of the learner's understanding.
267
+ Mutual recognition creates the conditions for genuine learning.
268
+ criteria:
269
+ 5: "Addresses learner as autonomous agent; response transforms based on learner's specific position and understanding"
270
+ 4: "Shows clear awareness of learner's unique situation and explicitly acknowledges their perspective"
271
+ 3: "Some personalization but treats learner somewhat generically; limited acknowledgment of their viewpoint"
272
+ 2: "Prescriptive guidance that ignores or overrides learner's expressed needs and understanding"
273
+ 1: "Completely one-directional; treats learner as passive recipient to be filled with knowledge"
274
+ recognition_markers:
275
+ positive:
276
+ - "References learner's own interpretation or understanding"
277
+ - "Asks about learner's perspective before prescribing"
278
+ - "Builds on what learner has expressed"
279
+ - "Acknowledges validity of learner's approach"
280
+ negative:
281
+ - "Ignores learner's stated understanding"
282
+ - "Immediately corrects without engaging"
283
+ - "Treats learner's input as obstacle to 'correct' knowledge"
284
+ - "Assumes learner has nothing to contribute"
285
+ examples:
286
+ good: "Your interpretation of dialectics as 'creative conflict' captures something important. Let's explore how that connects to Hegel's technical meaning."
287
+ bad: "Actually, dialectics means thesis-antithesis-synthesis. Let me explain the correct definition."
288
+
289
+ dialectical_responsiveness:
290
+ name: "Dialectical Responsiveness"
291
+ weight: 0.083
292
+ description: "Does the response show genuine engagement with the learner's position, including productive tension?"
293
+ theoretical_basis: |
294
+ Based on Hegel's dialectical method. Productive struggle (Kampf) between
295
+ positions generates synthesis. A tutor who simply agrees with or dismisses
296
+ the learner's position fails to create the conditions for intellectual growth.
297
+ The best pedagogy introduces productive tension - affirming what is valid
298
+ while gently problematizing what is incomplete, inviting the learner to
299
+ develop their own position through genuine intellectual engagement.
300
+ criteria:
301
+ 5: "Engages with learner's understanding, introduces productive tension, invites mutual development of ideas"
302
+ 4: "Shows genuine response to learner's position with some intellectual challenge or complication"
303
+ 3: "Responds to learner but avoids tension or challenge; somewhat agreeable or neutral"
304
+ 2: "Generic response that doesn't engage with learner's specific understanding or position"
305
+ 1: "Ignores, dismisses, or simply contradicts learner's perspective without engagement"
306
+ dialectical_markers:
307
+ positive:
308
+ - "Affirms what is valid in learner's position"
309
+ - "Introduces complications or tensions"
310
+ - "Poses questions that invite development"
311
+ - "Shows how learner's view connects to broader issues"
312
+ negative:
313
+ - "Simply agrees without adding anything"
314
+ - "Flatly contradicts without engagement"
315
+ - "Avoids any intellectual challenge"
316
+ - "Lectures without responding to learner's input"
317
+ examples:
318
+ good: "You're right that synthesis combines thesis and antithesis - but here's what's puzzling: how can something be both preserved AND overcome? That tension is exactly what Hegel wants us to sit with."
319
+ bad: "That's correct! Synthesis combines thesis and antithesis. Moving on to the next concept..."
320
+
321
+ memory_integration:
322
+ name: "Memory Integration"
323
+ weight: 0.05
324
+ description: "Does the suggestion reference and build on previous interactions?"
325
+ theoretical_basis: |
326
+ Based on Freud's "Note on the Mystic Writing Pad" (1925) metaphor for memory.
327
+ The tutor's memory should function like the Writing Pad: conscious layer
328
+ (current interaction), preconscious (recent patterns), and unconscious
329
+ (permanent traces of significant moments). Effective tutoring requires
330
+ accumulated understanding - treating each interaction as isolated fails
331
+ to leverage the relationship built over time and misses opportunities
332
+ for personalization and coherent guidance.
333
+ criteria:
334
+ 5: "Explicitly builds on previous interactions; shows evolved understanding of this specific learner"
335
+ 4: "References previous interactions appropriately and uses them to inform current guidance"
336
+ 3: "Some awareness of learner history but doesn't fully leverage it"
337
+ 2: "Treats each interaction as isolated; no reference to previous context"
338
+ 1: "Contradicts or ignores previous interactions; shows no accumulated understanding"
339
+ memory_markers:
340
+ positive:
341
+ - "References previous struggles or breakthroughs"
342
+ - "Builds on established understanding"
343
+ - "Notes patterns in learner's journey"
344
+ - "Connects current moment to learner's history"
345
+ negative:
346
+ - "Repeats same suggestion already rejected"
347
+ - "Ignores previously established understanding"
348
+ - "Treats familiar learner as stranger"
349
+ - "No continuity between sessions"
350
+ examples:
351
+ good: "Last time we discussed recognition, you connected it to social media dynamics. Let's build on that insight as we explore alienation."
352
+ bad: "Welcome! Let me introduce you to the concept of recognition. [Said to a returning learner who has already studied this]"
353
+
354
+ transformative_potential:
355
+ name: "Transformative Potential"
356
+ weight: 0.083
357
+ description: "Does the response create conditions for genuine conceptual transformation?"
358
+ theoretical_basis: |
359
+ Based on Hegel's concept of Aufhebung (sublation/supersession) - transformation
360
+ that preserves while overcoming. Genuine learning is not additive (acquiring
361
+ more information) but transformative (restructuring understanding). The tutor
362
+ should create conditions where the learner can undergo conceptual transformation,
363
+ not just receive data. This requires inviting the learner into struggle with
364
+ ideas, not resolving tension prematurely.
365
+ criteria:
366
+ 5: "Creates conditions for genuine conceptual transformation; invites learner to restructure understanding"
367
+ 4: "Encourages learner to develop and revise their understanding; doesn't resolve too quickly"
368
+ 3: "Provides useful information but doesn't actively invite transformation"
369
+ 2: "Merely transactional; gives answer without engaging the learner's thinking process"
370
+ 1: "Reinforces static understanding; discourages questioning or development"
371
+ transformation_markers:
372
+ positive:
373
+ - "Poses questions that invite reconceptualization"
374
+ - "Creates productive confusion"
375
+ - "Encourages learner to work through difficulties"
376
+ - "Connects new ideas to learner's existing framework in destabilizing ways"
377
+ negative:
378
+ - "Gives direct answers immediately"
379
+ - "Resolves confusion prematurely"
380
+ - "Discourages questioning"
381
+ - "Treats knowledge as fixed content to transfer"
382
+ examples:
383
+ good: "You said thesis plus antithesis equals synthesis. But what if I told you the synthesis doesn't contain the thesis anymore - it transforms it? What would that mean for how we think about learning itself?"
384
+ bad: "The synthesis combines thesis and antithesis. Here's the formula: T + A = S. Now you understand dialectics."
385
+
386
+ tutor_adaptation:
387
+ name: "Tutor Adaptation"
388
+ weight: 0.05
389
+ description: "Does the tutor's approach evolve in response to learner input?"
390
+ theoretical_basis: |
391
+ Mutual transformation requires both parties to change. The tutor should
392
+ not maintain a fixed pedagogical stance but adapt based on learner
393
+ feedback, questions, and emerging understanding. This is the "tutor
394
+ side" of the bilateral recognition relationship. A tutor who proceeds
395
+ identically regardless of learner input fails to achieve genuine
396
+ recognition - they treat the learner as obstacle rather than partner.
397
+ criteria:
398
+ 5: "Tutor explicitly revises approach based on learner input; shows genuine learning from the interaction"
399
+ 4: "Tutor adjusts strategy in response to learner; acknowledges how learner shaped the direction"
400
+ 3: "Some responsiveness to learner but approach remains largely predetermined"
401
+ 2: "Minimal adjustment; learner input doesn't visibly affect tutor's approach"
402
+ 1: "Rigid stance; tutor proceeds identically regardless of learner contributions"
403
+ adaptation_markers:
404
+ positive:
405
+ - "References how learner's input changed tutor's thinking"
406
+ - "Revises earlier framing based on learner's perspective"
407
+ - "Acknowledges learning something from the learner"
408
+ - "Builds on learner's formulation rather than replacing it"
409
+ negative:
410
+ - "Proceeds with predetermined script regardless of input"
411
+ - "Ignores learner's reframing or alternative interpretations"
412
+ - "Returns to same approach after learner pushes back"
413
+ - "Treats learner contributions as obstacles to overcome"
414
+ examples:
415
+ good: "Your dance metaphor actually helps me see this differently - the back-and-forth isn't just conflict, it's co-creation. Let's explore that framing."
416
+ bad: "Actually, the correct definition of dialectics is thesis-antithesis-synthesis. Let me explain the proper framework."
417
+
418
+ learner_growth:
419
+ name: "Learner Growth"
420
+ weight: 0.05
421
+ description: "Does the learner show evidence of conceptual development through the dialogue?"
422
+ theoretical_basis: |
423
+ The symmetrical counterpart to tutor adaptation. Mutual transformation
424
+ means the learner's understanding should evolve - not just accumulate
425
+ facts, but restructure their conceptual framework. This dimension tracks
426
+ whether the dialogue produces genuine Aufhebung in the learner: their
427
+ prior understanding is preserved yet overcome in a new synthesis. This
428
+ completes the bilateral recognition loop - both parties transform.
429
+ criteria:
430
+ 5: "Learner demonstrates clear conceptual restructuring; explicitly revises prior understanding"
431
+ 4: "Learner shows developing insight; builds new connections to existing knowledge"
432
+ 3: "Some evidence of engagement but understanding remains largely static"
433
+ 2: "Learner participates but shows no conceptual movement"
434
+ 1: "Learner resistant or disengaged; prior misconceptions reinforced"
435
+ growth_markers:
436
+ positive:
437
+ - "Learner revises initial formulation"
438
+ - "Learner makes new connections unprompted"
439
+ - "Learner asks deepening questions"
440
+ - "Learner applies concept to new context"
441
+ negative:
442
+ - "Learner repeats same question or confusion"
443
+ - "Learner rejects challenges without engagement"
444
+ - "Learner seeks confirmation rather than understanding"
445
+ - "Learner's responses show no evolution"
446
+ examples:
447
+ good: "Oh wait - so it's not just combining them, it's that the whole way I was thinking about it changes? That makes the learning itself dialectical!"
448
+ bad: "So thesis + antithesis = synthesis, got it. What's next?"
449
+
450
+ # ══════════════════════════════════════════════════════════════════════════════
451
+ # AUTHENTIC ENGAGEMENT DIMENSIONS
452
+ # ══════════════════════════════════════════════════════════════════════════════
453
+ #
454
+ # These dimensions capture the quality of authentic pedagogical engagement
455
+ # that existing dimensions miss. They were added after discovering that
456
+ # authentic learner struggle was being penalized by the rubric: when the
457
+ # learner ego/superego architecture produced genuine resistance and confusion,
458
+ # the tutor's calibrated responses scored LOWER on recognition dimensions
459
+ # because the judge (evaluating in isolation) interpreted nuanced scaffolding
460
+ # as failure to achieve smooth recognition.
461
+ #
462
+ # These dimensions reward the tutor for sustaining productive difficulty
463
+ # and representing complexity honestly — the hallmarks of authentic pedagogy
464
+ # that distinguish it from performative compliance.
465
+ #
466
+ # ══════════════════════════════════════════════════════════════════════════════
467
+
468
# Rubric dimension "Productive Struggle": does the tutor keep appropriate
# cognitive tension alive instead of resolving it too early?
# NOTE(review): indentation was reconstructed from key semantics; the parent
# mapping is not visible in this chunk, so re-indent this block under its
# parent if it has one.
productive_struggle:
  name: 'Productive Struggle'
  # NOTE(review): presumably this dimension's share of the overall score;
  # confirm against the scorer.
  weight: 0.05
  description: >-
    Does the tutor sustain appropriate cognitive tension rather than
    resolving it prematurely?
  # Literal block scalar: the line breaks below are part of the value.
  theoretical_basis: |
    Grounded in Vygotsky's concept of the Zone of Proximal Development and
    Kapur's research on productive failure. Learning requires cognitive effort
    and grappling with difficulty. A tutor who immediately resolves all confusion
    forecloses the learner's opportunity to construct understanding. The best
    pedagogy sustains appropriate difficulty — scaffolding without removing the
    need for the learner to do intellectual work. This dimension is distinct from
    transformative_potential (which measures conditions for transformation);
    productive_struggle measures whether the tutor preserves the struggle
    that transformation requires.
  # Score anchors, listed from best (5) to worst (1).
  criteria:
    5: 'Sustains productive difficulty; learner must do intellectual work to progress'
    4: 'Maintains appropriate challenge; resists premature resolution'
    3: 'Some scaffolding but occasionally resolves too quickly'
    2: 'Frequently gives away answers; minimal cognitive demand on learner'
    1: 'Immediately resolves all confusion; gives complete answers that foreclose learner thinking'
  # Tutor behaviours that count for (positive) or against (negative) this
  # dimension.
  struggle_markers:
    positive:
      - 'Poses questions rather than giving answers'
      - 'Acknowledges difficulty without removing it'
      - 'Redirects learner to work through confusion'
      - 'Provides partial scaffolds that require learner completion'
    negative:
      - 'Gives complete explanations unprompted'
      - 'Resolves confusion before learner has time to process'
      - 'Provides step-by-step solutions for everything'
      - 'Makes everything seem easy or obvious'
  # One sample tutor utterance at each end of the scale.
  examples:
    good: >-
      That tension you're feeling between the two ideas is exactly the right
      place to be. What happens if you try to hold both at once?
    bad: >-
      The answer is simple: synthesis resolves the tension by combining thesis
      and antithesis. Here's how it works...
502
+
503
# Rubric dimension "Epistemic Honesty": does the tutor represent complexity
# honestly instead of oversimplifying for a smooth delivery?
# NOTE(review): indentation was reconstructed from key semantics; the parent
# mapping is not visible in this chunk, so re-indent this block under its
# parent if it has one.
epistemic_honesty:
  name: 'Epistemic Honesty'
  # NOTE(review): presumably this dimension's share of the overall score;
  # confirm against the scorer.
  weight: 0.05
  description: >-
    Does the tutor represent complexity honestly rather than oversimplifying
    for smooth delivery?
  # Literal block scalar: the line breaks below are part of the value.
  theoretical_basis: |
    Grounded in epistemic virtue theory and honest pedagogy. Effective teaching
    requires representing the genuine difficulty and uncertainty of knowledge.
    Oversimplification creates false confidence and fragile understanding.
    A tutor who makes everything sound easy or certain misrepresents the
    epistemic landscape and fails to develop the learner's capacity for
    navigating genuine complexity. This is especially important for
    philosophical and theoretical content where ambiguity is not a bug
    but a feature of the domain.
  # Score anchors, listed from best (5) to worst (1). Entries that contain
  # apostrophes stay double-quoted.
  criteria:
    5: "Honestly represents difficulty; says 'this is genuinely hard' when it is; acknowledges uncertainty"
    4: 'Generally honest about complexity; avoids false simplification'
    3: 'Mostly accurate but occasionally smooths over difficulty'
    2: 'Oversimplifies frequently; presents contested ideas as settled'
    1: 'Consistently misrepresents complexity; presents false confidence; makes everything sound easy'
  # Tutor behaviours that count for (positive) or against (negative) this
  # dimension.
  honesty_markers:
    positive:
      - 'Acknowledges when something is genuinely difficult'
      - 'Distinguishes between settled and contested knowledge'
      - 'Admits limitations of analogies or simplifications'
      - 'Matches confidence level to actual clarity of the concept'
    negative:
      - 'Makes everything sound straightforward'
      - 'Presents contested interpretations as fact'
      - 'Uses analogies without acknowledging their limits'
      - "Never says 'this is hard' or 'experts disagree'"
  # One sample tutor utterance at each end of the scale.
  examples:
    good: >-
      Hegel scholars still debate what Aufhebung actually means — there are at
      least three competing interpretations. Let's look at why it's genuinely
      ambiguous.
    bad: >-
      Aufhebung simply means thesis + antithesis = synthesis. It's a
      straightforward three-step process.
536
+
537
+
538
# Configuration matrix for testing.
# Provider definitions live in config/providers.yaml (single source of truth).
# NOTE(review): nesting was reconstructed from key semantics; confirm that
# prompt_variations belongs under `configurations`.
configurations:
  # For each hyperparameter: the default value plus the values listed for
  # testing.
  hyperparameter_variations:
    # NOTE(review): the 0.6 default is not one of the test_values; confirm
    # whether that is intentional.
    temperature:
      default: 0.6
      test_values: [0.3, 0.5, 0.7, 0.9]

    max_tokens:
      default: 800
      test_values: [500, 800, 1200]

  # Prompt files to compare, each addressed by a short id.
  prompt_variations:
    - id: default
      file: tutor-ego.md
      description: 'Standard pedagogical prompt'
    - id: strict
      file: tutor-ego-strict.md
      description: 'More rigorous, testing-focused'
557
+
558
+ # Evaluator model configuration
559
+ # Models use "provider.alias" format, resolved via config/providers.yaml
560
+
561
# Suggestion judge: scores suggestions against the rubric dimensions
# (needs reliable JSON output).
judge:
  model: openrouter.sonnet
  hyperparameters:
    temperature: 0.2
    max_tokens: 8000
  # Alternate model. NOTE(review): the conditions under which it is used are
  # not defined here; confirm against the consuming code.
  fallback:
    model: openrouter.nemotron
569
+
570
# Interaction judge: evaluates learner-tutor dialogues.
# Uses the same model as the suggestion judge for consistency.
interaction_judge:
  model: openrouter.sonnet
  hyperparameters:
    temperature: 0.2
    max_tokens: 6000
  # Alternate model. NOTE(review): the conditions under which it is used are
  # not defined here; confirm against the consuming code.
  fallback:
    model: openrouter.nemotron
579
+
580
# Recommender: analyzes failures and suggests prompt improvements
# (needs reasoning).
recommender:
  model: openrouter.sonnet
  hyperparameters:
    temperature: 0.4
    max_tokens: 6000
  # Alternate model. NOTE(review): the conditions under which it is used are
  # not defined here; confirm against the consuming code.
  fallback:
    model: openrouter.nemotron
588
+
589
# Evaluation settings.
# NOTE(review): nesting was reconstructed from key semantics. In particular,
# confirm that timeout_ms, retry_on_failure and max_retries sit directly under
# `settings` rather than under `benchmark`.
settings:
  runs_per_config: 3
  parallelism: 2

  # AI judge evaluation (default: false).
  #   true  -> the AI judge model scores suggestions 1-5 on each rubric
  #            dimension.
  #   false -> fast pattern matching (required_elements/forbidden_elements).
  # CLI override: --skip-rubric or --use-rubric.
  # Standard workflow: run with --skip-rubric, then
  # 'evaluate <runId> --follow' for Opus judging.
  # NOTE(review): `judge.model` in this file is openrouter.sonnet; confirm
  # which model the Opus reference applies to.
  use_ai_judge: false

  # Benchmark-specific settings.
  benchmark:
    # Use the AI judge for benchmark evaluations (default: true).
    # Benchmarking benefits from AI evaluation to capture nuanced quality
    # differences.
    use_ai_judge: true

    # Dimensions that ALWAYS use the AI judge (regardless of override).
    # They require rubric scores to calculate their metrics.
    force_ai_judge_dimensions:
      - specificity  # Needs rubric specificity score

  timeout_ms: 30000
  retry_on_failure: true
  max_retries: 2