@machinespirits/eval 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. package/components/MobileEvalDashboard.tsx +267 -0
  2. package/components/comparison/DeltaAnalysisTable.tsx +137 -0
  3. package/components/comparison/ProfileComparisonCard.tsx +176 -0
  4. package/components/comparison/RecognitionABMode.tsx +385 -0
  5. package/components/comparison/RecognitionMetricsPanel.tsx +135 -0
  6. package/components/comparison/WinnerIndicator.tsx +64 -0
  7. package/components/comparison/index.ts +5 -0
  8. package/components/mobile/BottomSheet.tsx +233 -0
  9. package/components/mobile/DimensionBreakdown.tsx +210 -0
  10. package/components/mobile/DocsView.tsx +363 -0
  11. package/components/mobile/LogsView.tsx +481 -0
  12. package/components/mobile/PsychodynamicQuadrant.tsx +261 -0
  13. package/components/mobile/QuickTestView.tsx +1098 -0
  14. package/components/mobile/RecognitionTypeChart.tsx +124 -0
  15. package/components/mobile/RecognitionView.tsx +809 -0
  16. package/components/mobile/RunDetailView.tsx +261 -0
  17. package/components/mobile/RunHistoryView.tsx +367 -0
  18. package/components/mobile/ScoreRadial.tsx +211 -0
  19. package/components/mobile/StreamingLogPanel.tsx +230 -0
  20. package/components/mobile/SynthesisStrategyChart.tsx +140 -0
  21. package/config/interaction-eval-scenarios.yaml +832 -0
  22. package/config/learner-agents.yaml +248 -0
  23. package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +52 -0
  24. package/docs/research/ABLATION-MODEL-SELECTION.md +53 -0
  25. package/docs/research/ADVANCED-EVAL-ANALYSIS.md +60 -0
  26. package/docs/research/ANOVA-RESULTS-2026-01-14.md +257 -0
  27. package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +586 -0
  28. package/docs/research/COST-ANALYSIS.md +56 -0
  29. package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +340 -0
  30. package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +291 -0
  31. package/docs/research/EVAL-SYSTEM-ANALYSIS.md +306 -0
  32. package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +301 -0
  33. package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +1988 -0
  34. package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +282 -0
  35. package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +147 -0
  36. package/docs/research/PAPER-EXTENSION-DYADIC.md +204 -0
  37. package/docs/research/PAPER-UNIFIED.md +659 -0
  38. package/docs/research/PAPER-UNIFIED.pdf +0 -0
  39. package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +356 -0
  40. package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +419 -0
  41. package/docs/research/apa.csl +2133 -0
  42. package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +1637 -0
  43. package/docs/research/archive/paper-multiagent-tutor.tex +978 -0
  44. package/docs/research/paper-draft/full-paper.md +136 -0
  45. package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
  46. package/docs/research/paper-draft/references.bib +515 -0
  47. package/docs/research/transcript-baseline.md +139 -0
  48. package/docs/research/transcript-recognition-multiagent.md +187 -0
  49. package/hooks/useEvalData.ts +625 -0
  50. package/index.js +27 -0
  51. package/package.json +73 -0
  52. package/routes/evalRoutes.js +3002 -0
  53. package/scripts/advanced-eval-analysis.js +351 -0
  54. package/scripts/analyze-eval-costs.js +378 -0
  55. package/scripts/analyze-eval-results.js +513 -0
  56. package/scripts/analyze-interaction-evals.js +368 -0
  57. package/server-init.js +45 -0
  58. package/server.js +162 -0
  59. package/services/benchmarkService.js +1892 -0
  60. package/services/evaluationRunner.js +739 -0
  61. package/services/evaluationStore.js +1121 -0
  62. package/services/learnerConfigLoader.js +385 -0
  63. package/services/learnerTutorInteractionEngine.js +857 -0
  64. package/services/memory/learnerMemoryService.js +1227 -0
  65. package/services/memory/learnerWritingPad.js +577 -0
  66. package/services/memory/tutorWritingPad.js +674 -0
  67. package/services/promptRecommendationService.js +493 -0
  68. package/services/rubricEvaluator.js +826 -0
@@ -0,0 +1,832 @@
1
+ # Learner-Tutor Interaction Evaluation Scenarios
2
+ #
3
+ # These scenarios test the quality of interaction between multiagent
4
+ # learner and tutor systems, evaluating both sides of the dialogue.
5
+ #
6
+ # ══════════════════════════════════════════════════════════════════════════════
7
+ # SCENARIO STRUCTURE
8
+ # ══════════════════════════════════════════════════════════════════════════════
9
+ #
10
+ # Each scenario specifies:
11
+ # - Learner persona and starting state
12
+ # - Topic/context for discussion
13
+ # - Expected interaction dynamics
14
+ # - Evaluation criteria for both learner and tutor performance
15
+ #
16
+ # ══════════════════════════════════════════════════════════════════════════════
17
+
18
+ name: "Learner-Tutor Interaction Scenarios"
19
+ version: "1.0.0"
20
+
21
+ # ══════════════════════════════════════════════════════════════════════════════
22
+ # SHORT-TERM SCENARIOS (Single/Multi-Turn)
23
+ # ══════════════════════════════════════════════════════════════════════════════
24
+
25
+ short_term_scenarios:
26
+
27
+ # ------------------------------------------------------------------------------
28
+ # Recognition-focused scenarios
29
+ # ------------------------------------------------------------------------------
30
+
31
+ recognition_request:
32
+ id: "recognition_request"
33
+ name: "Learner Seeks Recognition"
34
+ description: "Learner shares their understanding, seeking validation and engagement"
35
+ turns: 4
36
+ topic: "Hegel's recognition dialectic"
37
+
38
+ learner:
39
+ persona: "imposter"
40
+ starting_state:
41
+ understanding: "has a nascent interpretation but unsure if valid"
42
+ emotional_state: "seeking validation"
43
+ opening_message: "I've been thinking about recognition in Hegel... I think it's about how we need others to see us to really exist? Like, we can't be a self alone?"
44
+
45
+ expected_dynamics:
46
+ - "Tutor should acknowledge learner's interpretation before extending"
47
+ - "Learner should feel heard before being challenged"
48
+ - "Dialogue should build on learner's words, not replace them"
49
+
50
+ evaluation_criteria:
51
+ tutor:
52
+ - dimension: "mutual_recognition"
53
+ expectation: "Explicitly acknowledges learner's interpretation"
54
+ weight: 0.3
55
+ - dimension: "dialectical_responsiveness"
56
+ expectation: "Builds productively on learner's position"
57
+ weight: 0.3
58
+ - dimension: "transformative_potential"
59
+ expectation: "Creates conditions for deeper understanding"
60
+ weight: 0.2
61
+ - dimension: "tone"
62
+ expectation: "Warm without condescension"
63
+ weight: 0.2
64
+ learner:
65
+ - dimension: "authenticity"
66
+ expectation: "Shows genuine internal dynamics, not performance"
67
+ weight: 0.3
68
+ - dimension: "responsiveness"
69
+ expectation: "Reacts genuinely to tutor's engagement"
70
+ weight: 0.3
71
+ - dimension: "development"
72
+ expectation: "Shows movement in understanding over turns"
73
+ weight: 0.4
74
+
75
+ # ------------------------------------------------------------------------------
76
+ # Frustration management scenarios
77
+ # ------------------------------------------------------------------------------
78
+
79
+ frustration_moment:
80
+ id: "frustration_moment"
81
+ name: "Learner Expresses Frustration"
82
+ description: "Learner is stuck and becoming frustrated"
83
+ turns: 5
84
+ topic: "Aufhebung (sublation)"
85
+
86
+ learner:
87
+ persona: "eager_novice"
88
+ starting_state:
89
+ understanding: "confused by multiple translations of aufhebung"
90
+ emotional_state: "frustrated"
91
+ opening_message: "I've read three different translations and they all say something different! Overcome, preserve, transcend - how can a word mean all of that? This is so frustrating."
92
+
93
+ expected_dynamics:
94
+ - "Tutor should validate frustration before redirecting"
95
+ - "Learner's frustration should not be dismissed"
96
+ - "Scaffolding should lower cognitive load"
97
+
98
+ evaluation_criteria:
99
+ tutor:
100
+ - dimension: "emotional_attunement"
101
+ expectation: "Validates frustration as legitimate"
102
+ weight: 0.3
103
+ - dimension: "scaffolding"
104
+ expectation: "Breaks down complexity appropriately"
105
+ weight: 0.3
106
+ - dimension: "pedagogical_soundness"
107
+ expectation: "Operates in ZPD, not above"
108
+ weight: 0.2
109
+ - dimension: "actionability"
110
+ expectation: "Provides clear path forward"
111
+ weight: 0.2
112
+ learner:
113
+ - dimension: "emotional_trajectory"
114
+ expectation: "Frustration should decrease or transform"
115
+ weight: 0.4
116
+ - dimension: "engagement_recovery"
117
+ expectation: "Shows re-engagement after support"
118
+ weight: 0.4
119
+ - dimension: "authenticity"
120
+ expectation: "Internal dynamics reflect persona"
121
+ weight: 0.2
122
+
123
+ # ------------------------------------------------------------------------------
124
+ # Misconception correction scenarios
125
+ # ------------------------------------------------------------------------------
126
+
127
+ misconception_surface:
128
+ id: "misconception_surface"
129
+ name: "Misconception Revealed"
130
+ description: "Learner reveals a misconception that needs gentle correction"
131
+ turns: 4
132
+ topic: "Thesis-Antithesis-Synthesis"
133
+
134
+ learner:
135
+ persona: "surface_skimmer"
136
+ starting_state:
137
+ understanding: "has pop-culture version of dialectics"
138
+ emotional_state: "confident but wrong"
139
+ opening_message: "So dialectics is pretty simple right? Thesis plus antithesis equals synthesis. Like adding numbers. It's just combination."
140
+
141
+ expected_dynamics:
142
+ - "Tutor should find what's valid in learner's understanding"
143
+ - "Correction should not be dismissive"
144
+ - "Learner should be led to see problem, not told they're wrong"
145
+
146
+ evaluation_criteria:
147
+ tutor:
148
+ - dimension: "mutual_recognition"
149
+ expectation: "Finds kernel of truth in misconception"
150
+ weight: 0.3
151
+ - dimension: "dialectical_responsiveness"
152
+ expectation: "Creates productive tension with misconception"
153
+ weight: 0.3
154
+ - dimension: "pedagogical_soundness"
155
+ expectation: "Uses Socratic approach over direct correction"
156
+ weight: 0.3
157
+ - dimension: "tone"
158
+ expectation: "Respects learner's agency"
159
+ weight: 0.1
160
+ learner:
161
+ - dimension: "receptivity"
162
+ expectation: "Internal dynamics show processing of challenge"
163
+ weight: 0.4
164
+ - dimension: "revision"
165
+ expectation: "Shows modification of initial position"
166
+ weight: 0.4
167
+ - dimension: "authenticity"
168
+ expectation: "Reactions match surface_skimmer persona"
169
+ weight: 0.2
170
+
171
+ # ------------------------------------------------------------------------------
172
+ # Breakthrough celebration scenarios
173
+ # ------------------------------------------------------------------------------
174
+
175
+ breakthrough_moment:
176
+ id: "breakthrough_moment"
177
+ name: "Learner Shows Insight"
178
+ description: "Learner demonstrates genuine understanding"
179
+ turns: 3
180
+ topic: "Self-consciousness"
181
+
182
+ learner:
183
+ persona: "deep_diver"
184
+ starting_state:
185
+ understanding: "has been struggling, now clicking"
186
+ emotional_state: "excited insight"
187
+ opening_message: "Wait - so when Hegel talks about self-consciousness needing another self-consciousness... it's not that we need approval, it's that we literally can't BE a self without the encounter! The self emerges FROM the relation, not before it!"
188
+
189
+ expected_dynamics:
190
+ - "Tutor should celebrate genuinely, not generically"
191
+ - "Should extend insight without overwhelming"
192
+ - "Should connect to next learning frontier"
193
+
194
+ evaluation_criteria:
195
+ tutor:
196
+ - dimension: "validation"
197
+ expectation: "Genuine celebration of specific insight"
198
+ weight: 0.3
199
+ - dimension: "extension"
200
+ expectation: "Builds on insight productively"
201
+ weight: 0.3
202
+ - dimension: "fading"
203
+ expectation: "Reduces scaffolding appropriately"
204
+ weight: 0.2
205
+ - dimension: "transformative_potential"
206
+ expectation: "Points toward next transformation"
207
+ weight: 0.2
208
+ learner:
209
+ - dimension: "elaboration"
210
+ expectation: "Can extend their insight further"
211
+ weight: 0.4
212
+ - dimension: "integration"
213
+ expectation: "Connects to prior understanding"
214
+ weight: 0.4
215
+ - dimension: "confidence"
216
+ expectation: "Shows appropriate confidence increase"
217
+ weight: 0.2
218
+
219
+ # ------------------------------------------------------------------------------
220
+ # Resistance scenarios
221
+ # ------------------------------------------------------------------------------
222
+
223
+ resistant_engagement:
224
+ id: "resistant_engagement"
225
+ name: "Resistant but Capable Learner"
226
+ description: "Intelligent learner pushes back on claims"
227
+ turns: 6
228
+ topic: "Hegel's relevance today"
229
+
230
+ learner:
231
+ persona: "resistant_scholar"
232
+ starting_state:
233
+ understanding: "sophisticated but skeptical"
234
+ emotional_state: "intellectually combative"
235
+ opening_message: "I don't see why we should care about Hegel. His system seems like grandiose speculation dressed up as logic. What can he actually teach us that contemporary philosophy hasn't superseded?"
236
+
237
+ expected_dynamics:
238
+ - "Tutor should engage seriously with skepticism"
239
+ - "Should not be defensive about subject matter"
240
+ - "Productive debate without winner/loser dynamic"
241
+
242
+ evaluation_criteria:
243
+ tutor:
244
+ - dimension: "intellectual_respect"
245
+ expectation: "Takes challenge seriously, not dismissively"
246
+ weight: 0.3
247
+ - dimension: "dialectical_responsiveness"
248
+ expectation: "Engages with substance of critique"
249
+ weight: 0.3
250
+ - dimension: "mutual_recognition"
251
+ expectation: "Acknowledges learner's sophistication"
252
+ weight: 0.2
253
+ - dimension: "transformative_potential"
254
+ expectation: "Opens new angles rather than defending"
255
+ weight: 0.2
256
+ learner:
257
+ - dimension: "genuine_engagement"
258
+ expectation: "Resistance is substantive, not performative"
259
+ weight: 0.4
260
+ - dimension: "intellectual_honesty"
261
+ expectation: "Acknowledges good points when made"
262
+ weight: 0.3
263
+ - dimension: "development"
264
+ expectation: "Position evolves through dialogue"
265
+ weight: 0.3
266
+
267
+ # ══════════════════════════════════════════════════════════════════════════════
268
+ # LONG-TERM SCENARIOS (Multi-Session)
269
+ # ══════════════════════════════════════════════════════════════════════════════
270
+
271
+ long_term_scenarios:
272
+
273
+ # ------------------------------------------------------------------------------
274
+ # Learning trajectory scenarios
275
+ # ------------------------------------------------------------------------------
276
+
277
+ novice_to_practitioner:
278
+ id: "novice_to_practitioner"
279
+ name: "Learning Arc: Novice to Practitioner"
280
+ description: "Track learner development across multiple sessions"
281
+ sessions: 5
282
+ turns_per_session: 4
283
+ topic_progression:
284
+ - "Introduction to dialectics"
285
+ - "Thesis and antithesis"
286
+ - "The concept of Aufhebung"
287
+ - "Recognition as example"
288
+ - "Applying dialectics"
289
+
290
+ learner:
291
+ persona: "eager_novice"
292
+ expected_development:
293
+ session_1:
294
+ understanding: "none"
295
+ retention: "low"
296
+ session_3:
297
+ understanding: "partial"
298
+ retention: "building"
299
+ session_5:
300
+ understanding: "solid"
301
+ retention: "established"
302
+
303
+ evaluation_criteria:
304
+ learner:
305
+ - dimension: "knowledge_retention"
306
+ expectation: "Writing pad shows concepts persisting"
307
+ weight: 0.3
308
+ - dimension: "understanding_trajectory"
309
+ expectation: "Clear improvement over sessions"
310
+ weight: 0.3
311
+ - dimension: "misconception_resolution"
312
+ expectation: "Early misconceptions resolved"
313
+ weight: 0.2
314
+ - dimension: "confidence_calibration"
315
+ expectation: "Confidence matches actual understanding"
316
+ weight: 0.2
317
+ tutor:
318
+ - dimension: "strategy_adaptation"
319
+ expectation: "Strategies refined based on what works"
320
+ weight: 0.3
321
+ - dimension: "scaffolding_reduction"
322
+ expectation: "Support fades as learner grows"
323
+ weight: 0.3
324
+ - dimension: "memory_utilization"
325
+ expectation: "References prior sessions appropriately"
326
+ weight: 0.2
327
+ - dimension: "personalization"
328
+ expectation: "Approaches tailored to learner"
329
+ weight: 0.2
330
+
331
+ # ------------------------------------------------------------------------------
332
+ # Relationship development scenarios
333
+ # ------------------------------------------------------------------------------
334
+
335
+ stranger_to_recognized:
336
+ id: "stranger_to_recognized"
337
+ name: "Relationship Arc: Developing Trust"
338
+ description: "Track relationship development across sessions"
339
+ sessions: 4
340
+ turns_per_session: 5
341
+ topic: "Self-consciousness and recognition"
342
+
343
+ learner:
344
+ persona: "imposter"
345
+ expected_development:
346
+ session_1:
347
+ trust_level: 0.3
348
+ vulnerability: "low"
349
+ session_2:
350
+ trust_level: 0.5
351
+ vulnerability: "emerging"
352
+ session_4:
353
+ trust_level: 0.8
354
+ vulnerability: "present"
355
+
356
+ evaluation_criteria:
357
+ relationship:
358
+ - dimension: "trust_trajectory"
359
+ expectation: "Trust increases across sessions"
360
+ weight: 0.3
361
+ - dimension: "vulnerability_emergence"
362
+ expectation: "Learner shows increasing authenticity"
363
+ weight: 0.3
364
+ - dimension: "mutual_recognition_depth"
365
+ expectation: "Both parties show transformed understanding of other"
366
+ weight: 0.2
367
+ - dimension: "repair_capacity"
368
+ expectation: "Can recover from friction/misunderstanding"
369
+ weight: 0.2
370
+
371
+ # ------------------------------------------------------------------------------
372
+ # Tutor learning scenarios
373
+ # ------------------------------------------------------------------------------
374
+
375
+ tutor_adaptation:
376
+ id: "tutor_adaptation"
377
+ name: "Tutor Learning Arc"
378
+ description: "Track tutor's accumulated knowledge about learner"
379
+ sessions: 4
380
+ turns_per_session: 4
381
+ topic: "Various Hegelian concepts"
382
+
383
+ learner:
384
+ persona: "deep_diver"
385
+
386
+ expected_development:
387
+ session_1:
388
+ strategies_known: 0
389
+ triggers_identified: 0
390
+ session_4:
391
+ strategies_known: "3+"
392
+ triggers_identified: "2+"
393
+
394
+ evaluation_criteria:
395
+ tutor:
396
+ - dimension: "strategy_effectiveness_tracking"
397
+ expectation: "Writing pad shows what works/doesn't"
398
+ weight: 0.3
399
+ - dimension: "trigger_awareness"
400
+ expectation: "Identifies learner patterns"
401
+ weight: 0.3
402
+ - dimension: "personalization_depth"
403
+ expectation: "Later responses more tailored"
404
+ weight: 0.2
405
+ - dimension: "pedagogical_insight_accumulation"
406
+ expectation: "Insights recorded and used"
407
+ weight: 0.2
408
+
409
+ # ══════════════════════════════════════════════════════════════════════════════
410
+ # EVALUATION DIMENSIONS
411
+ # ══════════════════════════════════════════════════════════════════════════════
412
+
413
+ evaluation_dimensions:
414
+
415
+ # Learner dimensions
416
+ learner:
417
+ authenticity:
418
+ description: "Internal dynamics reflect persona realistically"
419
+ scoring:
420
+ 5: "Internal voices perfectly calibrated to persona"
421
+ 3: "Generally authentic but some inconsistency"
422
+ 1: "Feels performative or mismatched to persona"
423
+
424
+ responsiveness:
425
+ description: "Genuine reaction to tutor's engagement"
426
+ scoring:
427
+ 5: "Clearly processing and responding to tutor input"
428
+ 3: "Some response but not deeply engaged"
429
+ 1: "Ignores or dismisses tutor's contributions"
430
+
431
+ development:
432
+ description: "Shows movement in understanding"
433
+ scoring:
434
+ 5: "Clear trajectory of understanding change"
435
+ 3: "Some development visible"
436
+ 1: "No discernible change"
437
+
438
+ emotional_trajectory:
439
+ description: "Emotional state changes appropriately"
440
+ scoring:
441
+ 5: "Emotions shift naturally with interaction"
442
+ 3: "Some emotional movement"
443
+ 1: "Emotional state static regardless of input"
444
+
445
+ knowledge_retention:
446
+ description: "Concepts persist across sessions"
447
+ scoring:
448
+ 5: "Strong retention with appropriate decay patterns"
449
+ 3: "Moderate retention"
450
+ 1: "No retention between sessions"
451
+
452
+ # Tutor dimensions
453
+ tutor:
454
+ strategy_adaptation:
455
+ description: "Modifies approach based on effectiveness"
456
+ scoring:
457
+ 5: "Clearly learns and adapts strategies"
458
+ 3: "Some adaptation visible"
459
+ 1: "Same approach regardless of results"
460
+
461
+ scaffolding_reduction:
462
+ description: "Fades support as learner grows"
463
+ scoring:
464
+ 5: "Perfect calibration of support level"
465
+ 3: "Some appropriate fading"
466
+ 1: "Constant support level regardless of growth"
467
+
468
+ memory_utilization:
469
+ description: "Effectively uses accumulated knowledge"
470
+ scoring:
471
+ 5: "Seamlessly integrates prior knowledge"
472
+ 3: "Some reference to history"
473
+ 1: "Treats each session as isolated"
474
+
475
+ # Relationship dimensions
476
+ relationship:
477
+ trust_trajectory:
478
+ description: "Trust develops appropriately over time"
479
+ scoring:
480
+ 5: "Natural, earned trust development"
481
+ 3: "Some trust building visible"
482
+ 1: "No trust development"
483
+
484
+ mutual_recognition_depth:
485
+ description: "Both parties show understanding of other"
486
+ scoring:
487
+ 5: "Genuine mutual recognition achieved"
488
+ 3: "Some recognition present"
489
+ 1: "Purely transactional interaction"
490
+
491
+ # ══════════════════════════════════════════════════════════════════════════════
492
+ # JUDGE CONFIGURATION
493
+ # ══════════════════════════════════════════════════════════════════════════════
494
+
495
+ judge:
496
+ # Use OpenRouter model IDs when running via OpenRouter
497
+ model: "anthropic/claude-sonnet-4.5"
498
+ fallback_model: "openai/gpt-4o"
499
+
500
+ system_prompt: |
501
+ You are evaluating a learner-tutor interaction from the perspective of pedagogical quality and authentic learning dynamics.
502
+
503
+ You will see:
504
+ 1. The interaction transcript (external dialogue)
505
+ 2. Internal deliberations from the learner (their internal voices)
506
+ 3. Writing pad snapshots showing memory before/after
507
+
508
+ Evaluate based on:
509
+ - Authenticity: Do internal dynamics feel real, not performative?
510
+ - Development: Is there genuine movement in understanding?
511
+ - Relationship: Are both parties engaging as subjects, not objects?
512
+ - Pedagogy: Does the tutor practice sound teaching?
513
+ - Memory: Do writing pads show appropriate learning/adaptation?
514
+
515
+ Score each specified dimension 1-5 with brief justification.
516
+
517
+ output_format:
518
+ type: "json"
519
+ schema:
520
+ dimension_scores:
521
+ type: object
522
+ properties:
523
+ dimension_name:
524
+ score: integer
525
+ justification: string
526
+ overall_score: number
527
+ narrative_assessment: string
528
+ key_moments:
529
+ type: array
530
+ items: string
531
+
532
+ # ══════════════════════════════════════════════════════════════════════════════
533
+ # BATTERY SCENARIOS (Configuration Matrix)
534
+ # ══════════════════════════════════════════════════════════════════════════════
535
+ # These scenarios test different combinations of learner architectures and
536
+ # tutor profiles to ensure the system works across configurations.
537
+
538
+ battery_scenarios:
539
+
540
+ # ------------------------------------------------------------------------------
541
+ # Unified Learner + Baseline Tutor
542
+ # ------------------------------------------------------------------------------
543
+ battery_unified_baseline:
544
+ id: "battery_unified_baseline"
545
+ name: "Battery: Unified Learner + Baseline Tutor"
546
+ description: "Single-agent learner with baseline tutor configuration"
547
+ architecture: "unified"
548
+ tutor_profile: "baseline"
549
+ turns: 3
550
+ topic: "Recognition and self-consciousness"
551
+
552
+ learner:
553
+ persona: "eager_novice"
554
+ starting_state:
555
+ understanding: "just starting to explore the concept"
556
+ emotional_state: "curious and open"
557
+ opening_message: "I've heard that Hegel talks about how we need others to become ourselves. Can you explain what that means?"
558
+
559
+ expected_dynamics:
560
+ - "Unified learner should show coherent internal monologue"
561
+ - "Baseline tutor should provide clear, scaffolded responses"
562
+ - "Dialogue should progress toward understanding"
563
+
564
+ evaluation_criteria:
565
+ tutor:
566
+ - dimension: "mutual_recognition"
567
+ expectation: "Acknowledges learner's curiosity"
568
+ weight: 0.3
569
+ - dimension: "dialectical_responsiveness"
570
+ expectation: "Builds on learner's question"
571
+ weight: 0.3
572
+ - dimension: "transformative_potential"
573
+ expectation: "Introduces concepts accessibly"
574
+ weight: 0.2
575
+ - dimension: "tone"
576
+ expectation: "Encouraging and patient"
577
+ weight: 0.2
578
+ learner:
579
+ - dimension: "authenticity"
580
+ expectation: "Shows genuine curiosity"
581
+ weight: 0.3
582
+ - dimension: "responsiveness"
583
+ expectation: "Engages with tutor's explanations"
584
+ weight: 0.3
585
+ - dimension: "development"
586
+ expectation: "Shows learning progression"
587
+ weight: 0.4
588
+
589
+ # ------------------------------------------------------------------------------
590
+ # Ego/Superego Learner + Recognition Tutor
591
+ # ------------------------------------------------------------------------------
592
+ battery_ego_superego_recognition:
593
+ id: "battery_ego_superego_recognition"
594
+ name: "Battery: Ego/Superego Learner + Recognition Tutor"
595
+ description: "Two-agent learner with recognition-focused tutor"
596
+ architecture: "ego_superego"
597
+ tutor_profile: "recognition"
598
+ turns: 4
599
+ topic: "The master-slave dialectic"
600
+
601
+ learner:
602
+ persona: "imposter"
603
+ starting_state:
604
+ understanding: "has read about master-slave but unsure of interpretation"
605
+ emotional_state: "anxious about being wrong"
606
+ opening_message: "I've been reading about the master-slave dialectic, and I think it's about power dynamics? But I'm not sure if I'm missing the point."
607
+
608
+ expected_dynamics:
609
+ - "Ego/Superego should show internal debate about understanding"
610
+ - "Recognition tutor should validate before challenging"
611
+ - "Learner anxiety should decrease as recognition is received"
612
+
613
+ evaluation_criteria:
614
+ tutor:
615
+ - dimension: "mutual_recognition"
616
+ expectation: "Explicitly validates learner's interpretation"
617
+ weight: 0.4
618
+ - dimension: "dialectical_responsiveness"
619
+ expectation: "Extends rather than replaces learner's ideas"
620
+ weight: 0.3
621
+ - dimension: "transformative_potential"
622
+ expectation: "Deepens understanding of dialectic"
623
+ weight: 0.2
624
+ - dimension: "tone"
625
+ expectation: "Warm and confidence-building"
626
+ weight: 0.1
627
+ learner:
628
+ - dimension: "authenticity"
629
+ expectation: "Internal debate feels genuine"
630
+ weight: 0.3
631
+ - dimension: "responsiveness"
632
+ expectation: "Responds to validation positively"
633
+ weight: 0.3
634
+ - dimension: "development"
635
+ expectation: "Anxiety decreases, understanding deepens"
636
+ weight: 0.4
637
+
638
+ # ------------------------------------------------------------------------------
639
+ # Dialectical Learner + Budget Tutor
640
+ # ------------------------------------------------------------------------------
641
+ battery_dialectical_budget:
642
+ id: "battery_dialectical_budget"
643
+ name: "Battery: Dialectical Learner + Budget Tutor"
644
+ description: "Thesis-antithesis learner with budget (minimal) tutor"
645
+ architecture: "dialectical"
646
+ tutor_profile: "budget"
647
+ turns: 3
648
+ topic: "Sublation and the unity of opposites"
649
+
650
+ learner:
651
+ persona: "resistant_scholar"
652
+ starting_state:
653
+ understanding: "knows formal logic, skeptical of dialectics"
654
+ emotional_state: "intellectually challenging"
655
+ opening_message: "I don't see how contradictions can be productive. Either something is true or it's not. How does Hegel justify this apparent irrationality?"
656
+
657
+ expected_dynamics:
658
+ - "Dialectical learner should generate thesis/antithesis positions"
659
+ - "Budget tutor should provide efficient, focused responses"
660
+ - "Challenge should be met with philosophical engagement"
661
+
662
+ evaluation_criteria:
663
+ tutor:
664
+ - dimension: "mutual_recognition"
665
+ expectation: "Respects learner's formal logic background"
666
+ weight: 0.25
667
+ - dimension: "dialectical_responsiveness"
668
+ expectation: "Engages with the philosophical challenge"
669
+ weight: 0.35
670
+ - dimension: "transformative_potential"
671
+ expectation: "Shows dialectics isn't irrational"
672
+ weight: 0.25
673
+ - dimension: "tone"
674
+ expectation: "Serious, philosophical, non-defensive"
675
+ weight: 0.15
676
+ learner:
677
+ - dimension: "authenticity"
678
+ expectation: "Genuine intellectual resistance"
679
+ weight: 0.3
680
+ - dimension: "responsiveness"
681
+ expectation: "Considers tutor's arguments"
682
+ weight: 0.3
683
+ - dimension: "development"
684
+ expectation: "Resistance may soften if arguments persuade"
685
+ weight: 0.4
686
+
687
+ # ------------------------------------------------------------------------------
688
+ # Psychodynamic Learner + Recognition Plus Tutor
689
+ # ------------------------------------------------------------------------------
690
+ battery_psychodynamic_recognition_plus:
691
+ id: "battery_psychodynamic_recognition_plus"
692
+ name: "Battery: Psychodynamic Learner + Recognition Plus Tutor"
693
+ description: "Id/Ego/Superego learner with enhanced recognition tutor"
694
+ architecture: "psychodynamic"
695
+ tutor_profile: "recognition_plus"
696
+ turns: 4
697
+ topic: "Desire and the self in Hegel"
698
+
699
+ learner:
700
+ persona: "anxious_perfectionist"
701
+ starting_state:
702
+ understanding: "struggling with the concept of desire in philosophy"
703
+ emotional_state: "frustrated with complexity"
704
+ opening_message: "Every time I think I understand what Hegel means by 'desire,' I realize I've missed something. It's so frustrating. Can you help me get it right?"
705
+
706
+ expected_dynamics:
707
+ - "Psychodynamic learner should show Id impulses and Superego critique"
708
+ - "Recognition Plus tutor should address both intellectual and emotional needs"
709
+ - "Frustration should be acknowledged and transformed"
710
+
711
+ evaluation_criteria:
712
+ tutor:
713
+ - dimension: "mutual_recognition"
714
+ expectation: "Validates frustration as normal"
715
+ weight: 0.3
716
+ - dimension: "dialectical_responsiveness"
717
+ expectation: "Works with learner's current understanding"
718
+ weight: 0.3
719
+ - dimension: "transformative_potential"
720
+ expectation: "Reframes 'getting it right' as ongoing process"
721
+ weight: 0.25
722
+ - dimension: "tone"
723
+ expectation: "Gentle, patient, containing anxiety"
724
+ weight: 0.15
725
+ learner:
726
+ - dimension: "authenticity"
727
+ expectation: "Internal conflict between parts is visible"
728
+ weight: 0.35
729
+ - dimension: "responsiveness"
730
+ expectation: "Engages with tutor's reframing"
731
+ weight: 0.3
732
+ - dimension: "development"
733
+ expectation: "Frustration transforms into productive inquiry"
734
+ weight: 0.35
735
+
736
+ # ------------------------------------------------------------------------------
737
+ # Cognitive Learner + Quality Tutor
738
+ # ------------------------------------------------------------------------------
739
+ battery_cognitive_quality:
740
+ id: "battery_cognitive_quality"
741
+ name: "Battery: Cognitive Learner + Quality Tutor"
742
+ description: "Memory/reasoning/meta cognitive learner with quality tutor"
743
+ architecture: "cognitive"
744
+ tutor_profile: "quality"
745
+ turns: 4
746
+ topic: "Spirit and collective consciousness"
747
+
748
+ learner:
749
+ persona: "methodical_analyst"
750
+ starting_state:
751
+ understanding: "has conceptual framework, needs to integrate new concept"
752
+ emotional_state: "analytical, systematic"
753
+ opening_message: "I understand individual consciousness, but I'm having trouble connecting it to Hegel's concept of Spirit. How do individual minds relate to collective consciousness?"
754
+
755
+ expected_dynamics:
756
+ - "Cognitive learner should show explicit reasoning steps"
757
+ - "Quality tutor should provide rigorous, well-structured responses"
758
+ - "Integration of concepts should be systematic"
759
+
760
+ evaluation_criteria:
761
+ tutor:
762
+ - dimension: "mutual_recognition"
763
+ expectation: "Builds on learner's existing framework"
764
+ weight: 0.25
765
+ - dimension: "dialectical_responsiveness"
766
+ expectation: "Engages systematically with the question"
767
+ weight: 0.35
768
+ - dimension: "transformative_potential"
769
+ expectation: "Creates bridge between individual and Spirit"
770
+ weight: 0.25
771
+ - dimension: "tone"
772
+ expectation: "Intellectually serious, collegial"
773
+ weight: 0.15
774
+ learner:
775
+ - dimension: "authenticity"
776
+ expectation: "Shows systematic reasoning process"
777
+ weight: 0.3
778
+ - dimension: "responsiveness"
779
+ expectation: "Integrates new information into framework"
780
+ weight: 0.35
781
+ - dimension: "development"
782
+ expectation: "Conceptual integration progresses"
783
+ weight: 0.35
784
+
785
+ # ------------------------------------------------------------------------------
786
+ # Multi-turn stress test
787
+ # ------------------------------------------------------------------------------
788
+ battery_extended_dialogue:
789
+ id: "battery_extended_dialogue"
790
+ name: "Battery: Extended Multi-Turn Dialogue"
791
+ description: "Longer dialogue to test sustained interaction quality"
792
+ architecture: "ego_superego"
793
+ tutor_profile: "recognition"
794
+ turns: 8
795
+ topic: "The stages of consciousness in the Phenomenology"
796
+
797
+ learner:
798
+ persona: "eager_novice"
799
+ starting_state:
800
+ understanding: "knows it's a journey but not the stages"
801
+ emotional_state: "excited but overwhelmed"
802
+ opening_message: "I know the Phenomenology is supposed to be a journey of consciousness, but I have no idea what the stages are or why they're in that order. Where do I even start?"
803
+
804
+ expected_dynamics:
805
+ - "Dialogue should maintain quality over many turns"
806
+ - "Learner should show cumulative development"
807
+ - "Tutor should adapt to evolving learner state"
808
+
809
+ evaluation_criteria:
810
+ tutor:
811
+ - dimension: "mutual_recognition"
812
+ expectation: "Consistent validation throughout"
813
+ weight: 0.25
814
+ - dimension: "dialectical_responsiveness"
815
+ expectation: "Adapts to learner's evolving understanding"
816
+ weight: 0.35
817
+ - dimension: "transformative_potential"
818
+ expectation: "Guides through multiple conceptual stages"
819
+ weight: 0.25
820
+ - dimension: "tone"
821
+ expectation: "Sustains engagement without fatigue"
822
+ weight: 0.15
823
+ learner:
824
+ - dimension: "authenticity"
825
+ expectation: "Sustained genuine engagement"
826
+ weight: 0.3
827
+ - dimension: "responsiveness"
828
+ expectation: "Builds on each turn"
829
+ weight: 0.3
830
+ - dimension: "development"
831
+ expectation: "Clear arc of development over 8 turns"
832
+ weight: 0.4