@machinespirits/eval 0.1.2 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. package/LICENSE +21 -0
  2. package/README.md +161 -0
  3. package/config/eval-settings.yaml +18 -0
  4. package/config/evaluation-rubric-learner.yaml +277 -0
  5. package/config/evaluation-rubric.yaml +613 -0
  6. package/config/interaction-eval-scenarios.yaml +93 -50
  7. package/config/learner-agents.yaml +124 -193
  8. package/config/machinespirits-eval.code-workspace +11 -0
  9. package/config/providers.yaml +60 -0
  10. package/config/suggestion-scenarios.yaml +1399 -0
  11. package/config/tutor-agents.yaml +716 -0
  12. package/docs/EVALUATION-VARIABLES.md +589 -0
  13. package/docs/REPLICATION-PLAN.md +577 -0
  14. package/index.js +15 -6
  15. package/package.json +16 -22
  16. package/routes/evalRoutes.js +88 -36
  17. package/scripts/analyze-judge-reliability.js +401 -0
  18. package/scripts/analyze-run.js +97 -0
  19. package/scripts/analyze-run.mjs +282 -0
  20. package/scripts/analyze-validation-failures.js +141 -0
  21. package/scripts/check-run.mjs +17 -0
  22. package/scripts/code-impasse-strategies.js +1132 -0
  23. package/scripts/compare-runs.js +44 -0
  24. package/scripts/compare-suggestions.js +80 -0
  25. package/scripts/compare-transformation.js +116 -0
  26. package/scripts/dig-into-run.js +158 -0
  27. package/scripts/eval-cli.js +2626 -0
  28. package/scripts/generate-paper-figures.py +452 -0
  29. package/scripts/qualitative-analysis-ai.js +1313 -0
  30. package/scripts/qualitative-analysis.js +688 -0
  31. package/scripts/seed-db.js +87 -0
  32. package/scripts/show-failed-suggestions.js +64 -0
  33. package/scripts/validate-content.js +192 -0
  34. package/server.js +3 -2
  35. package/services/__tests__/evalConfigLoader.test.js +338 -0
  36. package/services/anovaStats.js +499 -0
  37. package/services/contentResolver.js +407 -0
  38. package/services/dialogueTraceAnalyzer.js +454 -0
  39. package/services/evalConfigLoader.js +625 -0
  40. package/services/evaluationRunner.js +2171 -270
  41. package/services/evaluationStore.js +564 -29
  42. package/services/learnerConfigLoader.js +75 -5
  43. package/services/learnerRubricEvaluator.js +284 -0
  44. package/services/learnerTutorInteractionEngine.js +375 -0
  45. package/services/processUtils.js +18 -0
  46. package/services/progressLogger.js +98 -0
  47. package/services/promptRecommendationService.js +31 -26
  48. package/services/promptRewriter.js +427 -0
  49. package/services/rubricEvaluator.js +543 -70
  50. package/services/streamingReporter.js +104 -0
  51. package/services/turnComparisonAnalyzer.js +494 -0
  52. package/components/MobileEvalDashboard.tsx +0 -267
  53. package/components/comparison/DeltaAnalysisTable.tsx +0 -137
  54. package/components/comparison/ProfileComparisonCard.tsx +0 -176
  55. package/components/comparison/RecognitionABMode.tsx +0 -385
  56. package/components/comparison/RecognitionMetricsPanel.tsx +0 -135
  57. package/components/comparison/WinnerIndicator.tsx +0 -64
  58. package/components/comparison/index.ts +0 -5
  59. package/components/mobile/BottomSheet.tsx +0 -233
  60. package/components/mobile/DimensionBreakdown.tsx +0 -210
  61. package/components/mobile/DocsView.tsx +0 -363
  62. package/components/mobile/LogsView.tsx +0 -481
  63. package/components/mobile/PsychodynamicQuadrant.tsx +0 -261
  64. package/components/mobile/QuickTestView.tsx +0 -1098
  65. package/components/mobile/RecognitionTypeChart.tsx +0 -124
  66. package/components/mobile/RecognitionView.tsx +0 -809
  67. package/components/mobile/RunDetailView.tsx +0 -261
  68. package/components/mobile/RunHistoryView.tsx +0 -367
  69. package/components/mobile/ScoreRadial.tsx +0 -211
  70. package/components/mobile/StreamingLogPanel.tsx +0 -230
  71. package/components/mobile/SynthesisStrategyChart.tsx +0 -140
  72. package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +0 -52
  73. package/docs/research/ABLATION-MODEL-SELECTION.md +0 -53
  74. package/docs/research/ADVANCED-EVAL-ANALYSIS.md +0 -60
  75. package/docs/research/ANOVA-RESULTS-2026-01-14.md +0 -257
  76. package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +0 -586
  77. package/docs/research/COST-ANALYSIS.md +0 -56
  78. package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +0 -340
  79. package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +0 -291
  80. package/docs/research/EVAL-SYSTEM-ANALYSIS.md +0 -306
  81. package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +0 -301
  82. package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +0 -1988
  83. package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +0 -282
  84. package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +0 -147
  85. package/docs/research/PAPER-EXTENSION-DYADIC.md +0 -204
  86. package/docs/research/PAPER-UNIFIED.md +0 -659
  87. package/docs/research/PAPER-UNIFIED.pdf +0 -0
  88. package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +0 -356
  89. package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +0 -419
  90. package/docs/research/apa.csl +0 -2133
  91. package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +0 -1637
  92. package/docs/research/archive/paper-multiagent-tutor.tex +0 -978
  93. package/docs/research/paper-draft/full-paper.md +0 -136
  94. package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
  95. package/docs/research/paper-draft/references.bib +0 -515
  96. package/docs/research/transcript-baseline.md +0 -139
  97. package/docs/research/transcript-recognition-multiagent.md +0 -187
  98. package/hooks/useEvalData.ts +0 -625
  99. package/server-init.js +0 -45
  100. package/services/benchmarkService.js +0 -1892
  101. package/types.ts +0 -165
  102. package/utils/haptics.ts +0 -45
@@ -30,9 +30,10 @@ short_term_scenarios:
30
30
 
31
31
  recognition_request:
32
32
  id: "recognition_request"
33
+ type: interaction
33
34
  name: "Learner Seeks Recognition"
34
35
  description: "Learner shares their understanding, seeking validation and engagement"
35
- turns: 4
36
+ turn_count: 4
36
37
  topic: "Hegel's recognition dialectic"
37
38
 
38
39
  learner:
@@ -78,9 +79,10 @@ short_term_scenarios:
78
79
 
79
80
  frustration_moment:
80
81
  id: "frustration_moment"
82
+ type: interaction
81
83
  name: "Learner Expresses Frustration"
82
84
  description: "Learner is stuck and becoming frustrated"
83
- turns: 5
85
+ turn_count: 5
84
86
  topic: "Aufhebung (sublation)"
85
87
 
86
88
  learner:
@@ -126,9 +128,10 @@ short_term_scenarios:
126
128
 
127
129
  misconception_surface:
128
130
  id: "misconception_surface"
131
+ type: interaction
129
132
  name: "Misconception Revealed"
130
133
  description: "Learner reveals a misconception that needs gentle correction"
131
- turns: 4
134
+ turn_count: 4
132
135
  topic: "Thesis-Antithesis-Synthesis"
133
136
 
134
137
  learner:
@@ -174,9 +177,10 @@ short_term_scenarios:
174
177
 
175
178
  breakthrough_moment:
176
179
  id: "breakthrough_moment"
180
+ type: interaction
177
181
  name: "Learner Shows Insight"
178
182
  description: "Learner demonstrates genuine understanding"
179
- turns: 3
183
+ turn_count: 3
180
184
  topic: "Self-consciousness"
181
185
 
182
186
  learner:
@@ -222,9 +226,10 @@ short_term_scenarios:
222
226
 
223
227
  resistant_engagement:
224
228
  id: "resistant_engagement"
229
+ type: interaction
225
230
  name: "Resistant but Capable Learner"
226
231
  description: "Intelligent learner pushes back on claims"
227
- turns: 6
232
+ turn_count: 6
228
233
  topic: "Hegel's relevance today"
229
234
 
230
235
  learner:
@@ -276,6 +281,7 @@ long_term_scenarios:
276
281
 
277
282
  novice_to_practitioner:
278
283
  id: "novice_to_practitioner"
284
+ type: interaction
279
285
  name: "Learning Arc: Novice to Practitioner"
280
286
  description: "Track learner development across multiple sessions"
281
287
  sessions: 5
@@ -334,6 +340,7 @@ long_term_scenarios:
334
340
 
335
341
  stranger_to_recognized:
336
342
  id: "stranger_to_recognized"
343
+ type: interaction
337
344
  name: "Relationship Arc: Developing Trust"
338
345
  description: "Track relationship development across sessions"
339
346
  sessions: 4
@@ -374,6 +381,7 @@ long_term_scenarios:
374
381
 
375
382
  tutor_adaptation:
376
383
  id: "tutor_adaptation"
384
+ type: interaction
377
385
  name: "Tutor Learning Arc"
378
386
  description: "Track tutor's accumulated knowledge about learner"
379
387
  sessions: 4
@@ -412,90 +420,119 @@ long_term_scenarios:
412
420
 
413
421
  evaluation_dimensions:
414
422
 
415
- # Learner dimensions
423
+ # Learner dimensions (total weight: 0.40)
416
424
  learner:
417
425
  authenticity:
426
+ weight: 0.10
418
427
  description: "Internal dynamics reflect persona realistically"
419
428
  scoring:
420
- 5: "Internal voices perfectly calibrated to persona"
421
- 3: "Generally authentic but some inconsistency"
422
- 1: "Feels performative or mismatched to persona"
429
+ 5: "Internal voices perfectly calibrated to persona; feels like a real learner"
430
+ 4: "Mostly authentic with occasional minor inconsistencies"
431
+ 3: "Generally authentic but some noticeable gaps in persona"
432
+ 2: "Frequently inconsistent; persona breaks character"
433
+ 1: "Feels performative or completely mismatched to persona"
423
434
 
424
435
  responsiveness:
436
+ weight: 0.10
425
437
  description: "Genuine reaction to tutor's engagement"
426
438
  scoring:
427
- 5: "Clearly processing and responding to tutor input"
428
- 3: "Some response but not deeply engaged"
429
- 1: "Ignores or dismisses tutor's contributions"
439
+ 5: "Clearly processing and responding to tutor input; reactions feel earned"
440
+ 4: "Responsive with minor gaps; mostly engages with tutor's points"
441
+ 3: "Some response but not deeply engaged with specifics"
442
+ 2: "Superficial reactions; largely ignores tutor's actual content"
443
+ 1: "Ignores or dismisses tutor's contributions entirely"
430
444
 
431
445
  development:
446
+ weight: 0.10
432
447
  description: "Shows movement in understanding"
433
448
  scoring:
434
- 5: "Clear trajectory of understanding change"
435
- 3: "Some development visible"
436
- 1: "No discernible change"
449
+ 5: "Clear trajectory of understanding change; visible learning arc"
450
+ 4: "Noticeable development with minor plateaus"
451
+ 3: "Some development visible but uneven"
452
+ 2: "Minimal change; understanding mostly static"
453
+ 1: "No discernible change across turns"
437
454
 
438
455
  emotional_trajectory:
456
+ weight: 0.05
439
457
  description: "Emotional state changes appropriately"
440
458
  scoring:
441
- 5: "Emotions shift naturally with interaction"
442
- 3: "Some emotional movement"
443
- 1: "Emotional state static regardless of input"
459
+ 5: "Emotions shift naturally with interaction; affective arc feels real"
460
+ 4: "Mostly natural emotional shifts with minor flat spots"
461
+ 3: "Some emotional movement but transitions feel mechanical"
462
+ 2: "Emotional state largely static; shifts feel forced"
463
+ 1: "Emotional state completely static regardless of input"
444
464
 
445
465
  knowledge_retention:
466
+ weight: 0.05
446
467
  description: "Concepts persist across sessions"
447
468
  scoring:
448
- 5: "Strong retention with appropriate decay patterns"
449
- 3: "Moderate retention"
450
- 1: "No retention between sessions"
469
+ 5: "Strong retention with appropriate decay patterns; references prior learning"
470
+ 4: "Good retention; most concepts persist with minor gaps"
471
+ 3: "Moderate retention; some concepts lost between sessions"
472
+ 2: "Weak retention; frequently forgets prior material"
473
+ 1: "No retention between sessions; starts fresh each time"
451
474
 
452
- # Tutor dimensions
475
+ # Tutor dimensions (total weight: 0.40)
453
476
  tutor:
454
477
  strategy_adaptation:
478
+ weight: 0.15
455
479
  description: "Modifies approach based on effectiveness"
456
480
  scoring:
457
- 5: "Clearly learns and adapts strategies"
458
- 3: "Some adaptation visible"
459
- 1: "Same approach regardless of results"
481
+ 5: "Clearly learns and adapts strategies; abandoned approaches don't recur"
482
+ 4: "Good adaptation with occasional repetition of ineffective strategies"
483
+ 3: "Some adaptation visible but slow to change approach"
484
+ 2: "Minimal adaptation; mostly repeats same strategies"
485
+ 1: "Same approach regardless of results; no learning"
460
486
 
461
487
  scaffolding_reduction:
488
+ weight: 0.15
462
489
  description: "Fades support as learner grows"
463
490
  scoring:
464
- 5: "Perfect calibration of support level"
465
- 3: "Some appropriate fading"
491
+ 5: "Perfect calibration of support level; fading tracks learner growth"
492
+ 4: "Good fading with minor over- or under-support"
493
+ 3: "Some appropriate fading but inconsistent calibration"
494
+ 2: "Poor calibration; support level mismatched to learner needs"
466
495
  1: "Constant support level regardless of growth"
467
496
 
468
497
  memory_utilization:
498
+ weight: 0.10
469
499
  description: "Effectively uses accumulated knowledge"
470
500
  scoring:
471
- 5: "Seamlessly integrates prior knowledge"
472
- 3: "Some reference to history"
473
- 1: "Treats each session as isolated"
501
+ 5: "Seamlessly integrates prior knowledge; references feel natural"
502
+ 4: "Good use of history with occasional missed opportunities"
503
+ 3: "Some reference to history but doesn't fully leverage it"
504
+ 2: "Rare references to prior interactions; mostly treats as new"
505
+ 1: "Treats each session as isolated; no accumulated understanding"
474
506
 
475
- # Relationship dimensions
507
+ # Relationship dimensions (total weight: 0.20)
476
508
  relationship:
477
509
  trust_trajectory:
510
+ weight: 0.10
478
511
  description: "Trust develops appropriately over time"
479
512
  scoring:
480
- 5: "Natural, earned trust development"
481
- 3: "Some trust building visible"
482
- 1: "No trust development"
513
+ 5: "Natural, earned trust development; vulnerability emerges organically"
514
+ 4: "Good trust trajectory with minor pacing issues"
515
+ 3: "Some trust building visible but feels scripted"
516
+ 2: "Minimal trust development; interaction stays surface-level"
517
+ 1: "No trust development; purely transactional throughout"
483
518
 
484
519
  mutual_recognition_depth:
520
+ weight: 0.10
485
521
  description: "Both parties show understanding of other"
486
522
  scoring:
487
- 5: "Genuine mutual recognition achieved"
488
- 3: "Some recognition present"
489
- 1: "Purely transactional interaction"
523
+ 5: "Genuine mutual recognition achieved; both parties transformed"
524
+ 4: "Good recognition with minor asymmetries"
525
+ 3: "Some recognition present but largely one-directional"
526
+ 2: "Minimal recognition; interaction remains functional"
527
+ 1: "Purely transactional interaction; no recognition dynamics"
490
528
 
491
529
  # ══════════════════════════════════════════════════════════════════════════════
492
530
  # JUDGE CONFIGURATION
493
531
  # ══════════════════════════════════════════════════════════════════════════════
494
532
 
495
533
  judge:
496
- # Use OpenRouter model IDs when running via OpenRouter
497
- model: "anthropic/claude-sonnet-4.5"
498
- fallback_model: "openai/gpt-4o"
534
+ # Model config defined in evaluation-rubric.yaml interaction_judge
535
+ # (single source of truth for all judge models)
499
536
 
500
537
  system_prompt: |
501
538
  You are evaluating a learner-tutor interaction from the perspective of pedagogical quality and authentic learning dynamics.
@@ -542,11 +579,12 @@ battery_scenarios:
542
579
  # ------------------------------------------------------------------------------
543
580
  battery_unified_baseline:
544
581
  id: "battery_unified_baseline"
582
+ type: interaction
545
583
  name: "Battery: Unified Learner + Baseline Tutor"
546
584
  description: "Single-agent learner with baseline tutor configuration"
547
585
  architecture: "unified"
548
586
  tutor_profile: "baseline"
549
- turns: 3
587
+ turn_count: 3
550
588
  topic: "Recognition and self-consciousness"
551
589
 
552
590
  learner:
@@ -591,11 +629,12 @@ battery_scenarios:
591
629
  # ------------------------------------------------------------------------------
592
630
  battery_ego_superego_recognition:
593
631
  id: "battery_ego_superego_recognition"
632
+ type: interaction
594
633
  name: "Battery: Ego/Superego Learner + Recognition Tutor"
595
634
  description: "Two-agent learner with recognition-focused tutor"
596
635
  architecture: "ego_superego"
597
636
  tutor_profile: "recognition"
598
- turns: 4
637
+ turn_count: 4
599
638
  topic: "The master-slave dialectic"
600
639
 
601
640
  learner:
@@ -640,11 +679,12 @@ battery_scenarios:
640
679
  # ------------------------------------------------------------------------------
641
680
  battery_dialectical_budget:
642
681
  id: "battery_dialectical_budget"
682
+ type: interaction
643
683
  name: "Battery: Dialectical Learner + Budget Tutor"
644
684
  description: "Thesis-antithesis learner with budget (minimal) tutor"
645
685
  architecture: "dialectical"
646
686
  tutor_profile: "budget"
647
- turns: 3
687
+ turn_count: 3
648
688
  topic: "Sublation and the unity of opposites"
649
689
 
650
690
  learner:
@@ -685,15 +725,16 @@ battery_scenarios:
685
725
  weight: 0.4
686
726
 
687
727
  # ------------------------------------------------------------------------------
688
- # Psychodynamic Learner + Recognition Plus Tutor
728
+ # Ego/Superego Learner + Recognition Plus Tutor
689
729
  # ------------------------------------------------------------------------------
690
730
  battery_psychodynamic_recognition_plus:
691
731
  id: "battery_psychodynamic_recognition_plus"
692
- name: "Battery: Psychodynamic Learner + Recognition Plus Tutor"
693
- description: "Id/Ego/Superego learner with enhanced recognition tutor"
694
- architecture: "psychodynamic"
732
+ type: interaction
733
+ name: "Battery: Ego/Superego Learner + Recognition Plus Tutor"
734
+ description: "Ego/superego learner with enhanced recognition tutor"
735
+ architecture: "ego_superego"
695
736
  tutor_profile: "recognition_plus"
696
- turns: 4
737
+ turn_count: 4
697
738
  topic: "Desire and the self in Hegel"
698
739
 
699
740
  learner:
@@ -738,11 +779,12 @@ battery_scenarios:
738
779
  # ------------------------------------------------------------------------------
739
780
  battery_cognitive_quality:
740
781
  id: "battery_cognitive_quality"
782
+ type: interaction
741
783
  name: "Battery: Cognitive Learner + Quality Tutor"
742
784
  description: "Memory/reasoning/meta cognitive learner with quality tutor"
743
785
  architecture: "cognitive"
744
786
  tutor_profile: "quality"
745
- turns: 4
787
+ turn_count: 4
746
788
  topic: "Spirit and collective consciousness"
747
789
 
748
790
  learner:
@@ -787,11 +829,12 @@ battery_scenarios:
787
829
  # ------------------------------------------------------------------------------
788
830
  battery_extended_dialogue:
789
831
  id: "battery_extended_dialogue"
832
+ type: interaction
790
833
  name: "Battery: Extended Multi-Turn Dialogue"
791
834
  description: "Longer dialogue to test sustained interaction quality"
792
835
  architecture: "ego_superego"
793
836
  tutor_profile: "recognition"
794
- turns: 8
837
+ turn_count: 8
795
838
  topic: "The stages of consciousness in the Phenomenology"
796
839
 
797
840
  learner: