@machinespirits/eval 0.1.2 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +161 -0
- package/config/eval-settings.yaml +18 -0
- package/config/evaluation-rubric-learner.yaml +277 -0
- package/config/evaluation-rubric.yaml +613 -0
- package/config/interaction-eval-scenarios.yaml +93 -50
- package/config/learner-agents.yaml +124 -193
- package/config/machinespirits-eval.code-workspace +11 -0
- package/config/providers.yaml +60 -0
- package/config/suggestion-scenarios.yaml +1399 -0
- package/config/tutor-agents.yaml +716 -0
- package/docs/EVALUATION-VARIABLES.md +589 -0
- package/docs/REPLICATION-PLAN.md +577 -0
- package/index.js +15 -6
- package/package.json +16 -22
- package/routes/evalRoutes.js +88 -36
- package/scripts/analyze-judge-reliability.js +401 -0
- package/scripts/analyze-run.js +97 -0
- package/scripts/analyze-run.mjs +282 -0
- package/scripts/analyze-validation-failures.js +141 -0
- package/scripts/check-run.mjs +17 -0
- package/scripts/code-impasse-strategies.js +1132 -0
- package/scripts/compare-runs.js +44 -0
- package/scripts/compare-suggestions.js +80 -0
- package/scripts/compare-transformation.js +116 -0
- package/scripts/dig-into-run.js +158 -0
- package/scripts/eval-cli.js +2626 -0
- package/scripts/generate-paper-figures.py +452 -0
- package/scripts/qualitative-analysis-ai.js +1313 -0
- package/scripts/qualitative-analysis.js +688 -0
- package/scripts/seed-db.js +87 -0
- package/scripts/show-failed-suggestions.js +64 -0
- package/scripts/validate-content.js +192 -0
- package/server.js +3 -2
- package/services/__tests__/evalConfigLoader.test.js +338 -0
- package/services/anovaStats.js +499 -0
- package/services/contentResolver.js +407 -0
- package/services/dialogueTraceAnalyzer.js +454 -0
- package/services/evalConfigLoader.js +625 -0
- package/services/evaluationRunner.js +2171 -270
- package/services/evaluationStore.js +564 -29
- package/services/learnerConfigLoader.js +75 -5
- package/services/learnerRubricEvaluator.js +284 -0
- package/services/learnerTutorInteractionEngine.js +375 -0
- package/services/processUtils.js +18 -0
- package/services/progressLogger.js +98 -0
- package/services/promptRecommendationService.js +31 -26
- package/services/promptRewriter.js +427 -0
- package/services/rubricEvaluator.js +543 -70
- package/services/streamingReporter.js +104 -0
- package/services/turnComparisonAnalyzer.js +494 -0
- package/components/MobileEvalDashboard.tsx +0 -267
- package/components/comparison/DeltaAnalysisTable.tsx +0 -137
- package/components/comparison/ProfileComparisonCard.tsx +0 -176
- package/components/comparison/RecognitionABMode.tsx +0 -385
- package/components/comparison/RecognitionMetricsPanel.tsx +0 -135
- package/components/comparison/WinnerIndicator.tsx +0 -64
- package/components/comparison/index.ts +0 -5
- package/components/mobile/BottomSheet.tsx +0 -233
- package/components/mobile/DimensionBreakdown.tsx +0 -210
- package/components/mobile/DocsView.tsx +0 -363
- package/components/mobile/LogsView.tsx +0 -481
- package/components/mobile/PsychodynamicQuadrant.tsx +0 -261
- package/components/mobile/QuickTestView.tsx +0 -1098
- package/components/mobile/RecognitionTypeChart.tsx +0 -124
- package/components/mobile/RecognitionView.tsx +0 -809
- package/components/mobile/RunDetailView.tsx +0 -261
- package/components/mobile/RunHistoryView.tsx +0 -367
- package/components/mobile/ScoreRadial.tsx +0 -211
- package/components/mobile/StreamingLogPanel.tsx +0 -230
- package/components/mobile/SynthesisStrategyChart.tsx +0 -140
- package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +0 -52
- package/docs/research/ABLATION-MODEL-SELECTION.md +0 -53
- package/docs/research/ADVANCED-EVAL-ANALYSIS.md +0 -60
- package/docs/research/ANOVA-RESULTS-2026-01-14.md +0 -257
- package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +0 -586
- package/docs/research/COST-ANALYSIS.md +0 -56
- package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +0 -340
- package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +0 -291
- package/docs/research/EVAL-SYSTEM-ANALYSIS.md +0 -306
- package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +0 -301
- package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +0 -1988
- package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +0 -282
- package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +0 -147
- package/docs/research/PAPER-EXTENSION-DYADIC.md +0 -204
- package/docs/research/PAPER-UNIFIED.md +0 -659
- package/docs/research/PAPER-UNIFIED.pdf +0 -0
- package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +0 -356
- package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +0 -419
- package/docs/research/apa.csl +0 -2133
- package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +0 -1637
- package/docs/research/archive/paper-multiagent-tutor.tex +0 -978
- package/docs/research/paper-draft/full-paper.md +0 -136
- package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
- package/docs/research/paper-draft/references.bib +0 -515
- package/docs/research/transcript-baseline.md +0 -139
- package/docs/research/transcript-recognition-multiagent.md +0 -187
- package/hooks/useEvalData.ts +0 -625
- package/server-init.js +0 -45
- package/services/benchmarkService.js +0 -1892
- package/types.ts +0 -165
- package/utils/haptics.ts +0 -45
|
@@ -30,9 +30,10 @@ short_term_scenarios:
|
|
|
30
30
|
|
|
31
31
|
recognition_request:
|
|
32
32
|
id: "recognition_request"
|
|
33
|
+
type: interaction
|
|
33
34
|
name: "Learner Seeks Recognition"
|
|
34
35
|
description: "Learner shares their understanding, seeking validation and engagement"
|
|
35
|
-
|
|
36
|
+
turn_count: 4
|
|
36
37
|
topic: "Hegel's recognition dialectic"
|
|
37
38
|
|
|
38
39
|
learner:
|
|
@@ -78,9 +79,10 @@ short_term_scenarios:
|
|
|
78
79
|
|
|
79
80
|
frustration_moment:
|
|
80
81
|
id: "frustration_moment"
|
|
82
|
+
type: interaction
|
|
81
83
|
name: "Learner Expresses Frustration"
|
|
82
84
|
description: "Learner is stuck and becoming frustrated"
|
|
83
|
-
|
|
85
|
+
turn_count: 5
|
|
84
86
|
topic: "Aufhebung (sublation)"
|
|
85
87
|
|
|
86
88
|
learner:
|
|
@@ -126,9 +128,10 @@ short_term_scenarios:
|
|
|
126
128
|
|
|
127
129
|
misconception_surface:
|
|
128
130
|
id: "misconception_surface"
|
|
131
|
+
type: interaction
|
|
129
132
|
name: "Misconception Revealed"
|
|
130
133
|
description: "Learner reveals a misconception that needs gentle correction"
|
|
131
|
-
|
|
134
|
+
turn_count: 4
|
|
132
135
|
topic: "Thesis-Antithesis-Synthesis"
|
|
133
136
|
|
|
134
137
|
learner:
|
|
@@ -174,9 +177,10 @@ short_term_scenarios:
|
|
|
174
177
|
|
|
175
178
|
breakthrough_moment:
|
|
176
179
|
id: "breakthrough_moment"
|
|
180
|
+
type: interaction
|
|
177
181
|
name: "Learner Shows Insight"
|
|
178
182
|
description: "Learner demonstrates genuine understanding"
|
|
179
|
-
|
|
183
|
+
turn_count: 3
|
|
180
184
|
topic: "Self-consciousness"
|
|
181
185
|
|
|
182
186
|
learner:
|
|
@@ -222,9 +226,10 @@ short_term_scenarios:
|
|
|
222
226
|
|
|
223
227
|
resistant_engagement:
|
|
224
228
|
id: "resistant_engagement"
|
|
229
|
+
type: interaction
|
|
225
230
|
name: "Resistant but Capable Learner"
|
|
226
231
|
description: "Intelligent learner pushes back on claims"
|
|
227
|
-
|
|
232
|
+
turn_count: 6
|
|
228
233
|
topic: "Hegel's relevance today"
|
|
229
234
|
|
|
230
235
|
learner:
|
|
@@ -276,6 +281,7 @@ long_term_scenarios:
|
|
|
276
281
|
|
|
277
282
|
novice_to_practitioner:
|
|
278
283
|
id: "novice_to_practitioner"
|
|
284
|
+
type: interaction
|
|
279
285
|
name: "Learning Arc: Novice to Practitioner"
|
|
280
286
|
description: "Track learner development across multiple sessions"
|
|
281
287
|
sessions: 5
|
|
@@ -334,6 +340,7 @@ long_term_scenarios:
|
|
|
334
340
|
|
|
335
341
|
stranger_to_recognized:
|
|
336
342
|
id: "stranger_to_recognized"
|
|
343
|
+
type: interaction
|
|
337
344
|
name: "Relationship Arc: Developing Trust"
|
|
338
345
|
description: "Track relationship development across sessions"
|
|
339
346
|
sessions: 4
|
|
@@ -374,6 +381,7 @@ long_term_scenarios:
|
|
|
374
381
|
|
|
375
382
|
tutor_adaptation:
|
|
376
383
|
id: "tutor_adaptation"
|
|
384
|
+
type: interaction
|
|
377
385
|
name: "Tutor Learning Arc"
|
|
378
386
|
description: "Track tutor's accumulated knowledge about learner"
|
|
379
387
|
sessions: 4
|
|
@@ -412,90 +420,119 @@ long_term_scenarios:
|
|
|
412
420
|
|
|
413
421
|
evaluation_dimensions:
|
|
414
422
|
|
|
415
|
-
# Learner dimensions
|
|
423
|
+
# Learner dimensions (total weight: 0.40)
|
|
416
424
|
learner:
|
|
417
425
|
authenticity:
|
|
426
|
+
weight: 0.10
|
|
418
427
|
description: "Internal dynamics reflect persona realistically"
|
|
419
428
|
scoring:
|
|
420
|
-
5: "Internal voices perfectly calibrated to persona"
|
|
421
|
-
|
|
422
|
-
|
|
429
|
+
5: "Internal voices perfectly calibrated to persona; feels like a real learner"
|
|
430
|
+
4: "Mostly authentic with occasional minor inconsistencies"
|
|
431
|
+
3: "Generally authentic but some noticeable gaps in persona"
|
|
432
|
+
2: "Frequently inconsistent; persona breaks character"
|
|
433
|
+
1: "Feels performative or completely mismatched to persona"
|
|
423
434
|
|
|
424
435
|
responsiveness:
|
|
436
|
+
weight: 0.10
|
|
425
437
|
description: "Genuine reaction to tutor's engagement"
|
|
426
438
|
scoring:
|
|
427
|
-
5: "Clearly processing and responding to tutor input"
|
|
428
|
-
|
|
429
|
-
|
|
439
|
+
5: "Clearly processing and responding to tutor input; reactions feel earned"
|
|
440
|
+
4: "Responsive with minor gaps; mostly engages with tutor's points"
|
|
441
|
+
3: "Some response but not deeply engaged with specifics"
|
|
442
|
+
2: "Superficial reactions; largely ignores tutor's actual content"
|
|
443
|
+
1: "Ignores or dismisses tutor's contributions entirely"
|
|
430
444
|
|
|
431
445
|
development:
|
|
446
|
+
weight: 0.10
|
|
432
447
|
description: "Shows movement in understanding"
|
|
433
448
|
scoring:
|
|
434
|
-
5: "Clear trajectory of understanding change"
|
|
435
|
-
|
|
436
|
-
|
|
449
|
+
5: "Clear trajectory of understanding change; visible learning arc"
|
|
450
|
+
4: "Noticeable development with minor plateaus"
|
|
451
|
+
3: "Some development visible but uneven"
|
|
452
|
+
2: "Minimal change; understanding mostly static"
|
|
453
|
+
1: "No discernible change across turns"
|
|
437
454
|
|
|
438
455
|
emotional_trajectory:
|
|
456
|
+
weight: 0.05
|
|
439
457
|
description: "Emotional state changes appropriately"
|
|
440
458
|
scoring:
|
|
441
|
-
5: "Emotions shift naturally with interaction"
|
|
442
|
-
|
|
443
|
-
|
|
459
|
+
5: "Emotions shift naturally with interaction; affective arc feels real"
|
|
460
|
+
4: "Mostly natural emotional shifts with minor flat spots"
|
|
461
|
+
3: "Some emotional movement but transitions feel mechanical"
|
|
462
|
+
2: "Emotional state largely static; shifts feel forced"
|
|
463
|
+
1: "Emotional state completely static regardless of input"
|
|
444
464
|
|
|
445
465
|
knowledge_retention:
|
|
466
|
+
weight: 0.05
|
|
446
467
|
description: "Concepts persist across sessions"
|
|
447
468
|
scoring:
|
|
448
|
-
5: "Strong retention with appropriate decay patterns"
|
|
449
|
-
|
|
450
|
-
|
|
469
|
+
5: "Strong retention with appropriate decay patterns; references prior learning"
|
|
470
|
+
4: "Good retention; most concepts persist with minor gaps"
|
|
471
|
+
3: "Moderate retention; some concepts lost between sessions"
|
|
472
|
+
2: "Weak retention; frequently forgets prior material"
|
|
473
|
+
1: "No retention between sessions; starts fresh each time"
|
|
451
474
|
|
|
452
|
-
# Tutor dimensions
|
|
475
|
+
# Tutor dimensions (total weight: 0.40)
|
|
453
476
|
tutor:
|
|
454
477
|
strategy_adaptation:
|
|
478
|
+
weight: 0.15
|
|
455
479
|
description: "Modifies approach based on effectiveness"
|
|
456
480
|
scoring:
|
|
457
|
-
5: "Clearly learns and adapts strategies"
|
|
458
|
-
|
|
459
|
-
|
|
481
|
+
5: "Clearly learns and adapts strategies; abandoned approaches don't recur"
|
|
482
|
+
4: "Good adaptation with occasional repetition of ineffective strategies"
|
|
483
|
+
3: "Some adaptation visible but slow to change approach"
|
|
484
|
+
2: "Minimal adaptation; mostly repeats same strategies"
|
|
485
|
+
1: "Same approach regardless of results; no learning"
|
|
460
486
|
|
|
461
487
|
scaffolding_reduction:
|
|
488
|
+
weight: 0.15
|
|
462
489
|
description: "Fades support as learner grows"
|
|
463
490
|
scoring:
|
|
464
|
-
5: "Perfect calibration of support level"
|
|
465
|
-
|
|
491
|
+
5: "Perfect calibration of support level; fading tracks learner growth"
|
|
492
|
+
4: "Good fading with minor over- or under-support"
|
|
493
|
+
3: "Some appropriate fading but inconsistent calibration"
|
|
494
|
+
2: "Poor calibration; support level mismatched to learner needs"
|
|
466
495
|
1: "Constant support level regardless of growth"
|
|
467
496
|
|
|
468
497
|
memory_utilization:
|
|
498
|
+
weight: 0.10
|
|
469
499
|
description: "Effectively uses accumulated knowledge"
|
|
470
500
|
scoring:
|
|
471
|
-
5: "Seamlessly integrates prior knowledge"
|
|
472
|
-
|
|
473
|
-
|
|
501
|
+
5: "Seamlessly integrates prior knowledge; references feel natural"
|
|
502
|
+
4: "Good use of history with occasional missed opportunities"
|
|
503
|
+
3: "Some reference to history but doesn't fully leverage it"
|
|
504
|
+
2: "Rare references to prior interactions; mostly treats as new"
|
|
505
|
+
1: "Treats each session as isolated; no accumulated understanding"
|
|
474
506
|
|
|
475
|
-
# Relationship dimensions
|
|
507
|
+
# Relationship dimensions (total weight: 0.20)
|
|
476
508
|
relationship:
|
|
477
509
|
trust_trajectory:
|
|
510
|
+
weight: 0.10
|
|
478
511
|
description: "Trust develops appropriately over time"
|
|
479
512
|
scoring:
|
|
480
|
-
5: "Natural, earned trust development"
|
|
481
|
-
|
|
482
|
-
|
|
513
|
+
5: "Natural, earned trust development; vulnerability emerges organically"
|
|
514
|
+
4: "Good trust trajectory with minor pacing issues"
|
|
515
|
+
3: "Some trust building visible but feels scripted"
|
|
516
|
+
2: "Minimal trust development; interaction stays surface-level"
|
|
517
|
+
1: "No trust development; purely transactional throughout"
|
|
483
518
|
|
|
484
519
|
mutual_recognition_depth:
|
|
520
|
+
weight: 0.10
|
|
485
521
|
description: "Both parties show understanding of other"
|
|
486
522
|
scoring:
|
|
487
|
-
5: "Genuine mutual recognition achieved"
|
|
488
|
-
|
|
489
|
-
|
|
523
|
+
5: "Genuine mutual recognition achieved; both parties transformed"
|
|
524
|
+
4: "Good recognition with minor asymmetries"
|
|
525
|
+
3: "Some recognition present but largely one-directional"
|
|
526
|
+
2: "Minimal recognition; interaction remains functional"
|
|
527
|
+
1: "Purely transactional interaction; no recognition dynamics"
|
|
490
528
|
|
|
491
529
|
# ══════════════════════════════════════════════════════════════════════════════
|
|
492
530
|
# JUDGE CONFIGURATION
|
|
493
531
|
# ══════════════════════════════════════════════════════════════════════════════
|
|
494
532
|
|
|
495
533
|
judge:
|
|
496
|
-
#
|
|
497
|
-
|
|
498
|
-
fallback_model: "openai/gpt-4o"
|
|
534
|
+
# Model config defined in evaluation-rubric.yaml → interaction_judge
|
|
535
|
+
# (single source of truth for all judge models)
|
|
499
536
|
|
|
500
537
|
system_prompt: |
|
|
501
538
|
You are evaluating a learner-tutor interaction from the perspective of pedagogical quality and authentic learning dynamics.
|
|
@@ -542,11 +579,12 @@ battery_scenarios:
|
|
|
542
579
|
# ------------------------------------------------------------------------------
|
|
543
580
|
battery_unified_baseline:
|
|
544
581
|
id: "battery_unified_baseline"
|
|
582
|
+
type: interaction
|
|
545
583
|
name: "Battery: Unified Learner + Baseline Tutor"
|
|
546
584
|
description: "Single-agent learner with baseline tutor configuration"
|
|
547
585
|
architecture: "unified"
|
|
548
586
|
tutor_profile: "baseline"
|
|
549
|
-
|
|
587
|
+
turn_count: 3
|
|
550
588
|
topic: "Recognition and self-consciousness"
|
|
551
589
|
|
|
552
590
|
learner:
|
|
@@ -591,11 +629,12 @@ battery_scenarios:
|
|
|
591
629
|
# ------------------------------------------------------------------------------
|
|
592
630
|
battery_ego_superego_recognition:
|
|
593
631
|
id: "battery_ego_superego_recognition"
|
|
632
|
+
type: interaction
|
|
594
633
|
name: "Battery: Ego/Superego Learner + Recognition Tutor"
|
|
595
634
|
description: "Two-agent learner with recognition-focused tutor"
|
|
596
635
|
architecture: "ego_superego"
|
|
597
636
|
tutor_profile: "recognition"
|
|
598
|
-
|
|
637
|
+
turn_count: 4
|
|
599
638
|
topic: "The master-slave dialectic"
|
|
600
639
|
|
|
601
640
|
learner:
|
|
@@ -640,11 +679,12 @@ battery_scenarios:
|
|
|
640
679
|
# ------------------------------------------------------------------------------
|
|
641
680
|
battery_dialectical_budget:
|
|
642
681
|
id: "battery_dialectical_budget"
|
|
682
|
+
type: interaction
|
|
643
683
|
name: "Battery: Dialectical Learner + Budget Tutor"
|
|
644
684
|
description: "Thesis-antithesis learner with budget (minimal) tutor"
|
|
645
685
|
architecture: "dialectical"
|
|
646
686
|
tutor_profile: "budget"
|
|
647
|
-
|
|
687
|
+
turn_count: 3
|
|
648
688
|
topic: "Sublation and the unity of opposites"
|
|
649
689
|
|
|
650
690
|
learner:
|
|
@@ -685,15 +725,16 @@ battery_scenarios:
|
|
|
685
725
|
weight: 0.4
|
|
686
726
|
|
|
687
727
|
# ------------------------------------------------------------------------------
|
|
688
|
-
#
|
|
728
|
+
# Ego/Superego Learner + Recognition Plus Tutor
|
|
689
729
|
# ------------------------------------------------------------------------------
|
|
690
730
|
battery_psychodynamic_recognition_plus:
|
|
691
731
|
id: "battery_psychodynamic_recognition_plus"
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
732
|
+
type: interaction
|
|
733
|
+
name: "Battery: Ego/Superego Learner + Recognition Plus Tutor"
|
|
734
|
+
description: "Ego/superego learner with enhanced recognition tutor"
|
|
735
|
+
architecture: "ego_superego"
|
|
695
736
|
tutor_profile: "recognition_plus"
|
|
696
|
-
|
|
737
|
+
turn_count: 4
|
|
697
738
|
topic: "Desire and the self in Hegel"
|
|
698
739
|
|
|
699
740
|
learner:
|
|
@@ -738,11 +779,12 @@ battery_scenarios:
|
|
|
738
779
|
# ------------------------------------------------------------------------------
|
|
739
780
|
battery_cognitive_quality:
|
|
740
781
|
id: "battery_cognitive_quality"
|
|
782
|
+
type: interaction
|
|
741
783
|
name: "Battery: Cognitive Learner + Quality Tutor"
|
|
742
784
|
description: "Memory/reasoning/meta cognitive learner with quality tutor"
|
|
743
785
|
architecture: "cognitive"
|
|
744
786
|
tutor_profile: "quality"
|
|
745
|
-
|
|
787
|
+
turn_count: 4
|
|
746
788
|
topic: "Spirit and collective consciousness"
|
|
747
789
|
|
|
748
790
|
learner:
|
|
@@ -787,11 +829,12 @@ battery_scenarios:
|
|
|
787
829
|
# ------------------------------------------------------------------------------
|
|
788
830
|
battery_extended_dialogue:
|
|
789
831
|
id: "battery_extended_dialogue"
|
|
832
|
+
type: interaction
|
|
790
833
|
name: "Battery: Extended Multi-Turn Dialogue"
|
|
791
834
|
description: "Longer dialogue to test sustained interaction quality"
|
|
792
835
|
architecture: "ego_superego"
|
|
793
836
|
tutor_profile: "recognition"
|
|
794
|
-
|
|
837
|
+
turn_count: 8
|
|
795
838
|
topic: "The stages of consciousness in the Phenomenology"
|
|
796
839
|
|
|
797
840
|
learner:
|