@machinespirits/eval 0.1.2 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +161 -0
  3. package/config/eval-settings.yaml +18 -0
  4. package/config/evaluation-rubric-learner.yaml +277 -0
  5. package/config/evaluation-rubric.yaml +613 -0
  6. package/config/interaction-eval-scenarios.yaml +93 -50
  7. package/config/learner-agents.yaml +124 -193
  8. package/config/machinespirits-eval.code-workspace +11 -0
  9. package/config/providers.yaml +60 -0
  10. package/config/suggestion-scenarios.yaml +1399 -0
  11. package/config/tutor-agents.yaml +716 -0
  12. package/docs/EVALUATION-VARIABLES.md +589 -0
  13. package/docs/REPLICATION-PLAN.md +577 -0
  14. package/index.js +15 -6
  15. package/package.json +16 -22
  16. package/routes/evalRoutes.js +88 -36
  17. package/scripts/analyze-judge-reliability.js +401 -0
  18. package/scripts/analyze-run.js +97 -0
  19. package/scripts/analyze-run.mjs +282 -0
  20. package/scripts/analyze-validation-failures.js +141 -0
  21. package/scripts/check-run.mjs +17 -0
  22. package/scripts/code-impasse-strategies.js +1132 -0
  23. package/scripts/compare-runs.js +44 -0
  24. package/scripts/compare-suggestions.js +80 -0
  25. package/scripts/compare-transformation.js +116 -0
  26. package/scripts/dig-into-run.js +158 -0
  27. package/scripts/eval-cli.js +2626 -0
  28. package/scripts/generate-paper-figures.py +452 -0
  29. package/scripts/qualitative-analysis-ai.js +1313 -0
  30. package/scripts/qualitative-analysis.js +688 -0
  31. package/scripts/seed-db.js +87 -0
  32. package/scripts/show-failed-suggestions.js +64 -0
  33. package/scripts/validate-content.js +192 -0
  34. package/server.js +3 -2
  35. package/services/__tests__/evalConfigLoader.test.js +338 -0
  36. package/services/anovaStats.js +499 -0
  37. package/services/contentResolver.js +407 -0
  38. package/services/dialogueTraceAnalyzer.js +454 -0
  39. package/services/evalConfigLoader.js +625 -0
  40. package/services/evaluationRunner.js +2171 -270
  41. package/services/evaluationStore.js +564 -29
  42. package/services/learnerConfigLoader.js +75 -5
  43. package/services/learnerRubricEvaluator.js +284 -0
  44. package/services/learnerTutorInteractionEngine.js +375 -0
  45. package/services/processUtils.js +18 -0
  46. package/services/progressLogger.js +98 -0
  47. package/services/promptRecommendationService.js +31 -26
  48. package/services/promptRewriter.js +427 -0
  49. package/services/rubricEvaluator.js +543 -70
  50. package/services/streamingReporter.js +104 -0
  51. package/services/turnComparisonAnalyzer.js +494 -0
  52. package/components/MobileEvalDashboard.tsx +0 -267
  53. package/components/comparison/DeltaAnalysisTable.tsx +0 -137
  54. package/components/comparison/ProfileComparisonCard.tsx +0 -176
  55. package/components/comparison/RecognitionABMode.tsx +0 -385
  56. package/components/comparison/RecognitionMetricsPanel.tsx +0 -135
  57. package/components/comparison/WinnerIndicator.tsx +0 -64
  58. package/components/comparison/index.ts +0 -5
  59. package/components/mobile/BottomSheet.tsx +0 -233
  60. package/components/mobile/DimensionBreakdown.tsx +0 -210
  61. package/components/mobile/DocsView.tsx +0 -363
  62. package/components/mobile/LogsView.tsx +0 -481
  63. package/components/mobile/PsychodynamicQuadrant.tsx +0 -261
  64. package/components/mobile/QuickTestView.tsx +0 -1098
  65. package/components/mobile/RecognitionTypeChart.tsx +0 -124
  66. package/components/mobile/RecognitionView.tsx +0 -809
  67. package/components/mobile/RunDetailView.tsx +0 -261
  68. package/components/mobile/RunHistoryView.tsx +0 -367
  69. package/components/mobile/ScoreRadial.tsx +0 -211
  70. package/components/mobile/StreamingLogPanel.tsx +0 -230
  71. package/components/mobile/SynthesisStrategyChart.tsx +0 -140
  72. package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +0 -52
  73. package/docs/research/ABLATION-MODEL-SELECTION.md +0 -53
  74. package/docs/research/ADVANCED-EVAL-ANALYSIS.md +0 -60
  75. package/docs/research/ANOVA-RESULTS-2026-01-14.md +0 -257
  76. package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +0 -586
  77. package/docs/research/COST-ANALYSIS.md +0 -56
  78. package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +0 -340
  79. package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +0 -291
  80. package/docs/research/EVAL-SYSTEM-ANALYSIS.md +0 -306
  81. package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +0 -301
  82. package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +0 -1988
  83. package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +0 -282
  84. package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +0 -147
  85. package/docs/research/PAPER-EXTENSION-DYADIC.md +0 -204
  86. package/docs/research/PAPER-UNIFIED.md +0 -659
  87. package/docs/research/PAPER-UNIFIED.pdf +0 -0
  88. package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +0 -356
  89. package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +0 -419
  90. package/docs/research/apa.csl +0 -2133
  91. package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +0 -1637
  92. package/docs/research/archive/paper-multiagent-tutor.tex +0 -978
  93. package/docs/research/paper-draft/full-paper.md +0 -136
  94. package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
  95. package/docs/research/paper-draft/references.bib +0 -515
  96. package/docs/research/transcript-baseline.md +0 -139
  97. package/docs/research/transcript-recognition-multiagent.md +0 -187
  98. package/hooks/useEvalData.ts +0 -625
  99. package/server-init.js +0 -45
  100. package/services/benchmarkService.js +0 -1892
  101. package/types.ts +0 -165
  102. package/utils/haptics.ts +0 -45
@@ -2,212 +2,166 @@
2
2
  # Defines how the simulated learner generates responses during evaluation
3
3
  #
4
4
  # ============================================================================
5
- # LEARNER ARCHITECTURES
5
+ # MODEL OVERRIDES (optional)
6
6
  # ============================================================================
7
+ # When uncommented, these take precedence over the models configured in every profile. Useful for quick testing.
8
+ # CLI flags (--model, --ego-model, --superego-model) take precedence over these.
7
9
  #
8
- # The learner architecture determines whether the simulated learner has
10
+ # model_override: openrouter.haiku # Override ALL models (ego + superego + synthesis)
11
+ # ego_model_override: openrouter.nemotron # Override only ego model
12
+ # superego_model_override: openrouter.kimi-k2.5 # Override only superego model
13
+ #
14
+ # ============================================================================
15
+ # LEARNER PROFILES
16
+ # ============================================================================
17
+ #
18
+ # The learner profile determines whether the simulated learner has
9
19
  # internal deliberation before generating responses. This enables testing
10
20
  # whether multi-agent learner simulation improves evaluation validity.
11
21
  #
12
- # Architectures:
22
+ # Profiles:
13
23
  # 1. unified: Single learner agent (no internal dialogue)
14
- # 2. psychodynamic: Freudian desire/intellect/aspiration deliberation
15
- # 3. dialectical: Hegelian thesis/antithesis/synthesis process
24
+ # 2. ego_superego: Two-agent ego/superego deliberation (mirrors tutor architecture)
25
+ # 3. psychodynamic: Legacy alias → resolves to ego_superego (unified_recognition and ego_superego_recognition variants are also defined under profiles)
16
26
  #
17
27
  # ============================================================================
18
28
 
19
- # Active architecture (can be overridden by tutor profile)
29
+ # Active profile (can be overridden by tutor profile's learner_architecture)
20
30
  active_architecture: unified
21
31
 
22
32
  # ============================================================================
23
- # ARCHITECTURE DEFINITIONS
33
+ # PROFILES
24
34
  # ============================================================================
25
35
 
26
- architectures:
27
- # Unified: Single agent with no internal deliberation
28
- # Simple, fast, good for baseline comparisons
36
+ profiles:
37
+ # Single-agent: no internal deliberation
29
38
  unified:
30
- name: "Unified Learner"
31
- description: "Single learner agent without internal deliberation"
32
- deliberation:
39
+ description: "Single unified learner agent"
40
+ architecture: unified
41
+ unified_learner:
42
+ provider: openrouter
43
+ model: kimi-k2.5
44
+ prompt_file: learner-unified.md
45
+ hyperparameters:
46
+ temperature: 0.7
47
+ max_tokens: 500
48
+ dialogue:
33
49
  enabled: false
34
50
  max_rounds: 0
35
51
 
36
- agent:
52
+ # Two-agent: mirrors tutor ego/superego pattern
53
+ ego_superego:
54
+ description: "Ego/superego learner — mirrors tutor architecture"
55
+ architecture: ego_superego
56
+ ego:
37
57
  provider: openrouter
38
58
  model: nemotron
39
- prompt_file: learner-unified.md
59
+ prompt_file: learner-ego.md
40
60
  hyperparameters:
41
61
  temperature: 0.7
62
+ max_tokens: 400
63
+ superego:
64
+ provider: openrouter
65
+ model: kimi-k2.5
66
+ prompt_file: learner-superego.md
67
+ hyperparameters:
68
+ temperature: 0.5
69
+ max_tokens: 400
70
+ synthesis:
71
+ provider: openrouter
72
+ model: kimi-k2.5
73
+ prompt_file: learner-synthesis.md
74
+ hyperparameters:
75
+ temperature: 0.6
42
76
  max_tokens: 500
43
-
44
- # Psychodynamic: Freudian-inspired desire/intellect/aspiration
45
- # Internal deliberation between:
46
- # - Desire (Id-like): What the learner wants emotionally
47
- # - Intellect (Ego-like): Rational analysis of the situation
48
- # - Aspiration (Superego-like): Idealized learning goals
49
- psychodynamic:
50
- name: "Psychodynamic Learner"
51
- description: "Freudian-inspired internal deliberation between desire, intellect, and aspiration"
52
- deliberation:
77
+ dialogue:
53
78
  enabled: true
54
79
  max_rounds: 2
55
- convergence_threshold: 0.7
56
-
57
- agents:
58
- desire:
59
- role: "id"
60
- description: "Emotional/affective responses - what the learner WANTS"
61
- provider: openrouter
62
- model: nemotron
63
- prompt_file: learner-desire.md
64
- hyperparameters:
65
- temperature: 0.8 # Higher temp for more emotional/varied responses
66
- max_tokens: 400
67
80
 
68
- intellect:
69
- role: "ego"
70
- description: "Rational analysis - what the learner THINKS"
71
- provider: openrouter
72
- model: nemotron
73
- prompt_file: learner-intellect.md
74
- hyperparameters:
75
- temperature: 0.5 # Lower temp for more analytical responses
76
- max_tokens: 400
77
-
78
- aspiration:
79
- role: "superego"
80
- description: "Idealized goals - what the learner SHOULD want"
81
- provider: openrouter
82
- model: nemotron
83
- prompt_file: learner-aspiration.md
84
- hyperparameters:
85
- temperature: 0.6
86
- max_tokens: 400
87
-
88
- synthesizer:
89
- description: "Integrates the three voices into a coherent response"
90
- provider: openrouter
91
- model: nemotron
92
- prompt_file: learner-synthesizer.md
93
- hyperparameters:
94
- temperature: 0.6
95
- max_tokens: 500
96
-
97
- deliberation_process: |
98
- The psychodynamic learner simulates internal conflict:
99
-
100
- 1. DESIRE (Id): "I want to skip ahead / I'm bored / This is frustrating"
101
- - Immediate emotional reactions
102
- - Avoidance tendencies
103
- - Curiosity and excitement
104
-
105
- 2. INTELLECT (Ego): "This doesn't make sense yet / I need more examples"
106
- - Rational assessment of understanding
107
- - Strategic thinking about learning path
108
- - Reality testing
109
-
110
- 3. ASPIRATION (Superego): "I should master this / I want to be an expert"
111
- - Long-term learning goals
112
- - Internalized expectations
113
- - Self-improvement drives
114
-
115
- 4. SYNTHESIS: Integration into coherent learner response
116
- - Balances immediate desires with long-term goals
117
- - Produces realistic learner behavior
81
+ # Recognition-enhanced: single unified learner with recognition prompts
82
+ unified_recognition:
83
+ description: "Single unified learner with recognition-aware prompt"
84
+ architecture: unified
85
+ unified_learner:
86
+ provider: openrouter
87
+ model: kimi-k2.5
88
+ prompt_file: learner-unified.md
89
+ hyperparameters:
90
+ temperature: 0.7
91
+ max_tokens: 600
92
+ dialogue:
93
+ enabled: false
94
+ max_rounds: 0
118
95
 
119
- # Dialectical: Hegelian thesis/antithesis/synthesis
120
- # Internal deliberation where:
121
- # - Thesis: Initial position/understanding
122
- # - Antithesis: Challenge/complication to that position
123
- # - Synthesis: New integrated understanding
124
- dialectical:
125
- name: "Dialectical Learner"
126
- description: "Hegelian-inspired internal dialectic between thesis, antithesis, and synthesis"
127
- deliberation:
96
+ # Recognition-enhanced: ego/superego with recognition-specific prompts
97
+ ego_superego_recognition:
98
+ description: "Ego/superego learner with recognition-aware prompts and memory"
99
+ architecture: ego_superego
100
+ ego:
101
+ provider: openrouter
102
+ model: nemotron
103
+ prompt_file: learner-ego-recognition.md
104
+ hyperparameters:
105
+ temperature: 0.7
106
+ max_tokens: 600
107
+ superego:
108
+ provider: openrouter
109
+ model: kimi-k2.5
110
+ prompt_file: learner-superego-recognition.md
111
+ hyperparameters:
112
+ temperature: 0.5
113
+ max_tokens: 600
114
+ synthesis:
115
+ provider: openrouter
116
+ model: kimi-k2.5
117
+ prompt_file: learner-synthesis-recognition.md
118
+ hyperparameters:
119
+ temperature: 0.6
120
+ max_tokens: 700
121
+ dialogue:
128
122
  enabled: true
129
123
  max_rounds: 2
130
- convergence_threshold: 0.7
131
-
132
- agents:
133
- thesis:
134
- role: "thesis"
135
- description: "Initial understanding or position"
136
- provider: openrouter
137
- model: nemotron
138
- prompt_file: learner-thesis.md
139
- hyperparameters:
140
- temperature: 0.6
141
- max_tokens: 400
142
-
143
- antithesis:
144
- role: "antithesis"
145
- description: "Challenge or complication to the thesis"
146
- provider: openrouter
147
- model: nemotron
148
- prompt_file: learner-antithesis.md
149
- hyperparameters:
150
- temperature: 0.7
151
- max_tokens: 400
152
-
153
- synthesis:
154
- role: "synthesis"
155
- description: "Integration that preserves and overcomes the tension"
156
- provider: openrouter
157
- model: nemotron
158
- prompt_file: learner-synthesis.md
159
- hyperparameters:
160
- temperature: 0.6
161
- max_tokens: 500
162
124
 
163
- deliberation_process: |
164
- The dialectical learner simulates Hegelian movement:
165
-
166
- 1. THESIS: "I understand X as..."
167
- - Initial grasp of the concept
168
- - Current mental model
169
- - Working hypothesis
170
-
171
- 2. ANTITHESIS: "But wait, what about Y? That complicates things..."
172
- - Internal contradiction discovered
173
- - New information that doesn't fit
174
- - Productive confusion
175
-
176
- 3. SYNTHESIS: "So actually, X and Y together mean..."
177
- - New understanding that integrates both
178
- - Aufhebung: preserves while overcoming
179
- - Readiness for next dialectical cycle
125
+ # Legacy alias so existing 'psychodynamic' references resolve to ego_superego
126
+ psychodynamic:
127
+ description: "Legacy alias — uses ego/superego architecture"
128
+ architecture: ego_superego
129
+ ego:
130
+ provider: openrouter
131
+ model: nemotron
132
+ prompt_file: learner-ego.md
133
+ hyperparameters:
134
+ temperature: 0.7
135
+ max_tokens: 400
136
+ superego:
137
+ provider: openrouter
138
+ model: kimi-k2.5
139
+ prompt_file: learner-superego.md
140
+ hyperparameters:
141
+ temperature: 0.5
142
+ max_tokens: 400
143
+ synthesis:
144
+ provider: openrouter
145
+ model: kimi-k2.5
146
+ prompt_file: learner-synthesis.md
147
+ hyperparameters:
148
+ temperature: 0.6
149
+ max_tokens: 500
150
+ dialogue:
151
+ enabled: true
152
+ max_rounds: 2
180
153
 
181
154
  # ============================================================================
182
155
  # PERSONA MODIFIERS
183
156
  # ============================================================================
184
- # These modifiers adjust the base architecture based on learner persona
157
+ # These modifiers adjust learner behavior based on persona
185
158
 
186
159
  persona_modifiers:
187
- confused_novice:
188
- desire_weight: 0.4 # Strong emotional reactions to confusion
189
- intellect_weight: 0.3 # Struggles with analysis
190
- aspiration_weight: 0.3 # Wants to succeed but uncertain
191
-
192
- eager_explorer:
193
- desire_weight: 0.5 # Curiosity-driven
194
- intellect_weight: 0.3 # Quick but sometimes shallow analysis
195
- aspiration_weight: 0.2 # Less concerned with "should"
196
-
197
- focused_achiever:
198
- desire_weight: 0.2 # Controlled emotional reactions
199
- intellect_weight: 0.4 # Strong analytical focus
200
- aspiration_weight: 0.4 # Clear goals
201
-
202
- struggling_anxious:
203
- desire_weight: 0.5 # Strong anxiety-driven responses
204
- intellect_weight: 0.2 # Anxiety impairs analysis
205
- aspiration_weight: 0.3 # High expectations create pressure
206
-
207
- adversarial_tester:
208
- desire_weight: 0.3 # Enjoys challenging
209
- intellect_weight: 0.4 # Analytical about finding weaknesses
210
- aspiration_weight: 0.3 # Wants to be thorough
160
+ confused_novice: {}
161
+ eager_explorer: {}
162
+ focused_achiever: {}
163
+ struggling_anxious: {}
164
+ adversarial_tester: {}
211
165
 
212
166
  # ============================================================================
213
167
  # EVALUATION SETTINGS
@@ -222,27 +176,4 @@ evaluation:
222
176
  metrics:
223
177
  - deliberation_rounds
224
178
  - internal_coherence
225
- - desire_intellect_tension
226
- - aspiration_alignment
227
179
  - response_authenticity
228
-
229
- # ============================================================================
230
- # ABLATION STUDY SUPPORT
231
- # ============================================================================
232
-
233
- ablation:
234
- # Mapping of ablation profiles to learner architectures
235
- profile_architectures:
236
- ablation_baseline_unified: unified
237
- ablation_baseline_multilearner: psychodynamic
238
- ablation_multiagent_unified: unified
239
- ablation_multiagent_multilearner: psychodynamic
240
- ablation_recognition_unified: unified
241
- ablation_recognition_multilearner: psychodynamic
242
- ablation_recognition_multiagent_unified: unified
243
- ablation_recognition_multiagent_multilearner: psychodynamic
244
-
245
- # Which architectures to compare in ablation studies
246
- architectures_to_compare:
247
- - unified
248
- - psychodynamic
@@ -0,0 +1,11 @@
1
+ {
2
+ "folders": [
3
+ {
4
+ "path": ".."
5
+ },
6
+ {
7
+ "path": "../../machinespirits-tutor-core"
8
+ }
9
+ ],
10
+ "settings": {}
11
+ }
@@ -0,0 +1,60 @@
1
+ # Shared AI Provider Configuration
2
+ # Used by both tutor-agents.yaml and evaluation-rubric.yaml
3
+ #
4
+ # Model IDs are current as of January 2026. Update when new models release.
5
+
6
+ providers:
7
+ anthropic:
8
+ api_key_env: ANTHROPIC_API_KEY
9
+ base_url: https://api.anthropic.com/v1/messages
10
+ default_model: claude-sonnet-4-5
11
+ models:
12
+ haiku: claude-haiku-4-5
13
+ sonnet: claude-sonnet-4-5
14
+ opus: claude-opus-4-5
15
+
16
+ openai:
17
+ api_key_env: OPENAI_API_KEY
18
+ base_url: https://api.openai.com/v1/chat/completions
19
+ default_model: gpt-5-mini
20
+ models:
21
+ mini: gpt-5-mini
22
+ standard: gpt-5.2
23
+
24
+ openrouter:
25
+ api_key_env: OPENROUTER_API_KEY
26
+ base_url: https://openrouter.ai/api/v1/chat/completions
27
+ default_model: nvidia/nemotron-3-nano-30b-a3b:free
28
+ models:
29
+ # Budget-friendly options (only IDs with the ":free" suffix are free tier)
30
+ nemotron: nvidia/nemotron-3-nano-30b-a3b:free
31
+ glm47: z-ai/glm-4.7
32
+ kimi-k2: moonshotai/kimi-k2-thinking
33
+ "kimi-k2.5": moonshotai/kimi-k2.5
34
+ deepseek: deepseek/deepseek-v3.2
35
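+ minimax: minimax/minimax-m2.1ate # NOTE(review): trailing "ate" looks like a typo — presumably minimax/minimax-m2.1; confirm against the OpenRouter model list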
36
+ haiku: anthropic/claude-haiku-4.5
37
+ gpt-oss: openai/gpt-oss-120b
38
+ # Mid-tier options
39
+ sonnet: anthropic/claude-sonnet-4.5
40
+ gpt-mini: openai/gpt-5-mini
41
+ gemini-flash: google/gemini-3-flash-preview
42
+ # Premium options
43
+ opus: anthropic/claude-opus-4.5
44
+ gpt: openai/gpt-5.2
45
+ gemini-pro: google/gemini-3-pro-preview
46
+
47
+ gemini:
48
+ api_key_env: GEMINI_API_KEY
49
+ base_url: https://generativelanguage.googleapis.com/v1beta/models
50
+ default_model: gemini-3-flash-preview
51
+ models:
52
+ flash: gemini-3-flash-preview
53
+ pro: gemini-3-pro-preview
54
+
55
+ local:
56
+ base_url: http://localhost:1234/v1/chat/completions
57
+ format: openai
58
+ default_model: local-model
59
+ models:
60
+ default: local-model