@machinespirits/eval 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/components/MobileEvalDashboard.tsx +267 -0
- package/components/comparison/DeltaAnalysisTable.tsx +137 -0
- package/components/comparison/ProfileComparisonCard.tsx +176 -0
- package/components/comparison/RecognitionABMode.tsx +385 -0
- package/components/comparison/RecognitionMetricsPanel.tsx +135 -0
- package/components/comparison/WinnerIndicator.tsx +64 -0
- package/components/comparison/index.ts +5 -0
- package/components/mobile/BottomSheet.tsx +233 -0
- package/components/mobile/DimensionBreakdown.tsx +210 -0
- package/components/mobile/DocsView.tsx +363 -0
- package/components/mobile/LogsView.tsx +481 -0
- package/components/mobile/PsychodynamicQuadrant.tsx +261 -0
- package/components/mobile/QuickTestView.tsx +1098 -0
- package/components/mobile/RecognitionTypeChart.tsx +124 -0
- package/components/mobile/RecognitionView.tsx +809 -0
- package/components/mobile/RunDetailView.tsx +261 -0
- package/components/mobile/RunHistoryView.tsx +367 -0
- package/components/mobile/ScoreRadial.tsx +211 -0
- package/components/mobile/StreamingLogPanel.tsx +230 -0
- package/components/mobile/SynthesisStrategyChart.tsx +140 -0
- package/config/interaction-eval-scenarios.yaml +832 -0
- package/config/learner-agents.yaml +248 -0
- package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +52 -0
- package/docs/research/ABLATION-MODEL-SELECTION.md +53 -0
- package/docs/research/ADVANCED-EVAL-ANALYSIS.md +60 -0
- package/docs/research/ANOVA-RESULTS-2026-01-14.md +257 -0
- package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +586 -0
- package/docs/research/COST-ANALYSIS.md +56 -0
- package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +340 -0
- package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +291 -0
- package/docs/research/EVAL-SYSTEM-ANALYSIS.md +306 -0
- package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +301 -0
- package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +1988 -0
- package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +282 -0
- package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +147 -0
- package/docs/research/PAPER-EXTENSION-DYADIC.md +204 -0
- package/docs/research/PAPER-UNIFIED.md +659 -0
- package/docs/research/PAPER-UNIFIED.pdf +0 -0
- package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +356 -0
- package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +419 -0
- package/docs/research/apa.csl +2133 -0
- package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +1637 -0
- package/docs/research/archive/paper-multiagent-tutor.tex +978 -0
- package/docs/research/paper-draft/full-paper.md +136 -0
- package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
- package/docs/research/paper-draft/references.bib +515 -0
- package/docs/research/transcript-baseline.md +139 -0
- package/docs/research/transcript-recognition-multiagent.md +187 -0
- package/hooks/useEvalData.ts +625 -0
- package/index.js +27 -0
- package/package.json +73 -0
- package/routes/evalRoutes.js +3002 -0
- package/scripts/advanced-eval-analysis.js +351 -0
- package/scripts/analyze-eval-costs.js +378 -0
- package/scripts/analyze-eval-results.js +513 -0
- package/scripts/analyze-interaction-evals.js +368 -0
- package/server-init.js +45 -0
- package/server.js +162 -0
- package/services/benchmarkService.js +1892 -0
- package/services/evaluationRunner.js +739 -0
- package/services/evaluationStore.js +1121 -0
- package/services/learnerConfigLoader.js +385 -0
- package/services/learnerTutorInteractionEngine.js +857 -0
- package/services/memory/learnerMemoryService.js +1227 -0
- package/services/memory/learnerWritingPad.js +577 -0
- package/services/memory/tutorWritingPad.js +674 -0
- package/services/promptRecommendationService.js +493 -0
- package/services/rubricEvaluator.js +826 -0
|
@@ -0,0 +1,248 @@
|
|
|
1
|
+
# Learner Agent Architecture Configuration
|
|
2
|
+
# Defines how the simulated learner generates responses during evaluation
|
|
3
|
+
#
|
|
4
|
+
# ============================================================================
|
|
5
|
+
# LEARNER ARCHITECTURES
|
|
6
|
+
# ============================================================================
|
|
7
|
+
#
|
|
8
|
+
# The learner architecture determines whether the simulated learner has
|
|
9
|
+
# internal deliberation before generating responses. This enables testing
|
|
10
|
+
# whether multi-agent learner simulation improves evaluation validity.
|
|
11
|
+
#
|
|
12
|
+
# Architectures:
|
|
13
|
+
# 1. unified: Single learner agent (no internal dialogue)
|
|
14
|
+
# 2. psychodynamic: Freudian desire/intellect/aspiration deliberation
|
|
15
|
+
# 3. dialectical: Hegelian thesis/antithesis/synthesis process
|
|
16
|
+
#
|
|
17
|
+
# ============================================================================
|
|
18
|
+
|
|
19
|
+
# Active architecture (can be overridden by tutor profile)
|
|
20
|
+
active_architecture: unified
|
|
21
|
+
|
|
22
|
+
# ============================================================================
|
|
23
|
+
# ARCHITECTURE DEFINITIONS
|
|
24
|
+
# ============================================================================
|
|
25
|
+
|
|
26
|
+
architectures:
|
|
27
|
+
# Unified: Single agent with no internal deliberation
|
|
28
|
+
# Simple, fast, good for baseline comparisons
|
|
29
|
+
unified:
|
|
30
|
+
name: "Unified Learner"
|
|
31
|
+
description: "Single learner agent without internal deliberation"
|
|
32
|
+
deliberation:
|
|
33
|
+
enabled: false
|
|
34
|
+
max_rounds: 0
|
|
35
|
+
|
|
36
|
+
agent:
|
|
37
|
+
provider: openrouter
|
|
38
|
+
model: nemotron
|
|
39
|
+
prompt_file: learner-unified.md
|
|
40
|
+
hyperparameters:
|
|
41
|
+
temperature: 0.7
|
|
42
|
+
max_tokens: 500
|
|
43
|
+
|
|
44
|
+
# Psychodynamic: Freudian-inspired desire/intellect/aspiration
|
|
45
|
+
# Internal deliberation between:
|
|
46
|
+
# - Desire (Id-like): What the learner wants emotionally
|
|
47
|
+
# - Intellect (Ego-like): Rational analysis of the situation
|
|
48
|
+
# - Aspiration (Superego-like): Idealized learning goals
|
|
49
|
+
psychodynamic:
|
|
50
|
+
name: "Psychodynamic Learner"
|
|
51
|
+
description: "Freudian-inspired internal deliberation between desire, intellect, and aspiration"
|
|
52
|
+
deliberation:
|
|
53
|
+
enabled: true
|
|
54
|
+
max_rounds: 2
|
|
55
|
+
convergence_threshold: 0.7
|
|
56
|
+
|
|
57
|
+
agents:
|
|
58
|
+
desire:
|
|
59
|
+
role: "id"
|
|
60
|
+
description: "Emotional/affective responses - what the learner WANTS"
|
|
61
|
+
provider: openrouter
|
|
62
|
+
model: nemotron
|
|
63
|
+
prompt_file: learner-desire.md
|
|
64
|
+
hyperparameters:
|
|
65
|
+
temperature: 0.8 # Higher temp for more emotional/varied responses
|
|
66
|
+
max_tokens: 400
|
|
67
|
+
|
|
68
|
+
intellect:
|
|
69
|
+
role: "ego"
|
|
70
|
+
description: "Rational analysis - what the learner THINKS"
|
|
71
|
+
provider: openrouter
|
|
72
|
+
model: nemotron
|
|
73
|
+
prompt_file: learner-intellect.md
|
|
74
|
+
hyperparameters:
|
|
75
|
+
temperature: 0.5 # Lower temp for more analytical responses
|
|
76
|
+
max_tokens: 400
|
|
77
|
+
|
|
78
|
+
aspiration:
|
|
79
|
+
role: "superego"
|
|
80
|
+
description: "Idealized goals - what the learner SHOULD want"
|
|
81
|
+
provider: openrouter
|
|
82
|
+
model: nemotron
|
|
83
|
+
prompt_file: learner-aspiration.md
|
|
84
|
+
hyperparameters:
|
|
85
|
+
temperature: 0.6
|
|
86
|
+
max_tokens: 400
|
|
87
|
+
|
|
88
|
+
synthesizer:
|
|
89
|
+
description: "Integrates the three voices into a coherent response"
|
|
90
|
+
provider: openrouter
|
|
91
|
+
model: nemotron
|
|
92
|
+
prompt_file: learner-synthesizer.md
|
|
93
|
+
hyperparameters:
|
|
94
|
+
temperature: 0.6
|
|
95
|
+
max_tokens: 500
|
|
96
|
+
|
|
97
|
+
deliberation_process: |
|
|
98
|
+
The psychodynamic learner simulates internal conflict:
|
|
99
|
+
|
|
100
|
+
1. DESIRE (Id): "I want to skip ahead / I'm bored / This is frustrating"
|
|
101
|
+
- Immediate emotional reactions
|
|
102
|
+
- Avoidance tendencies
|
|
103
|
+
- Curiosity and excitement
|
|
104
|
+
|
|
105
|
+
2. INTELLECT (Ego): "This doesn't make sense yet / I need more examples"
|
|
106
|
+
- Rational assessment of understanding
|
|
107
|
+
- Strategic thinking about learning path
|
|
108
|
+
- Reality testing
|
|
109
|
+
|
|
110
|
+
3. ASPIRATION (Superego): "I should master this / I want to be an expert"
|
|
111
|
+
- Long-term learning goals
|
|
112
|
+
- Internalized expectations
|
|
113
|
+
- Self-improvement drives
|
|
114
|
+
|
|
115
|
+
4. SYNTHESIS: Integration into coherent learner response
|
|
116
|
+
- Balances immediate desires with long-term goals
|
|
117
|
+
- Produces realistic learner behavior
|
|
118
|
+
|
|
119
|
+
# Dialectical: Hegelian thesis/antithesis/synthesis
|
|
120
|
+
# Internal deliberation where:
|
|
121
|
+
# - Thesis: Initial position/understanding
|
|
122
|
+
# - Antithesis: Challenge/complication to that position
|
|
123
|
+
# - Synthesis: New integrated understanding
|
|
124
|
+
dialectical:
|
|
125
|
+
name: "Dialectical Learner"
|
|
126
|
+
description: "Hegelian-inspired internal dialectic between thesis, antithesis, and synthesis"
|
|
127
|
+
deliberation:
|
|
128
|
+
enabled: true
|
|
129
|
+
max_rounds: 2
|
|
130
|
+
convergence_threshold: 0.7
|
|
131
|
+
|
|
132
|
+
agents:
|
|
133
|
+
thesis:
|
|
134
|
+
role: "thesis"
|
|
135
|
+
description: "Initial understanding or position"
|
|
136
|
+
provider: openrouter
|
|
137
|
+
model: nemotron
|
|
138
|
+
prompt_file: learner-thesis.md
|
|
139
|
+
hyperparameters:
|
|
140
|
+
temperature: 0.6
|
|
141
|
+
max_tokens: 400
|
|
142
|
+
|
|
143
|
+
antithesis:
|
|
144
|
+
role: "antithesis"
|
|
145
|
+
description: "Challenge or complication to the thesis"
|
|
146
|
+
provider: openrouter
|
|
147
|
+
model: nemotron
|
|
148
|
+
prompt_file: learner-antithesis.md
|
|
149
|
+
hyperparameters:
|
|
150
|
+
temperature: 0.7
|
|
151
|
+
max_tokens: 400
|
|
152
|
+
|
|
153
|
+
synthesis:
|
|
154
|
+
role: "synthesis"
|
|
155
|
+
description: "Integration that preserves and overcomes the tension"
|
|
156
|
+
provider: openrouter
|
|
157
|
+
model: nemotron
|
|
158
|
+
prompt_file: learner-synthesis.md
|
|
159
|
+
hyperparameters:
|
|
160
|
+
temperature: 0.6
|
|
161
|
+
max_tokens: 500
|
|
162
|
+
|
|
163
|
+
deliberation_process: |
|
|
164
|
+
The dialectical learner simulates Hegelian movement:
|
|
165
|
+
|
|
166
|
+
1. THESIS: "I understand X as..."
|
|
167
|
+
- Initial grasp of the concept
|
|
168
|
+
- Current mental model
|
|
169
|
+
- Working hypothesis
|
|
170
|
+
|
|
171
|
+
2. ANTITHESIS: "But wait, what about Y? That complicates things..."
|
|
172
|
+
- Internal contradiction discovered
|
|
173
|
+
- New information that doesn't fit
|
|
174
|
+
- Productive confusion
|
|
175
|
+
|
|
176
|
+
3. SYNTHESIS: "So actually, X and Y together mean..."
|
|
177
|
+
- New understanding that integrates both
|
|
178
|
+
- Aufhebung: preserves while overcoming
|
|
179
|
+
- Readiness for next dialectical cycle
|
|
180
|
+
|
|
181
|
+
# ============================================================================
|
|
182
|
+
# PERSONA MODIFIERS
|
|
183
|
+
# ============================================================================
|
|
184
|
+
# These modifiers adjust the base architecture based on learner persona
|
|
185
|
+
|
|
186
|
+
persona_modifiers:
|
|
187
|
+
confused_novice:
|
|
188
|
+
desire_weight: 0.4 # Strong emotional reactions to confusion
|
|
189
|
+
intellect_weight: 0.3 # Struggles with analysis
|
|
190
|
+
aspiration_weight: 0.3 # Wants to succeed but uncertain
|
|
191
|
+
|
|
192
|
+
eager_explorer:
|
|
193
|
+
desire_weight: 0.5 # Curiosity-driven
|
|
194
|
+
intellect_weight: 0.3 # Quick but sometimes shallow analysis
|
|
195
|
+
aspiration_weight: 0.2 # Less concerned with "should"
|
|
196
|
+
|
|
197
|
+
focused_achiever:
|
|
198
|
+
desire_weight: 0.2 # Controlled emotional reactions
|
|
199
|
+
intellect_weight: 0.4 # Strong analytical focus
|
|
200
|
+
aspiration_weight: 0.4 # Clear goals
|
|
201
|
+
|
|
202
|
+
struggling_anxious:
|
|
203
|
+
desire_weight: 0.5 # Strong anxiety-driven responses
|
|
204
|
+
intellect_weight: 0.2 # Anxiety impairs analysis
|
|
205
|
+
aspiration_weight: 0.3 # High expectations create pressure
|
|
206
|
+
|
|
207
|
+
adversarial_tester:
|
|
208
|
+
desire_weight: 0.3 # Enjoys challenging
|
|
209
|
+
intellect_weight: 0.4 # Analytical about finding weaknesses
|
|
210
|
+
aspiration_weight: 0.3 # Wants to be thorough
|
|
211
|
+
|
|
212
|
+
# ============================================================================
|
|
213
|
+
# EVALUATION SETTINGS
|
|
214
|
+
# ============================================================================
|
|
215
|
+
|
|
216
|
+
evaluation:
|
|
217
|
+
# Track internal deliberation traces for analysis
|
|
218
|
+
log_deliberation: true
|
|
219
|
+
log_path: logs/learner-deliberation
|
|
220
|
+
|
|
221
|
+
# Metrics specific to multi-agent learner
|
|
222
|
+
metrics:
|
|
223
|
+
- deliberation_rounds
|
|
224
|
+
- internal_coherence
|
|
225
|
+
- desire_intellect_tension
|
|
226
|
+
- aspiration_alignment
|
|
227
|
+
- response_authenticity
|
|
228
|
+
|
|
229
|
+
# ============================================================================
|
|
230
|
+
# ABLATION STUDY SUPPORT
|
|
231
|
+
# ============================================================================
|
|
232
|
+
|
|
233
|
+
ablation:
|
|
234
|
+
# Mapping of ablation profiles to learner architectures
|
|
235
|
+
profile_architectures:
|
|
236
|
+
ablation_baseline_unified: unified
|
|
237
|
+
ablation_baseline_multilearner: psychodynamic
|
|
238
|
+
ablation_multiagent_unified: unified
|
|
239
|
+
ablation_multiagent_multilearner: psychodynamic
|
|
240
|
+
ablation_recognition_unified: unified
|
|
241
|
+
ablation_recognition_multilearner: psychodynamic
|
|
242
|
+
ablation_recognition_multiagent_unified: unified
|
|
243
|
+
ablation_recognition_multiagent_multilearner: psychodynamic
|
|
244
|
+
|
|
245
|
+
# Which architectures to compare in ablation studies
|
|
246
|
+
architectures_to_compare:
|
|
247
|
+
- unified
|
|
248
|
+
- psychodynamic
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
# Ablation Study: Dialogue Rounds
|
|
2
|
+
|
|
3
|
+
**Generated:** 2026-01-14T10:23:56.877Z
|
|
4
|
+
|
|
5
|
+
## Research Question
|
|
6
|
+
|
|
7
|
+
Does increasing the number of Ego-Superego dialogue rounds improve tutor suggestion quality?
|
|
8
|
+
|
|
9
|
+
## Method
|
|
10
|
+
|
|
11
|
+
Compared evaluation scores across profiles with different `max_rounds` settings:
|
|
12
|
+
- **0 rounds**: Single-agent (no Superego review)
|
|
13
|
+
- **1 round**: Single critique-revise cycle
|
|
14
|
+
- **2 rounds**: Two critique-revise cycles (default)
|
|
15
|
+
- **3 rounds**: Three critique-revise cycles
|
|
16
|
+
|
|
17
|
+
## Results
|
|
18
|
+
|
|
19
|
+
### Descriptive Statistics
|
|
20
|
+
|
|
21
|
+
| Rounds | N | Mean | SD | 95% CI |
|
|
22
|
+
|--------|---|------|-----|--------|
|
|
23
|
+
| 0 | 483 | 91.58 | 15.75 | [90.2, 93.0] |
|
|
24
|
+
| 1 | 1 | 50.00 | 0.00 | [50.0, 50.0] |
|
|
25
|
+
| 2 | 247 | 88.05 | 19.77 | [85.6, 90.5] |
|
|
26
|
+
| 3 | 2 | 96.25 | 1.77 | [93.8, 98.7] |
|
|
27
|
+
|
|
28
|
+
### One-Way ANOVA
|
|
29
|
+
|
|
30
|
+
| Source | SS | df | MS | F | p | η² |
|
|
31
|
+
|--------|-----|-----|-----|-----|-----|-----|
|
|
32
|
+
| Between | 3738.46 | 3 | 1246.15 | 4.212 | 0.050 | 0.017 |
|
|
33
|
+
| Within | 215696.89 | 729 | 295.88 | | | |
|
|
34
|
+
| Total | 219435.35 | 732 | | | | |
|
|
35
|
+
|
|
36
|
+
## Interpretation
|
|
37
|
+
|
|
38
|
+
The effect of dialogue rounds on suggestion quality was not statistically significant (F(3, 729) = 4.21, p = 0.050, η² = 0.017).
|
|
39
|
+
|
|
40
|
+
Moving from single-agent (0 rounds) to multi-agent with 2 dialogue rounds shows a 3.9% decrease in mean score (91.58 → 88.05).
|
|
41
|
+
|
|
42
|
+
## Limitations
|
|
43
|
+
|
|
44
|
+
- Confounded with profile differences (model selection, prompts)
|
|
45
|
+
- Unbalanced sample sizes across conditions
|
|
46
|
+
- No randomized controlled comparison
|
|
47
|
+
|
|
48
|
+
## Implications for System Design
|
|
49
|
+
|
|
50
|
+
Based on these results:
|
|
51
|
+
- Dialogue rounds may have limited impact compared to other factors
|
|
52
|
+
- Consider whether additional API costs are justified
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# Ablation Study: Model Selection
|
|
2
|
+
|
|
3
|
+
**Generated:** 2026-01-14T10:25:59.574Z
|
|
4
|
+
|
|
5
|
+
## Research Question
|
|
6
|
+
|
|
7
|
+
Does the choice of LLM model (for the Ego agent) affect tutor suggestion quality?
|
|
8
|
+
|
|
9
|
+
## Method
|
|
10
|
+
|
|
11
|
+
Analyzed evaluation scores grouped by the Ego model used in each profile.
|
|
12
|
+
|
|
13
|
+
## Results
|
|
14
|
+
|
|
15
|
+
### Descriptive Statistics
|
|
16
|
+
|
|
17
|
+
| Model | N | Mean | SD | 95% CI |
|
|
18
|
+
|-------|---|------|-----|--------|
|
|
19
|
+
| deepseek | 442 | 93.31 | 13.10 | [92.1, 94.5] |
|
|
20
|
+
| nemotron | 299 | 86.44 | 20.35 | [84.1, 88.7] |
|
|
21
|
+
| haiku | 29 | 84.20 | 21.58 | [76.3, 92.1] |
|
|
22
|
+
| gpt-5.2 | 1 | 97.50 | 0.00 | [97.5, 97.5] |
|
|
23
|
+
| sonnet | 1 | 97.50 | 0.00 | [97.5, 97.5] |
|
|
24
|
+
|
|
25
|
+
### One-Way ANOVA
|
|
26
|
+
|
|
27
|
+
- F(4, 767) = 8.729
|
|
28
|
+
- p < .05
|
|
29
|
+
- η² = 0.044 (Small effect)
|
|
30
|
+
|
|
31
|
+
### Model Ranking
|
|
32
|
+
|
|
33
|
+
1. **gpt-5.2**: M = 97.50 (n=1)
|
|
34
|
+
2. **sonnet**: M = 97.50 (n=1)
|
|
35
|
+
3. **deepseek**: M = 93.31 (n=442)
|
|
36
|
+
4. **nemotron**: M = 86.44 (n=299)
|
|
37
|
+
5. **haiku**: M = 84.20 (n=29)
|
|
38
|
+
|
|
39
|
+
## Interpretation
|
|
40
|
+
|
|
41
|
+
Model selection has a statistically significant effect on suggestion quality (F(4, 767) = 8.73, p < .05, η² = 0.044).
|
|
42
|
+
|
|
43
|
+
## Limitations
|
|
44
|
+
|
|
45
|
+
- Confounded with profile differences (prompts, dialogue settings)
|
|
46
|
+
- Unbalanced sample sizes across models
|
|
47
|
+
- No direct A/B comparison with identical prompts
|
|
48
|
+
|
|
49
|
+
## Implications
|
|
50
|
+
|
|
51
|
+
- gpt-5.2 shows the highest mean score (tied with sonnet at 97.50), but each is based on only a single observation (n=1)
|
|
52
|
+
- deepseek offers good quality at minimal cost
|
|
53
|
+
- Consider running controlled experiments varying only the model
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# Advanced Evaluation Analysis
|
|
2
|
+
|
|
3
|
+
**Generated:** 2026-01-14T11:07:14.890Z
|
|
4
|
+
|
|
5
|
+
## Extended Recognition Scenarios
|
|
6
|
+
|
|
7
|
+
These scenarios test recognition quality across multiple conversation turns, where learner responses are contingent on tutor suggestions.
|
|
8
|
+
|
|
9
|
+
### Results Summary
|
|
10
|
+
|
|
11
|
+
| Scenario | Turns | Baseline | Recognition | Diff | Cohen's d | Sig |
|
|
12
|
+
|----------|-------|----------|-------------|------|-----------|-----|
|
|
13
|
+
| Sustained Dialogue | 8 | 46.3 | 61.0 | +14.7 | 3.60 | * |
|
|
14
|
+
| Breakdown Recovery | 6 | 57.5 | 71.3 | +13.8 | 2.23 | * |
|
|
15
|
+
| Productive Struggle | 5 | 46.5 | 73.2 | +26.7 | 3.32 | * |
|
|
16
|
+
| Mutual Transformation | 5 | 45.1 | 64.3 | +19.1 | 2.89 | * |
|
|
17
|
+
|
|
18
|
+
**Aggregate Statistics:**
|
|
19
|
+
- Average improvement: +18.6 points
|
|
20
|
+
- Average effect size: d = 3.01
|
|
21
|
+
- Significant effects: 4/4
|
|
22
|
+
|
|
23
|
+
## Contingent Learner Analysis
|
|
24
|
+
|
|
25
|
+
Multi-turn scenarios simulate realistic interactions where learner behavior depends on tutor suggestions. Recognition-enhanced tutoring maintains its quality advantage even as:
|
|
26
|
+
- Learners follow or reject suggestions
|
|
27
|
+
- Conversations extend over multiple turns
|
|
28
|
+
- Learners express frustration or confusion
|
|
29
|
+
- Repair cycles become necessary
|
|
30
|
+
|
|
31
|
+
## Bilateral Measurement Framework
|
|
32
|
+
|
|
33
|
+
### Tutor Evaluation Dimensions
|
|
34
|
+
1. **Mutual Recognition**: Acknowledges learner as autonomous subject
|
|
35
|
+
2. **Dialectical Responsiveness**: Shaped by learner's specific input
|
|
36
|
+
3. **Transformative Potential**: Enables genuine growth
|
|
37
|
+
|
|
38
|
+
### Learner Evaluation Dimensions (Simulated)
|
|
39
|
+
1. **Authenticity**: Genuine perspective contribution
|
|
40
|
+
2. **Responsiveness**: Engagement with tutor suggestions
|
|
41
|
+
3. **Development**: Growth across turns
|
|
42
|
+
|
|
43
|
+
### Bilateral Metric
|
|
44
|
+
> "Does engagement produce genuine mutual development?"
|
|
45
|
+
|
|
46
|
+
## Integration with Statistical Findings
|
|
47
|
+
|
|
48
|
+
The extended scenario results align with our factorial ANOVA findings:
|
|
49
|
+
|
|
50
|
+
1. **Recognition Effect Persists**: The large recognition effect (η² = .422) is maintained across extended interactions, suggesting recognition-oriented prompting produces robust improvements.
|
|
51
|
+
|
|
52
|
+
2. **Architecture Effect Context-Dependent**: The marginal architecture effect (η² = .034) may become more important in complex multi-turn scenarios requiring repair cycles.
|
|
53
|
+
|
|
54
|
+
3. **Additive Benefits Confirmed**: The absence of interaction effects suggests that recognition benefits transfer across different scenario types and lengths.
|
|
55
|
+
|
|
56
|
+
## Implications
|
|
57
|
+
|
|
58
|
+
1. **Scalability**: Recognition-oriented design scales to longer interactions
|
|
59
|
+
2. **Robustness**: Benefits persist even with contingent learner responses
|
|
60
|
+
3. **Cost-Effectiveness**: Free-tier models achieve recognition quality with proper prompting
|
|
@@ -0,0 +1,257 @@
|
|
|
1
|
+
# Two-Way ANOVA Results
|
|
2
|
+
|
|
3
|
+
**Generated:** 2026-01-14T10:22:17.071Z
|
|
4
|
+
**Data Source:** evaluations.db (factorial evaluation runs)
|
|
5
|
+
|
|
6
|
+
```
|
|
7
|
+
======================================================================
|
|
8
|
+
TWO-WAY ANOVA RESULTS: 2×2 Factorial Design
|
|
9
|
+
======================================================================
|
|
10
|
+
|
|
11
|
+
EXPERIMENTAL DESIGN
|
|
12
|
+
----------------------------------------------------------------------
|
|
13
|
+
Factor A: Architecture (Single-Agent vs Multi-Agent)
|
|
14
|
+
Factor B: Recognition (Standard vs Recognition-Enhanced Prompts)
|
|
15
|
+
Total N: 76
|
|
16
|
+
Grand Mean: 60.87
|
|
17
|
+
|
|
18
|
+
CELL STATISTICS
|
|
19
|
+
----------------------------------------------------------------------
|
|
20
|
+
Cell N Mean SD 95% CI
|
|
21
|
+
----------------------------------------------------------------------
|
|
22
|
+
Single + Standard 8 39.99 9.12 [33.5, 46.4]
|
|
23
|
+
Single + Recognition 6 70.36 26.15 [49.0, 91.7]
|
|
24
|
+
Multi + Standard 31 50.16 15.00 [44.8, 55.5]
|
|
25
|
+
Multi + Recognition 31 75.14 14.68 [69.9, 80.4]
|
|
26
|
+
|
|
27
|
+
MARGINAL MEANS
|
|
28
|
+
----------------------------------------------------------------------
|
|
29
|
+
Architecture: Single = 53.00, Multi = 62.65
|
|
30
|
+
Recognition: Standard = 48.07, Recognition = 74.37
|
|
31
|
+
|
|
32
|
+
ANOVA TABLE
|
|
33
|
+
----------------------------------------------------------------------
|
|
34
|
+
Source SS df MS F p Sig
|
|
35
|
+
----------------------------------------------------------------------
|
|
36
|
+
Architecture (A) 1063.08 1 1063.08 4.445 0.050 *
|
|
37
|
+
Recognition (B) 13123.82 1 13123.82 54.877 0.001 ***
|
|
38
|
+
A × B 124.13 1 124.13 0.519 0.250
|
|
39
|
+
Error 17218.77 72 239.15
|
|
40
|
+
----------------------------------------------------------------------
|
|
41
|
+
Total 31115.95 75
|
|
42
|
+
|
|
43
|
+
Significance: *** p < .05, * p < .10
|
|
44
|
+
|
|
45
|
+
EFFECT SIZES
|
|
46
|
+
----------------------------------------------------------------------
|
|
47
|
+
Source η² Partial η² Cohen's d Interpretation
|
|
48
|
+
----------------------------------------------------------------------
|
|
49
|
+
Architecture (A) 0.034 0.058 0.62 Small
|
|
50
|
+
Recognition (B) 0.422 0.433 1.70 Large
|
|
51
|
+
A × B 0.004 0.007 N/A Negligible
|
|
52
|
+
|
|
53
|
+
MAIN EFFECTS (Raw Differences)
|
|
54
|
+
----------------------------------------------------------------------
|
|
55
|
+
Architecture Effect: Multi - Single = +9.65 points
|
|
56
|
+
Recognition Effect: Recognition - Standard = +26.29 points
|
|
57
|
+
|
|
58
|
+
ASSUMPTION CHECKS
|
|
59
|
+
----------------------------------------------------------------------
|
|
60
|
+
Normality (Shapiro-Wilk approx): PASSED ✓
|
|
61
|
+
Homogeneity of Variance (Levene): F = 3.31, p = 0.010 - VIOLATED ✗
|
|
62
|
+
|
|
63
|
+
INTERPRETATION
|
|
64
|
+
----------------------------------------------------------------------
|
|
65
|
+
✓ Recognition prompts have a SIGNIFICANT effect (F = 54.88, p < .05)
|
|
66
|
+
Effect size: large (η² = 0.422)
|
|
67
|
+
✗ Architecture effect is NOT significant (F = 4.45, p = 0.050)
|
|
68
|
+
✗ No significant interaction (F = 0.52, p = 0.250)
|
|
69
|
+
|
|
70
|
+
======================================================================
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
## JSON Results
|
|
74
|
+
|
|
75
|
+
```json
|
|
76
|
+
{
|
|
77
|
+
"grandMean": 60.87384654818866,
|
|
78
|
+
"N": 76,
|
|
79
|
+
"cellStats": {
|
|
80
|
+
"a0b0": {
|
|
81
|
+
"n": 8,
|
|
82
|
+
"mean": 39.98579545454547,
|
|
83
|
+
"std": 9.11985690330198,
|
|
84
|
+
"values": [
|
|
85
|
+
38.44696969696971,
|
|
86
|
+
38.63636363636365,
|
|
87
|
+
43.18181818181819,
|
|
88
|
+
34.848484848484865,
|
|
89
|
+
52.272727272727295,
|
|
90
|
+
37.500000000000014,
|
|
91
|
+
23.86363636363637,
|
|
92
|
+
51.136363636363654
|
|
93
|
+
]
|
|
94
|
+
},
|
|
95
|
+
"a0b1": {
|
|
96
|
+
"n": 6,
|
|
97
|
+
"mean": 70.3598484848485,
|
|
98
|
+
"std": 26.148369552557515,
|
|
99
|
+
"values": [
|
|
100
|
+
50.3787878787879,
|
|
101
|
+
76.13636363636364,
|
|
102
|
+
100.00000000000003,
|
|
103
|
+
58.14393939393941,
|
|
104
|
+
37.50000000000002,
|
|
105
|
+
100.00000000000003
|
|
106
|
+
]
|
|
107
|
+
},
|
|
108
|
+
"a1b0": {
|
|
109
|
+
"n": 31,
|
|
110
|
+
"mean": 50.16175580691711,
|
|
111
|
+
"std": 14.99861019348702,
|
|
112
|
+
"values": [
|
|
113
|
+
42.99242424242427,
|
|
114
|
+
32.95454545454547,
|
|
115
|
+
48.86363636363639,
|
|
116
|
+
49.05303030303031,
|
|
117
|
+
45.45454545454547,
|
|
118
|
+
59.09090909090911,
|
|
119
|
+
63.47402597402599,
|
|
120
|
+
50.974025974026,
|
|
121
|
+
58.11688311688312,
|
|
122
|
+
44.94949494949497,
|
|
123
|
+
52.02020202020204,
|
|
124
|
+
41.919191919191945,
|
|
125
|
+
38.63636363636365,
|
|
126
|
+
57.00757575757578,
|
|
127
|
+
48.4848484848485,
|
|
128
|
+
43.37121212121213,
|
|
129
|
+
43.181818181818194,
|
|
130
|
+
50.00000000000002,
|
|
131
|
+
45.90909090909093,
|
|
132
|
+
47.72727272727274,
|
|
133
|
+
53.863636363636395,
|
|
134
|
+
48.63636363636365,
|
|
135
|
+
42.80303030303031,
|
|
136
|
+
51.59090909090911,
|
|
137
|
+
43.93939393939396,
|
|
138
|
+
31.818181818181834,
|
|
139
|
+
18.181818181818194,
|
|
140
|
+
45.45454545454547,
|
|
141
|
+
87.5,
|
|
142
|
+
100,
|
|
143
|
+
67.04545454545458
|
|
144
|
+
]
|
|
145
|
+
},
|
|
146
|
+
"a1b1": {
|
|
147
|
+
"n": 31,
|
|
148
|
+
"mean": 75.14040171298237,
|
|
149
|
+
"std": 14.684605144018311,
|
|
150
|
+
"values": [
|
|
151
|
+
75.18939393939395,
|
|
152
|
+
67.04545454545458,
|
|
153
|
+
100.00000000000003,
|
|
154
|
+
65.15151515151518,
|
|
155
|
+
56.818181818181834,
|
|
156
|
+
95.45454545454548,
|
|
157
|
+
78.08441558441562,
|
|
158
|
+
66.23376623376626,
|
|
159
|
+
69.64285714285717,
|
|
160
|
+
58.0808080808081,
|
|
161
|
+
62.12121212121214,
|
|
162
|
+
62.878787878787904,
|
|
163
|
+
60.03787878787881,
|
|
164
|
+
72.9166666666667,
|
|
165
|
+
82.00757575757578,
|
|
166
|
+
66.47727272727275,
|
|
167
|
+
63.636363636363654,
|
|
168
|
+
52.840909090909115,
|
|
169
|
+
67.72727272727275,
|
|
170
|
+
75.68181818181822,
|
|
171
|
+
60.68181818181819,
|
|
172
|
+
71.5909090909091,
|
|
173
|
+
84.09090909090911,
|
|
174
|
+
68.1818181818182,
|
|
175
|
+
74.05303030303033,
|
|
176
|
+
79.54545454545456,
|
|
177
|
+
93.18181818181822,
|
|
178
|
+
100.00000000000003,
|
|
179
|
+
100,
|
|
180
|
+
100,
|
|
181
|
+
100
|
|
182
|
+
]
|
|
183
|
+
}
|
|
184
|
+
},
|
|
185
|
+
"marginalMeans": {
|
|
186
|
+
"architecture": {
|
|
187
|
+
"single": 53.003246753246756,
|
|
188
|
+
"multi": 62.65107875994973
|
|
189
|
+
},
|
|
190
|
+
"recognition": {
|
|
191
|
+
"standard": 48.07437932437934,
|
|
192
|
+
"recognition": 74.36517686517688
|
|
193
|
+
}
|
|
194
|
+
},
|
|
195
|
+
"anovaTable": {
|
|
196
|
+
"architecture": {
|
|
197
|
+
"SS": 1063.0791445902653,
|
|
198
|
+
"df": 1,
|
|
199
|
+
"MS": 1063.0791445902653,
|
|
200
|
+
"F": 4.445248575427072,
|
|
201
|
+
"p": 0.05,
|
|
202
|
+
"sig": false
|
|
203
|
+
},
|
|
204
|
+
"recognition": {
|
|
205
|
+
"SS": 13123.81985503855,
|
|
206
|
+
"df": 1,
|
|
207
|
+
"MS": 13123.81985503855,
|
|
208
|
+
"F": 54.87704449065894,
|
|
209
|
+
"p": 0.001,
|
|
210
|
+
"sig": true
|
|
211
|
+
},
|
|
212
|
+
"interaction": {
|
|
213
|
+
"SS": 124.12853011664384,
|
|
214
|
+
"df": 1,
|
|
215
|
+
"MS": 124.12853011664384,
|
|
216
|
+
"F": 0.5190414791586724,
|
|
217
|
+
"p": 0.25,
|
|
218
|
+
"sig": false
|
|
219
|
+
},
|
|
220
|
+
"error": {
|
|
221
|
+
"SS": 17218.76675999957,
|
|
222
|
+
"df": 72,
|
|
223
|
+
"MS": 239.14953833332734
|
|
224
|
+
},
|
|
225
|
+
"total": {
|
|
226
|
+
"SS": 31115.94559827812,
|
|
227
|
+
"df": 75
|
|
228
|
+
}
|
|
229
|
+
},
|
|
230
|
+
"effectSizes": {
|
|
231
|
+
"architecture": {
|
|
232
|
+
"etaSq": 0.03416509201793609,
|
|
233
|
+
"partialEtaSq": 0.05814944235600241,
|
|
234
|
+
"cohenD": 0.6238712311493667
|
|
235
|
+
},
|
|
236
|
+
"recognition": {
|
|
237
|
+
"etaSq": 0.42177152590743666,
|
|
238
|
+
"partialEtaSq": 0.4325214597404903,
|
|
239
|
+
"cohenD": 1.7000785480386178
|
|
240
|
+
},
|
|
241
|
+
"interaction": {
|
|
242
|
+
"etaSq": 0.003989225708233428,
|
|
243
|
+
"partialEtaSq": 0.007157313011477686
|
|
244
|
+
}
|
|
245
|
+
},
|
|
246
|
+
"mainEffects": {
|
|
247
|
+
"architecture": 9.647832006702977,
|
|
248
|
+
"recognition": 26.290797540797534
|
|
249
|
+
},
|
|
250
|
+
"assumptions": {
|
|
251
|
+
"normality": true,
|
|
252
|
+
"homogeneity": false,
|
|
253
|
+
"leveneF": 3.3126010309809213,
|
|
254
|
+
"leveneP": 0.01
|
|
255
|
+
}
|
|
256
|
+
}
|
|
257
|
+
```
|