@machinespirits/eval 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. package/components/MobileEvalDashboard.tsx +267 -0
  2. package/components/comparison/DeltaAnalysisTable.tsx +137 -0
  3. package/components/comparison/ProfileComparisonCard.tsx +176 -0
  4. package/components/comparison/RecognitionABMode.tsx +385 -0
  5. package/components/comparison/RecognitionMetricsPanel.tsx +135 -0
  6. package/components/comparison/WinnerIndicator.tsx +64 -0
  7. package/components/comparison/index.ts +5 -0
  8. package/components/mobile/BottomSheet.tsx +233 -0
  9. package/components/mobile/DimensionBreakdown.tsx +210 -0
  10. package/components/mobile/DocsView.tsx +363 -0
  11. package/components/mobile/LogsView.tsx +481 -0
  12. package/components/mobile/PsychodynamicQuadrant.tsx +261 -0
  13. package/components/mobile/QuickTestView.tsx +1098 -0
  14. package/components/mobile/RecognitionTypeChart.tsx +124 -0
  15. package/components/mobile/RecognitionView.tsx +809 -0
  16. package/components/mobile/RunDetailView.tsx +261 -0
  17. package/components/mobile/RunHistoryView.tsx +367 -0
  18. package/components/mobile/ScoreRadial.tsx +211 -0
  19. package/components/mobile/StreamingLogPanel.tsx +230 -0
  20. package/components/mobile/SynthesisStrategyChart.tsx +140 -0
  21. package/config/interaction-eval-scenarios.yaml +832 -0
  22. package/config/learner-agents.yaml +248 -0
  23. package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +52 -0
  24. package/docs/research/ABLATION-MODEL-SELECTION.md +53 -0
  25. package/docs/research/ADVANCED-EVAL-ANALYSIS.md +60 -0
  26. package/docs/research/ANOVA-RESULTS-2026-01-14.md +257 -0
  27. package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +586 -0
  28. package/docs/research/COST-ANALYSIS.md +56 -0
  29. package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +340 -0
  30. package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +291 -0
  31. package/docs/research/EVAL-SYSTEM-ANALYSIS.md +306 -0
  32. package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +301 -0
  33. package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +1988 -0
  34. package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +282 -0
  35. package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +147 -0
  36. package/docs/research/PAPER-EXTENSION-DYADIC.md +204 -0
  37. package/docs/research/PAPER-UNIFIED.md +659 -0
  38. package/docs/research/PAPER-UNIFIED.pdf +0 -0
  39. package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +356 -0
  40. package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +419 -0
  41. package/docs/research/apa.csl +2133 -0
  42. package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +1637 -0
  43. package/docs/research/archive/paper-multiagent-tutor.tex +978 -0
  44. package/docs/research/paper-draft/full-paper.md +136 -0
  45. package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
  46. package/docs/research/paper-draft/references.bib +515 -0
  47. package/docs/research/transcript-baseline.md +139 -0
  48. package/docs/research/transcript-recognition-multiagent.md +187 -0
  49. package/hooks/useEvalData.ts +625 -0
  50. package/index.js +27 -0
  51. package/package.json +73 -0
  52. package/routes/evalRoutes.js +3002 -0
  53. package/scripts/advanced-eval-analysis.js +351 -0
  54. package/scripts/analyze-eval-costs.js +378 -0
  55. package/scripts/analyze-eval-results.js +513 -0
  56. package/scripts/analyze-interaction-evals.js +368 -0
  57. package/server-init.js +45 -0
  58. package/server.js +162 -0
  59. package/services/benchmarkService.js +1892 -0
  60. package/services/evaluationRunner.js +739 -0
  61. package/services/evaluationStore.js +1121 -0
  62. package/services/learnerConfigLoader.js +385 -0
  63. package/services/learnerTutorInteractionEngine.js +857 -0
  64. package/services/memory/learnerMemoryService.js +1227 -0
  65. package/services/memory/learnerWritingPad.js +577 -0
  66. package/services/memory/tutorWritingPad.js +674 -0
  67. package/services/promptRecommendationService.js +493 -0
  68. package/services/rubricEvaluator.js +826 -0
@@ -0,0 +1,248 @@
1
+ # Learner Agent Architecture Configuration
2
+ # Defines how the simulated learner generates responses during evaluation
3
+ #
4
+ # ============================================================================
5
+ # LEARNER ARCHITECTURES
6
+ # ============================================================================
7
+ #
8
+ # The learner architecture determines whether the simulated learner has
9
+ # internal deliberation before generating responses. This enables testing
10
+ # whether multi-agent learner simulation improves evaluation validity.
11
+ #
12
+ # Architectures:
13
+ # 1. unified: Single learner agent (no internal dialogue)
14
+ # 2. psychodynamic: Freudian desire/intellect/aspiration deliberation
15
+ # 3. dialectical: Hegelian thesis/antithesis/synthesis process
16
+ #
17
+ # ============================================================================
18
+
19
+ # Active architecture (can be overridden by tutor profile)
20
+ active_architecture: unified
21
+
22
+ # ============================================================================
23
+ # ARCHITECTURE DEFINITIONS
24
+ # ============================================================================
25
+
26
+ architectures:
27
+ # Unified: Single agent with no internal deliberation
28
+ # Simple, fast, good for baseline comparisons
29
+ unified:
30
+ name: "Unified Learner"
31
+ description: "Single learner agent without internal deliberation"
32
+ deliberation:
33
+ enabled: false
34
+ max_rounds: 0
35
+
36
+ agent:
37
+ provider: openrouter
38
+ model: nemotron
39
+ prompt_file: learner-unified.md
40
+ hyperparameters:
41
+ temperature: 0.7
42
+ max_tokens: 500
43
+
44
+ # Psychodynamic: Freudian-inspired desire/intellect/aspiration
45
+ # Internal deliberation between:
46
+ # - Desire (Id-like): What the learner wants emotionally
47
+ # - Intellect (Ego-like): Rational analysis of the situation
48
+ # - Aspiration (Superego-like): Idealized learning goals
49
+ psychodynamic:
50
+ name: "Psychodynamic Learner"
51
+ description: "Freudian-inspired internal deliberation between desire, intellect, and aspiration"
52
+ deliberation:
53
+ enabled: true
54
+ max_rounds: 2
55
+ convergence_threshold: 0.7
56
+
57
+ agents:
58
+ desire:
59
+ role: "id"
60
+ description: "Emotional/affective responses - what the learner WANTS"
61
+ provider: openrouter
62
+ model: nemotron
63
+ prompt_file: learner-desire.md
64
+ hyperparameters:
65
+ temperature: 0.8 # Higher temp for more emotional/varied responses
66
+ max_tokens: 400
67
+
68
+ intellect:
69
+ role: "ego"
70
+ description: "Rational analysis - what the learner THINKS"
71
+ provider: openrouter
72
+ model: nemotron
73
+ prompt_file: learner-intellect.md
74
+ hyperparameters:
75
+ temperature: 0.5 # Lower temp for more analytical responses
76
+ max_tokens: 400
77
+
78
+ aspiration:
79
+ role: "superego"
80
+ description: "Idealized goals - what the learner SHOULD want"
81
+ provider: openrouter
82
+ model: nemotron
83
+ prompt_file: learner-aspiration.md
84
+ hyperparameters:
85
+ temperature: 0.6
86
+ max_tokens: 400
87
+
88
+ synthesizer:
89
+ description: "Integrates the three voices into a coherent response"
90
+ provider: openrouter
91
+ model: nemotron
92
+ prompt_file: learner-synthesizer.md
93
+ hyperparameters:
94
+ temperature: 0.6
95
+ max_tokens: 500
96
+
97
+ deliberation_process: |
98
+ The psychodynamic learner simulates internal conflict:
99
+
100
+ 1. DESIRE (Id): "I want to skip ahead / I'm bored / This is frustrating"
101
+ - Immediate emotional reactions
102
+ - Avoidance tendencies
103
+ - Curiosity and excitement
104
+
105
+ 2. INTELLECT (Ego): "This doesn't make sense yet / I need more examples"
106
+ - Rational assessment of understanding
107
+ - Strategic thinking about learning path
108
+ - Reality testing
109
+
110
+ 3. ASPIRATION (Superego): "I should master this / I want to be an expert"
111
+ - Long-term learning goals
112
+ - Internalized expectations
113
+ - Self-improvement drives
114
+
115
+ 4. SYNTHESIS: Integration into coherent learner response
116
+ - Balances immediate desires with long-term goals
117
+ - Produces realistic learner behavior
118
+
119
+ # Dialectical: Hegelian thesis/antithesis/synthesis
120
+ # Internal deliberation where:
121
+ # - Thesis: Initial position/understanding
122
+ # - Antithesis: Challenge/complication to that position
123
+ # - Synthesis: New integrated understanding
124
+ dialectical:
125
+ name: "Dialectical Learner"
126
+ description: "Hegelian-inspired internal dialectic between thesis, antithesis, and synthesis"
127
+ deliberation:
128
+ enabled: true
129
+ max_rounds: 2
130
+ convergence_threshold: 0.7
131
+
132
+ agents:
133
+ thesis:
134
+ role: "thesis"
135
+ description: "Initial understanding or position"
136
+ provider: openrouter
137
+ model: nemotron
138
+ prompt_file: learner-thesis.md
139
+ hyperparameters:
140
+ temperature: 0.6
141
+ max_tokens: 400
142
+
143
+ antithesis:
144
+ role: "antithesis"
145
+ description: "Challenge or complication to the thesis"
146
+ provider: openrouter
147
+ model: nemotron
148
+ prompt_file: learner-antithesis.md
149
+ hyperparameters:
150
+ temperature: 0.7
151
+ max_tokens: 400
152
+
153
+ synthesis:
154
+ role: "synthesis"
155
+ description: "Integration that preserves and overcomes the tension"
156
+ provider: openrouter
157
+ model: nemotron
158
+ prompt_file: learner-synthesis.md
159
+ hyperparameters:
160
+ temperature: 0.6
161
+ max_tokens: 500
162
+
163
+ deliberation_process: |
164
+ The dialectical learner simulates Hegelian movement:
165
+
166
+ 1. THESIS: "I understand X as..."
167
+ - Initial grasp of the concept
168
+ - Current mental model
169
+ - Working hypothesis
170
+
171
+ 2. ANTITHESIS: "But wait, what about Y? That complicates things..."
172
+ - Internal contradiction discovered
173
+ - New information that doesn't fit
174
+ - Productive confusion
175
+
176
+ 3. SYNTHESIS: "So actually, X and Y together mean..."
177
+ - New understanding that integrates both
178
+ - Aufhebung: preserves while overcoming
179
+ - Readiness for next dialectical cycle
180
+
181
+ # ============================================================================
182
+ # PERSONA MODIFIERS
183
+ # ============================================================================
184
+ # These modifiers adjust the base architecture based on learner persona
185
+
186
+ persona_modifiers:
187
+ confused_novice:
188
+ desire_weight: 0.4 # Strong emotional reactions to confusion
189
+ intellect_weight: 0.3 # Struggles with analysis
190
+ aspiration_weight: 0.3 # Wants to succeed but uncertain
191
+
192
+ eager_explorer:
193
+ desire_weight: 0.5 # Curiosity-driven
194
+ intellect_weight: 0.3 # Quick but sometimes shallow analysis
195
+ aspiration_weight: 0.2 # Less concerned with "should"
196
+
197
+ focused_achiever:
198
+ desire_weight: 0.2 # Controlled emotional reactions
199
+ intellect_weight: 0.4 # Strong analytical focus
200
+ aspiration_weight: 0.4 # Clear goals
201
+
202
+ struggling_anxious:
203
+ desire_weight: 0.5 # Strong anxiety-driven responses
204
+ intellect_weight: 0.2 # Anxiety impairs analysis
205
+ aspiration_weight: 0.3 # High expectations create pressure
206
+
207
+ adversarial_tester:
208
+ desire_weight: 0.3 # Enjoys challenging
209
+ intellect_weight: 0.4 # Analytical about finding weaknesses
210
+ aspiration_weight: 0.3 # Wants to be thorough
211
+
212
+ # ============================================================================
213
+ # EVALUATION SETTINGS
214
+ # ============================================================================
215
+
216
+ evaluation:
217
+ # Track internal deliberation traces for analysis
218
+ log_deliberation: true
219
+ log_path: logs/learner-deliberation
220
+
221
+ # Metrics specific to multi-agent learner
222
+ metrics:
223
+ - deliberation_rounds
224
+ - internal_coherence
225
+ - desire_intellect_tension
226
+ - aspiration_alignment
227
+ - response_authenticity
228
+
229
+ # ============================================================================
230
+ # ABLATION STUDY SUPPORT
231
+ # ============================================================================
232
+
233
+ ablation:
234
+ # Mapping of ablation profiles to learner architectures
235
+ profile_architectures:
236
+ ablation_baseline_unified: unified
237
+ ablation_baseline_multilearner: psychodynamic
238
+ ablation_multiagent_unified: unified
239
+ ablation_multiagent_multilearner: psychodynamic
240
+ ablation_recognition_unified: unified
241
+ ablation_recognition_multilearner: psychodynamic
242
+ ablation_recognition_multiagent_unified: unified
243
+ ablation_recognition_multiagent_multilearner: psychodynamic
244
+
245
+ # Which architectures to compare in ablation studies
246
+ architectures_to_compare:
247
+ - unified
248
+ - psychodynamic
@@ -0,0 +1,52 @@
1
+ # Ablation Study: Dialogue Rounds
2
+
3
+ **Generated:** 2026-01-14T10:23:56.877Z
4
+
5
+ ## Research Question
6
+
7
+ Does increasing the number of Ego-Superego dialogue rounds improve tutor suggestion quality?
8
+
9
+ ## Method
10
+
11
+ Compared evaluation scores across profiles with different `max_rounds` settings:
12
+ - **0 rounds**: Single-agent (no Superego review)
13
+ - **1 round**: Single critique-revise cycle
14
+ - **2 rounds**: Two critique-revise cycles (default)
15
+ - **3 rounds**: Three critique-revise cycles
16
+
17
+ ## Results
18
+
19
+ ### Descriptive Statistics
20
+
21
+ | Rounds | N | Mean | SD | 95% CI |
22
+ |--------|---|------|-----|--------|
23
+ | 0 | 483 | 91.58 | 15.75 | [90.2, 93.0] |
24
+ | 1 | 1 | 50.00 | 0.00 | [50.0, 50.0] |
25
+ | 2 | 247 | 88.05 | 19.77 | [85.6, 90.5] |
26
+ | 3 | 2 | 96.25 | 1.77 | [93.8, 98.7] |
27
+
28
+ ### One-Way ANOVA
29
+
30
+ | Source | SS | df | MS | F | p | η² |
31
+ |--------|-----|-----|-----|-----|-----|-----|
32
+ | Between | 3738.46 | 3 | 1246.15 | 4.212 | 0.050 | 0.017 |
33
+ | Within | 215696.89 | 729 | 295.88 | | | |
34
+ | Total | 219435.35 | 732 | | | | |
35
+
36
+ ## Interpretation
37
+
38
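+ The effect of dialogue rounds on suggestion quality was reported as not statistically significant (F(3, 729) = 4.21, p = 0.050, η² = 0.017). Note that the reported p = 0.050 sits exactly on the conventional threshold and appears to be a coarse approximation: an F of 4.21 on (3, 729) degrees of freedom corresponds to p ≈ .006, so this conclusion should be re-verified with an exact p-value.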
39
+
40
+ Moving from single-agent (0 rounds) to multi-agent with 2 dialogue rounds shows a 3.9% decrease in mean score (91.58 → 88.05), i.e., no improvement.
41
+
42
+ ## Limitations
43
+
44
+ - Confounded with profile differences (model selection, prompts)
45
+ - Unbalanced sample sizes across conditions
46
+ - No randomized controlled comparison
47
+
48
+ ## Implications for System Design
49
+
50
+ Based on these results:
51
+ - Dialogue rounds may have limited impact compared to other factors
52
+ - Consider whether additional API costs are justified
@@ -0,0 +1,53 @@
1
+ # Ablation Study: Model Selection
2
+
3
+ **Generated:** 2026-01-14T10:25:59.574Z
4
+
5
+ ## Research Question
6
+
7
+ Does the choice of LLM model (for the Ego agent) affect tutor suggestion quality?
8
+
9
+ ## Method
10
+
11
+ Analyzed evaluation scores grouped by the Ego model used in each profile.
12
+
13
+ ## Results
14
+
15
+ ### Descriptive Statistics
16
+
17
+ | Model | N | Mean | SD | 95% CI |
18
+ |-------|---|------|-----|--------|
19
+ | deepseek | 442 | 93.31 | 13.10 | [92.1, 94.5] |
20
+ | nemotron | 299 | 86.44 | 20.35 | [84.1, 88.7] |
21
+ | haiku | 29 | 84.20 | 21.58 | [76.3, 92.1] |
22
+ | gpt-5.2 | 1 | 97.50 | 0.00 | [97.5, 97.5] |
23
+ | sonnet | 1 | 97.50 | 0.00 | [97.5, 97.5] |
24
+
25
+ ### One-Way ANOVA
26
+
27
+ - F(4, 767) = 8.729
28
+ - p < .05
29
+ - η² = 0.044 (Small effect)
30
+
31
+ ### Model Ranking
32
+
33
+ 1. **gpt-5.2**: M = 97.50 (n=1)
34
+ 2. **sonnet**: M = 97.50 (n=1)
35
+ 3. **deepseek**: M = 93.31 (n=442)
36
+ 4. **nemotron**: M = 86.44 (n=299)
37
+ 5. **haiku**: M = 84.20 (n=29)
38
+
39
+ ## Interpretation
40
+
41
+ Model selection has a statistically significant effect on suggestion quality (F(4, 767) = 8.73, p < .05, η² = 0.044).
42
+
43
+ ## Limitations
44
+
45
+ - Confounded with profile differences (prompts, dialogue settings)
46
+ - Unbalanced sample sizes across models
47
+ - No direct A/B comparison with identical prompts
48
+
49
+ ## Implications
50
+
51
+ - gpt-5.2 and sonnet share the highest mean score (97.50), but each is based on a single observation (n=1), so the ranking is not reliable
52
+ - deepseek offers good quality at minimal cost
53
+ - Consider running controlled experiments varying only the model
@@ -0,0 +1,60 @@
1
+ # Advanced Evaluation Analysis
2
+
3
+ **Generated:** 2026-01-14T11:07:14.890Z
4
+
5
+ ## Extended Recognition Scenarios
6
+
7
+ These scenarios test recognition quality across multiple conversation turns, where learner responses are contingent on tutor suggestions.
8
+
9
+ ### Results Summary
10
+
11
+ | Scenario | Turns | Baseline | Recognition | Diff | Cohen's d | Sig |
12
+ |----------|-------|----------|-------------|------|-----------|-----|
13
+ | Sustained Dialogue | 8 | 46.3 | 61.0 | +14.7 | 3.60 | * |
14
+ | Breakdown Recovery | 6 | 57.5 | 71.3 | +13.8 | 2.23 | * |
15
+ | Productive Struggle | 5 | 46.5 | 73.2 | +26.7 | 3.32 | * |
16
+ | Mutual Transformation | 5 | 45.1 | 64.3 | +19.1 | 2.89 | * |
17
+
18
+ **Aggregate Statistics:**
19
+ - Average improvement: +18.6 points
20
+ - Average effect size: d = 3.01
21
+ - Significant effects: 4/4
22
+
23
+ ## Contingent Learner Analysis
24
+
25
+ Multi-turn scenarios simulate realistic interactions where learner behavior depends on tutor suggestions. Recognition-enhanced tutoring maintains its quality advantage even as:
26
+ - Learners follow or reject suggestions
27
+ - Conversations extend over multiple turns
28
+ - Learners express frustration or confusion
29
+ - Repair cycles become necessary
30
+
31
+ ## Bilateral Measurement Framework
32
+
33
+ ### Tutor Evaluation Dimensions
34
+ 1. **Mutual Recognition**: Acknowledges learner as autonomous subject
35
+ 2. **Dialectical Responsiveness**: Shaped by learner's specific input
36
+ 3. **Transformative Potential**: Enables genuine growth
37
+
38
+ ### Learner Evaluation Dimensions (Simulated)
39
+ 1. **Authenticity**: Genuine perspective contribution
40
+ 2. **Responsiveness**: Engagement with tutor suggestions
41
+ 3. **Development**: Growth across turns
42
+
43
+ ### Bilateral Metric
44
+ > "Does engagement produce genuine mutual development?"
45
+
46
+ ## Integration with Statistical Findings
47
+
48
+ The extended scenario results align with our factorial ANOVA findings:
49
+
50
+ 1. **Recognition Effect Persists**: The large recognition effect (η² = .422) is maintained across extended interactions, suggesting recognition-oriented prompting produces robust improvements.
51
+
52
+ 2. **Architecture Effect Context-Dependent**: The marginal architecture effect (η² = .034) may become more important in complex multi-turn scenarios requiring repair cycles.
53
+
54
+ 3. **Additive Benefits Confirmed**: The absence of interaction effects suggests that recognition benefits transfer across different scenario types and lengths.
55
+
56
+ ## Implications
57
+
58
+ 1. **Scalability**: Recognition-oriented design scales to longer interactions
59
+ 2. **Robustness**: Benefits persist even with contingent learner responses
60
+ 3. **Cost-Effectiveness**: Free-tier models achieve recognition quality with proper prompting
@@ -0,0 +1,257 @@
1
+ # Two-Way ANOVA Results
2
+
3
+ **Generated:** 2026-01-14T10:22:17.071Z
4
+ **Data Source:** evaluations.db (factorial evaluation runs)
5
+
6
+ ```
7
+ ======================================================================
8
+ TWO-WAY ANOVA RESULTS: 2×2 Factorial Design
9
+ ======================================================================
10
+
11
+ EXPERIMENTAL DESIGN
12
+ ----------------------------------------------------------------------
13
+ Factor A: Architecture (Single-Agent vs Multi-Agent)
14
+ Factor B: Recognition (Standard vs Recognition-Enhanced Prompts)
15
+ Total N: 76
16
+ Grand Mean: 60.87
17
+
18
+ CELL STATISTICS
19
+ ----------------------------------------------------------------------
20
+ Cell N Mean SD 95% CI
21
+ ----------------------------------------------------------------------
22
+ Single + Standard 8 39.99 9.12 [33.5, 46.4]
23
+ Single + Recognition 6 70.36 26.15 [49.0, 91.7]
24
+ Multi + Standard 31 50.16 15.00 [44.8, 55.5]
25
+ Multi + Recognition 31 75.14 14.68 [69.9, 80.4]
26
+
27
+ MARGINAL MEANS
28
+ ----------------------------------------------------------------------
29
+ Architecture: Single = 53.00, Multi = 62.65
30
+ Recognition: Standard = 48.07, Recognition = 74.37
31
+
32
+ ANOVA TABLE
33
+ ----------------------------------------------------------------------
34
+ Source SS df MS F p Sig
35
+ ----------------------------------------------------------------------
36
+ Architecture (A) 1063.08 1 1063.08 4.445 0.050 *
37
+ Recognition (B) 13123.82 1 13123.82 54.877 0.001 ***
38
+ A × B 124.13 1 124.13 0.519 0.250
39
+ Error 17218.77 72 239.15
40
+ ----------------------------------------------------------------------
41
+ Total 31115.95 75
42
+
43
+ Significance: *** p < .05, * p < .10
44
+
45
+ EFFECT SIZES
46
+ ----------------------------------------------------------------------
47
+ Source η² Partial η² Cohen's d Interpretation
48
+ ----------------------------------------------------------------------
49
+ Architecture (A) 0.034 0.058 0.62 Small
50
+ Recognition (B) 0.422 0.433 1.70 Large
51
+ A × B 0.004 0.007 N/A Negligible
52
+
53
+ MAIN EFFECTS (Raw Differences)
54
+ ----------------------------------------------------------------------
55
+ Architecture Effect: Multi - Single = +9.65 points
56
+ Recognition Effect: Recognition - Standard = +26.29 points
57
+
58
+ ASSUMPTION CHECKS
59
+ ----------------------------------------------------------------------
60
+ Normality (Shapiro-Wilk approx): PASSED ✓
61
+ Homogeneity of Variance (Levene): F = 3.31, p = 0.010 - VIOLATED ✗
62
+
63
+ INTERPRETATION
64
+ ----------------------------------------------------------------------
65
+ ✓ Recognition prompts have a SIGNIFICANT effect (F = 54.88, p < .05)
66
+ Effect size: large (η² = 0.422)
67
+ ✗ Architecture effect is NOT significant (F = 4.45, p = 0.050)
68
+ ✗ No significant interaction (F = 0.52, p = 0.250)
69
+
70
+ ======================================================================
71
+ ```
72
+
73
+ ## JSON Results
74
+
75
+ ```json
76
+ {
77
+ "grandMean": 60.87384654818866,
78
+ "N": 76,
79
+ "cellStats": {
80
+ "a0b0": {
81
+ "n": 8,
82
+ "mean": 39.98579545454547,
83
+ "std": 9.11985690330198,
84
+ "values": [
85
+ 38.44696969696971,
86
+ 38.63636363636365,
87
+ 43.18181818181819,
88
+ 34.848484848484865,
89
+ 52.272727272727295,
90
+ 37.500000000000014,
91
+ 23.86363636363637,
92
+ 51.136363636363654
93
+ ]
94
+ },
95
+ "a0b1": {
96
+ "n": 6,
97
+ "mean": 70.3598484848485,
98
+ "std": 26.148369552557515,
99
+ "values": [
100
+ 50.3787878787879,
101
+ 76.13636363636364,
102
+ 100.00000000000003,
103
+ 58.14393939393941,
104
+ 37.50000000000002,
105
+ 100.00000000000003
106
+ ]
107
+ },
108
+ "a1b0": {
109
+ "n": 31,
110
+ "mean": 50.16175580691711,
111
+ "std": 14.99861019348702,
112
+ "values": [
113
+ 42.99242424242427,
114
+ 32.95454545454547,
115
+ 48.86363636363639,
116
+ 49.05303030303031,
117
+ 45.45454545454547,
118
+ 59.09090909090911,
119
+ 63.47402597402599,
120
+ 50.974025974026,
121
+ 58.11688311688312,
122
+ 44.94949494949497,
123
+ 52.02020202020204,
124
+ 41.919191919191945,
125
+ 38.63636363636365,
126
+ 57.00757575757578,
127
+ 48.4848484848485,
128
+ 43.37121212121213,
129
+ 43.181818181818194,
130
+ 50.00000000000002,
131
+ 45.90909090909093,
132
+ 47.72727272727274,
133
+ 53.863636363636395,
134
+ 48.63636363636365,
135
+ 42.80303030303031,
136
+ 51.59090909090911,
137
+ 43.93939393939396,
138
+ 31.818181818181834,
139
+ 18.181818181818194,
140
+ 45.45454545454547,
141
+ 87.5,
142
+ 100,
143
+ 67.04545454545458
144
+ ]
145
+ },
146
+ "a1b1": {
147
+ "n": 31,
148
+ "mean": 75.14040171298237,
149
+ "std": 14.684605144018311,
150
+ "values": [
151
+ 75.18939393939395,
152
+ 67.04545454545458,
153
+ 100.00000000000003,
154
+ 65.15151515151518,
155
+ 56.818181818181834,
156
+ 95.45454545454548,
157
+ 78.08441558441562,
158
+ 66.23376623376626,
159
+ 69.64285714285717,
160
+ 58.0808080808081,
161
+ 62.12121212121214,
162
+ 62.878787878787904,
163
+ 60.03787878787881,
164
+ 72.9166666666667,
165
+ 82.00757575757578,
166
+ 66.47727272727275,
167
+ 63.636363636363654,
168
+ 52.840909090909115,
169
+ 67.72727272727275,
170
+ 75.68181818181822,
171
+ 60.68181818181819,
172
+ 71.5909090909091,
173
+ 84.09090909090911,
174
+ 68.1818181818182,
175
+ 74.05303030303033,
176
+ 79.54545454545456,
177
+ 93.18181818181822,
178
+ 100.00000000000003,
179
+ 100,
180
+ 100,
181
+ 100
182
+ ]
183
+ }
184
+ },
185
+ "marginalMeans": {
186
+ "architecture": {
187
+ "single": 53.003246753246756,
188
+ "multi": 62.65107875994973
189
+ },
190
+ "recognition": {
191
+ "standard": 48.07437932437934,
192
+ "recognition": 74.36517686517688
193
+ }
194
+ },
195
+ "anovaTable": {
196
+ "architecture": {
197
+ "SS": 1063.0791445902653,
198
+ "df": 1,
199
+ "MS": 1063.0791445902653,
200
+ "F": 4.445248575427072,
201
+ "p": 0.05,
202
+ "sig": false
203
+ },
204
+ "recognition": {
205
+ "SS": 13123.81985503855,
206
+ "df": 1,
207
+ "MS": 13123.81985503855,
208
+ "F": 54.87704449065894,
209
+ "p": 0.001,
210
+ "sig": true
211
+ },
212
+ "interaction": {
213
+ "SS": 124.12853011664384,
214
+ "df": 1,
215
+ "MS": 124.12853011664384,
216
+ "F": 0.5190414791586724,
217
+ "p": 0.25,
218
+ "sig": false
219
+ },
220
+ "error": {
221
+ "SS": 17218.76675999957,
222
+ "df": 72,
223
+ "MS": 239.14953833332734
224
+ },
225
+ "total": {
226
+ "SS": 31115.94559827812,
227
+ "df": 75
228
+ }
229
+ },
230
+ "effectSizes": {
231
+ "architecture": {
232
+ "etaSq": 0.03416509201793609,
233
+ "partialEtaSq": 0.05814944235600241,
234
+ "cohenD": 0.6238712311493667
235
+ },
236
+ "recognition": {
237
+ "etaSq": 0.42177152590743666,
238
+ "partialEtaSq": 0.4325214597404903,
239
+ "cohenD": 1.7000785480386178
240
+ },
241
+ "interaction": {
242
+ "etaSq": 0.003989225708233428,
243
+ "partialEtaSq": 0.007157313011477686
244
+ }
245
+ },
246
+ "mainEffects": {
247
+ "architecture": 9.647832006702977,
248
+ "recognition": 26.290797540797534
249
+ },
250
+ "assumptions": {
251
+ "normality": true,
252
+ "homogeneity": false,
253
+ "leveneF": 3.3126010309809213,
254
+ "leveneP": 0.01
255
+ }
256
+ }
257
+ ```