@machinespirits/eval 0.1.2 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +161 -0
- package/config/eval-settings.yaml +18 -0
- package/config/evaluation-rubric-learner.yaml +277 -0
- package/config/evaluation-rubric.yaml +613 -0
- package/config/interaction-eval-scenarios.yaml +93 -50
- package/config/learner-agents.yaml +124 -193
- package/config/machinespirits-eval.code-workspace +11 -0
- package/config/providers.yaml +60 -0
- package/config/suggestion-scenarios.yaml +1399 -0
- package/config/tutor-agents.yaml +716 -0
- package/docs/EVALUATION-VARIABLES.md +589 -0
- package/docs/REPLICATION-PLAN.md +577 -0
- package/index.js +15 -6
- package/package.json +16 -22
- package/routes/evalRoutes.js +88 -36
- package/scripts/analyze-judge-reliability.js +401 -0
- package/scripts/analyze-run.js +97 -0
- package/scripts/analyze-run.mjs +282 -0
- package/scripts/analyze-validation-failures.js +141 -0
- package/scripts/check-run.mjs +17 -0
- package/scripts/code-impasse-strategies.js +1132 -0
- package/scripts/compare-runs.js +44 -0
- package/scripts/compare-suggestions.js +80 -0
- package/scripts/compare-transformation.js +116 -0
- package/scripts/dig-into-run.js +158 -0
- package/scripts/eval-cli.js +2626 -0
- package/scripts/generate-paper-figures.py +452 -0
- package/scripts/qualitative-analysis-ai.js +1313 -0
- package/scripts/qualitative-analysis.js +688 -0
- package/scripts/seed-db.js +87 -0
- package/scripts/show-failed-suggestions.js +64 -0
- package/scripts/validate-content.js +192 -0
- package/server.js +3 -2
- package/services/__tests__/evalConfigLoader.test.js +338 -0
- package/services/anovaStats.js +499 -0
- package/services/contentResolver.js +407 -0
- package/services/dialogueTraceAnalyzer.js +454 -0
- package/services/evalConfigLoader.js +625 -0
- package/services/evaluationRunner.js +2171 -270
- package/services/evaluationStore.js +564 -29
- package/services/learnerConfigLoader.js +75 -5
- package/services/learnerRubricEvaluator.js +284 -0
- package/services/learnerTutorInteractionEngine.js +375 -0
- package/services/processUtils.js +18 -0
- package/services/progressLogger.js +98 -0
- package/services/promptRecommendationService.js +31 -26
- package/services/promptRewriter.js +427 -0
- package/services/rubricEvaluator.js +543 -70
- package/services/streamingReporter.js +104 -0
- package/services/turnComparisonAnalyzer.js +494 -0
- package/components/MobileEvalDashboard.tsx +0 -267
- package/components/comparison/DeltaAnalysisTable.tsx +0 -137
- package/components/comparison/ProfileComparisonCard.tsx +0 -176
- package/components/comparison/RecognitionABMode.tsx +0 -385
- package/components/comparison/RecognitionMetricsPanel.tsx +0 -135
- package/components/comparison/WinnerIndicator.tsx +0 -64
- package/components/comparison/index.ts +0 -5
- package/components/mobile/BottomSheet.tsx +0 -233
- package/components/mobile/DimensionBreakdown.tsx +0 -210
- package/components/mobile/DocsView.tsx +0 -363
- package/components/mobile/LogsView.tsx +0 -481
- package/components/mobile/PsychodynamicQuadrant.tsx +0 -261
- package/components/mobile/QuickTestView.tsx +0 -1098
- package/components/mobile/RecognitionTypeChart.tsx +0 -124
- package/components/mobile/RecognitionView.tsx +0 -809
- package/components/mobile/RunDetailView.tsx +0 -261
- package/components/mobile/RunHistoryView.tsx +0 -367
- package/components/mobile/ScoreRadial.tsx +0 -211
- package/components/mobile/StreamingLogPanel.tsx +0 -230
- package/components/mobile/SynthesisStrategyChart.tsx +0 -140
- package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +0 -52
- package/docs/research/ABLATION-MODEL-SELECTION.md +0 -53
- package/docs/research/ADVANCED-EVAL-ANALYSIS.md +0 -60
- package/docs/research/ANOVA-RESULTS-2026-01-14.md +0 -257
- package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +0 -586
- package/docs/research/COST-ANALYSIS.md +0 -56
- package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +0 -340
- package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +0 -291
- package/docs/research/EVAL-SYSTEM-ANALYSIS.md +0 -306
- package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +0 -301
- package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +0 -1988
- package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +0 -282
- package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +0 -147
- package/docs/research/PAPER-EXTENSION-DYADIC.md +0 -204
- package/docs/research/PAPER-UNIFIED.md +0 -659
- package/docs/research/PAPER-UNIFIED.pdf +0 -0
- package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +0 -356
- package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +0 -419
- package/docs/research/apa.csl +0 -2133
- package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +0 -1637
- package/docs/research/archive/paper-multiagent-tutor.tex +0 -978
- package/docs/research/paper-draft/full-paper.md +0 -136
- package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
- package/docs/research/paper-draft/references.bib +0 -515
- package/docs/research/transcript-baseline.md +0 -139
- package/docs/research/transcript-recognition-multiagent.md +0 -187
- package/hooks/useEvalData.ts +0 -625
- package/server-init.js +0 -45
- package/services/benchmarkService.js +0 -1892
- package/types.ts +0 -165
- package/utils/haptics.ts +0 -45
|
@@ -1,60 +0,0 @@
|
|
|
1
|
-
# Advanced Evaluation Analysis
|
|
2
|
-
|
|
3
|
-
**Generated:** 2026-01-14T11:07:14.890Z
|
|
4
|
-
|
|
5
|
-
## Extended Recognition Scenarios
|
|
6
|
-
|
|
7
|
-
These scenarios test recognition quality across multiple conversation turns, where learner responses are contingent on tutor suggestions.
|
|
8
|
-
|
|
9
|
-
### Results Summary
|
|
10
|
-
|
|
11
|
-
| Scenario | Turns | Baseline | Recognition | Diff | Cohen's d | Sig |
|
|
12
|
-
|----------|-------|----------|-------------|------|-----------|-----|
|
|
13
|
-
| Sustained Dialogue | 8 | 46.3 | 61.0 | +14.7 | 3.60 | * |
|
|
14
|
-
| Breakdown Recovery | 6 | 57.5 | 71.3 | +13.8 | 2.23 | * |
|
|
15
|
-
| Productive Struggle | 5 | 46.5 | 73.2 | +26.7 | 3.32 | * |
|
|
16
|
-
| Mutual Transformation | 5 | 45.1 | 64.3 | +19.1 | 2.89 | * |
|
|
17
|
-
|
|
18
|
-
**Aggregate Statistics:**
|
|
19
|
-
- Average improvement: +18.6 points
|
|
20
|
-
- Average effect size: d = 3.01
|
|
21
|
-
- Significant effects: 4/4
|
|
22
|
-
|
|
23
|
-
## Contingent Learner Analysis
|
|
24
|
-
|
|
25
|
-
Multi-turn scenarios simulate realistic interactions where learner behavior depends on tutor suggestions. Recognition-enhanced tutoring maintains its quality advantage even as:
|
|
26
|
-
- Learners follow or reject suggestions
|
|
27
|
-
- Conversations extend over multiple turns
|
|
28
|
-
- Learners express frustration or confusion
|
|
29
|
-
- Repair cycles become necessary
|
|
30
|
-
|
|
31
|
-
## Bilateral Measurement Framework
|
|
32
|
-
|
|
33
|
-
### Tutor Evaluation Dimensions
|
|
34
|
-
1. **Mutual Recognition**: Acknowledges the learner as an autonomous subject
|
|
35
|
-
2. **Dialectical Responsiveness**: Shaped by the learner's specific input
|
|
36
|
-
3. **Transformative Potential**: Enables genuine growth
|
|
37
|
-
|
|
38
|
-
### Learner Evaluation Dimensions (Simulated)
|
|
39
|
-
1. **Authenticity**: Genuine perspective contribution
|
|
40
|
-
2. **Responsiveness**: Engagement with tutor suggestions
|
|
41
|
-
3. **Development**: Growth across turns
|
|
42
|
-
|
|
43
|
-
### Bilateral Metric
|
|
44
|
-
> "Does engagement produce genuine mutual development?"
|
|
45
|
-
|
|
46
|
-
## Integration with Statistical Findings
|
|
47
|
-
|
|
48
|
-
The extended scenario results align with our factorial ANOVA findings:
|
|
49
|
-
|
|
50
|
-
1. **Recognition Effect Persists**: The large recognition effect (η² = .422) is maintained across extended interactions, suggesting recognition-oriented prompting produces robust improvements.
|
|
51
|
-
|
|
52
|
-
2. **Architecture Effect Context-Dependent**: The marginal architecture effect (η² = .034) may become more important in complex multi-turn scenarios requiring repair cycles.
|
|
53
|
-
|
|
54
|
-
3. **Additive Benefits Confirmed**: The absence of interaction effects suggests that recognition benefits transfer across different scenario types and lengths.
|
|
55
|
-
|
|
56
|
-
## Implications
|
|
57
|
-
|
|
58
|
-
1. **Scalability**: Recognition-oriented design scales to longer interactions
|
|
59
|
-
2. **Robustness**: Benefits persist even with contingent learner responses
|
|
60
|
-
3. **Cost-Effectiveness**: Free-tier models achieve recognition quality with proper prompting
|
|
@@ -1,257 +0,0 @@
|
|
|
1
|
-
# Two-Way ANOVA Results
|
|
2
|
-
|
|
3
|
-
**Generated:** 2026-01-14T10:22:17.071Z
|
|
4
|
-
**Data Source:** evaluations.db (factorial evaluation runs)
|
|
5
|
-
|
|
6
|
-
```
|
|
7
|
-
======================================================================
|
|
8
|
-
TWO-WAY ANOVA RESULTS: 2×2 Factorial Design
|
|
9
|
-
======================================================================
|
|
10
|
-
|
|
11
|
-
EXPERIMENTAL DESIGN
|
|
12
|
-
----------------------------------------------------------------------
|
|
13
|
-
Factor A: Architecture (Single-Agent vs Multi-Agent)
|
|
14
|
-
Factor B: Recognition (Standard vs Recognition-Enhanced Prompts)
|
|
15
|
-
Total N: 76
|
|
16
|
-
Grand Mean: 60.87
|
|
17
|
-
|
|
18
|
-
CELL STATISTICS
|
|
19
|
-
----------------------------------------------------------------------
|
|
20
|
-
Cell N Mean SD 95% CI
|
|
21
|
-
----------------------------------------------------------------------
|
|
22
|
-
Single + Standard 8 39.99 9.12 [33.5, 46.4]
|
|
23
|
-
Single + Recognition 6 70.36 26.15 [49.0, 91.7]
|
|
24
|
-
Multi + Standard 31 50.16 15.00 [44.8, 55.5]
|
|
25
|
-
Multi + Recognition 31 75.14 14.68 [69.9, 80.4]
|
|
26
|
-
|
|
27
|
-
MARGINAL MEANS
|
|
28
|
-
----------------------------------------------------------------------
|
|
29
|
-
Architecture: Single = 53.00, Multi = 62.65
|
|
30
|
-
Recognition: Standard = 48.07, Recognition = 74.37
|
|
31
|
-
|
|
32
|
-
ANOVA TABLE
|
|
33
|
-
----------------------------------------------------------------------
|
|
34
|
-
Source SS df MS F p Sig
|
|
35
|
-
----------------------------------------------------------------------
|
|
36
|
-
Architecture (A) 1063.08 1 1063.08 4.445 0.050 *
|
|
37
|
-
Recognition (B) 13123.82 1 13123.82 54.877 0.001 ***
|
|
38
|
-
A × B 124.13 1 124.13 0.519 0.250
|
|
39
|
-
Error 17218.77 72 239.15
|
|
40
|
-
----------------------------------------------------------------------
|
|
41
|
-
Total 31115.95 75
|
|
42
|
-
|
|
43
|
-
Significance: *** p < .05, * p < .10
|
|
44
|
-
|
|
45
|
-
EFFECT SIZES
|
|
46
|
-
----------------------------------------------------------------------
|
|
47
|
-
Source η² Partial η² Cohen's d Interpretation
|
|
48
|
-
----------------------------------------------------------------------
|
|
49
|
-
Architecture (A) 0.034 0.058 0.62 Small
|
|
50
|
-
Recognition (B) 0.422 0.433 1.70 Large
|
|
51
|
-
A × B 0.004 0.007 N/A Negligible
|
|
52
|
-
|
|
53
|
-
MAIN EFFECTS (Raw Differences)
|
|
54
|
-
----------------------------------------------------------------------
|
|
55
|
-
Architecture Effect: Multi - Single = +9.65 points
|
|
56
|
-
Recognition Effect: Recognition - Standard = +26.29 points
|
|
57
|
-
|
|
58
|
-
ASSUMPTION CHECKS
|
|
59
|
-
----------------------------------------------------------------------
|
|
60
|
-
Normality (Shapiro-Wilk approx): PASSED ✓
|
|
61
|
-
Homogeneity of Variance (Levene): F = 3.31, p = 0.010 - VIOLATED ✗
|
|
62
|
-
|
|
63
|
-
INTERPRETATION
|
|
64
|
-
----------------------------------------------------------------------
|
|
65
|
-
✓ Recognition prompts have a SIGNIFICANT effect (F = 54.88, p < .05)
|
|
66
|
-
Effect size: large (η² = 0.422)
|
|
67
|
-
✗ Architecture effect is NOT significant (F = 4.45, p = 0.050)
|
|
68
|
-
✗ No significant interaction (F = 0.52, p = 0.250)
|
|
69
|
-
|
|
70
|
-
======================================================================
|
|
71
|
-
```
|
|
72
|
-
|
|
73
|
-
## JSON Results
|
|
74
|
-
|
|
75
|
-
```json
|
|
76
|
-
{
|
|
77
|
-
"grandMean": 60.87384654818866,
|
|
78
|
-
"N": 76,
|
|
79
|
-
"cellStats": {
|
|
80
|
-
"a0b0": {
|
|
81
|
-
"n": 8,
|
|
82
|
-
"mean": 39.98579545454547,
|
|
83
|
-
"std": 9.11985690330198,
|
|
84
|
-
"values": [
|
|
85
|
-
38.44696969696971,
|
|
86
|
-
38.63636363636365,
|
|
87
|
-
43.18181818181819,
|
|
88
|
-
34.848484848484865,
|
|
89
|
-
52.272727272727295,
|
|
90
|
-
37.500000000000014,
|
|
91
|
-
23.86363636363637,
|
|
92
|
-
51.136363636363654
|
|
93
|
-
]
|
|
94
|
-
},
|
|
95
|
-
"a0b1": {
|
|
96
|
-
"n": 6,
|
|
97
|
-
"mean": 70.3598484848485,
|
|
98
|
-
"std": 26.148369552557515,
|
|
99
|
-
"values": [
|
|
100
|
-
50.3787878787879,
|
|
101
|
-
76.13636363636364,
|
|
102
|
-
100.00000000000003,
|
|
103
|
-
58.14393939393941,
|
|
104
|
-
37.50000000000002,
|
|
105
|
-
100.00000000000003
|
|
106
|
-
]
|
|
107
|
-
},
|
|
108
|
-
"a1b0": {
|
|
109
|
-
"n": 31,
|
|
110
|
-
"mean": 50.16175580691711,
|
|
111
|
-
"std": 14.99861019348702,
|
|
112
|
-
"values": [
|
|
113
|
-
42.99242424242427,
|
|
114
|
-
32.95454545454547,
|
|
115
|
-
48.86363636363639,
|
|
116
|
-
49.05303030303031,
|
|
117
|
-
45.45454545454547,
|
|
118
|
-
59.09090909090911,
|
|
119
|
-
63.47402597402599,
|
|
120
|
-
50.974025974026,
|
|
121
|
-
58.11688311688312,
|
|
122
|
-
44.94949494949497,
|
|
123
|
-
52.02020202020204,
|
|
124
|
-
41.919191919191945,
|
|
125
|
-
38.63636363636365,
|
|
126
|
-
57.00757575757578,
|
|
127
|
-
48.4848484848485,
|
|
128
|
-
43.37121212121213,
|
|
129
|
-
43.181818181818194,
|
|
130
|
-
50.00000000000002,
|
|
131
|
-
45.90909090909093,
|
|
132
|
-
47.72727272727274,
|
|
133
|
-
53.863636363636395,
|
|
134
|
-
48.63636363636365,
|
|
135
|
-
42.80303030303031,
|
|
136
|
-
51.59090909090911,
|
|
137
|
-
43.93939393939396,
|
|
138
|
-
31.818181818181834,
|
|
139
|
-
18.181818181818194,
|
|
140
|
-
45.45454545454547,
|
|
141
|
-
87.5,
|
|
142
|
-
100,
|
|
143
|
-
67.04545454545458
|
|
144
|
-
]
|
|
145
|
-
},
|
|
146
|
-
"a1b1": {
|
|
147
|
-
"n": 31,
|
|
148
|
-
"mean": 75.14040171298237,
|
|
149
|
-
"std": 14.684605144018311,
|
|
150
|
-
"values": [
|
|
151
|
-
75.18939393939395,
|
|
152
|
-
67.04545454545458,
|
|
153
|
-
100.00000000000003,
|
|
154
|
-
65.15151515151518,
|
|
155
|
-
56.818181818181834,
|
|
156
|
-
95.45454545454548,
|
|
157
|
-
78.08441558441562,
|
|
158
|
-
66.23376623376626,
|
|
159
|
-
69.64285714285717,
|
|
160
|
-
58.0808080808081,
|
|
161
|
-
62.12121212121214,
|
|
162
|
-
62.878787878787904,
|
|
163
|
-
60.03787878787881,
|
|
164
|
-
72.9166666666667,
|
|
165
|
-
82.00757575757578,
|
|
166
|
-
66.47727272727275,
|
|
167
|
-
63.636363636363654,
|
|
168
|
-
52.840909090909115,
|
|
169
|
-
67.72727272727275,
|
|
170
|
-
75.68181818181822,
|
|
171
|
-
60.68181818181819,
|
|
172
|
-
71.5909090909091,
|
|
173
|
-
84.09090909090911,
|
|
174
|
-
68.1818181818182,
|
|
175
|
-
74.05303030303033,
|
|
176
|
-
79.54545454545456,
|
|
177
|
-
93.18181818181822,
|
|
178
|
-
100.00000000000003,
|
|
179
|
-
100,
|
|
180
|
-
100,
|
|
181
|
-
100
|
|
182
|
-
]
|
|
183
|
-
}
|
|
184
|
-
},
|
|
185
|
-
"marginalMeans": {
|
|
186
|
-
"architecture": {
|
|
187
|
-
"single": 53.003246753246756,
|
|
188
|
-
"multi": 62.65107875994973
|
|
189
|
-
},
|
|
190
|
-
"recognition": {
|
|
191
|
-
"standard": 48.07437932437934,
|
|
192
|
-
"recognition": 74.36517686517688
|
|
193
|
-
}
|
|
194
|
-
},
|
|
195
|
-
"anovaTable": {
|
|
196
|
-
"architecture": {
|
|
197
|
-
"SS": 1063.0791445902653,
|
|
198
|
-
"df": 1,
|
|
199
|
-
"MS": 1063.0791445902653,
|
|
200
|
-
"F": 4.445248575427072,
|
|
201
|
-
"p": 0.05,
|
|
202
|
-
"sig": false
|
|
203
|
-
},
|
|
204
|
-
"recognition": {
|
|
205
|
-
"SS": 13123.81985503855,
|
|
206
|
-
"df": 1,
|
|
207
|
-
"MS": 13123.81985503855,
|
|
208
|
-
"F": 54.87704449065894,
|
|
209
|
-
"p": 0.001,
|
|
210
|
-
"sig": true
|
|
211
|
-
},
|
|
212
|
-
"interaction": {
|
|
213
|
-
"SS": 124.12853011664384,
|
|
214
|
-
"df": 1,
|
|
215
|
-
"MS": 124.12853011664384,
|
|
216
|
-
"F": 0.5190414791586724,
|
|
217
|
-
"p": 0.25,
|
|
218
|
-
"sig": false
|
|
219
|
-
},
|
|
220
|
-
"error": {
|
|
221
|
-
"SS": 17218.76675999957,
|
|
222
|
-
"df": 72,
|
|
223
|
-
"MS": 239.14953833332734
|
|
224
|
-
},
|
|
225
|
-
"total": {
|
|
226
|
-
"SS": 31115.94559827812,
|
|
227
|
-
"df": 75
|
|
228
|
-
}
|
|
229
|
-
},
|
|
230
|
-
"effectSizes": {
|
|
231
|
-
"architecture": {
|
|
232
|
-
"etaSq": 0.03416509201793609,
|
|
233
|
-
"partialEtaSq": 0.05814944235600241,
|
|
234
|
-
"cohenD": 0.6238712311493667
|
|
235
|
-
},
|
|
236
|
-
"recognition": {
|
|
237
|
-
"etaSq": 0.42177152590743666,
|
|
238
|
-
"partialEtaSq": 0.4325214597404903,
|
|
239
|
-
"cohenD": 1.7000785480386178
|
|
240
|
-
},
|
|
241
|
-
"interaction": {
|
|
242
|
-
"etaSq": 0.003989225708233428,
|
|
243
|
-
"partialEtaSq": 0.007157313011477686
|
|
244
|
-
}
|
|
245
|
-
},
|
|
246
|
-
"mainEffects": {
|
|
247
|
-
"architecture": 9.647832006702977,
|
|
248
|
-
"recognition": 26.290797540797534
|
|
249
|
-
},
|
|
250
|
-
"assumptions": {
|
|
251
|
-
"normality": true,
|
|
252
|
-
"homogeneity": false,
|
|
253
|
-
"leveneF": 3.3126010309809213,
|
|
254
|
-
"leveneP": 0.01
|
|
255
|
-
}
|
|
256
|
-
}
|
|
257
|
-
```
|