@machinespirits/eval 0.1.2 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +161 -0
  3. package/config/eval-settings.yaml +18 -0
  4. package/config/evaluation-rubric-learner.yaml +277 -0
  5. package/config/evaluation-rubric.yaml +613 -0
  6. package/config/interaction-eval-scenarios.yaml +93 -50
  7. package/config/learner-agents.yaml +124 -193
  8. package/config/providers.yaml +60 -0
  9. package/config/suggestion-scenarios.yaml +1399 -0
  10. package/config/tutor-agents.yaml +716 -0
  11. package/docs/EVALUATION-VARIABLES.md +589 -0
  12. package/docs/REPLICATION-PLAN.md +577 -0
  13. package/docs/research/build.sh +74 -0
  14. package/docs/research/figures/figure1.png +0 -0
  15. package/docs/research/figures/figure2.png +0 -0
  16. package/docs/research/figures/figure3.png +0 -0
  17. package/docs/research/figures/figure4.png +0 -0
  18. package/docs/research/figures/figure5.png +0 -0
  19. package/docs/research/figures/figure6.png +0 -0
  20. package/docs/research/header.tex +4 -0
  21. package/docs/research/paper-full.md +1909 -0
  22. package/docs/research/paper-short.md +805 -0
  23. package/docs/research/references.bib +1011 -0
  24. package/index.js +15 -6
  25. package/package.json +14 -21
  26. package/routes/evalRoutes.js +88 -36
  27. package/scripts/analyze-judge-reliability.js +401 -0
  28. package/scripts/analyze-run.js +97 -0
  29. package/scripts/analyze-run.mjs +282 -0
  30. package/scripts/analyze-validation-failures.js +141 -0
  31. package/scripts/check-run.mjs +17 -0
  32. package/scripts/code-impasse-strategies.js +1132 -0
  33. package/scripts/compare-runs.js +44 -0
  34. package/scripts/compare-suggestions.js +80 -0
  35. package/scripts/compare-transformation.js +116 -0
  36. package/scripts/dig-into-run.js +158 -0
  37. package/scripts/eval-cli.js +2626 -0
  38. package/scripts/generate-paper-figures.py +452 -0
  39. package/scripts/qualitative-analysis-ai.js +1313 -0
  40. package/scripts/qualitative-analysis.js +688 -0
  41. package/scripts/seed-db.js +87 -0
  42. package/scripts/show-failed-suggestions.js +64 -0
  43. package/scripts/validate-content.js +192 -0
  44. package/server.js +3 -2
  45. package/services/__tests__/evalConfigLoader.test.js +338 -0
  46. package/services/anovaStats.js +499 -0
  47. package/services/contentResolver.js +407 -0
  48. package/services/dialogueTraceAnalyzer.js +454 -0
  49. package/services/evalConfigLoader.js +625 -0
  50. package/services/evaluationRunner.js +2171 -270
  51. package/services/evaluationStore.js +564 -29
  52. package/services/learnerConfigLoader.js +75 -5
  53. package/services/learnerRubricEvaluator.js +284 -0
  54. package/services/learnerTutorInteractionEngine.js +375 -0
  55. package/services/processUtils.js +18 -0
  56. package/services/progressLogger.js +98 -0
  57. package/services/promptRecommendationService.js +31 -26
  58. package/services/promptRewriter.js +427 -0
  59. package/services/rubricEvaluator.js +543 -70
  60. package/services/streamingReporter.js +104 -0
  61. package/services/turnComparisonAnalyzer.js +494 -0
  62. package/components/MobileEvalDashboard.tsx +0 -267
  63. package/components/comparison/DeltaAnalysisTable.tsx +0 -137
  64. package/components/comparison/ProfileComparisonCard.tsx +0 -176
  65. package/components/comparison/RecognitionABMode.tsx +0 -385
  66. package/components/comparison/RecognitionMetricsPanel.tsx +0 -135
  67. package/components/comparison/WinnerIndicator.tsx +0 -64
  68. package/components/comparison/index.ts +0 -5
  69. package/components/mobile/BottomSheet.tsx +0 -233
  70. package/components/mobile/DimensionBreakdown.tsx +0 -210
  71. package/components/mobile/DocsView.tsx +0 -363
  72. package/components/mobile/LogsView.tsx +0 -481
  73. package/components/mobile/PsychodynamicQuadrant.tsx +0 -261
  74. package/components/mobile/QuickTestView.tsx +0 -1098
  75. package/components/mobile/RecognitionTypeChart.tsx +0 -124
  76. package/components/mobile/RecognitionView.tsx +0 -809
  77. package/components/mobile/RunDetailView.tsx +0 -261
  78. package/components/mobile/RunHistoryView.tsx +0 -367
  79. package/components/mobile/ScoreRadial.tsx +0 -211
  80. package/components/mobile/StreamingLogPanel.tsx +0 -230
  81. package/components/mobile/SynthesisStrategyChart.tsx +0 -140
  82. package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +0 -52
  83. package/docs/research/ABLATION-MODEL-SELECTION.md +0 -53
  84. package/docs/research/ADVANCED-EVAL-ANALYSIS.md +0 -60
  85. package/docs/research/ANOVA-RESULTS-2026-01-14.md +0 -257
  86. package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +0 -586
  87. package/docs/research/COST-ANALYSIS.md +0 -56
  88. package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +0 -340
  89. package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +0 -291
  90. package/docs/research/EVAL-SYSTEM-ANALYSIS.md +0 -306
  91. package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +0 -301
  92. package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +0 -1988
  93. package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +0 -282
  94. package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +0 -147
  95. package/docs/research/PAPER-EXTENSION-DYADIC.md +0 -204
  96. package/docs/research/PAPER-UNIFIED.md +0 -659
  97. package/docs/research/PAPER-UNIFIED.pdf +0 -0
  98. package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +0 -356
  99. package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +0 -419
  100. package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +0 -1637
  101. package/docs/research/archive/paper-multiagent-tutor.tex +0 -978
  102. package/docs/research/paper-draft/full-paper.md +0 -136
  103. package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
  104. package/docs/research/paper-draft/references.bib +0 -515
  105. package/docs/research/transcript-baseline.md +0 -139
  106. package/docs/research/transcript-recognition-multiagent.md +0 -187
  107. package/hooks/useEvalData.ts +0 -625
  108. package/server-init.js +0 -45
  109. package/services/benchmarkService.js +0 -1892
  110. package/types.ts +0 -165
  111. package/utils/haptics.ts +0 -45
@@ -1,60 +0,0 @@
1
- # Advanced Evaluation Analysis
2
-
3
- **Generated:** 2026-01-14T11:07:14.890Z
4
-
5
- ## Extended Recognition Scenarios
6
-
7
- These scenarios test recognition quality across multiple conversation turns, where learner responses are contingent on tutor suggestions.
8
-
9
- ### Results Summary
10
-
11
- | Scenario | Turns | Baseline | Recognition | Diff | Cohen's d | Sig |
12
- |----------|-------|----------|-------------|------|-----------|-----|
13
- | Sustained Dialogue | 8 | 46.3 | 61.0 | +14.7 | 3.60 | * |
14
- | Breakdown Recovery | 6 | 57.5 | 71.3 | +13.8 | 2.23 | * |
15
- | Productive Struggle | 5 | 46.5 | 73.2 | +26.7 | 3.32 | * |
16
- | Mutual Transformation | 5 | 45.1 | 64.3 | +19.1 | 2.89 | * |
17
-
18
- **Aggregate Statistics:**
19
- - Average improvement: +18.6 points
20
- - Average effect size: d = 3.01
21
- - Significant effects: 4/4
22
-
23
- ## Contingent Learner Analysis
24
-
25
- Multi-turn scenarios simulate realistic interactions where learner behavior depends on tutor suggestions. Recognition-enhanced tutoring maintains its quality advantage even as:
26
- - Learners follow or reject suggestions
27
- - Conversations extend over multiple turns
28
- - Learners express frustration or confusion
29
- - Repair cycles become necessary
30
-
31
- ## Bilateral Measurement Framework
32
-
33
- ### Tutor Evaluation Dimensions
34
- 1. **Mutual Recognition**: Acknowledges learner as autonomous subject
35
- 2. **Dialectical Responsiveness**: Shaped by learner's specific input
36
- 3. **Transformative Potential**: Enables genuine growth
37
-
38
- ### Learner Evaluation Dimensions (Simulated)
39
- 1. **Authenticity**: Genuine perspective contribution
40
- 2. **Responsiveness**: Engagement with tutor suggestions
41
- 3. **Development**: Growth across turns
42
-
43
- ### Bilateral Metric
44
- > "Does engagement produce genuine mutual development?"
45
-
46
- ## Integration with Statistical Findings
47
-
48
- The extended scenario results align with our factorial ANOVA findings:
49
-
50
- 1. **Recognition Effect Persists**: The large recognition effect (η² = .422) is maintained across extended interactions, suggesting recognition-oriented prompting produces robust improvements.
51
-
52
- 2. **Architecture Effect Context-Dependent**: The marginal architecture effect (η² = .034) may become more important in complex multi-turn scenarios requiring repair cycles.
53
-
54
- 3. **Additive Benefits Confirmed**: The absence of interaction effects suggests that recognition benefits transfer across different scenario types and lengths.
55
-
56
- ## Implications
57
-
58
- 1. **Scalability**: Recognition-oriented design scales to longer interactions
59
- 2. **Robustness**: Benefits persist even with contingent learner responses
60
- 3. **Cost-Effectiveness**: Free-tier models achieve recognition quality with proper prompting
@@ -1,257 +0,0 @@
1
- # Two-Way ANOVA Results
2
-
3
- **Generated:** 2026-01-14T10:22:17.071Z
4
- **Data Source:** evaluations.db (factorial evaluation runs)
5
-
6
- ```
7
- ======================================================================
8
- TWO-WAY ANOVA RESULTS: 2×2 Factorial Design
9
- ======================================================================
10
-
11
- EXPERIMENTAL DESIGN
12
- ----------------------------------------------------------------------
13
- Factor A: Architecture (Single-Agent vs Multi-Agent)
14
- Factor B: Recognition (Standard vs Recognition-Enhanced Prompts)
15
- Total N: 76
16
- Grand Mean: 60.87
17
-
18
- CELL STATISTICS
19
- ----------------------------------------------------------------------
20
- Cell N Mean SD 95% CI
21
- ----------------------------------------------------------------------
22
- Single + Standard 8 39.99 9.12 [33.5, 46.4]
23
- Single + Recognition 6 70.36 26.15 [49.0, 91.7]
24
- Multi + Standard 31 50.16 15.00 [44.8, 55.5]
25
- Multi + Recognition 31 75.14 14.68 [69.9, 80.4]
26
-
27
- MARGINAL MEANS
28
- ----------------------------------------------------------------------
29
- Architecture: Single = 53.00, Multi = 62.65
30
- Recognition: Standard = 48.07, Recognition = 74.37
31
-
32
- ANOVA TABLE
33
- ----------------------------------------------------------------------
34
- Source SS df MS F p Sig
35
- ----------------------------------------------------------------------
36
- Architecture (A) 1063.08 1 1063.08 4.445 0.050 *
37
- Recognition (B) 13123.82 1 13123.82 54.877 0.001 ***
38
- A × B 124.13 1 124.13 0.519 0.250
39
- Error 17218.77 72 239.15
40
- ----------------------------------------------------------------------
41
- Total 31115.95 75
42
-
43
- Significance: *** p < .05, * p < .10
44
-
45
- EFFECT SIZES
46
- ----------------------------------------------------------------------
47
- Source η² Partial η² Cohen's d Interpretation
48
- ----------------------------------------------------------------------
49
- Architecture (A) 0.034 0.058 0.62 Small
50
- Recognition (B) 0.422 0.433 1.70 Large
51
- A × B 0.004 0.007 N/A Negligible
52
-
53
- MAIN EFFECTS (Raw Differences)
54
- ----------------------------------------------------------------------
55
- Architecture Effect: Multi - Single = +9.65 points
56
- Recognition Effect: Recognition - Standard = +26.29 points
57
-
58
- ASSUMPTION CHECKS
59
- ----------------------------------------------------------------------
60
- Normality (Shapiro-Wilk approx): PASSED ✓
61
- Homogeneity of Variance (Levene): F = 3.31, p = 0.010 - VIOLATED ✗
62
-
63
- INTERPRETATION
64
- ----------------------------------------------------------------------
65
- ✓ Recognition prompts have a SIGNIFICANT effect (F = 54.88, p < .05)
66
- Effect size: large (η² = 0.422)
67
- ✗ Architecture effect is NOT significant (F = 4.45, p = 0.050)
68
- ✗ No significant interaction (F = 0.52, p = 0.250)
69
-
70
- ======================================================================
71
- ```
72
-
73
- ## JSON Results
74
-
75
- ```json
76
- {
77
- "grandMean": 60.87384654818866,
78
- "N": 76,
79
- "cellStats": {
80
- "a0b0": {
81
- "n": 8,
82
- "mean": 39.98579545454547,
83
- "std": 9.11985690330198,
84
- "values": [
85
- 38.44696969696971,
86
- 38.63636363636365,
87
- 43.18181818181819,
88
- 34.848484848484865,
89
- 52.272727272727295,
90
- 37.500000000000014,
91
- 23.86363636363637,
92
- 51.136363636363654
93
- ]
94
- },
95
- "a0b1": {
96
- "n": 6,
97
- "mean": 70.3598484848485,
98
- "std": 26.148369552557515,
99
- "values": [
100
- 50.3787878787879,
101
- 76.13636363636364,
102
- 100.00000000000003,
103
- 58.14393939393941,
104
- 37.50000000000002,
105
- 100.00000000000003
106
- ]
107
- },
108
- "a1b0": {
109
- "n": 31,
110
- "mean": 50.16175580691711,
111
- "std": 14.99861019348702,
112
- "values": [
113
- 42.99242424242427,
114
- 32.95454545454547,
115
- 48.86363636363639,
116
- 49.05303030303031,
117
- 45.45454545454547,
118
- 59.09090909090911,
119
- 63.47402597402599,
120
- 50.974025974026,
121
- 58.11688311688312,
122
- 44.94949494949497,
123
- 52.02020202020204,
124
- 41.919191919191945,
125
- 38.63636363636365,
126
- 57.00757575757578,
127
- 48.4848484848485,
128
- 43.37121212121213,
129
- 43.181818181818194,
130
- 50.00000000000002,
131
- 45.90909090909093,
132
- 47.72727272727274,
133
- 53.863636363636395,
134
- 48.63636363636365,
135
- 42.80303030303031,
136
- 51.59090909090911,
137
- 43.93939393939396,
138
- 31.818181818181834,
139
- 18.181818181818194,
140
- 45.45454545454547,
141
- 87.5,
142
- 100,
143
- 67.04545454545458
144
- ]
145
- },
146
- "a1b1": {
147
- "n": 31,
148
- "mean": 75.14040171298237,
149
- "std": 14.684605144018311,
150
- "values": [
151
- 75.18939393939395,
152
- 67.04545454545458,
153
- 100.00000000000003,
154
- 65.15151515151518,
155
- 56.818181818181834,
156
- 95.45454545454548,
157
- 78.08441558441562,
158
- 66.23376623376626,
159
- 69.64285714285717,
160
- 58.0808080808081,
161
- 62.12121212121214,
162
- 62.878787878787904,
163
- 60.03787878787881,
164
- 72.9166666666667,
165
- 82.00757575757578,
166
- 66.47727272727275,
167
- 63.636363636363654,
168
- 52.840909090909115,
169
- 67.72727272727275,
170
- 75.68181818181822,
171
- 60.68181818181819,
172
- 71.5909090909091,
173
- 84.09090909090911,
174
- 68.1818181818182,
175
- 74.05303030303033,
176
- 79.54545454545456,
177
- 93.18181818181822,
178
- 100.00000000000003,
179
- 100,
180
- 100,
181
- 100
182
- ]
183
- }
184
- },
185
- "marginalMeans": {
186
- "architecture": {
187
- "single": 53.003246753246756,
188
- "multi": 62.65107875994973
189
- },
190
- "recognition": {
191
- "standard": 48.07437932437934,
192
- "recognition": 74.36517686517688
193
- }
194
- },
195
- "anovaTable": {
196
- "architecture": {
197
- "SS": 1063.0791445902653,
198
- "df": 1,
199
- "MS": 1063.0791445902653,
200
- "F": 4.445248575427072,
201
- "p": 0.05,
202
- "sig": false
203
- },
204
- "recognition": {
205
- "SS": 13123.81985503855,
206
- "df": 1,
207
- "MS": 13123.81985503855,
208
- "F": 54.87704449065894,
209
- "p": 0.001,
210
- "sig": true
211
- },
212
- "interaction": {
213
- "SS": 124.12853011664384,
214
- "df": 1,
215
- "MS": 124.12853011664384,
216
- "F": 0.5190414791586724,
217
- "p": 0.25,
218
- "sig": false
219
- },
220
- "error": {
221
- "SS": 17218.76675999957,
222
- "df": 72,
223
- "MS": 239.14953833332734
224
- },
225
- "total": {
226
- "SS": 31115.94559827812,
227
- "df": 75
228
- }
229
- },
230
- "effectSizes": {
231
- "architecture": {
232
- "etaSq": 0.03416509201793609,
233
- "partialEtaSq": 0.05814944235600241,
234
- "cohenD": 0.6238712311493667
235
- },
236
- "recognition": {
237
- "etaSq": 0.42177152590743666,
238
- "partialEtaSq": 0.4325214597404903,
239
- "cohenD": 1.7000785480386178
240
- },
241
- "interaction": {
242
- "etaSq": 0.003989225708233428,
243
- "partialEtaSq": 0.007157313011477686
244
- }
245
- },
246
- "mainEffects": {
247
- "architecture": 9.647832006702977,
248
- "recognition": 26.290797540797534
249
- },
250
- "assumptions": {
251
- "normality": true,
252
- "homogeneity": false,
253
- "leveneF": 3.3126010309809213,
254
- "leveneP": 0.01
255
- }
256
- }
257
- ```