@machinespirits/eval 0.1.2 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +161 -0
  3. package/config/eval-settings.yaml +18 -0
  4. package/config/evaluation-rubric-learner.yaml +277 -0
  5. package/config/evaluation-rubric.yaml +613 -0
  6. package/config/interaction-eval-scenarios.yaml +93 -50
  7. package/config/learner-agents.yaml +124 -193
  8. package/config/machinespirits-eval.code-workspace +11 -0
  9. package/config/providers.yaml +60 -0
  10. package/config/suggestion-scenarios.yaml +1399 -0
  11. package/config/tutor-agents.yaml +716 -0
  12. package/docs/EVALUATION-VARIABLES.md +589 -0
  13. package/docs/REPLICATION-PLAN.md +577 -0
  14. package/index.js +15 -6
  15. package/package.json +16 -22
  16. package/routes/evalRoutes.js +88 -36
  17. package/scripts/analyze-judge-reliability.js +401 -0
  18. package/scripts/analyze-run.js +97 -0
  19. package/scripts/analyze-run.mjs +282 -0
  20. package/scripts/analyze-validation-failures.js +141 -0
  21. package/scripts/check-run.mjs +17 -0
  22. package/scripts/code-impasse-strategies.js +1132 -0
  23. package/scripts/compare-runs.js +44 -0
  24. package/scripts/compare-suggestions.js +80 -0
  25. package/scripts/compare-transformation.js +116 -0
  26. package/scripts/dig-into-run.js +158 -0
  27. package/scripts/eval-cli.js +2626 -0
  28. package/scripts/generate-paper-figures.py +452 -0
  29. package/scripts/qualitative-analysis-ai.js +1313 -0
  30. package/scripts/qualitative-analysis.js +688 -0
  31. package/scripts/seed-db.js +87 -0
  32. package/scripts/show-failed-suggestions.js +64 -0
  33. package/scripts/validate-content.js +192 -0
  34. package/server.js +3 -2
  35. package/services/__tests__/evalConfigLoader.test.js +338 -0
  36. package/services/anovaStats.js +499 -0
  37. package/services/contentResolver.js +407 -0
  38. package/services/dialogueTraceAnalyzer.js +454 -0
  39. package/services/evalConfigLoader.js +625 -0
  40. package/services/evaluationRunner.js +2171 -270
  41. package/services/evaluationStore.js +564 -29
  42. package/services/learnerConfigLoader.js +75 -5
  43. package/services/learnerRubricEvaluator.js +284 -0
  44. package/services/learnerTutorInteractionEngine.js +375 -0
  45. package/services/processUtils.js +18 -0
  46. package/services/progressLogger.js +98 -0
  47. package/services/promptRecommendationService.js +31 -26
  48. package/services/promptRewriter.js +427 -0
  49. package/services/rubricEvaluator.js +543 -70
  50. package/services/streamingReporter.js +104 -0
  51. package/services/turnComparisonAnalyzer.js +494 -0
  52. package/components/MobileEvalDashboard.tsx +0 -267
  53. package/components/comparison/DeltaAnalysisTable.tsx +0 -137
  54. package/components/comparison/ProfileComparisonCard.tsx +0 -176
  55. package/components/comparison/RecognitionABMode.tsx +0 -385
  56. package/components/comparison/RecognitionMetricsPanel.tsx +0 -135
  57. package/components/comparison/WinnerIndicator.tsx +0 -64
  58. package/components/comparison/index.ts +0 -5
  59. package/components/mobile/BottomSheet.tsx +0 -233
  60. package/components/mobile/DimensionBreakdown.tsx +0 -210
  61. package/components/mobile/DocsView.tsx +0 -363
  62. package/components/mobile/LogsView.tsx +0 -481
  63. package/components/mobile/PsychodynamicQuadrant.tsx +0 -261
  64. package/components/mobile/QuickTestView.tsx +0 -1098
  65. package/components/mobile/RecognitionTypeChart.tsx +0 -124
  66. package/components/mobile/RecognitionView.tsx +0 -809
  67. package/components/mobile/RunDetailView.tsx +0 -261
  68. package/components/mobile/RunHistoryView.tsx +0 -367
  69. package/components/mobile/ScoreRadial.tsx +0 -211
  70. package/components/mobile/StreamingLogPanel.tsx +0 -230
  71. package/components/mobile/SynthesisStrategyChart.tsx +0 -140
  72. package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +0 -52
  73. package/docs/research/ABLATION-MODEL-SELECTION.md +0 -53
  74. package/docs/research/ADVANCED-EVAL-ANALYSIS.md +0 -60
  75. package/docs/research/ANOVA-RESULTS-2026-01-14.md +0 -257
  76. package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +0 -586
  77. package/docs/research/COST-ANALYSIS.md +0 -56
  78. package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +0 -340
  79. package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +0 -291
  80. package/docs/research/EVAL-SYSTEM-ANALYSIS.md +0 -306
  81. package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +0 -301
  82. package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +0 -1988
  83. package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +0 -282
  84. package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +0 -147
  85. package/docs/research/PAPER-EXTENSION-DYADIC.md +0 -204
  86. package/docs/research/PAPER-UNIFIED.md +0 -659
  87. package/docs/research/PAPER-UNIFIED.pdf +0 -0
  88. package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +0 -356
  89. package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +0 -419
  90. package/docs/research/apa.csl +0 -2133
  91. package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +0 -1637
  92. package/docs/research/archive/paper-multiagent-tutor.tex +0 -978
  93. package/docs/research/paper-draft/full-paper.md +0 -136
  94. package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
  95. package/docs/research/paper-draft/references.bib +0 -515
  96. package/docs/research/transcript-baseline.md +0 -139
  97. package/docs/research/transcript-recognition-multiagent.md +0 -187
  98. package/hooks/useEvalData.ts +0 -625
  99. package/server-init.js +0 -45
  100. package/services/benchmarkService.js +0 -1892
  101. package/types.ts +0 -165
  102. package/utils/haptics.ts +0 -45
@@ -1,60 +0,0 @@
1
- # Advanced Evaluation Analysis
2
-
3
- **Generated:** 2026-01-14T11:07:14.890Z
4
-
5
- ## Extended Recognition Scenarios
6
-
7
- These scenarios test recognition quality across multiple conversation turns, where learner responses are contingent on tutor suggestions.
8
-
9
- ### Results Summary
10
-
11
- | Scenario | Turns | Baseline | Recognition | Diff | Cohen's d | Sig |
12
- |----------|-------|----------|-------------|------|-----------|-----|
13
- | Sustained Dialogue | 8 | 46.3 | 61.0 | +14.7 | 3.60 | * |
14
- | Breakdown Recovery | 6 | 57.5 | 71.3 | +13.8 | 2.23 | * |
15
- | Productive Struggle | 5 | 46.5 | 73.2 | +26.7 | 3.32 | * |
16
- | Mutual Transformation | 5 | 45.1 | 64.3 | +19.1 | 2.89 | * |
17
-
18
- **Aggregate Statistics:**
19
- - Average improvement: +18.6 points
20
- - Average effect size: d = 3.01
21
- - Significant effects: 4/4
22
-
23
- ## Contingent Learner Analysis
24
-
25
- Multi-turn scenarios simulate realistic interactions where learner behavior depends on tutor suggestions. Recognition-enhanced tutoring maintains its quality advantage even as:
26
- - Learners follow or reject suggestions
27
- - Conversations extend over multiple turns
28
- - Learners express frustration or confusion
29
- - Repair cycles become necessary
30
-
31
- ## Bilateral Measurement Framework
32
-
33
- ### Tutor Evaluation Dimensions
34
- 1. **Mutual Recognition**: Acknowledges learner as autonomous subject
35
- 2. **Dialectical Responsiveness**: Shaped by learner's specific input
36
- 3. **Transformative Potential**: Enables genuine growth
37
-
38
- ### Learner Evaluation Dimensions (Simulated)
39
- 1. **Authenticity**: Genuine perspective contribution
40
- 2. **Responsiveness**: Engagement with tutor suggestions
41
- 3. **Development**: Growth across turns
42
-
43
- ### Bilateral Metric
44
- > "Does engagement produce genuine mutual development?"
45
-
46
- ## Integration with Statistical Findings
47
-
48
- The extended scenario results align with our factorial ANOVA findings:
49
-
50
- 1. **Recognition Effect Persists**: The large recognition effect (η² = .422) is maintained across extended interactions, suggesting recognition-oriented prompting produces robust improvements.
51
-
52
- 2. **Architecture Effect Context-Dependent**: The marginal architecture effect (η² = .034) may become more important in complex multi-turn scenarios requiring repair cycles.
53
-
54
- 3. **Additive Benefits Confirmed**: The absence of interaction effects suggests that recognition benefits transfer across different scenario types and lengths.
55
-
56
- ## Implications
57
-
58
- 1. **Scalability**: Recognition-oriented design scales to longer interactions
59
- 2. **Robustness**: Benefits persist even with contingent learner responses
60
- 3. **Cost-Effectiveness**: Free-tier models achieve recognition quality with proper prompting
@@ -1,257 +0,0 @@
1
- # Two-Way ANOVA Results
2
-
3
- **Generated:** 2026-01-14T10:22:17.071Z
4
- **Data Source:** evaluations.db (factorial evaluation runs)
5
-
6
- ```
7
- ======================================================================
8
- TWO-WAY ANOVA RESULTS: 2×2 Factorial Design
9
- ======================================================================
10
-
11
- EXPERIMENTAL DESIGN
12
- ----------------------------------------------------------------------
13
- Factor A: Architecture (Single-Agent vs Multi-Agent)
14
- Factor B: Recognition (Standard vs Recognition-Enhanced Prompts)
15
- Total N: 76
16
- Grand Mean: 60.87
17
-
18
- CELL STATISTICS
19
- ----------------------------------------------------------------------
20
- Cell N Mean SD 95% CI
21
- ----------------------------------------------------------------------
22
- Single + Standard 8 39.99 9.12 [33.5, 46.4]
23
- Single + Recognition 6 70.36 26.15 [49.0, 91.7]
24
- Multi + Standard 31 50.16 15.00 [44.8, 55.5]
25
- Multi + Recognition 31 75.14 14.68 [69.9, 80.4]
26
-
27
- MARGINAL MEANS
28
- ----------------------------------------------------------------------
29
- Architecture: Single = 53.00, Multi = 62.65
30
- Recognition: Standard = 48.07, Recognition = 74.37
31
-
32
- ANOVA TABLE
33
- ----------------------------------------------------------------------
34
- Source SS df MS F p Sig
35
- ----------------------------------------------------------------------
36
- Architecture (A) 1063.08 1 1063.08 4.445 0.050 *
37
- Recognition (B) 13123.82 1 13123.82 54.877 0.001 ***
38
- A × B 124.13 1 124.13 0.519 0.250
39
- Error 17218.77 72 239.15
40
- ----------------------------------------------------------------------
41
- Total 31115.95 75
42
-
43
- Significance: *** p < .05, * p < .10
44
-
45
- EFFECT SIZES
46
- ----------------------------------------------------------------------
47
- Source η² Partial η² Cohen's d Interpretation
48
- ----------------------------------------------------------------------
49
- Architecture (A) 0.034 0.058 0.62 Small
50
- Recognition (B) 0.422 0.433 1.70 Large
51
- A × B 0.004 0.007 N/A Negligible
52
-
53
- MAIN EFFECTS (Raw Differences)
54
- ----------------------------------------------------------------------
55
- Architecture Effect: Multi - Single = +9.65 points
56
- Recognition Effect: Recognition - Standard = +26.29 points
57
-
58
- ASSUMPTION CHECKS
59
- ----------------------------------------------------------------------
60
- Normality (Shapiro-Wilk approx): PASSED ✓
61
- Homogeneity of Variance (Levene): F = 3.31, p = 0.010 - VIOLATED ✗
62
-
63
- INTERPRETATION
64
- ----------------------------------------------------------------------
65
- ✓ Recognition prompts have a SIGNIFICANT effect (F = 54.88, p < .05)
66
- Effect size: large (η² = 0.422)
67
- ✗ Architecture effect is NOT significant (F = 4.45, p = 0.050)
68
- ✗ No significant interaction (F = 0.52, p = 0.250)
69
-
70
- ======================================================================
71
- ```
72
-
73
- ## JSON Results
74
-
75
- ```json
76
- {
77
- "grandMean": 60.87384654818866,
78
- "N": 76,
79
- "cellStats": {
80
- "a0b0": {
81
- "n": 8,
82
- "mean": 39.98579545454547,
83
- "std": 9.11985690330198,
84
- "values": [
85
- 38.44696969696971,
86
- 38.63636363636365,
87
- 43.18181818181819,
88
- 34.848484848484865,
89
- 52.272727272727295,
90
- 37.500000000000014,
91
- 23.86363636363637,
92
- 51.136363636363654
93
- ]
94
- },
95
- "a0b1": {
96
- "n": 6,
97
- "mean": 70.3598484848485,
98
- "std": 26.148369552557515,
99
- "values": [
100
- 50.3787878787879,
101
- 76.13636363636364,
102
- 100.00000000000003,
103
- 58.14393939393941,
104
- 37.50000000000002,
105
- 100.00000000000003
106
- ]
107
- },
108
- "a1b0": {
109
- "n": 31,
110
- "mean": 50.16175580691711,
111
- "std": 14.99861019348702,
112
- "values": [
113
- 42.99242424242427,
114
- 32.95454545454547,
115
- 48.86363636363639,
116
- 49.05303030303031,
117
- 45.45454545454547,
118
- 59.09090909090911,
119
- 63.47402597402599,
120
- 50.974025974026,
121
- 58.11688311688312,
122
- 44.94949494949497,
123
- 52.02020202020204,
124
- 41.919191919191945,
125
- 38.63636363636365,
126
- 57.00757575757578,
127
- 48.4848484848485,
128
- 43.37121212121213,
129
- 43.181818181818194,
130
- 50.00000000000002,
131
- 45.90909090909093,
132
- 47.72727272727274,
133
- 53.863636363636395,
134
- 48.63636363636365,
135
- 42.80303030303031,
136
- 51.59090909090911,
137
- 43.93939393939396,
138
- 31.818181818181834,
139
- 18.181818181818194,
140
- 45.45454545454547,
141
- 87.5,
142
- 100,
143
- 67.04545454545458
144
- ]
145
- },
146
- "a1b1": {
147
- "n": 31,
148
- "mean": 75.14040171298237,
149
- "std": 14.684605144018311,
150
- "values": [
151
- 75.18939393939395,
152
- 67.04545454545458,
153
- 100.00000000000003,
154
- 65.15151515151518,
155
- 56.818181818181834,
156
- 95.45454545454548,
157
- 78.08441558441562,
158
- 66.23376623376626,
159
- 69.64285714285717,
160
- 58.0808080808081,
161
- 62.12121212121214,
162
- 62.878787878787904,
163
- 60.03787878787881,
164
- 72.9166666666667,
165
- 82.00757575757578,
166
- 66.47727272727275,
167
- 63.636363636363654,
168
- 52.840909090909115,
169
- 67.72727272727275,
170
- 75.68181818181822,
171
- 60.68181818181819,
172
- 71.5909090909091,
173
- 84.09090909090911,
174
- 68.1818181818182,
175
- 74.05303030303033,
176
- 79.54545454545456,
177
- 93.18181818181822,
178
- 100.00000000000003,
179
- 100,
180
- 100,
181
- 100
182
- ]
183
- }
184
- },
185
- "marginalMeans": {
186
- "architecture": {
187
- "single": 53.003246753246756,
188
- "multi": 62.65107875994973
189
- },
190
- "recognition": {
191
- "standard": 48.07437932437934,
192
- "recognition": 74.36517686517688
193
- }
194
- },
195
- "anovaTable": {
196
- "architecture": {
197
- "SS": 1063.0791445902653,
198
- "df": 1,
199
- "MS": 1063.0791445902653,
200
- "F": 4.445248575427072,
201
- "p": 0.05,
202
- "sig": false
203
- },
204
- "recognition": {
205
- "SS": 13123.81985503855,
206
- "df": 1,
207
- "MS": 13123.81985503855,
208
- "F": 54.87704449065894,
209
- "p": 0.001,
210
- "sig": true
211
- },
212
- "interaction": {
213
- "SS": 124.12853011664384,
214
- "df": 1,
215
- "MS": 124.12853011664384,
216
- "F": 0.5190414791586724,
217
- "p": 0.25,
218
- "sig": false
219
- },
220
- "error": {
221
- "SS": 17218.76675999957,
222
- "df": 72,
223
- "MS": 239.14953833332734
224
- },
225
- "total": {
226
- "SS": 31115.94559827812,
227
- "df": 75
228
- }
229
- },
230
- "effectSizes": {
231
- "architecture": {
232
- "etaSq": 0.03416509201793609,
233
- "partialEtaSq": 0.05814944235600241,
234
- "cohenD": 0.6238712311493667
235
- },
236
- "recognition": {
237
- "etaSq": 0.42177152590743666,
238
- "partialEtaSq": 0.4325214597404903,
239
- "cohenD": 1.7000785480386178
240
- },
241
- "interaction": {
242
- "etaSq": 0.003989225708233428,
243
- "partialEtaSq": 0.007157313011477686
244
- }
245
- },
246
- "mainEffects": {
247
- "architecture": 9.647832006702977,
248
- "recognition": 26.290797540797534
249
- },
250
- "assumptions": {
251
- "normality": true,
252
- "homogeneity": false,
253
- "leveneF": 3.3126010309809213,
254
- "leveneP": 0.01
255
- }
256
- }
257
- ```