@machinespirits/eval 0.1.2 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +161 -0
  3. package/config/eval-settings.yaml +18 -0
  4. package/config/evaluation-rubric-learner.yaml +277 -0
  5. package/config/evaluation-rubric.yaml +613 -0
  6. package/config/interaction-eval-scenarios.yaml +93 -50
  7. package/config/learner-agents.yaml +124 -193
  8. package/config/machinespirits-eval.code-workspace +11 -0
  9. package/config/providers.yaml +60 -0
  10. package/config/suggestion-scenarios.yaml +1399 -0
  11. package/config/tutor-agents.yaml +716 -0
  12. package/docs/EVALUATION-VARIABLES.md +589 -0
  13. package/docs/REPLICATION-PLAN.md +577 -0
  14. package/index.js +15 -6
  15. package/package.json +16 -22
  16. package/routes/evalRoutes.js +88 -36
  17. package/scripts/analyze-judge-reliability.js +401 -0
  18. package/scripts/analyze-run.js +97 -0
  19. package/scripts/analyze-run.mjs +282 -0
  20. package/scripts/analyze-validation-failures.js +141 -0
  21. package/scripts/check-run.mjs +17 -0
  22. package/scripts/code-impasse-strategies.js +1132 -0
  23. package/scripts/compare-runs.js +44 -0
  24. package/scripts/compare-suggestions.js +80 -0
  25. package/scripts/compare-transformation.js +116 -0
  26. package/scripts/dig-into-run.js +158 -0
  27. package/scripts/eval-cli.js +2626 -0
  28. package/scripts/generate-paper-figures.py +452 -0
  29. package/scripts/qualitative-analysis-ai.js +1313 -0
  30. package/scripts/qualitative-analysis.js +688 -0
  31. package/scripts/seed-db.js +87 -0
  32. package/scripts/show-failed-suggestions.js +64 -0
  33. package/scripts/validate-content.js +192 -0
  34. package/server.js +3 -2
  35. package/services/__tests__/evalConfigLoader.test.js +338 -0
  36. package/services/anovaStats.js +499 -0
  37. package/services/contentResolver.js +407 -0
  38. package/services/dialogueTraceAnalyzer.js +454 -0
  39. package/services/evalConfigLoader.js +625 -0
  40. package/services/evaluationRunner.js +2171 -270
  41. package/services/evaluationStore.js +564 -29
  42. package/services/learnerConfigLoader.js +75 -5
  43. package/services/learnerRubricEvaluator.js +284 -0
  44. package/services/learnerTutorInteractionEngine.js +375 -0
  45. package/services/processUtils.js +18 -0
  46. package/services/progressLogger.js +98 -0
  47. package/services/promptRecommendationService.js +31 -26
  48. package/services/promptRewriter.js +427 -0
  49. package/services/rubricEvaluator.js +543 -70
  50. package/services/streamingReporter.js +104 -0
  51. package/services/turnComparisonAnalyzer.js +494 -0
  52. package/components/MobileEvalDashboard.tsx +0 -267
  53. package/components/comparison/DeltaAnalysisTable.tsx +0 -137
  54. package/components/comparison/ProfileComparisonCard.tsx +0 -176
  55. package/components/comparison/RecognitionABMode.tsx +0 -385
  56. package/components/comparison/RecognitionMetricsPanel.tsx +0 -135
  57. package/components/comparison/WinnerIndicator.tsx +0 -64
  58. package/components/comparison/index.ts +0 -5
  59. package/components/mobile/BottomSheet.tsx +0 -233
  60. package/components/mobile/DimensionBreakdown.tsx +0 -210
  61. package/components/mobile/DocsView.tsx +0 -363
  62. package/components/mobile/LogsView.tsx +0 -481
  63. package/components/mobile/PsychodynamicQuadrant.tsx +0 -261
  64. package/components/mobile/QuickTestView.tsx +0 -1098
  65. package/components/mobile/RecognitionTypeChart.tsx +0 -124
  66. package/components/mobile/RecognitionView.tsx +0 -809
  67. package/components/mobile/RunDetailView.tsx +0 -261
  68. package/components/mobile/RunHistoryView.tsx +0 -367
  69. package/components/mobile/ScoreRadial.tsx +0 -211
  70. package/components/mobile/StreamingLogPanel.tsx +0 -230
  71. package/components/mobile/SynthesisStrategyChart.tsx +0 -140
  72. package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +0 -52
  73. package/docs/research/ABLATION-MODEL-SELECTION.md +0 -53
  74. package/docs/research/ADVANCED-EVAL-ANALYSIS.md +0 -60
  75. package/docs/research/ANOVA-RESULTS-2026-01-14.md +0 -257
  76. package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +0 -586
  77. package/docs/research/COST-ANALYSIS.md +0 -56
  78. package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +0 -340
  79. package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +0 -291
  80. package/docs/research/EVAL-SYSTEM-ANALYSIS.md +0 -306
  81. package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +0 -301
  82. package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +0 -1988
  83. package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +0 -282
  84. package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +0 -147
  85. package/docs/research/PAPER-EXTENSION-DYADIC.md +0 -204
  86. package/docs/research/PAPER-UNIFIED.md +0 -659
  87. package/docs/research/PAPER-UNIFIED.pdf +0 -0
  88. package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +0 -356
  89. package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +0 -419
  90. package/docs/research/apa.csl +0 -2133
  91. package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +0 -1637
  92. package/docs/research/archive/paper-multiagent-tutor.tex +0 -978
  93. package/docs/research/paper-draft/full-paper.md +0 -136
  94. package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
  95. package/docs/research/paper-draft/references.bib +0 -515
  96. package/docs/research/transcript-baseline.md +0 -139
  97. package/docs/research/transcript-recognition-multiagent.md +0 -187
  98. package/hooks/useEvalData.ts +0 -625
  99. package/server-init.js +0 -45
  100. package/services/benchmarkService.js +0 -1892
  101. package/types.ts +0 -165
  102. package/utils/haptics.ts +0 -45
@@ -0,0 +1,452 @@
1
+ #!/usr/bin/env python3
2
+ """Generate all paper figures from data.
3
+
4
+ Usage:
5
+ python scripts/generate-paper-figures.py
6
+
7
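+ Outputs up to 6 PNGs (figure1.png through figure6.png) to docs/research/figures/; figure6.png is skipped when the wordcloud package or its exported theme data is unavailable.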
8
+ """
9
+
10
+ import os
11
+ import matplotlib
12
+ matplotlib.use('Agg')
13
+ import matplotlib.pyplot as plt
14
+ import matplotlib.patches as mpatches
15
+ from matplotlib.patches import FancyBboxPatch, FancyArrowPatch
16
+ import numpy as np
17
+
18
# Every figure is saved under <this script's directory>/../docs/research/figures.
# The directory is created as soon as the module loads, so the figureN()
# functions can call savefig() without checking for it.
OUTPUT_DIR = os.path.join(os.path.dirname(__file__), os.pardir, 'docs', 'research', 'figures')
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Common styling: typography shared by all figures
plt.rcParams.update({
    'font.family': 'sans-serif',
    'font.size': 13,
    'axes.labelsize': 15,
    'axes.titlesize': 16,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'legend.fontsize': 12,
})

# Common styling: resolution and export settings shared by all figures
plt.rcParams.update({
    'figure.dpi': 300,
    'savefig.dpi': 300,
    'savefig.bbox': 'tight',
    'savefig.pad_inches': 0.3,
})
35
+
36
+
37
def draw_box(ax, xy, width, height, text, facecolor='#E8E8E8', edgecolor='#333333',
             fontsize=12, fontweight='normal', text_color='black', alpha=1.0, zorder=2):
    """Add a rounded rectangle to *ax* and place *text* at its centre.

    Args:
        ax: Matplotlib axes that receives the patch and the label.
        xy: (x, y) of the rectangle's lower-left corner, in data coordinates.
        width: Rectangle width.
        height: Rectangle height.
        text: Label drawn at the centre of the rectangle.
        facecolor: Fill colour of the rectangle.
        edgecolor: Outline colour of the rectangle.
        fontsize: Label font size.
        fontweight: Label font weight.
        text_color: Label colour.
        alpha: Opacity of the rectangle (the label is not affected).
        zorder: Draw order of the rectangle; the label uses ``zorder + 1`` so
            it is always rendered above its own box.
    """
    patch_style = {
        'boxstyle': "round,pad=0.02",
        'facecolor': facecolor,
        'edgecolor': edgecolor,
        'linewidth': 1.5,
        'alpha': alpha,
        'zorder': zorder,
    }
    ax.add_patch(FancyBboxPatch(xy, width, height, **patch_style))

    # The label is anchored on the geometric centre of the rectangle.
    centre_x = xy[0] + width / 2
    centre_y = xy[1] + height / 2
    label_style = {
        'ha': 'center',
        'va': 'center',
        'fontsize': fontsize,
        'fontweight': fontweight,
        'color': text_color,
        'zorder': zorder + 1,
    }
    ax.text(centre_x, centre_y, text, **label_style)
49
+
50
+
51
def draw_arrow(ax, start, end, color='#333333', style='->', lw=1.5, zorder=1):
    """Draw an arrow on *ax* running from *start* to *end*.

    The arrow is produced with an empty-text annotation: *start* is passed as
    the annotation's text position and *end* as the annotated point.

    Args:
        ax: Axes (or any object exposing ``annotate``) to draw on.
        start: (x, y) of the arrow tail.
        end: (x, y) of the arrow head.
        color: Arrow colour.
        style: Arrow style string forwarded as ``arrowstyle``.
        lw: Line width.
        zorder: Draw order of the annotation.
    """
    arrow_props = {'arrowstyle': style, 'color': color, 'lw': lw}
    ax.annotate('', xy=end, xytext=start, arrowprops=arrow_props, zorder=zorder)
56
+
57
+
58
+ # ── Figure 1: Ego/Superego Architecture ──────────────────────────────────────
59
+
60
def figure1():
    """Draw the Ego/Superego architecture diagram and save it as figure1.png.

    The layout is fully hard-coded: a dashed "Tutor System" frame containing
    the Writing Pad, Ego, Superego, verdict and Final Suggestion boxes, a
    Learner box outside the frame, and the arrows connecting them. The file
    is written to OUTPUT_DIR and its name is printed when done.
    """
    caption_grey = '#555555'
    revise_red = '#C0392B'
    accept_green = '#27AE60'
    learner_purple = '#7D3C98'

    fig, ax = plt.subplots(figsize=(12, 7))
    ax.set_xlim(0, 12)
    ax.set_ylim(0, 7)
    ax.axis('off')
    ax.set_title('Figure 1: Ego/Superego Architecture', fontsize=16, fontweight='bold', pad=15)

    # Dashed frame around everything that belongs to the tutor system
    frame = FancyBboxPatch((0.3, 0.5), 8.9, 6.0,
                           boxstyle="round,pad=0.1",
                           facecolor='#F5F5F5', edgecolor='#666666',
                           linewidth=2, linestyle='--', zorder=0)
    ax.add_patch(frame)
    ax.text(4.75, 6.2, 'Tutor System', ha='center', va='center',
            fontsize=15, fontweight='bold', color='#444444', zorder=1)

    # (lower-left corner, width, label, fill colour, font size); every box is
    # 1.2 units tall. The Learner box is the only one outside the frame.
    components = [
        ((0.8, 3.5), 2.2, 'Writing Pad\n(Memory)', '#D4E6F1', 12),
        ((4.0, 3.5), 2.0, 'Ego', '#ABEBC6', 14),
        ((4.0, 1.2), 2.0, 'Superego', '#F9E79F', 14),
        ((7.0, 1.2), 2.0, 'Accept /\nModify / Reject', '#FADBD8', 11),
        ((7.0, 3.5), 2.0, 'Final\nSuggestion', '#D5F5E3', 12),
        ((9.8, 3.5), 1.8, 'Learner', '#D7BDE2', 14),
    ]
    for corner, box_width, caption, fill, size in components:
        draw_box(ax, corner, box_width, 1.2, caption,
                 facecolor=fill, fontsize=size, fontweight='bold')

    # Straight, captioned connectors: Writing Pad -> Ego, Ego -> Superego,
    # Superego -> Accept/Modify/Reject.
    captioned_links = [
        ((3.0, 4.1), (4.0, 4.1), (3.5, 4.4), 'Memory\ntraces',
         dict(ha='center', va='bottom', fontstyle='italic')),
        ((5.0, 3.5), (5.0, 2.4), (5.3, 2.95), 'Proposal',
         dict(ha='left', va='center')),
        ((6.0, 1.8), (7.0, 1.8), (6.5, 2.0), 'Verdict',
         dict(ha='center', va='bottom')),
    ]
    for tail, head, (text_x, text_y), caption, placement in captioned_links:
        draw_arrow(ax, tail, head)
        ax.text(text_x, text_y, caption, fontsize=10, color=caption_grey, **placement)

    # Accept/Modify/Reject -> Ego (feedback loop), drawn as a curved arrow
    revise_props = dict(arrowstyle='->', color=revise_red, lw=1.5,
                        connectionstyle='arc3,rad=0.3')
    ax.annotate('', xy=(5.5, 3.5), xytext=(7.5, 2.4), arrowprops=revise_props, zorder=1)
    ax.text(7.0, 3.15, 'Revise', ha='center', va='center', fontsize=10,
            color=revise_red, fontstyle='italic')

    # Accept -> Final Suggestion
    draw_arrow(ax, (8.0, 2.4), (8.0, 3.5), color=accept_green)
    ax.text(8.3, 2.95, 'Accept', ha='left', va='center', fontsize=10, color=accept_green)

    # Final Suggestion -> Learner
    draw_arrow(ax, (9.0, 4.1), (9.8, 4.1))

    # Learner -> Writing Pad: dashed curved arrow from the Learner box back to
    # the Writing Pad box
    feedback_props = dict(arrowstyle='->', color=learner_purple, lw=1.5,
                          connectionstyle='arc3,rad=-0.5',
                          linestyle='dashed')
    ax.annotate('', xy=(1.9, 3.5), xytext=(10.7, 3.5), arrowprops=feedback_props, zorder=1)
    ax.text(6.0, 0.85, 'Learner responses shape future encounters',
            ha='center', va='center', fontsize=10, fontstyle='italic', color=learner_purple)

    target = os.path.join(OUTPUT_DIR, 'figure1.png')
    fig.savefig(target)
    plt.close(fig)
    print(' figure1.png')
141
+
142
+
143
+ # ── Figure 2: Recognition vs. Baseline Response Flow ─────────────────────────
144
+
145
def figure2():
    """Draw the side-by-side Baseline vs. Recognition flow chart (figure2.png).

    Both panels share one layout: the learner quote on top, three labelled
    steps each paired with an example tutor utterance, and an outcome box at
    the bottom. Only the texts and colours differ, so a single nested helper
    renders each panel. The file is written to OUTPUT_DIR and its name is
    printed when done.
    """
    learner_quote = '"I think dialectics is\nlike a spiral..."'

    def draw_panel(ax, heading, stages, stage_fill, stage_edge,
                   outcome, outcome_fill, outcome_edge):
        """Render one flow column on *ax* using a 10 x 10 coordinate grid."""
        ax.set_xlim(0, 10)
        ax.set_ylim(0, 10)
        ax.axis('off')
        ax.set_title(heading, fontsize=14, fontweight='bold', pad=10)

        # Learner quote
        draw_box(ax, (1.5, 8.2), 7, 1.1, learner_quote,
                 facecolor='#EBF5FB', edgecolor='#2980B9', fontsize=11, fontweight='normal')

        # Step box on the left, example utterance in a white callout on the right
        callout = dict(boxstyle='round,pad=0.3', facecolor='white',
                       edgecolor='#CCCCCC', alpha=0.8)
        for bottom, name, sample in stages:
            draw_box(ax, (1.5, bottom), 3.0, 1.1, name,
                     facecolor=stage_fill, edgecolor=stage_edge, fontsize=12, fontweight='bold')
            ax.text(6.5, bottom + 0.55, sample, ha='center', va='center',
                    fontsize=10, fontstyle='italic', color='#555555', bbox=callout)

        # Outcome
        draw_box(ax, (1.5, 2.2), 7, 1.2, outcome,
                 facecolor=outcome_fill, edgecolor=outcome_edge, fontsize=11, fontweight='bold')

        # Arrows: quote -> step 1 -> step 2 -> step 3 -> outcome
        links = (
            ((5.0, 8.2), (3.0, 7.9)),
            ((3.0, 6.8), (3.0, 6.5)),
            ((3.0, 5.4), (3.0, 5.1)),
            ((3.0, 4.0), (3.0, 3.4)),
        )
        for tail, head in links:
            draw_arrow(ax, tail, head)

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 8))
    fig.suptitle('Figure 2: Recognition vs. Baseline Response Flow', fontsize=16, fontweight='bold', y=0.97)

    # ── Left: Baseline Flow ──
    baseline_stages = [
        (6.8, 'Acknowledge', '"That\'s an interesting\nobservation..."'),
        (5.4, 'Redirect', '"Let me explain what\ndialectics actually is..."'),
        (4.0, 'Instruct', '"Dialectics involves\nthesis, antithesis,\nsynthesis..."'),
    ]
    draw_panel(ax1, 'Baseline Flow', baseline_stages, '#D5D8DC', '#5D6D7E',
               'WAYPOINT\nLearner acknowledged, then redirected', '#FADBD8', '#E74C3C')

    # ── Right: Recognition Flow ──
    recognition_stages = [
        (6.8, 'Engage', '"A spiral—that\'s a\npowerful metaphor..."'),
        (5.4, 'Explore', '"What makes you see\nit as circular rather\nthan linear?"'),
        (4.0, 'Synthesize', '"Your spiral captures\nsomething the textbook\nmisses..."'),
    ]
    draw_panel(ax2, 'Recognition Flow', recognition_stages, '#D5F5E3', '#27AE60',
               'SITE OF JOINT INQUIRY\nLearner\'s understanding shapes interaction',
               '#D5F5E3', '#27AE60')

    # Leave headroom for the suptitle
    fig.tight_layout(rect=[0, 0, 1, 0.94])
    target = os.path.join(OUTPUT_DIR, 'figure2.png')
    fig.savefig(target)
    plt.close(fig)
    print(' figure2.png')
223
+
224
+
225
+ # ── Figure 3: Recognition Effect Decomposition ───────────────────────────────
226
+
227
def figure3():
    """Draw the recognition-effect decomposition bar and save figure3.png.

    A single stacked horizontal bar splits the hard-coded total improvement
    (20.1 pts) into its prompt-engineering share (11.4 pts) and its
    recognition-unique share (8.7 pts); each share's percentage of the total
    is computed here. The file is written to OUTPUT_DIR and its name is
    printed when done.
    """
    total = 20.1
    bar_height = 0.5
    row = 0

    # (legend name, points, fill, edge, colour of the on-bar label)
    segments = [
        ('Prompt Engineering', 11.4, '#85C1E9', '#2471A3', '#1A5276'),
        ('Recognition Unique', 8.7, '#82E0AA', '#1E8449', '#145A32'),
    ]

    fig, ax = plt.subplots(figsize=(10, 4))

    # Stacked horizontal bar: each segment starts where the previous one ended
    offset = 0.0
    label_specs = []
    for name, points, fill, edge, ink in segments:
        share = points / total * 100
        ax.barh(row, points, height=bar_height, left=offset, color=fill,
                edgecolor=edge, linewidth=1.5,
                label=f'{name}: +{points} pts ({share:.0f}%)')
        label_specs.append((offset + points / 2, f'+{points} pts\n({share:.0f}%)', ink))
        offset += points

    # Labels on bars, centred within their segment
    for centre, caption, ink in label_specs:
        ax.text(centre, row, caption,
                ha='center', va='center', fontsize=13, fontweight='bold', color=ink)

    # Total label, just to the right of the full bar
    ax.text(total + 0.3, row, f'Total: +{total} pts',
            ha='left', va='center', fontsize=13, fontweight='bold', color='#333333')

    ax.set_xlim(0, 26)
    ax.set_ylim(-0.8, 0.8)
    ax.set_xlabel('Score Improvement (points)', fontsize=14)
    ax.set_yticks([])
    ax.set_title('Figure 3: Recognition Effect Decomposition\n(Base → Enhanced → Recognition)',
                 fontsize=15, fontweight='bold')
    ax.legend(loc='upper right', fontsize=11, framealpha=0.9)
    for side in ('top', 'right', 'left'):
        ax.spines[side].set_visible(False)

    fig.tight_layout()
    target = os.path.join(OUTPUT_DIR, 'figure3.png')
    fig.savefig(target)
    plt.close(fig)
    print(' figure3.png')
272
+
273
+
274
+ # ── Figure 4: Multi-Agent Synergy by Prompt Type ─────────────────────────────
275
+
276
def figure4():
    """Draw the single- vs. multi-agent comparison chart and save figure4.png.

    Grouped horizontal bars show hard-coded mean scores for recognition and
    enhanced prompts under single-agent and multi-agent configurations, with
    a delta label per prompt type. A delta containing ``**`` is drawn in red
    and explained by the footnote. The file is written to OUTPUT_DIR and its
    name is printed when done.
    """
    categories = ['Recognition\nPrompts', 'Enhanced\nPrompts']
    single = [72.2, 83.3]
    multi = [81.5, 83.3]
    deltas = ['+9.2**', '+0.0']

    bar_height = 0.3
    half = bar_height / 2
    y = np.arange(len(categories))

    fig, ax = plt.subplots(figsize=(10, 5.5))

    # Single-agent bars sit above the tick position, multi-agent bars below it
    single_bars = ax.barh(y + half, single, bar_height, color='#85C1E9',
                          edgecolor='#2471A3', linewidth=1.5, label='Single-Agent')
    multi_bars = ax.barh(y - half, multi, bar_height, color='#82E0AA',
                         edgecolor='#1E8449', linewidth=1.5, label='Multi-Agent')

    # Score labels, just past the end of each bar
    for bars, values in ((single_bars, single), (multi_bars, multi)):
        for bar, val in zip(bars, values):
            mid_y = bar.get_y() + bar.get_height() / 2
            ax.text(val + 0.5, mid_y, f'{val}',
                    va='center', fontsize=12, fontweight='bold')

    # Delta labels, placed beyond the longer bar of each pair
    for row, (delta, one, many) in enumerate(zip(deltas, single, multi)):
        if '**' in delta:
            ink = '#C0392B'
        else:
            ink = '#555555'
        ax.text(max(one, many) + 4.5, y[row],
                f'Δ {delta}', ha='center', va='center',
                fontsize=12, fontweight='bold',
                color=ink)

    ax.set_xlim(0, 100)
    ax.set_yticks(y)
    ax.set_yticklabels(categories, fontsize=13)
    ax.set_xlabel('Mean Score', fontsize=14)
    ax.set_title('Figure 4: Multi-Agent Synergy by Prompt Type\n(Preliminary N=36)',
                 fontsize=15, fontweight='bold')
    ax.legend(loc='lower right', fontsize=12, framealpha=0.9)
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)

    footnote = ('** Significant synergy effect (p < .05); however, this did not replicate\n'
                ' in the 5-model probe (N=826, mean interaction = −2.2 pts)')
    fig.text(0.12, 0.02, footnote,
             fontsize=11, fontstyle='italic', color='#777777')

    # Reserve the bottom 10% of the canvas for the footnote
    fig.tight_layout(rect=[0, 0.1, 1, 1])
    target = os.path.join(OUTPUT_DIR, 'figure4.png')
    fig.savefig(target)
    plt.close(fig)
    print(' figure4.png')
325
+
326
+
327
+ # ── Figure 5: Factor Effects Invert by Domain ────────────────────────────────
328
+
329
def figure5():
    """Draw the per-domain factor-effect chart and save it as figure5.png.

    Grouped horizontal bars compare three hard-coded factor effects
    (recognition, multi-agent, learner) between the Philosophy and Elementary
    Math domains. The file is written to OUTPUT_DIR and its name is printed
    when done.

    Both series are labelled through one sign-aware helper. Previously only
    the Philosophy labels handled negative values; the Elementary labels
    always prefixed ``+`` and were offset to the right, which would have
    produced ``+-x`` on top of the bar for a negative effect. Output for the
    current (all non-negative) Elementary values is unchanged.
    """
    fig, ax = plt.subplots(figsize=(10, 5.5))

    factors = ['A: Recognition\nEffect', 'B: Multi-Agent\nEffect', 'C: Learner\nEffect']
    phil = [15.4, -0.8, 2.1]
    elem = [4.4, 9.9, 0.75]

    y = np.arange(len(factors))
    bar_height = 0.3

    # Philosophy bars sit above the tick position, Elementary Math bars below
    bars_phil = ax.barh(y + bar_height / 2, phil, bar_height, color='#5DADE2',
                        edgecolor='#2471A3', linewidth=1.5, label='Philosophy')
    bars_elem = ax.barh(y - bar_height / 2, elem, bar_height, color='#F0B27A',
                        edgecolor='#CA6F1E', linewidth=1.5, label='Elementary Math')

    def label_bars(bars, values, color):
        """Write each signed value just beyond the end of its bar."""
        for bar, val in zip(bars, values):
            positive = val >= 0
            # Positive bars grow rightwards, negative bars leftwards, so the
            # label offset and its alignment flip with the sign.
            ax.text(val + (0.3 if positive else -0.3),
                    bar.get_y() + bar.get_height() / 2,
                    f'+{val}' if positive else f'{val}',
                    va='center', ha='left' if positive else 'right',
                    fontsize=12, fontweight='bold', color=color)

    # Score labels
    label_bars(bars_phil, phil, '#1A5276')
    label_bars(bars_elem, elem, '#784212')

    ax.set_xlim(-2, 18)
    # Zero line, so negative effects read clearly
    ax.axvline(x=0, color='#999999', linewidth=0.8, linestyle='-')
    ax.set_yticks(y)
    ax.set_yticklabels(factors, fontsize=13)
    ax.set_xlabel('Effect Size (points)', fontsize=14)
    ax.set_title('Figure 5: Factor Effects Invert by Domain',
                 fontsize=15, fontweight='bold')
    ax.legend(loc='lower right', fontsize=12, framealpha=0.9)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    fig.text(0.12, 0.02,
             'Factor dominance inverts: Philosophy favors recognition (A); Elementary favors architecture (B).\n'
             'Elementary recognition partially model-dependent (Kimi shows d ≈ 0.61).',
             fontsize=11, fontstyle='italic', color='#777777')

    # Reserve the bottom 10% of the canvas for the footnote
    fig.tight_layout(rect=[0, 0.1, 1, 1])
    fig.savefig(os.path.join(OUTPUT_DIR, 'figure5.png'))
    plt.close(fig)
    print(' figure5.png')
375
+
376
+
377
+ # ── Figure 6: Emergent Theme Word Clouds ──────────────────────────────────────
378
+
379
def _split_theme_frequencies(themes, min_total=3):
    """Split theme records into per-condition ``{label: count}`` dicts.

    A theme is kept only when its 'base' and 'recognition' counts (each
    defaulting to 0 when absent) sum to at least *min_total*; within a kept
    theme, a condition with a zero count is left out of that condition's dict.
    """
    base_freq = {}
    recog_freq = {}
    for theme in themes.values():
        label = theme['label']
        base_count = theme.get('base', 0)
        recog_count = theme.get('recognition', 0)
        if base_count + recog_count < min_total:
            continue
        if base_count > 0:
            base_freq[label] = base_count
        if recog_count > 0:
            recog_freq[label] = recog_count
    return base_freq, recog_freq


def figure6():
    """Render side-by-side theme word clouds and save them as figure6.png.

    Theme frequencies are read from the exported qualitative-analysis JSON
    under ``../exports`` relative to this script. The figure is optional:
    instead of raising, the function prints a SKIPPED notice and returns when
    the ``wordcloud`` package is not installed, when the export file is
    missing, or when either condition has no theme that meets the frequency
    threshold (a word cloud cannot be generated from an empty frequency
    table).

    Raises:
        KeyError: If the JSON lacks ``discovery.analysis.themeFrequency`` or
            a theme record has no ``label``.
    """
    try:
        from wordcloud import WordCloud
    except ImportError:
        print(' figure6.png SKIPPED (pip install wordcloud)')
        return

    import json
    data_path = os.path.join(os.path.dirname(__file__), '..', 'exports',
                             'qualitative-ai-claude-code-sample300-2026-02-08.json')
    if not os.path.exists(data_path):
        print(' figure6.png SKIPPED (discovery data not found)')
        return

    # JSON is UTF-8 by specification; do not rely on the platform default
    # encoding, which is not UTF-8 everywhere.
    with open(data_path, encoding='utf-8') as f:
        data = json.load(f)

    themes = data['discovery']['analysis']['themeFrequency']
    base_freq, recog_freq = _split_theme_frequencies(themes)
    if not base_freq or not recog_freq:
        print(' figure6.png SKIPPED (no themes meet the frequency threshold)')
        return

    def make_cloud(frequencies, colormap):
        """Build one cloud; both panels differ only in data and colormap."""
        return WordCloud(
            width=1200, height=800, background_color='white', colormap=colormap,
            max_words=30, max_font_size=120, min_font_size=14,
            prefer_horizontal=0.85, relative_scaling=0.5, margin=10,
            collocations=False,
        ).generate_from_frequencies(frequencies)

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 7))

    panels = (
        (ax1, make_cloud(base_freq, 'OrRd'), 'Base Condition'),
        (ax2, make_cloud(recog_freq, 'YlGn'), 'Recognition Condition'),
    )
    for ax, cloud, heading in panels:
        ax.imshow(cloud, interpolation='bilinear')
        ax.set_title(heading, fontsize=18, fontweight='bold', pad=15)
        ax.axis('off')

    fig.suptitle('Figure 6: Emergent Theme Word Clouds (AI Discovery, N=300)',
                 fontsize=16, fontweight='bold', y=0.98)
    # Leave headroom for the suptitle
    fig.tight_layout(rect=[0, 0.02, 1, 0.94])
    fig.savefig(os.path.join(OUTPUT_DIR, 'figure6.png'))
    plt.close(fig)
    print(' figure6.png')
440
+
441
+
442
+ # ── Main ──────────────────────────────────────────────────────────────────────
443
+
444
if __name__ == '__main__':
    # Render every figure in paper order, then report where the files went.
    print('Generating paper figures...')
    for render in (figure1, figure2, figure3, figure4, figure5, figure6):
        render()
    print(f'Done. Output: {os.path.abspath(OUTPUT_DIR)}/')