@machinespirits/eval 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. package/README.md +91 -9
  2. package/config/eval-settings.yaml +3 -3
  3. package/config/paper-manifest.json +486 -0
  4. package/config/providers.yaml +9 -6
  5. package/config/tutor-agents.yaml +2261 -0
  6. package/content/README.md +23 -0
  7. package/content/courses/479/course.md +53 -0
  8. package/content/courses/479/lecture-1.md +361 -0
  9. package/content/courses/479/lecture-2.md +360 -0
  10. package/content/courses/479/lecture-3.md +655 -0
  11. package/content/courses/479/lecture-4.md +530 -0
  12. package/content/courses/479/lecture-5.md +326 -0
  13. package/content/courses/479/lecture-6.md +346 -0
  14. package/content/courses/479/lecture-7.md +326 -0
  15. package/content/courses/479/lecture-8.md +273 -0
  16. package/content/courses/479/roadmap-slides.md +656 -0
  17. package/content/manifest.yaml +8 -0
  18. package/docs/research/apa.csl +2133 -0
  19. package/docs/research/build.sh +98 -0
  20. package/docs/research/figures/figure1.png +0 -0
  21. package/docs/research/figures/figure10.png +0 -0
  22. package/docs/research/figures/figure11.png +0 -0
  23. package/docs/research/figures/figure2.png +0 -0
  24. package/docs/research/figures/figure3.png +0 -0
  25. package/docs/research/figures/figure4.png +0 -0
  26. package/docs/research/figures/figure5.png +0 -0
  27. package/docs/research/figures/figure6.png +0 -0
  28. package/docs/research/figures/figure7.png +0 -0
  29. package/docs/research/figures/figure8.png +0 -0
  30. package/docs/research/figures/figure9.png +0 -0
  31. package/docs/research/header.tex +25 -0
  32. package/docs/research/paper-full.md +2565 -0
  33. package/docs/research/paper-short.md +436 -0
  34. package/docs/research/references.bib +1143 -0
  35. package/docs/research/slides-header.tex +188 -0
  36. package/docs/research/slides-pptx.md +363 -0
  37. package/docs/research/slides.md +531 -0
  38. package/docs/research/style-reference-pptx.py +199 -0
  39. package/package.json +5 -5
  40. package/scripts/analyze-eval-results.js +69 -17
  41. package/scripts/analyze-mechanism-traces.js +763 -0
  42. package/scripts/analyze-modulation-learning.js +498 -0
  43. package/scripts/analyze-prosthesis.js +144 -0
  44. package/scripts/analyze-run.js +264 -79
  45. package/scripts/assess-transcripts.js +853 -0
  46. package/scripts/browse-transcripts.js +854 -0
  47. package/scripts/check-parse-failures.js +73 -0
  48. package/scripts/code-dialectical-modulation.js +1320 -0
  49. package/scripts/download-data.sh +55 -0
  50. package/scripts/eval-cli.js +106 -18
  51. package/scripts/generate-paper-figures.js +663 -0
  52. package/scripts/generate-paper-figures.py +577 -76
  53. package/scripts/generate-paper-tables.js +299 -0
  54. package/scripts/qualitative-analysis-ai.js +3 -3
  55. package/scripts/render-sequence-diagram.js +694 -0
  56. package/scripts/test-latency.js +210 -0
  57. package/scripts/test-rate-limit.js +95 -0
  58. package/scripts/test-token-budget.js +332 -0
  59. package/scripts/validate-paper-manifest.js +670 -0
  60. package/services/__tests__/evalConfigLoader.test.js +2 -2
  61. package/services/__tests__/learnerRubricEvaluator.test.js +361 -0
  62. package/services/__tests__/learnerTutorInteractionEngine.test.js +326 -0
  63. package/services/evaluationRunner.js +975 -98
  64. package/services/evaluationStore.js +12 -4
  65. package/services/learnerTutorInteractionEngine.js +27 -2
  66. package/services/mockProvider.js +133 -0
  67. package/services/promptRewriter.js +1471 -5
  68. package/services/rubricEvaluator.js +55 -2
  69. package/services/transcriptFormatter.js +675 -0
  70. package/config/machinespirits-eval.code-workspace +0 -11
  71. package/docs/EVALUATION-VARIABLES.md +0 -589
  72. package/docs/REPLICATION-PLAN.md +0 -577
  73. package/scripts/analyze-run.mjs +0 -282
  74. package/scripts/compare-runs.js +0 -44
  75. package/scripts/compare-suggestions.js +0 -80
  76. package/scripts/dig-into-run.js +0 -158
  77. package/scripts/show-failed-suggestions.js +0 -64
  78. /package/scripts/{check-run.mjs → check-run.js} +0 -0
@@ -4,10 +4,15 @@
4
4
  Usage:
5
5
  python scripts/generate-paper-figures.py
6
6
 
7
- Outputs 5 PNGs to docs/research/figures/
7
+ Reads config/paper-manifest.json and queries data/evaluations.db to produce
8
+ data-driven figures. Falls back to hardcoded values if DB is unavailable.
9
+
10
+ Outputs PNGs to docs/research/figures/
8
11
  """
9
12
 
10
13
  import os
14
+ import json
15
+ import sqlite3
11
16
  import matplotlib
12
17
  matplotlib.use('Agg')
13
18
  import matplotlib.pyplot as plt
@@ -15,9 +20,78 @@ import matplotlib.patches as mpatches
15
20
  from matplotlib.patches import FancyBboxPatch, FancyArrowPatch
16
21
  import numpy as np
17
22
 
18
- OUTPUT_DIR = os.path.join(os.path.dirname(__file__), '..', 'docs', 'research', 'figures')
23
+ SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
24
+ ROOT_DIR = os.path.join(SCRIPT_DIR, '..')
25
+ OUTPUT_DIR = os.path.join(ROOT_DIR, 'docs', 'research', 'figures')
26
+ MANIFEST_PATH = os.path.join(ROOT_DIR, 'config', 'paper-manifest.json')
27
+ DB_PATH = os.path.join(ROOT_DIR, 'data', 'evaluations.db')
28
+
19
29
  os.makedirs(OUTPUT_DIR, exist_ok=True)
20
30
 
31
+ # ── Data Layer ───────────────────────────────────────────────────────────────
32
+
33
+ _manifest = None
34
+ _db = None
35
+
36
+ def get_manifest():
37
+ global _manifest
38
+ if _manifest is None and os.path.exists(MANIFEST_PATH):
39
+ with open(MANIFEST_PATH) as f:
40
+ _manifest = json.load(f)
41
+ return _manifest
42
+
43
+ def get_db():
44
+ global _db
45
+ if _db is None and os.path.exists(DB_PATH):
46
+ _db = sqlite3.connect(DB_PATH)
47
+ _db.row_factory = sqlite3.Row
48
+ return _db
49
+
50
+ def query_cell_means(run_ids, judge_filter='claude-opus%'):
51
+ """Query mean overall_score per profile (cell) for given runs."""
52
+ db = get_db()
53
+ if not db:
54
+ return {}
55
+ placeholders = ','.join('?' * len(run_ids))
56
+ rows = db.execute(f"""
57
+ SELECT profile_name, AVG(overall_score) as mean, COUNT(*) as n,
58
+ -- stdev via manual calculation
59
+ AVG(overall_score * overall_score) - AVG(overall_score) * AVG(overall_score) as var
60
+ FROM evaluation_results
61
+ WHERE run_id IN ({placeholders})
62
+ AND judge_model LIKE ?
63
+ AND overall_score IS NOT NULL
64
+ GROUP BY profile_name
65
+ """, [*run_ids, judge_filter]).fetchall()
66
+ return {r['profile_name']: {'mean': r['mean'], 'n': r['n'],
67
+ 'sd': (r['var'] ** 0.5) if r['var'] and r['var'] > 0 else 0}
68
+ for r in rows}
69
+
70
+ def extract_cell_number(profile_name):
71
+ """Extract cell number from profile_name like 'cell_5_recog_single_unified'."""
72
+ parts = profile_name.split('_')
73
+ if len(parts) >= 2 and parts[0] == 'cell':
74
+ try:
75
+ return int(parts[1])
76
+ except ValueError:
77
+ pass
78
+ return None
79
+
80
+ def compute_2x2_effects(cell_means, base_single, base_multi, recog_single, recog_multi):
81
+ """Compute recognition effect, architecture effect, and interaction from 4 cell means."""
82
+ bs = cell_means.get(base_single, {}).get('mean')
83
+ bm = cell_means.get(base_multi, {}).get('mean')
84
+ rs = cell_means.get(recog_single, {}).get('mean')
85
+ rm = cell_means.get(recog_multi, {}).get('mean')
86
+ if None in (bs, bm, rs, rm):
87
+ return None
88
+ recog_effect = ((rs + rm) / 2) - ((bs + bm) / 2)
89
+ arch_effect = ((bm + rm) / 2) - ((bs + rs) / 2)
90
+ interaction = (rm - rs) - (bm - bs)
91
+ return {'recog_effect': recog_effect, 'arch_effect': arch_effect,
92
+ 'interaction': interaction,
93
+ 'means': {'bs': bs, 'bm': bm, 'rs': rs, 'rm': rm}}
94
+
21
95
  # Common styling
22
96
  plt.rcParams.update({
23
97
  'font.size': 13,
@@ -227,9 +301,9 @@ def figure2():
227
301
  def figure3():
228
302
  fig, ax = plt.subplots(figsize=(10, 4))
229
303
 
230
- total = 20.1
231
- prompt_eng = 11.4
232
- recog_unique = 8.7
304
+ total = 19.7
305
+ prompt_eng = 11.6
306
+ recog_unique = 8.0
233
307
  prompt_pct = prompt_eng / total * 100 # 57%
234
308
  recog_pct = recog_unique / total * 100 # 43%
235
309
 
@@ -274,51 +348,89 @@ def figure3():
274
348
  # ── Figure 4: Multi-Agent Synergy by Prompt Type ─────────────────────────────
275
349
 
276
350
  def figure4():
277
- fig, ax = plt.subplots(figsize=(10, 5.5))
278
-
279
- categories = ['Recognition\nPrompts', 'Enhanced\nPrompts']
280
- single = [72.2, 83.3]
281
- multi = [81.5, 83.3]
282
- deltas = ['+9.2**', '+0.0']
351
+ """Multi-model A×B interaction probe (Table 8, N=655 across 5 ego models).
352
+ Shows recognition effect and A×B interaction per model, confirming
353
+ architecture is additive, not synergistic."""
283
354
 
284
- y = np.arange(len(categories))
285
- bar_height = 0.3
286
-
287
- bars1 = ax.barh(y + bar_height/2, single, bar_height, color='#85C1E9',
288
- edgecolor='#2471A3', linewidth=1.5, label='Single-Agent')
289
- bars2 = ax.barh(y - bar_height/2, multi, bar_height, color='#82E0AA',
290
- edgecolor='#1E8449', linewidth=1.5, label='Multi-Agent')
355
+ fig, ax = plt.subplots(figsize=(10, 5.5))
291
356
 
292
- # Score labels
293
- for bar, val in zip(bars1, single):
294
- ax.text(val + 0.5, bar.get_y() + bar.get_height()/2, f'{val}',
295
- va='center', fontsize=12, fontweight='bold')
296
- for bar, val in zip(bars2, multi):
297
- ax.text(val + 0.5, bar.get_y() + bar.get_height()/2, f'{val}',
298
- va='center', fontsize=12, fontweight='bold')
299
-
300
- # Delta labels
301
- for i, delta in enumerate(deltas):
302
- ax.text(max(single[i], multi[i]) + 4.5, y[i],
303
- f'Δ {delta}', ha='center', va='center',
304
- fontsize=12, fontweight='bold',
305
- color='#C0392B' if '**' in delta else '#555555')
306
-
307
- ax.set_xlim(0, 100)
308
- ax.set_yticks(y)
309
- ax.set_yticklabels(categories, fontsize=13)
310
- ax.set_xlabel('Mean Score', fontsize=14)
311
- ax.set_title('Figure 4: Multi-Agent Synergy by Prompt Type\n(Preliminary N=36)',
357
+ manifest = get_manifest()
358
+ fig4_config = manifest['figures']['figure4'] if manifest else None
359
+
360
+ # Try data-driven from DB
361
+ models = []
362
+ recog_effect = []
363
+ ab_interaction = []
364
+ data_driven = False
365
+
366
+ if fig4_config and get_db():
367
+ for key in ['kimi', 'nemotron', 'deepseek', 'glm', 'haiku']:
368
+ cfg = fig4_config['runs'][key]
369
+ cell_means = query_cell_means(cfg['run_ids'], fig4_config['judge_filter'])
370
+ if not cell_means:
371
+ break
372
+ effects = compute_2x2_effects(
373
+ cell_means,
374
+ 'cell_1_base_single_unified', 'cell_3_base_multi_unified',
375
+ 'cell_5_recog_single_unified', 'cell_7_recog_multi_unified')
376
+ if not effects:
377
+ break
378
+ total_n = sum(v['n'] for v in cell_means.values())
379
+ models.append(f"{cfg['label']}\n(N={total_n})")
380
+ recog_effect.append(round(effects['recog_effect'], 1))
381
+ ab_interaction.append(round(effects['interaction'], 1))
382
+ else:
383
+ data_driven = True
384
+ print(' [data-driven from DB]')
385
+
386
+ if not data_driven:
387
+ # Fallback hardcoded values
388
+ models = ['Kimi K2.5\n(N=179)', 'Nemotron\n(N=119)', 'DeepSeek\n(N=120)',
389
+ 'GLM-4.7\n(N=117)', 'Haiku 4.5\n(N=120)']
390
+ recog_effect = [15.5, 16.0, 14.0, 17.8, 9.6]
391
+ ab_interaction = [0.5, -5.7, -1.4, -0.7, -1.6]
392
+ print(' [hardcoded fallback]')
393
+
394
+ x = np.arange(len(models))
395
+ w = 0.35
396
+
397
+ bars_r = ax.bar(x - w/2, recog_effect, w, label='Recognition Effect (A)',
398
+ color='#27AE60', edgecolor='#1E8449', linewidth=1.2)
399
+ bars_i = ax.bar(x + w/2, ab_interaction, w, label='A×B Interaction',
400
+ color='#E74C3C', edgecolor='#C0392B', linewidth=1.2)
401
+
402
+ # Value labels
403
+ for bar in bars_r:
404
+ ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.3,
405
+ f'+{bar.get_height():.1f}', ha='center', va='bottom',
406
+ fontsize=10, fontweight='bold', color='#1E8449')
407
+ for bar in bars_i:
408
+ val = bar.get_height()
409
+ y_pos = val - 0.5 if val < 0 else val + 0.3
410
+ va = 'top' if val < 0 else 'bottom'
411
+ sign = '+' if val > 0 else ''
412
+ ax.text(bar.get_x() + bar.get_width()/2, y_pos,
413
+ f'{sign}{val:.1f}', ha='center', va=va,
414
+ fontsize=10, fontweight='bold', color='#C0392B')
415
+
416
+ ax.axhline(0, color='#999', linewidth=0.8, linestyle='-')
417
+ ax.set_ylim(-8, 22)
418
+ ax.set_xticks(x)
419
+ ax.set_xticklabels(models, fontsize=11)
420
+ ax.set_ylabel('Effect Size (points)', fontsize=14)
421
+ ax.set_title('Figure 4: Architecture is Additive, Not Synergistic\n'
422
+ '(Multi-Model A×B Probe, N=655, Opus Judge)',
312
423
  fontsize=15, fontweight='bold')
313
- ax.legend(loc='lower right', fontsize=12, framealpha=0.9)
424
+ ax.legend(loc='upper right', fontsize=12, framealpha=0.9)
314
425
  ax.spines['top'].set_visible(False)
315
426
  ax.spines['right'].set_visible(False)
316
427
 
317
- fig.text(0.12, 0.02, '** Significant synergy effect (p < .05); however, this did not replicate\n'
318
- ' in the 5-model probe (N=826, mean interaction = −2.2 pts)',
428
+ fig.text(0.10, 0.02,
429
+ 'Recognition effect replicates across all 5 models (+9.6 to +17.8). '
430
+ 'A×B interaction is negligible (mean −1.8 pts).',
319
431
  fontsize=11, fontstyle='italic', color='#777777')
320
432
 
321
- fig.tight_layout(rect=[0, 0.1, 1, 1])
433
+ fig.tight_layout(rect=[0, 0.08, 1, 1])
322
434
  fig.savefig(os.path.join(OUTPUT_DIR, 'figure4.png'))
323
435
  plt.close(fig)
324
436
  print(' figure4.png')
@@ -327,19 +439,53 @@ def figure4():
327
439
  # ── Figure 5: Factor Effects Invert by Domain ────────────────────────────────
328
440
 
329
441
  def figure5():
442
+ """Factor effects by domain using Kimi K2.5 for both domains.
443
+ Elementary: eval-2026-02-05-e87f452d (N=60, cells 1,3,5,7).
444
+ Philosophy: factorial cells 1,3,5,7 (N=179)."""
445
+
330
446
  fig, ax = plt.subplots(figsize=(10, 5.5))
331
447
 
332
- factors = ['A: Recognition\nEffect', 'B: Multi-Agent\nEffect', 'C: Learner\nEffect']
333
- phil = [15.4, -0.8, 2.1]
334
- elem = [4.4, 9.9, 0.75]
448
+ manifest = get_manifest()
449
+ fig5_config = manifest['figures']['figure5'] if manifest else None
450
+
451
+ factors = ['A: Recognition\nEffect', 'B: Multi-Agent\nEffect']
452
+ phil = None
453
+ elem = None
454
+ phil_n = 179
455
+ elem_n = 60
456
+ data_driven = False
457
+
458
+ if fig5_config and get_db():
459
+ # Philosophy: factorial single-learner cells
460
+ phil_means = query_cell_means([fig5_config['runs']['philosophy']], fig5_config['judge_filter'])
461
+ elem_means = query_cell_means([fig5_config['runs']['elementary']], fig5_config['judge_filter'])
462
+ if phil_means and elem_means:
463
+ phil_fx = compute_2x2_effects(phil_means,
464
+ 'cell_1_base_single_unified', 'cell_3_base_multi_unified',
465
+ 'cell_5_recog_single_unified', 'cell_7_recog_multi_unified')
466
+ elem_fx = compute_2x2_effects(elem_means,
467
+ 'cell_1_base_single_unified', 'cell_3_base_multi_unified',
468
+ 'cell_5_recog_single_unified', 'cell_7_recog_multi_unified')
469
+ if phil_fx and elem_fx:
470
+ phil = [round(phil_fx['recog_effect'], 1), round(phil_fx['arch_effect'], 1)]
471
+ elem = [round(elem_fx['recog_effect'], 1), round(elem_fx['arch_effect'], 1)]
472
+ phil_n = sum(v['n'] for v in phil_means.values())
473
+ elem_n = sum(v['n'] for v in elem_means.values())
474
+ data_driven = True
475
+ print(' [data-driven from DB]')
476
+
477
+ if not data_driven:
478
+ phil = [15.4, -0.8]
479
+ elem = [9.9, 3.0]
480
+ print(' [hardcoded fallback]')
335
481
 
336
482
  y = np.arange(len(factors))
337
483
  bar_height = 0.3
338
484
 
339
485
  bars_phil = ax.barh(y + bar_height/2, phil, bar_height, color='#5DADE2',
340
- edgecolor='#2471A3', linewidth=1.5, label='Philosophy')
486
+ edgecolor='#2471A3', linewidth=1.5, label=f'Philosophy (Kimi, N={phil_n})')
341
487
  bars_elem = ax.barh(y - bar_height/2, elem, bar_height, color='#F0B27A',
342
- edgecolor='#CA6F1E', linewidth=1.5, label='Elementary Math')
488
+ edgecolor='#CA6F1E', linewidth=1.5, label=f'Elementary Math (Kimi, N={elem_n})')
343
489
 
344
490
  # Score labels
345
491
  for bar, val in zip(bars_phil, phil):
@@ -357,15 +503,15 @@ def figure5():
357
503
  ax.set_yticks(y)
358
504
  ax.set_yticklabels(factors, fontsize=13)
359
505
  ax.set_xlabel('Effect Size (points)', fontsize=14)
360
- ax.set_title('Figure 5: Factor Effects Invert by Domain',
506
+ ax.set_title('Figure 5: Factor Effects by Domain (Kimi K2.5)',
361
507
  fontsize=15, fontweight='bold')
362
508
  ax.legend(loc='lower right', fontsize=12, framealpha=0.9)
363
509
  ax.spines['top'].set_visible(False)
364
510
  ax.spines['right'].set_visible(False)
365
511
 
366
512
  fig.text(0.12, 0.02,
367
- 'Factor dominance inverts: Philosophy favors recognition (A); Elementary favors architecture (B).\n'
368
- 'Elementary recognition partially model-dependent (Kimi shows d 0.61).',
513
+ 'Recognition dominates in both domains. Architecture provides small additive benefit\n'
514
+ 'on elementary content (+3.0 pts) and negligible effect on philosophy (−0.8 pts).',
369
515
  fontsize=11, fontstyle='italic', color='#777777')
370
516
 
371
517
  fig.tight_layout(rect=[0, 0.1, 1, 1])
@@ -374,64 +520,127 @@ def figure5():
374
520
  print(' figure5.png')
375
521
 
376
522
 
377
- # ── Figure 6: Emergent Theme Word Clouds ──────────────────────────────────────
523
+ # ── Figure 6: Tutor Language Word Clouds ──────────────────────────────────────
378
524
 
379
525
  def figure6():
526
+ """Word clouds from actual tutor transcript text (N=350 factorial responses).
527
+ Shows the raw linguistic differences between base and recognition conditions,
528
+ complementing the AI theme coding in Tables 17b–d."""
529
+
380
530
  try:
381
531
  from wordcloud import WordCloud
382
532
  except ImportError:
383
533
  print(' figure6.png SKIPPED (pip install wordcloud)')
384
534
  return
385
535
 
536
+ import sqlite3
386
537
  import json
387
- data_path = os.path.join(os.path.dirname(__file__), '..', 'exports',
388
- 'qualitative-ai-claude-code-sample300-2026-02-08.json')
389
- if not os.path.exists(data_path):
390
- print(' figure6.png SKIPPED (discovery data not found)')
391
- return
392
-
393
- with open(data_path) as f:
394
- data = json.load(f)
538
+ import re
395
539
 
396
- themes = data['discovery']['analysis']['themeFrequency']
540
+ db_path = os.path.join(os.path.dirname(__file__), '..', 'data', 'evaluations.db')
541
+ if not os.path.exists(db_path):
542
+ print(' figure6.png SKIPPED (database not found)')
543
+ return
397
544
 
398
- base_freq = {}
399
- recog_freq = {}
400
- for key, t in themes.items():
401
- label = t['label']
402
- b = t.get('base', 0)
403
- r = t.get('recognition', 0)
404
- if b + r >= 3:
405
- if b > 0:
406
- base_freq[label] = b
407
- if r > 0:
408
- recog_freq[label] = r
545
+ conn = sqlite3.connect(db_path)
546
+ rows = conn.execute("""
547
+ SELECT profile_name, suggestions
548
+ FROM evaluation_results
549
+ WHERE run_id IN ('eval-2026-02-03-f5d4dd93', 'eval-2026-02-06-a933d745')
550
+ AND overall_score IS NOT NULL
551
+ AND judge_model LIKE '%claude%'
552
+ """).fetchall()
553
+ conn.close()
554
+
555
+ # Extract message text from JSON suggestions
556
+ base_texts = []
557
+ recog_texts = []
558
+ for profile, suggestions_json in rows:
559
+ try:
560
+ suggestions = json.loads(suggestions_json)
561
+ text_parts = []
562
+ for s in suggestions:
563
+ if isinstance(s, dict):
564
+ for key in ('message', 'title', 'reason'):
565
+ if key in s and s[key]:
566
+ text_parts.append(str(s[key]))
567
+ text = ' '.join(text_parts)
568
+ except (json.JSONDecodeError, TypeError):
569
+ text = str(suggestions_json) if suggestions_json else ''
570
+
571
+ if 'recog' in profile:
572
+ recog_texts.append(text)
573
+ else:
574
+ base_texts.append(text)
575
+
576
+ base_corpus = ' '.join(base_texts)
577
+ recog_corpus = ' '.join(recog_texts)
578
+
579
+ # Pedagogical stop words — remove generic terms common to both conditions
580
+ # so the clouds highlight what *differs*
581
+ stop_words = {
582
+ # Standard English stop words
583
+ 'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
584
+ 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
585
+ 'should', 'may', 'might', 'shall', 'can', 'need', 'dare', 'ought',
586
+ 'used', 'to', 'of', 'in', 'for', 'on', 'with', 'at', 'by', 'from',
587
+ 'as', 'into', 'through', 'during', 'before', 'after', 'above', 'below',
588
+ 'between', 'out', 'off', 'over', 'under', 'again', 'further', 'then',
589
+ 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'each',
590
+ 'every', 'both', 'few', 'more', 'most', 'other', 'some', 'such', 'no',
591
+ 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
592
+ 'just', 'because', 'but', 'and', 'or', 'if', 'while', 'about', 'up',
593
+ 'that', 'this', 'these', 'those', 'it', 'its', 'he', 'she', 'they',
594
+ 'them', 'their', 'we', 'our', 'you', 'your', 'i', 'me', 'my', 'also',
595
+ 'which', 'who', 'whom', 'what', 'any', 'much', 'many', 'well',
596
+ 'still', 'even', 'back', 'get', 'go', 'make', 'like', 'take',
597
+ 'one', 'two', 'first', 'new', 'way', 'us',
598
+ # Common tutoring terms shared by both conditions
599
+ 'lecture', 'student', 'course', 'content', 'topic', 'material',
600
+ 'next', 'current', 'help', 'suggest', 'review', 'start', 'continue',
601
+ 'see', 'know', 'think', 'let', 'look', 'want', 'come',
602
+ }
603
+
604
+ def text_to_freq(corpus, stop_words):
605
+ words = re.findall(r'[a-z]{3,}', corpus.lower())
606
+ freq = {}
607
+ for w in words:
608
+ if w not in stop_words:
609
+ freq[w] = freq.get(w, 0) + 1
610
+ return freq
611
+
612
+ base_freq = text_to_freq(base_corpus, stop_words)
613
+ recog_freq = text_to_freq(recog_corpus, stop_words)
614
+
615
+ if not base_freq or not recog_freq:
616
+ print(' figure6.png SKIPPED (no text extracted)')
617
+ return
409
618
 
410
619
  fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 7))
411
620
 
412
621
  wc_base = WordCloud(
413
622
  width=1200, height=800, background_color='white', colormap='OrRd',
414
- max_words=30, max_font_size=120, min_font_size=14,
623
+ max_words=50, max_font_size=120, min_font_size=14,
415
624
  prefer_horizontal=0.85, relative_scaling=0.5, margin=10,
416
625
  collocations=False,
417
626
  ).generate_from_frequencies(base_freq)
418
627
 
419
628
  wc_recog = WordCloud(
420
629
  width=1200, height=800, background_color='white', colormap='YlGn',
421
- max_words=30, max_font_size=120, min_font_size=14,
630
+ max_words=50, max_font_size=120, min_font_size=14,
422
631
  prefer_horizontal=0.85, relative_scaling=0.5, margin=10,
423
632
  collocations=False,
424
633
  ).generate_from_frequencies(recog_freq)
425
634
 
426
635
  ax1.imshow(wc_base, interpolation='bilinear')
427
- ax1.set_title('Base Condition', fontsize=18, fontweight='bold', pad=15)
636
+ ax1.set_title('Base Condition (N=172)', fontsize=18, fontweight='bold', pad=15)
428
637
  ax1.axis('off')
429
638
 
430
639
  ax2.imshow(wc_recog, interpolation='bilinear')
431
- ax2.set_title('Recognition Condition', fontsize=18, fontweight='bold', pad=15)
640
+ ax2.set_title('Recognition Condition (N=178)', fontsize=18, fontweight='bold', pad=15)
432
641
  ax2.axis('off')
433
642
 
434
- fig.suptitle('Figure 6: Emergent Theme Word Clouds (AI Discovery, N=300)',
643
+ fig.suptitle('Figure 6: Tutor Language Word Clouds (Factorial, N=350)',
435
644
  fontsize=16, fontweight='bold', y=0.98)
436
645
  fig.tight_layout(rect=[0, 0.02, 1, 0.94])
437
646
  fig.savefig(os.path.join(OUTPUT_DIR, 'figure6.png'))
@@ -439,14 +648,306 @@ def figure6():
439
648
  print(' figure6.png')
440
649
 
441
650
 
651
+ # ── Figure 7: Persona × Recognition (Section 6.8) ───────────────────────────
652
+
653
+ def figure7():
654
+ """Grouped bar chart: superego persona × recognition for dialectical
655
+ multi-turn modulation (cells 28-33, N=90)."""
656
+
657
+ fig, ax = plt.subplots(figsize=(9, 5.5))
658
+
659
+ manifest = get_manifest()
660
+ fig7_config = manifest['figures']['figure7'] if manifest else None
661
+
662
+ personas = ['Suspicious', 'Adversary', 'Advocate']
663
+ base = None
664
+ recog = None
665
+ total_n = 90
666
+ data_driven = False
667
+
668
+ if fig7_config and get_db():
669
+ cell_means = query_cell_means(fig7_config['runs'], fig7_config['judge_filter'])
670
+ if cell_means:
671
+ # Cells 28-33: base/recog × suspicious/adversary/advocate
672
+ persona_cells = {
673
+ 'Suspicious': ('cell_28_base_dialectical_suspicious_unified',
674
+ 'cell_29_recog_dialectical_suspicious_unified'),
675
+ 'Adversary': ('cell_30_base_dialectical_adversary_unified',
676
+ 'cell_31_recog_dialectical_adversary_unified'),
677
+ 'Advocate': ('cell_32_base_dialectical_advocate_unified',
678
+ 'cell_33_recog_dialectical_advocate_unified'),
679
+ }
680
+ base = []
681
+ recog = []
682
+ for persona in personas:
683
+ b_key, r_key = persona_cells[persona]
684
+ b_data = cell_means.get(b_key)
685
+ r_data = cell_means.get(r_key)
686
+ if b_data and r_data:
687
+ base.append(round(b_data['mean'], 1))
688
+ recog.append(round(r_data['mean'], 1))
689
+ else:
690
+ base = None
691
+ break
692
+ if base and len(base) == 3:
693
+ total_n = sum(v['n'] for v in cell_means.values())
694
+ data_driven = True
695
+ print(' [data-driven from DB]')
696
+
697
+ if not data_driven:
698
+ base = [85.7, 88.5, 82.0]
699
+ recog = [90.2, 88.5, 95.6]
700
+ print(' [hardcoded fallback]')
701
+
702
+ deltas = [r - b for r, b in zip(recog, base)]
703
+
704
+ x = np.arange(len(personas))
705
+ w = 0.35
706
+
707
+ bars_b = ax.bar(x - w/2, base, w, label='Base', color='#95A5A6', edgecolor='#7F8C8D', linewidth=1.2)
708
+ bars_r = ax.bar(x + w/2, recog, w, label='Recognition', color='#27AE60', edgecolor='#1E8449', linewidth=1.2)
709
+
710
+ # Value labels
711
+ for bar in bars_b:
712
+ ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.5,
713
+ f'{bar.get_height():.1f}', ha='center', va='bottom', fontsize=11, fontweight='bold', color='#555')
714
+ for bar in bars_r:
715
+ ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.5,
716
+ f'{bar.get_height():.1f}', ha='center', va='bottom', fontsize=11, fontweight='bold', color='#1E8449')
717
+
718
+ # Delta annotations
719
+ for i, d in enumerate(deltas):
720
+ color = '#C0392B' if d > 2 else '#888' if abs(d) <= 2 else '#2471A3'
721
+ sign = '+' if d >= 0 else ''
722
+ ax.text(x[i] + w/2 + 0.08, recog[i] - 2, f'{sign}{d:.1f}',
723
+ fontsize=11, fontweight='bold', color=color, va='center')
724
+
725
+ ax.set_ylim(75, 100)
726
+ ax.set_xticks(x)
727
+ ax.set_xticklabels(personas, fontsize=13)
728
+ ax.set_ylabel('Mean Score', fontsize=14)
729
+ ax.set_title(f'Figure 7: Superego Persona × Recognition\n(Dialectical Multi-Turn, N={total_n}, Opus Judge)',
730
+ fontsize=15, fontweight='bold')
731
+ ax.legend(fontsize=12, framealpha=0.9)
732
+ ax.spines['top'].set_visible(False)
733
+ ax.spines['right'].set_visible(False)
734
+
735
+ fig.text(0.10, 0.02,
736
+ 'Advocate persona shows largest recognition effect (+13.6); '
737
+ 'adversary shows zero effect due to over-deference.',
738
+ fontsize=11, fontstyle='italic', color='#777777')
739
+
740
+ fig.tight_layout(rect=[0, 0.08, 1, 1])
741
+ fig.savefig(os.path.join(OUTPUT_DIR, 'figure7.png'))
742
+ plt.close(fig)
743
+ print(' figure7.png')
744
+
745
+
746
+ # ── Figure 8: Scripted vs Dynamic Learner Mechanism Spread (Section 6.10) ────
747
+
748
+ def figure8():
749
+ """Side-by-side comparison of mechanism spread under scripted vs dynamic learners."""
750
+
751
+ fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6), sharey=True)
752
+
753
+ manifest = get_manifest()
754
+ fig8_config = manifest['figures']['figure8'] if manifest else None
755
+
756
+ scripted_labels = None
757
+ scripted_vals = None
758
+ dynamic_labels = None
759
+ dynamic_vals = None
760
+ scripted_n = 360
761
+ dynamic_n = 240
762
+ data_driven = False
763
+
764
+ # Cell-to-mechanism mapping for recognition cells in e0e3a622
765
+ scripted_recog_cells = {
766
+ 'cell_41_recog_dialectical_suspicious_unified_superego': 'Self-reflect (susp.)',
767
+ 'cell_43_recog_dialectical_adversary_unified_superego': 'Adversary',
768
+ 'cell_45_recog_dialectical_advocate_unified_superego': 'Advocate',
769
+ 'cell_47_recog_dialectical_suspicious_unified_quantitative': 'Quantitative',
770
+ 'cell_49_recog_dialectical_suspicious_unified_erosion': 'Erosion',
771
+ 'cell_51_recog_dialectical_suspicious_unified_intersubjective': 'Intersubjective',
772
+ 'cell_53_recog_dialectical_suspicious_unified_combined': 'Combined',
773
+ 'cell_55_recog_dialectical_profile_tutor': 'Prof. (tutor)',
774
+ 'cell_57_recog_dialectical_profile_bidirectional': 'Prof. (bidir)',
775
+ }
776
+ # Dynamic learner recognition cells from 6c033830 + a2b2717c
777
+ dynamic_recog_cells = {
778
+ 'cell_61_recog_dialectical_selfreflect_psycho': 'Self-reflect',
779
+ 'cell_63_recog_dialectical_profile_bidirectional_psycho': 'Profiling',
780
+ 'cell_64_recog_dialectical_intersubjective_psycho': 'Intersubjective',
781
+ 'cell_65_recog_dialectical_combined_psycho': 'Combined',
782
+ }
783
+
784
+ if fig8_config and get_db():
785
+ # Scripted
786
+ s_means = query_cell_means([fig8_config['runs']['scripted']], fig8_config['judge_filter'])
787
+ if s_means:
788
+ s_labels = []
789
+ s_vals = []
790
+ for cell, label in scripted_recog_cells.items():
791
+ data = s_means.get(cell)
792
+ if data:
793
+ s_labels.append(label)
794
+ s_vals.append(round(data['mean'], 1))
795
+ if len(s_labels) >= 7:
796
+ scripted_labels = s_labels
797
+ scripted_vals = s_vals
798
+ scripted_n = sum(v['n'] for v in s_means.values())
799
+
800
+ # Dynamic
801
+ d_run_ids = [fig8_config['runs']['dynamic_60_63'], fig8_config['runs']['dynamic_64_65']]
802
+ d_means = query_cell_means(d_run_ids, fig8_config['judge_filter'])
803
+ if d_means:
804
+ d_labels = []
805
+ d_vals = []
806
+ for cell, label in dynamic_recog_cells.items():
807
+ data = d_means.get(cell)
808
+ if data:
809
+ d_labels.append(label)
810
+ d_vals.append(round(data['mean'], 1))
811
+ if len(d_labels) >= 3:
812
+ dynamic_labels = d_labels
813
+ dynamic_vals = d_vals
814
+ dynamic_n = sum(v['n'] for v in d_means.values())
815
+
816
+ if scripted_labels and dynamic_labels:
817
+ data_driven = True
818
+ print(' [data-driven from DB]')
819
+
820
+ if not data_driven:
821
+ scripted_labels = ['Prof. (bidir)', 'Quantitative', 'Combined', 'Prof. (tutor)',
822
+ 'Self-reflect', 'Intersubjective', 'Erosion', 'Adversary', 'Advocate']
823
+ scripted_vals = [92.7, 92.6, 92.4, 92.4, 92.1, 91.7, 90.8, 92.6, 90.3]
824
+ dynamic_labels = ['Profiling', 'Combined', 'Self-reflect', 'Intersubjective']
825
+ dynamic_vals = [88.8, 87.8, 85.9, 82.8]
826
+ print(' [hardcoded fallback]')
827
+
828
+ # Sort both by value descending
829
+ s_order = np.argsort(scripted_vals)[::-1]
830
+ scripted_labels = [scripted_labels[i] for i in s_order]
831
+ scripted_vals = [scripted_vals[i] for i in s_order]
832
+
833
+ d_order = np.argsort(dynamic_vals)[::-1]
834
+ dynamic_labels = [dynamic_labels[i] for i in d_order]
835
+ dynamic_vals = [dynamic_vals[i] for i in d_order]
836
+
837
+ # Scripted panel
838
+ colors_s = ['#27AE60'] * len(scripted_vals)
839
+ bars_s = ax1.barh(range(len(scripted_vals)), scripted_vals, color=colors_s, edgecolor='#1E8449', alpha=0.8)
840
+ ax1.set_yticks(range(len(scripted_labels)))
841
+ ax1.set_yticklabels(scripted_labels, fontsize=11)
842
+ ax1.set_xlim(80, 96)
843
+ ax1.set_xlabel('Mean Score (Recognition)', fontsize=12)
844
+ s_range = max(scripted_vals) - min(scripted_vals)
845
+ ax1.set_title(f'Scripted Learner (N={scripted_n})\n{s_range:.1f}-pt range', fontsize=14, fontweight='bold')
846
+ for i, v in enumerate(scripted_vals):
847
+ ax1.text(v + 0.2, i, f'{v:.1f}', va='center', fontsize=10, fontweight='bold')
848
+ # Highlight the band
849
+ ax1.axvspan(min(scripted_vals), max(scripted_vals), alpha=0.1, color='green')
850
+ ax1.spines['top'].set_visible(False)
851
+ ax1.spines['right'].set_visible(False)
852
+
853
+ # Dynamic panel
854
+ colors_d = ['#27AE60' if v > 86 else '#F39C12' if v > 84 else '#E74C3C' for v in dynamic_vals]
855
+ bars_d = ax2.barh(range(len(dynamic_vals)), dynamic_vals, color=colors_d, edgecolor='#333', alpha=0.8)
856
+ ax2.set_yticks(range(len(dynamic_labels)))
857
+ ax2.set_yticklabels(dynamic_labels, fontsize=11)
858
+ ax2.set_xlim(80, 96)
859
+ ax2.set_xlabel('Mean Score (Recognition)', fontsize=12)
860
+ d_range = max(dynamic_vals) - min(dynamic_vals)
861
+ ax2.set_title(f'Dynamic Learner (N={dynamic_n})\n{d_range:.1f}-pt range', fontsize=14, fontweight='bold')
862
+ for i, v in enumerate(dynamic_vals):
863
+ ax2.text(v + 0.2, i, f'{v:.1f}', va='center', fontsize=10, fontweight='bold')
864
+ ax2.axvspan(min(dynamic_vals), max(dynamic_vals), alpha=0.1, color='orange')
865
+ ax2.spines['top'].set_visible(False)
866
+ ax2.spines['right'].set_visible(False)
867
+
868
+ fig.suptitle('Figure 8: Mechanism Differentiation — Scripted vs Dynamic Learner',
869
+ fontsize=16, fontweight='bold', y=1.02)
870
+ fig.tight_layout()
871
+ fig.savefig(os.path.join(OUTPUT_DIR, 'figure8.png'))
872
+ plt.close(fig)
873
+ print(' figure8.png')
874
+
875
+
876
+ # ── Figure 9: Qualitative Tag Divergence (Section 6.11) ─────────────────────
877
+
878
def figure9():
    """Diverging bar chart: tag frequency difference (recognition - base)
    from bilateral run qualitative assessment.

    Plots, for each qualitative tag, the percentage-point difference
    between the recognition condition and the base condition, sorted
    ascending so negative (red) bars sit at the bottom and positive
    (green) bars at the top. Saves figure9.png into OUTPUT_DIR.
    """

    fig, ax = plt.subplots(figsize=(10, 5.5))

    # Note: missed_scaffold was 101.7% due to duplicate tag counting per dialogue;
    # capped at 100.0% (deduplicated per dialogue).
    tags = ['recognition_moment', 'strategy_shift', 'emotional_attunement',
            'learner_breakthrough',
            'ego_compliance', 'superego_overcorrection', 'missed_scaffold',
            'stalling']
    # Tag frequencies (% of dialogues) in each condition, index-aligned with `tags`.
    base_pct = [0.0, 0.0, 6.9, 80.0, 70.7, 69.0, 100.0, 100.0]
    recog_pct = [51.7, 30.0, 36.7, 80.0, 60.0, 50.0, 68.3, 45.0]

    diff = [r - b for r, b in zip(recog_pct, base_pct)]

    # Sort by difference (ascending) so the most negative tag plots first.
    order = np.argsort(diff)
    tags = [tags[i] for i in order]
    diff = [diff[i] for i in order]

    # Green for tags more frequent under recognition, red for less frequent.
    colors = ['#27AE60' if d > 0 else '#E74C3C' for d in diff]

    bars = ax.barh(range(len(tags)), diff, color=colors, edgecolor='#333', alpha=0.85)

    # Clean tag names: snake_case -> Title Case for display.
    clean = [t.replace('_', ' ').title() for t in tags]
    ax.set_yticks(range(len(clean)))
    ax.set_yticklabels(clean, fontsize=11)
    ax.set_xlabel('Percentage Point Difference (Recognition − Base)', fontsize=12)
    ax.axvline(0, color='black', linewidth=0.8)

    # Value labels, offset away from the zero line on the bar's side.
    for i, d in enumerate(diff):
        sign = '+' if d > 0 else ''
        ha = 'left' if d >= 0 else 'right'
        offset = 1.5 if d >= 0 else -1.5
        ax.text(d + offset, i, f'{sign}{d:.0f}%', va='center', ha=ha, fontsize=10, fontweight='bold')

    ax.set_title('Figure 9: Qualitative Tag Divergence\n(Bilateral Run, N=118, Base vs Recognition)',
                 fontsize=15, fontweight='bold')
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    fig.tight_layout()
    fig.savefig(os.path.join(OUTPUT_DIR, 'figure9.png'))
    plt.close(fig)
    print('  figure9.png')
931
+
932
+
442
933
# ── Main ──────────────────────────────────────────────────────────────────────

if __name__ == '__main__':
    print('Generating paper figures...')
    # Report whether figures will be data-driven (manifest + DB found) or
    # fall back to the hardcoded values embedded in each figure function.
    if get_manifest() and get_db():
        print(f' Manifest: {MANIFEST_PATH}')
        print(f' Database: {DB_PATH}')
    else:
        print(' WARNING: manifest or DB not found, using hardcoded fallbacks')
    try:
        figure1()
        figure2()
        figure3()
        figure4()
        figure5()
        figure6()
        figure7()
        figure8()
        figure9()
    finally:
        # Close the shared DB handle even if a figure function raises,
        # so the connection is never leaked on a partial run.
        if _db:
            _db.close()
    print(f'Done. Output: {os.path.abspath(OUTPUT_DIR)}/')