@machinespirits/eval 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +91 -9
- package/config/eval-settings.yaml +3 -3
- package/config/paper-manifest.json +486 -0
- package/config/providers.yaml +9 -6
- package/config/tutor-agents.yaml +2261 -0
- package/content/README.md +23 -0
- package/content/courses/479/course.md +53 -0
- package/content/courses/479/lecture-1.md +361 -0
- package/content/courses/479/lecture-2.md +360 -0
- package/content/courses/479/lecture-3.md +655 -0
- package/content/courses/479/lecture-4.md +530 -0
- package/content/courses/479/lecture-5.md +326 -0
- package/content/courses/479/lecture-6.md +346 -0
- package/content/courses/479/lecture-7.md +326 -0
- package/content/courses/479/lecture-8.md +273 -0
- package/content/courses/479/roadmap-slides.md +656 -0
- package/content/manifest.yaml +8 -0
- package/docs/research/apa.csl +2133 -0
- package/docs/research/build.sh +98 -0
- package/docs/research/figures/figure1.png +0 -0
- package/docs/research/figures/figure10.png +0 -0
- package/docs/research/figures/figure11.png +0 -0
- package/docs/research/figures/figure2.png +0 -0
- package/docs/research/figures/figure3.png +0 -0
- package/docs/research/figures/figure4.png +0 -0
- package/docs/research/figures/figure5.png +0 -0
- package/docs/research/figures/figure6.png +0 -0
- package/docs/research/figures/figure7.png +0 -0
- package/docs/research/figures/figure8.png +0 -0
- package/docs/research/figures/figure9.png +0 -0
- package/docs/research/header.tex +25 -0
- package/docs/research/paper-full.md +2565 -0
- package/docs/research/paper-short.md +436 -0
- package/docs/research/references.bib +1143 -0
- package/docs/research/slides-header.tex +188 -0
- package/docs/research/slides-pptx.md +363 -0
- package/docs/research/slides.md +531 -0
- package/docs/research/style-reference-pptx.py +199 -0
- package/package.json +5 -5
- package/scripts/analyze-eval-results.js +69 -17
- package/scripts/analyze-mechanism-traces.js +763 -0
- package/scripts/analyze-modulation-learning.js +498 -0
- package/scripts/analyze-prosthesis.js +144 -0
- package/scripts/analyze-run.js +264 -79
- package/scripts/assess-transcripts.js +853 -0
- package/scripts/browse-transcripts.js +854 -0
- package/scripts/check-parse-failures.js +73 -0
- package/scripts/code-dialectical-modulation.js +1320 -0
- package/scripts/download-data.sh +55 -0
- package/scripts/eval-cli.js +106 -18
- package/scripts/generate-paper-figures.js +663 -0
- package/scripts/generate-paper-figures.py +577 -76
- package/scripts/generate-paper-tables.js +299 -0
- package/scripts/qualitative-analysis-ai.js +3 -3
- package/scripts/render-sequence-diagram.js +694 -0
- package/scripts/test-latency.js +210 -0
- package/scripts/test-rate-limit.js +95 -0
- package/scripts/test-token-budget.js +332 -0
- package/scripts/validate-paper-manifest.js +670 -0
- package/services/__tests__/evalConfigLoader.test.js +2 -2
- package/services/__tests__/learnerRubricEvaluator.test.js +361 -0
- package/services/__tests__/learnerTutorInteractionEngine.test.js +326 -0
- package/services/evaluationRunner.js +975 -98
- package/services/evaluationStore.js +12 -4
- package/services/learnerTutorInteractionEngine.js +27 -2
- package/services/mockProvider.js +133 -0
- package/services/promptRewriter.js +1471 -5
- package/services/rubricEvaluator.js +55 -2
- package/services/transcriptFormatter.js +675 -0
- package/config/machinespirits-eval.code-workspace +0 -11
- package/docs/EVALUATION-VARIABLES.md +0 -589
- package/docs/REPLICATION-PLAN.md +0 -577
- package/scripts/analyze-run.mjs +0 -282
- package/scripts/compare-runs.js +0 -44
- package/scripts/compare-suggestions.js +0 -80
- package/scripts/dig-into-run.js +0 -158
- package/scripts/show-failed-suggestions.js +0 -64
- /package/scripts/{check-run.mjs → check-run.js} +0 -0
|
@@ -4,10 +4,15 @@
|
|
|
4
4
|
Usage:
|
|
5
5
|
python scripts/generate-paper-figures.py
|
|
6
6
|
|
|
7
|
-
|
|
7
|
+
Reads config/paper-manifest.json and queries data/evaluations.db to produce
|
|
8
|
+
data-driven figures. Falls back to hardcoded values if DB is unavailable.
|
|
9
|
+
|
|
10
|
+
Outputs PNGs to docs/research/figures/
|
|
8
11
|
"""
|
|
9
12
|
|
|
10
13
|
import os
|
|
14
|
+
import json
|
|
15
|
+
import sqlite3
|
|
11
16
|
import matplotlib
|
|
12
17
|
matplotlib.use('Agg')
|
|
13
18
|
import matplotlib.pyplot as plt
|
|
@@ -15,9 +20,78 @@ import matplotlib.patches as mpatches
|
|
|
15
20
|
from matplotlib.patches import FancyBboxPatch, FancyArrowPatch
|
|
16
21
|
import numpy as np
|
|
17
22
|
|
|
18
|
-
|
|
23
|
+
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
|
|
24
|
+
ROOT_DIR = os.path.join(SCRIPT_DIR, '..')
|
|
25
|
+
OUTPUT_DIR = os.path.join(ROOT_DIR, 'docs', 'research', 'figures')
|
|
26
|
+
MANIFEST_PATH = os.path.join(ROOT_DIR, 'config', 'paper-manifest.json')
|
|
27
|
+
DB_PATH = os.path.join(ROOT_DIR, 'data', 'evaluations.db')
|
|
28
|
+
|
|
19
29
|
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
|
20
30
|
|
|
31
|
+
# ── Data Layer ───────────────────────────────────────────────────────────────
|
|
32
|
+
|
|
33
|
+
_manifest = None
|
|
34
|
+
_db = None
|
|
35
|
+
|
|
36
|
+
def get_manifest():
|
|
37
|
+
global _manifest
|
|
38
|
+
if _manifest is None and os.path.exists(MANIFEST_PATH):
|
|
39
|
+
with open(MANIFEST_PATH) as f:
|
|
40
|
+
_manifest = json.load(f)
|
|
41
|
+
return _manifest
|
|
42
|
+
|
|
43
|
+
def get_db():
|
|
44
|
+
global _db
|
|
45
|
+
if _db is None and os.path.exists(DB_PATH):
|
|
46
|
+
_db = sqlite3.connect(DB_PATH)
|
|
47
|
+
_db.row_factory = sqlite3.Row
|
|
48
|
+
return _db
|
|
49
|
+
|
|
50
|
+
def query_cell_means(run_ids, judge_filter='claude-opus%'):
|
|
51
|
+
"""Query mean overall_score per profile (cell) for given runs."""
|
|
52
|
+
db = get_db()
|
|
53
|
+
if not db:
|
|
54
|
+
return {}
|
|
55
|
+
placeholders = ','.join('?' * len(run_ids))
|
|
56
|
+
rows = db.execute(f"""
|
|
57
|
+
SELECT profile_name, AVG(overall_score) as mean, COUNT(*) as n,
|
|
58
|
+
-- stdev via manual calculation
|
|
59
|
+
AVG(overall_score * overall_score) - AVG(overall_score) * AVG(overall_score) as var
|
|
60
|
+
FROM evaluation_results
|
|
61
|
+
WHERE run_id IN ({placeholders})
|
|
62
|
+
AND judge_model LIKE ?
|
|
63
|
+
AND overall_score IS NOT NULL
|
|
64
|
+
GROUP BY profile_name
|
|
65
|
+
""", [*run_ids, judge_filter]).fetchall()
|
|
66
|
+
return {r['profile_name']: {'mean': r['mean'], 'n': r['n'],
|
|
67
|
+
'sd': (r['var'] ** 0.5) if r['var'] and r['var'] > 0 else 0}
|
|
68
|
+
for r in rows}
|
|
69
|
+
|
|
70
|
+
def extract_cell_number(profile_name):
|
|
71
|
+
"""Extract cell number from profile_name like 'cell_5_recog_single_unified'."""
|
|
72
|
+
parts = profile_name.split('_')
|
|
73
|
+
if len(parts) >= 2 and parts[0] == 'cell':
|
|
74
|
+
try:
|
|
75
|
+
return int(parts[1])
|
|
76
|
+
except ValueError:
|
|
77
|
+
pass
|
|
78
|
+
return None
|
|
79
|
+
|
|
80
|
+
def compute_2x2_effects(cell_means, base_single, base_multi, recog_single, recog_multi):
|
|
81
|
+
"""Compute recognition effect, architecture effect, and interaction from 4 cell means."""
|
|
82
|
+
bs = cell_means.get(base_single, {}).get('mean')
|
|
83
|
+
bm = cell_means.get(base_multi, {}).get('mean')
|
|
84
|
+
rs = cell_means.get(recog_single, {}).get('mean')
|
|
85
|
+
rm = cell_means.get(recog_multi, {}).get('mean')
|
|
86
|
+
if None in (bs, bm, rs, rm):
|
|
87
|
+
return None
|
|
88
|
+
recog_effect = ((rs + rm) / 2) - ((bs + bm) / 2)
|
|
89
|
+
arch_effect = ((bm + rm) / 2) - ((bs + rs) / 2)
|
|
90
|
+
interaction = (rm - rs) - (bm - bs)
|
|
91
|
+
return {'recog_effect': recog_effect, 'arch_effect': arch_effect,
|
|
92
|
+
'interaction': interaction,
|
|
93
|
+
'means': {'bs': bs, 'bm': bm, 'rs': rs, 'rm': rm}}
|
|
94
|
+
|
|
21
95
|
# Common styling
|
|
22
96
|
plt.rcParams.update({
|
|
23
97
|
'font.size': 13,
|
|
@@ -227,9 +301,9 @@ def figure2():
|
|
|
227
301
|
def figure3():
|
|
228
302
|
fig, ax = plt.subplots(figsize=(10, 4))
|
|
229
303
|
|
|
230
|
-
total =
|
|
231
|
-
prompt_eng = 11.
|
|
232
|
-
recog_unique = 8.
|
|
304
|
+
total = 19.7
|
|
305
|
+
prompt_eng = 11.6
|
|
306
|
+
recog_unique = 8.0
|
|
233
307
|
prompt_pct = prompt_eng / total * 100 # 57%
|
|
234
308
|
recog_pct = recog_unique / total * 100 # 43%
|
|
235
309
|
|
|
@@ -274,51 +348,89 @@ def figure3():
|
|
|
274
348
|
# ── Figure 4: Multi-Agent Synergy by Prompt Type ─────────────────────────────
|
|
275
349
|
|
|
276
350
|
def figure4():
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
single = [72.2, 83.3]
|
|
281
|
-
multi = [81.5, 83.3]
|
|
282
|
-
deltas = ['+9.2**', '+0.0']
|
|
351
|
+
"""Multi-model A×B interaction probe (Table 8, N=655 across 5 ego models).
|
|
352
|
+
Shows recognition effect and A×B interaction per model, confirming
|
|
353
|
+
architecture is additive, not synergistic."""
|
|
283
354
|
|
|
284
|
-
|
|
285
|
-
bar_height = 0.3
|
|
286
|
-
|
|
287
|
-
bars1 = ax.barh(y + bar_height/2, single, bar_height, color='#85C1E9',
|
|
288
|
-
edgecolor='#2471A3', linewidth=1.5, label='Single-Agent')
|
|
289
|
-
bars2 = ax.barh(y - bar_height/2, multi, bar_height, color='#82E0AA',
|
|
290
|
-
edgecolor='#1E8449', linewidth=1.5, label='Multi-Agent')
|
|
355
|
+
fig, ax = plt.subplots(figsize=(10, 5.5))
|
|
291
356
|
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
357
|
+
manifest = get_manifest()
|
|
358
|
+
fig4_config = manifest['figures']['figure4'] if manifest else None
|
|
359
|
+
|
|
360
|
+
# Try data-driven from DB
|
|
361
|
+
models = []
|
|
362
|
+
recog_effect = []
|
|
363
|
+
ab_interaction = []
|
|
364
|
+
data_driven = False
|
|
365
|
+
|
|
366
|
+
if fig4_config and get_db():
|
|
367
|
+
for key in ['kimi', 'nemotron', 'deepseek', 'glm', 'haiku']:
|
|
368
|
+
cfg = fig4_config['runs'][key]
|
|
369
|
+
cell_means = query_cell_means(cfg['run_ids'], fig4_config['judge_filter'])
|
|
370
|
+
if not cell_means:
|
|
371
|
+
break
|
|
372
|
+
effects = compute_2x2_effects(
|
|
373
|
+
cell_means,
|
|
374
|
+
'cell_1_base_single_unified', 'cell_3_base_multi_unified',
|
|
375
|
+
'cell_5_recog_single_unified', 'cell_7_recog_multi_unified')
|
|
376
|
+
if not effects:
|
|
377
|
+
break
|
|
378
|
+
total_n = sum(v['n'] for v in cell_means.values())
|
|
379
|
+
models.append(f"{cfg['label']}\n(N={total_n})")
|
|
380
|
+
recog_effect.append(round(effects['recog_effect'], 1))
|
|
381
|
+
ab_interaction.append(round(effects['interaction'], 1))
|
|
382
|
+
else:
|
|
383
|
+
data_driven = True
|
|
384
|
+
print(' [data-driven from DB]')
|
|
385
|
+
|
|
386
|
+
if not data_driven:
|
|
387
|
+
# Fallback hardcoded values
|
|
388
|
+
models = ['Kimi K2.5\n(N=179)', 'Nemotron\n(N=119)', 'DeepSeek\n(N=120)',
|
|
389
|
+
'GLM-4.7\n(N=117)', 'Haiku 4.5\n(N=120)']
|
|
390
|
+
recog_effect = [15.5, 16.0, 14.0, 17.8, 9.6]
|
|
391
|
+
ab_interaction = [0.5, -5.7, -1.4, -0.7, -1.6]
|
|
392
|
+
print(' [hardcoded fallback]')
|
|
393
|
+
|
|
394
|
+
x = np.arange(len(models))
|
|
395
|
+
w = 0.35
|
|
396
|
+
|
|
397
|
+
bars_r = ax.bar(x - w/2, recog_effect, w, label='Recognition Effect (A)',
|
|
398
|
+
color='#27AE60', edgecolor='#1E8449', linewidth=1.2)
|
|
399
|
+
bars_i = ax.bar(x + w/2, ab_interaction, w, label='A×B Interaction',
|
|
400
|
+
color='#E74C3C', edgecolor='#C0392B', linewidth=1.2)
|
|
401
|
+
|
|
402
|
+
# Value labels
|
|
403
|
+
for bar in bars_r:
|
|
404
|
+
ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.3,
|
|
405
|
+
f'+{bar.get_height():.1f}', ha='center', va='bottom',
|
|
406
|
+
fontsize=10, fontweight='bold', color='#1E8449')
|
|
407
|
+
for bar in bars_i:
|
|
408
|
+
val = bar.get_height()
|
|
409
|
+
y_pos = val - 0.5 if val < 0 else val + 0.3
|
|
410
|
+
va = 'top' if val < 0 else 'bottom'
|
|
411
|
+
sign = '+' if val > 0 else ''
|
|
412
|
+
ax.text(bar.get_x() + bar.get_width()/2, y_pos,
|
|
413
|
+
f'{sign}{val:.1f}', ha='center', va=va,
|
|
414
|
+
fontsize=10, fontweight='bold', color='#C0392B')
|
|
415
|
+
|
|
416
|
+
ax.axhline(0, color='#999', linewidth=0.8, linestyle='-')
|
|
417
|
+
ax.set_ylim(-8, 22)
|
|
418
|
+
ax.set_xticks(x)
|
|
419
|
+
ax.set_xticklabels(models, fontsize=11)
|
|
420
|
+
ax.set_ylabel('Effect Size (points)', fontsize=14)
|
|
421
|
+
ax.set_title('Figure 4: Architecture is Additive, Not Synergistic\n'
|
|
422
|
+
'(Multi-Model A×B Probe, N=655, Opus Judge)',
|
|
312
423
|
fontsize=15, fontweight='bold')
|
|
313
|
-
ax.legend(loc='
|
|
424
|
+
ax.legend(loc='upper right', fontsize=12, framealpha=0.9)
|
|
314
425
|
ax.spines['top'].set_visible(False)
|
|
315
426
|
ax.spines['right'].set_visible(False)
|
|
316
427
|
|
|
317
|
-
fig.text(0.
|
|
318
|
-
'
|
|
428
|
+
fig.text(0.10, 0.02,
|
|
429
|
+
'Recognition effect replicates across all 5 models (+9.6 to +17.8). '
|
|
430
|
+
'A×B interaction is negligible (mean −1.8 pts).',
|
|
319
431
|
fontsize=11, fontstyle='italic', color='#777777')
|
|
320
432
|
|
|
321
|
-
fig.tight_layout(rect=[0, 0.
|
|
433
|
+
fig.tight_layout(rect=[0, 0.08, 1, 1])
|
|
322
434
|
fig.savefig(os.path.join(OUTPUT_DIR, 'figure4.png'))
|
|
323
435
|
plt.close(fig)
|
|
324
436
|
print(' figure4.png')
|
|
@@ -327,19 +439,53 @@ def figure4():
|
|
|
327
439
|
# ── Figure 5: Factor Effects Invert by Domain ────────────────────────────────
|
|
328
440
|
|
|
329
441
|
def figure5():
|
|
442
|
+
"""Factor effects by domain using Kimi K2.5 for both domains.
|
|
443
|
+
Elementary: eval-2026-02-05-e87f452d (N=60, cells 1,3,5,7).
|
|
444
|
+
Philosophy: factorial cells 1,3,5,7 (N=179)."""
|
|
445
|
+
|
|
330
446
|
fig, ax = plt.subplots(figsize=(10, 5.5))
|
|
331
447
|
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
448
|
+
manifest = get_manifest()
|
|
449
|
+
fig5_config = manifest['figures']['figure5'] if manifest else None
|
|
450
|
+
|
|
451
|
+
factors = ['A: Recognition\nEffect', 'B: Multi-Agent\nEffect']
|
|
452
|
+
phil = None
|
|
453
|
+
elem = None
|
|
454
|
+
phil_n = 179
|
|
455
|
+
elem_n = 60
|
|
456
|
+
data_driven = False
|
|
457
|
+
|
|
458
|
+
if fig5_config and get_db():
|
|
459
|
+
# Philosophy: factorial single-learner cells
|
|
460
|
+
phil_means = query_cell_means([fig5_config['runs']['philosophy']], fig5_config['judge_filter'])
|
|
461
|
+
elem_means = query_cell_means([fig5_config['runs']['elementary']], fig5_config['judge_filter'])
|
|
462
|
+
if phil_means and elem_means:
|
|
463
|
+
phil_fx = compute_2x2_effects(phil_means,
|
|
464
|
+
'cell_1_base_single_unified', 'cell_3_base_multi_unified',
|
|
465
|
+
'cell_5_recog_single_unified', 'cell_7_recog_multi_unified')
|
|
466
|
+
elem_fx = compute_2x2_effects(elem_means,
|
|
467
|
+
'cell_1_base_single_unified', 'cell_3_base_multi_unified',
|
|
468
|
+
'cell_5_recog_single_unified', 'cell_7_recog_multi_unified')
|
|
469
|
+
if phil_fx and elem_fx:
|
|
470
|
+
phil = [round(phil_fx['recog_effect'], 1), round(phil_fx['arch_effect'], 1)]
|
|
471
|
+
elem = [round(elem_fx['recog_effect'], 1), round(elem_fx['arch_effect'], 1)]
|
|
472
|
+
phil_n = sum(v['n'] for v in phil_means.values())
|
|
473
|
+
elem_n = sum(v['n'] for v in elem_means.values())
|
|
474
|
+
data_driven = True
|
|
475
|
+
print(' [data-driven from DB]')
|
|
476
|
+
|
|
477
|
+
if not data_driven:
|
|
478
|
+
phil = [15.4, -0.8]
|
|
479
|
+
elem = [9.9, 3.0]
|
|
480
|
+
print(' [hardcoded fallback]')
|
|
335
481
|
|
|
336
482
|
y = np.arange(len(factors))
|
|
337
483
|
bar_height = 0.3
|
|
338
484
|
|
|
339
485
|
bars_phil = ax.barh(y + bar_height/2, phil, bar_height, color='#5DADE2',
|
|
340
|
-
edgecolor='#2471A3', linewidth=1.5, label='Philosophy')
|
|
486
|
+
edgecolor='#2471A3', linewidth=1.5, label=f'Philosophy (Kimi, N={phil_n})')
|
|
341
487
|
bars_elem = ax.barh(y - bar_height/2, elem, bar_height, color='#F0B27A',
|
|
342
|
-
edgecolor='#CA6F1E', linewidth=1.5, label='Elementary Math')
|
|
488
|
+
edgecolor='#CA6F1E', linewidth=1.5, label=f'Elementary Math (Kimi, N={elem_n})')
|
|
343
489
|
|
|
344
490
|
# Score labels
|
|
345
491
|
for bar, val in zip(bars_phil, phil):
|
|
@@ -357,15 +503,15 @@ def figure5():
|
|
|
357
503
|
ax.set_yticks(y)
|
|
358
504
|
ax.set_yticklabels(factors, fontsize=13)
|
|
359
505
|
ax.set_xlabel('Effect Size (points)', fontsize=14)
|
|
360
|
-
ax.set_title('Figure 5: Factor Effects
|
|
506
|
+
ax.set_title('Figure 5: Factor Effects by Domain (Kimi K2.5)',
|
|
361
507
|
fontsize=15, fontweight='bold')
|
|
362
508
|
ax.legend(loc='lower right', fontsize=12, framealpha=0.9)
|
|
363
509
|
ax.spines['top'].set_visible(False)
|
|
364
510
|
ax.spines['right'].set_visible(False)
|
|
365
511
|
|
|
366
512
|
fig.text(0.12, 0.02,
|
|
367
|
-
'
|
|
368
|
-
'
|
|
513
|
+
'Recognition dominates in both domains. Architecture provides small additive benefit\n'
|
|
514
|
+
'on elementary content (+3.0 pts) and negligible effect on philosophy (−0.8 pts).',
|
|
369
515
|
fontsize=11, fontstyle='italic', color='#777777')
|
|
370
516
|
|
|
371
517
|
fig.tight_layout(rect=[0, 0.1, 1, 1])
|
|
@@ -374,64 +520,127 @@ def figure5():
|
|
|
374
520
|
print(' figure5.png')
|
|
375
521
|
|
|
376
522
|
|
|
377
|
-
# ── Figure 6:
|
|
523
|
+
# ── Figure 6: Tutor Language Word Clouds ──────────────────────────────────────
|
|
378
524
|
|
|
379
525
|
def figure6():
|
|
526
|
+
"""Word clouds from actual tutor transcript text (N=350 factorial responses).
|
|
527
|
+
Shows the raw linguistic differences between base and recognition conditions,
|
|
528
|
+
complementing the AI theme coding in Tables 17b–d."""
|
|
529
|
+
|
|
380
530
|
try:
|
|
381
531
|
from wordcloud import WordCloud
|
|
382
532
|
except ImportError:
|
|
383
533
|
print(' figure6.png SKIPPED (pip install wordcloud)')
|
|
384
534
|
return
|
|
385
535
|
|
|
536
|
+
import sqlite3
|
|
386
537
|
import json
|
|
387
|
-
|
|
388
|
-
'qualitative-ai-claude-code-sample300-2026-02-08.json')
|
|
389
|
-
if not os.path.exists(data_path):
|
|
390
|
-
print(' figure6.png SKIPPED (discovery data not found)')
|
|
391
|
-
return
|
|
392
|
-
|
|
393
|
-
with open(data_path) as f:
|
|
394
|
-
data = json.load(f)
|
|
538
|
+
import re
|
|
395
539
|
|
|
396
|
-
|
|
540
|
+
db_path = os.path.join(os.path.dirname(__file__), '..', 'data', 'evaluations.db')
|
|
541
|
+
if not os.path.exists(db_path):
|
|
542
|
+
print(' figure6.png SKIPPED (database not found)')
|
|
543
|
+
return
|
|
397
544
|
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
545
|
+
conn = sqlite3.connect(db_path)
|
|
546
|
+
rows = conn.execute("""
|
|
547
|
+
SELECT profile_name, suggestions
|
|
548
|
+
FROM evaluation_results
|
|
549
|
+
WHERE run_id IN ('eval-2026-02-03-f5d4dd93', 'eval-2026-02-06-a933d745')
|
|
550
|
+
AND overall_score IS NOT NULL
|
|
551
|
+
AND judge_model LIKE '%claude%'
|
|
552
|
+
""").fetchall()
|
|
553
|
+
conn.close()
|
|
554
|
+
|
|
555
|
+
# Extract message text from JSON suggestions
|
|
556
|
+
base_texts = []
|
|
557
|
+
recog_texts = []
|
|
558
|
+
for profile, suggestions_json in rows:
|
|
559
|
+
try:
|
|
560
|
+
suggestions = json.loads(suggestions_json)
|
|
561
|
+
text_parts = []
|
|
562
|
+
for s in suggestions:
|
|
563
|
+
if isinstance(s, dict):
|
|
564
|
+
for key in ('message', 'title', 'reason'):
|
|
565
|
+
if key in s and s[key]:
|
|
566
|
+
text_parts.append(str(s[key]))
|
|
567
|
+
text = ' '.join(text_parts)
|
|
568
|
+
except (json.JSONDecodeError, TypeError):
|
|
569
|
+
text = str(suggestions_json) if suggestions_json else ''
|
|
570
|
+
|
|
571
|
+
if 'recog' in profile:
|
|
572
|
+
recog_texts.append(text)
|
|
573
|
+
else:
|
|
574
|
+
base_texts.append(text)
|
|
575
|
+
|
|
576
|
+
base_corpus = ' '.join(base_texts)
|
|
577
|
+
recog_corpus = ' '.join(recog_texts)
|
|
578
|
+
|
|
579
|
+
# Pedagogical stop words — remove generic terms common to both conditions
|
|
580
|
+
# so the clouds highlight what *differs*
|
|
581
|
+
stop_words = {
|
|
582
|
+
# Standard English stop words
|
|
583
|
+
'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
|
|
584
|
+
'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
|
|
585
|
+
'should', 'may', 'might', 'shall', 'can', 'need', 'dare', 'ought',
|
|
586
|
+
'used', 'to', 'of', 'in', 'for', 'on', 'with', 'at', 'by', 'from',
|
|
587
|
+
'as', 'into', 'through', 'during', 'before', 'after', 'above', 'below',
|
|
588
|
+
'between', 'out', 'off', 'over', 'under', 'again', 'further', 'then',
|
|
589
|
+
'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'each',
|
|
590
|
+
'every', 'both', 'few', 'more', 'most', 'other', 'some', 'such', 'no',
|
|
591
|
+
'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
|
|
592
|
+
'just', 'because', 'but', 'and', 'or', 'if', 'while', 'about', 'up',
|
|
593
|
+
'that', 'this', 'these', 'those', 'it', 'its', 'he', 'she', 'they',
|
|
594
|
+
'them', 'their', 'we', 'our', 'you', 'your', 'i', 'me', 'my', 'also',
|
|
595
|
+
'which', 'who', 'whom', 'what', 'any', 'much', 'many', 'well',
|
|
596
|
+
'still', 'even', 'back', 'get', 'go', 'make', 'like', 'take',
|
|
597
|
+
'one', 'two', 'first', 'new', 'way', 'us',
|
|
598
|
+
# Common tutoring terms shared by both conditions
|
|
599
|
+
'lecture', 'student', 'course', 'content', 'topic', 'material',
|
|
600
|
+
'next', 'current', 'help', 'suggest', 'review', 'start', 'continue',
|
|
601
|
+
'see', 'know', 'think', 'let', 'look', 'want', 'come',
|
|
602
|
+
}
|
|
603
|
+
|
|
604
|
+
def text_to_freq(corpus, stop_words):
|
|
605
|
+
words = re.findall(r'[a-z]{3,}', corpus.lower())
|
|
606
|
+
freq = {}
|
|
607
|
+
for w in words:
|
|
608
|
+
if w not in stop_words:
|
|
609
|
+
freq[w] = freq.get(w, 0) + 1
|
|
610
|
+
return freq
|
|
611
|
+
|
|
612
|
+
base_freq = text_to_freq(base_corpus, stop_words)
|
|
613
|
+
recog_freq = text_to_freq(recog_corpus, stop_words)
|
|
614
|
+
|
|
615
|
+
if not base_freq or not recog_freq:
|
|
616
|
+
print(' figure6.png SKIPPED (no text extracted)')
|
|
617
|
+
return
|
|
409
618
|
|
|
410
619
|
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 7))
|
|
411
620
|
|
|
412
621
|
wc_base = WordCloud(
|
|
413
622
|
width=1200, height=800, background_color='white', colormap='OrRd',
|
|
414
|
-
max_words=
|
|
623
|
+
max_words=50, max_font_size=120, min_font_size=14,
|
|
415
624
|
prefer_horizontal=0.85, relative_scaling=0.5, margin=10,
|
|
416
625
|
collocations=False,
|
|
417
626
|
).generate_from_frequencies(base_freq)
|
|
418
627
|
|
|
419
628
|
wc_recog = WordCloud(
|
|
420
629
|
width=1200, height=800, background_color='white', colormap='YlGn',
|
|
421
|
-
max_words=
|
|
630
|
+
max_words=50, max_font_size=120, min_font_size=14,
|
|
422
631
|
prefer_horizontal=0.85, relative_scaling=0.5, margin=10,
|
|
423
632
|
collocations=False,
|
|
424
633
|
).generate_from_frequencies(recog_freq)
|
|
425
634
|
|
|
426
635
|
ax1.imshow(wc_base, interpolation='bilinear')
|
|
427
|
-
ax1.set_title('Base Condition', fontsize=18, fontweight='bold', pad=15)
|
|
636
|
+
ax1.set_title('Base Condition (N=172)', fontsize=18, fontweight='bold', pad=15)
|
|
428
637
|
ax1.axis('off')
|
|
429
638
|
|
|
430
639
|
ax2.imshow(wc_recog, interpolation='bilinear')
|
|
431
|
-
ax2.set_title('Recognition Condition', fontsize=18, fontweight='bold', pad=15)
|
|
640
|
+
ax2.set_title('Recognition Condition (N=178)', fontsize=18, fontweight='bold', pad=15)
|
|
432
641
|
ax2.axis('off')
|
|
433
642
|
|
|
434
|
-
fig.suptitle('Figure 6:
|
|
643
|
+
fig.suptitle('Figure 6: Tutor Language Word Clouds (Factorial, N=350)',
|
|
435
644
|
fontsize=16, fontweight='bold', y=0.98)
|
|
436
645
|
fig.tight_layout(rect=[0, 0.02, 1, 0.94])
|
|
437
646
|
fig.savefig(os.path.join(OUTPUT_DIR, 'figure6.png'))
|
|
@@ -439,14 +648,306 @@ def figure6():
|
|
|
439
648
|
print(' figure6.png')
|
|
440
649
|
|
|
441
650
|
|
|
651
|
+
# ── Figure 7: Persona × Recognition (Section 6.8) ───────────────────────────
|
|
652
|
+
|
|
653
|
+
def figure7():
|
|
654
|
+
"""Grouped bar chart: superego persona × recognition for dialectical
|
|
655
|
+
multi-turn modulation (cells 28-33, N=90)."""
|
|
656
|
+
|
|
657
|
+
fig, ax = plt.subplots(figsize=(9, 5.5))
|
|
658
|
+
|
|
659
|
+
manifest = get_manifest()
|
|
660
|
+
fig7_config = manifest['figures']['figure7'] if manifest else None
|
|
661
|
+
|
|
662
|
+
personas = ['Suspicious', 'Adversary', 'Advocate']
|
|
663
|
+
base = None
|
|
664
|
+
recog = None
|
|
665
|
+
total_n = 90
|
|
666
|
+
data_driven = False
|
|
667
|
+
|
|
668
|
+
if fig7_config and get_db():
|
|
669
|
+
cell_means = query_cell_means(fig7_config['runs'], fig7_config['judge_filter'])
|
|
670
|
+
if cell_means:
|
|
671
|
+
# Cells 28-33: base/recog × suspicious/adversary/advocate
|
|
672
|
+
persona_cells = {
|
|
673
|
+
'Suspicious': ('cell_28_base_dialectical_suspicious_unified',
|
|
674
|
+
'cell_29_recog_dialectical_suspicious_unified'),
|
|
675
|
+
'Adversary': ('cell_30_base_dialectical_adversary_unified',
|
|
676
|
+
'cell_31_recog_dialectical_adversary_unified'),
|
|
677
|
+
'Advocate': ('cell_32_base_dialectical_advocate_unified',
|
|
678
|
+
'cell_33_recog_dialectical_advocate_unified'),
|
|
679
|
+
}
|
|
680
|
+
base = []
|
|
681
|
+
recog = []
|
|
682
|
+
for persona in personas:
|
|
683
|
+
b_key, r_key = persona_cells[persona]
|
|
684
|
+
b_data = cell_means.get(b_key)
|
|
685
|
+
r_data = cell_means.get(r_key)
|
|
686
|
+
if b_data and r_data:
|
|
687
|
+
base.append(round(b_data['mean'], 1))
|
|
688
|
+
recog.append(round(r_data['mean'], 1))
|
|
689
|
+
else:
|
|
690
|
+
base = None
|
|
691
|
+
break
|
|
692
|
+
if base and len(base) == 3:
|
|
693
|
+
total_n = sum(v['n'] for v in cell_means.values())
|
|
694
|
+
data_driven = True
|
|
695
|
+
print(' [data-driven from DB]')
|
|
696
|
+
|
|
697
|
+
if not data_driven:
|
|
698
|
+
base = [85.7, 88.5, 82.0]
|
|
699
|
+
recog = [90.2, 88.5, 95.6]
|
|
700
|
+
print(' [hardcoded fallback]')
|
|
701
|
+
|
|
702
|
+
deltas = [r - b for r, b in zip(recog, base)]
|
|
703
|
+
|
|
704
|
+
x = np.arange(len(personas))
|
|
705
|
+
w = 0.35
|
|
706
|
+
|
|
707
|
+
bars_b = ax.bar(x - w/2, base, w, label='Base', color='#95A5A6', edgecolor='#7F8C8D', linewidth=1.2)
|
|
708
|
+
bars_r = ax.bar(x + w/2, recog, w, label='Recognition', color='#27AE60', edgecolor='#1E8449', linewidth=1.2)
|
|
709
|
+
|
|
710
|
+
# Value labels
|
|
711
|
+
for bar in bars_b:
|
|
712
|
+
ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.5,
|
|
713
|
+
f'{bar.get_height():.1f}', ha='center', va='bottom', fontsize=11, fontweight='bold', color='#555')
|
|
714
|
+
for bar in bars_r:
|
|
715
|
+
ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.5,
|
|
716
|
+
f'{bar.get_height():.1f}', ha='center', va='bottom', fontsize=11, fontweight='bold', color='#1E8449')
|
|
717
|
+
|
|
718
|
+
# Delta annotations
|
|
719
|
+
for i, d in enumerate(deltas):
|
|
720
|
+
color = '#C0392B' if d > 2 else '#888' if abs(d) <= 2 else '#2471A3'
|
|
721
|
+
sign = '+' if d >= 0 else ''
|
|
722
|
+
ax.text(x[i] + w/2 + 0.08, recog[i] - 2, f'{sign}{d:.1f}',
|
|
723
|
+
fontsize=11, fontweight='bold', color=color, va='center')
|
|
724
|
+
|
|
725
|
+
ax.set_ylim(75, 100)
|
|
726
|
+
ax.set_xticks(x)
|
|
727
|
+
ax.set_xticklabels(personas, fontsize=13)
|
|
728
|
+
ax.set_ylabel('Mean Score', fontsize=14)
|
|
729
|
+
ax.set_title(f'Figure 7: Superego Persona × Recognition\n(Dialectical Multi-Turn, N={total_n}, Opus Judge)',
|
|
730
|
+
fontsize=15, fontweight='bold')
|
|
731
|
+
ax.legend(fontsize=12, framealpha=0.9)
|
|
732
|
+
ax.spines['top'].set_visible(False)
|
|
733
|
+
ax.spines['right'].set_visible(False)
|
|
734
|
+
|
|
735
|
+
fig.text(0.10, 0.02,
|
|
736
|
+
'Advocate persona shows largest recognition effect (+13.6); '
|
|
737
|
+
'adversary shows zero effect due to over-deference.',
|
|
738
|
+
fontsize=11, fontstyle='italic', color='#777777')
|
|
739
|
+
|
|
740
|
+
fig.tight_layout(rect=[0, 0.08, 1, 1])
|
|
741
|
+
fig.savefig(os.path.join(OUTPUT_DIR, 'figure7.png'))
|
|
742
|
+
plt.close(fig)
|
|
743
|
+
print(' figure7.png')
|
|
744
|
+
|
|
745
|
+
|
|
746
|
+
# ── Figure 8: Scripted vs Dynamic Learner Mechanism Spread (Section 6.10) ────
|
|
747
|
+
|
|
748
|
+
def figure8():
|
|
749
|
+
"""Side-by-side comparison of mechanism spread under scripted vs dynamic learners."""
|
|
750
|
+
|
|
751
|
+
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6), sharey=True)
|
|
752
|
+
|
|
753
|
+
manifest = get_manifest()
|
|
754
|
+
fig8_config = manifest['figures']['figure8'] if manifest else None
|
|
755
|
+
|
|
756
|
+
scripted_labels = None
|
|
757
|
+
scripted_vals = None
|
|
758
|
+
dynamic_labels = None
|
|
759
|
+
dynamic_vals = None
|
|
760
|
+
scripted_n = 360
|
|
761
|
+
dynamic_n = 240
|
|
762
|
+
data_driven = False
|
|
763
|
+
|
|
764
|
+
# Cell-to-mechanism mapping for recognition cells in e0e3a622
|
|
765
|
+
scripted_recog_cells = {
|
|
766
|
+
'cell_41_recog_dialectical_suspicious_unified_superego': 'Self-reflect (susp.)',
|
|
767
|
+
'cell_43_recog_dialectical_adversary_unified_superego': 'Adversary',
|
|
768
|
+
'cell_45_recog_dialectical_advocate_unified_superego': 'Advocate',
|
|
769
|
+
'cell_47_recog_dialectical_suspicious_unified_quantitative': 'Quantitative',
|
|
770
|
+
'cell_49_recog_dialectical_suspicious_unified_erosion': 'Erosion',
|
|
771
|
+
'cell_51_recog_dialectical_suspicious_unified_intersubjective': 'Intersubjective',
|
|
772
|
+
'cell_53_recog_dialectical_suspicious_unified_combined': 'Combined',
|
|
773
|
+
'cell_55_recog_dialectical_profile_tutor': 'Prof. (tutor)',
|
|
774
|
+
'cell_57_recog_dialectical_profile_bidirectional': 'Prof. (bidir)',
|
|
775
|
+
}
|
|
776
|
+
# Dynamic learner recognition cells from 6c033830 + a2b2717c
|
|
777
|
+
dynamic_recog_cells = {
|
|
778
|
+
'cell_61_recog_dialectical_selfreflect_psycho': 'Self-reflect',
|
|
779
|
+
'cell_63_recog_dialectical_profile_bidirectional_psycho': 'Profiling',
|
|
780
|
+
'cell_64_recog_dialectical_intersubjective_psycho': 'Intersubjective',
|
|
781
|
+
'cell_65_recog_dialectical_combined_psycho': 'Combined',
|
|
782
|
+
}
|
|
783
|
+
|
|
784
|
+
if fig8_config and get_db():
|
|
785
|
+
# Scripted
|
|
786
|
+
s_means = query_cell_means([fig8_config['runs']['scripted']], fig8_config['judge_filter'])
|
|
787
|
+
if s_means:
|
|
788
|
+
s_labels = []
|
|
789
|
+
s_vals = []
|
|
790
|
+
for cell, label in scripted_recog_cells.items():
|
|
791
|
+
data = s_means.get(cell)
|
|
792
|
+
if data:
|
|
793
|
+
s_labels.append(label)
|
|
794
|
+
s_vals.append(round(data['mean'], 1))
|
|
795
|
+
if len(s_labels) >= 7:
|
|
796
|
+
scripted_labels = s_labels
|
|
797
|
+
scripted_vals = s_vals
|
|
798
|
+
scripted_n = sum(v['n'] for v in s_means.values())
|
|
799
|
+
|
|
800
|
+
# Dynamic
|
|
801
|
+
d_run_ids = [fig8_config['runs']['dynamic_60_63'], fig8_config['runs']['dynamic_64_65']]
|
|
802
|
+
d_means = query_cell_means(d_run_ids, fig8_config['judge_filter'])
|
|
803
|
+
if d_means:
|
|
804
|
+
d_labels = []
|
|
805
|
+
d_vals = []
|
|
806
|
+
for cell, label in dynamic_recog_cells.items():
|
|
807
|
+
data = d_means.get(cell)
|
|
808
|
+
if data:
|
|
809
|
+
d_labels.append(label)
|
|
810
|
+
d_vals.append(round(data['mean'], 1))
|
|
811
|
+
if len(d_labels) >= 3:
|
|
812
|
+
dynamic_labels = d_labels
|
|
813
|
+
dynamic_vals = d_vals
|
|
814
|
+
dynamic_n = sum(v['n'] for v in d_means.values())
|
|
815
|
+
|
|
816
|
+
if scripted_labels and dynamic_labels:
|
|
817
|
+
data_driven = True
|
|
818
|
+
print(' [data-driven from DB]')
|
|
819
|
+
|
|
820
|
+
if not data_driven:
|
|
821
|
+
scripted_labels = ['Prof. (bidir)', 'Quantitative', 'Combined', 'Prof. (tutor)',
|
|
822
|
+
'Self-reflect', 'Intersubjective', 'Erosion', 'Adversary', 'Advocate']
|
|
823
|
+
scripted_vals = [92.7, 92.6, 92.4, 92.4, 92.1, 91.7, 90.8, 92.6, 90.3]
|
|
824
|
+
dynamic_labels = ['Profiling', 'Combined', 'Self-reflect', 'Intersubjective']
|
|
825
|
+
dynamic_vals = [88.8, 87.8, 85.9, 82.8]
|
|
826
|
+
print(' [hardcoded fallback]')
|
|
827
|
+
|
|
828
|
+
# Sort both by value descending
|
|
829
|
+
s_order = np.argsort(scripted_vals)[::-1]
|
|
830
|
+
scripted_labels = [scripted_labels[i] for i in s_order]
|
|
831
|
+
scripted_vals = [scripted_vals[i] for i in s_order]
|
|
832
|
+
|
|
833
|
+
d_order = np.argsort(dynamic_vals)[::-1]
|
|
834
|
+
dynamic_labels = [dynamic_labels[i] for i in d_order]
|
|
835
|
+
dynamic_vals = [dynamic_vals[i] for i in d_order]
|
|
836
|
+
|
|
837
|
+
# Scripted panel
|
|
838
|
+
colors_s = ['#27AE60'] * len(scripted_vals)
|
|
839
|
+
bars_s = ax1.barh(range(len(scripted_vals)), scripted_vals, color=colors_s, edgecolor='#1E8449', alpha=0.8)
|
|
840
|
+
ax1.set_yticks(range(len(scripted_labels)))
|
|
841
|
+
ax1.set_yticklabels(scripted_labels, fontsize=11)
|
|
842
|
+
ax1.set_xlim(80, 96)
|
|
843
|
+
ax1.set_xlabel('Mean Score (Recognition)', fontsize=12)
|
|
844
|
+
s_range = max(scripted_vals) - min(scripted_vals)
|
|
845
|
+
ax1.set_title(f'Scripted Learner (N={scripted_n})\n{s_range:.1f}-pt range', fontsize=14, fontweight='bold')
|
|
846
|
+
for i, v in enumerate(scripted_vals):
|
|
847
|
+
ax1.text(v + 0.2, i, f'{v:.1f}', va='center', fontsize=10, fontweight='bold')
|
|
848
|
+
# Highlight the band
|
|
849
|
+
ax1.axvspan(min(scripted_vals), max(scripted_vals), alpha=0.1, color='green')
|
|
850
|
+
ax1.spines['top'].set_visible(False)
|
|
851
|
+
ax1.spines['right'].set_visible(False)
|
|
852
|
+
|
|
853
|
+
# Dynamic panel
|
|
854
|
+
colors_d = ['#27AE60' if v > 86 else '#F39C12' if v > 84 else '#E74C3C' for v in dynamic_vals]
|
|
855
|
+
bars_d = ax2.barh(range(len(dynamic_vals)), dynamic_vals, color=colors_d, edgecolor='#333', alpha=0.8)
|
|
856
|
+
ax2.set_yticks(range(len(dynamic_labels)))
|
|
857
|
+
ax2.set_yticklabels(dynamic_labels, fontsize=11)
|
|
858
|
+
ax2.set_xlim(80, 96)
|
|
859
|
+
ax2.set_xlabel('Mean Score (Recognition)', fontsize=12)
|
|
860
|
+
d_range = max(dynamic_vals) - min(dynamic_vals)
|
|
861
|
+
ax2.set_title(f'Dynamic Learner (N={dynamic_n})\n{d_range:.1f}-pt range', fontsize=14, fontweight='bold')
|
|
862
|
+
for i, v in enumerate(dynamic_vals):
|
|
863
|
+
ax2.text(v + 0.2, i, f'{v:.1f}', va='center', fontsize=10, fontweight='bold')
|
|
864
|
+
ax2.axvspan(min(dynamic_vals), max(dynamic_vals), alpha=0.1, color='orange')
|
|
865
|
+
ax2.spines['top'].set_visible(False)
|
|
866
|
+
ax2.spines['right'].set_visible(False)
|
|
867
|
+
|
|
868
|
+
fig.suptitle('Figure 8: Mechanism Differentiation — Scripted vs Dynamic Learner',
|
|
869
|
+
fontsize=16, fontweight='bold', y=1.02)
|
|
870
|
+
fig.tight_layout()
|
|
871
|
+
fig.savefig(os.path.join(OUTPUT_DIR, 'figure8.png'))
|
|
872
|
+
plt.close(fig)
|
|
873
|
+
print(' figure8.png')
|
|
874
|
+
|
|
875
|
+
|
|
876
|
+
# ── Figure 9: Qualitative Tag Divergence (Section 6.11) ─────────────────────

def figure9():
    """Diverging bar chart of qualitative tag frequency (recognition − base).

    Percentages are hardcoded from the bilateral run qualitative assessment.
    Writes 'figure9.png' into OUTPUT_DIR and closes the figure.
    """

    fig, ax = plt.subplots(figsize=(10, 5.5))

    # Fix: the original assigned `tags` twice — a stale 9-entry list (with
    # 'ego_autonomy') that was dead code, immediately shadowed by this
    # 8-entry list that matches base_pct/recog_pct. The dead list is removed.
    # Note: missed_scaffold was 101.7% due to duplicate tag counting per dialogue;
    # capped at 100.0% (deduplicated per dialogue).
    tags = ['recognition_moment', 'strategy_shift', 'emotional_attunement',
            'learner_breakthrough',
            'ego_compliance', 'superego_overcorrection', 'missed_scaffold',
            'stalling']
    base_pct = [0.0, 0.0, 6.9, 80.0, 70.7, 69.0, 100.0, 100.0]
    recog_pct = [51.7, 30.0, 36.7, 80.0, 60.0, 50.0, 68.3, 45.0]

    # Percentage-point change under the recognition condition.
    diff = [r - b for r, b in zip(recog_pct, base_pct)]

    # Sort by difference so bars read most-negative → most-positive.
    order = np.argsort(diff)
    tags = [tags[i] for i in order]
    diff = [diff[i] for i in order]

    # Green = tag more frequent under recognition, red = less frequent.
    colors = ['#27AE60' if d > 0 else '#E74C3C' for d in diff]

    ax.barh(range(len(tags)), diff, color=colors, edgecolor='#333', alpha=0.85)

    # Clean tag names for display (snake_case → Title Case).
    clean = [t.replace('_', ' ').title() for t in tags]
    ax.set_yticks(range(len(clean)))
    ax.set_yticklabels(clean, fontsize=11)
    ax.set_xlabel('Percentage Point Difference (Recognition − Base)', fontsize=12)
    ax.axvline(0, color='black', linewidth=0.8)

    # Value labels, offset away from the zero axis on each side.
    for i, d in enumerate(diff):
        sign = '+' if d > 0 else ''
        ha = 'left' if d >= 0 else 'right'
        offset = 1.5 if d >= 0 else -1.5
        ax.text(d + offset, i, f'{sign}{d:.0f}%', va='center', ha=ha, fontsize=10, fontweight='bold')

    ax.set_title('Figure 9: Qualitative Tag Divergence\n(Bilateral Run, N=118, Base vs Recognition)',
                 fontsize=15, fontweight='bold')
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    fig.tight_layout()
    fig.savefig(os.path.join(OUTPUT_DIR, 'figure9.png'))
    plt.close(fig)
    print(' figure9.png')
|
|
931
|
+
|
|
932
|
+
|
|
442
933
|
# ── Main ──────────────────────────────────────────────────────────────────────

if __name__ == '__main__':
    print('Generating paper figures...')
    # Prefer live data sources; otherwise each figure uses its baked-in numbers.
    if get_manifest() and get_db():
        print(f' Manifest: {MANIFEST_PATH}')
        print(f' Database: {DB_PATH}')
    else:
        print(' WARNING: manifest or DB not found, using hardcoded fallbacks')
    # Render every paper figure, in order.
    for render in (figure1, figure2, figure3, figure4, figure5,
                   figure6, figure7, figure8, figure9):
        render()
    # Release the DB handle if one was opened.
    if _db:
        _db.close()
    print(f'Done. Output: {os.path.abspath(OUTPUT_DIR)}/')
|