@machinespirits/eval 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. package/README.md +91 -9
  2. package/config/eval-settings.yaml +3 -3
  3. package/config/paper-manifest.json +486 -0
  4. package/config/providers.yaml +9 -6
  5. package/config/tutor-agents.yaml +2261 -0
  6. package/content/README.md +23 -0
  7. package/content/courses/479/course.md +53 -0
  8. package/content/courses/479/lecture-1.md +361 -0
  9. package/content/courses/479/lecture-2.md +360 -0
  10. package/content/courses/479/lecture-3.md +655 -0
  11. package/content/courses/479/lecture-4.md +530 -0
  12. package/content/courses/479/lecture-5.md +326 -0
  13. package/content/courses/479/lecture-6.md +346 -0
  14. package/content/courses/479/lecture-7.md +326 -0
  15. package/content/courses/479/lecture-8.md +273 -0
  16. package/content/courses/479/roadmap-slides.md +656 -0
  17. package/content/manifest.yaml +8 -0
  18. package/docs/research/apa.csl +2133 -0
  19. package/docs/research/build.sh +98 -0
  20. package/docs/research/figures/figure1.png +0 -0
  21. package/docs/research/figures/figure10.png +0 -0
  22. package/docs/research/figures/figure11.png +0 -0
  23. package/docs/research/figures/figure2.png +0 -0
  24. package/docs/research/figures/figure3.png +0 -0
  25. package/docs/research/figures/figure4.png +0 -0
  26. package/docs/research/figures/figure5.png +0 -0
  27. package/docs/research/figures/figure6.png +0 -0
  28. package/docs/research/figures/figure7.png +0 -0
  29. package/docs/research/figures/figure8.png +0 -0
  30. package/docs/research/figures/figure9.png +0 -0
  31. package/docs/research/header.tex +25 -0
  32. package/docs/research/paper-full.md +2565 -0
  33. package/docs/research/paper-short.md +436 -0
  34. package/docs/research/references.bib +1143 -0
  35. package/docs/research/slides-header.tex +188 -0
  36. package/docs/research/slides-pptx.md +363 -0
  37. package/docs/research/slides.md +531 -0
  38. package/docs/research/style-reference-pptx.py +199 -0
  39. package/package.json +5 -5
  40. package/scripts/analyze-eval-results.js +69 -17
  41. package/scripts/analyze-mechanism-traces.js +763 -0
  42. package/scripts/analyze-modulation-learning.js +498 -0
  43. package/scripts/analyze-prosthesis.js +144 -0
  44. package/scripts/analyze-run.js +264 -79
  45. package/scripts/assess-transcripts.js +853 -0
  46. package/scripts/browse-transcripts.js +854 -0
  47. package/scripts/check-parse-failures.js +73 -0
  48. package/scripts/code-dialectical-modulation.js +1320 -0
  49. package/scripts/download-data.sh +55 -0
  50. package/scripts/eval-cli.js +106 -18
  51. package/scripts/generate-paper-figures.js +663 -0
  52. package/scripts/generate-paper-figures.py +577 -76
  53. package/scripts/generate-paper-tables.js +299 -0
  54. package/scripts/qualitative-analysis-ai.js +3 -3
  55. package/scripts/render-sequence-diagram.js +694 -0
  56. package/scripts/test-latency.js +210 -0
  57. package/scripts/test-rate-limit.js +95 -0
  58. package/scripts/test-token-budget.js +332 -0
  59. package/scripts/validate-paper-manifest.js +670 -0
  60. package/services/__tests__/evalConfigLoader.test.js +2 -2
  61. package/services/__tests__/learnerRubricEvaluator.test.js +361 -0
  62. package/services/__tests__/learnerTutorInteractionEngine.test.js +326 -0
  63. package/services/evaluationRunner.js +975 -98
  64. package/services/evaluationStore.js +12 -4
  65. package/services/learnerTutorInteractionEngine.js +27 -2
  66. package/services/mockProvider.js +133 -0
  67. package/services/promptRewriter.js +1471 -5
  68. package/services/rubricEvaluator.js +55 -2
  69. package/services/transcriptFormatter.js +675 -0
  70. package/config/machinespirits-eval.code-workspace +0 -11
  71. package/docs/EVALUATION-VARIABLES.md +0 -589
  72. package/docs/REPLICATION-PLAN.md +0 -577
  73. package/scripts/analyze-run.mjs +0 -282
  74. package/scripts/compare-runs.js +0 -44
  75. package/scripts/compare-suggestions.js +0 -80
  76. package/scripts/dig-into-run.js +0 -158
  77. package/scripts/show-failed-suggestions.js +0 -64
  78. /package/scripts/{check-run.mjs → check-run.js} +0 -0
@@ -4,10 +4,15 @@
4
4
  Usage:
5
5
  python scripts/generate-paper-figures.py
6
6
 
7
- Outputs 5 PNGs to docs/research/figures/
7
+ Reads config/paper-manifest.json and queries data/evaluations.db to produce
8
+ data-driven figures. Falls back to hardcoded values if DB is unavailable.
9
+
10
+ Outputs PNGs to docs/research/figures/
8
11
  """
9
12
 
10
13
  import os
14
+ import json
15
+ import sqlite3
11
16
  import matplotlib
12
17
  matplotlib.use('Agg')
13
18
  import matplotlib.pyplot as plt
@@ -15,9 +20,78 @@ import matplotlib.patches as mpatches
15
20
  from matplotlib.patches import FancyBboxPatch, FancyArrowPatch
16
21
  import numpy as np
17
22
 
18
- OUTPUT_DIR = os.path.join(os.path.dirname(__file__), '..', 'docs', 'research', 'figures')
23
+ SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
24
+ ROOT_DIR = os.path.join(SCRIPT_DIR, '..')
25
+ OUTPUT_DIR = os.path.join(ROOT_DIR, 'docs', 'research', 'figures')
26
+ MANIFEST_PATH = os.path.join(ROOT_DIR, 'config', 'paper-manifest.json')
27
+ DB_PATH = os.path.join(ROOT_DIR, 'data', 'evaluations.db')
28
+
19
29
  os.makedirs(OUTPUT_DIR, exist_ok=True)
20
30
 
31
+ # ── Data Layer ───────────────────────────────────────────────────────────────
32
+
33
+ _manifest = None
34
+ _db = None
35
+
36
+ def get_manifest():
37
+ global _manifest
38
+ if _manifest is None and os.path.exists(MANIFEST_PATH):
39
+ with open(MANIFEST_PATH) as f:
40
+ _manifest = json.load(f)
41
+ return _manifest
42
+
43
+ def get_db():
44
+ global _db
45
+ if _db is None and os.path.exists(DB_PATH):
46
+ _db = sqlite3.connect(DB_PATH)
47
+ _db.row_factory = sqlite3.Row
48
+ return _db
49
+
50
+ def query_cell_means(run_ids, judge_filter='claude-opus%'):
51
+ """Query mean overall_score per profile (cell) for given runs."""
52
+ db = get_db()
53
+ if not db:
54
+ return {}
55
+ placeholders = ','.join('?' * len(run_ids))
56
+ rows = db.execute(f"""
57
+ SELECT profile_name, AVG(overall_score) as mean, COUNT(*) as n,
58
+ -- stdev via manual calculation
59
+ AVG(overall_score * overall_score) - AVG(overall_score) * AVG(overall_score) as var
60
+ FROM evaluation_results
61
+ WHERE run_id IN ({placeholders})
62
+ AND judge_model LIKE ?
63
+ AND overall_score IS NOT NULL
64
+ GROUP BY profile_name
65
+ """, [*run_ids, judge_filter]).fetchall()
66
+ return {r['profile_name']: {'mean': r['mean'], 'n': r['n'],
67
+ 'sd': (r['var'] ** 0.5) if r['var'] and r['var'] > 0 else 0}
68
+ for r in rows}
69
+
70
+ def extract_cell_number(profile_name):
71
+ """Extract cell number from profile_name like 'cell_5_recog_single_unified'."""
72
+ parts = profile_name.split('_')
73
+ if len(parts) >= 2 and parts[0] == 'cell':
74
+ try:
75
+ return int(parts[1])
76
+ except ValueError:
77
+ pass
78
+ return None
79
+
80
+ def compute_2x2_effects(cell_means, base_single, base_multi, recog_single, recog_multi):
81
+ """Compute recognition effect, architecture effect, and interaction from 4 cell means."""
82
+ bs = cell_means.get(base_single, {}).get('mean')
83
+ bm = cell_means.get(base_multi, {}).get('mean')
84
+ rs = cell_means.get(recog_single, {}).get('mean')
85
+ rm = cell_means.get(recog_multi, {}).get('mean')
86
+ if None in (bs, bm, rs, rm):
87
+ return None
88
+ recog_effect = ((rs + rm) / 2) - ((bs + bm) / 2)
89
+ arch_effect = ((bm + rm) / 2) - ((bs + rs) / 2)
90
+ interaction = (rm - rs) - (bm - bs)
91
+ return {'recog_effect': recog_effect, 'arch_effect': arch_effect,
92
+ 'interaction': interaction,
93
+ 'means': {'bs': bs, 'bm': bm, 'rs': rs, 'rm': rm}}
94
+
21
95
  # Common styling
22
96
  plt.rcParams.update({
23
97
  'font.size': 13,
@@ -227,9 +301,9 @@ def figure2():
227
301
  def figure3():
228
302
  fig, ax = plt.subplots(figsize=(10, 4))
229
303
 
230
- total = 20.1
231
- prompt_eng = 11.4
232
- recog_unique = 8.7
304
+ total = 19.7
305
+ prompt_eng = 11.6
306
+ recog_unique = 8.0
233
307
  prompt_pct = prompt_eng / total * 100 # 57%
234
308
  recog_pct = recog_unique / total * 100 # 43%
235
309
 
@@ -274,51 +348,89 @@ def figure3():
274
348
  # ── Figure 4: Multi-Agent Synergy by Prompt Type ─────────────────────────────
275
349
 
276
350
  def figure4():
277
- fig, ax = plt.subplots(figsize=(10, 5.5))
278
-
279
- categories = ['Recognition\nPrompts', 'Enhanced\nPrompts']
280
- single = [72.2, 83.3]
281
- multi = [81.5, 83.3]
282
- deltas = ['+9.2**', '+0.0']
351
+ """Multi-model A×B interaction probe (Table 8, N=655 across 5 ego models).
352
+ Shows recognition effect and A×B interaction per model, confirming
353
+ architecture is additive, not synergistic."""
283
354
 
284
- y = np.arange(len(categories))
285
- bar_height = 0.3
286
-
287
- bars1 = ax.barh(y + bar_height/2, single, bar_height, color='#85C1E9',
288
- edgecolor='#2471A3', linewidth=1.5, label='Single-Agent')
289
- bars2 = ax.barh(y - bar_height/2, multi, bar_height, color='#82E0AA',
290
- edgecolor='#1E8449', linewidth=1.5, label='Multi-Agent')
355
+ fig, ax = plt.subplots(figsize=(10, 5.5))
291
356
 
292
- # Score labels
293
- for bar, val in zip(bars1, single):
294
- ax.text(val + 0.5, bar.get_y() + bar.get_height()/2, f'{val}',
295
- va='center', fontsize=12, fontweight='bold')
296
- for bar, val in zip(bars2, multi):
297
- ax.text(val + 0.5, bar.get_y() + bar.get_height()/2, f'{val}',
298
- va='center', fontsize=12, fontweight='bold')
299
-
300
- # Delta labels
301
- for i, delta in enumerate(deltas):
302
- ax.text(max(single[i], multi[i]) + 4.5, y[i],
303
- f'Δ {delta}', ha='center', va='center',
304
- fontsize=12, fontweight='bold',
305
- color='#C0392B' if '**' in delta else '#555555')
306
-
307
- ax.set_xlim(0, 100)
308
- ax.set_yticks(y)
309
- ax.set_yticklabels(categories, fontsize=13)
310
- ax.set_xlabel('Mean Score', fontsize=14)
311
- ax.set_title('Figure 4: Multi-Agent Synergy by Prompt Type\n(Preliminary N=36)',
357
+ manifest = get_manifest()
358
+ fig4_config = manifest['figures']['figure4'] if manifest else None
359
+
360
+ # Try data-driven from DB
361
+ models = []
362
+ recog_effect = []
363
+ ab_interaction = []
364
+ data_driven = False
365
+
366
+ if fig4_config and get_db():
367
+ for key in ['kimi', 'nemotron', 'deepseek', 'glm', 'haiku']:
368
+ cfg = fig4_config['runs'][key]
369
+ cell_means = query_cell_means(cfg['run_ids'], fig4_config['judge_filter'])
370
+ if not cell_means:
371
+ break
372
+ effects = compute_2x2_effects(
373
+ cell_means,
374
+ 'cell_1_base_single_unified', 'cell_3_base_multi_unified',
375
+ 'cell_5_recog_single_unified', 'cell_7_recog_multi_unified')
376
+ if not effects:
377
+ break
378
+ total_n = sum(v['n'] for v in cell_means.values())
379
+ models.append(f"{cfg['label']}\n(N={total_n})")
380
+ recog_effect.append(round(effects['recog_effect'], 1))
381
+ ab_interaction.append(round(effects['interaction'], 1))
382
+ else:
383
+ data_driven = True
384
+ print(' [data-driven from DB]')
385
+
386
+ if not data_driven:
387
+ # Fallback hardcoded values
388
+ models = ['Kimi K2.5\n(N=179)', 'Nemotron\n(N=119)', 'DeepSeek\n(N=120)',
389
+ 'GLM-4.7\n(N=117)', 'Haiku 4.5\n(N=120)']
390
+ recog_effect = [15.5, 16.0, 14.0, 17.8, 9.6]
391
+ ab_interaction = [0.5, -5.7, -1.4, -0.7, -1.6]
392
+ print(' [hardcoded fallback]')
393
+
394
+ x = np.arange(len(models))
395
+ w = 0.35
396
+
397
+ bars_r = ax.bar(x - w/2, recog_effect, w, label='Recognition Effect (A)',
398
+ color='#27AE60', edgecolor='#1E8449', linewidth=1.2)
399
+ bars_i = ax.bar(x + w/2, ab_interaction, w, label='A×B Interaction',
400
+ color='#E74C3C', edgecolor='#C0392B', linewidth=1.2)
401
+
402
+ # Value labels
403
+ for bar in bars_r:
404
+ ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.3,
405
+ f'+{bar.get_height():.1f}', ha='center', va='bottom',
406
+ fontsize=10, fontweight='bold', color='#1E8449')
407
+ for bar in bars_i:
408
+ val = bar.get_height()
409
+ y_pos = val - 0.5 if val < 0 else val + 0.3
410
+ va = 'top' if val < 0 else 'bottom'
411
+ sign = '+' if val > 0 else ''
412
+ ax.text(bar.get_x() + bar.get_width()/2, y_pos,
413
+ f'{sign}{val:.1f}', ha='center', va=va,
414
+ fontsize=10, fontweight='bold', color='#C0392B')
415
+
416
+ ax.axhline(0, color='#999', linewidth=0.8, linestyle='-')
417
+ ax.set_ylim(-8, 22)
418
+ ax.set_xticks(x)
419
+ ax.set_xticklabels(models, fontsize=11)
420
+ ax.set_ylabel('Effect Size (points)', fontsize=14)
421
+ ax.set_title('Figure 4: Architecture is Additive, Not Synergistic\n'
422
+ '(Multi-Model A×B Probe, N=655, Opus Judge)',
312
423
  fontsize=15, fontweight='bold')
313
- ax.legend(loc='lower right', fontsize=12, framealpha=0.9)
424
+ ax.legend(loc='upper right', fontsize=12, framealpha=0.9)
314
425
  ax.spines['top'].set_visible(False)
315
426
  ax.spines['right'].set_visible(False)
316
427
 
317
- fig.text(0.12, 0.02, '** Significant synergy effect (p < .05); however, this did not replicate\n'
318
- ' in the 5-model probe (N=826, mean interaction = −2.2 pts)',
428
+ fig.text(0.10, 0.02,
429
+ 'Recognition effect replicates across all 5 models (+9.6 to +17.8). '
430
+ 'A×B interaction is negligible (mean −1.8 pts).',
319
431
  fontsize=11, fontstyle='italic', color='#777777')
320
432
 
321
- fig.tight_layout(rect=[0, 0.1, 1, 1])
433
+ fig.tight_layout(rect=[0, 0.08, 1, 1])
322
434
  fig.savefig(os.path.join(OUTPUT_DIR, 'figure4.png'))
323
435
  plt.close(fig)
324
436
  print(' figure4.png')
@@ -327,19 +439,53 @@ def figure4():
327
439
  # ── Figure 5: Factor Effects Invert by Domain ────────────────────────────────
328
440
 
329
441
  def figure5():
442
+ """Factor effects by domain using Kimi K2.5 for both domains.
443
+ Elementary: eval-2026-02-05-e87f452d (N=60, cells 1,3,5,7).
444
+ Philosophy: factorial cells 1,3,5,7 (N=179)."""
445
+
330
446
  fig, ax = plt.subplots(figsize=(10, 5.5))
331
447
 
332
- factors = ['A: Recognition\nEffect', 'B: Multi-Agent\nEffect', 'C: Learner\nEffect']
333
- phil = [15.4, -0.8, 2.1]
334
- elem = [4.4, 9.9, 0.75]
448
+ manifest = get_manifest()
449
+ fig5_config = manifest['figures']['figure5'] if manifest else None
450
+
451
+ factors = ['A: Recognition\nEffect', 'B: Multi-Agent\nEffect']
452
+ phil = None
453
+ elem = None
454
+ phil_n = 179
455
+ elem_n = 60
456
+ data_driven = False
457
+
458
+ if fig5_config and get_db():
459
+ # Philosophy: factorial single-learner cells
460
+ phil_means = query_cell_means([fig5_config['runs']['philosophy']], fig5_config['judge_filter'])
461
+ elem_means = query_cell_means([fig5_config['runs']['elementary']], fig5_config['judge_filter'])
462
+ if phil_means and elem_means:
463
+ phil_fx = compute_2x2_effects(phil_means,
464
+ 'cell_1_base_single_unified', 'cell_3_base_multi_unified',
465
+ 'cell_5_recog_single_unified', 'cell_7_recog_multi_unified')
466
+ elem_fx = compute_2x2_effects(elem_means,
467
+ 'cell_1_base_single_unified', 'cell_3_base_multi_unified',
468
+ 'cell_5_recog_single_unified', 'cell_7_recog_multi_unified')
469
+ if phil_fx and elem_fx:
470
+ phil = [round(phil_fx['recog_effect'], 1), round(phil_fx['arch_effect'], 1)]
471
+ elem = [round(elem_fx['recog_effect'], 1), round(elem_fx['arch_effect'], 1)]
472
+ phil_n = sum(v['n'] for v in phil_means.values())
473
+ elem_n = sum(v['n'] for v in elem_means.values())
474
+ data_driven = True
475
+ print(' [data-driven from DB]')
476
+
477
+ if not data_driven:
478
+ phil = [15.4, -0.8]
479
+ elem = [9.9, 3.0]
480
+ print(' [hardcoded fallback]')
335
481
 
336
482
  y = np.arange(len(factors))
337
483
  bar_height = 0.3
338
484
 
339
485
  bars_phil = ax.barh(y + bar_height/2, phil, bar_height, color='#5DADE2',
340
- edgecolor='#2471A3', linewidth=1.5, label='Philosophy')
486
+ edgecolor='#2471A3', linewidth=1.5, label=f'Philosophy (Kimi, N={phil_n})')
341
487
  bars_elem = ax.barh(y - bar_height/2, elem, bar_height, color='#F0B27A',
342
- edgecolor='#CA6F1E', linewidth=1.5, label='Elementary Math')
488
+ edgecolor='#CA6F1E', linewidth=1.5, label=f'Elementary Math (Kimi, N={elem_n})')
343
489
 
344
490
  # Score labels
345
491
  for bar, val in zip(bars_phil, phil):
@@ -357,15 +503,15 @@ def figure5():
357
503
  ax.set_yticks(y)
358
504
  ax.set_yticklabels(factors, fontsize=13)
359
505
  ax.set_xlabel('Effect Size (points)', fontsize=14)
360
- ax.set_title('Figure 5: Factor Effects Invert by Domain',
506
+ ax.set_title('Figure 5: Factor Effects by Domain (Kimi K2.5)',
361
507
  fontsize=15, fontweight='bold')
362
508
  ax.legend(loc='lower right', fontsize=12, framealpha=0.9)
363
509
  ax.spines['top'].set_visible(False)
364
510
  ax.spines['right'].set_visible(False)
365
511
 
366
512
  fig.text(0.12, 0.02,
367
- 'Factor dominance inverts: Philosophy favors recognition (A); Elementary favors architecture (B).\n'
368
- 'Elementary recognition partially model-dependent (Kimi shows d 0.61).',
513
+ 'Recognition dominates in both domains. Architecture provides small additive benefit\n'
514
+ 'on elementary content (+3.0 pts) and negligible effect on philosophy (−0.8 pts).',
369
515
  fontsize=11, fontstyle='italic', color='#777777')
370
516
 
371
517
  fig.tight_layout(rect=[0, 0.1, 1, 1])
@@ -374,64 +520,127 @@ def figure5():
374
520
  print(' figure5.png')
375
521
 
376
522
 
377
- # ── Figure 6: Emergent Theme Word Clouds ──────────────────────────────────────
523
+ # ── Figure 6: Tutor Language Word Clouds ──────────────────────────────────────
378
524
 
379
525
  def figure6():
526
+ """Word clouds from actual tutor transcript text (N=350 factorial responses).
527
+ Shows the raw linguistic differences between base and recognition conditions,
528
+ complementing the AI theme coding in Tables 17b–d."""
529
+
380
530
  try:
381
531
  from wordcloud import WordCloud
382
532
  except ImportError:
383
533
  print(' figure6.png SKIPPED (pip install wordcloud)')
384
534
  return
385
535
 
536
+ import sqlite3
386
537
  import json
387
- data_path = os.path.join(os.path.dirname(__file__), '..', 'exports',
388
- 'qualitative-ai-claude-code-sample300-2026-02-08.json')
389
- if not os.path.exists(data_path):
390
- print(' figure6.png SKIPPED (discovery data not found)')
391
- return
392
-
393
- with open(data_path) as f:
394
- data = json.load(f)
538
+ import re
395
539
 
396
- themes = data['discovery']['analysis']['themeFrequency']
540
+ db_path = os.path.join(os.path.dirname(__file__), '..', 'data', 'evaluations.db')
541
+ if not os.path.exists(db_path):
542
+ print(' figure6.png SKIPPED (database not found)')
543
+ return
397
544
 
398
- base_freq = {}
399
- recog_freq = {}
400
- for key, t in themes.items():
401
- label = t['label']
402
- b = t.get('base', 0)
403
- r = t.get('recognition', 0)
404
- if b + r >= 3:
405
- if b > 0:
406
- base_freq[label] = b
407
- if r > 0:
408
- recog_freq[label] = r
545
+ conn = sqlite3.connect(db_path)
546
+ rows = conn.execute("""
547
+ SELECT profile_name, suggestions
548
+ FROM evaluation_results
549
+ WHERE run_id IN ('eval-2026-02-03-f5d4dd93', 'eval-2026-02-06-a933d745')
550
+ AND overall_score IS NOT NULL
551
+ AND judge_model LIKE '%claude%'
552
+ """).fetchall()
553
+ conn.close()
554
+
555
+ # Extract message text from JSON suggestions
556
+ base_texts = []
557
+ recog_texts = []
558
+ for profile, suggestions_json in rows:
559
+ try:
560
+ suggestions = json.loads(suggestions_json)
561
+ text_parts = []
562
+ for s in suggestions:
563
+ if isinstance(s, dict):
564
+ for key in ('message', 'title', 'reason'):
565
+ if key in s and s[key]:
566
+ text_parts.append(str(s[key]))
567
+ text = ' '.join(text_parts)
568
+ except (json.JSONDecodeError, TypeError):
569
+ text = str(suggestions_json) if suggestions_json else ''
570
+
571
+ if 'recog' in profile:
572
+ recog_texts.append(text)
573
+ else:
574
+ base_texts.append(text)
575
+
576
+ base_corpus = ' '.join(base_texts)
577
+ recog_corpus = ' '.join(recog_texts)
578
+
579
+ # Pedagogical stop words — remove generic terms common to both conditions
580
+ # so the clouds highlight what *differs*
581
+ stop_words = {
582
+ # Standard English stop words
583
+ 'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
584
+ 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
585
+ 'should', 'may', 'might', 'shall', 'can', 'need', 'dare', 'ought',
586
+ 'used', 'to', 'of', 'in', 'for', 'on', 'with', 'at', 'by', 'from',
587
+ 'as', 'into', 'through', 'during', 'before', 'after', 'above', 'below',
588
+ 'between', 'out', 'off', 'over', 'under', 'again', 'further', 'then',
589
+ 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'each',
590
+ 'every', 'both', 'few', 'more', 'most', 'other', 'some', 'such', 'no',
591
+ 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
592
+ 'just', 'because', 'but', 'and', 'or', 'if', 'while', 'about', 'up',
593
+ 'that', 'this', 'these', 'those', 'it', 'its', 'he', 'she', 'they',
594
+ 'them', 'their', 'we', 'our', 'you', 'your', 'i', 'me', 'my', 'also',
595
+ 'which', 'who', 'whom', 'what', 'any', 'much', 'many', 'well',
596
+ 'still', 'even', 'back', 'get', 'go', 'make', 'like', 'take',
597
+ 'one', 'two', 'first', 'new', 'way', 'us',
598
+ # Common tutoring terms shared by both conditions
599
+ 'lecture', 'student', 'course', 'content', 'topic', 'material',
600
+ 'next', 'current', 'help', 'suggest', 'review', 'start', 'continue',
601
+ 'see', 'know', 'think', 'let', 'look', 'want', 'come',
602
+ }
603
+
604
+ def text_to_freq(corpus, stop_words):
605
+ words = re.findall(r'[a-z]{3,}', corpus.lower())
606
+ freq = {}
607
+ for w in words:
608
+ if w not in stop_words:
609
+ freq[w] = freq.get(w, 0) + 1
610
+ return freq
611
+
612
+ base_freq = text_to_freq(base_corpus, stop_words)
613
+ recog_freq = text_to_freq(recog_corpus, stop_words)
614
+
615
+ if not base_freq or not recog_freq:
616
+ print(' figure6.png SKIPPED (no text extracted)')
617
+ return
409
618
 
410
619
  fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 7))
411
620
 
412
621
  wc_base = WordCloud(
413
622
  width=1200, height=800, background_color='white', colormap='OrRd',
414
- max_words=30, max_font_size=120, min_font_size=14,
623
+ max_words=50, max_font_size=120, min_font_size=14,
415
624
  prefer_horizontal=0.85, relative_scaling=0.5, margin=10,
416
625
  collocations=False,
417
626
  ).generate_from_frequencies(base_freq)
418
627
 
419
628
  wc_recog = WordCloud(
420
629
  width=1200, height=800, background_color='white', colormap='YlGn',
421
- max_words=30, max_font_size=120, min_font_size=14,
630
+ max_words=50, max_font_size=120, min_font_size=14,
422
631
  prefer_horizontal=0.85, relative_scaling=0.5, margin=10,
423
632
  collocations=False,
424
633
  ).generate_from_frequencies(recog_freq)
425
634
 
426
635
  ax1.imshow(wc_base, interpolation='bilinear')
427
- ax1.set_title('Base Condition', fontsize=18, fontweight='bold', pad=15)
636
+ ax1.set_title('Base Condition (N=172)', fontsize=18, fontweight='bold', pad=15)
428
637
  ax1.axis('off')
429
638
 
430
639
  ax2.imshow(wc_recog, interpolation='bilinear')
431
- ax2.set_title('Recognition Condition', fontsize=18, fontweight='bold', pad=15)
640
+ ax2.set_title('Recognition Condition (N=178)', fontsize=18, fontweight='bold', pad=15)
432
641
  ax2.axis('off')
433
642
 
434
- fig.suptitle('Figure 6: Emergent Theme Word Clouds (AI Discovery, N=300)',
643
+ fig.suptitle('Figure 6: Tutor Language Word Clouds (Factorial, N=350)',
435
644
  fontsize=16, fontweight='bold', y=0.98)
436
645
  fig.tight_layout(rect=[0, 0.02, 1, 0.94])
437
646
  fig.savefig(os.path.join(OUTPUT_DIR, 'figure6.png'))
@@ -439,14 +648,306 @@ def figure6():
439
648
  print(' figure6.png')
440
649
 
441
650
 
651
+ # ── Figure 7: Persona × Recognition (Section 6.8) ───────────────────────────
652
+
653
+ def figure7():
654
+ """Grouped bar chart: superego persona × recognition for dialectical
655
+ multi-turn modulation (cells 28-33, N=90)."""
656
+
657
+ fig, ax = plt.subplots(figsize=(9, 5.5))
658
+
659
+ manifest = get_manifest()
660
+ fig7_config = manifest['figures']['figure7'] if manifest else None
661
+
662
+ personas = ['Suspicious', 'Adversary', 'Advocate']
663
+ base = None
664
+ recog = None
665
+ total_n = 90
666
+ data_driven = False
667
+
668
+ if fig7_config and get_db():
669
+ cell_means = query_cell_means(fig7_config['runs'], fig7_config['judge_filter'])
670
+ if cell_means:
671
+ # Cells 28-33: base/recog × suspicious/adversary/advocate
672
+ persona_cells = {
673
+ 'Suspicious': ('cell_28_base_dialectical_suspicious_unified',
674
+ 'cell_29_recog_dialectical_suspicious_unified'),
675
+ 'Adversary': ('cell_30_base_dialectical_adversary_unified',
676
+ 'cell_31_recog_dialectical_adversary_unified'),
677
+ 'Advocate': ('cell_32_base_dialectical_advocate_unified',
678
+ 'cell_33_recog_dialectical_advocate_unified'),
679
+ }
680
+ base = []
681
+ recog = []
682
+ for persona in personas:
683
+ b_key, r_key = persona_cells[persona]
684
+ b_data = cell_means.get(b_key)
685
+ r_data = cell_means.get(r_key)
686
+ if b_data and r_data:
687
+ base.append(round(b_data['mean'], 1))
688
+ recog.append(round(r_data['mean'], 1))
689
+ else:
690
+ base = None
691
+ break
692
+ if base and len(base) == 3:
693
+ total_n = sum(v['n'] for v in cell_means.values())
694
+ data_driven = True
695
+ print(' [data-driven from DB]')
696
+
697
+ if not data_driven:
698
+ base = [85.7, 88.5, 82.0]
699
+ recog = [90.2, 88.5, 95.6]
700
+ print(' [hardcoded fallback]')
701
+
702
+ deltas = [r - b for r, b in zip(recog, base)]
703
+
704
+ x = np.arange(len(personas))
705
+ w = 0.35
706
+
707
+ bars_b = ax.bar(x - w/2, base, w, label='Base', color='#95A5A6', edgecolor='#7F8C8D', linewidth=1.2)
708
+ bars_r = ax.bar(x + w/2, recog, w, label='Recognition', color='#27AE60', edgecolor='#1E8449', linewidth=1.2)
709
+
710
+ # Value labels
711
+ for bar in bars_b:
712
+ ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.5,
713
+ f'{bar.get_height():.1f}', ha='center', va='bottom', fontsize=11, fontweight='bold', color='#555')
714
+ for bar in bars_r:
715
+ ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.5,
716
+ f'{bar.get_height():.1f}', ha='center', va='bottom', fontsize=11, fontweight='bold', color='#1E8449')
717
+
718
+ # Delta annotations
719
+ for i, d in enumerate(deltas):
720
+ color = '#C0392B' if d > 2 else '#888' if abs(d) <= 2 else '#2471A3'
721
+ sign = '+' if d >= 0 else ''
722
+ ax.text(x[i] + w/2 + 0.08, recog[i] - 2, f'{sign}{d:.1f}',
723
+ fontsize=11, fontweight='bold', color=color, va='center')
724
+
725
+ ax.set_ylim(75, 100)
726
+ ax.set_xticks(x)
727
+ ax.set_xticklabels(personas, fontsize=13)
728
+ ax.set_ylabel('Mean Score', fontsize=14)
729
+ ax.set_title(f'Figure 7: Superego Persona × Recognition\n(Dialectical Multi-Turn, N={total_n}, Opus Judge)',
730
+ fontsize=15, fontweight='bold')
731
+ ax.legend(fontsize=12, framealpha=0.9)
732
+ ax.spines['top'].set_visible(False)
733
+ ax.spines['right'].set_visible(False)
734
+
735
+ fig.text(0.10, 0.02,
736
+ 'Advocate persona shows largest recognition effect (+13.6); '
737
+ 'adversary shows zero effect due to over-deference.',
738
+ fontsize=11, fontstyle='italic', color='#777777')
739
+
740
+ fig.tight_layout(rect=[0, 0.08, 1, 1])
741
+ fig.savefig(os.path.join(OUTPUT_DIR, 'figure7.png'))
742
+ plt.close(fig)
743
+ print(' figure7.png')
744
+
745
+
746
+ # ── Figure 8: Scripted vs Dynamic Learner Mechanism Spread (Section 6.10) ────
747
+
748
+ def figure8():
749
+ """Side-by-side comparison of mechanism spread under scripted vs dynamic learners."""
750
+
751
+ fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6), sharey=True)
752
+
753
+ manifest = get_manifest()
754
+ fig8_config = manifest['figures']['figure8'] if manifest else None
755
+
756
+ scripted_labels = None
757
+ scripted_vals = None
758
+ dynamic_labels = None
759
+ dynamic_vals = None
760
+ scripted_n = 360
761
+ dynamic_n = 240
762
+ data_driven = False
763
+
764
+ # Cell-to-mechanism mapping for recognition cells in e0e3a622
765
+ scripted_recog_cells = {
766
+ 'cell_41_recog_dialectical_suspicious_unified_superego': 'Self-reflect (susp.)',
767
+ 'cell_43_recog_dialectical_adversary_unified_superego': 'Adversary',
768
+ 'cell_45_recog_dialectical_advocate_unified_superego': 'Advocate',
769
+ 'cell_47_recog_dialectical_suspicious_unified_quantitative': 'Quantitative',
770
+ 'cell_49_recog_dialectical_suspicious_unified_erosion': 'Erosion',
771
+ 'cell_51_recog_dialectical_suspicious_unified_intersubjective': 'Intersubjective',
772
+ 'cell_53_recog_dialectical_suspicious_unified_combined': 'Combined',
773
+ 'cell_55_recog_dialectical_profile_tutor': 'Prof. (tutor)',
774
+ 'cell_57_recog_dialectical_profile_bidirectional': 'Prof. (bidir)',
775
+ }
776
+ # Dynamic learner recognition cells from 6c033830 + a2b2717c
777
+ dynamic_recog_cells = {
778
+ 'cell_61_recog_dialectical_selfreflect_psycho': 'Self-reflect',
779
+ 'cell_63_recog_dialectical_profile_bidirectional_psycho': 'Profiling',
780
+ 'cell_64_recog_dialectical_intersubjective_psycho': 'Intersubjective',
781
+ 'cell_65_recog_dialectical_combined_psycho': 'Combined',
782
+ }
783
+
784
+ if fig8_config and get_db():
785
+ # Scripted
786
+ s_means = query_cell_means([fig8_config['runs']['scripted']], fig8_config['judge_filter'])
787
+ if s_means:
788
+ s_labels = []
789
+ s_vals = []
790
+ for cell, label in scripted_recog_cells.items():
791
+ data = s_means.get(cell)
792
+ if data:
793
+ s_labels.append(label)
794
+ s_vals.append(round(data['mean'], 1))
795
+ if len(s_labels) >= 7:
796
+ scripted_labels = s_labels
797
+ scripted_vals = s_vals
798
+ scripted_n = sum(v['n'] for v in s_means.values())
799
+
800
+ # Dynamic
801
+ d_run_ids = [fig8_config['runs']['dynamic_60_63'], fig8_config['runs']['dynamic_64_65']]
802
+ d_means = query_cell_means(d_run_ids, fig8_config['judge_filter'])
803
+ if d_means:
804
+ d_labels = []
805
+ d_vals = []
806
+ for cell, label in dynamic_recog_cells.items():
807
+ data = d_means.get(cell)
808
+ if data:
809
+ d_labels.append(label)
810
+ d_vals.append(round(data['mean'], 1))
811
+ if len(d_labels) >= 3:
812
+ dynamic_labels = d_labels
813
+ dynamic_vals = d_vals
814
+ dynamic_n = sum(v['n'] for v in d_means.values())
815
+
816
+ if scripted_labels and dynamic_labels:
817
+ data_driven = True
818
+ print(' [data-driven from DB]')
819
+
820
+ if not data_driven:
821
+ scripted_labels = ['Prof. (bidir)', 'Quantitative', 'Combined', 'Prof. (tutor)',
822
+ 'Self-reflect', 'Intersubjective', 'Erosion', 'Adversary', 'Advocate']
823
+ scripted_vals = [92.7, 92.6, 92.4, 92.4, 92.1, 91.7, 90.8, 92.6, 90.3]
824
+ dynamic_labels = ['Profiling', 'Combined', 'Self-reflect', 'Intersubjective']
825
+ dynamic_vals = [88.8, 87.8, 85.9, 82.8]
826
+ print(' [hardcoded fallback]')
827
+
828
+ # Sort both by value descending
829
+ s_order = np.argsort(scripted_vals)[::-1]
830
+ scripted_labels = [scripted_labels[i] for i in s_order]
831
+ scripted_vals = [scripted_vals[i] for i in s_order]
832
+
833
+ d_order = np.argsort(dynamic_vals)[::-1]
834
+ dynamic_labels = [dynamic_labels[i] for i in d_order]
835
+ dynamic_vals = [dynamic_vals[i] for i in d_order]
836
+
837
+ # Scripted panel
838
+ colors_s = ['#27AE60'] * len(scripted_vals)
839
+ bars_s = ax1.barh(range(len(scripted_vals)), scripted_vals, color=colors_s, edgecolor='#1E8449', alpha=0.8)
840
+ ax1.set_yticks(range(len(scripted_labels)))
841
+ ax1.set_yticklabels(scripted_labels, fontsize=11)
842
+ ax1.set_xlim(80, 96)
843
+ ax1.set_xlabel('Mean Score (Recognition)', fontsize=12)
844
+ s_range = max(scripted_vals) - min(scripted_vals)
845
+ ax1.set_title(f'Scripted Learner (N={scripted_n})\n{s_range:.1f}-pt range', fontsize=14, fontweight='bold')
846
+ for i, v in enumerate(scripted_vals):
847
+ ax1.text(v + 0.2, i, f'{v:.1f}', va='center', fontsize=10, fontweight='bold')
848
+ # Highlight the band
849
+ ax1.axvspan(min(scripted_vals), max(scripted_vals), alpha=0.1, color='green')
850
+ ax1.spines['top'].set_visible(False)
851
+ ax1.spines['right'].set_visible(False)
852
+
853
+ # Dynamic panel
854
+ colors_d = ['#27AE60' if v > 86 else '#F39C12' if v > 84 else '#E74C3C' for v in dynamic_vals]
855
+ bars_d = ax2.barh(range(len(dynamic_vals)), dynamic_vals, color=colors_d, edgecolor='#333', alpha=0.8)
856
+ ax2.set_yticks(range(len(dynamic_labels)))
857
+ ax2.set_yticklabels(dynamic_labels, fontsize=11)
858
+ ax2.set_xlim(80, 96)
859
+ ax2.set_xlabel('Mean Score (Recognition)', fontsize=12)
860
+ d_range = max(dynamic_vals) - min(dynamic_vals)
861
+ ax2.set_title(f'Dynamic Learner (N={dynamic_n})\n{d_range:.1f}-pt range', fontsize=14, fontweight='bold')
862
+ for i, v in enumerate(dynamic_vals):
863
+ ax2.text(v + 0.2, i, f'{v:.1f}', va='center', fontsize=10, fontweight='bold')
864
+ ax2.axvspan(min(dynamic_vals), max(dynamic_vals), alpha=0.1, color='orange')
865
+ ax2.spines['top'].set_visible(False)
866
+ ax2.spines['right'].set_visible(False)
867
+
868
+ fig.suptitle('Figure 8: Mechanism Differentiation — Scripted vs Dynamic Learner',
869
+ fontsize=16, fontweight='bold', y=1.02)
870
+ fig.tight_layout()
871
+ fig.savefig(os.path.join(OUTPUT_DIR, 'figure8.png'))
872
+ plt.close(fig)
873
+ print(' figure8.png')
874
+
875
+
876
+ # ── Figure 9: Qualitative Tag Divergence (Section 6.11) ─────────────────────
877
+
878
def figure9():
    """Diverging bar chart: tag frequency difference (recognition - base)
    from bilateral run qualitative assessment.

    Plots, for each qualitative tag, the percentage-point difference
    between the recognition condition and the base condition, sorted
    ascending so negative (red) bars sit at the bottom and positive
    (green) bars at the top. Saves figure9.png into OUTPUT_DIR.
    """

    fig, ax = plt.subplots(figsize=(10, 5.5))

    # Note: missed_scaffold was 101.7% due to duplicate tag counting per dialogue;
    # capped at 100.0% (deduplicated per dialogue).
    tags = ['recognition_moment', 'strategy_shift', 'emotional_attunement',
            'learner_breakthrough',
            'ego_compliance', 'superego_overcorrection', 'missed_scaffold',
            'stalling']
    # Tag frequencies (% of dialogues) in each condition, index-aligned with `tags`.
    base_pct = [0.0, 0.0, 6.9, 80.0, 70.7, 69.0, 100.0, 100.0]
    recog_pct = [51.7, 30.0, 36.7, 80.0, 60.0, 50.0, 68.3, 45.0]

    diff = [r - b for r, b in zip(recog_pct, base_pct)]

    # Sort by difference (ascending) so the most negative tag plots first.
    order = np.argsort(diff)
    tags = [tags[i] for i in order]
    diff = [diff[i] for i in order]

    # Green for tags more frequent under recognition, red for less frequent.
    colors = ['#27AE60' if d > 0 else '#E74C3C' for d in diff]

    bars = ax.barh(range(len(tags)), diff, color=colors, edgecolor='#333', alpha=0.85)

    # Clean tag names: snake_case -> Title Case for display.
    clean = [t.replace('_', ' ').title() for t in tags]
    ax.set_yticks(range(len(clean)))
    ax.set_yticklabels(clean, fontsize=11)
    ax.set_xlabel('Percentage Point Difference (Recognition − Base)', fontsize=12)
    ax.axvline(0, color='black', linewidth=0.8)

    # Value labels, offset away from the zero line on the bar's side.
    for i, d in enumerate(diff):
        sign = '+' if d > 0 else ''
        ha = 'left' if d >= 0 else 'right'
        offset = 1.5 if d >= 0 else -1.5
        ax.text(d + offset, i, f'{sign}{d:.0f}%', va='center', ha=ha, fontsize=10, fontweight='bold')

    ax.set_title('Figure 9: Qualitative Tag Divergence\n(Bilateral Run, N=118, Base vs Recognition)',
                 fontsize=15, fontweight='bold')
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    fig.tight_layout()
    fig.savefig(os.path.join(OUTPUT_DIR, 'figure9.png'))
    plt.close(fig)
    print('  figure9.png')
931
+
932
+
442
933
# ── Main ──────────────────────────────────────────────────────────────────────

if __name__ == '__main__':
    print('Generating paper figures...')
    # Report whether figures will be data-driven (manifest + DB found) or
    # fall back to the hardcoded values embedded in each figure function.
    if get_manifest() and get_db():
        print(f' Manifest: {MANIFEST_PATH}')
        print(f' Database: {DB_PATH}')
    else:
        print(' WARNING: manifest or DB not found, using hardcoded fallbacks')
    try:
        figure1()
        figure2()
        figure3()
        figure4()
        figure5()
        figure6()
        figure7()
        figure8()
        figure9()
    finally:
        # Close the shared DB handle even if a figure function raises,
        # so the connection is never leaked on a partial run.
        if _db:
            _db.close()
    print(f'Done. Output: {os.path.abspath(OUTPUT_DIR)}/')