npm - @machinespirits/eval - Versions diffs - 0.2.0 → 0.3.0 - Mend

@machinespirits/eval 0.2.0 → 0.3.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (74)

package/README.md +91 -9
package/config/eval-settings.yaml +3 -3
package/config/paper-manifest.json +486 -0
package/config/providers.yaml +9 -6
package/config/tutor-agents.yaml +2261 -0
package/content/README.md +23 -0
package/content/courses/479/course.md +53 -0
package/content/courses/479/lecture-1.md +361 -0
package/content/courses/479/lecture-2.md +360 -0
package/content/courses/479/lecture-3.md +655 -0
package/content/courses/479/lecture-4.md +530 -0
package/content/courses/479/lecture-5.md +326 -0
package/content/courses/479/lecture-6.md +346 -0
package/content/courses/479/lecture-7.md +326 -0
package/content/courses/479/lecture-8.md +273 -0
package/content/courses/479/roadmap-slides.md +656 -0
package/content/manifest.yaml +8 -0
package/docs/research/build.sh +44 -20
package/docs/research/figures/figure10.png +0 -0
package/docs/research/figures/figure11.png +0 -0
package/docs/research/figures/figure3.png +0 -0
package/docs/research/figures/figure4.png +0 -0
package/docs/research/figures/figure5.png +0 -0
package/docs/research/figures/figure6.png +0 -0
package/docs/research/figures/figure7.png +0 -0
package/docs/research/figures/figure8.png +0 -0
package/docs/research/figures/figure9.png +0 -0
package/docs/research/header.tex +23 -2
package/docs/research/paper-full.md +941 -285
package/docs/research/paper-short.md +216 -585
package/docs/research/references.bib +132 -0
package/docs/research/slides-header.tex +188 -0
package/docs/research/slides-pptx.md +363 -0
package/docs/research/slides.md +531 -0
package/docs/research/style-reference-pptx.py +199 -0
package/package.json +6 -5
package/scripts/analyze-eval-results.js +69 -17
package/scripts/analyze-mechanism-traces.js +763 -0
package/scripts/analyze-modulation-learning.js +498 -0
package/scripts/analyze-prosthesis.js +144 -0
package/scripts/analyze-run.js +264 -79
package/scripts/assess-transcripts.js +853 -0
package/scripts/browse-transcripts.js +854 -0
package/scripts/check-parse-failures.js +73 -0
package/scripts/code-dialectical-modulation.js +1320 -0
package/scripts/download-data.sh +55 -0
package/scripts/eval-cli.js +106 -18
package/scripts/generate-paper-figures.js +663 -0
package/scripts/generate-paper-figures.py +577 -76
package/scripts/generate-paper-tables.js +299 -0
package/scripts/qualitative-analysis-ai.js +3 -3
package/scripts/render-sequence-diagram.js +694 -0
package/scripts/test-latency.js +210 -0
package/scripts/test-rate-limit.js +95 -0
package/scripts/test-token-budget.js +332 -0
package/scripts/validate-paper-manifest.js +670 -0
package/services/__tests__/evalConfigLoader.test.js +2 -2
package/services/__tests__/learnerRubricEvaluator.test.js +361 -0
package/services/__tests__/learnerTutorInteractionEngine.test.js +326 -0
package/services/evaluationRunner.js +975 -98
package/services/evaluationStore.js +12 -4
package/services/learnerTutorInteractionEngine.js +27 -2
package/services/mockProvider.js +133 -0
package/services/promptRewriter.js +1471 -5
package/services/rubricEvaluator.js +55 -2
package/services/transcriptFormatter.js +675 -0
package/docs/EVALUATION-VARIABLES.md +0 -589
package/docs/REPLICATION-PLAN.md +0 -577
package/scripts/analyze-run.mjs +0 -282
package/scripts/compare-runs.js +0 -44
package/scripts/compare-suggestions.js +0 -80
package/scripts/dig-into-run.js +0 -158
package/scripts/show-failed-suggestions.js +0 -64
package/scripts/{check-run.mjs → check-run.js} +0 -0

package/README.md CHANGED Viewed

@@ -8,7 +8,7 @@ This is the evaluation and analysis companion to [`@machinespirits/tutor-core`](
 The system runs automated tutoring dialogues across configurable experimental cells, then scores them with LLM judges against a multi-dimensional rubric. It supports:
-- **Factorial evaluation** — 21 tutor agent cells varying recognition theory, architecture (single-agent vs ego+superego), and learner type
+- **Factorial evaluation** — 70 tutor agent cells varying recognition theory, architecture, learner type, and mechanism design
 - **Multi-turn dialogues** — Learner agents with their own ego-superego deliberation
 - **Multi-judge validation** — Cross-judge reliability via Claude Opus, GPT-5.2, and others
 - **Placebo/active controls** — Length-matched prompts without recognition theory
@@ -17,7 +17,7 @@ The system runs automated tutoring dialogues across configurable experimental ce
 ## Prerequisites
 - **Node.js** >= 18.0.0
-- **@machinespirits/tutor-core** 0.3.1 (peer dependency)
+- **@machinespirits/tutor-core** >= 0.3.1 (peer dependency)
 - At least one AI provider API key (see below)
 ## Installation
@@ -86,6 +86,18 @@ node scripts/eval-cli.js report <run-id>
 node scripts/eval-cli.js export <run-id> --format csv
 ```
+### Dry-run mode (no API keys required)
+Verify the full pipeline without API calls:
+```bash
+node scripts/eval-cli.js quick --dry-run
+node scripts/eval-cli.js run --dry-run --runs 2
+node scripts/eval-cli.js run --dry-run --runs 3 --scenario new_user_first_visit
+```
+Dry-run uses deterministic mock data that mirrors real score distributions (recognition cells ~85-92, base cells ~72-82). All downstream steps (DB storage, ANOVA, reporting) work normally on the mock data.
 ### Standalone server
 ```bash
@@ -109,14 +121,16 @@ scripts/                   CLI tools and analysis scripts
 services/                  Core evaluation engine, rubric evaluator, learner simulation
 routes/                    Express API routes (optional server mode)
 data/                      SQLite databases (evaluation results, writing pads)
-content-test-elementary/   Bundled test content package
-docs/                      Documentation and research paper
+content/                   Bundled course content (philosophy 479)
+content-test-elementary/   Bundled test content (elementary 101)
+notebooks/                 Reproducibility notebook (Jupyter)
+docs/research/             Research paper and build scripts
 tests/                     Test suites
 ```
 ### Key configuration files
-- `config/tutor-agents.yaml` — All 21 experimental cells and their prompt mappings
+- `config/tutor-agents.yaml` — All 70 experimental cells and their prompt mappings
 - `config/suggestion-scenarios.yaml` — Learner scenarios (single-turn and multi-turn)
 - `config/evaluation-rubric.yaml` — Scoring rubric (6 dimensions)
 - `config/providers.yaml` — AI provider and model configuration
@@ -131,11 +145,73 @@ The core factorial design crosses three factors:
 | B: Tutor architecture | Single-agent vs Ego+Superego |
 | C: Learner architecture | Single-agent vs Multi-agent |
-Additional cells test enhanced prompts (9-12), hardwired rules (13-14), placebo controls (15-18), memory isolation (19-20), and dynamic prompt rewriting (21).
+Additional cells test enhanced prompts (9-12), hardwired rules (13-14), placebo controls (15-18), memory isolation (19-20), dynamic prompt rewriting (21), dialectical superego modulation (22-39), self-reflective evolution (40-45), insight-action mechanisms (46-53), other-ego profiling (54-59), and dynamic learner mechanism testing (60-70).
+## Reproducing Paper Findings
+The full research paper is at `docs/research/paper-full.md`. A Jupyter notebook in `notebooks/` independently reproduces all 17 tables and key statistical findings.
+The evaluation dataset (database + dialogue logs, ~19 MB) is available as a [GitHub Release artifact](https://github.com/liammagee/machinespirits-eval/releases/tag/v0.2.0). See `notebooks/README.md` for setup instructions.
+To re-run evaluations from scratch (rather than reproducing from saved data), expect ~$65–90 USD in API costs and 48–72 hours wall-clock time. See the CLI help (`node scripts/eval-cli.js --help`) for details on running cells, judging, and exporting results.
+## Scripts Reference
+### Analysis
+| Script | Description |
+|--------|-------------|
+| `analyze-eval-results.js` | Statistical analysis (ANOVA, effect sizes, marginal means) |
+| `analyze-judge-reliability.js` | Inter-judge reliability (requires rejudged data) |
+| `analyze-mechanism-traces.js <runId>` | Process trace analysis for mechanism comparison runs |
+| `analyze-eval-costs.js` | Cost breakdown across runs |
+| `analyze-interaction-evals.js` | Interaction evaluation results |
+| `analyze-modulation-learning.js` | Modulation metrics and learning outcomes |
+| `advanced-eval-analysis.js` | Extended multi-turn scenario analysis |
+| `compare-transformation.js` | Transformation metrics (adaptation, growth indices) |
-## Research Paper
+### Qualitative
-The full research paper is included at `docs/research/PAPER-FULL-2026-02-04.md`. For replication instructions, see `docs/REPLICATION-PLAN.md`.
+| Script | Description |
+|--------|-------------|
+| `assess-transcripts.js <runId>` | Qualitative transcript assessment (`--blinded`, `--force`) |
+| `browse-transcripts.js` | Interactive transcript browser (terminal UI) |
+| `qualitative-analysis-ai.js` | AI-based thematic analysis of transcripts |
+| `code-impasse-strategies.js` | Code impasse dialogues into Hegelian resolution strategies |
+| `code-dialectical-modulation.js` | Code superego modulation patterns |
+### Paper & Validation
+| Script | Description |
+|--------|-------------|
+| `generate-paper-tables.js` | Generate tables and validate prose against DB |
+| `validate-paper-manifest.js` | Validate paper claims against evaluation data |
+| `render-sequence-diagram.js` | Render architecture sequence diagrams |
+| `validate-content.js` | Validate tutorial content files |
+### Utilities
+| Script | Description |
+|--------|-------------|
+| `test-rate-limit.js [model]` | Probe OpenRouter rate limits (default: nemotron) |
+| `test-latency.js` | Latency test across all configured models |
+| `seed-db.js` | Initialize/seed the SQLite database |
+All scripts are in `scripts/` and run with `node scripts/<name>`.
+## Claude Code Skills
+This project includes [Claude Code skills](https://docs.anthropic.com/en/docs/claude-code/skills) (`.claude/skills/`) that encode common evaluation workflows as slash commands. In any Claude Code session:
+| Command | What it does |
+|---------|-------------|
+| `/analyze-run <runId>` | Pull scores from DB, compute means, effect sizes, flag issues |
+| `/check-models [alias]` | Probe OpenRouter rate limits and availability |
+| `/build-paper` | Build paper PDF, check citations and cross-references |
+| `/run-eval <cells> --runs N` | Full generation + judging pipeline with pre-flight checks |
+| `/query-db <question>` | Natural language query against the evaluation database |
+Skills with a `description` field (`analyze-run`, `check-models`, `query-db`) can also be invoked automatically by Claude when relevant to the conversation. `/run-eval` requires explicit invocation since it consumes API credits.
 ## Running Tests
@@ -143,6 +219,12 @@ The full research paper is included at `docs/research/PAPER-FULL-2026-02-04.md`.
 npm test
 ```
+## Known Deferred Risks
+This repository currently accepts a small set of known risks because it is run as an internal localhost-only evaluation system.
+See `notes/known-risks-localhost-2026-02-13.md` for the tracked risk register, acceptance scope, and hardening triggers required before broader deployment.
 ## Citation
 If you use this software in your research, please cite:
@@ -150,7 +232,7 @@ If you use this software in your research, please cite:
 ```bibtex
 @misc{magee2026machinespirits,
   author = {Magee, Liam},
-  title = {The Drama Machine in Education: Recognition Theory and Multi-Agent Tutoring},
+  title = {\textit{Geist} in the Machine: Recognition Theory and Multi-Agent Tutoring},
   year = {2026},
   url = {https://github.com/liammagee/machinespirits-eval}
 }

package/config/eval-settings.yaml CHANGED Viewed

@@ -6,9 +6,9 @@
 content:
   # Path to a content package containing courses/ with lecture markdown.
   # Relative paths are resolved from the eval repo root.
-  # Default: bundled test content. Override with EVAL_CONTENT_PATH env var
-  # or change this path to point to a full content package.
-  content_package_path: "./content-test-elementary"
+  # Default: bundled 479 course content. Override with EVAL_CONTENT_PATH env var
+  # to use the full content-philosophy package (e.g., "../machinespirits-content-philosophy").
+  content_package_path: "./content"
   # Maximum characters to include from a lecture file.
   # Safety valve for token budget in large lectures.

package/config/paper-manifest.json ADDED Viewed

@@ -0,0 +1,486 @@
+{
+  "version": "1.4.0",
+  "generated": "2026-02-17",
+  "database": "data/evaluations.db",
+  "paper": "docs/research/paper-full.md",
+  "key_evaluations": [
+    {
+      "run_ids": ["eval-2026-02-03-86b159cd"],
+      "label": "Recognition validation",
+      "section": "6.1",
+      "primary_judge_pattern": "claude-opus%",
+      "unit": "response",
+      "expected_attempts": 36,
+      "expected_scored": 36
+    },
+    {
+      "run_ids": ["eval-2026-02-03-f5d4dd93"],
+      "label": "Full factorial, cells 1-5,7 (Kimi)",
+      "section": "6.3",
+      "primary_judge_pattern": "claude-opus%",
+      "unit": "response",
+      "expected_attempts": 262,
+      "expected_scored": 262
+    },
+    {
+      "run_ids": ["eval-2026-02-06-a933d745"],
+      "label": "Full factorial, cells 6,8 re-run (Kimi)",
+      "section": "6.3",
+      "primary_judge_pattern": "claude-opus%",
+      "unit": "response",
+      "expected_attempts": 90,
+      "expected_scored": 88
+    },
+    {
+      "run_ids": ["eval-2026-02-05-10b344fb"],
+      "label": "A×B replication (Kimi)",
+      "section": "6.4",
+      "primary_judge_pattern": "claude-opus%",
+      "unit": "response",
+      "expected_attempts": 60,
+      "expected_scored": 60
+    },
+    {
+      "run_ids": ["eval-2026-02-05-e87f452d"],
+      "label": "Domain generalizability (Kimi)",
+      "section": "6.5",
+      "primary_judge_pattern": "claude-opus%",
+      "unit": "response",
+      "expected_attempts": 60,
+      "expected_scored": 60
+    },
+    {
+      "run_ids": ["eval-2026-02-05-daf60f79"],
+      "label": "Dynamic rewrite evolution (run 1)",
+      "section": "6.18",
+      "primary_judge_pattern": "claude-opus%",
+      "unit": "response",
+      "expected_attempts": 29,
+      "expected_scored": 27
+    },
+    {
+      "run_ids": ["eval-2026-02-05-49bb2017"],
+      "label": "Dynamic rewrite evolution (run 2)",
+      "section": "6.18",
+      "primary_judge_pattern": "claude-opus%",
+      "unit": "response",
+      "expected_attempts": 30,
+      "expected_scored": 27
+    },
+    {
+      "run_ids": ["eval-2026-02-05-12aebedb"],
+      "label": "Dynamic rewrite evolution (run 3)",
+      "section": "6.18",
+      "primary_judge_pattern": "claude-opus%",
+      "unit": "response",
+      "expected_attempts": 30,
+      "expected_scored": 29
+    },
+    {
+      "run_ids": ["eval-2026-02-06-81f2d5a1"],
+      "label": "Memory isolation (run 1)",
+      "section": "6.2",
+      "primary_judge_pattern": "claude-opus%",
+      "unit": "response",
+      "expected_attempts": 60,
+      "expected_scored": 60
+    },
+    {
+      "run_ids": ["eval-2026-02-06-ac9ea8f5"],
+      "label": "Memory isolation (run 2)",
+      "section": "6.2",
+      "primary_judge_pattern": "claude-opus%",
+      "unit": "response",
+      "expected_attempts": 62,
+      "expected_scored": 62
+    },
+    {
+      "run_ids": ["eval-2026-02-06-a9ae06ee"],
+      "label": "Active control (post-hoc)",
+      "section": "6.2",
+      "primary_judge_pattern": "claude-opus%",
+      "unit": "response",
+      "expected_attempts": 119,
+      "expected_scored": 118
+    },
+    {
+      "run_ids": ["eval-2026-02-07-b6d75e87"],
+      "label": "Bilateral transformation (multi-turn)",
+      "section": "6.15",
+      "primary_judge_pattern": "claude-opus%",
+      "unit": "dialogue",
+      "expected_attempts": 120,
+      "expected_scored": 118
+    },
+    {
+      "run_ids": ["eval-2026-02-07-722087ac"],
+      "label": "A×B probe: Nemotron",
+      "section": "6.4",
+      "primary_judge_pattern": "claude-opus%",
+      "unit": "response",
+      "expected_attempts": 120,
+      "expected_scored": 119
+    },
+    {
+      "run_ids": ["eval-2026-02-07-70ef73a3"],
+      "label": "A×B probe: DeepSeek V3.2",
+      "section": "6.4",
+      "primary_judge_pattern": "claude-opus%",
+      "unit": "response",
+      "expected_attempts": 120,
+      "expected_scored": 120
+    },
+    {
+      "run_ids": ["eval-2026-02-07-6b3e6565"],
+      "label": "A×B probe: GLM-4.7",
+      "section": "6.4",
+      "primary_judge_pattern": "claude-opus%",
+      "unit": "response",
+      "expected_attempts": 120,
+      "expected_scored": 117
+    },
+    {
+      "run_ids": ["eval-2026-02-07-6ead24c7"],
+      "label": "A×B probe: Claude Haiku 4.5",
+      "section": "6.4",
+      "primary_judge_pattern": "claude-opus%",
+      "unit": "response",
+      "expected_attempts": 120,
+      "expected_scored": 120
+    },
+    {
+      "run_ids": ["eval-2026-02-08-f896275d"],
+      "label": "Dialectical impasse test",
+      "section": "6.20",
+      "primary_judge_pattern": "claude-opus%",
+      "unit": "dialogue",
+      "expected_attempts": 24,
+      "expected_scored": 24
+    },
+    {
+      "run_ids": ["eval-2026-02-08-65a6718f"],
+      "label": "Hardwired rules ablation (Kimi)",
+      "section": "6.7",
+      "primary_judge_pattern": "claude-opus%",
+      "unit": "response",
+      "expected_attempts": 72,
+      "expected_scored": 72
+    },
+    {
+      "run_ids": ["eval-2026-02-07-b6d75e87"],
+      "label": "Learner-side evaluation (symmetric)",
+      "section": "6.16",
+      "primary_judge_pattern": "claude-opus%",
+      "unit": "learner turn",
+      "expected_attempts": 118,
+      "expected_scored": 118,
+      "note": "Same run as bilateral transformation; scored with learner rubric"
+    },
+    {
+      "run_ids": ["eval-2026-02-11-35c53e99", "eval-2026-02-11-5f6d51f5"],
+      "label": "Dialectical modulation, standard (cells 22-27)",
+      "section": "6.8",
+      "primary_judge_pattern": "claude-opus%",
+      "unit": "response",
+      "expected_attempts": 84,
+      "expected_scored": 84
+    },
+    {
+      "run_ids": ["eval-2026-02-11-a54235ea"],
+      "label": "Dialectical modulation, multi-turn (cells 28-33)",
+      "section": "6.8",
+      "primary_judge_pattern": "claude-opus%",
+      "unit": "dialogue",
+      "expected_attempts": 90,
+      "expected_scored": 90
+    },
+    {
+      "run_ids": ["eval-2026-02-13-8d40e086"],
+      "label": "Self-reflective evolution (cells 40-45, Nemotron)",
+      "section": "6.9",
+      "primary_judge_pattern": "claude-opus%",
+      "unit": "dialogue",
+      "expected_attempts": 90,
+      "expected_scored": 90
+    },
+    {
+      "run_ids": ["eval-2026-02-14-559d854b"],
+      "label": "Self-reflect Nemotron non-replication (cells 40-45)",
+      "section": "6.9",
+      "primary_judge_pattern": "claude-opus%",
+      "unit": "dialogue",
+      "expected_attempts": 60,
+      "expected_scored": 60,
+      "profile_filter": "cell_4%_dialectical_%_superego",
+      "note": "Run contains cells 40-59 (N=167) but paper uses only cells 40-45 (N=60); cells 46-59 superseded by 49b33fdd"
+    },
+    {
+      "run_ids": ["eval-2026-02-14-e0e3a622"],
+      "label": "Mechanism robustness, scripted (cells 40-59)",
+      "section": "6.10",
+      "primary_judge_pattern": "claude-opus%",
+      "unit": "dialogue",
+      "expected_attempts": 360,
+      "expected_scored": 360
+    },
+    {
+      "run_ids": ["eval-2026-02-14-6c033830"],
+      "label": "Dynamic learner mechanisms (cells 60-63)",
+      "section": "6.10",
+      "primary_judge_pattern": "claude-opus%",
+      "unit": "dialogue",
+      "expected_attempts": 120,
+      "expected_scored": 120
+    },
+    {
+      "run_ids": ["eval-2026-02-14-a2b2717c"],
+      "label": "Dynamic learner mechanisms (cells 64-65)",
+      "section": "6.10",
+      "primary_judge_pattern": "claude-opus%",
+      "unit": "dialogue",
+      "expected_attempts": 120,
+      "expected_scored": 120
+    },
+    {
+      "run_ids": ["eval-2026-02-14-49b33fdd"],
+      "label": "Mechanism robustness, Nemotron (cells 40-59)",
+      "section": "6.10",
+      "primary_judge_pattern": "claude-opus%",
+      "unit": "dialogue",
+      "expected_attempts": 360,
+      "expected_scored": 360
+    },
+    {
+      "run_ids": ["eval-2026-02-17-25aaae85"],
+      "label": "Cognitive prosthesis (cells 66-68, Nemotron)",
+      "section": "6.10",
+      "primary_judge_pattern": "claude-opus%",
+      "unit": "dialogue",
+      "expected_attempts": 90,
+      "expected_scored": 90
+    },
+    {
+      "run_ids": ["eval-2026-02-18-f489c0ea"],
+      "label": "Cognitive prosthesis smoke test (Haiku)",
+      "section": "6.10",
+      "primary_judge_pattern": "claude-opus%",
+      "unit": "dialogue",
+      "expected_attempts": 6,
+      "expected_scored": 6
+    },
+    {
+      "run_ids": ["eval-2026-02-15-664073ab"],
+      "label": "Dynamic learner base mechanisms (cells 69-70)",
+      "section": "6.10",
+      "primary_judge_pattern": "claude-opus%",
+      "unit": "dialogue",
+      "expected_attempts": 60,
+      "expected_scored": 60
+    },
+    {
+      "run_ids": ["eval-2026-02-17-deee5fd6"],
+      "label": "Prompt elaboration baseline, Haiku (cells 1, 71)",
+      "section": "6.21",
+      "primary_judge_pattern": "claude-opus%",
+      "unit": "single-turn",
+      "expected_attempts": 72,
+      "expected_scored": 72
+    },
+    {
+      "run_ids": ["eval-2026-02-17-27d7b4e3"],
+      "label": "Prompt elaboration baseline, Kimi (cells 1, 71)",
+      "section": "6.21",
+      "primary_judge_pattern": "claude-opus%",
+      "unit": "single-turn",
+      "expected_attempts": 72,
+      "expected_scored": 72
+    },
+    {
+      "run_ids": ["eval-2026-02-17-0eb3de77"],
+      "label": "Token budget 256, Haiku (run 1)",
+      "section": "6.22",
+      "primary_judge_pattern": "claude-opus%",
+      "unit": "mixed",
+      "expected_attempts": 36,
+      "expected_scored": 36
+    },
+    {
+      "run_ids": ["eval-2026-02-17-5a640782"],
+      "label": "Token budget 256, Haiku (run 2)",
+      "section": "6.22",
+      "primary_judge_pattern": "claude-opus%",
+      "unit": "mixed",
+      "expected_attempts": 36,
+      "expected_scored": 36
+    },
+    {
+      "run_ids": ["eval-2026-02-17-5f281654"],
+      "label": "Token budget 512, Haiku",
+      "section": "6.22",
+      "primary_judge_pattern": "claude-opus%",
+      "unit": "mixed",
+      "expected_attempts": 36,
+      "expected_scored": 36
+    },
+    {
+      "run_ids": ["eval-2026-02-17-0f6dcd97"],
+      "label": "Token budget 2048, Haiku",
+      "section": "6.22",
+      "primary_judge_pattern": "claude-opus%",
+      "unit": "mixed",
+      "expected_attempts": 36,
+      "expected_scored": 36
+    },
+    {
+      "run_ids": ["eval-2026-02-17-d32ed226"],
+      "label": "Token budget default, Haiku",
+      "section": "6.22",
+      "primary_judge_pattern": "claude-opus%",
+      "unit": "mixed",
+      "expected_attempts": 18,
+      "expected_scored": 18
+    }
+  ],
+  "totals": {
+    "evaluations": 37,
+    "expected_attempts": 3398,
+    "expected_scored": 3383,
+    "opus_primary_count": 37,
+    "sonnet_primary_count": 0,
+    "sonnet_primary_runs": []
+  },
+  "figures": {
+    "figure1": {
+      "title": "2×2×2 Factorial Cell Means",
+      "section": "6.3",
+      "runs": ["eval-2026-02-03-f5d4dd93", "eval-2026-02-06-a933d745"],
+      "cells": [1, 2, 3, 4, 5, 6, 7, 8],
+      "judge_filter": "claude-opus%",
+      "notes": "Combine f5d4dd93 (cells 1-5,7) and a933d745 (cells 6,8)"
+    },
+    "figure2": {
+      "title": "Memory Isolation 2×2",
+      "section": "6.2",
+      "runs": ["eval-2026-02-06-81f2d5a1", "eval-2026-02-06-ac9ea8f5"],
+      "profiles": {
+        "base": "cell_1_base_single_unified",
+        "memory_only": "cell_19_memory_single_unified",
+        "recognition_only": "cell_20_recog_nomem_single_unified",
+        "recognition_plus_memory": "cell_5_recog_single_unified"
+      },
+      "judge_filter": "claude-opus%"
+    },
+    "figure3": {
+      "title": "Active Control Comparison",
+      "section": "6.2",
+      "runs": {
+        "active_control": "eval-2026-02-06-a9ae06ee",
+        "factorial_base": ["eval-2026-02-03-f5d4dd93", "eval-2026-02-06-a933d745"]
+      },
+      "judge_filter": "claude-opus%",
+      "notes": "Cross-run comparison: active control (Nemotron) vs factorial (Kimi). Model confound acknowledged."
+    },
+    "figure4": {
+      "title": "Multi-Model A×B Probe",
+      "section": "6.4",
+      "runs": {
+        "kimi": {
+          "run_ids": ["eval-2026-02-03-f5d4dd93"],
+          "cells": [1, 3, 5, 7],
+          "label": "Kimi K2.5",
+          "expected_n": 179
+        },
+        "nemotron": {
+          "run_ids": ["eval-2026-02-07-722087ac"],
+          "cells": [1, 3, 5, 7],
+          "label": "Nemotron",
+          "expected_n": 119
+        },
+        "deepseek": {
+          "run_ids": ["eval-2026-02-07-70ef73a3"],
+          "cells": [1, 3, 5, 7],
+          "label": "DeepSeek V3.2",
+          "expected_n": 120
+        },
+        "glm": {
+          "run_ids": ["eval-2026-02-07-6b3e6565"],
+          "cells": [1, 3, 5, 7],
+          "label": "GLM-4.7",
+          "expected_n": 117
+        },
+        "haiku": {
+          "run_ids": ["eval-2026-02-07-6ead24c7"],
+          "cells": [1, 3, 5, 7],
+          "label": "Claude Haiku 4.5",
+          "expected_n": 120
+        }
+      },
+      "judge_filter": "claude-opus%"
+    },
+    "figure5": {
+      "title": "Domain Generalizability",
+      "section": "6.5",
+      "runs": {
+        "elementary": "eval-2026-02-05-e87f452d",
+        "philosophy": "eval-2026-02-03-f5d4dd93"
+      },
+      "cells": [1, 3, 5, 7],
+      "judge_filter": "claude-opus%",
+      "notes": "Elementary from domain gen run; philosophy from factorial single-learner cells"
+    },
+    "figure7": {
+      "title": "Superego Persona × Recognition",
+      "section": "6.8",
+      "runs": ["eval-2026-02-11-a54235ea"],
+      "cells": [28, 29, 30, 31, 32, 33],
+      "judge_filter": "claude-opus%"
+    },
+    "figure8": {
+      "title": "Mechanism Differentiation — Scripted vs Dynamic",
+      "section": "6.10",
+      "runs": {
+        "scripted": "eval-2026-02-14-e0e3a622",
+        "dynamic_60_63": "eval-2026-02-14-6c033830",
+        "dynamic_64_65": "eval-2026-02-14-a2b2717c",
+        "cognitive": "eval-2026-02-14-50487df7",
+        "base_69_70": "eval-2026-02-15-664073ab"
+      },
+      "judge_filter": "claude-opus%",
+      "notes": "50487df7 uses sonnet judge; others use opus"
+    },
+    "figure9": {
+      "title": "Qualitative Tag Divergence",
+      "section": "6.11",
+      "runs": ["eval-2026-02-07-b6d75e87"],
+      "source": "qualitative_assessment column, not overall_score",
+      "data_driven": false,
+      "notes": "Tag data from qualitative assessment, not numerical scores. Kept as hardcoded values with comment."
+    }
+  },
+  "tables": {
+    "table2": {
+      "title": "Evaluation Sample Summary",
+      "source": "key_evaluations array above",
+      "notes": "Generated from manifest + DB query. Paper Table 2 'Attempts' column may exceed manifest expected_attempts because it includes unjudged failures (rows with empty judge_model) that are excluded from manifest counts."
+    },
+    "appendix_d": {
+      "title": "Reproducibility and Key Evaluation Run IDs",
+      "source": "key_evaluations array above",
+      "notes": "Generated from manifest"
+    }
+  },
+  "prose_n_references": [
+    {"location": "abstract", "pattern": "N=3,383 primary scored"},
+    {"location": "introduction (line ~44)", "pattern": "N=3,383 primary scored responses"},
+    {"location": "methods (Table 2 totals)", "pattern": "**3,383**"},
+    {"location": "methods (line ~572)", "pattern": "N=3,383 scored"},
+    {"location": "methods (line ~574)", "pattern": "N=3,383"},
+    {"location": "discussion (line ~1817)", "pattern": "N=3,383"},
+    {"location": "conclusion (line ~1899)", "pattern": "N=3,383 primary scored"}
+  ]
+}