@machinespirits/eval 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. package/README.md +91 -9
  2. package/config/eval-settings.yaml +3 -3
  3. package/config/paper-manifest.json +486 -0
  4. package/config/providers.yaml +9 -6
  5. package/config/tutor-agents.yaml +2261 -0
  6. package/content/README.md +23 -0
  7. package/content/courses/479/course.md +53 -0
  8. package/content/courses/479/lecture-1.md +361 -0
  9. package/content/courses/479/lecture-2.md +360 -0
  10. package/content/courses/479/lecture-3.md +655 -0
  11. package/content/courses/479/lecture-4.md +530 -0
  12. package/content/courses/479/lecture-5.md +326 -0
  13. package/content/courses/479/lecture-6.md +346 -0
  14. package/content/courses/479/lecture-7.md +326 -0
  15. package/content/courses/479/lecture-8.md +273 -0
  16. package/content/courses/479/roadmap-slides.md +656 -0
  17. package/content/manifest.yaml +8 -0
  18. package/docs/research/build.sh +44 -20
  19. package/docs/research/figures/figure10.png +0 -0
  20. package/docs/research/figures/figure11.png +0 -0
  21. package/docs/research/figures/figure3.png +0 -0
  22. package/docs/research/figures/figure4.png +0 -0
  23. package/docs/research/figures/figure5.png +0 -0
  24. package/docs/research/figures/figure6.png +0 -0
  25. package/docs/research/figures/figure7.png +0 -0
  26. package/docs/research/figures/figure8.png +0 -0
  27. package/docs/research/figures/figure9.png +0 -0
  28. package/docs/research/header.tex +23 -2
  29. package/docs/research/paper-full.md +941 -285
  30. package/docs/research/paper-short.md +216 -585
  31. package/docs/research/references.bib +132 -0
  32. package/docs/research/slides-header.tex +188 -0
  33. package/docs/research/slides-pptx.md +363 -0
  34. package/docs/research/slides.md +531 -0
  35. package/docs/research/style-reference-pptx.py +199 -0
  36. package/package.json +6 -5
  37. package/scripts/analyze-eval-results.js +69 -17
  38. package/scripts/analyze-mechanism-traces.js +763 -0
  39. package/scripts/analyze-modulation-learning.js +498 -0
  40. package/scripts/analyze-prosthesis.js +144 -0
  41. package/scripts/analyze-run.js +264 -79
  42. package/scripts/assess-transcripts.js +853 -0
  43. package/scripts/browse-transcripts.js +854 -0
  44. package/scripts/check-parse-failures.js +73 -0
  45. package/scripts/code-dialectical-modulation.js +1320 -0
  46. package/scripts/download-data.sh +55 -0
  47. package/scripts/eval-cli.js +106 -18
  48. package/scripts/generate-paper-figures.js +663 -0
  49. package/scripts/generate-paper-figures.py +577 -76
  50. package/scripts/generate-paper-tables.js +299 -0
  51. package/scripts/qualitative-analysis-ai.js +3 -3
  52. package/scripts/render-sequence-diagram.js +694 -0
  53. package/scripts/test-latency.js +210 -0
  54. package/scripts/test-rate-limit.js +95 -0
  55. package/scripts/test-token-budget.js +332 -0
  56. package/scripts/validate-paper-manifest.js +670 -0
  57. package/services/__tests__/evalConfigLoader.test.js +2 -2
  58. package/services/__tests__/learnerRubricEvaluator.test.js +361 -0
  59. package/services/__tests__/learnerTutorInteractionEngine.test.js +326 -0
  60. package/services/evaluationRunner.js +975 -98
  61. package/services/evaluationStore.js +12 -4
  62. package/services/learnerTutorInteractionEngine.js +27 -2
  63. package/services/mockProvider.js +133 -0
  64. package/services/promptRewriter.js +1471 -5
  65. package/services/rubricEvaluator.js +55 -2
  66. package/services/transcriptFormatter.js +675 -0
  67. package/docs/EVALUATION-VARIABLES.md +0 -589
  68. package/docs/REPLICATION-PLAN.md +0 -577
  69. package/scripts/analyze-run.mjs +0 -282
  70. package/scripts/compare-runs.js +0 -44
  71. package/scripts/compare-suggestions.js +0 -80
  72. package/scripts/dig-into-run.js +0 -158
  73. package/scripts/show-failed-suggestions.js +0 -64
  74. /package/scripts/{check-run.mjs → check-run.js} +0 -0
package/README.md CHANGED
@@ -8,7 +8,7 @@ This is the evaluation and analysis companion to [`@machinespirits/tutor-core`](
8
8
 
9
9
  The system runs automated tutoring dialogues across configurable experimental cells, then scores them with LLM judges against a multi-dimensional rubric. It supports:
10
10
 
11
- - **Factorial evaluation** — 21 tutor agent cells varying recognition theory, architecture (single-agent vs ego+superego), and learner type
11
+ - **Factorial evaluation** — 70 tutor agent cells varying recognition theory, architecture, learner type, and mechanism design
12
12
  - **Multi-turn dialogues** — Learner agents with their own ego-superego deliberation
13
13
  - **Multi-judge validation** — Cross-judge reliability via Claude Opus, GPT-5.2, and others
14
14
  - **Placebo/active controls** — Length-matched prompts without recognition theory
@@ -17,7 +17,7 @@ The system runs automated tutoring dialogues across configurable experimental ce
17
17
  ## Prerequisites
18
18
 
19
19
  - **Node.js** >= 18.0.0
20
- - **@machinespirits/tutor-core** 0.3.1 (peer dependency)
20
+ - **@machinespirits/tutor-core** >= 0.3.1 (peer dependency)
21
21
  - At least one AI provider API key (see below)
22
22
 
23
23
  ## Installation
@@ -86,6 +86,18 @@ node scripts/eval-cli.js report <run-id>
86
86
  node scripts/eval-cli.js export <run-id> --format csv
87
87
  ```
88
88
 
89
+ ### Dry-run mode (no API keys required)
90
+
91
+ Verify the full pipeline without API calls:
92
+
93
+ ```bash
94
+ node scripts/eval-cli.js quick --dry-run
95
+ node scripts/eval-cli.js run --dry-run --runs 2
96
+ node scripts/eval-cli.js run --dry-run --runs 3 --scenario new_user_first_visit
97
+ ```
98
+
99
+ Dry-run uses deterministic mock data that mirrors real score distributions (recognition cells ~85-92, base cells ~72-82). All downstream steps (DB storage, ANOVA, reporting) work normally on the mock data.
100
+
89
101
  ### Standalone server
90
102
 
91
103
  ```bash
@@ -109,14 +121,16 @@ scripts/ CLI tools and analysis scripts
109
121
  services/ Core evaluation engine, rubric evaluator, learner simulation
110
122
  routes/ Express API routes (optional server mode)
111
123
  data/ SQLite databases (evaluation results, writing pads)
112
- content-test-elementary/ Bundled test content package
113
- docs/ Documentation and research paper
124
+ content/ Bundled course content (philosophy 479)
125
+ content-test-elementary/ Bundled test content (elementary 101)
126
+ notebooks/ Reproducibility notebook (Jupyter)
127
+ docs/research/ Research paper and build scripts
114
128
  tests/ Test suites
115
129
  ```
116
130
 
117
131
  ### Key configuration files
118
132
 
119
- - `config/tutor-agents.yaml` — All 21 experimental cells and their prompt mappings
133
+ - `config/tutor-agents.yaml` — All 70 experimental cells and their prompt mappings
120
134
  - `config/suggestion-scenarios.yaml` — Learner scenarios (single-turn and multi-turn)
121
135
  - `config/evaluation-rubric.yaml` — Scoring rubric (6 dimensions)
122
136
  - `config/providers.yaml` — AI provider and model configuration
@@ -131,11 +145,73 @@ The core factorial design crosses three factors:
131
145
  | B: Tutor architecture | Single-agent vs Ego+Superego |
132
146
  | C: Learner architecture | Single-agent vs Multi-agent |
133
147
 
134
- Additional cells test enhanced prompts (9-12), hardwired rules (13-14), placebo controls (15-18), memory isolation (19-20), and dynamic prompt rewriting (21).
148
+ Additional cells test enhanced prompts (9-12), hardwired rules (13-14), placebo controls (15-18), memory isolation (19-20), dynamic prompt rewriting (21), dialectical superego modulation (22-39), self-reflective evolution (40-45), insight-action mechanisms (46-53), other-ego profiling (54-59), and dynamic learner mechanism testing (60-70).
149
+
150
+ ## Reproducing Paper Findings
151
+
152
+ The full research paper is at `docs/research/paper-full.md`. A Jupyter notebook in `notebooks/` independently reproduces all 17 tables and key statistical findings.
153
+
154
+ The evaluation dataset (database + dialogue logs, ~19 MB) is available as a [GitHub Release artifact](https://github.com/liammagee/machinespirits-eval/releases/tag/v0.2.0). See `notebooks/README.md` for setup instructions.
155
+
156
+ To re-run evaluations from scratch (rather than reproducing from saved data), expect ~$65–90 USD in API costs and 48–72 hours of wall-clock time. See the CLI help (`node scripts/eval-cli.js --help`) for details on running cells, judging, and exporting results.
157
+
158
+ ## Scripts Reference
159
+
160
+ ### Analysis
161
+
162
+ | Script | Description |
163
+ |--------|-------------|
164
+ | `analyze-eval-results.js` | Statistical analysis (ANOVA, effect sizes, marginal means) |
165
+ | `analyze-judge-reliability.js` | Inter-judge reliability (requires rejudged data) |
166
+ | `analyze-mechanism-traces.js <runId>` | Process trace analysis for mechanism comparison runs |
167
+ | `analyze-eval-costs.js` | Cost breakdown across runs |
168
+ | `analyze-interaction-evals.js` | Interaction evaluation results |
169
+ | `analyze-modulation-learning.js` | Modulation metrics and learning outcomes |
170
+ | `advanced-eval-analysis.js` | Extended multi-turn scenario analysis |
171
+ | `compare-transformation.js` | Transformation metrics (adaptation, growth indices) |
135
172
 
136
- ## Research Paper
173
+ ### Qualitative
137
174
 
138
- The full research paper is included at `docs/research/PAPER-FULL-2026-02-04.md`. For replication instructions, see `docs/REPLICATION-PLAN.md`.
175
+ | Script | Description |
176
+ |--------|-------------|
177
+ | `assess-transcripts.js <runId>` | Qualitative transcript assessment (`--blinded`, `--force`) |
178
+ | `browse-transcripts.js` | Interactive transcript browser (terminal UI) |
179
+ | `qualitative-analysis-ai.js` | AI-based thematic analysis of transcripts |
180
+ | `code-impasse-strategies.js` | Code impasse dialogues into Hegelian resolution strategies |
181
+ | `code-dialectical-modulation.js` | Code superego modulation patterns |
182
+
183
+ ### Paper & Validation
184
+
185
+ | Script | Description |
186
+ |--------|-------------|
187
+ | `generate-paper-tables.js` | Generate tables and validate prose against DB |
188
+ | `validate-paper-manifest.js` | Validate paper claims against evaluation data |
189
+ | `render-sequence-diagram.js` | Render architecture sequence diagrams |
190
+ | `validate-content.js` | Validate tutorial content files |
191
+
192
+ ### Utilities
193
+
194
+ | Script | Description |
195
+ |--------|-------------|
196
+ | `test-rate-limit.js [model]` | Probe OpenRouter rate limits (default: nemotron) |
197
+ | `test-latency.js` | Latency test across all configured models |
198
+ | `seed-db.js` | Initialize/seed the SQLite database |
199
+
200
+ All scripts are in `scripts/` and run with `node scripts/<name>`.
201
+
202
+ ## Claude Code Skills
203
+
204
+ This project includes [Claude Code skills](https://docs.anthropic.com/en/docs/claude-code/skills) (`.claude/skills/`) that encode common evaluation workflows as slash commands. The following commands are available in any Claude Code session:
205
+
206
+ | Command | What it does |
207
+ |---------|-------------|
208
+ | `/analyze-run <runId>` | Pull scores from DB, compute means, effect sizes, flag issues |
209
+ | `/check-models [alias]` | Probe OpenRouter rate limits and availability |
210
+ | `/build-paper` | Build paper PDF, check citations and cross-references |
211
+ | `/run-eval <cells> --runs N` | Full generation + judging pipeline with pre-flight checks |
212
+ | `/query-db <question>` | Natural language query against the evaluation database |
213
+
214
+ Skills with a `description` field (`analyze-run`, `check-models`, `query-db`) can also be invoked automatically by Claude when relevant to the conversation. `/run-eval` requires explicit invocation since it consumes API credits.
139
215
 
140
216
  ## Running Tests
141
217
 
@@ -143,6 +219,12 @@ The full research paper is included at `docs/research/PAPER-FULL-2026-02-04.md`.
143
219
  npm test
144
220
  ```
145
221
 
222
+ ## Known Deferred Risks
223
+
224
+ This repository currently accepts a small set of known risks because it is run as an internal localhost-only evaluation system.
225
+
226
+ See `notes/known-risks-localhost-2026-02-13.md` for the tracked risk register, acceptance scope, and hardening triggers required before broader deployment.
227
+
146
228
  ## Citation
147
229
 
148
230
  If you use this software in your research, please cite:
@@ -150,7 +232,7 @@ If you use this software in your research, please cite:
150
232
  ```bibtex
151
233
  @misc{magee2026machinespirits,
152
234
  author = {Magee, Liam},
153
- title = {The Drama Machine in Education: Recognition Theory and Multi-Agent Tutoring},
235
+ title = {\textit{Geist} in the Machine: Recognition Theory and Multi-Agent Tutoring},
154
236
  year = {2026},
155
237
  url = {https://github.com/liammagee/machinespirits-eval}
156
238
  }
@@ -6,9 +6,9 @@
6
6
  content:
7
7
  # Path to a content package containing courses/ with lecture markdown.
8
8
  # Relative paths are resolved from the eval repo root.
9
- # Default: bundled test content. Override with EVAL_CONTENT_PATH env var
10
- # or change this path to point to a full content package.
11
- content_package_path: "./content-test-elementary"
9
+ # Default: bundled 479 course content. Override with EVAL_CONTENT_PATH env var
10
+ # to use the full content-philosophy package (e.g., "../machinespirits-content-philosophy").
11
+ content_package_path: "./content"
12
12
 
13
13
  # Maximum characters to include from a lecture file.
14
14
  # Safety valve for token budget in large lectures.
@@ -0,0 +1,486 @@
1
+ {
2
+ "version": "1.4.0",
3
+ "generated": "2026-02-17",
4
+ "database": "data/evaluations.db",
5
+ "paper": "docs/research/paper-full.md",
6
+
7
+ "key_evaluations": [
8
+ {
9
+ "run_ids": ["eval-2026-02-03-86b159cd"],
10
+ "label": "Recognition validation",
11
+ "section": "6.1",
12
+ "primary_judge_pattern": "claude-opus%",
13
+ "unit": "response",
14
+ "expected_attempts": 36,
15
+ "expected_scored": 36
16
+ },
17
+ {
18
+ "run_ids": ["eval-2026-02-03-f5d4dd93"],
19
+ "label": "Full factorial, cells 1-5,7 (Kimi)",
20
+ "section": "6.3",
21
+ "primary_judge_pattern": "claude-opus%",
22
+ "unit": "response",
23
+ "expected_attempts": 262,
24
+ "expected_scored": 262
25
+ },
26
+ {
27
+ "run_ids": ["eval-2026-02-06-a933d745"],
28
+ "label": "Full factorial, cells 6,8 re-run (Kimi)",
29
+ "section": "6.3",
30
+ "primary_judge_pattern": "claude-opus%",
31
+ "unit": "response",
32
+ "expected_attempts": 90,
33
+ "expected_scored": 88
34
+ },
35
+ {
36
+ "run_ids": ["eval-2026-02-05-10b344fb"],
37
+ "label": "A×B replication (Kimi)",
38
+ "section": "6.4",
39
+ "primary_judge_pattern": "claude-opus%",
40
+ "unit": "response",
41
+ "expected_attempts": 60,
42
+ "expected_scored": 60
43
+ },
44
+ {
45
+ "run_ids": ["eval-2026-02-05-e87f452d"],
46
+ "label": "Domain generalizability (Kimi)",
47
+ "section": "6.5",
48
+ "primary_judge_pattern": "claude-opus%",
49
+ "unit": "response",
50
+ "expected_attempts": 60,
51
+ "expected_scored": 60
52
+ },
53
+ {
54
+ "run_ids": ["eval-2026-02-05-daf60f79"],
55
+ "label": "Dynamic rewrite evolution (run 1)",
56
+ "section": "6.18",
57
+ "primary_judge_pattern": "claude-opus%",
58
+ "unit": "response",
59
+ "expected_attempts": 29,
60
+ "expected_scored": 27
61
+ },
62
+ {
63
+ "run_ids": ["eval-2026-02-05-49bb2017"],
64
+ "label": "Dynamic rewrite evolution (run 2)",
65
+ "section": "6.18",
66
+ "primary_judge_pattern": "claude-opus%",
67
+ "unit": "response",
68
+ "expected_attempts": 30,
69
+ "expected_scored": 27
70
+ },
71
+ {
72
+ "run_ids": ["eval-2026-02-05-12aebedb"],
73
+ "label": "Dynamic rewrite evolution (run 3)",
74
+ "section": "6.18",
75
+ "primary_judge_pattern": "claude-opus%",
76
+ "unit": "response",
77
+ "expected_attempts": 30,
78
+ "expected_scored": 29
79
+ },
80
+ {
81
+ "run_ids": ["eval-2026-02-06-81f2d5a1"],
82
+ "label": "Memory isolation (run 1)",
83
+ "section": "6.2",
84
+ "primary_judge_pattern": "claude-opus%",
85
+ "unit": "response",
86
+ "expected_attempts": 60,
87
+ "expected_scored": 60
88
+ },
89
+ {
90
+ "run_ids": ["eval-2026-02-06-ac9ea8f5"],
91
+ "label": "Memory isolation (run 2)",
92
+ "section": "6.2",
93
+ "primary_judge_pattern": "claude-opus%",
94
+ "unit": "response",
95
+ "expected_attempts": 62,
96
+ "expected_scored": 62
97
+ },
98
+ {
99
+ "run_ids": ["eval-2026-02-06-a9ae06ee"],
100
+ "label": "Active control (post-hoc)",
101
+ "section": "6.2",
102
+ "primary_judge_pattern": "claude-opus%",
103
+ "unit": "response",
104
+ "expected_attempts": 119,
105
+ "expected_scored": 118
106
+ },
107
+ {
108
+ "run_ids": ["eval-2026-02-07-b6d75e87"],
109
+ "label": "Bilateral transformation (multi-turn)",
110
+ "section": "6.15",
111
+ "primary_judge_pattern": "claude-opus%",
112
+ "unit": "dialogue",
113
+ "expected_attempts": 120,
114
+ "expected_scored": 118
115
+ },
116
+ {
117
+ "run_ids": ["eval-2026-02-07-722087ac"],
118
+ "label": "A×B probe: Nemotron",
119
+ "section": "6.4",
120
+ "primary_judge_pattern": "claude-opus%",
121
+ "unit": "response",
122
+ "expected_attempts": 120,
123
+ "expected_scored": 119
124
+ },
125
+ {
126
+ "run_ids": ["eval-2026-02-07-70ef73a3"],
127
+ "label": "A×B probe: DeepSeek V3.2",
128
+ "section": "6.4",
129
+ "primary_judge_pattern": "claude-opus%",
130
+ "unit": "response",
131
+ "expected_attempts": 120,
132
+ "expected_scored": 120
133
+ },
134
+ {
135
+ "run_ids": ["eval-2026-02-07-6b3e6565"],
136
+ "label": "A×B probe: GLM-4.7",
137
+ "section": "6.4",
138
+ "primary_judge_pattern": "claude-opus%",
139
+ "unit": "response",
140
+ "expected_attempts": 120,
141
+ "expected_scored": 117
142
+ },
143
+ {
144
+ "run_ids": ["eval-2026-02-07-6ead24c7"],
145
+ "label": "A×B probe: Claude Haiku 4.5",
146
+ "section": "6.4",
147
+ "primary_judge_pattern": "claude-opus%",
148
+ "unit": "response",
149
+ "expected_attempts": 120,
150
+ "expected_scored": 120
151
+ },
152
+ {
153
+ "run_ids": ["eval-2026-02-08-f896275d"],
154
+ "label": "Dialectical impasse test",
155
+ "section": "6.20",
156
+ "primary_judge_pattern": "claude-opus%",
157
+ "unit": "dialogue",
158
+ "expected_attempts": 24,
159
+ "expected_scored": 24
160
+ },
161
+ {
162
+ "run_ids": ["eval-2026-02-08-65a6718f"],
163
+ "label": "Hardwired rules ablation (Kimi)",
164
+ "section": "6.7",
165
+ "primary_judge_pattern": "claude-opus%",
166
+ "unit": "response",
167
+ "expected_attempts": 72,
168
+ "expected_scored": 72
169
+ },
170
+ {
171
+ "run_ids": ["eval-2026-02-07-b6d75e87"],
172
+ "label": "Learner-side evaluation (symmetric)",
173
+ "section": "6.16",
174
+ "primary_judge_pattern": "claude-opus%",
175
+ "unit": "learner turn",
176
+ "expected_attempts": 118,
177
+ "expected_scored": 118,
178
+ "note": "Same run as bilateral transformation; scored with learner rubric"
179
+ },
180
+ {
181
+ "run_ids": ["eval-2026-02-11-35c53e99", "eval-2026-02-11-5f6d51f5"],
182
+ "label": "Dialectical modulation, standard (cells 22-27)",
183
+ "section": "6.8",
184
+ "primary_judge_pattern": "claude-opus%",
185
+ "unit": "response",
186
+ "expected_attempts": 84,
187
+ "expected_scored": 84
188
+ },
189
+ {
190
+ "run_ids": ["eval-2026-02-11-a54235ea"],
191
+ "label": "Dialectical modulation, multi-turn (cells 28-33)",
192
+ "section": "6.8",
193
+ "primary_judge_pattern": "claude-opus%",
194
+ "unit": "dialogue",
195
+ "expected_attempts": 90,
196
+ "expected_scored": 90
197
+ },
198
+ {
199
+ "run_ids": ["eval-2026-02-13-8d40e086"],
200
+ "label": "Self-reflective evolution (cells 40-45, Nemotron)",
201
+ "section": "6.9",
202
+ "primary_judge_pattern": "claude-opus%",
203
+ "unit": "dialogue",
204
+ "expected_attempts": 90,
205
+ "expected_scored": 90
206
+ },
207
+ {
208
+ "run_ids": ["eval-2026-02-14-559d854b"],
209
+ "label": "Self-reflect Nemotron non-replication (cells 40-45)",
210
+ "section": "6.9",
211
+ "primary_judge_pattern": "claude-opus%",
212
+ "unit": "dialogue",
213
+ "expected_attempts": 60,
214
+ "expected_scored": 60,
215
+ "profile_filter": "cell_4%_dialectical_%_superego",
216
+ "note": "Run contains cells 40-59 (N=167) but paper uses only cells 40-45 (N=60); cells 46-59 superseded by 49b33fdd"
217
+ },
218
+ {
219
+ "run_ids": ["eval-2026-02-14-e0e3a622"],
220
+ "label": "Mechanism robustness, scripted (cells 40-59)",
221
+ "section": "6.10",
222
+ "primary_judge_pattern": "claude-opus%",
223
+ "unit": "dialogue",
224
+ "expected_attempts": 360,
225
+ "expected_scored": 360
226
+ },
227
+ {
228
+ "run_ids": ["eval-2026-02-14-6c033830"],
229
+ "label": "Dynamic learner mechanisms (cells 60-63)",
230
+ "section": "6.10",
231
+ "primary_judge_pattern": "claude-opus%",
232
+ "unit": "dialogue",
233
+ "expected_attempts": 120,
234
+ "expected_scored": 120
235
+ },
236
+ {
237
+ "run_ids": ["eval-2026-02-14-a2b2717c"],
238
+ "label": "Dynamic learner mechanisms (cells 64-65)",
239
+ "section": "6.10",
240
+ "primary_judge_pattern": "claude-opus%",
241
+ "unit": "dialogue",
242
+ "expected_attempts": 120,
243
+ "expected_scored": 120
244
+ },
245
+ {
246
+ "run_ids": ["eval-2026-02-14-49b33fdd"],
247
+ "label": "Mechanism robustness, Nemotron (cells 40-59)",
248
+ "section": "6.10",
249
+ "primary_judge_pattern": "claude-opus%",
250
+ "unit": "dialogue",
251
+ "expected_attempts": 360,
252
+ "expected_scored": 360
253
+ },
254
+ {
255
+ "run_ids": ["eval-2026-02-17-25aaae85"],
256
+ "label": "Cognitive prosthesis (cells 66-68, Nemotron)",
257
+ "section": "6.10",
258
+ "primary_judge_pattern": "claude-opus%",
259
+ "unit": "dialogue",
260
+ "expected_attempts": 90,
261
+ "expected_scored": 90
262
+ },
263
+ {
264
+ "run_ids": ["eval-2026-02-18-f489c0ea"],
265
+ "label": "Cognitive prosthesis smoke test (Haiku)",
266
+ "section": "6.10",
267
+ "primary_judge_pattern": "claude-opus%",
268
+ "unit": "dialogue",
269
+ "expected_attempts": 6,
270
+ "expected_scored": 6
271
+ },
272
+ {
273
+ "run_ids": ["eval-2026-02-15-664073ab"],
274
+ "label": "Dynamic learner base mechanisms (cells 69-70)",
275
+ "section": "6.10",
276
+ "primary_judge_pattern": "claude-opus%",
277
+ "unit": "dialogue",
278
+ "expected_attempts": 60,
279
+ "expected_scored": 60
280
+ },
281
+ {
282
+ "run_ids": ["eval-2026-02-17-deee5fd6"],
283
+ "label": "Prompt elaboration baseline, Haiku (cells 1, 71)",
284
+ "section": "6.21",
285
+ "primary_judge_pattern": "claude-opus%",
286
+ "unit": "single-turn",
287
+ "expected_attempts": 72,
288
+ "expected_scored": 72
289
+ },
290
+ {
291
+ "run_ids": ["eval-2026-02-17-27d7b4e3"],
292
+ "label": "Prompt elaboration baseline, Kimi (cells 1, 71)",
293
+ "section": "6.21",
294
+ "primary_judge_pattern": "claude-opus%",
295
+ "unit": "single-turn",
296
+ "expected_attempts": 72,
297
+ "expected_scored": 72
298
+ },
299
+ {
300
+ "run_ids": ["eval-2026-02-17-0eb3de77"],
301
+ "label": "Token budget 256, Haiku (run 1)",
302
+ "section": "6.22",
303
+ "primary_judge_pattern": "claude-opus%",
304
+ "unit": "mixed",
305
+ "expected_attempts": 36,
306
+ "expected_scored": 36
307
+ },
308
+ {
309
+ "run_ids": ["eval-2026-02-17-5a640782"],
310
+ "label": "Token budget 256, Haiku (run 2)",
311
+ "section": "6.22",
312
+ "primary_judge_pattern": "claude-opus%",
313
+ "unit": "mixed",
314
+ "expected_attempts": 36,
315
+ "expected_scored": 36
316
+ },
317
+ {
318
+ "run_ids": ["eval-2026-02-17-5f281654"],
319
+ "label": "Token budget 512, Haiku",
320
+ "section": "6.22",
321
+ "primary_judge_pattern": "claude-opus%",
322
+ "unit": "mixed",
323
+ "expected_attempts": 36,
324
+ "expected_scored": 36
325
+ },
326
+ {
327
+ "run_ids": ["eval-2026-02-17-0f6dcd97"],
328
+ "label": "Token budget 2048, Haiku",
329
+ "section": "6.22",
330
+ "primary_judge_pattern": "claude-opus%",
331
+ "unit": "mixed",
332
+ "expected_attempts": 36,
333
+ "expected_scored": 36
334
+ },
335
+ {
336
+ "run_ids": ["eval-2026-02-17-d32ed226"],
337
+ "label": "Token budget default, Haiku",
338
+ "section": "6.22",
339
+ "primary_judge_pattern": "claude-opus%",
340
+ "unit": "mixed",
341
+ "expected_attempts": 18,
342
+ "expected_scored": 18
343
+ }
344
+ ],
345
+
346
+ "totals": {
347
+ "evaluations": 37,
348
+ "expected_attempts": 3398,
349
+ "expected_scored": 3383,
350
+ "opus_primary_count": 37,
351
+ "sonnet_primary_count": 0,
352
+ "sonnet_primary_runs": []
353
+ },
354
+
355
+ "figures": {
356
+ "figure1": {
357
+ "title": "2×2×2 Factorial Cell Means",
358
+ "section": "6.3",
359
+ "runs": ["eval-2026-02-03-f5d4dd93", "eval-2026-02-06-a933d745"],
360
+ "cells": [1, 2, 3, 4, 5, 6, 7, 8],
361
+ "judge_filter": "claude-opus%",
362
+ "notes": "Combine f5d4dd93 (cells 1-5,7) and a933d745 (cells 6,8)"
363
+ },
364
+ "figure2": {
365
+ "title": "Memory Isolation 2×2",
366
+ "section": "6.2",
367
+ "runs": ["eval-2026-02-06-81f2d5a1", "eval-2026-02-06-ac9ea8f5"],
368
+ "profiles": {
369
+ "base": "cell_1_base_single_unified",
370
+ "memory_only": "cell_19_memory_single_unified",
371
+ "recognition_only": "cell_20_recog_nomem_single_unified",
372
+ "recognition_plus_memory": "cell_5_recog_single_unified"
373
+ },
374
+ "judge_filter": "claude-opus%"
375
+ },
376
+ "figure3": {
377
+ "title": "Active Control Comparison",
378
+ "section": "6.2",
379
+ "runs": {
380
+ "active_control": "eval-2026-02-06-a9ae06ee",
381
+ "factorial_base": ["eval-2026-02-03-f5d4dd93", "eval-2026-02-06-a933d745"]
382
+ },
383
+ "judge_filter": "claude-opus%",
384
+ "notes": "Cross-run comparison: active control (Nemotron) vs factorial (Kimi). Model confound acknowledged."
385
+ },
386
+ "figure4": {
387
+ "title": "Multi-Model A×B Probe",
388
+ "section": "6.4",
389
+ "runs": {
390
+ "kimi": {
391
+ "run_ids": ["eval-2026-02-03-f5d4dd93"],
392
+ "cells": [1, 3, 5, 7],
393
+ "label": "Kimi K2.5",
394
+ "expected_n": 179
395
+ },
396
+ "nemotron": {
397
+ "run_ids": ["eval-2026-02-07-722087ac"],
398
+ "cells": [1, 3, 5, 7],
399
+ "label": "Nemotron",
400
+ "expected_n": 119
401
+ },
402
+ "deepseek": {
403
+ "run_ids": ["eval-2026-02-07-70ef73a3"],
404
+ "cells": [1, 3, 5, 7],
405
+ "label": "DeepSeek V3.2",
406
+ "expected_n": 120
407
+ },
408
+ "glm": {
409
+ "run_ids": ["eval-2026-02-07-6b3e6565"],
410
+ "cells": [1, 3, 5, 7],
411
+ "label": "GLM-4.7",
412
+ "expected_n": 117
413
+ },
414
+ "haiku": {
415
+ "run_ids": ["eval-2026-02-07-6ead24c7"],
416
+ "cells": [1, 3, 5, 7],
417
+ "label": "Claude Haiku 4.5",
418
+ "expected_n": 120
419
+ }
420
+ },
421
+ "judge_filter": "claude-opus%"
422
+ },
423
+ "figure5": {
424
+ "title": "Domain Generalizability",
425
+ "section": "6.5",
426
+ "runs": {
427
+ "elementary": "eval-2026-02-05-e87f452d",
428
+ "philosophy": "eval-2026-02-03-f5d4dd93"
429
+ },
430
+ "cells": [1, 3, 5, 7],
431
+ "judge_filter": "claude-opus%",
432
+ "notes": "Elementary from domain gen run; philosophy from factorial single-learner cells"
433
+ },
434
+ "figure7": {
435
+ "title": "Superego Persona × Recognition",
436
+ "section": "6.8",
437
+ "runs": ["eval-2026-02-11-a54235ea"],
438
+ "cells": [28, 29, 30, 31, 32, 33],
439
+ "judge_filter": "claude-opus%"
440
+ },
441
+ "figure8": {
442
+ "title": "Mechanism Differentiation — Scripted vs Dynamic",
443
+ "section": "6.10",
444
+ "runs": {
445
+ "scripted": "eval-2026-02-14-e0e3a622",
446
+ "dynamic_60_63": "eval-2026-02-14-6c033830",
447
+ "dynamic_64_65": "eval-2026-02-14-a2b2717c",
448
+ "cognitive": "eval-2026-02-14-50487df7",
449
+ "base_69_70": "eval-2026-02-15-664073ab"
450
+ },
451
+ "judge_filter": "claude-opus%",
452
+ "notes": "50487df7 uses sonnet judge; others use opus"
453
+ },
454
+ "figure9": {
455
+ "title": "Qualitative Tag Divergence",
456
+ "section": "6.11",
457
+ "runs": ["eval-2026-02-07-b6d75e87"],
458
+ "source": "qualitative_assessment column, not overall_score",
459
+ "data_driven": false,
460
+ "notes": "Tag data from qualitative assessment, not numerical scores. Kept as hardcoded values with comment."
461
+ }
462
+ },
463
+
464
+ "tables": {
465
+ "table2": {
466
+ "title": "Evaluation Sample Summary",
467
+ "source": "key_evaluations array above",
468
+ "notes": "Generated from manifest + DB query. Paper Table 2 'Attempts' column may exceed manifest expected_attempts because it includes unjudged failures (rows with empty judge_model) that are excluded from manifest counts."
469
+ },
470
+ "appendix_d": {
471
+ "title": "Reproducibility and Key Evaluation Run IDs",
472
+ "source": "key_evaluations array above",
473
+ "notes": "Generated from manifest"
474
+ }
475
+ },
476
+
477
+ "prose_n_references": [
478
+ {"location": "abstract", "pattern": "N=3,383 primary scored"},
479
+ {"location": "introduction (line ~44)", "pattern": "N=3,383 primary scored responses"},
480
+ {"location": "methods (Table 2 totals)", "pattern": "**3,383**"},
481
+ {"location": "methods (line ~572)", "pattern": "N=3,383 scored"},
482
+ {"location": "methods (line ~574)", "pattern": "N=3,383"},
483
+ {"location": "discussion (line ~1817)", "pattern": "N=3,383"},
484
+ {"location": "conclusion (line ~1899)", "pattern": "N=3,383 primary scored"}
485
+ ]
486
+ }