@machinespirits/eval 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +91 -9
- package/config/eval-settings.yaml +3 -3
- package/config/paper-manifest.json +486 -0
- package/config/providers.yaml +9 -6
- package/config/tutor-agents.yaml +2261 -0
- package/content/README.md +23 -0
- package/content/courses/479/course.md +53 -0
- package/content/courses/479/lecture-1.md +361 -0
- package/content/courses/479/lecture-2.md +360 -0
- package/content/courses/479/lecture-3.md +655 -0
- package/content/courses/479/lecture-4.md +530 -0
- package/content/courses/479/lecture-5.md +326 -0
- package/content/courses/479/lecture-6.md +346 -0
- package/content/courses/479/lecture-7.md +326 -0
- package/content/courses/479/lecture-8.md +273 -0
- package/content/courses/479/roadmap-slides.md +656 -0
- package/content/manifest.yaml +8 -0
- package/docs/research/build.sh +44 -20
- package/docs/research/figures/figure10.png +0 -0
- package/docs/research/figures/figure11.png +0 -0
- package/docs/research/figures/figure3.png +0 -0
- package/docs/research/figures/figure4.png +0 -0
- package/docs/research/figures/figure5.png +0 -0
- package/docs/research/figures/figure6.png +0 -0
- package/docs/research/figures/figure7.png +0 -0
- package/docs/research/figures/figure8.png +0 -0
- package/docs/research/figures/figure9.png +0 -0
- package/docs/research/header.tex +23 -2
- package/docs/research/paper-full.md +941 -285
- package/docs/research/paper-short.md +216 -585
- package/docs/research/references.bib +132 -0
- package/docs/research/slides-header.tex +188 -0
- package/docs/research/slides-pptx.md +363 -0
- package/docs/research/slides.md +531 -0
- package/docs/research/style-reference-pptx.py +199 -0
- package/package.json +6 -5
- package/scripts/analyze-eval-results.js +69 -17
- package/scripts/analyze-mechanism-traces.js +763 -0
- package/scripts/analyze-modulation-learning.js +498 -0
- package/scripts/analyze-prosthesis.js +144 -0
- package/scripts/analyze-run.js +264 -79
- package/scripts/assess-transcripts.js +853 -0
- package/scripts/browse-transcripts.js +854 -0
- package/scripts/check-parse-failures.js +73 -0
- package/scripts/code-dialectical-modulation.js +1320 -0
- package/scripts/download-data.sh +55 -0
- package/scripts/eval-cli.js +106 -18
- package/scripts/generate-paper-figures.js +663 -0
- package/scripts/generate-paper-figures.py +577 -76
- package/scripts/generate-paper-tables.js +299 -0
- package/scripts/qualitative-analysis-ai.js +3 -3
- package/scripts/render-sequence-diagram.js +694 -0
- package/scripts/test-latency.js +210 -0
- package/scripts/test-rate-limit.js +95 -0
- package/scripts/test-token-budget.js +332 -0
- package/scripts/validate-paper-manifest.js +670 -0
- package/services/__tests__/evalConfigLoader.test.js +2 -2
- package/services/__tests__/learnerRubricEvaluator.test.js +361 -0
- package/services/__tests__/learnerTutorInteractionEngine.test.js +326 -0
- package/services/evaluationRunner.js +975 -98
- package/services/evaluationStore.js +12 -4
- package/services/learnerTutorInteractionEngine.js +27 -2
- package/services/mockProvider.js +133 -0
- package/services/promptRewriter.js +1471 -5
- package/services/rubricEvaluator.js +55 -2
- package/services/transcriptFormatter.js +675 -0
- package/docs/EVALUATION-VARIABLES.md +0 -589
- package/docs/REPLICATION-PLAN.md +0 -577
- package/scripts/analyze-run.mjs +0 -282
- package/scripts/compare-runs.js +0 -44
- package/scripts/compare-suggestions.js +0 -80
- package/scripts/dig-into-run.js +0 -158
- package/scripts/show-failed-suggestions.js +0 -64
- /package/scripts/{check-run.mjs → check-run.js} +0 -0
package/README.md
CHANGED
|
@@ -8,7 +8,7 @@ This is the evaluation and analysis companion to [`@machinespirits/tutor-core`](
|
|
|
8
8
|
|
|
9
9
|
The system runs automated tutoring dialogues across configurable experimental cells, then scores them with LLM judges against a multi-dimensional rubric. It supports:
|
|
10
10
|
|
|
11
|
-
- **Factorial evaluation** —
|
|
11
|
+
- **Factorial evaluation** — 70 tutor agent cells varying recognition theory, architecture, learner type, and mechanism design
|
|
12
12
|
- **Multi-turn dialogues** — Learner agents with their own ego-superego deliberation
|
|
13
13
|
- **Multi-judge validation** — Cross-judge reliability via Claude Opus, GPT-5.2, and others
|
|
14
14
|
- **Placebo/active controls** — Length-matched prompts without recognition theory
|
|
@@ -17,7 +17,7 @@ The system runs automated tutoring dialogues across configurable experimental ce
|
|
|
17
17
|
## Prerequisites
|
|
18
18
|
|
|
19
19
|
- **Node.js** >= 18.0.0
|
|
20
|
-
- **@machinespirits/tutor-core** 0.3.1 (peer dependency)
|
|
20
|
+
- **@machinespirits/tutor-core** >= 0.3.1 (peer dependency)
|
|
21
21
|
- At least one AI provider API key (see below)
|
|
22
22
|
|
|
23
23
|
## Installation
|
|
@@ -86,6 +86,18 @@ node scripts/eval-cli.js report <run-id>
|
|
|
86
86
|
node scripts/eval-cli.js export <run-id> --format csv
|
|
87
87
|
```
|
|
88
88
|
|
|
89
|
+
### Dry-run mode (no API keys required)
|
|
90
|
+
|
|
91
|
+
Verify the full pipeline without API calls:
|
|
92
|
+
|
|
93
|
+
```bash
|
|
94
|
+
node scripts/eval-cli.js quick --dry-run
|
|
95
|
+
node scripts/eval-cli.js run --dry-run --runs 2
|
|
96
|
+
node scripts/eval-cli.js run --dry-run --runs 3 --scenario new_user_first_visit
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
Dry-run uses deterministic mock data that mirrors real score distributions (recognition cells ~85-92, base cells ~72-82). All downstream steps (DB storage, ANOVA, reporting) work normally on the mock data.
|
|
100
|
+
|
|
89
101
|
### Standalone server
|
|
90
102
|
|
|
91
103
|
```bash
|
|
@@ -109,14 +121,16 @@ scripts/ CLI tools and analysis scripts
|
|
|
109
121
|
services/ Core evaluation engine, rubric evaluator, learner simulation
|
|
110
122
|
routes/ Express API routes (optional server mode)
|
|
111
123
|
data/ SQLite databases (evaluation results, writing pads)
|
|
112
|
-
content
|
|
113
|
-
|
|
124
|
+
content/ Bundled course content (philosophy 479)
|
|
125
|
+
content-test-elementary/ Bundled test content (elementary 101)
|
|
126
|
+
notebooks/ Reproducibility notebook (Jupyter)
|
|
127
|
+
docs/research/ Research paper and build scripts
|
|
114
128
|
tests/ Test suites
|
|
115
129
|
```
|
|
116
130
|
|
|
117
131
|
### Key configuration files
|
|
118
132
|
|
|
119
|
-
- `config/tutor-agents.yaml` — All
|
|
133
|
+
- `config/tutor-agents.yaml` — All 70 experimental cells and their prompt mappings
|
|
120
134
|
- `config/suggestion-scenarios.yaml` — Learner scenarios (single-turn and multi-turn)
|
|
121
135
|
- `config/evaluation-rubric.yaml` — Scoring rubric (6 dimensions)
|
|
122
136
|
- `config/providers.yaml` — AI provider and model configuration
|
|
@@ -131,11 +145,73 @@ The core factorial design crosses three factors:
|
|
|
131
145
|
| B: Tutor architecture | Single-agent vs Ego+Superego |
|
|
132
146
|
| C: Learner architecture | Single-agent vs Multi-agent |
|
|
133
147
|
|
|
134
|
-
Additional cells test enhanced prompts (9-12), hardwired rules (13-14), placebo controls (15-18), memory isolation (19-20),
|
|
148
|
+
Additional cells test enhanced prompts (9-12), hardwired rules (13-14), placebo controls (15-18), memory isolation (19-20), dynamic prompt rewriting (21), dialectical superego modulation (22-39), self-reflective evolution (40-45), insight-action mechanisms (46-53), other-ego profiling (54-59), and dynamic learner mechanism testing (60-70).
|
|
149
|
+
|
|
150
|
+
## Reproducing Paper Findings
|
|
151
|
+
|
|
152
|
+
The full research paper is at `docs/research/paper-full.md`. A Jupyter notebook in `notebooks/` independently reproduces all 17 tables and key statistical findings.
|
|
153
|
+
|
|
154
|
+
The evaluation dataset (database + dialogue logs, ~19 MB) is available as a [GitHub Release artifact](https://github.com/liammagee/machinespirits-eval/releases/tag/v0.2.0). See `notebooks/README.md` for setup instructions.
|
|
155
|
+
|
|
156
|
+
To re-run evaluations from scratch (rather than reproducing from saved data), expect ~$65–90 USD in API costs and 48–72 hours wall-clock time. See the CLI help (`node scripts/eval-cli.js --help`) for details on running cells, judging, and exporting results.
|
|
157
|
+
|
|
158
|
+
## Scripts Reference
|
|
159
|
+
|
|
160
|
+
### Analysis
|
|
161
|
+
|
|
162
|
+
| Script | Description |
|
|
163
|
+
|--------|-------------|
|
|
164
|
+
| `analyze-eval-results.js` | Statistical analysis (ANOVA, effect sizes, marginal means) |
|
|
165
|
+
| `analyze-judge-reliability.js` | Inter-judge reliability (requires rejudged data) |
|
|
166
|
+
| `analyze-mechanism-traces.js <runId>` | Process trace analysis for mechanism comparison runs |
|
|
167
|
+
| `analyze-eval-costs.js` | Cost breakdown across runs |
|
|
168
|
+
| `analyze-interaction-evals.js` | Interaction evaluation results |
|
|
169
|
+
| `analyze-modulation-learning.js` | Modulation metrics and learning outcomes |
|
|
170
|
+
| `advanced-eval-analysis.js` | Extended multi-turn scenario analysis |
|
|
171
|
+
| `compare-transformation.js` | Transformation metrics (adaptation, growth indices) |
|
|
135
172
|
|
|
136
|
-
|
|
173
|
+
### Qualitative
|
|
137
174
|
|
|
138
|
-
|
|
175
|
+
| Script | Description |
|
|
176
|
+
|--------|-------------|
|
|
177
|
+
| `assess-transcripts.js <runId>` | Qualitative transcript assessment (`--blinded`, `--force`) |
|
|
178
|
+
| `browse-transcripts.js` | Interactive transcript browser (terminal UI) |
|
|
179
|
+
| `qualitative-analysis-ai.js` | AI-based thematic analysis of transcripts |
|
|
180
|
+
| `code-impasse-strategies.js` | Code impasse dialogues into Hegelian resolution strategies |
|
|
181
|
+
| `code-dialectical-modulation.js` | Code superego modulation patterns |
|
|
182
|
+
|
|
183
|
+
### Paper & Validation
|
|
184
|
+
|
|
185
|
+
| Script | Description |
|
|
186
|
+
|--------|-------------|
|
|
187
|
+
| `generate-paper-tables.js` | Generate tables and validate prose against DB |
|
|
188
|
+
| `validate-paper-manifest.js` | Validate paper claims against evaluation data |
|
|
189
|
+
| `render-sequence-diagram.js` | Render architecture sequence diagrams |
|
|
190
|
+
| `validate-content.js` | Validate tutorial content files |
|
|
191
|
+
|
|
192
|
+
### Utilities
|
|
193
|
+
|
|
194
|
+
| Script | Description |
|
|
195
|
+
|--------|-------------|
|
|
196
|
+
| `test-rate-limit.js [model]` | Probe OpenRouter rate limits (default: nemotron) |
|
|
197
|
+
| `test-latency.js` | Latency test across all configured models |
|
|
198
|
+
| `seed-db.js` | Initialize/seed the SQLite database |
|
|
199
|
+
|
|
200
|
+
All scripts are in `scripts/` and run with `node scripts/<name>`.
|
|
201
|
+
|
|
202
|
+
## Claude Code Skills
|
|
203
|
+
|
|
204
|
+
This project includes [Claude Code skills](https://docs.anthropic.com/en/docs/claude-code/skills) (`.claude/skills/`) that encode common evaluation workflows as slash commands. In any Claude Code session:
|
|
205
|
+
|
|
206
|
+
| Command | What it does |
|
|
207
|
+
|---------|-------------|
|
|
208
|
+
| `/analyze-run <runId>` | Pull scores from DB, compute means, effect sizes, flag issues |
|
|
209
|
+
| `/check-models [alias]` | Probe OpenRouter rate limits and availability |
|
|
210
|
+
| `/build-paper` | Build paper PDF, check citations and cross-references |
|
|
211
|
+
| `/run-eval <cells> --runs N` | Full generation + judging pipeline with pre-flight checks |
|
|
212
|
+
| `/query-db <question>` | Natural language query against the evaluation database |
|
|
213
|
+
|
|
214
|
+
Skills with a `description` field (`analyze-run`, `check-models`, `query-db`) can also be invoked automatically by Claude when relevant to the conversation. `/run-eval` requires explicit invocation since it consumes API credits.
|
|
139
215
|
|
|
140
216
|
## Running Tests
|
|
141
217
|
|
|
@@ -143,6 +219,12 @@ The full research paper is included at `docs/research/PAPER-FULL-2026-02-04.md`.
|
|
|
143
219
|
npm test
|
|
144
220
|
```
|
|
145
221
|
|
|
222
|
+
## Known Deferred Risks
|
|
223
|
+
|
|
224
|
+
This repository currently accepts a small set of known risks because it is run as an internal localhost-only evaluation system.
|
|
225
|
+
|
|
226
|
+
See `notes/known-risks-localhost-2026-02-13.md` for the tracked risk register, acceptance scope, and hardening triggers required before broader deployment.
|
|
227
|
+
|
|
146
228
|
## Citation
|
|
147
229
|
|
|
148
230
|
If you use this software in your research, please cite:
|
|
@@ -150,7 +232,7 @@ If you use this software in your research, please cite:
|
|
|
150
232
|
```bibtex
|
|
151
233
|
@misc{magee2026machinespirits,
|
|
152
234
|
author = {Magee, Liam},
|
|
153
|
-
title = {
|
|
235
|
+
title = {\textit{Geist} in the Machine: Recognition Theory and Multi-Agent Tutoring},
|
|
154
236
|
year = {2026},
|
|
155
237
|
url = {https://github.com/liammagee/machinespirits-eval}
|
|
156
238
|
}
|
|
@@ -6,9 +6,9 @@
|
|
|
6
6
|
content:
|
|
7
7
|
# Path to a content package containing courses/ with lecture markdown.
|
|
8
8
|
# Relative paths are resolved from the eval repo root.
|
|
9
|
-
# Default: bundled
|
|
10
|
-
#
|
|
11
|
-
content_package_path: "./content
|
|
9
|
+
# Default: bundled 479 course content. Override with EVAL_CONTENT_PATH env var
|
|
10
|
+
# to use the full content-philosophy package (e.g., "../machinespirits-content-philosophy").
|
|
11
|
+
content_package_path: "./content"
|
|
12
12
|
|
|
13
13
|
# Maximum characters to include from a lecture file.
|
|
14
14
|
# Safety valve for token budget in large lectures.
|
|
@@ -0,0 +1,486 @@
|
|
|
1
|
+
{
|
|
2
|
+
"version": "1.4.0",
|
|
3
|
+
"generated": "2026-02-17",
|
|
4
|
+
"database": "data/evaluations.db",
|
|
5
|
+
"paper": "docs/research/paper-full.md",
|
|
6
|
+
|
|
7
|
+
"key_evaluations": [
|
|
8
|
+
{
|
|
9
|
+
"run_ids": ["eval-2026-02-03-86b159cd"],
|
|
10
|
+
"label": "Recognition validation",
|
|
11
|
+
"section": "6.1",
|
|
12
|
+
"primary_judge_pattern": "claude-opus%",
|
|
13
|
+
"unit": "response",
|
|
14
|
+
"expected_attempts": 36,
|
|
15
|
+
"expected_scored": 36
|
|
16
|
+
},
|
|
17
|
+
{
|
|
18
|
+
"run_ids": ["eval-2026-02-03-f5d4dd93"],
|
|
19
|
+
"label": "Full factorial, cells 1-5,7 (Kimi)",
|
|
20
|
+
"section": "6.3",
|
|
21
|
+
"primary_judge_pattern": "claude-opus%",
|
|
22
|
+
"unit": "response",
|
|
23
|
+
"expected_attempts": 262,
|
|
24
|
+
"expected_scored": 262
|
|
25
|
+
},
|
|
26
|
+
{
|
|
27
|
+
"run_ids": ["eval-2026-02-06-a933d745"],
|
|
28
|
+
"label": "Full factorial, cells 6,8 re-run (Kimi)",
|
|
29
|
+
"section": "6.3",
|
|
30
|
+
"primary_judge_pattern": "claude-opus%",
|
|
31
|
+
"unit": "response",
|
|
32
|
+
"expected_attempts": 90,
|
|
33
|
+
"expected_scored": 88
|
|
34
|
+
},
|
|
35
|
+
{
|
|
36
|
+
"run_ids": ["eval-2026-02-05-10b344fb"],
|
|
37
|
+
"label": "A×B replication (Kimi)",
|
|
38
|
+
"section": "6.4",
|
|
39
|
+
"primary_judge_pattern": "claude-opus%",
|
|
40
|
+
"unit": "response",
|
|
41
|
+
"expected_attempts": 60,
|
|
42
|
+
"expected_scored": 60
|
|
43
|
+
},
|
|
44
|
+
{
|
|
45
|
+
"run_ids": ["eval-2026-02-05-e87f452d"],
|
|
46
|
+
"label": "Domain generalizability (Kimi)",
|
|
47
|
+
"section": "6.5",
|
|
48
|
+
"primary_judge_pattern": "claude-opus%",
|
|
49
|
+
"unit": "response",
|
|
50
|
+
"expected_attempts": 60,
|
|
51
|
+
"expected_scored": 60
|
|
52
|
+
},
|
|
53
|
+
{
|
|
54
|
+
"run_ids": ["eval-2026-02-05-daf60f79"],
|
|
55
|
+
"label": "Dynamic rewrite evolution (run 1)",
|
|
56
|
+
"section": "6.18",
|
|
57
|
+
"primary_judge_pattern": "claude-opus%",
|
|
58
|
+
"unit": "response",
|
|
59
|
+
"expected_attempts": 29,
|
|
60
|
+
"expected_scored": 27
|
|
61
|
+
},
|
|
62
|
+
{
|
|
63
|
+
"run_ids": ["eval-2026-02-05-49bb2017"],
|
|
64
|
+
"label": "Dynamic rewrite evolution (run 2)",
|
|
65
|
+
"section": "6.18",
|
|
66
|
+
"primary_judge_pattern": "claude-opus%",
|
|
67
|
+
"unit": "response",
|
|
68
|
+
"expected_attempts": 30,
|
|
69
|
+
"expected_scored": 27
|
|
70
|
+
},
|
|
71
|
+
{
|
|
72
|
+
"run_ids": ["eval-2026-02-05-12aebedb"],
|
|
73
|
+
"label": "Dynamic rewrite evolution (run 3)",
|
|
74
|
+
"section": "6.18",
|
|
75
|
+
"primary_judge_pattern": "claude-opus%",
|
|
76
|
+
"unit": "response",
|
|
77
|
+
"expected_attempts": 30,
|
|
78
|
+
"expected_scored": 29
|
|
79
|
+
},
|
|
80
|
+
{
|
|
81
|
+
"run_ids": ["eval-2026-02-06-81f2d5a1"],
|
|
82
|
+
"label": "Memory isolation (run 1)",
|
|
83
|
+
"section": "6.2",
|
|
84
|
+
"primary_judge_pattern": "claude-opus%",
|
|
85
|
+
"unit": "response",
|
|
86
|
+
"expected_attempts": 60,
|
|
87
|
+
"expected_scored": 60
|
|
88
|
+
},
|
|
89
|
+
{
|
|
90
|
+
"run_ids": ["eval-2026-02-06-ac9ea8f5"],
|
|
91
|
+
"label": "Memory isolation (run 2)",
|
|
92
|
+
"section": "6.2",
|
|
93
|
+
"primary_judge_pattern": "claude-opus%",
|
|
94
|
+
"unit": "response",
|
|
95
|
+
"expected_attempts": 62,
|
|
96
|
+
"expected_scored": 62
|
|
97
|
+
},
|
|
98
|
+
{
|
|
99
|
+
"run_ids": ["eval-2026-02-06-a9ae06ee"],
|
|
100
|
+
"label": "Active control (post-hoc)",
|
|
101
|
+
"section": "6.2",
|
|
102
|
+
"primary_judge_pattern": "claude-opus%",
|
|
103
|
+
"unit": "response",
|
|
104
|
+
"expected_attempts": 119,
|
|
105
|
+
"expected_scored": 118
|
|
106
|
+
},
|
|
107
|
+
{
|
|
108
|
+
"run_ids": ["eval-2026-02-07-b6d75e87"],
|
|
109
|
+
"label": "Bilateral transformation (multi-turn)",
|
|
110
|
+
"section": "6.15",
|
|
111
|
+
"primary_judge_pattern": "claude-opus%",
|
|
112
|
+
"unit": "dialogue",
|
|
113
|
+
"expected_attempts": 120,
|
|
114
|
+
"expected_scored": 118
|
|
115
|
+
},
|
|
116
|
+
{
|
|
117
|
+
"run_ids": ["eval-2026-02-07-722087ac"],
|
|
118
|
+
"label": "A×B probe: Nemotron",
|
|
119
|
+
"section": "6.4",
|
|
120
|
+
"primary_judge_pattern": "claude-opus%",
|
|
121
|
+
"unit": "response",
|
|
122
|
+
"expected_attempts": 120,
|
|
123
|
+
"expected_scored": 119
|
|
124
|
+
},
|
|
125
|
+
{
|
|
126
|
+
"run_ids": ["eval-2026-02-07-70ef73a3"],
|
|
127
|
+
"label": "A×B probe: DeepSeek V3.2",
|
|
128
|
+
"section": "6.4",
|
|
129
|
+
"primary_judge_pattern": "claude-opus%",
|
|
130
|
+
"unit": "response",
|
|
131
|
+
"expected_attempts": 120,
|
|
132
|
+
"expected_scored": 120
|
|
133
|
+
},
|
|
134
|
+
{
|
|
135
|
+
"run_ids": ["eval-2026-02-07-6b3e6565"],
|
|
136
|
+
"label": "A×B probe: GLM-4.7",
|
|
137
|
+
"section": "6.4",
|
|
138
|
+
"primary_judge_pattern": "claude-opus%",
|
|
139
|
+
"unit": "response",
|
|
140
|
+
"expected_attempts": 120,
|
|
141
|
+
"expected_scored": 117
|
|
142
|
+
},
|
|
143
|
+
{
|
|
144
|
+
"run_ids": ["eval-2026-02-07-6ead24c7"],
|
|
145
|
+
"label": "A×B probe: Claude Haiku 4.5",
|
|
146
|
+
"section": "6.4",
|
|
147
|
+
"primary_judge_pattern": "claude-opus%",
|
|
148
|
+
"unit": "response",
|
|
149
|
+
"expected_attempts": 120,
|
|
150
|
+
"expected_scored": 120
|
|
151
|
+
},
|
|
152
|
+
{
|
|
153
|
+
"run_ids": ["eval-2026-02-08-f896275d"],
|
|
154
|
+
"label": "Dialectical impasse test",
|
|
155
|
+
"section": "6.20",
|
|
156
|
+
"primary_judge_pattern": "claude-opus%",
|
|
157
|
+
"unit": "dialogue",
|
|
158
|
+
"expected_attempts": 24,
|
|
159
|
+
"expected_scored": 24
|
|
160
|
+
},
|
|
161
|
+
{
|
|
162
|
+
"run_ids": ["eval-2026-02-08-65a6718f"],
|
|
163
|
+
"label": "Hardwired rules ablation (Kimi)",
|
|
164
|
+
"section": "6.7",
|
|
165
|
+
"primary_judge_pattern": "claude-opus%",
|
|
166
|
+
"unit": "response",
|
|
167
|
+
"expected_attempts": 72,
|
|
168
|
+
"expected_scored": 72
|
|
169
|
+
},
|
|
170
|
+
{
|
|
171
|
+
"run_ids": ["eval-2026-02-07-b6d75e87"],
|
|
172
|
+
"label": "Learner-side evaluation (symmetric)",
|
|
173
|
+
"section": "6.16",
|
|
174
|
+
"primary_judge_pattern": "claude-opus%",
|
|
175
|
+
"unit": "learner turn",
|
|
176
|
+
"expected_attempts": 118,
|
|
177
|
+
"expected_scored": 118,
|
|
178
|
+
"note": "Same run as bilateral transformation; scored with learner rubric"
|
|
179
|
+
},
|
|
180
|
+
{
|
|
181
|
+
"run_ids": ["eval-2026-02-11-35c53e99", "eval-2026-02-11-5f6d51f5"],
|
|
182
|
+
"label": "Dialectical modulation, standard (cells 22-27)",
|
|
183
|
+
"section": "6.8",
|
|
184
|
+
"primary_judge_pattern": "claude-opus%",
|
|
185
|
+
"unit": "response",
|
|
186
|
+
"expected_attempts": 84,
|
|
187
|
+
"expected_scored": 84
|
|
188
|
+
},
|
|
189
|
+
{
|
|
190
|
+
"run_ids": ["eval-2026-02-11-a54235ea"],
|
|
191
|
+
"label": "Dialectical modulation, multi-turn (cells 28-33)",
|
|
192
|
+
"section": "6.8",
|
|
193
|
+
"primary_judge_pattern": "claude-opus%",
|
|
194
|
+
"unit": "dialogue",
|
|
195
|
+
"expected_attempts": 90,
|
|
196
|
+
"expected_scored": 90
|
|
197
|
+
},
|
|
198
|
+
{
|
|
199
|
+
"run_ids": ["eval-2026-02-13-8d40e086"],
|
|
200
|
+
"label": "Self-reflective evolution (cells 40-45, Nemotron)",
|
|
201
|
+
"section": "6.9",
|
|
202
|
+
"primary_judge_pattern": "claude-opus%",
|
|
203
|
+
"unit": "dialogue",
|
|
204
|
+
"expected_attempts": 90,
|
|
205
|
+
"expected_scored": 90
|
|
206
|
+
},
|
|
207
|
+
{
|
|
208
|
+
"run_ids": ["eval-2026-02-14-559d854b"],
|
|
209
|
+
"label": "Self-reflect Nemotron non-replication (cells 40-45)",
|
|
210
|
+
"section": "6.9",
|
|
211
|
+
"primary_judge_pattern": "claude-opus%",
|
|
212
|
+
"unit": "dialogue",
|
|
213
|
+
"expected_attempts": 60,
|
|
214
|
+
"expected_scored": 60,
|
|
215
|
+
"profile_filter": "cell_4%_dialectical_%_superego",
|
|
216
|
+
"note": "Run contains cells 40-59 (N=167) but paper uses only cells 40-45 (N=60); cells 46-59 superseded by 49b33fdd"
|
|
217
|
+
},
|
|
218
|
+
{
|
|
219
|
+
"run_ids": ["eval-2026-02-14-e0e3a622"],
|
|
220
|
+
"label": "Mechanism robustness, scripted (cells 40-59)",
|
|
221
|
+
"section": "6.10",
|
|
222
|
+
"primary_judge_pattern": "claude-opus%",
|
|
223
|
+
"unit": "dialogue",
|
|
224
|
+
"expected_attempts": 360,
|
|
225
|
+
"expected_scored": 360
|
|
226
|
+
},
|
|
227
|
+
{
|
|
228
|
+
"run_ids": ["eval-2026-02-14-6c033830"],
|
|
229
|
+
"label": "Dynamic learner mechanisms (cells 60-63)",
|
|
230
|
+
"section": "6.10",
|
|
231
|
+
"primary_judge_pattern": "claude-opus%",
|
|
232
|
+
"unit": "dialogue",
|
|
233
|
+
"expected_attempts": 120,
|
|
234
|
+
"expected_scored": 120
|
|
235
|
+
},
|
|
236
|
+
{
|
|
237
|
+
"run_ids": ["eval-2026-02-14-a2b2717c"],
|
|
238
|
+
"label": "Dynamic learner mechanisms (cells 64-65)",
|
|
239
|
+
"section": "6.10",
|
|
240
|
+
"primary_judge_pattern": "claude-opus%",
|
|
241
|
+
"unit": "dialogue",
|
|
242
|
+
"expected_attempts": 120,
|
|
243
|
+
"expected_scored": 120
|
|
244
|
+
},
|
|
245
|
+
{
|
|
246
|
+
"run_ids": ["eval-2026-02-14-49b33fdd"],
|
|
247
|
+
"label": "Mechanism robustness, Nemotron (cells 40-59)",
|
|
248
|
+
"section": "6.10",
|
|
249
|
+
"primary_judge_pattern": "claude-opus%",
|
|
250
|
+
"unit": "dialogue",
|
|
251
|
+
"expected_attempts": 360,
|
|
252
|
+
"expected_scored": 360
|
|
253
|
+
},
|
|
254
|
+
{
|
|
255
|
+
"run_ids": ["eval-2026-02-17-25aaae85"],
|
|
256
|
+
"label": "Cognitive prosthesis (cells 66-68, Nemotron)",
|
|
257
|
+
"section": "6.10",
|
|
258
|
+
"primary_judge_pattern": "claude-opus%",
|
|
259
|
+
"unit": "dialogue",
|
|
260
|
+
"expected_attempts": 90,
|
|
261
|
+
"expected_scored": 90
|
|
262
|
+
},
|
|
263
|
+
{
|
|
264
|
+
"run_ids": ["eval-2026-02-18-f489c0ea"],
|
|
265
|
+
"label": "Cognitive prosthesis smoke test (Haiku)",
|
|
266
|
+
"section": "6.10",
|
|
267
|
+
"primary_judge_pattern": "claude-opus%",
|
|
268
|
+
"unit": "dialogue",
|
|
269
|
+
"expected_attempts": 6,
|
|
270
|
+
"expected_scored": 6
|
|
271
|
+
},
|
|
272
|
+
{
|
|
273
|
+
"run_ids": ["eval-2026-02-15-664073ab"],
|
|
274
|
+
"label": "Dynamic learner base mechanisms (cells 69-70)",
|
|
275
|
+
"section": "6.10",
|
|
276
|
+
"primary_judge_pattern": "claude-opus%",
|
|
277
|
+
"unit": "dialogue",
|
|
278
|
+
"expected_attempts": 60,
|
|
279
|
+
"expected_scored": 60
|
|
280
|
+
},
|
|
281
|
+
{
|
|
282
|
+
"run_ids": ["eval-2026-02-17-deee5fd6"],
|
|
283
|
+
"label": "Prompt elaboration baseline, Haiku (cells 1, 71)",
|
|
284
|
+
"section": "6.21",
|
|
285
|
+
"primary_judge_pattern": "claude-opus%",
|
|
286
|
+
"unit": "single-turn",
|
|
287
|
+
"expected_attempts": 72,
|
|
288
|
+
"expected_scored": 72
|
|
289
|
+
},
|
|
290
|
+
{
|
|
291
|
+
"run_ids": ["eval-2026-02-17-27d7b4e3"],
|
|
292
|
+
"label": "Prompt elaboration baseline, Kimi (cells 1, 71)",
|
|
293
|
+
"section": "6.21",
|
|
294
|
+
"primary_judge_pattern": "claude-opus%",
|
|
295
|
+
"unit": "single-turn",
|
|
296
|
+
"expected_attempts": 72,
|
|
297
|
+
"expected_scored": 72
|
|
298
|
+
},
|
|
299
|
+
{
|
|
300
|
+
"run_ids": ["eval-2026-02-17-0eb3de77"],
|
|
301
|
+
"label": "Token budget 256, Haiku (run 1)",
|
|
302
|
+
"section": "6.22",
|
|
303
|
+
"primary_judge_pattern": "claude-opus%",
|
|
304
|
+
"unit": "mixed",
|
|
305
|
+
"expected_attempts": 36,
|
|
306
|
+
"expected_scored": 36
|
|
307
|
+
},
|
|
308
|
+
{
|
|
309
|
+
"run_ids": ["eval-2026-02-17-5a640782"],
|
|
310
|
+
"label": "Token budget 256, Haiku (run 2)",
|
|
311
|
+
"section": "6.22",
|
|
312
|
+
"primary_judge_pattern": "claude-opus%",
|
|
313
|
+
"unit": "mixed",
|
|
314
|
+
"expected_attempts": 36,
|
|
315
|
+
"expected_scored": 36
|
|
316
|
+
},
|
|
317
|
+
{
|
|
318
|
+
"run_ids": ["eval-2026-02-17-5f281654"],
|
|
319
|
+
"label": "Token budget 512, Haiku",
|
|
320
|
+
"section": "6.22",
|
|
321
|
+
"primary_judge_pattern": "claude-opus%",
|
|
322
|
+
"unit": "mixed",
|
|
323
|
+
"expected_attempts": 36,
|
|
324
|
+
"expected_scored": 36
|
|
325
|
+
},
|
|
326
|
+
{
|
|
327
|
+
"run_ids": ["eval-2026-02-17-0f6dcd97"],
|
|
328
|
+
"label": "Token budget 2048, Haiku",
|
|
329
|
+
"section": "6.22",
|
|
330
|
+
"primary_judge_pattern": "claude-opus%",
|
|
331
|
+
"unit": "mixed",
|
|
332
|
+
"expected_attempts": 36,
|
|
333
|
+
"expected_scored": 36
|
|
334
|
+
},
|
|
335
|
+
{
|
|
336
|
+
"run_ids": ["eval-2026-02-17-d32ed226"],
|
|
337
|
+
"label": "Token budget default, Haiku",
|
|
338
|
+
"section": "6.22",
|
|
339
|
+
"primary_judge_pattern": "claude-opus%",
|
|
340
|
+
"unit": "mixed",
|
|
341
|
+
"expected_attempts": 18,
|
|
342
|
+
"expected_scored": 18
|
|
343
|
+
}
|
|
344
|
+
],
|
|
345
|
+
|
|
346
|
+
"totals": {
|
|
347
|
+
"evaluations": 37,
|
|
348
|
+
"expected_attempts": 3398,
|
|
349
|
+
"expected_scored": 3383,
|
|
350
|
+
"opus_primary_count": 37,
|
|
351
|
+
"sonnet_primary_count": 0,
|
|
352
|
+
"sonnet_primary_runs": []
|
|
353
|
+
},
|
|
354
|
+
|
|
355
|
+
"figures": {
|
|
356
|
+
"figure1": {
|
|
357
|
+
"title": "2×2×2 Factorial Cell Means",
|
|
358
|
+
"section": "6.3",
|
|
359
|
+
"runs": ["eval-2026-02-03-f5d4dd93", "eval-2026-02-06-a933d745"],
|
|
360
|
+
"cells": [1, 2, 3, 4, 5, 6, 7, 8],
|
|
361
|
+
"judge_filter": "claude-opus%",
|
|
362
|
+
"notes": "Combine f5d4dd93 (cells 1-5,7) and a933d745 (cells 6,8)"
|
|
363
|
+
},
|
|
364
|
+
"figure2": {
|
|
365
|
+
"title": "Memory Isolation 2×2",
|
|
366
|
+
"section": "6.2",
|
|
367
|
+
"runs": ["eval-2026-02-06-81f2d5a1", "eval-2026-02-06-ac9ea8f5"],
|
|
368
|
+
"profiles": {
|
|
369
|
+
"base": "cell_1_base_single_unified",
|
|
370
|
+
"memory_only": "cell_19_memory_single_unified",
|
|
371
|
+
"recognition_only": "cell_20_recog_nomem_single_unified",
|
|
372
|
+
"recognition_plus_memory": "cell_5_recog_single_unified"
|
|
373
|
+
},
|
|
374
|
+
"judge_filter": "claude-opus%"
|
|
375
|
+
},
|
|
376
|
+
"figure3": {
|
|
377
|
+
"title": "Active Control Comparison",
|
|
378
|
+
"section": "6.2",
|
|
379
|
+
"runs": {
|
|
380
|
+
"active_control": "eval-2026-02-06-a9ae06ee",
|
|
381
|
+
"factorial_base": ["eval-2026-02-03-f5d4dd93", "eval-2026-02-06-a933d745"]
|
|
382
|
+
},
|
|
383
|
+
"judge_filter": "claude-opus%",
|
|
384
|
+
"notes": "Cross-run comparison: active control (Nemotron) vs factorial (Kimi). Model confound acknowledged."
|
|
385
|
+
},
|
|
386
|
+
"figure4": {
|
|
387
|
+
"title": "Multi-Model A×B Probe",
|
|
388
|
+
"section": "6.4",
|
|
389
|
+
"runs": {
|
|
390
|
+
"kimi": {
|
|
391
|
+
"run_ids": ["eval-2026-02-03-f5d4dd93"],
|
|
392
|
+
"cells": [1, 3, 5, 7],
|
|
393
|
+
"label": "Kimi K2.5",
|
|
394
|
+
"expected_n": 179
|
|
395
|
+
},
|
|
396
|
+
"nemotron": {
|
|
397
|
+
"run_ids": ["eval-2026-02-07-722087ac"],
|
|
398
|
+
"cells": [1, 3, 5, 7],
|
|
399
|
+
"label": "Nemotron",
|
|
400
|
+
"expected_n": 119
|
|
401
|
+
},
|
|
402
|
+
"deepseek": {
|
|
403
|
+
"run_ids": ["eval-2026-02-07-70ef73a3"],
|
|
404
|
+
"cells": [1, 3, 5, 7],
|
|
405
|
+
"label": "DeepSeek V3.2",
|
|
406
|
+
"expected_n": 120
|
|
407
|
+
},
|
|
408
|
+
"glm": {
|
|
409
|
+
"run_ids": ["eval-2026-02-07-6b3e6565"],
|
|
410
|
+
"cells": [1, 3, 5, 7],
|
|
411
|
+
"label": "GLM-4.7",
|
|
412
|
+
"expected_n": 117
|
|
413
|
+
},
|
|
414
|
+
"haiku": {
|
|
415
|
+
"run_ids": ["eval-2026-02-07-6ead24c7"],
|
|
416
|
+
"cells": [1, 3, 5, 7],
|
|
417
|
+
"label": "Claude Haiku 4.5",
|
|
418
|
+
"expected_n": 120
|
|
419
|
+
}
|
|
420
|
+
},
|
|
421
|
+
"judge_filter": "claude-opus%"
|
|
422
|
+
},
|
|
423
|
+
"figure5": {
|
|
424
|
+
"title": "Domain Generalizability",
|
|
425
|
+
"section": "6.5",
|
|
426
|
+
"runs": {
|
|
427
|
+
"elementary": "eval-2026-02-05-e87f452d",
|
|
428
|
+
"philosophy": "eval-2026-02-03-f5d4dd93"
|
|
429
|
+
},
|
|
430
|
+
"cells": [1, 3, 5, 7],
|
|
431
|
+
"judge_filter": "claude-opus%",
|
|
432
|
+
"notes": "Elementary from domain gen run; philosophy from factorial single-learner cells"
|
|
433
|
+
},
|
|
434
|
+
"figure7": {
|
|
435
|
+
"title": "Superego Persona × Recognition",
|
|
436
|
+
"section": "6.8",
|
|
437
|
+
"runs": ["eval-2026-02-11-a54235ea"],
|
|
438
|
+
"cells": [28, 29, 30, 31, 32, 33],
|
|
439
|
+
"judge_filter": "claude-opus%"
|
|
440
|
+
},
|
|
441
|
+
"figure8": {
|
|
442
|
+
"title": "Mechanism Differentiation — Scripted vs Dynamic",
|
|
443
|
+
"section": "6.10",
|
|
444
|
+
"runs": {
|
|
445
|
+
"scripted": "eval-2026-02-14-e0e3a622",
|
|
446
|
+
"dynamic_60_63": "eval-2026-02-14-6c033830",
|
|
447
|
+
"dynamic_64_65": "eval-2026-02-14-a2b2717c",
|
|
448
|
+
"cognitive": "eval-2026-02-14-50487df7",
|
|
449
|
+
"base_69_70": "eval-2026-02-15-664073ab"
|
|
450
|
+
},
|
|
451
|
+
"judge_filter": "claude-opus%",
|
|
452
|
+
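"notes": "Run eval-2026-02-14-50487df7 (cognitive) was scored by a Sonnet judge; all other runs use Opus. The judge_filter 'claude-opus%' would therefore not match the cognitive run's judge (confirm this is intended)."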
|
|
453
|
+
},
|
|
454
|
+
"figure9": {
|
|
455
|
+
"title": "Qualitative Tag Divergence",
|
|
456
|
+
"section": "6.11",
|
|
457
|
+
"runs": ["eval-2026-02-07-b6d75e87"],
|
|
458
|
+
"source": "qualitative_assessment column, not overall_score",
|
|
459
|
+
"data_driven": false,
|
|
460
|
+
"notes": "Tag data comes from the qualitative assessment, not from numerical scores. Because this figure is not data-driven, its values are kept hardcoded, with an explanatory comment."
|
|
461
|
+
}
|
|
462
|
+
},
|
|
463
|
+
|
|
464
|
+
"tables": {
|
|
465
|
+
"table2": {
|
|
466
|
+
"title": "Evaluation Sample Summary",
|
|
467
|
+
"source": "key_evaluations array above",
|
|
468
|
+
"notes": "Generated from manifest + DB query. Paper Table 2 'Attempts' column may exceed manifest expected_attempts because it includes unjudged failures (rows with empty judge_model) that are excluded from manifest counts."
|
|
469
|
+
},
|
|
470
|
+
"appendix_d": {
|
|
471
|
+
"title": "Reproducibility and Key Evaluation Run IDs",
|
|
472
|
+
"source": "key_evaluations array above",
|
|
473
|
+
"notes": "Generated from manifest"
|
|
474
|
+
}
|
|
475
|
+
},
|
|
476
|
+
|
|
477
|
+
"prose_n_references": [
|
|
478
|
+
{"location": "abstract", "pattern": "N=3,383 primary scored"},
|
|
479
|
+
{"location": "introduction (line ~44)", "pattern": "N=3,383 primary scored responses"},
|
|
480
|
+
{"location": "methods (Table 2 totals)", "pattern": "**3,383**"},
|
|
481
|
+
{"location": "methods (line ~572)", "pattern": "N=3,383 scored"},
|
|
482
|
+
{"location": "methods (line ~574)", "pattern": "N=3,383"},
|
|
483
|
+
{"location": "discussion (line ~1817)", "pattern": "N=3,383"},
|
|
484
|
+
{"location": "conclusion (line ~1899)", "pattern": "N=3,383 primary scored"}
|
|
485
|
+
]
|
|
486
|
+
}
|