@machinespirits/eval 0.1.2 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +161 -0
- package/config/eval-settings.yaml +18 -0
- package/config/evaluation-rubric-learner.yaml +277 -0
- package/config/evaluation-rubric.yaml +613 -0
- package/config/interaction-eval-scenarios.yaml +93 -50
- package/config/learner-agents.yaml +124 -193
- package/config/machinespirits-eval.code-workspace +11 -0
- package/config/providers.yaml +60 -0
- package/config/suggestion-scenarios.yaml +1399 -0
- package/config/tutor-agents.yaml +716 -0
- package/docs/EVALUATION-VARIABLES.md +589 -0
- package/docs/REPLICATION-PLAN.md +577 -0
- package/index.js +15 -6
- package/package.json +16 -22
- package/routes/evalRoutes.js +88 -36
- package/scripts/analyze-judge-reliability.js +401 -0
- package/scripts/analyze-run.js +97 -0
- package/scripts/analyze-run.mjs +282 -0
- package/scripts/analyze-validation-failures.js +141 -0
- package/scripts/check-run.mjs +17 -0
- package/scripts/code-impasse-strategies.js +1132 -0
- package/scripts/compare-runs.js +44 -0
- package/scripts/compare-suggestions.js +80 -0
- package/scripts/compare-transformation.js +116 -0
- package/scripts/dig-into-run.js +158 -0
- package/scripts/eval-cli.js +2626 -0
- package/scripts/generate-paper-figures.py +452 -0
- package/scripts/qualitative-analysis-ai.js +1313 -0
- package/scripts/qualitative-analysis.js +688 -0
- package/scripts/seed-db.js +87 -0
- package/scripts/show-failed-suggestions.js +64 -0
- package/scripts/validate-content.js +192 -0
- package/server.js +3 -2
- package/services/__tests__/evalConfigLoader.test.js +338 -0
- package/services/anovaStats.js +499 -0
- package/services/contentResolver.js +407 -0
- package/services/dialogueTraceAnalyzer.js +454 -0
- package/services/evalConfigLoader.js +625 -0
- package/services/evaluationRunner.js +2171 -270
- package/services/evaluationStore.js +564 -29
- package/services/learnerConfigLoader.js +75 -5
- package/services/learnerRubricEvaluator.js +284 -0
- package/services/learnerTutorInteractionEngine.js +375 -0
- package/services/processUtils.js +18 -0
- package/services/progressLogger.js +98 -0
- package/services/promptRecommendationService.js +31 -26
- package/services/promptRewriter.js +427 -0
- package/services/rubricEvaluator.js +543 -70
- package/services/streamingReporter.js +104 -0
- package/services/turnComparisonAnalyzer.js +494 -0
- package/components/MobileEvalDashboard.tsx +0 -267
- package/components/comparison/DeltaAnalysisTable.tsx +0 -137
- package/components/comparison/ProfileComparisonCard.tsx +0 -176
- package/components/comparison/RecognitionABMode.tsx +0 -385
- package/components/comparison/RecognitionMetricsPanel.tsx +0 -135
- package/components/comparison/WinnerIndicator.tsx +0 -64
- package/components/comparison/index.ts +0 -5
- package/components/mobile/BottomSheet.tsx +0 -233
- package/components/mobile/DimensionBreakdown.tsx +0 -210
- package/components/mobile/DocsView.tsx +0 -363
- package/components/mobile/LogsView.tsx +0 -481
- package/components/mobile/PsychodynamicQuadrant.tsx +0 -261
- package/components/mobile/QuickTestView.tsx +0 -1098
- package/components/mobile/RecognitionTypeChart.tsx +0 -124
- package/components/mobile/RecognitionView.tsx +0 -809
- package/components/mobile/RunDetailView.tsx +0 -261
- package/components/mobile/RunHistoryView.tsx +0 -367
- package/components/mobile/ScoreRadial.tsx +0 -211
- package/components/mobile/StreamingLogPanel.tsx +0 -230
- package/components/mobile/SynthesisStrategyChart.tsx +0 -140
- package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +0 -52
- package/docs/research/ABLATION-MODEL-SELECTION.md +0 -53
- package/docs/research/ADVANCED-EVAL-ANALYSIS.md +0 -60
- package/docs/research/ANOVA-RESULTS-2026-01-14.md +0 -257
- package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +0 -586
- package/docs/research/COST-ANALYSIS.md +0 -56
- package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +0 -340
- package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +0 -291
- package/docs/research/EVAL-SYSTEM-ANALYSIS.md +0 -306
- package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +0 -301
- package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +0 -1988
- package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +0 -282
- package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +0 -147
- package/docs/research/PAPER-EXTENSION-DYADIC.md +0 -204
- package/docs/research/PAPER-UNIFIED.md +0 -659
- package/docs/research/PAPER-UNIFIED.pdf +0 -0
- package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +0 -356
- package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +0 -419
- package/docs/research/apa.csl +0 -2133
- package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +0 -1637
- package/docs/research/archive/paper-multiagent-tutor.tex +0 -978
- package/docs/research/paper-draft/full-paper.md +0 -136
- package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
- package/docs/research/paper-draft/references.bib +0 -515
- package/docs/research/transcript-baseline.md +0 -139
- package/docs/research/transcript-recognition-multiagent.md +0 -187
- package/hooks/useEvalData.ts +0 -625
- package/server-init.js +0 -45
- package/services/benchmarkService.js +0 -1892
- package/types.ts +0 -165
- package/utils/haptics.ts +0 -45
|
@@ -0,0 +1,716 @@
|
|
|
1
|
+
# Local Tutor Agent Configuration (Eval Repo Override)
|
|
2
|
+
#
|
|
3
|
+
# This file overrides tutor-core's tutor-agents.yaml for evaluation purposes.
|
|
4
|
+
# Model aliases (e.g. "sonnet", "nemotron") are resolved through providers.yaml.
|
|
5
|
+
#
|
|
6
|
+
# ============================================================================
|
|
7
|
+
# MODEL OVERRIDES (optional)
|
|
8
|
+
# ============================================================================
|
|
9
|
+
# These override ALL profile models when uncommented. Useful for quick testing.
|
|
10
|
+
# CLI flags (--model, --ego-model, --superego-model) take precedence over these.
|
|
11
|
+
#
|
|
12
|
+
# model_override: openrouter.haiku # Override ALL models (ego + superego)
|
|
13
|
+
# ego_model_override: openrouter.nemotron # Override only ego model
|
|
14
|
+
# superego_model_override: openrouter.kimi-k2.5 # Override only superego model
|
|
15
|
+
#
|
|
16
|
+
# ============================================================================
|
|
17
|
+
# FACTORIAL EVALUATION DESIGN
|
|
18
|
+
# ============================================================================
|
|
19
|
+
#
|
|
20
|
+
# Three independent variables:
|
|
21
|
+
#
|
|
22
|
+
# Factor A: Prompt Type — base / enhanced / recognition (3 main levels; ablation/control levels added below)
|
|
23
|
+
# Factor B: Tutor Architecture — single ego vs ego+superego dialogue (2 levels)
|
|
24
|
+
# Factor C: Learner Architecture — unified vs ego_superego (2 levels)
|
|
25
|
+
#
|
|
26
|
+
# Factor A levels:
|
|
27
|
+
# - base: Original Claude-generated prompts (default LLM tutoring)
|
|
28
|
+
# - enhanced: Matches recognition specificity without recognition theory
|
|
29
|
+
# - recognition: Full recognition-enhanced prompts with Hegelian theory
|
|
30
|
+
# - hardwired: Base + 5 superego-derived rules (no live superego) [ABLATION]
|
|
31
|
+
#
|
|
32
|
+
# This design enables isolating:
|
|
33
|
+
# - Base vs Enhanced = prompt engineering effect
|
|
34
|
+
# - Enhanced vs Recognition = isolated recognition theory effect
|
|
35
|
+
# - Base vs Recognition = combined effect (original comparison)
|
|
36
|
+
#
|
|
37
|
+
# Most cells use nemotron (free tier) to limit model cost; note cells 6 and 8 use kimi-k2.5 for the ego (see individual cell definitions).
|
|
38
|
+
#
|
|
39
|
+
# Design Matrix (12 main cells; additional ablation/control cells 13-21 defined below):
|
|
40
|
+
#
|
|
41
|
+
# ┌────────────────────┬──────────────────────────┬──────────────────────────┐
|
|
42
|
+
# │ │ Unified Learner │ Ego/Superego Learner │
|
|
43
|
+
# ├────────────────────┼──────────────┬───────────┼──────────────┬───────────┤
|
|
44
|
+
# │ │ Single Tutor │ E+S Tutor │ Single Tutor │ E+S Tutor │
|
|
45
|
+
# ├────────────────────┼──────────────┼───────────┼──────────────┼───────────┤
|
|
46
|
+
# │ Base │ cell_1 │ cell_3 │ cell_2 │ cell_4 │
|
|
47
|
+
# ├────────────────────┼──────────────┼───────────┼──────────────┼───────────┤
|
|
48
|
+
# │ Enhanced │ cell_9 │ cell_11 │ cell_10 │ cell_12 │
|
|
49
|
+
# ├────────────────────┼──────────────┼───────────┼──────────────┼───────────┤
|
|
50
|
+
# │ Recognition │ cell_5 │ cell_7 │ cell_6 │ cell_8 │
|
|
51
|
+
# └────────────────────┴──────────────┴───────────┴──────────────┴───────────┘
|
|
52
|
+
#
|
|
53
|
+
# ABLATION (hardwired = superego rules in ego prompt, no live superego):
|
|
54
|
+
# ┌────────────────────┬──────────────┬──────────────┐
|
|
55
|
+
# │ Hardwired │ cell_13 │ cell_14 │
|
|
56
|
+
# └────────────────────┴──────────────┴──────────────┘
|
|
57
|
+
#
|
|
58
|
+
# Ablation comparisons:
|
|
59
|
+
# - cell_1/2 vs cell_13/14 = Do hardwired rules improve single-agent?
|
|
60
|
+
# - cell_3/4 vs cell_13/14 = Can hardwired rules replace live superego?
|
|
61
|
+
#
|
|
62
|
+
# This allows analyzing:
|
|
63
|
+
# - Main effect of prompt type (base vs enhanced vs recognition)
|
|
64
|
+
# - Main effect of tutor architecture (single agent vs ego + superego dialogue)
|
|
65
|
+
# - Main effect of learner architecture (unified vs ego_superego)
|
|
66
|
+
# - All two-way and three-way interactions
|
|
67
|
+
# - Superego architecture ablation (rules vs dynamic dialogue)
|
|
68
|
+
#
|
|
69
|
+
# ============================================================================
|
|
70
|
+
|
|
71
|
+
active_profile: budget
|
|
72
|
+
|
|
73
|
+
profiles:
|
|
74
|
+
# ===========================================================================
|
|
75
|
+
# DEVELOPMENT PROFILE (not part of factorial design)
|
|
76
|
+
# ===========================================================================
|
|
77
|
+
|
|
78
|
+
budget:
|
|
79
|
+
description: "Budget-friendly single agent - Nemotron (free) via OpenRouter, no dialogue"
|
|
80
|
+
dialogue:
|
|
81
|
+
enabled: false
|
|
82
|
+
max_rounds: 0
|
|
83
|
+
ego:
|
|
84
|
+
provider: openrouter
|
|
85
|
+
model: nemotron
|
|
86
|
+
prompt_file: tutor-ego.md
|
|
87
|
+
hyperparameters:
|
|
88
|
+
temperature: 0.6
|
|
89
|
+
max_tokens: 8000
|
|
90
|
+
superego: null
|
|
91
|
+
|
|
92
|
+
# ===========================================================================
|
|
93
|
+
# 2×2×2 FACTORIAL CELLS (Recognition × Tutor Architecture × Learner Architecture)
|
|
94
|
+
# ===========================================================================
|
|
95
|
+
# These 8 profiles support the unified factorial evaluation pipeline.
|
|
96
|
+
# Use: node scripts/eval-cli.js run --factorial --runs 3
|
|
97
|
+
# Most cells use nemotron (free tier) to isolate architectural effects from model cost; cells 6 and 8 use kimi-k2.5 for the ego.
|
|
98
|
+
|
|
99
|
+
# Cell 1: Base × Single Agent × Unified Learner
|
|
100
|
+
cell_1_base_single_unified:
|
|
101
|
+
description: "Factorial cell 1: baseline single-agent tutor, unified learner"
|
|
102
|
+
factors:
|
|
103
|
+
prompt_type: base
|
|
104
|
+
multi_agent_tutor: false
|
|
105
|
+
multi_agent_learner: false
|
|
106
|
+
learner_architecture: unified
|
|
107
|
+
recognition_mode: false
|
|
108
|
+
memory_enabled: false
|
|
109
|
+
dialogue:
|
|
110
|
+
enabled: false
|
|
111
|
+
max_rounds: 0
|
|
112
|
+
ego:
|
|
113
|
+
provider: openrouter
|
|
114
|
+
model: nemotron
|
|
115
|
+
prompt_file: tutor-ego.md
|
|
116
|
+
hyperparameters:
|
|
117
|
+
temperature: 0.6
|
|
118
|
+
max_tokens: 8000
|
|
119
|
+
superego: null
|
|
120
|
+
|
|
121
|
+
# Cell 2: Base × Single Agent × Ego/Superego Learner
|
|
122
|
+
cell_2_base_single_psycho:
|
|
123
|
+
description: "Factorial cell 2: baseline single-agent tutor, ego/superego learner"
|
|
124
|
+
factors:
|
|
125
|
+
prompt_type: base
|
|
126
|
+
multi_agent_tutor: false
|
|
127
|
+
multi_agent_learner: true
|
|
128
|
+
learner_architecture: ego_superego
|
|
129
|
+
recognition_mode: false
|
|
130
|
+
memory_enabled: false
|
|
131
|
+
dialogue:
|
|
132
|
+
enabled: false
|
|
133
|
+
max_rounds: 0
|
|
134
|
+
ego:
|
|
135
|
+
provider: openrouter
|
|
136
|
+
model: nemotron
|
|
137
|
+
prompt_file: tutor-ego.md
|
|
138
|
+
hyperparameters:
|
|
139
|
+
temperature: 0.6
|
|
140
|
+
max_tokens: 8000
|
|
141
|
+
superego: null
|
|
142
|
+
|
|
143
|
+
# Cell 3: Base × Multi-Agent Tutor × Unified Learner
|
|
144
|
+
cell_3_base_multi_unified:
|
|
145
|
+
description: "Factorial cell 3: ego+superego tutor, unified learner"
|
|
146
|
+
factors:
|
|
147
|
+
prompt_type: base
|
|
148
|
+
multi_agent_tutor: true
|
|
149
|
+
multi_agent_learner: false
|
|
150
|
+
learner_architecture: unified
|
|
151
|
+
recognition_mode: false
|
|
152
|
+
memory_enabled: false
|
|
153
|
+
dialogue:
|
|
154
|
+
enabled: true
|
|
155
|
+
max_rounds: 2
|
|
156
|
+
convergence_threshold: 0.8
|
|
157
|
+
ego:
|
|
158
|
+
provider: openrouter
|
|
159
|
+
model: nemotron
|
|
160
|
+
staging: front
|
|
161
|
+
prompt_file: tutor-ego.md
|
|
162
|
+
hyperparameters:
|
|
163
|
+
temperature: 0.6
|
|
164
|
+
max_tokens: 8000
|
|
165
|
+
superego:
|
|
166
|
+
provider: openrouter
|
|
167
|
+
model: kimi-k2.5
|
|
168
|
+
staging: back
|
|
169
|
+
prompt_file: tutor-superego.md
|
|
170
|
+
hyperparameters:
|
|
171
|
+
temperature: 0.2
|
|
172
|
+
max_tokens: 8000
|
|
173
|
+
|
|
174
|
+
# Cell 4: Base × Multi-Agent Tutor × Ego/Superego Learner
|
|
175
|
+
cell_4_base_multi_psycho:
|
|
176
|
+
description: "Factorial cell 4: ego+superego tutor, ego/superego learner"
|
|
177
|
+
factors:
|
|
178
|
+
prompt_type: base
|
|
179
|
+
multi_agent_tutor: true
|
|
180
|
+
multi_agent_learner: true
|
|
181
|
+
learner_architecture: ego_superego
|
|
182
|
+
recognition_mode: false
|
|
183
|
+
memory_enabled: false
|
|
184
|
+
dialogue:
|
|
185
|
+
enabled: true
|
|
186
|
+
max_rounds: 2
|
|
187
|
+
convergence_threshold: 0.8
|
|
188
|
+
ego:
|
|
189
|
+
provider: openrouter
|
|
190
|
+
model: nemotron
|
|
191
|
+
staging: front
|
|
192
|
+
prompt_file: tutor-ego.md
|
|
193
|
+
hyperparameters:
|
|
194
|
+
temperature: 0.6
|
|
195
|
+
max_tokens: 8000
|
|
196
|
+
superego:
|
|
197
|
+
provider: openrouter
|
|
198
|
+
model: kimi-k2.5
|
|
199
|
+
staging: back
|
|
200
|
+
prompt_file: tutor-superego.md
|
|
201
|
+
hyperparameters:
|
|
202
|
+
temperature: 0.2
|
|
203
|
+
max_tokens: 8000
|
|
204
|
+
|
|
205
|
+
# Cell 5: Recognition × Single Agent × Unified Learner
|
|
206
|
+
cell_5_recog_single_unified:
|
|
207
|
+
description: "Factorial cell 5: recognition single-agent tutor, unified learner"
|
|
208
|
+
factors:
|
|
209
|
+
prompt_type: recognition
|
|
210
|
+
multi_agent_tutor: false
|
|
211
|
+
multi_agent_learner: false
|
|
212
|
+
learner_architecture: unified_recognition
|
|
213
|
+
recognition_mode: true
|
|
214
|
+
memory_enabled: true
|
|
215
|
+
dialogue:
|
|
216
|
+
enabled: false
|
|
217
|
+
max_rounds: 0
|
|
218
|
+
ego:
|
|
219
|
+
provider: openrouter
|
|
220
|
+
model: nemotron
|
|
221
|
+
prompt_file: tutor-ego-recognition.md
|
|
222
|
+
hyperparameters:
|
|
223
|
+
temperature: 0.6
|
|
224
|
+
max_tokens: 8000
|
|
225
|
+
superego: null
|
|
226
|
+
|
|
227
|
+
# Cell 6: Recognition × Single Agent × Ego/Superego Learner
|
|
228
|
+
cell_6_recog_single_psycho:
|
|
229
|
+
description: "Factorial cell 6: recognition single-agent tutor, ego/superego learner"
|
|
230
|
+
factors:
|
|
231
|
+
prompt_type: recognition
|
|
232
|
+
multi_agent_tutor: false
|
|
233
|
+
multi_agent_learner: true
|
|
234
|
+
learner_architecture: ego_superego_recognition
|
|
235
|
+
recognition_mode: true
|
|
236
|
+
memory_enabled: true
|
|
237
|
+
dialogue:
|
|
238
|
+
enabled: false
|
|
239
|
+
max_rounds: 0
|
|
240
|
+
ego:
|
|
241
|
+
provider: openrouter
|
|
242
|
+
model: kimi-k2.5
|
|
243
|
+
prompt_file: tutor-ego-recognition.md
|
|
244
|
+
hyperparameters:
|
|
245
|
+
temperature: 0.6
|
|
246
|
+
max_tokens: 8000
|
|
247
|
+
superego: null
|
|
248
|
+
|
|
249
|
+
# Cell 7: Recognition × Multi-Agent Tutor × Unified Learner
|
|
250
|
+
cell_7_recog_multi_unified:
|
|
251
|
+
description: "Factorial cell 7: recognition ego+superego tutor, unified learner"
|
|
252
|
+
factors:
|
|
253
|
+
prompt_type: recognition
|
|
254
|
+
multi_agent_tutor: true
|
|
255
|
+
multi_agent_learner: false
|
|
256
|
+
learner_architecture: unified_recognition
|
|
257
|
+
recognition_mode: true
|
|
258
|
+
memory_enabled: true
|
|
259
|
+
dialogue:
|
|
260
|
+
enabled: true
|
|
261
|
+
max_rounds: 2
|
|
262
|
+
convergence_threshold: 0.7
|
|
263
|
+
ego:
|
|
264
|
+
provider: openrouter
|
|
265
|
+
model: nemotron
|
|
266
|
+
staging: front
|
|
267
|
+
prompt_file: tutor-ego-recognition.md
|
|
268
|
+
hyperparameters:
|
|
269
|
+
temperature: 0.6
|
|
270
|
+
max_tokens: 8000
|
|
271
|
+
superego:
|
|
272
|
+
provider: openrouter
|
|
273
|
+
model: kimi-k2.5
|
|
274
|
+
staging: back
|
|
275
|
+
prompt_file: tutor-superego-recognition.md
|
|
276
|
+
hyperparameters:
|
|
277
|
+
temperature: 0.2
|
|
278
|
+
max_tokens: 8000
|
|
279
|
+
intervention_strategies:
|
|
280
|
+
enforce_mutual_recognition: true
|
|
281
|
+
require_memory_integration: true
|
|
282
|
+
assess_transformative_potential: true
|
|
283
|
+
|
|
284
|
+
# Cell 8: Recognition × Multi-Agent Tutor × Ego/Superego Learner
|
|
285
|
+
cell_8_recog_multi_psycho:
|
|
286
|
+
description: "Factorial cell 8: recognition ego+superego tutor, ego/superego learner"
|
|
287
|
+
factors:
|
|
288
|
+
prompt_type: recognition
|
|
289
|
+
multi_agent_tutor: true
|
|
290
|
+
multi_agent_learner: true
|
|
291
|
+
learner_architecture: ego_superego_recognition
|
|
292
|
+
recognition_mode: true
|
|
293
|
+
memory_enabled: true
|
|
294
|
+
dialogue:
|
|
295
|
+
enabled: true
|
|
296
|
+
max_rounds: 2
|
|
297
|
+
convergence_threshold: 0.7
|
|
298
|
+
ego:
|
|
299
|
+
provider: openrouter
|
|
300
|
+
model: kimi-k2.5
|
|
301
|
+
staging: front
|
|
302
|
+
prompt_file: tutor-ego-recognition.md
|
|
303
|
+
hyperparameters:
|
|
304
|
+
temperature: 0.6
|
|
305
|
+
max_tokens: 8000
|
|
306
|
+
superego:
|
|
307
|
+
provider: openrouter
|
|
308
|
+
model: kimi-k2.5
|
|
309
|
+
staging: back
|
|
310
|
+
prompt_file: tutor-superego-recognition.md
|
|
311
|
+
hyperparameters:
|
|
312
|
+
temperature: 0.2
|
|
313
|
+
max_tokens: 8000
|
|
314
|
+
intervention_strategies:
|
|
315
|
+
enforce_mutual_recognition: true
|
|
316
|
+
require_memory_integration: true
|
|
317
|
+
assess_transformative_potential: true
|
|
318
|
+
|
|
319
|
+
# ===========================================================================
|
|
320
|
+
# ENHANCED CELLS (9-12): Better prompting WITHOUT recognition theory
|
|
321
|
+
# ===========================================================================
|
|
322
|
+
# These cells use tutor-ego-enhanced.md and tutor-superego-enhanced.md which
|
|
323
|
+
# match the recognition prompts' specificity (8 rules, 10 examples, checklists)
|
|
324
|
+
# but WITHOUT Hegelian recognition theory language.
|
|
325
|
+
#
|
|
326
|
+
# Comparing Enhanced vs Recognition isolates the recognition theory effect.
|
|
327
|
+
# Comparing Base vs Enhanced isolates the prompt engineering effect.
|
|
328
|
+
|
|
329
|
+
# Cell 9: Enhanced × Single Agent × Unified Learner
|
|
330
|
+
cell_9_enhanced_single_unified:
|
|
331
|
+
description: "Factorial cell 9: enhanced single-agent tutor, unified learner"
|
|
332
|
+
factors:
|
|
333
|
+
prompt_type: enhanced
|
|
334
|
+
multi_agent_tutor: false
|
|
335
|
+
multi_agent_learner: false
|
|
336
|
+
learner_architecture: unified
|
|
337
|
+
recognition_mode: false
|
|
338
|
+
memory_enabled: false
|
|
339
|
+
dialogue:
|
|
340
|
+
enabled: false
|
|
341
|
+
max_rounds: 0
|
|
342
|
+
ego:
|
|
343
|
+
provider: openrouter
|
|
344
|
+
model: nemotron
|
|
345
|
+
prompt_file: tutor-ego-enhanced.md
|
|
346
|
+
hyperparameters:
|
|
347
|
+
temperature: 0.6
|
|
348
|
+
max_tokens: 8000
|
|
349
|
+
superego: null
|
|
350
|
+
|
|
351
|
+
# Cell 10: Enhanced × Single Agent × Ego/Superego Learner
|
|
352
|
+
cell_10_enhanced_single_psycho:
|
|
353
|
+
description: "Factorial cell 10: enhanced single-agent tutor, ego/superego learner"
|
|
354
|
+
factors:
|
|
355
|
+
prompt_type: enhanced
|
|
356
|
+
multi_agent_tutor: false
|
|
357
|
+
multi_agent_learner: true
|
|
358
|
+
learner_architecture: ego_superego
|
|
359
|
+
recognition_mode: false
|
|
360
|
+
memory_enabled: false
|
|
361
|
+
dialogue:
|
|
362
|
+
enabled: false
|
|
363
|
+
max_rounds: 0
|
|
364
|
+
ego:
|
|
365
|
+
provider: openrouter
|
|
366
|
+
model: nemotron
|
|
367
|
+
prompt_file: tutor-ego-enhanced.md
|
|
368
|
+
hyperparameters:
|
|
369
|
+
temperature: 0.6
|
|
370
|
+
max_tokens: 8000
|
|
371
|
+
superego: null
|
|
372
|
+
|
|
373
|
+
# Cell 11: Enhanced × Multi-Agent Tutor × Unified Learner
|
|
374
|
+
cell_11_enhanced_multi_unified:
|
|
375
|
+
description: "Factorial cell 11: enhanced ego+superego tutor, unified learner"
|
|
376
|
+
factors:
|
|
377
|
+
prompt_type: enhanced
|
|
378
|
+
multi_agent_tutor: true
|
|
379
|
+
multi_agent_learner: false
|
|
380
|
+
learner_architecture: unified
|
|
381
|
+
recognition_mode: false
|
|
382
|
+
memory_enabled: false
|
|
383
|
+
dialogue:
|
|
384
|
+
enabled: true
|
|
385
|
+
max_rounds: 2
|
|
386
|
+
convergence_threshold: 0.8
|
|
387
|
+
ego:
|
|
388
|
+
provider: openrouter
|
|
389
|
+
model: nemotron
|
|
390
|
+
staging: front
|
|
391
|
+
prompt_file: tutor-ego-enhanced.md
|
|
392
|
+
hyperparameters:
|
|
393
|
+
temperature: 0.6
|
|
394
|
+
max_tokens: 8000
|
|
395
|
+
superego:
|
|
396
|
+
provider: openrouter
|
|
397
|
+
model: kimi-k2.5
|
|
398
|
+
staging: back
|
|
399
|
+
prompt_file: tutor-superego-enhanced.md
|
|
400
|
+
hyperparameters:
|
|
401
|
+
temperature: 0.2
|
|
402
|
+
max_tokens: 8000
|
|
403
|
+
|
|
404
|
+
# Cell 12: Enhanced × Multi-Agent Tutor × Ego/Superego Learner
|
|
405
|
+
cell_12_enhanced_multi_psycho:
|
|
406
|
+
description: "Factorial cell 12: enhanced ego+superego tutor, ego/superego learner"
|
|
407
|
+
factors:
|
|
408
|
+
prompt_type: enhanced
|
|
409
|
+
multi_agent_tutor: true
|
|
410
|
+
multi_agent_learner: true
|
|
411
|
+
learner_architecture: ego_superego
|
|
412
|
+
recognition_mode: false
|
|
413
|
+
memory_enabled: false
|
|
414
|
+
dialogue:
|
|
415
|
+
enabled: true
|
|
416
|
+
max_rounds: 2
|
|
417
|
+
convergence_threshold: 0.8
|
|
418
|
+
ego:
|
|
419
|
+
provider: openrouter
|
|
420
|
+
model: nemotron
|
|
421
|
+
staging: front
|
|
422
|
+
prompt_file: tutor-ego-enhanced.md
|
|
423
|
+
hyperparameters:
|
|
424
|
+
temperature: 0.6
|
|
425
|
+
max_tokens: 8000
|
|
426
|
+
superego:
|
|
427
|
+
provider: openrouter
|
|
428
|
+
model: kimi-k2.5
|
|
429
|
+
staging: back
|
|
430
|
+
prompt_file: tutor-superego-enhanced.md
|
|
431
|
+
hyperparameters:
|
|
432
|
+
temperature: 0.2
|
|
433
|
+
max_tokens: 8000
|
|
434
|
+
|
|
435
|
+
# ===========================================================================
|
|
436
|
+
# HARDWIRED CELLS (13-14): Superego-derived rules, NO live superego
|
|
437
|
+
# ===========================================================================
|
|
438
|
+
# These cells test whether the superego's value is in the rules it enforces
|
|
439
|
+
# or in the dynamic dialogue itself.
|
|
440
|
+
#
|
|
441
|
+
# Uses tutor-ego-hardwired.md which embeds 5 rules derived from analyzing
|
|
442
|
+
# 186 superego rejections (engagement, specificity, memory, level-matching,
|
|
443
|
+
# absolute struggle stop).
|
|
444
|
+
#
|
|
445
|
+
# Comparison:
|
|
446
|
+
# cell_1/2 (base, no superego) vs cell_13/14 (hardwired, no superego)
|
|
447
|
+
# → Does hardwiring superego rules improve single-agent performance?
|
|
448
|
+
# cell_3/4 (base + superego) vs cell_13/14 (hardwired, no superego)
|
|
449
|
+
# → Can hardwired rules replace live superego dialogue?
|
|
450
|
+
|
|
451
|
+
# Cell 13: Hardwired × Single Agent × Unified Learner
|
|
452
|
+
cell_13_hardwired_single_unified:
|
|
453
|
+
description: "Factorial cell 13: hardwired single-agent tutor (superego rules embedded), unified learner"
|
|
454
|
+
factors:
|
|
455
|
+
prompt_type: hardwired
|
|
456
|
+
multi_agent_tutor: false
|
|
457
|
+
multi_agent_learner: false
|
|
458
|
+
learner_architecture: unified
|
|
459
|
+
recognition_mode: false
|
|
460
|
+
memory_enabled: false
|
|
461
|
+
dialogue:
|
|
462
|
+
enabled: false
|
|
463
|
+
max_rounds: 0
|
|
464
|
+
ego:
|
|
465
|
+
provider: openrouter
|
|
466
|
+
model: nemotron
|
|
467
|
+
prompt_file: tutor-ego-hardwired.md
|
|
468
|
+
hyperparameters:
|
|
469
|
+
temperature: 0.6
|
|
470
|
+
max_tokens: 8000
|
|
471
|
+
superego: null
|
|
472
|
+
|
|
473
|
+
# Cell 14: Hardwired × Single Agent × Ego/Superego Learner
|
|
474
|
+
cell_14_hardwired_single_psycho:
|
|
475
|
+
description: "Factorial cell 14: hardwired single-agent tutor (superego rules embedded), ego/superego learner"
|
|
476
|
+
factors:
|
|
477
|
+
prompt_type: hardwired
|
|
478
|
+
multi_agent_tutor: false
|
|
479
|
+
multi_agent_learner: true
|
|
480
|
+
learner_architecture: ego_superego
|
|
481
|
+
recognition_mode: false
|
|
482
|
+
memory_enabled: false
|
|
483
|
+
dialogue:
|
|
484
|
+
enabled: false
|
|
485
|
+
max_rounds: 0
|
|
486
|
+
ego:
|
|
487
|
+
provider: openrouter
|
|
488
|
+
model: nemotron
|
|
489
|
+
prompt_file: tutor-ego-hardwired.md
|
|
490
|
+
hyperparameters:
|
|
491
|
+
temperature: 0.6
|
|
492
|
+
max_tokens: 8000
|
|
493
|
+
superego: null
|
|
494
|
+
|
|
495
|
+
# ===========================================================================
|
|
496
|
+
# PLACEBO CELLS (15-18): Length-matched prompts WITHOUT recognition theory
|
|
497
|
+
# ===========================================================================
|
|
498
|
+
# These cells control for prompt length/complexity vs recognition theory content.
|
|
499
|
+
# Placebo prompts match recognition prompt length but use only pedagogical
|
|
500
|
+
# best practices without Hegelian concepts (mutual recognition, autonomous
|
|
501
|
+
# subject, productive struggle as transformation).
|
|
502
|
+
#
|
|
503
|
+
# Comparison:
|
|
504
|
+
# cell_9/10 (enhanced) vs cell_15/16 (placebo) vs cell_5/6 (recognition)
|
|
505
|
+
# → Isolates recognition theory value from prompt length/complexity
# NOTE(review): placebo cells (15-18) set memory_enabled: true while enhanced cells (9-12) set it false — confirm this is intended, since it confounds the enhanced-vs-placebo comparison.
|
|
506
|
+
|
|
507
|
+
# Cell 15: Placebo × Single Agent × Unified Learner
|
|
508
|
+
cell_15_placebo_single_unified:
|
|
509
|
+
description: "Factorial cell 15: placebo single-agent tutor (length-matched, no recognition theory), unified learner"
|
|
510
|
+
factors:
|
|
511
|
+
prompt_type: placebo
|
|
512
|
+
multi_agent_tutor: false
|
|
513
|
+
multi_agent_learner: false
|
|
514
|
+
learner_architecture: unified
|
|
515
|
+
recognition_mode: false
|
|
516
|
+
memory_enabled: true
|
|
517
|
+
dialogue:
|
|
518
|
+
enabled: false
|
|
519
|
+
max_rounds: 0
|
|
520
|
+
ego:
|
|
521
|
+
provider: openrouter
|
|
522
|
+
model: nemotron
|
|
523
|
+
prompt_file: tutor-ego-placebo.md
|
|
524
|
+
hyperparameters:
|
|
525
|
+
temperature: 0.6
|
|
526
|
+
max_tokens: 8000
|
|
527
|
+
superego: null
|
|
528
|
+
|
|
529
|
+
# Cell 16: Placebo × Single Agent × Ego/Superego Learner
|
|
530
|
+
cell_16_placebo_single_psycho:
|
|
531
|
+
description: "Factorial cell 16: placebo single-agent tutor (length-matched, no recognition theory), ego/superego learner"
|
|
532
|
+
factors:
|
|
533
|
+
prompt_type: placebo
|
|
534
|
+
multi_agent_tutor: false
|
|
535
|
+
multi_agent_learner: true
|
|
536
|
+
learner_architecture: ego_superego
|
|
537
|
+
recognition_mode: false
|
|
538
|
+
memory_enabled: true
|
|
539
|
+
dialogue:
|
|
540
|
+
enabled: false
|
|
541
|
+
max_rounds: 0
|
|
542
|
+
ego:
|
|
543
|
+
provider: openrouter
|
|
544
|
+
model: nemotron
|
|
545
|
+
prompt_file: tutor-ego-placebo.md
|
|
546
|
+
hyperparameters:
|
|
547
|
+
temperature: 0.6
|
|
548
|
+
max_tokens: 8000
|
|
549
|
+
superego: null
|
|
550
|
+
|
|
551
|
+
# Cell 17: Placebo × Multi-Agent Tutor × Unified Learner
|
|
552
|
+
cell_17_placebo_multi_unified:
|
|
553
|
+
description: "Factorial cell 17: placebo ego+superego tutor (length-matched, no recognition theory), unified learner"
|
|
554
|
+
factors:
|
|
555
|
+
prompt_type: placebo
|
|
556
|
+
multi_agent_tutor: true
|
|
557
|
+
multi_agent_learner: false
|
|
558
|
+
learner_architecture: unified
|
|
559
|
+
recognition_mode: false
|
|
560
|
+
memory_enabled: true
|
|
561
|
+
dialogue:
|
|
562
|
+
enabled: true
|
|
563
|
+
max_rounds: 2
|
|
564
|
+
convergence_threshold: 0.7
|
|
565
|
+
ego:
|
|
566
|
+
provider: openrouter
|
|
567
|
+
model: nemotron
|
|
568
|
+
staging: front
|
|
569
|
+
prompt_file: tutor-ego-placebo.md
|
|
570
|
+
hyperparameters:
|
|
571
|
+
temperature: 0.6
|
|
572
|
+
max_tokens: 8000
|
|
573
|
+
superego:
|
|
574
|
+
provider: openrouter
|
|
575
|
+
model: kimi-k2.5
|
|
576
|
+
staging: back
|
|
577
|
+
prompt_file: tutor-superego-placebo.md
|
|
578
|
+
hyperparameters:
|
|
579
|
+
temperature: 0.2
|
|
580
|
+
max_tokens: 8000
|
|
581
|
+
|
|
582
|
+
# Cell 18: Placebo × Multi-Agent Tutor × Ego/Superego Learner
|
|
583
|
+
cell_18_placebo_multi_psycho:
|
|
584
|
+
description: "Factorial cell 18: placebo ego+superego tutor (length-matched, no recognition theory), ego/superego learner"
|
|
585
|
+
factors:
|
|
586
|
+
prompt_type: placebo
|
|
587
|
+
multi_agent_tutor: true
|
|
588
|
+
multi_agent_learner: true
|
|
589
|
+
learner_architecture: ego_superego
|
|
590
|
+
recognition_mode: false
|
|
591
|
+
memory_enabled: true
|
|
592
|
+
dialogue:
|
|
593
|
+
enabled: true
|
|
594
|
+
max_rounds: 2
|
|
595
|
+
convergence_threshold: 0.7
|
|
596
|
+
ego:
|
|
597
|
+
provider: openrouter
|
|
598
|
+
model: nemotron
|
|
599
|
+
staging: front
|
|
600
|
+
prompt_file: tutor-ego-placebo.md
|
|
601
|
+
hyperparameters:
|
|
602
|
+
temperature: 0.6
|
|
603
|
+
max_tokens: 8000
|
|
604
|
+
superego:
|
|
605
|
+
provider: openrouter
|
|
606
|
+
model: kimi-k2.5
|
|
607
|
+
staging: back
|
|
608
|
+
prompt_file: tutor-superego-placebo.md
|
|
609
|
+
hyperparameters:
|
|
610
|
+
temperature: 0.2
|
|
611
|
+
max_tokens: 8000
|
|
612
|
+
|
|
613
|
+
# ===========================================================================
|
|
614
|
+
# MEMORY CONFOUND ISOLATION CELLS (19-20)
|
|
615
|
+
# ===========================================================================
|
|
616
|
+
# 2×2 Memory × Recognition design (single-agent, unified learner held constant).
|
|
617
|
+
# Reuses cell_1 (base) and cell_5 (full recognition) to complete the 2×2.
|
|
618
|
+
#
|
|
619
|
+
# Memory OFF Memory ON
|
|
620
|
+
# Recog OFF cell_1 (exists) cell_19 (NEW)
|
|
621
|
+
# Recog ON cell_20 (NEW) cell_5 (exists)
|
|
622
|
+
|
|
623
|
+
# Cell 19: Memory-Only × Single Agent × Unified Learner
|
|
624
|
+
cell_19_memory_single_unified:
|
|
625
|
+
description: "Memory isolation: base + memory integration, no recognition theory"
|
|
626
|
+
factors:
|
|
627
|
+
prompt_type: memory
|
|
628
|
+
multi_agent_tutor: false
|
|
629
|
+
multi_agent_learner: false
|
|
630
|
+
learner_architecture: unified
|
|
631
|
+
recognition_mode: false
|
|
632
|
+
memory_enabled: true
|
|
633
|
+
dialogue:
|
|
634
|
+
enabled: false
|
|
635
|
+
max_rounds: 0
|
|
636
|
+
ego:
|
|
637
|
+
provider: openrouter
|
|
638
|
+
model: nemotron
|
|
639
|
+
prompt_file: tutor-ego-memory.md
|
|
640
|
+
hyperparameters:
|
|
641
|
+
temperature: 0.6
|
|
642
|
+
max_tokens: 8000
|
|
643
|
+
superego: null
|
|
644
|
+
|
|
645
|
+
# Cell 20: Recognition-No-Memory × Single Agent × Unified Learner
|
|
646
|
+
cell_20_recog_nomem_single_unified:
|
|
647
|
+
description: "Memory isolation: recognition theory without memory integration"
|
|
648
|
+
factors:
|
|
649
|
+
prompt_type: recognition_nomem
|
|
650
|
+
multi_agent_tutor: false
|
|
651
|
+
multi_agent_learner: false
|
|
652
|
+
learner_architecture: unified
|
|
653
|
+
recognition_mode: true
|
|
654
|
+
memory_enabled: false
|
|
655
|
+
dialogue:
|
|
656
|
+
enabled: false
|
|
657
|
+
max_rounds: 0
|
|
658
|
+
ego:
|
|
659
|
+
provider: openrouter
|
|
660
|
+
model: nemotron
|
|
661
|
+
prompt_file: tutor-ego-recognition-nomem.md
|
|
662
|
+
hyperparameters:
|
|
663
|
+
temperature: 0.6
|
|
664
|
+
max_tokens: 8000
|
|
665
|
+
superego: null
|
|
666
|
+
|
|
667
|
+
# ===========================================================================
|
|
668
|
+
# DYNAMIC PROMPT REWRITING CELL (21) - v2
|
|
669
|
+
# ===========================================================================
|
|
670
|
+
# Tests whether feeding deliberation insights back as directives
|
|
671
|
+
# improves multi-turn tutoring outcomes. Compare against cell_7 (static).
|
|
672
|
+
#
|
|
673
|
+
# v2 improvements:
|
|
674
|
+
# - LLM-based directive synthesis (uses superego model for rich context analysis)
|
|
675
|
+
# - Writing Pad memory activated via learnerId threading
|
|
676
|
+
# - Dialectical negotiation enabled (memory surfaces in prompts)
|
|
677
|
+
|
|
678
|
+
cell_21_recog_multi_unified_rewrite:
|
|
679
|
+
description: "Prompt rewriting v2: LLM directives + active Writing Pad"
|
|
680
|
+
factors:
|
|
681
|
+
prompt_type: recognition
|
|
682
|
+
multi_agent_tutor: true
|
|
683
|
+
multi_agent_learner: false
|
|
684
|
+
dynamic_rewriting: true
|
|
685
|
+
learner_architecture: unified_recognition
|
|
686
|
+
recognition_mode: true
|
|
687
|
+
memory_enabled: true
|
|
688
|
+
writing_pad_enabled: true # Explicitly enable Writing Pad
|
|
689
|
+
dialectical_negotiation: true # Memory surfaces in prompts
|
|
690
|
+
prompt_rewriting:
|
|
691
|
+
enabled: true
|
|
692
|
+
strategy: llm # LLM-based synthesis (vs 'template')
|
|
693
|
+
dialogue:
|
|
694
|
+
enabled: true
|
|
695
|
+
max_rounds: 2
|
|
696
|
+
convergence_threshold: 0.7
|
|
697
|
+
ego:
|
|
698
|
+
provider: openrouter
|
|
699
|
+
model: nemotron
|
|
700
|
+
staging: front
|
|
701
|
+
prompt_file: tutor-ego-recognition.md
|
|
702
|
+
hyperparameters:
|
|
703
|
+
temperature: 0.6
|
|
704
|
+
max_tokens: 8000
|
|
705
|
+
superego:
|
|
706
|
+
provider: openrouter
|
|
707
|
+
model: kimi-k2.5
|
|
708
|
+
staging: back
|
|
709
|
+
prompt_file: tutor-superego-recognition.md
|
|
710
|
+
hyperparameters:
|
|
711
|
+
temperature: 0.2
|
|
712
|
+
max_tokens: 8000
|
|
713
|
+
intervention_strategies:
|
|
714
|
+
enforce_mutual_recognition: true
|
|
715
|
+
require_memory_integration: true
|
|
716
|
+
assess_transformative_potential: true
|