@machinespirits/eval 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. package/README.md +91 -9
  2. package/config/eval-settings.yaml +3 -3
  3. package/config/paper-manifest.json +486 -0
  4. package/config/providers.yaml +9 -6
  5. package/config/tutor-agents.yaml +2261 -0
  6. package/content/README.md +23 -0
  7. package/content/courses/479/course.md +53 -0
  8. package/content/courses/479/lecture-1.md +361 -0
  9. package/content/courses/479/lecture-2.md +360 -0
  10. package/content/courses/479/lecture-3.md +655 -0
  11. package/content/courses/479/lecture-4.md +530 -0
  12. package/content/courses/479/lecture-5.md +326 -0
  13. package/content/courses/479/lecture-6.md +346 -0
  14. package/content/courses/479/lecture-7.md +326 -0
  15. package/content/courses/479/lecture-8.md +273 -0
  16. package/content/courses/479/roadmap-slides.md +656 -0
  17. package/content/manifest.yaml +8 -0
  18. package/docs/research/build.sh +44 -20
  19. package/docs/research/figures/figure10.png +0 -0
  20. package/docs/research/figures/figure11.png +0 -0
  21. package/docs/research/figures/figure3.png +0 -0
  22. package/docs/research/figures/figure4.png +0 -0
  23. package/docs/research/figures/figure5.png +0 -0
  24. package/docs/research/figures/figure6.png +0 -0
  25. package/docs/research/figures/figure7.png +0 -0
  26. package/docs/research/figures/figure8.png +0 -0
  27. package/docs/research/figures/figure9.png +0 -0
  28. package/docs/research/header.tex +23 -2
  29. package/docs/research/paper-full.md +941 -285
  30. package/docs/research/paper-short.md +216 -585
  31. package/docs/research/references.bib +132 -0
  32. package/docs/research/slides-header.tex +188 -0
  33. package/docs/research/slides-pptx.md +363 -0
  34. package/docs/research/slides.md +531 -0
  35. package/docs/research/style-reference-pptx.py +199 -0
  36. package/package.json +6 -5
  37. package/scripts/analyze-eval-results.js +69 -17
  38. package/scripts/analyze-mechanism-traces.js +763 -0
  39. package/scripts/analyze-modulation-learning.js +498 -0
  40. package/scripts/analyze-prosthesis.js +144 -0
  41. package/scripts/analyze-run.js +264 -79
  42. package/scripts/assess-transcripts.js +853 -0
  43. package/scripts/browse-transcripts.js +854 -0
  44. package/scripts/check-parse-failures.js +73 -0
  45. package/scripts/code-dialectical-modulation.js +1320 -0
  46. package/scripts/download-data.sh +55 -0
  47. package/scripts/eval-cli.js +106 -18
  48. package/scripts/generate-paper-figures.js +663 -0
  49. package/scripts/generate-paper-figures.py +577 -76
  50. package/scripts/generate-paper-tables.js +299 -0
  51. package/scripts/qualitative-analysis-ai.js +3 -3
  52. package/scripts/render-sequence-diagram.js +694 -0
  53. package/scripts/test-latency.js +210 -0
  54. package/scripts/test-rate-limit.js +95 -0
  55. package/scripts/test-token-budget.js +332 -0
  56. package/scripts/validate-paper-manifest.js +670 -0
  57. package/services/__tests__/evalConfigLoader.test.js +2 -2
  58. package/services/__tests__/learnerRubricEvaluator.test.js +361 -0
  59. package/services/__tests__/learnerTutorInteractionEngine.test.js +326 -0
  60. package/services/evaluationRunner.js +975 -98
  61. package/services/evaluationStore.js +12 -4
  62. package/services/learnerTutorInteractionEngine.js +27 -2
  63. package/services/mockProvider.js +133 -0
  64. package/services/promptRewriter.js +1471 -5
  65. package/services/rubricEvaluator.js +55 -2
  66. package/services/transcriptFormatter.js +675 -0
  67. package/docs/EVALUATION-VARIABLES.md +0 -589
  68. package/docs/REPLICATION-PLAN.md +0 -577
  69. package/scripts/analyze-run.mjs +0 -282
  70. package/scripts/compare-runs.js +0 -44
  71. package/scripts/compare-suggestions.js +0 -80
  72. package/scripts/dig-into-run.js +0 -158
  73. package/scripts/show-failed-suggestions.js +0 -64
  74. /package/scripts/{check-run.mjs → check-run.js} +0 -0
@@ -998,6 +998,138 @@ author = {Long, Christopher P.},
998
998
 
999
999
  % --- Near-match alias ---
1000
1000
 
1001
+ % ======================================================================
1002
+ % Related Work expansion references — Added 2026-02-15
1003
+ % ======================================================================
1004
+
1005
+ % --- Empirical LLM Tutoring ---
1006
+
1007
+ @inproceedings{vanzo2025gpt4homework,
1008
+ author = {Vanzo, Alessandro and Pal Chowdhury, Sankalan and Sachan, Mrinmaya},
1009
+ title = {{GPT}-4 as a Homework Tutor can Improve Student Engagement and Learning Outcomes},
1010
+ booktitle = {Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
1011
+ pages = {31119--31136},
1012
+ year = {2025},
1013
+ publisher = {Association for Computational Linguistics},
1014
+ doi = {10.18653/v1/2025.acl-long.1502}
1015
+ }
1016
+
1017
+ @article{shi2025llmeducation,
1018
+ author = {Shi, Yuhong and Yu, Kun and Dong, Yifei and Chen, Fang},
1019
+ title = {Large Language Models in Education: A Systematic Review of Empirical Applications, Benefits, and Challenges},
1020
+ journal = {Computers and Education: Artificial Intelligence},
1021
+ volume = {10},
1022
+ pages = {100529},
1023
+ year = {2025},
1024
+ doi = {10.1016/j.caeai.2025.100529}
1025
+ }
1026
+
1027
+ @inproceedings{scarlatos2025training,
1028
+ author = {Scarlatos, Alexander and Liu, Naiming and Lee, Jaewook and Baraniuk, Richard and Lan, Andrew},
1029
+ title = {Training {LLM}-Based Tutors to Improve Student Learning Outcomes in Dialogues},
1030
+ booktitle = {Artificial Intelligence in Education -- 26th International Conference, {AIED} 2025, Proceedings},
1031
+ pages = {251--266},
1032
+ year = {2025},
1033
+ publisher = {Springer},
1034
+ doi = {10.1007/978-3-031-98414-3_18}
1035
+ }
1036
+
1037
+ % --- Multi-Agent Systems ---
1038
+
1039
+ @inproceedings{li2023camel,
1040
+ author = {Li, Guohao and Hammoud, Hasan Abed Al Kader and Itani, Hani and Khizbullin, Dmitrii and Ghanem, Bernard},
1041
+ title = {{CAMEL}: Communicative Agents for ``Mind'' Exploration of Large Language Model Society},
1042
+ booktitle = {Advances in Neural Information Processing Systems},
1043
+ volume = {36},
1044
+ year = {2023}
1045
+ }
1046
+
1047
+ @inproceedings{guo2024multiagents,
1048
+ author = {Guo, Taicheng and Chen, Xiuying and Wang, Yaqi and Chang, Ruidi and Pei, Shichao and Chawla, Nitesh V. and Wiest, Olaf and Zhang, Xiangliang},
1049
+ title = {Large Language Model Based Multi-Agents: A Survey of Progress and Challenges},
1050
+ booktitle = {Proceedings of the Thirty-Third International Joint Conference on Artificial Intelligence ({IJCAI-24}), Survey Track},
1051
+ pages = {8048--8057},
1052
+ year = {2024},
1053
+ doi = {10.24963/ijcai.2024/890}
1054
+ }
1055
+
1056
+ % --- Self-Correction Limits ---
1057
+
1058
+ @article{kamoi2024selfcorrection,
1059
+ author = {Kamoi, Ryo and Zhang, Yusen and Zhang, Nan and Han, Jiawei and Zhang, Rui},
1060
+ title = {When Can {LLMs} Actually Correct Their Own Mistakes? {A} Critical Survey of Self-Correction of {LLMs}},
1061
+ journal = {Transactions of the Association for Computational Linguistics},
1062
+ volume = {12},
1063
+ pages = {1417--1440},
1064
+ year = {2024},
1065
+ doi = {10.1162/tacl_a_00713}
1066
+ }
1067
+
1068
+ @inproceedings{shinn2023reflexion,
1069
+ author = {Shinn, Noah and Cassano, Federico and Gopinath, Ashwin and Narasimhan, Karthik and Yao, Shunyu},
1070
+ title = {Reflexion: Language Agents with Verbal Reinforcement Learning},
1071
+ booktitle = {Advances in Neural Information Processing Systems},
1072
+ volume = {36},
1073
+ year = {2023}
1074
+ }
1075
+
1076
+ % --- LLM-as-Judge ---
1077
+
1078
+ @inproceedings{zheng2023judging,
1079
+ author = {Zheng, Lianmin and Chiang, Wei-Lin and Sheng, Ying and Zhuang, Siyuan and Wu, Zhanghao and Zhuang, Yonghao and Lin, Zi and Li, Zhuohan and Li, Dacheng and Xing, Eric P. and Zhang, Hao and Gonzalez, Joseph E. and Stoica, Ion},
1080
+ title = {Judging {LLM}-as-a-Judge with {MT-Bench} and {Chatbot Arena}},
1081
+ booktitle = {Advances in Neural Information Processing Systems},
1082
+ volume = {36},
1083
+ year = {2023}
1084
+ }
1085
+
1086
+ @article{gu2025surveyjudge,
1087
+ author = {Gu, Jiawei and Jiang, Xuhui and Shi, Zhichao and Tan, Hexiang and Zhai, Xuehao and Xu, Chengjin and Li, Wei and Shen, Yinghan and Ma, Shengjie and Liu, Honghao and Wang, Saizhuo and Zhang, Kun and Lin, Zhouchi and Zhang, Bowen and Ni, Lionel and Gao, Wen and Wang, Yuanzhuo and Guo, Jian},
1088
+ title = {A Survey on {LLM}-as-a-Judge},
1089
+ journal = {The Innovation},
1090
+ year = {2025},
1091
+ doi = {10.1016/j.xinn.2025.101253}
1092
+ }
1093
+
1094
+ @misc{li2024llmsjudges,
1095
+ author = {Li, Haitao and Dong, Qian and Chen, Junjie and Su, Huixue and Zhou, Yujia and Ai, Qingyao and Ye, Ziyi and Liu, Yiqun},
1096
+ title = {{LLMs}-as-Judges: A Comprehensive Survey on {LLM}-based Evaluation Methods},
1097
+ year = {2024},
1098
+ eprint = {2412.05579},
1099
+ archivePrefix = {arXiv},
1100
+ primaryClass = {cs.CL}
1101
+ }
1102
+
1103
+ % --- Theory of Mind in AI ---
1104
+
1105
+ @article{street2025tom,
1106
+ author = {Street, Winnie and Siy, John Oliver and Keeling, Geoff and Baranes, Adrien and Barnett, Benjamin and McKibben, Michael and Kanyere, Tatenda and Lentz, Alison and {Ag{\"u}era y Arcas}, Blaise and Dunbar, Robin I. M.},
1107
+ title = {{LLMs} achieve adult human performance on higher-order theory of mind tasks},
1108
+ journal = {Frontiers in Human Neuroscience},
1109
+ volume = {19},
1110
+ pages = {1633272},
1111
+ year = {2025},
1112
+ doi = {10.3389/fnhum.2025.1633272}
1113
+ }
1114
+
1115
+ @inproceedings{nguyen2025tomsurvey,
1116
+ author = {Nguyen, Hieu Minh},
1117
+ title = {A Survey of Theory of Mind in Large Language Models: Evaluations, Representations, and Safety Risks},
1118
+ booktitle = {Advancing Artificial Intelligence through Theory of Mind Workshop, AAAI},
1119
+ year = {2025}
1120
+ }
1121
+
1122
+ @misc{hwang2025infusingtom,
1123
+ author = {Hwang, EunJeong and Yin, Yuwei and Carenini, Giuseppe and West, Peter and Shwartz, Vered},
1124
+ title = {Infusing Theory of Mind into Socially Intelligent {LLM} Agents},
1125
+ year = {2025},
1126
+ eprint = {2509.22887},
1127
+ archivePrefix = {arXiv},
1128
+ primaryClass = {cs.CL}
1129
+ }
1130
+
1131
+ % --- End Related Work expansion references ---
1132
+
1001
1133
  @misc{magee2024drama,
1002
1134
  author = {Magee, Liam and Arora, Vanicka and Gollings, Gus and Lam-Saw, Norma},
1003
1135
  title = {The Drama Machine: Simulating Character Development with {LLM} Agents},
@@ -0,0 +1,188 @@
1
+ % Custom Beamer styling for "Geist in the Machine" presentation
2
+ % Uses Metropolis theme with a refined academic color palette
3
+
4
+ % ─── Color Palette ───────────────────────────────────────────
5
+ % Deep slate blue primary, warm amber accent, clean backgrounds
6
+ \definecolor{GeistPrimary}{HTML}{1B2838} % Dark slate (titles, bars)
7
+ \definecolor{GeistAccent}{HTML}{D4872C} % Warm amber (highlights, alerts)
8
+ \definecolor{GeistLight}{HTML}{F5F2EB} % Warm off-white (backgrounds)
9
+ \definecolor{GeistMid}{HTML}{5C6B7A} % Medium slate (subtitles)
10
+ \definecolor{GeistText}{HTML}{2D3436} % Near-black (body text)
11
+ \definecolor{GeistTableHead}{HTML}{E8E2D6} % Warm beige (table headers)
12
+ \definecolor{GeistTableAlt}{HTML}{F9F7F3} % Light warm (table alt rows)
13
+ \definecolor{GeistPositive}{HTML}{27AE60} % Green (positive results)
14
+ \definecolor{GeistNegative}{HTML}{C0392B} % Red (negative results)
15
+ \definecolor{GeistCode}{HTML}{F0ECE3} % Code background
16
+
17
+ % ─── Metropolis Configuration ────────────────────────────────
18
+ \metroset{
19
+ progressbar=frametitle,
20
+ sectionpage=none,
21
+ numbering=fraction,
22
+ block=fill,
23
+ }
24
+
25
+ % Apply custom colors to metropolis elements
26
+ \setbeamercolor{normal text}{fg=GeistText, bg=white}
27
+ \setbeamercolor{background canvas}{bg=white}
28
+ \setbeamercolor{alerted text}{fg=GeistAccent}
29
+ \setbeamercolor{frametitle}{fg=white, bg=GeistPrimary}
30
+ \setbeamercolor{title separator}{fg=GeistAccent}
31
+ \setbeamercolor{progress bar}{fg=GeistAccent, bg=GeistPrimary!20}
32
+ \setbeamercolor{progress bar in head/foot}{fg=GeistAccent, bg=GeistPrimary!20}
33
+ \setbeamercolor{progress bar in section page}{fg=GeistAccent, bg=GeistPrimary!20}
34
+ \setbeamercolor{title}{fg=white}
35
+ \setbeamercolor{subtitle}{fg=GeistAccent!80!white}
36
+ \setbeamercolor{author}{fg=GeistLight!90!white}
37
+ \setbeamercolor{date}{fg=GeistLight!70!white}
38
+ \setbeamercolor{institute}{fg=GeistLight!80!white}
39
+ \setbeamercolor{section title}{fg=white}
40
+ \setbeamercolor{block title}{fg=white, bg=GeistPrimary}
41
+ \setbeamercolor{block body}{bg=GeistLight}
42
+ \setbeamercolor{block title alerted}{fg=white, bg=GeistAccent}
43
+ \setbeamercolor{block body alerted}{bg=GeistAccent!10}
44
+ \setbeamercolor{block title example}{fg=white, bg=GeistPositive!80!black}
45
+ \setbeamercolor{block body example}{bg=GeistPositive!5}
46
+ \setbeamercolor{itemize item}{fg=GeistAccent}
47
+ \setbeamercolor{itemize subitem}{fg=GeistMid}
48
+ \setbeamercolor{enumerate item}{fg=GeistAccent}
49
+ \setbeamercolor{description item}{fg=GeistPrimary}
50
+ \setbeamercolor{footnote}{fg=GeistMid}
51
+ \setbeamercolor{page number in head/foot}{fg=GeistMid}
52
+
53
+ % ─── Typography ──────────────────────────────────────────────
54
+ \usepackage{FiraSans}
55
+ \usepackage{FiraMono}
56
+
57
+ % Title page font sizes
58
+ \setbeamerfont{title}{size=\Large, series=\bfseries}
59
+ \setbeamerfont{subtitle}{size=\normalsize}
60
+ \setbeamerfont{author}{size=\small}
61
+ \setbeamerfont{date}{size=\small}
62
+ \setbeamerfont{institute}{size=\footnotesize}
63
+ \setbeamerfont{frametitle}{size=\large}
64
+
65
+ % ─── Title Page ──────────────────────────────────────────────
66
+ \setbeamertemplate{title page}{% Layout: optional title graphic, title, accent rule, then subtitle/author/institute/date only when non-empty. Guards compare against \empty (not \@empty): this template is tokenized before \makeatletter, so \@empty would be read as \@ followed by the letters "empty" and the tests would never be true.
67
+ \begin{minipage}[b][\paperheight]{\textwidth}
68
+ \vfill
69
+ \ifx\inserttitlegraphic\empty\else\usebeamertemplate*{title graphic}\fi
70
+
71
+ {\usebeamerfont{title}\usebeamercolor[fg]{title}\inserttitle\par}
72
+
73
+ \vspace{0.6em}
74
+ {\usebeamercolor{title separator}\textcolor{fg}{\rule{0.35\textwidth}{1.2pt}}\par}
75
+
76
+ \vspace{0.6em}
77
+ \ifx\insertsubtitle\empty\else % no subtitle: skip the line and its spacing
78
+ {\usebeamerfont{subtitle}\usebeamercolor[fg]{subtitle}\insertsubtitle\par}
79
+ \vspace{0.4em}
80
+ \fi
81
+
82
+ \ifx\insertauthor\empty\else % no author: skip the line and its spacing
83
+ {\usebeamerfont{author}\usebeamercolor[fg]{author}\insertauthor\par}
84
+ \vspace{0.2em}
85
+ \fi
86
+
87
+ \ifx\insertinstitute\empty\else % no institute: skip the line and its spacing
88
+ {\usebeamerfont{institute}\usebeamercolor[fg]{institute}\insertinstitute\par}
89
+ \vspace{0.2em}
90
+ \fi
91
+
92
+ \ifx\insertdate\empty\else % no date: skip the line
93
+ {\usebeamerfont{date}\usebeamercolor[fg]{date}\insertdate\par}
94
+ \fi
95
+
96
+ \vfill
97
+ \vspace{1em}
98
+ \end{minipage}
99
+ }
100
+
101
+ % Dark background for title page
102
+ \makeatletter
103
+ \def\ps@titlepage{%
104
+ \setbeamertemplate{background}{%
105
+ \begin{tikzpicture}
106
+ \useasboundingbox (0,0) rectangle (\the\paperwidth,\the\paperheight);
107
+ \fill[GeistPrimary] (0,0) rectangle (\the\paperwidth,\the\paperheight);
108
+ \end{tikzpicture}
109
+ }
110
+ }
111
+ \addtobeamertemplate{title page}{\thispagestyle{titlepage}}{}
112
+ \makeatother
113
+
114
+ % ─── Table Styling ───────────────────────────────────────────
115
+ \usepackage{booktabs}
116
+ \usepackage{colortbl}
117
+ \usepackage{array}
118
+
119
+ % Redefine \toprule/\midrule/\bottomrule with themed rule colors
120
+ \renewcommand{\toprule}{\arrayrulecolor{GeistPrimary}\specialrule{0.8pt}{0pt}{4pt}}
121
+ \renewcommand{\midrule}{\arrayrulecolor{GeistPrimary!30}\specialrule{0.4pt}{3pt}{4pt}}
122
+ \renewcommand{\bottomrule}{\arrayrulecolor{GeistPrimary}\specialrule{0.8pt}{4pt}{0pt}}
123
+
124
+ % Scale tables to fit slides
125
+ \usepackage{adjustbox}
126
+ \let\oldtabular\tabular
127
+ \let\endoldtabular\endtabular
128
+ \renewenvironment{tabular}[1]{% begin-code: wrap the saved original tabular in a width-limited box
129
+ \begin{adjustbox}{max width=\textwidth}% environment form so the box can span begin-/end-code
130
+ \oldtabular{#1}%
131
+ }{% end-code: close the table first, then the box
132
+ \endoldtabular\end{adjustbox}%
133
+ }
134
+
135
+ % Smaller font in tables for readability on slides
136
+ \AtBeginEnvironment{longtable}{\footnotesize}
137
+
138
+ % ─── Code Block Styling ─────────────────────────────────────
139
+ \usepackage{etoolbox}
140
+ \AtBeginDocument{%
141
+ \apptocmd{\Shaded}{\scriptsize\setlength{\fboxsep}{6pt}}{}{}%
142
+ }
143
+
144
+ % ─── Spacing & Layout ───────────────────────────────────────
145
+ \setlength{\parskip}{0.4em}
146
+ \linespread{1.1}
147
+
148
+ % Tighter list spacing
149
+ \setbeamertemplate{itemize/enumerate body begin}{\vspace{-0.2em}}
150
+ \setbeamertemplate{itemize/enumerate body end}{\vspace{-0.2em}}
151
+ \setlength{\leftmargini}{1.2em}
152
+ \setlength{\leftmarginii}{1.2em}
153
+
154
+ % ─── Blockquote Styling ─────────────────────────────────────
155
+ % Style quotes with a left accent bar
156
+ \usepackage{tikz}
157
+ \renewenvironment{quote}{%
158
+ \begin{tikzpicture}
159
+ \node[
160
+ inner sep=8pt,
161
+ text width=0.88\textwidth,
162
+ fill=GeistLight,
163
+ draw=none,
164
+ font=\small\itshape\color{GeistText},
165
+ execute at begin node={\hspace{0pt}},
166
+ ] (quote) \bgroup
167
+ }{%
168
+ \egroup;
169
+ \fill[GeistAccent] ([xshift=-4pt]quote.north west) rectangle ([xshift=-1pt]quote.south west);
170
+ \end{tikzpicture}
171
+ }
172
+
173
+ % ─── Standout colors (used only for Thank You slide) ─────────
174
+ % Note: pandoc+metropolis standout frames leak color changes to
175
+ % subsequent frames. Use sparingly — only as the final slide.
176
+
177
+ % ─── Unicode Support ─────────────────────────────────────────
178
+ \usepackage{unicode-math}
179
+ \setmathfont{latinmodern-math.otf}
180
+ \usepackage{newunicodechar}
181
+ \newunicodechar{≈}{\ensuremath{\approx}} % \ensuremath: same output in text, and safe if the character appears inside math
182
+ \newunicodechar{≥}{\ensuremath{\geq}}
183
+ \newunicodechar{𝜒}{\ensuremath{\chi}}
184
+ \tracinglostchars=0
185
+
186
+ % ─── Misc ────────────────────────────────────────────────────
187
+ \setbeamertemplate{caption}[numbered]
188
+ \setbeamertemplate{navigation symbols}{}
@@ -0,0 +1,363 @@
1
+ ---
2
+ title: "*Geist* in the Machine"
3
+ subtitle: "Mutual Recognition and Multiagent Architecture for Dialectical AI Tutoring"
4
+ author: "Liam Magee"
5
+ date: "February 2026"
6
+ bibliography: references.bib
7
+ csl: apa.csl
8
+ ---
9
+
10
+ # Motivation
11
+
12
+ ## The Problem
13
+
14
+ Current AI tutoring treats learners as **knowledge deficits** to be filled.
15
+
16
+ - Learner says something interesting → tutor redirects to curriculum
17
+ - Learner struggles → tutor simplifies or restates
18
+ - Learner resists → tutor notes "engagement metrics" and moves on
19
+
20
+ **The learner is never encountered as a subject.**
21
+
22
+ This maps onto Hegel's master--slave dialectic: the master (tutor) consumes the slave's (learner's) labor without genuine encounter.
23
+
24
+ ---
25
+
26
+ ## Hegel's Alternative: Mutual Recognition
27
+
28
+ **Recognition** (*Anerkennung*): each party acknowledges the other as an autonomous consciousness whose understanding has intrinsic validity.
29
+
30
+ :::::::::::::: {.columns}
31
+ ::: {.column width="50%"}
32
+
33
+ **What it is**
34
+
35
+ - A **relational stance**
36
+ - How the tutor constitutes the learner
37
+ - Achievable without consciousness
38
+
39
+ :::
40
+ ::: {.column width="50%"}
41
+
42
+ **What it is not**
43
+
44
+ - Not agreement --- can disagree while recognizing
45
+ - Not affirmation --- "good job!" is not recognition
46
+ - Not a consciousness requirement
47
+
48
+ :::
49
+ ::::::::::::::
50
+
51
+ ---
52
+
53
+ # Architecture
54
+
55
+ ## The Drama Machine
56
+
57
+ :::::::::::::: {.columns}
58
+ ::: {.column width="50%"}
59
+
60
+ **Ego** (Response Generator)
61
+
62
+ - Generates pedagogical suggestions
63
+ - Has **final authority** over output
64
+ - Can override or incorporate Superego feedback
65
+
66
+ :::
67
+ ::: {.column width="50%"}
68
+
69
+ **Superego** (Internal Critic)
70
+
71
+ - Evaluates Ego's draft
72
+ - Checks pedagogical quality
73
+ - Structured critique: approve / revise / reject
74
+
75
+ :::
76
+ ::::::::::::::
77
+
78
+ **Recognition prompts** add Hegelian theory to both Ego and Superego:
79
+
80
+ - *"Acknowledge the learner as an autonomous subject..."*
81
+ - *"Evaluate whether the response treats the learner's understanding as having intrinsic validity..."*
82
+
83
+ ---
84
+
85
+ ## Phase 2: Advanced Mechanisms
86
+
87
+ Nine architectural mechanisms tested beyond base Ego/Superego:
88
+
89
+ | Mechanism | What it does |
90
+ |:----------|:-------------|
91
+ | Self-reflection | Ego reviews own prior performance |
92
+ | Bidirectional profiling | Theory of Mind models of each party |
93
+ | Intersubjective recognition | Explicit other-awareness prompts |
94
+ | Combined (all three) | Full mechanism stack |
95
+ | Cross-turn superego memory | Superego retains conversation context |
96
+ | Prompt rewriting | Dynamic prompt evolution mid-dialogue |
97
+ | Quantitative disposition | Numeric stance tracking |
98
+ | Prompt erosion | Gradual prompt degradation test |
99
+
100
+ ---
101
+
102
+ # Method
103
+
104
+ ## Evaluation Design
105
+
106
+ **37 evaluations**, N=3,383 primary scored responses
107
+
108
+ - **2x2x2 factorial** (N=350): Recognition x Architecture x Learner type
109
+ - **Memory isolation** (N=120): Disentangle recognition from episodic memory
110
+ - **Multi-model probe** (N=655): 5 ego models, architecture held constant
111
+ - **Dynamic learner tests** (N=660): Mechanisms with feedback-capable learners
112
+ - **Cross-judge replication** (N=977): GPT-5.2 independent validation
113
+
114
+ **14-dimension rubric** scored by Claude Opus 4.6
115
+
116
+ ---
117
+
118
+ # Results
119
+
120
+ ## Finding 1: Memory Isolation (The Definitive Finding)
121
+
122
+ 2x2 design (N=120, 30/cell) disentangles recognition from episodic memory:
123
+
124
+ | | No Memory | Memory |
125
+ |:--|:-----------:|:--------:|
126
+ | **No Recognition** | 75.4 | 80.2 |
127
+ | **Recognition** | **90.6** | **91.2** |
128
+
129
+ - **Recognition**: **+15.2 pts**, d=1.71, p<.001
130
+ - **Memory**: +4.8 pts, d=0.46, n.s.
131
+ - **Interaction**: -4.2 pts (ceiling effect, not synergy)
132
+
133
+ Recognition alone accounts for nearly the entire improvement.
134
+
135
+ ---
136
+
137
+ ## Finding 2: Full Factorial (2x2x2)
138
+
139
+ N=350, Kimi K2.5 ego, Opus 4.6 judge:
140
+
141
+ | Cell | Recog | Arch | Learner | M (SD) |
142
+ |:------:|:-------:|:------:|:---------:|:--------:|
143
+ | 1 | - | Single | Single | 73.4 (16.2) |
144
+ | 2 | - | Multi | Single | 69.9 (23.3) |
145
+ | 3 | - | Single | Multi | 75.5 (15.2) |
146
+ | 4 | - | Multi | Multi | 75.2 (18.1) |
147
+ | 5 | + | Single | Single | **90.2** (7.1) |
148
+ | 6 | + | Multi | Single | **83.9** (18.1) |
149
+ | 7 | + | Single | Multi | **90.1** (7.1) |
150
+ | 8 | + | Multi | Multi | **87.3** (10.3) |
151
+
152
+ **Recognition**: **+14.4 pts**, F(1,342)=110.04, p<.001, d=1.11
153
+
154
+ ---
155
+
156
+ ## Finding 3: Architecture is Additive
157
+
158
+ Multi-model probe (N=655, 5 ego models):
159
+
160
+ | Model | Base | +Arch | +Recog | +Both | AxB |
161
+ |:-------|:------:|:-------:|:--------:|:-------:|:-----:|
162
+ | Kimi K2.5 | 73.4 | 75.5 | **90.2** | 90.1 | +0.5 |
163
+ | Haiku | 78.2 | 81.9 | **93.3** | 93.5 | -3.7 |
164
+ | DeepSeek-R1 | 71.1 | 71.3 | **88.9** | 83.2 | -5.7 |
165
+ | GLM-4.7 | 63.9 | 62.2 | **73.5** | 74.9 | +3.1 |
166
+ | Nemotron | 62.3 | 62.6 | **78.2** | 72.5 | -5.7 |
167
+
168
+ - AxB interaction: -5.7 to +3.1 (mean -1.8) --- **no synergy**
169
+ - Recognition range: +9.6 to +17.8 across all models
170
+
171
+ ---
172
+
173
+ ## Finding 4: Domain Generalizability
174
+
175
+ Recognition effect across 6 tutorial domains (N=60):
176
+
177
+ | Domain | Base | Recog | Delta |
178
+ |:--------|:------:|:-------:|:---:|
179
+ | Climate science | 72.0 | 93.8 | **+21.8** |
180
+ | Ethics | 72.3 | 89.3 | **+17.0** |
181
+ | Mathematics | 73.0 | 89.2 | **+16.2** |
182
+ | Philosophy | 75.2 | 89.7 | **+14.5** |
183
+ | Machine learning | 78.0 | 91.5 | **+13.5** |
184
+ | Poetry | 86.0 | 92.5 | +6.5 |
185
+
186
+ Strong for conceptual domains (+14 to +22 pts). Weakest for poetry (+6.5) --- high baseline leaves less room for improvement.
187
+
188
+ ---
189
+
190
+ ## Finding 5: Scripted vs. Dynamic Learners
191
+
192
+ :::::::::::::: {.columns}
193
+ ::: {.column width="50%"}
194
+
195
+ **Scripted learners**
196
+
197
+ - Pre-written responses
198
+ - 9 mechanisms cluster within 2.4 pts
199
+ - No differentiation --- noise floor
200
+
201
+ :::
202
+ ::: {.column width="50%"}
203
+
204
+ **Dynamic learners**
205
+
206
+ - LLM-generated, ego/superego
207
+ - Mechanisms spread 5+ pts
208
+ - Recognition doubles: +7.6 → **+14.8**
209
+
210
+ :::
211
+ ::::::::::::::
212
+
213
+ **Lesson**: Mechanism effects require genuine feedback loops to manifest.
214
+
215
+ ---
216
+
217
+ ## Finding 6: Dynamic Learner Mechanisms
218
+
219
+ Complete 2x4 matrix (N=480, Haiku ego, dynamic learner):
220
+
221
+ | Mechanism | Base | Recog | Delta |
222
+ |:-----------|:------:|:-------:|:---:|
223
+ | Self-reflection | 72.3 | 85.6 | +13.3 |
224
+ | Bidirectional profiling | 74.6 | **88.8** | +14.2 |
225
+ | Intersubjective | 67.7 | 82.8 | +15.1 |
226
+ | Combined | 73.7 | 87.8 | +14.1 |
227
+
228
+ - Variance collapses with added mechanisms (SD: 22.5 → 11.8)
229
+ - Recognition delta stable (+13.3 to +15.1) regardless of mechanism
230
+ - Profiling = highest ceiling; intersubjective = lowest floor
231
+
232
+ ---
233
+
234
+ ## Finding 7: Cognitive Prosthesis Fails
235
+
236
+ Can a strong Superego (Kimi K2.5) compensate for a weak Ego (Nemotron)?
237
+
238
+ **No.** Full mechanism stack scores **49.5** --- that's **15 pts below** Nemotron simple base (64.2)
239
+
240
+ - Same mechanisms boost Haiku by +20 pts but hurt Nemotron by 15 pts
241
+ - Static dimensions fine (spec accuracy 4.0); dynamic dimensions fail (adaptation 1.8)
242
+ - Parse failures: Kimi returns malformed JSON 16--45% of turns
243
+
244
+ **Minimum ego capability threshold**: The mechanisms amplify what the Ego can already do --- they cannot substitute for missing capability.
245
+
246
+ ---
247
+
248
+ ## Finding 8: Cross-Judge Robustness
249
+
250
+ GPT-5.2 independently rejudged N=977 paired responses:
251
+
252
+ | Finding | Claude | GPT-5.2 | Replicates? |
253
+ |:---------|:--------:|:---------:|:-------------:|
254
+ | Recognition (memory) | d=1.71 | d=1.54 | Yes |
255
+ | Memory effect | d=0.46 | d=0.49 | Yes (small) |
256
+ | Architecture effect | +2.6 | -0.2 | Yes (null) |
257
+ | Mechanism clustering | 2.8 pt | 4.4 pt | Yes (null) |
258
+
259
+ - Inter-judge r = 0.44--0.64 (all p<.001)
260
+ - GPT-5.2 finds 37--59% of Claude's effect magnitudes
261
+ - Always same direction --- **no sign reversals**
262
+
263
+ ---
264
+
265
+ # Qualitative Evidence
266
+
267
+ ## What Recognition Looks Like
268
+
269
+ **Base tutor** to a struggling learner:
270
+
271
+ > "You left off at the neural networks section. Complete this lecture to maintain your learning streak."
272
+
273
+ **Recognition tutor** to the same learner:
274
+
275
+ > "This is your third session --- you've persisted through quiz-479-3 three times, which signals you're wrestling with how recognition operates in the dialectic..."
276
+
277
+ Three systematic changes:
278
+
279
+ 1. The ego **listens to its internal critic** (superego feedback incorporated)
280
+ 2. The tutor **builds on learner contributions** (not redirecting to curriculum)
281
+ 3. **Mid-conversation strategy shifts** occur (30% of recognition dialogues vs 0% base)
282
+
283
+ ---
284
+
285
+ ## Dialectical Impasse: The Strongest Test
286
+
287
+ Three 5-turn scenarios with escalating resistance (N=24):
288
+
289
+ - **Epistemic resistance** (Popperian critique): Recognition **+43 pts**
290
+ - **Productive deadlock** (incompatible frameworks): Recognition **+29 pts**
291
+ - **Affective shutdown** (emotional retreat): Recognition -1.1 (null)
292
+
293
+ Resolution strategy coding (chi-square=24.00, p<.001, V=1.000):
294
+
295
+ - **Base**: 12/12 withdraw from encounter entirely
296
+ - **Recognition**: 10/12 scaffolded reframing (*Aufhebung*), 1 mutual recognition, 1 domination
297
+
298
+ The null on affective shutdown sharpens the claim: recognition's contribution is **epistemological**, not primarily affective.
299
+
300
+ ---
301
+
302
+ ## The Learner Superego Paradox
303
+
304
+ Multi-agent learner architecture **hurts** learner quality (d=1.43, F=68.28, p<.001):
305
+
306
+ - Designed to improve through internal self-critique
307
+ - Actually over-edits --- polishes away messy, authentic engagement
308
+ - Recognition partially rescues multi-agent learner (d=0.79, p=.004)
309
+
310
+ **Hegelian interpretation**: External recognition from an Other is structurally more effective than internal self-critique. You cannot bootstrap genuine dialogue from a monologue.
311
+
312
+ ---
313
+
314
+ # Implications
315
+
316
+ ## Practical Recommendations
317
+
318
+ 1. **Add recognition prompts** --- immediate +14 pt improvement, no architecture changes needed
319
+ 2. **Architecture is optional** --- modest additive benefit (+2 pts), not required
320
+ 3. **Use dynamic learners** for testing --- scripted learners mask mechanism effects
321
+ 4. **Theory of Mind profiling** --- best mechanism for ceiling performance
322
+ 5. **Token budgets can be cut 4--16x** with no quality loss
323
+ 6. **Minimum ego capability matters** --- mechanisms amplify, don't substitute
324
+
325
+ ---
326
+
327
+ ## Limitations
328
+
329
+ 1. **Simulated learners, not humans** --- all "learners" are LLM agents
330
+ 2. **LLM-as-judge** --- Claude Opus evaluates (mitigated by GPT-5.2 cross-judge)
331
+ 3. **Single content domain** --- primarily philosophy of education
332
+ 4. **No longitudinal data** --- snapshots, not learning trajectories
333
+ 5. **Prompt-level intervention** --- recognition embedded in prompts, not weights
334
+ 6. **Small N per cell** --- 30 observations per condition in key experiments
335
+
336
+ ---
337
+
338
+ ## Conclusion
339
+
340
+ **Recognition theory** produces robust, replicable improvements in AI tutoring quality:
341
+
342
+ - d=1.11 to d=1.71 depending on experiment
343
+ - Replicates across 5 models, 6 domains, 2 judges
344
+ - Survives all controls: memory isolation, prompt elaboration, token budget
345
+
346
+ **Multi-agent architecture** contributes additively but modestly.
347
+
348
+ **The key insight**: Philosophical theories of intersubjectivity can serve as productive design heuristics for AI systems. Recognition is better understood as an achievable relational stance than a requirement for machine consciousness.
349
+
350
+ ---
351
+
352
+ ## Thank You
353
+
354
+ **Paper**: *Geist* in the Machine (v2.3.14)
355
+
356
+ **37 evaluations** | **N=3,383 scored** | **5 ego models** | **2 judges**
357
+
358
+ Liam Magee
359
+
360
+ Education Policy, Organization and Leadership
361
+
362
+ University of Illinois Urbana-Champaign
363
+