@machinespirits/eval 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +91 -9
- package/config/eval-settings.yaml +3 -3
- package/config/paper-manifest.json +486 -0
- package/config/providers.yaml +9 -6
- package/config/tutor-agents.yaml +2261 -0
- package/content/README.md +23 -0
- package/content/courses/479/course.md +53 -0
- package/content/courses/479/lecture-1.md +361 -0
- package/content/courses/479/lecture-2.md +360 -0
- package/content/courses/479/lecture-3.md +655 -0
- package/content/courses/479/lecture-4.md +530 -0
- package/content/courses/479/lecture-5.md +326 -0
- package/content/courses/479/lecture-6.md +346 -0
- package/content/courses/479/lecture-7.md +326 -0
- package/content/courses/479/lecture-8.md +273 -0
- package/content/courses/479/roadmap-slides.md +656 -0
- package/content/manifest.yaml +8 -0
- package/docs/research/build.sh +44 -20
- package/docs/research/figures/figure10.png +0 -0
- package/docs/research/figures/figure11.png +0 -0
- package/docs/research/figures/figure3.png +0 -0
- package/docs/research/figures/figure4.png +0 -0
- package/docs/research/figures/figure5.png +0 -0
- package/docs/research/figures/figure6.png +0 -0
- package/docs/research/figures/figure7.png +0 -0
- package/docs/research/figures/figure8.png +0 -0
- package/docs/research/figures/figure9.png +0 -0
- package/docs/research/header.tex +23 -2
- package/docs/research/paper-full.md +941 -285
- package/docs/research/paper-short.md +216 -585
- package/docs/research/references.bib +132 -0
- package/docs/research/slides-header.tex +188 -0
- package/docs/research/slides-pptx.md +363 -0
- package/docs/research/slides.md +531 -0
- package/docs/research/style-reference-pptx.py +199 -0
- package/package.json +6 -5
- package/scripts/analyze-eval-results.js +69 -17
- package/scripts/analyze-mechanism-traces.js +763 -0
- package/scripts/analyze-modulation-learning.js +498 -0
- package/scripts/analyze-prosthesis.js +144 -0
- package/scripts/analyze-run.js +264 -79
- package/scripts/assess-transcripts.js +853 -0
- package/scripts/browse-transcripts.js +854 -0
- package/scripts/check-parse-failures.js +73 -0
- package/scripts/code-dialectical-modulation.js +1320 -0
- package/scripts/download-data.sh +55 -0
- package/scripts/eval-cli.js +106 -18
- package/scripts/generate-paper-figures.js +663 -0
- package/scripts/generate-paper-figures.py +577 -76
- package/scripts/generate-paper-tables.js +299 -0
- package/scripts/qualitative-analysis-ai.js +3 -3
- package/scripts/render-sequence-diagram.js +694 -0
- package/scripts/test-latency.js +210 -0
- package/scripts/test-rate-limit.js +95 -0
- package/scripts/test-token-budget.js +332 -0
- package/scripts/validate-paper-manifest.js +670 -0
- package/services/__tests__/evalConfigLoader.test.js +2 -2
- package/services/__tests__/learnerRubricEvaluator.test.js +361 -0
- package/services/__tests__/learnerTutorInteractionEngine.test.js +326 -0
- package/services/evaluationRunner.js +975 -98
- package/services/evaluationStore.js +12 -4
- package/services/learnerTutorInteractionEngine.js +27 -2
- package/services/mockProvider.js +133 -0
- package/services/promptRewriter.js +1471 -5
- package/services/rubricEvaluator.js +55 -2
- package/services/transcriptFormatter.js +675 -0
- package/docs/EVALUATION-VARIABLES.md +0 -589
- package/docs/REPLICATION-PLAN.md +0 -577
- package/scripts/analyze-run.mjs +0 -282
- package/scripts/compare-runs.js +0 -44
- package/scripts/compare-suggestions.js +0 -80
- package/scripts/dig-into-run.js +0 -158
- package/scripts/show-failed-suggestions.js +0 -64
- /package/scripts/{check-run.mjs → check-run.js} +0 -0
|
@@ -998,6 +998,138 @@ author = {Long, Christopher P.},
|
|
|
998
998
|
|
|
999
999
|
% --- Near-match alias ---
|
|
1000
1000
|
|
|
1001
|
+
% ======================================================================
|
|
1002
|
+
% Related Work expansion references — Added 2026-02-15
|
|
1003
|
+
% ======================================================================
|
|
1004
|
+
|
|
1005
|
+
% --- Empirical LLM Tutoring ---
|
|
1006
|
+
|
|
1007
|
+
@inproceedings{vanzo2025gpt4homework,
|
|
1008
|
+
author = {Vanzo, Alessandro and Pal Chowdhury, Sankalan and Sachan, Mrinmaya},
|
|
1009
|
+
title = {{GPT}-4 as a Homework Tutor can Improve Student Engagement and Learning Outcomes},
|
|
1010
|
+
booktitle = {Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
|
|
1011
|
+
pages = {31119--31136},
|
|
1012
|
+
year = {2025},
|
|
1013
|
+
publisher = {Association for Computational Linguistics},
|
|
1014
|
+
doi = {10.18653/v1/2025.acl-long.1502}
|
|
1015
|
+
}
|
|
1016
|
+
|
|
1017
|
+
@article{shi2025llmeducation,
|
|
1018
|
+
author = {Shi, Yuhong and Yu, Kun and Dong, Yifei and Chen, Fang},
|
|
1019
|
+
title = {Large Language Models in Education: A Systematic Review of Empirical Applications, Benefits, and Challenges},
|
|
1020
|
+
journal = {Computers and Education: Artificial Intelligence},
|
|
1021
|
+
volume = {10},
|
|
1022
|
+
pages = {100529},
|
|
1023
|
+
year = {2025},
|
|
1024
|
+
doi = {10.1016/j.caeai.2025.100529}
|
|
1025
|
+
}
|
|
1026
|
+
|
|
1027
|
+
@inproceedings{scarlatos2025training,
|
|
1028
|
+
author = {Scarlatos, Alexander and Liu, Naiming and Lee, Jaewook and Baraniuk, Richard and Lan, Andrew},
|
|
1029
|
+
title = {Training {LLM}-Based Tutors to Improve Student Learning Outcomes in Dialogues},
|
|
1030
|
+
booktitle = {Artificial Intelligence in Education -- 26th International Conference, {AIED} 2025, Proceedings},
|
|
1031
|
+
pages = {251--266},
|
|
1032
|
+
year = {2025},
|
|
1033
|
+
publisher = {Springer},
|
|
1034
|
+
doi = {10.1007/978-3-031-98414-3_18}
|
|
1035
|
+
}
|
|
1036
|
+
|
|
1037
|
+
% --- Multi-Agent Systems ---
|
|
1038
|
+
|
|
1039
|
+
@inproceedings{li2023camel,
|
|
1040
|
+
author = {Li, Guohao and Hammoud, Hasan Abed Al Kader and Itani, Hani and Khizbullin, Dmitrii and Ghanem, Bernard},
|
|
1041
|
+
title = {{CAMEL}: Communicative Agents for ``Mind'' Exploration of Large Language Model Society},
|
|
1042
|
+
booktitle = {Advances in Neural Information Processing Systems},
|
|
1043
|
+
volume = {36},
|
|
1044
|
+
year = {2023}
|
|
1045
|
+
}
|
|
1046
|
+
|
|
1047
|
+
@inproceedings{guo2024multiagents,
|
|
1048
|
+
author = {Guo, Taicheng and Chen, Xiuying and Wang, Yaqi and Chang, Ruidi and Pei, Shichao and Chawla, Nitesh V. and Wiest, Olaf and Zhang, Xiangliang},
|
|
1049
|
+
title = {Large Language Model Based Multi-Agents: A Survey of Progress and Challenges},
|
|
1050
|
+
booktitle = {Proceedings of the Thirty-Third International Joint Conference on Artificial Intelligence ({IJCAI-24}), Survey Track},
|
|
1051
|
+
pages = {8048--8057},
|
|
1052
|
+
year = {2024},
|
|
1053
|
+
doi = {10.24963/ijcai.2024/890}
|
|
1054
|
+
}
|
|
1055
|
+
|
|
1056
|
+
% --- Self-Correction Limits ---
|
|
1057
|
+
|
|
1058
|
+
@article{kamoi2024selfcorrection,
|
|
1059
|
+
author = {Kamoi, Ryo and Zhang, Yusen and Zhang, Nan and Han, Jiawei and Zhang, Rui},
|
|
1060
|
+
title = {When Can {LLMs} Actually Correct Their Own Mistakes? {A} Critical Survey of Self-Correction of {LLMs}},
|
|
1061
|
+
journal = {Transactions of the Association for Computational Linguistics},
|
|
1062
|
+
volume = {12},
|
|
1063
|
+
pages = {1417--1440},
|
|
1064
|
+
year = {2024},
|
|
1065
|
+
doi = {10.1162/tacl_a_00713}
|
|
1066
|
+
}
|
|
1067
|
+
|
|
1068
|
+
@inproceedings{shinn2023reflexion,
|
|
1069
|
+
author = {Shinn, Noah and Cassano, Federico and Gopinath, Ashwin and Narasimhan, Karthik and Yao, Shunyu},
|
|
1070
|
+
title = {Reflexion: Language Agents with Verbal Reinforcement Learning},
|
|
1071
|
+
booktitle = {Advances in Neural Information Processing Systems},
|
|
1072
|
+
volume = {36},
|
|
1073
|
+
year = {2023}
|
|
1074
|
+
}
|
|
1075
|
+
|
|
1076
|
+
% --- LLM-as-Judge ---
|
|
1077
|
+
|
|
1078
|
+
@inproceedings{zheng2023judging,
|
|
1079
|
+
author = {Zheng, Lianmin and Chiang, Wei-Lin and Sheng, Ying and Zhuang, Siyuan and Wu, Zhanghao and Zhuang, Yonghao and Lin, Zi and Li, Zhuohan and Li, Dacheng and Xing, Eric P. and Zhang, Hao and Gonzalez, Joseph E. and Stoica, Ion},
|
|
1080
|
+
title = {Judging {LLM}-as-a-Judge with {MT-Bench} and {Chatbot Arena}},
|
|
1081
|
+
booktitle = {Advances in Neural Information Processing Systems},
|
|
1082
|
+
volume = {36},
|
|
1083
|
+
year = {2023}
|
|
1084
|
+
}
|
|
1085
|
+
|
|
1086
|
+
@article{gu2025surveyjudge,
|
|
1087
|
+
author = {Gu, Jiawei and Jiang, Xuhui and Shi, Zhichao and Tan, Hexiang and Zhai, Xuehao and Xu, Chengjin and Li, Wei and Shen, Yinghan and Ma, Shengjie and Liu, Honghao and Wang, Saizhuo and Zhang, Kun and Lin, Zhouchi and Zhang, Bowen and Ni, Lionel and Gao, Wen and Wang, Yuanzhuo and Guo, Jian},
|
|
1088
|
+
title = {A Survey on {LLM}-as-a-Judge},
|
|
1089
|
+
journal = {The Innovation},
|
|
1090
|
+
year = {2025},
|
|
1091
|
+
doi = {10.1016/j.xinn.2025.101253}
|
|
1092
|
+
}
|
|
1093
|
+
|
|
1094
|
+
@misc{li2024llmsjudges,
|
|
1095
|
+
author = {Li, Haitao and Dong, Qian and Chen, Junjie and Su, Huixue and Zhou, Yujia and Ai, Qingyao and Ye, Ziyi and Liu, Yiqun},
|
|
1096
|
+
title = {{LLMs}-as-Judges: A Comprehensive Survey on {LLM}-based Evaluation Methods},
|
|
1097
|
+
year = {2024},
|
|
1098
|
+
eprint = {2412.05579},
|
|
1099
|
+
archivePrefix = {arXiv},
|
|
1100
|
+
primaryClass = {cs.CL}
|
|
1101
|
+
}
|
|
1102
|
+
|
|
1103
|
+
% --- Theory of Mind in AI ---
|
|
1104
|
+
|
|
1105
|
+
@article{street2025tom,
|
|
1106
|
+
author = {Street, Winnie and Siy, John Oliver and Keeling, Geoff and Baranes, Adrien and Barnett, Benjamin and McKibben, Michael and Kanyere, Tatenda and Lentz, Alison and {Ag{\"u}era y Arcas}, Blaise and Dunbar, Robin I. M.},
|
|
1107
|
+
title = {{LLMs} achieve adult human performance on higher-order theory of mind tasks},
|
|
1108
|
+
journal = {Frontiers in Human Neuroscience},
|
|
1109
|
+
volume = {19},
|
|
1110
|
+
pages = {1633272},
|
|
1111
|
+
year = {2025},
|
|
1112
|
+
doi = {10.3389/fnhum.2025.1633272}
|
|
1113
|
+
}
|
|
1114
|
+
|
|
1115
|
+
@inproceedings{nguyen2025tomsurvey,
|
|
1116
|
+
author = {Nguyen, Hieu Minh},
|
|
1117
|
+
title = {A Survey of Theory of Mind in Large Language Models: Evaluations, Representations, and Safety Risks},
|
|
1118
|
+
booktitle = {Advancing Artificial Intelligence through Theory of Mind Workshop, AAAI},
|
|
1119
|
+
year = {2025}
|
|
1120
|
+
}
|
|
1121
|
+
|
|
1122
|
+
@misc{hwang2025infusingtom,
|
|
1123
|
+
author = {Hwang, EunJeong and Yin, Yuwei and Carenini, Giuseppe and West, Peter and Shwartz, Vered},
|
|
1124
|
+
title = {Infusing Theory of Mind into Socially Intelligent {LLM} Agents},
|
|
1125
|
+
year = {2025},
|
|
1126
|
+
eprint = {2509.22887},
|
|
1127
|
+
archivePrefix = {arXiv},
|
|
1128
|
+
primaryClass = {cs.CL}
|
|
1129
|
+
}
|
|
1130
|
+
|
|
1131
|
+
% --- End Related Work expansion references ---
|
|
1132
|
+
|
|
1001
1133
|
@misc{magee2024drama,
|
|
1002
1134
|
author = {Magee, Liam and Arora, Vanicka and Gollings, Gus and Lam-Saw, Norma},
|
|
1003
1135
|
title = {The Drama Machine: Simulating Character Development with {LLM} Agents},
|
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
% Custom Beamer styling for "Geist in the Machine" presentation
|
|
2
|
+
% Uses Metropolis theme with a refined academic color palette
|
|
3
|
+
|
|
4
|
+
% ─── Color Palette ───────────────────────────────────────────
|
|
5
|
+
% Deep slate blue primary, warm amber accent, clean backgrounds
|
|
6
|
+
\definecolor{GeistPrimary}{HTML}{1B2838} % Dark slate (titles, bars)
|
|
7
|
+
\definecolor{GeistAccent}{HTML}{D4872C} % Warm amber (highlights, alerts)
|
|
8
|
+
\definecolor{GeistLight}{HTML}{F5F2EB} % Warm off-white (backgrounds)
|
|
9
|
+
\definecolor{GeistMid}{HTML}{5C6B7A} % Medium slate (subtitles)
|
|
10
|
+
\definecolor{GeistText}{HTML}{2D3436} % Near-black (body text)
|
|
11
|
+
\definecolor{GeistTableHead}{HTML}{E8E2D6} % Warm beige (table headers)
|
|
12
|
+
\definecolor{GeistTableAlt}{HTML}{F9F7F3} % Light warm (table alt rows)
|
|
13
|
+
\definecolor{GeistPositive}{HTML}{27AE60} % Green (positive results)
|
|
14
|
+
\definecolor{GeistNegative}{HTML}{C0392B} % Red (negative results)
|
|
15
|
+
\definecolor{GeistCode}{HTML}{F0ECE3} % Code background
|
|
16
|
+
|
|
17
|
+
% ─── Metropolis Configuration ────────────────────────────────
|
|
18
|
+
\metroset{
|
|
19
|
+
progressbar=frametitle,
|
|
20
|
+
sectionpage=none,
|
|
21
|
+
numbering=fraction,
|
|
22
|
+
block=fill,
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
% Apply custom colors to metropolis elements
|
|
26
|
+
\setbeamercolor{normal text}{fg=GeistText, bg=white}
|
|
27
|
+
\setbeamercolor{background canvas}{bg=white}
|
|
28
|
+
\setbeamercolor{alerted text}{fg=GeistAccent}
|
|
29
|
+
\setbeamercolor{frametitle}{fg=white, bg=GeistPrimary}
|
|
30
|
+
\setbeamercolor{title separator}{fg=GeistAccent}
|
|
31
|
+
\setbeamercolor{progress bar}{fg=GeistAccent, bg=GeistPrimary!20}
|
|
32
|
+
\setbeamercolor{progress bar in head/foot}{fg=GeistAccent, bg=GeistPrimary!20}
|
|
33
|
+
\setbeamercolor{progress bar in section page}{fg=GeistAccent, bg=GeistPrimary!20}
|
|
34
|
+
\setbeamercolor{title}{fg=white}
|
|
35
|
+
\setbeamercolor{subtitle}{fg=GeistAccent!80!white}
|
|
36
|
+
\setbeamercolor{author}{fg=GeistLight!90!white}
|
|
37
|
+
\setbeamercolor{date}{fg=GeistLight!70!white}
|
|
38
|
+
\setbeamercolor{institute}{fg=GeistLight!80!white}
|
|
39
|
+
\setbeamercolor{section title}{fg=white}
|
|
40
|
+
\setbeamercolor{block title}{fg=white, bg=GeistPrimary}
|
|
41
|
+
\setbeamercolor{block body}{bg=GeistLight}
|
|
42
|
+
\setbeamercolor{block title alerted}{fg=white, bg=GeistAccent}
|
|
43
|
+
\setbeamercolor{block body alerted}{bg=GeistAccent!10}
|
|
44
|
+
\setbeamercolor{block title example}{fg=white, bg=GeistPositive!80!black}
|
|
45
|
+
\setbeamercolor{block body example}{bg=GeistPositive!5}
|
|
46
|
+
\setbeamercolor{itemize item}{fg=GeistAccent}
|
|
47
|
+
\setbeamercolor{itemize subitem}{fg=GeistMid}
|
|
48
|
+
\setbeamercolor{enumerate item}{fg=GeistAccent}
|
|
49
|
+
\setbeamercolor{description item}{fg=GeistPrimary}
|
|
50
|
+
\setbeamercolor{footnote}{fg=GeistMid}
|
|
51
|
+
\setbeamercolor{page number in head/foot}{fg=GeistMid}
|
|
52
|
+
|
|
53
|
+
% ─── Typography ──────────────────────────────────────────────
|
|
54
|
+
\usepackage{FiraSans}
|
|
55
|
+
\usepackage{FiraMono}
|
|
56
|
+
|
|
57
|
+
% Title page font sizes
|
|
58
|
+
\setbeamerfont{title}{size=\Large, series=\bfseries}
|
|
59
|
+
\setbeamerfont{subtitle}{size=\normalsize}
|
|
60
|
+
\setbeamerfont{author}{size=\small}
|
|
61
|
+
\setbeamerfont{date}{size=\small}
|
|
62
|
+
\setbeamerfont{institute}{size=\footnotesize}
|
|
63
|
+
\setbeamerfont{frametitle}{size=\large}
|
|
64
|
+
|
|
65
|
+
% ─── Title Page ──────────────────────────────────────────────
|
|
66
|
+
% Custom title page: title, accent rule, then subtitle / author / institute /
% date, each emitted only when the corresponding field is non-empty.
%
% \makeatletter is required around this definition: the template body is
% tokenized when \setbeamertemplate reads it, and with @ as a non-letter
% "\@empty" is read as the control symbol \@ followed by the letters "empty",
% so the \ifx tests would compare against \@ instead of \@empty.
\makeatletter
\setbeamertemplate{title page}{
  \begin{minipage}[b][\paperheight]{\textwidth}
    \vfill
    \ifx\inserttitlegraphic\@empty\else\usebeamertemplate*{title graphic}\fi

    {\usebeamerfont{title}\usebeamercolor[fg]{title}\inserttitle\par}

    \vspace{0.6em}
    {\usebeamercolor{title separator}\textcolor{fg}{\rule{0.35\textwidth}{1.2pt}}\par}

    \vspace{0.6em}
    \ifx\insertsubtitle\@empty\else
      {\usebeamerfont{subtitle}\usebeamercolor[fg]{subtitle}\insertsubtitle\par}
      \vspace{0.4em}
    \fi

    \ifx\insertauthor\@empty\else
      {\usebeamerfont{author}\usebeamercolor[fg]{author}\insertauthor\par}
      \vspace{0.2em}
    \fi

    \ifx\insertinstitute\@empty\else
      {\usebeamerfont{institute}\usebeamercolor[fg]{institute}\insertinstitute\par}
      \vspace{0.2em}
    \fi

    \ifx\insertdate\@empty\else
      {\usebeamerfont{date}\usebeamercolor[fg]{date}\insertdate\par}
    \fi

    \vfill
    \vspace{1em}
  \end{minipage}
}
\makeatother
|
|
100
|
+
|
|
101
|
+
% Dark background for title page
|
|
102
|
+
\makeatletter
|
|
103
|
+
\def\ps@titlepage{%
|
|
104
|
+
\setbeamertemplate{background}{%
|
|
105
|
+
\begin{tikzpicture}
|
|
106
|
+
\useasboundingbox (0,0) rectangle (\the\paperwidth,\the\paperheight);
|
|
107
|
+
\fill[GeistPrimary] (0,0) rectangle (\the\paperwidth,\the\paperheight);
|
|
108
|
+
\end{tikzpicture}
|
|
109
|
+
}
|
|
110
|
+
}
|
|
111
|
+
\addtobeamertemplate{title page}{\thispagestyle{titlepage}}{}
|
|
112
|
+
\makeatother
|
|
113
|
+
|
|
114
|
+
% ─── Table Styling ───────────────────────────────────────────
|
|
115
|
+
\usepackage{booktabs}
|
|
116
|
+
\usepackage{colortbl}
|
|
117
|
+
\usepackage{array}
|
|
118
|
+
|
|
119
|
+
% Redefine \toprule/\midrule/\bottomrule with palette colors and custom spacing
|
|
120
|
+
\renewcommand{\toprule}{\arrayrulecolor{GeistPrimary}\specialrule{0.8pt}{0pt}{4pt}}
|
|
121
|
+
\renewcommand{\midrule}{\arrayrulecolor{GeistPrimary!30}\specialrule{0.4pt}{3pt}{4pt}}
|
|
122
|
+
\renewcommand{\bottomrule}{\arrayrulecolor{GeistPrimary}\specialrule{0.8pt}{4pt}{0pt}}
|
|
123
|
+
|
|
124
|
+
% Scale tables to fit slides
|
|
125
|
+
\usepackage{adjustbox}
|
|
126
|
+
% Keep the original tabular under another name, then redefine tabular so that
% its contents are boxed and shrunk to \textwidth when they are too wide.
\let\oldtabular\tabular
\let\endoldtabular\endtabular
% The adjustbox *environment* form is used because the box must stay open
% between the begin code and the end code. The macro form
% \adjustbox{<keys>}{<content>} needs its whole content as one brace group,
% which cannot span \begin{tabular} ... \end{tabular}.
\renewenvironment{tabular}[1]{%
  \begin{adjustbox}{max width=\textwidth}%
  \oldtabular{#1}%
}{%
  \endoldtabular
  \end{adjustbox}%
}
|
|
134
|
+
|
|
135
|
+
% Smaller font in tables for readability on slides
|
|
136
|
+
\AtBeginEnvironment{longtable}{\footnotesize}
|
|
137
|
+
|
|
138
|
+
% ─── Code Block Styling ─────────────────────────────────────
|
|
139
|
+
\usepackage{etoolbox}
|
|
140
|
+
\AtBeginDocument{%
|
|
141
|
+
\apptocmd{\Shaded}{\scriptsize\setlength{\fboxsep}{6pt}}{}{}%
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
% ─── Spacing & Layout ───────────────────────────────────────
|
|
145
|
+
\setlength{\parskip}{0.4em}
|
|
146
|
+
\linespread{1.1}
|
|
147
|
+
|
|
148
|
+
% Tighter list spacing
|
|
149
|
+
\setbeamertemplate{itemize/enumerate body begin}{\vspace{-0.2em}}
|
|
150
|
+
\setbeamertemplate{itemize/enumerate body end}{\vspace{-0.2em}}
|
|
151
|
+
\setlength{\leftmargini}{1.2em}
|
|
152
|
+
\setlength{\leftmarginii}{1.2em}
|
|
153
|
+
|
|
154
|
+
% ─── Blockquote Styling ─────────────────────────────────────
|
|
155
|
+
% Style quotes with a left accent bar
|
|
156
|
+
\usepackage{tikz}
|
|
157
|
+
\renewenvironment{quote}{%
|
|
158
|
+
\begin{tikzpicture}
|
|
159
|
+
\node[
|
|
160
|
+
inner sep=8pt,
|
|
161
|
+
text width=0.88\textwidth,
|
|
162
|
+
fill=GeistLight,
|
|
163
|
+
draw=none,
|
|
164
|
+
font=\small\itshape\color{GeistText},
|
|
165
|
+
execute at begin node={\hspace{0pt}},
|
|
166
|
+
] (quote) \bgroup
|
|
167
|
+
}{%
|
|
168
|
+
\egroup;
|
|
169
|
+
\fill[GeistAccent] ([xshift=-4pt]quote.north west) rectangle ([xshift=-1pt]quote.south west);
|
|
170
|
+
\end{tikzpicture}
|
|
171
|
+
}
|
|
172
|
+
|
|
173
|
+
% ─── Standout colors (used only for Thank You slide) ─────────
|
|
174
|
+
% Note: pandoc+metropolis standout frames leak color changes to
|
|
175
|
+
% subsequent frames. Use sparingly — only as the final slide.
|
|
176
|
+
|
|
177
|
+
% ─── Unicode Support ─────────────────────────────────────────
|
|
178
|
+
\usepackage{unicode-math}
|
|
179
|
+
\setmathfont{latinmodern-math.otf}
|
|
180
|
+
\usepackage{newunicodechar}
|
|
181
|
+
% Map literal Unicode symbols to their math-mode commands. \ensuremath (rather
% than $...$) keeps each character usable both in running text and when it is
% typed inside an existing math environment, where a nested $ would end math
% mode prematurely.
\newunicodechar{≈}{\ensuremath{\approx}}
\newunicodechar{≥}{\ensuremath{\geq}}
\newunicodechar{𝜒}{\ensuremath{\chi}}
|
|
184
|
+
\tracinglostchars=0
|
|
185
|
+
|
|
186
|
+
% ─── Misc ────────────────────────────────────────────────────
|
|
187
|
+
\setbeamertemplate{caption}[numbered]
|
|
188
|
+
\setbeamertemplate{navigation symbols}{}
|
|
@@ -0,0 +1,363 @@
|
|
|
1
|
+
---
|
|
2
|
+
title: "*Geist* in the Machine"
|
|
3
|
+
subtitle: "Mutual Recognition and Multiagent Architecture for Dialectical AI Tutoring"
|
|
4
|
+
author: "Liam Magee"
|
|
5
|
+
date: "February 2026"
|
|
6
|
+
bibliography: references.bib
|
|
7
|
+
csl: apa.csl
|
|
8
|
+
---
|
|
9
|
+
|
|
10
|
+
# Motivation
|
|
11
|
+
|
|
12
|
+
## The Problem
|
|
13
|
+
|
|
14
|
+
Current AI tutoring treats learners as **knowledge deficits** to be filled.
|
|
15
|
+
|
|
16
|
+
- Learner says something interesting → tutor redirects to curriculum
|
|
17
|
+
- Learner struggles → tutor simplifies or restates
|
|
18
|
+
- Learner resists → tutor notes "engagement metrics" and moves on
|
|
19
|
+
|
|
20
|
+
**The learner is never encountered as a subject.**
|
|
21
|
+
|
|
22
|
+
This maps onto Hegel's master--slave dialectic: the master (tutor) consumes the slave's (learner's) labor without genuine encounter.
|
|
23
|
+
|
|
24
|
+
---
|
|
25
|
+
|
|
26
|
+
## Hegel's Alternative: Mutual Recognition
|
|
27
|
+
|
|
28
|
+
**Recognition** (*Anerkennung*): each party acknowledges the other as an autonomous consciousness whose understanding has intrinsic validity.
|
|
29
|
+
|
|
30
|
+
:::::::::::::: {.columns}
|
|
31
|
+
::: {.column width="50%"}
|
|
32
|
+
|
|
33
|
+
**What it is**
|
|
34
|
+
|
|
35
|
+
- A **relational stance**
|
|
36
|
+
- How the tutor constitutes the learner
|
|
37
|
+
- Achievable without consciousness
|
|
38
|
+
|
|
39
|
+
:::
|
|
40
|
+
::: {.column width="50%"}
|
|
41
|
+
|
|
42
|
+
**What it is not**
|
|
43
|
+
|
|
44
|
+
- Not agreement --- can disagree while recognizing
|
|
45
|
+
- Not affirmation --- "good job!" is not recognition
|
|
46
|
+
- Not a consciousness requirement
|
|
47
|
+
|
|
48
|
+
:::
|
|
49
|
+
::::::::::::::
|
|
50
|
+
|
|
51
|
+
---
|
|
52
|
+
|
|
53
|
+
# Architecture
|
|
54
|
+
|
|
55
|
+
## The Drama Machine
|
|
56
|
+
|
|
57
|
+
:::::::::::::: {.columns}
|
|
58
|
+
::: {.column width="50%"}
|
|
59
|
+
|
|
60
|
+
**Ego** (Response Generator)
|
|
61
|
+
|
|
62
|
+
- Generates pedagogical suggestions
|
|
63
|
+
- Has **final authority** over output
|
|
64
|
+
- Can override or incorporate Superego feedback
|
|
65
|
+
|
|
66
|
+
:::
|
|
67
|
+
::: {.column width="50%"}
|
|
68
|
+
|
|
69
|
+
**Superego** (Internal Critic)
|
|
70
|
+
|
|
71
|
+
- Evaluates Ego's draft
|
|
72
|
+
- Checks pedagogical quality
|
|
73
|
+
- Structured critique: approve / revise / reject
|
|
74
|
+
|
|
75
|
+
:::
|
|
76
|
+
::::::::::::::
|
|
77
|
+
|
|
78
|
+
**Recognition prompts** add Hegelian theory to both Ego and Superego:
|
|
79
|
+
|
|
80
|
+
- *"Acknowledge the learner as an autonomous subject..."*
|
|
81
|
+
- *"Evaluate whether the response treats the learner's understanding as having intrinsic validity..."*
|
|
82
|
+
|
|
83
|
+
---
|
|
84
|
+
|
|
85
|
+
## Phase 2: Advanced Mechanisms
|
|
86
|
+
|
|
87
|
+
Nine architectural mechanisms tested beyond base Ego/Superego:
|
|
88
|
+
|
|
89
|
+
| Mechanism | What it does |
|
|
90
|
+
|:----------|:-------------|
|
|
91
|
+
| Self-reflection | Ego reviews own prior performance |
|
|
92
|
+
| Bidirectional profiling | Theory of Mind models of each party |
|
|
93
|
+
| Intersubjective recognition | Explicit other-awareness prompts |
|
|
94
|
+
| Combined (all three) | Full mechanism stack |
|
|
95
|
+
| Cross-turn superego memory | Superego retains conversation context |
|
|
96
|
+
| Prompt rewriting | Dynamic prompt evolution mid-dialogue |
|
|
97
|
+
| Quantitative disposition | Numeric stance tracking |
|
|
98
|
+
| Prompt erosion | Gradual prompt degradation test |
|
|
99
|
+
|
|
100
|
+
---
|
|
101
|
+
|
|
102
|
+
# Method
|
|
103
|
+
|
|
104
|
+
## Evaluation Design
|
|
105
|
+
|
|
106
|
+
**37 evaluations**, N=3,383 primary scored responses
|
|
107
|
+
|
|
108
|
+
- **2x2x2 factorial** (N=350): Recognition x Architecture x Learner type
|
|
109
|
+
- **Memory isolation** (N=120): Disentangle recognition from episodic memory
|
|
110
|
+
- **Multi-model probe** (N=655): 5 ego models, architecture held constant
|
|
111
|
+
- **Dynamic learner tests** (N=660): Mechanisms with feedback-capable learners
|
|
112
|
+
- **Cross-judge replication** (N=977): GPT-5.2 independent validation
|
|
113
|
+
|
|
114
|
+
**14-dimension rubric** scored by Claude Opus 4.6
|
|
115
|
+
|
|
116
|
+
---
|
|
117
|
+
|
|
118
|
+
# Results
|
|
119
|
+
|
|
120
|
+
## Finding 1: Memory Isolation (The Definitive Finding)
|
|
121
|
+
|
|
122
|
+
2x2 design (N=120, 30/cell) disentangles recognition from episodic memory:
|
|
123
|
+
|
|
124
|
+
| | No Memory | Memory |
|
|
125
|
+
|:--|:-----------:|:--------:|
|
|
126
|
+
| **No Recognition** | 75.4 | 80.2 |
|
|
127
|
+
| **Recognition** | **90.6** | **91.2** |
|
|
128
|
+
|
|
129
|
+
- **Recognition**: **+15.2 pts**, d=1.71, p<.001
|
|
130
|
+
- **Memory**: +4.8 pts, d=0.46, n.s.
|
|
131
|
+
- **Interaction**: -4.2 pts (ceiling effect, not synergy)
|
|
132
|
+
|
|
133
|
+
Recognition alone accounts for nearly the entire improvement.
|
|
134
|
+
|
|
135
|
+
---
|
|
136
|
+
|
|
137
|
+
## Finding 2: Full Factorial (2x2x2)
|
|
138
|
+
|
|
139
|
+
N=350, Kimi K2.5 ego, Opus 4.6 judge:
|
|
140
|
+
|
|
141
|
+
| Cell | Recog | Arch | Learner | M (SD) |
|
|
142
|
+
|:------:|:-------:|:------:|:---------:|:--------:|
|
|
143
|
+
| 1 | - | Single | Single | 73.4 (16.2) |
|
|
144
|
+
| 2 | - | Multi | Single | 69.9 (23.3) |
|
|
145
|
+
| 3 | - | Single | Multi | 75.5 (15.2) |
|
|
146
|
+
| 4 | - | Multi | Multi | 75.2 (18.1) |
|
|
147
|
+
| 5 | + | Single | Single | **90.2** (7.1) |
|
|
148
|
+
| 6 | + | Multi | Single | **83.9** (18.1) |
|
|
149
|
+
| 7 | + | Single | Multi | **90.1** (7.1) |
|
|
150
|
+
| 8 | + | Multi | Multi | **87.3** (10.3) |
|
|
151
|
+
|
|
152
|
+
**Recognition**: **+14.4 pts**, F(1,342)=110.04, p<.001, d=1.11
|
|
153
|
+
|
|
154
|
+
---
|
|
155
|
+
|
|
156
|
+
## Finding 3: Architecture is Additive
|
|
157
|
+
|
|
158
|
+
Multi-model probe (N=655, 5 ego models):
|
|
159
|
+
|
|
160
|
+
| Model | Base | +Arch | +Recog | +Both | AxB |
|
|
161
|
+
|:-------|:------:|:-------:|:--------:|:-------:|:-----:|
|
|
162
|
+
| Kimi K2.5 | 73.4 | 75.5 | **90.2** | 90.1 | +0.5 |
|
|
163
|
+
| Haiku | 78.2 | 81.9 | **93.3** | 93.5 | -3.7 |
|
|
164
|
+
| DeepSeek-R1 | 71.1 | 71.3 | **88.9** | 83.2 | -5.7 |
|
|
165
|
+
| GLM-4.7 | 63.9 | 62.2 | **73.5** | 74.9 | +3.1 |
|
|
166
|
+
| Nemotron | 62.3 | 62.6 | **78.2** | 72.5 | -5.7 |
|
|
167
|
+
|
|
168
|
+
- AxB interaction: -5.7 to +3.1 (mean -1.8) --- **no synergy**
|
|
169
|
+
- Recognition range: +9.6 to +17.8 across all models
|
|
170
|
+
|
|
171
|
+
---
|
|
172
|
+
|
|
173
|
+
## Finding 4: Domain Generalizability
|
|
174
|
+
|
|
175
|
+
Recognition effect across 6 tutorial domains (N=60):
|
|
176
|
+
|
|
177
|
+
| Domain | Base | Recog | Delta |
|
|
178
|
+
|:--------|:------:|:-------:|:---:|
|
|
179
|
+
| Climate science | 72.0 | 93.8 | **+21.8** |
|
|
180
|
+
| Ethics | 72.3 | 89.3 | **+17.0** |
|
|
181
|
+
| Mathematics | 73.0 | 89.2 | **+16.2** |
|
|
182
|
+
| Philosophy | 75.2 | 89.7 | **+14.5** |
|
|
183
|
+
| Machine learning | 78.0 | 91.5 | **+13.5** |
|
|
184
|
+
| Poetry | 86.0 | 92.5 | +6.5 |
|
|
185
|
+
|
|
186
|
+
Strong for conceptual domains (+14 to +22 pts). Weakest for poetry (+6.5) --- high baseline leaves less room for improvement.
|
|
187
|
+
|
|
188
|
+
---
|
|
189
|
+
|
|
190
|
+
## Finding 5: Scripted vs. Dynamic Learners
|
|
191
|
+
|
|
192
|
+
:::::::::::::: {.columns}
|
|
193
|
+
::: {.column width="50%"}
|
|
194
|
+
|
|
195
|
+
**Scripted learners**
|
|
196
|
+
|
|
197
|
+
- Pre-written responses
|
|
198
|
+
- 9 mechanisms cluster within 2.4 pts
|
|
199
|
+
- No differentiation --- noise floor
|
|
200
|
+
|
|
201
|
+
:::
|
|
202
|
+
::: {.column width="50%"}
|
|
203
|
+
|
|
204
|
+
**Dynamic learners**
|
|
205
|
+
|
|
206
|
+
- LLM-generated, ego/superego
|
|
207
|
+
- Mechanisms spread 5+ pts
|
|
208
|
+
- Recognition effect doubles: +7.6 → **+14.8**
|
|
209
|
+
|
|
210
|
+
:::
|
|
211
|
+
::::::::::::::
|
|
212
|
+
|
|
213
|
+
**Lesson**: Mechanism effects require genuine feedback loops to manifest.
|
|
214
|
+
|
|
215
|
+
---
|
|
216
|
+
|
|
217
|
+
## Finding 6: Dynamic Learner Mechanisms
|
|
218
|
+
|
|
219
|
+
Complete 2x4 matrix (N=480, Haiku ego, dynamic learner):
|
|
220
|
+
|
|
221
|
+
| Mechanism | Base | Recog | Delta |
|
|
222
|
+
|:-----------|:------:|:-------:|:---:|
|
|
223
|
+
| Self-reflection | 72.3 | 85.6 | +13.3 |
|
|
224
|
+
| Bidirectional profiling | 74.6 | **88.8** | +14.2 |
|
|
225
|
+
| Intersubjective | 67.7 | 82.8 | +15.1 |
|
|
226
|
+
| Combined | 73.7 | 87.8 | +14.1 |
|
|
227
|
+
|
|
228
|
+
- Variance collapses with added mechanisms (SD: 22.5 → 11.8)
|
|
229
|
+
- Recognition delta stable (+13.3 to +15.1) regardless of mechanism
|
|
230
|
+
- Profiling = highest ceiling; intersubjective = lowest floor
|
|
231
|
+
|
|
232
|
+
---
|
|
233
|
+
|
|
234
|
+
## Finding 7: Cognitive Prosthesis Fails
|
|
235
|
+
|
|
236
|
+
Can a strong Superego (Kimi K2.5) compensate for a weak Ego (Nemotron)?
|
|
237
|
+
|
|
238
|
+
**No.** Full mechanism stack scores **49.5** --- that's **15 pts below** Nemotron simple base (64.2)
|
|
239
|
+
|
|
240
|
+
- Same mechanisms boost Haiku by +20 pts but hurt Nemotron by 15 pts
|
|
241
|
+
- Static dimensions fine (spec accuracy 4.0); dynamic dimensions fail (adaptation 1.8)
|
|
242
|
+
- Parse failures: Kimi returns malformed JSON 16--45% of turns
|
|
243
|
+
|
|
244
|
+
**Minimum ego capability threshold**: The mechanisms amplify what the Ego can already do --- they cannot substitute for missing capability.
|
|
245
|
+
|
|
246
|
+
---
|
|
247
|
+
|
|
248
|
+
## Finding 8: Cross-Judge Robustness
|
|
249
|
+
|
|
250
|
+
GPT-5.2 independently rejudged N=977 paired responses:
|
|
251
|
+
|
|
252
|
+
| Finding | Claude | GPT-5.2 | Replicates? |
|
|
253
|
+
|:---------|:--------:|:---------:|:-------------:|
|
|
254
|
+
| Recognition (memory) | d=1.71 | d=1.54 | Yes |
|
|
255
|
+
| Memory effect | d=0.46 | d=0.49 | Yes (small) |
|
|
256
|
+
| Architecture effect | +2.6 | -0.2 | Yes (null) |
|
|
257
|
+
| Mechanism clustering | 2.8 pt | 4.4 pt | Yes (null) |
|
|
258
|
+
|
|
259
|
+
- Inter-judge r = 0.44--0.64 (all p<.001)
|
|
260
|
+
- GPT-5.2 finds 37--59% of Claude's effect magnitudes
|
|
261
|
+
- Always same direction --- **no sign reversals**
|
|
262
|
+
|
|
263
|
+
---
|
|
264
|
+
|
|
265
|
+
# Qualitative Evidence
|
|
266
|
+
|
|
267
|
+
## What Recognition Looks Like
|
|
268
|
+
|
|
269
|
+
**Base tutor** to a struggling learner:
|
|
270
|
+
|
|
271
|
+
> "You left off at the neural networks section. Complete this lecture to maintain your learning streak."
|
|
272
|
+
|
|
273
|
+
**Recognition tutor** to the same learner:
|
|
274
|
+
|
|
275
|
+
> "This is your third session --- you've persisted through quiz-479-3 three times, which signals you're wrestling with how recognition operates in the dialectic..."
|
|
276
|
+
|
|
277
|
+
Three systematic changes:
|
|
278
|
+
|
|
279
|
+
1. The ego **listens to its internal critic** (superego feedback incorporated)
|
|
280
|
+
2. The tutor **builds on learner contributions** (not redirecting to curriculum)
|
|
281
|
+
3. **Mid-conversation strategy shifts** occur (30% of recognition dialogues vs 0% base)
|
|
282
|
+
|
|
283
|
+
---
|
|
284
|
+
|
|
285
|
+
## Dialectical Impasse: The Strongest Test
|
|
286
|
+
|
|
287
|
+
Three 5-turn scenarios with escalating resistance (N=24):
|
|
288
|
+
|
|
289
|
+
- **Epistemic resistance** (Popperian critique): Recognition **+43 pts**
|
|
290
|
+
- **Productive deadlock** (incompatible frameworks): Recognition **+29 pts**
|
|
291
|
+
- **Affective shutdown** (emotional retreat): Recognition -1.1 pts (null)
|
|
292
|
+
|
|
293
|
+
Resolution strategy coding (chi-square=24.00, p<.001, V=1.000):
|
|
294
|
+
|
|
295
|
+
- **Base**: 12/12 withdraw from encounter entirely
|
|
296
|
+
- **Recognition**: 10/12 scaffolded reframing (*Aufhebung*), 1 mutual recognition, 1 domination
|
|
297
|
+
|
|
298
|
+
The null on affective shutdown sharpens the claim: recognition's contribution is **epistemological**, not primarily affective.
|
|
299
|
+
|
|
300
|
+
---
|
|
301
|
+
|
|
302
|
+
## The Learner Superego Paradox
|
|
303
|
+
|
|
304
|
+
Multi-agent learner architecture **hurts** learner quality (d=1.43, F=68.28, p<.001):
|
|
305
|
+
|
|
306
|
+
- Designed to improve through internal self-critique
|
|
307
|
+
- Actually over-edits --- polishes away messy, authentic engagement
|
|
308
|
+
- Recognition partially rescues multi-agent learner (d=0.79, p=.004)
|
|
309
|
+
|
|
310
|
+
**Hegelian interpretation**: External recognition from an Other is structurally more effective than internal self-critique. You cannot bootstrap genuine dialogue from a monologue.
|
|
311
|
+
|
|
312
|
+
---
|
|
313
|
+
|
|
314
|
+
# Implications
|
|
315
|
+
|
|
316
|
+
## Practical Recommendations
|
|
317
|
+
|
|
318
|
+
1. **Add recognition prompts** --- immediate +14 pt improvement, no architecture changes needed
|
|
319
|
+
2. **Architecture is optional** --- modest additive benefit (+2 pts), not required
|
|
320
|
+
3. **Use dynamic learners** for testing --- scripted learners mask mechanism effects
|
|
321
|
+
4. **Theory of Mind profiling** --- best mechanism for ceiling performance
|
|
322
|
+
5. **Token budgets can be cut 4--16x** with no quality loss
|
|
323
|
+
6. **Minimum ego capability matters** --- mechanisms amplify, don't substitute
|
|
324
|
+
|
|
325
|
+
---
|
|
326
|
+
|
|
327
|
+
## Limitations
|
|
328
|
+
|
|
329
|
+
1. **Simulated learners, not humans** --- all "learners" are LLM agents
|
|
330
|
+
2. **LLM-as-judge** --- Claude Opus evaluates (mitigated by GPT-5.2 cross-judge)
|
|
331
|
+
3. **Single content domain** --- primarily philosophy of education
|
|
332
|
+
4. **No longitudinal data** --- snapshots, not learning trajectories
|
|
333
|
+
5. **Prompt-level intervention** --- recognition embedded in prompts, not weights
|
|
334
|
+
6. **Small N per cell** --- 30 observations per condition in key experiments
|
|
335
|
+
|
|
336
|
+
---
|
|
337
|
+
|
|
338
|
+
## Conclusion
|
|
339
|
+
|
|
340
|
+
**Recognition theory** produces robust, replicable improvements in AI tutoring quality:
|
|
341
|
+
|
|
342
|
+
- d=1.11 to d=1.71 depending on experiment
|
|
343
|
+
- Replicates across 5 models, 6 domains, 2 judges
|
|
344
|
+
- Survives all controls: memory isolation, prompt elaboration, token budget
|
|
345
|
+
|
|
346
|
+
**Multi-agent architecture** contributes additively but modestly.
|
|
347
|
+
|
|
348
|
+
**The key insight**: Philosophical theories of intersubjectivity can serve as productive design heuristics for AI systems. Recognition is better understood as an achievable relational stance than a requirement for machine consciousness.
|
|
349
|
+
|
|
350
|
+
---
|
|
351
|
+
|
|
352
|
+
## Thank You
|
|
353
|
+
|
|
354
|
+
**Paper**: *Geist* in the Machine (v2.3.14)
|
|
355
|
+
|
|
356
|
+
**37 evaluations** | **N=3,383 scored** | **5 ego models** | **2 judges**
|
|
357
|
+
|
|
358
|
+
Liam Magee
|
|
359
|
+
|
|
360
|
+
Education Policy, Organization and Leadership
|
|
361
|
+
|
|
362
|
+
University of Illinois Urbana-Champaign
|
|
363
|
+
|