@machinespirits/eval 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +91 -9
- package/config/eval-settings.yaml +3 -3
- package/config/paper-manifest.json +486 -0
- package/config/providers.yaml +9 -6
- package/config/tutor-agents.yaml +2261 -0
- package/content/README.md +23 -0
- package/content/courses/479/course.md +53 -0
- package/content/courses/479/lecture-1.md +361 -0
- package/content/courses/479/lecture-2.md +360 -0
- package/content/courses/479/lecture-3.md +655 -0
- package/content/courses/479/lecture-4.md +530 -0
- package/content/courses/479/lecture-5.md +326 -0
- package/content/courses/479/lecture-6.md +346 -0
- package/content/courses/479/lecture-7.md +326 -0
- package/content/courses/479/lecture-8.md +273 -0
- package/content/courses/479/roadmap-slides.md +656 -0
- package/content/manifest.yaml +8 -0
- package/docs/research/build.sh +44 -20
- package/docs/research/figures/figure10.png +0 -0
- package/docs/research/figures/figure11.png +0 -0
- package/docs/research/figures/figure3.png +0 -0
- package/docs/research/figures/figure4.png +0 -0
- package/docs/research/figures/figure5.png +0 -0
- package/docs/research/figures/figure6.png +0 -0
- package/docs/research/figures/figure7.png +0 -0
- package/docs/research/figures/figure8.png +0 -0
- package/docs/research/figures/figure9.png +0 -0
- package/docs/research/header.tex +23 -2
- package/docs/research/paper-full.md +941 -285
- package/docs/research/paper-short.md +216 -585
- package/docs/research/references.bib +132 -0
- package/docs/research/slides-header.tex +188 -0
- package/docs/research/slides-pptx.md +363 -0
- package/docs/research/slides.md +531 -0
- package/docs/research/style-reference-pptx.py +199 -0
- package/package.json +6 -5
- package/scripts/analyze-eval-results.js +69 -17
- package/scripts/analyze-mechanism-traces.js +763 -0
- package/scripts/analyze-modulation-learning.js +498 -0
- package/scripts/analyze-prosthesis.js +144 -0
- package/scripts/analyze-run.js +264 -79
- package/scripts/assess-transcripts.js +853 -0
- package/scripts/browse-transcripts.js +854 -0
- package/scripts/check-parse-failures.js +73 -0
- package/scripts/code-dialectical-modulation.js +1320 -0
- package/scripts/download-data.sh +55 -0
- package/scripts/eval-cli.js +106 -18
- package/scripts/generate-paper-figures.js +663 -0
- package/scripts/generate-paper-figures.py +577 -76
- package/scripts/generate-paper-tables.js +299 -0
- package/scripts/qualitative-analysis-ai.js +3 -3
- package/scripts/render-sequence-diagram.js +694 -0
- package/scripts/test-latency.js +210 -0
- package/scripts/test-rate-limit.js +95 -0
- package/scripts/test-token-budget.js +332 -0
- package/scripts/validate-paper-manifest.js +670 -0
- package/services/__tests__/evalConfigLoader.test.js +2 -2
- package/services/__tests__/learnerRubricEvaluator.test.js +361 -0
- package/services/__tests__/learnerTutorInteractionEngine.test.js +326 -0
- package/services/evaluationRunner.js +975 -98
- package/services/evaluationStore.js +12 -4
- package/services/learnerTutorInteractionEngine.js +27 -2
- package/services/mockProvider.js +133 -0
- package/services/promptRewriter.js +1471 -5
- package/services/rubricEvaluator.js +55 -2
- package/services/transcriptFormatter.js +675 -0
- package/docs/EVALUATION-VARIABLES.md +0 -589
- package/docs/REPLICATION-PLAN.md +0 -577
- package/scripts/analyze-run.mjs +0 -282
- package/scripts/compare-runs.js +0 -44
- package/scripts/compare-suggestions.js +0 -80
- package/scripts/dig-into-run.js +0 -158
- package/scripts/show-failed-suggestions.js +0 -64
- /package/scripts/{check-run.mjs → check-run.js} +0 -0
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Style the pandoc reference.pptx to match the Geist in the Machine theme."""
|
|
3
|
+
|
|
4
|
+
from pptx import Presentation
|
|
5
|
+
from pptx.util import Inches, Pt, Emu
|
|
6
|
+
from pptx.dml.color import RGBColor
|
|
7
|
+
from pptx.enum.text import PP_ALIGN, MSO_ANCHOR
|
|
8
|
+
from pptx.oxml.ns import qn, nsmap
|
|
9
|
+
import copy
|
|
10
|
+
|
|
11
|
+
# ─── Color Palette (matches slides-header.tex) ──────────────
|
|
12
|
+
# Brand colours, written as the six-digit hex codes they correspond to.
GEIST_PRIMARY = RGBColor.from_string('1B2838')  # Dark slate
GEIST_ACCENT = RGBColor.from_string('D4872C')   # Warm amber
GEIST_LIGHT = RGBColor.from_string('F5F2EB')    # Warm off-white
GEIST_MID = RGBColor.from_string('5C6B7A')      # Medium slate
GEIST_TEXT = RGBColor.from_string('2D3436')     # Near-black
WHITE = RGBColor.from_string('FFFFFF')

# Typeface names used for headings, body text and monospaced text.
FONT_HEADING = 'Helvetica Neue'
FONT_BODY = 'Helvetica Neue'
FONT_MONO = 'Fira Mono'
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def set_slide_bg(slide_layout, color):
    """Give ``slide_layout`` a solid background fill of ``color``.

    Despite the parameter name, any object exposing ``background.fill`` works;
    ``main`` also passes the slide master here.
    """
    background_fill = slide_layout.background.fill
    # Switch the fill to solid first, then assign its foreground colour.
    background_fill.solid()
    background_fill.fore_color.rgb = color
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def style_placeholder(ph, font_name=None, font_size=None, font_color=None,
                      bold=None, alignment=None):
    """Style a placeholder's existing text runs and its list-style default font.

    Args:
        ph: Placeholder shape to style.
        font_name: Typeface name; skipped when falsy.
        font_size: Font size; skipped when falsy.
        font_color: Value assigned to ``run.font.color.rgb``; skipped when falsy.
        bold: Bold flag; skipped only when ``None``, so ``False`` is applied.
        alignment: Paragraph alignment; skipped only when ``None``.

    The first pass touches every run already present in the text frame. The
    second pass edits the shape XML directly: for each ``a:defRPr`` found under
    the text body's ``a:lstStyle`` levels, the ``a:latin`` typeface is set to
    ``font_name`` (only the typeface is written there, not size/colour/bold).
    """
    if ph.has_text_frame:
        for paragraph in ph.text_frame.paragraphs:
            if alignment is not None:
                paragraph.alignment = alignment
            for run in paragraph.runs:
                font = run.font
                if font_name:
                    font.name = font_name
                if font_size:
                    font.size = font_size
                if font_color:
                    font.color.rgb = font_color
                if bold is not None:
                    font.bold = bold

    # The XML pass only ever writes the typeface, so there is nothing to do
    # without a font name.
    if not font_name:
        return

    txBody = ph._sp.find(qn('p:txBody'))
    if txBody is None:
        return

    # Children that the DrawingML schema requires to come AFTER a:latin inside
    # a run-properties element. A new a:latin must be placed before the first
    # of these, otherwise the resulting XML is out of schema order.
    latin_successors = {
        qn(tag) for tag in (
            'a:ea', 'a:cs', 'a:sym', 'a:hlinkClick',
            'a:hlinkMouseOver', 'a:rtl', 'a:extLst',
        )
    }

    for lst_style in txBody.findall(qn('a:lstStyle')):
        for level in lst_style:
            defRPr = level.find(qn('a:defRPr'))
            if defRPr is None:
                continue

            latin = defRPr.find(qn('a:latin'))
            if latin is not None:
                latin.set('typeface', font_name)
                continue

            latin = defRPr.makeelement(qn('a:latin'), {'typeface': font_name})
            successor = next(
                (child for child in defRPr if child.tag in latin_successors),
                None,
            )
            if successor is not None:
                successor.addprevious(latin)
            else:
                defRPr.append(latin)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def update_theme_colors(prs):
    """Rewrite the first slide master's theme colours and fonts to the palette.

    The first relationship of the master part whose type contains ``'theme'``
    is parsed with lxml. In every ``a:clrScheme`` found, each listed colour
    slot that exists is emptied and given a single ``a:srgbClr`` child. In
    every ``a:fontScheme`` found, an existing ``a:latin`` element of the major
    and minor fonts gets its typeface overwritten. The serialised XML is then
    stored back on the theme part through its private ``_blob`` attribute.
    Nothing is changed when no theme relationship is present.
    """
    from lxml import etree

    # Theme colour slot -> replacement colour.
    slot_colors = {
        'dk1': GEIST_PRIMARY,                   # Dark 1
        'dk2': GEIST_TEXT,                      # Dark 2
        'lt1': WHITE,                           # Light 1
        'lt2': GEIST_LIGHT,                     # Light 2
        'accent1': GEIST_ACCENT,                # Accent 1
        'accent2': GEIST_MID,                   # Accent 2
        'accent3': RGBColor(0x27, 0xAE, 0x60),  # Green
        'accent4': RGBColor(0xC0, 0x39, 0x2B),  # Red
        'hlink': GEIST_ACCENT,                  # Hyperlink
        'folHlink': GEIST_MID,                  # Followed hyperlink
    }
    # Font-scheme element -> latin typeface to write into it.
    scheme_fonts = (('majorFont', FONT_HEADING), ('minorFont', FONT_BODY))

    master_rels = prs.slide_masters[0].part.rels.values()
    theme_part = next(
        (rel.target_part for rel in master_rels if 'theme' in rel.reltype),
        None,
    )
    if theme_part is None:
        return

    root = etree.fromstring(theme_part.blob)

    for scheme in root.findall('.//' + qn('a:clrScheme')):
        for slot, rgb in slot_colors.items():
            slot_el = scheme.find(qn(f'a:{slot}'))
            if slot_el is None:
                continue
            # Drop whatever colour definition is there before adding ours.
            for stale in list(slot_el):
                slot_el.remove(stale)
            slot_el.append(
                slot_el.makeelement(qn('a:srgbClr'), {'val': str(rgb)})
            )

    for scheme in root.findall('.//' + qn('a:fontScheme')):
        for font_tag, typeface in scheme_fonts:
            font_el = scheme.find(qn(f'a:{font_tag}'))
            latin = font_el.find(qn('a:latin')) if font_el is not None else None
            if latin is not None:
                latin.set('typeface', typeface)

    # Persist the edited tree on the theme part.
    theme_part._blob = etree.tostring(root, xml_declaration=True,
                                      encoding='UTF-8', standalone=True)
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def main():
    """Restyle ``reference.pptx`` (relative to the working directory) in place.

    Loads the deck, rewrites its theme via ``update_theme_colors``, then walks
    the layouts of the first slide master. Each layout is classified by
    keywords in its lower-cased name, which decides its background colour and
    how its placeholders are styled. Finally the master background is set to
    white, the file is saved over itself and a confirmation line is printed.
    """
    deck = Presentation('reference.pptx')
    update_theme_colors(deck)

    master = deck.slide_masters[0]

    def classify(name):
        # Order matters: the first matching rule wins.
        if 'title' in name and 'content' not in name:
            return 'title'
        if 'section' in name:
            return 'section'
        if 'two' in name and 'content' in name:
            return 'two-column'
        if 'blank' in name:
            return 'blank'
        return 'content'

    # kind -> (background, title size, title colour, body kwargs,
    #          whether body styling is limited to placeholder idx 1)
    recipes = {
        'title': (GEIST_PRIMARY, Pt(36), WHITE,
                  {'font_size': Pt(18), 'font_color': GEIST_ACCENT}, True),
        'section': (GEIST_PRIMARY, Pt(32), WHITE,
                    {'font_color': GEIST_ACCENT}, False),
        'two-column': (WHITE, Pt(28), GEIST_PRIMARY,
                       {'font_size': Pt(16), 'font_color': GEIST_TEXT}, False),
        'content': (WHITE, Pt(28), GEIST_PRIMARY,
                    {'font_size': Pt(18), 'font_color': GEIST_TEXT}, True),
    }

    for layout in master.slide_layouts:
        kind = classify(layout.name.lower())

        if kind == 'blank':
            # Blank layouts only get a background; placeholders are untouched.
            set_slide_bg(layout, WHITE)
            continue

        background, title_size, title_color, body_style, body_idx1_only = recipes[kind]
        set_slide_bg(layout, background)

        for ph in layout.placeholders:
            idx = ph.placeholder_format.idx
            if idx == 0:  # Title
                style_placeholder(ph, font_name=FONT_HEADING,
                                  font_size=title_size, font_color=title_color,
                                  bold=True)
            elif idx == 1 or not body_idx1_only:
                style_placeholder(ph, font_name=FONT_BODY, **body_style)

    # Style the slide master itself
    set_slide_bg(master, WHITE)

    deck.save('reference.pptx')
    print('Styled reference.pptx successfully.')


if __name__ == '__main__':
    main()
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@machinespirits/eval",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.3.0",
|
|
4
4
|
"description": "Evaluation system for Machine Spirits tutor - benchmarking, rubric evaluation, and analysis tools",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "index.js",
|
|
@@ -17,7 +17,8 @@
|
|
|
17
17
|
"services/",
|
|
18
18
|
"config/",
|
|
19
19
|
"scripts/",
|
|
20
|
-
"
|
|
20
|
+
"content/",
|
|
21
|
+
"docs/research/"
|
|
21
22
|
],
|
|
22
23
|
"scripts": {
|
|
23
24
|
"start": "STANDALONE=true node server.js",
|
|
@@ -26,7 +27,7 @@
|
|
|
26
27
|
"eval:quick": "node scripts/eval-cli.js quick",
|
|
27
28
|
"eval:test": "node scripts/eval-cli.js test",
|
|
28
29
|
"seed": "node scripts/seed-db.js",
|
|
29
|
-
"test": "node --test --test-force-exit
|
|
30
|
+
"test": "node --test --test-force-exit services/__tests__/*.test.js tests/*.test.js",
|
|
30
31
|
"content:validate": "node scripts/validate-content.js"
|
|
31
32
|
},
|
|
32
33
|
"keywords": [
|
|
@@ -44,7 +45,7 @@
|
|
|
44
45
|
},
|
|
45
46
|
"peerDependencies": {
|
|
46
47
|
"@anthropic-ai/sdk": "0.71.2",
|
|
47
|
-
"@machinespirits/tutor-core": "0.3.1"
|
|
48
|
+
"@machinespirits/tutor-core": ">=0.3.1"
|
|
48
49
|
},
|
|
49
50
|
"peerDependenciesMeta": {
|
|
50
51
|
"@anthropic-ai/sdk": {
|
|
@@ -62,6 +63,6 @@
|
|
|
62
63
|
"@types/node": "22.14.0"
|
|
63
64
|
},
|
|
64
65
|
"engines": {
|
|
65
|
-
"node": ">=
|
|
66
|
+
"node": ">=20.0.0"
|
|
66
67
|
}
|
|
67
68
|
}
|
|
@@ -120,29 +120,81 @@ function tCDF(t, df) {
|
|
|
120
120
|
return 1 - 0.5 * incompleteBeta(df / 2, 0.5, x);
|
|
121
121
|
}
|
|
122
122
|
|
|
123
|
-
//
|
|
123
|
+
// Regularized incomplete beta function I_x(a, b) via continued fraction.
|
|
124
|
+
// Uses the standard DLMF 8.17.22 recurrence (Numerical Recipes §6.4).
|
|
124
125
|
/**
 * Regularized incomplete beta function I_x(a, b).
 *
 * Evaluated with a continued fraction using the modified Lentz method. When
 * x lies above (a + 1) / (a + b + 2) the symmetry
 * I_x(a, b) = 1 - I_{1-x}(b, a) is applied first.
 *
 * @param {number} a - First shape parameter.
 * @param {number} b - Second shape parameter.
 * @param {number} x - Evaluation point; values <= 0 give 0 and values >= 1 give 1.
 * @returns {number} The value of I_x(a, b). If the fraction has not met the
 *   tolerance after 200 iterations, the estimate reached so far is returned.
 */
function incompleteBeta(a, b, x) {
  if (x <= 0) return 0;
  if (x >= 1) return 1;

  // Use symmetry relation when x > (a+1)/(a+b+2) for faster convergence
  if (x > (a + 1) / (a + b + 2)) {
    return 1 - incompleteBeta(b, a, 1 - x);
  }

  const logBeta = lnGamma(a) + lnGamma(b) - lnGamma(a + b);
  const prefactor = Math.exp(Math.log(x) * a + Math.log(1 - x) * b - logBeta);

  const MAX_TERMS = 200;
  const TOLERANCE = 3e-14;
  const TINY = 1e-30;
  // Keep intermediate values away from zero so the reciprocals stay finite.
  const clamp = (value) => (Math.abs(value) < TINY ? TINY : value);

  const sumAB = a + b;
  const aPlusOne = a + 1;
  const aMinusOne = a - 1;

  let numer = 1;
  let denom = 1 / clamp(1 - sumAB * x / aPlusOne);
  let fraction = denom;

  // Fold one continued-fraction coefficient into the running estimate and
  // return the multiplicative change it caused.
  const advance = (coef) => {
    denom = 1 / clamp(1 + coef * denom);
    numer = clamp(1 + coef / numer);
    const ratio = denom * numer;
    fraction *= ratio;
    return ratio;
  };

  for (let m = 1; m <= MAX_TERMS; m += 1) {
    const twoM = 2 * m;

    // Even step: d_{2m}
    advance(m * (b - m) * x / ((aMinusOne + twoM) * (a + twoM)));

    // Odd step: d_{2m+1}
    const ratio = advance(-(a + m) * (sumAB + m) * x / ((a + twoM) * (aPlusOne + twoM)));

    // Converged once the odd step no longer changes the estimate.
    if (Math.abs(ratio - 1) < TOLERANCE) break;
  }

  return prefactor * fraction / a;
}
|
|
178
|
+
|
|
179
|
+
// Log-gamma function (avoids overflow for large arguments)
|
|
180
|
+
/**
 * Natural logarithm of the gamma function, via the Lanczos approximation
 * (g = 7, nine coefficients).
 *
 * @param {number} z - Argument.
 * @returns {number} ln Γ(z); `Infinity` for any z <= 0. Arguments in (0, 0.5)
 *   are handled through the reflection formula.
 */
function lnGamma(z) {
  if (z <= 0) return Infinity;

  if (z < 0.5) {
    // Reflection: Γ(z) Γ(1 - z) = π / sin(π z)
    return Math.log(Math.PI / Math.sin(Math.PI * z)) - lnGamma(1 - z);
  }

  const LANCZOS_G = 7;
  const LANCZOS_COEFFS = [
    0.99999999999980993, 676.5203681218851, -1259.1392167224028,
    771.32342877765313, -176.61502916214059, 12.507343278686905,
    -0.13857109526572012, 9.9843695780195716e-6, 1.5056327351493116e-7,
  ];

  const shifted = z - 1;
  const [leading, ...rest] = LANCZOS_COEFFS;
  // Accumulate in ascending index order so rounding matches a plain loop.
  const series = rest.reduce(
    (acc, coef, k) => acc + coef / (shifted + (k + 1)),
    leading,
  );

  const t = shifted + LANCZOS_G + 0.5;
  return 0.5 * Math.log(2 * Math.PI) + (shifted + 0.5) * Math.log(t) - t + Math.log(series);
}
|
|
147
199
|
|
|
148
200
|
function gamma(n) {
|