@machinespirits/eval 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. package/README.md +91 -9
  2. package/config/eval-settings.yaml +3 -3
  3. package/config/paper-manifest.json +486 -0
  4. package/config/providers.yaml +9 -6
  5. package/config/tutor-agents.yaml +2261 -0
  6. package/content/README.md +23 -0
  7. package/content/courses/479/course.md +53 -0
  8. package/content/courses/479/lecture-1.md +361 -0
  9. package/content/courses/479/lecture-2.md +360 -0
  10. package/content/courses/479/lecture-3.md +655 -0
  11. package/content/courses/479/lecture-4.md +530 -0
  12. package/content/courses/479/lecture-5.md +326 -0
  13. package/content/courses/479/lecture-6.md +346 -0
  14. package/content/courses/479/lecture-7.md +326 -0
  15. package/content/courses/479/lecture-8.md +273 -0
  16. package/content/courses/479/roadmap-slides.md +656 -0
  17. package/content/manifest.yaml +8 -0
  18. package/docs/research/build.sh +44 -20
  19. package/docs/research/figures/figure10.png +0 -0
  20. package/docs/research/figures/figure11.png +0 -0
  21. package/docs/research/figures/figure3.png +0 -0
  22. package/docs/research/figures/figure4.png +0 -0
  23. package/docs/research/figures/figure5.png +0 -0
  24. package/docs/research/figures/figure6.png +0 -0
  25. package/docs/research/figures/figure7.png +0 -0
  26. package/docs/research/figures/figure8.png +0 -0
  27. package/docs/research/figures/figure9.png +0 -0
  28. package/docs/research/header.tex +23 -2
  29. package/docs/research/paper-full.md +941 -285
  30. package/docs/research/paper-short.md +216 -585
  31. package/docs/research/references.bib +132 -0
  32. package/docs/research/slides-header.tex +188 -0
  33. package/docs/research/slides-pptx.md +363 -0
  34. package/docs/research/slides.md +531 -0
  35. package/docs/research/style-reference-pptx.py +199 -0
  36. package/package.json +6 -5
  37. package/scripts/analyze-eval-results.js +69 -17
  38. package/scripts/analyze-mechanism-traces.js +763 -0
  39. package/scripts/analyze-modulation-learning.js +498 -0
  40. package/scripts/analyze-prosthesis.js +144 -0
  41. package/scripts/analyze-run.js +264 -79
  42. package/scripts/assess-transcripts.js +853 -0
  43. package/scripts/browse-transcripts.js +854 -0
  44. package/scripts/check-parse-failures.js +73 -0
  45. package/scripts/code-dialectical-modulation.js +1320 -0
  46. package/scripts/download-data.sh +55 -0
  47. package/scripts/eval-cli.js +106 -18
  48. package/scripts/generate-paper-figures.js +663 -0
  49. package/scripts/generate-paper-figures.py +577 -76
  50. package/scripts/generate-paper-tables.js +299 -0
  51. package/scripts/qualitative-analysis-ai.js +3 -3
  52. package/scripts/render-sequence-diagram.js +694 -0
  53. package/scripts/test-latency.js +210 -0
  54. package/scripts/test-rate-limit.js +95 -0
  55. package/scripts/test-token-budget.js +332 -0
  56. package/scripts/validate-paper-manifest.js +670 -0
  57. package/services/__tests__/evalConfigLoader.test.js +2 -2
  58. package/services/__tests__/learnerRubricEvaluator.test.js +361 -0
  59. package/services/__tests__/learnerTutorInteractionEngine.test.js +326 -0
  60. package/services/evaluationRunner.js +975 -98
  61. package/services/evaluationStore.js +12 -4
  62. package/services/learnerTutorInteractionEngine.js +27 -2
  63. package/services/mockProvider.js +133 -0
  64. package/services/promptRewriter.js +1471 -5
  65. package/services/rubricEvaluator.js +55 -2
  66. package/services/transcriptFormatter.js +675 -0
  67. package/docs/EVALUATION-VARIABLES.md +0 -589
  68. package/docs/REPLICATION-PLAN.md +0 -577
  69. package/scripts/analyze-run.mjs +0 -282
  70. package/scripts/compare-runs.js +0 -44
  71. package/scripts/compare-suggestions.js +0 -80
  72. package/scripts/dig-into-run.js +0 -158
  73. package/scripts/show-failed-suggestions.js +0 -64
  74. /package/scripts/{check-run.mjs → check-run.js} +0 -0
@@ -0,0 +1,199 @@
1
+ #!/usr/bin/env python3
2
+ """Style the pandoc reference.pptx to match the Geist in the Machine theme."""
3
+
4
+ from pptx import Presentation
5
+ from pptx.util import Inches, Pt, Emu
6
+ from pptx.dml.color import RGBColor
7
+ from pptx.enum.text import PP_ALIGN, MSO_ANCHOR
8
+ from pptx.oxml.ns import qn, nsmap
9
+ import copy
10
+
11
+ # ─── Color Palette (matches slides-header.tex) ──────────────
12
+ GEIST_PRIMARY = RGBColor(0x1B, 0x28, 0x38) # Dark slate
13
+ GEIST_ACCENT = RGBColor(0xD4, 0x87, 0x2C) # Warm amber
14
+ GEIST_LIGHT = RGBColor(0xF5, 0xF2, 0xEB) # Warm off-white
15
+ GEIST_MID = RGBColor(0x5C, 0x6B, 0x7A) # Medium slate
16
+ GEIST_TEXT = RGBColor(0x2D, 0x34, 0x36) # Near-black
17
+ WHITE = RGBColor(0xFF, 0xFF, 0xFF)
18
+
19
+ FONT_HEADING = 'Helvetica Neue'
20
+ FONT_BODY = 'Helvetica Neue'
21
+ FONT_MONO = 'Fira Mono'
22
+
23
+
24
def set_slide_bg(slide_layout, color):
    """Give ``slide_layout`` a solid background fill of ``color``.

    Args:
        slide_layout: Any object exposing ``background.fill``; this script
            passes slide layouts and the slide master.
        color: Value assigned to the fill's ``fore_color.rgb``; callers in
            this script pass ``RGBColor`` palette constants.
    """
    background_fill = slide_layout.background.fill
    # The fill has to be switched to solid before a foreground colour is set.
    background_fill.solid()
    background_fill.fore_color.rgb = color
30
+
31
+
32
def style_placeholder(ph, font_name=None, font_size=None, font_color=None,
                      bold=None, alignment=None):
    """Style a placeholder's existing text and its default run properties.

    Two things are updated:

    1. Every run already present in the placeholder's text frame gets the
       requested font name, size, colour and bold flag; every paragraph
       gets the requested alignment.
    2. When ``font_name`` is given, each ``a:defRPr`` found under the
       placeholder's ``a:lstStyle`` gets its ``a:latin`` typeface set, so
       text added later picks up the same font.

    Args:
        ph: Placeholder shape. Must expose ``has_text_frame``,
            ``text_frame`` and the private ``_sp`` XML element.
        font_name: Typeface name; skipped when falsy.
        font_size: Font size as passed by ``main`` (e.g. ``Pt(18)``);
            skipped when falsy.
        font_color: Value assigned to ``run.font.color.rgb``; skipped when
            falsy.
        bold: Bold flag; ``None`` leaves the current setting alone.
        alignment: Paragraph alignment; ``None`` leaves it alone.
    """
    if ph.has_text_frame:
        for paragraph in ph.text_frame.paragraphs:
            if alignment is not None:
                paragraph.alignment = alignment
            for run in paragraph.runs:
                if font_name:
                    run.font.name = font_name
                if font_size:
                    run.font.size = font_size
                if font_color:
                    run.font.color.rgb = font_color
                if bold is not None:
                    run.font.bold = bold

    # Only the typeface is written into the default run properties, so there
    # is nothing left to do without a font name.
    if not font_name:
        return

    txBody = ph._sp.find(qn('p:txBody'))
    if txBody is None:
        return

    # DrawingML fixes the child order of run properties: a:latin must come
    # before any of these elements. Appending it after one of them produces
    # XML that is out of schema order.
    latin_successors = {
        qn(tag) for tag in ('a:ea', 'a:cs', 'a:sym', 'a:hlinkClick',
                            'a:hlinkMouseOver', 'a:rtl', 'a:extLst')
    }

    for lst_style in txBody.findall(qn('a:lstStyle')):
        for level in lst_style:
            defRPr = level.find(qn('a:defRPr'))
            if defRPr is None:
                continue

            latin = defRPr.find(qn('a:latin'))
            if latin is not None:
                latin.set('typeface', font_name)
                continue

            latin = defRPr.makeelement(qn('a:latin'), {'typeface': font_name})
            successor = next(
                (child for child in defRPr if child.tag in latin_successors),
                None,
            )
            if successor is None:
                defRPr.append(latin)
            else:
                # Insert in front of the first element that must follow it.
                successor.addprevious(latin)
64
+
65
+
66
def update_theme_colors(prs):
    """Rewrite the first slide master's theme to use the Geist palette and fonts.

    The theme part is located through the slide master's relationships
    (the first relationship whose type contains ``'theme'``). Its XML is
    parsed, every ``a:clrScheme`` has the mapped colour slots replaced by a
    single ``a:srgbClr`` child, every ``a:fontScheme`` has the ``a:latin``
    typeface of its major/minor font set when that element exists, and the
    serialized result is stored back on the part's private ``_blob``
    attribute. Nothing happens if no theme relationship is found.

    Args:
        prs: Presentation whose first slide master's theme is modified
            in place.
    """
    from lxml import etree

    # Theme colour slot -> replacement colour.
    palette = {
        'dk1': GEIST_PRIMARY,                   # Dark 1
        'dk2': GEIST_TEXT,                      # Dark 2
        'lt1': WHITE,                           # Light 1
        'lt2': GEIST_LIGHT,                     # Light 2
        'accent1': GEIST_ACCENT,                # Accent 1
        'accent2': GEIST_MID,                   # Accent 2
        'accent3': RGBColor(0x27, 0xAE, 0x60),  # Green
        'accent4': RGBColor(0xC0, 0x39, 0x2B),  # Red
        'hlink': GEIST_ACCENT,                  # Hyperlink
        'folHlink': GEIST_MID,                  # Followed hyperlink
    }
    # Font scheme slot -> typeface.
    typefaces = (('majorFont', FONT_HEADING), ('minorFont', FONT_BODY))

    master = prs.slide_masters[0]

    for rel in master.part.rels.values():
        if 'theme' not in rel.reltype:
            continue

        theme_part = rel.target_part
        theme_root = etree.fromstring(theme_part.blob)

        for scheme in theme_root.findall('.//' + qn('a:clrScheme')):
            for slot, rgb in palette.items():
                slot_el = scheme.find(qn(f'a:{slot}'))
                if slot_el is None:
                    continue
                # Drop whatever colour definition is there and replace it
                # with an explicit sRGB value.
                for stale in list(slot_el):
                    slot_el.remove(stale)
                slot_el.append(
                    slot_el.makeelement(qn('a:srgbClr'), {'val': str(rgb)})
                )

        for fonts in theme_root.findall('.//' + qn('a:fontScheme')):
            for slot, typeface in typefaces:
                slot_el = fonts.find(qn(f'a:{slot}'))
                if slot_el is None:
                    continue
                latin = slot_el.find(qn('a:latin'))
                if latin is not None:
                    latin.set('typeface', typeface)

        # Store the modified XML back on the part.
        # NOTE(review): relies on the private `_blob` attribute being what
        # the package writes on save -- confirm against the pptx version in use.
        theme_part._blob = etree.tostring(theme_root, xml_declaration=True,
                                          encoding='UTF-8', standalone=True)
        # Only the first theme relationship is processed.
        break
121
+
122
+
123
def main():
    """Restyle ``reference.pptx`` in the current directory and save it in place.

    Steps: update the theme colours/fonts, then walk every layout of the
    first slide master and pick a background plus placeholder styling from
    the layout's (lower-cased) name, then set the master background to
    white and overwrite ``reference.pptx``.
    """
    prs = Presentation('reference.pptx')
    update_theme_colors(prs)

    master = prs.slide_masters[0]

    def heading(points, color):
        # Title placeholders always use the heading font in bold.
        return dict(font_name=FONT_HEADING, font_size=Pt(points),
                    font_color=color, bold=True)

    for layout in master.slide_layouts:
        name = layout.name.lower()
        mentions_content = 'content' in name

        # Each plan is:
        #   (background, title style, style for other placeholders,
        #    whether "other" means only idx 1 rather than every non-title)
        if 'title' in name and not mentions_content:
            # Title slide - dark background, amber subtitle.
            plan = (GEIST_PRIMARY, heading(36, WHITE),
                    dict(font_name=FONT_BODY, font_size=Pt(18),
                         font_color=GEIST_ACCENT),
                    True)
        elif 'section' in name:
            # Section header - dark background; no size override for the rest.
            plan = (GEIST_PRIMARY, heading(32, WHITE),
                    dict(font_name=FONT_BODY, font_color=GEIST_ACCENT),
                    False)
        elif 'two' in name and mentions_content:
            # Two-column layout.
            plan = (WHITE, heading(28, GEIST_PRIMARY),
                    dict(font_name=FONT_BODY, font_size=Pt(16),
                         font_color=GEIST_TEXT),
                    False)
        elif 'blank' in name:
            # Blank layout: background only, placeholders untouched.
            plan = (WHITE, None, None, False)
        else:
            # Content slides - white background. No title border is drawn.
            plan = (WHITE, heading(28, GEIST_PRIMARY),
                    dict(font_name=FONT_BODY, font_size=Pt(18),
                         font_color=GEIST_TEXT),
                    True)

        background, title_style, other_style, only_idx_one = plan
        set_slide_bg(layout, background)
        if title_style is None:
            continue

        for ph in layout.placeholders:
            idx = ph.placeholder_format.idx
            if idx == 0:  # Title
                style_placeholder(ph, **title_style)
            elif idx == 1 or not only_idx_one:
                style_placeholder(ph, **other_style)

    # Style the slide master itself
    set_slide_bg(master, WHITE)

    prs.save('reference.pptx')
    print('Styled reference.pptx successfully.')
196
+
197
+
198
+ if __name__ == '__main__':
199
+ main()
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@machinespirits/eval",
3
- "version": "0.2.0",
3
+ "version": "0.3.0",
4
4
  "description": "Evaluation system for Machine Spirits tutor - benchmarking, rubric evaluation, and analysis tools",
5
5
  "type": "module",
6
6
  "main": "index.js",
@@ -17,7 +17,8 @@
17
17
  "services/",
18
18
  "config/",
19
19
  "scripts/",
20
- "docs/"
20
+ "content/",
21
+ "docs/research/"
21
22
  ],
22
23
  "scripts": {
23
24
  "start": "STANDALONE=true node server.js",
@@ -26,7 +27,7 @@
26
27
  "eval:quick": "node scripts/eval-cli.js quick",
27
28
  "eval:test": "node scripts/eval-cli.js test",
28
29
  "seed": "node scripts/seed-db.js",
29
- "test": "node --test --test-force-exit 'services/__tests__/*.test.js' 'tests/*.test.js'",
30
+ "test": "node --test --test-force-exit services/__tests__/*.test.js tests/*.test.js",
30
31
  "content:validate": "node scripts/validate-content.js"
31
32
  },
32
33
  "keywords": [
@@ -44,7 +45,7 @@
44
45
  },
45
46
  "peerDependencies": {
46
47
  "@anthropic-ai/sdk": "0.71.2",
47
- "@machinespirits/tutor-core": "0.3.1"
48
+ "@machinespirits/tutor-core": ">=0.3.1"
48
49
  },
49
50
  "peerDependenciesMeta": {
50
51
  "@anthropic-ai/sdk": {
@@ -62,6 +63,6 @@
62
63
  "@types/node": "22.14.0"
63
64
  },
64
65
  "engines": {
65
- "node": ">=18.0.0"
66
+ "node": ">=20.0.0"
66
67
  }
67
68
  }
@@ -120,29 +120,81 @@ function tCDF(t, df) {
120
120
  return 1 - 0.5 * incompleteBeta(df / 2, 0.5, x);
121
121
  }
122
122
 
123
- // Incomplete beta function approximation (very simplified)
123
/**
 * Regularized incomplete beta function I_x(a, b), evaluated with a
 * continued fraction (modified Lentz's method; the recurrence is the one
 * cited in the original notes as DLMF 8.17.22 / Numerical Recipes §6.4).
 *
 * @param {number} a - First shape parameter.
 * @param {number} b - Second shape parameter.
 * @param {number} x - Upper integration limit.
 * @returns {number} 0 when x <= 0, 1 when x >= 1, otherwise the continued
 *   fraction estimate. The loop stops after at most 200 iterations and does
 *   not signal non-convergence.
 */
function incompleteBeta(a, b, x) {
  if (x <= 0) return 0;
  if (x >= 1) return 1;

  // Past this point the fraction converges faster for the mirrored problem,
  // so evaluate I_{1-x}(b, a) and use I_x(a, b) = 1 - I_{1-x}(b, a).
  if (x > (a + 1) / (a + b + 2)) {
    return 1 - incompleteBeta(b, a, 1 - x);
  }

  const MAX_ITERATIONS = 200;
  const TOLERANCE = 3e-14;
  const TINY = 1e-30;
  // Lentz's method divides by these terms; nudge them off zero.
  const awayFromZero = (value) => (Math.abs(value) < TINY ? TINY : value);

  // x^a (1-x)^b / B(a, b), computed in log space.
  const logBeta = lnGamma(a) + lnGamma(b) - lnGamma(a + b);
  const prefactor = Math.exp(Math.log(x) * a + Math.log(1 - x) * b - logBeta);

  const sumAB = a + b;
  const aPlus1 = a + 1;
  const aMinus1 = a - 1;

  let ratioC = 1;
  let ratioD = 1 / awayFromZero(1 - sumAB * x / aPlus1);
  let fraction = ratioD;

  for (let m = 1; m <= MAX_ITERATIONS; m++) {
    const twoM = 2 * m;

    // Even step of the recurrence.
    const evenCoef = m * (b - m) * x / ((aMinus1 + twoM) * (a + twoM));
    ratioD = 1 / awayFromZero(1 + evenCoef * ratioD);
    ratioC = awayFromZero(1 + evenCoef / ratioC);
    fraction *= ratioD * ratioC;

    // Odd step of the recurrence.
    const oddCoef = -(a + m) * (sumAB + m) * x / ((a + twoM) * (aPlus1 + twoM));
    ratioD = 1 / awayFromZero(1 + oddCoef * ratioD);
    ratioC = awayFromZero(1 + oddCoef / ratioC);
    const delta = ratioD * ratioC;
    fraction *= delta;

    // Converged once the latest multiplicative update is indistinguishable from 1.
    if (Math.abs(delta - 1) < TOLERANCE) break;
  }

  return prefactor * fraction / a;
}
178
+
179
/**
 * Natural log of the gamma function, ln Γ(z), via the Lanczos
 * approximation (g = 7, nine coefficients). Working in log space avoids
 * the overflow a direct Γ(z) would hit for large arguments.
 *
 * @param {number} z - Argument.
 * @returns {number} ln Γ(z) for z > 0; Infinity for z <= 0 and for
 *   z = +Infinity; NaN for NaN.
 */
function lnGamma(z) {
  if (z <= 0) return Infinity;
  // Without this guard the final sum evaluates Infinity - Infinity = NaN.
  if (z === Infinity) return Infinity;

  // Reflection formula for small arguments, where the series is less accurate.
  if (z < 0.5) {
    return Math.log(Math.PI / Math.sin(Math.PI * z)) - lnGamma(1 - z);
  }

  const g = 7;
  const coefficients = [
    0.99999999999980993, 676.5203681218851, -1259.1392167224028,
    771.32342877765313, -176.61502916214059, 12.507343278686905,
    -0.13857109526572012, 9.9843695780195716e-6, 1.5056327351493116e-7,
  ];

  // The Lanczos series is written in terms of z - 1; keep the parameter intact.
  const shifted = z - 1;
  let series = coefficients[0];
  for (let i = 1; i < g + 2; i++) {
    series += coefficients[i] / (shifted + i);
  }

  const t = shifted + g + 0.5;
  return 0.5 * Math.log(2 * Math.PI) + (shifted + 0.5) * Math.log(t) - t + Math.log(series);
}
147
199
 
148
200
  function gamma(n) {