@machinespirits/eval 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +91 -9
- package/config/eval-settings.yaml +3 -3
- package/config/paper-manifest.json +486 -0
- package/config/providers.yaml +9 -6
- package/config/tutor-agents.yaml +2261 -0
- package/content/README.md +23 -0
- package/content/courses/479/course.md +53 -0
- package/content/courses/479/lecture-1.md +361 -0
- package/content/courses/479/lecture-2.md +360 -0
- package/content/courses/479/lecture-3.md +655 -0
- package/content/courses/479/lecture-4.md +530 -0
- package/content/courses/479/lecture-5.md +326 -0
- package/content/courses/479/lecture-6.md +346 -0
- package/content/courses/479/lecture-7.md +326 -0
- package/content/courses/479/lecture-8.md +273 -0
- package/content/courses/479/roadmap-slides.md +656 -0
- package/content/manifest.yaml +8 -0
- package/docs/research/build.sh +44 -20
- package/docs/research/figures/figure10.png +0 -0
- package/docs/research/figures/figure11.png +0 -0
- package/docs/research/figures/figure3.png +0 -0
- package/docs/research/figures/figure4.png +0 -0
- package/docs/research/figures/figure5.png +0 -0
- package/docs/research/figures/figure6.png +0 -0
- package/docs/research/figures/figure7.png +0 -0
- package/docs/research/figures/figure8.png +0 -0
- package/docs/research/figures/figure9.png +0 -0
- package/docs/research/header.tex +23 -2
- package/docs/research/paper-full.md +941 -285
- package/docs/research/paper-short.md +216 -585
- package/docs/research/references.bib +132 -0
- package/docs/research/slides-header.tex +188 -0
- package/docs/research/slides-pptx.md +363 -0
- package/docs/research/slides.md +531 -0
- package/docs/research/style-reference-pptx.py +199 -0
- package/package.json +6 -5
- package/scripts/analyze-eval-results.js +69 -17
- package/scripts/analyze-mechanism-traces.js +763 -0
- package/scripts/analyze-modulation-learning.js +498 -0
- package/scripts/analyze-prosthesis.js +144 -0
- package/scripts/analyze-run.js +264 -79
- package/scripts/assess-transcripts.js +853 -0
- package/scripts/browse-transcripts.js +854 -0
- package/scripts/check-parse-failures.js +73 -0
- package/scripts/code-dialectical-modulation.js +1320 -0
- package/scripts/download-data.sh +55 -0
- package/scripts/eval-cli.js +106 -18
- package/scripts/generate-paper-figures.js +663 -0
- package/scripts/generate-paper-figures.py +577 -76
- package/scripts/generate-paper-tables.js +299 -0
- package/scripts/qualitative-analysis-ai.js +3 -3
- package/scripts/render-sequence-diagram.js +694 -0
- package/scripts/test-latency.js +210 -0
- package/scripts/test-rate-limit.js +95 -0
- package/scripts/test-token-budget.js +332 -0
- package/scripts/validate-paper-manifest.js +670 -0
- package/services/__tests__/evalConfigLoader.test.js +2 -2
- package/services/__tests__/learnerRubricEvaluator.test.js +361 -0
- package/services/__tests__/learnerTutorInteractionEngine.test.js +326 -0
- package/services/evaluationRunner.js +975 -98
- package/services/evaluationStore.js +12 -4
- package/services/learnerTutorInteractionEngine.js +27 -2
- package/services/mockProvider.js +133 -0
- package/services/promptRewriter.js +1471 -5
- package/services/rubricEvaluator.js +55 -2
- package/services/transcriptFormatter.js +675 -0
- package/docs/EVALUATION-VARIABLES.md +0 -589
- package/docs/REPLICATION-PLAN.md +0 -577
- package/scripts/analyze-run.mjs +0 -282
- package/scripts/compare-runs.js +0 -44
- package/scripts/compare-suggestions.js +0 -80
- package/scripts/dig-into-run.js +0 -158
- package/scripts/show-failed-suggestions.js +0 -64
- /package/scripts/{check-run.mjs → check-run.js} +0 -0
package/config/providers.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
# Shared AI Provider Configuration
|
|
2
2
|
# Used by both tutor-agents.yaml and evaluation-rubric.yaml
|
|
3
3
|
#
|
|
4
|
-
# Model IDs are current as of
|
|
4
|
+
# Model IDs are current as of February 2026. Update when new models release.
|
|
5
5
|
|
|
6
6
|
providers:
|
|
7
7
|
anthropic:
|
|
@@ -11,7 +11,7 @@ providers:
|
|
|
11
11
|
models:
|
|
12
12
|
haiku: claude-haiku-4-5
|
|
13
13
|
sonnet: claude-sonnet-4-5
|
|
14
|
-
opus: claude-opus-4-
|
|
14
|
+
opus: claude-opus-4-6
|
|
15
15
|
|
|
16
16
|
openai:
|
|
17
17
|
api_key_env: OPENAI_API_KEY
|
|
@@ -26,21 +26,24 @@ providers:
|
|
|
26
26
|
base_url: https://openrouter.ai/api/v1/chat/completions
|
|
27
27
|
default_model: nvidia/nemotron-3-nano-30b-a3b:free
|
|
28
28
|
models:
|
|
29
|
-
# Budget-friendly options
|
|
29
|
+
# Budget-friendly options
|
|
30
30
|
nemotron: nvidia/nemotron-3-nano-30b-a3b:free
|
|
31
31
|
glm47: z-ai/glm-4.7
|
|
32
|
+
glm5: z-ai/glm-5
|
|
32
33
|
kimi-k2: moonshotai/kimi-k2-thinking
|
|
33
34
|
"kimi-k2.5": moonshotai/kimi-k2.5
|
|
34
35
|
deepseek: deepseek/deepseek-v3.2
|
|
35
|
-
minimax: minimax/minimax-m2.
|
|
36
|
+
minimax: minimax/minimax-m2.5
|
|
36
37
|
haiku: anthropic/claude-haiku-4.5
|
|
37
38
|
gpt-oss: openai/gpt-oss-120b
|
|
39
|
+
qwen3.5-plus: qwen/qwen3.5-plus-02-15
|
|
40
|
+
qwen3.5: qwen/qwen3.5-397b-a17b
|
|
38
41
|
# Mid-tier options
|
|
39
|
-
sonnet: anthropic/claude-sonnet-4.
|
|
42
|
+
sonnet: anthropic/claude-sonnet-4.6
|
|
40
43
|
gpt-mini: openai/gpt-5-mini
|
|
41
44
|
gemini-flash: google/gemini-3-flash-preview
|
|
42
45
|
# Premium options
|
|
43
|
-
opus: anthropic/claude-opus-4.
|
|
46
|
+
opus: anthropic/claude-opus-4.6
|
|
44
47
|
gpt: openai/gpt-5.2
|
|
45
48
|
gemini-pro: google/gemini-3-pro-preview
|
|
46
49
|
|