@machinespirits/eval 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +91 -9
- package/config/eval-settings.yaml +3 -3
- package/config/paper-manifest.json +486 -0
- package/config/providers.yaml +9 -6
- package/config/tutor-agents.yaml +2261 -0
- package/content/README.md +23 -0
- package/content/courses/479/course.md +53 -0
- package/content/courses/479/lecture-1.md +361 -0
- package/content/courses/479/lecture-2.md +360 -0
- package/content/courses/479/lecture-3.md +655 -0
- package/content/courses/479/lecture-4.md +530 -0
- package/content/courses/479/lecture-5.md +326 -0
- package/content/courses/479/lecture-6.md +346 -0
- package/content/courses/479/lecture-7.md +326 -0
- package/content/courses/479/lecture-8.md +273 -0
- package/content/courses/479/roadmap-slides.md +656 -0
- package/content/manifest.yaml +8 -0
- package/docs/research/build.sh +44 -20
- package/docs/research/figures/figure10.png +0 -0
- package/docs/research/figures/figure11.png +0 -0
- package/docs/research/figures/figure3.png +0 -0
- package/docs/research/figures/figure4.png +0 -0
- package/docs/research/figures/figure5.png +0 -0
- package/docs/research/figures/figure6.png +0 -0
- package/docs/research/figures/figure7.png +0 -0
- package/docs/research/figures/figure8.png +0 -0
- package/docs/research/figures/figure9.png +0 -0
- package/docs/research/header.tex +23 -2
- package/docs/research/paper-full.md +941 -285
- package/docs/research/paper-short.md +216 -585
- package/docs/research/references.bib +132 -0
- package/docs/research/slides-header.tex +188 -0
- package/docs/research/slides-pptx.md +363 -0
- package/docs/research/slides.md +531 -0
- package/docs/research/style-reference-pptx.py +199 -0
- package/package.json +6 -5
- package/scripts/analyze-eval-results.js +69 -17
- package/scripts/analyze-mechanism-traces.js +763 -0
- package/scripts/analyze-modulation-learning.js +498 -0
- package/scripts/analyze-prosthesis.js +144 -0
- package/scripts/analyze-run.js +264 -79
- package/scripts/assess-transcripts.js +853 -0
- package/scripts/browse-transcripts.js +854 -0
- package/scripts/check-parse-failures.js +73 -0
- package/scripts/code-dialectical-modulation.js +1320 -0
- package/scripts/download-data.sh +55 -0
- package/scripts/eval-cli.js +106 -18
- package/scripts/generate-paper-figures.js +663 -0
- package/scripts/generate-paper-figures.py +577 -76
- package/scripts/generate-paper-tables.js +299 -0
- package/scripts/qualitative-analysis-ai.js +3 -3
- package/scripts/render-sequence-diagram.js +694 -0
- package/scripts/test-latency.js +210 -0
- package/scripts/test-rate-limit.js +95 -0
- package/scripts/test-token-budget.js +332 -0
- package/scripts/validate-paper-manifest.js +670 -0
- package/services/__tests__/evalConfigLoader.test.js +2 -2
- package/services/__tests__/learnerRubricEvaluator.test.js +361 -0
- package/services/__tests__/learnerTutorInteractionEngine.test.js +326 -0
- package/services/evaluationRunner.js +975 -98
- package/services/evaluationStore.js +12 -4
- package/services/learnerTutorInteractionEngine.js +27 -2
- package/services/mockProvider.js +133 -0
- package/services/promptRewriter.js +1471 -5
- package/services/rubricEvaluator.js +55 -2
- package/services/transcriptFormatter.js +675 -0
- package/docs/EVALUATION-VARIABLES.md +0 -589
- package/docs/REPLICATION-PLAN.md +0 -577
- package/scripts/analyze-run.mjs +0 -282
- package/scripts/compare-runs.js +0 -44
- package/scripts/compare-suggestions.js +0 -80
- package/scripts/dig-into-run.js +0 -158
- package/scripts/show-failed-suggestions.js +0 -64
- /package/scripts/{check-run.mjs → check-run.js} +0 -0
|
@@ -16,6 +16,59 @@ function debugLog(...args) {
|
|
|
16
16
|
}
|
|
17
17
|
}
|
|
18
18
|
|
|
19
|
+
/**
 * Canonical short labels for known judge models, keyed by "vendor/model" ID.
 *
 * Kept in a Map (rather than a plain object) so that lookups with arbitrary
 * strings such as "constructor" or "__proto__" can never resolve to inherited
 * Object.prototype members. Built once at module load instead of per call.
 */
const JUDGE_MODEL_LABELS = new Map([
  ['anthropic/claude-opus-4.5', 'claude-opus-4.5'],
  ['anthropic/claude-opus-4-5', 'claude-opus-4.5'],
  ['anthropic/claude-opus-4-6', 'claude-opus-4.6'],
  ['anthropic/claude-sonnet-4.5', 'claude-sonnet-4.5'],
  ['anthropic/claude-sonnet-4-5', 'claude-sonnet-4.5'],
  ['anthropic/claude-haiku-4.5', 'claude-haiku-4.5'],
  ['anthropic/claude-haiku-4-5', 'claude-haiku-4.5'],
  ['openai/gpt-5.2', 'gpt-5.2'],
  ['openai/gpt-5-mini', 'gpt-5-mini'],
  ['openai/gpt-oss-120b', 'gpt-oss-120b'],
  ['moonshotai/kimi-k2.5', 'kimi-k2.5'],
  ['moonshotai/kimi-k2-thinking', 'kimi-k2'],
  ['deepseek/deepseek-v3.2', 'deepseek-v3.2'],
  ['z-ai/glm-4.7', 'glm-4.7'],
  ['z-ai/glm-5', 'glm-5'],
  ['google/gemini-3-flash-preview', 'gemini-3-flash'],
  ['google/gemini-3-pro-preview', 'gemini-3-pro'],
  ['minimax/minimax-m2.5', 'minimax-m2.5'],
]);

/**
 * Normalize a judge model label to a canonical, human-readable form.
 *
 * Resolution order:
 *   1. `model` (minus a trailing ":free") looked up in the known-model table.
 *   2. `provider/model` (minus a trailing ":free") looked up in the same table.
 *   3. Any model ID containing "nemotron" collapses to "nemotron".
 *   4. Fallback: drop a leading known vendor prefix and a trailing ":free".
 *
 * Examples (provider, model → label):
 *   ("openrouter", "anthropic/claude-sonnet-4.5")       → "claude-sonnet-4.5"
 *   ("openrouter", "openai/gpt-5.2")                    → "gpt-5.2"
 *   ("openrouter", "moonshotai/kimi-k2.5")              → "kimi-k2.5"
 *   ("anthropic",  "claude-opus-4-5")                   → "claude-opus-4.5"
 *   ("openrouter", "nvidia/nemotron-...")               → "nemotron"
 *   ("openrouter", "google/gemini-3-flash-preview:free") → "gemini-3-flash"
 *
 * @param {string} [provider] - Provider/router name (e.g. "openrouter", "anthropic").
 * @param {string} [model] - Model ID, optionally vendor-prefixed and/or ":free"-suffixed.
 * @returns {string} The normalized label. When `model` is not a string, the
 *   provider name is returned, or "unknown" if that is missing too.
 */
export function normalizeJudgeLabel(provider, model) {
  // This is called while building error results as well, so it must never
  // throw on a missing model: degrade to the provider name instead.
  if (typeof model !== 'string') {
    return provider ? String(provider) : 'unknown';
  }

  // Strip the ":free" tier suffix before the table lookups so that the free
  // variant of a known model receives the same canonical label.
  const bareModel = model.replace(/:free$/, '');

  // Direct lookup (handles router-style IDs like "anthropic/claude-sonnet-4.5")
  const directLabel = JUDGE_MODEL_LABELS.get(bareModel);
  if (directLabel) return directLabel;

  // Full provider/model path (handles provider="anthropic", model="claude-opus-4-5")
  const qualifiedLabel = JUDGE_MODEL_LABELS.get(`${provider}/${bareModel}`);
  if (qualifiedLabel) return qualifiedLabel;

  // For nvidia/nemotron variants, normalize to "nemotron"
  if (model.includes('nemotron')) return 'nemotron';

  // Fallback: strip common vendor prefixes, keep the model name
  const stripped = bareModel.replace(
    /^(anthropic|openai|moonshotai|deepseek|z-ai|google|minimax|nvidia)\//,
    '',
  );

  // If nothing is left after stripping, report the raw provider/model pair.
  return stripped || `${provider}/${model}`;
}
|
|
71
|
+
|
|
19
72
|
/**
|
|
20
73
|
* Get available judge configuration, resolving model references via providers.yaml
|
|
21
74
|
* Tries primary model first, then fallback if primary is not configured
|
|
@@ -929,7 +982,7 @@ export async function evaluateSuggestion(suggestion, scenario, context = {}, ove
|
|
|
929
982
|
requiredMissing: parsed.validation?.required_missing || [],
|
|
930
983
|
forbiddenFound: parsed.validation?.forbidden_found || [],
|
|
931
984
|
summary: parsed.summary,
|
|
932
|
-
judgeModel:
|
|
985
|
+
judgeModel: normalizeJudgeLabel(judge.provider, judge.model),
|
|
933
986
|
evaluationTimeMs: Date.now() - startTime,
|
|
934
987
|
};
|
|
935
988
|
} catch (error) {
|
|
@@ -940,7 +993,7 @@ export async function evaluateSuggestion(suggestion, scenario, context = {}, ove
|
|
|
940
993
|
baseScore: null,
|
|
941
994
|
recognitionScore: null,
|
|
942
995
|
error: error.message,
|
|
943
|
-
judgeModel:
|
|
996
|
+
judgeModel: normalizeJudgeLabel(judge.provider, judge.model),
|
|
944
997
|
evaluationTimeMs: Date.now() - startTime,
|
|
945
998
|
};
|
|
946
999
|
}
|