@machinespirits/eval 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. package/README.md +91 -9
  2. package/config/eval-settings.yaml +3 -3
  3. package/config/paper-manifest.json +486 -0
  4. package/config/providers.yaml +9 -6
  5. package/config/tutor-agents.yaml +2261 -0
  6. package/content/README.md +23 -0
  7. package/content/courses/479/course.md +53 -0
  8. package/content/courses/479/lecture-1.md +361 -0
  9. package/content/courses/479/lecture-2.md +360 -0
  10. package/content/courses/479/lecture-3.md +655 -0
  11. package/content/courses/479/lecture-4.md +530 -0
  12. package/content/courses/479/lecture-5.md +326 -0
  13. package/content/courses/479/lecture-6.md +346 -0
  14. package/content/courses/479/lecture-7.md +326 -0
  15. package/content/courses/479/lecture-8.md +273 -0
  16. package/content/courses/479/roadmap-slides.md +656 -0
  17. package/content/manifest.yaml +8 -0
  18. package/docs/research/build.sh +44 -20
  19. package/docs/research/figures/figure10.png +0 -0
  20. package/docs/research/figures/figure11.png +0 -0
  21. package/docs/research/figures/figure3.png +0 -0
  22. package/docs/research/figures/figure4.png +0 -0
  23. package/docs/research/figures/figure5.png +0 -0
  24. package/docs/research/figures/figure6.png +0 -0
  25. package/docs/research/figures/figure7.png +0 -0
  26. package/docs/research/figures/figure8.png +0 -0
  27. package/docs/research/figures/figure9.png +0 -0
  28. package/docs/research/header.tex +23 -2
  29. package/docs/research/paper-full.md +941 -285
  30. package/docs/research/paper-short.md +216 -585
  31. package/docs/research/references.bib +132 -0
  32. package/docs/research/slides-header.tex +188 -0
  33. package/docs/research/slides-pptx.md +363 -0
  34. package/docs/research/slides.md +531 -0
  35. package/docs/research/style-reference-pptx.py +199 -0
  36. package/package.json +6 -5
  37. package/scripts/analyze-eval-results.js +69 -17
  38. package/scripts/analyze-mechanism-traces.js +763 -0
  39. package/scripts/analyze-modulation-learning.js +498 -0
  40. package/scripts/analyze-prosthesis.js +144 -0
  41. package/scripts/analyze-run.js +264 -79
  42. package/scripts/assess-transcripts.js +853 -0
  43. package/scripts/browse-transcripts.js +854 -0
  44. package/scripts/check-parse-failures.js +73 -0
  45. package/scripts/code-dialectical-modulation.js +1320 -0
  46. package/scripts/download-data.sh +55 -0
  47. package/scripts/eval-cli.js +106 -18
  48. package/scripts/generate-paper-figures.js +663 -0
  49. package/scripts/generate-paper-figures.py +577 -76
  50. package/scripts/generate-paper-tables.js +299 -0
  51. package/scripts/qualitative-analysis-ai.js +3 -3
  52. package/scripts/render-sequence-diagram.js +694 -0
  53. package/scripts/test-latency.js +210 -0
  54. package/scripts/test-rate-limit.js +95 -0
  55. package/scripts/test-token-budget.js +332 -0
  56. package/scripts/validate-paper-manifest.js +670 -0
  57. package/services/__tests__/evalConfigLoader.test.js +2 -2
  58. package/services/__tests__/learnerRubricEvaluator.test.js +361 -0
  59. package/services/__tests__/learnerTutorInteractionEngine.test.js +326 -0
  60. package/services/evaluationRunner.js +975 -98
  61. package/services/evaluationStore.js +12 -4
  62. package/services/learnerTutorInteractionEngine.js +27 -2
  63. package/services/mockProvider.js +133 -0
  64. package/services/promptRewriter.js +1471 -5
  65. package/services/rubricEvaluator.js +55 -2
  66. package/services/transcriptFormatter.js +675 -0
  67. package/docs/EVALUATION-VARIABLES.md +0 -589
  68. package/docs/REPLICATION-PLAN.md +0 -577
  69. package/scripts/analyze-run.mjs +0 -282
  70. package/scripts/compare-runs.js +0 -44
  71. package/scripts/compare-suggestions.js +0 -80
  72. package/scripts/dig-into-run.js +0 -158
  73. package/scripts/show-failed-suggestions.js +0 -64
  74. /package/scripts/{check-run.mjs → check-run.js} +0 -0
@@ -16,6 +16,59 @@ function debugLog(...args) {
16
16
  }
17
17
  }
18
18
 
19
// Canonical short labels for known judge model IDs, keyed by "<vendor>/<model>".
// A Map (rather than an object literal) is used so that model IDs such as
// "constructor" or "toString" cannot match inherited Object.prototype members.
const JUDGE_LABEL_BY_MODEL_ID = new Map([
  ['anthropic/claude-opus-4.5', 'claude-opus-4.5'],
  ['anthropic/claude-opus-4-5', 'claude-opus-4.5'],
  ['anthropic/claude-opus-4-6', 'claude-opus-4.6'],
  ['anthropic/claude-sonnet-4.5', 'claude-sonnet-4.5'],
  ['anthropic/claude-sonnet-4-5', 'claude-sonnet-4.5'],
  ['anthropic/claude-haiku-4.5', 'claude-haiku-4.5'],
  ['anthropic/claude-haiku-4-5', 'claude-haiku-4.5'],
  ['openai/gpt-5.2', 'gpt-5.2'],
  ['openai/gpt-5-mini', 'gpt-5-mini'],
  ['openai/gpt-oss-120b', 'gpt-oss-120b'],
  ['moonshotai/kimi-k2.5', 'kimi-k2.5'],
  ['moonshotai/kimi-k2-thinking', 'kimi-k2'],
  ['deepseek/deepseek-v3.2', 'deepseek-v3.2'],
  ['z-ai/glm-4.7', 'glm-4.7'],
  ['z-ai/glm-5', 'glm-5'],
  ['google/gemini-3-flash-preview', 'gemini-3-flash'],
  ['google/gemini-3-pro-preview', 'gemini-3-pro'],
  ['minimax/minimax-m2.5', 'minimax-m2.5'],
]);

/**
 * Normalize a judge model label to a canonical, human-readable form.
 * Strips routing prefixes (e.g. "openrouter/anthropic/") and maps
 * known model IDs to short names with version numbers.
 *
 * A trailing ":free" suffix is removed *before* the known-model lookup,
 * so the free-tier variant of a known model gets the same label as the
 * regular one.
 *
 * Examples (shown as "provider/model"):
 *   "openrouter/anthropic/claude-sonnet-4.5"        → "claude-sonnet-4.5"
 *   "openrouter/openai/gpt-5.2"                     → "gpt-5.2"
 *   "openrouter/moonshotai/kimi-k2.5"               → "kimi-k2.5"
 *   "anthropic/claude-opus-4-5"                     → "claude-opus-4.5"
 *   "openrouter/google/gemini-3-flash-preview:free" → "gemini-3-flash"
 *   "openrouter/nvidia/nemotron-..."                → "nemotron"
 *
 * @param {string} provider - Provider ID (e.g. "openrouter", "anthropic").
 * @param {string} model - Model ID, optionally vendor-prefixed and/or
 *   suffixed with ":free".
 * @returns {string} The canonical label. When nothing usable remains of
 *   `model` (empty or missing), the raw `${provider}/${model}` string is
 *   returned instead of throwing.
 */
export function normalizeJudgeLabel(provider, model) {
  // A missing model must not throw: this function is also called while
  // building error results, where a TypeError would mask the real failure.
  const rawModel = model == null ? '' : String(model);

  // Drop the ":free" suffix up front so that lookups below see the base ID.
  const modelId = rawModel.replace(/:free$/, '');

  // Try direct model lookup (handles openrouter paths like "anthropic/claude-sonnet-4.5")
  const directLabel = JUDGE_LABEL_BY_MODEL_ID.get(modelId);
  if (directLabel !== undefined) return directLabel;

  // Try full provider/model path (handles provider "anthropic", model "claude-opus-4-5")
  const fullPathLabel = JUDGE_LABEL_BY_MODEL_ID.get(`${provider}/${modelId}`);
  if (fullPathLabel !== undefined) return fullPathLabel;

  // For nvidia/nemotron variants, normalize to "nemotron"
  if (modelId.includes('nemotron')) return 'nemotron';

  // Fallback: strip common routing prefixes, keep the model name
  const stripped = modelId.replace(
    /^(anthropic|openai|moonshotai|deepseek|z-ai|google|minimax|nvidia)\//,
    '',
  );

  return stripped || `${provider}/${model}`;
}
71
+
19
72
  /**
20
73
  * Get available judge configuration, resolving model references via providers.yaml
21
74
  * Tries primary model first, then fallback if primary is not configured
@@ -929,7 +982,7 @@ export async function evaluateSuggestion(suggestion, scenario, context = {}, ove
929
982
  requiredMissing: parsed.validation?.required_missing || [],
930
983
  forbiddenFound: parsed.validation?.forbidden_found || [],
931
984
  summary: parsed.summary,
932
- judgeModel: `${judge.provider}/${judge.model}`,
985
+ judgeModel: normalizeJudgeLabel(judge.provider, judge.model),
933
986
  evaluationTimeMs: Date.now() - startTime,
934
987
  };
935
988
  } catch (error) {
@@ -940,7 +993,7 @@ export async function evaluateSuggestion(suggestion, scenario, context = {}, ove
940
993
  baseScore: null,
941
994
  recognitionScore: null,
942
995
  error: error.message,
943
- judgeModel: `${judge.provider}/${judge.model}`,
996
+ judgeModel: normalizeJudgeLabel(judge.provider, judge.model),
944
997
  evaluationTimeMs: Date.now() - startTime,
945
998
  };
946
999
  }