@machinespirits/eval 0.1.2 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111)
  1. package/LICENSE +21 -0
  2. package/README.md +161 -0
  3. package/config/eval-settings.yaml +18 -0
  4. package/config/evaluation-rubric-learner.yaml +277 -0
  5. package/config/evaluation-rubric.yaml +613 -0
  6. package/config/interaction-eval-scenarios.yaml +93 -50
  7. package/config/learner-agents.yaml +124 -193
  8. package/config/providers.yaml +60 -0
  9. package/config/suggestion-scenarios.yaml +1399 -0
  10. package/config/tutor-agents.yaml +716 -0
  11. package/docs/EVALUATION-VARIABLES.md +589 -0
  12. package/docs/REPLICATION-PLAN.md +577 -0
  13. package/docs/research/build.sh +74 -0
  14. package/docs/research/figures/figure1.png +0 -0
  15. package/docs/research/figures/figure2.png +0 -0
  16. package/docs/research/figures/figure3.png +0 -0
  17. package/docs/research/figures/figure4.png +0 -0
  18. package/docs/research/figures/figure5.png +0 -0
  19. package/docs/research/figures/figure6.png +0 -0
  20. package/docs/research/header.tex +4 -0
  21. package/docs/research/paper-full.md +1909 -0
  22. package/docs/research/paper-short.md +805 -0
  23. package/docs/research/references.bib +1011 -0
  24. package/index.js +15 -6
  25. package/package.json +14 -21
  26. package/routes/evalRoutes.js +88 -36
  27. package/scripts/analyze-judge-reliability.js +401 -0
  28. package/scripts/analyze-run.js +97 -0
  29. package/scripts/analyze-run.mjs +282 -0
  30. package/scripts/analyze-validation-failures.js +141 -0
  31. package/scripts/check-run.mjs +17 -0
  32. package/scripts/code-impasse-strategies.js +1132 -0
  33. package/scripts/compare-runs.js +44 -0
  34. package/scripts/compare-suggestions.js +80 -0
  35. package/scripts/compare-transformation.js +116 -0
  36. package/scripts/dig-into-run.js +158 -0
  37. package/scripts/eval-cli.js +2626 -0
  38. package/scripts/generate-paper-figures.py +452 -0
  39. package/scripts/qualitative-analysis-ai.js +1313 -0
  40. package/scripts/qualitative-analysis.js +688 -0
  41. package/scripts/seed-db.js +87 -0
  42. package/scripts/show-failed-suggestions.js +64 -0
  43. package/scripts/validate-content.js +192 -0
  44. package/server.js +3 -2
  45. package/services/__tests__/evalConfigLoader.test.js +338 -0
  46. package/services/anovaStats.js +499 -0
  47. package/services/contentResolver.js +407 -0
  48. package/services/dialogueTraceAnalyzer.js +454 -0
  49. package/services/evalConfigLoader.js +625 -0
  50. package/services/evaluationRunner.js +2171 -270
  51. package/services/evaluationStore.js +564 -29
  52. package/services/learnerConfigLoader.js +75 -5
  53. package/services/learnerRubricEvaluator.js +284 -0
  54. package/services/learnerTutorInteractionEngine.js +375 -0
  55. package/services/processUtils.js +18 -0
  56. package/services/progressLogger.js +98 -0
  57. package/services/promptRecommendationService.js +31 -26
  58. package/services/promptRewriter.js +427 -0
  59. package/services/rubricEvaluator.js +543 -70
  60. package/services/streamingReporter.js +104 -0
  61. package/services/turnComparisonAnalyzer.js +494 -0
  62. package/components/MobileEvalDashboard.tsx +0 -267
  63. package/components/comparison/DeltaAnalysisTable.tsx +0 -137
  64. package/components/comparison/ProfileComparisonCard.tsx +0 -176
  65. package/components/comparison/RecognitionABMode.tsx +0 -385
  66. package/components/comparison/RecognitionMetricsPanel.tsx +0 -135
  67. package/components/comparison/WinnerIndicator.tsx +0 -64
  68. package/components/comparison/index.ts +0 -5
  69. package/components/mobile/BottomSheet.tsx +0 -233
  70. package/components/mobile/DimensionBreakdown.tsx +0 -210
  71. package/components/mobile/DocsView.tsx +0 -363
  72. package/components/mobile/LogsView.tsx +0 -481
  73. package/components/mobile/PsychodynamicQuadrant.tsx +0 -261
  74. package/components/mobile/QuickTestView.tsx +0 -1098
  75. package/components/mobile/RecognitionTypeChart.tsx +0 -124
  76. package/components/mobile/RecognitionView.tsx +0 -809
  77. package/components/mobile/RunDetailView.tsx +0 -261
  78. package/components/mobile/RunHistoryView.tsx +0 -367
  79. package/components/mobile/ScoreRadial.tsx +0 -211
  80. package/components/mobile/StreamingLogPanel.tsx +0 -230
  81. package/components/mobile/SynthesisStrategyChart.tsx +0 -140
  82. package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +0 -52
  83. package/docs/research/ABLATION-MODEL-SELECTION.md +0 -53
  84. package/docs/research/ADVANCED-EVAL-ANALYSIS.md +0 -60
  85. package/docs/research/ANOVA-RESULTS-2026-01-14.md +0 -257
  86. package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +0 -586
  87. package/docs/research/COST-ANALYSIS.md +0 -56
  88. package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +0 -340
  89. package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +0 -291
  90. package/docs/research/EVAL-SYSTEM-ANALYSIS.md +0 -306
  91. package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +0 -301
  92. package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +0 -1988
  93. package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +0 -282
  94. package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +0 -147
  95. package/docs/research/PAPER-EXTENSION-DYADIC.md +0 -204
  96. package/docs/research/PAPER-UNIFIED.md +0 -659
  97. package/docs/research/PAPER-UNIFIED.pdf +0 -0
  98. package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +0 -356
  99. package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +0 -419
  100. package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +0 -1637
  101. package/docs/research/archive/paper-multiagent-tutor.tex +0 -978
  102. package/docs/research/paper-draft/full-paper.md +0 -136
  103. package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
  104. package/docs/research/paper-draft/references.bib +0 -515
  105. package/docs/research/transcript-baseline.md +0 -139
  106. package/docs/research/transcript-recognition-multiagent.md +0 -187
  107. package/hooks/useEvalData.ts +0 -625
  108. package/server-init.js +0 -45
  109. package/services/benchmarkService.js +0 -1892
  110. package/types.ts +0 -165
  111. package/utils/haptics.ts +0 -45
@@ -0,0 +1,338 @@
1
+ /**
2
+ * Tests for evalConfigLoader provider loading and model resolution.
3
+ *
4
+ * Uses node:test (built-in, no dependencies required).
5
+ * Run: node --test services/__tests__/evalConfigLoader.test.js
6
+ */
7
+
8
+ import { describe, it, beforeEach, afterEach } from 'node:test';
9
+ import assert from 'node:assert/strict';
10
+ import fs from 'fs';
11
+ import path from 'path';
12
+ import { fileURLToPath } from 'url';
13
+ import yaml from 'yaml';
14
+
15
+ import {
16
+ loadProviders,
17
+ getProviderConfig,
18
+ resolveModel,
19
+ } from '../evalConfigLoader.js';
20
+
21
+ const __dirname = path.dirname(fileURLToPath(import.meta.url)); // ES modules have no built-in __dirname, so derive it from this module URL
22
+ const CONFIG_DIR = path.resolve(__dirname, '../../config'); // two directory levels above this test file, then into config
23
+ const PROVIDERS_PATH = path.join(CONFIG_DIR, 'providers.yaml'); // NOTE(review): not referenced again in this file, same as the fs and yaml imports - confirm whether they are still needed
24
+
25
+ // ============================================================================
26
+ // loadProviders
27
+ // ============================================================================
28
+
29
+ describe('loadProviders', () => { // Suite: parsed result, expected provider keys, caching, and per-provider shape of what loadProviders returns
30
+ it('loads and parses providers.yaml', () => {
31
+ const data = loadProviders({ forceReload: true });
32
+ assert.ok(data, 'should return parsed data');
33
+ assert.ok(data.providers, 'should have providers key');
34
+ });
35
+
36
+ it('contains expected provider keys', () => {
37
+ const data = loadProviders({ forceReload: true });
38
+ const keys = Object.keys(data.providers);
39
+ assert.ok(keys.includes('anthropic'), 'should have anthropic');
40
+ assert.ok(keys.includes('openai'), 'should have openai');
41
+ assert.ok(keys.includes('openrouter'), 'should have openrouter');
42
+ assert.ok(keys.includes('gemini'), 'should have gemini');
43
+ assert.ok(keys.includes('local'), 'should have local');
44
+ });
45
+
46
+ it('returns cached result on second call', () => {
47
+ const first = loadProviders({ forceReload: true });
48
+ const second = loadProviders(); // no options this time: expected to hand back the object produced by the call above
49
+ assert.strictEqual(first, second, 'should return same cached reference'); // strictEqual on objects checks identity, not content
50
+ });
51
+
52
+ it('returns fresh result with forceReload', () => {
53
+ const first = loadProviders({ forceReload: true });
54
+ const second = loadProviders({ forceReload: true });
55
+ // Both should have the same content but forceReload re-reads the file.
56
+ // They may or may not be the same reference (re-parsed), but should be equal.
57
+ assert.deepStrictEqual(first, second); // NOTE(review): also passes if the very same object comes back, so this does not prove a re-read happened
58
+ });
59
+
60
+ it('each provider has models map', () => {
61
+ const data = loadProviders({ forceReload: true });
62
+ for (const [name, provider] of Object.entries(data.providers)) {
63
+ assert.ok(provider.models, `${name} should have models`);
64
+ assert.ok(
65
+ typeof provider.models === 'object', // NOTE(review): typeof is also object for arrays, so an array of models would pass this check
66
+ `${name}.models should be an object`
67
+ );
68
+ }
69
+ });
70
+ });
71
+
72
+ // ============================================================================
73
+ // getProviderConfig
74
+ // ============================================================================
75
+
76
+ describe('getProviderConfig', () => { // Suite: config lookup by provider name, API key resolution from process.env, and the isConfigured flag
77
+ // Snapshot the API-key env vars before each test and restore them afterwards, so tests that set or delete a key do not leak into later tests
78
+ const savedEnv = {};
79
+ const envKeys = [
80
+ 'ANTHROPIC_API_KEY',
81
+ 'OPENAI_API_KEY',
82
+ 'OPENROUTER_API_KEY',
83
+ 'GEMINI_API_KEY',
84
+ ];
85
+
86
+ beforeEach(() => {
87
+ for (const key of envKeys) {
88
+ savedEnv[key] = process.env[key]; // undefined here means the variable was not set when the test started
89
+ }
90
+ });
91
+
92
+ afterEach(() => {
93
+ for (const key of envKeys) {
94
+ if (savedEnv[key] === undefined) {
95
+ delete process.env[key]; // delete rather than assign: process.env stores values as strings, so assigning undefined would leave the text undefined behind
96
+ } else {
97
+ process.env[key] = savedEnv[key];
98
+ }
99
+ }
100
+ });
101
+
102
+ it('returns config for a known provider', () => {
103
+ const config = getProviderConfig('anthropic');
104
+ assert.ok(config, 'should return config');
105
+ assert.ok(config.models, 'should have models');
106
+ assert.ok(config.base_url, 'should have base_url');
107
+ assert.strictEqual(config.api_key_env, 'ANTHROPIC_API_KEY');
108
+ });
109
+
110
+ it('throws for unknown provider', () => {
111
+ assert.throws(
112
+ () => getProviderConfig('nonexistent'),
113
+ /Unknown provider: nonexistent/
114
+ );
115
+ });
116
+
117
+ it('resolves API key from environment', () => {
118
+ process.env.ANTHROPIC_API_KEY = 'test-key-123'; // overwritten only for this test; afterEach puts the previous value back
119
+ const config = getProviderConfig('anthropic', { forceReload: true });
120
+ assert.strictEqual(config.apiKey, 'test-key-123');
121
+ assert.strictEqual(config.isConfigured, true);
122
+ });
123
+
124
+ it('reports isConfigured=false when API key is missing', () => {
125
+ delete process.env.OPENAI_API_KEY;
126
+ const config = getProviderConfig('openai', { forceReload: true });
127
+ assert.strictEqual(config.apiKey, ''); // a missing key is expected to surface as an empty string rather than undefined
128
+ assert.strictEqual(config.isConfigured, false);
129
+ });
130
+
131
+ it('local provider is configured when base_url exists (no API key needed)', () => {
132
+ const config = getProviderConfig('local');
133
+ assert.strictEqual(config.apiKey, '');
134
+ // local has base_url in the yaml, so should be configured
135
+ assert.strictEqual(config.isConfigured, true);
136
+ });
137
+
138
+ it('spreads all provider fields into result', () => {
139
+ const config = getProviderConfig('openrouter');
140
+ assert.ok(config.base_url, 'should include base_url from yaml');
141
+ assert.ok(config.default_model, 'should include default_model from yaml');
142
+ assert.ok(config.models, 'should include models from yaml');
143
+ });
144
+ });
145
+
146
+ // ============================================================================
147
+ // resolveModel — string format
148
+ // ============================================================================
149
+
150
+ describe('resolveModel (string format)', () => { // Suite: provider.alias strings; NOTE(review): the expected IDs below are literals that presumably mirror config/providers.yaml and need updating when that file changes - confirm
151
+ it('resolves "anthropic.sonnet" to full model ID', () => {
152
+ const r = resolveModel('anthropic.sonnet');
153
+ assert.strictEqual(r.provider, 'anthropic');
154
+ assert.strictEqual(r.model, 'claude-sonnet-4-5');
155
+ assert.ok('apiKey' in r, 'should have apiKey field'); // the in operator checks presence only, so an empty key still passes
156
+ assert.ok('isConfigured' in r, 'should have isConfigured field');
157
+ assert.ok('baseUrl' in r, 'should have baseUrl field');
158
+ });
159
+
160
+ it('resolves "anthropic.haiku"', () => {
161
+ const r = resolveModel('anthropic.haiku');
162
+ assert.strictEqual(r.provider, 'anthropic');
163
+ assert.strictEqual(r.model, 'claude-haiku-4-5');
164
+ });
165
+
166
+ it('resolves "anthropic.opus"', () => {
167
+ const r = resolveModel('anthropic.opus');
168
+ assert.strictEqual(r.provider, 'anthropic');
169
+ assert.strictEqual(r.model, 'claude-opus-4-5');
170
+ });
171
+
172
+ it('resolves "openai.mini"', () => {
173
+ const r = resolveModel('openai.mini');
174
+ assert.strictEqual(r.provider, 'openai');
175
+ assert.strictEqual(r.model, 'gpt-5-mini');
176
+ });
177
+
178
+ it('resolves "openai.standard"', () => {
179
+ const r = resolveModel('openai.standard');
180
+ assert.strictEqual(r.provider, 'openai');
181
+ assert.strictEqual(r.model, 'gpt-5.2');
182
+ });
183
+
184
+ it('resolves "openrouter.sonnet" to openrouter model ID', () => {
185
+ const r = resolveModel('openrouter.sonnet');
186
+ assert.strictEqual(r.provider, 'openrouter');
187
+ assert.strictEqual(r.model, 'anthropic/claude-sonnet-4.5');
188
+ });
189
+
190
+ it('resolves "openrouter.nemotron"', () => {
191
+ const r = resolveModel('openrouter.nemotron');
192
+ assert.strictEqual(r.provider, 'openrouter');
193
+ assert.strictEqual(r.model, 'nvidia/nemotron-3-nano-30b-a3b:free');
194
+ });
195
+
196
+ it('resolves "openrouter.deepseek"', () => {
197
+ const r = resolveModel('openrouter.deepseek');
198
+ assert.strictEqual(r.provider, 'openrouter');
199
+ assert.strictEqual(r.model, 'deepseek/deepseek-v3.2');
200
+ });
201
+
202
+ it('resolves "gemini.flash"', () => {
203
+ const r = resolveModel('gemini.flash');
204
+ assert.strictEqual(r.provider, 'gemini');
205
+ assert.strictEqual(r.model, 'gemini-3-flash-preview');
206
+ });
207
+
208
+ it('resolves "gemini.pro"', () => {
209
+ const r = resolveModel('gemini.pro');
210
+ assert.strictEqual(r.provider, 'gemini');
211
+ assert.strictEqual(r.model, 'gemini-3-pro-preview');
212
+ });
213
+
214
+ it('resolves "local.default"', () => {
215
+ const r = resolveModel('local.default');
216
+ assert.strictEqual(r.provider, 'local');
217
+ assert.strictEqual(r.model, 'local-model');
218
+ });
219
+
220
+ it('passes through unknown alias as-is', () => {
221
+ const r = resolveModel('openrouter.some-future-model');
222
+ assert.strictEqual(r.provider, 'openrouter');
223
+ assert.strictEqual(r.model, 'some-future-model'); // expectation: an alias with no mapping comes back as the text after the dot, unchanged
224
+ });
225
+
226
+ it('returns baseUrl from provider config', () => {
227
+ const r = resolveModel('openrouter.sonnet');
228
+ assert.strictEqual(r.baseUrl, 'https://openrouter.ai/api/v1/chat/completions');
229
+ });
230
+ });
231
+
232
+ // ============================================================================
233
+ // resolveModel — object format
234
+ // ============================================================================
235
+
236
+ describe('resolveModel (object format)', () => { // Suite: references given as an object with provider and model fields instead of a dotted string
237
+ it('resolves { provider, model } object', () => {
238
+ const r = resolveModel({ provider: 'anthropic', model: 'haiku' }); // model carries the alias here; the full ID is expected in the result
239
+ assert.strictEqual(r.provider, 'anthropic');
240
+ assert.strictEqual(r.model, 'claude-haiku-4-5');
241
+ });
242
+
243
+ it('passes through unknown model alias in object format', () => {
244
+ const r = resolveModel({ provider: 'openai', model: 'gpt-99-turbo' });
245
+ assert.strictEqual(r.provider, 'openai');
246
+ assert.strictEqual(r.model, 'gpt-99-turbo'); // same pass-through expectation as for the string format
247
+ });
248
+ });
249
+
250
+ // ============================================================================
251
+ // resolveModel — error cases
252
+ // ============================================================================
253
+
254
+ describe('resolveModel (error cases)', () => { // Suite: malformed references are expected to throw with the messages matched below
255
+ it('throws on single-part string (no dot)', () => {
256
+ assert.throws(
257
+ () => resolveModel('sonnet'),
258
+ /Invalid model reference.*Use format "provider\.model"/
259
+ );
260
+ });
261
+
262
+ it('splits on first dot only (handles aliases with dots like kimi-k2.5)', () => { // NOTE(review): this is a success path, not an error case; it arguably belongs with the string-format suite
263
+ // "openrouter.kimi-k2.5" should parse as provider=openrouter, alias=kimi-k2.5
264
+ const r = resolveModel('openrouter.kimi-k2.5');
265
+ assert.strictEqual(r.provider, 'openrouter');
266
+ assert.strictEqual(r.model, 'moonshotai/kimi-k2.5');
267
+ });
268
+
269
+ it('throws on unknown provider', () => {
270
+ assert.throws(
271
+ () => resolveModel('fakeprovider.model'),
272
+ /Unknown provider: fakeprovider/
273
+ );
274
+ });
275
+
276
+ it('throws on object missing provider', () => {
277
+ assert.throws(
278
+ () => resolveModel({ model: 'haiku' }),
279
+ /must have both "provider" and "model"/
280
+ );
281
+ });
282
+
283
+ it('throws on object missing model', () => {
284
+ assert.throws(
285
+ () => resolveModel({ provider: 'anthropic' }),
286
+ /must have both "provider" and "model"/
287
+ );
288
+ });
289
+
290
+ it('throws on null', () => { // typeof null is object in JavaScript, yet the message expected here is the string-or-object one, not the missing-field one
291
+ assert.throws(
292
+ () => resolveModel(null),
293
+ /Model reference must be a string or object/
294
+ );
295
+ });
296
+
297
+ it('throws on number', () => {
298
+ assert.throws(
299
+ () => resolveModel(42),
300
+ /Model reference must be a string or object/
301
+ );
302
+ });
303
+
304
+ it('throws on empty object', () => {
305
+ assert.throws(
306
+ () => resolveModel({}),
307
+ /must have both "provider" and "model"/
308
+ );
309
+ });
310
+ });
311
+
312
+ // ============================================================================
313
+ // resolveModel — consistency with providers.yaml
314
+ // ============================================================================
315
+
316
+ describe('resolveModel consistency', () => { // Suite: cross-checks resolveModel against the data returned by loadProviders, and the two input formats against each other
317
+ it('every alias in every provider resolves without error', () => {
318
+ const data = loadProviders({ forceReload: true });
319
+ for (const [providerName, provider] of Object.entries(data.providers)) {
320
+ for (const alias of Object.keys(provider.models || {})) { // the fallback to an empty object lets a provider without a models map be skipped instead of throwing
321
+ const r = resolveModel(`${providerName}.${alias}`);
322
+ assert.strictEqual(r.provider, providerName);
323
+ // Resolved model should match the value in yaml
324
+ assert.strictEqual(
325
+ r.model,
326
+ provider.models[alias],
327
+ `${providerName}.${alias} should resolve to ${provider.models[alias]}`
328
+ );
329
+ }
330
+ }
331
+ });
332
+
333
+ it('string and object format produce identical results', () => {
334
+ const fromString = resolveModel('anthropic.sonnet');
335
+ const fromObject = resolveModel({ provider: 'anthropic', model: 'sonnet' });
336
+ assert.deepStrictEqual(fromString, fromObject); // deep comparison of every field in the two results, not only provider and model
337
+ });
338
+ });