@machinespirits/eval 0.1.2 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. package/LICENSE +21 -0
  2. package/README.md +161 -0
  3. package/config/eval-settings.yaml +18 -0
  4. package/config/evaluation-rubric-learner.yaml +277 -0
  5. package/config/evaluation-rubric.yaml +613 -0
  6. package/config/interaction-eval-scenarios.yaml +93 -50
  7. package/config/learner-agents.yaml +124 -193
  8. package/config/machinespirits-eval.code-workspace +11 -0
  9. package/config/providers.yaml +60 -0
  10. package/config/suggestion-scenarios.yaml +1399 -0
  11. package/config/tutor-agents.yaml +716 -0
  12. package/docs/EVALUATION-VARIABLES.md +589 -0
  13. package/docs/REPLICATION-PLAN.md +577 -0
  14. package/index.js +15 -6
  15. package/package.json +16 -22
  16. package/routes/evalRoutes.js +88 -36
  17. package/scripts/analyze-judge-reliability.js +401 -0
  18. package/scripts/analyze-run.js +97 -0
  19. package/scripts/analyze-run.mjs +282 -0
  20. package/scripts/analyze-validation-failures.js +141 -0
  21. package/scripts/check-run.mjs +17 -0
  22. package/scripts/code-impasse-strategies.js +1132 -0
  23. package/scripts/compare-runs.js +44 -0
  24. package/scripts/compare-suggestions.js +80 -0
  25. package/scripts/compare-transformation.js +116 -0
  26. package/scripts/dig-into-run.js +158 -0
  27. package/scripts/eval-cli.js +2626 -0
  28. package/scripts/generate-paper-figures.py +452 -0
  29. package/scripts/qualitative-analysis-ai.js +1313 -0
  30. package/scripts/qualitative-analysis.js +688 -0
  31. package/scripts/seed-db.js +87 -0
  32. package/scripts/show-failed-suggestions.js +64 -0
  33. package/scripts/validate-content.js +192 -0
  34. package/server.js +3 -2
  35. package/services/__tests__/evalConfigLoader.test.js +338 -0
  36. package/services/anovaStats.js +499 -0
  37. package/services/contentResolver.js +407 -0
  38. package/services/dialogueTraceAnalyzer.js +454 -0
  39. package/services/evalConfigLoader.js +625 -0
  40. package/services/evaluationRunner.js +2171 -270
  41. package/services/evaluationStore.js +564 -29
  42. package/services/learnerConfigLoader.js +75 -5
  43. package/services/learnerRubricEvaluator.js +284 -0
  44. package/services/learnerTutorInteractionEngine.js +375 -0
  45. package/services/processUtils.js +18 -0
  46. package/services/progressLogger.js +98 -0
  47. package/services/promptRecommendationService.js +31 -26
  48. package/services/promptRewriter.js +427 -0
  49. package/services/rubricEvaluator.js +543 -70
  50. package/services/streamingReporter.js +104 -0
  51. package/services/turnComparisonAnalyzer.js +494 -0
  52. package/components/MobileEvalDashboard.tsx +0 -267
  53. package/components/comparison/DeltaAnalysisTable.tsx +0 -137
  54. package/components/comparison/ProfileComparisonCard.tsx +0 -176
  55. package/components/comparison/RecognitionABMode.tsx +0 -385
  56. package/components/comparison/RecognitionMetricsPanel.tsx +0 -135
  57. package/components/comparison/WinnerIndicator.tsx +0 -64
  58. package/components/comparison/index.ts +0 -5
  59. package/components/mobile/BottomSheet.tsx +0 -233
  60. package/components/mobile/DimensionBreakdown.tsx +0 -210
  61. package/components/mobile/DocsView.tsx +0 -363
  62. package/components/mobile/LogsView.tsx +0 -481
  63. package/components/mobile/PsychodynamicQuadrant.tsx +0 -261
  64. package/components/mobile/QuickTestView.tsx +0 -1098
  65. package/components/mobile/RecognitionTypeChart.tsx +0 -124
  66. package/components/mobile/RecognitionView.tsx +0 -809
  67. package/components/mobile/RunDetailView.tsx +0 -261
  68. package/components/mobile/RunHistoryView.tsx +0 -367
  69. package/components/mobile/ScoreRadial.tsx +0 -211
  70. package/components/mobile/StreamingLogPanel.tsx +0 -230
  71. package/components/mobile/SynthesisStrategyChart.tsx +0 -140
  72. package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +0 -52
  73. package/docs/research/ABLATION-MODEL-SELECTION.md +0 -53
  74. package/docs/research/ADVANCED-EVAL-ANALYSIS.md +0 -60
  75. package/docs/research/ANOVA-RESULTS-2026-01-14.md +0 -257
  76. package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +0 -586
  77. package/docs/research/COST-ANALYSIS.md +0 -56
  78. package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +0 -340
  79. package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +0 -291
  80. package/docs/research/EVAL-SYSTEM-ANALYSIS.md +0 -306
  81. package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +0 -301
  82. package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +0 -1988
  83. package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +0 -282
  84. package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +0 -147
  85. package/docs/research/PAPER-EXTENSION-DYADIC.md +0 -204
  86. package/docs/research/PAPER-UNIFIED.md +0 -659
  87. package/docs/research/PAPER-UNIFIED.pdf +0 -0
  88. package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +0 -356
  89. package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +0 -419
  90. package/docs/research/apa.csl +0 -2133
  91. package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +0 -1637
  92. package/docs/research/archive/paper-multiagent-tutor.tex +0 -978
  93. package/docs/research/paper-draft/full-paper.md +0 -136
  94. package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
  95. package/docs/research/paper-draft/references.bib +0 -515
  96. package/docs/research/transcript-baseline.md +0 -139
  97. package/docs/research/transcript-recognition-multiagent.md +0 -187
  98. package/hooks/useEvalData.ts +0 -625
  99. package/server-init.js +0 -45
  100. package/services/benchmarkService.js +0 -1892
  101. package/types.ts +0 -165
  102. package/utils/haptics.ts +0 -45
@@ -0,0 +1,338 @@
1
+ /**
2
+ * Tests for evalConfigLoader provider loading and model resolution.
3
+ *
4
+ * Uses node:test (built-in, no dependencies required).
5
+ * Run: node --test services/__tests__/evalConfigLoader.test.js
6
+ */
7
+
8
+ import { describe, it, beforeEach, afterEach } from 'node:test';
9
+ import assert from 'node:assert/strict';
10
+ import fs from 'fs';
11
+ import path from 'path';
12
+ import { fileURLToPath } from 'url';
13
+ import yaml from 'yaml';
14
+
15
+ import {
16
+ loadProviders,
17
+ getProviderConfig,
18
+ resolveModel,
19
+ } from '../evalConfigLoader.js';
20
+
21
+ const __dirname = path.dirname(fileURLToPath(import.meta.url)); // ES modules have no built-in __dirname; derive it from this module's URL
22
+ const CONFIG_DIR = path.resolve(__dirname, '../../config'); // config directory two levels above this test file's directory
23
+ const PROVIDERS_PATH = path.join(CONFIG_DIR, 'providers.yaml'); // NOTE(review): not referenced by any test visible in this file — confirm before relying on or removing it
24
+
25
+ // ============================================================================
26
+ // loadProviders — parsing of providers.yaml, expected provider keys, caching, forceReload
27
+ // ============================================================================
28
+
29
+ describe('loadProviders', () => {
30
+ it('loads and parses providers.yaml', () => {
31
+ const data = loadProviders({ forceReload: true }); // forceReload: true requests a fresh load rather than a cached result
32
+ assert.ok(data, 'should return parsed data');
33
+ assert.ok(data.providers, 'should have providers key');
34
+ });
35
+
36
+ it('contains expected provider keys', () => {
37
+ const data = loadProviders({ forceReload: true });
38
+ const keys = Object.keys(data.providers);
39
+ assert.ok(keys.includes('anthropic'), 'should have anthropic');
40
+ assert.ok(keys.includes('openai'), 'should have openai');
41
+ assert.ok(keys.includes('openrouter'), 'should have openrouter');
42
+ assert.ok(keys.includes('gemini'), 'should have gemini');
43
+ assert.ok(keys.includes('local'), 'should have local');
44
+ });
45
+
46
+ it('returns cached result on second call', () => {
47
+ const first = loadProviders({ forceReload: true });
48
+ const second = loadProviders(); // no options: the test expects the cached object from the call above
49
+ assert.strictEqual(first, second, 'should return same cached reference'); // strictEqual on objects checks reference identity
50
+ });
51
+
52
+ it('returns fresh result with forceReload', () => {
53
+ const first = loadProviders({ forceReload: true });
54
+ const second = loadProviders({ forceReload: true });
55
+ // Both should have the same content but forceReload re-reads the file.
56
+ // They may or may not be the same reference (re-parsed), but should be equal.
57
+ assert.deepStrictEqual(first, second); // deep content comparison; identity is deliberately not asserted
58
+ });
59
+
60
+ it('each provider has models map', () => {
61
+ const data = loadProviders({ forceReload: true });
62
+ for (const [name, provider] of Object.entries(data.providers)) {
63
+ assert.ok(provider.models, `${name} should have models`); // also rejects null, which matters because typeof null === 'object'
64
+ assert.ok(
65
+ typeof provider.models === 'object', // NOTE: typeof is also 'object' for arrays, so this alone does not prove a plain map
66
+ `${name}.models should be an object`
67
+ );
68
+ }
69
+ });
70
+ });
71
+
72
+ // ============================================================================
73
+ // getProviderConfig — per-provider config lookup, API key taken from process.env, isConfigured flag
74
+ // ============================================================================
75
+
76
+ describe('getProviderConfig', () => {
77
+ // Snapshot the API-key env vars before each test and restore them afterwards, so tests that mutate process.env do not leak into each other
78
+ const savedEnv = {};
79
+ const envKeys = [
80
+ 'ANTHROPIC_API_KEY',
81
+ 'OPENAI_API_KEY',
82
+ 'OPENROUTER_API_KEY',
83
+ 'GEMINI_API_KEY',
84
+ ];
85
+
86
+ beforeEach(() => {
87
+ for (const key of envKeys) {
88
+ savedEnv[key] = process.env[key]; // undefined when the variable is not set
89
+ }
90
+ });
91
+
92
+ afterEach(() => {
93
+ for (const key of envKeys) {
94
+ if (savedEnv[key] === undefined) { // originally unset: delete, because assigning undefined to process.env would store the string 'undefined'
95
+ delete process.env[key];
96
+ } else {
97
+ process.env[key] = savedEnv[key];
98
+ }
99
+ }
100
+ });
101
+
102
+ it('returns config for a known provider', () => {
103
+ const config = getProviderConfig('anthropic');
104
+ assert.ok(config, 'should return config');
105
+ assert.ok(config.models, 'should have models');
106
+ assert.ok(config.base_url, 'should have base_url');
107
+ assert.strictEqual(config.api_key_env, 'ANTHROPIC_API_KEY');
108
+ });
109
+
110
+ it('throws for unknown provider', () => {
111
+ assert.throws(
112
+ () => getProviderConfig('nonexistent'),
113
+ /Unknown provider: nonexistent/
114
+ );
115
+ });
116
+
117
+ it('resolves API key from environment', () => {
118
+ process.env.ANTHROPIC_API_KEY = 'test-key-123'; // overwritten here; afterEach restores the original value
119
+ const config = getProviderConfig('anthropic', { forceReload: true }); // NOTE(review): forceReload presumably ensures the env var set above is picked up — confirm against getProviderConfig
120
+ assert.strictEqual(config.apiKey, 'test-key-123');
121
+ assert.strictEqual(config.isConfigured, true);
122
+ });
123
+
124
+ it('reports isConfigured=false when API key is missing', () => {
125
+ delete process.env.OPENAI_API_KEY; // simulate a missing key; afterEach puts back any original value
126
+ const config = getProviderConfig('openai', { forceReload: true });
127
+ assert.strictEqual(config.apiKey, ''); // a missing key is expected to surface as the empty string, not undefined
128
+ assert.strictEqual(config.isConfigured, false);
129
+ });
130
+
131
+ it('local provider is configured when base_url exists (no API key needed)', () => {
132
+ const config = getProviderConfig('local'); // 'local' has no entry in envKeys above; this test does not touch process.env
133
+ assert.strictEqual(config.apiKey, '');
134
+ // local has base_url in the yaml, so should be configured
135
+ assert.strictEqual(config.isConfigured, true);
136
+ });
137
+
138
+ it('spreads all provider fields into result', () => {
139
+ const config = getProviderConfig('openrouter');
140
+ assert.ok(config.base_url, 'should include base_url from yaml');
141
+ assert.ok(config.default_model, 'should include default_model from yaml');
142
+ assert.ok(config.models, 'should include models from yaml');
143
+ });
144
+ });
145
+
146
+ // ============================================================================
147
+ // resolveModel — string format ("provider.alias" in; provider, model, apiKey, isConfigured, baseUrl out)
148
+ // ============================================================================
149
+
150
+ describe('resolveModel (string format)', () => {
151
+ it('resolves "anthropic.sonnet" to full model ID', () => {
152
+ const r = resolveModel('anthropic.sonnet');
153
+ assert.strictEqual(r.provider, 'anthropic');
154
+ assert.strictEqual(r.model, 'claude-sonnet-4-5'); // NOTE(review): expected IDs in this block are hard-coded; presumably they mirror config/providers.yaml — update both together
155
+ assert.ok('apiKey' in r, 'should have apiKey field'); // 'in' only checks that the key exists, not its value
156
+ assert.ok('isConfigured' in r, 'should have isConfigured field');
157
+ assert.ok('baseUrl' in r, 'should have baseUrl field');
158
+ });
159
+
160
+ it('resolves "anthropic.haiku"', () => {
161
+ const r = resolveModel('anthropic.haiku');
162
+ assert.strictEqual(r.provider, 'anthropic');
163
+ assert.strictEqual(r.model, 'claude-haiku-4-5');
164
+ });
165
+
166
+ it('resolves "anthropic.opus"', () => {
167
+ const r = resolveModel('anthropic.opus');
168
+ assert.strictEqual(r.provider, 'anthropic');
169
+ assert.strictEqual(r.model, 'claude-opus-4-5');
170
+ });
171
+
172
+ it('resolves "openai.mini"', () => {
173
+ const r = resolveModel('openai.mini');
174
+ assert.strictEqual(r.provider, 'openai');
175
+ assert.strictEqual(r.model, 'gpt-5-mini');
176
+ });
177
+
178
+ it('resolves "openai.standard"', () => {
179
+ const r = resolveModel('openai.standard');
180
+ assert.strictEqual(r.provider, 'openai');
181
+ assert.strictEqual(r.model, 'gpt-5.2');
182
+ });
183
+
184
+ it('resolves "openrouter.sonnet" to openrouter model ID', () => {
185
+ const r = resolveModel('openrouter.sonnet');
186
+ assert.strictEqual(r.provider, 'openrouter');
187
+ assert.strictEqual(r.model, 'anthropic/claude-sonnet-4.5');
188
+ });
189
+
190
+ it('resolves "openrouter.nemotron"', () => {
191
+ const r = resolveModel('openrouter.nemotron');
192
+ assert.strictEqual(r.provider, 'openrouter');
193
+ assert.strictEqual(r.model, 'nvidia/nemotron-3-nano-30b-a3b:free');
194
+ });
195
+
196
+ it('resolves "openrouter.deepseek"', () => {
197
+ const r = resolveModel('openrouter.deepseek');
198
+ assert.strictEqual(r.provider, 'openrouter');
199
+ assert.strictEqual(r.model, 'deepseek/deepseek-v3.2');
200
+ });
201
+
202
+ it('resolves "gemini.flash"', () => {
203
+ const r = resolveModel('gemini.flash');
204
+ assert.strictEqual(r.provider, 'gemini');
205
+ assert.strictEqual(r.model, 'gemini-3-flash-preview');
206
+ });
207
+
208
+ it('resolves "gemini.pro"', () => {
209
+ const r = resolveModel('gemini.pro');
210
+ assert.strictEqual(r.provider, 'gemini');
211
+ assert.strictEqual(r.model, 'gemini-3-pro-preview');
212
+ });
213
+
214
+ it('resolves "local.default"', () => {
215
+ const r = resolveModel('local.default');
216
+ assert.strictEqual(r.provider, 'local');
217
+ assert.strictEqual(r.model, 'local-model');
218
+ });
219
+
220
+ it('passes through unknown alias as-is', () => {
221
+ const r = resolveModel('openrouter.some-future-model'); // 'some-future-model' is expected not to be a configured alias
222
+ assert.strictEqual(r.provider, 'openrouter');
223
+ assert.strictEqual(r.model, 'some-future-model'); // the alias text itself is returned as the model ID
224
+ });
225
+
226
+ it('returns baseUrl from provider config', () => {
227
+ const r = resolveModel('openrouter.sonnet');
228
+ assert.strictEqual(r.baseUrl, 'https://openrouter.ai/api/v1/chat/completions');
229
+ });
230
+ });
231
+
232
+ // ============================================================================
233
+ // resolveModel — object format ({ provider, model }, where model may be an alias)
234
+ // ============================================================================
235
+
236
+ describe('resolveModel (object format)', () => {
237
+ it('resolves { provider, model } object', () => {
238
+ const r = resolveModel({ provider: 'anthropic', model: 'haiku' }); // model carries an alias here, not a full model ID
239
+ assert.strictEqual(r.provider, 'anthropic');
240
+ assert.strictEqual(r.model, 'claude-haiku-4-5');
241
+ });
242
+
243
+ it('passes through unknown model alias in object format', () => {
244
+ const r = resolveModel({ provider: 'openai', model: 'gpt-99-turbo' }); // 'gpt-99-turbo' is expected not to be a configured alias
245
+ assert.strictEqual(r.provider, 'openai');
246
+ assert.strictEqual(r.model, 'gpt-99-turbo'); // returned unchanged
247
+ });
248
+ });
249
+
250
+ // ============================================================================
251
+ // resolveModel — error cases (malformed references, unknown provider, wrong argument types)
252
+ // ============================================================================
253
+
254
+ describe('resolveModel (error cases)', () => {
255
+ it('throws on single-part string (no dot)', () => {
256
+ assert.throws(
257
+ () => resolveModel('sonnet'),
258
+ /Invalid model reference.*Use format "provider\.model"/ // a RegExp passed to assert.throws is tested against the thrown error's string form
259
+ );
260
+ });
261
+
262
+ it('splits on first dot only (handles aliases with dots like kimi-k2.5)', () => { // NOTE: success-path case (nothing is thrown) kept in this group because it covers the dot-splitting rule
263
+ // "openrouter.kimi-k2.5" should parse as provider=openrouter, alias=kimi-k2.5
264
+ const r = resolveModel('openrouter.kimi-k2.5');
265
+ assert.strictEqual(r.provider, 'openrouter');
266
+ assert.strictEqual(r.model, 'moonshotai/kimi-k2.5');
267
+ });
268
+
269
+ it('throws on unknown provider', () => {
270
+ assert.throws(
271
+ () => resolveModel('fakeprovider.model'),
272
+ /Unknown provider: fakeprovider/
273
+ );
274
+ });
275
+
276
+ it('throws on object missing provider', () => {
277
+ assert.throws(
278
+ () => resolveModel({ model: 'haiku' }),
279
+ /must have both "provider" and "model"/
280
+ );
281
+ });
282
+
283
+ it('throws on object missing model', () => {
284
+ assert.throws(
285
+ () => resolveModel({ provider: 'anthropic' }),
286
+ /must have both "provider" and "model"/
287
+ );
288
+ });
289
+
290
+ it('throws on null', () => {
291
+ assert.throws(
292
+ () => resolveModel(null), // typeof null === 'object', yet the expected message is the type error rather than the missing-field one
293
+ /Model reference must be a string or object/
294
+ );
295
+ });
296
+
297
+ it('throws on number', () => {
298
+ assert.throws(
299
+ () => resolveModel(42),
300
+ /Model reference must be a string or object/
301
+ );
302
+ });
303
+
304
+ it('throws on empty object', () => {
305
+ assert.throws(
306
+ () => resolveModel({}), // an object with neither field gets the missing-field message, unlike null above
307
+ /must have both "provider" and "model"/
308
+ );
309
+ });
310
+ });
311
+
312
+ // ============================================================================
313
+ // resolveModel — consistency with providers.yaml (every alias resolves to the value loaded from the yaml)
314
+ // ============================================================================
315
+
316
+ describe('resolveModel consistency', () => {
317
+ it('every alias in every provider resolves without error', () => {
318
+ const data = loadProviders({ forceReload: true }); // expectations below come from the loaded config, not from hard-coded IDs
319
+ for (const [providerName, provider] of Object.entries(data.providers)) {
320
+ for (const alias of Object.keys(provider.models || {})) { // '|| {}' makes a provider without a models map contribute zero aliases instead of throwing
321
+ const r = resolveModel(`${providerName}.${alias}`);
322
+ assert.strictEqual(r.provider, providerName);
323
+ // Resolved model should match the value in yaml
324
+ assert.strictEqual(
325
+ r.model,
326
+ provider.models[alias],
327
+ `${providerName}.${alias} should resolve to ${provider.models[alias]}`
328
+ );
329
+ }
330
+ }
331
+ });
332
+
333
+ it('string and object format produce identical results', () => {
334
+ const fromString = resolveModel('anthropic.sonnet');
335
+ const fromObject = resolveModel({ provider: 'anthropic', model: 'sonnet' });
336
+ assert.deepStrictEqual(fromString, fromObject); // compares every field of both results, not only provider and model
337
+ });
338
+ });