@machinespirits/eval 0.1.2 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. package/LICENSE +21 -0
  2. package/README.md +161 -0
  3. package/config/eval-settings.yaml +18 -0
  4. package/config/evaluation-rubric-learner.yaml +277 -0
  5. package/config/evaluation-rubric.yaml +613 -0
  6. package/config/interaction-eval-scenarios.yaml +93 -50
  7. package/config/learner-agents.yaml +124 -193
  8. package/config/machinespirits-eval.code-workspace +11 -0
  9. package/config/providers.yaml +60 -0
  10. package/config/suggestion-scenarios.yaml +1399 -0
  11. package/config/tutor-agents.yaml +716 -0
  12. package/docs/EVALUATION-VARIABLES.md +589 -0
  13. package/docs/REPLICATION-PLAN.md +577 -0
  14. package/index.js +15 -6
  15. package/package.json +16 -22
  16. package/routes/evalRoutes.js +88 -36
  17. package/scripts/analyze-judge-reliability.js +401 -0
  18. package/scripts/analyze-run.js +97 -0
  19. package/scripts/analyze-run.mjs +282 -0
  20. package/scripts/analyze-validation-failures.js +141 -0
  21. package/scripts/check-run.mjs +17 -0
  22. package/scripts/code-impasse-strategies.js +1132 -0
  23. package/scripts/compare-runs.js +44 -0
  24. package/scripts/compare-suggestions.js +80 -0
  25. package/scripts/compare-transformation.js +116 -0
  26. package/scripts/dig-into-run.js +158 -0
  27. package/scripts/eval-cli.js +2626 -0
  28. package/scripts/generate-paper-figures.py +452 -0
  29. package/scripts/qualitative-analysis-ai.js +1313 -0
  30. package/scripts/qualitative-analysis.js +688 -0
  31. package/scripts/seed-db.js +87 -0
  32. package/scripts/show-failed-suggestions.js +64 -0
  33. package/scripts/validate-content.js +192 -0
  34. package/server.js +3 -2
  35. package/services/__tests__/evalConfigLoader.test.js +338 -0
  36. package/services/anovaStats.js +499 -0
  37. package/services/contentResolver.js +407 -0
  38. package/services/dialogueTraceAnalyzer.js +454 -0
  39. package/services/evalConfigLoader.js +625 -0
  40. package/services/evaluationRunner.js +2171 -270
  41. package/services/evaluationStore.js +564 -29
  42. package/services/learnerConfigLoader.js +75 -5
  43. package/services/learnerRubricEvaluator.js +284 -0
  44. package/services/learnerTutorInteractionEngine.js +375 -0
  45. package/services/processUtils.js +18 -0
  46. package/services/progressLogger.js +98 -0
  47. package/services/promptRecommendationService.js +31 -26
  48. package/services/promptRewriter.js +427 -0
  49. package/services/rubricEvaluator.js +543 -70
  50. package/services/streamingReporter.js +104 -0
  51. package/services/turnComparisonAnalyzer.js +494 -0
  52. package/components/MobileEvalDashboard.tsx +0 -267
  53. package/components/comparison/DeltaAnalysisTable.tsx +0 -137
  54. package/components/comparison/ProfileComparisonCard.tsx +0 -176
  55. package/components/comparison/RecognitionABMode.tsx +0 -385
  56. package/components/comparison/RecognitionMetricsPanel.tsx +0 -135
  57. package/components/comparison/WinnerIndicator.tsx +0 -64
  58. package/components/comparison/index.ts +0 -5
  59. package/components/mobile/BottomSheet.tsx +0 -233
  60. package/components/mobile/DimensionBreakdown.tsx +0 -210
  61. package/components/mobile/DocsView.tsx +0 -363
  62. package/components/mobile/LogsView.tsx +0 -481
  63. package/components/mobile/PsychodynamicQuadrant.tsx +0 -261
  64. package/components/mobile/QuickTestView.tsx +0 -1098
  65. package/components/mobile/RecognitionTypeChart.tsx +0 -124
  66. package/components/mobile/RecognitionView.tsx +0 -809
  67. package/components/mobile/RunDetailView.tsx +0 -261
  68. package/components/mobile/RunHistoryView.tsx +0 -367
  69. package/components/mobile/ScoreRadial.tsx +0 -211
  70. package/components/mobile/StreamingLogPanel.tsx +0 -230
  71. package/components/mobile/SynthesisStrategyChart.tsx +0 -140
  72. package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +0 -52
  73. package/docs/research/ABLATION-MODEL-SELECTION.md +0 -53
  74. package/docs/research/ADVANCED-EVAL-ANALYSIS.md +0 -60
  75. package/docs/research/ANOVA-RESULTS-2026-01-14.md +0 -257
  76. package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +0 -586
  77. package/docs/research/COST-ANALYSIS.md +0 -56
  78. package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +0 -340
  79. package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +0 -291
  80. package/docs/research/EVAL-SYSTEM-ANALYSIS.md +0 -306
  81. package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +0 -301
  82. package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +0 -1988
  83. package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +0 -282
  84. package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +0 -147
  85. package/docs/research/PAPER-EXTENSION-DYADIC.md +0 -204
  86. package/docs/research/PAPER-UNIFIED.md +0 -659
  87. package/docs/research/PAPER-UNIFIED.pdf +0 -0
  88. package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +0 -356
  89. package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +0 -419
  90. package/docs/research/apa.csl +0 -2133
  91. package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +0 -1637
  92. package/docs/research/archive/paper-multiagent-tutor.tex +0 -978
  93. package/docs/research/paper-draft/full-paper.md +0 -136
  94. package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
  95. package/docs/research/paper-draft/references.bib +0 -515
  96. package/docs/research/transcript-baseline.md +0 -139
  97. package/docs/research/transcript-recognition-multiagent.md +0 -187
  98. package/hooks/useEvalData.ts +0 -625
  99. package/server-init.js +0 -45
  100. package/services/benchmarkService.js +0 -1892
  101. package/types.ts +0 -165
  102. package/utils/haptics.ts +0 -45
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@machinespirits/eval",
3
- "version": "0.1.2",
3
+ "version": "0.2.1",
4
4
  "description": "Evaluation system for Machine Spirits tutor - benchmarking, rubric evaluation, and analysis tools",
5
5
  "type": "module",
6
6
  "main": "index.js",
@@ -8,35 +8,27 @@
8
8
  ".": "./index.js",
9
9
  "./services/*": "./services/*.js",
10
10
  "./routes/*": "./routes/*.js",
11
- "./config/*": "./config/*",
12
- "./components/*": "./components/*.tsx",
13
- "./components/mobile/*": "./components/mobile/*.tsx",
14
- "./components/comparison": "./components/comparison/index.ts",
15
- "./components/comparison/*": "./components/comparison/*.tsx",
16
- "./hooks/*": "./hooks/*.ts",
17
- "./types": "./types.ts",
18
- "./utils/*": "./utils/*.ts"
11
+ "./config/*": "./config/*"
19
12
  },
20
13
  "files": [
21
14
  "index.js",
22
15
  "server.js",
23
- "server-init.js",
24
- "types.ts",
25
16
  "routes/",
26
17
  "services/",
27
- "components/",
28
- "hooks/",
29
18
  "config/",
30
19
  "scripts/",
31
- "utils/",
32
- "docs/"
20
+ "docs/EVALUATION-VARIABLES.md",
21
+ "docs/REPLICATION-PLAN.md"
33
22
  ],
34
23
  "scripts": {
35
24
  "start": "STANDALONE=true node server.js",
36
25
  "dev": "STANDALONE=true node server.js",
37
26
  "eval": "node scripts/eval-cli.js",
38
27
  "eval:quick": "node scripts/eval-cli.js quick",
39
- "eval:test": "node scripts/eval-cli.js test"
28
+ "eval:test": "node scripts/eval-cli.js test",
29
+ "seed": "node scripts/seed-db.js",
30
+ "test": "node --test --test-force-exit 'services/__tests__/*.test.js' 'tests/*.test.js'",
31
+ "content:validate": "node scripts/validate-content.js"
40
32
  },
41
33
  "keywords": [
42
34
  "evaluation",
@@ -52,8 +44,8 @@
52
44
  "url": "https://github.com/liammagee/machinespirits-eval"
53
45
  },
54
46
  "peerDependencies": {
55
- "@machinespirits/tutor-core": ">=0.1.0",
56
- "@anthropic-ai/sdk": ">=0.71.0"
47
+ "@anthropic-ai/sdk": "0.71.2",
48
+ "@machinespirits/tutor-core": ">=0.3.1"
57
49
  },
58
50
  "peerDependenciesMeta": {
59
51
  "@anthropic-ai/sdk": {
@@ -61,12 +53,14 @@
61
53
  }
62
54
  },
63
55
  "dependencies": {
64
- "express": "^4.19.2",
65
- "yaml": "^2.8.2",
66
- "better-sqlite3": "^12.5.0"
56
+ "better-sqlite3": "12.5.0",
57
+ "dotenv": "17.2.3",
58
+ "express": "4.19.2",
59
+ "jsonrepair": "3.13.2",
60
+ "yaml": "2.8.2"
67
61
  },
68
62
  "devDependencies": {
69
- "@types/node": "^22.14.0"
63
+ "@types/node": "22.14.0"
70
64
  },
71
65
  "engines": {
72
66
  "node": ">=18.0.0"
@@ -14,21 +14,44 @@ import * as evaluationStore from '../services/evaluationStore.js';
14
14
  import * as learnerConfigLoader from '../services/learnerConfigLoader.js';
15
15
  import * as promptRecommendationService from '../services/promptRecommendationService.js';
16
16
  import interactionEngine from '../services/learnerTutorInteractionEngine.js';
17
- // Import core tutor services from @machinespirits/tutor-core
18
- import {
19
- tutorApiService as tutorApi,
20
- dialogueLogService,
21
- monitoringService,
22
- aiConfigService,
23
- writingPadService
24
- } from '@machinespirits/tutor-core';
25
- const { getApiKey, getDefaultModel } = aiConfigService;
26
- const { clearConscious, getWritingPad } = writingPadService;
17
+ import * as evalConfigLoader from '../services/evalConfigLoader.js';
18
+ // Lazy-loaded tutor-core services — resolved on first request so this module
19
+ // can be imported without tutor-core installed at parse time.
20
+ // Module-scoped vars are populated by the middleware below; existing handler
21
+ // code references them unchanged.
22
+ let tutorApi, tutorConfigLoader, dialogueLogService, monitoringService;
23
+ let getApiKey, getDefaultModel, clearConscious, getWritingPad;
24
+ let _tutorCoreLoaded = false;
25
+
26
+ async function ensureTutorCore() {
27
+ if (_tutorCoreLoaded) return;
28
+ const mod = await import('@machinespirits/tutor-core');
29
+ tutorApi = mod.tutorApiService;
30
+ tutorConfigLoader = mod.tutorConfigLoader;
31
+ dialogueLogService = mod.dialogueLogService;
32
+ monitoringService = mod.monitoringService;
33
+ getApiKey = mod.aiConfigService.getApiKey;
34
+ getDefaultModel = mod.aiConfigService.getDefaultModel;
35
+ clearConscious = mod.writingPadService.clearConscious;
36
+ getWritingPad = mod.writingPadService.getWritingPad;
37
+ _tutorCoreLoaded = true;
38
+ }
39
+
27
40
  import fs from 'fs';
28
41
  import path from 'path';
29
42
 
30
43
  const router = Router();
31
44
 
45
+ // Resolve tutor-core on first request
46
+ router.use(async (req, res, next) => {
47
+ try {
48
+ await ensureTutorCore();
49
+ next();
50
+ } catch (err) {
51
+ res.status(503).json({ error: 'tutor-core not available', message: err.message });
52
+ }
53
+ });
54
+
32
55
  // ============================================================================
33
56
  // CRASH PROTECTION: Track active evaluation streams
34
57
  // ============================================================================
@@ -139,7 +162,7 @@ const PROMPTS_DIR = path.join(process.cwd(), 'prompts');
139
162
  */
140
163
  router.get('/scenarios', (req, res) => {
141
164
  try {
142
- const scenarios = tutorApi.listScenarios();
165
+ const scenarios = evalConfigLoader.listScenarios();
143
166
  res.json({ success: true, scenarios });
144
167
  } catch (error) {
145
168
  console.error('[EvalRoutes] List scenarios error:', error);
@@ -153,7 +176,7 @@ router.get('/scenarios', (req, res) => {
153
176
  */
154
177
  router.get('/scenarios/:id', (req, res) => {
155
178
  try {
156
- const scenario = tutorApi.getScenario(req.params.id);
179
+ const scenario = evalConfigLoader.getScenario(req.params.id);
157
180
  if (!scenario) {
158
181
  return res.status(404).json({ error: 'Scenario not found' });
159
182
  }
@@ -170,7 +193,7 @@ router.get('/scenarios/:id', (req, res) => {
170
193
  */
171
194
  router.get('/profiles', (req, res) => {
172
195
  try {
173
- const profiles = tutorApi.listProfiles();
196
+ const profiles = tutorConfigLoader.listProfiles();
174
197
  res.json({ success: true, profiles });
175
198
  } catch (error) {
176
199
  console.error('[EvalRoutes] List profiles error:', error);
@@ -199,7 +222,7 @@ router.get('/learner-profiles', (req, res) => {
199
222
  */
200
223
  router.get('/configurations', (req, res) => {
201
224
  try {
202
- const configurations = tutorApi.listConfigurations();
225
+ const configurations = evalConfigLoader.listConfigurations();
203
226
  res.json({ success: true, configurations });
204
227
  } catch (error) {
205
228
  console.error('[EvalRoutes] List configurations error:', error);
@@ -218,18 +241,40 @@ router.get('/configurations', (req, res) => {
218
241
  * Body: {
219
242
  * profile: "budget", // Profile name or config string
220
243
  * scenario: "new_user_first_visit", // Scenario ID (optional)
221
- * skipRubric: true // Skip AI judge evaluation (optional)
244
+ * skipRubric: true, // Skip AI judge evaluation (optional)
245
+ * judgeOverride: null, // Override judge model (optional)
246
+ * provider: null, // Override tutor provider (optional)
247
+ * model: null, // Override tutor model (optional)
248
+ * egoModel: null, // Override ego model (optional)
249
+ * superegoStrategy: null, // Superego intervention strategy (optional)
250
+ * hyperparameters: null // Override hyperparameters (optional)
222
251
  * }
223
252
  */
224
253
  router.post('/quick', async (req, res) => {
225
254
  try {
226
- const { profile = 'budget', scenario = 'new_user_first_visit', skipRubric = false } = req.body;
255
+ const {
256
+ profile = 'budget',
257
+ scenario = 'new_user_first_visit',
258
+ skipRubric = false,
259
+ judgeOverride = null,
260
+ provider,
261
+ model,
262
+ egoModel,
263
+ superegoStrategy,
264
+ hyperparameters,
265
+ } = req.body;
227
266
 
228
- // Build config
229
- const config = { profileName: profile };
267
+ // Build config with optional tutor overrides
268
+ const config = {
269
+ profileName: profile,
270
+ ...(provider && { provider }),
271
+ ...(model && { model }),
272
+ ...(egoModel && { egoModel }),
273
+ ...(hyperparameters && { hyperparameters }),
274
+ };
230
275
 
231
276
  // Get scenario name for description
232
- const scenarioDetails = tutorApi.getScenario(scenario);
277
+ const scenarioDetails = evalConfigLoader.getScenario(scenario);
233
278
  const scenarioName = scenarioDetails?.name || scenario;
234
279
 
235
280
  // Create a run to persist result to history
@@ -242,6 +287,11 @@ router.post('/quick', async (req, res) => {
242
287
  profiles: [profile],
243
288
  scenarios: [scenario],
244
289
  scenarioNames: [scenarioName],
290
+ judgeOverride: judgeOverride || undefined,
291
+ ...(provider && { provider }),
292
+ ...(model && { model }),
293
+ ...(egoModel && { egoModel }),
294
+ ...(superegoStrategy && { superegoStrategy }),
245
295
  },
246
296
  });
247
297
 
@@ -249,6 +299,8 @@ router.post('/quick', async (req, res) => {
249
299
  scenarioId: scenario,
250
300
  skipRubricEval: skipRubric,
251
301
  verbose: false,
302
+ judgeOverride,
303
+ superegoStrategy,
252
304
  });
253
305
 
254
306
  // Store result to history
@@ -288,9 +340,9 @@ router.post('/quick', async (req, res) => {
288
340
  totalTokens: (result.inputTokens || 0) + (result.outputTokens || 0),
289
341
  apiCalls: result.apiCalls,
290
342
  dialogueRounds: result.dialogueRounds,
291
- // Evaluator reasoning
343
+ // Judge reasoning
292
344
  evaluationReasoning: result.evaluationReasoning,
293
- evaluatorModel: result.evaluatorModel,
345
+ judgeModel: result.judgeModel,
294
346
  // Scenario context for display (original user request)
295
347
  scenarioContext: scenarioDetails ? {
296
348
  description: scenarioDetails.description,
@@ -344,7 +396,7 @@ router.get('/stream/quick', async (req, res) => {
344
396
  const outputSize = req.query.outputSize || 'normal'; // compact, normal, expanded
345
397
 
346
398
  // Get scenario name for description
347
- const scenarioDetails = tutorApi.getScenario(scenario);
399
+ const scenarioDetails = evalConfigLoader.getScenario(scenario);
348
400
  const scenarioName = scenarioDetails?.name || scenario;
349
401
 
350
402
  // Create a run to persist result to history (status: 'running')
@@ -431,7 +483,7 @@ router.get('/stream/quick', async (req, res) => {
431
483
  dialogueId: result.dialogueId,
432
484
  // Evaluator reasoning
433
485
  evaluationReasoning: result.evaluationReasoning,
434
- evaluatorModel: result.evaluatorModel,
486
+ judgeModel: result.judgeModel,
435
487
  // Scenario context for display (original user request)
436
488
  scenarioContext: scenarioDetails ? {
437
489
  description: scenarioDetails.description,
@@ -557,7 +609,7 @@ router.post('/matrix', async (req, res) => {
557
609
  let { profiles = [], scenarios = 'all', skipRubric = false } = req.body;
558
610
 
559
611
  // Default profiles if none specified
560
- const allProfiles = tutorApi.listProfiles();
612
+ const allProfiles = tutorConfigLoader.listProfiles();
561
613
  if (profiles.length === 0) {
562
614
  profiles = ['budget', 'experimental', 'default', 'fast'].filter(p =>
563
615
  allProfiles.some(ap => ap.name === p)
@@ -576,7 +628,7 @@ router.post('/matrix', async (req, res) => {
576
628
  }
577
629
 
578
630
  // Get scenarios
579
- const allScenarios = tutorApi.listScenarios();
631
+ const allScenarios = evalConfigLoader.listScenarios();
580
632
  const scenariosToRun = scenarios === 'all'
581
633
  ? allScenarios
582
634
  : allScenarios.filter(s => scenarios.includes(s.id));
@@ -752,7 +804,7 @@ router.get('/stream/matrix', async (req, res) => {
752
804
  const outputSize = req.query.outputSize || 'normal';
753
805
 
754
806
  // Get all available profiles
755
- const allProfiles = tutorApi.listProfiles();
807
+ const allProfiles = tutorConfigLoader.listProfiles();
756
808
  if (profiles.length === 0) {
757
809
  profiles = ['budget', 'experimental', 'default', 'fast'].filter(p =>
758
810
  allProfiles.some(ap => ap.name === p)
@@ -767,7 +819,7 @@ router.get('/stream/matrix', async (req, res) => {
767
819
  }
768
820
 
769
821
  // Get scenarios
770
- const allScenarios = tutorApi.listScenarios();
822
+ const allScenarios = evalConfigLoader.listScenarios();
771
823
  const scenariosToRun = scenarios === 'all'
772
824
  ? allScenarios
773
825
  : allScenarios.filter(s => scenarios.includes(s.id));
@@ -1163,7 +1215,7 @@ router.get('/stream/interact', async (req, res) => {
1163
1215
  learnerId,
1164
1216
  personaId: persona,
1165
1217
  tutorProfile,
1166
- learnerProfile: dialogueEnabled ? 'psychodynamic' : 'unified',
1218
+ learnerProfile: dialogueEnabled ? 'ego_superego' : 'unified',
1167
1219
  topic,
1168
1220
  scenario: {
1169
1221
  name: `Interactive Evaluation - ${persona}`,
@@ -1216,8 +1268,8 @@ router.get('/stream/interact', async (req, res) => {
1216
1268
  learnerId,
1217
1269
  personaId: persona,
1218
1270
  tutorProfile,
1219
- learnerArchitecture: dialogueEnabled ? 'psychodynamic' : 'unified',
1220
- learnerProfile: dialogueEnabled ? 'psychodynamic' : 'unified',
1271
+ learnerArchitecture: dialogueEnabled ? 'ego_superego' : 'unified',
1272
+ learnerProfile: dialogueEnabled ? 'ego_superego' : 'unified',
1221
1273
  topic,
1222
1274
  interaction: interactionTrace,
1223
1275
  turnCount: interactionTrace.turns.length,
@@ -1253,7 +1305,7 @@ router.get('/stream/interact', async (req, res) => {
1253
1305
  runType: 'interaction',
1254
1306
  profiles: [tutorProfile],
1255
1307
  personaId: persona,
1256
- learnerArchitecture: dialogueEnabled ? 'psychodynamic' : 'unified',
1308
+ learnerArchitecture: dialogueEnabled ? 'ego_superego' : 'unified',
1257
1309
  topic,
1258
1310
  fastMode: !runJudge,
1259
1311
  },
@@ -1852,7 +1904,7 @@ router.post('/prompts/recommend', async (req, res) => {
1852
1904
  profileName = runResults[0]?.profileName || profileName;
1853
1905
  } else if (profile) {
1854
1906
  // Run fresh evaluations
1855
- const allScenarios = tutorApi.listScenarios();
1907
+ const allScenarios = evalConfigLoader.listScenarios();
1856
1908
  const scenariosToRun = scenarios === 'all'
1857
1909
  ? allScenarios
1858
1910
  : allScenarios.filter(s => scenarios.includes(s.id));
@@ -1944,7 +1996,7 @@ router.get('/stream/run', async (req, res) => {
1944
1996
  const outputSize = req.query.outputSize || 'normal';
1945
1997
 
1946
1998
  // Get all scenarios to run
1947
- const allScenarios = tutorApi.listScenarios();
1999
+ const allScenarios = evalConfigLoader.listScenarios();
1948
2000
  const scenariosToRun = scenarios === 'all'
1949
2001
  ? allScenarios
1950
2002
  : allScenarios.filter(s => scenarios.includes(s.id));
@@ -2513,7 +2565,7 @@ router.get('/runs/:runId/resume-status', (req, res) => {
2513
2565
  }
2514
2566
 
2515
2567
  // Get scenarios
2516
- const allScenarios = tutorApi.listScenarios();
2568
+ const allScenarios = evalConfigLoader.listScenarios();
2517
2569
  const scenarios = scenariosParam === 'all'
2518
2570
  ? allScenarios
2519
2571
  : allScenarios.filter(s => scenariosParam.includes(s.id));
@@ -2669,7 +2721,7 @@ router.get('/stream/recognition-ab', async (req, res) => {
2669
2721
  const outputSize = req.query.outputSize || 'normal';
2670
2722
 
2671
2723
  // Validate profiles exist
2672
- const allProfiles = tutorApi.listProfiles();
2724
+ const allProfiles = tutorConfigLoader.listProfiles();
2673
2725
  const validProfiles = profiles.filter(p => allProfiles.some(ap => ap.name === p));
2674
2726
 
2675
2727
  if (validProfiles.length !== 2) {
@@ -2682,7 +2734,7 @@ router.get('/stream/recognition-ab', async (req, res) => {
2682
2734
  }
2683
2735
 
2684
2736
  // Get only recognition_test scenarios
2685
- const allScenarios = tutorApi.listScenarios();
2737
+ const allScenarios = evalConfigLoader.listScenarios();
2686
2738
  const recognitionScenarios = allScenarios.filter(s => s.recognition_test === true);
2687
2739
 
2688
2740
  if (recognitionScenarios.length === 0) {