@machinespirits/eval 0.1.2 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +161 -0
- package/config/eval-settings.yaml +18 -0
- package/config/evaluation-rubric-learner.yaml +277 -0
- package/config/evaluation-rubric.yaml +613 -0
- package/config/interaction-eval-scenarios.yaml +93 -50
- package/config/learner-agents.yaml +124 -193
- package/config/machinespirits-eval.code-workspace +11 -0
- package/config/providers.yaml +60 -0
- package/config/suggestion-scenarios.yaml +1399 -0
- package/config/tutor-agents.yaml +716 -0
- package/docs/EVALUATION-VARIABLES.md +589 -0
- package/docs/REPLICATION-PLAN.md +577 -0
- package/index.js +15 -6
- package/package.json +16 -22
- package/routes/evalRoutes.js +88 -36
- package/scripts/analyze-judge-reliability.js +401 -0
- package/scripts/analyze-run.js +97 -0
- package/scripts/analyze-run.mjs +282 -0
- package/scripts/analyze-validation-failures.js +141 -0
- package/scripts/check-run.mjs +17 -0
- package/scripts/code-impasse-strategies.js +1132 -0
- package/scripts/compare-runs.js +44 -0
- package/scripts/compare-suggestions.js +80 -0
- package/scripts/compare-transformation.js +116 -0
- package/scripts/dig-into-run.js +158 -0
- package/scripts/eval-cli.js +2626 -0
- package/scripts/generate-paper-figures.py +452 -0
- package/scripts/qualitative-analysis-ai.js +1313 -0
- package/scripts/qualitative-analysis.js +688 -0
- package/scripts/seed-db.js +87 -0
- package/scripts/show-failed-suggestions.js +64 -0
- package/scripts/validate-content.js +192 -0
- package/server.js +3 -2
- package/services/__tests__/evalConfigLoader.test.js +338 -0
- package/services/anovaStats.js +499 -0
- package/services/contentResolver.js +407 -0
- package/services/dialogueTraceAnalyzer.js +454 -0
- package/services/evalConfigLoader.js +625 -0
- package/services/evaluationRunner.js +2171 -270
- package/services/evaluationStore.js +564 -29
- package/services/learnerConfigLoader.js +75 -5
- package/services/learnerRubricEvaluator.js +284 -0
- package/services/learnerTutorInteractionEngine.js +375 -0
- package/services/processUtils.js +18 -0
- package/services/progressLogger.js +98 -0
- package/services/promptRecommendationService.js +31 -26
- package/services/promptRewriter.js +427 -0
- package/services/rubricEvaluator.js +543 -70
- package/services/streamingReporter.js +104 -0
- package/services/turnComparisonAnalyzer.js +494 -0
- package/components/MobileEvalDashboard.tsx +0 -267
- package/components/comparison/DeltaAnalysisTable.tsx +0 -137
- package/components/comparison/ProfileComparisonCard.tsx +0 -176
- package/components/comparison/RecognitionABMode.tsx +0 -385
- package/components/comparison/RecognitionMetricsPanel.tsx +0 -135
- package/components/comparison/WinnerIndicator.tsx +0 -64
- package/components/comparison/index.ts +0 -5
- package/components/mobile/BottomSheet.tsx +0 -233
- package/components/mobile/DimensionBreakdown.tsx +0 -210
- package/components/mobile/DocsView.tsx +0 -363
- package/components/mobile/LogsView.tsx +0 -481
- package/components/mobile/PsychodynamicQuadrant.tsx +0 -261
- package/components/mobile/QuickTestView.tsx +0 -1098
- package/components/mobile/RecognitionTypeChart.tsx +0 -124
- package/components/mobile/RecognitionView.tsx +0 -809
- package/components/mobile/RunDetailView.tsx +0 -261
- package/components/mobile/RunHistoryView.tsx +0 -367
- package/components/mobile/ScoreRadial.tsx +0 -211
- package/components/mobile/StreamingLogPanel.tsx +0 -230
- package/components/mobile/SynthesisStrategyChart.tsx +0 -140
- package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +0 -52
- package/docs/research/ABLATION-MODEL-SELECTION.md +0 -53
- package/docs/research/ADVANCED-EVAL-ANALYSIS.md +0 -60
- package/docs/research/ANOVA-RESULTS-2026-01-14.md +0 -257
- package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +0 -586
- package/docs/research/COST-ANALYSIS.md +0 -56
- package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +0 -340
- package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +0 -291
- package/docs/research/EVAL-SYSTEM-ANALYSIS.md +0 -306
- package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +0 -301
- package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +0 -1988
- package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +0 -282
- package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +0 -147
- package/docs/research/PAPER-EXTENSION-DYADIC.md +0 -204
- package/docs/research/PAPER-UNIFIED.md +0 -659
- package/docs/research/PAPER-UNIFIED.pdf +0 -0
- package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +0 -356
- package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +0 -419
- package/docs/research/apa.csl +0 -2133
- package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +0 -1637
- package/docs/research/archive/paper-multiagent-tutor.tex +0 -978
- package/docs/research/paper-draft/full-paper.md +0 -136
- package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
- package/docs/research/paper-draft/references.bib +0 -515
- package/docs/research/transcript-baseline.md +0 -139
- package/docs/research/transcript-recognition-multiagent.md +0 -187
- package/hooks/useEvalData.ts +0 -625
- package/server-init.js +0 -45
- package/services/benchmarkService.js +0 -1892
- package/types.ts +0 -165
- package/utils/haptics.ts +0 -45
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@machinespirits/eval",
|
|
3
|
-
"version": "0.1.2",
|
|
3
|
+
"version": "0.2.1",
|
|
4
4
|
"description": "Evaluation system for Machine Spirits tutor - benchmarking, rubric evaluation, and analysis tools",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "index.js",
|
|
@@ -8,35 +8,27 @@
|
|
|
8
8
|
".": "./index.js",
|
|
9
9
|
"./services/*": "./services/*.js",
|
|
10
10
|
"./routes/*": "./routes/*.js",
|
|
11
|
-
"./config/*": "./config/*",
|
|
12
|
-
"./components/*": "./components/*.tsx",
|
|
13
|
-
"./components/mobile/*": "./components/mobile/*.tsx",
|
|
14
|
-
"./components/comparison": "./components/comparison/index.ts",
|
|
15
|
-
"./components/comparison/*": "./components/comparison/*.tsx",
|
|
16
|
-
"./hooks/*": "./hooks/*.ts",
|
|
17
|
-
"./types": "./types.ts",
|
|
18
|
-
"./utils/*": "./utils/*.ts"
|
|
11
|
+
"./config/*": "./config/*"
|
|
19
12
|
},
|
|
20
13
|
"files": [
|
|
21
14
|
"index.js",
|
|
22
15
|
"server.js",
|
|
23
|
-
"server-init.js",
|
|
24
|
-
"types.ts",
|
|
25
16
|
"routes/",
|
|
26
17
|
"services/",
|
|
27
|
-
"components/",
|
|
28
|
-
"hooks/",
|
|
29
18
|
"config/",
|
|
30
19
|
"scripts/",
|
|
31
|
-
"
|
|
32
|
-
"docs/"
|
|
20
|
+
"docs/EVALUATION-VARIABLES.md",
|
|
21
|
+
"docs/REPLICATION-PLAN.md"
|
|
33
22
|
],
|
|
34
23
|
"scripts": {
|
|
35
24
|
"start": "STANDALONE=true node server.js",
|
|
36
25
|
"dev": "STANDALONE=true node server.js",
|
|
37
26
|
"eval": "node scripts/eval-cli.js",
|
|
38
27
|
"eval:quick": "node scripts/eval-cli.js quick",
|
|
39
|
-
"eval:test": "node scripts/eval-cli.js test"
|
|
28
|
+
"eval:test": "node scripts/eval-cli.js test",
|
|
29
|
+
"seed": "node scripts/seed-db.js",
|
|
30
|
+
"test": "node --test --test-force-exit 'services/__tests__/*.test.js' 'tests/*.test.js'",
|
|
31
|
+
"content:validate": "node scripts/validate-content.js"
|
|
40
32
|
},
|
|
41
33
|
"keywords": [
|
|
42
34
|
"evaluation",
|
|
@@ -52,8 +44,8 @@
|
|
|
52
44
|
"url": "https://github.com/liammagee/machinespirits-eval"
|
|
53
45
|
},
|
|
54
46
|
"peerDependencies": {
|
|
55
|
-
"@
|
|
56
|
-
"@
|
|
47
|
+
"@anthropic-ai/sdk": "0.71.2",
|
|
48
|
+
"@machinespirits/tutor-core": ">=0.3.1"
|
|
57
49
|
},
|
|
58
50
|
"peerDependenciesMeta": {
|
|
59
51
|
"@anthropic-ai/sdk": {
|
|
@@ -61,12 +53,14 @@
|
|
|
61
53
|
}
|
|
62
54
|
},
|
|
63
55
|
"dependencies": {
|
|
64
|
-
"
|
|
65
|
-
"
|
|
66
|
-
"
|
|
56
|
+
"better-sqlite3": "12.5.0",
|
|
57
|
+
"dotenv": "17.2.3",
|
|
58
|
+
"express": "4.19.2",
|
|
59
|
+
"jsonrepair": "3.13.2",
|
|
60
|
+
"yaml": "2.8.2"
|
|
67
61
|
},
|
|
68
62
|
"devDependencies": {
|
|
69
|
-
"@types/node": "
|
|
63
|
+
"@types/node": "22.14.0"
|
|
70
64
|
},
|
|
71
65
|
"engines": {
|
|
72
66
|
"node": ">=18.0.0"
|
package/routes/evalRoutes.js
CHANGED
|
@@ -14,21 +14,44 @@ import * as evaluationStore from '../services/evaluationStore.js';
|
|
|
14
14
|
import * as learnerConfigLoader from '../services/learnerConfigLoader.js';
|
|
15
15
|
import * as promptRecommendationService from '../services/promptRecommendationService.js';
|
|
16
16
|
import interactionEngine from '../services/learnerTutorInteractionEngine.js';
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
17
|
+
import * as evalConfigLoader from '../services/evalConfigLoader.js';
|
|
18
|
+
// Lazy-loaded tutor-core services — resolved on first request so this module
|
|
19
|
+
// can be imported without tutor-core installed at parse time.
|
|
20
|
+
// Module-scoped vars are populated by the middleware below; existing handler
|
|
21
|
+
// code references them unchanged.
|
|
22
|
+
let tutorApi, tutorConfigLoader, dialogueLogService, monitoringService;
|
|
23
|
+
let getApiKey, getDefaultModel, clearConscious, getWritingPad;
|
|
24
|
+
let _tutorCoreLoaded = false;
|
|
25
|
+
|
|
26
|
+
async function ensureTutorCore() {
|
|
27
|
+
if (_tutorCoreLoaded) return;
|
|
28
|
+
const mod = await import('@machinespirits/tutor-core');
|
|
29
|
+
tutorApi = mod.tutorApiService;
|
|
30
|
+
tutorConfigLoader = mod.tutorConfigLoader;
|
|
31
|
+
dialogueLogService = mod.dialogueLogService;
|
|
32
|
+
monitoringService = mod.monitoringService;
|
|
33
|
+
getApiKey = mod.aiConfigService.getApiKey;
|
|
34
|
+
getDefaultModel = mod.aiConfigService.getDefaultModel;
|
|
35
|
+
clearConscious = mod.writingPadService.clearConscious;
|
|
36
|
+
getWritingPad = mod.writingPadService.getWritingPad;
|
|
37
|
+
_tutorCoreLoaded = true;
|
|
38
|
+
}
|
|
39
|
+
|
|
27
40
|
import fs from 'fs';
|
|
28
41
|
import path from 'path';
|
|
29
42
|
|
|
30
43
|
const router = Router();
|
|
31
44
|
|
|
45
|
+
// Resolve tutor-core on first request
|
|
46
|
+
router.use(async (req, res, next) => {
|
|
47
|
+
try {
|
|
48
|
+
await ensureTutorCore();
|
|
49
|
+
next();
|
|
50
|
+
} catch (err) {
|
|
51
|
+
res.status(503).json({ error: 'tutor-core not available', message: err.message });
|
|
52
|
+
}
|
|
53
|
+
});
|
|
54
|
+
|
|
32
55
|
// ============================================================================
|
|
33
56
|
// CRASH PROTECTION: Track active evaluation streams
|
|
34
57
|
// ============================================================================
|
|
@@ -139,7 +162,7 @@ const PROMPTS_DIR = path.join(process.cwd(), 'prompts');
|
|
|
139
162
|
*/
|
|
140
163
|
router.get('/scenarios', (req, res) => {
|
|
141
164
|
try {
|
|
142
|
-
const scenarios =
|
|
165
|
+
const scenarios = evalConfigLoader.listScenarios();
|
|
143
166
|
res.json({ success: true, scenarios });
|
|
144
167
|
} catch (error) {
|
|
145
168
|
console.error('[EvalRoutes] List scenarios error:', error);
|
|
@@ -153,7 +176,7 @@ router.get('/scenarios', (req, res) => {
|
|
|
153
176
|
*/
|
|
154
177
|
router.get('/scenarios/:id', (req, res) => {
|
|
155
178
|
try {
|
|
156
|
-
const scenario =
|
|
179
|
+
const scenario = evalConfigLoader.getScenario(req.params.id);
|
|
157
180
|
if (!scenario) {
|
|
158
181
|
return res.status(404).json({ error: 'Scenario not found' });
|
|
159
182
|
}
|
|
@@ -170,7 +193,7 @@ router.get('/scenarios/:id', (req, res) => {
|
|
|
170
193
|
*/
|
|
171
194
|
router.get('/profiles', (req, res) => {
|
|
172
195
|
try {
|
|
173
|
-
const profiles =
|
|
196
|
+
const profiles = tutorConfigLoader.listProfiles();
|
|
174
197
|
res.json({ success: true, profiles });
|
|
175
198
|
} catch (error) {
|
|
176
199
|
console.error('[EvalRoutes] List profiles error:', error);
|
|
@@ -199,7 +222,7 @@ router.get('/learner-profiles', (req, res) => {
|
|
|
199
222
|
*/
|
|
200
223
|
router.get('/configurations', (req, res) => {
|
|
201
224
|
try {
|
|
202
|
-
const configurations =
|
|
225
|
+
const configurations = evalConfigLoader.listConfigurations();
|
|
203
226
|
res.json({ success: true, configurations });
|
|
204
227
|
} catch (error) {
|
|
205
228
|
console.error('[EvalRoutes] List configurations error:', error);
|
|
@@ -218,18 +241,40 @@ router.get('/configurations', (req, res) => {
|
|
|
218
241
|
* Body: {
|
|
219
242
|
* profile: "budget", // Profile name or config string
|
|
220
243
|
* scenario: "new_user_first_visit", // Scenario ID (optional)
|
|
221
|
-
* skipRubric: true // Skip AI judge evaluation (optional)
|
|
244
|
+
* skipRubric: true, // Skip AI judge evaluation (optional)
|
|
245
|
+
* judgeOverride: null, // Override judge model (optional)
|
|
246
|
+
* provider: null, // Override tutor provider (optional)
|
|
247
|
+
* model: null, // Override tutor model (optional)
|
|
248
|
+
* egoModel: null, // Override ego model (optional)
|
|
249
|
+
* superegoStrategy: null, // Superego intervention strategy (optional)
|
|
250
|
+
* hyperparameters: null // Override hyperparameters (optional)
|
|
222
251
|
* }
|
|
223
252
|
*/
|
|
224
253
|
router.post('/quick', async (req, res) => {
|
|
225
254
|
try {
|
|
226
|
-
const {
|
|
255
|
+
const {
|
|
256
|
+
profile = 'budget',
|
|
257
|
+
scenario = 'new_user_first_visit',
|
|
258
|
+
skipRubric = false,
|
|
259
|
+
judgeOverride = null,
|
|
260
|
+
provider,
|
|
261
|
+
model,
|
|
262
|
+
egoModel,
|
|
263
|
+
superegoStrategy,
|
|
264
|
+
hyperparameters,
|
|
265
|
+
} = req.body;
|
|
227
266
|
|
|
228
|
-
// Build config
|
|
229
|
-
const config = {
|
|
267
|
+
// Build config with optional tutor overrides
|
|
268
|
+
const config = {
|
|
269
|
+
profileName: profile,
|
|
270
|
+
...(provider && { provider }),
|
|
271
|
+
...(model && { model }),
|
|
272
|
+
...(egoModel && { egoModel }),
|
|
273
|
+
...(hyperparameters && { hyperparameters }),
|
|
274
|
+
};
|
|
230
275
|
|
|
231
276
|
// Get scenario name for description
|
|
232
|
-
const scenarioDetails =
|
|
277
|
+
const scenarioDetails = evalConfigLoader.getScenario(scenario);
|
|
233
278
|
const scenarioName = scenarioDetails?.name || scenario;
|
|
234
279
|
|
|
235
280
|
// Create a run to persist result to history
|
|
@@ -242,6 +287,11 @@ router.post('/quick', async (req, res) => {
|
|
|
242
287
|
profiles: [profile],
|
|
243
288
|
scenarios: [scenario],
|
|
244
289
|
scenarioNames: [scenarioName],
|
|
290
|
+
judgeOverride: judgeOverride || undefined,
|
|
291
|
+
...(provider && { provider }),
|
|
292
|
+
...(model && { model }),
|
|
293
|
+
...(egoModel && { egoModel }),
|
|
294
|
+
...(superegoStrategy && { superegoStrategy }),
|
|
245
295
|
},
|
|
246
296
|
});
|
|
247
297
|
|
|
@@ -249,6 +299,8 @@ router.post('/quick', async (req, res) => {
|
|
|
249
299
|
scenarioId: scenario,
|
|
250
300
|
skipRubricEval: skipRubric,
|
|
251
301
|
verbose: false,
|
|
302
|
+
judgeOverride,
|
|
303
|
+
superegoStrategy,
|
|
252
304
|
});
|
|
253
305
|
|
|
254
306
|
// Store result to history
|
|
@@ -288,9 +340,9 @@ router.post('/quick', async (req, res) => {
|
|
|
288
340
|
totalTokens: (result.inputTokens || 0) + (result.outputTokens || 0),
|
|
289
341
|
apiCalls: result.apiCalls,
|
|
290
342
|
dialogueRounds: result.dialogueRounds,
|
|
291
|
-
//
|
|
343
|
+
// Judge reasoning
|
|
292
344
|
evaluationReasoning: result.evaluationReasoning,
|
|
293
|
-
|
|
345
|
+
judgeModel: result.judgeModel,
|
|
294
346
|
// Scenario context for display (original user request)
|
|
295
347
|
scenarioContext: scenarioDetails ? {
|
|
296
348
|
description: scenarioDetails.description,
|
|
@@ -344,7 +396,7 @@ router.get('/stream/quick', async (req, res) => {
|
|
|
344
396
|
const outputSize = req.query.outputSize || 'normal'; // compact, normal, expanded
|
|
345
397
|
|
|
346
398
|
// Get scenario name for description
|
|
347
|
-
const scenarioDetails =
|
|
399
|
+
const scenarioDetails = evalConfigLoader.getScenario(scenario);
|
|
348
400
|
const scenarioName = scenarioDetails?.name || scenario;
|
|
349
401
|
|
|
350
402
|
// Create a run to persist result to history (status: 'running')
|
|
@@ -431,7 +483,7 @@ router.get('/stream/quick', async (req, res) => {
|
|
|
431
483
|
dialogueId: result.dialogueId,
|
|
432
484
|
// Evaluator reasoning
|
|
433
485
|
evaluationReasoning: result.evaluationReasoning,
|
|
434
|
-
|
|
486
|
+
judgeModel: result.judgeModel,
|
|
435
487
|
// Scenario context for display (original user request)
|
|
436
488
|
scenarioContext: scenarioDetails ? {
|
|
437
489
|
description: scenarioDetails.description,
|
|
@@ -557,7 +609,7 @@ router.post('/matrix', async (req, res) => {
|
|
|
557
609
|
let { profiles = [], scenarios = 'all', skipRubric = false } = req.body;
|
|
558
610
|
|
|
559
611
|
// Default profiles if none specified
|
|
560
|
-
const allProfiles =
|
|
612
|
+
const allProfiles = tutorConfigLoader.listProfiles();
|
|
561
613
|
if (profiles.length === 0) {
|
|
562
614
|
profiles = ['budget', 'experimental', 'default', 'fast'].filter(p =>
|
|
563
615
|
allProfiles.some(ap => ap.name === p)
|
|
@@ -576,7 +628,7 @@ router.post('/matrix', async (req, res) => {
|
|
|
576
628
|
}
|
|
577
629
|
|
|
578
630
|
// Get scenarios
|
|
579
|
-
const allScenarios =
|
|
631
|
+
const allScenarios = evalConfigLoader.listScenarios();
|
|
580
632
|
const scenariosToRun = scenarios === 'all'
|
|
581
633
|
? allScenarios
|
|
582
634
|
: allScenarios.filter(s => scenarios.includes(s.id));
|
|
@@ -752,7 +804,7 @@ router.get('/stream/matrix', async (req, res) => {
|
|
|
752
804
|
const outputSize = req.query.outputSize || 'normal';
|
|
753
805
|
|
|
754
806
|
// Get all available profiles
|
|
755
|
-
const allProfiles =
|
|
807
|
+
const allProfiles = tutorConfigLoader.listProfiles();
|
|
756
808
|
if (profiles.length === 0) {
|
|
757
809
|
profiles = ['budget', 'experimental', 'default', 'fast'].filter(p =>
|
|
758
810
|
allProfiles.some(ap => ap.name === p)
|
|
@@ -767,7 +819,7 @@ router.get('/stream/matrix', async (req, res) => {
|
|
|
767
819
|
}
|
|
768
820
|
|
|
769
821
|
// Get scenarios
|
|
770
|
-
const allScenarios =
|
|
822
|
+
const allScenarios = evalConfigLoader.listScenarios();
|
|
771
823
|
const scenariosToRun = scenarios === 'all'
|
|
772
824
|
? allScenarios
|
|
773
825
|
: allScenarios.filter(s => scenarios.includes(s.id));
|
|
@@ -1163,7 +1215,7 @@ router.get('/stream/interact', async (req, res) => {
|
|
|
1163
1215
|
learnerId,
|
|
1164
1216
|
personaId: persona,
|
|
1165
1217
|
tutorProfile,
|
|
1166
|
-
learnerProfile: dialogueEnabled ? '
|
|
1218
|
+
learnerProfile: dialogueEnabled ? 'ego_superego' : 'unified',
|
|
1167
1219
|
topic,
|
|
1168
1220
|
scenario: {
|
|
1169
1221
|
name: `Interactive Evaluation - ${persona}`,
|
|
@@ -1216,8 +1268,8 @@ router.get('/stream/interact', async (req, res) => {
|
|
|
1216
1268
|
learnerId,
|
|
1217
1269
|
personaId: persona,
|
|
1218
1270
|
tutorProfile,
|
|
1219
|
-
learnerArchitecture: dialogueEnabled ? '
|
|
1220
|
-
learnerProfile: dialogueEnabled ? '
|
|
1271
|
+
learnerArchitecture: dialogueEnabled ? 'ego_superego' : 'unified',
|
|
1272
|
+
learnerProfile: dialogueEnabled ? 'ego_superego' : 'unified',
|
|
1221
1273
|
topic,
|
|
1222
1274
|
interaction: interactionTrace,
|
|
1223
1275
|
turnCount: interactionTrace.turns.length,
|
|
@@ -1253,7 +1305,7 @@ router.get('/stream/interact', async (req, res) => {
|
|
|
1253
1305
|
runType: 'interaction',
|
|
1254
1306
|
profiles: [tutorProfile],
|
|
1255
1307
|
personaId: persona,
|
|
1256
|
-
learnerArchitecture: dialogueEnabled ? '
|
|
1308
|
+
learnerArchitecture: dialogueEnabled ? 'ego_superego' : 'unified',
|
|
1257
1309
|
topic,
|
|
1258
1310
|
fastMode: !runJudge,
|
|
1259
1311
|
},
|
|
@@ -1852,7 +1904,7 @@ router.post('/prompts/recommend', async (req, res) => {
|
|
|
1852
1904
|
profileName = runResults[0]?.profileName || profileName;
|
|
1853
1905
|
} else if (profile) {
|
|
1854
1906
|
// Run fresh evaluations
|
|
1855
|
-
const allScenarios =
|
|
1907
|
+
const allScenarios = evalConfigLoader.listScenarios();
|
|
1856
1908
|
const scenariosToRun = scenarios === 'all'
|
|
1857
1909
|
? allScenarios
|
|
1858
1910
|
: allScenarios.filter(s => scenarios.includes(s.id));
|
|
@@ -1944,7 +1996,7 @@ router.get('/stream/run', async (req, res) => {
|
|
|
1944
1996
|
const outputSize = req.query.outputSize || 'normal';
|
|
1945
1997
|
|
|
1946
1998
|
// Get all scenarios to run
|
|
1947
|
-
const allScenarios =
|
|
1999
|
+
const allScenarios = evalConfigLoader.listScenarios();
|
|
1948
2000
|
const scenariosToRun = scenarios === 'all'
|
|
1949
2001
|
? allScenarios
|
|
1950
2002
|
: allScenarios.filter(s => scenarios.includes(s.id));
|
|
@@ -2513,7 +2565,7 @@ router.get('/runs/:runId/resume-status', (req, res) => {
|
|
|
2513
2565
|
}
|
|
2514
2566
|
|
|
2515
2567
|
// Get scenarios
|
|
2516
|
-
const allScenarios =
|
|
2568
|
+
const allScenarios = evalConfigLoader.listScenarios();
|
|
2517
2569
|
const scenarios = scenariosParam === 'all'
|
|
2518
2570
|
? allScenarios
|
|
2519
2571
|
: allScenarios.filter(s => scenariosParam.includes(s.id));
|
|
@@ -2669,7 +2721,7 @@ router.get('/stream/recognition-ab', async (req, res) => {
|
|
|
2669
2721
|
const outputSize = req.query.outputSize || 'normal';
|
|
2670
2722
|
|
|
2671
2723
|
// Validate profiles exist
|
|
2672
|
-
const allProfiles =
|
|
2724
|
+
const allProfiles = tutorConfigLoader.listProfiles();
|
|
2673
2725
|
const validProfiles = profiles.filter(p => allProfiles.some(ap => ap.name === p));
|
|
2674
2726
|
|
|
2675
2727
|
if (validProfiles.length !== 2) {
|
|
@@ -2682,7 +2734,7 @@ router.get('/stream/recognition-ab', async (req, res) => {
|
|
|
2682
2734
|
}
|
|
2683
2735
|
|
|
2684
2736
|
// Get only recognition_test scenarios
|
|
2685
|
-
const allScenarios =
|
|
2737
|
+
const allScenarios = evalConfigLoader.listScenarios();
|
|
2686
2738
|
const recognitionScenarios = allScenarios.filter(s => s.recognition_test === true);
|
|
2687
2739
|
|
|
2688
2740
|
if (recognitionScenarios.length === 0) {
|