@machinespirits/eval 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +91 -9
- package/config/eval-settings.yaml +3 -3
- package/config/paper-manifest.json +486 -0
- package/config/providers.yaml +9 -6
- package/config/tutor-agents.yaml +2261 -0
- package/content/README.md +23 -0
- package/content/courses/479/course.md +53 -0
- package/content/courses/479/lecture-1.md +361 -0
- package/content/courses/479/lecture-2.md +360 -0
- package/content/courses/479/lecture-3.md +655 -0
- package/content/courses/479/lecture-4.md +530 -0
- package/content/courses/479/lecture-5.md +326 -0
- package/content/courses/479/lecture-6.md +346 -0
- package/content/courses/479/lecture-7.md +326 -0
- package/content/courses/479/lecture-8.md +273 -0
- package/content/courses/479/roadmap-slides.md +656 -0
- package/content/manifest.yaml +8 -0
- package/docs/research/build.sh +44 -20
- package/docs/research/figures/figure10.png +0 -0
- package/docs/research/figures/figure11.png +0 -0
- package/docs/research/figures/figure3.png +0 -0
- package/docs/research/figures/figure4.png +0 -0
- package/docs/research/figures/figure5.png +0 -0
- package/docs/research/figures/figure6.png +0 -0
- package/docs/research/figures/figure7.png +0 -0
- package/docs/research/figures/figure8.png +0 -0
- package/docs/research/figures/figure9.png +0 -0
- package/docs/research/header.tex +23 -2
- package/docs/research/paper-full.md +941 -285
- package/docs/research/paper-short.md +216 -585
- package/docs/research/references.bib +132 -0
- package/docs/research/slides-header.tex +188 -0
- package/docs/research/slides-pptx.md +363 -0
- package/docs/research/slides.md +531 -0
- package/docs/research/style-reference-pptx.py +199 -0
- package/package.json +6 -5
- package/scripts/analyze-eval-results.js +69 -17
- package/scripts/analyze-mechanism-traces.js +763 -0
- package/scripts/analyze-modulation-learning.js +498 -0
- package/scripts/analyze-prosthesis.js +144 -0
- package/scripts/analyze-run.js +264 -79
- package/scripts/assess-transcripts.js +853 -0
- package/scripts/browse-transcripts.js +854 -0
- package/scripts/check-parse-failures.js +73 -0
- package/scripts/code-dialectical-modulation.js +1320 -0
- package/scripts/download-data.sh +55 -0
- package/scripts/eval-cli.js +106 -18
- package/scripts/generate-paper-figures.js +663 -0
- package/scripts/generate-paper-figures.py +577 -76
- package/scripts/generate-paper-tables.js +299 -0
- package/scripts/qualitative-analysis-ai.js +3 -3
- package/scripts/render-sequence-diagram.js +694 -0
- package/scripts/test-latency.js +210 -0
- package/scripts/test-rate-limit.js +95 -0
- package/scripts/test-token-budget.js +332 -0
- package/scripts/validate-paper-manifest.js +670 -0
- package/services/__tests__/evalConfigLoader.test.js +2 -2
- package/services/__tests__/learnerRubricEvaluator.test.js +361 -0
- package/services/__tests__/learnerTutorInteractionEngine.test.js +326 -0
- package/services/evaluationRunner.js +975 -98
- package/services/evaluationStore.js +12 -4
- package/services/learnerTutorInteractionEngine.js +27 -2
- package/services/mockProvider.js +133 -0
- package/services/promptRewriter.js +1471 -5
- package/services/rubricEvaluator.js +55 -2
- package/services/transcriptFormatter.js +675 -0
- package/docs/EVALUATION-VARIABLES.md +0 -589
- package/docs/REPLICATION-PLAN.md +0 -577
- package/scripts/analyze-run.mjs +0 -282
- package/scripts/compare-runs.js +0 -44
- package/scripts/compare-suggestions.js +0 -80
- package/scripts/dig-into-run.js +0 -158
- package/scripts/show-failed-suggestions.js +0 -64
- /package/scripts/{check-run.mjs → check-run.js} +0 -0
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
#!/usr/bin/env bash
#
# Download evaluation databases from the GitHub release.
# Creates symlinks in data/ pointing to ~/.machinespirits-data/.
#
# Usage: ./scripts/download-data.sh [--tag v2.3.14 | --tag=v2.3.14 | v2.3.14]
#
# Requires: gh (GitHub CLI), authenticated

set -euo pipefail

REPO="liammagee/machinespirits-eval"
DEFAULT_TAG="v2.3.14"
TAG="${DEFAULT_TAG}"
DATA_DIR="$HOME/.machinespirits-data"
LINK_DIR="$(cd "$(dirname "$0")/.." && pwd)/data"

# Resolve the release tag from the command line.
# Accepted forms: "--tag <value>" (two arguments, as shown in Usage),
# "--tag=<value>", a single quoted "--tag <value>" argument, or a bare tag.
# The two-argument form must be handled explicitly: with it, $1 is just
# "--tag", which prefix-stripping alone would pass to gh as the tag name.
if [ "$#" -gt 0 ]; then
  case "$1" in
    --tag)
      # Short-circuit keeps $2 from being expanded when unset (set -u).
      if [ "$#" -lt 2 ] || [ -z "$2" ]; then
        echo "Error: --tag requires a value (e.g. --tag ${DEFAULT_TAG})" >&2
        exit 1
      fi
      TAG="$2"
      ;;
    --tag=*)
      TAG="${1#--tag=}"
      ;;
    "--tag "*)
      TAG="${1#--tag }"
      ;;
    *)
      # Bare tag; an empty argument falls back to the default.
      TAG="${1:-${DEFAULT_TAG}}"
      ;;
  esac
fi

echo "Downloading evaluation databases from release ${TAG}..."
echo "  Target: ${DATA_DIR}"
echo ""

mkdir -p "${DATA_DIR}"

DB_FILES=(evaluations.db learner-writing-pad.db tutor-writing-pad.db writing-pads.db)

# Fetch each database unless a copy is already present in DATA_DIR.
for f in "${DB_FILES[@]}"; do
  if [ -f "${DATA_DIR}/${f}" ]; then
    echo "  [skip] ${f} (already exists)"
  else
    echo "  [download] ${f}..."
    gh release download "${TAG}" --repo "${REPO}" --pattern "${f}" --dir "${DATA_DIR}"
  fi
done

echo ""
echo "Creating symlinks in ${LINK_DIR}/..."

# ln -s fails if the link's parent directory is missing, so create it first.
mkdir -p "${LINK_DIR}"

# Link each database into the project's data/ directory, never overwriting
# an existing symlink or regular file.
for f in "${DB_FILES[@]}"; do
  target="${DATA_DIR}/${f}"
  link="${LINK_DIR}/${f}"
  if [ -L "${link}" ]; then
    echo "  [skip] ${f} (symlink exists)"
  elif [ -f "${link}" ]; then
    echo "  [skip] ${f} (regular file exists — remove manually if you want a symlink)"
  else
    ln -s "${target}" "${link}"
    echo "  [link] ${f} -> ${target}"
  fi
done

echo ""
# --input-type=module is needed because the hint uses a static `import`,
# which `node -e` otherwise evaluates as CommonJS.
echo "Done. Verify with: node --input-type=module -e \"import Database from 'better-sqlite3'; const db = new Database('data/evaluations.db'); console.log(db.prepare('SELECT COUNT(*) as n FROM evaluation_results').get())\""
|
package/scripts/eval-cli.js
CHANGED
|
@@ -44,6 +44,7 @@ import 'dotenv/config';
|
|
|
44
44
|
* --refresh <ms> Refresh interval for 'watch' (default: 2000) or 'evaluate --follow' (default: 5000)
|
|
45
45
|
* --force Actually complete stale runs (for 'cleanup'; dry-run without it)
|
|
46
46
|
* --older-than <min> Staleness threshold in minutes (for 'cleanup', default: 30)
|
|
47
|
+
* --dry-run Use mock data instead of API calls (no API keys required)
|
|
47
48
|
*
|
|
48
49
|
* The default `run` uses the 2x2x2 factorial design:
|
|
49
50
|
* Factor A: Recognition prompts (off / on)
|
|
@@ -68,6 +69,7 @@ import { buildLearnerEvaluationPrompt, calculateLearnerOverallScore } from '../s
|
|
|
68
69
|
import { readProgressLog, getProgressLogPath } from '../services/progressLogger.js';
|
|
69
70
|
import * as evalConfigLoader from '../services/evalConfigLoader.js';
|
|
70
71
|
const { getScenario } = evalConfigLoader;
|
|
72
|
+
import { formatTranscript } from '../services/transcriptFormatter.js';
|
|
71
73
|
import { spawn } from 'child_process';
|
|
72
74
|
import readline from 'readline';
|
|
73
75
|
import fs from 'fs';
|
|
@@ -767,15 +769,17 @@ async function main() {
|
|
|
767
769
|
const scenarioId = getOption('scenario', 'new_user_first_visit');
|
|
768
770
|
const profile = getOption('profile', 'budget');
|
|
769
771
|
const verbose = getFlag('verbose');
|
|
772
|
+
const dryRun = getFlag('dry-run');
|
|
770
773
|
const evalSettingsQt = evalConfigLoader.getEvalSettings();
|
|
771
|
-
const skipRubricEval = getFlag('skip-rubric') || !evalSettingsQt.useAIJudge;
|
|
774
|
+
const skipRubricEval = dryRun ? false : (getFlag('skip-rubric') || !evalSettingsQt.useAIJudge);
|
|
772
775
|
const config = { profileName: profile };
|
|
773
776
|
|
|
774
|
-
console.log(`\nRunning quick test (profile: ${profile}, scenario: ${scenarioId})...\n`);
|
|
777
|
+
console.log(`\nRunning quick test (profile: ${profile}, scenario: ${scenarioId}${dryRun ? ', dry-run' : ''})...\n`);
|
|
775
778
|
const result = await evaluationRunner.quickTest(config, {
|
|
776
779
|
scenarioId,
|
|
777
780
|
verbose,
|
|
778
781
|
skipRubricEval,
|
|
782
|
+
dryRun,
|
|
779
783
|
});
|
|
780
784
|
console.log('\nResult:');
|
|
781
785
|
console.log(JSON.stringify(result, null, 2));
|
|
@@ -784,9 +788,11 @@ async function main() {
|
|
|
784
788
|
|
|
785
789
|
case 'run': {
|
|
786
790
|
const verbose = getFlag('verbose');
|
|
791
|
+
const dryRun = getFlag('dry-run');
|
|
787
792
|
// CLI --use-rubric forces rubric on; --skip-rubric forces off; otherwise use config default
|
|
793
|
+
// --dry-run always enables rubric (mock judge has no cost)
|
|
788
794
|
const evalSettings = evalConfigLoader.getEvalSettings();
|
|
789
|
-
const skipRubricEval = getFlag('use-rubric') ? false : (getFlag('skip-rubric') || !evalSettings.useAIJudge);
|
|
795
|
+
const skipRubricEval = dryRun ? false : (getFlag('use-rubric') ? false : (getFlag('skip-rubric') || !evalSettings.useAIJudge));
|
|
790
796
|
const runsPerConfig = parseInt(getOption('runs', '1'), 10);
|
|
791
797
|
const parallelism = parseInt(getOption('parallelism', '2'), 10);
|
|
792
798
|
const description = getOption('description');
|
|
@@ -796,6 +802,9 @@ async function main() {
|
|
|
796
802
|
const modelOverride = getOption('model');
|
|
797
803
|
const egoModelOverride = getOption('ego-model');
|
|
798
804
|
const superegoModelOverride = getOption('superego-model');
|
|
805
|
+
const learnerModelOverride = getOption('learner-model');
|
|
806
|
+
const transcriptMode = getFlag('transcript');
|
|
807
|
+
const maxTokensOverride = getOption('max-tokens');
|
|
799
808
|
|
|
800
809
|
// --cluster and --scenario are mutually exclusive
|
|
801
810
|
if (clusterOpt && scenarioOpt) {
|
|
@@ -847,6 +856,8 @@ async function main() {
|
|
|
847
856
|
if (egoModelOverride) console.log(` Ego model override: ${egoModelOverride}`);
|
|
848
857
|
if (superegoModelOverride) console.log(` Superego model override: ${superegoModelOverride}`);
|
|
849
858
|
}
|
|
859
|
+
if (learnerModelOverride) console.log(` Learner model override: ${learnerModelOverride}`);
|
|
860
|
+
if (maxTokensOverride) console.log(` Max tokens override: ${maxTokensOverride}`);
|
|
850
861
|
console.log('');
|
|
851
862
|
}
|
|
852
863
|
|
|
@@ -860,12 +871,16 @@ async function main() {
|
|
|
860
871
|
runsPerConfig,
|
|
861
872
|
parallelism,
|
|
862
873
|
skipRubricEval,
|
|
863
|
-
description: description || (isFactorial ? '2x2x2 Factorial Evaluation' : null),
|
|
874
|
+
description: description || (dryRun ? 'Dry-run evaluation (mock data)' : (isFactorial ? '2x2x2 Factorial Evaluation' : null)),
|
|
864
875
|
verbose,
|
|
865
876
|
scenarioFilter: clusterOpt || null,
|
|
866
877
|
modelOverride: modelOverride || null,
|
|
867
878
|
egoModelOverride: egoModelOverride || null,
|
|
868
879
|
superegoModelOverride: superegoModelOverride || null,
|
|
880
|
+
learnerModelOverride: learnerModelOverride || null,
|
|
881
|
+
dryRun,
|
|
882
|
+
transcriptMode,
|
|
883
|
+
maxTokensOverride: maxTokensOverride ? parseInt(maxTokensOverride, 10) : null,
|
|
869
884
|
});
|
|
870
885
|
// Extract unique model aliases used across all configs (ego + superego)
|
|
871
886
|
const extractAlias = (raw) => {
|
|
@@ -884,6 +899,63 @@ async function main() {
|
|
|
884
899
|
if (modelAliases.length > 0) {
|
|
885
900
|
console.log(`Models: ${modelAliases.join(', ')}`);
|
|
886
901
|
}
|
|
902
|
+
|
|
903
|
+
// Token / cost / latency summary report
|
|
904
|
+
if (result.runId) {
|
|
905
|
+
const runResults = evaluationStore.getResults(result.runId);
|
|
906
|
+
if (runResults.length > 0) {
|
|
907
|
+
console.log('\n' + '='.repeat(80));
|
|
908
|
+
console.log(' TOKEN & COST SUMMARY');
|
|
909
|
+
console.log('='.repeat(80));
|
|
910
|
+
|
|
911
|
+
// Per-result breakdown
|
|
912
|
+
const header = ' # | Scenario | In Tok | Out Tok | API | Rounds | Latency | Cost';
|
|
913
|
+
const divider = ' ' + '-'.repeat(header.length - 2);
|
|
914
|
+
console.log(header);
|
|
915
|
+
console.log(divider);
|
|
916
|
+
|
|
917
|
+
let totalIn = 0, totalOut = 0, totalApi = 0, totalRounds = 0, totalLatency = 0, totalCost = 0;
|
|
918
|
+
|
|
919
|
+
runResults.forEach((r, i) => {
|
|
920
|
+
const inTok = r.input_tokens || r.inputTokens || 0;
|
|
921
|
+
const outTok = r.output_tokens || r.outputTokens || 0;
|
|
922
|
+
const apiCalls = r.api_calls || r.apiCalls || 0;
|
|
923
|
+
const rounds = r.dialogue_rounds || r.dialogueRounds || 0;
|
|
924
|
+
const latMs = r.latency_ms || r.latencyMs || 0;
|
|
925
|
+
const cost = r.cost || 0;
|
|
926
|
+
|
|
927
|
+
totalIn += inTok;
|
|
928
|
+
totalOut += outTok;
|
|
929
|
+
totalApi += apiCalls;
|
|
930
|
+
totalRounds += rounds;
|
|
931
|
+
totalLatency += latMs;
|
|
932
|
+
totalCost += cost;
|
|
933
|
+
|
|
934
|
+
const scenLabel = (r.scenario_id || r.scenarioId || '').substring(0, 32).padEnd(32);
|
|
935
|
+
const latStr = latMs >= 1000 ? `${(latMs / 1000).toFixed(1)}s` : `${latMs}ms`;
|
|
936
|
+
const costStr = cost > 0 ? `$${cost.toFixed(4)}` : '-';
|
|
937
|
+
console.log(` ${String(i + 1).padStart(2)} | ${scenLabel} | ${String(inTok).padStart(7)} | ${String(outTok).padStart(7)} | ${String(apiCalls).padStart(4)} | ${String(rounds).padStart(6)} | ${latStr.padStart(9)} | ${costStr}`);
|
|
938
|
+
});
|
|
939
|
+
|
|
940
|
+
console.log(divider);
|
|
941
|
+
const totalLatStr = totalLatency >= 1000 ? `${(totalLatency / 1000).toFixed(1)}s` : `${totalLatency}ms`;
|
|
942
|
+
const totalCostStr = totalCost > 0 ? `$${totalCost.toFixed(4)}` : '-';
|
|
943
|
+
console.log(` ${'TOTAL'.padStart(2)} | ${''.padEnd(32)} | ${String(totalIn).padStart(7)} | ${String(totalOut).padStart(7)} | ${String(totalApi).padStart(4)} | ${String(totalRounds).padStart(6)} | ${totalLatStr.padStart(9)} | ${totalCostStr}`);
|
|
944
|
+
|
|
945
|
+
// Per-token cost efficiency
|
|
946
|
+
const totalTok = totalIn + totalOut;
|
|
947
|
+
if (totalTok > 0) {
|
|
948
|
+
const avgLatPerCall = totalApi > 0 ? (totalLatency / totalApi / 1000).toFixed(2) : '-';
|
|
949
|
+
console.log(`\n Tokens: ${totalTok.toLocaleString()} total (${totalIn.toLocaleString()} in + ${totalOut.toLocaleString()} out)`);
|
|
950
|
+
console.log(` Avg latency/API call: ${avgLatPerCall}s | Results: ${runResults.length} | API calls: ${totalApi}`);
|
|
951
|
+
if (totalCost > 0) {
|
|
952
|
+
console.log(` Cost/1K tokens: $${(totalCost / totalTok * 1000).toFixed(4)}`);
|
|
953
|
+
}
|
|
954
|
+
}
|
|
955
|
+
console.log('='.repeat(80));
|
|
956
|
+
}
|
|
957
|
+
}
|
|
958
|
+
|
|
887
959
|
console.log(JSON.stringify(result, null, 2));
|
|
888
960
|
|
|
889
961
|
// Factorial post-analysis: print cell means and ANOVA for each score type
|
|
@@ -964,6 +1036,11 @@ async function main() {
|
|
|
964
1036
|
} else if (run.completedResults > 0) {
|
|
965
1037
|
progress = `${run.completedResults} done`;
|
|
966
1038
|
}
|
|
1039
|
+
// Show per-turn progress for running multi-turn tests
|
|
1040
|
+
const turnProgress = run.metadata?.turnProgress;
|
|
1041
|
+
if (run.status === 'running' && turnProgress) {
|
|
1042
|
+
progress += ` T${turnProgress.current}/${turnProgress.total}`;
|
|
1043
|
+
}
|
|
967
1044
|
const avg = run.avgScore != null ? run.avgScore.toFixed(1) : '--';
|
|
968
1045
|
// Duration formatting
|
|
969
1046
|
let duration = '--';
|
|
@@ -1223,11 +1300,18 @@ async function main() {
|
|
|
1223
1300
|
case 'transcript': {
|
|
1224
1301
|
const runId = args.find(a => !a.startsWith('--') && a !== 'transcript');
|
|
1225
1302
|
if (!runId) {
|
|
1226
|
-
console.error('Usage: eval-cli.js transcript <runId> [--scenario <id>]');
|
|
1303
|
+
console.error('Usage: eval-cli.js transcript <runId> [--scenario <id>] [--detail play|compact|messages-only|full|bilateral]');
|
|
1227
1304
|
process.exit(1);
|
|
1228
1305
|
}
|
|
1229
1306
|
|
|
1230
1307
|
const scenarioFilter = getOption('scenario');
|
|
1308
|
+
// Determine detail level: --compact and --messages-only are shortcuts, --detail is explicit
|
|
1309
|
+
let detailLevel = getOption('detail') || 'play';
|
|
1310
|
+
if (getFlag('compact')) detailLevel = 'compact';
|
|
1311
|
+
if (getFlag('messages-only')) detailLevel = 'messages-only';
|
|
1312
|
+
if (getFlag('full')) detailLevel = 'full';
|
|
1313
|
+
if (getFlag('bilateral')) detailLevel = 'bilateral';
|
|
1314
|
+
|
|
1231
1315
|
const results = evaluationStore.getResults(runId, {
|
|
1232
1316
|
scenarioId: scenarioFilter || null,
|
|
1233
1317
|
});
|
|
@@ -1237,7 +1321,7 @@ async function main() {
|
|
|
1237
1321
|
break;
|
|
1238
1322
|
}
|
|
1239
1323
|
|
|
1240
|
-
console.log(`\nTranscripts for run: ${runId} (${results.length} results)\n`);
|
|
1324
|
+
console.log(`\nTranscripts for run: ${runId} (${results.length} results, detail: ${detailLevel})\n`);
|
|
1241
1325
|
|
|
1242
1326
|
for (const result of results) {
|
|
1243
1327
|
console.log('='.repeat(80));
|
|
@@ -1246,10 +1330,9 @@ async function main() {
|
|
|
1246
1330
|
console.log(`Score: ${result.overallScore != null ? result.overallScore.toFixed(1) : '--'} | Success: ${result.success}`);
|
|
1247
1331
|
console.log('-'.repeat(80));
|
|
1248
1332
|
|
|
1249
|
-
// Try dialogue log file first
|
|
1333
|
+
// Try dialogue log file first (rich trace with metadata)
|
|
1250
1334
|
let printed = false;
|
|
1251
1335
|
if (result.dialogueId) {
|
|
1252
|
-
// Search for the dialogue file (may include date prefix in filename)
|
|
1253
1336
|
const files = fs.existsSync(LOGS_DIR)
|
|
1254
1337
|
? fs.readdirSync(LOGS_DIR).filter(f => f.includes(result.dialogueId))
|
|
1255
1338
|
: [];
|
|
@@ -1258,24 +1341,29 @@ async function main() {
|
|
|
1258
1341
|
try {
|
|
1259
1342
|
const dialogue = JSON.parse(fs.readFileSync(path.join(LOGS_DIR, files[0]), 'utf-8'));
|
|
1260
1343
|
const trace = dialogue.dialogueTrace || [];
|
|
1261
|
-
|
|
1262
|
-
|
|
1263
|
-
|
|
1344
|
+
if (trace.length > 0) {
|
|
1345
|
+
const formatted = formatTranscript(trace, {
|
|
1346
|
+
detail: detailLevel,
|
|
1347
|
+
scenarioName: result.scenarioName || result.scenarioId,
|
|
1348
|
+
profileName: result.profileName,
|
|
1349
|
+
totalTurns: dialogue.totalTurns || 0,
|
|
1350
|
+
});
|
|
1351
|
+
console.log(formatted);
|
|
1352
|
+
printed = true;
|
|
1264
1353
|
}
|
|
1265
|
-
if (trace.length > 0) printed = true;
|
|
1266
1354
|
} catch (e) {
|
|
1267
|
-
// Fall through to
|
|
1355
|
+
// Fall through to legacy format
|
|
1268
1356
|
}
|
|
1269
1357
|
}
|
|
1270
1358
|
}
|
|
1271
1359
|
|
|
1272
|
-
// Fall back to suggestions / raw response from DB
|
|
1360
|
+
// Fall back to legacy format (suggestions / raw response from DB)
|
|
1273
1361
|
if (!printed) {
|
|
1274
1362
|
if (result.suggestions?.length > 0) {
|
|
1275
1363
|
console.log('Suggestions:');
|
|
1276
1364
|
for (const s of result.suggestions) {
|
|
1277
1365
|
const text = typeof s === 'string' ? s : (s.text || s.content || JSON.stringify(s));
|
|
1278
|
-
console.log(`
|
|
1366
|
+
console.log(` \u2022 ${text}`);
|
|
1279
1367
|
}
|
|
1280
1368
|
console.log('');
|
|
1281
1369
|
}
|
|
@@ -1831,7 +1919,7 @@ async function main() {
|
|
|
1831
1919
|
requiredMissing: parsed.validation?.required_missing || [],
|
|
1832
1920
|
forbiddenFound: parsed.validation?.forbidden_found || [],
|
|
1833
1921
|
summary: parsed.summary,
|
|
1834
|
-
judgeModel: modelOverride ? `claude-code/${modelOverride}` : 'claude-
|
|
1922
|
+
judgeModel: modelOverride ? `claude-code/${modelOverride}` : 'claude-opus-4.6',
|
|
1835
1923
|
};
|
|
1836
1924
|
|
|
1837
1925
|
evaluationStore.updateResultScores(result.id, evaluation);
|
|
@@ -2023,7 +2111,7 @@ async function main() {
|
|
|
2023
2111
|
recognitionScore,
|
|
2024
2112
|
scores: normalizedScores,
|
|
2025
2113
|
summary: parsed.summary,
|
|
2026
|
-
judgeModel: modelOverride ? `claude-code/${modelOverride}` : 'claude-
|
|
2114
|
+
judgeModel: modelOverride ? `claude-code/${modelOverride}` : 'claude-opus-4.6',
|
|
2027
2115
|
};
|
|
2028
2116
|
|
|
2029
2117
|
// Save to dialogue log
|
|
@@ -2578,7 +2666,7 @@ async function main() {
|
|
|
2578
2666
|
evaluationStore.updateResultLearnerScores(result.id, {
|
|
2579
2667
|
scores: turnScores,
|
|
2580
2668
|
overallScore: dialogueLearnerScore,
|
|
2581
|
-
judgeModel: modelOverride ? `claude-code/${modelOverride}` : 'claude-
|
|
2669
|
+
judgeModel: modelOverride ? `claude-code/${modelOverride}` : 'claude-opus-4.6',
|
|
2582
2670
|
});
|
|
2583
2671
|
|
|
2584
2672
|
allScores.push(dialogueLearnerScore);
|