@machinespirits/eval 0.1.2 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +161 -0
- package/config/eval-settings.yaml +18 -0
- package/config/evaluation-rubric-learner.yaml +277 -0
- package/config/evaluation-rubric.yaml +613 -0
- package/config/interaction-eval-scenarios.yaml +93 -50
- package/config/learner-agents.yaml +124 -193
- package/config/machinespirits-eval.code-workspace +11 -0
- package/config/providers.yaml +60 -0
- package/config/suggestion-scenarios.yaml +1399 -0
- package/config/tutor-agents.yaml +716 -0
- package/docs/EVALUATION-VARIABLES.md +589 -0
- package/docs/REPLICATION-PLAN.md +577 -0
- package/index.js +15 -6
- package/package.json +16 -22
- package/routes/evalRoutes.js +88 -36
- package/scripts/analyze-judge-reliability.js +401 -0
- package/scripts/analyze-run.js +97 -0
- package/scripts/analyze-run.mjs +282 -0
- package/scripts/analyze-validation-failures.js +141 -0
- package/scripts/check-run.mjs +17 -0
- package/scripts/code-impasse-strategies.js +1132 -0
- package/scripts/compare-runs.js +44 -0
- package/scripts/compare-suggestions.js +80 -0
- package/scripts/compare-transformation.js +116 -0
- package/scripts/dig-into-run.js +158 -0
- package/scripts/eval-cli.js +2626 -0
- package/scripts/generate-paper-figures.py +452 -0
- package/scripts/qualitative-analysis-ai.js +1313 -0
- package/scripts/qualitative-analysis.js +688 -0
- package/scripts/seed-db.js +87 -0
- package/scripts/show-failed-suggestions.js +64 -0
- package/scripts/validate-content.js +192 -0
- package/server.js +3 -2
- package/services/__tests__/evalConfigLoader.test.js +338 -0
- package/services/anovaStats.js +499 -0
- package/services/contentResolver.js +407 -0
- package/services/dialogueTraceAnalyzer.js +454 -0
- package/services/evalConfigLoader.js +625 -0
- package/services/evaluationRunner.js +2171 -270
- package/services/evaluationStore.js +564 -29
- package/services/learnerConfigLoader.js +75 -5
- package/services/learnerRubricEvaluator.js +284 -0
- package/services/learnerTutorInteractionEngine.js +375 -0
- package/services/processUtils.js +18 -0
- package/services/progressLogger.js +98 -0
- package/services/promptRecommendationService.js +31 -26
- package/services/promptRewriter.js +427 -0
- package/services/rubricEvaluator.js +543 -70
- package/services/streamingReporter.js +104 -0
- package/services/turnComparisonAnalyzer.js +494 -0
- package/components/MobileEvalDashboard.tsx +0 -267
- package/components/comparison/DeltaAnalysisTable.tsx +0 -137
- package/components/comparison/ProfileComparisonCard.tsx +0 -176
- package/components/comparison/RecognitionABMode.tsx +0 -385
- package/components/comparison/RecognitionMetricsPanel.tsx +0 -135
- package/components/comparison/WinnerIndicator.tsx +0 -64
- package/components/comparison/index.ts +0 -5
- package/components/mobile/BottomSheet.tsx +0 -233
- package/components/mobile/DimensionBreakdown.tsx +0 -210
- package/components/mobile/DocsView.tsx +0 -363
- package/components/mobile/LogsView.tsx +0 -481
- package/components/mobile/PsychodynamicQuadrant.tsx +0 -261
- package/components/mobile/QuickTestView.tsx +0 -1098
- package/components/mobile/RecognitionTypeChart.tsx +0 -124
- package/components/mobile/RecognitionView.tsx +0 -809
- package/components/mobile/RunDetailView.tsx +0 -261
- package/components/mobile/RunHistoryView.tsx +0 -367
- package/components/mobile/ScoreRadial.tsx +0 -211
- package/components/mobile/StreamingLogPanel.tsx +0 -230
- package/components/mobile/SynthesisStrategyChart.tsx +0 -140
- package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +0 -52
- package/docs/research/ABLATION-MODEL-SELECTION.md +0 -53
- package/docs/research/ADVANCED-EVAL-ANALYSIS.md +0 -60
- package/docs/research/ANOVA-RESULTS-2026-01-14.md +0 -257
- package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +0 -586
- package/docs/research/COST-ANALYSIS.md +0 -56
- package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +0 -340
- package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +0 -291
- package/docs/research/EVAL-SYSTEM-ANALYSIS.md +0 -306
- package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +0 -301
- package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +0 -1988
- package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +0 -282
- package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +0 -147
- package/docs/research/PAPER-EXTENSION-DYADIC.md +0 -204
- package/docs/research/PAPER-UNIFIED.md +0 -659
- package/docs/research/PAPER-UNIFIED.pdf +0 -0
- package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +0 -356
- package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +0 -419
- package/docs/research/apa.csl +0 -2133
- package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +0 -1637
- package/docs/research/archive/paper-multiagent-tutor.tex +0 -978
- package/docs/research/paper-draft/full-paper.md +0 -136
- package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
- package/docs/research/paper-draft/references.bib +0 -515
- package/docs/research/transcript-baseline.md +0 -139
- package/docs/research/transcript-recognition-multiagent.md +0 -187
- package/hooks/useEvalData.ts +0 -625
- package/server-init.js +0 -45
- package/services/benchmarkService.js +0 -1892
- package/types.ts +0 -165
- package/utils/haptics.ts +0 -45
|
@@ -0,0 +1,2626 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
import 'dotenv/config';
|
|
4
|
+
|
|
5
|
+
/**
|
|
6
|
+
* Evaluation CLI
|
|
7
|
+
*
|
|
8
|
+
* Command-line interface for running tutor evaluations.
|
|
9
|
+
*
|
|
10
|
+
* Usage:
|
|
11
|
+
* node scripts/eval-cli.js # List available options
|
|
12
|
+
* node scripts/eval-cli.js quick # Run a quick test with defaults
|
|
13
|
+
* node scripts/eval-cli.js test # Run a quick test (alias)
|
|
14
|
+
* node scripts/eval-cli.js run # Run 2x2x2 factorial evaluation (default)
|
|
15
|
+
* node scripts/eval-cli.js runs # List past evaluation runs
|
|
16
|
+
* node scripts/eval-cli.js report <runId> # Show report for a previous run
|
|
17
|
+
* node scripts/eval-cli.js transcript <runId> # Show full transcripts for a run
|
|
18
|
+
* node scripts/eval-cli.js status <runId> # Quick snapshot of a run's state
|
|
19
|
+
* node scripts/eval-cli.js watch <runId> # Live-updating progress table
|
|
20
|
+
* node scripts/eval-cli.js export <runId> # Export results to file for offline review
|
|
21
|
+
* node scripts/eval-cli.js cleanup # Preview stale runs (dry-run by default)
|
|
22
|
+
* node scripts/eval-cli.js cleanup --force # Actually mark stale runs as completed
|
|
23
|
+
* node scripts/eval-cli.js resume <runId> # Resume an incomplete run (re-run missing tests)
|
|
24
|
+
* node scripts/eval-cli.js revert <runId> # Revert a completed/failed run to 'running'
|
|
25
|
+
* node scripts/eval-cli.js rejudge <runId> # Re-run AI judge (adds new rows for reliability)
|
|
26
|
+
* node scripts/eval-cli.js rejudge <runId> --overwrite # Re-run AI judge (replaces existing)
|
|
27
|
+
* node scripts/eval-cli.js evaluate <runId> # Judge skip-rubric results via claude CLI
|
|
28
|
+
* node scripts/eval-cli.js evaluate <runId> --follow # Poll & judge results as they appear
|
|
29
|
+
* node scripts/eval-cli.js evaluate-learner <runId> # Score learner turns from multi-turn interactions
|
|
30
|
+
* node scripts/eval-cli.js chat # AI conversational interface
|
|
31
|
+
*
|
|
32
|
+
* Options:
|
|
33
|
+
* --scenario <id> Scenario ID or comma-separated IDs (default: all scenarios)
|
|
34
|
+
* --cluster <name> Scenario cluster filter: single-turn, multi-turn, core, mood, benchmark, recognition, multi_turn (comma-separated OK)
|
|
35
|
+
* --profile <name> Override profile(s) — comma-separated or single name
|
|
36
|
+
* --all-profiles Use ALL profiles instead of the 8 factorial cells
|
|
37
|
+
* --skip-rubric Skip AI-based rubric evaluation
|
|
38
|
+
* --verbose Enable verbose output
|
|
39
|
+
* --runs <n> Replications per cell (for 'run' command, default: 1)
|
|
40
|
+
* --parallelism <n> Parallel test count (for 'run' command, default: 2)
|
|
41
|
+
* --description <text> Description for the evaluation run
|
|
42
|
+
* --db Use SQLite instead of JSONL for 'watch' (slower but persistent)
|
|
43
|
+
* --follow Poll for new results in 'evaluate' (live follow mode)
|
|
44
|
+
* --refresh <ms> Refresh interval for 'watch' (default: 2000) or 'evaluate --follow' (default: 5000)
|
|
45
|
+
* --force Actually complete stale runs (for 'cleanup'; dry-run without it)
|
|
46
|
+
* --older-than <min> Staleness threshold in minutes (for 'cleanup', default: 30)
|
|
47
|
+
*
|
|
48
|
+
* The default `run` uses the 2x2x2 factorial design:
|
|
49
|
+
* Factor A: Recognition prompts (off / on)
|
|
50
|
+
* Factor B: Multi-agent tutor (single / ego+superego)
|
|
51
|
+
* Factor C: Multi-agent learner (unified / ego_superego)
|
|
52
|
+
* = 8 cells, all nemotron (free tier) to isolate architecture effects.
|
|
53
|
+
*
|
|
54
|
+
* Examples:
|
|
55
|
+
* eval-cli.js run --runs 3 # 8 cells × all scenarios × 3 reps
|
|
56
|
+
* eval-cli.js run --runs 1 --scenario new_user_first_visit # Quick single-scenario check
|
|
57
|
+
* eval-cli.js run --cluster multi-turn --runs 1 # Only multi-turn scenarios
|
|
58
|
+
* eval-cli.js run --cluster core,mood --runs 1 # Core + mood scenarios
|
|
59
|
+
* eval-cli.js run --profile budget,baseline # Override: only these profiles
|
|
60
|
+
* eval-cli.js run --all-profiles --runs 1 # Legacy: every profile in tutor-agents.yaml
|
|
61
|
+
*/
|
|
62
|
+
|
|
63
|
+
import * as evaluationRunner from '../services/evaluationRunner.js';
|
|
64
|
+
import * as anovaStats from '../services/anovaStats.js';
|
|
65
|
+
import * as evaluationStore from '../services/evaluationStore.js';
|
|
66
|
+
import { getAvailableJudge, buildEvaluationPrompt, calculateOverallScore, calculateBaseScore, calculateRecognitionScore } from '../services/rubricEvaluator.js';
|
|
67
|
+
import { buildLearnerEvaluationPrompt, calculateLearnerOverallScore } from '../services/learnerRubricEvaluator.js';
|
|
68
|
+
import { readProgressLog, getProgressLogPath } from '../services/progressLogger.js';
|
|
69
|
+
import * as evalConfigLoader from '../services/evalConfigLoader.js';
|
|
70
|
+
const { getScenario } = evalConfigLoader;
|
|
71
|
+
import { spawn } from 'child_process';
|
|
72
|
+
import readline from 'readline';
|
|
73
|
+
import fs from 'fs';
|
|
74
|
+
import path from 'path';
|
|
75
|
+
import { fileURLToPath } from 'url';
|
|
76
|
+
|
|
77
|
+
// Resolve this script's own directory (ES modules have no built-in __dirname).
const __dirname = path.dirname(fileURLToPath(import.meta.url));
// Directory where the evaluation runner writes multi-agent dialogue transcripts;
// read by the transcript/chat commands below.
const LOGS_DIR = path.resolve(__dirname, '..', 'logs', 'tutor-dialogues');

// Raw CLI arguments (minus node + script path). The first token that is not a
// `--flag` selects the subcommand; with no such token we fall back to 'list'.
const args = process.argv.slice(2);
const command = args.find(a => !a.startsWith('--')) || 'list';
+
|
|
83
|
+
/**
 * Report whether the boolean flag `--<name>` was passed on the command line.
 * @param {string} name - Flag name without the leading dashes.
 * @returns {boolean} True when `--<name>` appears anywhere in argv.
 */
function getFlag(name) {
  const flagToken = `--${name}`;
  return args.some(arg => arg === flagToken);
}
|
|
86
|
+
|
|
87
|
+
/**
 * Read the value token following `--<name>` on the command line.
 * @param {string} name - Option name without the leading dashes.
 * @param {*} [defaultValue] - Returned when the option is absent or is the
 *   final argv token (i.e. no value follows it).
 * @returns {*} The token after `--<name>`, or defaultValue.
 */
function getOption(name, defaultValue = undefined) {
  const position = args.indexOf(`--${name}`);
  const hasValue = position !== -1 && position + 1 < args.length;
  return hasValue ? args[position + 1] : defaultValue;
}
|
|
92
|
+
|
|
93
|
+
import { isPidAlive } from '../services/processUtils.js';
|
|
94
|
+
|
|
95
|
+
// ── watch / status helpers ────────────────────────────────────────
|
|
96
|
+
|
|
97
|
+
/**
 * Render a millisecond duration as a compact human-readable string:
 * "<n>ms" under one second, "<n.n>s" under one minute, otherwise "<m>m <s>s".
 * @param {number} ms - Duration in milliseconds.
 * @returns {string} Formatted duration.
 */
function formatMs(ms) {
  const MS_PER_SECOND = 1000;
  const MS_PER_MINUTE = 60000;

  if (ms < MS_PER_SECOND) {
    return `${ms}ms`;
  }
  if (ms < MS_PER_MINUTE) {
    const seconds = (ms / MS_PER_SECOND).toFixed(1);
    return `${seconds}s`;
  }

  const minutes = Math.floor(ms / MS_PER_MINUTE);
  const remainder = Math.round((ms % MS_PER_MINUTE) / MS_PER_SECOND);
  return `${minutes}m ${remainder}s`;
}
|
|
104
|
+
|
|
105
|
+
/**
 * Render a single dialogue-trace entry as one display line.
 * Supports the legacy shape (role/speaker + content) and the structured
 * multi-agent shape (agent/action plus action-specific payload fields).
 * @param {object} entry - One trace entry from a dialogue log.
 * @returns {string} A bracketed, human-readable line.
 */
function formatTraceEntry(entry) {
  // Legacy entries identify themselves with an explicit role or speaker.
  const legacyRole = entry.role || entry.speaker;
  if (legacyRole) {
    const body = entry.content || entry.message || entry.text || '';
    return `[${legacyRole.toUpperCase()}] ${body}`;
  }

  // Structured multi-agent format.
  const agent = (entry.agent || 'unknown').toUpperCase();
  const action = entry.action || '';

  if (action === 'context_input') {
    const ctx = entry.contextData || {};
    const bits = [];
    if (ctx.currentPage) bits.push(ctx.currentPage.replace(/^\*+:\s*/, ''));
    if (ctx.strugglesCount) bits.push(`${ctx.strugglesCount} struggle signals`);
    if (ctx.sessions) bits.push(`${ctx.sessions} prior sessions`);
    return `[CONTEXT] ${bits.length ? bits.join(', ') : '(scenario input)'}`;
  }

  if (action === 'generate' || action === 'revise') {
    // Both actions carry a suggestions list; only the label differs.
    const titleList = (entry.suggestions || []).map(s => s.title || s.type).join('; ');
    return action === 'generate'
      ? `[EGO → SUPEREGO] Generated: ${titleList}`
      : `[EGO revised] ${titleList}`;
  }

  if (action === 'review') {
    const verdict = entry.verdict || {};
    // ?? keeps an explicit approved:false from falling through to the verdict.
    const approved = entry.approved ?? verdict.approved;
    const tag = approved ? '✓ APPROVED' : '→ REVISE';
    const feedback = entry.feedback || verdict.feedback || '';
    const summary = feedback.length > 200 ? feedback.substring(0, 200) + '…' : feedback;
    return `[SUPEREGO ${tag}] ${summary}`;
  }

  if (action === 'final_output') {
    const detail = entry.contextSummary || entry.detail || `Turn ${(entry.turnIndex || 0) + 1} complete`;
    return `[OUTPUT] ${detail}`;
  }

  if (action === 'turn_action') {
    const learnerMsg = entry.contextSummary || entry.detail || '';
    return `[LEARNER] ${learnerMsg}`;
  }

  // Unknown action: dump whatever textual payload is available.
  const fallback = entry.content || entry.message || entry.text || entry.contextSummary || action;
  return `[${agent}:${action}] ${fallback}`;
}
|
|
161
|
+
|
|
162
|
+
/**
 * Build a scenario×profile grid by replaying JSONL progress events.
 *
 * Handles four event types: run_start (captures scenarios/profiles and the
 * planned test count), test_complete / test_error (fills grid cells and counts
 * completions), run_complete (marks the run finished and records duration).
 * The FIRST run_start's totalTests is kept as the original plan; any later
 * run_start indicates the run was resumed with a smaller remaining set.
 *
 * @param {Array<object>} events - Parsed JSONL event objects, in file order.
 * @returns {{scenarios: string[], profiles: string[], grid: object,
 *   completedTests: number, totalTests: number, runDone: boolean,
 *   durationMs: (number|null), isResumed: boolean}}
 */
function buildGridFromEvents(events) {
  let scenarios = [];
  let profiles = [];
  let originalTotalTests = 0; // From first run_start (original plan)
  let completedTests = 0;
  let runDone = false;
  let durationMs = null;
  let isResumed = false;
  const grid = {}; // grid[scenarioName][profileName] = { score, success, ... }

  for (const ev of events) {
    if (ev.eventType === 'run_start') {
      scenarios = ev.scenarios || [];
      profiles = ev.profiles || [];
      // Keep the FIRST run_start's totalTests (original plan); a second
      // run_start means this log was appended to by a resume.
      if (originalTotalTests === 0) {
        originalTotalTests = ev.totalTests || 0;
      } else {
        isResumed = true;
      }
    } else if (ev.eventType === 'test_complete') {
      // Count actual events instead of relying on per-event completedCount.
      completedTests++;
      const sName = ev.scenarioName || ev.scenarioId;
      const pName = ev.profileName || '?';
      if (!grid[sName]) grid[sName] = {};
      grid[sName][pName] = {
        score: ev.overallScore,
        success: ev.success,
        latencyMs: ev.latencyMs,
      };
    } else if (ev.eventType === 'test_error') {
      completedTests++;
      const sName = ev.scenarioName || ev.scenarioId;
      const pName = ev.profileName || '?';
      if (!grid[sName]) grid[sName] = {};
      grid[sName][pName] = {
        score: null,
        success: false,
        error: ev.errorMessage,
      };
    } else if (ev.eventType === 'run_complete') {
      runDone = true;
      durationMs = ev.durationMs;
    }
  }

  // If no run_start was found, infer scenarios and profiles from the grid.
  if (scenarios.length === 0) {
    scenarios = Object.keys(grid);
  }
  if (profiles.length === 0) {
    const profileSet = new Set();
    for (const scenarioData of Object.values(grid)) {
      for (const profile of Object.keys(scenarioData)) {
        profileSet.add(profile);
      }
    }
    profiles = [...profileSet];
  }

  // Fix: isResumed was previously computed but never returned (dead store).
  // Expose it so callers can tell resumed runs apart; additive and
  // backward-compatible for existing destructuring callers.
  return { scenarios, profiles, grid, completedTests, totalTests: originalTotalTests, runDone, durationMs, isResumed };
}
|
|
229
|
+
|
|
230
|
+
/**
 * Render the scenario×profile grid as a plain-text table.
 * Each cell shows the score to one decimal, 'ERR' for errored tests,
 * 'FAIL' for unsuccessful ones, and blank for tests not yet run.
 * @param {object} state - The grid state from buildGridFromEvents.
 * @returns {string} Multi-line table, starting with a progress summary.
 */
function renderGrid({ scenarios, profiles, grid, completedTests, totalTests, runDone, durationMs }) {
  const percent = totalTests > 0 ? Math.round((completedTests / totalTests) * 100) : 0;
  const stateLabel = runDone ? ' DONE' : ' running...';
  const durationLabel = durationMs ? ` ${formatMs(durationMs)}` : '';
  const out = [
    `Progress: ${completedTests}/${totalTests} (${percent}%)${stateLabel}${durationLabel}`,
    '',
  ];

  // Column widths: widest scenario (min 20) / widest profile (min 8).
  const scenarioWidth = Math.max(20, ...scenarios.map(s => s.length));
  const profileWidth = Math.max(8, ...profiles.map(p => p.length));

  // Format a single grid cell, padded to the profile column width.
  const renderCell = (cell) => {
    if (!cell) return ''.padEnd(profileWidth);
    if (cell.error) return 'ERR'.padEnd(profileWidth);
    if (!cell.success) return 'FAIL'.padEnd(profileWidth);
    const scoreText = cell.score != null ? cell.score.toFixed(1) : '--';
    return scoreText.padEnd(profileWidth);
  };

  // Header row plus a full-width separator.
  const header = ''.padEnd(scenarioWidth) + ' | ' + profiles.map(p => p.padEnd(profileWidth)).join(' | ');
  out.push(header);
  out.push('-'.repeat(header.length));

  // One row per scenario, truncated to the scenario column width.
  for (const scenario of scenarios) {
    const cellText = profiles.map(profile => renderCell(grid[scenario]?.[profile])).join(' | ');
    out.push(scenario.substring(0, scenarioWidth).padEnd(scenarioWidth) + ' | ' + cellText);
  }

  return out.join('\n');
}
|
|
264
|
+
|
|
265
|
+
// ── chat command ─────────────────────────────────────────────────
|
|
266
|
+
|
|
267
|
+
// Tool schemas (OpenAI/OpenRouter function-calling format) advertised to the
// chat model in `callOpenRouter`. Each entry maps 1:1 to a case label in
// `executeTool`, which performs the actual work.
const CHAT_TOOLS = [
  // Browse recent evaluation runs.
  {
    type: 'function',
    function: {
      name: 'list_runs',
      description: 'List recent evaluation runs. Returns run IDs, statuses, scores, and descriptions.',
      parameters: {
        type: 'object',
        properties: {
          limit: { type: 'number', description: 'Max runs to return (default 20)' },
          status: { type: 'string', description: 'Filter by status: running, completed, failed' },
        },
      },
    },
  },
  // Full text report for one run.
  {
    type: 'function',
    function: {
      name: 'get_run_report',
      description: 'Generate a full text report for a run including rankings, dimension breakdown, scenario performance, and ANOVA.',
      parameters: {
        type: 'object',
        properties: {
          runId: { type: 'string', description: 'The evaluation run ID' },
        },
        required: ['runId'],
      },
    },
  },
  // Dialogue transcripts, optionally scenario-filtered.
  {
    type: 'function',
    function: {
      name: 'get_transcript',
      description: 'Get dialogue transcripts for a run, optionally filtered to a single scenario.',
      parameters: {
        type: 'object',
        properties: {
          runId: { type: 'string', description: 'The evaluation run ID' },
          scenarioId: { type: 'string', description: 'Optional scenario ID to filter' },
        },
        required: ['runId'],
      },
    },
  },
  // 2x2x2 three-way ANOVA on factorial cells.
  {
    type: 'function',
    function: {
      name: 'run_anova',
      description: 'Run a 2x2x2 three-way ANOVA on factorial cell data for a given run. Requires factor-tagged results.',
      parameters: {
        type: 'object',
        properties: {
          runId: { type: 'string', description: 'The evaluation run ID' },
        },
        required: ['runId'],
      },
    },
  },
  // Start a new evaluation run (long-running).
  {
    type: 'function',
    function: {
      name: 'run_evaluation',
      description: 'Start a new evaluation run. Can specify scenarios, profiles, cluster filters, and replications.',
      parameters: {
        type: 'object',
        properties: {
          scenarios: {
            type: 'array',
            items: { type: 'string' },
            description: 'Scenario IDs to run (omit for all)',
          },
          profiles: {
            type: 'array',
            items: { type: 'string' },
            description: 'Profile names to test (omit for default factorial)',
          },
          cluster: {
            type: 'string',
            description: 'Scenario cluster filter: single-turn, multi-turn, or category names (core, mood, benchmark, recognition, multi_turn). Comma-separated for multiple.',
          },
          runs: { type: 'number', description: 'Replications per cell (default 1)' },
          description: { type: 'string', description: 'Description for this run' },
        },
      },
    },
  },
  // One scenario, one profile smoke test.
  {
    type: 'function',
    function: {
      name: 'quick_test',
      description: 'Run a quick single-scenario test with one profile.',
      parameters: {
        type: 'object',
        properties: {
          scenarioId: { type: 'string', description: 'Scenario ID (default: new_user_first_visit)' },
          profile: { type: 'string', description: 'Profile name (default: budget)' },
        },
      },
    },
  },
  // Housekeeping: stale-run cleanup (dry-run unless force=true).
  {
    type: 'function',
    function: {
      name: 'cleanup_stale',
      description: 'Find and optionally complete stale runs stuck in "running" state.',
      parameters: {
        type: 'object',
        properties: {
          olderThanMinutes: { type: 'number', description: 'Staleness threshold (default 30)' },
          force: { type: 'boolean', description: 'Actually complete them (default false = dry run)' },
        },
      },
    },
  },
  // Discovery: list scenarios and profiles.
  {
    type: 'function',
    function: {
      name: 'list_options',
      description: 'List available scenarios, configurations, and profiles.',
      parameters: { type: 'object', properties: {} },
    },
  },
  // JSON export of a run's full results.
  {
    type: 'function',
    function: {
      name: 'export_results',
      description: 'Export full results for a run as JSON (run metadata, stats, scenario stats, individual results).',
      parameters: {
        type: 'object',
        properties: {
          runId: { type: 'string', description: 'The evaluation run ID' },
        },
        required: ['runId'],
      },
    },
  },
  // Run lifecycle: force-complete an incomplete run.
  {
    type: 'function',
    function: {
      name: 'complete_run',
      description: 'Mark an incomplete run as completed with whatever results exist.',
      parameters: {
        type: 'object',
        properties: {
          runId: { type: 'string', description: 'The evaluation run ID' },
        },
        required: ['runId'],
      },
    },
  },
  // Run lifecycle: move a finished run back to 'running'.
  {
    type: 'function',
    function: {
      name: 'revert_run',
      description: 'Revert a completed/failed run back to "running" status.',
      parameters: {
        type: 'object',
        properties: {
          runId: { type: 'string', description: 'The evaluation run ID' },
        },
        required: ['runId'],
      },
    },
  },
  // Detailed status snapshot for a run.
  {
    type: 'function',
    function: {
      name: 'get_run_status',
      description: 'Get detailed status of a run including per-profile stats and scenario breakdown.',
      parameters: {
        type: 'object',
        properties: {
          runId: { type: 'string', description: 'The evaluation run ID' },
        },
        required: ['runId'],
      },
    },
  },
];
|
|
446
|
+
|
|
447
|
+
/**
 * Clamp a string (or JSON-stringified non-string) to maxLen characters,
 * appending a note with the number of characters omitted.
 * @param {*} str - Value to render; non-strings are pretty-printed as JSON.
 * @param {number} [maxLen=4000] - Maximum length before truncation.
 * @returns {string} The (possibly truncated) text.
 */
function truncate(str, maxLen = 4000) {
  const text = typeof str === 'string' ? str : JSON.stringify(str, null, 2);
  if (text.length <= maxLen) {
    return text;
  }
  const omitted = text.length - maxLen;
  return `${text.slice(0, maxLen)}\n... (truncated, ${omitted} chars omitted)`;
}
|
|
452
|
+
|
|
453
|
+
/**
 * Dispatch a chat tool call by name to the matching service operation.
 * Returns a string (often truncated JSON) to feed back into the model as
 * the tool result; unknown names return an error string instead of throwing.
 * @param {string} name - Tool name (must match a CHAT_TOOLS entry).
 * @param {object} params - Parsed tool-call arguments from the model.
 * @returns {Promise<string>} Tool output text.
 */
async function executeTool(name, params) {
  switch (name) {
    case 'list_runs': {
      const runs = evaluationStore.listRuns({
        limit: params.limit || 20,
        status: params.status || null,
      });
      return JSON.stringify(runs, null, 2);
    }
    case 'get_run_report': {
      const report = evaluationRunner.generateReport(params.runId);
      return truncate(report);
    }
    case 'get_transcript': {
      const results = evaluationStore.getResults(params.runId, {
        scenarioId: params.scenarioId || null,
      });
      if (results.length === 0) return 'No results found for this run.';

      const lines = [];
      for (const r of results) {
        lines.push(`--- ${r.scenarioName || r.scenarioId} | ${r.profileName} | score=${r.overallScore?.toFixed(1) ?? '--'} ---`);
        let printed = false;
        // Prefer the full dialogue log on disk when a dialogueId is recorded.
        if (r.dialogueId) {
          const files = fs.existsSync(LOGS_DIR)
            ? fs.readdirSync(LOGS_DIR).filter(f => f.includes(r.dialogueId))
            : [];
          if (files.length > 0) {
            try {
              const dialogue = JSON.parse(fs.readFileSync(path.join(LOGS_DIR, files[0]), 'utf-8'));
              for (const entry of (dialogue.dialogueTrace || [])) {
                lines.push(`[${(entry.role || 'unknown').toUpperCase()}] ${entry.content || ''}`);
              }
              printed = true;
            } catch (e) { /* fall through */ }
          }
        }
        // Fallback: show the stored suggestions when no log file was readable.
        if (!printed && r.suggestions?.length > 0) {
          lines.push('Suggestions:');
          for (const s of r.suggestions) {
            lines.push(`  • ${typeof s === 'string' ? s : (s.text || s.message || JSON.stringify(s))}`);
          }
        }
        if (r.evaluationReasoning) lines.push(`Judge: ${r.evaluationReasoning}`);
        lines.push('');
      }
      return truncate(lines.join('\n'));
    }
    case 'run_anova': {
      // Run the ANOVA once per score column; skip columns with no data.
      const scoreTypes = [
        { column: 'overall_score', label: 'Overall Score' },
        { column: 'base_score', label: 'Base Score' },
        { column: 'recognition_score', label: 'Recognition Score' },
      ];
      const parts = [];
      for (const { column, label } of scoreTypes) {
        const cellData = evaluationStore.getFactorialCellData(params.runId, { scoreColumn: column });
        const totalSamples = Object.values(cellData).reduce((s, arr) => s + arr.length, 0);
        if (totalSamples === 0) continue;
        // A 2x2x2 design needs more than one sample per cell to estimate error.
        if (totalSamples <= 8) {
          parts.push(`${label}: Only ${totalSamples} samples — need > 8 for ANOVA.`);
          continue;
        }
        const result = anovaStats.runThreeWayANOVA(cellData);
        parts.push(anovaStats.formatANOVAReport(result, { scoreLabel: label }));
      }
      return parts.length > 0 ? parts.join('\n') : 'No factorial cell data found for this run.';
    }
    case 'run_evaluation': {
      const scenarios = params.scenarios?.length > 0 ? params.scenarios : 'all';
      // Explicit profiles override the default factorial design.
      let configurations = 'factorial';
      if (params.profiles?.length > 0) {
        configurations = params.profiles.map(name => ({
          provider: null, model: null, profileName: name, label: name,
        }));
      }
      const result = await evaluationRunner.runEvaluation({
        scenarios,
        configurations,
        runsPerConfig: params.runs || 1,
        description: params.description || 'Chat-initiated evaluation',
        scenarioFilter: params.cluster || null,
      });
      return JSON.stringify(result, null, 2);
    }
    case 'quick_test': {
      const config = { profileName: params.profile || 'budget' };
      const result = await evaluationRunner.quickTest(config, {
        scenarioId: params.scenarioId || 'new_user_first_visit',
      });
      return truncate(JSON.stringify(result, null, 2));
    }
    case 'cleanup_stale': {
      // Dry run unless the model explicitly passes force=true.
      const result = evaluationStore.autoCompleteStaleRuns({
        olderThanMinutes: params.olderThanMinutes || 30,
        dryRun: !params.force,
      });
      return JSON.stringify(result, null, 2);
    }
    case 'list_options': {
      const opts = evaluationRunner.listOptions();
      return truncate(JSON.stringify({
        scenarios: opts.scenarios.map(s => ({ id: s.id, name: s.name, isMultiTurn: s.isMultiTurn })),
        profiles: opts.profiles?.map(p => ({ name: p.name, description: p.description })),
      }, null, 2));
    }
    case 'export_results': {
      const data = evaluationStore.exportToJson(params.runId);
      return truncate(JSON.stringify(data, null, 2));
    }
    case 'complete_run': {
      const result = evaluationStore.completeRun(params.runId);
      return JSON.stringify(result, null, 2);
    }
    case 'revert_run': {
      const run = evaluationStore.getRun(params.runId);
      if (!run) return `Run not found: ${params.runId}`;
      if (run.status === 'running') return `Run ${params.runId} is already running.`;
      evaluationStore.updateRun(params.runId, { status: 'running' });
      return `Reverted run ${params.runId} from '${run.status}' to 'running'.`;
    }
    case 'get_run_status': {
      const runData = evaluationRunner.getRunResults(params.runId);
      return truncate(JSON.stringify({
        run: runData.run,
        stats: runData.stats,
        resultCount: runData.results.length,
      }, null, 2));
    }
    default:
      return `Unknown tool: ${name}`;
  }
}
|
|
586
|
+
|
|
587
|
+
/**
 * Send a chat-completion request to the OpenRouter API.
 *
 * @param {Array<object>} messages - Conversation history in OpenAI chat format
 *   (system/user/assistant/tool messages).
 * @param {string} model - Model identifier as understood by OpenRouter.
 * @param {string} apiKey - OpenRouter API key, sent as a Bearer token.
 * @returns {Promise<object>} The parsed JSON response body.
 * @throws {Error} When the HTTP response is not OK; includes status and up to
 *   300 characters of the response body for diagnosis.
 */
async function callOpenRouter(messages, model, apiKey) {
  // Request payload: attach the tool definitions so the model can request
  // local tool invocations.
  const payload = {
    model,
    messages,
    tools: CHAT_TOOLS,
    temperature: 0.3,
    max_tokens: 4096,
  };

  const res = await fetch('https://openrouter.ai/api/v1/chat/completions', {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'Authorization': `Bearer ${apiKey}`,
    },
    body: JSON.stringify(payload),
  });

  if (!res.ok) {
    // Best-effort capture of the error body; ignore secondary read failures.
    const body = await res.text().catch(() => '');
    throw new Error(`OpenRouter API error: ${res.status} — ${body.slice(0, 300)}`);
  }

  return res.json();
}
|
|
610
|
+
|
|
611
|
+
/**
 * Interactive chat REPL: the user converses with an LLM (via OpenRouter) that
 * can invoke local tools to inspect and manage evaluation runs.
 *
 * Flow per user line: append the message, call the model, execute any tool
 * calls it requests (feeding results back as `tool` messages), and loop until
 * the model replies with plain text, which is printed.
 *
 * Side effects: reads stdin, writes stdout/stderr, performs network calls.
 * Exits the process with code 1 if OPENROUTER_API_KEY is unset, and with
 * code 0 on "quit"/"exit" or when stdin closes.
 */
async function runChat() {
  const judge = getAvailableJudge();
  const apiKey = process.env.OPENROUTER_API_KEY;
  if (!apiKey) {
    console.error('OPENROUTER_API_KEY not set. Required for chat mode.');
    process.exit(1);
  }
  // OpenRouter expects "provider/model" unless the judge already targets an
  // OpenRouter-native model id. (Removed an unused duplicate computation of
  // this value that inverted the same condition.)
  const chatModel = judge.provider === 'openrouter' ? judge.model : `${judge.provider}/${judge.model}`;

  console.log(`\nEval Chat (model: ${chatModel})`);
  console.log('Type your questions about evaluation runs. Use "quit" or "exit" to leave.\n');

  const rl = readline.createInterface({
    input: process.stdin,
    output: process.stdout,
    prompt: 'eval> ',
  });

  // Conversation history, seeded with the system prompt describing the tools
  // and the evaluation domain.
  const messages = [
    {
      role: 'system',
      content: `You are an AI assistant for a tutor evaluation system. You help users inspect evaluation runs, view reports, run ANOVA analyses, start new evaluations, and manage run lifecycle.

You have access to tools that query a SQLite database of evaluation runs and results. Each run tests tutor AI configurations against pedagogical scenarios and scores them with an AI judge.

Key concepts:
- Runs contain multiple test results (scenario × profile combinations)
- The 2×2×2 factorial design tests: Recognition prompts (A), Multi-agent tutor (B), Multi-agent learner (C)
- ANOVA analyses test significance of these factors
- Profiles define tutor configurations (model, architecture, etc.)
- Scenarios define learner situations to evaluate

When showing data, be concise. Summarise key findings rather than dumping raw JSON. Use tables where helpful.
When the user asks to see "recent runs" or "latest", use list_runs.
When asked about a specific run, use get_run_report or get_run_status.
For statistical analysis, use run_anova.
To see available test scenarios and profiles, use list_options.`,
    },
  ];

  const prompt = () => rl.prompt();

  rl.on('close', () => {
    console.log('\nBye.');
    process.exit(0);
  });

  prompt();

  for await (const line of rl) {
    const input = line.trim();
    if (!input) { prompt(); continue; }
    if (input === 'quit' || input === 'exit') {
      console.log('Bye.');
      process.exit(0);
    }

    messages.push({ role: 'user', content: input });

    try {
      let done = false;
      while (!done) {
        const response = await callOpenRouter(messages, chatModel, apiKey);
        const choice = response.choices?.[0];
        if (!choice) {
          console.log('[No response from model]');
          break; // exits the tool-call loop; `done = true` here was redundant
        }

        const msg = choice.message;
        messages.push(msg);

        // Tool-call turn: execute each requested tool and feed results back,
        // then loop so the model can summarise them.
        if (msg.tool_calls?.length > 0) {
          for (const tc of msg.tool_calls) {
            const fnName = tc.function.name;
            let fnArgs = {};
            // Best-effort parse; malformed arguments fall back to {}.
            try { fnArgs = JSON.parse(tc.function.arguments || '{}'); } catch (e) { /* empty */ }

            process.stdout.write(`  [calling ${fnName}...]\n`);
            let result;
            try {
              result = await executeTool(fnName, fnArgs);
            } catch (err) {
              result = `Error: ${err.message}`;
            }

            messages.push({
              role: 'tool',
              tool_call_id: tc.id,
              content: typeof result === 'string' ? result : JSON.stringify(result),
            });
          }
          // Loop back to get the model's summary of tool results
        } else {
          // Text response — print it and end this turn.
          const text = msg.content || '';
          console.log(`\n${text}\n`);
          done = true;
        }
      }
    } catch (err) {
      console.error(`\nError: ${err.message}\n`);
    }

    prompt();
  }
}
|
|
721
|
+
|
|
722
|
+
async function main() {
|
|
723
|
+
try {
|
|
724
|
+
switch (command) {
|
|
725
|
+
case 'list': {
|
|
726
|
+
const options = evaluationRunner.listOptions();
|
|
727
|
+
|
|
728
|
+
// Factorial design — the default run mode
|
|
729
|
+
if (options.profiles?.length) {
|
|
730
|
+
const cellProfiles = options.profiles.filter(p => p.name.startsWith('cell_'));
|
|
731
|
+
const regularProfiles = options.profiles.filter(p => !p.name.startsWith('cell_'));
|
|
732
|
+
|
|
733
|
+
if (cellProfiles.length > 0) {
|
|
734
|
+
console.log('\n2x2x2 Factorial Cells (default `run` configuration):');
|
|
735
|
+
console.log(' A: Recognition B: Tutor arch. C: Learner arch.\n');
|
|
736
|
+
for (const p of cellProfiles) {
|
|
737
|
+
const arch = p.dialogueEnabled ? 'ego+superego' : 'single-agent';
|
|
738
|
+
console.log(` ${p.name.padEnd(32)} ${arch.padEnd(14)} ${p.description || ''}`);
|
|
739
|
+
}
|
|
740
|
+
}
|
|
741
|
+
|
|
742
|
+
if (regularProfiles.length > 0) {
|
|
743
|
+
console.log('\nOther Profiles (use --profile <name> or --all-profiles):');
|
|
744
|
+
for (const p of regularProfiles) {
|
|
745
|
+
const ego = p.egoProvider && p.egoModel ? ` [${p.egoProvider}/${p.egoModel}]` : '';
|
|
746
|
+
const dialogue = p.dialogueEnabled ? ` (dialogue: ${p.maxRounds}r)` : ' (single)';
|
|
747
|
+
console.log(` ${p.name}${ego}${dialogue} - ${p.description || ''}`);
|
|
748
|
+
}
|
|
749
|
+
}
|
|
750
|
+
}
|
|
751
|
+
|
|
752
|
+
console.log('\nScenarios:');
|
|
753
|
+
for (const s of options.scenarios) {
|
|
754
|
+
const mt = s.isMultiTurn ? ` [${s.turnCount}T]` : '';
|
|
755
|
+
console.log(` ${s.id}${mt} - ${s.name || s.id}`);
|
|
756
|
+
}
|
|
757
|
+
|
|
758
|
+
console.log('\nProvider Configurations:');
|
|
759
|
+
for (const c of options.configurations) {
|
|
760
|
+
console.log(` ${c.provider}/${c.model}`);
|
|
761
|
+
}
|
|
762
|
+
break;
|
|
763
|
+
}
|
|
764
|
+
|
|
765
|
+
case 'quick':
|
|
766
|
+
case 'test': {
|
|
767
|
+
const scenarioId = getOption('scenario', 'new_user_first_visit');
|
|
768
|
+
const profile = getOption('profile', 'budget');
|
|
769
|
+
const verbose = getFlag('verbose');
|
|
770
|
+
const evalSettingsQt = evalConfigLoader.getEvalSettings();
|
|
771
|
+
const skipRubricEval = getFlag('skip-rubric') || !evalSettingsQt.useAIJudge;
|
|
772
|
+
const config = { profileName: profile };
|
|
773
|
+
|
|
774
|
+
console.log(`\nRunning quick test (profile: ${profile}, scenario: ${scenarioId})...\n`);
|
|
775
|
+
const result = await evaluationRunner.quickTest(config, {
|
|
776
|
+
scenarioId,
|
|
777
|
+
verbose,
|
|
778
|
+
skipRubricEval,
|
|
779
|
+
});
|
|
780
|
+
console.log('\nResult:');
|
|
781
|
+
console.log(JSON.stringify(result, null, 2));
|
|
782
|
+
break;
|
|
783
|
+
}
|
|
784
|
+
|
|
785
|
+
case 'run': {
|
|
786
|
+
const verbose = getFlag('verbose');
|
|
787
|
+
// CLI --use-rubric forces rubric on; --skip-rubric forces off; otherwise use config default
|
|
788
|
+
const evalSettings = evalConfigLoader.getEvalSettings();
|
|
789
|
+
const skipRubricEval = getFlag('use-rubric') ? false : (getFlag('skip-rubric') || !evalSettings.useAIJudge);
|
|
790
|
+
const runsPerConfig = parseInt(getOption('runs', '1'), 10);
|
|
791
|
+
const parallelism = parseInt(getOption('parallelism', '2'), 10);
|
|
792
|
+
const description = getOption('description');
|
|
793
|
+
const clusterOpt = getOption('cluster');
|
|
794
|
+
const scenarioOpt = getOption('scenario') || getOption('scenarios');
|
|
795
|
+
const allProfiles = getFlag('all-profiles');
|
|
796
|
+
const modelOverride = getOption('model');
|
|
797
|
+
const egoModelOverride = getOption('ego-model');
|
|
798
|
+
const superegoModelOverride = getOption('superego-model');
|
|
799
|
+
|
|
800
|
+
// --cluster and --scenario are mutually exclusive
|
|
801
|
+
if (clusterOpt && scenarioOpt) {
|
|
802
|
+
console.error('Error: --cluster and --scenario are mutually exclusive.');
|
|
803
|
+
process.exit(1);
|
|
804
|
+
}
|
|
805
|
+
|
|
806
|
+
const scenarios = scenarioOpt
|
|
807
|
+
? scenarioOpt.split(',').map(s => s.trim())
|
|
808
|
+
: 'all';
|
|
809
|
+
|
|
810
|
+
// Determine configurations: explicit --profile overrides everything,
|
|
811
|
+
// --all-profiles loads every profile, default is the 8 factorial cells.
|
|
812
|
+
const profileOpt = getOption('config') || getOption('profile') || getOption('profiles');
|
|
813
|
+
let configurations;
|
|
814
|
+
let isFactorial = false;
|
|
815
|
+
|
|
816
|
+
if (profileOpt) {
|
|
817
|
+
// Explicit profile selection (single or comma-separated)
|
|
818
|
+
const profileNames = profileOpt.includes(',')
|
|
819
|
+
? profileOpt.split(',').map(s => s.trim())
|
|
820
|
+
: [profileOpt];
|
|
821
|
+
configurations = profileNames.map(name => ({
|
|
822
|
+
provider: null,
|
|
823
|
+
model: null,
|
|
824
|
+
profileName: name,
|
|
825
|
+
label: name,
|
|
826
|
+
}));
|
|
827
|
+
// Check if the selection happens to be factorial cells
|
|
828
|
+
isFactorial = profileNames.every(n => n.startsWith('cell_'));
|
|
829
|
+
} else if (allProfiles) {
|
|
830
|
+
configurations = 'profiles';
|
|
831
|
+
} else {
|
|
832
|
+
// Default: 2×2×2 factorial design
|
|
833
|
+
isFactorial = true;
|
|
834
|
+
configurations = 'factorial';
|
|
835
|
+
}
|
|
836
|
+
|
|
837
|
+
if (isFactorial) {
|
|
838
|
+
const cellCount = 8;
|
|
839
|
+
console.log('\n2x2x2 Factorial Design');
|
|
840
|
+
console.log(` Factor A: Recognition (off / on)`);
|
|
841
|
+
console.log(` Factor B: Tutor arch. (single / ego+superego)`);
|
|
842
|
+
console.log(` Factor C: Learner arch. (unified / ego_superego)`);
|
|
843
|
+
console.log(` Cells: ${cellCount} | Runs/cell: ${runsPerConfig} | Per scenario: ${cellCount * runsPerConfig}`);
|
|
844
|
+
if (modelOverride) {
|
|
845
|
+
console.log(` Model override: ${modelOverride}`);
|
|
846
|
+
} else if (egoModelOverride || superegoModelOverride) {
|
|
847
|
+
if (egoModelOverride) console.log(` Ego model override: ${egoModelOverride}`);
|
|
848
|
+
if (superegoModelOverride) console.log(` Superego model override: ${superegoModelOverride}`);
|
|
849
|
+
}
|
|
850
|
+
console.log('');
|
|
851
|
+
}
|
|
852
|
+
|
|
853
|
+
if (clusterOpt) {
|
|
854
|
+
console.log(`Cluster filter: ${clusterOpt}\n`);
|
|
855
|
+
}
|
|
856
|
+
console.log('Starting evaluation run...\n');
|
|
857
|
+
const result = await evaluationRunner.runEvaluation({
|
|
858
|
+
scenarios,
|
|
859
|
+
configurations,
|
|
860
|
+
runsPerConfig,
|
|
861
|
+
parallelism,
|
|
862
|
+
skipRubricEval,
|
|
863
|
+
description: description || (isFactorial ? '2x2x2 Factorial Evaluation' : null),
|
|
864
|
+
verbose,
|
|
865
|
+
scenarioFilter: clusterOpt || null,
|
|
866
|
+
modelOverride: modelOverride || null,
|
|
867
|
+
egoModelOverride: egoModelOverride || null,
|
|
868
|
+
superegoModelOverride: superegoModelOverride || null,
|
|
869
|
+
});
|
|
870
|
+
// Extract unique model aliases used across all configs (ego + superego)
|
|
871
|
+
const extractAlias = (raw) => {
|
|
872
|
+
if (!raw) return null;
|
|
873
|
+
const dotIdx = raw.indexOf('.');
|
|
874
|
+
return dotIdx !== -1 ? raw.slice(dotIdx + 1) : raw;
|
|
875
|
+
};
|
|
876
|
+
const modelAliases = [...new Set(
|
|
877
|
+
(result.stats || []).flatMap(s => [
|
|
878
|
+
extractAlias(s.egoModel || s.model),
|
|
879
|
+
extractAlias(s.superegoModel),
|
|
880
|
+
]).filter(Boolean)
|
|
881
|
+
)];
|
|
882
|
+
|
|
883
|
+
console.log('\nEvaluation complete.');
|
|
884
|
+
if (modelAliases.length > 0) {
|
|
885
|
+
console.log(`Models: ${modelAliases.join(', ')}`);
|
|
886
|
+
}
|
|
887
|
+
console.log(JSON.stringify(result, null, 2));
|
|
888
|
+
|
|
889
|
+
// Factorial post-analysis: print cell means and ANOVA for each score type
|
|
890
|
+
if (result.runId) {
|
|
891
|
+
const scoreTypes = [
|
|
892
|
+
{ column: 'overall_score', label: 'Overall Score' },
|
|
893
|
+
{ column: 'base_score', label: 'Base Score' },
|
|
894
|
+
{ column: 'recognition_score', label: 'Recognition Score' },
|
|
895
|
+
];
|
|
896
|
+
|
|
897
|
+
for (const { column, label } of scoreTypes) {
|
|
898
|
+
const cellData = evaluationStore.getFactorialCellData(result.runId, { scoreColumn: column });
|
|
899
|
+
const cellKeys = Object.keys(cellData);
|
|
900
|
+
const totalSamples = cellKeys.reduce((sum, k) => sum + cellData[k].length, 0);
|
|
901
|
+
|
|
902
|
+
if (totalSamples === 0) continue;
|
|
903
|
+
|
|
904
|
+
console.log('\n' + '='.repeat(70));
|
|
905
|
+
console.log(` FACTORIAL ANALYSIS: ${label.toUpperCase()}`);
|
|
906
|
+
console.log('='.repeat(70));
|
|
907
|
+
|
|
908
|
+
for (const key of cellKeys.sort()) {
|
|
909
|
+
const scores = cellData[key];
|
|
910
|
+
const mean = scores.reduce((a, b) => a + b, 0) / scores.length;
|
|
911
|
+
const sd = scores.length > 1
|
|
912
|
+
? Math.sqrt(scores.reduce((acc, s) => acc + (s - mean) ** 2, 0) / (scores.length - 1))
|
|
913
|
+
: 0;
|
|
914
|
+
const cellLabel = key.replace(/r(\d)_t(\d)_l(\d)/, (_, r, t, l) =>
|
|
915
|
+
`Recog=${r === '1' ? 'Y' : 'N'} Tutor=${t === '1' ? 'Multi' : 'Single'} Learner=${l === '1' ? 'Psycho' : 'Unified'}`
|
|
916
|
+
);
|
|
917
|
+
console.log(` ${cellLabel.padEnd(52)} mean=${mean.toFixed(1)} sd=${sd.toFixed(1)} n=${scores.length}`);
|
|
918
|
+
}
|
|
919
|
+
|
|
920
|
+
if (totalSamples > 8) {
|
|
921
|
+
const anovaResult = anovaStats.runThreeWayANOVA(cellData);
|
|
922
|
+
console.log(anovaStats.formatANOVAReport(anovaResult, { scoreLabel: label }));
|
|
923
|
+
} else {
|
|
924
|
+
console.log(`\n Need > 8 total samples for ANOVA (have ${totalSamples}). Increase --runs.`);
|
|
925
|
+
}
|
|
926
|
+
}
|
|
927
|
+
}
|
|
928
|
+
break;
|
|
929
|
+
}
|
|
930
|
+
|
|
931
|
+
case 'runs': {
|
|
932
|
+
const limitOpt = getOption('limit');
|
|
933
|
+
const limit = limitOpt ? parseInt(limitOpt, 10) : null;
|
|
934
|
+
const statusFilter = getOption('status') || null;
|
|
935
|
+
const runs = evaluationStore.listRuns({ limit, status: statusFilter });
|
|
936
|
+
|
|
937
|
+
if (runs.length === 0) {
|
|
938
|
+
console.log('\nNo evaluation runs found.');
|
|
939
|
+
break;
|
|
940
|
+
}
|
|
941
|
+
|
|
942
|
+
console.log(`\nEvaluation runs (${runs.length} total):\n`);
|
|
943
|
+
console.log(
|
|
944
|
+
' ' +
|
|
945
|
+
'ID'.padEnd(40) +
|
|
946
|
+
'Status'.padEnd(12) +
|
|
947
|
+
'Progress'.padEnd(18) +
|
|
948
|
+
'Avg'.padEnd(7) +
|
|
949
|
+
'Duration'.padEnd(10) +
|
|
950
|
+
'Created'.padEnd(24) +
|
|
951
|
+
'Description'
|
|
952
|
+
);
|
|
953
|
+
console.log(' ' + '-'.repeat(130));
|
|
954
|
+
|
|
955
|
+
for (const run of runs) {
|
|
956
|
+
const created = run.createdAt
|
|
957
|
+
? new Date(run.createdAt).toLocaleString()
|
|
958
|
+
: '--';
|
|
959
|
+
// Progress: show completed/total (pct%)
|
|
960
|
+
let progress = '--';
|
|
961
|
+
if (run.totalTests > 0) {
|
|
962
|
+
const pct = run.progressPct != null ? run.progressPct : 100;
|
|
963
|
+
progress = `${run.completedResults}/${run.totalTests} (${pct}%)`;
|
|
964
|
+
} else if (run.completedResults > 0) {
|
|
965
|
+
progress = `${run.completedResults} done`;
|
|
966
|
+
}
|
|
967
|
+
const avg = run.avgScore != null ? run.avgScore.toFixed(1) : '--';
|
|
968
|
+
// Duration formatting
|
|
969
|
+
let duration = '--';
|
|
970
|
+
if (run.durationMs != null) {
|
|
971
|
+
const totalSec = Math.round(run.durationMs / 1000);
|
|
972
|
+
const m = Math.floor(totalSec / 60);
|
|
973
|
+
const s = totalSec % 60;
|
|
974
|
+
duration = m > 0 ? `${m}m ${s}s` : `${s}s`;
|
|
975
|
+
}
|
|
976
|
+
const desc = run.description || '';
|
|
977
|
+
const models = (run.models && run.models.length > 0) ? run.models.join(', ') : '--';
|
|
978
|
+
console.log(
|
|
979
|
+
' ' +
|
|
980
|
+
run.id.padEnd(40) +
|
|
981
|
+
(run.status || '--').padEnd(12) +
|
|
982
|
+
progress.padEnd(18) +
|
|
983
|
+
avg.padEnd(7) +
|
|
984
|
+
duration.padEnd(10) +
|
|
985
|
+
created.padEnd(24) +
|
|
986
|
+
desc
|
|
987
|
+
);
|
|
988
|
+
if (models !== '--') {
|
|
989
|
+
console.log(' ' + ` Models: ${models}`);
|
|
990
|
+
}
|
|
991
|
+
}
|
|
992
|
+
console.log('');
|
|
993
|
+
break;
|
|
994
|
+
}
|
|
995
|
+
|
|
996
|
+
case 'report': {
|
|
997
|
+
const runId = args.find(a => !a.startsWith('--') && a !== 'report');
|
|
998
|
+
if (!runId) {
|
|
999
|
+
console.error('Usage: eval-cli.js report <runId>');
|
|
1000
|
+
process.exit(1);
|
|
1001
|
+
}
|
|
1002
|
+
const report = evaluationRunner.generateReport(runId);
|
|
1003
|
+
console.log(report);
|
|
1004
|
+
break;
|
|
1005
|
+
}
|
|
1006
|
+
|
|
1007
|
+
case 'status': {
|
|
1008
|
+
// Quick snapshot of a run's current state
|
|
1009
|
+
const runId = args.find(a => !a.startsWith('--') && a !== 'status');
|
|
1010
|
+
if (!runId) {
|
|
1011
|
+
console.error('Usage: eval-cli.js status <runId>');
|
|
1012
|
+
process.exit(1);
|
|
1013
|
+
}
|
|
1014
|
+
|
|
1015
|
+
// Try JSONL first for in-progress runs
|
|
1016
|
+
const events = readProgressLog(runId);
|
|
1017
|
+
if (events.length > 0) {
|
|
1018
|
+
let { scenarios, profiles, grid, completedTests, totalTests, runDone, durationMs } = buildGridFromEvents(events);
|
|
1019
|
+
|
|
1020
|
+
// Check if process is still alive (for running runs)
|
|
1021
|
+
let statusLabel = runDone ? 'completed' : 'running';
|
|
1022
|
+
const runData = evaluationRunner.getRunResults(runId);
|
|
1023
|
+
const pid = runData?.run?.metadata?.pid;
|
|
1024
|
+
|
|
1025
|
+
// If JSONL has no run_start (totalTests=0), fall back to DB for the total
|
|
1026
|
+
if (totalTests === 0 && runData?.run) {
|
|
1027
|
+
totalTests = (runData.run.totalScenarios || scenarios.length) * (runData.run.totalConfigurations || profiles.length);
|
|
1028
|
+
}
|
|
1029
|
+
|
|
1030
|
+
// For resumed runs, completed can exceed total - cap display at total
|
|
1031
|
+
const displayCompleted = Math.min(completedTests, totalTests);
|
|
1032
|
+
const pct = totalTests > 0 ? Math.min(100, Math.round((displayCompleted / totalTests) * 100)) : 0;
|
|
1033
|
+
if (!runDone && pid) {
|
|
1034
|
+
const alive = isPidAlive(pid);
|
|
1035
|
+
if (!alive) {
|
|
1036
|
+
statusLabel = `STALE (pid ${pid} dead)`;
|
|
1037
|
+
} else {
|
|
1038
|
+
statusLabel = `running (pid ${pid})`;
|
|
1039
|
+
}
|
|
1040
|
+
}
|
|
1041
|
+
|
|
1042
|
+
console.log(`\nRun: ${runId}`);
|
|
1043
|
+
console.log(`Status: ${statusLabel}`);
|
|
1044
|
+
console.log(`Progress: ${displayCompleted}/${totalTests} tests (${pct}%)${completedTests > totalTests ? ` [${completedTests - totalTests} retried]` : ''}`);
|
|
1045
|
+
if (durationMs) console.log(`Duration: ${formatMs(durationMs)}`);
|
|
1046
|
+
console.log(`Scenarios: ${scenarios.length} | Profiles: ${profiles.length}`);
|
|
1047
|
+
|
|
1048
|
+
// Per-scenario completion counts
|
|
1049
|
+
if (scenarios.length > 0) {
|
|
1050
|
+
console.log('\nScenario completion:');
|
|
1051
|
+
for (const s of scenarios) {
|
|
1052
|
+
const done = profiles.filter(p => grid[s]?.[p]).length;
|
|
1053
|
+
const scores = profiles
|
|
1054
|
+
.filter(p => grid[s]?.[p]?.score != null)
|
|
1055
|
+
.map(p => grid[s][p].score);
|
|
1056
|
+
const avg = scores.length > 0
|
|
1057
|
+
? (scores.reduce((a, b) => a + b, 0) / scores.length).toFixed(1)
|
|
1058
|
+
: '--';
|
|
1059
|
+
console.log(` ${s}: ${done}/${profiles.length} profiles done, avg=${avg}`);
|
|
1060
|
+
}
|
|
1061
|
+
}
|
|
1062
|
+
|
|
1063
|
+
// Top performers
|
|
1064
|
+
const profileScores = {};
|
|
1065
|
+
for (const s of scenarios) {
|
|
1066
|
+
for (const p of profiles) {
|
|
1067
|
+
const cell = grid[s]?.[p];
|
|
1068
|
+
if (cell?.score != null) {
|
|
1069
|
+
if (!profileScores[p]) profileScores[p] = [];
|
|
1070
|
+
profileScores[p].push(cell.score);
|
|
1071
|
+
}
|
|
1072
|
+
}
|
|
1073
|
+
}
|
|
1074
|
+
const ranked = Object.entries(profileScores)
|
|
1075
|
+
.map(([name, scores]) => ({
|
|
1076
|
+
name,
|
|
1077
|
+
avg: scores.reduce((a, b) => a + b, 0) / scores.length,
|
|
1078
|
+
count: scores.length,
|
|
1079
|
+
}))
|
|
1080
|
+
.sort((a, b) => b.avg - a.avg);
|
|
1081
|
+
if (ranked.length > 0) {
|
|
1082
|
+
console.log('\nTop performers:');
|
|
1083
|
+
for (const r of ranked.slice(0, 5)) {
|
|
1084
|
+
console.log(` ${r.name}: avg=${r.avg.toFixed(1)} (${r.count} tests)`);
|
|
1085
|
+
}
|
|
1086
|
+
}
|
|
1087
|
+
} else {
|
|
1088
|
+
// Fallback: read from SQLite
|
|
1089
|
+
const runData = evaluationRunner.getRunResults(runId);
|
|
1090
|
+
console.log(`\nRun: ${runId}`);
|
|
1091
|
+
console.log(`Status: ${runData.run.status}`);
|
|
1092
|
+
const createdLocal = runData.run.createdAt
|
|
1093
|
+
? new Date(runData.run.createdAt).toLocaleString()
|
|
1094
|
+
: '--';
|
|
1095
|
+
console.log(`Created: ${createdLocal}`);
|
|
1096
|
+
console.log(`Description: ${runData.run.description || 'N/A'}`);
|
|
1097
|
+
// Count unique (scenario, profile) pairs to handle rejudge duplicates
|
|
1098
|
+
const uniqueTests = new Set(runData.results.map(r => `${r.scenarioId}:${r.profileName}`)).size;
|
|
1099
|
+
console.log(`Tests: ${runData.run.totalTests || uniqueTests}`);
|
|
1100
|
+
|
|
1101
|
+
if (runData.stats.length > 0) {
|
|
1102
|
+
console.log('\nTop performers:');
|
|
1103
|
+
for (const stat of runData.stats.slice(0, 10)) {
|
|
1104
|
+
const label = stat.profileName || `${stat.provider}/${stat.model}`;
|
|
1105
|
+
const base = stat.avgBaseScore != null ? ` base=${stat.avgBaseScore.toFixed(1)}` : '';
|
|
1106
|
+
const recog = stat.avgRecognitionScore != null ? ` recog=${stat.avgRecognitionScore.toFixed(1)}` : '';
|
|
1107
|
+
console.log(` ${label}: avg=${stat.avgScore?.toFixed(1) || '--'}${base}${recog} (${stat.totalTests} tests)`);
|
|
1108
|
+
}
|
|
1109
|
+
}
|
|
1110
|
+
}
|
|
1111
|
+
console.log('');
|
|
1112
|
+
break;
|
|
1113
|
+
}
|
|
1114
|
+
|
|
1115
|
+
case 'watch': {
|
|
1116
|
+
// Live-updating scenario×profile grid table
|
|
1117
|
+
const runId = args.find(a => !a.startsWith('--') && a !== 'watch');
|
|
1118
|
+
if (!runId) {
|
|
1119
|
+
console.error('Usage: eval-cli.js watch <runId> [--refresh 2000] [--db]');
|
|
1120
|
+
process.exit(1);
|
|
1121
|
+
}
|
|
1122
|
+
|
|
1123
|
+
const refreshMs = parseInt(getOption('refresh', '2000'), 10);
|
|
1124
|
+
const useDb = getFlag('db');
|
|
1125
|
+
|
|
1126
|
+
console.log(`Watching run: ${runId} (refresh every ${refreshMs}ms, source: ${useDb ? 'SQLite' : 'JSONL'})`);
|
|
1127
|
+
console.log('Press Ctrl+C to stop.\n');
|
|
1128
|
+
|
|
1129
|
+
const renderFromJsonl = () => {
|
|
1130
|
+
const events = readProgressLog(runId);
|
|
1131
|
+
if (events.length === 0) {
|
|
1132
|
+
return { output: 'Waiting for progress data...', done: false };
|
|
1133
|
+
}
|
|
1134
|
+
const data = buildGridFromEvents(events);
|
|
1135
|
+
// If JSONL has no run_start (totalTests=0), fall back to DB for the total
|
|
1136
|
+
if (data.totalTests === 0) {
|
|
1137
|
+
try {
|
|
1138
|
+
const runData = evaluationRunner.getRunResults(runId);
|
|
1139
|
+
const run = runData.run;
|
|
1140
|
+
data.totalTests = (run.totalScenarios || 1) * (run.totalConfigurations || 1);
|
|
1141
|
+
} catch {
|
|
1142
|
+
// If DB lookup fails, infer from grid
|
|
1143
|
+
data.totalTests = data.scenarios.length * data.profiles.length || data.completedTests;
|
|
1144
|
+
}
|
|
1145
|
+
}
|
|
1146
|
+
return { output: renderGrid(data), done: data.runDone };
|
|
1147
|
+
};
|
|
1148
|
+
|
|
1149
|
+
const renderFromDb = () => {
|
|
1150
|
+
try {
|
|
1151
|
+
const runData = evaluationRunner.getRunResults(runId);
|
|
1152
|
+
const results = runData.results || [];
|
|
1153
|
+
// Build grid from DB results
|
|
1154
|
+
const scenarios = [...new Set(results.map(r => r.scenarioName || r.scenarioId))];
|
|
1155
|
+
const profiles = [...new Set(results.map(r => r.profileName || `${r.provider}/${r.model}`))];
|
|
1156
|
+
const grid = {};
|
|
1157
|
+
for (const r of results) {
|
|
1158
|
+
const sName = r.scenarioName || r.scenarioId;
|
|
1159
|
+
const pName = r.profileName || `${r.provider}/${r.model}`;
|
|
1160
|
+
if (!grid[sName]) grid[sName] = {};
|
|
1161
|
+
grid[sName][pName] = {
|
|
1162
|
+
score: r.overallScore,
|
|
1163
|
+
success: r.success,
|
|
1164
|
+
latencyMs: r.latencyMs,
|
|
1165
|
+
};
|
|
1166
|
+
}
|
|
1167
|
+
const totalTests = (runData.run.totalScenarios || scenarios.length) * (runData.run.totalConfigurations || profiles.length);
|
|
1168
|
+
const done = runData.run.status === 'completed';
|
|
1169
|
+
// Count unique (scenario, profile) pairs instead of total rows (handles rejudge duplicates)
|
|
1170
|
+
const uniqueCompleted = new Set(results.map(r => `${r.scenarioId}:${r.profileName}`)).size;
|
|
1171
|
+
return {
|
|
1172
|
+
output: renderGrid({ scenarios, profiles, grid, completedTests: uniqueCompleted, totalTests, runDone: done, durationMs: null }),
|
|
1173
|
+
done,
|
|
1174
|
+
};
|
|
1175
|
+
} catch (e) {
|
|
1176
|
+
return { output: `Error reading DB: ${e.message}`, done: false };
|
|
1177
|
+
}
|
|
1178
|
+
};
|
|
1179
|
+
|
|
1180
|
+
const render = useDb ? renderFromDb : renderFromJsonl;
|
|
1181
|
+
|
|
1182
|
+
// Initial check — if JSONL doesn't exist yet, wait for it
|
|
1183
|
+
if (!useDb) {
|
|
1184
|
+
const logPath = getProgressLogPath(runId);
|
|
1185
|
+
if (!fs.existsSync(logPath)) {
|
|
1186
|
+
console.log(`Waiting for progress log: ${logPath}`);
|
|
1187
|
+
}
|
|
1188
|
+
}
|
|
1189
|
+
|
|
1190
|
+
// Poll loop
|
|
1191
|
+
let lastOutput = '';
|
|
1192
|
+
const poll = () => {
|
|
1193
|
+
const { output, done } = render();
|
|
1194
|
+
if (output !== lastOutput) {
|
|
1195
|
+
// Clear screen and redraw
|
|
1196
|
+
process.stdout.write('\x1b[2J\x1b[H');
|
|
1197
|
+
console.log(`Watch: ${runId} (${new Date().toLocaleTimeString()})`);
|
|
1198
|
+
console.log('');
|
|
1199
|
+
console.log(output);
|
|
1200
|
+
lastOutput = output;
|
|
1201
|
+
}
|
|
1202
|
+
if (done) {
|
|
1203
|
+
console.log('\nRun complete. Exiting watch.');
|
|
1204
|
+
process.exit(0);
|
|
1205
|
+
}
|
|
1206
|
+
};
|
|
1207
|
+
|
|
1208
|
+
poll();
|
|
1209
|
+
const interval = setInterval(poll, refreshMs);
|
|
1210
|
+
|
|
1211
|
+
// Clean exit on Ctrl+C
|
|
1212
|
+
process.on('SIGINT', () => {
|
|
1213
|
+
clearInterval(interval);
|
|
1214
|
+
console.log('\nStopped watching.');
|
|
1215
|
+
process.exit(0);
|
|
1216
|
+
});
|
|
1217
|
+
|
|
1218
|
+
// Keep process alive
|
|
1219
|
+
await new Promise(() => {});
|
|
1220
|
+
break;
|
|
1221
|
+
}
|
|
1222
|
+
|
|
1223
|
+
case 'transcript': {
|
|
1224
|
+
const runId = args.find(a => !a.startsWith('--') && a !== 'transcript');
|
|
1225
|
+
if (!runId) {
|
|
1226
|
+
console.error('Usage: eval-cli.js transcript <runId> [--scenario <id>]');
|
|
1227
|
+
process.exit(1);
|
|
1228
|
+
}
|
|
1229
|
+
|
|
1230
|
+
const scenarioFilter = getOption('scenario');
|
|
1231
|
+
const results = evaluationStore.getResults(runId, {
|
|
1232
|
+
scenarioId: scenarioFilter || null,
|
|
1233
|
+
});
|
|
1234
|
+
|
|
1235
|
+
if (results.length === 0) {
|
|
1236
|
+
console.log(`\nNo results found for run: ${runId}`);
|
|
1237
|
+
break;
|
|
1238
|
+
}
|
|
1239
|
+
|
|
1240
|
+
console.log(`\nTranscripts for run: ${runId} (${results.length} results)\n`);
|
|
1241
|
+
|
|
1242
|
+
for (const result of results) {
|
|
1243
|
+
console.log('='.repeat(80));
|
|
1244
|
+
console.log(`Scenario: ${result.scenarioName || result.scenarioId}`);
|
|
1245
|
+
console.log(`Profile: ${result.profileName || `${result.provider}/${result.model}`}`);
|
|
1246
|
+
console.log(`Score: ${result.overallScore != null ? result.overallScore.toFixed(1) : '--'} | Success: ${result.success}`);
|
|
1247
|
+
console.log('-'.repeat(80));
|
|
1248
|
+
|
|
1249
|
+
// Try dialogue log file first
|
|
1250
|
+
let printed = false;
|
|
1251
|
+
if (result.dialogueId) {
|
|
1252
|
+
// Search for the dialogue file (may include date prefix in filename)
|
|
1253
|
+
const files = fs.existsSync(LOGS_DIR)
|
|
1254
|
+
? fs.readdirSync(LOGS_DIR).filter(f => f.includes(result.dialogueId))
|
|
1255
|
+
: [];
|
|
1256
|
+
|
|
1257
|
+
if (files.length > 0) {
|
|
1258
|
+
try {
|
|
1259
|
+
const dialogue = JSON.parse(fs.readFileSync(path.join(LOGS_DIR, files[0]), 'utf-8'));
|
|
1260
|
+
const trace = dialogue.dialogueTrace || [];
|
|
1261
|
+
for (const entry of trace) {
|
|
1262
|
+
console.log(formatTraceEntry(entry));
|
|
1263
|
+
console.log('');
|
|
1264
|
+
}
|
|
1265
|
+
if (trace.length > 0) printed = true;
|
|
1266
|
+
} catch (e) {
|
|
1267
|
+
// Fall through to suggestions
|
|
1268
|
+
}
|
|
1269
|
+
}
|
|
1270
|
+
}
|
|
1271
|
+
|
|
1272
|
+
// Fall back to suggestions / raw response from DB
|
|
1273
|
+
if (!printed) {
|
|
1274
|
+
if (result.suggestions?.length > 0) {
|
|
1275
|
+
console.log('Suggestions:');
|
|
1276
|
+
for (const s of result.suggestions) {
|
|
1277
|
+
const text = typeof s === 'string' ? s : (s.text || s.content || JSON.stringify(s));
|
|
1278
|
+
console.log(` • ${text}`);
|
|
1279
|
+
}
|
|
1280
|
+
console.log('');
|
|
1281
|
+
}
|
|
1282
|
+
if (result.evaluationReasoning) {
|
|
1283
|
+
console.log('Judge reasoning:');
|
|
1284
|
+
console.log(` ${result.evaluationReasoning}`);
|
|
1285
|
+
console.log('');
|
|
1286
|
+
}
|
|
1287
|
+
}
|
|
1288
|
+
|
|
1289
|
+
if (result.errorMessage) {
|
|
1290
|
+
console.log(`ERROR: ${result.errorMessage}`);
|
|
1291
|
+
console.log('');
|
|
1292
|
+
}
|
|
1293
|
+
}
|
|
1294
|
+
|
|
1295
|
+
// Also check for interaction evals
|
|
1296
|
+
const interactionEvals = evaluationStore.listInteractionEvals({ limit: 200 });
|
|
1297
|
+
const runInteractions = interactionEvals.filter(e => e.runId === runId);
|
|
1298
|
+
|
|
1299
|
+
if (runInteractions.length > 0) {
|
|
1300
|
+
console.log('\n' + '='.repeat(80));
|
|
1301
|
+
console.log(' INTERACTION TRANSCRIPTS');
|
|
1302
|
+
console.log('='.repeat(80));
|
|
1303
|
+
|
|
1304
|
+
for (const ie of runInteractions) {
|
|
1305
|
+
const full = evaluationStore.getInteractionEval(ie.evalId);
|
|
1306
|
+
if (!full) continue;
|
|
1307
|
+
|
|
1308
|
+
console.log(`\nScenario: ${full.scenarioName || full.scenarioId}`);
|
|
1309
|
+
console.log(`Tutor: ${full.tutorProfile} | Learner: ${full.learnerProfile}`);
|
|
1310
|
+
console.log(`Turns: ${full.turnCount} | Score: ${full.judgeOverallScore ?? '--'}`);
|
|
1311
|
+
console.log('-'.repeat(80));
|
|
1312
|
+
|
|
1313
|
+
if (full.formattedTranscript) {
|
|
1314
|
+
console.log(full.formattedTranscript);
|
|
1315
|
+
} else if (full.turns?.length > 0) {
|
|
1316
|
+
for (const turn of full.turns) {
|
|
1317
|
+
const speaker = (turn.phase || turn.role || 'unknown').toUpperCase();
|
|
1318
|
+
console.log(`[Turn ${turn.turnNumber || '?'}] ${speaker}:`);
|
|
1319
|
+
console.log(turn.externalMessage || turn.content || '');
|
|
1320
|
+
console.log('');
|
|
1321
|
+
}
|
|
1322
|
+
}
|
|
1323
|
+
}
|
|
1324
|
+
}
|
|
1325
|
+
|
|
1326
|
+
break;
|
|
1327
|
+
}
|
|
1328
|
+
|
|
1329
|
+
case 'cleanup': {
|
|
1330
|
+
const force = getFlag('force');
|
|
1331
|
+
const threshold = parseInt(getOption('older-than', '30'), 10);
|
|
1332
|
+
|
|
1333
|
+
console.log(`\nScanning for stale runs (running > ${threshold} minutes)...`);
|
|
1334
|
+
|
|
1335
|
+
// Dry-run by default; require --force to actually complete
|
|
1336
|
+
const dryRun = !force;
|
|
1337
|
+
if (dryRun) console.log(' (dry run — pass --force to actually complete stale runs)\n');
|
|
1338
|
+
|
|
1339
|
+
const result = evaluationStore.autoCompleteStaleRuns({
|
|
1340
|
+
olderThanMinutes: threshold,
|
|
1341
|
+
dryRun,
|
|
1342
|
+
});
|
|
1343
|
+
|
|
1344
|
+
if (result.found === 0) {
|
|
1345
|
+
console.log('No stale runs found.');
|
|
1346
|
+
} else if (dryRun) {
|
|
1347
|
+
console.log(`Found ${result.found} stale run(s):\n`);
|
|
1348
|
+
for (const run of result.runs) {
|
|
1349
|
+
console.log(` ${run.id} age=${run.ageMinutes}m results=${run.resultsFound} desc="${run.description || ''}"` );
|
|
1350
|
+
}
|
|
1351
|
+
console.log('\nRe-run with --force to mark these as completed.');
|
|
1352
|
+
} else {
|
|
1353
|
+
console.log(`Processed ${result.completed} stale run(s):\n`);
|
|
1354
|
+
for (const run of result.runs) {
|
|
1355
|
+
const status = run.status || (run.alreadyCompleted ? 'already completed' : 'unknown');
|
|
1356
|
+
const partial = run.wasPartial ? ` (partial: ${run.completionRate}%)` : '';
|
|
1357
|
+
console.log(` ${run.runId} → ${status}${partial} results=${run.resultsFound || '--'}`);
|
|
1358
|
+
}
|
|
1359
|
+
}
|
|
1360
|
+
|
|
1361
|
+
console.log('');
|
|
1362
|
+
break;
|
|
1363
|
+
}
|
|
1364
|
+
|
|
1365
|
+
case 'resume': {
|
|
1366
|
+
const runId = args.find(a => !a.startsWith('--') && a !== 'resume');
|
|
1367
|
+
if (!runId) {
|
|
1368
|
+
console.error('Usage: eval-cli.js resume <runId> [--parallelism N] [--verbose] [--force]');
|
|
1369
|
+
process.exit(1);
|
|
1370
|
+
}
|
|
1371
|
+
|
|
1372
|
+
const verbose = getFlag('verbose');
|
|
1373
|
+
const force = getFlag('force');
|
|
1374
|
+
const parallelism = parseInt(getOption('parallelism', '2'), 10);
|
|
1375
|
+
|
|
1376
|
+
const result = await evaluationRunner.resumeEvaluation({
|
|
1377
|
+
runId,
|
|
1378
|
+
parallelism,
|
|
1379
|
+
verbose,
|
|
1380
|
+
force,
|
|
1381
|
+
});
|
|
1382
|
+
|
|
1383
|
+
if (result.alreadyComplete) {
|
|
1384
|
+
break;
|
|
1385
|
+
}
|
|
1386
|
+
|
|
1387
|
+
// Extract unique model aliases (same as `run` command)
|
|
1388
|
+
const extractAlias = (raw) => {
|
|
1389
|
+
if (!raw) return null;
|
|
1390
|
+
const dotIdx = raw.indexOf('.');
|
|
1391
|
+
return dotIdx !== -1 ? raw.slice(dotIdx + 1) : raw;
|
|
1392
|
+
};
|
|
1393
|
+
const modelAliases = [...new Set(
|
|
1394
|
+
(result.stats || []).flatMap(s => [
|
|
1395
|
+
extractAlias(s.egoModel || s.model),
|
|
1396
|
+
extractAlias(s.superegoModel),
|
|
1397
|
+
]).filter(Boolean)
|
|
1398
|
+
)];
|
|
1399
|
+
|
|
1400
|
+
console.log('\nResume complete.');
|
|
1401
|
+
if (modelAliases.length > 0) {
|
|
1402
|
+
console.log(`Models: ${modelAliases.join(', ')}`);
|
|
1403
|
+
}
|
|
1404
|
+
console.log(` Total tests (all): ${result.totalTests}`);
|
|
1405
|
+
console.log(` Resumed tests: ${result.resumedTests}`);
|
|
1406
|
+
console.log(` Successful (this run): ${result.successfulTests}`);
|
|
1407
|
+
console.log(JSON.stringify(result, null, 2));
|
|
1408
|
+
|
|
1409
|
+
// Factorial post-analysis (same as `run` command)
|
|
1410
|
+
if (result.runId) {
|
|
1411
|
+
const scoreTypes = [
|
|
1412
|
+
{ column: 'overall_score', label: 'Overall Score' },
|
|
1413
|
+
{ column: 'base_score', label: 'Base Score' },
|
|
1414
|
+
{ column: 'recognition_score', label: 'Recognition Score' },
|
|
1415
|
+
];
|
|
1416
|
+
|
|
1417
|
+
for (const { column, label } of scoreTypes) {
|
|
1418
|
+
const cellData = evaluationStore.getFactorialCellData(result.runId, { scoreColumn: column });
|
|
1419
|
+
const cellKeys = Object.keys(cellData);
|
|
1420
|
+
const totalSamples = cellKeys.reduce((sum, k) => sum + cellData[k].length, 0);
|
|
1421
|
+
|
|
1422
|
+
if (totalSamples === 0) continue;
|
|
1423
|
+
|
|
1424
|
+
console.log('\n' + '='.repeat(70));
|
|
1425
|
+
console.log(` FACTORIAL ANALYSIS: ${label.toUpperCase()}`);
|
|
1426
|
+
console.log('='.repeat(70));
|
|
1427
|
+
|
|
1428
|
+
for (const key of cellKeys.sort()) {
|
|
1429
|
+
const scores = cellData[key];
|
|
1430
|
+
const mean = scores.reduce((a, b) => a + b, 0) / scores.length;
|
|
1431
|
+
const sd = scores.length > 1
|
|
1432
|
+
? Math.sqrt(scores.reduce((acc, s) => acc + (s - mean) ** 2, 0) / (scores.length - 1))
|
|
1433
|
+
: 0;
|
|
1434
|
+
const cellLabel = key.replace(/r(\d)_t(\d)_l(\d)/, (_, r, t, l) =>
|
|
1435
|
+
`Recog=${r === '1' ? 'Y' : 'N'} Tutor=${t === '1' ? 'Multi' : 'Single'} Learner=${l === '1' ? 'Psycho' : 'Unified'}`
|
|
1436
|
+
);
|
|
1437
|
+
console.log(` ${cellLabel.padEnd(52)} mean=${mean.toFixed(1)} sd=${sd.toFixed(1)} n=${scores.length}`);
|
|
1438
|
+
}
|
|
1439
|
+
|
|
1440
|
+
if (totalSamples > 8) {
|
|
1441
|
+
const anovaResult = anovaStats.runThreeWayANOVA(cellData);
|
|
1442
|
+
console.log(anovaStats.formatANOVAReport(anovaResult, { scoreLabel: label }));
|
|
1443
|
+
} else {
|
|
1444
|
+
console.log(`\n Need > 8 total samples for ANOVA (have ${totalSamples}). Increase --runs.`);
|
|
1445
|
+
}
|
|
1446
|
+
}
|
|
1447
|
+
}
|
|
1448
|
+
break;
|
|
1449
|
+
}
|
|
1450
|
+
|
|
1451
|
+
case 'revert': {
|
|
1452
|
+
const runId = args.find(a => !a.startsWith('--') && a !== 'revert');
|
|
1453
|
+
if (!runId) {
|
|
1454
|
+
console.error('Usage: eval-cli.js revert <runId>');
|
|
1455
|
+
process.exit(1);
|
|
1456
|
+
}
|
|
1457
|
+
|
|
1458
|
+
const run = evaluationStore.getRun(runId);
|
|
1459
|
+
if (!run) {
|
|
1460
|
+
console.error(`Run not found: ${runId}`);
|
|
1461
|
+
process.exit(1);
|
|
1462
|
+
}
|
|
1463
|
+
|
|
1464
|
+
if (run.status === 'running') {
|
|
1465
|
+
console.log(`Run ${runId} is already in 'running' state.`);
|
|
1466
|
+
break;
|
|
1467
|
+
}
|
|
1468
|
+
|
|
1469
|
+
console.log(`Reverting run ${runId} from '${run.status}' → 'running'...`);
|
|
1470
|
+
evaluationStore.updateRun(runId, { status: 'running' });
|
|
1471
|
+
console.log('Done.');
|
|
1472
|
+
break;
|
|
1473
|
+
}
|
|
1474
|
+
|
|
1475
|
+
case 'chat': {
|
|
1476
|
+
await runChat();
|
|
1477
|
+
break;
|
|
1478
|
+
}
|
|
1479
|
+
|
|
1480
|
+
case 'rejudge': {
|
|
1481
|
+
const runId = args.find(a => !a.startsWith('--') && a !== 'rejudge');
|
|
1482
|
+
if (!runId) {
|
|
1483
|
+
console.error('Usage: eval-cli.js rejudge <runId> [--judge <model>] [--scenario <id>] [--verbose] [--overwrite]');
|
|
1484
|
+
console.error('');
|
|
1485
|
+
console.error('By default, creates new rows (preserves history for inter-judge reliability).');
|
|
1486
|
+
console.error('Use --overwrite to replace existing scores instead.');
|
|
1487
|
+
process.exit(1);
|
|
1488
|
+
}
|
|
1489
|
+
|
|
1490
|
+
// Restore env overrides from run metadata
|
|
1491
|
+
{
|
|
1492
|
+
const runData = evaluationStore.getRun(runId);
|
|
1493
|
+
const meta = typeof runData?.metadata === 'string' ? JSON.parse(runData.metadata) : runData?.metadata;
|
|
1494
|
+
if (meta?.scenariosFile && !process.env.EVAL_SCENARIOS_FILE) {
|
|
1495
|
+
process.env.EVAL_SCENARIOS_FILE = meta.scenariosFile;
|
|
1496
|
+
console.log(`[rejudge] Restored EVAL_SCENARIOS_FILE from run metadata: ${meta.scenariosFile}`);
|
|
1497
|
+
}
|
|
1498
|
+
if (meta?.contentPath && !process.env.EVAL_CONTENT_PATH) {
|
|
1499
|
+
process.env.EVAL_CONTENT_PATH = meta.contentPath;
|
|
1500
|
+
console.log(`[rejudge] Restored EVAL_CONTENT_PATH from run metadata: ${meta.contentPath}`);
|
|
1501
|
+
}
|
|
1502
|
+
}
|
|
1503
|
+
|
|
1504
|
+
const verbose = getFlag('verbose');
|
|
1505
|
+
const overwrite = getFlag('overwrite');
|
|
1506
|
+
const judgeOverride = getOption('judge') || null;
|
|
1507
|
+
const scenarioFilter = getOption('scenario') || null;
|
|
1508
|
+
|
|
1509
|
+
console.log(`\nRejudging run: ${runId}`);
|
|
1510
|
+
if (judgeOverride) console.log(` Judge override: ${judgeOverride}`);
|
|
1511
|
+
if (scenarioFilter) console.log(` Scenario filter: ${scenarioFilter}`);
|
|
1512
|
+
console.log(` Mode: ${overwrite ? 'overwrite (replace existing)' : 'preserve history (add new rows)'}`);
|
|
1513
|
+
console.log('');
|
|
1514
|
+
|
|
1515
|
+
const summary = await evaluationRunner.rejudgeRun(runId, {
|
|
1516
|
+
judgeOverride,
|
|
1517
|
+
verbose,
|
|
1518
|
+
scenarioFilter,
|
|
1519
|
+
overwrite,
|
|
1520
|
+
});
|
|
1521
|
+
|
|
1522
|
+
console.log('\n' + '='.repeat(60));
|
|
1523
|
+
console.log(' REJUDGE SUMMARY');
|
|
1524
|
+
console.log('='.repeat(60));
|
|
1525
|
+
console.log(` Run: ${summary.runId}`);
|
|
1526
|
+
console.log(` Total: ${summary.total}`);
|
|
1527
|
+
console.log(` Succeeded: ${summary.succeeded}`);
|
|
1528
|
+
console.log(` Failed: ${summary.failed}`);
|
|
1529
|
+
console.log(` Old avg: ${summary.oldAvgScore?.toFixed(2) ?? 'N/A'}`);
|
|
1530
|
+
console.log(` New avg: ${summary.newAvgScore?.toFixed(2) ?? 'N/A'}`);
|
|
1531
|
+
if (summary.scoreDelta != null) {
|
|
1532
|
+
const sign = summary.scoreDelta >= 0 ? '+' : '';
|
|
1533
|
+
console.log(` Delta: ${sign}${summary.scoreDelta.toFixed(2)}`);
|
|
1534
|
+
}
|
|
1535
|
+
console.log('');
|
|
1536
|
+
break;
|
|
1537
|
+
}
|
|
1538
|
+
|
|
1539
|
+
case 'export': {
  // Export a run's results (optionally filtered by scenario/profile) to a
  // human-readable markdown file, including scenario context, the tutor
  // suggestion, and any recorded dialogue trace.
  const runId = args.find(a => !a.startsWith('--') && a !== 'export');
  if (!runId) {
    console.error('Usage: eval-cli.js export <runId> [--scenario <id>] [--profile <name>] [--output <path>]');
    process.exit(1);
  }

  const scenarioFilter = getOption('scenario') || null;
  const profileFilter = getOption('profile') || null;
  const outputOption = getOption('output') || null;

  const results = evaluationStore.getResults(runId, {
    scenarioId: scenarioFilter,
    profileName: profileFilter,
  });

  if (results.length === 0) {
    console.log(`\nNo results found for run: ${runId}`);
    break;
  }

  // Build output
  const lines = [];
  lines.push(`# Evaluation Export — Run ${runId}`);
  lines.push(`# ${results.length} result(s)`);
  if (scenarioFilter) lines.push(`# Scenario filter: ${scenarioFilter}`);
  if (profileFilter) lines.push(`# Profile filter: ${profileFilter}`);
  lines.push('');

  for (const result of results) {
    // Scenario may no longer exist (e.g. scenarios file changed since the run);
    // the export then omits the scenario-context sections.
    const scenario = getScenario(result.scenarioId);

    // Per-result header block.
    lines.push('='.repeat(80));
    lines.push(`Scenario: ${result.scenarioName || result.scenarioId}`);
    lines.push(`Profile: ${result.profileName || `${result.provider}/${result.model}`}`);
    lines.push(`Provider: ${result.provider} Model: ${result.model}`);
    if (result.egoModel || result.superegoModel) {
      lines.push(`Ego: ${result.egoModel || 'N/A'} Superego: ${result.superegoModel || 'N/A'}`);
    }
    lines.push(`Score: ${result.overallScore != null ? result.overallScore.toFixed(1) : 'NOT EVALUATED'}`);
    lines.push('='.repeat(80));
    lines.push('');

    if (scenario) {
      if (scenario.learner_context) {
        lines.push('### Scenario Context');
        lines.push(scenario.learner_context.trim());
        lines.push('');
      }
      if (scenario.expected_behavior) {
        lines.push('### Expected Behavior');
        lines.push(scenario.expected_behavior);
        lines.push('');
      }
      if (scenario.required_elements?.length > 0) {
        lines.push('### Required Elements');
        for (const el of scenario.required_elements) lines.push(`- ${el}`);
        lines.push('');
      }
      if (scenario.forbidden_elements?.length > 0) {
        lines.push('### Forbidden Elements');
        for (const el of scenario.forbidden_elements) lines.push(`- ${el}`);
        lines.push('');
      }
    }

    // Tutor suggestion(s)
    // Suggestions may be plain strings or structured objects; for objects,
    // only the fields that are present are emitted.
    if (result.suggestions?.length > 0) {
      lines.push('### Tutor Suggestion');
      for (const s of result.suggestions) {
        if (typeof s === 'string') {
          lines.push(s);
        } else {
          if (s.title) lines.push(`Title: ${s.title}`);
          if (s.message || s.text || s.content) lines.push(`Message: ${s.message || s.text || s.content}`);
          if (s.action) lines.push(`Action: ${s.action}${s.actionTarget ? ' → ' + s.actionTarget : ''}`);
          if (s.reasoning) lines.push(`Reasoning: ${s.reasoning}`);
        }
      }
      lines.push('');
    }

    // Dialogue trace
    // NOTE(review): assumes log filenames in LOGS_DIR embed the dialogueId;
    // only the first matching file is read — confirm against the log writer.
    if (result.dialogueId) {
      const files = fs.existsSync(LOGS_DIR)
        ? fs.readdirSync(LOGS_DIR).filter(f => f.includes(result.dialogueId))
        : [];

      if (files.length > 0) {
        try {
          const dialogue = JSON.parse(fs.readFileSync(path.join(LOGS_DIR, files[0]), 'utf-8'));
          const trace = dialogue.dialogueTrace || [];
          if (trace.length > 0) {
            lines.push('### Dialogue Trace');
            for (const entry of trace) {
              lines.push(formatTraceEntry(entry));
            }
            lines.push('');
          }
        } catch (e) {
          // skip — best-effort: a corrupt/unreadable log just omits the trace section
        }
      }
    }

    if (result.errorMessage) {
      lines.push(`### Error`);
      lines.push(result.errorMessage);
      lines.push('');
    }

    lines.push('');
  }

  // Determine output path
  // Default: <package>/exports/eval-<runId>[-scenario][-profile].md
  let outputPath = outputOption;
  if (!outputPath) {
    const exportsDir = path.resolve(__dirname, '..', 'exports');
    if (!fs.existsSync(exportsDir)) fs.mkdirSync(exportsDir, { recursive: true });
    let filename = `eval-${runId}`;
    if (scenarioFilter) filename += `-${scenarioFilter}`;
    if (profileFilter) filename += `-${profileFilter}`;
    filename += '.md';
    outputPath = path.join(exportsDir, filename);
  }

  fs.writeFileSync(outputPath, lines.join('\n'), 'utf-8');
  console.log(`\nExported ${results.length} result(s) to: ${outputPath}`);
  break;
}
|
|
1669
|
+
|
|
1670
|
+
case 'evaluate': {
|
|
1671
|
+
const runId = args.find(a => !a.startsWith('--') && a !== 'evaluate');
|
|
1672
|
+
if (!runId) {
|
|
1673
|
+
console.error('Usage: eval-cli.js evaluate <runId> [--scenario <id>] [--profile <name>] [--model <model>] [--force] [--follow] [--review] [--refresh <ms>] [--verbose]');
|
|
1674
|
+
process.exit(1);
|
|
1675
|
+
}
|
|
1676
|
+
|
|
1677
|
+
const verbose = getFlag('verbose');
|
|
1678
|
+
const force = getFlag('force');
|
|
1679
|
+
const follow = getFlag('follow');
|
|
1680
|
+
const review = getFlag('review');
|
|
1681
|
+
const refreshMs = parseInt(getOption('refresh', '5000'), 10);
|
|
1682
|
+
const scenarioFilter = getOption('scenario') || getOption('scenarios') || null;
|
|
1683
|
+
const profileFilter = getOption('profile') || getOption('profiles') || null;
|
|
1684
|
+
const modelOverride = getOption('model') || null;
|
|
1685
|
+
|
|
1686
|
+
// Restore env overrides from run metadata (e.g. EVAL_SCENARIOS_FILE for domain generalizability runs)
|
|
1687
|
+
{
|
|
1688
|
+
const runData = evaluationStore.getRun(runId);
|
|
1689
|
+
const meta = typeof runData?.metadata === 'string' ? JSON.parse(runData.metadata) : runData?.metadata;
|
|
1690
|
+
if (meta?.scenariosFile && !process.env.EVAL_SCENARIOS_FILE) {
|
|
1691
|
+
process.env.EVAL_SCENARIOS_FILE = meta.scenariosFile;
|
|
1692
|
+
console.log(`[evaluate] Restored EVAL_SCENARIOS_FILE from run metadata: ${meta.scenariosFile}`);
|
|
1693
|
+
}
|
|
1694
|
+
if (meta?.contentPath && !process.env.EVAL_CONTENT_PATH) {
|
|
1695
|
+
process.env.EVAL_CONTENT_PATH = meta.contentPath;
|
|
1696
|
+
console.log(`[evaluate] Restored EVAL_CONTENT_PATH from run metadata: ${meta.contentPath}`);
|
|
1697
|
+
}
|
|
1698
|
+
}
|
|
1699
|
+
|
|
1700
|
+
// Helper: evaluate a single result via claude CLI
|
|
1701
|
+
// Evaluate a single stored result by piping a judging prompt to the `claude` CLI.
//
// Params:
//   result — a stored evaluation result row (scenarioId, suggestions, dialogueId, id, ...)
//   tag    — short prefix used on every progress/log line
// Returns the computed overall score, or null when the result was skipped
// (missing scenario or missing suggestion). May return undefined if the judge
// produced no numeric scores and no overall_score.
// Throws/rejects if the claude subprocess fails or returns unparseable JSON
// (callers are expected to handle that).
async function evaluateOneResult(result, tag) {
  const scenarioId = result.scenarioId;
  const profileName = result.profileName || `${result.provider}/${result.model}`;

  const scenario = getScenario(scenarioId);
  if (!scenario) {
    console.log(`${tag} ${scenarioId} / ${profileName} ... SKIP (scenario not found)`);
    return null;
  }

  // Only the first suggestion is judged.
  const suggestion = result.suggestions?.[0];
  if (!suggestion) {
    console.log(`${tag} ${scenarioId} / ${profileName} ... SKIP (no suggestion)`);
    return null;
  }

  // Load dialogue log for multi-turn context (if available). Best-effort:
  // a missing or corrupt log only disables multi-turn context.
  let dialogueContext = null;
  const dialogueId = result.dialogueId;
  if (dialogueId) {
    const logPath = path.join(LOGS_DIR, `${dialogueId}.json`);
    try {
      if (fs.existsSync(logPath)) {
        const dialogueLog = JSON.parse(fs.readFileSync(logPath, 'utf-8'));
        if (dialogueLog.isMultiTurn && dialogueLog.dialogueTrace?.length > 0) {
          dialogueContext = {
            consolidatedTrace: dialogueLog.dialogueTrace,
            conversationHistory: (dialogueLog.turnResults || []).map((t, i) => ({
              turnIndex: i,
              turnId: t.turnId,
              suggestion: t.suggestions?.[0],
              learnerAction: t.learnerAction,
              learnerMessage: t.learnerMessage,
            })),
          };
          if (verbose) {
            console.log(`${tag} loaded dialogue transcript (${dialogueLog.dialogueTrace.length} trace entries)`);
          }
        }
      }
    } catch (e) {
      if (verbose) console.log(`${tag} could not load dialogue log: ${e.message}`);
    }
  }

  const prompt = buildEvaluationPrompt(suggestion, {
    name: scenario.name,
    description: scenario.description,
    expectedBehavior: scenario.expected_behavior,
    learnerContext: scenario.learner_context,
    requiredElements: scenario.required_elements,
    forbiddenElements: scenario.forbidden_elements,
  }, { dialogueContext });

  // `-p -` reads the prompt from stdin; plain-text output is parsed below.
  const claudeArgs = ['-p', '-', '--output-format', 'text'];
  if (modelOverride) {
    claudeArgs.push('--model', modelOverride);
  }

  if (verbose) {
    console.log(`${tag} ${scenarioId} / ${profileName} ... calling claude`);
  }

  const stdout = await new Promise((resolve, reject) => {
    const env = { ...process.env };
    // NOTE(review): presumably forces the claude CLI to use its own login
    // session rather than the API key — confirm.
    delete env.ANTHROPIC_API_KEY;
    const child = spawn('claude', claudeArgs, {
      stdio: ['pipe', 'pipe', 'pipe'],
      env,
    });
    let out = '';
    let err = '';
    child.stdout.on('data', d => { out += d; });
    child.stderr.on('data', d => { err += d; });
    child.on('error', reject);
    child.on('close', code => {
      if (code !== 0) reject(new Error(err || out || `claude exited with code ${code}`));
      else resolve(out);
    });
    child.stdin.write(prompt);
    child.stdin.end();
  });

  // Extract the JSON payload: prefer a fenced ```json block, otherwise take
  // the outermost {...} span of the raw output.
  let jsonStr = stdout.trim();
  const fenceMatch = jsonStr.match(/```(?:json)?\s*([\s\S]*?)```/);
  if (fenceMatch) {
    jsonStr = fenceMatch[1].trim();
  } else {
    const firstBrace = jsonStr.indexOf('{');
    const lastBrace = jsonStr.lastIndexOf('}');
    if (firstBrace !== -1 && lastBrace > firstBrace) {
      jsonStr = jsonStr.slice(firstBrace, lastBrace + 1);
    }
  }

  const parsed = JSON.parse(jsonStr);

  // Map judge-emitted dimension names onto the canonical keys used by the
  // score calculators; unknown keys pass through unchanged.
  const dimensionMap = {
    relevance: 'relevance',
    specificity: 'specificity',
    pedagogical_soundness: 'pedagogical',
    pedagogical: 'pedagogical',
    personalization: 'personalization',
    actionability: 'actionability',
    tone: 'tone',
  };

  const normalizedScores = {};
  for (const [key, value] of Object.entries(parsed.scores || {})) {
    const normalizedKey = dimensionMap[key] || key;
    if (typeof value === 'object' && value !== null) {
      normalizedScores[normalizedKey] = { score: value.score, reasoning: value.reasoning };
    } else if (typeof value === 'number') {
      normalizedScores[normalizedKey] = { score: value, reasoning: null };
    }
  }

  const overallScore = Object.keys(normalizedScores).length > 0
    ? calculateOverallScore(normalizedScores)
    : parsed.overall_score;
  const baseScore = calculateBaseScore(normalizedScores);
  const recognitionScore = calculateRecognitionScore(normalizedScores);

  const evaluation = {
    scores: normalizedScores,
    overallScore,
    baseScore,
    recognitionScore,
    passesRequired: parsed.validation?.passes_required ?? true,
    passesForbidden: parsed.validation?.passes_forbidden ?? true,
    requiredMissing: parsed.validation?.required_missing || [],
    forbiddenFound: parsed.validation?.forbidden_found || [],
    summary: parsed.summary,
    judgeModel: modelOverride ? `claude-code/${modelOverride}` : 'claude-code/opus',
  };

  evaluationStore.updateResultScores(result.id, evaluation);

  // Score line. Guard toFixed: the judge may return neither numeric dimension
  // scores nor overall_score, leaving overallScore undefined — the original
  // code crashed with a TypeError here in that case.
  const dimScores = Object.entries(normalizedScores)
    .map(([k, v]) => `${k}=${v.score}`)
    .join(' ');
  const overallText = overallScore != null ? overallScore.toFixed(1) : 'N/A';
  console.log(`${tag} ${scenarioId} / ${profileName} ... ${overallText} (${dimScores})`);

  if (verbose) {
    // Truncated suggestion excerpt
    const suggText = typeof suggestion === 'string'
      ? suggestion
      : (suggestion.message || suggestion.text || suggestion.content || JSON.stringify(suggestion));
    const truncSugg = suggText.length > 200
      ? suggText.slice(0, 200).replace(/\n/g, ' ') + '...'
      : suggText.replace(/\n/g, ' ');
    console.log(` Suggestion: ${truncSugg}`);

    // Judge summary
    if (parsed.summary) {
      const truncSummary = parsed.summary.length > 300
        ? parsed.summary.slice(0, 300).replace(/\n/g, ' ') + '...'
        : parsed.summary.replace(/\n/g, ' ');
      console.log(` Judge: ${truncSummary}`);
    }
    console.log('');
  }

  return overallScore;
}
|
|
1867
|
+
|
|
1868
|
+
// Helper: print summary
|
|
1869
|
+
// Print a boxed summary of evaluation outcomes for this invocation.
// The average-score line is omitted entirely when no scores were collected.
function printEvaluateSummary(succeeded, failed, totalAttempted, scores) {
  const divider = '='.repeat(50);
  console.log('\n' + divider);
  console.log(' EVALUATE SUMMARY');
  console.log(divider);
  console.log(` Total: ${totalAttempted}`);
  console.log(` Succeeded: ${succeeded}`);
  console.log(` Failed: ${failed}`);
  if (scores.length > 0) {
    let sum = 0;
    for (const s of scores) sum += s;
    console.log(` Avg score: ${(sum / scores.length).toFixed(1)}`);
  }
  console.log('');
}
|
|
1885
|
+
|
|
1886
|
+
// Helper: run holistic dialogue evaluation for multi-turn dialogues
|
|
1887
|
+
// Helper: judge each multi-turn dialogue as a whole (not per-turn) via the
// `claude` CLI, and persist the holistic score back into the dialogue's JSON
// log file in LOGS_DIR. Dialogues that cannot be judged are logged and skipped.
async function evaluateHolisticDialogues(evaluatedResults) {
  // Group results by dialogueId to find multi-turn dialogues
  const dialogueGroups = new Map();
  for (const result of evaluatedResults) {
    if (result.dialogueId) {
      if (!dialogueGroups.has(result.dialogueId)) {
        dialogueGroups.set(result.dialogueId, []);
      }
      dialogueGroups.get(result.dialogueId).push(result);
    }
  }

  // Filter to multi-turn dialogues (2+ results sharing a dialogueId)
  const multiTurnDialogues = [...dialogueGroups.entries()].filter(([, results]) => results.length > 1);
  if (multiTurnDialogues.length === 0) return;

  console.log(`\n${'─'.repeat(50)}`);
  console.log(` HOLISTIC DIALOGUE EVALUATION (${multiTurnDialogues.length} dialogue(s))`);
  console.log(`${'─'.repeat(50)}\n`);

  for (const [dialogueId, results] of multiTurnDialogues) {
    // Dialogue transcript lives at LOGS_DIR/<dialogueId>.json.
    const logPath = path.join(LOGS_DIR, `${dialogueId}.json`);
    let dialogueLog;
    try {
      if (!fs.existsSync(logPath)) {
        console.log(` ${dialogueId} ... SKIP (dialogue log not found)`);
        continue;
      }
      dialogueLog = JSON.parse(fs.readFileSync(logPath, 'utf-8'));
    } catch (e) {
      console.log(` ${dialogueId} ... SKIP (could not load: ${e.message})`);
      continue;
    }

    if (!dialogueLog.isMultiTurn || !dialogueLog.dialogueTrace?.length) {
      console.log(` ${dialogueId} ... SKIP (not multi-turn or no trace)`);
      continue;
    }

    // Build context from the dialogue log
    const consolidatedTrace = dialogueLog.dialogueTrace;
    const conversationHistory = (dialogueLog.turnResults || []).map((t, i) => ({
      turnIndex: i,
      turnId: t.turnId,
      suggestion: t.suggestions?.[0],
      learnerAction: t.learnerAction,
      learnerMessage: t.learnerMessage,
    }));

    // Use the last turn's suggestion as the focal point
    // NOTE(review): assumes `results` is ordered by turn — confirm against
    // the order evaluatedResults is built in.
    const lastResult = results[results.length - 1];
    const lastSuggestion = lastResult.suggestions?.[0];
    if (!lastSuggestion) {
      console.log(` ${dialogueId} ... SKIP (no suggestion on last turn)`);
      continue;
    }

    const scenarioId = lastResult.scenarioId;
    const scenario = getScenario(scenarioId);
    if (!scenario) {
      console.log(` ${dialogueId} ... SKIP (scenario ${scenarioId} not found)`);
      continue;
    }

    // The judge is told explicitly to score the interaction across all turns.
    const prompt = buildEvaluationPrompt(lastSuggestion, {
      name: `${scenario.name} (holistic dialogue)`,
      description: `Holistic evaluation of ${results.length}-turn dialogue. Score the overall quality of the tutoring interaction across all turns, not just this final response.`,
      expectedBehavior: scenario.expected_behavior,
      learnerContext: scenario.learner_context,
      requiredElements: scenario.required_elements,
      forbiddenElements: scenario.forbidden_elements,
    }, {
      dialogueContext: { conversationHistory, consolidatedTrace },
    });

    try {
      // `-p -` reads the prompt from stdin; output parsed as text below.
      const claudeArgs = ['-p', '-', '--output-format', 'text'];
      if (modelOverride) claudeArgs.push('--model', modelOverride);

      const stdout = await new Promise((resolve, reject) => {
        const env = { ...process.env };
        // NOTE(review): presumably forces the claude CLI to use its own
        // login session rather than the API key — confirm.
        delete env.ANTHROPIC_API_KEY;
        const child = spawn('claude', claudeArgs, {
          stdio: ['pipe', 'pipe', 'pipe'],
          env,
        });
        let out = '';
        let err = '';
        child.stdout.on('data', d => { out += d; });
        child.stderr.on('data', d => { err += d; });
        child.on('error', reject);
        child.on('close', code => {
          if (code !== 0) reject(new Error(err || out || `claude exited with code ${code}`));
          else resolve(out);
        });
        child.stdin.write(prompt);
        child.stdin.end();
      });

      // Extract the JSON payload: prefer a fenced block, else the outermost {...}.
      let jsonStr = stdout.trim();
      const fenceMatch = jsonStr.match(/```(?:json)?\s*([\s\S]*?)```/);
      if (fenceMatch) {
        jsonStr = fenceMatch[1].trim();
      } else {
        const firstBrace = jsonStr.indexOf('{');
        const lastBrace = jsonStr.lastIndexOf('}');
        if (firstBrace !== -1 && lastBrace > firstBrace) {
          jsonStr = jsonStr.slice(firstBrace, lastBrace + 1);
        }
      }

      const parsed = JSON.parse(jsonStr);

      // Normalize judge dimension names to canonical keys (unknown keys pass through).
      const normalizedScores = {};
      const dimensionMap = {
        relevance: 'relevance', specificity: 'specificity',
        pedagogical_soundness: 'pedagogical', pedagogical: 'pedagogical',
        personalization: 'personalization', actionability: 'actionability', tone: 'tone',
      };
      for (const [key, value] of Object.entries(parsed.scores || {})) {
        const normalizedKey = dimensionMap[key] || key;
        if (typeof value === 'object' && value !== null) {
          normalizedScores[normalizedKey] = { score: value.score, reasoning: value.reasoning };
        } else if (typeof value === 'number') {
          normalizedScores[normalizedKey] = { score: value, reasoning: null };
        }
      }

      const overallScore = Object.keys(normalizedScores).length > 0
        ? calculateOverallScore(normalizedScores) : parsed.overall_score;
      const baseScore = calculateBaseScore(normalizedScores);
      const recognitionScore = calculateRecognitionScore(normalizedScores);

      const holisticScore = {
        overallScore,
        baseScore,
        recognitionScore,
        scores: normalizedScores,
        summary: parsed.summary,
        judgeModel: modelOverride ? `claude-code/${modelOverride}` : 'claude-code/opus',
      };

      // Save to dialogue log
      dialogueLog.holisticDialogueScore = holisticScore;
      fs.writeFileSync(logPath, JSON.stringify(dialogueLog, null, 2));

      const profileName = lastResult.profileName || `${lastResult.provider}/${lastResult.model}`;
      console.log(` ${scenarioId} / ${profileName} ... holistic=${overallScore.toFixed(1)} (base=${baseScore.toFixed(1)} recog=${recognitionScore.toFixed(1)})`);
      if (verbose && parsed.summary) {
        const truncSummary = parsed.summary.length > 300
          ? parsed.summary.slice(0, 300).replace(/\n/g, ' ') + '...'
          : parsed.summary.replace(/\n/g, ' ');
        console.log(` Judge: ${truncSummary}\n`);
      }
    } catch (err) {
      // One failed dialogue does not abort the others.
      const msg = err.stderr ? err.stderr.slice(0, 200) : err.message;
      console.log(` ${dialogueId} ... FAIL: ${msg}`);
    }
  }
}
|
|
2047
|
+
|
|
2048
|
+
// ── Review mode: show stored reasoning without re-evaluating ──
|
|
2049
|
+
if (review) {
|
|
2050
|
+
const results = evaluationStore.getResults(runId, {
|
|
2051
|
+
scenarioId: scenarioFilter,
|
|
2052
|
+
profileName: profileFilter,
|
|
2053
|
+
});
|
|
2054
|
+
|
|
2055
|
+
if (results.length === 0) {
|
|
2056
|
+
console.error(`No results found for run: ${runId}`);
|
|
2057
|
+
process.exit(1);
|
|
2058
|
+
}
|
|
2059
|
+
|
|
2060
|
+
const evaluated = results.filter(r => r.baseScore != null);
|
|
2061
|
+
if (evaluated.length === 0) {
|
|
2062
|
+
console.log('No evaluated results to review. Run evaluate first.');
|
|
2063
|
+
break;
|
|
2064
|
+
}
|
|
2065
|
+
|
|
2066
|
+
console.log(`\nReviewing ${evaluated.length} evaluated result(s) for run: ${runId}\n`);
|
|
2067
|
+
|
|
2068
|
+
for (let i = 0; i < evaluated.length; i++) {
|
|
2069
|
+
const r = evaluated[i];
|
|
2070
|
+
const profileName = r.profileName || `${r.provider}/${r.model}`;
|
|
2071
|
+
|
|
2072
|
+
// Dimension scores on one line
|
|
2073
|
+
const dimScores = Object.entries(r.scores || {})
|
|
2074
|
+
.filter(([, v]) => v != null)
|
|
2075
|
+
.map(([k, v]) => {
|
|
2076
|
+
const score = typeof v === 'object' ? v.score : v;
|
|
2077
|
+
return `${k}=${score}`;
|
|
2078
|
+
})
|
|
2079
|
+
.join(' ');
|
|
2080
|
+
|
|
2081
|
+
console.log(`[${i + 1}/${evaluated.length}] ${r.scenarioId} / ${profileName} ... ${r.overallScore?.toFixed(1) ?? '--'} (${dimScores})`);
|
|
2082
|
+
|
|
2083
|
+
// Suggestion excerpt
|
|
2084
|
+
const suggestion = r.suggestions?.[0];
|
|
2085
|
+
if (suggestion) {
|
|
2086
|
+
const suggText = typeof suggestion === 'string'
|
|
2087
|
+
? suggestion
|
|
2088
|
+
: (suggestion.message || suggestion.text || suggestion.content || JSON.stringify(suggestion));
|
|
2089
|
+
const truncSugg = suggText.length > 200
|
|
2090
|
+
? suggText.slice(0, 200).replace(/\n/g, ' ') + '...'
|
|
2091
|
+
: suggText.replace(/\n/g, ' ');
|
|
2092
|
+
console.log(` Suggestion: ${truncSugg}`);
|
|
2093
|
+
}
|
|
2094
|
+
|
|
2095
|
+
// Judge summary
|
|
2096
|
+
if (r.evaluationReasoning) {
|
|
2097
|
+
const truncReasoning = r.evaluationReasoning.length > 300
|
|
2098
|
+
? r.evaluationReasoning.slice(0, 300).replace(/\n/g, ' ') + '...'
|
|
2099
|
+
: r.evaluationReasoning.replace(/\n/g, ' ');
|
|
2100
|
+
console.log(` Judge: ${truncReasoning}`);
|
|
2101
|
+
}
|
|
2102
|
+
|
|
2103
|
+
// Per-dimension reasoning (verbose only)
|
|
2104
|
+
if (verbose && r.scores) {
|
|
2105
|
+
for (const [dim, val] of Object.entries(r.scores)) {
|
|
2106
|
+
if (typeof val === 'object' && val?.reasoning) {
|
|
2107
|
+
const truncDim = val.reasoning.length > 150
|
|
2108
|
+
? val.reasoning.slice(0, 150).replace(/\n/g, ' ') + '...'
|
|
2109
|
+
: val.reasoning.replace(/\n/g, ' ');
|
|
2110
|
+
console.log(` ${dim} (${val.score}): ${truncDim}`);
|
|
2111
|
+
}
|
|
2112
|
+
}
|
|
2113
|
+
}
|
|
2114
|
+
console.log('');
|
|
2115
|
+
}
|
|
2116
|
+
|
|
2117
|
+
// Quick stats
|
|
2118
|
+
const reviewScores = evaluated.map(r => r.overallScore).filter(s => s != null);
|
|
2119
|
+
if (reviewScores.length > 0) {
|
|
2120
|
+
const avg = reviewScores.reduce((a, b) => a + b, 0) / reviewScores.length;
|
|
2121
|
+
const sd = Math.sqrt(reviewScores.reduce((acc, s) => acc + (s - avg) ** 2, 0) / (reviewScores.length - 1));
|
|
2122
|
+
console.log(`Reviewed ${evaluated.length} results: avg=${avg.toFixed(1)} sd=${sd.toFixed(1)}`);
|
|
2123
|
+
}
|
|
2124
|
+
break;
|
|
2125
|
+
}
|
|
2126
|
+
|
|
2127
|
+
let succeeded = 0;
|
|
2128
|
+
let failed = 0;
|
|
2129
|
+
const scores = [];
|
|
2130
|
+
|
|
2131
|
+
if (follow) {
|
|
2132
|
+
// ── Follow mode: poll for new unevaluated results ──
|
|
2133
|
+
// Show initial status
|
|
2134
|
+
const initialResults = evaluationStore.getResults(runId, {
|
|
2135
|
+
scenarioId: scenarioFilter,
|
|
2136
|
+
profileName: profileFilter,
|
|
2137
|
+
});
|
|
2138
|
+
const initialTotal = initialResults.filter(r => r.success).length;
|
|
2139
|
+
const initialUnevaluated = initialResults.filter(r => r.baseScore == null && r.success).length;
|
|
2140
|
+
const initialEvaluated = initialTotal - initialUnevaluated;
|
|
2141
|
+
|
|
2142
|
+
console.log(`\nFollowing run: ${runId}`);
|
|
2143
|
+
console.log(` Already scored: ${initialEvaluated}/${initialTotal}`);
|
|
2144
|
+
console.log(` Need scoring: ${initialUnevaluated}`);
|
|
2145
|
+
if (modelOverride) console.log(` Model: ${modelOverride}`);
|
|
2146
|
+
console.log(` Polling every ${refreshMs}ms for new results...`);
|
|
2147
|
+
console.log('');
|
|
2148
|
+
|
|
2149
|
+
const processedIds = new Set();
|
|
2150
|
+
let evalCounter = 0;
|
|
2151
|
+
let interrupted = false;
|
|
2152
|
+
|
|
2153
|
+
// SIGINT handler: print summary so far and exit
|
|
2154
|
+
const sigintHandler = () => {
|
|
2155
|
+
interrupted = true;
|
|
2156
|
+
console.log('\n\nInterrupted by user.');
|
|
2157
|
+
printEvaluateSummary(succeeded, failed, succeeded + failed, scores);
|
|
2158
|
+
process.exit(0);
|
|
2159
|
+
};
|
|
2160
|
+
process.on('SIGINT', sigintHandler);
|
|
2161
|
+
|
|
2162
|
+
const sleep = (ms) => new Promise(resolve => setTimeout(resolve, ms));
|
|
2163
|
+
|
|
2164
|
+
while (!interrupted) {
|
|
2165
|
+
// Fetch results that have a suggestion but no rubric evaluation
|
|
2166
|
+
const results = evaluationStore.getResults(runId, {
|
|
2167
|
+
scenarioId: scenarioFilter,
|
|
2168
|
+
profileName: profileFilter,
|
|
2169
|
+
});
|
|
2170
|
+
|
|
2171
|
+
const unevaluated = results.filter(r =>
|
|
2172
|
+
r.baseScore == null && r.success && !processedIds.has(r.id)
|
|
2173
|
+
);
|
|
2174
|
+
|
|
2175
|
+
// Total results available so far (for progress display)
|
|
2176
|
+
const totalResults = results.filter(r => r.success).length;
|
|
2177
|
+
const alreadyEvaluated = results.filter(r => r.baseScore != null && r.success).length;
|
|
2178
|
+
|
|
2179
|
+
// Process each new unevaluated result
|
|
2180
|
+
let batchIndex = 0;
|
|
2181
|
+
const batchSize = unevaluated.length;
|
|
2182
|
+
for (const result of unevaluated) {
|
|
2183
|
+
if (interrupted) break;
|
|
2184
|
+
processedIds.add(result.id);
|
|
2185
|
+
evalCounter++;
|
|
2186
|
+
batchIndex++;
|
|
2187
|
+
// Show: [batch progress] (overall: evaluated/total)
|
|
2188
|
+
const tag = `[${batchIndex}/${batchSize}] (${alreadyEvaluated + batchIndex}/${totalResults} scored)`;
|
|
2189
|
+
|
|
2190
|
+
try {
|
|
2191
|
+
const score = await evaluateOneResult(result, tag);
|
|
2192
|
+
if (score != null) {
|
|
2193
|
+
scores.push(score);
|
|
2194
|
+
succeeded++;
|
|
2195
|
+
} else {
|
|
2196
|
+
failed++;
|
|
2197
|
+
}
|
|
2198
|
+
} catch (err) {
|
|
2199
|
+
failed++;
|
|
2200
|
+
const msg = err.stderr ? err.stderr.slice(0, 200) : err.message;
|
|
2201
|
+
const profileName = result.profileName || `${result.provider}/${result.model}`;
|
|
2202
|
+
console.log(`${tag} ${result.scenarioId} / ${profileName} ... FAIL: ${msg}`);
|
|
2203
|
+
if (verbose) console.error(err);
|
|
2204
|
+
}
|
|
2205
|
+
}
|
|
2206
|
+
|
|
2207
|
+
// Check if run is done and no unevaluated results remain
|
|
2208
|
+
const run = evaluationStore.getRun(runId);
|
|
2209
|
+
const runStatus = run?.status || 'unknown';
|
|
2210
|
+
|
|
2211
|
+
if (runStatus !== 'running' && unevaluated.length === 0) {
|
|
2212
|
+
// Re-check one more time to avoid race condition
|
|
2213
|
+
const finalResults = evaluationStore.getResults(runId, {
|
|
2214
|
+
scenarioId: scenarioFilter,
|
|
2215
|
+
profileName: profileFilter,
|
|
2216
|
+
});
|
|
2217
|
+
const finalUnevaluated = finalResults.filter(r =>
|
|
2218
|
+
r.baseScore == null && r.success && !processedIds.has(r.id)
|
|
2219
|
+
);
|
|
2220
|
+
if (finalUnevaluated.length === 0) {
|
|
2221
|
+
console.log(`\nRun ${runStatus}. All results evaluated.`);
|
|
2222
|
+
break;
|
|
2223
|
+
}
|
|
2224
|
+
}
|
|
2225
|
+
|
|
2226
|
+
// Status line while waiting
|
|
2227
|
+
const evaluatedCount = results.filter(r => r.baseScore != null).length;
|
|
2228
|
+
console.log(`Waiting for new results... (${evaluatedCount} evaluated of ${totalResults} total, run ${runStatus})`);
|
|
2229
|
+
|
|
2230
|
+
await sleep(refreshMs);
|
|
2231
|
+
}
|
|
2232
|
+
|
|
2233
|
+
process.removeListener('SIGINT', sigintHandler);
|
|
2234
|
+
printEvaluateSummary(succeeded, failed, succeeded + failed, scores);
|
|
2235
|
+
|
|
2236
|
+
// Holistic dialogue evaluation for multi-turn dialogues
|
|
2237
|
+
const allResults = evaluationStore.getResults(runId, {
|
|
2238
|
+
scenarioId: scenarioFilter,
|
|
2239
|
+
profileName: profileFilter,
|
|
2240
|
+
}).filter(r => r.success && r.baseScore != null);
|
|
2241
|
+
await evaluateHolisticDialogues(allResults);
|
|
2242
|
+
|
|
2243
|
+
} else {
|
|
2244
|
+
// ── One-shot mode (existing behavior) ──
|
|
2245
|
+
|
|
2246
|
+
// Load results for this run
|
|
2247
|
+
const results = evaluationStore.getResults(runId, {
|
|
2248
|
+
scenarioId: scenarioFilter,
|
|
2249
|
+
profileName: profileFilter,
|
|
2250
|
+
});
|
|
2251
|
+
|
|
2252
|
+
if (results.length === 0) {
|
|
2253
|
+
console.error(`No results found for run: ${runId}`);
|
|
2254
|
+
process.exit(1);
|
|
2255
|
+
}
|
|
2256
|
+
|
|
2257
|
+
// Filter to unevaluated results unless --force
|
|
2258
|
+
// Use baseScore == null to detect skip-rubric results (overallScore=100 but no rubric dims)
|
|
2259
|
+
const toEvaluate = force
|
|
2260
|
+
? results
|
|
2261
|
+
: results.filter(r => r.baseScore == null && r.success);
|
|
2262
|
+
|
|
2263
|
+
if (toEvaluate.length === 0) {
|
|
2264
|
+
console.log('All results already have rubric scores. Use --review to inspect reasoning, or --force to re-evaluate.');
|
|
2265
|
+
break;
|
|
2266
|
+
}
|
|
2267
|
+
|
|
2268
|
+
console.log(`\nEvaluating ${toEvaluate.length} result(s) for run: ${runId}`);
|
|
2269
|
+
if (modelOverride) console.log(` Model: ${modelOverride}`);
|
|
2270
|
+
console.log('');
|
|
2271
|
+
|
|
2272
|
+
for (let i = 0; i < toEvaluate.length; i++) {
|
|
2273
|
+
const result = toEvaluate[i];
|
|
2274
|
+
const tag = `[${i + 1}/${toEvaluate.length}]`;
|
|
2275
|
+
|
|
2276
|
+
try {
|
|
2277
|
+
const score = await evaluateOneResult(result, tag);
|
|
2278
|
+
if (score != null) {
|
|
2279
|
+
scores.push(score);
|
|
2280
|
+
succeeded++;
|
|
2281
|
+
} else {
|
|
2282
|
+
failed++;
|
|
2283
|
+
}
|
|
2284
|
+
} catch (err) {
|
|
2285
|
+
failed++;
|
|
2286
|
+
const profileName = result.profileName || `${result.provider}/${result.model}`;
|
|
2287
|
+
const msg = err.stderr ? err.stderr.slice(0, 200) : err.message;
|
|
2288
|
+
console.log(`${tag} ${result.scenarioId} / ${profileName} ... FAIL: ${msg}`);
|
|
2289
|
+
if (verbose) console.error(err);
|
|
2290
|
+
}
|
|
2291
|
+
}
|
|
2292
|
+
|
|
2293
|
+
printEvaluateSummary(succeeded, failed, toEvaluate.length, scores);
|
|
2294
|
+
|
|
2295
|
+
// Holistic dialogue evaluation for multi-turn dialogues
|
|
2296
|
+
await evaluateHolisticDialogues(toEvaluate.filter(r => r.success));
|
|
2297
|
+
}
|
|
2298
|
+
break;
|
|
2299
|
+
}
|
|
2300
|
+
|
|
2301
|
+
case 'evaluate-learner': {
  // ── Learner-side evaluation: score learner turns from multi-turn dialogues ──
  //
  // Data lives in evaluation_results (per-dialogue rows with dialogueId)
  // and logs/tutor-dialogues/*.json (full dialogue traces with learner turns).
  //
  // For each dialogue:
  //   1. Load the log file to get learner turn messages + deliberation traces
  //   2. Build a learner evaluation prompt per learner turn (truncated context)
  //   3. Call Claude as judge (via the `claude` CLI, prompt on stdin)
  //   4. Store per-turn scores as JSON + overall learner score on the result row

  // First bare (non-flag) argument that isn't the command name is the run id.
  const runId = args.find(a => !a.startsWith('--') && a !== 'evaluate-learner');
  if (!runId) {
    console.error('Usage: eval-cli.js evaluate-learner <runId> [--model <model>] [--force] [--verbose] [--arch <architecture>]');
    console.error(' Scores learner turns from dialogue logs using the learner rubric.');
    console.error(' Only works on multi-turn runs with learner turns (e.g., bilateral transformation).');
    console.error(' --arch filters by learner_architecture (e.g., ego_superego_recognition)');
    process.exit(1);
  }

  const verbose = getFlag('verbose');
  const force = getFlag('force');
  const modelOverride = getOption('model') || null;
  const profileFilter = getOption('profile') || getOption('profiles') || null;
  const archFilter = getOption('arch') || null;

  // Load results with dialogue IDs (multi-turn data); only successful rows
  // that actually produced a dialogue log are eligible.
  const allResults = evaluationStore.getResults(runId, { profileName: profileFilter });
  let dialogueResults = allResults.filter(r => r.dialogueId && r.success);
  if (archFilter) {
    dialogueResults = dialogueResults.filter(r => r.learnerArchitecture === archFilter);
  }

  if (dialogueResults.length === 0) {
    console.error(`No multi-turn dialogue results found for run: ${runId}`);
    console.error('This command only works on runs that produced dialogue log files.');
    process.exit(1);
  }

  // Filter to those needing learner evaluation (unless --force re-scores all).
  const toEvaluate = force
    ? dialogueResults
    : dialogueResults.filter(r => r.learnerOverallScore == null);

  if (toEvaluate.length === 0) {
    console.log('All dialogue results already have learner scores. Use --force to re-evaluate.');
    break;
  }

  console.log(`\nEvaluating learner turns for ${toEvaluate.length} dialogue(s) from run: ${runId}`);
  if (modelOverride) console.log(` Model: ${modelOverride}`);
  console.log('');

  let succeeded = 0;
  let failed = 0;
  const allScores = [];

  for (let i = 0; i < toEvaluate.length; i++) {
    const result = toEvaluate[i];
    const profileName = result.profileName || `${result.provider}/${result.model}`;
    const tag = `[${i + 1}/${toEvaluate.length}]`;

    // Load dialogue log file; missing or unparseable logs are counted as
    // failures but do not abort the whole batch.
    const logPath = path.join(LOGS_DIR, `${result.dialogueId}.json`);
    let dialogueLog;
    try {
      if (!fs.existsSync(logPath)) {
        console.log(`${tag} ${result.scenarioId} / ${profileName} ... SKIP (log file not found)`);
        failed++;
        continue;
      }
      dialogueLog = JSON.parse(fs.readFileSync(logPath, 'utf-8'));
    } catch (e) {
      console.log(`${tag} ${result.scenarioId} / ${profileName} ... SKIP (${e.message})`);
      failed++;
      continue;
    }

    if (!dialogueLog.isMultiTurn) {
      console.log(`${tag} ${result.scenarioId} / ${profileName} ... SKIP (not multi-turn)`);
      failed++;
      continue;
    }

    const trace = dialogueLog.dialogueTrace || [];
    const learnerArch = dialogueLog.learnerArchitecture || 'unified';
    // Any architecture containing ego/superego or psychodynamic machinery is
    // treated as multi-agent (has internal deliberation to extract).
    const isMultiAgent = learnerArch.includes('ego_superego') || learnerArch === 'multi_agent' || learnerArch.includes('psychodynamic');

    // Extract learner turns from dialogue trace.
    // Each learner turn consists of:
    //   - turn_action entry (contextSummary = external message)
    //   - For multi-agent: preceding learner_ego_initial, learner_superego, learner_ego_revision entries
    // NOTE(review): assumes contextSummary carries the full learner message,
    // not a truncated summary — confirm against the dialogue logger.
    const learnerTurns = [];
    const turnActionEntries = trace.filter(t => t.agent === 'user' && t.action === 'turn_action');

    for (const ta of turnActionEntries) {
      const turnData = {
        turnIndex: ta.turnIndex,
        externalMessage: ta.contextSummary || '',
        internalDeliberation: [],
      };

      // Find deliberation entries associated with this turn action.
      // They appear before the turn_action in the trace and after the previous tutor turn.
      if (isMultiAgent) {
        const taIdx = trace.indexOf(ta);
        // Walk backward from turn_action to find learner deliberation entries;
        // unshift keeps them in chronological order.
        for (let j = taIdx - 1; j >= 0; j--) {
          const entry = trace[j];
          if (entry.agent === 'learner_ego_initial' && entry.action === 'deliberation') {
            turnData.internalDeliberation.unshift({ role: 'ego_initial', content: entry.contextSummary || '' });
            break; // ego_initial is the first step, stop here
          } else if (entry.agent === 'learner_superego' && entry.action === 'deliberation') {
            turnData.internalDeliberation.unshift({ role: 'superego', content: entry.contextSummary || '' });
          } else if (entry.agent === 'learner_ego_revision' && entry.action === 'deliberation') {
            turnData.internalDeliberation.unshift({ role: 'ego_revision', content: entry.contextSummary || '' });
          } else if (entry.agent === 'learner_synthesis' && entry.action === 'response') {
            // synthesis is the final merged output, skip (same as external message)
          } else if (entry.agent === 'ego' || entry.agent === 'system') {
            break; // Reached the tutor's turn, stop
          }
        }
      }

      learnerTurns.push(turnData);
    }

    if (learnerTurns.length === 0) {
      console.log(`${tag} ${result.scenarioId} / ${profileName} ... SKIP (no learner turns in trace)`);
      failed++;
      continue;
    }

    // Build a reconstructed turn array for the prompt builder:
    // interleave tutor suggestions and learner messages.
    const reconstructedTurns = [];
    const turnResults = dialogueLog.turnResults || [];

    // Turn 0: initial tutor suggestion (first suggestion of the first turn result).
    if (turnResults.length > 0) {
      const sug = turnResults[0].suggestions?.[0];
      reconstructedTurns.push({
        turnNumber: 0,
        phase: 'tutor',
        externalMessage: sug?.message || sug?.text || JSON.stringify(sug),
      });
    }

    // Subsequent turns: learner → tutor pairs.
    for (let lt = 0; lt < learnerTurns.length; lt++) {
      reconstructedTurns.push({
        turnNumber: lt + 1,
        phase: 'learner',
        externalMessage: learnerTurns[lt].externalMessage,
        internalDeliberation: learnerTurns[lt].internalDeliberation,
      });

      // Add corresponding tutor response (if exists).
      const tutorTurn = turnResults[lt + 1];
      if (tutorTurn) {
        const sug = tutorTurn.suggestions?.[0];
        reconstructedTurns.push({
          turnNumber: lt + 1,
          phase: 'tutor',
          externalMessage: sug?.message || sug?.text || JSON.stringify(sug),
        });
      }
    }

    // Get scenario info for the judge prompt header.
    const scenario = getScenario(result.scenarioId);
    const scenarioName = scenario?.name || result.scenarioId;

    // Use learnerContext from the dialogue log as persona description.
    const personaDescription = dialogueLog.learnerContext || 'No persona description available';

    const turnScores = {};
    let turnSucceeded = 0;

    // Score each learner turn with a separate judge call.
    for (let lt = 0; lt < learnerTurns.length; lt++) {
      // Find the learner turn's index in reconstructedTurns.
      // NOTE(review): matching by externalMessage text means identical learner
      // messages in one dialogue resolve to the first occurrence — confirm
      // this is acceptable for repeated-message dialogues.
      const targetIdx = reconstructedTurns.findIndex((t, idx) =>
        t.phase === 'learner' && t.externalMessage === learnerTurns[lt].externalMessage && idx > 0
      );

      if (targetIdx === -1) continue;

      const turnTag = `${tag} ${result.scenarioId} / ${profileName} learner-turn-${lt + 1}`;

      try {
        const prompt = buildLearnerEvaluationPrompt({
          turns: reconstructedTurns,
          targetTurnIndex: targetIdx,
          personaId: profileName,
          personaDescription,
          learnerArchitecture: isMultiAgent ? 'multi_agent' : 'unified',
          scenarioName,
          topic: result.scenarioId,
        });

        // `claude -p -` reads the prompt from stdin; plain-text output.
        const claudeArgs = ['-p', '-', '--output-format', 'text'];
        if (modelOverride) {
          claudeArgs.push('--model', modelOverride);
        }

        if (verbose) {
          console.log(`${turnTag} ... calling claude`);
        }

        const stdout = await new Promise((resolve, reject) => {
          const env = { ...process.env };
          // Strip the API key so the CLI uses its own auth (claude-code session).
          delete env.ANTHROPIC_API_KEY;
          const child = spawn('claude', claudeArgs, {
            stdio: ['pipe', 'pipe', 'pipe'],
            env,
          });
          let out = '';
          let err = '';
          child.stdout.on('data', d => { out += d; });
          child.stderr.on('data', d => { err += d; });
          child.on('error', reject);
          child.on('close', code => {
            if (code !== 0) reject(new Error(err || out || `claude exited with code ${code}`));
            else resolve(out);
          });
          child.stdin.write(prompt);
          child.stdin.end();
        });

        // Parse JSON response: prefer a fenced ```json block, otherwise take
        // the outermost { ... } span of the raw output.
        let jsonStr = stdout.trim();
        const fenceMatch = jsonStr.match(/```(?:json)?\s*([\s\S]*?)```/);
        if (fenceMatch) {
          jsonStr = fenceMatch[1].trim();
        } else {
          const firstBrace = jsonStr.indexOf('{');
          const lastBrace = jsonStr.lastIndexOf('}');
          if (firstBrace !== -1 && lastBrace > firstBrace) {
            jsonStr = jsonStr.slice(firstBrace, lastBrace + 1);
          }
        }

        const parsed = JSON.parse(jsonStr);
        const turnOverall = calculateLearnerOverallScore(parsed.scores || {}, isMultiAgent);

        turnScores[lt] = {
          turnIndex: lt + 1,
          scores: parsed.scores,
          overallScore: turnOverall,
          summary: parsed.summary,
        };

        // One-line per-dimension score readout, e.g. "curiosity=4 honesty=5".
        const dimScores = Object.entries(parsed.scores || {})
          .map(([k, v]) => `${k}=${typeof v === 'object' ? v.score : v}`)
          .join(' ');
        console.log(`${turnTag} ... ${turnOverall.toFixed(1)} (${dimScores})`);

        if (verbose && parsed.summary) {
          console.log(` Judge: ${parsed.summary}`);
        }

        turnSucceeded++;
      } catch (err) {
        // A failed turn (judge error or unparseable JSON) is logged and the
        // remaining turns of this dialogue are still attempted.
        const msg = err.stderr ? err.stderr.slice(0, 200) : err.message;
        console.log(`${turnTag} ... FAIL: ${msg}`);
        if (verbose) console.error(err);
      }
    }

    if (turnSucceeded > 0) {
      // Calculate dialogue-level learner score (average across scored turns).
      const turnOveralls = Object.values(turnScores).map(ts => ts.overallScore);
      const dialogueLearnerScore = turnOveralls.reduce((a, b) => a + b, 0) / turnOveralls.length;

      // Store in database on the evaluation_results row.
      evaluationStore.updateResultLearnerScores(result.id, {
        scores: turnScores,
        overallScore: dialogueLearnerScore,
        judgeModel: modelOverride ? `claude-code/${modelOverride}` : 'claude-code/opus',
      });

      allScores.push(dialogueLearnerScore);
      succeeded++;

      console.log(` → Dialogue learner score: ${dialogueLearnerScore.toFixed(1)} (${turnSucceeded} turns scored)`);
      console.log('');
    } else {
      // No turn of this dialogue could be scored — count the dialogue as failed.
      failed++;
    }
  }

  // Summary: counts plus mean and sample standard deviation of dialogue scores.
  console.log('\n' + '='.repeat(50));
  console.log(' EVALUATE-LEARNER SUMMARY');
  console.log('='.repeat(50));
  console.log(` Total dialogues: ${toEvaluate.length}`);
  console.log(` Succeeded: ${succeeded}`);
  console.log(` Failed: ${failed}`);
  if (allScores.length > 0) {
    const avg = allScores.reduce((a, b) => a + b, 0) / allScores.length;
    const sd = allScores.length > 1
      ? Math.sqrt(allScores.reduce((acc, s) => acc + (s - avg) ** 2, 0) / (allScores.length - 1))
      : 0;
    console.log(` Avg learner score: ${avg.toFixed(1)} (SD=${sd.toFixed(1)})`);
  }
  console.log('');
  break;
}
|
|
2611
|
+
|
|
2612
|
+
default:
  // Unknown subcommand: print the valid command list and exit non-zero.
  console.error(`Unknown command: ${command}`);
  console.error('Available commands: list, quick, test, run, runs, report, status, watch, transcript, export, cleanup, resume, revert, rejudge, evaluate, evaluate-learner, chat');
  process.exit(1);
|
|
2616
|
+
}
|
|
2617
|
+
} catch (error) {
|
|
2618
|
+
console.error(`\nError: ${error.message}`);
|
|
2619
|
+
if (getFlag('verbose')) {
|
|
2620
|
+
console.error(error.stack);
|
|
2621
|
+
}
|
|
2622
|
+
process.exit(1);
|
|
2623
|
+
}
|
|
2624
|
+
}
|
|
2625
|
+
|
|
2626
|
+
// CLI entry point. NOTE(review): main() appears to be async (its command
// handlers use await) and handles its own errors via the internal try/catch,
// so the returned promise is intentionally not awaited here.
main();
|