@machinespirits/eval 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/components/MobileEvalDashboard.tsx +267 -0
- package/components/comparison/DeltaAnalysisTable.tsx +137 -0
- package/components/comparison/ProfileComparisonCard.tsx +176 -0
- package/components/comparison/RecognitionABMode.tsx +385 -0
- package/components/comparison/RecognitionMetricsPanel.tsx +135 -0
- package/components/comparison/WinnerIndicator.tsx +64 -0
- package/components/comparison/index.ts +5 -0
- package/components/mobile/BottomSheet.tsx +233 -0
- package/components/mobile/DimensionBreakdown.tsx +210 -0
- package/components/mobile/DocsView.tsx +363 -0
- package/components/mobile/LogsView.tsx +481 -0
- package/components/mobile/PsychodynamicQuadrant.tsx +261 -0
- package/components/mobile/QuickTestView.tsx +1098 -0
- package/components/mobile/RecognitionTypeChart.tsx +124 -0
- package/components/mobile/RecognitionView.tsx +809 -0
- package/components/mobile/RunDetailView.tsx +261 -0
- package/components/mobile/RunHistoryView.tsx +367 -0
- package/components/mobile/ScoreRadial.tsx +211 -0
- package/components/mobile/StreamingLogPanel.tsx +230 -0
- package/components/mobile/SynthesisStrategyChart.tsx +140 -0
- package/config/interaction-eval-scenarios.yaml +832 -0
- package/config/learner-agents.yaml +248 -0
- package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +52 -0
- package/docs/research/ABLATION-MODEL-SELECTION.md +53 -0
- package/docs/research/ADVANCED-EVAL-ANALYSIS.md +60 -0
- package/docs/research/ANOVA-RESULTS-2026-01-14.md +257 -0
- package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +586 -0
- package/docs/research/COST-ANALYSIS.md +56 -0
- package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +340 -0
- package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +291 -0
- package/docs/research/EVAL-SYSTEM-ANALYSIS.md +306 -0
- package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +301 -0
- package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +1988 -0
- package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +282 -0
- package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +147 -0
- package/docs/research/PAPER-EXTENSION-DYADIC.md +204 -0
- package/docs/research/PAPER-UNIFIED.md +659 -0
- package/docs/research/PAPER-UNIFIED.pdf +0 -0
- package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +356 -0
- package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +419 -0
- package/docs/research/apa.csl +2133 -0
- package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +1637 -0
- package/docs/research/archive/paper-multiagent-tutor.tex +978 -0
- package/docs/research/paper-draft/full-paper.md +136 -0
- package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
- package/docs/research/paper-draft/references.bib +515 -0
- package/docs/research/transcript-baseline.md +139 -0
- package/docs/research/transcript-recognition-multiagent.md +187 -0
- package/hooks/useEvalData.ts +625 -0
- package/index.js +27 -0
- package/package.json +73 -0
- package/routes/evalRoutes.js +3002 -0
- package/scripts/advanced-eval-analysis.js +351 -0
- package/scripts/analyze-eval-costs.js +378 -0
- package/scripts/analyze-eval-results.js +513 -0
- package/scripts/analyze-interaction-evals.js +368 -0
- package/server-init.js +45 -0
- package/server.js +162 -0
- package/services/benchmarkService.js +1892 -0
- package/services/evaluationRunner.js +739 -0
- package/services/evaluationStore.js +1121 -0
- package/services/learnerConfigLoader.js +385 -0
- package/services/learnerTutorInteractionEngine.js +857 -0
- package/services/memory/learnerMemoryService.js +1227 -0
- package/services/memory/learnerWritingPad.js +577 -0
- package/services/memory/tutorWritingPad.js +674 -0
- package/services/promptRecommendationService.js +493 -0
- package/services/rubricEvaluator.js +826 -0
|
@@ -0,0 +1,1121 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Evaluation Store Service
|
|
3
|
+
*
|
|
4
|
+
* SQLite-based storage for AI tutor evaluation results.
|
|
5
|
+
* Supports querying, aggregation, comparison, and export.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import Database from 'better-sqlite3';
|
|
9
|
+
import path from 'path';
|
|
10
|
+
import { fileURLToPath } from 'url';
|
|
11
|
+
import { randomBytes } from 'crypto';
|
|
12
|
+
|
|
13
|
+
// ES modules have no built-in __filename/__dirname, so derive them from
// this module's URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
// The package root is the parent of this file's directory; the database
// lives in its `data` subdirectory.
const ROOT_DIR = path.resolve(__dirname, '..');
const DATA_DIR = path.join(ROOT_DIR, 'data');

// Initialize database
// The connection is opened as a side effect of importing this module.
// NOTE(review): nothing here creates DATA_DIR — presumably it already exists
// when the module is loaded; confirm against the package layout.
const dbPath = path.join(DATA_DIR, 'evaluations.db');
const db = new Database(dbPath);

// Enable WAL mode for better concurrent access
db.pragma('journal_mode = WAL');
|
|
24
|
+
|
|
25
|
+
// Create tables
// Runs at import time. Every statement uses IF NOT EXISTS, so executing it
// against an already-initialized database leaves existing tables and
// indexes untouched.
//   - evaluation_runs:    one row per batch of tests (a "run")
//   - evaluation_results: one row per individual test, linked via run_id
// Columns marked "JSON" hold JSON-encoded text.
db.exec(`
  -- Evaluation runs (batches of tests)
  CREATE TABLE IF NOT EXISTS evaluation_runs (
    id TEXT PRIMARY KEY,
    created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
    description TEXT,
    total_scenarios INTEGER DEFAULT 0,
    total_configurations INTEGER DEFAULT 0,
    total_tests INTEGER DEFAULT 0,
    status TEXT DEFAULT 'running',
    completed_at DATETIME,
    metadata TEXT  -- JSON
  );

  -- Individual evaluation results
  CREATE TABLE IF NOT EXISTS evaluation_results (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    run_id TEXT REFERENCES evaluation_runs(id),
    scenario_id TEXT NOT NULL,
    scenario_name TEXT,

    -- Configuration
    provider TEXT NOT NULL,
    model TEXT NOT NULL,
    profile_name TEXT,
    hyperparameters TEXT,  -- JSON
    prompt_id TEXT,

    -- Raw output
    suggestions TEXT,  -- JSON array
    raw_response TEXT,

    -- Performance metrics
    latency_ms INTEGER,
    input_tokens INTEGER,
    output_tokens INTEGER,
    cost REAL,  -- OpenRouter API cost in USD
    dialogue_rounds INTEGER,
    api_calls INTEGER,
    dialogue_id TEXT,  -- For linking to dialogue logs

    -- Rubric scores (1-5 scale)
    score_relevance REAL,
    score_specificity REAL,
    score_pedagogical REAL,
    score_personalization REAL,
    score_actionability REAL,
    score_tone REAL,
    overall_score REAL,

    -- Validation
    passes_required BOOLEAN,
    passes_forbidden BOOLEAN,
    required_missing TEXT,  -- JSON array
    forbidden_found TEXT,   -- JSON array

    -- Metadata
    created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
    evaluator_model TEXT,
    evaluation_reasoning TEXT,
    success BOOLEAN DEFAULT 1,
    error_message TEXT
  );

  -- Indexes for efficient querying
  CREATE INDEX IF NOT EXISTS idx_results_run ON evaluation_results(run_id);
  CREATE INDEX IF NOT EXISTS idx_results_scenario ON evaluation_results(scenario_id);
  CREATE INDEX IF NOT EXISTS idx_results_provider ON evaluation_results(provider, model);
  CREATE INDEX IF NOT EXISTS idx_results_created ON evaluation_results(created_at);
  CREATE INDEX IF NOT EXISTS idx_runs_created ON evaluation_runs(created_at);
`);
|
|
97
|
+
|
|
98
|
+
// Migrations: add columns that were introduced after the first schema
// version (dialogue_id, scores_with_reasoning, cost), so databases created
// by an older version of this module gain them on load.
//
// SQLite has no "ADD COLUMN IF NOT EXISTS", so each ALTER TABLE is attempted
// and only the expected "column is already there" failure is ignored. Any
// other error (locked or read-only database, missing table, ...) is rethrown
// instead of being swallowed, because continuing with a partially migrated
// schema would only fail later inside the INSERT statements.
for (const [column, type] of [
  ['dialogue_id', 'TEXT'],
  ['scores_with_reasoning', 'TEXT'],
  ['cost', 'REAL'],
]) {
  try {
    db.exec(`ALTER TABLE evaluation_results ADD COLUMN ${column} ${type}`);
  } catch (e) {
    // SQLite reports an existing column as "duplicate column name: <column>".
    if (!/duplicate column name/i.test(String(e?.message))) {
      throw e;
    }
  }
}

// Index the dialogue link once the column is guaranteed to exist.
db.exec(`CREATE INDEX IF NOT EXISTS idx_results_dialogue ON evaluation_results(dialogue_id)`);
|
|
119
|
+
|
|
120
|
+
// Migration: Revert any accidental renames (batch→matrix, interact→interaction)
// Rewrites the serialized "runType" value inside evaluation_runs.metadata
// using plain text replacement on the stored JSON string. The WHERE clause
// limits the update to rows that still contain one of the old values.
try {
  const revertRuns = db.prepare(`
    UPDATE evaluation_runs
    SET metadata = REPLACE(REPLACE(metadata, '"runType":"batch"', '"runType":"matrix"'), '"runType":"interact"', '"runType":"interaction"')
    WHERE metadata LIKE '%"runType":"batch"%' OR metadata LIKE '%"runType":"interact"%'
  `);
  revertRuns.run();
} catch (e) {
  // Ignore errors
  // Best-effort clean-up: a failure here leaves the metadata as it was and
  // does not stop the module from loading.
}
|
|
131
|
+
|
|
132
|
+
// Create interaction evaluation tables
// Runs at import time. Stores one row per learner-tutor dialogue evaluation,
// linked to evaluation_runs through run_id. Columns marked "JSON" hold
// JSON-encoded text. IF NOT EXISTS keeps this safe to execute on every load.
db.exec(`
  -- Interaction evaluation results (learner-tutor dialogues)
  CREATE TABLE IF NOT EXISTS interaction_evaluations (
    id TEXT PRIMARY KEY,
    run_id TEXT REFERENCES evaluation_runs(id),
    scenario_id TEXT NOT NULL,
    scenario_name TEXT,
    eval_type TEXT DEFAULT 'short_term',

    -- Configuration
    learner_profile TEXT,
    tutor_profile TEXT,
    persona_id TEXT,
    learner_agents TEXT,  -- JSON array of agent roles

    -- Interaction data
    turn_count INTEGER,
    turns TEXT,  -- JSON array of turn objects
    sequence_diagram TEXT,
    formatted_transcript TEXT,

    -- Memory snapshots
    learner_memory_before TEXT,  -- JSON
    learner_memory_after TEXT,   -- JSON
    tutor_memory_before TEXT,    -- JSON
    tutor_memory_after TEXT,     -- JSON

    -- Metrics
    total_tokens INTEGER,
    learner_tokens INTEGER,
    tutor_tokens INTEGER,
    latency_ms INTEGER,

    -- Outcomes
    final_learner_state TEXT,
    final_understanding TEXT,
    unique_outcomes TEXT,  -- JSON array

    -- Judge evaluation
    judge_overall_score REAL,
    judge_evaluation TEXT,  -- JSON

    -- Timestamps
    created_at DATETIME DEFAULT CURRENT_TIMESTAMP
  );

  CREATE INDEX IF NOT EXISTS idx_interaction_run ON interaction_evaluations(run_id);
  CREATE INDEX IF NOT EXISTS idx_interaction_scenario ON interaction_evaluations(scenario_id);
  CREATE INDEX IF NOT EXISTS idx_interaction_created ON interaction_evaluations(created_at);
`);
|
|
183
|
+
|
|
184
|
+
/**
 * Build a fresh identifier for an evaluation run.
 *
 * The id has the shape `eval-YYYY-MM-DD-xxxxxxxx`: the current UTC date
 * followed by eight random hexadecimal characters.
 *
 * @returns {string} The generated run id
 */
function generateRunId() {
  const [datePart] = new Date().toISOString().split('T');
  const randomHex = randomBytes(4).toString('hex');
  return ['eval', datePart, randomHex].join('-');
}
|
|
192
|
+
|
|
193
|
+
/**
 * Register a new evaluation run and return its in-memory description.
 *
 * A row is inserted into `evaluation_runs` with a generated id; the
 * metadata object is stored as JSON text. The remaining columns keep their
 * table defaults.
 *
 * @param {Object} [options] - Run options
 * @param {?string} [options.description=null] - Free-text description
 * @param {number} [options.totalScenarios=0] - Number of scenarios planned
 * @param {number} [options.totalConfigurations=0] - Number of configurations planned
 * @param {Object} [options.metadata={}] - Arbitrary metadata, serialized to JSON
 * @returns {Object} The created run: id, description, totals, status
 *   ('running') and a createdAt ISO timestamp taken from the local clock
 */
export function createRun(options = {}) {
  // Mirror destructuring defaults: only `undefined` falls back.
  const orDefault = (value, fallback) => (value === undefined ? fallback : value);

  const description = orDefault(options.description, null);
  const totalScenarios = orDefault(options.totalScenarios, 0);
  const totalConfigurations = orDefault(options.totalConfigurations, 0);
  const metadata = orDefault(options.metadata, {});

  const id = generateRunId();

  const insertRun = db.prepare(`
    INSERT INTO evaluation_runs (id, description, total_scenarios, total_configurations, metadata)
    VALUES (?, ?, ?, ?, ?)
  `);
  insertRun.run(id, description, totalScenarios, totalConfigurations, JSON.stringify(metadata));

  const createdAt = new Date().toISOString();
  return {
    id,
    description,
    totalScenarios,
    totalConfigurations,
    status: 'running',
    createdAt,
  };
}
|
|
225
|
+
|
|
226
|
+
/**
 * Update a run's status and, optionally, its test count and completion time.
 *
 * Only the fields present in `updates` are written. For the 'completed'
 * status the test count and completion time are always written, defaulting
 * to 0 and the current time. For any other status they are written only
 * when the caller supplies them, so a run marked 'failed' can still record
 * when it ended. Calling with no recognized fields is a no-op.
 *
 * @param {string} runId - Id of the run to update
 * @param {Object} [updates] - Fields to change
 * @param {string} [updates.status] - New status value
 * @param {number} [updates.totalTests] - Number of tests recorded for the run
 * @param {string} [updates.completedAt] - Completion timestamp
 * @returns {void}
 */
export function updateRun(runId, updates = {}) {
  const { status, totalTests, completedAt } = updates;
  const isCompleted = status === 'completed';

  const assignments = [];
  const params = [];

  if (status !== undefined) {
    assignments.push('status = ?');
    params.push(status);
  }
  if (isCompleted || totalTests != null) {
    assignments.push('total_tests = ?');
    params.push(totalTests || 0);
  }
  if (isCompleted || completedAt != null) {
    assignments.push('completed_at = ?');
    params.push(completedAt || new Date().toISOString());
  }

  // Nothing to change: avoid issuing an UPDATE with an empty SET clause.
  if (assignments.length === 0) {
    return;
  }

  const stmt = db.prepare(`UPDATE evaluation_runs SET ${assignments.join(', ')} WHERE id = ?`);
  stmt.run(...params, runId);
}
|
|
246
|
+
|
|
247
|
+
/**
 * Store an individual evaluation result.
 *
 * Object- and array-valued fields (hyperparameters, suggestions,
 * requiredMissing, forbiddenFound, scoresWithReasoning) are serialized to
 * JSON text, and boolean flags are stored as 1/0.
 *
 * Optional fields that are absent on `result` are stored as NULL.
 * better-sqlite3 only binds numbers, strings, bigints, buffers and null, so
 * every `undefined` value is normalized to null before binding instead of
 * making the insert throw.
 *
 * @param {string} runId - The run ID
 * @param {Object} result - The evaluation result (camelCase fields such as
 *   scenarioId, provider, model, scores, overallScore, success, ...)
 * @returns {number} Inserted row ID
 */
export function storeResult(runId, result) {
  const stmt = db.prepare(`
    INSERT INTO evaluation_results (
      run_id, scenario_id, scenario_name,
      provider, model, profile_name, hyperparameters, prompt_id,
      suggestions, raw_response,
      latency_ms, input_tokens, output_tokens, cost, dialogue_rounds, api_calls, dialogue_id,
      score_relevance, score_specificity, score_pedagogical,
      score_personalization, score_actionability, score_tone, overall_score,
      passes_required, passes_forbidden, required_missing, forbidden_found,
      evaluator_model, evaluation_reasoning, scores_with_reasoning, success, error_message
    ) VALUES (
      ?, ?, ?,
      ?, ?, ?, ?, ?,
      ?, ?,
      ?, ?, ?, ?, ?, ?, ?,
      ?, ?, ?,
      ?, ?, ?, ?,
      ?, ?, ?, ?,
      ?, ?, ?, ?, ?
    )
  `);

  // A result without scores (e.g. a failed test) stores NULL for every dimension.
  const scores = result.scores ?? {};

  // Order must match the column list above.
  const values = [
    runId,
    result.scenarioId,
    result.scenarioName,
    result.provider,
    result.model,
    result.profileName,
    JSON.stringify(result.hyperparameters || {}),
    result.promptId,
    JSON.stringify(result.suggestions || []),
    result.rawResponse,
    result.latencyMs,
    result.inputTokens,
    result.outputTokens,
    result.cost,
    result.dialogueRounds,
    result.apiCalls,
    result.dialogueId,
    scores.relevance,
    scores.specificity,
    scores.pedagogical,
    scores.personalization,
    scores.actionability,
    scores.tone,
    result.overallScore,
    result.passesRequired ? 1 : 0,
    result.passesForbidden ? 1 : 0,
    JSON.stringify(result.requiredMissing || []),
    JSON.stringify(result.forbiddenFound || []),
    result.evaluatorModel,
    result.evaluationReasoning,
    result.scoresWithReasoning ? JSON.stringify(result.scoresWithReasoning) : null,
    result.success ? 1 : 0,
    result.errorMessage,
  ];

  const info = stmt.run(...values.map((value) => value ?? null));

  return info.lastInsertRowid;
}
|
|
315
|
+
|
|
316
|
+
/**
 * Look up a single evaluation run.
 *
 * @param {string} runId - Id of the run to fetch
 * @returns {?Object} The run with camelCase fields and its metadata parsed
 *   from JSON (an empty object when none is stored), or null when no run
 *   has that id
 */
export function getRun(runId) {
  const record = db.prepare('SELECT * FROM evaluation_runs WHERE id = ?').get(runId);
  if (!record) {
    return null;
  }

  const {
    id,
    created_at: createdAt,
    description,
    total_scenarios: totalScenarios,
    total_configurations: totalConfigurations,
    total_tests: totalTests,
    status,
    completed_at: completedAt,
    metadata,
  } = record;

  return {
    id,
    createdAt,
    description,
    totalScenarios,
    totalConfigurations,
    totalTests,
    status,
    completedAt,
    metadata: JSON.parse(metadata || '{}'),
  };
}
|
|
336
|
+
|
|
337
|
+
/**
 * List evaluation runs, newest first, each with the distinct scenario names
 * found among its stored results.
 *
 * @param {Object} [options] - Query options
 * @param {number} [options.limit=20] - Maximum number of runs to return
 * @param {?string} [options.status=null] - When set, only runs with this status
 * @returns {Array<Object>} Runs with camelCase fields, `scenarioNames`
 *   (sorted by name) and `metadata` parsed from JSON
 */
export function listRuns(options = {}) {
  const { limit = 20, status = null } = options;

  // The status filter is optional; the limit is always the last binding.
  const statusClause = status ? ' WHERE status = ?' : '';
  const bindings = status ? [status, limit] : [limit];
  const runRows = db
    .prepare(`SELECT * FROM evaluation_runs${statusClause} ORDER BY created_at DESC LIMIT ?`)
    .all(...bindings);

  // Prepared once and executed per run to collect its scenario names.
  const namesForRun = db.prepare(`
    SELECT DISTINCT scenario_name FROM evaluation_results
    WHERE run_id = ? AND scenario_name IS NOT NULL
    ORDER BY scenario_name
  `);

  const listed = [];
  for (const runRow of runRows) {
    const scenarioNames = namesForRun
      .all(runRow.id)
      .map((entry) => entry.scenario_name)
      .filter(Boolean);

    listed.push({
      id: runRow.id,
      createdAt: runRow.created_at,
      description: runRow.description,
      totalScenarios: runRow.total_scenarios,
      totalConfigurations: runRow.total_configurations,
      totalTests: runRow.total_tests,
      status: runRow.status,
      completedAt: runRow.completed_at,
      scenarioNames,
      metadata: JSON.parse(runRow.metadata || '{}'),
    });
  }
  return listed;
}
|
|
382
|
+
|
|
383
|
+
/**
 * Fetch the stored results of a run, oldest first.
 *
 * @param {string} runId - Id of the run
 * @param {Object} [options] - Optional equality filters
 * @param {?string} [options.scenarioId=null] - Restrict to one scenario
 * @param {?string} [options.provider=null] - Restrict to one provider
 * @param {?string} [options.model=null] - Restrict to one model
 * @returns {Array<Object>} Result rows, each converted by `parseResultRow`
 */
export function getResults(runId, options = {}) {
  const { scenarioId = null, provider = null, model = null } = options;

  // Keep only the filters that were actually supplied, in a fixed column order.
  const activeFilters = [
    ['scenario_id', scenarioId],
    ['provider', provider],
    ['model', model],
  ].filter(([, value]) => value);

  const sql = [
    'SELECT * FROM evaluation_results WHERE run_id = ?',
    ...activeFilters.map(([column]) => ` AND ${column} = ?`),
    ' ORDER BY created_at',
  ].join('');
  const bindings = [runId, ...activeFilters.map(([, value]) => value)];

  return db.prepare(sql).all(...bindings).map(parseResultRow);
}
|
|
414
|
+
|
|
415
|
+
/**
 * Summarize a run per (provider, model) pair, best average score first.
 *
 * @param {string} runId - Id of the run
 * @returns {Array<Object>} One entry per provider/model with test counts,
 *   success rate, average overall and per-dimension scores, average latency,
 *   token totals, validation pass counts and a combined validation pass
 *   rate (required and forbidden checks weighted equally)
 */
export function getRunStats(runId) {
  const aggregates = db.prepare(`
    SELECT
      provider,
      model,
      COUNT(*) as total_tests,
      SUM(CASE WHEN success = 1 THEN 1 ELSE 0 END) as successful_tests,
      AVG(overall_score) as avg_score,
      AVG(score_relevance) as avg_relevance,
      AVG(score_specificity) as avg_specificity,
      AVG(score_pedagogical) as avg_pedagogical,
      AVG(score_personalization) as avg_personalization,
      AVG(score_actionability) as avg_actionability,
      AVG(score_tone) as avg_tone,
      AVG(latency_ms) as avg_latency,
      SUM(input_tokens) as total_input_tokens,
      SUM(output_tokens) as total_output_tokens,
      SUM(CASE WHEN passes_required = 1 THEN 1 ELSE 0 END) as passes_required,
      SUM(CASE WHEN passes_forbidden = 1 THEN 1 ELSE 0 END) as passes_forbidden
    FROM evaluation_results
    WHERE run_id = ?
    GROUP BY provider, model
    ORDER BY avg_score DESC
  `).all(runId);

  const summarize = (agg) => {
    // Guard the two ratios against an empty group.
    const hasTests = agg.total_tests > 0;
    const dimensions = {
      relevance: agg.avg_relevance,
      specificity: agg.avg_specificity,
      pedagogical: agg.avg_pedagogical,
      personalization: agg.avg_personalization,
      actionability: agg.avg_actionability,
      tone: agg.avg_tone,
    };

    return {
      provider: agg.provider,
      model: agg.model,
      totalTests: agg.total_tests,
      successfulTests: agg.successful_tests,
      successRate: hasTests ? agg.successful_tests / agg.total_tests : 0,
      avgScore: agg.avg_score,
      dimensions,
      avgLatencyMs: agg.avg_latency,
      totalInputTokens: agg.total_input_tokens,
      totalOutputTokens: agg.total_output_tokens,
      passesRequired: agg.passes_required,
      passesForbidden: agg.passes_forbidden,
      validationPassRate: hasTests
        ? (agg.passes_required + agg.passes_forbidden) / (agg.total_tests * 2)
        : 0,
    };
  };

  return aggregates.map((agg) => summarize(agg));
}
|
|
470
|
+
|
|
471
|
+
/**
 * Summarize a run per scenario, listing every provider/model configuration
 * that was evaluated on it.
 *
 * @param {string} runId - Id of the run
 * @returns {Array<Object>} One entry per scenario with `scenarioId`,
 *   `scenarioName` and `configurations`; each configuration carries its
 *   average score and latency, the number of results (`runs`) and
 *   `passesValidation`, which is true only when every result passed both
 *   the required and the forbidden check
 */
export function getScenarioStats(runId) {
  const aggregates = db.prepare(`
    SELECT
      scenario_id,
      scenario_name,
      provider,
      model,
      AVG(overall_score) as avg_score,
      AVG(latency_ms) as avg_latency,
      SUM(CASE WHEN passes_required = 1 AND passes_forbidden = 1 THEN 1 ELSE 0 END) as passes_validation,
      COUNT(*) as runs
    FROM evaluation_results
    WHERE run_id = ?
    GROUP BY scenario_id, provider, model
    ORDER BY scenario_id, avg_score DESC
  `).all(runId);

  // Fold the flat (scenario, provider, model) rows into one bucket per scenario.
  const byScenario = aggregates.reduce((buckets, agg) => {
    const key = agg.scenario_id;
    if (!buckets[key]) {
      buckets[key] = {
        scenarioId: key,
        scenarioName: agg.scenario_name,
        configurations: [],
      };
    }

    const allPassed = agg.passes_validation === agg.runs;
    buckets[key].configurations.push({
      provider: agg.provider,
      model: agg.model,
      avgScore: agg.avg_score,
      avgLatencyMs: agg.avg_latency,
      passesValidation: allPassed,
      runs: agg.runs,
    });
    return buckets;
  }, {});

  return Object.values(byScenario);
}
|
|
515
|
+
|
|
516
|
+
/**
 * Compare two configurations across all scenarios of a run.
 *
 * Each configuration is identified by its provider and model. Results are
 * averaged per scenario and the two averages are compared scenario by
 * scenario.
 *
 * Notes on edge cases:
 *  - An average of exactly 0 is reported as 0 (not as null).
 *  - A scenario with no results for one configuration reports that score as
 *    null, uses 0 for it in `difference`, and counts as a 'tie'.
 *
 * @param {string} runId - Id of the run
 * @param {{provider: string, model: string}} config1 - First configuration
 * @param {{provider: string, model: string}} config2 - Second configuration
 * @returns {{comparison: Array<Object>, overall: Object}} Per-scenario
 *   comparison rows (scenarioId, config1Score, config2Score, difference,
 *   winner) and overall win/tie counts plus the mean of each configuration's
 *   per-scenario averages
 */
export function compareConfigs(runId, config1, config2) {
  // Same query for both sides, so prepare it once.
  const perScenarioStmt = db.prepare(`
    SELECT
      scenario_id,
      AVG(overall_score) as avg_score,
      AVG(score_relevance) as relevance,
      AVG(score_specificity) as specificity,
      AVG(score_pedagogical) as pedagogical,
      AVG(score_personalization) as personalization,
      AVG(score_actionability) as actionability,
      AVG(score_tone) as tone,
      AVG(latency_ms) as latency,
      SUM(CASE WHEN passes_required = 1 AND passes_forbidden = 1 THEN 1 ELSE 0 END) * 1.0 / COUNT(*) as pass_rate
    FROM evaluation_results
    WHERE run_id = ? AND provider = ? AND model = ?
    GROUP BY scenario_id
  `);

  const results1 = perScenarioStmt.all(runId, config1.provider, config1.model);
  const results2 = perScenarioStmt.all(runId, config2.provider, config2.model);

  // Index by scenario so the loop below does not rescan both arrays.
  const byScenario1 = new Map(results1.map((r) => [r.scenario_id, r]));
  const byScenario2 = new Map(results2.map((r) => [r.scenario_id, r]));
  const scenarios = new Set([...byScenario1.keys(), ...byScenario2.keys()]);

  // Build comparison
  const comparison = [];
  for (const scenarioId of scenarios) {
    // Raw averages: undefined when the configuration has no row for this
    // scenario, null when it has rows but no scores.
    const raw1 = byScenario1.get(scenarioId)?.avg_score;
    const raw2 = byScenario2.get(scenarioId)?.avg_score;

    let winner = 'tie';
    if (raw1 > raw2) {
      winner = 'config1';
    } else if (raw2 > raw1) {
      winner = 'config2';
    }

    comparison.push({
      scenarioId,
      // `??` keeps a genuine 0 average; only missing values become null.
      config1Score: raw1 ?? null,
      config2Score: raw2 ?? null,
      difference: (raw1 ?? 0) - (raw2 ?? 0),
      winner,
    });
  }

  // Overall stats
  const meanScore = (rows) => rows.reduce((sum, r) => sum + r.avg_score, 0) / (rows.length || 1);
  const countWinner = (label) => comparison.filter((c) => c.winner === label).length;

  const overall = {
    config1Wins: countWinner('config1'),
    config2Wins: countWinner('config2'),
    ties: countWinner('tie'),
    config1AvgScore: meanScore(results1),
    config2AvgScore: meanScore(results2),
  };

  return { comparison, overall };
}
|
|
573
|
+
|
|
574
|
+
/**
 * Gather everything stored about a run into one plain object suitable for
 * JSON serialization.
 *
 * @param {string} runId - Id of the run to export
 * @returns {Object} `run`, `stats`, `scenarioStats`, `results` and an
 *   `exportedAt` ISO timestamp
 */
export function exportToJson(runId) {
  const bundle = { run: getRun(runId) };
  const results = getResults(runId);

  bundle.stats = getRunStats(runId);
  bundle.scenarioStats = getScenarioStats(runId);
  bundle.results = results;
  bundle.exportedAt = new Date().toISOString();

  return bundle;
}
|
|
591
|
+
|
|
592
|
+
/**
 * Export a run's results as CSV text.
 *
 * The first line is the header; each following line is one result. Missing
 * values are written as empty fields and boolean flags as 1/0. Fields that
 * contain a comma, a double quote or a line break are wrapped in double
 * quotes with embedded quotes doubled, so free-text values such as scenario
 * names cannot shift columns or split rows.
 *
 * @param {string} runId - Id of the run to export
 * @returns {string} CSV document with "\n" line endings and no trailing newline
 */
export function exportToCsv(runId) {
  const results = getResults(runId);

  const headers = [
    'scenario_id', 'scenario_name', 'provider', 'model',
    'overall_score', 'relevance', 'specificity', 'pedagogical',
    'personalization', 'actionability', 'tone',
    'latency_ms', 'input_tokens', 'output_tokens',
    'passes_required', 'passes_forbidden', 'success',
  ];

  // Render one value as a CSV field, quoting only when necessary.
  const toField = (value) => {
    if (value == null) {
      return '';
    }
    const text = String(value);
    return /[",\r\n]/.test(text) ? `"${text.replace(/"/g, '""')}"` : text;
  };

  const rows = results.map((r) => [
    r.scenarioId,
    r.scenarioName,
    r.provider,
    r.model,
    r.overallScore,
    r.scores?.relevance,
    r.scores?.specificity,
    r.scores?.pedagogical,
    r.scores?.personalization,
    r.scores?.actionability,
    r.scores?.tone,
    r.latencyMs,
    r.inputTokens,
    r.outputTokens,
    r.passesRequired ? 1 : 0,
    r.passesForbidden ? 1 : 0,
    r.success ? 1 : 0,
  ]);

  return [headers, ...rows]
    .map((row) => row.map(toField).join(','))
    .join('\n');
}
|
|
628
|
+
|
|
629
|
+
/**
 * Complete an incomplete evaluation run.
 *
 * Marks a stuck/interrupted run as completed with whatever results exist,
 * or as failed when it has no results at all, and returns a summary.
 *
 * The completion time is the timestamp of the newest stored result.
 * Timestamps in SQLite's `YYYY-MM-DD HH:MM:SS` form (what CURRENT_TIMESTAMP
 * produces, in UTC) are parsed as UTC; parsing them with `new Date(...)`
 * directly would treat them as local time and shift the result by the
 * host's UTC offset. If no result has a usable timestamp, the current time
 * is used.
 * NOTE(review): assumes `createdAt` on each result is the row's created_at
 * value as stored — confirm against parseResultRow.
 *
 * @param {string} runId - The run ID to complete
 * @returns {Object} Completion summary
 * @throws {Error} When no run with that id exists
 */
export function completeRun(runId) {
  const run = getRun(runId);
  if (!run) {
    throw new Error(`Run not found: ${runId}`);
  }

  if (run.status === 'completed') {
    return {
      alreadyCompleted: true,
      runId,
      message: 'Run was already marked as completed',
    };
  }

  // Get all results for this run
  const results = getResults(runId);
  const expectedTests = run.totalScenarios * run.totalConfigurations;

  if (results.length === 0) {
    // No results at all - mark as failed
    updateRun(runId, {
      status: 'failed',
      totalTests: 0,
      completedAt: new Date().toISOString(),
    });

    return {
      runId,
      status: 'failed',
      message: 'No results found - marked as failed',
      resultsFound: 0,
      expectedTests,
    };
  }

  // Convert a stored timestamp to epoch milliseconds, reading SQLite's
  // zone-less format as UTC. Returns NaN for unparseable input.
  const toEpochMs = (timestamp) => {
    const isSqliteUtc = typeof timestamp === 'string'
      && /^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}(\.\d+)?$/.test(timestamp);
    return new Date(isSqliteUtc ? `${timestamp.replace(' ', 'T')}Z` : timestamp).getTime();
  };

  // Find the last result timestamp (NaN never compares greater, so it is skipped)
  const lastResultTime = results.reduce((latest, r) => {
    const time = toEpochMs(r.createdAt);
    return time > latest ? time : latest;
  }, 0);

  const completedAt = new Date(lastResultTime > 0 ? lastResultTime : Date.now()).toISOString();

  // Update run as completed with partial results
  updateRun(runId, {
    status: 'completed',
    totalTests: results.length,
    completedAt,
  });

  // Calculate completion percentage
  const completionRate = expectedTests > 0 ? (results.length / expectedTests) * 100 : 0;

  // Get profile breakdown: number of results per profile name
  const profileBreakdown = {};
  for (const result of results) {
    const profile = result.profileName || 'unknown';
    profileBreakdown[profile] = (profileBreakdown[profile] || 0) + 1;
  }

  return {
    runId,
    status: 'completed',
    message: 'Run marked as completed with partial results',
    resultsFound: results.length,
    expectedTests,
    completionRate: Math.round(completionRate),
    completedAt,
    profileBreakdown,
    wasPartial: results.length < expectedTests,
  };
}
|
|
713
|
+
|
|
714
|
+
/**
 * Find all incomplete (stuck) evaluation runs.
 *
 * A run is reported when its status is still 'running' and it was created
 * more than `olderThanMinutes` ago.
 *
 * Both sides of the age comparison go through SQLite's `datetime()` so they
 * share one format. Comparing the stored `YYYY-MM-DD HH:MM:SS` text directly
 * with an ISO `YYYY-MM-DDTHH:MM:SS.sssZ` string is a plain string
 * comparison in which the space sorts before 'T', so every run created on
 * the cutoff's calendar day would look older than the cutoff.
 *
 * @param {Object} [options] - Query options
 * @param {number} [options.olderThanMinutes=30] - Minimum age of a run, in minutes
 * @returns {Array} List of incomplete runs, newest first, each with its
 *   expected test count, number of stored results, age in minutes and
 *   parsed metadata
 */
export function findIncompleteRuns(options = {}) {
  const { olderThanMinutes = 30 } = options;

  const now = Date.now();
  const cutoffTime = new Date(now - olderThanMinutes * 60 * 1000).toISOString();

  const stmt = db.prepare(`
    SELECT * FROM evaluation_runs
    WHERE status = 'running'
      AND datetime(created_at) < datetime(?)
    ORDER BY created_at DESC
  `);

  const rows = stmt.all(cutoffTime);

  // Prepared once and reused for every run instead of once per row.
  const resultsStmt = db.prepare('SELECT COUNT(*) as count FROM evaluation_results WHERE run_id = ?');

  // SQLite's CURRENT_TIMESTAMP text carries no zone but is UTC; mark it as
  // such so the age does not drift by the host's UTC offset.
  const toEpochMs = (timestamp) => {
    const isSqliteUtc = typeof timestamp === 'string'
      && /^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}(\.\d+)?$/.test(timestamp);
    return new Date(isSqliteUtc ? `${timestamp.replace(' ', 'T')}Z` : timestamp).getTime();
  };

  return rows.map((row) => {
    const resultsCount = resultsStmt.get(row.id).count;

    return {
      id: row.id,
      createdAt: row.created_at,
      description: row.description,
      totalScenarios: row.total_scenarios,
      totalConfigurations: row.total_configurations,
      expectedTests: row.total_scenarios * row.total_configurations,
      resultsFound: resultsCount,
      ageMinutes: Math.round((now - toEpochMs(row.created_at)) / 60000),
      metadata: JSON.parse(row.metadata || '{}'),
    };
  });
}
|
|
751
|
+
|
|
752
|
+
/**
 * Auto-complete all stale runs.
 *
 * Looks up every run stuck in the "running" state for longer than the
 * threshold and marks each one completed. A failure on one run is recorded
 * in the output and does not stop the remaining runs from being processed.
 *
 * @param {Object} [options] - Options
 * @param {number} [options.olderThanMinutes=30] - Age threshold in minutes
 * @param {boolean} [options.dryRun=false] - When true, only report the stale
 *   runs without modifying anything
 * @returns {Object} For a dry run: `{ dryRun: true, found, runs }` where `runs`
 *   are the stale run summaries. Otherwise `{ found, completed, failed, runs }`
 *   where `runs` holds one entry per processed run (a completion result, or
 *   `{ runId, status: 'error', error }` when completion threw).
 */
export function autoCompleteStaleRuns(options = {}) {
  const { olderThanMinutes = 30, dryRun = false } = options;

  const incompleteRuns = findIncompleteRuns({ olderThanMinutes });

  if (dryRun) {
    return {
      dryRun: true,
      found: incompleteRuns.length,
      runs: incompleteRuns,
    };
  }

  const outcomes = [];
  let failed = 0;

  for (const run of incompleteRuns) {
    try {
      outcomes.push(completeRun(run.id));
    } catch (error) {
      failed += 1;
      outcomes.push({
        runId: run.id,
        status: 'error',
        error: error.message,
      });
    }
  }

  return {
    found: incompleteRuns.length,
    // Count only runs that were actually completed; errored runs are
    // reported separately instead of inflating this number.
    completed: outcomes.length - failed,
    failed,
    runs: outcomes,
  };
}
|
|
793
|
+
|
|
794
|
+
/**
 * Remove a run together with every result row that belongs to it.
 *
 * Both deletes execute inside a single transaction: result rows are removed
 * first, then the run row itself.
 *
 * @param {string} runId - ID of the run to delete
 */
export function deleteRun(runId) {
  const removeResultsStmt = db.prepare('DELETE FROM evaluation_results WHERE run_id = ?');
  const removeRunStmt = db.prepare('DELETE FROM evaluation_runs WHERE id = ?');

  const removeRunWithResults = db.transaction(() => {
    removeResultsStmt.run(runId);
    removeRunStmt.run(runId);
  });

  removeRunWithResults();
}
|
|
808
|
+
|
|
809
|
+
/**
 * Get incomplete tests for a run to enable resumption.
 *
 * Given a run ID and the expected test matrix (profiles x scenarios),
 * reports which (profile, scenario) pairs have no stored result yet.
 * Any stored result for a pair counts as completed.
 *
 * @param {string} runId - The run ID
 * @param {Array} profiles - Array of profile names
 * @param {Array} scenarios - Array of scenario objects with { id, name }
 * @returns {Object} `{ runId, totalExpected, completed, remaining, progress,
 *   remainingTests, status, canResume }`; `progress` is a rounded percentage
 *   and `canResume` is true only while the run is still 'running' and at
 *   least one test remains
 * @throws {Error} When no run exists for `runId`
 */
export function getIncompleteTests(runId, profiles, scenarios) {
  const run = getRun(runId);
  if (!run) {
    throw new Error(`Run not found: ${runId}`);
  }

  // Encode the pair as a JSON array so that a ':' (or any other character)
  // inside a profile name or scenario id cannot make two different pairs
  // collide. Both parts are stringified first, as the previous key did.
  const pairKey = (profileName, scenarioId) => JSON.stringify([`${profileName}`, `${scenarioId}`]);

  const completedKeys = new Set(
    getResults(runId).map((result) => pairKey(result.profileName, result.scenarioId))
  );

  const remainingTests = [];
  let expectedCount = 0;

  for (const profile of profiles) {
    for (const scenario of scenarios) {
      expectedCount += 1;
      if (!completedKeys.has(pairKey(profile, scenario.id))) {
        remainingTests.push({
          profile,
          scenarioId: scenario.id,
          scenarioName: scenario.name,
        });
      }
    }
  }

  const completedCount = expectedCount - remainingTests.length;
  const progress = expectedCount > 0 ? (completedCount / expectedCount) * 100 : 0;

  return {
    runId,
    totalExpected: expectedCount,
    completed: completedCount,
    remaining: remainingTests.length,
    progress: Math.round(progress),
    remainingTests,
    status: run.status,
    canResume: remainingTests.length > 0 && run.status === 'running',
  };
}
|
|
872
|
+
|
|
873
|
+
/**
 * Convert a raw `evaluation_results` database row into a result object.
 *
 * JSON-encoded columns are decoded leniently: an empty or malformed value
 * yields that column's fallback instead of throwing, so a single corrupt
 * row cannot break reading the rest of a run's results.
 *
 * @param {Object} row - Row as returned by the database driver (snake_case columns)
 * @returns {Object} Result with camelCase fields, decoded JSON columns and
 *   boolean flags
 */
function parseResultRow(row) {
  // Decode a JSON text column, returning `fallback` when the column is
  // empty or does not contain valid JSON.
  const parseJson = (text, fallback) => {
    if (!text) return fallback;
    try {
      return JSON.parse(text);
    } catch {
      return fallback;
    }
  };

  // Prefer the detailed per-dimension scores; when they are absent or
  // unreadable, rebuild the scores object from the numeric score columns.
  const scores = parseJson(row.scores_with_reasoning, null) || {
    relevance: row.score_relevance,
    specificity: row.score_specificity,
    pedagogical: row.score_pedagogical,
    personalization: row.score_personalization,
    actionability: row.score_actionability,
    tone: row.score_tone,
  };

  return {
    id: row.id,
    runId: row.run_id,
    scenarioId: row.scenario_id,
    scenarioName: row.scenario_name,
    provider: row.provider,
    model: row.model,
    profileName: row.profile_name,
    hyperparameters: parseJson(row.hyperparameters, {}),
    promptId: row.prompt_id,
    suggestions: parseJson(row.suggestions, []),
    latencyMs: row.latency_ms,
    inputTokens: row.input_tokens,
    outputTokens: row.output_tokens,
    cost: row.cost,
    dialogueRounds: row.dialogue_rounds,
    apiCalls: row.api_calls,
    dialogueId: row.dialogue_id,
    scores,
    overallScore: row.overall_score,
    // Flag columns are coerced to real booleans.
    passesRequired: Boolean(row.passes_required),
    passesForbidden: Boolean(row.passes_forbidden),
    requiredMissing: parseJson(row.required_missing, []),
    forbiddenFound: parseJson(row.forbidden_found, []),
    evaluatorModel: row.evaluator_model,
    evaluationReasoning: row.evaluation_reasoning,
    success: Boolean(row.success),
    errorMessage: row.error_message,
    createdAt: row.created_at,
  };
}
|
|
928
|
+
|
|
929
|
+
// ============================================================================
|
|
930
|
+
// Interaction Evaluation Functions
|
|
931
|
+
// ============================================================================
|
|
932
|
+
|
|
933
|
+
/**
 * Persist one interaction evaluation as a row in `interaction_evaluations`.
 *
 * Structured fields (learner agents, turns, memory snapshots, unique
 * outcomes, judge evaluation) are stored as JSON text; missing optional
 * fields are stored as null, zero, or an empty JSON value.
 *
 * @param {Object} evalData - Evaluation payload; `evalId` becomes the row id
 * @returns {string} The `evalId` of the stored evaluation
 */
export function storeInteractionEval(evalData) {
  const { interaction, metrics, judgeEvaluation } = evalData;
  const turns = interaction?.turns;
  const snapshots = interaction?.writingPadSnapshots;
  const summary = interaction?.summary;

  // Serialise a value, substituting the fallback when the value is falsy.
  const asJson = (value, fallback) => JSON.stringify(value || fallback);

  // The judge may report its overall score in any of three places;
  // the first one that is not null/undefined wins.
  const judgeOverallScore =
    judgeEvaluation?.overall_assessment?.score ??
    judgeEvaluation?.narrative_summary?.overall_quality ??
    judgeEvaluation?.overall_score ??
    null;

  // Values are listed in the same order as the columns of the INSERT below.
  const params = [
    evalData.evalId,
    evalData.runId || null,
    evalData.scenarioId,
    evalData.scenarioName,
    evalData.type || 'short_term',
    evalData.learnerProfile || null,
    evalData.tutorProfile || 'default',
    evalData.personaId || null,
    asJson(evalData.learnerAgents, []),
    metrics?.turnCount || turns?.length || 0,
    asJson(turns, []),
    evalData.sequenceDiagram || null,
    evalData.formattedTranscript || null,
    asJson(snapshots?.learner?.before, null),
    asJson(snapshots?.learner?.after, null),
    asJson(snapshots?.tutor?.before, null),
    asJson(snapshots?.tutor?.after, null),
    metrics?.totalTokens || 0,
    metrics?.learnerTokens || 0,
    metrics?.tutorTokens || 0,
    metrics?.totalLatencyMs || 0,
    summary?.learnerFinalState || null,
    summary?.learnerFinalUnderstanding || null,
    asJson(summary?.uniqueOutcomes, []),
    judgeOverallScore,
    asJson(judgeEvaluation, null),
  ];

  const insertStmt = db.prepare(`
    INSERT INTO interaction_evaluations (
      id, run_id, scenario_id, scenario_name, eval_type,
      learner_profile, tutor_profile, persona_id, learner_agents,
      turn_count, turns, sequence_diagram, formatted_transcript,
      learner_memory_before, learner_memory_after, tutor_memory_before, tutor_memory_after,
      total_tokens, learner_tokens, tutor_tokens, latency_ms,
      final_learner_state, final_understanding, unique_outcomes,
      judge_overall_score, judge_evaluation
    ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
  `);

  insertStmt.run(...params);

  return evalData.evalId;
}
|
|
983
|
+
|
|
984
|
+
/**
 * List stored interaction evaluations as lightweight summaries, newest first.
 *
 * Only scalar columns are returned; the JSON-encoded columns (turns, memory
 * snapshots, judge evaluation, ...) are left out.
 *
 * @param {Object} [options] - Query options
 * @param {number} [options.limit=50] - Maximum number of rows to return
 * @param {?string} [options.scenarioId=null] - When set, only evaluations of
 *   this scenario are listed
 * @returns {Array<Object>} Evaluation summaries
 */
export function listInteractionEvals(options = {}) {
  const { limit = 50, scenarioId = null } = options;

  // The filter clause and its bound value are added together or not at all.
  const whereClause = scenarioId ? 'WHERE scenario_id = ?' : '';
  const params = scenarioId ? [scenarioId, limit] : [limit];

  const rows = db
    .prepare(`
    SELECT * FROM interaction_evaluations
    ${whereClause}
    ORDER BY created_at DESC
    LIMIT ?
  `)
    .all(...params);

  const summaries = [];
  for (const row of rows) {
    summaries.push({
      evalId: row.id,
      runId: row.run_id,
      scenarioId: row.scenario_id,
      scenarioName: row.scenario_name,
      evalType: row.eval_type,
      learnerProfile: row.learner_profile,
      tutorProfile: row.tutor_profile,
      personaId: row.persona_id,
      turnCount: row.turn_count,
      totalTokens: row.total_tokens,
      latencyMs: row.latency_ms,
      finalLearnerState: row.final_learner_state,
      finalUnderstanding: row.final_understanding,
      judgeOverallScore: row.judge_overall_score,
      createdAt: row.created_at,
    });
  }
  return summaries;
}
|
|
1018
|
+
|
|
1019
|
+
/**
 * Convert a raw `interaction_evaluations` row into the full evaluation
 * object, decoding every JSON-encoded column.
 *
 * Shared by the lookup functions below so both return the same shape.
 *
 * @param {Object} row - Row as returned by the database driver
 * @returns {Object} Evaluation with camelCase fields
 */
function mapInteractionEvalRow(row) {
  return {
    evalId: row.id,
    runId: row.run_id,
    scenarioId: row.scenario_id,
    scenarioName: row.scenario_name,
    evalType: row.eval_type,
    learnerProfile: row.learner_profile,
    tutorProfile: row.tutor_profile,
    personaId: row.persona_id,
    learnerAgents: JSON.parse(row.learner_agents || '[]'),
    turnCount: row.turn_count,
    turns: JSON.parse(row.turns || '[]'),
    sequenceDiagram: row.sequence_diagram,
    formattedTranscript: row.formatted_transcript,
    learnerMemoryBefore: JSON.parse(row.learner_memory_before || 'null'),
    learnerMemoryAfter: JSON.parse(row.learner_memory_after || 'null'),
    tutorMemoryBefore: JSON.parse(row.tutor_memory_before || 'null'),
    tutorMemoryAfter: JSON.parse(row.tutor_memory_after || 'null'),
    totalTokens: row.total_tokens,
    learnerTokens: row.learner_tokens,
    tutorTokens: row.tutor_tokens,
    latencyMs: row.latency_ms,
    finalLearnerState: row.final_learner_state,
    finalUnderstanding: row.final_understanding,
    uniqueOutcomes: JSON.parse(row.unique_outcomes || '[]'),
    judgeOverallScore: row.judge_overall_score,
    judgeEvaluation: JSON.parse(row.judge_evaluation || 'null'),
    createdAt: row.created_at,
  };
}

/**
 * Get a specific interaction evaluation.
 *
 * @param {string} evalId - ID of the evaluation
 * @returns {?Object} The full evaluation, or null when no row matches
 */
export function getInteractionEval(evalId) {
  const row = db.prepare('SELECT * FROM interaction_evaluations WHERE id = ?').get(evalId);
  return row ? mapInteractionEvalRow(row) : null;
}

/**
 * Get an interaction evaluation by its run ID (for Interact tab runs).
 *
 * When several evaluations share the run ID, the most recently created one
 * is returned.
 *
 * @param {string} runId - Run ID the evaluation was stored under
 * @returns {?Object} The full evaluation, or null when no row matches
 */
export function getInteractionEvalByRunId(runId) {
  const row = db
    .prepare('SELECT * FROM interaction_evaluations WHERE run_id = ? ORDER BY created_at DESC LIMIT 1')
    .get(runId);
  return row ? mapInteractionEvalRow(row) : null;
}
|
|
1098
|
+
|
|
1099
|
+
// Default export: the store's public functions gathered on a single object.
export default {
  // Run lifecycle and result storage
  createRun,
  updateRun,
  storeResult,
  getRun,
  listRuns,
  getResults,

  // Statistics, comparison and export
  getRunStats,
  getScenarioStats,
  compareConfigs,
  exportToJson,
  exportToCsv,

  // Run maintenance and resumption
  deleteRun,
  completeRun,
  findIncompleteRuns,
  autoCompleteStaleRuns,
  getIncompleteTests,

  // Interaction evaluations
  storeInteractionEval,
  listInteractionEvals,
  getInteractionEval,
  getInteractionEvalByRunId,
};
|