@machinespirits/eval 0.1.2 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +161 -0
- package/config/eval-settings.yaml +18 -0
- package/config/evaluation-rubric-learner.yaml +277 -0
- package/config/evaluation-rubric.yaml +613 -0
- package/config/interaction-eval-scenarios.yaml +93 -50
- package/config/learner-agents.yaml +124 -193
- package/config/machinespirits-eval.code-workspace +11 -0
- package/config/providers.yaml +60 -0
- package/config/suggestion-scenarios.yaml +1399 -0
- package/config/tutor-agents.yaml +716 -0
- package/docs/EVALUATION-VARIABLES.md +589 -0
- package/docs/REPLICATION-PLAN.md +577 -0
- package/index.js +15 -6
- package/package.json +16 -22
- package/routes/evalRoutes.js +88 -36
- package/scripts/analyze-judge-reliability.js +401 -0
- package/scripts/analyze-run.js +97 -0
- package/scripts/analyze-run.mjs +282 -0
- package/scripts/analyze-validation-failures.js +141 -0
- package/scripts/check-run.mjs +17 -0
- package/scripts/code-impasse-strategies.js +1132 -0
- package/scripts/compare-runs.js +44 -0
- package/scripts/compare-suggestions.js +80 -0
- package/scripts/compare-transformation.js +116 -0
- package/scripts/dig-into-run.js +158 -0
- package/scripts/eval-cli.js +2626 -0
- package/scripts/generate-paper-figures.py +452 -0
- package/scripts/qualitative-analysis-ai.js +1313 -0
- package/scripts/qualitative-analysis.js +688 -0
- package/scripts/seed-db.js +87 -0
- package/scripts/show-failed-suggestions.js +64 -0
- package/scripts/validate-content.js +192 -0
- package/server.js +3 -2
- package/services/__tests__/evalConfigLoader.test.js +338 -0
- package/services/anovaStats.js +499 -0
- package/services/contentResolver.js +407 -0
- package/services/dialogueTraceAnalyzer.js +454 -0
- package/services/evalConfigLoader.js +625 -0
- package/services/evaluationRunner.js +2171 -270
- package/services/evaluationStore.js +564 -29
- package/services/learnerConfigLoader.js +75 -5
- package/services/learnerRubricEvaluator.js +284 -0
- package/services/learnerTutorInteractionEngine.js +375 -0
- package/services/processUtils.js +18 -0
- package/services/progressLogger.js +98 -0
- package/services/promptRecommendationService.js +31 -26
- package/services/promptRewriter.js +427 -0
- package/services/rubricEvaluator.js +543 -70
- package/services/streamingReporter.js +104 -0
- package/services/turnComparisonAnalyzer.js +494 -0
- package/components/MobileEvalDashboard.tsx +0 -267
- package/components/comparison/DeltaAnalysisTable.tsx +0 -137
- package/components/comparison/ProfileComparisonCard.tsx +0 -176
- package/components/comparison/RecognitionABMode.tsx +0 -385
- package/components/comparison/RecognitionMetricsPanel.tsx +0 -135
- package/components/comparison/WinnerIndicator.tsx +0 -64
- package/components/comparison/index.ts +0 -5
- package/components/mobile/BottomSheet.tsx +0 -233
- package/components/mobile/DimensionBreakdown.tsx +0 -210
- package/components/mobile/DocsView.tsx +0 -363
- package/components/mobile/LogsView.tsx +0 -481
- package/components/mobile/PsychodynamicQuadrant.tsx +0 -261
- package/components/mobile/QuickTestView.tsx +0 -1098
- package/components/mobile/RecognitionTypeChart.tsx +0 -124
- package/components/mobile/RecognitionView.tsx +0 -809
- package/components/mobile/RunDetailView.tsx +0 -261
- package/components/mobile/RunHistoryView.tsx +0 -367
- package/components/mobile/ScoreRadial.tsx +0 -211
- package/components/mobile/StreamingLogPanel.tsx +0 -230
- package/components/mobile/SynthesisStrategyChart.tsx +0 -140
- package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +0 -52
- package/docs/research/ABLATION-MODEL-SELECTION.md +0 -53
- package/docs/research/ADVANCED-EVAL-ANALYSIS.md +0 -60
- package/docs/research/ANOVA-RESULTS-2026-01-14.md +0 -257
- package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +0 -586
- package/docs/research/COST-ANALYSIS.md +0 -56
- package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +0 -340
- package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +0 -291
- package/docs/research/EVAL-SYSTEM-ANALYSIS.md +0 -306
- package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +0 -301
- package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +0 -1988
- package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +0 -282
- package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +0 -147
- package/docs/research/PAPER-EXTENSION-DYADIC.md +0 -204
- package/docs/research/PAPER-UNIFIED.md +0 -659
- package/docs/research/PAPER-UNIFIED.pdf +0 -0
- package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +0 -356
- package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +0 -419
- package/docs/research/apa.csl +0 -2133
- package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +0 -1637
- package/docs/research/archive/paper-multiagent-tutor.tex +0 -978
- package/docs/research/paper-draft/full-paper.md +0 -136
- package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
- package/docs/research/paper-draft/references.bib +0 -515
- package/docs/research/transcript-baseline.md +0 -139
- package/docs/research/transcript-recognition-multiagent.md +0 -187
- package/hooks/useEvalData.ts +0 -625
- package/server-init.js +0 -45
- package/services/benchmarkService.js +0 -1892
- package/types.ts +0 -165
- package/utils/haptics.ts +0 -45
|
@@ -9,19 +9,30 @@ import Database from 'better-sqlite3';
|
|
|
9
9
|
import path from 'path';
|
|
10
10
|
import { fileURLToPath } from 'url';
|
|
11
11
|
import { randomBytes } from 'crypto';
|
|
12
|
+
import { isPidAlive } from './processUtils.js';
|
|
12
13
|
|
|
13
14
|
const __filename = fileURLToPath(import.meta.url);
|
|
14
15
|
const __dirname = path.dirname(__filename);
|
|
15
16
|
const ROOT_DIR = path.resolve(__dirname, '..');
|
|
16
17
|
const DATA_DIR = path.join(ROOT_DIR, 'data');
|
|
17
18
|
|
|
18
|
-
// Initialize database
|
|
19
|
-
const dbPath = path.join(DATA_DIR, 'evaluations.db');
|
|
19
|
+
// Initialize database — override with EVAL_DB_PATH env var for test isolation
|
|
20
|
+
const dbPath = process.env.EVAL_DB_PATH || path.join(DATA_DIR, 'evaluations.db');
|
|
20
21
|
const db = new Database(dbPath);
|
|
21
22
|
|
|
22
23
|
// Enable WAL mode for better concurrent access
|
|
23
24
|
db.pragma('journal_mode = WAL');
|
|
24
25
|
|
|
26
|
+
// Migrate: rename evaluator_model → judge_model if the old column exists
|
|
27
|
+
try {
|
|
28
|
+
const cols = db.prepare('PRAGMA table_info(evaluation_results)').all().map(c => c.name);
|
|
29
|
+
if (cols.includes('evaluator_model') && !cols.includes('judge_model')) {
|
|
30
|
+
db.exec('ALTER TABLE evaluation_results RENAME COLUMN evaluator_model TO judge_model');
|
|
31
|
+
}
|
|
32
|
+
} catch (e) {
|
|
33
|
+
// Table may not exist yet (first run)
|
|
34
|
+
}
|
|
35
|
+
|
|
25
36
|
// Create tables
|
|
26
37
|
db.exec(`
|
|
27
38
|
-- Evaluation runs (batches of tests)
|
|
@@ -81,7 +92,7 @@ db.exec(`
|
|
|
81
92
|
|
|
82
93
|
-- Metadata
|
|
83
94
|
created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
|
|
84
|
-
|
|
95
|
+
judge_model TEXT,
|
|
85
96
|
evaluation_reasoning TEXT,
|
|
86
97
|
success BOOLEAN DEFAULT 1,
|
|
87
98
|
error_message TEXT
|
|
@@ -103,6 +114,13 @@ try {
|
|
|
103
114
|
}
|
|
104
115
|
db.exec(`CREATE INDEX IF NOT EXISTS idx_results_dialogue ON evaluation_results(dialogue_id)`);
|
|
105
116
|
|
|
117
|
+
// Migration: Add scenario_type column if it doesn't exist
|
|
118
|
+
try {
|
|
119
|
+
db.exec(`ALTER TABLE evaluation_results ADD COLUMN scenario_type TEXT DEFAULT 'suggestion'`);
|
|
120
|
+
} catch (e) {
|
|
121
|
+
// Column already exists, ignore
|
|
122
|
+
}
|
|
123
|
+
|
|
106
124
|
// Migration: Add scores_with_reasoning column if it doesn't exist
|
|
107
125
|
try {
|
|
108
126
|
db.exec(`ALTER TABLE evaluation_results ADD COLUMN scores_with_reasoning TEXT`);
|
|
@@ -117,6 +135,66 @@ try {
|
|
|
117
135
|
// Column already exists, ignore
|
|
118
136
|
}
|
|
119
137
|
|
|
138
|
+
// Migration: Add dual scoring columns if they don't exist
|
|
139
|
+
try {
|
|
140
|
+
db.exec(`ALTER TABLE evaluation_results ADD COLUMN base_score REAL`);
|
|
141
|
+
} catch (e) {
|
|
142
|
+
// Column already exists, ignore
|
|
143
|
+
}
|
|
144
|
+
try {
|
|
145
|
+
db.exec(`ALTER TABLE evaluation_results ADD COLUMN recognition_score REAL`);
|
|
146
|
+
} catch (e) {
|
|
147
|
+
// Column already exists, ignore
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
// Migration: Add ego_model and superego_model columns
|
|
151
|
+
try {
|
|
152
|
+
db.exec(`ALTER TABLE evaluation_results ADD COLUMN ego_model TEXT`);
|
|
153
|
+
} catch (e) {
|
|
154
|
+
// Column already exists, ignore
|
|
155
|
+
}
|
|
156
|
+
try {
|
|
157
|
+
db.exec(`ALTER TABLE evaluation_results ADD COLUMN superego_model TEXT`);
|
|
158
|
+
} catch (e) {
|
|
159
|
+
// Column already exists, ignore
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
// Migration: Add factorial factor columns
|
|
163
|
+
try {
|
|
164
|
+
db.exec(`ALTER TABLE evaluation_results ADD COLUMN factor_recognition BOOLEAN`);
|
|
165
|
+
} catch (e) { /* Column already exists */ }
|
|
166
|
+
try {
|
|
167
|
+
db.exec(`ALTER TABLE evaluation_results ADD COLUMN factor_multi_agent_tutor BOOLEAN`);
|
|
168
|
+
} catch (e) { /* Column already exists */ }
|
|
169
|
+
try {
|
|
170
|
+
db.exec(`ALTER TABLE evaluation_results ADD COLUMN factor_multi_agent_learner BOOLEAN`);
|
|
171
|
+
} catch (e) { /* Column already exists */ }
|
|
172
|
+
try {
|
|
173
|
+
db.exec(`ALTER TABLE evaluation_results ADD COLUMN learner_architecture TEXT`);
|
|
174
|
+
} catch (e) { /* Column already exists */ }
|
|
175
|
+
try {
|
|
176
|
+
db.exec(`ALTER TABLE evaluation_results ADD COLUMN scoring_method TEXT`);
|
|
177
|
+
} catch (e) { /* Column already exists */ }
|
|
178
|
+
|
|
179
|
+
// Migration: Add learner-side evaluation columns to evaluation_results
|
|
180
|
+
try {
|
|
181
|
+
db.exec(`ALTER TABLE evaluation_results ADD COLUMN learner_scores TEXT`);
|
|
182
|
+
} catch (e) { /* Column already exists */ }
|
|
183
|
+
try {
|
|
184
|
+
db.exec(`ALTER TABLE evaluation_results ADD COLUMN learner_overall_score REAL`);
|
|
185
|
+
} catch (e) { /* Column already exists */ }
|
|
186
|
+
try {
|
|
187
|
+
db.exec(`ALTER TABLE evaluation_results ADD COLUMN learner_judge_model TEXT`);
|
|
188
|
+
} catch (e) { /* Column already exists */ }
|
|
189
|
+
|
|
190
|
+
// Migration: Add reproducibility metadata columns to evaluation_runs
|
|
191
|
+
try {
|
|
192
|
+
db.exec(`ALTER TABLE evaluation_runs ADD COLUMN git_commit TEXT`);
|
|
193
|
+
} catch (e) { /* Column already exists */ }
|
|
194
|
+
try {
|
|
195
|
+
db.exec(`ALTER TABLE evaluation_runs ADD COLUMN package_version TEXT`);
|
|
196
|
+
} catch (e) { /* Column already exists */ }
|
|
197
|
+
|
|
120
198
|
// Migration: Revert any accidental renames (batch→matrix, interact→interaction)
|
|
121
199
|
try {
|
|
122
200
|
const revertRuns = db.prepare(`
|
|
@@ -181,6 +259,17 @@ db.exec(`
|
|
|
181
259
|
CREATE INDEX IF NOT EXISTS idx_interaction_created ON interaction_evaluations(created_at);
|
|
182
260
|
`);
|
|
183
261
|
|
|
262
|
+
// Migration: Add learner-side evaluation columns to interaction_evaluations
|
|
263
|
+
try {
|
|
264
|
+
db.exec(`ALTER TABLE interaction_evaluations ADD COLUMN learner_scores TEXT`);
|
|
265
|
+
} catch (e) { /* Column already exists */ }
|
|
266
|
+
try {
|
|
267
|
+
db.exec(`ALTER TABLE interaction_evaluations ADD COLUMN learner_overall_score REAL`);
|
|
268
|
+
} catch (e) { /* Column already exists */ }
|
|
269
|
+
try {
|
|
270
|
+
db.exec(`ALTER TABLE interaction_evaluations ADD COLUMN learner_judge_model TEXT`);
|
|
271
|
+
} catch (e) { /* Column already exists */ }
|
|
272
|
+
|
|
184
273
|
/**
|
|
185
274
|
* Generate a unique run ID
|
|
186
275
|
*/
|
|
@@ -205,13 +294,14 @@ export function createRun(options = {}) {
|
|
|
205
294
|
} = options;
|
|
206
295
|
|
|
207
296
|
const id = generateRunId();
|
|
297
|
+
const now = new Date().toISOString();
|
|
208
298
|
|
|
209
299
|
const stmt = db.prepare(`
|
|
210
|
-
INSERT INTO evaluation_runs (id, description, total_scenarios, total_configurations, metadata)
|
|
211
|
-
VALUES (?, ?, ?, ?, ?)
|
|
300
|
+
INSERT INTO evaluation_runs (id, created_at, description, total_scenarios, total_configurations, metadata, git_commit, package_version)
|
|
301
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
|
|
212
302
|
`);
|
|
213
303
|
|
|
214
|
-
stmt.run(id, description, totalScenarios, totalConfigurations, JSON.stringify(metadata));
|
|
304
|
+
stmt.run(id, now, description, totalScenarios, totalConfigurations, JSON.stringify(metadata), metadata.gitCommit || null, metadata.packageVersion || null);
|
|
215
305
|
|
|
216
306
|
return {
|
|
217
307
|
id,
|
|
@@ -227,16 +317,29 @@ export function createRun(options = {}) {
|
|
|
227
317
|
* Update a run's status
|
|
228
318
|
*/
|
|
229
319
|
export function updateRun(runId, updates) {
|
|
230
|
-
const { status, totalTests, completedAt } = updates;
|
|
320
|
+
const { status, totalTests, completedAt, metadata } = updates;
|
|
321
|
+
|
|
322
|
+
// If metadata provided, merge with existing
|
|
323
|
+
if (metadata) {
|
|
324
|
+
const existing = getRun(runId);
|
|
325
|
+
const mergedMetadata = { ...(existing?.metadata || {}), ...metadata };
|
|
326
|
+
const stmt = db.prepare(`UPDATE evaluation_runs SET metadata = ? WHERE id = ?`);
|
|
327
|
+
stmt.run(JSON.stringify(mergedMetadata), runId);
|
|
328
|
+
}
|
|
231
329
|
|
|
232
330
|
if (status === 'completed') {
|
|
233
331
|
const stmt = db.prepare(`
|
|
234
332
|
UPDATE evaluation_runs
|
|
235
|
-
SET status = ?,
|
|
333
|
+
SET status = ?, completed_at = ?
|
|
236
334
|
WHERE id = ?
|
|
237
335
|
`);
|
|
238
|
-
stmt.run(status,
|
|
239
|
-
} else {
|
|
336
|
+
stmt.run(status, completedAt || new Date().toISOString(), runId);
|
|
337
|
+
} else if (status && totalTests != null) {
|
|
338
|
+
const stmt = db.prepare(`
|
|
339
|
+
UPDATE evaluation_runs SET status = ?, total_tests = ? WHERE id = ?
|
|
340
|
+
`);
|
|
341
|
+
stmt.run(status, totalTests, runId);
|
|
342
|
+
} else if (status) {
|
|
240
343
|
const stmt = db.prepare(`
|
|
241
344
|
UPDATE evaluation_runs SET status = ? WHERE id = ?
|
|
242
345
|
`);
|
|
@@ -254,23 +357,33 @@ export function updateRun(runId, updates) {
|
|
|
254
357
|
export function storeResult(runId, result) {
|
|
255
358
|
const stmt = db.prepare(`
|
|
256
359
|
INSERT INTO evaluation_results (
|
|
257
|
-
run_id, scenario_id, scenario_name,
|
|
360
|
+
run_id, scenario_id, scenario_name, scenario_type,
|
|
258
361
|
provider, model, profile_name, hyperparameters, prompt_id,
|
|
362
|
+
ego_model, superego_model,
|
|
259
363
|
suggestions, raw_response,
|
|
260
364
|
latency_ms, input_tokens, output_tokens, cost, dialogue_rounds, api_calls, dialogue_id,
|
|
261
365
|
score_relevance, score_specificity, score_pedagogical,
|
|
262
366
|
score_personalization, score_actionability, score_tone, overall_score,
|
|
367
|
+
base_score, recognition_score,
|
|
263
368
|
passes_required, passes_forbidden, required_missing, forbidden_found,
|
|
264
|
-
|
|
369
|
+
judge_model, evaluation_reasoning, scores_with_reasoning, success, error_message,
|
|
370
|
+
factor_recognition, factor_multi_agent_tutor, factor_multi_agent_learner, learner_architecture,
|
|
371
|
+
scoring_method,
|
|
372
|
+
created_at
|
|
265
373
|
) VALUES (
|
|
266
|
-
?, ?, ?,
|
|
374
|
+
?, ?, ?, ?,
|
|
267
375
|
?, ?, ?, ?, ?,
|
|
268
376
|
?, ?,
|
|
377
|
+
?, ?,
|
|
269
378
|
?, ?, ?, ?, ?, ?, ?,
|
|
270
379
|
?, ?, ?,
|
|
271
380
|
?, ?, ?, ?,
|
|
381
|
+
?, ?,
|
|
382
|
+
?, ?, ?, ?,
|
|
383
|
+
?, ?, ?, ?, ?,
|
|
272
384
|
?, ?, ?, ?,
|
|
273
|
-
?,
|
|
385
|
+
?,
|
|
386
|
+
?
|
|
274
387
|
)
|
|
275
388
|
`);
|
|
276
389
|
|
|
@@ -278,11 +391,14 @@ export function storeResult(runId, result) {
|
|
|
278
391
|
runId,
|
|
279
392
|
result.scenarioId,
|
|
280
393
|
result.scenarioName,
|
|
394
|
+
result.scenarioType || 'suggestion',
|
|
281
395
|
result.provider,
|
|
282
396
|
result.model,
|
|
283
397
|
result.profileName,
|
|
284
398
|
JSON.stringify(result.hyperparameters || {}),
|
|
285
399
|
result.promptId,
|
|
400
|
+
result.egoModel || null,
|
|
401
|
+
result.superegoModel || null,
|
|
286
402
|
JSON.stringify(result.suggestions || []),
|
|
287
403
|
result.rawResponse,
|
|
288
404
|
result.latencyMs,
|
|
@@ -299,15 +415,23 @@ export function storeResult(runId, result) {
|
|
|
299
415
|
result.scores?.actionability,
|
|
300
416
|
result.scores?.tone,
|
|
301
417
|
result.overallScore,
|
|
418
|
+
result.baseScore,
|
|
419
|
+
result.recognitionScore,
|
|
302
420
|
result.passesRequired ? 1 : 0,
|
|
303
421
|
result.passesForbidden ? 1 : 0,
|
|
304
422
|
JSON.stringify(result.requiredMissing || []),
|
|
305
423
|
JSON.stringify(result.forbiddenFound || []),
|
|
306
|
-
result.
|
|
424
|
+
result.judgeModel,
|
|
307
425
|
result.evaluationReasoning,
|
|
308
426
|
result.scoresWithReasoning ? JSON.stringify(result.scoresWithReasoning) : null,
|
|
309
427
|
result.success ? 1 : 0,
|
|
310
|
-
result.errorMessage
|
|
428
|
+
result.errorMessage,
|
|
429
|
+
result.factors?.recognition != null ? (result.factors.recognition ? 1 : 0) : null,
|
|
430
|
+
result.factors?.multi_agent_tutor != null ? (result.factors.multi_agent_tutor ? 1 : 0) : null,
|
|
431
|
+
result.factors?.multi_agent_learner != null ? (result.factors.multi_agent_learner ? 1 : 0) : null,
|
|
432
|
+
result.learnerArchitecture || null,
|
|
433
|
+
result.scoringMethod || null,
|
|
434
|
+
new Date().toISOString()
|
|
311
435
|
);
|
|
312
436
|
|
|
313
437
|
return info.lastInsertRowid;
|
|
@@ -331,6 +455,8 @@ export function getRun(runId) {
|
|
|
331
455
|
status: row.status,
|
|
332
456
|
completedAt: row.completed_at,
|
|
333
457
|
metadata: JSON.parse(row.metadata || '{}'),
|
|
458
|
+
gitCommit: row.git_commit,
|
|
459
|
+
packageVersion: row.package_version,
|
|
334
460
|
};
|
|
335
461
|
}
|
|
336
462
|
|
|
@@ -338,7 +464,7 @@ export function getRun(runId) {
|
|
|
338
464
|
* List all runs with scenario names
|
|
339
465
|
*/
|
|
340
466
|
export function listRuns(options = {}) {
|
|
341
|
-
const { limit =
|
|
467
|
+
const { limit = null, status = null } = options;
|
|
342
468
|
|
|
343
469
|
let query = 'SELECT * FROM evaluation_runs';
|
|
344
470
|
const params = [];
|
|
@@ -348,8 +474,11 @@ export function listRuns(options = {}) {
|
|
|
348
474
|
params.push(status);
|
|
349
475
|
}
|
|
350
476
|
|
|
351
|
-
query += ' ORDER BY created_at
|
|
352
|
-
|
|
477
|
+
query += ' ORDER BY created_at ASC';
|
|
478
|
+
if (limit) {
|
|
479
|
+
query += ' LIMIT ?';
|
|
480
|
+
params.push(limit);
|
|
481
|
+
}
|
|
353
482
|
|
|
354
483
|
const stmt = db.prepare(query);
|
|
355
484
|
const rows = stmt.all(...params);
|
|
@@ -361,9 +490,59 @@ export function listRuns(options = {}) {
|
|
|
361
490
|
ORDER BY scenario_name
|
|
362
491
|
`);
|
|
363
492
|
|
|
493
|
+
// Count completed results per run
|
|
494
|
+
const resultCountStmt = db.prepare(`
|
|
495
|
+
SELECT COUNT(*) as completed,
|
|
496
|
+
SUM(CASE WHEN success = 1 THEN 1 ELSE 0 END) as successful,
|
|
497
|
+
AVG(overall_score) as avg_score
|
|
498
|
+
FROM evaluation_results WHERE run_id = ?
|
|
499
|
+
`);
|
|
500
|
+
|
|
501
|
+
// Get distinct ego + superego models for each run
|
|
502
|
+
const modelStmt = db.prepare(`
|
|
503
|
+
SELECT DISTINCT ego_model FROM evaluation_results
|
|
504
|
+
WHERE run_id = ? AND ego_model IS NOT NULL
|
|
505
|
+
ORDER BY ego_model
|
|
506
|
+
`);
|
|
507
|
+
const superegoModelStmt = db.prepare(`
|
|
508
|
+
SELECT DISTINCT superego_model FROM evaluation_results
|
|
509
|
+
WHERE run_id = ? AND superego_model IS NOT NULL
|
|
510
|
+
ORDER BY superego_model
|
|
511
|
+
`);
|
|
512
|
+
|
|
364
513
|
return rows.map(row => {
|
|
365
514
|
const scenarioRows = scenarioStmt.all(row.id);
|
|
366
515
|
const scenarioNames = scenarioRows.map(s => s.scenario_name).filter(Boolean);
|
|
516
|
+
const counts = resultCountStmt.get(row.id);
|
|
517
|
+
|
|
518
|
+
/**
 * Reduce a stored model reference to its alias.
 *
 * Everything up to and including the first '.' is dropped; a reference
 * without a '.' is returned unchanged. Falsy input (null, undefined, '')
 * yields null.
 *
 * @param {string|null|undefined} raw - Stored model reference.
 * @returns {string|null} The alias portion, or null when there is no input.
 */
const extractAlias = (raw) => {
  if (!raw) {
    return null;
  }

  const separatorAt = raw.indexOf('.');
  if (separatorAt === -1) {
    // No prefix present: the reference already is the alias.
    return raw;
  }

  return raw.slice(separatorAt + 1);
};
|
|
523
|
+
|
|
524
|
+
const modelRows = modelStmt.all(row.id);
|
|
525
|
+
const superegoRows = superegoModelStmt.all(row.id);
|
|
526
|
+
const models = [...new Set([
|
|
527
|
+
...modelRows.map(m => extractAlias(m.ego_model)),
|
|
528
|
+
...superegoRows.map(m => extractAlias(m.superego_model)),
|
|
529
|
+
].filter(Boolean))];
|
|
530
|
+
|
|
531
|
+
const completedResults = counts?.completed || 0;
|
|
532
|
+
const totalTests = row.total_tests || 0;
|
|
533
|
+
const progressPct = totalTests > 0 ? Math.min(100, Math.round((completedResults / totalTests) * 100)) : null;
|
|
534
|
+
|
|
535
|
+
// Compute duration: for completed runs use completed_at - created_at;
|
|
536
|
+
// for running runs compute elapsed from now.
|
|
537
|
+
let durationMs = null;
|
|
538
|
+
if (row.created_at) {
|
|
539
|
+
const start = new Date(row.created_at).getTime();
|
|
540
|
+
if (row.completed_at) {
|
|
541
|
+
durationMs = new Date(row.completed_at).getTime() - start;
|
|
542
|
+
} else if (row.status === 'running') {
|
|
543
|
+
durationMs = Date.now() - start;
|
|
544
|
+
}
|
|
545
|
+
}
|
|
367
546
|
|
|
368
547
|
return {
|
|
369
548
|
id: row.id,
|
|
@@ -371,10 +550,16 @@ export function listRuns(options = {}) {
|
|
|
371
550
|
description: row.description,
|
|
372
551
|
totalScenarios: row.total_scenarios,
|
|
373
552
|
totalConfigurations: row.total_configurations,
|
|
374
|
-
totalTests
|
|
553
|
+
totalTests,
|
|
554
|
+
completedResults,
|
|
555
|
+
successfulResults: counts?.successful || 0,
|
|
556
|
+
avgScore: counts?.avg_score || null,
|
|
557
|
+
progressPct,
|
|
558
|
+
durationMs,
|
|
375
559
|
status: row.status,
|
|
376
560
|
completedAt: row.completed_at,
|
|
377
561
|
scenarioNames, // Scenario names from results
|
|
562
|
+
models, // Distinct ego and superego model aliases used
|
|
378
563
|
metadata: JSON.parse(row.metadata || '{}'), // Structured metadata
|
|
379
564
|
};
|
|
380
565
|
});
|
|
@@ -384,7 +569,7 @@ export function listRuns(options = {}) {
|
|
|
384
569
|
* Get results for a run
|
|
385
570
|
*/
|
|
386
571
|
export function getResults(runId, options = {}) {
|
|
387
|
-
const { scenarioId = null, provider = null, model = null } = options;
|
|
572
|
+
const { scenarioId = null, provider = null, model = null, profileName = null } = options;
|
|
388
573
|
|
|
389
574
|
let query = 'SELECT * FROM evaluation_results WHERE run_id = ?';
|
|
390
575
|
const params = [runId];
|
|
@@ -404,6 +589,11 @@ export function getResults(runId, options = {}) {
|
|
|
404
589
|
params.push(model);
|
|
405
590
|
}
|
|
406
591
|
|
|
592
|
+
if (profileName) {
|
|
593
|
+
query += ' AND profile_name = ?';
|
|
594
|
+
params.push(profileName);
|
|
595
|
+
}
|
|
596
|
+
|
|
407
597
|
query += ' ORDER BY created_at';
|
|
408
598
|
|
|
409
599
|
const stmt = db.prepare(query);
|
|
@@ -420,6 +610,9 @@ export function getRunStats(runId) {
|
|
|
420
610
|
SELECT
|
|
421
611
|
provider,
|
|
422
612
|
model,
|
|
613
|
+
profile_name,
|
|
614
|
+
ego_model,
|
|
615
|
+
superego_model,
|
|
423
616
|
COUNT(*) as total_tests,
|
|
424
617
|
SUM(CASE WHEN success = 1 THEN 1 ELSE 0 END) as successful_tests,
|
|
425
618
|
AVG(overall_score) as avg_score,
|
|
@@ -429,6 +622,8 @@ export function getRunStats(runId) {
|
|
|
429
622
|
AVG(score_personalization) as avg_personalization,
|
|
430
623
|
AVG(score_actionability) as avg_actionability,
|
|
431
624
|
AVG(score_tone) as avg_tone,
|
|
625
|
+
AVG(base_score) as avg_base_score,
|
|
626
|
+
AVG(recognition_score) as avg_recognition_score,
|
|
432
627
|
AVG(latency_ms) as avg_latency,
|
|
433
628
|
SUM(input_tokens) as total_input_tokens,
|
|
434
629
|
SUM(output_tokens) as total_output_tokens,
|
|
@@ -436,7 +631,7 @@ export function getRunStats(runId) {
|
|
|
436
631
|
SUM(CASE WHEN passes_forbidden = 1 THEN 1 ELSE 0 END) as passes_forbidden
|
|
437
632
|
FROM evaluation_results
|
|
438
633
|
WHERE run_id = ?
|
|
439
|
-
GROUP BY provider, model
|
|
634
|
+
GROUP BY provider, model, profile_name
|
|
440
635
|
ORDER BY avg_score DESC
|
|
441
636
|
`);
|
|
442
637
|
|
|
@@ -445,10 +640,15 @@ export function getRunStats(runId) {
|
|
|
445
640
|
return rows.map(row => ({
|
|
446
641
|
provider: row.provider,
|
|
447
642
|
model: row.model,
|
|
643
|
+
profileName: row.profile_name,
|
|
644
|
+
egoModel: row.ego_model,
|
|
645
|
+
superegoModel: row.superego_model,
|
|
448
646
|
totalTests: row.total_tests,
|
|
449
647
|
successfulTests: row.successful_tests,
|
|
450
648
|
successRate: row.total_tests > 0 ? row.successful_tests / row.total_tests : 0,
|
|
451
649
|
avgScore: row.avg_score,
|
|
650
|
+
avgBaseScore: row.avg_base_score,
|
|
651
|
+
avgRecognitionScore: row.avg_recognition_score,
|
|
452
652
|
dimensions: {
|
|
453
653
|
relevance: row.avg_relevance,
|
|
454
654
|
specificity: row.avg_specificity,
|
|
@@ -478,13 +678,18 @@ export function getScenarioStats(runId) {
|
|
|
478
678
|
scenario_name,
|
|
479
679
|
provider,
|
|
480
680
|
model,
|
|
681
|
+
profile_name,
|
|
682
|
+
ego_model,
|
|
683
|
+
superego_model,
|
|
481
684
|
AVG(overall_score) as avg_score,
|
|
685
|
+
AVG(base_score) as avg_base_score,
|
|
686
|
+
AVG(recognition_score) as avg_recognition_score,
|
|
482
687
|
AVG(latency_ms) as avg_latency,
|
|
483
688
|
SUM(CASE WHEN passes_required = 1 AND passes_forbidden = 1 THEN 1 ELSE 0 END) as passes_validation,
|
|
484
689
|
COUNT(*) as runs
|
|
485
690
|
FROM evaluation_results
|
|
486
691
|
WHERE run_id = ?
|
|
487
|
-
GROUP BY scenario_id, provider, model
|
|
692
|
+
GROUP BY scenario_id, provider, model, profile_name
|
|
488
693
|
ORDER BY scenario_id, avg_score DESC
|
|
489
694
|
`);
|
|
490
695
|
|
|
@@ -503,7 +708,12 @@ export function getScenarioStats(runId) {
|
|
|
503
708
|
grouped[row.scenario_id].configurations.push({
|
|
504
709
|
provider: row.provider,
|
|
505
710
|
model: row.model,
|
|
711
|
+
profileName: row.profile_name,
|
|
712
|
+
egoModel: row.ego_model,
|
|
713
|
+
superegoModel: row.superego_model,
|
|
506
714
|
avgScore: row.avg_score,
|
|
715
|
+
avgBaseScore: row.avg_base_score,
|
|
716
|
+
avgRecognitionScore: row.avg_recognition_score,
|
|
507
717
|
avgLatencyMs: row.avg_latency,
|
|
508
718
|
passesValidation: row.passes_validation === row.runs,
|
|
509
719
|
runs: row.runs,
|
|
@@ -623,7 +833,16 @@ export function exportToCsv(runId) {
|
|
|
623
833
|
r.success ? 1 : 0,
|
|
624
834
|
]);
|
|
625
835
|
|
|
626
|
-
|
|
836
|
+
/**
 * Escape a single value for inclusion in a CSV row.
 *
 * null and undefined become an empty field. Any other value is stringified;
 * if the text contains a comma, a double quote, a line feed or a carriage
 * return, it is wrapped in double quotes with embedded double quotes doubled.
 *
 * @param {*} value - Raw cell value.
 * @returns {string} Field text that is safe to join with ','.
 */
const escapeCsvField = (value) => {
  if (value == null) return '';

  const text = String(value);

  // A bare '\r' must be quoted as well as '\n': an unquoted carriage return
  // is treated as a record break by many CSV readers.
  const needsQuoting = /[",\r\n]/.test(text);
  if (!needsQuoting) return text;

  return '"' + text.replace(/"/g, '""') + '"';
};
|
|
844
|
+
|
|
845
|
+
return [headers.join(','), ...rows.map(row => row.map(escapeCsvField).join(','))].join('\n');
|
|
627
846
|
}
|
|
628
847
|
|
|
629
848
|
/**
|
|
@@ -734,6 +953,8 @@ export function findIncompleteRuns(options = {}) {
|
|
|
734
953
|
return rows.map(row => {
|
|
735
954
|
const resultsStmt = db.prepare('SELECT COUNT(*) as count FROM evaluation_results WHERE run_id = ?');
|
|
736
955
|
const resultsCount = resultsStmt.get(row.id).count;
|
|
956
|
+
const metadata = JSON.parse(row.metadata || '{}');
|
|
957
|
+
const pid = metadata?.pid;
|
|
737
958
|
|
|
738
959
|
return {
|
|
739
960
|
id: row.id,
|
|
@@ -744,7 +965,9 @@ export function findIncompleteRuns(options = {}) {
|
|
|
744
965
|
expectedTests: row.total_scenarios * row.total_configurations,
|
|
745
966
|
resultsFound: resultsCount,
|
|
746
967
|
ageMinutes: Math.round((Date.now() - new Date(row.created_at).getTime()) / 60000),
|
|
747
|
-
metadata
|
|
968
|
+
metadata,
|
|
969
|
+
pid,
|
|
970
|
+
pidAlive: isPidAlive(pid),
|
|
748
971
|
};
|
|
749
972
|
});
|
|
750
973
|
}
|
|
@@ -762,16 +985,28 @@ export function autoCompleteStaleRuns(options = {}) {
|
|
|
762
985
|
|
|
763
986
|
const incompleteRuns = findIncompleteRuns({ olderThanMinutes });
|
|
764
987
|
|
|
988
|
+
// Filter out runs whose PID is still alive
|
|
989
|
+
const staleRuns = incompleteRuns.filter(run => {
|
|
990
|
+
const pid = run.metadata?.pid;
|
|
991
|
+
const isAlive = isPidAlive(pid);
|
|
992
|
+
if (isAlive) {
|
|
993
|
+
console.log(` Skipping ${run.id}: pid ${pid} still running`);
|
|
994
|
+
}
|
|
995
|
+
return !isAlive;
|
|
996
|
+
});
|
|
997
|
+
|
|
765
998
|
if (dryRun) {
|
|
766
999
|
return {
|
|
767
1000
|
dryRun: true,
|
|
768
1001
|
found: incompleteRuns.length,
|
|
769
|
-
|
|
1002
|
+
stale: staleRuns.length,
|
|
1003
|
+
skippedAlive: incompleteRuns.length - staleRuns.length,
|
|
1004
|
+
runs: staleRuns,
|
|
770
1005
|
};
|
|
771
1006
|
}
|
|
772
1007
|
|
|
773
1008
|
const completed = [];
|
|
774
|
-
for (const run of
|
|
1009
|
+
for (const run of staleRuns) {
|
|
775
1010
|
try {
|
|
776
1011
|
const result = completeRun(run.id);
|
|
777
1012
|
completed.push(result);
|
|
@@ -786,6 +1021,8 @@ export function autoCompleteStaleRuns(options = {}) {
|
|
|
786
1021
|
|
|
787
1022
|
return {
|
|
788
1023
|
found: incompleteRuns.length,
|
|
1024
|
+
stale: staleRuns.length,
|
|
1025
|
+
skippedAlive: incompleteRuns.length - staleRuns.length,
|
|
789
1026
|
completed: completed.length,
|
|
790
1027
|
runs: completed,
|
|
791
1028
|
};
|
|
@@ -827,8 +1064,9 @@ export function getIncompleteTests(runId, profiles, scenarios) {
|
|
|
827
1064
|
const results = getResults(runId);
|
|
828
1065
|
const completedSet = new Set();
|
|
829
1066
|
|
|
830
|
-
// Build set of completed (profile, scenarioId) pairs
|
|
1067
|
+
// Build set of completed (profile, scenarioId) pairs — only count successes
|
|
831
1068
|
for (const result of results) {
|
|
1069
|
+
if (result.success === false || result.success === 0) continue;
|
|
832
1070
|
const key = `${result.profileName}:${result.scenarioId}`;
|
|
833
1071
|
completedSet.add(key);
|
|
834
1072
|
}
|
|
@@ -899,9 +1137,12 @@ function parseResultRow(row) {
|
|
|
899
1137
|
runId: row.run_id,
|
|
900
1138
|
scenarioId: row.scenario_id,
|
|
901
1139
|
scenarioName: row.scenario_name,
|
|
1140
|
+
scenarioType: row.scenario_type || 'suggestion',
|
|
902
1141
|
provider: row.provider,
|
|
903
1142
|
model: row.model,
|
|
904
1143
|
profileName: row.profile_name,
|
|
1144
|
+
egoModel: row.ego_model,
|
|
1145
|
+
superegoModel: row.superego_model,
|
|
905
1146
|
hyperparameters: JSON.parse(row.hyperparameters || '{}'),
|
|
906
1147
|
promptId: row.prompt_id,
|
|
907
1148
|
suggestions: JSON.parse(row.suggestions || '[]'),
|
|
@@ -914,15 +1155,29 @@ function parseResultRow(row) {
|
|
|
914
1155
|
dialogueId: row.dialogue_id,
|
|
915
1156
|
scores,
|
|
916
1157
|
overallScore: row.overall_score,
|
|
1158
|
+
scoringMethod: row.scoring_method || null,
|
|
1159
|
+
baseScore: row.base_score,
|
|
1160
|
+
recognitionScore: row.recognition_score,
|
|
917
1161
|
passesRequired: Boolean(row.passes_required),
|
|
918
1162
|
passesForbidden: Boolean(row.passes_forbidden),
|
|
919
1163
|
requiredMissing: JSON.parse(row.required_missing || '[]'),
|
|
920
1164
|
forbiddenFound: JSON.parse(row.forbidden_found || '[]'),
|
|
921
|
-
|
|
1165
|
+
judgeModel: row.judge_model,
|
|
922
1166
|
evaluationReasoning: row.evaluation_reasoning,
|
|
923
1167
|
success: Boolean(row.success),
|
|
924
1168
|
errorMessage: row.error_message,
|
|
925
1169
|
createdAt: row.created_at,
|
|
1170
|
+
factors: (row.factor_recognition != null || row.factor_multi_agent_tutor != null || row.factor_multi_agent_learner != null)
|
|
1171
|
+
? {
|
|
1172
|
+
recognition: Boolean(row.factor_recognition),
|
|
1173
|
+
multi_agent_tutor: Boolean(row.factor_multi_agent_tutor),
|
|
1174
|
+
multi_agent_learner: Boolean(row.factor_multi_agent_learner),
|
|
1175
|
+
}
|
|
1176
|
+
: null,
|
|
1177
|
+
learnerArchitecture: row.learner_architecture || null,
|
|
1178
|
+
learnerScores: row.learner_scores ? JSON.parse(row.learner_scores) : null,
|
|
1179
|
+
learnerOverallScore: row.learner_overall_score != null ? row.learner_overall_score : null,
|
|
1180
|
+
learnerJudgeModel: row.learner_judge_model || null,
|
|
926
1181
|
};
|
|
927
1182
|
}
|
|
928
1183
|
|
|
@@ -1052,6 +1307,9 @@ export function getInteractionEval(evalId) {
|
|
|
1052
1307
|
uniqueOutcomes: JSON.parse(row.unique_outcomes || '[]'),
|
|
1053
1308
|
judgeOverallScore: row.judge_overall_score,
|
|
1054
1309
|
judgeEvaluation: JSON.parse(row.judge_evaluation || 'null'),
|
|
1310
|
+
learnerScores: JSON.parse(row.learner_scores || 'null'),
|
|
1311
|
+
learnerOverallScore: row.learner_overall_score,
|
|
1312
|
+
learnerJudgeModel: row.learner_judge_model,
|
|
1055
1313
|
createdAt: row.created_at,
|
|
1056
1314
|
};
|
|
1057
1315
|
}
|
|
@@ -1096,10 +1354,284 @@ export function getInteractionEvalByRunId(runId) {
|
|
|
1096
1354
|
};
|
|
1097
1355
|
}
|
|
1098
1356
|
|
|
1357
|
+
/**
 * Collect per-cell score lists for a factorial (ANOVA) analysis of one run.
 *
 * Rows are read from `evaluation_results` and bucketed under a key of the
 * form `r<recognition>_t<tutor>_l<learner>` built from the stored factor
 * columns (e.g. "r0_t0_l0"). Only rows with `success = 1`, a non-NULL
 * `factor_recognition` and a non-NULL value in the chosen score column
 * are included.
 *
 * @param {string} runId - The run ID
 * @param {Object} [options] - Options
 * @param {string} [options.scoreColumn='overall_score'] - Score column to read:
 *   'overall_score', 'base_score' or 'recognition_score'. Any other value
 *   falls back to 'overall_score'.
 * @returns {Object} Map of cellKey → [score, ...]
 */
export function getFactorialCellData(runId, options = {}) {
  const requestedColumn = options.scoreColumn === undefined ? 'overall_score' : options.scoreColumn;

  // The column name is interpolated into the SQL text below, so it is only
  // ever taken from this fixed allow-list (prevents SQL injection).
  const allowedColumns = ['overall_score', 'base_score', 'recognition_score'];
  const scoreExpr = allowedColumns.includes(requestedColumn) ? requestedColumn : 'overall_score';

  const query = db.prepare(`
    SELECT factor_recognition, factor_multi_agent_tutor, factor_multi_agent_learner, ${scoreExpr} as score
    FROM evaluation_results
    WHERE run_id = ? AND factor_recognition IS NOT NULL AND ${scoreExpr} IS NOT NULL AND success = 1
  `);

  return query.all(runId).reduce((cells, record) => {
    const cellKey = `r${record.factor_recognition}_t${record.factor_multi_agent_tutor}_l${record.factor_multi_agent_learner}`;
    const bucket = cells[cellKey] || (cells[cellKey] = []);
    bucket.push(record.score);
    return cells;
  }, {});
}
|
|
1392
|
+
|
|
1393
|
+
/**
 * Store a new judgment row for an existing result (preserves judgment history).
 * Copies the original result's response data but adds new scores from a different judge.
 * This enables inter-judge reliability analysis.
 *
 * The new row is always written with `success = 1`, `error_message = NULL`,
 * `scoring_method = 'rubric'` and a fresh `created_at` timestamp.
 *
 * Factor tags are read from either shape of `originalResult`:
 *   - flat `factorRecognition` / `factorMultiAgentTutor` / `factorMultiAgentLearner`, or
 *   - the nested `factors: { recognition, multi_agent_tutor, multi_agent_learner }`
 *     object that parseResultRow() in this file produces.
 * Boolean tags are converted to 1/0 before binding.
 *
 * @param {Object} originalResult - The original result row (from getResults)
 * @param {Object} evaluation - The new evaluation scores
 * @param {Object} [evaluation.scores] - Per-dimension scores; each entry is either
 *   a plain value or an object with a `score` property
 * @returns {number} The new row ID
 */
export function storeRejudgment(originalResult, evaluation) {
  const stmt = db.prepare(`
    INSERT INTO evaluation_results (
      run_id, scenario_id, scenario_name, scenario_type,
      provider, model, profile_name, hyperparameters, prompt_id,
      ego_model, superego_model,
      suggestions, raw_response,
      latency_ms, input_tokens, output_tokens, cost, dialogue_rounds, api_calls, dialogue_id,
      score_relevance, score_specificity, score_pedagogical,
      score_personalization, score_actionability, score_tone, overall_score,
      base_score, recognition_score,
      passes_required, passes_forbidden, required_missing, forbidden_found,
      judge_model, evaluation_reasoning, scores_with_reasoning, success, error_message,
      factor_recognition, factor_multi_agent_tutor, factor_multi_agent_learner, learner_architecture,
      scoring_method,
      created_at
    ) VALUES (
      ?, ?, ?, ?,
      ?, ?, ?, ?, ?,
      ?, ?,
      ?, ?,
      ?, ?, ?, ?, ?, ?, ?,
      ?, ?, ?,
      ?, ?, ?, ?,
      ?, ?,
      ?, ?, ?, ?,
      ?, ?, ?, ?, ?,
      ?, ?, ?, ?,
      ?,
      ?
    )
  `);

  const scores = evaluation.scores || {};

  // A dimension may be stored as `{ score, ... }` or as a bare value.
  const dimensionScore = (name) => scores[name]?.score ?? scores[name] ?? null;

  // JSON columns may arrive already serialized or as parsed values.
  const asJsonText = (value, fallback) =>
    (typeof value === 'string' ? value : JSON.stringify(value || fallback));

  // parseResultRow() exposes factor tags as a nested `factors` object of
  // booleans (or null when no factor column is set), not as flat
  // factorXxx properties. Without this fallback the rejudged row would get
  // NULL factors and be excluded from getFactorialCellData().
  // NOTE(review): presumably getResults() returns parseResultRow() output — confirm.
  const parsedFactors = originalResult.factors || {};
  const factorColumn = (flatValue, parsedValue) => {
    const value = flatValue ?? parsedValue;
    if (value == null) return null;
    // Factor columns hold integers; booleans are converted to 1/0.
    if (typeof value === 'boolean') return value ? 1 : 0;
    return value;
  };

  const info = stmt.run(
    // Copied from the original result; undefined is normalised to null.
    originalResult.runId ?? null,
    originalResult.scenarioId ?? null,
    originalResult.scenarioName ?? null,
    originalResult.scenarioType || 'suggestion',
    originalResult.provider ?? null,
    originalResult.model ?? null,
    originalResult.profileName ?? null,
    asJsonText(originalResult.hyperparameters, {}),
    originalResult.promptId ?? null,
    originalResult.egoModel || null,
    originalResult.superegoModel || null,
    asJsonText(originalResult.suggestions, []),
    originalResult.rawResponse ?? null,
    originalResult.latencyMs ?? null,
    originalResult.inputTokens ?? null,
    originalResult.outputTokens ?? null,
    originalResult.cost ?? null,
    originalResult.dialogueRounds ?? null,
    originalResult.apiCalls ?? null,
    originalResult.dialogueId ?? null,
    // New scores from the new judge
    dimensionScore('relevance'),
    dimensionScore('specificity'),
    dimensionScore('pedagogical'),
    dimensionScore('personalization'),
    dimensionScore('actionability'),
    dimensionScore('tone'),
    evaluation.overallScore ?? null,
    evaluation.baseScore ?? null,
    evaluation.recognitionScore ?? null,
    evaluation.passesRequired ? 1 : 0,
    evaluation.passesForbidden ? 1 : 0,
    JSON.stringify(evaluation.requiredMissing || []),
    JSON.stringify(evaluation.forbiddenFound || []),
    evaluation.judgeModel || null,
    evaluation.summary || null,
    evaluation.scores ? JSON.stringify(evaluation.scores) : null,
    1, // success
    null, // error_message
    factorColumn(originalResult.factorRecognition, parsedFactors.recognition),
    factorColumn(originalResult.factorMultiAgentTutor, parsedFactors.multi_agent_tutor),
    factorColumn(originalResult.factorMultiAgentLearner, parsedFactors.multi_agent_learner),
    originalResult.learnerArchitecture || null,
    'rubric', // Rejudgments only store successful rubric evaluations
    new Date().toISOString()
  );

  return info.lastInsertRowid;
}
|
|
1491
|
+
|
|
1492
|
+
/**
 * Overwrite the judge-score columns of an existing `evaluation_results` row
 * in place (rejudging without keeping the previous judgment).
 *
 * `scoring_method` is always set to 'rubric'.
 *
 * @deprecated Use storeRejudgment() to preserve judgment history for reliability analysis
 * @param {number|string} resultId - `id` of the row to update
 * @param {Object} evaluation - The new evaluation scores
 * @param {Object} [evaluation.scores] - Per-dimension scores; each entry is either
 *   a plain value or an object with a `score` property
 */
export function updateResultScores(resultId, evaluation) {
  const update = db.prepare(`
    UPDATE evaluation_results SET
      score_relevance = ?,
      score_specificity = ?,
      score_pedagogical = ?,
      score_personalization = ?,
      score_actionability = ?,
      score_tone = ?,
      overall_score = ?,
      base_score = ?,
      recognition_score = ?,
      passes_required = ?,
      passes_forbidden = ?,
      required_missing = ?,
      forbidden_found = ?,
      judge_model = ?,
      evaluation_reasoning = ?,
      scores_with_reasoning = ?,
      scoring_method = ?
    WHERE id = ?
  `);

  const byDimension = evaluation.scores || {};
  // A dimension may be stored as `{ score, ... }` or as a bare value.
  const pick = (dimension) => byDimension[dimension]?.score ?? byDimension[dimension] ?? null;

  const dimensionValues = [
    'relevance',
    'specificity',
    'pedagogical',
    'personalization',
    'actionability',
    'tone',
  ].map(pick);

  const params = [
    ...dimensionValues,
    evaluation.overallScore ?? null,
    evaluation.baseScore ?? null,
    evaluation.recognitionScore ?? null,
    evaluation.passesRequired ? 1 : 0,
    evaluation.passesForbidden ? 1 : 0,
    JSON.stringify(evaluation.requiredMissing || []),
    JSON.stringify(evaluation.forbiddenFound || []),
    evaluation.judgeModel || null,
    evaluation.summary || null,
    evaluation.scores ? JSON.stringify(evaluation.scores) : null,
    'rubric', // Only called on successful evaluations
    resultId,
  ];

  update.run(...params);
}
|
|
1541
|
+
|
|
1542
|
+
/**
 * Write learner-side evaluation scores onto an `evaluation_results` row.
 *
 * Sets `learner_scores` (JSON text), `learner_overall_score` and
 * `learner_judge_model` for the row whose `id` matches `resultId`.
 *
 * @param {string} resultId - The evaluation result ID
 * @param {Object} evaluation - Learner evaluation data
 * @param {Object} evaluation.scores - Per-turn learner scores (JSON-serializable)
 * @param {number} evaluation.overallScore - Weighted average learner score (0-100)
 * @param {string} evaluation.judgeModel - Model used for judging
 */
export function updateResultLearnerScores(resultId, evaluation) {
  const serializedScores = JSON.stringify(evaluation.scores);
  const overall = evaluation.overallScore;
  // An empty or missing judge model is stored as NULL.
  const judge = evaluation.judgeModel || null;

  db.prepare(`
    UPDATE evaluation_results SET
      learner_scores = ?,
      learner_overall_score = ?,
      learner_judge_model = ?
    WHERE id = ?
  `).run(serializedScores, overall, judge, resultId);
}
|
|
1567
|
+
|
|
1568
|
+
/**
 * Fetch every row of `interaction_evaluations` for one run, ordered by
 * `created_at`, converted to camelCase objects.
 *
 * The JSON text columns are parsed: `learner_agents` and `turns` become `[]`
 * when the column is empty, and `learner_scores` becomes `null`.
 *
 * @param {string} runId - The run ID
 * @returns {Array} Array of interaction evaluation objects
 */
export function listInteractionEvalsByRunId(runId) {
  const records = db
    .prepare('SELECT * FROM interaction_evaluations WHERE run_id = ? ORDER BY created_at')
    .all(runId);

  const evals = [];
  for (const record of records) {
    evals.push({
      evalId: record.id,
      runId: record.run_id,
      scenarioId: record.scenario_id,
      scenarioName: record.scenario_name,
      evalType: record.eval_type,
      learnerProfile: record.learner_profile,
      tutorProfile: record.tutor_profile,
      personaId: record.persona_id,
      learnerAgents: JSON.parse(record.learner_agents || '[]'),
      turnCount: record.turn_count,
      turns: JSON.parse(record.turns || '[]'),
      formattedTranscript: record.formatted_transcript,
      totalTokens: record.total_tokens,
      finalLearnerState: record.final_learner_state,
      finalUnderstanding: record.final_understanding,
      judgeOverallScore: record.judge_overall_score,
      learnerScores: JSON.parse(record.learner_scores || 'null'),
      learnerOverallScore: record.learner_overall_score,
      learnerJudgeModel: record.learner_judge_model,
      createdAt: record.created_at,
    });
  }
  return evals;
}
|
|
1601
|
+
|
|
1602
|
+
/**
 * Write learner-side evaluation scores onto an `interaction_evaluations` row.
 *
 * Sets `learner_scores` (JSON text), `learner_overall_score` and
 * `learner_judge_model` for the row whose `id` matches `evalId`.
 *
 * @param {string} evalId - The interaction evaluation ID
 * @param {Object} evaluation - Learner evaluation data
 * @param {Object} evaluation.scores - Per-turn scores: { turnIndex: { dimension: {score, reasoning} } }
 * @param {number} evaluation.overallScore - Weighted average learner score (0-100)
 * @param {string} evaluation.judgeModel - Model used for judging
 */
export function updateInteractionLearnerScores(evalId, evaluation) {
  const serializedScores = JSON.stringify(evaluation.scores);
  const overall = evaluation.overallScore;
  // An empty or missing judge model is stored as NULL.
  const judge = evaluation.judgeModel || null;

  db.prepare(`
    UPDATE interaction_evaluations
    SET learner_scores = ?,
        learner_overall_score = ?,
        learner_judge_model = ?
    WHERE id = ?
  `).run(serializedScores, overall, judge, evalId);
}
|
|
1627
|
+
|
|
1099
1628
|
export default {
|
|
1100
1629
|
createRun,
|
|
1101
1630
|
updateRun,
|
|
1102
1631
|
storeResult,
|
|
1632
|
+
storeRejudgment,
|
|
1633
|
+
updateResultScores,
|
|
1634
|
+
updateResultLearnerScores,
|
|
1103
1635
|
getRun,
|
|
1104
1636
|
listRuns,
|
|
1105
1637
|
getResults,
|
|
@@ -1113,9 +1645,12 @@ export default {
|
|
|
1113
1645
|
findIncompleteRuns,
|
|
1114
1646
|
autoCompleteStaleRuns,
|
|
1115
1647
|
getIncompleteTests,
|
|
1648
|
+
getFactorialCellData,
|
|
1116
1649
|
// Interaction evaluations
|
|
1117
1650
|
storeInteractionEval,
|
|
1118
1651
|
listInteractionEvals,
|
|
1652
|
+
listInteractionEvalsByRunId,
|
|
1119
1653
|
getInteractionEval,
|
|
1120
1654
|
getInteractionEvalByRunId,
|
|
1655
|
+
updateInteractionLearnerScores,
|
|
1121
1656
|
};
|