@machinespirits/eval 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. package/components/MobileEvalDashboard.tsx +267 -0
  2. package/components/comparison/DeltaAnalysisTable.tsx +137 -0
  3. package/components/comparison/ProfileComparisonCard.tsx +176 -0
  4. package/components/comparison/RecognitionABMode.tsx +385 -0
  5. package/components/comparison/RecognitionMetricsPanel.tsx +135 -0
  6. package/components/comparison/WinnerIndicator.tsx +64 -0
  7. package/components/comparison/index.ts +5 -0
  8. package/components/mobile/BottomSheet.tsx +233 -0
  9. package/components/mobile/DimensionBreakdown.tsx +210 -0
  10. package/components/mobile/DocsView.tsx +363 -0
  11. package/components/mobile/LogsView.tsx +481 -0
  12. package/components/mobile/PsychodynamicQuadrant.tsx +261 -0
  13. package/components/mobile/QuickTestView.tsx +1098 -0
  14. package/components/mobile/RecognitionTypeChart.tsx +124 -0
  15. package/components/mobile/RecognitionView.tsx +809 -0
  16. package/components/mobile/RunDetailView.tsx +261 -0
  17. package/components/mobile/RunHistoryView.tsx +367 -0
  18. package/components/mobile/ScoreRadial.tsx +211 -0
  19. package/components/mobile/StreamingLogPanel.tsx +230 -0
  20. package/components/mobile/SynthesisStrategyChart.tsx +140 -0
  21. package/config/interaction-eval-scenarios.yaml +832 -0
  22. package/config/learner-agents.yaml +248 -0
  23. package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +52 -0
  24. package/docs/research/ABLATION-MODEL-SELECTION.md +53 -0
  25. package/docs/research/ADVANCED-EVAL-ANALYSIS.md +60 -0
  26. package/docs/research/ANOVA-RESULTS-2026-01-14.md +257 -0
  27. package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +586 -0
  28. package/docs/research/COST-ANALYSIS.md +56 -0
  29. package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +340 -0
  30. package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +291 -0
  31. package/docs/research/EVAL-SYSTEM-ANALYSIS.md +306 -0
  32. package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +301 -0
  33. package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +1988 -0
  34. package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +282 -0
  35. package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +147 -0
  36. package/docs/research/PAPER-EXTENSION-DYADIC.md +204 -0
  37. package/docs/research/PAPER-UNIFIED.md +659 -0
  38. package/docs/research/PAPER-UNIFIED.pdf +0 -0
  39. package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +356 -0
  40. package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +419 -0
  41. package/docs/research/apa.csl +2133 -0
  42. package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +1637 -0
  43. package/docs/research/archive/paper-multiagent-tutor.tex +978 -0
  44. package/docs/research/paper-draft/full-paper.md +136 -0
  45. package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
  46. package/docs/research/paper-draft/references.bib +515 -0
  47. package/docs/research/transcript-baseline.md +139 -0
  48. package/docs/research/transcript-recognition-multiagent.md +187 -0
  49. package/hooks/useEvalData.ts +625 -0
  50. package/index.js +27 -0
  51. package/package.json +73 -0
  52. package/routes/evalRoutes.js +3002 -0
  53. package/scripts/advanced-eval-analysis.js +351 -0
  54. package/scripts/analyze-eval-costs.js +378 -0
  55. package/scripts/analyze-eval-results.js +513 -0
  56. package/scripts/analyze-interaction-evals.js +368 -0
  57. package/server-init.js +45 -0
  58. package/server.js +162 -0
  59. package/services/benchmarkService.js +1892 -0
  60. package/services/evaluationRunner.js +739 -0
  61. package/services/evaluationStore.js +1121 -0
  62. package/services/learnerConfigLoader.js +385 -0
  63. package/services/learnerTutorInteractionEngine.js +857 -0
  64. package/services/memory/learnerMemoryService.js +1227 -0
  65. package/services/memory/learnerWritingPad.js +577 -0
  66. package/services/memory/tutorWritingPad.js +674 -0
  67. package/services/promptRecommendationService.js +493 -0
  68. package/services/rubricEvaluator.js +826 -0
@@ -0,0 +1,1121 @@
1
+ /**
2
+ * Evaluation Store Service
3
+ *
4
+ * SQLite-based storage for AI tutor evaluation results.
5
+ * Supports querying, aggregation, comparison, and export.
6
+ */
7
+
8
import { randomBytes } from 'crypto';
import fs from 'fs';
import path from 'path';
import { fileURLToPath } from 'url';

import Database from 'better-sqlite3';
12
+
13
// Resolve the package root relative to this module (ES modules have no
// built-in __filename/__dirname, so they are derived from import.meta.url).
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
const ROOT_DIR = path.resolve(__dirname, '..');
const DATA_DIR = path.join(ROOT_DIR, 'data');

// better-sqlite3 creates a missing database file but not a missing parent
// directory, so make sure the data directory exists before opening.
// `recursive: true` makes this a no-op when the directory is already there.
fs.mkdirSync(DATA_DIR, { recursive: true });

// Initialize database
const dbPath = path.join(DATA_DIR, 'evaluations.db');
const db = new Database(dbPath);

// Enable WAL mode for better concurrent access
db.pragma('journal_mode = WAL');
24
+
25
// Base schema: the `evaluation_runs` and `evaluation_results` tables plus
// their lookup indexes. Every statement uses IF NOT EXISTS, so executing the
// script on each module load is harmless for an already-initialized database.
{
  const baseSchemaSql = `
  -- Evaluation runs (batches of tests)
  CREATE TABLE IF NOT EXISTS evaluation_runs (
    id TEXT PRIMARY KEY,
    created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
    description TEXT,
    total_scenarios INTEGER DEFAULT 0,
    total_configurations INTEGER DEFAULT 0,
    total_tests INTEGER DEFAULT 0,
    status TEXT DEFAULT 'running',
    completed_at DATETIME,
    metadata TEXT -- JSON
  );

  -- Individual evaluation results
  CREATE TABLE IF NOT EXISTS evaluation_results (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    run_id TEXT REFERENCES evaluation_runs(id),
    scenario_id TEXT NOT NULL,
    scenario_name TEXT,

    -- Configuration
    provider TEXT NOT NULL,
    model TEXT NOT NULL,
    profile_name TEXT,
    hyperparameters TEXT, -- JSON
    prompt_id TEXT,

    -- Raw output
    suggestions TEXT, -- JSON array
    raw_response TEXT,

    -- Performance metrics
    latency_ms INTEGER,
    input_tokens INTEGER,
    output_tokens INTEGER,
    cost REAL, -- OpenRouter API cost in USD
    dialogue_rounds INTEGER,
    api_calls INTEGER,
    dialogue_id TEXT, -- For linking to dialogue logs

    -- Rubric scores (1-5 scale)
    score_relevance REAL,
    score_specificity REAL,
    score_pedagogical REAL,
    score_personalization REAL,
    score_actionability REAL,
    score_tone REAL,
    overall_score REAL,

    -- Validation
    passes_required BOOLEAN,
    passes_forbidden BOOLEAN,
    required_missing TEXT, -- JSON array
    forbidden_found TEXT, -- JSON array

    -- Metadata
    created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
    evaluator_model TEXT,
    evaluation_reasoning TEXT,
    success BOOLEAN DEFAULT 1,
    error_message TEXT
  );

  -- Indexes for efficient querying
  CREATE INDEX IF NOT EXISTS idx_results_run ON evaluation_results(run_id);
  CREATE INDEX IF NOT EXISTS idx_results_scenario ON evaluation_results(scenario_id);
  CREATE INDEX IF NOT EXISTS idx_results_provider ON evaluation_results(provider, model);
  CREATE INDEX IF NOT EXISTS idx_results_created ON evaluation_results(created_at);
  CREATE INDEX IF NOT EXISTS idx_runs_created ON evaluation_runs(created_at);
`;

  db.exec(baseSchemaSql);
}
97
+
98
// Lightweight migrations for databases created by older versions of this
// module. Each step is idempotent. Only the *expected* failure (the column is
// already present) is ignored; anything else is reported instead of being
// silently swallowed, but never aborts module load.
{
  /**
   * Add a column to `evaluation_results` unless it already exists.
   * @param {string} columnDdl - Column definition, e.g. `cost REAL`.
   */
  const addResultColumnIfMissing = (columnDdl) => {
    try {
      db.exec(`ALTER TABLE evaluation_results ADD COLUMN ${columnDdl}`);
    } catch (err) {
      // SQLite reports an existing column as "duplicate column name: <name>".
      if (!/duplicate column name/i.test(String(err?.message))) {
        console.warn(`[evaluationStore] Migration "ADD COLUMN ${columnDdl}" failed:`, err);
      }
    }
  };

  // Migration: Add dialogue_id column if it doesn't exist
  addResultColumnIfMissing('dialogue_id TEXT');
  db.exec(`CREATE INDEX IF NOT EXISTS idx_results_dialogue ON evaluation_results(dialogue_id)`);

  // Migration: Add scores_with_reasoning column if it doesn't exist
  addResultColumnIfMissing('scores_with_reasoning TEXT');

  // Migration: Add cost column if it doesn't exist
  addResultColumnIfMissing('cost REAL');

  // Migration: Revert any accidental renames (batch→matrix, interact→interaction)
  // by rewriting the runType value inside the JSON metadata text.
  try {
    const revertRuns = db.prepare(`
      UPDATE evaluation_runs
      SET metadata = REPLACE(REPLACE(metadata, '"runType":"batch"', '"runType":"matrix"'), '"runType":"interact"', '"runType":"interaction"')
      WHERE metadata LIKE '%"runType":"batch"%' OR metadata LIKE '%"runType":"interact"%'
    `);
    revertRuns.run();
  } catch (err) {
    // Best-effort clean-up: keep loading, but leave a trace of the failure.
    console.warn('[evaluationStore] runType metadata migration failed:', err);
  }
}
131
+
132
// Schema for learner-tutor interaction evaluations: one table plus its lookup
// indexes. All statements use IF NOT EXISTS, so re-running them is harmless.
{
  const interactionSchemaSql = `
  -- Interaction evaluation results (learner-tutor dialogues)
  CREATE TABLE IF NOT EXISTS interaction_evaluations (
    id TEXT PRIMARY KEY,
    run_id TEXT REFERENCES evaluation_runs(id),
    scenario_id TEXT NOT NULL,
    scenario_name TEXT,
    eval_type TEXT DEFAULT 'short_term',

    -- Configuration
    learner_profile TEXT,
    tutor_profile TEXT,
    persona_id TEXT,
    learner_agents TEXT, -- JSON array of agent roles

    -- Interaction data
    turn_count INTEGER,
    turns TEXT, -- JSON array of turn objects
    sequence_diagram TEXT,
    formatted_transcript TEXT,

    -- Memory snapshots
    learner_memory_before TEXT, -- JSON
    learner_memory_after TEXT, -- JSON
    tutor_memory_before TEXT, -- JSON
    tutor_memory_after TEXT, -- JSON

    -- Metrics
    total_tokens INTEGER,
    learner_tokens INTEGER,
    tutor_tokens INTEGER,
    latency_ms INTEGER,

    -- Outcomes
    final_learner_state TEXT,
    final_understanding TEXT,
    unique_outcomes TEXT, -- JSON array

    -- Judge evaluation
    judge_overall_score REAL,
    judge_evaluation TEXT, -- JSON

    -- Timestamps
    created_at DATETIME DEFAULT CURRENT_TIMESTAMP
  );

  CREATE INDEX IF NOT EXISTS idx_interaction_run ON interaction_evaluations(run_id);
  CREATE INDEX IF NOT EXISTS idx_interaction_scenario ON interaction_evaluations(scenario_id);
  CREATE INDEX IF NOT EXISTS idx_interaction_created ON interaction_evaluations(created_at);
`;

  db.exec(interactionSchemaSql);
}
183
+
184
/**
 * Build a fresh identifier for an evaluation run.
 *
 * The id has the shape `eval-<YYYY-MM-DD>-<8 hex chars>`: the date is the
 * first ten characters of the current ISO-8601 (UTC) timestamp and the suffix
 * is four random bytes rendered as hex.
 *
 * @returns {string} The generated run id
 */
function generateRunId() {
  const isoNow = new Date().toISOString();
  const datePart = isoNow.substring(0, 10);
  const randomHex = randomBytes(4).toString('hex');
  return ['eval', datePart, randomHex].join('-');
}
192
+
193
/**
 * Register a new evaluation run.
 *
 * Inserts a row into `evaluation_runs` (status and created_at come from the
 * column defaults) and returns a summary object for the caller.
 *
 * @param {Object} options - Run options
 * @param {string|null} [options.description=null] - Free-text description
 * @param {number} [options.totalScenarios=0] - Number of scenarios in the run
 * @param {number} [options.totalConfigurations=0] - Number of configurations in the run
 * @param {Object} [options.metadata={}] - Stored as a JSON string
 * @returns {Object} Created run: { id, description, totalScenarios, totalConfigurations, status, createdAt }
 */
export function createRun(options = {}) {
  const {
    description = null,
    totalScenarios: scenarioCount = 0,
    totalConfigurations: configurationCount = 0,
    metadata = {},
  } = options;

  const runId = generateRunId();
  const serializedMetadata = JSON.stringify(metadata);

  db.prepare(`
    INSERT INTO evaluation_runs (id, description, total_scenarios, total_configurations, metadata)
    VALUES (?, ?, ?, ?, ?)
  `).run(runId, description, scenarioCount, configurationCount, serializedMetadata);

  // Note: this timestamp is taken in JS after the insert; the row's own
  // created_at is filled in by the database default.
  const createdAt = new Date().toISOString();

  return {
    id: runId,
    description,
    totalScenarios: scenarioCount,
    totalConfigurations: configurationCount,
    status: 'running',
    createdAt,
  };
}
225
+
226
/**
 * Update a run's status and, optionally, its test count and completion time.
 *
 * - `status === 'completed'`: always writes status, total_tests (default 0)
 *   and completed_at (default: now).
 * - Any other status: writes the status, and additionally persists
 *   `totalTests` / `completedAt` when the caller supplied them (e.g. when a
 *   run is marked as failed with a completion time). Fields that are not
 *   supplied are left untouched.
 *
 * @param {string} runId - The run ID
 * @param {Object} updates - Fields to update
 * @param {string} [updates.status] - New status
 * @param {number} [updates.totalTests] - Number of tests recorded for the run
 * @param {string} [updates.completedAt] - Completion timestamp
 */
export function updateRun(runId, updates) {
  const { status, totalTests, completedAt } = updates;

  if (status === 'completed') {
    const stmt = db.prepare(`
      UPDATE evaluation_runs
      SET status = ?, total_tests = ?, completed_at = ?
      WHERE id = ?
    `);
    stmt.run(status, totalTests || 0, completedAt || new Date().toISOString(), runId);
    return;
  }

  // Build the SET clause from the fields that were actually provided.
  // Column names are fixed literals; only values are bound.
  const assignments = [];
  const params = [];
  if (status != null) {
    assignments.push('status = ?');
    params.push(status);
  }
  if (totalTests != null) {
    assignments.push('total_tests = ?');
    params.push(totalTests);
  }
  if (completedAt != null) {
    assignments.push('completed_at = ?');
    params.push(completedAt);
  }

  if (assignments.length === 0) {
    return; // nothing to update
  }

  db.prepare(`UPDATE evaluation_runs SET ${assignments.join(', ')} WHERE id = ?`).run(...params, runId);
}
246
+
247
/**
 * Store an individual evaluation result
 *
 * Object/array fields (hyperparameters, suggestions, requiredMissing,
 * forbiddenFound, scoresWithReasoning) are stored as JSON text and boolean
 * flags as 1/0.
 *
 * @param {string} runId - The run ID
 * @param {Object} result - The evaluation result
 * @returns {number} Inserted row ID
 */
export function storeResult(runId, result) {
  const scores = result.scores ?? {};

  // Column/value pairs kept side by side so the column list, the placeholder
  // list and the bound values can never drift out of step.
  const fields = [
    ['run_id', runId],
    ['scenario_id', result.scenarioId],
    ['scenario_name', result.scenarioName],
    ['provider', result.provider],
    ['model', result.model],
    ['profile_name', result.profileName],
    ['hyperparameters', JSON.stringify(result.hyperparameters || {})],
    ['prompt_id', result.promptId],
    ['suggestions', JSON.stringify(result.suggestions || [])],
    ['raw_response', result.rawResponse],
    ['latency_ms', result.latencyMs],
    ['input_tokens', result.inputTokens],
    ['output_tokens', result.outputTokens],
    ['cost', result.cost],
    ['dialogue_rounds', result.dialogueRounds],
    ['api_calls', result.apiCalls],
    ['dialogue_id', result.dialogueId],
    ['score_relevance', scores.relevance],
    ['score_specificity', scores.specificity],
    ['score_pedagogical', scores.pedagogical],
    ['score_personalization', scores.personalization],
    ['score_actionability', scores.actionability],
    ['score_tone', scores.tone],
    ['overall_score', result.overallScore],
    ['passes_required', result.passesRequired ? 1 : 0],
    ['passes_forbidden', result.passesForbidden ? 1 : 0],
    ['required_missing', JSON.stringify(result.requiredMissing || [])],
    ['forbidden_found', JSON.stringify(result.forbiddenFound || [])],
    ['evaluator_model', result.evaluatorModel],
    ['evaluation_reasoning', result.evaluationReasoning],
    ['scores_with_reasoning', result.scoresWithReasoning ? JSON.stringify(result.scoresWithReasoning) : null],
    ['success', result.success ? 1 : 0],
    ['error_message', result.errorMessage],
  ];

  const columnList = fields.map(([column]) => column).join(', ');
  const placeholderList = fields.map(() => '?').join(', ');

  // better-sqlite3 binds only numbers, strings, bigints, buffers and null;
  // an `undefined` parameter is rejected. Optional fields that the caller did
  // not supply are therefore stored as NULL.
  const params = fields.map(([, value]) => (value === undefined ? null : value));

  const info = db
    .prepare(`INSERT INTO evaluation_results (${columnList}) VALUES (${placeholderList})`)
    .run(...params);

  return info.lastInsertRowid;
}
315
+
316
/**
 * Look up a single evaluation run.
 *
 * @param {string} runId - The run ID
 * @returns {Object|null} The run with camelCase keys and parsed metadata, or
 *   null when no row has that id
 */
export function getRun(runId) {
  const record = db.prepare('SELECT * FROM evaluation_runs WHERE id = ?').get(runId);
  if (!record) {
    return null;
  }

  const {
    id,
    created_at: createdAt,
    description,
    total_scenarios: totalScenarios,
    total_configurations: totalConfigurations,
    total_tests: totalTests,
    status,
    completed_at: completedAt,
    metadata: metadataJson,
  } = record;

  return {
    id,
    createdAt,
    description,
    totalScenarios,
    totalConfigurations,
    totalTests,
    status,
    completedAt,
    // metadata is stored as JSON text; an empty/NULL column yields {}
    metadata: JSON.parse(metadataJson || '{}'),
  };
}
336
+
337
/**
 * List runs, newest first, each annotated with the distinct scenario names
 * found among its stored results.
 *
 * @param {Object} options - Query options
 * @param {number} [options.limit=20] - Maximum number of runs returned
 * @param {string|null} [options.status=null] - Only return runs with this status
 * @returns {Array<Object>} Run summaries
 */
export function listRuns(options = {}) {
  const { limit = 20, status = null } = options;

  const statusClause = status ? ' WHERE status = ?' : '';
  const bindings = status ? [status, limit] : [limit];
  const runRows = db
    .prepare(`SELECT * FROM evaluation_runs${statusClause} ORDER BY created_at DESC LIMIT ?`)
    .all(...bindings);

  // Prepared once and re-executed per run to collect its scenario names.
  const scenarioNamesStmt = db.prepare(`
    SELECT DISTINCT scenario_name FROM evaluation_results
    WHERE run_id = ? AND scenario_name IS NOT NULL
    ORDER BY scenario_name
  `);

  const summaries = [];
  for (const runRow of runRows) {
    const scenarioNames = [];
    for (const { scenario_name: name } of scenarioNamesStmt.all(runRow.id)) {
      if (name) {
        scenarioNames.push(name);
      }
    }

    summaries.push({
      id: runRow.id,
      createdAt: runRow.created_at,
      description: runRow.description,
      totalScenarios: runRow.total_scenarios,
      totalConfigurations: runRow.total_configurations,
      totalTests: runRow.total_tests,
      status: runRow.status,
      completedAt: runRow.completed_at,
      scenarioNames,
      metadata: JSON.parse(runRow.metadata || '{}'),
    });
  }

  return summaries;
}
382
+
383
/**
 * Fetch the stored results of a run, oldest first.
 *
 * @param {string} runId - The run ID
 * @param {Object} options - Optional equality filters
 * @param {string|null} [options.scenarioId=null] - Restrict to one scenario
 * @param {string|null} [options.provider=null] - Restrict to one provider
 * @param {string|null} [options.model=null] - Restrict to one model
 * @returns {Array<Object>} Result objects as produced by parseResultRow
 */
export function getResults(runId, options = {}) {
  const { scenarioId = null, provider = null, model = null } = options;

  // Only filters with a truthy value take part in the query.
  const activeFilters = [
    ['scenario_id', scenarioId],
    ['provider', provider],
    ['model', model],
  ].filter(([, value]) => value);

  const sql = [
    'SELECT * FROM evaluation_results WHERE run_id = ?',
    ...activeFilters.map(([column]) => ` AND ${column} = ?`),
    ' ORDER BY created_at',
  ].join('');

  const bindings = [runId, ...activeFilters.map(([, value]) => value)];

  return db.prepare(sql).all(...bindings).map(parseResultRow);
}
414
+
415
/**
 * Aggregate a run's results per (provider, model) pair, best average score
 * first.
 *
 * @param {string} runId - The run ID
 * @returns {Array<Object>} One entry per provider/model with counts, average
 *   scores per rubric dimension, latency/token totals and validation rates
 */
export function getRunStats(runId) {
  const aggregates = db
    .prepare(`
      SELECT
        provider,
        model,
        COUNT(*) as total_tests,
        SUM(CASE WHEN success = 1 THEN 1 ELSE 0 END) as successful_tests,
        AVG(overall_score) as avg_score,
        AVG(score_relevance) as avg_relevance,
        AVG(score_specificity) as avg_specificity,
        AVG(score_pedagogical) as avg_pedagogical,
        AVG(score_personalization) as avg_personalization,
        AVG(score_actionability) as avg_actionability,
        AVG(score_tone) as avg_tone,
        AVG(latency_ms) as avg_latency,
        SUM(input_tokens) as total_input_tokens,
        SUM(output_tokens) as total_output_tokens,
        SUM(CASE WHEN passes_required = 1 THEN 1 ELSE 0 END) as passes_required,
        SUM(CASE WHEN passes_forbidden = 1 THEN 1 ELSE 0 END) as passes_forbidden
      FROM evaluation_results
      WHERE run_id = ?
      GROUP BY provider, model
      ORDER BY avg_score DESC
    `)
    .all(runId);

  const stats = [];
  for (const agg of aggregates) {
    const hasTests = agg.total_tests > 0;

    stats.push({
      provider: agg.provider,
      model: agg.model,
      totalTests: agg.total_tests,
      successfulTests: agg.successful_tests,
      successRate: hasTests ? agg.successful_tests / agg.total_tests : 0,
      avgScore: agg.avg_score,
      dimensions: {
        relevance: agg.avg_relevance,
        specificity: agg.avg_specificity,
        pedagogical: agg.avg_pedagogical,
        personalization: agg.avg_personalization,
        actionability: agg.avg_actionability,
        tone: agg.avg_tone,
      },
      avgLatencyMs: agg.avg_latency,
      totalInputTokens: agg.total_input_tokens,
      totalOutputTokens: agg.total_output_tokens,
      passesRequired: agg.passes_required,
      passesForbidden: agg.passes_forbidden,
      // Each test contributes two checks (required + forbidden).
      validationPassRate: hasTests
        ? (agg.passes_required + agg.passes_forbidden) / (agg.total_tests * 2)
        : 0,
    });
  }

  return stats;
}
470
+
471
/**
 * Get scenario-level statistics for a run
 *
 * Results are aggregated per (scenario, provider, model) and then grouped by
 * scenario. Scenarios appear in the order returned by the query (sorted by
 * scenario_id), and within a scenario the configurations are ordered by
 * average score, best first.
 *
 * @param {string} runId - The run ID
 * @returns {Array<Object>} Entries of { scenarioId, scenarioName, configurations }
 */
export function getScenarioStats(runId) {
  const stmt = db.prepare(`
    SELECT
      scenario_id,
      scenario_name,
      provider,
      model,
      AVG(overall_score) as avg_score,
      AVG(latency_ms) as avg_latency,
      SUM(CASE WHEN passes_required = 1 AND passes_forbidden = 1 THEN 1 ELSE 0 END) as passes_validation,
      COUNT(*) as runs
    FROM evaluation_results
    WHERE run_id = ?
    GROUP BY scenario_id, provider, model
    ORDER BY scenario_id, avg_score DESC
  `);

  const rows = stmt.all(runId);

  // Group by scenario. A Map keeps the query's ordering (a plain object would
  // move integer-like ids to the front) and is safe for ids such as
  // "__proto__" that collide with Object.prototype members.
  const grouped = new Map();
  for (const row of rows) {
    let entry = grouped.get(row.scenario_id);
    if (!entry) {
      entry = {
        scenarioId: row.scenario_id,
        scenarioName: row.scenario_name,
        configurations: [],
      };
      grouped.set(row.scenario_id, entry);
    }

    entry.configurations.push({
      provider: row.provider,
      model: row.model,
      avgScore: row.avg_score,
      avgLatencyMs: row.avg_latency,
      // true only when every run of this configuration passed both checks
      passesValidation: row.passes_validation === row.runs,
      runs: row.runs,
    });
  }

  return [...grouped.values()];
}
515
+
516
/**
 * Compare two configurations across all scenarios
 *
 * For every scenario that either configuration has results for, reports both
 * average scores, their difference (config1 - config2) and the winner, plus
 * overall win/tie counts and mean scores.
 *
 * @param {string} runId - The run ID
 * @param {Object} config1 - First configuration: { provider, model }
 * @param {Object} config2 - Second configuration: { provider, model }
 * @returns {Object} { comparison: Array<Object>, overall: Object }
 */
export function compareConfigs(runId, config1, config2) {
  // Same query for both configurations, so prepare it once.
  const perScenarioStmt = db.prepare(`
    SELECT
      scenario_id,
      AVG(overall_score) as avg_score,
      AVG(score_relevance) as relevance,
      AVG(score_specificity) as specificity,
      AVG(score_pedagogical) as pedagogical,
      AVG(score_personalization) as personalization,
      AVG(score_actionability) as actionability,
      AVG(score_tone) as tone,
      AVG(latency_ms) as latency,
      SUM(CASE WHEN passes_required = 1 AND passes_forbidden = 1 THEN 1 ELSE 0 END) * 1.0 / COUNT(*) as pass_rate
    FROM evaluation_results
    WHERE run_id = ? AND provider = ? AND model = ?
    GROUP BY scenario_id
  `);

  const results1 = perScenarioStmt.all(runId, config1.provider, config1.model);
  const results2 = perScenarioStmt.all(runId, config2.provider, config2.model);

  // Index by scenario so each lookup below is O(1) instead of a linear scan.
  const byScenario1 = new Map(results1.map((r) => [r.scenario_id, r]));
  const byScenario2 = new Map(results2.map((r) => [r.scenario_id, r]));

  // Build comparison
  const comparison = [];
  const scenarios = new Set([...byScenario1.keys(), ...byScenario2.keys()]);

  for (const scenarioId of scenarios) {
    const raw1 = byScenario1.get(scenarioId)?.avg_score;
    const raw2 = byScenario2.get(scenarioId)?.avg_score;

    let winner = 'tie';
    if (raw1 > raw2) {
      winner = 'config1';
    } else if (raw2 > raw1) {
      winner = 'config2';
    }

    comparison.push({
      scenarioId,
      // `??` rather than `||`: an average of exactly 0 is a real score and
      // must not be reported as "no score".
      config1Score: raw1 ?? null,
      config2Score: raw2 ?? null,
      difference: (raw1 ?? 0) - (raw2 ?? 0),
      winner,
    });
  }

  // Overall stats
  const overall = {
    config1Wins: comparison.filter((c) => c.winner === 'config1').length,
    config2Wins: comparison.filter((c) => c.winner === 'config2').length,
    ties: comparison.filter((c) => c.winner === 'tie').length,
    config1AvgScore: results1.reduce((sum, r) => sum + r.avg_score, 0) / (results1.length || 1),
    config2AvgScore: results2.reduce((sum, r) => sum + r.avg_score, 0) / (results2.length || 1),
  };

  return { comparison, overall };
}
573
+
574
/**
 * Bundle everything known about a run into one plain object suitable for
 * JSON serialization.
 *
 * @param {string} runId - The run ID
 * @returns {Object} { run, stats, scenarioStats, results, exportedAt }
 */
export function exportToJson(runId) {
  // Queries run in this order: run, results, run stats, scenario stats.
  const [run, results, stats, scenarioStats] = [
    getRun,
    getResults,
    getRunStats,
    getScenarioStats,
  ].map((load) => load(runId));

  const exportedAt = new Date().toISOString();

  return { run, stats, scenarioStats, results, exportedAt };
}
591
+
592
/**
 * Export results to CSV format
 *
 * Produces a header line followed by one line per stored result. Cells are
 * quoted following RFC 4180 when they contain a comma, a double quote or a
 * line break; null/undefined values become empty cells.
 *
 * @param {string} runId - The run ID
 * @returns {string} CSV text with "\n" line separators
 */
export function exportToCsv(runId) {
  const results = getResults(runId);

  const headers = [
    'scenario_id', 'scenario_name', 'provider', 'model',
    'overall_score', 'relevance', 'specificity', 'pedagogical',
    'personalization', 'actionability', 'tone',
    'latency_ms', 'input_tokens', 'output_tokens',
    'passes_required', 'passes_forbidden', 'success',
  ];

  // Quote a cell only when needed; embedded quotes are doubled.
  const toCell = (value) => {
    if (value === null || value === undefined) {
      return '';
    }
    const text = String(value);
    return /[",\r\n]/.test(text) ? `"${text.replace(/"/g, '""')}"` : text;
  };

  // `scores` comes either from the numeric score_* columns or from the
  // parsed scores_with_reasoning JSON, in which case an entry may be an
  // object rather than a number.
  // NOTE(review): presumably such objects look like { score, reasoning };
  // confirm against the evaluator that writes scoresWithReasoning.
  const toScore = (entry) => (entry !== null && typeof entry === 'object' ? entry.score : entry);

  const lines = results.map((r) => [
    r.scenarioId,
    r.scenarioName,
    r.provider,
    r.model,
    r.overallScore,
    toScore(r.scores?.relevance),
    toScore(r.scores?.specificity),
    toScore(r.scores?.pedagogical),
    toScore(r.scores?.personalization),
    toScore(r.scores?.actionability),
    toScore(r.scores?.tone),
    r.latencyMs,
    r.inputTokens,
    r.outputTokens,
    r.passesRequired ? 1 : 0,
    r.passesForbidden ? 1 : 0,
    r.success ? 1 : 0,
  ].map(toCell).join(','));

  return [headers.join(','), ...lines].join('\n');
}
628
+
629
/**
 * Complete an incomplete evaluation run
 *
 * Marks a stuck/interrupted run as completed with whatever results exist,
 * using the timestamp of the most recent result as the completion time.
 * A run without any results is marked as failed instead.
 *
 * @param {string} runId - The run ID to complete
 * @returns {Object} Completion summary
 * @throws {Error} When no run with this ID exists
 */
export function completeRun(runId) {
  const run = getRun(runId);
  if (!run) {
    throw new Error(`Run not found: ${runId}`);
  }

  if (run.status === 'completed') {
    return {
      alreadyCompleted: true,
      runId,
      message: 'Run was already marked as completed',
    };
  }

  // Get all results for this run
  const results = getResults(runId);
  const expectedTests = run.totalScenarios * run.totalConfigurations;

  if (results.length === 0) {
    // No results at all - mark as failed
    updateRun(runId, {
      status: 'failed',
      totalTests: 0,
      completedAt: new Date().toISOString(),
    });

    return {
      runId,
      status: 'failed',
      message: 'No results found - marked as failed',
      resultsFound: 0,
      expectedTests,
    };
  }

  // created_at is filled by SQLite's CURRENT_TIMESTAMP: "YYYY-MM-DD HH:MM:SS"
  // in UTC, without a zone marker. Parsing such a string with `new Date()` is
  // implementation-defined (local time in V8), which would shift the result
  // by the machine's UTC offset, so zone-less values are read as UTC here.
  const toEpochMs = (value) => {
    const text = String(value);
    const isDateTime = /^\d{4}-\d{2}-\d{2}[ T]\d{2}:\d{2}/.test(text);
    const hasZone = /(?:Z|[+-]\d{2}:?\d{2})$/i.test(text);
    return new Date(isDateTime && !hasZone ? `${text.replace(' ', 'T')}Z` : text).getTime();
  };

  // Find the last result timestamp (unparseable values yield NaN and are skipped)
  let lastResultTime = 0;
  for (const result of results) {
    const time = toEpochMs(result.createdAt);
    if (time > lastResultTime) {
      lastResultTime = time;
    }
  }

  // Fall back to "now" rather than the epoch if no timestamp could be parsed.
  const completedAt = new Date(lastResultTime > 0 ? lastResultTime : Date.now()).toISOString();

  // Update run as completed with partial results
  updateRun(runId, {
    status: 'completed',
    totalTests: results.length,
    completedAt,
  });

  // Calculate completion percentage
  const completionRate = expectedTests > 0 ? (results.length / expectedTests) * 100 : 0;

  // Count results per profile
  const profileBreakdown = {};
  for (const result of results) {
    const profile = result.profileName || 'unknown';
    profileBreakdown[profile] = (profileBreakdown[profile] || 0) + 1;
  }

  return {
    runId,
    status: 'completed',
    message: 'Run marked as completed with partial results',
    resultsFound: results.length,
    expectedTests,
    completionRate: Math.round(completionRate),
    completedAt,
    profileBreakdown,
    wasPartial: results.length < expectedTests,
  };
}
713
+
714
/**
 * Find all incomplete (stuck) evaluation runs
 *
 * A run counts as incomplete when its status is still 'running' and it was
 * created more than `olderThanMinutes` minutes ago.
 *
 * @param {Object} options - Query options
 * @param {number} [options.olderThanMinutes=30] - Minimum age of a run, in minutes
 * @returns {Array} List of incomplete runs, newest first
 */
export function findIncompleteRuns(options = {}) {
  const { olderThanMinutes = 30 } = options;

  const cutoffTime = new Date(Date.now() - olderThanMinutes * 60 * 1000).toISOString();

  // created_at is stored by CURRENT_TIMESTAMP as "YYYY-MM-DD HH:MM:SS" (UTC),
  // while the cutoff is ISO-8601 ("YYYY-MM-DDTHH:MM:SS.sssZ"). Compared as raw
  // text, ' ' sorts before 'T', so every run created on the cutoff's date
  // would look older than the cutoff. datetime() normalizes both sides to the
  // same format before comparing.
  const stmt = db.prepare(`
    SELECT * FROM evaluation_runs
    WHERE status = 'running'
    AND datetime(created_at) < datetime(?)
    ORDER BY created_at DESC
  `);

  const rows = stmt.all(cutoffTime);

  // Prepared once and reused for every run.
  const resultsStmt = db.prepare('SELECT COUNT(*) as count FROM evaluation_results WHERE run_id = ?');

  // Zone-less SQLite timestamps are UTC; `new Date()` would otherwise read
  // them as local time (implementation-defined parsing) and skew the age.
  const toEpochMs = (value) => {
    const text = String(value);
    const isDateTime = /^\d{4}-\d{2}-\d{2}[ T]\d{2}:\d{2}/.test(text);
    const hasZone = /(?:Z|[+-]\d{2}:?\d{2})$/i.test(text);
    return new Date(isDateTime && !hasZone ? `${text.replace(' ', 'T')}Z` : text).getTime();
  };

  return rows.map((row) => {
    const resultsCount = resultsStmt.get(row.id).count;

    return {
      id: row.id,
      createdAt: row.created_at,
      description: row.description,
      totalScenarios: row.total_scenarios,
      totalConfigurations: row.total_configurations,
      expectedTests: row.total_scenarios * row.total_configurations,
      resultsFound: resultsCount,
      ageMinutes: Math.round((Date.now() - toEpochMs(row.created_at)) / 60000),
      metadata: JSON.parse(row.metadata || '{}'),
    };
  });
}
751
+
752
/**
 * Close out every run that has been sitting in the "running" state for
 * longer than the given threshold.
 *
 * With `dryRun` set, nothing is modified and the stale runs are only listed.
 * Otherwise each stale run is passed to completeRun; a failure for one run is
 * recorded as an `{ runId, status: 'error', error }` entry and does not stop
 * the others.
 *
 * @param {Object} options - Options
 * @param {number} [options.olderThanMinutes=30] - Staleness threshold in minutes
 * @param {boolean} [options.dryRun=false] - Only report, do not modify
 * @returns {Object} `{ dryRun, found, runs }` for a dry run, otherwise
 *   `{ found, completed, runs }`
 */
export function autoCompleteStaleRuns(options = {}) {
  const { olderThanMinutes = 30, dryRun = false } = options;

  const staleRuns = findIncompleteRuns({ olderThanMinutes });
  const found = staleRuns.length;

  if (dryRun) {
    return { dryRun: true, found, runs: staleRuns };
  }

  const outcomes = staleRuns.map(({ id }) => {
    try {
      return completeRun(id);
    } catch (error) {
      return { runId: id, status: 'error', error: error.message };
    }
  });

  return {
    found,
    completed: outcomes.length,
    runs: outcomes,
  };
}
793
+
794
/**
 * Delete a run and its results
 *
 * Removes, in one transaction, the run's rows from `evaluation_results` and
 * `interaction_evaluations` and then the run itself, so either everything is
 * deleted or nothing is.
 *
 * @param {string} runId - The run ID
 */
export function deleteRun(runId) {
  const deleteResults = db.prepare('DELETE FROM evaluation_results WHERE run_id = ?');
  // interaction_evaluations.run_id also references evaluation_runs(id).
  // Leaving those rows behind would orphan them, or make the final delete
  // fail when foreign-key enforcement is on.
  const deleteInteractions = db.prepare('DELETE FROM interaction_evaluations WHERE run_id = ?');
  const deleteRunRow = db.prepare('DELETE FROM evaluation_runs WHERE id = ?');

  const transaction = db.transaction(() => {
    // Child rows first, then the parent run.
    deleteResults.run(runId);
    deleteInteractions.run(runId);
    deleteRunRow.run(runId);
  });

  transaction();
}
808
+
809
/**
 * Work out which tests of a run still have to be executed, to support
 * resuming it.
 *
 * The expected test matrix is every (profile, scenario) combination of the
 * arguments; a combination counts as done when the run has at least one
 * stored result with that profile name and scenario id.
 *
 * @param {string} runId - The run ID
 * @param {Array} profiles - Array of profile names
 * @param {Array} scenarios - Array of scenario objects with { id, name }
 * @returns {Object} { runId, totalExpected, completed, remaining, progress,
 *   remainingTests, status, canResume }
 * @throws {Error} When no run with this ID exists
 */
export function getIncompleteTests(runId, profiles, scenarios) {
  const run = getRun(runId);
  if (!run) {
    throw new Error(`Run not found: ${runId}`);
  }

  const pairKey = (profileName, scenarioId) => `${profileName}:${scenarioId}`;

  // Keys of all (profile, scenario) pairs that already have a result.
  const finishedKeys = new Set(
    getResults(runId).map((stored) => pairKey(stored.profileName, stored.scenarioId))
  );

  let totalExpected = 0;
  const remainingTests = [];

  profiles.forEach((profile) => {
    scenarios.forEach((scenario) => {
      totalExpected += 1;
      if (finishedKeys.has(pairKey(profile, scenario.id))) {
        return;
      }
      remainingTests.push({
        profile,
        scenarioId: scenario.id,
        scenarioName: scenario.name,
      });
    });
  });

  const remaining = remainingTests.length;
  const completed = totalExpected - remaining;
  const progressPercent = totalExpected > 0 ? (completed / totalExpected) * 100 : 0;

  return {
    runId,
    totalExpected,
    completed,
    remaining,
    progress: Math.round(progressPercent),
    remainingTests,
    status: run.status,
    // Resuming only makes sense for a run that is still marked as running.
    canResume: remaining > 0 && run.status === 'running',
  };
}
872
+
873
/**
 * Convert a raw `evaluation_results` row into the camelCase result object
 * used by the rest of this module.
 *
 * JSON text columns are parsed (an empty/NULL column falls back to `{}` or
 * `[]`), and the 0/1 flag columns become booleans. `scores` is the parsed
 * `scores_with_reasoning` value when that column holds valid JSON with a
 * truthy value; otherwise it is built from the numeric score_* columns.
 *
 * @param {Object} row - Row as returned by the database driver
 * @returns {Object} Parsed result
 */
function parseResultRow(row) {
  const parseJsonOr = (text, fallbackJson) => JSON.parse(text || fallbackJson);

  // A corrupt scores_with_reasoning value is tolerated: it is treated as
  // absent and the numeric columns are used instead.
  let detailedScores = null;
  if (row.scores_with_reasoning) {
    try {
      detailedScores = JSON.parse(row.scores_with_reasoning);
    } catch (e) {
      detailedScores = null;
    }
  }

  const numericScores = {
    relevance: row.score_relevance,
    specificity: row.score_specificity,
    pedagogical: row.score_pedagogical,
    personalization: row.score_personalization,
    actionability: row.score_actionability,
    tone: row.score_tone,
  };

  const parsed = {
    id: row.id,
    runId: row.run_id,
    scenarioId: row.scenario_id,
    scenarioName: row.scenario_name,
    provider: row.provider,
    model: row.model,
    profileName: row.profile_name,
    hyperparameters: parseJsonOr(row.hyperparameters, '{}'),
    promptId: row.prompt_id,
    suggestions: parseJsonOr(row.suggestions, '[]'),
    latencyMs: row.latency_ms,
    inputTokens: row.input_tokens,
    outputTokens: row.output_tokens,
    cost: row.cost,
    dialogueRounds: row.dialogue_rounds,
    apiCalls: row.api_calls,
    dialogueId: row.dialogue_id,
    scores: detailedScores ? detailedScores : numericScores,
    overallScore: row.overall_score,
    passesRequired: !!row.passes_required,
    passesForbidden: !!row.passes_forbidden,
    requiredMissing: parseJsonOr(row.required_missing, '[]'),
    forbiddenFound: parseJsonOr(row.forbidden_found, '[]'),
    evaluatorModel: row.evaluator_model,
    evaluationReasoning: row.evaluation_reasoning,
    success: !!row.success,
    errorMessage: row.error_message,
    createdAt: row.created_at,
  };

  return parsed;
}
928
+
929
+ // ============================================================================
930
+ // Interaction Evaluation Functions
931
+ // ============================================================================
932
+
933
/**
 * Persist one interaction evaluation as a row of `interaction_evaluations`.
 *
 * Structured fields (learner agents, turns, memory snapshots, unique outcomes,
 * judge evaluation) are stored as JSON text. Missing optional fields are
 * stored as `null`, `0`, or an empty JSON array, as the defaults below show.
 *
 * @param {object} evalData - Evaluation payload; `evalId` becomes the row id.
 * @returns {*} `evalData.evalId`, unchanged.
 */
export function storeInteractionEval(evalData) {
  const insert = db.prepare(`
    INSERT INTO interaction_evaluations (
      id, run_id, scenario_id, scenario_name, eval_type,
      learner_profile, tutor_profile, persona_id, learner_agents,
      turn_count, turns, sequence_diagram, formatted_transcript,
      learner_memory_before, learner_memory_after, tutor_memory_before, tutor_memory_after,
      total_tokens, learner_tokens, tutor_tokens, latency_ms,
      final_learner_state, final_understanding, unique_outcomes,
      judge_overall_score, judge_evaluation
    ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
  `);

  const { metrics, interaction, judgeEvaluation } = evalData;
  const padSnapshots = interaction?.writingPadSnapshots;
  const summary = interaction?.summary;

  // The judge's overall score may live in any of three places; first non-nullish wins.
  const judgeScore =
    judgeEvaluation?.overall_assessment?.score ??
    judgeEvaluation?.narrative_summary?.overall_quality ??
    judgeEvaluation?.overall_score ??
    null;

  // Values are listed in exactly the column order of the INSERT above.
  const values = [
    evalData.evalId,
    evalData.runId || null,
    evalData.scenarioId,
    evalData.scenarioName,
    evalData.type || 'short_term',
    evalData.learnerProfile || null,
    evalData.tutorProfile || 'default',
    evalData.personaId || null,
    JSON.stringify(evalData.learnerAgents || []),
    metrics?.turnCount || interaction?.turns?.length || 0,
    JSON.stringify(interaction?.turns || []),
    evalData.sequenceDiagram || null,
    evalData.formattedTranscript || null,
    JSON.stringify(padSnapshots?.learner?.before || null),
    JSON.stringify(padSnapshots?.learner?.after || null),
    JSON.stringify(padSnapshots?.tutor?.before || null),
    JSON.stringify(padSnapshots?.tutor?.after || null),
    metrics?.totalTokens || 0,
    metrics?.learnerTokens || 0,
    metrics?.tutorTokens || 0,
    metrics?.totalLatencyMs || 0,
    summary?.learnerFinalState || null,
    summary?.learnerFinalUnderstanding || null,
    JSON.stringify(summary?.uniqueOutcomes || []),
    judgeScore,
    JSON.stringify(judgeEvaluation || null),
  ];

  insert.run(...values);

  return evalData.evalId;
}
983
+
984
/**
 * List interaction evaluations, newest first, as lightweight summaries.
 *
 * @param {object} [options]
 * @param {number} [options.limit=50] - Maximum number of rows to return.
 * @param {string|null} [options.scenarioId=null] - When truthy, only rows for
 *   this scenario are returned.
 * @returns {object[]} One summary per row. The JSON-encoded columns are not
 *   part of the summary.
 */
export function listInteractionEvals(options = {}) {
  const { limit = 50, scenarioId = null } = options;

  // The filter clause and its bound parameter are added together or not at all.
  const whereClause = scenarioId ? 'WHERE scenario_id = ?' : '';
  const params = scenarioId ? [scenarioId, limit] : [limit];

  const query = `
    SELECT * FROM interaction_evaluations
    ${whereClause}
    ORDER BY created_at DESC
    LIMIT ?
  `;

  const records = db.prepare(query).all(...params);

  return records.map((record) => ({
    evalId: record.id,
    runId: record.run_id,
    scenarioId: record.scenario_id,
    scenarioName: record.scenario_name,
    evalType: record.eval_type,
    learnerProfile: record.learner_profile,
    tutorProfile: record.tutor_profile,
    personaId: record.persona_id,
    turnCount: record.turn_count,
    totalTokens: record.total_tokens,
    latencyMs: record.latency_ms,
    finalLearnerState: record.final_learner_state,
    finalUnderstanding: record.final_understanding,
    judgeOverallScore: record.judge_overall_score,
    createdAt: record.created_at,
  }));
}
1018
+
1019
/**
 * Convert a raw `interaction_evaluations` row into the full evaluation object.
 *
 * Shared by both lookup functions below so that they always return the same
 * shape. JSON-encoded columns are decoded; an empty column decodes to `[]`
 * (list columns) or `null` (memory snapshots and judge evaluation).
 *
 * @param {object} row - Raw row with snake_case column names.
 * @returns {object} The evaluation with camelCase keys.
 */
function mapInteractionEvalRow(row) {
  return {
    evalId: row.id,
    runId: row.run_id,
    scenarioId: row.scenario_id,
    scenarioName: row.scenario_name,
    evalType: row.eval_type,
    learnerProfile: row.learner_profile,
    tutorProfile: row.tutor_profile,
    personaId: row.persona_id,
    learnerAgents: JSON.parse(row.learner_agents || '[]'),
    turnCount: row.turn_count,
    turns: JSON.parse(row.turns || '[]'),
    sequenceDiagram: row.sequence_diagram,
    formattedTranscript: row.formatted_transcript,
    learnerMemoryBefore: JSON.parse(row.learner_memory_before || 'null'),
    learnerMemoryAfter: JSON.parse(row.learner_memory_after || 'null'),
    tutorMemoryBefore: JSON.parse(row.tutor_memory_before || 'null'),
    tutorMemoryAfter: JSON.parse(row.tutor_memory_after || 'null'),
    totalTokens: row.total_tokens,
    learnerTokens: row.learner_tokens,
    tutorTokens: row.tutor_tokens,
    latencyMs: row.latency_ms,
    finalLearnerState: row.final_learner_state,
    finalUnderstanding: row.final_understanding,
    uniqueOutcomes: JSON.parse(row.unique_outcomes || '[]'),
    judgeOverallScore: row.judge_overall_score,
    judgeEvaluation: JSON.parse(row.judge_evaluation || 'null'),
    createdAt: row.created_at,
  };
}

/**
 * Get a specific interaction evaluation
 *
 * @param {string} evalId - Value matched against the `id` column.
 * @returns {object|null} The full evaluation, or `null` when no row matches.
 */
export function getInteractionEval(evalId) {
  const row = db
    .prepare('SELECT * FROM interaction_evaluations WHERE id = ?')
    .get(evalId);

  return row ? mapInteractionEvalRow(row) : null;
}

/**
 * Get an interaction evaluation by its run ID (for Interact tab runs)
 *
 * When several rows share the run ID, the most recently created one is used.
 *
 * @param {string} runId - Value matched against the `run_id` column.
 * @returns {object|null} The full evaluation, or `null` when no row matches.
 */
export function getInteractionEvalByRunId(runId) {
  const row = db
    .prepare('SELECT * FROM interaction_evaluations WHERE run_id = ? ORDER BY created_at DESC LIMIT 1')
    .get(runId);

  return row ? mapInteractionEvalRow(row) : null;
}
1098
+
1099
// Default export: every public store function collected on a single object.
export default {
  // Run lifecycle and per-run results
  createRun,
  updateRun,
  storeResult,
  getRun,
  listRuns,
  getResults,
  // Statistics, comparison and export
  getRunStats,
  getScenarioStats,
  compareConfigs,
  exportToJson,
  exportToCsv,
  // Run maintenance and incomplete-run handling
  deleteRun,
  completeRun,
  findIncompleteRuns,
  autoCompleteStaleRuns,
  getIncompleteTests,
  // Interaction evaluations
  storeInteractionEval,
  listInteractionEvals,
  getInteractionEval,
  getInteractionEvalByRunId,
};