@machinespirits/eval 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. package/components/MobileEvalDashboard.tsx +267 -0
  2. package/components/comparison/DeltaAnalysisTable.tsx +137 -0
  3. package/components/comparison/ProfileComparisonCard.tsx +176 -0
  4. package/components/comparison/RecognitionABMode.tsx +385 -0
  5. package/components/comparison/RecognitionMetricsPanel.tsx +135 -0
  6. package/components/comparison/WinnerIndicator.tsx +64 -0
  7. package/components/comparison/index.ts +5 -0
  8. package/components/mobile/BottomSheet.tsx +233 -0
  9. package/components/mobile/DimensionBreakdown.tsx +210 -0
  10. package/components/mobile/DocsView.tsx +363 -0
  11. package/components/mobile/LogsView.tsx +481 -0
  12. package/components/mobile/PsychodynamicQuadrant.tsx +261 -0
  13. package/components/mobile/QuickTestView.tsx +1098 -0
  14. package/components/mobile/RecognitionTypeChart.tsx +124 -0
  15. package/components/mobile/RecognitionView.tsx +809 -0
  16. package/components/mobile/RunDetailView.tsx +261 -0
  17. package/components/mobile/RunHistoryView.tsx +367 -0
  18. package/components/mobile/ScoreRadial.tsx +211 -0
  19. package/components/mobile/StreamingLogPanel.tsx +230 -0
  20. package/components/mobile/SynthesisStrategyChart.tsx +140 -0
  21. package/config/interaction-eval-scenarios.yaml +832 -0
  22. package/config/learner-agents.yaml +248 -0
  23. package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +52 -0
  24. package/docs/research/ABLATION-MODEL-SELECTION.md +53 -0
  25. package/docs/research/ADVANCED-EVAL-ANALYSIS.md +60 -0
  26. package/docs/research/ANOVA-RESULTS-2026-01-14.md +257 -0
  27. package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +586 -0
  28. package/docs/research/COST-ANALYSIS.md +56 -0
  29. package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +340 -0
  30. package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +291 -0
  31. package/docs/research/EVAL-SYSTEM-ANALYSIS.md +306 -0
  32. package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +301 -0
  33. package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +1988 -0
  34. package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +282 -0
  35. package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +147 -0
  36. package/docs/research/PAPER-EXTENSION-DYADIC.md +204 -0
  37. package/docs/research/PAPER-UNIFIED.md +659 -0
  38. package/docs/research/PAPER-UNIFIED.pdf +0 -0
  39. package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +356 -0
  40. package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +419 -0
  41. package/docs/research/apa.csl +2133 -0
  42. package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +1637 -0
  43. package/docs/research/archive/paper-multiagent-tutor.tex +978 -0
  44. package/docs/research/paper-draft/full-paper.md +136 -0
  45. package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
  46. package/docs/research/paper-draft/references.bib +515 -0
  47. package/docs/research/transcript-baseline.md +139 -0
  48. package/docs/research/transcript-recognition-multiagent.md +187 -0
  49. package/hooks/useEvalData.ts +625 -0
  50. package/index.js +27 -0
  51. package/package.json +73 -0
  52. package/routes/evalRoutes.js +3002 -0
  53. package/scripts/advanced-eval-analysis.js +351 -0
  54. package/scripts/analyze-eval-costs.js +378 -0
  55. package/scripts/analyze-eval-results.js +513 -0
  56. package/scripts/analyze-interaction-evals.js +368 -0
  57. package/server-init.js +45 -0
  58. package/server.js +162 -0
  59. package/services/benchmarkService.js +1892 -0
  60. package/services/evaluationRunner.js +739 -0
  61. package/services/evaluationStore.js +1121 -0
  62. package/services/learnerConfigLoader.js +385 -0
  63. package/services/learnerTutorInteractionEngine.js +857 -0
  64. package/services/memory/learnerMemoryService.js +1227 -0
  65. package/services/memory/learnerWritingPad.js +577 -0
  66. package/services/memory/tutorWritingPad.js +674 -0
  67. package/services/promptRecommendationService.js +493 -0
  68. package/services/rubricEvaluator.js +826 -0
@@ -0,0 +1,368 @@
1
+ #!/usr/bin/env node
2
+
3
+ /**
4
+ * Analyze Interaction Evaluation Results
5
+ *
6
+ * Extracts and analyzes judge evaluation scores from interaction eval logs.
7
+ * Used for the updated research paper on dyadic learner-tutor recognition.
8
+ */
9
+
10
+ import fs from 'fs';
11
+ import path from 'path';
12
+ import { fileURLToPath } from 'url';
13
+
14
// ESM has no built-in __dirname/__filename; derive them from import.meta.url.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// Directory where interaction-eval runs write their JSON logs
// (sibling of scripts/, i.e. <package-root>/logs/interaction-evals).
const LOGS_DIR = path.join(__dirname, '..', 'logs', 'interaction-evals');

// Tutor dimensions — keys expected under judgeEvaluation.tutor_evaluation,
// each holding a { score } object (see extractTutorScores below).
const TUTOR_DIMS = ['mutual_recognition', 'dialectical_responsiveness', 'transformative_potential', 'tone'];

// Learner dimensions — keys expected under judgeEvaluation.learner_evaluation.
const LEARNER_DIMS = ['authenticity', 'responsiveness', 'development'];
24
+
25
+ /**
26
+ * Load all interaction eval JSON files
27
+ */
28
/**
 * Load all interaction eval JSON files from LOGS_DIR.
 *
 * Each parsed record is tagged with `_filename` so downstream filtering
 * (e.g. battery-run detection in generateReport) can inspect the source file.
 * Corrupt or unparseable files are skipped with a warning instead of
 * aborting the whole analysis.
 *
 * @returns {Object[]} Parsed eval records; empty if the logs directory
 *   does not exist (e.g. a fresh checkout with no runs yet).
 */
function loadEvals() {
  // readdirSync throws ENOENT on a missing directory; treat that as
  // "no evals yet" rather than crashing the script.
  if (!fs.existsSync(LOGS_DIR)) {
    console.warn(`Logs directory not found: ${LOGS_DIR}`);
    return [];
  }

  const files = fs.readdirSync(LOGS_DIR).filter(f => f.endsWith('.json'));
  const evals = [];

  for (const file of files) {
    try {
      const content = fs.readFileSync(path.join(LOGS_DIR, file), 'utf8');
      const data = JSON.parse(content);
      data._filename = file;
      evals.push(data);
    } catch (e) {
      // Best-effort: a single bad log file should not kill the report.
      console.warn(`Failed to load ${file}: ${e.message}`);
    }
  }

  return evals;
}
45
+
46
+ /**
47
+ * Extract tutor dimension scores from judge evaluation
48
+ */
49
/**
 * Extract tutor dimension scores from a judge evaluation.
 *
 * @param {Object} evalData - Eval record; may lack judgeEvaluation entirely.
 * @param {string[]} [dims=TUTOR_DIMS] - Dimension keys to read. Parameterized
 *   (backward-compatibly) so the extraction logic can be reused and tested
 *   against other rubrics.
 * @returns {Object|null} Map of dimension -> score plus `overall` (mean of
 *   the dimensions actually present), or null when no scores are available.
 */
function extractTutorScores(evalData, dims = TUTOR_DIMS) {
  const tutor = evalData.judgeEvaluation?.tutor_evaluation;
  if (!tutor) return null;

  const scores = {};
  let total = 0;
  let count = 0;

  for (const dim of dims) {
    // A dimension may be missing or unscored; only average what's present.
    if (tutor[dim]?.score != null) {
      scores[dim] = tutor[dim].score;
      total += tutor[dim].score;
      count++;
    }
  }

  if (count === 0) return null;
  scores.overall = total / count;
  return scores;
}
72
+
73
+ /**
74
+ * Extract learner dimension scores from judge evaluation
75
+ */
76
/**
 * Extract learner dimension scores from a judge evaluation.
 *
 * Mirrors extractTutorScores but reads judgeEvaluation.learner_evaluation.
 *
 * @param {Object} evalData - Eval record; may lack judgeEvaluation entirely.
 * @param {string[]} [dims=LEARNER_DIMS] - Dimension keys to read
 *   (backward-compatible parameterization for reuse/testing).
 * @returns {Object|null} Map of dimension -> score plus `overall` (mean of
 *   the dimensions actually present), or null when no scores are available.
 */
function extractLearnerScores(evalData, dims = LEARNER_DIMS) {
  const learner = evalData.judgeEvaluation?.learner_evaluation;
  if (!learner) return null;

  const scores = {};
  let total = 0;
  let count = 0;

  for (const dim of dims) {
    // A dimension may be missing or unscored; only average what's present.
    if (learner[dim]?.score != null) {
      scores[dim] = learner[dim].score;
      total += learner[dim].score;
      count++;
    }
  }

  if (count === 0) return null;
  scores.overall = total / count;
  return scores;
}
99
+
100
+ /**
101
+ * Extract scenario metadata
102
+ */
103
/**
 * Flatten a loosely-structured eval record into normalized metadata.
 *
 * Every missing field falls back to a neutral default so grouping and
 * reporting code never has to null-check.
 *
 * @param {Object} evalData - Raw eval record.
 * @returns {Object} Scenario/learner/tutor identifiers plus run metrics.
 */
function extractMetadata(evalData) {
  const metrics = evalData.metrics ?? {};
  // Fallback scenario id: second segment of an evalId like "run-<scenario>-...".
  const scenarioFromId = evalData.evalId?.split('-')[1];

  return {
    scenarioId: evalData.scenarioId || scenarioFromId || 'unknown',
    learnerId: evalData.learnerId,
    tutorProfile: evalData.tutorProfile || 'default',
    learnerArchitecture: evalData.learnerArchitecture || 'unknown',
    personaId: evalData.personaId,
    turnCount: metrics.turnCount || 0,
    totalTokens: metrics.totalTokens || 0,
    duration: metrics.durationMs || 0,
    outcomes: evalData.summary?.outcomes || [],
    skipJudge: evalData.skipJudge || false,
  };
}
117
+
118
+ /**
119
+ * Group evals by tutor profile
120
+ */
121
/**
 * Bucket eval records by tutor profile.
 *
 * @param {Object[]} evals - Eval records.
 * @returns {Object} profile name -> array of eval records.
 */
function groupByTutorProfile(evals) {
  return evals.reduce((groups, record) => {
    const { tutorProfile } = extractMetadata(record);
    if (!groups[tutorProfile]) {
      groups[tutorProfile] = [];
    }
    groups[tutorProfile].push(record);
    return groups;
  }, {});
}
136
+
137
+ /**
138
+ * Group evals by learner architecture
139
+ */
140
/**
 * Bucket eval records by learner architecture.
 *
 * @param {Object[]} evals - Eval records.
 * @returns {Object} architecture name -> array of eval records.
 */
function groupByArchitecture(evals) {
  return evals.reduce((groups, record) => {
    const { learnerArchitecture } = extractMetadata(record);
    if (!groups[learnerArchitecture]) {
      groups[learnerArchitecture] = [];
    }
    groups[learnerArchitecture].push(record);
    return groups;
  }, {});
}
155
+
156
+ /**
157
+ * Calculate average scores for a group of evals
158
+ */
159
/**
 * Average each score dimension across a group of evals.
 *
 * @param {Object[]} evals - Eval records.
 * @param {Function} extractFn - Maps a record to a {dim: number} object,
 *   or null when the record has no usable scores.
 * @returns {Object} dim -> average formatted to two decimals (as a string,
 *   via toFixed), plus `_count`: the largest sample count seen for any
 *   single dimension (0 when nothing was scored).
 */
function calculateAverages(evals, extractFn) {
  const sums = {};
  const counts = {};

  for (const record of evals) {
    const scores = extractFn(record);
    if (!scores) continue;

    for (const [dim, value] of Object.entries(scores)) {
      sums[dim] = (sums[dim] || 0) + value;
      counts[dim] = (counts[dim] || 0) + 1;
    }
  }

  const averages = {};
  for (const dim of Object.keys(sums)) {
    averages[dim] = counts[dim] > 0 ? (sums[dim] / counts[dim]).toFixed(2) : null;
  }
  // Dimensions can have unequal sample counts; report the maximum as "n".
  averages._count = Math.max(...Object.values(counts), 0);

  return averages;
}
185
+
186
+ /**
187
+ * Generate summary report
188
+ */
189
/**
 * Print the full analysis report to stdout and persist a JSON summary.
 *
 * Sections: overall counts, per-tutor-profile averages, per-learner-
 * architecture averages, a profile × architecture cross-tabulation of the
 * tutor overall score, summary statistics, and best/worst rankings.
 * Only "battery" runs (filename contains 'battery') with a judge evaluation
 * feed the comparisons.
 *
 * NOTE: averages come back from calculateAverages as toFixed(2) strings,
 * which is why the rankings below re-parse them with parseFloat.
 *
 * @param {Object[]} evals - All loaded eval records (see loadEvals).
 * @returns {Object} The JSON summary that was written to disk.
 */
function generateReport(evals) {
  console.log('\n' + '═'.repeat(70));
  console.log('INTERACTION EVALUATION ANALYSIS');
  console.log('═'.repeat(70));

  // Filter to only evals with judge evaluation
  const judgedEvals = evals.filter(e => e.judgeEvaluation && !e.skipJudge);
  console.log(`\nTotal evals: ${evals.length}`);
  console.log(`With judge evaluation: ${judgedEvals.length}`);

  // Battery evals — `_filename` is attached by loadEvals.
  const batteryEvals = judgedEvals.filter(e => e._filename.includes('battery'));
  console.log(`Battery evals: ${batteryEvals.length}`);

  // By Tutor Profile
  console.log('\n' + '─'.repeat(70));
  console.log('TUTOR PROFILE COMPARISON');
  console.log('─'.repeat(70));

  const byProfile = groupByTutorProfile(batteryEvals);
  const profileResults = {};

  for (const [profile, profileEvals] of Object.entries(byProfile)) {
    const tutorAvg = calculateAverages(profileEvals, extractTutorScores);
    const learnerAvg = calculateAverages(profileEvals, extractLearnerScores);

    profileResults[profile] = { tutor: tutorAvg, learner: learnerAvg };

    console.log(`\n${profile.toUpperCase()} (n=${tutorAvg._count || 0}):`);
    console.log(' Tutor dimensions:');
    for (const dim of TUTOR_DIMS) {
      console.log(` ${dim}: ${tutorAvg[dim] || 'N/A'}`);
    }
    console.log(` OVERALL: ${tutorAvg.overall || 'N/A'}`);

    console.log(' Learner dimensions:');
    for (const dim of LEARNER_DIMS) {
      console.log(` ${dim}: ${learnerAvg[dim] || 'N/A'}`);
    }
    console.log(` OVERALL: ${learnerAvg.overall || 'N/A'}`);
  }

  // By Learner Architecture
  console.log('\n' + '─'.repeat(70));
  console.log('LEARNER ARCHITECTURE COMPARISON');
  console.log('─'.repeat(70));

  const byArch = groupByArchitecture(batteryEvals);
  const archResults = {};

  for (const [arch, archEvals] of Object.entries(byArch)) {
    const tutorAvg = calculateAverages(archEvals, extractTutorScores);
    const learnerAvg = calculateAverages(archEvals, extractLearnerScores);

    archResults[arch] = { tutor: tutorAvg, learner: learnerAvg };

    console.log(`\n${arch.toUpperCase()} (n=${tutorAvg._count || 0}):`);
    console.log(' Tutor dimensions:');
    for (const dim of TUTOR_DIMS) {
      console.log(` ${dim}: ${tutorAvg[dim] || 'N/A'}`);
    }
    console.log(` OVERALL: ${tutorAvg.overall || 'N/A'}`);

    console.log(' Learner dimensions:');
    for (const dim of LEARNER_DIMS) {
      console.log(` ${dim}: ${learnerAvg[dim] || 'N/A'}`);
    }
    console.log(` OVERALL: ${learnerAvg.overall || 'N/A'}`);
  }

  // Cross-tabulation: Profile × Architecture.
  // Cells are keyed "profile|architecture"; a cell may be empty.
  console.log('\n' + '─'.repeat(70));
  console.log('CROSS-TABULATION: TUTOR PROFILE × LEARNER ARCHITECTURE');
  console.log('─'.repeat(70));

  const crossTab = {};
  for (const e of batteryEvals) {
    const meta = extractMetadata(e);
    const key = `${meta.tutorProfile}|${meta.learnerArchitecture}`;

    if (!crossTab[key]) {
      crossTab[key] = [];
    }
    crossTab[key].push(e);
  }

  // Create table — sorted axes so runs produce stable column/row order.
  const profiles = [...new Set(batteryEvals.map(e => extractMetadata(e).tutorProfile))].sort();
  const architectures = [...new Set(batteryEvals.map(e => extractMetadata(e).learnerArchitecture))].sort();

  console.log('\nTutor Overall Score by Profile × Architecture:');
  console.log('─'.repeat(70));

  // Header row — architecture names truncated to 12 chars, 14-wide columns.
  let header = 'Profile'.padEnd(20);
  for (const arch of architectures) {
    header += arch.slice(0, 12).padStart(14);
  }
  console.log(header);
  console.log('─'.repeat(70));

  // Data rows — '-' marks an empty cell.
  for (const profile of profiles) {
    let row = profile.padEnd(20);
    for (const arch of architectures) {
      const key = `${profile}|${arch}`;
      const cellEvals = crossTab[key] || [];
      const avg = calculateAverages(cellEvals, extractTutorScores);
      row += (avg.overall || '-').toString().padStart(14);
    }
    console.log(row);
  }

  // Summary statistics
  console.log('\n' + '─'.repeat(70));
  console.log('SUMMARY STATISTICS');
  console.log('─'.repeat(70));

  // Overall averages across all battery evals (both sides of the dyad).
  const overallTutor = calculateAverages(batteryEvals, extractTutorScores);
  const overallLearner = calculateAverages(batteryEvals, extractLearnerScores);

  console.log('\nOverall Tutor Dimensions:');
  for (const dim of TUTOR_DIMS) {
    console.log(` ${dim}: ${overallTutor[dim] || 'N/A'}`);
  }
  console.log(` OVERALL: ${overallTutor.overall || 'N/A'}`);

  console.log('\nOverall Learner Dimensions:');
  for (const dim of LEARNER_DIMS) {
    console.log(` ${dim}: ${overallLearner[dim] || 'N/A'}`);
  }
  console.log(` OVERALL: ${overallLearner.overall || 'N/A'}`);

  // Best/worst by profile — ranked by tutor overall (string -> float).
  if (Object.keys(profileResults).length > 0) {
    const sortedProfiles = Object.entries(profileResults)
      .filter(([_, r]) => r.tutor.overall)
      .sort((a, b) => parseFloat(b[1].tutor.overall) - parseFloat(a[1].tutor.overall));

    if (sortedProfiles.length > 0) {
      console.log(`\nBest tutor profile: ${sortedProfiles[0][0]} (${sortedProfiles[0][1].tutor.overall})`);
      console.log(`Worst tutor profile: ${sortedProfiles[sortedProfiles.length-1][0]} (${sortedProfiles[sortedProfiles.length-1][1].tutor.overall})`);
    }
  }

  // Best/worst by architecture — ranked by learner overall.
  if (Object.keys(archResults).length > 0) {
    const sortedArchs = Object.entries(archResults)
      .filter(([_, r]) => r.learner.overall)
      .sort((a, b) => parseFloat(b[1].learner.overall) - parseFloat(a[1].learner.overall));

    if (sortedArchs.length > 0) {
      console.log(`\nBest learner architecture: ${sortedArchs[0][0]} (${sortedArchs[0][1].learner.overall})`);
      console.log(`Worst learner architecture: ${sortedArchs[sortedArchs.length-1][0]} (${sortedArchs[sortedArchs.length-1][1].learner.overall})`);
    }
  }

  // Output JSON for paper — consumed by the research write-up pipeline
  // (presumably; verify against docs/research tooling).
  const jsonOutput = {
    generated: new Date().toISOString(),
    totalEvals: evals.length,
    judgedEvals: judgedEvals.length,
    batteryEvals: batteryEvals.length,
    byProfile: profileResults,
    byArchitecture: archResults,
    overallTutor,
    overallLearner,
  };

  const outputPath = path.join(__dirname, '..', 'docs', 'analysis-results.json');
  fs.writeFileSync(outputPath, JSON.stringify(jsonOutput, null, 2));
  console.log(`\nJSON output saved to: ${outputPath}`);

  return jsonOutput;
}
365
+
366
// Entry point: load every interaction-eval log, then print and save the report.
generateReport(loadEvals());
package/server-init.js ADDED
@@ -0,0 +1,45 @@
1
+ /**
2
+ * Evaluation Extension - Server Initialization
3
+ *
4
+ * Called by the extension loader when mounting this extension
5
+ * into the main Machine Spirits website.
6
+ */
7
+
8
+ import path from 'path';
9
+ import { fileURLToPath } from 'url';
10
+ import express from 'express';
11
+
12
+ const __filename = fileURLToPath(import.meta.url);
13
+ const __dirname = path.dirname(__filename);
14
+
15
+ /**
16
+ * Initialize the evaluation extension
17
+ * @param {Object} context - Initialization context
18
+ * @param {Express} context.app - Express application
19
+ * @param {Object} context.manifest - Extension manifest
20
+ * @param {string} context.extensionPath - Path to extension directory
21
+ * @param {string} context.rootDir - Path to main website root
22
+ */
23
/**
 * Initialize the evaluation extension when mounted into the main
 * Machine Spirits website by the extension loader.
 *
 * @param {Object} context - Initialization context
 * @param {Express} context.app - Express application
 * @param {Object} context.manifest - Extension manifest
 * @param {string} context.extensionPath - Path to extension directory
 * @param {string} context.rootDir - Path to main website root
 */
export async function init({ app, manifest, extensionPath, rootDir }) {
  console.log(`[EvalExtension] Initializing ${manifest.name} v${manifest.version}`);

  // Static mounts: UI components (for client-side imports) and docs.
  app.use('/extensions/eval', express.static(path.join(extensionPath, 'components')));
  app.use('/docs/extensions/eval', express.static(path.join(extensionPath, 'docs')));

  // Ensure data directory exists before anything tries to write into it.
  const { existsSync, mkdirSync } = await import('fs');
  const dataDir = path.join(extensionPath, 'data');
  if (!existsSync(dataDir)) {
    mkdirSync(dataDir, { recursive: true });
    console.log('[EvalExtension] Created data directory');
  }

  console.log('[EvalExtension] Initialization complete');
}

export default { init };
package/server.js ADDED
@@ -0,0 +1,162 @@
1
+ /**
2
+ * @machinespirits/eval - Standalone Server
3
+ *
4
+ * Runs the evaluation system as a standalone application.
5
+ * This server provides:
6
+ * - API endpoints for evaluation runs, results, and analysis
7
+ * - Static file serving for the UI components
8
+ * - Documentation serving
9
+ *
10
+ * Environment variables:
11
+ * PORT - Server port (default: 8081)
12
+ * STANDALONE - Set to 'true' to run in standalone mode
13
+ *
14
+ * Usage:
15
+ * STANDALONE=true node server.js
16
+ * # or
17
+ * npm start
18
+ */
19
+
20
+ import express from 'express';
21
+ import path from 'path';
22
+ import { fileURLToPath } from 'url';
23
+ import { existsSync, mkdirSync } from 'fs';
24
+
25
// ESM has no built-in __dirname/__filename; derive them from import.meta.url.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

const app = express();
const PORT = Number(process.env.PORT) || 8081;
const isStandalone = process.env.STANDALONE === 'true';

// Middleware
app.use(express.json());

// Ensure data directory exists before any route tries to write to it.
const dataDir = path.join(__dirname, 'data');
if (!existsSync(dataDir)) {
  mkdirSync(dataDir, { recursive: true });
  console.log('[EvalServer] Created data directory');
}

// Health check — reports whether we run standalone or mounted in a host app.
// NOTE(review): version is hard-coded here; confirm it stays in sync with
// package.json on release.
app.get('/health', (req, res) => {
  res.json({
    status: 'ok',
    package: '@machinespirits/eval',
    version: '0.1.0',
    mode: isStandalone ? 'standalone' : 'mounted',
  });
});

// API routes. ESM import declarations are hoisted, so although this import
// appears mid-file it is resolved before any of the code above executes.
import evalRoutes from './routes/evalRoutes.js';
app.use('/api/eval', evalRoutes);

// Serve components as static files (only if the directory shipped).
const componentsDir = path.join(__dirname, 'components');
if (existsSync(componentsDir)) {
  app.use('/components', express.static(componentsDir));
}

// Serve documentation (research papers and analysis under docs/).
const docsDir = path.join(__dirname, 'docs');
if (existsSync(docsDir)) {
  app.use('/docs', express.static(docsDir));
}

// In standalone mode, serve a basic UI: a static landing page listing the
// API endpoints. Mounted mode leaves '/' to the host application.
if (isStandalone) {
  app.get('/', (req, res) => {
    res.send(`
      <!DOCTYPE html>
      <html lang="en">
      <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>Machine Spirits Eval</title>
        <style>
          body {
            font-family: 'Space Mono', monospace;
            background: #0a0a0a;
            color: #fafafa;
            margin: 0;
            padding: 2rem;
          }
          h1 { color: #E63946; }
          a { color: #E63946; }
          .endpoint {
            background: rgba(255,255,255,0.05);
            padding: 1rem;
            margin: 0.5rem 0;
            border-radius: 4px;
          }
          code {
            background: rgba(255,255,255,0.1);
            padding: 0.2rem 0.4rem;
            border-radius: 2px;
          }
        </style>
      </head>
      <body>
        <h1>Machine Spirits Eval</h1>
        <p>Evaluation system running in standalone mode.</p>

        <h2>API Endpoints</h2>

        <div class="endpoint">
          <strong>GET</strong> <code>/api/eval/scenarios</code>
          <p>List available evaluation scenarios</p>
        </div>

        <div class="endpoint">
          <strong>GET</strong> <code>/api/eval/profiles</code>
          <p>List tutor profiles</p>
        </div>

        <div class="endpoint">
          <strong>GET</strong> <code>/api/eval/runs</code>
          <p>List evaluation runs</p>
        </div>

        <div class="endpoint">
          <strong>GET</strong> <code>/api/eval/runs/:id</code>
          <p>Get details of a specific run</p>
        </div>

        <div class="endpoint">
          <strong>POST</strong> <code>/api/eval/quick</code>
          <p>Run a quick evaluation test</p>
        </div>

        <h2>Documentation</h2>
        <p><a href="/docs">/docs</a> - Research papers and analysis</p>

        <h2>Health</h2>
        <p><a href="/health">/health</a> - Service health check</p>
      </body>
      </html>
    `);
  });
}

// Error handler — must be registered last; Express identifies it by its
// 4-argument signature.
// NOTE(review): this echoes internal err.message back to clients; confirm
// that is acceptable outside development (information-disclosure risk).
app.use((err, req, res, next) => {
  console.error('[EvalServer] Error:', err.message);
  res.status(500).json({
    error: 'Internal server error',
    message: err.message,
  });
});

// Start server only when this file is the process entry point, so the app
// can also be imported (e.g. for mounting or testing) without listening.
if (process.argv[1] === fileURLToPath(import.meta.url)) {
  app.listen(PORT, '0.0.0.0', () => {
    console.log(`[EvalServer] Machine Spirits Eval running at http://0.0.0.0:${PORT}`);
    console.log(`[EvalServer] Mode: ${isStandalone ? 'standalone' : 'mounted'}`);
    console.log(`[EvalServer] API: http://0.0.0.0:${PORT}/api/eval`);
    console.log(`[EvalServer] Docs: http://0.0.0.0:${PORT}/docs`);
  });
}

export { app };