@machinespirits/eval 0.1.2 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. package/LICENSE +21 -0
  2. package/README.md +161 -0
  3. package/config/eval-settings.yaml +18 -0
  4. package/config/evaluation-rubric-learner.yaml +277 -0
  5. package/config/evaluation-rubric.yaml +613 -0
  6. package/config/interaction-eval-scenarios.yaml +93 -50
  7. package/config/learner-agents.yaml +124 -193
  8. package/config/machinespirits-eval.code-workspace +11 -0
  9. package/config/providers.yaml +60 -0
  10. package/config/suggestion-scenarios.yaml +1399 -0
  11. package/config/tutor-agents.yaml +716 -0
  12. package/docs/EVALUATION-VARIABLES.md +589 -0
  13. package/docs/REPLICATION-PLAN.md +577 -0
  14. package/index.js +15 -6
  15. package/package.json +16 -22
  16. package/routes/evalRoutes.js +88 -36
  17. package/scripts/analyze-judge-reliability.js +401 -0
  18. package/scripts/analyze-run.js +97 -0
  19. package/scripts/analyze-run.mjs +282 -0
  20. package/scripts/analyze-validation-failures.js +141 -0
  21. package/scripts/check-run.mjs +17 -0
  22. package/scripts/code-impasse-strategies.js +1132 -0
  23. package/scripts/compare-runs.js +44 -0
  24. package/scripts/compare-suggestions.js +80 -0
  25. package/scripts/compare-transformation.js +116 -0
  26. package/scripts/dig-into-run.js +158 -0
  27. package/scripts/eval-cli.js +2626 -0
  28. package/scripts/generate-paper-figures.py +452 -0
  29. package/scripts/qualitative-analysis-ai.js +1313 -0
  30. package/scripts/qualitative-analysis.js +688 -0
  31. package/scripts/seed-db.js +87 -0
  32. package/scripts/show-failed-suggestions.js +64 -0
  33. package/scripts/validate-content.js +192 -0
  34. package/server.js +3 -2
  35. package/services/__tests__/evalConfigLoader.test.js +338 -0
  36. package/services/anovaStats.js +499 -0
  37. package/services/contentResolver.js +407 -0
  38. package/services/dialogueTraceAnalyzer.js +454 -0
  39. package/services/evalConfigLoader.js +625 -0
  40. package/services/evaluationRunner.js +2171 -270
  41. package/services/evaluationStore.js +564 -29
  42. package/services/learnerConfigLoader.js +75 -5
  43. package/services/learnerRubricEvaluator.js +284 -0
  44. package/services/learnerTutorInteractionEngine.js +375 -0
  45. package/services/processUtils.js +18 -0
  46. package/services/progressLogger.js +98 -0
  47. package/services/promptRecommendationService.js +31 -26
  48. package/services/promptRewriter.js +427 -0
  49. package/services/rubricEvaluator.js +543 -70
  50. package/services/streamingReporter.js +104 -0
  51. package/services/turnComparisonAnalyzer.js +494 -0
  52. package/components/MobileEvalDashboard.tsx +0 -267
  53. package/components/comparison/DeltaAnalysisTable.tsx +0 -137
  54. package/components/comparison/ProfileComparisonCard.tsx +0 -176
  55. package/components/comparison/RecognitionABMode.tsx +0 -385
  56. package/components/comparison/RecognitionMetricsPanel.tsx +0 -135
  57. package/components/comparison/WinnerIndicator.tsx +0 -64
  58. package/components/comparison/index.ts +0 -5
  59. package/components/mobile/BottomSheet.tsx +0 -233
  60. package/components/mobile/DimensionBreakdown.tsx +0 -210
  61. package/components/mobile/DocsView.tsx +0 -363
  62. package/components/mobile/LogsView.tsx +0 -481
  63. package/components/mobile/PsychodynamicQuadrant.tsx +0 -261
  64. package/components/mobile/QuickTestView.tsx +0 -1098
  65. package/components/mobile/RecognitionTypeChart.tsx +0 -124
  66. package/components/mobile/RecognitionView.tsx +0 -809
  67. package/components/mobile/RunDetailView.tsx +0 -261
  68. package/components/mobile/RunHistoryView.tsx +0 -367
  69. package/components/mobile/ScoreRadial.tsx +0 -211
  70. package/components/mobile/StreamingLogPanel.tsx +0 -230
  71. package/components/mobile/SynthesisStrategyChart.tsx +0 -140
  72. package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +0 -52
  73. package/docs/research/ABLATION-MODEL-SELECTION.md +0 -53
  74. package/docs/research/ADVANCED-EVAL-ANALYSIS.md +0 -60
  75. package/docs/research/ANOVA-RESULTS-2026-01-14.md +0 -257
  76. package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +0 -586
  77. package/docs/research/COST-ANALYSIS.md +0 -56
  78. package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +0 -340
  79. package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +0 -291
  80. package/docs/research/EVAL-SYSTEM-ANALYSIS.md +0 -306
  81. package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +0 -301
  82. package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +0 -1988
  83. package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +0 -282
  84. package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +0 -147
  85. package/docs/research/PAPER-EXTENSION-DYADIC.md +0 -204
  86. package/docs/research/PAPER-UNIFIED.md +0 -659
  87. package/docs/research/PAPER-UNIFIED.pdf +0 -0
  88. package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +0 -356
  89. package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +0 -419
  90. package/docs/research/apa.csl +0 -2133
  91. package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +0 -1637
  92. package/docs/research/archive/paper-multiagent-tutor.tex +0 -978
  93. package/docs/research/paper-draft/full-paper.md +0 -136
  94. package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
  95. package/docs/research/paper-draft/references.bib +0 -515
  96. package/docs/research/transcript-baseline.md +0 -139
  97. package/docs/research/transcript-recognition-multiagent.md +0 -187
  98. package/hooks/useEvalData.ts +0 -625
  99. package/server-init.js +0 -45
  100. package/services/benchmarkService.js +0 -1892
  101. package/types.ts +0 -165
  102. package/utils/haptics.ts +0 -45
package/hooks/useEvalData.ts DELETED
@@ -1,625 +0,0 @@
1
- /**
2
- * useEvalData Hook
3
- *
4
- * Data fetching and caching hook for the mobile evaluation dashboard.
5
- * Handles all eval API interactions with localStorage caching for offline support.
6
- */
7
-
8
- import { useState, useCallback, useRef, useEffect } from 'react';
9
- import type {
10
- EvalProfile,
11
- EvalScenario,
12
- EvalRun,
13
- EvalQuickTestResult,
14
- EvalDialogue,
15
- EvalDoc,
16
- EvalDimensionScores
17
- } from '../types';
18
-
19
// Time units in milliseconds, used to spell out the cache lifetimes below.
const MINUTE_MS = 60 * 1000;
const HOUR_MS = 60 * MINUTE_MS;
const DAY_MS = 24 * HOUR_MS;

/**
 * Per-resource cache settings: the localStorage key each resource is stored
 * under and how long (in ms) a stored entry is considered fresh.
 */
const CACHE_CONFIG = {
  profiles: { key: 'eval-profiles', ttl: 5 * MINUTE_MS },
  scenarios: { key: 'eval-scenarios', ttl: 5 * MINUTE_MS },
  runs: { key: 'eval-runs', ttl: MINUTE_MS },
  logDates: { key: 'eval-log-dates', ttl: 5 * MINUTE_MS },
  docs: { key: 'eval-docs', ttl: HOUR_MS },
  // Kept for a full day so the last result can be viewed offline.
  lastResult: { key: 'eval-last-result', ttl: DAY_MS }
};
28
-
29
// Retry configuration for connection resilience
const RETRY_CONFIG = {
  maxRetries: 3,
  baseDelay: 1000, // ms; delay for attempt 0 before jitter
  maxDelay: 10000 // ms; cap on the exponential part (jitter is added on top)
};

/**
 * Compute the wait before a reconnect attempt using exponential backoff.
 *
 * The exponential part is `baseDelay * 2^attempt`, capped at `maxDelay`.
 * Up to one second of random jitter is then added, so the returned value can
 * exceed `maxDelay` by less than 1000 ms.
 *
 * @param attempt - Zero-based attempt index. Negative or NaN values are
 *   treated as 0 so the result is never NaN and never below `baseDelay`.
 * @returns Delay in milliseconds.
 */
function getRetryDelay(attempt: number): number {
  // `attempt > 0` is false for NaN and negatives, which would otherwise give
  // a NaN or sub-base delay (a NaN timer delay fires immediately).
  const exponent = attempt > 0 ? attempt : 0;
  const backoff = Math.min(
    RETRY_CONFIG.baseDelay * Math.pow(2, exponent),
    RETRY_CONFIG.maxDelay
  );
  // Jitter spreads out simultaneous reconnects.
  return backoff + Math.random() * 1000;
}
45
-
46
/** Envelope stored in localStorage: the payload plus the time it was written. */
interface CacheEntry<T> {
  data: T;
  timestamp: number; // epoch ms, taken from Date.now() when the entry is written
}

/**
 * Read a cached value from localStorage if it exists and is still fresh.
 *
 * Entries that are expired, unparseable, or missing a numeric `timestamp` are
 * removed from storage so they are not re-read on every call.
 *
 * @param key - localStorage key to read.
 * @param ttl - Maximum age in milliseconds for the entry to count as fresh.
 * @returns The cached payload, or `null` when there is no usable entry or
 *   localStorage is unavailable.
 */
function getCached<T>(key: string, ttl: number): T | null {
  try {
    const raw = localStorage.getItem(key);
    if (!raw) return null;

    let entry: Partial<CacheEntry<T>> | null = null;
    try {
      entry = JSON.parse(raw);
    } catch {
      // Corrupt JSON: fall through with `entry === null` so it gets evicted.
      entry = null;
    }

    const timestamp = entry ? entry.timestamp : undefined;
    const isFresh =
      typeof timestamp === 'number' &&
      Number.isFinite(timestamp) &&
      Date.now() - timestamp <= ttl;

    if (!entry || !isFresh) {
      localStorage.removeItem(key);
      return null;
    }

    // A payload of `undefined` is dropped by JSON.stringify; report it as a miss.
    return entry.data ?? null;
  } catch {
    // localStorage itself is unavailable (e.g. blocked or not defined).
    return null;
  }
}
66
-
67
/**
 * Best-effort write of a value to localStorage, stamped with the current time.
 *
 * Any failure (serialisation error, storage full, storage unavailable) is
 * deliberately ignored: caching is an optimisation, not a requirement.
 *
 * @param key - localStorage key to write.
 * @param data - Payload to store; it is serialised with JSON.stringify.
 */
function setCache<T>(key: string, data: T): void {
  try {
    const stamped = { data, timestamp: Date.now() } as CacheEntry<T>;
    const serialised = JSON.stringify(stamped);
    localStorage.setItem(key, serialised);
  } catch {
    // Storage full or unavailable
  }
}
75
-
76
/** Category of a streamed log line. */
type StreamLogType = 'info' | 'success' | 'warning' | 'error' | 'progress';

/** One entry appended to `streamLogs` while a quick or matrix test streams. */
export interface StreamLog {
  type: StreamLogType;
  message: string;
  /** Epoch milliseconds (Date.now()) when the entry was recorded client-side. */
  timestamp: number;
}

/**
 * One row of the `stats` array returned with a run's details.
 * NOTE(review): field meanings (e.g. the scale of `successRate`) are defined
 * by the server response and are not visible here; confirm against the API.
 */
export interface RunStats {
  provider: string;
  model: string;
  totalTests: number;
  successfulTests: number;
  successRate: number;
  avgScore: number | null;
  dimensions: EvalDimensionScores;
  avgLatencyMs: number;
}

/** Result of `loadRunDetails`: the run plus its stats and individual results. */
export interface RunDetails {
  run: EvalRun;
  stats: RunStats[];
  results: EvalQuickTestResult[];
}

/** One entry of `MatrixResult.rankings`. */
interface MatrixRanking {
  profile: string;
  avgScore: number;
  rank: number;
}

/** Summary built from the `complete` event of a matrix test stream. */
export interface MatrixResult {
  profiles: string[];
  scenariosRun: number;
  dimensionAverages: Record<string, Record<string, number>>;
  rankings: MatrixRanking[];
  results: EvalQuickTestResult[];
  runId?: string;
}

/** Paged result of `loadDialogues`. */
interface DialoguePage {
  dialogues: EvalDialogue[];
  total: number;
  hasMore: boolean;
}

/** Everything `useEvalData` exposes to its consumers. */
export interface UseEvalDataReturn {
  // Data
  profiles: EvalProfile[];
  scenarios: EvalScenario[];
  runs: EvalRun[];
  logDates: string[];
  docs: EvalDoc[];

  // Quick Test
  runQuickTest: (scenario: string, profile: string) => void;
  isTestRunning: boolean;
  testResult: EvalQuickTestResult | null;
  streamLogs: StreamLog[];
  clearTestResult: () => void;

  // Matrix Test
  runMatrixTest: (profiles: string[], scenarios: string[]) => void;
  isMatrixRunning: boolean;
  matrixResult: MatrixResult | null;
  clearMatrixResult: () => void;

  // History
  loadRuns: () => Promise<void>;
  loadRunDetails: (runId: string) => Promise<RunDetails | null>;

  // Logs
  loadLogDates: () => Promise<void>;
  loadDialogues: (date: string, offset?: number, limit?: number) => Promise<DialoguePage>;
  loadDialogueById: (dialogueId: string) => Promise<EvalDialogue | null>;

  // Docs
  loadDocs: () => Promise<void>;
  loadDocContent: (name: string) => Promise<string | null>;

  // State
  isLoading: boolean;
  isInitialLoading: boolean;
  error: string | null;
  clearError: () => void;
}
152
-
153
/**
 * Escape a value for use inside a URL path while keeping any `/` separators.
 * Each segment is passed through encodeURIComponent so characters such as
 * `?`, `#` or spaces cannot change which URL is requested.
 */
function encodePathSegments(value: string): string {
  return String(value).split('/').map(encodeURIComponent).join('/');
}

/**
 * Data-access hook for the evaluation dashboard.
 *
 * On mount it loads profiles and scenarios, preferring non-empty localStorage
 * cache entries and otherwise fetching `/api/eval/profiles` and
 * `/api/eval/scenarios`. It also exposes callbacks that stream quick and
 * matrix tests over EventSource and that load runs, logs and docs on demand.
 * Open EventSources and any pending quick-test retry timer are released when
 * the component unmounts.
 *
 * @returns Current data, loading/error state and the callbacks listed in
 *   `UseEvalDataReturn`.
 */
export function useEvalData(): UseEvalDataReturn {
  // Core data
  const [profiles, setProfiles] = useState<EvalProfile[]>([]);
  const [scenarios, setScenarios] = useState<EvalScenario[]>([]);
  const [runs, setRuns] = useState<EvalRun[]>([]);
  const [logDates, setLogDates] = useState<string[]>([]);
  const [docs, setDocs] = useState<EvalDoc[]>([]);

  // Quick test state
  const [isTestRunning, setIsTestRunning] = useState(false);
  const [testResult, setTestResult] = useState<EvalQuickTestResult | null>(null);
  const [streamLogs, setStreamLogs] = useState<StreamLog[]>([]);
  const eventSourceRef = useRef<EventSource | null>(null);

  // Matrix test state
  const [isMatrixRunning, setIsMatrixRunning] = useState(false);
  const [matrixResult, setMatrixResult] = useState<MatrixResult | null>(null);
  const matrixEventSourceRef = useRef<EventSource | null>(null);

  // General state
  const [isLoading, setIsLoading] = useState(false);
  const [isInitialLoading, setIsInitialLoading] = useState(true);
  const [error, setError] = useState<string | null>(null);

  // Quick-test retry bookkeeping. Declared ahead of the unmount effect so the
  // cleanup cancels the same timer that runQuickTest schedules.
  const currentTestConfigRef = useRef<{ scenarioId: string; profile: string } | null>(null);
  const retryCountRef = useRef(0);
  const retryTimeoutRef = useRef<ReturnType<typeof setTimeout> | null>(null);

  // Holds the latest runQuickTest so the retry timer can call it by reference.
  const runQuickTestRef = useRef<(scenarioId: string, profile: string, isRetry?: boolean) => void>();

  // Load profiles and scenarios on mount
  useEffect(() => {
    const loadInitialData = async () => {
      setIsInitialLoading(true);

      // Try cache first (only use if non-empty)
      const cachedProfiles = getCached<EvalProfile[]>(
        CACHE_CONFIG.profiles.key,
        CACHE_CONFIG.profiles.ttl
      );
      const cachedScenarios = getCached<EvalScenario[]>(
        CACHE_CONFIG.scenarios.key,
        CACHE_CONFIG.scenarios.ttl
      );

      // An empty cached list is treated the same as no cache at all.
      const hasValidProfileCache = cachedProfiles && cachedProfiles.length > 0;
      const hasValidScenarioCache = cachedScenarios && cachedScenarios.length > 0;

      if (hasValidProfileCache) setProfiles(cachedProfiles);
      if (hasValidScenarioCache) setScenarios(cachedScenarios);

      // Both endpoints are fetched if either cache is missing.
      if (!hasValidProfileCache || !hasValidScenarioCache) {
        try {
          const [profilesRes, scenariosRes] = await Promise.all([
            fetch('/api/eval/profiles'),
            fetch('/api/eval/scenarios')
          ]);

          if (profilesRes.ok) {
            const data = await profilesRes.json();
            const profileList = data.profiles || [];
            if (profileList.length > 0) {
              setProfiles(profileList);
              setCache(CACHE_CONFIG.profiles.key, profileList);
            }
          }

          if (scenariosRes.ok) {
            const data = await scenariosRes.json();
            const scenarioList = data.scenarios || [];
            if (scenarioList.length > 0) {
              setScenarios(scenarioList);
              setCache(CACHE_CONFIG.scenarios.key, scenarioList);
            }
          }
        } catch (err) {
          console.error('Failed to load initial eval data:', err);
          setError('Failed to load profiles and scenarios');
        }
      }

      setIsInitialLoading(false);
    };

    loadInitialData();
  }, []);

  // Cleanup EventSources and the pending retry timer on unmount
  useEffect(() => {
    return () => {
      if (eventSourceRef.current) {
        eventSourceRef.current.close();
      }
      if (matrixEventSourceRef.current) {
        matrixEventSourceRef.current.close();
      }
      // Cancel a scheduled quick-test retry; otherwise it would open a new
      // EventSource after the component is gone.
      if (retryTimeoutRef.current) {
        clearTimeout(retryTimeoutRef.current);
        retryTimeoutRef.current = null;
      }
    };
  }, []);

  const clearError = useCallback(() => setError(null), []);
  const clearTestResult = useCallback(() => {
    setTestResult(null);
    setStreamLogs([]);
  }, []);
  const clearMatrixResult = useCallback(() => {
    setMatrixResult(null);
    setStreamLogs([]);
  }, []);

  // Quick Test with streaming and auto-retry
  const runQuickTest = useCallback((scenarioId: string, profile: string, isRetry = false) => {
    if (eventSourceRef.current) {
      eventSourceRef.current.close();
    }

    // Clear any pending retry
    if (retryTimeoutRef.current) {
      clearTimeout(retryTimeoutRef.current);
      retryTimeoutRef.current = null;
    }

    // A fresh (non-retry) run resets the retry budget and clears old output.
    if (!isRetry) {
      retryCountRef.current = 0;
      currentTestConfigRef.current = { scenarioId, profile };
    }

    setIsTestRunning(true);
    if (!isRetry) {
      setTestResult(null);
      setStreamLogs([]);
    }
    setError(null);

    const params = new URLSearchParams({
      scenario: scenarioId,
      profile: profile
    });

    const eventSource = new EventSource(`/api/eval/stream/quick?${params}`);
    eventSourceRef.current = eventSource;

    eventSource.onmessage = (event) => {
      try {
        const data = JSON.parse(event.data);

        if (data.type === 'log') {
          setStreamLogs(prev => [...prev, {
            type: data.level || 'info',
            message: data.message || data.content || '',
            timestamp: Date.now()
          }]);
        } else if (data.type === 'progress') {
          setStreamLogs(prev => [...prev, {
            type: 'progress',
            message: `Progress: ${data.current}/${data.total} (${data.percentage}%)`,
            timestamp: Date.now()
          }]);
        } else if (data.type === 'result') {
          setTestResult(data.result);
          // Save result for offline viewing
          setCache(CACHE_CONFIG.lastResult.key, data.result);
        } else if (data.type === 'complete') {
          setIsTestRunning(false);
          retryCountRef.current = 0; // Reset on success
          currentTestConfigRef.current = null;
          eventSource.close();
        } else if (data.type === 'error') {
          // Accept either field name so both streams report errors alike.
          setError(data.message || data.error || 'Test failed');
          setIsTestRunning(false);
          retryCountRef.current = 0;
          currentTestConfigRef.current = null;
          eventSource.close();
        }
      } catch (err) {
        console.error('Failed to parse SSE message:', err);
      }
    };

    eventSource.onerror = () => {
      // Close first so retries are driven by the backoff below only.
      eventSource.close();

      // Auto-retry if we haven't exceeded max retries
      if (retryCountRef.current < RETRY_CONFIG.maxRetries && currentTestConfigRef.current) {
        retryCountRef.current++;
        const delay = getRetryDelay(retryCountRef.current);

        setStreamLogs(prev => [...prev, {
          type: 'warning',
          message: `Connection lost. Retrying in ${Math.round(delay / 1000)}s... (${retryCountRef.current}/${RETRY_CONFIG.maxRetries})`,
          timestamp: Date.now()
        }]);

        retryTimeoutRef.current = setTimeout(() => {
          retryTimeoutRef.current = null;
          if (currentTestConfigRef.current && runQuickTestRef.current) {
            runQuickTestRef.current(
              currentTestConfigRef.current.scenarioId,
              currentTestConfigRef.current.profile,
              true
            );
          }
        }, delay);
      } else {
        setError('Connection lost. Please try again when you have a better connection.');
        setIsTestRunning(false);
        currentTestConfigRef.current = null;
      }
    };
  }, []);

  // Keep the ref updated
  runQuickTestRef.current = runQuickTest;

  // Matrix Test with streaming (no auto-retry)
  const runMatrixTest = useCallback((profileList: string[], scenarioList: string[]) => {
    if (matrixEventSourceRef.current) {
      matrixEventSourceRef.current.close();
    }

    setIsMatrixRunning(true);
    setMatrixResult(null);
    setStreamLogs([]);
    setError(null);

    const params = new URLSearchParams({
      profiles: profileList.join(','),
      // An empty scenario list is sent as the literal 'all'.
      scenarios: scenarioList.length > 0 ? scenarioList.join(',') : 'all'
    });

    const eventSource = new EventSource(`/api/eval/stream/matrix?${params}`);
    matrixEventSourceRef.current = eventSource;

    eventSource.onmessage = (event) => {
      try {
        const data = JSON.parse(event.data);

        if (data.type === 'log') {
          setStreamLogs(prev => [...prev, {
            type: data.level || 'info',
            message: data.message || data.content || '',
            timestamp: Date.now()
          }]);
        } else if (data.type === 'progress') {
          setStreamLogs(prev => [...prev, {
            type: 'progress',
            message: data.message || `Test ${data.current}/${data.total}`,
            timestamp: Date.now()
          }]);
        } else if (data.type === 'complete') {
          setMatrixResult({
            profiles: data.profiles || [],
            scenariosRun: data.scenariosRun || 0,
            dimensionAverages: data.dimensionAverages || {},
            rankings: data.rankings || [],
            results: data.results || [],
            runId: data.runId
          });
          setIsMatrixRunning(false);
          eventSource.close();
        } else if (data.type === 'error') {
          // Accept either field name so both streams report errors alike.
          setError(data.error || data.message || 'Matrix test failed');
          setIsMatrixRunning(false);
          eventSource.close();
        }
      } catch (err) {
        console.error('Failed to parse SSE message:', err);
      }
    };

    eventSource.onerror = () => {
      setError('Connection lost during matrix test');
      setIsMatrixRunning(false);
      eventSource.close();
    };
  }, []);

  // Load runs (served from cache when a fresh entry exists)
  const loadRuns = useCallback(async () => {
    setIsLoading(true);
    try {
      const cached = getCached<EvalRun[]>(CACHE_CONFIG.runs.key, CACHE_CONFIG.runs.ttl);
      if (cached) {
        // The finally block below resets the loading flag.
        setRuns(cached);
        return;
      }

      const res = await fetch('/api/eval/runs?limit=50');
      if (!res.ok) throw new Error('Failed to load runs');

      const data = await res.json();
      const runList = data.runs || [];
      setRuns(runList);
      setCache(CACHE_CONFIG.runs.key, runList);
    } catch (err) {
      setError(err instanceof Error ? err.message : 'Failed to load runs');
    } finally {
      setIsLoading(false);
    }
  }, []);

  // Load run details; returns null and sets `error` on failure
  const loadRunDetails = useCallback(async (runId: string): Promise<RunDetails | null> => {
    try {
      const res = await fetch(`/api/eval/runs/${encodePathSegments(runId)}`);
      if (!res.ok) throw new Error('Failed to load run details');

      const data = await res.json();
      return {
        run: data.run,
        stats: data.stats || [],
        results: data.results || []
      };
    } catch (err) {
      setError(err instanceof Error ? err.message : 'Failed to load run details');
      return null;
    }
  }, []);

  // Load log dates (served from cache when a fresh entry exists)
  const loadLogDates = useCallback(async () => {
    setIsLoading(true);
    try {
      const cached = getCached<string[]>(CACHE_CONFIG.logDates.key, CACHE_CONFIG.logDates.ttl);
      if (cached) {
        setLogDates(cached);
        return;
      }

      const res = await fetch('/api/eval/logs/dates');
      if (!res.ok) throw new Error('Failed to load log dates');

      const data = await res.json();
      const dates = data.dates || [];
      setLogDates(dates);
      setCache(CACHE_CONFIG.logDates.key, dates);
    } catch (err) {
      setError(err instanceof Error ? err.message : 'Failed to load log dates');
    } finally {
      setIsLoading(false);
    }
  }, []);

  // Load one page of dialogues for a date; returns an empty page on failure
  const loadDialogues = useCallback(async (
    date: string,
    offset = 0,
    limit = 10
  ): Promise<{ dialogues: EvalDialogue[]; total: number; hasMore: boolean }> => {
    try {
      const query = new URLSearchParams({
        offset: String(offset),
        limit: String(limit)
      });
      const res = await fetch(`/api/eval/logs/${encodePathSegments(date)}?${query}`);
      if (!res.ok) throw new Error('Failed to load dialogues');

      const data = await res.json();
      return {
        dialogues: data.dialogues || [],
        total: data.total || 0,
        hasMore: data.hasMore || false
      };
    } catch (err) {
      setError(err instanceof Error ? err.message : 'Failed to load dialogues');
      return { dialogues: [], total: 0, hasMore: false };
    }
  }, []);

  // Load single dialogue by ID; returns null and sets `error` on failure
  const loadDialogueById = useCallback(async (dialogueId: string): Promise<EvalDialogue | null> => {
    try {
      const res = await fetch(`/api/eval/logs/dialogue/${encodePathSegments(dialogueId)}`);
      if (!res.ok) throw new Error('Failed to load dialogue');

      const data = await res.json();
      return data.dialogue || null;
    } catch (err) {
      setError(err instanceof Error ? err.message : 'Failed to load dialogue');
      return null;
    }
  }, []);

  // Load docs (served from cache when a fresh entry exists)
  const loadDocs = useCallback(async () => {
    setIsLoading(true);
    try {
      const cached = getCached<EvalDoc[]>(CACHE_CONFIG.docs.key, CACHE_CONFIG.docs.ttl);
      if (cached) {
        setDocs(cached);
        return;
      }

      const res = await fetch('/api/eval/docs');
      if (!res.ok) throw new Error('Failed to load docs');

      const data = await res.json();
      const docList = data.docs || [];
      setDocs(docList);
      setCache(CACHE_CONFIG.docs.key, docList);
    } catch (err) {
      setError(err instanceof Error ? err.message : 'Failed to load docs');
    } finally {
      setIsLoading(false);
    }
  }, []);

  // Load doc content; returns null and sets `error` on failure
  const loadDocContent = useCallback(async (name: string): Promise<string | null> => {
    try {
      const res = await fetch(`/api/eval/docs/${encodePathSegments(name)}`);
      if (!res.ok) throw new Error('Failed to load doc content');

      const data = await res.json();
      return data.content || null;
    } catch (err) {
      setError(err instanceof Error ? err.message : 'Failed to load doc');
      return null;
    }
  }, []);

  return {
    // Data
    profiles,
    scenarios,
    runs,
    logDates,
    docs,

    // Quick Test
    runQuickTest,
    isTestRunning,
    testResult,
    streamLogs,
    clearTestResult,

    // Matrix Test
    runMatrixTest,
    isMatrixRunning,
    matrixResult,
    clearMatrixResult,

    // History
    loadRuns,
    loadRunDetails,

    // Logs
    loadLogDates,
    loadDialogues,
    loadDialogueById,

    // Docs
    loadDocs,
    loadDocContent,

    // State
    isLoading,
    isInitialLoading,
    error,
    clearError
  };
}
624
-
625
- export default useEvalData;
package/server-init.js DELETED
@@ -1,45 +0,0 @@
1
- /**
2
- * Evaluation Extension - Server Initialization
3
- *
4
- * Called by the extension loader when mounting this extension
5
- * into the main Machine Spirits website.
6
- */
7
-
8
- import path from 'path';
9
- import { fileURLToPath } from 'url';
10
- import express from 'express';
11
-
12
- const __filename = fileURLToPath(import.meta.url);
13
- const __dirname = path.dirname(__filename);
14
-
15
/**
 * Mount the evaluation extension onto the host Express application.
 *
 * Registers static file serving for the extension's `components` and `docs`
 * directories and makes sure the extension's `data` directory exists.
 *
 * @param {Object} context - Initialization context
 * @param {Express} context.app - Express application to register middleware on
 * @param {Object} context.manifest - Extension manifest; `name` and `version` are logged
 * @param {string} context.extensionPath - Path to extension directory
 * @param {string} context.rootDir - Path to main website root (not used here)
 * @returns {Promise<void>}
 */
export async function init({ app, manifest, extensionPath, rootDir }) {
  console.log(`[EvalExtension] Initializing ${manifest.name} v${manifest.version}`);

  // URL prefix -> extension subdirectory served statically at that prefix.
  // Components are exposed for client-side imports; docs for documentation.
  const staticMounts = [
    ['/extensions/eval', 'components'],
    ['/docs/extensions/eval', 'docs']
  ];
  for (const [route, subdir] of staticMounts) {
    app.use(route, express.static(path.join(extensionPath, subdir)));
  }

  // Create the data directory on first run.
  const dataDir = path.join(extensionPath, 'data');
  const { existsSync, mkdirSync } = await import('fs');
  const dataDirMissing = !existsSync(dataDir);
  if (dataDirMissing) {
    mkdirSync(dataDir, { recursive: true });
    console.log('[EvalExtension] Created data directory');
  }

  console.log('[EvalExtension] Initialization complete');
}
44
-
45
- export default { init };