@machinespirits/eval 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/components/MobileEvalDashboard.tsx +267 -0
- package/components/comparison/DeltaAnalysisTable.tsx +137 -0
- package/components/comparison/ProfileComparisonCard.tsx +176 -0
- package/components/comparison/RecognitionABMode.tsx +385 -0
- package/components/comparison/RecognitionMetricsPanel.tsx +135 -0
- package/components/comparison/WinnerIndicator.tsx +64 -0
- package/components/comparison/index.ts +5 -0
- package/components/mobile/BottomSheet.tsx +233 -0
- package/components/mobile/DimensionBreakdown.tsx +210 -0
- package/components/mobile/DocsView.tsx +363 -0
- package/components/mobile/LogsView.tsx +481 -0
- package/components/mobile/PsychodynamicQuadrant.tsx +261 -0
- package/components/mobile/QuickTestView.tsx +1098 -0
- package/components/mobile/RecognitionTypeChart.tsx +124 -0
- package/components/mobile/RecognitionView.tsx +809 -0
- package/components/mobile/RunDetailView.tsx +261 -0
- package/components/mobile/RunHistoryView.tsx +367 -0
- package/components/mobile/ScoreRadial.tsx +211 -0
- package/components/mobile/StreamingLogPanel.tsx +230 -0
- package/components/mobile/SynthesisStrategyChart.tsx +140 -0
- package/config/interaction-eval-scenarios.yaml +832 -0
- package/config/learner-agents.yaml +248 -0
- package/docs/research/ABLATION-DIALOGUE-ROUNDS.md +52 -0
- package/docs/research/ABLATION-MODEL-SELECTION.md +53 -0
- package/docs/research/ADVANCED-EVAL-ANALYSIS.md +60 -0
- package/docs/research/ANOVA-RESULTS-2026-01-14.md +257 -0
- package/docs/research/COMPREHENSIVE-EVALUATION-PLAN.md +586 -0
- package/docs/research/COST-ANALYSIS.md +56 -0
- package/docs/research/CRITICAL-REVIEW-RECOGNITION-TUTORING.md +340 -0
- package/docs/research/DYNAMIC-VS-SCRIPTED-ANALYSIS.md +291 -0
- package/docs/research/EVAL-SYSTEM-ANALYSIS.md +306 -0
- package/docs/research/FACTORIAL-RESULTS-2026-01-14.md +301 -0
- package/docs/research/IMPLEMENTATION-PLAN-CRITIQUE-RESPONSE.md +1988 -0
- package/docs/research/LONGITUDINAL-DYADIC-EVALUATION.md +282 -0
- package/docs/research/MULTI-JUDGE-VALIDATION-2026-01-14.md +147 -0
- package/docs/research/PAPER-EXTENSION-DYADIC.md +204 -0
- package/docs/research/PAPER-UNIFIED.md +659 -0
- package/docs/research/PAPER-UNIFIED.pdf +0 -0
- package/docs/research/PROMPT-IMPROVEMENTS-2026-01-14.md +356 -0
- package/docs/research/SESSION-NOTES-2026-01-11-RECOGNITION-EVAL.md +419 -0
- package/docs/research/apa.csl +2133 -0
- package/docs/research/archive/PAPER-DRAFT-RECOGNITION-TUTORING.md +1637 -0
- package/docs/research/archive/paper-multiagent-tutor.tex +978 -0
- package/docs/research/paper-draft/full-paper.md +136 -0
- package/docs/research/paper-draft/images/pasted-image-2026-01-24T03-47-47-846Z-d76a7ae2.png +0 -0
- package/docs/research/paper-draft/references.bib +515 -0
- package/docs/research/transcript-baseline.md +139 -0
- package/docs/research/transcript-recognition-multiagent.md +187 -0
- package/hooks/useEvalData.ts +625 -0
- package/index.js +27 -0
- package/package.json +73 -0
- package/routes/evalRoutes.js +3002 -0
- package/scripts/advanced-eval-analysis.js +351 -0
- package/scripts/analyze-eval-costs.js +378 -0
- package/scripts/analyze-eval-results.js +513 -0
- package/scripts/analyze-interaction-evals.js +368 -0
- package/server-init.js +45 -0
- package/server.js +162 -0
- package/services/benchmarkService.js +1892 -0
- package/services/evaluationRunner.js +739 -0
- package/services/evaluationStore.js +1121 -0
- package/services/learnerConfigLoader.js +385 -0
- package/services/learnerTutorInteractionEngine.js +857 -0
- package/services/memory/learnerMemoryService.js +1227 -0
- package/services/memory/learnerWritingPad.js +577 -0
- package/services/memory/tutorWritingPad.js +674 -0
- package/services/promptRecommendationService.js +493 -0
- package/services/rubricEvaluator.js +826 -0
|
@@ -0,0 +1,625 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* useEvalData Hook
|
|
3
|
+
*
|
|
4
|
+
* Data fetching and caching hook for the mobile evaluation dashboard.
|
|
5
|
+
* Handles all eval API interactions with localStorage caching for offline support.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import { useState, useCallback, useRef, useEffect } from 'react';
|
|
9
|
+
import type {
|
|
10
|
+
EvalProfile,
|
|
11
|
+
EvalScenario,
|
|
12
|
+
EvalRun,
|
|
13
|
+
EvalQuickTestResult,
|
|
14
|
+
EvalDialogue,
|
|
15
|
+
EvalDoc,
|
|
16
|
+
EvalDimensionScores
|
|
17
|
+
} from '../types';
|
|
18
|
+
|
|
19
|
+
/**
 * localStorage cache settings, one entry per cached resource.
 *
 * `key` is the localStorage key the entry is stored under and `ttl` is the
 * maximum age, in milliseconds, before `getCached` treats the entry as expired.
 */
const CACHE_CONFIG = {
  profiles: { key: 'eval-profiles', ttl: 5 * 60 * 1000 }, // 5 minutes
  scenarios: { key: 'eval-scenarios', ttl: 5 * 60 * 1000 }, // 5 minutes
  runs: { key: 'eval-runs', ttl: 60 * 1000 }, // 1 minute
  logDates: { key: 'eval-log-dates', ttl: 5 * 60 * 1000 }, // 5 minutes
  docs: { key: 'eval-docs', ttl: 60 * 60 * 1000 }, // 1 hour
  lastResult: { key: 'eval-last-result', ttl: 24 * 60 * 60 * 1000 } // 24 hours - offline viewing
};
|
|
28
|
+
|
|
29
|
+
/**
 * Retry configuration for connection resilience.
 *
 * - `maxRetries`: how many automatic reconnect attempts are made before giving up.
 * - `baseDelay`: backoff base, in milliseconds.
 * - `maxDelay`: hard upper bound, in milliseconds, on any single retry delay.
 */
const RETRY_CONFIG = {
  maxRetries: 3,
  baseDelay: 1000, // 1 second
  maxDelay: 10000 // 10 seconds
};

/**
 * Calculate an exponential backoff delay with random jitter.
 *
 * The delay is `baseDelay * 2^attempt` plus up to one second of jitter, and the
 * final value (jitter included) never exceeds `RETRY_CONFIG.maxDelay`.
 *
 * @param attempt Backoff exponent. Non-finite or negative values are treated as 0
 *   so the result is always a usable `setTimeout` delay.
 * @returns Delay in milliseconds, in the range `[0, RETRY_CONFIG.maxDelay]`.
 */
function getRetryDelay(attempt: number): number {
  const MAX_JITTER_MS = 1000;

  // Guard against NaN/Infinity/negative input, which would otherwise produce a
  // NaN or sub-base delay.
  const exponent = Number.isFinite(attempt) ? Math.max(0, attempt) : 0;
  const backoff = RETRY_CONFIG.baseDelay * Math.pow(2, exponent);
  const jitter = Math.random() * MAX_JITTER_MS;

  // Clamp AFTER adding jitter so maxDelay is a true upper bound.
  return Math.min(backoff + jitter, RETRY_CONFIG.maxDelay);
}
|
|
45
|
+
|
|
46
|
+
/** Envelope stored in localStorage: the cached payload plus its write time. */
interface CacheEntry<T> {
  data: T;
  /** `Date.now()` value captured when the entry was written. */
  timestamp: number;
}

/** Runtime check that a parsed JSON value has the `CacheEntry` shape. */
function isCacheEntry(value: unknown): value is CacheEntry<unknown> {
  if (typeof value !== 'object' || value === null) return false;
  const candidate = value as { timestamp?: unknown };
  return (
    'data' in value &&
    typeof candidate.timestamp === 'number' &&
    Number.isFinite(candidate.timestamp)
  );
}

/** Best-effort removal of a cache key; storage failures are ignored. */
function discardCached(key: string): void {
  try {
    localStorage.removeItem(key);
  } catch {
    // Storage unavailable - nothing to clean up
  }
}

/**
 * Read a value from the localStorage cache.
 *
 * @param key localStorage key to read.
 * @param ttl Maximum accepted age of the entry, in milliseconds.
 * @returns The cached payload, or `null` when the key is absent, the entry is
 *   older than `ttl`, the stored value is not a well-formed cache entry, or
 *   localStorage is unavailable. Expired and malformed entries are removed.
 */
function getCached<T>(key: string, ttl: number): T | null {
  try {
    const raw = localStorage.getItem(key);
    if (!raw) return null;

    const parsed: unknown = JSON.parse(raw);

    // An entry without a numeric timestamp would never expire
    // (`NaN > ttl` is false), and one without `data` would yield `undefined`.
    if (!isCacheEntry(parsed)) {
      discardCached(key);
      return null;
    }

    if (Date.now() - parsed.timestamp > ttl) {
      discardCached(key);
      return null;
    }

    // The payload type cannot be verified at runtime; callers choose T.
    return parsed.data as T;
  } catch {
    // Unparseable JSON or unavailable storage: treat as a cache miss and drop
    // the bad entry so it is not re-parsed on every read.
    discardCached(key);
    return null;
  }
}

/**
 * Write a value to the localStorage cache, stamped with the current time.
 * Failures (quota exceeded, storage unavailable, unserializable data) are
 * deliberately ignored: caching is best-effort.
 *
 * @param key localStorage key to write.
 * @param data JSON-serializable payload.
 */
function setCache<T>(key: string, data: T): void {
  try {
    const entry: CacheEntry<T> = { data, timestamp: Date.now() };
    localStorage.setItem(key, JSON.stringify(entry));
  } catch {
    // Storage full or unavailable
  }
}
|
|
75
|
+
|
|
76
|
+
/** Kinds of entries that can appear in the streamed test log. */
type StreamLogType = 'info' | 'success' | 'warning' | 'error' | 'progress';

/** One line of the log shown while a quick or matrix test is streaming. */
export interface StreamLog {
  /** Entry kind. */
  type: StreamLogType;
  /** Text of the entry. */
  message: string;
  /** Client-side `Date.now()` value recorded when the entry was appended. */
  timestamp: number;
}
|
|
81
|
+
|
|
82
|
+
/**
 * One statistics row of an evaluation run, as returned in the `stats` array of
 * the run-details response.
 *
 * NOTE(review): field meanings below are read off the field names; the
 * producing endpoint is not part of this file - confirm against the API.
 */
export interface RunStats {
  /** Provider identifier for this row. */
  provider: string;
  /** Model identifier for this row. */
  model: string;
  /** Number of tests counted in this row. */
  totalTests: number;
  /** Number of those tests that succeeded. */
  successfulTests: number;
  /** Success rate; presumably successfulTests / totalTests - units (ratio vs. percent) unconfirmed. */
  successRate: number;
  /** Average score, or `null` when none is available. */
  avgScore: number | null;
  /** Per-dimension scores. */
  dimensions: EvalDimensionScores;
  /** Average latency in milliseconds. */
  avgLatencyMs: number;
}
|
|
92
|
+
|
|
93
|
+
/** Payload resolved by `loadRunDetails`: a run with its statistics and results. */
export interface RunDetails {
  /** The run record itself. */
  run: EvalRun;
  /** Statistics rows for the run; empty when the response carries none. */
  stats: RunStats[];
  /** Individual test results for the run; empty when the response carries none. */
  results: EvalQuickTestResult[];
}
|
|
98
|
+
|
|
99
|
+
/** One row of the profile ranking reported by a matrix test. */
type MatrixRanking = { profile: string; avgScore: number; rank: number };

/**
 * Final summary of a matrix test, built from the `complete` event of the
 * matrix stream. Every field except `runId` falls back to an empty value when
 * the event omits it.
 */
export interface MatrixResult {
  /** Profiles reported by the completed matrix run. */
  profiles: string[];
  /** Number of scenarios that were run. */
  scenariosRun: number;
  /**
   * Nested map of averaged dimension scores.
   * NOTE(review): presumably keyed profile -> dimension; confirm against the server.
   */
  dimensionAverages: Record<string, Record<string, number>>;
  /** Profile ranking rows. */
  rankings: MatrixRanking[];
  /** Individual test results. */
  results: EvalQuickTestResult[];
  /** Identifier of the run, when the server supplies one. */
  runId?: string;
}
|
|
107
|
+
|
|
108
|
+
/** Everything the `useEvalData` hook exposes to the evaluation dashboard. */
export interface UseEvalDataReturn {
  // ---- Data ----

  /** Profiles loaded on mount (from cache or `/api/eval/profiles`). */
  profiles: EvalProfile[];
  /** Scenarios loaded on mount (from cache or `/api/eval/scenarios`). */
  scenarios: EvalScenario[];
  /** Runs populated by `loadRuns`. */
  runs: EvalRun[];
  /** Log dates populated by `loadLogDates`. */
  logDates: string[];
  /** Doc listing populated by `loadDocs`. */
  docs: EvalDoc[];

  // ---- Quick Test ----

  /** Start a streamed quick test for one scenario/profile pair. */
  runQuickTest: (scenario: string, profile: string) => void;
  /** True while a quick test is streaming or waiting to retry. */
  isTestRunning: boolean;
  /** Result of the most recent quick test, or `null`. */
  testResult: EvalQuickTestResult | null;
  /** Log lines streamed by the current quick or matrix test (shared by both). */
  streamLogs: StreamLog[];
  /** Reset `testResult` and `streamLogs`. */
  clearTestResult: () => void;

  // ---- Matrix Test ----

  /** Start a streamed matrix test; an empty `scenarios` list requests all scenarios. */
  runMatrixTest: (profiles: string[], scenarios: string[]) => void;
  /** True while a matrix test is streaming. */
  isMatrixRunning: boolean;
  /** Summary of the most recent matrix test, or `null`. */
  matrixResult: MatrixResult | null;
  /** Reset `matrixResult` and `streamLogs`. */
  clearMatrixResult: () => void;

  // ---- History ----

  /** Populate `runs` from cache or the API. */
  loadRuns: () => Promise<void>;
  /** Fetch one run's details; resolves to `null` (and sets `error`) on failure. */
  loadRunDetails: (runId: string) => Promise<RunDetails | null>;

  // ---- Logs ----

  /** Populate `logDates` from cache or the API. */
  loadLogDates: () => Promise<void>;
  /**
   * Fetch one page of dialogues for a date. Resolves to an empty page
   * (and sets `error`) on failure.
   */
  loadDialogues: (
    date: string,
    offset?: number,
    limit?: number
  ) => Promise<{
    dialogues: EvalDialogue[];
    total: number;
    hasMore: boolean;
  }>;
  /** Fetch a single dialogue; resolves to `null` (and sets `error`) on failure. */
  loadDialogueById: (dialogueId: string) => Promise<EvalDialogue | null>;

  // ---- Docs ----

  /** Populate `docs` from cache or the API. */
  loadDocs: () => Promise<void>;
  /** Fetch one doc's content; resolves to `null` when missing or on failure. */
  loadDocContent: (name: string) => Promise<string | null>;

  // ---- State ----

  /** True while `loadRuns`, `loadLogDates` or `loadDocs` is in flight. */
  isLoading: boolean;
  /** True until the mount-time profile/scenario load has finished. */
  isInitialLoading: boolean;
  /** Most recent error message, or `null`. */
  error: string | null;
  /** Reset `error` to `null`. */
  clearError: () => void;
}
|
|
152
|
+
|
|
153
|
+
/**
 * Data hook for the mobile evaluation dashboard.
 *
 * On mount it loads profiles and scenarios (localStorage cache first, then the
 * API). It exposes streamed quick/matrix tests over Server-Sent Events, plus
 * loaders for run history, dialogue logs and docs. All failures are reported
 * through the returned `error` string rather than thrown.
 *
 * On unmount it closes any open EventSource and cancels a pending quick-test
 * retry so no stream is reopened for an unmounted component.
 *
 * @returns The data, actions and status flags described by `UseEvalDataReturn`.
 */
export function useEvalData(): UseEvalDataReturn {
  // Core data
  const [profiles, setProfiles] = useState<EvalProfile[]>([]);
  const [scenarios, setScenarios] = useState<EvalScenario[]>([]);
  const [runs, setRuns] = useState<EvalRun[]>([]);
  const [logDates, setLogDates] = useState<string[]>([]);
  const [docs, setDocs] = useState<EvalDoc[]>([]);

  // Quick test state
  const [isTestRunning, setIsTestRunning] = useState(false);
  const [testResult, setTestResult] = useState<EvalQuickTestResult | null>(null);
  const [streamLogs, setStreamLogs] = useState<StreamLog[]>([]);
  const eventSourceRef = useRef<EventSource | null>(null);

  // Matrix test state
  const [isMatrixRunning, setIsMatrixRunning] = useState(false);
  const [matrixResult, setMatrixResult] = useState<MatrixResult | null>(null);
  const matrixEventSourceRef = useRef<EventSource | null>(null);

  // General state
  const [isLoading, setIsLoading] = useState(false);
  const [isInitialLoading, setIsInitialLoading] = useState(true);
  const [error, setError] = useState<string | null>(null);

  // Quick-test retry bookkeeping. Refs (not state) because the SSE callbacks
  // must always see the latest values without re-creating the callbacks.
  const currentTestConfigRef = useRef<{ scenarioId: string; profile: string } | null>(null);
  const retryCountRef = useRef(0);
  const retryTimeoutRef = useRef<ReturnType<typeof setTimeout> | null>(null);

  // Holds the latest runQuickTest so the retry timer can call it by reference.
  const runQuickTestRef = useRef<
    ((scenarioId: string, profile: string, isRetry?: boolean) => void) | undefined
  >(undefined);

  // Load profiles and scenarios on mount
  useEffect(() => {
    // Set by the cleanup below so a late response does not update state
    // after the component has unmounted.
    let cancelled = false;

    const loadInitialData = async (): Promise<void> => {
      setIsInitialLoading(true);

      // Try cache first; an empty cached list is treated as "no cache".
      const cachedProfiles = getCached<EvalProfile[]>(
        CACHE_CONFIG.profiles.key,
        CACHE_CONFIG.profiles.ttl
      );
      const cachedScenarios = getCached<EvalScenario[]>(
        CACHE_CONFIG.scenarios.key,
        CACHE_CONFIG.scenarios.ttl
      );
      const usableProfiles =
        cachedProfiles && cachedProfiles.length > 0 ? cachedProfiles : null;
      const usableScenarios =
        cachedScenarios && cachedScenarios.length > 0 ? cachedScenarios : null;

      if (usableProfiles) setProfiles(usableProfiles);
      if (usableScenarios) setScenarios(usableScenarios);

      // Fetch fresh data if either list is missing from the cache
      if (!usableProfiles || !usableScenarios) {
        try {
          const [profilesRes, scenariosRes] = await Promise.all([
            fetch('/api/eval/profiles'),
            fetch('/api/eval/scenarios')
          ]);

          // A non-OK response only counts as a failure when there was no
          // cached copy to fall back on for that list.
          let loadFailed = false;

          if (profilesRes.ok) {
            const data = await profilesRes.json();
            const profileList: EvalProfile[] = data.profiles || [];
            if (profileList.length > 0) {
              if (!cancelled) setProfiles(profileList);
              setCache(CACHE_CONFIG.profiles.key, profileList);
            }
          } else if (!usableProfiles) {
            loadFailed = true;
          }

          if (scenariosRes.ok) {
            const data = await scenariosRes.json();
            const scenarioList: EvalScenario[] = data.scenarios || [];
            if (scenarioList.length > 0) {
              if (!cancelled) setScenarios(scenarioList);
              setCache(CACHE_CONFIG.scenarios.key, scenarioList);
            }
          } else if (!usableScenarios) {
            loadFailed = true;
          }

          if (loadFailed && !cancelled) {
            setError('Failed to load profiles and scenarios');
          }
        } catch (err) {
          console.error('Failed to load initial eval data:', err);
          if (!cancelled) setError('Failed to load profiles and scenarios');
        }
      }

      if (!cancelled) setIsInitialLoading(false);
    };

    void loadInitialData();

    return () => {
      cancelled = true;
    };
  }, []);

  // Cleanup EventSources and the pending retry timer on unmount
  useEffect(() => {
    return () => {
      if (eventSourceRef.current) {
        eventSourceRef.current.close();
        eventSourceRef.current = null;
      }
      if (matrixEventSourceRef.current) {
        matrixEventSourceRef.current.close();
        matrixEventSourceRef.current = null;
      }
      // Must clear the timer that runQuickTest actually schedules; otherwise
      // a retry fires after unmount and opens a new stream.
      if (retryTimeoutRef.current) {
        clearTimeout(retryTimeoutRef.current);
        retryTimeoutRef.current = null;
      }
      currentTestConfigRef.current = null;
    };
  }, []);

  const clearError = useCallback(() => setError(null), []);
  const clearTestResult = useCallback(() => {
    setTestResult(null);
    setStreamLogs([]);
  }, []);
  const clearMatrixResult = useCallback(() => {
    setMatrixResult(null);
    setStreamLogs([]);
  }, []);

  // Append one entry to the shared stream log, stamped with the current time.
  const appendStreamLog = useCallback((type: StreamLog['type'], message: string) => {
    setStreamLogs(prev => [...prev, { type, message, timestamp: Date.now() }]);
  }, []);

  // Quick Test with streaming and auto-retry
  const runQuickTest = useCallback((scenarioId: string, profile: string, isRetry = false) => {
    // Only one quick-test stream at a time
    if (eventSourceRef.current) {
      eventSourceRef.current.close();
      eventSourceRef.current = null;
    }

    // Clear any pending retry
    if (retryTimeoutRef.current) {
      clearTimeout(retryTimeoutRef.current);
      retryTimeoutRef.current = null;
    }

    // A fresh (non-retry) run resets the retry budget and previous output;
    // a retry keeps the existing logs so the user sees the reconnect history.
    if (!isRetry) {
      retryCountRef.current = 0;
      currentTestConfigRef.current = { scenarioId, profile };
      setTestResult(null);
      setStreamLogs([]);
    }

    setIsTestRunning(true);
    setError(null);

    const params = new URLSearchParams({
      scenario: scenarioId,
      profile: profile
    });

    const eventSource = new EventSource(`/api/eval/stream/quick?${params}`);
    eventSourceRef.current = eventSource;

    // Close this stream and end the run (used for both success and server error).
    const finish = (): void => {
      setIsTestRunning(false);
      retryCountRef.current = 0;
      currentTestConfigRef.current = null;
      eventSource.close();
      if (eventSourceRef.current === eventSource) {
        eventSourceRef.current = null;
      }
    };

    eventSource.onmessage = (event) => {
      try {
        const data = JSON.parse(event.data);

        if (data.type === 'log') {
          appendStreamLog(data.level || 'info', data.message || data.content || '');
        } else if (data.type === 'progress') {
          appendStreamLog(
            'progress',
            `Progress: ${data.current}/${data.total} (${data.percentage}%)`
          );
        } else if (data.type === 'result') {
          setTestResult(data.result);
          // Save result for offline viewing.
          // NOTE(review): nothing in this hook reads this cache entry back.
          setCache(CACHE_CONFIG.lastResult.key, data.result);
        } else if (data.type === 'complete') {
          finish();
        } else if (data.type === 'error') {
          // Accept either payload field; the matrix stream reports `error`.
          setError(data.message || data.error || 'Test failed');
          finish();
        }
      } catch (err) {
        console.error('Failed to parse SSE message:', err);
      }
    };

    eventSource.onerror = () => {
      // Close explicitly so the browser's built-in reconnect does not race
      // with the backoff-based retry below.
      eventSource.close();
      if (eventSourceRef.current === eventSource) {
        eventSourceRef.current = null;
      }

      // Auto-retry if we haven't exceeded max retries
      if (retryCountRef.current < RETRY_CONFIG.maxRetries && currentTestConfigRef.current) {
        // Zero-based backoff index: the first retry waits ~baseDelay.
        const attempt = retryCountRef.current;
        retryCountRef.current = attempt + 1;
        const delay = getRetryDelay(attempt);

        appendStreamLog(
          'warning',
          `Connection lost. Retrying in ${Math.round(delay / 1000)}s... (${retryCountRef.current}/${RETRY_CONFIG.maxRetries})`
        );

        retryTimeoutRef.current = setTimeout(() => {
          retryTimeoutRef.current = null;
          const pending = currentTestConfigRef.current;
          const rerun = runQuickTestRef.current;
          if (pending && rerun) {
            rerun(pending.scenarioId, pending.profile, true);
          }
        }, delay);
      } else {
        setError('Connection lost. Please try again when you have a better connection.');
        setIsTestRunning(false);
        currentTestConfigRef.current = null;
      }
    };
  }, [appendStreamLog]);

  // Keep the ref updated
  runQuickTestRef.current = runQuickTest;

  // Matrix Test with streaming (no auto-retry)
  const runMatrixTest = useCallback((profileList: string[], scenarioList: string[]) => {
    if (matrixEventSourceRef.current) {
      matrixEventSourceRef.current.close();
      matrixEventSourceRef.current = null;
    }

    setIsMatrixRunning(true);
    setMatrixResult(null);
    setStreamLogs([]);
    setError(null);

    const params = new URLSearchParams({
      profiles: profileList.join(','),
      // An empty selection means "run every scenario"
      scenarios: scenarioList.length > 0 ? scenarioList.join(',') : 'all'
    });

    const eventSource = new EventSource(`/api/eval/stream/matrix?${params}`);
    matrixEventSourceRef.current = eventSource;

    // Close this stream and mark the matrix run as finished.
    const finish = (): void => {
      setIsMatrixRunning(false);
      eventSource.close();
      if (matrixEventSourceRef.current === eventSource) {
        matrixEventSourceRef.current = null;
      }
    };

    eventSource.onmessage = (event) => {
      try {
        const data = JSON.parse(event.data);

        if (data.type === 'log') {
          appendStreamLog(data.level || 'info', data.message || data.content || '');
        } else if (data.type === 'progress') {
          appendStreamLog('progress', data.message || `Test ${data.current}/${data.total}`);
        } else if (data.type === 'complete') {
          setMatrixResult({
            profiles: data.profiles || [],
            scenariosRun: data.scenariosRun || 0,
            dimensionAverages: data.dimensionAverages || {},
            rankings: data.rankings || [],
            results: data.results || [],
            runId: data.runId
          });
          finish();
        } else if (data.type === 'error') {
          // Accept either payload field; the quick stream reports `message`.
          setError(data.error || data.message || 'Matrix test failed');
          finish();
        }
      } catch (err) {
        console.error('Failed to parse SSE message:', err);
      }
    };

    eventSource.onerror = () => {
      setError('Connection lost during matrix test');
      finish();
    };
  }, [appendStreamLog]);

  // Load runs (cache first, then the 50 most recent from the API)
  const loadRuns = useCallback(async () => {
    setIsLoading(true);
    try {
      const cached = getCached<EvalRun[]>(CACHE_CONFIG.runs.key, CACHE_CONFIG.runs.ttl);
      if (cached) {
        setRuns(cached);
        return;
      }

      const res = await fetch('/api/eval/runs?limit=50');
      if (!res.ok) throw new Error('Failed to load runs');

      const data = await res.json();
      const runList: EvalRun[] = data.runs || [];
      setRuns(runList);
      setCache(CACHE_CONFIG.runs.key, runList);
    } catch (err) {
      setError(err instanceof Error ? err.message : 'Failed to load runs');
    } finally {
      setIsLoading(false);
    }
  }, []);

  // Load run details (never cached)
  const loadRunDetails = useCallback(async (runId: string): Promise<RunDetails | null> => {
    try {
      // Encode the id so reserved characters cannot alter the request path
      const res = await fetch(`/api/eval/runs/${encodeURIComponent(runId)}`);
      if (!res.ok) throw new Error('Failed to load run details');

      const data = await res.json();
      return {
        run: data.run,
        stats: data.stats || [],
        results: data.results || []
      };
    } catch (err) {
      setError(err instanceof Error ? err.message : 'Failed to load run details');
      return null;
    }
  }, []);

  // Load log dates (cache first)
  const loadLogDates = useCallback(async () => {
    setIsLoading(true);
    try {
      const cached = getCached<string[]>(CACHE_CONFIG.logDates.key, CACHE_CONFIG.logDates.ttl);
      if (cached) {
        setLogDates(cached);
        return;
      }

      const res = await fetch('/api/eval/logs/dates');
      if (!res.ok) throw new Error('Failed to load log dates');

      const data = await res.json();
      const dates: string[] = data.dates || [];
      setLogDates(dates);
      setCache(CACHE_CONFIG.logDates.key, dates);
    } catch (err) {
      setError(err instanceof Error ? err.message : 'Failed to load log dates');
    } finally {
      setIsLoading(false);
    }
  }, []);

  // Load one page of dialogues for a date
  const loadDialogues = useCallback(async (
    date: string,
    offset = 0,
    limit = 10
  ): Promise<{ dialogues: EvalDialogue[]; total: number; hasMore: boolean }> => {
    try {
      const query = new URLSearchParams({
        offset: String(offset),
        limit: String(limit)
      });
      const res = await fetch(`/api/eval/logs/${encodeURIComponent(date)}?${query}`);
      if (!res.ok) throw new Error('Failed to load dialogues');

      const data = await res.json();
      return {
        dialogues: data.dialogues || [],
        total: data.total || 0,
        hasMore: data.hasMore || false
      };
    } catch (err) {
      setError(err instanceof Error ? err.message : 'Failed to load dialogues');
      return { dialogues: [], total: 0, hasMore: false };
    }
  }, []);

  // Load single dialogue by ID
  const loadDialogueById = useCallback(async (dialogueId: string): Promise<EvalDialogue | null> => {
    try {
      const res = await fetch(`/api/eval/logs/dialogue/${encodeURIComponent(dialogueId)}`);
      if (!res.ok) throw new Error('Failed to load dialogue');

      const data = await res.json();
      return data.dialogue || null;
    } catch (err) {
      setError(err instanceof Error ? err.message : 'Failed to load dialogue');
      return null;
    }
  }, []);

  // Load docs listing (cache first)
  const loadDocs = useCallback(async () => {
    setIsLoading(true);
    try {
      const cached = getCached<EvalDoc[]>(CACHE_CONFIG.docs.key, CACHE_CONFIG.docs.ttl);
      if (cached) {
        setDocs(cached);
        return;
      }

      const res = await fetch('/api/eval/docs');
      if (!res.ok) throw new Error('Failed to load docs');

      const data = await res.json();
      const docList: EvalDoc[] = data.docs || [];
      setDocs(docList);
      setCache(CACHE_CONFIG.docs.key, docList);
    } catch (err) {
      setError(err instanceof Error ? err.message : 'Failed to load docs');
    } finally {
      setIsLoading(false);
    }
  }, []);

  // Load doc content
  const loadDocContent = useCallback(async (name: string): Promise<string | null> => {
    try {
      const res = await fetch(`/api/eval/docs/${encodeURIComponent(name)}`);
      if (!res.ok) throw new Error('Failed to load doc content');

      const data = await res.json();
      return data.content || null;
    } catch (err) {
      setError(err instanceof Error ? err.message : 'Failed to load doc');
      return null;
    }
  }, []);

  return {
    // Data
    profiles,
    scenarios,
    runs,
    logDates,
    docs,

    // Quick Test
    runQuickTest,
    isTestRunning,
    testResult,
    streamLogs,
    clearTestResult,

    // Matrix Test
    runMatrixTest,
    isMatrixRunning,
    matrixResult,
    clearMatrixResult,

    // History
    loadRuns,
    loadRunDetails,

    // Logs
    loadLogDates,
    loadDialogues,
    loadDialogueById,

    // Docs
    loadDocs,
    loadDocContent,

    // State
    isLoading,
    isInitialLoading,
    error,
    clearError
  };
}

export default useEvalData;
|
package/index.js
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @machinespirits/eval - Evaluation Extension
|
|
3
|
+
*
|
|
4
|
+
* Provides tutor evaluation, benchmarking, and analysis capabilities.
|
|
5
|
+
* Can run standalone or as an extension mounted in the main website.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
// Core evaluation services
|
|
9
|
+
export * as evaluationRunner from './services/evaluationRunner.js';
|
|
10
|
+
export * as evaluationStore from './services/evaluationStore.js';
|
|
11
|
+
export * as rubricEvaluator from './services/rubricEvaluator.js';
|
|
12
|
+
export * as benchmarkService from './services/benchmarkService.js';
|
|
13
|
+
|
|
14
|
+
// Learner simulation services
|
|
15
|
+
export * as learnerConfigLoader from './services/learnerConfigLoader.js';
|
|
16
|
+
export * as learnerTutorInteractionEngine from './services/learnerTutorInteractionEngine.js';
|
|
17
|
+
export * as promptRecommendationService from './services/promptRecommendationService.js';
|
|
18
|
+
|
|
19
|
+
// Re-export routes for manual mounting
|
|
20
|
+
export { default as evalRoutes } from './routes/evalRoutes.js';
|
|
21
|
+
|
|
22
|
+
/**
 * Static package metadata exposed to consumers of this module.
 * The values are hard-coded here rather than read from package.json.
 */
export const packageInfo = {
  name: '@machinespirits/eval',
  version: '0.1.0',
  description: 'Evaluation system for Machine Spirits tutor'
};
|