@arclabs561/ai-visual-test 0.7.3 → 0.7.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,325 +0,0 @@
1
- /**
2
- * Validation Framework
3
- *
4
- * Provides comprehensive validation of:
5
- * 1. Temporal perception accuracy (human time scales)
6
- * 2. VLLM judgment accuracy (against human ground truth)
7
- * 3. Gameplay temporal experience correctness
8
- * 4. Webpage evaluation correctness
9
- *
10
- * This framework validates that our systems produce correct results,
11
- * not just that they work.
12
- */
13
-
14
- import { humanPerceptionTime } from './temporal-decision.mjs';
15
- import { TIME_SCALES } from './temporal-constants.mjs';
16
- import { aggregateTemporalNotes, calculateCoherenceExported as calculateCoherence } from './temporal.mjs';
17
- import {
18
- compareJudgments,
19
- collectHumanJudgment,
20
- loadHumanJudgment
21
- } from '../evaluation/human-validation/human-validation.mjs';
22
- import { getHumanValidationManager } from './human-validation-manager.mjs';
23
- import { log, warn } from './logger.mjs';
24
-
/**
 * Validate temporal perception against research values.
 *
 * Runs three checks and records each under `researchAlignment`:
 *   - visualAppeal: `humanPerceptionTime('visual-appeal', ...)` must fall in
 *     the 50-200ms band.
 *   - instantThreshold: `TIME_SCALES.INSTANT` must equal 100.
 *   - readingTime: reading time for 1000 units of content must exceed the
 *     reading time for 100 units.
 * Every failed check adds one entry to `recommendations`.
 *
 * @param {Object} options - Validation options (not read by this function)
 * @returns {Object} `{ researchAlignment, consistency, recommendations }`;
 *   `consistency` is returned as an empty object (nothing populates it here)
 */
export function validateTemporalPerception(options = {}) {
  const results = {
    researchAlignment: {},
    consistency: {},
    recommendations: []
  };

  // Validate visual appeal time (50ms research base)
  const visualAppealTime = humanPerceptionTime('visual-appeal', {
    attentionLevel: 'focused',
    actionComplexity: 'simple'
  });
  const visualAppealAligned = visualAppealTime >= 50 && visualAppealTime <= 200;

  // The note must agree with `aligned`: a value outside the accepted band
  // (or a non-numeric value) is never described as matching the research.
  let visualAppealNote;
  if (!visualAppealAligned) {
    visualAppealNote = 'Outside research-aligned range (50-200ms)';
  } else if (visualAppealTime >= 100) {
    visualAppealNote = 'Enforces 100ms minimum (implementation constraint)';
  } else {
    visualAppealNote = 'Matches research (50ms)';
  }

  results.researchAlignment.visualAppeal = {
    researchValue: 50, // Lindgaard research
    actualValue: visualAppealTime,
    aligned: visualAppealAligned,
    note: visualAppealNote
  };

  // Validate instant threshold (100ms research)
  const instantAligned = TIME_SCALES.INSTANT === 100;
  results.researchAlignment.instantThreshold = {
    researchValue: 100, // NN/g research
    actualValue: TIME_SCALES.INSTANT,
    aligned: instantAligned,
    note: instantAligned ? 'Matches research' : 'Does not match research'
  };

  // Validate reading time scales with content
  const shortReading = humanPerceptionTime('reading', { contentLength: 100 });
  const longReading = humanPerceptionTime('reading', { contentLength: 1000 });
  const scalesWithContent = longReading > shortReading;

  results.researchAlignment.readingTime = {
    scalesWithContent,
    shortContent: shortReading,
    longContent: longReading,
    note: scalesWithContent ? 'Reading time scales with content' : 'Reading time does not scale correctly'
  };

  // Generate recommendations
  if (!visualAppealAligned) {
    results.recommendations.push('Visual appeal time does not align with research (50ms)');
  }
  if (!instantAligned) {
    results.recommendations.push('Instant threshold does not match research (100ms)');
  }
  if (!scalesWithContent) {
    results.recommendations.push('Reading time does not scale correctly with content length');
  }

  return results;
}
83
-
/**
 * Validate VLLM judgment accuracy against human ground truth.
 *
 * Delegates the comparison to `compareJudgments` and then checks three
 * agreement metrics from `calibration.agreement` against thresholds:
 * `pearson` (minimum), `mae` (maximum) and `kappa` (minimum).
 *
 * A metric that is missing or not a finite number is reported as an issue.
 * Without that check, NaN/undefined would compare false against every
 * threshold and be accepted as valid, and null would crash in `.toFixed`.
 *
 * @param {Array} humanJudgments - Human judgment ground truth
 * @param {Array} vllmJudgments - VLLM judgments to validate
 * @param {Object} options - Validation options
 * @param {number} [options.minCorrelation=0.7] - Minimum acceptable Pearson correlation
 * @param {number} [options.maxMAE=1.0] - Maximum acceptable mean absolute error
 * @param {number} [options.minKappa=0.6] - Minimum acceptable kappa
 * @returns {Object} `{ calibration, isValid, issues, recommendations }`, or
 *   `{ error, isValid: false, issues, recommendations }` when the comparison throws
 */
export function validateVLLMAccuracy(humanJudgments, vllmJudgments, options = {}) {
  const {
    minCorrelation = 0.7,
    maxMAE = 1.0,
    minKappa = 0.6
  } = options;

  try {
    const calibration = compareJudgments(humanJudgments, vllmJudgments);
    const { pearson, mae, kappa } = calibration.agreement;

    const results = {
      calibration,
      isValid: false,
      issues: [],
      recommendations: []
    };

    // Check correlation
    if (!Number.isFinite(pearson)) {
      results.issues.push(`Correlation unavailable (${pearson})`);
    } else if (pearson < minCorrelation) {
      results.issues.push(`Low correlation (${pearson.toFixed(3)} < ${minCorrelation})`);
    }

    // Check MAE
    if (!Number.isFinite(mae)) {
      results.issues.push(`MAE unavailable (${mae})`);
    } else if (mae > maxMAE) {
      results.issues.push(`High MAE (${mae.toFixed(2)} > ${maxMAE})`);
    }

    // Check Kappa
    if (!Number.isFinite(kappa)) {
      results.issues.push(`Kappa unavailable (${kappa})`);
    } else if (kappa < minKappa) {
      results.issues.push(`Low Kappa (${kappa.toFixed(3)} < ${minKappa})`);
    }

    // Valid only when every metric was present and within its threshold
    if (results.issues.length === 0) {
      results.isValid = true;
      results.recommendations.push('VLLM judgments align well with human ground truth');
    } else if (Array.isArray(calibration.recommendations)) {
      // Guarded so a calibration without recommendations does not discard
      // the issues collected above by throwing into the catch block.
      results.recommendations.push(...calibration.recommendations);
    }

    return results;
  } catch (error) {
    return {
      error: error.message,
      isValid: false,
      issues: ['Failed to compare judgments'],
      recommendations: ['Ensure human and VLLM judgments are properly matched']
    };
  }
}
145
-
/**
 * Validate gameplay temporal experience.
 *
 * Aggregates the notes with `aggregateTemporalNotes` (the full `options`
 * object is forwarded to it) and inspects the aggregate for:
 *   - coherence strictly between the "erratic" and "smooth" thresholds,
 *   - any reported conflicts,
 *   - fewer than two windows.
 *
 * Findings are advisory: after aggregation `isValid` stays `true` even when
 * issues are recorded. Only missing or empty input yields `isValid: false`.
 *
 * @param {Array} gameplayNotes - Temporal notes from gameplay
 * @param {Object} options - Validation options
 * @param {number} [options.minCoherenceForSmooth=0.7] - Coherence at or above this is treated as smooth
 * @param {number} [options.maxCoherenceForErratic=0.5] - Coherence at or below this is treated as erratic
 * @returns {Promise<Object>} `{ aggregated, isValid, issues, recommendations }`
 *   (no `aggregated` field when the input is missing or empty)
 */
export async function validateGameplayTemporal(gameplayNotes, options = {}) {
  const {
    minCoherenceForSmooth = 0.7,
    maxCoherenceForErratic = 0.5
  } = options;

  if (!gameplayNotes || gameplayNotes.length === 0) {
    return {
      isValid: false,
      issues: ['No gameplay notes provided'],
      recommendations: ['Provide gameplay notes for validation']
    };
  }

  const aggregated = await aggregateTemporalNotes(gameplayNotes, options);

  const results = {
    aggregated,
    isValid: true,
    issues: [],
    recommendations: []
  };

  // Check coherence
  if (aggregated.coherence < minCoherenceForSmooth && aggregated.coherence > maxCoherenceForErratic) {
    results.issues.push(`Moderate coherence (${aggregated.coherence.toFixed(3)}) - neither smooth nor clearly erratic`);
    results.recommendations.push('Review gameplay notes for consistency issues');
  }

  // Check for conflicts
  if (aggregated.conflicts && aggregated.conflicts.length > 0) {
    results.issues.push(`Detected ${aggregated.conflicts.length} conflicts in gameplay notes`);
    results.recommendations.push('Review conflicting observations in gameplay notes');
  }

  // Check window count. A missing `windows` field counts as zero windows,
  // mirroring the guard on `conflicts` above, instead of throwing.
  const windowCount = aggregated.windows ? aggregated.windows.length : 0;
  if (windowCount < 2) {
    results.issues.push('Insufficient windows for temporal analysis (need at least 2)');
    results.recommendations.push('Collect more gameplay notes or use smaller window size');
  }

  return results;
}
196
-
/**
 * Validate webpage evaluation correctness.
 *
 * Two independent checks:
 *   1. When `groundTruth.humanJudgments` and the evaluations are both
 *      non-empty, the evaluations are mapped to VLLM judgment records and
 *      compared through `validateVLLMAccuracy`; its verdict, issues and
 *      recommendations are merged into the result.
 *   2. Every evaluation is checked structurally: `score` must be present and
 *      be a real number in 0-10, and `issues` must be an array.
 *
 * @param {Array} evaluations - Array of evaluation results
 * @param {Object} groundTruth - Ground truth data (if available)
 * @param {Object} options - Validation options, forwarded to `validateVLLMAccuracy`
 * @returns {Object} `{ evaluations, isValid, issues, recommendations }` plus
 *   `accuracy` when a ground-truth comparison was performed
 */
export function validateWebpageEvaluation(evaluations, groundTruth = null, options = {}) {
  const results = {
    evaluations,
    isValid: true,
    issues: [],
    recommendations: []
  };

  // Report a non-array input as a validation failure rather than throwing
  // on `.map` / `for...of` below.
  if (!Array.isArray(evaluations)) {
    results.isValid = false;
    results.issues.push('Evaluations must be an array');
    return results;
  }

  // If ground truth available, compare
  if (groundTruth) {
    const humanJudgments = groundTruth.humanJudgments || [];
    const vllmJudgments = evaluations.map(evaluation => ({
      // NOTE(review): the fallback id is time-based, so evaluations without
      // an id that are mapped in the same millisecond share one id — confirm
      // whether compareJudgments matches records by id.
      id: evaluation.id || `eval-${Date.now()}`,
      vllmScore: evaluation.score,
      vllmIssues: evaluation.issues || [],
      vllmReasoning: evaluation.reasoning || '',
      provider: evaluation.provider || 'unknown',
      timestamp: evaluation.timestamp || new Date().toISOString()
    }));

    if (humanJudgments.length > 0 && vllmJudgments.length > 0) {
      const accuracy = validateVLLMAccuracy(humanJudgments, vllmJudgments, options);
      results.accuracy = accuracy;
      results.isValid = accuracy.isValid;
      results.issues.push(...accuracy.issues);
      results.recommendations.push(...accuracy.recommendations);
    }
  }

  // Validate evaluation structure
  for (const evaluation of evaluations) {
    const label = evaluation.id || 'unknown';
    const { score } = evaluation;

    if (score === null || score === undefined) {
      results.issues.push(`Evaluation ${label} has null/undefined score`);
      results.isValid = false;
    } else if (typeof score !== 'number' || Number.isNaN(score) || score < 0 || score > 10) {
      // NaN and non-numeric values fail both range comparisons, so they are
      // rejected explicitly instead of slipping through as valid.
      results.issues.push(`Evaluation ${label} has invalid score: ${score}`);
      results.isValid = false;
    }

    if (!Array.isArray(evaluation.issues)) {
      results.issues.push(`Evaluation ${label} has non-array issues`);
      results.isValid = false;
    }
  }

  return results;
}
254
-
/**
 * Comprehensive validation report.
 *
 * Runs each validator whose input is present and collects the results:
 *   - temporal perception: always, unless `data.temporalPerception === false`
 *   - VLLM accuracy: when both `humanJudgments` and `vllmJudgments` are given
 *   - gameplay temporal: when `gameplayNotes` is given (awaited)
 *   - webpage evaluation: when `evaluations` is given
 *
 * `overall.isValid` is set to `false` only by a failed VLLM accuracy or
 * webpage evaluation check. Temporal perception and gameplay findings are
 * added to `overall.issues` / `overall.recommendations` without changing it.
 *
 * @param {Object} data - Validation data; may be omitted or null, in which
 *   case only the temporal perception check runs
 * @param {Object} options - Validation options, forwarded to every validator
 * @returns {Promise<Object>} Comprehensive validation report
 */
export async function validateComprehensive(data = {}, options = {}) {
  // The default parameter covers `undefined`; this also covers `null`.
  const input = data || {};

  const report = {
    temporalPerception: null,
    vllmAccuracy: null,
    gameplayTemporal: null,
    webpageEvaluation: null,
    overall: {
      isValid: true,
      issues: [],
      recommendations: []
    }
  };

  // Validate temporal perception
  if (input.temporalPerception !== false) {
    report.temporalPerception = validateTemporalPerception(options);
    if (report.temporalPerception.recommendations.length > 0) {
      report.overall.issues.push('Temporal perception validation issues');
      report.overall.recommendations.push(...report.temporalPerception.recommendations);
    }
  }

  // Validate VLLM accuracy
  if (input.humanJudgments && input.vllmJudgments) {
    report.vllmAccuracy = validateVLLMAccuracy(input.humanJudgments, input.vllmJudgments, options);
    if (!report.vllmAccuracy.isValid) {
      report.overall.isValid = false;
      report.overall.issues.push('VLLM accuracy validation failed');
      report.overall.recommendations.push(...report.vllmAccuracy.recommendations);
    }
  }

  // Validate gameplay temporal. validateGameplayTemporal is async, so its
  // result is awaited before being inspected.
  if (input.gameplayNotes) {
    const gameplayResult = await validateGameplayTemporal(input.gameplayNotes, options);
    report.gameplayTemporal = gameplayResult;
    if (gameplayResult && gameplayResult.issues && gameplayResult.issues.length > 0) {
      report.overall.issues.push('Gameplay temporal validation issues');
      report.overall.recommendations.push(...(gameplayResult.recommendations || []));
    }
  }

  // Validate webpage evaluation
  if (input.evaluations) {
    report.webpageEvaluation = validateWebpageEvaluation(
      input.evaluations,
      input.groundTruth,
      options
    );
    if (!report.webpageEvaluation.isValid) {
      report.overall.isValid = false;
      report.overall.issues.push('Webpage evaluation validation failed');
      report.overall.recommendations.push(...report.webpageEvaluation.recommendations);
    }
  }

  return report;
}
325
-