@arclabs561/ai-visual-test 0.7.3 → 0.7.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +32 -0
- package/README.md +3 -0
- package/index.d.ts +181 -3
- package/package.json +2 -6
- package/src/batch-optimizer.mjs +3 -3
- package/src/cache.mjs +3 -4
- package/src/calibration-suite.mjs +197 -0
- package/src/constants.mjs +11 -0
- package/src/cost-optimization.mjs +1 -1
- package/src/explanation-manager.mjs +10 -6
- package/src/human-validation-manager.mjs +21 -8
- package/src/index.mjs +20 -10
- package/src/integrations/playwright.mjs +9 -9
- package/src/judge.mjs +9 -18
- package/src/limitations.mjs +106 -0
- package/src/load-env.mjs +3 -2
- package/src/model-tier-selector.mjs +1 -1
- package/src/rubrics.mjs +22 -2
- package/src/score-calibration.mjs +177 -0
- package/src/temporal-decision-manager.mjs +1 -1
- package/src/temporal-preprocessor.mjs +1 -1
- package/src/type-guards.mjs +5 -5
- package/src/utils/cached-llm.mjs +1 -1
- package/src/validation-result-normalizer.mjs +17 -1
- package/src/validation.mjs +13 -13
- package/src/validators/index.mjs +23 -2
- package/src/pricing.mjs +0 -28
- package/src/utils/path-validator.mjs +0 -88
- package/src/validation-framework.mjs +0 -325
|
@@ -1,325 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Validation Framework
|
|
3
|
-
*
|
|
4
|
-
* Provides comprehensive validation of:
|
|
5
|
-
* 1. Temporal perception accuracy (human time scales)
|
|
6
|
-
* 2. VLLM judgment accuracy (against human ground truth)
|
|
7
|
-
* 3. Gameplay temporal experience correctness
|
|
8
|
-
* 4. Webpage evaluation correctness
|
|
9
|
-
*
|
|
10
|
-
* This framework validates that our systems produce correct results,
|
|
11
|
-
* not just that they work.
|
|
12
|
-
*/
|
|
13
|
-
|
|
14
|
-
import { humanPerceptionTime } from './temporal-decision.mjs';
|
|
15
|
-
import { TIME_SCALES } from './temporal-constants.mjs';
|
|
16
|
-
import { aggregateTemporalNotes, calculateCoherenceExported as calculateCoherence } from './temporal.mjs';
|
|
17
|
-
import {
|
|
18
|
-
compareJudgments,
|
|
19
|
-
collectHumanJudgment,
|
|
20
|
-
loadHumanJudgment
|
|
21
|
-
} from '../evaluation/human-validation/human-validation.mjs';
|
|
22
|
-
import { getHumanValidationManager } from './human-validation-manager.mjs';
|
|
23
|
-
import { log, warn } from './logger.mjs';
|
|
24
|
-
|
|
25
|
-
/**
 * Validate temporal perception against published research values.
 *
 * Checks three properties: visual-appeal timing (Lindgaard, 50ms baseline),
 * the instant-feedback threshold (NN/g, 100ms), and that estimated reading
 * time grows with content length.
 *
 * @param {Object} options - Validation options (currently unused).
 * @returns {Object} Results with researchAlignment, consistency, recommendations.
 */
export function validateTemporalPerception(options = {}) {
  const report = {
    researchAlignment: {},
    consistency: {},
    recommendations: []
  };

  // Visual appeal: research baseline is 50ms (Lindgaard).
  const appealMs = humanPerceptionTime('visual-appeal', {
    attentionLevel: 'focused',
    actionComplexity: 'simple'
  });
  report.researchAlignment.visualAppeal = {
    researchValue: 50, // Lindgaard research
    actualValue: appealMs,
    aligned: appealMs >= 50 && appealMs <= 200,
    note: appealMs >= 100
      ? 'Enforces 100ms minimum (implementation constraint)'
      : 'Matches research (50ms)'
  };

  // Instant threshold: research value is exactly 100ms (NN/g).
  const instantOk = TIME_SCALES.INSTANT === 100;
  report.researchAlignment.instantThreshold = {
    researchValue: 100, // NN/g research
    actualValue: TIME_SCALES.INSTANT,
    aligned: instantOk,
    note: instantOk ? 'Matches research' : 'Does not match research'
  };

  // Reading time must scale with the amount of content.
  const briefMs = humanPerceptionTime('reading', { contentLength: 100 });
  const lengthyMs = humanPerceptionTime('reading', { contentLength: 1000 });
  const scales = lengthyMs > briefMs;
  report.researchAlignment.readingTime = {
    scalesWithContent: scales,
    shortContent: briefMs,
    longContent: lengthyMs,
    note: scales
      ? 'Reading time scales with content'
      : 'Reading time does not scale correctly'
  };

  // Surface a recommendation for every misaligned check.
  if (!report.researchAlignment.visualAppeal.aligned) {
    report.recommendations.push('Visual appeal time does not align with research (50ms)');
  }
  if (!instantOk) {
    report.recommendations.push('Instant threshold does not match research (100ms)');
  }
  if (!scales) {
    report.recommendations.push('Reading time does not scale correctly with content length');
  }

  return report;
}
|
|
83
|
-
|
|
84
|
-
/**
 * Validate VLLM judgment accuracy against human ground truth.
 *
 * Compares the two judgment sets via `compareJudgments` and applies three
 * thresholds to the agreement statistics: Pearson correlation, mean absolute
 * error, and Cohen's kappa. Any comparison failure is reported rather than
 * thrown.
 *
 * @param {Array} humanJudgments - Human judgment ground truth.
 * @param {Array} vllmJudgments - VLLM judgments to validate.
 * @param {Object} options - Validation options.
 * @param {number} [options.minCorrelation=0.7] - Minimum acceptable Pearson correlation.
 * @param {number} [options.maxMAE=1.0] - Maximum acceptable mean absolute error.
 * @param {number} [options.minKappa=0.6] - Minimum acceptable Cohen's kappa.
 * @returns {Object} Validation results with calibration, isValid, issues, recommendations.
 */
export function validateVLLMAccuracy(humanJudgments, vllmJudgments, options = {}) {
  const {
    minCorrelation = 0.7,
    maxMAE = 1.0,
    minKappa = 0.6
  } = options;

  try {
    const calibration = compareJudgments(humanJudgments, vllmJudgments);
    const { pearson, mae, kappa } = calibration.agreement;

    const issues = [];
    if (pearson < minCorrelation) {
      issues.push(`Low correlation (${pearson.toFixed(3)} < ${minCorrelation})`);
    }
    if (mae > maxMAE) {
      issues.push(`High MAE (${mae.toFixed(2)} > ${maxMAE})`);
    }
    if (kappa < minKappa) {
      issues.push(`Low Kappa (${kappa.toFixed(3)} < ${minKappa})`);
    }

    const passed = issues.length === 0;
    return {
      calibration,
      isValid: passed,
      issues,
      recommendations: passed
        ? ['VLLM judgments align well with human ground truth']
        : [...calibration.recommendations]
    };
  } catch (error) {
    // Comparison itself failed (e.g. mismatched judgment sets) —
    // report instead of propagating.
    return {
      error: error.message,
      isValid: false,
      issues: ['Failed to compare judgments'],
      recommendations: ['Ensure human and VLLM judgments are properly matched']
    };
  }
}
|
|
145
|
-
|
|
146
|
-
/**
 * Validate gameplay temporal experience.
 *
 * Aggregates temporal notes and checks coherence, conflicting observations,
 * and window coverage. Issues from those checks are reported without
 * flipping `isValid`; only a missing/empty notes array fails validation
 * outright.
 *
 * @param {Array} gameplayNotes - Temporal notes from gameplay.
 * @param {Object} options - Validation options (also forwarded to the aggregator).
 * @param {number} [options.minCoherenceForSmooth=0.7] - Coherence at/above this reads as smooth.
 * @param {number} [options.maxCoherenceForErratic=0.5] - Coherence at/below this reads as erratic.
 * @returns {Promise<Object>} Validation results.
 */
export async function validateGameplayTemporal(gameplayNotes, options = {}) {
  const {
    minCoherenceForSmooth = 0.7,
    maxCoherenceForErratic = 0.5
  } = options;

  // Guard: nothing to analyze.
  if (!gameplayNotes || gameplayNotes.length === 0) {
    return {
      isValid: false,
      issues: ['No gameplay notes provided'],
      recommendations: ['Provide gameplay notes for validation']
    };
  }

  const aggregated = await aggregateTemporalNotes(gameplayNotes, options);

  const results = {
    aggregated,
    isValid: true,
    issues: [],
    recommendations: []
  };

  // Coherence in the ambiguous middle band (neither smooth nor clearly
  // erratic) is flagged for human review.
  if (aggregated.coherence < minCoherenceForSmooth && aggregated.coherence > maxCoherenceForErratic) {
    results.issues.push(`Moderate coherence (${aggregated.coherence.toFixed(3)}) - neither smooth nor clearly erratic`);
    results.recommendations.push('Review gameplay notes for consistency issues');
  }

  // Conflicting observations across notes.
  if (aggregated.conflicts && aggregated.conflicts.length > 0) {
    results.issues.push(`Detected ${aggregated.conflicts.length} conflicts in gameplay notes`);
    results.recommendations.push('Review conflicting observations in gameplay notes');
  }

  // Fix: guard `windows` the same way `conflicts` is guarded above — the
  // original accessed `aggregated.windows.length` unconditionally and threw
  // a TypeError whenever the aggregator omitted the array. A missing array
  // is treated as "insufficient windows".
  if (!aggregated.windows || aggregated.windows.length < 2) {
    results.issues.push('Insufficient windows for temporal analysis (need at least 2)');
    results.recommendations.push('Collect more gameplay notes or use smaller window size');
  }

  return results;
}
|
|
196
|
-
|
|
197
|
-
/**
 * Validate webpage evaluation correctness.
 *
 * Validates that VLLM judgments about webpages align with human expectations
 * (when ground truth is supplied) and that each evaluation record is
 * structurally sound: a numeric score in [0, 10] and an array of issues.
 *
 * @param {Array} evaluations - Array of evaluation results.
 * @param {Object} groundTruth - Ground truth data (if available).
 * @param {Object} options - Validation options forwarded to validateVLLMAccuracy.
 * @returns {Object} Validation results with isValid, issues, recommendations.
 */
export function validateWebpageEvaluation(evaluations, groundTruth = null, options = {}) {
  const results = {
    evaluations,
    isValid: true,
    issues: [],
    recommendations: []
  };

  // If ground truth available, compare against VLLM judgments.
  if (groundTruth) {
    const humanJudgments = groundTruth.humanJudgments || [];
    // Fix: index-based fallback id — the original used `eval-${Date.now()}`,
    // which produces DUPLICATE ids for every evaluation mapped within the
    // same millisecond (i.e. effectively all of them).
    const vllmJudgments = evaluations.map((evaluation, index) => ({
      id: evaluation.id || `eval-${index}`,
      vllmScore: evaluation.score,
      vllmIssues: evaluation.issues || [],
      vllmReasoning: evaluation.reasoning || '',
      provider: evaluation.provider || 'unknown',
      timestamp: evaluation.timestamp || new Date().toISOString()
    }));

    if (humanJudgments.length > 0 && vllmJudgments.length > 0) {
      const accuracy = validateVLLMAccuracy(humanJudgments, vllmJudgments, options);
      results.accuracy = accuracy;
      results.isValid = accuracy.isValid;
      results.issues.push(...accuracy.issues);
      results.recommendations.push(...accuracy.recommendations);
    }
  }

  // Structural validation of each evaluation record.
  for (const evaluation of evaluations) {
    const label = evaluation.id || 'unknown';
    if (evaluation.score === null || evaluation.score === undefined) {
      results.issues.push(`Evaluation ${label} has null/undefined score`);
      results.isValid = false;
    } else if (typeof evaluation.score !== 'number'
        || Number.isNaN(evaluation.score)
        || evaluation.score < 0
        || evaluation.score > 10) {
      // Fix: NaN and non-numeric scores previously slipped through because
      // `NaN < 0` and `NaN > 10` are both false; they are now rejected.
      results.issues.push(`Evaluation ${label} has invalid score: ${evaluation.score}`);
      results.isValid = false;
    }
    if (!Array.isArray(evaluation.issues)) {
      results.issues.push(`Evaluation ${label} has non-array issues`);
      results.isValid = false;
    }
  }

  return results;
}
|
|
254
|
-
|
|
255
|
-
/**
 * Comprehensive validation report.
 *
 * Runs each validator whose inputs are present in `data` and folds the
 * outcomes into a single report with an overall verdict. Temporal-perception
 * recommendations and gameplay issues are collected without failing the
 * overall result; VLLM accuracy and webpage evaluation failures do fail it.
 *
 * @param {Object} data - Validation data; each section runs only when its inputs exist.
 * @param {Object} options - Options forwarded to every validator.
 * @returns {Promise<Object>} Comprehensive validation report.
 */
export async function validateComprehensive(data, options = {}) {
  const overall = { isValid: true, issues: [], recommendations: [] };
  const report = {
    temporalPerception: null,
    vllmAccuracy: null,
    gameplayTemporal: null,
    webpageEvaluation: null,
    overall
  };

  // Temporal perception runs unless explicitly disabled.
  if (data.temporalPerception !== false) {
    const temporal = validateTemporalPerception(options);
    report.temporalPerception = temporal;
    if (temporal.recommendations.length > 0) {
      overall.issues.push('Temporal perception validation issues');
      overall.recommendations.push(...temporal.recommendations);
    }
  }

  // VLLM accuracy requires both judgment sets.
  if (data.humanJudgments && data.vllmJudgments) {
    const accuracy = validateVLLMAccuracy(data.humanJudgments, data.vllmJudgments, options);
    report.vllmAccuracy = accuracy;
    if (!accuracy.isValid) {
      overall.isValid = false;
      overall.issues.push('VLLM accuracy validation failed');
      overall.recommendations.push(...accuracy.recommendations);
    }
  }

  // Gameplay temporal analysis (async note aggregation).
  if (data.gameplayNotes) {
    const gameplay = await validateGameplayTemporal(data.gameplayNotes, options);
    report.gameplayTemporal = gameplay;
    if (gameplay && gameplay.issues && gameplay.issues.length > 0) {
      overall.issues.push('Gameplay temporal validation issues');
      overall.recommendations.push(...(gameplay.recommendations || []));
    }
  }

  // Webpage evaluation, optionally against ground truth.
  if (data.evaluations) {
    const webpage = validateWebpageEvaluation(data.evaluations, data.groundTruth, options);
    report.webpageEvaluation = webpage;
    if (!webpage.isValid) {
      overall.isValid = false;
      overall.issues.push('Webpage evaluation validation failed');
      overall.recommendations.push(...webpage.recommendations);
    }
  }

  return report;
}
|
|
325
|
-
|