@arclabs561/ai-visual-test 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.secretsignore.example +20 -0
- package/CHANGELOG.md +360 -0
- package/CONTRIBUTING.md +63 -0
- package/DEPLOYMENT.md +80 -0
- package/LICENSE +22 -0
- package/README.md +142 -0
- package/SECURITY.md +108 -0
- package/api/health.js +34 -0
- package/api/validate.js +252 -0
- package/index.d.ts +1221 -0
- package/package.json +112 -0
- package/public/index.html +149 -0
- package/src/batch-optimizer.mjs +451 -0
- package/src/bias-detector.mjs +370 -0
- package/src/bias-mitigation.mjs +233 -0
- package/src/cache.mjs +433 -0
- package/src/config.mjs +268 -0
- package/src/constants.mjs +80 -0
- package/src/context-compressor.mjs +350 -0
- package/src/convenience.mjs +617 -0
- package/src/cost-tracker.mjs +257 -0
- package/src/cross-modal-consistency.mjs +170 -0
- package/src/data-extractor.mjs +232 -0
- package/src/dynamic-few-shot.mjs +140 -0
- package/src/dynamic-prompts.mjs +361 -0
- package/src/ensemble/index.mjs +53 -0
- package/src/ensemble-judge.mjs +366 -0
- package/src/error-handler.mjs +67 -0
- package/src/errors.mjs +167 -0
- package/src/experience-propagation.mjs +128 -0
- package/src/experience-tracer.mjs +487 -0
- package/src/explanation-manager.mjs +299 -0
- package/src/feedback-aggregator.mjs +248 -0
- package/src/game-goal-prompts.mjs +478 -0
- package/src/game-player.mjs +548 -0
- package/src/hallucination-detector.mjs +155 -0
- package/src/helpers/playwright.mjs +80 -0
- package/src/human-validation-manager.mjs +516 -0
- package/src/index.mjs +364 -0
- package/src/judge.mjs +929 -0
- package/src/latency-aware-batch-optimizer.mjs +192 -0
- package/src/load-env.mjs +159 -0
- package/src/logger.mjs +55 -0
- package/src/metrics.mjs +187 -0
- package/src/model-tier-selector.mjs +221 -0
- package/src/multi-modal/index.mjs +36 -0
- package/src/multi-modal-fusion.mjs +190 -0
- package/src/multi-modal.mjs +524 -0
- package/src/natural-language-specs.mjs +1071 -0
- package/src/pair-comparison.mjs +277 -0
- package/src/persona/index.mjs +42 -0
- package/src/persona-enhanced.mjs +200 -0
- package/src/persona-experience.mjs +572 -0
- package/src/position-counterbalance.mjs +140 -0
- package/src/prompt-composer.mjs +375 -0
- package/src/render-change-detector.mjs +583 -0
- package/src/research-enhanced-validation.mjs +436 -0
- package/src/retry.mjs +152 -0
- package/src/rubrics.mjs +231 -0
- package/src/score-tracker.mjs +277 -0
- package/src/smart-validator.mjs +447 -0
- package/src/spec-config.mjs +106 -0
- package/src/spec-templates.mjs +347 -0
- package/src/specs/index.mjs +38 -0
- package/src/temporal/index.mjs +102 -0
- package/src/temporal-adaptive.mjs +163 -0
- package/src/temporal-batch-optimizer.mjs +222 -0
- package/src/temporal-constants.mjs +69 -0
- package/src/temporal-context.mjs +49 -0
- package/src/temporal-decision-manager.mjs +271 -0
- package/src/temporal-decision.mjs +669 -0
- package/src/temporal-errors.mjs +58 -0
- package/src/temporal-note-pruner.mjs +173 -0
- package/src/temporal-preprocessor.mjs +543 -0
- package/src/temporal-prompt-formatter.mjs +219 -0
- package/src/temporal-validation.mjs +159 -0
- package/src/temporal.mjs +415 -0
- package/src/type-guards.mjs +311 -0
- package/src/uncertainty-reducer.mjs +470 -0
- package/src/utils/index.mjs +175 -0
- package/src/validation-framework.mjs +321 -0
- package/src/validation-result-normalizer.mjs +64 -0
- package/src/validation.mjs +243 -0
- package/src/validators/accessibility-programmatic.mjs +345 -0
- package/src/validators/accessibility-validator.mjs +223 -0
- package/src/validators/batch-validator.mjs +143 -0
- package/src/validators/hybrid-validator.mjs +268 -0
- package/src/validators/index.mjs +34 -0
- package/src/validators/prompt-builder.mjs +218 -0
- package/src/validators/rubric.mjs +85 -0
- package/src/validators/state-programmatic.mjs +260 -0
- package/src/validators/state-validator.mjs +291 -0
- package/vercel.json +27 -0
|
@@ -0,0 +1,470 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Uncertainty Reduction for VLLM API Calls
|
|
3
|
+
*
|
|
4
|
+
* Research-backed strategies to reduce uncertainty in VLLM judgments:
|
|
5
|
+
* - Multiple API calls (self-consistency, ensemble)
|
|
6
|
+
* - Logprob analysis (token-level confidence)
|
|
7
|
+
* - Hallucination detection
|
|
8
|
+
* - Confidence calibration
|
|
9
|
+
*
|
|
10
|
+
* Research: Self-consistency improves accuracy by 5-15% (arXiv:2203.11171)
|
|
11
|
+
* Research: Ensemble methods reduce uncertainty (arXiv:2305.10429)
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
import { detectHallucination } from './hallucination-detector.mjs';
|
|
15
|
+
import { log, warn } from './logger.mjs';
|
|
16
|
+
|
|
17
|
+
/**
 * Estimate uncertainty from logprobs
 *
 * Two input shapes are handled:
 * - OpenAI completions format `{ token_logprobs: [...] }`: non-numeric entries
 *   (null, undefined, NaN, ...) are ignored and statistics are computed over the rest.
 * - Any other object: every finite number found by `extractNumericValues` is
 *   treated as a logprob and averaged.
 *
 * Uncertainty is `1 - exp(average logprob)`, clamped to [0, 1]; confidence is
 * its complement.
 *
 * @param {any} logprobs - Logprobs from API response
 * @returns {Object} Uncertainty estimate: `{ uncertainty, confidence, method }`, plus
 *   `avgLogprob` (both logprob methods) and `avgProb`, `minProb`, `variance`,
 *   `tokenCount` (method `'logprobs'` only)
 */
export function estimateUncertainty(logprobs) {
  if (!logprobs) {
    return { uncertainty: 0.5, confidence: 0.5, method: 'default' };
  }

  const clamp01 = (x) => Math.max(0, Math.min(1, x));
  const isFiniteNumber = (x) => typeof x === 'number' && Number.isFinite(x);

  // OpenAI format: { tokens: [...], token_logprobs: [...] }
  if (Array.isArray(logprobs.token_logprobs)) {
    // Keep only real numbers: a bare `!== null` check would let undefined/NaN
    // through and turn every statistic below into NaN.
    const valid = logprobs.token_logprobs.filter(isFiniteNumber);
    if (valid.length === 0) {
      return { uncertainty: 0.5, confidence: 0.5, method: 'no-logprobs' };
    }

    // Single pass for sum and minimum; avoids Math.min(...valid), which can
    // exceed the engine's argument limit on long token sequences.
    let logprobSum = 0;
    let minLogprob = Infinity;
    for (const p of valid) {
      logprobSum += p;
      if (p < minLogprob) {
        minLogprob = p;
      }
    }
    const avgLogprob = logprobSum / valid.length;
    const variance = valid.reduce((sum, p) => sum + Math.pow(p - avgLogprob, 2), 0) / valid.length;

    // Convert logprob to probability: exp(logprob)
    const avgProb = Math.exp(avgLogprob);
    const minProb = Math.exp(minLogprob);

    // Low logprob (more negative) = high uncertainty
    const uncertainty = clamp01(1.0 - avgProb);
    const confidence = clamp01(1.0 - uncertainty);

    return {
      uncertainty,
      confidence,
      method: 'logprobs',
      avgLogprob,
      avgProb,
      minProb,
      variance,
      tokenCount: valid.length
    };
  }

  // Gemini format: varies, may be nested
  if (typeof logprobs === 'object') {
    // Try to extract any numeric logprob values (non-finite ones are dropped
    // here as well so the average can never become NaN)
    const values = extractNumericValues(logprobs).filter(isFiniteNumber);
    if (values.length > 0) {
      const avg = values.reduce((a, b) => a + b, 0) / values.length;
      const uncertainty = clamp01(1.0 - Math.exp(avg));
      return {
        uncertainty,
        confidence: 1.0 - uncertainty,
        method: 'logprobs-gemini',
        avgLogprob: avg
      };
    }
  }

  return { uncertainty: 0.5, confidence: 0.5, method: 'unknown-format' };
}
|
|
83
|
+
|
|
84
|
+
/**
 * Extract numeric values from nested object
 *
 * Walks arrays and plain objects depth-first, in iteration order, and collects
 * every finite number. NaN and +/-Infinity are skipped: `typeof NaN` is
 * `'number'`, and a single such value would turn the caller's average into NaN.
 *
 * @param {any} obj - Value to search (number, array, object, or anything else)
 * @param {number} [maxDepth=3] - Deepest nesting level that is still inspected;
 *   values nested deeper than this are ignored
 * @param {number} [depth=0] - Nesting level of `obj` itself
 * @returns {number[]} Finite numbers found, in traversal order
 */
function extractNumericValues(obj, maxDepth = 3, depth = 0) {
  const values = [];

  // Accumulate into one array instead of push(...recursiveResult), which can
  // exceed the engine's argument limit on very large arrays.
  const visit = (node, level) => {
    if (level > maxDepth) {
      return;
    }
    if (typeof node === 'number') {
      if (Number.isFinite(node)) {
        values.push(node);
      }
      return;
    }
    if (node === null || typeof node !== 'object') {
      return;
    }
    const children = Array.isArray(node) ? node : Object.values(node);
    for (const child of children) {
      visit(child, level + 1);
    }
  };

  visit(obj, depth);
  return values;
}
|
|
101
|
+
|
|
102
|
+
/**
 * Self-consistency check: Multiple API calls with same prompt
 *
 * Calls `judgeFn` sequentially until `n` usable results have been collected or
 * `maxCalls` attempts have been made. A result is usable only when its `score`
 * is a finite number; failed calls are logged via `warn` and skipped. The
 * usable scores are averaged, and their spread relative to the mean is turned
 * into a consistency value in [0, 1].
 *
 * Research: Self-consistency improves accuracy by 5-15% (arXiv:2203.11171)
 *
 * @param {Function} judgeFn - Function to call judge API; should resolve to an object with a numeric `score`
 * @param {number} [n=3] - Number of usable results to collect
 * @param {Object} [options={}] - Options
 * @param {number} [options.maxCalls=5] - Maximum number of `judgeFn` calls, including failed ones.
 *   Note that with the default, at most 5 calls are made even when `n` is larger.
 * @param {number} [options.baselineScore] - When provided, `improvementMetrics` is computed against it
 * @param {number} [options.maxScale=10] - Scale maximum used to normalize the improvement when the baseline is not positive
 * @param {number} [options.minAgreement] - Accepted for compatibility; not currently used by this function
 * @returns {Promise<Object>} Aggregated result with consistency metrics:
 *   `{ score, uncertainty, confidence, consistency, method }` and, on success,
 *   `calls`, `stdDev`, `scores`, `results`, `improvementMetrics`
 */
export async function selfConsistencyCheck(judgeFn, n = 3, options = {}) {
  // Default parameters only cover `undefined`; tolerate an explicit null too.
  const opts = options || {};
  const { maxCalls = 5 } = opts;

  const hasUsableScore = (result) =>
    Boolean(result) && typeof result.score === 'number' && Number.isFinite(result.score);

  const results = [];
  let attempts = 0;

  // Make multiple calls
  while (results.length < n && attempts < maxCalls) {
    attempts++;
    try {
      const result = await judgeFn();
      // Only finite numeric scores are kept: undefined or string scores would
      // otherwise corrupt the mean (NaN or string concatenation).
      if (hasUsableScore(result)) {
        results.push(result);
      }
    } catch (error) {
      // The thrown value is not guaranteed to be an Error instance.
      const reason = error && error.message ? error.message : String(error);
      warn(`[Uncertainty] Self-consistency call ${attempts} failed: ${reason}`);
    }
  }

  if (results.length === 0) {
    return {
      score: null,
      uncertainty: 1.0,
      confidence: 0.0,
      consistency: 0.0,
      method: 'self-consistency-failed'
    };
  }

  const scores = results.map(r => r.score);

  // Mean score
  const meanScore = scores.reduce((a, b) => a + b, 0) / scores.length;

  // Standard deviation (measure of consistency)
  const variance = scores.reduce((sum, s) => sum + Math.pow(s - meanScore, 2), 0) / scores.length;
  const stdDev = Math.sqrt(variance);

  // Consistency: inverse of coefficient of variation
  // Lower stdDev relative to mean = higher consistency
  let consistency;
  if (meanScore > 0) {
    consistency = Math.max(0, Math.min(1, 1.0 - (stdDev / meanScore)));
  } else {
    // Coefficient of variation is undefined for a non-positive mean; fall back
    // to the absolute spread.
    consistency = stdDev < 1.0 ? 1.0 - stdDev : 0.0;
  }

  // Uncertainty: inverse of consistency
  const uncertainty = 1.0 - consistency;

  // Confidence: weighted by consistency and number of calls
  const confidence = consistency * Math.min(1.0, results.length / n);

  // VERIFIABLE: Calculate improvement metrics if baseline is provided
  // This allows verification of the "improves accuracy by 5-15%" claim
  let improvementMetrics = null;
  if (opts.baselineScore !== undefined && opts.baselineScore !== null) {
    const scoreImprovement = meanScore - opts.baselineScore;
    // When the baseline is 0 the percentage formula would divide by zero, so the
    // improvement is normalized against the scale maximum instead
    // (default 0-10 scale, overridable via options.maxScale).
    const maxScale = opts.maxScale || 10;
    const improvementPercent = opts.baselineScore > 0
      ? (scoreImprovement / opts.baselineScore) * 100
      : (scoreImprovement / maxScale) * 100;

    improvementMetrics = {
      baselineScore: opts.baselineScore,
      improvedScore: meanScore,
      improvement: scoreImprovement,
      improvementPercent,
      // Research claim: 5-15% improvement
      meetsResearchClaim: improvementPercent >= 5 && improvementPercent <= 15
    };

    // VERIFIABLE: Log improvement when it meets research claim threshold
    if (improvementPercent >= 5) {
      log(`[SelfConsistency] Accuracy improvement: ${improvementPercent.toFixed(1)}% (${opts.baselineScore.toFixed(1)} → ${meanScore.toFixed(1)})`);
    }
  }

  return {
    score: Math.round(meanScore * 10) / 10, // Round to 1 decimal
    uncertainty: Math.max(0, Math.min(1, uncertainty)),
    confidence: Math.max(0, Math.min(1, confidence)),
    consistency: Math.max(0, Math.min(1, consistency)),
    method: 'self-consistency',
    calls: results.length,
    stdDev,
    scores,
    results,
    // VERIFIABLE: Export improvement metrics to verify research claim
    improvementMetrics
  };
}
|
|
218
|
+
|
|
219
|
+
/**
 * Ensemble uncertainty reduction
 *
 * Combine multiple uncertainty sources into a weighted average:
 * - Logprob-based uncertainty (weight 0.4)
 * - Self-consistency uncertainty (weight 0.4)
 * - Hallucination detection (weight 0.2)
 * - Retry count (weight 0.1, only when retryCount > 1)
 *
 * Weights are renormalized over the sources that are actually present. A
 * source whose uncertainty or confidence is not a finite number is skipped so
 * it cannot turn the combined result into NaN.
 *
 * @param {Object} sources - Uncertainty sources
 * @param {any} [sources.logprobs] - Either raw logprobs (passed to `estimateUncertainty`)
 *   or an estimate already produced by `estimateUncertainty`
 *   (recognized by numeric `uncertainty`/`confidence` and a string `method`)
 * @param {Object} [sources.selfConsistency] - Object with `uncertainty`/`confidence`, falling back to `consistency`
 * @param {Object} [sources.hallucination] - Object with a numeric `confidence`
 * @param {number} [sources.retryCount=1] - Number of attempts made
 * @returns {Object} Combined uncertainty estimate: `{ uncertainty, confidence, method }`
 *   and, for method `'ensemble'`, `sources` and `breakdown`
 */
export function combineUncertaintySources(sources) {
  const {
    logprobs = null,
    selfConsistency = null,
    hallucination = null,
    retryCount = 1
  } = sources || {};

  const isFiniteNumber = (x) => typeof x === 'number' && Number.isFinite(x);

  const estimates = [];
  const addEstimate = (uncertainty, confidence, weight, source) => {
    if (isFiniteNumber(uncertainty) && isFiniteNumber(confidence)) {
      estimates.push({ uncertainty, confidence, weight, source });
    }
  };

  // 1. Logprob-based uncertainty
  if (logprobs) {
    // Callers may hand over an estimate that was already computed. Feeding that
    // back into estimateUncertainty would average its fields (tokenCount,
    // variance, ...) as if they were logprobs, so use it as-is.
    const alreadyEstimated =
      isFiniteNumber(logprobs.uncertainty) &&
      isFiniteNumber(logprobs.confidence) &&
      typeof logprobs.method === 'string';
    const logprobEst = alreadyEstimated ? logprobs : estimateUncertainty(logprobs);
    addEstimate(logprobEst.uncertainty, logprobEst.confidence, 0.4, 'logprobs');
  }

  // 2. Self-consistency uncertainty
  if (selfConsistency) {
    // Explicit numeric checks instead of `||`: 0 is a valid uncertainty/confidence.
    const scUncertainty = isFiniteNumber(selfConsistency.uncertainty)
      ? selfConsistency.uncertainty
      : 1.0 - selfConsistency.consistency;
    const scConfidence = isFiniteNumber(selfConsistency.confidence)
      ? selfConsistency.confidence
      : selfConsistency.consistency;
    addEstimate(scUncertainty, scConfidence, 0.4, 'self-consistency');
  }

  // 3. Hallucination detection
  // NOTE(review): this treats hallucination.confidence as confidence in the
  // judgment (higher = less uncertain) — confirm against detectHallucination.
  if (hallucination) {
    addEstimate(1.0 - hallucination.confidence, hallucination.confidence, 0.2, 'hallucination');
  }

  // 4. Retry count (more retries = higher uncertainty, capped at 0.3)
  if (retryCount > 1) {
    const retryPenalty = (retryCount - 1) * 0.1;
    addEstimate(Math.min(0.3, retryPenalty), Math.max(0.7, 1.0 - retryPenalty), 0.1, 'retries');
  }

  // Weighted average
  if (estimates.length === 0) {
    return { uncertainty: 0.5, confidence: 0.5, method: 'default' };
  }

  const totalWeight = estimates.reduce((sum, e) => sum + e.weight, 0);
  const weightedUncertainty = estimates.reduce((sum, e) => sum + (e.uncertainty * e.weight), 0) / totalWeight;
  const weightedConfidence = estimates.reduce((sum, e) => sum + (e.confidence * e.weight), 0) / totalWeight;

  return {
    uncertainty: Math.max(0, Math.min(1, weightedUncertainty)),
    confidence: Math.max(0, Math.min(1, weightedConfidence)),
    method: 'ensemble',
    sources: estimates.map(e => e.source),
    breakdown: estimates
  };
}
|
|
298
|
+
|
|
299
|
+
/**
 * Determine if self-consistency should be used based on context (uncertainty × payout analysis)
 *
 * Rules are evaluated in order and the first match wins:
 * - Tier 1 (n = 5): testType 'expert-evaluation' or 'medical';
 *   importance 'critical' with impact 'blocks-use'
 * - Tier 2 (n = 3): extreme score (<= 3 or >= 9); uncertainty > 0.3;
 *   5 or more issues; importance 'high' with impact 'degrades-experience'
 * - Tier 3: everything else (self-consistency not recommended, n = 0)
 *
 * @param {Object} [context={}] - Validation context (`testType`, `importance`, `impact`)
 * @param {Object} [partialResult={}] - Partial validation result (score, issues, uncertainty)
 * @returns {Object} { shouldUse: boolean, n: number, reason: string }
 */
export function shouldUseSelfConsistency(context = {}, partialResult = {}) {
  // Default parameters only cover `undefined`; tolerate explicit nulls as well.
  const { testType, importance, impact } = context || {};
  const { score, uncertainty, issues } = partialResult || {};

  // Thresholds are declared locally so this function stays synchronous and self-contained.
  // NOTE(review): presumably these mirror UNCERTAINTY_CONSTANTS in src/constants.mjs — confirm they stay in sync.
  const LOW_SCORE_THRESHOLD = 3; // Bottom 30% of 0-10 scale
  const HIGH_SCORE_THRESHOLD = 9; // Top 10% of 0-10 scale
  const HIGH_UNCERTAINTY_THRESHOLD = 0.3; // 30% uncertainty
  const OVER_DETECTION_ISSUE_COUNT = 5; // 5+ issues might indicate hallucination
  const TIER1_N = 5; // Tier 1: Critical scenarios (expert, medical, blocking issues)
  const EDGE_CASE_N = 3; // Tier 2: Edge cases

  const recommend = (n, reason) => ({ shouldUse: true, n, reason });

  // Tier 1: Critical scenarios (always use, N=5)
  if (testType === 'expert-evaluation' || testType === 'medical') {
    return recommend(TIER1_N, `Critical test type: ${testType}`);
  }

  // Tier 1: Critical issues (blocks-use with critical importance)
  if (importance === 'critical' && impact === 'blocks-use') {
    return recommend(TIER1_N, 'Critical issue that blocks use');
  }

  // Tier 2: Edge cases (extreme scores)
  const hasScore = score !== null && score !== undefined;
  if (hasScore && (score <= LOW_SCORE_THRESHOLD || score >= HIGH_SCORE_THRESHOLD)) {
    return recommend(EDGE_CASE_N, `Edge case score: ${score}`);
  }

  // Tier 2: High uncertainty
  // The typeof guard keeps `.toFixed` from throwing on non-numeric input.
  if (typeof uncertainty === 'number' && uncertainty > HIGH_UNCERTAINTY_THRESHOLD) {
    return recommend(EDGE_CASE_N, `High uncertainty: ${uncertainty.toFixed(2)}`);
  }

  // Tier 2: Many issues (over-detection risk)
  if (Array.isArray(issues) && issues.length >= OVER_DETECTION_ISSUE_COUNT) {
    return recommend(EDGE_CASE_N, `Many issues detected: ${issues.length} (over-detection risk)`);
  }

  // Tier 2: High-impact degradation
  if (importance === 'high' && impact === 'degrades-experience') {
    return recommend(EDGE_CASE_N, 'High-impact issue that degrades experience');
  }

  // Tier 3: Standard scenarios (no self-consistency)
  return {
    shouldUse: false,
    n: 0,
    reason: 'Standard validation (logprobs + hallucination sufficient)'
  };
}
|
|
390
|
+
|
|
391
|
+
/**
 * Enhance validation result with uncertainty reduction
 *
 * Combines the available uncertainty signals (logprobs, hallucination check,
 * retry count) via `combineUncertaintySources` and reports whether a
 * self-consistency pass is recommended. This function does not run
 * self-consistency itself; the caller decides based on the returned
 * `selfConsistencyRecommended` / `selfConsistencyN` fields.
 *
 * @param {Object} partialResult - Partial validation result (judgment, logprobs, attempts, screenshotPath,
 *   and optionally score, issues, uncertainty for the adaptive decision)
 * @param {Object} [options={}] - Options
 * @param {boolean} [options.enableSelfConsistency=false] - Always recommend self-consistency (n = 3)
 * @param {boolean} [options.enableHallucinationCheck=true] - Run `detectHallucination` when a judgment is present
 * @param {boolean} [options.adaptiveSelfConsistency=true] - Let `shouldUseSelfConsistency` decide
 *   when self-consistency is not explicitly enabled
 * @param {Object} [context={}] - Validation context (for adaptive self-consistency)
 * @returns {Object} Uncertainty and confidence estimates: `{ uncertainty, confidence,
 *   uncertaintyMethod, uncertaintyBreakdown, hallucination, selfConsistencyRecommended,
 *   selfConsistencyN, selfConsistencyReason }`
 */
export function enhanceWithUncertainty(partialResult, options = {}, context = {}) {
  // Default parameters only cover `undefined`; tolerate explicit nulls as well.
  const {
    enableSelfConsistency = false,
    enableHallucinationCheck = true,
    adaptiveSelfConsistency = true // adaptive strategy based on context
  } = options || {};
  const source = partialResult || {};

  // Extract uncertainty sources
  const logprobs = source.logprobs || null;
  const attempts = source.attempts || 1;
  const judgment = source.judgment || null;
  // A score of 0 is the most extreme low score and must survive the
  // null-normalization; a plain `|| null` would discard it.
  const score = source.score === 0 ? 0 : (source.score || null);
  const issues = source.issues || [];
  const uncertainty = source.uncertainty || null;

  // Determine if self-consistency should be used (adaptive strategy)
  let shouldUseSelfConsistencyValue = enableSelfConsistency;
  let selfConsistencyN = 3;
  let selfConsistencyReason = '';

  if (enableSelfConsistency) {
    selfConsistencyReason = 'Explicitly enabled';
  } else if (adaptiveSelfConsistency) {
    // Check if context suggests self-consistency is warranted
    const selfConsistencyDecision = shouldUseSelfConsistency(context || {}, {
      score,
      uncertainty,
      issues
    });
    shouldUseSelfConsistencyValue = selfConsistencyDecision.shouldUse;
    selfConsistencyN = selfConsistencyDecision.n;
    selfConsistencyReason = selfConsistencyDecision.reason;
  }

  // Check for hallucination (best effort: a failure here must not fail validation)
  let hallucinationResult = null;
  if (enableHallucinationCheck && judgment) {
    try {
      hallucinationResult = detectHallucination(
        judgment,
        source.screenshotPath || null,
        { logprobs }
      );
    } catch (error) {
      // Keep going without the hallucination signal, but leave a trace.
      const reason = error && error.message ? error.message : String(error);
      warn(`[Uncertainty] Hallucination check failed: ${reason}`);
    }
  }

  // Combine uncertainty sources.
  // The raw logprobs are passed through: combineUncertaintySources runs
  // estimateUncertainty itself, so handing it a precomputed estimate would
  // estimate twice and treat the estimate's fields as logprob values.
  const combined = combineUncertaintySources({
    logprobs,
    hallucination: hallucinationResult,
    retryCount: attempts
  });

  // Return uncertainty metrics with self-consistency recommendation
  return {
    uncertainty: combined.uncertainty,
    confidence: combined.confidence,
    uncertaintyMethod: combined.method,
    uncertaintyBreakdown: combined.breakdown || null,
    hallucination: hallucinationResult,
    // Self-consistency recommendation (caller should use this if needed)
    selfConsistencyRecommended: shouldUseSelfConsistencyValue,
    selfConsistencyN,
    selfConsistencyReason
  };
}
|
|
470
|
+
|
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Utils Sub-Module
|
|
3
|
+
*
|
|
4
|
+
* Utility functions, helpers, and infrastructure.
|
|
5
|
+
*
|
|
6
|
+
* Import from 'ai-visual-test/utils'
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
// Cache
|
|
10
|
+
export {
|
|
11
|
+
getCached,
|
|
12
|
+
setCached,
|
|
13
|
+
clearCache,
|
|
14
|
+
getCacheStats,
|
|
15
|
+
initCache,
|
|
16
|
+
generateCacheKey
|
|
17
|
+
} from '../cache.mjs';
|
|
18
|
+
|
|
19
|
+
// Config
|
|
20
|
+
export {
|
|
21
|
+
createConfig,
|
|
22
|
+
getProvider,
|
|
23
|
+
getConfig,
|
|
24
|
+
setConfig
|
|
25
|
+
} from '../config.mjs';
|
|
26
|
+
|
|
27
|
+
// Environment
|
|
28
|
+
export { loadEnv } from '../load-env.mjs';
|
|
29
|
+
|
|
30
|
+
// Logger
|
|
31
|
+
export { enableDebug, disableDebug, isDebugEnabled, warn, log, error } from '../logger.mjs';
|
|
32
|
+
|
|
33
|
+
// Errors
|
|
34
|
+
export {
|
|
35
|
+
AIBrowserTestError,
|
|
36
|
+
ValidationError,
|
|
37
|
+
CacheError,
|
|
38
|
+
ConfigError,
|
|
39
|
+
ProviderError,
|
|
40
|
+
TimeoutError,
|
|
41
|
+
FileError,
|
|
42
|
+
StateMismatchError,
|
|
43
|
+
isAIBrowserTestError,
|
|
44
|
+
isErrorType
|
|
45
|
+
} from '../errors.mjs';
|
|
46
|
+
|
|
47
|
+
// Retry
|
|
48
|
+
export {
|
|
49
|
+
retryWithBackoff,
|
|
50
|
+
isRetryableError,
|
|
51
|
+
calculateBackoff,
|
|
52
|
+
enhanceErrorMessage
|
|
53
|
+
} from '../retry.mjs';
|
|
54
|
+
|
|
55
|
+
// Cost tracking
|
|
56
|
+
export {
|
|
57
|
+
CostTracker,
|
|
58
|
+
getCostTracker,
|
|
59
|
+
recordCost,
|
|
60
|
+
getCostStats
|
|
61
|
+
} from '../cost-tracker.mjs';
|
|
62
|
+
|
|
63
|
+
// Score tracking
|
|
64
|
+
export { ScoreTracker } from '../score-tracker.mjs';
|
|
65
|
+
|
|
66
|
+
// Batch optimization
|
|
67
|
+
export { BatchOptimizer } from '../batch-optimizer.mjs';
|
|
68
|
+
export { LatencyAwareBatchOptimizer } from '../latency-aware-batch-optimizer.mjs';
|
|
69
|
+
|
|
70
|
+
// Data extraction
|
|
71
|
+
export { extractStructuredData } from '../data-extractor.mjs';
|
|
72
|
+
|
|
73
|
+
// Feedback aggregation
|
|
74
|
+
export { aggregateFeedback, generateRecommendations } from '../feedback-aggregator.mjs';
|
|
75
|
+
|
|
76
|
+
// Context compression
|
|
77
|
+
export { compressContext, compressStateHistory } from '../context-compressor.mjs';
|
|
78
|
+
|
|
79
|
+
// Metrics
|
|
80
|
+
export {
|
|
81
|
+
spearmanCorrelation,
|
|
82
|
+
pearsonCorrelation,
|
|
83
|
+
calculateRankAgreement
|
|
84
|
+
} from '../metrics.mjs';
|
|
85
|
+
|
|
86
|
+
// Type guards
|
|
87
|
+
export {
|
|
88
|
+
isObject,
|
|
89
|
+
isString,
|
|
90
|
+
isNumber,
|
|
91
|
+
isArray,
|
|
92
|
+
isFunction,
|
|
93
|
+
isPromise,
|
|
94
|
+
isValidationResult,
|
|
95
|
+
isValidationContext,
|
|
96
|
+
isPersona,
|
|
97
|
+
isTemporalNote,
|
|
98
|
+
assertObject,
|
|
99
|
+
assertString,
|
|
100
|
+
assertNonEmptyString,
|
|
101
|
+
assertNumber,
|
|
102
|
+
assertArray,
|
|
103
|
+
assertFunction,
|
|
104
|
+
pick,
|
|
105
|
+
getProperty
|
|
106
|
+
} from '../type-guards.mjs';
|
|
107
|
+
|
|
108
|
+
// Constants
|
|
109
|
+
export {
|
|
110
|
+
CACHE_CONSTANTS,
|
|
111
|
+
TEMPORAL_CONSTANTS,
|
|
112
|
+
API_CONSTANTS,
|
|
113
|
+
UNCERTAINTY_CONSTANTS,
|
|
114
|
+
BATCH_OPTIMIZER_CONSTANTS
|
|
115
|
+
} from '../constants.mjs';
|
|
116
|
+
|
|
117
|
+
// Validation result normalization
|
|
118
|
+
export { normalizeValidationResult } from '../validation-result-normalizer.mjs';
|
|
119
|
+
|
|
120
|
+
// Error handlers
|
|
121
|
+
export { initErrorHandlers } from '../error-handler.mjs';
|
|
122
|
+
|
|
123
|
+
// Uncertainty reduction
|
|
124
|
+
export {
|
|
125
|
+
estimateUncertainty,
|
|
126
|
+
selfConsistencyCheck,
|
|
127
|
+
combineUncertaintySources,
|
|
128
|
+
enhanceWithUncertainty,
|
|
129
|
+
shouldUseSelfConsistency
|
|
130
|
+
} from '../uncertainty-reducer.mjs';
|
|
131
|
+
|
|
132
|
+
// Dynamic few-shot
|
|
133
|
+
export {
|
|
134
|
+
selectFewShotExamples,
|
|
135
|
+
formatFewShotExamples
|
|
136
|
+
} from '../dynamic-few-shot.mjs';
|
|
137
|
+
|
|
138
|
+
// Dynamic prompts
|
|
139
|
+
export {
|
|
140
|
+
generateDynamicPrompt,
|
|
141
|
+
generatePromptVariations,
|
|
142
|
+
generateInteractionPrompt,
|
|
143
|
+
generateGameplayPrompt
|
|
144
|
+
} from '../dynamic-prompts.mjs';
|
|
145
|
+
|
|
146
|
+
// Rubrics
|
|
147
|
+
export {
|
|
148
|
+
DEFAULT_RUBRIC,
|
|
149
|
+
buildRubricPrompt,
|
|
150
|
+
getRubricForTestType
|
|
151
|
+
} from '../rubrics.mjs';
|
|
152
|
+
|
|
153
|
+
// Model tier selection
|
|
154
|
+
export {
|
|
155
|
+
selectModelTier,
|
|
156
|
+
selectProvider,
|
|
157
|
+
selectModelTierAndProvider
|
|
158
|
+
} from '../model-tier-selector.mjs';
|
|
159
|
+
|
|
160
|
+
// Smart validator
|
|
161
|
+
export {
|
|
162
|
+
validateSmart,
|
|
163
|
+
validateAccessibilitySmart,
|
|
164
|
+
validateStateSmart,
|
|
165
|
+
validateElementSmart,
|
|
166
|
+
detectValidationMethod
|
|
167
|
+
} from '../smart-validator.mjs';
|
|
168
|
+
|
|
169
|
+
// Human validation
|
|
170
|
+
export {
|
|
171
|
+
HumanValidationManager,
|
|
172
|
+
getHumanValidationManager,
|
|
173
|
+
initHumanValidation
|
|
174
|
+
} from '../human-validation-manager.mjs';
|
|
175
|
+
|