@arclabs561/ai-visual-test 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.secretsignore.example +20 -0
- package/CHANGELOG.md +360 -0
- package/CONTRIBUTING.md +63 -0
- package/DEPLOYMENT.md +80 -0
- package/LICENSE +22 -0
- package/README.md +142 -0
- package/SECURITY.md +108 -0
- package/api/health.js +34 -0
- package/api/validate.js +252 -0
- package/index.d.ts +1221 -0
- package/package.json +112 -0
- package/public/index.html +149 -0
- package/src/batch-optimizer.mjs +451 -0
- package/src/bias-detector.mjs +370 -0
- package/src/bias-mitigation.mjs +233 -0
- package/src/cache.mjs +433 -0
- package/src/config.mjs +268 -0
- package/src/constants.mjs +80 -0
- package/src/context-compressor.mjs +350 -0
- package/src/convenience.mjs +617 -0
- package/src/cost-tracker.mjs +257 -0
- package/src/cross-modal-consistency.mjs +170 -0
- package/src/data-extractor.mjs +232 -0
- package/src/dynamic-few-shot.mjs +140 -0
- package/src/dynamic-prompts.mjs +361 -0
- package/src/ensemble/index.mjs +53 -0
- package/src/ensemble-judge.mjs +366 -0
- package/src/error-handler.mjs +67 -0
- package/src/errors.mjs +167 -0
- package/src/experience-propagation.mjs +128 -0
- package/src/experience-tracer.mjs +487 -0
- package/src/explanation-manager.mjs +299 -0
- package/src/feedback-aggregator.mjs +248 -0
- package/src/game-goal-prompts.mjs +478 -0
- package/src/game-player.mjs +548 -0
- package/src/hallucination-detector.mjs +155 -0
- package/src/helpers/playwright.mjs +80 -0
- package/src/human-validation-manager.mjs +516 -0
- package/src/index.mjs +364 -0
- package/src/judge.mjs +929 -0
- package/src/latency-aware-batch-optimizer.mjs +192 -0
- package/src/load-env.mjs +159 -0
- package/src/logger.mjs +55 -0
- package/src/metrics.mjs +187 -0
- package/src/model-tier-selector.mjs +221 -0
- package/src/multi-modal/index.mjs +36 -0
- package/src/multi-modal-fusion.mjs +190 -0
- package/src/multi-modal.mjs +524 -0
- package/src/natural-language-specs.mjs +1071 -0
- package/src/pair-comparison.mjs +277 -0
- package/src/persona/index.mjs +42 -0
- package/src/persona-enhanced.mjs +200 -0
- package/src/persona-experience.mjs +572 -0
- package/src/position-counterbalance.mjs +140 -0
- package/src/prompt-composer.mjs +375 -0
- package/src/render-change-detector.mjs +583 -0
- package/src/research-enhanced-validation.mjs +436 -0
- package/src/retry.mjs +152 -0
- package/src/rubrics.mjs +231 -0
- package/src/score-tracker.mjs +277 -0
- package/src/smart-validator.mjs +447 -0
- package/src/spec-config.mjs +106 -0
- package/src/spec-templates.mjs +347 -0
- package/src/specs/index.mjs +38 -0
- package/src/temporal/index.mjs +102 -0
- package/src/temporal-adaptive.mjs +163 -0
- package/src/temporal-batch-optimizer.mjs +222 -0
- package/src/temporal-constants.mjs +69 -0
- package/src/temporal-context.mjs +49 -0
- package/src/temporal-decision-manager.mjs +271 -0
- package/src/temporal-decision.mjs +669 -0
- package/src/temporal-errors.mjs +58 -0
- package/src/temporal-note-pruner.mjs +173 -0
- package/src/temporal-preprocessor.mjs +543 -0
- package/src/temporal-prompt-formatter.mjs +219 -0
- package/src/temporal-validation.mjs +159 -0
- package/src/temporal.mjs +415 -0
- package/src/type-guards.mjs +311 -0
- package/src/uncertainty-reducer.mjs +470 -0
- package/src/utils/index.mjs +175 -0
- package/src/validation-framework.mjs +321 -0
- package/src/validation-result-normalizer.mjs +64 -0
- package/src/validation.mjs +243 -0
- package/src/validators/accessibility-programmatic.mjs +345 -0
- package/src/validators/accessibility-validator.mjs +223 -0
- package/src/validators/batch-validator.mjs +143 -0
- package/src/validators/hybrid-validator.mjs +268 -0
- package/src/validators/index.mjs +34 -0
- package/src/validators/prompt-builder.mjs +218 -0
- package/src/validators/rubric.mjs +85 -0
- package/src/validators/state-programmatic.mjs +260 -0
- package/src/validators/state-validator.mjs +291 -0
- package/vercel.json +27 -0
|
@@ -0,0 +1,436 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Research-Enhanced Validation
|
|
3
|
+
*
|
|
4
|
+
* Enhanced validation functions that incorporate research findings from:
|
|
5
|
+
* - arXiv:2406.07791 (position bias, quality gaps)
|
|
6
|
+
* - arXiv:2407.01085 (length bias, AdapAlpaca)
|
|
7
|
+
* - arXiv:2412.05579 (LLM-as-judge best practices)
|
|
8
|
+
*
|
|
9
|
+
* These functions expose research-based features through a clean API.
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
import { validateScreenshot } from './judge.mjs';
|
|
13
|
+
import { detectBias, detectPositionBias } from './bias-detector.mjs';
|
|
14
|
+
import { mitigateBias, mitigatePositionBias, applyBiasMitigation } from './bias-mitigation.mjs';
|
|
15
|
+
import { evaluateWithCounterBalance } from './position-counterbalance.mjs';
|
|
16
|
+
import { normalizeValidationResult } from './validation-result-normalizer.mjs';
|
|
17
|
+
import { log, warn } from './logger.mjs';
|
|
18
|
+
|
|
19
|
+
/**
 * Validate a screenshot and annotate the result with research-based bias analysis.
 *
 * Incorporates findings from arXiv:2406.07791, 2407.01085, 2412.05579:
 * - Quality gap analysis (equivocal case detection)
 * - Judge-level, candidate-level, task-level factor tracking
 * - Comprehensive bias detection and mitigation
 * - Position bias metrics (PC, PF)
 *
 * Steps performed:
 * 1. Run `validateScreenshot` (through `evaluateWithCounterBalance` when
 *    `useCounterBalance` is set).
 * 2. When detection is enabled, run `detectBias` on the result's reasoning text
 *    and, if the result carries a `judgments` array, `detectPositionBias` on it.
 * 3. When mitigation is enabled and bias was detected, replace the result with
 *    the output of `mitigateBias`.
 * 4. Attach a `researchEnhancements` metadata object and normalize the result.
 *
 * Mitigation consumes the detection result, so it only runs when
 * `enableBiasDetection` is also true.
 *
 * @param {string} imagePath - Path to screenshot
 * @param {string} prompt - Evaluation prompt
 * @param {{
 *   enableBiasDetection?: boolean;
 *   enableBiasMitigation?: boolean;
 *   qualityGap?: number; // Quality gap (δ_q) between candidates (0-1, where 0.5 = tie)
 *   judgeModel?: string; // Judge model identifier
 *   taskMetadata?: { inputLength?: number; outputLength?: number; promptLength?: number };
 *   useCounterBalance?: boolean; // Use position counter-balancing
 *   [key: string]: any;
 * }} [options={}] - Enhanced validation options; unrecognized keys are forwarded to the validator
 * @returns {Promise<import('./index.mjs').ValidationResult>} Enhanced validation result
 */
export async function validateWithResearchEnhancements(imagePath, prompt, options = {}) {
  const {
    enableBiasDetection = true,
    enableBiasMitigation = true,
    qualityGap = null,
    judgeModel = null,
    taskMetadata = {},
    useCounterBalance = false,
    ...validationOptions
  } = options;

  // Perform validation, optionally wrapped in position counter-balancing
  let result;
  if (useCounterBalance) {
    result = await evaluateWithCounterBalance(
      validateScreenshot,
      imagePath,
      prompt,
      validationOptions,
      { enabled: true }
    );
  } else {
    result = await validateScreenshot(imagePath, prompt, validationOptions);
  }

  // Add research-based enhancements
  if (enableBiasDetection || enableBiasMitigation) {
    const reasoning = result.reasoning || result.assessment || '';
    const resolvedJudgeModel = judgeModel || validationOptions.provider || 'unknown';

    // Detect biases in the judge's reasoning text
    const biasDetection = enableBiasDetection ? detectBias(reasoning, {
      checkVerbosity: true,
      checkLength: true,
      checkFormatting: true,
      checkAuthority: true
    }) : null;

    // Position bias can only be analysed when the result carries multiple judgments
    let positionBias = null;
    if (enableBiasDetection && Array.isArray(result.judgments)) {
      positionBias = detectPositionBias(result.judgments, {
        qualityGap: qualityGap,
        judgeModel: resolvedJudgeModel,
        taskMetadata: taskMetadata
      });
    }

    // Apply mitigation if enabled (biasDetection is null when detection is off)
    if (enableBiasMitigation && biasDetection?.hasBias) {
      result = mitigateBias(result, biasDetection, {
        adjustScores: true,
        adjustIssues: false
      });
    }

    // A gap of exactly 0 is a valid value in the documented 0-1 range, so it must
    // not be discarded by a plain truthiness test. Every other falsy input
    // (null, NaN, '', false) still yields no quality-gap entry.
    const hasQualityGap = qualityGap === 0 || Boolean(qualityGap);
    const isEquivocal = hasQualityGap && Math.abs(qualityGap - 0.5) < 0.1;

    // Add research metadata
    result.researchEnhancements = {
      biasDetection: biasDetection,
      positionBias: positionBias,
      qualityGap: hasQualityGap ? {
        value: qualityGap,
        isEquivocal: isEquivocal,
        note: isEquivocal
          ? 'Equivocal case (δ_q ≈ 0.5) - maximum position bias risk per arXiv:2406.07791'
          : 'Quality gap analysis per research findings'
      } : null,
      factors: {
        judgeModel: resolvedJudgeModel,
        taskMetadata: taskMetadata
      },
      researchPapers: [
        'arXiv:2406.07791 - Position bias, quality gaps',
        'arXiv:2407.01085 - Length bias, AdapAlpaca',
        'arXiv:2412.05579 - LLM-as-judge best practices'
      ]
    };
  }

  // Normalize result structure before returning (ensures consistent structure)
  return normalizeValidationResult(result, 'validateWithResearchEnhancements');
}
|
|
123
|
+
|
|
124
|
+
/**
 * Validate multiple screenshots with position bias analysis
 *
 * Based on arXiv:2406.07791 findings on position bias:
 * - Calculates Position Consistency (PC) and Preference Fairness (PF) metrics
 * - Detects equivocal cases (quality gap ≈ 0.5)
 * - Tracks judge-level and task-level factors
 *
 * Every path is validated concurrently with `validateScreenshot`; the scores are
 * then handed to `detectPositionBias`. When `enableMitigation` is set and bias
 * was detected, the returned judgments are the output of `mitigatePositionBias`.
 * If no `qualityGap` is supplied, one is derived from the spread of the numeric
 * scores (assuming a 0-10 scale).
 *
 * @param {string[]} imagePaths - Array of screenshot paths
 * @param {string} prompt - Evaluation prompt
 * @param {{
 *   calculateMetrics?: boolean; // Calculate PC/PF metrics
 *   qualityGap?: number;
 *   judgeModel?: string;
 *   taskMetadata?: { inputLength?: number; outputLength?: number; promptLength?: number };
 *   enableMitigation?: boolean;
 *   [key: string]: any;
 * }} [options={}] - Validation options; unrecognized keys are forwarded to the validator
 * @returns {Promise<{
 *   judgments: import('./index.mjs').ValidationResult[];
 *   positionBias: import('./index.mjs').PositionBiasResult;
 *   qualityGap: { value: number; isEquivocal: boolean; note: string } | null;
 *   metrics?: { positionConsistency: number; preferenceFairness: object };
 * }>} Multi-judgment result with position bias analysis
 */
export async function validateMultipleWithPositionAnalysis(imagePaths, prompt, options = {}) {
  const {
    calculateMetrics = true,
    qualityGap = null,
    judgeModel = null,
    taskMetadata = {},
    enableMitigation = false,
    ...validationOptions
  } = options;

  // Validate all screenshots concurrently
  const judgments = await Promise.all(
    imagePaths.map(path => validateScreenshot(path, prompt, validationOptions))
  );

  // Extract scores for position bias detection
  const judgmentScores = judgments.map(j => ({ score: j.score }));

  // Detect position bias with research metrics
  const positionBias = detectPositionBias(judgmentScores, {
    calculateMetrics: calculateMetrics,
    qualityGap: qualityGap,
    judgeModel: judgeModel || validationOptions.provider || 'unknown',
    taskMetadata: taskMetadata
  });

  // Apply mitigation if enabled
  let mitigatedJudgments = judgments;
  if (enableMitigation && positionBias.detected) {
    mitigatedJudgments = mitigatePositionBias(judgments, {
      enabled: true
    });
  }

  // Calculate quality gap if not provided
  let calculatedQualityGap = qualityGap;
  if (calculatedQualityGap === null && judgments.length >= 2) {
    // Keep only real numbers: a missing (undefined) or NaN score would otherwise
    // turn Math.max/Math.min, and therefore the derived gap, into NaN.
    const scores = judgments
      .map(j => j.score)
      .filter(s => typeof s === 'number' && Number.isFinite(s));
    if (scores.length >= 2) {
      const scoreRange = Math.max(...scores) - Math.min(...scores);
      const maxPossibleRange = 10; // Assuming 0-10 scale
      // NOTE(review): this yields 0 for identical scores and for a full-range
      // spread, peaking at 0.5 for a half-range spread. The option docs elsewhere
      // describe 0.5 as a tie - confirm this mapping is the intended one.
      calculatedQualityGap = 0.5 - Math.abs((scoreRange / maxPossibleRange) - 0.5);
    }
  }

  const hasQualityGap = calculatedQualityGap !== null;
  const isEquivocal = hasQualityGap && Math.abs(calculatedQualityGap - 0.5) < 0.1;

  return {
    judgments: mitigatedJudgments,
    positionBias: positionBias,
    qualityGap: hasQualityGap ? {
      value: calculatedQualityGap,
      isEquivocal: isEquivocal,
      note: isEquivocal
        ? 'Equivocal case (δ_q ≈ 0.5) - maximum position bias risk per arXiv:2406.07791'
        : 'Quality gap analysis per research findings'
    } : null,
    metrics: positionBias.metrics || undefined,
    researchMetadata: {
      papers: ['arXiv:2406.07791'],
      findings: [
        'Position bias varies by judge and task',
        'Quality gap strongly affects bias (parabolic relationship)',
        'Equivocal cases (δ_q ≈ 0.5) cause maximum confusion'
      ]
    }
  };
}
|
|
215
|
+
|
|
216
|
+
/**
 * Validate with length alignment (AdapAlpaca-inspired)
 *
 * Based on arXiv:2407.01085 (AdapAlpaca):
 * - Decomposes preference into desirability (length-independent) and information mass (length-dependent)
 * - Aligns response lengths under equivalent intervals for fair comparison
 * - Reduces length bias in evaluations
 *
 * Note: This is a simplified implementation. Full AdapAlpaca would require
 * length bucketing and alignment before comparison. Here the screenshot is
 * validated once; if normalization is enabled, the result has reasoning text and
 * `detectBias` reports bias, the result is passed through `mitigateBias` and
 * tagged with a `lengthAlignment` metadata object. Otherwise the plain
 * validation result is returned. Either way the result is normalized first.
 *
 * @param {string} imagePath - Path to screenshot
 * @param {string} prompt - Evaluation prompt
 * @param {{
 *   referenceLength?: number; // Reference response length for alignment
 *   lengthInterval?: number; // Length interval for bucketing
 *   enableLengthNormalization?: boolean;
 *   [key: string]: any;
 * }} [options={}] - Length alignment options; unrecognized keys are forwarded to the validator
 * @returns {Promise<import('./index.mjs').ValidationResult>} Validation result with length alignment
 */
export async function validateWithLengthAlignment(imagePath, prompt, options = {}) {
  const {
    referenceLength = null,
    lengthInterval = 50, // Characters
    enableLengthNormalization = true,
    ...validationOptions
  } = options;

  const baseline = await validateScreenshot(imagePath, prompt, validationOptions);

  // Nothing to align: normalization disabled or the judge produced no reasoning text
  if (!enableLengthNormalization || !baseline.reasoning) {
    return normalizeValidationResult(baseline, 'validateWithLengthAlignment');
  }

  // Capture the length before any detection/mitigation step touches the result
  const originalLength = baseline.reasoning.length;

  // Only the verbosity and length checks are relevant here
  const lengthBias = detectBias(baseline.reasoning, {
    checkVerbosity: true,
    checkLength: true
  });

  if (!lengthBias.hasBias) {
    return normalizeValidationResult(baseline, 'validateWithLengthAlignment');
  }

  // Simplified AdapAlpaca approach: adjust the score, then record what was done
  const adjusted = mitigateBias(baseline, lengthBias, {
    adjustScores: true
  });

  adjusted.lengthAlignment = {
    originalLength: originalLength,
    referenceLength: referenceLength,
    lengthInterval: lengthInterval,
    normalized: true,
    note: 'AdapAlpaca-inspired length normalization (arXiv:2407.01085). Full implementation would align lengths under equivalent intervals before comparison.',
    researchPaper: 'arXiv:2407.01085 - Explaining Length Bias in LLM-Based Preference Evaluations'
  };

  return normalizeValidationResult(adjusted, 'validateWithLengthAlignment');
}
|
|
282
|
+
|
|
283
|
+
/**
 * Validate with explicit rubrics (research-backed)
 *
 * Based on arXiv:2412.05579 findings:
 * - Explicit rubrics improve reliability by 10-20%
 * - Reduce bias from superficial features
 * - Provide structured evaluation criteria
 *
 * A caller-supplied `rubric` takes precedence; otherwise the default rubric is
 * used when `useDefaultRubric` is true. When neither applies, the prompt is sent
 * unchanged and the attached `rubricEnhancement` metadata reports
 * `used: false, type: 'none'` instead of claiming a rubric was applied.
 *
 * @param {string} imagePath - Path to screenshot
 * @param {string} prompt - Evaluation prompt
 * @param {{
 *   rubric?: string | object; // Explicit rubric to use
 *   useDefaultRubric?: boolean; // Use default research-backed rubric
 *   [key: string]: any;
 * }} [options={}] - Rubric options; unrecognized keys are forwarded to the validator
 * @returns {Promise<import('./index.mjs').ValidationResult>} Validation result with explicit rubric
 */
export async function validateWithExplicitRubric(imagePath, prompt, options = {}) {
  const {
    rubric = null,
    useDefaultRubric = true,
    ...validationOptions
  } = options;

  // Decide up front which rubric (if any) applies so the prompt and the
  // reported metadata cannot disagree.
  let rubricType = 'none';
  if (rubric) {
    rubricType = 'custom';
  } else if (useDefaultRubric) {
    rubricType = 'default';
  }

  // Build prompt with explicit rubric; the rubric module is only loaded when needed
  let enhancedPrompt = prompt;
  if (rubricType !== 'none') {
    const { buildRubricPrompt, DEFAULT_RUBRIC } = await import('./rubrics.mjs');
    enhancedPrompt = buildRubricPrompt(prompt, rubricType === 'custom' ? rubric : DEFAULT_RUBRIC);
  }

  // Perform validation
  const result = await validateScreenshot(imagePath, enhancedPrompt, validationOptions);

  // Add rubric metadata
  result.rubricEnhancement = {
    used: rubricType !== 'none',
    type: rubricType,
    researchPaper: 'arXiv:2412.05579 - LLMs-as-Judges Survey',
    finding: 'Explicit rubrics improve reliability by 10-20% and reduce bias from superficial features'
  };

  // Normalize result structure before returning
  return normalizeValidationResult(result, 'validateWithExplicitRubric');
}
|
|
332
|
+
|
|
333
|
+
/**
 * Comprehensive research-enhanced validation
 *
 * Combines all research enhancements:
 * - Explicit rubrics (arXiv:2412.05579)
 * - Bias detection and mitigation (arXiv:2406.07791, 2407.01085)
 * - Quality gap analysis (arXiv:2406.07791)
 * - Length alignment (arXiv:2407.01085)
 * - Position bias metrics (arXiv:2406.07791)
 *
 * Steps performed:
 * 1. Wrap the prompt with the default rubric (when `enableRubrics`).
 * 2. Validate, via `validateWithLengthAlignment` when `enableLengthAlignment`
 *    is set, otherwise via `validateScreenshot` directly.
 * 3. Run `detectBias` / `mitigateBias` on the result and attach a
 *    `comprehensiveResearchEnhancements` metadata object.
 *
 * Mitigation consumes the detection result, so it only runs when
 * `enableBiasDetection` is also true.
 *
 * @param {string} imagePath - Path to screenshot
 * @param {string} prompt - Evaluation prompt
 * @param {{
 *   enableRubrics?: boolean;
 *   enableBiasDetection?: boolean;
 *   enableBiasMitigation?: boolean;
 *   enableLengthAlignment?: boolean;
 *   qualityGap?: number;
 *   judgeModel?: string;
 *   taskMetadata?: { inputLength?: number; outputLength?: number; promptLength?: number };
 *   [key: string]: any;
 * }} [options={}] - Comprehensive options; unrecognized keys are forwarded to the validator
 * @returns {Promise<import('./index.mjs').ValidationResult>} Comprehensive validation result
 */
export async function validateWithAllResearchEnhancements(imagePath, prompt, options = {}) {
  const {
    enableRubrics = true,
    enableBiasDetection = true,
    enableBiasMitigation = true,
    enableLengthAlignment = true,
    qualityGap = null,
    judgeModel = null,
    taskMetadata = {},
    ...validationOptions
  } = options;

  // Step 1: Apply explicit rubric if enabled
  let currentPrompt = prompt;
  if (enableRubrics) {
    const { buildRubricPrompt, DEFAULT_RUBRIC } = await import('./rubrics.mjs');
    currentPrompt = buildRubricPrompt(currentPrompt, DEFAULT_RUBRIC);
  }

  // Step 2: Perform validation with length alignment if enabled
  let result;
  if (enableLengthAlignment) {
    result = await validateWithLengthAlignment(imagePath, currentPrompt, {
      ...validationOptions,
      enableLengthNormalization: true
    });
  } else {
    result = await validateScreenshot(imagePath, currentPrompt, validationOptions);
  }

  // Step 3: Apply bias detection and mitigation
  if (enableBiasDetection || enableBiasMitigation) {
    const reasoning = result.reasoning || result.assessment || '';
    const biasDetection = enableBiasDetection ? detectBias(reasoning, {
      checkVerbosity: true,
      checkLength: true,
      checkFormatting: true,
      checkAuthority: true
    }) : null;

    // NOTE(review): when length alignment is enabled, validateWithLengthAlignment
    // may already have passed the result through mitigateBias, so this can apply
    // score adjustment a second time - confirm mitigateBias is safe to repeat.
    if (enableBiasMitigation && biasDetection?.hasBias) {
      result = mitigateBias(result, biasDetection, {
        adjustScores: true
      });
    }

    // A gap of exactly 0 is a valid value, so it must not be discarded by a plain
    // truthiness test. Every other falsy input still yields no quality-gap entry.
    const hasQualityGap = qualityGap === 0 || Boolean(qualityGap);

    // Add comprehensive research metadata
    result.comprehensiveResearchEnhancements = {
      rubrics: enableRubrics ? {
        used: true,
        paper: 'arXiv:2412.05579',
        finding: 'Explicit rubrics improve reliability by 10-20%'
      } : null,
      biasDetection: biasDetection,
      lengthAlignment: enableLengthAlignment ? {
        applied: true,
        paper: 'arXiv:2407.01085',
        method: 'AdapAlpaca-inspired'
      } : null,
      qualityGap: hasQualityGap ? {
        value: qualityGap,
        isEquivocal: Math.abs(qualityGap - 0.5) < 0.1,
        paper: 'arXiv:2406.07791'
      } : null,
      factors: {
        judgeModel: judgeModel || validationOptions.provider || 'unknown',
        taskMetadata: taskMetadata
      },
      researchPapers: [
        'arXiv:2406.07791 - Position bias, quality gaps',
        'arXiv:2407.01085 - Length bias, AdapAlpaca',
        'arXiv:2412.05579 - LLM-as-judge best practices'
      ]
    };
  }

  // Normalize result structure before returning (ensures consistent structure)
  return normalizeValidationResult(result, 'validateWithAllResearchEnhancements');
}
|
|
436
|
+
|
package/src/retry.mjs
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Retry Logic with Exponential Backoff
|
|
3
|
+
*
|
|
4
|
+
* Provides retry mechanisms for API calls with exponential backoff,
|
|
5
|
+
* configurable retry counts, and error classification.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import { ProviderError, TimeoutError } from './errors.mjs';
|
|
9
|
+
import { log, warn } from './logger.mjs';
|
|
10
|
+
|
|
11
|
+
/**
 * Check if an error is retryable
 *
 * Retryable: `TimeoutError` instances, errors named `AbortError`/`NetworkError`,
 * `ProviderError`s whose `details.statusCode` is 429 or 5xx, and errors whose
 * message mentions a timeout, network problem, rate limit or transient server
 * failure. Message matching is case-insensitive so that conventionally
 * capitalized phrases such as "Internal Server Error" are recognized.
 * Everything else (authentication errors, validation errors, nullish or
 * non-error thrown values) is treated as not retryable.
 *
 * @param {Error} error - Error to check
 * @returns {boolean} True if error is retryable
 */
export function isRetryableError(error) {
  // `throw null` / `throw undefined` are legal; there is nothing to inspect
  if (error === null || error === undefined) return false;

  // Normalize once; a missing or non-string message matches nothing
  const message = typeof error.message === 'string' ? error.message.toLowerCase() : '';
  const mentions = (...phrases) => phrases.some(phrase => message.includes(phrase));

  // Network errors (timeouts, connection issues)
  if (error instanceof TimeoutError) return true;
  if (error.name === 'AbortError' || error.name === 'NetworkError') return true;
  if (mentions('timeout', 'network')) return true;

  const statusCode = error instanceof ProviderError ? error.details?.statusCode : undefined;

  // Rate limiting (429)
  if (statusCode === 429) return true;
  if (mentions('rate limit', '429')) return true;

  // Server errors (5xx); comparisons against undefined are simply false
  if (statusCode >= 500 && statusCode < 600) return true;

  // Transient API errors; anything else is not retryable
  return mentions('temporarily unavailable', 'service unavailable', 'internal server error');
}
|
|
43
|
+
|
|
44
|
+
/**
 * Calculate exponential backoff delay
 *
 * The base delay doubles with every attempt and is capped at `maxDelay`. With
 * jitter enabled, a random offset of up to ±25% is added and the result is then
 * clamped to the range [0, maxDelay], so the documented maximum is never
 * exceeded.
 *
 * @param {number} attempt - Current attempt number (0-indexed)
 * @param {number} baseDelay - Base delay in milliseconds
 * @param {number} maxDelay - Maximum delay in milliseconds
 * @param {boolean} jitter - Add random jitter to prevent thundering herd
 * @returns {number} Delay in milliseconds (may be fractional when jitter is on)
 */
export function calculateBackoff(attempt, baseDelay = 1000, maxDelay = 30000, jitter = true) {
  const exponentialDelay = Math.min(baseDelay * Math.pow(2, attempt), maxDelay);

  if (!jitter) {
    return exponentialDelay;
  }

  // Add ±25% random jitter
  const jitterAmount = exponentialDelay * 0.25;
  const jitterValue = (Math.random() * 2 - 1) * jitterAmount;

  // Re-apply the cap after jittering: positive jitter on an already-capped delay
  // would otherwise overshoot maxDelay by up to 25%.
  return Math.max(0, Math.min(maxDelay, exponentialDelay + jitterValue));
}
|
|
65
|
+
|
|
66
|
+
/**
 * Retry a function with exponential backoff
 *
 * `fn` is always invoked at least once and at most `maxRetries + 1` times. After
 * a failure the error is passed to `retryable`; a non-retryable error is
 * rethrown immediately. Otherwise the function waits for the delay computed by
 * `calculateBackoff` and tries again. Before each wait either `onRetry` is
 * called (if supplied) or a warning is logged.
 *
 * @template T
 * @param {() => Promise<T>} fn - Function to retry
 * @param {{
 *   maxRetries?: number;
 *   baseDelay?: number;
 *   maxDelay?: number;
 *   onRetry?: (error: Error, attempt: number, delay: number) => void;
 *   retryable?: (error: Error) => boolean;
 * }} [options={}] - Retry options
 * @returns {Promise<T>} Result of function
 * @throws {Error} Last error if all retries fail
 */
export async function retryWithBackoff(fn, options = {}) {
  const {
    maxRetries = 3,
    baseDelay = 1000,
    maxDelay = 30000,
    onRetry = null,
    retryable = isRetryableError
  } = options;

  // A negative or NaN retry budget would skip the loop entirely, so fn would
  // never run and `undefined` would be thrown. Treat such values as "no retries".
  const requestedRetries = Number(maxRetries);
  const retryLimit = Number.isNaN(requestedRetries) ? 0 : Math.max(0, requestedRetries);

  let lastError;

  for (let attempt = 0; attempt <= retryLimit; attempt++) {
    try {
      return await fn();
    } catch (error) {
      lastError = error;

      // Don't retry if error is not retryable
      if (!retryable(error)) {
        throw error;
      }

      // Don't retry on last attempt
      if (attempt >= retryLimit) {
        break;
      }

      const delay = calculateBackoff(attempt, baseDelay, maxDelay);

      if (onRetry) {
        onRetry(error, attempt + 1, delay);
      } else {
        // Optional chaining: a thrown null/undefined must not crash the log line
        warn(`[Retry] Attempt ${attempt + 1}/${maxRetries} failed: ${error?.message}. Retrying in ${delay}ms...`);
      }

      await new Promise(resolve => setTimeout(resolve, delay));
    }
  }

  // All retries exhausted
  throw lastError;
}
|
|
123
|
+
|
|
124
|
+
/**
 * Build an error message enriched with retry context
 *
 * The output has the form `<message> (<context>)`, where the context is a
 * comma-separated list that always contains the operation and attempt count,
 * plus the provider and HTTP status for `ProviderError`s and the timeout for
 * `TimeoutError`s. A missing or empty message becomes "Unknown error".
 *
 * @param {Error} error - Original error
 * @param {number} attempts - Number of attempts made
 * @param {string} operation - Operation that failed
 * @returns {string} Enhanced error message
 */
export function enhanceErrorMessage(error, attempts, operation) {
  const headline = error.message || 'Unknown error';

  // Operation and attempt count are always reported, in this order
  const details = [`Operation: ${operation}`, `Attempts: ${attempts}`];

  if (error instanceof ProviderError) {
    details.push(`Provider: ${error.provider}`);
    const httpStatus = error.details?.statusCode;
    // Status is only listed when present (and non-zero)
    if (httpStatus) {
      details.push(`HTTP Status: ${httpStatus}`);
    }
  }

  if (error instanceof TimeoutError) {
    details.push(`Timeout: ${error.timeout}ms`);
  }

  return `${headline} (${details.join(', ')})`;
}
|
|
152
|
+
|