@arclabs561/ai-visual-test 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.secretsignore.example +20 -0
- package/CHANGELOG.md +360 -0
- package/CONTRIBUTING.md +63 -0
- package/DEPLOYMENT.md +80 -0
- package/LICENSE +22 -0
- package/README.md +142 -0
- package/SECURITY.md +108 -0
- package/api/health.js +34 -0
- package/api/validate.js +252 -0
- package/index.d.ts +1221 -0
- package/package.json +112 -0
- package/public/index.html +149 -0
- package/src/batch-optimizer.mjs +451 -0
- package/src/bias-detector.mjs +370 -0
- package/src/bias-mitigation.mjs +233 -0
- package/src/cache.mjs +433 -0
- package/src/config.mjs +268 -0
- package/src/constants.mjs +80 -0
- package/src/context-compressor.mjs +350 -0
- package/src/convenience.mjs +617 -0
- package/src/cost-tracker.mjs +257 -0
- package/src/cross-modal-consistency.mjs +170 -0
- package/src/data-extractor.mjs +232 -0
- package/src/dynamic-few-shot.mjs +140 -0
- package/src/dynamic-prompts.mjs +361 -0
- package/src/ensemble/index.mjs +53 -0
- package/src/ensemble-judge.mjs +366 -0
- package/src/error-handler.mjs +67 -0
- package/src/errors.mjs +167 -0
- package/src/experience-propagation.mjs +128 -0
- package/src/experience-tracer.mjs +487 -0
- package/src/explanation-manager.mjs +299 -0
- package/src/feedback-aggregator.mjs +248 -0
- package/src/game-goal-prompts.mjs +478 -0
- package/src/game-player.mjs +548 -0
- package/src/hallucination-detector.mjs +155 -0
- package/src/helpers/playwright.mjs +80 -0
- package/src/human-validation-manager.mjs +516 -0
- package/src/index.mjs +364 -0
- package/src/judge.mjs +929 -0
- package/src/latency-aware-batch-optimizer.mjs +192 -0
- package/src/load-env.mjs +159 -0
- package/src/logger.mjs +55 -0
- package/src/metrics.mjs +187 -0
- package/src/model-tier-selector.mjs +221 -0
- package/src/multi-modal/index.mjs +36 -0
- package/src/multi-modal-fusion.mjs +190 -0
- package/src/multi-modal.mjs +524 -0
- package/src/natural-language-specs.mjs +1071 -0
- package/src/pair-comparison.mjs +277 -0
- package/src/persona/index.mjs +42 -0
- package/src/persona-enhanced.mjs +200 -0
- package/src/persona-experience.mjs +572 -0
- package/src/position-counterbalance.mjs +140 -0
- package/src/prompt-composer.mjs +375 -0
- package/src/render-change-detector.mjs +583 -0
- package/src/research-enhanced-validation.mjs +436 -0
- package/src/retry.mjs +152 -0
- package/src/rubrics.mjs +231 -0
- package/src/score-tracker.mjs +277 -0
- package/src/smart-validator.mjs +447 -0
- package/src/spec-config.mjs +106 -0
- package/src/spec-templates.mjs +347 -0
- package/src/specs/index.mjs +38 -0
- package/src/temporal/index.mjs +102 -0
- package/src/temporal-adaptive.mjs +163 -0
- package/src/temporal-batch-optimizer.mjs +222 -0
- package/src/temporal-constants.mjs +69 -0
- package/src/temporal-context.mjs +49 -0
- package/src/temporal-decision-manager.mjs +271 -0
- package/src/temporal-decision.mjs +669 -0
- package/src/temporal-errors.mjs +58 -0
- package/src/temporal-note-pruner.mjs +173 -0
- package/src/temporal-preprocessor.mjs +543 -0
- package/src/temporal-prompt-formatter.mjs +219 -0
- package/src/temporal-validation.mjs +159 -0
- package/src/temporal.mjs +415 -0
- package/src/type-guards.mjs +311 -0
- package/src/uncertainty-reducer.mjs +470 -0
- package/src/utils/index.mjs +175 -0
- package/src/validation-framework.mjs +321 -0
- package/src/validation-result-normalizer.mjs +64 -0
- package/src/validation.mjs +243 -0
- package/src/validators/accessibility-programmatic.mjs +345 -0
- package/src/validators/accessibility-validator.mjs +223 -0
- package/src/validators/batch-validator.mjs +143 -0
- package/src/validators/hybrid-validator.mjs +268 -0
- package/src/validators/index.mjs +34 -0
- package/src/validators/prompt-builder.mjs +218 -0
- package/src/validators/rubric.mjs +85 -0
- package/src/validators/state-programmatic.mjs +260 -0
- package/src/validators/state-validator.mjs +291 -0
- package/vercel.json +27 -0
|
@@ -0,0 +1,370 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Bias Detection for LLM-as-a-Judge
|
|
3
|
+
*
|
|
4
|
+
* Detects common biases in LLM judge evaluations:
|
|
5
|
+
* - Superficial feature bias (verbosity, length, formatting)
|
|
6
|
+
* - Position bias (favoring first/last responses) - Systematic study shows position bias is not random
|
|
7
|
+
* and varies significantly across judges and tasks (arXiv:2406.07791)
|
|
8
|
+
* - Verbosity bias (favoring longer responses) - LLMs prefer longer answers more than humans
|
|
9
|
+
* (arXiv:2310.10076, arXiv:2407.01085)
|
|
10
|
+
* - Recency bias (favoring recent information)
|
|
11
|
+
* - Authority bias (favoring authoritative-sounding responses)
|
|
12
|
+
*
|
|
13
|
+
* Research shows these biases can significantly impact evaluation quality.
|
|
14
|
+
* Position bias is particularly severe and requires counter-balancing (arXiv:2406.07791, arXiv:2508.02020).
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
/**
 * Detect superficial feature bias in a single judgment.
 *
 * The judgment is reduced to text (strings are used as-is, anything else is
 * serialized with JSON.stringify) and every enabled heuristic detector is run
 * over that text. Only detectors that report `detected: true` end up in the
 * returned `biases` array; severity and recommendations are derived from them.
 *
 * `checkPosition` is accepted for backward compatibility but has no effect
 * here: position bias needs several judgments and is handled by
 * `detectPositionBias`.
 *
 * @param {string | object} judgment - Judgment object or text
 * @param {{
 *   checkVerbosity?: boolean;
 *   checkLength?: boolean;
 *   checkFormatting?: boolean;
 *   checkPosition?: boolean;
 *   checkAuthority?: boolean;
 * }} [options={}] - Detection options
 * @returns {import('./index.mjs').BiasDetectionResult} Bias detection results
 */
export function detectBias(judgment, options = {}) {
  const {
    checkVerbosity = true,
    checkLength = true,
    checkFormatting = true,
    checkAuthority = true
  } = options;

  // JSON.stringify returns undefined (not a string) for undefined, functions
  // and symbols; fall back to an empty text so the detectors never receive a
  // non-string value.
  let judgmentText = '';
  if (typeof judgment === 'string') {
    judgmentText = judgment;
  } else if (judgment !== null && judgment !== undefined) {
    const serialized = JSON.stringify(judgment);
    if (typeof serialized === 'string') {
      judgmentText = serialized;
    }
  }

  const biases = {
    verbosity: checkVerbosity ? detectVerbosityBias(judgmentText) : null,
    length: checkLength ? detectLengthBias(judgmentText) : null,
    formatting: checkFormatting ? detectFormattingBias(judgmentText) : null,
    authority: checkAuthority ? detectAuthorityBias(judgmentText) : null
  };

  // Keep only the detectors that fired, tagging each result with its type.
  const detectedBiases = Object.entries(biases)
    .filter(([, result]) => result && result.detected)
    .map(([type, result]) => ({ type, ...result }));

  return {
    hasBias: detectedBiases.length > 0,
    biases: detectedBiases,
    severity: calculateSeverity(detectedBiases),
    recommendations: generateRecommendations(detectedBiases)
  };
}
|
|
61
|
+
|
|
62
|
+
/**
 * Detect verbosity bias (favoring longer responses).
 *
 * Flags the text when it has more than 500 words, more than 10 filler words,
 * or more than 3 repeated phrases as reported by `findRepetitivePhrases`.
 *
 * @param {string} text - Judgment text to inspect
 * @returns {{
 *   detected: boolean,
 *   score: number,
 *   evidence: { wordCount: number, fillerCount: number, repetitivePhrases: Array, avgWordLength: number }
 * }} Detection flag, a 0-1 score (0 when not detected) and the raw measurements
 */
function detectVerbosityBias(text) {
  // Drop the empty tokens that split() yields for empty input or for
  // leading/trailing whitespace, so they are not counted as words.
  const words = text.split(/\s+/).filter(Boolean);
  const wordCount = words.length;
  const totalChars = words.reduce((sum, word) => sum + word.length, 0);
  const avgWordLength = wordCount > 0 ? totalChars / wordCount : 0;

  // Each filler word appears once in the list so no word is double-counted.
  const fillerWords = ['very', 'really', 'quite', 'rather', 'somewhat', 'extremely'];
  const fillerPattern = new RegExp(`\\b(?:${fillerWords.join('|')})\\b`, 'gi');
  const fillerCount = (text.match(fillerPattern) || []).length;

  const repetitivePhrases = findRepetitivePhrases(text);

  const detected = wordCount > 500 || fillerCount > 10 || repetitivePhrases.length > 3;

  return {
    detected,
    // Each signal contributes a fraction; the sum is capped at 1.0.
    score: detected
      ? Math.min(1.0, (wordCount / 1000) + (fillerCount / 20) + (repetitivePhrases.length / 5))
      : 0,
    evidence: {
      wordCount,
      fillerCount,
      repetitivePhrases: repetitivePhrases.slice(0, 3),
      avgWordLength
    }
  };
}
|
|
91
|
+
|
|
92
|
+
/**
 * Detect length bias (favoring responses based on length alone).
 *
 * The judgment is flagged when it uses length-related vocabulary and is itself
 * longer than 200 characters.
 *
 * @param {string} text - Judgment text to inspect
 * @returns {{
 *   detected: boolean,
 *   score: number,
 *   evidence: { length: number, mentionsLength: boolean }
 * }} `score` is 0.7 whenever length vocabulary is present (even if the text is
 *   too short to be flagged), otherwise 0
 */
function detectLengthBias(text) {
  const length = text.length;

  // Whole-word matching: an unanchored pattern would also fire on unrelated
  // words such as "along", "belong" or "emphasize".
  const lengthVocabulary =
    /\b(?:length(?:s|y)?|sizes?|sized|long(?:er|est)?|short(?:er|est)?|brief(?:er|ly)?|extensive(?:ly)?)\b/i;
  const hasLengthBasedReasoning = lengthVocabulary.test(text);

  return {
    detected: hasLengthBasedReasoning && length > 200,
    score: hasLengthBasedReasoning ? 0.7 : 0,
    evidence: {
      length,
      mentionsLength: hasLengthBasedReasoning
    }
  };
}
|
|
108
|
+
|
|
109
|
+
/**
 * Detect formatting bias (favoring well-formatted responses).
 *
 * The judgment is flagged when it talks about formatting AND itself contains
 * markdown-like structure (a heading marker, a list item, more than 3 bullet
 * lines, or more than 2 heading lines).
 *
 * @param {string} text - Judgment text to inspect
 * @returns {{
 *   detected: boolean,
 *   score: number,
 *   evidence: { hasMarkdown: boolean, hasLists: boolean, hasHeaders: boolean, mentionsFormatting: boolean }
 * }} `score` is 0.6 whenever formatting vocabulary is present, otherwise 0
 */
function detectFormattingBias(text) {
  const hasMarkdown = /#{1,6}\s|^\*\s|^-\s|^\d+\.\s/m.test(text);
  const hasLists = (text.match(/^\s*[-*]\s/gm) || []).length > 3;
  const hasHeaders = (text.match(/^#{1,6}\s/gm) || []).length > 2;

  // Check if judgment mentions formatting. Whole-word matching is required:
  // an unanchored "format" also matches "information", and "structure"
  // matches "infrastructure".
  const formattingVocabulary =
    /\b(?:(?:re|un|well-)?format(?:s|ted|ting)?|(?:un|re)?structur(?:e|es|ed|ing)|(?:dis|un|well-)?organized|markdown)\b/i;
  const mentionsFormatting = formattingVocabulary.test(text);

  return {
    detected: mentionsFormatting && (hasMarkdown || hasLists || hasHeaders),
    score: mentionsFormatting ? 0.6 : 0,
    evidence: {
      hasMarkdown,
      hasLists,
      hasHeaders,
      mentionsFormatting
    }
  };
}
|
|
131
|
+
|
|
132
|
+
/**
 * Detect authority bias (favoring authoritative-sounding responses).
 *
 * Counts case-insensitive occurrences of a fixed list of authority phrases.
 * More than two occurrences flag the text; the score is the occurrence count
 * divided by 5, capped at 1.0.
 *
 * @param {string} text - Judgment text to inspect
 * @returns {{ detected: boolean, score: number, evidence: { authorityPhrasesFound: number } }}
 */
function detectAuthorityBias(text) {
  const phrases = [
    'according to',
    'research shows',
    'studies indicate',
    'experts say',
    'it is well-known',
    'commonly accepted',
    'standard practice',
    'best practice',
    'industry standard',
    'widely recognized'
  ];

  let hits = 0;
  for (const phrase of phrases) {
    const found = text.match(new RegExp(phrase, 'gi'));
    if (found !== null) {
      hits += found.length;
    }
  }

  const flagged = hits > 2;
  const normalized = Math.min(1.0, hits / 5);

  return {
    detected: flagged,
    score: normalized,
    evidence: { authorityPhrasesFound: hits }
  };
}
|
|
155
|
+
|
|
156
|
+
/**
 * Find repetitive phrases in text.
 *
 * Slides a three-word window over the lower-cased, whitespace-tokenized text
 * and counts how often each three-word phrase occurs. Occurrences of the same
 * phrase that overlap each other are counted once. Phrases occurring more than
 * twice are returned, most frequent first (ties keep first-appearance order),
 * limited to five entries.
 *
 * Counting is done on word tokens in a single pass, so phrases are found
 * regardless of whether the words are separated by spaces, tabs or newlines.
 *
 * @param {string} text - Text to scan
 * @returns {Array<{ phrase: string, count: number }>} Up to five repeated phrases
 */
function findRepetitivePhrases(text) {
  const words = text.toLowerCase().split(/\s+/).filter(Boolean);

  // phrase -> { count, lastStart }; Map preserves first-appearance order.
  const tally = new Map();

  for (let i = 0; i + 2 < words.length; i++) {
    const phrase = `${words[i]} ${words[i + 1]} ${words[i + 2]}`;
    const entry = tally.get(phrase);

    if (entry === undefined) {
      tally.set(phrase, { count: 1, lastStart: i });
    } else if (i - entry.lastStart >= 3) {
      // Only count an occurrence that starts after the previous one ended,
      // so "a a a a a" is one occurrence of "a a a", not three.
      entry.count += 1;
      entry.lastStart = i;
    }
  }

  return [...tally]
    .filter(([, entry]) => entry.count > 2)
    .map(([phrase, entry]) => ({ phrase, count: entry.count }))
    .sort((a, b) => b.count - a.count)
    .slice(0, 5);
}
|
|
177
|
+
|
|
178
|
+
/**
 * Calculate overall bias severity.
 *
 * Averages the `score` of every detected bias (a missing or falsy score counts
 * as 0) and maps the mean onto a label: 'high' from 0.7, 'medium' from 0.4,
 * otherwise 'low'. An empty list yields 'none'.
 *
 * @param {Array<{ score?: number }>} detectedBiases - Biases that were detected
 * @returns {'none' | 'low' | 'medium' | 'high'} Severity label
 */
function calculateSeverity(detectedBiases) {
  const total = detectedBiases.length;
  if (total === 0) {
    return 'none';
  }

  let scoreSum = 0;
  for (const bias of detectedBiases) {
    scoreSum = scoreSum + (bias.score || 0);
  }
  const mean = scoreSum / total;

  // Ordered from the highest floor down; the first floor reached wins.
  const tiers = [
    [0.7, 'high'],
    [0.4, 'medium']
  ];
  for (const [floor, label] of tiers) {
    if (mean >= floor) {
      return label;
    }
  }
  return 'low';
}
|
|
190
|
+
|
|
191
|
+
/**
 * Generate recommendations based on detected biases.
 *
 * Emits one fixed message per known bias type present in the input, always in
 * the order verbosity, length, formatting, authority. When none of those
 * types is present a single "no significant biases" message is returned.
 *
 * @param {Array<{ type: string }>} detectedBiases - Biases that were detected
 * @returns {string[]} Human-readable recommendations (never empty)
 */
function generateRecommendations(detectedBiases) {
  const adviceByType = [
    ['verbosity', 'Judge may be favoring verbose responses. Focus evaluation on content quality, not length.'],
    ['length', 'Judge may be biased by response length. Use rubric to focus on substantive content.'],
    ['formatting', 'Judge may be favoring well-formatted responses. Evaluate content regardless of formatting.'],
    ['authority', 'Judge may be biased by authoritative language. Focus on factual correctness, not tone.']
  ];

  const presentTypes = new Set(detectedBiases.map(bias => bias.type));

  const advice = adviceByType
    .filter(([type]) => presentTypes.has(type))
    .map(([, message]) => message);

  if (advice.length > 0) {
    return advice;
  }
  return ['No significant biases detected. Consider using ensemble judging for high-stakes evaluations.'];
}
|
|
219
|
+
|
|
220
|
+
/**
 * Detect position bias in array of judgments
 *
 * Research: arXiv:2406.07791 introduces three metrics:
 * - Repetition Stability (RS): Consistency across repeated evaluations (threshold: RS > 0.85)
 * - Position Consistency (PC): Ratio of consistent judgments when order is swapped
 * - Preference Fairness (PF): Extent to which judges favor specific positions
 *   - PF = 0: no bias
 *   - PF > 0: recency bias (favoring later positions)
 *   - PF < 0: primacy bias (favoring earlier positions)
 *
 * Key findings from research:
 * - Quality gap strongly affects bias: parabolic relationship where PC increases with quality gap
 * - Small quality gaps (δ_q ≈ 0.5) cause maximum confusion and position bias
 * - Large quality gaps (δ_q → 0 or 1) lead to more consistent, fair judgments
 * - Judge-level factors: model family, context window, max output length
 * - Candidate-level factors: quality gap is primary, but also verbosity/length bias
 * - Task-level factors: input/output length, prompt length, benchmark difficulty
 *
 * What this function actually computes:
 * - Entries that are not objects, or whose `score` is not a finite number, are ignored.
 * - The first / last usable score is flagged when it differs from the mean of
 *   the scores in between by more than 2 points (with only two scores the
 *   reference is their midpoint).
 * - When `qualityGap` is not supplied it is estimated from the score spread on
 *   an assumed 0-10 scale, using the convention stated above: identical
 *   scores give 0.5 (tie), a full-scale spread gives 0 (large gap).
 * - PC and a simplified PF are added only when `calculateMetrics` is true and
 *   `swappedJudgments` has the same length and the same number of usable scores.
 *   The swapped list is compared in reversed order.
 *
 * @param {Array<{ score: number | null }>} judgments - Array of judgment results with scores
 * @param {{
 *   calculateMetrics?: boolean;
 *   swappedJudgments?: Array<{ score: number | null }>;
 *   qualityGap?: number; // Quality gap (δ_q) between candidates (0-1, where 0.5 = tie)
 *   judgeModel?: string; // Judge model identifier for judge-level analysis
 *   taskMetadata?: { inputLength?: number; outputLength?: number; promptLength?: number };
 * }} [options={}] - Options for calculating research metrics
 * @returns {import('./index.mjs').PositionBiasResult} Position bias detection result
 */
export function detectPositionBias(judgments, options = {}) {
  const {
    calculateMetrics = false,
    swappedJudgments = null,
    qualityGap = null,
    judgeModel = null,
    taskMetadata = {}
  } = options;

  if (!Array.isArray(judgments) || judgments.length < 2) {
    return { detected: false, reason: 'Need at least 2 judgments to detect position bias' };
  }

  const scores = collectFiniteScores(judgments);

  if (scores.length < 2) {
    return { detected: false, reason: 'Not enough scores to detect position bias' };
  }

  // Check if first or last scores deviate strongly from the rest
  const firstScore = scores[0];
  const lastScore = scores[scores.length - 1];
  const middleScores = scores.slice(1, -1);
  const avgMiddle = middleScores.length > 0
    ? middleScores.reduce((a, b) => a + b, 0) / middleScores.length
    : (firstScore + lastScore) / 2;

  const firstBias = Math.abs(firstScore - avgMiddle) > 2;
  const lastBias = Math.abs(lastScore - avgMiddle) > 2;

  // Estimate quality gap if not provided. Convention: 0.5 = tie, 0 or 1 = large gap.
  // No spread between scores is a tie (0.5); a full-scale spread maps to 0.
  let estimatedQualityGap = qualityGap;
  if (estimatedQualityGap === null) {
    const scoreRange = Math.max(...scores) - Math.min(...scores);
    const maxPossibleRange = 10; // Assuming 0-10 scale
    const normalizedRange = Math.min(1, scoreRange / maxPossibleRange);
    estimatedQualityGap = 0.5 - normalizedRange / 2;
  }

  // Research finding: small quality gaps (≈0.5) cause maximum position bias.
  // Severity is symmetric around the tie point, so gaps near 0 and near 1 are
  // both treated as low risk.
  const distanceFromTie = Math.abs(estimatedQualityGap - 0.5);
  const isEquivocalCase = distanceFromTie < 0.1;
  let qualityGapSeverity = 'medium';
  if (isEquivocalCase) {
    qualityGapSeverity = 'high';
  } else if (distanceFromTie > 0.3) {
    qualityGapSeverity = 'low';
  }

  const result = {
    detected: firstBias || lastBias,
    firstBias,
    lastBias,
    evidence: {
      firstScore,
      lastScore,
      avgMiddle,
      allScores: scores
    },
    // Research-based quality gap analysis
    qualityGap: {
      value: estimatedQualityGap,
      severity: qualityGapSeverity,
      isEquivocal: isEquivocalCase,
      note: isEquivocalCase
        ? 'Equivocal case (quality gap ≈0.5) - maximum position bias risk per arXiv:2406.07791'
        : 'Quality gap analysis per research findings'
    },
    // Judge-level and task-level factors
    factors: {
      judgeModel: judgeModel || 'unknown',
      taskMetadata: taskMetadata || {},
      note: 'Judge-level and task-level factors influence bias per research'
    }
  };

  // Calculate research metrics if requested and swapped judgments provided
  if (calculateMetrics && Array.isArray(swappedJudgments) && swappedJudgments.length === judgments.length) {
    const swappedScores = collectFiniteScores(swappedJudgments);

    if (swappedScores.length === scores.length) {
      // Position Consistency (PC): share of scores that stay within 1 point
      // when compared against the swapped run read in reversed order.
      let consistentCount = 0;
      for (let i = 0; i < scores.length; i++) {
        const swapped = swappedScores[swappedScores.length - 1 - i];
        if (Math.abs(scores[i] - swapped) <= 1.0) {
          consistentCount++;
        }
      }
      result.positionConsistency = consistentCount / scores.length;

      // Preference Fairness (PF), simplified: fraction of the other scores
      // that are lower than the first / last score.
      const others = scores.length - 1;
      const lastIndex = scores.length - 1;
      result.preferenceFairness = {
        firstPositionPreference:
          scores.filter((s, i) => i !== 0 && s < firstScore).length / others,
        lastPositionPreference:
          scores.filter((s, i) => i !== lastIndex && s < lastScore).length / others
      };

      // Note: Repetition Stability (RS) requires multiple evaluation runs with same order
      // This would need to be calculated externally with repeated evaluations
      result.metrics = {
        positionConsistency: result.positionConsistency,
        preferenceFairness: result.preferenceFairness,
        note: 'Repetition Stability (RS) requires multiple runs - calculate externally'
      };
    }
  }

  return result;
}

/**
 * Collect, in order, the usable scores from a list of judgments.
 * Entries that are null or not objects, and scores that are not finite
 * numbers, are skipped.
 *
 * @param {Array<unknown>} judgments - Judgment entries to read `score` from
 * @returns {number[]} Finite numeric scores in their original order
 */
function collectFiniteScores(judgments) {
  const scores = [];
  for (const judgment of judgments) {
    // typeof null is 'object', so null must be excluded explicitly.
    const score = judgment !== null && typeof judgment === 'object' ? judgment.score : null;
    if (typeof score === 'number' && Number.isFinite(score)) {
      scores.push(score);
    }
  }
  return scores;
}
|
|
370
|
+
|
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Active Bias Mitigation
|
|
3
|
+
*
|
|
4
|
+
* Uses bias detection to actively adjust scores and mitigate biases.
|
|
5
|
+
* Research shows active mitigation is more effective than detection alone.
|
|
6
|
+
*
|
|
7
|
+
* Based on research findings that counter-balancing and active score adjustment
|
|
8
|
+
* can effectively eliminate position bias and other evaluation biases.
|
|
9
|
+
*
|
|
10
|
+
* Research:
|
|
11
|
+
* - Position bias: Systematic study (arXiv:2406.07791) - Position bias not random, varies by judge/task
|
|
12
|
+
* - Counter-balancing: Effective elimination method (arXiv:2508.02020)
|
|
13
|
+
* - Verbosity bias: Length alignment reduces bias (arXiv:2407.01085 - AdapAlpaca)
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
import { detectBias, detectPositionBias } from './bias-detector.mjs';
|
|
17
|
+
|
|
18
|
+
/**
 * Apply bias mitigation to a judgment result
 *
 * For every detected bias of a known type the score is lowered by
 * `weight * bias.score` (verbosity 0.5, length 0.3, formatting 0.2,
 * authority 0.4). The summed adjustment is clamped to
 * [minAdjustment, maxAdjustment] and the adjusted score to [0, 10].
 *
 * A result whose `score` is not a finite number is never adjusted: the score
 * is passed through unchanged and `totalAdjustment` is reported as '0.00'.
 * Biases with a missing or non-finite `score` contribute no adjustment, and
 * unknown bias types are ignored for scoring.
 *
 * The verbosity handling is a SIMPLIFIED mitigation. It does NOT implement
 * AdapAlpaca's (arXiv:2407.01085) length alignment or its decomposition of
 * preference into desirability and information mass; it is a plain score
 * reduction driven by the verbosity detector.
 *
 * `adjustIssues` is accepted for backward compatibility but currently has no effect.
 *
 * @param {import('./index.mjs').ValidationResult} result - Original judgment result
 * @param {import('./index.mjs').BiasDetectionResult} biasDetection - Bias detection results
 * @param {{
 *   adjustScores?: boolean;
 *   adjustIssues?: boolean;
 *   minAdjustment?: number;
 *   maxAdjustment?: number;
 * }} [options={}] - Mitigation options
 * @returns {import('./index.mjs').ValidationResult} Adjusted result
 */
export function mitigateBias(result, biasDetection, options = {}) {
  const {
    adjustScores = true,
    minAdjustment = -2.0,
    maxAdjustment = 2.0
  } = options;

  if (!biasDetection || !biasDetection.hasBias) {
    return {
      ...result,
      biasMitigation: {
        applied: false,
        reason: 'No bias detected'
      }
    };
  }

  // Per-type penalty weight and the explanation recorded with each adjustment.
  const rules = new Map([
    ['verbosity', {
      weight: -0.5,
      reason: 'Reduced score due to verbosity bias (research: arXiv:2310.10076, 2407.01085). Full AdapAlpaca would align lengths under equivalent intervals.',
      researchNote: 'AdapAlpaca decomposes win rate into desirability (length-independent) and information mass (length-dependent)'
    }],
    ['length', { weight: -0.3, reason: 'Reduced score due to length bias' }],
    ['formatting', { weight: -0.2, reason: 'Reduced score due to formatting bias' }],
    ['authority', { weight: -0.4, reason: 'Reduced score due to authority bias' }]
  ]);

  const biases = Array.isArray(biasDetection.biases) ? biasDetection.biases : [];
  const hasNumericScore = typeof result.score === 'number' && Number.isFinite(result.score);

  let adjustedScore = result.score;
  const adjustments = [];

  if (adjustScores && hasNumericScore) {
    let totalAdjustment = 0;

    for (const bias of biases) {
      const rule = rules.get(bias.type);
      if (rule === undefined) {
        continue;
      }

      // A missing or non-finite bias score must not poison the total with NaN.
      const strength = typeof bias.score === 'number' && Number.isFinite(bias.score)
        ? bias.score
        : 0;
      const adjustment = rule.weight * strength;

      const entry = {
        type: bias.type,
        adjustment: adjustment.toFixed(2),
        reason: rule.reason
      };
      if (rule.researchNote !== undefined) {
        entry.researchNote = rule.researchNote;
      }
      adjustments.push(entry);

      totalAdjustment += adjustment;
    }

    // Clamp adjustment
    totalAdjustment = Math.max(minAdjustment, Math.min(maxAdjustment, totalAdjustment));

    // Apply adjustment, keeping the score on the 0-10 scale
    adjustedScore = Math.max(0, Math.min(10, result.score + totalAdjustment));
  }

  return {
    ...result,
    score: adjustedScore,
    originalScore: result.score,
    biasMitigation: {
      applied: true,
      adjustments,
      totalAdjustment: hasNumericScore
        ? (adjustedScore - result.score).toFixed(2)
        : '0.00',
      detectedBiases: biases.map(b => b.type),
      severity: biasDetection.severity
    }
  };
}
|
|
141
|
+
|
|
142
|
+
/**
 * Mitigate position bias in array of judgments
 *
 * Runs `detectPositionBias` over the judgments. When the first and/or last
 * scored judgment is flagged, its score is moved one point toward the
 * reference level reported by the detector (`evidence.avgMiddle`): down when
 * the score sits above it, up when it sits below. Scores stay within [0, 10].
 *
 * Judgments without a finite numeric score are returned unchanged, and the
 * "first" / "last" positions refer to the first / last judgment that has such
 * a score, matching the scores the detector works on.
 *
 * `randomizeOrder` is accepted for backward compatibility but has no effect:
 * shuffling would have to happen before the evaluation, not afterwards.
 * When `adjustScores` is false the input array is returned as-is.
 *
 * @param {Array<import('./index.mjs').ValidationResult>} judgments - Array of judgment results
 * @param {{
 *   randomizeOrder?: boolean;
 *   adjustScores?: boolean;
 * }} [options={}] - Mitigation options
 * @returns {Array<import('./index.mjs').ValidationResult>} Adjusted judgments
 */
export function mitigatePositionBias(judgments, options = {}) {
  const { adjustScores = true } = options;

  // Detect position bias
  const positionBias = detectPositionBias(judgments);

  if (!positionBias.detected) {
    return judgments.map(j => ({
      ...j,
      biasMitigation: {
        applied: false,
        reason: 'No position bias detected'
      }
    }));
  }

  if (!adjustScores) {
    return judgments;
  }

  const hasUsableScore = (judgment) =>
    judgment !== null &&
    typeof judgment === 'object' &&
    typeof judgment.score === 'number' &&
    Number.isFinite(judgment.score);

  // Positions of the judgments whose scores the detector treated as first/last.
  const firstScoredIndex = judgments.findIndex(hasUsableScore);
  let lastScoredIndex = -1;
  for (let i = judgments.length - 1; i >= 0; i--) {
    if (hasUsableScore(judgments[i])) {
      lastScoredIndex = i;
      break;
    }
  }

  const { avgMiddle } = positionBias.evidence;

  return judgments.map((judgment, index) => {
    if (!hasUsableScore(judgment)) return judgment;

    const isBiasedFirst = positionBias.firstBias && index === firstScoredIndex;
    const isBiasedLast = positionBias.lastBias && index === lastScoredIndex;

    // Step toward the reference level. A flagged score is more than 2 points
    // away from it, so a 1-point step cannot overshoot.
    let adjustment = 0;
    if (isBiasedFirst || isBiasedLast) {
      adjustment = judgment.score > avgMiddle ? -1.0 : 1.0;
    }

    const adjustedScore = Math.max(0, Math.min(10, judgment.score + adjustment));

    let reason = 'No adjustment needed';
    if (isBiasedFirst) {
      reason = 'Reduced first position bias';
    } else if (isBiasedLast) {
      reason = 'Reduced last position bias';
    }

    return {
      ...judgment,
      score: adjustedScore,
      originalScore: judgment.score,
      biasMitigation: {
        applied: true,
        type: 'position',
        adjustment: adjustment.toFixed(2),
        reason
      }
    };
  });
}
|
|
211
|
+
|
|
212
|
+
/**
 * Apply comprehensive bias mitigation to judgment.
 *
 * Convenience wrapper: runs `detectBias` with the verbosity, length,
 * formatting and authority checks switched on, then hands the outcome to
 * `mitigateBias` together with the caller's options.
 *
 * The text that gets analysed is `reasoning` when it is truthy, otherwise
 * `result.reasoning`, otherwise the empty string.
 *
 * @param {import('./index.mjs').ValidationResult} result - Judgment result
 * @param {string} reasoning - Reasoning text for bias detection
 * @param {import('./index.mjs').BiasMitigationOptions} [options={}] - Mitigation options
 * @returns {import('./index.mjs').ValidationResult} Mitigated result
 */
export function applyBiasMitigation(result, reasoning, options = {}) {
  const textToInspect = reasoning || result.reasoning || '';

  const detectorFlags = {
    checkVerbosity: true,
    checkLength: true,
    checkFormatting: true,
    checkAuthority: true
  };

  const detected = detectBias(textToInspect, detectorFlags);
  return mitigateBias(result, detected, options);
}
|
|
233
|
+
|