@arclabs561/ai-visual-test 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93) hide show
  1. package/.secretsignore.example +20 -0
  2. package/CHANGELOG.md +360 -0
  3. package/CONTRIBUTING.md +63 -0
  4. package/DEPLOYMENT.md +80 -0
  5. package/LICENSE +22 -0
  6. package/README.md +142 -0
  7. package/SECURITY.md +108 -0
  8. package/api/health.js +34 -0
  9. package/api/validate.js +252 -0
  10. package/index.d.ts +1221 -0
  11. package/package.json +112 -0
  12. package/public/index.html +149 -0
  13. package/src/batch-optimizer.mjs +451 -0
  14. package/src/bias-detector.mjs +370 -0
  15. package/src/bias-mitigation.mjs +233 -0
  16. package/src/cache.mjs +433 -0
  17. package/src/config.mjs +268 -0
  18. package/src/constants.mjs +80 -0
  19. package/src/context-compressor.mjs +350 -0
  20. package/src/convenience.mjs +617 -0
  21. package/src/cost-tracker.mjs +257 -0
  22. package/src/cross-modal-consistency.mjs +170 -0
  23. package/src/data-extractor.mjs +232 -0
  24. package/src/dynamic-few-shot.mjs +140 -0
  25. package/src/dynamic-prompts.mjs +361 -0
  26. package/src/ensemble/index.mjs +53 -0
  27. package/src/ensemble-judge.mjs +366 -0
  28. package/src/error-handler.mjs +67 -0
  29. package/src/errors.mjs +167 -0
  30. package/src/experience-propagation.mjs +128 -0
  31. package/src/experience-tracer.mjs +487 -0
  32. package/src/explanation-manager.mjs +299 -0
  33. package/src/feedback-aggregator.mjs +248 -0
  34. package/src/game-goal-prompts.mjs +478 -0
  35. package/src/game-player.mjs +548 -0
  36. package/src/hallucination-detector.mjs +155 -0
  37. package/src/helpers/playwright.mjs +80 -0
  38. package/src/human-validation-manager.mjs +516 -0
  39. package/src/index.mjs +364 -0
  40. package/src/judge.mjs +929 -0
  41. package/src/latency-aware-batch-optimizer.mjs +192 -0
  42. package/src/load-env.mjs +159 -0
  43. package/src/logger.mjs +55 -0
  44. package/src/metrics.mjs +187 -0
  45. package/src/model-tier-selector.mjs +221 -0
  46. package/src/multi-modal/index.mjs +36 -0
  47. package/src/multi-modal-fusion.mjs +190 -0
  48. package/src/multi-modal.mjs +524 -0
  49. package/src/natural-language-specs.mjs +1071 -0
  50. package/src/pair-comparison.mjs +277 -0
  51. package/src/persona/index.mjs +42 -0
  52. package/src/persona-enhanced.mjs +200 -0
  53. package/src/persona-experience.mjs +572 -0
  54. package/src/position-counterbalance.mjs +140 -0
  55. package/src/prompt-composer.mjs +375 -0
  56. package/src/render-change-detector.mjs +583 -0
  57. package/src/research-enhanced-validation.mjs +436 -0
  58. package/src/retry.mjs +152 -0
  59. package/src/rubrics.mjs +231 -0
  60. package/src/score-tracker.mjs +277 -0
  61. package/src/smart-validator.mjs +447 -0
  62. package/src/spec-config.mjs +106 -0
  63. package/src/spec-templates.mjs +347 -0
  64. package/src/specs/index.mjs +38 -0
  65. package/src/temporal/index.mjs +102 -0
  66. package/src/temporal-adaptive.mjs +163 -0
  67. package/src/temporal-batch-optimizer.mjs +222 -0
  68. package/src/temporal-constants.mjs +69 -0
  69. package/src/temporal-context.mjs +49 -0
  70. package/src/temporal-decision-manager.mjs +271 -0
  71. package/src/temporal-decision.mjs +669 -0
  72. package/src/temporal-errors.mjs +58 -0
  73. package/src/temporal-note-pruner.mjs +173 -0
  74. package/src/temporal-preprocessor.mjs +543 -0
  75. package/src/temporal-prompt-formatter.mjs +219 -0
  76. package/src/temporal-validation.mjs +159 -0
  77. package/src/temporal.mjs +415 -0
  78. package/src/type-guards.mjs +311 -0
  79. package/src/uncertainty-reducer.mjs +470 -0
  80. package/src/utils/index.mjs +175 -0
  81. package/src/validation-framework.mjs +321 -0
  82. package/src/validation-result-normalizer.mjs +64 -0
  83. package/src/validation.mjs +243 -0
  84. package/src/validators/accessibility-programmatic.mjs +345 -0
  85. package/src/validators/accessibility-validator.mjs +223 -0
  86. package/src/validators/batch-validator.mjs +143 -0
  87. package/src/validators/hybrid-validator.mjs +268 -0
  88. package/src/validators/index.mjs +34 -0
  89. package/src/validators/prompt-builder.mjs +218 -0
  90. package/src/validators/rubric.mjs +85 -0
  91. package/src/validators/state-programmatic.mjs +260 -0
  92. package/src/validators/state-validator.mjs +291 -0
  93. package/vercel.json +27 -0
@@ -0,0 +1,370 @@
1
+ /**
2
+ * Bias Detection for LLM-as-a-Judge
3
+ *
4
+ * Detects common biases in LLM judge evaluations:
5
+ * - Superficial feature bias (verbosity, length, formatting)
6
+ * - Position bias (favoring first/last responses) - Systematic study shows position bias is not random
7
+ * and varies significantly across judges and tasks (arXiv:2406.07791)
8
+ * - Verbosity bias (favoring longer responses) - LLMs prefer longer answers more than humans
9
+ * (arXiv:2310.10076, arXiv:2407.01085)
10
+ * - Recency bias (favoring recent information)
11
+ * - Authority bias (favoring authoritative-sounding responses)
12
+ *
13
+ * Research shows these biases can significantly impact evaluation quality.
14
+ * Position bias is particularly severe and requires counter-balancing (arXiv:2406.07791, arXiv:2508.02020).
15
+ */
16
+
17
/**
 * Detect superficial feature bias in judgment
 *
 * Runs every enabled single-judgment detector over the judgment text and reports
 * the ones that fired, together with an overall severity and recommendations.
 *
 * @param {string | object} judgment - Judgment object or text. Non-string values are
 *   serialized with JSON.stringify; null, undefined and values JSON.stringify cannot
 *   represent (it returns undefined for them) are treated as empty text.
 * @param {{
 *   checkVerbosity?: boolean;
 *   checkLength?: boolean;
 *   checkFormatting?: boolean;
 *   checkPosition?: boolean;
 *   checkAuthority?: boolean;
 * }} [options={}] - Detection options. `checkPosition` is accepted but has no effect
 *   here: this function never reads it (position bias needs several judgments, see
 *   detectPositionBias).
 * @returns {import('./index.mjs').BiasDetectionResult} Bias detection results
 */
export function detectBias(judgment, options = {}) {
  const {
    checkVerbosity = true,
    checkLength = true,
    checkFormatting = true,
    checkAuthority = true
  } = options || {};

  const judgmentText = toJudgmentText(judgment);

  // A disabled detector is recorded as null so it is skipped by the filter below.
  const biases = {
    verbosity: checkVerbosity ? detectVerbosityBias(judgmentText) : null,
    length: checkLength ? detectLengthBias(judgmentText) : null,
    formatting: checkFormatting ? detectFormattingBias(judgmentText) : null,
    authority: checkAuthority ? detectAuthorityBias(judgmentText) : null
  };

  const detectedBiases = Object.entries(biases)
    .filter(([, result]) => result && result.detected)
    .map(([type, result]) => ({ type, ...result }));

  return {
    hasBias: detectedBiases.length > 0,
    biases: detectedBiases,
    severity: calculateSeverity(detectedBiases),
    recommendations: generateRecommendations(detectedBiases)
  };
}

/**
 * Turn a judgment into the text the detectors scan. Always returns a string so the
 * detectors can call string methods on it.
 */
function toJudgmentText(judgment) {
  if (typeof judgment === 'string') {
    return judgment;
  }
  if (judgment === null || judgment === undefined) {
    return '';
  }
  // JSON.stringify returns undefined (not a string) for functions and symbols.
  const serialized = JSON.stringify(judgment);
  return typeof serialized === 'string' ? serialized : '';
}
61
+
62
/**
 * Detect verbosity bias (favoring longer responses)
 *
 * Flags text that is very long (> 500 words), heavy on filler words (> 10) or that
 * repeats more than 3 distinct phrases.
 *
 * @param {string} text - Judgment text to inspect
 * @returns {{ detected: boolean, score: number, evidence: object }} `score` is 0 when
 *   nothing was detected, otherwise a value capped at 1.0
 */
function detectVerbosityBias(text) {
  // Drop the empty tokens that split() yields for empty text or leading/trailing
  // whitespace, so they do not inflate the word count.
  const words = text.split(/\s+/).filter(Boolean);
  const wordCount = words.length;
  const totalWordChars = words.reduce((sum, word) => sum + word.length, 0);
  const avgWordLength = wordCount > 0 ? totalWordChars / wordCount : 0;

  // Each filler word appears once in the pattern so no occurrence is counted twice.
  const fillerPattern = /\b(?:very|really|quite|rather|somewhat|extremely)\b/gi;
  const fillerCount = (text.match(fillerPattern) || []).length;

  const repetitivePhrases = findRepetitivePhrases(text);

  const detected = wordCount > 500 || fillerCount > 10 || repetitivePhrases.length > 3;

  return {
    detected,
    score: detected
      ? Math.min(1.0, (wordCount / 1000) + (fillerCount / 20) + (repetitivePhrases.length / 5))
      : 0,
    evidence: {
      wordCount,
      fillerCount,
      repetitivePhrases: repetitivePhrases.slice(0, 3),
      avgWordLength
    }
  };
}
91
+
92
/**
 * Detect length bias (favoring responses based on length alone)
 *
 * Detected when the text is longer than 200 characters and mentions a length-related
 * word.
 *
 * @param {string} text - Judgment text to inspect
 * @returns {{ detected: boolean, score: number, evidence: { length: number, mentionsLength: boolean } }}
 *   `score` is 0.7 whenever a length word is mentioned, even if `detected` is false
 */
function detectLengthBias(text) {
  const length = text.length;
  // The leading \b keeps the keywords from matching inside unrelated words
  // ("emphasize" contains "size", "along" contains "long") while still accepting
  // inflections such as "longer", "lengthy" or "shortest".
  const hasLengthBasedReasoning = /\b(?:length|size|long|short|brief|extensive)/i.test(text);

  return {
    detected: hasLengthBasedReasoning && length > 200,
    score: hasLengthBasedReasoning ? 0.7 : 0,
    evidence: {
      length,
      mentionsLength: hasLengthBasedReasoning
    }
  };
}
108
+
109
/**
 * Detect formatting bias (favoring well-formatted responses)
 *
 * Detected when the text both talks about formatting and itself contains markdown
 * structure (headers, bullets or numbered items).
 *
 * @param {string} text - Judgment text to inspect
 * @returns {{ detected: boolean, score: number, evidence: object }} `score` is 0.6
 *   whenever formatting is mentioned, even if `detected` is false
 */
function detectFormattingBias(text) {
  const hasMarkdown = /#{1,6}\s|^\*\s|^-\s|^\d+\.\s/m.test(text);
  const hasLists = (text.match(/^\s*[-*]\s/gm) || []).length > 3;
  const hasHeaders = (text.match(/^#{1,6}\s/gm) || []).length > 2;

  // Check if judgment mentions formatting. Anchored to the start of a word (with an
  // optional un-/re-/dis- prefix) so that words which merely contain a keyword, such
  // as "information" or "infrastructure", are not counted.
  const mentionsFormatting = /\b(?:un|re|dis)?(?:format|structure|organized|markdown)/i.test(text);

  return {
    detected: mentionsFormatting && (hasMarkdown || hasLists || hasHeaders),
    score: mentionsFormatting ? 0.6 : 0,
    evidence: {
      hasMarkdown,
      hasLists,
      hasHeaders,
      mentionsFormatting
    }
  };
}
131
+
132
/**
 * Detect authority bias (favoring authoritative-sounding responses)
 *
 * Counts case-insensitive occurrences of a fixed list of appeal-to-authority
 * phrases; more than two occurrences count as detected.
 *
 * @param {string} text - Judgment text to inspect
 * @returns {{ detected: boolean, score: number, evidence: { authorityPhrasesFound: number } }}
 *   `score` grows with the number of occurrences and is capped at 1.0
 */
function detectAuthorityBias(text) {
  const authorityPhrases = [
    'according to', 'research shows', 'studies indicate', 'experts say',
    'it is well-known', 'commonly accepted', 'standard practice',
    'best practice', 'industry standard', 'widely recognized'
  ];

  let occurrences = 0;
  for (const phrase of authorityPhrases) {
    const hits = text.match(new RegExp(phrase, 'gi'));
    if (hits) {
      occurrences += hits.length;
    }
  }

  const evidence = { authorityPhrasesFound: occurrences };
  return {
    detected: occurrences > 2,
    score: Math.min(1.0, occurrences / 5),
    evidence
  };
}
155
+
156
/**
 * Find repetitive phrases in text
 *
 * Counts every 3-word window of the lower-cased text in a single pass and returns
 * the windows seen more than twice. Words are whatever is separated by whitespace,
 * so repeats are found regardless of how the words are spaced or wrapped, and a
 * phrase never matches inside longer words. Overlapping repeats are counted
 * individually (e.g. "no no no no no" contains "no no no" three times).
 *
 * @param {string} text - Text to scan
 * @returns {Array<{ phrase: string, count: number }>} At most 5 phrases, most frequent
 *   first; ties keep their order of first appearance
 */
function findRepetitivePhrases(text) {
  const words = text.toLowerCase().split(/\s+/).filter(Boolean);
  const occurrences = new Map();

  for (let i = 0; i + 3 <= words.length; i++) {
    const phrase = `${words[i]} ${words[i + 1]} ${words[i + 2]}`;
    occurrences.set(phrase, (occurrences.get(phrase) || 0) + 1);
  }

  // Map iteration follows insertion order, and Array.prototype.sort is stable, so
  // equally frequent phrases stay in order of first appearance.
  return [...occurrences]
    .filter(([, count]) => count > 2)
    .map(([phrase, count]) => ({ phrase, count }))
    .sort((a, b) => b.count - a.count)
    .slice(0, 5);
}
177
+
178
/**
 * Calculate overall bias severity
 *
 * Averages the `score` of every detected bias (a missing score counts as 0) and
 * buckets the mean into a label.
 *
 * @param {Array<{ score?: number }>} detectedBiases - Biases that were detected
 * @returns {'none' | 'low' | 'medium' | 'high'} 'none' for an empty list, 'high' for a
 *   mean of at least 0.7, 'medium' for at least 0.4, otherwise 'low'
 */
function calculateSeverity(detectedBiases) {
  const total = detectedBiases.length;
  if (total === 0) {
    return 'none';
  }

  let scoreSum = 0;
  for (const bias of detectedBiases) {
    scoreSum += bias.score || 0;
  }
  const meanScore = scoreSum / total;

  if (meanScore >= 0.7) {
    return 'high';
  }
  return meanScore >= 0.4 ? 'medium' : 'low';
}
190
+
191
/**
 * Generate recommendations based on detected biases
 *
 * Emits one piece of advice per known bias type present in the list, always in the
 * order verbosity, length, formatting, authority. When none of those types is
 * present a single "no significant biases" message is returned instead.
 *
 * @param {Array<{ type: string }>} detectedBiases - Biases that were detected
 * @returns {string[]} Human-readable recommendations (never empty)
 */
function generateRecommendations(detectedBiases) {
  const adviceByType = [
    ['verbosity', 'Judge may be favoring verbose responses. Focus evaluation on content quality, not length.'],
    ['length', 'Judge may be biased by response length. Use rubric to focus on substantive content.'],
    ['formatting', 'Judge may be favoring well-formatted responses. Evaluate content regardless of formatting.'],
    ['authority', 'Judge may be biased by authoritative language. Focus on factual correctness, not tone.']
  ];

  const recommendations = adviceByType
    .filter(([biasType]) => detectedBiases.some(entry => entry.type === biasType))
    .map(([, advice]) => advice);

  if (recommendations.length > 0) {
    return recommendations;
  }
  return ['No significant biases detected. Consider using ensemble judging for high-stakes evaluations.'];
}
219
+
220
/**
 * Detect position bias in array of judgments
 *
 * Research: arXiv:2406.07791 introduces three metrics:
 * - Repetition Stability (RS): Consistency across repeated evaluations (threshold: RS > 0.85)
 * - Position Consistency (PC): Ratio of consistent judgments when order is swapped
 * - Preference Fairness (PF): Extent to which judges favor specific positions
 *   - PF = 0: no bias
 *   - PF > 0: recency bias (favoring later positions)
 *   - PF < 0: primacy bias (favoring earlier positions)
 *
 * Key findings from research:
 * - Quality gap strongly affects bias: parabolic relationship where PC increases with quality gap
 * - Small quality gaps (δ_q ≈ 0.5) cause maximum confusion and position bias
 * - Large quality gaps (δ_q → 0 or 1) lead to more consistent, fair judgments
 * - Judge-level factors: model family, context window, max output length
 * - Candidate-level factors: quality gap is primary, but also verbosity/length bias
 * - Task-level factors: input/output length, prompt length, benchmark difficulty
 *
 * What this implementation does:
 * - Entries that are not objects, or whose score is null/undefined, are ignored.
 * - The first/last usable score is flagged when it differs from the average of the
 *   scores in between by more than 2 points (with only two scores, the average of
 *   those two is used as the reference).
 * - When `qualityGap` is not supplied it is estimated from the score spread on an
 *   assumed 0-10 scale: identical scores map to 0.5 (a tie) and the widest spread
 *   maps to 1 (a large gap). The direction of the gap is not estimated.
 * - PC and PF are simplified approximations computed on the filtered score lists, so
 *   positions only line up when both lists have scores in the same slots.
 *
 * @param {Array<{ score: number | null }>} judgments - Array of judgment results with scores
 * @param {{
 *   calculateMetrics?: boolean;
 *   swappedJudgments?: Array<{ score: number | null }>;
 *   qualityGap?: number; // Quality gap (δ_q) between candidates (0-1, where 0.5 = tie)
 *   judgeModel?: string; // Judge model identifier for judge-level analysis
 *   taskMetadata?: { inputLength?: number; outputLength?: number; promptLength?: number };
 * }} [options={}] - Options for calculating research metrics
 * @returns {import('./index.mjs').PositionBiasResult} Position bias detection result.
 *   With fewer than two usable scores only `{ detected: false, reason }` is returned.
 */
export function detectPositionBias(judgments, options = {}) {
  const {
    calculateMetrics = false,
    swappedJudgments = null,
    qualityGap = null,
    judgeModel = null,
    taskMetadata = {}
  } = options || {};

  if (!Array.isArray(judgments) || judgments.length < 2) {
    return { detected: false, reason: 'Need at least 2 judgments to detect position bias' };
  }

  const scores = collectPositionScores(judgments);

  if (scores.length < 2) {
    return { detected: false, reason: 'Not enough scores to detect position bias' };
  }

  // Compare the first and last scores against the scores in between
  const firstScore = scores[0];
  const lastScore = scores[scores.length - 1];
  const middleScores = scores.slice(1, -1);
  const avgMiddle = middleScores.length > 0
    ? middleScores.reduce((a, b) => a + b, 0) / middleScores.length
    : (firstScore + lastScore) / 2;

  const firstBias = Math.abs(firstScore - avgMiddle) > 2;
  const lastBias = Math.abs(lastScore - avgMiddle) > 2;

  // Estimate the quality gap from the score spread when it was not provided.
  // δ_q convention: 0.5 = tie, 0 or 1 = large gap. No spread therefore maps to 0.5
  // and the full assumed range maps to 1.
  let estimatedQualityGap = qualityGap;
  if (estimatedQualityGap === null) {
    const scoreRange = Math.max(...scores) - Math.min(...scores);
    const maxPossibleRange = 10; // Assuming 0-10 scale
    const normalizedRange = Math.min(1, scoreRange / maxPossibleRange);
    estimatedQualityGap = 0.5 + normalizedRange / 2;
  }

  // Research finding: small quality gaps (≈0.5) cause maximum position bias.
  // Severity is symmetric around the tie point: gaps close to either 0 or 1 are
  // decisive and therefore low risk.
  const distanceFromTie = Math.abs(estimatedQualityGap - 0.5);
  const isEquivocalCase = distanceFromTie < 0.1;
  let qualityGapSeverity = 'medium';
  if (isEquivocalCase) {
    qualityGapSeverity = 'high';
  } else if (distanceFromTie > 0.3) {
    qualityGapSeverity = 'low';
  }

  const result = {
    detected: firstBias || lastBias,
    firstBias,
    lastBias,
    evidence: {
      firstScore,
      lastScore,
      avgMiddle,
      allScores: scores
    },
    // Research-based quality gap analysis
    qualityGap: {
      value: estimatedQualityGap,
      severity: qualityGapSeverity,
      isEquivocal: isEquivocalCase,
      note: isEquivocalCase
        ? 'Equivocal case (quality gap ≈0.5) - maximum position bias risk per arXiv:2406.07791'
        : 'Quality gap analysis per research findings'
    },
    // Judge-level and task-level factors
    factors: {
      judgeModel: judgeModel || 'unknown',
      taskMetadata: taskMetadata || {},
      note: 'Judge-level and task-level factors influence bias per research'
    }
  };

  // Calculate research metrics if requested and swapped judgments provided
  if (calculateMetrics && Array.isArray(swappedJudgments) && swappedJudgments.length === judgments.length) {
    const swappedScores = collectPositionScores(swappedJudgments);

    if (swappedScores.length === scores.length) {
      // Position Consistency (PC): Ratio of consistent judgments when order is swapped
      let consistentCount = 0;
      for (let i = 0; i < scores.length; i++) {
        const original = scores[i];
        const swapped = swappedScores[swappedScores.length - 1 - i]; // Reversed order
        // Consider consistent if difference is small (within 1 point)
        if (Math.abs(original - swapped) <= 1.0) {
          consistentCount++;
        }
      }
      result.positionConsistency = consistentCount / scores.length;

      // Preference Fairness (PF): Measure of position preference
      // Simplified: share of the other positions that scored below the first/last one
      const others = scores.length - 1;
      const belowFirst = scores.filter((s, i) => i !== 0 && s < firstScore).length;
      const belowLast = scores.filter((s, i) => i !== others && s < lastScore).length;
      result.preferenceFairness = {
        firstPositionPreference: belowFirst / others,
        lastPositionPreference: belowLast / others
      };

      // Note: Repetition Stability (RS) requires multiple evaluation runs with same order
      // This would need to be calculated externally with repeated evaluations
      result.metrics = {
        positionConsistency: result.positionConsistency,
        preferenceFairness: result.preferenceFairness,
        note: 'Repetition Stability (RS) requires multiple runs - calculate externally'
      };
    }
  }

  return result;
}

/**
 * Collect the usable scores of a judgment list, in order. Entries that are not
 * objects (including null, for which `typeof` also reports 'object') and entries
 * without a score are skipped.
 */
function collectPositionScores(items) {
  const collected = [];
  for (const item of items) {
    if (item === null || typeof item !== 'object') {
      continue;
    }
    if (item.score !== null && item.score !== undefined) {
      collected.push(item.score);
    }
  }
  return collected;
}
370
+
@@ -0,0 +1,233 @@
1
+ /**
2
+ * Active Bias Mitigation
3
+ *
4
+ * Uses bias detection to actively adjust scores and mitigate biases.
5
+ * Research shows active mitigation is more effective than detection alone.
6
+ *
7
+ * Based on research findings that counter-balancing and active score adjustment
8
+ * can effectively eliminate position bias and other evaluation biases.
9
+ *
10
+ * Research:
11
+ * - Position bias: Systematic study (arXiv:2406.07791) - Position bias not random, varies by judge/task
12
+ * - Counter-balancing: Effective elimination method (arXiv:2508.02020)
13
+ * - Verbosity bias: Length alignment reduces bias (arXiv:2407.01085 - AdapAlpaca)
14
+ */
15
+
16
+ import { detectBias, detectPositionBias } from './bias-detector.mjs';
17
+
18
/**
 * Score adjustment rules per bias type. The adjustment applied for a detected bias
 * is `weight * bias.score`; `reason` (and `researchNote`, when present) are copied
 * into the adjustment record.
 *
 * Verbosity - research: arXiv:2310.10076, arXiv:2407.01085.
 * LLMs prefer longer answers more than humans. AdapAlpaca (arXiv:2407.01085)
 * proposes length alignment for fair comparison by decomposing preference into
 * desirability (length-independent) and information mass (length-dependent).
 *
 * IMPORTANT: This is a SIMPLIFIED mitigation. We do NOT implement AdapAlpaca's
 * full length alignment method or desirability/information mass decomposition.
 * Full implementation would:
 * - Align lengths of reference and test responses under equivalent length intervals
 * - Decompose preference into desirability (length-independent) and information mass
 * - Normalize response lengths before comparison
 *
 * Current implementation: Simple score reduction based on verbosity detection.
 * This is NOT the AdapAlpaca method, just a simplified approximation.
 */
const BIAS_SCORE_RULES = {
  // Verbosity bias: reduce score if reasoning is too verbose
  verbosity: {
    weight: -0.5,
    reason: 'Reduced score due to verbosity bias (research: arXiv:2310.10076, 2407.01085). Full AdapAlpaca would align lengths under equivalent intervals.',
    researchNote: 'AdapAlpaca decomposes win rate into desirability (length-independent) and information mass (length-dependent)'
  },
  // Length bias: reduce score if length was a factor
  length: {
    weight: -0.3,
    reason: 'Reduced score due to length bias'
  },
  // Formatting bias: small reduction
  formatting: {
    weight: -0.2,
    reason: 'Reduced score due to formatting bias'
  },
  // Authority bias: reduce if overly authoritative language
  authority: {
    weight: -0.4,
    reason: 'Reduced score due to authority bias'
  }
};

/**
 * Apply bias mitigation to a judgment result
 *
 * Sums one weighted adjustment per detected bias of a known type, clamps the sum to
 * [minAdjustment, maxAdjustment], applies it to the score and clamps the score to
 * 0-10. A result without a score (null or undefined) keeps its score untouched and
 * reports a total adjustment of '0.00'.
 *
 * @param {import('./index.mjs').ValidationResult} result - Original judgment result
 * @param {import('./index.mjs').BiasDetectionResult} biasDetection - Bias detection results
 * @param {{
 *   adjustScores?: boolean;
 *   adjustIssues?: boolean;
 *   minAdjustment?: number;
 *   maxAdjustment?: number;
 * }} [options={}] - Mitigation options. `adjustIssues` is accepted but has no effect:
 *   this function never reads it.
 * @returns {import('./index.mjs').ValidationResult} Adjusted result: a copy of `result`
 *   with `biasMitigation` added and, when bias was detected, `score`/`originalScore` set.
 *   Adjustment amounts are reported as strings with two decimals.
 */
export function mitigateBias(result, biasDetection, options = {}) {
  const {
    adjustScores = true,
    minAdjustment = -2.0,
    maxAdjustment = 2.0
  } = options || {};

  if (!biasDetection || !biasDetection.hasBias) {
    return {
      ...result,
      biasMitigation: {
        applied: false,
        reason: 'No bias detected'
      }
    };
  }

  // undefined must be treated like null here; otherwise a missing score would be
  // turned into 0 by the arithmetic below.
  const hasScore = result.score !== null && result.score !== undefined;
  const adjustments = [];
  let adjustedScore = result.score;

  if (adjustScores && hasScore) {
    let totalAdjustment = 0;

    for (const bias of biasDetection.biases) {
      // Bias types without a rule contribute nothing and are not recorded.
      if (!Object.prototype.hasOwnProperty.call(BIAS_SCORE_RULES, bias.type)) {
        continue;
      }
      const rule = BIAS_SCORE_RULES[bias.type];
      // A missing or non-numeric bias score would otherwise poison the sum with NaN.
      const biasScore = typeof bias.score === 'number' && !Number.isNaN(bias.score)
        ? bias.score
        : 0;
      const adjustment = rule.weight * biasScore;

      const record = {
        type: bias.type,
        adjustment: adjustment.toFixed(2),
        reason: rule.reason
      };
      if (rule.researchNote) {
        record.researchNote = rule.researchNote;
      }
      adjustments.push(record);

      totalAdjustment += adjustment;
    }

    // Clamp adjustment
    totalAdjustment = Math.max(minAdjustment, Math.min(maxAdjustment, totalAdjustment));

    // Apply adjustment
    adjustedScore = Math.max(0, Math.min(10, (result.score || 0) + totalAdjustment));
  }

  return {
    ...result,
    score: adjustedScore,
    originalScore: result.score,
    biasMitigation: {
      applied: true,
      adjustments,
      // Reported as the effective change after both clamps.
      totalAdjustment: hasScore
        ? (adjustedScore - result.score).toFixed(2)
        : '0.00',
      detectedBiases: biasDetection.biases.map(b => b.type),
      severity: biasDetection.severity
    }
  };
}
141
+
142
/**
 * Mitigate position bias in array of judgments
 *
 * Runs detectPositionBias on the judgments. When the first and/or last scored
 * judgment is flagged, its score is moved by 1 point towards the average of the
 * middle scores reported in the detection evidence (down when it sits above that
 * average, up when it sits below), then clamped to 0-10. If the evidence carries no
 * numeric average the score is lowered by 1 point.
 *
 * "First" and "last" refer to the first and last judgments that actually have a
 * score, matching the detector, which skips unscored entries. Judgments without a
 * score (null or undefined) are returned unchanged.
 *
 * @param {Array<import('./index.mjs').ValidationResult>} judgments - Array of judgment results
 * @param {{
 *   randomizeOrder?: boolean;
 *   adjustScores?: boolean;
 * }} [options={}] - Mitigation options. `randomizeOrder` is accepted but has no effect:
 *   shuffling would have to happen before evaluation, so this function never reads it.
 * @returns {Array<import('./index.mjs').ValidationResult>} Adjusted judgments. The input
 *   array itself is returned when bias was detected but `adjustScores` is false.
 */
export function mitigatePositionBias(judgments, options = {}) {
  const { adjustScores = true } = options || {};

  // Detect position bias
  const positionBias = detectPositionBias(judgments);

  if (!positionBias.detected) {
    return judgments.map(j => ({
      ...j,
      biasMitigation: {
        applied: false,
        reason: 'No position bias detected'
      }
    }));
  }

  if (!adjustScores) {
    return judgments;
  }

  const isScored = (judgment) =>
    judgment !== null &&
    typeof judgment === 'object' &&
    judgment.score !== null &&
    judgment.score !== undefined;

  // Positions of the judgments the detector treated as first and last.
  const firstScoredIndex = judgments.findIndex(isScored);
  let lastScoredIndex = -1;
  for (let i = judgments.length - 1; i >= 0; i--) {
    if (isScored(judgments[i])) {
      lastScoredIndex = i;
      break;
    }
  }

  const evidence = positionBias.evidence || {};
  const hasReference = typeof evidence.avgMiddle === 'number' && !Number.isNaN(evidence.avgMiddle);
  // A flagged score differs from the reference by more than 2 points, so a 1 point
  // step towards it can never overshoot.
  const stepTowardsReference = (score) =>
    (hasReference && score < evidence.avgMiddle ? 1.0 : -1.0);

  return judgments.map((judgment, index) => {
    if (!isScored(judgment)) return judgment;

    const correctsFirst = Boolean(positionBias.firstBias) && index === firstScoredIndex;
    const correctsLast = Boolean(positionBias.lastBias) && index === lastScoredIndex;
    const adjustment = correctsFirst || correctsLast
      ? stepTowardsReference(judgment.score)
      : 0;

    const adjustedScore = Math.max(0, Math.min(10, (judgment.score || 0) + adjustment));

    let reason = 'No adjustment needed';
    if (correctsFirst) {
      reason = 'Reduced first position bias';
    } else if (correctsLast) {
      reason = 'Reduced last position bias';
    }

    return {
      ...judgment,
      score: adjustedScore,
      originalScore: judgment.score,
      biasMitigation: {
        applied: true,
        type: 'position',
        adjustment: adjustment.toFixed(2),
        reason
      }
    };
  });
}
211
+
212
/**
 * Apply comprehensive bias mitigation to judgment
 *
 * Convenience wrapper: detects biases in the reasoning text with every detector
 * switched on, then hands the result to mitigateBias.
 *
 * @param {import('./index.mjs').ValidationResult} result - Judgment result
 * @param {string} reasoning - Reasoning text for bias detection; when falsy,
 *   `result.reasoning` is used, and an empty string if that is falsy as well
 * @param {import('./index.mjs').BiasMitigationOptions} [options={}] - Mitigation options,
 *   passed through to mitigateBias unchanged
 * @returns {import('./index.mjs').ValidationResult} Mitigated result
 */
export function applyBiasMitigation(result, reasoning, options = {}) {
  // Pick the text to analyse: explicit reasoning wins over the one stored on the result
  const textToInspect = reasoning || result.reasoning || '';

  const detectorFlags = {
    checkVerbosity: true,
    checkLength: true,
    checkFormatting: true,
    checkAuthority: true
  };
  const detection = detectBias(textToInspect, detectorFlags);

  return mitigateBias(result, detection, options);
}
233
+