@arclabs561/ai-visual-test 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93) hide show
  1. package/.secretsignore.example +20 -0
  2. package/CHANGELOG.md +360 -0
  3. package/CONTRIBUTING.md +63 -0
  4. package/DEPLOYMENT.md +80 -0
  5. package/LICENSE +22 -0
  6. package/README.md +142 -0
  7. package/SECURITY.md +108 -0
  8. package/api/health.js +34 -0
  9. package/api/validate.js +252 -0
  10. package/index.d.ts +1221 -0
  11. package/package.json +112 -0
  12. package/public/index.html +149 -0
  13. package/src/batch-optimizer.mjs +451 -0
  14. package/src/bias-detector.mjs +370 -0
  15. package/src/bias-mitigation.mjs +233 -0
  16. package/src/cache.mjs +433 -0
  17. package/src/config.mjs +268 -0
  18. package/src/constants.mjs +80 -0
  19. package/src/context-compressor.mjs +350 -0
  20. package/src/convenience.mjs +617 -0
  21. package/src/cost-tracker.mjs +257 -0
  22. package/src/cross-modal-consistency.mjs +170 -0
  23. package/src/data-extractor.mjs +232 -0
  24. package/src/dynamic-few-shot.mjs +140 -0
  25. package/src/dynamic-prompts.mjs +361 -0
  26. package/src/ensemble/index.mjs +53 -0
  27. package/src/ensemble-judge.mjs +366 -0
  28. package/src/error-handler.mjs +67 -0
  29. package/src/errors.mjs +167 -0
  30. package/src/experience-propagation.mjs +128 -0
  31. package/src/experience-tracer.mjs +487 -0
  32. package/src/explanation-manager.mjs +299 -0
  33. package/src/feedback-aggregator.mjs +248 -0
  34. package/src/game-goal-prompts.mjs +478 -0
  35. package/src/game-player.mjs +548 -0
  36. package/src/hallucination-detector.mjs +155 -0
  37. package/src/helpers/playwright.mjs +80 -0
  38. package/src/human-validation-manager.mjs +516 -0
  39. package/src/index.mjs +364 -0
  40. package/src/judge.mjs +929 -0
  41. package/src/latency-aware-batch-optimizer.mjs +192 -0
  42. package/src/load-env.mjs +159 -0
  43. package/src/logger.mjs +55 -0
  44. package/src/metrics.mjs +187 -0
  45. package/src/model-tier-selector.mjs +221 -0
  46. package/src/multi-modal/index.mjs +36 -0
  47. package/src/multi-modal-fusion.mjs +190 -0
  48. package/src/multi-modal.mjs +524 -0
  49. package/src/natural-language-specs.mjs +1071 -0
  50. package/src/pair-comparison.mjs +277 -0
  51. package/src/persona/index.mjs +42 -0
  52. package/src/persona-enhanced.mjs +200 -0
  53. package/src/persona-experience.mjs +572 -0
  54. package/src/position-counterbalance.mjs +140 -0
  55. package/src/prompt-composer.mjs +375 -0
  56. package/src/render-change-detector.mjs +583 -0
  57. package/src/research-enhanced-validation.mjs +436 -0
  58. package/src/retry.mjs +152 -0
  59. package/src/rubrics.mjs +231 -0
  60. package/src/score-tracker.mjs +277 -0
  61. package/src/smart-validator.mjs +447 -0
  62. package/src/spec-config.mjs +106 -0
  63. package/src/spec-templates.mjs +347 -0
  64. package/src/specs/index.mjs +38 -0
  65. package/src/temporal/index.mjs +102 -0
  66. package/src/temporal-adaptive.mjs +163 -0
  67. package/src/temporal-batch-optimizer.mjs +222 -0
  68. package/src/temporal-constants.mjs +69 -0
  69. package/src/temporal-context.mjs +49 -0
  70. package/src/temporal-decision-manager.mjs +271 -0
  71. package/src/temporal-decision.mjs +669 -0
  72. package/src/temporal-errors.mjs +58 -0
  73. package/src/temporal-note-pruner.mjs +173 -0
  74. package/src/temporal-preprocessor.mjs +543 -0
  75. package/src/temporal-prompt-formatter.mjs +219 -0
  76. package/src/temporal-validation.mjs +159 -0
  77. package/src/temporal.mjs +415 -0
  78. package/src/type-guards.mjs +311 -0
  79. package/src/uncertainty-reducer.mjs +470 -0
  80. package/src/utils/index.mjs +175 -0
  81. package/src/validation-framework.mjs +321 -0
  82. package/src/validation-result-normalizer.mjs +64 -0
  83. package/src/validation.mjs +243 -0
  84. package/src/validators/accessibility-programmatic.mjs +345 -0
  85. package/src/validators/accessibility-validator.mjs +223 -0
  86. package/src/validators/batch-validator.mjs +143 -0
  87. package/src/validators/hybrid-validator.mjs +268 -0
  88. package/src/validators/index.mjs +34 -0
  89. package/src/validators/prompt-builder.mjs +218 -0
  90. package/src/validators/rubric.mjs +85 -0
  91. package/src/validators/state-programmatic.mjs +260 -0
  92. package/src/validators/state-validator.mjs +291 -0
  93. package/vercel.json +27 -0
@@ -0,0 +1,470 @@
1
+ /**
2
+ * Uncertainty Reduction for VLLM API Calls
3
+ *
4
+ * Research-backed strategies to reduce uncertainty in VLLM judgments:
5
+ * - Multiple API calls (self-consistency, ensemble)
6
+ * - Logprob analysis (token-level confidence)
7
+ * - Hallucination detection
8
+ * - Confidence calibration
9
+ *
10
+ * Research: Self-consistency improves accuracy by 5-15% (arXiv:2203.11171)
11
+ * Research: Ensemble methods reduce uncertainty (arXiv:2305.10429)
12
+ */
13
+
14
+ import { detectHallucination } from './hallucination-detector.mjs';
15
+ import { log, warn } from './logger.mjs';
16
+
17
/**
 * Estimate uncertainty from logprobs
 *
 * Two input shapes are recognised:
 * - An object carrying a `token_logprobs` array (OpenAI completion format).
 *   Entries that are not numbers (null, undefined, strings, ...) or are NaN
 *   are ignored so a single bad entry cannot turn the estimate into NaN.
 * - Any other object or array: every numeric leaf found by
 *   extractNumericValues is averaged as if it were a logprob.
 *
 * Uncertainty is `1 - exp(averageLogprob)` clamped to [0, 1]; confidence is
 * its complement.
 *
 * @param {any} logprobs - Logprobs from API response
 * @returns {Object} Uncertainty estimate `{ uncertainty, confidence, method }`.
 *   Method `'logprobs'` additionally reports `avgLogprob`, `avgProb`,
 *   `minProb`, `variance` and `tokenCount`; method `'logprobs-gemini'`
 *   additionally reports `avgLogprob`.
 */
export function estimateUncertainty(logprobs) {
  if (!logprobs) {
    return { uncertainty: 0.5, confidence: 0.5, method: 'default' };
  }

  const isUsableLogprob = (value) => typeof value === 'number' && !Number.isNaN(value);
  const clamp01 = (value) => Math.max(0, Math.min(1, value));

  // OpenAI format: { tokens: [...], token_logprobs: [...] }
  if (Array.isArray(logprobs.token_logprobs)) {
    const valid = logprobs.token_logprobs.filter(isUsableLogprob);
    if (valid.length === 0) {
      return { uncertainty: 0.5, confidence: 0.5, method: 'no-logprobs' };
    }

    const avgLogprob = valid.reduce((a, b) => a + b, 0) / valid.length;
    // reduce instead of Math.min(...valid): spreading a very long token list
    // can exceed the engine's argument limit.
    const minLogprob = valid.reduce((lowest, p) => Math.min(lowest, p), Infinity);
    const variance = valid.reduce((sum, p) => sum + Math.pow(p - avgLogprob, 2), 0) / valid.length;

    // Convert logprob to probability: exp(logprob)
    const avgProb = Math.exp(avgLogprob);
    const minProb = Math.exp(minLogprob);

    // Low logprob (more negative) = high uncertainty. Clamping covers both
    // very negative averages and (invalid) positive ones.
    const uncertainty = clamp01(1.0 - avgProb);
    const confidence = 1.0 - uncertainty;

    return {
      uncertainty,
      confidence,
      method: 'logprobs',
      avgLogprob,
      avgProb,
      minProb,
      variance,
      tokenCount: valid.length
    };
  }

  // Gemini format: varies, may be nested
  if (typeof logprobs === 'object') {
    // Try to extract any numeric logprob values; NaN leaves are dropped so
    // they cannot poison the average.
    const values = extractNumericValues(logprobs).filter(isUsableLogprob);
    if (values.length > 0) {
      const avg = values.reduce((a, b) => a + b, 0) / values.length;
      const uncertainty = clamp01(1.0 - Math.exp(avg));
      return {
        uncertainty,
        confidence: 1.0 - uncertainty,
        method: 'logprobs-gemini',
        avgLogprob: avg
      };
    }
  }

  return { uncertainty: 0.5, confidence: 0.5, method: 'unknown-format' };
}
83
+
84
/**
 * Collect every numeric leaf reachable from a value.
 *
 * Arrays and plain objects are walked recursively; each level of nesting
 * increases `depth` by one and anything deeper than `maxDepth` is ignored.
 * Non-numeric leaves (strings, booleans, null, undefined) contribute nothing.
 *
 * @param {any} obj - Value to inspect
 * @param {number} [maxDepth=3] - Deepest nesting level that is still visited
 * @param {number} [depth=0] - Current nesting level (used by the recursion)
 * @returns {number[]} Numbers found, in traversal order
 */
function extractNumericValues(obj, maxDepth = 3, depth = 0) {
  if (depth > maxDepth) {
    return [];
  }

  if (typeof obj === 'number') {
    return [obj];
  }

  if (obj === null || typeof obj !== 'object') {
    return [];
  }

  const children = Array.isArray(obj) ? obj : Object.values(obj);
  return children.flatMap(child => extractNumericValues(child, maxDepth, depth + 1));
}
101
+
102
/**
 * Self-consistency check: Multiple API calls with same prompt
 *
 * Calls `judgeFn` sequentially until `n` usable results have been collected or
 * `maxCalls` attempts have been made. A result is usable only when its `score`
 * is a finite number; results with a missing, null, NaN or non-numeric score
 * are discarded so they cannot corrupt the mean. Calls that throw are logged
 * with `warn` and count as an attempt.
 *
 * Research: Self-consistency improves accuracy by 5-15% (arXiv:2203.11171)
 *
 * @param {Function} judgeFn - Function to call judge API; resolves to an object with a numeric `score`
 * @param {number} [n=3] - Number of usable results to collect
 * @param {Object} [options={}] - Options
 * @param {number} [options.maxCalls=5] - Maximum calls before giving up
 * @param {number} [options.baselineScore] - Single-call score to compare the mean against
 * @param {number} [options.maxScale=10] - Scale maximum, used to express the improvement when the baseline is not positive
 * @returns {Promise<Object>} Aggregated result with consistency metrics. When no
 *   usable result was obtained: `{ score: null, uncertainty: 1, confidence: 0,
 *   consistency: 0, method: 'self-consistency-failed' }`.
 */
export async function selfConsistencyCheck(judgeFn, n = 3, options = {}) {
  // NOTE: options.minAgreement was destructured here previously but never consulted.
  const { maxCalls = 5 } = options;

  const results = [];
  let attempts = 0;

  // Make multiple calls
  while (results.length < n && attempts < maxCalls) {
    attempts++;
    try {
      const result = await judgeFn();
      if (result && typeof result.score === 'number' && Number.isFinite(result.score)) {
        results.push(result);
      }
    } catch (error) {
      // Guard against non-Error throws (including null/undefined).
      const reason = error && error.message ? error.message : String(error);
      warn(`[Uncertainty] Self-consistency call ${attempts} failed: ${reason}`);
    }
  }

  if (results.length === 0) {
    return {
      score: null,
      uncertainty: 1.0,
      confidence: 0.0,
      consistency: 0.0,
      method: 'self-consistency-failed'
    };
  }

  // Every collected result has a finite numeric score (checked above).
  const scores = results.map(r => r.score);

  // Mean score
  const meanScore = scores.reduce((a, b) => a + b, 0) / scores.length;

  // Standard deviation (measure of consistency)
  const variance = scores.reduce((sum, s) => sum + Math.pow(s - meanScore, 2), 0) / scores.length;
  const stdDev = Math.sqrt(variance);

  // Consistency: inverse of coefficient of variation
  // Lower stdDev relative to mean = higher consistency
  const consistency = meanScore > 0
    ? Math.max(0, Math.min(1, 1.0 - (stdDev / meanScore)))
    : stdDev < 1.0 ? 1.0 - stdDev : 0.0;

  // Uncertainty: inverse of consistency
  const uncertainty = 1.0 - consistency;

  // Confidence: weighted by consistency and number of calls
  const confidence = consistency * Math.min(1.0, results.length / n);

  // Improvement metrics are only computed when a baseline is supplied.
  let improvementMetrics = null;
  if (options.baselineScore !== undefined && options.baselineScore !== null) {
    const scoreImprovement = meanScore - options.baselineScore;
    // A relative percentage is undefined for a zero baseline, so in that case
    // (and for non-positive baselines generally) normalise against the scale
    // maximum instead.
    const maxScale = options.maxScale || 10;
    const improvementPercent = options.baselineScore > 0
      ? (scoreImprovement / options.baselineScore) * 100
      : (scoreImprovement / maxScale) * 100;

    improvementMetrics = {
      baselineScore: options.baselineScore,
      improvedScore: meanScore,
      improvement: scoreImprovement,
      improvementPercent,
      // Research claim: 5-15% improvement
      meetsResearchClaim: improvementPercent >= 5 && improvementPercent <= 15
    };

    if (improvementPercent >= 5) {
      log(`[SelfConsistency] Accuracy improvement: ${improvementPercent.toFixed(1)}% (${options.baselineScore.toFixed(1)} → ${meanScore.toFixed(1)})`);
    }
  }

  return {
    score: Math.round(meanScore * 10) / 10, // Round to 1 decimal
    uncertainty: Math.max(0, Math.min(1, uncertainty)),
    confidence: Math.max(0, Math.min(1, confidence)),
    consistency: Math.max(0, Math.min(1, consistency)),
    method: 'self-consistency',
    calls: results.length,
    stdDev,
    scores,
    results,
    improvementMetrics
  };
}
218
+
219
/**
 * Ensemble uncertainty reduction
 *
 * Combines the available uncertainty sources into one weighted average:
 * - Logprob-based uncertainty (weight 0.4)
 * - Self-consistency uncertainty (weight 0.4)
 * - Hallucination detection (weight 0.2)
 * - Retry count, only when greater than 1 (weight 0.1)
 *
 * Weights are renormalised over the sources actually present. A source whose
 * numbers are missing or NaN is skipped rather than turning the whole result
 * into NaN.
 *
 * @param {Object} [sources] - Uncertainty sources
 * @param {any} [sources.logprobs] - Raw logprobs, or an estimate object that
 *   already has numeric `uncertainty` and `confidence` (used as-is)
 * @param {Object} [sources.selfConsistency] - Object with `uncertainty`/`confidence`,
 *   falling back to `consistency` when either is absent
 * @param {Object} [sources.hallucination] - Object with a numeric `confidence`
 * @param {number} [sources.retryCount=1] - Number of attempts made
 * @returns {Object} Combined uncertainty estimate `{ uncertainty, confidence, method }`;
 *   method `'ensemble'` also carries `sources` and `breakdown`
 */
export function combineUncertaintySources(sources) {
  const {
    logprobs = null,
    selfConsistency = null,
    hallucination = null,
    retryCount = 1
  } = sources || {};

  const isNumber = (value) => typeof value === 'number' && !Number.isNaN(value);
  const estimates = [];

  // 1. Logprob-based uncertainty
  if (logprobs) {
    // An object that already carries uncertainty/confidence is a finished
    // estimate; feeding it back through estimateUncertainty would average its
    // unrelated numeric fields as if they were logprobs.
    const alreadyEstimated = isNumber(logprobs.uncertainty) && isNumber(logprobs.confidence);
    const logprobEst = alreadyEstimated ? logprobs : estimateUncertainty(logprobs);
    estimates.push({
      uncertainty: logprobEst.uncertainty,
      confidence: logprobEst.confidence,
      weight: 0.4,
      source: 'logprobs'
    });
  }

  // 2. Self-consistency uncertainty
  if (selfConsistency) {
    const { consistency } = selfConsistency;
    const hasConsistency = isNumber(consistency);
    // Explicit number checks (not `||`) so a legitimate 0 is kept.
    let scUncertainty = null;
    if (isNumber(selfConsistency.uncertainty)) {
      scUncertainty = selfConsistency.uncertainty;
    } else if (hasConsistency) {
      scUncertainty = 1.0 - consistency;
    }
    let scConfidence = null;
    if (isNumber(selfConsistency.confidence)) {
      scConfidence = selfConsistency.confidence;
    } else if (hasConsistency) {
      scConfidence = consistency;
    }
    if (scUncertainty !== null && scConfidence !== null) {
      estimates.push({
        uncertainty: scUncertainty,
        confidence: scConfidence,
        weight: 0.4,
        source: 'self-consistency'
      });
    }
  }

  // 3. Hallucination detection
  if (hallucination && isNumber(hallucination.confidence)) {
    estimates.push({
      uncertainty: 1.0 - hallucination.confidence,
      confidence: hallucination.confidence,
      weight: 0.2,
      source: 'hallucination'
    });
  }

  // 4. Retry count (more retries = higher uncertainty, capped at 0.3)
  if (retryCount > 1) {
    estimates.push({
      uncertainty: Math.min(0.3, (retryCount - 1) * 0.1),
      confidence: Math.max(0.7, 1.0 - (retryCount - 1) * 0.1),
      weight: 0.1,
      source: 'retries'
    });
  }

  if (estimates.length === 0) {
    return { uncertainty: 0.5, confidence: 0.5, method: 'default' };
  }

  // Weighted average
  const totalWeight = estimates.reduce((sum, e) => sum + e.weight, 0);
  const weightedUncertainty = estimates.reduce((sum, e) => sum + (e.uncertainty * e.weight), 0) / totalWeight;
  const weightedConfidence = estimates.reduce((sum, e) => sum + (e.confidence * e.weight), 0) / totalWeight;

  return {
    uncertainty: Math.max(0, Math.min(1, weightedUncertainty)),
    confidence: Math.max(0, Math.min(1, weightedConfidence)),
    method: 'ensemble',
    sources: estimates.map(e => e.source),
    breakdown: estimates
  };
}
298
+
299
/**
 * Decide whether a validation warrants self-consistency (repeated judge calls).
 *
 * Rules are checked in order and the first match wins:
 * 1. Test type 'expert-evaluation' or 'medical'              -> 5 calls
 * 2. Importance 'critical' together with impact 'blocks-use' -> 5 calls
 * 3. Score at or below 3, or at or above 9                   -> 3 calls
 * 4. Uncertainty above 0.3                                   -> 3 calls
 * 5. Five or more issues (possible over-detection)           -> 3 calls
 * 6. Importance 'high' together with impact 'degrades-experience' -> 3 calls
 * Otherwise self-consistency is not recommended.
 *
 * @param {Object} context - Validation context (testType, importance, impact)
 * @param {Object} partialResult - Partial validation result (score, issues, uncertainty)
 * @returns {Object} { shouldUse: boolean, n: number, reason: string }
 */
export function shouldUseSelfConsistency(context = {}, partialResult = {}) {
  const { testType, importance, impact } = context;
  const { score, uncertainty, issues } = partialResult;

  // Thresholds are local to this function.
  const CRITICAL_CALLS = 5;       // calls for critical scenarios
  const EDGE_CALLS = 3;           // calls for edge cases
  const SCORE_FLOOR = 3;          // scores at or below this are "extreme low"
  const SCORE_CEILING = 9;        // scores at or above this are "extreme high"
  const UNCERTAINTY_LIMIT = 0.3;  // strictly above this counts as high uncertainty
  const ISSUE_LIMIT = 5;          // this many issues or more suggests over-detection

  const recommend = (n, reason) => ({ shouldUse: true, n, reason });

  // Critical test types
  const isCriticalType = testType === 'expert-evaluation' || testType === 'medical';
  if (isCriticalType) {
    return recommend(CRITICAL_CALLS, `Critical test type: ${testType}`);
  }

  // Critical issue that blocks use
  const isBlocking = importance === 'critical' && impact === 'blocks-use';
  if (isBlocking) {
    return recommend(CRITICAL_CALLS, 'Critical issue that blocks use');
  }

  // Extreme scores
  const isExtremeScore = score !== null && (score <= SCORE_FLOOR || score >= SCORE_CEILING);
  if (isExtremeScore) {
    return recommend(EDGE_CALLS, `Edge case score: ${score}`);
  }

  // High uncertainty
  const isUncertain = uncertainty !== null && uncertainty > UNCERTAINTY_LIMIT;
  if (isUncertain) {
    return recommend(EDGE_CALLS, `High uncertainty: ${uncertainty.toFixed(2)}`);
  }

  // Many issues (over-detection risk)
  const hasManyIssues = Array.isArray(issues) && issues.length >= ISSUE_LIMIT;
  if (hasManyIssues) {
    return recommend(EDGE_CALLS, `Many issues detected: ${issues.length} (over-detection risk)`);
  }

  // High-impact degradation
  const isDegrading = importance === 'high' && impact === 'degrades-experience';
  if (isDegrading) {
    return recommend(EDGE_CALLS, 'High-impact issue that degrades experience');
  }

  // Standard scenarios: no self-consistency
  return {
    shouldUse: false,
    n: 0,
    reason: 'Standard validation (logprobs + hallucination sufficient)'
  };
}
390
+
391
/**
 * Enhance validation result with uncertainty reduction
 *
 * Combines logprob-based uncertainty, the optional hallucination check and the
 * retry count into one estimate, and reports whether self-consistency is
 * recommended. This function never performs the self-consistency calls itself;
 * the caller decides based on the returned recommendation.
 *
 * @param {Object} partialResult - Partial validation result (judgment, logprobs, attempts, screenshotPath, score, issues, uncertainty)
 * @param {Object} [options={}] - Options
 * @param {boolean} [options.enableSelfConsistency=false] - Always recommend self-consistency (with N=3)
 * @param {boolean} [options.enableHallucinationCheck=true] - Run detectHallucination when a judgment is present
 * @param {boolean} [options.adaptiveSelfConsistency=true] - When not explicitly enabled, let shouldUseSelfConsistency decide
 * @param {Object} [context={}] - Validation context (for adaptive self-consistency)
 * @returns {Object} Uncertainty and confidence estimates: `uncertainty`,
 *   `confidence`, `uncertaintyMethod`, `uncertaintyBreakdown`, `hallucination`,
 *   `selfConsistencyRecommended`, `selfConsistencyN`, `selfConsistencyReason`
 */
export function enhanceWithUncertainty(partialResult, options = {}, context = {}) {
  const {
    enableSelfConsistency = false,
    enableHallucinationCheck = true,
    adaptiveSelfConsistency = true
  } = options;

  // Only a missing value becomes null; a legitimate 0 must survive
  // (a score of 0 is the most extreme low score there is).
  const orNull = (value) => (value === undefined || value === null ? null : value);

  // Extract uncertainty sources
  const logprobs = partialResult.logprobs || null;
  const attempts = partialResult.attempts || 1;
  const judgment = partialResult.judgment || null;
  const score = orNull(partialResult.score);
  const issues = partialResult.issues || [];
  const uncertainty = orNull(partialResult.uncertainty);

  // Determine if self-consistency should be used (adaptive strategy)
  let shouldUseSelfConsistencyValue = enableSelfConsistency;
  let selfConsistencyN = 3;
  let selfConsistencyReason = '';

  if (adaptiveSelfConsistency && !enableSelfConsistency) {
    // Check if context suggests self-consistency is warranted
    const selfConsistencyDecision = shouldUseSelfConsistency(context, {
      score,
      uncertainty,
      issues
    });
    shouldUseSelfConsistencyValue = selfConsistencyDecision.shouldUse;
    selfConsistencyN = selfConsistencyDecision.n;
    selfConsistencyReason = selfConsistencyDecision.reason;
  } else if (enableSelfConsistency) {
    selfConsistencyReason = 'Explicitly enabled';
  }

  // Check for hallucination (best effort: a failure here must not fail the
  // validation, but it is reported instead of being silently dropped)
  let hallucinationResult = null;
  if (enableHallucinationCheck && judgment) {
    try {
      hallucinationResult = detectHallucination(
        judgment,
        partialResult.screenshotPath || null,
        { logprobs }
      );
    } catch (error) {
      const reason = error && error.message ? error.message : String(error);
      warn(`[Uncertainty] Hallucination check failed: ${reason}`);
    }
  }

  // Combine uncertainty sources. The raw logprobs are passed through:
  // combineUncertaintySources runs estimateUncertainty on them itself, so
  // handing it an already-computed estimate would estimate twice.
  const combined = combineUncertaintySources({
    logprobs,
    hallucination: hallucinationResult,
    retryCount: attempts
  });

  // Return uncertainty metrics with self-consistency recommendation
  return {
    uncertainty: combined.uncertainty,
    confidence: combined.confidence,
    uncertaintyMethod: combined.method,
    uncertaintyBreakdown: combined.breakdown || null,
    hallucination: hallucinationResult,
    // Self-consistency recommendation (caller should use this if needed)
    selfConsistencyRecommended: shouldUseSelfConsistencyValue,
    selfConsistencyN,
    selfConsistencyReason
  };
}
470
+
@@ -0,0 +1,175 @@
1
+ /**
2
+ * Utils Sub-Module
3
+ *
4
+ * Utility functions, helpers, and infrastructure.
5
+ *
6
+ * Import from '@arclabs561/ai-visual-test/utils'
7
+ */
8
+
9
+ // Cache
10
+ export {
11
+ getCached,
12
+ setCached,
13
+ clearCache,
14
+ getCacheStats,
15
+ initCache,
16
+ generateCacheKey
17
+ } from '../cache.mjs';
18
+
19
+ // Config
20
+ export {
21
+ createConfig,
22
+ getProvider,
23
+ getConfig,
24
+ setConfig
25
+ } from '../config.mjs';
26
+
27
+ // Environment
28
+ export { loadEnv } from '../load-env.mjs';
29
+
30
+ // Logger
31
+ export { enableDebug, disableDebug, isDebugEnabled, warn, log, error } from '../logger.mjs';
32
+
33
+ // Errors
34
+ export {
35
+ AIBrowserTestError,
36
+ ValidationError,
37
+ CacheError,
38
+ ConfigError,
39
+ ProviderError,
40
+ TimeoutError,
41
+ FileError,
42
+ StateMismatchError,
43
+ isAIBrowserTestError,
44
+ isErrorType
45
+ } from '../errors.mjs';
46
+
47
+ // Retry
48
+ export {
49
+ retryWithBackoff,
50
+ isRetryableError,
51
+ calculateBackoff,
52
+ enhanceErrorMessage
53
+ } from '../retry.mjs';
54
+
55
+ // Cost tracking
56
+ export {
57
+ CostTracker,
58
+ getCostTracker,
59
+ recordCost,
60
+ getCostStats
61
+ } from '../cost-tracker.mjs';
62
+
63
+ // Score tracking
64
+ export { ScoreTracker } from '../score-tracker.mjs';
65
+
66
+ // Batch optimization
67
+ export { BatchOptimizer } from '../batch-optimizer.mjs';
68
+ export { LatencyAwareBatchOptimizer } from '../latency-aware-batch-optimizer.mjs';
69
+
70
+ // Data extraction
71
+ export { extractStructuredData } from '../data-extractor.mjs';
72
+
73
+ // Feedback aggregation
74
+ export { aggregateFeedback, generateRecommendations } from '../feedback-aggregator.mjs';
75
+
76
+ // Context compression
77
+ export { compressContext, compressStateHistory } from '../context-compressor.mjs';
78
+
79
+ // Metrics
80
+ export {
81
+ spearmanCorrelation,
82
+ pearsonCorrelation,
83
+ calculateRankAgreement
84
+ } from '../metrics.mjs';
85
+
86
+ // Type guards
87
+ export {
88
+ isObject,
89
+ isString,
90
+ isNumber,
91
+ isArray,
92
+ isFunction,
93
+ isPromise,
94
+ isValidationResult,
95
+ isValidationContext,
96
+ isPersona,
97
+ isTemporalNote,
98
+ assertObject,
99
+ assertString,
100
+ assertNonEmptyString,
101
+ assertNumber,
102
+ assertArray,
103
+ assertFunction,
104
+ pick,
105
+ getProperty
106
+ } from '../type-guards.mjs';
107
+
108
+ // Constants
109
+ export {
110
+ CACHE_CONSTANTS,
111
+ TEMPORAL_CONSTANTS,
112
+ API_CONSTANTS,
113
+ UNCERTAINTY_CONSTANTS,
114
+ BATCH_OPTIMIZER_CONSTANTS
115
+ } from '../constants.mjs';
116
+
117
+ // Validation result normalization
118
+ export { normalizeValidationResult } from '../validation-result-normalizer.mjs';
119
+
120
+ // Error handlers
121
+ export { initErrorHandlers } from '../error-handler.mjs';
122
+
123
+ // Uncertainty reduction
124
+ export {
125
+ estimateUncertainty,
126
+ selfConsistencyCheck,
127
+ combineUncertaintySources,
128
+ enhanceWithUncertainty,
129
+ shouldUseSelfConsistency
130
+ } from '../uncertainty-reducer.mjs';
131
+
132
+ // Dynamic few-shot
133
+ export {
134
+ selectFewShotExamples,
135
+ formatFewShotExamples
136
+ } from '../dynamic-few-shot.mjs';
137
+
138
+ // Dynamic prompts
139
+ export {
140
+ generateDynamicPrompt,
141
+ generatePromptVariations,
142
+ generateInteractionPrompt,
143
+ generateGameplayPrompt
144
+ } from '../dynamic-prompts.mjs';
145
+
146
+ // Rubrics
147
+ export {
148
+ DEFAULT_RUBRIC,
149
+ buildRubricPrompt,
150
+ getRubricForTestType
151
+ } from '../rubrics.mjs';
152
+
153
+ // Model tier selection
154
+ export {
155
+ selectModelTier,
156
+ selectProvider,
157
+ selectModelTierAndProvider
158
+ } from '../model-tier-selector.mjs';
159
+
160
+ // Smart validator
161
+ export {
162
+ validateSmart,
163
+ validateAccessibilitySmart,
164
+ validateStateSmart,
165
+ validateElementSmart,
166
+ detectValidationMethod
167
+ } from '../smart-validator.mjs';
168
+
169
+ // Human validation
170
+ export {
171
+ HumanValidationManager,
172
+ getHumanValidationManager,
173
+ initHumanValidation
174
+ } from '../human-validation-manager.mjs';
175
+