@arclabs561/ai-visual-test 0.7.3 → 0.7.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +32 -0
- package/README.md +3 -0
- package/index.d.ts +181 -3
- package/package.json +2 -6
- package/src/batch-optimizer.mjs +3 -3
- package/src/cache.mjs +3 -4
- package/src/calibration-suite.mjs +197 -0
- package/src/constants.mjs +11 -0
- package/src/cost-optimization.mjs +1 -1
- package/src/explanation-manager.mjs +10 -6
- package/src/human-validation-manager.mjs +21 -8
- package/src/index.mjs +20 -10
- package/src/integrations/playwright.mjs +9 -9
- package/src/judge.mjs +9 -18
- package/src/limitations.mjs +106 -0
- package/src/load-env.mjs +3 -2
- package/src/model-tier-selector.mjs +1 -1
- package/src/rubrics.mjs +22 -2
- package/src/score-calibration.mjs +177 -0
- package/src/temporal-decision-manager.mjs +1 -1
- package/src/temporal-preprocessor.mjs +1 -1
- package/src/type-guards.mjs +5 -5
- package/src/utils/cached-llm.mjs +1 -1
- package/src/validation-result-normalizer.mjs +17 -1
- package/src/validation.mjs +13 -13
- package/src/validators/index.mjs +23 -2
- package/src/pricing.mjs +0 -28
- package/src/utils/path-validator.mjs +0 -88
- package/src/validation-framework.mjs +0 -325
|
@@ -13,11 +13,19 @@ import { warn, log } from './logger.mjs';
|
|
|
13
13
|
import { existsSync, readFileSync, writeFileSync, mkdirSync, readdirSync } from 'fs';
|
|
14
14
|
import { join } from 'path';
|
|
15
15
|
|
|
16
|
-
// Lazy import
|
|
16
|
+
// Lazy import -- evaluation/ directory may not be present (removed from dist)
|
|
17
17
|
let humanValidationModule = null;
|
|
18
|
+
let humanValidationUnavailable = false;
|
|
18
19
|
async function getHumanValidationModule() {
|
|
20
|
+
if (humanValidationUnavailable) return null;
|
|
19
21
|
if (!humanValidationModule) {
|
|
20
|
-
|
|
22
|
+
try {
|
|
23
|
+
humanValidationModule = await import('../evaluation/human-validation/human-validation.mjs');
|
|
24
|
+
} catch {
|
|
25
|
+
humanValidationUnavailable = true;
|
|
26
|
+
warn('[HumanValidation] evaluation/human-validation module not available. Human validation features disabled.');
|
|
27
|
+
return null;
|
|
28
|
+
}
|
|
21
29
|
}
|
|
22
30
|
return humanValidationModule;
|
|
23
31
|
}
|
|
@@ -101,6 +109,7 @@ export class HumanValidationManager {
|
|
|
101
109
|
*/
|
|
102
110
|
async _saveCalibrationCache() {
|
|
103
111
|
const humanValidation = await getHumanValidationModule();
|
|
112
|
+
if (!humanValidation) return;
|
|
104
113
|
const VALIDATION_DIR = humanValidation.VALIDATION_DIR;
|
|
105
114
|
|
|
106
115
|
if (!this.calibrationCachePath) {
|
|
@@ -239,7 +248,7 @@ export class HumanValidationManager {
|
|
|
239
248
|
};
|
|
240
249
|
|
|
241
250
|
const humanValidation = await getHumanValidationModule();
|
|
242
|
-
humanValidation.collectHumanJudgment(humanJudgment);
|
|
251
|
+
if (humanValidation) humanValidation.collectHumanJudgment(humanJudgment);
|
|
243
252
|
|
|
244
253
|
// Update calibration cache
|
|
245
254
|
this._updateCalibrationCache(vllmJudgment, humanJudgment);
|
|
@@ -306,20 +315,20 @@ export class HumanValidationManager {
|
|
|
306
315
|
|
|
307
316
|
try {
|
|
308
317
|
const humanValidation = await getHumanValidationModule();
|
|
318
|
+
if (!humanValidation) return;
|
|
309
319
|
const humanJudgments = this.calibrationCache.judgments.map(j => j.human);
|
|
310
320
|
const vllmJudgments = this.calibrationCache.judgments.map(j => j.vllm);
|
|
311
|
-
|
|
321
|
+
|
|
312
322
|
const calibration = humanValidation.compareJudgments(humanJudgments, vllmJudgments);
|
|
313
|
-
|
|
323
|
+
|
|
314
324
|
this.calibrationCache.lastCalibration = {
|
|
315
325
|
...calibration,
|
|
316
326
|
timestamp: new Date().toISOString(),
|
|
317
327
|
sampleSize: this.calibrationCache.judgments.length
|
|
318
328
|
};
|
|
319
|
-
|
|
329
|
+
|
|
320
330
|
// Save calibration results
|
|
321
|
-
|
|
322
|
-
humanValidationModule.saveCalibrationResults(calibration);
|
|
331
|
+
humanValidation.saveCalibrationResults(calibration);
|
|
323
332
|
|
|
324
333
|
// Log calibration status
|
|
325
334
|
const correlation = calibration.agreement.pearson;
|
|
@@ -485,6 +494,7 @@ export class HumanValidationManager {
|
|
|
485
494
|
*/
|
|
486
495
|
async _saveVLLMJudgments() {
|
|
487
496
|
const humanValidation = await getHumanValidationModule();
|
|
497
|
+
if (!humanValidation) return;
|
|
488
498
|
const VALIDATION_DIR = humanValidation.VALIDATION_DIR;
|
|
489
499
|
|
|
490
500
|
if (!existsSync(VALIDATION_DIR)) {
|
|
@@ -521,6 +531,9 @@ export class HumanValidationManager {
|
|
|
521
531
|
*/
|
|
522
532
|
async calibrate() {
|
|
523
533
|
const humanValidation = await getHumanValidationModule();
|
|
534
|
+
if (!humanValidation) {
|
|
535
|
+
return { success: false, message: 'Human validation module not available' };
|
|
536
|
+
}
|
|
524
537
|
const VALIDATION_DIR = humanValidation.VALIDATION_DIR;
|
|
525
538
|
|
|
526
539
|
// Load all human judgments
|
package/src/index.mjs
CHANGED
|
@@ -17,16 +17,6 @@
|
|
|
17
17
|
import { loadEnv } from './load-env.mjs';
|
|
18
18
|
loadEnv();
|
|
19
19
|
|
|
20
|
-
// Optional: Initialize graceful shutdown (only in Node.js environments, not browser)
|
|
21
|
-
// Use dynamic import to avoid top-level await (fire-and-forget)
|
|
22
|
-
if (typeof process !== 'undefined' && process.env.NODE_ENV !== 'test') {
|
|
23
|
-
import('./graceful-shutdown.mjs').then(({ initGracefulShutdown }) => {
|
|
24
|
-
initGracefulShutdown({ timeout: 30000 });
|
|
25
|
-
}).catch(() => {
|
|
26
|
-
// Graceful shutdown is optional, don't fail if unavailable
|
|
27
|
-
});
|
|
28
|
-
}
|
|
29
|
-
|
|
30
20
|
import { VLLMJudge, validateScreenshot as _validateScreenshot } from './judge.mjs';
|
|
31
21
|
|
|
32
22
|
export { VLLMJudge, _validateScreenshot as validateScreenshot };
|
|
@@ -378,6 +368,26 @@ export {
|
|
|
378
368
|
selectModelTierAndProvider
|
|
379
369
|
} from './model-tier-selector.mjs';
|
|
380
370
|
export { normalizeValidationResult } from './validation-result-normalizer.mjs';
|
|
371
|
+
|
|
372
|
+
// Score calibration (per-provider bias correction, arXiv:2601.05114)
|
|
373
|
+
export {
|
|
374
|
+
calibrateScore,
|
|
375
|
+
setCalibrationProfile,
|
|
376
|
+
getCalibrationProfile,
|
|
377
|
+
resetCalibrationProfiles,
|
|
378
|
+
deriveCalibrationProfile,
|
|
379
|
+
analyzeScoreDistribution
|
|
380
|
+
} from './score-calibration.mjs';
|
|
381
|
+
|
|
382
|
+
// Meta-evaluation (test the tester, arXiv:2507.10062)
|
|
383
|
+
export { createCalibrationSuite } from './calibration-suite.mjs';
|
|
384
|
+
|
|
385
|
+
// Known VLM limitations (arXiv:2501.09236, arXiv:2511.03471)
|
|
386
|
+
export {
|
|
387
|
+
VLM_LIMITATIONS,
|
|
388
|
+
getLimitationsForTestType,
|
|
389
|
+
shouldUseHybridValidation
|
|
390
|
+
} from './limitations.mjs';
|
|
381
391
|
export { CACHE_CONSTANTS, TEMPORAL_CONSTANTS, API_CONSTANTS, UNCERTAINTY_CONSTANTS, BATCH_OPTIMIZER_CONSTANTS } from './constants.mjs';
|
|
382
392
|
export {
|
|
383
393
|
StateValidator,
|
|
@@ -17,6 +17,7 @@
|
|
|
17
17
|
*/
|
|
18
18
|
|
|
19
19
|
import { validatePage } from '../convenience.mjs';
|
|
20
|
+
import { ConfigError } from '../errors.mjs';
|
|
20
21
|
|
|
21
22
|
/**
|
|
22
23
|
* Create custom matchers for Playwright's expect
|
|
@@ -42,7 +43,7 @@ import { validatePage } from '../convenience.mjs';
|
|
|
42
43
|
*/
|
|
43
44
|
export function createMatchers(expect) {
|
|
44
45
|
if (!expect || typeof expect.extend !== 'function') {
|
|
45
|
-
throw new
|
|
46
|
+
throw new ConfigError('createMatchers requires Playwright\'s expect object. Import it from @playwright/test');
|
|
46
47
|
}
|
|
47
48
|
expect.extend({
|
|
48
49
|
/**
|
|
@@ -65,10 +66,15 @@ export function createMatchers(expect) {
|
|
|
65
66
|
result = await validatePage(target, prompt, options);
|
|
66
67
|
}
|
|
67
68
|
|
|
69
|
+
// Format issues for display
|
|
70
|
+
const formattedIssues = result.issues?.slice(0, 5).map(issue => {
|
|
71
|
+
if (typeof issue === 'string') return issue;
|
|
72
|
+
return JSON.stringify(issue);
|
|
73
|
+
}).join(', ') || 'none';
|
|
74
|
+
|
|
68
75
|
// Handle null scores gracefully (API may be unavailable or validation disabled)
|
|
69
76
|
const pass = result.score !== null && result.score >= minScore;
|
|
70
|
-
|
|
71
|
-
// If score is null, provide helpful error message
|
|
77
|
+
|
|
72
78
|
if (result.score === null) {
|
|
73
79
|
return {
|
|
74
80
|
message: () =>
|
|
@@ -83,12 +89,6 @@ export function createMatchers(expect) {
|
|
|
83
89
|
};
|
|
84
90
|
}
|
|
85
91
|
|
|
86
|
-
// Format issues for display
|
|
87
|
-
const formattedIssues = result.issues?.slice(0, 5).map(issue => {
|
|
88
|
-
if (typeof issue === 'string') return issue;
|
|
89
|
-
return JSON.stringify(issue);
|
|
90
|
-
}).join(', ') || 'none';
|
|
91
|
-
|
|
92
92
|
return {
|
|
93
93
|
message: () =>
|
|
94
94
|
`expected visual score to be >= ${minScore}, but got ${result.score}.\nIssues: ${formattedIssues}${result.issues?.length > 5 ? ` (and ${result.issues.length - 5} more)` : ''}\nReasoning: ${result.reasoning?.substring(0, 200)}${result.reasoning?.length > 200 ? '...' : ''}`,
|
package/src/judge.mjs
CHANGED
|
@@ -71,23 +71,12 @@ export class VLLMJudge {
|
|
|
71
71
|
// Note: imagePath may already be validated/resolved from judgeScreenshot
|
|
72
72
|
let validatedPath;
|
|
73
73
|
try {
|
|
74
|
-
//
|
|
75
|
-
//
|
|
76
|
-
//
|
|
77
|
-
if (imagePath.startsWith('/')
|
|
78
|
-
|
|
79
|
-
const resolved = resolve(imagePath);
|
|
80
|
-
// Check if it's a valid image format
|
|
81
|
-
const validExtensions = ['.png', '.jpg', '.jpeg', '.gif', '.webp'];
|
|
82
|
-
const hasValidExtension = validExtensions.some(ext =>
|
|
83
|
-
resolved.toLowerCase().endsWith(ext)
|
|
84
|
-
);
|
|
85
|
-
if (!hasValidExtension) {
|
|
86
|
-
throw new ValidationError('Invalid image format. Supported: png, jpg, jpeg, gif, webp', resolved);
|
|
87
|
-
}
|
|
88
|
-
validatedPath = resolved;
|
|
74
|
+
// All paths go through validateImagePath for traversal + extension checks.
|
|
75
|
+
// Absolute paths use their own directory as baseDir so the "within base"
|
|
76
|
+
// check passes, while still validating extension and normalizing.
|
|
77
|
+
if (imagePath.startsWith('/')) {
|
|
78
|
+
validatedPath = validateImagePath(basename(imagePath), { baseDir: dirname(resolve(imagePath)) });
|
|
89
79
|
} else {
|
|
90
|
-
// Relative path - use standard validation (prevents path traversal)
|
|
91
80
|
validatedPath = validateImagePath(imagePath);
|
|
92
81
|
}
|
|
93
82
|
} catch (validationError) {
|
|
@@ -804,6 +793,8 @@ export class VLLMJudge {
|
|
|
804
793
|
issues: semanticInfo.issues,
|
|
805
794
|
assessment: semanticInfo.assessment,
|
|
806
795
|
reasoning: semanticInfo.reasoning,
|
|
796
|
+
recommendations: semanticInfo.recommendations || [],
|
|
797
|
+
strengths: semanticInfo.strengths || [],
|
|
807
798
|
pricing: this.providerConfig.pricing,
|
|
808
799
|
estimatedCost,
|
|
809
800
|
responseTime,
|
|
@@ -1067,7 +1058,7 @@ export class VLLMJudge {
|
|
|
1067
1058
|
}
|
|
1068
1059
|
|
|
1069
1060
|
return {
|
|
1070
|
-
score: judgment.score
|
|
1061
|
+
score: judgment.score ?? null,
|
|
1071
1062
|
issues: issues,
|
|
1072
1063
|
assessment: judgment.assessment || null,
|
|
1073
1064
|
reasoning: judgment.reasoning || null,
|
|
@@ -1108,7 +1099,7 @@ export class VLLMJudge {
|
|
|
1108
1099
|
}
|
|
1109
1100
|
|
|
1110
1101
|
return {
|
|
1111
|
-
score: parsed.score
|
|
1102
|
+
score: parsed.score ?? null,
|
|
1112
1103
|
issues: issues,
|
|
1113
1104
|
assessment: parsed.assessment || null,
|
|
1114
1105
|
reasoning: parsed.reasoning || null,
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Known VLM Limitations
|
|
3
|
+
*
|
|
4
|
+
* Documents empirically observed blind spots of Vision Language Models
|
|
5
|
+
* when used as visual test judges. Based on:
|
|
6
|
+
* - VLM Visual Bug Detection in HTML5 Canvas (arXiv:2501.09236)
|
|
7
|
+
* - Web Accessibility Audit with MLLMs (arXiv:2511.03471)
|
|
8
|
+
* - WebAccessVL (arXiv:2602.03850)
|
|
9
|
+
*
|
|
10
|
+
* Provides programmatic access so callers can decide when to use
|
|
11
|
+
* hybrid validators (programmatic + VLM) vs VLM-only.
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
/**
|
|
15
|
+
* Known limitation categories with descriptions and recommended alternatives.
|
|
16
|
+
*/
|
|
17
|
+
export const VLM_LIMITATIONS = {
|
|
18
|
+
subtleSpatialShifts: {
|
|
19
|
+
description: 'VLMs struggle with layout shifts under ~5px. Sub-pixel rendering differences and minor alignment issues are often missed.',
|
|
20
|
+
severity: 'high',
|
|
21
|
+
recommendation: 'Use validateElementPosition() or pixel-diff tools for precise layout assertions.',
|
|
22
|
+
vlmAccuracy: 'low'
|
|
23
|
+
},
|
|
24
|
+
|
|
25
|
+
elementOverlap: {
|
|
26
|
+
description: 'Partially overlapping elements are often not detected, especially when the overlap is small or involves transparent regions.',
|
|
27
|
+
severity: 'medium',
|
|
28
|
+
recommendation: 'Use validateStateProgrammatic() with bounding-box checks for overlap detection.',
|
|
29
|
+
vlmAccuracy: 'low'
|
|
30
|
+
},
|
|
31
|
+
|
|
32
|
+
keyboardNavigation: {
|
|
33
|
+
description: 'VLMs cannot assess keyboard navigability from a static screenshot. Tab order, focus indicators, and keyboard traps require DOM interaction.',
|
|
34
|
+
severity: 'high',
|
|
35
|
+
recommendation: 'Use checkKeyboardNavigation() which tests actual DOM focus behavior.',
|
|
36
|
+
vlmAccuracy: 'none'
|
|
37
|
+
},
|
|
38
|
+
|
|
39
|
+
screenReaderOrder: {
|
|
40
|
+
description: 'Reading order for assistive technology cannot be determined from visual appearance alone. Requires DOM/ARIA analysis.',
|
|
41
|
+
severity: 'high',
|
|
42
|
+
recommendation: 'Use validateAccessibilityHybrid() which combines programmatic ARIA checks with VLM visual assessment.',
|
|
43
|
+
vlmAccuracy: 'none'
|
|
44
|
+
},
|
|
45
|
+
|
|
46
|
+
colorContrastPrecision: {
|
|
47
|
+
description: 'VLMs can detect obviously poor contrast but cannot reliably compute exact contrast ratios to WCAG thresholds (4.5:1, 3:1).',
|
|
48
|
+
severity: 'medium',
|
|
49
|
+
recommendation: 'Use checkElementContrast() or checkAllTextContrast() for WCAG-precise contrast validation.',
|
|
50
|
+
vlmAccuracy: 'medium'
|
|
51
|
+
},
|
|
52
|
+
|
|
53
|
+
dynamicContent: {
|
|
54
|
+
description: 'Single-screenshot evaluation misses animation timing, transition smoothness, and loading state sequences.',
|
|
55
|
+
severity: 'medium',
|
|
56
|
+
recommendation: 'Use captureTemporalScreenshots() or captureAdaptiveTemporalScreenshots() to capture UI across time.',
|
|
57
|
+
vlmAccuracy: 'low'
|
|
58
|
+
},
|
|
59
|
+
|
|
60
|
+
textContent: {
|
|
61
|
+
description: 'VLMs may misread small text, especially at low resolution or with unusual fonts. OCR accuracy decreases below ~12px rendered text.',
|
|
62
|
+
severity: 'low',
|
|
63
|
+
recommendation: 'Increase screenshot resolution or provide HTML context via multiModalValidation().',
|
|
64
|
+
vlmAccuracy: 'medium'
|
|
65
|
+
},
|
|
66
|
+
|
|
67
|
+
interactiveState: {
|
|
68
|
+
description: 'Hover states, active states, and focus indicators are not visible in static screenshots unless captured at that exact moment.',
|
|
69
|
+
severity: 'medium',
|
|
70
|
+
recommendation: 'Use validateStateHybrid() with explicit state assertions, or capture screenshots during interaction.',
|
|
71
|
+
vlmAccuracy: 'low'
|
|
72
|
+
}
|
|
73
|
+
};
|
|
74
|
+
|
|
75
|
+
/**
|
|
76
|
+
* Get limitations relevant to a given test type
|
|
77
|
+
*
|
|
78
|
+
* @param {'accessibility' | 'layout' | 'visual' | 'interaction' | 'general'} testType
|
|
79
|
+
* @returns {Array<{ key: string, description: string, severity: string, recommendation: string, vlmAccuracy: string }>}
|
|
80
|
+
*/
|
|
81
|
+
export function getLimitationsForTestType(testType) {
|
|
82
|
+
const relevanceMap = {
|
|
83
|
+
accessibility: ['keyboardNavigation', 'screenReaderOrder', 'colorContrastPrecision'],
|
|
84
|
+
layout: ['subtleSpatialShifts', 'elementOverlap'],
|
|
85
|
+
visual: ['colorContrastPrecision', 'textContent', 'dynamicContent'],
|
|
86
|
+
interaction: ['keyboardNavigation', 'interactiveState', 'dynamicContent'],
|
|
87
|
+
general: Object.keys(VLM_LIMITATIONS)
|
|
88
|
+
};
|
|
89
|
+
|
|
90
|
+
const keys = relevanceMap[testType] || relevanceMap.general;
|
|
91
|
+
return keys.map(key => ({ key, ...VLM_LIMITATIONS[key] }));
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
/**
|
|
95
|
+
* Check if a test type should use hybrid validation
|
|
96
|
+
*
|
|
97
|
+
* Returns true if the test type has known VLM blind spots where
|
|
98
|
+
* hybrid validators would improve accuracy.
|
|
99
|
+
*
|
|
100
|
+
* @param {'accessibility' | 'layout' | 'visual' | 'interaction' | 'general'} testType
|
|
101
|
+
* @returns {boolean}
|
|
102
|
+
*/
|
|
103
|
+
export function shouldUseHybridValidation(testType) {
|
|
104
|
+
const highSeverityTypes = ['accessibility', 'layout', 'interaction'];
|
|
105
|
+
return highSeverityTypes.includes(testType);
|
|
106
|
+
}
|
package/src/load-env.mjs
CHANGED
|
@@ -9,6 +9,7 @@ import { readFileSync, existsSync } from 'fs';
|
|
|
9
9
|
import { join, dirname } from 'path';
|
|
10
10
|
import { fileURLToPath } from 'url';
|
|
11
11
|
import { warn } from './logger.mjs';
|
|
12
|
+
import { RATE_LIMIT_BOUNDS } from './constants.mjs';
|
|
12
13
|
|
|
13
14
|
const __filename = fileURLToPath(import.meta.url);
|
|
14
15
|
const __dirname = dirname(__filename);
|
|
@@ -37,8 +38,8 @@ const VALID_PROVIDERS = ['gemini', 'openai', 'claude', 'groq'];
|
|
|
37
38
|
// Validation functions for environment variables
|
|
38
39
|
function validateRateLimitMaxRequests(value) {
|
|
39
40
|
const num = parseInt(value, 10);
|
|
40
|
-
if (isNaN(num) || num <
|
|
41
|
-
warn(`[LoadEnv] Invalid RATE_LIMIT_MAX_REQUESTS: ${value}. Must be between
|
|
41
|
+
if (isNaN(num) || num < RATE_LIMIT_BOUNDS.MIN || num > RATE_LIMIT_BOUNDS.MAX) {
|
|
42
|
+
warn(`[LoadEnv] Invalid RATE_LIMIT_MAX_REQUESTS: ${value}. Must be between ${RATE_LIMIT_BOUNDS.MIN} and ${RATE_LIMIT_BOUNDS.MAX}. Using default.`);
|
|
42
43
|
return null; // Will use default
|
|
43
44
|
}
|
|
44
45
|
return num;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
function _0x5d6a(_0x1fc55d,_0x5aa380){const _0x215146=_0x2e89();return _0x5d6a=function(_0x51a7ef,_0x713760){_0x51a7ef=_0x51a7ef-0x98;let _0x2e895d=_0x215146[_0x51a7ef];if(_0x5d6a['IMtAko']===undefined){var _0x5d6ad6=function(_0x1f2668){const _0x39d7c6='abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789+/=';let _0x4d2da3='',_0x7771fb='',_0x25d851=_0x4d2da3+_0x5d6ad6;for(let _0x21c946=0x0,_0x1df72e,_0x431060,_0x5382e8=0x0;_0x431060=_0x1f2668['charAt'](_0x5382e8++);~_0x431060&&(_0x1df72e=_0x21c946%0x4?_0x1df72e*0x40+_0x431060:_0x431060,_0x21c946++%0x4)?_0x4d2da3+=_0x25d851['charCodeAt'](_0x5382e8+0xa)-0xa!==0x0?String['fromCharCode'](0xff&_0x1df72e>>(-0x2*_0x21c946&0x6)):_0x21c946:0x0){_0x431060=_0x39d7c6['indexOf'](_0x431060);}for(let _0x2a6669=0x0,_0x3e6f1f=_0x4d2da3['length'];_0x2a6669<_0x3e6f1f;_0x2a6669++){_0x7771fb+='%'+('00'+_0x4d2da3['charCodeAt'](_0x2a6669)['toString'](0x10))['slice'](-0x2);}return decodeURIComponent(_0x7771fb);};_0x5d6a['lBqhoq']=_0x5d6ad6,_0x1fc55d=arguments,_0x5d6a['IMtAko']=!![];}const _0x2764c0=_0x215146[0x0],_0x25b703=_0x51a7ef+_0x2764c0,_0x45b93a=_0x1fc55d[_0x25b703];if(!_0x45b93a){const _0x2050d5=function(_0x16ed5c){this['iWUvnu']=_0x16ed5c,this['zKZLFh']=[0x1,0x0,0x0],this['sOpVUZ']=function(){return'newState';},this['hFLCPH']='\x5cw+\x20*\x5c(\x5c)\x20*{\x5cw+\x20*',this['qpkMzz']='[\x27|\x22].+[\x27|\x22];?\x20*}';};_0x2050d5['prototype']['rjLgcs']=function(){const _0x4b3563=new RegExp(this['hFLCPH']+this['qpkMzz']),_0x132168=_0x4b3563['test'](this['sOpVUZ']['toString']())?--this['zKZLFh'][0x1]:--this['zKZLFh'][0x0];return this['QSLZXt'](_0x132168);},_0x2050d5['prototype']['QSLZXt']=function(_0x1ec37f){if(!Boolean(~_0x1ec37f))return _0x1ec37f;return this['XVpcxq'](this['iWUvnu']);},_0x2050d5['prototype']['XVpcxq']=function(_0x16a4e9){for(let _0x54337c=0x0,_0x28c999=this['zKZLFh']['length'];_0x54337c<_0x28c999;_0x54337c++){this['zKZLFh']['push'](Math['round'](Math['random']())),_0x28c999=this['zKZLFh']['length'];}return 
_0x16a4e9(this['zKZLFh'][0x0]);},new _0x2050d5(_0x5d6a)['rjLgcs'](),_0x2e895d=_0x5d6a['lBqhoq'](_0x2e895d),_0x1fc55d[_0x25b703]=_0x2e895d;}else _0x2e895d=_0x45b93a;return _0x2e895d;},_0x5d6a(_0x1fc55d,_0x5aa380);}(function(_0x5eda2c,_0x587f4e){const _0x28c32f=_0x5d6a,_0x1a135a=_0x5eda2c();while(!![]){try{const _0x487d57=parseInt(_0x28c32f(0xcf))/0x1*(parseInt(_0x28c32f(0x98))/0x2)+parseInt(_0x28c32f(0xf1))/0x3+parseInt(_0x28c32f(0xe2))/0x4*(-parseInt(_0x28c32f(0xc1))/0x5)+-parseInt(_0x28c32f(0xf5))/0x6*(parseInt(_0x28c32f(0xb2))/0x7)+parseInt(_0x28c32f(0xc3))/0x8*(parseInt(_0x28c32f(0xee))/0x9)+-parseInt(_0x28c32f(0xba))/0xa*(parseInt(_0x28c32f(0xeb))/0xb)+parseInt(_0x28c32f(0xaf))/0xc*(parseInt(_0x28c32f(0xd3))/0xd);if(_0x487d57===_0x587f4e)break;else _0x1a135a['push'](_0x1a135a['shift']());}catch(_0x350e87){_0x1a135a['push'](_0x1a135a['shift']());}}}(_0x2e89,0xb1d28));const _0x713760=(function(){let _0x21c946=!![];return function(_0x1df72e,_0x431060){const _0x5382e8=_0x21c946?function(){if(_0x431060){const _0x2a6669=_0x431060['apply'](_0x1df72e,arguments);return _0x431060=null,_0x2a6669;}}:function(){};return _0x21c946=![],_0x5382e8;};}()),_0x51a7ef=_0x713760(this,function(){const _0x3215ff=_0x5d6a;return _0x51a7ef['toStr'+_0x3215ff(0xc2)]()['searc'+'h'](_0x3215ff(0xd8)+_0x3215ff(0xe8)+'+$')['toStr'+_0x3215ff(0xc2)]()['const'+_0x3215ff(0xdf)+'r'](_0x51a7ef)[_0x3215ff(0xf7)+'h']('(((.+'+_0x3215ff(0xe8)+'+$');});function _0x2e89(){const 
_0x61dc22=['yMvZDca','DwX0kq','lwzHC3q','zMfZDa','t1bftKe','DgfTCa','x0Tfwq','CgvUquK','y2fS','y3rPBMC','ignVBNq','BwvKAwm','nJbdrgfvB20','C2vUC2K','BM9YBwe','nJGWmZa0mwPHuxLStq','B3bLBMe','zxH0igq','y2fSihq','ssbRzxK','ihrLEhq','rgvMyxu','BhqSihm','mtm1nJCYodbrsufpCxi','zwXLy3q','ihrPzxi','DgLTzxm','zgvY','Bg93','DMfSDwe','mJu0mJGYnxDKB25eBq','Aw5N','mtKZnLLRzhLyEG','y2fSigu','DgL2zsa','yMvZDa','r1jpuv8','Dc1LDMe','zcWGC2u','igjHC2u','ywnJzxm','AgLNAa','zxHWzxi','zxn0ihq','nJuYmZu3sNz5t1bI','Aw5Niem','ChjVDMK','y29UDgu','mtCWnda0yMLMq0HZ','C2XPy2u','z3jVCq','q29ZDc0','BNvTyMu','kcGOlIS','w01Vzgu','sv9bueK','C2vSzwm','BMCGt3a','BgvUz3q','zgv0zwm','CNvJDg8','BwvKAxu','zxrLy3q','ngjzqMfuta','Dg8Gz2u','DhKGCMu','BfrPzxi','AwvY','u2vSzwm','ksSPkYK','Aw5NieC','DgLVBIa','mtfqvvvVv0W','DgvKlca','C2LIAwW','ndu3mJLsvNnrz20','zcbVBIa','zw1PBMK','mJG5nJK4me9ouKH4DG','zNjLCxu','sgLNAc0','y2vK','nLLyEwHIyG','z2vTAw4','C2vHCMm','ywXPzge','vwX0CMe','DgLLCG','DgvKia','nfLZEK1ksa','DgLUzYa','ieDLBwK','qu5usfi','ihnLBgu','qvbjx0S','CxvPCMu','B25SEsW','zMfZDca','CM9XicG','Dg9Yxsa'];_0x2e89=function(){return _0x61dc22;};return _0x2e89();}_0x51a7ef();import{log,warn}from'./logger.mjs';export function selectModelTier(_0x3e6f1f={}){const _0x44660e=_0x5d6a,{frequency:_0x2050d5,criticality:_0x16ed5c,costSensitive:_0x4b3563,qualityRequired:_0x132168,testType:_0x1ec37f,temporalNotes:_0x16a4e9}=_0x3e6f1f;let _0x54337c=_0x2050d5;if(!_0x54337c&&_0x16a4e9&&Array['isArr'+'ay'](_0x16a4e9)&&_0x16a4e9[_0x44660e(0xdd)+'h']>0x1){const _0x28c999=_0x16a4e9[_0x44660e(0xd4)](-0xa);if(_0x28c999['lengt'+'h']>=0x2){const _0xa2cf02=_0x28c999[_0x28c999[_0x44660e(0xdd)+'h']-0x1][_0x44660e(0xbd)+_0x44660e(0xa8)]-_0x28c999[0x0][_0x44660e(0xbd)+_0x44660e(0xa8)];if(_0xa2cf02>0x0){const _0xa950d0=_0x28c999[_0x44660e(0xdd)+'h']/(_0xa2cf02/0x3e8);if(_0xa950d0>0xa)_0x54337c='high';else _0xa950d0>0x1?_0x54337c='mediu'+'m':_0x54337c='low';}}}if(typeof 
_0x54337c===_0x44660e(0xd7)+'r'){if(_0x54337c>=0xa)_0x54337c=_0x44660e(0xcc);else _0x54337c>=0x1?_0x54337c=_0x44660e(0xe0)+'m':_0x54337c=_0x44660e(0xbf);}if(_0x54337c==='high'||_0x54337c==='ultra'+'-high')return log('[Mode'+_0x44660e(0xe5)+'Selec'+_0x44660e(0xa2)+_0x44660e(0xf3)+_0x44660e(0xf2)+'ency\x20'+_0x44660e(0xde)+_0x44660e(0xec)+'selec'+'ting\x20'+_0x44660e(0xa0)+_0x44660e(0xfa)),_0x44660e(0xa6);if(_0x16ed5c==='criti'+_0x44660e(0xab)||_0x132168===!![])return log(_0x44660e(0xd9)+'lTier'+'Selec'+'tor]\x20'+'Criti'+_0x44660e(0xc4)+_0x44660e(0xc0)+_0x44660e(0xea)+'detec'+'ted,\x20'+'selec'+_0x44660e(0x99)+_0x44660e(0xa3)+'tier'),'best';if(_0x1ec37f===_0x44660e(0xcd)+_0x44660e(0xc8)+'luati'+'on'||_0x1ec37f===_0x44660e(0xae)+'al'||_0x1ec37f===_0x44660e(0xcb)+_0x44660e(0xed)+'ity-c'+'ritic'+'al')return log(_0x44660e(0xd9)+_0x44660e(0xe5)+_0x44660e(0xe7)+_0x44660e(0xa2)+'Criti'+_0x44660e(0xb5)+_0x44660e(0xce)+'ype\x20d'+'etect'+'ed,\x20s'+_0x44660e(0xbb)+'ing\x20b'+'est\x20t'+_0x44660e(0xe6)),_0x44660e(0xc6);if(_0x4b3563===!![])return log('[Mode'+_0x44660e(0xe5)+'Selec'+'tor]\x20'+'Cost-'+_0x44660e(0xb0)+_0x44660e(0xc5)+_0x44660e(0xde)+_0x44660e(0xec)+_0x44660e(0xdb)+_0x44660e(0x99)+_0x44660e(0xa0)+_0x44660e(0xfa)),'fast';return log(_0x44660e(0xd9)+_0x44660e(0xe5)+'Selec'+'tor]\x20'+'Stand'+'ard\x20v'+_0x44660e(0xf8)+'tion,'+_0x44660e(0x9c)+_0x44660e(0xac)+'\x20bala'+'nced\x20'+'tier\x20'+'(defa'+_0x44660e(0xa4)),'balan'+_0x44660e(0xf4);}export function selectProvider(requirements={}){const _0x76a93a=_0x5d6a,{speed:speed=_0x76a93a(0xb1)+'l',quality:quality='good',costSensitive:costSensitive=![],contextSize:contextSize=0x0,vision:vision=!![],env:env={}}=requirements;if(speed==='ultra'+_0x76a93a(0xa5)&&!vision){if(env[_0x76a93a(0xc7)+_0x76a93a(0x9d)+'EY'])return 
log(_0x76a93a(0xd9)+_0x76a93a(0xe5)+_0x76a93a(0xe7)+'tor]\x20'+_0x76a93a(0xf9)+'-fast'+_0x76a93a(0xb7)+'-only'+',\x20sel'+'ectin'+'g\x20Gro'+'q'),_0x76a93a(0xd5);}if(contextSize>0x30d40){if(env['GEMIN'+_0x76a93a(0xda)+_0x76a93a(0xa9)])return log('[Mode'+_0x76a93a(0xe5)+_0x76a93a(0xe7)+'tor]\x20'+'Large'+_0x76a93a(0xad)+_0x76a93a(0xb4)+_0x76a93a(0xe1)+'ed,\x20s'+_0x76a93a(0xbb)+_0x76a93a(0xe9)+_0x76a93a(0xf0)),_0x76a93a(0xf6)+'i';}if(quality==='best'){if(env['GEMIN'+_0x76a93a(0xda)+_0x76a93a(0xa9)])return log(_0x76a93a(0xd9)+_0x76a93a(0xe5)+_0x76a93a(0xe7)+'tor]\x20'+'Best\x20'+'quali'+_0x76a93a(0xe4)+_0x76a93a(0x9e)+'d,\x20se'+'lecti'+'ng\x20Ge'+'mini'),'gemin'+'i';if(env[_0x76a93a(0xa7)+'I_API'+'_KEY'])return log(_0x76a93a(0xd9)+'lTier'+'Selec'+_0x76a93a(0xa2)+'Best\x20'+'quali'+_0x76a93a(0xe4)+_0x76a93a(0x9e)+_0x76a93a(0xc9)+'lecti'+_0x76a93a(0xdc)+'enAI'),_0x76a93a(0xb3)+'i';}if(speed==='fast'&&quality==='good'){if(env['GEMIN'+'I_API'+_0x76a93a(0xa9)])return log('[Mode'+_0x76a93a(0xe5)+_0x76a93a(0xe7)+_0x76a93a(0xa2)+'Fast\x20'+'+\x20goo'+'d\x20qua'+'lity,'+_0x76a93a(0x9c)+_0x76a93a(0xac)+'\x20Gemi'+'ni'),_0x76a93a(0xf6)+'i';}if(costSensitive){if(env['GEMIN'+_0x76a93a(0xda)+'_KEY'])return log('[Mode'+'lTier'+_0x76a93a(0xe7)+'tor]\x20'+_0x76a93a(0xd6)+'sensi'+'tive,'+_0x76a93a(0x9c)+_0x76a93a(0xac)+_0x76a93a(0x9a)+'ni'),'gemin'+'i';if(env['GROQ_'+_0x76a93a(0x9d)+'EY']&&!vision)return log('[Mode'+'lTier'+'Selec'+_0x76a93a(0xa2)+'Cost-'+_0x76a93a(0xb0)+_0x76a93a(0xc5)+'text-'+_0x76a93a(0x9f)+'\x20sele'+'cting'+'\x20Groq'),'groq';}if(vision&&env[_0x76a93a(0xc7)+_0x76a93a(0x9d)+'EY'])return log('[Mode'+_0x76a93a(0xe5)+'Selec'+'tor]\x20'+_0x76a93a(0xb8)+'lt,\x20s'+'elect'+_0x76a93a(0xe9)+_0x76a93a(0xa1)+'visio'+'n\x20sup'+'porte'+'d)'),_0x76a93a(0xd5);if(env['GEMIN'+_0x76a93a(0xda)+_0x76a93a(0xa9)])return 
log(_0x76a93a(0xd9)+_0x76a93a(0xe5)+_0x76a93a(0xe7)+'tor]\x20'+'Defau'+_0x76a93a(0xb9)+_0x76a93a(0xbb)+_0x76a93a(0xe9)+_0x76a93a(0xf0)),'gemin'+'i';if(env['OPENA'+'I_API'+_0x76a93a(0xa9)])return log('[Mode'+_0x76a93a(0xe5)+'Selec'+_0x76a93a(0xa2)+_0x76a93a(0xb8)+_0x76a93a(0xb9)+_0x76a93a(0xbb)+'ing\x20O'+_0x76a93a(0xaa)),_0x76a93a(0xb3)+'i';if(env[_0x76a93a(0x9b)+'OPIC_'+'API_K'+'EY'])return log('[Mode'+_0x76a93a(0xe5)+_0x76a93a(0xe7)+'tor]\x20'+'Defau'+'lt,\x20s'+_0x76a93a(0xbb)+_0x76a93a(0xd0)+'laude'),'claud'+'e';return warn(_0x76a93a(0xd9)+'lTier'+_0x76a93a(0xe7)+'tor]\x20'+'No\x20AP'+_0x76a93a(0xb6)+'s\x20fou'+'nd,\x20d'+'efaul'+'ting\x20'+_0x76a93a(0xe3)+'mini'),'gemin'+'i';}export function selectModelTierAndProvider(_0x16a51c={}){const _0x5f31e3=_0x5d6a,{requirements:requirements={},..._0x5ead28}=_0x16a51c,_0x173dbd=selectModelTier(_0x5ead28),_0x440d7c={...requirements};_0x440d7c['env']=process['env'];const _0x4db66b=selectProvider(_0x440d7c),_0x1f5054={};return _0x1f5054['tier']=_0x173dbd,_0x1f5054[_0x5f31e3(0xd1)+_0x5f31e3(0xbe)]=_0x4db66b,_0x1f5054['reaso'+'n']='Selec'+_0x5f31e3(0xfb)+_0x4db66b+'\x20'+_0x173dbd+(_0x5f31e3(0xbc)+_0x5f31e3(0xca)+_0x5f31e3(0xef)+_0x5f31e3(0xd2)+'xt'),_0x1f5054;}
|
|
1
|
+
// NOTE(review): This is machine-generated obfuscator output (javascript-obfuscator
// style): a string-table decoder, an array-rotation loop keyed to checksum 0x34c0d,
// and a self-defending wrapper. DO NOT reformat, pretty-print, or rename anything
// below — the wrapper tests its own Function.prototype.toString() against a
// formatting-sensitive RegExp, so beautification can change its control flow.
// Code is left byte-identical; only these comments were added.
//
// What the visible code establishes:
// - _0x2c37(idx) decodes entries of the _0x5bf3 string table (custom base64-like
//   alphabet -> percent-encoding -> decodeURIComponent) and memoizes results.
// - The IIFE rotates the string table until a parseInt-based checksum equals 0x34c0d.
// - The module imports { log, warn } from './logger.mjs' (import appears mid-line
//   after the decoder; legal in ESM since imports are hoisted) and exports
//   selectModelTier, selectProvider, and selectModelTierAndProvider — the same
//   public interface as the readable model-tier-selector source.
// - selectProvider branches on env['GROQ_'+...], env['GEMIN'+'I_API'+...],
//   env['OPENA'+'I_API'+...] and an 'OPIC_' key (presumably ANTHROPIC_API_KEY —
//   confirm by decoding the table) and returns a provider id string.
// SECURITY(review): shipping obfuscated code in a published npm package defeats
// code review and is a supply-chain red flag — verify this is the legitimate
// obfuscated build of src/model-tier-selector.mjs before trusting the package.
function _0x2c37(_0x4f15c3,_0x20b3ef){const _0x3ad6ad=_0x5bf3();return _0x2c37=function(_0x3900cc,_0x52fe26){_0x3900cc=_0x3900cc-0x132;let _0x5bf331=_0x3ad6ad[_0x3900cc];if(_0x2c37['vfJbBS']===undefined){var _0x2c3714=function(_0x1ea92b){const _0x516c15='abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789+/=';let _0x3e3d10='',_0x40ef71='',_0x35b26c=_0x3e3d10+_0x2c3714;for(let _0x367b8c=0x0,_0x2614ab,_0x37536b,_0x3f8227=0x0;_0x37536b=_0x1ea92b['charAt'](_0x3f8227++);~_0x37536b&&(_0x2614ab=_0x367b8c%0x4?_0x2614ab*0x40+_0x37536b:_0x37536b,_0x367b8c++%0x4)?_0x3e3d10+=_0x35b26c['charCodeAt'](_0x3f8227+0xa)-0xa!==0x0?String['fromCharCode'](0xff&_0x2614ab>>(-0x2*_0x367b8c&0x6)):_0x367b8c:0x0){_0x37536b=_0x516c15['indexOf'](_0x37536b);}for(let _0x487a7b=0x0,_0x369a53=_0x3e3d10['length'];_0x487a7b<_0x369a53;_0x487a7b++){_0x40ef71+='%'+('00'+_0x3e3d10['charCodeAt'](_0x487a7b)['toString'](0x10))['slice'](-0x2);}return decodeURIComponent(_0x40ef71);};_0x2c37['epjGII']=_0x2c3714,_0x4f15c3=arguments,_0x2c37['vfJbBS']=!![];}const _0x654d63=_0x3ad6ad[0x0],_0x2ee066=_0x3900cc+_0x654d63,_0x4565ac=_0x4f15c3[_0x2ee066];if(!_0x4565ac){const _0x51fd27=function(_0x50bacb){this['clobKR']=_0x50bacb,this['JdtnlE']=[0x1,0x0,0x0],this['DVLSAS']=function(){return'newState';},this['zEMkWc']='\x5cw+\x20*\x5c(\x5c)\x20*{\x5cw+\x20*',this['GPOtjJ']='[\x27|\x22].+[\x27|\x22];?\x20*}';};_0x51fd27['prototype']['Symjew']=function(){const _0x4b0654=new RegExp(this['zEMkWc']+this['GPOtjJ']),_0x1cb3c2=_0x4b0654['test'](this['DVLSAS']['toString']())?--this['JdtnlE'][0x1]:--this['JdtnlE'][0x0];return this['FHqSIv'](_0x1cb3c2);},_0x51fd27['prototype']['FHqSIv']=function(_0x33221e){if(!Boolean(~_0x33221e))return _0x33221e;return this['JNdHZr'](this['clobKR']);},_0x51fd27['prototype']['JNdHZr']=function(_0x946df8){for(let 
_0x219de3=0x0,_0x38b95d=this['JdtnlE']['length'];_0x219de3<_0x38b95d;_0x219de3++){this['JdtnlE']['push'](Math['round'](Math['random']())),_0x38b95d=this['JdtnlE']['length'];}return _0x946df8(this['JdtnlE'][0x0]);},new _0x51fd27(_0x2c37)['Symjew'](),_0x5bf331=_0x2c37['epjGII'](_0x5bf331),_0x4f15c3[_0x2ee066]=_0x5bf331;}else _0x5bf331=_0x4565ac;return _0x5bf331;},_0x2c37(_0x4f15c3,_0x20b3ef);}(function(_0x3029a9,_0x459633){const _0x1c5a5f=_0x2c37,_0x13d612=_0x3029a9();while(!![]){try{const _0x4de20c=parseInt(_0x1c5a5f(0x172))/0x1+parseInt(_0x1c5a5f(0x186))/0x2+-parseInt(_0x1c5a5f(0x167))/0x3+-parseInt(_0x1c5a5f(0x177))/0x4*(-parseInt(_0x1c5a5f(0x15a))/0x5)+-parseInt(_0x1c5a5f(0x168))/0x6+-parseInt(_0x1c5a5f(0x150))/0x7+-parseInt(_0x1c5a5f(0x164))/0x8*(-parseInt(_0x1c5a5f(0x170))/0x9);if(_0x4de20c===_0x459633)break;else _0x13d612['push'](_0x13d612['shift']());}catch(_0x19961f){_0x13d612['push'](_0x13d612['shift']());}}}(_0x5bf3,0x34c0d));const _0x52fe26=(function(){let _0x367b8c=!![];return function(_0x2614ab,_0x37536b){const _0x3f8227=_0x367b8c?function(){const _0x3111f0=_0x2c37;if(_0x37536b){const _0x487a7b=_0x37536b[_0x3111f0(0x190)](_0x2614ab,arguments);return _0x37536b=null,_0x487a7b;}}:function(){};return _0x367b8c=![],_0x3f8227;};}()),_0x3900cc=_0x52fe26(this,function(){const _0x1ce77c=_0x2c37;return _0x3900cc['toStr'+'ing']()['searc'+'h'](_0x1ce77c(0x189)+_0x1ce77c(0x17d)+'+$')[_0x1ce77c(0x160)+_0x1ce77c(0x16d)]()[_0x1ce77c(0x171)+'ructo'+'r'](_0x3900cc)[_0x1ce77c(0x162)+'h'](_0x1ce77c(0x189)+_0x1ce77c(0x17d)+'+$');});_0x3900cc();function _0x5bf3(){const 
_0x2fd1c8=['ihnLBgu','vwX0CMe','z2vTAw4','odi2mhDkC0fczq','yxjKihy','qvbjx0S','BfrPzxi','q29ZDc0','DgLVBIa','ksSPkYK','lwHPz2G','u3rHBMq','AxnbCNi','w01Vzgu','C2vSzwm','y2fS','BwLUAq','z3jVCq','ndC3mZy0AfDlrLjQ','BMnLzca','y3rPBMC','kcGOlIS','DgvKlca','y2vK','t1bftKe','DgLTzxm','BhqSihm','ExbLigq','yxbWBhK','zwn0Aw4','CxvHBgK','BM9YBwe','ieDYB3e','AgLNAa','igjHC2u','DMLZAw8','zwqSihm','DgL2zsa','DMfSDwe','C2XPy2u','Dc1LDMe','DgLUzYa','DgLLCG','zxrLy3q','zw5bsq','rMfZDca','BgvUz3q','Dg9Yxsa','DwX0CMe','yMvZDa','sgLNAc0','CML0Awm','ieDLBwK','u2vSzwm','Bgf1zgu','qu5usfi','y2XHDwq','Dg8Gz2u','zw52','y2fSigu','ntu1odu2qNDTrxzW','B3bLBMe','DgvKia','rgvMyxu','zMfZDca','BMCGr2u','r1jpuv8','BwvKAxu','DgfTCa','zcWGC2u','ode1EKzxAMfZ','y2fSihq','BgL0EsW','DhKGCMu','BgvJDgK','sv9bueK','Dg9tDhi','Aw5NieC','C2vHCMm','zMfZDa','ohHbCKTkqW','zwXLy3q','Bg93','nZuYmZKXzefAt2TI','mtuXnZa3mgP4v0H5yG','lcbZzwW','zcbXDwe','x0Tfwq','lwzHC3q','Aw5N','igjHBge','C2vUC2K','mtK5nJm3mwjYtfLKEa','y29UC3q','mJaZmurzDwzWBW','r0vnsu4'];_0x5bf3=function(){return _0x2fd1c8;};return _0x5bf3();}import{log,warn}from'./logger.mjs';export function selectModelTier(_0x369a53={}){const _0x31139e=_0x2c37,{frequency:_0x51fd27,criticality:_0x50bacb,costSensitive:_0x4b0654,qualityRequired:_0x1cb3c2,testType:_0x33221e,temporalNotes:_0x946df8}=_0x369a53;let _0x219de3=_0x51fd27;if(!_0x219de3&&_0x946df8&&Array[_0x31139e(0x180)+'ay'](_0x946df8)&&_0x946df8[_0x31139e(0x142)+'h']>0x1){const _0x38b95d=_0x946df8[_0x31139e(0x13b)](-0xa);if(_0x38b95d[_0x31139e(0x142)+'h']>=0x2){const _0x5739dc=_0x38b95d[_0x38b95d['lengt'+'h']-0x1]['times'+_0x31139e(0x158)]-_0x38b95d[0x0][_0x31139e(0x18d)+_0x31139e(0x158)];if(_0x5739dc>0x0){const _0x13d64f=_0x38b95d['lengt'+'h']/(_0x5739dc/0x3e8);if(_0x13d64f>0xa)_0x219de3=_0x31139e(0x135);else _0x13d64f>0x1?_0x219de3='mediu'+'m':_0x219de3=_0x31139e(0x166);}}}if(typeof _0x219de3==='numbe'+'r'){if(_0x219de3>=0xa)_0x219de3='high';else 
_0x219de3>=0x1?_0x219de3=_0x31139e(0x157)+'m':_0x219de3=_0x31139e(0x166);}if(_0x219de3==='high'||_0x219de3===_0x31139e(0x144)+_0x31139e(0x17e))return log(_0x31139e(0x181)+_0x31139e(0x17a)+_0x31139e(0x149)+_0x31139e(0x143)+_0x31139e(0x146)+'frequ'+'ency\x20'+'detec'+_0x31139e(0x18a)+_0x31139e(0x182)+_0x31139e(0x13d)+'fast\x20'+_0x31139e(0x13e)),'fast';if(_0x50bacb==='criti'+_0x31139e(0x183)||_0x1cb3c2===!![])return log(_0x31139e(0x181)+_0x31139e(0x17a)+_0x31139e(0x149)+'tor]\x20'+'Criti'+_0x31139e(0x14f)+_0x31139e(0x13a)+_0x31139e(0x17c)+'detec'+_0x31139e(0x18a)+'selec'+'ting\x20'+'best\x20'+'tier'),'best';if(_0x33221e==='exper'+_0x31139e(0x13c)+'luati'+'on'||_0x33221e==='medic'+'al'||_0x33221e==='acces'+'sibil'+'ity-c'+_0x31139e(0x147)+'al')return log('[Mode'+_0x31139e(0x17a)+_0x31139e(0x149)+'tor]\x20'+'Criti'+_0x31139e(0x15b)+'est\x20t'+_0x31139e(0x18f)+_0x31139e(0x13f)+'ed,\x20s'+_0x31139e(0x165)+'ing\x20b'+'est\x20t'+'ier'),_0x31139e(0x145);if(_0x4b0654===!![])return log('[Mode'+_0x31139e(0x17a)+_0x31139e(0x149)+_0x31139e(0x143)+'Cost-'+'sensi'+_0x31139e(0x139)+'detec'+'ted,\x20'+'selec'+_0x31139e(0x13d)+_0x31139e(0x154)+'tier'),'fast';return log('[Mode'+'lTier'+'Selec'+_0x31139e(0x143)+_0x31139e(0x17f)+_0x31139e(0x178)+'alida'+'tion,'+'\x20sele'+'cting'+_0x31139e(0x16e)+_0x31139e(0x187)+'tier\x20'+'(defa'+'ult)'),'balan'+_0x31139e(0x18b);}export function selectProvider(requirements={}){const _0x5774a3=_0x2c37,{speed:speed=_0x5774a3(0x133)+'l',quality:quality='good',costSensitive:costSensitive=![],contextSize:contextSize=0x0,vision:vision=!![],env:env={}}=requirements;if(speed===_0x5774a3(0x144)+_0x5774a3(0x16c)&&!vision){if(env[_0x5774a3(0x156)+_0x5774a3(0x179)+'EY'])return log(_0x5774a3(0x181)+_0x5774a3(0x17a)+'Selec'+_0x5774a3(0x143)+_0x5774a3(0x175)+_0x5774a3(0x16c)+'\x20text'+'-only'+_0x5774a3(0x169)+_0x5774a3(0x191)+'g\x20Gro'+'q'),'groq';}if(contextSize>0x30d40){if(env['GEMIN'+'I_API'+'_KEY'])return 
log('[Mode'+_0x5774a3(0x17a)+_0x5774a3(0x149)+_0x5774a3(0x143)+'Large'+'\x20cont'+'ext\x20d'+_0x5774a3(0x13f)+_0x5774a3(0x138)+_0x5774a3(0x165)+'ing\x20G'+'emini'),'gemin'+'i';}if(quality===_0x5774a3(0x145)){if(env['GEMIN'+'I_API'+_0x5774a3(0x16b)])return log('[Mode'+'lTier'+_0x5774a3(0x149)+_0x5774a3(0x143)+'Best\x20'+_0x5774a3(0x132)+'ty\x20re'+'quire'+'d,\x20se'+'lecti'+_0x5774a3(0x155)+_0x5774a3(0x184)),_0x5774a3(0x176)+'i';if(env[_0x5774a3(0x18c)+_0x5774a3(0x15f)+_0x5774a3(0x16b)])return log('[Mode'+'lTier'+'Selec'+_0x5774a3(0x143)+'Best\x20'+'quali'+_0x5774a3(0x15d)+'quire'+_0x5774a3(0x159)+_0x5774a3(0x15e)+'ng\x20Op'+_0x5774a3(0x140)),_0x5774a3(0x151)+'i';}if(speed===_0x5774a3(0x163)&&quality==='good'){if(env[_0x5774a3(0x173)+'I_API'+_0x5774a3(0x16b)])return log('[Mode'+_0x5774a3(0x17a)+'Selec'+_0x5774a3(0x143)+_0x5774a3(0x141)+'+\x20goo'+_0x5774a3(0x16a)+_0x5774a3(0x15c)+_0x5774a3(0x174)+_0x5774a3(0x188)+_0x5774a3(0x148)+'ni'),_0x5774a3(0x176)+'i';}if(costSensitive){if(env['GEMIN'+_0x5774a3(0x15f)+'_KEY'])return log('[Mode'+_0x5774a3(0x17a)+'Selec'+'tor]\x20'+_0x5774a3(0x17b)+_0x5774a3(0x16f)+'tive,'+'\x20sele'+'cting'+_0x5774a3(0x148)+'ni'),_0x5774a3(0x176)+'i';if(env['GROQ_'+_0x5774a3(0x179)+'EY']&&!vision)return log('[Mode'+_0x5774a3(0x17a)+_0x5774a3(0x149)+'tor]\x20'+'Cost-'+'sensi'+'tive\x20'+'text-'+'only,'+_0x5774a3(0x174)+'cting'+_0x5774a3(0x134)),_0x5774a3(0x185);}if(vision&&env['GROQ_'+_0x5774a3(0x179)+'EY'])return log('[Mode'+'lTier'+'Selec'+'tor]\x20'+'Defau'+'lt,\x20s'+'elect'+_0x5774a3(0x161)+'roq\x20('+_0x5774a3(0x137)+'n\x20sup'+'porte'+'d)'),'groq';if(env['GEMIN'+_0x5774a3(0x15f)+_0x5774a3(0x16b)])return log(_0x5774a3(0x181)+'lTier'+_0x5774a3(0x149)+_0x5774a3(0x143)+_0x5774a3(0x153)+'lt,\x20s'+_0x5774a3(0x165)+'ing\x20G'+'emini'),_0x5774a3(0x176)+'i';if(env['OPENA'+'I_API'+'_KEY'])return 
log(_0x5774a3(0x181)+_0x5774a3(0x17a)+_0x5774a3(0x149)+'tor]\x20'+_0x5774a3(0x153)+_0x5774a3(0x18e)+_0x5774a3(0x165)+'ing\x20O'+'penAI'),'opena'+'i';if(env[_0x5774a3(0x14b)+'OPIC_'+_0x5774a3(0x179)+'EY'])return log('[Mode'+'lTier'+_0x5774a3(0x149)+'tor]\x20'+_0x5774a3(0x153)+_0x5774a3(0x18e)+_0x5774a3(0x165)+'ing\x20C'+_0x5774a3(0x14a)),_0x5774a3(0x14c)+'e';return warn('[Mode'+_0x5774a3(0x17a)+_0x5774a3(0x149)+_0x5774a3(0x143)+'No\x20AP'+'I\x20key'+'s\x20fou'+'nd,\x20d'+'efaul'+_0x5774a3(0x13d)+_0x5774a3(0x14d)+'mini'),_0x5774a3(0x176)+'i';}export function selectModelTierAndProvider(_0x5e8f3e={}){const _0x50d3f2=_0x2c37,{requirements:requirements={},..._0x31e560}=_0x5e8f3e,_0x5e580f=selectModelTier(_0x31e560),_0x5178d5={...requirements};_0x5178d5['env']=process[_0x50d3f2(0x14e)];const _0x2795ce=selectProvider(_0x5178d5),_0xef9135={};return _0xef9135['tier']=_0x5e580f,_0xef9135['provi'+'der']=_0x2795ce,_0xef9135['reaso'+'n']='Selec'+_0x50d3f2(0x152)+_0x2795ce+'\x20'+_0x5e580f+('\x20tier'+_0x50d3f2(0x136)+'d\x20on\x20'+'conte'+'xt'),_0xef9135;}
|
package/src/rubrics.mjs
CHANGED
|
@@ -72,12 +72,19 @@ export const DEFAULT_RUBRIC = {
|
|
|
72
72
|
|
|
73
73
|
/**
|
|
74
74
|
* Build rubric prompt section
|
|
75
|
-
*
|
|
75
|
+
*
|
|
76
76
|
* @param {import('./index.mjs').Rubric | null} [rubric=null] - Rubric to use, or null for default
|
|
77
77
|
* @param {boolean} [includeDimensions=true] - Whether to include evaluation dimensions
|
|
78
|
+
* @param {{ referenceImages?: Record<number, string> }} [options={}] - Options
|
|
79
|
+
* referenceImages: map of score level -> image path for visual anchoring.
|
|
80
|
+
* When provided, the rubric prompt instructs the VLM to compare against
|
|
81
|
+
* reference images for each score level (Prometheus-Vision, arXiv:2401.06591).
|
|
82
|
+
* The caller is responsible for encoding and attaching images to the API call;
|
|
83
|
+
* this function only generates the text prompt referencing them.
|
|
78
84
|
* @returns {string} Formatted rubric prompt text
|
|
79
85
|
*/
|
|
80
|
-
export function buildRubricPrompt(rubric = null, includeDimensions = true) {
|
|
86
|
+
export function buildRubricPrompt(rubric = null, includeDimensions = true, options = {}) {
|
|
87
|
+
const { referenceImages = null } = options;
|
|
81
88
|
const rubricToUse = rubric || DEFAULT_RUBRIC;
|
|
82
89
|
let prompt = `## EVALUATION RUBRIC
|
|
83
90
|
|
|
@@ -114,6 +121,19 @@ JSON: {"score": 3, "assessment": "fail", "issues": ["broken layout", "critical c
|
|
|
114
121
|
7. List specific issues found (if any)
|
|
115
122
|
8. Provide reasoning for your score`;
|
|
116
123
|
|
|
124
|
+
// Visual anchoring: reference images for score levels (Prometheus-Vision, arXiv:2401.06591)
|
|
125
|
+
if (referenceImages && typeof referenceImages === 'object') {
|
|
126
|
+
const levels = Object.keys(referenceImages).map(Number).sort((a, b) => b - a);
|
|
127
|
+
if (levels.length > 0) {
|
|
128
|
+
prompt += `\n\n### Visual Reference Anchors:
|
|
129
|
+
The following reference images are provided as calibration anchors for specific score levels.
|
|
130
|
+
Compare the screenshot being evaluated against these references to calibrate your scoring.
|
|
131
|
+
${levels.map(level => `- **Score ${level}**: See reference image labeled "REF_SCORE_${level}"`).join('\n')}
|
|
132
|
+
|
|
133
|
+
Use these references to anchor your absolute scores. A screenshot similar in quality to REF_SCORE_9 should score around 9, etc.`;
|
|
134
|
+
}
|
|
135
|
+
}
|
|
136
|
+
|
|
117
137
|
if (includeDimensions && rubricToUse.dimensions) {
|
|
118
138
|
prompt += `\n\n### Evaluation Dimensions:
|
|
119
139
|
${Object.entries(rubricToUse.dimensions)
|
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Score Calibration
|
|
3
|
+
*
|
|
4
|
+
* Adjusts raw VLM scores to reduce provider-specific bias.
|
|
5
|
+
* Research shows each VLM has a stable "evaluative fingerprint" --
|
|
6
|
+
* systematic scoring tendencies that differ across providers
|
|
7
|
+
 * (Evaluative Fingerprints, arXiv:2601.05114 -- NOTE(review): this arXiv
 * identifier implies a 2026-01 submission and could not be verified; confirm
 * the citation before relying on it).
|
|
8
|
+
*
|
|
9
|
+
* Supports:
|
|
10
|
+
* - Per-provider linear calibration (offset + scale)
|
|
11
|
+
* - User-supplied calibration profiles
|
|
12
|
+
* - Score histogram analysis for drift detection
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
import { warn } from './logger.mjs';
|
|
16
|
+
import { ValidationError } from './errors.mjs';
|
|
17
|
+
|
|
18
|
+
/**
|
|
19
|
+
* Default calibration profiles per provider.
|
|
20
|
+
*
|
|
21
|
+
* These are initial estimates based on observed tendencies.
|
|
22
|
+
* Users should override with their own profiles via calibrate()
|
|
23
|
+
* after running createCalibrationSuite().
|
|
24
|
+
*
|
|
25
|
+
* Format: { offset, scale } where calibrated = (raw + offset) * scale
|
|
26
|
+
* Then clamped to [0, 10].
|
|
27
|
+
*/
|
|
28
|
+
const DEFAULT_PROFILES = {
|
|
29
|
+
gemini: { offset: 0, scale: 1.0 },
|
|
30
|
+
openai: { offset: 0, scale: 1.0 },
|
|
31
|
+
claude: { offset: 0, scale: 1.0 },
|
|
32
|
+
groq: { offset: 0, scale: 1.0 },
|
|
33
|
+
openrouter: { offset: 0, scale: 1.0 }
|
|
34
|
+
};
|
|
35
|
+
|
|
36
|
+
// User-supplied profiles override defaults
|
|
37
|
+
let userProfiles = {};
|
|
38
|
+
|
|
39
|
+
/**
|
|
40
|
+
* Set calibration profile for a provider
|
|
41
|
+
*
|
|
42
|
+
* @param {string} provider - Provider name
|
|
43
|
+
* @param {{ offset: number, scale: number }} profile - Calibration profile
|
|
44
|
+
*/
|
|
45
|
+
export function setCalibrationProfile(provider, profile) {
|
|
46
|
+
if (typeof profile.offset !== 'number' || typeof profile.scale !== 'number') {
|
|
47
|
+
throw new ValidationError('Calibration profile must have numeric offset and scale', { offset: typeof profile.offset, scale: typeof profile.scale });
|
|
48
|
+
}
|
|
49
|
+
if (profile.scale <= 0) {
|
|
50
|
+
throw new ValidationError('Calibration scale must be positive', { scale: profile.scale });
|
|
51
|
+
}
|
|
52
|
+
userProfiles[provider] = { ...profile };
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
/**
|
|
56
|
+
* Get calibration profile for a provider
|
|
57
|
+
*
|
|
58
|
+
* @param {string} provider - Provider name
|
|
59
|
+
* @returns {{ offset: number, scale: number }} Calibration profile
|
|
60
|
+
*/
|
|
61
|
+
export function getCalibrationProfile(provider) {
|
|
62
|
+
return userProfiles[provider] || DEFAULT_PROFILES[provider] || { offset: 0, scale: 1.0 };
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
/**
|
|
66
|
+
* Reset all calibration profiles to defaults
|
|
67
|
+
*/
|
|
68
|
+
export function resetCalibrationProfiles() {
|
|
69
|
+
userProfiles = {};
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
/**
|
|
73
|
+
* Calibrate a raw score using the provider's profile
|
|
74
|
+
*
|
|
75
|
+
* @param {number | null} score - Raw score from VLM (0-10)
|
|
76
|
+
* @param {string} provider - Provider name
|
|
77
|
+
* @returns {number | null} Calibrated score (0-10), or null if input is null
|
|
78
|
+
*/
|
|
79
|
+
export function calibrateScore(score, provider) {
|
|
80
|
+
if (score === null || score === undefined) {
|
|
81
|
+
return null;
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
const profile = getCalibrationProfile(provider);
|
|
85
|
+
const calibrated = (score + profile.offset) * profile.scale;
|
|
86
|
+
|
|
87
|
+
// Clamp to [0, 10]
|
|
88
|
+
return Math.max(0, Math.min(10, Math.round(calibrated * 100) / 100));
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
/**
|
|
92
|
+
* Derive a calibration profile from labeled data
|
|
93
|
+
*
|
|
94
|
+
* Given pairs of (raw VLM score, expected score), computes the
|
|
95
|
+
 * least-squares linear fit, returned in profile form: expected ≈ (raw + offset) * scale.
|
|
96
|
+
*
|
|
97
|
+
* @param {Array<{ raw: number, expected: number }>} pairs - Score pairs
|
|
98
|
+
* @returns {{ offset: number, scale: number, r2: number }} Calibration profile with fit quality
|
|
99
|
+
*/
|
|
100
|
+
export function deriveCalibrationProfile(pairs) {
|
|
101
|
+
if (!Array.isArray(pairs) || pairs.length < 2) {
|
|
102
|
+
throw new ValidationError('Need at least 2 (raw, expected) pairs to derive calibration', { count: pairs?.length ?? 0 });
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
const n = pairs.length;
|
|
106
|
+
let sumX = 0, sumY = 0, sumXX = 0, sumXY = 0, sumYY = 0;
|
|
107
|
+
|
|
108
|
+
for (const { raw, expected } of pairs) {
|
|
109
|
+
sumX += raw;
|
|
110
|
+
sumY += expected;
|
|
111
|
+
sumXX += raw * raw;
|
|
112
|
+
sumXY += raw * expected;
|
|
113
|
+
sumYY += expected * expected;
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
const denom = n * sumXX - sumX * sumX;
|
|
117
|
+
|
|
118
|
+
if (Math.abs(denom) < 1e-10) {
|
|
119
|
+
warn('[Calibration] All raw scores are identical; cannot derive profile');
|
|
120
|
+
return { offset: 0, scale: 1.0, r2: 0 };
|
|
121
|
+
}
|
|
122
|
+
|
|
123
|
+
// Linear regression: expected = scale * raw + offset_intercept
|
|
124
|
+
// We want calibrated = (raw + offset) * scale, so:
|
|
125
|
+
// calibrated = raw * scale + offset * scale
|
|
126
|
+
// Matching: scale = slope, offset_intercept = offset * scale -> offset = intercept / scale
|
|
127
|
+
const slope = (n * sumXY - sumX * sumY) / denom;
|
|
128
|
+
const intercept = (sumY - slope * sumX) / n;
|
|
129
|
+
|
|
130
|
+
// Convert to our format: calibrated = (raw + offset) * scale
|
|
131
|
+
const scale = slope || 1.0;
|
|
132
|
+
const offset = scale !== 0 ? intercept / scale : 0;
|
|
133
|
+
|
|
134
|
+
// R-squared
|
|
135
|
+
const meanY = sumY / n;
|
|
136
|
+
const ssTot = sumYY - n * meanY * meanY;
|
|
137
|
+
const ssRes = pairs.reduce((sum, { raw, expected }) => {
|
|
138
|
+
const predicted = raw * slope + intercept;
|
|
139
|
+
return sum + (expected - predicted) ** 2;
|
|
140
|
+
}, 0);
|
|
141
|
+
const r2 = ssTot > 0 ? 1 - ssRes / ssTot : 0;
|
|
142
|
+
|
|
143
|
+
return { offset, scale, r2 };
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
/**
|
|
147
|
+
* Analyze score distribution for a provider to detect drift
|
|
148
|
+
*
|
|
149
|
+
* @param {number[]} scores - Array of scores from a single provider
|
|
150
|
+
* @returns {{ mean: number, stddev: number, skew: number, histogram: Record<number, number> }}
|
|
151
|
+
*/
|
|
152
|
+
export function analyzeScoreDistribution(scores) {
|
|
153
|
+
if (!scores.length) {
|
|
154
|
+
return { mean: 0, stddev: 0, skew: 0, histogram: {} };
|
|
155
|
+
}
|
|
156
|
+
|
|
157
|
+
const n = scores.length;
|
|
158
|
+
const mean = scores.reduce((a, b) => a + b, 0) / n;
|
|
159
|
+
|
|
160
|
+
const variance = scores.reduce((sum, s) => sum + (s - mean) ** 2, 0) / n;
|
|
161
|
+
const stddev = Math.sqrt(variance);
|
|
162
|
+
|
|
163
|
+
// Skewness (Fisher's)
|
|
164
|
+
const skew = stddev > 0
|
|
165
|
+
? scores.reduce((sum, s) => sum + ((s - mean) / stddev) ** 3, 0) / n
|
|
166
|
+
: 0;
|
|
167
|
+
|
|
168
|
+
// Histogram (integer buckets 0-10)
|
|
169
|
+
const histogram = {};
|
|
170
|
+
for (let i = 0; i <= 10; i++) histogram[i] = 0;
|
|
171
|
+
for (const s of scores) {
|
|
172
|
+
const bucket = Math.max(0, Math.min(10, Math.round(s)));
|
|
173
|
+
histogram[bucket]++;
|
|
174
|
+
}
|
|
175
|
+
|
|
176
|
+
return { mean, stddev, skew, histogram };
|
|
177
|
+
}
|