@arclabs561/ai-visual-test 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.secretsignore.example +20 -0
- package/CHANGELOG.md +360 -0
- package/CONTRIBUTING.md +63 -0
- package/DEPLOYMENT.md +80 -0
- package/LICENSE +22 -0
- package/README.md +142 -0
- package/SECURITY.md +108 -0
- package/api/health.js +34 -0
- package/api/validate.js +252 -0
- package/index.d.ts +1221 -0
- package/package.json +112 -0
- package/public/index.html +149 -0
- package/src/batch-optimizer.mjs +451 -0
- package/src/bias-detector.mjs +370 -0
- package/src/bias-mitigation.mjs +233 -0
- package/src/cache.mjs +433 -0
- package/src/config.mjs +268 -0
- package/src/constants.mjs +80 -0
- package/src/context-compressor.mjs +350 -0
- package/src/convenience.mjs +617 -0
- package/src/cost-tracker.mjs +257 -0
- package/src/cross-modal-consistency.mjs +170 -0
- package/src/data-extractor.mjs +232 -0
- package/src/dynamic-few-shot.mjs +140 -0
- package/src/dynamic-prompts.mjs +361 -0
- package/src/ensemble/index.mjs +53 -0
- package/src/ensemble-judge.mjs +366 -0
- package/src/error-handler.mjs +67 -0
- package/src/errors.mjs +167 -0
- package/src/experience-propagation.mjs +128 -0
- package/src/experience-tracer.mjs +487 -0
- package/src/explanation-manager.mjs +299 -0
- package/src/feedback-aggregator.mjs +248 -0
- package/src/game-goal-prompts.mjs +478 -0
- package/src/game-player.mjs +548 -0
- package/src/hallucination-detector.mjs +155 -0
- package/src/helpers/playwright.mjs +80 -0
- package/src/human-validation-manager.mjs +516 -0
- package/src/index.mjs +364 -0
- package/src/judge.mjs +929 -0
- package/src/latency-aware-batch-optimizer.mjs +192 -0
- package/src/load-env.mjs +159 -0
- package/src/logger.mjs +55 -0
- package/src/metrics.mjs +187 -0
- package/src/model-tier-selector.mjs +221 -0
- package/src/multi-modal/index.mjs +36 -0
- package/src/multi-modal-fusion.mjs +190 -0
- package/src/multi-modal.mjs +524 -0
- package/src/natural-language-specs.mjs +1071 -0
- package/src/pair-comparison.mjs +277 -0
- package/src/persona/index.mjs +42 -0
- package/src/persona-enhanced.mjs +200 -0
- package/src/persona-experience.mjs +572 -0
- package/src/position-counterbalance.mjs +140 -0
- package/src/prompt-composer.mjs +375 -0
- package/src/render-change-detector.mjs +583 -0
- package/src/research-enhanced-validation.mjs +436 -0
- package/src/retry.mjs +152 -0
- package/src/rubrics.mjs +231 -0
- package/src/score-tracker.mjs +277 -0
- package/src/smart-validator.mjs +447 -0
- package/src/spec-config.mjs +106 -0
- package/src/spec-templates.mjs +347 -0
- package/src/specs/index.mjs +38 -0
- package/src/temporal/index.mjs +102 -0
- package/src/temporal-adaptive.mjs +163 -0
- package/src/temporal-batch-optimizer.mjs +222 -0
- package/src/temporal-constants.mjs +69 -0
- package/src/temporal-context.mjs +49 -0
- package/src/temporal-decision-manager.mjs +271 -0
- package/src/temporal-decision.mjs +669 -0
- package/src/temporal-errors.mjs +58 -0
- package/src/temporal-note-pruner.mjs +173 -0
- package/src/temporal-preprocessor.mjs +543 -0
- package/src/temporal-prompt-formatter.mjs +219 -0
- package/src/temporal-validation.mjs +159 -0
- package/src/temporal.mjs +415 -0
- package/src/type-guards.mjs +311 -0
- package/src/uncertainty-reducer.mjs +470 -0
- package/src/utils/index.mjs +175 -0
- package/src/validation-framework.mjs +321 -0
- package/src/validation-result-normalizer.mjs +64 -0
- package/src/validation.mjs +243 -0
- package/src/validators/accessibility-programmatic.mjs +345 -0
- package/src/validators/accessibility-validator.mjs +223 -0
- package/src/validators/batch-validator.mjs +143 -0
- package/src/validators/hybrid-validator.mjs +268 -0
- package/src/validators/index.mjs +34 -0
- package/src/validators/prompt-builder.mjs +218 -0
- package/src/validators/rubric.mjs +85 -0
- package/src/validators/state-programmatic.mjs +260 -0
- package/src/validators/state-validator.mjs +291 -0
- package/vercel.json +27 -0
package/src/judge.mjs
ADDED
|
@@ -0,0 +1,929 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* VLLM Judge
|
|
3
|
+
*
|
|
4
|
+
* Core screenshot validation using Vision Language Models.
|
|
5
|
+
* Supports multiple providers (Gemini, OpenAI, Claude, Groq).
|
|
6
|
+
*
|
|
7
|
+
* GROQ INTEGRATION:
|
|
8
|
+
* - Groq uses OpenAI-compatible API (routes to callOpenAIAPI)
|
|
9
|
+
* - ~0.22s latency (10x faster than typical providers)
|
|
10
|
+
* - Best for high-frequency decisions (10-60Hz temporal decisions)
|
|
11
|
+
*
|
|
12
|
+
* NOTE: Groq should also be added to @arclabs561/llm-utils for text-only LLM calls.
|
|
13
|
+
* This package handles VLLM (vision) calls; llm-utils handles text-only calls.
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
import { readFileSync, writeFileSync, existsSync } from 'fs';
|
|
17
|
+
import { join, dirname } from 'path';
|
|
18
|
+
import { fileURLToPath } from 'url';
|
|
19
|
+
import { createConfig, getConfig } from './config.mjs';
|
|
20
|
+
import { getCached, setCached } from './cache.mjs';
|
|
21
|
+
import { FileError, ProviderError, TimeoutError } from './errors.mjs';
|
|
22
|
+
import { log, warn } from './logger.mjs';
|
|
23
|
+
import { retryWithBackoff, enhanceErrorMessage } from './retry.mjs';
|
|
24
|
+
import { recordCost } from './cost-tracker.mjs';
|
|
25
|
+
import { normalizeValidationResult } from './validation-result-normalizer.mjs';
|
|
26
|
+
|
|
27
|
+
const __filename = fileURLToPath(import.meta.url);
|
|
28
|
+
const __dirname = dirname(__filename);
|
|
29
|
+
|
|
30
|
+
/**
|
|
31
|
+
* VLLM Judge Class
|
|
32
|
+
*
|
|
33
|
+
* Handles screenshot validation using Vision Language Models.
|
|
34
|
+
*/
|
|
35
|
+
export class VLLMJudge {
|
|
36
|
+
constructor(options = {}) {
|
|
37
|
+
this.config = createConfig(options);
|
|
38
|
+
this.provider = this.config.provider;
|
|
39
|
+
this.apiKey = this.config.apiKey;
|
|
40
|
+
this.providerConfig = this.config.providerConfig;
|
|
41
|
+
this.enabled = this.config.enabled;
|
|
42
|
+
this._cacheInitialized = false;
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
/**
|
|
46
|
+
* Initialize cache (lazy initialization)
|
|
47
|
+
*/
|
|
48
|
+
async _initCache() {
|
|
49
|
+
if (this._cacheInitialized || !this.config.cache.enabled) return;
|
|
50
|
+
|
|
51
|
+
if (this.config.cache.dir) {
|
|
52
|
+
const { initCache } = await import('./cache.mjs');
|
|
53
|
+
initCache(this.config.cache.dir);
|
|
54
|
+
}
|
|
55
|
+
this._cacheInitialized = true;
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
/**
|
|
59
|
+
* Convert image to base64 for API
|
|
60
|
+
*/
|
|
61
|
+
imageToBase64(imagePath) {
|
|
62
|
+
if (!existsSync(imagePath)) {
|
|
63
|
+
throw new FileError(`Screenshot not found: ${imagePath}`, imagePath);
|
|
64
|
+
}
|
|
65
|
+
try {
|
|
66
|
+
const imageBuffer = readFileSync(imagePath);
|
|
67
|
+
return imageBuffer.toString('base64');
|
|
68
|
+
} catch (error) {
|
|
69
|
+
throw new FileError(`Failed to read screenshot: ${error.message}`, imagePath, { originalError: error.message });
|
|
70
|
+
}
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
/**
|
|
74
|
+
* Judge screenshot using VLLM API
|
|
75
|
+
*
|
|
76
|
+
* @param {string | string[]} imagePath - Single image path or array of image paths for comparison
|
|
77
|
+
* @param {string} prompt - Evaluation prompt
|
|
78
|
+
* @param {import('./index.mjs').ValidationContext} [context={}] - Validation context
|
|
79
|
+
* @returns {Promise<import('./index.mjs').ValidationResult>} Validation result
|
|
80
|
+
*/
|
|
81
|
+
async judgeScreenshot(imagePath, prompt, context = {}) {
  // Support both single image and multi-image (for pair comparison)
  const imagePaths = Array.isArray(imagePath) ? imagePath : [imagePath];
  const isMultiImage = imagePaths.length > 1;
  if (!this.enabled) {
    // Return normalized disabled result (no API key configured).
    return normalizeValidationResult({
      enabled: false,
      provider: this.provider,
      message: `API validation disabled (set ${this.provider.toUpperCase()}_API_KEY or API_KEY)`,
      pricing: this.providerConfig.pricing,
      score: null,
      issues: [],
      reasoning: 'API validation is disabled',
      assessment: null
    }, 'judgeScreenshot-disabled');
  }

  // Initialize cache if needed
  await this._initCache();

  // Check cache first (if caching enabled). Multi-image requests use a
  // '|'-joined key so any change to either image misses the cache.
  const useCache = context.useCache !== false && this.config.cache.enabled;
  if (useCache) {
    const cacheKey = isMultiImage ? imagePaths.join('|') : imagePath;
    const cached = getCached(cacheKey, prompt, context);
    if (cached) {
      if (this.config.debug.verbose) {
        log(`[VLLM] Cache hit for ${cacheKey}`);
      }
      return { ...cached, cached: true };
    }
  }

  const startTime = Date.now();
  const timeout = context.timeout || this.config.performance.timeout;
  // NOTE(review): a single AbortController is shared across all retry
  // attempts; once the timer fires and aborts, every subsequent attempt
  // aborts immediately — confirm this is the intended retry semantics.
  const abortController = new AbortController();
  const timeoutId = setTimeout(() => abortController.abort(), timeout);

  // NOTE(review): `response` is declared but never assigned in this method.
  let response;
  let data;
  let judgment = null;
  let error = null;
  let attempts = 0; // incremented inside the retry callback; reported in results/errors

  try {
    // Convert all images to base64
    const base64Images = imagePaths.map(path => this.imageToBase64(path));
    const fullPrompt = await this.buildPrompt(prompt, context, isMultiImage);

    // Retry API calls with exponential backoff
    const maxRetries = context.maxRetries ?? 3;
    const apiResult = await retryWithBackoff(async () => {
      attempts++;
      let apiResponse;
      let apiData;
      let logprobs = null; // Declare once outside switch

      // Route to appropriate API based on provider.
      // NOTE(review): clearTimeout runs inside each case right after the
      // first fetch resolves, so retried attempts execute with the timeout
      // timer already cleared — confirm intended.
      switch (this.provider) {
        case 'gemini':
          apiResponse = await this.callGeminiAPI(base64Images, fullPrompt, abortController.signal, isMultiImage);
          clearTimeout(timeoutId);
          apiData = await apiResponse.json();

          if (apiData.error) {
            const statusCode = apiResponse.status;
            // 429 (rate limit) and 5xx are flagged retryable for retryWithBackoff.
            throw new ProviderError(
              `Gemini API error: ${apiData.error.message}`,
              'gemini',
              {
                apiError: apiData.error,
                statusCode,
                retryable: statusCode === 429 || statusCode >= 500
              }
            );
          }

          // Extract logprobs if available (for uncertainty estimation)
          logprobs = apiData.candidates?.[0]?.content?.parts?.[0]?.logprobs || null;

          return {
            judgment: apiData.candidates?.[0]?.content?.parts?.[0]?.text || 'No response',
            data: apiData,
            logprobs
          };

        case 'openai':
          apiResponse = await this.callOpenAIAPI(base64Images, fullPrompt, abortController.signal, isMultiImage);
          clearTimeout(timeoutId);
          apiData = await apiResponse.json();

          if (apiData.error) {
            const statusCode = apiResponse.status;
            throw new ProviderError(
              `OpenAI API error: ${apiData.error.message}`,
              'openai',
              {
                apiError: apiData.error,
                statusCode,
                retryable: statusCode === 429 || statusCode >= 500
              }
            );
          }

          // Extract logprobs if available (OpenAI provides logprobs when requested)
          logprobs = apiData.choices?.[0]?.logprobs || null;

          return {
            judgment: apiData.choices?.[0]?.message?.content || 'No response',
            data: apiData,
            logprobs
          };

        case 'claude':
          apiResponse = await this.callClaudeAPI(base64Images, fullPrompt, abortController.signal, isMultiImage);
          clearTimeout(timeoutId);
          apiData = await apiResponse.json();

          if (apiData.error) {
            const statusCode = apiResponse.status;
            throw new ProviderError(
              `Claude API error: ${apiData.error.message || 'Unknown error'}`,
              'claude',
              {
                apiError: apiData.error,
                statusCode,
                retryable: statusCode === 429 || statusCode >= 500
              }
            );
          }

          // Claude doesn't provide logprobs in standard API
          logprobs = null;

          return {
            judgment: apiData.content?.[0]?.text || 'No response',
            data: apiData,
            logprobs
          };

        case 'groq':
          // Groq uses OpenAI-compatible API, so we can reuse callOpenAIAPI
          // Groq's endpoint is already set in providerConfig.apiUrl (https://api.groq.com/openai/v1)
          apiResponse = await this.callOpenAIAPI(base64Images, fullPrompt, abortController.signal, isMultiImage);
          clearTimeout(timeoutId);
          apiData = await apiResponse.json();

          if (apiData.error) {
            const statusCode = apiResponse.status;
            throw new ProviderError(
              `Groq API error: ${apiData.error.message || 'Unknown error'}`,
              'groq',
              {
                apiError: apiData.error,
                statusCode,
                retryable: statusCode === 429 || statusCode >= 500
              }
            );
          }

          // Groq may provide logprobs (OpenAI-compatible, but check availability)
          logprobs = apiData.choices?.[0]?.logprobs || null;

          return {
            judgment: apiData.choices?.[0]?.message?.content || 'No response',
            data: apiData,
            logprobs
          };

        default:
          throw new ProviderError(`Unknown provider: ${this.provider}`, this.provider);
      }
    }, {
      maxRetries,
      baseDelay: 1000,
      maxDelay: 30000,
      onRetry: (err, attempt, delay) => {
        if (this.config.debug.verbose) {
          warn(`[VLLM] Retry ${attempt}/${maxRetries} for ${this.provider} API: ${err.message} (waiting ${delay}ms)`);
        }
      }
    });

    judgment = apiResult.judgment;
    data = apiResult.data;
    const logprobs = apiResult.logprobs || null;

    const responseTime = Date.now() - startTime;
    const semanticInfo = this.extractSemanticInfo(judgment);

    // Enhance with uncertainty reduction (if enabled)
    let uncertainty = null;
    let confidence = null;
    let selfConsistencyRecommended = false;
    let selfConsistencyN = 0;
    let selfConsistencyReason = '';

    if (context.enableUncertaintyReduction !== false) {
      try {
        // Optional module; loaded lazily so its absence is non-fatal.
        const { enhanceWithUncertainty } = await import('./uncertainty-reducer.mjs');
        // Pass context and partial result for adaptive self-consistency decision
        const enhanced = enhanceWithUncertainty({
          judgment,
          logprobs,
          attempts,
          screenshotPath: imagePath,
          score: semanticInfo.score,
          issues: semanticInfo.issues || []
        }, {
          enableHallucinationCheck: context.enableHallucinationCheck !== false,
          adaptiveSelfConsistency: context.adaptiveSelfConsistency !== false
        }, context);
        uncertainty = enhanced.uncertainty;
        confidence = enhanced.confidence;
        // Extract self-consistency recommendation (for future use or logging)
        selfConsistencyRecommended = enhanced.selfConsistencyRecommended || false;
        selfConsistencyN = enhanced.selfConsistencyN || 0;
        selfConsistencyReason = enhanced.selfConsistencyReason || '';
      } catch (error) {
        // Silently fail - uncertainty reduction is optional
        if (this.config.debug.verbose) {
          warn(`[VLLM] Uncertainty reduction failed: ${error.message}`);
        }
      }
    }

    // Estimate cost (data might not be available if retry succeeded)
    // NOTE(review): estimateCost is defined elsewhere in this class (not
    // visible in this chunk) — its contract is assumed here.
    const estimatedCost = data ? this.estimateCost(data, this.provider) : null;

    // Record cost for tracking
    if (estimatedCost && estimatedCost.totalCost) {
      try {
        recordCost({
          provider: this.provider,
          cost: estimatedCost.totalCost,
          inputTokens: estimatedCost.inputTokens || 0,
          outputTokens: estimatedCost.outputTokens || 0,
          testName: context.testType || context.step || 'unknown'
        });
      } catch {
        // Silently fail if cost tracking unavailable
      }
    }

    // Assemble the full validation result; normalized before returning.
    const validationResult = {
      enabled: true,
      provider: this.provider,
      judgment,
      score: semanticInfo.score,
      issues: semanticInfo.issues,
      assessment: semanticInfo.assessment,
      reasoning: semanticInfo.reasoning,
      pricing: this.providerConfig.pricing,
      estimatedCost,
      responseTime,
      timestamp: new Date().toISOString(),
      testName: context.testType || context.step || 'unknown',
      viewport: context.viewport || null,
      raw: data || null,
      semantic: semanticInfo,
      attempts: attempts || 1,
      logprobs, // Include logprobs for uncertainty estimation (if available)
      uncertainty, // Uncertainty estimate (0-1, higher = more uncertain)
      confidence, // Confidence estimate (0-1, higher = more confident)
      screenshotPath: imagePath, // Include for human validation; NOTE(review): may be an array in multi-image mode
      // Self-consistency recommendation (based on uncertainty × payout analysis)
      selfConsistencyRecommended, // Whether self-consistency is recommended for this validation
      selfConsistencyN, // Recommended number of self-consistency calls (0 = not recommended)
      selfConsistencyReason // Reason for recommendation (for logging/debugging)
    };

    // Collect VLLM judgment for human validation (non-blocking)
    if (context.enableHumanValidation !== false) {
      try {
        const { getHumanValidationManager } = await import('./human-validation-manager.mjs');
        const manager = getHumanValidationManager();
        if (manager && manager.enabled) {
          // Non-blocking: Don't wait for human validation collection
          manager.collectVLLMJudgment(validationResult, imagePath, prompt, context)
            .catch(err => {
              // Silently fail - human validation is optional
              if (this.config.debug.verbose) {
                warn('[VLLM] Human validation collection failed:', err.message);
              }
            });
        }
      } catch (err) {
        // Silently fail if human validation manager not available
      }
    }

    // Apply calibration if available (non-blocking check)
    if (context.applyCalibration !== false && validationResult.score !== null) {
      try {
        const { getHumanValidationManager } = await import('./human-validation-manager.mjs');
        const manager = getHumanValidationManager();
        if (manager && manager.enabled) {
          const calibratedScore = manager.applyCalibration(validationResult.score);
          if (calibratedScore !== validationResult.score) {
            // Preserve the raw score so consumers can see both values.
            validationResult.originalScore = validationResult.score;
            validationResult.score = calibratedScore;
            validationResult.calibrated = true;
          }
        }
      } catch (err) {
        // Silently fail if calibration not available
      }
    }

    // Cache result (use first image path for single image, or combined key for multi-image)
    if (useCache) {
      const cacheKey = isMultiImage ? imagePaths.join('|') : imagePath;
      setCached(cacheKey, prompt, context, validationResult);
    }

    // Normalize result structure before returning (ensures consistent structure)
    return normalizeValidationResult(validationResult, 'judgeScreenshot');
  } catch (err) {
    clearTimeout(timeoutId);
    error = err;

    // Handle timeout errors specifically
    if (error.name === 'AbortError' || error.message?.includes('timeout') || error.message?.includes('aborted')) {
      const enhancedMessage = enhanceErrorMessage(
        new TimeoutError(`VLLM API call timed out after ${timeout}ms`, timeout),
        attempts || 1,
        'judgeScreenshot'
      );
      throw new TimeoutError(enhancedMessage, timeout, {
        provider: this.provider,
        imagePath,
        attempts: attempts || 1
      });
    }

    // Re-throw ProviderError with enhanced context
    if (error instanceof ProviderError) {
      const enhancedMessage = enhanceErrorMessage(error, attempts || 1, 'judgeScreenshot');
      throw new ProviderError(enhancedMessage, this.provider, {
        ...error.details,
        imagePath,
        prompt: prompt.substring(0, 100),
        attempts: attempts || 1
      });
    }

    // Re-throw FileError and TimeoutError as-is (already have context)
    if (error instanceof FileError || error instanceof TimeoutError) {
      throw error;
    }

    // For other errors, enhance message and throw (consistent error handling)
    const enhancedMessage = enhanceErrorMessage(error, attempts || 1, 'judgeScreenshot');
    throw new ProviderError(
      `VLLM API call failed: ${enhancedMessage}`,
      this.provider,
      {
        imagePath,
        prompt: prompt.substring(0, 100),
        attempts: attempts || 1,
        originalError: error.message
      }
    );
  }
}
|
|
447
|
+
|
|
448
|
+
/**
|
|
449
|
+
* Build prompt for screenshot validation
|
|
450
|
+
*
|
|
451
|
+
* Uses unified prompt composition system for research-backed consistency.
|
|
452
|
+
* Research: Explicit rubrics improve reliability by 10-20% (arXiv:2412.05579)
|
|
453
|
+
*
|
|
454
|
+
* Supports variable goals: if context.goal is provided, it will be used to generate
|
|
455
|
+
* the base prompt before composition. This allows seamless integration of variable
|
|
456
|
+
* goals throughout the system.
|
|
457
|
+
*
|
|
458
|
+
* @param {string} prompt - Base prompt (or ignored if context.goal is provided)
|
|
459
|
+
* @param {import('./index.mjs').ValidationContext} context - Validation context
|
|
460
|
+
* @param {boolean} [isMultiImage=false] - Whether this is a multi-image comparison
|
|
461
|
+
* @returns {string} Full prompt with context
|
|
462
|
+
*/
|
|
463
|
+
async buildPrompt(prompt, context = {}, isMultiImage = false) {
|
|
464
|
+
// If custom prompt builder provided, use it
|
|
465
|
+
if (context.promptBuilder && typeof context.promptBuilder === 'function') {
|
|
466
|
+
return context.promptBuilder(prompt, context);
|
|
467
|
+
}
|
|
468
|
+
|
|
469
|
+
// Use unified prompt composition system (which handles variable goals)
|
|
470
|
+
// Pass goal in context - composeSingleImagePrompt/composeComparisonPrompt will handle it
|
|
471
|
+
try {
|
|
472
|
+
if (isMultiImage) {
|
|
473
|
+
return await composeComparisonPrompt(prompt, context, {
|
|
474
|
+
includeRubric: context.includeRubric !== false // Default true (research-backed)
|
|
475
|
+
});
|
|
476
|
+
} else {
|
|
477
|
+
return await composeSingleImagePrompt(prompt, context, {
|
|
478
|
+
includeRubric: context.includeRubric !== false, // Default true (research-backed)
|
|
479
|
+
temporalNotes: context.temporalNotes || null
|
|
480
|
+
});
|
|
481
|
+
}
|
|
482
|
+
} catch (error) {
|
|
483
|
+
// Fallback to basic prompt building if composition fails
|
|
484
|
+
if (this.config.debug.verbose) {
|
|
485
|
+
warn(`[VLLM] Prompt composition failed, using fallback: ${error.message}`);
|
|
486
|
+
}
|
|
487
|
+
|
|
488
|
+
// Basic fallback (original implementation)
|
|
489
|
+
let fullPrompt = prompt;
|
|
490
|
+
const contextParts = [];
|
|
491
|
+
if (context.testType) {
|
|
492
|
+
contextParts.push(`Test Type: ${context.testType}`);
|
|
493
|
+
}
|
|
494
|
+
if (context.viewport) {
|
|
495
|
+
contextParts.push(`Viewport: ${context.viewport.width}x${context.viewport.height}`);
|
|
496
|
+
}
|
|
497
|
+
if (context.gameState) {
|
|
498
|
+
contextParts.push(`Game State: ${JSON.stringify(context.gameState)}`);
|
|
499
|
+
}
|
|
500
|
+
if (contextParts.length > 0) {
|
|
501
|
+
fullPrompt = `${prompt}\n\nContext:\n${contextParts.join('\n')}`;
|
|
502
|
+
}
|
|
503
|
+
if (isMultiImage) {
|
|
504
|
+
fullPrompt = `${fullPrompt}\n\nYou are comparing two screenshots side-by-side. Return JSON with:
|
|
505
|
+
{
|
|
506
|
+
"winner": "A" | "B" | "tie",
|
|
507
|
+
"confidence": 0.0-1.0,
|
|
508
|
+
"reasoning": "explanation",
|
|
509
|
+
"differences": ["difference1", "difference2"],
|
|
510
|
+
"scores": {"A": 0-10, "B": 0-10}
|
|
511
|
+
}`;
|
|
512
|
+
}
|
|
513
|
+
return fullPrompt;
|
|
514
|
+
}
|
|
515
|
+
}
|
|
516
|
+
|
|
517
|
+
/**
|
|
518
|
+
* Extract semantic information from judgment text
|
|
519
|
+
*/
|
|
520
|
+
extractSemanticInfo(judgment) {
|
|
521
|
+
// Handle case where judgment is already an object
|
|
522
|
+
if (typeof judgment === 'object' && judgment !== null && !Array.isArray(judgment)) {
|
|
523
|
+
// Normalize issues: handle both array of strings and array of objects
|
|
524
|
+
let issues = judgment.issues || [];
|
|
525
|
+
if (issues.length > 0 && typeof issues[0] === 'string') {
|
|
526
|
+
// Convert string array to object array for consistency
|
|
527
|
+
issues = issues.map(desc => ({
|
|
528
|
+
description: desc,
|
|
529
|
+
importance: 'medium',
|
|
530
|
+
annoyance: 'medium',
|
|
531
|
+
impact: 'minor-inconvenience'
|
|
532
|
+
}));
|
|
533
|
+
}
|
|
534
|
+
|
|
535
|
+
// Normalize recommendations: handle both array of strings and array of objects
|
|
536
|
+
let recommendations = judgment.recommendations || [];
|
|
537
|
+
if (recommendations.length > 0 && typeof recommendations[0] === 'string') {
|
|
538
|
+
recommendations = recommendations.map(suggestion => ({
|
|
539
|
+
priority: 'medium',
|
|
540
|
+
suggestion,
|
|
541
|
+
expectedImpact: 'improved user experience'
|
|
542
|
+
}));
|
|
543
|
+
}
|
|
544
|
+
|
|
545
|
+
return {
|
|
546
|
+
score: judgment.score || null,
|
|
547
|
+
issues: issues,
|
|
548
|
+
assessment: judgment.assessment || null,
|
|
549
|
+
reasoning: judgment.reasoning || null,
|
|
550
|
+
strengths: judgment.strengths || [],
|
|
551
|
+
recommendations: recommendations,
|
|
552
|
+
evidence: judgment.evidence || null,
|
|
553
|
+
brutalistViolations: judgment.brutalistViolations || [],
|
|
554
|
+
zeroToleranceViolations: judgment.zeroToleranceViolations || []
|
|
555
|
+
};
|
|
556
|
+
}
|
|
557
|
+
|
|
558
|
+
// Handle case where judgment is a string
|
|
559
|
+
const judgmentText = typeof judgment === 'string' ? judgment : String(judgment || '');
|
|
560
|
+
|
|
561
|
+
try {
|
|
562
|
+
const jsonMatch = judgmentText.match(/\{[\s\S]*\}/);
|
|
563
|
+
if (jsonMatch) {
|
|
564
|
+
const parsed = JSON.parse(jsonMatch[0]);
|
|
565
|
+
// Normalize issues and recommendations
|
|
566
|
+
let issues = parsed.issues || [];
|
|
567
|
+
if (issues.length > 0 && typeof issues[0] === 'string') {
|
|
568
|
+
issues = issues.map(desc => ({
|
|
569
|
+
description: desc,
|
|
570
|
+
importance: 'medium',
|
|
571
|
+
annoyance: 'medium',
|
|
572
|
+
impact: 'minor-inconvenience'
|
|
573
|
+
}));
|
|
574
|
+
}
|
|
575
|
+
|
|
576
|
+
let recommendations = parsed.recommendations || [];
|
|
577
|
+
if (recommendations.length > 0 && typeof recommendations[0] === 'string') {
|
|
578
|
+
recommendations = recommendations.map(suggestion => ({
|
|
579
|
+
priority: 'medium',
|
|
580
|
+
suggestion,
|
|
581
|
+
expectedImpact: 'improved user experience'
|
|
582
|
+
}));
|
|
583
|
+
}
|
|
584
|
+
|
|
585
|
+
return {
|
|
586
|
+
score: parsed.score || null,
|
|
587
|
+
issues: issues,
|
|
588
|
+
assessment: parsed.assessment || null,
|
|
589
|
+
reasoning: parsed.reasoning || null,
|
|
590
|
+
strengths: parsed.strengths || [],
|
|
591
|
+
recommendations: recommendations,
|
|
592
|
+
evidence: parsed.evidence || null,
|
|
593
|
+
brutalistViolations: parsed.brutalistViolations || [],
|
|
594
|
+
zeroToleranceViolations: parsed.zeroToleranceViolations || []
|
|
595
|
+
};
|
|
596
|
+
}
|
|
597
|
+
} catch (e) {
|
|
598
|
+
// Fall through to regex extraction
|
|
599
|
+
}
|
|
600
|
+
|
|
601
|
+
// Fallback: extract basic info from text
|
|
602
|
+
// Try to extract score from the full judgment text (including reasoning)
|
|
603
|
+
const extractedScore = this.extractScore(judgmentText);
|
|
604
|
+
|
|
605
|
+
return {
|
|
606
|
+
score: extractedScore,
|
|
607
|
+
issues: this.extractIssues(judgmentText),
|
|
608
|
+
assessment: this.extractAssessment(judgmentText),
|
|
609
|
+
reasoning: judgmentText.substring(0, 500)
|
|
610
|
+
};
|
|
611
|
+
}
|
|
612
|
+
|
|
613
|
+
/**
|
|
614
|
+
* Extract score from judgment text
|
|
615
|
+
*/
|
|
616
|
+
extractScore(judgment) {
|
|
617
|
+
if (!judgment || typeof judgment !== 'string') return null;
|
|
618
|
+
|
|
619
|
+
const patterns = [
|
|
620
|
+
// JSON format: "score": 7
|
|
621
|
+
/"score"\s*:\s*(\d+)/i,
|
|
622
|
+
// Text format: Score: 7 or Score 7
|
|
623
|
+
/score[:\s]*(\d+)/i,
|
|
624
|
+
// Fraction format: score: 7/10 or 7/10
|
|
625
|
+
/score[:\s]*(\d+)\s*\/\s*10/i,
|
|
626
|
+
/(\d+)\s*\/\s*10/i,
|
|
627
|
+
// Rating format: Rating: 7, Rated 7
|
|
628
|
+
/rating[:\s]*(\d+)/i,
|
|
629
|
+
/rated[:\s]*(\d+)/i,
|
|
630
|
+
// Verdict format: Verdict: PASS (7/10) or Verdict: FAIL (3/10)
|
|
631
|
+
/verdict[:\s]*(?:pass|fail)[:\s]*\((\d+)\s*\/\s*10\)/i,
|
|
632
|
+
// Markdown format: **Score**: 7 or ## Score: 7
|
|
633
|
+
/\*\*score\*\*[:\s]*(\d+)/i,
|
|
634
|
+
/##\s*score[:\s]*(\d+)/i,
|
|
635
|
+
// Structured text: "Overall Score: 7 out of 10"
|
|
636
|
+
/overall\s*score[:\s]*(\d+)\s*(?:out\s*of|\/)\s*10/i,
|
|
637
|
+
// Standalone number at start (common when API returns just "10" or "9" as reasoning)
|
|
638
|
+
// Match: "10", "10.", "10 ", "10\n", etc.
|
|
639
|
+
/^\s*(\d{1,2})(?:\s|\.|$)/,
|
|
640
|
+
// Number followed by common words (e.g., "10 out of 10", "9/10")
|
|
641
|
+
/^(\d{1,2})\s*(?:out\s*of|\/)\s*10/i,
|
|
642
|
+
// "Rate from 1-10" response patterns
|
|
643
|
+
/rate[:\s]*(\d{1,2})\s*(?:out\s*of|\/)?\s*10/i,
|
|
644
|
+
// Very simple: just a number 0-10 at the start (for cases like "10" with nothing else)
|
|
645
|
+
/^(\d{1,2})$/
|
|
646
|
+
];
|
|
647
|
+
|
|
648
|
+
for (const pattern of patterns) {
|
|
649
|
+
const match = judgment.match(pattern);
|
|
650
|
+
if (match) {
|
|
651
|
+
const score = parseInt(match[1]);
|
|
652
|
+
if (score >= 0 && score <= 10) {
|
|
653
|
+
return score;
|
|
654
|
+
}
|
|
655
|
+
}
|
|
656
|
+
}
|
|
657
|
+
|
|
658
|
+
// Try to infer from verdict language
|
|
659
|
+
const lower = judgment.toLowerCase();
|
|
660
|
+
if (lower.includes('excellent') || lower.includes('outstanding')) {
|
|
661
|
+
return 9;
|
|
662
|
+
}
|
|
663
|
+
if (lower.includes('very good') || lower.includes('great')) {
|
|
664
|
+
return 8;
|
|
665
|
+
}
|
|
666
|
+
if (lower.includes('good') || lower.includes('satisfactory')) {
|
|
667
|
+
return 7;
|
|
668
|
+
}
|
|
669
|
+
if (lower.includes('fair') || lower.includes('adequate')) {
|
|
670
|
+
return 6;
|
|
671
|
+
}
|
|
672
|
+
if (lower.includes('poor') || lower.includes('needs improvement')) {
|
|
673
|
+
return 4;
|
|
674
|
+
}
|
|
675
|
+
if (lower.includes('fail') && !lower.includes('pass')) {
|
|
676
|
+
return 3;
|
|
677
|
+
}
|
|
678
|
+
|
|
679
|
+
return null;
|
|
680
|
+
}
|
|
681
|
+
|
|
682
|
+
/**
|
|
683
|
+
* Extract issues from judgment text
|
|
684
|
+
*/
|
|
685
|
+
extractIssues(judgment) {
|
|
686
|
+
try {
|
|
687
|
+
const jsonMatch = judgment.match(/\{[\s\S]*\}/);
|
|
688
|
+
if (jsonMatch) {
|
|
689
|
+
const parsed = JSON.parse(jsonMatch[0]);
|
|
690
|
+
return parsed.issues || [];
|
|
691
|
+
}
|
|
692
|
+
} catch (e) {
|
|
693
|
+
// Fall through to regex
|
|
694
|
+
}
|
|
695
|
+
|
|
696
|
+
const issues = [];
|
|
697
|
+
const lines = judgment.split('\n');
|
|
698
|
+
for (const line of lines) {
|
|
699
|
+
if (line.match(/[-*]\s*(.+)/i) || line.match(/\d+\.\s*(.+)/i)) {
|
|
700
|
+
issues.push(line.replace(/[-*]\s*|\d+\.\s*/i, '').trim());
|
|
701
|
+
}
|
|
702
|
+
}
|
|
703
|
+
|
|
704
|
+
return issues;
|
|
705
|
+
}
|
|
706
|
+
|
|
707
|
+
/**
|
|
708
|
+
* Extract assessment from judgment text
|
|
709
|
+
*/
|
|
710
|
+
extractAssessment(judgment) {
|
|
711
|
+
try {
|
|
712
|
+
const jsonMatch = judgment.match(/\{[\s\S]*\}/);
|
|
713
|
+
if (jsonMatch) {
|
|
714
|
+
const parsed = JSON.parse(jsonMatch[0]);
|
|
715
|
+
return parsed.assessment || null;
|
|
716
|
+
}
|
|
717
|
+
} catch (e) {
|
|
718
|
+
// Fall through to regex
|
|
719
|
+
}
|
|
720
|
+
|
|
721
|
+
const lower = judgment.toLowerCase();
|
|
722
|
+
if (lower.includes('pass') && !lower.includes('fail')) {
|
|
723
|
+
return 'pass';
|
|
724
|
+
}
|
|
725
|
+
if (lower.includes('fail')) {
|
|
726
|
+
return 'fail';
|
|
727
|
+
}
|
|
728
|
+
|
|
729
|
+
return null;
|
|
730
|
+
}
|
|
731
|
+
|
|
732
|
+
/**
|
|
733
|
+
* Call Google Gemini API
|
|
734
|
+
*
|
|
735
|
+
* @param {string | string[]} base64Images - Single image or array of images (base64)
|
|
736
|
+
* @param {string} prompt - Evaluation prompt
|
|
737
|
+
* @param {AbortSignal} signal - Abort signal for timeout
|
|
738
|
+
* @param {boolean} [isMultiImage=false] - Whether this is a multi-image request
|
|
739
|
+
* @returns {Promise<Response>} API response
|
|
740
|
+
*/
|
|
741
|
+
async callGeminiAPI(base64Images, prompt, signal, isMultiImage = false) {
|
|
742
|
+
const images = Array.isArray(base64Images) ? base64Images : [base64Images];
|
|
743
|
+
|
|
744
|
+
// Build parts array: text prompt + all images
|
|
745
|
+
const parts = [{ text: prompt }];
|
|
746
|
+
for (const base64Image of images) {
|
|
747
|
+
parts.push({
|
|
748
|
+
inline_data: {
|
|
749
|
+
mime_type: 'image/png',
|
|
750
|
+
data: base64Image
|
|
751
|
+
}
|
|
752
|
+
});
|
|
753
|
+
}
|
|
754
|
+
|
|
755
|
+
// SECURITY: Use header for API key, not URL parameter
|
|
756
|
+
// API keys in URLs are exposed in logs, browser history, referrer headers
|
|
757
|
+
return fetch(
|
|
758
|
+
`${this.providerConfig.apiUrl}/models/${this.providerConfig.model}:generateContent`,
|
|
759
|
+
{
|
|
760
|
+
method: 'POST',
|
|
761
|
+
headers: {
|
|
762
|
+
'Content-Type': 'application/json',
|
|
763
|
+
'x-goog-api-key': this.apiKey // Use header instead of URL parameter
|
|
764
|
+
},
|
|
765
|
+
signal,
|
|
766
|
+
body: JSON.stringify({
|
|
767
|
+
contents: [{ parts }],
|
|
768
|
+
generationConfig: {
|
|
769
|
+
temperature: 0.1,
|
|
770
|
+
maxOutputTokens: 2000,
|
|
771
|
+
topP: 0.95,
|
|
772
|
+
topK: 40
|
|
773
|
+
}
|
|
774
|
+
})
|
|
775
|
+
}
|
|
776
|
+
);
|
|
777
|
+
}
|
|
778
|
+
|
|
779
|
+
/**
|
|
780
|
+
* Call OpenAI API
|
|
781
|
+
*
|
|
782
|
+
* @param {string | string[]} base64Images - Single image or array of images (base64)
|
|
783
|
+
* @param {string} prompt - Evaluation prompt
|
|
784
|
+
* @param {AbortSignal} signal - Abort signal for timeout
|
|
785
|
+
* @param {boolean} [isMultiImage=false] - Whether this is a multi-image request
|
|
786
|
+
* @returns {Promise<Response>} API response
|
|
787
|
+
*/
|
|
788
|
+
async callOpenAIAPI(base64Images, prompt, signal, isMultiImage = false) {
|
|
789
|
+
const images = Array.isArray(base64Images) ? base64Images : [base64Images];
|
|
790
|
+
|
|
791
|
+
// Build content array: text prompt + all images
|
|
792
|
+
const content = [{ type: 'text', text: prompt }];
|
|
793
|
+
for (const base64Image of images) {
|
|
794
|
+
content.push({
|
|
795
|
+
type: 'image_url',
|
|
796
|
+
image_url: { url: `data:image/png;base64,${base64Image}` }
|
|
797
|
+
});
|
|
798
|
+
}
|
|
799
|
+
|
|
800
|
+
return fetch(`${this.providerConfig.apiUrl}/chat/completions`, {
|
|
801
|
+
method: 'POST',
|
|
802
|
+
headers: {
|
|
803
|
+
'Content-Type': 'application/json',
|
|
804
|
+
'Authorization': `Bearer ${this.apiKey}`
|
|
805
|
+
},
|
|
806
|
+
signal,
|
|
807
|
+
body: JSON.stringify({
|
|
808
|
+
model: this.providerConfig.model,
|
|
809
|
+
messages: [{
|
|
810
|
+
role: 'user',
|
|
811
|
+
content
|
|
812
|
+
}],
|
|
813
|
+
// Some OpenAI models have limited parameter support
|
|
814
|
+
// Models that only support default temperature (1): gpt-4o-mini, gpt-5
|
|
815
|
+
// Models that support custom temperature: gpt-4o, gpt-4-turbo, etc.
|
|
816
|
+
// Only include temperature if model supports custom values (omit for models that require default)
|
|
817
|
+
...(this.providerConfig.model.includes('mini') || this.providerConfig.model.includes('gpt-5')
|
|
818
|
+
? {} // Use default temperature (1) - don't specify for models that require it
|
|
819
|
+
: { temperature: 0.1, top_p: 0.95 } // Custom values for models that support them
|
|
820
|
+
),
|
|
821
|
+
// Use max_completion_tokens for newer models (gpt-4o, gpt-5), max_tokens for older models
|
|
822
|
+
...(this.providerConfig.model.startsWith('gpt-4o') || this.providerConfig.model.startsWith('gpt-5')
|
|
823
|
+
? { max_completion_tokens: 2000 }
|
|
824
|
+
: { max_tokens: 2000 })
|
|
825
|
+
// Note: logprobs removed - not all OpenAI models support it (e.g., vision models)
|
|
826
|
+
// If needed, can be conditionally added based on model support
|
|
827
|
+
})
|
|
828
|
+
});
|
|
829
|
+
}
|
|
830
|
+
|
|
831
|
+
/**
|
|
832
|
+
* Call Anthropic Claude API
|
|
833
|
+
*
|
|
834
|
+
* @param {string | string[]} base64Images - Single image or array of images (base64)
|
|
835
|
+
* @param {string} prompt - Evaluation prompt
|
|
836
|
+
* @param {AbortSignal} signal - Abort signal for timeout
|
|
837
|
+
* @param {boolean} [isMultiImage=false] - Whether this is a multi-image request
|
|
838
|
+
* @returns {Promise<Response>} API response
|
|
839
|
+
*/
|
|
840
|
+
async callClaudeAPI(base64Images, prompt, signal, isMultiImage = false) {
|
|
841
|
+
const images = Array.isArray(base64Images) ? base64Images : [base64Images];
|
|
842
|
+
|
|
843
|
+
// Build content array: text prompt + all images
|
|
844
|
+
const content = [{ type: 'text', text: prompt }];
|
|
845
|
+
for (const base64Image of images) {
|
|
846
|
+
content.push({
|
|
847
|
+
type: 'image',
|
|
848
|
+
source: {
|
|
849
|
+
type: 'base64',
|
|
850
|
+
media_type: 'image/png',
|
|
851
|
+
data: base64Image
|
|
852
|
+
}
|
|
853
|
+
});
|
|
854
|
+
}
|
|
855
|
+
|
|
856
|
+
return fetch(`${this.providerConfig.apiUrl}/messages`, {
|
|
857
|
+
method: 'POST',
|
|
858
|
+
headers: {
|
|
859
|
+
'Content-Type': 'application/json',
|
|
860
|
+
'x-api-key': this.apiKey,
|
|
861
|
+
'anthropic-version': '2023-06-01'
|
|
862
|
+
},
|
|
863
|
+
signal,
|
|
864
|
+
body: JSON.stringify({
|
|
865
|
+
model: this.providerConfig.model,
|
|
866
|
+
max_tokens: 2000, // Increased for pair comparison
|
|
867
|
+
messages: [{
|
|
868
|
+
role: 'user',
|
|
869
|
+
content
|
|
870
|
+
}]
|
|
871
|
+
})
|
|
872
|
+
});
|
|
873
|
+
}
|
|
874
|
+
|
|
875
|
+
/**
|
|
876
|
+
* Estimate cost based on token usage
|
|
877
|
+
*/
|
|
878
|
+
estimateCost(data, provider) {
|
|
879
|
+
if (!this.providerConfig.pricing || this.providerConfig.pricing.input === 0) {
|
|
880
|
+
return null; // Free or self-hosted
|
|
881
|
+
}
|
|
882
|
+
|
|
883
|
+
let inputTokens = 0;
|
|
884
|
+
let outputTokens = 0;
|
|
885
|
+
|
|
886
|
+
switch (provider) {
|
|
887
|
+
case 'gemini':
|
|
888
|
+
inputTokens = data.usageMetadata?.promptTokenCount || 0;
|
|
889
|
+
outputTokens = data.usageMetadata?.candidatesTokenCount || 0;
|
|
890
|
+
break;
|
|
891
|
+
case 'openai':
|
|
892
|
+
inputTokens = data.usage?.prompt_tokens || 0;
|
|
893
|
+
outputTokens = data.usage?.completion_tokens || 0;
|
|
894
|
+
break;
|
|
895
|
+
case 'claude':
|
|
896
|
+
inputTokens = data.usage?.input_tokens || 0;
|
|
897
|
+
outputTokens = data.usage?.output_tokens || 0;
|
|
898
|
+
break;
|
|
899
|
+
case 'groq':
|
|
900
|
+
// Groq uses OpenAI-compatible API format
|
|
901
|
+
inputTokens = data.usage?.prompt_tokens || 0;
|
|
902
|
+
outputTokens = data.usage?.completion_tokens || 0;
|
|
903
|
+
break;
|
|
904
|
+
}
|
|
905
|
+
|
|
906
|
+
const inputCost = (inputTokens / 1_000_000) * this.providerConfig.pricing.input;
|
|
907
|
+
const outputCost = (outputTokens / 1_000_000) * this.providerConfig.pricing.output;
|
|
908
|
+
const totalCost = inputCost + outputCost;
|
|
909
|
+
|
|
910
|
+
return {
|
|
911
|
+
inputTokens,
|
|
912
|
+
outputTokens,
|
|
913
|
+
inputCost: inputCost.toFixed(6),
|
|
914
|
+
outputCost: outputCost.toFixed(6),
|
|
915
|
+
totalCost: totalCost.toFixed(6),
|
|
916
|
+
currency: 'USD'
|
|
917
|
+
};
|
|
918
|
+
}
|
|
919
|
+
}
|
|
920
|
+
|
|
921
|
+
/**
 * Validate screenshot (convenience function)
 *
 * Creates a one-shot judge instance configured from `context` and runs a
 * single screenshot evaluation with it.
 *
 * @param {string} imagePath - Path to the screenshot to evaluate
 * @param {string} prompt - Evaluation prompt
 * @param {object} [context={}] - Judge configuration, also forwarded to the evaluation
 * @returns {Promise<object>} Judgment result from the judge
 */
export async function validateScreenshot(imagePath, prompt, context = {}) {
  return new VLLMJudge(context).judgeScreenshot(imagePath, prompt, context);
}
|