@arclabs561/ai-visual-test 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93) hide show
  1. package/.secretsignore.example +20 -0
  2. package/CHANGELOG.md +360 -0
  3. package/CONTRIBUTING.md +63 -0
  4. package/DEPLOYMENT.md +80 -0
  5. package/LICENSE +22 -0
  6. package/README.md +142 -0
  7. package/SECURITY.md +108 -0
  8. package/api/health.js +34 -0
  9. package/api/validate.js +252 -0
  10. package/index.d.ts +1221 -0
  11. package/package.json +112 -0
  12. package/public/index.html +149 -0
  13. package/src/batch-optimizer.mjs +451 -0
  14. package/src/bias-detector.mjs +370 -0
  15. package/src/bias-mitigation.mjs +233 -0
  16. package/src/cache.mjs +433 -0
  17. package/src/config.mjs +268 -0
  18. package/src/constants.mjs +80 -0
  19. package/src/context-compressor.mjs +350 -0
  20. package/src/convenience.mjs +617 -0
  21. package/src/cost-tracker.mjs +257 -0
  22. package/src/cross-modal-consistency.mjs +170 -0
  23. package/src/data-extractor.mjs +232 -0
  24. package/src/dynamic-few-shot.mjs +140 -0
  25. package/src/dynamic-prompts.mjs +361 -0
  26. package/src/ensemble/index.mjs +53 -0
  27. package/src/ensemble-judge.mjs +366 -0
  28. package/src/error-handler.mjs +67 -0
  29. package/src/errors.mjs +167 -0
  30. package/src/experience-propagation.mjs +128 -0
  31. package/src/experience-tracer.mjs +487 -0
  32. package/src/explanation-manager.mjs +299 -0
  33. package/src/feedback-aggregator.mjs +248 -0
  34. package/src/game-goal-prompts.mjs +478 -0
  35. package/src/game-player.mjs +548 -0
  36. package/src/hallucination-detector.mjs +155 -0
  37. package/src/helpers/playwright.mjs +80 -0
  38. package/src/human-validation-manager.mjs +516 -0
  39. package/src/index.mjs +364 -0
  40. package/src/judge.mjs +929 -0
  41. package/src/latency-aware-batch-optimizer.mjs +192 -0
  42. package/src/load-env.mjs +159 -0
  43. package/src/logger.mjs +55 -0
  44. package/src/metrics.mjs +187 -0
  45. package/src/model-tier-selector.mjs +221 -0
  46. package/src/multi-modal/index.mjs +36 -0
  47. package/src/multi-modal-fusion.mjs +190 -0
  48. package/src/multi-modal.mjs +524 -0
  49. package/src/natural-language-specs.mjs +1071 -0
  50. package/src/pair-comparison.mjs +277 -0
  51. package/src/persona/index.mjs +42 -0
  52. package/src/persona-enhanced.mjs +200 -0
  53. package/src/persona-experience.mjs +572 -0
  54. package/src/position-counterbalance.mjs +140 -0
  55. package/src/prompt-composer.mjs +375 -0
  56. package/src/render-change-detector.mjs +583 -0
  57. package/src/research-enhanced-validation.mjs +436 -0
  58. package/src/retry.mjs +152 -0
  59. package/src/rubrics.mjs +231 -0
  60. package/src/score-tracker.mjs +277 -0
  61. package/src/smart-validator.mjs +447 -0
  62. package/src/spec-config.mjs +106 -0
  63. package/src/spec-templates.mjs +347 -0
  64. package/src/specs/index.mjs +38 -0
  65. package/src/temporal/index.mjs +102 -0
  66. package/src/temporal-adaptive.mjs +163 -0
  67. package/src/temporal-batch-optimizer.mjs +222 -0
  68. package/src/temporal-constants.mjs +69 -0
  69. package/src/temporal-context.mjs +49 -0
  70. package/src/temporal-decision-manager.mjs +271 -0
  71. package/src/temporal-decision.mjs +669 -0
  72. package/src/temporal-errors.mjs +58 -0
  73. package/src/temporal-note-pruner.mjs +173 -0
  74. package/src/temporal-preprocessor.mjs +543 -0
  75. package/src/temporal-prompt-formatter.mjs +219 -0
  76. package/src/temporal-validation.mjs +159 -0
  77. package/src/temporal.mjs +415 -0
  78. package/src/type-guards.mjs +311 -0
  79. package/src/uncertainty-reducer.mjs +470 -0
  80. package/src/utils/index.mjs +175 -0
  81. package/src/validation-framework.mjs +321 -0
  82. package/src/validation-result-normalizer.mjs +64 -0
  83. package/src/validation.mjs +243 -0
  84. package/src/validators/accessibility-programmatic.mjs +345 -0
  85. package/src/validators/accessibility-validator.mjs +223 -0
  86. package/src/validators/batch-validator.mjs +143 -0
  87. package/src/validators/hybrid-validator.mjs +268 -0
  88. package/src/validators/index.mjs +34 -0
  89. package/src/validators/prompt-builder.mjs +218 -0
  90. package/src/validators/rubric.mjs +85 -0
  91. package/src/validators/state-programmatic.mjs +260 -0
  92. package/src/validators/state-validator.mjs +291 -0
  93. package/vercel.json +27 -0
package/src/judge.mjs ADDED
@@ -0,0 +1,929 @@
1
+ /**
2
+ * VLLM Judge
3
+ *
4
+ * Core screenshot validation using Vision Language Models.
5
+ * Supports multiple providers (Gemini, OpenAI, Claude, Groq).
6
+ *
7
+ * GROQ INTEGRATION:
8
+ * - Groq uses OpenAI-compatible API (routes to callOpenAIAPI)
9
+ * - ~0.22s latency (10x faster than typical providers)
10
+ * - Best for high-frequency decisions (10-60Hz temporal decisions)
11
+ *
12
+ * NOTE: Groq should also be added to @arclabs561/llm-utils for text-only LLM calls.
13
+ * This package handles VLLM (vision) calls; llm-utils handles text-only calls.
14
+ */
15
+
16
+ import { readFileSync, writeFileSync, existsSync } from 'fs';
17
+ import { join, dirname } from 'path';
18
+ import { fileURLToPath } from 'url';
19
+ import { createConfig, getConfig } from './config.mjs';
20
+ import { getCached, setCached } from './cache.mjs';
21
+ import { FileError, ProviderError, TimeoutError } from './errors.mjs';
22
+ import { log, warn } from './logger.mjs';
23
+ import { retryWithBackoff, enhanceErrorMessage } from './retry.mjs';
24
+ import { recordCost } from './cost-tracker.mjs';
25
+ import { normalizeValidationResult } from './validation-result-normalizer.mjs';
26
+
27
+ const __filename = fileURLToPath(import.meta.url);
28
+ const __dirname = dirname(__filename);
29
+
30
+ /**
31
+ * VLLM Judge Class
32
+ *
33
+ * Handles screenshot validation using Vision Language Models.
34
+ */
35
+ export class VLLMJudge {
36
+ constructor(options = {}) {
37
+ this.config = createConfig(options);
38
+ this.provider = this.config.provider;
39
+ this.apiKey = this.config.apiKey;
40
+ this.providerConfig = this.config.providerConfig;
41
+ this.enabled = this.config.enabled;
42
+ this._cacheInitialized = false;
43
+ }
44
+
45
+ /**
46
+ * Initialize cache (lazy initialization)
47
+ */
48
+ async _initCache() {
49
+ if (this._cacheInitialized || !this.config.cache.enabled) return;
50
+
51
+ if (this.config.cache.dir) {
52
+ const { initCache } = await import('./cache.mjs');
53
+ initCache(this.config.cache.dir);
54
+ }
55
+ this._cacheInitialized = true;
56
+ }
57
+
58
+ /**
59
+ * Convert image to base64 for API
60
+ */
61
+ imageToBase64(imagePath) {
62
+ if (!existsSync(imagePath)) {
63
+ throw new FileError(`Screenshot not found: ${imagePath}`, imagePath);
64
+ }
65
+ try {
66
+ const imageBuffer = readFileSync(imagePath);
67
+ return imageBuffer.toString('base64');
68
+ } catch (error) {
69
+ throw new FileError(`Failed to read screenshot: ${error.message}`, imagePath, { originalError: error.message });
70
+ }
71
+ }
72
+
73
+ /**
74
+ * Judge screenshot using VLLM API
75
+ *
76
+ * @param {string | string[]} imagePath - Single image path or array of image paths for comparison
77
+ * @param {string} prompt - Evaluation prompt
78
+ * @param {import('./index.mjs').ValidationContext} [context={}] - Validation context
79
+ * @returns {Promise<import('./index.mjs').ValidationResult>} Validation result
80
+ */
81
+ async judgeScreenshot(imagePath, prompt, context = {}) {
82
+ // Support both single image and multi-image (for pair comparison)
83
+ const imagePaths = Array.isArray(imagePath) ? imagePath : [imagePath];
84
+ const isMultiImage = imagePaths.length > 1;
85
+ if (!this.enabled) {
86
+ // Return normalized disabled result
87
+ return normalizeValidationResult({
88
+ enabled: false,
89
+ provider: this.provider,
90
+ message: `API validation disabled (set ${this.provider.toUpperCase()}_API_KEY or API_KEY)`,
91
+ pricing: this.providerConfig.pricing,
92
+ score: null,
93
+ issues: [],
94
+ reasoning: 'API validation is disabled',
95
+ assessment: null
96
+ }, 'judgeScreenshot-disabled');
97
+ }
98
+
99
+ // Initialize cache if needed
100
+ await this._initCache();
101
+
102
+ // Check cache first (if caching enabled)
103
+ const useCache = context.useCache !== false && this.config.cache.enabled;
104
+ if (useCache) {
105
+ const cacheKey = isMultiImage ? imagePaths.join('|') : imagePath;
106
+ const cached = getCached(cacheKey, prompt, context);
107
+ if (cached) {
108
+ if (this.config.debug.verbose) {
109
+ log(`[VLLM] Cache hit for ${cacheKey}`);
110
+ }
111
+ return { ...cached, cached: true };
112
+ }
113
+ }
114
+
115
+ const startTime = Date.now();
116
+ const timeout = context.timeout || this.config.performance.timeout;
117
+ const abortController = new AbortController();
118
+ const timeoutId = setTimeout(() => abortController.abort(), timeout);
119
+
120
+ let response;
121
+ let data;
122
+ let judgment = null;
123
+ let error = null;
124
+ let attempts = 0;
125
+
126
+ try {
127
+ // Convert all images to base64
128
+ const base64Images = imagePaths.map(path => this.imageToBase64(path));
129
+ const fullPrompt = await this.buildPrompt(prompt, context, isMultiImage);
130
+
131
+ // Retry API calls with exponential backoff
132
+ const maxRetries = context.maxRetries ?? 3;
133
+ const apiResult = await retryWithBackoff(async () => {
134
+ attempts++;
135
+ let apiResponse;
136
+ let apiData;
137
+ let logprobs = null; // Declare once outside switch
138
+
139
+ // Route to appropriate API based on provider
140
+ switch (this.provider) {
141
+ case 'gemini':
142
+ apiResponse = await this.callGeminiAPI(base64Images, fullPrompt, abortController.signal, isMultiImage);
143
+ clearTimeout(timeoutId);
144
+ apiData = await apiResponse.json();
145
+
146
+ if (apiData.error) {
147
+ const statusCode = apiResponse.status;
148
+ throw new ProviderError(
149
+ `Gemini API error: ${apiData.error.message}`,
150
+ 'gemini',
151
+ {
152
+ apiError: apiData.error,
153
+ statusCode,
154
+ retryable: statusCode === 429 || statusCode >= 500
155
+ }
156
+ );
157
+ }
158
+
159
+ // Extract logprobs if available (for uncertainty estimation)
160
+ logprobs = apiData.candidates?.[0]?.content?.parts?.[0]?.logprobs || null;
161
+
162
+ return {
163
+ judgment: apiData.candidates?.[0]?.content?.parts?.[0]?.text || 'No response',
164
+ data: apiData,
165
+ logprobs
166
+ };
167
+
168
+ case 'openai':
169
+ apiResponse = await this.callOpenAIAPI(base64Images, fullPrompt, abortController.signal, isMultiImage);
170
+ clearTimeout(timeoutId);
171
+ apiData = await apiResponse.json();
172
+
173
+ if (apiData.error) {
174
+ const statusCode = apiResponse.status;
175
+ throw new ProviderError(
176
+ `OpenAI API error: ${apiData.error.message}`,
177
+ 'openai',
178
+ {
179
+ apiError: apiData.error,
180
+ statusCode,
181
+ retryable: statusCode === 429 || statusCode >= 500
182
+ }
183
+ );
184
+ }
185
+
186
+ // Extract logprobs if available (OpenAI provides logprobs when requested)
187
+ logprobs = apiData.choices?.[0]?.logprobs || null;
188
+
189
+ return {
190
+ judgment: apiData.choices?.[0]?.message?.content || 'No response',
191
+ data: apiData,
192
+ logprobs
193
+ };
194
+
195
+ case 'claude':
196
+ apiResponse = await this.callClaudeAPI(base64Images, fullPrompt, abortController.signal, isMultiImage);
197
+ clearTimeout(timeoutId);
198
+ apiData = await apiResponse.json();
199
+
200
+ if (apiData.error) {
201
+ const statusCode = apiResponse.status;
202
+ throw new ProviderError(
203
+ `Claude API error: ${apiData.error.message || 'Unknown error'}`,
204
+ 'claude',
205
+ {
206
+ apiError: apiData.error,
207
+ statusCode,
208
+ retryable: statusCode === 429 || statusCode >= 500
209
+ }
210
+ );
211
+ }
212
+
213
+ // Claude doesn't provide logprobs in standard API
214
+ logprobs = null;
215
+
216
+ return {
217
+ judgment: apiData.content?.[0]?.text || 'No response',
218
+ data: apiData,
219
+ logprobs
220
+ };
221
+
222
+ case 'groq':
223
+ // Groq uses OpenAI-compatible API, so we can reuse callOpenAIAPI
224
+ // Groq's endpoint is already set in providerConfig.apiUrl (https://api.groq.com/openai/v1)
225
+ apiResponse = await this.callOpenAIAPI(base64Images, fullPrompt, abortController.signal, isMultiImage);
226
+ clearTimeout(timeoutId);
227
+ apiData = await apiResponse.json();
228
+
229
+ if (apiData.error) {
230
+ const statusCode = apiResponse.status;
231
+ throw new ProviderError(
232
+ `Groq API error: ${apiData.error.message || 'Unknown error'}`,
233
+ 'groq',
234
+ {
235
+ apiError: apiData.error,
236
+ statusCode,
237
+ retryable: statusCode === 429 || statusCode >= 500
238
+ }
239
+ );
240
+ }
241
+
242
+ // Groq may provide logprobs (OpenAI-compatible, but check availability)
243
+ logprobs = apiData.choices?.[0]?.logprobs || null;
244
+
245
+ return {
246
+ judgment: apiData.choices?.[0]?.message?.content || 'No response',
247
+ data: apiData,
248
+ logprobs
249
+ };
250
+
251
+ default:
252
+ throw new ProviderError(`Unknown provider: ${this.provider}`, this.provider);
253
+ }
254
+ }, {
255
+ maxRetries,
256
+ baseDelay: 1000,
257
+ maxDelay: 30000,
258
+ onRetry: (err, attempt, delay) => {
259
+ if (this.config.debug.verbose) {
260
+ warn(`[VLLM] Retry ${attempt}/${maxRetries} for ${this.provider} API: ${err.message} (waiting ${delay}ms)`);
261
+ }
262
+ }
263
+ });
264
+
265
+ judgment = apiResult.judgment;
266
+ data = apiResult.data;
267
+ const logprobs = apiResult.logprobs || null;
268
+
269
+ const responseTime = Date.now() - startTime;
270
+ const semanticInfo = this.extractSemanticInfo(judgment);
271
+
272
+ // Enhance with uncertainty reduction (if enabled)
273
+ let uncertainty = null;
274
+ let confidence = null;
275
+ let selfConsistencyRecommended = false;
276
+ let selfConsistencyN = 0;
277
+ let selfConsistencyReason = '';
278
+
279
+ if (context.enableUncertaintyReduction !== false) {
280
+ try {
281
+ const { enhanceWithUncertainty } = await import('./uncertainty-reducer.mjs');
282
+ // Pass context and partial result for adaptive self-consistency decision
283
+ const enhanced = enhanceWithUncertainty({
284
+ judgment,
285
+ logprobs,
286
+ attempts,
287
+ screenshotPath: imagePath,
288
+ score: semanticInfo.score,
289
+ issues: semanticInfo.issues || []
290
+ }, {
291
+ enableHallucinationCheck: context.enableHallucinationCheck !== false,
292
+ adaptiveSelfConsistency: context.adaptiveSelfConsistency !== false
293
+ }, context);
294
+ uncertainty = enhanced.uncertainty;
295
+ confidence = enhanced.confidence;
296
+ // Extract self-consistency recommendation (for future use or logging)
297
+ selfConsistencyRecommended = enhanced.selfConsistencyRecommended || false;
298
+ selfConsistencyN = enhanced.selfConsistencyN || 0;
299
+ selfConsistencyReason = enhanced.selfConsistencyReason || '';
300
+ } catch (error) {
301
+ // Silently fail - uncertainty reduction is optional
302
+ if (this.config.debug.verbose) {
303
+ warn(`[VLLM] Uncertainty reduction failed: ${error.message}`);
304
+ }
305
+ }
306
+ }
307
+
308
+ // Estimate cost (data might not be available if retry succeeded)
309
+ const estimatedCost = data ? this.estimateCost(data, this.provider) : null;
310
+
311
+ // Record cost for tracking
312
+ if (estimatedCost && estimatedCost.totalCost) {
313
+ try {
314
+ recordCost({
315
+ provider: this.provider,
316
+ cost: estimatedCost.totalCost,
317
+ inputTokens: estimatedCost.inputTokens || 0,
318
+ outputTokens: estimatedCost.outputTokens || 0,
319
+ testName: context.testType || context.step || 'unknown'
320
+ });
321
+ } catch {
322
+ // Silently fail if cost tracking unavailable
323
+ }
324
+ }
325
+
326
+ const validationResult = {
327
+ enabled: true,
328
+ provider: this.provider,
329
+ judgment,
330
+ score: semanticInfo.score,
331
+ issues: semanticInfo.issues,
332
+ assessment: semanticInfo.assessment,
333
+ reasoning: semanticInfo.reasoning,
334
+ pricing: this.providerConfig.pricing,
335
+ estimatedCost,
336
+ responseTime,
337
+ timestamp: new Date().toISOString(),
338
+ testName: context.testType || context.step || 'unknown',
339
+ viewport: context.viewport || null,
340
+ raw: data || null,
341
+ semantic: semanticInfo,
342
+ attempts: attempts || 1,
343
+ logprobs, // Include logprobs for uncertainty estimation (if available)
344
+ uncertainty, // Uncertainty estimate (0-1, higher = more uncertain)
345
+ confidence, // Confidence estimate (0-1, higher = more confident)
346
+ screenshotPath: imagePath, // Include for human validation
347
+ // Self-consistency recommendation (based on uncertainty × payout analysis)
348
+ selfConsistencyRecommended, // Whether self-consistency is recommended for this validation
349
+ selfConsistencyN, // Recommended number of self-consistency calls (0 = not recommended)
350
+ selfConsistencyReason // Reason for recommendation (for logging/debugging)
351
+ };
352
+
353
+ // Collect VLLM judgment for human validation (non-blocking)
354
+ if (context.enableHumanValidation !== false) {
355
+ try {
356
+ const { getHumanValidationManager } = await import('./human-validation-manager.mjs');
357
+ const manager = getHumanValidationManager();
358
+ if (manager && manager.enabled) {
359
+ // Non-blocking: Don't wait for human validation collection
360
+ manager.collectVLLMJudgment(validationResult, imagePath, prompt, context)
361
+ .catch(err => {
362
+ // Silently fail - human validation is optional
363
+ if (this.config.debug.verbose) {
364
+ warn('[VLLM] Human validation collection failed:', err.message);
365
+ }
366
+ });
367
+ }
368
+ } catch (err) {
369
+ // Silently fail if human validation manager not available
370
+ }
371
+ }
372
+
373
+ // Apply calibration if available (non-blocking check)
374
+ if (context.applyCalibration !== false && validationResult.score !== null) {
375
+ try {
376
+ const { getHumanValidationManager } = await import('./human-validation-manager.mjs');
377
+ const manager = getHumanValidationManager();
378
+ if (manager && manager.enabled) {
379
+ const calibratedScore = manager.applyCalibration(validationResult.score);
380
+ if (calibratedScore !== validationResult.score) {
381
+ validationResult.originalScore = validationResult.score;
382
+ validationResult.score = calibratedScore;
383
+ validationResult.calibrated = true;
384
+ }
385
+ }
386
+ } catch (err) {
387
+ // Silently fail if calibration not available
388
+ }
389
+ }
390
+
391
+ // Cache result (use first image path for single image, or combined key for multi-image)
392
+ if (useCache) {
393
+ const cacheKey = isMultiImage ? imagePaths.join('|') : imagePath;
394
+ setCached(cacheKey, prompt, context, validationResult);
395
+ }
396
+
397
+ // Normalize result structure before returning (ensures consistent structure)
398
+ return normalizeValidationResult(validationResult, 'judgeScreenshot');
399
+ } catch (err) {
400
+ clearTimeout(timeoutId);
401
+ error = err;
402
+
403
+ // Handle timeout errors specifically
404
+ if (error.name === 'AbortError' || error.message?.includes('timeout') || error.message?.includes('aborted')) {
405
+ const enhancedMessage = enhanceErrorMessage(
406
+ new TimeoutError(`VLLM API call timed out after ${timeout}ms`, timeout),
407
+ attempts || 1,
408
+ 'judgeScreenshot'
409
+ );
410
+ throw new TimeoutError(enhancedMessage, timeout, {
411
+ provider: this.provider,
412
+ imagePath,
413
+ attempts: attempts || 1
414
+ });
415
+ }
416
+
417
+ // Re-throw ProviderError with enhanced context
418
+ if (error instanceof ProviderError) {
419
+ const enhancedMessage = enhanceErrorMessage(error, attempts || 1, 'judgeScreenshot');
420
+ throw new ProviderError(enhancedMessage, this.provider, {
421
+ ...error.details,
422
+ imagePath,
423
+ prompt: prompt.substring(0, 100),
424
+ attempts: attempts || 1
425
+ });
426
+ }
427
+
428
+ // Re-throw FileError and TimeoutError as-is (already have context)
429
+ if (error instanceof FileError || error instanceof TimeoutError) {
430
+ throw error;
431
+ }
432
+
433
+ // For other errors, enhance message and throw (consistent error handling)
434
+ const enhancedMessage = enhanceErrorMessage(error, attempts || 1, 'judgeScreenshot');
435
+ throw new ProviderError(
436
+ `VLLM API call failed: ${enhancedMessage}`,
437
+ this.provider,
438
+ {
439
+ imagePath,
440
+ prompt: prompt.substring(0, 100),
441
+ attempts: attempts || 1,
442
+ originalError: error.message
443
+ }
444
+ );
445
+ }
446
+ }
447
+
448
+ /**
449
+ * Build prompt for screenshot validation
450
+ *
451
+ * Uses unified prompt composition system for research-backed consistency.
452
+ * Research: Explicit rubrics improve reliability by 10-20% (arXiv:2412.05579)
453
+ *
454
+ * Supports variable goals: if context.goal is provided, it will be used to generate
455
+ * the base prompt before composition. This allows seamless integration of variable
456
+ * goals throughout the system.
457
+ *
458
+ * @param {string} prompt - Base prompt (or ignored if context.goal is provided)
459
+ * @param {import('./index.mjs').ValidationContext} context - Validation context
460
+ * @param {boolean} [isMultiImage=false] - Whether this is a multi-image comparison
461
+ * @returns {string} Full prompt with context
462
+ */
463
+ async buildPrompt(prompt, context = {}, isMultiImage = false) {
464
+ // If custom prompt builder provided, use it
465
+ if (context.promptBuilder && typeof context.promptBuilder === 'function') {
466
+ return context.promptBuilder(prompt, context);
467
+ }
468
+
469
+ // Use unified prompt composition system (which handles variable goals)
470
+ // Pass goal in context - composeSingleImagePrompt/composeComparisonPrompt will handle it
471
+ try {
472
+ if (isMultiImage) {
473
+ return await composeComparisonPrompt(prompt, context, {
474
+ includeRubric: context.includeRubric !== false // Default true (research-backed)
475
+ });
476
+ } else {
477
+ return await composeSingleImagePrompt(prompt, context, {
478
+ includeRubric: context.includeRubric !== false, // Default true (research-backed)
479
+ temporalNotes: context.temporalNotes || null
480
+ });
481
+ }
482
+ } catch (error) {
483
+ // Fallback to basic prompt building if composition fails
484
+ if (this.config.debug.verbose) {
485
+ warn(`[VLLM] Prompt composition failed, using fallback: ${error.message}`);
486
+ }
487
+
488
+ // Basic fallback (original implementation)
489
+ let fullPrompt = prompt;
490
+ const contextParts = [];
491
+ if (context.testType) {
492
+ contextParts.push(`Test Type: ${context.testType}`);
493
+ }
494
+ if (context.viewport) {
495
+ contextParts.push(`Viewport: ${context.viewport.width}x${context.viewport.height}`);
496
+ }
497
+ if (context.gameState) {
498
+ contextParts.push(`Game State: ${JSON.stringify(context.gameState)}`);
499
+ }
500
+ if (contextParts.length > 0) {
501
+ fullPrompt = `${prompt}\n\nContext:\n${contextParts.join('\n')}`;
502
+ }
503
+ if (isMultiImage) {
504
+ fullPrompt = `${fullPrompt}\n\nYou are comparing two screenshots side-by-side. Return JSON with:
505
+ {
506
+ "winner": "A" | "B" | "tie",
507
+ "confidence": 0.0-1.0,
508
+ "reasoning": "explanation",
509
+ "differences": ["difference1", "difference2"],
510
+ "scores": {"A": 0-10, "B": 0-10}
511
+ }`;
512
+ }
513
+ return fullPrompt;
514
+ }
515
+ }
516
+
517
+ /**
518
+ * Extract semantic information from judgment text
519
+ */
520
+ extractSemanticInfo(judgment) {
521
+ // Handle case where judgment is already an object
522
+ if (typeof judgment === 'object' && judgment !== null && !Array.isArray(judgment)) {
523
+ // Normalize issues: handle both array of strings and array of objects
524
+ let issues = judgment.issues || [];
525
+ if (issues.length > 0 && typeof issues[0] === 'string') {
526
+ // Convert string array to object array for consistency
527
+ issues = issues.map(desc => ({
528
+ description: desc,
529
+ importance: 'medium',
530
+ annoyance: 'medium',
531
+ impact: 'minor-inconvenience'
532
+ }));
533
+ }
534
+
535
+ // Normalize recommendations: handle both array of strings and array of objects
536
+ let recommendations = judgment.recommendations || [];
537
+ if (recommendations.length > 0 && typeof recommendations[0] === 'string') {
538
+ recommendations = recommendations.map(suggestion => ({
539
+ priority: 'medium',
540
+ suggestion,
541
+ expectedImpact: 'improved user experience'
542
+ }));
543
+ }
544
+
545
+ return {
546
+ score: judgment.score || null,
547
+ issues: issues,
548
+ assessment: judgment.assessment || null,
549
+ reasoning: judgment.reasoning || null,
550
+ strengths: judgment.strengths || [],
551
+ recommendations: recommendations,
552
+ evidence: judgment.evidence || null,
553
+ brutalistViolations: judgment.brutalistViolations || [],
554
+ zeroToleranceViolations: judgment.zeroToleranceViolations || []
555
+ };
556
+ }
557
+
558
+ // Handle case where judgment is a string
559
+ const judgmentText = typeof judgment === 'string' ? judgment : String(judgment || '');
560
+
561
+ try {
562
+ const jsonMatch = judgmentText.match(/\{[\s\S]*\}/);
563
+ if (jsonMatch) {
564
+ const parsed = JSON.parse(jsonMatch[0]);
565
+ // Normalize issues and recommendations
566
+ let issues = parsed.issues || [];
567
+ if (issues.length > 0 && typeof issues[0] === 'string') {
568
+ issues = issues.map(desc => ({
569
+ description: desc,
570
+ importance: 'medium',
571
+ annoyance: 'medium',
572
+ impact: 'minor-inconvenience'
573
+ }));
574
+ }
575
+
576
+ let recommendations = parsed.recommendations || [];
577
+ if (recommendations.length > 0 && typeof recommendations[0] === 'string') {
578
+ recommendations = recommendations.map(suggestion => ({
579
+ priority: 'medium',
580
+ suggestion,
581
+ expectedImpact: 'improved user experience'
582
+ }));
583
+ }
584
+
585
+ return {
586
+ score: parsed.score || null,
587
+ issues: issues,
588
+ assessment: parsed.assessment || null,
589
+ reasoning: parsed.reasoning || null,
590
+ strengths: parsed.strengths || [],
591
+ recommendations: recommendations,
592
+ evidence: parsed.evidence || null,
593
+ brutalistViolations: parsed.brutalistViolations || [],
594
+ zeroToleranceViolations: parsed.zeroToleranceViolations || []
595
+ };
596
+ }
597
+ } catch (e) {
598
+ // Fall through to regex extraction
599
+ }
600
+
601
+ // Fallback: extract basic info from text
602
+ // Try to extract score from the full judgment text (including reasoning)
603
+ const extractedScore = this.extractScore(judgmentText);
604
+
605
+ return {
606
+ score: extractedScore,
607
+ issues: this.extractIssues(judgmentText),
608
+ assessment: this.extractAssessment(judgmentText),
609
+ reasoning: judgmentText.substring(0, 500)
610
+ };
611
+ }
612
+
613
+ /**
614
+ * Extract score from judgment text
615
+ */
616
+ extractScore(judgment) {
617
+ if (!judgment || typeof judgment !== 'string') return null;
618
+
619
+ const patterns = [
620
+ // JSON format: "score": 7
621
+ /"score"\s*:\s*(\d+)/i,
622
+ // Text format: Score: 7 or Score 7
623
+ /score[:\s]*(\d+)/i,
624
+ // Fraction format: score: 7/10 or 7/10
625
+ /score[:\s]*(\d+)\s*\/\s*10/i,
626
+ /(\d+)\s*\/\s*10/i,
627
+ // Rating format: Rating: 7, Rated 7
628
+ /rating[:\s]*(\d+)/i,
629
+ /rated[:\s]*(\d+)/i,
630
+ // Verdict format: Verdict: PASS (7/10) or Verdict: FAIL (3/10)
631
+ /verdict[:\s]*(?:pass|fail)[:\s]*\((\d+)\s*\/\s*10\)/i,
632
+ // Markdown format: **Score**: 7 or ## Score: 7
633
+ /\*\*score\*\*[:\s]*(\d+)/i,
634
+ /##\s*score[:\s]*(\d+)/i,
635
+ // Structured text: "Overall Score: 7 out of 10"
636
+ /overall\s*score[:\s]*(\d+)\s*(?:out\s*of|\/)\s*10/i,
637
+ // Standalone number at start (common when API returns just "10" or "9" as reasoning)
638
+ // Match: "10", "10.", "10 ", "10\n", etc.
639
+ /^\s*(\d{1,2})(?:\s|\.|$)/,
640
+ // Number followed by common words (e.g., "10 out of 10", "9/10")
641
+ /^(\d{1,2})\s*(?:out\s*of|\/)\s*10/i,
642
+ // "Rate from 1-10" response patterns
643
+ /rate[:\s]*(\d{1,2})\s*(?:out\s*of|\/)?\s*10/i,
644
+ // Very simple: just a number 0-10 at the start (for cases like "10" with nothing else)
645
+ /^(\d{1,2})$/
646
+ ];
647
+
648
+ for (const pattern of patterns) {
649
+ const match = judgment.match(pattern);
650
+ if (match) {
651
+ const score = parseInt(match[1]);
652
+ if (score >= 0 && score <= 10) {
653
+ return score;
654
+ }
655
+ }
656
+ }
657
+
658
+ // Try to infer from verdict language
659
+ const lower = judgment.toLowerCase();
660
+ if (lower.includes('excellent') || lower.includes('outstanding')) {
661
+ return 9;
662
+ }
663
+ if (lower.includes('very good') || lower.includes('great')) {
664
+ return 8;
665
+ }
666
+ if (lower.includes('good') || lower.includes('satisfactory')) {
667
+ return 7;
668
+ }
669
+ if (lower.includes('fair') || lower.includes('adequate')) {
670
+ return 6;
671
+ }
672
+ if (lower.includes('poor') || lower.includes('needs improvement')) {
673
+ return 4;
674
+ }
675
+ if (lower.includes('fail') && !lower.includes('pass')) {
676
+ return 3;
677
+ }
678
+
679
+ return null;
680
+ }
681
+
682
+ /**
683
+ * Extract issues from judgment text
684
+ */
685
+ extractIssues(judgment) {
686
+ try {
687
+ const jsonMatch = judgment.match(/\{[\s\S]*\}/);
688
+ if (jsonMatch) {
689
+ const parsed = JSON.parse(jsonMatch[0]);
690
+ return parsed.issues || [];
691
+ }
692
+ } catch (e) {
693
+ // Fall through to regex
694
+ }
695
+
696
+ const issues = [];
697
+ const lines = judgment.split('\n');
698
+ for (const line of lines) {
699
+ if (line.match(/[-*]\s*(.+)/i) || line.match(/\d+\.\s*(.+)/i)) {
700
+ issues.push(line.replace(/[-*]\s*|\d+\.\s*/i, '').trim());
701
+ }
702
+ }
703
+
704
+ return issues;
705
+ }
706
+
707
+ /**
708
+ * Extract assessment from judgment text
709
+ */
710
+ extractAssessment(judgment) {
711
+ try {
712
+ const jsonMatch = judgment.match(/\{[\s\S]*\}/);
713
+ if (jsonMatch) {
714
+ const parsed = JSON.parse(jsonMatch[0]);
715
+ return parsed.assessment || null;
716
+ }
717
+ } catch (e) {
718
+ // Fall through to regex
719
+ }
720
+
721
+ const lower = judgment.toLowerCase();
722
+ if (lower.includes('pass') && !lower.includes('fail')) {
723
+ return 'pass';
724
+ }
725
+ if (lower.includes('fail')) {
726
+ return 'fail';
727
+ }
728
+
729
+ return null;
730
+ }
731
+
732
+ /**
733
+ * Call Google Gemini API
734
+ *
735
+ * @param {string | string[]} base64Images - Single image or array of images (base64)
736
+ * @param {string} prompt - Evaluation prompt
737
+ * @param {AbortSignal} signal - Abort signal for timeout
738
+ * @param {boolean} [isMultiImage=false] - Whether this is a multi-image request
739
+ * @returns {Promise<Response>} API response
740
+ */
741
+ async callGeminiAPI(base64Images, prompt, signal, isMultiImage = false) {
742
+ const images = Array.isArray(base64Images) ? base64Images : [base64Images];
743
+
744
+ // Build parts array: text prompt + all images
745
+ const parts = [{ text: prompt }];
746
+ for (const base64Image of images) {
747
+ parts.push({
748
+ inline_data: {
749
+ mime_type: 'image/png',
750
+ data: base64Image
751
+ }
752
+ });
753
+ }
754
+
755
+ // SECURITY: Use header for API key, not URL parameter
756
+ // API keys in URLs are exposed in logs, browser history, referrer headers
757
+ return fetch(
758
+ `${this.providerConfig.apiUrl}/models/${this.providerConfig.model}:generateContent`,
759
+ {
760
+ method: 'POST',
761
+ headers: {
762
+ 'Content-Type': 'application/json',
763
+ 'x-goog-api-key': this.apiKey // Use header instead of URL parameter
764
+ },
765
+ signal,
766
+ body: JSON.stringify({
767
+ contents: [{ parts }],
768
+ generationConfig: {
769
+ temperature: 0.1,
770
+ maxOutputTokens: 2000,
771
+ topP: 0.95,
772
+ topK: 40
773
+ }
774
+ })
775
+ }
776
+ );
777
+ }
778
+
779
+ /**
780
+ * Call OpenAI API
781
+ *
782
+ * @param {string | string[]} base64Images - Single image or array of images (base64)
783
+ * @param {string} prompt - Evaluation prompt
784
+ * @param {AbortSignal} signal - Abort signal for timeout
785
+ * @param {boolean} [isMultiImage=false] - Whether this is a multi-image request
786
+ * @returns {Promise<Response>} API response
787
+ */
788
+ async callOpenAIAPI(base64Images, prompt, signal, isMultiImage = false) {
789
+ const images = Array.isArray(base64Images) ? base64Images : [base64Images];
790
+
791
+ // Build content array: text prompt + all images
792
+ const content = [{ type: 'text', text: prompt }];
793
+ for (const base64Image of images) {
794
+ content.push({
795
+ type: 'image_url',
796
+ image_url: { url: `data:image/png;base64,${base64Image}` }
797
+ });
798
+ }
799
+
800
+ return fetch(`${this.providerConfig.apiUrl}/chat/completions`, {
801
+ method: 'POST',
802
+ headers: {
803
+ 'Content-Type': 'application/json',
804
+ 'Authorization': `Bearer ${this.apiKey}`
805
+ },
806
+ signal,
807
+ body: JSON.stringify({
808
+ model: this.providerConfig.model,
809
+ messages: [{
810
+ role: 'user',
811
+ content
812
+ }],
813
+ // Some OpenAI models have limited parameter support
814
+ // Models that only support default temperature (1): gpt-4o-mini, gpt-5
815
+ // Models that support custom temperature: gpt-4o, gpt-4-turbo, etc.
816
+ // Only include temperature if model supports custom values (omit for models that require default)
817
+ ...(this.providerConfig.model.includes('mini') || this.providerConfig.model.includes('gpt-5')
818
+ ? {} // Use default temperature (1) - don't specify for models that require it
819
+ : { temperature: 0.1, top_p: 0.95 } // Custom values for models that support them
820
+ ),
821
+ // Use max_completion_tokens for newer models (gpt-4o, gpt-5), max_tokens for older models
822
+ ...(this.providerConfig.model.startsWith('gpt-4o') || this.providerConfig.model.startsWith('gpt-5')
823
+ ? { max_completion_tokens: 2000 }
824
+ : { max_tokens: 2000 })
825
+ // Note: logprobs removed - not all OpenAI models support it (e.g., vision models)
826
+ // If needed, can be conditionally added based on model support
827
+ })
828
+ });
829
+ }
830
+
831
+ /**
832
+ * Call Anthropic Claude API
833
+ *
834
+ * @param {string | string[]} base64Images - Single image or array of images (base64)
835
+ * @param {string} prompt - Evaluation prompt
836
+ * @param {AbortSignal} signal - Abort signal for timeout
837
+ * @param {boolean} [isMultiImage=false] - Whether this is a multi-image request
838
+ * @returns {Promise<Response>} API response
839
+ */
840
+ async callClaudeAPI(base64Images, prompt, signal, isMultiImage = false) {
841
+ const images = Array.isArray(base64Images) ? base64Images : [base64Images];
842
+
843
+ // Build content array: text prompt + all images
844
+ const content = [{ type: 'text', text: prompt }];
845
+ for (const base64Image of images) {
846
+ content.push({
847
+ type: 'image',
848
+ source: {
849
+ type: 'base64',
850
+ media_type: 'image/png',
851
+ data: base64Image
852
+ }
853
+ });
854
+ }
855
+
856
+ return fetch(`${this.providerConfig.apiUrl}/messages`, {
857
+ method: 'POST',
858
+ headers: {
859
+ 'Content-Type': 'application/json',
860
+ 'x-api-key': this.apiKey,
861
+ 'anthropic-version': '2023-06-01'
862
+ },
863
+ signal,
864
+ body: JSON.stringify({
865
+ model: this.providerConfig.model,
866
+ max_tokens: 2000, // Increased for pair comparison
867
+ messages: [{
868
+ role: 'user',
869
+ content
870
+ }]
871
+ })
872
+ });
873
+ }
874
+
875
+ /**
876
+ * Estimate cost based on token usage
877
+ */
878
+ estimateCost(data, provider) {
879
+ if (!this.providerConfig.pricing || this.providerConfig.pricing.input === 0) {
880
+ return null; // Free or self-hosted
881
+ }
882
+
883
+ let inputTokens = 0;
884
+ let outputTokens = 0;
885
+
886
+ switch (provider) {
887
+ case 'gemini':
888
+ inputTokens = data.usageMetadata?.promptTokenCount || 0;
889
+ outputTokens = data.usageMetadata?.candidatesTokenCount || 0;
890
+ break;
891
+ case 'openai':
892
+ inputTokens = data.usage?.prompt_tokens || 0;
893
+ outputTokens = data.usage?.completion_tokens || 0;
894
+ break;
895
+ case 'claude':
896
+ inputTokens = data.usage?.input_tokens || 0;
897
+ outputTokens = data.usage?.output_tokens || 0;
898
+ break;
899
+ case 'groq':
900
+ // Groq uses OpenAI-compatible API format
901
+ inputTokens = data.usage?.prompt_tokens || 0;
902
+ outputTokens = data.usage?.completion_tokens || 0;
903
+ break;
904
+ }
905
+
906
+ const inputCost = (inputTokens / 1_000_000) * this.providerConfig.pricing.input;
907
+ const outputCost = (outputTokens / 1_000_000) * this.providerConfig.pricing.output;
908
+ const totalCost = inputCost + outputCost;
909
+
910
+ return {
911
+ inputTokens,
912
+ outputTokens,
913
+ inputCost: inputCost.toFixed(6),
914
+ outputCost: outputCost.toFixed(6),
915
+ totalCost: totalCost.toFixed(6),
916
+ currency: 'USD'
917
+ };
918
+ }
919
+ }
920
+
921
/**
 * Validate screenshot (convenience function)
 *
 * Creates a judge instance and validates a screenshot.
 *
 * @param {string} imagePath - Screenshot to evaluate
 * @param {string} prompt - Evaluation prompt
 * @param {object} [context={}] - Judge configuration, forwarded both to the
 *   VLLMJudge constructor and to judgeScreenshot
 * @returns {Promise<object>} Result of judgeScreenshot
 */
export async function validateScreenshot(imagePath, prompt, context = {}) {
  return new VLLMJudge(context).judgeScreenshot(imagePath, prompt, context);
}