@arclabs561/ai-visual-test 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. package/.secretsignore.example +20 -0
  2. package/CHANGELOG.md +360 -0
  3. package/CONTRIBUTING.md +63 -0
  4. package/DEPLOYMENT.md +80 -0
  5. package/LICENSE +22 -0
  6. package/README.md +142 -0
  7. package/SECURITY.md +108 -0
  8. package/api/health.js +34 -0
  9. package/api/validate.js +252 -0
  10. package/index.d.ts +1221 -0
  11. package/package.json +112 -0
  12. package/public/index.html +149 -0
  13. package/src/batch-optimizer.mjs +451 -0
  14. package/src/bias-detector.mjs +370 -0
  15. package/src/bias-mitigation.mjs +233 -0
  16. package/src/cache.mjs +433 -0
  17. package/src/config.mjs +268 -0
  18. package/src/constants.mjs +80 -0
  19. package/src/context-compressor.mjs +350 -0
  20. package/src/convenience.mjs +617 -0
  21. package/src/cost-tracker.mjs +257 -0
  22. package/src/cross-modal-consistency.mjs +170 -0
  23. package/src/data-extractor.mjs +232 -0
  24. package/src/dynamic-few-shot.mjs +140 -0
  25. package/src/dynamic-prompts.mjs +361 -0
  26. package/src/ensemble/index.mjs +53 -0
  27. package/src/ensemble-judge.mjs +366 -0
  28. package/src/error-handler.mjs +67 -0
  29. package/src/errors.mjs +167 -0
  30. package/src/experience-propagation.mjs +128 -0
  31. package/src/experience-tracer.mjs +487 -0
  32. package/src/explanation-manager.mjs +299 -0
  33. package/src/feedback-aggregator.mjs +248 -0
  34. package/src/game-goal-prompts.mjs +478 -0
  35. package/src/game-player.mjs +548 -0
  36. package/src/hallucination-detector.mjs +155 -0
  37. package/src/helpers/playwright.mjs +80 -0
  38. package/src/human-validation-manager.mjs +516 -0
  39. package/src/index.mjs +364 -0
  40. package/src/judge.mjs +929 -0
  41. package/src/latency-aware-batch-optimizer.mjs +192 -0
  42. package/src/load-env.mjs +159 -0
  43. package/src/logger.mjs +55 -0
  44. package/src/metrics.mjs +187 -0
  45. package/src/model-tier-selector.mjs +221 -0
  46. package/src/multi-modal/index.mjs +36 -0
  47. package/src/multi-modal-fusion.mjs +190 -0
  48. package/src/multi-modal.mjs +524 -0
  49. package/src/natural-language-specs.mjs +1071 -0
  50. package/src/pair-comparison.mjs +277 -0
  51. package/src/persona/index.mjs +42 -0
  52. package/src/persona-enhanced.mjs +200 -0
  53. package/src/persona-experience.mjs +572 -0
  54. package/src/position-counterbalance.mjs +140 -0
  55. package/src/prompt-composer.mjs +375 -0
  56. package/src/render-change-detector.mjs +583 -0
  57. package/src/research-enhanced-validation.mjs +436 -0
  58. package/src/retry.mjs +152 -0
  59. package/src/rubrics.mjs +231 -0
  60. package/src/score-tracker.mjs +277 -0
  61. package/src/smart-validator.mjs +447 -0
  62. package/src/spec-config.mjs +106 -0
  63. package/src/spec-templates.mjs +347 -0
  64. package/src/specs/index.mjs +38 -0
  65. package/src/temporal/index.mjs +102 -0
  66. package/src/temporal-adaptive.mjs +163 -0
  67. package/src/temporal-batch-optimizer.mjs +222 -0
  68. package/src/temporal-constants.mjs +69 -0
  69. package/src/temporal-context.mjs +49 -0
  70. package/src/temporal-decision-manager.mjs +271 -0
  71. package/src/temporal-decision.mjs +669 -0
  72. package/src/temporal-errors.mjs +58 -0
  73. package/src/temporal-note-pruner.mjs +173 -0
  74. package/src/temporal-preprocessor.mjs +543 -0
  75. package/src/temporal-prompt-formatter.mjs +219 -0
  76. package/src/temporal-validation.mjs +159 -0
  77. package/src/temporal.mjs +415 -0
  78. package/src/type-guards.mjs +311 -0
  79. package/src/uncertainty-reducer.mjs +470 -0
  80. package/src/utils/index.mjs +175 -0
  81. package/src/validation-framework.mjs +321 -0
  82. package/src/validation-result-normalizer.mjs +64 -0
  83. package/src/validation.mjs +243 -0
  84. package/src/validators/accessibility-programmatic.mjs +345 -0
  85. package/src/validators/accessibility-validator.mjs +223 -0
  86. package/src/validators/batch-validator.mjs +143 -0
  87. package/src/validators/hybrid-validator.mjs +268 -0
  88. package/src/validators/index.mjs +34 -0
  89. package/src/validators/prompt-builder.mjs +218 -0
  90. package/src/validators/rubric.mjs +85 -0
  91. package/src/validators/state-programmatic.mjs +260 -0
  92. package/src/validators/state-validator.mjs +291 -0
  93. package/vercel.json +27 -0
@@ -0,0 +1,451 @@
1
+ /**
2
+ * Batch Optimizer
3
+ *
4
+ * Optimizes VLLM API calls by:
5
+ * - Queueing requests for better throughput
6
+ * - Caching responses for identical screenshots
7
+ * - Implementing request pooling with concurrency limits
8
+ *
9
+ * General-purpose utility - no domain-specific logic.
10
+ *
11
+ * CACHE ARCHITECTURE NOTE:
12
+ * - This has its OWN in-memory cache (Map), separate from VLLM cache
13
+ * - Cache key generation fixed (2025-01): Now uses SHA-256 hash, no truncation
14
+ * - Purpose: Short-term caching during request batching (process lifetime only)
15
+ * - Why separate: Different lifetime (process vs 7 days), different purpose (batching optimization vs persistence),
16
+ * different failure domain (memory-only, no disk I/O), serves different lifecycle (request batching vs API responses)
17
+ * - No coordination with VLLM cache (by design - they serve different purposes with minimal data overlap)
18
+ * - No size limits or eviction (grows unbounded in long-running processes - acceptable for process-scoped cache)
19
+ * - See docs/CACHE_ARCHITECTURE_DEEP_DIVE.md for details
20
+ */
21
+
22
+ import { createHash } from 'crypto';
23
+
24
+ /**
25
+ * Batch Optimizer Class
26
+ *
27
+ * Optimizes VLLM API calls by queueing requests and caching responses.
28
+ *
29
+ * @class BatchOptimizer
30
+ */
31
+ import { API_CONSTANTS, BATCH_OPTIMIZER_CONSTANTS } from './constants.mjs';
32
+ import { TimeoutError } from './errors.mjs';
33
+ import { warn } from './logger.mjs';
34
+
35
export class BatchOptimizer {
  /**
   * @param {{
   *   maxConcurrency?: number;
   *   batchSize?: number;
   *   cacheEnabled?: boolean;
   *   maxQueueSize?: number;
   *   requestTimeout?: number;
   * }} [options={}] - Optimizer options
   */
  constructor(options = {}) {
    const {
      maxConcurrency = API_CONSTANTS.DEFAULT_MAX_CONCURRENCY,
      batchSize = 3,
      cacheEnabled = true,
      maxQueueSize = BATCH_OPTIMIZER_CONSTANTS.MAX_QUEUE_SIZE,
      requestTimeout = BATCH_OPTIMIZER_CONSTANTS.REQUEST_TIMEOUT_MS
    } = options;

    /** FIFO of pending queue entries (see _queueRequest). */
    this.queue = [];
    this.processing = false;
    // Process-lifetime in-memory cache; unbounded by design (see file header).
    this.cache = cacheEnabled ? new Map() : null;
    this.batchSize = batchSize;
    this.maxConcurrency = maxConcurrency;
    this.activeRequests = 0;
    this.maxQueueSize = maxQueueSize;
    this.requestTimeout = requestTimeout;

    // Metrics are initialized eagerly: _queueRequest updates them before
    // getPerformanceMetrics() is ever called.
    this.metrics = {
      queueRejections: 0,
      timeouts: 0,
      // All accepted requests (immediate + queued), including ones later
      // rejected for queue overflow or failed during processing.
      totalQueued: 0,
      // Completed attempts (successes AND failures), kept balanced with
      // totalQueued so the rates below stay meaningful.
      totalProcessed: 0,
      averageWaitTime: 0, // exact mean of the waitTimes window, in ms
      waitTimes: []       // sliding window of the last 100 wait times
    };
  }

  /**
   * Generate a collision-resistant cache key from the request identity.
   *
   * Uses a SHA-256 hash of the full (untruncated) imagePath/prompt/context,
   * consistent with the VLLM cache. Earlier versions truncated the prompt and
   * stripped whitespace, which caused wrong cache hits.
   *
   * @param {string} imagePath - Screenshot path
   * @param {string} prompt - Validation prompt
   * @param {object|null|undefined} context - Validation context
   * @returns {string} Hex-encoded SHA-256 digest
   */
  _getCacheKey(imagePath, prompt, context) {
    const keyData = {
      imagePath,
      prompt: prompt || '',
      context: context ? JSON.stringify(context) : ''
    };
    return createHash('sha256').update(JSON.stringify(keyData)).digest('hex');
  }

  /**
   * Record one completed request in the metrics: bump totalProcessed, push
   * the wait time into the 100-entry sliding window, and recompute the mean.
   *
   * FIX: the previous incremental running-average formula was incorrect once
   * old samples were shifted out of the window; the exact windowed mean is
   * computed instead (the window is capped at 100 entries, so this is cheap).
   *
   * @param {number} startTime - Date.now() when the request was accepted
   */
  _recordCompletion(startTime) {
    this.metrics.totalProcessed++;
    const waitTime = Date.now() - startTime;
    this.metrics.waitTimes.push(waitTime);
    if (this.metrics.waitTimes.length > 100) {
      this.metrics.waitTimes.shift();
    }
    const sum = this.metrics.waitTimes.reduce((acc, t) => acc + t, 0);
    this.metrics.averageWaitTime = sum / this.metrics.waitTimes.length;
  }

  /**
   * Batch validate multiple screenshots.
   *
   * @param {string | string[]} imagePaths - Single image path or array of image paths
   * @param {string} prompt - Validation prompt
   * @param {import('./index.mjs').ValidationContext} [context={}] - Validation context
   * @param {Function|null} [validateFn=null] - Optional validator override; defaults
   *   to validateScreenshot from ./judge.mjs (backward-compatible addition)
   * @returns {Promise<import('./index.mjs').ValidationResult[]>} Array of validation results
   */
  async batchValidate(imagePaths, prompt, context = {}, validateFn = null) {
    if (!Array.isArray(imagePaths)) {
      imagePaths = [imagePaths];
    }

    if (imagePaths.length === 0) {
      return [];
    }

    // Process all screenshots in parallel; _queueRequest enforces the
    // concurrency limit. Promise.all preserves input order.
    return Promise.all(
      imagePaths.map((path) => this._queueRequest(path, prompt, context, validateFn))
    );
  }

  /**
   * Queue a VLLM request for batch processing.
   *
   * Fast paths: cache hit returns immediately (not counted in queue metrics);
   * a free concurrency slot processes immediately. Otherwise the request is
   * queued with a timeout, and the queue size is capped to prevent unbounded
   * memory growth.
   *
   * @param {string} imagePath - Screenshot path
   * @param {string} prompt - Validation prompt
   * @param {object} context - Validation context
   * @param {Function|null} [validateFn=null] - Validator override
   * @returns {Promise<*>} Validation result
   * @throws {TimeoutError} When the queue is full or the request times out in queue
   */
  async _queueRequest(imagePath, prompt, context, validateFn = null) {
    // Check cache first.
    if (this.cache) {
      const cacheKey = this._getCacheKey(imagePath, prompt, context);
      if (this.cache.has(cacheKey)) {
        return this.cache.get(cacheKey);
      }
    }

    // Immediate path: a concurrency slot is free.
    if (this.activeRequests < this.maxConcurrency) {
      this.metrics.totalQueued++;
      const startTime = Date.now();
      try {
        const result = await this._processRequest(imagePath, prompt, context, validateFn);
        this._recordCompletion(startTime);
        return result;
      } catch (error) {
        // Count failed attempts too, so totalQueued and totalProcessed stay
        // balanced even when a request throws.
        this.metrics.totalProcessed++;
        throw error;
      }
    }

    // Count the request BEFORE the overflow check so rejected requests are
    // included in totalQueued (the rejection-rate denominator).
    this.metrics.totalQueued++;
    const queueStartTime = Date.now();

    // Queue size limit prevents memory leaks from unbounded queue growth.
    if (this.queue.length >= this.maxQueueSize) {
      this.metrics.queueRejections++;
      warn(`[BatchOptimizer] Queue is full (${this.queue.length}/${this.maxQueueSize}). Rejecting request to prevent memory leak. Total rejections: ${this.metrics.queueRejections}`);
      throw new TimeoutError(
        `Queue is full (${this.queue.length}/${this.maxQueueSize}). Too many concurrent requests.`,
        { queueSize: this.queue.length, maxQueueSize: this.maxQueueSize }
      );
    }

    // Queue for later, with a timeout to prevent indefinite waiting.
    return new Promise((resolve, reject) => {
      let timedOut = false;
      let queueEntry = null; // assigned below; used by the timeout callback

      const timeoutId = setTimeout(() => {
        if (!queueEntry) {
          return;
        }
        const index = this.queue.indexOf(queueEntry);
        if (index < 0) {
          // Already dequeued for processing: its resolve/reject will fire.
          // FIX: do NOT mark as timed out here — the old code set the flag
          // unconditionally, which suppressed completion metrics for requests
          // that finished just as the timer fired.
          return;
        }
        timedOut = true;
        this.queue.splice(index, 1);
        this.metrics.timeouts++;
        const waitTime = Date.now() - queueStartTime;
        warn(`[BatchOptimizer] Request timed out after ${waitTime}ms in queue (limit: ${this.requestTimeout}ms). Total timeouts: ${this.metrics.timeouts}`);
        reject(new TimeoutError(
          `Request timed out after ${this.requestTimeout}ms in queue`,
          { timeout: this.requestTimeout, queuePosition: index, waitTime }
        ));
      }, this.requestTimeout);

      // resolve/reject are wrapped so the timeout is always cleared and wait
      // time is recorded exactly once.
      queueEntry = {
        imagePath,
        prompt,
        context,
        validateFn,
        queueStartTime,
        resolve: (value) => {
          clearTimeout(timeoutId);
          if (!timedOut) {
            this._recordCompletion(queueStartTime);
          }
          resolve(value);
        },
        reject: (error) => {
          clearTimeout(timeoutId);
          reject(error);
        }
      };

      this.queue.push(queueEntry);
      this._processQueue();
    });
  }

  /**
   * Process a single request against the validator, caching the result.
   *
   * @param {string} imagePath - Screenshot path
   * @param {string} prompt - Validation prompt
   * @param {object} context - Validation context
   * @param {Function|null} validateFn - Validator; lazily imported when null
   * @returns {Promise<*>} Validation result
   */
  async _processRequest(imagePath, prompt, context, validateFn) {
    // FIX: reserve the concurrency slot BEFORE any await. Previously the
    // increment happened after `await import('./judge.mjs')`, so multiple
    // callers could pass the activeRequests < maxConcurrency check while the
    // import was pending and exceed the configured limit.
    this.activeRequests++;
    try {
      if (!validateFn) {
        const { validateScreenshot } = await import('./judge.mjs');
        validateFn = validateScreenshot;
      }

      const result = await validateFn(imagePath, prompt, context);

      if (this.cache) {
        this.cache.set(this._getCacheKey(imagePath, prompt, context), result);
      }

      return result;
    } finally {
      this.activeRequests--;
      // Fire-and-forget: wake the queue now that a slot is free.
      this._processQueue();
    }
  }

  /**
   * Drain queued requests in batches while concurrency slots are available.
   * Re-entrancy is guarded by this.processing.
   */
  async _processQueue() {
    if (this.processing || this.queue.length === 0 || this.activeRequests >= this.maxConcurrency) {
      return;
    }

    this.processing = true;

    try {
      while (this.queue.length > 0 && this.activeRequests < this.maxConcurrency) {
        const batch = this.queue.splice(0, this.batchSize);

        const promises = batch.map(async ({ imagePath, prompt, context, validateFn, resolve, reject }) => {
          try {
            // Re-check cache: another request may have populated it while
            // this entry was waiting in the queue.
            if (this.cache) {
              const cacheKey = this._getCacheKey(imagePath, prompt, context);
              if (this.cache.has(cacheKey)) {
                resolve(this.cache.get(cacheKey));
                return;
              }
            }

            const result = await this._processRequest(imagePath, prompt, context, validateFn);
            resolve(result);
          } catch (error) {
            reject(error);
          }
        });

        // Wait for the batch to settle before starting the next one.
        await Promise.allSettled(promises);
      }
    } finally {
      this.processing = false;
    }
  }

  /**
   * Clear cache (useful for testing).
   *
   * @returns {void}
   */
  clearCache() {
    if (this.cache) {
      this.cache.clear();
    }
  }

  /**
   * Get cache stats.
   *
   * @returns {{ cacheSize: number; queueLength: number; activeRequests: number }} Cache statistics
   */
  getCacheStats() {
    return {
      cacheSize: this.cache ? this.cache.size : 0,
      queueLength: this.queue.length,
      activeRequests: this.activeRequests
    };
  }

  /**
   * Get performance metrics.
   *
   * Exports metrics so the queue-limit and timeout claims are verifiable.
   *
   * @returns {Object} Performance metrics including queue rejections and timeouts
   */
  getPerformanceMetrics() {
    // Defensive: metrics are normally initialized in the constructor.
    if (!this.metrics) {
      this.metrics = {
        queueRejections: 0,
        timeouts: 0,
        totalQueued: 0,
        totalProcessed: 0,
        averageWaitTime: 0,
        waitTimes: []
      };
    }

    return {
      queue: {
        currentLength: this.queue.length,
        maxSize: this.maxQueueSize,
        rejections: this.metrics.queueRejections,
        totalQueued: this.metrics.totalQueued,
        totalProcessed: this.metrics.totalProcessed,
        averageWaitTime: this.metrics.averageWaitTime,
        timeouts: this.metrics.timeouts,
        timeoutRate: this.metrics.totalQueued > 0
          ? (this.metrics.timeouts / this.metrics.totalQueued) * 100
          : 0,
        // FIX: rejected requests are already counted in totalQueued (it is
        // incremented before the overflow check), so adding queueRejections
        // to the denominator double-counted them and understated the rate.
        rejectionRate: this.metrics.totalQueued > 0
          ? (this.metrics.queueRejections / this.metrics.totalQueued) * 100
          : 0
      },
      concurrency: {
        active: this.activeRequests,
        max: this.maxConcurrency,
        utilization: this.maxConcurrency > 0
          ? (this.activeRequests / this.maxConcurrency) * 100
          : 0
      },
      cache: this.getCacheStats()
    };
  }
}
451
+