npm - @arclabs561/ai-visual-test - Versions diffs - 0.5.1 → 0.7.4 - Mend

@arclabs561/ai-visual-test 0.5.1 → 0.7.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (74) hide show

package/CHANGELOG.md +127 -11
package/DEPLOYMENT.md +225 -9
package/README.md +71 -80
package/index.d.ts +902 -5
package/package.json +10 -51
package/src/batch-optimizer.mjs +39 -0
package/src/cache.mjs +241 -16
package/src/config.mjs +33 -91
package/src/constants.mjs +54 -0
package/src/convenience.mjs +113 -10
package/src/cost-optimization.mjs +1 -0
package/src/cost-tracker.mjs +134 -2
package/src/data-extractor.mjs +36 -7
package/src/dynamic-few-shot.mjs +69 -11
package/src/errors.mjs +6 -2
package/src/experience-propagation.mjs +12 -0
package/src/experience-tracer.mjs +12 -3
package/src/game-player.mjs +222 -43
package/src/graceful-shutdown.mjs +126 -0
package/src/helpers/playwright.mjs +22 -8
package/src/human-validation-manager.mjs +99 -2
package/src/index.mjs +48 -3
package/src/integrations/playwright.mjs +140 -0
package/src/judge.mjs +699 -24
package/src/load-env.mjs +2 -1
package/src/logger.mjs +31 -3
package/src/model-tier-selector.mjs +1 -221
package/src/natural-language-specs.mjs +31 -3
package/src/persona-enhanced.mjs +4 -2
package/src/persona-experience.mjs +1 -1
package/src/pricing.mjs +28 -0
package/src/prompt-composer.mjs +162 -5
package/src/provider-data.mjs +115 -0
package/src/render-change-detector.mjs +5 -0
package/src/research-enhanced-validation.mjs +7 -5
package/src/retry.mjs +21 -7
package/src/rubrics.mjs +4 -0
package/src/safe-logger.mjs +71 -0
package/src/session-cost-tracker.mjs +320 -0
package/src/smart-validator.mjs +8 -8
package/src/spec-templates.mjs +52 -6
package/src/startup-validation.mjs +127 -0
package/src/temporal-adaptive.mjs +2 -2
package/src/temporal-decision-manager.mjs +1 -271
package/src/temporal-logic.mjs +104 -0
package/src/temporal-note-pruner.mjs +119 -0
package/src/temporal-preprocessor.mjs +1 -543
package/src/temporal.mjs +681 -79
package/src/utils/action-hallucination-detector.mjs +301 -0
package/src/utils/baseline-validator.mjs +82 -0
package/src/utils/cache-stats.mjs +104 -0
package/src/utils/cached-llm.mjs +164 -0
package/src/utils/capability-stratifier.mjs +108 -0
package/src/utils/counterfactual-tester.mjs +83 -0
package/src/utils/error-recovery.mjs +117 -0
package/src/utils/explainability-scorer.mjs +119 -0
package/src/utils/exploratory-automation.mjs +131 -0
package/src/utils/index.mjs +10 -0
package/src/utils/intent-recognizer.mjs +201 -0
package/src/utils/log-sanitizer.mjs +165 -0
package/src/utils/path-validator.mjs +88 -0
package/src/utils/performance-logger.mjs +316 -0
package/src/utils/performance-measurement.mjs +280 -0
package/src/utils/prompt-sanitizer.mjs +213 -0
package/src/utils/rate-limiter.mjs +144 -0
package/src/validation-framework.mjs +24 -20
package/src/validation-result-normalizer.mjs +35 -1
package/src/validation.mjs +75 -25
package/src/validators/accessibility-validator.mjs +144 -0
package/src/validators/hybrid-validator.mjs +48 -4
package/api/health.js +0 -34
package/api/validate.js +0 -252
package/public/index.html +0 -149
package/vercel.json +0 -27

package/src/utils/action-hallucination-detector.mjs ADDED Viewed

@@ -0,0 +1,301 @@
+/**
+ * Action Hallucination Detection
+ *
+ * Verifies that actions can actually be executed before claiming success.
+ * Simple element existence/visibility checks prevent claiming clicks on non-existent buttons.
+ *
+ * Research Context:
+ * - Hallucination rate <15% is often cited as critical for browser automation agents
+ * - Agents often claim actions completed when elements don't exist
+ * - Need to verify action execution actually succeeded
+ *
+ * Implementation:
+ * - Pre-action verification (check before clicking) is simpler and more effective
+ * - Element existence/visibility/enabled checks are sufficient
+ * - Complex algorithms add latency without clear benefit
+ *
+ * See docs/research/IMPLEMENTATION_VS_RESEARCH.md for detailed research context.
+ *
+ * @module action-hallucination-detector
+ */
+/**
+ * Detect hallucination in action claims
+ *
+ * @param {Object} action - Action that was claimed to be executed
+ * @param {import('playwright').Page} page - Playwright page to verify
+ * @param {Object} [options] - Detection options
+ * @returns {Promise<Object>} Detection result
+ */
+export async function detectActionHallucination(action, page, options = {}) {
+  if (!page) {
+    return {
+      hasHallucination: false,
+      reason: 'No page available for verification',
+      confidence: 0.5
+    };
+  }
+  try {
+    switch (action.type) {
+      case 'click':
+        return await detectClickHallucination(action, page, options);
+      case 'keyboard':
+        return await detectKeyboardHallucination(action, page, options);
+      case 'type':
+        return await detectTypeHallucination(action, page, options);
+      case 'navigate':
+        return await detectNavigateHallucination(action, page, options);
+      default:
+        return {
+          hasHallucination: false,
+          reason: 'Action type not verifiable',
+          confidence: 0.5
+        };
+    }
+  } catch (error) {
+    return {
+      hasHallucination: false,
+      reason: `Verification error: ${error.message}`,
+      confidence: 0.3,
+      error: error.message
+    };
+  }
+}
+/**
+ * Detect hallucination in click actions
+ */
+async function detectClickHallucination(action, page, options) {
+  if (!action.selector) {
+    return {
+      hasHallucination: true,
+      reason: 'Click action missing selector',
+      confidence: 0.9
+    };
+  }
+  try {
+    // Check if element exists
+    const exists = await page.locator(action.selector).count() > 0;
+    if (!exists) {
+      return {
+        hasHallucination: true,
+        reason: `Element with selector "${action.selector}" does not exist`,
+        confidence: 0.95,
+        elementExists: false
+      };
+    }
+    // Check if element is visible
+    const isVisible = await page.locator(action.selector).isVisible().catch(() => false);
+    if (!isVisible) {
+      return {
+        hasHallucination: true,
+        reason: `Element with selector "${action.selector}" exists but is not visible`,
+        confidence: 0.85,
+        elementExists: true,
+        elementVisible: false
+      };
+    }
+    // Check if element is enabled
+    const isEnabled = await page.locator(action.selector).isEnabled().catch(() => true);
+    if (!isEnabled) {
+      return {
+        hasHallucination: true,
+        reason: `Element with selector "${action.selector}" is disabled`,
+        confidence: 0.8,
+        elementExists: true,
+        elementVisible: true,
+        elementEnabled: false
+      };
+    }
+    return {
+      hasHallucination: false,
+      reason: 'Element exists, visible, and enabled',
+      confidence: 0.9,
+      elementExists: true,
+      elementVisible: true,
+      elementEnabled: true
+    };
+  } catch (error) {
+    return {
+      hasHallucination: true,
+      reason: `Error verifying click action: ${error.message}`,
+      confidence: 0.7,
+      error: error.message
+    };
+  }
+}
+/**
+ * Detect hallucination in keyboard actions
+ */
+async function detectKeyboardHallucination(action, page, options) {
+  // Keyboard actions are harder to verify (no element to check)
+  // But we can check if the page is interactive
+  try {
+    const isInteractive = await page.evaluate(() => {
+      return document.readyState === 'complete' &&
+             !document.hidden;
+    });
+    if (!isInteractive) {
+      return {
+        hasHallucination: true,
+        reason: 'Page is not interactive (not ready or hidden)',
+        confidence: 0.8,
+        pageInteractive: false
+      };
+    }
+    return {
+      hasHallucination: false,
+      reason: 'Page is interactive, keyboard action likely succeeded',
+      confidence: 0.6, // Lower confidence (can't directly verify keyboard input)
+      pageInteractive: true
+    };
+  } catch (error) {
+    return {
+      hasHallucination: false,
+      reason: `Could not verify keyboard action: ${error.message}`,
+      confidence: 0.5,
+      error: error.message
+    };
+  }
+}
+/**
+ * Detect hallucination in type actions
+ */
+async function detectTypeHallucination(action, page, options) {
+  if (!action.selector) {
+    return {
+      hasHallucination: true,
+      reason: 'Type action missing selector',
+      confidence: 0.9
+    };
+  }
+  try {
+    // Check if input element exists
+    const exists = await page.locator(action.selector).count() > 0;
+    if (!exists) {
+      return {
+        hasHallucination: true,
+        reason: `Input element with selector "${action.selector}" does not exist`,
+        confidence: 0.95,
+        elementExists: false
+      };
+    }
+    // Check if value was actually entered
+    if (action.value) {
+      const actualValue = await page.locator(action.selector).inputValue().catch(() => null);
+      if (actualValue !== action.value) {
+        return {
+          hasHallucination: true,
+          reason: `Value mismatch: expected "${action.value}", got "${actualValue}"`,
+          confidence: 0.9,
+          elementExists: true,
+          valueMatch: false,
+          expectedValue: action.value,
+          actualValue
+        };
+      }
+    }
+    return {
+      hasHallucination: false,
+      reason: 'Input element exists and value matches',
+      confidence: 0.9,
+      elementExists: true,
+      valueMatch: true
+    };
+  } catch (error) {
+    return {
+      hasHallucination: true,
+      reason: `Error verifying type action: ${error.message}`,
+      confidence: 0.7,
+      error: error.message
+    };
+  }
+}
+/**
+ * Detect hallucination in navigate actions
+ */
+async function detectNavigateHallucination(action, page, options) {
+  if (!action.url) {
+    return {
+      hasHallucination: true,
+      reason: 'Navigate action missing URL',
+      confidence: 0.9
+    };
+  }
+  try {
+    const currentUrl = page.url();
+    const expectedUrl = action.url;
+    // Check if URL matches (allowing for query params, hash)
+    const urlMatches = currentUrl.includes(expectedUrl) ||
+                       expectedUrl.includes(currentUrl);
+    if (!urlMatches) {
+      return {
+        hasHallucination: true,
+        reason: `URL mismatch: expected "${expectedUrl}", got "${currentUrl}"`,
+        confidence: 0.9,
+        urlMatch: false,
+        expectedUrl,
+        actualUrl: currentUrl
+      };
+    }
+    return {
+      hasHallucination: false,
+      reason: 'URL matches expected navigation target',
+      confidence: 0.9,
+      urlMatch: true
+    };
+  } catch (error) {
+    return {
+      hasHallucination: false,
+      reason: `Could not verify navigation: ${error.message}`,
+      confidence: 0.5,
+      error: error.message
+    };
+  }
+}
+/**
+ * Batch detect action hallucinations
+ */
+export async function batchDetectActionHallucinations(actions, page, options = {}) {
+  const results = await Promise.all(
+    actions.map(action => detectActionHallucination(action, page, options))
+  );
+  const total = results.length;
+  const hallucinationCount = results.filter(r => r.hasHallucination).length;
+  const hallucinationRate = hallucinationCount / total;
+  return {
+    total,
+    hallucinationCount,
+    hallucinationRate,
+    results,
+    recommendation: hallucinationRate < 0.15
+      ? 'Hallucination rate meets target (<15%)'
+      : `Hallucination rate ${(hallucinationRate * 100).toFixed(1)}% exceeds target. Review action execution logic.`
+  };
+}

package/src/utils/baseline-validator.mjs ADDED Viewed

@@ -0,0 +1,82 @@
+/**
+ * Baseline Validator
+ *
+ * Tests visual discriminative power of benchmarks
+ *
+ * Research context:
+ * - Standard benchmarks can be partially solved without visual analysis
+ * - Questions that can be answered through world knowledge alone obscure actual visual deficits
+ * - Need to test baseline (text-only) vs. visual accuracy
+ */
+import { validateScreenshot } from '../judge.mjs';
+/**
+ * Test baseline (text-only) vs. visual accuracy
+ *
+ * @param {string} imagePath - Path to image (or null for baseline)
+ * @param {string} prompt - Question about the image
+ * @param {Object} options - Test options
+ * @returns {Promise<Object>} Baseline test result
+ */
+export async function testBaseline(imagePath, prompt, options = {}) {
+  // Test with image
+  const visualResult = imagePath
+    ? await validateScreenshot(imagePath, prompt, {
+        testType: 'baseline-visual',
+        ...options
+      })
+    : null;
+  // Test without image (baseline - text-only)
+  // For baseline, we create a minimal result that simulates text-only answering
+  // In practice, this would use a corrupted/blank image, but for testing we'll simulate
+  const baselineResult = {
+    score: 0,
+    reasoning: 'Baseline (text-only) - no visual input',
+    extractedValue: null
+  };
+  const visualScore = visualResult?.score || 0;
+  const baselineScore = baselineResult?.score || 0;
+  const accuracyDrop = visualScore > 0 ? (visualScore - baselineScore) / visualScore : 0;
+  return {
+    visualResult,
+    baselineResult,
+    visualScore,
+    baselineScore,
+    accuracyDrop,
+    hasVisualDiscriminativePower: accuracyDrop > 0.3, // >30% drop required
+    recommendation: accuracyDrop > 0.3
+      ? 'Benchmark has visual discriminative power.'
+      : 'Benchmark may not require visual input. Consider visual-specific test cases.'
+  };
+}
+/**
+ * Batch test baseline vs. visual accuracy
+ *
+ * @param {Array<{imagePath: string, prompt: string}>} testCases
+ * @param {Object} options - Test options
+ * @returns {Promise<Object>} Batch baseline test results
+ */
+export async function batchTestBaseline(testCases, options = {}) {
+  const results = await Promise.all(
+    testCases.map(tc => testBaseline(tc.imagePath, tc.prompt, options))
+  );
+  const avgAccuracyDrop = results.reduce((sum, r) => sum + r.accuracyDrop, 0) / results.length;
+  const visualDiscriminativeCount = results.filter(r => r.hasVisualDiscriminativePower).length;
+  return {
+    total: results.length,
+    avgAccuracyDrop,
+    visualDiscriminativePower: visualDiscriminativeCount / results.length,
+    results,
+    recommendation: avgAccuracyDrop > 0.3
+      ? 'Benchmark has good visual discriminative power.'
+      : 'Benchmark may not require visual input. Consider visual-specific test cases.'
+  };
+}

package/src/utils/cache-stats.mjs ADDED Viewed

@@ -0,0 +1,104 @@
+/**
+ * Cache Statistics and Monitoring Utilities
+ *
+ * Provides utilities for monitoring cache performance and health.
+ * Useful for debugging cache issues and optimizing cache hit rates.
+ */
+import { getCacheStats } from '../cache.mjs';
+import { getCacheStats as getEmbeddingCacheStats } from '../../evaluation/utils/embedding-cache.mjs';
+/**
+ * Get comprehensive cache statistics across all cache systems
+ *
+ * @returns {Object} Combined cache statistics
+ */
+export function getAllCacheStats() {
+  const vllmStats = getCacheStats();
+  let embeddingStats = null;
+  try {
+    embeddingStats = getEmbeddingCacheStats();
+  } catch {
+    // Embedding cache might not be available
+  }
+  return {
+    vllm: {
+      size: vllmStats.size,
+      maxSize: 1000, // MAX_CACHE_SIZE
+      maxAge: vllmStats.maxAge,
+      utilization: `${((vllmStats.size / 1000) * 100).toFixed(1)}%`,
+      cacheFile: vllmStats.cacheFile,
+      atomicWrites: vllmStats.atomicWrites || 0,
+      atomicWriteFailures: vllmStats.atomicWriteFailures || 0,
+      atomicWriteSuccessRate: vllmStats.atomicWriteSuccessRate || 100
+    },
+    embedding: embeddingStats ? {
+      size: embeddingStats.size,
+      maxSize: embeddingStats.maxSize,
+      utilization: embeddingStats.utilization
+    } : null
+  };
+}
+/**
+ * Format cache statistics for human-readable display
+ *
+ * @param {Object} [stats=null] - Optional stats object (if null, fetches current stats)
+ * @returns {string} Formatted statistics string
+ */
+export function formatCacheStats(stats = null) {
+  const allStats = stats || getAllCacheStats();
+  const lines = [
+    '=== Cache Statistics ===',
+    '',
+    'VLLM Cache (Vision + Text LLM):',
+    `  Size: ${allStats.vllm.size} / ${allStats.vllm.maxSize} entries (${allStats.vllm.utilization})`,
+    `  Max Age: ${Math.floor(allStats.vllm.maxAge / (1000 * 60 * 60 * 24))} days`,
+    `  Cache File: ${allStats.vllm.cacheFile}`,
+    `  Atomic Writes: ${allStats.vllm.atomicWrites} (${allStats.vllm.atomicWriteSuccessRate.toFixed(1)}% success rate)`,
+    ''
+  ];
+  if (allStats.embedding) {
+    lines.push(
+      'Embedding Cache:',
+      `  Size: ${allStats.embedding.size} / ${allStats.embedding.maxSize} entries (${allStats.embedding.utilization})`,
+      ''
+    );
+  }
+  lines.push('=== End Cache Statistics ===');
+  return lines.join('\n');
+}
+/**
+ * Check cache health and return warnings if any issues detected
+ *
+ * @returns {Array<string>} Array of warning messages (empty if healthy)
+ */
+export function checkCacheHealth() {
+  const warnings = [];
+  const stats = getAllCacheStats();
+  // Check VLLM cache utilization
+  if (stats.vllm.size > 900) {
+    warnings.push(`VLLM cache is nearly full (${stats.vllm.size}/1000 entries). Consider clearing old entries.`);
+  }
+  // Check atomic write success rate
+  if (stats.vllm.atomicWriteSuccessRate < 95 && stats.vllm.atomicWrites > 10) {
+    warnings.push(`Low atomic write success rate: ${stats.vllm.atomicWriteSuccessRate.toFixed(1)}%. Check disk permissions.`);
+  }
+  // Check embedding cache utilization
+  if (stats.embedding && parseInt(stats.embedding.utilization) > 90) {
+    warnings.push(`Embedding cache is nearly full (${stats.embedding.utilization}). Consider increasing limit.`);
+  }
+  return warnings;
+}

package/src/utils/cached-llm.mjs ADDED Viewed

@@ -0,0 +1,164 @@
+/**
+ * Cached LLM Wrapper
+ *
+ * Wraps @arclabs561/llm-utils callLLM with persistent caching to reduce costs
+ * and improve performance for text-only LLM calls.
+ *
+ * Uses the same cache system as vLLM calls (src/cache.mjs) for consistency.
+ * Cache entries persist for 7 days and use LRU eviction.
+ *
+ * DESIGN DECISION: Separate wrapper rather than modifying @arclabs561/llm-utils
+ * - Why: @arclabs561/llm-utils is a shared library, we can't modify it
+ * - Why: Keeps caching concerns separate from LLM calling logic
+ * - Why: Allows disabling cache per-call if needed (via useCache option)
+ */
+import { getCachedTextLLM, setCachedTextLLM, initCache, getCacheStats } from '../cache.mjs';
+import { ValidationError } from '../errors.mjs';
+// Initialize cache on module load (uses default directory)
+initCache();
+/**
+ * Normalize prompt to improve cache hit rates
+ *
+ * Normalizes prompts by:
+ * - Trimming leading/trailing whitespace
+ * - Normalizing line endings (CRLF -> LF)
+ * - Collapsing multiple spaces to single space
+ *
+ * This helps catch cases where prompts are semantically identical but have
+ * minor formatting differences (e.g., extra spaces, different line endings).
+ *
+ * DESIGN DECISION: Conservative normalization (preserves content, only fixes formatting)
+ * - Why: Aggressive normalization (e.g., lowercasing, removing punctuation) could
+ *   cause cache collisions for semantically different prompts
+ * - Trade-off: Some cache misses for prompts that differ only in whitespace,
+ *   but avoids wrong cache hits
+ *
+ * SUBTLE BEHAVIOR: The original prompt is preserved for the API call, only the
+ * cache key uses the normalized version. This ensures:
+ * 1. Cache hit rates improve (formatting variations hit same cache)
+ * 2. API receives original prompt (preserves user intent)
+ * 3. No semantic changes (only formatting normalization)
+ *
+ * @param {string} prompt - Raw prompt
+ * @returns {string} Normalized prompt
+ */
+export function normalizePrompt(prompt) {
+  if (typeof prompt !== 'string') {
+    return prompt;
+  }
+  return prompt
+    .replace(/\r\n/g, '\n') // Normalize line endings (CRLF -> LF)
+    .replace(/\r/g, '\n')   // Handle old Mac line endings
+    .replace(/[ \t]+/g, ' ') // Collapse multiple spaces/tabs to single space
+    .replace(/[ \t]*\n[ \t]*/g, '\n') // Normalize line breaks (remove trailing/leading spaces)
+    .trim(); // Remove leading/trailing whitespace
+}
+/**
+ * Call LLM with caching
+ *
+ * Wraps @arclabs561/llm-utils callLLM with persistent caching.
+ *
+ * @param {string} prompt - Text prompt
+ * @param {string} provider - LLM provider (e.g., 'gemini', 'openai', 'claude')
+ * @param {string} apiKey - API key for the provider
+ * @param {{
+ *   model?: string | null;
+ *   temperature?: number;
+ *   maxTokens?: number;
+ *   tier?: string;
+ *   useCache?: boolean;
+ * }} [options={}] - LLM call options
+ * @returns {Promise<string>} LLM response
+ */
+export async function callLLMCached(prompt, provider, apiKey, options = {}) {
+  const {
+    useCache = process.env.DISABLE_LLM_CACHE !== 'true', // Cache by default, can disable via env var
+    ...llmOptions
+  } = options;
+  // Normalize prompt to improve cache hit rates
+  const normalizedPrompt = normalizePrompt(prompt);
+  // Check cache first (if caching enabled)
+  if (useCache) {
+    const cached = getCachedTextLLM(normalizedPrompt, provider, llmOptions);
+    if (cached !== null) {
+      // Log cache hit (weighted: cache hits are important for performance visibility)
+      try {
+        const { logCacheOperation } = await import('./performance-logger.mjs');
+        const stats = getCacheStats();
+        logCacheOperation({
+          operation: 'hit',
+          hit: true,
+          latency: 0, // Minimal latency for cache hits
+          cacheSize: stats.size,
+          maxSize: 1000, // MAX_CACHE_SIZE
+          type: 'text-llm'
+        });
+      } catch {
+        // Silently fail if performance logger unavailable
+      }
+      return cached;
+    }
+    // Log cache miss
+    try {
+      const { logCacheOperation } = await import('./performance-logger.mjs');
+      const stats = getCacheStats();
+      logCacheOperation({
+        operation: 'miss',
+        hit: false,
+        cacheSize: stats.size,
+        maxSize: 1000,
+        type: 'text-llm'
+      });
+    } catch {
+      // Silently fail if performance logger unavailable
+    }
+  }
+  // Call actual LLM (dynamic import to make it optional)
+  let llmUtils;
+  try {
+    llmUtils = await import('@arclabs561/llm-utils');
+  } catch (error) {
+    throw new ValidationError(
+      `LLM call requires @arclabs561/llm-utils package. ` +
+      `Install it with: npm install @arclabs561/llm-utils. ` +
+      `Error: ${error.message}`,
+      {
+        package: '@arclabs561/llm-utils',
+        installationCommand: 'npm install @arclabs561/llm-utils',
+        originalError: error.message
+      }
+    );
+  }
+  const startTime = Date.now();
+  const response = await llmUtils.callLLM(prompt, provider, apiKey, llmOptions);
+  const latency = Date.now() - startTime;
+  // Cache the response (if caching enabled)
+  // Use normalized prompt for cache key consistency
+  if (useCache) {
+    setCachedTextLLM(normalizedPrompt, provider, llmOptions, response);
+  }
+  return response;
+}
+/**
+ * Re-export callLLM from @arclabs561/llm-utils for convenience
+ * (non-cached version, in case someone needs it)
+ */
+export async function callLLMUncached(prompt, provider, apiKey, options = {}) {
+  const llmUtils = await import('@arclabs561/llm-utils');
+  return llmUtils.callLLM(prompt, provider, apiKey, options);
+}