npm - @arclabs561/ai-visual-test - Versions diffs - 0.5.1 - Mend

@arclabs561/ai-visual-test 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (93) hide show

package/.secretsignore.example +20 -0
package/CHANGELOG.md +360 -0
package/CONTRIBUTING.md +63 -0
package/DEPLOYMENT.md +80 -0
package/LICENSE +22 -0
package/README.md +142 -0
package/SECURITY.md +108 -0
package/api/health.js +34 -0
package/api/validate.js +252 -0
package/index.d.ts +1221 -0
package/package.json +112 -0
package/public/index.html +149 -0
package/src/batch-optimizer.mjs +451 -0
package/src/bias-detector.mjs +370 -0
package/src/bias-mitigation.mjs +233 -0
package/src/cache.mjs +433 -0
package/src/config.mjs +268 -0
package/src/constants.mjs +80 -0
package/src/context-compressor.mjs +350 -0
package/src/convenience.mjs +617 -0
package/src/cost-tracker.mjs +257 -0
package/src/cross-modal-consistency.mjs +170 -0
package/src/data-extractor.mjs +232 -0
package/src/dynamic-few-shot.mjs +140 -0
package/src/dynamic-prompts.mjs +361 -0
package/src/ensemble/index.mjs +53 -0
package/src/ensemble-judge.mjs +366 -0
package/src/error-handler.mjs +67 -0
package/src/errors.mjs +167 -0
package/src/experience-propagation.mjs +128 -0
package/src/experience-tracer.mjs +487 -0
package/src/explanation-manager.mjs +299 -0
package/src/feedback-aggregator.mjs +248 -0
package/src/game-goal-prompts.mjs +478 -0
package/src/game-player.mjs +548 -0
package/src/hallucination-detector.mjs +155 -0
package/src/helpers/playwright.mjs +80 -0
package/src/human-validation-manager.mjs +516 -0
package/src/index.mjs +364 -0
package/src/judge.mjs +929 -0
package/src/latency-aware-batch-optimizer.mjs +192 -0
package/src/load-env.mjs +159 -0
package/src/logger.mjs +55 -0
package/src/metrics.mjs +187 -0
package/src/model-tier-selector.mjs +221 -0
package/src/multi-modal/index.mjs +36 -0
package/src/multi-modal-fusion.mjs +190 -0
package/src/multi-modal.mjs +524 -0
package/src/natural-language-specs.mjs +1071 -0
package/src/pair-comparison.mjs +277 -0
package/src/persona/index.mjs +42 -0
package/src/persona-enhanced.mjs +200 -0
package/src/persona-experience.mjs +572 -0
package/src/position-counterbalance.mjs +140 -0
package/src/prompt-composer.mjs +375 -0
package/src/render-change-detector.mjs +583 -0
package/src/research-enhanced-validation.mjs +436 -0
package/src/retry.mjs +152 -0
package/src/rubrics.mjs +231 -0
package/src/score-tracker.mjs +277 -0
package/src/smart-validator.mjs +447 -0
package/src/spec-config.mjs +106 -0
package/src/spec-templates.mjs +347 -0
package/src/specs/index.mjs +38 -0
package/src/temporal/index.mjs +102 -0
package/src/temporal-adaptive.mjs +163 -0
package/src/temporal-batch-optimizer.mjs +222 -0
package/src/temporal-constants.mjs +69 -0
package/src/temporal-context.mjs +49 -0
package/src/temporal-decision-manager.mjs +271 -0
package/src/temporal-decision.mjs +669 -0
package/src/temporal-errors.mjs +58 -0
package/src/temporal-note-pruner.mjs +173 -0
package/src/temporal-preprocessor.mjs +543 -0
package/src/temporal-prompt-formatter.mjs +219 -0
package/src/temporal-validation.mjs +159 -0
package/src/temporal.mjs +415 -0
package/src/type-guards.mjs +311 -0
package/src/uncertainty-reducer.mjs +470 -0
package/src/utils/index.mjs +175 -0
package/src/validation-framework.mjs +321 -0
package/src/validation-result-normalizer.mjs +64 -0
package/src/validation.mjs +243 -0
package/src/validators/accessibility-programmatic.mjs +345 -0
package/src/validators/accessibility-validator.mjs +223 -0
package/src/validators/batch-validator.mjs +143 -0
package/src/validators/hybrid-validator.mjs +268 -0
package/src/validators/index.mjs +34 -0
package/src/validators/prompt-builder.mjs +218 -0
package/src/validators/rubric.mjs +85 -0
package/src/validators/state-programmatic.mjs +260 -0
package/src/validators/state-validator.mjs +291 -0
package/vercel.json +27 -0

package/src/pair-comparison.mjs ADDED Viewed

@@ -0,0 +1,277 @@
+/**
+ * Pair Comparison Evaluation
+ *
+ * Implements pairwise comparison evaluation method.
+ * Research shows Pair Comparison is more reliable than absolute scoring
+ * (MLLM-as-a-Judge, arXiv:2402.04788).
+ *
+ * Instead of scoring each screenshot independently, compares pairs
+ * to determine which is better, then derives relative scores.
+ */
+import { VLLMJudge } from './judge.mjs';
+import { detectBias, detectPositionBias } from './bias-detector.mjs';
+import { composeComparisonPrompt } from './prompt-composer.mjs';
/**
 * Compare two screenshots and determine which is better.
 *
 * Both images are sent to the judge in a single call, in a randomized
 * presentation order. The judge's "A"/"B" verdict and scores refer to the
 * order in which the images were shown; they are mapped back before returning
 * so that `winner: 'A'` / `comparison.score1` always describe `imagePath1`
 * and `winner: 'B'` / `comparison.score2` always describe `imagePath2`.
 *
 * Never rejects: any failure is reported as a result with `enabled: false`.
 *
 * @param {string} imagePath1 - Path to first screenshot
 * @param {string} imagePath2 - Path to second screenshot
 * @param {string} prompt - Evaluation prompt describing what to compare
 * @param {import('./index.mjs').ValidationContext} [context={}] - Validation context
 * @returns {Promise<import('./index.mjs').PairComparisonResult>} Comparison result
 */
export async function comparePair(imagePath1, imagePath2, prompt, context = {}) {
  const judge = new VLLMJudge(context);
  if (!judge.enabled) {
    return {
      enabled: false,
      winner: null,
      confidence: null,
      reasoning: 'VLLM validation is disabled',
      comparison: null
    };
  }
  const comparisonPrompt = buildComparisonPrompt(prompt, context);
  // Randomize presentation order to reduce position bias
  const isReversed = !(Math.random() > 0.5);
  const order = isReversed ? 'reversed' : 'original';
  const [first, second] = isReversed
    ? [imagePath2, imagePath1]
    : [imagePath1, imagePath2];
  const fullPrompt = `${comparisonPrompt}
You are comparing two screenshots. Screenshot A is shown first, then Screenshot B.
SCREENSHOT A:
[First screenshot will be provided]
SCREENSHOT B:
[Second screenshot will be provided]
Compare them and determine which is better based on the evaluation criteria. Return JSON:
{
  "winner": "A" | "B" | "tie",
  "confidence": 0.0-1.0,
  "reasoning": "explanation of comparison",
  "differences": ["key difference 1", "key difference 2"],
  "scores": {
    "A": 0-10,
    "B": 0-10
  }
}`;
  try {
    // Multi-image comparison: both images go out in a single judge call.
    // The full prompt (criteria + JSON output contract) must be sent,
    // otherwise the structured parse below has nothing to rely on.
    const comparisonResult = await judge.judgeScreenshot([first, second], fullPrompt, {
      ...context,
      comparisonContext: { position: 'both', total: 2 }
    });
    if (!comparisonResult.enabled || comparisonResult.error) {
      return {
        enabled: false,
        winner: null,
        confidence: null,
        reasoning: comparisonResult.error || 'Comparison failed',
        comparison: null,
        error: comparisonResult.error || 'API disabled'
      };
    }
    const parsedComparison = parsePairJudgment(comparisonResult.judgment);
    // If we got a structured comparison, use it
    if (parsedComparison && parsedComparison.winner) {
      const verdict = String(parsedComparison.winner).trim().toLowerCase();
      // Map the verdict from presentation order back to argument order
      let mappedWinner = 'tie';
      if (verdict === 'a') {
        mappedWinner = isReversed ? 'B' : 'A';
      } else if (verdict === 'b') {
        mappedWinner = isReversed ? 'A' : 'B';
      }
      // Scores are keyed by presentation order too, so they need the same swap
      const shownScores = parsedComparison.scores;
      const shownScoreA = toFiniteOrNull(shownScores?.A ?? shownScores?.a);
      const shownScoreB = toFiniteOrNull(shownScores?.B ?? shownScores?.b);
      const [scoreA, scoreB] = isReversed
        ? [shownScoreB, shownScoreA]
        : [shownScoreA, shownScoreB];
      const rawConfidence = toFiniteOrNull(parsedComparison.confidence);
      const confidence = rawConfidence === null
        ? 0.5
        : Math.min(1, Math.max(0, rawConfidence));
      return {
        enabled: true,
        winner: mappedWinner,
        confidence,
        reasoning: parsedComparison.reasoning || comparisonResult.reasoning || 'Direct comparison completed',
        differences: Array.isArray(parsedComparison.differences) ? parsedComparison.differences : [],
        comparison: {
          // Placeholder scores (8/6/7) are used only when the judge gave none
          score1: scoreA ?? (mappedWinner === 'A' ? 8 : mappedWinner === 'B' ? 6 : 7),
          score2: scoreB ?? (mappedWinner === 'B' ? 8 : mappedWinner === 'A' ? 6 : 7),
          // Explicit null checks: a score of 0 is a valid score
          difference: scoreA !== null && scoreB !== null ? Math.abs(scoreA - scoreB) : null,
          order,
          method: 'multi-image'
        },
        biasDetection: {
          positionBias: false, // No position-bias adjustment is applied on this path
          adjusted: false
        },
        metadata: pairMetadata(comparisonResult)
      };
    }
    // Fallback: no usable structured verdict, so report a tie
    return {
      enabled: true,
      winner: 'tie',
      confidence: 0.5,
      reasoning: comparisonResult.reasoning || 'Comparison completed but could not parse structured result. Treating as tie.',
      comparison: {
        score1: comparisonResult.score ?? 7,
        score2: comparisonResult.score ?? 7,
        difference: 0,
        order,
        method: 'multi-image-fallback'
      },
      biasDetection: {
        positionBias: false,
        adjusted: false
      },
      metadata: {
        ...pairMetadata(comparisonResult),
        warning: 'Structured comparison parse failed - using fallback'
      }
    };
  } catch (error) {
    return {
      enabled: false,
      winner: null,
      confidence: null,
      reasoning: `Comparison failed: ${error.message}`,
      comparison: null,
      error: error.message
    };
  }
}

/** Extract the first-to-last-brace JSON object from judge text; null if absent or malformed. */
function parsePairJudgment(judgment) {
  if (typeof judgment !== 'string') {
    return null;
  }
  const jsonMatch = judgment.match(/\{[\s\S]*\}/);
  if (!jsonMatch) {
    return null;
  }
  try {
    const parsed = JSON.parse(jsonMatch[0]);
    return parsed !== null && typeof parsed === 'object' ? parsed : null;
  } catch {
    // Malformed JSON is expected occasionally; the caller falls back to a tie
    return null;
  }
}

/** Coerce a number or numeric string to a finite number; anything else becomes null. */
function toFiniteOrNull(value) {
  if (typeof value === 'string') {
    if (value.trim() === '') {
      return null;
    }
  } else if (typeof value !== 'number') {
    return null;
  }
  const num = Number(value);
  return Number.isFinite(num) ? num : null;
}

/** Copy provider/caching/cost bookkeeping from the judge result into result metadata. */
function pairMetadata(comparisonResult) {
  return {
    provider: comparisonResult.provider,
    cached: comparisonResult.cached || false,
    responseTime: comparisonResult.responseTime || 0,
    estimatedCost: comparisonResult.estimatedCost,
    logprobs: comparisonResult.logprobs || null // Include if available
  };
}
/**
 * Produce the criteria portion of a pairwise-comparison prompt.
 *
 * Delegates to the shared prompt composer; the rubric is included unless the
 * caller explicitly sets `context.includeRubric` to `false`. If composing
 * throws for any reason, a plain hand-written prompt is returned instead.
 */
function buildComparisonPrompt(basePrompt, context) {
  try {
    const includeRubric = context.includeRubric !== false; // Default true (research-backed)
    const composed = composeComparisonPrompt(basePrompt, context, { includeRubric });
    return composed;
  } catch {
    // Best-effort: fall back to a basic comparison prompt
    const fallbackLines = [
      'Compare the two screenshots based on the following criteria:',
      `${basePrompt}`,
      'Focus on:',
      '- Which screenshot better meets the criteria?',
      '- What are the key differences?',
      '- Which has fewer issues?',
      '- Which provides better user experience?',
      'Be specific about what makes one better than the other.'
    ];
    return fallbackLines.join('\n');
  }
}
/**
 * Rank multiple screenshots using pairwise comparisons.
 *
 * Every unordered pair is compared once (round-robin, n*(n-1)/2 sequential
 * `comparePair` calls). Each usable comparison distributes one point:
 * the winner receives `confidence`, the loser `1 - confidence`, and a tie
 * gives 0.5 to each side. Every input image appears in the rankings, even if
 * it only tied or some of its comparisons were unavailable.
 *
 * @param {Array<string>} imagePaths - Array of screenshot paths
 * @param {string} prompt - Evaluation prompt
 * @param {import('./index.mjs').ValidationContext} [context={}] - Validation context
 * @returns {Promise<import('./index.mjs').BatchRankingResult>} Ranking result;
 *   `enabled: false` with an `error` when fewer than 2 screenshots are given
 *   or when no comparison could be completed. `comparisons` and
 *   `metadata.totalComparisons` count decisive (non-tie) comparisons.
 */
export async function rankBatch(imagePaths, prompt, context = {}) {
  if (!Array.isArray(imagePaths) || imagePaths.length < 2) {
    return {
      enabled: false,
      rankings: [],
      error: 'Need at least 2 screenshots for ranking'
    };
  }
  const total = imagePaths.length;
  // Seed every image so none is dropped from the final ranking
  const scores = new Array(total).fill(0);
  const wins = new Array(total).fill(0);
  const comparisons = [];
  let usableComparisons = 0;
  // Compare all pairs. This is exhaustive; for large batches a tournament
  // bracket or sampling would need fewer judge calls.
  for (let i = 0; i < total; i++) {
    for (let j = i + 1; j < total; j++) {
      const comparison = await comparePair(
        imagePaths[i],
        imagePaths[j],
        prompt,
        context
      );
      if (!comparison || !comparison.enabled) {
        continue;
      }
      usableComparisons += 1;
      if (comparison.winner === 'tie') {
        // A tie splits the point evenly
        scores[i] += 0.5;
        scores[j] += 0.5;
        continue;
      }
      const confidence = rankingConfidence(comparison.confidence);
      const winnerIdx = comparison.winner === 'A' ? i : j;
      const loserIdx = comparison.winner === 'A' ? j : i;
      comparisons.push({
        image1: i,
        image2: j,
        winner: winnerIdx,
        confidence
      });
      // Update scores based on wins
      scores[winnerIdx] += confidence;
      scores[loserIdx] += 1 - confidence;
      wins[winnerIdx] += 1;
    }
  }
  if (usableComparisons === 0) {
    // Nothing was actually judged; do not present an all-zero ranking as a result
    return {
      enabled: false,
      rankings: [],
      error: 'No pairwise comparisons could be completed',
      comparisons: 0,
      metadata: {
        totalScreenshots: total,
        totalComparisons: 0
      }
    };
  }
  // Rank by scores (stable sort: equal scores keep input order)
  const rankings = scores
    .map((score, idx) => ({
      index: idx,
      path: imagePaths[idx],
      score,
      wins: wins[idx]
    }))
    .sort((a, b) => b.score - a.score)
    .map((entry, position) => ({
      ...entry,
      rank: position + 1
    }));
  return {
    enabled: true,
    rankings,
    comparisons: comparisons.length,
    metadata: {
      totalScreenshots: total,
      totalComparisons: comparisons.length
    }
  };
}

/** Coerce a comparison confidence to a number in [0, 1]; unusable values become 0.5. */
function rankingConfidence(value) {
  const num = typeof value === 'number' ? value : Number.parseFloat(value);
  if (!Number.isFinite(num)) {
    return 0.5;
  }
  return Math.min(1, Math.max(0, num));
}

package/src/persona/index.mjs ADDED Viewed

@@ -0,0 +1,42 @@
+/**
+ * Persona Sub-Module
+ *
+ * Persona-based experience testing and evaluation.
+ *
+ * Import from '@arclabs561/ai-visual-test/persona'
+ */
+// Core persona experience
+export {
+  experiencePageAsPersona,
+  experiencePageWithPersonas
+} from '../persona-experience.mjs';
+// Enhanced persona
+export {
+  createEnhancedPersona,
+  experiencePageWithEnhancedPersona,
+  calculatePersonaConsistency,
+  calculatePersonaDiversity
+} from '../persona-enhanced.mjs';
+// Experience tracing
+export {
+  ExperienceTrace,
+  ExperienceTracerManager,
+  getTracerManager
+} from '../experience-tracer.mjs';
+// Experience propagation
+export {
+  ExperiencePropagationTracker,
+  getPropagationTracker,
+  trackPropagation
+} from '../experience-propagation.mjs';
+// Explanation manager
+export {
+  ExplanationManager,
+  getExplanationManager
+} from '../explanation-manager.mjs';

package/src/persona-enhanced.mjs ADDED Viewed

@@ -0,0 +1,200 @@
+/**
+ * Enhanced Persona Structure
+ *
+ * Adds rich context to personas based on research findings:
+ * - Workflows, frustrations, usage patterns
+ * - Temporal evolution tracking
+ * - Consistency metrics
+ *
+ * Research:
+ * - "Can LLM be a Personalized Judge?" - Persona-based LLM judging with uncertainty estimation
+ * - "The Prompt Makes the Person(a)" - Systematic evaluation of sociodemographic persona prompting
+ * - "Persona-judge: Personalized Alignment of Large Language Models" - Personalized alignment
+ * - "PERSONA: Evaluating Pluralistic Alignment in LLMs" - Pluralistic alignment with personas
+ *
+ * Note: Research shows direct persona-based judging has low reliability, but uncertainty
+ * estimation improves performance to >80% agreement on high-certainty samples. LLMs struggle
+ * to authentically simulate marginalized groups. Multi-agent debate can amplify bias.
+ */
+import { experiencePageAsPersona } from './persona-experience.mjs';
+/**
+ * Enhanced persona structure with rich context
+ *
+ * @typedef {Object} EnhancedPersona
+ * @property {string} name - Persona name
+ * @property {string} device - Device type
+ * @property {string[]} goals - Primary goals
+ * @property {string[]} concerns - Primary concerns
+ * @property {Object} workflows - Documented workflows and use cases
+ * @property {string[]} frustrations - Specific frustrations
+ * @property {Object} usagePatterns - Historical usage patterns
+ * @property {Object} temporalEvolution - Temporal usage evolution
+ */
/**
 * Build an enhanced persona by layering rich context onto a base persona.
 *
 * The base persona's own properties are copied first; the four context fields
 * are then set from `context`, each falling back to an empty default when the
 * supplied value is missing or falsy. The base persona is not modified.
 *
 * @param {Object} basePersona - Base persona (name, device, goals, concerns)
 * @param {{
 *   workflows?: Object;
 *   frustrations?: string[];
 *   usagePatterns?: Object;
 *   temporalEvolution?: Object;
 * }} [context={}] - Rich context
 * @returns {EnhancedPersona} Enhanced persona
 */
export function createEnhancedPersona(basePersona, context = {}) {
  const { workflows, frustrations, usagePatterns, temporalEvolution } = context;

  const enhanced = { ...basePersona };

  enhanced.workflows = workflows
    ? workflows
    : { primary: [], secondary: [], edgeCases: [] };

  enhanced.frustrations = frustrations ? frustrations : [];

  enhanced.usagePatterns = usagePatterns
    ? usagePatterns
    : { frequency: 'unknown', duration: 'unknown', peakTimes: [] };

  enhanced.temporalEvolution = temporalEvolution
    ? temporalEvolution
    : { firstUse: null, lastUse: null, usageTrend: 'stable' };

  return enhanced;
}
/**
 * Calculate consistency metrics for persona observations.
 *
 * Each observation is reduced to a set of lowercase, whitespace-separated
 * words longer than three characters. Consistency is the Jaccard similarity
 * (|intersection| / |union|) of those sets:
 * - `promptToLine`: mean similarity of the first observation to each later one
 * - `lineToLine`: mean similarity of each observation to its predecessor
 * - `overall`: 0.4 * promptToLine + 0.6 * lineToLine
 *
 * With fewer than two observations there is nothing to compare, so all three
 * metrics are reported as 1.0.
 *
 * @param {Array<string|{observation?: string}>} observations - Observations
 *   from a persona, as plain strings or objects with an `observation` field.
 *   Null entries and non-array input are treated as empty.
 * @returns {{promptToLine: number, lineToLine: number, overall: number, observationCount: number}}
 *   Consistency metrics
 */
export function calculatePersonaConsistency(observations) {
  const items = Array.isArray(observations) ? observations : [];
  if (items.length < 2) {
    return {
      promptToLine: 1.0,
      lineToLine: 1.0,
      overall: 1.0,
      observationCount: items.length
    };
  }
  const keywordSets = items.map(observationKeywords);
  const pairCount = keywordSets.length - 1;
  const firstKeywords = keywordSets[0];
  let promptToLineTotal = 0;
  let lineToLineTotal = 0;
  for (let i = 1; i < keywordSets.length; i++) {
    // First observation vs. this one
    promptToLineTotal += keywordJaccard(firstKeywords, keywordSets[i]);
    // Previous observation vs. this one
    lineToLineTotal += keywordJaccard(keywordSets[i - 1], keywordSets[i]);
  }
  const promptToLine = promptToLineTotal / pairCount;
  const lineToLine = lineToLineTotal / pairCount;
  // Overall consistency (weighted average)
  const overall = (promptToLine * 0.4 + lineToLine * 0.6);
  return {
    promptToLine,
    lineToLine,
    overall,
    observationCount: items.length
  };
}

/** Lowercased words longer than 3 characters from a string or `{ observation }` object. */
function observationKeywords(observation) {
  const text = typeof observation === 'string'
    ? observation
    : observation?.observation || '';
  // The length filter already excludes short function words ("the", "and", ...)
  const words = String(text)
    .toLowerCase()
    .split(/\s+/)
    .filter((word) => word.length > 3);
  return new Set(words);
}

/** Jaccard similarity of two sets; 0 when both are empty. */
function keywordJaccard(left, right) {
  const unionSize = new Set([...left, ...right]).size;
  if (unionSize === 0) {
    return 0;
  }
  let shared = 0;
  for (const word of left) {
    if (right.has(word)) {
      shared += 1;
    }
  }
  return shared / unionSize;
}
/**
 * Experience a page as an enhanced persona and attach consistency metrics.
 *
 * Runs the base persona experience, collects the `observation` text of every
 * note (empty string when a note has none), scores those observations with
 * `calculatePersonaConsistency`, and returns the base experience extended
 * with the persona, the metrics and the observation list. A result without a
 * `notes` array is treated as having no notes.
 *
 * @param {any} page - Playwright page object
 * @param {EnhancedPersona} persona - Enhanced persona
 * @param {Object} [options={}] - Experience options, passed through unchanged
 * @returns {Promise<Object>} Experience result with `persona`, `consistency`
 *   and `observations` added
 */
export async function experiencePageWithEnhancedPersona(page, persona, options = {}) {
  // Use base experience function
  const experience = await experiencePageAsPersona(page, persona, options);
  // Extract observations; tolerate a missing notes list or empty note entries
  const notes = Array.isArray(experience?.notes) ? experience.notes : [];
  const observations = notes.map((note) => note?.observation || '');
  // Calculate consistency metrics
  const consistency = calculatePersonaConsistency(observations);
  // Add persona context to experience
  return {
    ...experience,
    persona: {
      ...persona,
      // Listed explicitly so these keys are always present, even if undefined
      workflows: persona.workflows,
      frustrations: persona.frustrations,
      usagePatterns: persona.usagePatterns
    },
    consistency,
    observations
  };
}
/**
 * Compare persona observations for diversity.
 *
 * Pools keywords (lowercase, whitespace-separated words longer than three
 * characters) from every persona's observations and reports how many of them
 * are distinct: `diversityRatio = uniqueKeywords / totalKeywords`.
 *
 * Observations are read from `experience.observations`, falling back to the
 * `observation` field of each entry in `experience.notes`. Individual
 * observations may be strings or objects with an `observation` field.
 * With fewer than two experiences all metrics are 0.
 *
 * @param {Array<Object>} personaExperiences - Array of persona experience results
 * @returns {{diversityRatio: number, uniqueKeywords: number, totalKeywords: number, personaCount: number}}
 *   Diversity metrics
 */
export function calculatePersonaDiversity(personaExperiences) {
  const experiences = Array.isArray(personaExperiences) ? personaExperiences : [];
  if (experiences.length < 2) {
    return {
      diversityRatio: 0,
      uniqueKeywords: 0,
      totalKeywords: 0,
      personaCount: experiences.length
    };
  }
  // Extract all keywords from all personas
  const allKeywords = experiences.flatMap((experience) => {
    const source = experience?.observations || experience?.notes;
    const observations = Array.isArray(source) ? source : [];
    return observations.flatMap((entry) => {
      // Accept plain strings as well as note-like objects
      const text = typeof entry === 'string' ? entry : entry?.observation || '';
      // The length filter already excludes short function words ("the", "and", ...)
      return String(text)
        .toLowerCase()
        .split(/\s+/)
        .filter((word) => word.length > 3);
    });
  });
  const uniqueKeywords = new Set(allKeywords);
  const diversityRatio = uniqueKeywords.size / Math.max(1, allKeywords.length);
  return {
    diversityRatio,
    uniqueKeywords: uniqueKeywords.size,
    totalKeywords: allKeywords.length,
    personaCount: experiences.length
  };
}