npm - @arclabs561/ai-visual-test - Versions diffs - 0.5.1 - Mend

@arclabs561/ai-visual-test 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (93) hide show

package/.secretsignore.example +20 -0
package/CHANGELOG.md +360 -0
package/CONTRIBUTING.md +63 -0
package/DEPLOYMENT.md +80 -0
package/LICENSE +22 -0
package/README.md +142 -0
package/SECURITY.md +108 -0
package/api/health.js +34 -0
package/api/validate.js +252 -0
package/index.d.ts +1221 -0
package/package.json +112 -0
package/public/index.html +149 -0
package/src/batch-optimizer.mjs +451 -0
package/src/bias-detector.mjs +370 -0
package/src/bias-mitigation.mjs +233 -0
package/src/cache.mjs +433 -0
package/src/config.mjs +268 -0
package/src/constants.mjs +80 -0
package/src/context-compressor.mjs +350 -0
package/src/convenience.mjs +617 -0
package/src/cost-tracker.mjs +257 -0
package/src/cross-modal-consistency.mjs +170 -0
package/src/data-extractor.mjs +232 -0
package/src/dynamic-few-shot.mjs +140 -0
package/src/dynamic-prompts.mjs +361 -0
package/src/ensemble/index.mjs +53 -0
package/src/ensemble-judge.mjs +366 -0
package/src/error-handler.mjs +67 -0
package/src/errors.mjs +167 -0
package/src/experience-propagation.mjs +128 -0
package/src/experience-tracer.mjs +487 -0
package/src/explanation-manager.mjs +299 -0
package/src/feedback-aggregator.mjs +248 -0
package/src/game-goal-prompts.mjs +478 -0
package/src/game-player.mjs +548 -0
package/src/hallucination-detector.mjs +155 -0
package/src/helpers/playwright.mjs +80 -0
package/src/human-validation-manager.mjs +516 -0
package/src/index.mjs +364 -0
package/src/judge.mjs +929 -0
package/src/latency-aware-batch-optimizer.mjs +192 -0
package/src/load-env.mjs +159 -0
package/src/logger.mjs +55 -0
package/src/metrics.mjs +187 -0
package/src/model-tier-selector.mjs +221 -0
package/src/multi-modal/index.mjs +36 -0
package/src/multi-modal-fusion.mjs +190 -0
package/src/multi-modal.mjs +524 -0
package/src/natural-language-specs.mjs +1071 -0
package/src/pair-comparison.mjs +277 -0
package/src/persona/index.mjs +42 -0
package/src/persona-enhanced.mjs +200 -0
package/src/persona-experience.mjs +572 -0
package/src/position-counterbalance.mjs +140 -0
package/src/prompt-composer.mjs +375 -0
package/src/render-change-detector.mjs +583 -0
package/src/research-enhanced-validation.mjs +436 -0
package/src/retry.mjs +152 -0
package/src/rubrics.mjs +231 -0
package/src/score-tracker.mjs +277 -0
package/src/smart-validator.mjs +447 -0
package/src/spec-config.mjs +106 -0
package/src/spec-templates.mjs +347 -0
package/src/specs/index.mjs +38 -0
package/src/temporal/index.mjs +102 -0
package/src/temporal-adaptive.mjs +163 -0
package/src/temporal-batch-optimizer.mjs +222 -0
package/src/temporal-constants.mjs +69 -0
package/src/temporal-context.mjs +49 -0
package/src/temporal-decision-manager.mjs +271 -0
package/src/temporal-decision.mjs +669 -0
package/src/temporal-errors.mjs +58 -0
package/src/temporal-note-pruner.mjs +173 -0
package/src/temporal-preprocessor.mjs +543 -0
package/src/temporal-prompt-formatter.mjs +219 -0
package/src/temporal-validation.mjs +159 -0
package/src/temporal.mjs +415 -0
package/src/type-guards.mjs +311 -0
package/src/uncertainty-reducer.mjs +470 -0
package/src/utils/index.mjs +175 -0
package/src/validation-framework.mjs +321 -0
package/src/validation-result-normalizer.mjs +64 -0
package/src/validation.mjs +243 -0
package/src/validators/accessibility-programmatic.mjs +345 -0
package/src/validators/accessibility-validator.mjs +223 -0
package/src/validators/batch-validator.mjs +143 -0
package/src/validators/hybrid-validator.mjs +268 -0
package/src/validators/index.mjs +34 -0
package/src/validators/prompt-builder.mjs +218 -0
package/src/validators/rubric.mjs +85 -0
package/src/validators/state-programmatic.mjs +260 -0
package/src/validators/state-validator.mjs +291 -0
package/vercel.json +27 -0

package/src/explanation-manager.mjs ADDED Viewed

@@ -0,0 +1,299 @@
+/**
+ * Explanation Manager
+ *
+ * Provides late interaction capabilities for explaining VLLM judgments.
+ * Allows humans to ask questions about judgments after they've been made.
+ */
+import { VLLMJudge } from './judge.mjs';
+import { getCached, setCached } from './cache.mjs';
+import { log, warn } from './logger.mjs';
+import { formatNotesForPrompt } from './temporal.mjs';
+/**
+ * Explanation Manager
+ *
+ * Manages interactive explanations of VLLM judgments: builds a follow-up
+ * prompt from a previous judgment, sends it back through the judge together
+ * with the screenshot, and remembers the answers in an in-memory map.
+ */
+export class ExplanationManager {
+  /**
+   * @param {Object} [options] - Manager options; also passed to `new VLLMJudge(options)` when no judge is supplied
+   * @param {Object} [options.judge] - Judge exposing `judgeScreenshot(path, prompt, context)`
+   * @param {boolean} [options.cacheEnabled=true] - Pass `false` to skip getCached/setCached entirely
+   */
+  constructor(options = {}) {
+    this.judge = options.judge || new VLLMJudge(options);
+    this.cacheEnabled = options.cacheEnabled !== false;
+    this.explanations = new Map(); // In-memory cache of explanations
+  }
+  /**
+   * Get explanation for a judgment
+   *
+   * Never rejects because of a judge failure: errors from the judge call are
+   * logged with `warn` and reported through the `error` field of the result.
+   *
+   * @param {Object} vllmJudgment - VLLM judgment to explain (reads id, screenshot, context, vllmScore, vllmIssues, vllmReasoning)
+   * @param {string} question - Question about the judgment (optional)
+   * @param {Object} options - Explanation options (screenshotPath, context, useCache, temporalNotes, aggregatedNotes, experienceTrace)
+   * @returns {Promise<Object>} Explanation response
+   */
+  async explainJudgment(vllmJudgment, question = null, options = {}) {
+    const {
+      screenshotPath = vllmJudgment.screenshot,
+      context = vllmJudgment.context || {},
+      useCache = true,
+      // Temporal and experience context: explicit option, then judgment, then context
+      temporalNotes = vllmJudgment.temporalNotes || context.temporalNotes || null,
+      aggregatedNotes = vllmJudgment.aggregatedNotes || context.aggregatedNotes || null,
+      experienceTrace = vllmJudgment.experienceTrace || context.experienceTrace || null
+    } = options;
+    const askedQuestion = question || 'Why did you score this the way you did?';
+    // Build explanation prompt with temporal context
+    const explanationPrompt = this._buildExplanationPrompt(
+      vllmJudgment,
+      question,
+      { temporalNotes, aggregatedNotes, experienceTrace }
+    );
+    const cachingActive = useCache && this.cacheEnabled;
+    const cacheKey = `explain-${vllmJudgment.id}-${question || 'default'}`;
+    // Check cache
+    if (cachingActive) {
+      const cached = getCached(cacheKey, explanationPrompt, context);
+      if (cached) {
+        return cached;
+      }
+    }
+    // Get explanation from VLLM
+    try {
+      const result = await this.judge.judgeScreenshot(
+        screenshotPath,
+        explanationPrompt,
+        {
+          ...context,
+          useCache: false, // Don't cache explanation requests
+          enableHumanValidation: false // Don't collect explanations for validation
+        }
+      );
+      const explanation = {
+        question: askedQuestion,
+        answer: result.reasoning || result.assessment || 'No explanation available',
+        confidence: this._extractConfidence(result),
+        timestamp: new Date().toISOString(),
+        judgmentId: vllmJudgment.id
+      };
+      // Cache explanation
+      if (cachingActive) {
+        setCached(cacheKey, explanationPrompt, context, explanation);
+      }
+      // The bare id always holds the most recent explanation for the judgment;
+      // the composite key is the one getCachedExplanation(id, question) reads.
+      this.explanations.set(vllmJudgment.id, explanation);
+      if (question) {
+        this.explanations.set(`${vllmJudgment.id}-${question}`, explanation);
+      }
+      return explanation;
+    } catch (error) {
+      warn('Failed to get explanation:', error.message);
+      return {
+        question: askedQuestion,
+        answer: 'Unable to generate explanation at this time.',
+        error: error.message,
+        timestamp: new Date().toISOString()
+      };
+    }
+  }
+  /**
+   * Format an elapsed time in milliseconds as seconds with one decimal.
+   *
+   * @param {*} elapsed - Elapsed milliseconds; 0 is a valid value
+   * @param {string} fallback - Label used when elapsed is absent or not numeric
+   * @returns {string} e.g. "1.5s", or the fallback
+   */
+  _formatElapsed(elapsed, fallback) {
+    if (elapsed === null || elapsed === undefined || elapsed === '') {
+      return fallback;
+    }
+    const ms = Number(elapsed);
+    return Number.isFinite(ms) ? `${(ms / 1000).toFixed(1)}s` : fallback;
+  }
+  /**
+   * Build explanation prompt with temporal and experience context
+   *
+   * Missing or malformed optional fields (issues, event data, validation
+   * payloads, persona goals) are rendered as empty rather than throwing.
+   *
+   * @param {Object} vllmJudgment - VLLM judgment
+   * @param {string|null} question - Optional question
+   * @param {Object} temporalContext - Temporal and experience context
+   * @param {Object|null} temporalContext.temporalNotes - Raw temporal notes
+   * @param {Object|null} temporalContext.aggregatedNotes - Aggregated temporal notes
+   * @param {Object|null} temporalContext.experienceTrace - Experience trace data
+   * @returns {string} Prompt text
+   */
+  _buildExplanationPrompt(vllmJudgment, question, temporalContext = {}) {
+    const { temporalNotes, aggregatedNotes, experienceTrace } = temporalContext || {};
+    const score = vllmJudgment.vllmScore;
+    const issues = Array.isArray(vllmJudgment.vllmIssues) ? vllmJudgment.vllmIssues.join(', ') : '';
+    // Base judgment context shared by both prompt variants
+    const lines = [
+      `You previously evaluated this screenshot and gave it a score of ${score}/10.`,
+      'Your previous judgment:',
+      `- Score: ${score}/10`,
+      `- Issues: ${issues || 'None'}`,
+      `- Reasoning: ${vllmJudgment.vllmReasoning}`
+    ];
+    if (question) {
+      lines.push(
+        `Question: ${question}`,
+        'Please provide a clear, detailed explanation addressing this question.'
+      );
+    } else {
+      lines.push(
+        'Please provide a detailed explanation of:',
+        `1. Why you scored it ${score}/10`,
+        '2. What specific evidence in the screenshot led to this score',
+        '3. What the main issues are and why they matter',
+        '4. What would need to change to improve the score',
+        'Be specific and reference visual elements in the screenshot.'
+      );
+    }
+    let prompt = lines.join('\n');
+    // Set once a context section is actually written, so the closing guidance
+    // is not appended for e.g. an empty temporalNotes array.
+    let hasContext = false;
+    // Add temporal context if available
+    if (aggregatedNotes) {
+      prompt += `\n\nTEMPORAL CONTEXT:\n`;
+      prompt += formatNotesForPrompt(aggregatedNotes);
+      prompt += `\n\nWhen explaining, reference specific time points, trends, and temporal relationships (before/after/during).`;
+      prompt += ` Explain how the current judgment relates to previous observations and temporal patterns.`;
+      hasContext = true;
+    } else if (Array.isArray(temporalNotes) && temporalNotes.length > 0) {
+      // Format raw temporal notes if aggregated notes not available
+      prompt += `\n\nTEMPORAL CONTEXT:\n`;
+      temporalNotes.slice(-5).forEach((note, i) => {
+        if (!note) {
+          return;
+        }
+        const time = this._formatElapsed(note.elapsed, `step ${i + 1}`);
+        prompt += `  ${time}: ${note.observation || note.step || 'step'}\n`;
+        if (note.score !== null && note.score !== undefined) {
+          prompt += `    Score: ${note.score}/10\n`;
+        }
+      });
+      prompt += `\n\nWhen explaining, reference these temporal observations and explain how they relate to the current judgment.`;
+      hasContext = true;
+    }
+    // Add experience trace context if available
+    if (experienceTrace) {
+      const { persona, events, validations } = experienceTrace;
+      prompt += `\n\nEXPERIENCE TRACE CONTEXT:\n`;
+      prompt += `Session ID: ${experienceTrace.sessionId || 'unknown'}\n`;
+      if (persona) {
+        prompt += `Persona: ${persona.name || 'unknown'}\n`;
+        if (persona.goals) {
+          const goals = Array.isArray(persona.goals) ? persona.goals.join(', ') : String(persona.goals);
+          prompt += `Persona Goals: ${goals}\n`;
+        }
+      }
+      if (Array.isArray(events) && events.length > 0) {
+        prompt += `\nRecent Events (last 5):\n`;
+        events.slice(-5).forEach(event => {
+          if (!event) {
+            return;
+          }
+          const time = this._formatElapsed(event.elapsed, 'unknown');
+          const detail = event.data?.observation || event.data?.action || '';
+          prompt += `  ${time}: ${event.type} - ${detail}\n`;
+        });
+      }
+      if (Array.isArray(validations) && validations.length > 0) {
+        prompt += `\nPrevious Validations (last 3):\n`;
+        validations.slice(-3).forEach(entry => {
+          if (!entry) {
+            return;
+          }
+          const time = this._formatElapsed(entry.elapsed, 'unknown');
+          const inner = entry.validation || {};
+          const reasoning = typeof inner.reasoning === 'string' ? inner.reasoning.substring(0, 100) : '';
+          prompt += `  ${time}: Score ${inner.score}/10 - ${reasoning}\n`;
+        });
+      }
+      prompt += `\n\nWhen explaining, consider the user's journey, previous states, and how the current judgment fits into the overall experience.`;
+      hasContext = true;
+    }
+    // Add VLLM-specific guidance for temporal explanations
+    if (hasContext) {
+      prompt += [
+        '\n\nIMPORTANT: As a Vision-Language Model, when explaining temporal aspects:',
+        '1. Visual citations: Reference specific image regions (coordinates, descriptions) when mentioning visual evidence',
+        '2. Temporal citations: Reference specific time points (e.g., "at t=5s", "after 12 seconds")',
+        '3. Temporal relationships: Explain before/after/during relationships and transitions',
+        '4. Experience context: Reference the user\'s journey and how previous states influenced the judgment',
+        '5. Trends: Explain improvement/decline trends and temporal coherence'
+      ].join('\n');
+    }
+    return prompt;
+  }
+  /**
+   * Extract confidence from result
+   *
+   * Checks `semantic.confidence`, then `raw.confidence`, then derives
+   * `1 - uncertainty`. Null values are treated the same as missing ones.
+   *
+   * @param {Object} result - Judge result
+   * @returns {*} Confidence value, or null when none can be determined
+   */
+  _extractConfidence(result) {
+    const present = value => value !== null && value !== undefined;
+    if (present(result?.semantic?.confidence)) {
+      return result.semantic.confidence;
+    }
+    if (present(result?.raw?.confidence)) {
+      return result.raw.confidence;
+    }
+    // Estimate from uncertainty if available
+    if (present(result?.uncertainty)) {
+      const uncertainty = Number(result.uncertainty);
+      if (Number.isFinite(uncertainty)) {
+        return 1.0 - uncertainty;
+      }
+    }
+    return null;
+  }
+  /**
+   * Get confidence breakdown for different aspects
+   *
+   * Asks three fixed follow-up questions one after another and records each
+   * answer in `aspects`. `overall`, `issues` and `uncertainty` are returned as
+   * null: the free-text answers are not parsed into numbers here.
+   *
+   * @param {Object} vllmJudgment - VLLM judgment to probe
+   * @returns {Promise<Object>} `{ overall, issues, uncertainty, aspects }`
+   */
+  async getConfidenceBreakdown(vllmJudgment) {
+    const questions = [
+      'How confident are you in the overall score?',
+      'How confident are you in the issues you identified?',
+      'Are there any aspects you are uncertain about?'
+    ];
+    const breakdown = {
+      overall: null,
+      issues: null,
+      uncertainty: null,
+      aspects: []
+    };
+    // Get explanations for each question (sequentially, one judge call at a time)
+    for (const question of questions) {
+      const explanation = await this.explainJudgment(vllmJudgment, question, { useCache: true });
+      // This is a simplified version - could be enhanced with structured output
+      breakdown.aspects.push({
+        question,
+        explanation: explanation.answer
+      });
+    }
+    return breakdown;
+  }
+  /**
+   * Explain disagreement between human and VLLM
+   *
+   * @param {Object} vllmJudgment - VLLM judgment (reads vllmScore, vllmIssues)
+   * @param {Object} humanJudgment - Human judgment (reads humanScore, humanIssues)
+   * @returns {Promise<Object>} Explanation response from explainJudgment
+   */
+  async explainDisagreement(vllmJudgment, humanJudgment) {
+    const listIssues = issues => (Array.isArray(issues) ? issues.join(', ') : '') || 'None';
+    const question = [
+      `A human reviewer scored this ${humanJudgment.humanScore}/10, but you scored it ${vllmJudgment.vllmScore}/10.`,
+      `The human identified these issues: ${listIssues(humanJudgment.humanIssues)}.`,
+      `You identified these issues: ${listIssues(vllmJudgment.vllmIssues)}.`,
+      'Please explain:',
+      '1. Why there might be a difference in scores',
+      '2. Whether you think the human\'s assessment is valid',
+      '3. What you might have missed or over-emphasized',
+      '4. How this could help improve future evaluations'
+    ].join('\n');
+    return this.explainJudgment(vllmJudgment, question);
+  }
+  /**
+   * Get cached explanation if available
+   *
+   * @param {string} judgmentId - Judgment id
+   * @param {string|null} question - Question the explanation answered; omit for the most recent explanation of the judgment
+   * @returns {Object|null} Stored explanation, or null
+   */
+  getCachedExplanation(judgmentId, question = null) {
+    const key = question ? `${judgmentId}-${question}` : judgmentId;
+    return this.explanations.get(key) || null;
+  }
+}
+/**
+ * Module-wide explanation manager; stays null until the first
+ * getExplanationManager() call creates it.
+ */
+let globalExplanationManager = null;
+/**
+ * Return the shared explanation manager, constructing it on first use.
+ *
+ * `options` are only consulted when no shared instance exists yet; later
+ * calls return the existing instance and ignore their argument.
+ *
+ * @param {Object} [options] - Passed to the ExplanationManager constructor on the first call
+ * @returns {ExplanationManager} The shared instance
+ */
+export function getExplanationManager(options = {}) {
+  globalExplanationManager = globalExplanationManager || new ExplanationManager(options);
+  return globalExplanationManager;
+}

package/src/feedback-aggregator.mjs ADDED Viewed

@@ -0,0 +1,248 @@
+/**
+ * Feedback Aggregator
+ *
+ * Aggregates judge feedback across multiple tests for iterative improvement.
+ *
+ * General-purpose utility - no domain-specific logic.
+ */
+/**
+ * Aggregate judge feedback from multiple test runs
+ *
+ * Collects scores, tallies how often each issue / recommendation / strength /
+ * weakness / actionable item is mentioned, merges category and priority
+ * buckets, and derives summary statistics. Malformed input is skipped rather
+ * than thrown on: a non-array argument counts as no results, null entries and
+ * non-array fields are ignored, and only finite numeric scores are counted.
+ *
+ * `aggregated.trends` is returned with empty arrays; nothing here fills it.
+ *
+ * @param {import('./index.mjs').ValidationResult[]} judgeResults - Array of validation results
+ * @returns {import('./index.mjs').AggregatedFeedback} `{ aggregated, stats, summary }`
+ */
+export function aggregateFeedback(judgeResults) {
+  const results = Array.isArray(judgeResults) ? judgeResults : [];
+  // Keys below come from free-form judge text, so every dictionary access goes
+  // through own-property checks: a plain `obj[key] || 0` would pick up inherited
+  // members such as `constructor` or `toString`.
+  const hasOwn = (obj, key) => Object.prototype.hasOwnProperty.call(obj, key);
+  // defineProperty creates a real own property even for the key `__proto__`.
+  const setOwn = (obj, key, value) => {
+    Object.defineProperty(obj, key, { value, writable: true, enumerable: true, configurable: true });
+  };
+  // Increment the mention count of every entry in `keys`.
+  const tally = (counts, keys) => {
+    if (!Array.isArray(keys)) {
+      return;
+    }
+    for (const key of keys) {
+      setOwn(counts, key, (hasOwn(counts, key) ? counts[key] : 0) + 1);
+    }
+  };
+  // Append each non-empty array in `source` to the bucket of the same name.
+  const collect = (buckets, source) => {
+    if (!source || typeof source !== 'object') {
+      return;
+    }
+    for (const [name, items] of Object.entries(source)) {
+      if (!Array.isArray(items) || items.length === 0) {
+        continue;
+      }
+      if (!hasOwn(buckets, name)) {
+        setOwn(buckets, name, []);
+      }
+      buckets[name].push(...items);
+    }
+  };
+  // Ten most frequent entries of a count dictionary as [{ [label]: key, count }].
+  const topCounts = (counts, label) => Object.entries(counts)
+    .sort((a, b) => b[1] - a[1])
+    .slice(0, 10)
+    .map(([key, count]) => ({ [label]: key, count }));
+  const aggregated = {
+    scores: [],
+    issues: {},
+    recommendations: {},
+    strengths: {},
+    weaknesses: {},
+    actionableItems: {},
+    categories: {
+      visual: [],
+      functional: [],
+      performance: [],
+      accessibility: [],
+      gameplay: [],
+      ux: [],
+      other: []
+    },
+    priority: {
+      critical: [],
+      high: [],
+      medium: [],
+      low: []
+    },
+    trends: {
+      score: [],
+      issues: [],
+      recommendations: []
+    }
+  };
+  for (const result of results) {
+    if (!result) {
+      continue;
+    }
+    // Aggregate scores (finite numbers only, so the average stays numeric)
+    if (typeof result.score === 'number' && Number.isFinite(result.score)) {
+      aggregated.scores.push(result.score);
+    }
+    // Aggregate semantic information
+    const sem = result.semantic;
+    if (!sem) {
+      continue;
+    }
+    tally(aggregated.issues, sem.issues);
+    tally(aggregated.recommendations, sem.recommendations);
+    tally(aggregated.strengths, sem.strengths);
+    tally(aggregated.weaknesses, sem.weaknesses);
+    tally(aggregated.actionableItems, sem.actionableItems);
+    collect(aggregated.categories, sem.semanticCategories);
+    collect(aggregated.priority, sem.priority);
+  }
+  // Calculate statistics
+  const { scores } = aggregated;
+  const hasScores = scores.length > 0;
+  // Looked up via hasOwn: `order[level] || 99` would send `critical` (rank 0) to the end.
+  const priorityOrder = { critical: 0, high: 1, medium: 2, low: 3 };
+  const rankOf = level => (hasOwn(priorityOrder, level) ? priorityOrder[level] : 99);
+  const stats = {
+    totalJudgments: results.length,
+    averageScore: hasScores ? scores.reduce((a, b) => a + b, 0) / scores.length : null,
+    minScore: hasScores ? Math.min(...scores) : null,
+    maxScore: hasScores ? Math.max(...scores) : null,
+    mostCommonIssues: topCounts(aggregated.issues, 'issue'),
+    mostCommonRecommendations: topCounts(aggregated.recommendations, 'rec'),
+    mostCommonStrengths: topCounts(aggregated.strengths, 'strength'),
+    mostCommonWeaknesses: topCounts(aggregated.weaknesses, 'weakness'),
+    mostCommonActionableItems: topCounts(aggregated.actionableItems, 'item'),
+    categoryCounts: Object.entries(aggregated.categories)
+      .map(([category, items]) => ({ category, count: items.length }))
+      .sort((a, b) => b.count - a.count),
+    priorityCounts: Object.entries(aggregated.priority)
+      .map(([level, items]) => ({ level, count: items.length }))
+      .sort((a, b) => rankOf(a.level) - rankOf(b.level))
+  };
+  return {
+    aggregated,
+    stats,
+    summary: generateSummary(aggregated, stats)
+  };
+}
+/**
+ * Generate human-readable summary
+ *
+ * Produces one space-separated line: the judgment count, then (each only when
+ * data exists) the average score with its range, the top three issues, the
+ * top three recommendations, and the number of critical items.
+ *
+ * @param {Object} aggregated - Aggregated feedback (not read by this function)
+ * @param {Object} stats - Statistics computed by aggregateFeedback
+ * @returns {string} Summary sentence(s)
+ */
+function generateSummary(aggregated, stats) {
+  // First three entries of a ranked list, projected to one field and comma-joined.
+  const topThree = (entries, field) => entries
+    .slice(0, 3)
+    .map(entry => entry[field])
+    .join(', ');
+  const sentences = [`Aggregated ${stats.totalJudgments} judge results.`];
+  if (stats.averageScore !== null) {
+    const average = stats.averageScore.toFixed(1);
+    sentences.push(`Average score: ${average}/10 (range: ${stats.minScore}-${stats.maxScore}).`);
+  }
+  if (stats.mostCommonIssues.length > 0) {
+    sentences.push(`Most common issues: ${topThree(stats.mostCommonIssues, 'issue')}.`);
+  }
+  if (stats.mostCommonRecommendations.length > 0) {
+    sentences.push(`Most common recommendations: ${topThree(stats.mostCommonRecommendations, 'rec')}.`);
+  }
+  // find() on an empty list yields undefined, so no separate length check is needed.
+  const criticalEntry = stats.priorityCounts.find(({ level }) => level === 'critical');
+  if (criticalEntry && criticalEntry.count > 0) {
+    sentences.push(`Critical issues: ${criticalEntry.count}.`);
+  }
+  return sentences.join(' ');
+}
+/**
+ * Generate recommendations from aggregated feedback
+ *
+ * Accepts either the full `aggregateFeedback()` result (`{ aggregated, stats,
+ * summary }`) or just its inner `aggregated` object. Missing sections are
+ * treated as empty. Items are de-duplicated before the first five are taken,
+ * so repeated mentions do not crowd out distinct items.
+ *
+ * Output order: critical items, high items, one entry per non-empty category,
+ * then the ten most frequently mentioned actionable items.
+ *
+ * @param {import('./index.mjs').AggregatedFeedback} aggregated - Aggregated feedback
+ * @returns {Array<{priority: string, category: string, items: Array, description: string}>} Recommendation groups
+ */
+export function generateRecommendations(aggregated) {
+  const isObject = value => value !== null && typeof value === 'object';
+  // Unwrap the aggregateFeedback() result shape when that is what was passed.
+  const feedback = isObject(aggregated) && isObject(aggregated.aggregated)
+    ? aggregated.aggregated
+    : (isObject(aggregated) ? aggregated : {});
+  const priority = isObject(feedback.priority) ? feedback.priority : {};
+  const categories = isObject(feedback.categories) ? feedback.categories : {};
+  // First five distinct entries of a list; anything that is not an array counts as empty.
+  const firstFiveUnique = items => (Array.isArray(items) ? [...new Set(items)].slice(0, 5) : []);
+  const recommendations = [];
+  // Critical priority items
+  const criticalItems = firstFiveUnique(priority.critical);
+  if (criticalItems.length > 0) {
+    recommendations.push({
+      priority: 'critical',
+      category: 'all',
+      items: criticalItems,
+      description: 'Critical issues that must be addressed immediately'
+    });
+  }
+  // High priority items
+  const highItems = firstFiveUnique(priority.high);
+  if (highItems.length > 0) {
+    recommendations.push({
+      priority: 'high',
+      category: 'all',
+      items: highItems,
+      description: 'High priority issues that should be addressed soon'
+    });
+  }
+  // Category-specific recommendations
+  for (const [category, items] of Object.entries(categories)) {
+    const uniqueItems = firstFiveUnique(items);
+    if (uniqueItems.length > 0) {
+      recommendations.push({
+        priority: 'medium',
+        category,
+        items: uniqueItems,
+        description: `${category} improvements`
+      });
+    }
+  }
+  // Most common actionable items
+  const actionableEntries = Object.entries(feedback.actionableItems || {})
+    .sort((a, b) => b[1] - a[1])
+    .slice(0, 10);
+  if (actionableEntries.length > 0) {
+    recommendations.push({
+      priority: 'medium',
+      category: 'actionable',
+      items: actionableEntries.map(([item, count]) => `${item} (mentioned ${count} times)`),
+      description: 'Most frequently mentioned actionable improvements'
+    });
+  }
+  return recommendations;
+}