@arclabs561/ai-visual-test 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. package/.secretsignore.example +20 -0
  2. package/CHANGELOG.md +360 -0
  3. package/CONTRIBUTING.md +63 -0
  4. package/DEPLOYMENT.md +80 -0
  5. package/LICENSE +22 -0
  6. package/README.md +142 -0
  7. package/SECURITY.md +108 -0
  8. package/api/health.js +34 -0
  9. package/api/validate.js +252 -0
  10. package/index.d.ts +1221 -0
  11. package/package.json +112 -0
  12. package/public/index.html +149 -0
  13. package/src/batch-optimizer.mjs +451 -0
  14. package/src/bias-detector.mjs +370 -0
  15. package/src/bias-mitigation.mjs +233 -0
  16. package/src/cache.mjs +433 -0
  17. package/src/config.mjs +268 -0
  18. package/src/constants.mjs +80 -0
  19. package/src/context-compressor.mjs +350 -0
  20. package/src/convenience.mjs +617 -0
  21. package/src/cost-tracker.mjs +257 -0
  22. package/src/cross-modal-consistency.mjs +170 -0
  23. package/src/data-extractor.mjs +232 -0
  24. package/src/dynamic-few-shot.mjs +140 -0
  25. package/src/dynamic-prompts.mjs +361 -0
  26. package/src/ensemble/index.mjs +53 -0
  27. package/src/ensemble-judge.mjs +366 -0
  28. package/src/error-handler.mjs +67 -0
  29. package/src/errors.mjs +167 -0
  30. package/src/experience-propagation.mjs +128 -0
  31. package/src/experience-tracer.mjs +487 -0
  32. package/src/explanation-manager.mjs +299 -0
  33. package/src/feedback-aggregator.mjs +248 -0
  34. package/src/game-goal-prompts.mjs +478 -0
  35. package/src/game-player.mjs +548 -0
  36. package/src/hallucination-detector.mjs +155 -0
  37. package/src/helpers/playwright.mjs +80 -0
  38. package/src/human-validation-manager.mjs +516 -0
  39. package/src/index.mjs +364 -0
  40. package/src/judge.mjs +929 -0
  41. package/src/latency-aware-batch-optimizer.mjs +192 -0
  42. package/src/load-env.mjs +159 -0
  43. package/src/logger.mjs +55 -0
  44. package/src/metrics.mjs +187 -0
  45. package/src/model-tier-selector.mjs +221 -0
  46. package/src/multi-modal/index.mjs +36 -0
  47. package/src/multi-modal-fusion.mjs +190 -0
  48. package/src/multi-modal.mjs +524 -0
  49. package/src/natural-language-specs.mjs +1071 -0
  50. package/src/pair-comparison.mjs +277 -0
  51. package/src/persona/index.mjs +42 -0
  52. package/src/persona-enhanced.mjs +200 -0
  53. package/src/persona-experience.mjs +572 -0
  54. package/src/position-counterbalance.mjs +140 -0
  55. package/src/prompt-composer.mjs +375 -0
  56. package/src/render-change-detector.mjs +583 -0
  57. package/src/research-enhanced-validation.mjs +436 -0
  58. package/src/retry.mjs +152 -0
  59. package/src/rubrics.mjs +231 -0
  60. package/src/score-tracker.mjs +277 -0
  61. package/src/smart-validator.mjs +447 -0
  62. package/src/spec-config.mjs +106 -0
  63. package/src/spec-templates.mjs +347 -0
  64. package/src/specs/index.mjs +38 -0
  65. package/src/temporal/index.mjs +102 -0
  66. package/src/temporal-adaptive.mjs +163 -0
  67. package/src/temporal-batch-optimizer.mjs +222 -0
  68. package/src/temporal-constants.mjs +69 -0
  69. package/src/temporal-context.mjs +49 -0
  70. package/src/temporal-decision-manager.mjs +271 -0
  71. package/src/temporal-decision.mjs +669 -0
  72. package/src/temporal-errors.mjs +58 -0
  73. package/src/temporal-note-pruner.mjs +173 -0
  74. package/src/temporal-preprocessor.mjs +543 -0
  75. package/src/temporal-prompt-formatter.mjs +219 -0
  76. package/src/temporal-validation.mjs +159 -0
  77. package/src/temporal.mjs +415 -0
  78. package/src/type-guards.mjs +311 -0
  79. package/src/uncertainty-reducer.mjs +470 -0
  80. package/src/utils/index.mjs +175 -0
  81. package/src/validation-framework.mjs +321 -0
  82. package/src/validation-result-normalizer.mjs +64 -0
  83. package/src/validation.mjs +243 -0
  84. package/src/validators/accessibility-programmatic.mjs +345 -0
  85. package/src/validators/accessibility-validator.mjs +223 -0
  86. package/src/validators/batch-validator.mjs +143 -0
  87. package/src/validators/hybrid-validator.mjs +268 -0
  88. package/src/validators/index.mjs +34 -0
  89. package/src/validators/prompt-builder.mjs +218 -0
  90. package/src/validators/rubric.mjs +85 -0
  91. package/src/validators/state-programmatic.mjs +260 -0
  92. package/src/validators/state-validator.mjs +291 -0
  93. package/vercel.json +27 -0
@@ -0,0 +1,231 @@
1
+ /**
2
+ * Evaluation Rubrics
3
+ *
4
+ * Provides explicit scoring rubrics for LLM-as-a-judge evaluation.
5
+ * Research shows that explicit rubrics improve reliability by 10-20%
6
+ * and reduce bias from superficial features (LLMs-as-Judges Survey, arXiv:2412.05579).
7
+ */
8
+
9
/**
 * Default scoring rubric for screenshot validation.
 *
 * Contains the 0-10 scoring scale plus four evaluation dimensions
 * (visual, functional, usability, accessibility), each with a short
 * description and a list of concrete criteria.
 */
export const DEFAULT_RUBRIC = {
  // Overall 0-10 scale; buildRubricPrompt renders these high-to-low.
  score: {
    description: "Overall quality score from 0-10",
    criteria: {
      10: "Perfect - No issues, excellent UX, all requirements met",
      9: "Excellent - Minor cosmetic issues, excellent UX",
      8: "Very Good - Minor issues that don't affect usability",
      7: "Good - Some issues but generally usable",
      6: "Acceptable - Issues present but functional",
      5: "Needs Improvement - Significant issues affecting usability",
      4: "Poor - Major issues, difficult to use",
      3: "Very Poor - Critical issues, barely functional",
      2: "Bad - Severe issues, mostly broken",
      1: "Very Bad - Almost completely broken",
      0: "Broken - Completely non-functional"
    }
  },
  // Per-dimension checklists rendered when includeDimensions is true.
  dimensions: {
    visual: {
      description: "Visual design and aesthetics",
      criteria: [
        "Layout is clear and organized",
        "Colors are appropriate and accessible",
        "Typography is readable",
        "Spacing is consistent",
        "Visual hierarchy is clear"
      ]
    },
    functional: {
      description: "Functional correctness",
      criteria: [
        "All interactive elements work correctly",
        "Forms submit properly",
        "Links navigate correctly",
        "Buttons trigger expected actions",
        "No broken functionality"
      ]
    },
    usability: {
      description: "Ease of use",
      criteria: [
        "Purpose is clear",
        "Actions are obvious",
        "Feedback is provided",
        "Error messages are helpful",
        "Flow is intuitive"
      ]
    },
    accessibility: {
      description: "Accessibility compliance",
      criteria: [
        "Keyboard navigation works",
        "Screen reader compatible",
        "Color contrast is sufficient",
        "Text is readable",
        "Interactive elements are accessible"
      ]
    }
  }
};
72
+
73
/**
 * Build rubric prompt section.
 *
 * Renders a rubric as prompt text: the 0-10 scoring scale (highest
 * score first), few-shot example evaluations, evaluation instructions,
 * optional per-dimension criteria, issue-severity guidance, and the
 * required JSON output format.
 *
 * @param {import('./index.mjs').Rubric | null} [rubric=null] - Rubric to use, or null for default
 * @param {boolean} [includeDimensions=true] - Whether to include evaluation dimensions
 * @returns {string} Formatted rubric prompt text
 */
export function buildRubricPrompt(rubric = null, includeDimensions = true) {
  const rubricToUse = rubric || DEFAULT_RUBRIC;
  // Scale keys are sorted high-to-low. Number() replaces the original
  // parseInt() call, which omitted the radix argument.
  let prompt = `## EVALUATION RUBRIC

### Scoring Scale (0-10):
${Object.entries(rubricToUse.score.criteria)
  .sort((a, b) => Number(b[0]) - Number(a[0]))
  .map(([score, desc]) => `- ${score}: ${desc}`)
  .join('\n')}

### Example Evaluations (Few-Shot Learning):

**Example 1 - High Quality (Score: 9)**
Screenshot: Clean, accessible homepage with high contrast
Evaluation: "Excellent design with clear navigation, high contrast (21:1), keyboard accessible. Minor: could improve spacing. Score: 9"
JSON: {"score": 9, "assessment": "excellent", "issues": ["minor spacing"], "reasoning": "High quality with minor improvements needed"}

**Example 2 - Medium Quality (Score: 6)**
Screenshot: Functional but cluttered interface
Evaluation: "Functional design but cluttered layout, moderate contrast (4.2:1), some accessibility issues. Score: 6"
JSON: {"score": 6, "assessment": "needs-improvement", "issues": ["cluttered layout", "low contrast", "accessibility issues"], "reasoning": "Functional but needs significant improvements"}

**Example 3 - Low Quality (Score: 3)**
Screenshot: Broken layout with poor accessibility
Evaluation: "Poor design with broken layout, very low contrast (2.1:1), not keyboard accessible, multiple critical issues. Score: 3"
JSON: {"score": 3, "assessment": "fail", "issues": ["broken layout", "critical contrast violation", "no keyboard navigation"], "reasoning": "Multiple critical issues prevent usability"}

### Evaluation Instructions:
1. Evaluate the screenshot against the criteria below
2. Consider both appearance and functional correctness
3. Base your score on substantive content, not superficial features
4. Ignore factors like response length, verbosity, or formatting style
5. Focus on actual quality: correctness, clarity, usability, and accessibility
6. Provide a score from 0-10 based on the rubric above
7. List specific issues found (if any)
8. Provide reasoning for your score`;

  // Per-dimension criteria are optional so callers can emit a shorter
  // prompt; skipped when the rubric defines no dimensions.
  if (includeDimensions && rubricToUse.dimensions) {
    prompt += `\n\n### Evaluation Dimensions:
${Object.entries(rubricToUse.dimensions)
  .map(([key, dim]) => `\n**${key.toUpperCase()}** (${dim.description}):\n${dim.criteria.map(c => `- ${c}`).join('\n')}`)
  .join('\n')}`;
  }

  prompt += `\n\n### Issue Importance and Annoyance:
For each issue you identify, consider:
- **Importance**: How critical is this issue? (critical, high, medium, low)
- **Annoyance**: How annoying/frustrating is this issue to users? (very-high, high, medium, low)
- **Impact**: What is the impact on user experience? (blocks-use, degrades-experience, minor-inconvenience, cosmetic)

### Suggestions and Evidence:
When providing recommendations, include:
- **Specific suggestions**: Concrete, actionable improvements
- **Evidence**: What in the screenshot supports your judgment? (visual elements, layout issues, accessibility violations, etc.)
- **Priority**: Which issues should be fixed first? (based on importance and annoyance)

### Output Format:
Provide your evaluation as JSON:
{
  "score": <0-10 integer>,
  "assessment": "<pass|fail|needs-improvement>",
  "issues": [
    {
      "description": "<issue description>",
      "importance": "<critical|high|medium|low>",
      "annoyance": "<very-high|high|medium|low>",
      "impact": "<blocks-use|degrades-experience|minor-inconvenience|cosmetic>",
      "evidence": "<what in the screenshot supports this issue>",
      "suggestion": "<specific, actionable recommendation>"
    }
  ],
  "reasoning": "<explanation of score>",
  "strengths": ["<strength1>", "<strength2>", ...],
  "recommendations": [
    {
      "priority": "<high|medium|low>",
      "suggestion": "<specific recommendation>",
      "evidence": "<what supports this recommendation>",
      "expectedImpact": "<what improvement this would bring>"
    }
  ],
  "evidence": {
    "visual": "<visual evidence from screenshot>",
    "functional": "<functional evidence>",
    "accessibility": "<accessibility evidence>"
  }
}`;

  return prompt;
}
170
+
171
/**
 * Extra evaluation dimension merged into DEFAULT_RUBRIC per test type.
 * Built once at module load; the original implementation rebuilt all
 * three spread-merged rubric objects on every call.
 */
const TEST_TYPE_DIMENSIONS = {
  'payment-screen': {
    key: 'payment',
    dimension: {
      description: 'Payment functionality',
      criteria: [
        'Payment code is clearly visible',
        'Payment links are obvious',
        'Payment flow is trustworthy',
        'Connection to game access is clear',
        'Payment instructions are clear'
      ]
    }
  },
  'gameplay': {
    key: 'gameplay',
    dimension: {
      description: 'Gameplay experience',
      criteria: [
        'Game is visually engaging',
        'Controls are intuitive',
        'Feedback is clear',
        'Game is balanced',
        'Experience is fun'
      ]
    }
  },
  'form': {
    key: 'form',
    dimension: {
      description: 'Form usability',
      criteria: [
        'Labels are clear',
        'Placeholders are helpful',
        'Validation is clear',
        'Submit button is obvious',
        'Error messages are helpful'
      ]
    }
  }
};

/**
 * Get rubric for specific test type.
 *
 * Returns DEFAULT_RUBRIC augmented with one test-type-specific
 * dimension, or DEFAULT_RUBRIC unchanged for unknown test types.
 *
 * @param {string} testType - Test type identifier (e.g., 'payment-screen', 'gameplay', 'form')
 * @returns {import('./index.mjs').Rubric} Rubric configured for the test type
 */
export function getRubricForTestType(testType) {
  // Object.hasOwn guards against inherited keys: a plain index lookup
  // would return truthy junk for inputs like 'constructor' or 'toString'.
  if (!Object.hasOwn(TEST_TYPE_DIMENSIONS, testType)) {
    return DEFAULT_RUBRIC;
  }
  const { key, dimension } = TEST_TYPE_DIMENSIONS[testType];
  return {
    ...DEFAULT_RUBRIC,
    dimensions: {
      ...DEFAULT_RUBRIC.dimensions,
      [key]: dimension
    }
  };
}
231
+
@@ -0,0 +1,277 @@
1
+ /**
2
+ * Score Tracker
3
+ *
4
+ * Tracks test scores over time for regression detection and improvement tracking.
5
+ * Stores baselines in JSON files for comparison.
6
+ *
7
+ * General-purpose utility - no domain-specific logic.
8
+ */
9
+
10
+ import { readFileSync, writeFileSync, existsSync, mkdirSync } from 'fs';
11
+ import { join, dirname } from 'path';
12
+ import { fileURLToPath } from 'url';
13
+ import { warn } from './logger.mjs';
14
+
15
// Resolve this module's file path and directory (ES modules have no
// built-in __filename/__dirname).
// NOTE(review): neither value appears to be referenced anywhere else in
// this module — confirm they are needed before removing.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
17
+
18
/**
 * Score Tracker Class
 *
 * Tracks test scores over time for regression detection and improvement
 * tracking. All data is persisted to a single JSON file
 * (`<baselineDir>/scores.json`). The first score recorded for a test
 * becomes its baseline until explicitly updated via updateBaseline().
 *
 * @class ScoreTracker
 */
export class ScoreTracker {
  /**
   * @param {{
   *   baselineDir?: string;
   *   autoSave?: boolean;
   * }} [options={}] - Tracker options. `autoSave=false` disables writing
   *   to disk entirely (record/updateBaseline become in-memory no-ops
   *   across instances, since state is reloaded from the file).
   */
  constructor(options = {}) {
    const {
      baselineDir = join(process.cwd(), 'test-results', 'baselines'),
      autoSave = true
    } = options;

    this.baselineDir = baselineDir;
    this.autoSave = autoSave;
    this.baselineFile = join(baselineDir, 'scores.json');

    // Ensure baseline directory exists
    if (!existsSync(baselineDir)) {
      mkdirSync(baselineDir, { recursive: true });
    }
  }

  /**
   * Load all baseline data from disk.
   *
   * @returns {Record<string, object>} Parsed baselines, or {} when the
   *   file is missing, empty, or unreadable.
   */
  _loadBaselines() {
    if (!existsSync(this.baselineFile)) {
      return {};
    }

    try {
      const content = readFileSync(this.baselineFile, 'utf8');
      if (!content || content.trim().length === 0) {
        return {};
      }
      return JSON.parse(content);
    } catch (error) {
      // SECURITY: Don't expose file paths or internal details in error
      warn(`[ScoreTracker] Failed to load baselines: ${error instanceof SyntaxError ? 'Invalid JSON format' : 'File read error'}`);
      return {};
    }
  }

  /**
   * Persist baseline data to disk. No-op when autoSave is disabled;
   * write failures are logged, not thrown.
   *
   * @param {Record<string, object>} baselines - Full baseline map to write
   */
  _saveBaselines(baselines) {
    if (!this.autoSave) return;

    try {
      writeFileSync(this.baselineFile, JSON.stringify(baselines, null, 2), 'utf8');
    } catch (error) {
      warn(`[ScoreTracker] Failed to save baselines: ${error.message}`);
    }
  }

  /**
   * Record a test score. The first score recorded for a test also
   * becomes its baseline. History is capped at the last 100 entries.
   *
   * @param {string} testName - Name of the test
   * @param {number} score - Test score (0-10)
   * @param {Record<string, unknown>} [metadata={}] - Additional metadata
   * @returns {{ score: number; timestamp: string; metadata: Record<string, unknown> }} Recorded entry
   */
  record(testName, score, metadata = {}) {
    const baselines = this._loadBaselines();
    const now = new Date().toISOString();

    if (!baselines[testName]) {
      baselines[testName] = {
        history: [],
        current: null,
        baseline: null,
        firstRecorded: now,
        lastUpdated: now
      };
    }

    const entry = {
      score,
      timestamp: now,
      metadata
    };

    baselines[testName].history.push(entry);
    baselines[testName].current = score;
    baselines[testName].lastUpdated = now;

    // Set baseline if not set (first score becomes baseline)
    if (baselines[testName].baseline === null) {
      baselines[testName].baseline = score;
      baselines[testName].baselineSetAt = now;
    }

    // Keep only last 100 entries per test to bound file size
    if (baselines[testName].history.length > 100) {
      baselines[testName].history = baselines[testName].history.slice(-100);
    }

    this._saveBaselines(baselines);
    return entry;
  }

  /**
   * Get baseline for a test
   *
   * @param {string} testName - Name of the test
   * @returns {number | null} Baseline score or null if not set
   */
  getBaseline(testName) {
    const baselines = this._loadBaselines();
    return baselines[testName]?.baseline ?? null;
  }

  /**
   * Get current score for a test
   *
   * @param {string} testName - Name of the test
   * @returns {number | null} Current score or null if not recorded
   */
  getCurrent(testName) {
    const baselines = this._loadBaselines();
    return baselines[testName]?.current ?? null;
  }

  /**
   * Compare current score with baseline.
   *
   * A drop of more than 1 point counts as a regression; a gain of more
   * than 1 point counts as an improvement. Trend is derived from the
   * last 10 recorded scores (needs at least 3).
   *
   * @param {string} testName - Name of the test
   * @param {number} currentScore - Current score to compare
   * @returns {{
   *   hasBaseline: boolean;
   *   baseline: number | null;
   *   current: number;
   *   delta: number | null;
   *   regression: boolean;
   *   improvement: boolean;
   *   trend: 'improving' | 'declining' | 'stable' | 'unknown';
   *   history?: Array<{ score: number; timestamp: string; metadata: Record<string, unknown> }>;
   * }} Comparison result; `delta` is null and `history` absent when no baseline exists
   */
  compare(testName, currentScore) {
    const baselines = this._loadBaselines();
    const testData = baselines[testName];

    if (!testData || testData.baseline === null) {
      return {
        hasBaseline: false,
        baseline: null,
        current: currentScore,
        delta: null,
        regression: false,
        improvement: false,
        trend: 'unknown'
      };
    }

    const baseline = testData.baseline;
    const delta = currentScore - baseline;
    const regression = delta < -1; // Score dropped by more than 1 point
    const improvement = delta > 1; // Score improved by more than 1 point

    // Calculate trend from recent history
    const recentScores = testData.history.slice(-10).map(e => e.score);
    const trend = recentScores.length >= 3
      ? (recentScores[recentScores.length - 1] > recentScores[0] ? 'improving' :
         recentScores[recentScores.length - 1] < recentScores[0] ? 'declining' : 'stable')
      : 'unknown';

    return {
      hasBaseline: true,
      baseline,
      current: currentScore,
      delta,
      regression,
      improvement,
      trend,
      history: testData.history.slice(-10) // Last 10 scores
    };
  }

  /**
   * Update baseline (e.g., after fixing issues)
   *
   * @param {string} testName - Name of the test
   * @param {number | null} [newBaseline=null] - New baseline score, or null to use current score
   * @returns {boolean} True if baseline was updated
   */
  updateBaseline(testName, newBaseline = null) {
    const baselines = this._loadBaselines();
    if (!baselines[testName]) {
      return false;
    }

    if (newBaseline === null) {
      // Use current score as new baseline
      newBaseline = baselines[testName].current;
    }

    // BUG FIX: previously a null current score was silently promoted to
    // a null baseline while still reporting success.
    if (newBaseline == null) {
      return false;
    }

    baselines[testName].baseline = newBaseline;
    baselines[testName].baselineSetAt = new Date().toISOString();
    this._saveBaselines(baselines);
    return true;
  }

  /**
   * Get all baselines keyed by test name.
   *
   * @returns {Record<string, object>} Full baseline map
   */
  getAll() {
    return this._loadBaselines();
  }

  /**
   * Get baseline stats across all tracked tests.
   *
   * @returns {{
   *   totalTests: number;
   *   testsWithBaselines: number;
   *   testsWithRegressions: number;
   *   testsWithImprovements: number;
   *   averageScore: number;
   *   averageBaseline: number;
   * }} Aggregated statistics (averages are 0 when no data exists)
   */
  getStats() {
    const baselines = this._loadBaselines();
    const stats = {
      totalTests: Object.keys(baselines).length,
      testsWithBaselines: 0,
      testsWithRegressions: 0,
      testsWithImprovements: 0,
      averageScore: 0,
      averageBaseline: 0
    };

    let totalScore = 0;
    let totalBaseline = 0;
    let count = 0;

    for (const [testName, testData] of Object.entries(baselines)) {
      if (testData.baseline === null) {
        continue;
      }
      stats.testsWithBaselines++;
      totalBaseline += testData.baseline;

      if (testData.current !== null) {
        totalScore += testData.current;
        count++;

        const comparison = this.compare(testName, testData.current);
        if (comparison.regression) {
          stats.testsWithRegressions++;
        }
        if (comparison.improvement) {
          stats.testsWithImprovements++;
        }
      }
    }

    if (count > 0) {
      stats.averageScore = totalScore / count;
    }
    // BUG FIX: averageBaseline was previously only computed when at
    // least one test had a current score, leaving it 0 for
    // baseline-only data even though baselines existed.
    if (stats.testsWithBaselines > 0) {
      stats.averageBaseline = totalBaseline / stats.testsWithBaselines;
    }

    return stats;
  }
}
277
+