@arclabs561/ai-visual-test 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93) hide show
  1. package/.secretsignore.example +20 -0
  2. package/CHANGELOG.md +360 -0
  3. package/CONTRIBUTING.md +63 -0
  4. package/DEPLOYMENT.md +80 -0
  5. package/LICENSE +22 -0
  6. package/README.md +142 -0
  7. package/SECURITY.md +108 -0
  8. package/api/health.js +34 -0
  9. package/api/validate.js +252 -0
  10. package/index.d.ts +1221 -0
  11. package/package.json +112 -0
  12. package/public/index.html +149 -0
  13. package/src/batch-optimizer.mjs +451 -0
  14. package/src/bias-detector.mjs +370 -0
  15. package/src/bias-mitigation.mjs +233 -0
  16. package/src/cache.mjs +433 -0
  17. package/src/config.mjs +268 -0
  18. package/src/constants.mjs +80 -0
  19. package/src/context-compressor.mjs +350 -0
  20. package/src/convenience.mjs +617 -0
  21. package/src/cost-tracker.mjs +257 -0
  22. package/src/cross-modal-consistency.mjs +170 -0
  23. package/src/data-extractor.mjs +232 -0
  24. package/src/dynamic-few-shot.mjs +140 -0
  25. package/src/dynamic-prompts.mjs +361 -0
  26. package/src/ensemble/index.mjs +53 -0
  27. package/src/ensemble-judge.mjs +366 -0
  28. package/src/error-handler.mjs +67 -0
  29. package/src/errors.mjs +167 -0
  30. package/src/experience-propagation.mjs +128 -0
  31. package/src/experience-tracer.mjs +487 -0
  32. package/src/explanation-manager.mjs +299 -0
  33. package/src/feedback-aggregator.mjs +248 -0
  34. package/src/game-goal-prompts.mjs +478 -0
  35. package/src/game-player.mjs +548 -0
  36. package/src/hallucination-detector.mjs +155 -0
  37. package/src/helpers/playwright.mjs +80 -0
  38. package/src/human-validation-manager.mjs +516 -0
  39. package/src/index.mjs +364 -0
  40. package/src/judge.mjs +929 -0
  41. package/src/latency-aware-batch-optimizer.mjs +192 -0
  42. package/src/load-env.mjs +159 -0
  43. package/src/logger.mjs +55 -0
  44. package/src/metrics.mjs +187 -0
  45. package/src/model-tier-selector.mjs +221 -0
  46. package/src/multi-modal/index.mjs +36 -0
  47. package/src/multi-modal-fusion.mjs +190 -0
  48. package/src/multi-modal.mjs +524 -0
  49. package/src/natural-language-specs.mjs +1071 -0
  50. package/src/pair-comparison.mjs +277 -0
  51. package/src/persona/index.mjs +42 -0
  52. package/src/persona-enhanced.mjs +200 -0
  53. package/src/persona-experience.mjs +572 -0
  54. package/src/position-counterbalance.mjs +140 -0
  55. package/src/prompt-composer.mjs +375 -0
  56. package/src/render-change-detector.mjs +583 -0
  57. package/src/research-enhanced-validation.mjs +436 -0
  58. package/src/retry.mjs +152 -0
  59. package/src/rubrics.mjs +231 -0
  60. package/src/score-tracker.mjs +277 -0
  61. package/src/smart-validator.mjs +447 -0
  62. package/src/spec-config.mjs +106 -0
  63. package/src/spec-templates.mjs +347 -0
  64. package/src/specs/index.mjs +38 -0
  65. package/src/temporal/index.mjs +102 -0
  66. package/src/temporal-adaptive.mjs +163 -0
  67. package/src/temporal-batch-optimizer.mjs +222 -0
  68. package/src/temporal-constants.mjs +69 -0
  69. package/src/temporal-context.mjs +49 -0
  70. package/src/temporal-decision-manager.mjs +271 -0
  71. package/src/temporal-decision.mjs +669 -0
  72. package/src/temporal-errors.mjs +58 -0
  73. package/src/temporal-note-pruner.mjs +173 -0
  74. package/src/temporal-preprocessor.mjs +543 -0
  75. package/src/temporal-prompt-formatter.mjs +219 -0
  76. package/src/temporal-validation.mjs +159 -0
  77. package/src/temporal.mjs +415 -0
  78. package/src/type-guards.mjs +311 -0
  79. package/src/uncertainty-reducer.mjs +470 -0
  80. package/src/utils/index.mjs +175 -0
  81. package/src/validation-framework.mjs +321 -0
  82. package/src/validation-result-normalizer.mjs +64 -0
  83. package/src/validation.mjs +243 -0
  84. package/src/validators/accessibility-programmatic.mjs +345 -0
  85. package/src/validators/accessibility-validator.mjs +223 -0
  86. package/src/validators/batch-validator.mjs +143 -0
  87. package/src/validators/hybrid-validator.mjs +268 -0
  88. package/src/validators/index.mjs +34 -0
  89. package/src/validators/prompt-builder.mjs +218 -0
  90. package/src/validators/rubric.mjs +85 -0
  91. package/src/validators/state-programmatic.mjs +260 -0
  92. package/src/validators/state-validator.mjs +291 -0
  93. package/vercel.json +27 -0
@@ -0,0 +1,221 @@
1
+ /**
2
+ * Model Tier Selector
3
+ *
4
+ * Automatically selects the best model tier based on context (frequency, criticality, cost).
5
+ * Similar pattern to smart-validator.mjs which auto-selects validator types.
6
+ *
7
+ * Design Philosophy:
8
+ * - High-frequency decisions (10-60Hz) → use 'fast' tier
9
+ * - Critical evaluations → use 'best' tier
10
+ * - Cost-sensitive → use 'fast' tier
11
+ * - Standard validations → use 'balanced' tier (default)
12
+ *
13
+ * This prevents the common mistake of using expensive models for high-frequency decisions.
14
+ */
15
+
16
+ import { log, warn } from './logger.mjs';
17
+
/**
 * Select model tier based on context
 *
 * Rules are evaluated in priority order; the first match wins:
 *   1. High frequency ('high', 'ultra-high', or >= 10 Hz)          → 'fast'
 *   2. criticality === 'critical' or qualityRequired === true      → 'best'
 *   3. testType is 'expert-evaluation', 'medical' or
 *      'accessibility-critical'                                    → 'best'
 *   4. costSensitive === true                                      → 'fast'
 *   5. Anything else                                               → 'balanced'
 *
 * When no frequency is supplied, it is estimated from the timestamps of the
 * last (up to) 10 temporal notes.
 *
 * @param {Object} [context] - Validation context
 * @param {string|number} [context.frequency] - Decision frequency ('ultra-high'|'high'|'medium'|'low' or Hz number)
 * @param {string} [context.criticality] - Criticality level ('critical'|'high'|'medium'|'low')
 * @param {boolean} [context.costSensitive] - Cost-sensitive operation
 * @param {boolean} [context.qualityRequired] - High quality required
 * @param {string} [context.testType] - Test type (may indicate criticality)
 * @param {Array<{timestamp: number}>} [context.temporalNotes] - Temporal notes with millisecond timestamps (for frequency detection)
 * @returns {string} Model tier ('fast'|'balanced'|'best')
 */
export function selectModelTier(context = {}) {
  const {
    frequency,
    criticality,
    costSensitive,
    qualityRequired,
    testType,
    temporalNotes
  } = context ?? {};

  // An explicit 0 Hz is a real value ("low"), not a missing one, so it must
  // not be overridden by whatever the temporal notes suggest.
  const hasExplicitFrequency = typeof frequency === 'number'
    ? !Number.isNaN(frequency)
    : Boolean(frequency);

  let detectedFrequency = frequency;
  if (!hasExplicitFrequency && Array.isArray(temporalNotes) && temporalNotes.length > 1) {
    const recentNotes = temporalNotes.slice(-10);
    const firstTimestamp = recentNotes[0]?.timestamp;
    const lastTimestamp = recentNotes[recentNotes.length - 1]?.timestamp;
    const timeSpan = lastTimestamp - firstTimestamp;
    // Missing timestamps yield NaN, which fails this comparison and skips detection.
    if (timeSpan > 0) {
      // N notes delimit N - 1 intervals; dividing N by the span would
      // over-estimate the rate (two notes 1s apart are 1 Hz, not 2 Hz).
      // The numeric result is categorized by the same thresholds as an
      // explicitly supplied Hz value below.
      detectedFrequency = ((recentNotes.length - 1) * 1000) / timeSpan;
    }
  }

  // Convert numeric frequency (Hz) to category
  if (typeof detectedFrequency === 'number') {
    if (detectedFrequency >= 10) {
      detectedFrequency = 'high'; // 10-60Hz
    } else if (detectedFrequency >= 1) {
      detectedFrequency = 'medium'; // 1-10Hz
    } else {
      detectedFrequency = 'low'; // <1Hz
    }
  }

  // Tier 1: High-frequency decisions (10-60Hz) → fast
  // Rationale: Speed is critical, quality can be lower.
  // This intentionally outranks criticality so expensive models are never
  // used in a high-frequency loop.
  if (detectedFrequency === 'high' || detectedFrequency === 'ultra-high') {
    log('[ModelTierSelector] High-frequency detected, selecting fast tier');
    return 'fast';
  }

  // Tier 2: Critical evaluations → best
  // Rationale: Quality is critical, speed can be slower
  if (criticality === 'critical' || qualityRequired === true) {
    log('[ModelTierSelector] Critical evaluation detected, selecting best tier');
    return 'best';
  }

  // Check testType for critical indicators
  if (testType === 'expert-evaluation' || testType === 'medical' || testType === 'accessibility-critical') {
    log('[ModelTierSelector] Critical test type detected, selecting best tier');
    return 'best';
  }

  // Tier 3: Cost-sensitive → fast
  // Rationale: Minimize cost, acceptable quality
  if (costSensitive === true) {
    log('[ModelTierSelector] Cost-sensitive detected, selecting fast tier');
    return 'fast';
  }

  // Tier 4: Standard validations → balanced (default)
  // Rationale: Best balance of speed and quality
  log('[ModelTierSelector] Standard validation, selecting balanced tier (default)');
  return 'balanced';
}
102
+
/**
 * Select provider based on requirements
 *
 * Rules are evaluated in order and the first rule whose provider has an API
 * key in `env` wins:
 *   1. Ultra-fast, text-only            → Groq
 *   2. Context larger than 200k tokens  → Gemini
 *   3. Best quality                     → Gemini, then OpenAI
 *   4. Fast + good quality              → Gemini
 *   5. Cost-sensitive                   → Gemini, then Groq (text-only)
 *   6. Default, by available key        → Groq (when vision is requested),
 *                                         Gemini, OpenAI, Claude, then Groq
 *   7. No key at all                    → 'gemini' (with a warning)
 *
 * @param {Object} [requirements] - Provider requirements
 * @param {string} [requirements.speed] - Speed requirement ('ultra-fast'|'fast'|'normal'|'slow')
 * @param {string} [requirements.quality] - Quality requirement ('best'|'good'|'acceptable')
 * @param {boolean} [requirements.costSensitive] - Cost-sensitive
 * @param {number} [requirements.contextSize] - Context size in tokens
 * @param {boolean} [requirements.vision] - Vision required (default: true for VLLM)
 * @param {Object} [requirements.env] - Environment variables (for API key detection)
 * @returns {string} Provider name ('gemini'|'openai'|'claude'|'groq')
 */
export function selectProvider(requirements = {}) {
  const {
    speed = 'normal',
    quality = 'good',
    costSensitive = false,
    contextSize = 0,
    vision = true, // Default true for VLLM
    env
  } = requirements ?? {};

  // Tolerate env: null as well as a missing env.
  const keys = env ?? {};
  const hasGroq = Boolean(keys.GROQ_API_KEY);
  const hasGemini = Boolean(keys.GEMINI_API_KEY);
  const hasOpenAI = Boolean(keys.OPENAI_API_KEY);
  const hasClaude = Boolean(keys.ANTHROPIC_API_KEY);

  // Log the decision and hand back the provider in one step.
  const choose = (provider, message) => {
    log(message);
    return provider;
  };

  // Ultra-fast, text-only → Groq (if no vision needed)
  if (speed === 'ultra-fast' && !vision && hasGroq) {
    return choose('groq', '[ModelTierSelector] Ultra-fast text-only, selecting Groq');
  }

  // Large context → Gemini (1M+ tokens)
  if (contextSize > 200000 && hasGemini) {
    return choose('gemini', '[ModelTierSelector] Large context detected, selecting Gemini');
  }

  // Best quality → Gemini 2.5 Pro or GPT-5
  if (quality === 'best') {
    if (hasGemini) {
      return choose('gemini', '[ModelTierSelector] Best quality required, selecting Gemini');
    }
    if (hasOpenAI) {
      return choose('openai', '[ModelTierSelector] Best quality required, selecting OpenAI');
    }
  }

  // Fast + good quality → Gemini Flash
  if (speed === 'fast' && quality === 'good' && hasGemini) {
    return choose('gemini', '[ModelTierSelector] Fast + good quality, selecting Gemini');
  }

  // Cost-sensitive → Gemini (free tier, lower cost)
  if (costSensitive) {
    if (hasGemini) {
      return choose('gemini', '[ModelTierSelector] Cost-sensitive, selecting Gemini');
    }
    if (hasGroq && !vision) {
      return choose('groq', '[ModelTierSelector] Cost-sensitive text-only, selecting Groq');
    }
  }

  // Default → Auto-detect from available API keys
  // Priority: Groq (when vision is requested) > Gemini > OpenAI > Claude
  if (vision && hasGroq) {
    return choose('groq', '[ModelTierSelector] Default, selecting Groq (vision supported)');
  }
  if (hasGemini) {
    return choose('gemini', '[ModelTierSelector] Default, selecting Gemini');
  }
  if (hasOpenAI) {
    return choose('openai', '[ModelTierSelector] Default, selecting OpenAI');
  }
  if (hasClaude) {
    return choose('claude', '[ModelTierSelector] Default, selecting Claude');
  }
  // Only reachable for text-only requests where Groq is the sole configured
  // key: use it rather than falling through to a provider with no key.
  if (hasGroq) {
    return choose('groq', '[ModelTierSelector] Default, selecting Groq');
  }

  // Fallback
  warn('[ModelTierSelector] No API keys found, defaulting to gemini');
  return 'gemini';
}
196
+
/**
 * Select model tier and provider based on context
 *
 * Combines tier and provider selection for convenience. Everything in
 * `context` except `requirements` is forwarded to selectModelTier();
 * `requirements` is forwarded to selectProvider().
 *
 * @param {Object} [context] - Validation context
 * @param {Object} [context.requirements] - Provider requirements
 * @param {Object} [context.requirements.env] - Environment used for API key
 *   detection; defaults to process.env when omitted
 * @returns {{tier: string, provider: string, reason: string}}
 */
export function selectModelTierAndProvider(context = {}) {
  const { requirements, ...tierContext } = context ?? {};
  const providerRequirements = requirements ?? {};

  // Honor an explicitly supplied env (e.g. in tests or multi-tenant setups)
  // instead of unconditionally overwriting it; fall back to process.env, and
  // to an empty object where `process` does not exist.
  const defaultEnv = typeof process !== 'undefined' && process.env ? process.env : {};
  const env = providerRequirements.env ?? defaultEnv;

  const tier = selectModelTier(tierContext);
  const provider = selectProvider({
    ...providerRequirements,
    env
  });

  return {
    tier,
    provider,
    reason: `Selected ${provider} ${tier} tier based on context`
  };
}
221
+
@@ -0,0 +1,36 @@
1
+ /**
2
+ * Multi-Modal Sub-Module
3
+ *
4
+ * Multi-modal validation features (screenshot + HTML + CSS + rendered code).
5
+ *
6
+ * Import from '@arclabs561/ai-visual-test/multi-modal'
7
+ */
8
+
9
+ // Core multi-modal functions
10
+ export {
11
+ multiModalValidation,
12
+ captureTemporalScreenshots,
13
+ extractRenderedCode,
14
+ multiPerspectiveEvaluation
15
+ } from '../multi-modal.mjs';
16
+
17
+ // Multi-modal fusion
18
+ export {
19
+ buildStructuredFusionPrompt,
20
+ calculateModalityWeights,
21
+ compareFusionStrategies
22
+ } from '../multi-modal-fusion.mjs';
23
+
24
+ // Cross-modal consistency
25
+ export {
26
+ checkCrossModalConsistency,
27
+ validateExperienceConsistency
28
+ } from '../cross-modal-consistency.mjs';
29
+
30
+ // Prompt composition
31
+ export {
32
+ composeSingleImagePrompt,
33
+ composeComparisonPrompt,
34
+ composeMultiModalPrompt
35
+ } from '../prompt-composer.mjs';
36
+
@@ -0,0 +1,190 @@
1
+ /**
2
+ * Attention-Based Multi-Modal Fusion
3
+ *
4
+ * Implements structured fusion with attention mechanisms for combining
5
+ * screenshot, HTML, CSS, and rendered code modalities.
6
+ *
7
+ * Research:
8
+ * - "Multimodal Fusion and Vision-Language Models: A Survey for Robot Vision" - Comprehensive survey
9
+ * - "Cross-Modal Consistency in Multimodal Large Language Models" - Consistency issues in GPT-4V
10
+ * - "Post-pre-training for Modality Alignment in Vision-Language Foundation Models" - CLIP-Refine
11
+ * - "Attention-Based Multimodal Fusion" - Various papers on attention mechanisms
12
+ *
13
+ * Key findings: Structured fusion outperforms simple concatenation. Modality gap exists even
14
+ * after contrastive training. Cross-attention enables selective information integration.
15
+ * Hallucination is a major issue, especially with stylized images.
16
+ *
17
+ * Note: This implementation uses heuristic-based attention weighting. Full research implementation
18
+ * would use learned cross-attention mechanisms and address the modality gap.
19
+ */
20
+
/**
 * Calculate attention weights for different modalities
 *
 * Weights are chosen heuristically from keywords found in the prompt
 * (case-insensitive substring match). Keyword groups are checked in a fixed
 * order (visual, structure, styling, state) and each matching group replaces
 * the whole weight set, so when several groups match the LAST one wins.
 * The result is normalized so the weights sum to 1.
 *
 * Note: `modalities` is accepted for interface symmetry but is not consulted;
 * weights are returned for every modality whether or not it is present.
 *
 * @param {Object} modalities - Available modalities
 * @param {string} [modalities.screenshot] - Screenshot path
 * @param {Object} [modalities.renderedCode] - Rendered code (HTML, CSS, DOM)
 * @param {Object} [modalities.gameState] - Game state
 * @param {string} prompt - Validation prompt; a missing or non-string prompt
 *   is treated as text via String(), with null/undefined treated as empty
 * @returns {{screenshot: number, html: number, css: number, dom: number, gameState: number}}
 *   Attention weights for each modality
 */
export function calculateModalityWeights(modalities, prompt) {
  // Base weights, used when no keyword group matches.
  const weights = {
    screenshot: 0.4, // Base weight for visual
    html: 0.2,
    css: 0.2,
    dom: 0.1,
    gameState: 0.1
  };

  // Ordered keyword → weight-profile rules. Order matters: later matches
  // overwrite earlier ones.
  const rules = [
    {
      // Visual/design prompts: increase screenshot weight
      keywords: ['visual', 'design', 'appearance'],
      profile: { screenshot: 0.5, html: 0.2, css: 0.2, dom: 0.05, gameState: 0.05 }
    },
    {
      // Structure/layout prompts: increase HTML/DOM weight
      keywords: ['structure', 'layout', 'html'],
      profile: { screenshot: 0.3, html: 0.3, css: 0.15, dom: 0.2, gameState: 0.05 }
    },
    {
      // Styling prompts: increase CSS weight
      keywords: ['style', 'css', 'styling'],
      profile: { screenshot: 0.35, html: 0.2, css: 0.3, dom: 0.1, gameState: 0.05 }
    },
    {
      // State/functionality prompts: increase gameState weight
      keywords: ['state', 'function', 'game'],
      profile: { screenshot: 0.35, html: 0.2, css: 0.15, dom: 0.1, gameState: 0.2 }
    }
  ];

  // Guard against undefined/null/non-string prompts instead of throwing.
  const promptLower = String(prompt ?? '').toLowerCase();

  for (const { keywords, profile } of rules) {
    if (keywords.some((keyword) => promptLower.includes(keyword))) {
      // Object.assign keeps the original key order of `weights`.
      Object.assign(weights, profile);
    }
  }

  // Normalize weights
  const total = Object.values(weights).reduce((a, b) => a + b, 0);
  for (const key of Object.keys(weights)) {
    weights[key] = weights[key] / total;
  }

  return weights;
}
87
+
/**
 * Render a modality value as prompt text, capped at `maxLength` characters.
 * Strings are used as-is; anything else is pretty-printed as JSON (falling
 * back to String() for values JSON cannot represent).
 *
 * @param {*} value - Modality content
 * @param {number} maxLength - Maximum number of characters to keep
 * @returns {string} Truncated text
 */
function truncateForPrompt(value, maxLength) {
  const text = typeof value === 'string'
    ? value
    : (JSON.stringify(value, null, 2) ?? String(value));
  return text.substring(0, maxLength);
}

/**
 * Build structured fusion prompt with attention weights
 *
 * Appends one labelled section per available modality to the base prompt,
 * each annotated with its relevance weight (as a whole percentage), followed
 * by fixed evaluation instructions. HTML and CSS are limited to 2000
 * characters and the DOM structure to 1000; game state is not truncated.
 *
 * @param {string} basePrompt - Base validation prompt
 * @param {Object} [modalities] - Available modalities
 * @param {string} [modalities.screenshot] - Screenshot path
 * @param {Object} [modalities.renderedCode] - Rendered code
 *   (`html`, `criticalCSS`, `domStructure`; strings or JSON-serializable values)
 * @param {Object} [modalities.gameState] - Game state
 * @returns {string} Structured fusion prompt
 */
export function buildStructuredFusionPrompt(basePrompt, modalities = {}) {
  const available = modalities ?? {};
  const renderedCode = available.renderedCode;
  const weights = calculateModalityWeights(available, basePrompt);
  const percent = (weight) => (weight * 100).toFixed(0);

  const parts = [basePrompt];
  parts.push('\n\n=== MULTI-MODAL CONTEXT (Weighted by Relevance) ===\n');

  // Screenshot (primary visual reference)
  if (available.screenshot) {
    parts.push(`[VISUAL - Weight: ${percent(weights.screenshot)}%]`);
    parts.push(`Screenshot: ${available.screenshot}`);
    parts.push('Use this visual representation as the primary reference for appearance and layout.\n');
  }

  // HTML structure
  if (renderedCode?.html) {
    parts.push(`[STRUCTURE - Weight: ${percent(weights.html)}%]`);
    parts.push('HTML Structure:');
    parts.push(truncateForPrompt(renderedCode.html, 2000)); // Limit length
    parts.push('\nUse this for understanding semantic structure and element hierarchy.\n');
  }

  // CSS styling
  if (renderedCode?.criticalCSS) {
    parts.push(`[STYLING - Weight: ${percent(weights.css)}%]`);
    parts.push('Critical CSS:');
    parts.push(truncateForPrompt(renderedCode.criticalCSS, 2000)); // Limit length
    parts.push('\nUse this for understanding visual styling, positioning, and layout rules.\n');
  }

  // DOM structure
  if (renderedCode?.domStructure) {
    parts.push(`[DOM - Weight: ${percent(weights.dom)}%]`);
    parts.push('DOM Structure:');
    parts.push(truncateForPrompt(renderedCode.domStructure, 1000)); // Limit length
    parts.push('\nUse this for understanding element relationships and computed properties.\n');
  }

  // Game state (skipped when empty)
  if (available.gameState && Object.keys(available.gameState).length > 0) {
    parts.push(`[STATE - Weight: ${percent(weights.gameState)}%]`);
    parts.push('Game State:');
    parts.push(JSON.stringify(available.gameState, null, 2));
    parts.push('\nUse this for understanding functional state and dynamic behavior.\n');
  }

  parts.push('\n=== EVALUATION INSTRUCTIONS ===');
  parts.push('1. Primary: Use screenshot for visual assessment');
  parts.push('2. Secondary: Use HTML/CSS for structural validation');
  parts.push('3. Tertiary: Use DOM/State for functional validation');
  parts.push('4. Weight your assessment based on the relevance weights above');
  parts.push('5. Cross-reference modalities to identify inconsistencies');

  return parts.join('\n');
}
158
+
/**
 * Compare structured fusion vs simple concatenation
 *
 * Builds the prompt both ways and reports the size of each, the number of
 * top-level keys in `modalities`, and the weights used by the structured
 * variant. The recommendation string is fixed; it is not derived from the
 * measurements.
 *
 * @param {string} basePrompt - Base prompt
 * @param {Object} [modalities] - Available modalities (screenshot, renderedCode, gameState)
 * @returns {{
 *   simple: {length: number, modalityCount: number, hasWeights: boolean},
 *   structured: {length: number, modalityCount: number, hasWeights: boolean, weights: Object},
 *   recommendation: string
 * }} Comparison of fusion strategies
 */
export function compareFusionStrategies(basePrompt, modalities = {}) {
  // Treat a missing or null modalities argument as "no modalities" rather
  // than throwing on property access.
  const available = modalities ?? {};
  const modalityCount = Object.keys(available).length;

  // Simple concatenation (current approach)
  const simplePrompt = [
    basePrompt,
    `SCREENSHOT:\n${available.screenshot || 'N/A'}`,
    `RENDERED CODE:\n${JSON.stringify(available.renderedCode || {}, null, 2)}`,
    `GAME STATE:\n${JSON.stringify(available.gameState || {}, null, 2)}`
  ].map((section) => `${section}`).join('\n\n');

  // Structured fusion (new approach)
  const structuredPrompt = buildStructuredFusionPrompt(basePrompt, available);

  return {
    simple: {
      length: simplePrompt.length,
      modalityCount,
      hasWeights: false
    },
    structured: {
      length: structuredPrompt.length,
      modalityCount,
      hasWeights: true,
      weights: calculateModalityWeights(available, basePrompt)
    },
    recommendation: 'Use structured fusion for better modality integration'
  };
}
188
+
189
+
190
+