@arclabs561/ai-visual-test 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. package/.secretsignore.example +20 -0
  2. package/CHANGELOG.md +360 -0
  3. package/CONTRIBUTING.md +63 -0
  4. package/DEPLOYMENT.md +80 -0
  5. package/LICENSE +22 -0
  6. package/README.md +142 -0
  7. package/SECURITY.md +108 -0
  8. package/api/health.js +34 -0
  9. package/api/validate.js +252 -0
  10. package/index.d.ts +1221 -0
  11. package/package.json +112 -0
  12. package/public/index.html +149 -0
  13. package/src/batch-optimizer.mjs +451 -0
  14. package/src/bias-detector.mjs +370 -0
  15. package/src/bias-mitigation.mjs +233 -0
  16. package/src/cache.mjs +433 -0
  17. package/src/config.mjs +268 -0
  18. package/src/constants.mjs +80 -0
  19. package/src/context-compressor.mjs +350 -0
  20. package/src/convenience.mjs +617 -0
  21. package/src/cost-tracker.mjs +257 -0
  22. package/src/cross-modal-consistency.mjs +170 -0
  23. package/src/data-extractor.mjs +232 -0
  24. package/src/dynamic-few-shot.mjs +140 -0
  25. package/src/dynamic-prompts.mjs +361 -0
  26. package/src/ensemble/index.mjs +53 -0
  27. package/src/ensemble-judge.mjs +366 -0
  28. package/src/error-handler.mjs +67 -0
  29. package/src/errors.mjs +167 -0
  30. package/src/experience-propagation.mjs +128 -0
  31. package/src/experience-tracer.mjs +487 -0
  32. package/src/explanation-manager.mjs +299 -0
  33. package/src/feedback-aggregator.mjs +248 -0
  34. package/src/game-goal-prompts.mjs +478 -0
  35. package/src/game-player.mjs +548 -0
  36. package/src/hallucination-detector.mjs +155 -0
  37. package/src/helpers/playwright.mjs +80 -0
  38. package/src/human-validation-manager.mjs +516 -0
  39. package/src/index.mjs +364 -0
  40. package/src/judge.mjs +929 -0
  41. package/src/latency-aware-batch-optimizer.mjs +192 -0
  42. package/src/load-env.mjs +159 -0
  43. package/src/logger.mjs +55 -0
  44. package/src/metrics.mjs +187 -0
  45. package/src/model-tier-selector.mjs +221 -0
  46. package/src/multi-modal/index.mjs +36 -0
  47. package/src/multi-modal-fusion.mjs +190 -0
  48. package/src/multi-modal.mjs +524 -0
  49. package/src/natural-language-specs.mjs +1071 -0
  50. package/src/pair-comparison.mjs +277 -0
  51. package/src/persona/index.mjs +42 -0
  52. package/src/persona-enhanced.mjs +200 -0
  53. package/src/persona-experience.mjs +572 -0
  54. package/src/position-counterbalance.mjs +140 -0
  55. package/src/prompt-composer.mjs +375 -0
  56. package/src/render-change-detector.mjs +583 -0
  57. package/src/research-enhanced-validation.mjs +436 -0
  58. package/src/retry.mjs +152 -0
  59. package/src/rubrics.mjs +231 -0
  60. package/src/score-tracker.mjs +277 -0
  61. package/src/smart-validator.mjs +447 -0
  62. package/src/spec-config.mjs +106 -0
  63. package/src/spec-templates.mjs +347 -0
  64. package/src/specs/index.mjs +38 -0
  65. package/src/temporal/index.mjs +102 -0
  66. package/src/temporal-adaptive.mjs +163 -0
  67. package/src/temporal-batch-optimizer.mjs +222 -0
  68. package/src/temporal-constants.mjs +69 -0
  69. package/src/temporal-context.mjs +49 -0
  70. package/src/temporal-decision-manager.mjs +271 -0
  71. package/src/temporal-decision.mjs +669 -0
  72. package/src/temporal-errors.mjs +58 -0
  73. package/src/temporal-note-pruner.mjs +173 -0
  74. package/src/temporal-preprocessor.mjs +543 -0
  75. package/src/temporal-prompt-formatter.mjs +219 -0
  76. package/src/temporal-validation.mjs +159 -0
  77. package/src/temporal.mjs +415 -0
  78. package/src/type-guards.mjs +311 -0
  79. package/src/uncertainty-reducer.mjs +470 -0
  80. package/src/utils/index.mjs +175 -0
  81. package/src/validation-framework.mjs +321 -0
  82. package/src/validation-result-normalizer.mjs +64 -0
  83. package/src/validation.mjs +243 -0
  84. package/src/validators/accessibility-programmatic.mjs +345 -0
  85. package/src/validators/accessibility-validator.mjs +223 -0
  86. package/src/validators/batch-validator.mjs +143 -0
  87. package/src/validators/hybrid-validator.mjs +268 -0
  88. package/src/validators/index.mjs +34 -0
  89. package/src/validators/prompt-builder.mjs +218 -0
  90. package/src/validators/rubric.mjs +85 -0
  91. package/src/validators/state-programmatic.mjs +260 -0
  92. package/src/validators/state-validator.mjs +291 -0
  93. package/vercel.json +27 -0
@@ -0,0 +1,231 @@
1
+ /**
2
+ * Evaluation Rubrics
3
+ *
4
+ * Provides explicit scoring rubrics for LLM-as-a-judge evaluation.
5
+ * Research shows that explicit rubrics improve reliability by 10-20%
6
+ * and reduce bias from superficial features (LLMs-as-Judges Survey, arXiv:2412.05579).
7
+ */
8
+
9
/**
 * Default scoring rubric for screenshot validation.
 *
 * Contains the 0-10 scoring scale plus four evaluation dimensions
 * (visual, functional, usability, accessibility), each with a short
 * description and a list of concrete criteria.
 */
export const DEFAULT_RUBRIC = {
  // Overall 0-10 scale; buildRubricPrompt renders these high-to-low.
  score: {
    description: "Overall quality score from 0-10",
    criteria: {
      10: "Perfect - No issues, excellent UX, all requirements met",
      9: "Excellent - Minor cosmetic issues, excellent UX",
      8: "Very Good - Minor issues that don't affect usability",
      7: "Good - Some issues but generally usable",
      6: "Acceptable - Issues present but functional",
      5: "Needs Improvement - Significant issues affecting usability",
      4: "Poor - Major issues, difficult to use",
      3: "Very Poor - Critical issues, barely functional",
      2: "Bad - Severe issues, mostly broken",
      1: "Very Bad - Almost completely broken",
      0: "Broken - Completely non-functional"
    }
  },
  // Per-dimension checklists rendered when includeDimensions is true.
  dimensions: {
    visual: {
      description: "Visual design and aesthetics",
      criteria: [
        "Layout is clear and organized",
        "Colors are appropriate and accessible",
        "Typography is readable",
        "Spacing is consistent",
        "Visual hierarchy is clear"
      ]
    },
    functional: {
      description: "Functional correctness",
      criteria: [
        "All interactive elements work correctly",
        "Forms submit properly",
        "Links navigate correctly",
        "Buttons trigger expected actions",
        "No broken functionality"
      ]
    },
    usability: {
      description: "Ease of use",
      criteria: [
        "Purpose is clear",
        "Actions are obvious",
        "Feedback is provided",
        "Error messages are helpful",
        "Flow is intuitive"
      ]
    },
    accessibility: {
      description: "Accessibility compliance",
      criteria: [
        "Keyboard navigation works",
        "Screen reader compatible",
        "Color contrast is sufficient",
        "Text is readable",
        "Interactive elements are accessible"
      ]
    }
  }
};
72
+
73
/**
 * Build rubric prompt section.
 *
 * Renders a rubric as prompt text: the 0-10 scoring scale (highest
 * score first), few-shot example evaluations, evaluation instructions,
 * optional per-dimension criteria, issue-severity guidance, and the
 * required JSON output format.
 *
 * @param {import('./index.mjs').Rubric | null} [rubric=null] - Rubric to use, or null for default
 * @param {boolean} [includeDimensions=true] - Whether to include evaluation dimensions
 * @returns {string} Formatted rubric prompt text
 */
export function buildRubricPrompt(rubric = null, includeDimensions = true) {
  const rubricToUse = rubric || DEFAULT_RUBRIC;
  // Scale keys are sorted high-to-low. Number() replaces the original
  // parseInt() call, which omitted the radix argument.
  let prompt = `## EVALUATION RUBRIC

### Scoring Scale (0-10):
${Object.entries(rubricToUse.score.criteria)
  .sort((a, b) => Number(b[0]) - Number(a[0]))
  .map(([score, desc]) => `- ${score}: ${desc}`)
  .join('\n')}

### Example Evaluations (Few-Shot Learning):

**Example 1 - High Quality (Score: 9)**
Screenshot: Clean, accessible homepage with high contrast
Evaluation: "Excellent design with clear navigation, high contrast (21:1), keyboard accessible. Minor: could improve spacing. Score: 9"
JSON: {"score": 9, "assessment": "excellent", "issues": ["minor spacing"], "reasoning": "High quality with minor improvements needed"}

**Example 2 - Medium Quality (Score: 6)**
Screenshot: Functional but cluttered interface
Evaluation: "Functional design but cluttered layout, moderate contrast (4.2:1), some accessibility issues. Score: 6"
JSON: {"score": 6, "assessment": "needs-improvement", "issues": ["cluttered layout", "low contrast", "accessibility issues"], "reasoning": "Functional but needs significant improvements"}

**Example 3 - Low Quality (Score: 3)**
Screenshot: Broken layout with poor accessibility
Evaluation: "Poor design with broken layout, very low contrast (2.1:1), not keyboard accessible, multiple critical issues. Score: 3"
JSON: {"score": 3, "assessment": "fail", "issues": ["broken layout", "critical contrast violation", "no keyboard navigation"], "reasoning": "Multiple critical issues prevent usability"}

### Evaluation Instructions:
1. Evaluate the screenshot against the criteria below
2. Consider both appearance and functional correctness
3. Base your score on substantive content, not superficial features
4. Ignore factors like response length, verbosity, or formatting style
5. Focus on actual quality: correctness, clarity, usability, and accessibility
6. Provide a score from 0-10 based on the rubric above
7. List specific issues found (if any)
8. Provide reasoning for your score`;

  // Per-dimension criteria are optional so callers can emit a shorter
  // prompt; skipped when the rubric defines no dimensions.
  if (includeDimensions && rubricToUse.dimensions) {
    prompt += `\n\n### Evaluation Dimensions:
${Object.entries(rubricToUse.dimensions)
  .map(([key, dim]) => `\n**${key.toUpperCase()}** (${dim.description}):\n${dim.criteria.map(c => `- ${c}`).join('\n')}`)
  .join('\n')}`;
  }

  prompt += `\n\n### Issue Importance and Annoyance:
For each issue you identify, consider:
- **Importance**: How critical is this issue? (critical, high, medium, low)
- **Annoyance**: How annoying/frustrating is this issue to users? (very-high, high, medium, low)
- **Impact**: What is the impact on user experience? (blocks-use, degrades-experience, minor-inconvenience, cosmetic)

### Suggestions and Evidence:
When providing recommendations, include:
- **Specific suggestions**: Concrete, actionable improvements
- **Evidence**: What in the screenshot supports your judgment? (visual elements, layout issues, accessibility violations, etc.)
- **Priority**: Which issues should be fixed first? (based on importance and annoyance)

### Output Format:
Provide your evaluation as JSON:
{
  "score": <0-10 integer>,
  "assessment": "<pass|fail|needs-improvement>",
  "issues": [
    {
      "description": "<issue description>",
      "importance": "<critical|high|medium|low>",
      "annoyance": "<very-high|high|medium|low>",
      "impact": "<blocks-use|degrades-experience|minor-inconvenience|cosmetic>",
      "evidence": "<what in the screenshot supports this issue>",
      "suggestion": "<specific, actionable recommendation>"
    }
  ],
  "reasoning": "<explanation of score>",
  "strengths": ["<strength1>", "<strength2>", ...],
  "recommendations": [
    {
      "priority": "<high|medium|low>",
      "suggestion": "<specific recommendation>",
      "evidence": "<what supports this recommendation>",
      "expectedImpact": "<what improvement this would bring>"
    }
  ],
  "evidence": {
    "visual": "<visual evidence from screenshot>",
    "functional": "<functional evidence>",
    "accessibility": "<accessibility evidence>"
  }
}`;

  return prompt;
}
170
+
171
/**
 * Extra evaluation dimension merged into DEFAULT_RUBRIC per test type.
 * Built once at module load; the original implementation rebuilt all
 * three spread-merged rubric objects on every call.
 */
const TEST_TYPE_DIMENSIONS = {
  'payment-screen': {
    key: 'payment',
    dimension: {
      description: 'Payment functionality',
      criteria: [
        'Payment code is clearly visible',
        'Payment links are obvious',
        'Payment flow is trustworthy',
        'Connection to game access is clear',
        'Payment instructions are clear'
      ]
    }
  },
  'gameplay': {
    key: 'gameplay',
    dimension: {
      description: 'Gameplay experience',
      criteria: [
        'Game is visually engaging',
        'Controls are intuitive',
        'Feedback is clear',
        'Game is balanced',
        'Experience is fun'
      ]
    }
  },
  'form': {
    key: 'form',
    dimension: {
      description: 'Form usability',
      criteria: [
        'Labels are clear',
        'Placeholders are helpful',
        'Validation is clear',
        'Submit button is obvious',
        'Error messages are helpful'
      ]
    }
  }
};

/**
 * Get rubric for specific test type.
 *
 * Returns DEFAULT_RUBRIC augmented with one test-type-specific
 * dimension, or DEFAULT_RUBRIC unchanged for unknown test types.
 *
 * @param {string} testType - Test type identifier (e.g., 'payment-screen', 'gameplay', 'form')
 * @returns {import('./index.mjs').Rubric} Rubric configured for the test type
 */
export function getRubricForTestType(testType) {
  // Object.hasOwn guards against inherited keys: a plain index lookup
  // would return truthy junk for inputs like 'constructor' or 'toString'.
  if (!Object.hasOwn(TEST_TYPE_DIMENSIONS, testType)) {
    return DEFAULT_RUBRIC;
  }
  const { key, dimension } = TEST_TYPE_DIMENSIONS[testType];
  return {
    ...DEFAULT_RUBRIC,
    dimensions: {
      ...DEFAULT_RUBRIC.dimensions,
      [key]: dimension
    }
  };
}
231
+
@@ -0,0 +1,277 @@
1
+ /**
2
+ * Score Tracker
3
+ *
4
+ * Tracks test scores over time for regression detection and improvement tracking.
5
+ * Stores baselines in JSON files for comparison.
6
+ *
7
+ * General-purpose utility - no domain-specific logic.
8
+ */
9
+
10
+ import { readFileSync, writeFileSync, existsSync, mkdirSync } from 'fs';
11
+ import { join, dirname } from 'path';
12
+ import { fileURLToPath } from 'url';
13
+ import { warn } from './logger.mjs';
14
+
15
// Resolve this module's file path and directory (ES modules have no
// built-in __filename/__dirname).
// NOTE(review): neither value appears to be referenced anywhere else in
// this module — confirm they are needed before removing.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
17
+
18
/**
 * Score Tracker Class
 *
 * Tracks test scores over time for regression detection and improvement
 * tracking. All data is persisted to a single JSON file
 * (`<baselineDir>/scores.json`). The first score recorded for a test
 * becomes its baseline until explicitly updated via updateBaseline().
 *
 * @class ScoreTracker
 */
export class ScoreTracker {
  /**
   * @param {{
   *   baselineDir?: string;
   *   autoSave?: boolean;
   * }} [options={}] - Tracker options. `autoSave=false` disables writing
   *   to disk entirely (record/updateBaseline become in-memory no-ops
   *   across instances, since state is reloaded from the file).
   */
  constructor(options = {}) {
    const {
      baselineDir = join(process.cwd(), 'test-results', 'baselines'),
      autoSave = true
    } = options;

    this.baselineDir = baselineDir;
    this.autoSave = autoSave;
    this.baselineFile = join(baselineDir, 'scores.json');

    // Ensure baseline directory exists
    if (!existsSync(baselineDir)) {
      mkdirSync(baselineDir, { recursive: true });
    }
  }

  /**
   * Load all baseline data from disk.
   *
   * @returns {Record<string, object>} Parsed baselines, or {} when the
   *   file is missing, empty, or unreadable.
   */
  _loadBaselines() {
    if (!existsSync(this.baselineFile)) {
      return {};
    }

    try {
      const content = readFileSync(this.baselineFile, 'utf8');
      if (!content || content.trim().length === 0) {
        return {};
      }
      return JSON.parse(content);
    } catch (error) {
      // SECURITY: Don't expose file paths or internal details in error
      warn(`[ScoreTracker] Failed to load baselines: ${error instanceof SyntaxError ? 'Invalid JSON format' : 'File read error'}`);
      return {};
    }
  }

  /**
   * Persist baseline data to disk. No-op when autoSave is disabled;
   * write failures are logged, not thrown.
   *
   * @param {Record<string, object>} baselines - Full baseline map to write
   */
  _saveBaselines(baselines) {
    if (!this.autoSave) return;

    try {
      writeFileSync(this.baselineFile, JSON.stringify(baselines, null, 2), 'utf8');
    } catch (error) {
      warn(`[ScoreTracker] Failed to save baselines: ${error.message}`);
    }
  }

  /**
   * Record a test score. The first score recorded for a test also
   * becomes its baseline. History is capped at the last 100 entries.
   *
   * @param {string} testName - Name of the test
   * @param {number} score - Test score (0-10)
   * @param {Record<string, unknown>} [metadata={}] - Additional metadata
   * @returns {{ score: number; timestamp: string; metadata: Record<string, unknown> }} Recorded entry
   */
  record(testName, score, metadata = {}) {
    const baselines = this._loadBaselines();
    const now = new Date().toISOString();

    if (!baselines[testName]) {
      baselines[testName] = {
        history: [],
        current: null,
        baseline: null,
        firstRecorded: now,
        lastUpdated: now
      };
    }

    const entry = {
      score,
      timestamp: now,
      metadata
    };

    baselines[testName].history.push(entry);
    baselines[testName].current = score;
    baselines[testName].lastUpdated = now;

    // Set baseline if not set (first score becomes baseline)
    if (baselines[testName].baseline === null) {
      baselines[testName].baseline = score;
      baselines[testName].baselineSetAt = now;
    }

    // Keep only last 100 entries per test to bound file size
    if (baselines[testName].history.length > 100) {
      baselines[testName].history = baselines[testName].history.slice(-100);
    }

    this._saveBaselines(baselines);
    return entry;
  }

  /**
   * Get baseline for a test
   *
   * @param {string} testName - Name of the test
   * @returns {number | null} Baseline score or null if not set
   */
  getBaseline(testName) {
    const baselines = this._loadBaselines();
    return baselines[testName]?.baseline ?? null;
  }

  /**
   * Get current score for a test
   *
   * @param {string} testName - Name of the test
   * @returns {number | null} Current score or null if not recorded
   */
  getCurrent(testName) {
    const baselines = this._loadBaselines();
    return baselines[testName]?.current ?? null;
  }

  /**
   * Compare current score with baseline.
   *
   * A drop of more than 1 point counts as a regression; a gain of more
   * than 1 point counts as an improvement. Trend is derived from the
   * last 10 recorded scores (needs at least 3).
   *
   * @param {string} testName - Name of the test
   * @param {number} currentScore - Current score to compare
   * @returns {{
   *   hasBaseline: boolean;
   *   baseline: number | null;
   *   current: number;
   *   delta: number | null;
   *   regression: boolean;
   *   improvement: boolean;
   *   trend: 'improving' | 'declining' | 'stable' | 'unknown';
   *   history?: Array<{ score: number; timestamp: string; metadata: Record<string, unknown> }>;
   * }} Comparison result; `delta` is null and `history` absent when no baseline exists
   */
  compare(testName, currentScore) {
    const baselines = this._loadBaselines();
    const testData = baselines[testName];

    if (!testData || testData.baseline === null) {
      return {
        hasBaseline: false,
        baseline: null,
        current: currentScore,
        delta: null,
        regression: false,
        improvement: false,
        trend: 'unknown'
      };
    }

    const baseline = testData.baseline;
    const delta = currentScore - baseline;
    const regression = delta < -1; // Score dropped by more than 1 point
    const improvement = delta > 1; // Score improved by more than 1 point

    // Calculate trend from recent history
    const recentScores = testData.history.slice(-10).map(e => e.score);
    const trend = recentScores.length >= 3
      ? (recentScores[recentScores.length - 1] > recentScores[0] ? 'improving' :
         recentScores[recentScores.length - 1] < recentScores[0] ? 'declining' : 'stable')
      : 'unknown';

    return {
      hasBaseline: true,
      baseline,
      current: currentScore,
      delta,
      regression,
      improvement,
      trend,
      history: testData.history.slice(-10) // Last 10 scores
    };
  }

  /**
   * Update baseline (e.g., after fixing issues)
   *
   * @param {string} testName - Name of the test
   * @param {number | null} [newBaseline=null] - New baseline score, or null to use current score
   * @returns {boolean} True if baseline was updated
   */
  updateBaseline(testName, newBaseline = null) {
    const baselines = this._loadBaselines();
    if (!baselines[testName]) {
      return false;
    }

    if (newBaseline === null) {
      // Use current score as new baseline
      newBaseline = baselines[testName].current;
    }

    // BUG FIX: previously a null current score was silently promoted to
    // a null baseline while still reporting success.
    if (newBaseline == null) {
      return false;
    }

    baselines[testName].baseline = newBaseline;
    baselines[testName].baselineSetAt = new Date().toISOString();
    this._saveBaselines(baselines);
    return true;
  }

  /**
   * Get all baselines keyed by test name.
   *
   * @returns {Record<string, object>} Full baseline map
   */
  getAll() {
    return this._loadBaselines();
  }

  /**
   * Get baseline stats across all tracked tests.
   *
   * @returns {{
   *   totalTests: number;
   *   testsWithBaselines: number;
   *   testsWithRegressions: number;
   *   testsWithImprovements: number;
   *   averageScore: number;
   *   averageBaseline: number;
   * }} Aggregated statistics (averages are 0 when no data exists)
   */
  getStats() {
    const baselines = this._loadBaselines();
    const stats = {
      totalTests: Object.keys(baselines).length,
      testsWithBaselines: 0,
      testsWithRegressions: 0,
      testsWithImprovements: 0,
      averageScore: 0,
      averageBaseline: 0
    };

    let totalScore = 0;
    let totalBaseline = 0;
    let count = 0;

    for (const [testName, testData] of Object.entries(baselines)) {
      if (testData.baseline === null) {
        continue;
      }
      stats.testsWithBaselines++;
      totalBaseline += testData.baseline;

      if (testData.current !== null) {
        totalScore += testData.current;
        count++;

        const comparison = this.compare(testName, testData.current);
        if (comparison.regression) {
          stats.testsWithRegressions++;
        }
        if (comparison.improvement) {
          stats.testsWithImprovements++;
        }
      }
    }

    if (count > 0) {
      stats.averageScore = totalScore / count;
    }
    // BUG FIX: averageBaseline was previously only computed when at
    // least one test had a current score, leaving it 0 for
    // baseline-only data even though baselines existed.
    if (stats.testsWithBaselines > 0) {
      stats.averageBaseline = totalBaseline / stats.testsWithBaselines;
    }

    return stats;
  }
}
277
+