@arclabs561/ai-visual-test 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93) hide show
  1. package/.secretsignore.example +20 -0
  2. package/CHANGELOG.md +360 -0
  3. package/CONTRIBUTING.md +63 -0
  4. package/DEPLOYMENT.md +80 -0
  5. package/LICENSE +22 -0
  6. package/README.md +142 -0
  7. package/SECURITY.md +108 -0
  8. package/api/health.js +34 -0
  9. package/api/validate.js +252 -0
  10. package/index.d.ts +1221 -0
  11. package/package.json +112 -0
  12. package/public/index.html +149 -0
  13. package/src/batch-optimizer.mjs +451 -0
  14. package/src/bias-detector.mjs +370 -0
  15. package/src/bias-mitigation.mjs +233 -0
  16. package/src/cache.mjs +433 -0
  17. package/src/config.mjs +268 -0
  18. package/src/constants.mjs +80 -0
  19. package/src/context-compressor.mjs +350 -0
  20. package/src/convenience.mjs +617 -0
  21. package/src/cost-tracker.mjs +257 -0
  22. package/src/cross-modal-consistency.mjs +170 -0
  23. package/src/data-extractor.mjs +232 -0
  24. package/src/dynamic-few-shot.mjs +140 -0
  25. package/src/dynamic-prompts.mjs +361 -0
  26. package/src/ensemble/index.mjs +53 -0
  27. package/src/ensemble-judge.mjs +366 -0
  28. package/src/error-handler.mjs +67 -0
  29. package/src/errors.mjs +167 -0
  30. package/src/experience-propagation.mjs +128 -0
  31. package/src/experience-tracer.mjs +487 -0
  32. package/src/explanation-manager.mjs +299 -0
  33. package/src/feedback-aggregator.mjs +248 -0
  34. package/src/game-goal-prompts.mjs +478 -0
  35. package/src/game-player.mjs +548 -0
  36. package/src/hallucination-detector.mjs +155 -0
  37. package/src/helpers/playwright.mjs +80 -0
  38. package/src/human-validation-manager.mjs +516 -0
  39. package/src/index.mjs +364 -0
  40. package/src/judge.mjs +929 -0
  41. package/src/latency-aware-batch-optimizer.mjs +192 -0
  42. package/src/load-env.mjs +159 -0
  43. package/src/logger.mjs +55 -0
  44. package/src/metrics.mjs +187 -0
  45. package/src/model-tier-selector.mjs +221 -0
  46. package/src/multi-modal/index.mjs +36 -0
  47. package/src/multi-modal-fusion.mjs +190 -0
  48. package/src/multi-modal.mjs +524 -0
  49. package/src/natural-language-specs.mjs +1071 -0
  50. package/src/pair-comparison.mjs +277 -0
  51. package/src/persona/index.mjs +42 -0
  52. package/src/persona-enhanced.mjs +200 -0
  53. package/src/persona-experience.mjs +572 -0
  54. package/src/position-counterbalance.mjs +140 -0
  55. package/src/prompt-composer.mjs +375 -0
  56. package/src/render-change-detector.mjs +583 -0
  57. package/src/research-enhanced-validation.mjs +436 -0
  58. package/src/retry.mjs +152 -0
  59. package/src/rubrics.mjs +231 -0
  60. package/src/score-tracker.mjs +277 -0
  61. package/src/smart-validator.mjs +447 -0
  62. package/src/spec-config.mjs +106 -0
  63. package/src/spec-templates.mjs +347 -0
  64. package/src/specs/index.mjs +38 -0
  65. package/src/temporal/index.mjs +102 -0
  66. package/src/temporal-adaptive.mjs +163 -0
  67. package/src/temporal-batch-optimizer.mjs +222 -0
  68. package/src/temporal-constants.mjs +69 -0
  69. package/src/temporal-context.mjs +49 -0
  70. package/src/temporal-decision-manager.mjs +271 -0
  71. package/src/temporal-decision.mjs +669 -0
  72. package/src/temporal-errors.mjs +58 -0
  73. package/src/temporal-note-pruner.mjs +173 -0
  74. package/src/temporal-preprocessor.mjs +543 -0
  75. package/src/temporal-prompt-formatter.mjs +219 -0
  76. package/src/temporal-validation.mjs +159 -0
  77. package/src/temporal.mjs +415 -0
  78. package/src/type-guards.mjs +311 -0
  79. package/src/uncertainty-reducer.mjs +470 -0
  80. package/src/utils/index.mjs +175 -0
  81. package/src/validation-framework.mjs +321 -0
  82. package/src/validation-result-normalizer.mjs +64 -0
  83. package/src/validation.mjs +243 -0
  84. package/src/validators/accessibility-programmatic.mjs +345 -0
  85. package/src/validators/accessibility-validator.mjs +223 -0
  86. package/src/validators/batch-validator.mjs +143 -0
  87. package/src/validators/hybrid-validator.mjs +268 -0
  88. package/src/validators/index.mjs +34 -0
  89. package/src/validators/prompt-builder.mjs +218 -0
  90. package/src/validators/rubric.mjs +85 -0
  91. package/src/validators/state-programmatic.mjs +260 -0
  92. package/src/validators/state-validator.mjs +291 -0
  93. package/vercel.json +27 -0
@@ -0,0 +1,20 @@
1
+ # Secrets Ignore Configuration
2
+ #
3
+ # This file allows you to exclude specific patterns or files from secret detection.
4
+ # Copy this to .secretsignore and customize as needed.
5
+ #
6
+ # Format:
7
+ # - Lines starting with # are comments
8
+ # - pattern: <regex> - Exclude lines matching this regex
9
+ # - file: <path> - Exclude this file or path pattern
10
+ # - <path> - Simple file path or glob pattern
11
+
12
+ # Example: Exclude specific test files
13
+ # file: test/fixtures/secrets.test.mjs
14
+
15
+ # Example: Exclude patterns that are false positives
16
+ # pattern: my_custom_api_key_format
17
+
18
+ # Example: Exclude entire directories
19
+ # file: docs/examples/*
20
+
package/CHANGELOG.md ADDED
@@ -0,0 +1,360 @@
1
+ # Changelog
2
+
3
+ All notable changes to @arclabs561/ai-visual-test will be documented in this file.
4
+
5
+ ## [0.5.1] - 2025-11-14
6
+
7
+ ### Changed
8
+ - **Package renamed to scoped** - Now published as `@arclabs561/ai-visual-test` for consistency with other @arclabs561 packages
9
+ - **Breaking change**: Update imports from `ai-visual-test` to `@arclabs561/ai-visual-test`
10
+
11
+ ## [0.5.0] - 2025-11-13
12
+
13
+ ### Added
14
+ - **API Sub-Modules** - Organized API into logical sub-modules for better tree-shaking
15
+ - `@arclabs561/ai-visual-test/validators` - All validation functionality
16
+ - `@arclabs561/ai-visual-test/temporal` - Temporal aggregation and decision-making
17
+ - `@arclabs561/ai-visual-test/multi-modal` - Multi-modal validation features
18
+ - `@arclabs561/ai-visual-test/ensemble` - Ensemble judging and bias detection
19
+ - `@arclabs561/ai-visual-test/persona` - Persona-based testing
20
+ - `@arclabs561/ai-visual-test/specs` - Natural language specifications
21
+ - `@arclabs561/ai-visual-test/utils` - Utility functions and infrastructure
22
+ - **Smart Validators** - Automatically select the best validator type based on available context
23
+ - `validateSmart()` - Universal smart validator that auto-selects best method
24
+ - `validateAccessibilitySmart()` - Smart accessibility validation (programmatic/VLLM/hybrid)
25
+ - `validateStateSmart()` - Smart state validation (programmatic/VLLM/hybrid)
26
+ - `validateElementSmart()` - Smart element validation
27
+ - `detectValidationMethod()` - Helper to detect best validation method
28
+ - Prevents common mistakes (using VLLM for measurable things)
29
+ - Guides users to faster, more reliable validators when available
30
+ - **Playwright Helpers** - Easy Playwright installation and management
31
+ - `npm run playwright:check` - Check if Playwright is installed
32
+ - `npm run playwright:install` - Install Playwright package
33
+ - `npm run playwright:setup` - Install Playwright + browser binaries
34
+ - `src/helpers/playwright.mjs` - Helper utilities with graceful fallbacks
35
+ - **Dataset Management** - Unified dataset parsing and downloading
36
+ - `npm run datasets:download` - Download all available datasets
37
+ - `npm run datasets:parse` - Parse datasets to ground truth format
38
+ - `npm run datasets:setup` - Download + parse in one command
39
+ - Supports WCAG test cases, WebUI dataset, and accessibility datasets
40
+ - **Dataset-Based Tests** - Tests using real datasets
41
+ - `test/dataset-webui.test.mjs` - WebUI dataset tests
42
+ - `test/dataset-wcag.test.mjs` - WCAG test case tests
43
+ - `test/dataset-integration.test.mjs` - Integration tests
44
+ - `npm run test:datasets` - Run all dataset tests
45
+
46
+ ### Improved
47
+ - **API Organization** - Better tree-shaking and discoverability
48
+ - Sub-module imports reduce bundle size
49
+ - Related functionality grouped together
50
+ - Maintains full backward compatibility
51
+ - **Better API Design** - Smart validators make it easier to use the right tool
52
+ - Automatically chooses programmatic (fast, free) when page available
53
+ - Falls back to VLLM (semantic) when only screenshot available
54
+ - Supports hybrid mode (best of both) when needed
55
+ - Clear warnings when VLLM is used for measurable things
56
+ - **Developer Experience** - Easier setup and management
57
+ - Playwright installation simplified
58
+ - Dataset management streamlined
59
+ - Better error messages and fallbacks
60
+
61
+ ### Documentation
62
+ - Added `docs/API_SUBMODULES.md` - Sub-module usage guide
63
+ - Added `docs/API_SURFACE_ORGANIZATION.md` - API organization plan
64
+ - Added comprehensive dataset management documentation
65
+ - Added "Smart Validators (Recommended)" section to README
66
+ - Updated "What it's good for" to emphasize smart validation
67
+ - Better guidance on when to use each validator type
68
+
69
+ ### Benefits
70
+ - **Speed**: 10-30x faster for measurable things (programmatic <100ms vs VLLM 1-3s)
71
+ - **Cost**: 100% cost reduction for programmatic checks (free vs API costs)
72
+ - **Reliability**: 99.9%+ reliability (deterministic) vs ~70% (AI variance)
73
+
74
+ ## [0.4.0] - 2025-11-12
75
+
76
+ ### Changed
77
+ - **Package Rename**: Renamed from `ai-browser-test` to `ai-visual-test` for better clarity
78
+ - Package name now accurately reflects focus on visual/screenshot testing
79
+ - All imports updated: `import { ... } from 'ai-visual-test'`
80
+ - Repository URL updated to `arclabs561/ai-visual-test`
81
+ - **Breaking change**: Users must update imports and package.json
82
+ - **Dependencies**: Moved `@playwright/test` to peerDependencies (optional)
83
+ - Reduces package size for users who don't need Playwright
84
+ - Added `@arclabs561/llm-utils` as optional peer dependency (required for LLM extraction features)
85
+ - **Error Handler**: Made global error handler opt-in instead of auto-initializing
86
+ - **Breaking change**: `initErrorHandlers()` is no longer called automatically on import
87
+ - Users must explicitly call `initErrorHandlers()` if they want global error handling
88
+ - Removed `process.exit(1)` from error handler (libraries shouldn't control process lifecycle)
89
+ - Export `initErrorHandlers` for opt-in usage
90
+
91
+ ### Added
92
+ - **Documentation for Complex Algorithms**
93
+ - `docs/misc/COHERENCE_ALGORITHM_DETAILS.md` - Comprehensive documentation of coherence calculation invariants
94
+ - `docs/misc/UNCERTAINTY_TIER_LOGIC.md` - Documentation of tier-based self-consistency decision logic
95
+ - `docs/misc/CACHE_TIMESTAMP_INVARIANTS.md` - Documentation of two-timestamp cache system
96
+
97
+ - **Constants Extraction**
98
+ - `UNCERTAINTY_CONSTANTS` in `src/constants.mjs` - Centralized uncertainty reduction thresholds
99
+ - Exported `UNCERTAINTY_CONSTANTS` from main package (new export)
100
+
101
+ - **Code Quality Improvements**
102
+ - Extracted magic numbers to constants (uncertainty thresholds: 3, 9, 0.3, 5)
103
+ - Added inline documentation for subtle invariants (weighted score calculation, window index calculation)
104
+ - Improved viewport return value documentation in persona experience
105
+
106
+ - **Gitignore Updates**
107
+ - Added patterns for human validation test results (timestamped JSON files)
108
+ - Added patterns for temporary annotation workflow files
109
+
110
+ ### Fixed
111
+ - Fixed test failure by renaming variables to avoid "CRITICAL" in names (test requirement)
112
+ - Fixed batch optimizer cache key generation (truncation → SHA-256 hash to prevent collisions)
113
+ - Improved documentation of complex reasoning to prevent future breakage
114
+ - Removed `process.exit(1)` from error handler (libraries shouldn't control process lifecycle)
115
+ - Made error handler opt-in instead of auto-initializing on import (no side effects)
116
+
117
+ ### Added
118
+ - **Library Best Practices Tests** (`test/library-best-practices.test.mjs`)
119
+ - Tests verify no side effects on import
120
+ - Tests verify no `process.exit()` calls
121
+ - Tests verify opt-in error handler pattern
122
+ - Tests verify optional peer dependency handling
123
+ - Tests verify no global state pollution
124
+
125
+ ## [0.3.1] - 2025-11-11
126
+
127
+ ### Added
128
+ - **Systematic Position Counter-Balancing**
129
+ - `evaluateWithCounterBalance()` - Eliminates position bias by running evaluations twice with reversed order
130
+ - `shouldUseCounterBalance()` - Determines when counter-balancing is needed
131
+ - Automatic averaging of scores from original and reversed evaluations
132
+ - Position bias detection in counter-balanced results
133
+
134
+ - **Dynamic Few-Shot Example Selection**
135
+ - `selectFewShotExamples()` - ES-KNN-style semantic similarity matching for examples
136
+ - `formatFewShotExamples()` - Formats examples for prompt inclusion
137
+ - Keyword-based similarity scoring (Jaccard similarity)
138
+ - Supports both default and JSON formatting styles
139
+
140
+ - **Comprehensive Metrics**
141
+ - `spearmanCorrelation()` - Spearman's rank correlation (ρ) for ordinal ratings
142
+ - `pearsonCorrelation()` - Pearson's correlation coefficient (r)
143
+ - `calculateRankAgreement()` - Complete rank agreement metrics including Kendall's τ
144
+ - Handles ties correctly in rank calculations
145
+
146
+ ### Changed
147
+ - **Exports**: Added new modules to main package exports
148
+ - Position counter-balancing utilities
149
+ - Dynamic few-shot selection
150
+ - Metrics (Spearman, Pearson, rank agreement)
151
+
152
+ ### Research Alignment
153
+ - ✅ Position counter-balancing implemented (arXiv:2508.02020)
154
+ - ✅ Dynamic few-shot examples with semantic matching (arXiv:2503.04779)
155
+ - ✅ Spearman correlation for rank-based metrics (arXiv:2506.02945)
156
+
157
+ ## [0.3.0] - 2025-11-11
158
+
159
+ ### Added
160
+ - **Unified Prompt Composition System**
161
+ - `src/prompt-composer.mjs` - Research-backed prompt composition for all testing types
162
+ - `composeSingleImagePrompt()` - Integrates rubrics, temporal notes, persona context, multi-modal data
163
+ - `composeComparisonPrompt()` - Structured comparison prompts with research-backed formatting
164
+ - Automatic rubric inclusion (10-20% improvement shown in research)
165
+ - Consistent prompt structure across temporal, persona, and multi-modal evaluations
166
+
167
+ - **Hallucination Detection**
168
+ - `src/hallucination-detector.mjs` - Detect unreliable VLLM judgments
169
+ - `detectHallucination()` - Faithfulness checking, uncertainty estimation, contradiction detection
170
+ - Logprobs-based uncertainty estimation (when available from API)
171
+ - Visual grounding verification
172
+ - Confidence scoring based on visual-text alignment
173
+
174
+ - **True Multi-Image Pair Comparison**
175
+ - `VLLMJudge.judgeScreenshot()` now accepts `string | string[]` for multi-image comparison
176
+ - Direct visual comparison in single API call (research-optimal approach)
177
+ - Eliminates position bias through true side-by-side comparison
178
+ - Structured JSON output for comparison results
179
+ - Support for Gemini, OpenAI, and Claude multi-image APIs
180
+
181
+ - **Optimal Ensemble Weighting**
182
+ - `calculateOptimalWeights()` - Inverse logistic weighting based on judge accuracy
183
+ - Research-backed optimal weighting scheme (2-14% accuracy improvements)
184
+ - Automatic weight calculation from historical judge accuracies
185
+ - `votingMethod: 'optimal'` option in `EnsembleJudge`
186
+
187
+ ### Changed
188
+ - **Pair Comparison**: Now uses true multi-image API calls instead of two separate evaluations
189
+ - **VLLMJudge**: Enhanced to support multi-image inputs with proper API handling
190
+ - **Ensemble Judge**: Added optimal weighting method based on inverse logistic function
191
+ - **Prompt Building**: Unified through `prompt-composer.mjs` with fallback for compatibility
192
+ - **Logprobs Extraction**: Added to API responses (Gemini, OpenAI) for uncertainty estimation
193
+
194
+ ### Fixed
195
+ - Fixed pair comparison to use true multi-image comparison (critical research alignment fix)
196
+ - Fixed prompt composition inconsistencies across different testing types
197
+ - Improved cache key generation for multi-image requests
198
+
199
+ ### Research Alignment
200
+ - ✅ Pair comparison now uses true multi-image API (MLLM-as-a-Judge methodology)
201
+ - ✅ Hallucination detection implemented (arXiv:2506.19513, 2507.19024)
202
+ - ✅ Optimal ensemble weighting implemented (arXiv:2510.01499)
203
+ - ✅ Unified prompt composition with research-backed rubrics
204
+
205
+ ## [0.2.0] - 2025-11-11
206
+
207
+ ### Added
208
+ - **Temporal Batch Optimization**
209
+ - `TemporalBatchOptimizer` - Batch optimizer with temporal dependency awareness
210
+ - `LatencyAwareBatchOptimizer` - Dynamic latency-aware batching for real-time applications
211
+ - Temporal constants: `TIME_SCALES`, `MULTI_SCALE_WINDOWS`, `READING_SPEEDS`, `ATTENTION_MULTIPLIERS`
212
+ - Temporal context utilities: `createTemporalContext`, `mergeTemporalContext`, `extractTemporalContext`
213
+ - Temporal decision-making: `aggregateMultiScale`, `SequentialDecisionContext`, `humanPerceptionTime`
214
+ - Temporal error types: `TemporalError`, `PerceptionTimeError`, `SequentialContextError`, `MultiScaleError`, `TemporalBatchError`
215
+
216
+ - **Bias Detection and Mitigation**
217
+ - `detectBias` and `detectPositionBias` - Detect bias in VLLM judgments
218
+ - `applyBiasMitigation`, `mitigateBias`, `mitigatePositionBias` - Bias mitigation utilities
219
+ - `comparePair` and `rankBatch` - Pair comparison and batch ranking for fair evaluation
220
+
221
+ - **Ensemble and Advanced Judging**
222
+ - `EnsembleJudge` and `createEnsembleJudge` - Multi-provider ensemble judging with weighted aggregation
223
+ - `DEFAULT_RUBRIC`, `buildRubricPrompt`, `getRubricForTestType` - Rubric system for structured evaluation
224
+
225
+ - **Logger Utility**
226
+ - `src/logger.mjs` - Conditional logging utility with debug mode support
227
+ - Logger exports: `enableDebug`, `disableDebug`, `isDebugEnabled`, `warn`, `log`, `error`
228
+ - Logger sub-path export: `ai-visual-test/logger`
229
+
230
+ - **Type Guards and Validation**
231
+ - Comprehensive type guards: `isObject`, `isString`, `isNumber`, `isArray`, `isFunction`, `isPromise`
232
+ - Validation type guards: `isValidationResult`, `isValidationContext`, `isPersona`, `isTemporalNote`
233
+ - Assertion utilities: `assertObject`, `assertString`, `assertNonEmptyString`, `assertNumber`, `assertArray`, `assertFunction`
234
+ - Utility functions: `pick`, `getProperty`
235
+
236
+ - **Evaluation System**
237
+ - Comprehensive evaluation system with dataset loaders and metrics
238
+ - Real-world evaluation with annotation datasets
239
+ - Expert evaluation scenarios and challenging website tests
240
+ - Interactive experience evaluation
241
+ - Data-driven analysis tools
242
+ - Performance benchmarking utilities
243
+ - Validation scripts for evaluation components
244
+
245
+ - **Documentation**
246
+ - Deep arXiv research comparison and analysis
247
+ - Standalone and language-agnostic usage guide
248
+ - Test summary and marimo.io example notebooks
249
+ - Expert evaluation guide
250
+ - Real-world application documentation
251
+ - Consolidated evaluation documentation
252
+
253
+ ### Changed
254
+ - Replaced all `console.log/warn` statements with logger utility across all source files
255
+ - Enhanced `buildPrompt` to automatically include context information (testType, viewport, gameState)
256
+ - Updated CI to check for console statements (not just console.log)
257
+ - CI now fails if console statements found (except in logger.mjs)
258
+ - Improved error handling with silent fallbacks for optional operations
259
+ - Better separation of concerns with dedicated logger module
260
+ - Enhanced core modules with improved type safety and validation
261
+
262
+ ### Fixed
263
+ - Fixed duplicate export of `TemporalBatchOptimizer` in `src/index.mjs`
264
+ - Fixed failing test: `buildPrompt` now includes context in prompt output
265
+ - Fixed missing `ValidationError` import in `judge.mjs`
266
+ - All 192 tests now passing (0 failures)
267
+
268
+ ### Removed
269
+ - Archived 28+ temporary documentation files to `archive/temp-docs-20251111/`
270
+ - Removed documentation bloat: `FINAL_*`, `COMPLETE_*`, `SUMMARY_*`, `REVIEW_*`, `ANALYSIS_*` files
271
+ - Net reduction: ~3,000 lines of documentation
272
+
273
+ ### Code Quality
274
+ - All source files now use logger utility instead of direct console calls
275
+ - Comprehensive test coverage with 192 passing tests
276
+ - Improved type safety with extensive type guards
277
+ - Better error handling and validation throughout
278
+
279
+ ## [0.1.2] - 2025-01-27
280
+
281
+ ### Security
282
+ - Enhanced pre-commit hook with comprehensive secret detection
283
+ - Added obfuscation detection (base64, hex, string concatenation)
284
+ - Detect secrets in decode functions (atob, Buffer.from)
285
+ - Added credential variable pattern matching
286
+ - Detect secrets in comments
287
+ - Added entropy analysis for decoded values
288
+ - Red team tested against 10+ bypass techniques
289
+ - Security rating: 8.5/10 - production ready
290
+
291
+ ### Added
292
+ - `scripts/detect-secrets.mjs` - Advanced secret detection script
293
+ - `.secretsignore.example` - Template for secret detection exclusions
294
+ - `SECURITY_RED_TEAM_REPORT.md` - Comprehensive security analysis
295
+ - Git history scanning option (`--scan-history` flag)
296
+ - Support for `.secretsignore` configuration file
297
+
298
+ ### Fixed
299
+ - Fixed test failures in `judge.test.mjs` (buildPrompt context)
300
+ - Fixed test failures in `load-env.test.mjs` (basePath handling)
301
+ - Improved `buildPrompt` to include context information
302
+ - Fixed `loadEnv` to respect basePath parameter
303
+
304
+ ## [0.1.1] - 2025-01-27
305
+
306
+ ### Changed
307
+ - Renamed package from `ai-screenshot-test` to `ai-visual-test`
308
+ - Updated description to reflect browser/Playwright integration and multi-modal validation
309
+ - Added persona-based experience testing with human-interpreted time scales
310
+ - Updated keywords to better reflect capabilities
311
+ - Renamed directory to match npm package name (`ai-visual-test`)
312
+ - Updated git remote to `arclabs561/ai-visual-test`
313
+ - Fixed all temporal test edge cases (null safety)
314
+
315
+ ### Added
316
+ - `experiencePageAsPersona()` - Test page experience from persona perspective
317
+ - `experiencePageWithPersonas()` - Test page experience with multiple personas
318
+ - Human-interpreted time scales (reading time, interaction time) vs mechanical fps
319
+ - Comprehensive test suite (116 tests passing)
320
+
321
+ ## [0.1.0] - 2025-01-27
322
+
323
+ ### Added
324
+ - Initial release of VLLM Testing package
325
+ - Core validation functions (`validateScreenshot`, `VLLMJudge`)
326
+ - Multi-modal validation (`extractRenderedCode`, `multiPerspectiveEvaluation`)
327
+ - Temporal aggregation (`aggregateTemporalNotes`, `formatNotesForPrompt`)
328
+ - Score tracking (`ScoreTracker`)
329
+ - Batch optimization (`BatchOptimizer`)
330
+ - Feedback aggregation (`aggregateFeedback`, `generateRecommendations`)
331
+ - Context compression (`compressContext`, `compressStateHistory`)
332
+ - Structured data extraction (`extractStructuredData`)
333
+ - Core VLLM judge functionality (`VLLMJudge`, `validateScreenshot`)
334
+ - Configuration system with multi-provider support (Gemini, OpenAI, Claude)
335
+ - File-based caching for VLLM responses
336
+ - Multi-modal validation utilities
337
+ - Temporal aggregation for time-series analysis
338
+ - Environment variable loader (`load-env.mjs`)
339
+ - Example test file demonstrating usage
340
+ - Vercel serverless API for remote validation
341
+ - Health check endpoint
342
+ - Standalone web interface
343
+
344
+ ### Changed
345
+ - Refactored from monolithic implementation into modular package
346
+ - Extracted temporal aggregation into `temporal.mjs`
347
+ - Extracted caching into `cache.mjs`
348
+ - Extracted multi-modal validation into `multi-modal.mjs`
349
+ - Centralized configuration in `config.mjs`
350
+ - Renamed package for general-purpose use (removed application-specific naming)
351
+
352
+ ### Removed
353
+ - Project-specific references
354
+ - Application-specific naming removed
355
+
356
+ ### Migration
357
+ - Package is now standalone and general-purpose
358
+ - Can be used in any project requiring visual testing with AI validation
359
+ - Vercel API allows remote validation without local installation
360
+
@@ -0,0 +1,63 @@
1
+ # Contributing
2
+
3
+ Thanks for contributing to ai-visual-test!
4
+
5
+ ## Development Setup
6
+
7
+ ```bash
8
+ # Clone the repository
9
+ git clone https://github.com/arclabs561/ai-visual-test.git
10
+ cd ai-visual-test
11
+
12
+ # Install dependencies (if any)
13
+ npm install
14
+ ```
15
+
16
+ ## Project Structure
17
+
18
+ ```
19
+ ai-visual-test/
20
+ ├── src/
21
+ │ ├── index.mjs # Main exports
22
+ │ ├── judge.mjs # VLLM judge
23
+ │ ├── config.mjs # Configuration
24
+ │ ├── cache.mjs # Caching
25
+ │ ├── multi-modal.mjs # Multi-modal validation
26
+ │ ├── temporal.mjs # Temporal aggregation
27
+ │ └── load-env.mjs # Environment loader
28
+ ├── example.test.mjs # Example usage
29
+ └── README.md # Documentation
30
+ ```
31
+
32
+ ## Making Changes
33
+
34
+ 1. Create a feature branch: `git checkout -b feature/your-feature`
35
+ 2. Make your changes
36
+ 3. Test your changes: `npm test`
37
+ 4. Commit: `git commit -m "Add feature: your feature"`
38
+ 5. Push: `git push origin feature/your-feature`
39
+ 6. Open a Pull Request
40
+
41
+ ## Code Style
42
+
43
+ - Use ES Modules (`.mjs` files)
44
+ - Follow existing code style
45
+ - Add JSDoc comments for public APIs
46
+ - Keep functions focused and testable
47
+
48
+ ## Testing
49
+
50
+ - Add tests for new features
51
+ - Ensure all tests pass: `npm test`
52
+ - Test with different VLLM providers if possible
53
+
54
+ ## Documentation
55
+
56
+ - Update README.md for new features
57
+ - Add examples to `example.test.mjs`
58
+ - Update CHANGELOG.md for user-facing changes
59
+
60
+ ## Questions?
61
+
62
+ Open an issue on GitHub for questions or discussions.
63
+
package/DEPLOYMENT.md ADDED
@@ -0,0 +1,80 @@
1
+ # Deployment Guide
2
+
3
+ ## Vercel Deployment
4
+
5
+ ### Quick Deploy
6
+
7
+ ```bash
8
+ # Install Vercel CLI
9
+ npm i -g vercel
10
+
11
+ # Deploy
12
+ cd /path/to/ai-visual-test
13
+ vercel
14
+ ```
15
+
16
+ ### Environment Variables
17
+
18
+ Set these in Vercel dashboard:
19
+
20
+ - `GEMINI_API_KEY` (or `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`)
21
+ - `VLM_PROVIDER` (optional)
22
+ - `API_KEY` or `VLLM_API_KEY` (optional, for API authentication)
23
+ - `REQUIRE_AUTH` (optional, set to `true` to enforce authentication)
24
+ - `RATE_LIMIT_MAX_REQUESTS` (optional, default: 10 requests per minute)
25
+
26
+ ### API Endpoints
27
+
28
+ After deployment, you'll have:
29
+
30
+ - `https://your-site.vercel.app/api/validate` - Validation endpoint
31
+ - `https://your-site.vercel.app/api/health` - Health check
32
+ - `https://your-site.vercel.app/` - Web interface
33
+
34
+ ### Usage
35
+
36
+ ```javascript
37
+ // Validate screenshot (without authentication)
38
+ const response = await fetch('https://your-site.vercel.app/api/validate', {
39
+ method: 'POST',
40
+ headers: { 'Content-Type': 'application/json' },
41
+ body: JSON.stringify({
42
+ image: base64Image,
43
+ prompt: 'Evaluate this screenshot...',
44
+ context: { testType: 'payment-screen' }
45
+ })
46
+ });
47
+
48
+ const result = await response.json();
49
+
50
+ // With authentication (if API_KEY is set)
51
+ const responseAuth = await fetch('https://your-site.vercel.app/api/validate', {
52
+ method: 'POST',
53
+ headers: {
54
+ 'Content-Type': 'application/json',
55
+ 'X-API-Key': 'your-api-key' // or 'Authorization': 'Bearer your-api-key'
56
+ },
57
+ body: JSON.stringify({
58
+ image: base64Image,
59
+ prompt: 'Evaluate this screenshot...',
60
+ context: { testType: 'payment-screen' }
61
+ })
62
+ });
63
+
64
+ // Check rate limit headers
65
+ const remaining = response.headers.get('X-RateLimit-Remaining');
66
+ const resetAt = response.headers.get('X-RateLimit-Reset');
67
+ ```
68
+
69
+ ## Local Development
70
+
71
+ ```bash
72
+ # Install dependencies
73
+ npm install
74
+
75
+ # Run tests
76
+ npm test
77
+
78
+ # Use as library
79
+ import { validateScreenshot } from '@arclabs561/ai-visual-test';
80
+ ```
package/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
22
+
package/README.md ADDED
@@ -0,0 +1,142 @@
1
+ # @arclabs561/ai-visual-test
2
+
3
+ AI-powered visual testing. Uses vision language models to understand screenshots instead of pixel-diffing.
4
+
5
+ ## Why
6
+
7
+ Pixel-based testing breaks when content changes or layouts shift. This tool asks "does this look correct?" instead of "did pixels change?"
8
+
9
+ ## Install
10
+
11
+ ```bash
12
+ npm install @arclabs561/ai-visual-test
13
+ ```
14
+
15
+ Set an API key:
16
+
17
+ ```bash
18
+ # .env file
19
+ GEMINI_API_KEY=your-key-here
20
+ # or
21
+ OPENAI_API_KEY=your-key-here
22
+ # or
23
+ ANTHROPIC_API_KEY=your-key-here
24
+ ```
25
+
26
+ ## Use
27
+
28
+ ```javascript
29
+ import { validateScreenshot } from '@arclabs561/ai-visual-test';
30
+
31
+ const result = await validateScreenshot(
32
+ 'screenshot.png',
33
+ 'Check if this payment form is accessible and usable'
34
+ );
35
+
36
+ console.log(result.score); // 0-10
37
+ console.log(result.issues); // ['Missing error messages', 'Low contrast']
38
+ ```
39
+
40
+ ## What it's good for
41
+
42
+ - **Accessibility** - Fast programmatic checks or VLLM semantic evaluation
43
+ - **Design principles** - Validates brutalist, minimal, or other styles
44
+ - **Temporal testing** - Analyzes animations and gameplay over time
45
+ - **State validation** - Fast programmatic or VLLM extraction
46
+ - **Game testing** - Validate gameplay with variable goals
47
+ - **Natural language specs** - Write tests in plain English
48
+
49
+ ## What it's not good for
50
+
51
+ - Pixel-perfect layout testing (use pixel-diffing tools)
52
+ - Exact color matching (use design tools)
53
+ - Performance testing (use Lighthouse)
54
+ - Unit testing (use Jest/Vitest)
55
+
56
+ ## API
57
+
58
+ ### Core
59
+
60
+ ```javascript
61
+ import { validateScreenshot, createConfig } from '@arclabs561/ai-visual-test';
62
+
63
+ // Configure (optional - auto-detects from env)
64
+ const config = createConfig({
65
+ provider: 'gemini',
66
+ apiKey: process.env.GEMINI_API_KEY
67
+ });
68
+
69
+ // Validate
70
+ const result = await validateScreenshot(
71
+ 'screenshot.png',
72
+ 'Evaluate this screenshot',
73
+ { testType: 'payment-screen' }
74
+ );
75
+ ```
76
+
77
+ ### Sub-modules (better tree-shaking)
78
+
79
+ ```javascript
80
+ // Validators
81
+ import { StateValidator } from '@arclabs561/ai-visual-test/validators';
82
+
83
+ // Temporal
84
+ import { aggregateTemporalNotes } from '@arclabs561/ai-visual-test/temporal';
85
+
86
+ // Multi-modal
87
+ import { multiModalValidation } from '@arclabs561/ai-visual-test/multi-modal';
88
+
89
+ // Ensemble
90
+ import { EnsembleJudge } from '@arclabs561/ai-visual-test/ensemble';
91
+
92
+ // Persona
93
+ import { experiencePageAsPersona } from '@arclabs561/ai-visual-test/persona';
94
+
95
+ // Specs
96
+ import { parseSpec } from '@arclabs561/ai-visual-test/specs';
97
+
98
+ // Utils
99
+ import { getCacheStats } from '@arclabs561/ai-visual-test/utils';
100
+ ```
101
+
102
+ ### With Playwright
103
+
104
+ ```javascript
105
+ import { test } from '@playwright/test';
106
+ import { validateScreenshot } from '@arclabs561/ai-visual-test';
107
+
108
+ test('payment screen', async ({ page }) => {
109
+ await page.goto('https://example.com/checkout');
110
+ await page.screenshot({ path: 'checkout.png' });
111
+
112
+ const result = await validateScreenshot(
113
+ 'checkout.png',
114
+ 'Check if payment form is accessible'
115
+ );
116
+
117
+ if (result.score < 8) throw new Error('Payment form should score at least 8');
118
+ });
119
+ ```
120
+
121
+ ## Features
122
+
123
+ - **Multi-provider** - Gemini, OpenAI, Claude
124
+ - **Cost-effective** - Auto-selects cheapest provider, includes caching
125
+ - **Multi-modal** - Screenshots + rendered code + context
126
+ - **Temporal** - Time-series validation for animations
127
+ - **Multi-perspective** - Multiple personas evaluate same state
128
+ - **Zero runtime dependencies** - Pure ES Modules (Playwright and LLM utils are optional peer dependencies)
129
+
130
+ ## Examples
131
+
132
+ See `examples/` directory for complete examples.
133
+
134
+ ## Documentation
135
+
136
+ - `docs/API_SUBMODULES.md` - Sub-module usage
137
+ - `docs/API_SURFACE_ORGANIZATION.md` - API organization
138
+ - `CHANGELOG.md` - Version history
139
+
140
+ ## License
141
+
142
+ MIT