@arclabs561/ai-visual-test 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.secretsignore.example +20 -0
- package/CHANGELOG.md +360 -0
- package/CONTRIBUTING.md +63 -0
- package/DEPLOYMENT.md +80 -0
- package/LICENSE +22 -0
- package/README.md +142 -0
- package/SECURITY.md +108 -0
- package/api/health.js +34 -0
- package/api/validate.js +252 -0
- package/index.d.ts +1221 -0
- package/package.json +112 -0
- package/public/index.html +149 -0
- package/src/batch-optimizer.mjs +451 -0
- package/src/bias-detector.mjs +370 -0
- package/src/bias-mitigation.mjs +233 -0
- package/src/cache.mjs +433 -0
- package/src/config.mjs +268 -0
- package/src/constants.mjs +80 -0
- package/src/context-compressor.mjs +350 -0
- package/src/convenience.mjs +617 -0
- package/src/cost-tracker.mjs +257 -0
- package/src/cross-modal-consistency.mjs +170 -0
- package/src/data-extractor.mjs +232 -0
- package/src/dynamic-few-shot.mjs +140 -0
- package/src/dynamic-prompts.mjs +361 -0
- package/src/ensemble/index.mjs +53 -0
- package/src/ensemble-judge.mjs +366 -0
- package/src/error-handler.mjs +67 -0
- package/src/errors.mjs +167 -0
- package/src/experience-propagation.mjs +128 -0
- package/src/experience-tracer.mjs +487 -0
- package/src/explanation-manager.mjs +299 -0
- package/src/feedback-aggregator.mjs +248 -0
- package/src/game-goal-prompts.mjs +478 -0
- package/src/game-player.mjs +548 -0
- package/src/hallucination-detector.mjs +155 -0
- package/src/helpers/playwright.mjs +80 -0
- package/src/human-validation-manager.mjs +516 -0
- package/src/index.mjs +364 -0
- package/src/judge.mjs +929 -0
- package/src/latency-aware-batch-optimizer.mjs +192 -0
- package/src/load-env.mjs +159 -0
- package/src/logger.mjs +55 -0
- package/src/metrics.mjs +187 -0
- package/src/model-tier-selector.mjs +221 -0
- package/src/multi-modal/index.mjs +36 -0
- package/src/multi-modal-fusion.mjs +190 -0
- package/src/multi-modal.mjs +524 -0
- package/src/natural-language-specs.mjs +1071 -0
- package/src/pair-comparison.mjs +277 -0
- package/src/persona/index.mjs +42 -0
- package/src/persona-enhanced.mjs +200 -0
- package/src/persona-experience.mjs +572 -0
- package/src/position-counterbalance.mjs +140 -0
- package/src/prompt-composer.mjs +375 -0
- package/src/render-change-detector.mjs +583 -0
- package/src/research-enhanced-validation.mjs +436 -0
- package/src/retry.mjs +152 -0
- package/src/rubrics.mjs +231 -0
- package/src/score-tracker.mjs +277 -0
- package/src/smart-validator.mjs +447 -0
- package/src/spec-config.mjs +106 -0
- package/src/spec-templates.mjs +347 -0
- package/src/specs/index.mjs +38 -0
- package/src/temporal/index.mjs +102 -0
- package/src/temporal-adaptive.mjs +163 -0
- package/src/temporal-batch-optimizer.mjs +222 -0
- package/src/temporal-constants.mjs +69 -0
- package/src/temporal-context.mjs +49 -0
- package/src/temporal-decision-manager.mjs +271 -0
- package/src/temporal-decision.mjs +669 -0
- package/src/temporal-errors.mjs +58 -0
- package/src/temporal-note-pruner.mjs +173 -0
- package/src/temporal-preprocessor.mjs +543 -0
- package/src/temporal-prompt-formatter.mjs +219 -0
- package/src/temporal-validation.mjs +159 -0
- package/src/temporal.mjs +415 -0
- package/src/type-guards.mjs +311 -0
- package/src/uncertainty-reducer.mjs +470 -0
- package/src/utils/index.mjs +175 -0
- package/src/validation-framework.mjs +321 -0
- package/src/validation-result-normalizer.mjs +64 -0
- package/src/validation.mjs +243 -0
- package/src/validators/accessibility-programmatic.mjs +345 -0
- package/src/validators/accessibility-validator.mjs +223 -0
- package/src/validators/batch-validator.mjs +143 -0
- package/src/validators/hybrid-validator.mjs +268 -0
- package/src/validators/index.mjs +34 -0
- package/src/validators/prompt-builder.mjs +218 -0
- package/src/validators/rubric.mjs +85 -0
- package/src/validators/state-programmatic.mjs +260 -0
- package/src/validators/state-validator.mjs +291 -0
- package/vercel.json +27 -0
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# Secrets Ignore Configuration
|
|
2
|
+
#
|
|
3
|
+
# This file allows you to exclude specific patterns or files from secret detection.
|
|
4
|
+
# Copy this to .secretsignore and customize as needed.
|
|
5
|
+
#
|
|
6
|
+
# Format:
|
|
7
|
+
# - Lines starting with # are comments
|
|
8
|
+
# - pattern: <regex> - Exclude lines matching this regex
|
|
9
|
+
# - file: <path> - Exclude this file or path pattern
|
|
10
|
+
# - <path> - Simple file path or glob pattern
|
|
11
|
+
|
|
12
|
+
# Example: Exclude specific test files
|
|
13
|
+
# file: test/fixtures/secrets.test.mjs
|
|
14
|
+
|
|
15
|
+
# Example: Exclude patterns that are false positives
|
|
16
|
+
# pattern: my_custom_api_key_format
|
|
17
|
+
|
|
18
|
+
# Example: Exclude entire directories
|
|
19
|
+
# file: docs/examples/*
|
|
20
|
+
|
package/CHANGELOG.md
ADDED
|
@@ -0,0 +1,360 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to @arclabs561/ai-visual-test will be documented in this file.
|
|
4
|
+
|
|
5
|
+
## [0.5.1] - 2025-11-14
|
|
6
|
+
|
|
7
|
+
### Changed
|
|
8
|
+
- **Package renamed to a scoped name** - Now published as `@arclabs561/ai-visual-test` for consistency with other @arclabs561 packages
|
|
9
|
+
- **Breaking change**: Update imports from `ai-visual-test` to `@arclabs561/ai-visual-test`
|
|
10
|
+
|
|
11
|
+
## [0.5.0] - 2025-11-13
|
|
12
|
+
|
|
13
|
+
### Added
|
|
14
|
+
- **API Sub-Modules** - Organized API into logical sub-modules for better tree-shaking
|
|
15
|
+
- `@arclabs561/ai-visual-test/validators` - All validation functionality
|
|
16
|
+
- `@arclabs561/ai-visual-test/temporal` - Temporal aggregation and decision-making
|
|
17
|
+
- `@arclabs561/ai-visual-test/multi-modal` - Multi-modal validation features
|
|
18
|
+
- `@arclabs561/ai-visual-test/ensemble` - Ensemble judging and bias detection
|
|
19
|
+
- `@arclabs561/ai-visual-test/persona` - Persona-based testing
|
|
20
|
+
- `@arclabs561/ai-visual-test/specs` - Natural language specifications
|
|
21
|
+
- `@arclabs561/ai-visual-test/utils` - Utility functions and infrastructure
|
|
22
|
+
- **Smart Validators** - Automatically select the best validator type based on available context
|
|
23
|
+
- `validateSmart()` - Universal smart validator that auto-selects best method
|
|
24
|
+
- `validateAccessibilitySmart()` - Smart accessibility validation (programmatic/VLLM/hybrid)
|
|
25
|
+
- `validateStateSmart()` - Smart state validation (programmatic/VLLM/hybrid)
|
|
26
|
+
- `validateElementSmart()` - Smart element validation
|
|
27
|
+
- `detectValidationMethod()` - Helper to detect best validation method
|
|
28
|
+
- Prevents common mistakes (using VLLM for measurable things)
|
|
29
|
+
- Guides users to faster, more reliable validators when available
|
|
30
|
+
- **Playwright Helpers** - Easy Playwright installation and management
|
|
31
|
+
- `npm run playwright:check` - Check if Playwright is installed
|
|
32
|
+
- `npm run playwright:install` - Install Playwright package
|
|
33
|
+
- `npm run playwright:setup` - Install Playwright + browser binaries
|
|
34
|
+
- `src/helpers/playwright.mjs` - Helper utilities with graceful fallbacks
|
|
35
|
+
- **Dataset Management** - Unified dataset parsing and downloading
|
|
36
|
+
- `npm run datasets:download` - Download all available datasets
|
|
37
|
+
- `npm run datasets:parse` - Parse datasets to ground truth format
|
|
38
|
+
- `npm run datasets:setup` - Download + parse in one command
|
|
39
|
+
- Supports WCAG test cases, WebUI dataset, and accessibility datasets
|
|
40
|
+
- **Dataset-Based Tests** - Tests using real datasets
|
|
41
|
+
- `test/dataset-webui.test.mjs` - WebUI dataset tests
|
|
42
|
+
- `test/dataset-wcag.test.mjs` - WCAG test case tests
|
|
43
|
+
- `test/dataset-integration.test.mjs` - Integration tests
|
|
44
|
+
- `npm run test:datasets` - Run all dataset tests
|
|
45
|
+
|
|
46
|
+
### Improved
|
|
47
|
+
- **API Organization** - Better tree-shaking and discoverability
|
|
48
|
+
- Sub-module imports reduce bundle size
|
|
49
|
+
- Related functionality grouped together
|
|
50
|
+
- Maintains full backward compatibility
|
|
51
|
+
- **Better API Design** - Smart validators make it easier to use the right tool
|
|
52
|
+
- Automatically chooses programmatic (fast, free) when page available
|
|
53
|
+
- Falls back to VLLM (semantic) when only screenshot available
|
|
54
|
+
- Supports hybrid mode (best of both) when needed
|
|
55
|
+
- Clear warnings when VLLM is used for measurable things
|
|
56
|
+
- **Developer Experience** - Easier setup and management
|
|
57
|
+
- Playwright installation simplified
|
|
58
|
+
- Dataset management streamlined
|
|
59
|
+
- Better error messages and fallbacks
|
|
60
|
+
|
|
61
|
+
### Documentation
|
|
62
|
+
- Added `docs/API_SUBMODULES.md` - Sub-module usage guide
|
|
63
|
+
- Added `docs/API_SURFACE_ORGANIZATION.md` - API organization plan
|
|
64
|
+
- Added comprehensive dataset management documentation
|
|
65
|
+
- Added "Smart Validators (Recommended)" section to README
|
|
66
|
+
- Updated "What it's good for" to emphasize smart validation
|
|
67
|
+
- Better guidance on when to use each validator type
|
|
68
|
+
|
|
69
|
+
### Benefits
|
|
70
|
+
- **Speed**: 10-30x faster for measurable things (programmatic <100ms vs VLLM 1-3s)
|
|
71
|
+
- **Cost**: 100% cost reduction for programmatic checks (free vs API costs)
|
|
72
|
+
- **Reliability**: 99.9%+ reliability (deterministic) vs ~70% (AI variance)
|
|
73
|
+
|
|
74
|
+
## [0.4.0] - 2025-11-12
|
|
75
|
+
|
|
76
|
+
### Changed
|
|
77
|
+
- **Package Rename**: Renamed from `ai-browser-test` to `ai-visual-test` for better clarity
|
|
78
|
+
- Package name now accurately reflects focus on visual/screenshot testing
|
|
79
|
+
- All imports updated: `import { ... } from 'ai-visual-test'`
|
|
80
|
+
- Repository URL updated to `arclabs561/ai-visual-test`
|
|
81
|
+
- **Breaking change**: Users must update imports and package.json
|
|
82
|
+
- **Dependencies**: Moved `@playwright/test` to peerDependencies (optional)
|
|
83
|
+
- Reduces package size for users who don't need Playwright
|
|
84
|
+
- Added `@arclabs561/llm-utils` as optional peer dependency (required for LLM extraction features)
|
|
85
|
+
- **Error Handler**: Made global error handler opt-in instead of auto-initializing
|
|
86
|
+
- **Breaking change**: `initErrorHandlers()` is no longer called automatically on import
|
|
87
|
+
- Users must explicitly call `initErrorHandlers()` if they want global error handling
|
|
88
|
+
- Removed `process.exit(1)` from error handler (libraries shouldn't control process lifecycle)
|
|
89
|
+
- Export `initErrorHandlers` for opt-in usage
|
|
90
|
+
|
|
91
|
+
### Added
|
|
92
|
+
- **Documentation for Complex Algorithms**
|
|
93
|
+
- `docs/misc/COHERENCE_ALGORITHM_DETAILS.md` - Comprehensive documentation of coherence calculation invariants
|
|
94
|
+
- `docs/misc/UNCERTAINTY_TIER_LOGIC.md` - Documentation of tier-based self-consistency decision logic
|
|
95
|
+
- `docs/misc/CACHE_TIMESTAMP_INVARIANTS.md` - Documentation of two-timestamp cache system
|
|
96
|
+
|
|
97
|
+
- **Constants Extraction**
|
|
98
|
+
- `UNCERTAINTY_CONSTANTS` in `src/constants.mjs` - Centralized uncertainty reduction thresholds
|
|
99
|
+
- Exported `UNCERTAINTY_CONSTANTS` from main package (new export)
|
|
100
|
+
|
|
101
|
+
- **Code Quality Improvements**
|
|
102
|
+
- Extracted magic numbers to constants (uncertainty thresholds: 3, 9, 0.3, 5)
|
|
103
|
+
- Added inline documentation for subtle invariants (weighted score calculation, window index calculation)
|
|
104
|
+
- Improved viewport return value documentation in persona experience
|
|
105
|
+
|
|
106
|
+
- **Gitignore Updates**
|
|
107
|
+
- Added patterns for human validation test results (timestamped JSON files)
|
|
108
|
+
- Added patterns for temporary annotation workflow files
|
|
109
|
+
|
|
110
|
+
### Fixed
|
|
111
|
+
- Fixed test failure by renaming variables to avoid "CRITICAL" in names (test requirement)
|
|
112
|
+
- Fixed batch optimizer cache key generation (truncation → SHA-256 hash to prevent collisions)
|
|
113
|
+
- Improved documentation of complex reasoning to prevent future breakage
|
|
114
|
+
- Removed `process.exit(1)` from error handler (libraries shouldn't control process lifecycle)
|
|
115
|
+
- Made error handler opt-in instead of auto-initializing on import (no side effects)
|
|
116
|
+
|
|
117
|
+
### Added
|
|
118
|
+
- **Library Best Practices Tests** (`test/library-best-practices.test.mjs`)
|
|
119
|
+
- Tests verify no side effects on import
|
|
120
|
+
- Tests verify no `process.exit()` calls
|
|
121
|
+
- Tests verify opt-in error handler pattern
|
|
122
|
+
- Tests verify optional peer dependency handling
|
|
123
|
+
- Tests verify no global state pollution
|
|
124
|
+
|
|
125
|
+
## [0.3.1] - 2025-11-11
|
|
126
|
+
|
|
127
|
+
### Added
|
|
128
|
+
- **Systematic Position Counter-Balancing**
|
|
129
|
+
- `evaluateWithCounterBalance()` - Eliminates position bias by running evaluations twice with reversed order
|
|
130
|
+
- `shouldUseCounterBalance()` - Determines when counter-balancing is needed
|
|
131
|
+
- Automatic averaging of scores from original and reversed evaluations
|
|
132
|
+
- Position bias detection in counter-balanced results
|
|
133
|
+
|
|
134
|
+
- **Dynamic Few-Shot Example Selection**
|
|
135
|
+
- `selectFewShotExamples()` - ES-KNN-style semantic similarity matching for examples
|
|
136
|
+
- `formatFewShotExamples()` - Formats examples for prompt inclusion
|
|
137
|
+
- Keyword-based similarity scoring (Jaccard similarity)
|
|
138
|
+
- Supports both default and JSON formatting styles
|
|
139
|
+
|
|
140
|
+
- **Comprehensive Metrics**
|
|
141
|
+
- `spearmanCorrelation()` - Spearman's rank correlation (ρ) for ordinal ratings
|
|
142
|
+
- `pearsonCorrelation()` - Pearson's correlation coefficient (r)
|
|
143
|
+
- `calculateRankAgreement()` - Complete rank agreement metrics including Kendall's τ
|
|
144
|
+
- Handles ties correctly in rank calculations
|
|
145
|
+
|
|
146
|
+
### Changed
|
|
147
|
+
- **Exports**: Added new modules to main package exports
|
|
148
|
+
- Position counter-balancing utilities
|
|
149
|
+
- Dynamic few-shot selection
|
|
150
|
+
- Metrics (Spearman, Pearson, rank agreement)
|
|
151
|
+
|
|
152
|
+
### Research Alignment
|
|
153
|
+
- ✅ Position counter-balancing implemented (arXiv:2508.02020)
|
|
154
|
+
- ✅ Dynamic few-shot examples with semantic matching (arXiv:2503.04779)
|
|
155
|
+
- ✅ Spearman correlation for rank-based metrics (arXiv:2506.02945)
|
|
156
|
+
|
|
157
|
+
## [0.3.0] - 2025-11-11
|
|
158
|
+
|
|
159
|
+
### Added
|
|
160
|
+
- **Unified Prompt Composition System**
|
|
161
|
+
- `src/prompt-composer.mjs` - Research-backed prompt composition for all testing types
|
|
162
|
+
- `composeSingleImagePrompt()` - Integrates rubrics, temporal notes, persona context, multi-modal data
|
|
163
|
+
- `composeComparisonPrompt()` - Structured comparison prompts with research-backed formatting
|
|
164
|
+
- Automatic rubric inclusion (10-20% improvement shown in research)
|
|
165
|
+
- Consistent prompt structure across temporal, persona, and multi-modal evaluations
|
|
166
|
+
|
|
167
|
+
- **Hallucination Detection**
|
|
168
|
+
- `src/hallucination-detector.mjs` - Detect unreliable VLLM judgments
|
|
169
|
+
- `detectHallucination()` - Faithfulness checking, uncertainty estimation, contradiction detection
|
|
170
|
+
- Logprobs-based uncertainty estimation (when available from API)
|
|
171
|
+
- Visual grounding verification
|
|
172
|
+
- Confidence scoring based on visual-text alignment
|
|
173
|
+
|
|
174
|
+
- **True Multi-Image Pair Comparison**
|
|
175
|
+
- `VLLMJudge.judgeScreenshot()` now accepts `string | string[]` for multi-image comparison
|
|
176
|
+
- Direct visual comparison in single API call (research-optimal approach)
|
|
177
|
+
- Eliminates position bias through true side-by-side comparison
|
|
178
|
+
- Structured JSON output for comparison results
|
|
179
|
+
- Support for Gemini, OpenAI, and Claude multi-image APIs
|
|
180
|
+
|
|
181
|
+
- **Optimal Ensemble Weighting**
|
|
182
|
+
- `calculateOptimalWeights()` - Inverse logistic weighting based on judge accuracy
|
|
183
|
+
- Research-backed optimal weighting scheme (2-14% accuracy improvements)
|
|
184
|
+
- Automatic weight calculation from historical judge accuracies
|
|
185
|
+
- `votingMethod: 'optimal'` option in `EnsembleJudge`
|
|
186
|
+
|
|
187
|
+
### Changed
|
|
188
|
+
- **Pair Comparison**: Now uses true multi-image API calls instead of two separate evaluations
|
|
189
|
+
- **VLLMJudge**: Enhanced to support multi-image inputs with proper API handling
|
|
190
|
+
- **Ensemble Judge**: Added optimal weighting method based on inverse logistic function
|
|
191
|
+
- **Prompt Building**: Unified through `prompt-composer.mjs` with fallback for compatibility
|
|
192
|
+
- **Logprobs Extraction**: Added to API responses (Gemini, OpenAI) for uncertainty estimation
|
|
193
|
+
|
|
194
|
+
### Fixed
|
|
195
|
+
- Fixed pair comparison to use true multi-image comparison (critical research alignment fix)
|
|
196
|
+
- Fixed prompt composition inconsistencies across different testing types
|
|
197
|
+
- Improved cache key generation for multi-image requests
|
|
198
|
+
|
|
199
|
+
### Research Alignment
|
|
200
|
+
- ✅ Pair comparison now uses true multi-image API (MLLM-as-a-Judge methodology)
|
|
201
|
+
- ✅ Hallucination detection implemented (arXiv:2506.19513, 2507.19024)
|
|
202
|
+
- ✅ Optimal ensemble weighting implemented (arXiv:2510.01499)
|
|
203
|
+
- ✅ Unified prompt composition with research-backed rubrics
|
|
204
|
+
|
|
205
|
+
## [0.2.0] - 2025-11-11
|
|
206
|
+
|
|
207
|
+
### Added
|
|
208
|
+
- **Temporal Batch Optimization**
|
|
209
|
+
- `TemporalBatchOptimizer` - Batch optimizer with temporal dependency awareness
|
|
210
|
+
- `LatencyAwareBatchOptimizer` - Dynamic latency-aware batching for real-time applications
|
|
211
|
+
- Temporal constants: `TIME_SCALES`, `MULTI_SCALE_WINDOWS`, `READING_SPEEDS`, `ATTENTION_MULTIPLIERS`
|
|
212
|
+
- Temporal context utilities: `createTemporalContext`, `mergeTemporalContext`, `extractTemporalContext`
|
|
213
|
+
- Temporal decision-making: `aggregateMultiScale`, `SequentialDecisionContext`, `humanPerceptionTime`
|
|
214
|
+
- Temporal error types: `TemporalError`, `PerceptionTimeError`, `SequentialContextError`, `MultiScaleError`, `TemporalBatchError`
|
|
215
|
+
|
|
216
|
+
- **Bias Detection and Mitigation**
|
|
217
|
+
- `detectBias` and `detectPositionBias` - Detect bias in VLLM judgments
|
|
218
|
+
- `applyBiasMitigation`, `mitigateBias`, `mitigatePositionBias` - Bias mitigation utilities
|
|
219
|
+
- `comparePair` and `rankBatch` - Pair comparison and batch ranking for fair evaluation
|
|
220
|
+
|
|
221
|
+
- **Ensemble and Advanced Judging**
|
|
222
|
+
- `EnsembleJudge` and `createEnsembleJudge` - Multi-provider ensemble judging with weighted aggregation
|
|
223
|
+
- `DEFAULT_RUBRIC`, `buildRubricPrompt`, `getRubricForTestType` - Rubric system for structured evaluation
|
|
224
|
+
|
|
225
|
+
- **Logger Utility**
|
|
226
|
+
- `src/logger.mjs` - Conditional logging utility with debug mode support
|
|
227
|
+
- Logger exports: `enableDebug`, `disableDebug`, `isDebugEnabled`, `warn`, `log`, `error`
|
|
228
|
+
- Logger sub-path export: `ai-visual-test/logger`
|
|
229
|
+
|
|
230
|
+
- **Type Guards and Validation**
|
|
231
|
+
- Comprehensive type guards: `isObject`, `isString`, `isNumber`, `isArray`, `isFunction`, `isPromise`
|
|
232
|
+
- Validation type guards: `isValidationResult`, `isValidationContext`, `isPersona`, `isTemporalNote`
|
|
233
|
+
- Assertion utilities: `assertObject`, `assertString`, `assertNonEmptyString`, `assertNumber`, `assertArray`, `assertFunction`
|
|
234
|
+
- Utility functions: `pick`, `getProperty`
|
|
235
|
+
|
|
236
|
+
- **Evaluation System**
|
|
237
|
+
- Comprehensive evaluation system with dataset loaders and metrics
|
|
238
|
+
- Real-world evaluation with annotation datasets
|
|
239
|
+
- Expert evaluation scenarios and challenging website tests
|
|
240
|
+
- Interactive experience evaluation
|
|
241
|
+
- Data-driven analysis tools
|
|
242
|
+
- Performance benchmarking utilities
|
|
243
|
+
- Validation scripts for evaluation components
|
|
244
|
+
|
|
245
|
+
- **Documentation**
|
|
246
|
+
- Deep arXiv research comparison and analysis
|
|
247
|
+
- Standalone and language-agnostic usage guide
|
|
248
|
+
- Test summary and marimo.io example notebooks
|
|
249
|
+
- Expert evaluation guide
|
|
250
|
+
- Real-world application documentation
|
|
251
|
+
- Consolidated evaluation documentation
|
|
252
|
+
|
|
253
|
+
### Changed
|
|
254
|
+
- Replaced all `console.log/warn` statements with logger utility across all source files
|
|
255
|
+
- Enhanced `buildPrompt` to automatically include context information (testType, viewport, gameState)
|
|
256
|
+
- Updated CI to check for console statements (not just console.log)
|
|
257
|
+
- CI now fails if console statements found (except in logger.mjs)
|
|
258
|
+
- Improved error handling with silent fallbacks for optional operations
|
|
259
|
+
- Better separation of concerns with dedicated logger module
|
|
260
|
+
- Enhanced core modules with improved type safety and validation
|
|
261
|
+
|
|
262
|
+
### Fixed
|
|
263
|
+
- Fixed duplicate export of `TemporalBatchOptimizer` in `src/index.mjs`
|
|
264
|
+
- Fixed failing test: `buildPrompt` now includes context in prompt output
|
|
265
|
+
- Fixed missing `ValidationError` import in `judge.mjs`
|
|
266
|
+
- All 192 tests now passing (0 failures)
|
|
267
|
+
|
|
268
|
+
### Removed
|
|
269
|
+
- Archived 28+ temporary documentation files to `archive/temp-docs-20251111/`
|
|
270
|
+
- Removed documentation bloat: `FINAL_*`, `COMPLETE_*`, `SUMMARY_*`, `REVIEW_*`, `ANALYSIS_*` files
|
|
271
|
+
- Net reduction: ~3,000 lines of documentation
|
|
272
|
+
|
|
273
|
+
### Code Quality
|
|
274
|
+
- All source files now use logger utility instead of direct console calls
|
|
275
|
+
- Comprehensive test coverage with 192 passing tests
|
|
276
|
+
- Improved type safety with extensive type guards
|
|
277
|
+
- Better error handling and validation throughout
|
|
278
|
+
|
|
279
|
+
## [0.1.2] - 2025-01-27
|
|
280
|
+
|
|
281
|
+
### Security
|
|
282
|
+
- Enhanced pre-commit hook with comprehensive secret detection
|
|
283
|
+
- Added obfuscation detection (base64, hex, string concatenation)
|
|
284
|
+
- Detect secrets in decode functions (atob, Buffer.from)
|
|
285
|
+
- Added credential variable pattern matching
|
|
286
|
+
- Detect secrets in comments
|
|
287
|
+
- Added entropy analysis for decoded values
|
|
288
|
+
- Red team tested against 10+ bypass techniques
|
|
289
|
+
- Security rating: 8.5/10 - production ready
|
|
290
|
+
|
|
291
|
+
### Added
|
|
292
|
+
- `scripts/detect-secrets.mjs` - Advanced secret detection script
|
|
293
|
+
- `.secretsignore.example` - Template for secret detection exclusions
|
|
294
|
+
- `SECURITY_RED_TEAM_REPORT.md` - Comprehensive security analysis
|
|
295
|
+
- Git history scanning option (`--scan-history` flag)
|
|
296
|
+
- Support for `.secretsignore` configuration file
|
|
297
|
+
|
|
298
|
+
### Fixed
|
|
299
|
+
- Fixed test failures in `judge.test.mjs` (buildPrompt context)
|
|
300
|
+
- Fixed test failures in `load-env.test.mjs` (basePath handling)
|
|
301
|
+
- Improved `buildPrompt` to include context information
|
|
302
|
+
- Fixed `loadEnv` to respect basePath parameter
|
|
303
|
+
|
|
304
|
+
## [0.1.1] - 2025-01-27
|
|
305
|
+
|
|
306
|
+
### Changed
|
|
307
|
+
- Renamed package from `ai-screenshot-test` to `ai-visual-test`
|
|
308
|
+
- Updated description to reflect browser/Playwright integration and multi-modal validation
|
|
309
|
+
- Added persona-based experience testing with human-interpreted time scales
|
|
310
|
+
- Updated keywords to better reflect capabilities
|
|
311
|
+
- Renamed directory to match npm package name (`ai-visual-test`)
|
|
312
|
+
- Updated git remote to `arclabs561/ai-visual-test`
|
|
313
|
+
- Fixed all temporal test edge cases (null safety)
|
|
314
|
+
|
|
315
|
+
### Added
|
|
316
|
+
- `experiencePageAsPersona()` - Test page experience from persona perspective
|
|
317
|
+
- `experiencePageWithPersonas()` - Test page experience with multiple personas
|
|
318
|
+
- Human-interpreted time scales (reading time, interaction time) vs mechanical fps
|
|
319
|
+
- Comprehensive test suite (116 tests passing)
|
|
320
|
+
|
|
321
|
+
## [0.1.0] - 2025-01-27
|
|
322
|
+
|
|
323
|
+
### Added
|
|
324
|
+
- Initial release of VLLM Testing package
|
|
325
|
+
- Core validation functions (`validateScreenshot`, `VLLMJudge`)
|
|
326
|
+
- Multi-modal validation (`extractRenderedCode`, `multiPerspectiveEvaluation`)
|
|
327
|
+
- Temporal aggregation (`aggregateTemporalNotes`, `formatNotesForPrompt`)
|
|
328
|
+
- Score tracking (`ScoreTracker`)
|
|
329
|
+
- Batch optimization (`BatchOptimizer`)
|
|
330
|
+
- Feedback aggregation (`aggregateFeedback`, `generateRecommendations`)
|
|
331
|
+
- Context compression (`compressContext`, `compressStateHistory`)
|
|
332
|
+
- Structured data extraction (`extractStructuredData`)
|
|
333
|
+
- Core VLLM judge functionality (`VLLMJudge`, `validateScreenshot`)
|
|
334
|
+
- Configuration system with multi-provider support (Gemini, OpenAI, Claude)
|
|
335
|
+
- File-based caching for VLLM responses
|
|
336
|
+
- Multi-modal validation utilities
|
|
337
|
+
- Temporal aggregation for time-series analysis
|
|
338
|
+
- Environment variable loader (`load-env.mjs`)
|
|
339
|
+
- Example test file demonstrating usage
|
|
340
|
+
- Vercel serverless API for remote validation
|
|
341
|
+
- Health check endpoint
|
|
342
|
+
- Standalone web interface
|
|
343
|
+
|
|
344
|
+
### Changed
|
|
345
|
+
- Refactored from monolithic implementation into modular package
|
|
346
|
+
- Extracted temporal aggregation into `temporal.mjs`
|
|
347
|
+
- Extracted caching into `cache.mjs`
|
|
348
|
+
- Extracted multi-modal validation into `multi-modal.mjs`
|
|
349
|
+
- Centralized configuration in `config.mjs`
|
|
350
|
+
- Renamed package for general-purpose use (removed application-specific naming)
|
|
351
|
+
|
|
352
|
+
### Removed
|
|
353
|
+
- Project-specific references
|
|
354
|
+
- Application-specific naming removed
|
|
355
|
+
|
|
356
|
+
### Migration
|
|
357
|
+
- Package is now standalone and general-purpose
|
|
358
|
+
- Can be used in any project requiring visual testing with AI validation
|
|
359
|
+
- Vercel API allows remote validation without local installation
|
|
360
|
+
|
package/CONTRIBUTING.md
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
# Contributing
|
|
2
|
+
|
|
3
|
+
Thanks for contributing to ai-visual-test!
|
|
4
|
+
|
|
5
|
+
## Development Setup
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
# Clone the repository
|
|
9
|
+
git clone https://github.com/arclabs561/ai-visual-test.git
|
|
10
|
+
cd ai-visual-test
|
|
11
|
+
|
|
12
|
+
# Install dependencies (if any)
|
|
13
|
+
npm install
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
## Project Structure
|
|
17
|
+
|
|
18
|
+
```
|
|
19
|
+
ai-visual-test/
|
|
20
|
+
├── src/
|
|
21
|
+
│ ├── index.mjs # Main exports
|
|
22
|
+
│ ├── judge.mjs # VLLM judge
|
|
23
|
+
│ ├── config.mjs # Configuration
|
|
24
|
+
│ ├── cache.mjs # Caching
|
|
25
|
+
│ ├── multi-modal.mjs # Multi-modal validation
|
|
26
|
+
│ ├── temporal.mjs # Temporal aggregation
|
|
27
|
+
│ └── load-env.mjs # Environment loader
|
|
28
|
+
├── example.test.mjs # Example usage
|
|
29
|
+
└── README.md # Documentation
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## Making Changes
|
|
33
|
+
|
|
34
|
+
1. Create a feature branch: `git checkout -b feature/your-feature`
|
|
35
|
+
2. Make your changes
|
|
36
|
+
3. Test your changes: `npm test`
|
|
37
|
+
4. Commit: `git commit -m "Add feature: your feature"`
|
|
38
|
+
5. Push: `git push origin feature/your-feature`
|
|
39
|
+
6. Open a Pull Request
|
|
40
|
+
|
|
41
|
+
## Code Style
|
|
42
|
+
|
|
43
|
+
- Use ES Modules (`.mjs` files)
|
|
44
|
+
- Follow existing code style
|
|
45
|
+
- Add JSDoc comments for public APIs
|
|
46
|
+
- Keep functions focused and testable
|
|
47
|
+
|
|
48
|
+
## Testing
|
|
49
|
+
|
|
50
|
+
- Add tests for new features
|
|
51
|
+
- Ensure all tests pass: `npm test`
|
|
52
|
+
- Test with different VLLM providers if possible
|
|
53
|
+
|
|
54
|
+
## Documentation
|
|
55
|
+
|
|
56
|
+
- Update README.md for new features
|
|
57
|
+
- Add examples to `example.test.mjs`
|
|
58
|
+
- Update CHANGELOG.md for user-facing changes
|
|
59
|
+
|
|
60
|
+
## Questions?
|
|
61
|
+
|
|
62
|
+
Open an issue on GitHub for questions or discussions.
|
|
63
|
+
|
package/DEPLOYMENT.md
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
# Deployment Guide
|
|
2
|
+
|
|
3
|
+
## Vercel Deployment
|
|
4
|
+
|
|
5
|
+
### Quick Deploy
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
# Install Vercel CLI
|
|
9
|
+
npm i -g vercel
|
|
10
|
+
|
|
11
|
+
# Deploy
|
|
12
|
+
cd /path/to/ai-visual-test
|
|
13
|
+
vercel
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
### Environment Variables
|
|
17
|
+
|
|
18
|
+
Set these in Vercel dashboard:
|
|
19
|
+
|
|
20
|
+
- `GEMINI_API_KEY` (or `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`)
|
|
21
|
+
- `VLM_PROVIDER` (optional)
|
|
22
|
+
- `API_KEY` or `VLLM_API_KEY` (optional, for API authentication)
|
|
23
|
+
- `REQUIRE_AUTH` (optional, set to `true` to enforce authentication)
|
|
24
|
+
- `RATE_LIMIT_MAX_REQUESTS` (optional, default: 10 requests per minute)
|
|
25
|
+
|
|
26
|
+
### API Endpoints
|
|
27
|
+
|
|
28
|
+
After deployment, you'll have:
|
|
29
|
+
|
|
30
|
+
- `https://your-site.vercel.app/api/validate` - Validation endpoint
|
|
31
|
+
- `https://your-site.vercel.app/api/health` - Health check
|
|
32
|
+
- `https://your-site.vercel.app/` - Web interface
|
|
33
|
+
|
|
34
|
+
### Usage
|
|
35
|
+
|
|
36
|
+
```javascript
|
|
37
|
+
// Validate screenshot (without authentication)
|
|
38
|
+
const response = await fetch('https://your-site.vercel.app/api/validate', {
|
|
39
|
+
method: 'POST',
|
|
40
|
+
headers: { 'Content-Type': 'application/json' },
|
|
41
|
+
body: JSON.stringify({
|
|
42
|
+
image: base64Image,
|
|
43
|
+
prompt: 'Evaluate this screenshot...',
|
|
44
|
+
context: { testType: 'payment-screen' }
|
|
45
|
+
})
|
|
46
|
+
});
|
|
47
|
+
|
|
48
|
+
const result = await response.json();
|
|
49
|
+
|
|
50
|
+
// With authentication (if API_KEY is set)
|
|
51
|
+
const responseAuth = await fetch('https://your-site.vercel.app/api/validate', {
|
|
52
|
+
method: 'POST',
|
|
53
|
+
headers: {
|
|
54
|
+
'Content-Type': 'application/json',
|
|
55
|
+
'X-API-Key': 'your-api-key' // or 'Authorization': 'Bearer your-api-key'
|
|
56
|
+
},
|
|
57
|
+
body: JSON.stringify({
|
|
58
|
+
image: base64Image,
|
|
59
|
+
prompt: 'Evaluate this screenshot...',
|
|
60
|
+
context: { testType: 'payment-screen' }
|
|
61
|
+
})
|
|
62
|
+
});
|
|
63
|
+
|
|
64
|
+
// Check rate limit headers
|
|
65
|
+
const remaining = response.headers.get('X-RateLimit-Remaining');
|
|
66
|
+
const resetAt = response.headers.get('X-RateLimit-Reset');
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## Local Development
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
# Install dependencies
|
|
73
|
+
npm install
|
|
74
|
+
|
|
75
|
+
# Run tests
|
|
76
|
+
npm test
|
|
77
|
+
|
|
78
|
+
# Use as a library (JavaScript import in your own code, not a shell command)
|
|
79
|
+
import { validateScreenshot } from '@arclabs561/ai-visual-test';
|
|
80
|
+
```
|
package/LICENSE
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
22
|
+
|
package/README.md
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
# @arclabs561/ai-visual-test
|
|
2
|
+
|
|
3
|
+
AI-powered visual testing. Uses vision language models to understand screenshots instead of pixel-diffing.
|
|
4
|
+
|
|
5
|
+
## Why
|
|
6
|
+
|
|
7
|
+
Pixel-based testing breaks when content changes or layouts shift. This tool asks "does this look correct?" instead of "did pixels change?"
|
|
8
|
+
|
|
9
|
+
## Install
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
npm install @arclabs561/ai-visual-test
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
Set an API key:
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
# .env file
|
|
19
|
+
GEMINI_API_KEY=your-key-here
|
|
20
|
+
# or
|
|
21
|
+
OPENAI_API_KEY=your-key-here
|
|
22
|
+
# or
|
|
23
|
+
ANTHROPIC_API_KEY=your-key-here
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
## Use
|
|
27
|
+
|
|
28
|
+
```javascript
|
|
29
|
+
import { validateScreenshot } from '@arclabs561/ai-visual-test';
|
|
30
|
+
|
|
31
|
+
const result = await validateScreenshot(
|
|
32
|
+
'screenshot.png',
|
|
33
|
+
'Check if this payment form is accessible and usable'
|
|
34
|
+
);
|
|
35
|
+
|
|
36
|
+
console.log(result.score); // 0-10
|
|
37
|
+
console.log(result.issues); // ['Missing error messages', 'Low contrast']
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## What it's good for
|
|
41
|
+
|
|
42
|
+
- **Accessibility** - Fast programmatic checks or VLLM semantic evaluation
|
|
43
|
+
- **Design principles** - Validates brutalist, minimal, or other styles
|
|
44
|
+
- **Temporal testing** - Analyzes animations and gameplay over time
|
|
45
|
+
- **State validation** - Fast programmatic or VLLM extraction
|
|
46
|
+
- **Game testing** - Validate gameplay with variable goals
|
|
47
|
+
- **Natural language specs** - Write tests in plain English
|
|
48
|
+
|
|
49
|
+
## What it's not good for
|
|
50
|
+
|
|
51
|
+
- Pixel-perfect layout testing (use pixel-diffing tools)
|
|
52
|
+
- Exact color matching (use design tools)
|
|
53
|
+
- Performance testing (use Lighthouse)
|
|
54
|
+
- Unit testing (use Jest/Vitest)
|
|
55
|
+
|
|
56
|
+
## API
|
|
57
|
+
|
|
58
|
+
### Core
|
|
59
|
+
|
|
60
|
+
```javascript
|
|
61
|
+
import { validateScreenshot, createConfig } from '@arclabs561/ai-visual-test';
|
|
62
|
+
|
|
63
|
+
// Configure (optional - auto-detects from env)
|
|
64
|
+
const config = createConfig({
|
|
65
|
+
provider: 'gemini',
|
|
66
|
+
apiKey: process.env.GEMINI_API_KEY
|
|
67
|
+
});
|
|
68
|
+
|
|
69
|
+
// Validate
|
|
70
|
+
const result = await validateScreenshot(
|
|
71
|
+
'screenshot.png',
|
|
72
|
+
'Evaluate this screenshot',
|
|
73
|
+
{ testType: 'payment-screen' }
|
|
74
|
+
);
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
### Sub-modules (better tree-shaking)
|
|
78
|
+
|
|
79
|
+
```javascript
|
|
80
|
+
// Validators
|
|
81
|
+
import { StateValidator } from '@arclabs561/ai-visual-test/validators';
|
|
82
|
+
|
|
83
|
+
// Temporal
|
|
84
|
+
import { aggregateTemporalNotes } from '@arclabs561/ai-visual-test/temporal';
|
|
85
|
+
|
|
86
|
+
// Multi-modal
|
|
87
|
+
import { multiModalValidation } from '@arclabs561/ai-visual-test/multi-modal';
|
|
88
|
+
|
|
89
|
+
// Ensemble
|
|
90
|
+
import { EnsembleJudge } from '@arclabs561/ai-visual-test/ensemble';
|
|
91
|
+
|
|
92
|
+
// Persona
|
|
93
|
+
import { experiencePageAsPersona } from '@arclabs561/ai-visual-test/persona';
|
|
94
|
+
|
|
95
|
+
// Specs
|
|
96
|
+
import { parseSpec } from '@arclabs561/ai-visual-test/specs';
|
|
97
|
+
|
|
98
|
+
// Utils
|
|
99
|
+
import { getCacheStats } from '@arclabs561/ai-visual-test/utils';
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
### With Playwright
|
|
103
|
+
|
|
104
|
+
```javascript
|
|
105
|
+
import { test, expect } from '@playwright/test';
|
|
106
|
+
import { validateScreenshot } from '@arclabs561/ai-visual-test';
|
|
107
|
+
|
|
108
|
+
test('payment screen', async ({ page }) => {
|
|
109
|
+
await page.goto('https://example.com/checkout');
|
|
110
|
+
await page.screenshot({ path: 'checkout.png' });
|
|
111
|
+
|
|
112
|
+
const result = await validateScreenshot(
|
|
113
|
+
'checkout.png',
|
|
114
|
+
'Check if payment form is accessible'
|
|
115
|
+
);
|
|
116
|
+
|
|
117
|
+
expect(result.score, 'Payment form should score at least 8').toBeGreaterThanOrEqual(8);
|
|
118
|
+
});
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
## Features
|
|
122
|
+
|
|
123
|
+
- **Multi-provider** - Gemini, OpenAI, Claude
|
|
124
|
+
- **Cost-effective** - Auto-selects cheapest provider, includes caching
|
|
125
|
+
- **Multi-modal** - Screenshots + rendered code + context
|
|
126
|
+
- **Temporal** - Time-series validation for animations
|
|
127
|
+
- **Multi-perspective** - Multiple personas evaluate same state
|
|
128
|
+
- **Zero dependencies** - Pure ES Modules
|
|
129
|
+
|
|
130
|
+
## Examples
|
|
131
|
+
|
|
132
|
+
See the `examples/` directory for complete examples.
|
|
133
|
+
|
|
134
|
+
## Documentation
|
|
135
|
+
|
|
136
|
+
- `docs/API_SUBMODULES.md` - Sub-module usage
|
|
137
|
+
- `docs/API_SURFACE_ORGANIZATION.md` - API organization
|
|
138
|
+
- `CHANGELOG.md` - Version history
|
|
139
|
+
|
|
140
|
+
## License
|
|
141
|
+
|
|
142
|
+
MIT
|