@arclabs561/ai-visual-test 0.5.1 → 0.7.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. package/CHANGELOG.md +127 -11
  2. package/DEPLOYMENT.md +225 -9
  3. package/README.md +71 -80
  4. package/index.d.ts +902 -5
  5. package/package.json +10 -51
  6. package/src/batch-optimizer.mjs +39 -0
  7. package/src/cache.mjs +241 -16
  8. package/src/config.mjs +33 -91
  9. package/src/constants.mjs +54 -0
  10. package/src/convenience.mjs +113 -10
  11. package/src/cost-optimization.mjs +1 -0
  12. package/src/cost-tracker.mjs +134 -2
  13. package/src/data-extractor.mjs +36 -7
  14. package/src/dynamic-few-shot.mjs +69 -11
  15. package/src/errors.mjs +6 -2
  16. package/src/experience-propagation.mjs +12 -0
  17. package/src/experience-tracer.mjs +12 -3
  18. package/src/game-player.mjs +222 -43
  19. package/src/graceful-shutdown.mjs +126 -0
  20. package/src/helpers/playwright.mjs +22 -8
  21. package/src/human-validation-manager.mjs +99 -2
  22. package/src/index.mjs +48 -3
  23. package/src/integrations/playwright.mjs +140 -0
  24. package/src/judge.mjs +699 -24
  25. package/src/load-env.mjs +2 -1
  26. package/src/logger.mjs +31 -3
  27. package/src/model-tier-selector.mjs +1 -221
  28. package/src/natural-language-specs.mjs +31 -3
  29. package/src/persona-enhanced.mjs +4 -2
  30. package/src/persona-experience.mjs +1 -1
  31. package/src/pricing.mjs +28 -0
  32. package/src/prompt-composer.mjs +162 -5
  33. package/src/provider-data.mjs +115 -0
  34. package/src/render-change-detector.mjs +5 -0
  35. package/src/research-enhanced-validation.mjs +7 -5
  36. package/src/retry.mjs +21 -7
  37. package/src/rubrics.mjs +4 -0
  38. package/src/safe-logger.mjs +71 -0
  39. package/src/session-cost-tracker.mjs +320 -0
  40. package/src/smart-validator.mjs +8 -8
  41. package/src/spec-templates.mjs +52 -6
  42. package/src/startup-validation.mjs +127 -0
  43. package/src/temporal-adaptive.mjs +2 -2
  44. package/src/temporal-decision-manager.mjs +1 -271
  45. package/src/temporal-logic.mjs +104 -0
  46. package/src/temporal-note-pruner.mjs +119 -0
  47. package/src/temporal-preprocessor.mjs +1 -543
  48. package/src/temporal.mjs +681 -79
  49. package/src/utils/action-hallucination-detector.mjs +301 -0
  50. package/src/utils/baseline-validator.mjs +82 -0
  51. package/src/utils/cache-stats.mjs +104 -0
  52. package/src/utils/cached-llm.mjs +164 -0
  53. package/src/utils/capability-stratifier.mjs +108 -0
  54. package/src/utils/counterfactual-tester.mjs +83 -0
  55. package/src/utils/error-recovery.mjs +117 -0
  56. package/src/utils/explainability-scorer.mjs +119 -0
  57. package/src/utils/exploratory-automation.mjs +131 -0
  58. package/src/utils/index.mjs +10 -0
  59. package/src/utils/intent-recognizer.mjs +201 -0
  60. package/src/utils/log-sanitizer.mjs +165 -0
  61. package/src/utils/path-validator.mjs +88 -0
  62. package/src/utils/performance-logger.mjs +316 -0
  63. package/src/utils/performance-measurement.mjs +280 -0
  64. package/src/utils/prompt-sanitizer.mjs +213 -0
  65. package/src/utils/rate-limiter.mjs +144 -0
  66. package/src/validation-framework.mjs +24 -20
  67. package/src/validation-result-normalizer.mjs +35 -1
  68. package/src/validation.mjs +75 -25
  69. package/src/validators/accessibility-validator.mjs +144 -0
  70. package/src/validators/hybrid-validator.mjs +48 -4
  71. package/api/health.js +0 -34
  72. package/api/validate.js +0 -252
  73. package/public/index.html +0 -149
  74. package/vercel.json +0 -27
package/index.d.ts CHANGED
@@ -210,26 +210,126 @@ export interface EnsembleResult {
210
210
  votingMethod: string;
211
211
  }
212
212
 
213
+ /**
214
+ * Ensemble Judge
215
+ *
216
+ * Uses multiple LLM providers to evaluate screenshots and aggregates results
217
+ * for improved accuracy (10-20% improvement with 3+ models).
218
+ *
219
+ * **Research:** Based on arXiv:2510.01499 - "Optimal LLM Aggregation"
220
+ *
221
+ * **Use when:** You need maximum reliability for critical evaluations
222
+ * (accessibility, quality checks, design validation).
223
+ *
224
+ * @example
225
+ * ```typescript
226
+ * const judge = new EnsembleJudge({
227
+ * judges: [
228
+ * new VLLMJudge({ provider: 'gemini' }),
229
+ * new VLLMJudge({ provider: 'openai' }),
230
+ * new VLLMJudge({ provider: 'claude' })
231
+ * ],
232
+ * votingMethod: 'weighted_average'
233
+ * });
234
+ *
235
+ * const result = await judge.evaluate(
236
+ * 'screenshot.png',
237
+ * 'Evaluate accessibility'
238
+ * );
239
+ *
240
+ * console.log(result.score); // Aggregated score
241
+ * console.log(result.agreement.score); // How much models agree
242
+ * ```
243
+ */
213
244
  export class EnsembleJudge {
245
+ /**
246
+ * Create a new Ensemble Judge instance.
247
+ *
248
+ * @param options - Ensemble options (judges, voting method, weights, etc.)
249
+ */
214
250
  constructor(options?: EnsembleJudgeOptions);
251
+
252
+ /**
253
+ * Evaluate screenshot using multiple judges and aggregate results.
254
+ *
255
+ * @param imagePath - Path to screenshot
256
+ * @param prompt - Evaluation prompt
257
+ * @param context - Optional validation context
258
+ * @returns Promise resolving to EnsembleResult with aggregated score and agreement metrics
259
+ */
215
260
  evaluate(imagePath: string, prompt: string, context?: Record<string, unknown>): Promise<EnsembleResult>;
216
261
  }
217
262
 
218
263
  export function createEnsembleJudge(providers?: string[], options?: EnsembleJudgeOptions): EnsembleJudge;
219
264
 
220
265
  // Core Types
266
+ /**
267
+ * Validation context for screenshot validation.
268
+ *
269
+ * Provides additional context to guide the AI evaluation, including test type,
270
+ * viewport information, game state, and optimization options.
271
+ *
272
+ * @example
273
+ * ```typescript
274
+ * const context: ValidationContext = {
275
+ * testType: 'accessibility',
276
+ * viewport: { width: 1920, height: 1080 },
277
+ * autoSelectTier: true,
278
+ * autoSelectProvider: true
279
+ * };
280
+ * ```
281
+ */
221
282
  export interface ValidationContext {
283
+ /** Test type identifier (e.g., 'accessibility', 'payment-screen', 'gameplay') */
222
284
  testType?: string;
285
+ /** Viewport dimensions for context-aware evaluation */
223
286
  viewport?: { width: number; height: number };
287
+ /** Game state or application state for context */
224
288
  gameState?: Record<string, unknown>;
289
+ /** Enable caching (default: true) */
225
290
  useCache?: boolean;
291
+ /** Request timeout in milliseconds */
226
292
  timeout?: number;
293
+ /** Use explicit rubric for consistent scoring */
227
294
  useRubric?: boolean;
295
+ /** Include dimension scores in evaluation */
228
296
  includeDimensions?: boolean;
297
+ /** URL of the page being tested */
229
298
  url?: string;
299
+ /** Description of the test scenario */
230
300
  description?: string;
301
+ /** Current step in multi-step test */
231
302
  step?: string;
303
+ /** Custom prompt builder function */
232
304
  promptBuilder?: (prompt: string, context: ValidationContext) => string;
305
+ /** Auto-select model tier (fast/balanced/best) based on context */
306
+ autoSelectTier?: boolean;
307
+ /** Auto-select provider (cheapest available) */
308
+ autoSelectProvider?: boolean;
309
+ /** Include cost comparison in results */
310
+ includeCostComparison?: boolean;
311
+ /** Frequency for high-frequency validation (Hz) */
312
+ frequency?: number;
313
+ /** Cost sensitivity flag for optimization */
314
+ costSensitive?: boolean;
315
+ /** Criticality level (low/medium/high/critical) */
316
+ criticality?: 'low' | 'medium' | 'high' | 'critical';
317
+ /** Model tier to use (fast/balanced/best) */
318
+ modelTier?: 'fast' | 'balanced' | 'best';
319
+ /** Enable temporal decision making (for high-frequency validation) */
320
+ useTemporalDecision?: boolean;
321
+ /** Temporal notes for decision context */
322
+ temporalNotes?: TemporalNote[];
323
+ /** Current state for temporal decision */
324
+ currentState?: Record<string, unknown>;
325
+ /** Previous state for temporal decision */
326
+ previousState?: Record<string, unknown>;
327
+ /** Previous result for temporal decision */
328
+ previousResult?: ValidationResult;
329
+ /** Temporal decision manager options */
330
+ temporalDecisionOptions?: Record<string, unknown>;
331
+ /** Per-call visual anchors (appended to config-level anchors) */
332
+ anchors?: VisualAnchors | null;
233
333
  }
234
334
 
235
335
  export interface EstimatedCost {
@@ -241,34 +341,168 @@ export interface EstimatedCost {
241
341
  currency: string;
242
342
  }
243
343
 
344
+ /** A structured issue with metadata (importance, evidence, suggestion). */
345
+ export interface RichIssue {
346
+ /** Human-readable issue description */
347
+ description: string;
348
+ /** Importance level */
349
+ importance?: 'low' | 'medium' | 'high' | 'critical';
350
+ /** Annoyance level */
351
+ annoyance?: 'low' | 'medium' | 'high';
352
+ /** Impact category */
353
+ impact?: string;
354
+ /** Evidence observed in the screenshot */
355
+ evidence?: string;
356
+ /** Suggested fix */
357
+ suggestion?: string;
358
+ }
359
+
360
+ /** A structured recommendation with priority and expected impact. */
361
+ export interface Recommendation {
362
+ /** Priority level */
363
+ priority?: 'low' | 'medium' | 'high';
364
+ /** What to change */
365
+ suggestion: string;
366
+ /** Expected improvement from the change */
367
+ expectedImpact?: string;
368
+ }
369
+
244
370
  export interface SemanticInfo {
245
371
  score: number | null;
246
- issues: string[];
372
+ issues: RichIssue[];
247
373
  assessment: string | null;
248
- reasoning: string;
374
+ reasoning: string | null;
375
+ strengths?: string[];
376
+ recommendations?: Recommendation[];
377
+ evidence?: string | Record<string, unknown> | null;
378
+ dimensionScores?: Record<string, number> | null;
249
379
  brutalistViolations?: string[];
250
380
  zeroToleranceViolations?: string[];
251
381
  }
252
382
 
383
+ /**
384
+ * Result of screenshot validation.
385
+ *
386
+ * Contains the AI's evaluation of the screenshot, including score, issues,
387
+ * reasoning, and metadata about the validation process.
388
+ *
389
+ * @example
390
+ * ```typescript
391
+ * const result: ValidationResult = {
392
+ * enabled: true,
393
+ * provider: 'gemini',
394
+ * score: 8.5,
395
+ * issues: ['Low contrast on submit button'],
396
+ * assessment: 'Good',
397
+ * reasoning: 'The form is mostly accessible...',
398
+ * estimatedCost: { totalCost: '0.000123', currency: 'USD' },
399
+ * responseTime: 1234,
400
+ * cached: false
401
+ * };
402
+ * ```
403
+ */
253
404
  export interface ValidationResult {
405
+ /** Whether validation was enabled (false if API key missing) */
254
406
  enabled: boolean;
407
+ /** LLM provider used (gemini, openai, claude, groq) */
255
408
  provider: string;
409
+ /** Quality score (0-10, null if validation failed) */
256
410
  score: number | null;
411
+ /** List of issues found (flat strings for backward compat) */
257
412
  issues: string[];
413
+ /** Structured issues with importance, evidence, and suggestions */
414
+ richIssues?: RichIssue[];
415
+ /** Overall assessment (e.g., 'Good', 'Needs Improvement') */
258
416
  assessment: string | null;
417
+ /** Detailed reasoning for the score */
259
418
  reasoning: string;
419
+ /** Actionable recommendations with priority and expected impact */
420
+ recommendations?: Recommendation[];
421
+ /** What the UI does well */
422
+ strengths?: string[];
423
+ /** Per-dimension scores (e.g., game_authenticity: 9, typography: 7) */
424
+ dimensionScores?: Record<string, number> | null;
425
+ /** Estimated API cost breakdown */
260
426
  estimatedCost?: EstimatedCost | null;
427
+ /** Response time in milliseconds */
261
428
  responseTime: number;
429
+ /** Whether result was served from cache */
262
430
  cached?: boolean;
431
+ /** Raw judgment text from LLM */
263
432
  judgment?: string;
433
+ /** Raw API response */
264
434
  raw?: unknown;
435
+ /** Extracted semantic information */
265
436
  semantic?: SemanticInfo;
437
+ /** Error message if validation failed */
266
438
  error?: string;
439
+ /** Status message */
267
440
  message?: string;
441
+ /** Provider pricing information */
268
442
  pricing?: { input: number; output: number };
443
+ /** Timestamp of validation */
269
444
  timestamp?: string;
445
+ /** Test name if provided */
270
446
  testName?: string;
447
+ /** Viewport dimensions if provided */
271
448
  viewport?: { width: number; height: number } | null;
449
+ /** Cost comparison information (if includeCostComparison enabled) */
450
+ costComparison?: {
451
+ current: { tier: string; provider: string; cost: number };
452
+ tiers: Record<string, number>;
453
+ savings: Record<string, { absolute: number; percent: number; cost: number }>;
454
+ recommendation: { tier: string; cost: number; savings: number; savingsPercent: number; reason: string };
455
+ };
456
+ /** Whether temporal decision skipped this call */
457
+ skipped?: boolean;
458
+ /** Reason for skipping (if skipped) */
459
+ skipReason?: string;
460
+ /** Urgency level (if temporal decision used) */
461
+ urgency?: 'low' | 'medium' | 'high';
462
+ }
463
+
464
+ /**
465
+ * A single visual anchor: either a plain text string or an object
466
+ * with optional dimension scoping and/or an image reference.
467
+ *
468
+ * Plain string: `"Card images large enough to see art"`
469
+ * With dimension: `{ text: "Card images large", dimension: "card_presentation" }`
470
+ * Image ref: `{ image: "/path/to/good.png", label: "Well-themed Magic layout" }`
471
+ * Image + dimension: `{ image: "/path/to/good.png", label: "...", dimension: "game_authenticity" }`
472
+ *
473
+ * Images accept a file path or a data URI (`data:image/png;base64,...`).
474
+ */
475
+ export type AnchorEntry = string | {
476
+ /** Text description of the anchor signal */
477
+ text?: string;
478
+ /** File path or data URI of a reference screenshot */
479
+ image?: string;
480
+ /** Short label for the image (shown in prompt) */
481
+ label?: string;
482
+ /** Rubric dimension this anchor relates to (e.g., "game_authenticity") */
483
+ dimension?: string;
484
+ };
485
+
486
+ /**
487
+ * Domain-level visual anchors for VLM evaluation grounding.
488
+ *
489
+ * Text anchors describe what to look for / flag in words.
490
+ * Image anchors provide reference screenshots as few-shot visual examples
491
+ * so the VLM can calibrate against concrete good/bad instances.
492
+ *
493
+ * Anchors can optionally be scoped to rubric dimensions via the
494
+ * `dimension` field on AnchorEntry objects.
495
+ *
496
+ * Set once in config for the project; per-call anchors in
497
+ * ValidationContext append to (not replace) config-level anchors.
498
+ */
499
+ export interface VisualAnchors {
500
+ /** Brief domain description injected as context (e.g., "Card game search UI for TCG players") */
501
+ domain?: string;
502
+ /** Positive signals the VLM should look for (text and/or image entries) */
503
+ positive?: AnchorEntry[];
504
+ /** Negative signals the VLM should flag (text and/or image entries) */
505
+ negative?: AnchorEntry[];
272
506
  }
273
507
 
274
508
  export interface ConfigOptions {
@@ -280,6 +514,8 @@ export interface ConfigOptions {
280
514
  maxConcurrency?: number;
281
515
  timeout?: number;
282
516
  verbose?: boolean;
517
+ /** Domain-level visual anchors included in every evaluation prompt */
518
+ anchors?: VisualAnchors | null;
283
519
  }
284
520
 
285
521
  export interface Config {
@@ -294,6 +530,8 @@ export interface Config {
294
530
  priority: number;
295
531
  };
296
532
  enabled: boolean;
533
+ /** Normalized visual anchors (null when none configured) */
534
+ anchors: VisualAnchors | null;
297
535
  cache: {
298
536
  enabled: boolean;
299
537
  dir: string | null;
@@ -307,28 +545,183 @@ export interface Config {
307
545
  };
308
546
  }
309
547
 
310
- // VLLMJudge Class
548
+ /**
549
+ * VLLM Judge Class
550
+ *
551
+ * Core screenshot validation engine using Vision Language Models.
552
+ * Supports multiple providers (Gemini, OpenAI, Claude, Groq) with automatic
553
+ * selection, caching, and cost optimization.
554
+ *
555
+ * **Use when:** You need fine-grained control over validation or custom judge implementations.
556
+ * **Otherwise:** Use `validateScreenshot()` function for simpler API.
557
+ *
558
+ * @example
559
+ * ```typescript
560
+ * // Create custom judge instance
561
+ * const judge = new VLLMJudge({
562
+ * provider: 'gemini',
563
+ * apiKey: process.env.GEMINI_API_KEY,
564
+ * cacheEnabled: true
565
+ * });
566
+ *
567
+ * const result = await judge.judgeScreenshot(
568
+ * 'screenshot.png',
569
+ * 'Evaluate this page'
570
+ * );
571
+ * ```
572
+ */
311
573
  export class VLLMJudge {
574
+ /**
575
+ * Create a new VLLM Judge instance.
576
+ *
577
+ * @param options - Configuration options (provider, API key, cache, etc.)
578
+ */
312
579
  constructor(options?: ConfigOptions);
580
+
581
+ /** Current provider name (gemini, openai, claude, groq) */
313
582
  provider: string;
583
+ /** API key for current provider */
314
584
  apiKey: string | null;
585
+ /** Provider configuration (model, pricing, etc.) */
315
586
  providerConfig: Config['providerConfig'];
587
+ /** Whether validation is enabled (false if API key missing) */
316
588
  enabled: boolean;
317
589
 
590
+ /**
591
+ * Convert image file to base64 string for API.
592
+ *
593
+ * @param imagePath - Path to image file
594
+ * @returns Base64-encoded image string
595
+ * @throws {FileError} If file not found or invalid format
596
+ */
318
597
  imageToBase64(imagePath: string): string;
598
+
599
+ /**
600
+ * Build evaluation prompt with context.
601
+ *
602
+ * @param prompt - Base evaluation prompt
603
+ * @param context - Validation context
604
+ * @returns Enhanced prompt with context
605
+ */
319
606
  buildPrompt(prompt: string, context: ValidationContext): string;
607
+
608
+ /**
609
+ * Extract semantic information from judgment text.
610
+ *
611
+ * @param judgment - Judgment text or object
612
+ * @returns Structured semantic information
613
+ */
320
614
  extractSemanticInfo(judgment: string | object): SemanticInfo;
615
+
616
+ /**
617
+ * Estimate API cost for validation.
618
+ *
619
+ * @param data - API request/response data
620
+ * @param provider - Provider name
621
+ * @returns Estimated cost breakdown or null
622
+ */
321
623
  estimateCost(data: unknown, provider: string): EstimatedCost | null;
322
- judgeScreenshot(imagePath: string, prompt: string, context?: ValidationContext): Promise<ValidationResult>;
624
+
625
+ /**
626
+ * Judge a screenshot using VLLM.
627
+ *
628
+ * @param imagePath - Path to screenshot or array for comparison
629
+ * @param prompt - Evaluation prompt
630
+ * @param context - Optional validation context
631
+ * @returns Promise resolving to ValidationResult
632
+ */
633
+ judgeScreenshot(imagePath: string | string[], prompt: string, context?: ValidationContext): Promise<ValidationResult>;
323
634
  }
324
635
 
325
636
  // Core Functions
637
+ /**
638
+ * Validate a screenshot using Vision Language Models (VLLM).
639
+ *
640
+ * This is the primary API function. It takes a screenshot and evaluation prompt,
641
+ * sends it to an AI model (Gemini, OpenAI, Claude, or Groq), and returns structured
642
+ * validation results with score, issues, and reasoning.
643
+ *
644
+ * **Key Features:**
645
+ * - Automatic provider selection (cheapest available)
646
+ * - Automatic tier selection (fast/balanced/best)
647
+ * - Built-in caching (7-day TTL)
648
+ * - Cost optimization
649
+ * - Temporal decision making (for high-frequency validation)
650
+ *
651
+ * @param imagePath - Path to screenshot file (PNG, JPEG, GIF, WebP) or array of paths for comparison
652
+ * @param prompt - Evaluation prompt (e.g., "Is this accessible?", "Check if payment form works")
653
+ * @param context - Optional validation context (testType, viewport, optimization options)
654
+ * @returns Promise resolving to ValidationResult with score, issues, reasoning, and metadata
655
+ *
656
+ * @example
657
+ * ```typescript
658
+ * // Basic usage
659
+ * const result = await validateScreenshot(
660
+ * 'screenshot.png',
661
+ * 'Check if this payment form is accessible'
662
+ * );
663
+ * console.log(result.score); // 8.5 (0-10 scale)
664
+ * console.log(result.issues); // ['Low contrast on button', 'Missing label']
665
+ * console.log(result.reasoning); // "The form is mostly accessible..."
666
+ * ```
667
+ *
668
+ * @example
669
+ * ```typescript
670
+ * // With cost optimization
671
+ * const result = await validateScreenshot(
672
+ * 'screenshot.png',
673
+ * 'Evaluate accessibility',
674
+ * {
675
+ * autoSelectTier: true,
676
+ * autoSelectProvider: true,
677
+ * includeCostComparison: true
678
+ * }
679
+ * );
680
+ * console.log(result.costComparison?.savings.fast?.percent); // 45% savings
681
+ * ```
682
+ *
683
+ * @example
684
+ * ```typescript
685
+ * // High-frequency validation (60Hz)
686
+ * const result = await validateScreenshot(
687
+ * 'frame.png',
688
+ * 'Is the game playable?',
689
+ * {
690
+ * frequency: 60,
691
+ * autoSelectTier: true,
692
+ * useTemporalDecision: true
693
+ * }
694
+ * );
695
+ * ```
696
+ *
697
+ * @throws {FileError} If screenshot file not found or invalid format
698
+ * @throws {ValidationError} If validation fails
699
+ * @throws {ProviderError} If API provider error occurs
700
+ * @throws {TimeoutError} If request times out
701
+ */
326
702
  export function validateScreenshot(
327
- imagePath: string,
703
+ imagePath: string | string[],
328
704
  prompt: string,
329
705
  context?: ValidationContext
330
706
  ): Promise<ValidationResult>;
331
707
 
708
+ /**
709
+ * Extract semantic information from VLLM judgment text.
710
+ *
711
+ * Parses AI judgment responses into structured data (score, issues, reasoning).
712
+ * Useful for custom implementations that need to parse judgment text.
713
+ *
714
+ * @param judgment - Judgment text or object from VLLM
715
+ * @returns Structured semantic information with score, issues, assessment, reasoning
716
+ *
717
+ * @example
718
+ * ```typescript
719
+ * const judgment = "Score: 8.5. Issues: Low contrast. Reasoning: The form is mostly accessible...";
720
+ * const info = extractSemanticInfo(judgment);
721
+ * console.log(info.score); // 8.5
722
+ * console.log(info.issues); // [{ description: 'Low contrast' }]
723
+ * ```
724
+ */
332
725
  export function extractSemanticInfo(judgment: string | object): SemanticInfo;
333
726
 
334
727
  // Multi-Modal Types
@@ -405,11 +798,33 @@ export function multiModalValidation(
405
798
  }>;
406
799
 
407
800
  // Temporal Types
801
+ /**
802
+ * Temporal note for tracking state over time.
803
+ *
804
+ * Used in high-frequency validation (10-60Hz) to track observations
805
+ * and enable temporal decision making (reduces LLM calls by 98.5%).
806
+ *
807
+ * @example
808
+ * ```typescript
809
+ * const note: TemporalNote = {
810
+ * timestamp: Date.now(),
811
+ * elapsed: 100,
812
+ * score: 8.5,
813
+ * observation: 'Button clicked',
814
+ * step: 'checkout'
815
+ * };
816
+ * ```
817
+ */
408
818
  export interface TemporalNote {
819
+ /** Timestamp in milliseconds */
409
820
  timestamp?: number;
821
+ /** Elapsed time since start in milliseconds */
410
822
  elapsed?: number;
823
+ /** Quality score (0-10) */
411
824
  score?: number;
825
+ /** Observation description */
412
826
  observation?: string;
827
+ /** Step identifier */
413
828
  step?: string;
414
829
  }
415
830
 
@@ -437,6 +852,36 @@ export interface AggregatedTemporalNotes {
437
852
  }
438
853
 
439
854
  // Temporal Functions
855
+ /**
856
+ * Aggregate temporal notes into time windows with weighted scores.
857
+ *
858
+ * Used for high-frequency validation to reduce LLM calls by aggregating
859
+ * observations over time windows. Implements exponential decay weighting
860
+ * (recent notes weighted more heavily).
861
+ *
862
+ * **Research:** Inspired by arXiv:2505.17663 (DynToM) and arXiv:2507.15851
863
+ * (Human Temporal Cognition), adapted with exponential decay for practical use.
864
+ *
865
+ * @param notes - Array of temporal notes to aggregate
866
+ * @param options - Aggregation options
867
+ * @param options.windowSize - Time window size in milliseconds (default: 1000)
868
+ * @param options.decayFactor - Exponential decay factor (default: 0.9)
869
+ * @param options.coherenceThreshold - Coherence threshold for filtering (default: 0.5)
870
+ * @returns Aggregated notes with windows, summary, and coherence score
871
+ *
872
+ * @example
873
+ * ```typescript
874
+ * const notes: TemporalNote[] = [
875
+ * { timestamp: 0, score: 8, observation: 'Initial state' },
876
+ * { timestamp: 100, score: 8.5, observation: 'Button clicked' },
877
+ * { timestamp: 200, score: 9, observation: 'Form submitted' }
878
+ * ];
879
+ *
880
+ * const aggregated = aggregateTemporalNotes(notes);
881
+ * console.log(aggregated.coherence); // 0.92 (high coherence)
882
+ * console.log(aggregated.windows[0].avgScore); // 8.5
883
+ * ```
884
+ */
440
885
  export function aggregateTemporalNotes(
441
886
  notes: TemporalNote[],
442
887
  options?: {
@@ -450,6 +895,223 @@ export function formatNotesForPrompt(aggregated: AggregatedTemporalNotes): strin
450
895
 
451
896
  export function calculateCoherence(windows: TemporalWindow[]): number;
452
897
 
898
+ /**
899
+ * Temporal Decision Manager
900
+ *
901
+ * Decides when to call LLM vs. reuse previous result for high-frequency validation.
902
+ * Reduces LLM calls by 98.5% while maintaining accuracy through temporal coherence.
903
+ *
904
+ * **Research:** Based on arXiv:2406.12125 - "Efficient Sequential Decision Making with Large Language Models"
905
+ *
906
+ * **Core Insight:** Don't prompt on every state change, prompt when decision is needed.
907
+ *
908
+ * **Note:** Implementation is obfuscated to protect proprietary algorithms, but API is fully documented.
909
+ *
910
+ * @example
911
+ * ```typescript
912
+ * const manager = new TemporalDecisionManager({
913
+ * minNotesForPrompt: 3,
914
+ * coherenceThreshold: 0.5,
915
+ * urgencyThreshold: 0.3
916
+ * });
917
+ *
918
+ * const decision = await manager.shouldPrompt(
919
+ * currentState,
920
+ * previousState,
921
+ * temporalNotes,
922
+ * context
923
+ * );
924
+ *
925
+ * if (decision.shouldPrompt) {
926
+ * // Call LLM
927
+ * } else {
928
+ * // Reuse previous result
929
+ * }
930
+ * ```
931
+ */
932
+ export class TemporalDecisionManager {
933
+ /**
934
+ * Create a new Temporal Decision Manager.
935
+ *
936
+ * @param options - Decision manager options
937
+ * @param options.minNotesForPrompt - Minimum notes before prompting (default: 3)
938
+ * @param options.coherenceThreshold - Coherence threshold for prompting (default: 0.5)
939
+ * @param options.urgencyThreshold - Urgency threshold for prompting (default: 0.3)
940
+ * @param options.maxWaitTime - Maximum wait time before forcing prompt (default: 10000ms)
941
+ * @param options.stateChangeThreshold - State change threshold for prompting (default: 0.2)
942
+ * @param options.warmStartSteps - Use LLM for first N steps (default: 10)
943
+ * @param options.adaptiveSampling - Enable adaptive sampling (default: true)
944
+ */
945
+ constructor(options?: {
946
+ minNotesForPrompt?: number;
947
+ coherenceThreshold?: number;
948
+ urgencyThreshold?: number;
949
+ maxWaitTime?: number;
950
+ stateChangeThreshold?: number;
951
+ warmStartSteps?: number;
952
+ adaptiveSampling?: boolean;
953
+ });
954
+
955
+ /**
956
+ * Decide if we should prompt now or wait for more context.
957
+ *
958
+ * @param currentState - Current state object
959
+ * @param previousState - Previous state object (if any)
960
+ * @param temporalNotes - Array of temporal notes
961
+ * @param context - Additional context
962
+ * @returns Decision object with shouldPrompt, reason, and urgency
963
+ */
964
+ shouldPrompt(
965
+ currentState: Record<string, unknown>,
966
+ previousState: Record<string, unknown> | null,
967
+ temporalNotes: TemporalNote[],
968
+ context?: Record<string, unknown>
969
+ ): Promise<{
970
+ shouldPrompt: boolean;
971
+ reason: string;
972
+ urgency: 'low' | 'medium' | 'high';
973
+ }>;
974
+
975
+ /**
976
+ * Calculate state change magnitude.
977
+ *
978
+ * @param currentState - Current state
979
+ * @param previousState - Previous state
980
+ * @returns State change score (0-1)
981
+ */
982
+ calculateStateChange(
983
+ currentState: Record<string, unknown>,
984
+ previousState: Record<string, unknown> | null
985
+ ): number;
986
+
987
+ /**
988
+ * Check if current state is a decision point.
989
+ *
990
+ * @param currentState - Current state
991
+ * @param context - Additional context
992
+ * @returns True if decision point
993
+ */
994
+ isDecisionPoint(
995
+ currentState: Record<string, unknown>,
996
+ context?: Record<string, unknown>
997
+ ): boolean;
998
+
999
+ /**
1000
+ * Check if there's a recent user action.
1001
+ *
1002
+ * @param temporalNotes - Array of temporal notes
1003
+ * @param context - Additional context
1004
+ * @returns True if recent user action detected
1005
+ */
1006
+ hasRecentUserAction(
1007
+ temporalNotes: TemporalNote[],
1008
+ context?: Record<string, unknown>
1009
+ ): boolean;
1010
+ }
1011
+
1012
+ /**
1013
+ * Create a temporal decision manager with default options.
1014
+ *
1015
+ * @param options - Decision manager options
1016
+ * @returns New TemporalDecisionManager instance
1017
+ */
1018
+ export function createTemporalDecisionManager(options?: {
1019
+ minNotesForPrompt?: number;
1020
+ coherenceThreshold?: number;
1021
+ urgencyThreshold?: number;
1022
+ maxWaitTime?: number;
1023
+ stateChangeThreshold?: number;
1024
+ warmStartSteps?: number;
1025
+ adaptiveSampling?: boolean;
1026
+ }): TemporalDecisionManager;
1027
+
1028
+ /**
1029
+ * Temporal Preprocessing Manager
1030
+ *
1031
+ * Optimizes temporal note processing for high-frequency validation (10-60Hz).
1032
+ * Implements activity-based preprocessing patterns to reduce computational overhead.
1033
+ *
1034
+ * **Note:** Implementation is obfuscated to protect proprietary algorithms, but API is fully documented.
1035
+ *
1036
+ * @example
1037
+ * ```typescript
1038
+ * const manager = new TemporalPreprocessingManager({
1039
+ * activityThreshold: 0.5,
1040
+ * highFrequencyMode: true
1041
+ * });
1042
+ *
1043
+ * const processed = await manager.preprocess(temporalNotes, context);
1044
+ * ```
1045
+ */
1046
+ export class TemporalPreprocessingManager {
1047
+ /**
1048
+ * Create a new Temporal Preprocessing Manager.
1049
+ *
1050
+ * @param options - Preprocessing options
1051
+ */
1052
+ constructor(options?: Record<string, unknown>);
1053
+
1054
+ /**
1055
+ * Preprocess temporal notes for efficient handling.
1056
+ *
1057
+ * @param notes - Array of temporal notes
1058
+ * @param context - Additional context
1059
+ * @returns Processed notes
1060
+ */
1061
+ preprocess(
1062
+ notes: TemporalNote[],
1063
+ context?: Record<string, unknown>
1064
+ ): Promise<TemporalNote[]>;
1065
+ }
1066
+
1067
+ /**
1068
+ * Adaptive Temporal Processor
1069
+ *
1070
+ * Adaptively processes temporal notes based on activity patterns.
1071
+ *
1072
+ * @example
1073
+ * ```typescript
1074
+ * const processor = new AdaptiveTemporalProcessor();
1075
+ * const processed = await processor.process(notes, context);
1076
+ * ```
1077
+ */
1078
+ export class AdaptiveTemporalProcessor {
1079
+ /**
1080
+ * Create a new Adaptive Temporal Processor.
1081
+ *
1082
+ * @param options - Processor options
1083
+ */
1084
+ constructor(options?: Record<string, unknown>);
1085
+
1086
+ /**
1087
+ * Process temporal notes adaptively.
1088
+ *
1089
+ * @param notes - Array of temporal notes
1090
+ * @param context - Additional context
1091
+ * @returns Processed notes
1092
+ */
1093
+ process(
1094
+ notes: TemporalNote[],
1095
+ context?: Record<string, unknown>
1096
+ ): Promise<TemporalNote[]>;
1097
+ }
1098
+
1099
+ /**
1100
+ * Create a temporal preprocessing manager with default options.
1101
+ *
1102
+ * @param options - Preprocessing options
1103
+ * @returns New TemporalPreprocessingManager instance
1104
+ */
1105
+ export function createTemporalPreprocessingManager(options?: Record<string, unknown>): TemporalPreprocessingManager;
1106
+
1107
+ /**
1108
+ * Create an adaptive temporal processor with default options.
1109
+ *
1110
+ * @param options - Processor options
1111
+ * @returns New AdaptiveTemporalProcessor instance
1112
+ */
1113
+ export function createAdaptiveTemporalProcessor(options?: Record<string, unknown>): AdaptiveTemporalProcessor;
1114
+
453
1115
  // Cache Types
454
1116
  export interface CacheStats {
455
1117
  hits: number;
@@ -459,16 +1121,77 @@ export interface CacheStats {
459
1121
  }
460
1122
 
461
1123
  // Cache Functions
1124
+ /**
1125
+ * Initialize cache system.
1126
+ *
1127
+ * Sets up file-based caching with a 7-day TTL. The cache persists across
1128
+ * process restarts and reduces API costs by serving cached results.
1129
+ *
1130
+ * @param cacheDir - Cache directory path (default: `.cache/ai-visual-test`)
1131
+ *
1132
+ * @example
1133
+ * ```typescript
1134
+ * initCache('/tmp/my-cache');
1135
+ * const result = await validateScreenshot('screenshot.png', 'Evaluate');
1136
+ * // Subsequent calls with same screenshot/prompt use cache
1137
+ * ```
1138
+ */
462
1139
  export function initCache(cacheDir?: string): void;
1140
+
1141
+ /**
1142
+ * Generate cache key for validation request.
1143
+ *
1144
+ * Creates a SHA-256 hash of the image path, prompt, and context for cache lookup.
1145
+ *
1146
+ * @param imagePath - Screenshot path
1147
+ * @param prompt - Evaluation prompt
1148
+ * @param context - Validation context
1149
+ * @returns Cache key string
1150
+ */
463
1151
  export function generateCacheKey(imagePath: string, prompt: string, context?: ValidationContext): string;
1152
+
1153
+ /**
1154
+ * Get cached validation result.
1155
+ *
1156
+ * @param imagePath - Screenshot path
1157
+ * @param prompt - Evaluation prompt
1158
+ * @param context - Validation context
1159
+ * @returns Cached ValidationResult or null if not cached
1160
+ */
464
1161
  export function getCached(imagePath: string, prompt: string, context?: ValidationContext): ValidationResult | null;
1162
+
1163
+ /**
1164
+ * Cache validation result.
1165
+ *
1166
+ * @param imagePath - Screenshot path
1167
+ * @param prompt - Evaluation prompt
1168
+ * @param context - Validation context
1169
+ * @param result - Validation result to cache
1170
+ */
465
1171
  export function setCached(
466
1172
  imagePath: string,
467
1173
  prompt: string,
468
1174
  context: ValidationContext,
469
1175
  result: ValidationResult
470
1176
  ): void;
1177
+
1178
+ /**
1179
+ * Clear all cached results.
1180
+ */
471
1181
  export function clearCache(): void;
1182
+
1183
+ /**
1184
+ * Get cache statistics.
1185
+ *
1186
+ * @returns Cache stats (hits, misses, size, hit rate)
1187
+ *
1188
+ * @example
1189
+ * ```typescript
1190
+ * const stats = getCacheStats();
1191
+ * console.log(`Hit rate: ${stats.hitRate * 100}%`); // 85%
1192
+ * console.log(`Cache size: ${stats.size}`); // 123
1193
+ * ```
1194
+ */
472
1195
  export function getCacheStats(): CacheStats;
473
1196
 
474
1197
  // Config Functions
@@ -507,10 +1230,58 @@ export class ScoreTracker {
507
1230
  }
508
1231
 
509
1232
  // BatchOptimizer Class
1233
+ /**
1234
+ * Batch Optimizer
1235
+ *
1236
+ * Optimizes validation of multiple screenshots by batching requests,
1237
+ * managing concurrency, and caching results.
1238
+ *
1239
+ * **Use when:** You need to validate multiple screenshots efficiently.
1240
+ *
1241
+ * @example
1242
+ * ```typescript
1243
+ * const optimizer = new BatchOptimizer({
1244
+ * maxConcurrency: 5,
1245
+ * batchSize: 10,
1246
+ * cacheEnabled: true
1247
+ * });
1248
+ *
1249
+ * const results = await optimizer.batchValidate(
1250
+ * ['screenshot1.png', 'screenshot2.png', 'screenshot3.png'],
1251
+ * 'Evaluate accessibility'
1252
+ * );
1253
+ *
1254
+ * console.log(results.length); // 3
1255
+ * ```
1256
+ */
510
1257
  export class BatchOptimizer {
1258
+ /**
1259
+ * Create a new Batch Optimizer instance.
1260
+ *
1261
+ * @param options - Optimizer options (maxConcurrency, batchSize, cacheEnabled)
1262
+ */
511
1263
  constructor(options?: { maxConcurrency?: number; batchSize?: number; cacheEnabled?: boolean });
1264
+
1265
+ /**
1266
+ * Validate multiple screenshots in batch.
1267
+ *
1268
+ * @param imagePaths - Single path or array of paths (arrays of arrays for comparison are not accepted by the declared `string | string[]` type)
1269
+ * @param prompt - Evaluation prompt
1270
+ * @param context - Optional validation context
1271
+ * @returns Promise resolving to array of ValidationResults
1272
+ */
512
1273
  batchValidate(imagePaths: string | string[], prompt: string, context?: ValidationContext): Promise<ValidationResult[]>;
1274
+
1275
+ /**
1276
+ * Clear batch optimizer cache.
1277
+ */
513
1278
  clearCache(): void;
1279
+
1280
+ /**
1281
+ * Get cache statistics.
1282
+ *
1283
+ * @returns Cache stats (size, queue length, active requests)
1284
+ */
514
1285
  getCacheStats(): { cacheSize: number; queueLength: number; activeRequests: number };
515
1286
  }
516
1287
 
@@ -775,18 +1546,76 @@ export interface StateValidationResult<T = unknown> extends ValidationResult {
775
1546
  matches: boolean;
776
1547
  }
777
1548
 
1549
+ /**
1550
+ * State Validator
1551
+ *
1552
+ * Validates that the visual state matches the expected state using VLLM extraction.
1553
+ * Extracts state from screenshot and compares with expected state.
1554
+ *
1555
+ * **Use when:** You need to verify specific state values (cart count, button text, etc.)
1556
+ *
1557
+ * @example
1558
+ * ```typescript
1559
+ * const validator = new StateValidator();
1560
+ *
1561
+ * const result = await validator.validateState(
1562
+ * 'checkout.png',
1563
+ * {
1564
+ * cartCount: 1,
1565
+ * buttonText: 'Checkout'
1566
+ * },
1567
+ * {
1568
+ * testType: 'cart-state'
1569
+ * }
1570
+ * );
1571
+ *
1572
+ * console.log(result.matches); // true/false
1573
+ * console.log(result.discrepancies); // ['cartCount: expected 1, got 2']
1574
+ * ```
1575
+ */
778
1576
  export class StateValidator<T = unknown> {
1577
+ /**
1578
+ * Create a new State Validator instance.
1579
+ *
1580
+ * @param options - Validator options (tolerance, state extractor, etc.)
1581
+ */
779
1582
  constructor(options?: StateValidatorOptions<T>);
1583
+
1584
+ /**
1585
+ * Validate state (static method).
1586
+ *
1587
+ * @param screenshotPath - Path to screenshot or array for comparison
1588
+ * @param expectedState - Expected state object
1589
+ * @param options - Validation options
1590
+ * @returns Promise resolving to StateValidationResult
1591
+ */
780
1592
  static validate<T = unknown>(
781
1593
  screenshotPath: string | string[],
782
1594
  expectedState: T,
783
1595
  options?: StateValidationOptions<T>
784
1596
  ): Promise<StateValidationResult<T>>;
1597
+
1598
+ /**
1599
+ * Validate state matches expected state.
1600
+ *
1601
+ * @param screenshotPath - Path to screenshot or array for comparison
1602
+ * @param expectedState - Expected state object
1603
+ * @param options - Validation options
1604
+ * @returns Promise resolving to StateValidationResult
1605
+ */
785
1606
  validateState(
786
1607
  screenshotPath: string | string[],
787
1608
  expectedState: T,
788
1609
  options?: StateValidationOptions<T>
789
1610
  ): Promise<StateValidationResult<T>>;
1611
+
1612
+ /**
1613
+ * Build state validation prompt.
1614
+ *
1615
+ * @param expectedState - Expected state object
1616
+ * @param options - Validation options
1617
+ * @returns Validation prompt string
1618
+ */
790
1619
  buildStatePrompt(expectedState: T, options?: StateValidationOptions<T>): string;
791
1620
  }
792
1621
 
@@ -820,22 +1649,90 @@ export interface AccessibilityResult extends ValidationResult {
820
1649
  standards: string[];
821
1650
  }
822
1651
 
1652
+ /**
1653
+ * Accessibility Validator
1654
+ *
1655
+ * Validates accessibility using VLLM semantic evaluation.
1656
+ * Checks contrast, labels, keyboard navigation, error messages, and WCAG compliance.
1657
+ *
1658
+ * **Use when:** You need comprehensive accessibility validation beyond programmatic checks.
1659
+ *
1660
+ * @example
1661
+ * ```typescript
1662
+ * const validator = new AccessibilityValidator({
1663
+ * minContrast: 4.5,
1664
+ * standards: ['WCAG-AA']
1665
+ * });
1666
+ *
1667
+ * const result = await validator.validateAccessibility(
1668
+ * 'payment-form.png',
1669
+ * {
1670
+ * testType: 'accessibility'
1671
+ * }
1672
+ * );
1673
+ *
1674
+ * console.log(result.passes); // true/false
1675
+ * console.log(result.violations.zeroTolerance); // Critical violations
1676
+ * ```
1677
+ */
823
1678
  export class AccessibilityValidator {
1679
+ /**
1680
+ * Create a new Accessibility Validator instance.
1681
+ *
1682
+ * @param options - Validator options (minContrast, standards, etc.)
1683
+ */
824
1684
  constructor(options?: AccessibilityValidatorOptions);
1685
+
1686
+ /**
1687
+ * Validate accessibility (static method).
1688
+ *
1689
+ * @param screenshotPath - Path to screenshot or array for comparison
1690
+ * @param options - Validation options
1691
+ * @returns Promise resolving to AccessibilityResult
1692
+ */
825
1693
  static validate(
826
1694
  screenshotPath: string | string[],
827
1695
  options?: AccessibilityOptions
828
1696
  ): Promise<AccessibilityResult>;
1697
+
1698
+ /**
1699
+ * Validate accessibility of screenshot.
1700
+ *
1701
+ * @param screenshotPath - Path to screenshot or array for comparison
1702
+ * @param options - Validation options
1703
+ * @returns Promise resolving to AccessibilityResult
1704
+ */
829
1705
  validateAccessibility(
830
1706
  screenshotPath: string | string[],
831
1707
  options?: AccessibilityOptions
832
1708
  ): Promise<AccessibilityResult>;
1709
+
1710
+ /**
1711
+ * Build accessibility validation prompt.
1712
+ *
1713
+ * @param options - Validation options
1714
+ * @returns Validation prompt string
1715
+ */
833
1716
  buildAccessibilityPrompt(options?: AccessibilityOptions): string;
1717
+
1718
+ /**
1719
+ * Detect accessibility violations from validation result.
1720
+ *
1721
+ * @param result - Validation result
1722
+ * @returns Categorized violations (zeroTolerance, critical, warnings)
1723
+ */
834
1724
  detectViolations(result: ValidationResult): {
835
1725
  zeroTolerance: string[];
836
1726
  critical: string[];
837
1727
  warnings: string[];
838
1728
  };
1729
+
1730
+ /**
1731
+ * Extract contrast information from validation result.
1732
+ *
1733
+ * @param result - Validation result
1734
+ * @returns Contrast ratios and compliance status
1735
+ */
839
1736
  extractContrastInfo(result: ValidationResult): {
840
1737
  ratios: string[];
841
1738
  minRatio: number | null;