@arclabs561/ai-visual-test 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93) hide show
  1. package/.secretsignore.example +20 -0
  2. package/CHANGELOG.md +360 -0
  3. package/CONTRIBUTING.md +63 -0
  4. package/DEPLOYMENT.md +80 -0
  5. package/LICENSE +22 -0
  6. package/README.md +142 -0
  7. package/SECURITY.md +108 -0
  8. package/api/health.js +34 -0
  9. package/api/validate.js +252 -0
  10. package/index.d.ts +1221 -0
  11. package/package.json +112 -0
  12. package/public/index.html +149 -0
  13. package/src/batch-optimizer.mjs +451 -0
  14. package/src/bias-detector.mjs +370 -0
  15. package/src/bias-mitigation.mjs +233 -0
  16. package/src/cache.mjs +433 -0
  17. package/src/config.mjs +268 -0
  18. package/src/constants.mjs +80 -0
  19. package/src/context-compressor.mjs +350 -0
  20. package/src/convenience.mjs +617 -0
  21. package/src/cost-tracker.mjs +257 -0
  22. package/src/cross-modal-consistency.mjs +170 -0
  23. package/src/data-extractor.mjs +232 -0
  24. package/src/dynamic-few-shot.mjs +140 -0
  25. package/src/dynamic-prompts.mjs +361 -0
  26. package/src/ensemble/index.mjs +53 -0
  27. package/src/ensemble-judge.mjs +366 -0
  28. package/src/error-handler.mjs +67 -0
  29. package/src/errors.mjs +167 -0
  30. package/src/experience-propagation.mjs +128 -0
  31. package/src/experience-tracer.mjs +487 -0
  32. package/src/explanation-manager.mjs +299 -0
  33. package/src/feedback-aggregator.mjs +248 -0
  34. package/src/game-goal-prompts.mjs +478 -0
  35. package/src/game-player.mjs +548 -0
  36. package/src/hallucination-detector.mjs +155 -0
  37. package/src/helpers/playwright.mjs +80 -0
  38. package/src/human-validation-manager.mjs +516 -0
  39. package/src/index.mjs +364 -0
  40. package/src/judge.mjs +929 -0
  41. package/src/latency-aware-batch-optimizer.mjs +192 -0
  42. package/src/load-env.mjs +159 -0
  43. package/src/logger.mjs +55 -0
  44. package/src/metrics.mjs +187 -0
  45. package/src/model-tier-selector.mjs +221 -0
  46. package/src/multi-modal/index.mjs +36 -0
  47. package/src/multi-modal-fusion.mjs +190 -0
  48. package/src/multi-modal.mjs +524 -0
  49. package/src/natural-language-specs.mjs +1071 -0
  50. package/src/pair-comparison.mjs +277 -0
  51. package/src/persona/index.mjs +42 -0
  52. package/src/persona-enhanced.mjs +200 -0
  53. package/src/persona-experience.mjs +572 -0
  54. package/src/position-counterbalance.mjs +140 -0
  55. package/src/prompt-composer.mjs +375 -0
  56. package/src/render-change-detector.mjs +583 -0
  57. package/src/research-enhanced-validation.mjs +436 -0
  58. package/src/retry.mjs +152 -0
  59. package/src/rubrics.mjs +231 -0
  60. package/src/score-tracker.mjs +277 -0
  61. package/src/smart-validator.mjs +447 -0
  62. package/src/spec-config.mjs +106 -0
  63. package/src/spec-templates.mjs +347 -0
  64. package/src/specs/index.mjs +38 -0
  65. package/src/temporal/index.mjs +102 -0
  66. package/src/temporal-adaptive.mjs +163 -0
  67. package/src/temporal-batch-optimizer.mjs +222 -0
  68. package/src/temporal-constants.mjs +69 -0
  69. package/src/temporal-context.mjs +49 -0
  70. package/src/temporal-decision-manager.mjs +271 -0
  71. package/src/temporal-decision.mjs +669 -0
  72. package/src/temporal-errors.mjs +58 -0
  73. package/src/temporal-note-pruner.mjs +173 -0
  74. package/src/temporal-preprocessor.mjs +543 -0
  75. package/src/temporal-prompt-formatter.mjs +219 -0
  76. package/src/temporal-validation.mjs +159 -0
  77. package/src/temporal.mjs +415 -0
  78. package/src/type-guards.mjs +311 -0
  79. package/src/uncertainty-reducer.mjs +470 -0
  80. package/src/utils/index.mjs +175 -0
  81. package/src/validation-framework.mjs +321 -0
  82. package/src/validation-result-normalizer.mjs +64 -0
  83. package/src/validation.mjs +243 -0
  84. package/src/validators/accessibility-programmatic.mjs +345 -0
  85. package/src/validators/accessibility-validator.mjs +223 -0
  86. package/src/validators/batch-validator.mjs +143 -0
  87. package/src/validators/hybrid-validator.mjs +268 -0
  88. package/src/validators/index.mjs +34 -0
  89. package/src/validators/prompt-builder.mjs +218 -0
  90. package/src/validators/rubric.mjs +85 -0
  91. package/src/validators/state-programmatic.mjs +260 -0
  92. package/src/validators/state-validator.mjs +291 -0
  93. package/vercel.json +27 -0
@@ -0,0 +1,572 @@
1
+ /**
2
+ * Persona-Based Page Experience Testing
3
+ *
4
+ * Tests page experience from different persona perspectives with human-interpreted time scales.
5
+ *
6
+ * Not just gameplay - any page experience can be tested with personas.
7
+ * Time scales are human-interpreted (reading time, interaction time, etc.) not mechanical fps.
8
+ */
9
+
10
+ import { warn, log } from './logger.mjs';
11
+ import { trackPropagation } from './experience-propagation.mjs';
12
+ import { checkCrossModalConsistency } from './cross-modal-consistency.mjs';
13
+
14
// Lazily resolved `generateGamePrompt` export of ./game-goal-prompts.mjs.
// Stays null until the first successful load, then acts as a cache.
let generateGamePrompt = null;

/**
 * Lazily load `generateGamePrompt` from ./game-goal-prompts.mjs.
 *
 * The module is imported on first use and the resolved function is cached in
 * the module-level `generateGamePrompt` binding. Loading is best-effort: when
 * the import fails, or the module does not export a function under that name,
 * the caller gets `null` and a later call will try again.
 *
 * @returns {Promise<Function|null>} The prompt generator, or null when unavailable
 */
async function getGenerateGamePrompt() {
  if (generateGamePrompt) {
    return generateGamePrompt;
  }

  try {
    const module = await import('./game-goal-prompts.mjs');
    // Only cache a real function so callers never receive `undefined`.
    if (typeof module.generateGamePrompt === 'function') {
      generateGamePrompt = module.generateGamePrompt;
    }
  } catch (error) {
    // Still best-effort, but leave a trace so a broken install is diagnosable.
    warn(`[Experience] Could not load game-goal-prompts.mjs: ${error.message}`);
    return null;
  }

  return generateGamePrompt;
}
27
+
28
// Viewport presets per device class.
//
// Viewport sizes:
// - mobile: 375x667 (iPhone SE - smallest common mobile)
// - tablet: 768x1024 (iPad - standard tablet)
// - desktop: 1280x720 (standard desktop resolution)
//
// DON'T CHANGE VIEWPORT SIZES without:
// - Understanding why these sizes were chosen
// - Testing with different viewports
// - Validating persona diversity tests
const PERSONA_DEVICE_VIEWPORTS = Object.freeze({
  mobile: { width: 375, height: 667 },
  tablet: { width: 768, height: 1024 },
  desktop: { width: 1280, height: 720 }
});

/**
 * Make free-form text (persona name, step label, goal) safe to embed in a
 * single file-name segment: whitespace and characters that are path separators
 * or reserved on common filesystems collapse to "-".
 */
function toScreenshotPathSegment(text) {
  const segment = String(text ?? '')
    .replace(/[\s\\/:*?"<>|]+/g, '-')
    .replace(/^-+|-+$/g, '');
  return segment || 'unnamed';
}

/**
 * Read one snapshot of the page's observable state. The same fields are read
 * for every snapshot so the shape of `pageState` does not depend on whether
 * the persona performed any interactions.
 */
function readPageState(page) {
  return page.evaluate(() => {
    return {
      title: document.title,
      h1: document.querySelector('h1')?.textContent || '',
      description: document.querySelector('meta[name="description"]')?.content || '',
      viewport: { width: window.innerWidth, height: window.innerHeight },
      darkMode: document.documentElement.classList.contains('dark') ||
        window.matchMedia('(prefers-color-scheme: dark)').matches,
      activeElement: document.activeElement?.tagName || null
    };
  });
}

/**
 * Experience a page from a persona's perspective
 *
 * Flow: set viewport -> navigate -> wait a human-scaled "first look" time ->
 * capture rendered code and page state -> wait a reading time -> for every
 * persona goal simulate an interaction -> aggregate the collected notes.
 * Screenshots, trace events and propagation tracking are recorded along the way.
 *
 * @param {any} page - Playwright page object
 * @param {import('./index.mjs').Persona} persona - Persona configuration
 * @param {import('./index.mjs').PersonaExperienceOptions} [options={}] - Experience options
 * @returns {Promise<import('./index.mjs').PersonaExperienceResult>} Experience result with notes, screenshots, and evaluation
 * @throws {TypeError} When `persona` is not an object
 */
export async function experiencePageAsPersona(page, persona, options = {}) {
  if (!persona || typeof persona !== 'object') {
    throw new TypeError('experiencePageAsPersona: persona must be an object');
  }

  const {
    viewport = { width: 1280, height: 720 },
    device = 'desktop',
    darkMode = false,
    timeScale = 'human', // 'human' (reading/interaction time) or 'mechanical' (fps)
    captureScreenshots = true,
    captureState = true,
    captureCode = true,
    notes = [],
    trace = null // Optional ExperienceTrace instance
  } = options;

  const experienceNotes = [...notes];
  const screenshots = [];
  const startTime = Date.now();

  // BUG FIX (2025-01): Viewports were only set if persona.device existed.
  // This caused mobile/tablet personas to get desktop viewports (1280x720) when
  // persona.device was not set but options.device was.
  // The fix: Check both persona.device AND options.device
  const deviceToUse = persona.device || device;

  // File-name prefix for this persona's screenshots (lower-cased, path-safe).
  const personaSlug = toScreenshotPathSegment(String(persona.name ?? 'persona').toLowerCase());

  // If trace provided, add initial event
  if (trace) {
    trace.addEvent('experience-start', {
      persona: persona.name,
      viewport,
      device,
      timeScale
    });
  }

  // Helper to capture screenshot at current state
  const captureScreenshotNow = async (step, description) => {
    if (!captureScreenshots) return null;

    const timestamp = Date.now();
    const elapsed = timestamp - startTime;
    // `step` may contain free-form goal text; sanitize it so characters such
    // as "/" or ":" cannot turn the file name into an invalid or nested path.
    const screenshotPath = `test-results/persona-${personaSlug}-${toScreenshotPathSegment(step)}-${timestamp}.png`;

    try {
      await page.screenshot({ path: screenshotPath, fullPage: true });
      screenshots.push({
        path: screenshotPath,
        timestamp,
        elapsed,
        step, // unsanitized label, kept as-is for consumers
        description
      });

      // Add to trace if available
      if (trace) {
        trace.addScreenshot(screenshotPath, description || step);
      }

      return screenshotPath;
    } catch (error) {
      // Screenshot capture is optional: report the failure and carry on.
      warn(`[Experience] Screenshot "${step}" failed: ${error.message}`);
      return null;
    }
  };

  // Set viewport based on persona device preference; unknown device names
  // fall back to the explicit `viewport` option.
  const hasPreset = Boolean(deviceToUse) &&
    Object.prototype.hasOwnProperty.call(PERSONA_DEVICE_VIEWPORTS, deviceToUse);
  await page.setViewportSize(hasPreset ? { ...PERSONA_DEVICE_VIEWPORTS[deviceToUse] } : viewport);

  // Honor the `darkMode` option by emulating a dark color scheme. Only applied
  // when requested, and only when the page object supports it, so default
  // callers and minimal page doubles are unaffected.
  if (darkMode && typeof page.emulateMedia === 'function') {
    await page.emulateMedia({ colorScheme: 'dark' });
  }

  // Navigate to page
  await page.goto(options.url || options.baseURL || 'about:blank', {
    waitUntil: 'domcontentloaded'
  });

  // Capture screenshot immediately after page load
  // (captureScreenshotNow also records it on the trace)
  const pageLoadScreenshot = await captureScreenshotNow('page-load', 'Page loaded');

  // Step 1: Initial page load experience (human time scale)
  const initialLoadTime = await humanTimeScale('page-load', {
    minTime: 1000, // Minimum 1 second to read page
    maxTime: 5000, // Maximum 5 seconds for slow readers
    timeScale,
    persona
  });

  await page.waitForTimeout(initialLoadTime);

  // Capture after initial reading time
  await captureScreenshotNow('after-initial-read', 'After initial reading time');

  // Extract initial state
  let renderedCode = null;
  let pageState = null;

  if (captureCode) {
    renderedCode = await extractRenderedCode(page);
    // Track HTML/CSS capture
    trackPropagation('capture', { renderedCode }, 'Captured HTML/CSS from page');
  }

  if (captureState) {
    pageState = await readPageState(page);
  }

  // Persona's initial observation
  // Preserve more HTML/CSS context (increased from 500 to 2000 chars, and always include critical CSS/DOM)
  const initialNote = {
    step: 'initial_experience',
    persona: persona.name,
    device: deviceToUse,
    viewport: await page.viewportSize(),
    observation: `Arrived at page - ${pageState?.title || 'unknown'}`,
    pageState,
    renderedCode: renderedCode ? {
      html: renderedCode.html?.substring(0, 2000), // Increased from 500 to 2000
      criticalCSS: renderedCode.criticalCSS, // Always preserve CSS
      domStructure: renderedCode.domStructure // Always preserve DOM structure
    } : null,
    timestamp: Date.now(),
    elapsed: Date.now() - startTime
  };
  experienceNotes.push(initialNote);

  // Track HTML/CSS in notes
  trackPropagation('notes', { renderedCode: initialNote.renderedCode, pageState: initialNote.pageState }, 'Added HTML/CSS to experience notes');

  // Check cross-modal consistency
  if (captureScreenshots && renderedCode) {
    const initialConsistency = checkCrossModalConsistency({
      screenshot: pageLoadScreenshot,
      renderedCode,
      pageState
    });
    if (!initialConsistency.isConsistent && initialConsistency.issues.length > 0) {
      warn(`[Experience] Cross-modal consistency issues: ${initialConsistency.issues.join(', ')}`);
    }
  }

  // Add to trace if available
  if (trace) {
    trace.addEvent('observation', {
      step: 'initial_experience',
      observation: initialNote.observation,
      pageState: initialNote.pageState,
      renderedCode: initialNote.renderedCode
    });
    if (pageState) {
      trace.addStateSnapshot(pageState, 'initial_experience');
    }
  }

  // Step 2: Reading/scanning experience (human time scale)
  if (trace) {
    trace.addEvent('observation', {
      step: 'before-reading',
      observation: 'About to read/scan page content'
    });
  }
  await captureScreenshotNow('before-reading', 'Before reading/scanning');

  const readingTime = await humanTimeScale('reading', {
    minTime: 2000, // Minimum 2 seconds to read/scan
    maxTime: 10000, // Maximum 10 seconds for thorough reading
    timeScale,
    contentLength: pageState?.h1?.length || 0,
    persona
  });

  await page.waitForTimeout(readingTime);

  // Capture after reading
  if (trace) {
    trace.addEvent('observation', {
      step: 'after-reading',
      observation: 'Finished reading/scanning page content'
    });
  }
  await captureScreenshotNow('after-reading', 'After reading/scanning');

  // Step 3: Interaction experience (if persona has goals)
  if (persona.goals && persona.goals.length > 0) {
    for (const goal of persona.goals) {
      // Capture before interaction
      if (trace) {
        trace.addEvent('interaction', {
          step: `before-${goal}`,
          goal,
          observation: `Preparing to ${goal}`
        });
      }
      await captureScreenshotNow(`before-${goal}`, `Before ${goal}`);

      const interactionTime = await humanTimeScale('interaction', {
        minTime: 500, // Minimum 0.5 seconds to interact
        maxTime: 3000, // Maximum 3 seconds for complex interactions
        timeScale,
        interactionType: goal,
        persona
      });

      // Simulate persona trying to achieve goal
      // This is extensible - different personas interact differently
      await simulatePersonaInteraction(page, persona, goal);

      // Capture immediately after interaction (before delay)
      if (trace) {
        trace.addEvent('interaction', {
          step: `during-${goal}`,
          goal,
          observation: `Performing ${goal}`
        });
      }
      await captureScreenshotNow(`during-${goal}`, `During ${goal}`);

      await page.waitForTimeout(interactionTime);

      // Capture after interaction delay
      if (trace) {
        trace.addEvent('interaction', {
          step: `after-${goal}`,
          goal,
          observation: `Completed ${goal}`
        });
      }
      await captureScreenshotNow(`after-${goal}`, `After ${goal}`);

      // Update state (same fields as the initial snapshot)
      if (captureState) {
        pageState = await readPageState(page);
      }

      const interactionNote = {
        step: `interaction_${goal}`,
        persona: persona.name,
        goal,
        observation: `Attempted to ${goal}`,
        pageState,
        timestamp: Date.now(),
        elapsed: Date.now() - startTime
      };
      experienceNotes.push(interactionNote);

      // Add to trace if available
      if (trace) {
        trace.addEvent('interaction', {
          step: `interaction_${goal}`,
          goal,
          observation: interactionNote.observation,
          pageState: interactionNote.pageState
        });
        if (pageState) {
          trace.addStateSnapshot(pageState, `after-${goal}`);
        }
      }
    }
  }

  // Capture final state (captureScreenshotNow also records it on the trace)
  await captureScreenshotNow('final-state', 'Final state');

  // Add final event to trace
  if (trace) {
    trace.addEvent('experience-end', {
      duration: Date.now() - startTime,
      noteCount: experienceNotes.length,
      screenshotCount: screenshots.length
    });
  }

  // Track final propagation
  trackPropagation('experience-complete', {
    renderedCode,
    pageState,
    screenshot: screenshots.length > 0 ? screenshots[0].path : null
  }, 'Experience complete');

  // Final consistency check (against the most recent screenshot)
  let consistency = null;
  if (captureScreenshots && renderedCode && screenshots.length > 0) {
    consistency = checkCrossModalConsistency({
      screenshot: screenshots[screenshots.length - 1].path,
      renderedCode,
      pageState
    });
  }

  // Automatically aggregate temporal notes (use fixed temporal system)
  let aggregated = null;
  let aggregatedMultiScale = null;
  if (experienceNotes.length > 0) {
    try {
      const { aggregateTemporalNotes } = await import('./temporal.mjs');
      const { aggregateMultiScale } = await import('./temporal-decision.mjs');

      // Standard temporal aggregation
      aggregated = aggregateTemporalNotes(experienceNotes, {
        windowSize: 10000, // 10 second windows
        decayFactor: 0.9
      });

      // Multi-scale aggregation for richer analysis
      // Always return multi-scale result (even if empty) for consistency
      try {
        aggregatedMultiScale = aggregateMultiScale(experienceNotes, {
          attentionWeights: true
        });
        // Ensure it has the expected structure
        if (!aggregatedMultiScale.scales) {
          aggregatedMultiScale.scales = {};
        }
        if (!aggregatedMultiScale.coherence) {
          aggregatedMultiScale.coherence = {};
        }
      } catch (error) {
        // Return empty multi-scale result instead of null
        warn(`[Experience] Multi-scale aggregation failed: ${error.message}`);
        aggregatedMultiScale = {
          scales: {},
          summary: 'Multi-scale aggregation failed',
          coherence: {}
        };
      }

      // Optional chaining: a result without `windows` must not be reported
      // as a failed aggregation by the catch below.
      trackPropagation('temporal-aggregation', {
        windows: aggregated?.windows?.length ?? 0,
        coherence: aggregated?.coherence,
        scales: Object.keys(aggregatedMultiScale.scales || {})
      }, 'Aggregated temporal notes automatically');
    } catch (error) {
      warn(`[Experience] Temporal aggregation failed: ${error.message}`);
    }
  }

  // Get actual viewport size (may differ from requested if browser clamped it)
  // This ensures we return what was actually set, not what we requested
  const actualViewport = await page.viewportSize();

  return {
    persona: persona.name,
    device: deviceToUse,
    viewport: actualViewport,
    notes: experienceNotes,
    aggregated, // Include aggregated temporal notes
    aggregatedMultiScale, // Include multi-scale aggregation
    screenshots,
    renderedCode,
    pageState,
    duration: Date.now() - startTime,
    timeScale,
    trace: trace ? trace.getSummary() : null,
    consistency // Include consistency check result
  };
}
425
+
426
// Fallback durations (ms) per interaction keyword, checked in this order.
const FALLBACK_INTERACTION_TIMES_MS = Object.freeze([
  ['click', 500],
  ['type', 1000],
  ['scroll', 800],
  ['read', 2000],
  ['think', 1500]
]);

/**
 * Human-interpreted time scale
 *
 * Not mechanical fps - human reading/interaction time based on content and context.
 * Prefers `humanPerceptionTime` from temporal-decision.mjs and falls back to a
 * built-in estimate when that module cannot be loaded, throws, or returns a
 * non-finite value.
 *
 * `interactionType` is matched by keyword (case-insensitive substring) because
 * the caller in this file passes free-form goal text rather than one of the
 * exact table keys.
 *
 * @param {string} action - Action type ('page-load', 'reading', 'interaction')
 * @param {Object} [options={}] - Time scale options
 * @param {number} [options.minTime=1000] - Lower bound in milliseconds
 * @param {number} [options.maxTime=5000] - Upper bound in milliseconds
 * @param {string} [options.timeScale='human'] - 'human' or 'mechanical'
 * @param {number} [options.contentLength=0] - Content length in characters
 * @param {string|null} [options.interactionType=null] - Interaction keyword or goal text
 * @param {Object|null} [options.persona=null] - Forwarded to humanPerceptionTime
 * @param {string} [options.attentionLevel='normal'] - Forwarded to humanPerceptionTime
 * @returns {Promise<number>} Time in milliseconds
 */
async function humanTimeScale(action, options = {}) {
  const {
    minTime = 1000,
    maxTime = 5000,
    timeScale = 'human',
    contentLength = 0,
    interactionType = null,
    persona = null,
    attentionLevel = 'normal'
  } = options;

  if (timeScale === 'mechanical') {
    // Mechanical fps - fixed intervals
    return 1000 / 2; // 2 fps = 500ms
  }

  const interactionText = typeof interactionType === 'string'
    ? interactionType.toLowerCase()
    : '';

  // Use research-aligned humanPerceptionTime if available
  try {
    const { humanPerceptionTime } = await import('./temporal-decision.mjs');

    // 'page-load' is treated as reading; other actions pass through unchanged
    const perceptionAction = action === 'page-load' ? 'reading' : action;

    const perceptionTime = humanPerceptionTime(perceptionAction, {
      persona,
      attentionLevel,
      actionComplexity: interactionText.includes('think') ? 'complex' : 'normal',
      contentLength
    });

    // Clamp to min/max if provided. A non-finite result (e.g. NaN) would
    // survive the clamp, so it falls through to the built-in estimate instead.
    if (Number.isFinite(perceptionTime)) {
      return Math.max(minTime || 0, Math.min(maxTime || Infinity, perceptionTime));
    }
  } catch (error) {
    // Fallback to original implementation if import fails
    // Silently fall back - this is expected in some environments
  }

  // Fallback: Human-interpreted time scale (original implementation)
  switch (action) {
    case 'page-load':
      // Page load: random duration in [minTime, maxTime)
      return Math.random() * (maxTime - minTime) + minTime;

    case 'reading': {
      // Reading: Based on content length
      // Average reading speed: 200-300 words per minute
      // Rough estimate: 1 word = 5 characters
      const characters = Number.isFinite(contentLength) ? Math.max(0, contentLength) : 0;
      const words = characters / 5;
      const readingSpeed = 250; // words per minute
      const readingTime = (words / readingSpeed) * 60 * 1000; // milliseconds
      return Math.max(minTime, Math.min(maxTime, readingTime));
    }

    case 'interaction': {
      // Interaction: first keyword found in the interaction text wins
      const match = FALLBACK_INTERACTION_TIMES_MS.find(([keyword]) => interactionText.includes(keyword));
      return match ? match[1] : minTime;
    }

    default:
      return minTime;
  }
}
505
+
506
/**
 * Simulate persona interaction
 *
 * Picks one basic action from keywords found in the goal text (matched
 * case-insensitively, first matching branch wins):
 * - "click" / "button": click the first `button` if it is visible
 * - "type" / "input": fill the first `input[type="text"]` with "Test" if visible
 * - "scroll" / "read": scroll down by one viewport height
 * Goals mentioning none of these keywords result in no action.
 *
 * @param {Page} page - Playwright page object
 * @param {Object} persona - Persona configuration (not read by the current implementation)
 * @param {string} goal - Goal to achieve
 * @returns {Promise<void>}
 */
async function simulatePersonaInteraction(page, persona, goal) {
  // This is extensible - different personas interact differently
  // For now, basic interaction simulation

  // Normalize once so "Click the Button" matches like "click the button",
  // and a missing goal simply matches nothing.
  const goalText = String(goal ?? '').toLowerCase();
  const mentions = (...keywords) => keywords.some((keyword) => goalText.includes(keyword));

  if (mentions('click', 'button')) {
    // Try to find and click a button.
    // `await` is kept so promise-returning page doubles keep working.
    const button = await page.locator('button').first();
    if (await button.isVisible()) {
      await button.click();
    }
  } else if (mentions('type', 'input')) {
    // Try to find and fill an input
    const input = await page.locator('input[type="text"]').first();
    if (await input.isVisible()) {
      await input.fill('Test');
    }
  } else if (mentions('scroll', 'read')) {
    // Scroll to read more
    await page.evaluate(() => window.scrollBy(0, window.innerHeight));
  }
}
536
+
537
/**
 * Extract rendered code from the page.
 *
 * Thin local wrapper: multi-modal.mjs is loaded on demand and its
 * `extractRenderedCode` export does the actual work.
 *
 * @param {any} page - Page object handed through to multi-modal.mjs
 * @returns {Promise<any>} Whatever multi-modal.mjs's extractRenderedCode returns
 */
async function extractRenderedCode(page) {
  // Bind the export to a distinct local name so it does not shadow this wrapper.
  const delegate = (await import('./multi-modal.mjs')).extractRenderedCode;
  return delegate(page);
}
545
+
546
/**
 * Experience a page from multiple persona perspectives
 *
 * Personas are processed one after another on the same `page`, each through
 * `experiencePageAsPersona` with the same `options`; results keep input order.
 *
 * @param {any} page - Playwright page object
 * @param {import('./index.mjs').Persona[]} personas - Array of persona configurations
 * @param {import('./index.mjs').PersonaExperienceOptions} [options={}] - Experience options
 * @returns {Promise<import('./index.mjs').PersonaExperienceResult[]>} Array of experience results
 */
export async function experiencePageWithPersonas(page, personas, options = {}) {
  const collected = [];

  for (const currentPersona of personas) {
    // Awaited inside the loop on purpose: the runs share one page.
    collected.push(await experiencePageAsPersona(page, currentPersona, options));
  }

  return collected;
}
572
+
@@ -0,0 +1,140 @@
1
+ /**
2
+ * Position Counter-Balancing for Single Evaluations
3
+ *
4
+ * Research: Position bias is severe and systematic (arXiv:2406.07791).
5
+ * Counter-balancing (running evaluations twice with reversed order) effectively
6
+ * eliminates bias (arXiv:2508.02020).
7
+ *
8
+ * Note: arXiv:2406.07791 is the systematic study showing position bias is not random
9
+ * and varies significantly across judges and tasks. arXiv:2508.02020 demonstrates
10
+ * that counter-balancing effectively eliminates this bias.
11
+ *
12
+ * This module provides systematic counter-balancing for single screenshot
13
+ * evaluations when position might matter (e.g., when comparing against baseline,
14
+ * or when context order matters).
15
+ */
16
+
17
+ import { normalizeValidationResult } from './validation-result-normalizer.mjs';
18
+
19
/**
 * Run evaluation with counter-balancing to eliminate position bias
 *
 * The evaluation runs twice, once in "original" and once in "reversed" order,
 * and the two results are merged: scores are averaged, issues are
 * de-duplicated, and both reasonings are kept. With a baseline, the second run
 * swaps image and baseline; without one, only `context.contextOrder` differs.
 * A single plain evaluation is returned when counter-balancing is disabled, or
 * when there is neither `options.baselinePath` nor `context.contextOrder`.
 *
 * A score counts as usable only when it is a real number; `null`, `undefined`
 * and NaN are treated as missing, so one missing score no longer turns the
 * average into NaN.
 *
 * NOTE(review): `shouldUseCounterBalance` keys off `context.baseline`, while
 * this function only looks at `options.baselinePath` - confirm callers pass
 * the baseline through `options`.
 *
 * @param {Function} evaluateFn - Function that performs evaluation: (imagePath, prompt, context) => Promise<Result>
 * @param {string} imagePath - Path to screenshot
 * @param {string} prompt - Evaluation prompt
 * @param {import('./index.mjs').ValidationContext} context - Validation context
 * @param {{
 *   enabled?: boolean;
 *   baselinePath?: string | null;
 *   contextOrder?: 'original' | 'reversed';
 * }} [options={}] - Counter-balancing options (`contextOrder` is accepted but not read here)
 * @returns {Promise<import('./index.mjs').ValidationResult>} Counter-balanced result
 */
export async function evaluateWithCounterBalance(evaluateFn, imagePath, prompt, context = {}, options = {}) {
  const {
    enabled = true,
    baselinePath = null
  } = options;

  // Run once when disabled, or when there is no baseline and no context-order
  // dependency, because there is nothing to counter-balance.
  if (!enabled || (!baselinePath && !context.contextOrder)) {
    return await evaluateFn(imagePath, prompt, context);
  }

  // Run evaluation twice: once with original order, once with reversed
  const originalContext = { ...context, contextOrder: 'original' };
  const reversedContext = { ...context, contextOrder: 'reversed' };

  let firstResult;
  let secondResult;

  if (baselinePath) {
    // First: image vs baseline
    firstResult = await evaluateFn(imagePath, prompt, {
      ...originalContext,
      baseline: baselinePath,
      comparisonOrder: 'image-first'
    });

    // Second: baseline vs image (reversed)
    secondResult = await evaluateFn(baselinePath, prompt, {
      ...reversedContext,
      baseline: imagePath,
      comparisonOrder: 'baseline-first'
    });
  } else {
    // Just reverse context order
    firstResult = await evaluateFn(imagePath, prompt, originalContext);
    secondResult = await evaluateFn(imagePath, prompt, reversedContext);
  }

  const isUsableScore = (value) => typeof value === 'number' && !Number.isNaN(value);
  const firstScored = isUsableScore(firstResult.score);
  const secondScored = isUsableScore(secondResult.score);
  const bothScored = firstScored && secondScored;

  // Average when both runs scored; otherwise take whichever run scored.
  let avgScore = null;
  if (bothScored) {
    avgScore = (firstResult.score + secondResult.score) / 2;
  } else if (firstScored) {
    avgScore = firstResult.score;
  } else if (secondScored) {
    avgScore = secondResult.score;
  }

  const scoreDifference = bothScored
    ? Math.abs(firstResult.score - secondResult.score)
    : null;

  // Combine issues (deduplicate)
  const uniqueIssues = [...new Set([
    ...(firstResult.issues || []),
    ...(secondResult.issues || [])
  ])];

  // Combine reasoning
  const combinedReasoning = [
    'Counter-balanced evaluation:',
    `Original: ${firstResult.reasoning || 'N/A'}`,
    `Reversed: ${secondResult.reasoning || 'N/A'}`,
    `Average score: ${avgScore === null ? 'N/A' : avgScore.toFixed(2)}`
  ].join('\n');

  const counterBalancedResult = {
    ...firstResult,
    score: avgScore,
    issues: uniqueIssues,
    reasoning: combinedReasoning,
    counterBalanced: true,
    originalScore: firstResult.score,
    reversedScore: secondResult.score,
    scoreDifference,
    metadata: {
      ...firstResult.metadata,
      counterBalancing: {
        enabled: true,
        originalResult: firstResult,
        reversedResult: secondResult,
        // Flag position bias when the two orders disagree by more than 1.0
        positionBiasDetected: scoreDifference !== null && scoreDifference > 1.0
      }
    }
  };

  // Normalize result structure before returning (ensures consistent structure)
  return normalizeValidationResult(counterBalancedResult, 'evaluateWithCounterBalance');
}
122
+
123
/**
 * Check if counter-balancing is needed for this evaluation
 *
 * Counter-balance if any of the following holds:
 * 1. Baseline is provided (position matters in comparison)
 * 2. Context order is explicitly set
 * 3. Multiple images are being compared
 *
 * A missing (null/undefined) or non-object context means "no".
 *
 * @param {import('./index.mjs').ValidationContext} context - Validation context
 * @returns {boolean} Whether counter-balancing should be applied
 */
export function shouldUseCounterBalance(context) {
  if (!context || typeof context !== 'object') {
    return false;
  }

  const { baseline, contextOrder, images } = context;
  const comparesMultipleImages = Array.isArray(images) && images.length > 1;

  return Boolean(baseline || contextOrder || comparesMultipleImages);
}
140
+