@arclabs561/ai-visual-test 0.5.1 → 0.7.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. package/CHANGELOG.md +127 -11
  2. package/DEPLOYMENT.md +225 -9
  3. package/README.md +71 -80
  4. package/index.d.ts +902 -5
  5. package/package.json +10 -51
  6. package/src/batch-optimizer.mjs +39 -0
  7. package/src/cache.mjs +241 -16
  8. package/src/config.mjs +33 -91
  9. package/src/constants.mjs +54 -0
  10. package/src/convenience.mjs +113 -10
  11. package/src/cost-optimization.mjs +1 -0
  12. package/src/cost-tracker.mjs +134 -2
  13. package/src/data-extractor.mjs +36 -7
  14. package/src/dynamic-few-shot.mjs +69 -11
  15. package/src/errors.mjs +6 -2
  16. package/src/experience-propagation.mjs +12 -0
  17. package/src/experience-tracer.mjs +12 -3
  18. package/src/game-player.mjs +222 -43
  19. package/src/graceful-shutdown.mjs +126 -0
  20. package/src/helpers/playwright.mjs +22 -8
  21. package/src/human-validation-manager.mjs +99 -2
  22. package/src/index.mjs +48 -3
  23. package/src/integrations/playwright.mjs +140 -0
  24. package/src/judge.mjs +699 -24
  25. package/src/load-env.mjs +2 -1
  26. package/src/logger.mjs +31 -3
  27. package/src/model-tier-selector.mjs +1 -221
  28. package/src/natural-language-specs.mjs +31 -3
  29. package/src/persona-enhanced.mjs +4 -2
  30. package/src/persona-experience.mjs +1 -1
  31. package/src/pricing.mjs +28 -0
  32. package/src/prompt-composer.mjs +162 -5
  33. package/src/provider-data.mjs +115 -0
  34. package/src/render-change-detector.mjs +5 -0
  35. package/src/research-enhanced-validation.mjs +7 -5
  36. package/src/retry.mjs +21 -7
  37. package/src/rubrics.mjs +4 -0
  38. package/src/safe-logger.mjs +71 -0
  39. package/src/session-cost-tracker.mjs +320 -0
  40. package/src/smart-validator.mjs +8 -8
  41. package/src/spec-templates.mjs +52 -6
  42. package/src/startup-validation.mjs +127 -0
  43. package/src/temporal-adaptive.mjs +2 -2
  44. package/src/temporal-decision-manager.mjs +1 -271
  45. package/src/temporal-logic.mjs +104 -0
  46. package/src/temporal-note-pruner.mjs +119 -0
  47. package/src/temporal-preprocessor.mjs +1 -543
  48. package/src/temporal.mjs +681 -79
  49. package/src/utils/action-hallucination-detector.mjs +301 -0
  50. package/src/utils/baseline-validator.mjs +82 -0
  51. package/src/utils/cache-stats.mjs +104 -0
  52. package/src/utils/cached-llm.mjs +164 -0
  53. package/src/utils/capability-stratifier.mjs +108 -0
  54. package/src/utils/counterfactual-tester.mjs +83 -0
  55. package/src/utils/error-recovery.mjs +117 -0
  56. package/src/utils/explainability-scorer.mjs +119 -0
  57. package/src/utils/exploratory-automation.mjs +131 -0
  58. package/src/utils/index.mjs +10 -0
  59. package/src/utils/intent-recognizer.mjs +201 -0
  60. package/src/utils/log-sanitizer.mjs +165 -0
  61. package/src/utils/path-validator.mjs +88 -0
  62. package/src/utils/performance-logger.mjs +316 -0
  63. package/src/utils/performance-measurement.mjs +280 -0
  64. package/src/utils/prompt-sanitizer.mjs +213 -0
  65. package/src/utils/rate-limiter.mjs +144 -0
  66. package/src/validation-framework.mjs +24 -20
  67. package/src/validation-result-normalizer.mjs +35 -1
  68. package/src/validation.mjs +75 -25
  69. package/src/validators/accessibility-validator.mjs +144 -0
  70. package/src/validators/hybrid-validator.mjs +48 -4
  71. package/api/health.js +0 -34
  72. package/api/validate.js +0 -252
  73. package/public/index.html +0 -149
  74. package/vercel.json +0 -27
@@ -49,19 +49,45 @@ export async function decideGameAction(gameState, goal, history = []) {
49
49
  }
50
50
  );
51
51
 
52
- // Use VLLM to decide action
53
- const actionPrompt = `Based on the game state, decide what action to take.
54
- Goal: ${goal}
55
- Current state: ${stateEvaluation.reasoning?.substring(0, 200) || 'Unknown'}
56
- Previous actions: ${recentHistory.slice(-3).map(h => h.action?.key || h.action?.type || 'unknown').join(', ')}
52
+ // Enhanced Prompt with Reflexion and Chain of Thought
53
+ let reflexionContext = '';
54
+ const lastStep = recentHistory[recentHistory.length - 1];
55
+ if (lastStep && lastStep.result?.score !== undefined) {
56
+ const scoreDelta = (stateEvaluation.score || 0) - (lastStep.result.score || 0);
57
+ if (scoreDelta < 0) {
58
+ reflexionContext = `CRITICAL REFLEXION: The previous action (${JSON.stringify(lastStep.action)}) caused the score to drop by ${Math.abs(scoreDelta)}.
59
+ Analyze WHY this failed before choosing the next action. Avoid repeating the same mistake.`;
60
+ } else if (scoreDelta > 0) {
61
+ reflexionContext = `SUCCESS ANALYSIS: The previous action (${JSON.stringify(lastStep.action)}) increased the score by ${scoreDelta}. Continue this successful strategy.`;
62
+ }
63
+ }
64
+
65
+ const actionPrompt = `You are an expert game-playing agent. Your goal is: "${goal}".
66
+
67
+ ${reflexionContext}
68
+
69
+ CURRENT STATE:
70
+ - Visual Analysis: ${stateEvaluation.reasoning?.substring(0, 300) || 'No analysis available'}
71
+ - Score: ${stateEvaluation.score}
72
+ - History: ${recentHistory.length} steps taken
73
+
74
+ INSTRUCTIONS:
75
+ 1. THINK: Analyze the game state and physics step-by-step. Anticipate the consequences of moving Left, Right, Up, or Down.
76
+ 2. PLAN: Formulate a short-term plan (next 3 steps).
77
+ 3. ACT: Choose the single best immediate action.
78
+
79
+ Return JSON only:
80
+ {
81
+ "thought_process": "Step-by-step reasoning...",
82
+ "plan": "Short term plan...",
83
+ "type": "keyboard",
84
+ "key": "ArrowRight"
85
+ }
57
86
 
58
- Return action as JSON: { "type": "keyboard", "key": "ArrowRight" }
59
87
  Available actions:
60
88
  - keyboard: ArrowLeft, ArrowRight, ArrowUp, ArrowDown, Space, Enter
61
89
  - click: { "type": "click", "selector": "#button" }
62
- - wait: { "type": "wait", "duration": 100 }
63
-
64
- Choose the action that best achieves the goal.`;
90
+ - wait: { "type": "wait", "duration": 100 }`;
65
91
 
66
92
  const actionResult = await validateScreenshot(
67
93
  gameState.screenshot,
@@ -69,7 +95,8 @@ export async function decideGameAction(gameState, goal, history = []) {
69
95
  {
70
96
  extractStructured: true,
71
97
  testType: 'gameplay-decision',
72
- goal: goal
98
+ goal: goal,
99
+ temperature: 0.2 // Lower temperature for more deterministic gameplay
73
100
  }
74
101
  );
75
102
 
@@ -79,6 +106,10 @@ export async function decideGameAction(gameState, goal, history = []) {
79
106
  try {
80
107
  const parsed = JSON.parse(actionMatch[0]);
81
108
  if (parsed.type && (parsed.key || parsed.selector || parsed.duration !== undefined)) {
109
+ // Log thought process for debugging/transparency
110
+ if (parsed.thought_process) {
111
+ log(`[GamePlayer] Agent Thought: ${parsed.thought_process}`);
112
+ }
82
113
  return parsed;
83
114
  }
84
115
  } catch (e) {
@@ -107,24 +138,46 @@ export async function decideGameAction(gameState, goal, history = []) {
107
138
  * @param {Object} action - Action to execute
108
139
  */
109
140
  export async function executeGameAction(page, action) {
110
- switch (action.type) {
111
- case 'keyboard':
112
- await page.keyboard.press(action.key);
113
- break;
114
- case 'click':
115
- if (action.selector) {
116
- await page.click(action.selector);
117
- } else {
118
- warn('[GamePlayer] Click action missing selector');
119
- }
120
- break;
121
- case 'wait':
122
- await page.waitForTimeout(action.duration || 100);
123
- break;
124
- default:
125
- warn(`[GamePlayer] Unknown action type: ${action.type}, defaulting to wait`);
126
- await page.waitForTimeout(100);
141
+ let executionResult = { success: false, error: null };
142
+
143
+ try {
144
+ switch (action.type) {
145
+ case 'keyboard':
146
+ await page.keyboard.press(action.key);
147
+ executionResult.success = true;
148
+ break;
149
+ case 'click':
150
+ if (action.selector) {
151
+ // Verify element exists before clicking
152
+ const exists = await page.locator(action.selector).count() > 0;
153
+ if (!exists) {
154
+ executionResult.success = false;
155
+ executionResult.error = `Element not found: ${action.selector}`;
156
+ return executionResult;
157
+ }
158
+
159
+ await page.click(action.selector);
160
+ executionResult.success = true;
161
+ } else {
162
+ warn('[GamePlayer] Click action missing selector');
163
+ executionResult.error = 'Click action missing selector';
164
+ }
165
+ break;
166
+ case 'wait':
167
+ await page.waitForTimeout(action.duration || 100);
168
+ executionResult.success = true;
169
+ break;
170
+ default:
171
+ warn(`[GamePlayer] Unknown action type: ${action.type}, defaulting to wait`);
172
+ await page.waitForTimeout(100);
173
+ executionResult.success = true;
174
+ }
175
+ } catch (error) {
176
+ executionResult.success = false;
177
+ executionResult.error = error.message;
127
178
  }
179
+
180
+ return executionResult;
128
181
  }
129
182
 
130
183
  /**
@@ -189,39 +242,165 @@ export async function playGame(page, options = {}) {
189
242
  const screenshotPath = join(screenshotDir, `gameplay-step-${step}.png`);
190
243
  writeFileSync(screenshotPath, screenshot);
191
244
 
245
+ // 2. Extract game state from page (if available)
246
+ let gameState = null;
247
+ try {
248
+ gameState = await page.evaluate(() => {
249
+ // Try multiple ways to get game state
250
+ if (window.gameState) {
251
+ return window.gameState;
252
+ }
253
+ // Try common game state patterns
254
+ if (window.game) {
255
+ return {
256
+ score: window.game.score || 0,
257
+ level: window.game.level || 0,
258
+ lives: window.game.lives || 0,
259
+ gameActive: window.game.active !== false
260
+ };
261
+ }
262
+ // Try to extract from DOM
263
+ const scoreEl = document.querySelector('#score, .score, [data-score]');
264
+ const score = scoreEl ? parseInt(scoreEl.textContent?.match(/\d+/)?.[0] || '0') : null;
265
+ return {
266
+ score,
267
+ gameActive: true // Assume active if we can't detect
268
+ };
269
+ });
270
+ } catch (error) {
271
+ // Game state extraction is optional
272
+ log(`[GamePlayer] Could not extract game state: ${error.message}`);
273
+ }
274
+
192
275
  // 2. Understand current state (validation)
193
276
  currentState = {
194
277
  screenshot: screenshotPath,
195
278
  step,
196
- timestamp: Date.now()
279
+ timestamp: Date.now(),
280
+ gameState // Include extracted game state
197
281
  };
198
282
 
199
- const stateEvaluation = await validateScreenshot(
200
- screenshotPath,
201
- `Evaluate current game state. Goal: ${goal}`,
202
- {
203
- testType: 'gameplay',
204
- temporalNotes: history.map(h => ({
205
- step: h.step,
206
- action: h.action,
207
- result: h.result?.score
208
- }))
283
+ // Use TemporalDecisionManager to reduce LLM calls
284
+ // Only prompt when decision is needed, not on every state change
285
+ const temporalNotes = history.map(h => ({
286
+ step: h.step,
287
+ action: h.action,
288
+ result: h.result?.score,
289
+ timestamp: h.state?.timestamp || Date.now()
290
+ }));
291
+
292
+ let stateEvaluation;
293
+ if (step > 0 && history.length > 0) {
294
+ // Use TemporalDecisionManager for subsequent steps
295
+ try {
296
+ const { TemporalDecisionManager } = await import('./temporal-decision-manager.mjs');
297
+ const decisionManager = new TemporalDecisionManager({
298
+ minNotesForPrompt: 2,
299
+ coherenceThreshold: 0.5
300
+ });
301
+
302
+ const currentState = {
303
+ score: null,
304
+ step,
305
+ timestamp: Date.now()
306
+ };
307
+ const previousState = history[history.length - 1]?.result || null;
308
+
309
+ const decision = await decisionManager.shouldPrompt(currentState, previousState, temporalNotes, {
310
+ stage: 'gameplay',
311
+ testType: 'gameplay'
312
+ });
313
+
314
+ if (!decision.shouldPrompt && decision.urgency !== 'high' && previousState) {
315
+ // Don't prompt yet - reuse previous result
316
+ stateEvaluation = {
317
+ ...previousState,
318
+ skipped: true,
319
+ skipReason: decision.reason,
320
+ urgency: decision.urgency
321
+ };
322
+ } else {
323
+ // Prompt now (decision point or high urgency)
324
+ stateEvaluation = await validateScreenshot(
325
+ screenshotPath,
326
+ `Evaluate current game state. Goal: ${goal}`,
327
+ {
328
+ testType: 'gameplay',
329
+ temporalNotes,
330
+ sequenceIndex: step,
331
+ useTemporalDecision: true,
332
+ currentState,
333
+ previousState,
334
+ previousResult: previousState
335
+ }
336
+ );
337
+ }
338
+ } catch (error) {
339
+ // If TemporalDecisionManager fails, proceed with normal validation
340
+ stateEvaluation = await validateScreenshot(
341
+ screenshotPath,
342
+ `Evaluate current game state. Goal: ${goal}`,
343
+ {
344
+ testType: 'gameplay',
345
+ temporalNotes,
346
+ sequenceIndex: step
347
+ }
348
+ );
209
349
  }
210
- );
350
+ } else {
351
+ // First step - always validate
352
+ stateEvaluation = await validateScreenshot(
353
+ screenshotPath,
354
+ `Evaluate current game state. Goal: ${goal}`,
355
+ {
356
+ testType: 'gameplay',
357
+ temporalNotes,
358
+ sequenceIndex: step
359
+ }
360
+ );
361
+ }
211
362
 
212
363
  currentState.evaluation = stateEvaluation;
213
364
 
214
365
  // 3. Decide what action to take (decision-making)
215
- const action = await decideGameAction(
366
+ let action = await decideGameAction(
216
367
  currentState,
217
368
  goal,
218
369
  history
219
370
  );
220
371
 
221
- log(`[GamePlayer] Step ${step}: score=${stateEvaluation.score}, action=${action.type}:${action.key || action.selector || ''}`);
372
+ // Try action, with simple retry on failure
373
+ let actionExecuted = false;
374
+ let retries = 0;
375
+ const maxRetries = 2;
222
376
 
223
- // 4. Execute action (Playwright)
224
- await executeGameAction(page, action);
377
+ while (!actionExecuted && retries < maxRetries) {
378
+ log(`[GamePlayer] Step ${step}: score=${stateEvaluation.score}, action=${action.type}:${action.key || action.selector || ''}`);
379
+
380
+ // 4. Execute action (Playwright)
381
+ const executionResult = await executeGameAction(page, action);
382
+
383
+ if (executionResult.success) {
384
+ actionExecuted = true;
385
+ action.executionResult = executionResult;
386
+ } else {
387
+ // Action failed - wait and retry, or try simple alternative
388
+ retries++;
389
+ if (retries < maxRetries) {
390
+ const { createExploratoryStrategy } = await import('./utils/exploratory-automation.mjs');
391
+ const exploratoryStrategy = createExploratoryStrategy({ maxAttempts: 2 });
392
+ const nextAction = exploratoryStrategy.getNextAction(currentState, [action], goal);
393
+
394
+ if (nextAction) {
395
+ log(`[GamePlayer] Action failed, trying alternative: ${nextAction.type}`);
396
+ action = nextAction;
397
+ } else {
398
+ // Wait and retry original action
399
+ await page.waitForTimeout(500);
400
+ }
401
+ }
402
+ }
403
+ }
225
404
 
226
405
  // 5. Wait for next frame
227
406
  await page.waitForTimeout(1000 / fps);
@@ -0,0 +1,126 @@
1
+ /**
2
+ * Graceful Shutdown Handler
3
+ *
4
+ * Handles graceful shutdown for long-running processes.
5
+ * Ensures in-flight operations complete, caches are flushed, and resources are cleaned up.
6
+ */
7
+
8
+ import { log, warn, error } from './logger.mjs';
9
+
10
+ let shutdownHandlers = [];
11
+ let isShuttingDown = false;
12
+ let shutdownTimeout = 30000; // 30 seconds default timeout
13
+
14
+ /**
15
+ * Register a shutdown handler
16
+ *
17
+ * @param {Function} handler - Async function to call during shutdown
18
+ * @param {number} [priority=0] - Priority (higher = called first)
19
+ */
20
+ export function registerShutdownHandler(handler, priority = 0) {
21
+ if (typeof handler !== 'function') {
22
+ throw new TypeError('Shutdown handler must be a function');
23
+ }
24
+
25
+ shutdownHandlers.push({ handler, priority });
26
+ // Sort by priority (higher first)
27
+ shutdownHandlers.sort((a, b) => b.priority - a.priority);
28
+ }
29
+
30
+ /**
31
+ * Unregister a shutdown handler
32
+ *
33
+ * @param {Function} handler - Handler to remove
34
+ */
35
+ export function unregisterShutdownHandler(handler) {
36
+ shutdownHandlers = shutdownHandlers.filter(h => h.handler !== handler);
37
+ }
38
+
39
+ /**
40
+ * Perform graceful shutdown
41
+ *
42
+ * @param {Object} [options={}] - Shutdown options
43
+ * @param {number} [options.timeout=30000] - Timeout in milliseconds
44
+ * @param {string} [options.signal='SIGTERM'] - Signal name for logging
45
+ * @returns {Promise<void>}
46
+ */
47
+ export async function gracefulShutdown(options = {}) {
48
+ if (isShuttingDown) {
49
+ warn('[GracefulShutdown] Shutdown already in progress');
50
+ return;
51
+ }
52
+
53
+ isShuttingDown = true;
54
+ const { timeout = shutdownTimeout, signal = 'SIGTERM' } = options;
55
+
56
+ log(`[GracefulShutdown] Initiating graceful shutdown (signal: ${signal})...`);
57
+
58
+ // Set timeout to force exit if shutdown takes too long
59
+ const timeoutId = setTimeout(() => {
60
+ warn('[GracefulShutdown] Shutdown timeout exceeded, forcing exit');
61
+ process.exit(1);
62
+ }, timeout);
63
+
64
+ try {
65
+ // Execute shutdown handlers in priority order
66
+ for (const { handler } of shutdownHandlers) {
67
+ try {
68
+ await handler();
69
+ } catch (err) {
70
+ warn(`[GracefulShutdown] Handler failed:`, err);
71
+ // Continue with other handlers even if one fails
72
+ }
73
+ }
74
+
75
+ // Note: Cache is file-based and doesn't need explicit flushing
76
+ // File writes are atomic, so no cleanup needed
77
+ log('[GracefulShutdown] Cache is file-based, no flush needed');
78
+
79
+ clearTimeout(timeoutId);
80
+ log('[GracefulShutdown] Shutdown complete');
81
+ process.exit(0);
82
+ } catch (err) {
83
+ clearTimeout(timeoutId);
84
+ error('[GracefulShutdown] Shutdown failed:', err);
85
+ process.exit(1);
86
+ }
87
+ }
88
+
89
+ /**
90
+ * Initialize graceful shutdown handlers
91
+ *
92
+ * Registers signal handlers for SIGTERM and SIGINT.
93
+ *
94
+ * @param {Object} [options={}] - Initialization options
95
+ * @param {number} [options.timeout=30000] - Shutdown timeout
96
+ */
97
+ export function initGracefulShutdown(options = {}) {
98
+ shutdownTimeout = options.timeout || 30000;
99
+
100
+ // Register signal handlers
101
+ process.on('SIGTERM', () => {
102
+ log('[GracefulShutdown] Received SIGTERM');
103
+ gracefulShutdown({ signal: 'SIGTERM', timeout: shutdownTimeout });
104
+ });
105
+
106
+ process.on('SIGINT', () => {
107
+ log('[GracefulShutdown] Received SIGINT (Ctrl+C)');
108
+ gracefulShutdown({ signal: 'SIGINT', timeout: shutdownTimeout });
109
+ });
110
+
111
+ // Handle uncaught exceptions (best-effort cleanup)
112
+ process.on('uncaughtException', (err) => {
113
+ error('[GracefulShutdown] Uncaught exception:', err);
114
+ gracefulShutdown({ signal: 'uncaughtException', timeout: 5000 }); // Shorter timeout for crashes
115
+ });
116
+
117
+ // Handle unhandled promise rejections
118
+ process.on('unhandledRejection', (reason, promise) => {
119
+ warn('[GracefulShutdown] Unhandled promise rejection:', reason);
120
+ // Don't shutdown on unhandled rejections (may be recoverable)
121
+ // But log for monitoring
122
+ });
123
+
124
+ log('[GracefulShutdown] Graceful shutdown handlers initialized');
125
+ }
126
+
@@ -68,13 +68,27 @@ export async function getPlaywrightPage(options = {}) {
68
68
  };
69
69
  }
70
70
 
71
- const browser = await chromium.launch(options.browserOptions || {});
72
- const page = await browser.newPage();
73
-
74
- return {
75
- page,
76
- browser,
77
- isMock: false
78
- };
71
+ try {
72
+ const browser = await chromium.launch(options.browserOptions || {});
73
+ const page = await browser.newPage();
74
+
75
+ return {
76
+ page,
77
+ browser,
78
+ isMock: false
79
+ };
80
+ } catch (error) {
81
+ // Browser executable not found, fallback to mock
82
+ if (error.message.includes('Executable doesn\'t exist') ||
83
+ error.message.includes('browserType.launch') ||
84
+ error.message.includes('Browser not found')) {
85
+ return {
86
+ page: createMockPage(),
87
+ browser: null,
88
+ isMock: true
89
+ };
90
+ }
91
+ throw error;
92
+ }
79
93
  }
80
94
 
@@ -5,8 +5,8 @@
5
5
  * - Non-blocking: Doesn't slow down evaluations
6
6
  * - Automatic: Collects VLLM judgments when enabled
7
7
  * - Smart sampling: Requests human validation for interesting cases
8
- * - Learning: Automatically calibrates based on collected data
9
- * - Seamless: Works with all existing systems (batching, temporal, personas)
8
+ * - Learning: Calibrates based on collected data
9
+ * - Integration: Works with all existing systems (batching, temporal, personas)
10
10
  */
11
11
 
12
12
  import { warn, log } from './logger.mjs';
@@ -362,6 +362,103 @@ export class HumanValidationManager {
362
362
  };
363
363
  }
364
364
 
365
+ /**
366
+ * Track calibration degradation over screenshot sequences
367
+ *
368
+ * @param {number} sequenceIndex - Index in sequence
369
+ * @param {Object} result - Validation result
370
+ * @returns {Object} Degradation status
371
+ */
372
+ trackSequenceCalibration(sequenceIndex, result) {
373
+ if (!this.sequenceHistory) {
374
+ this.sequenceHistory = [];
375
+ }
376
+
377
+ const entry = {
378
+ index: sequenceIndex,
379
+ timestamp: Date.now(),
380
+ confidence: result.confidence || 0.5,
381
+ uncertainty: result.uncertainty || 0.5,
382
+ score: result.score,
383
+ logprobs: result.logprobs
384
+ };
385
+
386
+ this.sequenceHistory.push(entry);
387
+
388
+ // Detect degradation (compare recent vs early)
389
+ if (this.sequenceHistory.length >= 5) {
390
+ const recent = this.sequenceHistory.slice(-5);
391
+ const early = this.sequenceHistory.slice(0, 5);
392
+
393
+ const recentAvgConfidence = recent.reduce((sum, e) => sum + e.confidence, 0) / recent.length;
394
+ const earlyAvgConfidence = early.reduce((sum, e) => sum + e.confidence, 0) / early.length;
395
+
396
+ const degradation = earlyAvgConfidence - recentAvgConfidence;
397
+ const degradationThreshold = 0.15; // 15% drop
398
+
399
+ if (degradation > degradationThreshold) {
400
+ return {
401
+ degraded: true,
402
+ degradation,
403
+ recommendation: 'recalibrate_or_reduce_sequence',
404
+ suggestedAction: 'Use temporal graph representation or reduce sequence length'
405
+ };
406
+ }
407
+ }
408
+
409
+ return { degraded: false };
410
+ }
411
+
412
+ /**
413
+ * Get calibration quality metrics for sequence
414
+ */
415
+ getSequenceCalibrationMetrics() {
416
+ if (!this.sequenceHistory || this.sequenceHistory.length < 2) {
417
+ return { quality: 'unknown', recommendation: 'insufficient_data' };
418
+ }
419
+
420
+ const confidences = this.sequenceHistory.map(e => e.confidence);
421
+ const variance = this.calculateVariance(confidences);
422
+ const trend = this.calculateTrend(confidences);
423
+
424
+ if (variance > 0.1 && trend < -0.05) {
425
+ return {
426
+ quality: 'degrading',
427
+ variance,
428
+ trend,
429
+ recommendation: 'recalibrate_or_reduce_sequence'
430
+ };
431
+ }
432
+
433
+ return {
434
+ quality: variance < 0.05 ? 'stable' : 'variable',
435
+ variance,
436
+ trend
437
+ };
438
+ }
439
+
440
+ /**
441
+ * Calculate variance of values
442
+ */
443
+ calculateVariance(values) {
444
+ if (values.length === 0) return 0;
445
+ const mean = values.reduce((a, b) => a + b, 0) / values.length;
446
+ const squaredDiffs = values.map(v => Math.pow(v - mean, 2));
447
+ return squaredDiffs.reduce((a, b) => a + b, 0) / values.length;
448
+ }
449
+
450
+ /**
451
+ * Calculate trend of values (positive = increasing, negative = decreasing)
452
+ */
453
+ calculateTrend(values) {
454
+ if (values.length < 2) return 0;
455
+ const firstHalf = values.slice(0, Math.floor(values.length / 2));
456
+ const secondHalf = values.slice(Math.floor(values.length / 2));
457
+ const firstAvg = firstHalf.reduce((a, b) => a + b, 0) / firstHalf.length;
458
+ const secondAvg = secondHalf.reduce((a, b) => a + b, 0) / secondHalf.length;
459
+ return (secondAvg - firstAvg) / firstAvg;
460
+ }
461
+
365
462
  /**
366
463
  * Apply calibration adjustments to VLLM score
367
464
  *
package/src/index.mjs CHANGED
@@ -17,10 +17,35 @@
17
17
  import { loadEnv } from './load-env.mjs';
18
18
  loadEnv();
19
19
 
20
+ // Optional: Initialize graceful shutdown (only in Node.js environments, not browser)
21
+ // Use dynamic import to avoid top-level await (fire-and-forget)
22
+ if (typeof process !== 'undefined' && process.env.NODE_ENV !== 'test') {
23
+ import('./graceful-shutdown.mjs').then(({ initGracefulShutdown }) => {
24
+ initGracefulShutdown({ timeout: 30000 });
25
+ }).catch(() => {
26
+ // Graceful shutdown is optional, don't fail if unavailable
27
+ });
28
+ }
29
+
20
30
  import { VLLMJudge, validateScreenshot as _validateScreenshot } from './judge.mjs';
21
31
 
22
32
  export { VLLMJudge, _validateScreenshot as validateScreenshot };
23
33
 
34
+ // Export startup validation utilities
35
+ export { validateStartup, validateStartupSoft } from './startup-validation.mjs';
36
+
37
+ // Export graceful shutdown utilities
38
+ export { initGracefulShutdown, registerShutdownHandler, gracefulShutdown } from './graceful-shutdown.mjs';
39
+
40
+ // Export performance measurement utilities
41
+ export {
42
+ PerformanceMeasurement,
43
+ PerformanceProfiler,
44
+ measureAsync,
45
+ measureSync,
46
+ getProfiler
47
+ } from './utils/performance-measurement.mjs';
48
+
24
49
  /**
25
50
  * Extract semantic information from VLLM judgment text
26
51
  *
@@ -28,7 +53,7 @@ export { VLLMJudge, _validateScreenshot as validateScreenshot };
28
53
  * Useful for custom implementations that need to parse judgment text.
29
54
  *
30
55
  * @param {string | object} judgment - Judgment text or object from VLLM
31
- * @returns {import('./index.mjs').SemanticInfo} Structured semantic information with score, issues, assessment, reasoning
56
+ * @returns {Object} Structured semantic information with score, issues, assessment, reasoning, brutalistViolations (optional), zeroToleranceViolations (optional)
32
57
  */
33
58
  export function extractSemanticInfo(judgment) {
34
59
  // Create a temporary judge instance to access the method
@@ -170,12 +195,31 @@ export {
170
195
  calculateBackoff,
171
196
  enhanceErrorMessage
172
197
  } from './retry.mjs';
198
+
199
+ // Cost optimization utilities
200
+ export {
201
+ calculateCostComparison,
202
+ optimizeCost
203
+ } from './cost-optimization.mjs';
173
204
  export {
174
205
  CostTracker,
175
206
  getCostTracker,
176
207
  recordCost,
177
- getCostStats
208
+ getCostStats,
209
+ setBudgetLimit,
210
+ getBudgetStatus
178
211
  } from './cost-tracker.mjs';
212
+ // Session-level cost tracking
213
+ export {
214
+ startSession,
215
+ endSession,
216
+ getSessionCosts,
217
+ recordSessionCost,
218
+ recordSessionCacheHit,
219
+ recordSessionCacheMiss,
220
+ getActiveSessions,
221
+ getGlobalCostStats
222
+ } from './session-cost-tracker.mjs';
179
223
  export {
180
224
  DEFAULT_RUBRIC,
181
225
  buildRubricPrompt,
@@ -281,7 +325,8 @@ export {
281
325
  export {
282
326
  testGameplay,
283
327
  testBrowserExperience,
284
- validateWithGoals
328
+ validateWithGoals,
329
+ validatePage
285
330
  } from './convenience.mjs';
286
331
 
287
332
  // Game playing (optional - requires Playwright)