@arclabs561/ai-visual-test 0.5.1 → 0.7.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +127 -11
- package/DEPLOYMENT.md +225 -9
- package/README.md +71 -80
- package/index.d.ts +902 -5
- package/package.json +10 -51
- package/src/batch-optimizer.mjs +39 -0
- package/src/cache.mjs +241 -16
- package/src/config.mjs +33 -91
- package/src/constants.mjs +54 -0
- package/src/convenience.mjs +113 -10
- package/src/cost-optimization.mjs +1 -0
- package/src/cost-tracker.mjs +134 -2
- package/src/data-extractor.mjs +36 -7
- package/src/dynamic-few-shot.mjs +69 -11
- package/src/errors.mjs +6 -2
- package/src/experience-propagation.mjs +12 -0
- package/src/experience-tracer.mjs +12 -3
- package/src/game-player.mjs +222 -43
- package/src/graceful-shutdown.mjs +126 -0
- package/src/helpers/playwright.mjs +22 -8
- package/src/human-validation-manager.mjs +99 -2
- package/src/index.mjs +48 -3
- package/src/integrations/playwright.mjs +140 -0
- package/src/judge.mjs +699 -24
- package/src/load-env.mjs +2 -1
- package/src/logger.mjs +31 -3
- package/src/model-tier-selector.mjs +1 -221
- package/src/natural-language-specs.mjs +31 -3
- package/src/persona-enhanced.mjs +4 -2
- package/src/persona-experience.mjs +1 -1
- package/src/pricing.mjs +28 -0
- package/src/prompt-composer.mjs +162 -5
- package/src/provider-data.mjs +115 -0
- package/src/render-change-detector.mjs +5 -0
- package/src/research-enhanced-validation.mjs +7 -5
- package/src/retry.mjs +21 -7
- package/src/rubrics.mjs +4 -0
- package/src/safe-logger.mjs +71 -0
- package/src/session-cost-tracker.mjs +320 -0
- package/src/smart-validator.mjs +8 -8
- package/src/spec-templates.mjs +52 -6
- package/src/startup-validation.mjs +127 -0
- package/src/temporal-adaptive.mjs +2 -2
- package/src/temporal-decision-manager.mjs +1 -271
- package/src/temporal-logic.mjs +104 -0
- package/src/temporal-note-pruner.mjs +119 -0
- package/src/temporal-preprocessor.mjs +1 -543
- package/src/temporal.mjs +681 -79
- package/src/utils/action-hallucination-detector.mjs +301 -0
- package/src/utils/baseline-validator.mjs +82 -0
- package/src/utils/cache-stats.mjs +104 -0
- package/src/utils/cached-llm.mjs +164 -0
- package/src/utils/capability-stratifier.mjs +108 -0
- package/src/utils/counterfactual-tester.mjs +83 -0
- package/src/utils/error-recovery.mjs +117 -0
- package/src/utils/explainability-scorer.mjs +119 -0
- package/src/utils/exploratory-automation.mjs +131 -0
- package/src/utils/index.mjs +10 -0
- package/src/utils/intent-recognizer.mjs +201 -0
- package/src/utils/log-sanitizer.mjs +165 -0
- package/src/utils/path-validator.mjs +88 -0
- package/src/utils/performance-logger.mjs +316 -0
- package/src/utils/performance-measurement.mjs +280 -0
- package/src/utils/prompt-sanitizer.mjs +213 -0
- package/src/utils/rate-limiter.mjs +144 -0
- package/src/validation-framework.mjs +24 -20
- package/src/validation-result-normalizer.mjs +35 -1
- package/src/validation.mjs +75 -25
- package/src/validators/accessibility-validator.mjs +144 -0
- package/src/validators/hybrid-validator.mjs +48 -4
- package/api/health.js +0 -34
- package/api/validate.js +0 -252
- package/public/index.html +0 -149
- package/vercel.json +0 -27
package/src/game-player.mjs
CHANGED
|
@@ -49,19 +49,45 @@ export async function decideGameAction(gameState, goal, history = []) {
|
|
|
49
49
|
}
|
|
50
50
|
);
|
|
51
51
|
|
|
52
|
-
//
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
52
|
+
// Enhanced Prompt with Reflexion and Chain of Thought
|
|
53
|
+
let reflexionContext = '';
|
|
54
|
+
const lastStep = recentHistory[recentHistory.length - 1];
|
|
55
|
+
if (lastStep && lastStep.result?.score !== undefined) {
|
|
56
|
+
const scoreDelta = (stateEvaluation.score || 0) - (lastStep.result.score || 0);
|
|
57
|
+
if (scoreDelta < 0) {
|
|
58
|
+
reflexionContext = `CRITICAL REFLEXION: The previous action (${JSON.stringify(lastStep.action)}) caused the score to drop by ${Math.abs(scoreDelta)}.
|
|
59
|
+
Analyze WHY this failed before choosing the next action. Avoid repeating the same mistake.`;
|
|
60
|
+
} else if (scoreDelta > 0) {
|
|
61
|
+
reflexionContext = `SUCCESS ANALYSIS: The previous action (${JSON.stringify(lastStep.action)}) increased the score by ${scoreDelta}. Continue this successful strategy.`;
|
|
62
|
+
}
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
const actionPrompt = `You are an expert game-playing agent. Your goal is: "${goal}".
|
|
66
|
+
|
|
67
|
+
${reflexionContext}
|
|
68
|
+
|
|
69
|
+
CURRENT STATE:
|
|
70
|
+
- Visual Analysis: ${stateEvaluation.reasoning?.substring(0, 300) || 'No analysis available'}
|
|
71
|
+
- Score: ${stateEvaluation.score}
|
|
72
|
+
- History: ${recentHistory.length} steps taken
|
|
73
|
+
|
|
74
|
+
INSTRUCTIONS:
|
|
75
|
+
1. THINK: Analyze the game state and physics step-by-step. Anticipate the consequences of moving Left, Right, Up, or Down.
|
|
76
|
+
2. PLAN: Formulate a short-term plan (next 3 steps).
|
|
77
|
+
3. ACT: Choose the single best immediate action.
|
|
78
|
+
|
|
79
|
+
Return JSON only:
|
|
80
|
+
{
|
|
81
|
+
"thought_process": "Step-by-step reasoning...",
|
|
82
|
+
"plan": "Short term plan...",
|
|
83
|
+
"type": "keyboard",
|
|
84
|
+
"key": "ArrowRight"
|
|
85
|
+
}
|
|
57
86
|
|
|
58
|
-
Return action as JSON: { "type": "keyboard", "key": "ArrowRight" }
|
|
59
87
|
Available actions:
|
|
60
88
|
- keyboard: ArrowLeft, ArrowRight, ArrowUp, ArrowDown, Space, Enter
|
|
61
89
|
- click: { "type": "click", "selector": "#button" }
|
|
62
|
-
- wait: { "type": "wait", "duration": 100 }
|
|
63
|
-
|
|
64
|
-
Choose the action that best achieves the goal.`;
|
|
90
|
+
- wait: { "type": "wait", "duration": 100 }`;
|
|
65
91
|
|
|
66
92
|
const actionResult = await validateScreenshot(
|
|
67
93
|
gameState.screenshot,
|
|
@@ -69,7 +95,8 @@ export async function decideGameAction(gameState, goal, history = []) {
|
|
|
69
95
|
{
|
|
70
96
|
extractStructured: true,
|
|
71
97
|
testType: 'gameplay-decision',
|
|
72
|
-
goal: goal
|
|
98
|
+
goal: goal,
|
|
99
|
+
temperature: 0.2 // Lower temperature for more deterministic gameplay
|
|
73
100
|
}
|
|
74
101
|
);
|
|
75
102
|
|
|
@@ -79,6 +106,10 @@ export async function decideGameAction(gameState, goal, history = []) {
|
|
|
79
106
|
try {
|
|
80
107
|
const parsed = JSON.parse(actionMatch[0]);
|
|
81
108
|
if (parsed.type && (parsed.key || parsed.selector || parsed.duration !== undefined)) {
|
|
109
|
+
// Log thought process for debugging/transparency
|
|
110
|
+
if (parsed.thought_process) {
|
|
111
|
+
log(`[GamePlayer] Agent Thought: ${parsed.thought_process}`);
|
|
112
|
+
}
|
|
82
113
|
return parsed;
|
|
83
114
|
}
|
|
84
115
|
} catch (e) {
|
|
@@ -107,24 +138,46 @@ export async function decideGameAction(gameState, goal, history = []) {
|
|
|
107
138
|
* @param {Object} action - Action to execute
|
|
108
139
|
*/
|
|
109
140
|
export async function executeGameAction(page, action) {
  // Outcome handed back to the caller: `success` says whether the action ran,
  // `error` carries a human-readable message when it did not.
  const executionResult = { success: false, error: null };

  // Reject a missing / non-object action up front so the caller gets a clear
  // message instead of a TypeError raised while reading `action.type`.
  if (action === null || typeof action !== 'object') {
    executionResult.error = 'Invalid action: expected an object with a "type" property';
    return executionResult;
  }

  try {
    switch (action.type) {
      case 'keyboard':
        await page.keyboard.press(action.key);
        executionResult.success = true;
        break;

      case 'click': {
        if (!action.selector) {
          warn('[GamePlayer] Click action missing selector');
          executionResult.error = 'Click action missing selector';
          break;
        }

        // Verify element exists before clicking
        const matchCount = await page.locator(action.selector).count();
        if (!(matchCount > 0)) {
          executionResult.error = `Element not found: ${action.selector}`;
          break;
        }

        await page.click(action.selector);
        executionResult.success = true;
        break;
      }

      case 'wait':
        // `??` (not `||`) so that an explicit `duration: 0` is honoured;
        // only a missing duration falls back to 100 ms.
        await page.waitForTimeout(action.duration ?? 100);
        executionResult.success = true;
        break;

      default:
        // Unknown types are treated as a short wait and reported as success.
        warn(`[GamePlayer] Unknown action type: ${action.type}, defaulting to wait`);
        await page.waitForTimeout(100);
        executionResult.success = true;
    }
  } catch (error) {
    // Any failure from the page calls is reported through the result object
    // rather than thrown.
    executionResult.success = false;
    executionResult.error = error.message;
  }

  return executionResult;
}
|
|
129
182
|
|
|
130
183
|
/**
|
|
@@ -189,39 +242,165 @@ export async function playGame(page, options = {}) {
|
|
|
189
242
|
const screenshotPath = join(screenshotDir, `gameplay-step-${step}.png`);
|
|
190
243
|
writeFileSync(screenshotPath, screenshot);
|
|
191
244
|
|
|
245
|
+
// 2. Extract game state from page (if available)
|
|
246
|
+
let gameState = null;
|
|
247
|
+
try {
|
|
248
|
+
gameState = await page.evaluate(() => {
|
|
249
|
+
// Try multiple ways to get game state
|
|
250
|
+
if (window.gameState) {
|
|
251
|
+
return window.gameState;
|
|
252
|
+
}
|
|
253
|
+
// Try common game state patterns
|
|
254
|
+
if (window.game) {
|
|
255
|
+
return {
|
|
256
|
+
score: window.game.score || 0,
|
|
257
|
+
level: window.game.level || 0,
|
|
258
|
+
lives: window.game.lives || 0,
|
|
259
|
+
gameActive: window.game.active !== false
|
|
260
|
+
};
|
|
261
|
+
}
|
|
262
|
+
// Try to extract from DOM
|
|
263
|
+
const scoreEl = document.querySelector('#score, .score, [data-score]');
|
|
264
|
+
const score = scoreEl ? parseInt(scoreEl.textContent?.match(/\d+/)?.[0] || '0') : null;
|
|
265
|
+
return {
|
|
266
|
+
score,
|
|
267
|
+
gameActive: true // Assume active if we can't detect
|
|
268
|
+
};
|
|
269
|
+
});
|
|
270
|
+
} catch (error) {
|
|
271
|
+
// Game state extraction is optional
|
|
272
|
+
log(`[GamePlayer] Could not extract game state: ${error.message}`);
|
|
273
|
+
}
|
|
274
|
+
|
|
192
275
|
// 2. Understand current state (validation)
|
|
193
276
|
currentState = {
|
|
194
277
|
screenshot: screenshotPath,
|
|
195
278
|
step,
|
|
196
|
-
timestamp: Date.now()
|
|
279
|
+
timestamp: Date.now(),
|
|
280
|
+
gameState // Include extracted game state
|
|
197
281
|
};
|
|
198
282
|
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
283
|
+
// Use TemporalDecisionManager to reduce LLM calls
|
|
284
|
+
// Only prompt when decision is needed, not on every state change
|
|
285
|
+
const temporalNotes = history.map(h => ({
|
|
286
|
+
step: h.step,
|
|
287
|
+
action: h.action,
|
|
288
|
+
result: h.result?.score,
|
|
289
|
+
timestamp: h.state?.timestamp || Date.now()
|
|
290
|
+
}));
|
|
291
|
+
|
|
292
|
+
let stateEvaluation;
|
|
293
|
+
if (step > 0 && history.length > 0) {
|
|
294
|
+
// Use TemporalDecisionManager for subsequent steps
|
|
295
|
+
try {
|
|
296
|
+
const { TemporalDecisionManager } = await import('./temporal-decision-manager.mjs');
|
|
297
|
+
const decisionManager = new TemporalDecisionManager({
|
|
298
|
+
minNotesForPrompt: 2,
|
|
299
|
+
coherenceThreshold: 0.5
|
|
300
|
+
});
|
|
301
|
+
|
|
302
|
+
const currentState = {
|
|
303
|
+
score: null,
|
|
304
|
+
step,
|
|
305
|
+
timestamp: Date.now()
|
|
306
|
+
};
|
|
307
|
+
const previousState = history[history.length - 1]?.result || null;
|
|
308
|
+
|
|
309
|
+
const decision = await decisionManager.shouldPrompt(currentState, previousState, temporalNotes, {
|
|
310
|
+
stage: 'gameplay',
|
|
311
|
+
testType: 'gameplay'
|
|
312
|
+
});
|
|
313
|
+
|
|
314
|
+
if (!decision.shouldPrompt && decision.urgency !== 'high' && previousState) {
|
|
315
|
+
// Don't prompt yet - reuse previous result
|
|
316
|
+
stateEvaluation = {
|
|
317
|
+
...previousState,
|
|
318
|
+
skipped: true,
|
|
319
|
+
skipReason: decision.reason,
|
|
320
|
+
urgency: decision.urgency
|
|
321
|
+
};
|
|
322
|
+
} else {
|
|
323
|
+
// Prompt now (decision point or high urgency)
|
|
324
|
+
stateEvaluation = await validateScreenshot(
|
|
325
|
+
screenshotPath,
|
|
326
|
+
`Evaluate current game state. Goal: ${goal}`,
|
|
327
|
+
{
|
|
328
|
+
testType: 'gameplay',
|
|
329
|
+
temporalNotes,
|
|
330
|
+
sequenceIndex: step,
|
|
331
|
+
useTemporalDecision: true,
|
|
332
|
+
currentState,
|
|
333
|
+
previousState,
|
|
334
|
+
previousResult: previousState
|
|
335
|
+
}
|
|
336
|
+
);
|
|
337
|
+
}
|
|
338
|
+
} catch (error) {
|
|
339
|
+
// If TemporalDecisionManager fails, proceed with normal validation
|
|
340
|
+
stateEvaluation = await validateScreenshot(
|
|
341
|
+
screenshotPath,
|
|
342
|
+
`Evaluate current game state. Goal: ${goal}`,
|
|
343
|
+
{
|
|
344
|
+
testType: 'gameplay',
|
|
345
|
+
temporalNotes,
|
|
346
|
+
sequenceIndex: step
|
|
347
|
+
}
|
|
348
|
+
);
|
|
209
349
|
}
|
|
210
|
-
|
|
350
|
+
} else {
|
|
351
|
+
// First step - always validate
|
|
352
|
+
stateEvaluation = await validateScreenshot(
|
|
353
|
+
screenshotPath,
|
|
354
|
+
`Evaluate current game state. Goal: ${goal}`,
|
|
355
|
+
{
|
|
356
|
+
testType: 'gameplay',
|
|
357
|
+
temporalNotes,
|
|
358
|
+
sequenceIndex: step
|
|
359
|
+
}
|
|
360
|
+
);
|
|
361
|
+
}
|
|
211
362
|
|
|
212
363
|
currentState.evaluation = stateEvaluation;
|
|
213
364
|
|
|
214
365
|
// 3. Decide what action to take (decision-making)
|
|
215
|
-
|
|
366
|
+
let action = await decideGameAction(
|
|
216
367
|
currentState,
|
|
217
368
|
goal,
|
|
218
369
|
history
|
|
219
370
|
);
|
|
220
371
|
|
|
221
|
-
|
|
372
|
+
// Try action, with simple retry on failure
|
|
373
|
+
let actionExecuted = false;
|
|
374
|
+
let retries = 0;
|
|
375
|
+
const maxRetries = 2;
|
|
222
376
|
|
|
223
|
-
|
|
224
|
-
|
|
377
|
+
while (!actionExecuted && retries < maxRetries) {
|
|
378
|
+
log(`[GamePlayer] Step ${step}: score=${stateEvaluation.score}, action=${action.type}:${action.key || action.selector || ''}`);
|
|
379
|
+
|
|
380
|
+
// 4. Execute action (Playwright)
|
|
381
|
+
const executionResult = await executeGameAction(page, action);
|
|
382
|
+
|
|
383
|
+
if (executionResult.success) {
|
|
384
|
+
actionExecuted = true;
|
|
385
|
+
action.executionResult = executionResult;
|
|
386
|
+
} else {
|
|
387
|
+
// Action failed - wait and retry, or try simple alternative
|
|
388
|
+
retries++;
|
|
389
|
+
if (retries < maxRetries) {
|
|
390
|
+
const { createExploratoryStrategy } = await import('./utils/exploratory-automation.mjs');
|
|
391
|
+
const exploratoryStrategy = createExploratoryStrategy({ maxAttempts: 2 });
|
|
392
|
+
const nextAction = exploratoryStrategy.getNextAction(currentState, [action], goal);
|
|
393
|
+
|
|
394
|
+
if (nextAction) {
|
|
395
|
+
log(`[GamePlayer] Action failed, trying alternative: ${nextAction.type}`);
|
|
396
|
+
action = nextAction;
|
|
397
|
+
} else {
|
|
398
|
+
// Wait and retry original action
|
|
399
|
+
await page.waitForTimeout(500);
|
|
400
|
+
}
|
|
401
|
+
}
|
|
402
|
+
}
|
|
403
|
+
}
|
|
225
404
|
|
|
226
405
|
// 5. Wait for next frame
|
|
227
406
|
await page.waitForTimeout(1000 / fps);
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Graceful Shutdown Handler
|
|
3
|
+
*
|
|
4
|
+
* Handles graceful shutdown for long-running processes.
|
|
5
|
+
* Ensures in-flight operations complete, caches are flushed, and resources are cleaned up.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import { log, warn, error } from './logger.mjs';
|
|
9
|
+
|
|
10
|
+
let shutdownHandlers = [];
|
|
11
|
+
let isShuttingDown = false;
|
|
12
|
+
let shutdownTimeout = 30000; // 30 seconds default timeout
|
|
13
|
+
|
|
14
|
+
/**
 * Add a callback to the list that `gracefulShutdown` runs.
 *
 * The list is kept ordered by descending priority, so callbacks with a larger
 * `priority` value come first.
 *
 * @param {Function} handler - Function (may be async) to call during shutdown
 * @param {number} [priority=0] - Ordering weight (higher = called first)
 * @throws {TypeError} If `handler` is not a function
 */
export function registerShutdownHandler(handler, priority = 0) {
  const isCallable = typeof handler === 'function';
  if (!isCallable) {
    throw new TypeError('Shutdown handler must be a function');
  }

  // Append in place, then re-sort the same array (higher priority first).
  const entry = { handler, priority };
  shutdownHandlers.push(entry);
  shutdownHandlers.sort((left, right) => right.priority - left.priority);
}
|
|
29
|
+
|
|
30
|
+
/**
 * Remove a previously registered shutdown callback.
 *
 * Every entry whose `handler` is the given function is dropped; the handler
 * list is replaced by a new array rather than edited in place.
 *
 * @param {Function} handler - Handler to remove
 */
export function unregisterShutdownHandler(handler) {
  const remaining = [];
  for (const entry of shutdownHandlers) {
    if (entry.handler !== handler) {
      remaining.push(entry);
    }
  }
  shutdownHandlers = remaining;
}
|
|
38
|
+
|
|
39
|
+
/**
 * Perform graceful shutdown
 *
 * Runs every registered shutdown handler in priority order, then terminates
 * the process. A handler that throws or rejects is logged and skipped; the
 * remaining handlers still run. If the handlers have not finished within
 * `timeout` milliseconds the process is force-exited with code 1.
 *
 * A second call while a shutdown is already in progress logs a warning and
 * returns without doing anything.
 *
 * @param {Object} [options={}] - Shutdown options
 * @param {number} [options.timeout=30000] - Timeout in milliseconds
 * @param {string} [options.signal='SIGTERM'] - Signal name for logging
 * @param {number} [options.exitCode] - Exit code used when the handlers complete.
 *   Defaults to 1 when `signal` is 'uncaughtException' and 0 otherwise.
 * @returns {Promise<void>}
 */
export async function gracefulShutdown(options = {}) {
  if (isShuttingDown) {
    warn('[GracefulShutdown] Shutdown already in progress');
    return;
  }

  isShuttingDown = true;
  const { timeout = shutdownTimeout, signal = 'SIGTERM' } = options;
  // A shutdown caused by a crash must not report success to the parent
  // process, so it exits non-zero unless the caller says otherwise.
  const exitCode = options.exitCode ?? (signal === 'uncaughtException' ? 1 : 0);

  log(`[GracefulShutdown] Initiating graceful shutdown (signal: ${signal})...`);

  // Set timeout to force exit if shutdown takes too long
  const timeoutId = setTimeout(() => {
    warn('[GracefulShutdown] Shutdown timeout exceeded, forcing exit');
    process.exit(1);
  }, timeout);

  try {
    // Iterate over a snapshot: registerShutdownHandler pushes to and re-sorts
    // the shared array in place, which would otherwise disturb this loop if a
    // handler registered another handler while shutting down.
    const handlersToRun = [...shutdownHandlers];

    // Execute shutdown handlers in priority order
    for (const { handler } of handlersToRun) {
      try {
        await handler();
      } catch (err) {
        warn(`[GracefulShutdown] Handler failed:`, err);
        // Continue with other handlers even if one fails
      }
    }

    // Note: Cache is file-based and doesn't need explicit flushing
    // File writes are atomic, so no cleanup needed
    log('[GracefulShutdown] Cache is file-based, no flush needed');

    clearTimeout(timeoutId);
    log('[GracefulShutdown] Shutdown complete');
    process.exit(exitCode);
  } catch (err) {
    clearTimeout(timeoutId);
    error('[GracefulShutdown] Shutdown failed:', err);
    process.exit(1);
  }
}
|
|
88
|
+
|
|
89
|
+
// Set once the process-level listeners below have been attached, so that a
// repeated call to initGracefulShutdown does not attach them a second time.
let shutdownListenersInstalled = false;

/**
 * Initialize graceful shutdown handlers
 *
 * Registers process listeners for SIGTERM, SIGINT and 'uncaughtException'
 * that trigger `gracefulShutdown`, plus an 'unhandledRejection' listener that
 * only logs. The listeners are attached on the first call only; later calls
 * just update the shutdown timeout used by the signal listeners.
 *
 * @param {Object} [options={}] - Initialization options
 * @param {number} [options.timeout=30000] - Shutdown timeout
 */
export function initGracefulShutdown(options = {}) {
  shutdownTimeout = options.timeout || 30000;

  // The listeners read `shutdownTimeout` when they fire, so updating it above
  // is enough on a repeated call; attaching again would duplicate every log
  // line and shutdown attempt.
  if (shutdownListenersInstalled) {
    return;
  }
  shutdownListenersInstalled = true;

  // Register signal handlers
  process.on('SIGTERM', () => {
    log('[GracefulShutdown] Received SIGTERM');
    gracefulShutdown({ signal: 'SIGTERM', timeout: shutdownTimeout });
  });

  process.on('SIGINT', () => {
    log('[GracefulShutdown] Received SIGINT (Ctrl+C)');
    gracefulShutdown({ signal: 'SIGINT', timeout: shutdownTimeout });
  });

  // Handle uncaught exceptions (best-effort cleanup)
  process.on('uncaughtException', (err) => {
    error('[GracefulShutdown] Uncaught exception:', err);
    gracefulShutdown({ signal: 'uncaughtException', timeout: 5000 }); // Shorter timeout for crashes
  });

  // Handle unhandled promise rejections
  process.on('unhandledRejection', (reason) => {
    warn('[GracefulShutdown] Unhandled promise rejection:', reason);
    // Don't shutdown on unhandled rejections (may be recoverable)
    // But log for monitoring
  });

  log('[GracefulShutdown] Graceful shutdown handlers initialized');
}
|
|
126
|
+
|
|
@@ -68,13 +68,27 @@ export async function getPlaywrightPage(options = {}) {
|
|
|
68
68
|
};
|
|
69
69
|
}
|
|
70
70
|
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
71
|
+
try {
|
|
72
|
+
const browser = await chromium.launch(options.browserOptions || {});
|
|
73
|
+
const page = await browser.newPage();
|
|
74
|
+
|
|
75
|
+
return {
|
|
76
|
+
page,
|
|
77
|
+
browser,
|
|
78
|
+
isMock: false
|
|
79
|
+
};
|
|
80
|
+
} catch (error) {
|
|
81
|
+
// Browser executable not found, fallback to mock
|
|
82
|
+
if (error.message.includes('Executable doesn\'t exist') ||
|
|
83
|
+
error.message.includes('browserType.launch') ||
|
|
84
|
+
error.message.includes('Browser not found')) {
|
|
85
|
+
return {
|
|
86
|
+
page: createMockPage(),
|
|
87
|
+
browser: null,
|
|
88
|
+
isMock: true
|
|
89
|
+
};
|
|
90
|
+
}
|
|
91
|
+
throw error;
|
|
92
|
+
}
|
|
79
93
|
}
|
|
80
94
|
|
|
@@ -5,8 +5,8 @@
|
|
|
5
5
|
* - Non-blocking: Doesn't slow down evaluations
|
|
6
6
|
* - Automatic: Collects VLLM judgments when enabled
|
|
7
7
|
* - Smart sampling: Requests human validation for interesting cases
|
|
8
|
-
* - Learning:
|
|
9
|
-
* -
|
|
8
|
+
* - Learning: Calibrates based on collected data
|
|
9
|
+
* - Integration: Works with all existing systems (batching, temporal, personas)
|
|
10
10
|
*/
|
|
11
11
|
|
|
12
12
|
import { warn, log } from './logger.mjs';
|
|
@@ -362,6 +362,103 @@ export class HumanValidationManager {
|
|
|
362
362
|
};
|
|
363
363
|
}
|
|
364
364
|
|
|
365
|
+
/**
|
|
366
|
+
* Track calibration degradation over screenshot sequences
|
|
367
|
+
*
|
|
368
|
+
* @param {number} sequenceIndex - Index in sequence
|
|
369
|
+
* @param {Object} result - Validation result
|
|
370
|
+
* @returns {Object} Degradation status
|
|
371
|
+
*/
|
|
372
|
+
trackSequenceCalibration(sequenceIndex, result) {
|
|
373
|
+
if (!this.sequenceHistory) {
|
|
374
|
+
this.sequenceHistory = [];
|
|
375
|
+
}
|
|
376
|
+
|
|
377
|
+
const entry = {
|
|
378
|
+
index: sequenceIndex,
|
|
379
|
+
timestamp: Date.now(),
|
|
380
|
+
confidence: result.confidence || 0.5,
|
|
381
|
+
uncertainty: result.uncertainty || 0.5,
|
|
382
|
+
score: result.score,
|
|
383
|
+
logprobs: result.logprobs
|
|
384
|
+
};
|
|
385
|
+
|
|
386
|
+
this.sequenceHistory.push(entry);
|
|
387
|
+
|
|
388
|
+
// Detect degradation (compare recent vs early)
|
|
389
|
+
if (this.sequenceHistory.length >= 5) {
|
|
390
|
+
const recent = this.sequenceHistory.slice(-5);
|
|
391
|
+
const early = this.sequenceHistory.slice(0, 5);
|
|
392
|
+
|
|
393
|
+
const recentAvgConfidence = recent.reduce((sum, e) => sum + e.confidence, 0) / recent.length;
|
|
394
|
+
const earlyAvgConfidence = early.reduce((sum, e) => sum + e.confidence, 0) / early.length;
|
|
395
|
+
|
|
396
|
+
const degradation = earlyAvgConfidence - recentAvgConfidence;
|
|
397
|
+
const degradationThreshold = 0.15; // 15% drop
|
|
398
|
+
|
|
399
|
+
if (degradation > degradationThreshold) {
|
|
400
|
+
return {
|
|
401
|
+
degraded: true,
|
|
402
|
+
degradation,
|
|
403
|
+
recommendation: 'recalibrate_or_reduce_sequence',
|
|
404
|
+
suggestedAction: 'Use temporal graph representation or reduce sequence length'
|
|
405
|
+
};
|
|
406
|
+
}
|
|
407
|
+
}
|
|
408
|
+
|
|
409
|
+
return { degraded: false };
|
|
410
|
+
}
|
|
411
|
+
|
|
412
|
+
/**
|
|
413
|
+
* Get calibration quality metrics for sequence
|
|
414
|
+
*/
|
|
415
|
+
getSequenceCalibrationMetrics() {
|
|
416
|
+
if (!this.sequenceHistory || this.sequenceHistory.length < 2) {
|
|
417
|
+
return { quality: 'unknown', recommendation: 'insufficient_data' };
|
|
418
|
+
}
|
|
419
|
+
|
|
420
|
+
const confidences = this.sequenceHistory.map(e => e.confidence);
|
|
421
|
+
const variance = this.calculateVariance(confidences);
|
|
422
|
+
const trend = this.calculateTrend(confidences);
|
|
423
|
+
|
|
424
|
+
if (variance > 0.1 && trend < -0.05) {
|
|
425
|
+
return {
|
|
426
|
+
quality: 'degrading',
|
|
427
|
+
variance,
|
|
428
|
+
trend,
|
|
429
|
+
recommendation: 'recalibrate_or_reduce_sequence'
|
|
430
|
+
};
|
|
431
|
+
}
|
|
432
|
+
|
|
433
|
+
return {
|
|
434
|
+
quality: variance < 0.05 ? 'stable' : 'variable',
|
|
435
|
+
variance,
|
|
436
|
+
trend
|
|
437
|
+
};
|
|
438
|
+
}
|
|
439
|
+
|
|
440
|
+
/**
|
|
441
|
+
* Calculate variance of values
|
|
442
|
+
*/
|
|
443
|
+
calculateVariance(values) {
|
|
444
|
+
if (values.length === 0) return 0;
|
|
445
|
+
const mean = values.reduce((a, b) => a + b, 0) / values.length;
|
|
446
|
+
const squaredDiffs = values.map(v => Math.pow(v - mean, 2));
|
|
447
|
+
return squaredDiffs.reduce((a, b) => a + b, 0) / values.length;
|
|
448
|
+
}
|
|
449
|
+
|
|
450
|
+
/**
|
|
451
|
+
* Calculate trend of values (positive = increasing, negative = decreasing)
|
|
452
|
+
*/
|
|
453
|
+
calculateTrend(values) {
|
|
454
|
+
if (values.length < 2) return 0;
|
|
455
|
+
const firstHalf = values.slice(0, Math.floor(values.length / 2));
|
|
456
|
+
const secondHalf = values.slice(Math.floor(values.length / 2));
|
|
457
|
+
const firstAvg = firstHalf.reduce((a, b) => a + b, 0) / firstHalf.length;
|
|
458
|
+
const secondAvg = secondHalf.reduce((a, b) => a + b, 0) / secondHalf.length;
|
|
459
|
+
return (secondAvg - firstAvg) / firstAvg;
|
|
460
|
+
}
|
|
461
|
+
|
|
365
462
|
/**
|
|
366
463
|
* Apply calibration adjustments to VLLM score
|
|
367
464
|
*
|
package/src/index.mjs
CHANGED
|
@@ -17,10 +17,35 @@
|
|
|
17
17
|
import { loadEnv } from './load-env.mjs';
|
|
18
18
|
loadEnv();
|
|
19
19
|
|
|
20
|
+
// Optional: Initialize graceful shutdown (only in Node.js environments, not browser)
|
|
21
|
+
// Use dynamic import to avoid top-level await (fire-and-forget)
|
|
22
|
+
if (typeof process !== 'undefined' && process.env.NODE_ENV !== 'test') {
|
|
23
|
+
import('./graceful-shutdown.mjs').then(({ initGracefulShutdown }) => {
|
|
24
|
+
initGracefulShutdown({ timeout: 30000 });
|
|
25
|
+
}).catch(() => {
|
|
26
|
+
// Graceful shutdown is optional, don't fail if unavailable
|
|
27
|
+
});
|
|
28
|
+
}
|
|
29
|
+
|
|
20
30
|
import { VLLMJudge, validateScreenshot as _validateScreenshot } from './judge.mjs';
|
|
21
31
|
|
|
22
32
|
export { VLLMJudge, _validateScreenshot as validateScreenshot };
|
|
23
33
|
|
|
34
|
+
// Export startup validation utilities
|
|
35
|
+
export { validateStartup, validateStartupSoft } from './startup-validation.mjs';
|
|
36
|
+
|
|
37
|
+
// Export graceful shutdown utilities
|
|
38
|
+
export { initGracefulShutdown, registerShutdownHandler, gracefulShutdown } from './graceful-shutdown.mjs';
|
|
39
|
+
|
|
40
|
+
// Export performance measurement utilities
|
|
41
|
+
export {
|
|
42
|
+
PerformanceMeasurement,
|
|
43
|
+
PerformanceProfiler,
|
|
44
|
+
measureAsync,
|
|
45
|
+
measureSync,
|
|
46
|
+
getProfiler
|
|
47
|
+
} from './utils/performance-measurement.mjs';
|
|
48
|
+
|
|
24
49
|
/**
|
|
25
50
|
* Extract semantic information from VLLM judgment text
|
|
26
51
|
*
|
|
@@ -28,7 +53,7 @@ export { VLLMJudge, _validateScreenshot as validateScreenshot };
|
|
|
28
53
|
* Useful for custom implementations that need to parse judgment text.
|
|
29
54
|
*
|
|
30
55
|
* @param {string | object} judgment - Judgment text or object from VLLM
|
|
31
|
-
* @returns {
|
|
56
|
+
* @returns {Object} Structured semantic information with score, issues, assessment, reasoning, brutalistViolations (optional), zeroToleranceViolations (optional)
|
|
32
57
|
*/
|
|
33
58
|
export function extractSemanticInfo(judgment) {
|
|
34
59
|
// Create a temporary judge instance to access the method
|
|
@@ -170,12 +195,31 @@ export {
|
|
|
170
195
|
calculateBackoff,
|
|
171
196
|
enhanceErrorMessage
|
|
172
197
|
} from './retry.mjs';
|
|
198
|
+
|
|
199
|
+
// Cost optimization utilities
|
|
200
|
+
export {
|
|
201
|
+
calculateCostComparison,
|
|
202
|
+
optimizeCost
|
|
203
|
+
} from './cost-optimization.mjs';
|
|
173
204
|
export {
|
|
174
205
|
CostTracker,
|
|
175
206
|
getCostTracker,
|
|
176
207
|
recordCost,
|
|
177
|
-
getCostStats
|
|
208
|
+
getCostStats,
|
|
209
|
+
setBudgetLimit,
|
|
210
|
+
getBudgetStatus
|
|
178
211
|
} from './cost-tracker.mjs';
|
|
212
|
+
// Session-level cost tracking
|
|
213
|
+
export {
|
|
214
|
+
startSession,
|
|
215
|
+
endSession,
|
|
216
|
+
getSessionCosts,
|
|
217
|
+
recordSessionCost,
|
|
218
|
+
recordSessionCacheHit,
|
|
219
|
+
recordSessionCacheMiss,
|
|
220
|
+
getActiveSessions,
|
|
221
|
+
getGlobalCostStats
|
|
222
|
+
} from './session-cost-tracker.mjs';
|
|
179
223
|
export {
|
|
180
224
|
DEFAULT_RUBRIC,
|
|
181
225
|
buildRubricPrompt,
|
|
@@ -281,7 +325,8 @@ export {
|
|
|
281
325
|
export {
|
|
282
326
|
testGameplay,
|
|
283
327
|
testBrowserExperience,
|
|
284
|
-
validateWithGoals
|
|
328
|
+
validateWithGoals,
|
|
329
|
+
validatePage
|
|
285
330
|
} from './convenience.mjs';
|
|
286
331
|
|
|
287
332
|
// Game playing (optional - requires Playwright)
|