@arclabs561/ai-visual-test 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.secretsignore.example +20 -0
- package/CHANGELOG.md +360 -0
- package/CONTRIBUTING.md +63 -0
- package/DEPLOYMENT.md +80 -0
- package/LICENSE +22 -0
- package/README.md +142 -0
- package/SECURITY.md +108 -0
- package/api/health.js +34 -0
- package/api/validate.js +252 -0
- package/index.d.ts +1221 -0
- package/package.json +112 -0
- package/public/index.html +149 -0
- package/src/batch-optimizer.mjs +451 -0
- package/src/bias-detector.mjs +370 -0
- package/src/bias-mitigation.mjs +233 -0
- package/src/cache.mjs +433 -0
- package/src/config.mjs +268 -0
- package/src/constants.mjs +80 -0
- package/src/context-compressor.mjs +350 -0
- package/src/convenience.mjs +617 -0
- package/src/cost-tracker.mjs +257 -0
- package/src/cross-modal-consistency.mjs +170 -0
- package/src/data-extractor.mjs +232 -0
- package/src/dynamic-few-shot.mjs +140 -0
- package/src/dynamic-prompts.mjs +361 -0
- package/src/ensemble/index.mjs +53 -0
- package/src/ensemble-judge.mjs +366 -0
- package/src/error-handler.mjs +67 -0
- package/src/errors.mjs +167 -0
- package/src/experience-propagation.mjs +128 -0
- package/src/experience-tracer.mjs +487 -0
- package/src/explanation-manager.mjs +299 -0
- package/src/feedback-aggregator.mjs +248 -0
- package/src/game-goal-prompts.mjs +478 -0
- package/src/game-player.mjs +548 -0
- package/src/hallucination-detector.mjs +155 -0
- package/src/helpers/playwright.mjs +80 -0
- package/src/human-validation-manager.mjs +516 -0
- package/src/index.mjs +364 -0
- package/src/judge.mjs +929 -0
- package/src/latency-aware-batch-optimizer.mjs +192 -0
- package/src/load-env.mjs +159 -0
- package/src/logger.mjs +55 -0
- package/src/metrics.mjs +187 -0
- package/src/model-tier-selector.mjs +221 -0
- package/src/multi-modal/index.mjs +36 -0
- package/src/multi-modal-fusion.mjs +190 -0
- package/src/multi-modal.mjs +524 -0
- package/src/natural-language-specs.mjs +1071 -0
- package/src/pair-comparison.mjs +277 -0
- package/src/persona/index.mjs +42 -0
- package/src/persona-enhanced.mjs +200 -0
- package/src/persona-experience.mjs +572 -0
- package/src/position-counterbalance.mjs +140 -0
- package/src/prompt-composer.mjs +375 -0
- package/src/render-change-detector.mjs +583 -0
- package/src/research-enhanced-validation.mjs +436 -0
- package/src/retry.mjs +152 -0
- package/src/rubrics.mjs +231 -0
- package/src/score-tracker.mjs +277 -0
- package/src/smart-validator.mjs +447 -0
- package/src/spec-config.mjs +106 -0
- package/src/spec-templates.mjs +347 -0
- package/src/specs/index.mjs +38 -0
- package/src/temporal/index.mjs +102 -0
- package/src/temporal-adaptive.mjs +163 -0
- package/src/temporal-batch-optimizer.mjs +222 -0
- package/src/temporal-constants.mjs +69 -0
- package/src/temporal-context.mjs +49 -0
- package/src/temporal-decision-manager.mjs +271 -0
- package/src/temporal-decision.mjs +669 -0
- package/src/temporal-errors.mjs +58 -0
- package/src/temporal-note-pruner.mjs +173 -0
- package/src/temporal-preprocessor.mjs +543 -0
- package/src/temporal-prompt-formatter.mjs +219 -0
- package/src/temporal-validation.mjs +159 -0
- package/src/temporal.mjs +415 -0
- package/src/type-guards.mjs +311 -0
- package/src/uncertainty-reducer.mjs +470 -0
- package/src/utils/index.mjs +175 -0
- package/src/validation-framework.mjs +321 -0
- package/src/validation-result-normalizer.mjs +64 -0
- package/src/validation.mjs +243 -0
- package/src/validators/accessibility-programmatic.mjs +345 -0
- package/src/validators/accessibility-validator.mjs +223 -0
- package/src/validators/batch-validator.mjs +143 -0
- package/src/validators/hybrid-validator.mjs +268 -0
- package/src/validators/index.mjs +34 -0
- package/src/validators/prompt-builder.mjs +218 -0
- package/src/validators/rubric.mjs +85 -0
- package/src/validators/state-programmatic.mjs +260 -0
- package/src/validators/state-validator.mjs +291 -0
- package/vercel.json +27 -0
|
@@ -0,0 +1,548 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Game Playing Module
|
|
3
|
+
*
|
|
4
|
+
* Optional module for actually playing games (not just testing them).
|
|
5
|
+
* Uses validation to understand game state, then makes decisions and executes actions.
|
|
6
|
+
*
|
|
7
|
+
* Originally motivated by interactive web applications that require
|
|
8
|
+
* real-time validation, variable goals, and temporal understanding.
|
|
9
|
+
*
|
|
10
|
+
* Design: Game playing = validation + decision-making + action execution
|
|
11
|
+
* - Validation: Understand game state from screenshots (we have this)
|
|
12
|
+
* - Decision-making: Choose what action to take (we add this)
|
|
13
|
+
* - Action execution: Execute actions via Playwright (we add this)
|
|
14
|
+
*
|
|
15
|
+
* Provides two interfaces:
|
|
16
|
+
* 1. `playGame()` - Internal loop (simple API for most users)
|
|
17
|
+
* 2. `GameGym` - External iterator (advanced API for power users, RL integration, parallel games)
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
import { validateScreenshot } from './index.mjs';
|
|
21
|
+
import { writeFileSync, mkdirSync, existsSync } from 'fs';
|
|
22
|
+
import { join } from 'path';
|
|
23
|
+
import { log, warn } from './logger.mjs';
|
|
24
|
+
|
|
25
|
+
/**
 * Finds the index of the `}` that closes the `{` located at `start`.
 *
 * Braces that appear inside double-quoted (JSON) strings are ignored so that
 * values such as `"selector": "a{b}"` do not unbalance the scan.
 *
 * @param {string} text - Text to scan
 * @param {number} start - Index of the opening `{`
 * @returns {number} Index of the matching `}`, or -1 if the braces never balance
 */
function findBalancedObjectEnd(text, start) {
  let depth = 0;
  let inString = false;
  let escaped = false;

  for (let i = start; i < text.length; i++) {
    const ch = text[i];

    if (inString) {
      if (escaped) {
        escaped = false;
      } else if (ch === '\\') {
        escaped = true;
      } else if (ch === '"') {
        inString = false;
      }
      continue;
    }

    if (ch === '"') {
      inString = true;
    } else if (ch === '{') {
      depth++;
    } else if (ch === '}') {
      depth--;
      if (depth === 0) {
        return i;
      }
    }
  }

  return -1;
}

/**
 * Extracts the first usable action object embedded in free-form model text.
 *
 * Every `{` is tried as the start of a candidate object; the candidate ends at
 * its balanced closing brace. A candidate is accepted when it parses as JSON
 * and has a `type` plus at least one of `key`, `selector` or `duration`.
 *
 * @param {*} text - Model reasoning text (non-strings yield null)
 * @returns {Object|null} Parsed action, or null if none was found
 */
function extractActionFromText(text) {
  if (typeof text !== 'string') {
    return null;
  }

  let start = text.indexOf('{');
  while (start !== -1) {
    const end = findBalancedObjectEnd(text, start);
    if (end !== -1) {
      try {
        const parsed = JSON.parse(text.slice(start, end + 1));
        if (parsed && parsed.type &&
            (parsed.key || parsed.selector || parsed.duration !== undefined)) {
          return parsed;
        }
      } catch {
        // Candidate was not valid JSON (e.g. prose in braces); try the next one.
      }
    }
    start = text.indexOf('{', start + 1);
  }

  return null;
}

/**
 * Decides what action to take based on game state
 *
 * Calls `validateScreenshot` twice on `gameState.screenshot`: once to evaluate
 * the current state, once to ask for an action. The action is extracted from
 * the `reasoning` text of the second call. If no usable action is found, a
 * score-based heuristic is used instead.
 *
 * @param {Object} gameState - Current game state; `gameState.screenshot` is passed to `validateScreenshot`
 * @param {string} goal - Goal for gameplay (e.g., "maximize score", "survive")
 * @param {Array} history - Previous steps (`{ step, action, result }`); only the last 5 are used
 * @returns {Promise<Object>} Action to take { type: 'keyboard', key: 'ArrowRight', ... }
 */
export async function decideGameAction(gameState, goal, history = []) {
  const recentHistory = history.slice(-5); // Last 5 steps for context

  // Use VLLM to understand current state
  const stateEvaluation = await validateScreenshot(
    gameState.screenshot,
    `Evaluate current game state. Goal: ${goal}. Recent history: ${recentHistory.length} steps.`,
    {
      testType: 'gameplay-state',
      temporalNotes: recentHistory.map(h => ({
        step: h.step,
        action: h.action,
        result: h.result?.score
      }))
    }
  );

  // Use VLLM to decide action
  const actionPrompt = `Based on the game state, decide what action to take.
Goal: ${goal}
Current state: ${stateEvaluation.reasoning?.substring(0, 200) || 'Unknown'}
Previous actions: ${recentHistory.slice(-3).map(h => h.action?.key || h.action?.type || 'unknown').join(', ')}

Return action as JSON: { "type": "keyboard", "key": "ArrowRight" }
Available actions:
- keyboard: ArrowLeft, ArrowRight, ArrowUp, ArrowDown, Space, Enter
- click: { "type": "click", "selector": "#button" }
- wait: { "type": "wait", "duration": 100 }

Choose the action that best achieves the goal.`;

  const actionResult = await validateScreenshot(
    gameState.screenshot,
    actionPrompt,
    {
      extractStructured: true,
      testType: 'gameplay-decision',
      goal: goal
    }
  );

  // Parse action from reasoning. A balanced-brace scan is used instead of a
  // greedy regex so that extra brace groups in the prose do not break parsing.
  const parsedAction = extractActionFromText(actionResult?.reasoning);
  if (parsedAction) {
    return parsedAction;
  }

  // Fallback: simple heuristic based on score.
  // Only compare when both scores are real numbers; an unknown (null/undefined)
  // score must not be read as "the score went down".
  const lastEntry = recentHistory.length > 0 ? recentHistory[recentHistory.length - 1] : null;
  const lastScore = lastEntry?.result?.score;
  const currentScore = stateEvaluation.score;

  if (typeof lastScore === 'number' && typeof currentScore === 'number' && currentScore < lastScore) {
    // Score decreased, try different direction
    return { type: 'keyboard', key: 'ArrowLeft' };
  }

  // Default: move right
  return { type: 'keyboard', key: 'ArrowRight' };
}
|
|
102
|
+
|
|
103
|
+
/**
 * Executes a game action via Playwright
 *
 * Supported action shapes:
 * - `{ type: 'keyboard', key }`      -> `page.keyboard.press(key)`
 * - `{ type: 'click', selector }`    -> `page.click(selector)`
 * - `{ type: 'wait', duration }`     -> `page.waitForTimeout(duration)` (100ms if missing/invalid)
 *
 * Malformed actions (missing key/selector, unknown or missing type, null
 * action) are logged with `warn` and never throw from this function itself;
 * unknown types fall back to a 100ms wait.
 *
 * @param {import('playwright').Page} page - Playwright page object
 * @param {Object} action - Action to execute
 * @returns {Promise<void>}
 */
export async function executeGameAction(page, action) {
  const type = action?.type;

  switch (type) {
    case 'keyboard':
      // Mirror the click branch: a missing key is a warning, not a crash.
      if (action.key) {
        await page.keyboard.press(action.key);
      } else {
        warn('[GamePlayer] Keyboard action missing key');
      }
      break;
    case 'click':
      if (action.selector) {
        await page.click(action.selector);
      } else {
        warn('[GamePlayer] Click action missing selector');
      }
      break;
    case 'wait': {
      // Accept an explicit 0 and numeric strings; anything else uses 100ms.
      const requested = action.duration == null ? NaN : Number(action.duration);
      const duration = Number.isFinite(requested) && requested >= 0 ? requested : 100;
      await page.waitForTimeout(duration);
      break;
    }
    default:
      warn(`[GamePlayer] Unknown action type: ${type}, defaulting to wait`);
      await page.waitForTimeout(100);
  }
}
|
|
129
|
+
|
|
130
|
+
/**
 * Plays a game by taking screenshots, making decisions, and executing actions
 *
 * Each step: capture a screenshot to disk, evaluate it with
 * `validateScreenshot`, pick an action with `decideGameAction`, run it with
 * `executeGameAction`, wait one decision frame, then record the step.
 * This is slower than human gameplay (1-5 FPS for decision-making, not 60 FPS)
 * because VLLM calls take 1-3 seconds.
 *
 * A failing step is logged and recorded in `history` as `{ step, error, state }`
 * and the loop continues with the next step.
 *
 * Originally motivated by interactive web applications, but works for any web game.
 *
 * @param {import('playwright').Page} page - Playwright page object
 * @param {Object} options - Game playing options
 * @param {string} options.goal - Goal for gameplay (e.g., "maximize score")
 * @param {number} options.maxSteps - Maximum number of steps
 * @param {number} options.fps - Frames per second for decision-making (default: 2, not 60);
 *   non-positive or non-finite values fall back to 2
 * @param {string} [options.gameActivationKey] - Keyboard key to activate game
 * @param {string} [options.gameSelector] - Selector to wait for game activation
 * @param {string} [options.tempDir] - Directory for temporary screenshots
 *   (default: `<cwd>/temp-gameplay`)
 * @returns {Promise<Object>} `{ history, finalState, totalSteps, goal, success }`;
 *   `success` is true only when the last captured state has a numeric score
 */
export async function playGame(page, options = {}) {
  const {
    goal = 'Play the game well',
    maxSteps = 100,
    fps = 2, // 2 FPS for decision-making (not 60 FPS - AI needs time to think)
    gameSelector = null,
    gameActivationKey = null,
    tempDir = null
  } = options;

  log('[GamePlayer] Starting game play:', { goal, maxSteps, fps, gameActivationKey });

  // Guard against fps <= 0 / NaN, which would yield an Infinity or negative delay.
  const frameDelayMs = Number.isFinite(fps) && fps > 0 ? 1000 / fps : 500;

  // Activate game if needed
  if (gameActivationKey) {
    log(`[GamePlayer] Activating game with key: ${gameActivationKey}`);
    await page.keyboard.press(gameActivationKey);
    await page.waitForTimeout(500);

    if (gameSelector) {
      try {
        await page.waitForSelector(gameSelector, { timeout: 5000 });
      } catch (error) {
        // Best effort: the game may still be playable without the selector.
        warn(`[GamePlayer] Game selector ${gameSelector} not found after activation`);
      }
    }
  }

  // Create temp directory for screenshots (recursive mkdir is a no-op if it exists)
  const screenshotDir = tempDir || join(process.cwd(), 'temp-gameplay');
  mkdirSync(screenshotDir, { recursive: true });

  const history = [];
  let currentState = null;

  for (let step = 0; step < maxSteps; step++) {
    // State captured during THIS step; stays null if the screenshot itself fails,
    // so an error entry never points at the previous step's state.
    let stepState = null;

    try {
      // 1. Capture current state (screenshot)
      const screenshot = await page.screenshot();
      const screenshotPath = join(screenshotDir, `gameplay-step-${step}.png`);
      writeFileSync(screenshotPath, screenshot);

      // 2. Understand current state (validation)
      stepState = {
        screenshot: screenshotPath,
        step,
        timestamp: Date.now()
      };
      currentState = stepState;

      const stateEvaluation = await validateScreenshot(
        screenshotPath,
        `Evaluate current game state. Goal: ${goal}`,
        {
          testType: 'gameplay',
          temporalNotes: history.map(h => ({
            step: h.step,
            action: h.action,
            result: h.result?.score
          }))
        }
      );

      stepState.evaluation = stateEvaluation;

      // 3. Decide what action to take (decision-making)
      const action = await decideGameAction(stepState, goal, history);

      log(`[GamePlayer] Step ${step}: score=${stateEvaluation.score}, action=${action.type}:${action.key || action.selector || ''}`);

      // 4. Execute action (Playwright)
      await executeGameAction(page, action);

      // 5. Wait for next frame
      await page.waitForTimeout(frameDelayMs);

      // 6. Record history
      history.push({
        step,
        state: stepState,
        action,
        result: stateEvaluation
      });

      // 7. Check if game is over (optional).
      // Non-string issues are skipped so a structured issue object cannot throw
      // here and cause a second (error) history entry for an already recorded step.
      const issues = Array.isArray(stateEvaluation.issues) ? stateEvaluation.issues : [];
      const gameOverReported = issues.some(issue => {
        if (typeof issue !== 'string') {
          return false;
        }
        const text = issue.toLowerCase();
        return text.includes('game over') || text.includes('game ended');
      });

      if (stateEvaluation.score === 0 || gameOverReported) {
        log(`[GamePlayer] Game over detected at step ${step}`);
        break;
      }
    } catch (error) {
      const message = error?.message ?? String(error);
      warn(`[GamePlayer] Error at step ${step}:`, message);
      // Continue with next step (graceful degradation)
      history.push({
        step,
        error: message,
        state: stepState
      });
    }
  }

  return {
    history,
    finalState: currentState,
    totalSteps: history.length,
    goal,
    // `!== null` was true for undefined (no steps / failed evaluation);
    // require an actual numeric score instead.
    success: typeof currentState?.evaluation?.score === 'number'
  };
}
|
|
263
|
+
|
|
264
|
+
/**
 * Game Gym - External Iterator Pattern (RL Gym-style)
 *
 * Provides external iterator interface for game playing, enabling:
 * - Explicit control over iteration
 * - Batching across multiple games
 * - RL algorithm integration
 * - Parallel game instances
 * - Checkpointing and state management
 *
 * Originally motivated by interactive web applications, but designed to work
 * with any RL algorithm or advanced use case.
 *
 * NOTE(review): screenshots are written as `gameplay-step-<n>.png` inside the
 * screenshot directory, so several gyms sharing one `tempDir` write to the same
 * file names. Give each parallel gym its own `tempDir`.
 *
 * @example
 * ```javascript
 * const gym = new GameGym(page, { goal: 'Maximize score' });
 * let obs = await gym.reset();
 *
 * while (!gym.done) {
 *   const action = await decideAction(obs);
 *   const result = await gym.step(action);
 *   obs = result.observation;
 * }
 * ```
 */
export class GameGym {
  /**
   * @param {import('playwright').Page} page - Playwright page object
   * @param {Object} [options] - Gym options (merged over the defaults below)
   * @param {string} [options.goal] - Goal for gameplay
   * @param {number} [options.maxSteps] - Step count after which the episode is done
   * @param {number} [options.fps] - Decision frames per second (default: 2)
   * @param {string} [options.gameSelector] - Selector to wait for after activation
   * @param {string} [options.gameActivationKey] - Keyboard key to activate game
   * @param {string} [options.tempDir] - Directory for screenshots (default: `<cwd>/temp-gameplay`)
   * @param {string} [options.url] - If set, `reset()` navigates here first
   */
  constructor(page, options = {}) {
    this.page = page;
    this.options = {
      goal: 'Play the game well',
      maxSteps: 100,
      fps: 2,
      gameSelector: null,
      gameActivationKey: null,
      tempDir: null,
      ...options
    };

    this.currentState = null;
    this.done = false;
    this.stepCount = 0;
    this.history = [];

    // Create temp directory (recursive mkdir is a no-op if it already exists)
    this.screenshotDir = this.options.tempDir || join(process.cwd(), 'temp-gameplay');
    mkdirSync(this.screenshotDir, { recursive: true });

    log('[GameGym] Created gym:', { goal: this.options.goal, maxSteps: this.options.maxSteps });
  }

  /**
   * Reset game to initial state
   *
   * Optionally navigates to `options.url`, presses the activation key, then
   * captures and evaluates a fresh screenshot. Clears step count and history.
   *
   * @returns {Promise<Object>} Initial observation `{ screenshot, evaluation, step, timestamp }`
   */
  async reset() {
    const { url, gameActivationKey, gameSelector, goal } = this.options;

    // Navigate to game if URL provided
    if (url) {
      await this.page.goto(url, { waitUntil: 'domcontentloaded', timeout: 30000 });
      await this.page.waitForLoadState('networkidle');
    }

    // Activate game if needed
    if (gameActivationKey) {
      log(`[GameGym] Activating game with key: ${gameActivationKey}`);
      await this.page.keyboard.press(gameActivationKey);
      await this.page.waitForTimeout(500);

      if (gameSelector) {
        try {
          await this.page.waitForSelector(gameSelector, { timeout: 5000 });
        } catch (error) {
          // Best effort: continue even if the selector never appears.
          warn(`[GameGym] Game selector ${gameSelector} not found`);
        }
      }
    }

    // Capture initial state
    const screenshot = await this.page.screenshot();
    const screenshotPath = join(this.screenshotDir, `gameplay-reset-${Date.now()}.png`);
    writeFileSync(screenshotPath, screenshot);

    const evaluation = await validateScreenshot(
      screenshotPath,
      `Evaluate initial game state. Goal: ${goal}`,
      {
        testType: 'gameplay-reset',
        goal: goal
      }
    );

    this.currentState = {
      observation: {
        screenshot: screenshotPath,
        evaluation: evaluation,
        step: 0,
        timestamp: Date.now()
      },
      reward: 0,
      done: false,
      info: {
        score: evaluation.score,
        issues: evaluation.issues || [],
        goal: goal
      }
    };

    this.done = false;
    this.stepCount = 0;
    this.history = [];

    log('[GameGym] Reset complete:', { score: evaluation.score });

    return this.currentState.observation;
  }

  /**
   * Execute action and return new observation
   *
   * If the episode is already done, nothing is executed and the current state
   * is returned unchanged.
   *
   * @param {Object} action - Action to execute (see `executeGameAction`)
   * @returns {Promise<Object>} { observation, reward, done, info }
   */
  async step(action) {
    if (this.done) {
      warn('[GameGym] Step called after game is done, reset first');
      return this.currentState;
    }

    // Execute action
    await executeGameAction(this.page, action);

    // Wait for next frame; guard against fps <= 0 / NaN (Infinity or negative delay)
    const fps = this.options.fps;
    const frameDelayMs = Number.isFinite(fps) && fps > 0 ? 1000 / fps : 500;
    await this.page.waitForTimeout(frameDelayMs);

    // Capture new state
    const screenshot = await this.page.screenshot();
    const screenshotPath = join(this.screenshotDir, `gameplay-step-${this.stepCount + 1}.png`);
    writeFileSync(screenshotPath, screenshot);

    const evaluation = await validateScreenshot(
      screenshotPath,
      `Evaluate game state after action. Goal: ${this.options.goal}`,
      {
        testType: 'gameplay',
        temporalNotes: this.history.map(h => ({
          step: h.step,
          action: h.action,
          result: h.result?.score
        })),
        goal: this.options.goal
      }
    );

    // Calculate reward (based on goal) against the state BEFORE this step
    const previousScore = this.currentState?.observation?.evaluation?.score || 0;
    const currentScore = evaluation.score || 0;
    const reward = this.calculateReward(evaluation, this.currentState);

    // Update state. stepCount is incremented first so isDone() sees the new count.
    this.stepCount++;
    this.currentState = {
      observation: {
        screenshot: screenshotPath,
        evaluation: evaluation,
        step: this.stepCount,
        timestamp: Date.now()
      },
      reward: reward,
      done: this.isDone(evaluation),
      info: {
        score: currentScore,
        scoreDelta: currentScore - previousScore,
        issues: evaluation.issues || [],
        goal: this.options.goal,
        step: this.stepCount
      }
    };

    // Record history
    this.history.push({
      step: this.stepCount,
      action: action,
      result: evaluation
    });

    this.done = this.currentState.done;

    log(`[GameGym] Step ${this.stepCount}: score=${currentScore}, reward=${reward}, done=${this.done}`);

    return this.currentState;
  }

  /**
   * Calculate reward based on goal
   *
   * Goal keywords are matched case-insensitively:
   * - contains "maximize" or "score": reward is the score delta
   * - otherwise contains "survive" or "avoid": +1 while score > 0, else -10
   * - anything else: reward is the score delta
   *
   * @param {Object} evaluation - Current evaluation (reads `score`)
   * @param {Object} previousState - Previous state (reads `observation.evaluation.score`)
   * @returns {number} Reward value
   */
  calculateReward(evaluation, previousState) {
    const currentScore = evaluation.score || 0;
    const previousScore = previousState?.observation?.evaluation?.score || 0;

    // Lower-case once so 'Maximize score' / 'Survive' match the keywords.
    const goal = String(this.options.goal ?? '').toLowerCase();
    const wantsScore = goal.includes('maximize') || goal.includes('score');
    const wantsSurvival = goal.includes('survive') || goal.includes('avoid');

    if (!wantsScore && wantsSurvival) {
      // Reward for staying alive
      return currentScore > 0 ? 1 : -10;
    }

    // Score goals and the default both reward the score change
    return currentScore - previousScore;
  }

  /**
   * Check if game is done
   *
   * Done when the score is exactly 0, when any string issue mentions
   * "game over" / "game ended" / "you lost" (case-insensitive), or when
   * `stepCount` has reached `options.maxSteps`.
   *
   * @param {Object} evaluation - Current evaluation
   * @returns {boolean} True if game is done
   */
  isDone(evaluation) {
    // Game over conditions
    if (evaluation.score === 0) {
      return true;
    }

    const gameOverPhrases = ['game over', 'game ended', 'you lost'];
    const issues = Array.isArray(evaluation.issues) ? evaluation.issues : [];
    const gameOverReported = issues.some(issue => {
      // Structured (non-string) issues are skipped rather than crashing the step.
      if (typeof issue !== 'string') {
        return false;
      }
      const text = issue.toLowerCase();
      return gameOverPhrases.some(phrase => text.includes(phrase));
    });
    if (gameOverReported) {
      return true;
    }

    // Max steps reached
    return this.stepCount >= this.options.maxSteps;
  }

  /**
   * Get current observation without stepping
   *
   * @returns {Object} Current observation, or null before the first reset/step
   */
  getObservation() {
    return this.currentState?.observation || null;
  }

  /**
   * Get game state for checkpointing
   *
   * `history` is a shallow copy so later `step()` calls do not grow the
   * checkpoint's array.
   *
   * @returns {Object} Game state `{ observation, stepCount, history, done }`
   */
  getState() {
    return {
      observation: this.currentState?.observation,
      stepCount: this.stepCount,
      history: [...this.history],
      done: this.done
    };
  }

  /**
   * Restore game state from checkpoint
   *
   * Only the gym's bookkeeping is restored; the page itself is not touched.
   *
   * @param {Object} state - Game state from checkpoint (as returned by `getState()`)
   */
  restore(state) {
    this.stepCount = Number.isFinite(state.stepCount) ? state.stepCount : 0;
    // Copy so the gym and the checkpoint do not share one mutable array.
    this.history = Array.isArray(state.history) ? [...state.history] : [];
    this.done = state.done || false;
    // Keep the { observation, reward, done } shape that step() returns.
    this.currentState = { observation: state.observation, reward: 0, done: this.done };

    log(`[GameGym] Restored from checkpoint: step ${this.stepCount}`);
  }
}
|
|
548
|
+
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Hallucination Detection for VLLM Outputs
|
|
3
|
+
*
|
|
4
|
+
* Minimal implementation for detecting when VLLM generates unfaithful outputs.
|
|
5
|
+
* Research: arXiv:2506.19513, 2507.19024, 2509.10345
|
|
6
|
+
*
|
|
7
|
+
* Kept minimal for npm package - focuses on core faithfulness checking.
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
/**
 * Detect hallucination in VLLM judgment
 *
 * Runs up to three heuristic checks on the judgment text and multiplies a
 * confidence value (starting at 1.0) down for each check that fires:
 * 1. faithfulness heuristics (x0.7),
 * 2. logprob-based uncertainty, only when `logprobs` is provided,
 * 3. self-contradiction patterns (x0.8).
 *
 * @param {string} judgment - VLLM judgment text (null/undefined is treated as empty text)
 * @param {string} [imagePath] - Optional: path to screenshot for visual grounding
 *   (accepted for API compatibility; not read by this implementation)
 * @param {{
 *   checkFaithfulness?: boolean;
 *   checkUncertainty?: boolean;
 *   logprobs?: any;
 * }} [options={}] - Detection options
 * @returns {import('./index.mjs').HallucinationDetectionResult} Detection result
 *   `{ hasHallucination, issues, confidence, severity }`; severity is 'high'
 *   for more than 2 issues, 'medium' for 1-2, 'low' for none
 */
export function detectHallucination(judgment, imagePath = null, options = {}) {
  const {
    checkFaithfulness = true,
    checkUncertainty = true,
    logprobs = null
  } = options;

  // The helpers call string methods; normalise once so a missing or
  // non-string judgment cannot crash them.
  const text = typeof judgment === 'string' ? judgment : String(judgment ?? '');

  const issues = [];
  let confidence = 1.0;

  // 1. Faithfulness checking: Look for claims that can't be verified from visual content
  if (checkFaithfulness) {
    const faithfulness = checkFaithfulnessToVisual(text);
    if (!faithfulness.faithful) {
      issues.push(...faithfulness.issues);
      confidence *= 0.7; // Reduce confidence if unfaithful
    }
  }

  // 2. Uncertainty estimation: Use logprobs if available
  if (checkUncertainty && logprobs != null) {
    const uncertainty = estimateUncertaintyFromLogprobs(logprobs);
    if (uncertainty.high) {
      issues.push('High uncertainty detected in model output');
      confidence *= uncertainty.confidence;
    }
  }

  // 3. Contradiction detection: Check for self-contradictions
  const contradictions = detectContradictions(text);
  if (contradictions.length > 0) {
    issues.push(...contradictions);
    confidence *= 0.8;
  }

  let severity = 'low';
  if (issues.length > 2) {
    severity = 'high';
  } else if (issues.length > 0) {
    severity = 'medium';
  }

  return {
    hasHallucination: issues.length > 0,
    issues,
    confidence: Math.max(0, Math.min(1, confidence)),
    severity
  };
}
|
|
64
|
+
|
|
65
|
+
/**
 * Check if judgment is faithful to visual content
 * Minimal heuristic-based approach
 *
 * Flags three text patterns (purely lexical; the image is never inspected):
 * 1. a certainty word directly followed by a number ("exactly 5"),
 * 2. wording about non-visible elements, unless the text says "cannot be seen",
 * 3. more than one layout/detail keyword per ten words.
 *
 * @param {string} judgment - Judgment text (non-strings are treated as empty)
 * @returns {{ faithful: boolean, issues: string[] }} `faithful` is true when no pattern fired
 */
function checkFaithfulnessToVisual(judgment) {
  const text = typeof judgment === 'string' ? judgment : '';
  const issues = [];

  // Red flags for potential hallucination:
  // 1. Overly specific claims without evidence
  // (no `g` flag: `.test()` with a global regex carries `lastIndex` state)
  const specificClaims = /(exactly|precisely|specifically|definitely)\s+\d+/i;
  if (specificClaims.test(text)) {
    issues.push('Overly specific numerical claims without visual evidence');
  }

  // 2. Claims about non-visible elements
  // The exemption is matched case-insensitively, like the trigger words.
  const nonVisible = /(hidden|invisible|behind|underneath|not visible|cannot see)/i;
  const statesCannotBeSeen = /cannot be seen/i.test(text);
  if (nonVisible.test(text) && !statesCannotBeSeen) {
    issues.push('Claims about non-visible elements may be hallucinated');
  }

  // 3. Excessive detail beyond what's reasonable from screenshot
  const wordCount = text.split(/\s+/).length;
  const detailDensity = (text.match(/(color|size|position|font|layout|spacing)/gi) || []).length;
  if (detailDensity > wordCount / 10) {
    issues.push('Excessive detail density may indicate hallucination');
  }

  return {
    faithful: issues.length === 0,
    issues
  };
}
|
|
97
|
+
|
|
98
|
+
/**
 * Estimate uncertainty from logprobs
 * Minimal implementation - just checks if logprobs indicate low confidence
 *
 * Accepted inputs:
 * - a single number (already-averaged logprob),
 * - `{ token_logprobs: number[] }` (OpenAI completions format),
 * - `{ content: [{ logprob: number }, ...] }` (OpenAI chat completions format).
 * Entries that are not finite numbers are ignored. Anything else yields
 * `{ high: false, confidence: 1.0 }`.
 *
 * @param {number|Object|null} logprobs - Logprob data from the model response
 * @returns {{ high: boolean, confidence: number }} `high` is true when the
 *   average logprob is below -2.0; `confidence` maps the average linearly
 *   from [-3, 0] to [0, 1], clamped
 */
function estimateUncertaintyFromLogprobs(logprobs) {
  let samples = [];

  // A bare number must be handled before the object check; otherwise it is
  // rejected as "not an object" and never used.
  if (typeof logprobs === 'number') {
    samples = [logprobs];
  } else if (logprobs && typeof logprobs === 'object') {
    if (Array.isArray(logprobs.token_logprobs)) {
      samples = logprobs.token_logprobs;
    } else if (Array.isArray(logprobs.content)) {
      samples = logprobs.content.map(entry => entry?.logprob);
    }
  }

  // Drop nulls (e.g. first-token placeholder) and anything non-numeric so the
  // average cannot become NaN.
  const valid = samples.filter(p => typeof p === 'number' && Number.isFinite(p));
  if (valid.length === 0) {
    return { high: false, confidence: 1.0 };
  }

  const avgLogprob = valid.reduce((sum, p) => sum + p, 0) / valid.length;

  // Low logprob (more negative) = high uncertainty
  // Threshold: -2.0 is roughly 13% probability
  const high = avgLogprob < -2.0;
  const confidence = Math.max(0, Math.min(1, (avgLogprob + 3) / 3)); // Map -3 to 0, 0 to 1

  return { high, confidence };
}
|
|
130
|
+
|
|
131
|
+
/**
 * Detect self-contradictions in judgment
 *
 * Flags the text when both sides of any opposing word pair appear in it
 * (e.g. "good" and "poor"). Words are matched on word boundaries so that
 * "invalid" does not also count as "valid", "inaccessible" as "accessible",
 * or "follow" as "low". At most one issue is reported.
 *
 * @param {string} judgment - Judgment text (non-strings are treated as empty)
 * @returns {string[]} Empty array, or a single contradiction message
 */
function detectContradictions(judgment) {
  const text = typeof judgment === 'string' ? judgment : '';

  // Common contradiction patterns
  const patterns = [
    { positive: /\b(good|excellent|high|great)\b/i, negative: /\b(bad|poor|low|terrible)\b/i },
    {
      positive: /\b(pass(?:es|ed)?|correct|valid)\b/i,
      negative: /\b(fail(?:s|ed|ure|ures)?|incorrect|invalid)\b/i
    },
    { positive: /\b(accessible|usable)\b/i, negative: /\b(inaccessible|unusable)\b/i }
  ];

  const contradictory = patterns.some(
    ({ positive, negative }) => positive.test(text) && negative.test(text)
  );

  // Only flag once, however many pairs match
  return contradictory ? ['Contradictory statements detected in judgment'] : [];
}
|
|
154
|
+
|
|
155
|
+
|