@arclabs561/ai-visual-test 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.secretsignore.example +20 -0
- package/CHANGELOG.md +360 -0
- package/CONTRIBUTING.md +63 -0
- package/DEPLOYMENT.md +80 -0
- package/LICENSE +22 -0
- package/README.md +142 -0
- package/SECURITY.md +108 -0
- package/api/health.js +34 -0
- package/api/validate.js +252 -0
- package/index.d.ts +1221 -0
- package/package.json +112 -0
- package/public/index.html +149 -0
- package/src/batch-optimizer.mjs +451 -0
- package/src/bias-detector.mjs +370 -0
- package/src/bias-mitigation.mjs +233 -0
- package/src/cache.mjs +433 -0
- package/src/config.mjs +268 -0
- package/src/constants.mjs +80 -0
- package/src/context-compressor.mjs +350 -0
- package/src/convenience.mjs +617 -0
- package/src/cost-tracker.mjs +257 -0
- package/src/cross-modal-consistency.mjs +170 -0
- package/src/data-extractor.mjs +232 -0
- package/src/dynamic-few-shot.mjs +140 -0
- package/src/dynamic-prompts.mjs +361 -0
- package/src/ensemble/index.mjs +53 -0
- package/src/ensemble-judge.mjs +366 -0
- package/src/error-handler.mjs +67 -0
- package/src/errors.mjs +167 -0
- package/src/experience-propagation.mjs +128 -0
- package/src/experience-tracer.mjs +487 -0
- package/src/explanation-manager.mjs +299 -0
- package/src/feedback-aggregator.mjs +248 -0
- package/src/game-goal-prompts.mjs +478 -0
- package/src/game-player.mjs +548 -0
- package/src/hallucination-detector.mjs +155 -0
- package/src/helpers/playwright.mjs +80 -0
- package/src/human-validation-manager.mjs +516 -0
- package/src/index.mjs +364 -0
- package/src/judge.mjs +929 -0
- package/src/latency-aware-batch-optimizer.mjs +192 -0
- package/src/load-env.mjs +159 -0
- package/src/logger.mjs +55 -0
- package/src/metrics.mjs +187 -0
- package/src/model-tier-selector.mjs +221 -0
- package/src/multi-modal/index.mjs +36 -0
- package/src/multi-modal-fusion.mjs +190 -0
- package/src/multi-modal.mjs +524 -0
- package/src/natural-language-specs.mjs +1071 -0
- package/src/pair-comparison.mjs +277 -0
- package/src/persona/index.mjs +42 -0
- package/src/persona-enhanced.mjs +200 -0
- package/src/persona-experience.mjs +572 -0
- package/src/position-counterbalance.mjs +140 -0
- package/src/prompt-composer.mjs +375 -0
- package/src/render-change-detector.mjs +583 -0
- package/src/research-enhanced-validation.mjs +436 -0
- package/src/retry.mjs +152 -0
- package/src/rubrics.mjs +231 -0
- package/src/score-tracker.mjs +277 -0
- package/src/smart-validator.mjs +447 -0
- package/src/spec-config.mjs +106 -0
- package/src/spec-templates.mjs +347 -0
- package/src/specs/index.mjs +38 -0
- package/src/temporal/index.mjs +102 -0
- package/src/temporal-adaptive.mjs +163 -0
- package/src/temporal-batch-optimizer.mjs +222 -0
- package/src/temporal-constants.mjs +69 -0
- package/src/temporal-context.mjs +49 -0
- package/src/temporal-decision-manager.mjs +271 -0
- package/src/temporal-decision.mjs +669 -0
- package/src/temporal-errors.mjs +58 -0
- package/src/temporal-note-pruner.mjs +173 -0
- package/src/temporal-preprocessor.mjs +543 -0
- package/src/temporal-prompt-formatter.mjs +219 -0
- package/src/temporal-validation.mjs +159 -0
- package/src/temporal.mjs +415 -0
- package/src/type-guards.mjs +311 -0
- package/src/uncertainty-reducer.mjs +470 -0
- package/src/utils/index.mjs +175 -0
- package/src/validation-framework.mjs +321 -0
- package/src/validation-result-normalizer.mjs +64 -0
- package/src/validation.mjs +243 -0
- package/src/validators/accessibility-programmatic.mjs +345 -0
- package/src/validators/accessibility-validator.mjs +223 -0
- package/src/validators/batch-validator.mjs +143 -0
- package/src/validators/hybrid-validator.mjs +268 -0
- package/src/validators/index.mjs +34 -0
- package/src/validators/prompt-builder.mjs +218 -0
- package/src/validators/rubric.mjs +85 -0
- package/src/validators/state-programmatic.mjs +260 -0
- package/src/validators/state-validator.mjs +291 -0
- package/vercel.json +27 -0
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Playwright Helper Utilities
|
|
3
|
+
*
|
|
4
|
+
* Provides utilities for working with Playwright, including graceful
|
|
5
|
+
* handling when Playwright is not installed.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
/**
 * Resolve Playwright's chromium launcher without making Playwright a hard
 * dependency of this package.
 *
 * The `playwright` module is imported dynamically. When the import fails
 * because the module cannot be found, a descriptor with `available: false`
 * and an installation hint is returned instead of throwing. Any other import
 * failure is rethrown unchanged.
 *
 * @returns {Promise<{chromium: any, available: boolean, error?: string}>}
 *   `chromium` is the launcher (or `null`), `available` tells whether the
 *   import succeeded, and `error` is only present when it did not.
 */
export async function getPlaywrightChromium() {
  try {
    const { chromium } = await import('playwright');
    return { chromium, available: true };
  } catch (importError) {
    const moduleIsMissing =
      importError.code === 'ERR_MODULE_NOT_FOUND' ||
      importError.message.includes('Cannot find module');

    // Only a missing module is an expected condition; everything else is a real failure.
    if (!moduleIsMissing) {
      throw importError;
    }

    return {
      chromium: null,
      available: false,
      error: 'Playwright not installed. Install with: npm install --save-dev @playwright/test'
    };
  }
}
|
|
30
|
+
|
|
31
|
+
/**
 * Report whether the `playwright` module can be imported.
 *
 * Delegates to `getPlaywrightChromium()` and forwards its `available` flag,
 * so unexpected import errors propagate from there.
 *
 * @returns {Promise<boolean>} `true` when Playwright could be loaded
 */
export async function isPlaywrightAvailable() {
  const probe = await getPlaywrightChromium();
  return probe.available;
}
|
|
39
|
+
|
|
40
|
+
/**
 * Build a stand-in for a Playwright page, for use when Playwright is not
 * installed.
 *
 * Every method is async and ignores its arguments: `screenshot` resolves to
 * `{ path: 'mock-screenshot.png' }`, `evaluate` resolves to an empty object,
 * and the remaining methods resolve to `undefined`.
 *
 * @returns {object} Mock page object
 */
export function createMockPage() {
  const mockPage = {
    async goto() {},
    async screenshot() {
      return { path: 'mock-screenshot.png' };
    },
    async waitForLoadState() {},
    async waitForTimeout() {},
    async evaluate() {
      return {};
    },
    async close() {}
  };
  return mockPage;
}
|
|
54
|
+
|
|
55
|
+
/**
 * Get a Playwright page, falling back to a mock page when Playwright is not
 * installed.
 *
 * When Playwright is available, a chromium browser is launched with
 * `options.browserOptions` and a new page is opened in it. The caller owns
 * the returned `browser` and is responsible for closing it.
 *
 * @param {object} [options={}] - Options for browser/page creation
 * @param {object} [options.browserOptions] - Passed through to `chromium.launch()`
 * @returns {Promise<{page: any, browser: any, isMock: boolean}>}
 *   `browser` is `null` and `isMock` is `true` when the mock page is returned.
 * @throws Rethrows any launch or page-creation error from Playwright.
 */
export async function getPlaywrightPage(options = {}) {
  const { chromium, available } = await getPlaywrightChromium();

  if (!available) {
    return {
      page: createMockPage(),
      browser: null,
      isMock: true
    };
  }

  // `options?.` also tolerates an explicit `null` options argument.
  const browser = await chromium.launch(options?.browserOptions || {});

  try {
    const page = await browser.newPage();
    return {
      page,
      browser,
      isMock: false
    };
  } catch (error) {
    // The caller never receives `browser` on this path, so it must be closed
    // here or the browser process would leak. Closing is best-effort: the
    // page-creation error is the one worth reporting.
    await browser.close().catch(() => {});
    throw error;
  }
}
|
|
80
|
+
|
|
@@ -0,0 +1,516 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Human Validation Manager
|
|
3
|
+
*
|
|
4
|
+
* Cleverly integrates human validation into the evaluation pipeline:
|
|
5
|
+
* - Non-blocking: Doesn't slow down evaluations
|
|
6
|
+
* - Automatic: Collects VLLM judgments when enabled
|
|
7
|
+
* - Smart sampling: Requests human validation for interesting cases
|
|
8
|
+
* - Learning: Automatically calibrates based on collected data
|
|
9
|
+
* - Seamless: Works with all existing systems (batching, temporal, personas)
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
import { warn, log } from './logger.mjs';
|
|
13
|
+
import { existsSync, readFileSync, writeFileSync, mkdirSync, readdirSync } from 'fs';
|
|
14
|
+
import { join } from 'path';
|
|
15
|
+
|
|
16
|
+
// The human-validation module is imported lazily to avoid circular dependencies.
// Once an import succeeds the module namespace is kept here and reused.
let humanValidationModule = null;

/**
 * Load the human-validation module on first use and reuse it afterwards.
 *
 * A failed import is not cached, so the next call tries again.
 *
 * NOTE(review): the specifier resolves outside this file's directory
 * (`../evaluation/...`); confirm that module is actually shipped with the
 * package, otherwise every call rejects.
 *
 * @returns {Promise<object>} The imported module namespace
 */
async function getHumanValidationModule() {
  if (humanValidationModule) {
    return humanValidationModule;
  }

  const loadedModule = await import('../evaluation/human-validation/human-validation.mjs');
  humanValidationModule = loadedModule;
  return humanValidationModule;
}
|
|
24
|
+
|
|
25
|
+
/**
 * Human Validation Manager
 *
 * Collects VLLM judgments, optionally asks a human validator to judge the
 * same case, and maintains a calibration cache describing how closely the
 * two agree. Collection is a no-op unless the manager is enabled.
 */
export class HumanValidationManager {
  /**
   * @param {{
   *   enabled?: boolean;
   *   autoCollect?: boolean;
   *   smartSampling?: boolean;
   *   calibrationThreshold?: number;
   *   humanValidatorFn?: (vllmJudgment: any) => Promise<any> | null;
   * }} [options={}] - Manager options. `humanValidatorFn` receives the stored
   *   judgment record built by `collectVLLMJudgment` (fields such as `id`,
   *   `screenshot`, `prompt`, `vllmScore`), not the raw validation result.
   */
  constructor(options = {}) {
    const {
      enabled = false,
      autoCollect = true, // Automatically collect VLLM judgments
      smartSampling = true, // Only request human validation for interesting cases
      calibrationThreshold = 0.7, // Minimum correlation for good calibration
      humanValidatorFn = null // Optional function to request human validation
    } = options;

    this.enabled = enabled;
    this.autoCollect = autoCollect;
    this.smartSampling = smartSampling;
    this.calibrationThreshold = calibrationThreshold;
    this.humanValidatorFn = humanValidatorFn;

    // Track VLLM judgments for calibration
    this.vllmJudgments = [];
    this.pendingValidations = new Map(); // Track pending human validations

    // Calibration cache
    this.calibrationCache = null;
    this.calibrationCachePath = null; // Set once the validation directory is known

    // Constructors cannot await, so the cache is loaded in the background.
    // _loadCalibrationCache handles its own errors; the catch is a safety net.
    this._loadCalibrationCache().catch(() => {});
  }

  /**
   * Return the calibration cache, creating it or filling in missing parts.
   *
   * A cache read from disk or produced by `calibrate()` may lack `judgments`
   * or `stats`; the incremental update path needs both.
   *
   * @returns {{judgments: any[], lastCalibration: any, stats: {total: number, agreements: number, disagreements: number}}}
   */
  _ensureCalibrationCache() {
    const existing = this.calibrationCache;
    const cache = existing && typeof existing === 'object' ? existing : {};

    if (!Array.isArray(cache.judgments)) {
      cache.judgments = [];
    }
    if (!('lastCalibration' in cache)) {
      cache.lastCalibration = null;
    }
    cache.stats = {
      total: 0,
      agreements: 0,
      disagreements: 0,
      ...cache.stats
    };

    this.calibrationCache = cache;
    return cache;
  }

  /**
   * Load the calibration cache from `calibration-cache.json` in the
   * validation directory, creating that directory if needed.
   *
   * Never rejects: if the module cannot be loaded or the file cannot be
   * parsed, the cache is left as `null`.
   */
  async _loadCalibrationCache() {
    try {
      const humanValidation = await getHumanValidationModule();
      const VALIDATION_DIR = humanValidation.VALIDATION_DIR;

      // Ensure validation directory exists
      if (!existsSync(VALIDATION_DIR)) {
        mkdirSync(VALIDATION_DIR, { recursive: true });
      }

      if (!this.calibrationCachePath) {
        this.calibrationCachePath = join(VALIDATION_DIR, 'calibration-cache.json');
      }

      if (existsSync(this.calibrationCachePath)) {
        try {
          this.calibrationCache = JSON.parse(readFileSync(this.calibrationCachePath, 'utf-8'));
        } catch (error) {
          warn('Failed to load calibration cache:', error.message);
          this.calibrationCache = null;
        }
      }
    } catch (error) {
      // Deliberately quiet: the human-validation module is optional.
      this.calibrationCache = null;
    }
  }

  /**
   * Write the calibration cache to disk as pretty-printed JSON.
   *
   * A failed write is logged as a warning. Failing to load the
   * human-validation module or to create the directory rejects.
   */
  async _saveCalibrationCache() {
    const humanValidation = await getHumanValidationModule();
    const VALIDATION_DIR = humanValidation.VALIDATION_DIR;

    if (!this.calibrationCachePath) {
      this.calibrationCachePath = join(VALIDATION_DIR, 'calibration-cache.json');
    }

    if (!existsSync(VALIDATION_DIR)) {
      mkdirSync(VALIDATION_DIR, { recursive: true });
    }

    try {
      writeFileSync(this.calibrationCachePath, JSON.stringify(this.calibrationCache, null, 2));
    } catch (error) {
      warn('Failed to save calibration cache:', error.message);
    }
  }

  /**
   * Decide whether a result should be sent for human validation.
   *
   * With smart sampling off every result qualifies. Otherwise a result
   * qualifies when it is an edge case, uncertain, has many issues, has a low
   * score with no issues, or falls into a random 10% sample.
   *
   * @param {object} vllmResult - VLLM validation result
   * @returns {boolean}
   */
  _shouldRequestHumanValidation(vllmResult) {
    if (!this.smartSampling) return true; // Request all if not using smart sampling

    const score = vllmResult.score;
    const hasScore = score !== null;
    const issueCount = vllmResult.issues ? vllmResult.issues.length : null;

    // 1. Edge cases (very high or very low scores)
    if (hasScore && (score <= 3 || score >= 9)) {
      return true;
    }

    // 2. High uncertainty (if available)
    if (vllmResult.uncertainty && vllmResult.uncertainty > 0.3) {
      return true;
    }

    // 3. Many issues detected (might be over-detection)
    if (issueCount !== null && issueCount >= 5) {
      return true;
    }

    // 4. No issues but low score (might be under-detection)
    if (issueCount === 0 && hasScore && score < 6) {
      return true;
    }

    // 5. Random sampling (10% of cases)
    return Math.random() < 0.1;
  }

  /**
   * Collect VLLM judgment (non-blocking)
   *
   * Records the judgment in memory, starts a human validation request in the
   * background when a validator function is configured and sampling selects
   * the result, and starts a background save after every tenth judgment.
   * Does nothing unless both `enabled` and `autoCollect` are set.
   *
   * @param {import('./index.mjs').ValidationResult} vllmResult - VLLM validation result
   * @param {string} imagePath - Screenshot path
   * @param {string} prompt - Evaluation prompt
   * @param {import('./index.mjs').ValidationContext} context - Validation context
   */
  async collectVLLMJudgment(vllmResult, imagePath, prompt, context = {}) {
    if (!this.enabled || !this.autoCollect) return;

    // Generate unique ID (timestamp plus up to nine random base-36 characters)
    const id = context.validationId || `vllm-${Date.now()}-${Math.random().toString(36).slice(2, 11)}`;

    // Store VLLM judgment with temporal and experience context
    const vllmJudgment = {
      id,
      screenshot: imagePath,
      prompt,
      vllmScore: vllmResult.score,
      vllmIssues: vllmResult.issues || [],
      vllmReasoning: vllmResult.reasoning || vllmResult.assessment || '',
      provider: vllmResult.provider || 'unknown',
      timestamp: new Date().toISOString(),
      // Temporal and experience context kept for late interaction
      temporalNotes: context.temporalNotes || null,
      aggregatedNotes: context.aggregatedNotes || null,
      experienceTrace: context.experienceTrace || null,
      context: {
        testType: context.testType,
        viewport: context.viewport,
        persona: context.persona?.name,
        stage: context.stage,
        step: context.step,
        interaction: context.interaction,
        sessionId: context.sessionId,
        experienceTrace: context.experienceTrace?.sessionId || null
      }
    };

    this.vllmJudgments.push(vllmJudgment);

    // Request human validation if a validator exists and sampling selects this result
    if (this.humanValidatorFn && this._shouldRequestHumanValidation(vllmResult)) {
      // Non-blocking: Don't wait for human validation
      this._requestHumanValidation(vllmJudgment).catch(error => {
        warn('Failed to request human validation:', error.message);
      });
    }

    // Auto-save periodically (every 10 judgments) - non-blocking
    if (this.vllmJudgments.length % 10 === 0) {
      // Don't await - save in background to avoid blocking
      this._saveVLLMJudgments().catch(error => {
        warn('Failed to auto-save VLLM judgments:', error.message);
      });
    }
  }

  /**
   * Request human validation for one stored judgment.
   *
   * With a validator function, its result (if any) is recorded as a human
   * judgment and folded into the calibration cache; failures are logged as
   * warnings. Without one, only a hint about the review tool is logged.
   * `collectVLLMJudgment` calls this only when a validator function is set.
   *
   * @param {object} vllmJudgment - Judgment record built by `collectVLLMJudgment`
   */
  async _requestHumanValidation(vllmJudgment) {
    if (!this.humanValidatorFn) {
      // No validator function - point the user at the manual review tool
      log(`[Human Validation] Queued judgment ${vllmJudgment.id} for human review`);
      log(`[Human Validation] Run 'node evaluation/human-validation/real-human-feedback.mjs' to review`);
      return;
    }

    try {
      // The validator may be sync or async and may return null to decline
      const humanResult = await this.humanValidatorFn(vllmJudgment);
      if (!humanResult) return;

      const humanJudgment = {
        id: vllmJudgment.id,
        screenshot: vllmJudgment.screenshot,
        prompt: vllmJudgment.prompt,
        humanScore: humanResult.score,
        humanIssues: humanResult.issues || [],
        humanReasoning: humanResult.reasoning || '',
        timestamp: new Date().toISOString(),
        evaluatorId: humanResult.evaluatorId
      };

      const humanValidation = await getHumanValidationModule();
      humanValidation.collectHumanJudgment(humanJudgment);

      // Awaited so that a failure while updating or saving the cache is
      // caught below instead of surfacing as an unhandled rejection.
      await this._updateCalibrationCache(vllmJudgment, humanJudgment);
    } catch (error) {
      // Human validation is optional, so failures are only warned about
      warn('Human validation request failed:', error.message);
    }
  }

  /**
   * Add a VLLM/human judgment pair to the calibration cache.
   *
   * Scores within one point of each other count as an agreement. Every
   * twentieth pair triggers a recalibration; otherwise the cache is saved.
   *
   * @param {object} vllmJudgment - Record carrying `vllmScore`
   * @param {object} humanJudgment - Record carrying `humanScore`
   */
  async _updateCalibrationCache(vllmJudgment, humanJudgment) {
    const cache = this._ensureCalibrationCache();

    cache.judgments.push({
      vllm: vllmJudgment,
      human: humanJudgment,
      timestamp: new Date().toISOString()
    });

    // Update stats
    cache.stats.total++;
    const scoreDiff = Math.abs(vllmJudgment.vllmScore - humanJudgment.humanScore);
    if (scoreDiff <= 1) {
      cache.stats.agreements++;
    } else {
      cache.stats.disagreements++;
    }

    // Recalibrate every 20 judgments; recalibration saves the cache itself
    if (cache.judgments.length % 20 === 0) {
      await this._recalibrate();
    } else {
      await this._saveCalibrationCache();
    }
  }

  /**
   * Recompute calibration from the cached judgment pairs.
   *
   * Needs at least 10 pairs. Stores the comparison as `lastCalibration`,
   * saves the results, logs the calibration quality and saves the cache.
   * Failures are logged as warnings.
   */
  async _recalibrate() {
    const pairs = this.calibrationCache?.judgments;
    if (!Array.isArray(pairs) || pairs.length < 10) {
      return; // Need at least 10 judgments
    }

    try {
      const humanValidation = await getHumanValidationModule();
      const humanJudgments = pairs.map(pair => pair.human);
      const vllmJudgments = pairs.map(pair => pair.vllm);

      const calibration = humanValidation.compareJudgments(humanJudgments, vllmJudgments);

      this.calibrationCache.lastCalibration = {
        ...calibration,
        timestamp: new Date().toISOString(),
        sampleSize: pairs.length
      };

      // Save calibration results
      humanValidation.saveCalibrationResults(calibration);

      // Log calibration status
      const correlation = calibration.agreement.pearson;
      if (correlation >= this.calibrationThreshold) {
        log(`[Human Validation] Good calibration: r=${correlation.toFixed(3)}, κ=${calibration.agreement.kappa.toFixed(3)}`);
      } else {
        warn(`[Human Validation] Poor calibration: r=${correlation.toFixed(3)}, κ=${calibration.agreement.kappa.toFixed(3)}`);
        warn(`[Human Validation] Recommendations: ${calibration.recommendations.join('; ')}`);
      }

      await this._saveCalibrationCache();
    } catch (error) {
      warn('Failed to recalibrate:', error.message);
    }
  }

  /**
   * Get calibration status
   *
   * @returns {object} `{ calibrated: false, message }` when no calibration
   *   has been computed, otherwise the agreement metrics of the last one.
   */
  getCalibrationStatus() {
    const cal = this.calibrationCache ? this.calibrationCache.lastCalibration : null;
    if (!cal) {
      return {
        calibrated: false,
        message: 'No calibration data available'
      };
    }

    const correlation = cal.agreement.pearson;

    return {
      calibrated: true,
      correlation,
      kappa: cal.agreement.kappa,
      mae: cal.agreement.mae,
      isGood: correlation >= this.calibrationThreshold,
      sampleSize: cal.sampleSize,
      recommendations: cal.recommendations,
      lastCalibration: cal.timestamp
    };
  }

  /**
   * Apply calibration adjustments to VLLM score
   *
   * Subtracts the score bias of the last calibration and clamps the result
   * to 0-10. The score is returned unchanged when there is no calibration,
   * when the score is null/undefined, or when the adjustment would not be a
   * number (for example because the calibration carries no bias).
   *
   * @param {number} vllmScore - Original VLLM score
   * @returns {number} Calibrated score
   */
  applyCalibration(vllmScore) {
    const lastCalibration = this.calibrationCache ? this.calibrationCache.lastCalibration : null;
    if (!lastCalibration) {
      return vllmScore; // No calibration available
    }

    // A missing score must stay missing rather than be coerced to 0
    if (vllmScore === null || vllmScore === undefined) {
      return vllmScore;
    }

    // Simple linear bias correction
    const calibrated = vllmScore - lastCalibration.bias?.scoreBias;
    if (Number.isNaN(calibrated)) {
      return vllmScore;
    }

    // Clamp to valid range
    return Math.max(0, Math.min(10, calibrated));
  }

  /**
   * Save VLLM judgments to disk
   *
   * Writes all in-memory judgments to a new timestamped file, then trims the
   * in-memory list to the last 100 entries. A failed write is logged as a
   * warning.
   */
  async _saveVLLMJudgments() {
    const humanValidation = await getHumanValidationModule();
    const VALIDATION_DIR = humanValidation.VALIDATION_DIR;

    if (!existsSync(VALIDATION_DIR)) {
      mkdirSync(VALIDATION_DIR, { recursive: true });
    }

    const path = join(VALIDATION_DIR, `vllm-judgments-${Date.now()}.json`);
    try {
      const payload = {
        timestamp: new Date().toISOString(),
        judgments: this.vllmJudgments
      };
      writeFileSync(path, JSON.stringify(payload, null, 2));

      // Keep only the last 100 judgments in memory after a successful save
      if (this.vllmJudgments.length > 100) {
        this.vllmJudgments = this.vllmJudgments.slice(-100);
      }
    } catch (error) {
      warn('Failed to save VLLM judgments:', error.message);
    }
  }

  /**
   * Return the VLLM judgments currently held in memory.
   *
   * Nothing is read from disk here.
   *
   * @returns {Array<object>}
   */
  loadVLLMJudgments() {
    return this.vllmJudgments;
  }

  /**
   * Manually trigger calibration
   *
   * Loads every `human-*.json` judgment from the validation directory,
   * matches them by `id` with the in-memory VLLM judgments, compares the
   * two sets and stores the result as the last calibration.
   *
   * @returns {Promise<{success: boolean, message?: string, calibration?: any, sampleSize?: number}>}
   */
  async calibrate() {
    const humanValidation = await getHumanValidationModule();
    const VALIDATION_DIR = humanValidation.VALIDATION_DIR;

    // Load all human judgments
    const humanJudgments = [];

    if (existsSync(VALIDATION_DIR)) {
      for (const file of readdirSync(VALIDATION_DIR)) {
        if (!file.startsWith('human-') || !file.endsWith('.json')) continue;
        try {
          const id = file.replace('human-', '').replace('.json', '');
          const judgment = humanValidation.loadHumanJudgment(id);
          if (judgment) {
            humanJudgments.push(judgment);
          }
        } catch (error) {
          // Skip invalid files
        }
      }
    }

    // Match with VLLM judgments by id
    const humanIds = new Set(humanJudgments.map(h => h.id));
    const vllmJudgments = this.vllmJudgments.filter(v => humanIds.has(v.id));

    // Order the human judgments like their VLLM counterparts so both arrays
    // line up even though one comes from disk and one from memory.
    const vllmPosition = new Map();
    vllmJudgments.forEach((v, index) => {
      if (!vllmPosition.has(v.id)) {
        vllmPosition.set(v.id, index);
      }
    });
    const matchedHumanJudgments = humanJudgments
      .filter(h => vllmPosition.has(h.id))
      .sort((a, b) => vllmPosition.get(a.id) - vllmPosition.get(b.id));

    if (matchedHumanJudgments.length === 0 || vllmJudgments.length === 0) {
      return {
        success: false,
        message: 'No matched judgments found for calibration'
      };
    }

    const calibration = humanValidation.compareJudgments(matchedHumanJudgments, vllmJudgments);
    humanValidation.saveCalibrationResults(calibration);

    // Keep `judgments` and `stats` present so later incremental updates work
    this.calibrationCache = {
      ...this._ensureCalibrationCache(),
      lastCalibration: {
        ...calibration,
        timestamp: new Date().toISOString(),
        sampleSize: matchedHumanJudgments.length
      }
    };
    await this._saveCalibrationCache();

    return {
      success: true,
      calibration,
      sampleSize: matchedHumanJudgments.length
    };
  }
}
|
|
484
|
+
|
|
485
|
+
/**
 * Process-wide manager instance shared by the accessor functions below.
 */
let globalHumanValidationManager = null;

/**
 * Return the shared human validation manager, creating it on first use.
 *
 * `options` only take effect when this call creates the instance; once an
 * instance exists it is returned as-is and `options` are ignored.
 *
 * @param {Object} options - Manager options
 * @returns {HumanValidationManager} Manager instance
 */
export function getHumanValidationManager(options = {}) {
  if (globalHumanValidationManager) {
    return globalHumanValidationManager;
  }

  globalHumanValidationManager = new HumanValidationManager(options);
  return globalHumanValidationManager;
}
|
|
502
|
+
|
|
503
|
+
/**
 * Enable human validation by installing a fresh shared manager.
 *
 * Any previously created shared manager is replaced. The new one is enabled
 * by default, but an explicit `enabled` value in `options` takes precedence.
 *
 * @param {Object} options - Manager options
 * @returns {HumanValidationManager} Manager instance
 */
export function initHumanValidation(options = {}) {
  const managerOptions = Object.assign({ enabled: true }, options);
  globalHumanValidationManager = new HumanValidationManager(managerOptions);
  return globalHumanValidationManager;
}
|
|
516
|
+
|