@arclabs561/ai-visual-test 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93) hide show
  1. package/.secretsignore.example +20 -0
  2. package/CHANGELOG.md +360 -0
  3. package/CONTRIBUTING.md +63 -0
  4. package/DEPLOYMENT.md +80 -0
  5. package/LICENSE +22 -0
  6. package/README.md +142 -0
  7. package/SECURITY.md +108 -0
  8. package/api/health.js +34 -0
  9. package/api/validate.js +252 -0
  10. package/index.d.ts +1221 -0
  11. package/package.json +112 -0
  12. package/public/index.html +149 -0
  13. package/src/batch-optimizer.mjs +451 -0
  14. package/src/bias-detector.mjs +370 -0
  15. package/src/bias-mitigation.mjs +233 -0
  16. package/src/cache.mjs +433 -0
  17. package/src/config.mjs +268 -0
  18. package/src/constants.mjs +80 -0
  19. package/src/context-compressor.mjs +350 -0
  20. package/src/convenience.mjs +617 -0
  21. package/src/cost-tracker.mjs +257 -0
  22. package/src/cross-modal-consistency.mjs +170 -0
  23. package/src/data-extractor.mjs +232 -0
  24. package/src/dynamic-few-shot.mjs +140 -0
  25. package/src/dynamic-prompts.mjs +361 -0
  26. package/src/ensemble/index.mjs +53 -0
  27. package/src/ensemble-judge.mjs +366 -0
  28. package/src/error-handler.mjs +67 -0
  29. package/src/errors.mjs +167 -0
  30. package/src/experience-propagation.mjs +128 -0
  31. package/src/experience-tracer.mjs +487 -0
  32. package/src/explanation-manager.mjs +299 -0
  33. package/src/feedback-aggregator.mjs +248 -0
  34. package/src/game-goal-prompts.mjs +478 -0
  35. package/src/game-player.mjs +548 -0
  36. package/src/hallucination-detector.mjs +155 -0
  37. package/src/helpers/playwright.mjs +80 -0
  38. package/src/human-validation-manager.mjs +516 -0
  39. package/src/index.mjs +364 -0
  40. package/src/judge.mjs +929 -0
  41. package/src/latency-aware-batch-optimizer.mjs +192 -0
  42. package/src/load-env.mjs +159 -0
  43. package/src/logger.mjs +55 -0
  44. package/src/metrics.mjs +187 -0
  45. package/src/model-tier-selector.mjs +221 -0
  46. package/src/multi-modal/index.mjs +36 -0
  47. package/src/multi-modal-fusion.mjs +190 -0
  48. package/src/multi-modal.mjs +524 -0
  49. package/src/natural-language-specs.mjs +1071 -0
  50. package/src/pair-comparison.mjs +277 -0
  51. package/src/persona/index.mjs +42 -0
  52. package/src/persona-enhanced.mjs +200 -0
  53. package/src/persona-experience.mjs +572 -0
  54. package/src/position-counterbalance.mjs +140 -0
  55. package/src/prompt-composer.mjs +375 -0
  56. package/src/render-change-detector.mjs +583 -0
  57. package/src/research-enhanced-validation.mjs +436 -0
  58. package/src/retry.mjs +152 -0
  59. package/src/rubrics.mjs +231 -0
  60. package/src/score-tracker.mjs +277 -0
  61. package/src/smart-validator.mjs +447 -0
  62. package/src/spec-config.mjs +106 -0
  63. package/src/spec-templates.mjs +347 -0
  64. package/src/specs/index.mjs +38 -0
  65. package/src/temporal/index.mjs +102 -0
  66. package/src/temporal-adaptive.mjs +163 -0
  67. package/src/temporal-batch-optimizer.mjs +222 -0
  68. package/src/temporal-constants.mjs +69 -0
  69. package/src/temporal-context.mjs +49 -0
  70. package/src/temporal-decision-manager.mjs +271 -0
  71. package/src/temporal-decision.mjs +669 -0
  72. package/src/temporal-errors.mjs +58 -0
  73. package/src/temporal-note-pruner.mjs +173 -0
  74. package/src/temporal-preprocessor.mjs +543 -0
  75. package/src/temporal-prompt-formatter.mjs +219 -0
  76. package/src/temporal-validation.mjs +159 -0
  77. package/src/temporal.mjs +415 -0
  78. package/src/type-guards.mjs +311 -0
  79. package/src/uncertainty-reducer.mjs +470 -0
  80. package/src/utils/index.mjs +175 -0
  81. package/src/validation-framework.mjs +321 -0
  82. package/src/validation-result-normalizer.mjs +64 -0
  83. package/src/validation.mjs +243 -0
  84. package/src/validators/accessibility-programmatic.mjs +345 -0
  85. package/src/validators/accessibility-validator.mjs +223 -0
  86. package/src/validators/batch-validator.mjs +143 -0
  87. package/src/validators/hybrid-validator.mjs +268 -0
  88. package/src/validators/index.mjs +34 -0
  89. package/src/validators/prompt-builder.mjs +218 -0
  90. package/src/validators/rubric.mjs +85 -0
  91. package/src/validators/state-programmatic.mjs +260 -0
  92. package/src/validators/state-validator.mjs +291 -0
  93. package/vercel.json +27 -0
@@ -0,0 +1,80 @@
1
+ /**
2
+ * Playwright Helper Utilities
3
+ *
4
+ * Provides utilities for working with Playwright, including graceful
5
+ * handling when Playwright is not installed.
6
+ */
7
+
8
/**
 * Resolve Playwright's chromium launcher without failing when the
 * `playwright` package cannot be found.
 *
 * The package is loaded with a dynamic import so that the helper module itself
 * can be imported when Playwright is absent. Only "module not found" failures
 * are turned into a result object; any other import failure is rethrown.
 *
 * @returns {Promise<{chromium: any, available: boolean, error?: string}>}
 *   `available: true` with the chromium launcher when the import succeeds;
 *   otherwise `available: false`, `chromium: null` and an `error` string with
 *   an installation hint.
 * @throws Rethrows the import failure when it is not a "module not found" error.
 */
export async function getPlaywrightChromium() {
  let chromium;

  try {
    ({ chromium } = await import('playwright'));
  } catch (importFailure) {
    const isMissingModule =
      importFailure.code === 'ERR_MODULE_NOT_FOUND' ||
      importFailure.message.includes('Cannot find module');

    // Anything other than "package is missing" is unexpected: surface it.
    if (!isMissingModule) {
      throw importFailure;
    }

    return {
      chromium: null,
      available: false,
      error: 'Playwright not installed. Install with: npm install --save-dev @playwright/test'
    };
  }

  return { chromium, available: true };
}
30
+
31
/**
 * Report whether the `playwright` package could be loaded.
 *
 * Thin wrapper over getPlaywrightChromium() that keeps only its `available`
 * flag; import failures that getPlaywrightChromium() rethrows propagate here.
 *
 * @returns {Promise<boolean>} `true` when Playwright was imported successfully
 */
export async function isPlaywrightAvailable() {
  const probe = await getPlaywrightChromium();
  return probe.available;
}
39
+
40
/**
 * Build a stand-in for a Playwright page, used when Playwright is unavailable.
 *
 * Every method is async and ignores its arguments. `screenshot` resolves to a
 * fixed `{ path: 'mock-screenshot.png' }` record, `evaluate` resolves to an
 * empty object, and the remaining methods resolve to `undefined`.
 *
 * @returns {object} A new mock page object on every call
 */
export function createMockPage() {
  return {
    async goto() {},
    async screenshot() {
      return { path: 'mock-screenshot.png' };
    },
    async waitForLoadState() {},
    async waitForTimeout() {},
    async evaluate() {
      return {};
    },
    async close() {}
  };
}
54
+
55
/**
 * Get a Playwright page, falling back to a mock page when Playwright is not
 * installed.
 *
 * When a real browser is launched, the caller owns it and is responsible for
 * closing the returned `browser`. If the page cannot be created after the
 * browser has launched, the browser is closed here before the error is
 * rethrown so that no browser process is left running.
 *
 * @param {object} [options={}] - Options for browser/page creation
 * @param {object} [options.browserOptions] - Passed to `chromium.launch()`
 * @returns {Promise<{page: any, browser: any, isMock: boolean}>}
 *   `isMock: true` with `browser: null` when the mock page is used
 * @throws Rethrows launch or page-creation failures
 */
export async function getPlaywrightPage(options = {}) {
  const { chromium, available } = await getPlaywrightChromium();

  if (!available) {
    return {
      page: createMockPage(),
      browser: null,
      isMock: true
    };
  }

  const browser = await chromium.launch(options.browserOptions || {});

  try {
    const page = await browser.newPage();
    return {
      page,
      browser,
      isMock: false
    };
  } catch (error) {
    // The caller never receives `browser` on this path, so release it here.
    // A failure to close must not mask the original page-creation error.
    try {
      await browser.close();
    } catch {
      // Intentionally ignored: the original error below is the one to report.
    }
    throw error;
  }
}
80
+
@@ -0,0 +1,516 @@
1
+ /**
2
+ * Human Validation Manager
3
+ *
4
+ * Cleverly integrates human validation into the evaluation pipeline:
5
+ * - Non-blocking: Doesn't slow down evaluations
6
+ * - Automatic: Collects VLLM judgments when enabled
7
+ * - Smart sampling: Requests human validation for interesting cases
8
+ * - Learning: Automatically calibrates based on collected data
9
+ * - Seamless: Works with all existing systems (batching, temporal, personas)
10
+ */
11
+
12
+ import { warn, log } from './logger.mjs';
13
+ import { existsSync, readFileSync, writeFileSync, mkdirSync, readdirSync } from 'fs';
14
+ import { join } from 'path';
15
+
16
// The human-validation module is imported on first use rather than at load
// time, to avoid circular dependencies. The resolved module namespace is kept
// here so later calls reuse it.
let humanValidationModule = null;

/**
 * Load (once) and return the human-validation module.
 *
 * A failed import leaves the cached value untouched, so the next call tries
 * the import again.
 *
 * NOTE(review): the published file list shows no `evaluation/` directory, so
 * this relative import presumably fails in installed packages - confirm.
 *
 * @returns {Promise<any>} The imported module namespace
 */
async function getHumanValidationModule() {
  if (humanValidationModule) {
    return humanValidationModule;
  }

  const loaded = await import('../evaluation/human-validation/human-validation.mjs');
  humanValidationModule = loaded;
  return humanValidationModule;
}
24
+
25
/**
 * Human Validation Manager
 *
 * Collects VLLM judgments, optionally asks a caller-supplied function for the
 * matching human judgment, and maintains a calibration cache that compares the
 * two. File locations and the comparison itself come from the lazily imported
 * human-validation module.
 */
export class HumanValidationManager {
  /**
   * @param {{
   *   enabled?: boolean;
   *   autoCollect?: boolean;
   *   smartSampling?: boolean;
   *   calibrationThreshold?: number;
   *   humanValidatorFn?: ((vllmJudgment: any) => any) | null;
   * }} [options={}] - Manager options. `humanValidatorFn` receives the stored
   *   judgment record (not the raw VLLM result) and may return a value or a
   *   promise; a falsy result means "no human judgment".
   */
  constructor(options = {}) {
    const {
      enabled = false,
      autoCollect = true, // Automatically collect VLLM judgments
      smartSampling = true, // Only request human validation for interesting cases
      calibrationThreshold = 0.7, // Minimum correlation for good calibration
      humanValidatorFn = null // Optional function to request human validation
    } = options;

    this.enabled = enabled;
    this.autoCollect = autoCollect;
    this.smartSampling = smartSampling;
    this.calibrationThreshold = calibrationThreshold;
    this.humanValidatorFn = humanValidatorFn;

    // Track VLLM judgments for calibration
    this.vllmJudgments = [];
    this.pendingValidations = new Map(); // Track pending human validations

    // Calibration cache
    this.calibrationCache = null;
    this.calibrationCachePath = null; // Set once the validation directory is known

    // Constructors cannot await: start the load in the background. A failure
    // leaves the cache empty.
    this._loadCalibrationCache().catch(() => {
      // Silently fail - the cache simply stays empty
    });
  }

  /**
   * Load the calibration cache from disk, creating the validation directory
   * when it does not exist. Any failure leaves `calibrationCache` as null.
   *
   * @returns {Promise<void>}
   */
  async _loadCalibrationCache() {
    try {
      const humanValidation = await getHumanValidationModule();
      const VALIDATION_DIR = humanValidation.VALIDATION_DIR;

      // Ensure validation directory exists
      if (!existsSync(VALIDATION_DIR)) {
        mkdirSync(VALIDATION_DIR, { recursive: true });
      }

      if (!this.calibrationCachePath) {
        this.calibrationCachePath = join(VALIDATION_DIR, 'calibration-cache.json');
      }

      if (existsSync(this.calibrationCachePath)) {
        try {
          this.calibrationCache = JSON.parse(readFileSync(this.calibrationCachePath, 'utf-8'));
        } catch (error) {
          warn('Failed to load calibration cache:', error.message);
          this.calibrationCache = null;
        }
      }
    } catch (error) {
      // Silently fail if module not available
      this.calibrationCache = null;
    }
  }

  /**
   * Persist the calibration cache as pretty-printed JSON.
   *
   * Best effort: any failure (module unavailable, directory creation, write)
   * is reported through `warn` and never rejects, so callers keep their
   * in-memory state.
   *
   * @returns {Promise<void>}
   */
  async _saveCalibrationCache() {
    try {
      const { VALIDATION_DIR } = await getHumanValidationModule();

      if (!this.calibrationCachePath) {
        this.calibrationCachePath = join(VALIDATION_DIR, 'calibration-cache.json');
      }

      if (!existsSync(VALIDATION_DIR)) {
        mkdirSync(VALIDATION_DIR, { recursive: true });
      }

      writeFileSync(this.calibrationCachePath, JSON.stringify(this.calibrationCache, null, 2));
    } catch (error) {
      warn('Failed to save calibration cache:', error.message);
    }
  }

  /**
   * Make sure `calibrationCache` has the `judgments`, `lastCalibration` and
   * `stats` fields the other methods rely on, filling in empty defaults.
   *
   * A cache parsed from disk or produced by calibrate() may lack some of these
   * fields; existing values and unrelated keys are preserved.
   *
   * @returns {object} The normalized cache (also stored on the instance)
   */
  _normalizeCalibrationCache() {
    const cache = this.calibrationCache ?? {};
    const stats = cache.stats ?? {};

    this.calibrationCache = {
      ...cache,
      judgments: Array.isArray(cache.judgments) ? cache.judgments : [],
      lastCalibration: cache.lastCalibration ?? null,
      stats: {
        ...stats,
        total: stats.total ?? 0,
        agreements: stats.agreements ?? 0,
        disagreements: stats.disagreements ?? 0
      }
    };

    return this.calibrationCache;
  }

  /**
   * Decide whether a result should trigger a human validation request.
   *
   * With smart sampling disabled every result qualifies. Otherwise a result
   * qualifies when any rule below matches, plus a random 10% sample.
   *
   * @param {object} vllmResult - Reads `score`, `uncertainty` and `issues`
   * @returns {boolean}
   */
  _shouldRequestHumanValidation(vllmResult) {
    if (!this.smartSampling) return true; // Request all if not using smart sampling

    const score = vllmResult.score;
    const hasScore = score !== null;

    // 1. Edge cases (very high or very low scores)
    if (hasScore && (score <= 3 || score >= 9)) {
      return true;
    }

    // 2. High uncertainty (if available)
    if (vllmResult.uncertainty && vllmResult.uncertainty > 0.3) {
      return true;
    }

    // 3. Many issues detected (might be over-detection)
    if (vllmResult.issues && vllmResult.issues.length >= 5) {
      return true;
    }

    // 4. No issues but low score (might be under-detection)
    if (vllmResult.issues && vllmResult.issues.length === 0 && hasScore && score < 6) {
      return true;
    }

    // 5. Random sampling (10% of cases)
    return Math.random() < 0.1;
  }

  /**
   * Collect VLLM judgment (non-blocking)
   *
   * Does nothing unless both `enabled` and `autoCollect` are set. The human
   * validation request and the periodic save are started without being
   * awaited; their failures are reported through `warn`.
   *
   * @param {import('./index.mjs').ValidationResult} vllmResult - VLLM validation result
   * @param {string} imagePath - Screenshot path
   * @param {string} prompt - Evaluation prompt
   * @param {import('./index.mjs').ValidationContext} context - Validation context
   * @returns {Promise<void>}
   */
  async collectVLLMJudgment(vllmResult, imagePath, prompt, context = {}) {
    if (!this.enabled || !this.autoCollect) return;

    // Use the caller's id when given, otherwise a time + random suffix id.
    // slice(2, 11) drops the leading "0." and keeps up to nine base-36 digits.
    const id = context.validationId || `vllm-${Date.now()}-${Math.random().toString(36).slice(2, 11)}`;

    // Store VLLM judgment with temporal and experience context
    const vllmJudgment = {
      id,
      screenshot: imagePath,
      prompt,
      vllmScore: vllmResult.score,
      vllmIssues: vllmResult.issues || [],
      vllmReasoning: vllmResult.reasoning || vllmResult.assessment || '',
      provider: vllmResult.provider || 'unknown',
      timestamp: new Date().toISOString(),
      // Temporal and experience context for late interaction
      temporalNotes: context.temporalNotes || null,
      aggregatedNotes: context.aggregatedNotes || null,
      experienceTrace: context.experienceTrace || null,
      context: {
        testType: context.testType,
        viewport: context.viewport,
        persona: context.persona?.name,
        stage: context.stage,
        step: context.step,
        interaction: context.interaction,
        sessionId: context.sessionId,
        experienceTrace: context.experienceTrace?.sessionId || null
      }
    };

    this.vllmJudgments.push(vllmJudgment);

    // Request human validation if smart sampling says so
    if (this._shouldRequestHumanValidation(vllmResult) && this.humanValidatorFn) {
      // Non-blocking: Don't wait for human validation
      this._requestHumanValidation(vllmJudgment).catch(error => {
        warn('Failed to request human validation:', error.message);
      });
    }

    // Auto-save periodically (every 10 judgments) - non-blocking
    if (this.vllmJudgments.length % 10 === 0) {
      // Don't await - save in background to avoid blocking
      this._saveVLLMJudgments().catch(error => {
        warn('Failed to auto-save VLLM judgments:', error.message);
      });
    }
  }

  /**
   * Request human validation for a stored judgment.
   *
   * With a `humanValidatorFn`, calls it and, when it yields a result, records
   * the human judgment and updates the calibration cache. Without one, only
   * logs that the judgment awaits review. Note that collectVLLMJudgment()
   * calls this method only when a validator function is set, so the logging
   * branch is reached only by direct callers.
   *
   * @param {object} vllmJudgment - Judgment record built by collectVLLMJudgment()
   * @returns {Promise<void>}
   */
  async _requestHumanValidation(vllmJudgment) {
    if (!this.humanValidatorFn) {
      // No validator function - leave the judgment for later manual review
      log(`[Human Validation] Queued judgment ${vllmJudgment.id} for human review`);
      log(`[Human Validation] Run 'node evaluation/human-validation/real-human-feedback.mjs' to review`);
      return;
    }

    try {
      // The validator may be sync or async and may return null
      const humanResult = await Promise.resolve(this.humanValidatorFn(vllmJudgment));
      if (!humanResult) return;

      const humanJudgment = {
        id: vllmJudgment.id,
        screenshot: vllmJudgment.screenshot,
        prompt: vllmJudgment.prompt,
        humanScore: humanResult.score,
        humanIssues: humanResult.issues || [],
        humanReasoning: humanResult.reasoning || '',
        timestamp: new Date().toISOString(),
        evaluatorId: humanResult.evaluatorId
      };

      const humanValidation = await getHumanValidationModule();
      humanValidation.collectHumanJudgment(humanJudgment);

      // Awaited so that a failure is caught below instead of surfacing as an
      // unhandled promise rejection.
      await this._updateCalibrationCache(vllmJudgment, humanJudgment);
    } catch (error) {
      // Human validation is optional: report and continue
      warn('Human validation request failed:', error.message);
    }
  }

  /**
   * Add a VLLM/human judgment pair to the calibration cache and update the
   * agreement counters. Scores within one point of each other count as an
   * agreement. Every 20th pair triggers a recalibration; otherwise the cache
   * is just saved.
   *
   * @param {object} vllmJudgment - Reads `vllmScore`
   * @param {object} humanJudgment - Reads `humanScore`
   * @returns {Promise<void>}
   */
  async _updateCalibrationCache(vllmJudgment, humanJudgment) {
    const cache = this._normalizeCalibrationCache();

    cache.judgments.push({
      vllm: vllmJudgment,
      human: humanJudgment,
      timestamp: new Date().toISOString()
    });

    // Update stats
    cache.stats.total++;
    const scoreDiff = Math.abs(vllmJudgment.vllmScore - humanJudgment.humanScore);
    if (scoreDiff <= 1) {
      cache.stats.agreements++;
    } else {
      cache.stats.disagreements++;
    }

    // Recalibrate if we have enough data (every 20 judgments)
    if (cache.judgments.length % 20 === 0) {
      await this._recalibrate();
    } else {
      await this._saveCalibrationCache();
    }
  }

  /**
   * Recompute calibration from the cached judgment pairs, store and save the
   * result, and log whether the correlation meets `calibrationThreshold`.
   * Does nothing with fewer than 10 pairs. Failures are reported via `warn`.
   *
   * @returns {Promise<void>}
   */
  async _recalibrate() {
    const pairs = this.calibrationCache?.judgments;
    if (!Array.isArray(pairs) || pairs.length < 10) {
      return; // Need at least 10 judgments
    }

    try {
      const humanValidation = await getHumanValidationModule();
      const humanJudgments = pairs.map(pair => pair.human);
      const vllmJudgments = pairs.map(pair => pair.vllm);

      const calibration = humanValidation.compareJudgments(humanJudgments, vllmJudgments);

      this.calibrationCache.lastCalibration = {
        ...calibration,
        timestamp: new Date().toISOString(),
        sampleSize: pairs.length
      };

      // Save calibration results
      humanValidation.saveCalibrationResults(calibration);

      // Log calibration status
      const correlation = calibration.agreement.pearson;
      if (correlation >= this.calibrationThreshold) {
        log(`[Human Validation] Good calibration: r=${correlation.toFixed(3)}, κ=${calibration.agreement.kappa.toFixed(3)}`);
      } else {
        warn(`[Human Validation] Poor calibration: r=${correlation.toFixed(3)}, κ=${calibration.agreement.kappa.toFixed(3)}`);
        warn(`[Human Validation] Recommendations: ${calibration.recommendations.join('; ')}`);
      }

      await this._saveCalibrationCache();
    } catch (error) {
      warn('Failed to recalibrate:', error.message);
    }
  }

  /**
   * Get calibration status
   *
   * @returns {object} `{ calibrated: false, message }` when no calibration has
   *   been stored; otherwise the stored agreement metrics, with `isGood`
   *   telling whether the correlation meets `calibrationThreshold`.
   */
  getCalibrationStatus() {
    if (!this.calibrationCache || !this.calibrationCache.lastCalibration) {
      return {
        calibrated: false,
        message: 'No calibration data available'
      };
    }

    const cal = this.calibrationCache.lastCalibration;
    const correlation = cal.agreement.pearson;

    return {
      calibrated: true,
      correlation,
      kappa: cal.agreement.kappa,
      mae: cal.agreement.mae,
      isGood: correlation >= this.calibrationThreshold,
      sampleSize: cal.sampleSize,
      recommendations: cal.recommendations,
      lastCalibration: cal.timestamp
    };
  }

  /**
   * Apply calibration adjustments to VLLM score
   *
   * Subtracts the stored score bias and clamps the result to 0..10. The score
   * is returned unchanged when it is null/undefined, when no calibration is
   * stored, or when the stored bias is not a finite number.
   *
   * @param {number} vllmScore - Original VLLM score
   * @returns {number} Calibrated score
   */
  applyCalibration(vllmScore) {
    const bias = this.calibrationCache?.lastCalibration?.bias?.scoreBias;

    if (vllmScore == null || !Number.isFinite(bias)) {
      return vllmScore; // Nothing usable to calibrate with
    }

    // Simple linear bias correction, clamped to the valid score range
    const calibrated = vllmScore - bias;
    return Math.max(0, Math.min(10, calibrated));
  }

  /**
   * Write the in-memory VLLM judgments to a new timestamped JSON file in the
   * validation directory, then trim the in-memory list to its last 100
   * entries. A write failure is reported via `warn`; a module or directory
   * failure rejects (callers in this class catch it).
   *
   * @returns {Promise<void>}
   */
  async _saveVLLMJudgments() {
    const humanValidation = await getHumanValidationModule();
    const VALIDATION_DIR = humanValidation.VALIDATION_DIR;

    if (!existsSync(VALIDATION_DIR)) {
      mkdirSync(VALIDATION_DIR, { recursive: true });
    }

    const path = join(VALIDATION_DIR, `vllm-judgments-${Date.now()}.json`);
    try {
      writeFileSync(path, JSON.stringify({
        timestamp: new Date().toISOString(),
        judgments: this.vllmJudgments
      }, null, 2));

      // Clear in-memory cache after saving (keep last 100)
      if (this.vllmJudgments.length > 100) {
        this.vllmJudgments = this.vllmJudgments.slice(-100);
      }
    } catch (error) {
      warn('Failed to save VLLM judgments:', error.message);
    }
  }

  /**
   * Return the VLLM judgments currently held in memory.
   *
   * Nothing is read from disk here.
   *
   * @returns {Array<object>}
   */
  loadVLLMJudgments() {
    return this.vllmJudgments;
  }

  /**
   * Manually trigger calibration
   *
   * Loads every `human-<id>.json` judgment from the validation directory,
   * matches them by id against the VLLM judgments held in memory, compares
   * the matched sets and stores the result as the latest calibration.
   *
   * NOTE(review): both lists keep their own ordering; whether
   * compareJudgments pairs entries by id or by position is not visible here.
   *
   * @returns {Promise<{success: boolean, message?: string, calibration?: any, sampleSize?: number}>}
   */
  async calibrate() {
    const humanValidation = await getHumanValidationModule();
    const VALIDATION_DIR = humanValidation.VALIDATION_DIR;

    // Load all human judgments
    const humanJudgments = [];
    const prefix = 'human-';
    const suffix = '.json';

    if (existsSync(VALIDATION_DIR)) {
      for (const file of readdirSync(VALIDATION_DIR)) {
        if (!file.startsWith(prefix) || !file.endsWith(suffix)) continue;

        try {
          // Strip exactly the leading prefix and trailing extension
          const id = file.slice(prefix.length, -suffix.length);
          const judgment = humanValidation.loadHumanJudgment(id);
          if (judgment) {
            humanJudgments.push(judgment);
          }
        } catch (error) {
          // Skip invalid files
        }
      }
    }

    // Match with VLLM judgments (id sets avoid a nested scan)
    const humanIds = new Set(humanJudgments.map(h => h.id));
    const vllmJudgments = this.vllmJudgments.filter(v => humanIds.has(v.id));
    const vllmIds = new Set(vllmJudgments.map(v => v.id));
    const matchedHumanJudgments = humanJudgments.filter(h => vllmIds.has(h.id));

    if (matchedHumanJudgments.length === 0 || vllmJudgments.length === 0) {
      return {
        success: false,
        message: 'No matched judgments found for calibration'
      };
    }

    const calibration = humanValidation.compareJudgments(matchedHumanJudgments, vllmJudgments);
    humanValidation.saveCalibrationResults(calibration);

    // Normalize first so the cache keeps its judgments/stats fields even when
    // no cache existed before this call.
    this.calibrationCache = {
      ...this._normalizeCalibrationCache(),
      lastCalibration: {
        ...calibration,
        timestamp: new Date().toISOString(),
        sampleSize: matchedHumanJudgments.length
      }
    };
    await this._saveCalibrationCache();

    return {
      success: true,
      calibration,
      sampleSize: matchedHumanJudgments.length
    };
  }
}
484
+
485
/**
 * Module-wide manager instance shared by the accessor functions below.
 * Stays null until a manager is first requested or initialized.
 */
let globalHumanValidationManager = null;

/**
 * Return the shared human validation manager, creating it on first use.
 *
 * `options` are only applied when this call creates the instance; once a
 * manager exists it is returned as-is and `options` are ignored.
 *
 * @param {Object} options - Manager options
 * @returns {HumanValidationManager} Manager instance
 */
export function getHumanValidationManager(options = {}) {
  if (globalHumanValidationManager) {
    return globalHumanValidationManager;
  }

  globalHumanValidationManager = new HumanValidationManager(options);
  return globalHumanValidationManager;
}
502
+
503
/**
 * Create a fresh manager and install it as the shared instance.
 *
 * The manager is enabled by default; an explicit `enabled` in `options`
 * takes precedence. Any previously installed shared manager is replaced.
 *
 * @param {Object} options - Manager options
 * @returns {HumanValidationManager} Manager instance
 */
export function initHumanValidation(options = {}) {
  const manager = new HumanValidationManager({ enabled: true, ...options });
  globalHumanValidationManager = manager;
  return manager;
}
516
+