@arclabs561/ai-visual-test 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.secretsignore.example +20 -0
- package/CHANGELOG.md +360 -0
- package/CONTRIBUTING.md +63 -0
- package/DEPLOYMENT.md +80 -0
- package/LICENSE +22 -0
- package/README.md +142 -0
- package/SECURITY.md +108 -0
- package/api/health.js +34 -0
- package/api/validate.js +252 -0
- package/index.d.ts +1221 -0
- package/package.json +112 -0
- package/public/index.html +149 -0
- package/src/batch-optimizer.mjs +451 -0
- package/src/bias-detector.mjs +370 -0
- package/src/bias-mitigation.mjs +233 -0
- package/src/cache.mjs +433 -0
- package/src/config.mjs +268 -0
- package/src/constants.mjs +80 -0
- package/src/context-compressor.mjs +350 -0
- package/src/convenience.mjs +617 -0
- package/src/cost-tracker.mjs +257 -0
- package/src/cross-modal-consistency.mjs +170 -0
- package/src/data-extractor.mjs +232 -0
- package/src/dynamic-few-shot.mjs +140 -0
- package/src/dynamic-prompts.mjs +361 -0
- package/src/ensemble/index.mjs +53 -0
- package/src/ensemble-judge.mjs +366 -0
- package/src/error-handler.mjs +67 -0
- package/src/errors.mjs +167 -0
- package/src/experience-propagation.mjs +128 -0
- package/src/experience-tracer.mjs +487 -0
- package/src/explanation-manager.mjs +299 -0
- package/src/feedback-aggregator.mjs +248 -0
- package/src/game-goal-prompts.mjs +478 -0
- package/src/game-player.mjs +548 -0
- package/src/hallucination-detector.mjs +155 -0
- package/src/helpers/playwright.mjs +80 -0
- package/src/human-validation-manager.mjs +516 -0
- package/src/index.mjs +364 -0
- package/src/judge.mjs +929 -0
- package/src/latency-aware-batch-optimizer.mjs +192 -0
- package/src/load-env.mjs +159 -0
- package/src/logger.mjs +55 -0
- package/src/metrics.mjs +187 -0
- package/src/model-tier-selector.mjs +221 -0
- package/src/multi-modal/index.mjs +36 -0
- package/src/multi-modal-fusion.mjs +190 -0
- package/src/multi-modal.mjs +524 -0
- package/src/natural-language-specs.mjs +1071 -0
- package/src/pair-comparison.mjs +277 -0
- package/src/persona/index.mjs +42 -0
- package/src/persona-enhanced.mjs +200 -0
- package/src/persona-experience.mjs +572 -0
- package/src/position-counterbalance.mjs +140 -0
- package/src/prompt-composer.mjs +375 -0
- package/src/render-change-detector.mjs +583 -0
- package/src/research-enhanced-validation.mjs +436 -0
- package/src/retry.mjs +152 -0
- package/src/rubrics.mjs +231 -0
- package/src/score-tracker.mjs +277 -0
- package/src/smart-validator.mjs +447 -0
- package/src/spec-config.mjs +106 -0
- package/src/spec-templates.mjs +347 -0
- package/src/specs/index.mjs +38 -0
- package/src/temporal/index.mjs +102 -0
- package/src/temporal-adaptive.mjs +163 -0
- package/src/temporal-batch-optimizer.mjs +222 -0
- package/src/temporal-constants.mjs +69 -0
- package/src/temporal-context.mjs +49 -0
- package/src/temporal-decision-manager.mjs +271 -0
- package/src/temporal-decision.mjs +669 -0
- package/src/temporal-errors.mjs +58 -0
- package/src/temporal-note-pruner.mjs +173 -0
- package/src/temporal-preprocessor.mjs +543 -0
- package/src/temporal-prompt-formatter.mjs +219 -0
- package/src/temporal-validation.mjs +159 -0
- package/src/temporal.mjs +415 -0
- package/src/type-guards.mjs +311 -0
- package/src/uncertainty-reducer.mjs +470 -0
- package/src/utils/index.mjs +175 -0
- package/src/validation-framework.mjs +321 -0
- package/src/validation-result-normalizer.mjs +64 -0
- package/src/validation.mjs +243 -0
- package/src/validators/accessibility-programmatic.mjs +345 -0
- package/src/validators/accessibility-validator.mjs +223 -0
- package/src/validators/batch-validator.mjs +143 -0
- package/src/validators/hybrid-validator.mjs +268 -0
- package/src/validators/index.mjs +34 -0
- package/src/validators/prompt-builder.mjs +218 -0
- package/src/validators/rubric.mjs +85 -0
- package/src/validators/state-programmatic.mjs +260 -0
- package/src/validators/state-validator.mjs +291 -0
- package/vercel.json +27 -0
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Latency-Aware Batch Optimizer
|
|
3
|
+
*
|
|
4
|
+
* Adaptive batching that considers latency requirements for fast reactive games.
|
|
5
|
+
*
|
|
6
|
+
* Key features:
|
|
7
|
+
* - Bypasses batching for critical requests (<100ms requirement)
|
|
8
|
+
* - Adaptive batch sizing based on latency requirements
|
|
9
|
+
* - Deadline-based scheduling
|
|
10
|
+
* - Priority queue for fast games
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import { BatchOptimizer } from './batch-optimizer.mjs';
|
|
14
|
+
|
|
15
|
+
/**
 * Latency-Aware Batch Optimizer
 *
 * Extends BatchOptimizer with latency awareness for fast reactive games.
 *
 * Requests are classified by their latency requirement (in ms):
 * - below 100: processed immediately, bypassing the queue
 * - below 200: flagged as critical and queued with a temporary batch size of 1
 *   (only when adaptive batch sizing is enabled)
 * - otherwise: queued through the standard batching path
 */
export class LatencyAwareBatchOptimizer extends BatchOptimizer {
  /**
   * @param {{
   *   maxConcurrency?: number;
   *   batchSize?: number;
   *   cacheEnabled?: boolean;
   *   defaultMaxLatency?: number;
   *   adaptiveBatchSize?: boolean;
   * }} [options={}] - Optimizer options
   */
  constructor(options = {}) {
    super(options);
    this.defaultMaxLatency = options.defaultMaxLatency || 1000; // Default 1 second
    this.adaptiveBatchSize = options.adaptiveBatchSize !== false;
    this.criticalRequests = new Set(); // Image paths of in-flight critical requests

    // Bookkeeping for the temporary batchSize override in addRequest():
    // number of critical requests currently holding the override, and the
    // batch size to restore once the last of them settles.
    this._batchSizeOverrides = 0;
    this._savedBatchSize = this.batchSize;
  }

  /**
   * Add request with latency requirement
   *
   * @param {string} imagePath - Screenshot path
   * @param {string} prompt - Validation prompt
   * @param {import('./index.mjs').ValidationContext} [context={}] - Validation context
   * @param {number} [maxLatency=null] - Maximum acceptable latency in ms (null = use default)
   * @returns {Promise<import('./index.mjs').ValidationResult>} Validation result
   */
  async addRequest(imagePath, prompt, context = {}, maxLatency = null) {
    // Explicit argument wins, then the context value, then the instance default.
    const latencyRequirement = maxLatency || context.maxLatency || this.defaultMaxLatency;
    const isCritical = latencyRequirement < 200; // Critical if <200ms

    if (isCritical) {
      this.criticalRequests.add(imagePath);
    }

    // If latency requirement is very tight, bypass batching
    if (latencyRequirement < 100) {
      try {
        // Process immediately, no batching
        return await this._processRequest(imagePath, prompt, {
          ...context,
          maxLatency: latencyRequirement,
          critical: true
        });
      } finally {
        // This path never goes through _processQueue(), so the entry has to
        // be released here; otherwise criticalRequests grows without bound.
        this.criticalRequests.delete(imagePath);
      }
    }

    // For slightly less critical requests, use adaptive batch size
    if (this.adaptiveBatchSize && latencyRequirement < 200) {
      // Use batch size 1 while this request is pending. Overlapping critical
      // requests share a single saved value: restoring per call would let a
      // later call capture the already-overridden size and restore it last,
      // leaving batchSize stuck at 1.
      if (this._batchSizeOverrides === 0) {
        this._savedBatchSize = this.batchSize;
      }
      this._batchSizeOverrides += 1;
      this.batchSize = 1; // Process one at a time for very fast games

      try {
        return await this._queueRequest(imagePath, prompt, {
          ...context,
          maxLatency: latencyRequirement,
          critical: isCritical
        });
      } finally {
        this._batchSizeOverrides -= 1;
        if (this._batchSizeOverrides === 0) {
          this.batchSize = this._savedBatchSize;
        }
      }
    }

    // For normal requests, use standard batching
    return this._queueRequest(imagePath, prompt, {
      ...context,
      maxLatency: latencyRequirement
    });
  }

  /**
   * Process queue with latency awareness
   *
   * Works on a snapshot of the queue sorted by ascending latency requirement,
   * taking batches from the front until the snapshot is empty or the
   * concurrency limit is reached.
   *
   * @returns {Promise<void>}
   */
  async _processQueue() {
    if (this.processing || this.queue.length === 0 || this.activeRequests >= this.maxConcurrency) {
      return;
    }

    this.processing = true;

    try {
      // Sort queue by latency requirement (critical first)
      const sortedQueue = [...this.queue].sort((a, b) => {
        const latencyA = a.context?.maxLatency || this.defaultMaxLatency;
        const latencyB = b.context?.maxLatency || this.defaultMaxLatency;

        // Critical requests (low latency) come first
        if (latencyA < latencyB) return -1;
        if (latencyA > latencyB) return 1;

        // If same latency, process in order
        return 0;
      });

      while (sortedQueue.length > 0 && this.activeRequests < this.maxConcurrency) {
        // Calculate adaptive batch size based on latency requirements
        const batchSize = this.adaptiveBatchSize
          ? this._calculateAdaptiveBatchSize(sortedQueue)
          : this.batchSize;

        const batch = sortedQueue.splice(0, batchSize);

        // Remove from original queue. The snapshot holds the same entry
        // objects, so match by identity: matching by imagePath could remove a
        // different queued entry that happens to share the same path.
        batch.forEach(item => {
          const index = this.queue.indexOf(item);
          if (index >= 0) this.queue.splice(index, 1);
        });

        // Process batch
        const promises = batch.map(async ({ imagePath, prompt, context, validateFn, resolve, reject }) => {
          try {
            // Check cache
            if (this.cache) {
              const cacheKey = this._getCacheKey(imagePath, prompt, context);
              if (this.cache.has(cacheKey)) {
                resolve(this.cache.get(cacheKey));
                return;
              }
            }

            const result = await this._processRequest(imagePath, prompt, context, validateFn);
            resolve(result);
          } catch (error) {
            reject(error);
          } finally {
            // Remove from critical requests if processed
            this.criticalRequests.delete(imagePath);
          }
        });

        // Each entry settles its own resolve/reject above, so the batch as a
        // whole is only awaited for completion.
        await Promise.allSettled(promises);
      }
    } finally {
      this.processing = false;
    }
  }

  /**
   * Calculate adaptive batch size based on latency requirements
   *
   * @param {Array<{ context?: { maxLatency?: number } }>} queue - Pending entries, most urgent first
   * @returns {number} 1 when the first entry needs <100ms, 2 when it needs <200ms,
   *   otherwise the configured batch size
   */
  _calculateAdaptiveBatchSize(queue) {
    if (queue.length === 0) return this.batchSize;

    // Get latency requirement of first request
    const firstLatency = queue[0].context?.maxLatency || this.defaultMaxLatency;

    // Very fast games (<100ms) - no batching
    if (firstLatency < 100) return 1;

    // Fast games (<200ms) - small batches
    if (firstLatency < 200) return 2;

    // Normal games - standard batch size
    return this.batchSize;
  }

  /**
   * Get latency-aware statistics
   *
   * @returns {object} The base cache statistics extended with the number of
   *   tracked critical requests and the latency requirement of every queued entry
   */
  getLatencyStats() {
    return {
      ...this.getCacheStats(),
      criticalRequests: this.criticalRequests.size,
      queueLatencyRequirements: this.queue.map(q => ({
        imagePath: q.imagePath,
        maxLatency: q.context?.maxLatency || this.defaultMaxLatency,
        critical: (q.context?.maxLatency || this.defaultMaxLatency) < 200
      }))
    };
  }
}
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
|
package/src/load-env.mjs
ADDED
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Environment Variable Loader
|
|
3
|
+
*
|
|
4
|
+
* Loads environment variables from .env file if it exists.
|
|
5
|
+
* Works in both local development and deployed environments.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import { readFileSync, existsSync } from 'fs';
|
|
9
|
+
import { join, dirname } from 'path';
|
|
10
|
+
import { fileURLToPath } from 'url';
|
|
11
|
+
import { warn } from './logger.mjs';
|
|
12
|
+
|
|
13
|
+
const __filename = fileURLToPath(import.meta.url);
|
|
14
|
+
const __dirname = dirname(__filename);
|
|
15
|
+
|
|
16
|
+
// SECURITY: only the keys listed here may be loaded from a .env file.
// Any other key found in the file is ignored, so a .env file cannot set
// arbitrary process environment variables.
const ALLOWED_ENV_KEYS = [
  // API-key style entries
  'GEMINI_API_KEY', 'OPENAI_API_KEY', 'ANTHROPIC_API_KEY',
  'GROQ_API_KEY', // Added for Groq integration (high-frequency decisions)
  'API_KEY', 'VLLM_API_KEY',
  // VLM_* selection entries
  'VLM_PROVIDER', 'VLM_MODEL', 'VLM_MODEL_TIER',
  // Remaining entries
  'RATE_LIMIT_MAX_REQUESTS', 'REQUIRE_AUTH'
];

// Accepted values for VLM_PROVIDER.
// Groq added for high-frequency decisions (10-60Hz temporal decisions)
const VALID_PROVIDERS = ['gemini', 'openai', 'claude', 'groq'];
|
|
35
|
+
|
|
36
|
+
// Validation functions for environment variables
|
|
37
|
+
/**
 * Parse a RATE_LIMIT_MAX_REQUESTS value and check it lies in 1..1000.
 *
 * @param {string} value - Raw value read from the .env file
 * @returns {number | null} The parsed integer, or null (after a warning) when
 *   the value does not parse or is out of range
 */
function validateRateLimitMaxRequests(value) {
  const parsed = Number.parseInt(value, 10);
  const withinRange = !Number.isNaN(parsed) && parsed >= 1 && parsed <= 1000;

  if (withinRange) {
    return parsed;
  }

  warn(`[LoadEnv] Invalid RATE_LIMIT_MAX_REQUESTS: ${value}. Must be between 1 and 1000. Using default.`);
  return null; // Caller falls back to its default
}
|
|
45
|
+
|
|
46
|
+
/**
 * Normalise a VLM_PROVIDER value (lower-cased, trimmed) and check it against
 * VALID_PROVIDERS.
 *
 * @param {string | null | undefined} value - Raw value read from the .env file
 * @returns {string | null | undefined} The normalised provider; an empty or
 *   missing value is passed through unchanged in its normalised form; null
 *   (after a warning) when the provider is not recognised
 */
function validateVLMProvider(value) {
  const candidate = value?.toLowerCase().trim();

  // Empty/missing values are not validated; known providers pass as-is.
  if (!candidate || VALID_PROVIDERS.includes(candidate)) {
    return candidate;
  }

  warn(`[LoadEnv] Invalid VLM_PROVIDER: ${value}. Must be one of: ${VALID_PROVIDERS.join(', ')}. Ignoring.`);
  return null; // Ignore invalid provider
}
|
|
54
|
+
|
|
55
|
+
/**
 * Interpret a REQUIRE_AUTH value as a boolean.
 *
 * Matching is exact (case-sensitive, no trimming).
 *
 * @param {string} value - Raw value read from the .env file
 * @returns {boolean | null} true for 'true'/'1'/'yes', false for
 *   'false'/'0'/'no'/'', null (after a warning) for anything else
 */
function validateRequireAuth(value) {
  switch (value) {
    case 'true':
    case '1':
    case 'yes':
      return true;

    case 'false':
    case '0':
    case 'no':
    case '':
      return false;

    default:
      warn(`[LoadEnv] Invalid REQUIRE_AUTH: ${value}. Must be 'true' or 'false'. Using default.`);
      return null; // Caller falls back to its default
  }
}
|
|
65
|
+
|
|
66
|
+
/**
 * Load environment variables from .env file
 *
 * Candidate locations are checked in order and the first one that exists and
 * can be loaded wins. Only keys listed in ALLOWED_ENV_KEYS are applied, and a
 * variable that already has a non-empty value in process.env is never
 * overwritten. A candidate that exists but cannot be loaded (for example
 * because it is a directory or unreadable) is reported through warn() and the
 * search continues with the next candidate.
 *
 * @param {string | null} [basePath=null] - Base path to search for .env file (optional)
 * @returns {boolean} True if .env file was found and loaded
 */
export function loadEnv(basePath = null) {
  // Try multiple locations for .env file
  const possiblePaths = basePath
    ? [
        join(basePath, '.env'),
        join(basePath, '..', '.env'),
        join(basePath, '../..', '.env')
      ]
    : [
        join(process.cwd(), '.env'),
        join(__dirname, '..', '.env'),
        join(__dirname, '../../..', '.env'),
        join(__dirname, '../../../..', '.env')
      ];

  for (const envPath of possiblePaths) {
    if (!existsSync(envPath)) {
      continue;
    }

    try {
      const envContent = readFileSync(envPath, 'utf8');

      // Splitting on '\n' is enough for CRLF files too: each line is trimmed.
      for (const line of envContent.split('\n')) {
        const trimmed = line.trim();

        // Skip comments and empty lines
        if (!trimmed || trimmed.startsWith('#')) {
          continue;
        }

        // Parse KEY=VALUE format (everything after the first '=' is the value)
        const match = trimmed.match(/^([^=]+)=(.*)$/);
        if (!match) {
          continue;
        }

        const key = match[1].trim();
        let value = match[2].trim();

        // SECURITY: Only allow whitelisted environment variable keys
        // Prevents malicious .env files from setting arbitrary variables
        if (!ALLOWED_ENV_KEYS.includes(key)) {
          warn(`[LoadEnv] Ignoring unknown environment variable key: ${key}`);
          continue;
        }

        // Remove quotes if present
        if ((value.startsWith('"') && value.endsWith('"')) ||
            (value.startsWith("'") && value.endsWith("'"))) {
          value = value.slice(1, -1);
        }

        // Validate and transform values based on key
        let validatedValue = value;
        if (key === 'RATE_LIMIT_MAX_REQUESTS') {
          const validated = validateRateLimitMaxRequests(value);
          if (validated === null) {
            continue; // Skip invalid value
          }
          validatedValue = String(validated);
        } else if (key === 'VLM_PROVIDER') {
          const validated = validateVLMProvider(value);
          if (validated === null) {
            continue; // Skip invalid value
          }
          validatedValue = validated;
        } else if (key === 'REQUIRE_AUTH') {
          const validated = validateRequireAuth(value);
          if (validated === null) {
            continue; // Skip invalid value
          }
          validatedValue = String(validated);
        }

        // Only set if not already set (env vars take precedence)
        if (!process.env[key]) {
          process.env[key] = validatedValue;
        }
      }

      return true;
    } catch (err) {
      // .env is optional, so a failure never throws. It is reported (visible
      // only in debug mode) and the remaining candidates still get a chance
      // instead of the whole search being aborted.
      warn(`[LoadEnv] Failed to load ${envPath}: ${err?.message ?? err}`);
    }
  }

  return false;
}
|
|
159
|
+
|
package/src/logger.mjs
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Simple logger utility
|
|
3
|
+
*
|
|
4
|
+
* Provides conditional logging that respects debug mode.
|
|
5
|
+
* In production, warnings are silent unless explicitly enabled.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
// Module-wide switch: warn() and log() only produce output while this is true.
let DEBUG_ENABLED = false;

/**
 * Turn debug output on.
 */
export function enableDebug() {
  DEBUG_ENABLED = true;
}

/**
 * Turn debug output off.
 */
export function disableDebug() {
  DEBUG_ENABLED = false;
}

/**
 * Report whether debug output is currently on.
 *
 * @returns {boolean} Current state of the debug switch
 */
export function isDebugEnabled() {
  return DEBUG_ENABLED;
}

/**
 * Forward a warning to console.warn; does nothing unless debug is enabled.
 *
 * @param {...any} args - Values passed through to console.warn
 */
export function warn(...args) {
  if (!DEBUG_ENABLED) {
    return;
  }
  console.warn(...args);
}

/**
 * Forward an informational message to console.log; does nothing unless debug
 * is enabled.
 *
 * @param {...any} args - Values passed through to console.log
 */
export function log(...args) {
  if (!DEBUG_ENABLED) {
    return;
  }
  console.log(...args);
}

/**
 * Forward an error to console.error, regardless of the debug switch.
 *
 * @param {...any} args - Values passed through to console.error
 */
export function error(...args) {
  console.error(...args);
}
|
|
55
|
+
|
package/src/metrics.mjs
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Evaluation Metrics
|
|
3
|
+
*
|
|
4
|
+
* Provides comprehensive metrics for evaluation results, including:
|
|
5
|
+
* - Spearman's rank correlation (for ordinal ratings)
|
|
6
|
+
* - Pearson's correlation
|
|
7
|
+
* - Agreement metrics
|
|
8
|
+
* - Rank-based metrics
|
|
9
|
+
*
|
|
10
|
+
* Research: Spearman's ρ is more appropriate than Pearson's r for LLM evaluation
|
|
11
|
+
* (arXiv:2506.02945).
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
/**
 * Spearman's rank correlation coefficient.
 *
 * Positions where either value is null or undefined are dropped, the
 * remaining values of each series are ranked, and Pearson's correlation of
 * the two rank series is returned.
 *
 * @param {Array<number>} x - First set of values
 * @param {Array<number>} y - Second set of values
 * @returns {number | null} Spearman's ρ (rho), or null if the inputs differ in
 *   length, fewer than two usable pairs remain, or the rank correlation is
 *   itself null
 */
export function spearmanCorrelation(x, y) {
  const size = x.length;
  if (size !== y.length || size < 2) {
    return null;
  }

  // Keep only the positions where both series have a value.
  const keptX = [];
  const keptY = [];
  for (let i = 0; i < size; i++) {
    if (x[i] != null && y[i] != null) {
      keptX.push(x[i]);
      keptY.push(y[i]);
    }
  }

  if (keptX.length < 2) {
    return null;
  }

  // Spearman's rho is Pearson's r computed on the ranks.
  return pearsonCorrelation(rank(keptX), rank(keptY));
}
|
|
44
|
+
|
|
45
|
+
/**
 * Pearson's correlation coefficient.
 *
 * @param {Array<number>} x - First set of values
 * @param {Array<number>} y - Second set of values
 * @returns {number | null} Pearson's r, or null if the inputs differ in
 *   length, hold fewer than two values, or either series has no variance
 */
export function pearsonCorrelation(x, y) {
  const n = x.length;
  if (n !== y.length || n < 2) {
    return null;
  }

  const total = (values) => values.reduce((acc, v) => acc + v, 0);
  const meanX = total(x) / n;
  const meanY = total(y) / n;

  // Sum of cross-products and the two sums of squared deviations.
  let crossSum = 0;
  let squaresX = 0;
  let squaresY = 0;

  for (const [i, xi] of x.entries()) {
    const dx = xi - meanX;
    const dy = y[i] - meanY;
    crossSum += dx * dy;
    squaresX += dx * dx;
    squaresY += dy * dy;
  }

  const scale = Math.sqrt(squaresX * squaresY);

  // A zero scale means at least one series is constant: r is undefined.
  return scale === 0 ? null : crossSum / scale;
}
|
|
81
|
+
|
|
82
|
+
/**
 * Assign ascending 1-based ranks to a list of numbers.
 *
 * Equal values share the average of the ranks they would jointly occupy
 * (for example two values tied for ranks 2 and 3 both receive 2.5).
 *
 * @param {Array<number>} values - Values to rank
 * @returns {Array<number>} Ranks (1-indexed), in the same order as `values`
 */
function rank(values) {
  // Remember each value's original position before sorting ascending.
  const ordered = values
    .map((value, index) => ({ value, index }))
    .sort((a, b) => a.value - b.value);

  const ranks = new Array(values.length);

  let start = 0;
  while (start < ordered.length) {
    // Extend [start, end) over the run of entries equal to the first one.
    let end = start + 1;
    while (end < ordered.length && ordered[start].value === ordered[end].value) {
      end++;
    }

    // The run occupies ranks start+1 .. end; their mean is shared by all.
    const sharedRank = (start + 1 + end) / 2;
    for (let k = start; k < end; k++) {
      ranks[ordered[k].index] = sharedRank;
    }

    start = end;
  }

  return ranks;
}
|
|
119
|
+
|
|
120
|
+
/**
 * Calculate agreement between two rankings
 *
 * Combines the three correlation measures with a position-by-position count
 * of identical entries.
 *
 * @param {Array<number>} ranking1 - First ranking (indices or scores)
 * @param {Array<number>} ranking2 - Second ranking (indices or scores)
 * @returns {{
 *   spearman: number | null;
 *   pearson: number | null;
 *   kendall: number | null;
 *   exactMatches: number;
 *   totalItems: number;
 *   agreementRate: number;
 * }} Agreement metrics. `totalItems` is the length of `ranking1`;
 *   `agreementRate` is `exactMatches / totalItems`, or 0 when `ranking1` is
 *   empty.
 */
export function calculateRankAgreement(ranking1, ranking2) {
  const spearman = spearmanCorrelation(ranking1, ranking2);
  const pearson = pearsonCorrelation(ranking1, ranking2);
  const kendall = kendallTau(ranking1, ranking2);

  // Count exact matches (strict equality at the same position)
  const exactMatches = ranking1.filter((r1, i) => r1 === ranking2[i]).length;
  const totalItems = ranking1.length;

  return {
    spearman,
    pearson,
    kendall,
    exactMatches,
    totalItems,
    // Guard the empty case: 0 / 0 would otherwise yield NaN.
    agreementRate: totalItems === 0 ? 0 : exactMatches / totalItems
  };
}
|
|
150
|
+
|
|
151
|
+
/**
 * Kendall-style rank correlation over all index pairs.
 *
 * Every pair (i, j) with i < j is classified as concordant (both series move
 * in the same direction) or discordant (opposite directions). Pairs tied in
 * either series are left out of both the numerator and the denominator, so
 * the result is (concordant - discordant) / (concordant + discordant).
 *
 * @param {Array<number>} x - First ranking
 * @param {Array<number>} y - Second ranking
 * @returns {number | null} Kendall's τ, or null if the inputs differ in
 *   length, hold fewer than two values, or every pair is tied
 */
function kendallTau(x, y) {
  const n = x.length;
  if (n !== y.length || n < 2) {
    return null;
  }

  let score = 0;   // concordant pairs minus discordant pairs
  let decided = 0; // pairs that were either concordant or discordant

  for (let i = 0; i < n; i++) {
    for (let j = i + 1; j < n; j++) {
      const product = (x[i] - x[j]) * (y[i] - y[j]);

      if (product > 0) {
        score += 1;
        decided += 1;
      } else if (product < 0) {
        score -= 1;
        decided += 1;
      }
      // product === 0 (a tie in x or y) contributes nothing
    }
  }

  return decided === 0 ? null : score / decided;
}
|
|
187
|
+
|