@arclabs561/ai-visual-test 0.5.1 → 0.7.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +127 -11
- package/DEPLOYMENT.md +225 -9
- package/README.md +71 -80
- package/index.d.ts +902 -5
- package/package.json +10 -51
- package/src/batch-optimizer.mjs +39 -0
- package/src/cache.mjs +241 -16
- package/src/config.mjs +33 -91
- package/src/constants.mjs +54 -0
- package/src/convenience.mjs +113 -10
- package/src/cost-optimization.mjs +1 -0
- package/src/cost-tracker.mjs +134 -2
- package/src/data-extractor.mjs +36 -7
- package/src/dynamic-few-shot.mjs +69 -11
- package/src/errors.mjs +6 -2
- package/src/experience-propagation.mjs +12 -0
- package/src/experience-tracer.mjs +12 -3
- package/src/game-player.mjs +222 -43
- package/src/graceful-shutdown.mjs +126 -0
- package/src/helpers/playwright.mjs +22 -8
- package/src/human-validation-manager.mjs +99 -2
- package/src/index.mjs +48 -3
- package/src/integrations/playwright.mjs +140 -0
- package/src/judge.mjs +699 -24
- package/src/load-env.mjs +2 -1
- package/src/logger.mjs +31 -3
- package/src/model-tier-selector.mjs +1 -221
- package/src/natural-language-specs.mjs +31 -3
- package/src/persona-enhanced.mjs +4 -2
- package/src/persona-experience.mjs +1 -1
- package/src/pricing.mjs +28 -0
- package/src/prompt-composer.mjs +162 -5
- package/src/provider-data.mjs +115 -0
- package/src/render-change-detector.mjs +5 -0
- package/src/research-enhanced-validation.mjs +7 -5
- package/src/retry.mjs +21 -7
- package/src/rubrics.mjs +4 -0
- package/src/safe-logger.mjs +71 -0
- package/src/session-cost-tracker.mjs +320 -0
- package/src/smart-validator.mjs +8 -8
- package/src/spec-templates.mjs +52 -6
- package/src/startup-validation.mjs +127 -0
- package/src/temporal-adaptive.mjs +2 -2
- package/src/temporal-decision-manager.mjs +1 -271
- package/src/temporal-logic.mjs +104 -0
- package/src/temporal-note-pruner.mjs +119 -0
- package/src/temporal-preprocessor.mjs +1 -543
- package/src/temporal.mjs +681 -79
- package/src/utils/action-hallucination-detector.mjs +301 -0
- package/src/utils/baseline-validator.mjs +82 -0
- package/src/utils/cache-stats.mjs +104 -0
- package/src/utils/cached-llm.mjs +164 -0
- package/src/utils/capability-stratifier.mjs +108 -0
- package/src/utils/counterfactual-tester.mjs +83 -0
- package/src/utils/error-recovery.mjs +117 -0
- package/src/utils/explainability-scorer.mjs +119 -0
- package/src/utils/exploratory-automation.mjs +131 -0
- package/src/utils/index.mjs +10 -0
- package/src/utils/intent-recognizer.mjs +201 -0
- package/src/utils/log-sanitizer.mjs +165 -0
- package/src/utils/path-validator.mjs +88 -0
- package/src/utils/performance-logger.mjs +316 -0
- package/src/utils/performance-measurement.mjs +280 -0
- package/src/utils/prompt-sanitizer.mjs +213 -0
- package/src/utils/rate-limiter.mjs +144 -0
- package/src/validation-framework.mjs +24 -20
- package/src/validation-result-normalizer.mjs +35 -1
- package/src/validation.mjs +75 -25
- package/src/validators/accessibility-validator.mjs +144 -0
- package/src/validators/hybrid-validator.mjs +48 -4
- package/api/health.js +0 -34
- package/api/validate.js +0 -252
- package/public/index.html +0 -149
- package/vercel.json +0 -27
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Provider Data & Pricing Configuration
|
|
3
|
+
*
|
|
4
|
+
* Central source of truth for provider configuration, models, and pricing.
|
|
5
|
+
* Extracted from config.mjs to facilitate updates and prevent drift.
|
|
6
|
+
*
|
|
7
|
+
* ⚠️ IMPORTANT: Model names should be verified against current provider documentation.
|
|
8
|
+
* Some models may be preview-only, deprecated, or have different names in production.
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
/**
|
|
12
|
+
* Model tiers for each provider
|
|
13
|
+
*
|
|
14
|
+
* Models can be overridden via environment variables:
|
|
15
|
+
* - VLM_MODEL_TIER: 'fast' | 'balanced' | 'best'
|
|
16
|
+
* - VLM_MODEL: explicit model name override
|
|
17
|
+
*
|
|
18
|
+
* GROQ INTEGRATION:
|
|
19
|
+
* - Groq added for high-frequency decisions (10-60Hz temporal decisions)
|
|
20
|
+
* - ~0.22s latency (vs 1-3s for other providers)
|
|
21
|
+
* - 185-276 tokens/sec throughput
|
|
22
|
+
* - OpenAI-compatible API
|
|
23
|
+
* - Cost-competitive, free tier available
|
|
24
|
+
* - Useful for: Fast tier decisions, high-Hz temporal decisions, real-time applications
|
|
25
|
+
*/
|
|
26
|
+
export const MODEL_TIERS = {
  // Shape: provider key -> { fast, balanced, best } -> model id string.
  // NOTE(review): the model ids and the release dates quoted in the trailing
  // comments are not verifiable from this file — confirm each against the
  // provider's current documentation before relying on them.
  gemini: {
    fast: 'gemini-2.5-flash', // Fast, cost-effective (stable)
    balanced: 'gemini-2.5-flash', // Good balance (using Flash as default balanced too)
    best: 'gemini-3-pro-preview' // High quality (preview)
  },
  openai: {
    fast: 'gpt-4o-mini', // Fast, cheaper
    balanced: 'gpt-4o', // Balanced (current production)
    best: 'gpt-5' // High quality (late 2025, latest production)
  },
  claude: {
    fast: 'claude-haiku-4-5', // Fast, cheaper (Haiku 4.5, Feb 2025)
    balanced: 'claude-sonnet-4-5', // Balanced (Sept 2025)
    best: 'claude-opus-4-6' // High quality (Opus 4.6, March 2026)
  },
  groq: {
    // NOTE: Groq vision support requires different model
    // For vision: meta-llama/llama-4-scout-17b-16e-instruct (preview, supports vision)
    // For text-only: llama-3.3-70b-versatile is fastest (~0.22s latency)
    // All three tiers intentionally resolve to the same vision-capable model,
    // so the tier choice has no effect for this provider.
    fast: 'meta-llama/llama-4-scout-17b-16e-instruct', // Vision-capable, fastest Groq option
    balanced: 'meta-llama/llama-4-scout-17b-16e-instruct', // Vision-capable, balanced
    best: 'meta-llama/llama-4-scout-17b-16e-instruct' // Vision-capable, high quality (preview)
    // WARNING: Groq vision models are preview-only. Text-only: use llama-3.3-70b-versatile
  },
  openrouter: {
    // OpenRouter provides access to multiple models via unified API
    // NOTE(review): these ids are the Anthropic ids with a vendor prefix;
    // verify they match the slugs OpenRouter actually publishes.
    fast: 'anthropic/claude-haiku-4-5', // Fast, cheaper via OpenRouter
    balanced: 'anthropic/claude-sonnet-4-5', // Balanced via OpenRouter
    best: 'anthropic/claude-opus-4-6' // High quality via OpenRouter
  }
};
|
|
58
|
+
|
|
59
|
+
/**
|
|
60
|
+
* Default provider configurations
|
|
61
|
+
*
|
|
62
|
+
* GROQ INTEGRATION:
|
|
63
|
+
* - OpenAI-compatible API (easy migration)
|
|
64
|
+
* - ~0.22s latency (10x faster than typical providers)
|
|
65
|
+
* - Useful for high-frequency decisions (10-60Hz temporal decisions)
|
|
66
|
+
* - Free tier available for testing
|
|
67
|
+
*/
|
|
68
|
+
export const PROVIDER_CONFIGS = {
  // Shape: provider key -> { name, apiUrl, model, freeTier, pricing, priority, ... }.
  // NOTE(review): `pricing` is presumably USD per 1M tokens for every provider
  // (only the groq entry states the unit explicitly) — confirm.
  // NOTE(review): `priority` appears to be "lower number wins" (groq's 0 is
  // described as highest); openai and openrouter both use 2, so any ordering
  // between them depends on how the consumer breaks ties — confirm.
  gemini: {
    name: 'gemini',
    apiUrl: 'https://generativelanguage.googleapis.com/v1beta',
    model: 'gemini-2.5-flash', // Latest stable (June 2025)
    freeTier: true,
    pricing: { input: 0.10, output: 0.40 }, // 2.5 Flash is cheaper
    priority: 1
  },
  openai: {
    name: 'openai',
    apiUrl: 'https://api.openai.com/v1',
    model: 'gpt-4o', // Current production
    freeTier: false,
    pricing: { input: 5.00, output: 15.00 },
    priority: 2
  },
  claude: {
    name: 'claude',
    apiUrl: 'https://api.anthropic.com/v1',
    model: 'claude-sonnet-4-5', // Latest flagship (Sept 2025)
    freeTier: false,
    pricing: { input: 3.00, output: 15.00 },
    priority: 3
  },
  groq: {
    name: 'groq',
    apiUrl: 'https://api.groq.com/openai/v1', // OpenAI-compatible endpoint
    model: 'meta-llama/llama-4-scout-17b-16e-instruct', // Vision-capable (preview), ~0.22s latency
    freeTier: true, // Free tier available
    pricing: { input: 0.59, output: 0.79 }, // Actual 2025 pricing: $0.59/$0.79 per 1M tokens (real-time API)
    priority: 0, // Highest priority for high-frequency decisions
    latency: 220, // ~0.22s latency in ms (10x faster than typical)
    throughput: 200, // ~200 tokens/sec average
    visionSupported: true // llama-4-scout-17b-16e-instruct supports vision (preview)
    // Text-only alternative: llama-3.3-70b-versatile (faster, no vision)
  },
  openrouter: {
    name: 'openrouter',
    apiUrl: 'https://openrouter.ai/api/v1', // OpenAI-compatible endpoint
    // NOTE(review): this default ('anthropic/claude-sonnet-4') differs from
    // MODEL_TIERS.openrouter.balanced ('anthropic/claude-sonnet-4-5'), whereas
    // every other provider's default equals its balanced tier — confirm which
    // one is intended.
    model: 'anthropic/claude-sonnet-4', // Default to Claude Sonnet via OpenRouter
    freeTier: false,
    pricing: { input: 3.00, output: 15.00 }, // Varies by model
    priority: 2,
    visionSupported: true
  }
};
|
|
115
|
+
|
|
@@ -253,6 +253,11 @@ export function calculateOptimalFPS(changeHistory, options = {}) {
|
|
|
253
253
|
targetChangeInterval = 100 // Target: capture every 100ms of changes
|
|
254
254
|
} = options;
|
|
255
255
|
|
|
256
|
+
// Handle null/undefined or non-array input
|
|
257
|
+
if (!changeHistory || !Array.isArray(changeHistory)) {
|
|
258
|
+
return minFPS;
|
|
259
|
+
}
|
|
260
|
+
|
|
256
261
|
if (changeHistory.length < 2) {
|
|
257
262
|
return minFPS;
|
|
258
263
|
}
|
|
@@ -307,12 +307,14 @@ export async function validateWithExplicitRubric(imagePath, prompt, options = {}
|
|
|
307
307
|
// Import rubric builder
|
|
308
308
|
const { buildRubricPrompt, DEFAULT_RUBRIC } = await import('./rubrics.mjs');
|
|
309
309
|
|
|
310
|
-
// Build prompt
|
|
310
|
+
// Build prompt: user prompt + rubric evaluation framework.
|
|
311
|
+
// buildRubricPrompt(rubric, includeDimensions) returns the rubric text;
|
|
312
|
+
// we prepend the user's prompt so both are sent to the VLM.
|
|
313
|
+
const rubricToUse = rubric || (useDefaultRubric ? DEFAULT_RUBRIC : null);
|
|
311
314
|
let enhancedPrompt = prompt;
|
|
312
|
-
if (
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
enhancedPrompt = buildRubricPrompt(prompt, rubric);
|
|
315
|
+
if (rubricToUse) {
|
|
316
|
+
const rubricText = buildRubricPrompt(rubricToUse, true);
|
|
317
|
+
enhancedPrompt = `${prompt}\n\n${rubricText}`;
|
|
316
318
|
}
|
|
317
319
|
|
|
318
320
|
// Perform validation
|
package/src/retry.mjs
CHANGED
|
@@ -7,6 +7,7 @@
|
|
|
7
7
|
|
|
8
8
|
import { ProviderError, TimeoutError } from './errors.mjs';
|
|
9
9
|
import { log, warn } from './logger.mjs';
|
|
10
|
+
import { RETRY_CONSTANTS } from './constants.mjs';
|
|
10
11
|
|
|
11
12
|
/**
|
|
12
13
|
* Check if an error is retryable
|
|
@@ -50,12 +51,12 @@ export function isRetryableError(error) {
|
|
|
50
51
|
* @param {boolean} jitter - Add random jitter to prevent thundering herd
|
|
51
52
|
* @returns {number} Delay in milliseconds
|
|
52
53
|
*/
|
|
53
|
-
export function calculateBackoff(attempt, baseDelay =
|
|
54
|
+
export function calculateBackoff(attempt, baseDelay = RETRY_CONSTANTS.DEFAULT_BASE_DELAY_MS, maxDelay = RETRY_CONSTANTS.DEFAULT_MAX_DELAY_MS, jitter = true) {
|
|
54
55
|
const exponentialDelay = Math.min(baseDelay * Math.pow(2, attempt), maxDelay);
|
|
55
56
|
|
|
56
57
|
if (jitter) {
|
|
57
|
-
// Add
|
|
58
|
-
const jitterAmount = exponentialDelay *
|
|
58
|
+
// Add random jitter to prevent thundering herd
|
|
59
|
+
const jitterAmount = exponentialDelay * RETRY_CONSTANTS.JITTER_PERCENTAGE;
|
|
59
60
|
const jitterValue = (Math.random() * 2 - 1) * jitterAmount;
|
|
60
61
|
return Math.max(0, exponentialDelay + jitterValue);
|
|
61
62
|
}
|
|
@@ -80,9 +81,9 @@ export function calculateBackoff(attempt, baseDelay = 1000, maxDelay = 30000, ji
|
|
|
80
81
|
*/
|
|
81
82
|
export async function retryWithBackoff(fn, options = {}) {
|
|
82
83
|
const {
|
|
83
|
-
maxRetries =
|
|
84
|
-
baseDelay =
|
|
85
|
-
maxDelay =
|
|
84
|
+
maxRetries = RETRY_CONSTANTS.DEFAULT_MAX_RETRIES,
|
|
85
|
+
baseDelay = RETRY_CONSTANTS.DEFAULT_BASE_DELAY_MS,
|
|
86
|
+
maxDelay = RETRY_CONSTANTS.DEFAULT_MAX_DELAY_MS,
|
|
86
87
|
onRetry = null,
|
|
87
88
|
retryable = isRetryableError
|
|
88
89
|
} = options;
|
|
@@ -117,7 +118,20 @@ export async function retryWithBackoff(fn, options = {}) {
|
|
|
117
118
|
}
|
|
118
119
|
}
|
|
119
120
|
|
|
120
|
-
// All retries exhausted
|
|
121
|
+
// All retries exhausted - enhance error message with retry context
|
|
122
|
+
const enhancedMessage = enhanceErrorMessage(
|
|
123
|
+
lastError,
|
|
124
|
+
maxRetries + 1,
|
|
125
|
+
'retryWithBackoff'
|
|
126
|
+
);
|
|
127
|
+
|
|
128
|
+
// Preserve original error but enhance message
|
|
129
|
+
if (lastError instanceof Error) {
|
|
130
|
+
lastError.message = enhancedMessage;
|
|
131
|
+
} else {
|
|
132
|
+
lastError = new Error(enhancedMessage);
|
|
133
|
+
}
|
|
134
|
+
|
|
121
135
|
throw lastError;
|
|
122
136
|
}
|
|
123
137
|
|
package/src/rubrics.mjs
CHANGED
|
@@ -162,6 +162,10 @@ Provide your evaluation as JSON:
|
|
|
162
162
|
"visual": "<visual evidence from screenshot>",
|
|
163
163
|
"functional": "<functional evidence>",
|
|
164
164
|
"accessibility": "<accessibility evidence>"
|
|
165
|
+
},
|
|
166
|
+
"dimensionScores": {
|
|
167
|
+
"<dimension_name>": <0-10 integer>,
|
|
168
|
+
...for each dimension in the rubric
|
|
165
169
|
}
|
|
166
170
|
}`;
|
|
167
171
|
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Performance Logger Service
|
|
3
|
+
*
|
|
4
|
+
* Handles optional performance logging without fire-and-forget race conditions.
|
|
5
|
+
* Provides a safe, non-blocking interface for metrics.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
let loggerImplementation = null;
|
|
9
|
+
let isEnabled = false;
|
|
10
|
+
|
|
11
|
+
/**
 * Lazily resolve the performance logger implementation.
 *
 * On first use this dynamically imports './utils/performance-logger.mjs' and
 * caches its two logging functions in `loggerImplementation`. If the import
 * fails, no-op stand-ins are cached instead and `isEnabled` stays false, so
 * callers never have to handle a load failure themselves.
 *
 * @returns {Promise<{logCacheOperation: Function, logTemporalDecision: Function}>}
 *   The cached implementation (real or no-op).
 */
async function initLogger() {
  // Already resolved (successfully or not) - reuse the cached object.
  if (loggerImplementation) {
    return loggerImplementation;
  }

  try {
    // Deferred import: the module is only loaded once something tries to log.
    const perfModule = await import('./utils/performance-logger.mjs');
    loggerImplementation = {
      logCacheOperation: perfModule.logCacheOperation,
      logTemporalDecision: perfModule.logTemporalDecision
    };
    isEnabled = true;
  } catch (error) {
    // Degrade to silent no-ops rather than crashing or hanging the caller.
    isEnabled = false;
    const noop = () => {};
    loggerImplementation = {
      logCacheOperation: noop,
      logTemporalDecision: noop
    };
  }

  return loggerImplementation;
}
|
|
33
|
+
|
|
34
|
+
/**
 * Record a cache operation without ever throwing into the caller.
 *
 * The first call (before any initialization attempt) kicks off the lazy
 * logger load and forwards `data` once it resolves; it does not wait for it.
 * Later calls log synchronously when the logger loaded successfully and do
 * nothing when it did not.
 *
 * @param {*} data - Payload handed to the underlying `logCacheOperation`
 * @returns {void}
 */
export function safeLogCacheOperation(data) {
  const neverInitialized = !isEnabled && !loggerImplementation;
  if (neverInitialized) {
    // Start initialization in the background; failures are deliberately ignored.
    initLogger()
      .then((impl) => impl?.logCacheOperation(data))
      .catch(() => {});
    return;
  }

  const usable = isEnabled && loggerImplementation;
  if (!usable) {
    // Initialization was attempted and failed: logging is disabled.
    return;
  }

  try {
    loggerImplementation.logCacheOperation(data);
  } catch {
    // Logging must never disrupt the main flow.
  }
}
|
|
53
|
+
|
|
54
|
+
/**
 * Record a temporal decision without ever throwing into the caller.
 *
 * Mirrors safeLogCacheOperation: the first call triggers the lazy logger load
 * in the background and forwards `data` once it resolves; later calls log
 * synchronously if the logger is enabled and are silently dropped otherwise.
 *
 * @param {*} data - Payload handed to the underlying `logTemporalDecision`
 * @returns {void}
 */
export function safeLogTemporalDecision(data) {
  const neverInitialized = !isEnabled && !loggerImplementation;
  if (neverInitialized) {
    // Start initialization in the background; failures are deliberately ignored.
    initLogger()
      .then((impl) => impl?.logTemporalDecision(data))
      .catch(() => {});
    return;
  }

  const usable = isEnabled && loggerImplementation;
  if (!usable) {
    // Initialization was attempted and failed: logging is disabled.
    return;
  }

  try {
    loggerImplementation.logTemporalDecision(data);
  } catch {
    // Logging must never disrupt the main flow.
  }
}
|
|
71
|
+
|
|
@@ -0,0 +1,320 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Session-Level Cost Tracker
|
|
3
|
+
*
|
|
4
|
+
* Tracks costs per test run/session with detailed breakdown and transparency.
|
|
5
|
+
* Provides "trap debug" hooks to show total ML API resources for usage tracking.
|
|
6
|
+
*
|
|
7
|
+
* Usage:
|
|
8
|
+
* ```javascript
|
|
9
|
+
* import { startSession, endSession, getSessionCosts } from './session-cost-tracker.mjs';
|
|
10
|
+
*
|
|
11
|
+
* const sessionId = startSession('comprehensive-evaluation');
|
|
12
|
+
* // ... run tests ...
|
|
13
|
+
* const summary = endSession(sessionId);
|
|
14
|
+
* console.log(`Total cost: $${summary.totalCost.toFixed(4)}`);
|
|
15
|
+
* ```
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
import { getCostTracker, recordCost } from './cost-tracker.mjs';
|
|
19
|
+
import { getCacheStats } from './cache.mjs';
|
|
20
|
+
import { log, warn } from './logger.mjs';
|
|
21
|
+
import { writeFileSync, mkdirSync, existsSync } from 'fs';
|
|
22
|
+
import { join } from 'path';
|
|
23
|
+
|
|
24
|
+
/**
|
|
25
|
+
* Active sessions
|
|
26
|
+
*/
|
|
27
|
+
const activeSessions = new Map();
|
|
28
|
+
|
|
29
|
+
/**
 * Return the own bucket stored under `key`, creating it first if needed.
 *
 * Uses an own-property check and defineProperty so that keys which collide
 * with Object.prototype members ('constructor', 'toString', '__proto__', ...)
 * get a real bucket instead of reading or mutating the inherited member.
 *
 * @param {object} buckets - Plain object used as a dictionary
 * @param {string} key - Bucket name
 * @param {Function} createBucket - Factory for a fresh, zeroed bucket
 * @returns {object} The bucket stored under `key`
 */
function getOrCreateOwnBucket(buckets, key, createBucket) {
  if (!Object.prototype.hasOwnProperty.call(buckets, key)) {
    Object.defineProperty(buckets, key, {
      value: createBucket(),
      writable: true,
      enumerable: true,
      configurable: true
    });
  }
  return buckets[key];
}

/**
 * Session cost data structure
 *
 * Accumulates cost, call, token and cache counters for one tracking session,
 * broken down by provider and by test name, and keeps every raw entry.
 */
class SessionCostData {
  /**
   * @param {string} sessionId - Identifier of the session
   * @param {string} name - Human-readable session name
   */
  constructor(sessionId, name) {
    this.sessionId = sessionId;
    this.name = name;
    this.startTime = Date.now();
    this.endTime = null;
    this.costs = {
      total: 0,
      byProvider: {},
      byTest: {},
      apiCalls: 0,
      cacheHits: 0,
      cacheMisses: 0,
      tokens: {
        input: 0,
        output: 0,
        total: 0
      }
    };
    this.entries = [];
  }

  /**
   * Add one API call to the session totals.
   *
   * Missing numeric fields count as 0. A missing provider or test name is
   * filed under 'unknown'.
   *
   * @param {object} entry - Cost entry
   * @param {number} [entry.cost] - Cost of the call
   * @param {string} [entry.provider] - Provider name
   * @param {string} [entry.testName] - Test the call belongs to
   * @param {number} [entry.inputTokens] - Input token count
   * @param {number} [entry.outputTokens] - Output token count
   * @param {number} [entry.timestamp] - Epoch ms; defaults to now
   */
  recordCostEntry(entry) {
    const cost = entry.cost || 0;
    const inputTokens = entry.inputTokens || 0;
    const outputTokens = entry.outputTokens || 0;

    this.costs.total += cost;
    this.costs.apiCalls += 1;

    // Track by provider
    const providerStats = getOrCreateOwnBucket(
      this.costs.byProvider,
      entry.provider || 'unknown',
      () => ({ total: 0, calls: 0, tokens: { input: 0, output: 0 } })
    );
    providerStats.total += cost;
    providerStats.calls += 1;
    providerStats.tokens.input += inputTokens;
    providerStats.tokens.output += outputTokens;

    // Track by test
    const testStats = getOrCreateOwnBucket(
      this.costs.byTest,
      entry.testName || 'unknown',
      () => ({ total: 0, calls: 0 })
    );
    testStats.total += cost;
    testStats.calls += 1;

    // Track tokens
    this.costs.tokens.input += inputTokens;
    this.costs.tokens.output += outputTokens;
    this.costs.tokens.total = this.costs.tokens.input + this.costs.tokens.output;

    // Store entry (raw fields preserved, timestamp filled in if absent)
    this.entries.push({
      ...entry,
      timestamp: entry.timestamp || Date.now()
    });
  }

  /** Count one cache hit for this session. */
  recordCacheHit() {
    this.costs.cacheHits += 1;
  }

  /** Count one cache miss for this session. */
  recordCacheMiss() {
    this.costs.cacheMisses += 1;
  }

  /**
   * Build a summary of the session so far.
   *
   * Derived figures are strings produced by toFixed when there is data and
   * the number 0 when there is none. The nested objects under `costs`
   * (byProvider, byTest, tokens) are the live session objects, not copies.
   *
   * @returns {object} Summary with ids, duration, costs and ISO timestamps
   */
  getSummary() {
    // An unfinished session is measured up to "now".
    const duration = (this.endTime || Date.now()) - this.startTime;
    const cacheLookups = this.costs.cacheHits + this.costs.cacheMisses;
    const cacheHitRate = cacheLookups > 0
      ? (this.costs.cacheHits / cacheLookups * 100).toFixed(1)
      : 0;

    return {
      sessionId: this.sessionId,
      name: this.name,
      duration: duration,
      durationSeconds: (duration / 1000).toFixed(2),
      costs: {
        ...this.costs,
        cacheHitRate: `${cacheHitRate}%`,
        averageCostPerCall: this.costs.apiCalls > 0
          ? (this.costs.total / this.costs.apiCalls).toFixed(6)
          : 0,
        // duration is in ms, so scale to a per-second figure
        costPerSecond: duration > 0
          ? ((this.costs.total / duration) * 1000).toFixed(6)
          : 0
      },
      startTime: new Date(this.startTime).toISOString(),
      endTime: this.endTime ? new Date(this.endTime).toISOString() : null
    };
  }
}
|
|
121
|
+
|
|
122
|
+
/**
 * Start a new cost tracking session
 *
 * Creates a SessionCostData, registers it in the active-session map under a
 * freshly generated id, and optionally logs the start.
 *
 * @param {string} name - Session name (e.g., 'comprehensive-evaluation')
 * @param {object} [options] - Session options
 * @param {boolean} [options.verbose] - Pass false to suppress the start log line
 * @returns {string} Session ID of the form `session-<epoch ms>-<random base36>`
 */
export function startSession(name, options = {}) {
  // slice(2, 11) drops the leading "0." and keeps up to 9 base-36 characters;
  // it is the non-deprecated equivalent of substr(2, 9).
  const randomSuffix = Math.random().toString(36).slice(2, 11);
  const sessionId = `session-${Date.now()}-${randomSuffix}`;

  activeSessions.set(sessionId, new SessionCostData(sessionId, name));

  if (options.verbose !== false) {
    log(`[CostTracker] Started session: ${name} (${sessionId})`);
  }

  return sessionId;
}
|
|
140
|
+
|
|
141
|
+
/**
 * End a cost tracking session
 *
 * Stamps the end time, builds the summary, attaches cache statistics when
 * available, optionally writes a JSON report under
 * `<cwd>/evaluation/results/cost-reports`, optionally prints the summary, and
 * removes the session from the active-session map.
 *
 * Saving the report is best-effort: a filesystem failure is reported through
 * `warn` and the summary is still returned. The session is always removed
 * from the active map, even if a later step throws.
 *
 * @param {string} sessionId - Session ID
 * @param {object} [options] - Options
 * @param {boolean} [options.saveReport] - Pass false to skip writing the report file
 * @param {boolean} [options.verbose] - Pass false to suppress log output and the printed summary
 * @returns {object|null} Session summary, or null if the session id is unknown
 */
export function endSession(sessionId, options = {}) {
  const session = activeSessions.get(sessionId);
  if (!session) {
    warn(`[CostTracker] Session not found: ${sessionId}`);
    return null;
  }

  const verbose = options.verbose !== false;

  try {
    session.endTime = Date.now();
    const summary = session.getSummary();

    // Get cache stats
    try {
      summary.cacheStats = getCacheStats();
    } catch (error) {
      // Silently fail if cache stats unavailable
    }

    // Save session report
    if (options.saveReport !== false) {
      let reportFile = null;
      try {
        const reportsDir = join(process.cwd(), 'evaluation', 'results', 'cost-reports');
        if (!existsSync(reportsDir)) {
          mkdirSync(reportsDir, { recursive: true });
        }

        reportFile = join(reportsDir, `cost-report-${sessionId}-${Date.now()}.json`);
        writeFileSync(reportFile, JSON.stringify({
          summary,
          entries: session.entries,
          timestamp: new Date().toISOString()
        }, null, 2));
      } catch (error) {
        // A read-only or missing working directory must not lose the summary.
        const reason = error instanceof Error ? error.message : String(error);
        warn(`[CostTracker] Failed to save session report: ${reason}`);
        reportFile = null;
      }

      if (reportFile && verbose) {
        log(`[CostTracker] Session report saved: ${reportFile}`);
      }
    }

    // Print summary if verbose
    if (verbose) {
      printSessionSummary(summary);
    }

    return summary;
  } finally {
    // Always release the session so a failure above cannot leak it.
    activeSessions.delete(sessionId);
  }
}
|
|
193
|
+
|
|
194
|
+
/**
 * Record cost for current session
 *
 * The entry is added to the session when `sessionId` refers to an active
 * session, and is always forwarded to the global cost tracker as well.
 *
 * @param {string} sessionId - Session ID
 * @param {object} costData - Cost data
 */
export function recordSessionCost(sessionId, costData) {
  const activeSession = activeSessions.get(sessionId);
  if (activeSession) {
    activeSession.recordCostEntry(costData);
  }

  // Global tracking happens whether or not a session matched.
  recordCost(costData);
}
|
|
211
|
+
|
|
212
|
+
/**
 * Record cache hit for current session
 *
 * Does nothing when `sessionId` does not refer to an active session.
 *
 * @param {string} sessionId - Session ID
 */
export function recordSessionCacheHit(sessionId) {
  const activeSession = activeSessions.get(sessionId);
  if (!activeSession) {
    return;
  }
  activeSession.recordCacheHit();
}
|
|
223
|
+
|
|
224
|
+
/**
 * Record cache miss for current session
 *
 * Does nothing when `sessionId` does not refer to an active session.
 *
 * @param {string} sessionId - Session ID
 */
export function recordSessionCacheMiss(sessionId) {
  const activeSession = activeSessions.get(sessionId);
  if (!activeSession) {
    return;
  }
  activeSession.recordCacheMiss();
}
|
|
235
|
+
|
|
236
|
+
/**
 * Get current session costs
 *
 * @param {string} sessionId - Session ID
 * @returns {object|null} Summary of the session so far, or null when the id
 *   does not refer to an active session
 */
export function getSessionCosts(sessionId) {
  const activeSession = activeSessions.get(sessionId);
  return activeSession ? activeSession.getSummary() : null;
}
|
|
249
|
+
|
|
250
|
+
/**
 * Print session summary
 *
 * Writes a human-readable cost report to the console: header, API usage,
 * cache performance (with an estimated-savings line when positive), token
 * usage, a per-provider breakdown, and the ten most expensive tests.
 *
 * @param {object} summary - Summary object as produced by SessionCostData#getSummary
 */
function printSessionSummary(summary) {
  const emit = (text) => console.log(text);
  const rule = '='.repeat(70);

  emit('\n' + rule);
  emit(`💰 Cost Report: ${summary.name}`);
  emit(rule);
  emit(`Session ID: ${summary.sessionId}`);
  emit(`Duration: ${summary.durationSeconds}s`);

  emit(`\n📊 API Usage:`);
  emit(` Total Cost: $${summary.costs.total.toFixed(4)}`);
  emit(` API Calls: ${summary.costs.apiCalls}`);
  emit(` Average per Call: $${summary.costs.averageCostPerCall}`);
  emit(` Cost per Second: $${summary.costs.costPerSecond}/s`);

  emit(`\n💾 Cache Performance:`);
  emit(` Cache Hits: ${summary.costs.cacheHits}`);
  emit(` Cache Misses: ${summary.costs.cacheMisses}`);
  emit(` Hit Rate: ${summary.costs.cacheHitRate}`);
  // Each cache hit is assumed to have saved one average-priced call.
  const estimatedSavings = summary.costs.cacheHits * parseFloat(summary.costs.averageCostPerCall);
  if (estimatedSavings > 0) {
    emit(` Estimated Savings: $${estimatedSavings.toFixed(4)} (from cache hits)`);
  }

  emit(`\n🔢 Token Usage:`);
  emit(` Input Tokens: ${summary.costs.tokens.input.toLocaleString()}`);
  emit(` Output Tokens: ${summary.costs.tokens.output.toLocaleString()}`);
  emit(` Total Tokens: ${summary.costs.tokens.total.toLocaleString()}`);

  if (Object.keys(summary.costs.byProvider).length > 0) {
    emit(`\n📦 By Provider:`);
    Object.entries(summary.costs.byProvider).forEach(([provider, stats]) => {
      emit(` ${provider}:`);
      emit(` Cost: $${stats.total.toFixed(4)}`);
      emit(` Calls: ${stats.calls}`);
      emit(` Tokens: ${stats.tokens.input.toLocaleString()} in, ${stats.tokens.output.toLocaleString()} out`);
    });
  }

  if (Object.keys(summary.costs.byTest).length > 0) {
    emit(`\n🧪 By Test (Top 10):`);
    // Most expensive tests first, capped at ten rows.
    const costliestTests = Object.entries(summary.costs.byTest)
      .sort(([, left], [, right]) => right.total - left.total)
      .slice(0, 10);
    costliestTests.forEach(([testName, stats]) => {
      emit(` ${testName}: $${stats.total.toFixed(4)} (${stats.calls} calls)`);
    });
  }

  emit(rule + '\n');
}
|
|
301
|
+
|
|
302
|
+
/**
 * Get all active sessions
 *
 * @returns {Array} IDs of the sessions currently held in the active-session map
 */
export function getActiveSessions() {
  return [...activeSessions.keys()];
}
|
|
310
|
+
|
|
311
|
+
/**
 * Get global cost stats (across all sessions)
 *
 * Delegates to the shared cost tracker's `getStats()`.
 *
 * @returns {object} Global cost statistics
 */
export function getGlobalCostStats() {
  return getCostTracker().getStats();
}
|
|
320
|
+
|
package/src/smart-validator.mjs
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Smart Validator Selector
|
|
3
3
|
*
|
|
4
|
-
*
|
|
4
|
+
* Selects validator type based on available context.
|
|
5
5
|
* Guides users to the right tool for the job.
|
|
6
6
|
*
|
|
7
7
|
* Design Philosophy:
|
|
@@ -36,10 +36,10 @@ import { log, warn } from './logger.mjs';
|
|
|
36
36
|
/**
|
|
37
37
|
* Smart accessibility validation
|
|
38
38
|
*
|
|
39
|
-
*
|
|
39
|
+
* Chooses validator based on available context:
|
|
40
40
|
* - Has page access → uses programmatic (fast, deterministic)
|
|
41
41
|
* - Only has screenshot → uses VLLM (semantic evaluation)
|
|
42
|
-
* - Has both and needs semantic context → uses hybrid (
|
|
42
|
+
* - Has both and needs semantic context → uses hybrid (combines both)
|
|
43
43
|
*
|
|
44
44
|
* @param {Object} options - Validation options
|
|
45
45
|
* @param {any} [options.page] - Playwright page object (if available)
|
|
@@ -75,7 +75,7 @@ export async function validateAccessibilitySmart(options = {}) {
|
|
|
75
75
|
|
|
76
76
|
// Decision tree:
|
|
77
77
|
// 1. Has page access → use programmatic (fast, deterministic)
|
|
78
|
-
// 2. Has both + need semantic → use hybrid (
|
|
78
|
+
// 2. Has both + need semantic → use hybrid (combines both)
|
|
79
79
|
// 3. Only screenshot → use VLLM (semantic evaluation)
|
|
80
80
|
|
|
81
81
|
if (page && !shouldUseHybrid) {
|
|
@@ -132,9 +132,9 @@ export async function validateAccessibilitySmart(options = {}) {
|
|
|
132
132
|
/**
|
|
133
133
|
* Smart state validation
|
|
134
134
|
*
|
|
135
|
-
*
|
|
135
|
+
* Chooses validator based on available context:
|
|
136
136
|
* - Has page access + direct state → uses programmatic (fast, deterministic)
|
|
137
|
-
* - Has page access + screenshot + need semantic → uses hybrid (
|
|
137
|
+
* - Has page access + screenshot + need semantic → uses hybrid (combines both)
|
|
138
138
|
* - Only screenshot → uses VLLM (extracts state from screenshot)
|
|
139
139
|
*
|
|
140
140
|
* @param {Object} options - Validation options
|
|
@@ -236,7 +236,7 @@ export async function validateStateSmart(options = {}) {
|
|
|
236
236
|
/**
|
|
237
237
|
* Smart element validation
|
|
238
238
|
*
|
|
239
|
-
* Validates element visibility, position, contrast, etc. using
|
|
239
|
+
* Validates element visibility, position, contrast, etc. using available methods.
|
|
240
240
|
*
|
|
241
241
|
* @param {Object} options - Validation options
|
|
242
242
|
* @param {any} options.page - Playwright page object
|
|
@@ -317,7 +317,7 @@ export async function validateElementSmart(options = {}) {
|
|
|
317
317
|
/**
|
|
318
318
|
* Smart validation with automatic tool selection
|
|
319
319
|
*
|
|
320
|
-
*
|
|
320
|
+
* Main entry point that selects validator
|
|
321
321
|
* based on what you're trying to validate and what context you have.
|
|
322
322
|
*
|
|
323
323
|
* @param {Object} options - Validation options
|