@arclabs561/ai-visual-test 0.5.1 → 0.7.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. package/CHANGELOG.md +127 -11
  2. package/DEPLOYMENT.md +225 -9
  3. package/README.md +71 -80
  4. package/index.d.ts +902 -5
  5. package/package.json +10 -51
  6. package/src/batch-optimizer.mjs +39 -0
  7. package/src/cache.mjs +241 -16
  8. package/src/config.mjs +33 -91
  9. package/src/constants.mjs +54 -0
  10. package/src/convenience.mjs +113 -10
  11. package/src/cost-optimization.mjs +1 -0
  12. package/src/cost-tracker.mjs +134 -2
  13. package/src/data-extractor.mjs +36 -7
  14. package/src/dynamic-few-shot.mjs +69 -11
  15. package/src/errors.mjs +6 -2
  16. package/src/experience-propagation.mjs +12 -0
  17. package/src/experience-tracer.mjs +12 -3
  18. package/src/game-player.mjs +222 -43
  19. package/src/graceful-shutdown.mjs +126 -0
  20. package/src/helpers/playwright.mjs +22 -8
  21. package/src/human-validation-manager.mjs +99 -2
  22. package/src/index.mjs +48 -3
  23. package/src/integrations/playwright.mjs +140 -0
  24. package/src/judge.mjs +699 -24
  25. package/src/load-env.mjs +2 -1
  26. package/src/logger.mjs +31 -3
  27. package/src/model-tier-selector.mjs +1 -221
  28. package/src/natural-language-specs.mjs +31 -3
  29. package/src/persona-enhanced.mjs +4 -2
  30. package/src/persona-experience.mjs +1 -1
  31. package/src/pricing.mjs +28 -0
  32. package/src/prompt-composer.mjs +162 -5
  33. package/src/provider-data.mjs +115 -0
  34. package/src/render-change-detector.mjs +5 -0
  35. package/src/research-enhanced-validation.mjs +7 -5
  36. package/src/retry.mjs +21 -7
  37. package/src/rubrics.mjs +4 -0
  38. package/src/safe-logger.mjs +71 -0
  39. package/src/session-cost-tracker.mjs +320 -0
  40. package/src/smart-validator.mjs +8 -8
  41. package/src/spec-templates.mjs +52 -6
  42. package/src/startup-validation.mjs +127 -0
  43. package/src/temporal-adaptive.mjs +2 -2
  44. package/src/temporal-decision-manager.mjs +1 -271
  45. package/src/temporal-logic.mjs +104 -0
  46. package/src/temporal-note-pruner.mjs +119 -0
  47. package/src/temporal-preprocessor.mjs +1 -543
  48. package/src/temporal.mjs +681 -79
  49. package/src/utils/action-hallucination-detector.mjs +301 -0
  50. package/src/utils/baseline-validator.mjs +82 -0
  51. package/src/utils/cache-stats.mjs +104 -0
  52. package/src/utils/cached-llm.mjs +164 -0
  53. package/src/utils/capability-stratifier.mjs +108 -0
  54. package/src/utils/counterfactual-tester.mjs +83 -0
  55. package/src/utils/error-recovery.mjs +117 -0
  56. package/src/utils/explainability-scorer.mjs +119 -0
  57. package/src/utils/exploratory-automation.mjs +131 -0
  58. package/src/utils/index.mjs +10 -0
  59. package/src/utils/intent-recognizer.mjs +201 -0
  60. package/src/utils/log-sanitizer.mjs +165 -0
  61. package/src/utils/path-validator.mjs +88 -0
  62. package/src/utils/performance-logger.mjs +316 -0
  63. package/src/utils/performance-measurement.mjs +280 -0
  64. package/src/utils/prompt-sanitizer.mjs +213 -0
  65. package/src/utils/rate-limiter.mjs +144 -0
  66. package/src/validation-framework.mjs +24 -20
  67. package/src/validation-result-normalizer.mjs +35 -1
  68. package/src/validation.mjs +75 -25
  69. package/src/validators/accessibility-validator.mjs +144 -0
  70. package/src/validators/hybrid-validator.mjs +48 -4
  71. package/api/health.js +0 -34
  72. package/api/validate.js +0 -252
  73. package/public/index.html +0 -149
  74. package/vercel.json +0 -27
@@ -0,0 +1,301 @@
1
+ /**
2
+ * Action Hallucination Detection
3
+ *
4
+ * Verifies that actions can actually be executed before claiming success.
5
+ * Simple element existence/visibility checks prevent claiming clicks on non-existent buttons.
6
+ *
7
+ * Research Context:
8
+ * - Hallucination rate <15% is often cited as critical for browser automation agents
9
+ * - Agents often claim actions completed when elements don't exist
10
+ * - Need to verify action execution actually succeeded
11
+ *
12
+ * Implementation:
13
+ * - Pre-action verification (check before clicking) is simpler and more effective
14
+ * - Element existence/visibility/enabled checks are sufficient
15
+ * - Complex algorithms add latency without clear benefit
16
+ *
17
+ * See docs/research/IMPLEMENTATION_VS_RESEARCH.md for detailed research context.
18
+ *
19
+ * @module action-hallucination-detector
20
+ */
21
+
/**
 * Check whether a claimed action is plausible on the given page.
 *
 * Routes the action to the verifier matching `action.type`
 * ('click', 'keyboard', 'type' or 'navigate'). Any other type is reported
 * as not verifiable. This function never rejects: a failure during
 * verification is turned into a low-confidence "no hallucination" result
 * that carries the error message.
 *
 * @param {Object} action - Action that was claimed to be executed
 * @param {import('playwright').Page} page - Playwright page to verify against
 * @param {Object} [options] - Detection options, forwarded to the verifier
 * @returns {Promise<Object>} Detection result with `hasHallucination`,
 *   `reason` and `confidence` (plus verifier-specific fields)
 */
export async function detectActionHallucination(action, page, options = {}) {
  // Nothing can be checked without a page, so stay neutral.
  if (!page) {
    return {
      hasHallucination: false,
      reason: 'No page available for verification',
      confidence: 0.5
    };
  }

  try {
    const kind = action.type;

    if (kind === 'click') {
      return await detectClickHallucination(action, page, options);
    }
    if (kind === 'keyboard') {
      return await detectKeyboardHallucination(action, page, options);
    }
    if (kind === 'type') {
      return await detectTypeHallucination(action, page, options);
    }
    if (kind === 'navigate') {
      return await detectNavigateHallucination(action, page, options);
    }

    // Unknown action types cannot be confirmed or refuted.
    return {
      hasHallucination: false,
      reason: 'Action type not verifiable',
      confidence: 0.5
    };
  } catch (failure) {
    // A broken verification is not evidence of a hallucination.
    const message = failure.message;
    return {
      hasHallucination: false,
      reason: `Verification error: ${message}`,
      confidence: 0.3,
      error: message
    };
  }
}
65
+
/**
 * Detect hallucination in click actions.
 *
 * A click is considered plausible only when the selector matches at least
 * one element and that element is visible and enabled. A failing visibility
 * probe counts as "not visible"; a failing enabled probe counts as "enabled".
 *
 * @param {Object} action - Claimed click action; `action.selector` is required
 * @param {import('playwright').Page} page - Playwright page to inspect
 * @param {Object} options - Detection options (not used by this check)
 * @returns {Promise<Object>} Detection result
 */
async function detectClickHallucination(action, page, options) {
  const selector = action.selector;

  // Every "hallucination" outcome shares the same three leading fields.
  const flagged = (reason, confidence, details = {}) => ({
    hasHallucination: true,
    reason,
    confidence,
    ...details
  });

  if (!selector) {
    return flagged('Click action missing selector', 0.9);
  }

  try {
    const locate = () => page.locator(selector);

    // Existence
    const matchCount = await locate().count();
    if (!(matchCount > 0)) {
      return flagged(`Element with selector "${selector}" does not exist`, 0.95, {
        elementExists: false
      });
    }

    // Visibility (probe errors are treated as "not visible")
    const visible = await locate().isVisible().catch(() => false);
    if (!visible) {
      return flagged(`Element with selector "${selector}" exists but is not visible`, 0.85, {
        elementExists: true,
        elementVisible: false
      });
    }

    // Enabled state (probe errors are treated as "enabled")
    const enabled = await locate().isEnabled().catch(() => true);
    if (!enabled) {
      return flagged(`Element with selector "${selector}" is disabled`, 0.8, {
        elementExists: true,
        elementVisible: true,
        elementEnabled: false
      });
    }

    return {
      hasHallucination: false,
      reason: 'Element exists, visible, and enabled',
      confidence: 0.9,
      elementExists: true,
      elementVisible: true,
      elementEnabled: true
    };
  } catch (failure) {
    return flagged(`Error verifying click action: ${failure.message}`, 0.7, {
      error: failure.message
    });
  }
}
135
+
/**
 * Detect hallucination in keyboard actions.
 *
 * Keyboard input has no target element to inspect, so the only available
 * signal is whether the document has finished loading and is not hidden.
 * A failure to evaluate that is reported as "could not verify", not as a
 * hallucination.
 *
 * @param {Object} action - Claimed keyboard action (not inspected)
 * @param {import('playwright').Page} page - Playwright page to inspect
 * @param {Object} options - Detection options (not used by this check)
 * @returns {Promise<Object>} Detection result
 */
async function detectKeyboardHallucination(action, page, options) {
  let pageInteractive;

  try {
    pageInteractive = await page.evaluate(
      () => document.readyState === 'complete' && !document.hidden
    );
  } catch (failure) {
    return {
      hasHallucination: false,
      reason: `Could not verify keyboard action: ${failure.message}`,
      confidence: 0.5,
      error: failure.message
    };
  }

  if (pageInteractive) {
    return {
      hasHallucination: false,
      reason: 'Page is interactive, keyboard action likely succeeded',
      confidence: 0.6, // kept low: the keystrokes themselves are never observed
      pageInteractive: true
    };
  }

  return {
    hasHallucination: true,
    reason: 'Page is not interactive (not ready or hidden)',
    confidence: 0.8,
    pageInteractive: false
  };
}
172
+
/**
 * Detect hallucination in type actions.
 *
 * Confirms that the target input exists and, when the action states a value,
 * that the input currently holds that value.
 *
 * The value check runs for every value that is not null/undefined, including
 * the empty string (a claimed "clear field") and 0. The claimed value is
 * compared as a string, because `inputValue()` always yields a string.
 *
 * @param {Object} action - Claimed type action; `action.selector` is required,
 *   `action.value` is the text that was supposedly entered
 * @param {import('playwright').Page} page - Playwright page to inspect
 * @param {Object} options - Detection options (not used by this check)
 * @returns {Promise<Object>} Detection result
 */
async function detectTypeHallucination(action, page, options) {
  if (!action.selector) {
    return {
      hasHallucination: true,
      reason: 'Type action missing selector',
      confidence: 0.9
    };
  }

  try {
    const field = page.locator(action.selector);

    // Check if input element exists
    const exists = (await field.count()) > 0;
    if (!exists) {
      return {
        hasHallucination: true,
        reason: `Input element with selector "${action.selector}" does not exist`,
        confidence: 0.95,
        elementExists: false
      };
    }

    // Check if the value was actually entered. A truthiness test would skip
    // '' and 0 and then wrongly report a match, so only null/undefined skip it.
    const hasClaimedValue = action.value !== undefined && action.value !== null;
    if (hasClaimedValue) {
      // A failing read yields null, which never equals the claimed string.
      const actualValue = await field.inputValue().catch(() => null);

      if (actualValue !== String(action.value)) {
        return {
          hasHallucination: true,
          reason: `Value mismatch: expected "${action.value}", got "${actualValue}"`,
          confidence: 0.9,
          elementExists: true,
          valueMatch: false,
          expectedValue: action.value,
          actualValue
        };
      }
    }

    return {
      hasHallucination: false,
      reason: 'Input element exists and value matches',
      confidence: 0.9,
      elementExists: true,
      valueMatch: true
    };
  } catch (error) {
    return {
      hasHallucination: true,
      reason: `Error verifying type action: ${error.message}`,
      confidence: 0.7,
      error: error.message
    };
  }
}
231
+
/**
 * Detect hallucination in navigate actions.
 *
 * The navigation is accepted when either
 *  - the current URL contains the expected URL (so extra query parameters or
 *    a hash on the current URL are tolerated), or
 *  - both URLs point at the same origin and path, ignoring query string, hash
 *    and trailing slashes (so an expected URL carrying a hash/query still
 *    matches the page it refers to).
 *
 * A plain "expected contains current" substring test is deliberately NOT
 * used: it would accept a browser that is still sitting on a prefix of the
 * target (e.g. on "https://site/" when "https://site/checkout" was expected).
 *
 * @param {Object} action - Claimed navigate action; `action.url` is required
 * @param {import('playwright').Page} page - Playwright page to inspect
 * @param {Object} options - Detection options (not used by this check)
 * @returns {Promise<Object>} Detection result
 */
async function detectNavigateHallucination(action, page, options) {
  if (!action.url) {
    return {
      hasHallucination: true,
      reason: 'Navigate action missing URL',
      confidence: 0.9
    };
  }

  try {
    const currentUrl = page.url();
    const expectedUrl = action.url;

    const urlMatches = currentUrl.includes(expectedUrl) ||
      isSameNavigationTarget(currentUrl, expectedUrl);

    if (!urlMatches) {
      return {
        hasHallucination: true,
        reason: `URL mismatch: expected "${expectedUrl}", got "${currentUrl}"`,
        confidence: 0.9,
        urlMatch: false,
        expectedUrl,
        actualUrl: currentUrl
      };
    }

    return {
      hasHallucination: false,
      reason: 'URL matches expected navigation target',
      confidence: 0.9,
      urlMatch: true
    };
  } catch (error) {
    return {
      hasHallucination: false,
      reason: `Could not verify navigation: ${error.message}`,
      confidence: 0.5,
      error: error.message
    };
  }
}

/**
 * Tell whether two URLs address the same origin and path.
 *
 * Query string, hash and trailing slashes are ignored. A relative expected
 * URL is resolved against the current one. Unparseable input never matches.
 *
 * @param {string} currentUrl - URL the page is on
 * @param {string} expectedUrl - URL the action claimed to navigate to
 * @returns {boolean} True when origin and normalized path are equal
 */
function isSameNavigationTarget(currentUrl, expectedUrl) {
  const withoutTrailingSlash = (path) => path.replace(/\/+$/, '');

  try {
    const current = new URL(currentUrl);
    const expected = new URL(expectedUrl, current);
    return current.origin === expected.origin &&
      withoutTrailingSlash(current.pathname) === withoutTrailingSlash(expected.pathname);
  } catch {
    // Not a parseable URL pair: fall back to "no match".
    return false;
  }
}
278
+
/**
 * Batch detect action hallucinations.
 *
 * Verifies every action against the same page (checks are started together
 * and awaited as a group) and summarizes how many were flagged.
 *
 * An empty batch yields a hallucination rate of 0 rather than NaN, so the
 * summary stays numeric and the recommendation stays meaningful.
 *
 * @param {Array<Object>} actions - Actions that were claimed to be executed
 * @param {import('playwright').Page} page - Playwright page to verify against
 * @param {Object} [options] - Detection options forwarded to each check
 * @returns {Promise<Object>} Summary with `total`, `hallucinationCount`,
 *   `hallucinationRate` (0..1), per-action `results` and a `recommendation`
 */
export async function batchDetectActionHallucinations(actions, page, options = {}) {
  const results = await Promise.all(
    actions.map(action => detectActionHallucination(action, page, options))
  );

  const total = results.length;
  const hallucinationCount = results.filter(r => r.hasHallucination).length;
  // Guard the division: 0 / 0 would be NaN for an empty batch.
  const hallucinationRate = total > 0 ? hallucinationCount / total : 0;

  return {
    total,
    hallucinationCount,
    hallucinationRate,
    results,
    recommendation: hallucinationRate < 0.15
      ? 'Hallucination rate meets target (<15%)'
      : `Hallucination rate ${(hallucinationRate * 100).toFixed(1)}% exceeds target. Review action execution logic.`
  };
}
301
+
@@ -0,0 +1,82 @@
1
+ /**
2
+ * Baseline Validator
3
+ *
4
+ * Tests visual discriminative power of benchmarks
5
+ *
6
+ * Research context:
7
+ * - Standard benchmarks can be partially solved without visual analysis
8
+ * - Questions that can be answered through world knowledge alone obscure actual visual deficits
9
+ * - Need to test baseline (text-only) vs. visual accuracy
10
+ */
11
+
12
+ import { validateScreenshot } from '../judge.mjs';
13
+
/**
 * Compare visual accuracy against a text-only baseline for one question.
 *
 * When an image path is given, the screenshot is judged via
 * `validateScreenshot`. The text-only baseline is currently a fixed
 * placeholder with score 0 (no model call is made), so the relative accuracy
 * drop is 1 whenever the visual score is positive and 0 otherwise.
 *
 * @param {string} imagePath - Path to image (or null to skip the visual run)
 * @param {string} prompt - Question about the image
 * @param {Object} options - Test options, merged over `{ testType: 'baseline-visual' }`
 * @returns {Promise<Object>} Baseline test result
 */
export async function testBaseline(imagePath, prompt, options = {}) {
  // Visual run (skipped when there is no image)
  let visualResult = null;
  if (imagePath) {
    const judgeOptions = { testType: 'baseline-visual', ...options };
    visualResult = await validateScreenshot(imagePath, prompt, judgeOptions);
  }

  // Text-only run: simulated with a constant result instead of a real call
  const baselineResult = {
    score: 0,
    reasoning: 'Baseline (text-only) - no visual input',
    extractedValue: null
  };

  const visualScore = visualResult?.score || 0;
  const baselineScore = baselineResult?.score || 0;

  // Relative drop; defined as 0 when the visual run scored nothing
  let accuracyDrop = 0;
  if (visualScore > 0) {
    accuracyDrop = (visualScore - baselineScore) / visualScore;
  }

  // A drop above 30% is required to count as visually discriminative
  const isDiscriminative = accuracyDrop > 0.3;
  const recommendation = isDiscriminative
    ? 'Benchmark has visual discriminative power.'
    : 'Benchmark may not require visual input. Consider visual-specific test cases.';

  return {
    visualResult,
    baselineResult,
    visualScore,
    baselineScore,
    accuracyDrop,
    hasVisualDiscriminativePower: isDiscriminative,
    recommendation
  };
}
56
+
/**
 * Batch test baseline vs. visual accuracy.
 *
 * Runs `testBaseline` for every test case (started together, awaited as a
 * group) and aggregates the outcomes. For an empty list the averages are 0
 * instead of NaN, so the summary stays numeric.
 *
 * @param {Array<{imagePath: string, prompt: string}>} testCases
 * @param {Object} options - Test options forwarded to each `testBaseline` call
 * @returns {Promise<Object>} Batch result with `total`, `avgAccuracyDrop`,
 *   `visualDiscriminativePower` (fraction of discriminative cases),
 *   per-case `results` and a `recommendation`
 */
export async function batchTestBaseline(testCases, options = {}) {
  const results = await Promise.all(
    testCases.map(tc => testBaseline(tc.imagePath, tc.prompt, options))
  );

  const total = results.length;
  const visualDiscriminativeCount = results.filter(r => r.hasVisualDiscriminativePower).length;

  // Guard both divisions: 0 / 0 would be NaN for an empty batch.
  const avgAccuracyDrop = total > 0
    ? results.reduce((sum, r) => sum + r.accuracyDrop, 0) / total
    : 0;
  const visualDiscriminativePower = total > 0 ? visualDiscriminativeCount / total : 0;

  return {
    total,
    avgAccuracyDrop,
    visualDiscriminativePower,
    results,
    recommendation: avgAccuracyDrop > 0.3
      ? 'Benchmark has good visual discriminative power.'
      : 'Benchmark may not require visual input. Consider visual-specific test cases.'
  };
}
82
+
@@ -0,0 +1,104 @@
1
+ /**
2
+ * Cache Statistics and Monitoring Utilities
3
+ *
4
+ * Provides utilities for monitoring cache performance and health.
5
+ * Useful for debugging cache issues and optimizing cache hit rates.
6
+ */
7
+
8
+ import { getCacheStats } from '../cache.mjs';
9
+ import { getCacheStats as getEmbeddingCacheStats } from '../../evaluation/utils/embedding-cache.mjs';
10
+
/**
 * Get comprehensive cache statistics across all cache systems.
 *
 * Combines the main (VLLM) cache statistics with the embedding cache
 * statistics. The embedding section is null when its statistics cannot be
 * read.
 *
 * The atomic-write success rate defaults to 100 only when no rate is
 * available or no write was attempted. A real 0% rate (writes attempted, none
 * succeeded) is reported as 0 instead of being masked as 100.
 *
 * @returns {Object} Combined cache statistics (`vllm`, `embedding`)
 */
export function getAllCacheStats() {
  // Mirrors MAX_CACHE_SIZE of the main cache (value is duplicated here).
  const VLLM_MAX_ENTRIES = 1000;

  const vllmStats = getCacheStats();
  let embeddingStats = null;

  try {
    embeddingStats = getEmbeddingCacheStats();
  } catch {
    // Embedding cache might not be available; report it as absent.
  }

  const atomicWrites = vllmStats.atomicWrites || 0;
  const atomicWriteFailures = vllmStats.atomicWriteFailures || 0;
  const writeAttempted = atomicWrites + atomicWriteFailures > 0;

  // Trust the reported rate when it is a finite number; a reported 0 is only
  // meaningful if at least one write was attempted.
  const reportedRate = vllmStats.atomicWriteSuccessRate;
  const atomicWriteSuccessRate =
    Number.isFinite(reportedRate) && (reportedRate > 0 || writeAttempted)
      ? reportedRate
      : 100;

  return {
    vllm: {
      size: vllmStats.size,
      maxSize: VLLM_MAX_ENTRIES,
      maxAge: vllmStats.maxAge,
      utilization: `${((vllmStats.size / VLLM_MAX_ENTRIES) * 100).toFixed(1)}%`,
      cacheFile: vllmStats.cacheFile,
      atomicWrites,
      atomicWriteFailures,
      atomicWriteSuccessRate
    },
    embedding: embeddingStats ? {
      size: embeddingStats.size,
      maxSize: embeddingStats.maxSize,
      utilization: embeddingStats.utilization
    } : null
  };
}
44
+
/**
 * Render cache statistics as a multi-line, human-readable report.
 *
 * @param {Object} [stats=null] - Statistics in the shape returned by
 *   getAllCacheStats(); when omitted/falsy the current statistics are fetched
 * @returns {string} Report text, lines joined with "\n"
 */
export function formatCacheStats(stats = null) {
  const MS_PER_DAY = 1000 * 60 * 60 * 24;

  const source = stats || getAllCacheStats();
  const vllm = source.vllm;
  const embedding = source.embedding;

  const maxAgeDays = Math.floor(vllm.maxAge / MS_PER_DAY);
  const successRate = vllm.atomicWriteSuccessRate.toFixed(1);

  const report = [];
  report.push('=== Cache Statistics ===');
  report.push('');
  report.push('VLLM Cache (Vision + Text LLM):');
  report.push(` Size: ${vllm.size} / ${vllm.maxSize} entries (${vllm.utilization})`);
  report.push(` Max Age: ${maxAgeDays} days`);
  report.push(` Cache File: ${vllm.cacheFile}`);
  report.push(` Atomic Writes: ${vllm.atomicWrites} (${successRate}% success rate)`);
  report.push('');

  // The embedding section only appears when those statistics are present.
  if (embedding) {
    report.push('Embedding Cache:');
    report.push(` Size: ${embedding.size} / ${embedding.maxSize} entries (${embedding.utilization})`);
    report.push('');
  }

  report.push('=== End Cache Statistics ===');

  return report.join('\n');
}
77
+
/**
 * Check cache health and return warnings if any issues are detected.
 *
 * Warns when:
 *  - the VLLM cache holds more than 900 entries,
 *  - the atomic write success rate is below 95% after more than 10 writes,
 *  - the embedding cache utilization is above 90%.
 *
 * The embedding utilization is parsed as a decimal number, so a fractional
 * value such as "90.5%" is compared as 90.5 and not truncated to 90.
 *
 * @returns {Array<string>} Array of warning messages (empty if healthy)
 */
export function checkCacheHealth() {
  const warnings = [];
  const stats = getAllCacheStats();

  // Check VLLM cache utilization
  if (stats.vllm.size > 900) {
    warnings.push(`VLLM cache is nearly full (${stats.vllm.size}/1000 entries). Consider clearing old entries.`);
  }

  // Check atomic write success rate (ignored until enough writes happened)
  if (stats.vllm.atomicWriteSuccessRate < 95 && stats.vllm.atomicWrites > 10) {
    warnings.push(`Low atomic write success rate: ${stats.vllm.atomicWriteSuccessRate.toFixed(1)}%. Check disk permissions.`);
  }

  // Check embedding cache utilization; parseFloat keeps the fractional part
  // and ignores the trailing "%".
  if (stats.embedding && Number.parseFloat(stats.embedding.utilization) > 90) {
    warnings.push(`Embedding cache is nearly full (${stats.embedding.utilization}). Consider increasing limit.`);
  }

  return warnings;
}
104
+
@@ -0,0 +1,164 @@
1
+ /**
2
+ * Cached LLM Wrapper
3
+ *
4
+ * Wraps @arclabs561/llm-utils callLLM with persistent caching to reduce costs
5
+ * and improve performance for text-only LLM calls.
6
+ *
7
+ * Uses the same cache system as vLLM calls (src/cache.mjs) for consistency.
8
+ * Cache entries persist for 7 days and use LRU eviction.
9
+ *
10
+ * DESIGN DECISION: Separate wrapper rather than modifying @arclabs561/llm-utils
11
+ * - Why: @arclabs561/llm-utils is a shared library, we can't modify it
12
+ * - Why: Keeps caching concerns separate from LLM calling logic
13
+ * - Why: Allows disabling cache per-call if needed (via useCache option)
14
+ */
15
+
16
+ import { getCachedTextLLM, setCachedTextLLM, initCache, getCacheStats } from '../cache.mjs';
17
+ import { ValidationError } from '../errors.mjs';
18
+
19
+ // Initialize cache on module load (uses default directory)
20
+ initCache();
21
+
/**
 * Normalize a prompt so that formatting-only variations share a cache key.
 *
 * Steps, applied in order:
 *  1. CRLF and lone CR line endings become LF
 *  2. runs of spaces/tabs collapse to a single space
 *  3. spaces/tabs directly around a line break are removed
 *  4. leading/trailing whitespace is trimmed
 *
 * DESIGN DECISION: normalization is conservative (formatting only).
 * - Why: lowercasing or stripping punctuation could make semantically
 *   different prompts collide in the cache
 * - Trade-off: fewer cache hits than aggressive normalization, but no wrong
 *   cache hits caused by it
 *
 * The result is intended for the cache key only; callers keep sending the
 * original prompt to the API.
 *
 * @param {string} prompt - Raw prompt
 * @returns {string} Normalized prompt (non-string input is returned unchanged)
 */
export function normalizePrompt(prompt) {
  if (typeof prompt !== 'string') {
    return prompt;
  }

  // Ordered rewrite rules: [pattern, replacement]
  const rewriteRules = [
    [/\r\n/g, '\n'],            // CRLF -> LF
    [/\r/g, '\n'],              // old Mac line endings
    [/[ \t]+/g, ' '],           // collapse runs of spaces/tabs
    [/[ \t]*\n[ \t]*/g, '\n']   // strip blanks around line breaks
  ];

  let text = prompt;
  for (const [pattern, replacement] of rewriteRules) {
    text = text.replace(pattern, replacement);
  }

  return text.trim();
}
60
+
/**
 * Call LLM with caching.
 *
 * Wraps @arclabs561/llm-utils callLLM with persistent caching. The cache key
 * is built from the normalized prompt, the provider and the LLM options; the
 * API itself always receives the original, un-normalized prompt.
 *
 * A cache lookup that yields null or undefined is treated as a miss, so a
 * missing entry can never be returned as if it were a response.
 *
 * @param {string} prompt - Text prompt
 * @param {string} provider - LLM provider (e.g., 'gemini', 'openai', 'claude')
 * @param {string} apiKey - API key for the provider
 * @param {{
 *   model?: string | null;
 *   temperature?: number;
 *   maxTokens?: number;
 *   tier?: string;
 *   useCache?: boolean;
 * }} [options={}] - LLM call options; `useCache` defaults to true unless the
 *   DISABLE_LLM_CACHE environment variable is 'true'
 * @returns {Promise<string>} LLM response
 * @throws {ValidationError} When @arclabs561/llm-utils cannot be imported
 */
export async function callLLMCached(prompt, provider, apiKey, options = {}) {
  const {
    useCache = process.env.DISABLE_LLM_CACHE !== 'true', // Cache by default, can disable via env var
    ...llmOptions
  } = options;

  // Only the cache key uses the normalized prompt.
  const normalizedPrompt = normalizePrompt(prompt);

  // Check cache first (if caching enabled)
  if (useCache) {
    const cached = getCachedTextLLM(normalizedPrompt, provider, llmOptions);
    const isHit = cached !== null && cached !== undefined;

    await logTextCacheLookup(isHit);

    if (isHit) {
      return cached;
    }
  }

  // Call actual LLM (dynamic import keeps the dependency optional)
  let llmUtils;
  try {
    llmUtils = await import('@arclabs561/llm-utils');
  } catch (error) {
    throw new ValidationError(
      `LLM call requires @arclabs561/llm-utils package. ` +
      `Install it with: npm install @arclabs561/llm-utils. ` +
      `Error: ${error.message}`,
      {
        package: '@arclabs561/llm-utils',
        installationCommand: 'npm install @arclabs561/llm-utils',
        originalError: error.message
      }
    );
  }

  const response = await llmUtils.callLLM(prompt, provider, apiKey, llmOptions);

  // Cache the response under the same normalized key used for the lookup.
  if (useCache) {
    setCachedTextLLM(normalizedPrompt, provider, llmOptions, response);
  }

  return response;
}

/**
 * Report a text-LLM cache lookup to the performance logger (best effort).
 *
 * Logging must never affect the LLM call, so any failure here (logger module
 * missing, stats unavailable) is deliberately ignored.
 *
 * @param {boolean} hit - Whether the lookup found a cached response
 * @returns {Promise<void>}
 */
async function logTextCacheLookup(hit) {
  try {
    const { logCacheOperation } = await import('./performance-logger.mjs');
    const stats = getCacheStats();
    logCacheOperation({
      operation: hit ? 'hit' : 'miss',
      hit,
      ...(hit ? { latency: 0 } : {}), // hits are reported with minimal latency
      cacheSize: stats.size,
      maxSize: 1000, // MAX_CACHE_SIZE
      type: 'text-llm'
    });
  } catch {
    // Best-effort logging: performance logger unavailable.
  }
}
155
+
/**
 * Call the LLM directly, bypassing the cache.
 *
 * Thin pass-through to callLLM from @arclabs561/llm-utils, which is loaded
 * on demand. The returned promise rejects with the import error if the
 * package is not installed.
 *
 * @param {string} prompt - Text prompt
 * @param {string} provider - LLM provider
 * @param {string} apiKey - API key for the provider
 * @param {Object} [options={}] - Options passed through unchanged
 * @returns {Promise<*>} Whatever callLLM resolves to
 */
export async function callLLMUncached(prompt, provider, apiKey, options = {}) {
  const llmModule = await import('@arclabs561/llm-utils');
  const reply = llmModule.callLLM(prompt, provider, apiKey, options);
  return reply;
}
164
+