@arclabs561/ai-visual-test 0.5.1 → 0.7.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +102 -11
- package/DEPLOYMENT.md +225 -9
- package/README.md +71 -80
- package/index.d.ts +862 -3
- package/package.json +10 -51
- package/src/batch-optimizer.mjs +39 -0
- package/src/cache.mjs +241 -16
- package/src/config.mjs +33 -91
- package/src/constants.mjs +54 -0
- package/src/convenience.mjs +113 -10
- package/src/cost-optimization.mjs +1 -0
- package/src/cost-tracker.mjs +134 -2
- package/src/data-extractor.mjs +36 -7
- package/src/dynamic-few-shot.mjs +69 -11
- package/src/errors.mjs +6 -2
- package/src/experience-propagation.mjs +12 -0
- package/src/experience-tracer.mjs +12 -3
- package/src/game-player.mjs +222 -43
- package/src/graceful-shutdown.mjs +126 -0
- package/src/helpers/playwright.mjs +22 -8
- package/src/human-validation-manager.mjs +99 -2
- package/src/index.mjs +48 -3
- package/src/integrations/playwright.mjs +140 -0
- package/src/judge.mjs +697 -24
- package/src/load-env.mjs +2 -1
- package/src/logger.mjs +31 -3
- package/src/model-tier-selector.mjs +1 -221
- package/src/natural-language-specs.mjs +31 -3
- package/src/persona-enhanced.mjs +4 -2
- package/src/persona-experience.mjs +1 -1
- package/src/pricing.mjs +28 -0
- package/src/prompt-composer.mjs +162 -5
- package/src/provider-data.mjs +115 -0
- package/src/render-change-detector.mjs +5 -0
- package/src/research-enhanced-validation.mjs +7 -5
- package/src/retry.mjs +21 -7
- package/src/rubrics.mjs +4 -0
- package/src/safe-logger.mjs +71 -0
- package/src/session-cost-tracker.mjs +320 -0
- package/src/smart-validator.mjs +8 -8
- package/src/spec-templates.mjs +52 -6
- package/src/startup-validation.mjs +127 -0
- package/src/temporal-adaptive.mjs +2 -2
- package/src/temporal-decision-manager.mjs +1 -271
- package/src/temporal-logic.mjs +104 -0
- package/src/temporal-note-pruner.mjs +119 -0
- package/src/temporal-preprocessor.mjs +1 -543
- package/src/temporal.mjs +681 -79
- package/src/utils/action-hallucination-detector.mjs +301 -0
- package/src/utils/baseline-validator.mjs +82 -0
- package/src/utils/cache-stats.mjs +104 -0
- package/src/utils/cached-llm.mjs +164 -0
- package/src/utils/capability-stratifier.mjs +108 -0
- package/src/utils/counterfactual-tester.mjs +83 -0
- package/src/utils/error-recovery.mjs +117 -0
- package/src/utils/explainability-scorer.mjs +119 -0
- package/src/utils/exploratory-automation.mjs +131 -0
- package/src/utils/index.mjs +10 -0
- package/src/utils/intent-recognizer.mjs +201 -0
- package/src/utils/log-sanitizer.mjs +165 -0
- package/src/utils/path-validator.mjs +88 -0
- package/src/utils/performance-logger.mjs +316 -0
- package/src/utils/performance-measurement.mjs +280 -0
- package/src/utils/prompt-sanitizer.mjs +213 -0
- package/src/utils/rate-limiter.mjs +144 -0
- package/src/validation-framework.mjs +24 -20
- package/src/validation-result-normalizer.mjs +27 -1
- package/src/validation.mjs +75 -25
- package/src/validators/accessibility-validator.mjs +144 -0
- package/src/validators/hybrid-validator.mjs +48 -4
- package/api/health.js +0 -34
- package/api/validate.js +0 -252
- package/public/index.html +0 -149
- package/vercel.json +0 -27
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Capability Stratifier
|
|
3
|
+
*
|
|
4
|
+
* Tests VLLM capabilities at different levels (low/mid/high)
|
|
5
|
+
*
|
|
6
|
+
* Research context:
|
|
7
|
+
* - VLMs exhibit widespread deficits in low- and mid-level visual abilities
|
|
8
|
+
* - High-level object recognition performance cannot predict low-level capabilities
|
|
9
|
+
* - Need stratified testing to identify capability gaps
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
import { validateScreenshot } from '../judge.mjs';
|
|
13
|
+
|
|
14
|
+
/**
 * Test capability at specific level
 *
 * Sends every test case to `validateScreenshot` concurrently, then compares the
 * value the judge returned against the expected value for that case.
 *
 * A case counts as correct when the extracted value is strictly equal to
 * `expected`, or when both are numbers that differ by less than 0.1.
 *
 * @param {string} level - 'low', 'mid', or 'high'
 * @param {Array<{imagePath: string, prompt: string, expected: any}>} testCases
 * @param {Object} options - Test options. Spread after `testType`, so a
 *   `testType` key in `options` overrides the default `capability-<level>`.
 * @returns {Promise<Object>} Capability test result:
 *   `{ level, accuracy, total, correct, results, recommendation }`.
 *   With no test cases, `accuracy` is 0 and the recommendation says so.
 */
export async function testCapabilityLevel(level, testCases, options = {}) {
  const cases = Array.isArray(testCases) ? testCases : [];

  const results = await Promise.all(
    cases.map(async (tc) => {
      const result = await validateScreenshot(tc.imagePath, tc.prompt, {
        testType: `capability-${level}`,
        ...options
      });

      // Fall back to the score only when no value was extracted at all.
      // A plain `||` would also discard legitimate answers such as 0 or false.
      const hasExtracted = result.extractedValue !== undefined && result.extractedValue !== null;
      const extractedValue = hasExtracted ? result.extractedValue : result.score;

      const bothNumeric = typeof extractedValue === 'number' && typeof tc.expected === 'number';
      const correct = extractedValue === tc.expected ||
        (bothNumeric && Math.abs(extractedValue - tc.expected) < 0.1);

      return {
        testCase: tc,
        result,
        correct,
        extractedValue,
        expected: tc.expected
      };
    })
  );

  const total = results.length;
  const correctCount = results.filter(r => r.correct).length;

  // Avoid 0/0 = NaN, which would silently pass the `< 0.7` check below.
  if (total === 0) {
    return {
      level,
      accuracy: 0,
      total: 0,
      correct: 0,
      results,
      recommendation: `No ${level}-level test cases were provided; capability could not be assessed.`
    };
  }

  const accuracy = correctCount / total;

  return {
    level,
    accuracy,
    total,
    correct: correctCount,
    results,
    recommendation: accuracy < 0.7
      ? `Low ${level}-level capability accuracy. VLLM may struggle with ${level}-level visual tasks.`
      : `${level}-level capability appears adequate.`
  };
}
|
|
58
|
+
|
|
59
|
+
/**
 * Stratified capability testing (all levels)
 *
 * Runs `testCapabilityLevel` for each of 'low', 'mid' and 'high' that has a
 * non-empty suite, one level after another, then flags a gap whenever
 * high-level accuracy is above 0.9 while low- or mid-level accuracy is below 0.7.
 *
 * @param {Object} testSuites - {low: [...], mid: [...], high: [...]}; levels
 *   that are missing or empty are skipped. A null/undefined value is treated
 *   as "no suites".
 * @param {Object} options - Test options, forwarded to `testCapabilityLevel`
 * @returns {Promise<Object>} Stratified test results:
 *   `{ results, gaps, overallRecommendation }`, where `results` is keyed by
 *   level and `gaps` holds one entry per detected gap.
 */
export async function testStratifiedCapabilities(testSuites, options = {}) {
  const suites = testSuites || {};
  const results = {};

  for (const level of ['low', 'mid', 'high']) {
    const suite = suites[level];
    if (suite && suite.length > 0) {
      results[level] = await testCapabilityLevel(level, suite, options);
    }
  }

  // Detect gaps (high-level >0.9 but lower level <0.7)
  const gaps = [];
  const high = results.high;

  for (const lower of ['low', 'mid']) {
    const lowerResult = results[lower];
    if (!high || !lowerResult) {
      continue; // Both levels must have been tested to compare them
    }

    if (high.accuracy > 0.9 && lowerResult.accuracy < 0.7) {
      gaps.push({
        type: `high-${lower}-gap`,
        highAccuracy: high.accuracy,
        [`${lower}Accuracy`]: lowerResult.accuracy,
        recommendation: `High-level performance does not predict ${lower}-level capabilities. Validate ${lower}-level tasks separately.`
      });
    }
  }

  return {
    results,
    gaps,
    overallRecommendation: gaps.length > 0
      ? 'Capability gaps detected. High-level performance cannot predict low/mid-level capabilities.'
      : 'Capability levels appear consistent.'
  };
}
|
|
108
|
+
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Counterfactual Tester
|
|
3
|
+
*
|
|
4
|
+
* Tests whether VLLM uses visual analysis vs. memorized knowledge
|
|
5
|
+
*
|
|
6
|
+
* Research context:
|
|
7
|
+
* - VLMs achieve only 58.57% accuracy on basic visual tasks
|
|
8
|
+
* - When counterfactual images contradict training data, accuracy drops to 17.05%
|
|
9
|
+
* - 75.70% of errors are bias-aligned rather than random
|
|
10
|
+
*
|
|
11
|
+
* This utility helps detect when VLLM is using memorization vs. visual analysis
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
import { validateScreenshot } from '../judge.mjs';
|
|
15
|
+
|
|
16
|
+
/**
 * Test counterfactual scenario
 *
 * Asks `validateScreenshot` about a counterfactual image and classifies the
 * answer: does it match what is actually visible (`expectedVisual`), what
 * memorized knowledge would predict (`expectedMemorized`), or neither?
 * A match is strict equality, or two numbers that differ by less than 0.1.
 *
 * @param {string} imagePath - Path to counterfactual image
 * @param {string} prompt - Question about the image
 * @param {any} expectedMemorized - What memorized knowledge would predict
 * @param {any} expectedVisual - What visual analysis should find
 * @param {Object} options - Test options. Spread after `testType`, so a
 *   `testType` key in `options` overrides the default 'counterfactual'.
 * @returns {Promise<Object>} Test result:
 *   `{ extractedValue, expectedMemorized, expectedVisual, usesVisual,
 *   usesMemorization, biasAligned, result, recommendation }`
 */
export async function testCounterfactual(imagePath, prompt, expectedMemorized, expectedVisual, options = {}) {
  const result = await validateScreenshot(imagePath, prompt, {
    testType: 'counterfactual',
    ...options
  });

  // Fall back to the score only when nothing was extracted. A plain `||`
  // would also throw away legitimate answers such as 0 or false.
  const hasExtracted = result.extractedValue !== undefined && result.extractedValue !== null;
  const extractedValue = hasExtracted ? result.extractedValue : result.score;

  const matches = (expected) =>
    extractedValue === expected ||
    (typeof extractedValue === 'number' && typeof expected === 'number' &&
      Math.abs(extractedValue - expected) < 0.1);

  const usesVisual = matches(expectedVisual);
  const usesMemorization = matches(expectedMemorized);

  let recommendation;
  if (usesMemorization) {
    recommendation = 'VLLM appears to use memorized knowledge. Consider visual analysis validation.';
  } else if (usesVisual) {
    recommendation = 'VLLM appears to use visual analysis.';
  } else {
    // Matching neither expectation is not evidence of visual analysis.
    recommendation = 'VLLM answer matched neither the visual nor the memorized expectation. Result is inconclusive.';
  }

  return {
    extractedValue,
    expectedMemorized,
    expectedVisual,
    usesVisual,
    usesMemorization,
    biasAligned: usesMemorization && !usesVisual,
    result,
    recommendation
  };
}
|
|
53
|
+
|
|
54
|
+
/**
 * Batch test counterfactual scenarios
 *
 * Runs `testCounterfactual` for every case concurrently and aggregates how
 * often the answer matched the visual expectation, the memorized expectation,
 * and how often it was bias-aligned (memorized but not visual).
 *
 * @param {Array<{imagePath: string, prompt: string, expectedMemorized: any, expectedVisual: any}>} testCases
 * @param {Object} options - Test options, forwarded to each `testCounterfactual` call
 * @returns {Promise<Object>} Batch test results:
 *   `{ total, visualAccuracy, memorizationRate, biasAlignedRate, results,
 *   recommendation }`. All rates are fractions in [0, 1]; they are 0 when
 *   there are no test cases.
 */
export async function batchTestCounterfactual(testCases, options = {}) {
  const cases = Array.isArray(testCases) ? testCases : [];

  const results = await Promise.all(
    cases.map(tc =>
      testCounterfactual(tc.imagePath, tc.prompt, tc.expectedMemorized, tc.expectedVisual, options)
    )
  );

  const total = results.length;
  // Guard against 0/0 = NaN for an empty batch.
  const rateOf = (count) => (total > 0 ? count / total : 0);

  const visualCount = results.filter(r => r.usesVisual).length;
  const memorizationCount = results.filter(r => r.usesMemorization).length;
  const biasAlignedCount = results.filter(r => r.biasAligned).length;

  let recommendation;
  if (total === 0) {
    recommendation = 'No counterfactual test cases were provided; nothing could be assessed.';
  } else if (biasAlignedCount > total * 0.5) {
    recommendation = 'High bias-aligned error rate. VLLM may be relying on memorized knowledge.';
  } else {
    recommendation = 'VLLM appears to use visual analysis appropriately.';
  }

  return {
    total,
    visualAccuracy: rateOf(visualCount),
    memorizationRate: rateOf(memorizationCount),
    biasAlignedRate: rateOf(biasAlignedCount),
    results,
    recommendation
  };
}
|
|
83
|
+
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Error Recovery for Browser Automation
|
|
3
|
+
*
|
|
4
|
+
* Simple retry logic: wait and retry, or try alternative action.
|
|
5
|
+
*
|
|
6
|
+
* Research Context:
|
|
7
|
+
* - Error recovery success rate >70% is often cited as critical for browser automation agents
|
|
8
|
+
* - Agents should gracefully handle failures and try alternatives
|
|
9
|
+
* - Need to avoid infinite retry loops
|
|
10
|
+
*
|
|
11
|
+
* Implementation:
|
|
12
|
+
* - Most errors are timeouts or element not found - simple wait + retry handles these
|
|
13
|
+
* - Complex error classification adds complexity without clear benefit
|
|
14
|
+
* - The VLLM can handle complex error recovery during action execution
|
|
15
|
+
*
|
|
16
|
+
* See docs/research/IMPLEMENTATION_VS_RESEARCH.md for detailed research context.
|
|
17
|
+
*
|
|
18
|
+
* @module error-recovery
|
|
19
|
+
*/
|
|
20
|
+
|
|
21
|
+
/**
 * Extract a message string from whatever was thrown.
 *
 * Thrown values are not guaranteed to be Error instances (strings, plain
 * objects and undefined are all possible), so never assume `.message` exists.
 *
 * @param {any} error - The thrown value
 * @returns {string} The message, or '' when nothing usable is available
 */
function getErrorMessage(error) {
  if (error && typeof error.message === 'string') {
    return error.message;
  }
  if (error === undefined || error === null) {
    return '';
  }
  return String(error);
}

/**
 * Error recovery strategy
 *
 * Hands out "wait, then retry" recovery actions until `maxRetries` recoveries
 * have been issued. Call `reset()` to start a fresh budget.
 */
export class ErrorRecoveryStrategy {
  /**
   * @param {Object} [options]
   * @param {number} [options.maxRetries=3] - Maximum recoveries before giving up.
   *   An explicit 0 disables recovery.
   * @param {number} [options.retryDelay=1000] - Base wait duration; doubled for
   *   timeout/network errors. An explicit 0 means "retry immediately".
   */
  constructor(options = {}) {
    const opts = options || {};
    // `||` alone would turn an explicit 0 into the default, so 0 is handled first.
    this.maxRetries = opts.maxRetries === 0 ? 0 : (opts.maxRetries || 3);
    this.retryDelay = opts.retryDelay === 0 ? 0 : (opts.retryDelay || 1000);
    this.recoveryHistory = [];
  }

  /**
   * Attempt to recover from error
   *
   * Records the issued recovery in `recoveryHistory`; the history length is
   * what enforces the `maxRetries` limit.
   *
   * @param {Error} error - The error that occurred
   * @param {Object} action - The action that failed
   * @param {Object} context - Current context (page, state, etc.)
   * @returns {Promise<Object|null>} Recovery action or null if no recovery possible
   */
  async attemptRecovery(error, action, context = {}) {
    if (this.recoveryHistory.length >= this.maxRetries) {
      return null; // Max retries reached
    }

    const recovery = this.generateRecoveryAction(error, action, context);

    if (!recovery) {
      return null; // No recovery strategy available
    }

    this.recoveryHistory.push({
      error: getErrorMessage(error),
      action,
      recovery,
      timestamp: Date.now()
    });

    return recovery;
  }

  /**
   * Generate recovery action based on error type
   *
   * Simple strategy: wait longer for timeouts/network, wait and retry for others.
   *
   * @param {Error} error - The error that occurred (non-Error values are tolerated)
   * @param {Object} action - The action that failed; echoed back as `originalAction`
   * @param {Object} context - Current context (unused by this strategy)
   * @returns {Object} `{ type: 'wait', duration, reason, originalAction }`
   */
  generateRecoveryAction(error, action, context) {
    const errorMessage = getErrorMessage(error).toLowerCase();

    // Timeout or network errors: wait longer
    if (errorMessage.includes('timeout') || errorMessage.includes('network')) {
      return {
        type: 'wait',
        duration: this.retryDelay * 2,
        reason: 'Timeout/network error, waiting longer',
        originalAction: action
      };
    }

    // Everything else: wait and retry
    return {
      type: 'wait',
      duration: this.retryDelay,
      reason: 'Error occurred, waiting and retrying',
      originalAction: action
    };
  }

  /**
   * Reset recovery state
   */
  reset() {
    this.recoveryHistory = [];
  }

  /**
   * Get recovery statistics
   *
   * NOTE: entries pushed by `attemptRecovery` carry no `success` field, so
   * `successfulRecoveries` and `successRate` stay 0 unless a caller sets
   * `success` on the entries exposed through `recoveries`.
   *
   * @returns {Object} `{ totalRecoveries, successfulRecoveries, successRate, recoveries }`
   */
  getStats() {
    const total = this.recoveryHistory.length;
    const successful = this.recoveryHistory.filter(r => r.success).length;

    return {
      totalRecoveries: total,
      successfulRecoveries: successful,
      successRate: total > 0 ? successful / total : 0,
      recoveries: this.recoveryHistory
    };
  }
}

/**
 * Create error recovery strategy
 *
 * @param {Object} [options] - Same options as the `ErrorRecoveryStrategy` constructor
 * @returns {ErrorRecoveryStrategy} A new strategy instance
 */
export function createErrorRecoveryStrategy(options = {}) {
  return new ErrorRecoveryStrategy(options);
}
|
|
117
|
+
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Explainability Scoring
|
|
3
|
+
*
|
|
4
|
+
* Simple heuristic: checks if reasoning exists, mentions the action, and isn't too technical.
|
|
5
|
+
*
|
|
6
|
+
* Research Context:
|
|
7
|
+
* - Explainability score >80% is often cited as critical for browser automation agents
|
|
8
|
+
* - Users need to understand agent reasoning for trust and debugging
|
|
9
|
+
* - Transparency scores measure communication quality
|
|
10
|
+
*
|
|
11
|
+
* Implementation:
|
|
12
|
+
* - Simple checks (has action, has target, not too technical, reasonable length) are sufficient
|
|
13
|
+
* - Complex scoring adds computation without clear benefit
|
|
14
|
+
* - The VLLM's reasoning is already human-readable
|
|
15
|
+
*
|
|
16
|
+
* See docs/research/IMPLEMENTATION_VS_RESEARCH.md for detailed research context.
|
|
17
|
+
*
|
|
18
|
+
* @module explainability-scorer
|
|
19
|
+
*/
|
|
20
|
+
|
|
21
|
+
/**
 * Score explainability of action reasoning
 *
 * Heuristic score built from four weighted checks:
 *   - mentions the action type (0.4)
 *   - mentions the action target: selector, key, or URL (0.3)
 *   - avoids a short list of technical jargon words (0.2)
 *   - has a reasonable length, between 21 and 499 characters (0.1)
 *
 * All target matching is case-insensitive, so "press enter" matches key "Enter".
 *
 * @param {string} reasoning - Agent's reasoning for an action
 * @param {Object} action - The action taken (`type`, plus one of `selector`,
 *   `key`, `url`); a missing action is treated as an empty object
 * @param {Object} [options] - Scoring options (currently unused)
 * @returns {Object} Explainability score and analysis:
 *   `{ score, clarity, completeness, relevance, issues, recommendation }`
 */
export function scoreExplainability(reasoning, action, options = {}) {
  if (typeof reasoning !== 'string' || reasoning.trim().length === 0) {
    return {
      score: 0,
      clarity: 0,
      completeness: 0,
      relevance: 0,
      issues: ['No reasoning provided'],
      recommendation: 'Add reasoning to explain why this action was taken'
    };
  }

  const act = action || {};
  const reasoningLower = reasoning.toLowerCase();
  const mentions = (needle) =>
    typeof needle === 'string' && needle.length > 0 &&
    reasoningLower.includes(needle.toLowerCase());

  // Simple scoring: has reasoning, mentions action, not too technical
  const hasAction = mentions(act.type);

  // Check for target: selector, key, or URL - also check for semantic mentions
  // (e.g., "submit button" for selector "#submit")
  let hasTarget = false;
  if (act.selector) {
    // Strip the CSS id/class markers so "#submit" also matches the word "submit".
    const bareSelector = String(act.selector).toLowerCase().replace(/[#.]/g, '');
    hasTarget = mentions(String(act.selector)) ||
      (bareSelector.length > 0 && reasoningLower.includes(bareSelector));
  } else if (act.key) {
    hasTarget = mentions(act.key);
  } else if (act.url) {
    hasTarget = mentions(act.url);
  }

  const notTooTechnical = !/\b(algorithm|implementation|optimization|paradigm)\b/i.test(reasoning);
  const reasonableLength = reasoning.length > 20 && reasoning.length < 500;

  // Completeness: considers both action and target, plus reasoning depth
  const hasDepth = reasoning.split(/[.!?]/).length > 2; // Multiple sentences indicate depth
  let completeness = 0.4;
  if (hasAction && hasTarget) {
    completeness = hasDepth ? 0.9 : 0.8;
  } else if (hasAction || hasTarget) {
    completeness = 0.6;
  }

  // Sum the weights in whole tenths and divide once. Adding 0.4 + 0.3 + 0.1
  // directly yields 0.7999999999999999, which misses a ">= 0.8" target.
  const tenths = (hasAction ? 4 : 0) +
    (hasTarget ? 3 : 0) +
    (notTooTechnical ? 2 : 0) +
    (reasonableLength ? 1 : 0);
  const score = tenths / 10;

  const issues = [];
  if (!hasAction) issues.push('Reasoning does not mention action type');
  if (!hasTarget) issues.push('Reasoning does not mention action target');
  if (!notTooTechnical) issues.push('Reasoning uses technical jargon');
  if (!reasonableLength) issues.push('Reasoning is too short or too long');

  return {
    score,
    clarity: notTooTechnical && reasonableLength ? 0.8 : 0.5,
    completeness,
    relevance: hasAction ? 0.8 : 0.5,
    issues,
    recommendation: score >= 0.7
      ? 'Reasoning is clear and relevant'
      : 'Add more context about the action and its target'
  };
}
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
/**
 * Batch score explainability
 *
 * Scores each reasoning against the action at the same index (an empty action
 * is used when none is given) and averages the per-item metrics. The batch
 * meets the target when the average score is at least 0.8.
 *
 * @param {string[]} reasonings - Reasoning strings, one per action
 * @param {Object[]} actions - Actions, index-aligned with `reasonings`
 * @param {Object} [options] - Scoring options, forwarded to `scoreExplainability`
 * @returns {Object} `{ total, averageScore, averageClarity, averageCompleteness,
 *   averageRelevance, meetsTarget, scores, recommendation }`. Averages are 0
 *   when there is nothing to score.
 */
export function batchScoreExplainability(reasonings, actions, options = {}) {
  const items = Array.isArray(reasonings) ? reasonings : [];
  const acts = Array.isArray(actions) ? actions : [];

  const scores = items.map((reasoning, i) =>
    scoreExplainability(reasoning, acts[i] || {}, options)
  );

  const total = scores.length;
  // Guard against 0/0 = NaN for an empty batch.
  const average = (field) =>
    (total > 0 ? scores.reduce((sum, s) => sum + s[field], 0) / total : 0);

  const avgScore = average('score');
  const meetsTarget = total > 0 && avgScore >= 0.8;

  let recommendation;
  if (total === 0) {
    recommendation = 'No reasoning entries were provided; explainability could not be assessed.';
  } else if (meetsTarget) {
    recommendation = 'Explainability meets target (>80%)';
  } else {
    recommendation = `Explainability ${(avgScore * 100).toFixed(1)}% below target. Improve reasoning clarity, completeness, or relevance.`;
  }

  return {
    total,
    averageScore: avgScore,
    averageClarity: average('clarity'),
    averageCompleteness: average('completeness'),
    averageRelevance: average('relevance'),
    meetsTarget,
    scores,
    recommendation
  };
}
|
|
119
|
+
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Exploratory Automation
|
|
3
|
+
*
|
|
4
|
+
* Tries alternative approaches when actions fail.
|
|
5
|
+
* Simple strategy: wait, try different action type, or give up after max attempts.
|
|
6
|
+
*
|
|
7
|
+
* Research Context:
|
|
8
|
+
* - Exploratory success rate >60% is often cited as critical for browser automation agents
|
|
9
|
+
* - Agents should try alternative approaches when initial attempts fail
|
|
10
|
+
* - Need to track exploration attempts and avoid infinite loops
|
|
11
|
+
*
|
|
12
|
+
* Implementation:
|
|
13
|
+
* - Simple wait + alternative action type is sufficient for most failures
|
|
14
|
+
* - Complex exploration strategies add complexity without clear benefit
|
|
15
|
+
* - The VLLM can handle complex decision-making during action execution
|
|
16
|
+
*
|
|
17
|
+
* See docs/research/IMPLEMENTATION_VS_RESEARCH.md for detailed research context.
|
|
18
|
+
*
|
|
19
|
+
* @module exploratory-automation
|
|
20
|
+
*/
|
|
21
|
+
|
|
22
|
+
/**
 * Exploration strategy
 *
 * Suggests alternative actions after a failure, never repeating a suggestion
 * and never issuing more than `maxAttempts` suggestions until `reset()`.
 */
export class ExploratoryStrategy {
  /**
   * @param {Object} [options]
   * @param {number} [options.maxAttempts=5] - Maximum suggestions to hand out.
   *   An explicit 0 disables exploration.
   */
  constructor(options = {}) {
    const opts = options || {};
    // `||` alone would turn an explicit 0 into the default, so 0 is handled first.
    this.maxAttempts = opts.maxAttempts === 0 ? 0 : (opts.maxAttempts || 5);
    this.attemptHistory = [];
    this.alternativeActions = [];
  }

  /**
   * Get next exploration action
   *
   * The chosen action is recorded in `attemptHistory` together with a
   * timestamp and the state that was passed in.
   *
   * @param {Object} currentState - Current browser state
   * @param {Array} failedActions - Actions that have failed
   * @param {string} goal - Current goal
   * @returns {Object|null} Next action to try, or null if no more alternatives
   */
  getNextAction(currentState, failedActions = [], goal = '') {
    if (this.attemptHistory.length >= this.maxAttempts) {
      return null; // Max attempts reached
    }

    // Generate alternative actions based on goal and failed actions
    const alternatives = this.generateAlternatives(currentState, failedActions, goal);

    // Actions are compared by their JSON form; serialize the history once
    // instead of re-serializing it for every candidate.
    const alreadyTried = new Set(
      this.attemptHistory.map(attempt => JSON.stringify(attempt.action))
    );

    // Select the first alternative that has not been attempted yet
    const nextAction = alternatives.find(alt => !alreadyTried.has(JSON.stringify(alt)));

    if (nextAction === undefined) {
      return null; // No more alternatives
    }

    this.attemptHistory.push({
      action: nextAction,
      timestamp: Date.now(),
      state: currentState
    });

    return nextAction;
  }

  /**
   * Generate alternative actions
   *
   * Simple strategy: wait, then try a different action type if available.
   * Only the most recent failed action is considered.
   *
   * @param {Object} currentState - Current browser state (unused by this strategy)
   * @param {Array} failedActions - Actions that have failed; a non-array is
   *   treated as "nothing failed"
   * @param {string} goal - Current goal (unused by this strategy)
   * @returns {Array<Object>} Candidate actions in priority order; empty when
   *   there is no failed action to react to
   */
  generateAlternatives(currentState, failedActions, goal) {
    const failed = Array.isArray(failedActions) ? failedActions : [];
    const lastFailed = failed[failed.length - 1];

    if (!lastFailed) {
      return [];
    }

    // If click failed, try wait then keyboard navigation
    if (lastFailed.type === 'click') {
      return [
        { type: 'wait', duration: 1000 },
        { type: 'keyboard', key: 'Tab' }
      ];
    }

    // Keyboard failures and everything else: wait is the only fallback
    return [{ type: 'wait', duration: 1000 }];
  }

  /**
   * Reset exploration state
   */
  reset() {
    this.attemptHistory = [];
    this.alternativeActions = [];
  }

  /**
   * Get exploration statistics
   *
   * @returns {Object} `{ totalAttempts, maxAttempts, remainingAttempts, attempts }`
   */
  getStats() {
    const used = this.attemptHistory.length;

    return {
      totalAttempts: used,
      maxAttempts: this.maxAttempts,
      remainingAttempts: this.maxAttempts - used,
      attempts: this.attemptHistory
    };
  }
}

/**
 * Create exploratory strategy
 *
 * @param {Object} [options] - Same options as the `ExploratoryStrategy` constructor
 * @returns {ExploratoryStrategy} A new strategy instance
 */
export function createExploratoryStrategy(options = {}) {
  return new ExploratoryStrategy(options);
}
|
|
131
|
+
|
package/src/utils/index.mjs
CHANGED
|
@@ -173,3 +173,13 @@ export {
|
|
|
173
173
|
initHumanValidation
|
|
174
174
|
} from '../human-validation-manager.mjs';
|
|
175
175
|
|
|
176
|
+
// Browser automation utilities
|
|
177
|
+
export * from './counterfactual-tester.mjs';
|
|
178
|
+
export * from './capability-stratifier.mjs';
|
|
179
|
+
export * from './baseline-validator.mjs';
|
|
180
|
+
export * from './intent-recognizer.mjs';
|
|
181
|
+
export * from './action-hallucination-detector.mjs';
|
|
182
|
+
export * from './exploratory-automation.mjs';
|
|
183
|
+
export * from './error-recovery.mjs';
|
|
184
|
+
export * from './explainability-scorer.mjs';
|
|
185
|
+
|