@bbearai/ai-executor 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/chunk-WT22IQMS.mjs +175 -0
- package/dist/chunk-WT22IQMS.mjs.map +1 -0
- package/dist/cli.js +622 -129
- package/dist/cli.js.map +1 -1
- package/dist/index.d.mts +533 -8
- package/dist/index.d.ts +533 -8
- package/dist/index.js +1613 -131
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +1411 -130
- package/dist/index.mjs.map +1 -1
- package/dist/report-generator-EVZEB33O.mjs +7 -0
- package/dist/report-generator-EVZEB33O.mjs.map +1 -0
- package/package.json +5 -1
package/dist/index.mjs
CHANGED
|
@@ -1,9 +1,91 @@
|
|
|
1
|
+
import {
|
|
2
|
+
generateExplorationReport
|
|
3
|
+
} from "./chunk-WT22IQMS.mjs";
|
|
4
|
+
|
|
1
5
|
// src/runner.ts
|
|
2
6
|
import Anthropic from "@anthropic-ai/sdk";
|
|
3
|
-
import { z } from "zod";
|
|
4
7
|
|
|
5
8
|
// src/browser.ts
|
|
6
9
|
import { Stagehand } from "@browserbasehq/stagehand";
|
|
10
|
+
|
|
11
|
+
// src/supabase-auth.ts
|
|
12
|
+
/**
 * Derives the Supabase project ref from a project URL.
 *
 * The ref is taken to be the first dot-separated label of the URL's hostname
 * (for example "abc" for "https://abc.supabase.co").
 *
 * @param {string} supabaseUrl - Absolute Supabase project URL.
 * @returns {string} The first label of the hostname.
 * @throws {TypeError} If `supabaseUrl` cannot be parsed as an absolute URL.
 */
function extractProjectRef(supabaseUrl) {
  const { hostname } = new URL(supabaseUrl);
  const [firstLabel] = hostname.split(".");
  return firstLabel;
}
|
|
18
|
+
/**
 * Signs in against the Supabase auth REST endpoint with email + password.
 *
 * POSTs the credentials to `<supabaseUrl>/auth/v1/token?grant_type=password`
 * using the anon key as the `apikey` header and returns the parsed session.
 *
 * @param {{ supabaseUrl: string, anonKey: string, email: string, password: string }} auth
 *   Connection details and credentials.
 * @returns {Promise<object>} The session object returned by Supabase; it is
 *   guaranteed to carry a truthy `access_token`.
 * @throws {Error} If the HTTP status is not OK (message includes the status and
 *   the first 200 characters of the body), if the body is not valid JSON, or
 *   if the parsed body has no `access_token`.
 */
async function authenticateSupabase(auth) {
  const baseUrl = auth.supabaseUrl.replace(/\/$/, "");
  const response = await fetch(`${baseUrl}/auth/v1/token?grant_type=password`, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      "apikey": auth.anonKey
    },
    body: JSON.stringify({
      email: auth.email,
      password: auth.password
    })
  });
  if (!response.ok) {
    // Best effort: the body is only used to enrich the error message.
    const body = await response.text().catch(() => "");
    throw new Error(
      `Supabase auth failed (${response.status}): ${body.slice(0, 200)}`
    );
  }
  let session;
  try {
    session = await response.json();
  } catch (err) {
    // A 2xx response with a non-JSON body (e.g. a proxy HTML page) would
    // otherwise surface as an opaque SyntaxError.
    const reason = err instanceof Error ? err.message : String(err);
    throw new Error(`Supabase auth returned a non-JSON response: ${reason}`);
  }
  // `session` may legitimately parse to null / a primitive; treat that the
  // same as a missing token instead of throwing a TypeError.
  if (!session?.access_token) {
    throw new Error("Supabase auth returned no access_token");
  }
  return session;
}
|
|
43
|
+
/**
 * Writes a Supabase session into the page's localStorage.
 *
 * The entry is stored under `sb-<project ref>-auth-token` and contains the
 * token fields plus the user object, serialised as JSON. When the page has no
 * URL yet (empty or "about:blank") it is first navigated to the Supabase URL;
 * a failure of that navigation is ignored.
 *
 * NOTE(review): localStorage is scoped per origin, so after the fallback
 * navigation the token is stored under the Supabase origin — confirm that the
 * application under test actually reads it from there.
 *
 * @param {object} page - Browser page exposing `url()`, `goto()` and `evaluate()`.
 * @param {{ supabaseUrl: string }} auth - Supabase connection details.
 * @param {object} session - Session as returned by the Supabase token endpoint.
 * @returns {Promise<void>}
 */
async function injectSupabaseAuth(page, auth, session) {
  const projectRef = extractProjectRef(auth.supabaseUrl);
  const { access_token, refresh_token, expires_in, expires_at, token_type, user } = session;
  const payload = {
    key: `sb-${projectRef}-auth-token`,
    value: JSON.stringify({
      access_token,
      refresh_token,
      expires_in,
      expires_at,
      token_type,
      user
    })
  };

  const currentUrl = page.url();
  const pageIsBlank = !currentUrl || currentUrl === "about:blank";
  if (pageIsBlank) {
    const landingUrl = auth.supabaseUrl.replace(/\/$/, "");
    const navigation = page.goto(landingUrl, {
      waitUntil: "domcontentloaded",
      timeoutMs: 1e4
    });
    // Navigation is best effort; the storage write below is attempted anyway.
    await navigation.catch(() => {
    });
  }

  await page.evaluate(
    ({ key, value }) => {
      localStorage.setItem(key, value);
    },
    payload
  );
}
|
|
69
|
+
/**
 * Checks an access token against the Supabase `/auth/v1/user` endpoint.
 *
 * @param {{ supabaseUrl: string, anonKey: string }} auth - Supabase connection details.
 * @param {string} accessToken - Token sent as a Bearer credential.
 * @returns {Promise<boolean>} The `ok` flag of the HTTP response.
 */
async function verifySupabaseSession(auth, accessToken) {
  const endpoint = `${auth.supabaseUrl.replace(/\/$/, "")}/auth/v1/user`;
  const headers = {
    "Authorization": `Bearer ${accessToken}`,
    "apikey": auth.anonKey
  };
  const { ok } = await fetch(endpoint, { headers });
  return ok;
}
|
|
79
|
+
/**
 * Runs the full Supabase sign-in flow for a page: authenticate, inject the
 * session into the page, then verify the access token.
 *
 * @param {object} page - Browser page that receives the session.
 * @param {object} auth - Supabase connection details and credentials.
 * @returns {Promise<void>}
 * @throws {Error} If authentication fails or the issued token is rejected by
 *   the verification request.
 */
async function performSupabaseAuth(page, auth) {
  const session = await authenticateSupabase(auth);
  await injectSupabaseAuth(page, auth, session);

  const tokenAccepted = await verifySupabaseSession(auth, session.access_token);
  if (tokenAccepted) {
    return;
  }
  throw new Error("Supabase auth verification failed \u2014 session token rejected");
}
|
|
87
|
+
|
|
88
|
+
// src/browser.ts
|
|
7
89
|
var DEFAULT_MODEL = "anthropic/claude-sonnet-4-20250514";
|
|
8
90
|
async function createStagehandSession(config, anthropicApiKey) {
|
|
9
91
|
const modelName = config.model ?? DEFAULT_MODEL;
|
|
@@ -16,6 +98,11 @@ async function createStagehandSession(config, anthropicApiKey) {
|
|
|
16
98
|
modelName,
|
|
17
99
|
apiKey: anthropicApiKey
|
|
18
100
|
},
|
|
101
|
+
// Bypass pino logger — its pino-pretty transport uses worker threads
|
|
102
|
+
// which fail in Vercel's serverless environment
|
|
103
|
+
logger: (msg) => {
|
|
104
|
+
if ((msg.level ?? 0) >= 40) console.warn("[Stagehand]", msg.message);
|
|
105
|
+
},
|
|
19
106
|
localBrowserLaunchOptions: config.provider === "local" ? {
|
|
20
107
|
headless: config.headless ?? true,
|
|
21
108
|
viewport
|
|
@@ -39,6 +126,21 @@ async function createStagehandSession(config, anthropicApiKey) {
|
|
|
39
126
|
}
|
|
40
127
|
};
|
|
41
128
|
}
|
|
129
|
+
/**
 * Registers an init script on the Stagehand browser context that flags the
 * BugBear widget as suppressed.
 *
 * The script sets `window.__bugbear_suppress` and the `__bugbear_suppress`
 * localStorage key. The whole operation is best effort: a missing context,
 * a context without `addInitScript`, or any thrown error is ignored.
 *
 * @param {object} stagehand - Stagehand instance; its `context` is used if present.
 * @returns {Promise<void>}
 */
async function suppressBugBearWidget(stagehand) {
  const markSuppressed = () => {
    window.__bugbear_suppress = true;
    try {
      localStorage.setItem("__bugbear_suppress", "true");
    } catch {
    }
  };

  try {
    const browserContext = stagehand.context;
    if (!browserContext?.addInitScript) {
      return;
    }
    await browserContext.addInitScript(markSuppressed);
  } catch {
  }
}
|
|
42
144
|
async function injectAuth(page, auth, stagehand) {
|
|
43
145
|
if (auth.type === "cookie") {
|
|
44
146
|
for (const c of auth.cookies) {
|
|
@@ -64,23 +166,123 @@ async function injectAuth(page, auth, stagehand) {
|
|
|
64
166
|
}, auth.items);
|
|
65
167
|
} else if (auth.type === "form-login") {
|
|
66
168
|
await performFormLogin(page, auth, stagehand);
|
|
169
|
+
} else if (auth.type === "supabase-native") {
|
|
170
|
+
await performSupabaseAuth(page, auth);
|
|
67
171
|
}
|
|
68
172
|
}
|
|
173
|
+
/**
 * Creates a start/stop-able recorder for a page's network traffic.
 *
 * While active it records every response (except image, stylesheet, font and
 * media resources) and every failed request. Responses with status >= 400 and
 * failed requests are additionally recorded as errors. URLs and bodies are
 * truncated to 500 characters.
 *
 * Page, request and response members are read defensively: each may be either
 * a method or a plain property.
 *
 * @param {object} page - Page exposing `on(event, handler)` and `off(event, handler)`.
 * @returns {{
 *   start: () => void,
 *   stop: () => void,
 *   getRequests: () => object[],
 *   getErrors: () => object[]
 * }} `start()` clears previous data and begins recording; `stop()` ends
 *   recording; the getters return copies of the recorded entries.
 */
function createNetworkCapture(page) {
  const SKIPPED_RESOURCE_TYPES = ["image", "stylesheet", "font", "media"];
  const BODY_CARRYING_METHODS = ["POST", "PUT", "PATCH"];
  const MAX_FIELD_LENGTH = 500;

  const requests = [];
  const errors = [];
  let active = false;
  let startTimestamp = Date.now();

  // Reads `target[name]`, invoking it (with `target` as `this`) when it is a method.
  const readMember = (target, name) => {
    const member = target?.[name];
    return typeof member === "function" ? target[name]() : member;
  };

  const onResponse = async (response) => {
    if (!active) return;
    // This handler is async and invoked by an event emitter, so nothing awaits
    // it: an escaping error would become an unhandled rejection. Capture is
    // diagnostic only, so a response that cannot be inspected is dropped.
    try {
      const req = readMember(response, "request");
      if (SKIPPED_RESOURCE_TYPES.includes(readMember(req, "resourceType"))) return;

      const entry = {
        method: String(readMember(req, "method") ?? "GET"),
        url: String(readMember(response, "url") ?? "").slice(0, MAX_FIELD_LENGTH),
        status: Number(readMember(response, "status") ?? 0),
        timestamp: (/* @__PURE__ */ new Date()).toISOString()
      };

      if (entry.status >= 400) {
        try {
          const body = await response.text();
          entry.responseBody = body.slice(0, MAX_FIELD_LENGTH);
        } catch {
          // Body not available (e.g. already disposed); keep the entry without it.
        }
        errors.push({
          method: entry.method,
          url: entry.url,
          status: entry.status,
          statusText: String(readMember(response, "statusText") ?? ""),
          timestamp: Date.now() - startTimestamp
        });
      }

      if (BODY_CARRYING_METHODS.includes(entry.method)) {
        try {
          const postData = readMember(req, "postData");
          if (postData) entry.requestBody = String(postData).slice(0, MAX_FIELD_LENGTH);
        } catch {
          // Request body not available; keep the entry without it.
        }
      }

      requests.push(entry);
    } catch {
      // Uninspectable response: skip it rather than crash the run.
    }
  };

  const onRequestFailed = (req) => {
    if (!active) return;
    const failure = readMember(req, "failure");
    errors.push({
      method: String(readMember(req, "method") ?? "GET"),
      url: String(readMember(req, "url") ?? "").slice(0, MAX_FIELD_LENGTH),
      status: 0,
      statusText: failure?.errorText ?? "Request failed",
      timestamp: Date.now() - startTimestamp
    });
  };

  // `attached` makes start() idempotent: calling it twice must not register
  // the same handler twice, which would record every response twice.
  const listeners = [
    { event: "response", handler: onResponse, supported: true, attached: false },
    { event: "requestfailed", handler: onRequestFailed, supported: true, attached: false }
  ];

  return {
    start() {
      active = true;
      requests.length = 0;
      errors.length = 0;
      startTimestamp = Date.now();
      for (const listener of listeners) {
        if (!listener.supported || listener.attached) continue;
        try {
          page.on(listener.event, listener.handler);
          listener.attached = true;
        } catch {
          // The page does not support this event; do not retry on later starts.
          listener.supported = false;
        }
      }
    },
    stop() {
      active = false;
      for (const listener of listeners) {
        if (!listener.attached) continue;
        try {
          page.off(listener.event, listener.handler);
          listener.attached = false;
        } catch {
          // Could not detach: the handler stays registered (and inert while
          // inactive), so `attached` stays true to avoid a duplicate later.
        }
      }
    },
    getRequests: () => [...requests],
    getErrors: () => [...errors]
  };
}
|
|
69
269
|
async function performFormLogin(page, auth, stagehand) {
|
|
70
270
|
await page.goto(auth.loginUrl, { waitUntil: "domcontentloaded" });
|
|
71
271
|
await page.waitForLoadState("networkidle", 15e3).catch(() => {
|
|
72
272
|
});
|
|
273
|
+
await fillLoginCredentials(page, auth);
|
|
73
274
|
if (stagehand) {
|
|
74
275
|
await stagehand.act(
|
|
75
|
-
|
|
76
|
-
)
|
|
276
|
+
"Click the login, sign-in, or submit button to submit the form."
|
|
277
|
+
).catch(() => {
|
|
278
|
+
});
|
|
77
279
|
} else {
|
|
78
|
-
await
|
|
280
|
+
await clickSubmitButton(page);
|
|
79
281
|
}
|
|
80
282
|
await page.waitForLoadState("networkidle", 15e3).catch(() => {
|
|
81
283
|
});
|
|
82
284
|
}
|
|
83
|
-
async function
|
|
285
|
+
async function fillLoginCredentials(page, auth) {
|
|
84
286
|
await page.waitForSelector(
|
|
85
287
|
'input[type="email"], input[type="text"][name*="email"], input[name*="user"], input[type="text"]',
|
|
86
288
|
{ timeout: 15e3 }
|
|
@@ -114,6 +316,8 @@ async function manualFormLogin(page, auth) {
|
|
|
114
316
|
} else {
|
|
115
317
|
throw new Error("Could not find password input on login page");
|
|
116
318
|
}
|
|
319
|
+
}
|
|
320
|
+
async function clickSubmitButton(page) {
|
|
117
321
|
const submitSelectors = [
|
|
118
322
|
'button[type="submit"]',
|
|
119
323
|
'input[type="submit"]'
|
|
@@ -138,21 +342,23 @@ async function generateRunSummary(anthropic, testTitle, steps, model) {
|
|
|
138
342
|
(s) => `Step ${s.stepNumber}: ${s.action}
|
|
139
343
|
Expected: ${s.expectedResult}
|
|
140
344
|
Actual: ${s.actualResult}
|
|
141
|
-
Result: ${s.passed ? "PASS" : "FAIL"} (confidence: ${Math.round(s.confidence * 100)}%)${s.error ? `
|
|
345
|
+
Result: ${s.skipped ? "SKIPPED" : s.passed ? "PASS" : "FAIL"} (confidence: ${Math.round(s.confidence * 100)}%)${s.error ? `
|
|
142
346
|
Error: ${s.error}` : ""}`
|
|
143
347
|
).join("\n\n");
|
|
144
|
-
const passCount = steps.filter((s) => s.passed).length;
|
|
145
|
-
const failCount = steps.filter((s) => !s.passed).length;
|
|
348
|
+
const passCount = steps.filter((s) => s.passed && !s.skipped).length;
|
|
349
|
+
const failCount = steps.filter((s) => !s.passed && !s.skipped).length;
|
|
350
|
+
const skipCount = steps.filter((s) => s.skipped).length;
|
|
351
|
+
const skipNote = skipCount > 0 ? " Some steps were skipped due to page state recovery \u2014 these are not failures, just steps that could not be executed." : "";
|
|
146
352
|
const response = await anthropic.messages.create({
|
|
147
353
|
model,
|
|
148
354
|
max_tokens: 512,
|
|
149
355
|
messages: [
|
|
150
356
|
{
|
|
151
357
|
role: "user",
|
|
152
|
-
content: `Summarize this AI test execution in 2-3 sentences. Focus on what was tested, what passed, and what failed (if anything)
|
|
358
|
+
content: `Summarize this AI test execution in 2-3 sentences. Focus on what was tested, what passed, and what failed (if anything).${skipNote} Be concise and factual.
|
|
153
359
|
|
|
154
360
|
Test: ${testTitle}
|
|
155
|
-
Results: ${passCount} passed, ${failCount} failed out of ${steps.length} steps
|
|
361
|
+
Results: ${passCount} passed, ${failCount} failed, ${skipCount} skipped out of ${steps.length} steps
|
|
156
362
|
|
|
157
363
|
${stepsText}`
|
|
158
364
|
}
|
|
@@ -161,7 +367,355 @@ ${stepsText}`
|
|
|
161
367
|
return response.content.filter((block) => block.type === "text").map((block) => block.text).join("");
|
|
162
368
|
}
|
|
163
369
|
|
|
370
|
+
// src/vision-evaluator.ts
|
|
371
|
+
// Model used for vision evaluation when the caller does not specify one.
var DEFAULT_MODEL2 = "claude-sonnet-4-20250514";

/**
 * Asks the model to judge a test step from BEFORE/AFTER screenshots.
 *
 * Sends both screenshots (base64 PNG) plus a QA-evaluator prompt in a single
 * user message, concatenates the text blocks of the reply and hands the text
 * to `parseEvaluation`.
 *
 * @param {object} input
 * @param {object} input.anthropic - Client exposing `messages.create()`.
 * @param {{ toString(encoding: string): string }} input.screenshotBefore - Screenshot taken before the action.
 * @param {{ toString(encoding: string): string }} input.screenshotAfter - Screenshot taken after the action.
 * @param {string} input.action - Description of the action performed.
 * @param {string} input.expectedResult - Expected outcome of the action.
 * @param {string} [input.evaluationHint] - Optional extra guidance appended to the prompt.
 * @param {string} [input.model] - Model override; defaults to DEFAULT_MODEL2.
 * @returns {Promise<object>} Whatever `parseEvaluation` returns for the reply text.
 */
async function evaluateStep(input) {
  const textBlock = (text) => ({ type: "text", text });
  const pngBlock = (screenshot) => ({
    type: "image",
    source: {
      type: "base64",
      media_type: "image/png",
      data: screenshot.toString("base64")
    }
  });

  const hintClause = input.evaluationHint ? `\nEVALUATION HINT: ${input.evaluationHint}` : "";

  // NOTE(review): the diff rendering this was recovered from strips leading
  // whitespace, so any indentation inside this template could not be
  // recovered — confirm against the original source.
  const instructions = `You are a QA test evaluator. Compare the BEFORE and AFTER screenshots to evaluate this test step.

ACTION PERFORMED: ${input.action}
EXPECTED RESULT: ${input.expectedResult}${hintClause}

Analyze the visual differences between the two screenshots and determine if the expected result was achieved.

Respond with ONLY a JSON object (no markdown, no explanation outside the JSON):
{
"passed": true/false,
"confidence": 0.0-1.0,
"actualResult": "Brief description of what actually changed between the screenshots"
}

Confidence guide:
- 0.95-1.0: Clearly achieved/not achieved, obvious visual evidence
- 0.8-0.94: Very likely, strong visual indicators
- 0.6-0.79: Probable but some ambiguity
- Below 0.6: Uncertain, hard to tell from screenshots alone`;

  const userMessage = {
    role: "user",
    content: [
      textBlock("BEFORE screenshot (page state before the action):"),
      pngBlock(input.screenshotBefore),
      textBlock("AFTER screenshot (page state after the action):"),
      pngBlock(input.screenshotAfter),
      textBlock(instructions)
    ]
  };

  const response = await input.anthropic.messages.create({
    model: input.model ?? DEFAULT_MODEL2,
    max_tokens: 512,
    messages: [userMessage]
  });

  let replyText = "";
  for (const block of response.content) {
    if (block.type === "text") {
      replyText += block.text;
    }
  }
  return parseEvaluation(replyText);
}
|
|
436
|
+
/**
 * Parses the model's reply into an evaluation result.
 *
 * First tries the whole (trimmed) text as JSON. If that fails, searches the
 * text for an embedded JSON object that has a `passed` key, so that replies
 * wrapped in prose or markdown fences — including prose that itself contains
 * stray braces — are still understood. If nothing parses, returns a failed,
 * low-confidence result that quotes the first 200 characters of the reply.
 *
 * @param {string} text - Raw reply text from the model.
 * @returns {{ passed: boolean, confidence: number, actualResult: string }}
 */
function parseEvaluation(text) {
  try {
    return validateEvaluation(JSON.parse(text.trim()));
  } catch {
    // Fall through to the embedded-object search.
  }

  // Candidate spans run from each "{" to each later "}", widest first. A
  // single greedy first-"{"-to-last-"}" span breaks as soon as the
  // surrounding prose contains a brace of its own.
  const openings = [];
  const closings = [];
  for (let i = 0; i < text.length; i++) {
    if (text[i] === "{") openings.push(i);
    else if (text[i] === "}") closings.push(i);
  }
  for (const start of openings) {
    for (let c = closings.length - 1; c >= 0 && closings[c] > start; c--) {
      let candidate;
      try {
        candidate = JSON.parse(text.slice(start, closings[c] + 1));
      } catch {
        continue;
      }
      const looksLikeVerdict = candidate !== null && typeof candidate === "object" && Object.prototype.hasOwnProperty.call(candidate, "passed");
      if (!looksLikeVerdict) continue;
      try {
        return validateEvaluation(candidate);
      } catch {
        // Not usable; keep searching.
      }
    }
  }

  return {
    passed: false,
    confidence: 0.3,
    actualResult: `Vision evaluation returned unparseable response: ${text.slice(0, 200)}`
  };
}
|
|
456
|
+
/**
 * Normalises a parsed evaluation object into the expected shape.
 *
 * - `passed` is true only when the input value is the boolean `true`.
 * - `confidence` is clamped to [0, 1] when numeric, otherwise 0.5.
 * - `actualResult` is kept when it is a string, otherwise a placeholder.
 *
 * @param {object} parsed - Value produced by JSON-parsing the model reply.
 * @returns {{ passed: boolean, confidence: number, actualResult: string }}
 * @throws {TypeError} If `parsed` is null or undefined.
 */
function validateEvaluation(parsed) {
  const { passed, confidence, actualResult } = parsed;

  let normalizedConfidence = 0.5;
  if (typeof confidence === "number") {
    normalizedConfidence = Math.max(0, Math.min(1, confidence));
  }

  let description = "No description provided";
  if (typeof actualResult === "string") {
    description = actualResult;
  }

  return {
    passed: passed === true,
    confidence: normalizedConfidence,
    actualResult: description
  };
}
|
|
463
|
+
|
|
464
|
+
// src/action-executor.ts
|
|
465
|
+
/**
 * Executes one test step, preferring a deterministic selector-based action.
 *
 * When the step carries both `selector` and `actionType`, the Playwright-style
 * action is attempted first; if it throws, the step is retried through
 * Stagehand. Steps without both fields go straight to Stagehand.
 *
 * @param {object} page - Browser page used for deterministic actions.
 * @param {object} stagehand - Stagehand instance used for AI-driven actions.
 * @param {object} step - Step description (`action`, optional `selector`, `actionType`, ...).
 * @returns {Promise<{ deterministic: boolean, error?: string }>} `deterministic`
 *   is true only when the selector-based action succeeded. `error` is set when
 *   the Stagehand path failed; after a fallback it describes both failures.
 */
async function executeAction(page, stagehand, step) {
  const hasDeterministicTarget = Boolean(step.selector && step.actionType);
  if (!hasDeterministicTarget) {
    return executeStagehandAction(stagehand, step);
  }

  try {
    await executePlaywrightAction(page, step);
    return { deterministic: true };
  } catch (err) {
    const fallbackResult = await executeStagehandAction(stagehand, step);
    let combinedError = void 0;
    if (fallbackResult.error) {
      const playwrightReason = err instanceof Error ? err.message : String(err);
      combinedError = `Playwright failed (${playwrightReason}), Stagehand fallback also failed: ${fallbackResult.error}`;
    }
    return {
      deterministic: false,
      error: combinedError
    };
  }
}
|
|
480
|
+
/**
 * Performs a single selector-based action on the page.
 *
 * Supported `actionType` values:
 * - "click"    — clicks the element matched by `selector`.
 * - "fill"     — fills the element with `value` (empty string when absent).
 * - "select"   — sets the element's value in-page and dispatches a bubbling "change".
 * - "navigate" — navigates to `value`, or to `selector` when no value is given.
 * - "scroll"   — scrolls the element into view.
 * - "wait"     — waits for `selector` (timeout `waitMs`, default 10 s) or, with
 *                no selector, sleeps for `waitMs`.
 * - "assert"   — no-op here; nothing is executed for assertion steps.
 *
 * For every type except "wait", a truthy `waitMs` adds a pause after the action.
 *
 * Failures are thrown, not swallowed: "select" and "scroll" throw when the
 * selector matches nothing, and "select" throws when the requested value is
 * not accepted by the element.
 *
 * @param {object} page - Page exposing `locator`, `evaluate`, `goto`,
 *   `waitForSelector` and `waitForTimeout`.
 * @param {{ actionType: string, selector?: string, value?: string, waitMs?: number }} step
 * @returns {Promise<void>}
 * @throws {Error} On an unknown `actionType`, a missing navigation URL, a
 *   missing target element, or any error raised by the page.
 */
async function executePlaywrightAction(page, step) {
  const { actionType, selector, value, waitMs } = step;
  switch (actionType) {
    case "click": {
      await page.locator(selector).click();
      break;
    }
    case "fill": {
      await page.locator(selector).fill(value ?? "");
      break;
    }
    case "select": {
      await page.evaluate(
        ({ sel, val }) => {
          const el = document.querySelector(sel);
          if (!el) throw new Error(`Select element not found: ${sel}`);
          el.value = val;
          // A <select> silently ignores a value with no matching option;
          // surface that instead of reporting a successful action.
          if (el.value !== val) throw new Error(`Option not found for ${sel}: ${val}`);
          el.dispatchEvent(new Event("change", { bubbles: true }));
        },
        { sel: selector, val: value ?? "" }
      );
      break;
    }
    case "navigate": {
      const url = value ?? selector ?? "";
      if (!url) throw new Error("Navigate action requires a value or selector with the URL");
      await page.goto(url, { waitUntil: "domcontentloaded", timeoutMs: 15e3 });
      break;
    }
    case "scroll": {
      await page.evaluate((sel) => {
        const el = document.querySelector(sel);
        // Throw when nothing matches so the caller does not report a
        // deterministic success for an action that did nothing.
        if (!el) throw new Error(`Scroll target not found: ${sel}`);
        el.scrollIntoView({ behavior: "smooth", block: "center" });
      }, selector);
      break;
    }
    case "wait": {
      if (selector) {
        await page.waitForSelector(selector, { timeout: waitMs ?? 1e4 });
      } else if (waitMs) {
        await page.waitForTimeout(waitMs);
      }
      break;
    }
    case "assert": {
      // Nothing to execute for an assertion step.
      break;
    }
    default: {
      throw new Error(`Unknown actionType: ${actionType}`);
    }
  }
  // Optional settle time after the action; "wait" already consumed waitMs.
  if (waitMs && actionType !== "wait") {
    await page.waitForTimeout(waitMs);
  }
}
|
|
537
|
+
/**
 * Executes a step through Stagehand's natural-language `act()`.
 *
 * Never throws: a failure is reported through the `error` field instead.
 *
 * @param {object} stagehand - Stagehand instance exposing `act()`.
 * @param {{ action: string }} step - Step whose `action` text is passed to `act()`.
 * @returns {Promise<{ deterministic: false, error?: string }>} `error` is
 *   present only when `act()` threw.
 */
async function executeStagehandAction(stagehand, step) {
  const outcome = { deterministic: false };
  try {
    await stagehand.act(step.action);
  } catch (err) {
    outcome.error = err instanceof Error ? err.message : String(err);
  }
  return outcome;
}
|
|
548
|
+
|
|
549
|
+
// src/selector-discovery.ts
|
|
550
|
+
/**
 * Derives a CSS selector for the element the page last interacted with.
 *
 * The element is `document.__bbLastClicked` when set, otherwise the focused
 * element. Selector strategies are tried in this order:
 *   1. "data-testid" — a `data-testid` or `data-test-id` attribute;
 *   2. "id"          — the element id, unless it looks generated (`:r…:` or `react-…`);
 *   3. "role"        — the `role` attribute, narrowed by `aria-label` or `name` when present;
 *   4. "aria-label"  — the `aria-label` attribute;
 *   5. "css-path"    — up to four ancestor levels of `tag.class` joined by " > ".
 *
 * Attribute values, ids and class names are escaped so the generated selector
 * stays valid when they contain quotes or other special characters.
 *
 * @param {object} page - Page exposing `evaluate()`.
 * @returns {Promise<{
 *   selector: string,
 *   strategy: string,
 *   suggestedActionType: ("click"|"fill"|"select"|undefined),
 *   tagName: string,
 *   textContent: string
 * } | null>} The discovered selector, or null when there is no usable element
 *   or the in-page evaluation fails.
 */
async function discoverSelector(page) {
  try {
    // Everything inside this callback runs in the page, so helpers must be
    // defined inside it.
    return await page.evaluate(() => {
      const el = document.__bbLastClicked ?? document.activeElement;
      if (!el || el === document.body || el === document.documentElement) return null;

      // Escapes a value for use inside a double-quoted attribute selector.
      const quoteAttr = (raw) => `"${String(raw).replace(/\\/g, "\\\\").replace(/"/g, '\\"')}"`;
      // Escapes a value for use as a CSS identifier (id or class name).
      const escapeIdent = (raw) => {
        if (typeof CSS !== "undefined" && typeof CSS.escape === "function") return CSS.escape(raw);
        // Fallback where CSS.escape is unavailable.
        return String(raw).replace(
          /[^a-zA-Z0-9_-]|^\d/g,
          (ch) => /\d/.test(ch) ? `\\3${ch} ` : `\\${ch}`
        );
      };

      const tagName = el.tagName?.toLowerCase() ?? "unknown";
      const textContent = (el.textContent ?? "").trim().slice(0, 100);
      let selector = "";
      let strategy = "css-path";

      // Keep track of WHICH attribute held the test id: emitting
      // [data-testid=…] for a data-test-id element would match nothing.
      const testIdAttr = ["data-testid", "data-test-id"].find((attr) => el.getAttribute(attr));
      const role = el.getAttribute("role");
      const ariaLabel = el.getAttribute("aria-label");

      if (testIdAttr) {
        selector = `[${testIdAttr}=${quoteAttr(el.getAttribute(testIdAttr))}]`;
        strategy = "data-testid";
      } else if (el.id && !/^:r[0-9a-z]+:?$/.test(el.id) && !/^react-/.test(el.id)) {
        selector = `#${escapeIdent(el.id)}`;
        strategy = "id";
      } else if (role) {
        const nameAttr = el.getAttribute("name");
        selector = `[role=${quoteAttr(role)}]`;
        if (ariaLabel) {
          selector += `[aria-label=${quoteAttr(ariaLabel)}]`;
        } else if (nameAttr) {
          // The accessible name came from `name`, so match on `name`.
          selector += `[name=${quoteAttr(nameAttr)}]`;
        }
        strategy = "role";
      } else if (ariaLabel) {
        selector = `[aria-label=${quoteAttr(ariaLabel)}]`;
        strategy = "aria-label";
      } else {
        const parts = [];
        let current = el;
        while (current && current !== document.body) {
          let part = current.tagName.toLowerCase();
          if (current.className && typeof current.className === "string") {
            // Skip classes starting with "_" and very long class names.
            const classes = current.className.split(/\s+/).filter(
              (c) => c && !c.startsWith("_") && c.length < 30
            );
            if (classes.length > 0) {
              part += `.${escapeIdent(classes[0])}`;
            }
          }
          parts.unshift(part);
          current = current.parentElement;
          if (parts.length >= 4) break;
        }
        selector = parts.join(" > ");
        strategy = "css-path";
      }

      let suggestedActionType;
      if (tagName === "button" || tagName === "a" || role === "button") {
        suggestedActionType = "click";
      } else if (tagName === "input" || tagName === "textarea") {
        const type = el.getAttribute("type") ?? "text";
        suggestedActionType = type === "checkbox" || type === "radio" ? "click" : "fill";
      } else if (tagName === "select") {
        suggestedActionType = "select";
      }
      return { selector, strategy, suggestedActionType, tagName, textContent };
    });
  } catch {
    // Discovery is advisory; any evaluation failure means "no selector".
    return null;
  }
}
|
|
619
|
+
/**
 * Installs a capture-phase click listener in the page that remembers the most
 * recently clicked element on `document.__bbLastClicked`.
 *
 * Best effort: any failure to evaluate in the page is ignored.
 *
 * @param {object} page - Page exposing `evaluate()`.
 * @returns {Promise<void>}
 */
async function installClickTracker(page) {
  const attachTracker = () => {
    const rememberTarget = (event) => {
      document.__bbLastClicked = event.target;
    };
    document.addEventListener("click", rememberTarget, { capture: true });
  };

  try {
    await page.evaluate(attachTracker);
  } catch {
  }
}
|
|
629
|
+
|
|
630
|
+
// src/cost.ts
|
|
631
|
+
// Price table keyed by model id or short alias. `input` / `output` are the
// per-token-class rates; estimateCost applies them per 1e6 tokens and formats
// the result with a "$" prefix.
var MODEL_PRICING = (() => {
  const sonnetRates = () => ({ input: 3, output: 15 });
  const haikuRates = () => ({ input: 0.8, output: 4 });
  const opusRates = () => ({ input: 15, output: 75 });
  return {
    "claude-sonnet-4-20250514": sonnetRates(),
    "claude-haiku-4-20250514": haikuRates(),
    "claude-opus-4-20250514": opusRates(),
    // Aliases
    "sonnet": sonnetRates(),
    "haiku": haikuRates(),
    "opus": opusRates()
  };
})();

// Model whose pricing is used when none is given or the given one is unknown.
var DEFAULT_MODEL3 = "claude-sonnet-4-20250514";

// Assumed token usage per operation, used to estimate cost before a run.
var TOKEN_PROFILE = {
  /** act() — screenshot + DOM context → action decision */
  actInput: 2e3,
  actOutput: 200,
  /** extract() — screenshot + extraction schema → structured result */
  extractInput: 3e3,
  extractOutput: 500,
  /** summary — all step results → narrative summary (once per run) */
  summaryInput: 2e3,
  summaryOutput: 500
};
|
|
652
|
+
/**
 * Estimates the cost of a given token usage for a model.
 *
 * Pricing is looked up in MODEL_PRICING by the exact model name first, then
 * by the name with any provider prefix removed (e.g.
 * "anthropic/claude-opus-4-20250514" → "claude-opus-4-20250514"), and finally
 * falls back to the pricing of DEFAULT_MODEL3.
 *
 * @param {number} inputTokens - Number of input tokens.
 * @param {number} outputTokens - Number of output tokens.
 * @param {string} [model] - Model id or alias; defaults to DEFAULT_MODEL3.
 * @returns {{
 *   cents: number,
 *   formatted: string,
 *   tokens: { inputTokens: number, outputTokens: number },
 *   model: string
 * }} `cents` is rounded to two decimals, `formatted` is dollars with four
 *   decimals, and `model` is the name as resolved (not normalised).
 */
function estimateCost(inputTokens, outputTokens, model) {
  const resolvedModel = model ?? DEFAULT_MODEL3;

  // Own-property lookup only: a plain index would also hit inherited keys
  // such as "toString" and yield NaN costs.
  const pricingFor = (name) => Object.prototype.hasOwnProperty.call(MODEL_PRICING, name) ? MODEL_PRICING[name] : void 0;
  const modelName = String(resolvedModel);
  const unprefixedName = modelName.slice(modelName.lastIndexOf("/") + 1);
  const pricing = pricingFor(modelName) ?? pricingFor(unprefixedName) ?? MODEL_PRICING[DEFAULT_MODEL3];

  const inputCost = inputTokens / 1e6 * pricing.input;
  const outputCost = outputTokens / 1e6 * pricing.output;
  const totalDollars = inputCost + outputCost;
  const cents = Math.round(totalDollars * 100 * 100) / 100;
  return {
    cents,
    formatted: `$${totalDollars.toFixed(4)}`,
    tokens: { inputTokens, outputTokens },
    model: resolvedModel
  };
}
|
|
666
|
+
/**
 * Estimates the cost of running one test with the given number of steps.
 *
 * Token usage comes from the per-step act + extract profile plus one summary
 * per run, and is then priced by estimateCost.
 *
 * @param {number} stepCount - Number of steps in the test.
 * @param {string} [model] - Model id or alias used for pricing.
 * @returns {object} The estimateCost result for the projected usage.
 */
function estimateTestCost(stepCount, model) {
  const perStepInput = TOKEN_PROFILE.actInput + TOKEN_PROFILE.extractInput;
  const perStepOutput = TOKEN_PROFILE.actOutput + TOKEN_PROFILE.extractOutput;
  const projectedInput = stepCount * perStepInput + TOKEN_PROFILE.summaryInput;
  const projectedOutput = stepCount * perStepOutput + TOKEN_PROFILE.summaryOutput;
  return estimateCost(projectedInput, projectedOutput, model);
}
|
|
671
|
+
/**
 * Estimates the combined cost of running several tests.
 *
 * Each test contributes its per-step act + extract usage plus one summary;
 * the totals are priced once by estimateCost.
 *
 * @param {Iterable<{ stepCount: number }>} testCases - Tests to include.
 * @param {string} [model] - Model id or alias used for pricing.
 * @returns {object} The estimateCost result for the summed usage.
 */
function estimateBatchCost(testCases, model) {
  const perStepInput = TOKEN_PROFILE.actInput + TOKEN_PROFILE.extractInput;
  const perStepOutput = TOKEN_PROFILE.actOutput + TOKEN_PROFILE.extractOutput;
  const totals = { input: 0, output: 0 };
  for (const { stepCount } of testCases) {
    totals.input += stepCount * perStepInput + TOKEN_PROFILE.summaryInput;
    totals.output += stepCount * perStepOutput + TOKEN_PROFILE.summaryOutput;
  }
  return estimateCost(totals.input, totals.output, model);
}
|
|
680
|
+
/**
 * Projects token usage for a test with the given number of steps.
 *
 * Each step is counted as one act plus one extract; one summary is added per run.
 *
 * @param {number} stepCount - Number of steps in the test.
 * @returns {{ inputTokens: number, outputTokens: number }}
 */
function getTokenEstimate(stepCount) {
  const { actInput, extractInput, summaryInput, actOutput, extractOutput, summaryOutput } = TOKEN_PROFILE;
  const inputTokens = stepCount * (actInput + extractInput) + summaryInput;
  const outputTokens = stepCount * (actOutput + extractOutput) + summaryOutput;
  return { inputTokens, outputTokens };
}
|
|
686
|
+
|
|
164
687
|
// src/runner.ts
|
|
688
|
+
// Timeout for a single AI operation, in milliseconds.
// NOTE(review): the call sites are outside this view — confirm which operations use it.
var AI_OPERATION_TIMEOUT_MS = 30 * 1e3;
// Retries per step when `config.retry.maxRetries` is not set.
var DEFAULT_MAX_RETRIES = 2;
// Delay in milliseconds used when `config.retry.retryDelayMs` is not set.
var DEFAULT_RETRY_DELAY_MS = 2 * 1e3;
|
|
691
|
+
function isRetryableError(error) {
|
|
692
|
+
const patterns = [
|
|
693
|
+
/timed?\s*out/i,
|
|
694
|
+
/ECONNREFUSED/i,
|
|
695
|
+
/ECONNRESET/i,
|
|
696
|
+
/ENOTFOUND/i,
|
|
697
|
+
/net::ERR_/i,
|
|
698
|
+
/navigation failed/i,
|
|
699
|
+
/page crashed/i,
|
|
700
|
+
/context was destroyed/i,
|
|
701
|
+
/target closed/i,
|
|
702
|
+
/session closed/i,
|
|
703
|
+
/browser disconnected/i,
|
|
704
|
+
/execution context/i
|
|
705
|
+
];
|
|
706
|
+
return patterns.some((p) => p.test(error));
|
|
707
|
+
}
|
|
708
|
+
/**
 * Races a promise against a timer.
 *
 * Settles with the promise's outcome if it settles first; otherwise rejects
 * once the timer fires. The timer is always cleared afterwards. The underlying
 * operation is not cancelled when the timeout wins.
 *
 * @param {Promise<T>} promise - Operation to wait for.
 * @param {number} timeoutMs - Maximum time to wait, in milliseconds.
 * @param {string} operation - Label used in the timeout error message.
 * @returns {Promise<T>} The value the promise resolved with.
 * @throws {Error} `<operation> timed out after <timeoutMs>ms` on timeout, or
 *   whatever the promise rejected with.
 * @template T
 */
async function withTimeout(promise, timeoutMs, operation) {
  let timer;
  const deadline = new Promise((_resolve, reject) => {
    const expire = () => reject(new Error(`${operation} timed out after ${timeoutMs}ms`));
    timer = setTimeout(expire, timeoutMs);
  });
  try {
    const settled = await Promise.race([promise, deadline]);
    return settled;
  } finally {
    clearTimeout(timer);
  }
}
|
|
165
719
|
async function runTest(config) {
|
|
166
720
|
const anthropic = new Anthropic({ apiKey: config.anthropicApiKey });
|
|
167
721
|
const startTime = Date.now();
|
|
@@ -170,60 +724,71 @@ async function runTest(config) {
|
|
|
170
724
|
headless: true
|
|
171
725
|
};
|
|
172
726
|
config.onStatusChange?.("initializing");
|
|
173
|
-
|
|
174
|
-
const { stagehand, page } = session;
|
|
727
|
+
let session;
|
|
175
728
|
const stepResults = [];
|
|
176
729
|
let pendingConsoleLogs = [];
|
|
177
730
|
let pendingNetworkErrors = [];
|
|
178
731
|
let stepStartTime = Date.now();
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
const
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
pendingNetworkErrors.push({
|
|
195
|
-
method,
|
|
196
|
-
url: url.slice(0, 500),
|
|
197
|
-
status: 0,
|
|
198
|
-
statusText: failure?.errorText ?? "Request failed",
|
|
199
|
-
timestamp: Date.now() - stepStartTime
|
|
200
|
-
});
|
|
201
|
-
});
|
|
202
|
-
rawPage.on("response", (res) => {
|
|
203
|
-
const status = typeof res.status === "function" ? res.status() : Number(res.status ?? 0);
|
|
204
|
-
if (status >= 400) {
|
|
205
|
-
const url = typeof res.url === "function" ? res.url() : String(res.url ?? "");
|
|
206
|
-
const statusText = typeof res.statusText === "function" ? res.statusText() : String(res.statusText ?? "");
|
|
207
|
-
const req = typeof res.request === "function" ? res.request() : res.request;
|
|
208
|
-
const method = req ? typeof req.method === "function" ? req.method() : String(req.method ?? "GET") : "GET";
|
|
209
|
-
pendingNetworkErrors.push({
|
|
210
|
-
method,
|
|
211
|
-
url: url.slice(0, 500),
|
|
212
|
-
status,
|
|
213
|
-
statusText,
|
|
214
|
-
timestamp: Date.now() - stepStartTime
|
|
732
|
+
try {
|
|
733
|
+
session = await createStagehandSession(browserConfig, config.anthropicApiKey);
|
|
734
|
+
const { stagehand, page } = session;
|
|
735
|
+
await suppressBugBearWidget(stagehand);
|
|
736
|
+
const rawPage = page;
|
|
737
|
+
try {
|
|
738
|
+
rawPage.on("console", (msg) => {
|
|
739
|
+
const level = msg.type?.() ?? msg.type ?? "log";
|
|
740
|
+
const mappedLevel = level === "error" ? "error" : level === "warn" || level === "warning" ? "warning" : level === "info" ? "info" : level === "debug" ? "debug" : "log";
|
|
741
|
+
pendingConsoleLogs.push({
|
|
742
|
+
level: mappedLevel,
|
|
743
|
+
text: (typeof msg.text === "function" ? msg.text() : String(msg.text ?? msg)).slice(0, 2e3),
|
|
744
|
+
source: typeof msg.location === "function" ? msg.location()?.url : void 0,
|
|
745
|
+
timestamp: Date.now() - stepStartTime
|
|
746
|
+
});
|
|
215
747
|
});
|
|
748
|
+
} catch {
|
|
216
749
|
}
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
750
|
+
try {
|
|
751
|
+
rawPage.on("requestfailed", (req) => {
|
|
752
|
+
const url = typeof req.url === "function" ? req.url() : String(req.url ?? "");
|
|
753
|
+
const method = typeof req.method === "function" ? req.method() : String(req.method ?? "GET");
|
|
754
|
+
const failure = typeof req.failure === "function" ? req.failure() : req.failure;
|
|
755
|
+
pendingNetworkErrors.push({
|
|
756
|
+
method,
|
|
757
|
+
url: url.slice(0, 500),
|
|
758
|
+
status: 0,
|
|
759
|
+
statusText: failure?.errorText ?? "Request failed",
|
|
760
|
+
timestamp: Date.now() - stepStartTime
|
|
761
|
+
});
|
|
762
|
+
});
|
|
763
|
+
} catch {
|
|
764
|
+
}
|
|
765
|
+
try {
|
|
766
|
+
rawPage.on("response", (res) => {
|
|
767
|
+
const status = typeof res.status === "function" ? res.status() : Number(res.status ?? 0);
|
|
768
|
+
if (status >= 400) {
|
|
769
|
+
const url = typeof res.url === "function" ? res.url() : String(res.url ?? "");
|
|
770
|
+
const statusText = typeof res.statusText === "function" ? res.statusText() : String(res.statusText ?? "");
|
|
771
|
+
const req = typeof res.request === "function" ? res.request() : res.request;
|
|
772
|
+
const method = req ? typeof req.method === "function" ? req.method() : String(req.method ?? "GET") : "GET";
|
|
773
|
+
pendingNetworkErrors.push({
|
|
774
|
+
method,
|
|
775
|
+
url: url.slice(0, 500),
|
|
776
|
+
status,
|
|
777
|
+
statusText,
|
|
778
|
+
timestamp: Date.now() - stepStartTime
|
|
779
|
+
});
|
|
780
|
+
}
|
|
781
|
+
});
|
|
782
|
+
} catch {
|
|
783
|
+
}
|
|
784
|
+
if (config.auth?.type === "form-login" || config.auth?.type === "supabase-native") {
|
|
220
785
|
config.onStatusChange?.("authenticating");
|
|
221
786
|
await injectAuth(page, config.auth, stagehand);
|
|
222
787
|
}
|
|
223
788
|
config.onStatusChange?.("navigating");
|
|
224
789
|
const targetUrl = config.testCase.targetRoute ? `${config.targetUrl.replace(/\/$/, "")}${config.testCase.targetRoute}` : config.targetUrl;
|
|
225
790
|
await page.goto(targetUrl, { waitUntil: "domcontentloaded", timeoutMs: 3e4 });
|
|
226
|
-
if (config.auth && config.auth.type !== "form-login") {
|
|
791
|
+
if (config.auth && config.auth.type !== "form-login" && config.auth.type !== "supabase-native") {
|
|
227
792
|
config.onStatusChange?.("authenticating");
|
|
228
793
|
await injectAuth(page, config.auth, stagehand);
|
|
229
794
|
if (config.auth.type === "localStorage") {
|
|
@@ -237,79 +802,143 @@ async function runTest(config) {
|
|
|
237
802
|
}
|
|
238
803
|
await page.waitForLoadState("networkidle").catch(() => {
|
|
239
804
|
});
|
|
805
|
+
await page.evaluate(() => {
|
|
806
|
+
window.__bugbear_suppress = true;
|
|
807
|
+
try {
|
|
808
|
+
localStorage.setItem("__bugbear_suppress", "true");
|
|
809
|
+
} catch {
|
|
810
|
+
}
|
|
811
|
+
}).catch(() => {
|
|
812
|
+
});
|
|
813
|
+
await installClickTracker(page);
|
|
240
814
|
pendingConsoleLogs = [];
|
|
241
815
|
pendingNetworkErrors = [];
|
|
242
816
|
config.onStatusChange?.("executing");
|
|
243
817
|
const steps = config.testCase.steps;
|
|
818
|
+
const maxRetries = config.retry?.maxRetries ?? DEFAULT_MAX_RETRIES;
|
|
819
|
+
const retryDelayMs = config.retry?.retryDelayMs ?? DEFAULT_RETRY_DELAY_MS;
|
|
820
|
+
const resilientMode = config.resilientMode ?? true;
|
|
244
821
|
for (let i = 0; i < steps.length; i++) {
|
|
245
822
|
const step = steps[i];
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
actSucceeded =
|
|
256
|
-
await page
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
823
|
+
const retryHistory = [];
|
|
824
|
+
let finalResult;
|
|
825
|
+
for (let attempt = 0; attempt <= maxRetries; attempt++) {
|
|
826
|
+
stepStartTime = Date.now();
|
|
827
|
+
pendingConsoleLogs = [];
|
|
828
|
+
pendingNetworkErrors = [];
|
|
829
|
+
const screenshotBefore = await page.screenshot({ type: "png" });
|
|
830
|
+
let error;
|
|
831
|
+
let screenshotAfter = screenshotBefore;
|
|
832
|
+
let actSucceeded = false;
|
|
833
|
+
const actionResult = await executeAction(page, stagehand, step);
|
|
834
|
+
error = actionResult.error;
|
|
835
|
+
actSucceeded = !error;
|
|
836
|
+
if (actSucceeded) {
|
|
837
|
+
await page.waitForLoadState("networkidle").catch(() => {
|
|
838
|
+
});
|
|
839
|
+
await page.waitForTimeout(step.waitMs ?? 500);
|
|
840
|
+
}
|
|
262
841
|
screenshotAfter = await page.screenshot({ type: "png" }).catch(() => screenshotBefore);
|
|
842
|
+
let evaluation = {
|
|
843
|
+
passed: false,
|
|
844
|
+
confidence: 0,
|
|
845
|
+
actualResult: error ?? "Action execution failed"
|
|
846
|
+
};
|
|
847
|
+
if (actSucceeded) {
|
|
848
|
+
try {
|
|
849
|
+
const visionResult = await withTimeout(
|
|
850
|
+
evaluateStep({
|
|
851
|
+
anthropic,
|
|
852
|
+
screenshotBefore,
|
|
853
|
+
screenshotAfter,
|
|
854
|
+
action: step.action,
|
|
855
|
+
expectedResult: step.expectedResult,
|
|
856
|
+
evaluationHint: step.evaluationHint,
|
|
857
|
+
model: config.model
|
|
858
|
+
}),
|
|
859
|
+
AI_OPERATION_TIMEOUT_MS,
|
|
860
|
+
"Vision evaluation"
|
|
861
|
+
);
|
|
862
|
+
evaluation = {
|
|
863
|
+
passed: visionResult.passed,
|
|
864
|
+
confidence: visionResult.confidence,
|
|
865
|
+
actualResult: visionResult.actualResult
|
|
866
|
+
};
|
|
867
|
+
} catch (evalErr) {
|
|
868
|
+
evaluation = {
|
|
869
|
+
passed: false,
|
|
870
|
+
confidence: 0.2,
|
|
871
|
+
actualResult: `Vision evaluation error: ${evalErr instanceof Error ? evalErr.message : String(evalErr)}`
|
|
872
|
+
};
|
|
873
|
+
}
|
|
874
|
+
}
|
|
875
|
+
let discoveredActions = [];
|
|
876
|
+
if (actSucceeded && !actionResult.deterministic) {
|
|
877
|
+
const discovered = await discoverSelector(page);
|
|
878
|
+
if (discovered) {
|
|
879
|
+
discoveredActions = [{
|
|
880
|
+
type: discovered.suggestedActionType ?? "click",
|
|
881
|
+
selector: discovered.selector,
|
|
882
|
+
description: `Discovered via ${discovered.strategy}: ${discovered.tagName}${discovered.textContent ? ` "${discovered.textContent.slice(0, 50)}"` : ""}`
|
|
883
|
+
}];
|
|
884
|
+
}
|
|
885
|
+
}
|
|
886
|
+
const consoleLogs = pendingConsoleLogs.slice(0, 50);
|
|
887
|
+
const networkErrors = pendingNetworkErrors.slice(0, 30);
|
|
888
|
+
finalResult = {
|
|
889
|
+
stepNumber: step.stepNumber,
|
|
890
|
+
action: step.action,
|
|
891
|
+
expectedResult: step.expectedResult,
|
|
892
|
+
actualResult: evaluation.actualResult,
|
|
893
|
+
passed: evaluation.passed,
|
|
894
|
+
confidence: evaluation.confidence,
|
|
895
|
+
screenshotBefore,
|
|
896
|
+
screenshotAfter,
|
|
897
|
+
actionsTaken: discoveredActions,
|
|
898
|
+
error,
|
|
899
|
+
durationMs: Date.now() - stepStartTime,
|
|
900
|
+
consoleLogs,
|
|
901
|
+
networkErrors,
|
|
902
|
+
retryCount: attempt,
|
|
903
|
+
retryHistory,
|
|
904
|
+
skipped: false
|
|
905
|
+
};
|
|
906
|
+
const shouldRetry = !evaluation.passed && error && isRetryableError(error) && attempt < maxRetries;
|
|
907
|
+
if (!shouldRetry) break;
|
|
908
|
+
retryHistory.push({
|
|
909
|
+
attempt,
|
|
910
|
+
error,
|
|
911
|
+
confidence: evaluation.confidence,
|
|
912
|
+
timestamp: Date.now()
|
|
913
|
+
});
|
|
914
|
+
await new Promise((resolve) => setTimeout(resolve, retryDelayMs));
|
|
263
915
|
}
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
actualResult: error ?? "Action execution failed"
|
|
268
|
-
};
|
|
269
|
-
if (actSucceeded) {
|
|
916
|
+
if (resilientMode && finalResult && !finalResult.passed) {
|
|
917
|
+
finalResult.skipped = true;
|
|
918
|
+
finalResult.skipReason = "Step failed, recovered page state";
|
|
270
919
|
try {
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
920
|
+
config.onStatusChange?.("navigating");
|
|
921
|
+
await page.goto(targetUrl, { waitUntil: "domcontentloaded", timeoutMs: 3e4 });
|
|
922
|
+
await page.waitForLoadState("networkidle").catch(() => {
|
|
923
|
+
});
|
|
924
|
+
await installClickTracker(page);
|
|
925
|
+
await page.evaluate(() => {
|
|
926
|
+
window.__bugbear_suppress = true;
|
|
927
|
+
try {
|
|
928
|
+
localStorage.setItem("__bugbear_suppress", "true");
|
|
929
|
+
} catch {
|
|
930
|
+
}
|
|
931
|
+
}).catch(() => {
|
|
275
932
|
});
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
passed: verification.passed,
|
|
282
|
-
confidence: verification.confidence,
|
|
283
|
-
actualResult: verification.actualResult
|
|
284
|
-
};
|
|
285
|
-
} catch (evalErr) {
|
|
286
|
-
evaluation = {
|
|
287
|
-
passed: false,
|
|
288
|
-
confidence: 0.2,
|
|
289
|
-
actualResult: `Verification error: ${evalErr instanceof Error ? evalErr.message : String(evalErr)}`
|
|
290
|
-
};
|
|
933
|
+
pendingConsoleLogs = [];
|
|
934
|
+
pendingNetworkErrors = [];
|
|
935
|
+
config.onStatusChange?.("executing");
|
|
936
|
+
} catch (recoveryErr) {
|
|
937
|
+
finalResult.skipReason = `Step failed, recovery also failed: ${recoveryErr instanceof Error ? recoveryErr.message : String(recoveryErr)}`;
|
|
291
938
|
}
|
|
292
939
|
}
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
const result = {
|
|
296
|
-
stepNumber: step.stepNumber,
|
|
297
|
-
action: step.action,
|
|
298
|
-
expectedResult: step.expectedResult,
|
|
299
|
-
actualResult: evaluation.actualResult,
|
|
300
|
-
passed: evaluation.passed,
|
|
301
|
-
confidence: evaluation.confidence,
|
|
302
|
-
screenshotBefore,
|
|
303
|
-
screenshotAfter,
|
|
304
|
-
actionsTaken: [],
|
|
305
|
-
// Stagehand handles actions internally
|
|
306
|
-
error,
|
|
307
|
-
durationMs: Date.now() - stepStartTime,
|
|
308
|
-
consoleLogs,
|
|
309
|
-
networkErrors
|
|
310
|
-
};
|
|
311
|
-
stepResults.push(result);
|
|
312
|
-
config.onStepComplete?.(result, i, steps.length);
|
|
940
|
+
stepResults.push(finalResult);
|
|
941
|
+
config.onStepComplete?.(finalResult, i, steps.length);
|
|
313
942
|
}
|
|
314
943
|
config.onStatusChange?.("completed");
|
|
315
944
|
const model = config.model ?? "claude-sonnet-4-20250514";
|
|
@@ -323,11 +952,7 @@ async function runTest(config) {
|
|
|
323
952
|
totalDurationMs: Date.now() - startTime,
|
|
324
953
|
summary,
|
|
325
954
|
screenshotUrls: [],
|
|
326
|
-
tokenUsage:
|
|
327
|
-
// Stagehand tracks tokens internally; these are approximate
|
|
328
|
-
inputTokens: steps.length * 3e3,
|
|
329
|
-
outputTokens: steps.length * 500
|
|
330
|
-
},
|
|
955
|
+
tokenUsage: getTokenEstimate(steps.length),
|
|
331
956
|
browserSessionId: session.sessionId
|
|
332
957
|
};
|
|
333
958
|
} catch (err) {
|
|
@@ -339,29 +964,685 @@ async function runTest(config) {
|
|
|
339
964
|
totalDurationMs: Date.now() - startTime,
|
|
340
965
|
summary: `Test execution failed: ${err instanceof Error ? err.message : String(err)}`,
|
|
341
966
|
screenshotUrls: [],
|
|
342
|
-
tokenUsage:
|
|
343
|
-
|
|
344
|
-
outputTokens: stepResults.length * 500
|
|
345
|
-
},
|
|
346
|
-
browserSessionId: session.sessionId
|
|
967
|
+
tokenUsage: getTokenEstimate(stepResults.length),
|
|
968
|
+
browserSessionId: session?.sessionId ?? "unknown"
|
|
347
969
|
};
|
|
348
970
|
} finally {
|
|
349
|
-
|
|
971
|
+
if (session?.page) {
|
|
972
|
+
const rawPage = session.page;
|
|
973
|
+
rawPage.removeAllListeners?.("console");
|
|
974
|
+
rawPage.removeAllListeners?.("requestfailed");
|
|
975
|
+
rawPage.removeAllListeners?.("response");
|
|
976
|
+
}
|
|
977
|
+
await session?.close();
|
|
350
978
|
}
|
|
351
979
|
}
|
|
352
980
|
/**
 * Collapse the per-step results of a run into a single verdict.
 *
 * Steps flagged `skipped` are excluded from the pass/fail decision; they only
 * influence whether a fully passing run is reported as "passed_with_skips".
 *
 * @param {Array<{passed?: boolean, skipped?: boolean, error?: unknown}>} steps
 * @returns {"error"|"passed"|"passed_with_skips"|"failed"|"partial"}
 *   "error" when there is nothing to judge (no steps, or every step skipped).
 */
function determineOverallResult(steps) {
  const evaluated = [];
  let skippedTotal = 0;
  for (const step of steps) {
    if (step.skipped) {
      skippedTotal += 1;
    } else {
      evaluated.push(step);
    }
  }

  // Covers both "no steps at all" and "every step was skipped".
  if (evaluated.length === 0) {
    return "error";
  }

  let passedTotal = 0;
  let sawError = false;
  for (const step of evaluated) {
    if (step.passed) passedTotal += 1;
    if (step.error) sawError = true;
  }

  if (passedTotal === evaluated.length) {
    return skippedTotal > 0 ? "passed_with_skips" : "passed";
  }
  // Any recorded error, or no passing step at all, makes the run a failure.
  if (passedTotal === 0 || sawError) {
    return "failed";
  }
  return "partial";
}
|
|
992
|
+
|
|
993
|
+
// src/explorer.ts
|
|
994
|
+
import Anthropic2 from "@anthropic-ai/sdk";
|
|
995
|
+
// Default model id for runExploration; the "anthropic/" prefix is stripped before the id is sent to the Anthropic SDK.
var DEFAULT_MODEL4 = "anthropic/claude-sonnet-4-20250514";
|
|
996
|
+
// Timeout (60 000 ms) that runExploration passes to withTimeout2 for each observe / decide / evaluate call.
var AI_OPERATION_TIMEOUT_MS2 = 6e4;
|
|
997
|
+
/**
 * Await `promise`, but give up once `timeoutMs` milliseconds have elapsed.
 *
 * @param {Promise<*>} promise - The operation to wait for.
 * @param {number} timeoutMs - Time allowed before rejecting.
 * @param {string} operation - Label used in the timeout error message.
 * @returns {Promise<*>} The settled value of `promise`.
 * @throws {Error} "<operation> timed out after <timeoutMs>ms" when the timer
 *   fires first; otherwise whatever `promise` itself rejects with.
 */
async function withTimeout2(promise, timeoutMs, operation) {
  let timer;
  const deadline = new Promise((_resolve, reject) => {
    const onExpire = () => {
      reject(new Error(`${operation} timed out after ${timeoutMs}ms`));
    };
    timer = setTimeout(onExpire, timeoutMs);
  });

  try {
    const settled = await Promise.race([promise, deadline]);
    return settled;
  } finally {
    // Always cancel the timer so it cannot fire after the race is decided.
    clearTimeout(timer);
  }
}
|
|
1008
|
+
/**
 * Run an AI-driven exploratory QA session against `config.targetUrl`.
 *
 * For up to `actionBudget` iterations the function observes the page, asks the
 * model for the next action, performs it through Stagehand, and asks the model
 * to categorize the outcome. A report is generated from the collected actions.
 *
 * Failures after the browser session has been created are not thrown: they are
 * converted into a result with `overallResult: "error"`, and the session is
 * always closed. Only a failure to create the session itself rejects.
 *
 * @param {object} config
 * @param {string} config.targetUrl - Page to open first.
 * @param {string} config.featureDescription - Feature under test (fed to the prompts).
 * @param {number} config.actionBudget - Maximum number of actions to attempt.
 * @param {object} [config.auth] - When set, passed to injectAuth after the first navigation.
 * @param {object} config.browserConfig - Passed to createStagehandSession.
 * @param {string} config.anthropicApiKey - API key for the Anthropic client and the session.
 * @param {string} [config.model] - Model id; an "anthropic/" prefix is stripped for SDK calls.
 * @param {(action: object, index: number) => void} [config.onActionComplete]
 *   Invoked after every recorded action (successful or failed).
 * @returns {Promise<object>} `{ overallResult, actions, report, totalDurationMs,
 *   tokenUsage, browserSessionId }` where `overallResult` is "findings",
 *   "clean" or "error".
 */
async function runExploration(config) {
  const {
    targetUrl,
    featureDescription,
    actionBudget,
    auth,
    browserConfig,
    anthropicApiKey,
    model = DEFAULT_MODEL4,
    onActionComplete
  } = config;
  const anthropic = new Anthropic2({ apiKey: anthropicApiKey });
  // The Anthropic SDK expects the bare model id; compute it once.
  const apiModel = model.replace("anthropic/", "");
  const startTime = Date.now();
  const actions = [];
  let totalInputTokens = 0;
  let totalOutputTokens = 0;
  const session = await createStagehandSession(browserConfig, anthropicApiKey);
  const { stagehand, page } = session;
  try {
    // Inside the try so that a failure here still closes the session in `finally`.
    await suppressBugBearWidget(stagehand);
    await page.goto(targetUrl, { waitUntil: "networkidle", timeoutMs: 3e4 });
    if (auth) {
      await injectAuth(page, auth, stagehand);
      // Best effort: the page may never become fully idle.
      await page.waitForLoadState("networkidle").catch(() => {
      });
    }
    const networkCapture = createNetworkCapture(page);
    let consoleLogs = [];
    let actionStartTime = Date.now();
    const rawPage = page;
    rawPage.on("console", (msg) => {
      // `type` may be a method or a plain property depending on the page
      // implementation; calling a non-function would throw inside the listener.
      const level = typeof msg.type === "function" ? msg.type() : msg.type ?? "log";
      if (["error", "warning", "warn"].includes(level)) {
        consoleLogs.push({
          level: level === "warn" ? "warning" : level,
          text: (typeof msg.text === "function" ? msg.text() : String(msg.text ?? msg)).slice(0, 500),
          source: typeof msg.location === "function" ? msg.location()?.url : void 0,
          timestamp: Date.now() - actionStartTime
        });
      }
    });
    const actionLog = [];
    for (let i = 0; i < actionBudget; i++) {
      // Console output is attributed per action, so reset it each iteration.
      actionStartTime = Date.now();
      consoleLogs = [];
      const observations = await withTimeout2(
        stagehand.observe(),
        AI_OPERATION_TIMEOUT_MS2,
        "Page observation"
      );
      const decisionResponse = await withTimeout2(
        anthropic.messages.create({
          model: apiModel,
          max_tokens: 300,
          system: buildDecisionPrompt(featureDescription, actionBudget - i, actionLog),
          messages: [
            {
              role: "user",
              content: `Current page URL: ${page.url()}

Visible interactive elements:
${formatObservations(observations)}

What single action should I perform next?`
            }
          ]
        }),
        AI_OPERATION_TIMEOUT_MS2,
        "Action decision"
      );
      const actionText = extractText(decisionResponse);
      totalInputTokens += decisionResponse.usage.input_tokens;
      totalOutputTokens += decisionResponse.usage.output_tokens;
      // The model signals that it has nothing left to try.
      if (actionText.toLowerCase().includes("[done]") || actionText.toLowerCase().includes("no more actions")) {
        break;
      }
      const screenshotBefore = await page.screenshot({ type: "png" });
      networkCapture.start();
      try {
        await stagehand.act(actionText);
      } catch (actError) {
        networkCapture.stop();
        // A failed action can leave the page unable to render a screenshot;
        // fall back to the "before" image instead of aborting the exploration.
        const failedScreenshotAfter = await page.screenshot({ type: "png" }).catch(() => screenshotBefore);
        const failedAction = {
          actionNumber: i + 1,
          action: actionText,
          category: "broken_interaction",
          severity: "medium",
          confidence: 0.9,
          description: `Action failed: ${actError instanceof Error ? actError.message : String(actError)}`,
          screenshotBefore,
          screenshotAfter: failedScreenshotAfter,
          networkRequests: networkCapture.getRequests(),
          consoleLogs: [...consoleLogs],
          durationMs: Date.now() - actionStartTime
        };
        actions.push(failedAction);
        actionLog.push(`[${i + 1}] ${actionText} -> FAILED: ${failedAction.description}`);
        onActionComplete?.(failedAction, i);
        continue;
      }
      await page.waitForLoadState("networkidle").catch(() => {
      });
      await page.waitForTimeout(500);
      networkCapture.stop();
      const screenshotAfter = await page.screenshot({ type: "png" }).catch(() => screenshotBefore);
      const capturedRequests = networkCapture.getRequests();
      const networkErrors = networkCapture.getErrors();
      const evalResponse = await withTimeout2(
        anthropic.messages.create({
          model: apiModel,
          max_tokens: 400,
          system: buildEvaluationPrompt(),
          messages: [
            {
              role: "user",
              content: buildEvaluationContext(actionText, consoleLogs, networkErrors, page.url())
            }
          ]
        }),
        AI_OPERATION_TIMEOUT_MS2,
        "Action evaluation"
      );
      totalInputTokens += evalResponse.usage.input_tokens;
      totalOutputTokens += evalResponse.usage.output_tokens;
      const evaluation = parseEvaluation2(extractText(evalResponse));
      const action = {
        actionNumber: i + 1,
        action: actionText,
        category: evaluation.category,
        severity: evaluation.severity,
        confidence: evaluation.confidence,
        description: evaluation.description,
        screenshotBefore,
        screenshotAfter,
        networkRequests: capturedRequests,
        consoleLogs: [...consoleLogs],
        domContext: evaluation.domContext,
        durationMs: Date.now() - actionStartTime
      };
      actions.push(action);
      const logEntry = evaluation.category === "normal" ? `[${i + 1}] ${actionText} -> OK` : `[${i + 1}] ${actionText} -> FINDING (${evaluation.category}): ${evaluation.description}`;
      actionLog.push(logEntry);
      onActionComplete?.(action, i);
    }
    const { generateExplorationReport: generateExplorationReport2 } = await import("./report-generator-EVZEB33O.mjs");
    const report = await generateExplorationReport2(anthropic, {
      projectName: "",
      featureDescription,
      targetUrl,
      actions,
      model: apiModel
    });
    totalInputTokens += report.tokenUsage.inputTokens;
    totalOutputTokens += report.tokenUsage.outputTokens;
    // Anything the evaluator did not label "normal" counts as a finding.
    const findings = actions.filter((a) => a.category !== "normal");
    return {
      overallResult: findings.length > 0 ? "findings" : "clean",
      actions,
      report: report.report,
      totalDurationMs: Date.now() - startTime,
      tokenUsage: { inputTokens: totalInputTokens, outputTokens: totalOutputTokens },
      browserSessionId: session.sessionId
    };
  } catch (error) {
    // Report what was collected so far together with the reason for aborting.
    return {
      overallResult: "error",
      actions,
      report: {
        projectName: "",
        featureDescription,
        targetUrl,
        exploredAt: (/* @__PURE__ */ new Date()).toISOString(),
        duration: `${Math.round((Date.now() - startTime) / 1e3)}s`,
        actionsUsed: actions.length,
        actionBudget,
        findings: [],
        tested: [],
        notTested: [{ description: "Exploration aborted due to error", reason: String(error) }],
        summary: `Exploration failed after ${actions.length} actions: ${error instanceof Error ? error.message : String(error)}`,
        suggestedPrompt: ""
      },
      totalDurationMs: Date.now() - startTime,
      tokenUsage: { inputTokens: totalInputTokens, outputTokens: totalOutputTokens },
      browserSessionId: session.sessionId
    };
  } finally {
    if (session.page) {
      const rawPage = session.page;
      rawPage.removeAllListeners?.("console");
    }
    await session.close();
  }
}
|
|
1202
|
+
/**
 * Build the system prompt that asks the model to choose the next exploratory
 * action.
 *
 * @param {string} featureDescription - Feature being explored.
 * @param {number} remainingBudget - Number of actions still allowed.
 * @param {string[]} actionLog - One-line summaries of actions already taken;
 *   included in the prompt only when non-empty.
 * @returns {string} The assembled prompt text.
 */
function buildDecisionPrompt(featureDescription, remainingBudget, actionLog) {
  // The history block brings its own leading newline so that an empty log
  // leaves just a blank line in the prompt.
  const history = actionLog.length > 0 ? "\nActions already taken:\n" + actionLog.join("\n") : "";
  const promptLines = [
    `You are an exploratory QA tester examining the feature: "${featureDescription}".`,
    "Your goal is to find bugs by interacting with the page like a real user would.",
    "",
    "Strategy for choosing your next action:",
    "1. Try the happy path first (normal usage)",
    "2. Then try edge cases: empty inputs, very long text, special characters",
    "3. Click buttons and links to verify they work",
    "4. Submit forms with missing required fields",
    "5. Look for visual problems: overlapping text, broken layouts, missing images",
    "",
    `You have ${remainingBudget} actions left. Prioritize high-risk interactions.`,
    history,
    "",
    "DO NOT repeat an action you've already performed.",
    `Respond with a single action description. If there's nothing left to test, respond with "[DONE]".`
  ];
  return promptLines.join("\n");
}
|
|
1221
|
+
/**
 * Build the fixed system prompt used when asking the model to categorize the
 * outcome of a single exploratory action.
 *
 * @returns {string} Prompt describing the required JSON reply and the meaning
 *   of each category.
 */
function buildEvaluationPrompt() {
  const replyFormat = [
    "{",
    '"category": "normal" | "console_error" | "broken_interaction" | "visual_anomaly" | "input_handling",',
    '"severity": "critical" | "high" | "medium" | "low",',
    '"confidence": 0.0-1.0,',
    '"description": "What happened",',
    '"expectedBehavior": "What should have happened",',
    '"domSelector": "CSS selector of the element involved (if applicable)"',
    "}"
  ];
  const categoryGuide = [
    "- normal: Expected behavior, no issues found",
    "- console_error: JavaScript exception or failed network request (4xx/5xx)",
    "- broken_interaction: Action had no visible effect, button didn't respond, navigation failed",
    "- visual_anomaly: Layout break, text overflow, missing/broken images, overlapping elements",
    "- input_handling: Missing validation, accepted clearly invalid input, no error feedback"
  ];
  return [
    "You are evaluating the result of a QA test action. Categorize what happened.",
    "",
    "Respond in this exact JSON format:",
    ...replyFormat,
    "",
    "Category definitions:",
    ...categoryGuide,
    "",
    'Only report genuine issues. If behavior seems correct, use "normal".',
    'For "normal" results, severity and domSelector are not required.'
  ].join("\n");
}
|
|
1244
|
+
/**
 * Build the user message that describes what happened after an action, for the
 * evaluation model call.
 *
 * @param {string} action - Text of the action that was performed.
 * @param {Array<{level: string, text: string}>} consoleLogs - Console entries
 *   captured during the action; the section is omitted when empty.
 * @param {Array<{method: string, url: string, status: number, statusText: string}>} networkErrors
 *   Failed requests captured during the action; the section is omitted when empty.
 * @param {string} currentUrl - URL of the page after the action.
 * @returns {string} The assembled context text.
 */
function buildEvaluationContext(action, consoleLogs, networkErrors, currentUrl) {
  const pieces = [`Action performed: "${action}"\nCurrent URL: ${currentUrl}\n`];

  if (consoleLogs.length > 0) {
    const renderedLogs = consoleLogs.map((entry) => `[${entry.level}] ${entry.text}`);
    pieces.push(`\nConsole output:\n${renderedLogs.join("\n")}\n`);
  }

  if (networkErrors.length > 0) {
    const renderedErrors = networkErrors.map(
      (failure) => `${failure.method} ${failure.url} -> ${failure.status} ${failure.statusText}`
    );
    pieces.push(`\nFailed network requests:\n${renderedErrors.join("\n")}\n`);
  }

  return pieces.join("");
}
|
|
1262
|
+
/**
 * Render observed page elements as a numbered list for the decision prompt.
 *
 * @param {Array<{selector: string, description: string}>} observations
 * @returns {string} One "N. [selector] description" line per element, limited
 *   to the first 30 entries.
 */
function formatObservations(observations) {
  const MAX_LISTED = 30;
  const describe = (element, position) => {
    const ordinal = position + 1;
    return `${ordinal}. [${element.selector}] ${element.description}`;
  };
  const listed = observations.slice(0, MAX_LISTED);
  const rendered = listed.map(describe);
  return rendered.join("\n");
}
|
|
1265
|
+
/**
 * Extract the textual part of an Anthropic Messages API response.
 *
 * All blocks of type "text" are concatenated, matching how triageReport and
 * analyzeFailure in this file read responses. A response with no content
 * blocks, or with no text blocks, yields an empty string instead of throwing.
 *
 * @param {{content?: Array<{type: string, text?: string}>}} response
 * @returns {string} Concatenated text of all text blocks ("" when there are none).
 */
function extractText(response) {
  const blocks = Array.isArray(response?.content) ? response.content : [];
  return blocks
    .filter((block) => block?.type === "text" && typeof block.text === "string")
    .map((block) => block.text)
    .join("");
}
|
|
1269
|
+
/**
 * Parse the model's evaluation reply into a structured result.
 *
 * The first `{ ... }` span found in `text` is parsed as JSON. If no JSON can
 * be extracted or parsed, the action is treated as "normal" with low
 * confidence and the raw text is kept as the description.
 *
 * @param {string} text - Raw model reply.
 * @returns {{category: string, severity?: string, confidence: number,
 *   description: string, expectedBehavior?: string,
 *   domContext?: {selector: string, elementText: string, nearbyText: string}}}
 *   `confidence` is always within [0, 1]; a missing or non-numeric value
 *   becomes 0.5.
 */
function parseEvaluation2(text) {
  try {
    const jsonMatch = text.match(/\{[\s\S]*\}/);
    if (!jsonMatch) throw new Error("No JSON found");
    const parsed = JSON.parse(jsonMatch[0]);
    // The prompt asks for 0.0-1.0, but the model may answer with e.g. 85 or -1;
    // clamp so downstream consumers always see a valid probability.
    const rawConfidence = parsed.confidence;
    const confidence = typeof rawConfidence === "number" && !Number.isNaN(rawConfidence)
      ? Math.max(0, Math.min(1, rawConfidence))
      : 0.5;
    return {
      category: parsed.category || "normal",
      severity: parsed.severity,
      confidence,
      description: parsed.description || text,
      expectedBehavior: parsed.expectedBehavior,
      domContext: parsed.domSelector ? { selector: parsed.domSelector, elementText: "", nearbyText: "" } : void 0
    };
  } catch {
    // Unparseable reply: do not invent a finding, but flag the low confidence.
    return { category: "normal", confidence: 0.3, description: text };
  }
}
|
|
1286
|
+
|
|
1287
|
+
// src/report-triager.ts
|
|
1288
|
+
// Model used by triageReport when input.model is null or undefined.
var DEFAULT_MODEL5 = "claude-sonnet-4-20250514";
|
|
1289
|
+
/**
 * Ask the model to triage a bug report and return the structured result.
 *
 * @param {object} input
 * @param {object} input.anthropic - Anthropic client exposing `messages.create`.
 * @param {object} input.report - Report to triage (see buildTriagePrompt).
 * @param {Array<object>} input.recentReports - Recent reports offered as duplicate candidates.
 * @param {string} [input.model] - Model id; falls back to DEFAULT_MODEL5.
 * @returns {Promise<object>} Result of parseTriageResult on the model's text reply.
 */
async function triageReport(input) {
  const { report, recentReports } = input;
  const request = {
    model: input.model ?? DEFAULT_MODEL5,
    max_tokens: 1024,
    messages: [{ role: "user", content: buildTriagePrompt(report, recentReports) }]
  };
  const response = await input.anthropic.messages.create(request);

  // Only text blocks carry the JSON reply; concatenate them in order.
  const textBlocks = response.content.filter((block) => block.type === "text");
  const replyText = textBlocks.map((block) => block.text).join("");
  return parseTriageResult(replyText);
}
|
|
1301
|
+
/**
 * Build the triage prompt for a bug report.
 *
 * The prompt lists the report's title, description and whichever optional
 * details are present (source, app context, console/network errors, device
 * info, error fingerprint), then the recent reports to compare against for
 * duplicate detection, followed by the required JSON reply format and guides.
 *
 * @param {object} report - Report being triaged. Reads `title`, `description`,
 *   `report_source`, `app_context`, `enhanced_context`, `device_info` and
 *   `error_fingerprint`.
 * @param {Array<{id: string, title?: string, description: string, error_fingerprint?: string}>} recentReports
 *   Duplicate candidates; each description is truncated to 150 characters.
 * @returns {string} The complete prompt text.
 */
function buildTriagePrompt(report, recentReports) {
  // Render "Label: value" for every listed field that is set on `source`.
  // Empty or missing sources produce no lines.
  const describeFields = (source, fields) => {
    if (!source || Object.keys(source).length === 0) return [];
    return fields
      .filter(([key]) => source[key])
      .map(([key, label]) => `${label}: ${source[key]}`);
  };

  const sections = [
    `REPORT TITLE: ${report.title ?? "(no title)"}`,
    `DESCRIPTION: ${report.description}`
  ];

  if (report.report_source) {
    sections.push(`SOURCE: ${report.report_source}`);
  }

  const contextLines = describeFields(report.app_context, [
    ["currentRoute", "Route"],
    ["currentUrl", "URL"],
    ["componentName", "Component"],
    ["userAction", "User action"]
  ]);
  if (contextLines.length > 0) {
    sections.push(`APP CONTEXT:
${contextLines.join("\n")}`);
  }

  const enhanced = report.enhanced_context;
  if (enhanced) {
    const consoleLogs = enhanced.consoleLogs;
    if (consoleLogs && consoleLogs.length > 0) {
      // Only errors and warnings are relevant, capped at ten entries.
      const errors = consoleLogs
        .filter((entry) => entry.level === "error" || entry.level === "warning")
        .slice(0, 10)
        .map((entry) => `[${entry.level}] ${entry.text}`)
        .join("\n");
      if (errors) {
        sections.push(`CONSOLE ERRORS:
${errors}`);
      }
    }
    const networkErrors = enhanced.networkErrors;
    if (networkErrors && networkErrors.length > 0) {
      const netErrors = networkErrors
        .slice(0, 10)
        .map((failure) => `${failure.method} ${failure.url} \u2192 ${failure.status}`)
        .join("\n");
      sections.push(`NETWORK ERRORS:
${netErrors}`);
    }
  }

  const deviceParts = describeFields(report.device_info, [
    ["platform", "Platform"],
    ["browser", "Browser"],
    ["os", "OS"],
    ["screenSize", "Screen"]
  ]);
  if (deviceParts.length > 0) {
    sections.push(`DEVICE:
${deviceParts.join(", ")}`);
  }

  if (report.error_fingerprint) {
    sections.push(`ERROR FINGERPRINT: ${report.error_fingerprint}`);
  }

  let recentSection = "";
  if (recentReports.length > 0) {
    const recentLines = recentReports.map((candidate) => {
      const summary = candidate.description.slice(0, 150);
      const fingerprint = candidate.error_fingerprint ? ` [fingerprint: ${candidate.error_fingerprint}]` : "";
      return `- ID: ${candidate.id} | "${candidate.title ?? "(no title)"}" | ${summary}${fingerprint}`;
    });
    recentSection = `
RECENT REPORTS (check for duplicates):
${recentLines.join("\n")}`;
  }

  const reportBody = sections.join("\n\n");
  return `You are a QA triage specialist. Analyze this bug report and provide structured triage.

${reportBody}
${recentSection}

Respond with ONLY a JSON object (no markdown, no explanation outside the JSON):
{
"suggested_severity": "critical" | "high" | "medium" | "low",
"severity_confidence": 0.0-1.0,
"suggested_category": "ui_ux" | "functional" | "crash" | "security" | "other",
"category_confidence": 0.0-1.0,
"root_cause_analysis": "Brief analysis of the likely root cause",
"duplicate_of": null or "uuid-of-matching-report",
"duplicate_confidence": 0.0-1.0,
"triage_notes": "Summary of triage reasoning"
}

Severity guide:
- critical: App crash, data loss, security vulnerability, blocks core workflow
- high: Major feature broken, significant UX degradation, affects many users
- medium: Feature partially broken, workaround exists, moderate impact
- low: Minor cosmetic issue, edge case, minimal user impact

Category guide:
- crash: App crashes, unhandled exceptions, white screen of death
- security: Auth bypass, data exposure, injection vulnerabilities
- functional: Feature doesn't work as expected, logic errors, broken flows
- ui_ux: Visual glitches, layout issues, confusing UX, accessibility problems
- other: Performance, documentation, configuration issues

Duplicate detection:
- Compare error fingerprints first (exact match = very high confidence)
- Then compare descriptions semantically (similar symptoms on same route/feature)
- Only flag as duplicate if confidence \u2265 0.80`;
}
|
|
1398
|
+
// Severities accepted by validateTriageResult; any other value is replaced with "medium".
var VALID_SEVERITIES = ["critical", "high", "medium", "low"];
|
|
1399
|
+
// Categories accepted by validateTriageResult; any other value is replaced with "other".
var VALID_CATEGORIES = ["ui_ux", "functional", "crash", "security", "other"];
|
|
1400
|
+
/**
 * Turn the model's triage reply into a validated triage result.
 *
 * Parsing is attempted twice: first on the whole (trimmed) reply, then on a
 * `{ ... }` span containing both "suggested_severity" and "suggested_category"
 * extracted from it. If both attempts fail, a low-confidence placeholder
 * result is returned that quotes the first 200 characters of the reply.
 *
 * @param {string} text - Raw model reply.
 * @returns {object} Triage result in the shape produced by validateTriageResult.
 */
function parseTriageResult(text) {
  try {
    return validateTriageResult(JSON.parse(text.trim()));
  } catch {
    // The reply was not pure JSON; look for an embedded object instead.
    const embedded = text.match(/\{[\s\S]*"suggested_severity"[\s\S]*"suggested_category"[\s\S]*\}/);
    if (embedded) {
      try {
        return validateTriageResult(JSON.parse(embedded[0]));
      } catch {
        // Fall through to the placeholder result below.
      }
    }
  }

  const excerpt = text.slice(0, 200);
  return {
    suggested_severity: "medium",
    severity_confidence: 0.3,
    suggested_category: "other",
    category_confidence: 0.3,
    root_cause_analysis: `Triage returned unparseable response: ${excerpt}`,
    duplicate_of: null,
    duplicate_confidence: 0,
    triage_notes: "Auto-triage failed to parse AI response"
  };
}
|
|
1425
|
+
/**
 * Normalize a parsed triage reply so every field has a valid value.
 *
 * @param {object} parsed - Object parsed from the model's JSON reply.
 * @returns {{suggested_severity: string, severity_confidence: number,
 *   suggested_category: string, category_confidence: number,
 *   root_cause_analysis: string, duplicate_of: (string|null),
 *   duplicate_confidence: number, triage_notes: string}}
 *   Unknown severities become "medium", unknown categories "other",
 *   non-string text fields get a placeholder, and confidences go through
 *   clampConfidence.
 */
function validateTriageResult(parsed) {
  const oneOf = (allowed, candidate, fallback) => (allowed.includes(candidate) ? candidate : fallback);
  const stringOr = (candidate, fallback) => (typeof candidate === "string" ? candidate : fallback);

  const severity = oneOf(VALID_SEVERITIES, parsed.suggested_severity, "medium");
  const category = oneOf(VALID_CATEGORIES, parsed.suggested_category, "other");

  return {
    suggested_severity: severity,
    severity_confidence: clampConfidence(parsed.severity_confidence),
    suggested_category: category,
    category_confidence: clampConfidence(parsed.category_confidence),
    root_cause_analysis: stringOr(parsed.root_cause_analysis, "No analysis provided"),
    duplicate_of: stringOr(parsed.duplicate_of, null),
    duplicate_confidence: clampConfidence(parsed.duplicate_confidence),
    triage_notes: stringOr(parsed.triage_notes, "No notes provided")
  };
}
|
|
1439
|
+
/**
 * Coerce a model-supplied confidence into the range [0, 1].
 *
 * @param {*} value - Candidate confidence.
 * @returns {number} `value` clamped to [0, 1]; 0.5 when `value` is not a
 *   number or is NaN (NaN would otherwise pass through Math.max/Math.min
 *   unchanged and leak into the result).
 */
function clampConfidence(value) {
  if (typeof value !== "number" || Number.isNaN(value)) return 0.5;
  return Math.max(0, Math.min(1, value));
}
|
|
1443
|
+
|
|
1444
|
+
// src/failure-analyzer.ts
|
|
1445
|
+
// Model name analyzeFailure uses when the caller's input does not set `model`.
var DEFAULT_MODEL6 = "claude-sonnet-4-20250514";
|
|
1446
|
+
/**
 * Asks the model to classify a failed test step.
 *
 * Builds a single user message containing the before/after screenshots (as
 * base64 PNG image blocks) followed by the text prompt from buildFailurePrompt,
 * sends it through `input.anthropic.messages.create`, concatenates the text
 * blocks of the reply, and hands that text to parseFailureAnalysis.
 *
 * @param {object} input
 * @param {object} input.anthropic - Client exposing `messages.create`
 *   (presumably an Anthropic SDK client; confirm against callers).
 * @param {string} [input.model] - Model name; DEFAULT_MODEL6 when null/undefined.
 * @param {object} input.step - The step that failed.
 * @param {object} input.result - Step result; `screenshotBefore` and
 *   `screenshotAfter` must support `toString("base64")`.
 * @param {object} [input.discoveredSelector]
 * @param {Array} [input.consoleLogs]
 * @param {Array} [input.networkErrors]
 * @returns {Promise<object>} Whatever parseFailureAnalysis returns for the reply text.
 */
async function analyzeFailure(input) {
  const model = input.model ?? DEFAULT_MODEL6;
  const { step, result, discoveredSelector, consoleLogs, networkErrors } = input;

  const caption = (text) => ({ type: "text", text });
  const pngImage = (screenshot) => ({
    type: "image",
    source: { type: "base64", media_type: "image/png", data: screenshot.toString("base64") }
  });

  const content = [
    caption("BEFORE screenshot (page state before the failed action):"),
    pngImage(result.screenshotBefore),
    caption("AFTER screenshot (page state after the failed action):"),
    pngImage(result.screenshotAfter),
    caption(buildFailurePrompt(step, result, discoveredSelector, consoleLogs, networkErrors))
  ];

  const response = await input.anthropic.messages.create({
    model,
    max_tokens: 1024,
    messages: [{ role: "user", content }]
  });

  // Only text blocks contribute to the reply; other block types are ignored.
  const textParts = [];
  for (const block of response.content) {
    if (block.type === "text") textParts.push(block.text);
  }
  return parseFailureAnalysis(textParts.join(""), step);
}
|
|
1469
|
+
// Maps a per-step classification label to the label reported for the whole run.
var STEP_TO_RUN = {
  real_bug: "bug",
  test_maintenance: "test_issue",
  ai_limitation: "ai_limitation",
  flaky: "flaky",
  unknown: "unknown"
};

/**
 * Reduces the classifications of individual failed steps to one run-level label.
 *
 * Rules, in order:
 *   1. No steps            -> "unknown".
 *   2. Any "real_bug" step -> "bug" (a single real bug outweighs everything else).
 *   3. Otherwise the most frequent step label wins (ties go to the label seen
 *      first) and is translated through STEP_TO_RUN.
 *
 * A winning label that is not an own key of STEP_TO_RUN yields "unknown"
 * instead of `undefined` or an inherited Object.prototype member.
 *
 * @param {string[]} stepClassifications - One label per failed step.
 * @returns {string} "bug", "test_issue", "ai_limitation", "flaky" or "unknown".
 */
function rollupFailureClassification(stepClassifications) {
  if (stepClassifications.length === 0) return "unknown";
  if (stepClassifications.includes("real_bug")) return "bug";

  // Tally each label. When every step carries the same label it is trivially
  // the majority, so no separate "all equal" shortcut is needed.
  const counts = new Map();
  for (const label of stepClassifications) {
    counts.set(label, (counts.get(label) ?? 0) + 1);
  }

  let best = "unknown";
  let bestCount = 0;
  // Map iteration follows insertion order and the comparison is strict, so the
  // first label to reach the top count is kept on ties.
  for (const [label, count] of counts) {
    if (count > bestCount) {
      bestCount = count;
      best = label;
    }
  }

  // Own-property check keeps labels like "constructor" from resolving to
  // members inherited from Object.prototype.
  return Object.prototype.hasOwnProperty.call(STEP_TO_RUN, best) ? STEP_TO_RUN[best] : "unknown";
}
|
|
1496
|
+
/**
 * Assembles the text prompt that asks the model to classify a failed step.
 *
 * The prompt starts with a fixed instruction, then lists the failure context
 * (step, expected/actual result, and whichever optional details are present),
 * then the category definitions and the JSON shape the reply must follow.
 *
 * @param {object} step - Reads `stepNumber`, `action`, `expectedResult`, and the
 *   optional `selector` / `actionType`.
 * @param {object} result - Reads `actualResult` and the optional `error`.
 * @param {object} [discoveredSelector] - Reads `selector`, `strategy`, and the
 *   optional `textContent`.
 * @param {Array<{level: string, text: string}>} [consoleLogs] - Only "error" and
 *   "warning" entries are included, at most 8.
 * @param {Array<object>} [networkErrors] - At most 8 entries are included; each
 *   contributes `method`, `url`, `status`, `statusText`.
 * @returns {string} The complete prompt text.
 */
function buildFailurePrompt(step, result, discoveredSelector, consoleLogs, networkErrors) {
  const MAX_LISTED = 8;
  const context = [
    `FAILED STEP #${step.stepNumber}: ${step.action}`,
    `EXPECTED: ${step.expectedResult}`,
    `ACTUAL: ${result.actualResult}`
  ];

  if (step.selector) {
    context.push(`SELECTOR USED: ${step.selector}`);
  }
  if (step.actionType) {
    context.push(`ACTION TYPE: ${step.actionType}`);
  }
  if (result.error) {
    context.push(`ERROR: ${result.error}`);
  }

  if (discoveredSelector) {
    const { selector, strategy, textContent } = discoveredSelector;
    const textSuffix = textContent ? ` \u2014 text: "${textContent}"` : "";
    context.push(`DISCOVERED SELECTOR (what Stagehand actually clicked): ${selector} (via ${strategy})${textSuffix}`);
  }

  if (consoleLogs && consoleLogs.length > 0) {
    const isProblem = (entry) => entry.level === "error" || entry.level === "warning";
    const errors = consoleLogs
      .filter(isProblem)
      .slice(0, MAX_LISTED)
      .map((entry) => `[${entry.level}] ${entry.text}`)
      .join("\n");
    // Skip the section entirely when the logs held nothing but lower-severity entries.
    if (errors !== "") {
      context.push(`CONSOLE ERRORS:\n${errors}`);
    }
  }

  if (networkErrors && networkErrors.length > 0) {
    const netErrors = networkErrors
      .slice(0, MAX_LISTED)
      .map((failure) => `${failure.method} ${failure.url} \u2192 ${failure.status} ${failure.statusText}`)
      .join("\n");
    context.push(`NETWORK ERRORS:\n${netErrors}`);
  }

  const contextBlock = context.join("\n\n");
  return `You are a QA failure analyst. A test step failed. Analyze the before/after screenshots and the context below to classify this failure.

${contextBlock}

Classify into ONE of these categories:
- **real_bug**: The application has an actual defect. Indicators: API errors (4xx/5xx), JavaScript exceptions, missing/broken UI elements that SHOULD be there, incorrect behavior, data not saving.
- **test_maintenance**: The test is stale \u2014 the app changed but the test wasn't updated. Indicators: element moved/renamed, selector no longer matches, page restructured but app works correctly, the discovered selector differs from the test's selector.
- **ai_limitation**: The AI executor itself could not complete this step \u2014 NOT an app bug. Indicators: already logged in so can't reach the login page, a QA/testing widget or overlay appeared and blocked the real UI, the test requires measuring something the AI can't (contrast ratios, pixel measurements), the AI landed on a completely wrong page and never reached the test target, authentication redirect prevented navigation, a popup or modal unrelated to the test blocked interaction.
- **flaky**: Timing or intermittent issue. Indicators: timeout errors, "element not found" but the element IS visible in screenshots, network hiccup, race condition.
- **unknown**: Can't determine with confidence.

For **test_maintenance** failures, suggest a corrected step (selector, action, value).

Respond with ONLY a JSON object (no markdown, no explanation outside the JSON):
{
"classification": "real_bug" | "test_maintenance" | "ai_limitation" | "flaky" | "unknown",
"confidence": 0.0-1.0,
"reasoning": "Brief explanation of why this classification",
"suggested_fix": null | {
"corrected_action": "Updated natural language action (if changed)",
"corrected_selector": "Updated CSS selector (if selector changed)",
"corrected_actionType": "Updated action type (if changed)",
"corrected_value": "Updated value (if changed)"
}
}`;
}
|
|
1543
|
+
// Classification labels validateFailureAnalysis accepts; any other value is replaced with "unknown".
var VALID_CLASSIFICATIONS = ["real_bug", "test_maintenance", "ai_limitation", "flaky", "unknown"];
|
|
1544
|
+
/**
 * Extracts a failure analysis from the model's raw reply text.
 *
 * Two candidates are tried in order: the whole (trimmed) reply, then the span
 * from the first "{" to the last "}" — which recovers an object wrapped in
 * markdown fences or surrounding prose regardless of the order of its keys.
 * The first candidate that parses to a plain JSON object is passed to
 * validateFailureAnalysis. Valid JSON that is not an object (a bare string,
 * number, array or null) is treated as unparseable.
 *
 * @param {string} text - Raw text returned by the model.
 * @param {object} step - The failed step, forwarded to validateFailureAnalysis.
 * @returns {object} The validated analysis, or a low-confidence "unknown"
 *   result quoting the first 200 characters of the reply when nothing usable
 *   could be extracted.
 */
function parseFailureAnalysis(text, step) {
  const candidates = [text.trim()];
  const open = text.indexOf("{");
  const close = text.lastIndexOf("}");
  if (open !== -1 && close > open) {
    candidates.push(text.slice(open, close + 1));
  }

  for (const candidate of candidates) {
    try {
      const parsed = JSON.parse(candidate);
      if (parsed !== null && typeof parsed === "object" && !Array.isArray(parsed)) {
        return validateFailureAnalysis(parsed, step);
      }
    } catch {
      // Best effort: this candidate was not usable JSON, move on to the next one.
    }
  }

  return {
    classification: "unknown",
    confidence: 0.3,
    reasoning: `Failure analysis returned unparseable response: ${text.slice(0, 200)}`
  };
}
|
|
1564
|
+
/**
 * Normalizes a parsed failure-analysis payload.
 *
 * The classification is kept only when it is listed in VALID_CLASSIFICATIONS
 * (otherwise "unknown"), the confidence goes through clampConfidence2, and a
 * non-string `reasoning` becomes "No reasoning provided". When the payload
 * carries an object-valued `suggested_fix`, a fix entry is attached that
 * records the step's number and original action plus whichever `corrected_*`
 * fields are strings (the rest are set to undefined).
 *
 * @param {object} parsed - Value decoded from the model's JSON reply.
 * @param {object} step - The failed step; `stepNumber` and `action` are read.
 * @returns {object} Analysis with `classification`, `confidence`, `reasoning`
 *   and, when applicable, `suggested_fix`.
 */
function validateFailureAnalysis(parsed, step) {
  const stringOr = (candidate, fallback) => (typeof candidate === "string" ? candidate : fallback);

  const analysis = {
    classification: VALID_CLASSIFICATIONS.includes(parsed.classification) ? parsed.classification : "unknown",
    confidence: clampConfidence2(parsed.confidence),
    reasoning: stringOr(parsed.reasoning, "No reasoning provided")
  };

  const { suggested_fix: fix } = parsed;
  if (!fix || typeof fix !== "object") {
    return analysis;
  }

  analysis.suggested_fix = {
    stepNumber: step.stepNumber,
    original_action: step.action,
    corrected_action: stringOr(fix.corrected_action, void 0),
    corrected_selector: stringOr(fix.corrected_selector, void 0),
    corrected_actionType: stringOr(fix.corrected_actionType, void 0),
    corrected_value: stringOr(fix.corrected_value, void 0)
  };
  return analysis;
}
|
|
1584
|
+
/**
 * Coerces a confidence score into the closed range [0, 1].
 *
 * @param {unknown} value - Candidate confidence taken from a parsed response.
 * @returns {number} 0.5 when `value` is not a usable number (wrong type or NaN);
 *   otherwise `value` limited to the range 0..1.
 */
function clampConfidence2(value) {
  // NaN has typeof "number" and would pass straight through Math.min/Math.max,
  // so it is treated like any other missing value.
  if (typeof value !== "number" || Number.isNaN(value)) return 0.5;
  return Math.min(1, Math.max(0, value));
}
|
|
1588
|
+
|
|
1589
|
+
// src/concurrency.ts
|
|
1590
|
+
/**
 * Counting semaphore that limits how many holders run at once.
 *
 * `acquire()` resolves immediately while fewer than `max` slots are in use;
 * otherwise the caller is queued and resumed, first-in first-out, by a later
 * `release()`.
 */
var Semaphore = class {
  /**
   * @param {number} max - Maximum number of slots that may be held at once.
   * @throws {Error} When `max` is not >= 1 (this includes NaN and undefined).
   */
  constructor(max) {
    // Written as `!(max >= 1)` rather than `max < 1` so NaN and undefined are
    // rejected as well: with such a limit `current < max` is never true and
    // every acquire() would wait forever.
    if (!(max >= 1)) throw new Error("Semaphore max must be >= 1");
    this.max = max;
    this.current = 0;
    this.queue = [];
  }

  /**
   * Takes a slot, waiting for one to be released when all are in use.
   *
   * @returns {Promise<void>} Resolves once the caller holds a slot.
   */
  async acquire() {
    if (this.current < this.max) {
      this.current++;
      return;
    }
    return new Promise((resolve) => {
      this.queue.push(resolve);
    });
  }

  /**
   * Gives a slot back. If callers are waiting, the slot passes directly to the
   * oldest waiter (so the in-use count does not change); otherwise the in-use
   * count drops by one. A release with no slot in use is ignored so the count
   * cannot go negative and let more than `max` holders in later.
   */
  release() {
    const next = this.queue.shift();
    if (next) {
      next();
    } else if (this.current > 0) {
      this.current--;
    }
  }

  /** Number of slots currently in use */
  get active() {
    return this.current;
  }

  /** Number of waiters in the queue */
  get waiting() {
    return this.queue.length;
  }
};
|
|
361
1623
|
export {
|
|
1624
|
+
Semaphore,
|
|
1625
|
+
analyzeFailure,
|
|
1626
|
+
authenticateSupabase,
|
|
362
1627
|
createStagehandSession,
|
|
1628
|
+
discoverSelector,
|
|
1629
|
+
estimateBatchCost,
|
|
1630
|
+
estimateCost,
|
|
1631
|
+
estimateTestCost,
|
|
1632
|
+
evaluateStep,
|
|
1633
|
+
executeAction,
|
|
1634
|
+
generateExplorationReport,
|
|
363
1635
|
generateRunSummary,
|
|
1636
|
+
getTokenEstimate,
|
|
364
1637
|
injectAuth,
|
|
365
|
-
|
|
1638
|
+
injectSupabaseAuth,
|
|
1639
|
+
installClickTracker,
|
|
1640
|
+
performSupabaseAuth,
|
|
1641
|
+
rollupFailureClassification,
|
|
1642
|
+
runExploration,
|
|
1643
|
+
runTest,
|
|
1644
|
+
suppressBugBearWidget,
|
|
1645
|
+
triageReport,
|
|
1646
|
+
verifySupabaseSession
|
|
366
1647
|
};
|
|
367
1648
|
//# sourceMappingURL=index.mjs.map
|