@bbearai/ai-executor 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/chunk-WT22IQMS.mjs +175 -0
- package/dist/chunk-WT22IQMS.mjs.map +1 -0
- package/dist/cli.js +622 -129
- package/dist/cli.js.map +1 -1
- package/dist/index.d.mts +533 -8
- package/dist/index.d.ts +533 -8
- package/dist/index.js +1613 -131
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +1411 -130
- package/dist/index.mjs.map +1 -1
- package/dist/report-generator-EVZEB33O.mjs +7 -0
- package/dist/report-generator-EVZEB33O.mjs.map +1 -0
- package/package.json +5 -1
package/dist/index.js
CHANGED
|
@@ -5,6 +5,9 @@ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
|
|
|
5
5
|
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
6
6
|
var __getProtoOf = Object.getPrototypeOf;
|
|
7
7
|
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
8
|
+
/**
 * Bundler runtime helper: wraps a single-entry module table in a lazy,
 * run-once initializer.
 *
 * @param {object|0} fn  Object whose first own property is the module body;
 *                       cleared to 0 once the body has been started.
 * @param {*} [res]      Cached result of the module body.
 * @returns {Function}   `__init`, which runs the body on first call and
 *                       returns the cached result afterwards.
 */
var __esm = (fn, res) => function __init() {
  if (fn) {
    const moduleBody = fn[__getOwnPropNames(fn)[0]];
    // Clear the table before invoking so a re-entrant or failed call never re-runs the body.
    fn = 0;
    res = moduleBody(0);
  }
  return res;
};
|
|
8
11
|
var __export = (target, all) => {
|
|
9
12
|
for (var name in all)
|
|
10
13
|
__defProp(target, name, { get: all[name], enumerable: true });
|
|
@@ -27,22 +30,299 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
|
|
|
27
30
|
));
|
|
28
31
|
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
29
32
|
|
|
33
|
+
// src/report-generator.ts
|
|
34
|
+
var report_generator_exports = {};
|
|
35
|
+
__export(report_generator_exports, {
|
|
36
|
+
generateExplorationReport: () => generateExplorationReport
|
|
37
|
+
});
|
|
38
|
+
/**
 * Turns the raw actions of an exploratory QA session into a structured report
 * and asks the model for a short narrative summary.
 *
 * Actions whose `category` is not "normal" become findings; "normal" actions
 * are listed as passed checks.
 *
 * @param {object} anthropic Client exposing `messages.create(request)`.
 * @param {object} input     `{ projectName, featureDescription, targetUrl, actions, model }`.
 * @returns {Promise<{report: object, tokenUsage: {inputTokens: number, outputTokens: number}}>}
 */
async function generateExplorationReport(anthropic, input) {
  const { projectName, featureDescription, targetUrl, actions, model } = input;
  const route = extractRoute(targetUrl);
  const findings = actions.filter((a) => a.category !== "normal");
  const passed = actions.filter((a) => a.category === "normal");
  // One severity fallback shared by the report and the prompt so they never disagree.
  const severityOf = (f) => f.severity || "medium";
  const actionableFindings = findings.map((f) => ({
    title: buildFindingTitle(f),
    category: f.category,
    severity: severityOf(f),
    confidence: f.confidence,
    // Default to empty arrays so downstream prompt building never dereferences undefined.
    networkRequests: f.networkRequests ?? [],
    consoleErrors: (f.consoleLogs ?? []).filter((l) => l.level === "error"),
    domContext: f.domContext,
    url: targetUrl,
    route,
    reproSteps: buildReproSteps(actions, f.actionNumber),
    screenshotUrl: "",
    // Filled in by API route after upload
    actionPerformed: f.action,
    expectedBehavior: "Normal application behavior",
    actualBehavior: f.description
  }));
  const tested = passed.map((a) => ({
    description: a.action,
    route,
    status: "passed"
  }));
  const notTested = detectUntestable(actions);
  const severeCount = findings.filter((f) => f.severity === "critical" || f.severity === "high").length;
  const findingLines = findings.map((f) => `- [${severityOf(f).toUpperCase()}] ${f.category}: ${f.description}`).join("\n");
  const summaryResponse = await anthropic.messages.create({
    model,
    max_tokens: 500,
    messages: [
      {
        role: "user",
        content: `Summarize this exploratory QA session in 2-3 sentences.

Feature tested: "${featureDescription}"
URL: ${targetUrl}
Actions performed: ${actions.length}
Findings: ${findings.length} (${severeCount} high/critical)
Passed checks: ${passed.length}

Finding details:
${findingLines}

Be concise and factual. Focus on what was tested and the most important findings.`
      }
    ]
  });
  // Join every text block; an empty or non-text response falls back to a fixed sentence
  // instead of throwing on `content[0]`.
  const summaryText = (summaryResponse.content ?? []).filter((block) => block.type === "text").map((block) => block.text).join("");
  const summary = summaryText || "Exploration complete.";
  const suggestedPrompt = buildSuggestedPrompt(
    featureDescription,
    actionableFindings,
    tested,
    notTested
  );
  // Actions without a recorded duration count as 0 ms rather than turning the total into NaN.
  const totalDuration = actions.reduce((sum, a) => sum + (a.durationMs ?? 0), 0);
  return {
    report: {
      projectName,
      featureDescription,
      targetUrl,
      exploredAt: (/* @__PURE__ */ new Date()).toISOString(),
      duration: `${Math.round(totalDuration / 1e3)}s`,
      actionsUsed: actions.length,
      // NOTE(review): budget mirrors the number of actions used; no separate budget is read from input.
      actionBudget: actions.length,
      findings: actionableFindings,
      tested,
      notTested,
      summary,
      suggestedPrompt
    },
    tokenUsage: {
      inputTokens: summaryResponse.usage.input_tokens,
      outputTokens: summaryResponse.usage.output_tokens
    }
  };
}
|
|
115
|
+
/**
 * Builds a short "<prefix>: <description>" title for a finding.
 *
 * Known categories map to a human-readable prefix; unknown categories (and
 * "normal", whose prefix is empty) fall back to the raw category string.
 * The description is truncated to 80 characters.
 *
 * @param {{category: string, description?: string}} action
 * @returns {string}
 */
function buildFindingTitle(action) {
  // A Map avoids accidental hits on Object.prototype members such as "constructor".
  const prefixes = new Map([
    ["console_error", "JS Error"],
    ["broken_interaction", "Broken"],
    ["visual_anomaly", "Visual"],
    ["input_handling", "Validation"],
    ["normal", ""]
  ]);
  const label = prefixes.get(action.category) || action.category;
  // Tolerate a missing description instead of throwing on `.slice`.
  const description = String(action.description ?? "");
  return `${label}: ${description.slice(0, 80)}`;
}
|
|
125
|
+
/**
 * Returns the pathname of `url`, or `url` itself when it cannot be parsed
 * as an absolute URL.
 *
 * @param {string} url
 * @returns {string}
 */
function extractRoute(url) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    return url;
  }
  return parsed.pathname;
}
|
|
132
|
+
/**
 * Lists, in input order, every action numbered at or before the target
 * action, formatted as "<actionNumber>. <action>".
 *
 * @param {Array<{actionNumber: number, action: string}>} allActions
 * @param {number} targetActionNumber
 * @returns {string[]}
 */
function buildReproSteps(allActions, targetActionNumber) {
  const steps = [];
  for (const entry of allActions) {
    if (entry.actionNumber <= targetActionNumber) {
      steps.push(`${entry.actionNumber}. ${entry.action}`);
    }
  }
  return steps;
}
|
|
135
|
+
/**
 * Scans the lower-cased text of all actions and descriptions for keywords
 * hinting at areas the explorer could not exercise.
 *
 * @param {Array<{action: string, description: string}>} actions
 * @returns {Array<{description: string, reason: string}>} One entry per matched rule, in rule order.
 */
function detectUntestable(actions) {
  const rules = [
    {
      keywords: ["file upload", "drag and drop"],
      description: "File upload functionality",
      reason: "AI cannot interact with OS file dialogs"
    },
    {
      keywords: ["captcha", "recaptcha"],
      description: "CAPTCHA verification",
      reason: "AI cannot solve CAPTCHAs"
    },
    {
      keywords: ["disabled", "permission"],
      description: "Permission-gated features",
      reason: "Current auth may not have required permissions"
    }
  ];
  const haystack = actions.map((a) => `${a.action} ${a.description}`).join(" ").toLowerCase();
  return rules
    .filter(({ keywords }) => keywords.some((word) => haystack.includes(word)))
    .map(({ description, reason }) => ({ description, reason }));
}
|
|
158
|
+
/**
 * Builds a plain-text prompt describing the issues found during exploration.
 *
 * With no findings a one-line "no issues found" sentence is returned.
 * Otherwise each finding is listed with its severity, title, first console
 * error, first failed (status >= 400) network request, DOM selector, route
 * and repro steps, followed by an optional "Not tested" section.
 *
 * @param {string} featureDescription
 * @param {Array<object>} findings  Report findings (title, severity, consoleErrors, networkRequests, ...).
 * @param {Array<object>} tested    Passed checks; only the count is used.
 * @param {Array<{description: string, reason: string}>} notTested
 * @returns {string}
 */
function buildSuggestedPrompt(featureDescription, findings, tested, notTested) {
  if (findings.length === 0) {
    return `Exploratory QA tested "${featureDescription}" with ${tested.length} interactions \u2014 no issues found.`;
  }
  const lines = [
    `Fix these ${findings.length} issue(s) found during exploratory QA testing of "${featureDescription}":`,
    ""
  ];
  findings.forEach((f, i) => {
    lines.push(`${i + 1}. [${String(f.severity || "medium").toUpperCase()}] ${f.title}`);
    const firstError = f.consoleErrors?.[0];
    if (firstError) {
      lines.push(` Console: ${firstError.text}`);
      if (firstError.source) {
        lines.push(` Source: ${firstError.source}`);
      }
    }
    // Single scan; tolerates findings recorded without any network data.
    const failed = f.networkRequests?.find((r) => r.status >= 400);
    if (failed) {
      lines.push(` API: ${failed.method} ${failed.url} \u2192 ${failed.status}`);
      if (failed.responseBody) {
        lines.push(` Response: ${String(failed.responseBody).slice(0, 200)}`);
      }
    }
    if (f.domContext?.selector) {
      lines.push(` Element: ${f.domContext.selector}`);
    }
    lines.push(` Route: ${f.route}`);
    lines.push(` Repro: ${(f.reproSteps ?? []).join(" \u2192 ")}`);
    // Blank separator line after every finding.
    lines.push("");
  });
  if (notTested.length > 0) {
    lines.push("Not tested (requires manual review):");
    for (const n of notTested) {
      lines.push(`- ${n.description}: ${n.reason}`);
    }
  }
  return lines.join("\n").trim();
}
|
|
207
|
+
var init_report_generator = __esm({
|
|
208
|
+
"src/report-generator.ts"() {
|
|
209
|
+
"use strict";
|
|
210
|
+
}
|
|
211
|
+
});
|
|
212
|
+
|
|
30
213
|
// src/index.ts
|
|
31
214
|
var index_exports = {};
|
|
32
215
|
__export(index_exports, {
|
|
216
|
+
Semaphore: () => Semaphore,
|
|
217
|
+
analyzeFailure: () => analyzeFailure,
|
|
218
|
+
authenticateSupabase: () => authenticateSupabase,
|
|
33
219
|
createStagehandSession: () => createStagehandSession,
|
|
220
|
+
discoverSelector: () => discoverSelector,
|
|
221
|
+
estimateBatchCost: () => estimateBatchCost,
|
|
222
|
+
estimateCost: () => estimateCost,
|
|
223
|
+
estimateTestCost: () => estimateTestCost,
|
|
224
|
+
evaluateStep: () => evaluateStep,
|
|
225
|
+
executeAction: () => executeAction,
|
|
226
|
+
generateExplorationReport: () => generateExplorationReport,
|
|
34
227
|
generateRunSummary: () => generateRunSummary,
|
|
228
|
+
getTokenEstimate: () => getTokenEstimate,
|
|
35
229
|
injectAuth: () => injectAuth,
|
|
36
|
-
|
|
230
|
+
injectSupabaseAuth: () => injectSupabaseAuth,
|
|
231
|
+
installClickTracker: () => installClickTracker,
|
|
232
|
+
performSupabaseAuth: () => performSupabaseAuth,
|
|
233
|
+
rollupFailureClassification: () => rollupFailureClassification,
|
|
234
|
+
runExploration: () => runExploration,
|
|
235
|
+
runTest: () => runTest,
|
|
236
|
+
suppressBugBearWidget: () => suppressBugBearWidget,
|
|
237
|
+
triageReport: () => triageReport,
|
|
238
|
+
verifySupabaseSession: () => verifySupabaseSession
|
|
37
239
|
});
|
|
38
240
|
module.exports = __toCommonJS(index_exports);
|
|
39
241
|
|
|
40
242
|
// src/runner.ts
|
|
41
243
|
var import_sdk = __toESM(require("@anthropic-ai/sdk"));
|
|
42
|
-
var import_zod = require("zod");
|
|
43
244
|
|
|
44
245
|
// src/browser.ts
|
|
45
246
|
var import_stagehand = require("@browserbasehq/stagehand");
|
|
247
|
+
|
|
248
|
+
// src/supabase-auth.ts
|
|
249
|
+
/**
 * Returns the first dot-separated label of the hostname in `supabaseUrl`.
 * Throws (from `new URL`) when the URL is not parseable.
 *
 * @param {string} supabaseUrl
 * @returns {string}
 */
function extractProjectRef(supabaseUrl) {
  const { hostname } = new URL(supabaseUrl);
  const [firstLabel] = hostname.split(".");
  return firstLabel;
}
|
|
255
|
+
/**
 * Signs in with email/password against the `/auth/v1/token?grant_type=password`
 * endpoint under `auth.supabaseUrl`.
 *
 * @param {{supabaseUrl: string, anonKey: string, email: string, password: string}} auth
 * @returns {Promise<object>} The parsed session JSON (guaranteed to carry `access_token`).
 * @throws {Error} When the HTTP response is not ok, or the body has no `access_token`.
 */
async function authenticateSupabase(auth) {
  const endpoint = `${auth.supabaseUrl.replace(/\/$/, "")}/auth/v1/token?grant_type=password`;
  const response = await fetch(endpoint, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      "apikey": auth.anonKey
    },
    body: JSON.stringify({
      email: auth.email,
      password: auth.password
    })
  });
  if (!response.ok) {
    // The body is diagnostic only; a failure to read it must not mask the status.
    const body = await response.text().catch(() => "");
    throw new Error(
      `Supabase auth failed (${response.status}): ${body.slice(0, 200)}`
    );
  }
  const session = await response.json();
  // `?.` covers a literal `null` body, which would otherwise surface as a raw TypeError.
  if (!session?.access_token) {
    throw new Error("Supabase auth returned no access_token");
  }
  return session;
}
|
|
280
|
+
/**
 * Writes the session into the page's localStorage under
 * `sb-<project-ref>-auth-token`, where the project ref is the first hostname
 * label of `auth.supabaseUrl`.
 *
 * If the page has no URL yet (empty or "about:blank") it is first navigated
 * to the Supabase URL; navigation errors are ignored.
 *
 * @param {object} page    Page exposing `url()`, `goto()` and `evaluate()`.
 * @param {{supabaseUrl: string}} auth
 * @param {object} session Session fields to persist.
 * @returns {Promise<void>}
 */
async function injectSupabaseAuth(page, auth, session) {
  const projectRef = new URL(auth.supabaseUrl).hostname.split(".")[0];
  const { access_token, refresh_token, expires_in, expires_at, token_type, user } = session;
  const entry = {
    key: `sb-${projectRef}-auth-token`,
    value: JSON.stringify({ access_token, refresh_token, expires_in, expires_at, token_type, user })
  };
  const location = page.url();
  const needsNavigation = !location || location === "about:blank";
  if (needsNavigation) {
    const target = auth.supabaseUrl.replace(/\/$/, "");
    // Best effort: the storage write below is still attempted if navigation fails.
    await page.goto(target, { waitUntil: "domcontentloaded", timeoutMs: 1e4 }).catch(() => {
    });
  }
  await page.evaluate(({ key, value }) => {
    localStorage.setItem(key, value);
  }, entry);
}
|
|
306
|
+
/**
 * Calls `/auth/v1/user` under `auth.supabaseUrl` with the bearer token and
 * reports whether the response was ok.
 *
 * @param {{supabaseUrl: string, anonKey: string}} auth
 * @param {string} accessToken
 * @returns {Promise<boolean>}
 */
async function verifySupabaseSession(auth, accessToken) {
  const baseUrl = auth.supabaseUrl.replace(/\/$/, "");
  const headers = {
    "Authorization": `Bearer ${accessToken}`,
    "apikey": auth.anonKey
  };
  const { ok } = await fetch(`${baseUrl}/auth/v1/user`, { headers });
  return ok;
}
|
|
316
|
+
/**
 * Full Supabase login flow: authenticate, inject the session into the page,
 * then verify the access token against the auth server.
 *
 * @param {object} page
 * @param {object} auth
 * @returns {Promise<void>}
 * @throws {Error} When verification reports the token as rejected (or any step throws).
 */
async function performSupabaseAuth(page, auth) {
  const freshSession = await authenticateSupabase(auth);
  await injectSupabaseAuth(page, auth, freshSession);
  const accepted = await verifySupabaseSession(auth, freshSession.access_token);
  if (accepted) {
    return;
  }
  throw new Error("Supabase auth verification failed \u2014 session token rejected");
}
|
|
324
|
+
|
|
325
|
+
// src/browser.ts
|
|
46
326
|
var DEFAULT_MODEL = "anthropic/claude-sonnet-4-20250514";
|
|
47
327
|
async function createStagehandSession(config, anthropicApiKey) {
|
|
48
328
|
const modelName = config.model ?? DEFAULT_MODEL;
|
|
@@ -55,6 +335,11 @@ async function createStagehandSession(config, anthropicApiKey) {
|
|
|
55
335
|
modelName,
|
|
56
336
|
apiKey: anthropicApiKey
|
|
57
337
|
},
|
|
338
|
+
// Bypass pino logger — its pino-pretty transport uses worker threads
|
|
339
|
+
// which fail in Vercel's serverless environment
|
|
340
|
+
logger: (msg) => {
|
|
341
|
+
if ((msg.level ?? 0) >= 40) console.warn("[Stagehand]", msg.message);
|
|
342
|
+
},
|
|
58
343
|
localBrowserLaunchOptions: config.provider === "local" ? {
|
|
59
344
|
headless: config.headless ?? true,
|
|
60
345
|
viewport
|
|
@@ -78,6 +363,21 @@ async function createStagehandSession(config, anthropicApiKey) {
|
|
|
78
363
|
}
|
|
79
364
|
};
|
|
80
365
|
}
|
|
366
|
+
/**
 * Registers an init script on the Stagehand browser context that sets
 * `window.__bugbear_suppress` and the matching localStorage flag.
 * Does nothing when the context has no `addInitScript`; never throws.
 *
 * @param {object} stagehand Object exposing an optional `context.addInitScript`.
 * @returns {Promise<void>}
 */
async function suppressBugBearWidget(stagehand) {
  // Executed inside the page, so it must stay self-contained.
  const markSuppressed = () => {
    window.__bugbear_suppress = true;
    try {
      localStorage.setItem("__bugbear_suppress", "true");
    } catch {
    }
  };
  try {
    const browserContext = stagehand.context;
    if (!browserContext?.addInitScript) {
      return;
    }
    await browserContext.addInitScript(markSuppressed);
  } catch {
  }
}
|
|
81
381
|
async function injectAuth(page, auth, stagehand) {
|
|
82
382
|
if (auth.type === "cookie") {
|
|
83
383
|
for (const c of auth.cookies) {
|
|
@@ -103,23 +403,123 @@ async function injectAuth(page, auth, stagehand) {
|
|
|
103
403
|
}, auth.items);
|
|
104
404
|
} else if (auth.type === "form-login") {
|
|
105
405
|
await performFormLogin(page, auth, stagehand);
|
|
406
|
+
} else if (auth.type === "supabase-native") {
|
|
407
|
+
await performSupabaseAuth(page, auth);
|
|
106
408
|
}
|
|
107
409
|
}
|
|
410
|
+
/**
 * Creates a start/stop recorder for a page's network traffic.
 *
 * While active it records every response except images, stylesheets, fonts
 * and media. Responses with status >= 400 and failed requests are also added
 * to the error list. URLs and bodies are truncated to 500 characters.
 *
 * `start()` clears previous data and is safe to call repeatedly: listeners
 * are attached at most once, so requests are never recorded twice.
 *
 * @param {object} page Emitter-like page exposing `on(event, fn)` / `off(event, fn)`.
 * @returns {{start: Function, stop: Function, getRequests: Function, getErrors: Function}}
 */
function createNetworkCapture(page) {
  const requests = [];
  const errors = [];
  let active = false;
  let startTimestamp = Date.now();
  // Members may be methods or plain properties depending on the page implementation.
  const readMember = (owner, name, coerce) => {
    const raw = owner[name];
    if (typeof raw === "function") return raw.call(owner);
    return coerce ? coerce(raw) : raw;
  };
  const onResponse = async (response) => {
    if (!active) return;
    const req = response.request();
    const resourceType = readMember(req, "resourceType");
    if (["image", "stylesheet", "font", "media"].includes(resourceType)) return;
    const entry = {
      method: readMember(req, "method", String),
      url: readMember(response, "url", String).slice(0, 500),
      status: readMember(response, "status", Number),
      timestamp: (/* @__PURE__ */ new Date()).toISOString()
    };
    const status = entry.status;
    if (status >= 400) {
      try {
        const body = await response.text();
        entry.responseBody = body.slice(0, 500);
      } catch {
        // Body may be unavailable; the error entry is still recorded.
      }
      errors.push({
        method: entry.method,
        url: entry.url,
        status,
        statusText: readMember(response, "statusText", (v) => String(v ?? "")),
        // Milliseconds since the capture was started.
        timestamp: Date.now() - startTimestamp
      });
    }
    if (["POST", "PUT", "PATCH"].includes(entry.method)) {
      try {
        const postData = readMember(req, "postData");
        if (postData) entry.requestBody = String(postData).slice(0, 500);
      } catch {
      }
    }
    requests.push(entry);
  };
  const onRequestFailed = (req) => {
    if (!active) return;
    const url = readMember(req, "url", (v) => String(v ?? ""));
    const method = readMember(req, "method", (v) => String(v ?? "GET"));
    const failure = readMember(req, "failure");
    errors.push({
      method,
      url: url.slice(0, 500),
      status: 0,
      statusText: failure?.errorText ?? "Request failed",
      timestamp: Date.now() - startTimestamp
    });
  };
  // `supported` turns false once `on` throws; `attached` prevents duplicate registration.
  const listeners = [
    { event: "response", handler: onResponse, supported: true, attached: false },
    { event: "requestfailed", handler: onRequestFailed, supported: true, attached: false }
  ];
  return {
    start() {
      active = true;
      requests.length = 0;
      errors.length = 0;
      startTimestamp = Date.now();
      for (const listener of listeners) {
        if (!listener.supported || listener.attached) continue;
        try {
          page.on(listener.event, listener.handler);
          listener.attached = true;
        } catch {
          listener.supported = false;
        }
      }
    },
    stop() {
      active = false;
      for (const listener of listeners) {
        if (!listener.attached) continue;
        try {
          page.off(listener.event, listener.handler);
          listener.attached = false;
        } catch {
          // Could not detach: the handler stays registered but is inert while inactive.
        }
      }
    },
    getRequests: () => [...requests],
    getErrors: () => [...errors]
  };
}
|
|
108
506
|
/**
 * Logs in through a login form: opens `auth.loginUrl`, fills the credentials,
 * submits the form and waits for the network to settle.
 *
 * When a Stagehand instance is supplied it is asked to click the submit
 * button; if that attempt fails, the selector-based `clickSubmitButton` is
 * tried as a best-effort fallback so the form is not silently left
 * unsubmitted. Without Stagehand, `clickSubmitButton` is used directly and
 * its errors propagate.
 *
 * @param {object} page
 * @param {{loginUrl: string}} auth
 * @param {object} [stagehand]
 * @returns {Promise<void>}
 */
async function performFormLogin(page, auth, stagehand) {
  await page.goto(auth.loginUrl, { waitUntil: "domcontentloaded" });
  await page.waitForLoadState("networkidle", 15e3).catch(() => {
  });
  await fillLoginCredentials(page, auth);
  if (stagehand) {
    try {
      await stagehand.act(
        "Click the login, sign-in, or submit button to submit the form."
      );
    } catch {
      // Stagehand could not click: fall back to the selector-based click. This path
      // stays best-effort (never throws), matching the previous swallow behavior.
      await clickSubmitButton(page).catch(() => {
      });
    }
  } else {
    await clickSubmitButton(page);
  }
  await page.waitForLoadState("networkidle", 15e3).catch(() => {
  });
}
|
|
122
|
-
async function
|
|
522
|
+
async function fillLoginCredentials(page, auth) {
|
|
123
523
|
await page.waitForSelector(
|
|
124
524
|
'input[type="email"], input[type="text"][name*="email"], input[name*="user"], input[type="text"]',
|
|
125
525
|
{ timeout: 15e3 }
|
|
@@ -153,6 +553,8 @@ async function manualFormLogin(page, auth) {
|
|
|
153
553
|
} else {
|
|
154
554
|
throw new Error("Could not find password input on login page");
|
|
155
555
|
}
|
|
556
|
+
}
|
|
557
|
+
async function clickSubmitButton(page) {
|
|
156
558
|
const submitSelectors = [
|
|
157
559
|
'button[type="submit"]',
|
|
158
560
|
'input[type="submit"]'
|
|
@@ -177,21 +579,23 @@ async function generateRunSummary(anthropic, testTitle, steps, model) {
|
|
|
177
579
|
(s) => `Step ${s.stepNumber}: ${s.action}
|
|
178
580
|
Expected: ${s.expectedResult}
|
|
179
581
|
Actual: ${s.actualResult}
|
|
180
|
-
Result: ${s.passed ? "PASS" : "FAIL"} (confidence: ${Math.round(s.confidence * 100)}%)${s.error ? `
|
|
582
|
+
Result: ${s.skipped ? "SKIPPED" : s.passed ? "PASS" : "FAIL"} (confidence: ${Math.round(s.confidence * 100)}%)${s.error ? `
|
|
181
583
|
Error: ${s.error}` : ""}`
|
|
182
584
|
).join("\n\n");
|
|
183
|
-
const passCount = steps.filter((s) => s.passed).length;
|
|
184
|
-
const failCount = steps.filter((s) => !s.passed).length;
|
|
585
|
+
const passCount = steps.filter((s) => s.passed && !s.skipped).length;
|
|
586
|
+
const failCount = steps.filter((s) => !s.passed && !s.skipped).length;
|
|
587
|
+
const skipCount = steps.filter((s) => s.skipped).length;
|
|
588
|
+
const skipNote = skipCount > 0 ? " Some steps were skipped due to page state recovery \u2014 these are not failures, just steps that could not be executed." : "";
|
|
185
589
|
const response = await anthropic.messages.create({
|
|
186
590
|
model,
|
|
187
591
|
max_tokens: 512,
|
|
188
592
|
messages: [
|
|
189
593
|
{
|
|
190
594
|
role: "user",
|
|
191
|
-
content: `Summarize this AI test execution in 2-3 sentences. Focus on what was tested, what passed, and what failed (if anything)
|
|
595
|
+
content: `Summarize this AI test execution in 2-3 sentences. Focus on what was tested, what passed, and what failed (if anything).${skipNote} Be concise and factual.
|
|
192
596
|
|
|
193
597
|
Test: ${testTitle}
|
|
194
|
-
Results: ${passCount} passed, ${failCount} failed out of ${steps.length} steps
|
|
598
|
+
Results: ${passCount} passed, ${failCount} failed, ${skipCount} skipped out of ${steps.length} steps
|
|
195
599
|
|
|
196
600
|
${stepsText}`
|
|
197
601
|
}
|
|
@@ -200,7 +604,355 @@ ${stepsText}`
|
|
|
200
604
|
return response.content.filter((block) => block.type === "text").map((block) => block.text).join("");
|
|
201
605
|
}
|
|
202
606
|
|
|
607
|
+
// src/vision-evaluator.ts
|
|
608
|
+
var DEFAULT_MODEL2 = "claude-sonnet-4-20250514";
|
|
609
|
+
/**
 * Asks the vision model whether a test step achieved its expected result by
 * comparing a BEFORE and an AFTER screenshot.
 *
 * @param {object} input `{ anthropic, screenshotBefore, screenshotAfter, action, expectedResult, evaluationHint?, model? }`.
 *                       Screenshots must support `toString("base64")`.
 * @returns {Promise<object>} Result of `parseEvaluation` on the concatenated text blocks.
 */
async function evaluateStep(input) {
  const { anthropic, action, expectedResult, evaluationHint, screenshotBefore, screenshotAfter } = input;
  const textBlock = (text) => ({ type: "text", text });
  const pngBlock = (shot) => ({
    type: "image",
    source: {
      type: "base64",
      media_type: "image/png",
      data: shot.toString("base64")
    }
  });
  let hintClause = "";
  if (evaluationHint) {
    hintClause = `
EVALUATION HINT: ${evaluationHint}`;
  }
  const instructions = `You are a QA test evaluator. Compare the BEFORE and AFTER screenshots to evaluate this test step.

ACTION PERFORMED: ${action}
EXPECTED RESULT: ${expectedResult}${hintClause}

Analyze the visual differences between the two screenshots and determine if the expected result was achieved.

Respond with ONLY a JSON object (no markdown, no explanation outside the JSON):
{
"passed": true/false,
"confidence": 0.0-1.0,
"actualResult": "Brief description of what actually changed between the screenshots"
}

Confidence guide:
- 0.95-1.0: Clearly achieved/not achieved, obvious visual evidence
- 0.8-0.94: Very likely, strong visual indicators
- 0.6-0.79: Probable but some ambiguity
- Below 0.6: Uncertain, hard to tell from screenshots alone`;
  const response = await anthropic.messages.create({
    model: input.model ?? DEFAULT_MODEL2,
    max_tokens: 512,
    messages: [
      {
        role: "user",
        content: [
          textBlock("BEFORE screenshot (page state before the action):"),
          pngBlock(screenshotBefore),
          textBlock("AFTER screenshot (page state after the action):"),
          pngBlock(screenshotAfter),
          textBlock(instructions)
        ]
      }
    ]
  });
  // Only text blocks carry the verdict; other block types are ignored.
  let reply = "";
  for (const block of response.content) {
    if (block.type === "text") reply += block.text;
  }
  return parseEvaluation(reply);
}
|
|
673
|
+
/**
 * Parses the model's reply into `{ passed, confidence, actualResult }`.
 *
 * The whole reply is tried as JSON first; if that fails, the span from the
 * first "{" to the last "}" is tried, which handles code fences and
 * surrounding prose regardless of key order. Anything that does not yield a
 * JSON object produces a failing, low-confidence result quoting the reply.
 *
 * @param {string} text Raw text returned by the model.
 * @returns {{passed: boolean, confidence: number, actualResult: string}}
 */
function parseEvaluation(text) {
  const candidates = [text.trim()];
  const open = text.indexOf("{");
  const close = text.lastIndexOf("}");
  if (open !== -1 && close > open) {
    candidates.push(text.slice(open, close + 1));
  }
  for (const candidate of candidates) {
    try {
      const parsed = JSON.parse(candidate);
      if (parsed !== null && typeof parsed === "object" && !Array.isArray(parsed)) {
        return validateEvaluation(parsed);
      }
    } catch {
      // Not valid JSON; try the next candidate.
    }
  }
  return {
    passed: false,
    confidence: 0.3,
    actualResult: `Vision evaluation returned unparseable response: ${text.slice(0, 200)}`
  };
}
/**
 * Coerces a parsed reply into a well-typed evaluation.
 *
 * Missing or mistyped fields get defaults: `passed` false, `confidence` 0.5,
 * `actualResult` "No description provided". Confidence is clamped to [0, 1].
 *
 * @param {object|null|undefined} parsed
 * @returns {{passed: boolean, confidence: number, actualResult: string}}
 */
function validateEvaluation(parsed) {
  const source = parsed ?? {};
  const { passed, confidence, actualResult } = source;
  // NaN has typeof "number" but cannot be clamped, so it takes the default.
  const hasConfidence = typeof confidence === "number" && !Number.isNaN(confidence);
  return {
    passed: typeof passed === "boolean" ? passed : false,
    confidence: hasConfidence ? Math.max(0, Math.min(1, confidence)) : 0.5,
    actualResult: typeof actualResult === "string" ? actualResult : "No description provided"
  };
}
|
|
700
|
+
|
|
701
|
+
// src/action-executor.ts
|
|
702
|
+
/**
 * Runs a test step. Steps that carry both `selector` and `actionType` are
 * executed deterministically first; if that throws, or the step has no
 * selector/actionType, the step is delegated to Stagehand.
 *
 * @param {object} page
 * @param {object} stagehand
 * @param {object} step
 * @returns {Promise<{deterministic: boolean, error?: string}>}
 *   `error` is set only when the Stagehand attempt failed; after a failed
 *   deterministic attempt it combines both failure messages.
 */
async function executeAction(page, stagehand, step) {
  const hasDeterministicPlan = Boolean(step.selector && step.actionType);
  if (!hasDeterministicPlan) {
    return executeStagehandAction(stagehand, step);
  }
  try {
    await executePlaywrightAction(page, step);
    return { deterministic: true };
  } catch (playwrightFailure) {
    const { error: fallbackError } = await executeStagehandAction(stagehand, step);
    let error = void 0;
    if (fallbackError) {
      const reason = playwrightFailure instanceof Error ? playwrightFailure.message : String(playwrightFailure);
      error = `Playwright failed (${reason}), Stagehand fallback also failed: ${fallbackError}`;
    }
    return { deterministic: false, error };
  }
}
|
|
717
|
+
/**
 * Executes one step directly against the page, without AI.
 *
 * Supported `actionType` values: click, fill, select, navigate, scroll,
 * wait and assert (a no-op here). Any other value throws. For every type
 * except "wait", a truthy `waitMs` adds a trailing pause.
 *
 * @param {object} page
 * @param {{actionType: string, selector?: string, value?: string, waitMs?: number}} step
 * @returns {Promise<void>}
 * @throws {Error} On an unknown actionType, a navigate without URL, or any page failure.
 */
async function executePlaywrightAction(page, step) {
  const { actionType, selector, value, waitMs } = step;
  const handlers = new Map([
    ["click", () => page.locator(selector).click()],
    ["fill", () => page.locator(selector).fill(value ?? "")],
    ["select", () => page.evaluate(
      // Runs in the page: set the value and notify listeners.
      ({ sel, val }) => {
        const el = document.querySelector(sel);
        if (!el) throw new Error(`Select element not found: ${sel}`);
        el.value = val;
        el.dispatchEvent(new Event("change", { bubbles: true }));
      },
      { sel: selector, val: value ?? "" }
    )],
    ["navigate", async () => {
      // The URL may be supplied through either field.
      const url = value ?? selector ?? "";
      if (!url) throw new Error("Navigate action requires a value or selector with the URL");
      await page.goto(url, { waitUntil: "domcontentloaded", timeoutMs: 15e3 });
    }],
    ["scroll", () => page.evaluate((sel) => {
      const el = document.querySelector(sel);
      if (el) el.scrollIntoView({ behavior: "smooth", block: "center" });
    }, selector)],
    ["wait", async () => {
      if (selector) {
        await page.waitForSelector(selector, { timeout: waitMs ?? 1e4 });
      } else if (waitMs) {
        await page.waitForTimeout(waitMs);
      }
    }],
    ["assert", () => {
    }]
  ]);
  const run = handlers.get(actionType);
  if (!run) {
    throw new Error(`Unknown actionType: ${actionType}`);
  }
  await run();
  if (waitMs && actionType !== "wait") {
    await page.waitForTimeout(waitMs);
  }
}
|
|
774
|
+
/**
 * Delegates the step's natural-language `action` to `stagehand.act`.
 * Never throws: a failure is reported through the `error` field.
 *
 * @param {object} stagehand Object exposing `act(instruction)`.
 * @param {{action: string}} step
 * @returns {Promise<{deterministic: false, error?: string}>}
 */
async function executeStagehandAction(stagehand, step) {
  const outcome = { deterministic: false };
  try {
    await stagehand.act(step.action);
  } catch (failure) {
    outcome.error = failure instanceof Error ? failure.message : String(failure);
  }
  return outcome;
}
|
|
785
|
+
|
|
786
|
+
// src/selector-discovery.ts
|
|
787
|
+
/**
 * Derives a reusable selector for the element the page last interacted with
 * (`document.__bbLastClicked`, falling back to `document.activeElement`).
 *
 * Strategy priority: test-id attribute, stable id, role (+ aria-label or
 * name), aria-label, then a short CSS path of up to four ancestors. The
 * selector always uses the attribute the value was read from, and
 * attribute values / identifiers are escaped so the selector stays valid.
 *
 * @param {object} page Page exposing `evaluate(fn)`.
 * @returns {Promise<{selector: string, strategy: string, suggestedActionType: (string|undefined), tagName: string, textContent: string}|null>}
 *   `null` when there is no usable element or evaluation fails.
 */
async function discoverSelector(page) {
  try {
    return await page.evaluate(() => {
      const el = document.__bbLastClicked ?? document.activeElement;
      if (!el || el === document.body || el === document.documentElement) return null;
      // Helpers live inside the callback because it is executed in the page.
      const quote = (raw) => `"${String(raw).replace(/\\/g, "\\\\").replace(/"/g, '\\"')}"`;
      const escapeIdent = (raw) => typeof CSS !== "undefined" && typeof CSS.escape === "function" ? CSS.escape(raw) : raw;
      const tagName = el.tagName?.toLowerCase() ?? "unknown";
      const textContent = (el.textContent ?? "").trim().slice(0, 100);
      const role = el.getAttribute("role");
      const ariaLabel = el.getAttribute("aria-label");
      // Remember WHICH test-id attribute matched so the selector targets that attribute.
      const testIdAttr = ["data-testid", "data-test-id"].find((attr) => el.getAttribute(attr));
      let selector = "";
      let strategy = "css-path";
      if (testIdAttr) {
        selector = `[${testIdAttr}=${quote(el.getAttribute(testIdAttr))}]`;
        strategy = "data-testid";
      } else if (el.id && !/^:r[0-9a-z]+:?$/.test(el.id) && !/^react-/.test(el.id)) {
        // Framework-generated ids (":r1:", "react-…") are skipped by the guard above.
        selector = `#${escapeIdent(el.id)}`;
        strategy = "id";
      } else if (role) {
        const nameAttr = el.getAttribute("name");
        if (ariaLabel) {
          selector = `[role=${quote(role)}][aria-label=${quote(ariaLabel)}]`;
        } else if (nameAttr) {
          selector = `[role=${quote(role)}][name=${quote(nameAttr)}]`;
        } else {
          selector = `[role=${quote(role)}]`;
        }
        strategy = "role";
      } else if (ariaLabel) {
        selector = `[aria-label=${quote(ariaLabel)}]`;
        strategy = "aria-label";
      } else {
        const parts = [];
        let current = el;
        while (current && current !== document.body) {
          let part = current.tagName.toLowerCase();
          if (current.className && typeof current.className === "string") {
            // Skip underscore-prefixed and very long class names.
            const classes = current.className.split(/\s+/).filter(
              (c) => c && !c.startsWith("_") && c.length < 30
            );
            if (classes.length > 0) {
              part += `.${escapeIdent(classes[0])}`;
            }
          }
          parts.unshift(part);
          current = current.parentElement;
          if (parts.length >= 4) break;
        }
        selector = parts.join(" > ");
        strategy = "css-path";
      }
      let suggestedActionType;
      if (tagName === "button" || tagName === "a" || role === "button") {
        suggestedActionType = "click";
      } else if (tagName === "input" || tagName === "textarea") {
        const type = el.getAttribute("type") ?? "text";
        suggestedActionType = type === "checkbox" || type === "radio" ? "click" : "fill";
      } else if (tagName === "select") {
        suggestedActionType = "select";
      }
      return { selector, strategy, suggestedActionType, tagName, textContent };
    });
  } catch {
    return null;
  }
}
|
|
856
|
+
/**
 * Registers a capture-phase click listener in the page that stores the most
 * recently clicked element on `document.__bbLastClicked`.
 *
 * Installation is best effort: any error raised by `page.evaluate` is ignored
 * and the function still resolves.
 *
 * @param {object} page - Page handle exposing `evaluate(fn)`.
 * @returns {Promise<void>}
 */
async function installClickTracker(page) {
  // Must stay self-contained: it is handed to page.evaluate and runs in the page.
  const trackClicksInPage = () => {
    const rememberTarget = (event) => {
      document.__bbLastClicked = event.target;
    };
    document.addEventListener("click", rememberTarget, { capture: true });
  };
  try {
    await page.evaluate(trackClicksInPage);
  } catch {
    // Deliberately ignored: click tracking is optional.
  }
}
|
|
866
|
+
|
|
867
|
+
// src/cost.ts
|
|
868
|
+
/**
 * Price table keyed by model id or short alias. `estimateCost` divides token
 * counts by 1e6 before multiplying by these rates and treats the result as
 * dollars, i.e. the values are dollars per million tokens.
 */
var MODEL_PRICING = {
  "claude-sonnet-4-20250514": { input: 3, output: 15 },
  "claude-haiku-4-20250514": { input: 0.8, output: 4 },
  "claude-opus-4-20250514": { input: 15, output: 75 },
  // Aliases
  "sonnet": { input: 3, output: 15 },
  "haiku": { input: 0.8, output: 4 },
  "opus": { input: 15, output: 75 }
};
/** Model assumed when the caller passes none, and the pricing fallback for unknown ids. */
var DEFAULT_MODEL3 = "claude-sonnet-4-20250514";
/** Fixed per-operation token figures used by the estimators below. */
var TOKEN_PROFILE = {
  /** act() — screenshot + DOM context → action decision */
  actInput: 2e3,
  actOutput: 200,
  /** extract() — screenshot + extraction schema → structured result */
  extractInput: 3e3,
  extractOutput: 500,
  /** summary — all step results → narrative summary (once per run) */
  summaryInput: 2e3,
  summaryOutput: 500
};
/**
 * Finds the price entry for a model id.
 *
 * A leading "anthropic/" provider prefix is ignored so prefixed ids resolve to
 * the same entry as the bare id. Only own keys of the table are considered, so
 * names such as "constructor" cannot resolve to inherited members. Unknown ids
 * fall back to the default model's pricing.
 *
 * @param {string} model - Model id or alias.
 * @returns {{input: number, output: number}} Matching price entry.
 */
function lookupModelPricing(model) {
  const key = String(model).replace(/^anthropic\//, "");
  return Object.prototype.hasOwnProperty.call(MODEL_PRICING, key) ? MODEL_PRICING[key] : MODEL_PRICING[DEFAULT_MODEL3];
}
/**
 * Converts token counts into a cost estimate.
 *
 * @param {number} inputTokens - Number of input tokens.
 * @param {number} outputTokens - Number of output tokens.
 * @param {string} [model] - Model id or alias; defaults to DEFAULT_MODEL3.
 * @returns {{cents: number, formatted: string, tokens: {inputTokens: number, outputTokens: number}, model: string}}
 *   `cents` is rounded to two decimals, `formatted` is the dollar amount with
 *   four decimals, and `model` echoes the id as supplied (or the default).
 */
function estimateCost(inputTokens, outputTokens, model) {
  const resolvedModel = model ?? DEFAULT_MODEL3;
  const pricing = lookupModelPricing(resolvedModel);
  const inputCost = inputTokens / 1e6 * pricing.input;
  const outputCost = outputTokens / 1e6 * pricing.output;
  const totalDollars = inputCost + outputCost;
  const cents = Math.round(totalDollars * 100 * 100) / 100;
  return {
    cents,
    formatted: `$${totalDollars.toFixed(4)}`,
    tokens: { inputTokens, outputTokens },
    model: resolvedModel
  };
}
/**
 * Estimates the cost of one test with `stepCount` steps.
 *
 * @param {number} stepCount - Number of steps in the test.
 * @param {string} [model] - Model id or alias.
 * @returns {ReturnType<typeof estimateCost>} Cost estimate.
 */
function estimateTestCost(stepCount, model) {
  const { inputTokens, outputTokens } = getTokenEstimate(stepCount);
  return estimateCost(inputTokens, outputTokens, model);
}
/**
 * Estimates the combined cost of several tests; each test contributes its own
 * per-step tokens plus one summary.
 *
 * @param {Array<{stepCount: number}>} testCases - Tests to price.
 * @param {string} [model] - Model id or alias.
 * @returns {ReturnType<typeof estimateCost>} Cost estimate for the whole batch.
 */
function estimateBatchCost(testCases, model) {
  let totalInput = 0;
  let totalOutput = 0;
  for (const tc of testCases) {
    const perTest = getTokenEstimate(tc.stepCount);
    totalInput += perTest.inputTokens;
    totalOutput += perTest.outputTokens;
  }
  return estimateCost(totalInput, totalOutput, model);
}
/**
 * Token estimate for one test: (act + extract) per step, plus one summary.
 *
 * @param {number} stepCount - Number of steps in the test.
 * @returns {{inputTokens: number, outputTokens: number}} Estimated token counts.
 */
function getTokenEstimate(stepCount) {
  return {
    inputTokens: stepCount * (TOKEN_PROFILE.actInput + TOKEN_PROFILE.extractInput) + TOKEN_PROFILE.summaryInput,
    outputTokens: stepCount * (TOKEN_PROFILE.actOutput + TOKEN_PROFILE.extractOutput) + TOKEN_PROFILE.summaryOutput
  };
}
|
|
923
|
+
|
|
203
924
|
// src/runner.ts
|
|
925
|
+
// Time limit (ms) passed to withTimeout for the vision evaluation in runTest.
var AI_OPERATION_TIMEOUT_MS = 30 * 1000;
// Retry count used when config.retry.maxRetries is not provided.
var DEFAULT_MAX_RETRIES = 2;
// Pause (ms) between attempts when config.retry.retryDelayMs is not provided.
var DEFAULT_RETRY_DELAY_MS = 2 * 1000;
|
|
928
|
+
/**
 * Tells whether an error message matches one of the known transient-failure
 * signatures (timeouts, connection errors, navigation or page/session loss).
 *
 * @param {string} error - Error message text to inspect.
 * @returns {boolean} True when at least one signature matches.
 */
function isRetryableError(error) {
  const transientSignatures = [
    /timed?\s*out/i,
    /ECONNREFUSED/i,
    /ECONNRESET/i,
    /ENOTFOUND/i,
    /net::ERR_/i,
    /navigation failed/i,
    /page crashed/i,
    /context was destroyed/i,
    /target closed/i,
    /session closed/i,
    /browser disconnected/i,
    /execution context/i
  ];
  for (const signature of transientSignatures) {
    if (signature.test(error)) {
      return true;
    }
  }
  return false;
}
|
|
945
|
+
/**
 * Races `promise` against a timer. Settles like `promise` if it finishes
 * first; otherwise rejects with "<operation> timed out after <timeoutMs>ms".
 * The timer is cleared once the race settles, whichever side wins.
 *
 * @param {Promise<*>} promise - Work to wait for.
 * @param {number} timeoutMs - Time limit in milliseconds.
 * @param {string} operation - Label used in the timeout error message.
 * @returns {Promise<*>} The settled value of `promise`.
 */
async function withTimeout(promise, timeoutMs, operation) {
  let timer;
  const deadline = new Promise((_resolve, reject) => {
    timer = setTimeout(() => {
      reject(new Error(`${operation} timed out after ${timeoutMs}ms`));
    }, timeoutMs);
  });
  return Promise.race([promise, deadline]).finally(() => clearTimeout(timer));
}
|
|
204
956
|
async function runTest(config) {
|
|
205
957
|
const anthropic = new import_sdk.default({ apiKey: config.anthropicApiKey });
|
|
206
958
|
const startTime = Date.now();
|
|
@@ -209,60 +961,71 @@ async function runTest(config) {
|
|
|
209
961
|
headless: true
|
|
210
962
|
};
|
|
211
963
|
config.onStatusChange?.("initializing");
|
|
212
|
-
|
|
213
|
-
const { stagehand, page } = session;
|
|
964
|
+
let session;
|
|
214
965
|
const stepResults = [];
|
|
215
966
|
let pendingConsoleLogs = [];
|
|
216
967
|
let pendingNetworkErrors = [];
|
|
217
968
|
let stepStartTime = Date.now();
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
const
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
pendingNetworkErrors.push({
|
|
234
|
-
method,
|
|
235
|
-
url: url.slice(0, 500),
|
|
236
|
-
status: 0,
|
|
237
|
-
statusText: failure?.errorText ?? "Request failed",
|
|
238
|
-
timestamp: Date.now() - stepStartTime
|
|
239
|
-
});
|
|
240
|
-
});
|
|
241
|
-
rawPage.on("response", (res) => {
|
|
242
|
-
const status = typeof res.status === "function" ? res.status() : Number(res.status ?? 0);
|
|
243
|
-
if (status >= 400) {
|
|
244
|
-
const url = typeof res.url === "function" ? res.url() : String(res.url ?? "");
|
|
245
|
-
const statusText = typeof res.statusText === "function" ? res.statusText() : String(res.statusText ?? "");
|
|
246
|
-
const req = typeof res.request === "function" ? res.request() : res.request;
|
|
247
|
-
const method = req ? typeof req.method === "function" ? req.method() : String(req.method ?? "GET") : "GET";
|
|
248
|
-
pendingNetworkErrors.push({
|
|
249
|
-
method,
|
|
250
|
-
url: url.slice(0, 500),
|
|
251
|
-
status,
|
|
252
|
-
statusText,
|
|
253
|
-
timestamp: Date.now() - stepStartTime
|
|
969
|
+
try {
|
|
970
|
+
session = await createStagehandSession(browserConfig, config.anthropicApiKey);
|
|
971
|
+
const { stagehand, page } = session;
|
|
972
|
+
await suppressBugBearWidget(stagehand);
|
|
973
|
+
const rawPage = page;
|
|
974
|
+
try {
|
|
975
|
+
rawPage.on("console", (msg) => {
|
|
976
|
+
const level = msg.type?.() ?? msg.type ?? "log";
|
|
977
|
+
const mappedLevel = level === "error" ? "error" : level === "warn" || level === "warning" ? "warning" : level === "info" ? "info" : level === "debug" ? "debug" : "log";
|
|
978
|
+
pendingConsoleLogs.push({
|
|
979
|
+
level: mappedLevel,
|
|
980
|
+
text: (typeof msg.text === "function" ? msg.text() : String(msg.text ?? msg)).slice(0, 2e3),
|
|
981
|
+
source: typeof msg.location === "function" ? msg.location()?.url : void 0,
|
|
982
|
+
timestamp: Date.now() - stepStartTime
|
|
983
|
+
});
|
|
254
984
|
});
|
|
985
|
+
} catch {
|
|
255
986
|
}
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
987
|
+
try {
|
|
988
|
+
rawPage.on("requestfailed", (req) => {
|
|
989
|
+
const url = typeof req.url === "function" ? req.url() : String(req.url ?? "");
|
|
990
|
+
const method = typeof req.method === "function" ? req.method() : String(req.method ?? "GET");
|
|
991
|
+
const failure = typeof req.failure === "function" ? req.failure() : req.failure;
|
|
992
|
+
pendingNetworkErrors.push({
|
|
993
|
+
method,
|
|
994
|
+
url: url.slice(0, 500),
|
|
995
|
+
status: 0,
|
|
996
|
+
statusText: failure?.errorText ?? "Request failed",
|
|
997
|
+
timestamp: Date.now() - stepStartTime
|
|
998
|
+
});
|
|
999
|
+
});
|
|
1000
|
+
} catch {
|
|
1001
|
+
}
|
|
1002
|
+
try {
|
|
1003
|
+
rawPage.on("response", (res) => {
|
|
1004
|
+
const status = typeof res.status === "function" ? res.status() : Number(res.status ?? 0);
|
|
1005
|
+
if (status >= 400) {
|
|
1006
|
+
const url = typeof res.url === "function" ? res.url() : String(res.url ?? "");
|
|
1007
|
+
const statusText = typeof res.statusText === "function" ? res.statusText() : String(res.statusText ?? "");
|
|
1008
|
+
const req = typeof res.request === "function" ? res.request() : res.request;
|
|
1009
|
+
const method = req ? typeof req.method === "function" ? req.method() : String(req.method ?? "GET") : "GET";
|
|
1010
|
+
pendingNetworkErrors.push({
|
|
1011
|
+
method,
|
|
1012
|
+
url: url.slice(0, 500),
|
|
1013
|
+
status,
|
|
1014
|
+
statusText,
|
|
1015
|
+
timestamp: Date.now() - stepStartTime
|
|
1016
|
+
});
|
|
1017
|
+
}
|
|
1018
|
+
});
|
|
1019
|
+
} catch {
|
|
1020
|
+
}
|
|
1021
|
+
if (config.auth?.type === "form-login" || config.auth?.type === "supabase-native") {
|
|
259
1022
|
config.onStatusChange?.("authenticating");
|
|
260
1023
|
await injectAuth(page, config.auth, stagehand);
|
|
261
1024
|
}
|
|
262
1025
|
config.onStatusChange?.("navigating");
|
|
263
1026
|
const targetUrl = config.testCase.targetRoute ? `${config.targetUrl.replace(/\/$/, "")}${config.testCase.targetRoute}` : config.targetUrl;
|
|
264
1027
|
await page.goto(targetUrl, { waitUntil: "domcontentloaded", timeoutMs: 3e4 });
|
|
265
|
-
if (config.auth && config.auth.type !== "form-login") {
|
|
1028
|
+
if (config.auth && config.auth.type !== "form-login" && config.auth.type !== "supabase-native") {
|
|
266
1029
|
config.onStatusChange?.("authenticating");
|
|
267
1030
|
await injectAuth(page, config.auth, stagehand);
|
|
268
1031
|
if (config.auth.type === "localStorage") {
|
|
@@ -276,79 +1039,143 @@ async function runTest(config) {
|
|
|
276
1039
|
}
|
|
277
1040
|
await page.waitForLoadState("networkidle").catch(() => {
|
|
278
1041
|
});
|
|
1042
|
+
await page.evaluate(() => {
|
|
1043
|
+
window.__bugbear_suppress = true;
|
|
1044
|
+
try {
|
|
1045
|
+
localStorage.setItem("__bugbear_suppress", "true");
|
|
1046
|
+
} catch {
|
|
1047
|
+
}
|
|
1048
|
+
}).catch(() => {
|
|
1049
|
+
});
|
|
1050
|
+
await installClickTracker(page);
|
|
279
1051
|
pendingConsoleLogs = [];
|
|
280
1052
|
pendingNetworkErrors = [];
|
|
281
1053
|
config.onStatusChange?.("executing");
|
|
282
1054
|
const steps = config.testCase.steps;
|
|
1055
|
+
const maxRetries = config.retry?.maxRetries ?? DEFAULT_MAX_RETRIES;
|
|
1056
|
+
const retryDelayMs = config.retry?.retryDelayMs ?? DEFAULT_RETRY_DELAY_MS;
|
|
1057
|
+
const resilientMode = config.resilientMode ?? true;
|
|
283
1058
|
for (let i = 0; i < steps.length; i++) {
|
|
284
1059
|
const step = steps[i];
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
actSucceeded =
|
|
295
|
-
await page
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
1060
|
+
const retryHistory = [];
|
|
1061
|
+
let finalResult;
|
|
1062
|
+
for (let attempt = 0; attempt <= maxRetries; attempt++) {
|
|
1063
|
+
stepStartTime = Date.now();
|
|
1064
|
+
pendingConsoleLogs = [];
|
|
1065
|
+
pendingNetworkErrors = [];
|
|
1066
|
+
const screenshotBefore = await page.screenshot({ type: "png" });
|
|
1067
|
+
let error;
|
|
1068
|
+
let screenshotAfter = screenshotBefore;
|
|
1069
|
+
let actSucceeded = false;
|
|
1070
|
+
const actionResult = await executeAction(page, stagehand, step);
|
|
1071
|
+
error = actionResult.error;
|
|
1072
|
+
actSucceeded = !error;
|
|
1073
|
+
if (actSucceeded) {
|
|
1074
|
+
await page.waitForLoadState("networkidle").catch(() => {
|
|
1075
|
+
});
|
|
1076
|
+
await page.waitForTimeout(step.waitMs ?? 500);
|
|
1077
|
+
}
|
|
301
1078
|
screenshotAfter = await page.screenshot({ type: "png" }).catch(() => screenshotBefore);
|
|
1079
|
+
let evaluation = {
|
|
1080
|
+
passed: false,
|
|
1081
|
+
confidence: 0,
|
|
1082
|
+
actualResult: error ?? "Action execution failed"
|
|
1083
|
+
};
|
|
1084
|
+
if (actSucceeded) {
|
|
1085
|
+
try {
|
|
1086
|
+
const visionResult = await withTimeout(
|
|
1087
|
+
evaluateStep({
|
|
1088
|
+
anthropic,
|
|
1089
|
+
screenshotBefore,
|
|
1090
|
+
screenshotAfter,
|
|
1091
|
+
action: step.action,
|
|
1092
|
+
expectedResult: step.expectedResult,
|
|
1093
|
+
evaluationHint: step.evaluationHint,
|
|
1094
|
+
model: config.model
|
|
1095
|
+
}),
|
|
1096
|
+
AI_OPERATION_TIMEOUT_MS,
|
|
1097
|
+
"Vision evaluation"
|
|
1098
|
+
);
|
|
1099
|
+
evaluation = {
|
|
1100
|
+
passed: visionResult.passed,
|
|
1101
|
+
confidence: visionResult.confidence,
|
|
1102
|
+
actualResult: visionResult.actualResult
|
|
1103
|
+
};
|
|
1104
|
+
} catch (evalErr) {
|
|
1105
|
+
evaluation = {
|
|
1106
|
+
passed: false,
|
|
1107
|
+
confidence: 0.2,
|
|
1108
|
+
actualResult: `Vision evaluation error: ${evalErr instanceof Error ? evalErr.message : String(evalErr)}`
|
|
1109
|
+
};
|
|
1110
|
+
}
|
|
1111
|
+
}
|
|
1112
|
+
let discoveredActions = [];
|
|
1113
|
+
if (actSucceeded && !actionResult.deterministic) {
|
|
1114
|
+
const discovered = await discoverSelector(page);
|
|
1115
|
+
if (discovered) {
|
|
1116
|
+
discoveredActions = [{
|
|
1117
|
+
type: discovered.suggestedActionType ?? "click",
|
|
1118
|
+
selector: discovered.selector,
|
|
1119
|
+
description: `Discovered via ${discovered.strategy}: ${discovered.tagName}${discovered.textContent ? ` "${discovered.textContent.slice(0, 50)}"` : ""}`
|
|
1120
|
+
}];
|
|
1121
|
+
}
|
|
1122
|
+
}
|
|
1123
|
+
const consoleLogs = pendingConsoleLogs.slice(0, 50);
|
|
1124
|
+
const networkErrors = pendingNetworkErrors.slice(0, 30);
|
|
1125
|
+
finalResult = {
|
|
1126
|
+
stepNumber: step.stepNumber,
|
|
1127
|
+
action: step.action,
|
|
1128
|
+
expectedResult: step.expectedResult,
|
|
1129
|
+
actualResult: evaluation.actualResult,
|
|
1130
|
+
passed: evaluation.passed,
|
|
1131
|
+
confidence: evaluation.confidence,
|
|
1132
|
+
screenshotBefore,
|
|
1133
|
+
screenshotAfter,
|
|
1134
|
+
actionsTaken: discoveredActions,
|
|
1135
|
+
error,
|
|
1136
|
+
durationMs: Date.now() - stepStartTime,
|
|
1137
|
+
consoleLogs,
|
|
1138
|
+
networkErrors,
|
|
1139
|
+
retryCount: attempt,
|
|
1140
|
+
retryHistory,
|
|
1141
|
+
skipped: false
|
|
1142
|
+
};
|
|
1143
|
+
const shouldRetry = !evaluation.passed && error && isRetryableError(error) && attempt < maxRetries;
|
|
1144
|
+
if (!shouldRetry) break;
|
|
1145
|
+
retryHistory.push({
|
|
1146
|
+
attempt,
|
|
1147
|
+
error,
|
|
1148
|
+
confidence: evaluation.confidence,
|
|
1149
|
+
timestamp: Date.now()
|
|
1150
|
+
});
|
|
1151
|
+
await new Promise((resolve) => setTimeout(resolve, retryDelayMs));
|
|
302
1152
|
}
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
actualResult: error ?? "Action execution failed"
|
|
307
|
-
};
|
|
308
|
-
if (actSucceeded) {
|
|
1153
|
+
if (resilientMode && finalResult && !finalResult.passed) {
|
|
1154
|
+
finalResult.skipped = true;
|
|
1155
|
+
finalResult.skipReason = "Step failed, recovered page state";
|
|
309
1156
|
try {
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
1157
|
+
config.onStatusChange?.("navigating");
|
|
1158
|
+
await page.goto(targetUrl, { waitUntil: "domcontentloaded", timeoutMs: 3e4 });
|
|
1159
|
+
await page.waitForLoadState("networkidle").catch(() => {
|
|
1160
|
+
});
|
|
1161
|
+
await installClickTracker(page);
|
|
1162
|
+
await page.evaluate(() => {
|
|
1163
|
+
window.__bugbear_suppress = true;
|
|
1164
|
+
try {
|
|
1165
|
+
localStorage.setItem("__bugbear_suppress", "true");
|
|
1166
|
+
} catch {
|
|
1167
|
+
}
|
|
1168
|
+
}).catch(() => {
|
|
314
1169
|
});
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
passed: verification.passed,
|
|
321
|
-
confidence: verification.confidence,
|
|
322
|
-
actualResult: verification.actualResult
|
|
323
|
-
};
|
|
324
|
-
} catch (evalErr) {
|
|
325
|
-
evaluation = {
|
|
326
|
-
passed: false,
|
|
327
|
-
confidence: 0.2,
|
|
328
|
-
actualResult: `Verification error: ${evalErr instanceof Error ? evalErr.message : String(evalErr)}`
|
|
329
|
-
};
|
|
1170
|
+
pendingConsoleLogs = [];
|
|
1171
|
+
pendingNetworkErrors = [];
|
|
1172
|
+
config.onStatusChange?.("executing");
|
|
1173
|
+
} catch (recoveryErr) {
|
|
1174
|
+
finalResult.skipReason = `Step failed, recovery also failed: ${recoveryErr instanceof Error ? recoveryErr.message : String(recoveryErr)}`;
|
|
330
1175
|
}
|
|
331
1176
|
}
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
const result = {
|
|
335
|
-
stepNumber: step.stepNumber,
|
|
336
|
-
action: step.action,
|
|
337
|
-
expectedResult: step.expectedResult,
|
|
338
|
-
actualResult: evaluation.actualResult,
|
|
339
|
-
passed: evaluation.passed,
|
|
340
|
-
confidence: evaluation.confidence,
|
|
341
|
-
screenshotBefore,
|
|
342
|
-
screenshotAfter,
|
|
343
|
-
actionsTaken: [],
|
|
344
|
-
// Stagehand handles actions internally
|
|
345
|
-
error,
|
|
346
|
-
durationMs: Date.now() - stepStartTime,
|
|
347
|
-
consoleLogs,
|
|
348
|
-
networkErrors
|
|
349
|
-
};
|
|
350
|
-
stepResults.push(result);
|
|
351
|
-
config.onStepComplete?.(result, i, steps.length);
|
|
1177
|
+
stepResults.push(finalResult);
|
|
1178
|
+
config.onStepComplete?.(finalResult, i, steps.length);
|
|
352
1179
|
}
|
|
353
1180
|
config.onStatusChange?.("completed");
|
|
354
1181
|
const model = config.model ?? "claude-sonnet-4-20250514";
|
|
@@ -362,11 +1189,7 @@ async function runTest(config) {
|
|
|
362
1189
|
totalDurationMs: Date.now() - startTime,
|
|
363
1190
|
summary,
|
|
364
1191
|
screenshotUrls: [],
|
|
365
|
-
tokenUsage:
|
|
366
|
-
// Stagehand tracks tokens internally; these are approximate
|
|
367
|
-
inputTokens: steps.length * 3e3,
|
|
368
|
-
outputTokens: steps.length * 500
|
|
369
|
-
},
|
|
1192
|
+
tokenUsage: getTokenEstimate(steps.length),
|
|
370
1193
|
browserSessionId: session.sessionId
|
|
371
1194
|
};
|
|
372
1195
|
} catch (err) {
|
|
@@ -378,30 +1201,689 @@ async function runTest(config) {
|
|
|
378
1201
|
totalDurationMs: Date.now() - startTime,
|
|
379
1202
|
summary: `Test execution failed: ${err instanceof Error ? err.message : String(err)}`,
|
|
380
1203
|
screenshotUrls: [],
|
|
381
|
-
tokenUsage:
|
|
382
|
-
|
|
383
|
-
outputTokens: stepResults.length * 500
|
|
384
|
-
},
|
|
385
|
-
browserSessionId: session.sessionId
|
|
1204
|
+
tokenUsage: getTokenEstimate(stepResults.length),
|
|
1205
|
+
browserSessionId: session?.sessionId ?? "unknown"
|
|
386
1206
|
};
|
|
387
1207
|
} finally {
|
|
388
|
-
|
|
1208
|
+
if (session?.page) {
|
|
1209
|
+
const rawPage = session.page;
|
|
1210
|
+
rawPage.removeAllListeners?.("console");
|
|
1211
|
+
rawPage.removeAllListeners?.("requestfailed");
|
|
1212
|
+
rawPage.removeAllListeners?.("response");
|
|
1213
|
+
}
|
|
1214
|
+
await session?.close();
|
|
389
1215
|
}
|
|
390
1216
|
}
|
|
391
1217
|
/**
 * Collapses per-step outcomes into a single verdict.
 *
 * Skipped steps are excluded from the pass/fail decision:
 * - "error": there are no steps, or every step was skipped.
 * - "passed_with_skips": every executed step passed and at least one was skipped.
 * - "passed": every step was executed and passed.
 * - "failed": every executed step failed, or any executed step carries an error.
 * - "partial": a mix of passing and failing executed steps, none with an error.
 *
 * @param {Array<{passed: boolean, skipped?: boolean, error?: string}>} steps - Step results.
 * @returns {"error"|"passed_with_skips"|"passed"|"failed"|"partial"} Overall verdict.
 */
function determineOverallResult(steps) {
  if (steps.length === 0) {
    return "error";
  }
  const executed = [];
  for (const step of steps) {
    if (!step.skipped) {
      executed.push(step);
    }
  }
  if (executed.length === 0) {
    return "error";
  }
  const failing = executed.filter((step) => !step.passed);
  if (failing.length === 0) {
    const anySkipped = executed.length < steps.length;
    return anySkipped ? "passed_with_skips" : "passed";
  }
  const everyExecutedFailed = failing.length === executed.length;
  const anyExecutedErrored = executed.some((step) => step.error);
  return everyExecutedFailed || anyExecutedErrored ? "failed" : "partial";
}
|
|
1229
|
+
|
|
1230
|
+
// src/explorer.ts
|
|
1231
|
+
var import_sdk2 = __toESM(require("@anthropic-ai/sdk"));
|
|
1232
|
+
// Default model for runExploration. The "anthropic/" prefix is stripped before
// the id is passed to the Anthropic client.
var DEFAULT_MODEL4 = "anthropic/claude-sonnet-4-20250514";
// Time limit (ms) applied to each observe/decide/evaluate call in runExploration.
var AI_OPERATION_TIMEOUT_MS2 = 60 * 1000;
|
|
1234
|
+
/**
 * Waits for `promise`, but gives up after `timeoutMs` milliseconds with an
 * Error reading "<operation> timed out after <timeoutMs>ms". The timer is
 * always cleared before the function settles.
 *
 * @param {Promise<*>} promise - Work to wait for.
 * @param {number} timeoutMs - Time limit in milliseconds.
 * @param {string} operation - Label used in the timeout error message.
 * @returns {Promise<*>} The settled value of `promise`.
 */
async function withTimeout2(promise, timeoutMs, operation) {
  let pendingTimer;
  const expiry = new Promise((_resolve, reject) => {
    const fail = () => {
      reject(new Error(`${operation} timed out after ${timeoutMs}ms`));
    };
    pendingTimer = setTimeout(fail, timeoutMs);
  });
  try {
    const settled = await Promise.race([promise, expiry]);
    return settled;
  } finally {
    clearTimeout(pendingTimer);
  }
}
|
|
1245
|
+
/**
 * Runs a budgeted exploratory QA session against `config.targetUrl`.
 *
 * Each iteration observes the page, asks the model for one action, performs it
 * through Stagehand, then asks the model to categorise the outcome. The loop
 * ends when the budget is used up or the model answers "[DONE]" / "no more
 * actions". A report is generated from the recorded actions afterwards.
 *
 * Errors raised after the session is created are not rethrown: they produce a
 * result with `overallResult: "error"`. The session is always closed.
 *
 * @param {object} config
 * @param {string} config.targetUrl - Page to open first.
 * @param {string} config.featureDescription - Feature under exploration, embedded in the prompts.
 * @param {number} config.actionBudget - Maximum number of loop iterations.
 * @param {object} [config.auth] - Passed to injectAuth after the first navigation when set.
 * @param {object} config.browserConfig - Passed to createStagehandSession.
 * @param {string} config.anthropicApiKey - API key for the Anthropic client and the session.
 * @param {string} [config.model] - Model id; a leading "anthropic/" is stripped for API calls.
 * @param {Function} [config.onActionComplete] - Called with (action, index) after each recorded action.
 * @returns {Promise<object>} `{ overallResult, actions, report, totalDurationMs, tokenUsage, browserSessionId }`
 *   where `overallResult` is "findings", "clean" or "error".
 */
async function runExploration(config) {
  const {
    targetUrl,
    featureDescription,
    actionBudget,
    auth,
    browserConfig,
    anthropicApiKey,
    model = DEFAULT_MODEL4,
    onActionComplete
  } = config;
  const anthropic = new import_sdk2.default({ apiKey: anthropicApiKey });
  const startTime = Date.now();
  const actions = [];
  let totalInputTokens = 0;
  let totalOutputTokens = 0;
  const session = await createStagehandSession(browserConfig, anthropicApiKey);
  const { stagehand, page } = session;
  try {
    // Kept inside the try so a failure here still reaches `finally` and closes the session.
    await suppressBugBearWidget(stagehand);
    // Strip the provider prefix once instead of at every call site.
    const apiModel = model.replace("anthropic/", "");
    await page.goto(targetUrl, { waitUntil: "networkidle", timeoutMs: 3e4 });
    if (auth) {
      await injectAuth(page, auth, stagehand);
      await page.waitForLoadState("networkidle").catch(() => {
      });
    }
    const networkCapture = createNetworkCapture(page);
    let consoleLogs = [];
    let actionStartTime = Date.now();
    const rawPage = page;
    rawPage.on("console", (msg) => {
      // `msg.type?.()` would throw when `type` is a plain string, so branch on typeof instead.
      const level = (typeof msg.type === "function" ? msg.type() : msg.type) ?? "log";
      if (["error", "warning", "warn"].includes(level)) {
        consoleLogs.push({
          level: level === "warn" ? "warning" : level,
          text: (typeof msg.text === "function" ? msg.text() : String(msg.text ?? msg)).slice(0, 500),
          source: typeof msg.location === "function" ? msg.location()?.url : void 0,
          timestamp: Date.now() - actionStartTime
        });
      }
    });
    const actionLog = [];
    for (let i = 0; i < actionBudget; i++) {
      // Console output and timings are tracked per action.
      actionStartTime = Date.now();
      consoleLogs = [];
      const observations = await withTimeout2(
        stagehand.observe(),
        AI_OPERATION_TIMEOUT_MS2,
        "Page observation"
      );
      const decisionResponse = await withTimeout2(
        anthropic.messages.create({
          model: apiModel,
          max_tokens: 300,
          system: buildDecisionPrompt(featureDescription, actionBudget - i, actionLog),
          messages: [
            {
              role: "user",
              content: `Current page URL: ${page.url()}

Visible interactive elements:
${formatObservations(observations)}

What single action should I perform next?`
            }
          ]
        }),
        AI_OPERATION_TIMEOUT_MS2,
        "Action decision"
      );
      const actionText = extractText(decisionResponse);
      totalInputTokens += decisionResponse.usage.input_tokens;
      totalOutputTokens += decisionResponse.usage.output_tokens;
      const normalizedAction = actionText.toLowerCase();
      if (normalizedAction.includes("[done]") || normalizedAction.includes("no more actions")) {
        break;
      }
      const screenshotBefore = await page.screenshot({ type: "png" });
      networkCapture.start();
      try {
        await stagehand.act(actionText);
      } catch (actError) {
        networkCapture.stop();
        // A failed action is itself a finding. If the follow-up screenshot also
        // fails, reuse the before-shot rather than aborting the exploration.
        const screenshotAfter2 = await page.screenshot({ type: "png" }).catch(() => screenshotBefore);
        const action2 = {
          actionNumber: i + 1,
          action: actionText,
          category: "broken_interaction",
          severity: "medium",
          confidence: 0.9,
          description: `Action failed: ${actError instanceof Error ? actError.message : String(actError)}`,
          screenshotBefore,
          screenshotAfter: screenshotAfter2,
          networkRequests: networkCapture.getRequests(),
          consoleLogs: [...consoleLogs],
          durationMs: Date.now() - actionStartTime
        };
        actions.push(action2);
        actionLog.push(`[${i + 1}] ${actionText} -> FAILED: ${action2.description}`);
        onActionComplete?.(action2, i);
        continue;
      }
      await page.waitForLoadState("networkidle").catch(() => {
      });
      await page.waitForTimeout(500);
      networkCapture.stop();
      const screenshotAfter = await page.screenshot({ type: "png" });
      const capturedRequests = networkCapture.getRequests();
      const networkErrors = networkCapture.getErrors();
      const evalResponse = await withTimeout2(
        anthropic.messages.create({
          model: apiModel,
          max_tokens: 400,
          system: buildEvaluationPrompt(),
          messages: [
            {
              role: "user",
              content: buildEvaluationContext(actionText, consoleLogs, networkErrors, page.url())
            }
          ]
        }),
        AI_OPERATION_TIMEOUT_MS2,
        "Action evaluation"
      );
      totalInputTokens += evalResponse.usage.input_tokens;
      totalOutputTokens += evalResponse.usage.output_tokens;
      const evaluation = parseEvaluation2(extractText(evalResponse));
      const action = {
        actionNumber: i + 1,
        action: actionText,
        category: evaluation.category,
        severity: evaluation.severity,
        confidence: evaluation.confidence,
        description: evaluation.description,
        screenshotBefore,
        screenshotAfter,
        networkRequests: capturedRequests,
        consoleLogs: [...consoleLogs],
        domContext: evaluation.domContext,
        durationMs: Date.now() - actionStartTime
      };
      actions.push(action);
      // The running log is fed back into the decision prompt on the next iteration.
      const logEntry = evaluation.category === "normal" ? `[${i + 1}] ${actionText} -> OK` : `[${i + 1}] ${actionText} -> FINDING (${evaluation.category}): ${evaluation.description}`;
      actionLog.push(logEntry);
      onActionComplete?.(action, i);
    }
    // Bundler form of a lazy import of the report generator module.
    const { generateExplorationReport: generateExplorationReport2 } = await Promise.resolve().then(() => (init_report_generator(), report_generator_exports));
    const report = await generateExplorationReport2(anthropic, {
      projectName: "",
      featureDescription,
      targetUrl,
      actions,
      model: apiModel
    });
    totalInputTokens += report.tokenUsage.inputTokens;
    totalOutputTokens += report.tokenUsage.outputTokens;
    const findings = actions.filter((a) => a.category !== "normal");
    return {
      overallResult: findings.length > 0 ? "findings" : "clean",
      actions,
      report: report.report,
      totalDurationMs: Date.now() - startTime,
      tokenUsage: { inputTokens: totalInputTokens, outputTokens: totalOutputTokens },
      browserSessionId: session.sessionId
    };
  } catch (error) {
    // Return whatever was gathered so far, with a placeholder report describing the failure.
    return {
      overallResult: "error",
      actions,
      report: {
        projectName: "",
        featureDescription,
        targetUrl,
        exploredAt: (/* @__PURE__ */ new Date()).toISOString(),
        duration: `${Math.round((Date.now() - startTime) / 1e3)}s`,
        actionsUsed: actions.length,
        actionBudget,
        findings: [],
        tested: [],
        notTested: [{ description: "Exploration aborted due to error", reason: String(error) }],
        summary: `Exploration failed after ${actions.length} actions: ${error instanceof Error ? error.message : String(error)}`,
        suggestedPrompt: ""
      },
      totalDurationMs: Date.now() - startTime,
      tokenUsage: { inputTokens: totalInputTokens, outputTokens: totalOutputTokens },
      browserSessionId: session.sessionId
    };
  } finally {
    if (session.page) {
      const rawPage = session.page;
      rawPage.removeAllListeners?.("console");
    }
    await session.close();
  }
}
|
|
1439
|
+
/**
 * Builds the system prompt that asks the model for the next exploratory action.
 *
 * @param {string} featureDescription - Feature under test, quoted in the first line.
 * @param {number} remainingBudget - Number of actions still available.
 * @param {string[]} actionLog - One entry per action already taken; when non-empty
 *   the entries are listed under "Actions already taken:".
 * @returns {string} The prompt text.
 */
function buildDecisionPrompt(featureDescription, remainingBudget, actionLog) {
  // Empty when nothing has been done yet; otherwise it opens with its own blank line.
  const history = actionLog.length > 0 ? `\nActions already taken:\n${actionLog.join("\n")}` : "";
  const promptLines = [
    `You are an exploratory QA tester examining the feature: "${featureDescription}".`,
    "Your goal is to find bugs by interacting with the page like a real user would.",
    "",
    "Strategy for choosing your next action:",
    "1. Try the happy path first (normal usage)",
    "2. Then try edge cases: empty inputs, very long text, special characters",
    "3. Click buttons and links to verify they work",
    "4. Submit forms with missing required fields",
    "5. Look for visual problems: overlapping text, broken layouts, missing images",
    "",
    `You have ${remainingBudget} actions left. Prioritize high-risk interactions.`,
    history,
    "",
    "DO NOT repeat an action you've already performed.",
    `Respond with a single action description. If there's nothing left to test, respond with "[DONE]".`
  ];
  return promptLines.join("\n");
}
|
|
1458
|
+
/**
 * Returns the fixed system prompt used when asking the model to categorise the
 * outcome of an action. It specifies the JSON reply shape and defines each category.
 *
 * @returns {string} The prompt text.
 */
function buildEvaluationPrompt() {
  const replyFormat = [
    "Respond in this exact JSON format:",
    "{",
    `"category": "normal" | "console_error" | "broken_interaction" | "visual_anomaly" | "input_handling",`,
    `"severity": "critical" | "high" | "medium" | "low",`,
    `"confidence": 0.0-1.0,`,
    `"description": "What happened",`,
    `"expectedBehavior": "What should have happened",`,
    `"domSelector": "CSS selector of the element involved (if applicable)"`,
    "}"
  ];
  const categoryDefinitions = [
    "Category definitions:",
    "- normal: Expected behavior, no issues found",
    "- console_error: JavaScript exception or failed network request (4xx/5xx)",
    "- broken_interaction: Action had no visible effect, button didn't respond, navigation failed",
    "- visual_anomaly: Layout break, text overflow, missing/broken images, overlapping elements",
    "- input_handling: Missing validation, accepted clearly invalid input, no error feedback"
  ];
  const closing = [
    `Only report genuine issues. If behavior seems correct, use "normal".`,
    `For "normal" results, severity and domSelector are not required.`
  ];
  return [
    "You are evaluating the result of a QA test action. Categorize what happened.",
    "",
    ...replyFormat,
    "",
    ...categoryDefinitions,
    "",
    ...closing
  ].join("\n");
}
|
|
1481
|
+
/**
 * Assembles the user message describing what happened after an action: the
 * action text and URL, followed by optional console-output and failed-request
 * sections (each included only when its list is non-empty).
 *
 * @param {string} action - Action text that was performed.
 * @param {Array<{level: string, text: string}>} consoleLogs - Console entries to list.
 * @param {Array<{method: string, url: string, status: number, statusText: string}>} networkErrors - Failed requests to list.
 * @param {string} currentUrl - URL of the page after the action.
 * @returns {string} The message text.
 */
function buildEvaluationContext(action, consoleLogs, networkErrors, currentUrl) {
  const sections = [`Action performed: "${action}"\nCurrent URL: ${currentUrl}\n`];
  if (consoleLogs.length > 0) {
    const renderedLogs = consoleLogs.map((entry) => `[${entry.level}] ${entry.text}`).join("\n");
    sections.push(`\nConsole output:\n${renderedLogs}\n`);
  }
  if (networkErrors.length > 0) {
    const renderedFailures = networkErrors.map((failure) => `${failure.method} ${failure.url} -> ${failure.status} ${failure.statusText}`).join("\n");
    sections.push(`\nFailed network requests:\n${renderedFailures}\n`);
  }
  return sections.join("");
}
|
|
1499
|
+
/**
 * Renders observations as a numbered, newline-separated list of the form
 * "N. [selector] description". Only the first 30 entries are listed.
 *
 * @param {Array<{selector: *, description: *}>} observations
 * @returns {string} The rendered list ("" for an empty input).
 */
function formatObservations(observations) {
  const MAX_LISTED = 30;
  const renderLine = (observation, index) => {
    const ordinal = index + 1;
    return `${ordinal}. [${observation.selector}] ${observation.description}`;
  };
  const listed = observations.slice(0, MAX_LISTED);
  return listed.map(renderLine).join("\n");
}
|
|
1502
|
+
/**
 * Returns the text of the first content block of a response.
 *
 * @param {{content: Array<{type: string, text?: string}>}} response
 * @returns {string} The first block's text when that block has type "text";
 *   "" when the first block has another type or the content list is empty.
 */
function extractText(response) {
  const block = response.content[0];
  // An empty content list yields no first block; report "no text" rather
  // than throwing on `undefined.type`.
  return block?.type === "text" ? block.text : "";
}
|
|
1506
|
+
/**
 * Parses the model's evaluation reply into a structured result.
 *
 * The first "{" through the last "}" of the reply is parsed as JSON. Missing
 * fields fall back to defaults: category "normal", confidence 0.5, and the
 * raw reply as description. Confidence is clamped to [0, 1], the range the
 * evaluation prompt requests.
 *
 * @param {string} text - Raw reply text.
 * @returns {{category: string, severity?: string, confidence: number,
 *   description: string, expectedBehavior?: string,
 *   domContext?: {selector: string, elementText: string, nearbyText: string}}}
 *   When no JSON can be extracted or parsed, a low-confidence (0.3)
 *   "normal" result carrying the raw reply as description.
 */
function parseEvaluation2(text) {
  try {
    const jsonMatch = text.match(/\{[\s\S]*\}/);
    if (!jsonMatch) throw new Error("No JSON found");
    const parsed = JSON.parse(jsonMatch[0]);
    const { confidence, domSelector } = parsed;
    return {
      category: parsed.category || "normal",
      severity: parsed.severity,
      // Keep out-of-range values (e.g. 1.7 or -2) inside [0, 1].
      confidence: typeof confidence === "number" && Number.isFinite(confidence) ? Math.max(0, Math.min(1, confidence)) : 0.5,
      description: parsed.description || text,
      expectedBehavior: parsed.expectedBehavior,
      domContext: domSelector ? { selector: domSelector, elementText: "", nearbyText: "" } : void 0
    };
  } catch {
    // Best effort by design: an unparseable reply becomes a low-confidence
    // "normal" result instead of failing the caller.
    return { category: "normal", confidence: 0.3, description: text };
  }
}
|
|
1523
|
+
|
|
1524
|
+
// src/index.ts
|
|
1525
|
+
init_report_generator();
|
|
1526
|
+
|
|
1527
|
+
// src/report-triager.ts
|
|
1528
|
+
// Model used for triage when the caller does not supply `input.model`.
var DEFAULT_MODEL5 = "claude-sonnet-4-20250514";
/**
 * Triage a bug report: build the triage prompt, send it as a single user
 * message through `input.anthropic.messages.create`, and parse the reply.
 *
 * @param {object} input
 * @param {object} input.anthropic - Client exposing `messages.create`.
 * @param {object} input.report - Report to triage.
 * @param {Array<object>} input.recentReports - Earlier reports passed to the
 *   prompt builder for duplicate detection.
 * @param {string} [input.model] - Model override; defaults to DEFAULT_MODEL5.
 * @returns {Promise<object>} Whatever parseTriageResult returns for the
 *   concatenated text blocks of the reply.
 */
async function triageReport(input) {
  const { report, recentReports } = input;
  const request = {
    model: input.model ?? DEFAULT_MODEL5,
    max_tokens: 1024,
    messages: [{ role: "user", content: buildTriagePrompt(report, recentReports) }]
  };
  const response = await input.anthropic.messages.create(request);
  // Concatenate every text block; other block types are ignored.
  const textParts = [];
  for (const block of response.content) {
    if (block.type === "text") textParts.push(block.text);
  }
  return parseTriageResult(textParts.join(""));
}
|
|
1541
|
+
/**
 * Builds the triage prompt for a bug report.
 *
 * The prompt contains one section per piece of available report data (title,
 * description, source, app context, console errors, network errors, device,
 * error fingerprint), an optional list of recent reports for duplicate
 * detection, and the fixed response-format and guidance text.
 *
 * @param {object} report - Report to triage. Reads `title`, `description`,
 *   `report_source`, `app_context`, `enhanced_context`, `device_info` and
 *   `error_fingerprint`; absent optional fields simply omit their section.
 * @param {Array<{id: *, title?: string, description?: string, error_fingerprint?: string}>} [recentReports]
 *   Earlier reports; a missing or empty list omits the "RECENT REPORTS" block.
 * @returns {string} The complete prompt text.
 */
function buildTriagePrompt(report, recentReports) {
  // Renders "Label: value" for every listed field that holds a truthy value.
  const labelPresent = (source, fields) => fields.filter(([key]) => source[key]).map(([key, label]) => `${label}: ${source[key]}`);
  const sections = [
    `REPORT TITLE: ${report.title ?? "(no title)"}`,
    // A missing description gets a placeholder instead of the literal "undefined".
    `DESCRIPTION: ${report.description ?? "(no description)"}`
  ];
  if (report.report_source) {
    sections.push(`SOURCE: ${report.report_source}`);
  }
  if (report.app_context) {
    const parts = labelPresent(report.app_context, [
      ["currentRoute", "Route"],
      ["currentUrl", "URL"],
      ["componentName", "Component"],
      ["userAction", "User action"]
    ]);
    if (parts.length > 0) {
      sections.push(`APP CONTEXT:
${parts.join("\n")}`);
    }
  }
  if (report.enhanced_context) {
    const { consoleLogs, networkErrors } = report.enhanced_context;
    if (consoleLogs && consoleLogs.length > 0) {
      // Only errors and warnings are relevant for triage; cap at 10 lines.
      const errors = consoleLogs.filter((l) => l.level === "error" || l.level === "warning").slice(0, 10).map((l) => `[${l.level}] ${l.text}`).join("\n");
      if (errors) {
        sections.push(`CONSOLE ERRORS:
${errors}`);
      }
    }
    if (networkErrors && networkErrors.length > 0) {
      const netErrors = networkErrors.slice(0, 10).map((e) => `${e.method} ${e.url} \u2192 ${e.status}`).join("\n");
      sections.push(`NETWORK ERRORS:
${netErrors}`);
    }
  }
  if (report.device_info) {
    const parts = labelPresent(report.device_info, [
      ["platform", "Platform"],
      ["browser", "Browser"],
      ["os", "OS"],
      ["screenSize", "Screen"]
    ]);
    if (parts.length > 0) {
      sections.push(`DEVICE:
${parts.join(", ")}`);
    }
  }
  if (report.error_fingerprint) {
    sections.push(`ERROR FINGERPRINT: ${report.error_fingerprint}`);
  }
  // Tolerate a missing list and reports without a description rather than
  // throwing while formatting the duplicate-detection block.
  const recent = recentReports ?? [];
  let recentSection = "";
  if (recent.length > 0) {
    const recentLines = recent.map((r) => {
      const desc = String(r.description ?? "").slice(0, 150);
      const fp = r.error_fingerprint ? ` [fingerprint: ${r.error_fingerprint}]` : "";
      return `- ID: ${r.id} | "${r.title ?? "(no title)"}" | ${desc}${fp}`;
    });
    recentSection = `
RECENT REPORTS (check for duplicates):
${recentLines.join("\n")}`;
  }
  return `You are a QA triage specialist. Analyze this bug report and provide structured triage.

${sections.join("\n\n")}
${recentSection}

Respond with ONLY a JSON object (no markdown, no explanation outside the JSON):
{
"suggested_severity": "critical" | "high" | "medium" | "low",
"severity_confidence": 0.0-1.0,
"suggested_category": "ui_ux" | "functional" | "crash" | "security" | "other",
"category_confidence": 0.0-1.0,
"root_cause_analysis": "Brief analysis of the likely root cause",
"duplicate_of": null or "uuid-of-matching-report",
"duplicate_confidence": 0.0-1.0,
"triage_notes": "Summary of triage reasoning"
}

Severity guide:
- critical: App crash, data loss, security vulnerability, blocks core workflow
- high: Major feature broken, significant UX degradation, affects many users
- medium: Feature partially broken, workaround exists, moderate impact
- low: Minor cosmetic issue, edge case, minimal user impact

Category guide:
- crash: App crashes, unhandled exceptions, white screen of death
- security: Auth bypass, data exposure, injection vulnerabilities
- functional: Feature doesn't work as expected, logic errors, broken flows
- ui_ux: Visual glitches, layout issues, confusing UX, accessibility problems
- other: Performance, documentation, configuration issues

Duplicate detection:
- Compare error fingerprints first (exact match = very high confidence)
- Then compare descriptions semantically (similar symptoms on same route/feature)
- Only flag as duplicate if confidence \u2265 0.80`;
}
|
|
1638
|
+
// Severity labels accepted from the triage reply.
var VALID_SEVERITIES = [
  "critical",
  "high",
  "medium",
  "low"
];
// Category labels accepted from the triage reply.
var VALID_CATEGORIES = [
  "ui_ux",
  "functional",
  "crash",
  "security",
  "other"
];
|
|
1640
|
+
/**
 * Turns the model's triage reply into a validated triage result.
 *
 * Two attempts are made: first the whole (trimmed) reply is parsed as JSON;
 * if that fails, the span from the first "{" to the last "}" that contains
 * both "suggested_severity" and "suggested_category" is parsed instead.
 *
 * @param {string} text - Raw reply text.
 * @returns {object} The validated result, or a low-confidence "medium" /
 *   "other" placeholder quoting the first 200 characters of the reply when
 *   neither attempt yields parseable JSON.
 */
function parseTriageResult(text) {
  // Parse-and-validate that reports failure as `undefined` instead of throwing.
  const tryValidate = (getRaw) => {
    try {
      return validateTriageResult(JSON.parse(getRaw()));
    } catch {
      // Deliberately ignored: the caller moves on to the next strategy.
      return void 0;
    }
  };
  const direct = tryValidate(() => text.trim());
  if (direct) return direct;
  const embedded = text.match(/\{[\s\S]*"suggested_severity"[\s\S]*"suggested_category"[\s\S]*\}/);
  if (embedded) {
    const recovered = tryValidate(() => embedded[0]);
    if (recovered) return recovered;
  }
  return {
    suggested_severity: "medium",
    severity_confidence: 0.3,
    suggested_category: "other",
    category_confidence: 0.3,
    root_cause_analysis: `Triage returned unparseable response: ${text.slice(0, 200)}`,
    duplicate_of: null,
    duplicate_confidence: 0,
    triage_notes: "Auto-triage failed to parse AI response"
  };
}
|
|
1665
|
+
/**
 * Normalizes a parsed triage reply into a complete triage result.
 *
 * Unknown severity/category labels fall back to "medium"/"other", the three
 * confidence fields go through clampConfidence, non-string text fields get a
 * placeholder, and a non-string `duplicate_of` becomes null.
 *
 * @param {object} parsed - Value obtained from JSON.parse of the reply.
 * @returns {object} Result with every triage field populated.
 */
function validateTriageResult(parsed) {
  const oneOf = (allowed, candidate, fallback) => allowed.includes(candidate) ? candidate : fallback;
  const textOr = (candidate, fallback) => typeof candidate === "string" ? candidate : fallback;
  return {
    suggested_severity: oneOf(VALID_SEVERITIES, parsed.suggested_severity, "medium"),
    severity_confidence: clampConfidence(parsed.severity_confidence),
    suggested_category: oneOf(VALID_CATEGORIES, parsed.suggested_category, "other"),
    category_confidence: clampConfidence(parsed.category_confidence),
    root_cause_analysis: textOr(parsed.root_cause_analysis, "No analysis provided"),
    duplicate_of: textOr(parsed.duplicate_of, null),
    duplicate_confidence: clampConfidence(parsed.duplicate_confidence),
    triage_notes: textOr(parsed.triage_notes, "No notes provided")
  };
}
|
|
1679
|
+
/**
 * Coerces a confidence value into the range [0, 1].
 *
 * @param {*} value - Candidate confidence.
 * @returns {number} 0.5 when `value` is not a number or is NaN; otherwise
 *   `value` limited to the closed interval [0, 1].
 */
function clampConfidence(value) {
  // NaN has typeof "number" and would survive Math.min/Math.max unchanged,
  // so it is treated like any other unusable value.
  if (typeof value !== "number" || Number.isNaN(value)) return 0.5;
  return Math.max(0, Math.min(1, value));
}
|
|
1683
|
+
|
|
1684
|
+
// src/failure-analyzer.ts
|
|
1685
|
+
// Model used for failure analysis when the caller does not supply `input.model`.
var DEFAULT_MODEL6 = "claude-sonnet-4-20250514";
/**
 * Classifies a failed test step by sending the before/after screenshots and
 * a textual failure description as one user message through
 * `input.anthropic.messages.create`, then parsing the reply.
 *
 * @param {object} input
 * @param {object} input.anthropic - Client exposing `messages.create`.
 * @param {object} input.step - The step that failed.
 * @param {object} input.result - Step result; `screenshotBefore` and
 *   `screenshotAfter` must support `toString("base64")` (presumably Node
 *   Buffers holding PNG data — confirm against the caller).
 * @param {object} [input.discoveredSelector] - Forwarded to the prompt builder.
 * @param {Array<object>} [input.consoleLogs] - Forwarded to the prompt builder.
 * @param {Array<object>} [input.networkErrors] - Forwarded to the prompt builder.
 * @param {string} [input.model] - Model override; defaults to DEFAULT_MODEL6.
 * @returns {Promise<object>} Whatever parseFailureAnalysis returns for the
 *   concatenated text blocks of the reply.
 */
async function analyzeFailure(input) {
  const { step, result, discoveredSelector, consoleLogs, networkErrors } = input;
  const model = input.model ?? DEFAULT_MODEL6;
  const caption = (text) => ({ type: "text", text });
  const pngImage = (screenshot) => ({
    type: "image",
    source: { type: "base64", media_type: "image/png", data: screenshot.toString("base64") }
  });
  // Order matters: each screenshot is preceded by the caption describing it.
  const content = [
    caption("BEFORE screenshot (page state before the failed action):"),
    pngImage(result.screenshotBefore),
    caption("AFTER screenshot (page state after the failed action):"),
    pngImage(result.screenshotAfter),
    caption(buildFailurePrompt(step, result, discoveredSelector, consoleLogs, networkErrors))
  ];
  const response = await input.anthropic.messages.create({
    model,
    max_tokens: 1024,
    messages: [{ role: "user", content }]
  });
  // Concatenate every text block; other block types are ignored.
  const textParts = [];
  for (const block of response.content) {
    if (block.type === "text") textParts.push(block.text);
  }
  return parseFailureAnalysis(textParts.join(""), step);
}
|
|
1709
|
+
// Maps a per-step failure classification to the run-level label.
var STEP_TO_RUN = {
  real_bug: "bug",
  test_maintenance: "test_issue",
  ai_limitation: "ai_limitation",
  flaky: "flaky",
  unknown: "unknown"
};
/**
 * Rolls the classifications of individual failed steps up into a single
 * run-level classification.
 *
 * Rules, in order: an empty list is "unknown"; any "real_bug" makes the run
 * a "bug"; otherwise the most frequent step classification wins (ties go to
 * the one that appears first) and is translated through STEP_TO_RUN.
 *
 * @param {string[]} stepClassifications - Per-step classifications.
 * @returns {string} One of the STEP_TO_RUN values; "unknown" when the
 *   winning classification is not a STEP_TO_RUN key.
 */
function rollupFailureClassification(stepClassifications) {
  if (stepClassifications.length === 0) return "unknown";
  if (stepClassifications.includes("real_bug")) return "bug";
  const counts = /* @__PURE__ */ new Map();
  for (const c of stepClassifications) {
    counts.set(c, (counts.get(c) ?? 0) + 1);
  }
  // Majority vote; a uniform list trivially elects its only member.
  let best = "unknown";
  let bestCount = 0;
  for (const [cls, count] of counts) {
    if (count > bestCount) {
      bestCount = count;
      best = cls;
    }
  }
  // Own-property lookup so an unrecognised label (or an inherited name such
  // as "toString") yields "unknown" instead of undefined or a prototype member.
  return Object.prototype.hasOwnProperty.call(STEP_TO_RUN, best) ? STEP_TO_RUN[best] : "unknown";
}
|
|
1736
|
+
/**
 * Builds the text prompt that asks the model to classify a failed test step.
 *
 * The prompt lists the known facts about the failure (step, expected and
 * actual result, plus selector, action type, error, discovered selector,
 * console errors/warnings and network errors when available), followed by
 * the classification definitions and the required JSON response format.
 *
 * @param {object} step - Reads `stepNumber`, `action`, `expectedResult`,
 *   `selector` and `actionType`.
 * @param {object} result - Reads `actualResult` and `error`.
 * @param {object} [discoveredSelector] - Reads `selector`, `strategy` and
 *   `textContent`.
 * @param {Array<{level: string, text: string}>} [consoleLogs] - Only entries
 *   with level "error" or "warning" are used, at most 8.
 * @param {Array<object>} [networkErrors] - At most 8 entries are used.
 * @returns {string} The complete prompt text.
 */
function buildFailurePrompt(step, result, discoveredSelector, consoleLogs, networkErrors) {
  const facts = [
    `FAILED STEP #${step.stepNumber}: ${step.action}`,
    `EXPECTED: ${step.expectedResult}`,
    `ACTUAL: ${result.actualResult}`
  ];
  // Appends a fact only when the value it is based on is present.
  const addFact = (present, render) => {
    if (present) facts.push(render());
  };
  addFact(step.selector, () => `SELECTOR USED: ${step.selector}`);
  addFact(step.actionType, () => `ACTION TYPE: ${step.actionType}`);
  addFact(result.error, () => `ERROR: ${result.error}`);
  addFact(discoveredSelector, () => {
    const quotedText = discoveredSelector.textContent ? ` \u2014 text: "${discoveredSelector.textContent}"` : "";
    return `DISCOVERED SELECTOR (what Stagehand actually clicked): ${discoveredSelector.selector} (via ${discoveredSelector.strategy})${quotedText}`;
  });
  if (consoleLogs && consoleLogs.length > 0) {
    const isProblem = (entry) => entry.level === "error" || entry.level === "warning";
    const problemLines = consoleLogs.filter(isProblem).slice(0, 8).map((entry) => `[${entry.level}] ${entry.text}`);
    const errors = problemLines.join("\n");
    addFact(errors, () => `CONSOLE ERRORS:
${errors}`);
  }
  if (networkErrors && networkErrors.length > 0) {
    const requestLines = networkErrors.slice(0, 8).map((req) => `${req.method} ${req.url} \u2192 ${req.status} ${req.statusText}`);
    facts.push(`NETWORK ERRORS:
${requestLines.join("\n")}`);
  }
  return `You are a QA failure analyst. A test step failed. Analyze the before/after screenshots and the context below to classify this failure.

${facts.join("\n\n")}

Classify into ONE of these categories:
- **real_bug**: The application has an actual defect. Indicators: API errors (4xx/5xx), JavaScript exceptions, missing/broken UI elements that SHOULD be there, incorrect behavior, data not saving.
- **test_maintenance**: The test is stale \u2014 the app changed but the test wasn't updated. Indicators: element moved/renamed, selector no longer matches, page restructured but app works correctly, the discovered selector differs from the test's selector.
- **ai_limitation**: The AI executor itself could not complete this step \u2014 NOT an app bug. Indicators: already logged in so can't reach the login page, a QA/testing widget or overlay appeared and blocked the real UI, the test requires measuring something the AI can't (contrast ratios, pixel measurements), the AI landed on a completely wrong page and never reached the test target, authentication redirect prevented navigation, a popup or modal unrelated to the test blocked interaction.
- **flaky**: Timing or intermittent issue. Indicators: timeout errors, "element not found" but the element IS visible in screenshots, network hiccup, race condition.
- **unknown**: Can't determine with confidence.

For **test_maintenance** failures, suggest a corrected step (selector, action, value).

Respond with ONLY a JSON object (no markdown, no explanation outside the JSON):
{
"classification": "real_bug" | "test_maintenance" | "ai_limitation" | "flaky" | "unknown",
"confidence": 0.0-1.0,
"reasoning": "Brief explanation of why this classification",
"suggested_fix": null | {
"corrected_action": "Updated natural language action (if changed)",
"corrected_selector": "Updated CSS selector (if selector changed)",
"corrected_actionType": "Updated action type (if changed)",
"corrected_value": "Updated value (if changed)"
}
}`;
}
|
|
1783
|
+
// Classification labels accepted from the failure-analysis reply.
var VALID_CLASSIFICATIONS = [
  "real_bug",
  "test_maintenance",
  "ai_limitation",
  "flaky",
  "unknown"
];
|
|
1784
|
+
/**
 * Turns the model's failure-analysis reply into a validated analysis.
 *
 * Two attempts are made: first the whole (trimmed) reply is parsed as JSON;
 * if that fails, the span from the first "{" to the last "}" that contains
 * both "classification" and "confidence" is parsed instead.
 *
 * @param {string} text - Raw reply text.
 * @param {object} step - The failed step, forwarded to the validator.
 * @returns {object} The validated analysis, or an "unknown" result with
 *   confidence 0.3 quoting the first 200 characters of the reply when
 *   neither attempt yields parseable JSON.
 */
function parseFailureAnalysis(text, step) {
  // Parse-and-validate that reports failure as `undefined` instead of throwing.
  const tryValidate = (getRaw) => {
    try {
      return validateFailureAnalysis(JSON.parse(getRaw()), step);
    } catch {
      // Deliberately ignored: the caller moves on to the next strategy.
      return void 0;
    }
  };
  const direct = tryValidate(() => text.trim());
  if (direct) return direct;
  const embedded = text.match(/\{[\s\S]*"classification"[\s\S]*"confidence"[\s\S]*\}/);
  if (embedded) {
    const recovered = tryValidate(() => embedded[0]);
    if (recovered) return recovered;
  }
  return {
    classification: "unknown",
    confidence: 0.3,
    reasoning: `Failure analysis returned unparseable response: ${text.slice(0, 200)}`
  };
}
|
|
1804
|
+
/**
 * Normalizes a parsed failure-analysis reply.
 *
 * An unrecognised classification becomes "unknown", confidence goes through
 * clampConfidence2, and non-string reasoning gets a placeholder. When the
 * reply carries an object-valued `suggested_fix`, it is copied over together
 * with the step's number and original action; non-string fix fields are
 * left undefined.
 *
 * @param {object} parsed - Value obtained from JSON.parse of the reply.
 * @param {object} step - The failed step; reads `stepNumber` and `action`.
 * @returns {object} Analysis with `classification`, `confidence`,
 *   `reasoning` and, when supplied, `suggested_fix`.
 */
function validateFailureAnalysis(parsed, step) {
  const textOr = (candidate, fallback) => typeof candidate === "string" ? candidate : fallback;
  const analysis = {
    classification: VALID_CLASSIFICATIONS.includes(parsed.classification) ? parsed.classification : "unknown",
    confidence: clampConfidence2(parsed.confidence),
    reasoning: textOr(parsed.reasoning, "No reasoning provided")
  };
  const proposedFix = parsed.suggested_fix;
  if (!proposedFix || typeof proposedFix !== "object") {
    return analysis;
  }
  analysis.suggested_fix = {
    stepNumber: step.stepNumber,
    original_action: step.action,
    corrected_action: textOr(proposedFix.corrected_action, void 0),
    corrected_selector: textOr(proposedFix.corrected_selector, void 0),
    corrected_actionType: textOr(proposedFix.corrected_actionType, void 0),
    corrected_value: textOr(proposedFix.corrected_value, void 0)
  };
  return analysis;
}
|
|
1824
|
+
/**
 * Coerces a confidence value into the range [0, 1].
 *
 * @param {*} value - Candidate confidence.
 * @returns {number} 0.5 when `value` is not a number or is NaN; otherwise
 *   `value` limited to the closed interval [0, 1].
 */
function clampConfidence2(value) {
  // NaN has typeof "number" and would survive Math.min/Math.max unchanged,
  // so it is treated like any other unusable value.
  if (typeof value !== "number" || Number.isNaN(value)) return 0.5;
  return Math.max(0, Math.min(1, value));
}
|
|
1828
|
+
|
|
1829
|
+
// src/concurrency.ts
|
|
1830
|
+
/**
 * Counting semaphore for limiting how many async tasks run at once.
 *
 * `acquire()` resolves immediately while fewer than `max` slots are in use;
 * otherwise the caller waits in a FIFO queue. `release()` hands the freed
 * slot directly to the oldest waiter, or frees it when nobody is waiting.
 */
var Semaphore = class {
  /**
   * @param {number} max - Maximum number of slots that may be held at once.
   * @throws {Error} When `max` is not a value >= 1 (including NaN/undefined,
   *   which would otherwise create a semaphore that never grants a slot).
   */
  constructor(max) {
    // `!(max >= 1)` also rejects NaN and undefined, which `max < 1` lets through.
    if (!(max >= 1)) throw new Error("Semaphore max must be >= 1");
    this.max = max;
    this.current = 0;
    this.queue = [];
  }
  /**
   * Takes a slot, waiting until one is released if all are in use.
   * @returns {Promise<void>} Resolves once the caller holds a slot.
   */
  async acquire() {
    if (this.current < this.max) {
      this.current++;
      return;
    }
    return new Promise((resolve) => {
      this.queue.push(resolve);
    });
  }
  /**
   * Gives a slot back. If a caller is waiting, the slot passes straight to
   * it (so the in-use count is unchanged); otherwise the count drops by one.
   * A release with no slot held is ignored.
   */
  release() {
    const next = this.queue.shift();
    if (next) {
      next();
    } else if (this.current > 0) {
      // Never go below zero: a stray release must not raise the limit.
      this.current--;
    }
  }
  /** Number of slots currently in use */
  get active() {
    return this.current;
  }
  /** Number of waiters in the queue */
  get waiting() {
    return this.queue.length;
  }
};
|
|
400
1863
|
// Annotate the CommonJS export names for ESM import in node:
|
|
401
1864
|
0 && (module.exports = {
|
|
1865
|
+
Semaphore,
|
|
1866
|
+
analyzeFailure,
|
|
1867
|
+
authenticateSupabase,
|
|
402
1868
|
createStagehandSession,
|
|
1869
|
+
discoverSelector,
|
|
1870
|
+
estimateBatchCost,
|
|
1871
|
+
estimateCost,
|
|
1872
|
+
estimateTestCost,
|
|
1873
|
+
evaluateStep,
|
|
1874
|
+
executeAction,
|
|
1875
|
+
generateExplorationReport,
|
|
403
1876
|
generateRunSummary,
|
|
1877
|
+
getTokenEstimate,
|
|
404
1878
|
injectAuth,
|
|
405
|
-
|
|
1879
|
+
injectSupabaseAuth,
|
|
1880
|
+
installClickTracker,
|
|
1881
|
+
performSupabaseAuth,
|
|
1882
|
+
rollupFailureClassification,
|
|
1883
|
+
runExploration,
|
|
1884
|
+
runTest,
|
|
1885
|
+
suppressBugBearWidget,
|
|
1886
|
+
triageReport,
|
|
1887
|
+
verifySupabaseSession
|
|
406
1888
|
});
|
|
407
1889
|
//# sourceMappingURL=index.js.map
|