@bbearai/ai-executor 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -28,10 +28,88 @@ var import_supabase_js = require("@supabase/supabase-js");
28
28
 
29
29
  // src/runner.ts
30
30
  var import_sdk = __toESM(require("@anthropic-ai/sdk"));
31
- var import_zod = require("zod");
32
31
 
33
32
  // src/browser.ts
34
33
  var import_stagehand = require("@browserbasehq/stagehand");
34
+
35
+ // src/supabase-auth.ts
36
/**
 * Derive the Supabase project ref from a project URL.
 *
 * The ref is the first dot-separated label of the URL's hostname
 * (for example "abc123" for "https://abc123.supabase.co").
 *
 * @param {string} supabaseUrl - Absolute URL of the Supabase project.
 * @returns {string} The first label of the hostname.
 * @throws {TypeError} If `supabaseUrl` cannot be parsed by `new URL()`.
 */
function extractProjectRef(supabaseUrl) {
  const { hostname } = new URL(supabaseUrl);
  const [firstLabel] = hostname.split(".");
  return firstLabel;
}
42
/**
 * Sign in against the Supabase auth REST endpoint with email and password.
 *
 * POSTs the credentials as JSON to `<supabaseUrl>/auth/v1/token?grant_type=password`
 * (a single trailing slash on `supabaseUrl` is dropped) with the anon key in
 * the `apikey` header.
 *
 * @param {{supabaseUrl: string, anonKey: string, email: string, password: string}} auth
 * @returns {Promise<object>} The parsed JSON session returned by the endpoint.
 * @throws {Error} On a non-OK HTTP status (message carries the status and up
 *   to 200 characters of the response body) or when the response has no
 *   `access_token`.
 */
async function authenticateSupabase(auth) {
  const baseUrl = auth.supabaseUrl.replace(/\/$/, "");
  const credentials = {
    email: auth.email,
    password: auth.password
  };
  const response = await fetch(`${baseUrl}/auth/v1/token?grant_type=password`, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      "apikey": auth.anonKey
    },
    body: JSON.stringify(credentials)
  });
  if (response.ok) {
    const session = await response.json();
    if (session.access_token) {
      return session;
    }
    throw new Error("Supabase auth returned no access_token");
  }
  // Reading the body is best-effort: it only enriches the error message.
  const detail = await response.text().catch(() => "");
  throw new Error(
    `Supabase auth failed (${response.status}): ${detail.slice(0, 200)}`
  );
}
67
/**
 * Write a Supabase session into the page's localStorage under the key
 * `sb-<project-ref>-auth-token`.
 *
 * If the page is still blank, it is first navigated to the Supabase URL so
 * that a document with localStorage exists. A navigation failure is ignored
 * on purpose (best-effort).
 *
 * @param {object} page - Browser page exposing `url()`, `goto()` and `evaluate()`.
 * @param {{supabaseUrl: string}} auth - Supabase connection settings.
 * @param {object} session - Session object as returned by the token endpoint.
 * @returns {Promise<void>}
 */
async function injectSupabaseAuth(page, auth, session) {
  const ref = extractProjectRef(auth.supabaseUrl);
  const storageKey = `sb-${ref}-auth-token`;
  // Token responses that only carry a relative `expires_in` would otherwise be
  // stored without an absolute expiry. Derive one (seconds since epoch).
  // NOTE(review): the Supabase client is understood to require `expires_at`
  // on a persisted session — confirm against the auth-js version in use.
  const expiresAt = session.expires_at ?? (Number.isFinite(session.expires_in) ? Math.floor(Date.now() / 1e3) + session.expires_in : void 0);
  const storageValue = JSON.stringify({
    access_token: session.access_token,
    refresh_token: session.refresh_token,
    expires_in: session.expires_in,
    expires_at: expiresAt,
    token_type: session.token_type,
    user: session.user
  });
  const currentUrl = page.url();
  if (currentUrl === "about:blank" || !currentUrl) {
    // NOTE(review): localStorage is scoped per origin. Navigating to the
    // Supabase URL stores the token on the Supabase origin, which is only
    // visible to the app under test if it shares that origin — confirm that
    // callers navigate to the app before invoking this.
    await page.goto(auth.supabaseUrl.replace(/\/$/, ""), {
      waitUntil: "domcontentloaded",
      timeoutMs: 1e4
    }).catch(() => {
    });
  }
  await page.evaluate(
    ({ key, value }) => {
      localStorage.setItem(key, value);
    },
    { key: storageKey, value: storageValue }
  );
}
93
/**
 * Check whether an access token is accepted by the Supabase auth service.
 *
 * Issues a request to `<supabaseUrl>/auth/v1/user` with the token as a Bearer
 * credential and the anon key in the `apikey` header.
 *
 * @param {{supabaseUrl: string, anonKey: string}} auth
 * @param {string} accessToken
 * @returns {Promise<boolean>} The response's `ok` flag.
 */
async function verifySupabaseSession(auth, accessToken) {
  const endpoint = `${auth.supabaseUrl.replace(/\/$/, "")}/auth/v1/user`;
  const headers = {
    "Authorization": `Bearer ${accessToken}`,
    "apikey": auth.anonKey
  };
  const { ok } = await fetch(endpoint, { headers });
  return ok;
}
103
/**
 * Run the full Supabase login flow for a page: authenticate, inject the
 * session into the page, then verify the access token.
 *
 * @param {object} page - Browser page that receives the session.
 * @param {object} auth - Supabase auth settings, passed through to the helpers.
 * @returns {Promise<void>}
 * @throws {Error} If the verification step reports the token as rejected, or
 *   if any of the underlying steps throws.
 */
async function performSupabaseAuth(page, auth) {
  const session = await authenticateSupabase(auth);
  await injectSupabaseAuth(page, auth, session);
  const accepted = await verifySupabaseSession(auth, session.access_token);
  if (accepted) {
    return;
  }
  throw new Error("Supabase auth verification failed \u2014 session token rejected");
}
111
+
112
+ // src/browser.ts
35
113
  var DEFAULT_MODEL = "anthropic/claude-sonnet-4-20250514";
36
114
  async function createStagehandSession(config, anthropicApiKey) {
37
115
  const modelName = config.model ?? DEFAULT_MODEL;
@@ -44,6 +122,11 @@ async function createStagehandSession(config, anthropicApiKey) {
44
122
  modelName,
45
123
  apiKey: anthropicApiKey
46
124
  },
125
+ // Bypass pino logger — its pino-pretty transport uses worker threads
126
+ // which fail in Vercel's serverless environment
127
+ logger: (msg) => {
128
+ if ((msg.level ?? 0) >= 40) console.warn("[Stagehand]", msg.message);
129
+ },
47
130
  localBrowserLaunchOptions: config.provider === "local" ? {
48
131
  headless: config.headless ?? true,
49
132
  viewport
@@ -67,6 +150,21 @@ async function createStagehandSession(config, anthropicApiKey) {
67
150
  }
68
151
  };
69
152
  }
153
/**
 * Register an init script on the Stagehand browser context that sets the
 * `__bugbear_suppress` flag on `window` and in localStorage.
 *
 * Entirely best-effort: a missing context, a missing `addInitScript`, or any
 * thrown error is ignored.
 *
 * @param {object} stagehand - Stagehand instance; `stagehand.context` is used when present.
 * @returns {Promise<void>}
 */
async function suppressBugBearWidget(stagehand) {
  // Kept self-contained because it is handed to addInitScript as-is.
  const setSuppressionFlags = () => {
    window.__bugbear_suppress = true;
    try {
      localStorage.setItem("__bugbear_suppress", "true");
    } catch {
    }
  };
  try {
    const browserContext = stagehand.context;
    if (!browserContext?.addInitScript) {
      return;
    }
    await browserContext.addInitScript(setSuppressionFlags);
  } catch {
  }
}
70
168
  async function injectAuth(page, auth, stagehand) {
71
169
  if (auth.type === "cookie") {
72
170
  for (const c of auth.cookies) {
@@ -92,23 +190,27 @@ async function injectAuth(page, auth, stagehand) {
92
190
  }, auth.items);
93
191
  } else if (auth.type === "form-login") {
94
192
  await performFormLogin(page, auth, stagehand);
193
+ } else if (auth.type === "supabase-native") {
194
+ await performSupabaseAuth(page, auth);
95
195
  }
96
196
  }
97
197
  async function performFormLogin(page, auth, stagehand) {
98
198
  await page.goto(auth.loginUrl, { waitUntil: "domcontentloaded" });
99
199
  await page.waitForLoadState("networkidle", 15e3).catch(() => {
100
200
  });
201
+ await fillLoginCredentials(page, auth);
101
202
  if (stagehand) {
102
203
  await stagehand.act(
103
- `Fill in the email/username field with "${auth.email}" and the password field with "${auth.password}", then click the login/sign-in button to submit the form.`
104
- );
204
+ "Click the login, sign-in, or submit button to submit the form."
205
+ ).catch(() => {
206
+ });
105
207
  } else {
106
- await manualFormLogin(page, auth);
208
+ await clickSubmitButton(page);
107
209
  }
108
210
  await page.waitForLoadState("networkidle", 15e3).catch(() => {
109
211
  });
110
212
  }
111
- async function manualFormLogin(page, auth) {
213
+ async function fillLoginCredentials(page, auth) {
112
214
  await page.waitForSelector(
113
215
  'input[type="email"], input[type="text"][name*="email"], input[name*="user"], input[type="text"]',
114
216
  { timeout: 15e3 }
@@ -142,6 +244,8 @@ async function manualFormLogin(page, auth) {
142
244
  } else {
143
245
  throw new Error("Could not find password input on login page");
144
246
  }
247
+ }
248
+ async function clickSubmitButton(page) {
145
249
  const submitSelectors = [
146
250
  'button[type="submit"]',
147
251
  'input[type="submit"]'
@@ -166,21 +270,23 @@ async function generateRunSummary(anthropic, testTitle, steps, model) {
166
270
  (s) => `Step ${s.stepNumber}: ${s.action}
167
271
  Expected: ${s.expectedResult}
168
272
  Actual: ${s.actualResult}
169
- Result: ${s.passed ? "PASS" : "FAIL"} (confidence: ${Math.round(s.confidence * 100)}%)${s.error ? `
273
+ Result: ${s.skipped ? "SKIPPED" : s.passed ? "PASS" : "FAIL"} (confidence: ${Math.round(s.confidence * 100)}%)${s.error ? `
170
274
  Error: ${s.error}` : ""}`
171
275
  ).join("\n\n");
172
- const passCount = steps.filter((s) => s.passed).length;
173
- const failCount = steps.filter((s) => !s.passed).length;
276
+ const passCount = steps.filter((s) => s.passed && !s.skipped).length;
277
+ const failCount = steps.filter((s) => !s.passed && !s.skipped).length;
278
+ const skipCount = steps.filter((s) => s.skipped).length;
279
+ const skipNote = skipCount > 0 ? " Some steps were skipped due to page state recovery \u2014 these are not failures, just steps that could not be executed." : "";
174
280
  const response = await anthropic.messages.create({
175
281
  model,
176
282
  max_tokens: 512,
177
283
  messages: [
178
284
  {
179
285
  role: "user",
180
- content: `Summarize this AI test execution in 2-3 sentences. Focus on what was tested, what passed, and what failed (if anything). Be concise and factual.
286
+ content: `Summarize this AI test execution in 2-3 sentences. Focus on what was tested, what passed, and what failed (if anything).${skipNote} Be concise and factual.
181
287
 
182
288
  Test: ${testTitle}
183
- Results: ${passCount} passed, ${failCount} failed out of ${steps.length} steps
289
+ Results: ${passCount} passed, ${failCount} failed, ${skipCount} skipped out of ${steps.length} steps
184
290
 
185
291
  ${stepsText}`
186
292
  }
@@ -189,7 +295,317 @@ ${stepsText}`
189
295
  return response.content.filter((block) => block.type === "text").map((block) => block.text).join("");
190
296
  }
191
297
 
298
+ // src/vision-evaluator.ts
299
// DEFAULT_MODEL2: model id used by evaluateStep when `input.model` is null/undefined.
//
// evaluateStep(input): asks the Anthropic Messages API whether a test step
// achieved its expected result, based on a BEFORE and an AFTER screenshot.
//
// Reads from `input`:
//   - anthropic        client whose `messages.create` is called
//   - model            optional model id (falls back to DEFAULT_MODEL2)
//   - screenshotBefore / screenshotAfter
//                      sent as base64 "image/png" image blocks via
//                      `.toString("base64")` (presumably Buffers — confirm
//                      against the caller)
//   - action, expectedResult
//                      interpolated into the instruction prompt
//   - evaluationHint   optional; when truthy an "EVALUATION HINT" line is
//                      appended after the expected result
//
// The request is a single user message (max_tokens 512) containing, in order:
// a label + the BEFORE image, a label + the AFTER image, and the instruction
// text that asks for a JSON object with `passed`, `confidence`, `actualResult`.
//
// Returns: the result of parseEvaluation() applied to the concatenated text
// blocks of the response. Errors from `messages.create` are not caught here.
+ var DEFAULT_MODEL2 = "claude-sonnet-4-20250514";
300
+ async function evaluateStep(input) {
301
+ const model = input.model ?? DEFAULT_MODEL2;
302
+ const hintClause = input.evaluationHint ? `
303
+ EVALUATION HINT: ${input.evaluationHint}` : "";
304
+ const response = await input.anthropic.messages.create({
305
+ model,
306
+ max_tokens: 512,
307
+ messages: [
308
+ {
309
+ role: "user",
310
+ content: [
311
+ {
312
+ type: "text",
313
+ text: "BEFORE screenshot (page state before the action):"
314
+ },
315
+ {
316
+ type: "image",
317
+ source: {
318
+ type: "base64",
319
+ media_type: "image/png",
320
+ data: input.screenshotBefore.toString("base64")
321
+ }
322
+ },
323
+ {
324
+ type: "text",
325
+ text: "AFTER screenshot (page state after the action):"
326
+ },
327
+ {
328
+ type: "image",
329
+ source: {
330
+ type: "base64",
331
+ media_type: "image/png",
332
+ data: input.screenshotAfter.toString("base64")
333
+ }
334
+ },
335
+ {
336
+ type: "text",
337
+ text: `You are a QA test evaluator. Compare the BEFORE and AFTER screenshots to evaluate this test step.
338
+
339
+ ACTION PERFORMED: ${input.action}
340
+ EXPECTED RESULT: ${input.expectedResult}${hintClause}
341
+
342
+ Analyze the visual differences between the two screenshots and determine if the expected result was achieved.
343
+
344
+ Respond with ONLY a JSON object (no markdown, no explanation outside the JSON):
345
+ {
346
+ "passed": true/false,
347
+ "confidence": 0.0-1.0,
348
+ "actualResult": "Brief description of what actually changed between the screenshots"
349
+ }
350
+
351
+ Confidence guide:
352
+ - 0.95-1.0: Clearly achieved/not achieved, obvious visual evidence
353
+ - 0.8-0.94: Very likely, strong visual indicators
354
+ - 0.6-0.79: Probable but some ambiguity
355
+ - Below 0.6: Uncertain, hard to tell from screenshots alone`
356
+ }
357
+ ]
358
+ }
359
+ ]
360
+ });
361
+ const text = response.content.filter((block) => block.type === "text").map((block) => block.text).join("");
362
+ return parseEvaluation(text);
363
+ }
364
/**
 * Turn the vision model's reply into a `{ passed, confidence, actualResult }`
 * record.
 *
 * The reply is first parsed as JSON as-is. If that fails, the text between the
 * first "{" and the last "}" is tried, which copes with markdown fences or
 * prose around the object. The keys may appear in any order. When nothing
 * yields a JSON object, a failing low-confidence result that quotes the first
 * 200 characters of the reply is returned.
 *
 * @param {string} text - Raw text returned by the model.
 * @returns {{passed: boolean, confidence: number, actualResult: string}}
 */
function parseEvaluation(text) {
  const raw = String(text ?? "");
  const trimmed = raw.trim();
  const candidates = [trimmed];
  const firstBrace = trimmed.indexOf("{");
  const lastBrace = trimmed.lastIndexOf("}");
  if (firstBrace !== -1 && lastBrace > firstBrace) {
    candidates.push(trimmed.slice(firstBrace, lastBrace + 1));
  }
  for (const candidate of candidates) {
    try {
      const parsed = JSON.parse(candidate);
      // Only a plain JSON object can be an evaluation; scalars/arrays are not.
      if (parsed !== null && typeof parsed === "object" && !Array.isArray(parsed)) {
        return validateEvaluation(parsed);
      }
    } catch {
      // Not valid JSON — fall through to the next candidate.
    }
  }
  return {
    passed: false,
    confidence: 0.3,
    actualResult: `Vision evaluation returned unparseable response: ${raw.slice(0, 200)}`
  };
}

/**
 * Coerce a parsed object into a well-typed evaluation record.
 *
 * - `passed` must be a boolean, otherwise it becomes `false`.
 * - `confidence` must be a finite number and is clamped to [0, 1]; otherwise 0.5.
 * - `actualResult` must be a string, otherwise "No description provided".
 *
 * @param {object} parsed - Candidate evaluation object.
 * @returns {{passed: boolean, confidence: number, actualResult: string}}
 */
function validateEvaluation(parsed) {
  return {
    passed: typeof parsed.passed === "boolean" ? parsed.passed : false,
    confidence: Number.isFinite(parsed.confidence) ? Math.max(0, Math.min(1, parsed.confidence)) : 0.5,
    actualResult: typeof parsed.actualResult === "string" ? parsed.actualResult : "No description provided"
  };
}
391
+
392
+ // src/action-executor.ts
393
/**
 * Execute one test step, preferring a deterministic selector-based action and
 * falling back to Stagehand.
 *
 * When the step carries both `selector` and `actionType`, the action is run
 * through `executePlaywrightAction`. If that throws, the step is retried via
 * `executeStagehandAction`. An error is reported only when the fallback fails
 * too, and the message then includes both failures. Steps without a selector
 * or action type go straight to Stagehand.
 *
 * @param {object} page - Browser page used for deterministic actions.
 * @param {object} stagehand - Stagehand instance used for AI-driven actions.
 * @param {object} step - Test step (`action`, optional `selector`, `actionType`, ...).
 * @returns {Promise<{deterministic: boolean, error?: string}>}
 */
async function executeAction(page, stagehand, step) {
  const hasDeterministicTarget = Boolean(step.selector && step.actionType);
  if (!hasDeterministicTarget) {
    return executeStagehandAction(stagehand, step);
  }
  try {
    await executePlaywrightAction(page, step);
  } catch (playwrightErr) {
    const { error: fallbackError } = await executeStagehandAction(stagehand, step);
    if (!fallbackError) {
      return { deterministic: false, error: void 0 };
    }
    const reason = playwrightErr instanceof Error ? playwrightErr.message : String(playwrightErr);
    return {
      deterministic: false,
      error: `Playwright failed (${reason}), Stagehand fallback also failed: ${fallbackError}`
    };
  }
  return { deterministic: true };
}
408
/**
 * Perform a deterministic, selector-based action on the page.
 *
 * Supported `step.actionType` values:
 *   - "click"    click the element located by `selector`
 *   - "fill"     fill the located element with `value` (empty string if unset)
 *   - "select"   set `.value` on the element and dispatch a bubbling "change"
 *   - "navigate" go to `value`, or `selector` if `value` is unset
 *   - "scroll"   scroll the element into view if it exists
 *   - "wait"     wait for `selector` (timeout `waitMs`, default 10 s), or
 *                otherwise just wait `waitMs`
 *   - "assert"   no-op
 *
 * After any action other than "wait", a truthy `step.waitMs` adds a trailing
 * pause of that many milliseconds.
 *
 * @param {object} page - Browser page to act on.
 * @param {{actionType: string, selector?: string, value?: string, waitMs?: number}} step
 * @returns {Promise<void>}
 * @throws {Error} For an unknown action type, a navigate step without a URL,
 *   or any failure raised by the page.
 */
async function executePlaywrightAction(page, step) {
  const { actionType, selector, value, waitMs } = step;
  const handlers = new Map([
    ["click", async () => {
      await page.locator(selector).click();
    }],
    ["fill", async () => {
      await page.locator(selector).fill(value ?? "");
    }],
    ["select", async () => {
      await page.evaluate(
        ({ sel, val }) => {
          const el = document.querySelector(sel);
          if (!el) throw new Error(`Select element not found: ${sel}`);
          el.value = val;
          el.dispatchEvent(new Event("change", { bubbles: true }));
        },
        { sel: selector, val: value ?? "" }
      );
    }],
    ["navigate", async () => {
      const url = value ?? selector ?? "";
      if (!url) throw new Error("Navigate action requires a value or selector with the URL");
      await page.goto(url, { waitUntil: "domcontentloaded", timeoutMs: 15e3 });
    }],
    ["scroll", async () => {
      await page.evaluate((sel) => {
        const el = document.querySelector(sel);
        if (el) el.scrollIntoView({ behavior: "smooth", block: "center" });
      }, selector);
    }],
    ["wait", async () => {
      if (selector) {
        await page.waitForSelector(selector, { timeout: waitMs ?? 1e4 });
      } else if (waitMs) {
        await page.waitForTimeout(waitMs);
      }
    }],
    // Assertions are judged elsewhere; nothing to do on the page.
    ["assert", async () => {
    }]
  ]);
  const run = handlers.get(actionType);
  if (!run) {
    throw new Error(`Unknown actionType: ${actionType}`);
  }
  await run();
  if (waitMs && actionType !== "wait") {
    await page.waitForTimeout(waitMs);
  }
}
465
/**
 * Run a step's natural-language action through Stagehand.
 *
 * Never throws: a failure of `stagehand.act` is reported through the `error`
 * field of the result instead.
 *
 * @param {object} stagehand - Stagehand instance exposing `act()`.
 * @param {{action: string}} step - Step whose `action` text is executed.
 * @returns {Promise<{deterministic: false, error?: string}>}
 */
async function executeStagehandAction(stagehand, step) {
  try {
    await stagehand.act(step.action);
  } catch (failure) {
    const message = failure instanceof Error ? failure.message : String(failure);
    return { deterministic: false, error: message };
  }
  return { deterministic: false };
}
476
+
477
+ // src/selector-discovery.ts
478
/**
 * Suggest a CSS selector for the element the last action interacted with.
 *
 * The element is `document.__bbLastClicked` (set by the click tracker) or,
 * failing that, `document.activeElement`. Selector strategies, in priority
 * order: test id attribute, stable `id`, `role` (optionally narrowed by
 * `aria-label` or `name`), `aria-label`, and finally a short tag/class path
 * of at most four levels.
 *
 * Emitted selectors always reference the attribute the value was actually read
 * from, and attribute values / identifiers are escaped so the result is a
 * valid selector.
 *
 * @param {object} page - Browser page exposing `evaluate()`.
 * @returns {Promise<{selector: string, strategy: string, suggestedActionType: (string|undefined), tagName: string, textContent: string} | null>}
 *   `null` when there is no usable element or evaluation fails.
 */
async function discoverSelector(page) {
  try {
    return await page.evaluate(() => {
      // Helpers are declared inside the callback so it stays self-contained
      // when handed to page.evaluate.
      const quoteAttr = (raw) => String(raw).replace(/\\/g, "\\\\").replace(/"/g, '\\"');
      const escapeIdent = (raw) => typeof CSS !== "undefined" && typeof CSS.escape === "function" ? CSS.escape(raw) : raw;
      const el = document.__bbLastClicked ?? document.activeElement;
      if (!el || el === document.body || el === document.documentElement) return null;
      const tagName = el.tagName?.toLowerCase() ?? "unknown";
      const textContent = (el.textContent ?? "").trim().slice(0, 100);
      const role = el.getAttribute("role");
      const ariaLabel = el.getAttribute("aria-label");
      // Remember WHICH test-id attribute is present so the selector uses it.
      const testIdAttr = ["data-testid", "data-test-id"].find((attr) => el.getAttribute(attr));
      let selector = "";
      let strategy = "css-path";
      if (testIdAttr) {
        selector = `[${testIdAttr}="${quoteAttr(el.getAttribute(testIdAttr))}"]`;
        strategy = "data-testid";
      } else if (el.id && !/^:r[0-9a-z]+:?$/.test(el.id) && !/^react-/.test(el.id)) {
        // Generated ids (":r1:", "react-…") are skipped by the guard above.
        selector = `#${escapeIdent(el.id)}`;
        strategy = "id";
      } else if (role) {
        const nameAttr = el.getAttribute("name");
        selector = `[role="${quoteAttr(role)}"]`;
        if (ariaLabel) {
          selector += `[aria-label="${quoteAttr(ariaLabel)}"]`;
        } else if (nameAttr) {
          selector += `[name="${quoteAttr(nameAttr)}"]`;
        }
        strategy = "role";
      } else if (ariaLabel) {
        selector = `[aria-label="${quoteAttr(ariaLabel)}"]`;
        strategy = "aria-label";
      } else {
        const parts = [];
        let current = el;
        while (current && current !== document.body) {
          let part = current.tagName.toLowerCase();
          if (current.className && typeof current.className === "string") {
            // Skip "_"-prefixed and very long class names.
            const classes = current.className.split(/\s+/).filter(
              (c) => c && !c.startsWith("_") && c.length < 30
            );
            if (classes.length > 0) {
              part += `.${escapeIdent(classes[0])}`;
            }
          }
          parts.unshift(part);
          current = current.parentElement;
          if (parts.length >= 4) break;
        }
        selector = parts.join(" > ");
        strategy = "css-path";
      }
      let suggestedActionType;
      if (tagName === "button" || tagName === "a" || role === "button") {
        suggestedActionType = "click";
      } else if (tagName === "input" || tagName === "textarea") {
        const type = el.getAttribute("type") ?? "text";
        suggestedActionType = type === "checkbox" || type === "radio" ? "click" : "fill";
      } else if (tagName === "select") {
        suggestedActionType = "select";
      }
      return { selector, strategy, suggestedActionType, tagName, textContent };
    });
  } catch {
    // Discovery is advisory; any failure simply yields no suggestion.
    return null;
  }
}
547
/**
 * Install a capturing click listener in the page that records the most
 * recently clicked element on `document.__bbLastClicked`.
 *
 * Best-effort: any error raised while evaluating in the page is ignored.
 *
 * @param {object} page - Browser page exposing `evaluate()`.
 * @returns {Promise<void>}
 */
async function installClickTracker(page) {
  // Self-contained so it can be handed to page.evaluate as-is.
  const attachListener = () => {
    const rememberTarget = (e) => {
      document.__bbLastClicked = e.target;
    };
    document.addEventListener("click", rememberTarget, { capture: true });
  };
  try {
    await page.evaluate(attachListener);
  } catch {
  }
}
557
+
558
+ // src/cost.ts
559
/**
 * Approximate token counts per kind of AI call, used only for estimating the
 * usage of a run.
 */
var TOKEN_PROFILE = {
  /** act() — screenshot + DOM context → action decision */
  actInput: 2e3,
  actOutput: 200,
  /** extract() — screenshot + extraction schema → structured result */
  extractInput: 3e3,
  extractOutput: 500,
  /** summary — all step results → narrative summary (once per run) */
  summaryInput: 2e3,
  summaryOutput: 500
};

/**
 * Estimate the token usage of a run with the given number of steps.
 *
 * Each step is charged one act and one extract call; the summary call is
 * charged once per run.
 *
 * @param {number} stepCount - Number of steps.
 * @returns {{inputTokens: number, outputTokens: number}}
 */
function getTokenEstimate(stepCount) {
  const {
    actInput,
    actOutput,
    extractInput,
    extractOutput,
    summaryInput,
    summaryOutput
  } = TOKEN_PROFILE;
  const perStepInput = actInput + extractInput;
  const perStepOutput = actOutput + extractOutput;
  return {
    inputTokens: stepCount * perStepInput + summaryInput,
    outputTokens: stepCount * perStepOutput + summaryOutput
  };
}
576
+
192
577
  // src/runner.ts
578
/** Upper bound, in milliseconds, for a single AI operation. */
var AI_OPERATION_TIMEOUT_MS = 3e4;
/** Number of retries per step when the run config does not specify one. */
var DEFAULT_MAX_RETRIES = 2;
/** Delay between retries, in milliseconds, when the run config does not specify one. */
var DEFAULT_RETRY_DELAY_MS = 2e3;

/**
 * Decide whether an error message looks transient (timeouts, connection
 * problems, a crashed or closed page/browser) and is therefore worth retrying.
 *
 * @param {string} error - Error message to classify.
 * @returns {boolean} `true` when the message matches any known transient pattern.
 */
function isRetryableError(error) {
  const transientSignatures = [
    /timed?\s*out/i,
    /ECONNREFUSED/i,
    /ECONNRESET/i,
    /ENOTFOUND/i,
    /net::ERR_/i,
    /navigation failed/i,
    /page crashed/i,
    /context was destroyed/i,
    /target closed/i,
    /session closed/i,
    /browser disconnected/i,
    /execution context/i
  ];
  for (const signature of transientSignatures) {
    if (signature.test(error)) {
      return true;
    }
  }
  return false;
}
598
/**
 * Await a promise, but give up after `timeoutMs` milliseconds.
 *
 * The timer is always cleared once the race settles, whichever side wins.
 *
 * @param {Promise<*>} promise - Operation to wait for.
 * @param {number} timeoutMs - Maximum time to wait, in milliseconds.
 * @param {string} operation - Label used in the timeout error message.
 * @returns {Promise<*>} The value `promise` resolves with.
 * @throws {Error} "<operation> timed out after <timeoutMs>ms" on timeout;
 *   otherwise whatever `promise` rejects with.
 */
async function withTimeout(promise, timeoutMs, operation) {
  let timer;
  const deadline = new Promise((_resolve, reject) => {
    timer = setTimeout(() => {
      reject(new Error(`${operation} timed out after ${timeoutMs}ms`));
    }, timeoutMs);
  });
  try {
    const winner = await Promise.race([promise, deadline]);
    return winner;
  } finally {
    clearTimeout(timer);
  }
}
193
609
  async function runTest(config) {
194
610
  const anthropic = new import_sdk.default({ apiKey: config.anthropicApiKey });
195
611
  const startTime = Date.now();
@@ -198,60 +614,71 @@ async function runTest(config) {
198
614
  headless: true
199
615
  };
200
616
  config.onStatusChange?.("initializing");
201
- const session = await createStagehandSession(browserConfig, config.anthropicApiKey);
202
- const { stagehand, page } = session;
617
+ let session;
203
618
  const stepResults = [];
204
619
  let pendingConsoleLogs = [];
205
620
  let pendingNetworkErrors = [];
206
621
  let stepStartTime = Date.now();
207
- const rawPage = page;
208
- rawPage.on("console", (msg) => {
209
- const level = msg.type?.() ?? msg.type ?? "log";
210
- const mappedLevel = level === "error" ? "error" : level === "warn" || level === "warning" ? "warning" : level === "info" ? "info" : level === "debug" ? "debug" : "log";
211
- pendingConsoleLogs.push({
212
- level: mappedLevel,
213
- text: (typeof msg.text === "function" ? msg.text() : String(msg.text ?? msg)).slice(0, 2e3),
214
- source: typeof msg.location === "function" ? msg.location()?.url : void 0,
215
- timestamp: Date.now() - stepStartTime
216
- });
217
- });
218
- rawPage.on("requestfailed", (req) => {
219
- const url = typeof req.url === "function" ? req.url() : String(req.url ?? "");
220
- const method = typeof req.method === "function" ? req.method() : String(req.method ?? "GET");
221
- const failure = typeof req.failure === "function" ? req.failure() : req.failure;
222
- pendingNetworkErrors.push({
223
- method,
224
- url: url.slice(0, 500),
225
- status: 0,
226
- statusText: failure?.errorText ?? "Request failed",
227
- timestamp: Date.now() - stepStartTime
228
- });
229
- });
230
- rawPage.on("response", (res) => {
231
- const status = typeof res.status === "function" ? res.status() : Number(res.status ?? 0);
232
- if (status >= 400) {
233
- const url = typeof res.url === "function" ? res.url() : String(res.url ?? "");
234
- const statusText = typeof res.statusText === "function" ? res.statusText() : String(res.statusText ?? "");
235
- const req = typeof res.request === "function" ? res.request() : res.request;
236
- const method = req ? typeof req.method === "function" ? req.method() : String(req.method ?? "GET") : "GET";
237
- pendingNetworkErrors.push({
238
- method,
239
- url: url.slice(0, 500),
240
- status,
241
- statusText,
242
- timestamp: Date.now() - stepStartTime
622
+ try {
623
+ session = await createStagehandSession(browserConfig, config.anthropicApiKey);
624
+ const { stagehand, page } = session;
625
+ await suppressBugBearWidget(stagehand);
626
+ const rawPage = page;
627
+ try {
628
+ rawPage.on("console", (msg) => {
629
+ const level = msg.type?.() ?? msg.type ?? "log";
630
+ const mappedLevel = level === "error" ? "error" : level === "warn" || level === "warning" ? "warning" : level === "info" ? "info" : level === "debug" ? "debug" : "log";
631
+ pendingConsoleLogs.push({
632
+ level: mappedLevel,
633
+ text: (typeof msg.text === "function" ? msg.text() : String(msg.text ?? msg)).slice(0, 2e3),
634
+ source: typeof msg.location === "function" ? msg.location()?.url : void 0,
635
+ timestamp: Date.now() - stepStartTime
636
+ });
243
637
  });
638
+ } catch {
244
639
  }
245
- });
246
- try {
247
- if (config.auth?.type === "form-login") {
640
+ try {
641
+ rawPage.on("requestfailed", (req) => {
642
+ const url = typeof req.url === "function" ? req.url() : String(req.url ?? "");
643
+ const method = typeof req.method === "function" ? req.method() : String(req.method ?? "GET");
644
+ const failure = typeof req.failure === "function" ? req.failure() : req.failure;
645
+ pendingNetworkErrors.push({
646
+ method,
647
+ url: url.slice(0, 500),
648
+ status: 0,
649
+ statusText: failure?.errorText ?? "Request failed",
650
+ timestamp: Date.now() - stepStartTime
651
+ });
652
+ });
653
+ } catch {
654
+ }
655
+ try {
656
+ rawPage.on("response", (res) => {
657
+ const status = typeof res.status === "function" ? res.status() : Number(res.status ?? 0);
658
+ if (status >= 400) {
659
+ const url = typeof res.url === "function" ? res.url() : String(res.url ?? "");
660
+ const statusText = typeof res.statusText === "function" ? res.statusText() : String(res.statusText ?? "");
661
+ const req = typeof res.request === "function" ? res.request() : res.request;
662
+ const method = req ? typeof req.method === "function" ? req.method() : String(req.method ?? "GET") : "GET";
663
+ pendingNetworkErrors.push({
664
+ method,
665
+ url: url.slice(0, 500),
666
+ status,
667
+ statusText,
668
+ timestamp: Date.now() - stepStartTime
669
+ });
670
+ }
671
+ });
672
+ } catch {
673
+ }
674
+ if (config.auth?.type === "form-login" || config.auth?.type === "supabase-native") {
248
675
  config.onStatusChange?.("authenticating");
249
676
  await injectAuth(page, config.auth, stagehand);
250
677
  }
251
678
  config.onStatusChange?.("navigating");
252
679
  const targetUrl = config.testCase.targetRoute ? `${config.targetUrl.replace(/\/$/, "")}${config.testCase.targetRoute}` : config.targetUrl;
253
680
  await page.goto(targetUrl, { waitUntil: "domcontentloaded", timeoutMs: 3e4 });
254
- if (config.auth && config.auth.type !== "form-login") {
681
+ if (config.auth && config.auth.type !== "form-login" && config.auth.type !== "supabase-native") {
255
682
  config.onStatusChange?.("authenticating");
256
683
  await injectAuth(page, config.auth, stagehand);
257
684
  if (config.auth.type === "localStorage") {
@@ -265,79 +692,143 @@ async function runTest(config) {
265
692
  }
266
693
  await page.waitForLoadState("networkidle").catch(() => {
267
694
  });
695
+ await page.evaluate(() => {
696
+ window.__bugbear_suppress = true;
697
+ try {
698
+ localStorage.setItem("__bugbear_suppress", "true");
699
+ } catch {
700
+ }
701
+ }).catch(() => {
702
+ });
703
+ await installClickTracker(page);
268
704
  pendingConsoleLogs = [];
269
705
  pendingNetworkErrors = [];
270
706
  config.onStatusChange?.("executing");
271
707
  const steps = config.testCase.steps;
708
+ const maxRetries = config.retry?.maxRetries ?? DEFAULT_MAX_RETRIES;
709
+ const retryDelayMs = config.retry?.retryDelayMs ?? DEFAULT_RETRY_DELAY_MS;
710
+ const resilientMode = config.resilientMode ?? true;
272
711
  for (let i = 0; i < steps.length; i++) {
273
712
  const step = steps[i];
274
- stepStartTime = Date.now();
275
- pendingConsoleLogs = [];
276
- pendingNetworkErrors = [];
277
- const screenshotBefore = await page.screenshot({ type: "png" });
278
- let error;
279
- let screenshotAfter = screenshotBefore;
280
- let actSucceeded = false;
281
- try {
282
- await stagehand.act(step.action);
283
- actSucceeded = true;
284
- await page.waitForLoadState("networkidle").catch(() => {
285
- });
286
- await page.waitForTimeout(500);
287
- screenshotAfter = await page.screenshot({ type: "png" });
288
- } catch (err) {
289
- error = err instanceof Error ? err.message : String(err);
713
+ const retryHistory = [];
714
+ let finalResult;
715
+ for (let attempt = 0; attempt <= maxRetries; attempt++) {
716
+ stepStartTime = Date.now();
717
+ pendingConsoleLogs = [];
718
+ pendingNetworkErrors = [];
719
+ const screenshotBefore = await page.screenshot({ type: "png" });
720
+ let error;
721
+ let screenshotAfter = screenshotBefore;
722
+ let actSucceeded = false;
723
+ const actionResult = await executeAction(page, stagehand, step);
724
+ error = actionResult.error;
725
+ actSucceeded = !error;
726
+ if (actSucceeded) {
727
+ await page.waitForLoadState("networkidle").catch(() => {
728
+ });
729
+ await page.waitForTimeout(step.waitMs ?? 500);
730
+ }
290
731
  screenshotAfter = await page.screenshot({ type: "png" }).catch(() => screenshotBefore);
732
+ let evaluation = {
733
+ passed: false,
734
+ confidence: 0,
735
+ actualResult: error ?? "Action execution failed"
736
+ };
737
+ if (actSucceeded) {
738
+ try {
739
+ const visionResult = await withTimeout(
740
+ evaluateStep({
741
+ anthropic,
742
+ screenshotBefore,
743
+ screenshotAfter,
744
+ action: step.action,
745
+ expectedResult: step.expectedResult,
746
+ evaluationHint: step.evaluationHint,
747
+ model: config.model
748
+ }),
749
+ AI_OPERATION_TIMEOUT_MS,
750
+ "Vision evaluation"
751
+ );
752
+ evaluation = {
753
+ passed: visionResult.passed,
754
+ confidence: visionResult.confidence,
755
+ actualResult: visionResult.actualResult
756
+ };
757
+ } catch (evalErr) {
758
+ evaluation = {
759
+ passed: false,
760
+ confidence: 0.2,
761
+ actualResult: `Vision evaluation error: ${evalErr instanceof Error ? evalErr.message : String(evalErr)}`
762
+ };
763
+ }
764
+ }
765
+ let discoveredActions = [];
766
+ if (actSucceeded && !actionResult.deterministic) {
767
+ const discovered = await discoverSelector(page);
768
+ if (discovered) {
769
+ discoveredActions = [{
770
+ type: discovered.suggestedActionType ?? "click",
771
+ selector: discovered.selector,
772
+ description: `Discovered via ${discovered.strategy}: ${discovered.tagName}${discovered.textContent ? ` "${discovered.textContent.slice(0, 50)}"` : ""}`
773
+ }];
774
+ }
775
+ }
776
+ const consoleLogs = pendingConsoleLogs.slice(0, 50);
777
+ const networkErrors = pendingNetworkErrors.slice(0, 30);
778
+ finalResult = {
779
+ stepNumber: step.stepNumber,
780
+ action: step.action,
781
+ expectedResult: step.expectedResult,
782
+ actualResult: evaluation.actualResult,
783
+ passed: evaluation.passed,
784
+ confidence: evaluation.confidence,
785
+ screenshotBefore,
786
+ screenshotAfter,
787
+ actionsTaken: discoveredActions,
788
+ error,
789
+ durationMs: Date.now() - stepStartTime,
790
+ consoleLogs,
791
+ networkErrors,
792
+ retryCount: attempt,
793
+ retryHistory,
794
+ skipped: false
795
+ };
796
+ const shouldRetry = !evaluation.passed && error && isRetryableError(error) && attempt < maxRetries;
797
+ if (!shouldRetry) break;
798
+ retryHistory.push({
799
+ attempt,
800
+ error,
801
+ confidence: evaluation.confidence,
802
+ timestamp: Date.now()
803
+ });
804
+ await new Promise((resolve) => setTimeout(resolve, retryDelayMs));
291
805
  }
292
- let evaluation = {
293
- passed: false,
294
- confidence: 0,
295
- actualResult: error ?? "Action execution failed"
296
- };
297
- if (actSucceeded) {
806
+ if (resilientMode && finalResult && !finalResult.passed) {
807
+ finalResult.skipped = true;
808
+ finalResult.skipReason = "Step failed, recovered page state";
298
809
  try {
299
- const verificationSchema = import_zod.z.object({
300
- passed: import_zod.z.boolean().describe("Whether the expected result was achieved"),
301
- confidence: import_zod.z.number().min(0).max(1).describe("Confidence in the assessment (0.9+ = very sure, 0.7-0.9 = likely, below 0.7 = uncertain)"),
302
- actualResult: import_zod.z.string().describe("Description of what actually happened on the page")
810
+ config.onStatusChange?.("navigating");
811
+ await page.goto(targetUrl, { waitUntil: "domcontentloaded", timeoutMs: 3e4 });
812
+ await page.waitForLoadState("networkidle").catch(() => {
303
813
  });
304
- const verification = await stagehand.extract(
305
- `You are evaluating a QA test step. The action "${step.action}" was just performed. Check if this expected result was achieved: "${step.expectedResult}". Look at the current page state and describe what actually happened. Be precise and factual in your assessment.`,
306
- verificationSchema
307
- );
308
- evaluation = {
309
- passed: verification.passed,
310
- confidence: verification.confidence,
311
- actualResult: verification.actualResult
312
- };
313
- } catch (evalErr) {
314
- evaluation = {
315
- passed: false,
316
- confidence: 0.2,
317
- actualResult: `Verification error: ${evalErr instanceof Error ? evalErr.message : String(evalErr)}`
318
- };
814
+ await installClickTracker(page);
815
+ await page.evaluate(() => {
816
+ window.__bugbear_suppress = true;
817
+ try {
818
+ localStorage.setItem("__bugbear_suppress", "true");
819
+ } catch {
820
+ }
821
+ }).catch(() => {
822
+ });
823
+ pendingConsoleLogs = [];
824
+ pendingNetworkErrors = [];
825
+ config.onStatusChange?.("executing");
826
+ } catch (recoveryErr) {
827
+ finalResult.skipReason = `Step failed, recovery also failed: ${recoveryErr instanceof Error ? recoveryErr.message : String(recoveryErr)}`;
319
828
  }
320
829
  }
321
- const consoleLogs = pendingConsoleLogs.slice(0, 50);
322
- const networkErrors = pendingNetworkErrors.slice(0, 30);
323
- const result = {
324
- stepNumber: step.stepNumber,
325
- action: step.action,
326
- expectedResult: step.expectedResult,
327
- actualResult: evaluation.actualResult,
328
- passed: evaluation.passed,
329
- confidence: evaluation.confidence,
330
- screenshotBefore,
331
- screenshotAfter,
332
- actionsTaken: [],
333
- // Stagehand handles actions internally
334
- error,
335
- durationMs: Date.now() - stepStartTime,
336
- consoleLogs,
337
- networkErrors
338
- };
339
- stepResults.push(result);
340
- config.onStepComplete?.(result, i, steps.length);
830
+ stepResults.push(finalResult);
831
+ config.onStepComplete?.(finalResult, i, steps.length);
341
832
  }
342
833
  config.onStatusChange?.("completed");
343
834
  const model = config.model ?? "claude-sonnet-4-20250514";
@@ -351,11 +842,7 @@ async function runTest(config) {
351
842
  totalDurationMs: Date.now() - startTime,
352
843
  summary,
353
844
  screenshotUrls: [],
354
- tokenUsage: {
355
- // Stagehand tracks tokens internally; these are approximate
356
- inputTokens: steps.length * 3e3,
357
- outputTokens: steps.length * 500
358
- },
845
+ tokenUsage: getTokenEstimate(steps.length),
359
846
  browserSessionId: session.sessionId
360
847
  };
361
848
  } catch (err) {
@@ -367,23 +854,29 @@ async function runTest(config) {
367
854
  totalDurationMs: Date.now() - startTime,
368
855
  summary: `Test execution failed: ${err instanceof Error ? err.message : String(err)}`,
369
856
  screenshotUrls: [],
370
- tokenUsage: {
371
- inputTokens: stepResults.length * 3e3,
372
- outputTokens: stepResults.length * 500
373
- },
374
- browserSessionId: session.sessionId
857
+ tokenUsage: getTokenEstimate(stepResults.length),
858
+ browserSessionId: session?.sessionId ?? "unknown"
375
859
  };
376
860
  } finally {
377
- await session.close();
861
+ if (session?.page) {
862
+ const rawPage = session.page;
863
+ rawPage.removeAllListeners?.("console");
864
+ rawPage.removeAllListeners?.("requestfailed");
865
+ rawPage.removeAllListeners?.("response");
866
+ }
867
+ await session?.close();
378
868
  }
379
869
  }
380
870
  function determineOverallResult(steps) {
381
871
  if (steps.length === 0) return "error";
382
- const allPassed = steps.every((s) => s.passed);
383
- const allFailed = steps.every((s) => !s.passed);
384
- const hasErrors = steps.some((s) => s.error);
385
- if (allPassed) return "passed";
386
- if (allFailed || hasErrors) return "failed";
872
+ const nonSkipped = steps.filter((s) => !s.skipped);
873
+ const skippedCount = steps.length - nonSkipped.length;
874
+ if (nonSkipped.length === 0) return "error";
875
+ const allNonSkippedPassed = nonSkipped.every((s) => s.passed);
876
+ const hasErrors = nonSkipped.some((s) => s.error);
877
+ if (skippedCount > 0 && allNonSkippedPassed) return "passed_with_skips";
878
+ if (allNonSkippedPassed) return "passed";
879
+ if (nonSkipped.every((s) => !s.passed) || hasErrors) return "failed";
387
880
  return "partial";
388
881
  }
389
882