@browserbasehq/stagehand 1.4.0 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1421 @@
1
+ import { Stagehand } from "../index";
2
+ import { LLMProvider } from "../llm/LLMProvider";
3
+ import { ScreenshotService } from "../vision";
4
+ import { verifyActCompletion, act, fillInVariables } from "../inference";
5
+ import { Locator, Page } from "@playwright/test";
6
+ import { ActionCache } from "../cache/ActionCache";
7
+ import { LLMClient, modelsWithVision } from "../llm/LLMClient";
8
+ import { generateId } from "../utils";
9
+ import { LogLine } from "../../types/log";
10
+ import {
11
+ PlaywrightCommandException,
12
+ PlaywrightCommandMethodNotSupportedException,
13
+ } from "../../types/playwright";
14
+
15
/**
 * Executes natural-language `act` requests against the Stagehand page.
 *
 * For each step it asks the LLM (via `act` from "../inference") which element
 * and Playwright method to use, performs that command, optionally records the
 * step in an `ActionCache`, and asks `verifyActCompletion` whether the overall
 * action is finished.
 */
+ export class StagehandActHandler {
16
// Owning Stagehand instance; used here for its `page`, `context` and `log`.
+ private readonly stagehand: Stagehand;
17
// Verbosity level forwarded to `ScreenshotService`.
+ private readonly verbose: 0 | 1 | 2;
18
// Passed to `verifyActCompletion` and used to clean the per-request cache.
+ private readonly llmProvider: LLMProvider;
19
// When true, steps are looked up in / written to `actionCache`.
+ private readonly enableCaching: boolean;
20
// Sink for structured log lines.
+ private readonly logger: (logLine: LogLine) => void;
21
// Awaited before and after actions so the DOM can settle.
+ private readonly waitForSettledDom: (
22
+ domSettleTimeoutMs?: number,
23
+ ) => Promise<void>;
24
// Created by the constructor only when `enableCaching` is true.
+ private readonly actionCache: ActionCache | undefined;
25
// Stored by the constructor; the methods visible in this file receive their
// LLM client as an argument instead of reading this field.
+ private readonly llmClient: LLMClient;
26
// Called at the start of every `act` pass.
+ private readonly startDomDebug: () => Promise<void>;
27
// Called once the LLM response for a pass has been received.
+ private readonly cleanupDomDebug: () => Promise<void>;
28
// Records written by `_recordAction`, keyed by the id from `generateId(action)`.
+ private actions: { [key: string]: { result: string; action: string } };
29
+
30
/**
 * Builds an act handler from its collaborators.
 *
 * Every option is stored unchanged on the instance. An `ActionCache`
 * (constructed with `logger`) is created only when `enableCaching` is true;
 * otherwise `actionCache` stays `undefined`. The record of performed actions
 * starts out empty.
 */
constructor(options: {
  stagehand: Stagehand;
  verbose: 0 | 1 | 2;
  llmProvider: LLMProvider;
  enableCaching: boolean;
  logger: (logLine: LogLine) => void;
  waitForSettledDom: (domSettleTimeoutMs?: number) => Promise<void>;
  llmClient: LLMClient;
  startDomDebug: () => Promise<void>;
  cleanupDomDebug: () => Promise<void>;
}) {
  // Core collaborators.
  this.stagehand = options.stagehand;
  this.verbose = options.verbose;
  this.llmProvider = options.llmProvider;
  this.llmClient = options.llmClient;
  this.logger = options.logger;

  // Page lifecycle hooks.
  this.waitForSettledDom = options.waitForSettledDom;
  this.startDomDebug = options.startDomDebug;
  this.cleanupDomDebug = options.cleanupDomDebug;

  // Caching is opt-in: without it no cache object exists at all.
  this.enableCaching = options.enableCaching;
  this.actionCache = options.enableCaching
    ? new ActionCache(options.logger)
    : undefined;

  this.actions = {};
}
63
+
64
/**
 * Stores the outcome of an action in the in-memory `actions` record.
 *
 * @param action - The action text; also the input to `generateId`.
 * @param result - The result text to store alongside the action.
 * @returns The id under which the entry was stored.
 */
private async _recordAction(action: string, result: string): Promise<string> {
  const actionId = generateId(action);
  const entry = { result, action };

  this.actions[actionId] = entry;
  return actionId;
}
71
+
72
/**
 * Asks the LLM verifier whether `action` has really been completed.
 *
 * The method always waits for the DOM to settle and takes one full-DOM
 * snapshot. The verifier itself only runs when `completed` is true (i.e. the
 * acting model claimed the action was done); otherwise `false` is returned
 * without any LLM call.
 *
 * The single snapshot provides both the selector map (needed for
 * screenshots) and the textual DOM (needed for DOM-based verification), so
 * the page is not processed a second time.
 *
 * @param completed - Whether the acting model reported the action as done.
 * @param verifierUseVision - Verify against a full-page screenshot instead of
 *   the textual DOM.
 * @param requestId - Id forwarded to the verifier.
 * @param action - The goal being verified.
 * @param steps - Textual history of the steps taken so far.
 * @param llmClient - Client used for the verification call.
 * @param domSettleTimeoutMs - Optional timeout forwarded to `waitForSettledDom`.
 * @returns `true` only when the verifier confirms completion.
 * @throws Whatever the second screenshot attempt or the verifier throws.
 */
private async _verifyActionCompletion({
  completed,
  verifierUseVision,
  requestId,
  action,
  steps,
  llmClient,
  domSettleTimeoutMs,
}: {
  completed: boolean;
  verifierUseVision: boolean;
  requestId: string;
  action: string;
  steps: string;
  llmClient: LLMClient;
  domSettleTimeoutMs?: number;
}): Promise<boolean> {
  await this.waitForSettledDom(domSettleTimeoutMs);

  const { selectorMap, outputString } = await this.stagehand.page.evaluate(
    () => {
      return window.processAllOfDom();
    },
  );

  if (!completed) {
    return false;
  }

  // Run action completion verifier
  this.stagehand.log({
    category: "action",
    message: "action marked as completed, verifying if this is true...",
    level: 1,
    auxiliary: {
      action: {
        value: action,
        type: "string",
      },
    },
  });

  let domElements: string | undefined = undefined;
  let fullpageScreenshot: Buffer | undefined = undefined;

  if (verifierUseVision) {
    // A fresh service is built for every attempt, as before.
    const takeFullPageScreenshot = () =>
      new ScreenshotService(
        this.stagehand.page,
        selectorMap,
        this.verbose,
        this.logger,
      ).getScreenshot(true, 15);

    try {
      fullpageScreenshot = await takeFullPageScreenshot();
    } catch (e) {
      const err = e instanceof Error ? e : new Error(String(e));
      this.stagehand.log({
        category: "action",
        message: "error getting full page screenshot. trying again...",
        level: 1,
        auxiliary: {
          error: {
            value: err.message,
            type: "string",
          },
          trace: {
            value: err.stack,
            type: "string",
          },
        },
      });

      // Exactly one retry; a second failure propagates to the caller.
      fullpageScreenshot = await takeFullPageScreenshot();
    }
  } else {
    domElements = outputString;
  }

  const actionCompleted = await verifyActCompletion({
    goal: action,
    steps,
    llmProvider: this.llmProvider,
    llmClient,
    screenshot: fullpageScreenshot,
    domElements,
    logger: this.logger,
    requestId,
  });

  this.stagehand.log({
    category: "action",
    message: "action completion verification result",
    level: 1,
    auxiliary: {
      action: {
        value: action,
        type: "string",
      },
      result: {
        value: actionCompleted.toString(),
        type: "boolean",
      },
    },
  });

  return actionCompleted;
}
187
+
188
/**
 * Executes one Playwright command against the first element matching `xpath`.
 *
 * Supported commands:
 * - `scrollIntoView`: best effort; a failure is logged and ignored.
 * - `fill` / `type`: clears the field, clicks it, then types `args[0]` one
 *   character at a time with a small random delay.
 * - `press`: presses the key named by `args[0]` on the page keyboard.
 * - any other name that is a function on the locator: called with `args`.
 *   After a `click`, a tab opened within 1.5 s is closed and its URL is loaded
 *   in the main page instead; the method then waits up to 5 s for network idle.
 *
 * Finally waits for the DOM to settle.
 *
 * @param method - Name of the command to perform.
 * @param args - Arguments for the command.
 * @param xpath - XPath of the target element.
 * @param domSettleTimeoutMs - Optional timeout forwarded to `waitForSettledDom`.
 * @throws PlaywrightCommandException when the command itself fails.
 * @throws PlaywrightCommandMethodNotSupportedException when `method` is not
 *   one of the special cases and not a function on the locator.
 */
private async _performPlaywrightMethod(
  method: string,
  args: string[],
  xpath: string,
  domSettleTimeoutMs?: number,
) {
  const locator = this.stagehand.page.locator(`xpath=${xpath}`).first();
  const initialUrl = this.stagehand.page.url();

  // Anything can be thrown; normalise so `.message` / `.stack` always exist.
  const toError = (thrown: unknown): Error =>
    thrown instanceof Error ? thrown : new Error(String(thrown));

  if (method === "scrollIntoView") {
    this.stagehand.log({
      category: "action",
      message: "scrolling element into view",
      level: 2,
      auxiliary: {
        xpath: {
          value: xpath,
          type: "string",
        },
      },
    });

    // Scrolling is best effort: a failure is logged but never aborts the step.
    try {
      await locator.evaluate((element: Element) => {
        element.scrollIntoView({ behavior: "smooth", block: "center" });
      });
    } catch (e) {
      const err = toError(e);
      this.stagehand.log({
        category: "action",
        message: "error scrolling element into view",
        level: 1,
        auxiliary: {
          error: {
            value: err.message,
            type: "string",
          },
          trace: {
            value: err.stack,
            type: "string",
          },
          xpath: {
            value: xpath,
            type: "string",
          },
        },
      });
    }
  } else if (method === "fill" || method === "type") {
    try {
      await locator.fill("");
      await locator.click();
      const text = args[0];
      for (const char of text) {
        await this.stagehand.page.keyboard.type(char, {
          delay: Math.random() * 50 + 25,
        });
      }
    } catch (e) {
      const err = toError(e);
      this.logger({
        category: "action",
        message: "error filling element",
        level: 1,
        auxiliary: {
          error: {
            value: err.message,
            type: "string",
          },
          trace: {
            value: err.stack,
            type: "string",
          },
          xpath: {
            value: xpath,
            type: "string",
          },
        },
      });

      throw new PlaywrightCommandException(err.message);
    }
  } else if (method === "press") {
    try {
      const key = args[0];
      await this.stagehand.page.keyboard.press(key);
    } catch (e) {
      const err = toError(e);
      this.logger({
        category: "action",
        message: "error pressing key",
        level: 1,
        auxiliary: {
          error: {
            value: err.message,
            type: "string",
          },
          trace: {
            value: err.stack,
            type: "string",
          },
          key: {
            value: args[0]?.toString() ?? "unknown",
            type: "string",
          },
        },
      });

      throw new PlaywrightCommandException(err.message);
    }
  } else if (typeof locator[method as keyof typeof locator] === "function") {
    // Log current URL before action
    this.logger({
      category: "action",
      message: "page URL before action",
      level: 2,
      auxiliary: {
        url: {
          value: this.stagehand.page.url(),
          type: "string",
        },
      },
    });

    // Perform the action
    try {
      // `method` is only known at runtime, so the call cannot be type-checked.
      // @ts-ignore
      await locator[method](...args);
    } catch (e) {
      const err = toError(e);
      this.logger({
        category: "action",
        message: "error performing method",
        level: 1,
        auxiliary: {
          error: {
            value: err.message,
            type: "string",
          },
          trace: {
            value: err.stack,
            type: "string",
          },
          xpath: {
            value: xpath,
            type: "string",
          },
          method: {
            value: method,
            type: "string",
          },
          args: {
            value: JSON.stringify(args),
            type: "object",
          },
        },
      });

      throw new PlaywrightCommandException(err.message);
    }

    // Handle navigation if a new page is opened
    if (method === "click") {
      this.logger({
        category: "action",
        message: "clicking element, checking for page navigation",
        level: 1,
        auxiliary: {
          xpath: {
            value: xpath,
            type: "string",
          },
        },
      });

      // Wait up to 1.5 s for the click to open a new tab. Whichever side wins
      // tears the other down, so neither a stale "page" listener nor a live
      // timer is left behind after each click.
      // NAVIDNOTE: Should this happen before we wait for locator[method]?
      const newOpenedTab = await new Promise<Page | null>((resolve) => {
        const context = this.stagehand.context;
        const onPage = (page: Page) => {
          clearTimeout(timer);
          resolve(page);
        };
        const timer = setTimeout(() => {
          context.off("page", onPage);
          resolve(null);
        }, 1_500);
        context.once("page", onPage);
      });

      this.logger({
        category: "action",
        message: "clicked element",
        level: 1,
        auxiliary: {
          newOpenedTab: {
            value: newOpenedTab ? "opened a new tab" : "no new tabs opened",
            type: "string",
          },
        },
      });

      if (newOpenedTab) {
        // Read the URL while the tab is still open, then fold the navigation
        // back into the main page.
        const newTabUrl = newOpenedTab.url();
        this.logger({
          category: "action",
          message: "new page detected (new tab) with URL",
          level: 1,
          auxiliary: {
            url: {
              value: newTabUrl,
              type: "string",
            },
          },
        });
        await newOpenedTab.close();
        await this.stagehand.page.goto(newTabUrl);
        await this.stagehand.page.waitForLoadState("domcontentloaded");
        await this.waitForSettledDom(domSettleTimeoutMs);
      }

      // Wait for the network to be idle with timeout of 5s (will only wait if
      // loading a new page). Using Playwright's own timeout means the timeout
      // is actually reported and no stray timer outlives the wait.
      try {
        await this.stagehand.page.waitForLoadState("networkidle", {
          timeout: 5_000,
        });
      } catch {
        this.logger({
          category: "action",
          message: "network idle timeout hit",
          level: 1,
        });
      }

      this.logger({
        category: "action",
        message: "finished waiting for (possible) page navigation",
        level: 1,
      });

      if (this.stagehand.page.url() !== initialUrl) {
        this.logger({
          category: "action",
          message: "new page detected with URL",
          level: 1,
          auxiliary: {
            url: {
              value: this.stagehand.page.url(),
              type: "string",
            },
          },
        });
      }
    }
  } else {
    this.logger({
      category: "action",
      message: "chosen method is invalid",
      level: 1,
      auxiliary: {
        method: {
          value: method,
          type: "string",
        },
      },
    });

    throw new PlaywrightCommandMethodNotSupportedException(
      `Method ${method} not supported`,
    );
  }

  await this.waitForSettledDom(domSettleTimeoutMs);
}
472
+
473
/**
 * Produces a normalised HTML fingerprint of the element behind `locator`.
 *
 * The element is deep-cloned inside the page, every attribute of the clone's
 * root outside a small allow-list is removed, and the clone's outer HTML is
 * returned trimmed with each whitespace run collapsed to one space.
 *
 * @param locator - Locator of the element to fingerprint.
 * @returns The normalised outer HTML string.
 */
private async _getComponentString(locator: Locator) {
  return await locator.evaluate((el) => {
    // This callback runs in the page, so everything it needs is declared here.
    const stableAttributes = [
      "type",
      "name",
      "placeholder",
      "aria-label",
      "role",
      "href",
      "title",
      "alt",
    ];

    // Work on a copy so the live element is never touched.
    const copy = el.cloneNode(true) as HTMLElement;

    // Snapshot the attribute list first: it shrinks while we remove entries.
    for (const attribute of Array.from(copy.attributes)) {
      const isStable = stableAttributes.includes(attribute.name);
      if (!isStable) {
        copy.removeAttribute(attribute.name);
      }
    }

    // NOTE: a disabled draft that swapped variable values for variable names
    // used to sit here; the markup is returned without such substitution.
    const markup = copy.outerHTML;
    return markup.trim().replace(/\s+/g, " ");
  });
}
542
+
543
/**
 * Resolves the first element matching `xpath` once it is attached to the DOM.
 *
 * @param xpath - XPath expression identifying the element.
 * @param timeout - Maximum time in milliseconds to wait for attachment.
 * @returns The locator, or `null` (after logging) when the wait fails.
 */
private async getElement(
  xpath: string,
  timeout: number = 5_000,
): Promise<Locator | null> {
  try {
    const candidate = this.stagehand.page.locator(`xpath=${xpath}`).first();
    const waitOptions = { state: "attached" as const, timeout };
    await candidate.waitFor(waitOptions);
    return candidate;
  } catch {
    // Any failure is reported to the caller as "no element".
    const timeoutText = timeout.toString();
    this.logger({
      category: "action",
      message: "element not found within timeout",
      level: 1,
      auxiliary: {
        xpath: { value: xpath, type: "string" },
        timeout_ms: { value: timeoutText, type: "integer" },
      },
    });
    return null;
  }
}
570
+
571
/**
 * Checks whether a cached step still points at the same element.
 *
 * The element at `cachedStep.xpath` is looked up, its component string is
 * computed once, and that string is compared (after whitespace
 * normalisation) with the string saved when the step was cached.
 *
 * @param cachedStep - The XPath to test and the component string saved for it.
 * @returns `true` when the element exists and its component string matches;
 *   `false` otherwise, including when any error occurs.
 */
private async _checkIfCachedStepIsValid_oneXpath(cachedStep: {
  xpath: string;
  savedComponentString: string;
}): Promise<boolean> {
  this.logger({
    category: "action",
    message: "checking if cached step is valid",
    level: 1,
    auxiliary: {
      xpath: {
        value: cachedStep.xpath,
        type: "string",
      },
      savedComponentString: {
        value: cachedStep.savedComponentString,
        type: "string",
      },
    },
  });
  try {
    const locator = await this.getElement(cachedStep.xpath);
    if (!locator) {
      this.logger({
        category: "action",
        message: "locator not found for xpath",
        level: 1,
        auxiliary: {
          xpath: {
            value: cachedStep.xpath,
            type: "string",
          },
        },
      });
      return false;
    }

    // One page round trip: the same snapshot is logged and compared.
    const currentComponent = await this._getComponentString(locator);

    this.logger({
      category: "action",
      message: "locator element",
      level: 1,
      auxiliary: {
        componentString: {
          value: currentComponent,
          type: "string",
        },
      },
    });

    this.logger({
      category: "action",
      message: "current text",
      level: 1,
      auxiliary: {
        componentString: {
          value: currentComponent,
          type: "string",
        },
      },
    });

    if (!currentComponent || !cachedStep.savedComponentString) {
      this.logger({
        category: "action",
        message: "current text or cached text is undefined",
        level: 1,
      });
      return false;
    }

    // Normalize whitespace and trim both strings before comparing
    const normalizedCurrentText = currentComponent
      .trim()
      .replace(/\s+/g, " ");
    const normalizedCachedText = cachedStep.savedComponentString
      .trim()
      .replace(/\s+/g, " ");

    if (normalizedCurrentText !== normalizedCachedText) {
      this.logger({
        category: "action",
        message: "current text and cached text do not match",
        level: 1,
        auxiliary: {
          currentText: {
            value: normalizedCurrentText,
            type: "string",
          },
          cachedText: {
            value: normalizedCachedText,
            type: "string",
          },
        },
      });
      return false;
    }

    return true;
  } catch (e) {
    const err = e instanceof Error ? e : new Error(String(e));
    this.logger({
      category: "action",
      message: "error checking if cached step is valid",
      level: 1,
      auxiliary: {
        error: {
          value: err.message,
          type: "string",
        },
        trace: {
          value: err.stack,
          type: "string",
        },
      },
    });
    // An error means the step cannot be trusted.
    return false;
  }
}
690
+
691
/**
 * Finds the first XPath of a cached step that still resolves to the cached
 * element.
 *
 * Candidates are tried one at a time from the END of `xpaths` towards the
 * start (the most cachable ones are tried first), stopping at the first
 * valid one.
 *
 * @param cachedStep - Candidate XPaths and the saved component string.
 * @returns The first valid XPath, or `null` when none validates.
 */
private async _getValidCachedStepXpath(cachedStep: {
  xpaths: string[];
  savedComponentString: string;
}) {
  const { xpaths, savedComponentString } = cachedStep;

  for (let position = xpaths.length - 1; position >= 0; position -= 1) {
    const candidate = xpaths[position];
    const stillValid = await this._checkIfCachedStepIsValid_oneXpath({
      xpath: candidate,
      savedComponentString,
    });

    if (stillValid) {
      return candidate;
    }
  }

  return null;
}
708
+
709
/**
 * Replays a cached step for `action`, if a usable one exists.
 *
 * Flow:
 * 1. Look the step up by URL, action, previous selectors and request id.
 * 2. Validate that one of its XPaths still matches the cached element;
 *    otherwise remove the step from the cache.
 * 3. Fill the variables into a copy of the cached arguments and perform the
 *    cached Playwright command.
 * 4. If the cached step was marked as completing the action, verify that;
 *    on success return the final result.
 * 5. Otherwise continue with `act` for the next step.
 *
 * @returns The result of the action, or `null` when caching is disabled, no
 *   step is cached, the cached step is no longer valid, or replaying it
 *   failed (in which case the step is removed from the cache).
 */
private async _runCachedActionIfAvailable({
  action,
  previousSelectors,
  requestId,
  steps,
  chunksSeen,
  llmClient,
  useVision,
  verifierUseVision,
  retries,
  variables,
  domSettleTimeoutMs,
}: {
  action: string;
  previousSelectors: string[];
  requestId: string;
  steps: string;
  chunksSeen: number[];
  llmClient: LLMClient;
  useVision: boolean | "fallback";
  verifierUseVision: boolean;
  retries: number;
  variables: Record<string, string>;
  domSettleTimeoutMs?: number;
}) {
  if (!this.enableCaching) {
    return null;
  }

  const cacheObj = {
    url: this.stagehand.page.url(),
    action,
    previousSelectors,
    requestId,
  };

  this.logger({
    category: "action",
    message: "checking action cache",
    level: 1,
    auxiliary: {
      cacheObj: {
        value: JSON.stringify(cacheObj),
        type: "object",
      },
    },
  });

  const cachedStep = await this.actionCache?.getActionStep(cacheObj);

  if (!cachedStep) {
    this.logger({
      category: "action",
      message: "action cache miss",
      level: 1,
      auxiliary: {
        cacheObj: {
          value: JSON.stringify(cacheObj),
          type: "object",
        },
      },
    });
    return null;
  }

  this.logger({
    category: "action",
    message: "action cache semi-hit",
    level: 1,
    auxiliary: {
      playwrightCommand: {
        value: JSON.stringify(cachedStep.playwrightCommand),
        type: "object",
      },
    },
  });

  try {
    const validXpath = await this._getValidCachedStepXpath({
      xpaths: cachedStep.xpaths,
      savedComponentString: cachedStep.componentString,
    });

    if (!validXpath) {
      this.logger({
        category: "action",
        message: "cached action step is invalid, removing...",
        level: 1,
        auxiliary: {
          cacheObj: {
            value: JSON.stringify(cacheObj),
            type: "object",
          },
        },
      });

      await this.actionCache?.removeActionStep(cacheObj);
      return null;
    }

    // Only reported once the step has actually been validated.
    this.logger({
      category: "action",
      message: "cached action step is valid",
      level: 1,
      auxiliary: {
        validXpath: {
          value: validXpath,
          type: "string",
        },
      },
    });

    this.logger({
      category: "action",
      message: "action cache hit",
      level: 1,
      auxiliary: {
        playwrightCommand: {
          value: JSON.stringify(cachedStep.playwrightCommand),
          type: "object",
        },
      },
    });

    // Substitute variables into a copy so the cached step keeps its
    // placeholders and never carries concrete variable values.
    const filledArgs = cachedStep.playwrightCommand.args.map((arg) => {
      return fillInVariables(arg, variables);
    });

    await this._performPlaywrightMethod(
      cachedStep.playwrightCommand.method,
      filledArgs,
      validXpath,
      domSettleTimeoutMs,
    );

    const updatedSteps = steps + cachedStep.newStepString;

    // NOTE(review): the result of this DOM pass was never used here; the call
    // is kept in case `processDom` has page-side effects later steps rely on
    // — confirm and drop if not.
    await this.stagehand.page.evaluate(
      ({ chunksSeen }: { chunksSeen: number[] }) => {
        // @ts-ignore
        return window.processDom(chunksSeen);
      },
      { chunksSeen },
    );

    if (cachedStep.completed) {
      // Verify the action was completed successfully
      const actionCompleted = await this._verifyActionCompletion({
        completed: true,
        verifierUseVision,
        llmClient,
        steps: updatedSteps,
        requestId,
        action,
        domSettleTimeoutMs,
      });

      this.logger({
        category: "action",
        message: "action completion verification result from cache",
        level: 1,
        auxiliary: {
          actionCompleted: {
            value: actionCompleted.toString(),
            type: "boolean",
          },
        },
      });

      if (actionCompleted) {
        return {
          success: true,
          message: "action completed successfully using cached step",
          action,
        };
      }
    }

    // Not finished yet: carry on with the next step.
    return this.act({
      action,
      steps: updatedSteps,
      chunksSeen,
      llmClient,
      useVision,
      verifierUseVision,
      retries,
      requestId,
      variables,
      previousSelectors: [...previousSelectors, cachedStep.xpaths[0]],
      skipActionCacheForThisStep: false,
      domSettleTimeoutMs,
    });
  } catch (exception) {
    const err =
      exception instanceof Error ? exception : new Error(String(exception));
    this.logger({
      category: "action",
      message: "error performing cached action step",
      level: 1,
      auxiliary: {
        error: {
          value: err.message,
          type: "string",
        },
        trace: {
          value: err.stack,
          type: "string",
        },
      },
    });

    // A step that cannot be replayed is dropped from the cache.
    await this.actionCache?.removeActionStep(cacheObj);
    return null;
  }
}
923
+
924
/**
 * Performs `action` on the current page, one step per pass, calling itself
 * until the action is verified as complete or cannot be continued.
 *
 * One pass:
 * 1. Wait for the DOM to settle and start DOM debugging.
 * 2. If caching is enabled and not skipped for this step, try to replay a
 *    cached step; on a miss, run the same pass again with the cache skipped.
 * 3. Turn vision off when the model does not support it.
 * 4. Process the next DOM chunk (plus an annotated screenshot when
 *    `useVision` is `true`) and ask the LLM for a command.
 * 5. With no command: move to the next chunk, or switch to vision when
 *    `useVision` is `"fallback"`, or give up.
 * 6. With a command: fill in variables, perform it, record the step (and
 *    cache it when enabled), then verify completion and either finish or
 *    continue with the next step.
 *
 * A failure while performing a command is retried up to two more times.
 *
 * @param action - Natural-language description of what to do.
 * @param steps - Textual history of the steps taken so far.
 * @param chunksSeen - DOM chunks already inspected; extended in place.
 * @param llmClient - Client used for the LLM calls.
 * @param useVision - `true`, `false`, or `"fallback"` (use vision only after
 *   the DOM-only search found nothing).
 * @param verifierUseVision - Whether completion is verified with a screenshot.
 * @param retries - Number of retries already used for the current step.
 * @param requestId - Id tying LLM and cache entries to this request.
 * @param variables - Values substituted into the command arguments.
 * @param previousSelectors - XPaths acted on earlier in this action.
 * @param skipActionCacheForThisStep - Bypass the cache lookup for this pass.
 * @param domSettleTimeoutMs - Optional timeout forwarded to `waitForSettledDom`.
 * @returns The outcome; errors are reported through `success: false` rather
 *   than thrown.
 */
public async act({
  action,
  steps = "",
  chunksSeen,
  llmClient,
  useVision,
  verifierUseVision,
  retries = 0,
  requestId,
  variables,
  previousSelectors,
  skipActionCacheForThisStep = false,
  domSettleTimeoutMs,
}: {
  action: string;
  steps?: string;
  chunksSeen: number[];
  llmClient: LLMClient;
  useVision: boolean | "fallback";
  verifierUseVision: boolean;
  retries?: number;
  requestId?: string;
  variables: Record<string, string>;
  previousSelectors: string[];
  skipActionCacheForThisStep: boolean;
  domSettleTimeoutMs?: number;
}): Promise<{ success: boolean; message: string; action: string }> {
  // Anything can be thrown; normalise so `.message` / `.stack` always exist.
  const toError = (thrown: unknown): Error =>
    thrown instanceof Error ? thrown : new Error(String(thrown));

  try {
    await this.waitForSettledDom(domSettleTimeoutMs);
    await this.startDomDebug();

    if (this.enableCaching && !skipActionCacheForThisStep) {
      const response = await this._runCachedActionIfAvailable({
        action,
        previousSelectors,
        requestId,
        steps,
        chunksSeen,
        llmClient,
        useVision,
        verifierUseVision,
        retries,
        variables,
        domSettleTimeoutMs,
      });

      if (response !== null) {
        return response;
      } else {
        // Cache miss (or unusable entry): run this step again without it.
        return this.act({
          action,
          steps,
          chunksSeen,
          llmClient,
          useVision,
          verifierUseVision,
          retries,
          requestId,
          variables,
          previousSelectors,
          skipActionCacheForThisStep: true,
          domSettleTimeoutMs,
        });
      }
    }

    if (!llmClient.hasVision && (useVision !== false || verifierUseVision)) {
      this.logger({
        category: "action",
        message:
          "model does not support vision but useVision was not false. defaulting to false.",
        level: 1,
        auxiliary: {
          model: {
            value: llmClient.modelName,
            type: "string",
          },
          useVision: {
            value: useVision.toString(),
            type: "boolean",
          },
        },
      });
      useVision = false;
      verifierUseVision = false;
    }

    this.logger({
      category: "action",
      message: "running / continuing action",
      level: 2,
      auxiliary: {
        action: {
          value: action,
          type: "string",
        },
        pageUrl: {
          value: this.stagehand.page.url(),
          type: "string",
        },
      },
    });

    this.logger({
      category: "action",
      message: "processing DOM",
      level: 2,
    });

    const { outputString, selectorMap, chunk, chunks } =
      await this.stagehand.page.evaluate(
        ({ chunksSeen }: { chunksSeen: number[] }) => {
          // @ts-ignore
          return window.processDom(chunksSeen);
        },
        { chunksSeen },
      );

    this.logger({
      category: "action",
      message: "looking at chunk",
      level: 1,
      auxiliary: {
        chunk: {
          value: chunk.toString(),
          type: "integer",
        },
        chunks: {
          value: chunks.length.toString(),
          type: "integer",
        },
        chunksSeen: {
          value: chunksSeen.length.toString(),
          type: "integer",
        },
        chunksLeft: {
          value: (chunks.length - chunksSeen.length).toString(),
          type: "integer",
        },
      },
    });

    // Prepare annotated screenshot if vision is enabled. `useVision` can only
    // still be `true` here when the model supports vision (see the downgrade
    // above), so no further capability check is needed.
    let annotatedScreenshot: Buffer | undefined;
    if (useVision === true) {
      const screenshotService = new ScreenshotService(
        this.stagehand.page,
        selectorMap,
        this.verbose,
        this.logger,
      );

      annotatedScreenshot =
        await screenshotService.getAnnotatedScreenshot(false);
    }

    const response = await act({
      action,
      domElements: outputString,
      steps,
      llmClient,
      screenshot: annotatedScreenshot,
      logger: this.logger,
      requestId,
      variables,
    });

    this.logger({
      category: "action",
      message: "received response from LLM",
      level: 1,
      auxiliary: {
        response: {
          value: JSON.stringify(response),
          type: "object",
        },
      },
    });

    await this.cleanupDomDebug();

    if (!response) {
      if (chunksSeen.length + 1 < chunks.length) {
        // More of the page is left: mark this chunk as seen and look further.
        chunksSeen.push(chunk);

        this.logger({
          category: "action",
          message: "no action found in current chunk",
          level: 1,
          auxiliary: {
            chunksSeen: {
              value: chunksSeen.length.toString(),
              type: "integer",
            },
          },
        });

        return this.act({
          action,
          steps:
            steps +
            (!steps.endsWith("\n") ? "\n" : "") +
            "## Step: Scrolled to another section\n",
          chunksSeen,
          llmClient,
          useVision,
          verifierUseVision,
          requestId,
          variables,
          previousSelectors,
          skipActionCacheForThisStep,
          domSettleTimeoutMs,
        });
      } else if (useVision === "fallback") {
        this.logger({
          category: "action",
          message: "switching to vision-based processing",
          level: 1,
          auxiliary: {
            useVision: {
              value: useVision.toString(),
              type: "string",
            },
          },
        });
        await this.stagehand.page.evaluate(() => window.scrollToHeight(0));
        return await this.act({
          action,
          steps,
          chunksSeen,
          llmClient,
          useVision: true,
          verifierUseVision,
          requestId,
          variables,
          previousSelectors,
          skipActionCacheForThisStep,
          domSettleTimeoutMs,
        });
      } else {
        if (this.enableCaching) {
          this.llmProvider.cleanRequestCache(requestId);
          this.actionCache?.deleteCacheForRequestId(requestId);
        }

        return {
          success: false,
          message: `Action was not able to be completed.`,
          action: action,
        };
      }
    }

    // Action found, proceed to execute
    const elementId = response["element"];
    const xpaths = selectorMap[elementId];
    const method = response["method"];
    const args = response["args"];

    // Get the element text from the outputString: everything after the FIRST
    // colon of the "<id>:<text>" line, so colons inside the text (for example
    // in URLs) no longer cut it short.
    const elementLine = outputString
      .split("\n")
      .find((line) => line.startsWith(`${elementId}:`));
    const elementText =
      (elementLine ? elementLine.slice(elementLine.indexOf(":") + 1) : "") ||
      "Element not found";

    this.logger({
      category: "action",
      message: "executing method",
      level: 1,
      auxiliary: {
        method: {
          value: method,
          type: "string",
        },
        elementId: {
          value: elementId.toString(),
          type: "integer",
        },
        xpaths: {
          value: JSON.stringify(xpaths),
          type: "object",
        },
        args: {
          value: JSON.stringify(args),
          type: "object",
        },
      },
    });

    try {
      const initialUrl = this.stagehand.page.url();
      const locator = this.stagehand.page
        .locator(`xpath=${xpaths[0]}`)
        .first();
      const componentString = await this._getComponentString(locator);

      // The cache keeps the arguments as the LLM returned them (placeholders
      // intact); only the copy that is executed gets the variables filled in.
      const responseArgs = [...args];
      const filledArgs = responseArgs.map((arg) =>
        variables && typeof arg === "string"
          ? fillInVariables(arg, variables)
          : arg,
      );

      await this._performPlaywrightMethod(
        method,
        filledArgs,
        xpaths[0],
        domSettleTimeoutMs,
      );

      const newStepString =
        (!steps.endsWith("\n") ? "\n" : "") +
        `## Step: ${response.step}\n` +
        ` Element: ${elementText}\n` +
        ` Action: ${response.method}\n` +
        ` Reasoning: ${response.why}\n`;

      steps += newStepString;

      if (this.enableCaching) {
        // Fire and forget: a cache write failure is logged, never fatal.
        void this.actionCache
          .addActionStep({
            action,
            url: initialUrl,
            previousSelectors,
            playwrightCommand: {
              method,
              args: responseArgs,
            },
            componentString,
            requestId,
            xpaths: xpaths,
            newStepString,
            completed: response.completed,
          })
          .catch((e) => {
            const err = toError(e);
            this.logger({
              category: "action",
              message: "error adding action step to cache",
              level: 1,
              auxiliary: {
                error: {
                  value: err.message,
                  type: "string",
                },
                trace: {
                  value: err.stack,
                  type: "string",
                },
              },
            });
          });
      }

      if (this.stagehand.page.url() !== initialUrl) {
        steps += ` Result (Important): Page URL changed from ${initialUrl} to ${this.stagehand.page.url()}\n\n`;
      }

      const actionCompleted = await this._verifyActionCompletion({
        completed: response.completed,
        verifierUseVision,
        requestId,
        action,
        steps,
        llmClient,
        domSettleTimeoutMs,
      });

      if (!actionCompleted) {
        this.logger({
          category: "action",
          message: "continuing to next action step",
          level: 1,
        });

        return this.act({
          action,
          steps,
          llmClient,
          chunksSeen,
          useVision,
          verifierUseVision,
          requestId,
          variables,
          previousSelectors: [...previousSelectors, xpaths[0]],
          skipActionCacheForThisStep: false,
          domSettleTimeoutMs,
        });
      } else {
        this.logger({
          category: "action",
          message: "action completed successfully",
          level: 1,
        });
        await this._recordAction(action, response.step);
        return {
          success: true,
          message: `Action completed successfully: ${steps}${response.step}`,
          action: action,
        };
      }
    } catch (error) {
      const err = toError(error);
      this.logger({
        category: "action",
        message: "error performing action - d",
        level: 1,
        auxiliary: {
          error: {
            value: err.message,
            type: "string",
          },
          trace: {
            value: err.stack,
            type: "string",
          },
          retries: {
            value: retries.toString(),
            type: "integer",
          },
        },
      });

      if (retries < 2) {
        return this.act({
          action,
          steps,
          llmClient,
          useVision,
          verifierUseVision,
          retries: retries + 1,
          chunksSeen,
          requestId,
          variables,
          previousSelectors,
          skipActionCacheForThisStep,
          domSettleTimeoutMs,
        });
      }

      // Out of retries: record the failure and drop this request's caches.
      await this._recordAction(action, "");
      if (this.enableCaching) {
        this.llmProvider.cleanRequestCache(requestId);
        this.actionCache.deleteCacheForRequestId(requestId);
      }

      return {
        success: false,
        message: "error performing action - a",
        action: action,
      };
    }
  } catch (error) {
    const err = toError(error);
    this.logger({
      category: "action",
      message: "error performing action - b",
      level: 1,
      auxiliary: {
        error: {
          value: err.message,
          type: "string",
        },
        trace: {
          value: err.stack,
          type: "string",
        },
      },
    });

    if (this.enableCaching) {
      this.llmProvider.cleanRequestCache(requestId);
      this.actionCache.deleteCacheForRequestId(requestId);
    }

    return {
      success: false,
      message: `Error performing action - C: ${err.message}`,
      action: action,
    };
  }
}
1421
+ }