camel-ai 0.2.73a11__py3-none-any.whl → 0.2.74__py3-none-any.whl

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries and as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.


This version of camel-ai might be problematic. Click here for more details.

Files changed (28)
  1. camel/__init__.py +1 -1
  2. camel/interpreters/e2b_interpreter.py +34 -1
  3. camel/models/anthropic_model.py +5 -3
  4. camel/societies/workforce/prompts.py +3 -19
  5. camel/societies/workforce/workforce.py +13 -8
  6. camel/toolkits/hybrid_browser_toolkit/config_loader.py +3 -0
  7. camel/toolkits/hybrid_browser_toolkit/hybrid_browser_toolkit_ts.py +225 -0
  8. camel/toolkits/hybrid_browser_toolkit/ts/src/browser-session.ts +164 -8
  9. camel/toolkits/hybrid_browser_toolkit/ts/src/config-loader.ts +2 -0
  10. camel/toolkits/hybrid_browser_toolkit/ts/src/hybrid-browser-toolkit.ts +106 -1
  11. camel/toolkits/hybrid_browser_toolkit/ts/src/types.ts +19 -1
  12. camel/toolkits/hybrid_browser_toolkit/ts/websocket-server.js +20 -0
  13. camel/toolkits/hybrid_browser_toolkit/ws_wrapper.py +41 -0
  14. camel/toolkits/hybrid_browser_toolkit_py/actions.py +158 -0
  15. camel/toolkits/hybrid_browser_toolkit_py/browser_session.py +55 -8
  16. camel/toolkits/hybrid_browser_toolkit_py/config_loader.py +43 -0
  17. camel/toolkits/hybrid_browser_toolkit_py/hybrid_browser_toolkit.py +312 -3
  18. camel/toolkits/hybrid_browser_toolkit_py/snapshot.py +10 -4
  19. camel/toolkits/hybrid_browser_toolkit_py/unified_analyzer.js +45 -4
  20. camel/toolkits/note_taking_toolkit.py +3 -4
  21. camel/toolkits/search_toolkit.py +192 -59
  22. camel/toolkits/terminal_toolkit.py +12 -2
  23. camel/types/enums.py +3 -0
  24. camel/utils/token_counting.py +13 -2
  25. {camel_ai-0.2.73a11.dist-info → camel_ai-0.2.74.dist-info}/METADATA +37 -4
  26. {camel_ai-0.2.73a11.dist-info → camel_ai-0.2.74.dist-info}/RECORD +28 -28
  27. {camel_ai-0.2.73a11.dist-info → camel_ai-0.2.74.dist-info}/WHEEL +0 -0
  28. {camel_ai-0.2.73a11.dist-info → camel_ai-0.2.74.dist-info}/licenses/LICENSE +0 -0
@@ -1,4 +1,4 @@
1
- import { Page, Browser, BrowserContext, chromium } from 'playwright';
1
+ import { Page, Browser, BrowserContext, chromium, ConsoleMessage } from 'playwright';
2
2
  import { BrowserToolkitConfig, SnapshotResult, SnapshotElement, ActionResult, TabInfo, BrowserAction, DetailedTiming } from './types';
3
3
  import { ConfigLoader, StealthConfig } from './config-loader';
4
4
 
@@ -6,18 +6,43 @@ export class HybridBrowserSession {
6
6
  private browser: Browser | null = null;
7
7
  private context: BrowserContext | null = null;
8
8
  private pages: Map<string, Page> = new Map();
9
+ private consoleLogs: Map<string, ConsoleMessage[]> = new Map();
9
10
  private currentTabId: string | null = null;
10
11
  private tabCounter = 0;
11
12
  private configLoader: ConfigLoader;
12
13
  private scrollPosition: { x: number; y: number } = {x: 0, y: 0};
13
14
  private hasNavigatedBefore = false; // Track if we've navigated before
15
+ private logLimit: number;
14
16
 
15
17
  constructor(config: BrowserToolkitConfig = {}) {
16
18
  // Use ConfigLoader's fromPythonConfig to handle conversion properly
17
19
  this.configLoader = ConfigLoader.fromPythonConfig(config);
20
+ // Load browser configuration for console log limit, default to 1000
21
+ this.logLimit = this.configLoader.getBrowserConfig().consoleLogLimit || 1000;
18
22
  }
19
23
 
20
- async ensureBrowser(): Promise<void> {
24
+ private registerNewPage(tabId: string, page: Page): void {
25
+ // Register page and logs with tabId
26
+ this.pages.set(tabId, page);
27
+ this.consoleLogs.set(tabId, []);
28
+ // Set up console log listener for the page
29
+ page.on('console', (msg: ConsoleMessage) => {
30
+ const logs = this.consoleLogs.get(tabId);
31
+ if (logs) {
32
+ logs.push(msg);
33
+ if (logs.length > this.logLimit) {
34
+ logs.shift();
35
+ }
36
+ }
37
+ });
38
+
39
+ // Clean logs on page close
40
+ page.on('close', () => {
41
+ this.consoleLogs.delete(tabId);
42
+ });
43
+ }
44
+
45
+ async ensureBrowser(): Promise<void> {
21
46
  if (this.browser) {
22
47
  return;
23
48
  }
@@ -57,7 +82,7 @@ export class HybridBrowserSession {
57
82
  // In CDP mode, only consider pages with about:blank as available
58
83
  if (pageUrl === 'about:blank') {
59
84
  const tabId = this.generateTabId();
60
- this.pages.set(tabId, page);
85
+ this.registerNewPage(tabId, page);
61
86
  if (!this.currentTabId) {
62
87
  this.currentTabId = tabId;
63
88
  availablePageFound = true;
@@ -97,7 +122,7 @@ export class HybridBrowserSession {
97
122
  const pages = this.context.pages();
98
123
  if (pages.length > 0) {
99
124
  const initialTabId = this.generateTabId();
100
- this.pages.set(initialTabId, pages[0]);
125
+ this.registerNewPage(initialTabId, pages[0]);
101
126
  this.currentTabId = initialTabId;
102
127
  }
103
128
  } else {
@@ -115,7 +140,7 @@ export class HybridBrowserSession {
115
140
 
116
141
  const initialPage = await this.context.newPage();
117
142
  const initialTabId = this.generateTabId();
118
- this.pages.set(initialTabId, initialPage);
143
+ this.registerNewPage(initialTabId, initialPage);
119
144
  this.currentTabId = initialTabId;
120
145
  }
121
146
  }
@@ -139,6 +164,13 @@ export class HybridBrowserSession {
139
164
  return this.pages.get(this.currentTabId)!;
140
165
  }
141
166
 
167
+ async getCurrentLogs(): Promise<ConsoleMessage[]> {
168
+ if (!this.currentTabId || !this.consoleLogs.has(this.currentTabId)) {
169
+ return [];
170
+ }
171
+ return this.consoleLogs.get(this.currentTabId) || [];
172
+ }
173
+
142
174
  /**
143
175
  * Get current scroll position from the page
144
176
  */
@@ -343,7 +375,7 @@ export class HybridBrowserSession {
343
375
 
344
376
  // Generate tab ID for the new page
345
377
  const newTabId = this.generateTabId();
346
- this.pages.set(newTabId, newPage);
378
+ this.registerNewPage(newTabId, newPage);
347
379
 
348
380
  // Set up page properties
349
381
  const browserConfig = this.configLoader.getBrowserConfig();
@@ -434,7 +466,97 @@ export class HybridBrowserSession {
434
466
  }
435
467
  }
436
468
 
469
+ /**
470
+ * Simplified mouse control implementation
471
+ */
472
+ private async performMouseControl(page: Page, control: string, x: number, y: number): Promise<{ success: boolean; error?: string }> {
473
+ try {
474
+ const viewport = page.viewportSize();
475
+ if (!viewport) {
476
+ return { success: false, error: 'Viewport size not available from page.' };
477
+ }
478
+ if (x < 0 || y < 0 || x > viewport.width || y > viewport.height) {
479
+ return { success: false, error: `Invalid coordinates, outside viewport bounds: (${x}, ${y})` };
480
+ }
481
+ switch (control) {
482
+ case 'click': {
483
+ await page.mouse.click(x, y);
484
+ break;
485
+ }
486
+ case 'right_click': {
487
+ await page.mouse.click(x, y, { button: 'right' });
488
+ break;
489
+ }
490
+ case 'dblclick': {
491
+ await page.mouse.dblclick(x, y);
492
+ break;
493
+ }
494
+ default:
495
+ return { success: false, error: `Invalid control action: ${control}` };
496
+ }
497
+
498
+ return { success: true };
499
+ } catch (error) {
500
+ return { success: false, error: `Mouse action failed: ${error}` };
501
+ }
502
+ }
437
503
 
504
+ /**
505
+ * Enhanced mouse drag and drop implementation using ref IDs
506
+ */
507
+ private async performMouseDrag(page: Page, fromRef: string, toRef: string): Promise<{ success: boolean; error?: string }> {
508
+ try {
509
+ // Ensure we have the latest snapshot
510
+ await (page as any)._snapshotForAI();
511
+
512
+ // Get elements using Playwright's aria-ref selector
513
+ const fromSelector = `aria-ref=${fromRef}`;
514
+ const toSelector = `aria-ref=${toRef}`;
515
+
516
+ const fromElement = await page.locator(fromSelector).first();
517
+ const toElement = await page.locator(toSelector).first();
518
+
519
+ // Check if elements exist
520
+ const fromExists = await fromElement.count() > 0;
521
+ const toExists = await toElement.count() > 0;
522
+
523
+ if (!fromExists) {
524
+ return { success: false, error: `Source element with ref ${fromRef} not found` };
525
+ }
526
+
527
+ if (!toExists) {
528
+ return { success: false, error: `Target element with ref ${toRef} not found` };
529
+ }
530
+
531
+ // Get the center coordinates of both elements
532
+ const fromBox = await fromElement.boundingBox();
533
+ const toBox = await toElement.boundingBox();
534
+
535
+ if (!fromBox) {
536
+ return { success: false, error: `Could not get bounding box for source element with ref ${fromRef}` };
537
+ }
538
+
539
+ if (!toBox) {
540
+ return { success: false, error: `Could not get bounding box for target element with ref ${toRef}` };
541
+ }
542
+
543
+ const fromX = fromBox.x + fromBox.width / 2;
544
+ const fromY = fromBox.y + fromBox.height / 2;
545
+ const toX = toBox.x + toBox.width / 2;
546
+ const toY = toBox.y + toBox.height / 2;
547
+
548
+ // Perform the drag operation
549
+ await page.mouse.move(fromX, fromY);
550
+ await page.mouse.down();
551
+ // Destination coordinates
552
+ await page.mouse.move(toX, toY);
553
+ await page.mouse.up();
554
+
555
+ return { success: true };
556
+ } catch (error) {
557
+ return { success: false, error: `Mouse drag action failed: ${error}` };
558
+ }
559
+ }
438
560
 
439
561
  async executeAction(action: BrowserAction): Promise<ActionResult> {
440
562
  const startTime = Date.now();
@@ -519,6 +641,40 @@ export class HybridBrowserSession {
519
641
  actionExecutionTime = Date.now() - enterStart;
520
642
  break;
521
643
  }
644
+
645
+ case 'mouse_control': {
646
+ elementSearchTime = Date.now() - elementSearchStart;
647
+ const mouseControlStart = Date.now();
648
+ const mouseControlResult = await this.performMouseControl(page, action.control, action.x, action.y);
649
+
650
+ if (!mouseControlResult.success) {
651
+ throw new Error(`Action failed: ${mouseControlResult.error}`);
652
+ }
653
+ actionExecutionTime = Date.now() - mouseControlStart;
654
+ break;
655
+ }
656
+
657
+ case 'mouse_drag': {
658
+ elementSearchTime = Date.now() - elementSearchStart;
659
+ const mouseDragStart = Date.now();
660
+ const mouseDragResult = await this.performMouseDrag(page, action.from_ref, action.to_ref);
661
+
662
+ if (!mouseDragResult.success) {
663
+ throw new Error(`Action failed: ${mouseDragResult.error}`);
664
+ }
665
+ actionExecutionTime = Date.now() - mouseDragStart;
666
+ break;
667
+ }
668
+
669
+ case 'press_key': {
670
+ elementSearchTime = Date.now() - elementSearchStart;
671
+ const keyPressStart = Date.now();
672
+ // concatenate keys with '+' for key combinations
673
+ const keys = action.keys.join('+');
674
+ await page.keyboard.press(keys);
675
+ actionExecutionTime = Date.now() - keyPressStart;
676
+ break;
677
+ }
522
678
 
523
679
  default:
524
680
  throw new Error(`Unknown action type: ${(action as any).type}`);
@@ -651,7 +807,7 @@ export class HybridBrowserSession {
651
807
  if (!isTracked && pageUrl === 'about:blank') {
652
808
  newPage = page;
653
809
  newTabId = this.generateTabId();
654
- this.pages.set(newTabId, newPage);
810
+ this.registerNewPage(newTabId, newPage);
655
811
  break;
656
812
  }
657
813
  }
@@ -663,7 +819,7 @@ export class HybridBrowserSession {
663
819
  // Non-CDP mode: create new page as usual
664
820
  newPage = await this.context.newPage();
665
821
  newTabId = this.generateTabId();
666
- this.pages.set(newTabId, newPage);
822
+ this.registerNewPage(newTabId, newPage);
667
823
  }
668
824
 
669
825
  // Set up page properties
@@ -30,6 +30,7 @@ export interface BrowserConfig {
30
30
  // Tab management
31
31
  tabIdPrefix: string;
32
32
  tabCounterPadding: number;
33
+ consoleLogLimit: number;
33
34
 
34
35
  // Scroll and positioning
35
36
  scrollPositionScale: number;
@@ -113,6 +114,7 @@ function getDefaultBrowserConfig(): BrowserConfig {
113
114
  clickTimeout: 3000,
114
115
  tabIdPrefix: 'tab-',
115
116
  tabCounterPadding: 3,
117
+ consoleLogLimit: 1000,
116
118
  scrollPositionScale: 0.1,
117
119
  navigationDelay: 100,
118
120
  blankPageUrls: ['about:blank', ''],
@@ -1,6 +1,7 @@
1
1
  import {HybridBrowserSession} from './browser-session';
2
2
  import {ActionResult, BrowserAction, BrowserToolkitConfig, SnapshotResult, TabInfo, VisualMarkResult} from './types';
3
3
  import {ConfigLoader} from './config-loader';
4
+ import {ConsoleMessage} from 'playwright';
4
5
 
5
6
  export class HybridBrowserToolkit {
6
7
  private session: HybridBrowserSession;
@@ -382,6 +383,21 @@ export class HybridBrowserToolkit {
382
383
  return this.executeActionWithSnapshot(action);
383
384
  }
384
385
 
386
+ async mouseControl(control: 'click' | 'right_click'| 'dblclick', x: number, y: number): Promise<any> {
387
+ const action: BrowserAction = { type: 'mouse_control', control, x, y };
388
+ return this.executeActionWithSnapshot(action);
389
+ }
390
+
391
+ async mouseDrag(from_ref: string, to_ref: string): Promise<any> {
392
+ const action: BrowserAction = { type: 'mouse_drag', from_ref, to_ref };
393
+ return this.executeActionWithSnapshot(action);
394
+ }
395
+
396
+ async pressKeys(keys: string[]): Promise<any> {
397
+ const action: BrowserAction = { type: 'press_key', keys};
398
+ return this.executeActionWithSnapshot(action);
399
+ }
400
+
385
401
  async back(): Promise<ActionResult> {
386
402
  const startTime = Date.now();
387
403
 
@@ -519,4 +535,93 @@ export class HybridBrowserToolkit {
519
535
  return await this.session.getTabInfo();
520
536
  }
521
537
 
522
- }
538
+ async getConsoleView(): Promise<any> {
539
+ const currentLogs = await this.session.getCurrentLogs();
540
+ // Format logs
541
+ return currentLogs.map(item => ({
542
+ type: item.type(),
543
+ text: item.text(),
544
+ }));
545
+ }
546
+
547
+ async consoleExecute(code: string): Promise<any> {
548
+ const startTime = Date.now();
549
+ try {
550
+ const page = await this.session.getCurrentPage();
551
+
552
+ // Wrap the code to capture console.log output
553
+ const wrappedCode = `
554
+ (function() {
555
+ const _logs = [];
556
+ const originalLog = console.log;
557
+ console.log = function(...args) {
558
+ _logs.push(args.map(arg => {
559
+ try {
560
+ return typeof arg === 'object' ? JSON.stringify(arg) : String(arg);
561
+ } catch (e) {
562
+ return String(arg);
563
+ }
564
+ }).join(' '));
565
+ originalLog.apply(console, args);
566
+ };
567
+
568
+ let result;
569
+ try {
570
+ result = eval(${JSON.stringify(code)});
571
+ } catch (e) {
572
+ try {
573
+ result = (function() { ${code} })();
574
+ } catch (error) {
575
+ console.log = originalLog;
576
+ throw error;
577
+ }
578
+ }
579
+
580
+ console.log = originalLog;
581
+ return { result, logs: _logs };
582
+ })()
583
+ `;
584
+
585
+ const evalResult = await page.evaluate(wrappedCode) as { result: any; logs: string[] };
586
+ const { result, logs } = evalResult;
587
+
588
+ const snapshotStart = Date.now();
589
+ const snapshot = await this.getPageSnapshot(this.viewportLimit);
590
+ const snapshotTime = Date.now() - snapshotStart;
591
+ const totalTime = Date.now() - startTime;
592
+
593
+ // Properly serialize the result
594
+ let resultStr: string;
595
+ try {
596
+ resultStr = JSON.stringify(result, null, 2);
597
+ } catch (e) {
598
+ // Fallback for non-serializable values
599
+ resultStr = String(result);
600
+ }
601
+
602
+ return {
603
+ result: `Console execution result: ${resultStr}`,
604
+ console_output: logs,
605
+ snapshot: snapshot,
606
+ timing: {
607
+ total_time_ms: totalTime,
608
+ snapshot_time_ms: snapshotTime,
609
+ },
610
+ };
611
+
612
+ } catch (error) {
613
+ const totalTime = Date.now() - startTime;
614
+ return {
615
+ result: `Console execution failed: ${error}`,
616
+ console_output: [],
617
+ snapshot: '',
618
+ timing: {
619
+ total_time_ms: totalTime,
620
+ snapshot_time_ms: 0,
621
+ },
622
+ };
623
+ }
624
+ }
625
+
626
+ }
627
+
@@ -101,7 +101,25 @@ export interface EnterAction {
101
101
  type: 'enter';
102
102
  }
103
103
 
104
- export type BrowserAction = ClickAction | TypeAction | SelectAction | ScrollAction | EnterAction;
104
+ export interface MouseAction {
105
+ type: 'mouse_control';
106
+ control: 'click' | 'right_click' | 'dblclick';
107
+ x: number;
108
+ y: number;
109
+ }
110
+
111
+ export interface MouseDragAction {
112
+ type: 'mouse_drag';
113
+ from_ref: string;
114
+ to_ref: string;
115
+ }
116
+
117
+ export interface PressKeyAction {
118
+ type: 'press_key';
119
+ keys: string[];
120
+ }
121
+
122
+ export type BrowserAction = ClickAction | TypeAction | SelectAction | ScrollAction | EnterAction | MouseAction | MouseDragAction | PressKeyAction;
105
123
 
106
124
  export interface VisualMarkResult {
107
125
  text: string;
@@ -173,6 +173,18 @@ class WebSocketBrowserServer {
173
173
  case 'enter':
174
174
  if (!this.toolkit) throw new Error('Toolkit not initialized');
175
175
  return await this.toolkit.enter();
176
+
177
+ case 'mouse_control':
178
+ if (!this.toolkit) throw new Error('Toolkit not initialized');
179
+ return await this.toolkit.mouseControl(params.control, params.x, params.y);
180
+
181
+ case 'mouse_drag':
182
+ if (!this.toolkit) throw new Error('Toolkit not initialized');
183
+ return await this.toolkit.mouseDrag(params.from_ref, params.to_ref);
184
+
185
+ case 'press_key':
186
+ if (!this.toolkit) throw new Error('Toolkit not initialized');
187
+ return await this.toolkit.pressKeys(params.keys);
176
188
 
177
189
  case 'back':
178
190
  if (!this.toolkit) throw new Error('Toolkit not initialized');
@@ -194,6 +206,14 @@ class WebSocketBrowserServer {
194
206
  if (!this.toolkit) throw new Error('Toolkit not initialized');
195
207
  return await this.toolkit.getTabInfo();
196
208
 
209
+ case 'console_view':
210
+ if (!this.toolkit) throw new Error('Toolkit not initialized');
211
+ return await this.toolkit.getConsoleView();
212
+
213
+ case 'console_exec':
214
+ if (!this.toolkit) throw new Error('Toolkit not initialized');
215
+ return await this.toolkit.consoleExecute(params.code);
216
+
197
217
  case 'wait_user':
198
218
  if (!this.toolkit) throw new Error('Toolkit not initialized');
199
219
  return await this.toolkit.waitUser(params.timeout);
@@ -537,6 +537,31 @@ class WebSocketBrowserWrapper:
537
537
  response = await self._send_command('enter', {})
538
538
  return response
539
539
 
540
+ @action_logger
541
+ async def mouse_control(
542
+ self, control: str, x: float, y: float
543
+ ) -> Dict[str, Any]:
544
+ """Control the mouse to interact with browser with x, y coordinates."""
545
+ response = await self._send_command(
546
+ 'mouse_control', {'control': control, 'x': x, 'y': y}
547
+ )
548
+ return response
549
+
550
+ @action_logger
551
+ async def mouse_drag(self, from_ref: str, to_ref: str) -> Dict[str, Any]:
552
+ """Control the mouse to drag and drop in the browser using ref IDs."""
553
+ response = await self._send_command(
554
+ 'mouse_drag',
555
+ {'from_ref': from_ref, 'to_ref': to_ref},
556
+ )
557
+ return response
558
+
559
+ @action_logger
560
+ async def press_key(self, keys: List[str]) -> Dict[str, Any]:
561
+ """Press key and key combinations."""
562
+ response = await self._send_command('press_key', {'keys': keys})
563
+ return response
564
+
540
565
  @action_logger
541
566
  async def back(self) -> Dict[str, Any]:
542
567
  """Navigate back."""
@@ -571,6 +596,22 @@ class WebSocketBrowserWrapper:
571
596
  # Fallback if wrapped in an object
572
597
  return response.get('tabs', [])
573
598
 
599
+ @action_logger
600
+ async def console_view(self) -> List[Dict[str, Any]]:
601
+ """Get current page console view"""
602
+ response = await self._send_command('console_view', {})
603
+
604
+ if isinstance(response, list):
605
+ return response
606
+
607
+ return response.get('logs', [])
608
+
609
+ @action_logger
610
+ async def console_exec(self, code: str) -> Dict[str, Any]:
611
+ """Execute javascript code and get result."""
612
+ response = await self._send_command('console_exec', {'code': code})
613
+ return response
614
+
574
615
  @action_logger
575
616
  async def wait_user(
576
617
  self, timeout_sec: Optional[float] = None
@@ -73,6 +73,9 @@ class ActionExecutor:
73
73
  "extract": self._extract,
74
74
  "scroll": self._scroll,
75
75
  "enter": self._enter,
76
+ "mouse_control": self._mouse_control,
77
+ "mouse_drag": self._mouse_drag,
78
+ "press_key": self._press_key,
76
79
  }.get(action_type)
77
80
 
78
81
  if handler is None:
@@ -382,6 +385,150 @@ class ActionExecutor:
382
385
  "details": details,
383
386
  }
384
387
 
388
+ async def _mouse_control(self, action: Dict[str, Any]) -> Dict[str, Any]:
389
+ r"""Handle mouse_control action based on the coordinates"""
390
+ control = action.get("control", "click")
391
+ x_coord = action.get("x", 0)
392
+ y_coord = action.get("y", 0)
393
+
394
+ details = {
395
+ "action_type": "mouse_control",
396
+ "target": f"coordinates : ({x_coord}, {y_coord})",
397
+ }
398
+ try:
399
+ if not self._valid_coordinates(x_coord, y_coord):
400
+ raise ValueError(
401
+ "Invalid coordinates, outside viewport bounds :"
402
+ f"({x_coord}, {y_coord})"
403
+ )
404
+ match control:
405
+ case "click":
406
+ await self.page.mouse.click(x_coord, y_coord)
407
+ message = "Action 'click' performed on the target"
408
+
409
+ case "right_click":
410
+ await self.page.mouse.click(
411
+ x_coord, y_coord, button="right"
412
+ )
413
+ message = "Action 'right_click' performed on the target"
414
+
415
+ case "dblclick":
416
+ await self.page.mouse.dblclick(x_coord, y_coord)
417
+ message = "Action 'dblclick' performed on the target"
418
+
419
+ case _:
420
+ return {
421
+ "message": f"Invalid control action {control}",
422
+ "details": details,
423
+ }
424
+
425
+ return {"message": message, "details": details}
426
+ except Exception as e:
427
+ return {"message": f"Action failed: {e}", "details": details}
428
+
429
+ async def _mouse_drag(self, action: Dict[str, Any]) -> Dict[str, Any]:
430
+ r"""Handle mouse_drag action using ref IDs"""
431
+ from_ref = action.get("from_ref")
432
+ to_ref = action.get("to_ref")
433
+
434
+ if not from_ref or not to_ref:
435
+ return {
436
+ "message": "Error: mouse_drag requires from_ref and to_ref",
437
+ "details": {"error": "missing_refs"},
438
+ }
439
+
440
+ from_selector = f"[aria-ref='{from_ref}']"
441
+ to_selector = f"[aria-ref='{to_ref}']"
442
+
443
+ details = {
444
+ "action_type": "mouse_drag",
445
+ "from_ref": from_ref,
446
+ "to_ref": to_ref,
447
+ "from_selector": from_selector,
448
+ "to_selector": to_selector,
449
+ }
450
+
451
+ try:
452
+ # Get the source element
453
+ from_element = self.page.locator(from_selector)
454
+ from_count = await from_element.count()
455
+ if from_count == 0:
456
+ raise ValueError(
457
+ f"Source element with ref '{from_ref}' not found"
458
+ )
459
+
460
+ # Get the target element
461
+ to_element = self.page.locator(to_selector)
462
+ to_count = await to_element.count()
463
+ if to_count == 0:
464
+ raise ValueError(
465
+ f"Target element with ref '{to_ref}' not found"
466
+ )
467
+
468
+ # Get bounding boxes
469
+ from_box = await from_element.first.bounding_box()
470
+ to_box = await to_element.first.bounding_box()
471
+
472
+ if not from_box:
473
+ raise ValueError(
474
+ f"Could not get bounding box for source element "
475
+ f"with ref '{from_ref}'"
476
+ )
477
+ if not to_box:
478
+ raise ValueError(
479
+ f"Could not get bounding box for target element "
480
+ f"with ref '{to_ref}'"
481
+ )
482
+
483
+ # Calculate center coordinates
484
+ from_x = from_box['x'] + from_box['width'] / 2
485
+ from_y = from_box['y'] + from_box['height'] / 2
486
+ to_x = to_box['x'] + to_box['width'] / 2
487
+ to_y = to_box['y'] + to_box['height'] / 2
488
+
489
+ details.update(
490
+ {
491
+ "from_coordinates": {"x": from_x, "y": from_y},
492
+ "to_coordinates": {"x": to_x, "y": to_y},
493
+ }
494
+ )
495
+
496
+ # Perform the drag operation
497
+ await self.page.mouse.move(from_x, from_y)
498
+ await self.page.mouse.down()
499
+ # Destination coordinates
500
+ await self.page.mouse.move(to_x, to_y)
501
+ await self.page.mouse.up()
502
+
503
+ return {
504
+ "message": (
505
+ f"Dragged from element [ref={from_ref}] to element "
506
+ f"[ref={to_ref}]"
507
+ ),
508
+ "details": details,
509
+ }
510
+ except Exception as e:
511
+ return {"message": f"Action failed: {e}", "details": details}
512
+
513
+ async def _press_key(self, action: Dict[str, Any]) -> Dict[str, Any]:
514
+ r"""Handle press_key action by combining the keys in a list."""
515
+ keys = action.get("keys", [])
516
+ if not keys:
517
+ return {
518
+ "message": "Error: No keys specified",
519
+ "details": {"action_type": "press_key", "keys": ""},
520
+ }
521
+ combined_keys = "+".join(keys)
522
+ details = {"action_type": "press_key", "keys": combined_keys}
523
+ try:
524
+ await self.page.keyboard.press(combined_keys)
525
+ return {
526
+ "message": "Pressed keys in the browser",
527
+ "details": details,
528
+ }
529
+ except Exception as e:
530
+ return {"message": f"Action failed: {e}", "details": details}
531
+
385
532
  # utilities
386
533
  async def _wait_dom_stable(self) -> None:
387
534
  r"""Wait for DOM to become stable before executing actions."""
@@ -402,6 +549,17 @@ class ActionExecutor:
402
549
  except Exception:
403
550
  pass # Don't fail if wait times out
404
551
 
552
+ def _valid_coordinates(self, x_coord: float, y_coord: float) -> bool:
553
+ r"""Validate given coordinates against viewport bounds."""
554
+ viewport = self.page.viewport_size
555
+ if not viewport:
556
+ raise ValueError("Viewport size not available from current page.")
557
+
558
+ return (
559
+ 0 <= x_coord <= viewport['width']
560
+ and 0 <= y_coord <= viewport['height']
561
+ )
562
+
405
563
  # static helpers
406
564
  @staticmethod
407
565
  def should_update_snapshot(action: Dict[str, Any]) -> bool: