camel-ai 0.2.73a4__py3-none-any.whl → 0.2.80a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (173) hide show
  1. camel/__init__.py +1 -1
  2. camel/agents/_utils.py +38 -0
  3. camel/agents/chat_agent.py +2217 -519
  4. camel/agents/mcp_agent.py +30 -27
  5. camel/configs/__init__.py +15 -0
  6. camel/configs/aihubmix_config.py +88 -0
  7. camel/configs/amd_config.py +70 -0
  8. camel/configs/cometapi_config.py +104 -0
  9. camel/configs/minimax_config.py +93 -0
  10. camel/configs/nebius_config.py +103 -0
  11. camel/data_collectors/alpaca_collector.py +15 -6
  12. camel/datasets/base_generator.py +39 -10
  13. camel/environments/single_step.py +28 -3
  14. camel/environments/tic_tac_toe.py +1 -1
  15. camel/interpreters/__init__.py +2 -0
  16. camel/interpreters/docker/Dockerfile +3 -12
  17. camel/interpreters/e2b_interpreter.py +34 -1
  18. camel/interpreters/microsandbox_interpreter.py +395 -0
  19. camel/loaders/__init__.py +11 -2
  20. camel/loaders/chunkr_reader.py +9 -0
  21. camel/memories/agent_memories.py +48 -4
  22. camel/memories/base.py +26 -0
  23. camel/memories/blocks/chat_history_block.py +122 -4
  24. camel/memories/context_creators/score_based.py +25 -384
  25. camel/memories/records.py +88 -8
  26. camel/messages/base.py +153 -34
  27. camel/models/__init__.py +10 -0
  28. camel/models/aihubmix_model.py +83 -0
  29. camel/models/aiml_model.py +1 -16
  30. camel/models/amd_model.py +101 -0
  31. camel/models/anthropic_model.py +6 -19
  32. camel/models/aws_bedrock_model.py +2 -33
  33. camel/models/azure_openai_model.py +114 -89
  34. camel/models/base_audio_model.py +3 -1
  35. camel/models/base_model.py +32 -14
  36. camel/models/cohere_model.py +1 -16
  37. camel/models/cometapi_model.py +83 -0
  38. camel/models/crynux_model.py +1 -16
  39. camel/models/deepseek_model.py +1 -16
  40. camel/models/fish_audio_model.py +6 -0
  41. camel/models/gemini_model.py +36 -18
  42. camel/models/groq_model.py +1 -17
  43. camel/models/internlm_model.py +1 -16
  44. camel/models/litellm_model.py +1 -16
  45. camel/models/lmstudio_model.py +1 -17
  46. camel/models/minimax_model.py +83 -0
  47. camel/models/mistral_model.py +1 -16
  48. camel/models/model_factory.py +27 -1
  49. camel/models/modelscope_model.py +1 -16
  50. camel/models/moonshot_model.py +105 -24
  51. camel/models/nebius_model.py +83 -0
  52. camel/models/nemotron_model.py +0 -5
  53. camel/models/netmind_model.py +1 -16
  54. camel/models/novita_model.py +1 -16
  55. camel/models/nvidia_model.py +1 -16
  56. camel/models/ollama_model.py +4 -19
  57. camel/models/openai_compatible_model.py +62 -41
  58. camel/models/openai_model.py +62 -57
  59. camel/models/openrouter_model.py +1 -17
  60. camel/models/ppio_model.py +1 -16
  61. camel/models/qianfan_model.py +1 -16
  62. camel/models/qwen_model.py +1 -16
  63. camel/models/reka_model.py +1 -16
  64. camel/models/samba_model.py +34 -47
  65. camel/models/sglang_model.py +64 -31
  66. camel/models/siliconflow_model.py +1 -16
  67. camel/models/stub_model.py +0 -4
  68. camel/models/togetherai_model.py +1 -16
  69. camel/models/vllm_model.py +1 -16
  70. camel/models/volcano_model.py +0 -17
  71. camel/models/watsonx_model.py +1 -16
  72. camel/models/yi_model.py +1 -16
  73. camel/models/zhipuai_model.py +60 -16
  74. camel/parsers/__init__.py +18 -0
  75. camel/parsers/mcp_tool_call_parser.py +176 -0
  76. camel/retrievers/auto_retriever.py +1 -0
  77. camel/runtimes/daytona_runtime.py +11 -12
  78. camel/societies/__init__.py +2 -0
  79. camel/societies/workforce/__init__.py +2 -0
  80. camel/societies/workforce/events.py +122 -0
  81. camel/societies/workforce/prompts.py +146 -66
  82. camel/societies/workforce/role_playing_worker.py +15 -11
  83. camel/societies/workforce/single_agent_worker.py +302 -65
  84. camel/societies/workforce/structured_output_handler.py +30 -18
  85. camel/societies/workforce/task_channel.py +163 -27
  86. camel/societies/workforce/utils.py +107 -13
  87. camel/societies/workforce/workflow_memory_manager.py +772 -0
  88. camel/societies/workforce/workforce.py +1949 -579
  89. camel/societies/workforce/workforce_callback.py +74 -0
  90. camel/societies/workforce/workforce_logger.py +168 -145
  91. camel/societies/workforce/workforce_metrics.py +33 -0
  92. camel/storages/key_value_storages/json.py +15 -2
  93. camel/storages/key_value_storages/mem0_cloud.py +48 -47
  94. camel/storages/object_storages/google_cloud.py +1 -1
  95. camel/storages/vectordb_storages/oceanbase.py +13 -13
  96. camel/storages/vectordb_storages/qdrant.py +3 -3
  97. camel/storages/vectordb_storages/tidb.py +8 -6
  98. camel/tasks/task.py +4 -3
  99. camel/toolkits/__init__.py +20 -7
  100. camel/toolkits/aci_toolkit.py +45 -0
  101. camel/toolkits/base.py +6 -4
  102. camel/toolkits/code_execution.py +28 -1
  103. camel/toolkits/context_summarizer_toolkit.py +684 -0
  104. camel/toolkits/dappier_toolkit.py +5 -1
  105. camel/toolkits/dingtalk.py +1135 -0
  106. camel/toolkits/edgeone_pages_mcp_toolkit.py +11 -31
  107. camel/toolkits/excel_toolkit.py +1 -1
  108. camel/toolkits/{file_write_toolkit.py → file_toolkit.py} +430 -36
  109. camel/toolkits/function_tool.py +13 -3
  110. camel/toolkits/github_toolkit.py +104 -17
  111. camel/toolkits/gmail_toolkit.py +1839 -0
  112. camel/toolkits/google_calendar_toolkit.py +38 -4
  113. camel/toolkits/google_drive_mcp_toolkit.py +12 -31
  114. camel/toolkits/hybrid_browser_toolkit/config_loader.py +15 -0
  115. camel/toolkits/hybrid_browser_toolkit/hybrid_browser_toolkit.py +77 -8
  116. camel/toolkits/hybrid_browser_toolkit/hybrid_browser_toolkit_ts.py +884 -88
  117. camel/toolkits/hybrid_browser_toolkit/installer.py +203 -0
  118. camel/toolkits/hybrid_browser_toolkit/ts/package-lock.json +5 -612
  119. camel/toolkits/hybrid_browser_toolkit/ts/package.json +0 -1
  120. camel/toolkits/hybrid_browser_toolkit/ts/src/browser-session.ts +959 -89
  121. camel/toolkits/hybrid_browser_toolkit/ts/src/config-loader.ts +9 -2
  122. camel/toolkits/hybrid_browser_toolkit/ts/src/hybrid-browser-toolkit.ts +281 -213
  123. camel/toolkits/hybrid_browser_toolkit/ts/src/parent-child-filter.ts +226 -0
  124. camel/toolkits/hybrid_browser_toolkit/ts/src/snapshot-parser.ts +219 -0
  125. camel/toolkits/hybrid_browser_toolkit/ts/src/som-screenshot-injected.ts +543 -0
  126. camel/toolkits/hybrid_browser_toolkit/ts/src/types.ts +23 -3
  127. camel/toolkits/hybrid_browser_toolkit/ts/websocket-server.js +72 -7
  128. camel/toolkits/hybrid_browser_toolkit/ws_wrapper.py +582 -132
  129. camel/toolkits/hybrid_browser_toolkit_py/actions.py +158 -0
  130. camel/toolkits/hybrid_browser_toolkit_py/browser_session.py +55 -8
  131. camel/toolkits/hybrid_browser_toolkit_py/config_loader.py +43 -0
  132. camel/toolkits/hybrid_browser_toolkit_py/hybrid_browser_toolkit.py +321 -8
  133. camel/toolkits/hybrid_browser_toolkit_py/snapshot.py +10 -4
  134. camel/toolkits/hybrid_browser_toolkit_py/unified_analyzer.js +45 -4
  135. camel/toolkits/{openai_image_toolkit.py → image_generation_toolkit.py} +151 -53
  136. camel/toolkits/klavis_toolkit.py +5 -1
  137. camel/toolkits/markitdown_toolkit.py +27 -1
  138. camel/toolkits/math_toolkit.py +64 -10
  139. camel/toolkits/mcp_toolkit.py +366 -71
  140. camel/toolkits/memory_toolkit.py +5 -1
  141. camel/toolkits/message_integration.py +18 -13
  142. camel/toolkits/minimax_mcp_toolkit.py +195 -0
  143. camel/toolkits/note_taking_toolkit.py +19 -10
  144. camel/toolkits/notion_mcp_toolkit.py +16 -26
  145. camel/toolkits/openbb_toolkit.py +5 -1
  146. camel/toolkits/origene_mcp_toolkit.py +8 -49
  147. camel/toolkits/playwright_mcp_toolkit.py +12 -31
  148. camel/toolkits/resend_toolkit.py +168 -0
  149. camel/toolkits/search_toolkit.py +264 -91
  150. camel/toolkits/slack_toolkit.py +64 -10
  151. camel/toolkits/terminal_toolkit/__init__.py +18 -0
  152. camel/toolkits/terminal_toolkit/terminal_toolkit.py +957 -0
  153. camel/toolkits/terminal_toolkit/utils.py +532 -0
  154. camel/toolkits/vertex_ai_veo_toolkit.py +590 -0
  155. camel/toolkits/video_analysis_toolkit.py +17 -11
  156. camel/toolkits/wechat_official_toolkit.py +483 -0
  157. camel/toolkits/zapier_toolkit.py +5 -1
  158. camel/types/__init__.py +2 -2
  159. camel/types/enums.py +274 -7
  160. camel/types/openai_types.py +2 -2
  161. camel/types/unified_model_type.py +15 -0
  162. camel/utils/commons.py +36 -5
  163. camel/utils/constants.py +3 -0
  164. camel/utils/context_utils.py +1003 -0
  165. camel/utils/mcp.py +138 -4
  166. camel/utils/token_counting.py +43 -20
  167. {camel_ai-0.2.73a4.dist-info → camel_ai-0.2.80a2.dist-info}/METADATA +223 -83
  168. {camel_ai-0.2.73a4.dist-info → camel_ai-0.2.80a2.dist-info}/RECORD +170 -141
  169. camel/loaders/pandas_reader.py +0 -368
  170. camel/toolkits/openai_agent_toolkit.py +0 -135
  171. camel/toolkits/terminal_toolkit.py +0 -1550
  172. {camel_ai-0.2.73a4.dist-info → camel_ai-0.2.80a2.dist-info}/WHEEL +0 -0
  173. {camel_ai-0.2.73a4.dist-info → camel_ai-0.2.80a2.dist-info}/licenses/LICENSE +0 -0
@@ -1,23 +1,49 @@
1
- import { Page, Browser, BrowserContext, chromium } from 'playwright';
1
+ import { Page, Browser, BrowserContext, chromium, ConsoleMessage, Frame } from 'playwright';
2
2
  import { BrowserToolkitConfig, SnapshotResult, SnapshotElement, ActionResult, TabInfo, BrowserAction, DetailedTiming } from './types';
3
3
  import { ConfigLoader, StealthConfig } from './config-loader';
4
4
 
5
5
  export class HybridBrowserSession {
6
6
  private browser: Browser | null = null;
7
7
  private context: BrowserContext | null = null;
8
+ private contextOwnedByUs: boolean = false;
8
9
  private pages: Map<string, Page> = new Map();
10
+ private consoleLogs: Map<string, ConsoleMessage[]> = new Map();
9
11
  private currentTabId: string | null = null;
10
12
  private tabCounter = 0;
11
13
  private configLoader: ConfigLoader;
12
14
  private scrollPosition: { x: number; y: number } = {x: 0, y: 0};
13
15
  private hasNavigatedBefore = false; // Track if we've navigated before
16
+ private logLimit: number;
14
17
 
15
18
  constructor(config: BrowserToolkitConfig = {}) {
16
19
  // Use ConfigLoader's fromPythonConfig to handle conversion properly
17
20
  this.configLoader = ConfigLoader.fromPythonConfig(config);
21
+ // Load browser configuration for console log limit, default to 1000
22
+ this.logLimit = this.configLoader.getBrowserConfig().consoleLogLimit || 1000;
18
23
  }
19
24
 
20
- async ensureBrowser(): Promise<void> {
25
+ private registerNewPage(tabId: string, page: Page): void {
26
+ // Register page and logs with tabId
27
+ this.pages.set(tabId, page);
28
+ this.consoleLogs.set(tabId, []);
29
+ // Set up console log listener for the page
30
+ page.on('console', (msg: ConsoleMessage) => {
31
+ const logs = this.consoleLogs.get(tabId);
32
+ if (logs) {
33
+ logs.push(msg);
34
+ if (logs.length > this.logLimit) {
35
+ logs.shift();
36
+ }
37
+ }
38
+ });
39
+
40
+ // Clean logs on page close
41
+ page.on('close', () => {
42
+ this.consoleLogs.delete(tabId);
43
+ });
44
+ }
45
+
46
+ async ensureBrowser(): Promise<void> {
21
47
  if (this.browser) {
22
48
  return;
23
49
  }
@@ -25,8 +51,8 @@ export class HybridBrowserSession {
25
51
  const browserConfig = this.configLoader.getBrowserConfig();
26
52
  const stealthConfig = this.configLoader.getStealthConfig();
27
53
 
28
- // Check if CDP connection is requested
29
- if (browserConfig.connectOverCdp && browserConfig.cdpUrl) {
54
+ // Check if CDP URL is provided
55
+ if (browserConfig.cdpUrl) {
30
56
  // Connect to existing browser via CDP
31
57
  this.browser = await chromium.connectOverCDP(browserConfig.cdpUrl);
32
58
 
@@ -34,44 +60,94 @@ export class HybridBrowserSession {
34
60
  const contexts = this.browser.contexts();
35
61
  if (contexts.length > 0) {
36
62
  this.context = contexts[0];
63
+ this.contextOwnedByUs = false;
64
+
65
+ // Apply stealth headers to existing context if configured
66
+ // Note: userAgent cannot be changed on an existing context
67
+ if (stealthConfig.enabled) {
68
+ if (stealthConfig.extraHTTPHeaders) {
69
+ await this.context.setExtraHTTPHeaders(stealthConfig.extraHTTPHeaders);
70
+ }
71
+ if (stealthConfig.userAgent) {
72
+ console.warn('[HybridBrowserSession] Cannot apply userAgent to existing context. Consider creating a new context if userAgent customization is required.');
73
+ }
74
+ }
37
75
  } else {
38
76
  const contextOptions: any = {
39
77
  viewport: browserConfig.viewport
40
78
  };
41
79
 
42
- // Apply stealth headers if configured
43
- if (stealthConfig.enabled && stealthConfig.extraHTTPHeaders) {
44
- contextOptions.extraHTTPHeaders = stealthConfig.extraHTTPHeaders;
80
+ // Apply stealth headers and UA if configured
81
+ if (stealthConfig.enabled) {
82
+ if (stealthConfig.extraHTTPHeaders) {
83
+ contextOptions.extraHTTPHeaders = stealthConfig.extraHTTPHeaders;
84
+ }
85
+ if (stealthConfig.userAgent) {
86
+ contextOptions.userAgent = stealthConfig.userAgent;
87
+ }
45
88
  }
46
89
 
47
90
  this.context = await this.browser.newContext(contextOptions);
91
+ this.contextOwnedByUs = true;
92
+ this.browser = this.context.browser();
48
93
  }
49
94
 
50
- // Handle existing pages
51
95
  const pages = this.context.pages();
52
- if (pages.length > 0) {
53
- // Map existing pages - for CDP, only use pages with about:blank URL
54
- let availablePageFound = false;
55
- for (const page of pages) {
56
- const pageUrl = page.url();
57
- // In CDP mode, only consider pages with about:blank as available
58
- if (pageUrl === 'about:blank') {
96
+ console.log(`[CDP] cdpKeepCurrentPage: ${browserConfig.cdpKeepCurrentPage}, pages count: ${pages.length}`);
97
+ if (browserConfig.cdpKeepCurrentPage) {
98
+ // Use existing page without creating new ones
99
+ if (pages.length > 0) {
100
+ // Find first non-closed page
101
+ let validPage: Page | null = null;
102
+ for (const page of pages) {
103
+ if (!page.isClosed()) {
104
+ validPage = page;
105
+ break;
106
+ }
107
+ }
108
+
109
+ if (validPage) {
59
110
  const tabId = this.generateTabId();
60
- this.pages.set(tabId, page);
61
- if (!this.currentTabId) {
111
+ this.registerNewPage(tabId, validPage);
112
+ this.currentTabId = tabId;
113
+ console.log(`[CDP] cdpKeepCurrentPage mode: using existing page as initial tab: ${tabId}, URL: ${validPage.url()}`);
114
+ } else {
115
+ throw new Error('No active pages available in CDP mode with cdpKeepCurrentPage=true (all pages are closed)');
116
+ }
117
+ } else {
118
+ throw new Error('No pages available in CDP mode with cdpKeepCurrentPage=true');
119
+ }
120
+ } else {
121
+ // Look for blank pages or create new ones
122
+ if (pages.length > 0) {
123
+ // Find one available blank page
124
+ let availablePageFound = false;
125
+ for (const page of pages) {
126
+ const pageUrl = page.url();
127
+ if (this.isBlankPageUrl(pageUrl)) {
128
+ const tabId = this.generateTabId();
129
+ this.registerNewPage(tabId, page);
62
130
  this.currentTabId = tabId;
63
131
  availablePageFound = true;
132
+ console.log(`[CDP] Registered blank page as initial tab: ${tabId}, URL: ${pageUrl}`);
133
+ break;
64
134
  }
65
135
  }
136
+
137
+ if (!availablePageFound) {
138
+ console.log('[CDP] No blank pages found, creating new page');
139
+ const newPage = await this.context.newPage();
140
+ const tabId = this.generateTabId();
141
+ this.registerNewPage(tabId, newPage);
142
+ this.currentTabId = tabId;
143
+ }
144
+ } else {
145
+ console.log('[CDP] No existing pages, creating initial page');
146
+ const newPage = await this.context.newPage();
147
+ const tabId = this.generateTabId();
148
+ this.registerNewPage(tabId, newPage);
149
+ this.currentTabId = tabId;
66
150
  }
67
-
68
- // If no available blank pages found in CDP mode, we cannot create new ones
69
- if (!availablePageFound) {
70
- throw new Error('No available blank tabs found in CDP mode. The frontend should have pre-created blank tabs.');
71
- }
72
- } else {
73
- // In CDP mode, newPage is not supported
74
- throw new Error('No pages available in CDP mode and newPage() is not supported. Ensure the frontend has pre-created blank tabs.');
75
151
  }
76
152
  } else {
77
153
  // Original launch logic
@@ -82,22 +158,28 @@ export class HybridBrowserSession {
82
158
  if (stealthConfig.enabled) {
83
159
  launchOptions.args = stealthConfig.args || [];
84
160
 
85
- // Apply stealth user agent if configured
161
+ // Apply stealth user agent/headers if configured
86
162
  if (stealthConfig.userAgent) {
87
163
  launchOptions.userAgent = stealthConfig.userAgent;
88
164
  }
165
+ if (stealthConfig.extraHTTPHeaders) {
166
+ launchOptions.extraHTTPHeaders = stealthConfig.extraHTTPHeaders;
167
+ }
89
168
  }
90
169
 
91
170
  if (browserConfig.userDataDir) {
171
+ // Ensure viewport is honored in persistent context
172
+ launchOptions.viewport = browserConfig.viewport;
92
173
  this.context = await chromium.launchPersistentContext(
93
174
  browserConfig.userDataDir,
94
175
  launchOptions
95
176
  );
96
-
177
+ this.contextOwnedByUs = true;
178
+ this.browser = this.context.browser();
97
179
  const pages = this.context.pages();
98
180
  if (pages.length > 0) {
99
181
  const initialTabId = this.generateTabId();
100
- this.pages.set(initialTabId, pages[0]);
182
+ this.registerNewPage(initialTabId, pages[0]);
101
183
  this.currentTabId = initialTabId;
102
184
  }
103
185
  } else {
@@ -106,16 +188,22 @@ export class HybridBrowserSession {
106
188
  viewport: browserConfig.viewport
107
189
  };
108
190
 
109
- // Apply stealth headers if configured
110
- if (stealthConfig.enabled && stealthConfig.extraHTTPHeaders) {
111
- contextOptions.extraHTTPHeaders = stealthConfig.extraHTTPHeaders;
191
+ // Apply stealth headers and UA if configured
192
+ if (stealthConfig.enabled) {
193
+ if (stealthConfig.extraHTTPHeaders) {
194
+ contextOptions.extraHTTPHeaders = stealthConfig.extraHTTPHeaders;
195
+ }
196
+ if (stealthConfig.userAgent) {
197
+ contextOptions.userAgent = stealthConfig.userAgent;
198
+ }
112
199
  }
113
200
 
114
201
  this.context = await this.browser.newContext(contextOptions);
202
+ this.contextOwnedByUs = true;
115
203
 
116
204
  const initialPage = await this.context.newPage();
117
205
  const initialTabId = this.generateTabId();
118
- this.pages.set(initialTabId, initialPage);
206
+ this.registerNewPage(initialTabId, initialPage);
119
207
  this.currentTabId = initialTabId;
120
208
  }
121
209
  }
@@ -132,13 +220,86 @@ export class HybridBrowserSession {
132
220
  return `${browserConfig.tabIdPrefix}${String(++this.tabCounter).padStart(browserConfig.tabCounterPadding, '0')}`;
133
221
  }
134
222
 
223
+ private isBlankPageUrl(url: string): boolean {
224
+ // Unified blank page detection logic used across the codebase
225
+ const browserConfig = this.configLoader.getBrowserConfig();
226
+ return (
227
+ // Standard about:blank variations (prefix match for query params)
228
+ url === 'about:blank' ||
229
+ url.startsWith('about:blank?') ||
230
+ // Configured blank page URLs (exact match for compatibility)
231
+ browserConfig.blankPageUrls.includes(url) ||
232
+ // Empty URL
233
+ url === '' ||
234
+ // Data URLs (often used for blank pages)
235
+ url.startsWith(browserConfig.dataUrlPrefix || 'data:')
236
+ );
237
+ }
238
+
135
239
  async getCurrentPage(): Promise<Page> {
136
240
  if (!this.currentTabId || !this.pages.has(this.currentTabId)) {
137
- throw new Error('No active page available');
241
+ const browserConfig = this.configLoader.getBrowserConfig();
242
+
243
+ // In CDP keep-current-page mode, find existing page
244
+ if (browserConfig.cdpKeepCurrentPage && browserConfig.cdpUrl && this.context) {
245
+ const allPages = this.context.pages();
246
+ console.log(`[getCurrentPage] cdpKeepCurrentPage mode: Looking for existing page, found ${allPages.length} pages`);
247
+
248
+ if (allPages.length > 0) {
249
+ // Try to find a page that's not already tracked
250
+ for (const page of allPages) {
251
+ const isTracked = Array.from(this.pages.values()).includes(page);
252
+ if (!isTracked && !page.isClosed()) {
253
+ const tabId = this.generateTabId();
254
+ this.registerNewPage(tabId, page);
255
+ this.currentTabId = tabId;
256
+ console.log(`[getCurrentPage] cdpKeepCurrentPage mode: Found and registered untracked page: ${tabId}`);
257
+ return page;
258
+ }
259
+ }
260
+
261
+ // If all pages are tracked, use the first available one
262
+ const firstPage = allPages[0];
263
+ if (!firstPage.isClosed()) {
264
+ // Find the tab ID for this page
265
+ for (const [tabId, page] of this.pages.entries()) {
266
+ if (page === firstPage) {
267
+ this.currentTabId = tabId;
268
+ console.log(`[getCurrentPage] cdpKeepCurrentPage mode: Using existing tracked page: ${tabId}`);
269
+ return page;
270
+ }
271
+ }
272
+ }
273
+ }
274
+
275
+ throw new Error('No active page available in CDP mode with cdpKeepCurrentPage=true');
276
+ }
277
+
278
+ // Normal mode: create new page
279
+ if (this.context) {
280
+ console.log('[getCurrentPage] No active page, creating new page');
281
+ const newPage = await this.context.newPage();
282
+ const tabId = this.generateTabId();
283
+ this.registerNewPage(tabId, newPage);
284
+ this.currentTabId = tabId;
285
+
286
+ newPage.setDefaultNavigationTimeout(browserConfig.navigationTimeout);
287
+ newPage.setDefaultTimeout(browserConfig.navigationTimeout);
288
+
289
+ return newPage;
290
+ }
291
+ throw new Error('No browser context available');
138
292
  }
139
293
  return this.pages.get(this.currentTabId)!;
140
294
  }
141
295
 
296
+ async getCurrentLogs(): Promise<ConsoleMessage[]> {
297
+ if (!this.currentTabId || !this.consoleLogs.has(this.currentTabId)) {
298
+ return [];
299
+ }
300
+ return this.consoleLogs.get(this.currentTabId) || [];
301
+ }
302
+
142
303
  /**
143
304
  * Get current scroll position from the page
144
305
  */
@@ -168,6 +329,36 @@ export class HybridBrowserSession {
168
329
  return this.getSnapshotForAINative(includeCoordinates, viewportLimit);
169
330
  }
170
331
 
332
+ private parseElementFromSnapshot(snapshotText: string, ref: string): { role?: string; text?: string } {
333
+ const lines = snapshotText.split('\n');
334
+ for (const line of lines) {
335
+ if (line.includes(`[ref=${ref}]`)) {
336
+ const typeMatch = line.match(/^\s*-?\s*([\w-]+)/);
337
+ const role = typeMatch ? typeMatch[1] : undefined;
338
+ const textMatch = line.match(/"([^"]*)"/);
339
+ const text = textMatch ? textMatch[1] : undefined;
340
+ return { role, text };
341
+ }
342
+ }
343
+ return {};
344
+ }
345
+
346
+ private buildSnapshotIndex(snapshotText: string): Map<string, { role?: string; text?: string }> {
347
+ const index = new Map<string, { role?: string; text?: string }>();
348
+ const refRe = /\[ref=([^\]]+)\]/i;
349
+ for (const line of snapshotText.split('\n')) {
350
+ const m = line.match(refRe);
351
+ if (!m) continue;
352
+ const ref = m[1];
353
+ const roleMatch = line.match(/^\s*-?\s*([a-z0-9_-]+)/i);
354
+ const role = roleMatch ? roleMatch[1].toLowerCase() : undefined;
355
+ const textMatch = line.match(/"([^"]*)"/);
356
+ const text = textMatch ? textMatch[1] : undefined;
357
+ index.set(ref, { role, text });
358
+ }
359
+ return index;
360
+ }
361
+
171
362
  private async getSnapshotForAINative(includeCoordinates = false, viewportLimit = false): Promise<SnapshotResult & { timing: DetailedTiming }> {
172
363
  const startTime = Date.now();
173
364
  const page = await this.getCurrentPage();
@@ -190,6 +381,17 @@ export class HybridBrowserSession {
190
381
  const mappingStart = Date.now();
191
382
  const playwrightMapping: Record<string, any> = {};
192
383
 
384
+ // Parse element info in a single pass
385
+ const snapshotIndex = this.buildSnapshotIndex(snapshotText);
386
+ for (const ref of refs) {
387
+ const elementInfo = snapshotIndex.get(ref) || {};
388
+ playwrightMapping[ref] = {
389
+ ref,
390
+ role: elementInfo.role || 'unknown',
391
+ text: elementInfo.text || '',
392
+ };
393
+ }
394
+
193
395
  if (includeCoordinates) {
194
396
  // Get coordinates for each ref using aria-ref selector
195
397
  for (const ref of refs) {
@@ -203,8 +405,9 @@ export class HybridBrowserSession {
203
405
  const boundingBox = await element.boundingBox();
204
406
 
205
407
  if (boundingBox) {
408
+ // Add coordinates to existing element info
206
409
  playwrightMapping[ref] = {
207
- ref,
410
+ ...playwrightMapping[ref],
208
411
  coordinates: {
209
412
  x: Math.round(boundingBox.x),
210
413
  y: Math.round(boundingBox.y),
@@ -277,7 +480,7 @@ export class HybridBrowserSession {
277
480
  /**
278
481
  * Enhanced click implementation with new tab detection and scroll fix
279
482
  */
280
- private async performClick(page: Page, ref: string): Promise<{ success: boolean; method?: string; error?: string; newTabId?: string }> {
483
+ private async performClick(page: Page, ref: string): Promise<{ success: boolean; method?: string; error?: string; newTabId?: string; diffSnapshot?: string }> {
281
484
 
282
485
  try {
283
486
  // Ensure we have the latest snapshot and mapping
@@ -294,6 +497,17 @@ export class HybridBrowserSession {
294
497
  return { success: false, error: `Element with ref ${ref} not found` };
295
498
  }
296
499
 
500
+ const role = await element.getAttribute('role');
501
+ const elementTagName = await element.evaluate(el => el.tagName.toLowerCase());
502
+ const isCombobox = role === 'combobox' || elementTagName === 'combobox';
503
+ const isTextbox = role === 'textbox' || elementTagName === 'input' || elementTagName === 'textarea';
504
+ const shouldCheckDiff = isCombobox || isTextbox;
505
+
506
+ let snapshotBefore: string | null = null;
507
+ if (shouldCheckDiff) {
508
+ snapshotBefore = await (page as any)._snapshotForAI();
509
+ }
510
+
297
511
  // Check element properties
298
512
  const browserConfig = this.configLoader.getBrowserConfig();
299
513
  const target = await element.getAttribute(browserConfig.targetAttribute);
@@ -321,7 +535,6 @@ export class HybridBrowserSession {
321
535
 
322
536
  if (shouldOpenNewTab) {
323
537
  // Handle new tab opening
324
-
325
538
  // If it's a link that doesn't naturally open in new tab, force it
326
539
  if (isNavigableLink && !naturallyOpensNewTab) {
327
540
  await element.evaluate((el, blankTarget) => {
@@ -343,7 +556,7 @@ export class HybridBrowserSession {
343
556
 
344
557
  // Generate tab ID for the new page
345
558
  const newTabId = this.generateTabId();
346
- this.pages.set(newTabId, newPage);
559
+ this.registerNewPage(newTabId, newPage);
347
560
 
348
561
  // Set up page properties
349
562
  const browserConfig = this.configLoader.getBrowserConfig();
@@ -364,13 +577,17 @@ export class HybridBrowserSession {
364
577
  }
365
578
  } else {
366
579
  // Add options to prevent scrolling issues
367
- try {
368
- // First try normal click
369
- const browserConfig = this.configLoader.getBrowserConfig();
370
- await element.click({ timeout: browserConfig.clickTimeout });
371
- } catch (clickError) {
372
- // If normal click fails due to scrolling, try force click
373
- await element.click({ force: browserConfig.forceClick });
580
+ const browserConfig = this.configLoader.getBrowserConfig();
581
+ await element.click({ force: browserConfig.forceClick });
582
+
583
+ if (shouldCheckDiff && snapshotBefore) {
584
+ await page.waitForTimeout(300);
585
+ const snapshotAfter = await (page as any)._snapshotForAI();
586
+ const diffSnapshot = this.getSnapshotDiff(snapshotBefore, snapshotAfter, ['option', 'menuitem']);
587
+
588
+ if (diffSnapshot && diffSnapshot.trim() !== '') {
589
+ return { success: true, method: 'playwright-aria-ref', diffSnapshot };
590
+ }
374
591
  }
375
592
 
376
593
  return { success: true, method: 'playwright-aria-ref' };
@@ -382,27 +599,375 @@ export class HybridBrowserSession {
382
599
  }
383
600
  }
384
601
 
602
+ /**
603
+ * Extract diff between two snapshots, returning only new elements of specified types
604
+ */
605
+ private getSnapshotDiff(snapshotBefore: string, snapshotAfter: string, targetRoles: string[]): string {
606
+ const refsBefore = new Set<string>();
607
+ const refPattern = /\[ref=([^\]]+)\]/g;
608
+ let match;
609
+ while ((match = refPattern.exec(snapshotBefore)) !== null) {
610
+ refsBefore.add(match[1]);
611
+ }
612
+
613
+ const lines = snapshotAfter.split('\n');
614
+ const newElements: string[] = [];
615
+
616
+ for (const line of lines) {
617
+ const refMatch = line.match(/\[ref=([^\]]+)\]/);
618
+ if (refMatch && !refsBefore.has(refMatch[1])) {
619
+ const hasTargetRole = targetRoles.some(role => {
620
+ const rolePattern = new RegExp(`\\b${role}\\b`, 'i');
621
+ return rolePattern.test(line);
622
+ });
623
+
624
+ if (hasTargetRole) {
625
+ newElements.push(line.trim());
626
+ }
627
+ }
628
+ }
629
+
630
+ if (newElements.length > 0) {
631
+ return newElements.join('\n');
632
+ } else {
633
+ return '';
634
+ }
635
+ }
636
+
385
637
  /**
386
638
  * Simplified type implementation using Playwright's aria-ref selector
639
+ * Supports both single and multiple input operations
387
640
  */
388
- private async performType(page: Page, ref: string, text: string): Promise<{ success: boolean; error?: string }> {
641
+ private async performType(page: Page, ref: string | undefined, text: string | undefined, inputs?: Array<{ ref: string; text: string }>): Promise<{ success: boolean; error?: string; details?: Record<string, any>; diffSnapshot?: string }> {
389
642
  try {
390
643
  // Ensure we have the latest snapshot
391
644
  await (page as any)._snapshotForAI();
392
645
 
393
- // Use Playwright's aria-ref selector
394
- const selector = `aria-ref=${ref}`;
395
- const element = await page.locator(selector).first();
396
-
397
- const exists = await element.count() > 0;
398
- if (!exists) {
399
- return { success: false, error: `Element with ref ${ref} not found` };
646
+ // Handle multiple inputs if provided
647
+ if (inputs && inputs.length > 0) {
648
+ const results: Record<string, { success: boolean; error?: string }> = {};
649
+
650
+ for (const input of inputs) {
651
+ const singleResult = await this.performType(page, input.ref, input.text);
652
+ results[input.ref] = {
653
+ success: singleResult.success,
654
+ error: singleResult.error
655
+ };
656
+ }
657
+
658
+ // Check if all inputs were successful
659
+ const allSuccess = Object.values(results).every(r => r.success);
660
+ const errors = Object.entries(results)
661
+ .filter(([_, r]) => !r.success)
662
+ .map(([ref, r]) => `${ref}: ${r.error}`)
663
+ .join('; ');
664
+
665
+ return {
666
+ success: allSuccess,
667
+ error: allSuccess ? undefined : `Some inputs failed: ${errors}`,
668
+ details: results
669
+ };
400
670
  }
401
671
 
402
- // Type text using Playwright's built-in fill method
403
- await element.fill(text);
672
+ // Handle single input (backward compatibility)
673
+ if (ref && text !== undefined) {
674
+ const selector = `aria-ref=${ref}`;
675
+ const element = await page.locator(selector).first();
676
+
677
+ const exists = await element.count() > 0;
678
+ if (!exists) {
679
+ return { success: false, error: `Element with ref ${ref} not found` };
680
+ }
681
+
682
+ // Get element attributes to check if it's readonly or a special input type
683
+ let originalPlaceholder: string | null = null;
684
+ let isReadonly = false;
685
+ let elementType: string | null = null;
686
+ let isCombobox = false;
687
+ let isTextbox = false;
688
+ let shouldCheckDiff = false;
689
+
690
+ try {
691
+ // Get element info in one evaluation to minimize interactions
692
+ const elementInfo = await element.evaluate((el: any) => {
693
+ return {
694
+ placeholder: el.placeholder || null,
695
+ readonly: el.readOnly || el.hasAttribute('readonly'),
696
+ type: el.type || null,
697
+ tagName: el.tagName.toLowerCase(),
698
+ disabled: el.disabled || false,
699
+ role: el.getAttribute('role'),
700
+ ariaHaspopup: el.getAttribute('aria-haspopup')
701
+ };
702
+ });
703
+
704
+ originalPlaceholder = elementInfo.placeholder;
705
+ isReadonly = elementInfo.readonly;
706
+ elementType = elementInfo.type;
707
+ isCombobox = elementInfo.role === 'combobox' ||
708
+ elementInfo.tagName === 'combobox' ||
709
+ elementInfo.ariaHaspopup === 'listbox';
710
+ isTextbox = elementInfo.role === 'textbox' ||
711
+ elementInfo.tagName === 'input' ||
712
+ elementInfo.tagName === 'textarea';
713
+ shouldCheckDiff = isCombobox || isTextbox;
714
+
715
+ } catch (e) {
716
+ console.log(`Warning: Failed to get element attributes: ${e}`);
717
+ }
718
+
719
+ // Get snapshot before action to record existing elements
720
+ const snapshotBefore = await (page as any)._snapshotForAI();
721
+ const existingRefs = new Set<string>();
722
+ const refPattern = /\[ref=([^\]]+)\]/g;
723
+ let match;
724
+ while ((match = refPattern.exec(snapshotBefore)) !== null) {
725
+ existingRefs.add(match[1]);
726
+ }
727
+ console.log(`Found ${existingRefs.size} total elements before action`);
728
+
729
+ // If element is readonly or a date/time input, skip fill attempt and go directly to click
730
+ if (isReadonly || ['date', 'datetime-local', 'time'].includes(elementType || '')) {
731
+ console.log(`Element ref=${ref} is readonly or date/time input, skipping direct fill attempt`);
732
+
733
+ // Click with force option to avoid scrolling
734
+ try {
735
+ await element.click({ force: true });
736
+ console.log(`Clicked readonly/special element ref=${ref} to trigger dynamic content`);
737
+ // Wait for potential dynamic content to appear
738
+ await page.waitForTimeout(500);
739
+ } catch (clickError) {
740
+ console.log(`Warning: Failed to click element: ${clickError}`);
741
+ }
742
+ } else {
743
+ // For normal inputs, click first then try to fill
744
+ try {
745
+ await element.click({ force: true });
746
+ console.log(`Clicked element ref=${ref} before typing`);
747
+ } catch (clickError) {
748
+ console.log(`Warning: Failed to click element before typing: ${clickError}`);
749
+ }
750
+
751
+ // Try to fill the element directly
752
+ try {
753
+ // Use force option to avoid scrolling during fill
754
+ await element.fill(text, { timeout: 3000, force: true });
755
+
756
+ // If this element might show dropdown, wait and check for new elements
757
+ if (shouldCheckDiff) {
758
+ await page.waitForTimeout(300);
759
+ const snapshotAfter = await (page as any)._snapshotForAI();
760
+ const diffSnapshot = this.getSnapshotDiff(snapshotBefore, snapshotAfter, ['option', 'menuitem']);
761
+
762
+ if (diffSnapshot && diffSnapshot.trim() !== '') {
763
+ return { success: true, diffSnapshot };
764
+ }
765
+ }
766
+
767
+ return { success: true };
768
+ } catch (fillError: any) {
769
+ // Log the error for debugging
770
+ console.log(`Fill error for ref ${ref}: ${fillError.message}`);
771
+
772
+ // Check for various error messages that indicate the element is not fillable
773
+ const errorMessage = fillError.message.toLowerCase();
774
+ if (errorMessage.includes('not an <input>') ||
775
+ errorMessage.includes('not have a role allowing') ||
776
+ errorMessage.includes('element is not') ||
777
+ errorMessage.includes('cannot type') ||
778
+ errorMessage.includes('readonly') ||
779
+ errorMessage.includes('not editable') ||
780
+ errorMessage.includes('timeout') ||
781
+ errorMessage.includes('timeouterror')) {
782
+
783
+ // Click the element again to trigger dynamic content (like date pickers)
784
+ try {
785
+ await element.click({ force: true });
786
+ console.log(`Clicked element ref=${ref} again to trigger dynamic content`);
787
+ // Wait for potential dynamic content to appear
788
+ await page.waitForTimeout(500);
789
+ } catch (clickError) {
790
+ console.log(`Warning: Failed to click element to trigger dynamic content: ${clickError}`);
791
+ }
792
+
793
+ // Step 1: Try to find input elements within the clicked element
794
+ const inputSelector = `input:visible, textarea:visible, [contenteditable="true"]:visible, [role="textbox"]:visible`;
795
+ const inputElement = await element.locator(inputSelector).first();
796
+
797
+ const inputExists = await inputElement.count() > 0;
798
+ if (inputExists) {
799
+ console.log(`Found input element within ref ${ref}, attempting to fill`);
800
+ try {
801
+ await inputElement.fill(text, { force: true });
802
+
803
+ // If element might show dropdown, check for new elements
804
+ if (shouldCheckDiff) {
805
+ await page.waitForTimeout(300);
806
+ const snapshotFinal = await (page as any)._snapshotForAI();
807
+ const diffSnapshot = this.getSnapshotDiff(snapshotBefore, snapshotFinal, ['option', 'menuitem']);
808
+
809
+ if (diffSnapshot && diffSnapshot.trim() !== '') {
810
+ return { success: true, diffSnapshot };
811
+ }
812
+ }
813
+
814
+ return { success: true };
815
+ } catch (innerError) {
816
+ console.log(`Failed to fill child element: ${innerError}`);
817
+ }
818
+ }
819
+
820
+ // Step 2: Look for new elements that appeared after the action
821
+ console.log(`Looking for new elements that appeared after action...`);
822
+
823
+ // Get snapshot after action to find new elements
824
+ const snapshotAfter = await (page as any)._snapshotForAI();
825
+ const newRefs = new Set<string>();
826
+ const afterRefPattern = /\[ref=([^\]]+)\]/g;
827
+ let afterMatch;
828
+ while ((afterMatch = afterRefPattern.exec(snapshotAfter)) !== null) {
829
+ const refId = afterMatch[1];
830
+ if (!existingRefs.has(refId)) {
831
+ newRefs.add(refId);
832
+ }
833
+ }
834
+
835
+ console.log(`Found ${newRefs.size} new elements after action`);
836
+
837
+ // If we have a placeholder, try to find new input elements with that placeholder
838
+ if (originalPlaceholder && newRefs.size > 0) {
839
+ console.log(`Looking for new input elements with placeholder: ${originalPlaceholder}`);
840
+
841
+ // Try each new ref to see if it's an input with our placeholder
842
+ for (const newRef of newRefs) {
843
+ try {
844
+ const newElement = await page.locator(`aria-ref=${newRef}`).first();
845
+ const tagName = await newElement.evaluate(el => el.tagName.toLowerCase()).catch(() => null);
846
+
847
+ if (tagName === 'input' || tagName === 'textarea') {
848
+ const placeholder = await newElement.getAttribute('placeholder').catch(() => null);
849
+ if (placeholder === originalPlaceholder) {
850
+ console.log(`Found new input element with matching placeholder: ref=${newRef}`);
851
+
852
+ // Check if it's visible and fillable
853
+ const elementInfo = await newElement.evaluate((el: any) => {
854
+ return {
855
+ tagName: el.tagName,
856
+ id: el.id,
857
+ className: el.className,
858
+ placeholder: el.placeholder,
859
+ isVisible: el.offsetParent !== null,
860
+ isReadonly: el.readOnly || el.getAttribute('readonly') !== null
861
+ };
862
+ });
863
+ console.log(`New element details:`, JSON.stringify(elementInfo));
864
+
865
+ // Try to fill it with force to avoid scrolling
866
+ await newElement.fill(text, { force: true });
867
+
868
+ // If element might show dropdown, check for new elements
869
+ if (shouldCheckDiff) {
870
+ await page.waitForTimeout(300);
871
+ const snapshotFinal = await (page as any)._snapshotForAI();
872
+ const diffSnapshot = this.getSnapshotDiff(snapshotBefore, snapshotFinal, ['option', 'menuitem']);
873
+
874
+ if (diffSnapshot && diffSnapshot.trim() !== '') {
875
+ return { success: true, diffSnapshot };
876
+ }
877
+ }
878
+
879
+ return { success: true };
880
+ }
881
+ }
882
+ } catch (e) {
883
+ // Ignore errors for non-input elements
884
+ }
885
+ }
886
+ }
887
+
888
+ console.log(`No suitable input element found for ref ${ref}`);
889
+ }
890
+ // Re-throw the original error if we couldn't find an input element
891
+ throw fillError;
892
+ }
893
+ }
894
+
895
+ // If we skipped the fill attempt (readonly elements), look for new elements directly
896
+ if (isReadonly || ['date', 'datetime-local', 'time'].includes(elementType || '')) {
897
+ // Look for new elements that appeared after clicking
898
+ console.log(`Looking for new elements that appeared after clicking readonly element...`);
899
+
900
+ // Get snapshot after action to find new elements
901
+ const snapshotAfter = await (page as any)._snapshotForAI();
902
+ const newRefs = new Set<string>();
903
+ const afterRefPattern = /\[ref=([^\]]+)\]/g;
904
+ let afterMatch;
905
+ while ((afterMatch = afterRefPattern.exec(snapshotAfter)) !== null) {
906
+ const refId = afterMatch[1];
907
+ if (!existingRefs.has(refId)) {
908
+ newRefs.add(refId);
909
+ }
910
+ }
911
+
912
+ console.log(`Found ${newRefs.size} new elements after clicking readonly element`);
913
+
914
+ // If we have a placeholder, try to find new input elements with that placeholder
915
+ if (originalPlaceholder && newRefs.size > 0) {
916
+ console.log(`Looking for new input elements with placeholder: ${originalPlaceholder}`);
917
+
918
+ // Try each new ref to see if it's an input with our placeholder
919
+ for (const newRef of newRefs) {
920
+ try {
921
+ const newElement = await page.locator(`aria-ref=${newRef}`).first();
922
+ const tagName = await newElement.evaluate(el => el.tagName.toLowerCase()).catch(() => null);
923
+
924
+ if (tagName === 'input' || tagName === 'textarea') {
925
+ const placeholder = await newElement.getAttribute('placeholder').catch(() => null);
926
+ if (placeholder === originalPlaceholder) {
927
+ console.log(`Found new input element with matching placeholder: ref=${newRef}`);
928
+
929
+ // Check if it's visible and fillable
930
+ const elementInfo = await newElement.evaluate((el: any) => {
931
+ return {
932
+ tagName: el.tagName,
933
+ id: el.id,
934
+ className: el.className,
935
+ placeholder: el.placeholder,
936
+ isVisible: el.offsetParent !== null,
937
+ isReadonly: el.readOnly || el.getAttribute('readonly') !== null
938
+ };
939
+ });
940
+ console.log(`New element details:`, JSON.stringify(elementInfo));
941
+
942
+ // Try to fill it with force to avoid scrolling
943
+ await newElement.fill(text, { force: true });
944
+
945
+ // If element might show dropdown, check for new elements
946
+ if (shouldCheckDiff) {
947
+ await page.waitForTimeout(300);
948
+ const snapshotFinal = await (page as any)._snapshotForAI();
949
+ const diffSnapshot = this.getSnapshotDiff(snapshotBefore, snapshotFinal, ['option', 'menuitem']);
950
+
951
+ if (diffSnapshot && diffSnapshot.trim() !== '') {
952
+ return { success: true, diffSnapshot };
953
+ }
954
+ }
955
+
956
+ return { success: true };
957
+ }
958
+ }
959
+ } catch (e) {
960
+ // Ignore errors for non-input elements
961
+ }
962
+ }
963
+ }
964
+
965
+ console.log(`No suitable input element found for readonly ref ${ref}`);
966
+ return { success: false, error: `Element ref=${ref} is readonly and no suitable input was found` };
967
+ }
968
+ }
404
969
 
405
- return { success: true };
970
+ return { success: false, error: 'No valid input provided' };
406
971
  } catch (error) {
407
972
  return { success: false, error: `Type failed: ${error}` };
408
973
  }
@@ -434,7 +999,97 @@ export class HybridBrowserSession {
434
999
  }
435
1000
  }
436
1001
 
1002
+ /**
1003
+ * Simplified mouse control implementation
1004
+ */
1005
+ private async performMouseControl(page: Page, control: string, x: number, y: number): Promise<{ success: boolean; error?: string }> {
1006
+ try {
1007
+ const viewport = page.viewportSize();
1008
+ if (!viewport) {
1009
+ return { success: false, error: 'Viewport size not available from page.' };
1010
+ }
1011
+ if (x < 0 || y < 0 || x > viewport.width || y > viewport.height) {
1012
+ return { success: false, error: `Invalid coordinates, outside viewport bounds: (${x}, ${y})` };
1013
+ }
1014
+ switch (control) {
1015
+ case 'click': {
1016
+ await page.mouse.click(x, y);
1017
+ break;
1018
+ }
1019
+ case 'right_click': {
1020
+ await page.mouse.click(x, y, { button: 'right' });
1021
+ break;
1022
+ }
1023
+ case 'dblclick': {
1024
+ await page.mouse.dblclick(x, y);
1025
+ break;
1026
+ }
1027
+ default:
1028
+ return { success: false, error: `Invalid control action: ${control}` };
1029
+ }
1030
+
1031
+ return { success: true };
1032
+ } catch (error) {
1033
+ return { success: false, error: `Mouse action failed: ${error}` };
1034
+ }
1035
+ }
1036
+
1037
+ /**
1038
+ * Enhanced mouse drag and drop implementation using ref IDs
1039
+ */
1040
+ private async performMouseDrag(page: Page, fromRef: string, toRef: string): Promise<{ success: boolean; error?: string }> {
1041
+ try {
1042
+ // Ensure we have the latest snapshot
1043
+ await (page as any)._snapshotForAI();
1044
+
1045
+ // Get elements using Playwright's aria-ref selector
1046
+ const fromSelector = `aria-ref=${fromRef}`;
1047
+ const toSelector = `aria-ref=${toRef}`;
1048
+
1049
+ const fromElement = await page.locator(fromSelector).first();
1050
+ const toElement = await page.locator(toSelector).first();
1051
+
1052
+ // Check if elements exist
1053
+ const fromExists = await fromElement.count() > 0;
1054
+ const toExists = await toElement.count() > 0;
1055
+
1056
+ if (!fromExists) {
1057
+ return { success: false, error: `Source element with ref ${fromRef} not found` };
1058
+ }
1059
+
1060
+ if (!toExists) {
1061
+ return { success: false, error: `Target element with ref ${toRef} not found` };
1062
+ }
1063
+
1064
+ // Get the center coordinates of both elements
1065
+ const fromBox = await fromElement.boundingBox();
1066
+ const toBox = await toElement.boundingBox();
1067
+
1068
+ if (!fromBox) {
1069
+ return { success: false, error: `Could not get bounding box for source element with ref ${fromRef}` };
1070
+ }
1071
+
1072
+ if (!toBox) {
1073
+ return { success: false, error: `Could not get bounding box for target element with ref ${toRef}` };
1074
+ }
1075
+
1076
+ const fromX = fromBox.x + fromBox.width / 2;
1077
+ const fromY = fromBox.y + fromBox.height / 2;
1078
+ const toX = toBox.x + toBox.width / 2;
1079
+ const toY = toBox.y + toBox.height / 2;
1080
+
1081
+ // Perform the drag operation
1082
+ await page.mouse.move(fromX, fromY);
1083
+ await page.mouse.down();
1084
+ // Destination coordinates
1085
+ await page.mouse.move(toX, toY);
1086
+ await page.mouse.up();
437
1087
 
1088
+ return { success: true };
1089
+ } catch (error) {
1090
+ return { success: false, error: `Mouse drag action failed: ${error}` };
1091
+ }
1092
+ }
438
1093
 
439
1094
  async executeAction(action: BrowserAction): Promise<ActionResult> {
440
1095
  const startTime = Date.now();
@@ -450,6 +1105,8 @@ export class HybridBrowserSession {
450
1105
  // No need to pre-fetch snapshot - each action method handles this
451
1106
 
452
1107
  let newTabId: string | undefined;
1108
+ let customMessage: string | undefined;
1109
+ let actionDetails: Record<string, any> | undefined;
453
1110
 
454
1111
  switch (action.type) {
455
1112
  case 'click': {
@@ -466,6 +1123,11 @@ export class HybridBrowserSession {
466
1123
  // Capture new tab ID if present
467
1124
  newTabId = clickResult.newTabId;
468
1125
 
1126
+ // Capture diff snapshot if present
1127
+ if (clickResult.diffSnapshot) {
1128
+ actionDetails = { diffSnapshot: clickResult.diffSnapshot };
1129
+ }
1130
+
469
1131
  actionExecutionTime = Date.now() - clickStart;
470
1132
  break;
471
1133
  }
@@ -474,12 +1136,28 @@ export class HybridBrowserSession {
474
1136
  elementSearchTime = Date.now() - elementSearchStart;
475
1137
  const typeStart = Date.now();
476
1138
 
477
- const typeResult = await this.performType(page, action.ref, action.text);
1139
+ const typeResult = await this.performType(page, action.ref, action.text, action.inputs);
478
1140
 
479
1141
  if (!typeResult.success) {
480
1142
  throw new Error(`Type failed: ${typeResult.error}`);
481
1143
  }
482
1144
 
1145
+ // Set custom message and details if multiple inputs were used
1146
+ if (typeResult.details) {
1147
+ const successCount = Object.values(typeResult.details).filter((r: any) => r.success).length;
1148
+ const totalCount = Object.keys(typeResult.details).length;
1149
+ customMessage = `Typed text into ${successCount}/${totalCount} elements`;
1150
+ actionDetails = typeResult.details;
1151
+ }
1152
+
1153
+ // Capture diff snapshot if present
1154
+ if (typeResult.diffSnapshot) {
1155
+ if (!actionDetails) {
1156
+ actionDetails = {};
1157
+ }
1158
+ actionDetails.diffSnapshot = typeResult.diffSnapshot;
1159
+ }
1160
+
483
1161
  actionExecutionTime = Date.now() - typeStart;
484
1162
  break;
485
1163
  }
@@ -519,6 +1197,40 @@ export class HybridBrowserSession {
519
1197
  actionExecutionTime = Date.now() - enterStart;
520
1198
  break;
521
1199
  }
1200
+
1201
+ case 'mouse_control': {
1202
+ elementSearchTime = Date.now() - elementSearchStart;
1203
+ const mouseControlStart = Date.now();
1204
+ const mouseControlResult = await this.performMouseControl(page, action.control, action.x, action.y);
1205
+
1206
+ if (!mouseControlResult.success) {
1207
+ throw new Error(`Action failed: ${mouseControlResult.error}`);
1208
+ }
1209
+ actionExecutionTime = Date.now() - mouseControlStart;
1210
+ break;
1211
+ }
1212
+
1213
+ case 'mouse_drag': {
1214
+ elementSearchTime = Date.now() - elementSearchStart;
1215
+ const mouseDragStart = Date.now();
1216
+ const mouseDragResult = await this.performMouseDrag(page, action.from_ref, action.to_ref);
1217
+
1218
+ if (!mouseDragResult.success) {
1219
+ throw new Error(`Action failed: ${mouseDragResult.error}`);
1220
+ }
1221
+ actionExecutionTime = Date.now() - mouseDragStart;
1222
+ break;
1223
+ }
1224
+
1225
+ case 'press_key': {
1226
+ elementSearchTime = Date.now() - elementSearchStart;
1227
+ const keyPressStart = Date.now();
1228
+ // concatenate keys with '+' for key combinations
1229
+ const keys = action.keys.join('+');
1230
+ await page.keyboard.press(keys);
1231
+ actionExecutionTime = Date.now() - keyPressStart;
1232
+ break;
1233
+ }
522
1234
 
523
1235
  default:
524
1236
  throw new Error(`Unknown action type: ${(action as any).type}`);
@@ -533,7 +1245,7 @@ export class HybridBrowserSession {
533
1245
 
534
1246
  return {
535
1247
  success: true,
536
- message: `Action ${action.type} executed successfully`,
1248
+ message: customMessage || `Action ${action.type} executed successfully`,
537
1249
  timing: {
538
1250
  total_time_ms: totalTime,
539
1251
  element_search_time_ms: elementSearchTime,
@@ -543,6 +1255,7 @@ export class HybridBrowserSession {
543
1255
  network_idle_time_ms: stabilityResult.networkIdleTime,
544
1256
  },
545
1257
  ...(newTabId && { newTabId }), // Include new tab ID if present
1258
+ ...(actionDetails && { details: actionDetails }), // Include action details if present
546
1259
  };
547
1260
  } catch (error) {
548
1261
  const totalTime = Date.now() - startTime;
@@ -559,6 +1272,55 @@ export class HybridBrowserSession {
559
1272
  }
560
1273
  }
561
1274
 
1275
+ /**
1276
+ * Wait for DOM to stop changing for a specified duration
1277
+ */
1278
+ private async waitForDOMStability(page: Page, maxWaitTime: number = 500): Promise<void> {
1279
+ const startTime = Date.now();
1280
+ const stabilityThreshold = 100; // Consider stable if no changes for 100ms
1281
+ let lastChangeTime = Date.now();
1282
+
1283
+ try {
1284
+ // Monitor DOM changes
1285
+ await page.evaluate(() => {
1286
+ let changeCount = 0;
1287
+ (window as any).__domStabilityCheck = { changeCount: 0, lastChange: Date.now() };
1288
+
1289
+ const observer = new MutationObserver(() => {
1290
+ (window as any).__domStabilityCheck.changeCount++;
1291
+ (window as any).__domStabilityCheck.lastChange = Date.now();
1292
+ });
1293
+
1294
+ observer.observe(document.body, {
1295
+ childList: true,
1296
+ subtree: true,
1297
+ attributes: true,
1298
+ characterData: true
1299
+ });
1300
+
1301
+ (window as any).__domStabilityObserver = observer;
1302
+ });
1303
+
1304
+ // Wait until no changes for stabilityThreshold or timeout
1305
+ await page.waitForFunction(
1306
+ (threshold) => {
1307
+ const check = (window as any).__domStabilityCheck;
1308
+ return check && (Date.now() - check.lastChange) > threshold;
1309
+ },
1310
+ stabilityThreshold,
1311
+ { timeout: Math.max(0, maxWaitTime) }
1312
+ ).catch(() => {});
1313
+ } finally {
1314
+ // Cleanup
1315
+ await page.evaluate(() => {
1316
+ const observer = (window as any).__domStabilityObserver;
1317
+ if (observer) observer.disconnect();
1318
+ delete (window as any).__domStabilityObserver;
1319
+ delete (window as any).__domStabilityCheck;
1320
+ }).catch(() => {});
1321
+ }
1322
+ }
1323
+
562
1324
  private async waitForPageStability(page: Page): Promise<{ domContentLoadedTime: number; networkIdleTime: number }> {
563
1325
  let domContentLoadedTime = 0;
564
1326
  let networkIdleTime = 0;
@@ -584,16 +1346,23 @@ export class HybridBrowserSession {
584
1346
 
585
1347
  try {
586
1348
  // Get current page to check if it's blank
587
- const currentPage = await this.getCurrentPage();
588
- const currentUrl = currentPage.url();
1349
+ let currentPage: Page;
1350
+ let currentUrl: string;
1351
+
1352
+ try {
1353
+ currentPage = await this.getCurrentPage();
1354
+ currentUrl = currentPage.url();
1355
+ } catch (error: any) {
1356
+ // If no active page is available, getCurrentPage() will create one in CDP mode
1357
+ console.log('[visitPage] Failed to get current page:', error);
1358
+ throw new Error(`No active page available: ${error?.message || error}`);
1359
+ }
589
1360
 
590
1361
  // Check if current page is blank or if this is the first navigation
591
1362
  const browserConfig = this.configLoader.getBrowserConfig();
592
- const isBlankPage = (
593
- browserConfig.blankPageUrls.includes(currentUrl) ||
594
- currentUrl === browserConfig.defaultStartUrl ||
595
- currentUrl.startsWith(browserConfig.dataUrlPrefix) // data URLs are often used for blank pages
596
- );
1363
+
1364
+ // Use unified blank page detection
1365
+ const isBlankPage = this.isBlankPageUrl(currentUrl) || currentUrl === browserConfig.defaultStartUrl;
597
1366
 
598
1367
  const shouldUseCurrentTab = isBlankPage || !this.hasNavigatedBefore;
599
1368
 
@@ -641,29 +1410,32 @@ export class HybridBrowserSession {
641
1410
  let newTabId: string | null = null;
642
1411
 
643
1412
  const browserConfig = this.configLoader.getBrowserConfig();
644
- if (browserConfig.connectOverCdp) {
1413
+ if (browserConfig.cdpUrl) {
645
1414
  // CDP mode: find an available blank tab
646
1415
  const allPages = this.context.pages();
647
1416
  for (const page of allPages) {
648
1417
  const pageUrl = page.url();
649
1418
  // Check if this page is not already tracked and is blank
650
1419
  const isTracked = Array.from(this.pages.values()).includes(page);
651
- if (!isTracked && pageUrl === 'about:blank') {
1420
+ if (!isTracked && this.isBlankPageUrl(pageUrl)) {
652
1421
  newPage = page;
653
1422
  newTabId = this.generateTabId();
654
- this.pages.set(newTabId, newPage);
1423
+ this.registerNewPage(newTabId, newPage);
655
1424
  break;
656
1425
  }
657
1426
  }
658
1427
 
659
1428
  if (!newPage || !newTabId) {
660
- throw new Error('No available blank tabs in CDP mode. Frontend should create more blank tabs when half are used.');
1429
+ console.log('[CDP] No available blank tabs, creating new page');
1430
+ newPage = await this.context.newPage();
1431
+ newTabId = this.generateTabId();
1432
+ this.registerNewPage(newTabId, newPage);
661
1433
  }
662
1434
  } else {
663
1435
  // Non-CDP mode: create new page as usual
664
1436
  newPage = await this.context.newPage();
665
1437
  newTabId = this.generateTabId();
666
- this.pages.set(newTabId, newPage);
1438
+ this.registerNewPage(newTabId, newPage);
667
1439
  }
668
1440
 
669
1441
  // Set up page properties
@@ -800,15 +1572,110 @@ export class HybridBrowserSession {
800
1572
  return true;
801
1573
  }
802
1574
 
1575
+ async batchKeyboardInput(operations: Array<{type: string, keys?: string[], text?: string, delay?: number}>, skipStabilityWait: boolean = false): Promise<any> {
1576
+ const startTime = Date.now();
1577
+ const page = await this.getCurrentPage();
1578
+
1579
+ try {
1580
+ const maxOperations = 100; // Prevent excessive number of operations per batch
1581
+ if (!Array.isArray(operations) || operations.length > maxOperations) {
1582
+ throw new Error(`Too many operations in batch (max ${maxOperations} allowed)`);
1583
+ }
1584
+
1585
+ const executionStart = Date.now();
1586
+
1587
+ for (const op of operations) {
1588
+ switch (op.type) {
1589
+ case 'press':
1590
+ if (op.keys) {
1591
+ const keys = op.keys.join('+');
1592
+ await page.keyboard.press(keys);
1593
+ }
1594
+ break;
1595
+ case 'type':
1596
+ if (op.text) {
1597
+ // Limit delay to prevent resource exhaustion attacks
1598
+ const maxTypeDelay = 1000; // 1 second per character max
1599
+ let delayValue = Number(op.delay);
1600
+ if (!isFinite(delayValue) || delayValue < 0) delayValue = 0;
1601
+ const safeTypeDelay = Math.min(delayValue, maxTypeDelay);
1602
+ await page.keyboard.type(op.text, { delay: safeTypeDelay });
1603
+ }
1604
+ break;
1605
+ case 'wait':
1606
+ // Only apply wait if op.delay is a non-negative finite number
1607
+ // Limit to prevent resource exhaustion (CodeQL js/resource-exhaustion)
1608
+ {
1609
+ const MAX_WAIT_DELAY = 10000; // 10 seconds maximum
1610
+ let delayValue = Number(op.delay);
1611
+ if (!isFinite(delayValue) || delayValue < 0) {
1612
+ delayValue = 0;
1613
+ }
1614
+ // Clamp delay to safe range [0, MAX_WAIT_DELAY]
1615
+ const safeDelay = delayValue > MAX_WAIT_DELAY ? MAX_WAIT_DELAY : delayValue;
1616
+ // lgtm[js/resource-exhaustion]
1617
+ // Safe: delay is clamped to MAX_WAIT_DELAY (10 seconds)
1618
+ await new Promise(resolve => setTimeout(resolve, safeDelay));
1619
+ }
1620
+ break;
1621
+ }
1622
+ }
1623
+
1624
+ const executionTime = Date.now() - executionStart;
1625
+ let stabilityTime = 0;
1626
+ let stabilityResult = { domContentLoadedTime: 0, networkIdleTime: 0 };
1627
+
1628
+ if (!skipStabilityWait) {
1629
+ const stabilityStart = Date.now();
1630
+
1631
+ try {
1632
+ const browserConfig = this.configLoader.getBrowserConfig();
1633
+ await page.waitForLoadState(browserConfig.domContentLoadedState as any, { timeout: browserConfig.pageStabilityTimeout });
1634
+ stabilityResult.domContentLoadedTime = Date.now() - stabilityStart;
1635
+ } catch (error) {
1636
+ }
1637
+
1638
+ await new Promise(resolve => setTimeout(resolve, 50));
1639
+ stabilityTime = Date.now() - stabilityStart;
1640
+ } else {
1641
+ await new Promise(resolve => setTimeout(resolve, 50));
1642
+ stabilityTime = 50;
1643
+ }
1644
+
1645
+ const totalTime = Date.now() - startTime;
1646
+
1647
+ return {
1648
+ success: true,
1649
+ message: `Batch keyboard input completed (${operations.length} operations)`,
1650
+ timing: {
1651
+ total_time_ms: totalTime,
1652
+ execution_time_ms: executionTime,
1653
+ stability_wait_time_ms: stabilityTime,
1654
+ operations_count: operations.length,
1655
+ skipped_stability: skipStabilityWait,
1656
+ },
1657
+ };
1658
+ } catch (error) {
1659
+ const totalTime = Date.now() - startTime;
1660
+ return {
1661
+ success: false,
1662
+ message: `Batch keyboard input failed: ${error}`,
1663
+ timing: {
1664
+ total_time_ms: totalTime,
1665
+ },
1666
+ };
1667
+ }
1668
+ }
1669
+
803
1670
  async getTabInfo(): Promise<TabInfo[]> {
804
1671
  const tabInfo: TabInfo[] = [];
805
-
1672
+
806
1673
  for (const [tabId, page] of this.pages) {
807
1674
  if (!page.isClosed()) {
808
1675
  try {
809
1676
  const title = await page.title();
810
1677
  const url = page.url();
811
-
1678
+
812
1679
  tabInfo.push({
813
1680
  tab_id: tabId,
814
1681
  title,
@@ -820,7 +1687,7 @@ export class HybridBrowserSession {
820
1687
  }
821
1688
  }
822
1689
  }
823
-
1690
+
824
1691
  return tabInfo;
825
1692
  }
826
1693
 
@@ -856,17 +1723,25 @@ export class HybridBrowserSession {
856
1723
  this.pages.clear();
857
1724
  this.currentTabId = null;
858
1725
 
859
- if (this.context) {
1726
+ // Context cleanup for CDP mode is handled separately, in the browser teardown below
1727
+ if (!browserConfig.cdpUrl && this.context && this.contextOwnedByUs) {
1728
+ // For non-CDP mode, close context here
860
1729
  await this.context.close();
861
1730
  this.context = null;
1731
+ this.contextOwnedByUs = false;
862
1732
  }
863
1733
 
864
1734
  if (this.browser) {
865
- if (browserConfig.connectOverCdp) {
866
- // For CDP connections, just disconnect without closing the browser
867
- await this.browser.close();
1735
+ if (browserConfig.cdpUrl) {
1736
+ // In CDP mode: tear down only our context, then disconnect
1737
+ if (this.context && this.contextOwnedByUs) {
1738
+ await this.context.close().catch(() => {});
1739
+ this.context = null;
1740
+ this.contextOwnedByUs = false;
1741
+ }
1742
+ await this.browser.close(); // disconnect
868
1743
  } else {
869
- // For launched browsers, close completely
1744
+ // Local launch: close everything
870
1745
  await this.browser.close();
871
1746
  }
872
1747
  this.browser = null;
@@ -881,12 +1756,12 @@ export class HybridBrowserSession {
881
1756
  const filtered: Record<string, SnapshotElement> = {};
882
1757
 
883
1758
 
884
- // Apply viewport filtering with scroll position adjustment
885
- const browserConfig = this.configLoader.getBrowserConfig();
886
- const adjustedScrollPos = {
887
- x: scrollPos.x * browserConfig.scrollPositionScale,
888
- y: scrollPos.y * browserConfig.scrollPositionScale
889
- };
1759
+ // Apply viewport filtering
1760
+ // boundingBox() returns viewport-relative coordinates, so we don't need to add scroll offsets
1761
+ const viewportLeft = 0;
1762
+ const viewportTop = 0;
1763
+ const viewportRight = viewport.width;
1764
+ const viewportBottom = viewport.height;
890
1765
 
891
1766
  for (const [ref, element] of Object.entries(elements)) {
892
1767
  // If element has no coordinates, include it (fallback)
@@ -897,14 +1772,9 @@ export class HybridBrowserSession {
897
1772
 
898
1773
  const { x, y, width, height } = element.coordinates;
899
1774
 
900
- // Calculate viewport bounds using adjusted scroll position
901
- const viewportLeft = adjustedScrollPos.x;
902
- const viewportTop = adjustedScrollPos.y;
903
- const viewportRight = adjustedScrollPos.x + viewport.width;
904
- const viewportBottom = adjustedScrollPos.y + viewport.height;
905
-
906
1775
  // Check if element is visible in current viewport
907
1776
  // Element is visible if it overlaps with viewport bounds
1777
+ // Since boundingBox() coords are viewport-relative, we compare directly
908
1778
  const isVisible = (
909
1779
  x < viewportRight && // Left edge is before viewport right
910
1780
  y < viewportBottom && // Top edge is before viewport bottom