camel-ai 0.2.75a6__py3-none-any.whl → 0.2.76__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of camel-ai might be problematic. Click here for more details.

Files changed (97):
  1. camel/__init__.py +1 -1
  2. camel/agents/chat_agent.py +1001 -205
  3. camel/agents/mcp_agent.py +30 -27
  4. camel/configs/__init__.py +6 -0
  5. camel/configs/amd_config.py +70 -0
  6. camel/configs/cometapi_config.py +104 -0
  7. camel/data_collectors/alpaca_collector.py +15 -6
  8. camel/environments/tic_tac_toe.py +1 -1
  9. camel/interpreters/__init__.py +2 -0
  10. camel/interpreters/docker/Dockerfile +3 -12
  11. camel/interpreters/microsandbox_interpreter.py +395 -0
  12. camel/loaders/__init__.py +11 -2
  13. camel/loaders/chunkr_reader.py +9 -0
  14. camel/memories/__init__.py +2 -1
  15. camel/memories/agent_memories.py +3 -1
  16. camel/memories/blocks/chat_history_block.py +21 -3
  17. camel/memories/records.py +88 -8
  18. camel/messages/base.py +127 -34
  19. camel/models/__init__.py +4 -0
  20. camel/models/amd_model.py +101 -0
  21. camel/models/azure_openai_model.py +0 -6
  22. camel/models/base_model.py +30 -0
  23. camel/models/cometapi_model.py +83 -0
  24. camel/models/model_factory.py +4 -0
  25. camel/models/openai_compatible_model.py +0 -6
  26. camel/models/openai_model.py +0 -6
  27. camel/models/zhipuai_model.py +61 -2
  28. camel/parsers/__init__.py +18 -0
  29. camel/parsers/mcp_tool_call_parser.py +176 -0
  30. camel/retrievers/auto_retriever.py +1 -0
  31. camel/runtimes/daytona_runtime.py +11 -12
  32. camel/societies/workforce/prompts.py +131 -50
  33. camel/societies/workforce/single_agent_worker.py +434 -49
  34. camel/societies/workforce/structured_output_handler.py +30 -18
  35. camel/societies/workforce/task_channel.py +43 -0
  36. camel/societies/workforce/utils.py +105 -12
  37. camel/societies/workforce/workforce.py +1322 -311
  38. camel/societies/workforce/workforce_logger.py +24 -5
  39. camel/storages/key_value_storages/json.py +15 -2
  40. camel/storages/object_storages/google_cloud.py +1 -1
  41. camel/storages/vectordb_storages/oceanbase.py +10 -11
  42. camel/storages/vectordb_storages/tidb.py +8 -6
  43. camel/tasks/task.py +4 -3
  44. camel/toolkits/__init__.py +18 -5
  45. camel/toolkits/aci_toolkit.py +45 -0
  46. camel/toolkits/code_execution.py +28 -1
  47. camel/toolkits/context_summarizer_toolkit.py +684 -0
  48. camel/toolkits/dingtalk.py +1135 -0
  49. camel/toolkits/edgeone_pages_mcp_toolkit.py +11 -31
  50. camel/toolkits/{file_write_toolkit.py → file_toolkit.py} +194 -34
  51. camel/toolkits/function_tool.py +6 -1
  52. camel/toolkits/google_drive_mcp_toolkit.py +12 -31
  53. camel/toolkits/hybrid_browser_toolkit/config_loader.py +12 -0
  54. camel/toolkits/hybrid_browser_toolkit/hybrid_browser_toolkit.py +79 -2
  55. camel/toolkits/hybrid_browser_toolkit/hybrid_browser_toolkit_ts.py +95 -59
  56. camel/toolkits/hybrid_browser_toolkit/installer.py +203 -0
  57. camel/toolkits/hybrid_browser_toolkit/ts/package-lock.json +5 -612
  58. camel/toolkits/hybrid_browser_toolkit/ts/package.json +0 -1
  59. camel/toolkits/hybrid_browser_toolkit/ts/src/browser-session.ts +619 -95
  60. camel/toolkits/hybrid_browser_toolkit/ts/src/config-loader.ts +7 -2
  61. camel/toolkits/hybrid_browser_toolkit/ts/src/hybrid-browser-toolkit.ts +115 -219
  62. camel/toolkits/hybrid_browser_toolkit/ts/src/parent-child-filter.ts +226 -0
  63. camel/toolkits/hybrid_browser_toolkit/ts/src/snapshot-parser.ts +219 -0
  64. camel/toolkits/hybrid_browser_toolkit/ts/src/som-screenshot-injected.ts +543 -0
  65. camel/toolkits/hybrid_browser_toolkit/ts/src/types.ts +1 -0
  66. camel/toolkits/hybrid_browser_toolkit/ts/websocket-server.js +39 -6
  67. camel/toolkits/hybrid_browser_toolkit/ws_wrapper.py +405 -131
  68. camel/toolkits/hybrid_browser_toolkit_py/hybrid_browser_toolkit.py +9 -5
  69. camel/toolkits/{openai_image_toolkit.py → image_generation_toolkit.py} +98 -31
  70. camel/toolkits/markitdown_toolkit.py +27 -1
  71. camel/toolkits/mcp_toolkit.py +348 -348
  72. camel/toolkits/message_integration.py +3 -0
  73. camel/toolkits/minimax_mcp_toolkit.py +195 -0
  74. camel/toolkits/note_taking_toolkit.py +18 -8
  75. camel/toolkits/notion_mcp_toolkit.py +16 -26
  76. camel/toolkits/origene_mcp_toolkit.py +8 -49
  77. camel/toolkits/playwright_mcp_toolkit.py +12 -31
  78. camel/toolkits/resend_toolkit.py +168 -0
  79. camel/toolkits/slack_toolkit.py +50 -1
  80. camel/toolkits/terminal_toolkit/__init__.py +18 -0
  81. camel/toolkits/terminal_toolkit/terminal_toolkit.py +924 -0
  82. camel/toolkits/terminal_toolkit/utils.py +532 -0
  83. camel/toolkits/vertex_ai_veo_toolkit.py +590 -0
  84. camel/toolkits/video_analysis_toolkit.py +17 -11
  85. camel/toolkits/wechat_official_toolkit.py +483 -0
  86. camel/types/enums.py +124 -1
  87. camel/types/unified_model_type.py +5 -0
  88. camel/utils/commons.py +17 -0
  89. camel/utils/context_utils.py +804 -0
  90. camel/utils/mcp.py +136 -2
  91. camel/utils/token_counting.py +25 -17
  92. {camel_ai-0.2.75a6.dist-info → camel_ai-0.2.76.dist-info}/METADATA +158 -59
  93. {camel_ai-0.2.75a6.dist-info → camel_ai-0.2.76.dist-info}/RECORD +95 -76
  94. camel/loaders/pandas_reader.py +0 -368
  95. camel/toolkits/terminal_toolkit.py +0 -1788
  96. {camel_ai-0.2.75a6.dist-info → camel_ai-0.2.76.dist-info}/WHEEL +0 -0
  97. {camel_ai-0.2.75a6.dist-info → camel_ai-0.2.76.dist-info}/licenses/LICENSE +0 -0
@@ -1,10 +1,11 @@
1
- import { Page, Browser, BrowserContext, chromium, ConsoleMessage } from 'playwright';
1
+ import { Page, Browser, BrowserContext, chromium, ConsoleMessage, Frame } from 'playwright';
2
2
  import { BrowserToolkitConfig, SnapshotResult, SnapshotElement, ActionResult, TabInfo, BrowserAction, DetailedTiming } from './types';
3
3
  import { ConfigLoader, StealthConfig } from './config-loader';
4
4
 
5
5
  export class HybridBrowserSession {
6
6
  private browser: Browser | null = null;
7
7
  private context: BrowserContext | null = null;
8
+ private contextOwnedByUs: boolean = false;
8
9
  private pages: Map<string, Page> = new Map();
9
10
  private consoleLogs: Map<string, ConsoleMessage[]> = new Map();
10
11
  private currentTabId: string | null = null;
@@ -50,8 +51,8 @@ export class HybridBrowserSession {
50
51
  const browserConfig = this.configLoader.getBrowserConfig();
51
52
  const stealthConfig = this.configLoader.getStealthConfig();
52
53
 
53
- // Check if CDP connection is requested
54
- if (browserConfig.connectOverCdp && browserConfig.cdpUrl) {
54
+ // Check if CDP URL is provided
55
+ if (browserConfig.cdpUrl) {
55
56
  // Connect to existing browser via CDP
56
57
  this.browser = await chromium.connectOverCDP(browserConfig.cdpUrl);
57
58
 
@@ -59,42 +60,94 @@ export class HybridBrowserSession {
59
60
  const contexts = this.browser.contexts();
60
61
  if (contexts.length > 0) {
61
62
  this.context = contexts[0];
63
+ this.contextOwnedByUs = false;
64
+
65
+ // Apply stealth headers to existing context if configured
66
+ // Note: userAgent cannot be changed on an existing context
67
+ if (stealthConfig.enabled) {
68
+ if (stealthConfig.extraHTTPHeaders) {
69
+ await this.context.setExtraHTTPHeaders(stealthConfig.extraHTTPHeaders);
70
+ }
71
+ if (stealthConfig.userAgent) {
72
+ console.warn('[HybridBrowserSession] Cannot apply userAgent to existing context. Consider creating a new context if userAgent customization is required.');
73
+ }
74
+ }
62
75
  } else {
63
76
  const contextOptions: any = {
64
77
  viewport: browserConfig.viewport
65
78
  };
66
79
 
67
- // Apply stealth headers if configured
68
- if (stealthConfig.enabled && stealthConfig.extraHTTPHeaders) {
69
- contextOptions.extraHTTPHeaders = stealthConfig.extraHTTPHeaders;
80
+ // Apply stealth headers and UA if configured
81
+ if (stealthConfig.enabled) {
82
+ if (stealthConfig.extraHTTPHeaders) {
83
+ contextOptions.extraHTTPHeaders = stealthConfig.extraHTTPHeaders;
84
+ }
85
+ if (stealthConfig.userAgent) {
86
+ contextOptions.userAgent = stealthConfig.userAgent;
87
+ }
70
88
  }
71
89
 
72
90
  this.context = await this.browser.newContext(contextOptions);
91
+ this.contextOwnedByUs = true;
92
+ this.browser = this.context.browser();
73
93
  }
74
94
 
75
95
  const pages = this.context.pages();
76
- if (pages.length > 0) {
77
- // Map existing pages - for CDP, find ONE available blank page
78
- let availablePageFound = false;
79
- for (const page of pages) {
80
- const pageUrl = page.url();
81
- if (this.isBlankPageUrl(pageUrl)) {
96
+ console.log(`[CDP] cdpKeepCurrentPage: ${browserConfig.cdpKeepCurrentPage}, pages count: ${pages.length}`);
97
+ if (browserConfig.cdpKeepCurrentPage) {
98
+ // Use existing page without creating new ones
99
+ if (pages.length > 0) {
100
+ // Find first non-closed page
101
+ let validPage: Page | null = null;
102
+ for (const page of pages) {
103
+ if (!page.isClosed()) {
104
+ validPage = page;
105
+ break;
106
+ }
107
+ }
108
+
109
+ if (validPage) {
82
110
  const tabId = this.generateTabId();
83
- this.registerNewPage(tabId, page);
111
+ this.registerNewPage(tabId, validPage);
84
112
  this.currentTabId = tabId;
85
- availablePageFound = true;
86
- console.log(`[CDP] Registered blank page as initial tab: ${tabId}, URL: ${pageUrl}`);
87
- break; // Only register ONE page initially
113
+ console.log(`[CDP] cdpKeepCurrentPage mode: using existing page as initial tab: ${tabId}, URL: ${validPage.url()}`);
114
+ } else {
115
+ throw new Error('No active pages available in CDP mode with cdpKeepCurrentPage=true (all pages are closed)');
88
116
  }
89
- }
90
-
91
- // If no available blank pages found in CDP mode, we cannot create new ones
92
- if (!availablePageFound) {
93
- throw new Error('No available blank tabs found in CDP mode. The frontend should have pre-created blank tabs.');
117
+ } else {
118
+ throw new Error('No pages available in CDP mode with cdpKeepCurrentPage=true');
94
119
  }
95
120
  } else {
96
- // In CDP mode, newPage is not supported
97
- throw new Error('No pages available in CDP mode and newPage() is not supported. Ensure the frontend has pre-created blank tabs.');
121
+ // Look for blank pages or create new ones
122
+ if (pages.length > 0) {
123
+ // Find one available blank page
124
+ let availablePageFound = false;
125
+ for (const page of pages) {
126
+ const pageUrl = page.url();
127
+ if (this.isBlankPageUrl(pageUrl)) {
128
+ const tabId = this.generateTabId();
129
+ this.registerNewPage(tabId, page);
130
+ this.currentTabId = tabId;
131
+ availablePageFound = true;
132
+ console.log(`[CDP] Registered blank page as initial tab: ${tabId}, URL: ${pageUrl}`);
133
+ break;
134
+ }
135
+ }
136
+
137
+ if (!availablePageFound) {
138
+ console.log('[CDP] No blank pages found, creating new page');
139
+ const newPage = await this.context.newPage();
140
+ const tabId = this.generateTabId();
141
+ this.registerNewPage(tabId, newPage);
142
+ this.currentTabId = tabId;
143
+ }
144
+ } else {
145
+ console.log('[CDP] No existing pages, creating initial page');
146
+ const newPage = await this.context.newPage();
147
+ const tabId = this.generateTabId();
148
+ this.registerNewPage(tabId, newPage);
149
+ this.currentTabId = tabId;
150
+ }
98
151
  }
99
152
  } else {
100
153
  // Original launch logic
@@ -105,18 +158,24 @@ export class HybridBrowserSession {
105
158
  if (stealthConfig.enabled) {
106
159
  launchOptions.args = stealthConfig.args || [];
107
160
 
108
- // Apply stealth user agent if configured
161
+ // Apply stealth user agent/headers if configured
109
162
  if (stealthConfig.userAgent) {
110
163
  launchOptions.userAgent = stealthConfig.userAgent;
111
164
  }
165
+ if (stealthConfig.extraHTTPHeaders) {
166
+ launchOptions.extraHTTPHeaders = stealthConfig.extraHTTPHeaders;
167
+ }
112
168
  }
113
169
 
114
170
  if (browserConfig.userDataDir) {
171
+ // Ensure viewport is honored in persistent context
172
+ launchOptions.viewport = browserConfig.viewport;
115
173
  this.context = await chromium.launchPersistentContext(
116
174
  browserConfig.userDataDir,
117
175
  launchOptions
118
176
  );
119
-
177
+ this.contextOwnedByUs = true;
178
+ this.browser = this.context.browser();
120
179
  const pages = this.context.pages();
121
180
  if (pages.length > 0) {
122
181
  const initialTabId = this.generateTabId();
@@ -129,12 +188,18 @@ export class HybridBrowserSession {
129
188
  viewport: browserConfig.viewport
130
189
  };
131
190
 
132
- // Apply stealth headers if configured
133
- if (stealthConfig.enabled && stealthConfig.extraHTTPHeaders) {
134
- contextOptions.extraHTTPHeaders = stealthConfig.extraHTTPHeaders;
191
+ // Apply stealth headers and UA if configured
192
+ if (stealthConfig.enabled) {
193
+ if (stealthConfig.extraHTTPHeaders) {
194
+ contextOptions.extraHTTPHeaders = stealthConfig.extraHTTPHeaders;
195
+ }
196
+ if (stealthConfig.userAgent) {
197
+ contextOptions.userAgent = stealthConfig.userAgent;
198
+ }
135
199
  }
136
200
 
137
201
  this.context = await this.browser.newContext(contextOptions);
202
+ this.contextOwnedByUs = true;
138
203
 
139
204
  const initialPage = await this.context.newPage();
140
205
  const initialTabId = this.generateTabId();
@@ -173,28 +238,57 @@ export class HybridBrowserSession {
173
238
 
174
239
  async getCurrentPage(): Promise<Page> {
175
240
  if (!this.currentTabId || !this.pages.has(this.currentTabId)) {
176
- // In CDP mode, try to create a new page if none exists
177
241
  const browserConfig = this.configLoader.getBrowserConfig();
178
- if (browserConfig.connectOverCdp && this.context) {
179
- console.log('[CDP] No active page found, attempting to create new page...');
180
- try {
181
- const newPage = await this.context.newPage();
182
- const newTabId = this.generateTabId();
183
- this.registerNewPage(newTabId, newPage);
184
- this.currentTabId = newTabId;
185
-
186
- // Set page timeouts
187
- newPage.setDefaultNavigationTimeout(browserConfig.navigationTimeout);
188
- newPage.setDefaultTimeout(browserConfig.navigationTimeout);
242
+
243
+ // In CDP keep-current-page mode, find existing page
244
+ if (browserConfig.cdpKeepCurrentPage && browserConfig.cdpUrl && this.context) {
245
+ const allPages = this.context.pages();
246
+ console.log(`[getCurrentPage] cdpKeepCurrentPage mode: Looking for existing page, found ${allPages.length} pages`);
247
+
248
+ if (allPages.length > 0) {
249
+ // Try to find a page that's not already tracked
250
+ for (const page of allPages) {
251
+ const isTracked = Array.from(this.pages.values()).includes(page);
252
+ if (!isTracked && !page.isClosed()) {
253
+ const tabId = this.generateTabId();
254
+ this.registerNewPage(tabId, page);
255
+ this.currentTabId = tabId;
256
+ console.log(`[getCurrentPage] cdpKeepCurrentPage mode: Found and registered untracked page: ${tabId}`);
257
+ return page;
258
+ }
259
+ }
189
260
 
190
- console.log(`[CDP] Created new page with tab ID: ${newTabId}`);
191
- return newPage;
192
- } catch (error) {
193
- console.error('[CDP] Failed to create new page:', error);
194
- throw new Error('No active page available and failed to create new page in CDP mode');
261
+ // If all pages are tracked, use the first available one
262
+ const firstPage = allPages[0];
263
+ if (!firstPage.isClosed()) {
264
+ // Find the tab ID for this page
265
+ for (const [tabId, page] of this.pages.entries()) {
266
+ if (page === firstPage) {
267
+ this.currentTabId = tabId;
268
+ console.log(`[getCurrentPage] cdpKeepCurrentPage mode: Using existing tracked page: ${tabId}`);
269
+ return page;
270
+ }
271
+ }
272
+ }
195
273
  }
274
+
275
+ throw new Error('No active page available in CDP mode with cdpKeepCurrentPage=true');
276
+ }
277
+
278
+ // Normal mode: create new page
279
+ if (this.context) {
280
+ console.log('[getCurrentPage] No active page, creating new page');
281
+ const newPage = await this.context.newPage();
282
+ const tabId = this.generateTabId();
283
+ this.registerNewPage(tabId, newPage);
284
+ this.currentTabId = tabId;
285
+
286
+ newPage.setDefaultNavigationTimeout(browserConfig.navigationTimeout);
287
+ newPage.setDefaultTimeout(browserConfig.navigationTimeout);
288
+
289
+ return newPage;
196
290
  }
197
- throw new Error('No active page available');
291
+ throw new Error('No browser context available');
198
292
  }
199
293
  return this.pages.get(this.currentTabId)!;
200
294
  }
@@ -235,6 +329,36 @@ export class HybridBrowserSession {
235
329
  return this.getSnapshotForAINative(includeCoordinates, viewportLimit);
236
330
  }
237
331
 
332
+ private parseElementFromSnapshot(snapshotText: string, ref: string): { role?: string; text?: string } {
333
+ const lines = snapshotText.split('\n');
334
+ for (const line of lines) {
335
+ if (line.includes(`[ref=${ref}]`)) {
336
+ const typeMatch = line.match(/^\s*-?\s*([\w-]+)/);
337
+ const role = typeMatch ? typeMatch[1] : undefined;
338
+ const textMatch = line.match(/"([^"]*)"/);
339
+ const text = textMatch ? textMatch[1] : undefined;
340
+ return { role, text };
341
+ }
342
+ }
343
+ return {};
344
+ }
345
+
346
+ private buildSnapshotIndex(snapshotText: string): Map<string, { role?: string; text?: string }> {
347
+ const index = new Map<string, { role?: string; text?: string }>();
348
+ const refRe = /\[ref=([^\]]+)\]/i;
349
+ for (const line of snapshotText.split('\n')) {
350
+ const m = line.match(refRe);
351
+ if (!m) continue;
352
+ const ref = m[1];
353
+ const roleMatch = line.match(/^\s*-?\s*([a-z0-9_-]+)/i);
354
+ const role = roleMatch ? roleMatch[1].toLowerCase() : undefined;
355
+ const textMatch = line.match(/"([^"]*)"/);
356
+ const text = textMatch ? textMatch[1] : undefined;
357
+ index.set(ref, { role, text });
358
+ }
359
+ return index;
360
+ }
361
+
238
362
  private async getSnapshotForAINative(includeCoordinates = false, viewportLimit = false): Promise<SnapshotResult & { timing: DetailedTiming }> {
239
363
  const startTime = Date.now();
240
364
  const page = await this.getCurrentPage();
@@ -257,6 +381,17 @@ export class HybridBrowserSession {
257
381
  const mappingStart = Date.now();
258
382
  const playwrightMapping: Record<string, any> = {};
259
383
 
384
+ // Parse element info in a single pass
385
+ const snapshotIndex = this.buildSnapshotIndex(snapshotText);
386
+ for (const ref of refs) {
387
+ const elementInfo = snapshotIndex.get(ref) || {};
388
+ playwrightMapping[ref] = {
389
+ ref,
390
+ role: elementInfo.role || 'unknown',
391
+ text: elementInfo.text || '',
392
+ };
393
+ }
394
+
260
395
  if (includeCoordinates) {
261
396
  // Get coordinates for each ref using aria-ref selector
262
397
  for (const ref of refs) {
@@ -270,8 +405,9 @@ export class HybridBrowserSession {
270
405
  const boundingBox = await element.boundingBox();
271
406
 
272
407
  if (boundingBox) {
408
+ // Add coordinates to existing element info
273
409
  playwrightMapping[ref] = {
274
- ref,
410
+ ...playwrightMapping[ref],
275
411
  coordinates: {
276
412
  x: Math.round(boundingBox.x),
277
413
  y: Math.round(boundingBox.y),
@@ -344,7 +480,7 @@ export class HybridBrowserSession {
344
480
  /**
345
481
  * Enhanced click implementation with new tab detection and scroll fix
346
482
  */
347
- private async performClick(page: Page, ref: string): Promise<{ success: boolean; method?: string; error?: string; newTabId?: string }> {
483
+ private async performClick(page: Page, ref: string): Promise<{ success: boolean; method?: string; error?: string; newTabId?: string; diffSnapshot?: string }> {
348
484
 
349
485
  try {
350
486
  // Ensure we have the latest snapshot and mapping
@@ -361,6 +497,17 @@ export class HybridBrowserSession {
361
497
  return { success: false, error: `Element with ref ${ref} not found` };
362
498
  }
363
499
 
500
+ const role = await element.getAttribute('role');
501
+ const elementTagName = await element.evaluate(el => el.tagName.toLowerCase());
502
+ const isCombobox = role === 'combobox' || elementTagName === 'combobox';
503
+ const isTextbox = role === 'textbox' || elementTagName === 'input' || elementTagName === 'textarea';
504
+ const shouldCheckDiff = isCombobox || isTextbox;
505
+
506
+ let snapshotBefore: string | null = null;
507
+ if (shouldCheckDiff) {
508
+ snapshotBefore = await (page as any)._snapshotForAI();
509
+ }
510
+
364
511
  // Check element properties
365
512
  const browserConfig = this.configLoader.getBrowserConfig();
366
513
  const target = await element.getAttribute(browserConfig.targetAttribute);
@@ -388,7 +535,6 @@ export class HybridBrowserSession {
388
535
 
389
536
  if (shouldOpenNewTab) {
390
537
  // Handle new tab opening
391
-
392
538
  // If it's a link that doesn't naturally open in new tab, force it
393
539
  if (isNavigableLink && !naturallyOpensNewTab) {
394
540
  await element.evaluate((el, blankTarget) => {
@@ -431,13 +577,17 @@ export class HybridBrowserSession {
431
577
  }
432
578
  } else {
433
579
  // Add options to prevent scrolling issues
434
- try {
435
- // First try normal click
436
- const browserConfig = this.configLoader.getBrowserConfig();
437
- await element.click({ timeout: browserConfig.clickTimeout });
438
- } catch (clickError) {
439
- // If normal click fails due to scrolling, try force click
440
- await element.click({ force: browserConfig.forceClick });
580
+ const browserConfig = this.configLoader.getBrowserConfig();
581
+ await element.click({ force: browserConfig.forceClick });
582
+
583
+ if (shouldCheckDiff && snapshotBefore) {
584
+ await page.waitForTimeout(300);
585
+ const snapshotAfter = await (page as any)._snapshotForAI();
586
+ const diffSnapshot = this.getSnapshotDiff(snapshotBefore, snapshotAfter, ['option', 'menuitem']);
587
+
588
+ if (diffSnapshot && diffSnapshot.trim() !== '') {
589
+ return { success: true, method: 'playwright-aria-ref', diffSnapshot };
590
+ }
441
591
  }
442
592
 
443
593
  return { success: true, method: 'playwright-aria-ref' };
@@ -449,11 +599,46 @@ export class HybridBrowserSession {
449
599
  }
450
600
  }
451
601
 
602
+ /**
603
+ * Extract diff between two snapshots, returning only new elements of specified types
604
+ */
605
+ private getSnapshotDiff(snapshotBefore: string, snapshotAfter: string, targetRoles: string[]): string {
606
+ const refsBefore = new Set<string>();
607
+ const refPattern = /\[ref=([^\]]+)\]/g;
608
+ let match;
609
+ while ((match = refPattern.exec(snapshotBefore)) !== null) {
610
+ refsBefore.add(match[1]);
611
+ }
612
+
613
+ const lines = snapshotAfter.split('\n');
614
+ const newElements: string[] = [];
615
+
616
+ for (const line of lines) {
617
+ const refMatch = line.match(/\[ref=([^\]]+)\]/);
618
+ if (refMatch && !refsBefore.has(refMatch[1])) {
619
+ const hasTargetRole = targetRoles.some(role => {
620
+ const rolePattern = new RegExp(`\\b${role}\\b`, 'i');
621
+ return rolePattern.test(line);
622
+ });
623
+
624
+ if (hasTargetRole) {
625
+ newElements.push(line.trim());
626
+ }
627
+ }
628
+ }
629
+
630
+ if (newElements.length > 0) {
631
+ return newElements.join('\n');
632
+ } else {
633
+ return '';
634
+ }
635
+ }
636
+
452
637
  /**
453
638
  * Simplified type implementation using Playwright's aria-ref selector
454
639
  * Supports both single and multiple input operations
455
640
  */
456
- private async performType(page: Page, ref: string | undefined, text: string | undefined, inputs?: Array<{ ref: string; text: string }>): Promise<{ success: boolean; error?: string; details?: Record<string, any> }> {
641
+ private async performType(page: Page, ref: string | undefined, text: string | undefined, inputs?: Array<{ ref: string; text: string }>): Promise<{ success: boolean; error?: string; details?: Record<string, any>; diffSnapshot?: string }> {
457
642
  try {
458
643
  // Ensure we have the latest snapshot
459
644
  await (page as any)._snapshotForAI();
@@ -463,22 +648,11 @@ export class HybridBrowserSession {
463
648
  const results: Record<string, { success: boolean; error?: string }> = {};
464
649
 
465
650
  for (const input of inputs) {
466
- const selector = `aria-ref=${input.ref}`;
467
- const element = await page.locator(selector).first();
468
-
469
- const exists = await element.count() > 0;
470
- if (!exists) {
471
- results[input.ref] = { success: false, error: `Element with ref ${input.ref} not found` };
472
- continue;
473
- }
474
-
475
- try {
476
- // Type text using Playwright's built-in fill method
477
- await element.fill(input.text);
478
- results[input.ref] = { success: true };
479
- } catch (error) {
480
- results[input.ref] = { success: false, error: `Type failed: ${error}` };
481
- }
651
+ const singleResult = await this.performType(page, input.ref, input.text);
652
+ results[input.ref] = {
653
+ success: singleResult.success,
654
+ error: singleResult.error
655
+ };
482
656
  }
483
657
 
484
658
  // Check if all inputs were successful
@@ -505,10 +679,292 @@ export class HybridBrowserSession {
505
679
  return { success: false, error: `Element with ref ${ref} not found` };
506
680
  }
507
681
 
508
- // Type text using Playwright's built-in fill method
509
- await element.fill(text);
682
+ // Get element attributes to check if it's readonly or a special input type
683
+ let originalPlaceholder: string | null = null;
684
+ let isReadonly = false;
685
+ let elementType: string | null = null;
686
+ let isCombobox = false;
687
+ let isTextbox = false;
688
+ let shouldCheckDiff = false;
510
689
 
511
- return { success: true };
690
+ try {
691
+ // Get element info in one evaluation to minimize interactions
692
+ const elementInfo = await element.evaluate((el: any) => {
693
+ return {
694
+ placeholder: el.placeholder || null,
695
+ readonly: el.readOnly || el.hasAttribute('readonly'),
696
+ type: el.type || null,
697
+ tagName: el.tagName.toLowerCase(),
698
+ disabled: el.disabled || false,
699
+ role: el.getAttribute('role'),
700
+ ariaHaspopup: el.getAttribute('aria-haspopup')
701
+ };
702
+ });
703
+
704
+ originalPlaceholder = elementInfo.placeholder;
705
+ isReadonly = elementInfo.readonly;
706
+ elementType = elementInfo.type;
707
+ isCombobox = elementInfo.role === 'combobox' ||
708
+ elementInfo.tagName === 'combobox' ||
709
+ elementInfo.ariaHaspopup === 'listbox';
710
+ isTextbox = elementInfo.role === 'textbox' ||
711
+ elementInfo.tagName === 'input' ||
712
+ elementInfo.tagName === 'textarea';
713
+ shouldCheckDiff = isCombobox || isTextbox;
714
+
715
+ } catch (e) {
716
+ console.log(`Warning: Failed to get element attributes: ${e}`);
717
+ }
718
+
719
+ // Get snapshot before action to record existing elements
720
+ const snapshotBefore = await (page as any)._snapshotForAI();
721
+ const existingRefs = new Set<string>();
722
+ const refPattern = /\[ref=([^\]]+)\]/g;
723
+ let match;
724
+ while ((match = refPattern.exec(snapshotBefore)) !== null) {
725
+ existingRefs.add(match[1]);
726
+ }
727
+ console.log(`Found ${existingRefs.size} total elements before action`);
728
+
729
+ // If element is readonly or a date/time input, skip fill attempt and go directly to click
730
+ if (isReadonly || ['date', 'datetime-local', 'time'].includes(elementType || '')) {
731
+ console.log(`Element ref=${ref} is readonly or date/time input, skipping direct fill attempt`);
732
+
733
+ // Click with force option to avoid scrolling
734
+ try {
735
+ await element.click({ force: true });
736
+ console.log(`Clicked readonly/special element ref=${ref} to trigger dynamic content`);
737
+ // Wait for potential dynamic content to appear
738
+ await page.waitForTimeout(500);
739
+ } catch (clickError) {
740
+ console.log(`Warning: Failed to click element: ${clickError}`);
741
+ }
742
+ } else {
743
+ // For normal inputs, click first then try to fill
744
+ try {
745
+ await element.click({ force: true });
746
+ console.log(`Clicked element ref=${ref} before typing`);
747
+ } catch (clickError) {
748
+ console.log(`Warning: Failed to click element before typing: ${clickError}`);
749
+ }
750
+
751
+ // Try to fill the element directly
752
+ try {
753
+ // Use force option to avoid scrolling during fill
754
+ await element.fill(text, { timeout: 3000, force: true });
755
+
756
+ // If this element might show dropdown, wait and check for new elements
757
+ if (shouldCheckDiff) {
758
+ await page.waitForTimeout(300);
759
+ const snapshotAfter = await (page as any)._snapshotForAI();
760
+ const diffSnapshot = this.getSnapshotDiff(snapshotBefore, snapshotAfter, ['option', 'menuitem']);
761
+
762
+ if (diffSnapshot && diffSnapshot.trim() !== '') {
763
+ return { success: true, diffSnapshot };
764
+ }
765
+ }
766
+
767
+ return { success: true };
768
+ } catch (fillError: any) {
769
+ // Log the error for debugging
770
+ console.log(`Fill error for ref ${ref}: ${fillError.message}`);
771
+
772
+ // Check for various error messages that indicate the element is not fillable
773
+ const errorMessage = fillError.message.toLowerCase();
774
+ if (errorMessage.includes('not an <input>') ||
775
+ errorMessage.includes('not have a role allowing') ||
776
+ errorMessage.includes('element is not') ||
777
+ errorMessage.includes('cannot type') ||
778
+ errorMessage.includes('readonly') ||
779
+ errorMessage.includes('not editable') ||
780
+ errorMessage.includes('timeout') ||
781
+ errorMessage.includes('timeouterror')) {
782
+
783
+ // Click the element again to trigger dynamic content (like date pickers)
784
+ try {
785
+ await element.click({ force: true });
786
+ console.log(`Clicked element ref=${ref} again to trigger dynamic content`);
787
+ // Wait for potential dynamic content to appear
788
+ await page.waitForTimeout(500);
789
+ } catch (clickError) {
790
+ console.log(`Warning: Failed to click element to trigger dynamic content: ${clickError}`);
791
+ }
792
+
793
+ // Step 1: Try to find input elements within the clicked element
794
+ const inputSelector = `input:visible, textarea:visible, [contenteditable="true"]:visible, [role="textbox"]:visible`;
795
+ const inputElement = await element.locator(inputSelector).first();
796
+
797
+ const inputExists = await inputElement.count() > 0;
798
+ if (inputExists) {
799
+ console.log(`Found input element within ref ${ref}, attempting to fill`);
800
+ try {
801
+ await inputElement.fill(text, { force: true });
802
+
803
+ // If element might show dropdown, check for new elements
804
+ if (shouldCheckDiff) {
805
+ await page.waitForTimeout(300);
806
+ const snapshotFinal = await (page as any)._snapshotForAI();
807
+ const diffSnapshot = this.getSnapshotDiff(snapshotBefore, snapshotFinal, ['option', 'menuitem']);
808
+
809
+ if (diffSnapshot && diffSnapshot.trim() !== '') {
810
+ return { success: true, diffSnapshot };
811
+ }
812
+ }
813
+
814
+ return { success: true };
815
+ } catch (innerError) {
816
+ console.log(`Failed to fill child element: ${innerError}`);
817
+ }
818
+ }
819
+
820
+ // Step 2: Look for new elements that appeared after the action
821
+ console.log(`Looking for new elements that appeared after action...`);
822
+
823
+ // Get snapshot after action to find new elements
824
+ const snapshotAfter = await (page as any)._snapshotForAI();
825
+ const newRefs = new Set<string>();
826
+ const afterRefPattern = /\[ref=([^\]]+)\]/g;
827
+ let afterMatch;
828
+ while ((afterMatch = afterRefPattern.exec(snapshotAfter)) !== null) {
829
+ const refId = afterMatch[1];
830
+ if (!existingRefs.has(refId)) {
831
+ newRefs.add(refId);
832
+ }
833
+ }
834
+
835
+ console.log(`Found ${newRefs.size} new elements after action`);
836
+
837
+ // If we have a placeholder, try to find new input elements with that placeholder
838
+ if (originalPlaceholder && newRefs.size > 0) {
839
+ console.log(`Looking for new input elements with placeholder: ${originalPlaceholder}`);
840
+
841
+ // Try each new ref to see if it's an input with our placeholder
842
+ for (const newRef of newRefs) {
843
+ try {
844
+ const newElement = await page.locator(`aria-ref=${newRef}`).first();
845
+ const tagName = await newElement.evaluate(el => el.tagName.toLowerCase()).catch(() => null);
846
+
847
+ if (tagName === 'input' || tagName === 'textarea') {
848
+ const placeholder = await newElement.getAttribute('placeholder').catch(() => null);
849
+ if (placeholder === originalPlaceholder) {
850
+ console.log(`Found new input element with matching placeholder: ref=${newRef}`);
851
+
852
+ // Check if it's visible and fillable
853
+ const elementInfo = await newElement.evaluate((el: any) => {
854
+ return {
855
+ tagName: el.tagName,
856
+ id: el.id,
857
+ className: el.className,
858
+ placeholder: el.placeholder,
859
+ isVisible: el.offsetParent !== null,
860
+ isReadonly: el.readOnly || el.getAttribute('readonly') !== null
861
+ };
862
+ });
863
+ console.log(`New element details:`, JSON.stringify(elementInfo));
864
+
865
+ // Try to fill it with force to avoid scrolling
866
+ await newElement.fill(text, { force: true });
867
+
868
+ // If element might show dropdown, check for new elements
869
+ if (shouldCheckDiff) {
870
+ await page.waitForTimeout(300);
871
+ const snapshotFinal = await (page as any)._snapshotForAI();
872
+ const diffSnapshot = this.getSnapshotDiff(snapshotBefore, snapshotFinal, ['option', 'menuitem']);
873
+
874
+ if (diffSnapshot && diffSnapshot.trim() !== '') {
875
+ return { success: true, diffSnapshot };
876
+ }
877
+ }
878
+
879
+ return { success: true };
880
+ }
881
+ }
882
+ } catch (e) {
883
+ // Ignore errors for non-input elements
884
+ }
885
+ }
886
+ }
887
+
888
+ console.log(`No suitable input element found for ref ${ref}`);
889
+ }
890
+ // Re-throw the original error if we couldn't find an input element
891
+ throw fillError;
892
+ }
893
+ }
894
+
895
+ // If we skipped the fill attempt (readonly elements), look for new elements directly
896
+ if (isReadonly || ['date', 'datetime-local', 'time'].includes(elementType || '')) {
897
+ // Look for new elements that appeared after clicking
898
+ console.log(`Looking for new elements that appeared after clicking readonly element...`);
899
+
900
+ // Get snapshot after action to find new elements
901
+ const snapshotAfter = await (page as any)._snapshotForAI();
902
+ const newRefs = new Set<string>();
903
+ const afterRefPattern = /\[ref=([^\]]+)\]/g;
904
+ let afterMatch;
905
+ while ((afterMatch = afterRefPattern.exec(snapshotAfter)) !== null) {
906
+ const refId = afterMatch[1];
907
+ if (!existingRefs.has(refId)) {
908
+ newRefs.add(refId);
909
+ }
910
+ }
911
+
912
+ console.log(`Found ${newRefs.size} new elements after clicking readonly element`);
913
+
914
+ // If we have a placeholder, try to find new input elements with that placeholder
915
+ if (originalPlaceholder && newRefs.size > 0) {
916
+ console.log(`Looking for new input elements with placeholder: ${originalPlaceholder}`);
917
+
918
+ // Try each new ref to see if it's an input with our placeholder
919
+ for (const newRef of newRefs) {
920
+ try {
921
+ const newElement = await page.locator(`aria-ref=${newRef}`).first();
922
+ const tagName = await newElement.evaluate(el => el.tagName.toLowerCase()).catch(() => null);
923
+
924
+ if (tagName === 'input' || tagName === 'textarea') {
925
+ const placeholder = await newElement.getAttribute('placeholder').catch(() => null);
926
+ if (placeholder === originalPlaceholder) {
927
+ console.log(`Found new input element with matching placeholder: ref=${newRef}`);
928
+
929
+ // Check if it's visible and fillable
930
+ const elementInfo = await newElement.evaluate((el: any) => {
931
+ return {
932
+ tagName: el.tagName,
933
+ id: el.id,
934
+ className: el.className,
935
+ placeholder: el.placeholder,
936
+ isVisible: el.offsetParent !== null,
937
+ isReadonly: el.readOnly || el.getAttribute('readonly') !== null
938
+ };
939
+ });
940
+ console.log(`New element details:`, JSON.stringify(elementInfo));
941
+
942
+ // Try to fill it with force to avoid scrolling
943
+ await newElement.fill(text, { force: true });
944
+
945
+ // If element might show dropdown, check for new elements
946
+ if (shouldCheckDiff) {
947
+ await page.waitForTimeout(300);
948
+ const snapshotFinal = await (page as any)._snapshotForAI();
949
+ const diffSnapshot = this.getSnapshotDiff(snapshotBefore, snapshotFinal, ['option', 'menuitem']);
950
+
951
+ if (diffSnapshot && diffSnapshot.trim() !== '') {
952
+ return { success: true, diffSnapshot };
953
+ }
954
+ }
955
+
956
+ return { success: true };
957
+ }
958
+ }
959
+ } catch (e) {
960
+ // Ignore errors for non-input elements
961
+ }
962
+ }
963
+ }
964
+
965
+ console.log(`No suitable input element found for readonly ref ${ref}`);
966
+ return { success: false, error: `Element ref=${ref} is readonly and no suitable input was found` };
967
+ }
512
968
  }
513
969
 
514
970
  return { success: false, error: 'No valid input provided' };
@@ -667,6 +1123,11 @@ export class HybridBrowserSession {
667
1123
  // Capture new tab ID if present
668
1124
  newTabId = clickResult.newTabId;
669
1125
 
1126
+ // Capture diff snapshot if present
1127
+ if (clickResult.diffSnapshot) {
1128
+ actionDetails = { diffSnapshot: clickResult.diffSnapshot };
1129
+ }
1130
+
670
1131
  actionExecutionTime = Date.now() - clickStart;
671
1132
  break;
672
1133
  }
@@ -689,6 +1150,14 @@ export class HybridBrowserSession {
689
1150
  actionDetails = typeResult.details;
690
1151
  }
691
1152
 
1153
+ // Capture diff snapshot if present
1154
+ if (typeResult.diffSnapshot) {
1155
+ if (!actionDetails) {
1156
+ actionDetails = {};
1157
+ }
1158
+ actionDetails.diffSnapshot = typeResult.diffSnapshot;
1159
+ }
1160
+
692
1161
  actionExecutionTime = Date.now() - typeStart;
693
1162
  break;
694
1163
  }
@@ -803,6 +1272,55 @@ export class HybridBrowserSession {
803
1272
  }
804
1273
  }
805
1274
 
1275
/**
 * Wait for the DOM to stop changing, bounded by `maxWaitTime`.
 *
 * A MutationObserver is installed in the page and records the time of the
 * most recent mutation. The method then waits until no mutation has been
 * recorded for `stabilityThreshold` milliseconds. If that does not happen
 * within the time budget, the wait is abandoned silently (best effort).
 * The observer and the temporary `window` globals are removed in all cases.
 *
 * @param page - Page whose DOM is monitored.
 * @param maxWaitTime - Upper bound for the wait in milliseconds. Values below
 *   1 are clamped to 1, because Playwright treats `timeout: 0` as
 *   "no timeout" and would otherwise wait indefinitely.
 * @returns Resolves once the DOM is considered stable or the budget is spent.
 * @throws Propagates errors from installing the observer (the first
 *   `page.evaluate`); errors from the wait and from cleanup are swallowed.
 */
private async waitForDOMStability(page: Page, maxWaitTime: number = 500): Promise<void> {
  const stabilityThreshold = 100; // Consider stable if no changes for 100ms

  // Playwright interprets a timeout of 0 as "disable the timeout", so a zero
  // or negative budget must be clamped to a small positive value instead.
  const waitTimeout = Math.max(1, maxWaitTime);

  try {
    // Monitor DOM changes
    await page.evaluate(() => {
      const check = { changeCount: 0, lastChange: Date.now() };
      (window as any).__domStabilityCheck = check;

      const observer = new MutationObserver(() => {
        check.changeCount++;
        check.lastChange = Date.now();
      });

      // document.body can be null (e.g. very early in load or for non-HTML
      // documents); fall back to the root element rather than throwing.
      const target = document.body || document.documentElement;
      if (target) {
        observer.observe(target, {
          childList: true,
          subtree: true,
          attributes: true,
          characterData: true
        });
      }

      (window as any).__domStabilityObserver = observer;
    });

    // Wait until no changes for stabilityThreshold or timeout
    await page.waitForFunction(
      (threshold) => {
        const check = (window as any).__domStabilityCheck;
        return check && (Date.now() - check.lastChange) > threshold;
      },
      stabilityThreshold,
      { timeout: waitTimeout }
    ).catch(() => {}); // Running out of time is expected; stability is best effort
  } finally {
    // Cleanup: always detach the observer and remove the temporary globals.
    // Failures are ignored here (the evaluate call itself may reject).
    await page.evaluate(() => {
      const observer = (window as any).__domStabilityObserver;
      if (observer) observer.disconnect();
      delete (window as any).__domStabilityObserver;
      delete (window as any).__domStabilityCheck;
    }).catch(() => {});
  }
}
1323
+
806
1324
  private async waitForPageStability(page: Page): Promise<{ domContentLoadedTime: number; networkIdleTime: number }> {
807
1325
  let domContentLoadedTime = 0;
808
1326
  let networkIdleTime = 0;
@@ -892,7 +1410,7 @@ export class HybridBrowserSession {
892
1410
  let newTabId: string | null = null;
893
1411
 
894
1412
  const browserConfig = this.configLoader.getBrowserConfig();
895
- if (browserConfig.connectOverCdp) {
1413
+ if (browserConfig.cdpUrl) {
896
1414
  // CDP mode: find an available blank tab
897
1415
  const allPages = this.context.pages();
898
1416
  for (const page of allPages) {
@@ -908,7 +1426,10 @@ export class HybridBrowserSession {
908
1426
  }
909
1427
 
910
1428
  if (!newPage || !newTabId) {
911
- throw new Error('No available blank tabs in CDP mode. Frontend should create more blank tabs when half are used.');
1429
+ console.log('[CDP] No available blank tabs, creating new page');
1430
+ newPage = await this.context.newPage();
1431
+ newTabId = this.generateTabId();
1432
+ this.registerNewPage(newTabId, newPage);
912
1433
  }
913
1434
  } else {
914
1435
  // Non-CDP mode: create new page as usual
@@ -1107,17 +1628,25 @@ export class HybridBrowserSession {
1107
1628
  this.pages.clear();
1108
1629
  this.currentTabId = null;
1109
1630
 
1110
- if (this.context) {
1631
+ // Handle context cleanup separately for CDP mode
1632
+ if (!browserConfig.cdpUrl && this.context && this.contextOwnedByUs) {
1633
+ // For non-CDP mode, close context here
1111
1634
  await this.context.close();
1112
1635
  this.context = null;
1636
+ this.contextOwnedByUs = false;
1113
1637
  }
1114
1638
 
1115
1639
  if (this.browser) {
1116
- if (browserConfig.connectOverCdp) {
1117
- // For CDP connections, just disconnect without closing the browser
1118
- await this.browser.close();
1640
+ if (browserConfig.cdpUrl) {
1641
+ // In CDP mode: tear down only our context, then disconnect
1642
+ if (this.context && this.contextOwnedByUs) {
1643
+ await this.context.close().catch(() => {});
1644
+ this.context = null;
1645
+ this.contextOwnedByUs = false;
1646
+ }
1647
+ await this.browser.close(); // disconnect
1119
1648
  } else {
1120
- // For launched browsers, close completely
1649
+ // Local launch: close everything
1121
1650
  await this.browser.close();
1122
1651
  }
1123
1652
  this.browser = null;
@@ -1132,12 +1661,12 @@ export class HybridBrowserSession {
1132
1661
  const filtered: Record<string, SnapshotElement> = {};
1133
1662
 
1134
1663
 
1135
- // Apply viewport filtering with scroll position adjustment
1136
- const browserConfig = this.configLoader.getBrowserConfig();
1137
- const adjustedScrollPos = {
1138
- x: scrollPos.x * browserConfig.scrollPositionScale,
1139
- y: scrollPos.y * browserConfig.scrollPositionScale
1140
- };
1664
+ // Apply viewport filtering
1665
+ // boundingBox() returns viewport-relative coordinates, so we don't need to add scroll offsets
1666
+ const viewportLeft = 0;
1667
+ const viewportTop = 0;
1668
+ const viewportRight = viewport.width;
1669
+ const viewportBottom = viewport.height;
1141
1670
 
1142
1671
  for (const [ref, element] of Object.entries(elements)) {
1143
1672
  // If element has no coordinates, include it (fallback)
@@ -1148,14 +1677,9 @@ export class HybridBrowserSession {
1148
1677
 
1149
1678
  const { x, y, width, height } = element.coordinates;
1150
1679
 
1151
- // Calculate viewport bounds using adjusted scroll position
1152
- const viewportLeft = adjustedScrollPos.x;
1153
- const viewportTop = adjustedScrollPos.y;
1154
- const viewportRight = adjustedScrollPos.x + viewport.width;
1155
- const viewportBottom = adjustedScrollPos.y + viewport.height;
1156
-
1157
1680
  // Check if element is visible in current viewport
1158
1681
  // Element is visible if it overlaps with viewport bounds
1682
+ // Since boundingBox() coords are viewport-relative, we compare directly
1159
1683
  const isVisible = (
1160
1684
  x < viewportRight && // Left edge is before viewport right
1161
1685
  y < viewportBottom && // Top edge is before viewport bottom