camel-ai 0.2.75a6 (py3-none-any.whl) → 0.2.76 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of camel-ai might be problematic. Click here for more details.

Files changed (97)
  1. camel/__init__.py +1 -1
  2. camel/agents/chat_agent.py +1001 -205
  3. camel/agents/mcp_agent.py +30 -27
  4. camel/configs/__init__.py +6 -0
  5. camel/configs/amd_config.py +70 -0
  6. camel/configs/cometapi_config.py +104 -0
  7. camel/data_collectors/alpaca_collector.py +15 -6
  8. camel/environments/tic_tac_toe.py +1 -1
  9. camel/interpreters/__init__.py +2 -0
  10. camel/interpreters/docker/Dockerfile +3 -12
  11. camel/interpreters/microsandbox_interpreter.py +395 -0
  12. camel/loaders/__init__.py +11 -2
  13. camel/loaders/chunkr_reader.py +9 -0
  14. camel/memories/__init__.py +2 -1
  15. camel/memories/agent_memories.py +3 -1
  16. camel/memories/blocks/chat_history_block.py +21 -3
  17. camel/memories/records.py +88 -8
  18. camel/messages/base.py +127 -34
  19. camel/models/__init__.py +4 -0
  20. camel/models/amd_model.py +101 -0
  21. camel/models/azure_openai_model.py +0 -6
  22. camel/models/base_model.py +30 -0
  23. camel/models/cometapi_model.py +83 -0
  24. camel/models/model_factory.py +4 -0
  25. camel/models/openai_compatible_model.py +0 -6
  26. camel/models/openai_model.py +0 -6
  27. camel/models/zhipuai_model.py +61 -2
  28. camel/parsers/__init__.py +18 -0
  29. camel/parsers/mcp_tool_call_parser.py +176 -0
  30. camel/retrievers/auto_retriever.py +1 -0
  31. camel/runtimes/daytona_runtime.py +11 -12
  32. camel/societies/workforce/prompts.py +131 -50
  33. camel/societies/workforce/single_agent_worker.py +434 -49
  34. camel/societies/workforce/structured_output_handler.py +30 -18
  35. camel/societies/workforce/task_channel.py +43 -0
  36. camel/societies/workforce/utils.py +105 -12
  37. camel/societies/workforce/workforce.py +1322 -311
  38. camel/societies/workforce/workforce_logger.py +24 -5
  39. camel/storages/key_value_storages/json.py +15 -2
  40. camel/storages/object_storages/google_cloud.py +1 -1
  41. camel/storages/vectordb_storages/oceanbase.py +10 -11
  42. camel/storages/vectordb_storages/tidb.py +8 -6
  43. camel/tasks/task.py +4 -3
  44. camel/toolkits/__init__.py +18 -5
  45. camel/toolkits/aci_toolkit.py +45 -0
  46. camel/toolkits/code_execution.py +28 -1
  47. camel/toolkits/context_summarizer_toolkit.py +684 -0
  48. camel/toolkits/dingtalk.py +1135 -0
  49. camel/toolkits/edgeone_pages_mcp_toolkit.py +11 -31
  50. camel/toolkits/{file_write_toolkit.py → file_toolkit.py} +194 -34
  51. camel/toolkits/function_tool.py +6 -1
  52. camel/toolkits/google_drive_mcp_toolkit.py +12 -31
  53. camel/toolkits/hybrid_browser_toolkit/config_loader.py +12 -0
  54. camel/toolkits/hybrid_browser_toolkit/hybrid_browser_toolkit.py +79 -2
  55. camel/toolkits/hybrid_browser_toolkit/hybrid_browser_toolkit_ts.py +95 -59
  56. camel/toolkits/hybrid_browser_toolkit/installer.py +203 -0
  57. camel/toolkits/hybrid_browser_toolkit/ts/package-lock.json +5 -612
  58. camel/toolkits/hybrid_browser_toolkit/ts/package.json +0 -1
  59. camel/toolkits/hybrid_browser_toolkit/ts/src/browser-session.ts +619 -95
  60. camel/toolkits/hybrid_browser_toolkit/ts/src/config-loader.ts +7 -2
  61. camel/toolkits/hybrid_browser_toolkit/ts/src/hybrid-browser-toolkit.ts +115 -219
  62. camel/toolkits/hybrid_browser_toolkit/ts/src/parent-child-filter.ts +226 -0
  63. camel/toolkits/hybrid_browser_toolkit/ts/src/snapshot-parser.ts +219 -0
  64. camel/toolkits/hybrid_browser_toolkit/ts/src/som-screenshot-injected.ts +543 -0
  65. camel/toolkits/hybrid_browser_toolkit/ts/src/types.ts +1 -0
  66. camel/toolkits/hybrid_browser_toolkit/ts/websocket-server.js +39 -6
  67. camel/toolkits/hybrid_browser_toolkit/ws_wrapper.py +405 -131
  68. camel/toolkits/hybrid_browser_toolkit_py/hybrid_browser_toolkit.py +9 -5
  69. camel/toolkits/{openai_image_toolkit.py → image_generation_toolkit.py} +98 -31
  70. camel/toolkits/markitdown_toolkit.py +27 -1
  71. camel/toolkits/mcp_toolkit.py +348 -348
  72. camel/toolkits/message_integration.py +3 -0
  73. camel/toolkits/minimax_mcp_toolkit.py +195 -0
  74. camel/toolkits/note_taking_toolkit.py +18 -8
  75. camel/toolkits/notion_mcp_toolkit.py +16 -26
  76. camel/toolkits/origene_mcp_toolkit.py +8 -49
  77. camel/toolkits/playwright_mcp_toolkit.py +12 -31
  78. camel/toolkits/resend_toolkit.py +168 -0
  79. camel/toolkits/slack_toolkit.py +50 -1
  80. camel/toolkits/terminal_toolkit/__init__.py +18 -0
  81. camel/toolkits/terminal_toolkit/terminal_toolkit.py +924 -0
  82. camel/toolkits/terminal_toolkit/utils.py +532 -0
  83. camel/toolkits/vertex_ai_veo_toolkit.py +590 -0
  84. camel/toolkits/video_analysis_toolkit.py +17 -11
  85. camel/toolkits/wechat_official_toolkit.py +483 -0
  86. camel/types/enums.py +124 -1
  87. camel/types/unified_model_type.py +5 -0
  88. camel/utils/commons.py +17 -0
  89. camel/utils/context_utils.py +804 -0
  90. camel/utils/mcp.py +136 -2
  91. camel/utils/token_counting.py +25 -17
  92. {camel_ai-0.2.75a6.dist-info → camel_ai-0.2.76.dist-info}/METADATA +158 -59
  93. {camel_ai-0.2.75a6.dist-info → camel_ai-0.2.76.dist-info}/RECORD +95 -76
  94. camel/loaders/pandas_reader.py +0 -368
  95. camel/toolkits/terminal_toolkit.py +0 -1788
  96. {camel_ai-0.2.75a6.dist-info → camel_ai-0.2.76.dist-info}/WHEEL +0 -0
  97. {camel_ai-0.2.75a6.dist-info → camel_ai-0.2.76.dist-info}/licenses/LICENSE +0 -0
@@ -73,12 +73,14 @@ export interface BrowserConfig {
73
73
  // CDP connection options
74
74
  connectOverCdp: boolean;
75
75
  cdpUrl?: string;
76
+ cdpKeepCurrentPage: boolean;
76
77
  }
77
78
 
78
79
  export interface WebSocketConfig {
79
80
  browser_log_to_file: boolean;
80
81
  session_id?: string;
81
82
  viewport_limit: boolean;
83
+ fullVisualMode?: boolean;
82
84
  }
83
85
 
84
86
  // Default stealth configuration
@@ -117,7 +119,7 @@ function getDefaultBrowserConfig(): BrowserConfig {
117
119
  consoleLogLimit: 1000,
118
120
  scrollPositionScale: 0.1,
119
121
  navigationDelay: 100,
120
- blankPageUrls: [],
122
+ blankPageUrls: ['chrome://newtab/', 'edge://newtab/', 'chrome://new-tab-page/'],
121
123
  dataUrlPrefix: 'data:',
122
124
  domContentLoadedState: 'domcontentloaded',
123
125
  networkIdleState: 'networkidle',
@@ -138,7 +140,8 @@ function getDefaultBrowserConfig(): BrowserConfig {
138
140
  height: 720
139
141
  },
140
142
  connectOverCdp: false,
141
- cdpUrl: undefined
143
+ cdpUrl: undefined,
144
+ cdpKeepCurrentPage: false
142
145
  };
143
146
  }
144
147
 
@@ -212,10 +215,12 @@ export class ConfigLoader {
212
215
  if (config.browser_log_to_file !== undefined) wsConfig.browser_log_to_file = config.browser_log_to_file;
213
216
  if (config.session_id !== undefined) wsConfig.session_id = config.session_id;
214
217
  if (config.viewport_limit !== undefined) wsConfig.viewport_limit = config.viewport_limit;
218
+ if (config.fullVisualMode !== undefined) wsConfig.fullVisualMode = config.fullVisualMode;
215
219
 
216
220
  // CDP connection options
217
221
  if (config.connectOverCdp !== undefined) browserConfig.connectOverCdp = config.connectOverCdp;
218
222
  if (config.cdpUrl !== undefined) browserConfig.cdpUrl = config.cdpUrl;
223
+ if (config.cdpKeepCurrentPage !== undefined) browserConfig.cdpKeepCurrentPage = config.cdpKeepCurrentPage;
219
224
 
220
225
  return new ConfigLoader(browserConfig, wsConfig);
221
226
  }
@@ -2,18 +2,22 @@ import {HybridBrowserSession} from './browser-session';
2
2
  import {ActionResult, BrowserAction, BrowserToolkitConfig, SnapshotResult, TabInfo, VisualMarkResult} from './types';
3
3
  import {ConfigLoader} from './config-loader';
4
4
  import {ConsoleMessage} from 'playwright';
5
+ import {SomScreenshotInjected} from './som-screenshot-injected';
6
+ import {filterClickableByHierarchy} from './snapshot-parser';
5
7
 
6
8
  export class HybridBrowserToolkit {
7
9
  private session: HybridBrowserSession;
8
10
  private config: BrowserToolkitConfig;
9
11
  private configLoader: ConfigLoader;
10
12
  private viewportLimit: boolean;
13
+ private fullVisualMode: boolean;
11
14
 
12
15
  constructor(config: BrowserToolkitConfig = {}) {
13
16
  this.configLoader = ConfigLoader.fromPythonConfig(config);
14
17
  this.config = config; // Store original config for backward compatibility
15
- this.session = new HybridBrowserSession(this.configLoader.getBrowserConfig()); // Pass processed config
18
+ this.session = new HybridBrowserSession(config); // Pass original config
16
19
  this.viewportLimit = this.configLoader.getWebSocketConfig().viewport_limit;
20
+ this.fullVisualMode = this.configLoader.getWebSocketConfig().fullVisualMode || false;
17
21
  }
18
22
 
19
23
  async openBrowser(startUrl?: string): Promise<ActionResult> {
@@ -22,22 +26,66 @@ export class HybridBrowserToolkit {
22
26
  try {
23
27
  await this.session.ensureBrowser();
24
28
 
25
- const url = startUrl || this.config.defaultStartUrl || 'https://google.com/';
26
- const result = await this.session.visitPage(url);
29
+ // Check if we should skip navigation in CDP keep-current-page mode
30
+ const browserConfig = this.configLoader.getBrowserConfig();
31
+ if (browserConfig.cdpUrl && browserConfig.cdpKeepCurrentPage && !startUrl) {
32
+ // In CDP keep-current-page mode without explicit URL, just ensure browser and return current page
33
+ const snapshotStart = Date.now();
34
+ const snapshot = await this.getSnapshotForAction(this.viewportLimit);
35
+ const snapshotTime = Date.now() - snapshotStart;
36
+
37
+ const page = await this.session.getCurrentPage();
38
+ const currentUrl = page ? await page.url() : 'unknown';
39
+
40
+ const totalTime = Date.now() - startTime;
41
+
42
+ return {
43
+ success: true,
44
+ message: `Browser opened in CDP keep-current-page mode (current page: ${currentUrl})`,
45
+ snapshot,
46
+ timing: {
47
+ total_time_ms: totalTime,
48
+ snapshot_time_ms: snapshotTime,
49
+ },
50
+ };
51
+ }
52
+
53
+ // For normal mode or CDP with cdpKeepCurrentPage=false: navigate to URL
54
+ if (!browserConfig.cdpUrl || !browserConfig.cdpKeepCurrentPage) {
55
+ const url = startUrl || this.config.defaultStartUrl || 'https://google.com/';
56
+ const result = await this.session.visitPage(url);
57
+
58
+ const snapshotStart = Date.now();
59
+ const snapshot = await this.getSnapshotForAction(this.viewportLimit);
60
+ const snapshotTime = Date.now() - snapshotStart;
61
+
62
+ const totalTime = Date.now() - startTime;
63
+
64
+ return {
65
+ success: true,
66
+ message: result.message,
67
+ snapshot,
68
+ timing: {
69
+ total_time_ms: totalTime,
70
+ page_load_time_ms: result.timing?.page_load_time_ms || 0,
71
+ snapshot_time_ms: snapshotTime,
72
+ },
73
+ };
74
+ }
27
75
 
76
+ // Fallback: Just return current page snapshot without any navigation
28
77
  const snapshotStart = Date.now();
29
- const snapshot = await this.getPageSnapshot(this.viewportLimit);
78
+ const snapshot = await this.getSnapshotForAction(this.viewportLimit);
30
79
  const snapshotTime = Date.now() - snapshotStart;
31
80
 
32
81
  const totalTime = Date.now() - startTime;
33
82
 
34
83
  return {
35
84
  success: true,
36
- message: `Browser opened and navigated to ${url}`,
85
+ message: `Browser opened without navigation`,
37
86
  snapshot,
38
87
  timing: {
39
88
  total_time_ms: totalTime,
40
- ...result.timing,
41
89
  snapshot_time_ms: snapshotTime,
42
90
  },
43
91
  };
@@ -83,7 +131,7 @@ export class HybridBrowserToolkit {
83
131
 
84
132
  if (result.success) {
85
133
  const snapshotStart = Date.now();
86
- response.snapshot = await this.getPageSnapshot(this.viewportLimit);
134
+ response.snapshot = await this.getSnapshotForAction(this.viewportLimit);
87
135
  const snapshotTime = Date.now() - snapshotStart;
88
136
 
89
137
  if (result.timing) {
@@ -119,6 +167,7 @@ export class HybridBrowserToolkit {
119
167
 
120
168
  async getPageSnapshot(viewportLimit: boolean = false): Promise<string> {
121
169
  try {
170
+ // Always return real snapshot when explicitly called
122
171
  // If viewport limiting is enabled, we need coordinates for filtering
123
172
  const snapshotResult = await this.session.getSnapshotForAI(viewportLimit, viewportLimit);
124
173
  return snapshotResult.snapshot;
@@ -126,6 +175,14 @@ export class HybridBrowserToolkit {
126
175
  return `Error capturing snapshot: ${error}`;
127
176
  }
128
177
  }
178
+
179
+ // Internal method for getting snapshot in actions (respects fullVisualMode)
180
+ private async getSnapshotForAction(viewportLimit: boolean = false): Promise<string> {
181
+ if (this.fullVisualMode) {
182
+ return 'full visual mode';
183
+ }
184
+ return this.getPageSnapshot(viewportLimit);
185
+ }
129
186
 
130
187
 
131
188
  async getSnapshotForAI(): Promise<SnapshotResult> {
@@ -134,35 +191,34 @@ export class HybridBrowserToolkit {
134
191
 
135
192
  async getSomScreenshot(): Promise<VisualMarkResult & { timing: any }> {
136
193
  const startTime = Date.now();
194
+ console.log('[HybridBrowserToolkit] Starting getSomScreenshot...');
137
195
 
138
196
  try {
139
- const screenshotResult = await this.session.takeScreenshot();
140
- const snapshotResult = await this.session.getSnapshotForAI(true); // Include coordinates for SOM_mark
141
-
142
- // Add visual marks using improved method
143
- const markingStart = Date.now();
144
- const markedImageBuffer = await this.addVisualMarksOptimized(screenshotResult.buffer, snapshotResult);
145
- const markingTime = Date.now() - markingStart;
197
+ // Get page and snapshot data
198
+ const page = await this.session.getCurrentPage();
199
+ const snapshotResult = await this.session.getSnapshotForAI(true); // Include coordinates
146
200
 
147
- const base64Image = markedImageBuffer.toString('base64');
148
- const dataUrl = `data:image/png;base64,${base64Image}`;
201
+ // Parse clickable elements from snapshot text
202
+ const clickableElements = this.parseClickableElements(snapshotResult.snapshot);
203
+ console.log(`[HybridBrowserToolkit] Found ${clickableElements.size} clickable elements`);
149
204
 
150
- const totalTime = Date.now() - startTime;
205
+ // Apply hierarchy-based filtering
206
+ const filteredElements = filterClickableByHierarchy(snapshotResult.snapshot, clickableElements);
207
+ console.log(`[HybridBrowserToolkit] After filtering: ${filteredElements.size} elements remain`);
208
+
209
+ // Use injected SOM-screenshot method without export path
210
+ const result = await SomScreenshotInjected.captureOptimized(
211
+ page,
212
+ snapshotResult,
213
+ filteredElements,
214
+ undefined // No export path - don't generate files
215
+ );
151
216
 
152
- // Count elements with coordinates
153
- const elementsWithCoords = Object.values(snapshotResult.elements).filter(el => el.coordinates).length;
217
+ // Add snapshot timing info to result
218
+ result.timing.snapshot_time_ms = snapshotResult.timing.snapshot_time_ms;
219
+ result.timing.coordinate_enrichment_time_ms = snapshotResult.timing.coordinate_enrichment_time_ms;
154
220
 
155
- return {
156
- text: `Visual webpage screenshot captured with ${Object.keys(snapshotResult.elements).length} interactive elements (${elementsWithCoords} marked visually)`,
157
- images: [dataUrl],
158
- timing: {
159
- total_time_ms: totalTime,
160
- screenshot_time_ms: screenshotResult.timing.screenshot_time_ms,
161
- snapshot_time_ms: snapshotResult.timing.snapshot_time_ms,
162
- coordinate_enrichment_time_ms: snapshotResult.timing.coordinate_enrichment_time_ms,
163
- visual_marking_time_ms: markingTime,
164
- },
165
- };
221
+ return result;
166
222
  } catch (error) {
167
223
  const totalTime = Date.now() - startTime;
168
224
  return {
@@ -179,132 +235,6 @@ export class HybridBrowserToolkit {
179
235
  }
180
236
  }
181
237
 
182
- private async addVisualMarksOptimized(screenshotBuffer: Buffer, snapshotResult: SnapshotResult): Promise<Buffer> {
183
- try {
184
-
185
- // Check if we have any elements with coordinates
186
- const elementsWithCoords = Object.entries(snapshotResult.elements)
187
- .filter(([ref, element]) => element.coordinates);
188
-
189
- if (elementsWithCoords.length === 0) {
190
- return screenshotBuffer;
191
- }
192
-
193
- // Parse clickable elements from snapshot text
194
- const clickableElements = this.parseClickableElements(snapshotResult.snapshot);
195
-
196
- // Use sharp for image processing
197
- const sharp = require('sharp');
198
- const page = await this.session.getCurrentPage();
199
- let viewport = page.viewportSize();
200
-
201
- // In CDP mode, viewportSize might be null, get it from window dimensions
202
- if (!viewport) {
203
- const windowSize = await page.evaluate(() => ({
204
- width: window.innerWidth,
205
- height: window.innerHeight
206
- }));
207
- viewport = windowSize;
208
- }
209
-
210
- // Get device pixel ratio to handle high DPI screens
211
- const dpr = await page.evaluate(() => window.devicePixelRatio) || 1;
212
-
213
- // Get actual screenshot dimensions
214
- const metadata = await sharp(screenshotBuffer).metadata();
215
- const screenshotWidth = metadata.width || viewport.width;
216
- const screenshotHeight = metadata.height || viewport.height;
217
-
218
- // Calculate scaling factor between CSS pixels and screenshot pixels
219
- const scaleX = screenshotWidth / viewport.width;
220
- const scaleY = screenshotHeight / viewport.height;
221
-
222
- // Debug logging for CDP mode
223
- if (process.env.HYBRID_BROWSER_DEBUG === '1') {
224
- console.log('[CDP Debug] Viewport size:', viewport);
225
- console.log('[CDP Debug] Device pixel ratio:', dpr);
226
- console.log('[CDP Debug] Screenshot dimensions:', { width: screenshotWidth, height: screenshotHeight });
227
- console.log('[CDP Debug] Scale factors:', { scaleX, scaleY });
228
- console.log('[CDP Debug] Elements with coordinates:', elementsWithCoords.length);
229
- elementsWithCoords.slice(0, 3).forEach(([ref, element]) => {
230
- console.log(`[CDP Debug] Element ${ref}:`, element.coordinates);
231
- });
232
- }
233
-
234
- // Filter elements visible in viewport
235
- const visibleElements = elementsWithCoords.filter(([ref, element]) => {
236
- const coords = element.coordinates!;
237
- return coords.x < viewport.width &&
238
- coords.y < viewport.height &&
239
- coords.x + coords.width > 0 &&
240
- coords.y + coords.height > 0;
241
- });
242
-
243
- // Remove overlapped elements (only keep topmost)
244
- const nonOverlappedElements = this.removeOverlappedElements(visibleElements);
245
-
246
- // Create SVG overlay with all the marks
247
- const marks = nonOverlappedElements.map(([ref, element]) => {
248
- const coords = element.coordinates!;
249
- const isClickable = clickableElements.has(ref);
250
-
251
- // Scale coordinates from CSS pixels to screenshot pixels
252
- const x = Math.max(0, coords.x * scaleX);
253
- const y = Math.max(0, coords.y * scaleY);
254
- const width = coords.width * scaleX;
255
- const height = coords.height * scaleY;
256
-
257
- // Clamp to screenshot bounds
258
- const clampedWidth = Math.min(width, screenshotWidth - x);
259
- const clampedHeight = Math.min(height, screenshotHeight - y);
260
-
261
- // Position text to be visible even if element is partially cut off
262
- const textX = Math.max(2, Math.min(x + 2, screenshotWidth - 40));
263
- const textY = Math.max(14, Math.min(y + 14, screenshotHeight - 4));
264
-
265
- // Different colors for clickable vs non-clickable elements
266
- const colors = isClickable ? {
267
- fill: 'rgba(0, 150, 255, 0.15)', // Blue for clickable
268
- stroke: '#0096FF',
269
- textFill: '#0096FF'
270
- } : {
271
- fill: 'rgba(255, 107, 107, 0.1)', // Red for non-clickable
272
- stroke: '#FF6B6B',
273
- textFill: '#FF6B6B'
274
- };
275
-
276
- return `
277
- <rect x="${x}" y="${y}" width="${clampedWidth}" height="${clampedHeight}"
278
- fill="${colors.fill}" stroke="${colors.stroke}" stroke-width="2" rx="2"/>
279
- <text x="${textX}" y="${textY}" font-family="Arial, sans-serif"
280
- font-size="12" fill="${colors.textFill}" font-weight="bold">${ref}</text>
281
- `;
282
- }).join('');
283
-
284
- const svgOverlay = `
285
- <svg width="${screenshotWidth}" height="${screenshotHeight}" xmlns="http://www.w3.org/2000/svg">
286
- ${marks}
287
- </svg>
288
- `;
289
-
290
- // Composite the overlay onto the screenshot
291
- const markedImageBuffer = await sharp(screenshotBuffer)
292
- .composite([{
293
- input: Buffer.from(svgOverlay),
294
- top: 0,
295
- left: 0
296
- }])
297
- .png()
298
- .toBuffer();
299
-
300
- return markedImageBuffer;
301
-
302
- } catch (error) {
303
- // Error adding visual marks, falling back to original screenshot
304
- // Return original screenshot if marking fails
305
- return screenshotBuffer;
306
- }
307
- }
308
238
 
309
239
  /**
310
240
  * Parse clickable elements from snapshot text
@@ -314,8 +244,8 @@ export class HybridBrowserToolkit {
314
244
  const lines = snapshotText.split('\n');
315
245
 
316
246
  for (const line of lines) {
317
- // Look for lines containing [cursor=pointer] and extract ref
318
- if (line.includes('[cursor=pointer]')) {
247
+ // Look for lines containing [cursor=pointer] or [active] and extract ref
248
+ if (line.includes('[cursor=pointer]') || line.includes('[active]')) {
319
249
  const refMatch = line.match(/\[ref=([^\]]+)\]/);
320
250
  if (refMatch) {
321
251
  clickableElements.add(refMatch[1]);
@@ -326,73 +256,31 @@ export class HybridBrowserToolkit {
326
256
  return clickableElements;
327
257
  }
328
258
 
329
- /**
330
- * Remove overlapped elements, keeping only the topmost (last in DOM order)
331
- */
332
- private removeOverlappedElements(elements: Array<[string, any]>): Array<[string, any]> {
333
- const result: Array<[string, any]> = [];
334
-
335
- for (let i = 0; i < elements.length; i++) {
336
- const [refA, elementA] = elements[i];
337
- const coordsA = elementA.coordinates!;
338
- let isOverlapped = false;
339
-
340
- // Check if this element is completely overlapped by any later element
341
- for (let j = i + 1; j < elements.length; j++) {
342
- const [refB, elementB] = elements[j];
343
- const coordsB = elementB.coordinates!;
344
-
345
- // Check if element A is completely covered by element B
346
- if (this.isCompletelyOverlapped(coordsA, coordsB)) {
347
- isOverlapped = true;
348
- break;
349
- }
350
- }
351
-
352
- if (!isOverlapped) {
353
- result.push(elements[i]);
354
- }
355
- }
356
-
357
- return result;
358
- }
359
-
360
- /**
361
- * Check if element A is completely overlapped by element B
362
- */
363
- private isCompletelyOverlapped(
364
- coordsA: { x: number; y: number; width: number; height: number },
365
- coordsB: { x: number; y: number; width: number; height: number }
366
- ): boolean {
367
- // A is completely overlapped by B if:
368
- // B's left edge is <= A's left edge AND
369
- // B's top edge is <= A's top edge AND
370
- // B's right edge is >= A's right edge AND
371
- // B's bottom edge is >= A's bottom edge
372
- return (
373
- coordsB.x <= coordsA.x &&
374
- coordsB.y <= coordsA.y &&
375
- coordsB.x + coordsB.width >= coordsA.x + coordsA.width &&
376
- coordsB.y + coordsB.height >= coordsA.y + coordsA.height
377
- );
378
- }
379
259
 
380
260
  private async executeActionWithSnapshot(action: BrowserAction): Promise<any> {
381
261
  const result = await this.session.executeAction(action);
382
262
 
383
- // Format response for Python layer compatibility
384
263
  const response: any = {
385
264
  result: result.message,
386
265
  snapshot: '',
387
266
  };
388
267
 
389
268
  if (result.success) {
390
- const snapshotStart = Date.now();
391
- response.snapshot = await this.getPageSnapshot(this.viewportLimit);
392
- const snapshotTime = Date.now() - snapshotStart;
393
-
394
- if (result.timing) {
395
- result.timing.snapshot_time_ms = snapshotTime;
269
+ if (result.details?.diffSnapshot) {
270
+ response.snapshot = result.details.diffSnapshot;
271
+
272
+ if (result.timing) {
273
+ result.timing.snapshot_time_ms = 0; // Diff snapshot time is included in action time
274
+ }
275
+ } else {
276
+ // Get full snapshot as usual
277
+ const snapshotStart = Date.now();
278
+ response.snapshot = await this.getPageSnapshot(this.viewportLimit);
279
+ const snapshotTime = Date.now() - snapshotStart;
280
+
281
+ if (result.timing) {
282
+ result.timing.snapshot_time_ms = snapshotTime;
283
+ }
396
284
  }
397
285
  }
398
286
 
@@ -406,6 +294,14 @@ export class HybridBrowserToolkit {
406
294
  response.newTabId = result.newTabId;
407
295
  }
408
296
 
297
+ // Include details if present (excluding diffSnapshot as it's already in snapshot)
298
+ if (result.details) {
299
+ const { diffSnapshot, ...otherDetails } = result.details;
300
+ if (Object.keys(otherDetails).length > 0) {
301
+ response.details = otherDetails;
302
+ }
303
+ }
304
+
409
305
  return response;
410
306
  }
411
307
 
@@ -472,7 +368,7 @@ export class HybridBrowserToolkit {
472
368
  const navigationTime = Date.now() - navigationStart;
473
369
 
474
370
  const snapshotStart = Date.now();
475
- const snapshot = await this.getPageSnapshot(this.viewportLimit);
371
+ const snapshot = await this.getSnapshotForAction(this.viewportLimit);
476
372
  const snapshotTime = Date.now() - snapshotStart;
477
373
 
478
374
  const totalTime = Date.now() - startTime;
@@ -512,7 +408,7 @@ export class HybridBrowserToolkit {
512
408
  const navigationTime = Date.now() - navigationStart;
513
409
 
514
410
  const snapshotStart = Date.now();
515
- const snapshot = await this.getPageSnapshot(this.viewportLimit);
411
+ const snapshot = await this.getSnapshotForAction(this.viewportLimit);
516
412
  const snapshotTime = Date.now() - snapshotStart;
517
413
 
518
414
  const totalTime = Date.now() - startTime;
@@ -584,7 +480,7 @@ export class HybridBrowserToolkit {
584
480
  return {
585
481
  success: true,
586
482
  message: `Closed tab ${tabId}`,
587
- snapshot: await this.getPageSnapshot(this.viewportLimit),
483
+ snapshot: await this.getSnapshotForAction(this.viewportLimit),
588
484
  };
589
485
  } else {
590
486
  return {
@@ -649,7 +545,7 @@ export class HybridBrowserToolkit {
649
545
  const { result, logs } = evalResult;
650
546
 
651
547
  const snapshotStart = Date.now();
652
- const snapshot = await this.getPageSnapshot(this.viewportLimit);
548
+ const snapshot = await this.getSnapshotForAction(this.viewportLimit);
653
549
  const snapshotTime = Date.now() - snapshotStart;
654
550
  const totalTime = Date.now() - startTime;
655
551