camel-ai 0.2.75a5__py3-none-any.whl → 0.2.76a0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of camel-ai might be problematic. Click here for more details.

Files changed (47) hide show
  1. camel/__init__.py +1 -1
  2. camel/agents/chat_agent.py +298 -130
  3. camel/configs/__init__.py +6 -0
  4. camel/configs/amd_config.py +70 -0
  5. camel/configs/nebius_config.py +103 -0
  6. camel/interpreters/__init__.py +2 -0
  7. camel/interpreters/microsandbox_interpreter.py +395 -0
  8. camel/models/__init__.py +4 -0
  9. camel/models/amd_model.py +101 -0
  10. camel/models/model_factory.py +4 -0
  11. camel/models/nebius_model.py +83 -0
  12. camel/models/ollama_model.py +3 -3
  13. camel/models/openai_model.py +0 -6
  14. camel/runtimes/daytona_runtime.py +11 -12
  15. camel/societies/workforce/task_channel.py +120 -27
  16. camel/societies/workforce/workforce.py +35 -3
  17. camel/toolkits/__init__.py +5 -3
  18. camel/toolkits/code_execution.py +28 -1
  19. camel/toolkits/function_tool.py +6 -1
  20. camel/toolkits/github_toolkit.py +104 -17
  21. camel/toolkits/hybrid_browser_toolkit/config_loader.py +8 -0
  22. camel/toolkits/hybrid_browser_toolkit/hybrid_browser_toolkit.py +12 -0
  23. camel/toolkits/hybrid_browser_toolkit/hybrid_browser_toolkit_ts.py +33 -14
  24. camel/toolkits/hybrid_browser_toolkit/ts/src/browser-session.ts +135 -40
  25. camel/toolkits/hybrid_browser_toolkit/ts/src/config-loader.ts +2 -0
  26. camel/toolkits/hybrid_browser_toolkit/ts/src/hybrid-browser-toolkit.ts +43 -207
  27. camel/toolkits/hybrid_browser_toolkit/ts/src/parent-child-filter.ts +226 -0
  28. camel/toolkits/hybrid_browser_toolkit/ts/src/snapshot-parser.ts +231 -0
  29. camel/toolkits/hybrid_browser_toolkit/ts/src/som-screenshot-injected.ts +543 -0
  30. camel/toolkits/hybrid_browser_toolkit/ts/websocket-server.js +39 -6
  31. camel/toolkits/hybrid_browser_toolkit/ws_wrapper.py +248 -58
  32. camel/toolkits/hybrid_browser_toolkit_py/hybrid_browser_toolkit.py +5 -1
  33. camel/toolkits/{openai_image_toolkit.py → image_generation_toolkit.py} +98 -31
  34. camel/toolkits/math_toolkit.py +64 -10
  35. camel/toolkits/mcp_toolkit.py +39 -14
  36. camel/toolkits/minimax_mcp_toolkit.py +195 -0
  37. camel/toolkits/search_toolkit.py +13 -2
  38. camel/toolkits/terminal_toolkit.py +12 -2
  39. camel/toolkits/video_analysis_toolkit.py +16 -10
  40. camel/types/enums.py +42 -0
  41. camel/types/unified_model_type.py +5 -0
  42. camel/utils/commons.py +2 -0
  43. camel/utils/mcp.py +136 -2
  44. {camel_ai-0.2.75a5.dist-info → camel_ai-0.2.76a0.dist-info}/METADATA +5 -11
  45. {camel_ai-0.2.75a5.dist-info → camel_ai-0.2.76a0.dist-info}/RECORD +47 -38
  46. {camel_ai-0.2.75a5.dist-info → camel_ai-0.2.76a0.dist-info}/WHEEL +0 -0
  47. {camel_ai-0.2.75a5.dist-info → camel_ai-0.2.76a0.dist-info}/licenses/LICENSE +0 -0
@@ -59,14 +59,30 @@ export class HybridBrowserSession {
59
59
  const contexts = this.browser.contexts();
60
60
  if (contexts.length > 0) {
61
61
  this.context = contexts[0];
62
+
63
+ // Apply stealth headers to existing context if configured
64
+ // Note: userAgent cannot be changed on an existing context
65
+ if (stealthConfig.enabled) {
66
+ if (stealthConfig.extraHTTPHeaders) {
67
+ await this.context.setExtraHTTPHeaders(stealthConfig.extraHTTPHeaders);
68
+ }
69
+ if (stealthConfig.userAgent) {
70
+ console.warn('[HybridBrowserSession] Cannot apply userAgent to existing context. Consider creating a new context if userAgent customization is required.');
71
+ }
72
+ }
62
73
  } else {
63
74
  const contextOptions: any = {
64
75
  viewport: browserConfig.viewport
65
76
  };
66
77
 
67
- // Apply stealth headers if configured
68
- if (stealthConfig.enabled && stealthConfig.extraHTTPHeaders) {
69
- contextOptions.extraHTTPHeaders = stealthConfig.extraHTTPHeaders;
78
+ // Apply stealth headers and UA if configured
79
+ if (stealthConfig.enabled) {
80
+ if (stealthConfig.extraHTTPHeaders) {
81
+ contextOptions.extraHTTPHeaders = stealthConfig.extraHTTPHeaders;
82
+ }
83
+ if (stealthConfig.userAgent) {
84
+ contextOptions.userAgent = stealthConfig.userAgent;
85
+ }
70
86
  }
71
87
 
72
88
  this.context = await this.browser.newContext(contextOptions);
@@ -105,13 +121,18 @@ export class HybridBrowserSession {
105
121
  if (stealthConfig.enabled) {
106
122
  launchOptions.args = stealthConfig.args || [];
107
123
 
108
- // Apply stealth user agent if configured
124
+ // Apply stealth user agent/headers if configured
109
125
  if (stealthConfig.userAgent) {
110
126
  launchOptions.userAgent = stealthConfig.userAgent;
111
127
  }
128
+ if (stealthConfig.extraHTTPHeaders) {
129
+ launchOptions.extraHTTPHeaders = stealthConfig.extraHTTPHeaders;
130
+ }
112
131
  }
113
132
 
114
133
  if (browserConfig.userDataDir) {
134
+ // Ensure viewport is honored in persistent context
135
+ launchOptions.viewport = browserConfig.viewport;
115
136
  this.context = await chromium.launchPersistentContext(
116
137
  browserConfig.userDataDir,
117
138
  launchOptions
@@ -129,9 +150,14 @@ export class HybridBrowserSession {
129
150
  viewport: browserConfig.viewport
130
151
  };
131
152
 
132
- // Apply stealth headers if configured
133
- if (stealthConfig.enabled && stealthConfig.extraHTTPHeaders) {
134
- contextOptions.extraHTTPHeaders = stealthConfig.extraHTTPHeaders;
153
+ // Apply stealth headers and UA if configured
154
+ if (stealthConfig.enabled) {
155
+ if (stealthConfig.extraHTTPHeaders) {
156
+ contextOptions.extraHTTPHeaders = stealthConfig.extraHTTPHeaders;
157
+ }
158
+ if (stealthConfig.userAgent) {
159
+ contextOptions.userAgent = stealthConfig.userAgent;
160
+ }
135
161
  }
136
162
 
137
163
  this.context = await this.browser.newContext(contextOptions);
@@ -173,26 +199,10 @@ export class HybridBrowserSession {
173
199
 
174
200
  async getCurrentPage(): Promise<Page> {
175
201
  if (!this.currentTabId || !this.pages.has(this.currentTabId)) {
176
- // In CDP mode, try to create a new page if none exists
202
+ // In CDP mode, we cannot create new pages
177
203
  const browserConfig = this.configLoader.getBrowserConfig();
178
- if (browserConfig.connectOverCdp && this.context) {
179
- console.log('[CDP] No active page found, attempting to create new page...');
180
- try {
181
- const newPage = await this.context.newPage();
182
- const newTabId = this.generateTabId();
183
- this.registerNewPage(newTabId, newPage);
184
- this.currentTabId = newTabId;
185
-
186
- // Set page timeouts
187
- newPage.setDefaultNavigationTimeout(browserConfig.navigationTimeout);
188
- newPage.setDefaultTimeout(browserConfig.navigationTimeout);
189
-
190
- console.log(`[CDP] Created new page with tab ID: ${newTabId}`);
191
- return newPage;
192
- } catch (error) {
193
- console.error('[CDP] Failed to create new page:', error);
194
- throw new Error('No active page available and failed to create new page in CDP mode');
195
- }
204
+ if (browserConfig.connectOverCdp) {
205
+ throw new Error('No active page available in CDP mode; frontend must pre-create blank tabs.');
196
206
  }
197
207
  throw new Error('No active page available');
198
208
  }
@@ -235,6 +245,36 @@ export class HybridBrowserSession {
235
245
  return this.getSnapshotForAINative(includeCoordinates, viewportLimit);
236
246
  }
237
247
 
248
+ private parseElementFromSnapshot(snapshotText: string, ref: string): { role?: string; text?: string } {
249
+ const lines = snapshotText.split('\n');
250
+ for (const line of lines) {
251
+ if (line.includes(`[ref=${ref}]`)) {
252
+ const typeMatch = line.match(/^\s*-?\s*([\w-]+)/);
253
+ const role = typeMatch ? typeMatch[1] : undefined;
254
+ const textMatch = line.match(/"([^"]*)"/);
255
+ const text = textMatch ? textMatch[1] : undefined;
256
+ return { role, text };
257
+ }
258
+ }
259
+ return {};
260
+ }
261
+
262
+ private buildSnapshotIndex(snapshotText: string): Map<string, { role?: string; text?: string }> {
263
+ const index = new Map<string, { role?: string; text?: string }>();
264
+ const refRe = /\[ref=([^\]]+)\]/i;
265
+ for (const line of snapshotText.split('\n')) {
266
+ const m = line.match(refRe);
267
+ if (!m) continue;
268
+ const ref = m[1];
269
+ const roleMatch = line.match(/^\s*-?\s*([a-z0-9_-]+)/i);
270
+ const role = roleMatch ? roleMatch[1].toLowerCase() : undefined;
271
+ const textMatch = line.match(/"([^"]*)"/);
272
+ const text = textMatch ? textMatch[1] : undefined;
273
+ index.set(ref, { role, text });
274
+ }
275
+ return index;
276
+ }
277
+
238
278
  private async getSnapshotForAINative(includeCoordinates = false, viewportLimit = false): Promise<SnapshotResult & { timing: DetailedTiming }> {
239
279
  const startTime = Date.now();
240
280
  const page = await this.getCurrentPage();
@@ -257,6 +297,17 @@ export class HybridBrowserSession {
257
297
  const mappingStart = Date.now();
258
298
  const playwrightMapping: Record<string, any> = {};
259
299
 
300
+ // Parse element info in a single pass
301
+ const snapshotIndex = this.buildSnapshotIndex(snapshotText);
302
+ for (const ref of refs) {
303
+ const elementInfo = snapshotIndex.get(ref) || {};
304
+ playwrightMapping[ref] = {
305
+ ref,
306
+ role: elementInfo.role || 'unknown',
307
+ text: elementInfo.text || '',
308
+ };
309
+ }
310
+
260
311
  if (includeCoordinates) {
261
312
  // Get coordinates for each ref using aria-ref selector
262
313
  for (const ref of refs) {
@@ -270,8 +321,9 @@ export class HybridBrowserSession {
270
321
  const boundingBox = await element.boundingBox();
271
322
 
272
323
  if (boundingBox) {
324
+ // Add coordinates to existing element info
273
325
  playwrightMapping[ref] = {
274
- ref,
326
+ ...playwrightMapping[ref],
275
327
  coordinates: {
276
328
  x: Math.round(boundingBox.x),
277
329
  y: Math.round(boundingBox.y),
@@ -388,7 +440,6 @@ export class HybridBrowserSession {
388
440
 
389
441
  if (shouldOpenNewTab) {
390
442
  // Handle new tab opening
391
-
392
443
  // If it's a link that doesn't naturally open in new tab, force it
393
444
  if (isNavigableLink && !naturallyOpensNewTab) {
394
445
  await element.evaluate((el, blankTarget) => {
@@ -803,6 +854,55 @@ export class HybridBrowserSession {
803
854
  }
804
855
  }
805
856
 
857
+ /**
858
+ * Wait for DOM to stop changing for a specified duration
859
+ */
860
+ private async waitForDOMStability(page: Page, maxWaitTime: number = 500): Promise<void> {
861
+ const startTime = Date.now();
862
+ const stabilityThreshold = 100; // Consider stable if no changes for 100ms
863
+ let lastChangeTime = Date.now();
864
+
865
+ try {
866
+ // Monitor DOM changes
867
+ await page.evaluate(() => {
868
+ let changeCount = 0;
869
+ (window as any).__domStabilityCheck = { changeCount: 0, lastChange: Date.now() };
870
+
871
+ const observer = new MutationObserver(() => {
872
+ (window as any).__domStabilityCheck.changeCount++;
873
+ (window as any).__domStabilityCheck.lastChange = Date.now();
874
+ });
875
+
876
+ observer.observe(document.body, {
877
+ childList: true,
878
+ subtree: true,
879
+ attributes: true,
880
+ characterData: true
881
+ });
882
+
883
+ (window as any).__domStabilityObserver = observer;
884
+ });
885
+
886
+ // Wait until no changes for stabilityThreshold or timeout
887
+ await page.waitForFunction(
888
+ (threshold) => {
889
+ const check = (window as any).__domStabilityCheck;
890
+ return check && (Date.now() - check.lastChange) > threshold;
891
+ },
892
+ stabilityThreshold,
893
+ { timeout: Math.max(0, maxWaitTime) }
894
+ ).catch(() => {});
895
+ } finally {
896
+ // Cleanup
897
+ await page.evaluate(() => {
898
+ const observer = (window as any).__domStabilityObserver;
899
+ if (observer) observer.disconnect();
900
+ delete (window as any).__domStabilityObserver;
901
+ delete (window as any).__domStabilityCheck;
902
+ }).catch(() => {});
903
+ }
904
+ }
905
+
806
906
  private async waitForPageStability(page: Page): Promise<{ domContentLoadedTime: number; networkIdleTime: number }> {
807
907
  let domContentLoadedTime = 0;
808
908
  let networkIdleTime = 0;
@@ -1132,12 +1232,12 @@ export class HybridBrowserSession {
1132
1232
  const filtered: Record<string, SnapshotElement> = {};
1133
1233
 
1134
1234
 
1135
- // Apply viewport filtering with scroll position adjustment
1136
- const browserConfig = this.configLoader.getBrowserConfig();
1137
- const adjustedScrollPos = {
1138
- x: scrollPos.x * browserConfig.scrollPositionScale,
1139
- y: scrollPos.y * browserConfig.scrollPositionScale
1140
- };
1235
+ // Apply viewport filtering
1236
+ // boundingBox() returns viewport-relative coordinates, so we don't need to add scroll offsets
1237
+ const viewportLeft = 0;
1238
+ const viewportTop = 0;
1239
+ const viewportRight = viewport.width;
1240
+ const viewportBottom = viewport.height;
1141
1241
 
1142
1242
  for (const [ref, element] of Object.entries(elements)) {
1143
1243
  // If element has no coordinates, include it (fallback)
@@ -1148,14 +1248,9 @@ export class HybridBrowserSession {
1148
1248
 
1149
1249
  const { x, y, width, height } = element.coordinates;
1150
1250
 
1151
- // Calculate viewport bounds using adjusted scroll position
1152
- const viewportLeft = adjustedScrollPos.x;
1153
- const viewportTop = adjustedScrollPos.y;
1154
- const viewportRight = adjustedScrollPos.x + viewport.width;
1155
- const viewportBottom = adjustedScrollPos.y + viewport.height;
1156
-
1157
1251
  // Check if element is visible in current viewport
1158
1252
  // Element is visible if it overlaps with viewport bounds
1253
+ // Since boundingBox() coords are viewport-relative, we compare directly
1159
1254
  const isVisible = (
1160
1255
  x < viewportRight && // Left edge is before viewport right
1161
1256
  y < viewportBottom && // Top edge is before viewport bottom
@@ -79,6 +79,7 @@ export interface WebSocketConfig {
79
79
  browser_log_to_file: boolean;
80
80
  session_id?: string;
81
81
  viewport_limit: boolean;
82
+ fullVisualMode?: boolean;
82
83
  }
83
84
 
84
85
  // Default stealth configuration
@@ -212,6 +213,7 @@ export class ConfigLoader {
212
213
  if (config.browser_log_to_file !== undefined) wsConfig.browser_log_to_file = config.browser_log_to_file;
213
214
  if (config.session_id !== undefined) wsConfig.session_id = config.session_id;
214
215
  if (config.viewport_limit !== undefined) wsConfig.viewport_limit = config.viewport_limit;
216
+ if (config.fullVisualMode !== undefined) wsConfig.fullVisualMode = config.fullVisualMode;
215
217
 
216
218
  // CDP connection options
217
219
  if (config.connectOverCdp !== undefined) browserConfig.connectOverCdp = config.connectOverCdp;
@@ -2,18 +2,22 @@ import {HybridBrowserSession} from './browser-session';
2
2
  import {ActionResult, BrowserAction, BrowserToolkitConfig, SnapshotResult, TabInfo, VisualMarkResult} from './types';
3
3
  import {ConfigLoader} from './config-loader';
4
4
  import {ConsoleMessage} from 'playwright';
5
+ import {SomScreenshotInjected} from './som-screenshot-injected';
6
+ import {filterClickableByHierarchy} from './snapshot-parser';
5
7
 
6
8
  export class HybridBrowserToolkit {
7
9
  private session: HybridBrowserSession;
8
10
  private config: BrowserToolkitConfig;
9
11
  private configLoader: ConfigLoader;
10
12
  private viewportLimit: boolean;
13
+ private fullVisualMode: boolean;
11
14
 
12
15
  constructor(config: BrowserToolkitConfig = {}) {
13
16
  this.configLoader = ConfigLoader.fromPythonConfig(config);
14
17
  this.config = config; // Store original config for backward compatibility
15
18
  this.session = new HybridBrowserSession(this.configLoader.getBrowserConfig()); // Pass processed config
16
19
  this.viewportLimit = this.configLoader.getWebSocketConfig().viewport_limit;
20
+ this.fullVisualMode = this.configLoader.getWebSocketConfig().fullVisualMode || false;
17
21
  }
18
22
 
19
23
  async openBrowser(startUrl?: string): Promise<ActionResult> {
@@ -26,7 +30,7 @@ export class HybridBrowserToolkit {
26
30
  const result = await this.session.visitPage(url);
27
31
 
28
32
  const snapshotStart = Date.now();
29
- const snapshot = await this.getPageSnapshot(this.viewportLimit);
33
+ const snapshot = await this.getSnapshotForAction(this.viewportLimit);
30
34
  const snapshotTime = Date.now() - snapshotStart;
31
35
 
32
36
  const totalTime = Date.now() - startTime;
@@ -83,7 +87,7 @@ export class HybridBrowserToolkit {
83
87
 
84
88
  if (result.success) {
85
89
  const snapshotStart = Date.now();
86
- response.snapshot = await this.getPageSnapshot(this.viewportLimit);
90
+ response.snapshot = await this.getSnapshotForAction(this.viewportLimit);
87
91
  const snapshotTime = Date.now() - snapshotStart;
88
92
 
89
93
  if (result.timing) {
@@ -119,6 +123,7 @@ export class HybridBrowserToolkit {
119
123
 
120
124
  async getPageSnapshot(viewportLimit: boolean = false): Promise<string> {
121
125
  try {
126
+ // Always return real snapshot when explicitly called
122
127
  // If viewport limiting is enabled, we need coordinates for filtering
123
128
  const snapshotResult = await this.session.getSnapshotForAI(viewportLimit, viewportLimit);
124
129
  return snapshotResult.snapshot;
@@ -126,6 +131,14 @@ export class HybridBrowserToolkit {
126
131
  return `Error capturing snapshot: ${error}`;
127
132
  }
128
133
  }
134
+
135
+ // Internal method for getting snapshot in actions (respects fullVisualMode)
136
+ private async getSnapshotForAction(viewportLimit: boolean = false): Promise<string> {
137
+ if (this.fullVisualMode) {
138
+ return 'full visual mode';
139
+ }
140
+ return this.getPageSnapshot(viewportLimit);
141
+ }
129
142
 
130
143
 
131
144
  async getSnapshotForAI(): Promise<SnapshotResult> {
@@ -134,35 +147,34 @@ export class HybridBrowserToolkit {
134
147
 
135
148
  async getSomScreenshot(): Promise<VisualMarkResult & { timing: any }> {
136
149
  const startTime = Date.now();
150
+ console.log('[HybridBrowserToolkit] Starting getSomScreenshot...');
137
151
 
138
152
  try {
139
- const screenshotResult = await this.session.takeScreenshot();
140
- const snapshotResult = await this.session.getSnapshotForAI(true); // Include coordinates for SOM_mark
141
-
142
- // Add visual marks using improved method
143
- const markingStart = Date.now();
144
- const markedImageBuffer = await this.addVisualMarksOptimized(screenshotResult.buffer, snapshotResult);
145
- const markingTime = Date.now() - markingStart;
153
+ // Get page and snapshot data
154
+ const page = await this.session.getCurrentPage();
155
+ const snapshotResult = await this.session.getSnapshotForAI(true); // Include coordinates
146
156
 
147
- const base64Image = markedImageBuffer.toString('base64');
148
- const dataUrl = `data:image/png;base64,${base64Image}`;
157
+ // Parse clickable elements from snapshot text
158
+ const clickableElements = this.parseClickableElements(snapshotResult.snapshot);
159
+ console.log(`[HybridBrowserToolkit] Found ${clickableElements.size} clickable elements`);
149
160
 
150
- const totalTime = Date.now() - startTime;
161
+ // Apply hierarchy-based filtering
162
+ const filteredElements = filterClickableByHierarchy(snapshotResult.snapshot, clickableElements);
163
+ console.log(`[HybridBrowserToolkit] After filtering: ${filteredElements.size} elements remain`);
164
+
165
+ // Use injected SOM-screenshot method without export path
166
+ const result = await SomScreenshotInjected.captureOptimized(
167
+ page,
168
+ snapshotResult,
169
+ filteredElements,
170
+ undefined // No export path - don't generate files
171
+ );
151
172
 
152
- // Count elements with coordinates
153
- const elementsWithCoords = Object.values(snapshotResult.elements).filter(el => el.coordinates).length;
173
+ // Add snapshot timing info to result
174
+ result.timing.snapshot_time_ms = snapshotResult.timing.snapshot_time_ms;
175
+ result.timing.coordinate_enrichment_time_ms = snapshotResult.timing.coordinate_enrichment_time_ms;
154
176
 
155
- return {
156
- text: `Visual webpage screenshot captured with ${Object.keys(snapshotResult.elements).length} interactive elements (${elementsWithCoords} marked visually)`,
157
- images: [dataUrl],
158
- timing: {
159
- total_time_ms: totalTime,
160
- screenshot_time_ms: screenshotResult.timing.screenshot_time_ms,
161
- snapshot_time_ms: snapshotResult.timing.snapshot_time_ms,
162
- coordinate_enrichment_time_ms: snapshotResult.timing.coordinate_enrichment_time_ms,
163
- visual_marking_time_ms: markingTime,
164
- },
165
- };
177
+ return result;
166
178
  } catch (error) {
167
179
  const totalTime = Date.now() - startTime;
168
180
  return {
@@ -179,132 +191,6 @@ export class HybridBrowserToolkit {
179
191
  }
180
192
  }
181
193
 
182
- private async addVisualMarksOptimized(screenshotBuffer: Buffer, snapshotResult: SnapshotResult): Promise<Buffer> {
183
- try {
184
-
185
- // Check if we have any elements with coordinates
186
- const elementsWithCoords = Object.entries(snapshotResult.elements)
187
- .filter(([ref, element]) => element.coordinates);
188
-
189
- if (elementsWithCoords.length === 0) {
190
- return screenshotBuffer;
191
- }
192
-
193
- // Parse clickable elements from snapshot text
194
- const clickableElements = this.parseClickableElements(snapshotResult.snapshot);
195
-
196
- // Use sharp for image processing
197
- const sharp = require('sharp');
198
- const page = await this.session.getCurrentPage();
199
- let viewport = page.viewportSize();
200
-
201
- // In CDP mode, viewportSize might be null, get it from window dimensions
202
- if (!viewport) {
203
- const windowSize = await page.evaluate(() => ({
204
- width: window.innerWidth,
205
- height: window.innerHeight
206
- }));
207
- viewport = windowSize;
208
- }
209
-
210
- // Get device pixel ratio to handle high DPI screens
211
- const dpr = await page.evaluate(() => window.devicePixelRatio) || 1;
212
-
213
- // Get actual screenshot dimensions
214
- const metadata = await sharp(screenshotBuffer).metadata();
215
- const screenshotWidth = metadata.width || viewport.width;
216
- const screenshotHeight = metadata.height || viewport.height;
217
-
218
- // Calculate scaling factor between CSS pixels and screenshot pixels
219
- const scaleX = screenshotWidth / viewport.width;
220
- const scaleY = screenshotHeight / viewport.height;
221
-
222
- // Debug logging for CDP mode
223
- if (process.env.HYBRID_BROWSER_DEBUG === '1') {
224
- console.log('[CDP Debug] Viewport size:', viewport);
225
- console.log('[CDP Debug] Device pixel ratio:', dpr);
226
- console.log('[CDP Debug] Screenshot dimensions:', { width: screenshotWidth, height: screenshotHeight });
227
- console.log('[CDP Debug] Scale factors:', { scaleX, scaleY });
228
- console.log('[CDP Debug] Elements with coordinates:', elementsWithCoords.length);
229
- elementsWithCoords.slice(0, 3).forEach(([ref, element]) => {
230
- console.log(`[CDP Debug] Element ${ref}:`, element.coordinates);
231
- });
232
- }
233
-
234
- // Filter elements visible in viewport
235
- const visibleElements = elementsWithCoords.filter(([ref, element]) => {
236
- const coords = element.coordinates!;
237
- return coords.x < viewport.width &&
238
- coords.y < viewport.height &&
239
- coords.x + coords.width > 0 &&
240
- coords.y + coords.height > 0;
241
- });
242
-
243
- // Remove overlapped elements (only keep topmost)
244
- const nonOverlappedElements = this.removeOverlappedElements(visibleElements);
245
-
246
- // Create SVG overlay with all the marks
247
- const marks = nonOverlappedElements.map(([ref, element]) => {
248
- const coords = element.coordinates!;
249
- const isClickable = clickableElements.has(ref);
250
-
251
- // Scale coordinates from CSS pixels to screenshot pixels
252
- const x = Math.max(0, coords.x * scaleX);
253
- const y = Math.max(0, coords.y * scaleY);
254
- const width = coords.width * scaleX;
255
- const height = coords.height * scaleY;
256
-
257
- // Clamp to screenshot bounds
258
- const clampedWidth = Math.min(width, screenshotWidth - x);
259
- const clampedHeight = Math.min(height, screenshotHeight - y);
260
-
261
- // Position text to be visible even if element is partially cut off
262
- const textX = Math.max(2, Math.min(x + 2, screenshotWidth - 40));
263
- const textY = Math.max(14, Math.min(y + 14, screenshotHeight - 4));
264
-
265
- // Different colors for clickable vs non-clickable elements
266
- const colors = isClickable ? {
267
- fill: 'rgba(0, 150, 255, 0.15)', // Blue for clickable
268
- stroke: '#0096FF',
269
- textFill: '#0096FF'
270
- } : {
271
- fill: 'rgba(255, 107, 107, 0.1)', // Red for non-clickable
272
- stroke: '#FF6B6B',
273
- textFill: '#FF6B6B'
274
- };
275
-
276
- return `
277
- <rect x="${x}" y="${y}" width="${clampedWidth}" height="${clampedHeight}"
278
- fill="${colors.fill}" stroke="${colors.stroke}" stroke-width="2" rx="2"/>
279
- <text x="${textX}" y="${textY}" font-family="Arial, sans-serif"
280
- font-size="12" fill="${colors.textFill}" font-weight="bold">${ref}</text>
281
- `;
282
- }).join('');
283
-
284
- const svgOverlay = `
285
- <svg width="${screenshotWidth}" height="${screenshotHeight}" xmlns="http://www.w3.org/2000/svg">
286
- ${marks}
287
- </svg>
288
- `;
289
-
290
- // Composite the overlay onto the screenshot
291
- const markedImageBuffer = await sharp(screenshotBuffer)
292
- .composite([{
293
- input: Buffer.from(svgOverlay),
294
- top: 0,
295
- left: 0
296
- }])
297
- .png()
298
- .toBuffer();
299
-
300
- return markedImageBuffer;
301
-
302
- } catch (error) {
303
- // Error adding visual marks, falling back to original screenshot
304
- // Return original screenshot if marking fails
305
- return screenshotBuffer;
306
- }
307
- }
308
194
 
309
195
  /**
310
196
  * Parse clickable elements from snapshot text
@@ -314,8 +200,8 @@ export class HybridBrowserToolkit {
314
200
  const lines = snapshotText.split('\n');
315
201
 
316
202
  for (const line of lines) {
317
- // Look for lines containing [cursor=pointer] and extract ref
318
- if (line.includes('[cursor=pointer]')) {
203
+ // Look for lines containing [cursor=pointer] or [active] and extract ref
204
+ if (line.includes('[cursor=pointer]') || line.includes('[active]')) {
319
205
  const refMatch = line.match(/\[ref=([^\]]+)\]/);
320
206
  if (refMatch) {
321
207
  clickableElements.add(refMatch[1]);
@@ -326,56 +212,6 @@ export class HybridBrowserToolkit {
326
212
  return clickableElements;
327
213
  }
328
214
 
329
- /**
330
- * Remove overlapped elements, keeping only the topmost (last in DOM order)
331
- */
332
- private removeOverlappedElements(elements: Array<[string, any]>): Array<[string, any]> {
333
- const result: Array<[string, any]> = [];
334
-
335
- for (let i = 0; i < elements.length; i++) {
336
- const [refA, elementA] = elements[i];
337
- const coordsA = elementA.coordinates!;
338
- let isOverlapped = false;
339
-
340
- // Check if this element is completely overlapped by any later element
341
- for (let j = i + 1; j < elements.length; j++) {
342
- const [refB, elementB] = elements[j];
343
- const coordsB = elementB.coordinates!;
344
-
345
- // Check if element A is completely covered by element B
346
- if (this.isCompletelyOverlapped(coordsA, coordsB)) {
347
- isOverlapped = true;
348
- break;
349
- }
350
- }
351
-
352
- if (!isOverlapped) {
353
- result.push(elements[i]);
354
- }
355
- }
356
-
357
- return result;
358
- }
359
-
360
- /**
361
- * Check if element A is completely overlapped by element B
362
- */
363
- private isCompletelyOverlapped(
364
- coordsA: { x: number; y: number; width: number; height: number },
365
- coordsB: { x: number; y: number; width: number; height: number }
366
- ): boolean {
367
- // A is completely overlapped by B if:
368
- // B's left edge is <= A's left edge AND
369
- // B's top edge is <= A's top edge AND
370
- // B's right edge is >= A's right edge AND
371
- // B's bottom edge is >= A's bottom edge
372
- return (
373
- coordsB.x <= coordsA.x &&
374
- coordsB.y <= coordsA.y &&
375
- coordsB.x + coordsB.width >= coordsA.x + coordsA.width &&
376
- coordsB.y + coordsB.height >= coordsA.y + coordsA.height
377
- );
378
- }
379
215
 
380
216
  private async executeActionWithSnapshot(action: BrowserAction): Promise<any> {
381
217
  const result = await this.session.executeAction(action);
@@ -472,7 +308,7 @@ export class HybridBrowserToolkit {
472
308
  const navigationTime = Date.now() - navigationStart;
473
309
 
474
310
  const snapshotStart = Date.now();
475
- const snapshot = await this.getPageSnapshot(this.viewportLimit);
311
+ const snapshot = await this.getSnapshotForAction(this.viewportLimit);
476
312
  const snapshotTime = Date.now() - snapshotStart;
477
313
 
478
314
  const totalTime = Date.now() - startTime;
@@ -512,7 +348,7 @@ export class HybridBrowserToolkit {
512
348
  const navigationTime = Date.now() - navigationStart;
513
349
 
514
350
  const snapshotStart = Date.now();
515
- const snapshot = await this.getPageSnapshot(this.viewportLimit);
351
+ const snapshot = await this.getSnapshotForAction(this.viewportLimit);
516
352
  const snapshotTime = Date.now() - snapshotStart;
517
353
 
518
354
  const totalTime = Date.now() - startTime;
@@ -584,7 +420,7 @@ export class HybridBrowserToolkit {
584
420
  return {
585
421
  success: true,
586
422
  message: `Closed tab ${tabId}`,
587
- snapshot: await this.getPageSnapshot(this.viewportLimit),
423
+ snapshot: await this.getSnapshotForAction(this.viewportLimit),
588
424
  };
589
425
  } else {
590
426
  return {
@@ -649,7 +485,7 @@ export class HybridBrowserToolkit {
649
485
  const { result, logs } = evalResult;
650
486
 
651
487
  const snapshotStart = Date.now();
652
- const snapshot = await this.getPageSnapshot(this.viewportLimit);
488
+ const snapshot = await this.getSnapshotForAction(this.viewportLimit);
653
489
  const snapshotTime = Date.now() - snapshotStart;
654
490
  const totalTime = Date.now() - startTime;
655
491