camel-ai 0.2.73a4__py3-none-any.whl → 0.2.80a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (173)
  1. camel/__init__.py +1 -1
  2. camel/agents/_utils.py +38 -0
  3. camel/agents/chat_agent.py +2217 -519
  4. camel/agents/mcp_agent.py +30 -27
  5. camel/configs/__init__.py +15 -0
  6. camel/configs/aihubmix_config.py +88 -0
  7. camel/configs/amd_config.py +70 -0
  8. camel/configs/cometapi_config.py +104 -0
  9. camel/configs/minimax_config.py +93 -0
  10. camel/configs/nebius_config.py +103 -0
  11. camel/data_collectors/alpaca_collector.py +15 -6
  12. camel/datasets/base_generator.py +39 -10
  13. camel/environments/single_step.py +28 -3
  14. camel/environments/tic_tac_toe.py +1 -1
  15. camel/interpreters/__init__.py +2 -0
  16. camel/interpreters/docker/Dockerfile +3 -12
  17. camel/interpreters/e2b_interpreter.py +34 -1
  18. camel/interpreters/microsandbox_interpreter.py +395 -0
  19. camel/loaders/__init__.py +11 -2
  20. camel/loaders/chunkr_reader.py +9 -0
  21. camel/memories/agent_memories.py +48 -4
  22. camel/memories/base.py +26 -0
  23. camel/memories/blocks/chat_history_block.py +122 -4
  24. camel/memories/context_creators/score_based.py +25 -384
  25. camel/memories/records.py +88 -8
  26. camel/messages/base.py +153 -34
  27. camel/models/__init__.py +10 -0
  28. camel/models/aihubmix_model.py +83 -0
  29. camel/models/aiml_model.py +1 -16
  30. camel/models/amd_model.py +101 -0
  31. camel/models/anthropic_model.py +6 -19
  32. camel/models/aws_bedrock_model.py +2 -33
  33. camel/models/azure_openai_model.py +114 -89
  34. camel/models/base_audio_model.py +3 -1
  35. camel/models/base_model.py +32 -14
  36. camel/models/cohere_model.py +1 -16
  37. camel/models/cometapi_model.py +83 -0
  38. camel/models/crynux_model.py +1 -16
  39. camel/models/deepseek_model.py +1 -16
  40. camel/models/fish_audio_model.py +6 -0
  41. camel/models/gemini_model.py +36 -18
  42. camel/models/groq_model.py +1 -17
  43. camel/models/internlm_model.py +1 -16
  44. camel/models/litellm_model.py +1 -16
  45. camel/models/lmstudio_model.py +1 -17
  46. camel/models/minimax_model.py +83 -0
  47. camel/models/mistral_model.py +1 -16
  48. camel/models/model_factory.py +27 -1
  49. camel/models/modelscope_model.py +1 -16
  50. camel/models/moonshot_model.py +105 -24
  51. camel/models/nebius_model.py +83 -0
  52. camel/models/nemotron_model.py +0 -5
  53. camel/models/netmind_model.py +1 -16
  54. camel/models/novita_model.py +1 -16
  55. camel/models/nvidia_model.py +1 -16
  56. camel/models/ollama_model.py +4 -19
  57. camel/models/openai_compatible_model.py +62 -41
  58. camel/models/openai_model.py +62 -57
  59. camel/models/openrouter_model.py +1 -17
  60. camel/models/ppio_model.py +1 -16
  61. camel/models/qianfan_model.py +1 -16
  62. camel/models/qwen_model.py +1 -16
  63. camel/models/reka_model.py +1 -16
  64. camel/models/samba_model.py +34 -47
  65. camel/models/sglang_model.py +64 -31
  66. camel/models/siliconflow_model.py +1 -16
  67. camel/models/stub_model.py +0 -4
  68. camel/models/togetherai_model.py +1 -16
  69. camel/models/vllm_model.py +1 -16
  70. camel/models/volcano_model.py +0 -17
  71. camel/models/watsonx_model.py +1 -16
  72. camel/models/yi_model.py +1 -16
  73. camel/models/zhipuai_model.py +60 -16
  74. camel/parsers/__init__.py +18 -0
  75. camel/parsers/mcp_tool_call_parser.py +176 -0
  76. camel/retrievers/auto_retriever.py +1 -0
  77. camel/runtimes/daytona_runtime.py +11 -12
  78. camel/societies/__init__.py +2 -0
  79. camel/societies/workforce/__init__.py +2 -0
  80. camel/societies/workforce/events.py +122 -0
  81. camel/societies/workforce/prompts.py +146 -66
  82. camel/societies/workforce/role_playing_worker.py +15 -11
  83. camel/societies/workforce/single_agent_worker.py +302 -65
  84. camel/societies/workforce/structured_output_handler.py +30 -18
  85. camel/societies/workforce/task_channel.py +163 -27
  86. camel/societies/workforce/utils.py +107 -13
  87. camel/societies/workforce/workflow_memory_manager.py +772 -0
  88. camel/societies/workforce/workforce.py +1949 -579
  89. camel/societies/workforce/workforce_callback.py +74 -0
  90. camel/societies/workforce/workforce_logger.py +168 -145
  91. camel/societies/workforce/workforce_metrics.py +33 -0
  92. camel/storages/key_value_storages/json.py +15 -2
  93. camel/storages/key_value_storages/mem0_cloud.py +48 -47
  94. camel/storages/object_storages/google_cloud.py +1 -1
  95. camel/storages/vectordb_storages/oceanbase.py +13 -13
  96. camel/storages/vectordb_storages/qdrant.py +3 -3
  97. camel/storages/vectordb_storages/tidb.py +8 -6
  98. camel/tasks/task.py +4 -3
  99. camel/toolkits/__init__.py +20 -7
  100. camel/toolkits/aci_toolkit.py +45 -0
  101. camel/toolkits/base.py +6 -4
  102. camel/toolkits/code_execution.py +28 -1
  103. camel/toolkits/context_summarizer_toolkit.py +684 -0
  104. camel/toolkits/dappier_toolkit.py +5 -1
  105. camel/toolkits/dingtalk.py +1135 -0
  106. camel/toolkits/edgeone_pages_mcp_toolkit.py +11 -31
  107. camel/toolkits/excel_toolkit.py +1 -1
  108. camel/toolkits/{file_write_toolkit.py → file_toolkit.py} +430 -36
  109. camel/toolkits/function_tool.py +13 -3
  110. camel/toolkits/github_toolkit.py +104 -17
  111. camel/toolkits/gmail_toolkit.py +1839 -0
  112. camel/toolkits/google_calendar_toolkit.py +38 -4
  113. camel/toolkits/google_drive_mcp_toolkit.py +12 -31
  114. camel/toolkits/hybrid_browser_toolkit/config_loader.py +15 -0
  115. camel/toolkits/hybrid_browser_toolkit/hybrid_browser_toolkit.py +77 -8
  116. camel/toolkits/hybrid_browser_toolkit/hybrid_browser_toolkit_ts.py +884 -88
  117. camel/toolkits/hybrid_browser_toolkit/installer.py +203 -0
  118. camel/toolkits/hybrid_browser_toolkit/ts/package-lock.json +5 -612
  119. camel/toolkits/hybrid_browser_toolkit/ts/package.json +0 -1
  120. camel/toolkits/hybrid_browser_toolkit/ts/src/browser-session.ts +959 -89
  121. camel/toolkits/hybrid_browser_toolkit/ts/src/config-loader.ts +9 -2
  122. camel/toolkits/hybrid_browser_toolkit/ts/src/hybrid-browser-toolkit.ts +281 -213
  123. camel/toolkits/hybrid_browser_toolkit/ts/src/parent-child-filter.ts +226 -0
  124. camel/toolkits/hybrid_browser_toolkit/ts/src/snapshot-parser.ts +219 -0
  125. camel/toolkits/hybrid_browser_toolkit/ts/src/som-screenshot-injected.ts +543 -0
  126. camel/toolkits/hybrid_browser_toolkit/ts/src/types.ts +23 -3
  127. camel/toolkits/hybrid_browser_toolkit/ts/websocket-server.js +72 -7
  128. camel/toolkits/hybrid_browser_toolkit/ws_wrapper.py +582 -132
  129. camel/toolkits/hybrid_browser_toolkit_py/actions.py +158 -0
  130. camel/toolkits/hybrid_browser_toolkit_py/browser_session.py +55 -8
  131. camel/toolkits/hybrid_browser_toolkit_py/config_loader.py +43 -0
  132. camel/toolkits/hybrid_browser_toolkit_py/hybrid_browser_toolkit.py +321 -8
  133. camel/toolkits/hybrid_browser_toolkit_py/snapshot.py +10 -4
  134. camel/toolkits/hybrid_browser_toolkit_py/unified_analyzer.js +45 -4
  135. camel/toolkits/{openai_image_toolkit.py → image_generation_toolkit.py} +151 -53
  136. camel/toolkits/klavis_toolkit.py +5 -1
  137. camel/toolkits/markitdown_toolkit.py +27 -1
  138. camel/toolkits/math_toolkit.py +64 -10
  139. camel/toolkits/mcp_toolkit.py +366 -71
  140. camel/toolkits/memory_toolkit.py +5 -1
  141. camel/toolkits/message_integration.py +18 -13
  142. camel/toolkits/minimax_mcp_toolkit.py +195 -0
  143. camel/toolkits/note_taking_toolkit.py +19 -10
  144. camel/toolkits/notion_mcp_toolkit.py +16 -26
  145. camel/toolkits/openbb_toolkit.py +5 -1
  146. camel/toolkits/origene_mcp_toolkit.py +8 -49
  147. camel/toolkits/playwright_mcp_toolkit.py +12 -31
  148. camel/toolkits/resend_toolkit.py +168 -0
  149. camel/toolkits/search_toolkit.py +264 -91
  150. camel/toolkits/slack_toolkit.py +64 -10
  151. camel/toolkits/terminal_toolkit/__init__.py +18 -0
  152. camel/toolkits/terminal_toolkit/terminal_toolkit.py +957 -0
  153. camel/toolkits/terminal_toolkit/utils.py +532 -0
  154. camel/toolkits/vertex_ai_veo_toolkit.py +590 -0
  155. camel/toolkits/video_analysis_toolkit.py +17 -11
  156. camel/toolkits/wechat_official_toolkit.py +483 -0
  157. camel/toolkits/zapier_toolkit.py +5 -1
  158. camel/types/__init__.py +2 -2
  159. camel/types/enums.py +274 -7
  160. camel/types/openai_types.py +2 -2
  161. camel/types/unified_model_type.py +15 -0
  162. camel/utils/commons.py +36 -5
  163. camel/utils/constants.py +3 -0
  164. camel/utils/context_utils.py +1003 -0
  165. camel/utils/mcp.py +138 -4
  166. camel/utils/token_counting.py +43 -20
  167. {camel_ai-0.2.73a4.dist-info → camel_ai-0.2.80a2.dist-info}/METADATA +223 -83
  168. {camel_ai-0.2.73a4.dist-info → camel_ai-0.2.80a2.dist-info}/RECORD +170 -141
  169. camel/loaders/pandas_reader.py +0 -368
  170. camel/toolkits/openai_agent_toolkit.py +0 -135
  171. camel/toolkits/terminal_toolkit.py +0 -1550
  172. {camel_ai-0.2.73a4.dist-info → camel_ai-0.2.80a2.dist-info}/WHEEL +0 -0
  173. {camel_ai-0.2.73a4.dist-info → camel_ai-0.2.80a2.dist-info}/licenses/LICENSE +0 -0
@@ -30,6 +30,7 @@ export interface BrowserConfig {
30
30
  // Tab management
31
31
  tabIdPrefix: string;
32
32
  tabCounterPadding: number;
33
+ consoleLogLimit: number;
33
34
 
34
35
  // Scroll and positioning
35
36
  scrollPositionScale: number;
@@ -72,12 +73,14 @@ export interface BrowserConfig {
72
73
  // CDP connection options
73
74
  connectOverCdp: boolean;
74
75
  cdpUrl?: string;
76
+ cdpKeepCurrentPage: boolean;
75
77
  }
76
78
 
77
79
  export interface WebSocketConfig {
78
80
  browser_log_to_file: boolean;
79
81
  session_id?: string;
80
82
  viewport_limit: boolean;
83
+ fullVisualMode?: boolean;
81
84
  }
82
85
 
83
86
  // Default stealth configuration
@@ -113,9 +116,10 @@ function getDefaultBrowserConfig(): BrowserConfig {
113
116
  clickTimeout: 3000,
114
117
  tabIdPrefix: 'tab-',
115
118
  tabCounterPadding: 3,
119
+ consoleLogLimit: 1000,
116
120
  scrollPositionScale: 0.1,
117
121
  navigationDelay: 100,
118
- blankPageUrls: ['about:blank', ''],
122
+ blankPageUrls: ['chrome://newtab/', 'edge://newtab/', 'chrome://new-tab-page/'],
119
123
  dataUrlPrefix: 'data:',
120
124
  domContentLoadedState: 'domcontentloaded',
121
125
  networkIdleState: 'networkidle',
@@ -136,7 +140,8 @@ function getDefaultBrowserConfig(): BrowserConfig {
136
140
  height: 720
137
141
  },
138
142
  connectOverCdp: false,
139
- cdpUrl: undefined
143
+ cdpUrl: undefined,
144
+ cdpKeepCurrentPage: false
140
145
  };
141
146
  }
142
147
 
@@ -210,10 +215,12 @@ export class ConfigLoader {
210
215
  if (config.browser_log_to_file !== undefined) wsConfig.browser_log_to_file = config.browser_log_to_file;
211
216
  if (config.session_id !== undefined) wsConfig.session_id = config.session_id;
212
217
  if (config.viewport_limit !== undefined) wsConfig.viewport_limit = config.viewport_limit;
218
+ if (config.fullVisualMode !== undefined) wsConfig.fullVisualMode = config.fullVisualMode;
213
219
 
214
220
  // CDP connection options
215
221
  if (config.connectOverCdp !== undefined) browserConfig.connectOverCdp = config.connectOverCdp;
216
222
  if (config.cdpUrl !== undefined) browserConfig.cdpUrl = config.cdpUrl;
223
+ if (config.cdpKeepCurrentPage !== undefined) browserConfig.cdpKeepCurrentPage = config.cdpKeepCurrentPage;
217
224
 
218
225
  return new ConfigLoader(browserConfig, wsConfig);
219
226
  }
@@ -1,18 +1,23 @@
1
1
  import {HybridBrowserSession} from './browser-session';
2
2
  import {ActionResult, BrowserAction, BrowserToolkitConfig, SnapshotResult, TabInfo, VisualMarkResult} from './types';
3
3
  import {ConfigLoader} from './config-loader';
4
+ import {ConsoleMessage} from 'playwright';
5
+ import {SomScreenshotInjected} from './som-screenshot-injected';
6
+ import {filterClickableByHierarchy} from './snapshot-parser';
4
7
 
5
8
  export class HybridBrowserToolkit {
6
9
  private session: HybridBrowserSession;
7
10
  private config: BrowserToolkitConfig;
8
11
  private configLoader: ConfigLoader;
9
12
  private viewportLimit: boolean;
13
+ private fullVisualMode: boolean;
10
14
 
11
15
  constructor(config: BrowserToolkitConfig = {}) {
12
16
  this.configLoader = ConfigLoader.fromPythonConfig(config);
13
17
  this.config = config; // Store original config for backward compatibility
14
- this.session = new HybridBrowserSession(this.configLoader.getBrowserConfig()); // Pass processed config
18
+ this.session = new HybridBrowserSession(config); // Pass original config
15
19
  this.viewportLimit = this.configLoader.getWebSocketConfig().viewport_limit;
20
+ this.fullVisualMode = this.configLoader.getWebSocketConfig().fullVisualMode || false;
16
21
  }
17
22
 
18
23
  async openBrowser(startUrl?: string): Promise<ActionResult> {
@@ -21,22 +26,66 @@ export class HybridBrowserToolkit {
21
26
  try {
22
27
  await this.session.ensureBrowser();
23
28
 
24
- const url = startUrl || this.config.defaultStartUrl || 'https://google.com/';
25
- const result = await this.session.visitPage(url);
29
+ // Check if we should skip navigation in CDP keep-current-page mode
30
+ const browserConfig = this.configLoader.getBrowserConfig();
31
+ if (browserConfig.cdpUrl && browserConfig.cdpKeepCurrentPage && !startUrl) {
32
+ // In CDP keep-current-page mode without explicit URL, just ensure browser and return current page
33
+ const snapshotStart = Date.now();
34
+ const snapshot = await this.getSnapshotForAction(this.viewportLimit);
35
+ const snapshotTime = Date.now() - snapshotStart;
36
+
37
+ const page = await this.session.getCurrentPage();
38
+ const currentUrl = page ? await page.url() : 'unknown';
39
+
40
+ const totalTime = Date.now() - startTime;
41
+
42
+ return {
43
+ success: true,
44
+ message: `Browser opened in CDP keep-current-page mode (current page: ${currentUrl})`,
45
+ snapshot,
46
+ timing: {
47
+ total_time_ms: totalTime,
48
+ snapshot_time_ms: snapshotTime,
49
+ },
50
+ };
51
+ }
26
52
 
53
+ // For normal mode or CDP with cdpKeepCurrentPage=false: navigate to URL
54
+ if (!browserConfig.cdpUrl || !browserConfig.cdpKeepCurrentPage) {
55
+ const url = startUrl || this.config.defaultStartUrl || 'https://google.com/';
56
+ const result = await this.session.visitPage(url);
57
+
58
+ const snapshotStart = Date.now();
59
+ const snapshot = await this.getSnapshotForAction(this.viewportLimit);
60
+ const snapshotTime = Date.now() - snapshotStart;
61
+
62
+ const totalTime = Date.now() - startTime;
63
+
64
+ return {
65
+ success: true,
66
+ message: result.message,
67
+ snapshot,
68
+ timing: {
69
+ total_time_ms: totalTime,
70
+ page_load_time_ms: result.timing?.page_load_time_ms || 0,
71
+ snapshot_time_ms: snapshotTime,
72
+ },
73
+ };
74
+ }
75
+
76
+ // Fallback: Just return current page snapshot without any navigation
27
77
  const snapshotStart = Date.now();
28
- const snapshot = await this.getPageSnapshot(this.viewportLimit);
78
+ const snapshot = await this.getSnapshotForAction(this.viewportLimit);
29
79
  const snapshotTime = Date.now() - snapshotStart;
30
80
 
31
81
  const totalTime = Date.now() - startTime;
32
82
 
33
83
  return {
34
84
  success: true,
35
- message: `Browser opened and navigated to ${url}`,
85
+ message: `Browser opened without navigation`,
36
86
  snapshot,
37
87
  timing: {
38
88
  total_time_ms: totalTime,
39
- ...result.timing,
40
89
  snapshot_time_ms: snapshotTime,
41
90
  },
42
91
  };
@@ -68,39 +117,57 @@ export class HybridBrowserToolkit {
68
117
  }
69
118
 
70
119
  async visitPage(url: string): Promise<any> {
71
- const result = await this.session.visitPage(url);
72
-
73
- // Format response for Python layer compatibility
74
- const response: any = {
75
- result: result.message,
76
- snapshot: '',
77
- };
78
-
79
- if (result.success) {
80
- const snapshotStart = Date.now();
81
- response.snapshot = await this.getPageSnapshot(this.viewportLimit);
82
- const snapshotTime = Date.now() - snapshotStart;
120
+ try {
121
+ // Ensure browser is initialized before visiting page
122
+ await this.session.ensureBrowser();
123
+
124
+ const result = await this.session.visitPage(url);
125
+
126
+ // Format response for Python layer compatibility
127
+ const response: any = {
128
+ result: result.message,
129
+ snapshot: '',
130
+ };
83
131
 
132
+ if (result.success) {
133
+ const snapshotStart = Date.now();
134
+ response.snapshot = await this.getSnapshotForAction(this.viewportLimit);
135
+ const snapshotTime = Date.now() - snapshotStart;
136
+
137
+ if (result.timing) {
138
+ result.timing.snapshot_time_ms = snapshotTime;
139
+ }
140
+ }
141
+
142
+ // Include timing if available
84
143
  if (result.timing) {
85
- result.timing.snapshot_time_ms = snapshotTime;
144
+ response.timing = result.timing;
86
145
  }
146
+
147
+ // Include newTabId if present
148
+ if (result.newTabId) {
149
+ response.newTabId = result.newTabId;
150
+ }
151
+
152
+ return response;
153
+ } catch (error) {
154
+ console.error('[visitPage] Error:', error);
155
+ return {
156
+ result: `Navigation to ${url} failed: ${error}`,
157
+ snapshot: '',
158
+ timing: {
159
+ total_time_ms: 0,
160
+ navigation_time_ms: 0,
161
+ dom_content_loaded_time_ms: 0,
162
+ network_idle_time_ms: 0,
163
+ }
164
+ };
87
165
  }
88
-
89
- // Include timing if available
90
- if (result.timing) {
91
- response.timing = result.timing;
92
- }
93
-
94
- // Include newTabId if present
95
- if (result.newTabId) {
96
- response.newTabId = result.newTabId;
97
- }
98
-
99
- return response;
100
166
  }
101
167
 
102
168
  async getPageSnapshot(viewportLimit: boolean = false): Promise<string> {
103
169
  try {
170
+ // Always return real snapshot when explicitly called
104
171
  // If viewport limiting is enabled, we need coordinates for filtering
105
172
  const snapshotResult = await this.session.getSnapshotForAI(viewportLimit, viewportLimit);
106
173
  return snapshotResult.snapshot;
@@ -108,6 +175,14 @@ export class HybridBrowserToolkit {
108
175
  return `Error capturing snapshot: ${error}`;
109
176
  }
110
177
  }
178
+
179
+ // Internal method for getting snapshot in actions (respects fullVisualMode)
180
+ private async getSnapshotForAction(viewportLimit: boolean = false): Promise<string> {
181
+ if (this.fullVisualMode) {
182
+ return 'full visual mode';
183
+ }
184
+ return this.getPageSnapshot(viewportLimit);
185
+ }
111
186
 
112
187
 
113
188
  async getSnapshotForAI(): Promise<SnapshotResult> {
@@ -116,35 +191,34 @@ export class HybridBrowserToolkit {
116
191
 
117
192
  async getSomScreenshot(): Promise<VisualMarkResult & { timing: any }> {
118
193
  const startTime = Date.now();
194
+ console.log('[HybridBrowserToolkit] Starting getSomScreenshot...');
119
195
 
120
196
  try {
121
- const screenshotResult = await this.session.takeScreenshot();
122
- const snapshotResult = await this.session.getSnapshotForAI(true); // Include coordinates for SOM_mark
123
-
124
- // Add visual marks using improved method
125
- const markingStart = Date.now();
126
- const markedImageBuffer = await this.addVisualMarksOptimized(screenshotResult.buffer, snapshotResult);
127
- const markingTime = Date.now() - markingStart;
128
-
129
- const base64Image = markedImageBuffer.toString('base64');
130
- const dataUrl = `data:image/png;base64,${base64Image}`;
131
-
132
- const totalTime = Date.now() - startTime;
197
+ // Get page and snapshot data
198
+ const page = await this.session.getCurrentPage();
199
+ const snapshotResult = await this.session.getSnapshotForAI(true); // Include coordinates
133
200
 
134
- // Count elements with coordinates
135
- const elementsWithCoords = Object.values(snapshotResult.elements).filter(el => el.coordinates).length;
201
+ // Parse clickable elements from snapshot text
202
+ const clickableElements = this.parseClickableElements(snapshotResult.snapshot);
203
+ console.log(`[HybridBrowserToolkit] Found ${clickableElements.size} clickable elements`);
136
204
 
137
- return {
138
- text: `Visual webpage screenshot captured with ${Object.keys(snapshotResult.elements).length} interactive elements (${elementsWithCoords} marked visually)`,
139
- images: [dataUrl],
140
- timing: {
141
- total_time_ms: totalTime,
142
- screenshot_time_ms: screenshotResult.timing.screenshot_time_ms,
143
- snapshot_time_ms: snapshotResult.timing.snapshot_time_ms,
144
- coordinate_enrichment_time_ms: snapshotResult.timing.coordinate_enrichment_time_ms,
145
- visual_marking_time_ms: markingTime,
146
- },
147
- };
205
+ // Apply hierarchy-based filtering
206
+ const filteredElements = filterClickableByHierarchy(snapshotResult.snapshot, clickableElements);
207
+ console.log(`[HybridBrowserToolkit] After filtering: ${filteredElements.size} elements remain`);
208
+
209
+ // Use injected SOM-screenshot method without export path
210
+ const result = await SomScreenshotInjected.captureOptimized(
211
+ page,
212
+ snapshotResult,
213
+ filteredElements,
214
+ undefined // No export path - don't generate files
215
+ );
216
+
217
+ // Add snapshot timing info to result
218
+ result.timing.snapshot_time_ms = snapshotResult.timing.snapshot_time_ms;
219
+ result.timing.coordinate_enrichment_time_ms = snapshotResult.timing.coordinate_enrichment_time_ms;
220
+
221
+ return result;
148
222
  } catch (error) {
149
223
  const totalTime = Date.now() - startTime;
150
224
  return {
@@ -161,98 +235,6 @@ export class HybridBrowserToolkit {
161
235
  }
162
236
  }
163
237
 
164
- private async addVisualMarksOptimized(screenshotBuffer: Buffer, snapshotResult: SnapshotResult): Promise<Buffer> {
165
- try {
166
-
167
- // Check if we have any elements with coordinates
168
- const elementsWithCoords = Object.entries(snapshotResult.elements)
169
- .filter(([ref, element]) => element.coordinates);
170
-
171
- if (elementsWithCoords.length === 0) {
172
- return screenshotBuffer;
173
- }
174
-
175
- // Parse clickable elements from snapshot text
176
- const clickableElements = this.parseClickableElements(snapshotResult.snapshot);
177
-
178
- // Use sharp for image processing
179
- const sharp = require('sharp');
180
- const page = await this.session.getCurrentPage();
181
- const viewport = page.viewportSize() || { width: 1280, height: 720 };
182
-
183
- // Filter elements visible in viewport
184
- const visibleElements = elementsWithCoords.filter(([ref, element]) => {
185
- const coords = element.coordinates!;
186
- return coords.x < viewport.width &&
187
- coords.y < viewport.height &&
188
- coords.x + coords.width > 0 &&
189
- coords.y + coords.height > 0;
190
- });
191
-
192
- // Remove overlapped elements (only keep topmost)
193
- const nonOverlappedElements = this.removeOverlappedElements(visibleElements);
194
-
195
- // Create SVG overlay with all the marks
196
- const marks = nonOverlappedElements.map(([ref, element]) => {
197
- const coords = element.coordinates!;
198
- const isClickable = clickableElements.has(ref);
199
-
200
- // Use original coordinates for elements within viewport
201
- // Clamp only to prevent marks from extending beyond screenshot bounds
202
- const x = Math.max(0, coords.x);
203
- const y = Math.max(0, coords.y);
204
- const maxWidth = viewport.width - x;
205
- const maxHeight = viewport.height - y;
206
- const width = Math.min(coords.width, maxWidth);
207
- const height = Math.min(coords.height, maxHeight);
208
-
209
- // Position text to be visible even if element is partially cut off
210
- const textX = Math.max(2, Math.min(x + 2, viewport.width - 40));
211
- const textY = Math.max(14, Math.min(y + 14, viewport.height - 4));
212
-
213
- // Different colors for clickable vs non-clickable elements
214
- const colors = isClickable ? {
215
- fill: 'rgba(0, 150, 255, 0.15)', // Blue for clickable
216
- stroke: '#0096FF',
217
- textFill: '#0096FF'
218
- } : {
219
- fill: 'rgba(255, 107, 107, 0.1)', // Red for non-clickable
220
- stroke: '#FF6B6B',
221
- textFill: '#FF6B6B'
222
- };
223
-
224
- return `
225
- <rect x="${x}" y="${y}" width="${width}" height="${height}"
226
- fill="${colors.fill}" stroke="${colors.stroke}" stroke-width="2" rx="2"/>
227
- <text x="${textX}" y="${textY}" font-family="Arial, sans-serif"
228
- font-size="12" fill="${colors.textFill}" font-weight="bold">${ref}</text>
229
- `;
230
- }).join('');
231
-
232
- const svgOverlay = `
233
- <svg width="${viewport.width}" height="${viewport.height}" xmlns="http://www.w3.org/2000/svg">
234
- ${marks}
235
- </svg>
236
- `;
237
-
238
- // Composite the overlay onto the screenshot
239
- const markedImageBuffer = await sharp(screenshotBuffer)
240
- .composite([{
241
- input: Buffer.from(svgOverlay),
242
- top: 0,
243
- left: 0
244
- }])
245
- .png()
246
- .toBuffer();
247
-
248
- return markedImageBuffer;
249
-
250
- } catch (error) {
251
- // Error adding visual marks, falling back to original screenshot
252
- // Return original screenshot if marking fails
253
- return screenshotBuffer;
254
- }
255
- }
256
238
 
257
239
  /**
258
240
  * Parse clickable elements from snapshot text
@@ -262,8 +244,8 @@ export class HybridBrowserToolkit {
262
244
  const lines = snapshotText.split('\n');
263
245
 
264
246
  for (const line of lines) {
265
- // Look for lines containing [cursor=pointer] and extract ref
266
- if (line.includes('[cursor=pointer]')) {
247
+ // Look for lines containing [cursor=pointer] or [active] and extract ref
248
+ if (line.includes('[cursor=pointer]') || line.includes('[active]')) {
267
249
  const refMatch = line.match(/\[ref=([^\]]+)\]/);
268
250
  if (refMatch) {
269
251
  clickableElements.add(refMatch[1]);
@@ -274,73 +256,31 @@ export class HybridBrowserToolkit {
274
256
  return clickableElements;
275
257
  }
276
258
 
277
- /**
278
- * Remove overlapped elements, keeping only the topmost (last in DOM order)
279
- */
280
- private removeOverlappedElements(elements: Array<[string, any]>): Array<[string, any]> {
281
- const result: Array<[string, any]> = [];
282
-
283
- for (let i = 0; i < elements.length; i++) {
284
- const [refA, elementA] = elements[i];
285
- const coordsA = elementA.coordinates!;
286
- let isOverlapped = false;
287
-
288
- // Check if this element is completely overlapped by any later element
289
- for (let j = i + 1; j < elements.length; j++) {
290
- const [refB, elementB] = elements[j];
291
- const coordsB = elementB.coordinates!;
292
-
293
- // Check if element A is completely covered by element B
294
- if (this.isCompletelyOverlapped(coordsA, coordsB)) {
295
- isOverlapped = true;
296
- break;
297
- }
298
- }
299
-
300
- if (!isOverlapped) {
301
- result.push(elements[i]);
302
- }
303
- }
304
-
305
- return result;
306
- }
307
-
308
- /**
309
- * Check if element A is completely overlapped by element B
310
- */
311
- private isCompletelyOverlapped(
312
- coordsA: { x: number; y: number; width: number; height: number },
313
- coordsB: { x: number; y: number; width: number; height: number }
314
- ): boolean {
315
- // A is completely overlapped by B if:
316
- // B's left edge is <= A's left edge AND
317
- // B's top edge is <= A's top edge AND
318
- // B's right edge is >= A's right edge AND
319
- // B's bottom edge is >= A's bottom edge
320
- return (
321
- coordsB.x <= coordsA.x &&
322
- coordsB.y <= coordsA.y &&
323
- coordsB.x + coordsB.width >= coordsA.x + coordsA.width &&
324
- coordsB.y + coordsB.height >= coordsA.y + coordsA.height
325
- );
326
- }
327
259
 
328
260
  private async executeActionWithSnapshot(action: BrowserAction): Promise<any> {
329
261
  const result = await this.session.executeAction(action);
330
262
 
331
- // Format response for Python layer compatibility
332
263
  const response: any = {
333
264
  result: result.message,
334
265
  snapshot: '',
335
266
  };
336
267
 
337
268
  if (result.success) {
338
- const snapshotStart = Date.now();
339
- response.snapshot = await this.getPageSnapshot(this.viewportLimit);
340
- const snapshotTime = Date.now() - snapshotStart;
341
-
342
- if (result.timing) {
343
- result.timing.snapshot_time_ms = snapshotTime;
269
+ if (result.details?.diffSnapshot) {
270
+ response.snapshot = result.details.diffSnapshot;
271
+
272
+ if (result.timing) {
273
+ result.timing.snapshot_time_ms = 0; // Diff snapshot time is included in action time
274
+ }
275
+ } else {
276
+ // Get full snapshot as usual
277
+ const snapshotStart = Date.now();
278
+ response.snapshot = await this.getPageSnapshot(this.viewportLimit);
279
+ const snapshotTime = Date.now() - snapshotStart;
280
+
281
+ if (result.timing) {
282
+ result.timing.snapshot_time_ms = snapshotTime;
283
+ }
344
284
  }
345
285
  }
346
286
 
@@ -354,6 +294,14 @@ export class HybridBrowserToolkit {
354
294
  response.newTabId = result.newTabId;
355
295
  }
356
296
 
297
+ // Include details if present (excluding diffSnapshot as it's already in snapshot)
298
+ if (result.details) {
299
+ const { diffSnapshot, ...otherDetails } = result.details;
300
+ if (Object.keys(otherDetails).length > 0) {
301
+ response.details = otherDetails;
302
+ }
303
+ }
304
+
357
305
  return response;
358
306
  }
359
307
 
@@ -362,8 +310,20 @@ export class HybridBrowserToolkit {
362
310
  return this.executeActionWithSnapshot(action);
363
311
  }
364
312
 
365
- async type(ref: string, text: string): Promise<any> {
366
- const action: BrowserAction = { type: 'type', ref, text };
313
+ async type(refOrInputs: string | Array<{ ref: string; text: string }>, text?: string): Promise<any> {
314
+ let action: BrowserAction;
315
+
316
+ if (typeof refOrInputs === 'string') {
317
+ // Single input mode (backward compatibility)
318
+ if (text === undefined) {
319
+ throw new Error('Text parameter is required when ref is a string');
320
+ }
321
+ action = { type: 'type', ref: refOrInputs, text };
322
+ } else {
323
+ // Multiple inputs mode
324
+ action = { type: 'type', inputs: refOrInputs };
325
+ }
326
+
367
327
  return this.executeActionWithSnapshot(action);
368
328
  }
369
329
 
@@ -382,6 +342,25 @@ export class HybridBrowserToolkit {
382
342
  return this.executeActionWithSnapshot(action);
383
343
  }
384
344
 
345
+ async mouseControl(control: 'click' | 'right_click'| 'dblclick', x: number, y: number): Promise<any> {
346
+ const action: BrowserAction = { type: 'mouse_control', control, x, y };
347
+ return this.executeActionWithSnapshot(action);
348
+ }
349
+
350
+ async mouseDrag(from_ref: string, to_ref: string): Promise<any> {
351
+ const action: BrowserAction = { type: 'mouse_drag', from_ref, to_ref };
352
+ return this.executeActionWithSnapshot(action);
353
+ }
354
+
355
+ async pressKeys(keys: string[]): Promise<any> {
356
+ const action: BrowserAction = { type: 'press_key', keys};
357
+ return this.executeActionWithSnapshot(action);
358
+ }
359
+
360
+ async batchKeyboardInput(operations: Array<{type: string, keys?: string[], text?: string, delay?: number}>, skipStabilityWait: boolean = true): Promise<any> {
361
+ return this.session.batchKeyboardInput(operations, skipStabilityWait);
362
+ }
363
+
385
364
  async back(): Promise<ActionResult> {
386
365
  const startTime = Date.now();
387
366
 
@@ -393,7 +372,7 @@ export class HybridBrowserToolkit {
393
372
  const navigationTime = Date.now() - navigationStart;
394
373
 
395
374
  const snapshotStart = Date.now();
396
- const snapshot = await this.getPageSnapshot(this.viewportLimit);
375
+ const snapshot = await this.getSnapshotForAction(this.viewportLimit);
397
376
  const snapshotTime = Date.now() - snapshotStart;
398
377
 
399
378
  const totalTime = Date.now() - startTime;
@@ -433,7 +412,7 @@ export class HybridBrowserToolkit {
433
412
  const navigationTime = Date.now() - navigationStart;
434
413
 
435
414
  const snapshotStart = Date.now();
436
- const snapshot = await this.getPageSnapshot(this.viewportLimit);
415
+ const snapshot = await this.getSnapshotForAction(this.viewportLimit);
437
416
  const snapshotTime = Date.now() - snapshotStart;
438
417
 
439
418
  const totalTime = Date.now() - startTime;
@@ -505,7 +484,7 @@ export class HybridBrowserToolkit {
505
484
  return {
506
485
  success: true,
507
486
  message: `Closed tab ${tabId}`,
508
- snapshot: await this.getPageSnapshot(this.viewportLimit),
487
+ snapshot: await this.getSnapshotForAction(this.viewportLimit),
509
488
  };
510
489
  } else {
511
490
  return {
@@ -519,4 +498,93 @@ export class HybridBrowserToolkit {
519
498
  return await this.session.getTabInfo();
520
499
  }
521
500
 
522
- }
501
+ async getConsoleView(): Promise<any> {
502
+ const currentLogs = await this.session.getCurrentLogs();
503
+ // Format logs
504
+ return currentLogs.map(item => ({
505
+ type: item.type(),
506
+ text: item.text(),
507
+ }));
508
+ }
509
+
510
+ async consoleExecute(code: string): Promise<any> {
511
+ const startTime = Date.now();
512
+ try {
513
+ const page = await this.session.getCurrentPage();
514
+
515
+ // Wrap the code to capture console.log output
516
+ const wrappedCode = `
517
+ (function() {
518
+ const _logs = [];
519
+ const originalLog = console.log;
520
+ console.log = function(...args) {
521
+ _logs.push(args.map(arg => {
522
+ try {
523
+ return typeof arg === 'object' ? JSON.stringify(arg) : String(arg);
524
+ } catch (e) {
525
+ return String(arg);
526
+ }
527
+ }).join(' '));
528
+ originalLog.apply(console, args);
529
+ };
530
+
531
+ let result;
532
+ try {
533
+ result = eval(${JSON.stringify(code)});
534
+ } catch (e) {
535
+ try {
536
+ result = (function() { ${code} })();
537
+ } catch (error) {
538
+ console.log = originalLog;
539
+ throw error;
540
+ }
541
+ }
542
+
543
+ console.log = originalLog;
544
+ return { result, logs: _logs };
545
+ })()
546
+ `;
547
+
548
+ const evalResult = await page.evaluate(wrappedCode) as { result: any; logs: string[] };
549
+ const { result, logs } = evalResult;
550
+
551
+ const snapshotStart = Date.now();
552
+ const snapshot = await this.getSnapshotForAction(this.viewportLimit);
553
+ const snapshotTime = Date.now() - snapshotStart;
554
+ const totalTime = Date.now() - startTime;
555
+
556
+ // Properly serialize the result
557
+ let resultStr: string;
558
+ try {
559
+ resultStr = JSON.stringify(result, null, 2);
560
+ } catch (e) {
561
+ // Fallback for non-serializable values
562
+ resultStr = String(result);
563
+ }
564
+
565
+ return {
566
+ result: `Console execution result: ${resultStr}`,
567
+ console_output: logs,
568
+ snapshot: snapshot,
569
+ timing: {
570
+ total_time_ms: totalTime,
571
+ snapshot_time_ms: snapshotTime,
572
+ },
573
+ };
574
+
575
+ } catch (error) {
576
+ const totalTime = Date.now() - startTime;
577
+ return {
578
+ result: `Console execution failed: ${error}`,
579
+ console_output: [],
580
+ snapshot: '',
581
+ timing: {
582
+ total_time_ms: totalTime,
583
+ snapshot_time_ms: 0,
584
+ },
585
+ };
586
+ }
587
+ }
588
+
589
+ }
590
+