@elizaos/computeruse 0.24.20
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Cargo.toml +34 -0
- package/build.rs +10 -0
- package/computeruse.darwin-arm64.node +0 -0
- package/index.d.ts +0 -0
- package/index.js +327 -0
- package/package.json +74 -0
- package/scripts/sync-version.js +60 -0
- package/src/desktop.rs +2763 -0
- package/src/element.rs +1341 -0
- package/src/exceptions.rs +65 -0
- package/src/lib.rs +26 -0
- package/src/locator.rs +172 -0
- package/src/selector.rs +158 -0
- package/src/types.rs +963 -0
- package/src/window_manager.rs +342 -0
- package/tests/comprehensive-ui-elements.test.js +524 -0
- package/tests/cross-app-verification.test.js +243 -0
- package/tests/desktop-verify.test.js +169 -0
- package/tests/element-chaining.test.js +158 -0
- package/tests/element-range.test.js +207 -0
- package/tests/element-scroll-into-view.test.js +256 -0
- package/tests/element-value.test.js +264 -0
- package/tests/execute-browser-script-wrapper.test.js +135 -0
- package/tests/fixtures/sample-browser-script.js +7 -0
- package/tests/fixtures/script-with-env.js +16 -0
- package/tests/locator-validate.test.js +260 -0
- package/tests/locator-waitfor.test.js +286 -0
- package/wrapper.d.ts +84 -0
- package/wrapper.js +344 -0
- package/wrapper.ts +394 -0
package/src/desktop.rs
ADDED
@@ -0,0 +1,2763 @@
use crate::types::{
    ClickResult, ClickType, ComputerUseResult, ComputerUseStep, Monitor, MonitorScreenshotPair,
    ResizedDimensions, TreeOutputFormat, VisionType, WindowTreeResult,
};
use crate::Selector;
use crate::{
    map_error, CommandOutput, Element, Locator, ScreenshotResult, TreeBuildConfig, UINode,
};
use napi::bindgen_prelude::Either;
use napi::threadsafe_function::{ThreadsafeFunction, ThreadsafeFunctionCallMode};
use napi_derive::napi;
use std::sync::{Arc, Once};
use computeruse::Desktop as ComputerUseDesktop;

/// Normalize key format to ensure curly brace syntax for special keys.
/// If key already contains `{`, assume it's correctly formatted.
/// Otherwise, wrap the entire key in `{}` to ensure it's treated as a special key press.
fn normalize_key(key: &str) -> String {
    if key.contains('{') {
        key.to_string()
    } else {
        format!("{{{}}}", key)
    }
}

/// Result of screenshot capture operations
#[derive(Default)]
struct ScreenshotPaths {
    window_path: Option<String>,
    monitor_paths: Option<Vec<String>>,
}

/// Helper to capture and save screenshots based on options
fn capture_screenshots(
    desktop: &ComputerUseDesktop,
    pid: Option<u32>,
    include_window: bool,
    include_monitors: bool,
    operation: &str,
) -> ScreenshotPaths {
    let mut result = ScreenshotPaths::default();

    if !include_window && !include_monitors {
        return result;
    }

    computeruse::screenshot_logger::init();
    let prefix = computeruse::screenshot_logger::generate_prefix(None, operation);

    if include_window {
        if let Some(pid) = pid {
            // Try to capture window screenshot by PID
            if let Ok(apps) = desktop.applications() {
                if let Some(app) = apps.into_iter().find(|a| a.process_id().ok() == Some(pid)) {
                    if let Ok(screenshot) = app.capture() {
                        if let Some(saved) = computeruse::screenshot_logger::save_window_screenshot(
                            &screenshot,
                            &prefix,
                            None,
                        ) {
                            result.window_path = Some(saved.path.to_string_lossy().to_string());
                        }
                    }
                }
            }
        }
    }

    if include_monitors {
        // Capture all monitors using futures executor for sync context
        if let Ok(monitors) = futures::executor::block_on(desktop.capture_all_monitors()) {
            let saved =
                computeruse::screenshot_logger::save_monitor_screenshots(&monitors, &prefix, None);
            if !saved.is_empty() {
                result.monitor_paths = Some(
                    saved
                        .into_iter()
                        .map(|s| s.path.to_string_lossy().to_string())
                        .collect(),
                );
            }
        }
    }

    result
}

/// Helper to find PID from process name using the shared core function.
fn find_pid_for_process(desktop: &ComputerUseDesktop, process_name: &str) -> napi::Result<u32> {
    computeruse::find_pid_for_process(desktop, process_name).map_err(map_error)
}

/// Main entry point for desktop automation.
#[napi(js_name = "Desktop")]
pub struct Desktop {
    inner: ComputerUseDesktop,
}

#[allow(clippy::needless_pass_by_value)]
#[napi]
impl Desktop {
    /// Create a new Desktop automation instance with configurable options.
    ///
    /// @param {boolean} [useBackgroundApps=false] - Enable background apps support.
    /// @param {boolean} [activateApp=false] - Enable app activation support.
    /// @param {string} [logLevel] - Logging level (e.g., 'info', 'debug', 'warn', 'error').
    ///   Falls back to RUST_LOG or COMPUTERUSE_LOG_LEVEL env vars, defaults to 'info'.
    /// @returns {Desktop} A new Desktop automation instance.
    #[napi(constructor)]
    pub fn new(
        use_background_apps: Option<bool>,
        activate_app: Option<bool>,
        log_level: Option<String>,
    ) -> Self {
        let use_background_apps = use_background_apps.unwrap_or(false);
        let activate_app = activate_app.unwrap_or(false);

        // Priority: explicit param > RUST_LOG env > COMPUTERUSE_LOG_LEVEL env > "info" default
        let log_level = log_level
            .or_else(|| std::env::var("RUST_LOG").ok())
            .or_else(|| std::env::var("COMPUTERUSE_LOG_LEVEL").ok())
            .unwrap_or_else(|| "info".to_string());

        static INIT: Once = Once::new();
        INIT.call_once(|| {
            let _ = tracing_subscriber::fmt()
                .with_env_filter(log_level)
                .with_ansi(false) // Disable ANSI color codes for cleaner output
                .try_init();
        });
        let desktop = ComputerUseDesktop::new(use_background_apps, activate_app)
            .expect("Failed to create Desktop instance");
        Desktop { inner: desktop }
    }
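A minimal construction sketch in TypeScript, assuming the generated bindings export this class as `Desktop` from '@elizaos/computeruse' and surface the camelCase names used in the JSDoc above (illustrative only, not part of this file):

import { Desktop } from '@elizaos/computeruse';

// All three constructor arguments are optional (defaults: false, false, 'info').
const desktop = new Desktop(undefined, undefined, 'debug');
const apps = desktop.applications();
console.log(`${apps.length} running applications`);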

    /// Get the root UI element of the desktop.
    ///
    /// @returns {Element} The root UI element.
    #[napi]
    pub fn root(&self) -> Element {
        let root = self.inner.root();
        Element::from(root)
    }

    /// Get a list of all running applications.
    ///
    /// @returns {Array<Element>} List of application UI elements.
    #[napi]
    pub fn applications(&self) -> napi::Result<Vec<Element>> {
        self.inner
            .applications()
            .map(|apps| apps.into_iter().map(Element::from).collect())
            .map_err(map_error)
    }

    /// Get a running application by name.
    ///
    /// @param {string} name - The name of the application to find.
    /// @returns {Element} The application UI element.
    #[napi]
    pub fn application(&self, name: String) -> napi::Result<Element> {
        self.inner
            .application(&name)
            .map(Element::from)
            .map_err(map_error)
    }

    /// Open an application by name.
    ///
    /// @param {string} name - The name of the application to open.
    /// @param {boolean} [includeWindowScreenshot=true] - Whether to capture window screenshot after opening
    /// @param {boolean} [includeMonitorScreenshots=false] - Whether to capture monitor screenshots after opening
    #[napi]
    pub fn open_application(
        &self,
        name: String,
        include_window_screenshot: Option<bool>,
        include_monitor_screenshots: Option<bool>,
    ) -> napi::Result<Element> {
        let element = self.inner.open_application(&name).map_err(map_error)?;

        // Capture screenshots if enabled (window default: true, monitor default: false)
        let _screenshots = capture_screenshots(
            &self.inner,
            element.process_id().ok(),
            include_window_screenshot.unwrap_or(true),
            include_monitor_screenshots.unwrap_or(false),
            "openApplication",
        );

        Ok(Element::from(element))
    }

    /// Activate an application by name.
    ///
    /// @param {string} name - The name of the application to activate.
    #[napi]
    pub fn activate_application(&self, name: String) -> napi::Result<()> {
        self.inner.activate_application(&name).map_err(map_error)
    }

    /// Click within element bounds at a specified position.
    ///
    /// This is useful for clicking on elements from UI tree, OCR, omniparser, gemini vision, or DOM
    /// without needing an element reference - just the bounds.
    ///
    /// @param {number} x - X coordinate of the bounds.
    /// @param {number} y - Y coordinate of the bounds.
    /// @param {number} width - Width of the bounds.
    /// @param {number} height - Height of the bounds.
    /// @param {number} [xPercentage=50] - X position within bounds as percentage (0-100). Defaults to 50 (center).
    /// @param {number} [yPercentage=50] - Y position within bounds as percentage (0-100). Defaults to 50 (center).
    /// @param {ClickType} [clickType='left'] - Type of click: 'left', 'double', or 'right'.
    /// @param {boolean} [restoreCursor=true] - If true, restore cursor to original position after clicking.
    /// @param {string} [process] - Process name for window screenshot capture. If provided, enables window screenshots.
    /// @param {boolean} [includeWindowScreenshot=true] - Whether to capture window screenshot (requires process).
    /// @param {boolean} [includeMonitorScreenshots=false] - Whether to capture monitor screenshots after clicking.
    /// @returns {ClickResult} Result with clicked coordinates and method details.
    #[napi]
    #[allow(clippy::too_many_arguments)]
    pub fn click_at_bounds(
        &self,
        x: f64,
        y: f64,
        width: f64,
        height: f64,
        x_percentage: Option<u8>,
        y_percentage: Option<u8>,
        click_type: Option<ClickType>,
        restore_cursor: Option<bool>,
        process: Option<String>,
        include_window_screenshot: Option<bool>,
        include_monitor_screenshots: Option<bool>,
    ) -> napi::Result<ClickResult> {
        let bounds = (x, y, width, height);
        let click_position = match (x_percentage, y_percentage) {
            (Some(xp), Some(yp)) => Some((xp, yp)),
            (Some(xp), None) => Some((xp, 50)),
            (None, Some(yp)) => Some((50, yp)),
            (None, None) => None,
        };
        let click_type = click_type.unwrap_or(ClickType::Left);
        let restore_cursor = restore_cursor.unwrap_or(true);

        let result = self
            .inner
            .click_at_bounds(bounds, click_position, click_type.into(), restore_cursor)
            .map(ClickResult::from)
            .map_err(map_error);

        // Get PID from process name if provided
        let pid = process
            .as_ref()
            .and_then(|p| find_pid_for_process(&self.inner, p).ok());

        // Capture screenshots if requested
        let _screenshots = capture_screenshots(
            &self.inner,
            pid,
            include_window_screenshot.unwrap_or(true) && pid.is_some(),
            include_monitor_screenshots.unwrap_or(false),
            "clickAtBounds",
        );

        result
    }
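clickAtBounds lets a caller click from raw bounds (for example, bounds taken from OCR or DOM output) without resolving an element first. A TypeScript sketch with made-up coordinates, assuming the same camelCase bindings as above:

import { Desktop } from '@elizaos/computeruse';

const desktop = new Desktop();
// Click the center of a 200x40 region whose top-left corner is at (100, 300);
// omitted arguments fall back to the documented defaults.
const click = desktop.clickAtBounds(100, 300, 200, 40);
console.log(click);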

    /// Click on an element by its index from the last tree/vision query.
    ///
    /// This looks up cached bounds from the appropriate cache based on visionType,
    /// then clicks at the specified position within those bounds.
    ///
    /// @param {number} index - 1-based index from the tree/vision output (e.g., #1, #2).
    /// @param {VisionType} [visionType='UiTree'] - Source of the index: 'UiTree', 'Ocr', 'Omniparser', 'Gemini', or 'Dom'.
    /// @param {number} [xPercentage=50] - X position within bounds as percentage (0-100).
    /// @param {number} [yPercentage=50] - Y position within bounds as percentage (0-100).
    /// @param {ClickType} [clickType='Left'] - Type of click: 'Left', 'Double', or 'Right'.
    /// @param {boolean} [restoreCursor=true] - If true, restore cursor to original position after clicking.
    /// @param {string} [process] - Process name for window screenshot capture. If provided, enables window screenshots.
    /// @param {boolean} [includeWindowScreenshot=true] - Whether to capture window screenshot (requires process).
    /// @param {boolean} [includeMonitorScreenshots=false] - Whether to capture monitor screenshots after clicking.
    /// @returns {ClickResult} Result with clicked coordinates, element info, and method details.
    #[napi]
    #[allow(clippy::too_many_arguments)]
    pub fn click_by_index(
        &self,
        index: u32,
        vision_type: Option<VisionType>,
        x_percentage: Option<u8>,
        y_percentage: Option<u8>,
        click_type: Option<ClickType>,
        restore_cursor: Option<bool>,
        process: Option<String>,
        include_window_screenshot: Option<bool>,
        include_monitor_screenshots: Option<bool>,
    ) -> napi::Result<ClickResult> {
        let vision_type = vision_type.unwrap_or(VisionType::UiTree);
        let click_position = match (x_percentage, y_percentage) {
            (Some(xp), Some(yp)) => Some((xp, yp)),
            (Some(xp), None) => Some((xp, 50)),
            (None, Some(yp)) => Some((50, yp)),
            (None, None) => None,
        };
        let click_type = click_type.unwrap_or(ClickType::Left);
        let restore_cursor = restore_cursor.unwrap_or(true);

        let result = self
            .inner
            .click_by_index(
                index,
                vision_type.into(),
                click_position,
                click_type.into(),
                restore_cursor,
            )
            .map(ClickResult::from)
            .map_err(map_error);

        // Get PID from process name if provided
        let pid = process
            .as_ref()
            .and_then(|p| find_pid_for_process(&self.inner, p).ok());

        // Capture screenshots if requested
        let _screenshots = capture_screenshots(
            &self.inner,
            pid,
            include_window_screenshot.unwrap_or(true) && pid.is_some(),
            include_monitor_screenshots.unwrap_or(false),
            "clickByIndex",
        );

        result
    }
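The index-based variant reads bounds out of whichever cache the last tree/vision call populated. A TypeScript sketch, assuming a prior query has produced a #3 entry and that omitted optional arguments take the documented defaults:

import { Desktop } from '@elizaos/computeruse';

const desktop = new Desktop();
// Click element #3 from the most recent UI-tree output (the default visionType).
const click = desktop.clickByIndex(3);
console.log(click);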

    /// (async) Run a shell command.
    ///
    /// @param {string} [windowsCommand] - Command to run on Windows.
    /// @param {string} [unixCommand] - Command to run on Unix.
    /// @returns {Promise<CommandOutput>} The command output.
    #[napi]
    pub async fn run_command(
        &self,
        windows_command: Option<String>,
        unix_command: Option<String>,
    ) -> napi::Result<CommandOutput> {
        self.inner
            .run_command(windows_command.as_deref(), unix_command.as_deref())
            .await
            .map(|r| CommandOutput {
                exit_status: r.exit_status,
                stdout: r.stdout,
                stderr: r.stderr,
            })
            .map_err(map_error)
    }

    /// (async) Execute a shell command using GitHub Actions-style syntax.
    ///
    /// @param {string} command - The command to run (can be single or multi-line).
    /// @param {string} [shell] - Optional shell to use (defaults to PowerShell on Windows, bash on Unix).
    /// @param {string} [workingDirectory] - Optional working directory for the command.
    /// @returns {Promise<CommandOutput>} The command output.
    #[napi]
    pub async fn run(
        &self,
        command: String,
        shell: Option<String>,
        working_directory: Option<String>,
    ) -> napi::Result<CommandOutput> {
        self.inner
            .run(
                command.as_str(),
                shell.as_deref(),
                working_directory.as_deref(),
            )
            .await
            .map(|r| CommandOutput {
                exit_status: r.exit_status,
                stdout: r.stdout,
                stderr: r.stderr,
            })
            .map_err(map_error)
    }
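Both shell helpers are async on the JavaScript side. A TypeScript sketch, assuming camelCase method and field names (e.g. exitStatus for exit_status):

import { Desktop } from '@elizaos/computeruse';

const desktop = new Desktop();
// runCommand picks the platform-specific command; run takes a single command
// plus an optional shell and working directory.
const out = await desktop.runCommand('dir', 'ls -la');
console.log(out.exitStatus, out.stdout);
const echoed = await desktop.run('echo hello', undefined, '.');
console.log(echoed.stdout);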

    /// (async) Perform OCR on an image file.
    ///
    /// @param {string} imagePath - Path to the image file.
    /// @returns {Promise<string>} The extracted text.
    #[napi]
    pub async fn ocr_image_path(&self, image_path: String) -> napi::Result<String> {
        self.inner
            .ocr_image_path(&image_path)
            .await
            .map_err(map_error)
    }

    /// (async) Perform OCR on a screenshot.
    ///
    /// @param {ScreenshotResult} screenshot - The screenshot to process.
    /// @returns {Promise<string>} The extracted text.
    #[napi]
    pub async fn ocr_screenshot(&self, screenshot: ScreenshotResult) -> napi::Result<String> {
        let rust_screenshot = screenshot.to_inner();
        self.inner
            .ocr_screenshot(&rust_screenshot)
            .await
            .map_err(map_error)
    }

    /// (async) Perform OCR on a window by process name and return structured results with bounding boxes.
    /// Returns an OcrResult containing the OCR tree, formatted output, and index-to-bounds mapping
    /// for click targeting.
    ///
    /// @param {string} process - Process name to match (e.g., 'chrome', 'notepad').
    /// @param {boolean} [formatOutput=true] - Whether to generate formatted compact YAML output.
    /// @returns {Promise<OcrResult>} Complete OCR result with tree, formatted output, and bounds mapping.
    #[napi]
    #[cfg(target_os = "windows")]
    pub async fn perform_ocr_for_process(
        &self,
        process: String,
        format_output: Option<bool>,
    ) -> napi::Result<crate::types::OcrResult> {
        let format_output = format_output.unwrap_or(true);

        // Find PID for the process name
        let pid = find_pid_for_process(&self.inner, &process)?;

        // Find the application element by PID
        let apps = self.inner.applications().map_err(map_error)?;
        let window_element = apps
            .into_iter()
            .find(|app| app.process_id().ok() == Some(pid))
            .ok_or_else(|| {
                napi::Error::from_reason(format!("No window found for process '{}'", process))
            })?;

        // Get window bounds (absolute screen coordinates)
        let bounds = window_element.bounds().map_err(map_error)?;
        let (window_x, window_y, win_w, win_h) = bounds;

        // Capture screenshot of the window
        let screenshot = window_element.capture().map_err(map_error)?;

        // Calculate DPI scale factors (physical screenshot pixels / logical window size)
        let dpi_scale_w = screenshot.width as f64 / win_w;
        let dpi_scale_h = screenshot.height as f64 / win_h;

        // Perform OCR with bounding boxes
        let ocr_element = self
            .inner
            .ocr_screenshot_with_bounds(&screenshot, window_x, window_y, dpi_scale_w, dpi_scale_h)
            .map_err(map_error)?;

        // Format the OCR tree if requested
        let (formatted, index_to_bounds) = if format_output {
            let result = computeruse::format_ocr_tree_as_compact_yaml(&ocr_element, 0);

            // Populate the OCR cache for click_by_index support
            self.inner
                .populate_ocr_cache(result.index_to_bounds.clone());

            let bounds_map: std::collections::HashMap<String, crate::types::OcrBoundsEntry> =
                result
                    .index_to_bounds
                    .into_iter()
                    .map(|(idx, (text, (x, y, w, h)))| {
                        (
                            idx.to_string(),
                            crate::types::OcrBoundsEntry {
                                text,
                                bounds: crate::types::Bounds {
                                    x,
                                    y,
                                    width: w,
                                    height: h,
                                },
                            },
                        )
                    })
                    .collect();
            (Some(result.formatted), bounds_map)
        } else {
            (None, std::collections::HashMap::new())
        };

        let element_count = index_to_bounds.len() as u32;

        Ok(crate::types::OcrResult {
            tree: crate::types::OcrElement::from(ocr_element),
            formatted,
            index_to_bounds,
            element_count,
        })
    }

    /// (async) Perform OCR on a window by process name (non-Windows stub).
    #[napi]
    #[cfg(not(target_os = "windows"))]
    pub async fn perform_ocr_for_process(
        &self,
        _process: String,
        _format_output: Option<bool>,
    ) -> napi::Result<crate::types::OcrResult> {
        Err(napi::Error::from_reason(
            "OCR with bounding boxes is currently only supported on Windows",
        ))
    }
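On Windows the structured OCR call returns both a formatted listing and the index-to-bounds map that backs clickByIndex; elsewhere it rejects as in the stub above. A TypeScript sketch, assuming camelCase names (e.g. elementCount for element_count):

import { Desktop } from '@elizaos/computeruse';

const desktop = new Desktop();
const ocr = await desktop.performOcrForProcess('notepad');
console.log(ocr.formatted);      // compact YAML with #1, #2, ... indices
console.log(ocr.elementCount, 'text regions');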

    /// (async) Capture DOM elements from the current browser tab.
    ///
    /// Extracts visible DOM elements with their properties and screen coordinates.
    /// Uses JavaScript injection via Chrome extension to traverse the DOM tree.
    ///
    /// @param {number} [maxElements=200] - Maximum number of elements to capture.
    /// @param {boolean} [formatOutput=true] - Whether to include formatted compact YAML output.
    /// @returns {Promise<BrowserDomResult>} DOM elements with bounds for click targeting.
    #[napi]
    pub async fn capture_browser_dom(
        &self,
        max_elements: Option<u32>,
        format_output: Option<bool>,
    ) -> napi::Result<crate::types::BrowserDomResult> {
        use std::collections::HashMap;
        use std::time::Duration;

        let max_elements = max_elements.unwrap_or(200);
        let format_output = format_output.unwrap_or(true);

        // Get viewport offset from Document element (more reliable than JS due to DPI scaling)
        let viewport_offset = match self
            .inner
            .locator("role:Document")
            .first(Some(Duration::from_millis(2000)))
            .await
        {
            Ok(doc_element) => match doc_element.bounds() {
                Ok((x, y, _w, _h)) => (x, y),
                Err(_) => (0.0, 0.0),
            },
            Err(_) => (0.0, 0.0),
        };

        // JavaScript to extract visible DOM elements
        let script = format!(
            r#"
(function() {{
    const elements = [];
    const maxElements = {max_elements};

    const walker = document.createTreeWalker(
        document.body,
        NodeFilter.SHOW_ELEMENT,
        {{
            acceptNode: function(node) {{
                const style = window.getComputedStyle(node);
                const rect = node.getBoundingClientRect();

                if (style.display === 'none' ||
                    style.visibility === 'hidden' ||
                    style.opacity === '0' ||
                    rect.width === 0 ||
                    rect.height === 0) {{
                    return NodeFilter.FILTER_SKIP;
                }}

                return NodeFilter.FILTER_ACCEPT;
            }}
        }}
    );

    let node;
    while (node = walker.nextNode()) {{
        if (elements.length >= maxElements) {{
            break;
        }}

        const rect = node.getBoundingClientRect();
        const text = node.innerText ? node.innerText.substring(0, 100).trim() : null;

        elements.push({{
            tag: node.tagName.toLowerCase(),
            id: node.id || null,
            classes: Array.from(node.classList),
            text: text,
            href: node.href || null,
            type: node.type || null,
            name: node.name || null,
            value: node.value || null,
            placeholder: node.placeholder || null,
            aria_label: node.getAttribute('aria-label'),
            role: node.getAttribute('role'),
            x: Math.round(rect.x * window.devicePixelRatio),
            y: Math.round(rect.y * window.devicePixelRatio),
            width: Math.round(rect.width * window.devicePixelRatio),
            height: Math.round(rect.height * window.devicePixelRatio)
        }});
    }}

    return JSON.stringify({{
        elements: elements,
        total_found: elements.length,
        page_url: window.location.href,
        page_title: document.title,
        devicePixelRatio: window.devicePixelRatio
    }});
}})()"#
        );

        let result_str = self
            .inner
            .execute_browser_script(&script)
            .await
            .map_err(map_error)?;

        let parsed: serde_json::Value = serde_json::from_str(&result_str)
            .map_err(|e| napi::Error::from_reason(format!("Failed to parse DOM result: {e}")))?;

        let page_url = parsed
            .get("page_url")
            .and_then(|v| v.as_str())
            .unwrap_or("")
            .to_string();
        let page_title = parsed
            .get("page_title")
            .and_then(|v| v.as_str())
            .unwrap_or("")
            .to_string();

        let raw_elements = parsed
            .get("elements")
            .and_then(|v| v.as_array())
            .cloned()
            .unwrap_or_default();

        // Convert to BrowserDomElement and build index_to_bounds
        let mut elements = Vec::new();
        let mut index_to_bounds: HashMap<String, crate::types::DomBoundsEntry> = HashMap::new();
        let mut formatted_lines: Vec<String> = Vec::new();

        if format_output {
            formatted_lines.push(format!(
                "Browser DOM: {} elements (url: {}, title: {})",
                raw_elements.len(),
                page_url,
                page_title
            ));
        }

        for (i, elem) in raw_elements.iter().enumerate() {
            let idx = i + 1;
            let tag = elem
                .get("tag")
                .and_then(|v| v.as_str())
                .unwrap_or("")
                .to_string();
            let id = elem.get("id").and_then(|v| v.as_str()).map(String::from);
            let classes: Vec<String> = elem
                .get("classes")
                .and_then(|v| v.as_array())
                .map(|arr| {
                    arr.iter()
                        .filter_map(|c| c.as_str().map(String::from))
                        .collect()
                })
                .unwrap_or_default();
            let text = elem.get("text").and_then(|v| v.as_str()).map(String::from);
            let href = elem.get("href").and_then(|v| v.as_str()).map(String::from);
            let r#type = elem.get("type").and_then(|v| v.as_str()).map(String::from);
            let name = elem.get("name").and_then(|v| v.as_str()).map(String::from);
            let value = elem.get("value").and_then(|v| v.as_str()).map(String::from);
            let placeholder = elem
                .get("placeholder")
                .and_then(|v| v.as_str())
                .map(String::from);
            let aria_label = elem
                .get("aria_label")
                .and_then(|v| v.as_str())
                .map(String::from);
            let role = elem.get("role").and_then(|v| v.as_str()).map(String::from);

            // Build bounds with viewport offset added
            let x = elem.get("x").and_then(|v| v.as_f64()).unwrap_or(0.0) + viewport_offset.0;
            let y = elem.get("y").and_then(|v| v.as_f64()).unwrap_or(0.0) + viewport_offset.1;
            let width = elem.get("width").and_then(|v| v.as_f64()).unwrap_or(0.0);
            let height = elem.get("height").and_then(|v| v.as_f64()).unwrap_or(0.0);

            let bounds = crate::types::Bounds {
                x,
                y,
                width,
                height,
            };

            // Display name for index_to_bounds
            let display_name = text
                .as_ref()
                .filter(|t| !t.is_empty())
                .cloned()
                .or_else(|| aria_label.clone())
                .or_else(|| placeholder.clone())
                .or_else(|| name.clone())
                .or_else(|| id.clone())
                .unwrap_or_else(|| format!("<{}>", tag));

            // Format line for compact YAML
            if format_output {
                let mut line_parts = vec![format!("#{} [{}]", idx, tag.to_uppercase())];
                if let Some(ref t) = text {
                    if !t.is_empty() {
                        let truncated = if t.len() > 40 {
                            format!("{}...", &t[..40])
                        } else {
                            t.clone()
                        };
                        line_parts.push(truncated);
                    }
                }
                if let Some(ref a) = aria_label {
                    line_parts.push(format!("aria:{}", a));
                }
                if let Some(ref r) = role {
                    line_parts.push(format!("role:{}", r));
                }
                formatted_lines.push(format!(" {}", line_parts.join(" ")));
            }

            index_to_bounds.insert(
                idx.to_string(),
                crate::types::DomBoundsEntry {
                    name: display_name,
                    tag: tag.clone(),
                    bounds: bounds.clone(),
                },
            );

            elements.push(crate::types::BrowserDomElement {
                tag,
                id,
                classes,
                text,
                href,
                r#type,
                name,
                value,
                placeholder,
                aria_label,
                role,
                bounds,
            });
        }

        // Populate DOM cache for click_by_index
        #[allow(clippy::type_complexity)]
        let cache_items: std::collections::HashMap<
            u32,
            (String, String, (f64, f64, f64, f64)),
        > = index_to_bounds
            .iter()
            .filter_map(|(key, entry)| {
                key.parse::<u32>().ok().map(|idx| {
                    (
                        idx,
                        (
                            entry.name.clone(),
                            entry.tag.clone(),
                            (
                                entry.bounds.x,
                                entry.bounds.y,
                                entry.bounds.width,
                                entry.bounds.height,
                            ),
                        ),
                    )
                })
            })
            .collect();
        self.inner.populate_dom_cache(cache_items);

        Ok(crate::types::BrowserDomResult {
            elements,
            formatted: if format_output {
                Some(formatted_lines.join("\n"))
            } else {
                None
            },
            index_to_bounds,
            element_count: raw_elements.len() as u32,
            page_url,
            page_title,
        })
    }
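A TypeScript sketch of DOM capture against the active Chromium tab (requires the browser-script bridge described above; field names assumed camelCase, e.g. pageTitle, elementCount):

import { Desktop } from '@elizaos/computeruse';

const desktop = new Desktop();
const dom = await desktop.captureBrowserDom(50);
console.log(dom.pageTitle, dom.elementCount);
console.log(dom.formatted);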

    /// (async) Get a clustered tree combining elements from multiple sources grouped by spatial proximity.
    ///
    /// Combines accessibility tree (UIA) elements with optional DOM, Omniparser, and Gemini Vision elements,
    /// clustering nearby elements together. Each element is prefixed with its source:
    /// - #u1, #u2... for UIA (accessibility tree)
    /// - #d1, #d2... for DOM (browser content)
    /// - #p1, #p2... for Omniparser (vision AI detection)
    /// - #g1, #g2... for Gemini Vision (AI element detection)
    ///
    /// @param {string} process - Process name to match (e.g., 'chrome', 'notepad').
    /// @param {number} [maxDomElements=100] - Maximum DOM elements to capture for browsers.
    /// @param {boolean} [includeOmniparser=false] - Whether to include Omniparser vision detection.
    /// @param {boolean} [includeGeminiVision=false] - Whether to include Gemini Vision AI detection.
    /// @returns {Promise<ClusteredFormattingResult>} Clustered tree with prefixed indices.
    #[napi]
    pub async fn get_clustered_tree(
        &self,
        process: String,
        max_dom_elements: Option<u32>,
        include_omniparser: Option<bool>,
        include_gemini_vision: Option<bool>,
    ) -> napi::Result<crate::types::ClusteredFormattingResult> {
        use std::collections::HashMap;

        // Find PID for the process name
        let pid = find_pid_for_process(&self.inner, &process)?;

        let max_dom_elements = max_dom_elements.unwrap_or(100);
        let include_omniparser = include_omniparser.unwrap_or(false);
        let include_gemini_vision = include_gemini_vision.unwrap_or(false);

        // Get UIA tree with bounds
        let uia_result = self
            .inner
            .get_window_tree_result(pid, None, None)
            .map_err(map_error)?;

        // Build UIA bounds cache: HashMap<u32, (role, name, bounds, selector)>
        #[allow(clippy::type_complexity)]
        let mut uia_bounds: HashMap<
            u32,
            (String, String, (f64, f64, f64, f64), Option<String>),
        > = HashMap::new();

        // Use the formatted result to extract bounds
        let formatted_result = computeruse::format_ui_node_as_compact_yaml(&uia_result.tree, 0);
        for (idx, (role, name, bounds, selector)) in formatted_result.index_to_bounds {
            uia_bounds.insert(idx, (role, name, bounds, selector));
        }

        // Check if this is a browser
        let is_browser = computeruse::is_browser_process(pid);

        // Build DOM bounds cache: HashMap<u32, (tag, identifier, bounds)>
        #[allow(clippy::type_complexity)]
        let mut dom_bounds: HashMap<u32, (String, String, (f64, f64, f64, f64))> = HashMap::new();

        if is_browser {
            // Try to capture DOM elements
            match self
                .capture_browser_dom(Some(max_dom_elements), Some(true))
                .await
            {
                Ok(dom_result) => {
                    for (idx_str, entry) in dom_result.index_to_bounds {
                        if let Ok(idx) = idx_str.parse::<u32>() {
                            let bounds = (
                                entry.bounds.x,
                                entry.bounds.y,
                                entry.bounds.width,
                                entry.bounds.height,
                            );
                            dom_bounds.insert(idx, (entry.tag, entry.name, bounds));
                        }
                    }
                }
                Err(_) => {
                    // DOM capture failed (e.g., chrome:// page), continue with UIA only
                }
            }
        }

        // Build Omniparser items cache if requested
        let mut omniparser_items: HashMap<u32, computeruse::OmniparserItem> = HashMap::new();

        if include_omniparser {
            match self
                .perform_omniparser_for_process(process.clone(), None, Some(true))
                .await
            {
                Ok(omni_result) => {
                    for (idx_str, entry) in omni_result.index_to_bounds {
                        if let Ok(idx) = idx_str.parse::<u32>() {
                            omniparser_items.insert(
                                idx,
                                computeruse::OmniparserItem {
                                    label: entry.label.clone(),
                                    content: Some(entry.name.clone()),
                                    box_2d: Some([
                                        entry.bounds.x,
                                        entry.bounds.y,
                                        entry.bounds.x + entry.bounds.width,
                                        entry.bounds.y + entry.bounds.height,
                                    ]),
                                },
                            );
                        }
                    }
                }
                Err(_) => {
                    // Omniparser failed, continue without it
                }
            }
        }

        // Build Gemini Vision items cache if requested
        let mut vision_items: HashMap<u32, computeruse::VisionElement> = HashMap::new();

        if include_gemini_vision {
            match self
                .perform_gemini_vision_for_process(process.clone(), Some(true))
                .await
            {
                Ok(vision_result) => {
                    for (idx_str, entry) in vision_result.index_to_bounds {
                        if let Ok(idx) = idx_str.parse::<u32>() {
                            vision_items.insert(
                                idx,
                                computeruse::VisionElement {
                                    element_type: entry.element_type.clone(),
                                    content: Some(entry.name.clone()),
                                    description: None,
                                    box_2d: Some([
                                        entry.bounds.x,
                                        entry.bounds.y,
                                        entry.bounds.x + entry.bounds.width,
                                        entry.bounds.y + entry.bounds.height,
                                    ]),
                                    interactivity: None,
                                },
                            );
                        }
                    }
                }
                Err(_) => {
                    // Gemini Vision failed, continue without it
                }
            }
        }

        // Empty cache for OCR (not implemented yet)
        #[allow(clippy::type_complexity)]
        let ocr_bounds: HashMap<u32, (String, (f64, f64, f64, f64))> = HashMap::new();

        // Call the core clustering function
        let clustered_result = computeruse::format_clustered_tree_from_caches(
            &uia_bounds,
            &dom_bounds,
            &ocr_bounds,
            &omniparser_items,
            &vision_items,
        );

        // Convert to SDK types
        let mut index_to_source_and_bounds: HashMap<String, crate::types::ClusteredBoundsEntry> =
            HashMap::new();

        for (key, (source, original_idx, (x, y, w, h))) in
            clustered_result.index_to_source_and_bounds
        {
            let sdk_source = match source {
                computeruse::ElementSource::Uia => crate::types::ElementSource::Uia,
                computeruse::ElementSource::Dom => crate::types::ElementSource::Dom,
                computeruse::ElementSource::Ocr => crate::types::ElementSource::Ocr,
                computeruse::ElementSource::Omniparser => crate::types::ElementSource::Omniparser,
                computeruse::ElementSource::Gemini => crate::types::ElementSource::Gemini,
            };
            index_to_source_and_bounds.insert(
                key,
                crate::types::ClusteredBoundsEntry {
                    source: sdk_source,
                    original_index: original_idx,
                    bounds: crate::types::Bounds {
                        x,
                        y,
                        width: w,
                        height: h,
                    },
                },
            );
        }

        Ok(crate::types::ClusteredFormattingResult {
            formatted: clustered_result.formatted,
            index_to_source_and_bounds,
        })
    }
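A TypeScript sketch of the clustered view, which merges the UIA tree with DOM elements for browsers and, optionally, Omniparser and Gemini Vision detections (both disabled here; camelCase bindings assumed):

import { Desktop } from '@elizaos/computeruse';

const desktop = new Desktop();
const clustered = await desktop.getClusteredTree('chrome', 100, false, false);
console.log(clustered.formatted); // entries prefixed #u, #d, #p, #g by source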

    /// (async) Perform Gemini vision AI detection on a window by process name.
    ///
    /// Captures a screenshot and sends it to the Gemini vision backend for UI element detection.
    /// Requires GEMINI_VISION_BACKEND_URL environment variable (defaults to https://app.mediar.ai/api/vision/parse).
    ///
    /// @param {string} process - Process name to match (e.g., 'chrome', 'notepad').
    /// @param {boolean} [formatOutput=true] - Whether to include formatted compact YAML output.
    /// @returns {Promise<GeminiVisionResult>} Detected UI elements with bounds for click targeting.
    #[napi]
    pub async fn perform_gemini_vision_for_process(
        &self,
        process: String,
        format_output: Option<bool>,
    ) -> napi::Result<crate::types::GeminiVisionResult> {
        use base64::{engine::general_purpose, Engine};
        use image::imageops::FilterType;
        use image::{codecs::png::PngEncoder, ExtendedColorType, ImageBuffer, ImageEncoder, Rgba};
        use std::collections::HashMap;
        use std::io::Cursor;

        let format_output = format_output.unwrap_or(true);

        // Find PID for the process name
        let pid = find_pid_for_process(&self.inner, &process)?;

        // Find the window element for this process
        let apps = self.inner.applications().map_err(map_error)?;
        let window_element = apps
            .into_iter()
            .find(|app| app.process_id().ok() == Some(pid))
            .ok_or_else(|| {
                napi::Error::from_reason(format!("No window found for process '{}'", process))
            })?;

        // Get window bounds
        let bounds = window_element.bounds().map_err(map_error)?;
        let (window_x, window_y, win_w, win_h) = bounds;

        // Capture screenshot
        let screenshot = window_element.capture().map_err(map_error)?;
        let original_width = screenshot.width;
        let original_height = screenshot.height;

        // Calculate DPI scale
        let dpi_scale_w = original_width as f64 / win_w;
        let dpi_scale_h = original_height as f64 / win_h;

        // Convert BGRA to RGBA
        let rgba_data: Vec<u8> = screenshot
            .image_data
            .chunks_exact(4)
            .flat_map(|bgra| [bgra[2], bgra[1], bgra[0], bgra[3]])
            .collect();

        // Resize if needed (max 1920px)
        const MAX_DIM: u32 = 1920;
        let (final_width, final_height, final_rgba_data, scale_factor) = if original_width > MAX_DIM
            || original_height > MAX_DIM
        {
            let scale = (MAX_DIM as f32 / original_width.max(original_height) as f32).min(1.0);
            let new_width = (original_width as f32 * scale).round() as u32;
            let new_height = (original_height as f32 * scale).round() as u32;

            let img =
                ImageBuffer::<Rgba<u8>, _>::from_raw(original_width, original_height, rgba_data)
                    .ok_or_else(|| napi::Error::from_reason("Failed to create image buffer"))?;

            let resized =
                image::imageops::resize(&img, new_width, new_height, FilterType::Lanczos3);
            (new_width, new_height, resized.into_raw(), scale as f64)
        } else {
            (original_width, original_height, rgba_data, 1.0)
        };

        // Encode to PNG
        let mut png_data = Vec::new();
        let encoder = PngEncoder::new(Cursor::new(&mut png_data));
        encoder
            .write_image(
                &final_rgba_data,
                final_width,
                final_height,
                ExtendedColorType::Rgba8,
            )
            .map_err(|e| napi::Error::from_reason(format!("Failed to encode PNG: {e}")))?;

        let base64_image = general_purpose::STANDARD.encode(&png_data);

        // Call Gemini Vision backend
        let backend_url = std::env::var("GEMINI_VISION_BACKEND_URL")
            .unwrap_or_else(|_| "https://app.mediar.ai/api/vision/parse".to_string());

        let client = reqwest::Client::builder()
            .timeout(std::time::Duration::from_secs(300))
            .build()
            .map_err(|e| napi::Error::from_reason(format!("Failed to create HTTP client: {e}")))?;

        let payload = serde_json::json!({
            "image": base64_image,
            "model": "gemini",
            "prompt": "Detect all UI elements in this screenshot. Return their type, content, description, bounding boxes, and interactivity."
        });

        let resp = client
            .post(&backend_url)
            .header("Content-Type", "application/json")
            .json(&payload)
            .send()
            .await
            .map_err(|e| napi::Error::from_reason(format!("Vision backend request failed: {e}")))?;

        if !resp.status().is_success() {
            let text = resp.text().await.unwrap_or_default();
            return Err(napi::Error::from_reason(format!(
                "Vision backend error: {}",
                text
            )));
        }

        let response_text = resp
            .text()
            .await
            .map_err(|e| napi::Error::from_reason(format!("Failed to read response: {e}")))?;

        let parsed: serde_json::Value = serde_json::from_str(&response_text)
            .map_err(|e| napi::Error::from_reason(format!("Failed to parse response: {e}")))?;

        if let Some(error) = parsed.get("error").and_then(|v| v.as_str()) {
            return Err(napi::Error::from_reason(format!("Vision error: {}", error)));
        }

        let raw_elements = parsed
            .get("elements")
            .and_then(|v| v.as_array())
            .cloned()
            .unwrap_or_default();

        // Convert to VisionElement with absolute screen coordinates
        let mut elements = Vec::new();
        let mut index_to_bounds: HashMap<String, crate::types::VisionBoundsEntry> = HashMap::new();
        let mut formatted_lines: Vec<String> = Vec::new();

        if format_output {
            formatted_lines.push(format!(
                "Gemini Vision: {} elements (PID: {})",
                raw_elements.len(),
                pid
            ));
        }

        let inv_scale = 1.0 / scale_factor;

        for (i, elem) in raw_elements.iter().enumerate() {
            let idx = i + 1;
            let element_type = elem
                .get("type")
                .and_then(|v| v.as_str())
                .unwrap_or("unknown")
                .to_string();
            let content = elem
                .get("content")
                .and_then(|v| v.as_str())
                .filter(|s| !s.is_empty())
                .map(String::from);
            let description = elem
                .get("description")
                .and_then(|v| v.as_str())
                .filter(|s| !s.is_empty())
                .map(String::from);
            let interactivity = elem.get("interactivity").and_then(|v| v.as_bool());

            // Get normalized bbox [x1, y1, x2, y2] from 0-1
            let bbox = elem.get("bbox").and_then(|v| v.as_array());
            let bounds = bbox.and_then(|arr| {
                if arr.len() >= 4 {
                    let x1 = arr[0].as_f64()? * final_width as f64;
                    let y1 = arr[1].as_f64()? * final_height as f64;
                    let x2 = arr[2].as_f64()? * final_width as f64;
                    let y2 = arr[3].as_f64()? * final_height as f64;

                    // Scale back to original size and convert to logical screen coords
                    let abs_x = window_x + (x1 * inv_scale / dpi_scale_w);
                    let abs_y = window_y + (y1 * inv_scale / dpi_scale_h);
                    let abs_w = (x2 - x1) * inv_scale / dpi_scale_w;
                    let abs_h = (y2 - y1) * inv_scale / dpi_scale_h;

                    Some(crate::types::Bounds {
                        x: abs_x,
                        y: abs_y,
                        width: abs_w,
                        height: abs_h,
                    })
                } else {
                    None
                }
            });

            // Display name for index_to_bounds
            let display_name = content
                .as_ref()
                .cloned()
                .or_else(|| description.clone())
                .unwrap_or_else(|| format!("<{}>", element_type));

            // Format line for compact YAML
            if format_output {
                let mut line_parts = vec![format!("#{} [{}]", idx, element_type.to_uppercase())];
                if let Some(ref c) = content {
                    let truncated = if c.len() > 40 {
                        format!("{}...", &c[..40])
                    } else {
                        c.clone()
                    };
                    line_parts.push(truncated);
                }
                if let Some(ref d) = description {
                    let truncated = if d.len() > 30 {
                        format!("{}...", &d[..30])
                    } else {
                        d.clone()
                    };
                    line_parts.push(format!("desc:{}", truncated));
                }
                if interactivity == Some(true) {
                    line_parts.push("interactive".to_string());
                }
                formatted_lines.push(format!(" {}", line_parts.join(" ")));
            }

            if let Some(ref b) = bounds {
                index_to_bounds.insert(
                    idx.to_string(),
                    crate::types::VisionBoundsEntry {
                        name: display_name.clone(),
                        element_type: element_type.clone(),
                        bounds: b.clone(),
                    },
                );
            }

            elements.push(crate::types::VisionElement {
                element_type,
                content,
                description,
                bounds,
                interactivity,
            });
        }

        // Populate the Vision cache for click_by_index support
        let cache_items: HashMap<u32, computeruse::VisionElement> = elements
            .iter()
            .enumerate()
            .map(|(i, elem)| {
                let box_2d = elem
                    .bounds
                    .as_ref()
                    .map(|b| [b.x, b.y, b.x + b.width, b.y + b.height]);
                (
                    (i + 1) as u32,
                    computeruse::VisionElement {
                        element_type: elem.element_type.clone(),
                        content: elem.content.clone(),
                        description: elem.description.clone(),
                        box_2d,
                        interactivity: elem.interactivity,
                    },
                )
            })
            .collect();
        self.inner.populate_vision_cache(cache_items);

        Ok(crate::types::GeminiVisionResult {
            elements,
            formatted: if format_output {
                Some(formatted_lines.join("\n"))
            } else {
                None
            },
            index_to_bounds,
            element_count: raw_elements.len() as u32,
        })
    }
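A TypeScript sketch of the Gemini Vision call; the backend defaults to https://app.mediar.ai/api/vision/parse and can be overridden through GEMINI_VISION_BACKEND_URL (method and field names assumed camelCase):

import { Desktop } from '@elizaos/computeruse';

const desktop = new Desktop();
const vision = await desktop.performGeminiVisionForProcess('notepad');
console.log(vision.formatted);
console.log(vision.elementCount, 'detected elements');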

    /// (async) Perform Omniparser V2 detection on a window by process name.
    ///
    /// Captures a screenshot and sends it to the Omniparser backend for icon/field detection.
    /// Requires OMNIPARSER_BACKEND_URL environment variable (defaults to https://app.mediar.ai/api/omniparser/parse).
    ///
    /// @param {string} process - Process name to match (e.g., 'chrome', 'notepad').
    /// @param {number} [imgsz=1920] - Icon detection image size (640-1920). Higher = better but slower.
    /// @param {boolean} [formatOutput=true] - Whether to include formatted compact YAML output.
    /// @returns {Promise<OmniparserResult>} Detected items with bounds for click targeting.
    #[napi]
    pub async fn perform_omniparser_for_process(
        &self,
        process: String,
        imgsz: Option<u32>,
        format_output: Option<bool>,
    ) -> napi::Result<crate::types::OmniparserResult> {
        use base64::{engine::general_purpose, Engine};
        use image::imageops::FilterType;
        use image::{codecs::png::PngEncoder, ExtendedColorType, ImageBuffer, ImageEncoder, Rgba};
        use std::collections::HashMap;
        use std::io::Cursor;

        let imgsz = imgsz.unwrap_or(1920).clamp(640, 1920);
        let format_output = format_output.unwrap_or(true);

        // Find PID for the process name
        let pid = find_pid_for_process(&self.inner, &process)?;

        // Find the window element for this process
        let apps = self.inner.applications().map_err(map_error)?;
        let window_element = apps
            .into_iter()
            .find(|app| app.process_id().ok() == Some(pid))
            .ok_or_else(|| {
                napi::Error::from_reason(format!("No window found for process '{}'", process))
            })?;

        // Get window bounds
        let bounds = window_element.bounds().map_err(map_error)?;
        let (window_x, window_y, win_w, win_h) = bounds;

        // Capture screenshot
        let screenshot = window_element.capture().map_err(map_error)?;
        let original_width = screenshot.width;
        let original_height = screenshot.height;

        // Calculate DPI scale
        let dpi_scale_w = original_width as f64 / win_w;
        let dpi_scale_h = original_height as f64 / win_h;

        // Convert BGRA to RGBA
        let rgba_data: Vec<u8> = screenshot
            .image_data
            .chunks_exact(4)
            .flat_map(|bgra| [bgra[2], bgra[1], bgra[0], bgra[3]])
            .collect();

        // Resize if needed (max 1920px)
        const MAX_DIM: u32 = 1920;
        let (final_width, final_height, final_rgba_data, scale_factor) = if original_width > MAX_DIM
            || original_height > MAX_DIM
        {
            let scale = (MAX_DIM as f32 / original_width.max(original_height) as f32).min(1.0);
            let new_width = (original_width as f32 * scale).round() as u32;
            let new_height = (original_height as f32 * scale).round() as u32;

            let img =
                ImageBuffer::<Rgba<u8>, _>::from_raw(original_width, original_height, rgba_data)
                    .ok_or_else(|| napi::Error::from_reason("Failed to create image buffer"))?;

            let resized =
                image::imageops::resize(&img, new_width, new_height, FilterType::Lanczos3);
            (new_width, new_height, resized.into_raw(), scale as f64)
        } else {
            (original_width, original_height, rgba_data, 1.0)
        };

        // Encode to PNG
        let mut png_data = Vec::new();
        let encoder = PngEncoder::new(Cursor::new(&mut png_data));
        encoder
            .write_image(
                &final_rgba_data,
                final_width,
                final_height,
                ExtendedColorType::Rgba8,
            )
            .map_err(|e| napi::Error::from_reason(format!("Failed to encode PNG: {e}")))?;

        let base64_image = general_purpose::STANDARD.encode(&png_data);

        // Call Omniparser backend
        let backend_url = std::env::var("OMNIPARSER_BACKEND_URL")
            .unwrap_or_else(|_| "https://app.mediar.ai/api/omniparser/parse".to_string());

        let client = reqwest::Client::builder()
            .timeout(std::time::Duration::from_secs(300))
            .build()
            .map_err(|e| napi::Error::from_reason(format!("Failed to create HTTP client: {e}")))?;

        let payload = serde_json::json!({
            "image": base64_image,
            "imgsz": imgsz
        });

        let resp = client
            .post(&backend_url)
            .header("Content-Type", "application/json")
            .json(&payload)
            .send()
            .await
            .map_err(|e| {
                napi::Error::from_reason(format!("Omniparser backend request failed: {e}"))
            })?;

        if !resp.status().is_success() {
            let text = resp.text().await.unwrap_or_default();
            return Err(napi::Error::from_reason(format!(
                "Omniparser backend error: {}",
                text
            )));
        }

        let response_text = resp
            .text()
            .await
            .map_err(|e| napi::Error::from_reason(format!("Failed to read response: {e}")))?;

        let parsed: serde_json::Value = serde_json::from_str(&response_text)
            .map_err(|e| napi::Error::from_reason(format!("Failed to parse response: {e}")))?;

        if let Some(error) = parsed.get("error").and_then(|v| v.as_str()) {
            return Err(napi::Error::from_reason(format!(
                "Omniparser error: {}",
                error
            )));
        }

        let raw_elements = parsed
            .get("elements")
            .and_then(|v| v.as_array())
            .cloned()
            .unwrap_or_default();

        // Convert to OmniparserItem with absolute screen coordinates
        let mut items = Vec::new();
        let mut index_to_bounds: HashMap<String, crate::types::OmniparserBoundsEntry> =
            HashMap::new();
        let mut formatted_lines: Vec<String> = Vec::new();

        if format_output {
            formatted_lines.push(format!(
                "Omniparser: {} items (PID: {})",
                raw_elements.len(),
                pid
            ));
        }

        let inv_scale = 1.0 / scale_factor;

        for (i, elem) in raw_elements.iter().enumerate() {
            let idx = i + 1;
            let label = elem
                .get("type")
                .and_then(|v| v.as_str())
                .unwrap_or("unknown")
                .to_string();
            let content = elem
                .get("content")
                .and_then(|v| v.as_str())
                .filter(|s| !s.is_empty())
|
|
1448
|
+
.map(String::from);
|
|
1449
|
+
|
|
1450
|
+
// Get normalized bbox [x1, y1, x2, y2] from 0-1
|
|
1451
|
+
let bbox = elem.get("bbox").and_then(|v| v.as_array());
|
|
1452
|
+
let bounds = bbox.and_then(|arr| {
|
|
1453
|
+
if arr.len() >= 4 {
|
|
1454
|
+
let x1 = arr[0].as_f64()? * final_width as f64;
|
|
1455
|
+
let y1 = arr[1].as_f64()? * final_height as f64;
|
|
1456
|
+
let x2 = arr[2].as_f64()? * final_width as f64;
|
|
1457
|
+
let y2 = arr[3].as_f64()? * final_height as f64;
|
|
1458
|
+
|
|
1459
|
+
// Scale back to original size and convert to logical screen coords
|
|
1460
|
+
let abs_x = window_x + (x1 * inv_scale / dpi_scale_w);
|
|
1461
|
+
let abs_y = window_y + (y1 * inv_scale / dpi_scale_h);
|
|
1462
|
+
let abs_w = (x2 - x1) * inv_scale / dpi_scale_w;
|
|
1463
|
+
let abs_h = (y2 - y1) * inv_scale / dpi_scale_h;
|
|
1464
|
+
|
|
1465
|
+
Some(crate::types::Bounds {
|
|
1466
|
+
x: abs_x,
|
|
1467
|
+
y: abs_y,
|
|
1468
|
+
width: abs_w,
|
|
1469
|
+
height: abs_h,
|
|
1470
|
+
})
|
|
1471
|
+
} else {
|
|
1472
|
+
None
|
|
1473
|
+
}
|
|
1474
|
+
});
|
|
1475
|
+
|
|
1476
|
+
// Display name for index_to_bounds
|
|
1477
|
+
let display_name = content
|
|
1478
|
+
.as_ref()
|
|
1479
|
+
.cloned()
|
|
1480
|
+
.unwrap_or_else(|| format!("<{}>", label));
|
|
1481
|
+
|
|
1482
|
+
// Format line for compact YAML
|
|
1483
|
+
if format_output {
|
|
1484
|
+
let mut line_parts = vec![format!("#{} [{}]", idx, label.to_uppercase())];
|
|
1485
|
+
if let Some(ref c) = content {
|
|
1486
|
+
let truncated = if c.chars().count() > 50 {
|
|
1487
|
+
format!("{}...", &c[..50])
|
|
1488
|
+
} else {
|
|
1489
|
+
c.clone()
|
|
1490
|
+
};
|
|
1491
|
+
line_parts.push(truncated);
|
|
1492
|
+
}
|
|
1493
|
+
formatted_lines.push(format!(" {}", line_parts.join(" ")));
|
|
1494
|
+
}
|
|
1495
|
+
|
|
1496
|
+
if let Some(ref b) = bounds {
|
|
1497
|
+
index_to_bounds.insert(
|
|
1498
|
+
idx.to_string(),
|
|
1499
|
+
crate::types::OmniparserBoundsEntry {
|
|
1500
|
+
name: display_name.clone(),
|
|
1501
|
+
label: label.clone(),
|
|
1502
|
+
bounds: b.clone(),
|
|
1503
|
+
},
|
|
1504
|
+
);
|
|
1505
|
+
}
|
|
1506
|
+
|
|
1507
|
+
items.push(crate::types::OmniparserItem {
|
|
1508
|
+
label,
|
|
1509
|
+
content,
|
|
1510
|
+
bounds,
|
|
1511
|
+
});
|
|
1512
|
+
}
|
|
1513
|
+
|
|
1514
|
+
// Populate the Omniparser cache for click_by_index support
|
|
1515
|
+
let cache_items: HashMap<u32, computeruse::OmniparserItem> = items
|
|
1516
|
+
.iter()
|
|
1517
|
+
.enumerate()
|
|
1518
|
+
.map(|(i, item)| {
|
|
1519
|
+
let box_2d = item
|
|
1520
|
+
.bounds
|
|
1521
|
+
.as_ref()
|
|
1522
|
+
.map(|b| [b.x, b.y, b.x + b.width, b.y + b.height]);
|
|
1523
|
+
(
|
|
1524
|
+
(i + 1) as u32,
|
|
1525
|
+
computeruse::OmniparserItem {
|
|
1526
|
+
label: item.label.clone(),
|
|
1527
|
+
content: item.content.clone(),
|
|
1528
|
+
box_2d,
|
|
1529
|
+
},
|
|
1530
|
+
)
|
|
1531
|
+
})
|
|
1532
|
+
.collect();
|
|
1533
|
+
self.inner.populate_omniparser_cache(cache_items);
|
|
1534
|
+
|
|
1535
|
+
Ok(crate::types::OmniparserResult {
|
|
1536
|
+
items,
|
|
1537
|
+
formatted: if format_output {
|
|
1538
|
+
Some(formatted_lines.join("\n"))
|
|
1539
|
+
} else {
|
|
1540
|
+
None
|
|
1541
|
+
},
|
|
1542
|
+
index_to_bounds,
|
|
1543
|
+
item_count: raw_elements.len() as u32,
|
|
1544
|
+
})
|
|
1545
|
+
}
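For reference, a minimal TypeScript usage sketch of this method; it assumes the napi bindings expose it as `performOmniparserForProcess` on a `Desktop` class exported by the package, with camelCase result fields (both assumptions based on napi-rs conventions rather than the generated typings).

import { Desktop } from '@elizaos/computeruse'; // export name assumed

async function listChromeTargets() {
  const desktop = new Desktop();
  // imgsz 1280 trades some detection accuracy for speed; keep the YAML summary.
  const result = await desktop.performOmniparserForProcess('chrome', 1280, true);

  console.log(result.formatted);                      // compact YAML listing
  console.log(`detected ${result.itemCount} items`);

  // indexToBounds maps 1-based indices to { name, label, bounds } entries in
  // absolute screen coordinates, so a click target can be derived from them.
  const entry = result.indexToBounds['1'];
  if (entry) {
    const cx = entry.bounds.x + entry.bounds.width / 2;
    const cy = entry.bounds.y + entry.bounds.height / 2;
    console.log(`item #1 (${entry.name}) centers at (${cx}, ${cy})`);
  }
}

listChromeTargets().catch(console.error);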
|
|
1546
|
+
|
|
1547
|
+
/// (async) Get the currently focused browser window.
|
|
1548
|
+
///
|
|
1549
|
+
/// @returns {Promise<Element>} The current browser window element.
|
|
1550
|
+
#[napi]
|
|
1551
|
+
pub async fn get_current_browser_window(&self) -> napi::Result<Element> {
|
|
1552
|
+
self.inner
|
|
1553
|
+
.get_current_browser_window()
|
|
1554
|
+
.await
|
|
1555
|
+
.map(Element::from)
|
|
1556
|
+
.map_err(map_error)
|
|
1557
|
+
}
|
|
1558
|
+
|
|
1559
|
+
/// Create a locator for finding UI elements.
|
|
1560
|
+
///
|
|
1561
|
+
/// @param {string | Selector} selector - The selector.
|
|
1562
|
+
/// @returns {Locator} A locator for finding elements.
|
|
1563
|
+
#[napi]
|
|
1564
|
+
pub fn locator(
|
|
1565
|
+
&self,
|
|
1566
|
+
#[napi(ts_arg_type = "string | Selector")] selector: Either<String, &Selector>,
|
|
1567
|
+
) -> napi::Result<Locator> {
|
|
1568
|
+
use napi::bindgen_prelude::Either::*;
|
|
1569
|
+
let sel_rust: computeruse::selector::Selector = match selector {
|
|
1570
|
+
A(sel_str) => sel_str.as_str().into(),
|
|
1571
|
+
B(sel_obj) => sel_obj.inner.clone(),
|
|
1572
|
+
};
|
|
1573
|
+
let loc = self.inner.locator(sel_rust);
|
|
1574
|
+
Ok(Locator::from(loc))
|
|
1575
|
+
}
|
|
1576
|
+
|
|
1577
|
+
/// Create a process-scoped locator for finding UI elements.
|
|
1578
|
+
/// This is the recommended way to create locators: always scope them to a specific process.
|
|
1579
|
+
///
|
|
1580
|
+
/// @param {string} process - Process name to scope the search (e.g., 'chrome', 'notepad').
|
|
1581
|
+
/// @param {string | Selector} selector - The selector to find within the process.
|
|
1582
|
+
/// @param {string} [windowSelector] - Optional window selector for additional filtering.
|
|
1583
|
+
/// @returns {Locator} A locator for finding elements within the process.
|
|
1584
|
+
#[napi]
|
|
1585
|
+
pub fn locator_for_process(
|
|
1586
|
+
&self,
|
|
1587
|
+
process: String,
|
|
1588
|
+
#[napi(ts_arg_type = "string | Selector")] selector: Either<String, &Selector>,
|
|
1589
|
+
window_selector: Option<String>,
|
|
1590
|
+
) -> napi::Result<Locator> {
|
|
1591
|
+
use napi::bindgen_prelude::Either::*;
|
|
1592
|
+
|
|
1593
|
+
// Build the full selector string like MCP does
|
|
1594
|
+
let selector_str = match &selector {
|
|
1595
|
+
A(sel_str) => sel_str.clone(),
|
|
1596
|
+
B(sel_obj) => format!("{:?}", sel_obj.inner),
|
|
1597
|
+
};
|
|
1598
|
+
|
|
1599
|
+
let full_selector = if selector_str.is_empty() {
|
|
1600
|
+
if let Some(window_sel) = window_selector {
|
|
1601
|
+
format!("process:{} >> {}", process, window_sel)
|
|
1602
|
+
} else {
|
|
1603
|
+
format!("process:{}", process)
|
|
1604
|
+
}
|
|
1605
|
+
} else if let Some(window_sel) = window_selector {
|
|
1606
|
+
format!("process:{} >> {} >> {}", process, window_sel, selector_str)
|
|
1607
|
+
} else {
|
|
1608
|
+
format!("process:{} >> {}", process, selector_str)
|
|
1609
|
+
};
|
|
1610
|
+
|
|
1611
|
+
let sel_rust: computeruse::selector::Selector = full_selector.as_str().into();
|
|
1612
|
+
let loc = self.inner.locator(sel_rust);
|
|
1613
|
+
Ok(Locator::from(loc))
|
|
1614
|
+
}
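A short TypeScript sketch of the scoped locator, under the same camelCase-binding assumption as above; the selector chain in the comment mirrors what this method builds, while `Locator.first()` and the `role:Button` selector string are assumptions for illustration.

import { Desktop } from '@elizaos/computeruse'; // export name assumed

async function findNotepadButton() {
  const desktop = new Desktop();

  // Internally this resolves to the chain "process:notepad >> role:Button".
  // The same chain can be written explicitly with the generic locator():
  //   desktop.locator('process:notepad >> role:Button')
  const scoped = desktop.locatorForProcess('notepad', 'role:Button');

  // Resolving to an element; first() is assumed to exist on Locator.
  const button = await scoped.first();
  console.log('found a button in notepad:', button != null);
}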
|
|
1615
|
+
|
|
1616
|
+
/// (async) Get the currently focused window.
|
|
1617
|
+
///
|
|
1618
|
+
/// @returns {Promise<Element>} The current window element.
|
|
1619
|
+
#[napi]
|
|
1620
|
+
pub async fn get_current_window(&self) -> napi::Result<Element> {
|
|
1621
|
+
self.inner
|
|
1622
|
+
.get_current_window()
|
|
1623
|
+
.await
|
|
1624
|
+
.map(Element::from)
|
|
1625
|
+
.map_err(map_error)
|
|
1626
|
+
}
|
|
1627
|
+
|
|
1628
|
+
/// (async) Get the currently focused application.
|
|
1629
|
+
///
|
|
1630
|
+
/// @returns {Promise<Element>} The current application element.
|
|
1631
|
+
#[napi]
|
|
1632
|
+
pub async fn get_current_application(&self) -> napi::Result<Element> {
|
|
1633
|
+
self.inner
|
|
1634
|
+
.get_current_application()
|
|
1635
|
+
.await
|
|
1636
|
+
.map(Element::from)
|
|
1637
|
+
.map_err(map_error)
|
|
1638
|
+
}
|
|
1639
|
+
|
|
1640
|
+
/// Get the currently focused element.
|
|
1641
|
+
///
|
|
1642
|
+
/// @returns {Element} The focused element.
|
|
1643
|
+
#[napi]
|
|
1644
|
+
pub fn focused_element(&self) -> napi::Result<Element> {
|
|
1645
|
+
self.inner
|
|
1646
|
+
.focused_element()
|
|
1647
|
+
.map(Element::from)
|
|
1648
|
+
.map_err(map_error)
|
|
1649
|
+
}
|
|
1650
|
+
|
|
1651
|
+
/// Open a URL in a browser.
|
|
1652
|
+
///
|
|
1653
|
+
/// @param {string} url - The URL to open.
|
|
1654
|
+
/// @param {string} [browser] - The browser to use. Can be "Default", "Chrome", "Firefox", "Edge", "Brave", "Opera", "Vivaldi", or a custom browser path.
|
|
1655
|
+
/// @param {boolean} [includeWindowScreenshot=true] - Whether to capture window screenshot after opening
|
|
1656
|
+
/// @param {boolean} [includeMonitorScreenshots=false] - Whether to capture monitor screenshots after opening
|
|
1657
|
+
#[napi]
|
|
1658
|
+
pub fn open_url(
|
|
1659
|
+
&self,
|
|
1660
|
+
url: String,
|
|
1661
|
+
browser: Option<String>,
|
|
1662
|
+
include_window_screenshot: Option<bool>,
|
|
1663
|
+
include_monitor_screenshots: Option<bool>,
|
|
1664
|
+
) -> napi::Result<Element> {
|
|
1665
|
+
let browser_enum = browser.map(|b| match b.to_lowercase().as_str() {
|
|
1666
|
+
"default" => computeruse::Browser::Default,
|
|
1667
|
+
"chrome" => computeruse::Browser::Chrome,
|
|
1668
|
+
"firefox" => computeruse::Browser::Firefox,
|
|
1669
|
+
"edge" => computeruse::Browser::Edge,
|
|
1670
|
+
"brave" => computeruse::Browser::Brave,
|
|
1671
|
+
"opera" => computeruse::Browser::Opera,
|
|
1672
|
+
"vivaldi" => computeruse::Browser::Vivaldi,
|
|
1673
|
+
custom => computeruse::Browser::Custom(custom.to_string()),
|
|
1674
|
+
});
|
|
1675
|
+
let element = self.inner.open_url(&url, browser_enum).map_err(map_error)?;
|
|
1676
|
+
|
|
1677
|
+
// Capture screenshots if enabled (window default: true, monitor default: false)
|
|
1678
|
+
let _screenshots = capture_screenshots(
|
|
1679
|
+
&self.inner,
|
|
1680
|
+
element.process_id().ok(),
|
|
1681
|
+
include_window_screenshot.unwrap_or(true),
|
|
1682
|
+
include_monitor_screenshots.unwrap_or(false),
|
|
1683
|
+
"openUrl",
|
|
1684
|
+
);
|
|
1685
|
+
|
|
1686
|
+
Ok(Element::from(element))
|
|
1687
|
+
}
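A hedged TypeScript sketch of opening a URL; `openUrl` is synchronous on the Rust side, and `processId()` on the returned element is an assumed binding name.

import { Desktop } from '@elizaos/computeruse'; // export name assumed

function openDocsInEdge() {
  const desktop = new Desktop();
  // Browser name matching is case-insensitive; keep the default window
  // screenshot and skip the per-monitor screenshots.
  const browserWindow = desktop.openUrl('https://example.com', 'edge', true, false);
  console.log('opened in PID', browserWindow.processId()); // method name assumed
}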
|
|
1688
|
+
|
|
1689
|
+
/// Open a file with its default application.
|
|
1690
|
+
///
|
|
1691
|
+
/// @param {string} filePath - Path to the file to open.
|
|
1692
|
+
/// @param {string} [process] - Process name for window screenshot capture. If provided, enables window screenshots.
|
|
1693
|
+
/// @param {boolean} [includeWindowScreenshot=true] - Whether to capture window screenshot (requires process).
|
|
1694
|
+
/// @param {boolean} [includeMonitorScreenshots=false] - Whether to capture monitor screenshots after opening.
|
|
1695
|
+
#[napi]
|
|
1696
|
+
pub fn open_file(
|
|
1697
|
+
&self,
|
|
1698
|
+
file_path: String,
|
|
1699
|
+
process: Option<String>,
|
|
1700
|
+
include_window_screenshot: Option<bool>,
|
|
1701
|
+
include_monitor_screenshots: Option<bool>,
|
|
1702
|
+
) -> napi::Result<()> {
|
|
1703
|
+
let result = self.inner.open_file(&file_path).map_err(map_error);
|
|
1704
|
+
|
|
1705
|
+
// Get PID from process name if provided
|
|
1706
|
+
let pid = process
|
|
1707
|
+
.as_ref()
|
|
1708
|
+
.and_then(|p| find_pid_for_process(&self.inner, p).ok());
|
|
1709
|
+
|
|
1710
|
+
// Capture screenshots if requested
|
|
1711
|
+
let _screenshots = capture_screenshots(
|
|
1712
|
+
&self.inner,
|
|
1713
|
+
pid,
|
|
1714
|
+
include_window_screenshot.unwrap_or(true) && pid.is_some(),
|
|
1715
|
+
include_monitor_screenshots.unwrap_or(false),
|
|
1716
|
+
"openFile",
|
|
1717
|
+
);
|
|
1718
|
+
|
|
1719
|
+
result
|
|
1720
|
+
}
|
|
1721
|
+
|
|
1722
|
+
/// Activate a browser window by title.
|
|
1723
|
+
///
|
|
1724
|
+
/// @param {string} title - The window title to match.
|
|
1725
|
+
#[napi]
|
|
1726
|
+
pub fn activate_browser_window_by_title(&self, title: String) -> napi::Result<()> {
|
|
1727
|
+
self.inner
|
|
1728
|
+
.activate_browser_window_by_title(&title)
|
|
1729
|
+
.map_err(map_error)
|
|
1730
|
+
}
|
|
1731
|
+
|
|
1732
|
+
/// Get the UI tree for a window identified by process name and optional title.
|
|
1733
|
+
///
|
|
1734
|
+
/// @param {string} process - Process name to match (e.g., 'chrome', 'notepad').
|
|
1735
|
+
/// @param {string} [title] - Optional window title filter.
|
|
1736
|
+
/// @param {TreeBuildConfig} [config] - Optional configuration for tree building.
|
|
1737
|
+
/// @returns {UINode} Complete UI tree starting from the identified window.
|
|
1738
|
+
#[napi]
|
|
1739
|
+
pub fn get_window_tree(
|
|
1740
|
+
&self,
|
|
1741
|
+
process: String,
|
|
1742
|
+
title: Option<String>,
|
|
1743
|
+
config: Option<TreeBuildConfig>,
|
|
1744
|
+
) -> napi::Result<UINode> {
|
|
1745
|
+
// Find PID for the process name
|
|
1746
|
+
let pid = find_pid_for_process(&self.inner, &process)?;
|
|
1747
|
+
|
|
1748
|
+
let rust_config = config.map(|c| c.into());
|
|
1749
|
+
self.inner
|
|
1750
|
+
.get_window_tree(pid, title.as_deref(), rust_config)
|
|
1751
|
+
.map(UINode::from)
|
|
1752
|
+
.map_err(map_error)
|
|
1753
|
+
}
|
|
1754
|
+
|
|
1755
|
+
/// Get the UI tree with full result including formatting and bounds mapping.
|
|
1756
|
+
///
|
|
1757
|
+
/// This is the recommended method for getting window trees when you need:
|
|
1758
|
+
/// - Formatted YAML output for LLM consumption
|
|
1759
|
+
/// - Index-to-bounds mapping for click targeting
|
|
1760
|
+
/// - Browser detection
|
|
1761
|
+
///
|
|
1762
|
+
/// @param {string} process - Process name to match (e.g., 'chrome', 'notepad').
|
|
1763
|
+
/// @param {string} [title] - Optional window title filter.
|
|
1764
|
+
/// @param {TreeBuildConfig} [config] - Configuration options:
|
|
1765
|
+
/// - formatOutput: Enable formatted output (default: true if treeOutputFormat set)
|
|
1766
|
+
/// - treeOutputFormat: 'CompactYaml' (default) or 'VerboseJson'
|
|
1767
|
+
/// - treeFromSelector: Selector to start tree from (use getWindowTreeResultAsync for this)
|
|
1768
|
+
/// - includeWindowScreenshot: Save window screenshot to executions dir (default: false)
|
|
1769
|
+
/// - includeMonitorScreenshots: Save all monitor screenshots to executions dir (default: false)
|
|
1770
|
+
/// @returns {WindowTreeResult} Complete result with tree, formatted output, bounds mapping, and screenshot paths.
|
|
1771
|
+
#[napi]
|
|
1772
|
+
pub fn get_window_tree_result(
|
|
1773
|
+
&self,
|
|
1774
|
+
process: String,
|
|
1775
|
+
title: Option<String>,
|
|
1776
|
+
config: Option<TreeBuildConfig>,
|
|
1777
|
+
) -> napi::Result<WindowTreeResult> {
|
|
1778
|
+
// Find PID for the process name
|
|
1779
|
+
let pid = find_pid_for_process(&self.inner, &process)?;
|
|
1780
|
+
|
|
1781
|
+
// Extract screenshot options (window: true, monitor: false by default)
|
|
1782
|
+
let include_window_screenshot = config
|
|
1783
|
+
.as_ref()
|
|
1784
|
+
.and_then(|c| c.include_window_screenshot)
|
|
1785
|
+
.unwrap_or(true);
|
|
1786
|
+
let include_monitor_screenshots = config
|
|
1787
|
+
.as_ref()
|
|
1788
|
+
.and_then(|c| c.include_monitor_screenshots)
|
|
1789
|
+
.unwrap_or(false);
|
|
1790
|
+
|
|
1791
|
+
// Extract options before converting config
|
|
1792
|
+
let output_format = config
|
|
1793
|
+
.as_ref()
|
|
1794
|
+
.and_then(|c| c.tree_output_format)
|
|
1795
|
+
.unwrap_or(TreeOutputFormat::CompactYaml);
|
|
1796
|
+
|
|
1797
|
+
// If format is VerboseJson, we don't need formatted output from core
|
|
1798
|
+
// ClusteredYaml is treated like CompactYaml (needs format_output = true)
|
|
1799
|
+
let rust_config = config.map(|mut c| {
|
|
1800
|
+
if matches!(output_format, TreeOutputFormat::VerboseJson) {
|
|
1801
|
+
c.format_output = Some(false);
|
|
1802
|
+
} else if c.format_output.is_none() {
|
|
1803
|
+
c.format_output = Some(true);
|
|
1804
|
+
}
|
|
1805
|
+
c.into()
|
|
1806
|
+
});
|
|
1807
|
+
|
|
1808
|
+
let result = self
|
|
1809
|
+
.inner
|
|
1810
|
+
.get_window_tree_result(pid, title.as_deref(), rust_config)
|
|
1811
|
+
.map_err(map_error)?;
|
|
1812
|
+
|
|
1813
|
+
// Convert and handle format
|
|
1814
|
+
let mut sdk_result = WindowTreeResult::from(result);
|
|
1815
|
+
|
|
1816
|
+
// For VerboseJson, serialize the tree as the formatted output
|
|
1817
|
+
if matches!(output_format, TreeOutputFormat::VerboseJson) {
|
|
1818
|
+
sdk_result.formatted =
|
|
1819
|
+
Some(serde_json::to_string_pretty(&sdk_result.tree).unwrap_or_default());
|
|
1820
|
+
}
|
|
1821
|
+
|
|
1822
|
+
// Handle screenshot capture and saving using helper
|
|
1823
|
+
let screenshots = capture_screenshots(
|
|
1824
|
+
&self.inner,
|
|
1825
|
+
Some(pid),
|
|
1826
|
+
include_window_screenshot,
|
|
1827
|
+
include_monitor_screenshots,
|
|
1828
|
+
"getWindowTreeResult",
|
|
1829
|
+
);
|
|
1830
|
+
sdk_result.window_screenshot_path = screenshots.window_path;
|
|
1831
|
+
sdk_result.monitor_screenshot_paths = screenshots.monitor_paths;
|
|
1832
|
+
|
|
1833
|
+
Ok(sdk_result)
|
|
1834
|
+
}
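A TypeScript sketch of fetching a formatted tree, assuming camelCase bindings and that the TreeOutputFormat enum accepts the string form shown below.

import { Desktop } from '@elizaos/computeruse'; // export name assumed

function dumpNotepadTree() {
  const desktop = new Desktop();
  const result = desktop.getWindowTreeResult('notepad', undefined, {
    treeOutputFormat: 'CompactYaml', // string form of the enum is an assumption
    maxDepth: 50,
  });

  console.log(result.formatted);             // compact YAML for LLM consumption
  console.log(result.windowScreenshotPath);  // saved by default for this call
}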
|
|
1835
|
+
|
|
1836
|
+
/// (async) Get the UI tree with full result, supporting tree_from_selector.
|
|
1837
|
+
///
|
|
1838
|
+
/// Use this method when you need to scope the tree to a specific subtree using a selector.
|
|
1839
|
+
///
|
|
1840
|
+
/// @param {string} process - Process name to match (e.g., 'chrome', 'notepad').
|
|
1841
|
+
/// @param {string} [title] - Optional window title filter.
|
|
1842
|
+
/// @param {TreeBuildConfig} [config] - Configuration options:
|
|
1843
|
+
/// - formatOutput: Enable formatted output (default: true)
|
|
1844
|
+
/// - treeOutputFormat: 'CompactYaml' (default) or 'VerboseJson'
|
|
1845
|
+
/// - treeFromSelector: Selector to start tree from (e.g., "role:Dialog")
|
|
1846
|
+
/// @returns {Promise<WindowTreeResult>} Complete result with tree, formatted output, and bounds mapping.
|
|
1847
|
+
#[napi]
|
|
1848
|
+
pub async fn get_window_tree_result_async(
|
|
1849
|
+
&self,
|
|
1850
|
+
process: String,
|
|
1851
|
+
title: Option<String>,
|
|
1852
|
+
config: Option<TreeBuildConfig>,
|
|
1853
|
+
) -> napi::Result<WindowTreeResult> {
|
|
1854
|
+
use std::collections::HashMap;
|
|
1855
|
+
|
|
1856
|
+
// Find PID for the process name
|
|
1857
|
+
let pid = find_pid_for_process(&self.inner, &process)?;
|
|
1858
|
+
|
|
1859
|
+
// Extract vision and format options from config
|
|
1860
|
+
let include_gemini_vision = config
|
|
1861
|
+
.as_ref()
|
|
1862
|
+
.and_then(|c| c.include_gemini_vision)
|
|
1863
|
+
.unwrap_or(false);
|
|
1864
|
+
let include_omniparser = config
|
|
1865
|
+
.as_ref()
|
|
1866
|
+
.and_then(|c| c.include_omniparser)
|
|
1867
|
+
.unwrap_or(false);
|
|
1868
|
+
let include_ocr = config.as_ref().and_then(|c| c.include_ocr).unwrap_or(false);
|
|
1869
|
+
let include_browser_dom = config
|
|
1870
|
+
.as_ref()
|
|
1871
|
+
.and_then(|c| c.include_browser_dom)
|
|
1872
|
+
.unwrap_or(false);
|
|
1873
|
+
let output_format = config
|
|
1874
|
+
.as_ref()
|
|
1875
|
+
.and_then(|c| c.tree_output_format)
|
|
1876
|
+
.unwrap_or(TreeOutputFormat::CompactYaml);
|
|
1877
|
+
|
|
1878
|
+
let has_vision_options =
|
|
1879
|
+
include_gemini_vision || include_omniparser || include_ocr || include_browser_dom;
|
|
1880
|
+
|
|
1881
|
+
// Build rust config with from_selector passed through
|
|
1882
|
+
let rust_config = config.as_ref().map(|c| {
|
|
1883
|
+
let mut c_clone = TreeBuildConfig {
|
|
1884
|
+
property_mode: c.property_mode,
|
|
1885
|
+
timeout_per_operation_ms: c.timeout_per_operation_ms,
|
|
1886
|
+
yield_every_n_elements: c.yield_every_n_elements,
|
|
1887
|
+
batch_size: c.batch_size,
|
|
1888
|
+
max_depth: c.max_depth,
|
|
1889
|
+
ui_settle_delay_ms: c.ui_settle_delay_ms,
|
|
1890
|
+
format_output: c.format_output,
|
|
1891
|
+
tree_output_format: c.tree_output_format,
|
|
1892
|
+
tree_from_selector: c.tree_from_selector.clone(),
|
|
1893
|
+
include_window_screenshot: c.include_window_screenshot,
|
|
1894
|
+
include_monitor_screenshots: c.include_monitor_screenshots,
|
|
1895
|
+
include_gemini_vision: None,
|
|
1896
|
+
include_omniparser: None,
|
|
1897
|
+
include_ocr: None,
|
|
1898
|
+
include_browser_dom: None,
|
|
1899
|
+
};
|
|
1900
|
+
if matches!(output_format, TreeOutputFormat::VerboseJson) {
|
|
1901
|
+
c_clone.format_output = Some(false);
|
|
1902
|
+
} else if c_clone.format_output.is_none() {
|
|
1903
|
+
c_clone.format_output = Some(true);
|
|
1904
|
+
}
|
|
1905
|
+
c_clone.into()
|
|
1906
|
+
});
|
|
1907
|
+
|
|
1908
|
+
// Get UIA tree (always)
|
|
1909
|
+
let result = self
|
|
1910
|
+
.inner
|
|
1911
|
+
.get_window_tree_result_async(pid, title.as_deref(), rust_config)
|
|
1912
|
+
.await
|
|
1913
|
+
.map_err(map_error)?;
|
|
1914
|
+
|
|
1915
|
+
let mut sdk_result = WindowTreeResult::from(result);
|
|
1916
|
+
|
|
1917
|
+
// If no vision options and not clustered format, return simple result
|
|
1918
|
+
if !has_vision_options && !matches!(output_format, TreeOutputFormat::ClusteredYaml) {
|
|
1919
|
+
if matches!(output_format, TreeOutputFormat::VerboseJson) {
|
|
1920
|
+
sdk_result.formatted =
|
|
1921
|
+
Some(serde_json::to_string_pretty(&sdk_result.tree).unwrap_or_default());
|
|
1922
|
+
}
|
|
1923
|
+
return Ok(sdk_result);
|
|
1924
|
+
}
|
|
1925
|
+
|
|
1926
|
+
// Build UIA bounds cache from formatted result
|
|
1927
|
+
#[allow(clippy::type_complexity)]
|
|
1928
|
+
let mut uia_bounds: HashMap<
|
|
1929
|
+
u32,
|
|
1930
|
+
(String, String, (f64, f64, f64, f64), Option<String>),
|
|
1931
|
+
> = HashMap::new();
|
|
1932
|
+
let uia_tree_result = self
|
|
1933
|
+
.inner
|
|
1934
|
+
.get_window_tree_result(pid, None, None)
|
|
1935
|
+
.map_err(map_error)?;
|
|
1936
|
+
let formatted_result = computeruse::format_ui_node_as_compact_yaml(&uia_tree_result.tree, 0);
|
|
1937
|
+
for (idx, (role, name, bounds, selector)) in formatted_result.index_to_bounds {
|
|
1938
|
+
uia_bounds.insert(idx, (role, name, bounds, selector));
|
|
1939
|
+
}
|
|
1940
|
+
|
|
1941
|
+
// Build DOM bounds cache if requested
|
|
1942
|
+
#[allow(clippy::type_complexity)]
|
|
1943
|
+
let mut dom_bounds: HashMap<u32, (String, String, (f64, f64, f64, f64))> = HashMap::new();
|
|
1944
|
+
if include_browser_dom && computeruse::is_browser_process(pid) {
|
|
1945
|
+
if let Ok(dom_result) = self.capture_browser_dom(Some(100), Some(true)).await {
|
|
1946
|
+
for (idx_str, entry) in dom_result.index_to_bounds {
|
|
1947
|
+
if let Ok(idx) = idx_str.parse::<u32>() {
|
|
1948
|
+
let bounds = (
|
|
1949
|
+
entry.bounds.x,
|
|
1950
|
+
entry.bounds.y,
|
|
1951
|
+
entry.bounds.width,
|
|
1952
|
+
entry.bounds.height,
|
|
1953
|
+
);
|
|
1954
|
+
dom_bounds.insert(idx, (entry.tag, entry.name, bounds));
|
|
1955
|
+
}
|
|
1956
|
+
}
|
|
1957
|
+
}
|
|
1958
|
+
}
|
|
1959
|
+
|
|
1960
|
+
// Build Omniparser cache if requested
|
|
1961
|
+
let mut omniparser_items: HashMap<u32, computeruse::OmniparserItem> = HashMap::new();
|
|
1962
|
+
if include_omniparser {
|
|
1963
|
+
if let Ok(omni_result) = self
|
|
1964
|
+
.perform_omniparser_for_process(process.clone(), None, Some(true))
|
|
1965
|
+
.await
|
|
1966
|
+
{
|
|
1967
|
+
for (idx_str, entry) in omni_result.index_to_bounds {
|
|
1968
|
+
if let Ok(idx) = idx_str.parse::<u32>() {
|
|
1969
|
+
omniparser_items.insert(
|
|
1970
|
+
idx,
|
|
1971
|
+
computeruse::OmniparserItem {
|
|
1972
|
+
label: entry.label.clone(),
|
|
1973
|
+
content: Some(entry.name.clone()),
|
|
1974
|
+
box_2d: Some([
|
|
1975
|
+
entry.bounds.x,
|
|
1976
|
+
entry.bounds.y,
|
|
1977
|
+
entry.bounds.x + entry.bounds.width,
|
|
1978
|
+
entry.bounds.y + entry.bounds.height,
|
|
1979
|
+
]),
|
|
1980
|
+
},
|
|
1981
|
+
);
|
|
1982
|
+
}
|
|
1983
|
+
}
|
|
1984
|
+
}
|
|
1985
|
+
}
|
|
1986
|
+
|
|
1987
|
+
// Build Gemini Vision cache if requested
|
|
1988
|
+
let mut vision_items: HashMap<u32, computeruse::VisionElement> = HashMap::new();
|
|
1989
|
+
if include_gemini_vision {
|
|
1990
|
+
if let Ok(vision_result) = self
|
|
1991
|
+
.perform_gemini_vision_for_process(process.clone(), Some(true))
|
|
1992
|
+
.await
|
|
1993
|
+
{
|
|
1994
|
+
for (idx_str, entry) in vision_result.index_to_bounds {
|
|
1995
|
+
if let Ok(idx) = idx_str.parse::<u32>() {
|
|
1996
|
+
vision_items.insert(
|
|
1997
|
+
idx,
|
|
1998
|
+
computeruse::VisionElement {
|
|
1999
|
+
element_type: entry.element_type.clone(),
|
|
2000
|
+
content: Some(entry.name.clone()),
|
|
2001
|
+
description: None,
|
|
2002
|
+
box_2d: Some([
|
|
2003
|
+
entry.bounds.x,
|
|
2004
|
+
entry.bounds.y,
|
|
2005
|
+
entry.bounds.x + entry.bounds.width,
|
|
2006
|
+
entry.bounds.y + entry.bounds.height,
|
|
2007
|
+
]),
|
|
2008
|
+
interactivity: None,
|
|
2009
|
+
},
|
|
2010
|
+
);
|
|
2011
|
+
}
|
|
2012
|
+
}
|
|
2013
|
+
}
|
|
2014
|
+
}
|
|
2015
|
+
|
|
2016
|
+
// Build OCR cache if requested
|
|
2017
|
+
#[allow(clippy::type_complexity)]
|
|
2018
|
+
let mut ocr_bounds: HashMap<u32, (String, (f64, f64, f64, f64))> = HashMap::new();
|
|
2019
|
+
if include_ocr {
|
|
2020
|
+
if let Ok(ocr_result) = self
|
|
2021
|
+
.perform_ocr_for_process(process.clone(), Some(true))
|
|
2022
|
+
.await
|
|
2023
|
+
{
|
|
2024
|
+
for (idx_str, entry) in ocr_result.index_to_bounds {
|
|
2025
|
+
if let Ok(idx) = idx_str.parse::<u32>() {
|
|
2026
|
+
let bounds = (
|
|
2027
|
+
entry.bounds.x,
|
|
2028
|
+
entry.bounds.y,
|
|
2029
|
+
entry.bounds.width,
|
|
2030
|
+
entry.bounds.height,
|
|
2031
|
+
);
|
|
2032
|
+
ocr_bounds.insert(idx, (entry.text.clone(), bounds));
|
|
2033
|
+
}
|
|
2034
|
+
}
|
|
2035
|
+
}
|
|
2036
|
+
}
|
|
2037
|
+
|
|
2038
|
+
// If ClusteredYaml format, use clustering
|
|
2039
|
+
if matches!(output_format, TreeOutputFormat::ClusteredYaml) {
|
|
2040
|
+
let clustered_result = computeruse::format_clustered_tree_from_caches(
|
|
2041
|
+
&uia_bounds,
|
|
2042
|
+
&dom_bounds,
|
|
2043
|
+
&ocr_bounds,
|
|
2044
|
+
&omniparser_items,
|
|
2045
|
+
&vision_items,
|
|
2046
|
+
);
|
|
2047
|
+
sdk_result.formatted = Some(clustered_result.formatted);
|
|
2048
|
+
} else {
|
|
2049
|
+
// CompactYaml with vision - append vision trees to UIA tree
|
|
2050
|
+
let mut combined = sdk_result.formatted.unwrap_or_default();
|
|
2051
|
+
if !dom_bounds.is_empty() {
|
|
2052
|
+
combined.push_str("\n\n# Browser DOM elements:\n");
|
|
2053
|
+
for (idx, (tag, name, _)) in &dom_bounds {
|
|
2054
|
+
combined.push_str(&format!("#d{} [{}] {}\n", idx, tag, name));
|
|
2055
|
+
}
|
|
2056
|
+
}
|
|
2057
|
+
if !omniparser_items.is_empty() {
|
|
2058
|
+
combined.push_str("\n\n# Omniparser elements:\n");
|
|
2059
|
+
for (idx, item) in &omniparser_items {
|
|
2060
|
+
combined.push_str(&format!(
|
|
2061
|
+
"#p{} [{}] {}\n",
|
|
2062
|
+
idx,
|
|
2063
|
+
item.label,
|
|
2064
|
+
item.content.as_deref().unwrap_or("")
|
|
2065
|
+
));
|
|
2066
|
+
}
|
|
2067
|
+
}
|
|
2068
|
+
if !vision_items.is_empty() {
|
|
2069
|
+
combined.push_str("\n\n# Gemini Vision elements:\n");
|
|
2070
|
+
for (idx, item) in &vision_items {
|
|
2071
|
+
combined.push_str(&format!(
|
|
2072
|
+
"#g{} [{}] {}\n",
|
|
2073
|
+
idx,
|
|
2074
|
+
item.element_type,
|
|
2075
|
+
item.content.as_deref().unwrap_or("")
|
|
2076
|
+
));
|
|
2077
|
+
}
|
|
2078
|
+
}
|
|
2079
|
+
if !ocr_bounds.is_empty() {
|
|
2080
|
+
combined.push_str("\n\n# OCR elements:\n");
|
|
2081
|
+
for (idx, (text, _)) in &ocr_bounds {
|
|
2082
|
+
combined.push_str(&format!("#o{} {}\n", idx, text));
|
|
2083
|
+
}
|
|
2084
|
+
}
|
|
2085
|
+
sdk_result.formatted = Some(combined);
|
|
2086
|
+
}
|
|
2087
|
+
|
|
2088
|
+
Ok(sdk_result)
|
|
2089
|
+
}
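A sketch of the async variant with a scoped subtree and extra sources, under the same naming assumptions; the OCR and DOM sections appear as the "#o"/"#d" lines appended by the code above.

import { Desktop } from '@elizaos/computeruse'; // export name assumed

async function scopedDialogTree() {
  const desktop = new Desktop();
  const result = await desktop.getWindowTreeResultAsync('chrome', undefined, {
    treeFromSelector: 'role:Dialog', // start the tree at the first matching dialog
    includeOcr: true,                // appends "#o<N>" OCR lines
    includeBrowserDom: true,         // appends "#d<N>" DOM lines for browser processes
  });
  console.log(result.formatted);
}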
|
|
2090
|
+
|
|
2091
|
+
// ============== NEW MONITOR METHODS ==============
|
|
2092
|
+
|
|
2093
|
+
/// (async) List all available monitors/displays.
|
|
2094
|
+
///
|
|
2095
|
+
/// @returns {Promise<Array<Monitor>>} List of monitor information.
|
|
2096
|
+
#[napi]
|
|
2097
|
+
pub async fn list_monitors(&self) -> napi::Result<Vec<Monitor>> {
|
|
2098
|
+
self.inner
|
|
2099
|
+
.list_monitors()
|
|
2100
|
+
.await
|
|
2101
|
+
.map(|monitors| monitors.into_iter().map(Monitor::from).collect())
|
|
2102
|
+
.map_err(map_error)
|
|
2103
|
+
}
|
|
2104
|
+
|
|
2105
|
+
/// (async) Get the primary monitor.
|
|
2106
|
+
///
|
|
2107
|
+
/// @returns {Promise<Monitor>} Primary monitor information.
|
|
2108
|
+
#[napi]
|
|
2109
|
+
pub async fn get_primary_monitor(&self) -> napi::Result<Monitor> {
|
|
2110
|
+
self.inner
|
|
2111
|
+
.get_primary_monitor()
|
|
2112
|
+
.await
|
|
2113
|
+
.map(Monitor::from)
|
|
2114
|
+
.map_err(map_error)
|
|
2115
|
+
}
|
|
2116
|
+
|
|
2117
|
+
/// (async) Get the monitor containing the currently focused window.
|
|
2118
|
+
///
|
|
2119
|
+
/// @returns {Promise<Monitor>} Active monitor information.
|
|
2120
|
+
#[napi]
|
|
2121
|
+
pub async fn get_active_monitor(&self) -> napi::Result<Monitor> {
|
|
2122
|
+
self.inner
|
|
2123
|
+
.get_active_monitor()
|
|
2124
|
+
.await
|
|
2125
|
+
.map(Monitor::from)
|
|
2126
|
+
.map_err(map_error)
|
|
2127
|
+
}
|
|
2128
|
+
|
|
2129
|
+
/// (async) Get a monitor by its ID.
|
|
2130
|
+
///
|
|
2131
|
+
/// @param {string} id - The monitor ID to find.
|
|
2132
|
+
/// @returns {Promise<Monitor>} Monitor information.
|
|
2133
|
+
#[napi]
|
|
2134
|
+
pub async fn get_monitor_by_id(&self, id: String) -> napi::Result<Monitor> {
|
|
2135
|
+
self.inner
|
|
2136
|
+
.get_monitor_by_id(&id)
|
|
2137
|
+
.await
|
|
2138
|
+
.map(Monitor::from)
|
|
2139
|
+
.map_err(map_error)
|
|
2140
|
+
}
|
|
2141
|
+
|
|
2142
|
+
/// (async) Get a monitor by its name.
|
|
2143
|
+
///
|
|
2144
|
+
/// @param {string} name - The monitor name to find.
|
|
2145
|
+
/// @returns {Promise<Monitor>} Monitor information.
|
|
2146
|
+
#[napi]
|
|
2147
|
+
pub async fn get_monitor_by_name(&self, name: String) -> napi::Result<Monitor> {
|
|
2148
|
+
self.inner
|
|
2149
|
+
.get_monitor_by_name(&name)
|
|
2150
|
+
.await
|
|
2151
|
+
.map(Monitor::from)
|
|
2152
|
+
.map_err(map_error)
|
|
2153
|
+
}
|
|
2154
|
+
|
|
2155
|
+
/// (async) Capture a screenshot of a specific monitor.
|
|
2156
|
+
///
|
|
2157
|
+
/// @param {Monitor} monitor - The monitor to capture.
|
|
2158
|
+
/// @returns {Promise<ScreenshotResult>} The screenshot data.
|
|
2159
|
+
#[napi]
|
|
2160
|
+
pub async fn capture_monitor(&self, monitor: Monitor) -> napi::Result<ScreenshotResult> {
|
|
2161
|
+
let rust_monitor = computeruse::Monitor {
|
|
2162
|
+
id: monitor.id,
|
|
2163
|
+
name: monitor.name,
|
|
2164
|
+
is_primary: monitor.is_primary,
|
|
2165
|
+
width: monitor.width,
|
|
2166
|
+
height: monitor.height,
|
|
2167
|
+
x: monitor.x,
|
|
2168
|
+
y: monitor.y,
|
|
2169
|
+
scale_factor: monitor.scale_factor,
|
|
2170
|
+
work_area: None,
|
|
2171
|
+
};
|
|
2172
|
+
self.inner
|
|
2173
|
+
.capture_monitor(&rust_monitor)
|
|
2174
|
+
.await
|
|
2175
|
+
.map(|r| ScreenshotResult {
|
|
2176
|
+
width: r.width,
|
|
2177
|
+
height: r.height,
|
|
2178
|
+
image_data: r.image_data,
|
|
2179
|
+
monitor: r.monitor.map(Monitor::from),
|
|
2180
|
+
})
|
|
2181
|
+
.map_err(map_error)
|
|
2182
|
+
}
|
|
2183
|
+
|
|
2184
|
+
/// (async) Capture screenshots of all monitors.
|
|
2185
|
+
///
|
|
2186
|
+
/// @returns {Promise<Array<{monitor: Monitor, screenshot: ScreenshotResult}>>} Array of monitor and screenshot pairs.
|
|
2187
|
+
#[napi]
|
|
2188
|
+
pub async fn capture_all_monitors(&self) -> napi::Result<Vec<MonitorScreenshotPair>> {
|
|
2189
|
+
self.inner
|
|
2190
|
+
.capture_all_monitors()
|
|
2191
|
+
.await
|
|
2192
|
+
.map(|results| {
|
|
2193
|
+
results
|
|
2194
|
+
.into_iter()
|
|
2195
|
+
.map(|(monitor, screenshot)| MonitorScreenshotPair {
|
|
2196
|
+
monitor: Monitor::from(monitor),
|
|
2197
|
+
screenshot: ScreenshotResult {
|
|
2198
|
+
width: screenshot.width,
|
|
2199
|
+
height: screenshot.height,
|
|
2200
|
+
image_data: screenshot.image_data,
|
|
2201
|
+
monitor: screenshot.monitor.map(Monitor::from),
|
|
2202
|
+
},
|
|
2203
|
+
})
|
|
2204
|
+
.collect()
|
|
2205
|
+
})
|
|
2206
|
+
.map_err(map_error)
|
|
2207
|
+
}
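A TypeScript sketch of the monitor APIs, assuming camelCase bindings; the raw frame is BGRA, as noted by the PNG conversion helpers further below.

import { Desktop } from '@elizaos/computeruse'; // export name assumed

async function describeMonitors() {
  const desktop = new Desktop();

  for (const m of await desktop.listMonitors()) {
    const tag = m.isPrimary ? ' [primary]' : '';
    console.log(`${m.name}: ${m.width}x${m.height} at (${m.x}, ${m.y})${tag}`);
  }

  // Capture only the primary display.
  const primary = await desktop.getPrimaryMonitor();
  const shot = await desktop.captureMonitor(primary);
  console.log(`captured ${shot.width}x${shot.height} BGRA frame`);
}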
|
|
2208
|
+
|
|
2209
|
+
/// Capture a screenshot of a window by process name.
|
|
2210
|
+
///
|
|
2211
|
+
/// Finds the first window matching the given process name and captures its screenshot.
|
|
2212
|
+
/// Process name matching is case-insensitive and uses substring matching.
|
|
2213
|
+
///
|
|
2214
|
+
/// @param {string} process - Process name to match (e.g., "chrome", "notepad", "code")
|
|
2215
|
+
/// @returns {ScreenshotResult} The screenshot data.
|
|
2216
|
+
#[napi(js_name = "captureWindowByProcess")]
|
|
2217
|
+
pub fn capture_window_by_process(&self, process: String) -> napi::Result<ScreenshotResult> {
|
|
2218
|
+
self.inner
|
|
2219
|
+
.capture_window_by_process(&process)
|
|
2220
|
+
.map(|r| ScreenshotResult {
|
|
2221
|
+
width: r.width,
|
|
2222
|
+
height: r.height,
|
|
2223
|
+
image_data: r.image_data,
|
|
2224
|
+
monitor: r.monitor.map(Monitor::from),
|
|
2225
|
+
})
|
|
2226
|
+
.map_err(map_error)
|
|
2227
|
+
}
|
|
2228
|
+
|
|
2229
|
+
/// (async) Captures a screenshot. Three modes:
|
|
2230
|
+
/// 1. Element mode: provide process + selector to capture specific element
|
|
2231
|
+
/// 2. Window mode: provide process only to capture entire window
|
|
2232
|
+
/// 3. Monitor mode: provide process + entireMonitor=true to capture the monitor where the window is located
|
|
2233
|
+
///
|
|
2234
|
+
/// @param {string} process - Process name to match (e.g., "chrome", "notepad", "code")
|
|
2235
|
+
/// @param {string} [selector] - Optional selector to capture a specific element within the process
|
|
2236
|
+
/// @param {boolean} [entireMonitor=false] - If true, captures the entire monitor containing the window
|
|
2237
|
+
/// @param {number} [timeoutMs=10000] - Timeout in milliseconds for finding the element
|
|
2238
|
+
/// @returns {Promise<ScreenshotResult>} The screenshot data.
|
|
2239
|
+
#[napi(js_name = "captureScreenshot")]
|
|
2240
|
+
pub async fn capture_screenshot(
|
|
2241
|
+
&self,
|
|
2242
|
+
process: String,
|
|
2243
|
+
selector: Option<String>,
|
|
2244
|
+
entire_monitor: Option<bool>,
|
|
2245
|
+
timeout_ms: Option<f64>,
|
|
2246
|
+
) -> napi::Result<ScreenshotResult> {
|
|
2247
|
+
use std::time::Duration;
|
|
2248
|
+
|
|
2249
|
+
let entire_monitor = entire_monitor.unwrap_or(false);
|
|
2250
|
+
let timeout = Duration::from_millis(timeout_ms.unwrap_or(10000.0) as u64);
|
|
2251
|
+
|
|
2252
|
+
// Build the full selector string like MCP does
|
|
2253
|
+
let full_selector = if let Some(sel) = &selector {
|
|
2254
|
+
if sel.is_empty() {
|
|
2255
|
+
format!("process:{}", process)
|
|
2256
|
+
} else {
|
|
2257
|
+
format!("process:{} >> {}", process, sel)
|
|
2258
|
+
}
|
|
2259
|
+
} else {
|
|
2260
|
+
format!("process:{}", process)
|
|
2261
|
+
};
|
|
2262
|
+
|
|
2263
|
+
// Create locator and find element
|
|
2264
|
+
let sel_rust: computeruse::selector::Selector = full_selector.as_str().into();
|
|
2265
|
+
let locator = self.inner.locator(sel_rust);
|
|
2266
|
+
let element = locator.first(Some(timeout)).await.map_err(map_error)?;
|
|
2267
|
+
|
|
2268
|
+
if entire_monitor {
|
|
2269
|
+
// Monitor mode: get element's monitor and capture it
|
|
2270
|
+
let monitor = element.monitor().map_err(map_error)?;
|
|
2271
|
+
let screenshot = monitor.capture(&self.inner).await.map_err(map_error)?;
|
|
2272
|
+
Ok(ScreenshotResult {
|
|
2273
|
+
width: screenshot.width,
|
|
2274
|
+
height: screenshot.height,
|
|
2275
|
+
image_data: screenshot.image_data,
|
|
2276
|
+
monitor: Some(Monitor::from(monitor)),
|
|
2277
|
+
})
|
|
2278
|
+
} else {
|
|
2279
|
+
// Element/Window mode: capture the element directly
|
|
2280
|
+
let screenshot = element.capture().map_err(map_error)?;
|
|
2281
|
+
Ok(ScreenshotResult {
|
|
2282
|
+
width: screenshot.width,
|
|
2283
|
+
height: screenshot.height,
|
|
2284
|
+
image_data: screenshot.image_data,
|
|
2285
|
+
monitor: screenshot.monitor.map(Monitor::from),
|
|
2286
|
+
})
|
|
2287
|
+
}
|
|
2288
|
+
}
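The three capture modes side by side in TypeScript, with the same naming assumptions; the 'role:Document' selector is only an illustrative guess at the selector syntax.

import { Desktop } from '@elizaos/computeruse'; // export name assumed

async function screenshotModes() {
  const desktop = new Desktop();

  // 1. Window mode: the whole notepad window.
  const win = await desktop.captureScreenshot('notepad');

  // 2. Element mode: a specific element inside the window (selector assumed).
  const doc = await desktop.captureScreenshot('notepad', 'role:Document');

  // 3. Monitor mode: the entire monitor hosting the notepad window.
  const mon = await desktop.captureScreenshot('notepad', undefined, true, 5000);

  console.log(win.width, doc.width, mon.width);
}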
|
|
2289
|
+
|
|
2290
|
+
// ============== SCREENSHOT UTILITIES ==============
|
|
2291
|
+
|
|
2292
|
+
/// Convert a screenshot to PNG bytes.
|
|
2293
|
+
/// Converts BGRA to RGBA and encodes as PNG format.
|
|
2294
|
+
///
|
|
2295
|
+
/// @param {ScreenshotResult} screenshot - The screenshot to convert.
|
|
2296
|
+
/// @returns {Buffer} PNG-encoded bytes.
|
|
2297
|
+
#[napi(js_name = "screenshotToPng")]
|
|
2298
|
+
pub fn screenshot_to_png(&self, screenshot: ScreenshotResult) -> napi::Result<Vec<u8>> {
|
|
2299
|
+
screenshot
|
|
2300
|
+
.to_inner()
|
|
2301
|
+
.to_png()
|
|
2302
|
+
.map_err(|e| napi::Error::from_reason(e.to_string()))
|
|
2303
|
+
}
|
|
2304
|
+
|
|
2305
|
+
/// Convert a screenshot to PNG bytes with resizing.
|
|
2306
|
+
/// If the image exceeds maxDimension in either width or height,
|
|
2307
|
+
/// it will be resized while maintaining aspect ratio.
|
|
2308
|
+
///
|
|
2309
|
+
/// @param {ScreenshotResult} screenshot - The screenshot to convert.
|
|
2310
|
+
/// @param {number} [maxDimension] - Maximum width or height. Defaults to 1920.
|
|
2311
|
+
/// @returns {Buffer} PNG-encoded bytes (potentially resized).
|
|
2312
|
+
#[napi(js_name = "screenshotToPngResized")]
|
|
2313
|
+
pub fn screenshot_to_png_resized(
|
|
2314
|
+
&self,
|
|
2315
|
+
screenshot: ScreenshotResult,
|
|
2316
|
+
max_dimension: Option<u32>,
|
|
2317
|
+
) -> napi::Result<Vec<u8>> {
|
|
2318
|
+
screenshot
|
|
2319
|
+
.to_inner()
|
|
2320
|
+
.to_png_resized(max_dimension)
|
|
2321
|
+
.map_err(|e| napi::Error::from_reason(e.to_string()))
|
|
2322
|
+
}
|
|
2323
|
+
|
|
2324
|
+
/// Convert a screenshot to base64-encoded PNG string.
|
|
2325
|
+
/// Useful for embedding in JSON responses or passing to LLMs.
|
|
2326
|
+
///
|
|
2327
|
+
/// @param {ScreenshotResult} screenshot - The screenshot to convert.
|
|
2328
|
+
/// @returns {string} Base64-encoded PNG string.
|
|
2329
|
+
#[napi(js_name = "screenshotToBase64Png")]
|
|
2330
|
+
pub fn screenshot_to_base64_png(&self, screenshot: ScreenshotResult) -> napi::Result<String> {
|
|
2331
|
+
screenshot
|
|
2332
|
+
.to_inner()
|
|
2333
|
+
.to_base64_png()
|
|
2334
|
+
.map_err(|e| napi::Error::from_reason(e.to_string()))
|
|
2335
|
+
}
|
|
2336
|
+
|
|
2337
|
+
/// Convert a screenshot to base64-encoded PNG string with resizing.
|
|
2338
|
+
/// If the image exceeds maxDimension in either width or height,
|
|
2339
|
+
/// it will be resized while maintaining aspect ratio.
|
|
2340
|
+
///
|
|
2341
|
+
/// @param {ScreenshotResult} screenshot - The screenshot to convert.
|
|
2342
|
+
/// @param {number} [maxDimension] - Maximum width or height. Defaults to 1920.
|
|
2343
|
+
/// @returns {string} Base64-encoded PNG string (potentially resized).
|
|
2344
|
+
#[napi(js_name = "screenshotToBase64PngResized")]
|
|
2345
|
+
pub fn screenshot_to_base64_png_resized(
|
|
2346
|
+
&self,
|
|
2347
|
+
screenshot: ScreenshotResult,
|
|
2348
|
+
max_dimension: Option<u32>,
|
|
2349
|
+
) -> napi::Result<String> {
|
|
2350
|
+
screenshot
|
|
2351
|
+
.to_inner()
|
|
2352
|
+
.to_base64_png_resized(max_dimension)
|
|
2353
|
+
.map_err(|e| napi::Error::from_reason(e.to_string()))
|
|
2354
|
+
}
|
|
2355
|
+
|
|
2356
|
+
/// Get the dimensions a screenshot would have after resizing.
|
|
2357
|
+
///
|
|
2358
|
+
/// @param {ScreenshotResult} screenshot - The screenshot to check.
|
|
2359
|
+
/// @param {number} maxDimension - Maximum width or height.
|
|
2360
|
+
/// @returns {ResizedDimensions} Object with width and height after resize.
|
|
2361
|
+
#[napi(js_name = "screenshotResizedDimensions")]
|
|
2362
|
+
pub fn screenshot_resized_dimensions(
|
|
2363
|
+
&self,
|
|
2364
|
+
screenshot: ScreenshotResult,
|
|
2365
|
+
max_dimension: u32,
|
|
2366
|
+
) -> ResizedDimensions {
|
|
2367
|
+
let (width, height) = screenshot.to_inner().resized_dimensions(max_dimension);
|
|
2368
|
+
ResizedDimensions { width, height }
|
|
2369
|
+
}
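A sketch that chains the capture and conversion helpers for an LLM-sized payload; the method names follow the js_name attributes above, while the export name remains an assumption.

import { Desktop } from '@elizaos/computeruse'; // export name assumed

function windowAsBase64() {
  const desktop = new Desktop();
  const shot = desktop.captureWindowByProcess('chrome');

  // Check the post-resize dimensions before encoding.
  const dims = desktop.screenshotResizedDimensions(shot, 1280);
  console.log(`encoding at ${dims.width}x${dims.height}`);

  // Base64 PNG capped at 1280px on the longer side, ready to embed in a prompt.
  const b64 = desktop.screenshotToBase64PngResized(shot, 1280);
  console.log(`payload length: ${b64.length} chars`);
}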
|
|
2370
|
+
|
|
2371
|
+
/// (async) Get all window elements for a given application name.
|
|
2372
|
+
///
|
|
2373
|
+
/// @param {string} name - The name of the application whose windows will be retrieved.
|
|
2374
|
+
/// @returns {Promise<Array<Element>>} A list of window elements belonging to the application.
|
|
2375
|
+
#[napi]
|
|
2376
|
+
pub async fn windows_for_application(&self, name: String) -> napi::Result<Vec<Element>> {
|
|
2377
|
+
self.inner
|
|
2378
|
+
.windows_for_application(&name)
|
|
2379
|
+
.await
|
|
2380
|
+
.map(|windows| windows.into_iter().map(Element::from).collect())
|
|
2381
|
+
.map_err(map_error)
|
|
2382
|
+
}
|
|
2383
|
+
|
|
2384
|
+
// ============== ADDITIONAL MISSING METHODS ==============
|
|
2385
|
+
|
|
2386
|
+
/// (async) Get the UI tree for all open applications in parallel.
|
|
2387
|
+
///
|
|
2388
|
+
/// @returns {Promise<Array<UINode>>} List of UI trees for all applications.
|
|
2389
|
+
#[napi]
|
|
2390
|
+
pub async fn get_all_applications_tree(&self) -> napi::Result<Vec<UINode>> {
|
|
2391
|
+
self.inner
|
|
2392
|
+
.get_all_applications_tree()
|
|
2393
|
+
.await
|
|
2394
|
+
.map(|trees| trees.into_iter().map(UINode::from).collect())
|
|
2395
|
+
.map_err(map_error)
|
|
2396
|
+
}
|
|
2397
|
+
|
|
2398
|
+
/// (async) Press a key globally.
|
|
2399
|
+
///
|
|
2400
|
+
/// @param {string} key - The key to press (e.g., "Enter", "Ctrl+C", "F1").
|
|
2401
|
+
/// @param {string} [process] - Process name for window screenshot capture. If provided, enables window screenshots.
|
|
2402
|
+
/// @param {boolean} [includeWindowScreenshot=true] - Whether to capture window screenshot (requires process).
|
|
2403
|
+
/// @param {boolean} [includeMonitorScreenshots=false] - Whether to capture monitor screenshots after key press.
|
|
2404
|
+
#[napi]
|
|
2405
|
+
pub async fn press_key(
|
|
2406
|
+
&self,
|
|
2407
|
+
key: String,
|
|
2408
|
+
process: Option<String>,
|
|
2409
|
+
include_window_screenshot: Option<bool>,
|
|
2410
|
+
include_monitor_screenshots: Option<bool>,
|
|
2411
|
+
) -> napi::Result<()> {
|
|
2412
|
+
// Normalize key to ensure curly brace format (e.g., "Enter" -> "{Enter}")
|
|
2413
|
+
let normalized_key = normalize_key(&key);
|
|
2414
|
+
tracing::debug!(
|
|
2415
|
+
"[TS SDK] desktop.press_key: normalized key: {} -> {}",
|
|
2416
|
+
key,
|
|
2417
|
+
normalized_key
|
|
2418
|
+
);
|
|
2419
|
+
let result = self
|
|
2420
|
+
.inner
|
|
2421
|
+
.press_key(&normalized_key)
|
|
2422
|
+
.await
|
|
2423
|
+
.map_err(map_error);
|
|
2424
|
+
|
|
2425
|
+
// Get PID from process name if provided
|
|
2426
|
+
let pid = process
|
|
2427
|
+
.as_ref()
|
|
2428
|
+
.and_then(|p| find_pid_for_process(&self.inner, p).ok());
|
|
2429
|
+
|
|
2430
|
+
// Capture screenshots if requested
|
|
2431
|
+
let _screenshots = capture_screenshots(
|
|
2432
|
+
&self.inner,
|
|
2433
|
+
pid,
|
|
2434
|
+
include_window_screenshot.unwrap_or(true) && pid.is_some(),
|
|
2435
|
+
include_monitor_screenshots.unwrap_or(false),
|
|
2436
|
+
"pressKey",
|
|
2437
|
+
);
|
|
2438
|
+
|
|
2439
|
+
result
|
|
2440
|
+
}
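A TypeScript sketch of global key presses under the same camelCase-binding assumption; it relies on the normalization described above ("Ctrl+S" becomes "{Ctrl+S}", keys already containing "{" pass through).

import { Desktop } from '@elizaos/computeruse'; // export name assumed

async function saveAndConfirm() {
  const desktop = new Desktop();

  await desktop.pressKey('Ctrl+S', 'notepad'); // window screenshot captured by default
  await desktop.delay(300);                    // let the save dialog settle
  await desktop.pressKey('{Enter}', 'notepad', false, false); // skip screenshots
}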
|
|
2441
|
+
|
|
2442
|
+
/// (async) Execute JavaScript in a browser tab.
|
|
2443
|
+
/// Finds the browser window by process name and executes the script.
|
|
2444
|
+
///
|
|
2445
|
+
/// @param {string} script - The JavaScript code to execute in browser context.
|
|
2446
|
+
/// @param {string} process - Process name to scope the browser window (e.g., 'chrome', 'msedge'). Required.
|
|
2447
|
+
/// @param {number} [timeoutMs=10000] - Timeout in milliseconds for finding the browser window.
|
|
2448
|
+
/// @returns {Promise<string>} The result of script execution.
|
|
2449
|
+
#[napi]
|
|
2450
|
+
pub async fn execute_browser_script(
|
|
2451
|
+
&self,
|
|
2452
|
+
script: String,
|
|
2453
|
+
process: String,
|
|
2454
|
+
timeout_ms: Option<f64>,
|
|
2455
|
+
) -> napi::Result<String> {
|
|
2456
|
+
use std::time::Duration;
|
|
2457
|
+
|
|
2458
|
+
let timeout = Duration::from_millis(timeout_ms.unwrap_or(10000.0) as u64);
|
|
2459
|
+
let selector_str = format!("process:{}", process);
|
|
2460
|
+
let sel: computeruse::selector::Selector = selector_str.as_str().into();
|
|
2461
|
+
let locator = self.inner.locator(sel);
|
|
2462
|
+
let element = locator.first(Some(timeout)).await.map_err(map_error)?;
|
|
2463
|
+
element
|
|
2464
|
+
.execute_browser_script(&script)
|
|
2465
|
+
.await
|
|
2466
|
+
.map_err(map_error)
|
|
2467
|
+
}
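A browser-script sketch under the same naming assumptions; the script body runs in the page context of the first window owned by the given process.

import { Desktop } from '@elizaos/computeruse'; // export name assumed

async function readPageTitle() {
  const desktop = new Desktop();
  const title = await desktop.executeBrowserScript(
    'document.title', // plain JS evaluated in the page
    'chrome',
    15000,            // allow up to 15s to locate the browser window
  );
  console.log('active tab title:', title);
}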
|
|
2468
|
+
|
|
2469
|
+
/// (async) Close a browser tab safely.
|
|
2470
|
+
///
|
|
2471
|
+
/// This method can identify the tab to close by:
|
|
2472
|
+
/// - tabId: Close a specific tab by its Chrome tab ID
|
|
2473
|
+
/// - url: Find and close a tab matching this URL (partial match supported)
|
|
2474
|
+
/// - title: Find and close a tab matching this title (case-insensitive partial match)
|
|
2475
|
+
/// - If none provided, closes the currently active tab
|
|
2476
|
+
///
|
|
2477
|
+
/// Returns information about the closed tab for verification.
|
|
2478
|
+
/// Returns null if no browser extension is connected or tab couldn't be found.
|
|
2479
|
+
///
|
|
2480
|
+
/// Safety:
|
|
2481
|
+
/// - Will NOT close protected browser pages (chrome://, edge://, about:, etc.)
|
|
2482
|
+
/// - Returns the closed tab's URL/title so you can verify the correct tab was closed
|
|
2483
|
+
///
|
|
2484
|
+
/// @param {number} [tabId] - Specific Chrome tab ID to close.
|
|
2485
|
+
/// @param {string} [url] - URL to match (partial match supported).
|
|
2486
|
+
/// @param {string} [title] - Title to match (case-insensitive partial match).
|
|
2487
|
+
/// @returns {Promise<CloseTabResult | null>} Info about closed tab, or null if no extension/tab found.
|
|
2488
|
+
///
|
|
2489
|
+
/// @example
|
|
2490
|
+
/// // Close by URL
|
|
2491
|
+
/// const result = await desktop.closeTab({ url: "example.com" });
|
|
2492
|
+
///
|
|
2493
|
+
/// @example
|
|
2494
|
+
/// // Close by title
|
|
2495
|
+
/// const result = await desktop.closeTab({ title: "My Page" });
|
|
2496
|
+
///
|
|
2497
|
+
/// @example
|
|
2498
|
+
/// // Close active tab
|
|
2499
|
+
/// const result = await desktop.closeTab();
|
|
2500
|
+
#[napi]
|
|
2501
|
+
pub async fn close_tab(
|
|
2502
|
+
&self,
|
|
2503
|
+
options: Option<crate::types::CloseTabOptions>,
|
|
2504
|
+
) -> napi::Result<Option<crate::types::CloseTabResult>> {
|
|
2505
|
+
let opts = options.unwrap_or_default();
|
|
2506
|
+
self.inner
|
|
2507
|
+
.close_tab(opts.tab_id, opts.url.as_deref(), opts.title.as_deref())
|
|
2508
|
+
.await
|
|
2509
|
+
.map(|opt| opt.map(crate::types::CloseTabResult::from))
|
|
2510
|
+
.map_err(map_error)
|
|
2511
|
+
}
|
|
2512
|
+
/// (async) Delay execution for a specified number of milliseconds.
|
|
2513
|
+
/// Useful for waiting between actions to ensure UI stability.
|
|
2514
|
+
///
|
|
2515
|
+
/// @param {number} delayMs - Delay in milliseconds.
|
|
2516
|
+
/// @returns {Promise<void>}
|
|
2517
|
+
#[napi]
|
|
2518
|
+
pub async fn delay(&self, delay_ms: u32) -> napi::Result<()> {
|
|
2519
|
+
self.inner.delay(delay_ms as u64).await.map_err(map_error)
|
|
2520
|
+
}
|
|
2521
|
+
|
|
2522
|
+
/// Navigate to a URL in a browser.
|
|
2523
|
+
/// This is the recommended method for browser navigation; it is more reliable than
|
|
2524
|
+
/// manually manipulating the address bar with keyboard/mouse actions.
|
|
2525
|
+
///
|
|
2526
|
+
/// @param {string} url - URL to navigate to
|
|
2527
|
+
/// @param {string | null} browser - Optional browser name ('Chrome', 'Firefox', 'Edge', 'Brave', 'Opera', 'Vivaldi', or 'Default')
|
|
2528
|
+
/// @param {boolean} [includeWindowScreenshot=true] - Whether to capture window screenshot after navigation
|
|
2529
|
+
/// @param {boolean} [includeMonitorScreenshots=false] - Whether to capture monitor screenshots after navigation
|
|
2530
|
+
/// @returns {Promise<Element>} The browser window element
|
|
2531
|
+
#[napi]
|
|
2532
|
+
pub fn navigate_browser(
|
|
2533
|
+
&self,
|
|
2534
|
+
url: String,
|
|
2535
|
+
browser: Option<String>,
|
|
2536
|
+
include_window_screenshot: Option<bool>,
|
|
2537
|
+
include_monitor_screenshots: Option<bool>,
|
|
2538
|
+
) -> napi::Result<Element> {
|
|
2539
|
+
let browser_enum = browser.map(|b| match b.as_str() {
|
|
2540
|
+
"Chrome" => computeruse::Browser::Chrome,
|
|
2541
|
+
"Firefox" => computeruse::Browser::Firefox,
|
|
2542
|
+
"Edge" => computeruse::Browser::Edge,
|
|
2543
|
+
"Brave" => computeruse::Browser::Brave,
|
|
2544
|
+
"Opera" => computeruse::Browser::Opera,
|
|
2545
|
+
"Vivaldi" => computeruse::Browser::Vivaldi,
|
|
2546
|
+
"Default" => computeruse::Browser::Default,
|
|
2547
|
+
custom => computeruse::Browser::Custom(custom.to_string()),
|
|
2548
|
+
});
|
|
2549
|
+
|
|
2550
|
+
let element = self.inner.open_url(&url, browser_enum).map_err(map_error)?;
|
|
2551
|
+
|
|
2552
|
+
// Capture screenshots if enabled (window default: true, monitor default: false)
|
|
2553
|
+
let _screenshots = capture_screenshots(
|
|
2554
|
+
&self.inner,
|
|
2555
|
+
element.process_id().ok(),
|
|
2556
|
+
include_window_screenshot.unwrap_or(true),
|
|
2557
|
+
include_monitor_screenshots.unwrap_or(false),
|
|
2558
|
+
"navigateBrowser",
|
|
2559
|
+
);
|
|
2560
|
+
|
|
2561
|
+
Ok(Element { inner: element })
|
|
2562
|
+
}
|
|
2563
|
+
|
|
2564
|
+
/// (async) Set the zoom level to a specific percentage.
|
|
2565
|
+
///
|
|
2566
|
+
/// @param {number} percentage - The zoom percentage (e.g., 100 for 100%, 150 for 150%, 50 for 50%).
|
|
2567
|
+
#[napi]
|
|
2568
|
+
pub async fn set_zoom(&self, percentage: u32) -> napi::Result<()> {
|
|
2569
|
+
self.inner.set_zoom(percentage).await.map_err(map_error)
|
|
2570
|
+
}
|
|
2571
|
+
|
|
2572
|
+
/// (async) Run Gemini Computer Use agentic loop.
|
|
2573
|
+
///
|
|
2574
|
+
/// Provide a goal and target process, and this will autonomously take actions
|
|
2575
|
+
/// (click, type, scroll, etc.) until the goal is achieved or maxSteps is reached.
|
|
2576
|
+
/// Uses Gemini's vision model to analyze screenshots and decide actions.
|
|
2577
|
+
///
|
|
2578
|
+
/// @param {string} process - Process name of the target application (e.g., "chrome", "notepad")
|
|
2579
|
+
/// @param {string} goal - What to achieve (e.g., "Open Notepad and type Hello World")
|
|
2580
|
+
/// @param {number} [maxSteps=20] - Maximum number of steps before stopping
|
|
2581
|
+
/// @param {function} [onStep] - Optional callback invoked after each step with step details
|
|
2582
|
+
/// @returns {Promise<ComputerUseResult>} Result with status, steps executed, and history
|
|
2583
|
+
#[napi]
|
|
2584
|
+
pub async fn gemini_computer_use(
|
|
2585
|
+
&self,
|
|
2586
|
+
process: String,
|
|
2587
|
+
goal: String,
|
|
2588
|
+
max_steps: Option<u32>,
|
|
2589
|
+
#[napi(ts_arg_type = "((err: null | Error, step: ComputerUseStep) => void) | undefined")]
|
|
2590
|
+
on_step: Option<ThreadsafeFunction<ComputerUseStep>>,
|
|
2591
|
+
) -> napi::Result<ComputerUseResult> {
|
|
2592
|
+
// Create progress callback if onStep is provided
|
|
2593
|
+
#[allow(clippy::type_complexity)]
|
|
2594
|
+
let progress_callback: Option<
|
|
2595
|
+
Box<dyn Fn(&computeruse::ComputerUseStep) + Send + Sync>,
|
|
2596
|
+
> = on_step.map(|tsfn| {
|
|
2597
|
+
let tsfn = Arc::new(tsfn);
|
|
2598
|
+
Box::new(move |step: &computeruse::ComputerUseStep| {
|
|
2599
|
+
let js_step = ComputerUseStep::from(step.clone());
|
|
2600
|
+
tsfn.call(Ok(js_step), ThreadsafeFunctionCallMode::NonBlocking);
|
|
2601
|
+
}) as Box<dyn Fn(&computeruse::ComputerUseStep) + Send + Sync>
|
|
2602
|
+
});
|
|
2603
|
+
|
|
2604
|
+
self.inner
|
|
2605
|
+
.gemini_computer_use(&process, &goal, max_steps, progress_callback)
|
|
2606
|
+
.await
|
|
2607
|
+
.map(ComputerUseResult::from)
|
|
2608
|
+
.map_err(|e| napi::Error::from_reason(e.to_string()))
|
|
2609
|
+
}
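An agentic-loop sketch in TypeScript, assuming camelCase bindings; the step callback signature follows the ts_arg_type above, and the `status` field on the result is an assumption about ComputerUseResult.

import { Desktop } from '@elizaos/computeruse'; // export name assumed

async function runAgent() {
  const desktop = new Desktop();
  const result = await desktop.geminiComputerUse(
    'notepad',
    'Type "Hello World" into the document and save it as hello.txt',
    15,
    (err, step) => {
      // Invoked after each step with the action the model chose.
      if (!err) console.log('step:', JSON.stringify(step));
    },
  );
  console.log('finished with status:', result.status); // field name assumed
}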
|
|
2610
|
+
|
|
2611
|
+
/// Stop all currently executing operations.
|
|
2612
|
+
///
|
|
2613
|
+
/// This cancels the internal cancellation token, which will cause any
|
|
2614
|
+
/// operations that check `isCancelled()` to abort. After calling this,
|
|
2615
|
+
/// you should create a new Desktop instance to start fresh.
|
|
2616
|
+
#[napi]
|
|
2617
|
+
pub fn stop_execution(&self) {
|
|
2618
|
+
self.inner.stop_execution();
|
|
2619
|
+
}
|
|
2620
|
+
|
|
2621
|
+
/// Check if execution has been cancelled.
|
|
2622
|
+
///
|
|
2623
|
+
/// Returns `true` if `stopExecution()` has been called.
|
|
2624
|
+
/// Long-running operations should periodically check this and abort if true.
|
|
2625
|
+
#[napi]
|
|
2626
|
+
pub fn is_cancelled(&self) -> bool {
|
|
2627
|
+
self.inner.is_cancelled()
|
|
2628
|
+
}
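A cancellation sketch combining these two methods with the agent loop above, under the same naming assumptions; whether a given long-running call observes the token is up to that call, as noted in the docs.

import { Desktop } from '@elizaos/computeruse'; // export name assumed

async function cancellableRun() {
  const desktop = new Desktop();

  // Abort whatever is still running after 30 seconds.
  const timer = setTimeout(() => desktop.stopExecution(), 30_000);
  try {
    await desktop.geminiComputerUse('chrome', 'Close all shopping tabs', 50);
  } finally {
    clearTimeout(timer);
    if (desktop.isCancelled()) {
      console.log('run was cancelled; create a new Desktop instance to continue');
    }
  }
}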
|
|
2629
|
+
|
|
2630
|
+
/// Stop all active highlight overlays globally.
|
|
2631
|
+
///
|
|
2632
|
+
/// This finds and destroys all highlight overlay windows that were created
|
|
2633
|
+
/// by `element.highlight()`. Useful for cleaning up highlights without
|
|
2634
|
+
/// needing to track individual HighlightHandle objects.
|
|
2635
|
+
///
|
|
2636
|
+
/// @returns {number} The number of highlights that were stopped.
|
|
2637
|
+
#[napi]
|
|
2638
|
+
pub fn stop_highlighting(&self) -> u32 {
|
|
2639
|
+
#[cfg(target_os = "windows")]
|
|
2640
|
+
{
|
|
2641
|
+
computeruse::stop_all_highlights() as u32
|
|
2642
|
+
}
|
|
2643
|
+
#[cfg(not(target_os = "windows"))]
|
|
2644
|
+
{
|
|
2645
|
+
// Not implemented for other platforms yet
|
|
2646
|
+
0
|
|
2647
|
+
}
|
|
2648
|
+
}
|
|
2649
|
+
|
|
2650
|
+
    /// Show inspect overlay with indexed elements for visual debugging.
    ///
    /// Displays a transparent overlay window with colored rectangles around UI elements,
    /// showing their index numbers for click targeting. Use `hideInspectOverlay()` to remove.
    ///
    /// @param {InspectElement[]} elements - Array of elements to highlight with their bounds.
    /// @param {object} windowBounds - The window bounds {x, y, width, height} to constrain the overlay.
    /// @param {OverlayDisplayMode} [displayMode='Index'] - What to show in labels: 'Index', 'Role', 'Name', etc.
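    /// @example
    /// // Illustrative sketch only: `elements` and the bounds values are hypothetical, and the
    /// // 'Index' literal assumes OverlayDisplayMode is exposed to JS as string values.
    /// desktop.showInspectOverlay(elements, { x: 0, y: 0, width: 1280, height: 720 }, 'Index');
    /// // ...inspect visually, then remove the overlay:
    /// desktop.hideInspectOverlay();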
    #[napi]
    #[cfg(target_os = "windows")]
    pub fn show_inspect_overlay(
        &self,
        elements: Vec<crate::types::InspectElement>,
        window_bounds: crate::types::Bounds,
        display_mode: Option<crate::types::OverlayDisplayMode>,
    ) -> napi::Result<()> {
        let core_elements: Vec<computeruse::InspectElement> =
            elements.into_iter().map(|e| e.into()).collect();
        let core_bounds = (
            window_bounds.x as i32,
            window_bounds.y as i32,
            window_bounds.width as i32,
            window_bounds.height as i32,
        );
        let core_mode = display_mode
            .map(|m| m.into())
            .unwrap_or(computeruse::OverlayDisplayMode::Index);

        computeruse::show_inspect_overlay(core_elements, core_bounds, core_mode)
            .map(|_handle| ()) // Discard handle - use hideInspectOverlay to close
            .map_err(|e| napi::Error::from_reason(e.to_string()))
    }

    /// Show inspect overlay (non-Windows stub).
    #[napi]
    #[cfg(not(target_os = "windows"))]
    pub fn show_inspect_overlay(
        &self,
        _elements: Vec<crate::types::InspectElement>,
        _window_bounds: crate::types::Bounds,
        _display_mode: Option<crate::types::OverlayDisplayMode>,
    ) -> napi::Result<()> {
        // Not implemented for other platforms yet
        Ok(())
    }

    /// Hide any active inspect overlay.
    ///
    /// This hides the visual overlay that was shown via `showInspectOverlay()`.
    /// Can be called from any thread.
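    /// @example
    /// // Illustrative sketch only; assumes an overlay was previously shown via showInspectOverlay().
    /// desktop.hideInspectOverlay();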
    #[napi]
    pub fn hide_inspect_overlay(&self) {
        #[cfg(target_os = "windows")]
        {
            computeruse::hide_inspect_overlay();
        }
        #[cfg(not(target_os = "windows"))]
        {
            // Not implemented for other platforms yet
        }
    }

    // ============== ELEMENT VERIFICATION ==============

    /// Verify that an element matching the selector exists within the same application as the scope element.
    ///
    /// This is used for post-action verification - checking that an expected element appeared after
    /// performing an action (e.g., a success dialog after clicking submit).
    ///
    /// @param {Element} scopeElement - The element to get the application scope from (typically the element the action was performed on)
    /// @param {string} selector - The selector string to search for
    /// @param {number} [timeoutMs=2000] - How long to wait for the element to appear in milliseconds
    /// @returns {Element} The found element if verification passes
    /// @throws Error if the element is not found within the timeout
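    /// @example
    /// // Illustrative sketch only: `submitButton` (the Element the action was performed on)
    /// // and the selector string are hypothetical, assuming napi's default camelCase bindings.
    /// const confirmation = await desktop.verifyElementExists(submitButton, 'role:Dialog', 3000);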
    #[napi]
    pub async fn verify_element_exists(
        &self,
        scope_element: &crate::Element,
        selector: String,
        timeout_ms: Option<u32>,
    ) -> napi::Result<crate::Element> {
        let timeout = timeout_ms.unwrap_or(2000) as u64;
        let found = self
            .inner
            .verify_element_exists(&scope_element.inner, &selector, timeout)
            .await
            .map_err(map_error)?;
        Ok(crate::Element { inner: found })
    }

    /// Verify that an element matching the selector does NOT exist within the same application as the scope element.
    ///
    /// This is used for post-action verification - checking that an element disappeared after
    /// performing an action (e.g., a modal dialog closed after clicking OK).
    ///
    /// @param {Element} scopeElement - The element to get the application scope from (typically the element the action was performed on)
    /// @param {string} selector - The selector string that should NOT be found
    /// @param {number} [timeoutMs=2000] - How long to wait/check that the element doesn't appear in milliseconds
    /// @returns {void}
    /// @throws Error if the element IS found (meaning verification failed)
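    /// @example
    /// // Illustrative sketch only: `okButton` (an Element) and the selector string are hypothetical.
    /// // Rejects if an element matching the selector is still present within the timeout.
    /// await desktop.verifyElementNotExists(okButton, 'role:Dialog', 3000);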
    #[napi]
    pub async fn verify_element_not_exists(
        &self,
        scope_element: &crate::Element,
        selector: String,
        timeout_ms: Option<u32>,
    ) -> napi::Result<()> {
        let timeout = timeout_ms.unwrap_or(2000) as u64;
        self.inner
            .verify_element_not_exists(&scope_element.inner, &selector, timeout)
            .await
            .map_err(map_error)
    }
}