@elizaos/computeruse 0.24.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/desktop.rs ADDED
@@ -0,0 +1,2763 @@
1
+ use crate::types::{
2
+ ClickResult, ClickType, ComputerUseResult, ComputerUseStep, Monitor, MonitorScreenshotPair,
3
+ ResizedDimensions, TreeOutputFormat, VisionType, WindowTreeResult,
4
+ };
5
+ use crate::Selector;
6
+ use crate::{
7
+ map_error, CommandOutput, Element, Locator, ScreenshotResult, TreeBuildConfig, UINode,
8
+ };
9
+ use napi::bindgen_prelude::Either;
10
+ use napi::threadsafe_function::{ThreadsafeFunction, ThreadsafeFunctionCallMode};
11
+ use napi_derive::napi;
12
+ use std::sync::{Arc, Once};
13
+ use computeruse::Desktop as ComputerUseDesktop;
14
+
15
/// Normalize key format to ensure curly brace syntax for special keys.
///
/// Keys that already contain `{` are assumed to be pre-formatted and are
/// returned unchanged; anything else is wrapped in `{...}` so the backend
/// treats it as a single special-key press rather than literal characters.
fn normalize_key(key: &str) -> String {
    match key.find('{') {
        // Already uses the curly-brace syntax; pass through untouched.
        Some(_) => key.to_owned(),
        // Wrap the whole key: "Enter" -> "{Enter}".
        None => format!("{{{key}}}"),
    }
}
25
+
26
/// Result of screenshot capture operations
#[derive(Default)]
struct ScreenshotPaths {
    // Saved file path of the window screenshot, if one was captured.
    window_path: Option<String>,
    // Saved file paths of the per-monitor screenshots, if any were captured.
    monitor_paths: Option<Vec<String>>,
}
32
+
33
+ /// Helper to capture and save screenshots based on options
34
+ fn capture_screenshots(
35
+ desktop: &ComputerUseDesktop,
36
+ pid: Option<u32>,
37
+ include_window: bool,
38
+ include_monitors: bool,
39
+ operation: &str,
40
+ ) -> ScreenshotPaths {
41
+ let mut result = ScreenshotPaths::default();
42
+
43
+ if !include_window && !include_monitors {
44
+ return result;
45
+ }
46
+
47
+ computeruse::screenshot_logger::init();
48
+ let prefix = computeruse::screenshot_logger::generate_prefix(None, operation);
49
+
50
+ if include_window {
51
+ if let Some(pid) = pid {
52
+ // Try to capture window screenshot by PID
53
+ if let Ok(apps) = desktop.applications() {
54
+ if let Some(app) = apps.into_iter().find(|a| a.process_id().ok() == Some(pid)) {
55
+ if let Ok(screenshot) = app.capture() {
56
+ if let Some(saved) = computeruse::screenshot_logger::save_window_screenshot(
57
+ &screenshot,
58
+ &prefix,
59
+ None,
60
+ ) {
61
+ result.window_path = Some(saved.path.to_string_lossy().to_string());
62
+ }
63
+ }
64
+ }
65
+ }
66
+ }
67
+ }
68
+
69
+ if include_monitors {
70
+ // Capture all monitors using futures executor for sync context
71
+ if let Ok(monitors) = futures::executor::block_on(desktop.capture_all_monitors()) {
72
+ let saved =
73
+ computeruse::screenshot_logger::save_monitor_screenshots(&monitors, &prefix, None);
74
+ if !saved.is_empty() {
75
+ result.monitor_paths = Some(
76
+ saved
77
+ .into_iter()
78
+ .map(|s| s.path.to_string_lossy().to_string())
79
+ .collect(),
80
+ );
81
+ }
82
+ }
83
+ }
84
+
85
+ result
86
+ }
87
+
88
+ /// Helper to find PID from process name using the shared core function.
89
+ fn find_pid_for_process(desktop: &ComputerUseDesktop, process_name: &str) -> napi::Result<u32> {
90
+ computeruse::find_pid_for_process(desktop, process_name).map_err(map_error)
91
+ }
92
+
93
/// Main entry point for desktop automation.
#[napi(js_name = "Desktop")]
pub struct Desktop {
    // Wrapped core automation engine; every napi method delegates to it.
    inner: ComputerUseDesktop,
}
98
+
99
+ #[allow(clippy::needless_pass_by_value)]
100
+ #[napi]
101
+ impl Desktop {
102
    /// Create a new Desktop automation instance with configurable options.
    ///
    /// @param {boolean} [useBackgroundApps=false] - Enable background apps support.
    /// @param {boolean} [activateApp=false] - Enable app activation support.
    /// @param {string} [logLevel] - Logging level (e.g., 'info', 'debug', 'warn', 'error').
    /// Falls back to RUST_LOG or COMPUTERUSE_LOG_LEVEL env vars, defaults to 'info'.
    /// @returns {Desktop} A new Desktop automation instance.
    #[napi(constructor)]
    pub fn new(
        use_background_apps: Option<bool>,
        activate_app: Option<bool>,
        log_level: Option<String>,
    ) -> Self {
        let use_background_apps = use_background_apps.unwrap_or(false);
        let activate_app = activate_app.unwrap_or(false);

        // Priority: explicit param > RUST_LOG env > COMPUTERUSE_LOG_LEVEL env > "info" default
        let log_level = log_level
            .or_else(|| std::env::var("RUST_LOG").ok())
            .or_else(|| std::env::var("COMPUTERUSE_LOG_LEVEL").ok())
            .unwrap_or_else(|| "info".to_string());

        // The tracing subscriber is process-global and installed at most once:
        // only the FIRST Desktop constructed in this process determines the
        // effective log level; later instances' `logLevel` is silently ignored.
        static INIT: Once = Once::new();
        INIT.call_once(|| {
            let _ = tracing_subscriber::fmt()
                .with_env_filter(log_level)
                .with_ansi(false) // Disable ANSI color codes for cleaner output
                .try_init();
        });
        // NOTE(review): failure to create the core engine panics rather than
        // surfacing a JS error — napi constructors here do not return Result.
        let desktop = ComputerUseDesktop::new(use_background_apps, activate_app)
            .expect("Failed to create Desktop instance");
        Desktop { inner: desktop }
    }
135
+
136
+ /// Get the root UI element of the desktop.
137
+ ///
138
+ /// @returns {Element} The root UI element.
139
+ #[napi]
140
+ pub fn root(&self) -> Element {
141
+ let root = self.inner.root();
142
+ Element::from(root)
143
+ }
144
+
145
+ /// Get a list of all running applications.
146
+ ///
147
+ /// @returns {Array<Element>} List of application UI elements.
148
+ #[napi]
149
+ pub fn applications(&self) -> napi::Result<Vec<Element>> {
150
+ self.inner
151
+ .applications()
152
+ .map(|apps| apps.into_iter().map(Element::from).collect())
153
+ .map_err(map_error)
154
+ }
155
+
156
+ /// Get a running application by name.
157
+ ///
158
+ /// @param {string} name - The name of the application to find.
159
+ /// @returns {Element} The application UI element.
160
+ #[napi]
161
+ pub fn application(&self, name: String) -> napi::Result<Element> {
162
+ self.inner
163
+ .application(&name)
164
+ .map(Element::from)
165
+ .map_err(map_error)
166
+ }
167
+
168
+ /// Open an application by name.
169
+ ///
170
+ /// @param {string} name - The name of the application to open.
171
+ /// @param {boolean} [includeWindowScreenshot=true] - Whether to capture window screenshot after opening
172
+ /// @param {boolean} [includeMonitorScreenshots=false] - Whether to capture monitor screenshots after opening
173
+ #[napi]
174
+ pub fn open_application(
175
+ &self,
176
+ name: String,
177
+ include_window_screenshot: Option<bool>,
178
+ include_monitor_screenshots: Option<bool>,
179
+ ) -> napi::Result<Element> {
180
+ let element = self.inner.open_application(&name).map_err(map_error)?;
181
+
182
+ // Capture screenshots if enabled (window default: true, monitor default: false)
183
+ let _screenshots = capture_screenshots(
184
+ &self.inner,
185
+ element.process_id().ok(),
186
+ include_window_screenshot.unwrap_or(true),
187
+ include_monitor_screenshots.unwrap_or(false),
188
+ "openApplication",
189
+ );
190
+
191
+ Ok(Element::from(element))
192
+ }
193
+
194
+ /// Activate an application by name.
195
+ ///
196
+ /// @param {string} name - The name of the application to activate.
197
+ #[napi]
198
+ pub fn activate_application(&self, name: String) -> napi::Result<()> {
199
+ self.inner.activate_application(&name).map_err(map_error)
200
+ }
201
+
202
+ /// Click within element bounds at a specified position.
203
+ ///
204
+ /// This is useful for clicking on elements from UI tree, OCR, omniparser, gemini vision, or DOM
205
+ /// without needing an element reference - just the bounds.
206
+ ///
207
+ /// @param {number} x - X coordinate of the bounds.
208
+ /// @param {number} y - Y coordinate of the bounds.
209
+ /// @param {number} width - Width of the bounds.
210
+ /// @param {number} height - Height of the bounds.
211
+ /// @param {number} [xPercentage=50] - X position within bounds as percentage (0-100). Defaults to 50 (center).
212
+ /// @param {number} [yPercentage=50] - Y position within bounds as percentage (0-100). Defaults to 50 (center).
213
+ /// @param {ClickType} [clickType='left'] - Type of click: 'left', 'double', or 'right'.
214
+ /// @param {boolean} [restoreCursor=true] - If true, restore cursor to original position after clicking.
215
+ /// @param {string} [process] - Process name for window screenshot capture. If provided, enables window screenshots.
216
+ /// @param {boolean} [includeWindowScreenshot=true] - Whether to capture window screenshot (requires process).
217
+ /// @param {boolean} [includeMonitorScreenshots=false] - Whether to capture monitor screenshots after clicking.
218
+ /// @returns {ClickResult} Result with clicked coordinates and method details.
219
+ #[napi]
220
+ #[allow(clippy::too_many_arguments)]
221
+ pub fn click_at_bounds(
222
+ &self,
223
+ x: f64,
224
+ y: f64,
225
+ width: f64,
226
+ height: f64,
227
+ x_percentage: Option<u8>,
228
+ y_percentage: Option<u8>,
229
+ click_type: Option<ClickType>,
230
+ restore_cursor: Option<bool>,
231
+ process: Option<String>,
232
+ include_window_screenshot: Option<bool>,
233
+ include_monitor_screenshots: Option<bool>,
234
+ ) -> napi::Result<ClickResult> {
235
+ let bounds = (x, y, width, height);
236
+ let click_position = match (x_percentage, y_percentage) {
237
+ (Some(xp), Some(yp)) => Some((xp, yp)),
238
+ (Some(xp), None) => Some((xp, 50)),
239
+ (None, Some(yp)) => Some((50, yp)),
240
+ (None, None) => None,
241
+ };
242
+ let click_type = click_type.unwrap_or(ClickType::Left);
243
+ let restore_cursor = restore_cursor.unwrap_or(true);
244
+
245
+ let result = self
246
+ .inner
247
+ .click_at_bounds(bounds, click_position, click_type.into(), restore_cursor)
248
+ .map(ClickResult::from)
249
+ .map_err(map_error);
250
+
251
+ // Get PID from process name if provided
252
+ let pid = process
253
+ .as_ref()
254
+ .and_then(|p| find_pid_for_process(&self.inner, p).ok());
255
+
256
+ // Capture screenshots if requested
257
+ let _screenshots = capture_screenshots(
258
+ &self.inner,
259
+ pid,
260
+ include_window_screenshot.unwrap_or(true) && pid.is_some(),
261
+ include_monitor_screenshots.unwrap_or(false),
262
+ "clickAtBounds",
263
+ );
264
+
265
+ result
266
+ }
267
+
268
+ /// Click on an element by its index from the last tree/vision query.
269
+ ///
270
+ /// This looks up cached bounds from the appropriate cache based on visionType,
271
+ /// then clicks at the specified position within those bounds.
272
+ ///
273
+ /// @param {number} index - 1-based index from the tree/vision output (e.g., #1, #2).
274
+ /// @param {VisionType} [visionType='UiTree'] - Source of the index: 'UiTree', 'Ocr', 'Omniparser', 'Gemini', or 'Dom'.
275
+ /// @param {number} [xPercentage=50] - X position within bounds as percentage (0-100).
276
+ /// @param {number} [yPercentage=50] - Y position within bounds as percentage (0-100).
277
+ /// @param {ClickType} [clickType='Left'] - Type of click: 'Left', 'Double', or 'Right'.
278
+ /// @param {boolean} [restoreCursor=true] - If true, restore cursor to original position after clicking.
279
+ /// @param {string} [process] - Process name for window screenshot capture. If provided, enables window screenshots.
280
+ /// @param {boolean} [includeWindowScreenshot=true] - Whether to capture window screenshot (requires process).
281
+ /// @param {boolean} [includeMonitorScreenshots=false] - Whether to capture monitor screenshots after clicking.
282
+ /// @returns {ClickResult} Result with clicked coordinates, element info, and method details.
283
+ #[napi]
284
+ #[allow(clippy::too_many_arguments)]
285
+ pub fn click_by_index(
286
+ &self,
287
+ index: u32,
288
+ vision_type: Option<VisionType>,
289
+ x_percentage: Option<u8>,
290
+ y_percentage: Option<u8>,
291
+ click_type: Option<ClickType>,
292
+ restore_cursor: Option<bool>,
293
+ process: Option<String>,
294
+ include_window_screenshot: Option<bool>,
295
+ include_monitor_screenshots: Option<bool>,
296
+ ) -> napi::Result<ClickResult> {
297
+ let vision_type = vision_type.unwrap_or(VisionType::UiTree);
298
+ let click_position = match (x_percentage, y_percentage) {
299
+ (Some(xp), Some(yp)) => Some((xp, yp)),
300
+ (Some(xp), None) => Some((xp, 50)),
301
+ (None, Some(yp)) => Some((50, yp)),
302
+ (None, None) => None,
303
+ };
304
+ let click_type = click_type.unwrap_or(ClickType::Left);
305
+ let restore_cursor = restore_cursor.unwrap_or(true);
306
+
307
+ let result = self
308
+ .inner
309
+ .click_by_index(
310
+ index,
311
+ vision_type.into(),
312
+ click_position,
313
+ click_type.into(),
314
+ restore_cursor,
315
+ )
316
+ .map(ClickResult::from)
317
+ .map_err(map_error);
318
+
319
+ // Get PID from process name if provided
320
+ let pid = process
321
+ .as_ref()
322
+ .and_then(|p| find_pid_for_process(&self.inner, p).ok());
323
+
324
+ // Capture screenshots if requested
325
+ let _screenshots = capture_screenshots(
326
+ &self.inner,
327
+ pid,
328
+ include_window_screenshot.unwrap_or(true) && pid.is_some(),
329
+ include_monitor_screenshots.unwrap_or(false),
330
+ "clickByIndex",
331
+ );
332
+
333
+ result
334
+ }
335
+
336
+ /// (async) Run a shell command.
337
+ ///
338
+ /// @param {string} [windowsCommand] - Command to run on Windows.
339
+ /// @param {string} [unixCommand] - Command to run on Unix.
340
+ /// @returns {Promise<CommandOutput>} The command output.
341
+ #[napi]
342
+ pub async fn run_command(
343
+ &self,
344
+ windows_command: Option<String>,
345
+ unix_command: Option<String>,
346
+ ) -> napi::Result<CommandOutput> {
347
+ self.inner
348
+ .run_command(windows_command.as_deref(), unix_command.as_deref())
349
+ .await
350
+ .map(|r| CommandOutput {
351
+ exit_status: r.exit_status,
352
+ stdout: r.stdout,
353
+ stderr: r.stderr,
354
+ })
355
+ .map_err(map_error)
356
+ }
357
+
358
+ /// (async) Execute a shell command using GitHub Actions-style syntax.
359
+ ///
360
+ /// @param {string} command - The command to run (can be single or multi-line).
361
+ /// @param {string} [shell] - Optional shell to use (defaults to PowerShell on Windows, bash on Unix).
362
+ /// @param {string} [workingDirectory] - Optional working directory for the command.
363
+ /// @returns {Promise<CommandOutput>} The command output.
364
+ #[napi]
365
+ pub async fn run(
366
+ &self,
367
+ command: String,
368
+ shell: Option<String>,
369
+ working_directory: Option<String>,
370
+ ) -> napi::Result<CommandOutput> {
371
+ self.inner
372
+ .run(
373
+ command.as_str(),
374
+ shell.as_deref(),
375
+ working_directory.as_deref(),
376
+ )
377
+ .await
378
+ .map(|r| CommandOutput {
379
+ exit_status: r.exit_status,
380
+ stdout: r.stdout,
381
+ stderr: r.stderr,
382
+ })
383
+ .map_err(map_error)
384
+ }
385
+
386
+ /// (async) Perform OCR on an image file.
387
+ ///
388
+ /// @param {string} imagePath - Path to the image file.
389
+ /// @returns {Promise<string>} The extracted text.
390
+ #[napi]
391
+ pub async fn ocr_image_path(&self, image_path: String) -> napi::Result<String> {
392
+ self.inner
393
+ .ocr_image_path(&image_path)
394
+ .await
395
+ .map_err(map_error)
396
+ }
397
+
398
+ /// (async) Perform OCR on a screenshot.
399
+ ///
400
+ /// @param {ScreenshotResult} screenshot - The screenshot to process.
401
+ /// @returns {Promise<string>} The extracted text.
402
+ #[napi]
403
+ pub async fn ocr_screenshot(&self, screenshot: ScreenshotResult) -> napi::Result<String> {
404
+ let rust_screenshot = screenshot.to_inner();
405
+ self.inner
406
+ .ocr_screenshot(&rust_screenshot)
407
+ .await
408
+ .map_err(map_error)
409
+ }
410
+
411
    /// (async) Perform OCR on a window by process name and return structured results with bounding boxes.
    /// Returns an OcrResult containing the OCR tree, formatted output, and index-to-bounds mapping
    /// for click targeting.
    ///
    /// @param {string} process - Process name to match (e.g., 'chrome', 'notepad').
    /// @param {boolean} [formatOutput=true] - Whether to generate formatted compact YAML output.
    /// @returns {Promise<OcrResult>} Complete OCR result with tree, formatted output, and bounds mapping.
    #[napi]
    #[cfg(target_os = "windows")]
    pub async fn perform_ocr_for_process(
        &self,
        process: String,
        format_output: Option<bool>,
    ) -> napi::Result<crate::types::OcrResult> {
        let format_output = format_output.unwrap_or(true);

        // Find PID for the process name
        let pid = find_pid_for_process(&self.inner, &process)?;

        // Find the application element by PID
        let apps = self.inner.applications().map_err(map_error)?;
        let window_element = apps
            .into_iter()
            .find(|app| app.process_id().ok() == Some(pid))
            .ok_or_else(|| {
                napi::Error::from_reason(format!("No window found for process '{}'", process))
            })?;

        // Get window bounds (absolute screen coordinates)
        let bounds = window_element.bounds().map_err(map_error)?;
        let (window_x, window_y, win_w, win_h) = bounds;

        // Capture screenshot of the window
        let screenshot = window_element.capture().map_err(map_error)?;

        // Calculate DPI scale factors (physical screenshot pixels / logical window size)
        // NOTE(review): divides by the logical window size — presumably non-zero
        // for a real window, but a zero-sized window would yield inf/NaN scales.
        let dpi_scale_w = screenshot.width as f64 / win_w;
        let dpi_scale_h = screenshot.height as f64 / win_h;

        // Perform OCR with bounding boxes
        let ocr_element = self
            .inner
            .ocr_screenshot_with_bounds(&screenshot, window_x, window_y, dpi_scale_w, dpi_scale_h)
            .map_err(map_error)?;

        // Format the OCR tree if requested
        let (formatted, index_to_bounds) = if format_output {
            let result = computeruse::format_ocr_tree_as_compact_yaml(&ocr_element, 0);

            // Populate the OCR cache for click_by_index support
            self.inner
                .populate_ocr_cache(result.index_to_bounds.clone());

            // Re-key the numeric index map as strings and convert tuples into
            // the SDK's OcrBoundsEntry/Bounds types.
            let bounds_map: std::collections::HashMap<String, crate::types::OcrBoundsEntry> =
                result
                    .index_to_bounds
                    .into_iter()
                    .map(|(idx, (text, (x, y, w, h)))| {
                        (
                            idx.to_string(),
                            crate::types::OcrBoundsEntry {
                                text,
                                bounds: crate::types::Bounds {
                                    x,
                                    y,
                                    width: w,
                                    height: h,
                                },
                            },
                        )
                    })
                    .collect();
            (Some(result.formatted), bounds_map)
        } else {
            (None, std::collections::HashMap::new())
        };

        // NOTE(review): when formatOutput is false the bounds map is empty, so
        // element_count reports 0 even if OCR found elements — confirm intended.
        let element_count = index_to_bounds.len() as u32;

        Ok(crate::types::OcrResult {
            tree: crate::types::OcrElement::from(ocr_element),
            formatted,
            index_to_bounds,
            element_count,
        })
    }
497
+
498
+ /// (async) Perform OCR on a window by process name (non-Windows stub).
499
+ #[napi]
500
+ #[cfg(not(target_os = "windows"))]
501
+ pub async fn perform_ocr_for_process(
502
+ &self,
503
+ _process: String,
504
+ _format_output: Option<bool>,
505
+ ) -> napi::Result<crate::types::OcrResult> {
506
+ Err(napi::Error::from_reason(
507
+ "OCR with bounding boxes is currently only supported on Windows",
508
+ ))
509
+ }
510
+
511
+ /// (async) Capture DOM elements from the current browser tab.
512
+ ///
513
+ /// Extracts visible DOM elements with their properties and screen coordinates.
514
+ /// Uses JavaScript injection via Chrome extension to traverse the DOM tree.
515
+ ///
516
+ /// @param {number} [maxElements=200] - Maximum number of elements to capture.
517
+ /// @param {boolean} [formatOutput=true] - Whether to include formatted compact YAML output.
518
+ /// @returns {Promise<BrowserDomResult>} DOM elements with bounds for click targeting.
519
+ #[napi]
520
+ pub async fn capture_browser_dom(
521
+ &self,
522
+ max_elements: Option<u32>,
523
+ format_output: Option<bool>,
524
+ ) -> napi::Result<crate::types::BrowserDomResult> {
525
+ use std::collections::HashMap;
526
+ use std::time::Duration;
527
+
528
+ let max_elements = max_elements.unwrap_or(200);
529
+ let format_output = format_output.unwrap_or(true);
530
+
531
+ // Get viewport offset from Document element (more reliable than JS due to DPI scaling)
532
+ let viewport_offset = match self
533
+ .inner
534
+ .locator("role:Document")
535
+ .first(Some(Duration::from_millis(2000)))
536
+ .await
537
+ {
538
+ Ok(doc_element) => match doc_element.bounds() {
539
+ Ok((x, y, _w, _h)) => (x, y),
540
+ Err(_) => (0.0, 0.0),
541
+ },
542
+ Err(_) => (0.0, 0.0),
543
+ };
544
+
545
+ // JavaScript to extract visible DOM elements
546
+ let script = format!(
547
+ r#"
548
+ (function() {{
549
+ const elements = [];
550
+ const maxElements = {max_elements};
551
+
552
+ const walker = document.createTreeWalker(
553
+ document.body,
554
+ NodeFilter.SHOW_ELEMENT,
555
+ {{
556
+ acceptNode: function(node) {{
557
+ const style = window.getComputedStyle(node);
558
+ const rect = node.getBoundingClientRect();
559
+
560
+ if (style.display === 'none' ||
561
+ style.visibility === 'hidden' ||
562
+ style.opacity === '0' ||
563
+ rect.width === 0 ||
564
+ rect.height === 0) {{
565
+ return NodeFilter.FILTER_SKIP;
566
+ }}
567
+
568
+ return NodeFilter.FILTER_ACCEPT;
569
+ }}
570
+ }}
571
+ );
572
+
573
+ let node;
574
+ while (node = walker.nextNode()) {{
575
+ if (elements.length >= maxElements) {{
576
+ break;
577
+ }}
578
+
579
+ const rect = node.getBoundingClientRect();
580
+ const text = node.innerText ? node.innerText.substring(0, 100).trim() : null;
581
+
582
+ elements.push({{
583
+ tag: node.tagName.toLowerCase(),
584
+ id: node.id || null,
585
+ classes: Array.from(node.classList),
586
+ text: text,
587
+ href: node.href || null,
588
+ type: node.type || null,
589
+ name: node.name || null,
590
+ value: node.value || null,
591
+ placeholder: node.placeholder || null,
592
+ aria_label: node.getAttribute('aria-label'),
593
+ role: node.getAttribute('role'),
594
+ x: Math.round(rect.x * window.devicePixelRatio),
595
+ y: Math.round(rect.y * window.devicePixelRatio),
596
+ width: Math.round(rect.width * window.devicePixelRatio),
597
+ height: Math.round(rect.height * window.devicePixelRatio)
598
+ }});
599
+ }}
600
+
601
+ return JSON.stringify({{
602
+ elements: elements,
603
+ total_found: elements.length,
604
+ page_url: window.location.href,
605
+ page_title: document.title,
606
+ devicePixelRatio: window.devicePixelRatio
607
+ }});
608
+ }})()"#
609
+ );
610
+
611
+ let result_str = self
612
+ .inner
613
+ .execute_browser_script(&script)
614
+ .await
615
+ .map_err(map_error)?;
616
+
617
+ let parsed: serde_json::Value = serde_json::from_str(&result_str)
618
+ .map_err(|e| napi::Error::from_reason(format!("Failed to parse DOM result: {e}")))?;
619
+
620
+ let page_url = parsed
621
+ .get("page_url")
622
+ .and_then(|v| v.as_str())
623
+ .unwrap_or("")
624
+ .to_string();
625
+ let page_title = parsed
626
+ .get("page_title")
627
+ .and_then(|v| v.as_str())
628
+ .unwrap_or("")
629
+ .to_string();
630
+
631
+ let raw_elements = parsed
632
+ .get("elements")
633
+ .and_then(|v| v.as_array())
634
+ .cloned()
635
+ .unwrap_or_default();
636
+
637
+ // Convert to BrowserDomElement and build index_to_bounds
638
+ let mut elements = Vec::new();
639
+ let mut index_to_bounds: HashMap<String, crate::types::DomBoundsEntry> = HashMap::new();
640
+ let mut formatted_lines: Vec<String> = Vec::new();
641
+
642
+ if format_output {
643
+ formatted_lines.push(format!(
644
+ "Browser DOM: {} elements (url: {}, title: {})",
645
+ raw_elements.len(),
646
+ page_url,
647
+ page_title
648
+ ));
649
+ }
650
+
651
+ for (i, elem) in raw_elements.iter().enumerate() {
652
+ let idx = i + 1;
653
+ let tag = elem
654
+ .get("tag")
655
+ .and_then(|v| v.as_str())
656
+ .unwrap_or("")
657
+ .to_string();
658
+ let id = elem.get("id").and_then(|v| v.as_str()).map(String::from);
659
+ let classes: Vec<String> = elem
660
+ .get("classes")
661
+ .and_then(|v| v.as_array())
662
+ .map(|arr| {
663
+ arr.iter()
664
+ .filter_map(|c| c.as_str().map(String::from))
665
+ .collect()
666
+ })
667
+ .unwrap_or_default();
668
+ let text = elem.get("text").and_then(|v| v.as_str()).map(String::from);
669
+ let href = elem.get("href").and_then(|v| v.as_str()).map(String::from);
670
+ let r#type = elem.get("type").and_then(|v| v.as_str()).map(String::from);
671
+ let name = elem.get("name").and_then(|v| v.as_str()).map(String::from);
672
+ let value = elem.get("value").and_then(|v| v.as_str()).map(String::from);
673
+ let placeholder = elem
674
+ .get("placeholder")
675
+ .and_then(|v| v.as_str())
676
+ .map(String::from);
677
+ let aria_label = elem
678
+ .get("aria_label")
679
+ .and_then(|v| v.as_str())
680
+ .map(String::from);
681
+ let role = elem.get("role").and_then(|v| v.as_str()).map(String::from);
682
+
683
+ // Build bounds with viewport offset added
684
+ let x = elem.get("x").and_then(|v| v.as_f64()).unwrap_or(0.0) + viewport_offset.0;
685
+ let y = elem.get("y").and_then(|v| v.as_f64()).unwrap_or(0.0) + viewport_offset.1;
686
+ let width = elem.get("width").and_then(|v| v.as_f64()).unwrap_or(0.0);
687
+ let height = elem.get("height").and_then(|v| v.as_f64()).unwrap_or(0.0);
688
+
689
+ let bounds = crate::types::Bounds {
690
+ x,
691
+ y,
692
+ width,
693
+ height,
694
+ };
695
+
696
+ // Display name for index_to_bounds
697
+ let display_name = text
698
+ .as_ref()
699
+ .filter(|t| !t.is_empty())
700
+ .cloned()
701
+ .or_else(|| aria_label.clone())
702
+ .or_else(|| placeholder.clone())
703
+ .or_else(|| name.clone())
704
+ .or_else(|| id.clone())
705
+ .unwrap_or_else(|| format!("<{}>", tag));
706
+
707
+ // Format line for compact YAML
708
+ if format_output {
709
+ let mut line_parts = vec![format!("#{} [{}]", idx, tag.to_uppercase())];
710
+ if let Some(ref t) = text {
711
+ if !t.is_empty() {
712
+ let truncated = if t.len() > 40 {
713
+ format!("{}...", &t[..40])
714
+ } else {
715
+ t.clone()
716
+ };
717
+ line_parts.push(truncated);
718
+ }
719
+ }
720
+ if let Some(ref a) = aria_label {
721
+ line_parts.push(format!("aria:{}", a));
722
+ }
723
+ if let Some(ref r) = role {
724
+ line_parts.push(format!("role:{}", r));
725
+ }
726
+ formatted_lines.push(format!(" {}", line_parts.join(" ")));
727
+ }
728
+
729
+ index_to_bounds.insert(
730
+ idx.to_string(),
731
+ crate::types::DomBoundsEntry {
732
+ name: display_name,
733
+ tag: tag.clone(),
734
+ bounds: bounds.clone(),
735
+ },
736
+ );
737
+
738
+ elements.push(crate::types::BrowserDomElement {
739
+ tag,
740
+ id,
741
+ classes,
742
+ text,
743
+ href,
744
+ r#type,
745
+ name,
746
+ value,
747
+ placeholder,
748
+ aria_label,
749
+ role,
750
+ bounds,
751
+ });
752
+ }
753
+
754
+ // Populate DOM cache for click_by_index
755
+ #[allow(clippy::type_complexity)]
756
+ let cache_items: std::collections::HashMap<
757
+ u32,
758
+ (String, String, (f64, f64, f64, f64)),
759
+ > = index_to_bounds
760
+ .iter()
761
+ .filter_map(|(key, entry)| {
762
+ key.parse::<u32>().ok().map(|idx| {
763
+ (
764
+ idx,
765
+ (
766
+ entry.name.clone(),
767
+ entry.tag.clone(),
768
+ (
769
+ entry.bounds.x,
770
+ entry.bounds.y,
771
+ entry.bounds.width,
772
+ entry.bounds.height,
773
+ ),
774
+ ),
775
+ )
776
+ })
777
+ })
778
+ .collect();
779
+ self.inner.populate_dom_cache(cache_items);
780
+
781
+ Ok(crate::types::BrowserDomResult {
782
+ elements,
783
+ formatted: if format_output {
784
+ Some(formatted_lines.join("\n"))
785
+ } else {
786
+ None
787
+ },
788
+ index_to_bounds,
789
+ element_count: raw_elements.len() as u32,
790
+ page_url,
791
+ page_title,
792
+ })
793
+ }
794
+
795
+ /// (async) Get a clustered tree combining elements from multiple sources grouped by spatial proximity.
796
+ ///
797
+ /// Combines accessibility tree (UIA) elements with optional DOM, Omniparser, and Gemini Vision elements,
798
+ /// clustering nearby elements together. Each element is prefixed with its source:
799
+ /// - #u1, #u2... for UIA (accessibility tree)
800
+ /// - #d1, #d2... for DOM (browser content)
801
+ /// - #p1, #p2... for Omniparser (vision AI detection)
802
+ /// - #g1, #g2... for Gemini Vision (AI element detection)
803
+ ///
804
+ /// @param {string} process - Process name to match (e.g., 'chrome', 'notepad').
805
+ /// @param {number} [maxDomElements=100] - Maximum DOM elements to capture for browsers.
806
+ /// @param {boolean} [includeOmniparser=false] - Whether to include Omniparser vision detection.
807
+ /// @param {boolean} [includeGeminiVision=false] - Whether to include Gemini Vision AI detection.
808
+ /// @returns {Promise<ClusteredFormattingResult>} Clustered tree with prefixed indices.
809
+ #[napi]
810
+ pub async fn get_clustered_tree(
811
+ &self,
812
+ process: String,
813
+ max_dom_elements: Option<u32>,
814
+ include_omniparser: Option<bool>,
815
+ include_gemini_vision: Option<bool>,
816
+ ) -> napi::Result<crate::types::ClusteredFormattingResult> {
817
+ use std::collections::HashMap;
818
+
819
+ // Find PID for the process name
820
+ let pid = find_pid_for_process(&self.inner, &process)?;
821
+
822
+ let max_dom_elements = max_dom_elements.unwrap_or(100);
823
+ let include_omniparser = include_omniparser.unwrap_or(false);
824
+ let include_gemini_vision = include_gemini_vision.unwrap_or(false);
825
+
826
+ // Get UIA tree with bounds
827
+ let uia_result = self
828
+ .inner
829
+ .get_window_tree_result(pid, None, None)
830
+ .map_err(map_error)?;
831
+
832
+ // Build UIA bounds cache: HashMap<u32, (role, name, bounds, selector)>
833
+ #[allow(clippy::type_complexity)]
834
+ let mut uia_bounds: HashMap<
835
+ u32,
836
+ (String, String, (f64, f64, f64, f64), Option<String>),
837
+ > = HashMap::new();
838
+
839
+ // Use the formatted result to extract bounds
840
+ let formatted_result = computeruse::format_ui_node_as_compact_yaml(&uia_result.tree, 0);
841
+ for (idx, (role, name, bounds, selector)) in formatted_result.index_to_bounds {
842
+ uia_bounds.insert(idx, (role, name, bounds, selector));
843
+ }
844
+
845
+ // Check if this is a browser
846
+ let is_browser = computeruse::is_browser_process(pid);
847
+
848
+ // Build DOM bounds cache: HashMap<u32, (tag, identifier, bounds)>
849
+ #[allow(clippy::type_complexity)]
850
+ let mut dom_bounds: HashMap<u32, (String, String, (f64, f64, f64, f64))> = HashMap::new();
851
+
852
+ if is_browser {
853
+ // Try to capture DOM elements
854
+ match self
855
+ .capture_browser_dom(Some(max_dom_elements), Some(true))
856
+ .await
857
+ {
858
+ Ok(dom_result) => {
859
+ for (idx_str, entry) in dom_result.index_to_bounds {
860
+ if let Ok(idx) = idx_str.parse::<u32>() {
861
+ let bounds = (
862
+ entry.bounds.x,
863
+ entry.bounds.y,
864
+ entry.bounds.width,
865
+ entry.bounds.height,
866
+ );
867
+ dom_bounds.insert(idx, (entry.tag, entry.name, bounds));
868
+ }
869
+ }
870
+ }
871
+ Err(_) => {
872
+ // DOM capture failed (e.g., chrome:// page), continue with UIA only
873
+ }
874
+ }
875
+ }
876
+
877
+ // Build Omniparser items cache if requested
878
+ let mut omniparser_items: HashMap<u32, computeruse::OmniparserItem> = HashMap::new();
879
+
880
+ if include_omniparser {
881
+ match self
882
+ .perform_omniparser_for_process(process.clone(), None, Some(true))
883
+ .await
884
+ {
885
+ Ok(omni_result) => {
886
+ for (idx_str, entry) in omni_result.index_to_bounds {
887
+ if let Ok(idx) = idx_str.parse::<u32>() {
888
+ omniparser_items.insert(
889
+ idx,
890
+ computeruse::OmniparserItem {
891
+ label: entry.label.clone(),
892
+ content: Some(entry.name.clone()),
893
+ box_2d: Some([
894
+ entry.bounds.x,
895
+ entry.bounds.y,
896
+ entry.bounds.x + entry.bounds.width,
897
+ entry.bounds.y + entry.bounds.height,
898
+ ]),
899
+ },
900
+ );
901
+ }
902
+ }
903
+ }
904
+ Err(_) => {
905
+ // Omniparser failed, continue without it
906
+ }
907
+ }
908
+ }
909
+
910
+ // Build Gemini Vision items cache if requested
911
+ let mut vision_items: HashMap<u32, computeruse::VisionElement> = HashMap::new();
912
+
913
+ if include_gemini_vision {
914
+ match self
915
+ .perform_gemini_vision_for_process(process.clone(), Some(true))
916
+ .await
917
+ {
918
+ Ok(vision_result) => {
919
+ for (idx_str, entry) in vision_result.index_to_bounds {
920
+ if let Ok(idx) = idx_str.parse::<u32>() {
921
+ vision_items.insert(
922
+ idx,
923
+ computeruse::VisionElement {
924
+ element_type: entry.element_type.clone(),
925
+ content: Some(entry.name.clone()),
926
+ description: None,
927
+ box_2d: Some([
928
+ entry.bounds.x,
929
+ entry.bounds.y,
930
+ entry.bounds.x + entry.bounds.width,
931
+ entry.bounds.y + entry.bounds.height,
932
+ ]),
933
+ interactivity: None,
934
+ },
935
+ );
936
+ }
937
+ }
938
+ }
939
+ Err(_) => {
940
+ // Gemini Vision failed, continue without it
941
+ }
942
+ }
943
+ }
944
+
945
+ // Empty cache for OCR (not implemented yet)
946
+ #[allow(clippy::type_complexity)]
947
+ let ocr_bounds: HashMap<u32, (String, (f64, f64, f64, f64))> = HashMap::new();
948
+
949
+ // Call the core clustering function
950
+ let clustered_result = computeruse::format_clustered_tree_from_caches(
951
+ &uia_bounds,
952
+ &dom_bounds,
953
+ &ocr_bounds,
954
+ &omniparser_items,
955
+ &vision_items,
956
+ );
957
+
958
+ // Convert to SDK types
959
+ let mut index_to_source_and_bounds: HashMap<String, crate::types::ClusteredBoundsEntry> =
960
+ HashMap::new();
961
+
962
+ for (key, (source, original_idx, (x, y, w, h))) in
963
+ clustered_result.index_to_source_and_bounds
964
+ {
965
+ let sdk_source = match source {
966
+ computeruse::ElementSource::Uia => crate::types::ElementSource::Uia,
967
+ computeruse::ElementSource::Dom => crate::types::ElementSource::Dom,
968
+ computeruse::ElementSource::Ocr => crate::types::ElementSource::Ocr,
969
+ computeruse::ElementSource::Omniparser => crate::types::ElementSource::Omniparser,
970
+ computeruse::ElementSource::Gemini => crate::types::ElementSource::Gemini,
971
+ };
972
+ index_to_source_and_bounds.insert(
973
+ key,
974
+ crate::types::ClusteredBoundsEntry {
975
+ source: sdk_source,
976
+ original_index: original_idx,
977
+ bounds: crate::types::Bounds {
978
+ x,
979
+ y,
980
+ width: w,
981
+ height: h,
982
+ },
983
+ },
984
+ );
985
+ }
986
+
987
+ Ok(crate::types::ClusteredFormattingResult {
988
+ formatted: clustered_result.formatted,
989
+ index_to_source_and_bounds,
990
+ })
991
+ }
992
+
993
+ /// (async) Perform Gemini vision AI detection on a window by process name.
994
+ ///
995
+ /// Captures a screenshot and sends it to the Gemini vision backend for UI element detection.
996
+ /// Requires GEMINI_VISION_BACKEND_URL environment variable (defaults to https://app.mediar.ai/api/vision/parse).
997
+ ///
998
+ /// @param {string} process - Process name to match (e.g., 'chrome', 'notepad').
999
+ /// @param {boolean} [formatOutput=true] - Whether to include formatted compact YAML output.
1000
+ /// @returns {Promise<GeminiVisionResult>} Detected UI elements with bounds for click targeting.
1001
+ #[napi]
1002
+ pub async fn perform_gemini_vision_for_process(
1003
+ &self,
1004
+ process: String,
1005
+ format_output: Option<bool>,
1006
+ ) -> napi::Result<crate::types::GeminiVisionResult> {
1007
+ use base64::{engine::general_purpose, Engine};
1008
+ use image::imageops::FilterType;
1009
+ use image::{codecs::png::PngEncoder, ExtendedColorType, ImageBuffer, ImageEncoder, Rgba};
1010
+ use std::collections::HashMap;
1011
+ use std::io::Cursor;
1012
+
1013
+ let format_output = format_output.unwrap_or(true);
1014
+
1015
+ // Find PID for the process name
1016
+ let pid = find_pid_for_process(&self.inner, &process)?;
1017
+
1018
+ // Find the window element for this process
1019
+ let apps = self.inner.applications().map_err(map_error)?;
1020
+ let window_element = apps
1021
+ .into_iter()
1022
+ .find(|app| app.process_id().ok() == Some(pid))
1023
+ .ok_or_else(|| {
1024
+ napi::Error::from_reason(format!("No window found for process '{}'", process))
1025
+ })?;
1026
+
1027
+ // Get window bounds
1028
+ let bounds = window_element.bounds().map_err(map_error)?;
1029
+ let (window_x, window_y, win_w, win_h) = bounds;
1030
+
1031
+ // Capture screenshot
1032
+ let screenshot = window_element.capture().map_err(map_error)?;
1033
+ let original_width = screenshot.width;
1034
+ let original_height = screenshot.height;
1035
+
1036
+ // Calculate DPI scale
1037
+ let dpi_scale_w = original_width as f64 / win_w;
1038
+ let dpi_scale_h = original_height as f64 / win_h;
1039
+
1040
+ // Convert BGRA to RGBA
1041
+ let rgba_data: Vec<u8> = screenshot
1042
+ .image_data
1043
+ .chunks_exact(4)
1044
+ .flat_map(|bgra| [bgra[2], bgra[1], bgra[0], bgra[3]])
1045
+ .collect();
1046
+
1047
+ // Resize if needed (max 1920px)
1048
+ const MAX_DIM: u32 = 1920;
1049
+ let (final_width, final_height, final_rgba_data, scale_factor) = if original_width > MAX_DIM
1050
+ || original_height > MAX_DIM
1051
+ {
1052
+ let scale = (MAX_DIM as f32 / original_width.max(original_height) as f32).min(1.0);
1053
+ let new_width = (original_width as f32 * scale).round() as u32;
1054
+ let new_height = (original_height as f32 * scale).round() as u32;
1055
+
1056
+ let img =
1057
+ ImageBuffer::<Rgba<u8>, _>::from_raw(original_width, original_height, rgba_data)
1058
+ .ok_or_else(|| napi::Error::from_reason("Failed to create image buffer"))?;
1059
+
1060
+ let resized =
1061
+ image::imageops::resize(&img, new_width, new_height, FilterType::Lanczos3);
1062
+ (new_width, new_height, resized.into_raw(), scale as f64)
1063
+ } else {
1064
+ (original_width, original_height, rgba_data, 1.0)
1065
+ };
1066
+
1067
+ // Encode to PNG
1068
+ let mut png_data = Vec::new();
1069
+ let encoder = PngEncoder::new(Cursor::new(&mut png_data));
1070
+ encoder
1071
+ .write_image(
1072
+ &final_rgba_data,
1073
+ final_width,
1074
+ final_height,
1075
+ ExtendedColorType::Rgba8,
1076
+ )
1077
+ .map_err(|e| napi::Error::from_reason(format!("Failed to encode PNG: {e}")))?;
1078
+
1079
+ let base64_image = general_purpose::STANDARD.encode(&png_data);
1080
+
1081
+ // Call Gemini Vision backend
1082
+ let backend_url = std::env::var("GEMINI_VISION_BACKEND_URL")
1083
+ .unwrap_or_else(|_| "https://app.mediar.ai/api/vision/parse".to_string());
1084
+
1085
+ let client = reqwest::Client::builder()
1086
+ .timeout(std::time::Duration::from_secs(300))
1087
+ .build()
1088
+ .map_err(|e| napi::Error::from_reason(format!("Failed to create HTTP client: {e}")))?;
1089
+
1090
+ let payload = serde_json::json!({
1091
+ "image": base64_image,
1092
+ "model": "gemini",
1093
+ "prompt": "Detect all UI elements in this screenshot. Return their type, content, description, bounding boxes, and interactivity."
1094
+ });
1095
+
1096
+ let resp = client
1097
+ .post(&backend_url)
1098
+ .header("Content-Type", "application/json")
1099
+ .json(&payload)
1100
+ .send()
1101
+ .await
1102
+ .map_err(|e| napi::Error::from_reason(format!("Vision backend request failed: {e}")))?;
1103
+
1104
+ if !resp.status().is_success() {
1105
+ let text = resp.text().await.unwrap_or_default();
1106
+ return Err(napi::Error::from_reason(format!(
1107
+ "Vision backend error: {}",
1108
+ text
1109
+ )));
1110
+ }
1111
+
1112
+ let response_text = resp
1113
+ .text()
1114
+ .await
1115
+ .map_err(|e| napi::Error::from_reason(format!("Failed to read response: {e}")))?;
1116
+
1117
+ let parsed: serde_json::Value = serde_json::from_str(&response_text)
1118
+ .map_err(|e| napi::Error::from_reason(format!("Failed to parse response: {e}")))?;
1119
+
1120
+ if let Some(error) = parsed.get("error").and_then(|v| v.as_str()) {
1121
+ return Err(napi::Error::from_reason(format!("Vision error: {}", error)));
1122
+ }
1123
+
1124
+ let raw_elements = parsed
1125
+ .get("elements")
1126
+ .and_then(|v| v.as_array())
1127
+ .cloned()
1128
+ .unwrap_or_default();
1129
+
1130
+ // Convert to VisionElement with absolute screen coordinates
1131
+ let mut elements = Vec::new();
1132
+ let mut index_to_bounds: HashMap<String, crate::types::VisionBoundsEntry> = HashMap::new();
1133
+ let mut formatted_lines: Vec<String> = Vec::new();
1134
+
1135
+ if format_output {
1136
+ formatted_lines.push(format!(
1137
+ "Gemini Vision: {} elements (PID: {})",
1138
+ raw_elements.len(),
1139
+ pid
1140
+ ));
1141
+ }
1142
+
1143
+ let inv_scale = 1.0 / scale_factor;
1144
+
1145
+ for (i, elem) in raw_elements.iter().enumerate() {
1146
+ let idx = i + 1;
1147
+ let element_type = elem
1148
+ .get("type")
1149
+ .and_then(|v| v.as_str())
1150
+ .unwrap_or("unknown")
1151
+ .to_string();
1152
+ let content = elem
1153
+ .get("content")
1154
+ .and_then(|v| v.as_str())
1155
+ .filter(|s| !s.is_empty())
1156
+ .map(String::from);
1157
+ let description = elem
1158
+ .get("description")
1159
+ .and_then(|v| v.as_str())
1160
+ .filter(|s| !s.is_empty())
1161
+ .map(String::from);
1162
+ let interactivity = elem.get("interactivity").and_then(|v| v.as_bool());
1163
+
1164
+ // Get normalized bbox [x1, y1, x2, y2] from 0-1
1165
+ let bbox = elem.get("bbox").and_then(|v| v.as_array());
1166
+ let bounds = bbox.and_then(|arr| {
1167
+ if arr.len() >= 4 {
1168
+ let x1 = arr[0].as_f64()? * final_width as f64;
1169
+ let y1 = arr[1].as_f64()? * final_height as f64;
1170
+ let x2 = arr[2].as_f64()? * final_width as f64;
1171
+ let y2 = arr[3].as_f64()? * final_height as f64;
1172
+
1173
+ // Scale back to original size and convert to logical screen coords
1174
+ let abs_x = window_x + (x1 * inv_scale / dpi_scale_w);
1175
+ let abs_y = window_y + (y1 * inv_scale / dpi_scale_h);
1176
+ let abs_w = (x2 - x1) * inv_scale / dpi_scale_w;
1177
+ let abs_h = (y2 - y1) * inv_scale / dpi_scale_h;
1178
+
1179
+ Some(crate::types::Bounds {
1180
+ x: abs_x,
1181
+ y: abs_y,
1182
+ width: abs_w,
1183
+ height: abs_h,
1184
+ })
1185
+ } else {
1186
+ None
1187
+ }
1188
+ });
1189
+
1190
+ // Display name for index_to_bounds
1191
+ let display_name = content
1192
+ .as_ref()
1193
+ .cloned()
1194
+ .or_else(|| description.clone())
1195
+ .unwrap_or_else(|| format!("<{}>", element_type));
1196
+
1197
+ // Format line for compact YAML
1198
+ if format_output {
1199
+ let mut line_parts = vec![format!("#{} [{}]", idx, element_type.to_uppercase())];
1200
+ if let Some(ref c) = content {
1201
+ let truncated = if c.len() > 40 {
1202
+ format!("{}...", &c[..40])
1203
+ } else {
1204
+ c.clone()
1205
+ };
1206
+ line_parts.push(truncated);
1207
+ }
1208
+ if let Some(ref d) = description {
1209
+ let truncated = if d.len() > 30 {
1210
+ format!("{}...", &d[..30])
1211
+ } else {
1212
+ d.clone()
1213
+ };
1214
+ line_parts.push(format!("desc:{}", truncated));
1215
+ }
1216
+ if interactivity == Some(true) {
1217
+ line_parts.push("interactive".to_string());
1218
+ }
1219
+ formatted_lines.push(format!(" {}", line_parts.join(" ")));
1220
+ }
1221
+
1222
+ if let Some(ref b) = bounds {
1223
+ index_to_bounds.insert(
1224
+ idx.to_string(),
1225
+ crate::types::VisionBoundsEntry {
1226
+ name: display_name.clone(),
1227
+ element_type: element_type.clone(),
1228
+ bounds: b.clone(),
1229
+ },
1230
+ );
1231
+ }
1232
+
1233
+ elements.push(crate::types::VisionElement {
1234
+ element_type,
1235
+ content,
1236
+ description,
1237
+ bounds,
1238
+ interactivity,
1239
+ });
1240
+ }
1241
+
1242
+ // Populate the Vision cache for click_by_index support
1243
+ let cache_items: HashMap<u32, computeruse::VisionElement> = elements
1244
+ .iter()
1245
+ .enumerate()
1246
+ .map(|(i, elem)| {
1247
+ let box_2d = elem
1248
+ .bounds
1249
+ .as_ref()
1250
+ .map(|b| [b.x, b.y, b.x + b.width, b.y + b.height]);
1251
+ (
1252
+ (i + 1) as u32,
1253
+ computeruse::VisionElement {
1254
+ element_type: elem.element_type.clone(),
1255
+ content: elem.content.clone(),
1256
+ description: elem.description.clone(),
1257
+ box_2d,
1258
+ interactivity: elem.interactivity,
1259
+ },
1260
+ )
1261
+ })
1262
+ .collect();
1263
+ self.inner.populate_vision_cache(cache_items);
1264
+
1265
+ Ok(crate::types::GeminiVisionResult {
1266
+ elements,
1267
+ formatted: if format_output {
1268
+ Some(formatted_lines.join("\n"))
1269
+ } else {
1270
+ None
1271
+ },
1272
+ index_to_bounds,
1273
+ element_count: raw_elements.len() as u32,
1274
+ })
1275
+ }
1276
+
1277
+ /// (async) Perform Omniparser V2 detection on a window by process name.
1278
+ ///
1279
+ /// Captures a screenshot and sends it to the Omniparser backend for icon/field detection.
1280
+ /// Requires OMNIPARSER_BACKEND_URL environment variable (defaults to https://app.mediar.ai/api/omniparser/parse).
1281
+ ///
1282
+ /// @param {string} process - Process name to match (e.g., 'chrome', 'notepad').
1283
+ /// @param {number} [imgsz=1920] - Icon detection image size (640-1920). Higher = better but slower.
1284
+ /// @param {boolean} [formatOutput=true] - Whether to include formatted compact YAML output.
1285
+ /// @returns {Promise<OmniparserResult>} Detected items with bounds for click targeting.
1286
+ #[napi]
1287
+ pub async fn perform_omniparser_for_process(
1288
+ &self,
1289
+ process: String,
1290
+ imgsz: Option<u32>,
1291
+ format_output: Option<bool>,
1292
+ ) -> napi::Result<crate::types::OmniparserResult> {
1293
+ use base64::{engine::general_purpose, Engine};
1294
+ use image::imageops::FilterType;
1295
+ use image::{codecs::png::PngEncoder, ExtendedColorType, ImageBuffer, ImageEncoder, Rgba};
1296
+ use std::collections::HashMap;
1297
+ use std::io::Cursor;
1298
+
1299
+ let imgsz = imgsz.unwrap_or(1920).clamp(640, 1920);
1300
+ let format_output = format_output.unwrap_or(true);
1301
+
1302
+ // Find PID for the process name
1303
+ let pid = find_pid_for_process(&self.inner, &process)?;
1304
+
1305
+ // Find the window element for this process
1306
+ let apps = self.inner.applications().map_err(map_error)?;
1307
+ let window_element = apps
1308
+ .into_iter()
1309
+ .find(|app| app.process_id().ok() == Some(pid))
1310
+ .ok_or_else(|| {
1311
+ napi::Error::from_reason(format!("No window found for process '{}'", process))
1312
+ })?;
1313
+
1314
+ // Get window bounds
1315
+ let bounds = window_element.bounds().map_err(map_error)?;
1316
+ let (window_x, window_y, win_w, win_h) = bounds;
1317
+
1318
+ // Capture screenshot
1319
+ let screenshot = window_element.capture().map_err(map_error)?;
1320
+ let original_width = screenshot.width;
1321
+ let original_height = screenshot.height;
1322
+
1323
+ // Calculate DPI scale
1324
+ let dpi_scale_w = original_width as f64 / win_w;
1325
+ let dpi_scale_h = original_height as f64 / win_h;
1326
+
1327
+ // Convert BGRA to RGBA
1328
+ let rgba_data: Vec<u8> = screenshot
1329
+ .image_data
1330
+ .chunks_exact(4)
1331
+ .flat_map(|bgra| [bgra[2], bgra[1], bgra[0], bgra[3]])
1332
+ .collect();
1333
+
1334
+ // Resize if needed (max 1920px)
1335
+ const MAX_DIM: u32 = 1920;
1336
+ let (final_width, final_height, final_rgba_data, scale_factor) = if original_width > MAX_DIM
1337
+ || original_height > MAX_DIM
1338
+ {
1339
+ let scale = (MAX_DIM as f32 / original_width.max(original_height) as f32).min(1.0);
1340
+ let new_width = (original_width as f32 * scale).round() as u32;
1341
+ let new_height = (original_height as f32 * scale).round() as u32;
1342
+
1343
+ let img =
1344
+ ImageBuffer::<Rgba<u8>, _>::from_raw(original_width, original_height, rgba_data)
1345
+ .ok_or_else(|| napi::Error::from_reason("Failed to create image buffer"))?;
1346
+
1347
+ let resized =
1348
+ image::imageops::resize(&img, new_width, new_height, FilterType::Lanczos3);
1349
+ (new_width, new_height, resized.into_raw(), scale as f64)
1350
+ } else {
1351
+ (original_width, original_height, rgba_data, 1.0)
1352
+ };
1353
+
1354
+ // Encode to PNG
1355
+ let mut png_data = Vec::new();
1356
+ let encoder = PngEncoder::new(Cursor::new(&mut png_data));
1357
+ encoder
1358
+ .write_image(
1359
+ &final_rgba_data,
1360
+ final_width,
1361
+ final_height,
1362
+ ExtendedColorType::Rgba8,
1363
+ )
1364
+ .map_err(|e| napi::Error::from_reason(format!("Failed to encode PNG: {e}")))?;
1365
+
1366
+ let base64_image = general_purpose::STANDARD.encode(&png_data);
1367
+
1368
+ // Call Omniparser backend
1369
+ let backend_url = std::env::var("OMNIPARSER_BACKEND_URL")
1370
+ .unwrap_or_else(|_| "https://app.mediar.ai/api/omniparser/parse".to_string());
1371
+
1372
+ let client = reqwest::Client::builder()
1373
+ .timeout(std::time::Duration::from_secs(300))
1374
+ .build()
1375
+ .map_err(|e| napi::Error::from_reason(format!("Failed to create HTTP client: {e}")))?;
1376
+
1377
+ let payload = serde_json::json!({
1378
+ "image": base64_image,
1379
+ "imgsz": imgsz
1380
+ });
1381
+
1382
+ let resp = client
1383
+ .post(&backend_url)
1384
+ .header("Content-Type", "application/json")
1385
+ .json(&payload)
1386
+ .send()
1387
+ .await
1388
+ .map_err(|e| {
1389
+ napi::Error::from_reason(format!("Omniparser backend request failed: {e}"))
1390
+ })?;
1391
+
1392
+ if !resp.status().is_success() {
1393
+ let text = resp.text().await.unwrap_or_default();
1394
+ return Err(napi::Error::from_reason(format!(
1395
+ "Omniparser backend error: {}",
1396
+ text
1397
+ )));
1398
+ }
1399
+
1400
+ let response_text = resp
1401
+ .text()
1402
+ .await
1403
+ .map_err(|e| napi::Error::from_reason(format!("Failed to read response: {e}")))?;
1404
+
1405
+ let parsed: serde_json::Value = serde_json::from_str(&response_text)
1406
+ .map_err(|e| napi::Error::from_reason(format!("Failed to parse response: {e}")))?;
1407
+
1408
+ if let Some(error) = parsed.get("error").and_then(|v| v.as_str()) {
1409
+ return Err(napi::Error::from_reason(format!(
1410
+ "Omniparser error: {}",
1411
+ error
1412
+ )));
1413
+ }
1414
+
1415
+ let raw_elements = parsed
1416
+ .get("elements")
1417
+ .and_then(|v| v.as_array())
1418
+ .cloned()
1419
+ .unwrap_or_default();
1420
+
1421
+ // Convert to OmniparserItem with absolute screen coordinates
1422
+ let mut items = Vec::new();
1423
+ let mut index_to_bounds: HashMap<String, crate::types::OmniparserBoundsEntry> =
1424
+ HashMap::new();
1425
+ let mut formatted_lines: Vec<String> = Vec::new();
1426
+
1427
+ if format_output {
1428
+ formatted_lines.push(format!(
1429
+ "Omniparser: {} items (PID: {})",
1430
+ raw_elements.len(),
1431
+ pid
1432
+ ));
1433
+ }
1434
+
1435
+ let inv_scale = 1.0 / scale_factor;
1436
+
1437
+ for (i, elem) in raw_elements.iter().enumerate() {
1438
+ let idx = i + 1;
1439
+ let label = elem
1440
+ .get("type")
1441
+ .and_then(|v| v.as_str())
1442
+ .unwrap_or("unknown")
1443
+ .to_string();
1444
+ let content = elem
1445
+ .get("content")
1446
+ .and_then(|v| v.as_str())
1447
+ .filter(|s| !s.is_empty())
1448
+ .map(String::from);
1449
+
1450
+ // Get normalized bbox [x1, y1, x2, y2] from 0-1
1451
+ let bbox = elem.get("bbox").and_then(|v| v.as_array());
1452
+ let bounds = bbox.and_then(|arr| {
1453
+ if arr.len() >= 4 {
1454
+ let x1 = arr[0].as_f64()? * final_width as f64;
1455
+ let y1 = arr[1].as_f64()? * final_height as f64;
1456
+ let x2 = arr[2].as_f64()? * final_width as f64;
1457
+ let y2 = arr[3].as_f64()? * final_height as f64;
1458
+
1459
+ // Scale back to original size and convert to logical screen coords
1460
+ let abs_x = window_x + (x1 * inv_scale / dpi_scale_w);
1461
+ let abs_y = window_y + (y1 * inv_scale / dpi_scale_h);
1462
+ let abs_w = (x2 - x1) * inv_scale / dpi_scale_w;
1463
+ let abs_h = (y2 - y1) * inv_scale / dpi_scale_h;
1464
+
1465
+ Some(crate::types::Bounds {
1466
+ x: abs_x,
1467
+ y: abs_y,
1468
+ width: abs_w,
1469
+ height: abs_h,
1470
+ })
1471
+ } else {
1472
+ None
1473
+ }
1474
+ });
1475
+
1476
+ // Display name for index_to_bounds
1477
+ let display_name = content
1478
+ .as_ref()
1479
+ .cloned()
1480
+ .unwrap_or_else(|| format!("<{}>", label));
1481
+
1482
+ // Format line for compact YAML
1483
+ if format_output {
1484
+ let mut line_parts = vec![format!("#{} [{}]", idx, label.to_uppercase())];
1485
+ if let Some(ref c) = content {
1486
+ let truncated = if c.len() > 50 {
1487
+ format!("{}...", &c[..50])
1488
+ } else {
1489
+ c.clone()
1490
+ };
1491
+ line_parts.push(truncated);
1492
+ }
1493
+ formatted_lines.push(format!(" {}", line_parts.join(" ")));
1494
+ }
1495
+
1496
+ if let Some(ref b) = bounds {
1497
+ index_to_bounds.insert(
1498
+ idx.to_string(),
1499
+ crate::types::OmniparserBoundsEntry {
1500
+ name: display_name.clone(),
1501
+ label: label.clone(),
1502
+ bounds: b.clone(),
1503
+ },
1504
+ );
1505
+ }
1506
+
1507
+ items.push(crate::types::OmniparserItem {
1508
+ label,
1509
+ content,
1510
+ bounds,
1511
+ });
1512
+ }
1513
+
1514
+ // Populate the Omniparser cache for click_by_index support
1515
+ let cache_items: HashMap<u32, computeruse::OmniparserItem> = items
1516
+ .iter()
1517
+ .enumerate()
1518
+ .map(|(i, item)| {
1519
+ let box_2d = item
1520
+ .bounds
1521
+ .as_ref()
1522
+ .map(|b| [b.x, b.y, b.x + b.width, b.y + b.height]);
1523
+ (
1524
+ (i + 1) as u32,
1525
+ computeruse::OmniparserItem {
1526
+ label: item.label.clone(),
1527
+ content: item.content.clone(),
1528
+ box_2d,
1529
+ },
1530
+ )
1531
+ })
1532
+ .collect();
1533
+ self.inner.populate_omniparser_cache(cache_items);
1534
+
1535
+ Ok(crate::types::OmniparserResult {
1536
+ items,
1537
+ formatted: if format_output {
1538
+ Some(formatted_lines.join("\n"))
1539
+ } else {
1540
+ None
1541
+ },
1542
+ index_to_bounds,
1543
+ item_count: raw_elements.len() as u32,
1544
+ })
1545
+ }
1546
+
1547
+ /// (async) Get the currently focused browser window.
1548
+ ///
1549
+ /// @returns {Promise<Element>} The current browser window element.
1550
+ #[napi]
1551
+ pub async fn get_current_browser_window(&self) -> napi::Result<Element> {
1552
+ self.inner
1553
+ .get_current_browser_window()
1554
+ .await
1555
+ .map(Element::from)
1556
+ .map_err(map_error)
1557
+ }
1558
+
1559
+ /// Create a locator for finding UI elements.
1560
+ ///
1561
+ /// @param {string | Selector} selector - The selector.
1562
+ /// @returns {Locator} A locator for finding elements.
1563
+ #[napi]
1564
+ pub fn locator(
1565
+ &self,
1566
+ #[napi(ts_arg_type = "string | Selector")] selector: Either<String, &Selector>,
1567
+ ) -> napi::Result<Locator> {
1568
+ use napi::bindgen_prelude::Either::*;
1569
+ let sel_rust: computeruse::selector::Selector = match selector {
1570
+ A(sel_str) => sel_str.as_str().into(),
1571
+ B(sel_obj) => sel_obj.inner.clone(),
1572
+ };
1573
+ let loc = self.inner.locator(sel_rust);
1574
+ Ok(Locator::from(loc))
1575
+ }
1576
+
1577
+ /// Create a process-scoped locator for finding UI elements.
1578
+ /// This is the recommended way to create locators - always scope to a specific process.
1579
+ ///
1580
+ /// @param {string} process - Process name to scope the search (e.g., 'chrome', 'notepad').
1581
+ /// @param {string | Selector} selector - The selector to find within the process.
1582
+ /// @param {string} [windowSelector] - Optional window selector for additional filtering.
1583
+ /// @returns {Locator} A locator for finding elements within the process.
1584
+ #[napi]
1585
+ pub fn locator_for_process(
1586
+ &self,
1587
+ process: String,
1588
+ #[napi(ts_arg_type = "string | Selector")] selector: Either<String, &Selector>,
1589
+ window_selector: Option<String>,
1590
+ ) -> napi::Result<Locator> {
1591
+ use napi::bindgen_prelude::Either::*;
1592
+
1593
+ // Build the full selector string like MCP does
1594
+ let selector_str = match &selector {
1595
+ A(sel_str) => sel_str.clone(),
1596
+ B(sel_obj) => format!("{:?}", sel_obj.inner),
1597
+ };
1598
+
1599
+ let full_selector = if selector_str.is_empty() {
1600
+ if let Some(window_sel) = window_selector {
1601
+ format!("process:{} >> {}", process, window_sel)
1602
+ } else {
1603
+ format!("process:{}", process)
1604
+ }
1605
+ } else if let Some(window_sel) = window_selector {
1606
+ format!("process:{} >> {} >> {}", process, window_sel, selector_str)
1607
+ } else {
1608
+ format!("process:{} >> {}", process, selector_str)
1609
+ };
1610
+
1611
+ let sel_rust: computeruse::selector::Selector = full_selector.as_str().into();
1612
+ let loc = self.inner.locator(sel_rust);
1613
+ Ok(Locator::from(loc))
1614
+ }
1615
+
1616
+ /// (async) Get the currently focused window.
1617
+ ///
1618
+ /// @returns {Promise<Element>} The current window element.
1619
+ #[napi]
1620
+ pub async fn get_current_window(&self) -> napi::Result<Element> {
1621
+ self.inner
1622
+ .get_current_window()
1623
+ .await
1624
+ .map(Element::from)
1625
+ .map_err(map_error)
1626
+ }
1627
+
1628
+ /// (async) Get the currently focused application.
1629
+ ///
1630
+ /// @returns {Promise<Element>} The current application element.
1631
+ #[napi]
1632
+ pub async fn get_current_application(&self) -> napi::Result<Element> {
1633
+ self.inner
1634
+ .get_current_application()
1635
+ .await
1636
+ .map(Element::from)
1637
+ .map_err(map_error)
1638
+ }
1639
+
1640
+ /// Get the currently focused element.
1641
+ ///
1642
+ /// @returns {Element} The focused element.
1643
+ #[napi]
1644
+ pub fn focused_element(&self) -> napi::Result<Element> {
1645
+ self.inner
1646
+ .focused_element()
1647
+ .map(Element::from)
1648
+ .map_err(map_error)
1649
+ }
1650
+
1651
+ /// Open a URL in a browser.
1652
+ ///
1653
+ /// @param {string} url - The URL to open.
1654
+ /// @param {string} [browser] - The browser to use. Can be "Default", "Chrome", "Firefox", "Edge", "Brave", "Opera", "Vivaldi", or a custom browser path.
1655
+ /// @param {boolean} [includeWindowScreenshot=true] - Whether to capture window screenshot after opening
1656
+ /// @param {boolean} [includeMonitorScreenshots=false] - Whether to capture monitor screenshots after opening
1657
+ #[napi]
1658
+ pub fn open_url(
1659
+ &self,
1660
+ url: String,
1661
+ browser: Option<String>,
1662
+ include_window_screenshot: Option<bool>,
1663
+ include_monitor_screenshots: Option<bool>,
1664
+ ) -> napi::Result<Element> {
1665
+ let browser_enum = browser.map(|b| match b.to_lowercase().as_str() {
1666
+ "default" => computeruse::Browser::Default,
1667
+ "chrome" => computeruse::Browser::Chrome,
1668
+ "firefox" => computeruse::Browser::Firefox,
1669
+ "edge" => computeruse::Browser::Edge,
1670
+ "brave" => computeruse::Browser::Brave,
1671
+ "opera" => computeruse::Browser::Opera,
1672
+ "vivaldi" => computeruse::Browser::Vivaldi,
1673
+ custom => computeruse::Browser::Custom(custom.to_string()),
1674
+ });
1675
+ let element = self.inner.open_url(&url, browser_enum).map_err(map_error)?;
1676
+
1677
+ // Capture screenshots if enabled (window default: true, monitor default: false)
1678
+ let _screenshots = capture_screenshots(
1679
+ &self.inner,
1680
+ element.process_id().ok(),
1681
+ include_window_screenshot.unwrap_or(true),
1682
+ include_monitor_screenshots.unwrap_or(false),
1683
+ "openUrl",
1684
+ );
1685
+
1686
+ Ok(Element::from(element))
1687
+ }
1688
+
1689
+ /// Open a file with its default application.
1690
+ ///
1691
+ /// @param {string} filePath - Path to the file to open.
1692
+ /// @param {string} [process] - Process name for window screenshot capture. If provided, enables window screenshots.
1693
+ /// @param {boolean} [includeWindowScreenshot=true] - Whether to capture window screenshot (requires process).
1694
+ /// @param {boolean} [includeMonitorScreenshots=false] - Whether to capture monitor screenshots after opening.
1695
+ #[napi]
1696
+ pub fn open_file(
1697
+ &self,
1698
+ file_path: String,
1699
+ process: Option<String>,
1700
+ include_window_screenshot: Option<bool>,
1701
+ include_monitor_screenshots: Option<bool>,
1702
+ ) -> napi::Result<()> {
1703
+ let result = self.inner.open_file(&file_path).map_err(map_error);
1704
+
1705
+ // Get PID from process name if provided
1706
+ let pid = process
1707
+ .as_ref()
1708
+ .and_then(|p| find_pid_for_process(&self.inner, p).ok());
1709
+
1710
+ // Capture screenshots if requested
1711
+ let _screenshots = capture_screenshots(
1712
+ &self.inner,
1713
+ pid,
1714
+ include_window_screenshot.unwrap_or(true) && pid.is_some(),
1715
+ include_monitor_screenshots.unwrap_or(false),
1716
+ "openFile",
1717
+ );
1718
+
1719
+ result
1720
+ }
1721
+
1722
+ /// Activate a browser window by title.
1723
+ ///
1724
+ /// @param {string} title - The window title to match.
1725
+ #[napi]
1726
+ pub fn activate_browser_window_by_title(&self, title: String) -> napi::Result<()> {
1727
+ self.inner
1728
+ .activate_browser_window_by_title(&title)
1729
+ .map_err(map_error)
1730
+ }
1731
+
1732
+ /// Get the UI tree for a window identified by process name and optional title.
1733
+ ///
1734
+ /// @param {string} process - Process name to match (e.g., 'chrome', 'notepad').
1735
+ /// @param {string} [title] - Optional window title filter.
1736
+ /// @param {TreeBuildConfig} [config] - Optional configuration for tree building.
1737
+ /// @returns {UINode} Complete UI tree starting from the identified window.
1738
+ #[napi]
1739
+ pub fn get_window_tree(
1740
+ &self,
1741
+ process: String,
1742
+ title: Option<String>,
1743
+ config: Option<TreeBuildConfig>,
1744
+ ) -> napi::Result<UINode> {
1745
+ // Find PID for the process name
1746
+ let pid = find_pid_for_process(&self.inner, &process)?;
1747
+
1748
+ let rust_config = config.map(|c| c.into());
1749
+ self.inner
1750
+ .get_window_tree(pid, title.as_deref(), rust_config)
1751
+ .map(UINode::from)
1752
+ .map_err(map_error)
1753
+ }
1754
+
1755
    /// Get the UI tree with full result including formatting and bounds mapping.
    ///
    /// This is the recommended method for getting window trees when you need:
    /// - Formatted YAML output for LLM consumption
    /// - Index-to-bounds mapping for click targeting
    /// - Browser detection
    ///
    /// @param {string} process - Process name to match (e.g., 'chrome', 'notepad').
    /// @param {string} [title] - Optional window title filter.
    /// @param {TreeBuildConfig} [config] - Configuration options:
    ///   - formatOutput: Enable formatted output (default: true if treeOutputFormat set)
    ///   - treeOutputFormat: 'CompactYaml' (default) or 'VerboseJson'
    ///   - treeFromSelector: Selector to start tree from (use getWindowTreeResultAsync for this)
    ///   - includeWindowScreenshot: Save window screenshot to executions dir (default: false)
    ///   - includeMonitorScreenshots: Save all monitor screenshots to executions dir (default: false)
    /// @returns {WindowTreeResult} Complete result with tree, formatted output, bounds mapping, and screenshot paths.
    #[napi]
    pub fn get_window_tree_result(
        &self,
        process: String,
        title: Option<String>,
        config: Option<TreeBuildConfig>,
    ) -> napi::Result<WindowTreeResult> {
        // Resolve the process name to a PID (fails if no matching process)
        let pid = find_pid_for_process(&self.inner, &process)?;

        // Extract screenshot options before `config` is consumed below
        // (window: true, monitor: false by default)
        let include_window_screenshot = config
            .as_ref()
            .and_then(|c| c.include_window_screenshot)
            .unwrap_or(true);
        let include_monitor_screenshots = config
            .as_ref()
            .and_then(|c| c.include_monitor_screenshots)
            .unwrap_or(false);

        // Extract the requested output format before converting config
        let output_format = config
            .as_ref()
            .and_then(|c| c.tree_output_format)
            .unwrap_or(TreeOutputFormat::CompactYaml);

        // If format is VerboseJson, the tree is serialized here instead, so
        // we don't need formatted output from core. ClusteredYaml is treated
        // like CompactYaml (needs format_output = true).
        // Note: this mutation only applies when `config` is Some; with no
        // config, the core receives None and uses its own defaults.
        let rust_config = config.map(|mut c| {
            if matches!(output_format, TreeOutputFormat::VerboseJson) {
                c.format_output = Some(false);
            } else if c.format_output.is_none() {
                c.format_output = Some(true);
            }
            c.into()
        });

        let result = self
            .inner
            .get_window_tree_result(pid, title.as_deref(), rust_config)
            .map_err(map_error)?;

        // Convert the core result into SDK types
        let mut sdk_result = WindowTreeResult::from(result);

        // For VerboseJson, serialize the tree as the formatted output
        // (serialization failure degrades to an empty string rather than an error)
        if matches!(output_format, TreeOutputFormat::VerboseJson) {
            sdk_result.formatted =
                Some(serde_json::to_string_pretty(&sdk_result.tree).unwrap_or_default());
        }

        // Handle screenshot capture and saving using the shared helper;
        // the resulting file paths are attached to the SDK result.
        let screenshots = capture_screenshots(
            &self.inner,
            Some(pid),
            include_window_screenshot,
            include_monitor_screenshots,
            "getWindowTreeResult",
        );
        sdk_result.window_screenshot_path = screenshots.window_path;
        sdk_result.monitor_screenshot_paths = screenshots.monitor_paths;

        Ok(sdk_result)
    }
1835
+
1836
+ /// (async) Get the UI tree with full result, supporting tree_from_selector.
1837
+ ///
1838
+ /// Use this method when you need to scope the tree to a specific subtree using a selector.
1839
+ ///
1840
+ /// @param {string} process - Process name to match (e.g., 'chrome', 'notepad').
1841
+ /// @param {string} [title] - Optional window title filter.
1842
+ /// @param {TreeBuildConfig} [config] - Configuration options:
1843
+ /// - formatOutput: Enable formatted output (default: true)
1844
+ /// - treeOutputFormat: 'CompactYaml' (default) or 'VerboseJson'
1845
+ /// - treeFromSelector: Selector to start tree from (e.g., "role:Dialog")
1846
+ /// @returns {Promise<WindowTreeResult>} Complete result with tree, formatted output, and bounds mapping.
1847
+ #[napi]
1848
+ pub async fn get_window_tree_result_async(
1849
+ &self,
1850
+ process: String,
1851
+ title: Option<String>,
1852
+ config: Option<TreeBuildConfig>,
1853
+ ) -> napi::Result<WindowTreeResult> {
1854
+ use std::collections::HashMap;
1855
+
1856
+ // Find PID for the process name
1857
+ let pid = find_pid_for_process(&self.inner, &process)?;
1858
+
1859
+ // Extract vision and format options from config
1860
+ let include_gemini_vision = config
1861
+ .as_ref()
1862
+ .and_then(|c| c.include_gemini_vision)
1863
+ .unwrap_or(false);
1864
+ let include_omniparser = config
1865
+ .as_ref()
1866
+ .and_then(|c| c.include_omniparser)
1867
+ .unwrap_or(false);
1868
+ let include_ocr = config.as_ref().and_then(|c| c.include_ocr).unwrap_or(false);
1869
+ let include_browser_dom = config
1870
+ .as_ref()
1871
+ .and_then(|c| c.include_browser_dom)
1872
+ .unwrap_or(false);
1873
+ let output_format = config
1874
+ .as_ref()
1875
+ .and_then(|c| c.tree_output_format)
1876
+ .unwrap_or(TreeOutputFormat::CompactYaml);
1877
+
1878
+ let has_vision_options =
1879
+ include_gemini_vision || include_omniparser || include_ocr || include_browser_dom;
1880
+
1881
+ // Build rust config with from_selector passed through
1882
+ let rust_config = config.as_ref().map(|c| {
1883
+ let mut c_clone = TreeBuildConfig {
1884
+ property_mode: c.property_mode,
1885
+ timeout_per_operation_ms: c.timeout_per_operation_ms,
1886
+ yield_every_n_elements: c.yield_every_n_elements,
1887
+ batch_size: c.batch_size,
1888
+ max_depth: c.max_depth,
1889
+ ui_settle_delay_ms: c.ui_settle_delay_ms,
1890
+ format_output: c.format_output,
1891
+ tree_output_format: c.tree_output_format,
1892
+ tree_from_selector: c.tree_from_selector.clone(),
1893
+ include_window_screenshot: c.include_window_screenshot,
1894
+ include_monitor_screenshots: c.include_monitor_screenshots,
1895
+ include_gemini_vision: None,
1896
+ include_omniparser: None,
1897
+ include_ocr: None,
1898
+ include_browser_dom: None,
1899
+ };
1900
+ if matches!(output_format, TreeOutputFormat::VerboseJson) {
1901
+ c_clone.format_output = Some(false);
1902
+ } else if c_clone.format_output.is_none() {
1903
+ c_clone.format_output = Some(true);
1904
+ }
1905
+ c_clone.into()
1906
+ });
1907
+
1908
+ // Get UIA tree (always)
1909
+ let result = self
1910
+ .inner
1911
+ .get_window_tree_result_async(pid, title.as_deref(), rust_config)
1912
+ .await
1913
+ .map_err(map_error)?;
1914
+
1915
+ let mut sdk_result = WindowTreeResult::from(result);
1916
+
1917
+ // If no vision options and not clustered format, return simple result
1918
+ if !has_vision_options && !matches!(output_format, TreeOutputFormat::ClusteredYaml) {
1919
+ if matches!(output_format, TreeOutputFormat::VerboseJson) {
1920
+ sdk_result.formatted =
1921
+ Some(serde_json::to_string_pretty(&sdk_result.tree).unwrap_or_default());
1922
+ }
1923
+ return Ok(sdk_result);
1924
+ }
1925
+
1926
+ // Build UIA bounds cache from formatted result
1927
+ #[allow(clippy::type_complexity)]
1928
+ let mut uia_bounds: HashMap<
1929
+ u32,
1930
+ (String, String, (f64, f64, f64, f64), Option<String>),
1931
+ > = HashMap::new();
1932
+ let uia_tree_result = self
1933
+ .inner
1934
+ .get_window_tree_result(pid, None, None)
1935
+ .map_err(map_error)?;
1936
+ let formatted_result = computeruse::format_ui_node_as_compact_yaml(&uia_tree_result.tree, 0);
1937
+ for (idx, (role, name, bounds, selector)) in formatted_result.index_to_bounds {
1938
+ uia_bounds.insert(idx, (role, name, bounds, selector));
1939
+ }
1940
+
1941
+ // Build DOM bounds cache if requested
1942
+ #[allow(clippy::type_complexity)]
1943
+ let mut dom_bounds: HashMap<u32, (String, String, (f64, f64, f64, f64))> = HashMap::new();
1944
+ if include_browser_dom && computeruse::is_browser_process(pid) {
1945
+ if let Ok(dom_result) = self.capture_browser_dom(Some(100), Some(true)).await {
1946
+ for (idx_str, entry) in dom_result.index_to_bounds {
1947
+ if let Ok(idx) = idx_str.parse::<u32>() {
1948
+ let bounds = (
1949
+ entry.bounds.x,
1950
+ entry.bounds.y,
1951
+ entry.bounds.width,
1952
+ entry.bounds.height,
1953
+ );
1954
+ dom_bounds.insert(idx, (entry.tag, entry.name, bounds));
1955
+ }
1956
+ }
1957
+ }
1958
+ }
1959
+
1960
+ // Build Omniparser cache if requested
1961
+ let mut omniparser_items: HashMap<u32, computeruse::OmniparserItem> = HashMap::new();
1962
+ if include_omniparser {
1963
+ if let Ok(omni_result) = self
1964
+ .perform_omniparser_for_process(process.clone(), None, Some(true))
1965
+ .await
1966
+ {
1967
+ for (idx_str, entry) in omni_result.index_to_bounds {
1968
+ if let Ok(idx) = idx_str.parse::<u32>() {
1969
+ omniparser_items.insert(
1970
+ idx,
1971
+ computeruse::OmniparserItem {
1972
+ label: entry.label.clone(),
1973
+ content: Some(entry.name.clone()),
1974
+ box_2d: Some([
1975
+ entry.bounds.x,
1976
+ entry.bounds.y,
1977
+ entry.bounds.x + entry.bounds.width,
1978
+ entry.bounds.y + entry.bounds.height,
1979
+ ]),
1980
+ },
1981
+ );
1982
+ }
1983
+ }
1984
+ }
1985
+ }
1986
+
1987
+ // Build Gemini Vision cache if requested
1988
+ let mut vision_items: HashMap<u32, computeruse::VisionElement> = HashMap::new();
1989
+ if include_gemini_vision {
1990
+ if let Ok(vision_result) = self
1991
+ .perform_gemini_vision_for_process(process.clone(), Some(true))
1992
+ .await
1993
+ {
1994
+ for (idx_str, entry) in vision_result.index_to_bounds {
1995
+ if let Ok(idx) = idx_str.parse::<u32>() {
1996
+ vision_items.insert(
1997
+ idx,
1998
+ computeruse::VisionElement {
1999
+ element_type: entry.element_type.clone(),
2000
+ content: Some(entry.name.clone()),
2001
+ description: None,
2002
+ box_2d: Some([
2003
+ entry.bounds.x,
2004
+ entry.bounds.y,
2005
+ entry.bounds.x + entry.bounds.width,
2006
+ entry.bounds.y + entry.bounds.height,
2007
+ ]),
2008
+ interactivity: None,
2009
+ },
2010
+ );
2011
+ }
2012
+ }
2013
+ }
2014
+ }
2015
+
2016
+ // Build OCR cache if requested
2017
+ #[allow(clippy::type_complexity)]
2018
+ let mut ocr_bounds: HashMap<u32, (String, (f64, f64, f64, f64))> = HashMap::new();
2019
+ if include_ocr {
2020
+ if let Ok(ocr_result) = self
2021
+ .perform_ocr_for_process(process.clone(), Some(true))
2022
+ .await
2023
+ {
2024
+ for (idx_str, entry) in ocr_result.index_to_bounds {
2025
+ if let Ok(idx) = idx_str.parse::<u32>() {
2026
+ let bounds = (
2027
+ entry.bounds.x,
2028
+ entry.bounds.y,
2029
+ entry.bounds.width,
2030
+ entry.bounds.height,
2031
+ );
2032
+ ocr_bounds.insert(idx, (entry.text.clone(), bounds));
2033
+ }
2034
+ }
2035
+ }
2036
+ }
2037
+
2038
+ // If ClusteredYaml format, use clustering
2039
+ if matches!(output_format, TreeOutputFormat::ClusteredYaml) {
2040
+ let clustered_result = computeruse::format_clustered_tree_from_caches(
2041
+ &uia_bounds,
2042
+ &dom_bounds,
2043
+ &ocr_bounds,
2044
+ &omniparser_items,
2045
+ &vision_items,
2046
+ );
2047
+ sdk_result.formatted = Some(clustered_result.formatted);
2048
+ } else {
2049
+ // CompactYaml with vision - append vision trees to UIA tree
2050
+ let mut combined = sdk_result.formatted.unwrap_or_default();
2051
+ if !dom_bounds.is_empty() {
2052
+ combined.push_str("\n\n# Browser DOM elements:\n");
2053
+ for (idx, (tag, name, _)) in &dom_bounds {
2054
+ combined.push_str(&format!("#d{} [{}] {}\n", idx, tag, name));
2055
+ }
2056
+ }
2057
+ if !omniparser_items.is_empty() {
2058
+ combined.push_str("\n\n# Omniparser elements:\n");
2059
+ for (idx, item) in &omniparser_items {
2060
+ combined.push_str(&format!(
2061
+ "#p{} [{}] {}\n",
2062
+ idx,
2063
+ item.label,
2064
+ item.content.as_deref().unwrap_or("")
2065
+ ));
2066
+ }
2067
+ }
2068
+ if !vision_items.is_empty() {
2069
+ combined.push_str("\n\n# Gemini Vision elements:\n");
2070
+ for (idx, item) in &vision_items {
2071
+ combined.push_str(&format!(
2072
+ "#g{} [{}] {}\n",
2073
+ idx,
2074
+ item.element_type,
2075
+ item.content.as_deref().unwrap_or("")
2076
+ ));
2077
+ }
2078
+ }
2079
+ if !ocr_bounds.is_empty() {
2080
+ combined.push_str("\n\n# OCR elements:\n");
2081
+ for (idx, (text, _)) in &ocr_bounds {
2082
+ combined.push_str(&format!("#o{} {}\n", idx, text));
2083
+ }
2084
+ }
2085
+ sdk_result.formatted = Some(combined);
2086
+ }
2087
+
2088
+ Ok(sdk_result)
2089
+ }
2090
+
2091
+ // ============== NEW MONITOR METHODS ==============
2092
+
2093
+ /// (async) List all available monitors/displays.
2094
+ ///
2095
+ /// @returns {Promise<Array<Monitor>>} List of monitor information.
2096
+ #[napi]
2097
+ pub async fn list_monitors(&self) -> napi::Result<Vec<Monitor>> {
2098
+ self.inner
2099
+ .list_monitors()
2100
+ .await
2101
+ .map(|monitors| monitors.into_iter().map(Monitor::from).collect())
2102
+ .map_err(map_error)
2103
+ }
2104
+
2105
+ /// (async) Get the primary monitor.
2106
+ ///
2107
+ /// @returns {Promise<Monitor>} Primary monitor information.
2108
+ #[napi]
2109
+ pub async fn get_primary_monitor(&self) -> napi::Result<Monitor> {
2110
+ self.inner
2111
+ .get_primary_monitor()
2112
+ .await
2113
+ .map(Monitor::from)
2114
+ .map_err(map_error)
2115
+ }
2116
+
2117
+ /// (async) Get the monitor containing the currently focused window.
2118
+ ///
2119
+ /// @returns {Promise<Monitor>} Active monitor information.
2120
+ #[napi]
2121
+ pub async fn get_active_monitor(&self) -> napi::Result<Monitor> {
2122
+ self.inner
2123
+ .get_active_monitor()
2124
+ .await
2125
+ .map(Monitor::from)
2126
+ .map_err(map_error)
2127
+ }
2128
+
2129
+ /// (async) Get a monitor by its ID.
2130
+ ///
2131
+ /// @param {string} id - The monitor ID to find.
2132
+ /// @returns {Promise<Monitor>} Monitor information.
2133
+ #[napi]
2134
+ pub async fn get_monitor_by_id(&self, id: String) -> napi::Result<Monitor> {
2135
+ self.inner
2136
+ .get_monitor_by_id(&id)
2137
+ .await
2138
+ .map(Monitor::from)
2139
+ .map_err(map_error)
2140
+ }
2141
+
2142
+ /// (async) Get a monitor by its name.
2143
+ ///
2144
+ /// @param {string} name - The monitor name to find.
2145
+ /// @returns {Promise<Monitor>} Monitor information.
2146
+ #[napi]
2147
+ pub async fn get_monitor_by_name(&self, name: String) -> napi::Result<Monitor> {
2148
+ self.inner
2149
+ .get_monitor_by_name(&name)
2150
+ .await
2151
+ .map(Monitor::from)
2152
+ .map_err(map_error)
2153
+ }
2154
+
2155
+ /// (async) Capture a screenshot of a specific monitor.
2156
+ ///
2157
+ /// @param {Monitor} monitor - The monitor to capture.
2158
+ /// @returns {Promise<ScreenshotResult>} The screenshot data.
2159
+ #[napi]
2160
+ pub async fn capture_monitor(&self, monitor: Monitor) -> napi::Result<ScreenshotResult> {
2161
+ let rust_monitor = computeruse::Monitor {
2162
+ id: monitor.id,
2163
+ name: monitor.name,
2164
+ is_primary: monitor.is_primary,
2165
+ width: monitor.width,
2166
+ height: monitor.height,
2167
+ x: monitor.x,
2168
+ y: monitor.y,
2169
+ scale_factor: monitor.scale_factor,
2170
+ work_area: None,
2171
+ };
2172
+ self.inner
2173
+ .capture_monitor(&rust_monitor)
2174
+ .await
2175
+ .map(|r| ScreenshotResult {
2176
+ width: r.width,
2177
+ height: r.height,
2178
+ image_data: r.image_data,
2179
+ monitor: r.monitor.map(Monitor::from),
2180
+ })
2181
+ .map_err(map_error)
2182
+ }
2183
+
2184
+ /// (async) Capture screenshots of all monitors.
2185
+ ///
2186
+ /// @returns {Promise<Array<{monitor: Monitor, screenshot: ScreenshotResult}>>} Array of monitor and screenshot pairs.
2187
+ #[napi]
2188
+ pub async fn capture_all_monitors(&self) -> napi::Result<Vec<MonitorScreenshotPair>> {
2189
+ self.inner
2190
+ .capture_all_monitors()
2191
+ .await
2192
+ .map(|results| {
2193
+ results
2194
+ .into_iter()
2195
+ .map(|(monitor, screenshot)| MonitorScreenshotPair {
2196
+ monitor: Monitor::from(monitor),
2197
+ screenshot: ScreenshotResult {
2198
+ width: screenshot.width,
2199
+ height: screenshot.height,
2200
+ image_data: screenshot.image_data,
2201
+ monitor: screenshot.monitor.map(Monitor::from),
2202
+ },
2203
+ })
2204
+ .collect()
2205
+ })
2206
+ .map_err(map_error)
2207
+ }
2208
+
2209
+ /// Capture a screenshot of a window by process name.
2210
+ ///
2211
+ /// Finds the first window matching the given process name and captures its screenshot.
2212
+ /// Process name matching is case-insensitive and uses substring matching.
2213
+ ///
2214
+ /// @param {string} process - Process name to match (e.g., "chrome", "notepad", "code")
2215
+ /// @returns {ScreenshotResult} The screenshot data.
2216
+ #[napi(js_name = "captureWindowByProcess")]
2217
+ pub fn capture_window_by_process(&self, process: String) -> napi::Result<ScreenshotResult> {
2218
+ self.inner
2219
+ .capture_window_by_process(&process)
2220
+ .map(|r| ScreenshotResult {
2221
+ width: r.width,
2222
+ height: r.height,
2223
+ image_data: r.image_data,
2224
+ monitor: r.monitor.map(Monitor::from),
2225
+ })
2226
+ .map_err(map_error)
2227
+ }
2228
+
2229
+ /// (async) Captures a screenshot. Three modes:
2230
+ /// 1. Element mode: provide process + selector to capture specific element
2231
+ /// 2. Window mode: provide process only to capture entire window
2232
+ /// 3. Monitor mode: provide process + entireMonitor=true to capture the monitor where the window is located
2233
+ ///
2234
+ /// @param {string} process - Process name to match (e.g., "chrome", "notepad", "code")
2235
+ /// @param {string} [selector] - Optional selector to capture a specific element within the process
2236
+ /// @param {boolean} [entireMonitor=false] - If true, captures the entire monitor containing the window
2237
+ /// @param {number} [timeoutMs=10000] - Timeout in milliseconds for finding the element
2238
+ /// @returns {Promise<ScreenshotResult>} The screenshot data.
2239
+ #[napi(js_name = "captureScreenshot")]
2240
+ pub async fn capture_screenshot(
2241
+ &self,
2242
+ process: String,
2243
+ selector: Option<String>,
2244
+ entire_monitor: Option<bool>,
2245
+ timeout_ms: Option<f64>,
2246
+ ) -> napi::Result<ScreenshotResult> {
2247
+ use std::time::Duration;
2248
+
2249
+ let entire_monitor = entire_monitor.unwrap_or(false);
2250
+ let timeout = Duration::from_millis(timeout_ms.unwrap_or(10000.0) as u64);
2251
+
2252
+ // Build the full selector string like MCP does
2253
+ let full_selector = if let Some(sel) = &selector {
2254
+ if sel.is_empty() {
2255
+ format!("process:{}", process)
2256
+ } else {
2257
+ format!("process:{} >> {}", process, sel)
2258
+ }
2259
+ } else {
2260
+ format!("process:{}", process)
2261
+ };
2262
+
2263
+ // Create locator and find element
2264
+ let sel_rust: computeruse::selector::Selector = full_selector.as_str().into();
2265
+ let locator = self.inner.locator(sel_rust);
2266
+ let element = locator.first(Some(timeout)).await.map_err(map_error)?;
2267
+
2268
+ if entire_monitor {
2269
+ // Monitor mode: get element's monitor and capture it
2270
+ let monitor = element.monitor().map_err(map_error)?;
2271
+ let screenshot = monitor.capture(&self.inner).await.map_err(map_error)?;
2272
+ Ok(ScreenshotResult {
2273
+ width: screenshot.width,
2274
+ height: screenshot.height,
2275
+ image_data: screenshot.image_data,
2276
+ monitor: Some(Monitor::from(monitor)),
2277
+ })
2278
+ } else {
2279
+ // Element/Window mode: capture the element directly
2280
+ let screenshot = element.capture().map_err(map_error)?;
2281
+ Ok(ScreenshotResult {
2282
+ width: screenshot.width,
2283
+ height: screenshot.height,
2284
+ image_data: screenshot.image_data,
2285
+ monitor: screenshot.monitor.map(Monitor::from),
2286
+ })
2287
+ }
2288
+ }
2289
+
2290
+ // ============== SCREENSHOT UTILITIES ==============
2291
+
2292
+ /// Convert a screenshot to PNG bytes.
2293
+ /// Converts BGRA to RGBA and encodes as PNG format.
2294
+ ///
2295
+ /// @param {ScreenshotResult} screenshot - The screenshot to convert.
2296
+ /// @returns {Buffer} PNG-encoded bytes.
2297
+ #[napi(js_name = "screenshotToPng")]
2298
+ pub fn screenshot_to_png(&self, screenshot: ScreenshotResult) -> napi::Result<Vec<u8>> {
2299
+ screenshot
2300
+ .to_inner()
2301
+ .to_png()
2302
+ .map_err(|e| napi::Error::from_reason(e.to_string()))
2303
+ }
2304
+
2305
+ /// Convert a screenshot to PNG bytes with resizing.
2306
+ /// If the image exceeds maxDimension in either width or height,
2307
+ /// it will be resized while maintaining aspect ratio.
2308
+ ///
2309
+ /// @param {ScreenshotResult} screenshot - The screenshot to convert.
2310
+ /// @param {number} [maxDimension] - Maximum width or height. Defaults to 1920.
2311
+ /// @returns {Buffer} PNG-encoded bytes (potentially resized).
2312
+ #[napi(js_name = "screenshotToPngResized")]
2313
+ pub fn screenshot_to_png_resized(
2314
+ &self,
2315
+ screenshot: ScreenshotResult,
2316
+ max_dimension: Option<u32>,
2317
+ ) -> napi::Result<Vec<u8>> {
2318
+ screenshot
2319
+ .to_inner()
2320
+ .to_png_resized(max_dimension)
2321
+ .map_err(|e| napi::Error::from_reason(e.to_string()))
2322
+ }
2323
+
2324
+ /// Convert a screenshot to base64-encoded PNG string.
2325
+ /// Useful for embedding in JSON responses or passing to LLMs.
2326
+ ///
2327
+ /// @param {ScreenshotResult} screenshot - The screenshot to convert.
2328
+ /// @returns {string} Base64-encoded PNG string.
2329
+ #[napi(js_name = "screenshotToBase64Png")]
2330
+ pub fn screenshot_to_base64_png(&self, screenshot: ScreenshotResult) -> napi::Result<String> {
2331
+ screenshot
2332
+ .to_inner()
2333
+ .to_base64_png()
2334
+ .map_err(|e| napi::Error::from_reason(e.to_string()))
2335
+ }
2336
+
2337
+ /// Convert a screenshot to base64-encoded PNG string with resizing.
2338
+ /// If the image exceeds maxDimension in either width or height,
2339
+ /// it will be resized while maintaining aspect ratio.
2340
+ ///
2341
+ /// @param {ScreenshotResult} screenshot - The screenshot to convert.
2342
+ /// @param {number} [maxDimension] - Maximum width or height. Defaults to 1920.
2343
+ /// @returns {string} Base64-encoded PNG string (potentially resized).
2344
+ #[napi(js_name = "screenshotToBase64PngResized")]
2345
+ pub fn screenshot_to_base64_png_resized(
2346
+ &self,
2347
+ screenshot: ScreenshotResult,
2348
+ max_dimension: Option<u32>,
2349
+ ) -> napi::Result<String> {
2350
+ screenshot
2351
+ .to_inner()
2352
+ .to_base64_png_resized(max_dimension)
2353
+ .map_err(|e| napi::Error::from_reason(e.to_string()))
2354
+ }
2355
+
2356
+ /// Get the dimensions a screenshot would have after resizing.
2357
+ ///
2358
+ /// @param {ScreenshotResult} screenshot - The screenshot to check.
2359
+ /// @param {number} maxDimension - Maximum width or height.
2360
+ /// @returns {ResizedDimensions} Object with width and height after resize.
2361
+ #[napi(js_name = "screenshotResizedDimensions")]
2362
+ pub fn screenshot_resized_dimensions(
2363
+ &self,
2364
+ screenshot: ScreenshotResult,
2365
+ max_dimension: u32,
2366
+ ) -> ResizedDimensions {
2367
+ let (width, height) = screenshot.to_inner().resized_dimensions(max_dimension);
2368
+ ResizedDimensions { width, height }
2369
+ }
2370
+
2371
+ /// (async) Get all window elements for a given application name.
2372
+ ///
2373
+ /// @param {string} name - The name of the application whose windows will be retrieved.
2374
+ /// @returns {Promise<Array<Element>>} A list of window elements belonging to the application.
2375
+ #[napi]
2376
+ pub async fn windows_for_application(&self, name: String) -> napi::Result<Vec<Element>> {
2377
+ self.inner
2378
+ .windows_for_application(&name)
2379
+ .await
2380
+ .map(|windows| windows.into_iter().map(Element::from).collect())
2381
+ .map_err(map_error)
2382
+ }
2383
+
2384
+ // ============== ADDITIONAL MISSING METHODS ==============
2385
+
2386
+ /// (async) Get the UI tree for all open applications in parallel.
2387
+ ///
2388
+ /// @returns {Promise<Array<UINode>>} List of UI trees for all applications.
2389
+ #[napi]
2390
+ pub async fn get_all_applications_tree(&self) -> napi::Result<Vec<UINode>> {
2391
+ self.inner
2392
+ .get_all_applications_tree()
2393
+ .await
2394
+ .map(|trees| trees.into_iter().map(UINode::from).collect())
2395
+ .map_err(map_error)
2396
+ }
2397
+
2398
+ /// (async) Press a key globally.
2399
+ ///
2400
+ /// @param {string} key - The key to press (e.g., "Enter", "Ctrl+C", "F1").
2401
+ /// @param {string} [process] - Process name for window screenshot capture. If provided, enables window screenshots.
2402
+ /// @param {boolean} [includeWindowScreenshot=true] - Whether to capture window screenshot (requires process).
2403
+ /// @param {boolean} [includeMonitorScreenshots=false] - Whether to capture monitor screenshots after key press.
2404
+ #[napi]
2405
+ pub async fn press_key(
2406
+ &self,
2407
+ key: String,
2408
+ process: Option<String>,
2409
+ include_window_screenshot: Option<bool>,
2410
+ include_monitor_screenshots: Option<bool>,
2411
+ ) -> napi::Result<()> {
2412
+ // Normalize key to ensure curly brace format (e.g., "Enter" -> "{Enter}")
2413
+ let normalized_key = normalize_key(&key);
2414
+ tracing::debug!(
2415
+ "[TS SDK] desktop.press_key: normalized key: {} -> {}",
2416
+ key,
2417
+ normalized_key
2418
+ );
2419
+ let result = self
2420
+ .inner
2421
+ .press_key(&normalized_key)
2422
+ .await
2423
+ .map_err(map_error);
2424
+
2425
+ // Get PID from process name if provided
2426
+ let pid = process
2427
+ .as_ref()
2428
+ .and_then(|p| find_pid_for_process(&self.inner, p).ok());
2429
+
2430
+ // Capture screenshots if requested
2431
+ let _screenshots = capture_screenshots(
2432
+ &self.inner,
2433
+ pid,
2434
+ include_window_screenshot.unwrap_or(true) && pid.is_some(),
2435
+ include_monitor_screenshots.unwrap_or(false),
2436
+ "pressKey",
2437
+ );
2438
+
2439
+ result
2440
+ }
2441
+
2442
+ /// (async) Execute JavaScript in a browser tab.
2443
+ /// Finds the browser window by process name and executes the script.
2444
+ ///
2445
+ /// @param {string} script - The JavaScript code to execute in browser context.
2446
+ /// @param {string} process - Process name to scope the browser window (e.g., 'chrome', 'msedge'). Required.
2447
+ /// @param {number} [timeoutMs=10000] - Timeout in milliseconds for finding the browser window.
2448
+ /// @returns {Promise<string>} The result of script execution.
2449
+ #[napi]
2450
+ pub async fn execute_browser_script(
2451
+ &self,
2452
+ script: String,
2453
+ process: String,
2454
+ timeout_ms: Option<f64>,
2455
+ ) -> napi::Result<String> {
2456
+ use std::time::Duration;
2457
+
2458
+ let timeout = Duration::from_millis(timeout_ms.unwrap_or(10000.0) as u64);
2459
+ let selector_str = format!("process:{}", process);
2460
+ let sel: computeruse::selector::Selector = selector_str.as_str().into();
2461
+ let locator = self.inner.locator(sel);
2462
+ let element = locator.first(Some(timeout)).await.map_err(map_error)?;
2463
+ element
2464
+ .execute_browser_script(&script)
2465
+ .await
2466
+ .map_err(map_error)
2467
+ }
2468
+
2469
+ /// (async) Close a browser tab safely.
2470
+ ///
2471
+ /// This method can identify the tab to close by:
2472
+ /// - tabId: Close a specific tab by its Chrome tab ID
2473
+ /// - url: Find and close a tab matching this URL (partial match supported)
2474
+ /// - title: Find and close a tab matching this title (case-insensitive partial match)
2475
+ /// - If none provided, closes the currently active tab
2476
+ ///
2477
+ /// Returns information about the closed tab for verification.
2478
+ /// Returns null if no browser extension is connected or tab couldn't be found.
2479
+ ///
2480
+ /// Safety:
2481
+ /// - Will NOT close protected browser pages (chrome://, edge://, about:, etc.)
2482
+ /// - Returns the closed tab's URL/title so you can verify the correct tab was closed
2483
+ ///
2484
+ /// @param {number} [tabId] - Specific Chrome tab ID to close.
2485
+ /// @param {string} [url] - URL to match (partial match supported).
2486
+ /// @param {string} [title] - Title to match (case-insensitive partial match).
2487
+ /// @returns {Promise<CloseTabResult | null>} Info about closed tab, or null if no extension/tab found.
2488
+ ///
2489
+ /// @example
2490
+ /// // Close by URL
2491
+ /// const result = await desktop.closeTab({ url: "example.com" });
2492
+ ///
2493
+ /// @example
2494
+ /// // Close by title
2495
+ /// const result = await desktop.closeTab({ title: "My Page" });
2496
+ ///
2497
+ /// @example
2498
+ /// // Close active tab
2499
+ /// const result = await desktop.closeTab();
2500
+ #[napi]
2501
+ pub async fn close_tab(
2502
+ &self,
2503
+ options: Option<crate::types::CloseTabOptions>,
2504
+ ) -> napi::Result<Option<crate::types::CloseTabResult>> {
2505
+ let opts = options.unwrap_or_default();
2506
+ self.inner
2507
+ .close_tab(opts.tab_id, opts.url.as_deref(), opts.title.as_deref())
2508
+ .await
2509
+ .map(|opt| opt.map(crate::types::CloseTabResult::from))
2510
+ .map_err(map_error)
2511
+ }
2512
+ /// (async) Delay execution for a specified number of milliseconds.
2513
+ /// Useful for waiting between actions to ensure UI stability.
2514
+ ///
2515
+ /// @param {number} delayMs - Delay in milliseconds.
2516
+ /// @returns {Promise<void>}
2517
+ #[napi]
2518
+ pub async fn delay(&self, delay_ms: u32) -> napi::Result<()> {
2519
+ self.inner.delay(delay_ms as u64).await.map_err(map_error)
2520
+ }
2521
+
2522
    /// Navigate to a URL in a browser.
    /// This is the recommended method for browser navigation - more reliable than
    /// manually manipulating the address bar with keyboard/mouse actions.
    ///
    /// @param {string} url - URL to navigate to
    /// @param {string | null} browser - Optional browser name ('Chrome', 'Firefox', 'Edge', 'Brave', 'Opera', 'Vivaldi', or 'Default'); any other value is treated as a custom browser name
    /// @param {boolean} [includeWindowScreenshot=true] - Whether to capture window screenshot after navigation
    /// @param {boolean} [includeMonitorScreenshots=false] - Whether to capture monitor screenshots after navigation
    /// @returns {Element} The browser window element (this method is synchronous, not a Promise)
    #[napi]
    pub fn navigate_browser(
        &self,
        url: String,
        browser: Option<String>,
        include_window_screenshot: Option<bool>,
        include_monitor_screenshots: Option<bool>,
    ) -> napi::Result<Element> {
        // Map the user-facing browser name onto the core enum; unrecognized
        // names pass through as Custom so arbitrary browsers can be targeted.
        let browser_enum = browser.map(|b| match b.as_str() {
            "Chrome" => computeruse::Browser::Chrome,
            "Firefox" => computeruse::Browser::Firefox,
            "Edge" => computeruse::Browser::Edge,
            "Brave" => computeruse::Browser::Brave,
            "Opera" => computeruse::Browser::Opera,
            "Vivaldi" => computeruse::Browser::Vivaldi,
            "Default" => computeruse::Browser::Default,
            custom => computeruse::Browser::Custom(custom.to_string()),
        });

        let element = self.inner.open_url(&url, browser_enum).map_err(map_error)?;

        // Capture screenshots if enabled (window default: true, monitor default: false)
        let _screenshots = capture_screenshots(
            &self.inner,
            element.process_id().ok(),
            include_window_screenshot.unwrap_or(true),
            include_monitor_screenshots.unwrap_or(false),
            "navigateBrowser",
        );

        Ok(Element { inner: element })
    }
2563
+
2564
+ /// (async) Set the zoom level to a specific percentage.
2565
+ ///
2566
+ /// @param {number} percentage - The zoom percentage (e.g., 100 for 100%, 150 for 150%, 50 for 50%).
2567
+ #[napi]
2568
+ pub async fn set_zoom(&self, percentage: u32) -> napi::Result<()> {
2569
+ self.inner.set_zoom(percentage).await.map_err(map_error)
2570
+ }
2571
+
2572
    /// (async) Run Gemini Computer Use agentic loop.
    ///
    /// Provide a goal and target process, and this will autonomously take actions
    /// (click, type, scroll, etc.) until the goal is achieved or max_steps is reached.
    /// Uses Gemini's vision model to analyze screenshots and decide actions.
    ///
    /// @param {string} process - Process name of the target application (e.g., "chrome", "notepad")
    /// @param {string} goal - What to achieve (e.g., "Open Notepad and type Hello World")
    /// @param {number} [maxSteps=20] - Maximum number of steps before stopping
    /// @param {function} [onStep] - Optional callback invoked after each step with step details
    /// @returns {Promise<ComputerUseResult>} Result with status, steps executed, and history
    #[napi]
    pub async fn gemini_computer_use(
        &self,
        process: String,
        goal: String,
        max_steps: Option<u32>,
        #[napi(ts_arg_type = "((err: null | Error, step: ComputerUseStep) => void) | undefined")]
        on_step: Option<ThreadsafeFunction<ComputerUseStep>>,
    ) -> napi::Result<ComputerUseResult> {
        // Create progress callback if onStep is provided. The threadsafe
        // function is wrapped in an Arc so the boxed closure can own a handle
        // to it while satisfying the Send + Sync bound required by core.
        #[allow(clippy::type_complexity)]
        let progress_callback: Option<
            Box<dyn Fn(&computeruse::ComputerUseStep) + Send + Sync>,
        > = on_step.map(|tsfn| {
            let tsfn = Arc::new(tsfn);
            Box::new(move |step: &computeruse::ComputerUseStep| {
                let js_step = ComputerUseStep::from(step.clone());
                // NonBlocking: never stall the agent loop waiting for the JS
                // callback queue to drain.
                tsfn.call(Ok(js_step), ThreadsafeFunctionCallMode::NonBlocking);
            }) as Box<dyn Fn(&computeruse::ComputerUseStep) + Send + Sync>
        });

        self.inner
            .gemini_computer_use(&process, &goal, max_steps, progress_callback)
            .await
            .map(ComputerUseResult::from)
            .map_err(|e| napi::Error::from_reason(e.to_string()))
    }
2610
+
2611
    /// Stop all currently executing operations.
    ///
    /// This cancels the internal cancellation token, which will cause any
    /// operations that check `isCancelled()` to abort. After calling this,
    /// you should create a new Desktop instance to start fresh.
    #[napi]
    pub fn stop_execution(&self) {
        // Delegates to the core desktop's cancellation token.
        self.inner.stop_execution();
    }
2620
+
2621
    /// Check if execution has been cancelled.
    ///
    /// Returns `true` if `stopExecution()` has been called.
    /// Long-running operations should periodically check this and abort if true.
    #[napi]
    pub fn is_cancelled(&self) -> bool {
        // Reads the core desktop's cancellation flag.
        self.inner.is_cancelled()
    }
2629
+
2630
    /// Stop all active highlight overlays globally.
    ///
    /// This finds and destroys all highlight overlay windows that were created
    /// by `element.highlight()`. Useful for cleaning up highlights without
    /// needing to track individual HighlightHandle objects.
    ///
    /// @returns {number} The number of highlights that were stopped.
    #[napi]
    pub fn stop_highlighting(&self) -> u32 {
        #[cfg(target_os = "windows")]
        {
            // Highlight overlays are Windows-only; the core returns the count
            // of overlay windows that were destroyed.
            computeruse::stop_all_highlights() as u32
        }
        #[cfg(not(target_os = "windows"))]
        {
            // Not implemented for other platforms yet
            0
        }
    }
2649
+
2650
+ /// Show inspect overlay with indexed elements for visual debugging.
2651
+ ///
2652
+ /// Displays a transparent overlay window with colored rectangles around UI elements,
2653
+ /// showing their index numbers for click targeting. Use `hideInspectOverlay()` to remove.
2654
+ ///
2655
+ /// @param {InspectElement[]} elements - Array of elements to highlight with their bounds.
2656
+ /// @param {object} windowBounds - The window bounds {x, y, width, height} to constrain the overlay.
2657
+ /// @param {OverlayDisplayMode} [displayMode='Index'] - What to show in labels: 'Index', 'Role', 'Name', etc.
2658
+ #[napi]
2659
+ #[cfg(target_os = "windows")]
2660
+ pub fn show_inspect_overlay(
2661
+ &self,
2662
+ elements: Vec<crate::types::InspectElement>,
2663
+ window_bounds: crate::types::Bounds,
2664
+ display_mode: Option<crate::types::OverlayDisplayMode>,
2665
+ ) -> napi::Result<()> {
2666
+ let core_elements: Vec<computeruse::InspectElement> =
2667
+ elements.into_iter().map(|e| e.into()).collect();
2668
+ let core_bounds = (
2669
+ window_bounds.x as i32,
2670
+ window_bounds.y as i32,
2671
+ window_bounds.width as i32,
2672
+ window_bounds.height as i32,
2673
+ );
2674
+ let core_mode = display_mode
2675
+ .map(|m| m.into())
2676
+ .unwrap_or(computeruse::OverlayDisplayMode::Index);
2677
+
2678
+ computeruse::show_inspect_overlay(core_elements, core_bounds, core_mode)
2679
+ .map(|_handle| ()) // Discard handle - use hideInspectOverlay to close
2680
+ .map_err(|e| napi::Error::from_reason(e.to_string()))
2681
+ }
2682
+
2683
+ /// Show inspect overlay (non-Windows stub).
2684
+ #[napi]
2685
+ #[cfg(not(target_os = "windows"))]
2686
+ pub fn show_inspect_overlay(
2687
+ &self,
2688
+ _elements: Vec<crate::types::InspectElement>,
2689
+ _window_bounds: crate::types::Bounds,
2690
+ _display_mode: Option<crate::types::OverlayDisplayMode>,
2691
+ ) -> napi::Result<()> {
2692
+ // Not implemented for other platforms yet
2693
+ Ok(())
2694
+ }
2695
+
2696
+ /// Hide any active inspect overlay.
2697
+ ///
2698
+ /// This hides the visual overlay that was shown via `showInspectOverlay()`.
2699
+ /// Can be called from any thread.
2700
+ #[napi]
2701
+ pub fn hide_inspect_overlay(&self) {
2702
+ #[cfg(target_os = "windows")]
2703
+ {
2704
+ computeruse::hide_inspect_overlay();
2705
+ }
2706
+ #[cfg(not(target_os = "windows"))]
2707
+ {
2708
+ // Not implemented for other platforms yet
2709
+ }
2710
+ }
2711
+
2712
+ // ============== ELEMENT VERIFICATION ==============
2713
+
2714
+ /// Verify that an element matching the selector exists within the same application as the scope element.
2715
+ ///
2716
+ /// This is used for post-action verification - checking that an expected element appeared after
2717
+ /// performing an action (e.g., a success dialog after clicking submit).
2718
+ ///
2719
+ /// @param {Element} scopeElement - The element to get the application scope from (typically the element the action was performed on)
2720
+ /// @param {string} selector - The selector string to search for
2721
+ /// @param {number} [timeoutMs=2000] - How long to wait for the element to appear in milliseconds
2722
+ /// @returns {Element} The found element if verification passes
2723
+ /// @throws Error if the element is not found within the timeout
2724
+ #[napi]
2725
+ pub async fn verify_element_exists(
2726
+ &self,
2727
+ scope_element: &crate::Element,
2728
+ selector: String,
2729
+ timeout_ms: Option<u32>,
2730
+ ) -> napi::Result<crate::Element> {
2731
+ let timeout = timeout_ms.unwrap_or(2000) as u64;
2732
+ let found = self
2733
+ .inner
2734
+ .verify_element_exists(&scope_element.inner, &selector, timeout)
2735
+ .await
2736
+ .map_err(map_error)?;
2737
+ Ok(crate::Element { inner: found })
2738
+ }
2739
+
2740
+ /// Verify that an element matching the selector does NOT exist within the same application as the scope element.
2741
+ ///
2742
+ /// This is used for post-action verification - checking that an element disappeared after
2743
+ /// performing an action (e.g., a modal dialog closed after clicking OK).
2744
+ ///
2745
+ /// @param {Element} scopeElement - The element to get the application scope from (typically the element the action was performed on)
2746
+ /// @param {string} selector - The selector string that should NOT be found
2747
+ /// @param {number} [timeoutMs=2000] - How long to wait/check that the element doesn't appear in milliseconds
2748
+ /// @returns {void}
2749
+ /// @throws Error if the element IS found (meaning verification failed)
2750
+ #[napi]
2751
+ pub async fn verify_element_not_exists(
2752
+ &self,
2753
+ scope_element: &crate::Element,
2754
+ selector: String,
2755
+ timeout_ms: Option<u32>,
2756
+ ) -> napi::Result<()> {
2757
+ let timeout = timeout_ms.unwrap_or(2000) as u64;
2758
+ self.inner
2759
+ .verify_element_not_exists(&scope_element.inner, &selector, timeout)
2760
+ .await
2761
+ .map_err(map_error)
2762
+ }
2763
+ }