@mediar-ai/terminator 0.23.36 → 0.23.44

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/desktop.rs CHANGED
@@ -1,13 +1,84 @@
1
- use crate::types::{ComputerUseResult, Monitor, MonitorScreenshotPair};
1
+ use crate::types::{
2
+ ClickResult, ClickType, ComputerUseResult, ComputerUseStep, Monitor, MonitorScreenshotPair,
3
+ ResizedDimensions, TreeOutputFormat, VisionType, WindowTreeResult,
4
+ };
2
5
  use crate::Selector;
3
6
  use crate::{
4
7
  map_error, CommandOutput, Element, Locator, ScreenshotResult, TreeBuildConfig, UINode,
5
8
  };
6
9
  use napi::bindgen_prelude::Either;
10
+ use napi::threadsafe_function::{ThreadsafeFunction, ThreadsafeFunctionCallMode};
7
11
  use napi_derive::napi;
8
- use std::sync::Once;
12
+ use std::sync::{Arc, Once};
9
13
  use terminator::Desktop as TerminatorDesktop;
10
14
 
15
/// File-system locations of screenshots captured for one operation.
///
/// Both fields stay `None` when the corresponding capture was disabled
/// or failed; callers treat capture as best-effort.
#[derive(Default)]
struct ScreenshotPaths {
    // Saved path of the focused-window screenshot, if one was taken.
    window_path: Option<String>,
    // Saved paths of per-monitor screenshots, if any were taken.
    monitor_paths: Option<Vec<String>>,
}
21
+
22
+ /// Helper to capture and save screenshots based on options
23
+ fn capture_screenshots(
24
+ desktop: &TerminatorDesktop,
25
+ pid: Option<u32>,
26
+ include_window: bool,
27
+ include_monitors: bool,
28
+ operation: &str,
29
+ ) -> ScreenshotPaths {
30
+ let mut result = ScreenshotPaths::default();
31
+
32
+ if !include_window && !include_monitors {
33
+ return result;
34
+ }
35
+
36
+ terminator::screenshot_logger::init();
37
+ let prefix = terminator::screenshot_logger::generate_prefix(None, operation);
38
+
39
+ if include_window {
40
+ if let Some(pid) = pid {
41
+ // Try to capture window screenshot by PID
42
+ if let Ok(apps) = desktop.applications() {
43
+ if let Some(app) = apps.into_iter().find(|a| a.process_id().ok() == Some(pid)) {
44
+ if let Ok(screenshot) = app.capture() {
45
+ if let Some(saved) = terminator::screenshot_logger::save_window_screenshot(
46
+ &screenshot,
47
+ &prefix,
48
+ None,
49
+ ) {
50
+ result.window_path = Some(saved.path.to_string_lossy().to_string());
51
+ }
52
+ }
53
+ }
54
+ }
55
+ }
56
+ }
57
+
58
+ if include_monitors {
59
+ // Capture all monitors using futures executor for sync context
60
+ if let Ok(monitors) = futures::executor::block_on(desktop.capture_all_monitors()) {
61
+ let saved =
62
+ terminator::screenshot_logger::save_monitor_screenshots(&monitors, &prefix, None);
63
+ if !saved.is_empty() {
64
+ result.monitor_paths = Some(
65
+ saved
66
+ .into_iter()
67
+ .map(|s| s.path.to_string_lossy().to_string())
68
+ .collect(),
69
+ );
70
+ }
71
+ }
72
+ }
73
+
74
+ result
75
+ }
76
+
77
+ /// Helper to find PID from process name using the shared core function.
78
+ fn find_pid_for_process(desktop: &TerminatorDesktop, process_name: &str) -> napi::Result<u32> {
79
+ terminator::find_pid_for_process(desktop, process_name).map_err(map_error)
80
+ }
81
+
11
82
  /// Main entry point for desktop automation.
12
83
  #[napi(js_name = "Desktop")]
13
84
  pub struct Desktop {
@@ -86,12 +157,27 @@ impl Desktop {
86
157
  /// Open an application by name.
87
158
  ///
88
159
  /// @param {string} name - The name of the application to open.
160
+ /// @param {boolean} [includeWindowScreenshot=true] - Whether to capture window screenshot after opening
161
+ /// @param {boolean} [includeMonitorScreenshots=false] - Whether to capture monitor screenshots after opening
89
162
  #[napi]
90
- pub fn open_application(&self, name: String) -> napi::Result<Element> {
91
- self.inner
92
- .open_application(&name)
93
- .map(Element::from)
94
- .map_err(map_error)
163
+ pub fn open_application(
164
+ &self,
165
+ name: String,
166
+ include_window_screenshot: Option<bool>,
167
+ include_monitor_screenshots: Option<bool>,
168
+ ) -> napi::Result<Element> {
169
+ let element = self.inner.open_application(&name).map_err(map_error)?;
170
+
171
+ // Capture screenshots if enabled (window default: true, monitor default: false)
172
+ let _screenshots = capture_screenshots(
173
+ &self.inner,
174
+ element.process_id().ok(),
175
+ include_window_screenshot.unwrap_or(true),
176
+ include_monitor_screenshots.unwrap_or(false),
177
+ "openApplication",
178
+ );
179
+
180
+ Ok(Element::from(element))
95
181
  }
96
182
 
97
183
  /// Activate an application by name.
@@ -102,6 +188,93 @@ impl Desktop {
102
188
  self.inner.activate_application(&name).map_err(map_error)
103
189
  }
104
190
 
191
+ /// Click within element bounds at a specified position.
192
+ ///
193
+ /// This is useful for clicking on elements from UI tree, OCR, omniparser, gemini vision, or DOM
194
+ /// without needing an element reference - just the bounds.
195
+ ///
196
+ /// @param {number} x - X coordinate of the bounds.
197
+ /// @param {number} y - Y coordinate of the bounds.
198
+ /// @param {number} width - Width of the bounds.
199
+ /// @param {number} height - Height of the bounds.
200
+ /// @param {number} [xPercentage=50] - X position within bounds as percentage (0-100). Defaults to 50 (center).
201
+ /// @param {number} [yPercentage=50] - Y position within bounds as percentage (0-100). Defaults to 50 (center).
202
+ /// @param {ClickType} [clickType='left'] - Type of click: 'left', 'double', or 'right'.
203
+ /// @param {boolean} [restoreCursor=true] - If true, restore cursor to original position after clicking.
204
+ /// @returns {ClickResult} Result with clicked coordinates and method details.
205
+ #[napi]
206
+ #[allow(clippy::too_many_arguments)]
207
+ pub fn click_at_bounds(
208
+ &self,
209
+ x: f64,
210
+ y: f64,
211
+ width: f64,
212
+ height: f64,
213
+ x_percentage: Option<u8>,
214
+ y_percentage: Option<u8>,
215
+ click_type: Option<ClickType>,
216
+ restore_cursor: Option<bool>,
217
+ ) -> napi::Result<ClickResult> {
218
+ let bounds = (x, y, width, height);
219
+ let click_position = match (x_percentage, y_percentage) {
220
+ (Some(xp), Some(yp)) => Some((xp, yp)),
221
+ (Some(xp), None) => Some((xp, 50)),
222
+ (None, Some(yp)) => Some((50, yp)),
223
+ (None, None) => None,
224
+ };
225
+ let click_type = click_type.unwrap_or(ClickType::Left);
226
+ let restore_cursor = restore_cursor.unwrap_or(true);
227
+
228
+ self.inner
229
+ .click_at_bounds(bounds, click_position, click_type.into(), restore_cursor)
230
+ .map(ClickResult::from)
231
+ .map_err(map_error)
232
+ }
233
+
234
+ /// Click on an element by its index from the last tree/vision query.
235
+ ///
236
+ /// This looks up cached bounds from the appropriate cache based on visionType,
237
+ /// then clicks at the specified position within those bounds.
238
+ ///
239
+ /// @param {number} index - 1-based index from the tree/vision output (e.g., #1, #2).
240
+ /// @param {VisionType} [visionType='UiTree'] - Source of the index: 'UiTree', 'Ocr', 'Omniparser', 'Gemini', or 'Dom'.
241
+ /// @param {number} [xPercentage=50] - X position within bounds as percentage (0-100).
242
+ /// @param {number} [yPercentage=50] - Y position within bounds as percentage (0-100).
243
+ /// @param {ClickType} [clickType='Left'] - Type of click: 'Left', 'Double', or 'Right'.
244
+ /// @param {boolean} [restoreCursor=true] - If true, restore cursor to original position after clicking.
245
+ /// @returns {ClickResult} Result with clicked coordinates, element info, and method details.
246
+ #[napi]
247
+ pub fn click_by_index(
248
+ &self,
249
+ index: u32,
250
+ vision_type: Option<VisionType>,
251
+ x_percentage: Option<u8>,
252
+ y_percentage: Option<u8>,
253
+ click_type: Option<ClickType>,
254
+ restore_cursor: Option<bool>,
255
+ ) -> napi::Result<ClickResult> {
256
+ let vision_type = vision_type.unwrap_or(VisionType::UiTree);
257
+ let click_position = match (x_percentage, y_percentage) {
258
+ (Some(xp), Some(yp)) => Some((xp, yp)),
259
+ (Some(xp), None) => Some((xp, 50)),
260
+ (None, Some(yp)) => Some((50, yp)),
261
+ (None, None) => None,
262
+ };
263
+ let click_type = click_type.unwrap_or(ClickType::Left);
264
+ let restore_cursor = restore_cursor.unwrap_or(true);
265
+
266
+ self.inner
267
+ .click_by_index(
268
+ index,
269
+ vision_type.into(),
270
+ click_position,
271
+ click_type.into(),
272
+ restore_cursor,
273
+ )
274
+ .map(ClickResult::from)
275
+ .map_err(map_error)
276
+ }
277
+
105
278
  /// (async) Run a shell command.
106
279
  ///
107
280
  /// @param {string} [windowsCommand] - Command to run on Windows.
@@ -170,28 +343,1149 @@ impl Desktop {
170
343
  /// @returns {Promise<string>} The extracted text.
171
344
  #[napi]
172
345
  pub async fn ocr_screenshot(&self, screenshot: ScreenshotResult) -> napi::Result<String> {
173
- let rust_screenshot = terminator::ScreenshotResult {
174
- image_data: screenshot.image_data,
175
- width: screenshot.width,
176
- height: screenshot.height,
177
- monitor: screenshot.monitor.map(|m| terminator::Monitor {
178
- id: m.id,
179
- name: m.name,
180
- is_primary: m.is_primary,
181
- width: m.width,
182
- height: m.height,
183
- x: m.x,
184
- y: m.y,
185
- scale_factor: m.scale_factor,
186
- work_area: None,
187
- }),
188
- };
346
+ let rust_screenshot = screenshot.to_inner();
189
347
  self.inner
190
348
  .ocr_screenshot(&rust_screenshot)
191
349
  .await
192
350
  .map_err(map_error)
193
351
  }
194
352
 
353
+ /// (async) Perform OCR on a window by process name and return structured results with bounding boxes.
354
+ /// Returns an OcrResult containing the OCR tree, formatted output, and index-to-bounds mapping
355
+ /// for click targeting.
356
+ ///
357
+ /// @param {string} process - Process name to match (e.g., 'chrome', 'notepad').
358
+ /// @param {boolean} [formatOutput=true] - Whether to generate formatted compact YAML output.
359
+ /// @returns {Promise<OcrResult>} Complete OCR result with tree, formatted output, and bounds mapping.
360
+ #[napi]
361
+ #[cfg(target_os = "windows")]
362
+ pub async fn perform_ocr_for_process(
363
+ &self,
364
+ process: String,
365
+ format_output: Option<bool>,
366
+ ) -> napi::Result<crate::types::OcrResult> {
367
+ let format_output = format_output.unwrap_or(true);
368
+
369
+ // Find PID for the process name
370
+ let pid = find_pid_for_process(&self.inner, &process)?;
371
+
372
+ // Find the application element by PID
373
+ let apps = self.inner.applications().map_err(map_error)?;
374
+ let window_element = apps
375
+ .into_iter()
376
+ .find(|app| app.process_id().ok() == Some(pid))
377
+ .ok_or_else(|| {
378
+ napi::Error::from_reason(format!("No window found for process '{}'", process))
379
+ })?;
380
+
381
+ // Get window bounds (absolute screen coordinates)
382
+ let bounds = window_element.bounds().map_err(map_error)?;
383
+ let (window_x, window_y, win_w, win_h) = bounds;
384
+
385
+ // Capture screenshot of the window
386
+ let screenshot = window_element.capture().map_err(map_error)?;
387
+
388
+ // Calculate DPI scale factors (physical screenshot pixels / logical window size)
389
+ let dpi_scale_w = screenshot.width as f64 / win_w;
390
+ let dpi_scale_h = screenshot.height as f64 / win_h;
391
+
392
+ // Perform OCR with bounding boxes
393
+ let ocr_element = self
394
+ .inner
395
+ .ocr_screenshot_with_bounds(&screenshot, window_x, window_y, dpi_scale_w, dpi_scale_h)
396
+ .map_err(map_error)?;
397
+
398
+ // Format the OCR tree if requested
399
+ let (formatted, index_to_bounds) = if format_output {
400
+ let result = terminator::format_ocr_tree_as_compact_yaml(&ocr_element, 0);
401
+
402
+ // Populate the OCR cache for click_by_index support
403
+ self.inner
404
+ .populate_ocr_cache(result.index_to_bounds.clone());
405
+
406
+ let bounds_map: std::collections::HashMap<String, crate::types::OcrBoundsEntry> =
407
+ result
408
+ .index_to_bounds
409
+ .into_iter()
410
+ .map(|(idx, (text, (x, y, w, h)))| {
411
+ (
412
+ idx.to_string(),
413
+ crate::types::OcrBoundsEntry {
414
+ text,
415
+ bounds: crate::types::Bounds {
416
+ x,
417
+ y,
418
+ width: w,
419
+ height: h,
420
+ },
421
+ },
422
+ )
423
+ })
424
+ .collect();
425
+ (Some(result.formatted), bounds_map)
426
+ } else {
427
+ (None, std::collections::HashMap::new())
428
+ };
429
+
430
+ let element_count = index_to_bounds.len() as u32;
431
+
432
+ Ok(crate::types::OcrResult {
433
+ tree: crate::types::OcrElement::from(ocr_element),
434
+ formatted,
435
+ index_to_bounds,
436
+ element_count,
437
+ })
438
+ }
439
+
440
+ /// (async) Perform OCR on a window by process name (non-Windows stub).
441
+ #[napi]
442
+ #[cfg(not(target_os = "windows"))]
443
+ pub async fn perform_ocr_for_process(
444
+ &self,
445
+ _process: String,
446
+ _format_output: Option<bool>,
447
+ ) -> napi::Result<crate::types::OcrResult> {
448
+ Err(napi::Error::from_reason(
449
+ "OCR with bounding boxes is currently only supported on Windows",
450
+ ))
451
+ }
452
+
453
+ /// (async) Capture DOM elements from the current browser tab.
454
+ ///
455
+ /// Extracts visible DOM elements with their properties and screen coordinates.
456
+ /// Uses JavaScript injection via Chrome extension to traverse the DOM tree.
457
+ ///
458
+ /// @param {number} [maxElements=200] - Maximum number of elements to capture.
459
+ /// @param {boolean} [formatOutput=true] - Whether to include formatted compact YAML output.
460
+ /// @returns {Promise<BrowserDomResult>} DOM elements with bounds for click targeting.
461
+ #[napi]
462
+ pub async fn capture_browser_dom(
463
+ &self,
464
+ max_elements: Option<u32>,
465
+ format_output: Option<bool>,
466
+ ) -> napi::Result<crate::types::BrowserDomResult> {
467
+ use std::collections::HashMap;
468
+ use std::time::Duration;
469
+
470
+ let max_elements = max_elements.unwrap_or(200);
471
+ let format_output = format_output.unwrap_or(true);
472
+
473
+ // Get viewport offset from Document element (more reliable than JS due to DPI scaling)
474
+ let viewport_offset = match self
475
+ .inner
476
+ .locator("role:Document")
477
+ .first(Some(Duration::from_millis(2000)))
478
+ .await
479
+ {
480
+ Ok(doc_element) => match doc_element.bounds() {
481
+ Ok((x, y, _w, _h)) => (x, y),
482
+ Err(_) => (0.0, 0.0),
483
+ },
484
+ Err(_) => (0.0, 0.0),
485
+ };
486
+
487
+ // JavaScript to extract visible DOM elements
488
+ let script = format!(
489
+ r#"
490
+ (function() {{
491
+ const elements = [];
492
+ const maxElements = {max_elements};
493
+
494
+ const walker = document.createTreeWalker(
495
+ document.body,
496
+ NodeFilter.SHOW_ELEMENT,
497
+ {{
498
+ acceptNode: function(node) {{
499
+ const style = window.getComputedStyle(node);
500
+ const rect = node.getBoundingClientRect();
501
+
502
+ if (style.display === 'none' ||
503
+ style.visibility === 'hidden' ||
504
+ style.opacity === '0' ||
505
+ rect.width === 0 ||
506
+ rect.height === 0) {{
507
+ return NodeFilter.FILTER_SKIP;
508
+ }}
509
+
510
+ return NodeFilter.FILTER_ACCEPT;
511
+ }}
512
+ }}
513
+ );
514
+
515
+ let node;
516
+ while (node = walker.nextNode()) {{
517
+ if (elements.length >= maxElements) {{
518
+ break;
519
+ }}
520
+
521
+ const rect = node.getBoundingClientRect();
522
+ const text = node.innerText ? node.innerText.substring(0, 100).trim() : null;
523
+
524
+ elements.push({{
525
+ tag: node.tagName.toLowerCase(),
526
+ id: node.id || null,
527
+ classes: Array.from(node.classList),
528
+ text: text,
529
+ href: node.href || null,
530
+ type: node.type || null,
531
+ name: node.name || null,
532
+ value: node.value || null,
533
+ placeholder: node.placeholder || null,
534
+ aria_label: node.getAttribute('aria-label'),
535
+ role: node.getAttribute('role'),
536
+ x: Math.round(rect.x * window.devicePixelRatio),
537
+ y: Math.round(rect.y * window.devicePixelRatio),
538
+ width: Math.round(rect.width * window.devicePixelRatio),
539
+ height: Math.round(rect.height * window.devicePixelRatio)
540
+ }});
541
+ }}
542
+
543
+ return JSON.stringify({{
544
+ elements: elements,
545
+ total_found: elements.length,
546
+ page_url: window.location.href,
547
+ page_title: document.title,
548
+ devicePixelRatio: window.devicePixelRatio
549
+ }});
550
+ }})()"#
551
+ );
552
+
553
+ let result_str = self
554
+ .inner
555
+ .execute_browser_script(&script)
556
+ .await
557
+ .map_err(map_error)?;
558
+
559
+ let parsed: serde_json::Value = serde_json::from_str(&result_str)
560
+ .map_err(|e| napi::Error::from_reason(format!("Failed to parse DOM result: {e}")))?;
561
+
562
+ let page_url = parsed
563
+ .get("page_url")
564
+ .and_then(|v| v.as_str())
565
+ .unwrap_or("")
566
+ .to_string();
567
+ let page_title = parsed
568
+ .get("page_title")
569
+ .and_then(|v| v.as_str())
570
+ .unwrap_or("")
571
+ .to_string();
572
+
573
+ let raw_elements = parsed
574
+ .get("elements")
575
+ .and_then(|v| v.as_array())
576
+ .cloned()
577
+ .unwrap_or_default();
578
+
579
+ // Convert to BrowserDomElement and build index_to_bounds
580
+ let mut elements = Vec::new();
581
+ let mut index_to_bounds: HashMap<String, crate::types::DomBoundsEntry> = HashMap::new();
582
+ let mut formatted_lines: Vec<String> = Vec::new();
583
+
584
+ if format_output {
585
+ formatted_lines.push(format!(
586
+ "Browser DOM: {} elements (url: {}, title: {})",
587
+ raw_elements.len(),
588
+ page_url,
589
+ page_title
590
+ ));
591
+ }
592
+
593
+ for (i, elem) in raw_elements.iter().enumerate() {
594
+ let idx = i + 1;
595
+ let tag = elem
596
+ .get("tag")
597
+ .and_then(|v| v.as_str())
598
+ .unwrap_or("")
599
+ .to_string();
600
+ let id = elem.get("id").and_then(|v| v.as_str()).map(String::from);
601
+ let classes: Vec<String> = elem
602
+ .get("classes")
603
+ .and_then(|v| v.as_array())
604
+ .map(|arr| {
605
+ arr.iter()
606
+ .filter_map(|c| c.as_str().map(String::from))
607
+ .collect()
608
+ })
609
+ .unwrap_or_default();
610
+ let text = elem.get("text").and_then(|v| v.as_str()).map(String::from);
611
+ let href = elem.get("href").and_then(|v| v.as_str()).map(String::from);
612
+ let r#type = elem.get("type").and_then(|v| v.as_str()).map(String::from);
613
+ let name = elem.get("name").and_then(|v| v.as_str()).map(String::from);
614
+ let value = elem.get("value").and_then(|v| v.as_str()).map(String::from);
615
+ let placeholder = elem
616
+ .get("placeholder")
617
+ .and_then(|v| v.as_str())
618
+ .map(String::from);
619
+ let aria_label = elem
620
+ .get("aria_label")
621
+ .and_then(|v| v.as_str())
622
+ .map(String::from);
623
+ let role = elem.get("role").and_then(|v| v.as_str()).map(String::from);
624
+
625
+ // Build bounds with viewport offset added
626
+ let x = elem.get("x").and_then(|v| v.as_f64()).unwrap_or(0.0) + viewport_offset.0;
627
+ let y = elem.get("y").and_then(|v| v.as_f64()).unwrap_or(0.0) + viewport_offset.1;
628
+ let width = elem.get("width").and_then(|v| v.as_f64()).unwrap_or(0.0);
629
+ let height = elem.get("height").and_then(|v| v.as_f64()).unwrap_or(0.0);
630
+
631
+ let bounds = crate::types::Bounds {
632
+ x,
633
+ y,
634
+ width,
635
+ height,
636
+ };
637
+
638
+ // Display name for index_to_bounds
639
+ let display_name = text
640
+ .as_ref()
641
+ .filter(|t| !t.is_empty())
642
+ .cloned()
643
+ .or_else(|| aria_label.clone())
644
+ .or_else(|| placeholder.clone())
645
+ .or_else(|| name.clone())
646
+ .or_else(|| id.clone())
647
+ .unwrap_or_else(|| format!("<{}>", tag));
648
+
649
+ // Format line for compact YAML
650
+ if format_output {
651
+ let mut line_parts = vec![format!("#{} [{}]", idx, tag.to_uppercase())];
652
+ if let Some(ref t) = text {
653
+ if !t.is_empty() {
654
+ let truncated = if t.len() > 40 {
655
+ format!("{}...", &t[..40])
656
+ } else {
657
+ t.clone()
658
+ };
659
+ line_parts.push(truncated);
660
+ }
661
+ }
662
+ if let Some(ref a) = aria_label {
663
+ line_parts.push(format!("aria:{}", a));
664
+ }
665
+ if let Some(ref r) = role {
666
+ line_parts.push(format!("role:{}", r));
667
+ }
668
+ formatted_lines.push(format!(" {}", line_parts.join(" ")));
669
+ }
670
+
671
+ index_to_bounds.insert(
672
+ idx.to_string(),
673
+ crate::types::DomBoundsEntry {
674
+ name: display_name,
675
+ tag: tag.clone(),
676
+ bounds: bounds.clone(),
677
+ },
678
+ );
679
+
680
+ elements.push(crate::types::BrowserDomElement {
681
+ tag,
682
+ id,
683
+ classes,
684
+ text,
685
+ href,
686
+ r#type,
687
+ name,
688
+ value,
689
+ placeholder,
690
+ aria_label,
691
+ role,
692
+ bounds,
693
+ });
694
+ }
695
+
696
+ // Populate DOM cache for click_by_index
697
+ #[allow(clippy::type_complexity)]
698
+ let cache_items: std::collections::HashMap<
699
+ u32,
700
+ (String, String, (f64, f64, f64, f64)),
701
+ > = index_to_bounds
702
+ .iter()
703
+ .filter_map(|(key, entry)| {
704
+ key.parse::<u32>().ok().map(|idx| {
705
+ (
706
+ idx,
707
+ (
708
+ entry.name.clone(),
709
+ entry.tag.clone(),
710
+ (
711
+ entry.bounds.x,
712
+ entry.bounds.y,
713
+ entry.bounds.width,
714
+ entry.bounds.height,
715
+ ),
716
+ ),
717
+ )
718
+ })
719
+ })
720
+ .collect();
721
+ self.inner.populate_dom_cache(cache_items);
722
+
723
+ Ok(crate::types::BrowserDomResult {
724
+ elements,
725
+ formatted: if format_output {
726
+ Some(formatted_lines.join("\n"))
727
+ } else {
728
+ None
729
+ },
730
+ index_to_bounds,
731
+ element_count: raw_elements.len() as u32,
732
+ page_url,
733
+ page_title,
734
+ })
735
+ }
736
+
737
+ /// (async) Get a clustered tree combining elements from multiple sources grouped by spatial proximity.
738
+ ///
739
+ /// Combines accessibility tree (UIA) elements with optional DOM, Omniparser, and Gemini Vision elements,
740
+ /// clustering nearby elements together. Each element is prefixed with its source:
741
+ /// - #u1, #u2... for UIA (accessibility tree)
742
+ /// - #d1, #d2... for DOM (browser content)
743
+ /// - #p1, #p2... for Omniparser (vision AI detection)
744
+ /// - #g1, #g2... for Gemini Vision (AI element detection)
745
+ ///
746
+ /// @param {string} process - Process name to match (e.g., 'chrome', 'notepad').
747
+ /// @param {number} [maxDomElements=100] - Maximum DOM elements to capture for browsers.
748
+ /// @param {boolean} [includeOmniparser=false] - Whether to include Omniparser vision detection.
749
+ /// @param {boolean} [includeGeminiVision=false] - Whether to include Gemini Vision AI detection.
750
+ /// @returns {Promise<ClusteredFormattingResult>} Clustered tree with prefixed indices.
751
+ #[napi]
752
+ pub async fn get_clustered_tree(
753
+ &self,
754
+ process: String,
755
+ max_dom_elements: Option<u32>,
756
+ include_omniparser: Option<bool>,
757
+ include_gemini_vision: Option<bool>,
758
+ ) -> napi::Result<crate::types::ClusteredFormattingResult> {
759
+ use std::collections::HashMap;
760
+
761
+ // Find PID for the process name
762
+ let pid = find_pid_for_process(&self.inner, &process)?;
763
+
764
+ let max_dom_elements = max_dom_elements.unwrap_or(100);
765
+ let include_omniparser = include_omniparser.unwrap_or(false);
766
+ let include_gemini_vision = include_gemini_vision.unwrap_or(false);
767
+
768
+ // Get UIA tree with bounds
769
+ let uia_result = self
770
+ .inner
771
+ .get_window_tree_result(pid, None, None)
772
+ .map_err(map_error)?;
773
+
774
+ // Build UIA bounds cache: HashMap<u32, (role, name, bounds, selector)>
775
+ #[allow(clippy::type_complexity)]
776
+ let mut uia_bounds: HashMap<
777
+ u32,
778
+ (String, String, (f64, f64, f64, f64), Option<String>),
779
+ > = HashMap::new();
780
+
781
+ // Use the formatted result to extract bounds
782
+ let formatted_result = terminator::format_ui_node_as_compact_yaml(&uia_result.tree, 0);
783
+ for (idx, (role, name, bounds, selector)) in formatted_result.index_to_bounds {
784
+ uia_bounds.insert(idx, (role, name, bounds, selector));
785
+ }
786
+
787
+ // Check if this is a browser
788
+ let is_browser = terminator::is_browser_process(pid);
789
+
790
+ // Build DOM bounds cache: HashMap<u32, (tag, identifier, bounds)>
791
+ #[allow(clippy::type_complexity)]
792
+ let mut dom_bounds: HashMap<u32, (String, String, (f64, f64, f64, f64))> = HashMap::new();
793
+
794
+ if is_browser {
795
+ // Try to capture DOM elements
796
+ match self
797
+ .capture_browser_dom(Some(max_dom_elements), Some(true))
798
+ .await
799
+ {
800
+ Ok(dom_result) => {
801
+ for (idx_str, entry) in dom_result.index_to_bounds {
802
+ if let Ok(idx) = idx_str.parse::<u32>() {
803
+ let bounds = (
804
+ entry.bounds.x,
805
+ entry.bounds.y,
806
+ entry.bounds.width,
807
+ entry.bounds.height,
808
+ );
809
+ dom_bounds.insert(idx, (entry.tag, entry.name, bounds));
810
+ }
811
+ }
812
+ }
813
+ Err(_) => {
814
+ // DOM capture failed (e.g., chrome:// page), continue with UIA only
815
+ }
816
+ }
817
+ }
818
+
819
+ // Build Omniparser items cache if requested
820
+ let mut omniparser_items: HashMap<u32, terminator::OmniparserItem> = HashMap::new();
821
+
822
+ if include_omniparser {
823
+ match self
824
+ .perform_omniparser_for_process(process.clone(), None, Some(true))
825
+ .await
826
+ {
827
+ Ok(omni_result) => {
828
+ for (idx_str, entry) in omni_result.index_to_bounds {
829
+ if let Ok(idx) = idx_str.parse::<u32>() {
830
+ omniparser_items.insert(
831
+ idx,
832
+ terminator::OmniparserItem {
833
+ label: entry.label.clone(),
834
+ content: Some(entry.name.clone()),
835
+ box_2d: Some([
836
+ entry.bounds.x,
837
+ entry.bounds.y,
838
+ entry.bounds.x + entry.bounds.width,
839
+ entry.bounds.y + entry.bounds.height,
840
+ ]),
841
+ },
842
+ );
843
+ }
844
+ }
845
+ }
846
+ Err(_) => {
847
+ // Omniparser failed, continue without it
848
+ }
849
+ }
850
+ }
851
+
852
+ // Build Gemini Vision items cache if requested
853
+ let mut vision_items: HashMap<u32, terminator::VisionElement> = HashMap::new();
854
+
855
+ if include_gemini_vision {
856
+ match self
857
+ .perform_gemini_vision_for_process(process.clone(), Some(true))
858
+ .await
859
+ {
860
+ Ok(vision_result) => {
861
+ for (idx_str, entry) in vision_result.index_to_bounds {
862
+ if let Ok(idx) = idx_str.parse::<u32>() {
863
+ vision_items.insert(
864
+ idx,
865
+ terminator::VisionElement {
866
+ element_type: entry.element_type.clone(),
867
+ content: Some(entry.name.clone()),
868
+ description: None,
869
+ box_2d: Some([
870
+ entry.bounds.x,
871
+ entry.bounds.y,
872
+ entry.bounds.x + entry.bounds.width,
873
+ entry.bounds.y + entry.bounds.height,
874
+ ]),
875
+ interactivity: None,
876
+ },
877
+ );
878
+ }
879
+ }
880
+ }
881
+ Err(_) => {
882
+ // Gemini Vision failed, continue without it
883
+ }
884
+ }
885
+ }
886
+
887
+ // Empty cache for OCR (not implemented yet)
888
+ #[allow(clippy::type_complexity)]
889
+ let ocr_bounds: HashMap<u32, (String, (f64, f64, f64, f64))> = HashMap::new();
890
+
891
+ // Call the core clustering function
892
+ let clustered_result = terminator::format_clustered_tree_from_caches(
893
+ &uia_bounds,
894
+ &dom_bounds,
895
+ &ocr_bounds,
896
+ &omniparser_items,
897
+ &vision_items,
898
+ );
899
+
900
+ // Convert to SDK types
901
+ let mut index_to_source_and_bounds: HashMap<String, crate::types::ClusteredBoundsEntry> =
902
+ HashMap::new();
903
+
904
+ for (key, (source, original_idx, (x, y, w, h))) in
905
+ clustered_result.index_to_source_and_bounds
906
+ {
907
+ let sdk_source = match source {
908
+ terminator::ElementSource::Uia => crate::types::ElementSource::Uia,
909
+ terminator::ElementSource::Dom => crate::types::ElementSource::Dom,
910
+ terminator::ElementSource::Ocr => crate::types::ElementSource::Ocr,
911
+ terminator::ElementSource::Omniparser => crate::types::ElementSource::Omniparser,
912
+ terminator::ElementSource::Gemini => crate::types::ElementSource::Gemini,
913
+ };
914
+ index_to_source_and_bounds.insert(
915
+ key,
916
+ crate::types::ClusteredBoundsEntry {
917
+ source: sdk_source,
918
+ original_index: original_idx,
919
+ bounds: crate::types::Bounds {
920
+ x,
921
+ y,
922
+ width: w,
923
+ height: h,
924
+ },
925
+ },
926
+ );
927
+ }
928
+
929
+ Ok(crate::types::ClusteredFormattingResult {
930
+ formatted: clustered_result.formatted,
931
+ index_to_source_and_bounds,
932
+ })
933
+ }
934
+
935
+ /// (async) Perform Gemini vision AI detection on a window by process name.
936
+ ///
937
+ /// Captures a screenshot and sends it to the Gemini vision backend for UI element detection.
938
+ /// Requires GEMINI_VISION_BACKEND_URL environment variable (defaults to https://app.mediar.ai/api/vision/parse).
939
+ ///
940
+ /// @param {string} process - Process name to match (e.g., 'chrome', 'notepad').
941
+ /// @param {boolean} [formatOutput=true] - Whether to include formatted compact YAML output.
942
+ /// @returns {Promise<GeminiVisionResult>} Detected UI elements with bounds for click targeting.
943
+ #[napi]
944
+ pub async fn perform_gemini_vision_for_process(
945
+ &self,
946
+ process: String,
947
+ format_output: Option<bool>,
948
+ ) -> napi::Result<crate::types::GeminiVisionResult> {
949
+ use base64::{engine::general_purpose, Engine};
950
+ use image::imageops::FilterType;
951
+ use image::{codecs::png::PngEncoder, ExtendedColorType, ImageBuffer, ImageEncoder, Rgba};
952
+ use std::collections::HashMap;
953
+ use std::io::Cursor;
954
+
955
+ let format_output = format_output.unwrap_or(true);
956
+
957
+ // Find PID for the process name
958
+ let pid = find_pid_for_process(&self.inner, &process)?;
959
+
960
+ // Find the window element for this process
961
+ let apps = self.inner.applications().map_err(map_error)?;
962
+ let window_element = apps
963
+ .into_iter()
964
+ .find(|app| app.process_id().ok() == Some(pid))
965
+ .ok_or_else(|| {
966
+ napi::Error::from_reason(format!("No window found for process '{}'", process))
967
+ })?;
968
+
969
+ // Get window bounds
970
+ let bounds = window_element.bounds().map_err(map_error)?;
971
+ let (window_x, window_y, win_w, win_h) = bounds;
972
+
973
+ // Capture screenshot
974
+ let screenshot = window_element.capture().map_err(map_error)?;
975
+ let original_width = screenshot.width;
976
+ let original_height = screenshot.height;
977
+
978
+ // Calculate DPI scale
979
+ let dpi_scale_w = original_width as f64 / win_w;
980
+ let dpi_scale_h = original_height as f64 / win_h;
981
+
982
+ // Convert BGRA to RGBA
983
+ let rgba_data: Vec<u8> = screenshot
984
+ .image_data
985
+ .chunks_exact(4)
986
+ .flat_map(|bgra| [bgra[2], bgra[1], bgra[0], bgra[3]])
987
+ .collect();
988
+
989
+ // Resize if needed (max 1920px)
990
+ const MAX_DIM: u32 = 1920;
991
+ let (final_width, final_height, final_rgba_data, scale_factor) = if original_width > MAX_DIM
992
+ || original_height > MAX_DIM
993
+ {
994
+ let scale = (MAX_DIM as f32 / original_width.max(original_height) as f32).min(1.0);
995
+ let new_width = (original_width as f32 * scale).round() as u32;
996
+ let new_height = (original_height as f32 * scale).round() as u32;
997
+
998
+ let img =
999
+ ImageBuffer::<Rgba<u8>, _>::from_raw(original_width, original_height, rgba_data)
1000
+ .ok_or_else(|| napi::Error::from_reason("Failed to create image buffer"))?;
1001
+
1002
+ let resized =
1003
+ image::imageops::resize(&img, new_width, new_height, FilterType::Lanczos3);
1004
+ (new_width, new_height, resized.into_raw(), scale as f64)
1005
+ } else {
1006
+ (original_width, original_height, rgba_data, 1.0)
1007
+ };
1008
+
1009
+ // Encode to PNG
1010
+ let mut png_data = Vec::new();
1011
+ let encoder = PngEncoder::new(Cursor::new(&mut png_data));
1012
+ encoder
1013
+ .write_image(
1014
+ &final_rgba_data,
1015
+ final_width,
1016
+ final_height,
1017
+ ExtendedColorType::Rgba8,
1018
+ )
1019
+ .map_err(|e| napi::Error::from_reason(format!("Failed to encode PNG: {e}")))?;
1020
+
1021
+ let base64_image = general_purpose::STANDARD.encode(&png_data);
1022
+
1023
+ // Call Gemini Vision backend
1024
+ let backend_url = std::env::var("GEMINI_VISION_BACKEND_URL")
1025
+ .unwrap_or_else(|_| "https://app.mediar.ai/api/vision/parse".to_string());
1026
+
1027
+ let client = reqwest::Client::builder()
1028
+ .timeout(std::time::Duration::from_secs(300))
1029
+ .build()
1030
+ .map_err(|e| napi::Error::from_reason(format!("Failed to create HTTP client: {e}")))?;
1031
+
1032
+ let payload = serde_json::json!({
1033
+ "image": base64_image,
1034
+ "model": "gemini",
1035
+ "prompt": "Detect all UI elements in this screenshot. Return their type, content, description, bounding boxes, and interactivity."
1036
+ });
1037
+
1038
+ let resp = client
1039
+ .post(&backend_url)
1040
+ .header("Content-Type", "application/json")
1041
+ .json(&payload)
1042
+ .send()
1043
+ .await
1044
+ .map_err(|e| napi::Error::from_reason(format!("Vision backend request failed: {e}")))?;
1045
+
1046
+ if !resp.status().is_success() {
1047
+ let text = resp.text().await.unwrap_or_default();
1048
+ return Err(napi::Error::from_reason(format!(
1049
+ "Vision backend error: {}",
1050
+ text
1051
+ )));
1052
+ }
1053
+
1054
+ let response_text = resp
1055
+ .text()
1056
+ .await
1057
+ .map_err(|e| napi::Error::from_reason(format!("Failed to read response: {e}")))?;
1058
+
1059
+ let parsed: serde_json::Value = serde_json::from_str(&response_text)
1060
+ .map_err(|e| napi::Error::from_reason(format!("Failed to parse response: {e}")))?;
1061
+
1062
+ if let Some(error) = parsed.get("error").and_then(|v| v.as_str()) {
1063
+ return Err(napi::Error::from_reason(format!("Vision error: {}", error)));
1064
+ }
1065
+
1066
+ let raw_elements = parsed
1067
+ .get("elements")
1068
+ .and_then(|v| v.as_array())
1069
+ .cloned()
1070
+ .unwrap_or_default();
1071
+
1072
+ // Convert to VisionElement with absolute screen coordinates
1073
+ let mut elements = Vec::new();
1074
+ let mut index_to_bounds: HashMap<String, crate::types::VisionBoundsEntry> = HashMap::new();
1075
+ let mut formatted_lines: Vec<String> = Vec::new();
1076
+
1077
+ if format_output {
1078
+ formatted_lines.push(format!(
1079
+ "Gemini Vision: {} elements (PID: {})",
1080
+ raw_elements.len(),
1081
+ pid
1082
+ ));
1083
+ }
1084
+
1085
+ let inv_scale = 1.0 / scale_factor;
1086
+
1087
+ for (i, elem) in raw_elements.iter().enumerate() {
1088
+ let idx = i + 1;
1089
+ let element_type = elem
1090
+ .get("type")
1091
+ .and_then(|v| v.as_str())
1092
+ .unwrap_or("unknown")
1093
+ .to_string();
1094
+ let content = elem
1095
+ .get("content")
1096
+ .and_then(|v| v.as_str())
1097
+ .filter(|s| !s.is_empty())
1098
+ .map(String::from);
1099
+ let description = elem
1100
+ .get("description")
1101
+ .and_then(|v| v.as_str())
1102
+ .filter(|s| !s.is_empty())
1103
+ .map(String::from);
1104
+ let interactivity = elem.get("interactivity").and_then(|v| v.as_bool());
1105
+
1106
+ // Get normalized bbox [x1, y1, x2, y2] from 0-1
1107
+ let bbox = elem.get("bbox").and_then(|v| v.as_array());
1108
+ let bounds = bbox.and_then(|arr| {
1109
+ if arr.len() >= 4 {
1110
+ let x1 = arr[0].as_f64()? * final_width as f64;
1111
+ let y1 = arr[1].as_f64()? * final_height as f64;
1112
+ let x2 = arr[2].as_f64()? * final_width as f64;
1113
+ let y2 = arr[3].as_f64()? * final_height as f64;
1114
+
1115
+ // Scale back to original size and convert to logical screen coords
1116
+ let abs_x = window_x + (x1 * inv_scale / dpi_scale_w);
1117
+ let abs_y = window_y + (y1 * inv_scale / dpi_scale_h);
1118
+ let abs_w = (x2 - x1) * inv_scale / dpi_scale_w;
1119
+ let abs_h = (y2 - y1) * inv_scale / dpi_scale_h;
1120
+
1121
+ Some(crate::types::Bounds {
1122
+ x: abs_x,
1123
+ y: abs_y,
1124
+ width: abs_w,
1125
+ height: abs_h,
1126
+ })
1127
+ } else {
1128
+ None
1129
+ }
1130
+ });
1131
+
1132
+ // Display name for index_to_bounds
1133
+ let display_name = content
1134
+ .as_ref()
1135
+ .cloned()
1136
+ .or_else(|| description.clone())
1137
+ .unwrap_or_else(|| format!("<{}>", element_type));
1138
+
1139
+ // Format line for compact YAML
1140
+ if format_output {
1141
+ let mut line_parts = vec![format!("#{} [{}]", idx, element_type.to_uppercase())];
1142
+ if let Some(ref c) = content {
1143
+ let truncated = if c.len() > 40 {
1144
+ format!("{}...", &c[..40])
1145
+ } else {
1146
+ c.clone()
1147
+ };
1148
+ line_parts.push(truncated);
1149
+ }
1150
+ if let Some(ref d) = description {
1151
+ let truncated = if d.len() > 30 {
1152
+ format!("{}...", &d[..30])
1153
+ } else {
1154
+ d.clone()
1155
+ };
1156
+ line_parts.push(format!("desc:{}", truncated));
1157
+ }
1158
+ if interactivity == Some(true) {
1159
+ line_parts.push("interactive".to_string());
1160
+ }
1161
+ formatted_lines.push(format!(" {}", line_parts.join(" ")));
1162
+ }
1163
+
1164
+ if let Some(ref b) = bounds {
1165
+ index_to_bounds.insert(
1166
+ idx.to_string(),
1167
+ crate::types::VisionBoundsEntry {
1168
+ name: display_name.clone(),
1169
+ element_type: element_type.clone(),
1170
+ bounds: b.clone(),
1171
+ },
1172
+ );
1173
+ }
1174
+
1175
+ elements.push(crate::types::VisionElement {
1176
+ element_type,
1177
+ content,
1178
+ description,
1179
+ bounds,
1180
+ interactivity,
1181
+ });
1182
+ }
1183
+
1184
+ // Populate the Vision cache for click_by_index support
1185
+ let cache_items: HashMap<u32, terminator::VisionElement> = elements
1186
+ .iter()
1187
+ .enumerate()
1188
+ .map(|(i, elem)| {
1189
+ let box_2d = elem
1190
+ .bounds
1191
+ .as_ref()
1192
+ .map(|b| [b.x, b.y, b.x + b.width, b.y + b.height]);
1193
+ (
1194
+ (i + 1) as u32,
1195
+ terminator::VisionElement {
1196
+ element_type: elem.element_type.clone(),
1197
+ content: elem.content.clone(),
1198
+ description: elem.description.clone(),
1199
+ box_2d,
1200
+ interactivity: elem.interactivity,
1201
+ },
1202
+ )
1203
+ })
1204
+ .collect();
1205
+ self.inner.populate_vision_cache(cache_items);
1206
+
1207
+ Ok(crate::types::GeminiVisionResult {
1208
+ elements,
1209
+ formatted: if format_output {
1210
+ Some(formatted_lines.join("\n"))
1211
+ } else {
1212
+ None
1213
+ },
1214
+ index_to_bounds,
1215
+ element_count: raw_elements.len() as u32,
1216
+ })
1217
+ }
1218
+
1219
+ /// (async) Perform Omniparser V2 detection on a window by process name.
1220
+ ///
1221
+ /// Captures a screenshot and sends it to the Omniparser backend for icon/field detection.
1222
+ /// Requires OMNIPARSER_BACKEND_URL environment variable (defaults to https://app.mediar.ai/api/omniparser/parse).
1223
+ ///
1224
+ /// @param {string} process - Process name to match (e.g., 'chrome', 'notepad').
1225
+ /// @param {number} [imgsz=1920] - Icon detection image size (640-1920). Higher = better but slower.
1226
+ /// @param {boolean} [formatOutput=true] - Whether to include formatted compact YAML output.
1227
+ /// @returns {Promise<OmniparserResult>} Detected items with bounds for click targeting.
1228
+ #[napi]
1229
+ pub async fn perform_omniparser_for_process(
1230
+ &self,
1231
+ process: String,
1232
+ imgsz: Option<u32>,
1233
+ format_output: Option<bool>,
1234
+ ) -> napi::Result<crate::types::OmniparserResult> {
1235
+ use base64::{engine::general_purpose, Engine};
1236
+ use image::imageops::FilterType;
1237
+ use image::{codecs::png::PngEncoder, ExtendedColorType, ImageBuffer, ImageEncoder, Rgba};
1238
+ use std::collections::HashMap;
1239
+ use std::io::Cursor;
1240
+
1241
+ let imgsz = imgsz.unwrap_or(1920).clamp(640, 1920);
1242
+ let format_output = format_output.unwrap_or(true);
1243
+
1244
+ // Find PID for the process name
1245
+ let pid = find_pid_for_process(&self.inner, &process)?;
1246
+
1247
+ // Find the window element for this process
1248
+ let apps = self.inner.applications().map_err(map_error)?;
1249
+ let window_element = apps
1250
+ .into_iter()
1251
+ .find(|app| app.process_id().ok() == Some(pid))
1252
+ .ok_or_else(|| {
1253
+ napi::Error::from_reason(format!("No window found for process '{}'", process))
1254
+ })?;
1255
+
1256
+ // Get window bounds
1257
+ let bounds = window_element.bounds().map_err(map_error)?;
1258
+ let (window_x, window_y, win_w, win_h) = bounds;
1259
+
1260
+ // Capture screenshot
1261
+ let screenshot = window_element.capture().map_err(map_error)?;
1262
+ let original_width = screenshot.width;
1263
+ let original_height = screenshot.height;
1264
+
1265
+ // Calculate DPI scale
1266
+ let dpi_scale_w = original_width as f64 / win_w;
1267
+ let dpi_scale_h = original_height as f64 / win_h;
1268
+
1269
+ // Convert BGRA to RGBA
1270
+ let rgba_data: Vec<u8> = screenshot
1271
+ .image_data
1272
+ .chunks_exact(4)
1273
+ .flat_map(|bgra| [bgra[2], bgra[1], bgra[0], bgra[3]])
1274
+ .collect();
1275
+
1276
+ // Resize if needed (max 1920px)
1277
+ const MAX_DIM: u32 = 1920;
1278
+ let (final_width, final_height, final_rgba_data, scale_factor) = if original_width > MAX_DIM
1279
+ || original_height > MAX_DIM
1280
+ {
1281
+ let scale = (MAX_DIM as f32 / original_width.max(original_height) as f32).min(1.0);
1282
+ let new_width = (original_width as f32 * scale).round() as u32;
1283
+ let new_height = (original_height as f32 * scale).round() as u32;
1284
+
1285
+ let img =
1286
+ ImageBuffer::<Rgba<u8>, _>::from_raw(original_width, original_height, rgba_data)
1287
+ .ok_or_else(|| napi::Error::from_reason("Failed to create image buffer"))?;
1288
+
1289
+ let resized =
1290
+ image::imageops::resize(&img, new_width, new_height, FilterType::Lanczos3);
1291
+ (new_width, new_height, resized.into_raw(), scale as f64)
1292
+ } else {
1293
+ (original_width, original_height, rgba_data, 1.0)
1294
+ };
1295
+
1296
+ // Encode to PNG
1297
+ let mut png_data = Vec::new();
1298
+ let encoder = PngEncoder::new(Cursor::new(&mut png_data));
1299
+ encoder
1300
+ .write_image(
1301
+ &final_rgba_data,
1302
+ final_width,
1303
+ final_height,
1304
+ ExtendedColorType::Rgba8,
1305
+ )
1306
+ .map_err(|e| napi::Error::from_reason(format!("Failed to encode PNG: {e}")))?;
1307
+
1308
+ let base64_image = general_purpose::STANDARD.encode(&png_data);
1309
+
1310
+ // Call Omniparser backend
1311
+ let backend_url = std::env::var("OMNIPARSER_BACKEND_URL")
1312
+ .unwrap_or_else(|_| "https://app.mediar.ai/api/omniparser/parse".to_string());
1313
+
1314
+ let client = reqwest::Client::builder()
1315
+ .timeout(std::time::Duration::from_secs(300))
1316
+ .build()
1317
+ .map_err(|e| napi::Error::from_reason(format!("Failed to create HTTP client: {e}")))?;
1318
+
1319
+ let payload = serde_json::json!({
1320
+ "image": base64_image,
1321
+ "imgsz": imgsz
1322
+ });
1323
+
1324
+ let resp = client
1325
+ .post(&backend_url)
1326
+ .header("Content-Type", "application/json")
1327
+ .json(&payload)
1328
+ .send()
1329
+ .await
1330
+ .map_err(|e| {
1331
+ napi::Error::from_reason(format!("Omniparser backend request failed: {e}"))
1332
+ })?;
1333
+
1334
+ if !resp.status().is_success() {
1335
+ let text = resp.text().await.unwrap_or_default();
1336
+ return Err(napi::Error::from_reason(format!(
1337
+ "Omniparser backend error: {}",
1338
+ text
1339
+ )));
1340
+ }
1341
+
1342
+ let response_text = resp
1343
+ .text()
1344
+ .await
1345
+ .map_err(|e| napi::Error::from_reason(format!("Failed to read response: {e}")))?;
1346
+
1347
+ let parsed: serde_json::Value = serde_json::from_str(&response_text)
1348
+ .map_err(|e| napi::Error::from_reason(format!("Failed to parse response: {e}")))?;
1349
+
1350
+ if let Some(error) = parsed.get("error").and_then(|v| v.as_str()) {
1351
+ return Err(napi::Error::from_reason(format!(
1352
+ "Omniparser error: {}",
1353
+ error
1354
+ )));
1355
+ }
1356
+
1357
+ let raw_elements = parsed
1358
+ .get("elements")
1359
+ .and_then(|v| v.as_array())
1360
+ .cloned()
1361
+ .unwrap_or_default();
1362
+
1363
+ // Convert to OmniparserItem with absolute screen coordinates
1364
+ let mut items = Vec::new();
1365
+ let mut index_to_bounds: HashMap<String, crate::types::OmniparserBoundsEntry> =
1366
+ HashMap::new();
1367
+ let mut formatted_lines: Vec<String> = Vec::new();
1368
+
1369
+ if format_output {
1370
+ formatted_lines.push(format!(
1371
+ "Omniparser: {} items (PID: {})",
1372
+ raw_elements.len(),
1373
+ pid
1374
+ ));
1375
+ }
1376
+
1377
+ let inv_scale = 1.0 / scale_factor;
1378
+
1379
+ for (i, elem) in raw_elements.iter().enumerate() {
1380
+ let idx = i + 1;
1381
+ let label = elem
1382
+ .get("type")
1383
+ .and_then(|v| v.as_str())
1384
+ .unwrap_or("unknown")
1385
+ .to_string();
1386
+ let content = elem
1387
+ .get("content")
1388
+ .and_then(|v| v.as_str())
1389
+ .filter(|s| !s.is_empty())
1390
+ .map(String::from);
1391
+
1392
+ // Get normalized bbox [x1, y1, x2, y2] from 0-1
1393
+ let bbox = elem.get("bbox").and_then(|v| v.as_array());
1394
+ let bounds = bbox.and_then(|arr| {
1395
+ if arr.len() >= 4 {
1396
+ let x1 = arr[0].as_f64()? * final_width as f64;
1397
+ let y1 = arr[1].as_f64()? * final_height as f64;
1398
+ let x2 = arr[2].as_f64()? * final_width as f64;
1399
+ let y2 = arr[3].as_f64()? * final_height as f64;
1400
+
1401
+ // Scale back to original size and convert to logical screen coords
1402
+ let abs_x = window_x + (x1 * inv_scale / dpi_scale_w);
1403
+ let abs_y = window_y + (y1 * inv_scale / dpi_scale_h);
1404
+ let abs_w = (x2 - x1) * inv_scale / dpi_scale_w;
1405
+ let abs_h = (y2 - y1) * inv_scale / dpi_scale_h;
1406
+
1407
+ Some(crate::types::Bounds {
1408
+ x: abs_x,
1409
+ y: abs_y,
1410
+ width: abs_w,
1411
+ height: abs_h,
1412
+ })
1413
+ } else {
1414
+ None
1415
+ }
1416
+ });
1417
+
1418
+ // Display name for index_to_bounds
1419
+ let display_name = content
1420
+ .as_ref()
1421
+ .cloned()
1422
+ .unwrap_or_else(|| format!("<{}>", label));
1423
+
1424
+ // Format line for compact YAML
1425
+ if format_output {
1426
+ let mut line_parts = vec![format!("#{} [{}]", idx, label.to_uppercase())];
1427
+ if let Some(ref c) = content {
1428
+ let truncated = if c.len() > 50 {
1429
+ format!("{}...", &c[..50])
1430
+ } else {
1431
+ c.clone()
1432
+ };
1433
+ line_parts.push(truncated);
1434
+ }
1435
+ formatted_lines.push(format!(" {}", line_parts.join(" ")));
1436
+ }
1437
+
1438
+ if let Some(ref b) = bounds {
1439
+ index_to_bounds.insert(
1440
+ idx.to_string(),
1441
+ crate::types::OmniparserBoundsEntry {
1442
+ name: display_name.clone(),
1443
+ label: label.clone(),
1444
+ bounds: b.clone(),
1445
+ },
1446
+ );
1447
+ }
1448
+
1449
+ items.push(crate::types::OmniparserItem {
1450
+ label,
1451
+ content,
1452
+ bounds,
1453
+ });
1454
+ }
1455
+
1456
+ // Populate the Omniparser cache for click_by_index support
1457
+ let cache_items: HashMap<u32, terminator::OmniparserItem> = items
1458
+ .iter()
1459
+ .enumerate()
1460
+ .map(|(i, item)| {
1461
+ let box_2d = item
1462
+ .bounds
1463
+ .as_ref()
1464
+ .map(|b| [b.x, b.y, b.x + b.width, b.y + b.height]);
1465
+ (
1466
+ (i + 1) as u32,
1467
+ terminator::OmniparserItem {
1468
+ label: item.label.clone(),
1469
+ content: item.content.clone(),
1470
+ box_2d,
1471
+ },
1472
+ )
1473
+ })
1474
+ .collect();
1475
+ self.inner.populate_omniparser_cache(cache_items);
1476
+
1477
+ Ok(crate::types::OmniparserResult {
1478
+ items,
1479
+ formatted: if format_output {
1480
+ Some(formatted_lines.join("\n"))
1481
+ } else {
1482
+ None
1483
+ },
1484
+ index_to_bounds,
1485
+ item_count: raw_elements.len() as u32,
1486
+ })
1487
+ }
1488
+
195
1489
  /// (async) Get the currently focused browser window.
196
1490
  ///
197
1491
  /// @returns {Promise<Element>} The current browser window element.
@@ -222,6 +1516,45 @@ impl Desktop {
222
1516
  Ok(Locator::from(loc))
223
1517
  }
224
1518
 
1519
+ /// Create a process-scoped locator for finding UI elements.
1520
+ /// This is the recommended way to create locators - always scope to a specific process.
1521
+ ///
1522
+ /// @param {string} process - Process name to scope the search (e.g., 'chrome', 'notepad').
1523
+ /// @param {string | Selector} selector - The selector to find within the process.
1524
+ /// @param {string} [windowSelector] - Optional window selector for additional filtering.
1525
+ /// @returns {Locator} A locator for finding elements within the process.
1526
+ #[napi]
1527
+ pub fn locator_for_process(
1528
+ &self,
1529
+ process: String,
1530
+ #[napi(ts_arg_type = "string | Selector")] selector: Either<String, &Selector>,
1531
+ window_selector: Option<String>,
1532
+ ) -> napi::Result<Locator> {
1533
+ use napi::bindgen_prelude::Either::*;
1534
+
1535
+ // Build the full selector string like MCP does
1536
+ let selector_str = match &selector {
1537
+ A(sel_str) => sel_str.clone(),
1538
+ B(sel_obj) => format!("{:?}", sel_obj.inner),
1539
+ };
1540
+
1541
+ let full_selector = if selector_str.is_empty() {
1542
+ if let Some(window_sel) = window_selector {
1543
+ format!("process:{} >> {}", process, window_sel)
1544
+ } else {
1545
+ format!("process:{}", process)
1546
+ }
1547
+ } else if let Some(window_sel) = window_selector {
1548
+ format!("process:{} >> {} >> {}", process, window_sel, selector_str)
1549
+ } else {
1550
+ format!("process:{} >> {}", process, selector_str)
1551
+ };
1552
+
1553
+ let sel_rust: terminator::selector::Selector = full_selector.as_str().into();
1554
+ let loc = self.inner.locator(sel_rust);
1555
+ Ok(Locator::from(loc))
1556
+ }
1557
+
225
1558
  /// (async) Get the currently focused window.
226
1559
  ///
227
1560
  /// @returns {Promise<Element>} The current window element.
@@ -261,8 +1594,16 @@ impl Desktop {
261
1594
  ///
262
1595
  /// @param {string} url - The URL to open.
263
1596
  /// @param {string} [browser] - The browser to use. Can be "Default", "Chrome", "Firefox", "Edge", "Brave", "Opera", "Vivaldi", or a custom browser path.
1597
+ /// @param {boolean} [includeWindowScreenshot=true] - Whether to capture window screenshot after opening
1598
+ /// @param {boolean} [includeMonitorScreenshots=false] - Whether to capture monitor screenshots after opening
264
1599
  #[napi]
265
- pub fn open_url(&self, url: String, browser: Option<String>) -> napi::Result<Element> {
1600
+ pub fn open_url(
1601
+ &self,
1602
+ url: String,
1603
+ browser: Option<String>,
1604
+ include_window_screenshot: Option<bool>,
1605
+ include_monitor_screenshots: Option<bool>,
1606
+ ) -> napi::Result<Element> {
266
1607
  let browser_enum = browser.map(|b| match b.to_lowercase().as_str() {
267
1608
  "default" => terminator::Browser::Default,
268
1609
  "chrome" => terminator::Browser::Chrome,
@@ -273,10 +1614,18 @@ impl Desktop {
273
1614
  "vivaldi" => terminator::Browser::Vivaldi,
274
1615
  custom => terminator::Browser::Custom(custom.to_string()),
275
1616
  });
276
- self.inner
277
- .open_url(&url, browser_enum)
278
- .map(Element::from)
279
- .map_err(map_error)
1617
+ let element = self.inner.open_url(&url, browser_enum).map_err(map_error)?;
1618
+
1619
+ // Capture screenshots if enabled (window default: true, monitor default: false)
1620
+ let _screenshots = capture_screenshots(
1621
+ &self.inner,
1622
+ element.process_id().ok(),
1623
+ include_window_screenshot.unwrap_or(true),
1624
+ include_monitor_screenshots.unwrap_or(false),
1625
+ "openUrl",
1626
+ );
1627
+
1628
+ Ok(Element::from(element))
280
1629
  }
281
1630
 
282
1631
  /// Open a file with its default application.
@@ -297,19 +1646,22 @@ impl Desktop {
297
1646
  .map_err(map_error)
298
1647
  }
299
1648
 
300
- /// Get the UI tree for a window identified by process ID and optional title.
1649
+ /// Get the UI tree for a window identified by process name and optional title.
301
1650
  ///
302
- /// @param {number} pid - Process ID of the target application.
1651
+ /// @param {string} process - Process name to match (e.g., 'chrome', 'notepad').
303
1652
  /// @param {string} [title] - Optional window title filter.
304
1653
  /// @param {TreeBuildConfig} [config] - Optional configuration for tree building.
305
1654
  /// @returns {UINode} Complete UI tree starting from the identified window.
306
1655
  #[napi]
307
1656
  pub fn get_window_tree(
308
1657
  &self,
309
- pid: u32,
1658
+ process: String,
310
1659
  title: Option<String>,
311
1660
  config: Option<TreeBuildConfig>,
312
1661
  ) -> napi::Result<UINode> {
1662
+ // Find PID for the process name
1663
+ let pid = find_pid_for_process(&self.inner, &process)?;
1664
+
313
1665
  let rust_config = config.map(|c| c.into());
314
1666
  self.inner
315
1667
  .get_window_tree(pid, title.as_deref(), rust_config)
@@ -317,6 +1669,342 @@ impl Desktop {
317
1669
  .map_err(map_error)
318
1670
  }
319
1671
 
1672
+ /// Get the UI tree with full result including formatting and bounds mapping.
1673
+ ///
1674
+ /// This is the recommended method for getting window trees when you need:
1675
+ /// - Formatted YAML output for LLM consumption
1676
+ /// - Index-to-bounds mapping for click targeting
1677
+ /// - Browser detection
1678
+ ///
1679
+ /// @param {string} process - Process name to match (e.g., 'chrome', 'notepad').
1680
+ /// @param {string} [title] - Optional window title filter.
1681
+ /// @param {TreeBuildConfig} [config] - Configuration options:
1682
+ /// - formatOutput: Enable formatted output (default: true if treeOutputFormat set)
1683
+ /// - treeOutputFormat: 'CompactYaml' (default) or 'VerboseJson'
1684
+ /// - treeFromSelector: Selector to start tree from (use getWindowTreeResultAsync for this)
1685
+ /// - includeWindowScreenshot: Save window screenshot to executions dir (default: false)
1686
+ /// - includeMonitorScreenshots: Save all monitor screenshots to executions dir (default: false)
1687
+ /// @returns {WindowTreeResult} Complete result with tree, formatted output, bounds mapping, and screenshot paths.
1688
+ #[napi]
1689
+ pub fn get_window_tree_result(
1690
+ &self,
1691
+ process: String,
1692
+ title: Option<String>,
1693
+ config: Option<TreeBuildConfig>,
1694
+ ) -> napi::Result<WindowTreeResult> {
1695
+ // Find PID for the process name
1696
+ let pid = find_pid_for_process(&self.inner, &process)?;
1697
+
1698
+ // Extract screenshot options (window: true, monitor: false by default)
1699
+ let include_window_screenshot = config
1700
+ .as_ref()
1701
+ .and_then(|c| c.include_window_screenshot)
1702
+ .unwrap_or(true);
1703
+ let include_monitor_screenshots = config
1704
+ .as_ref()
1705
+ .and_then(|c| c.include_monitor_screenshots)
1706
+ .unwrap_or(false);
1707
+
1708
+ // Extract options before converting config
1709
+ let output_format = config
1710
+ .as_ref()
1711
+ .and_then(|c| c.tree_output_format)
1712
+ .unwrap_or(TreeOutputFormat::CompactYaml);
1713
+
1714
+ // If format is VerboseJson, we don't need formatted output from core
1715
+ // ClusteredYaml is treated like CompactYaml (needs format_output = true)
1716
+ let rust_config = config.map(|mut c| {
1717
+ if matches!(output_format, TreeOutputFormat::VerboseJson) {
1718
+ c.format_output = Some(false);
1719
+ } else if c.format_output.is_none() {
1720
+ c.format_output = Some(true);
1721
+ }
1722
+ c.into()
1723
+ });
1724
+
1725
+ let result = self
1726
+ .inner
1727
+ .get_window_tree_result(pid, title.as_deref(), rust_config)
1728
+ .map_err(map_error)?;
1729
+
1730
+ // Convert and handle format
1731
+ let mut sdk_result = WindowTreeResult::from(result);
1732
+
1733
+ // For VerboseJson, serialize the tree as the formatted output
1734
+ if matches!(output_format, TreeOutputFormat::VerboseJson) {
1735
+ sdk_result.formatted =
1736
+ Some(serde_json::to_string_pretty(&sdk_result.tree).unwrap_or_default());
1737
+ }
1738
+
1739
+ // Handle screenshot capture and saving using helper
1740
+ let screenshots = capture_screenshots(
1741
+ &self.inner,
1742
+ Some(pid),
1743
+ include_window_screenshot,
1744
+ include_monitor_screenshots,
1745
+ "getWindowTreeResult",
1746
+ );
1747
+ sdk_result.window_screenshot_path = screenshots.window_path;
1748
+ sdk_result.monitor_screenshot_paths = screenshots.monitor_paths;
1749
+
1750
+ Ok(sdk_result)
1751
+ }
1752
+
1753
+ /// (async) Get the UI tree with full result, supporting tree_from_selector.
1754
+ ///
1755
+ /// Use this method when you need to scope the tree to a specific subtree using a selector.
1756
+ ///
1757
+ /// @param {string} process - Process name to match (e.g., 'chrome', 'notepad').
1758
+ /// @param {string} [title] - Optional window title filter.
1759
+ /// @param {TreeBuildConfig} [config] - Configuration options:
1760
+ /// - formatOutput: Enable formatted output (default: true)
1761
+ /// - treeOutputFormat: 'CompactYaml' (default) or 'VerboseJson'
1762
+ /// - treeFromSelector: Selector to start tree from (e.g., "role:Dialog")
1763
+ /// @returns {Promise<WindowTreeResult>} Complete result with tree, formatted output, and bounds mapping.
1764
+ #[napi]
1765
+ pub async fn get_window_tree_result_async(
1766
+ &self,
1767
+ process: String,
1768
+ title: Option<String>,
1769
+ config: Option<TreeBuildConfig>,
1770
+ ) -> napi::Result<WindowTreeResult> {
1771
+ use std::collections::HashMap;
1772
+
1773
+ // Find PID for the process name
1774
+ let pid = find_pid_for_process(&self.inner, &process)?;
1775
+
1776
+ // Extract vision and format options from config
1777
+ let include_gemini_vision = config
1778
+ .as_ref()
1779
+ .and_then(|c| c.include_gemini_vision)
1780
+ .unwrap_or(false);
1781
+ let include_omniparser = config
1782
+ .as_ref()
1783
+ .and_then(|c| c.include_omniparser)
1784
+ .unwrap_or(false);
1785
+ let include_ocr = config.as_ref().and_then(|c| c.include_ocr).unwrap_or(false);
1786
+ let include_browser_dom = config
1787
+ .as_ref()
1788
+ .and_then(|c| c.include_browser_dom)
1789
+ .unwrap_or(false);
1790
+ let output_format = config
1791
+ .as_ref()
1792
+ .and_then(|c| c.tree_output_format)
1793
+ .unwrap_or(TreeOutputFormat::CompactYaml);
1794
+
1795
+ let has_vision_options =
1796
+ include_gemini_vision || include_omniparser || include_ocr || include_browser_dom;
1797
+
1798
+ // Build rust config with from_selector passed through
1799
+ let rust_config = config.as_ref().map(|c| {
1800
+ let mut c_clone = TreeBuildConfig {
1801
+ property_mode: c.property_mode,
1802
+ timeout_per_operation_ms: c.timeout_per_operation_ms,
1803
+ yield_every_n_elements: c.yield_every_n_elements,
1804
+ batch_size: c.batch_size,
1805
+ max_depth: c.max_depth,
1806
+ ui_settle_delay_ms: c.ui_settle_delay_ms,
1807
+ format_output: c.format_output,
1808
+ tree_output_format: c.tree_output_format,
1809
+ tree_from_selector: c.tree_from_selector.clone(),
1810
+ include_window_screenshot: c.include_window_screenshot,
1811
+ include_monitor_screenshots: c.include_monitor_screenshots,
1812
+ include_gemini_vision: None,
1813
+ include_omniparser: None,
1814
+ include_ocr: None,
1815
+ include_browser_dom: None,
1816
+ };
1817
+ if matches!(output_format, TreeOutputFormat::VerboseJson) {
1818
+ c_clone.format_output = Some(false);
1819
+ } else if c_clone.format_output.is_none() {
1820
+ c_clone.format_output = Some(true);
1821
+ }
1822
+ c_clone.into()
1823
+ });
1824
+
1825
+ // Get UIA tree (always)
1826
+ let result = self
1827
+ .inner
1828
+ .get_window_tree_result_async(pid, title.as_deref(), rust_config)
1829
+ .await
1830
+ .map_err(map_error)?;
1831
+
1832
+ let mut sdk_result = WindowTreeResult::from(result);
1833
+
1834
+ // If no vision options and not clustered format, return simple result
1835
+ if !has_vision_options && !matches!(output_format, TreeOutputFormat::ClusteredYaml) {
1836
+ if matches!(output_format, TreeOutputFormat::VerboseJson) {
1837
+ sdk_result.formatted =
1838
+ Some(serde_json::to_string_pretty(&sdk_result.tree).unwrap_or_default());
1839
+ }
1840
+ return Ok(sdk_result);
1841
+ }
1842
+
1843
+ // Build UIA bounds cache from formatted result
1844
+ #[allow(clippy::type_complexity)]
1845
+ let mut uia_bounds: HashMap<
1846
+ u32,
1847
+ (String, String, (f64, f64, f64, f64), Option<String>),
1848
+ > = HashMap::new();
1849
+ let uia_tree_result = self
1850
+ .inner
1851
+ .get_window_tree_result(pid, None, None)
1852
+ .map_err(map_error)?;
1853
+ let formatted_result = terminator::format_ui_node_as_compact_yaml(&uia_tree_result.tree, 0);
1854
+ for (idx, (role, name, bounds, selector)) in formatted_result.index_to_bounds {
1855
+ uia_bounds.insert(idx, (role, name, bounds, selector));
1856
+ }
1857
+
1858
+ // Build DOM bounds cache if requested
1859
+ #[allow(clippy::type_complexity)]
1860
+ let mut dom_bounds: HashMap<u32, (String, String, (f64, f64, f64, f64))> = HashMap::new();
1861
+ if include_browser_dom && terminator::is_browser_process(pid) {
1862
+ if let Ok(dom_result) = self.capture_browser_dom(Some(100), Some(true)).await {
1863
+ for (idx_str, entry) in dom_result.index_to_bounds {
1864
+ if let Ok(idx) = idx_str.parse::<u32>() {
1865
+ let bounds = (
1866
+ entry.bounds.x,
1867
+ entry.bounds.y,
1868
+ entry.bounds.width,
1869
+ entry.bounds.height,
1870
+ );
1871
+ dom_bounds.insert(idx, (entry.tag, entry.name, bounds));
1872
+ }
1873
+ }
1874
+ }
1875
+ }
1876
+
1877
+ // Build Omniparser cache if requested
1878
+ let mut omniparser_items: HashMap<u32, terminator::OmniparserItem> = HashMap::new();
1879
+ if include_omniparser {
1880
+ if let Ok(omni_result) = self
1881
+ .perform_omniparser_for_process(process.clone(), None, Some(true))
1882
+ .await
1883
+ {
1884
+ for (idx_str, entry) in omni_result.index_to_bounds {
1885
+ if let Ok(idx) = idx_str.parse::<u32>() {
1886
+ omniparser_items.insert(
1887
+ idx,
1888
+ terminator::OmniparserItem {
1889
+ label: entry.label.clone(),
1890
+ content: Some(entry.name.clone()),
1891
+ box_2d: Some([
1892
+ entry.bounds.x,
1893
+ entry.bounds.y,
1894
+ entry.bounds.x + entry.bounds.width,
1895
+ entry.bounds.y + entry.bounds.height,
1896
+ ]),
1897
+ },
1898
+ );
1899
+ }
1900
+ }
1901
+ }
1902
+ }
1903
+
1904
+ // Build Gemini Vision cache if requested
1905
+ let mut vision_items: HashMap<u32, terminator::VisionElement> = HashMap::new();
1906
+ if include_gemini_vision {
1907
+ if let Ok(vision_result) = self
1908
+ .perform_gemini_vision_for_process(process.clone(), Some(true))
1909
+ .await
1910
+ {
1911
+ for (idx_str, entry) in vision_result.index_to_bounds {
1912
+ if let Ok(idx) = idx_str.parse::<u32>() {
1913
+ vision_items.insert(
1914
+ idx,
1915
+ terminator::VisionElement {
1916
+ element_type: entry.element_type.clone(),
1917
+ content: Some(entry.name.clone()),
1918
+ description: None,
1919
+ box_2d: Some([
1920
+ entry.bounds.x,
1921
+ entry.bounds.y,
1922
+ entry.bounds.x + entry.bounds.width,
1923
+ entry.bounds.y + entry.bounds.height,
1924
+ ]),
1925
+ interactivity: None,
1926
+ },
1927
+ );
1928
+ }
1929
+ }
1930
+ }
1931
+ }
1932
+
1933
+ // Build OCR cache if requested
1934
+ #[allow(clippy::type_complexity)]
1935
+ let mut ocr_bounds: HashMap<u32, (String, (f64, f64, f64, f64))> = HashMap::new();
1936
+ if include_ocr {
1937
+ if let Ok(ocr_result) = self
1938
+ .perform_ocr_for_process(process.clone(), Some(true))
1939
+ .await
1940
+ {
1941
+ for (idx_str, entry) in ocr_result.index_to_bounds {
1942
+ if let Ok(idx) = idx_str.parse::<u32>() {
1943
+ let bounds = (
1944
+ entry.bounds.x,
1945
+ entry.bounds.y,
1946
+ entry.bounds.width,
1947
+ entry.bounds.height,
1948
+ );
1949
+ ocr_bounds.insert(idx, (entry.text.clone(), bounds));
1950
+ }
1951
+ }
1952
+ }
1953
+ }
1954
+
1955
+ // If ClusteredYaml format, use clustering
1956
+ if matches!(output_format, TreeOutputFormat::ClusteredYaml) {
1957
+ let clustered_result = terminator::format_clustered_tree_from_caches(
1958
+ &uia_bounds,
1959
+ &dom_bounds,
1960
+ &ocr_bounds,
1961
+ &omniparser_items,
1962
+ &vision_items,
1963
+ );
1964
+ sdk_result.formatted = Some(clustered_result.formatted);
1965
+ } else {
1966
+ // CompactYaml with vision - append vision trees to UIA tree
1967
+ let mut combined = sdk_result.formatted.unwrap_or_default();
1968
+ if !dom_bounds.is_empty() {
1969
+ combined.push_str("\n\n# Browser DOM elements:\n");
1970
+ for (idx, (tag, name, _)) in &dom_bounds {
1971
+ combined.push_str(&format!("#d{} [{}] {}\n", idx, tag, name));
1972
+ }
1973
+ }
1974
+ if !omniparser_items.is_empty() {
1975
+ combined.push_str("\n\n# Omniparser elements:\n");
1976
+ for (idx, item) in &omniparser_items {
1977
+ combined.push_str(&format!(
1978
+ "#p{} [{}] {}\n",
1979
+ idx,
1980
+ item.label,
1981
+ item.content.as_deref().unwrap_or("")
1982
+ ));
1983
+ }
1984
+ }
1985
+ if !vision_items.is_empty() {
1986
+ combined.push_str("\n\n# Gemini Vision elements:\n");
1987
+ for (idx, item) in &vision_items {
1988
+ combined.push_str(&format!(
1989
+ "#g{} [{}] {}\n",
1990
+ idx,
1991
+ item.element_type,
1992
+ item.content.as_deref().unwrap_or("")
1993
+ ));
1994
+ }
1995
+ }
1996
+ if !ocr_bounds.is_empty() {
1997
+ combined.push_str("\n\n# OCR elements:\n");
1998
+ for (idx, (text, _)) in &ocr_bounds {
1999
+ combined.push_str(&format!("#o{} {}\n", idx, text));
2000
+ }
2001
+ }
2002
+ sdk_result.formatted = Some(combined);
2003
+ }
2004
+
2005
+ Ok(sdk_result)
2006
+ }
2007
+
320
2008
  // ============== NEW MONITOR METHODS ==============
321
2009
 
322
2010
  /// (async) List all available monitors/displays.
@@ -435,6 +2123,168 @@ impl Desktop {
435
2123
  .map_err(map_error)
436
2124
  }
437
2125
 
2126
+ /// Capture a screenshot of a window by process name.
2127
+ ///
2128
+ /// Finds the first window matching the given process name and captures its screenshot.
2129
+ /// Process name matching is case-insensitive and uses substring matching.
2130
+ ///
2131
+ /// @param {string} process - Process name to match (e.g., "chrome", "notepad", "code")
2132
+ /// @returns {ScreenshotResult} The screenshot data.
2133
+ #[napi(js_name = "captureWindowByProcess")]
2134
+ pub fn capture_window_by_process(&self, process: String) -> napi::Result<ScreenshotResult> {
2135
+ self.inner
2136
+ .capture_window_by_process(&process)
2137
+ .map(|r| ScreenshotResult {
2138
+ width: r.width,
2139
+ height: r.height,
2140
+ image_data: r.image_data,
2141
+ monitor: r.monitor.map(Monitor::from),
2142
+ })
2143
+ .map_err(map_error)
2144
+ }
2145
+
2146
+ /// (async) Captures a screenshot. Three modes:
2147
+ /// 1. Element mode: provide process + selector to capture specific element
2148
+ /// 2. Window mode: provide process only to capture entire window
2149
+ /// 3. Monitor mode: provide process + entireMonitor=true to capture the monitor where the window is located
2150
+ ///
2151
+ /// @param {string} process - Process name to match (e.g., "chrome", "notepad", "code")
2152
+ /// @param {string} [selector] - Optional selector to capture a specific element within the process
2153
+ /// @param {boolean} [entireMonitor=false] - If true, captures the entire monitor containing the window
2154
+ /// @param {number} [timeoutMs=10000] - Timeout in milliseconds for finding the element
2155
+ /// @returns {Promise<ScreenshotResult>} The screenshot data.
2156
+ #[napi(js_name = "captureScreenshot")]
2157
+ pub async fn capture_screenshot(
2158
+ &self,
2159
+ process: String,
2160
+ selector: Option<String>,
2161
+ entire_monitor: Option<bool>,
2162
+ timeout_ms: Option<f64>,
2163
+ ) -> napi::Result<ScreenshotResult> {
2164
+ use std::time::Duration;
2165
+
2166
+ let entire_monitor = entire_monitor.unwrap_or(false);
2167
+ let timeout = Duration::from_millis(timeout_ms.unwrap_or(10000.0) as u64);
2168
+
2169
+ // Build the full selector string like MCP does
2170
+ let full_selector = if let Some(sel) = &selector {
2171
+ if sel.is_empty() {
2172
+ format!("process:{}", process)
2173
+ } else {
2174
+ format!("process:{} >> {}", process, sel)
2175
+ }
2176
+ } else {
2177
+ format!("process:{}", process)
2178
+ };
2179
+
2180
+ // Create locator and find element
2181
+ let sel_rust: terminator::selector::Selector = full_selector.as_str().into();
2182
+ let locator = self.inner.locator(sel_rust);
2183
+ let element = locator.first(Some(timeout)).await.map_err(map_error)?;
2184
+
2185
+ if entire_monitor {
2186
+ // Monitor mode: get element's monitor and capture it
2187
+ let monitor = element.monitor().map_err(map_error)?;
2188
+ let screenshot = monitor.capture(&self.inner).await.map_err(map_error)?;
2189
+ Ok(ScreenshotResult {
2190
+ width: screenshot.width,
2191
+ height: screenshot.height,
2192
+ image_data: screenshot.image_data,
2193
+ monitor: Some(Monitor::from(monitor)),
2194
+ })
2195
+ } else {
2196
+ // Element/Window mode: capture the element directly
2197
+ let screenshot = element.capture().map_err(map_error)?;
2198
+ Ok(ScreenshotResult {
2199
+ width: screenshot.width,
2200
+ height: screenshot.height,
2201
+ image_data: screenshot.image_data,
2202
+ monitor: screenshot.monitor.map(Monitor::from),
2203
+ })
2204
+ }
2205
+ }
2206
+
2207
+ // ============== SCREENSHOT UTILITIES ==============
2208
+
2209
+ /// Convert a screenshot to PNG bytes.
2210
+ /// Converts BGRA to RGBA and encodes as PNG format.
2211
+ ///
2212
+ /// @param {ScreenshotResult} screenshot - The screenshot to convert.
2213
+ /// @returns {Buffer} PNG-encoded bytes.
2214
+ #[napi(js_name = "screenshotToPng")]
2215
+ pub fn screenshot_to_png(&self, screenshot: ScreenshotResult) -> napi::Result<Vec<u8>> {
2216
+ screenshot
2217
+ .to_inner()
2218
+ .to_png()
2219
+ .map_err(|e| napi::Error::from_reason(e.to_string()))
2220
+ }
2221
+
2222
+ /// Convert a screenshot to PNG bytes with resizing.
2223
+ /// If the image exceeds maxDimension in either width or height,
2224
+ /// it will be resized while maintaining aspect ratio.
2225
+ ///
2226
+ /// @param {ScreenshotResult} screenshot - The screenshot to convert.
2227
+ /// @param {number} [maxDimension] - Maximum width or height. Defaults to 1920.
2228
+ /// @returns {Buffer} PNG-encoded bytes (potentially resized).
2229
+ #[napi(js_name = "screenshotToPngResized")]
2230
+ pub fn screenshot_to_png_resized(
2231
+ &self,
2232
+ screenshot: ScreenshotResult,
2233
+ max_dimension: Option<u32>,
2234
+ ) -> napi::Result<Vec<u8>> {
2235
+ screenshot
2236
+ .to_inner()
2237
+ .to_png_resized(max_dimension)
2238
+ .map_err(|e| napi::Error::from_reason(e.to_string()))
2239
+ }
2240
+
2241
+ /// Convert a screenshot to base64-encoded PNG string.
2242
+ /// Useful for embedding in JSON responses or passing to LLMs.
2243
+ ///
2244
+ /// @param {ScreenshotResult} screenshot - The screenshot to convert.
2245
+ /// @returns {string} Base64-encoded PNG string.
2246
+ #[napi(js_name = "screenshotToBase64Png")]
2247
+ pub fn screenshot_to_base64_png(&self, screenshot: ScreenshotResult) -> napi::Result<String> {
2248
+ screenshot
2249
+ .to_inner()
2250
+ .to_base64_png()
2251
+ .map_err(|e| napi::Error::from_reason(e.to_string()))
2252
+ }
2253
+
2254
+ /// Convert a screenshot to base64-encoded PNG string with resizing.
2255
+ /// If the image exceeds maxDimension in either width or height,
2256
+ /// it will be resized while maintaining aspect ratio.
2257
+ ///
2258
+ /// @param {ScreenshotResult} screenshot - The screenshot to convert.
2259
+ /// @param {number} [maxDimension] - Maximum width or height. Defaults to 1920.
2260
+ /// @returns {string} Base64-encoded PNG string (potentially resized).
2261
+ #[napi(js_name = "screenshotToBase64PngResized")]
2262
+ pub fn screenshot_to_base64_png_resized(
2263
+ &self,
2264
+ screenshot: ScreenshotResult,
2265
+ max_dimension: Option<u32>,
2266
+ ) -> napi::Result<String> {
2267
+ screenshot
2268
+ .to_inner()
2269
+ .to_base64_png_resized(max_dimension)
2270
+ .map_err(|e| napi::Error::from_reason(e.to_string()))
2271
+ }
2272
+
2273
+ /// Get the dimensions a screenshot would have after resizing.
2274
+ ///
2275
+ /// @param {ScreenshotResult} screenshot - The screenshot to check.
2276
+ /// @param {number} maxDimension - Maximum width or height.
2277
+ /// @returns {ResizedDimensions} Object with width and height after resize.
2278
+ #[napi(js_name = "screenshotResizedDimensions")]
2279
+ pub fn screenshot_resized_dimensions(
2280
+ &self,
2281
+ screenshot: ScreenshotResult,
2282
+ max_dimension: u32,
2283
+ ) -> ResizedDimensions {
2284
+ let (width, height) = screenshot.to_inner().resized_dimensions(max_dimension);
2285
+ ResizedDimensions { width, height }
2286
+ }
2287
+
438
2288
  /// (async) Get all window elements for a given application name.
439
2289
  ///
440
2290
  /// @param {string} name - The name of the application whose windows will be retrieved.
@@ -470,14 +2320,28 @@ impl Desktop {
470
2320
  self.inner.press_key(&key).await.map_err(map_error)
471
2321
  }
472
2322
 
473
- /// (async) Execute JavaScript in the currently focused browser tab.
474
- /// Automatically finds the active browser window and executes the script.
2323
+ /// (async) Execute JavaScript in a browser tab.
2324
+ /// Finds the browser window by process name and executes the script.
475
2325
  ///
476
2326
  /// @param {string} script - The JavaScript code to execute in browser context.
2327
+ /// @param {string} process - Process name to scope the browser window (e.g., 'chrome', 'msedge'). Required.
2328
+ /// @param {number} [timeoutMs=10000] - Timeout in milliseconds for finding the browser window.
477
2329
  /// @returns {Promise<string>} The result of script execution.
478
2330
  #[napi]
479
- pub async fn execute_browser_script(&self, script: String) -> napi::Result<String> {
480
- self.inner
2331
+ pub async fn execute_browser_script(
2332
+ &self,
2333
+ script: String,
2334
+ process: String,
2335
+ timeout_ms: Option<f64>,
2336
+ ) -> napi::Result<String> {
2337
+ use std::time::Duration;
2338
+
2339
+ let timeout = Duration::from_millis(timeout_ms.unwrap_or(10000.0) as u64);
2340
+ let selector_str = format!("process:{}", process);
2341
+ let sel: terminator::selector::Selector = selector_str.as_str().into();
2342
+ let locator = self.inner.locator(sel);
2343
+ let element = locator.first(Some(timeout)).await.map_err(map_error)?;
2344
+ element
481
2345
  .execute_browser_script(&script)
482
2346
  .await
483
2347
  .map_err(map_error)
@@ -499,9 +2363,17 @@ impl Desktop {
499
2363
  ///
500
2364
  /// @param {string} url - URL to navigate to
501
2365
  /// @param {string | null} browser - Optional browser name ('Chrome', 'Firefox', 'Edge', 'Brave', 'Opera', 'Vivaldi', or 'Default')
2366
+ /// @param {boolean} [includeWindowScreenshot=true] - Whether to capture window screenshot after navigation
2367
+ /// @param {boolean} [includeMonitorScreenshots=false] - Whether to capture monitor screenshots after navigation
502
2368
  /// @returns {Promise<Element>} The browser window element
503
2369
  #[napi]
504
- pub fn navigate_browser(&self, url: String, browser: Option<String>) -> napi::Result<Element> {
2370
+ pub fn navigate_browser(
2371
+ &self,
2372
+ url: String,
2373
+ browser: Option<String>,
2374
+ include_window_screenshot: Option<bool>,
2375
+ include_monitor_screenshots: Option<bool>,
2376
+ ) -> napi::Result<Element> {
505
2377
  let browser_enum = browser.map(|b| match b.as_str() {
506
2378
  "Chrome" => terminator::Browser::Chrome,
507
2379
  "Firefox" => terminator::Browser::Firefox,
@@ -514,6 +2386,16 @@ impl Desktop {
514
2386
  });
515
2387
 
516
2388
  let element = self.inner.open_url(&url, browser_enum).map_err(map_error)?;
2389
+
2390
+ // Capture screenshots if enabled (window default: true, monitor default: false)
2391
+ let _screenshots = capture_screenshots(
2392
+ &self.inner,
2393
+ element.process_id().ok(),
2394
+ include_window_screenshot.unwrap_or(true),
2395
+ include_monitor_screenshots.unwrap_or(false),
2396
+ "navigateBrowser",
2397
+ );
2398
+
517
2399
  Ok(Element { inner: element })
518
2400
  }
519
2401
 
@@ -534,6 +2416,7 @@ impl Desktop {
534
2416
  /// @param {string} process - Process name of the target application (e.g., "chrome", "notepad")
535
2417
  /// @param {string} goal - What to achieve (e.g., "Open Notepad and type Hello World")
536
2418
  /// @param {number} [maxSteps=20] - Maximum number of steps before stopping
2419
+ /// @param {function} [onStep] - Optional callback invoked after each step with step details
537
2420
  /// @returns {Promise<ComputerUseResult>} Result with status, steps executed, and history
538
2421
  #[napi]
539
2422
  pub async fn gemini_computer_use(
@@ -541,11 +2424,178 @@ impl Desktop {
541
2424
  process: String,
542
2425
  goal: String,
543
2426
  max_steps: Option<u32>,
2427
+ #[napi(ts_arg_type = "((err: null | Error, step: ComputerUseStep) => void) | undefined")]
2428
+ on_step: Option<ThreadsafeFunction<ComputerUseStep>>,
544
2429
  ) -> napi::Result<ComputerUseResult> {
2430
+ // Create progress callback if onStep is provided
2431
+ #[allow(clippy::type_complexity)]
2432
+ let progress_callback: Option<
2433
+ Box<dyn Fn(&terminator::ComputerUseStep) + Send + Sync>,
2434
+ > = on_step.map(|tsfn| {
2435
+ let tsfn = Arc::new(tsfn);
2436
+ Box::new(move |step: &terminator::ComputerUseStep| {
2437
+ let js_step = ComputerUseStep::from(step.clone());
2438
+ tsfn.call(Ok(js_step), ThreadsafeFunctionCallMode::NonBlocking);
2439
+ }) as Box<dyn Fn(&terminator::ComputerUseStep) + Send + Sync>
2440
+ });
2441
+
545
2442
  self.inner
546
- .gemini_computer_use(&process, &goal, max_steps, None)
2443
+ .gemini_computer_use(&process, &goal, max_steps, progress_callback)
547
2444
  .await
548
2445
  .map(ComputerUseResult::from)
549
2446
  .map_err(|e| napi::Error::from_reason(e.to_string()))
550
2447
  }
2448
+
2449
+ /// Stop all currently executing operations.
2450
+ ///
2451
+ /// This cancels the internal cancellation token, which will cause any
2452
+ /// operations that check `isCancelled()` to abort. After calling this,
2453
+ /// you should create a new Desktop instance to start fresh.
2454
+ #[napi]
2455
+ pub fn stop_execution(&self) {
2456
+ self.inner.stop_execution();
2457
+ }
2458
+
2459
+ /// Check if execution has been cancelled.
2460
+ ///
2461
+ /// Returns `true` if `stopExecution()` has been called.
2462
+ /// Long-running operations should periodically check this and abort if true.
2463
+ #[napi]
2464
+ pub fn is_cancelled(&self) -> bool {
2465
+ self.inner.is_cancelled()
2466
+ }
2467
+
2468
+ /// Stop all active highlight overlays globally.
2469
+ ///
2470
+ /// This finds and destroys all highlight overlay windows that were created
2471
+ /// by `element.highlight()`. Useful for cleaning up highlights without
2472
+ /// needing to track individual HighlightHandle objects.
2473
+ ///
2474
+ /// @returns {number} The number of highlights that were stopped.
2475
+ #[napi]
2476
+ pub fn stop_highlighting(&self) -> u32 {
2477
+ #[cfg(target_os = "windows")]
2478
+ {
2479
+ terminator::stop_all_highlights() as u32
2480
+ }
2481
+ #[cfg(not(target_os = "windows"))]
2482
+ {
2483
+ // Not implemented for other platforms yet
2484
+ 0
2485
+ }
2486
+ }
2487
+
2488
+ /// Show inspect overlay with indexed elements for visual debugging.
2489
+ ///
2490
+ /// Displays a transparent overlay window with colored rectangles around UI elements,
2491
+ /// showing their index numbers for click targeting. Use `hideInspectOverlay()` to remove.
2492
+ ///
2493
+ /// @param {InspectElement[]} elements - Array of elements to highlight with their bounds.
2494
+ /// @param {object} windowBounds - The window bounds {x, y, width, height} to constrain the overlay.
2495
+ /// @param {OverlayDisplayMode} [displayMode='Index'] - What to show in labels: 'Index', 'Role', 'Name', etc.
2496
+ #[napi]
2497
+ #[cfg(target_os = "windows")]
2498
+ pub fn show_inspect_overlay(
2499
+ &self,
2500
+ elements: Vec<crate::types::InspectElement>,
2501
+ window_bounds: crate::types::Bounds,
2502
+ display_mode: Option<crate::types::OverlayDisplayMode>,
2503
+ ) -> napi::Result<()> {
2504
+ let core_elements: Vec<terminator::InspectElement> =
2505
+ elements.into_iter().map(|e| e.into()).collect();
2506
+ let core_bounds = (
2507
+ window_bounds.x as i32,
2508
+ window_bounds.y as i32,
2509
+ window_bounds.width as i32,
2510
+ window_bounds.height as i32,
2511
+ );
2512
+ let core_mode = display_mode
2513
+ .map(|m| m.into())
2514
+ .unwrap_or(terminator::OverlayDisplayMode::Index);
2515
+
2516
+ terminator::show_inspect_overlay(core_elements, core_bounds, core_mode)
2517
+ .map(|_handle| ()) // Discard handle - use hideInspectOverlay to close
2518
+ .map_err(|e| napi::Error::from_reason(e.to_string()))
2519
+ }
2520
+
2521
+ /// Show inspect overlay (non-Windows stub).
2522
+ #[napi]
2523
+ #[cfg(not(target_os = "windows"))]
2524
+ pub fn show_inspect_overlay(
2525
+ &self,
2526
+ _elements: Vec<crate::types::InspectElement>,
2527
+ _window_bounds: crate::types::Bounds,
2528
+ _display_mode: Option<crate::types::OverlayDisplayMode>,
2529
+ ) -> napi::Result<()> {
2530
+ // Not implemented for other platforms yet
2531
+ Ok(())
2532
+ }
2533
+
2534
+ /// Hide any active inspect overlay.
2535
+ ///
2536
+ /// This hides the visual overlay that was shown via `showInspectOverlay()`.
2537
+ /// Can be called from any thread.
2538
+ #[napi]
2539
+ pub fn hide_inspect_overlay(&self) {
2540
+ #[cfg(target_os = "windows")]
2541
+ {
2542
+ terminator::hide_inspect_overlay();
2543
+ }
2544
+ #[cfg(not(target_os = "windows"))]
2545
+ {
2546
+ // Not implemented for other platforms yet
2547
+ }
2548
+ }
2549
+
2550
+ // ============== ELEMENT VERIFICATION ==============
2551
+
2552
+ /// Verify that an element matching the selector exists within the same application as the scope element.
2553
+ ///
2554
+ /// This is used for post-action verification - checking that an expected element appeared after
2555
+ /// performing an action (e.g., a success dialog after clicking submit).
2556
+ ///
2557
+ /// @param {Element} scopeElement - The element to get the application scope from (typically the element the action was performed on)
2558
+ /// @param {string} selector - The selector string to search for
2559
+ /// @param {number} [timeoutMs=2000] - How long to wait for the element to appear in milliseconds
2560
+ /// @returns {Element} The found element if verification passes
2561
+ /// @throws Error if the element is not found within the timeout
2562
+ #[napi]
2563
+ pub async fn verify_element_exists(
2564
+ &self,
2565
+ scope_element: &crate::Element,
2566
+ selector: String,
2567
+ timeout_ms: Option<u32>,
2568
+ ) -> napi::Result<crate::Element> {
2569
+ let timeout = timeout_ms.unwrap_or(2000) as u64;
2570
+ let found = self
2571
+ .inner
2572
+ .verify_element_exists(&scope_element.inner, &selector, timeout)
2573
+ .await
2574
+ .map_err(map_error)?;
2575
+ Ok(crate::Element { inner: found })
2576
+ }
2577
+
2578
+ /// Verify that an element matching the selector does NOT exist within the same application as the scope element.
2579
+ ///
2580
+ /// This is used for post-action verification - checking that an element disappeared after
2581
+ /// performing an action (e.g., a modal dialog closed after clicking OK).
2582
+ ///
2583
+ /// @param {Element} scopeElement - The element to get the application scope from (typically the element the action was performed on)
2584
+ /// @param {string} selector - The selector string that should NOT be found
2585
+ /// @param {number} [timeoutMs=2000] - How long to wait/check that the element doesn't appear in milliseconds
2586
+ /// @returns {void}
2587
+ /// @throws Error if the element IS found (meaning verification failed)
2588
+ #[napi]
2589
+ pub async fn verify_element_not_exists(
2590
+ &self,
2591
+ scope_element: &crate::Element,
2592
+ selector: String,
2593
+ timeout_ms: Option<u32>,
2594
+ ) -> napi::Result<()> {
2595
+ let timeout = timeout_ms.unwrap_or(2000) as u64;
2596
+ self.inner
2597
+ .verify_element_not_exists(&scope_element.inner, &selector, timeout)
2598
+ .await
2599
+ .map_err(map_error)
2600
+ }
551
2601
  }