@elizaos/computeruse 0.24.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/types.rs ADDED
@@ -0,0 +1,963 @@
1
+ use napi_derive::napi;
2
+ use serde::Serialize;
3
+ use std::collections::HashMap;
4
+
5
+ #[derive(Serialize, Clone)]
6
+ #[napi(object, js_name = "Bounds")]
7
+ pub struct Bounds {
8
+ pub x: f64,
9
+ pub y: f64,
10
+ pub width: f64,
11
+ pub height: f64,
12
+ }
13
+
14
+ #[napi(object, js_name = "Coordinates")]
15
+ pub struct Coordinates {
16
+ pub x: f64,
17
+ pub y: f64,
18
+ }
19
+
20
+ /// Result of UI diff capture
21
+ #[napi(object, js_name = "UiDiffResult")]
22
+ pub struct UiDiffResult {
23
+ /// The computed diff showing changes (lines starting with + or -)
24
+ pub diff: String,
25
+ /// Whether any UI changes were detected
26
+ pub has_changes: bool,
27
+ }
28
+
29
+ #[napi(object, js_name = "ClickResult")]
30
+ pub struct ClickResult {
31
+ pub method: String,
32
+ pub coordinates: Option<Coordinates>,
33
+ pub details: String,
34
+ /// Path to window screenshot if captured
35
+ pub window_screenshot_path: Option<String>,
36
+ /// Paths to monitor screenshots if captured
37
+ pub monitor_screenshot_paths: Option<Vec<String>>,
38
+ /// UI diff result if ui_diff_before_after was enabled
39
+ pub ui_diff: Option<UiDiffResult>,
40
+ }
41
+
42
+ /// Result of an action operation (type_text, press_key, scroll, etc.)
43
+ #[napi(object, js_name = "ActionResult")]
44
+ pub struct ActionResult {
45
+ /// Whether the action succeeded
46
+ pub success: bool,
47
+ /// Path to window screenshot if captured
48
+ pub window_screenshot_path: Option<String>,
49
+ /// Paths to monitor screenshots if captured
50
+ pub monitor_screenshot_paths: Option<Vec<String>>,
51
+ /// UI diff result if ui_diff_before_after was enabled
52
+ pub ui_diff: Option<UiDiffResult>,
53
+ }
54
+
55
+ /// Type of mouse click to perform
56
+ #[napi(string_enum, js_name = "ClickType")]
57
+ pub enum ClickType {
58
+ /// Single left click (default)
59
+ Left,
60
+ /// Double left click
61
+ Double,
62
+ /// Single right click
63
+ Right,
64
+ }
65
+
66
+ impl From<ClickType> for computeruse::ClickType {
67
+ fn from(ct: ClickType) -> Self {
68
+ match ct {
69
+ ClickType::Left => computeruse::ClickType::Left,
70
+ ClickType::Double => computeruse::ClickType::Double,
71
+ ClickType::Right => computeruse::ClickType::Right,
72
+ }
73
+ }
74
+ }
75
+
76
+ /// Source of indexed elements for click targeting
77
+ #[napi(string_enum, js_name = "VisionType")]
78
+ pub enum VisionType {
79
+ /// UI Automation tree elements (default)
80
+ UiTree,
81
+ /// OCR-detected text elements
82
+ Ocr,
83
+ /// Omniparser-detected elements
84
+ Omniparser,
85
+ /// Gemini Vision-detected elements
86
+ Gemini,
87
+ /// Browser DOM elements
88
+ Dom,
89
+ }
90
+
91
+ impl From<VisionType> for computeruse::VisionType {
92
+ fn from(vt: VisionType) -> Self {
93
+ match vt {
94
+ VisionType::UiTree => computeruse::VisionType::UiTree,
95
+ VisionType::Ocr => computeruse::VisionType::Ocr,
96
+ VisionType::Omniparser => computeruse::VisionType::Omniparser,
97
+ VisionType::Gemini => computeruse::VisionType::Gemini,
98
+ VisionType::Dom => computeruse::VisionType::Dom,
99
+ }
100
+ }
101
+ }
102
+
103
+ #[napi(object, js_name = "CommandOutput")]
104
+ pub struct CommandOutput {
105
+ pub exit_status: Option<i32>,
106
+ pub stdout: String,
107
+ pub stderr: String,
108
+ }
109
+
110
+ #[derive(Clone)]
111
+ #[napi(object)]
112
+ pub struct Monitor {
113
+ pub id: String,
114
+ pub name: String,
115
+ pub is_primary: bool,
116
+ pub width: u32,
117
+ pub height: u32,
118
+ pub x: i32,
119
+ pub y: i32,
120
+ pub scale_factor: f64,
121
+ }
122
+
123
+ /// A screenshot result containing image data and dimensions.
124
+ #[napi(object)]
125
+ pub struct ScreenshotResult {
126
+ pub width: u32,
127
+ pub height: u32,
128
+ pub image_data: Vec<u8>,
129
+ pub monitor: Option<Monitor>,
130
+ }
131
+
132
+ impl ScreenshotResult {
133
+ /// Convert to the internal computeruse::ScreenshotResult
134
+ pub fn to_inner(&self) -> computeruse::ScreenshotResult {
135
+ computeruse::ScreenshotResult {
136
+ image_data: self.image_data.clone(),
137
+ width: self.width,
138
+ height: self.height,
139
+ monitor: self.monitor.as_ref().map(|m| computeruse::Monitor {
140
+ id: m.id.clone(),
141
+ name: m.name.clone(),
142
+ is_primary: m.is_primary,
143
+ width: m.width,
144
+ height: m.height,
145
+ x: m.x,
146
+ y: m.y,
147
+ scale_factor: m.scale_factor,
148
+ work_area: None,
149
+ }),
150
+ }
151
+ }
152
+ }
153
+
154
+ #[napi(object)]
155
+ pub struct ResizedDimensions {
156
+ pub width: u32,
157
+ pub height: u32,
158
+ }
159
+
160
+ #[napi(object)]
161
+ pub struct MonitorScreenshotPair {
162
+ pub monitor: Monitor,
163
+ pub screenshot: ScreenshotResult,
164
+ }
165
+
166
+ #[derive(Serialize)]
167
+ #[napi(object, js_name = "UIElementAttributes")]
168
+ pub struct UIElementAttributes {
169
+ pub role: String,
170
+ pub name: Option<String>,
171
+ pub label: Option<String>,
172
+ pub value: Option<String>,
173
+ pub description: Option<String>,
174
+ pub properties: HashMap<String, Option<String>>,
175
+ pub is_keyboard_focusable: Option<bool>,
176
+ pub bounds: Option<Bounds>,
177
+ }
178
+
179
+ #[derive(Serialize)]
180
+ #[napi(object, js_name = "UINode")]
181
+ pub struct UINode {
182
+ pub id: Option<String>,
183
+ pub attributes: UIElementAttributes,
184
+ pub children: Vec<UINode>,
185
+ }
186
+
187
+ /// Entry in index-to-bounds mapping for click targeting
188
+ #[napi(object, js_name = "BoundsEntry")]
189
+ pub struct BoundsEntry {
190
+ pub role: String,
191
+ pub name: String,
192
+ pub bounds: Bounds,
193
+ pub selector: Option<String>,
194
+ }
195
+
196
+ /// Result of get_window_tree_result operation with all computed data
197
+ #[napi(object, js_name = "WindowTreeResult")]
198
+ pub struct WindowTreeResult {
199
+ /// The raw UI tree structure
200
+ pub tree: UINode,
201
+ /// Process ID of the window
202
+ pub pid: u32,
203
+ /// Whether this is a browser window
204
+ pub is_browser: bool,
205
+ /// Formatted compact YAML output (if format_output was true)
206
+ pub formatted: Option<String>,
207
+ /// Mapping of index to bounds for click targeting (keys are 1-based indices as strings)
208
+ pub index_to_bounds: HashMap<String, BoundsEntry>,
209
+ /// Total count of indexed elements (elements with bounds)
210
+ pub element_count: u32,
211
+ /// Path to saved window screenshot (if include_window_screenshot was true)
212
+ pub window_screenshot_path: Option<String>,
213
+ /// Paths to saved monitor screenshots (if include_monitor_screenshots was true)
214
+ pub monitor_screenshot_paths: Option<Vec<String>>,
215
+ }
216
+
217
+ #[napi(string_enum)]
218
+ pub enum PropertyLoadingMode {
219
+ /// Only load essential properties (role + name) - fastest
220
+ Fast,
221
+ /// Load all properties for complete element data - slower but comprehensive
222
+ Complete,
223
+ /// Load specific properties based on element type - balanced approach
224
+ Smart,
225
+ }
226
+
227
+ /// Output format for UI tree
228
+ #[napi(string_enum)]
229
+ pub enum TreeOutputFormat {
230
+ /// Compact YAML format with indexed elements: #1 [ROLE] name
231
+ CompactYaml,
232
+ /// Full JSON format with all fields and properties
233
+ VerboseJson,
234
+ /// Clustered YAML format: groups elements from all sources (UIA, DOM, OCR, Omniparser, Gemini)
235
+ /// by spatial proximity with prefixed indices (#u1, #d2, #o3, #p4, #g5)
236
+ ClusteredYaml,
237
+ }
238
+
239
+ /// Source of an element for clustered output
240
+ #[napi(string_enum)]
241
+ pub enum ElementSource {
242
+ /// #u - Accessibility tree (UIA)
243
+ Uia,
244
+ /// #d - Browser DOM
245
+ Dom,
246
+ /// #o - OCR text
247
+ Ocr,
248
+ /// #p - Omniparser vision
249
+ Omniparser,
250
+ /// #g - Gemini vision
251
+ Gemini,
252
+ }
253
+
254
+ /// Display mode for inspect overlay labels
255
+ #[napi(string_enum)]
256
+ pub enum OverlayDisplayMode {
257
+ /// Just rectangles, no labels
258
+ Rectangles,
259
+ /// [index] only (default)
260
+ Index,
261
+ /// [role] only
262
+ Role,
263
+ /// [index:role]
264
+ IndexRole,
265
+ /// [name] only
266
+ Name,
267
+ /// [index:name]
268
+ IndexName,
269
+ /// [index:role:name]
270
+ Full,
271
+ }
272
+
273
+ /// Element data for inspect overlay rendering
274
+ #[napi(object, js_name = "InspectElement")]
275
+ pub struct InspectElement {
276
+ /// 1-based index for click targeting
277
+ pub index: u32,
278
+ /// Element role (e.g., "Button", "Edit")
279
+ pub role: String,
280
+ /// Element name if available
281
+ pub name: Option<String>,
282
+ /// Bounding box (x, y, width, height)
283
+ pub bounds: Bounds,
284
+ }
285
+
286
+ /// OCR element representing text detected via optical character recognition.
287
+ /// Hierarchy: OcrResult -> OcrLine -> OcrWord
288
+ #[derive(Serialize)]
289
+ #[napi(object, js_name = "OcrElement")]
290
+ pub struct OcrElement {
291
+ /// Role type: "OcrResult", "OcrLine", or "OcrWord"
292
+ pub role: String,
293
+ /// The recognized text content
294
+ pub text: Option<String>,
295
+ /// Bounding box in absolute screen coordinates
296
+ pub bounds: Option<Bounds>,
297
+ /// Text rotation angle in degrees (only present on OcrResult)
298
+ pub text_angle: Option<f64>,
299
+ /// Confidence score (0.0 to 1.0) if available
300
+ pub confidence: Option<f64>,
301
+ /// Child elements (lines for OcrResult, words for OcrLine)
302
+ pub children: Option<Vec<OcrElement>>,
303
+ }
304
+
305
+ /// Result of OCR operation with tree and index-to-bounds mapping
306
+ #[napi(object, js_name = "OcrResult")]
307
+ pub struct OcrResult {
308
+ /// The OCR tree structure
309
+ pub tree: OcrElement,
310
+ /// Formatted compact YAML output (if format_output was true)
311
+ pub formatted: Option<String>,
312
+ /// Mapping of index to bounds for click targeting (keys are 1-based indices as strings)
313
+ /// Value contains (text, bounds)
314
+ pub index_to_bounds: HashMap<String, OcrBoundsEntry>,
315
+ /// Total count of indexed elements (words with bounds)
316
+ pub element_count: u32,
317
+ }
318
+
319
+ /// Entry in OCR index-to-bounds mapping for click targeting
320
+ #[napi(object, js_name = "OcrBoundsEntry")]
321
+ pub struct OcrBoundsEntry {
322
+ pub text: String,
323
+ pub bounds: Bounds,
324
+ }
325
+
326
+ /// Browser DOM element captured from a web page
327
+ #[derive(Serialize)]
328
+ #[napi(object, js_name = "BrowserDomElement")]
329
+ pub struct BrowserDomElement {
330
+ /// HTML tag name (lowercase)
331
+ pub tag: String,
332
+ /// Element id attribute
333
+ pub id: Option<String>,
334
+ /// CSS classes
335
+ pub classes: Vec<String>,
336
+ /// Visible text content (truncated to 100 chars)
337
+ pub text: Option<String>,
338
+ /// href attribute for links
339
+ pub href: Option<String>,
340
+ /// type attribute for inputs
341
+ pub r#type: Option<String>,
342
+ /// name attribute
343
+ pub name: Option<String>,
344
+ /// value attribute for inputs
345
+ pub value: Option<String>,
346
+ /// placeholder attribute
347
+ pub placeholder: Option<String>,
348
+ /// aria-label attribute
349
+ pub aria_label: Option<String>,
350
+ /// role attribute
351
+ pub role: Option<String>,
352
+ /// Bounding box in screen coordinates
353
+ pub bounds: Bounds,
354
+ }
355
+
356
+ /// Entry in DOM index-to-bounds mapping for click targeting
357
+ #[napi(object, js_name = "DomBoundsEntry")]
358
+ pub struct DomBoundsEntry {
359
+ /// Display name (text or aria-label or tag)
360
+ pub name: String,
361
+ /// HTML tag
362
+ pub tag: String,
363
+ /// Bounding box
364
+ pub bounds: Bounds,
365
+ }
366
+
367
+ /// Result of browser DOM capture operation
368
+ #[napi(object, js_name = "BrowserDomResult")]
369
+ pub struct BrowserDomResult {
370
+ /// List of captured DOM elements
371
+ pub elements: Vec<BrowserDomElement>,
372
+ /// Formatted compact YAML output (if format_output was true)
373
+ pub formatted: Option<String>,
374
+ /// Mapping of index to bounds for click targeting
375
+ pub index_to_bounds: HashMap<String, DomBoundsEntry>,
376
+ /// Total count of captured elements
377
+ pub element_count: u32,
378
+ /// Page URL
379
+ pub page_url: String,
380
+ /// Page title
381
+ pub page_title: String,
382
+ }
383
+
384
+ /// UI element detected by Gemini vision model
385
+ #[derive(Serialize, Clone)]
386
+ #[napi(object, js_name = "VisionElement")]
387
+ pub struct VisionElement {
388
+ /// Element type: text, icon, button, input, checkbox, dropdown, link, image, unknown
389
+ pub element_type: String,
390
+ /// Visible text or label on the element
391
+ pub content: Option<String>,
392
+ /// AI description of what this element is or does
393
+ pub description: Option<String>,
394
+ /// Bounding box in screen coordinates (x, y, width, height)
395
+ pub bounds: Option<Bounds>,
396
+ /// Whether the element is interactive/clickable
397
+ pub interactivity: Option<bool>,
398
+ }
399
+
400
+ /// Entry in Gemini vision index-to-bounds mapping for click targeting
401
+ #[napi(object, js_name = "VisionBoundsEntry")]
402
+ pub struct VisionBoundsEntry {
403
+ /// Display name (content or description)
404
+ pub name: String,
405
+ /// Element type
406
+ pub element_type: String,
407
+ /// Bounding box
408
+ pub bounds: Bounds,
409
+ }
410
+
411
+ /// Result of Gemini vision detection operation
412
+ #[napi(object, js_name = "GeminiVisionResult")]
413
+ pub struct GeminiVisionResult {
414
+ /// List of detected UI elements
415
+ pub elements: Vec<VisionElement>,
416
+ /// Formatted compact YAML output (if format_output was true)
417
+ pub formatted: Option<String>,
418
+ /// Mapping of index to bounds for click targeting
419
+ pub index_to_bounds: HashMap<String, VisionBoundsEntry>,
420
+ /// Total count of detected elements
421
+ pub element_count: u32,
422
+ }
423
+
424
+ /// Item detected by Omniparser V2 (icon/field detection)
425
+ #[derive(Serialize, Clone)]
426
+ #[napi(object, js_name = "OmniparserItem")]
427
+ pub struct OmniparserItem {
428
+ /// Element label: "icon", "text", etc.
429
+ pub label: String,
430
+ /// Content or OCR text
431
+ pub content: Option<String>,
432
+ /// Bounding box in screen coordinates (x, y, width, height)
433
+ pub bounds: Option<Bounds>,
434
+ }
435
+
436
+ /// Entry in Omniparser index-to-bounds mapping for click targeting
437
+ #[napi(object, js_name = "OmniparserBoundsEntry")]
438
+ pub struct OmniparserBoundsEntry {
439
+ /// Display name (content or label)
440
+ pub name: String,
441
+ /// Element label
442
+ pub label: String,
443
+ /// Bounding box
444
+ pub bounds: Bounds,
445
+ }
446
+
447
+ /// Result of Omniparser detection operation
448
+ #[napi(object, js_name = "OmniparserResult")]
449
+ pub struct OmniparserResult {
450
+ /// List of detected items
451
+ pub items: Vec<OmniparserItem>,
452
+ /// Formatted compact YAML output (if format_output was true)
453
+ pub formatted: Option<String>,
454
+ /// Mapping of index to bounds for click targeting
455
+ pub index_to_bounds: HashMap<String, OmniparserBoundsEntry>,
456
+ /// Total count of detected items
457
+ pub item_count: u32,
458
+ }
459
+
460
+ /// Entry in clustered index mapping (for click targeting across all sources)
461
+ #[napi(object, js_name = "ClusteredBoundsEntry")]
462
+ pub struct ClusteredBoundsEntry {
463
+ /// Element source (Uia, Dom, Ocr, Omniparser, Gemini)
464
+ pub source: ElementSource,
465
+ /// Original index within the source
466
+ pub original_index: u32,
467
+ /// Bounding box in screen coordinates
468
+ pub bounds: Bounds,
469
+ }
470
+
471
+ /// Result of clustered tree formatting
472
+ #[napi(object, js_name = "ClusteredFormattingResult")]
473
+ pub struct ClusteredFormattingResult {
474
+ /// Formatted clustered YAML output
475
+ pub formatted: String,
476
+ /// Mapping from prefixed index (e.g., "u1", "d2") to source and bounds
477
+ pub index_to_source_and_bounds: HashMap<String, ClusteredBoundsEntry>,
478
+ }
479
+
480
+ #[napi(object, js_name = "TreeBuildConfig")]
481
+ pub struct TreeBuildConfig {
482
+ /// Property loading strategy
483
+ pub property_mode: PropertyLoadingMode,
484
+ /// Optional timeout per operation in milliseconds
485
+ pub timeout_per_operation_ms: Option<i64>,
486
+ /// Optional yield frequency for responsiveness
487
+ pub yield_every_n_elements: Option<i32>,
488
+ /// Optional batch size for processing elements
489
+ pub batch_size: Option<i32>,
490
+ /// Optional maximum depth to traverse (undefined = unlimited)
491
+ pub max_depth: Option<i32>,
492
+ /// Delay in milliseconds to wait for UI to stabilize before capturing tree
493
+ pub ui_settle_delay_ms: Option<i64>,
494
+ /// Generate formatted output alongside the tree structure (defaults to true if tree_output_format is set)
495
+ pub format_output: Option<bool>,
496
+ /// Output format for tree: 'CompactYaml' (default) or 'VerboseJson'
497
+ pub tree_output_format: Option<TreeOutputFormat>,
498
+ /// Selector to start tree from instead of window root (e.g., "role:Dialog" to focus on a dialog)
499
+ pub tree_from_selector: Option<String>,
500
+ /// Include window screenshot in result (saved to executions dir). Defaults to false.
501
+ pub include_window_screenshot: Option<bool>,
502
+ /// Include all monitor screenshots in result (saved to executions dir). Defaults to false.
503
+ pub include_monitor_screenshots: Option<bool>,
504
+ /// Include Gemini Vision AI detection. Elements prefixed with #g1, #g2, etc.
505
+ pub include_gemini_vision: Option<bool>,
506
+ /// Include Omniparser detection. Elements prefixed with #p1, #p2, etc.
507
+ pub include_omniparser: Option<bool>,
508
+ /// Include OCR text detection. Elements prefixed with #o1, #o2, etc.
509
+ pub include_ocr: Option<bool>,
510
+ /// Include browser DOM elements (requires ComputerUse Bridge extension). Elements prefixed with #d1, #d2, etc.
511
+ pub include_browser_dom: Option<bool>,
512
+ }
513
+
514
+ impl From<(f64, f64, f64, f64)> for Bounds {
515
+ fn from(t: (f64, f64, f64, f64)) -> Self {
516
+ Bounds {
517
+ x: t.0,
518
+ y: t.1,
519
+ width: t.2,
520
+ height: t.3,
521
+ }
522
+ }
523
+ }
524
+
525
+ impl From<(f64, f64)> for Coordinates {
526
+ fn from(t: (f64, f64)) -> Self {
527
+ Coordinates { x: t.0, y: t.1 }
528
+ }
529
+ }
530
+
531
+ impl From<computeruse::ClickResult> for ClickResult {
532
+ fn from(r: computeruse::ClickResult) -> Self {
533
+ ClickResult {
534
+ method: r.method,
535
+ coordinates: r.coordinates.map(Coordinates::from),
536
+ details: r.details,
537
+ window_screenshot_path: None,
538
+ monitor_screenshot_paths: None,
539
+ ui_diff: None,
540
+ }
541
+ }
542
+ }
543
+
544
+ impl From<computeruse::Monitor> for Monitor {
545
+ fn from(m: computeruse::Monitor) -> Self {
546
+ Monitor {
547
+ id: m.id,
548
+ name: m.name,
549
+ is_primary: m.is_primary,
550
+ width: m.width,
551
+ height: m.height,
552
+ x: m.x,
553
+ y: m.y,
554
+ scale_factor: m.scale_factor,
555
+ }
556
+ }
557
+ }
558
+
559
+ impl From<computeruse::OcrElement> for OcrElement {
560
+ fn from(e: computeruse::OcrElement) -> Self {
561
+ OcrElement {
562
+ role: e.role,
563
+ text: e.text,
564
+ bounds: e.bounds.map(|(x, y, w, h)| Bounds {
565
+ x,
566
+ y,
567
+ width: w,
568
+ height: h,
569
+ }),
570
+ text_angle: e.text_angle,
571
+ confidence: e.confidence,
572
+ children: e
573
+ .children
574
+ .map(|children| children.into_iter().map(OcrElement::from).collect()),
575
+ }
576
+ }
577
+ }
578
+
579
+ impl From<computeruse::UINode> for UINode {
580
+ fn from(node: computeruse::UINode) -> Self {
581
+ UINode {
582
+ id: node.id,
583
+ attributes: UIElementAttributes::from(node.attributes),
584
+ children: node.children.into_iter().map(UINode::from).collect(),
585
+ }
586
+ }
587
+ }
588
+
589
+ impl From<computeruse::WindowTreeResult> for WindowTreeResult {
590
+ fn from(result: computeruse::WindowTreeResult) -> Self {
591
+ // Convert HashMap<u32, (String, String, (f64, f64, f64, f64), Option<String>)>
592
+ // to HashMap<String, BoundsEntry>
593
+ let index_to_bounds = result
594
+ .index_to_bounds
595
+ .into_iter()
596
+ .map(|(idx, (role, name, (x, y, w, h), selector))| {
597
+ (
598
+ idx.to_string(),
599
+ BoundsEntry {
600
+ role,
601
+ name,
602
+ bounds: Bounds {
603
+ x,
604
+ y,
605
+ width: w,
606
+ height: h,
607
+ },
608
+ selector,
609
+ },
610
+ )
611
+ })
612
+ .collect();
613
+
614
+ WindowTreeResult {
615
+ tree: UINode::from(result.tree),
616
+ pid: result.pid,
617
+ is_browser: result.is_browser,
618
+ formatted: result.formatted,
619
+ index_to_bounds,
620
+ element_count: result.element_count,
621
+ window_screenshot_path: None,
622
+ monitor_screenshot_paths: None,
623
+ }
624
+ }
625
+ }
626
+
627
+ impl From<computeruse::UIElementAttributes> for UIElementAttributes {
628
+ fn from(attrs: computeruse::UIElementAttributes) -> Self {
629
+ // Convert HashMap<String, Option<serde_json::Value>> to HashMap<String, Option<String>>
630
+ let properties = attrs
631
+ .properties
632
+ .into_iter()
633
+ .map(|(k, v)| (k, v.map(|val| val.to_string())))
634
+ .collect();
635
+
636
+ UIElementAttributes {
637
+ role: attrs.role,
638
+ name: attrs.name,
639
+ label: attrs.label,
640
+ value: attrs.value,
641
+ description: attrs.description,
642
+ properties,
643
+ is_keyboard_focusable: attrs.is_keyboard_focusable,
644
+ bounds: attrs.bounds.map(|(x, y, width, height)| Bounds {
645
+ x,
646
+ y,
647
+ width,
648
+ height,
649
+ }),
650
+ }
651
+ }
652
+ }
653
+
654
+ #[napi(string_enum)]
655
+ pub enum TextPosition {
656
+ Top,
657
+ TopRight,
658
+ Right,
659
+ BottomRight,
660
+ Bottom,
661
+ BottomLeft,
662
+ Left,
663
+ TopLeft,
664
+ Inside,
665
+ }
666
+
667
+ #[napi(object)]
668
+ pub struct FontStyle {
669
+ pub size: u32,
670
+ pub bold: bool,
671
+ pub color: u32,
672
+ }
673
+
674
+ #[napi]
675
+ pub struct HighlightHandle {
676
+ inner: Option<computeruse::HighlightHandle>,
677
+ }
678
+
679
+ #[napi]
680
+ impl HighlightHandle {
681
+ #[napi]
682
+ pub fn close(&mut self) {
683
+ if let Some(handle) = self.inner.take() {
684
+ handle.close();
685
+ }
686
+ }
687
+ }
688
+
689
+ impl HighlightHandle {
690
+ pub fn new(handle: computeruse::HighlightHandle) -> Self {
691
+ Self {
692
+ inner: Some(handle),
693
+ }
694
+ }
695
+
696
+ pub fn new_dummy() -> Self {
697
+ Self { inner: None }
698
+ }
699
+ }
700
+
701
+ impl From<TextPosition> for computeruse::TextPosition {
702
+ fn from(pos: TextPosition) -> Self {
703
+ match pos {
704
+ TextPosition::Top => computeruse::TextPosition::Top,
705
+ TextPosition::TopRight => computeruse::TextPosition::TopRight,
706
+ TextPosition::Right => computeruse::TextPosition::Right,
707
+ TextPosition::BottomRight => computeruse::TextPosition::BottomRight,
708
+ TextPosition::Bottom => computeruse::TextPosition::Bottom,
709
+ TextPosition::BottomLeft => computeruse::TextPosition::BottomLeft,
710
+ TextPosition::Left => computeruse::TextPosition::Left,
711
+ TextPosition::TopLeft => computeruse::TextPosition::TopLeft,
712
+ TextPosition::Inside => computeruse::TextPosition::Inside,
713
+ }
714
+ }
715
+ }
716
+
717
+ impl From<FontStyle> for computeruse::FontStyle {
718
+ fn from(style: FontStyle) -> Self {
719
+ computeruse::FontStyle {
720
+ size: style.size,
721
+ bold: style.bold,
722
+ color: style.color,
723
+ }
724
+ }
725
+ }
726
+
727
+ impl Default for FontStyle {
728
+ fn default() -> Self {
729
+ Self {
730
+ size: 12,
731
+ bold: false,
732
+ color: 0x000000,
733
+ }
734
+ }
735
+ }
736
+
737
+ impl From<OverlayDisplayMode> for computeruse::OverlayDisplayMode {
738
+ fn from(mode: OverlayDisplayMode) -> Self {
739
+ match mode {
740
+ OverlayDisplayMode::Rectangles => computeruse::OverlayDisplayMode::Rectangles,
741
+ OverlayDisplayMode::Index => computeruse::OverlayDisplayMode::Index,
742
+ OverlayDisplayMode::Role => computeruse::OverlayDisplayMode::Role,
743
+ OverlayDisplayMode::IndexRole => computeruse::OverlayDisplayMode::IndexRole,
744
+ OverlayDisplayMode::Name => computeruse::OverlayDisplayMode::Name,
745
+ OverlayDisplayMode::IndexName => computeruse::OverlayDisplayMode::IndexName,
746
+ OverlayDisplayMode::Full => computeruse::OverlayDisplayMode::Full,
747
+ }
748
+ }
749
+ }
750
+
751
+ impl From<InspectElement> for computeruse::InspectElement {
752
+ fn from(elem: InspectElement) -> Self {
753
+ computeruse::InspectElement {
754
+ index: elem.index,
755
+ role: elem.role,
756
+ name: elem.name,
757
+ bounds: (
758
+ elem.bounds.x,
759
+ elem.bounds.y,
760
+ elem.bounds.width,
761
+ elem.bounds.height,
762
+ ),
763
+ }
764
+ }
765
+ }
766
+
767
+ impl From<TreeBuildConfig> for computeruse::platforms::TreeBuildConfig {
768
+ fn from(config: TreeBuildConfig) -> Self {
769
+ computeruse::platforms::TreeBuildConfig {
770
+ property_mode: match config.property_mode {
771
+ PropertyLoadingMode::Fast => computeruse::platforms::PropertyLoadingMode::Fast,
772
+ PropertyLoadingMode::Complete => {
773
+ computeruse::platforms::PropertyLoadingMode::Complete
774
+ }
775
+ PropertyLoadingMode::Smart => computeruse::platforms::PropertyLoadingMode::Smart,
776
+ },
777
+ timeout_per_operation_ms: config.timeout_per_operation_ms.map(|x| x as u64),
778
+ yield_every_n_elements: config.yield_every_n_elements.map(|x| x as usize),
779
+ batch_size: config.batch_size.map(|x| x as usize),
780
+ max_depth: config.max_depth.map(|x| x as usize),
781
+ include_all_bounds: false,
782
+ ui_settle_delay_ms: config.ui_settle_delay_ms.map(|x| x as u64),
783
+ format_output: config.format_output.unwrap_or(false),
784
+ show_overlay: false, // Use Desktop.showInspectOverlay() method instead
785
+ overlay_display_mode: None,
786
+ from_selector: config.tree_from_selector, // Pass through to core SDK
787
+ }
788
+ }
789
+ }
790
+
791
+ /// Convert SerializableUIElement to UINode
792
+ pub(crate) fn serializable_to_ui_node(elem: &computeruse::SerializableUIElement) -> UINode {
793
+ let attrs = UIElementAttributes {
794
+ role: elem.role.clone(),
795
+ name: elem.name.clone(),
796
+ label: elem.label.clone(),
797
+ value: elem.value.clone(),
798
+ description: elem.description.clone(),
799
+ properties: HashMap::new(), // SerializableUIElement doesn't have properties field
800
+ is_keyboard_focusable: elem.is_keyboard_focusable,
801
+ bounds: elem.bounds.map(|(x, y, w, h)| Bounds {
802
+ x,
803
+ y,
804
+ width: w,
805
+ height: h,
806
+ }),
807
+ };
808
+
809
+ let children = elem
810
+ .children
811
+ .as_ref()
812
+ .map(|children| children.iter().map(serializable_to_ui_node).collect())
813
+ .unwrap_or_default();
814
+
815
+ UINode {
816
+ id: elem.id.clone(),
817
+ attributes: attrs,
818
+ children,
819
+ }
820
+ }
821
+
822
+ // ===== Computer Use Types =====
823
+
824
+ /// A single step in the computer use execution
825
+ #[napi(object)]
826
+ pub struct ComputerUseStep {
827
+ /// Step number (1-indexed)
828
+ pub step: u32,
829
+ /// Action that was executed
830
+ pub action: String,
831
+ /// Arguments passed to the action (as JSON string)
832
+ pub args: String,
833
+ /// Whether the action succeeded
834
+ pub success: bool,
835
+ /// Error message if action failed
836
+ pub error: Option<String>,
837
+ /// Model's reasoning text for this step
838
+ pub text: Option<String>,
839
+ }
840
+
841
+ /// Pending confirmation info when safety check triggers
842
+ #[napi(object)]
843
+ pub struct ComputerUsePendingConfirmation {
844
+ /// Action that needs confirmation
845
+ pub action: String,
846
+ /// Arguments for the action (as JSON string)
847
+ pub args: String,
848
+ /// Model's explanation text
849
+ pub text: Option<String>,
850
+ }
851
+
852
+ /// Result of the computer use execution
853
+ #[napi(object)]
854
+ pub struct ComputerUseResult {
855
+ /// Status: "success", "failed", "needs_confirmation", "max_steps_reached"
856
+ pub status: String,
857
+ /// The goal that was attempted
858
+ pub goal: String,
859
+ /// Number of steps executed
860
+ pub steps_executed: u32,
861
+ /// Last action performed
862
+ pub final_action: String,
863
+ /// Final text response from model
864
+ pub final_text: Option<String>,
865
+ /// History of all steps
866
+ pub steps: Vec<ComputerUseStep>,
867
+ /// Pending confirmation info if status is "needs_confirmation"
868
+ pub pending_confirmation: Option<ComputerUsePendingConfirmation>,
869
+ /// Execution ID for finding screenshots (e.g., "20251205_134500_geminiComputerUse_msedge")
870
+ pub execution_id: Option<String>,
871
+ }
872
+
873
+ impl From<computeruse::ComputerUseStep> for ComputerUseStep {
874
+ fn from(step: computeruse::ComputerUseStep) -> Self {
875
+ ComputerUseStep {
876
+ step: step.step,
877
+ action: step.action,
878
+ args: step.args.to_string(),
879
+ success: step.success,
880
+ error: step.error,
881
+ text: step.text,
882
+ }
883
+ }
884
+ }
885
+
886
+ impl From<computeruse::ComputerUseResult> for ComputerUseResult {
887
+ fn from(result: computeruse::ComputerUseResult) -> Self {
888
+ let pending_confirmation =
889
+ result
890
+ .pending_confirmation
891
+ .map(|pc| ComputerUsePendingConfirmation {
892
+ action: pc
893
+ .get("action")
894
+ .and_then(|v| v.as_str())
895
+ .unwrap_or("")
896
+ .to_string(),
897
+ args: pc.get("args").map(|v| v.to_string()).unwrap_or_default(),
898
+ text: pc
899
+ .get("text")
900
+ .and_then(|v| v.as_str())
901
+ .map(|s: &str| s.to_string()),
902
+ });
903
+
904
+ ComputerUseResult {
905
+ status: result.status,
906
+ goal: result.goal,
907
+ steps_executed: result.steps_executed,
908
+ final_action: result.final_action,
909
+ final_text: result.final_text,
910
+ steps: result
911
+ .steps
912
+ .into_iter()
913
+ .map(ComputerUseStep::from)
914
+ .collect(),
915
+ pending_confirmation,
916
+ execution_id: result.execution_id,
917
+ }
918
+ }
919
+ }
920
+
921
+ /// Result of closing a browser tab
922
+ #[napi(object)]
923
+ #[derive(Debug, Clone)]
924
+ pub struct CloseTabResult {
925
+ pub closed: bool,
926
+ pub tab: ClosedTabInfo,
927
+ }
928
+
929
+ /// Information about a closed tab
930
+ #[napi(object)]
931
+ #[derive(Debug, Clone)]
932
+ pub struct ClosedTabInfo {
933
+ pub id: i32,
934
+ pub url: Option<String>,
935
+ pub title: Option<String>,
936
+ pub window_id: Option<i32>,
937
+ }
938
+
939
+ impl From<computeruse::extension_bridge::CloseTabResult> for CloseTabResult {
940
+ fn from(result: computeruse::extension_bridge::CloseTabResult) -> Self {
941
+ CloseTabResult {
942
+ closed: result.closed,
943
+ tab: ClosedTabInfo {
944
+ id: result.tab.id,
945
+ url: result.tab.url,
946
+ title: result.tab.title,
947
+ window_id: result.tab.window_id,
948
+ },
949
+ }
950
+ }
951
+ }
952
+
953
+ /// Options for closing a browser tab
954
+ #[napi(object)]
955
+ #[derive(Debug, Clone, Default)]
956
+ pub struct CloseTabOptions {
957
+ /// Specific Chrome tab ID to close
958
+ pub tab_id: Option<i32>,
959
+ /// URL to match (partial match supported)
960
+ pub url: Option<String>,
961
+ /// Title to match (case-insensitive partial match)
962
+ pub title: Option<String>,
963
+ }