@elizaos/computeruse 0.24.21 → 2.0.0-alpha.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4) hide show
  1. package/LICENSE +21 -0
  2. package/index.d.ts +1653 -0
  3. package/index.js +327 -0
  4. package/package.json +40 -16
package/index.d.ts ADDED
@@ -0,0 +1,1653 @@
1
+ /* tslint:disable */
2
+ /* eslint-disable */
3
+
4
+ /* auto-generated by NAPI-RS */
5
+
6
+ /** Click position within element bounds as percentages (0-100) */
7
+ export interface ClickPosition {
8
+ /** X position as percentage from left edge (0-100). 50 = center. */
9
+ xPercentage: number
10
+ /** Y position as percentage from top edge (0-100). 50 = center. */
11
+ yPercentage: number
12
+ }
13
+ /** Options for action methods (click, pressKey, scroll, etc.) */
14
+ export interface ActionOptions {
15
+ /** Whether to highlight the element before performing the action. Defaults to false. */
16
+ highlightBeforeAction?: boolean
17
+ /** Whether to capture window screenshot after action. Defaults to true. */
18
+ includeWindowScreenshot?: boolean
19
+ /** Whether to capture monitor screenshots after action. Defaults to false. */
20
+ includeMonitorScreenshots?: boolean
21
+ /** Whether to try focusing the element before the action. Defaults to true. */
22
+ tryFocusBefore?: boolean
23
+ /** Whether to try clicking the element if focus fails. Defaults to true. */
24
+ tryClickBefore?: boolean
25
+ /** Whether to capture UI tree before/after action and compute diff. Defaults to false. */
26
+ uiDiffBeforeAfter?: boolean
27
+ /** Max depth for tree capture when doing UI diff. */
28
+ uiDiffMaxDepth?: number
29
+ /** Click position within element bounds. If not specified, clicks at center. */
30
+ clickPosition?: ClickPosition
31
+ /** Type of click: 'Left', 'Double', or 'Right'. Defaults to 'Left'. */
32
+ clickType?: ClickType
33
+ /** Whether to restore cursor to original position after click. Defaults to false. */
34
+ restoreCursor?: boolean
35
+ /**
36
+ * Whether to restore the original focus and caret position after the action. Defaults to false.
37
+ * When true, saves the currently focused element and caret position before the action, then restores them after.
38
+ */
39
+ restoreFocus?: boolean
40
+ }
41
+ /** Options for typeText method */
42
+ export interface TypeTextOptions {
43
+ /**
44
+ * REQUIRED: Whether to clear existing text before typing.
45
+ * Set to true to clear the field first, false to append.
46
+ */
47
+ clearBeforeTyping: boolean
48
+ /** Whether to use clipboard for pasting. Defaults to false. */
49
+ useClipboard?: boolean
50
+ /** Whether to highlight the element before typing. Defaults to false. */
51
+ highlightBeforeAction?: boolean
52
+ /** Whether to capture window screenshot after action. Defaults to true. */
53
+ includeWindowScreenshot?: boolean
54
+ /** Whether to capture monitor screenshots after action. Defaults to false. */
55
+ includeMonitorScreenshots?: boolean
56
+ /** Whether to try focusing the element before typing. Defaults to true. */
57
+ tryFocusBefore?: boolean
58
+ /** Whether to try clicking the element if focus fails. Defaults to true. */
59
+ tryClickBefore?: boolean
60
+ /**
61
+ * Whether to restore the original focus and caret position after typing. Defaults to false.
62
+ * When true, saves the currently focused element and caret position before typing, then restores them after.
63
+ */
64
+ restoreFocus?: boolean
65
+ /** Whether to capture UI tree before/after action and compute diff. Defaults to false. */
66
+ uiDiffBeforeAfter?: boolean
67
+ /** Max depth for tree capture when doing UI diff. */
68
+ uiDiffMaxDepth?: number
69
+ }
70
+ /** Result of element validation */
71
+ export interface ValidationResult {
72
+ /** Whether the element exists */
73
+ exists: boolean
74
+ /** The element if found */
75
+ element?: Element
76
+ /** Error message when validation failed with an actual error (as opposed to the element simply not being found) */
77
+ error?: string
78
+ }
79
+ export interface Bounds {
80
+ x: number
81
+ y: number
82
+ width: number
83
+ height: number
84
+ }
85
+ export interface Coordinates {
86
+ x: number
87
+ y: number
88
+ }
89
+ /** Result of UI diff capture */
90
+ export interface UiDiffResult {
91
+ /** The computed diff showing changes (lines starting with + or -) */
92
+ diff: string
93
+ /** Whether any UI changes were detected */
94
+ hasChanges: boolean
95
+ }
96
+ /** Result of a click operation (the return type of `clickAtBounds` and `clickByIndex`). */ export interface ClickResult {
97
+ method: string
98
+ coordinates?: Coordinates
99
+ details: string
100
+ /** Path to window screenshot if captured */
101
+ windowScreenshotPath?: string
102
+ /** Paths to monitor screenshots if captured */
103
+ monitorScreenshotPaths?: Array<string>
104
+ /** UI diff result if `uiDiffBeforeAfter` was enabled */
105
+ uiDiff?: UiDiffResult
106
+ }
107
+ /** Result of an action operation (typeText, pressKey, scroll, etc.) */
108
+ export interface ActionResult {
109
+ /** Whether the action succeeded */
110
+ success: boolean
111
+ /** Path to window screenshot if captured */
112
+ windowScreenshotPath?: string
113
+ /** Paths to monitor screenshots if captured */
114
+ monitorScreenshotPaths?: Array<string>
115
+ /** UI diff result if `uiDiffBeforeAfter` was enabled */
116
+ uiDiff?: UiDiffResult
117
+ }
118
+ /** Type of mouse click to perform */
119
+ export const enum ClickType {
120
+ /** Single left click (default) */
121
+ Left = 'Left',
122
+ /** Double left click */
123
+ Double = 'Double',
124
+ /** Single right click */
125
+ Right = 'Right'
126
+ }
127
+ /** Source of indexed elements for click targeting */
128
+ export const enum VisionType {
129
+ /** UI Automation tree elements (default) */
130
+ UiTree = 'UiTree',
131
+ /** OCR-detected text elements */
132
+ Ocr = 'Ocr',
133
+ /** Omniparser-detected elements */
134
+ Omniparser = 'Omniparser',
135
+ /** Gemini Vision-detected elements */
136
+ Gemini = 'Gemini',
137
+ /** Browser DOM elements */
138
+ Dom = 'Dom'
139
+ }
140
+ export interface CommandOutput {
141
+ exitStatus?: number
142
+ stdout: string
143
+ stderr: string
144
+ }
145
+ export interface Monitor {
146
+ id: string
147
+ name: string
148
+ isPrimary: boolean
149
+ width: number
150
+ height: number
151
+ x: number
152
+ y: number
153
+ scaleFactor: number
154
+ }
155
+ /** A screenshot result containing image data and dimensions. */
156
+ export interface ScreenshotResult {
157
+ width: number
158
+ height: number
159
+ imageData: Array<number>
160
+ monitor?: Monitor
161
+ }
162
+ export interface ResizedDimensions {
163
+ width: number
164
+ height: number
165
+ }
166
+ export interface MonitorScreenshotPair {
167
+ monitor: Monitor
168
+ screenshot: ScreenshotResult
169
+ }
170
+ export interface UIElementAttributes {
171
+ role: string
172
+ name?: string
173
+ label?: string
174
+ value?: string
175
+ description?: string
176
+ properties: Record<string, string | undefined | null>
177
+ isKeyboardFocusable?: boolean
178
+ bounds?: Bounds
179
+ }
180
+ export interface UINode {
181
+ id?: string
182
+ attributes: UIElementAttributes
183
+ children: Array<UINode>
184
+ }
185
+ /** Entry in index-to-bounds mapping for click targeting */
186
+ export interface BoundsEntry {
187
+ role: string
188
+ name: string
189
+ bounds: Bounds
190
+ selector?: string
191
+ }
192
+ /** Result of get_window_tree_result operation with all computed data */
193
+ export interface WindowTreeResult {
194
+ /** The raw UI tree structure */
195
+ tree: UINode
196
+ /** Process ID of the window */
197
+ pid: number
198
+ /** Whether this is a browser window */
199
+ isBrowser: boolean
200
+ /** Formatted compact YAML output (if `formatOutput` was true) */
201
+ formatted?: string
202
+ /** Mapping of index to bounds for click targeting (keys are 1-based indices as strings) */
203
+ indexToBounds: Record<string, BoundsEntry>
204
+ /** Total count of indexed elements (elements with bounds) */
205
+ elementCount: number
206
+ /** Path to saved window screenshot (if `includeWindowScreenshot` was true) */
207
+ windowScreenshotPath?: string
208
+ /** Paths to saved monitor screenshots (if `includeMonitorScreenshots` was true) */
209
+ monitorScreenshotPaths?: Array<string>
210
+ }
211
+ export const enum PropertyLoadingMode {
212
+ /** Only load essential properties (role + name) - fastest */
213
+ Fast = 'Fast',
214
+ /** Load all properties for complete element data - slower but comprehensive */
215
+ Complete = 'Complete',
216
+ /** Load specific properties based on element type - balanced approach */
217
+ Smart = 'Smart'
218
+ }
219
+ /** Output format for UI tree */
220
+ export const enum TreeOutputFormat {
221
+ /** Compact YAML format with indexed elements: #1 [ROLE] name */
222
+ CompactYaml = 'CompactYaml',
223
+ /** Full JSON format with all fields and properties */
224
+ VerboseJson = 'VerboseJson',
225
+ /**
226
+ * Clustered YAML format: groups elements from all sources (UIA, DOM, OCR, Omniparser, Gemini)
227
+ * by spatial proximity with prefixed indices (#u1, #d2, #o3, #p4, #g5)
228
+ */
229
+ ClusteredYaml = 'ClusteredYaml'
230
+ }
231
+ /** Source of an element for clustered output */
232
+ export const enum ElementSource {
233
+ /** #u - Accessibility tree (UIA) */
234
+ Uia = 'Uia',
235
+ /** #d - Browser DOM */
236
+ Dom = 'Dom',
237
+ /** #o - OCR text */
238
+ Ocr = 'Ocr',
239
+ /** #p - Omniparser vision */
240
+ Omniparser = 'Omniparser',
241
+ /** #g - Gemini vision */
242
+ Gemini = 'Gemini'
243
+ }
244
+ /** Display mode for inspect overlay labels */
245
+ export const enum OverlayDisplayMode {
246
+ /** Just rectangles, no labels */
247
+ Rectangles = 'Rectangles',
248
+ /** [index] only (default) */
249
+ Index = 'Index',
250
+ /** [role] only */
251
+ Role = 'Role',
252
+ /** [index:role] */
253
+ IndexRole = 'IndexRole',
254
+ /** [name] only */
255
+ Name = 'Name',
256
+ /** [index:name] */
257
+ IndexName = 'IndexName',
258
+ /** [index:role:name] */
259
+ Full = 'Full'
260
+ }
261
+ /** Element data for inspect overlay rendering */
262
+ export interface InspectElement {
263
+ /** 1-based index for click targeting */
264
+ index: number
265
+ /** Element role (e.g., "Button", "Edit") */
266
+ role: string
267
+ /** Element name if available */
268
+ name?: string
269
+ /** Bounding box (x, y, width, height) */
270
+ bounds: Bounds
271
+ }
272
+ /**
273
+ * OCR element representing text detected via optical character recognition.
274
+ * Hierarchy: OcrResult -> OcrLine -> OcrWord
275
+ */
276
+ export interface OcrElement {
277
+ /** Role type: "OcrResult", "OcrLine", or "OcrWord" */
278
+ role: string
279
+ /** The recognized text content */
280
+ text?: string
281
+ /** Bounding box in absolute screen coordinates */
282
+ bounds?: Bounds
283
+ /** Text rotation angle in degrees (only present on OcrResult) */
284
+ textAngle?: number
285
+ /** Confidence score (0.0 to 1.0) if available */
286
+ confidence?: number
287
+ /** Child elements (lines for OcrResult, words for OcrLine) */
288
+ children?: Array<OcrElement>
289
+ }
290
+ /** Result of OCR operation with tree and index-to-bounds mapping */
291
+ export interface OcrResult {
292
+ /** The OCR tree structure */
293
+ tree: OcrElement
294
+ /** Formatted compact YAML output (if `formatOutput` was true) */
295
+ formatted?: string
296
+ /**
297
+ * Mapping of index to bounds for click targeting (keys are 1-based indices as strings).
298
+ * Each value is an `OcrBoundsEntry` holding the recognized text and its bounds.
299
+ */
300
+ indexToBounds: Record<string, OcrBoundsEntry>
301
+ /** Total count of indexed elements (words with bounds) */
302
+ elementCount: number
303
+ }
304
+ /** Entry in OCR index-to-bounds mapping for click targeting */
305
+ export interface OcrBoundsEntry {
306
+ text: string
307
+ bounds: Bounds
308
+ }
309
+ /** Browser DOM element captured from a web page */
310
+ export interface BrowserDomElement {
311
+ /** HTML tag name (lowercase) */
312
+ tag: string
313
+ /** Element id attribute */
314
+ id?: string
315
+ /** CSS classes */
316
+ classes: Array<string>
317
+ /** Visible text content (truncated to 100 chars) */
318
+ text?: string
319
+ /** href attribute for links */
320
+ href?: string
321
+ /** type attribute for inputs */
322
+ type?: string
323
+ /** name attribute */
324
+ name?: string
325
+ /** value attribute for inputs */
326
+ value?: string
327
+ /** placeholder attribute */
328
+ placeholder?: string
329
+ /** aria-label attribute */
330
+ ariaLabel?: string
331
+ /** role attribute */
332
+ role?: string
333
+ /** Bounding box in screen coordinates */
334
+ bounds: Bounds
335
+ }
336
+ /** Entry in DOM index-to-bounds mapping for click targeting */
337
+ export interface DomBoundsEntry {
338
+ /** Display name (text or aria-label or tag) */
339
+ name: string
340
+ /** HTML tag */
341
+ tag: string
342
+ /** Bounding box */
343
+ bounds: Bounds
344
+ }
345
+ /** Result of browser DOM capture operation */
346
+ export interface BrowserDomResult {
347
+ /** List of captured DOM elements */
348
+ elements: Array<BrowserDomElement>
349
+ /** Formatted compact YAML output (if `formatOutput` was true) */
350
+ formatted?: string
351
+ /** Mapping of index to bounds for click targeting */
352
+ indexToBounds: Record<string, DomBoundsEntry>
353
+ /** Total count of captured elements */
354
+ elementCount: number
355
+ /** Page URL */
356
+ pageUrl: string
357
+ /** Page title */
358
+ pageTitle: string
359
+ }
360
+ /** UI element detected by Gemini vision model */
361
+ export interface VisionElement {
362
+ /** Element type: text, icon, button, input, checkbox, dropdown, link, image, unknown */
363
+ elementType: string
364
+ /** Visible text or label on the element */
365
+ content?: string
366
+ /** AI description of what this element is or does */
367
+ description?: string
368
+ /** Bounding box in screen coordinates (x, y, width, height) */
369
+ bounds?: Bounds
370
+ /** Whether the element is interactive/clickable */
371
+ interactivity?: boolean
372
+ }
373
+ /** Entry in Gemini vision index-to-bounds mapping for click targeting */
374
+ export interface VisionBoundsEntry {
375
+ /** Display name (content or description) */
376
+ name: string
377
+ /** Element type */
378
+ elementType: string
379
+ /** Bounding box */
380
+ bounds: Bounds
381
+ }
382
+ /** Result of Gemini vision detection operation */
383
+ export interface GeminiVisionResult {
384
+ /** List of detected UI elements */
385
+ elements: Array<VisionElement>
386
+ /** Formatted compact YAML output (if `formatOutput` was true) */
387
+ formatted?: string
388
+ /** Mapping of index to bounds for click targeting */
389
+ indexToBounds: Record<string, VisionBoundsEntry>
390
+ /** Total count of detected elements */
391
+ elementCount: number
392
+ }
393
+ /** Item detected by Omniparser V2 (icon/field detection) */
394
+ export interface OmniparserItem {
395
+ /** Element label: "icon", "text", etc. */
396
+ label: string
397
+ /** Content or OCR text */
398
+ content?: string
399
+ /** Bounding box in screen coordinates (x, y, width, height) */
400
+ bounds?: Bounds
401
+ }
402
+ /** Entry in Omniparser index-to-bounds mapping for click targeting */
403
+ export interface OmniparserBoundsEntry {
404
+ /** Display name (content or label) */
405
+ name: string
406
+ /** Element label */
407
+ label: string
408
+ /** Bounding box */
409
+ bounds: Bounds
410
+ }
411
+ /** Result of Omniparser detection operation */
412
+ export interface OmniparserResult {
413
+ /** List of detected items */
414
+ items: Array<OmniparserItem>
415
+ /** Formatted compact YAML output (if `formatOutput` was true) */
416
+ formatted?: string
417
+ /** Mapping of index to bounds for click targeting */
418
+ indexToBounds: Record<string, OmniparserBoundsEntry>
419
+ /** Total count of detected items */
420
+ itemCount: number
421
+ }
422
+ /** Entry in clustered index mapping (for click targeting across all sources) */
423
+ export interface ClusteredBoundsEntry {
424
+ /** Element source (Uia, Dom, Ocr, Omniparser, Gemini) */
425
+ source: ElementSource
426
+ /** Original index within the source */
427
+ originalIndex: number
428
+ /** Bounding box in screen coordinates */
429
+ bounds: Bounds
430
+ }
431
+ /** Result of clustered tree formatting */
432
+ export interface ClusteredFormattingResult {
433
+ /** Formatted clustered YAML output */
434
+ formatted: string
435
+ /** Mapping from prefixed index (e.g., "u1", "d2") to source and bounds */
436
+ indexToSourceAndBounds: Record<string, ClusteredBoundsEntry>
437
+ }
438
+ /** Configuration for building a UI tree: property loading, traversal limits, output formatting, screenshots, and optional extra detection sources. */ export interface TreeBuildConfig {
439
+ /** Property loading strategy */
440
+ propertyMode: PropertyLoadingMode
441
+ /** Optional timeout per operation in milliseconds */
442
+ timeoutPerOperationMs?: number
443
+ /** Optional yield frequency for responsiveness */
444
+ yieldEveryNElements?: number
445
+ /** Optional batch size for processing elements */
446
+ batchSize?: number
447
+ /** Optional maximum depth to traverse (undefined = unlimited) */
448
+ maxDepth?: number
449
+ /** Delay in milliseconds to wait for UI to stabilize before capturing tree */
450
+ uiSettleDelayMs?: number
451
+ /** Generate formatted output alongside the tree structure (defaults to true if `treeOutputFormat` is set) */
452
+ formatOutput?: boolean
453
+ /** Output format for tree: 'CompactYaml' (default), 'VerboseJson', or 'ClusteredYaml' */
454
+ treeOutputFormat?: TreeOutputFormat
455
+ /** Selector to start tree from instead of window root (e.g., "role:Dialog" to focus on a dialog) */
456
+ treeFromSelector?: string
457
+ /** Include window screenshot in result (saved to executions dir). Defaults to false. */
458
+ includeWindowScreenshot?: boolean
459
+ /** Include all monitor screenshots in result (saved to executions dir). Defaults to false. */
460
+ includeMonitorScreenshots?: boolean
461
+ /** Include Gemini Vision AI detection. Elements prefixed with #g1, #g2, etc. */
462
+ includeGeminiVision?: boolean
463
+ /** Include Omniparser detection. Elements prefixed with #p1, #p2, etc. */
464
+ includeOmniparser?: boolean
465
+ /** Include OCR text detection. Elements prefixed with #o1, #o2, etc. */
466
+ includeOcr?: boolean
467
+ /** Include browser DOM elements (requires ComputerUse Bridge extension). Elements prefixed with #d1, #d2, etc. */
468
+ includeBrowserDom?: boolean
469
+ }
470
+ export const enum TextPosition {
471
+ Top = 'Top',
472
+ TopRight = 'TopRight',
473
+ Right = 'Right',
474
+ BottomRight = 'BottomRight',
475
+ Bottom = 'Bottom',
476
+ BottomLeft = 'BottomLeft',
477
+ Left = 'Left',
478
+ TopLeft = 'TopLeft',
479
+ Inside = 'Inside'
480
+ }
481
+ export interface FontStyle {
482
+ size: number
483
+ bold: boolean
484
+ color: number
485
+ }
486
+ /** A single step in the computer use execution */
487
+ export interface ComputerUseStep {
488
+ /** Step number (1-indexed) */
489
+ step: number
490
+ /** Action that was executed */
491
+ action: string
492
+ /** Arguments passed to the action (as JSON string) */
493
+ args: string
494
+ /** Whether the action succeeded */
495
+ success: boolean
496
+ /** Error message if action failed */
497
+ error?: string
498
+ /** Model's reasoning text for this step */
499
+ text?: string
500
+ }
501
+ /** Pending confirmation info when safety check triggers */
502
+ export interface ComputerUsePendingConfirmation {
503
+ /** Action that needs confirmation */
504
+ action: string
505
+ /** Arguments for the action (as JSON string) */
506
+ args: string
507
+ /** Model's explanation text */
508
+ text?: string
509
+ }
510
+ /** Result of the computer use execution */
511
+ export interface ComputerUseResult {
512
+ /** Status: "success", "failed", "needs_confirmation", "max_steps_reached" */
513
+ status: string
514
+ /** The goal that was attempted */
515
+ goal: string
516
+ /** Number of steps executed */
517
+ stepsExecuted: number
518
+ /** Last action performed */
519
+ finalAction: string
520
+ /** Final text response from model */
521
+ finalText?: string
522
+ /** History of all steps */
523
+ steps: Array<ComputerUseStep>
524
+ /** Pending confirmation info if status is "needs_confirmation" */
525
+ pendingConfirmation?: ComputerUsePendingConfirmation
526
+ /** Execution ID for finding screenshots (e.g., "20251205_134500_geminiComputerUse_msedge") */
527
+ executionId?: string
528
+ }
529
+ /** Result of closing a browser tab */
530
+ export interface CloseTabResult {
531
+ closed: boolean
532
+ tab: ClosedTabInfo
533
+ }
534
+ /** Information about a closed tab */
535
+ export interface ClosedTabInfo {
536
+ id: number
537
+ url?: string
538
+ title?: string
539
+ windowId?: number
540
+ }
541
+ /** Options for closing a browser tab */
542
+ export interface CloseTabOptions {
543
+ /** Specific Chrome tab ID to close */
544
+ tabId?: number
545
+ /** URL to match (partial match supported) */
546
+ url?: string
547
+ /** Title to match (case-insensitive partial match) */
548
+ title?: string
549
+ }
550
+ /** Information about a window */
551
+ export interface WindowInfo {
552
+ /** Window handle */
553
+ hwnd: number
554
+ /** Process name (e.g., "notepad.exe") */
555
+ processName: string
556
+ /** Process ID */
557
+ processId: number
558
+ /** Z-order position (0 = topmost) */
559
+ zOrder: number
560
+ /** Whether the window is minimized */
561
+ isMinimized: boolean
562
+ /** Whether the window is maximized */
563
+ isMaximized: boolean
564
+ /** Whether the window has WS_EX_TOPMOST style */
565
+ isAlwaysOnTop: boolean
566
+ /** Window title */
567
+ title: string
568
+ }
569
+ /** Main entry point for desktop automation. */
570
+ export declare class Desktop {
571
+ /**
572
+ * Create a new Desktop automation instance with configurable options.
573
+ *
574
+ * @param {boolean} [useBackgroundApps=false] - Enable background apps support.
575
+ * @param {boolean} [activateApp=false] - Enable app activation support.
576
+ * @param {string} [logLevel] - Logging level (e.g., 'info', 'debug', 'warn', 'error').
577
+ * Falls back to RUST_LOG or COMPUTERUSE_LOG_LEVEL env vars, defaults to 'info'.
578
+ * @returns {Desktop} A new Desktop automation instance.
579
+ */
580
+ constructor(useBackgroundApps?: boolean | undefined | null, activateApp?: boolean | undefined | null, logLevel?: string | undefined | null)
581
+ /**
582
+ * Get the root UI element of the desktop.
583
+ *
584
+ * @returns {Element} The root UI element.
585
+ */
586
+ root(): Element
587
+ /**
588
+ * Get a list of all running applications.
589
+ *
590
+ * @returns {Array<Element>} List of application UI elements.
591
+ */
592
+ applications(): Array<Element>
593
+ /**
594
+ * Get a running application by name.
595
+ *
596
+ * @param {string} name - The name of the application to find.
597
+ * @returns {Element} The application UI element.
598
+ */
599
+ application(name: string): Element
600
+ /**
601
+ * Open an application by name.
602
+ *
603
+ * @param {string} name - The name of the application to open.
604
+ * @param {boolean} [includeWindowScreenshot=true] - Whether to capture window screenshot after opening
605
+ * @param {boolean} [includeMonitorScreenshots=false] - Whether to capture monitor screenshots after opening
606
+ */
607
+ openApplication(name: string, includeWindowScreenshot?: boolean | undefined | null, includeMonitorScreenshots?: boolean | undefined | null): Element
608
+ /**
609
+ * Activate an application by name.
610
+ *
611
+ * @param {string} name - The name of the application to activate.
612
+ */
613
+ activateApplication(name: string): void
614
+ /**
615
+ * Click within element bounds at a specified position.
616
+ *
617
+ * This is useful for clicking on elements from UI tree, OCR, omniparser, gemini vision, or DOM
618
+ * without needing an element reference - just the bounds.
619
+ *
620
+ * @param {number} x - X coordinate of the bounds.
621
+ * @param {number} y - Y coordinate of the bounds.
622
+ * @param {number} width - Width of the bounds.
623
+ * @param {number} height - Height of the bounds.
624
+ * @param {number} [xPercentage=50] - X position within bounds as percentage (0-100). Defaults to 50 (center).
625
+ * @param {number} [yPercentage=50] - Y position within bounds as percentage (0-100). Defaults to 50 (center).
626
+ * @param {ClickType} [clickType='Left'] - Type of click: 'Left', 'Double', or 'Right'.
627
+ * @param {boolean} [restoreCursor=true] - If true, restore cursor to original position after clicking.
628
+ * @param {string} [process] - Process name for window screenshot capture. If provided, enables window screenshots.
629
+ * @param {boolean} [includeWindowScreenshot=true] - Whether to capture window screenshot (requires process).
630
+ * @param {boolean} [includeMonitorScreenshots=false] - Whether to capture monitor screenshots after clicking.
631
+ * @returns {ClickResult} Result with clicked coordinates and method details.
632
+ */
633
+ clickAtBounds(x: number, y: number, width: number, height: number, xPercentage?: number | undefined | null, yPercentage?: number | undefined | null, clickType?: ClickType | undefined | null, restoreCursor?: boolean | undefined | null, process?: string | undefined | null, includeWindowScreenshot?: boolean | undefined | null, includeMonitorScreenshots?: boolean | undefined | null): ClickResult
634
+ /**
635
+ * Click on an element by its index from the last tree/vision query.
636
+ *
637
+ * This looks up cached bounds from the appropriate cache based on visionType,
638
+ * then clicks at the specified position within those bounds.
639
+ *
640
+ * @param {number} index - 1-based index from the tree/vision output (e.g., #1, #2).
641
+ * @param {VisionType} [visionType='UiTree'] - Source of the index: 'UiTree', 'Ocr', 'Omniparser', 'Gemini', or 'Dom'.
642
+ * @param {number} [xPercentage=50] - X position within bounds as percentage (0-100).
643
+ * @param {number} [yPercentage=50] - Y position within bounds as percentage (0-100).
644
+ * @param {ClickType} [clickType='Left'] - Type of click: 'Left', 'Double', or 'Right'.
645
+ * @param {boolean} [restoreCursor=true] - If true, restore cursor to original position after clicking.
646
+ * @param {string} [process] - Process name for window screenshot capture. If provided, enables window screenshots.
647
+ * @param {boolean} [includeWindowScreenshot=true] - Whether to capture window screenshot (requires process).
648
+ * @param {boolean} [includeMonitorScreenshots=false] - Whether to capture monitor screenshots after clicking.
649
+ * @returns {ClickResult} Result with clicked coordinates, element info, and method details.
650
+ */
651
+ clickByIndex(index: number, visionType?: VisionType | undefined | null, xPercentage?: number | undefined | null, yPercentage?: number | undefined | null, clickType?: ClickType | undefined | null, restoreCursor?: boolean | undefined | null, process?: string | undefined | null, includeWindowScreenshot?: boolean | undefined | null, includeMonitorScreenshots?: boolean | undefined | null): ClickResult
652
+ /**
653
+ * (async) Run a shell command.
654
+ *
655
+ * @param {string} [windowsCommand] - Command to run on Windows.
656
+ * @param {string} [unixCommand] - Command to run on Unix.
657
+ * @returns {Promise<CommandOutput>} The command output.
658
+ */
659
+ runCommand(windowsCommand?: string | undefined | null, unixCommand?: string | undefined | null): Promise<CommandOutput>
660
+ /**
661
+ * (async) Execute a shell command using GitHub Actions-style syntax.
662
+ *
663
+ * @param {string} command - The command to run (can be single or multi-line).
664
+ * @param {string} [shell] - Optional shell to use (defaults to PowerShell on Windows, bash on Unix).
665
+ * @param {string} [workingDirectory] - Optional working directory for the command.
666
+ * @returns {Promise<CommandOutput>} The command output.
667
+ */
668
+ run(command: string, shell?: string | undefined | null, workingDirectory?: string | undefined | null): Promise<CommandOutput>
669
+ /**
670
+ * (async) Perform OCR on an image file.
671
+ *
672
+ * @param {string} imagePath - Path to the image file.
673
+ * @returns {Promise<string>} The extracted text.
674
+ */
675
+ ocrImagePath(imagePath: string): Promise<string>
676
+ /**
677
+ * (async) Perform OCR on a screenshot.
678
+ *
679
+ * @param {ScreenshotResult} screenshot - The screenshot to process.
680
+ * @returns {Promise<string>} The extracted text.
681
+ */
682
+ ocrScreenshot(screenshot: ScreenshotResult): Promise<string>
683
+ /**
684
+ * (async) Perform OCR on a window by process name and return structured results with bounding boxes.
685
+ * Returns an OcrResult containing the OCR tree, formatted output, and index-to-bounds mapping
686
+ * for click targeting.
687
+ *
688
+ * @param {string} process - Process name to match (e.g., 'chrome', 'notepad').
689
+ * @param {boolean} [formatOutput=true] - Whether to generate formatted compact YAML output.
690
+ * @returns {Promise<OcrResult>} Complete OCR result with tree, formatted output, and bounds mapping.
691
+ */
692
+ performOcrForProcess(process: string, formatOutput?: boolean | undefined | null): Promise<OcrResult>
693
+ /** (async) Perform OCR on a window by process name (non-Windows stub; its signature is identical to the declaration directly above). */
694
+ performOcrForProcess(process: string, formatOutput?: boolean | undefined | null): Promise<OcrResult>
695
+ /**
696
+ * (async) Capture DOM elements from the current browser tab.
697
+ *
698
+ * Extracts visible DOM elements with their properties and screen coordinates.
699
+ * Uses JavaScript injection via Chrome extension to traverse the DOM tree.
700
+ *
701
+ * @param {number} [maxElements=200] - Maximum number of elements to capture.
702
+ * @param {boolean} [formatOutput=true] - Whether to include formatted compact YAML output.
703
+ * @returns {Promise<BrowserDomResult>} DOM elements with bounds for click targeting.
704
+ */
705
+ captureBrowserDom(maxElements?: number | undefined | null, formatOutput?: boolean | undefined | null): Promise<BrowserDomResult>
706
+ /**
707
+ * (async) Get a clustered tree combining elements from multiple sources grouped by spatial proximity.
708
+ *
709
+ * Combines accessibility tree (UIA) elements with optional DOM, Omniparser, and Gemini Vision elements,
710
+ * clustering nearby elements together. Each element is prefixed with its source:
711
+ * - #u1, #u2... for UIA (accessibility tree)
712
+ * - #d1, #d2... for DOM (browser content)
713
+ * - #p1, #p2... for Omniparser (vision AI detection)
714
+ * - #g1, #g2... for Gemini Vision (AI element detection)
715
+ *
716
+ * @param {string} process - Process name to match (e.g., 'chrome', 'notepad').
717
+ * @param {number} [maxDomElements=100] - Maximum DOM elements to capture for browsers.
718
+ * @param {boolean} [includeOmniparser=false] - Whether to include Omniparser vision detection.
719
+ * @param {boolean} [includeGeminiVision=false] - Whether to include Gemini Vision AI detection.
720
+ * @returns {Promise<ClusteredFormattingResult>} Clustered tree with prefixed indices.
721
+ */
722
+ getClusteredTree(process: string, maxDomElements?: number | undefined | null, includeOmniparser?: boolean | undefined | null, includeGeminiVision?: boolean | undefined | null): Promise<ClusteredFormattingResult>
723
+ /**
724
+ * (async) Perform Gemini vision AI detection on a window by process name.
725
+ *
726
+ * Captures a screenshot and sends it to the Gemini vision backend for UI element detection.
727
+ * The backend URL is taken from the GEMINI_VISION_BACKEND_URL environment variable (defaults to https://app.mediar.ai/api/vision/parse).
728
+ *
729
+ * @param {string} process - Process name to match (e.g., 'chrome', 'notepad').
730
+ * @param {boolean} [formatOutput=true] - Whether to include formatted compact YAML output.
731
+ * @returns {Promise<GeminiVisionResult>} Detected UI elements with bounds for click targeting.
732
+ */
733
+ performGeminiVisionForProcess(process: string, formatOutput?: boolean | undefined | null): Promise<GeminiVisionResult>
734
+ /**
735
+ * (async) Perform Omniparser V2 detection on a window by process name.
736
+ *
737
+ * Captures a screenshot and sends it to the Omniparser backend for icon/field detection.
738
+ * The backend URL is read from the OMNIPARSER_BACKEND_URL environment variable (defaults to https://app.mediar.ai/api/omniparser/parse).
739
+ *
740
+ * @param {string} process - Process name to match (e.g., 'chrome', 'notepad').
741
+ * @param {number} [imgsz=1920] - Icon detection image size (640-1920). Higher = better but slower.
742
+ * @param {boolean} [formatOutput=true] - Whether to include formatted compact YAML output.
743
+ * @returns {Promise<OmniparserResult>} Detected items with bounds for click targeting.
744
+ */
745
+ performOmniparserForProcess(process: string, imgsz?: number | undefined | null, formatOutput?: boolean | undefined | null): Promise<OmniparserResult>
746
+ /**
747
+ * (async) Get the currently focused browser window.
748
+ *
749
+ * @returns {Promise<Element>} The current browser window element.
750
+ */
751
+ getCurrentBrowserWindow(): Promise<Element>
752
+ /**
753
+ * Create a locator for finding UI elements.
754
+ *
755
+ * @param {string | Selector} selector - The selector.
756
+ * @returns {Locator} A locator for finding elements.
757
+ */
758
+ locator(selector: string | Selector): Locator
759
+ /**
760
+ * Create a process-scoped locator for finding UI elements.
761
+ * This is the recommended way to create locators - always scope to a specific process.
762
+ *
763
+ * @param {string} process - Process name to scope the search (e.g., 'chrome', 'notepad').
764
+ * @param {string | Selector} selector - The selector to find within the process.
765
+ * @param {string} [windowSelector] - Optional window selector for additional filtering.
766
+ * @returns {Locator} A locator for finding elements within the process.
767
+ */
768
+ locatorForProcess(process: string, selector: string | Selector, windowSelector?: string | undefined | null): Locator
769
+ /**
770
+ * (async) Get the currently focused window.
771
+ *
772
+ * @returns {Promise<Element>} The current window element.
773
+ */
774
+ getCurrentWindow(): Promise<Element>
775
+ /**
776
+ * (async) Get the currently focused application.
777
+ *
778
+ * @returns {Promise<Element>} The current application element.
779
+ */
780
+ getCurrentApplication(): Promise<Element>
781
+ /**
782
+ * Get the currently focused element.
783
+ *
784
+ * @returns {Element} The focused element.
785
+ */
786
+ focusedElement(): Element
787
+ /**
788
+ * Open a URL in a browser.
789
+ *
790
+ * @param {string} url - The URL to open.
791
+ * @param {string} [browser] - The browser to use. Can be "Default", "Chrome", "Firefox", "Edge", "Brave", "Opera", "Vivaldi", or a custom browser path.
792
+ * @param {boolean} [includeWindowScreenshot=true] - Whether to capture window screenshot after opening
793
+ * @param {boolean} [includeMonitorScreenshots=false] - Whether to capture monitor screenshots after opening
794
+ */
795
+ openUrl(url: string, browser?: string | undefined | null, includeWindowScreenshot?: boolean | undefined | null, includeMonitorScreenshots?: boolean | undefined | null): Element
796
+ /**
797
+ * Open a file with its default application.
798
+ *
799
+ * @param {string} filePath - Path to the file to open.
800
+ * @param {string} [process] - Process name for window screenshot capture. If provided, enables window screenshots.
801
+ * @param {boolean} [includeWindowScreenshot=true] - Whether to capture window screenshot (requires process).
802
+ * @param {boolean} [includeMonitorScreenshots=false] - Whether to capture monitor screenshots after opening.
803
+ */
804
+ openFile(filePath: string, process?: string | undefined | null, includeWindowScreenshot?: boolean | undefined | null, includeMonitorScreenshots?: boolean | undefined | null): void
805
+ /**
806
+ * Activate a browser window by title.
807
+ *
808
+ * @param {string} title - The window title to match.
809
+ */
810
+ activateBrowserWindowByTitle(title: string): void
811
+ /**
812
+ * Get the UI tree for a window identified by process name and optional title.
813
+ *
814
+ * @param {string} process - Process name to match (e.g., 'chrome', 'notepad').
815
+ * @param {string} [title] - Optional window title filter.
816
+ * @param {TreeBuildConfig} [config] - Optional configuration for tree building.
817
+ * @returns {UINode} Complete UI tree starting from the identified window.
818
+ */
819
+ getWindowTree(process: string, title?: string | undefined | null, config?: TreeBuildConfig | undefined | null): UINode
820
+ /**
821
+ * Get the UI tree with full result including formatting and bounds mapping.
822
+ *
823
+ * This is the recommended method for getting window trees when you need:
824
+ * - Formatted YAML output for LLM consumption
825
+ * - Index-to-bounds mapping for click targeting
826
+ * - Browser detection
827
+ *
828
+ * @param {string} process - Process name to match (e.g., 'chrome', 'notepad').
829
+ * @param {string} [title] - Optional window title filter.
830
+ * @param {TreeBuildConfig} [config] - Configuration options:
831
+ * - formatOutput: Enable formatted output (default: true if treeOutputFormat set)
832
+ * - treeOutputFormat: 'CompactYaml' (default) or 'VerboseJson'
833
+ * - treeFromSelector: Selector to start tree from (use getWindowTreeResultAsync for this)
834
+ * - includeWindowScreenshot: Save window screenshot to executions dir (default: false)
835
+ * - includeMonitorScreenshots: Save all monitor screenshots to executions dir (default: false)
836
+ * @returns {WindowTreeResult} Complete result with tree, formatted output, bounds mapping, and screenshot paths.
837
+ */
838
+ getWindowTreeResult(process: string, title?: string | undefined | null, config?: TreeBuildConfig | undefined | null): WindowTreeResult
839
+ /**
840
+ * (async) Get the UI tree with full result, supporting the treeFromSelector config option.
841
+ *
842
+ * Use this method when you need to scope the tree to a specific subtree using a selector.
843
+ *
844
+ * @param {string} process - Process name to match (e.g., 'chrome', 'notepad').
845
+ * @param {string} [title] - Optional window title filter.
846
+ * @param {TreeBuildConfig} [config] - Configuration options:
847
+ * - formatOutput: Enable formatted output (default: true)
848
+ * - treeOutputFormat: 'CompactYaml' (default) or 'VerboseJson'
849
+ * - treeFromSelector: Selector to start tree from (e.g., "role:Dialog")
850
+ * @returns {Promise<WindowTreeResult>} Complete result with tree, formatted output, and bounds mapping.
851
+ */
852
+ getWindowTreeResultAsync(process: string, title?: string | undefined | null, config?: TreeBuildConfig | undefined | null): Promise<WindowTreeResult>
853
+ /**
854
+ * (async) List all available monitors/displays.
855
+ *
856
+ * @returns {Promise<Array<Monitor>>} List of monitor information.
857
+ */
858
+ listMonitors(): Promise<Array<Monitor>>
859
+ /**
860
+ * (async) Get the primary monitor.
861
+ *
862
+ * @returns {Promise<Monitor>} Primary monitor information.
863
+ */
864
+ getPrimaryMonitor(): Promise<Monitor>
865
+ /**
866
+ * (async) Get the monitor containing the currently focused window.
867
+ *
868
+ * @returns {Promise<Monitor>} Active monitor information.
869
+ */
870
+ getActiveMonitor(): Promise<Monitor>
871
+ /**
872
+ * (async) Get a monitor by its ID.
873
+ *
874
+ * @param {string} id - The monitor ID to find.
875
+ * @returns {Promise<Monitor>} Monitor information.
876
+ */
877
+ getMonitorById(id: string): Promise<Monitor>
878
+ /**
879
+ * (async) Get a monitor by its name.
880
+ *
881
+ * @param {string} name - The monitor name to find.
882
+ * @returns {Promise<Monitor>} Monitor information.
883
+ */
884
+ getMonitorByName(name: string): Promise<Monitor>
885
+ /**
886
+ * (async) Capture a screenshot of a specific monitor.
887
+ *
888
+ * @param {Monitor} monitor - The monitor to capture.
889
+ * @returns {Promise<ScreenshotResult>} The screenshot data.
890
+ */
891
+ captureMonitor(monitor: Monitor): Promise<ScreenshotResult>
892
+ /**
893
+ * (async) Capture screenshots of all monitors.
894
+ *
895
+ * @returns {Promise<Array<MonitorScreenshotPair>>} Array of {monitor, screenshot} pairs.
896
+ */
897
+ captureAllMonitors(): Promise<Array<MonitorScreenshotPair>>
898
+ /**
899
+ * Capture a screenshot of a window by process name.
900
+ *
901
+ * Finds the first window matching the given process name and captures its screenshot.
902
+ * Process name matching is case-insensitive and uses substring matching.
903
+ *
904
+ * @param {string} process - Process name to match (e.g., "chrome", "notepad", "code")
905
+ * @returns {ScreenshotResult} The screenshot data.
906
+ */
907
+ captureWindowByProcess(process: string): ScreenshotResult
908
+ /**
909
+ * (async) Captures a screenshot. Three modes:
910
+ * 1. Element mode: provide process + selector to capture specific element
911
+ * 2. Window mode: provide process only to capture entire window
912
+ * 3. Monitor mode: provide process + entireMonitor=true to capture the monitor where the window is located
913
+ *
914
+ * @param {string} process - Process name to match (e.g., "chrome", "notepad", "code")
915
+ * @param {string} [selector] - Optional selector to capture a specific element within the process
916
+ * @param {boolean} [entireMonitor=false] - If true, captures the entire monitor containing the window
917
+ * @param {number} [timeoutMs=10000] - Timeout in milliseconds for finding the element
918
+ * @returns {Promise<ScreenshotResult>} The screenshot data.
919
+ */
920
+ captureScreenshot(process: string, selector?: string | undefined | null, entireMonitor?: boolean | undefined | null, timeoutMs?: number | undefined | null): Promise<ScreenshotResult>
921
+ /**
922
+ * Convert a screenshot to PNG bytes.
923
+ * Converts BGRA to RGBA and encodes as PNG format.
924
+ *
925
+ * @param {ScreenshotResult} screenshot - The screenshot to convert.
926
+ * @returns {Array<number>} PNG-encoded bytes.
927
+ */
928
+ screenshotToPng(screenshot: ScreenshotResult): Array<number>
929
+ /**
930
+ * Convert a screenshot to PNG bytes with resizing.
931
+ * If the image exceeds maxDimension in either width or height,
932
+ * it will be resized while maintaining aspect ratio.
933
+ *
934
+ * @param {ScreenshotResult} screenshot - The screenshot to convert.
935
+ * @param {number} [maxDimension] - Maximum width or height. Defaults to 1920.
936
+ * @returns {Array<number>} PNG-encoded bytes (potentially resized).
937
+ */
938
+ screenshotToPngResized(screenshot: ScreenshotResult, maxDimension?: number | undefined | null): Array<number>
939
+ /**
940
+ * Convert a screenshot to base64-encoded PNG string.
941
+ * Useful for embedding in JSON responses or passing to LLMs.
942
+ *
943
+ * @param {ScreenshotResult} screenshot - The screenshot to convert.
944
+ * @returns {string} Base64-encoded PNG string.
945
+ */
946
+ screenshotToBase64Png(screenshot: ScreenshotResult): string
947
+ /**
948
+ * Convert a screenshot to base64-encoded PNG string with resizing.
949
+ * If the image exceeds maxDimension in either width or height,
950
+ * it will be resized while maintaining aspect ratio.
951
+ *
952
+ * @param {ScreenshotResult} screenshot - The screenshot to convert.
953
+ * @param {number} [maxDimension] - Maximum width or height. Defaults to 1920.
954
+ * @returns {string} Base64-encoded PNG string (potentially resized).
955
+ */
956
+ screenshotToBase64PngResized(screenshot: ScreenshotResult, maxDimension?: number | undefined | null): string
957
+ /**
958
+ * Get the dimensions a screenshot would have after resizing.
959
+ *
960
+ * @param {ScreenshotResult} screenshot - The screenshot to check.
961
+ * @param {number} maxDimension - Maximum width or height.
962
+ * @returns {ResizedDimensions} Object with width and height after resize.
963
+ */
964
+ screenshotResizedDimensions(screenshot: ScreenshotResult, maxDimension: number): ResizedDimensions
965
+ /**
966
+ * (async) Get all window elements for a given application name.
967
+ *
968
+ * @param {string} name - The name of the application whose windows will be retrieved.
969
+ * @returns {Promise<Array<Element>>} A list of window elements belonging to the application.
970
+ */
971
+ windowsForApplication(name: string): Promise<Array<Element>>
972
+ /**
973
+ * (async) Get the UI tree for all open applications in parallel.
974
+ *
975
+ * @returns {Promise<Array<UINode>>} List of UI trees for all applications.
976
+ */
977
+ getAllApplicationsTree(): Promise<Array<UINode>>
978
+ /**
979
+ * (async) Press a key globally.
980
+ *
981
+ * @param {string} key - The key to press (e.g., "Enter", "Ctrl+C", "F1").
982
+ * @param {string} [process] - Process name for window screenshot capture. If provided, enables window screenshots.
983
+ * @param {boolean} [includeWindowScreenshot=true] - Whether to capture window screenshot (requires process).
984
+ * @param {boolean} [includeMonitorScreenshots=false] - Whether to capture monitor screenshots after key press.
985
+ */
986
+ pressKey(key: string, process?: string | undefined | null, includeWindowScreenshot?: boolean | undefined | null, includeMonitorScreenshots?: boolean | undefined | null): Promise<void>
987
+ /**
988
+ * (async) Execute JavaScript in a browser tab.
989
+ * Finds the browser window by process name and executes the script.
990
+ *
991
+ * @param {string} script - The JavaScript code to execute in browser context.
992
+ * @param {string} process - Process name to scope the browser window (e.g., 'chrome', 'msedge'). Required.
993
+ * @param {number} [timeoutMs=10000] - Timeout in milliseconds for finding the browser window.
994
+ * @returns {Promise<string>} The result of script execution.
995
+ */
996
+ executeBrowserScript(script: string, process: string, timeoutMs?: number | undefined | null): Promise<string>
997
+ /**
998
+ * (async) Close a browser tab safely.
999
+ *
1000
+ * This method can identify the tab to close by:
1001
+ * - tabId: Close a specific tab by its Chrome tab ID
1002
+ * - url: Find and close a tab matching this URL (partial match supported)
1003
+ * - title: Find and close a tab matching this title (case-insensitive partial match)
1004
+ * - If none provided, closes the currently active tab
1005
+ *
1006
+ * Returns information about the closed tab for verification.
1007
+ * Returns null if no browser extension is connected or tab couldn't be found.
1008
+ *
1009
+ * Safety:
1010
+ * - Will NOT close protected browser pages (chrome://, edge://, about:, etc.)
1011
+ * - Returns the closed tab's URL/title so you can verify the correct tab was closed
1012
+ *
1013
+ * @param {number} [options.tabId] - Specific Chrome tab ID to close.
1013
+ * @param {string} [options.url] - URL to match (partial match supported).
1014
+ * @param {string} [options.title] - Title to match (case-insensitive partial match).
1016
+ * @returns {Promise<CloseTabResult | null>} Info about closed tab, or null if no extension/tab found.
1017
+ *
1018
+ * @example
1019
+ * // Close by URL
1020
+ * const result = await desktop.closeTab({ url: "example.com" });
1021
+ *
1022
+ * @example
1023
+ * // Close by title
1024
+ * const result = await desktop.closeTab({ title: "My Page" });
1025
+ *
1026
+ * @example
1027
+ * // Close active tab
1028
+ * const result = await desktop.closeTab();
1029
+ */
1030
+ closeTab(options?: CloseTabOptions | undefined | null): Promise<CloseTabResult | null>
1031
+ /**
1032
+ * (async) Delay execution for a specified number of milliseconds.
1033
+ * Useful for waiting between actions to ensure UI stability.
1034
+ *
1035
+ * @param {number} delayMs - Delay in milliseconds.
1036
+ * @returns {Promise<void>}
1037
+ */
1038
+ delay(delayMs: number): Promise<void>
1039
+ /**
1040
+ * Navigate to a URL in a browser.
1041
+ * This is the recommended method for browser navigation - more reliable than
1042
+ * manually manipulating the address bar with keyboard/mouse actions.
1043
+ *
1044
+ * @param {string} url - URL to navigate to
1045
+ * @param {string | null} [browser] - Optional browser name ('Chrome', 'Firefox', 'Edge', 'Brave', 'Opera', 'Vivaldi', or 'Default')
1046
+ * @param {boolean} [includeWindowScreenshot=true] - Whether to capture window screenshot after navigation
1047
+ * @param {boolean} [includeMonitorScreenshots=false] - Whether to capture monitor screenshots after navigation
1048
+ * @returns {Element} The browser window element
1049
+ */
1050
+ navigateBrowser(url: string, browser?: string | undefined | null, includeWindowScreenshot?: boolean | undefined | null, includeMonitorScreenshots?: boolean | undefined | null): Element
1051
+ /**
1052
+ * (async) Set the zoom level to a specific percentage.
1053
+ *
1054
+ * @param {number} percentage - The zoom percentage (e.g., 100 for 100%, 150 for 150%, 50 for 50%).
1055
+ */
1056
+ setZoom(percentage: number): Promise<void>
1057
+ /**
1058
+ * (async) Run Gemini Computer Use agentic loop.
1059
+ *
1060
+ * Provide a goal and target process, and this will autonomously take actions
1061
+ * (click, type, scroll, etc.) until the goal is achieved or maxSteps is reached.
1062
+ * Uses Gemini's vision model to analyze screenshots and decide actions.
1063
+ *
1064
+ * @param {string} process - Process name of the target application (e.g., "chrome", "notepad")
1065
+ * @param {string} goal - What to achieve (e.g., "Open Notepad and type Hello World")
1066
+ * @param {number} [maxSteps=20] - Maximum number of steps before stopping
1067
+ * @param {function} [onStep] - Optional callback invoked after each step with step details
1068
+ * @returns {Promise<ComputerUseResult>} Result with status, steps executed, and history
1069
+ */
1070
+ geminiComputerUse(process: string, goal: string, maxSteps?: number | undefined | null, onStep?: ((err: null | Error, step: ComputerUseStep) => void) | undefined): Promise<ComputerUseResult>
1071
+ /**
1072
+ * Stop all currently executing operations.
1073
+ *
1074
+ * This cancels the internal cancellation token, which will cause any
1075
+ * operations that check `isCancelled()` to abort. After calling this,
1076
+ * you should create a new Desktop instance to start fresh.
1077
+ */
1078
+ stopExecution(): void
1079
+ /**
1080
+ * Check if execution has been cancelled.
1081
+ *
1082
+ * Returns `true` if `stopExecution()` has been called.
1083
+ * Long-running operations should periodically check this and abort if true.
1084
+ */
1085
+ isCancelled(): boolean
1086
+ /**
1087
+ * Stop all active highlight overlays globally.
1088
+ *
1089
+ * This finds and destroys all highlight overlay windows that were created
1090
+ * by `element.highlight()`. Useful for cleaning up highlights without
1091
+ * needing to track individual HighlightHandle objects.
1092
+ *
1093
+ * @returns {number} The number of highlights that were stopped.
1094
+ */
1095
+ stopHighlighting(): number
1096
+ /**
1097
+ * Show inspect overlay with indexed elements for visual debugging.
1098
+ *
1099
+ * Displays a transparent overlay window with colored rectangles around UI elements,
1100
+ * showing their index numbers for click targeting. Use `hideInspectOverlay()` to remove.
1101
+ *
1102
+ * @param {InspectElement[]} elements - Array of elements to highlight with their bounds.
1103
+ * @param {Bounds} windowBounds - The window bounds {x, y, width, height} to constrain the overlay.
1104
+ * @param {OverlayDisplayMode} [displayMode='Index'] - What to show in labels: 'Index', 'Role', 'Name', etc.
1105
+ */
1106
+ showInspectOverlay(elements: Array<InspectElement>, windowBounds: Bounds, displayMode?: OverlayDisplayMode | undefined | null): void
1107
+ /** Show inspect overlay (non-Windows stub). */
1108
+ showInspectOverlay(elements: Array<InspectElement>, windowBounds: Bounds, displayMode?: OverlayDisplayMode | undefined | null): void
1109
+ /**
1110
+ * Hide any active inspect overlay.
1111
+ *
1112
+ * This hides the visual overlay that was shown via `showInspectOverlay()`.
1113
+ * Can be called from any thread.
1114
+ */
1115
+ hideInspectOverlay(): void
1116
+ /**
1117
+ * Verify that an element matching the selector exists within the same application as the scope element.
1118
+ *
1119
+ * This is used for post-action verification - checking that an expected element appeared after
1120
+ * performing an action (e.g., a success dialog after clicking submit).
1121
+ *
1122
+ * @param {Element} scopeElement - The element to get the application scope from (typically the element the action was performed on)
1123
+ * @param {string} selector - The selector string to search for
1124
+ * @param {number} [timeoutMs=2000] - How long to wait for the element to appear in milliseconds
1125
+ * @returns {Promise<Element>} Resolves with the found element if verification passes
1126
+ * @throws Error if the element is not found within the timeout
1127
+ */
1128
+ verifyElementExists(scopeElement: Element, selector: string, timeoutMs?: number | undefined | null): Promise<Element>
1129
+ /**
1130
+ * Verify that an element matching the selector does NOT exist within the same application as the scope element.
1131
+ *
1132
+ * This is used for post-action verification - checking that an element disappeared after
1133
+ * performing an action (e.g., a modal dialog closed after clicking OK).
1134
+ *
1135
+ * @param {Element} scopeElement - The element to get the application scope from (typically the element the action was performed on)
1136
+ * @param {string} selector - The selector string that should NOT be found
1137
+ * @param {number} [timeoutMs=2000] - How long to wait/check that the element doesn't appear in milliseconds
1138
+ * @returns {Promise<void>}
1139
+ * @throws Error if the element IS found (meaning verification failed)
1140
+ */
1141
+ verifyElementNotExists(scopeElement: Element, selector: string, timeoutMs?: number | undefined | null): Promise<void>
1142
+ }
1143
+ /** A UI element in the accessibility tree. */
1144
+ export declare class Element {
1145
+ /**
1146
+ * Get the element's ID.
1147
+ *
1148
+ * @returns {string | null} The element's ID, if available.
1149
+ */
1150
+ id(): string | null
1151
+ /**
1152
+ * Get the element's role.
1153
+ *
1154
+ * @returns {string} The element's role (e.g., "button", "textfield").
1155
+ */
1156
+ role(): string
1157
+ /**
1158
+ * Get all attributes of the element.
1159
+ *
1160
+ * @returns {UIElementAttributes} The element's attributes.
1161
+ */
1162
+ attributes(): UIElementAttributes
1163
+ /**
1164
+ * Get the element's name.
1165
+ *
1166
+ * @returns {string | null} The element's name, if available.
1167
+ */
1168
+ name(): string | null
1169
+ /**
1170
+ * Get children of this element.
1171
+ *
1172
+ * @returns {Array<Element>} List of child elements.
1173
+ */
1174
+ children(): Array<Element>
1175
+ /**
1176
+ * Get the parent element.
1177
+ *
1178
+ * @returns {Element | null} The parent element, if available.
1179
+ */
1180
+ parent(): Element | null
1181
+ /**
1182
+ * Get element bounds.
1183
+ *
1184
+ * @returns {Bounds} The element's bounds (x, y, width, height).
1185
+ */
1186
+ bounds(): Bounds
1187
+ /**
1188
+ * Click on this element.
1189
+ *
1190
+ * @param {ActionOptions} [options] - Options for the click action.
1191
+ * @returns {Promise<ClickResult>} Result of the click operation.
1192
+ */
1193
+ click(options?: ActionOptions | undefined | null): Promise<ClickResult>
1194
+ /**
1195
+ * Double click on this element.
1196
+ *
1197
+ * @param {ActionOptions} [options] - Options for the double click action.
1198
+ * @returns {ClickResult} Result of the click operation.
1199
+ */
1200
+ doubleClick(options?: ActionOptions | undefined | null): ClickResult
1201
+ /**
1202
+ * Right click on this element.
1203
+ *
1204
+ * @param {ActionOptions} [options] - Options for the right click action.
1205
+ */
1206
+ rightClick(options?: ActionOptions | undefined | null): void
1207
+ /**
1208
+ * Hover over this element.
1209
+ *
1210
+ * @param {ActionOptions} [options] - Optional action options.
1211
+ */
1212
+ hover(options?: ActionOptions | undefined | null): void
1213
+ /**
1214
+ * Check if element is visible.
1215
+ *
1216
+ * @returns {boolean} True if the element is visible.
1217
+ */
1218
+ isVisible(): boolean
1219
+ /**
1220
+ * Check if element is enabled.
1221
+ *
1222
+ * @returns {boolean} True if the element is enabled.
1223
+ */
1224
+ isEnabled(): boolean
1225
+ /** Focus this element. */
1226
+ focus(): void
1227
+ /**
1228
+ * Get text content of this element.
1229
+ *
1230
+ * @param {number} [maxDepth] - Maximum depth to search for text.
1231
+ * @returns {string} The element's text content.
1232
+ */
1233
+ text(maxDepth?: number | undefined | null): string
1234
+ /**
1235
+ * Type text into this element.
1236
+ *
1237
+ * @param {string} text - The text to type.
1238
+ * @param {TypeTextOptions} [options] - Options for typing.
1239
+ * @returns {ActionResult} Result of the type operation.
1240
+ */
1241
+ typeText(text: string, options?: TypeTextOptions | undefined | null): ActionResult
1242
+ /**
1243
+ * Press a key while this element is focused.
1244
+ *
1245
+ * @param {string} key - The key to press.
1246
+ * @param {ActionOptions} [options] - Options for the key press action.
1247
+ * @returns {ActionResult} Result of the key press operation.
1248
+ */
1249
+ pressKey(key: string, options?: ActionOptions | undefined | null): ActionResult
1250
+ /**
1251
+ * Set value of this element.
1252
+ *
1253
+ * @param {string} value - The value to set.
1254
+ * @param {ActionOptions} [options] - Options for the set value action.
1255
+ * @returns {ActionResult} Result of the set value operation.
1256
+ */
1257
+ setValue(value: string, options?: ActionOptions | undefined | null): ActionResult
1258
+ /**
1259
+ * Perform a named action on this element.
1260
+ *
1261
+ * @param {string} action - The action to perform.
1262
+ */
1263
+ performAction(action: string): void
1264
+ /**
1265
+ * Invoke this element (triggers the default action).
1266
+ * This is often more reliable than clicking for controls like radio buttons or menu items.
1267
+ *
1268
+ * @param {ActionOptions} [options] - Options for the invoke action.
1269
+ * @returns {ActionResult} Result of the invoke operation.
1270
+ */
1271
+ invoke(options?: ActionOptions | undefined | null): ActionResult
1272
+ /**
1273
+ * Scroll the element in a given direction.
1274
+ *
1275
+ * @param {string} direction - The direction to scroll.
1276
+ * @param {number} amount - The amount to scroll.
1277
+ * @param {ActionOptions} [options] - Options for the scroll action.
1278
+ * @returns {ActionResult} Result of the scroll operation.
1279
+ */
1280
+ scroll(direction: string, amount: number, options?: ActionOptions | undefined | null): ActionResult
1281
+ /** Activate the window containing this element. */
1282
+ activateWindow(): void
1283
+ /** Minimize the window containing this element. */
1284
+ minimizeWindow(): void
1285
+ /** Maximize the window containing this element. */
1286
+ maximizeWindow(): void
1287
+ /**
1288
+ * Check if element is focused.
1289
+ *
1290
+ * @returns {boolean} True if the element is focused.
1291
+ */
1292
+ isFocused(): boolean
1293
+ /**
1294
+ * Check if element is keyboard focusable.
1295
+ *
1296
+ * @returns {boolean} True if the element can receive keyboard focus.
1297
+ */
1298
+ isKeyboardFocusable(): boolean
1299
+ /**
1300
+ * Drag mouse from start to end coordinates.
1301
+ *
1302
+ * @param {number} startX - Starting X coordinate.
1303
+ * @param {number} startY - Starting Y coordinate.
1304
+ * @param {number} endX - Ending X coordinate.
1305
+ * @param {number} endY - Ending Y coordinate.
1306
+ * @param {ActionOptions} [options] - Optional action options.
1307
+ */
1308
+ mouseDrag(startX: number, startY: number, endX: number, endY: number, options?: ActionOptions | undefined | null): void
1309
+ /**
1310
+ * Press and hold mouse at coordinates.
1311
+ *
1312
+ * @param {number} x - X coordinate.
1313
+ * @param {number} y - Y coordinate.
1314
+ */
1315
+ mouseClickAndHold(x: number, y: number): void
1316
+ /**
1317
+ * Move mouse to coordinates.
1318
+ *
1319
+ * @param {number} x - X coordinate.
1320
+ * @param {number} y - Y coordinate.
1321
+ */
1322
+ mouseMove(x: number, y: number): void
1323
+ /**
1324
+ * Release mouse button.
1325
+ *
1326
+ * @param {ActionOptions} [options] - Optional action options.
1327
+ */
1328
+ mouseRelease(options?: ActionOptions | undefined | null): void
1329
+ /**
1330
+ * Create a locator from this element.
1331
+ * Accepts either a selector string or a Selector object.
1332
+ *
1333
+ * @param {string | Selector} selector - The selector.
1334
+ * @returns {Locator} A new locator for finding elements.
1335
+ */
1336
+ locator(selector: string | Selector): Locator
1337
+ /**
1338
+ * Get the containing application element.
1339
+ *
1340
+ * @returns {Element | null} The containing application element, if available.
1341
+ */
1342
+ application(): Element | null
1343
+ /**
1344
+ * Get the containing window element.
1345
+ *
1346
+ * @returns {Element | null} The containing window element, if available.
1347
+ */
1348
+ window(): Element | null
1349
+ /**
1350
+ * Highlights the element with a colored border and optional text overlay.
1351
+ *
1352
+ * @param {number} [color] - Optional BGR color code (32-bit integer). Default: 0x0000FF (red)
1353
+ * @param {number} [durationMs] - Optional duration in milliseconds.
1354
+ * @param {string} [text] - Optional text to display. Text will be truncated to 10 characters.
1355
+ * @param {TextPosition} [textPosition] - Optional position for the text overlay (default: Top)
1356
+ * @param {FontStyle} [fontStyle] - Optional font styling for the text
1357
+ * @returns {HighlightHandle} Handle that can be used to close the highlight early
1358
+ */
1359
+ highlight(color?: number | undefined | null, durationMs?: number | undefined | null, text?: string | undefined | null, textPosition?: TextPosition | undefined | null, fontStyle?: FontStyle | undefined | null): HighlightHandle
1360
+ /**
1361
+ * Capture a screenshot of this element.
1362
+ *
1363
+ * @returns {ScreenshotResult} The screenshot data containing image data and dimensions.
1364
+ */
1365
+ capture(): ScreenshotResult
1366
+ /**
1367
+ * Get the process ID of the application containing this element.
1368
+ *
1369
+ * @returns {number} The process ID.
1370
+ */
1371
+ processId(): number
1372
+ /**
1373
+ * Get the process name of the application containing this element.
1374
+ *
1375
+ * @returns {string} The process name (e.g., "chrome", "notepad").
1376
+ */
1377
+ processName(): string
1378
+ toString(): string
1379
+ /**
1380
+ * Sets the transparency of the window.
1381
+ *
1382
+ * @param {number} percentage - The transparency percentage from 0 (completely transparent) to 100 (completely opaque).
1383
+ * @returns {void}
1384
+ */
1385
+ setTransparency(percentage: number): void
1386
+ /**
1387
+ * Close the element if it's closable (like windows, applications).
1388
+ * Does nothing for non-closable elements (like buttons, text, etc.).
1389
+ *
1390
+ * @returns {void}
1391
+ */
1392
+ close(): void
1393
+ /**
1394
+ * Get the monitor containing this element.
1395
+ *
1396
+ * @returns {Monitor} The monitor information for the display containing this element.
1397
+ */
1398
+ monitor(): Monitor
1399
+ /**
1400
+ * Scrolls the element into view within its window viewport.
1401
+ * If the element is already visible, returns immediately.
1402
+ *
1403
+ * @returns {void}
1404
+ */
1405
+ scrollIntoView(): void
1406
+ /**
1407
+ * Selects an option in a dropdown or combobox by its visible text.
1408
+ *
1409
+ * @param {string} optionName - The visible text of the option to select.
1410
+ * @param {ActionOptions} [options] - Optional action options; may be omitted or passed as `undefined`/`null`.
1411
+ * @returns {void}
1412
+ */
1413
+ selectOption(optionName: string, options?: ActionOptions | undefined | null): void
1414
+ /**
1415
+ * Lists the option strings offered by a dropdown or list box.
1416
+ *
1417
+ * @returns {Array<string>} The available option strings.
1418
+ */
1419
+ listOptions(): Array<string>
1420
+ /**
1421
+ * Checks whether a control (such as a checkbox or toggle switch) is currently toggled on.
1422
+ *
1423
+ * @returns {boolean} True if the control is toggled on, false otherwise.
1424
+ */
1425
+ isToggled(): boolean
1426
+ /**
1427
+ * Sets the state of a toggleable control.
1428
+ * Acts only when the control is not already in the desired state.
1429
+ *
1430
+ * @param {boolean} state - The desired toggle state.
1431
+ * @param {ActionOptions} [options] - Optional action options; may be omitted or passed as `undefined`/`null`.
1432
+ * @returns {void}
1433
+ */
1434
+ setToggled(state: boolean, options?: ActionOptions | undefined | null): void
1435
+ /**
1436
+ * Checks whether this element is selected (e.g., a list item, tree node, or tab).
1437
+ *
1438
+ * @returns {boolean} True if the element is selected, false otherwise.
1439
+ */
1440
+ isSelected(): boolean
1441
+ /**
1442
+ * Sets the selection state of a selectable item.
1443
+ * Acts only when the element is not already in the desired state.
1444
+ *
1445
+ * @param {boolean} state - The desired selection state.
1446
+ * @param {ActionOptions} [options] - Optional action options; may be omitted or passed as `undefined`/`null`.
1447
+ * @returns {void}
1448
+ */
1449
+ setSelected(state: boolean, options?: ActionOptions | undefined | null): void
1450
+ /**
1451
+ * Gets the current value of a range-based control such as a slider or progress bar.
1452
+ *
1453
+ * @returns {number} The current value of the range control.
1454
+ */
1455
+ getRangeValue(): number
1456
+ /**
1457
+ * Sets the value of a range-based control such as a slider.
1458
+ *
1459
+ * @param {number} value - The value to set.
1460
+ * @param {ActionOptions} [options] - Optional action options; may be omitted or passed as `undefined`/`null`.
1461
+ * @returns {void}
1462
+ */
1463
+ setRangeValue(value: number, options?: ActionOptions | undefined | null): void
1464
+ /**
1465
+ * Gets the value attribute of this element (e.g., for text inputs or combo boxes).
1466
+ *
1467
+ * @returns {string | null} The value attribute, or null if it is not available.
1468
+ */
1469
+ getValue(): string | null
1470
+ /**
1471
+ * (async) Execute JavaScript in the web browser using the dev tools console.
1472
+ * The returned Promise resolves with the result of the script execution as a string.
1473
+ *
1474
+ * @param {string} script - The JavaScript code to execute.
1475
+ * @returns {Promise<string>} The result of script execution.
1476
+ */
1477
+ executeBrowserScript(script: string): Promise<string>
1478
+ /**
1479
+ * Get the UI tree rooted at this element.
1480
+ * The returned tree contains this element and all of its descendants, up to `maxDepth`.
1481
+ *
1482
+ * @param {number} [maxDepth=100] - Maximum depth to traverse; may be omitted or passed as `undefined`/`null`.
1483
+ * @returns {UINode} Tree structure with recursive children.
1484
+ */
1485
+ getTree(maxDepth?: number | undefined | null): UINode
1486
+ }
1487
+ /** Locator for finding UI elements by selector. Query methods (`first`, `all`, `validate`, `waitFor`) return Promises; builder methods (`timeout`, `within`, `locator`) return a new Locator. */
1488
+ export declare class Locator {
1489
+ /**
1490
+ * (async) Get the first matching element.
1491
+ *
1492
+ * @param {number} [timeoutMs] - Timeout in milliseconds (default: 10000); may be omitted or passed as `undefined`/`null`.
1493
+ * @returns {Promise<Element>} The first matching element.
1494
+ */
1495
+ first(timeoutMs?: number | undefined | null): Promise<Element>
1496
+ /**
1497
+ * (async) Get all matching elements.
1498
+ *
1499
+ * @param {number} timeoutMs - Timeout in milliseconds (required, unlike `first`).
1500
+ * @param {number} [depth] - Maximum depth to search; may be omitted or passed as `undefined`/`null`.
1501
+ * @returns {Promise<Array<Element>>} List of matching elements.
1502
+ */
1503
+ all(timeoutMs: number, depth?: number | undefined | null): Promise<Array<Element>>
1504
+ /**
1505
+ * Set a default timeout for this locator.
1506
+ *
1507
+ * @param {number} timeoutMs - Timeout in milliseconds.
1508
+ * @returns {Locator} A new locator with the specified timeout.
1509
+ */
1510
+ timeout(timeoutMs: number): Locator
1511
+ /**
1512
+ * Set the root element for this locator.
1513
+ *
1514
+ * @param {Element} element - The root element.
1515
+ * @returns {Locator} A new locator with the specified root element.
1516
+ */
1517
+ within(element: Element): Locator
1518
+ /**
1519
+ * Chain another selector onto this locator.
1520
+ * Accepts either a selector string or a Selector object.
1521
+ *
1522
+ * @param {string | Selector} selector - The selector to chain.
1523
+ * @returns {Locator} A new locator with the chained selector.
1524
+ */
1525
+ locator(selector: string | Selector): Locator
1526
+ /**
1527
+ * (async) Validate element existence without throwing an error.
1528
+ *
1529
+ * @param {number} timeoutMs - Timeout in milliseconds (required).
1530
+ * @returns {Promise<ValidationResult>} Validation result with exists flag and optional element.
1531
+ */
1532
+ validate(timeoutMs: number): Promise<ValidationResult>
1533
+ /**
1534
+ * (async) Wait for an element to meet a specific condition.
1535
+ *
1536
+ * @param {string} condition - Condition to wait for: 'exists', 'visible', 'enabled', or 'focused'. Typed as a plain string, so other values are not rejected by the type checker.
1537
+ * @param {number} timeoutMs - Timeout in milliseconds (required).
1538
+ * @returns {Promise<Element>} The element when condition is met.
1539
+ */
1540
+ waitFor(condition: string, timeoutMs: number): Promise<Element>
1541
+ }
1542
+ /** Selector for locating UI elements. Provides a typed alternative to the string-based selector API. Static methods create selectors; `chain` and `visible` are instance methods that return a Selector. */
1543
+ export declare class Selector {
1544
+ /** Create a selector that matches elements by their accessibility `name`. */
1545
+ static name(name: string): Selector
1546
+ /** Create a selector that matches elements by role (and optionally name; `name` may be omitted or passed as `undefined`/`null`). */
1547
+ static role(role: string, name?: string | undefined | null): Selector
1548
+ /** Create a selector that matches elements by accessibility `id`. */
1549
+ static id(id: string): Selector
1550
+ /** Create a selector that matches elements by the text they display. */
1551
+ static text(text: string): Selector
1552
+ /** Create a selector from an XPath-like path string. */
1553
+ static path(path: string): Selector
1554
+ /** Create a selector that matches elements by a native automation id (e.g., AutomationID on Windows). */
1555
+ static nativeId(id: string): Selector
1556
+ /** Create a selector that matches elements by their class name. */
1557
+ static className(name: string): Selector
1558
+ /** Create a selector from an arbitrary attribute map (string keys to string values). */
1559
+ static attributes(attributes: Record<string, string>): Selector
1560
+ /** Chain another selector onto this selector and return the resulting Selector. */
1561
+ chain(other: Selector): Selector
1562
+ /** Filter by visibility, using `isVisible` as the visibility state to match; returns a Selector. */
1563
+ visible(isVisible: boolean): Selector
1564
+ /**
1565
+ * Create a selector that selects the nth element from matches.
1566
+ * Non-negative values are 0-based from the start (0 = first, 1 = second).
1567
+ * Negative values are from the end (-1 = last, -2 = second-to-last).
1568
+ */
1569
+ static nth(index: number): Selector
1570
+ /**
1571
+ * Create a selector that matches elements having at least one descendant matching the inner selector.
1572
+ * This is similar to Playwright's :has() pseudo-class.
1573
+ */
1574
+ static has(innerSelector: Selector): Selector
1575
+ /**
1576
+ * Create a selector that navigates to the parent element.
1577
+ * This is similar to the `..` parent step in Playwright's selector syntax.
1578
+ */
1579
+ static parent(): Selector
1580
+ /**
1581
+ * Create a selector that scopes the search to a specific process.
1582
+ * This is typically used as the first part of a chained selector.
1583
+ * Example: `Selector.process("chrome").chain(Selector.role("Button", "Submit"))`
1584
+ */
1585
+ static process(processName: string): Selector
1586
+ /**
1587
+ * Create a selector that scopes the search to a specific window within a process.
1588
+ * Typically chained after a process selector.
1589
+ * Example: `Selector.process("notepad").chain(Selector.window("Untitled"))`
1590
+ */
1591
+ static window(title: string): Selector
1592
+ }
1593
+ /** Handle returned by `highlight(...)`, which documents it as a handle that can be used to close the highlight early. */
+ export declare class HighlightHandle {
1594
+ /** Close the highlight associated with this handle. NOTE(review): behavior when called more than once is not specified by these typings. */
+ close(): void
1595
+ }
1596
+ /**
1597
+ * Window manager for controlling window states.
1598
+ *
1599
+ * Provides functionality for:
1600
+ * - Enumerating windows with Z-order tracking
1601
+ * - Bringing windows to front (bypassing Windows focus-stealing prevention)
1602
+ * - Minimizing/maximizing windows
1603
+ * - Capturing and restoring window states for workflows
1604
+ */
1605
+ export declare class WindowManager {
1606
+ /** Create a new WindowManager instance. */
1607
+ constructor()
1608
+ /** (async) Update the window cache with current window information. */
1609
+ updateWindowCache(): Promise<void>
1610
+ /** (async) Get the topmost window for a process by name. Resolves to `null` — presumably when no matching window exists (confirm). */
1611
+ getTopmostWindowForProcess(process: string): Promise<WindowInfo | null>
1612
+ /** (async) Get the topmost window for a specific PID. Resolves to `null` — presumably when no matching window exists (confirm). */
1613
+ getTopmostWindowForPid(pid: number): Promise<WindowInfo | null>
1614
+ /** (async) Get all visible always-on-top windows. */
1615
+ getAlwaysOnTopWindows(): Promise<Array<WindowInfo>>
1616
+ /**
1617
+ * (async) Minimize only always-on-top windows (excluding the target).
1618
+ * Resolves to the number of windows minimized.
1619
+ */
1620
+ minimizeAlwaysOnTopWindows(targetHwnd: number): Promise<number>
1621
+ /** (async) Minimize all visible windows except the target. NOTE(review): presumably resolves to the number of windows minimized — confirm. */
1622
+ minimizeAllExcept(targetHwnd: number): Promise<number>
1623
+ /**
1624
+ * (async) Maximize the window if it is not already maximized.
1625
+ * Resolves to true if the window was maximized by this call (it wasn't already maximized).
1626
+ */
1627
+ maximizeIfNeeded(hwnd: number): Promise<boolean>
1628
+ /**
1629
+ * (async) Bring the window to the front using the AttachThreadInput trick.
1630
+ *
1631
+ * This uses AttachThreadInput to bypass Windows' focus-stealing prevention.
1632
+ * Resolves to true if the window is now in the foreground.
1633
+ */
1634
+ bringWindowToFront(hwnd: number): Promise<boolean>
1635
+ /**
1636
+ * (async) Minimize the window if it is not already minimized.
1637
+ * Resolves to true if the window was minimized by this call (it wasn't already minimized).
1638
+ */
1639
+ minimizeIfNeeded(hwnd: number): Promise<boolean>
1640
+ /** (async) Capture the current state before a workflow. */
1641
+ captureInitialState(): Promise<void>
1642
+ /**
1643
+ * (async) Restore windows that were minimized, and the target window, to their original state.
1644
+ * Resolves to the number of windows restored.
1645
+ */
1646
+ restoreAllWindows(): Promise<number>
1647
+ /** (async) Clear the captured state. */
1648
+ clearCapturedState(): Promise<void>
1649
+ /** (async) Check whether a process (by PID) is a UWP/Modern app. */
1650
+ isUwpApp(pid: number): Promise<boolean>
1651
+ /** (async) Track a window (by handle) as the target for restoration. */
1652
+ setTargetWindow(hwnd: number): Promise<void>
1653
+ }