windows-mcp 0.5.9__py3-none-any.whl → 0.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
windows_mcp/__main__.py CHANGED
@@ -56,7 +56,7 @@ mcp=FastMCP(name='windows-mcp',instructions=instructions,lifespan=lifespan)
56
56
 
57
57
  @mcp.tool(
58
58
  name="App",
59
- description="Manages Windows applications with three modes: 'launch' (start app by name), 'resize' (set window position/size using window_loc=[x,y] and window_size=[width,height]), 'switch' (activate app by name). Essential for application lifecycle management.",
59
+ description="Manages Windows applications with three modes: 'launch' (opens the prescibed application), 'resize' (adjusts active window size/position), 'switch' (brings specific window into focus).",
60
60
  annotations=ToolAnnotations(
61
61
  title="App",
62
62
  readOnlyHint=False,
@@ -87,7 +87,7 @@ def powershell_tool(command: str,timeout:int=10, ctx: Context = None) -> str:
87
87
 
88
88
  @mcp.tool(
89
89
  name='Snapshot',
90
- description='Captures complete desktop state including: system language, focused/opened apps, interactive elements (buttons, text fields, links, menus with coordinates), and scrollable areas. Set use_vision=True to include screenshot. Set use_dom=True for browser content to get web page elements instead of browser UI. Always call this first to understand the current desktop state before taking actions.',
90
+ description='Captures complete desktop state including: system language, focused/opened windows, interactive elements (buttons, text fields, links, menus with coordinates), and scrollable areas. Set use_vision=True to include screenshot. Set use_dom=True for browser content to get web page elements instead of browser UI. Always call this first to understand the current desktop state before taking actions.',
91
91
  annotations=ToolAnnotations(
92
92
  title="Snapshot",
93
93
  readOnlyHint=True,
@@ -106,14 +106,22 @@ def state_tool(use_vision:bool=False,use_dom:bool=False, ctx: Context = None):
106
106
  desktop_state=desktop.get_state(use_vision=use_vision,use_dom=use_dom,as_bytes=True,scale=scale)
107
107
  interactive_elements=desktop_state.tree_state.interactive_elements_to_string()
108
108
  scrollable_elements=desktop_state.tree_state.scrollable_elements_to_string()
109
- apps=desktop_state.apps_to_string()
110
- active_app=desktop_state.active_app_to_string()
111
- return [dedent(f'''
112
- Focused App:
113
- {active_app}
109
+ windows=desktop_state.windows_to_string()
110
+ active_window=desktop_state.active_window_to_string()
111
+ active_desktop=desktop_state.active_desktop_to_string()
112
+ all_desktops=desktop_state.desktops_to_string()
113
+ return [dedent(f'''
114
+ Active Desktop:
115
+ {active_desktop}
114
116
 
115
- Opened Apps:
116
- {apps}
117
+ All Desktops:
118
+ {all_desktops}
119
+
120
+ Focused Window:
121
+ {active_window}
122
+
123
+ Opened Windows:
124
+ {windows}
117
125
 
118
126
  List of Interactive Elements:
119
127
  {interactive_elements or 'No interactive elements found.'}
@@ -124,7 +132,7 @@ def state_tool(use_vision:bool=False,use_dom:bool=False, ctx: Context = None):
124
132
 
125
133
  @mcp.tool(
126
134
  name='Click',
127
- description='Performs mouse clicks at specified coordinates [x, y]. Supports button types: left (default), right (context menu), middle. Supports clicks: 1 (single), 2 (double), 3 (triple). Always use coordinates from State-Tool output to ensure accuracy.',
135
+ description="Performs mouse clicks at specified coordinates [x, y]. Supports button types: 'left' for selection/activation, 'right' for context menus, 'middle'. Supports clicks: 0=hover only (no click), 1=single click (select/focus), 2=double click (open/activate).",
128
136
  annotations=ToolAnnotations(
129
137
  title="Click",
130
138
  readOnlyHint=False,
@@ -139,12 +147,12 @@ def click_tool(loc:list[int],button:Literal['left','right','middle']='left',clic
139
147
  raise ValueError("Location must be a list of exactly 2 integers [x, y]")
140
148
  x,y=loc[0],loc[1]
141
149
  desktop.click(loc=loc,button=button,clicks=clicks)
142
- num_clicks={1:'Single',2:'Double',3:'Triple'}
150
+ num_clicks={0:'Hover',1:'Single',2:'Double'}
143
151
  return f'{num_clicks.get(clicks)} {button} clicked at ({x},{y}).'
144
152
 
145
153
  @mcp.tool(
146
154
  name='Type',
147
- description='Types text at specified coordinates [x, y]. Set clear=True to clear existing text first (Ctrl+A then type), clear=False to append. Set press_enter=True to submit after typing. Always click on the target input field first to ensure focus.',
155
+ description="Types text at specified coordinates [x, y]. Set clear=True to clear existing text first, False to append. Set press_enter=True to submit after typing. Set caret_position to 'start' (beginning), 'end' (end), or 'idle' (default).",
148
156
  annotations=ToolAnnotations(
149
157
  title="Type",
150
158
  readOnlyHint=False,
@@ -154,11 +162,11 @@ def click_tool(loc:list[int],button:Literal['left','right','middle']='left',clic
154
162
  )
155
163
  )
156
164
  @with_analytics(analytics, "Type-Tool")
157
- def type_tool(loc:list[int],text:str,clear:bool=False,press_enter:bool=False, ctx: Context = None)->str:
165
+ def type_tool(loc:list[int],text:str,clear:bool|str=False,caret_position:Literal['start', 'idle', 'end']='idle',press_enter:bool|str=False, ctx: Context = None)->str:
158
166
  if len(loc) != 2:
159
167
  raise ValueError("Location must be a list of exactly 2 integers [x, y]")
160
168
  x,y=loc[0],loc[1]
161
- desktop.type(loc=loc,text=text,clear=clear,press_enter=press_enter)
169
+ desktop.type(loc=loc,text=text,caret_position=caret_position,clear=clear,press_enter=press_enter)
162
170
  return f'Typed {text} at ({x},{y}).'
163
171
 
164
172
  @mcp.tool(
@@ -262,7 +270,41 @@ def scrape_tool(url:str,use_dom:bool=False, ctx: Context = None)->str:
262
270
  content='\n'.join([node.text for node in tree_state.dom_informative_nodes])
263
271
  header_status = "Reached top" if vertical_scroll_percent <= 0 else "Scroll up to see more"
264
272
  footer_status = "Reached bottom" if vertical_scroll_percent >= 100 else "Scroll down to see more"
265
- return f'URL:{url}\nContent:\n[{header_status}]\n{content}\n[{footer_status}]'
273
+ return f'URL:{url}\nContent:\n{header_status}\n{content}\n{footer_status}'
274
+
275
+ @mcp.tool(
276
+ name='MultiSelect',
277
+ description="Selects multiple items such as files, folders, or checkboxes if press_ctrl=True, or performs multiple clicks if False.",
278
+ annotations=ToolAnnotations(
279
+ title="MultiSelect",
280
+ readOnlyHint=False,
281
+ destructiveHint=True,
282
+ idempotentHint=False,
283
+ openWorldHint=False
284
+ )
285
+ )
286
+ @with_analytics(analytics, "Multi-Select-Tool")
287
+ def multi_select_tool(locs:list[list[int]], press_ctrl:bool=True, ctx: Context = None)->str:
288
+ desktop.multi_select(press_ctrl,locs)
289
+ elements_str = '\n'.join([f"({loc[0]},{loc[1]})" for loc in locs])
290
+ return f"Multi-selected elements at:\n{elements_str}"
291
+
292
+ @mcp.tool(
293
+ name='MultiEdit',
294
+ description="Enters text into multiple input fields at specified coordinates [[x,y,text], ...].",
295
+ annotations=ToolAnnotations(
296
+ title="MultiEdit",
297
+ readOnlyHint=False,
298
+ destructiveHint=True,
299
+ idempotentHint=False,
300
+ openWorldHint=False
301
+ )
302
+ )
303
+ @with_analytics(analytics, "Multi-Edit-Tool")
304
+ def multi_edit_tool(locs:list[list], ctx: Context = None)->str:
305
+ desktop.multi_edit(locs)
306
+ elements_str = ', '.join([f"({e[0]},{e[1]}) with text '{e[2]}'" for e in locs])
307
+ return f"Multi-edited elements at: {elements_str}"
266
308
 
267
309
 
268
310
  @click.command()
@@ -1,5 +1,6 @@
1
- from windows_mcp.desktop.config import BROWSER_NAMES, PROCESS_PER_MONITOR_DPI_AWARE
2
- from windows_mcp.desktop.views import DesktopState, App, Status, Size
1
+ from windows_mcp.vdm.core import get_all_desktops, get_current_desktop, is_window_on_current_desktop
2
+ from windows_mcp.desktop.views import DesktopState, Window, Browser, Status, Size
3
+ from windows_mcp.desktop.config import PROCESS_PER_MONITOR_DPI_AWARE
3
4
  from windows_mcp.tree.views import BoundingBox, TreeElementNode
4
5
  from concurrent.futures import ThreadPoolExecutor, as_completed
5
6
  from PIL import ImageGrab, ImageFont, ImageDraw, Image
@@ -46,24 +47,30 @@ class Desktop:
46
47
  self.desktop_state=None
47
48
 
48
49
  def get_state(self,use_annotation:bool=True,use_vision:bool=False,use_dom:bool=False,as_bytes:bool=False,scale:float=1.0)->DesktopState:
49
- sleep(0.1)
50
50
  start_time = time()
51
51
 
52
52
  controls_handles=self.get_controls_handles() # Taskbar,Program Manager,Apps, Dialogs
53
- apps,apps_handles=self.get_apps(controls_handles=controls_handles) # Apps
54
- active_app=self.get_active_app(apps=apps) #Active App
55
- active_app_handle=active_app.handle if active_app else None
53
+ windows,windows_handles=self.get_windows(controls_handles=controls_handles) # Apps
54
+ active_window=self.get_active_window(windows=windows) #Active Window
55
+ active_window_handle=active_window.handle if active_window else None
56
56
 
57
- if active_app is not None and active_app in apps:
58
- apps.remove(active_app)
57
+ try:
58
+ active_desktop=get_current_desktop()
59
+ all_desktops=get_all_desktops()
60
+ except RuntimeError:
61
+ active_desktop = {'id': '00000000-0000-0000-0000-000000000000', 'name': 'Default Desktop'}
62
+ all_desktops = [active_desktop]
63
+
64
+ if active_window is not None and active_window in windows:
65
+ windows.remove(active_window)
59
66
 
60
- logger.debug(f"Active app: {active_app or 'No Active App Found'}")
61
- logger.debug(f"Apps: {apps}")
67
+ logger.debug(f"Active window: {active_window or 'No Active Window Found'}")
68
+ logger.debug(f"Windows: {windows}")
62
69
 
63
70
  #Preparing handles for Tree
64
- other_apps_handles=list(controls_handles-apps_handles)
71
+ other_windows_handles=list(controls_handles-windows_handles)
65
72
 
66
- tree_state=self.tree.get_state(active_app_handle,other_apps_handles,use_dom=use_dom)
73
+ tree_state=self.tree.get_state(active_window_handle,other_windows_handles,use_dom=use_dom)
67
74
 
68
75
  if use_vision:
69
76
  if use_annotation:
@@ -83,13 +90,20 @@ class Desktop:
83
90
  else:
84
91
  screenshot=None
85
92
 
86
- self.desktop_state=DesktopState(apps= apps,active_app=active_app,screenshot=screenshot,tree_state=tree_state)
93
+ self.desktop_state=DesktopState(
94
+ active_window=active_window,
95
+ windows=windows,
96
+ active_desktop=active_desktop,
97
+ all_desktops=all_desktops,
98
+ screenshot=screenshot,
99
+ tree_state=tree_state
100
+ )
87
101
  # Log the time taken to capture the state
88
102
  end_time = time()
89
103
  logger.info(f"Desktop State capture took {end_time - start_time:.2f} seconds")
90
104
  return self.desktop_state
91
105
 
92
- def get_app_status(self,control:uia.Control)->Status:
106
+ def get_window_status(self,control:uia.Control)->Status:
93
107
  if uia.IsIconic(control.NativeWindowHandle):
94
108
  return Status.MINIMIZED
95
109
  elif uia.IsZoomed(control.NativeWindowHandle):
@@ -117,8 +131,8 @@ class Desktop:
117
131
  try:
118
132
  reader = csv.DictReader(io.StringIO(apps_info.strip()))
119
133
  return {
120
- row.get('Name').lower(): row.get('AppID')
121
- for row in reader
134
+ row.get('Name', '').lower(): row.get('AppID', '')
135
+ for row in reader
122
136
  if row.get('Name') and row.get('AppID')
123
137
  }
124
138
  except Exception as e:
@@ -129,10 +143,11 @@ class Desktop:
129
143
  try:
130
144
  encoded = base64.b64encode(command.encode("utf-16le")).decode("ascii")
131
145
  result = subprocess.run(
132
- ['powershell', '-NoProfile', '-EncodedCommand', encoded],
146
+ ['powershell', '-NoProfile', '-OutputFormat', 'Text', '-EncodedCommand', encoded],
133
147
  capture_output=True, # No errors='ignore' - let subprocess return bytes
134
148
  timeout=timeout,
135
- cwd=os.path.expanduser(path='~')
149
+ cwd=os.path.expanduser(path='~'),
150
+ env=os.environ.copy() # Inherit environment variables including PATH
136
151
  )
137
152
  # Handle both bytes and str output (subprocess behavior varies by environment)
138
153
  stdout = result.stdout
@@ -147,11 +162,11 @@ class Desktop:
147
162
  except Exception as e:
148
163
  return (f'Command execution failed: {type(e).__name__}: {e}', 1)
149
164
 
150
- def is_app_browser(self,node:uia.Control):
165
+ def is_window_browser(self,node:uia.Control):
151
166
  '''Give any node of the app and it will return True if the app is a browser, False otherwise.'''
152
167
  try:
153
168
  process=Process(node.ProcessId)
154
- return process.name() in BROWSER_NAMES
169
+ return Browser.has_process(process.name())
155
170
  except:
156
171
  return False
157
172
 
@@ -162,32 +177,32 @@ class Desktop:
162
177
  return "".join([row.get('DisplayName') for row in reader])
163
178
 
164
179
  def resize_app(self,size:tuple[int,int]=None,loc:tuple[int,int]=None)->tuple[str,int]:
165
- active_app=self.desktop_state.active_app
166
- if active_app is None:
167
- return "No active app found",1
168
- if active_app.status==Status.MINIMIZED:
169
- return f"{active_app.name} is minimized",1
170
- elif active_app.status==Status.MAXIMIZED:
171
- return f"{active_app.name} is maximized",1
180
+ active_window=self.desktop_state.active_window
181
+ if active_window is None:
182
+ return "No active window found",1
183
+ if active_window.status==Status.MINIMIZED:
184
+ return f"{active_window.name} is minimized",1
185
+ elif active_window.status==Status.MAXIMIZED:
186
+ return f"{active_window.name} is maximized",1
172
187
  else:
173
- app_control=uia.ControlFromHandle(active_app.handle)
188
+ window_control=uia.ControlFromHandle(active_window.handle)
174
189
  if loc is None:
175
- x=app_control.BoundingRectangle.left
176
- y=app_control.BoundingRectangle.top
190
+ x=window_control.BoundingRectangle.left
191
+ y=window_control.BoundingRectangle.top
177
192
  loc=(x,y)
178
193
  if size is None:
179
- width=app_control.BoundingRectangle.width()
180
- height=app_control.BoundingRectangle.height()
194
+ width=window_control.BoundingRectangle.width()
195
+ height=window_control.BoundingRectangle.height()
181
196
  size=(width,height)
182
197
  x,y=loc
183
198
  width,height=size
184
- app_control.MoveWindow(x,y,width,height)
185
- return (f'{active_app.name} resized to {width}x{height} at {x},{y}.',0)
199
+ window_control.MoveWindow(x,y,width,height)
200
+ return (f'{active_window.name} resized to {width}x{height} at {x},{y}.',0)
186
201
 
187
202
  def is_app_running(self,name:str)->bool:
188
- apps, _ = self.get_apps()
189
- apps_dict = {app.name: app for app in apps}
190
- return process.extractOne(name,list(apps_dict.keys()),score_cutoff=60) is not None
203
+ windows, _ = self.get_windows()
204
+ windows_dict = {window.name: window for window in windows}
205
+ return process.extractOne(name,list(windows_dict.keys()),score_cutoff=60) is not None
191
206
 
192
207
  def app(self,mode:Literal['launch','switch','resize'],name:Optional[str]=None,loc:Optional[tuple[int,int]]=None,size:Optional[tuple[int,int]]=None):
193
208
  match mode:
@@ -207,7 +222,7 @@ class Desktop:
207
222
  safe_name = re.escape(name)
208
223
  if uia.WindowControl(RegexName=f'(?i).*{safe_name}.*').Exists(maxSearchSeconds=10):
209
224
  launched = True
210
-
225
+
211
226
  if launched:
212
227
  return f'{name.title()} launched.'
213
228
  return f'Launching {name.title()} sent, but window not detected yet.'
@@ -232,37 +247,41 @@ class Desktop:
232
247
  app_name,_=matched_app
233
248
  appid=apps_map.get(app_name)
234
249
  if appid is None:
235
- return (name,f'{name.title()} not found in start menu.',1,0)
250
+ return (f'{name.title()} not found in start menu.',1,0)
236
251
 
237
252
  pid = 0
238
253
  if os.path.exists(appid) or "\\" in appid:
239
254
  # It's a file path, we can try to get the PID using PassThru
240
- command = f'Start-Process "{appid}" -PassThru | Select-Object -ExpandProperty Id'
255
+ # Escape any single quotes and wrap in single quotes for PowerShell safety
256
+ safe_appid = appid.replace("'", "''")
257
+ command = f"Start-Process '{safe_appid}' -PassThru | Select-Object -ExpandProperty Id"
241
258
  response, status = self.execute_command(command)
242
259
  if status == 0 and response.strip().isdigit():
243
260
  pid = int(response.strip())
244
261
  else:
245
- # It's an AUMID (Store App)
262
+ # It's an AUMID (Store App) - validate it only contains expected characters
263
+ if not appid.replace('\\', '').replace('_', '').replace('.', '').replace('-', '').isalnum():
264
+ return (f'Invalid app identifier: {appid}', 1, 0)
246
265
  command = f'Start-Process "shell:AppsFolder\\{appid}"'
247
266
  response, status = self.execute_command(command)
248
267
 
249
268
  return response, status, pid
250
269
 
251
270
  def switch_app(self,name:str):
252
- apps={app.name:app for app in [self.desktop_state.active_app]+self.desktop_state.apps if app is not None}
253
- matched_app:Optional[tuple[str,float]]=process.extractOne(name,list(apps.keys()),score_cutoff=70)
254
- if matched_app is None:
271
+ windows={window.name:window for window in [self.desktop_state.active_window]+self.desktop_state.windows if window is not None}
272
+ matched_window:Optional[tuple[str,float]]=process.extractOne(name,list(windows.keys()),score_cutoff=70)
273
+ if matched_window is None:
255
274
  return (f'Application {name.title()} not found.',1)
256
- app_name,_=matched_app
257
- app=apps.get(app_name)
258
- target_handle=app.handle
275
+ window_name,_=matched_window
276
+ window=windows.get(window_name)
277
+ target_handle=window.handle
259
278
 
260
279
  if uia.IsIconic(target_handle):
261
280
  uia.ShowWindow(target_handle, win32con.SW_RESTORE)
262
- content=f'{app_name.title()} restored from Minimized state.'
281
+ content=f'{window_name.title()} restored from Minimized state.'
263
282
  else:
264
283
  self.bring_window_to_top(target_handle)
265
- content=f'Switched to {app_name.title()} window.'
284
+ content=f'Switched to {window_name.title()} window.'
266
285
  return content,0
267
286
 
268
287
  def bring_window_to_top(self, target_handle: int):
@@ -274,6 +293,14 @@ class Desktop:
274
293
  win32gui.ShowWindow(target_handle, win32con.SW_RESTORE)
275
294
 
276
295
  foreground_handle = win32gui.GetForegroundWindow()
296
+
297
+ # Validate both handles before proceeding
298
+ if not win32gui.IsWindow(foreground_handle):
299
+ # No valid foreground window, just try to set target as foreground
300
+ win32gui.SetForegroundWindow(target_handle)
301
+ win32gui.BringWindowToTop(target_handle)
302
+ return
303
+
277
304
  foreground_thread, _ = win32process.GetWindowThreadProcessId(foreground_handle)
278
305
  target_thread, _ = win32process.GetWindowThreadProcessId(target_handle)
279
306
 
@@ -322,7 +349,7 @@ class Desktop:
322
349
  x,y=loc
323
350
  pg.click(x,y,button=button,clicks=clicks,duration=0.1)
324
351
 
325
- def type(self,loc:tuple[int,int],text:str,caret_position:Literal['start','end','none']='none',clear:Literal['true','false']='false',press_enter:Literal['true','false']='false'):
352
+ def type(self,loc:tuple[int,int],text:str,caret_position:Literal['start', 'idle', 'end']='idle',clear:bool|str=False,press_enter:bool|str=False):
326
353
  x,y=loc
327
354
  pg.leftClick(x,y)
328
355
  if caret_position == 'start':
@@ -331,12 +358,16 @@ class Desktop:
331
358
  pg.press('end')
332
359
  else:
333
360
  pass
334
- if clear=='true':
361
+
362
+ # Handle both boolean and string 'true'/'false'
363
+ if clear is True or (isinstance(clear, str) and clear.lower() == 'true'):
335
364
  pg.sleep(0.5)
336
365
  pg.hotkey('ctrl','a')
337
366
  pg.press('backspace')
367
+
338
368
  pg.typewrite(text,interval=0.02)
339
- if press_enter=='true':
369
+
370
+ if press_enter is True or (isinstance(press_enter, str) and press_enter.lower() == 'true'):
340
371
  pg.press('enter')
341
372
 
342
373
  def scroll(self,loc:tuple[int,int]=None,type:Literal['horizontal','vertical']='vertical',direction:Literal['up','down','left','right']='down',wheel_times:int=1)->str|None:
@@ -387,19 +418,19 @@ class Desktop:
387
418
  else:
388
419
  pg.press(''.join(shortcut))
389
420
 
390
- def multi_select(self,press_ctrl:Literal['true','false']='false',elements:list[tuple[int,int]|int]=[]):
391
- if press_ctrl=='true':
421
+ def multi_select(self,press_ctrl:bool=False,locs:list[tuple[int,int]]=[]):
422
+ if press_ctrl:
392
423
  pg.keyDown('ctrl')
393
- for element in elements:
394
- x,y=element
424
+ for loc in locs:
425
+ x,y=loc
395
426
  pg.click(x,y,duration=0.2)
396
427
  pg.sleep(0.5)
397
428
  pg.keyUp('ctrl')
398
429
 
399
- def multi_edit(self,elements:list[tuple[int,int,str]|tuple[int,str]]):
400
- for element in elements:
401
- x,y,text=element
402
- self.type((x,y),text=text,clear='true')
430
+ def multi_edit(self,locs:list[tuple[int,int,str]]):
431
+ for loc in locs:
432
+ x,y,text=loc
433
+ self.type((x,y),text=text,clear=True)
403
434
 
404
435
  def scrape(self,url:str)->str:
405
436
  response=requests.get(url,timeout=10)
@@ -407,56 +438,105 @@ class Desktop:
407
438
  content=markdownify(html=html)
408
439
  return content
409
440
 
410
- def get_app_from_element(self,element:uia.Control)->App|None:
441
+ def get_window_from_element(self,element:uia.Control)->Window|None:
411
442
  if element is None:
412
443
  return None
413
444
  top_window=element.GetTopLevelControl()
414
445
  if top_window is None:
415
446
  return None
416
447
  handle=top_window.NativeWindowHandle
417
- apps,_=self.get_apps()
418
- for app in apps:
419
- if app.handle==handle:
420
- return app
448
+ windows,_=self.get_windows()
449
+ for window in windows:
450
+ if window.handle==handle:
451
+ return window
421
452
  return None
422
453
 
423
- def is_app_visible(self,app:uia.Control)->bool:
424
- is_minimized=self.get_app_status(app)!=Status.MINIMIZED
425
- size=app.BoundingRectangle
454
+ def is_window_visible(self,window:uia.Control)->bool:
455
+ is_minimized=self.get_window_status(window)!=Status.MINIMIZED
456
+ size=window.BoundingRectangle
426
457
  area=size.width()*size.height()
427
- is_overlay=self.is_overlay_app(app)
458
+ is_overlay=self.is_overlay_window(window)
428
459
  return not is_overlay and is_minimized and area>10
429
460
 
430
- def is_overlay_app(self,element:uia.Control) -> bool:
461
+ def is_overlay_window(self,element:uia.Control) -> bool:
431
462
  no_children = len(element.GetChildren()) == 0
432
463
  is_name = "Overlay" in element.Name.strip()
433
464
  return no_children or is_name
434
465
 
435
- def get_controls_handles(self):
466
+ def get_controls_handles(self,optimized:bool=False):
436
467
  handles = set()
437
- root=uia.GetRootControl()
438
- children=root.GetChildren()
439
- for child in children:
440
- handles.add(child.NativeWindowHandle)
468
+ # For even more faster results (still under development)
469
+ def callback(hwnd, _):
470
+ try:
471
+ # Validate handle before checking properties
472
+ if win32gui.IsWindow(hwnd) and win32gui.IsWindowVisible(hwnd) and is_window_on_current_desktop(hwnd):
473
+ handles.add(hwnd)
474
+ except Exception:
475
+ # Skip invalid handles without logging (common during window enumeration)
476
+ pass
477
+
478
+ win32gui.EnumWindows(callback, None)
479
+
480
+ if desktop_hwnd:= win32gui.FindWindow('Progman',None):
481
+ handles.add(desktop_hwnd)
482
+ if taskbar_hwnd:= win32gui.FindWindow('Shell_TrayWnd',None):
483
+ handles.add(taskbar_hwnd)
484
+ if secondary_taskbar_hwnd:= win32gui.FindWindow('Shell_SecondaryTrayWnd',None):
485
+ handles.add(secondary_taskbar_hwnd)
441
486
  return handles
442
487
 
443
- def get_active_app(self,apps:list[App]|None=None)->App|None:
488
+ def get_active_window(self,windows:list[Window]|None=None)->Window|None:
444
489
  try:
445
- if apps is None:
446
- apps,_=self.get_apps()
447
- handle=uia.GetForegroundWindow()
448
- for app in apps:
449
- if app.handle!=handle:
490
+ if windows is None:
491
+ windows,_=self.get_windows()
492
+ active_window=self.get_foreground_window()
493
+ if active_window.ClassName=="Progman":
494
+ return None
495
+ active_window_handle=active_window.NativeWindowHandle
496
+ for window in windows:
497
+ if window.handle!=active_window_handle:
450
498
  continue
451
- return app
499
+ return window
500
+ # In case active window is not present in the windows list
501
+ return Window(**{
502
+ "name":active_window.Name,
503
+ "is_browser":self.is_window_browser(active_window),
504
+ "depth":0,
505
+ "bounding_box":BoundingBox(
506
+ left=active_window.BoundingRectangle.left,
507
+ top=active_window.BoundingRectangle.top,
508
+ right=active_window.BoundingRectangle.right,
509
+ bottom=active_window.BoundingRectangle.bottom,
510
+ width=active_window.BoundingRectangle.width(),
511
+ height=active_window.BoundingRectangle.height()
512
+ ),
513
+ "status":self.get_window_status(active_window),
514
+ "handle":active_window_handle,
515
+ "process_id":active_window.ProcessId,
516
+ })
452
517
  except Exception as ex:
453
- logger.error(f"Error in get_active_app: {ex}")
518
+ logger.error(f"Error in get_active_window: {ex}")
454
519
  return None
520
+
521
+ def get_foreground_window(self)->uia.Control:
522
+ handle=uia.GetForegroundWindow()
523
+ active_window=self.get_window_from_element_handle(handle)
524
+ return active_window
525
+
526
+ def get_window_from_element_handle(self, element_handle: int) -> uia.Control:
527
+ current = uia.ControlFromHandle(element_handle)
528
+ root_handle = uia.GetRootControl().NativeWindowHandle
455
529
 
456
- def get_apps(self,controls_handles:set[int]|None=None) -> tuple[list[App],set[int]]:
530
+ while True:
531
+ parent = current.GetParentControl()
532
+ if parent is None or parent.NativeWindowHandle == root_handle:
533
+ return current
534
+ current = parent
535
+
536
+ def get_windows(self,controls_handles:set[int]|None=None) -> tuple[list[Window],set[int]]:
457
537
  try:
458
- apps = []
459
- handles = set()
538
+ windows = []
539
+ window_handles = set()
460
540
  controls_handles=controls_handles or self.get_controls_handles()
461
541
  for depth, hwnd in enumerate(controls_handles):
462
542
  try:
@@ -465,7 +545,7 @@ class Desktop:
465
545
  continue
466
546
 
467
547
  # Filter out Overlays (e.g. NVIDIA, Steam)
468
- if self.is_overlay_app(child):
548
+ if self.is_overlay_window(child):
469
549
  continue
470
550
 
471
551
  if isinstance(child,(uia.WindowControl,uia.PaneControl)):
@@ -474,15 +554,14 @@ class Desktop:
474
554
  continue
475
555
 
476
556
  if window_pattern.CanMinimize and window_pattern.CanMaximize:
477
- status = self.get_app_status(child)
557
+ status = self.get_window_status(child)
478
558
 
479
559
  bounding_rect=child.BoundingRectangle
480
560
  if bounding_rect.isempty() and status!=Status.MINIMIZED:
481
561
  continue
482
562
 
483
- apps.append(App(**{
563
+ windows.append(Window(**{
484
564
  "name":child.Name,
485
- "runtime_id":tuple(child.GetRuntimeId()),
486
565
  "depth":depth,
487
566
  "status":status,
488
567
  "bounding_box":BoundingBox(
@@ -495,13 +574,13 @@ class Desktop:
495
574
  ),
496
575
  "handle":child.NativeWindowHandle,
497
576
  "process_id":child.ProcessId,
498
- "is_browser":self.is_app_browser(child)
577
+ "is_browser":self.is_window_browser(child)
499
578
  }))
500
- handles.add(child.NativeWindowHandle)
579
+ window_handles.add(child.NativeWindowHandle)
501
580
  except Exception as ex:
502
- logger.error(f"Error in get_apps: {ex}")
503
- apps = []
504
- return apps,handles
581
+ logger.error(f"Error in get_windows: {ex}")
582
+ windows = []
583
+ return windows,window_handles
505
584
 
506
585
  def get_xpath_from_element(self,element:uia.Control):
507
586
  current=element
@@ -556,9 +635,13 @@ class Desktop:
556
635
  return "Local Account" if response.strip()=='Local' else "Microsoft Account" if status==0 else "Local Account"
557
636
 
558
637
  def get_dpi_scaling(self):
559
- user32 = ctypes.windll.user32
560
- dpi = user32.GetDpiForSystem()
561
- return dpi / 96.0
638
+ try:
639
+ user32 = ctypes.windll.user32
640
+ dpi = user32.GetDpiForSystem()
641
+ return dpi / 96.0 if dpi > 0 else 1.0
642
+ except Exception:
643
+ # Fallback to standard DPI if system call fails
644
+ return 1.0
562
645
 
563
646
  def get_screen_size(self)->Size:
564
647
  width, height = uia.GetVirtualScreenSize()
@@ -568,12 +651,11 @@ class Desktop:
568
651
  try:
569
652
  return ImageGrab.grab(all_screens=True)
570
653
  except Exception as e:
571
- logger.warning(f"Failed to capture all screens: {e}. Fallback to primary.")
654
+ logger.warning(f"Failed to capture virtual screen, using primary screen")
572
655
  return pg.screenshot()
573
656
 
574
657
  def get_annotated_screenshot(self, nodes: list[TreeElementNode]) -> Image.Image:
575
658
  screenshot = self.get_screenshot()
576
- sleep(0.10)
577
659
  # Add padding
578
660
  padding = 5
579
661
  width = int(screenshot.width + (1.5 * padding))