windows-mcp 0.5.2__py3-none-any.whl → 0.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,7 @@
1
1
  from live_inspect.watch_cursor import WatchCursor
2
2
  from contextlib import asynccontextmanager
3
3
  from fastmcp.utilities.types import Image
4
- from src.desktop.service import Desktop
4
+ from windows_mcp.desktop.service import Desktop
5
5
  from mcp.types import ToolAnnotations
6
6
  from humancursor import SystemCursor
7
7
  from textwrap import dedent
@@ -19,6 +19,7 @@ cursor=SystemCursor()
19
19
  watch_cursor=WatchCursor()
20
20
  windows_version=desktop.get_windows_version()
21
21
  default_language=desktop.get_default_language()
22
+ screen_width,screen_height=desktop.get_resolution()
22
23
 
23
24
  instructions=dedent(f'''
24
25
  Windows MCP server provides tools to interact directly with the {windows_version} desktop,
@@ -39,11 +40,11 @@ mcp=FastMCP(name='windows-mcp',instructions=instructions,lifespan=lifespan)
39
40
 
40
41
  @mcp.tool(
41
42
  name="App-Tool",
42
- description="Manages Windows applications through launch, resize, and window switching operations.",
43
+ description="Manages Windows applications with three modes: 'launch' (start app by name), 'resize' (set window position/size using window_loc=[x,y] and window_size=[width,height]), 'switch' (activate app by name). Essential for application lifecycle management.",
43
44
  annotations=ToolAnnotations(
44
45
  title="App Tool",
45
46
  readOnlyHint=False,
46
- destructiveHint=False,
47
+ destructiveHint=True,
47
48
  idempotentHint=False,
48
49
  openWorldHint=False
49
50
  )
@@ -53,7 +54,7 @@ def app_tool(mode:Literal['launch','resize','switch'],name:str|None=None,window_
53
54
 
54
55
  @mcp.tool(
55
56
  name='Powershell-Tool',
56
- description='Execute PowerShell commands and return the output with status code',
57
+ description='Execute PowerShell commands directly on the Windows system and return output with status code. Supports all PowerShell cmdlets, scripts, and system commands. Use for file operations, system queries, and administrative tasks.',
57
58
  annotations=ToolAnnotations(
58
59
  title="Powershell Tool",
59
60
  readOnlyHint=False,
@@ -68,15 +69,23 @@ def powershell_tool(command: str) -> str:
68
69
 
69
70
  @mcp.tool(
70
71
  name='State-Tool',
71
- description='Capture comprehensive desktop state including default language used by user interface, focused/opened applications, interactive UI elements (buttons, text fields, menus), informative content (text, labels, status), and scrollable areas. Optionally includes visual screenshot when use_vision=True. Essential for understanding current desktop context and available UI interactions.',
72
+ description='Captures complete desktop state including: system language, focused/opened apps, interactive elements (buttons, text fields, links, menus with coordinates), and scrollable areas. Set use_vision=True to include screenshot. Set use_dom=True for browser content to get web page elements instead of browser UI. Always call this first to understand the current desktop state before taking actions.',
72
73
  annotations=ToolAnnotations(
73
74
  title="State Tool",
74
75
  readOnlyHint=True,
76
+ destructiveHint=False,
77
+ idempotentHint=True,
75
78
  openWorldHint=False
76
79
  )
77
80
  )
78
- def state_tool(use_vision:bool=False):
79
- desktop_state=desktop.get_state(use_vision=use_vision,as_bytes=True)
81
+ def state_tool(use_vision:bool=False,use_dom:bool=False):
82
+ # Calculate scale factor to cap resolution at 1080p (1920x1080)
83
+ max_width, max_height = 1920, 1080
84
+ scale_width = max_width / screen_width if screen_width > max_width else 1.0
85
+ scale_height = max_height / screen_height if screen_height > max_height else 1.0
86
+ scale = min(scale_width, scale_height) # Use the smaller scale to ensure both dimensions fit
87
+
88
+ desktop_state=desktop.get_state(use_vision=use_vision,use_dom=use_dom,as_bytes=True,scale=scale)
80
89
  interactive_elements=desktop_state.tree_state.interactive_elements_to_string()
81
90
  scrollable_elements=desktop_state.tree_state.scrollable_elements_to_string()
82
91
  apps=desktop_state.apps_to_string()
@@ -100,7 +109,7 @@ def state_tool(use_vision:bool=False):
100
109
 
101
110
  @mcp.tool(
102
111
  name='Click-Tool',
103
- description='Click on UI elements at specific coordinates. Supports left/right/middle mouse buttons and single/double/triple clicks. Use coordinates from State-Tool output.',
112
+ description='Performs mouse clicks at specified coordinates [x, y]. Supports button types: left (default), right (context menu), middle. Supports clicks: 1 (single), 2 (double), 3 (triple). Always use coordinates from State-Tool output to ensure accuracy.',
104
113
  annotations=ToolAnnotations(
105
114
  title="Click Tool",
106
115
  readOnlyHint=False,
@@ -119,7 +128,7 @@ def click_tool(loc:list[int],button:Literal['left','right','middle']='left',clic
119
128
 
120
129
  @mcp.tool(
121
130
  name='Type-Tool',
122
- description='Type text into input fields, text areas, or focused elements. Set clear=True to replace existing text, False to append. Click on target element coordinates first.',
131
+ description='Types text at specified coordinates [x, y]. Set clear=True to clear existing text first (Ctrl+A then type), clear=False to append. Set press_enter=True to submit after typing. Always click on the target input field first to ensure focus.',
123
132
  annotations=ToolAnnotations(
124
133
  title="Type Tool",
125
134
  readOnlyHint=False,
@@ -137,7 +146,7 @@ def type_tool(loc:list[int],text:str,clear:bool=False,press_enter:bool=False)->s
137
146
 
138
147
  @mcp.tool(
139
148
  name='Scroll-Tool',
140
- description='Scroll at specific coordinates or current mouse position. Use wheel_times to control scroll amount (1 wheel = ~3-5 lines). Essential for navigating lists, web pages, and long content.',
149
+ description='Scrolls at coordinates [x, y] or current mouse position if loc=None. Type: vertical (default) or horizontal. Direction: up/down for vertical, left/right for horizontal. wheel_times controls amount (1 wheel 3-5 lines). Use for navigating long content, lists, and web pages.',
141
150
  annotations=ToolAnnotations(
142
151
  title="Scroll Tool",
143
152
  readOnlyHint=False,
@@ -156,7 +165,7 @@ def scroll_tool(loc:list[int]=None,type:Literal['horizontal','vertical']='vertic
156
165
 
157
166
  @mcp.tool(
158
167
  name='Drag-Tool',
159
- description='Drag and drop operation from current coordinates to destination coordinates. Useful for moving files, resizing windows, or drag-and-drop interactions.',
168
+ description='Performs drag-and-drop from current mouse position to destination coordinates [x, y]. Click or move to source position first, then call this tool with target coordinates. Use for moving files, reordering items, resizing windows, or any drag-drop UI interactions.',
160
169
  annotations=ToolAnnotations(
161
170
  title="Drag Tool",
162
171
  readOnlyHint=False,
@@ -174,7 +183,7 @@ def drag_tool(to_loc:list[int])->str:
174
183
 
175
184
  @mcp.tool(
176
185
  name='Move-Tool',
177
- description='Move mouse cursor to specific coordinates without clicking. Useful for hovering over elements or positioning cursor before other actions.',
186
+ description='Moves mouse cursor to coordinates [x, y] without clicking. Use for hovering to reveal tooltips/menus, positioning cursor before drag operations, or triggering hover-based UI changes. Does not interact with elements.',
178
187
  annotations=ToolAnnotations(
179
188
  title="Move Tool",
180
189
  readOnlyHint=False,
@@ -192,7 +201,7 @@ def move_tool(to_loc:list[int])->str:
192
201
 
193
202
  @mcp.tool(
194
203
  name='Shortcut-Tool',
195
- description='Execute keyboard shortcuts using key combinations. Pass keys as list (e.g., ctrl+c for copy, alt+tab for app switching, win+r for Run dialog, win is for opening the start menu).',
204
+ description='Executes keyboard shortcuts using key combinations separated by +. Examples: "ctrl+c" (copy), "ctrl+v" (paste), "alt+tab" (switch apps), "win+r" (Run dialog), "win" (Start menu), "ctrl+shift+esc" (Task Manager). Use for quick actions and system commands.',
196
205
  annotations=ToolAnnotations(
197
206
  title="Shortcut Tool",
198
207
  readOnlyHint=False,
@@ -207,10 +216,12 @@ def shortcut_tool(shortcut:str):
207
216
 
208
217
  @mcp.tool(
209
218
  name='Wait-Tool',
210
- description='Pause execution for specified duration in seconds. Useful for waiting for applications to load, animations to complete, or adding delays between actions.',
219
+ description='Pauses execution for specified duration in seconds. Use when waiting for: applications to launch/load, UI animations to complete, page content to render, dialogs to appear, or between rapid actions. Helps ensure UI is ready before next interaction.',
211
220
  annotations=ToolAnnotations(
212
221
  title="Wait Tool",
213
222
  readOnlyHint=True,
223
+ destructiveHint=False,
224
+ idempotentHint=True,
214
225
  openWorldHint=False
215
226
  )
216
227
  )
@@ -220,16 +231,26 @@ def wait_tool(duration:int)->str:
220
231
 
221
232
  @mcp.tool(
222
233
  name='Scrape-Tool',
223
- description='Fetch and convert webpage content to markdown format. Provide full URL including protocol (http/https). Returns structured text content suitable for analysis.',
234
+ description='Extracts visible text content from the currently focused browser tab. Returns content in plain text format with scroll status indicators (top/bottom reached or more content available). Only works when a browser with DOM is active. Use State-Tool with use_dom=True first to ensure browser is ready.',
224
235
  annotations=ToolAnnotations(
225
236
  title="Scrape Tool",
226
237
  readOnlyHint=True,
238
+ destructiveHint=False,
239
+ idempotentHint=True,
227
240
  openWorldHint=True
228
241
  )
229
242
  )
230
243
  def scrape_tool(url:str)->str:
231
- content=desktop.scrape(url)
232
- return f'Scraped the contents of the entire webpage:\n{content}'
244
+ desktop_state=desktop.desktop_state
245
+ tree_state=desktop_state.tree_state
246
+ if not tree_state.dom_node:
247
+ return f'Unable to scrape URL: {url}. No DOM node found.'
248
+ dom_node=tree_state.dom
249
+ vertical_scroll_percent=dom_node.vertical_scroll_percent
250
+ content='\n'.join([node.text for node in tree_state.dom_informative_nodes])
251
+ header_status = "Reached top" if vertical_scroll_percent <= 0 else "Scroll up to see more"
252
+ footer_status = "Reached bottom" if vertical_scroll_percent >= 100 else "Scroll down to see more"
253
+ return f'URL:{url}\nContent:\n{header_status}\n{content}\n{footer_status}'
233
254
 
234
255
 
235
256
  @click.command()
@@ -1,10 +1,10 @@
1
- from src.desktop.config import BROWSER_NAMES, PROCESS_PER_MONITOR_DPI_AWARE
2
- from src.desktop.views import DesktopState, App, Size, Status
1
+ from windows_mcp.desktop.config import BROWSER_NAMES, PROCESS_PER_MONITOR_DPI_AWARE
2
+ from windows_mcp.desktop.views import DesktopState, App, Size, Status
3
3
  from locale import getpreferredencoding
4
4
  from contextlib import contextmanager
5
5
  from typing import Optional,Literal
6
6
  from markdownify import markdownify
7
- from src.tree.service import Tree
7
+ from windows_mcp.tree.service import Tree
8
8
  from fuzzywuzzy import process
9
9
  from psutil import Process
10
10
  from time import sleep
@@ -46,7 +46,10 @@ class Desktop:
46
46
  self.tree=Tree(self)
47
47
  self.desktop_state=None
48
48
 
49
- def get_state(self,use_vision:bool=False,as_bytes:bool=False)->DesktopState:
49
+ def get_resolution(self)->tuple[int,int]:
50
+ return pg.size()
51
+
52
+ def get_state(self,use_vision:bool=False,use_dom:bool=False,as_bytes:bool=False,scale:float=1.0)->DesktopState:
50
53
  sleep(0.1)
51
54
  apps=self.get_apps()
52
55
  active_app=self.get_active_app()
@@ -54,9 +57,9 @@ class Desktop:
54
57
  apps.remove(active_app)
55
58
  logger.debug(f"Active app: {active_app}")
56
59
  logger.debug(f"Apps: {apps}")
57
- tree_state=self.tree.get_state(active_app,apps)
60
+ tree_state=self.tree.get_state(active_app,apps,use_dom=use_dom)
58
61
  if use_vision:
59
- screenshot=self.tree.annotated_screenshot(tree_state.interactive_nodes)
62
+ screenshot=self.tree.get_annotated_screenshot(tree_state.interactive_nodes,scale=scale)
60
63
  if as_bytes:
61
64
  bytes_io=io.BytesIO()
62
65
  screenshot.save(bytes_io,format='PNG')
@@ -1,4 +1,4 @@
1
- from src.tree.views import TreeState
1
+ from windows_mcp.tree.views import TreeState
2
2
  from dataclasses import dataclass
3
3
  from tabulate import tabulate
4
4
  from typing import Optional
@@ -1,11 +1,11 @@
1
- from src.tree.config import INTERACTIVE_CONTROL_TYPE_NAMES,DOCUMENT_CONTROL_TYPE_NAMES,INFORMATIVE_CONTROL_TYPE_NAMES, DEFAULT_ACTIONS, THREAD_MAX_RETRIES
1
+ from windows_mcp.tree.config import INTERACTIVE_CONTROL_TYPE_NAMES,DOCUMENT_CONTROL_TYPE_NAMES,INFORMATIVE_CONTROL_TYPE_NAMES, DEFAULT_ACTIONS, THREAD_MAX_RETRIES
2
2
  from uiautomation import Control,ImageControl,ScrollPattern,WindowControl,Rect,GetRootControl,PatternId
3
- from src.tree.views import TreeElementNode, ScrollElementNode, Center, BoundingBox, TreeState
3
+ from windows_mcp.tree.views import TreeElementNode, ScrollElementNode, TextElementNode, Center, BoundingBox, TreeState
4
4
  from concurrent.futures import ThreadPoolExecutor, as_completed
5
- from src.tree.utils import random_point_within_bounding_box
5
+ from windows_mcp.tree.utils import random_point_within_bounding_box
6
6
  from PIL import Image, ImageFont, ImageDraw
7
- from src.desktop.views import App
8
- from typing import TYPE_CHECKING
7
+ from typing import TYPE_CHECKING,Optional
8
+ from windows_mcp.desktop.views import App
9
9
  from time import sleep
10
10
  import logging
11
11
  import random
@@ -18,36 +18,74 @@ handler.setFormatter(formatter)
18
18
  logger.addHandler(handler)
19
19
 
20
20
  if TYPE_CHECKING:
21
- from src.desktop.service import Desktop
21
+ from windows_mcp.desktop.service import Desktop
22
22
 
23
23
  class Tree:
24
24
  def __init__(self,desktop:'Desktop'):
25
25
  self.desktop=desktop
26
- screen_size=self.desktop.get_screen_size()
26
+ self.screen_size=self.desktop.get_screen_size()
27
+ self.dom:Optional[Control]=None
27
28
  self.dom_bounding_box:BoundingBox=None
28
29
  self.screen_box=BoundingBox(
29
- top=0, left=0, bottom=screen_size.height, right=screen_size.width,
30
- width=screen_size.width, height=screen_size.height
30
+ top=0, left=0, bottom=self.screen_size.height, right=self.screen_size.width,
31
+ width=self.screen_size.width, height=self.screen_size.height
31
32
  )
33
+ self.root:Optional[TreeElementNode]=None
32
34
 
33
- def get_state(self,active_app:App,other_apps:list[App])->TreeState:
34
- root=GetRootControl()
35
+ def get_state(self,active_app:App,other_apps:list[App],use_dom:bool=False)->TreeState:
36
+ self.root=GetRootControl()
35
37
  other_apps_handle=set(map(lambda other_app: other_app.handle,other_apps))
36
- apps=list(filter(lambda app:app.NativeWindowHandle not in other_apps_handle,root.GetChildren()))
38
+ apps=list(filter(lambda app:app.NativeWindowHandle not in other_apps_handle,self.root.GetChildren()))
37
39
  del other_apps_handle
38
40
  if active_app:
39
41
  apps=list(filter(lambda app:app.ClassName!='Progman',apps))
40
- interactive_nodes,scrollable_nodes=self.get_appwise_nodes(apps=apps)
41
- return TreeState(interactive_nodes=interactive_nodes,scrollable_nodes=scrollable_nodes)
42
+ interactive_nodes,scrollable_nodes,dom_informative_nodes=self.get_appwise_nodes(apps=apps,use_dom=use_dom)
43
+ root=TreeElementNode(**{
44
+ 'name':'Desktop',
45
+ 'control_type':'PaneControl',
46
+ 'app_name':'Desktop',
47
+ 'value':'',
48
+ 'shortcut':'',
49
+ 'bounding_box':self.screen_box,
50
+ 'center':Center(x=self.screen_box.left+self.screen_box.width//2,y=self.screen_box.top+self.screen_box.height//2),
51
+ 'xpath':'',
52
+ 'is_focused':False
53
+ })
54
+ dom=None
55
+ if self.dom:
56
+ scroll_pattern=self.dom.GetPattern(PatternId.ScrollPattern)
57
+ bounding_box=self.dom.BoundingRectangle
58
+ dom=ScrollElementNode(**{
59
+ 'name':"DOM",
60
+ 'control_type':'DocumentControl',
61
+ 'app_name':"DOM",
62
+ 'bounding_box':BoundingBox(
63
+ left=bounding_box.left,
64
+ top=bounding_box.top,
65
+ right=bounding_box.right,
66
+ bottom=bounding_box.bottom,
67
+ width=bounding_box.width(),
68
+ height=bounding_box.height()
69
+ ),
70
+ 'center':Center(x=bounding_box.left+bounding_box.width()//2,y=bounding_box.top+bounding_box.height()//2),
71
+ 'horizontal_scrollable':scroll_pattern.HorizontallyScrollable,
72
+ 'horizontal_scroll_percent':scroll_pattern.HorizontalScrollPercent if scroll_pattern.HorizontallyScrollable else 0,
73
+ 'vertical_scrollable':scroll_pattern.VerticallyScrollable,
74
+ 'vertical_scroll_percent':scroll_pattern.VerticalScrollPercent if scroll_pattern.VerticallyScrollable else 0,
75
+ 'xpath':'',
76
+ 'is_focused':False
77
+ })
78
+ return TreeState(root=root,dom=dom,interactive_nodes=interactive_nodes,scrollable_nodes=scrollable_nodes,dom_informative_nodes=dom_informative_nodes)
42
79
 
43
- def get_appwise_nodes(self,apps:list[Control]) -> tuple[list[TreeElementNode],list[ScrollElementNode]]:
44
- interactive_nodes, scrollable_nodes = [], []
80
+ def get_appwise_nodes(self,apps:list[Control],use_dom:bool=False)-> tuple[list[TreeElementNode],list[ScrollElementNode],list[TextElementNode]]:
81
+ interactive_nodes, scrollable_nodes,dom_informative_nodes = [], [], []
45
82
  with ThreadPoolExecutor() as executor:
46
83
  retry_counts = {app: 0 for app in apps}
47
84
  future_to_app = {
48
85
  executor.submit(
49
86
  self.get_nodes, app,
50
- self.desktop.is_app_browser(app)
87
+ self.desktop.is_app_browser(app),
88
+ use_dom
51
89
  ): app
52
90
  for app in apps
53
91
  }
@@ -57,18 +95,20 @@ class Tree:
57
95
  try:
58
96
  result = future.result()
59
97
  if result:
60
- element_nodes, scroll_nodes = result
98
+ element_nodes, scroll_nodes,informative_nodes = result
61
99
  interactive_nodes.extend(element_nodes)
62
100
  scrollable_nodes.extend(scroll_nodes)
101
+ dom_informative_nodes.extend(informative_nodes)
63
102
  except Exception as e:
64
103
  retry_counts[app] += 1
65
104
  logger.debug(f"Error in processing node {app.Name}, retry attempt {retry_counts[app]}\nError: {e}")
66
105
  if retry_counts[app] < THREAD_MAX_RETRIES:
67
- new_future = executor.submit(self.get_nodes, app, self.desktop.is_app_browser(app))
106
+ logger.debug(f"Retrying {app.Name} for the {retry_counts[app]}th time")
107
+ new_future = executor.submit(self.get_nodes, app, self.desktop.is_app_browser(app),use_dom)
68
108
  future_to_app[new_future] = app
69
109
  else:
70
110
  logger.error(f"Task failed completely for {app.Name} after {THREAD_MAX_RETRIES} retries")
71
- return interactive_nodes,scrollable_nodes
111
+ return interactive_nodes,scrollable_nodes,dom_informative_nodes
72
112
 
73
113
  def iou_bounding_box(self,window_box: Rect,element_box: Rect,) -> BoundingBox:
74
114
  # Step 1: Intersection of element and window (existing logic)
@@ -105,7 +145,7 @@ class Tree:
105
145
  )
106
146
  return bounding_box
107
147
 
108
- def get_nodes(self, node: Control, is_browser:bool=False) -> tuple[list[TreeElementNode],list[ScrollElementNode]]:
148
+ def get_nodes(self, node: Control, is_browser:bool=False,use_dom:bool=False) -> tuple[list[TreeElementNode],list[ScrollElementNode]]:
109
149
  window_bounding_box=node.BoundingRectangle
110
150
 
111
151
  def is_element_visible(node:Control,threshold:int=0):
@@ -331,11 +371,10 @@ class Tree:
331
371
  'is_focused':is_focused
332
372
  })
333
373
  interactive_nodes.append(tree_node)
334
- # elif is_element_text(node):
335
- # informative_nodes.append(TextElementNode(
336
- # name=node.Name.strip() or "''",
337
- # app_name=app_name
338
- # ))
374
+ elif is_element_text(node):
375
+ dom_informative_nodes.append(TextElementNode(
376
+ text=node.Name.strip(),
377
+ ))
339
378
 
340
379
  children=node.GetChildren()
341
380
 
@@ -344,11 +383,12 @@ class Tree:
344
383
  # Incrementally building the xpath
345
384
 
346
385
  # Check if the child is a DOM element
347
- if is_browser and child.ClassName == "Chrome_RenderWidgetHostHWND":
386
+ if is_browser and child.AutomationId == "RootWebArea":
348
387
  bounding_box=child.BoundingRectangle
349
388
  self.dom_bounding_box=BoundingBox(left=bounding_box.left,top=bounding_box.top,
350
389
  right=bounding_box.right,bottom=bounding_box.bottom,width=bounding_box.width(),
351
390
  height=bounding_box.height())
391
+ self.dom=child
352
392
  # enter DOM subtree
353
393
  tree_traversal(child, is_dom=True, is_dialog=is_dialog)
354
394
  # Check if the child is a dialog
@@ -369,7 +409,7 @@ class Tree:
369
409
  # normal non-dialog children
370
410
  tree_traversal(child, is_dom=is_dom, is_dialog=is_dialog)
371
411
 
372
- interactive_nodes, dom_interactive_nodes, scrollable_nodes = [], [], []
412
+ interactive_nodes, dom_interactive_nodes, scrollable_nodes, dom_informative_nodes = [], [], [], []
373
413
  app_name=node.Name.strip()
374
414
  match node.ClassName:
375
415
  case "Progman":
@@ -386,12 +426,25 @@ class Tree:
386
426
  logger.debug(f'DOM interactive nodes:{len(dom_interactive_nodes)}')
387
427
  logger.debug(f'Scrollable nodes:{len(scrollable_nodes)}')
388
428
 
389
- interactive_nodes.extend(dom_interactive_nodes)
390
- return (interactive_nodes,scrollable_nodes)
429
+ if use_dom:
430
+ if is_browser:
431
+ return (dom_interactive_nodes,scrollable_nodes,dom_informative_nodes)
432
+ else:
433
+ return ([],[],[])
434
+ else:
435
+ return (interactive_nodes+dom_interactive_nodes,scrollable_nodes,dom_informative_nodes)
391
436
 
392
- def annotated_screenshot(self, nodes: list[TreeElementNode]) -> Image.Image:
437
+ def get_annotated_screenshot(self, nodes: list[TreeElementNode],scale:float=1.0) -> Image.Image:
393
438
  screenshot = self.desktop.get_screenshot()
394
439
  sleep(0.10)
440
+
441
+ original_width = screenshot.width
442
+ original_height = screenshot.height
443
+
444
+ scaled_width = int(original_width * scale)
445
+ scaled_height = int(original_height * scale)
446
+ screenshot = screenshot.resize((scaled_width, scaled_height), Image.Resampling.LANCZOS)
447
+
395
448
  # Add padding
396
449
  padding = 5
397
450
  width = int(screenshot.width + (1.5 * padding))
@@ -413,12 +466,12 @@ class Tree:
413
466
  box = node.bounding_box
414
467
  color = get_random_color()
415
468
 
416
- # Scale and pad the bounding box also clip the bounding box
469
+ # Scale and pad the bounding box coordinates
417
470
  adjusted_box = (
418
- int(box.left) + padding,
419
- int(box.top) + padding,
420
- int(box.right) + padding,
421
- int(box.bottom) + padding
471
+ int(box.left * scale) + padding,
472
+ int(box.top * scale) + padding,
473
+ int(box.right * scale) + padding,
474
+ int(box.bottom * scale) + padding
422
475
  )
423
476
  # Draw bounding box
424
477
  draw.rectangle(adjusted_box, outline=color, width=2)
@@ -1,10 +1,14 @@
1
1
  from dataclasses import dataclass,field
2
2
  from tabulate import tabulate
3
+ from typing import Optional
3
4
 
4
5
  @dataclass
5
6
  class TreeState:
7
+ root:Optional['TreeElementNode']=None
8
+ dom:Optional['ScrollElementNode']=None
6
9
  interactive_nodes:list['TreeElementNode']=field(default_factory=list)
7
10
  scrollable_nodes:list['ScrollElementNode']=field(default_factory=list)
11
+ dom_informative_nodes:list['TextElementNode']=field(default_factory=list)
8
12
 
9
13
  def interactive_elements_to_string(self) -> str:
10
14
  if not self.interactive_nodes:
@@ -99,4 +103,8 @@ class ScrollElementNode:
99
103
  self.is_focused
100
104
  ]
101
105
 
106
+ @dataclass
107
+ class TextElementNode:
108
+ text:str
109
+
102
110
  ElementNode=TreeElementNode|ScrollElementNode
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: windows-mcp
3
- Version: 0.5.2
3
+ Version: 0.5.3
4
4
  Summary: Lightweight MCP Server for interacting with Windows Operating System.
5
5
  Project-URL: homepage, https://github.com/CursorTouch
6
6
  Author-email: Jeomon George <jeogeoalukka@gmail.com>
@@ -44,7 +44,6 @@ Requires-Dist: python-levenshtein>=0.27.1
44
44
  Requires-Dist: pywinauto>=0.6.9
45
45
  Requires-Dist: requests>=2.32.3
46
46
  Requires-Dist: tabulate>=0.9.0
47
- Requires-Dist: twine>=6.2.0
48
47
  Requires-Dist: uiautomation>=2.0.24
49
48
  Description-Content-Type: text/markdown
50
49
 
@@ -114,6 +113,58 @@ mcp-name: io.github.CursorTouch/Windows-MCP
114
113
  - **Real-Time Interaction**
115
114
  Typical latency between actions (e.g., from one mouse click to the next) ranges from **0.7 to 2.5 secs**, and may vary slightly depending on the number of active applications, the system load, and the inference speed of the LLM.
116
115
 
116
+ - **DOM Mode for Browser Automation**
117
+ Special `use_dom=True` mode for State-Tool that focuses exclusively on web page content, filtering out browser UI elements for cleaner, more efficient web automation.
118
+
119
+ ## 🌐 DOM Mode for Browser Automation
120
+
121
+ Windows-MCP includes a powerful **DOM Mode** feature that enhances browser automation by focusing on web page content rather than browser UI elements.
122
+
123
+ ### What is DOM Mode?
124
+
125
+ When `use_dom=True` is set in the State-Tool, the MCP server:
126
+ - **Filters out browser UI**: Removes address bars, tabs, toolbars, and other browser chrome elements
127
+ - **Returns only web content**: Provides interactive elements (links, buttons, forms) from the actual web page
128
+ - **Reduces token usage**: Cleaner output means fewer tokens sent to the LLM
129
+ - **Improves accuracy**: LLM focuses only on relevant web page elements
130
+
131
+ ### When to Use DOM Mode
132
+
133
+ ✅ **Use `use_dom=True` when:**
134
+ - Automating web applications or websites
135
+ - Scraping web content
136
+ - Filling out web forms
137
+ - Clicking links or buttons on web pages
138
+ - Testing web interfaces
139
+ - Ignoring browser UI to focus on page content
140
+
141
+ ❌ **Use `use_dom=False` (default) when:**
142
+ - Interacting with browser controls (address bar, tabs, bookmarks)
143
+ - Working with desktop applications
144
+ - Viewing all UI elements, including browser chrome
145
+ - Managing browser settings or extensions
146
+
147
+ ### Example Usage
148
+
149
+ ```python
150
+ # Get web page content only (no browser UI)
151
+ state_tool(use_vision=False, use_dom=True)
152
+
153
+ # Get full desktop state including browser UI
154
+ state_tool(use_vision=False, use_dom=False)
155
+
156
+ # Get web page content with screenshot
157
+ state_tool(use_vision=True, use_dom=True)
158
+ ```
159
+
160
+ ### Benefits
161
+
162
+ 1. **Token Efficiency**: Reduces the amount of data sent to LLM by filtering irrelevant browser UI
163
+ 2. **Better Focus**: LLM concentrates on actionable web page elements
164
+ 3. **Cleaner Output**: Only relevant interactive elements from the DOM are returned
165
+ 4. **Faster Processing**: Less data means faster LLM inference
166
+ 5. **Cost Savings**: Fewer tokens = lower API costs for cloud LLMs
167
+
117
168
  ## 🛠️Installation
118
169
 
119
170
  ### Prerequisites
@@ -317,7 +368,7 @@ MCP Client can access the following tools to interact with Windows:
317
368
  - `Move-Tool`: Move mouse pointer.
318
369
  - `Shortcut-Tool`: Press keyboard shortcuts (`Ctrl+c`, `Alt+Tab`, etc).
319
370
  - `Wait-Tool`: Pause for a defined duration.
320
- - `State-Tool`: Combined snapshot of default language, browser, active apps and interactive, textual and scrollable elements along with screenshot of the desktop..
371
+ - `State-Tool`: Combined snapshot of default language, browser, active apps and interactive, textual and scrollable elements along with a screenshot of the desktop. Supports `use_dom=True` for browser content extraction (web page elements only) and `use_vision=True` for including screenshots.
321
372
  - `App-Tool`: To launch an application from the start menu, resize or move the window and switch between apps.
322
373
  - `Shell-Tool`: To execute PowerShell commands.
323
374
  - `Scrape-Tool`: To scrape the entire webpage for information.
@@ -0,0 +1,16 @@
1
+ windows_mcp/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ windows_mcp/__main__.py,sha256=kCgkB5ckRlb7hgjg_Gpj_OQWiWJdgEOEMcBFJ7Kqmy8,11920
3
+ windows_mcp/desktop/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ windows_mcp/desktop/config.py,sha256=7rAb64pmC275PpNRXVOyOf0Psu089AOosRC8T5kVGWA,384
5
+ windows_mcp/desktop/service.py,sha256=97e2E4TdMs3TwW6CtupVxnwhWqdBKU5eH4MDz6_5Hmk,18469
6
+ windows_mcp/desktop/views.py,sha256=_hZ5sfY1uWVi5mpaysVd-plwP_DT6SXpKa33Z8WT6gI,1523
7
+ windows_mcp/tree/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
+ windows_mcp/tree/config.py,sha256=k-Mjo_yIn0d1AzcEW_bxiaXyBFxBZZSyy7hCNQ3XVp0,1010
9
+ windows_mcp/tree/service.py,sha256=KWdOHY5Q1HU-PJV6vMv_h9KVb5oL_E5vnAW-KICEzfw,24786
10
+ windows_mcp/tree/utils.py,sha256=6hbxdIQPrAY-I3jcHsRqodHlxboTQj2GnLA71bf1lqY,911
11
+ windows_mcp/tree/views.py,sha256=6A1bLGVt_MHPTvQt9kbUFoPpIqMI43JZjOSg-_o3ajk,3479
12
+ windows_mcp-0.5.3.dist-info/METADATA,sha256=aMbmdQu1I-6gK58_Lyz3lA5ZfbzI9XXRUNwNwOaey7o,14541
13
+ windows_mcp-0.5.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
14
+ windows_mcp-0.5.3.dist-info/entry_points.txt,sha256=wW8NcVQ_OJK5e5GemZSE_nOKyxfUtBPq2acFLszRwaw,58
15
+ windows_mcp-0.5.3.dist-info/licenses/LICENSE.md,sha256=U1UM4Xi_IX-jHnHjGT0rETNia-Ck8gd92iSQMqQ6a8Y,1089
16
+ windows_mcp-0.5.3.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: hatchling 1.27.0
2
+ Generator: hatchling 1.28.0
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ windows-mcp = windows_mcp.__main__:main
@@ -1,16 +0,0 @@
1
- main.py,sha256=Bg_iHXmNxIE1uUioBf0OMEolNkYisGCManA9tpLzv5w,9630
2
- src/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
- src/desktop/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
- src/desktop/config.py,sha256=7rAb64pmC275PpNRXVOyOf0Psu089AOosRC8T5kVGWA,384
5
- src/desktop/service.py,sha256=yzB1SFS2h1fSxMHsYOwa0mJLTOSdIyDWAmfex-DX3dM,18295
6
- src/desktop/views.py,sha256=vDPPUfD8vNkCS_4-vc-bA4tqG-klqDtznypAQJCN4TA,1515
7
- src/tree/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
- src/tree/config.py,sha256=k-Mjo_yIn0d1AzcEW_bxiaXyBFxBZZSyy7hCNQ3XVp0,1010
9
- src/tree/service.py,sha256=5RIaabVBwmdKSsmaxTV8UW2f6VFwmyeJTvNWhoudTeM,21864
10
- src/tree/utils.py,sha256=6hbxdIQPrAY-I3jcHsRqodHlxboTQj2GnLA71bf1lqY,911
11
- src/tree/views.py,sha256=DVgB8x7Mg9NaZL5xZzhOAzgLuwFw6DWFTLK5hIxWsvk,3232
12
- windows_mcp-0.5.2.dist-info/METADATA,sha256=Vp5YyAirr8qtj7SMekByPhW8Fx9PbpamHVFsD2X1xlY,12380
13
- windows_mcp-0.5.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
14
- windows_mcp-0.5.2.dist-info/entry_points.txt,sha256=NMSKckn68nbiSSmQ9eFiP8cmPrDSR_vzeYE-Zqmhn_o,42
15
- windows_mcp-0.5.2.dist-info/licenses/LICENSE.md,sha256=U1UM4Xi_IX-jHnHjGT0rETNia-Ck8gd92iSQMqQ6a8Y,1089
16
- windows_mcp-0.5.2.dist-info/RECORD,,
@@ -1,2 +0,0 @@
1
- [console_scripts]
2
- windows-mcp = main:main
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes