windows-mcp 0.5.2__py3-none-any.whl → 0.5.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- main.py → windows_mcp/__main__.py +38 -17
- {src → windows_mcp}/desktop/service.py +9 -6
- {src → windows_mcp}/desktop/views.py +1 -1
- {src → windows_mcp}/tree/service.py +89 -36
- {src → windows_mcp}/tree/views.py +8 -0
- {windows_mcp-0.5.2.dist-info → windows_mcp-0.5.3.dist-info}/METADATA +54 -3
- windows_mcp-0.5.3.dist-info/RECORD +16 -0
- {windows_mcp-0.5.2.dist-info → windows_mcp-0.5.3.dist-info}/WHEEL +1 -1
- windows_mcp-0.5.3.dist-info/entry_points.txt +2 -0
- windows_mcp-0.5.2.dist-info/RECORD +0 -16
- windows_mcp-0.5.2.dist-info/entry_points.txt +0 -2
- {src → windows_mcp}/__init__.py +0 -0
- {src → windows_mcp}/desktop/__init__.py +0 -0
- {src → windows_mcp}/desktop/config.py +0 -0
- {src → windows_mcp}/tree/__init__.py +0 -0
- {src → windows_mcp}/tree/config.py +0 -0
- {src → windows_mcp}/tree/utils.py +0 -0
- {windows_mcp-0.5.2.dist-info → windows_mcp-0.5.3.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from live_inspect.watch_cursor import WatchCursor
|
|
2
2
|
from contextlib import asynccontextmanager
|
|
3
3
|
from fastmcp.utilities.types import Image
|
|
4
|
-
from
|
|
4
|
+
from windows_mcp.desktop.service import Desktop
|
|
5
5
|
from mcp.types import ToolAnnotations
|
|
6
6
|
from humancursor import SystemCursor
|
|
7
7
|
from textwrap import dedent
|
|
@@ -19,6 +19,7 @@ cursor=SystemCursor()
|
|
|
19
19
|
watch_cursor=WatchCursor()
|
|
20
20
|
windows_version=desktop.get_windows_version()
|
|
21
21
|
default_language=desktop.get_default_language()
|
|
22
|
+
screen_width,screen_height=desktop.get_resolution()
|
|
22
23
|
|
|
23
24
|
instructions=dedent(f'''
|
|
24
25
|
Windows MCP server provides tools to interact directly with the {windows_version} desktop,
|
|
@@ -39,11 +40,11 @@ mcp=FastMCP(name='windows-mcp',instructions=instructions,lifespan=lifespan)
|
|
|
39
40
|
|
|
40
41
|
@mcp.tool(
|
|
41
42
|
name="App-Tool",
|
|
42
|
-
description="Manages Windows applications
|
|
43
|
+
description="Manages Windows applications with three modes: 'launch' (start app by name), 'resize' (set window position/size using window_loc=[x,y] and window_size=[width,height]), 'switch' (activate app by name). Essential for application lifecycle management.",
|
|
43
44
|
annotations=ToolAnnotations(
|
|
44
45
|
title="App Tool",
|
|
45
46
|
readOnlyHint=False,
|
|
46
|
-
destructiveHint=
|
|
47
|
+
destructiveHint=True,
|
|
47
48
|
idempotentHint=False,
|
|
48
49
|
openWorldHint=False
|
|
49
50
|
)
|
|
@@ -53,7 +54,7 @@ def app_tool(mode:Literal['launch','resize','switch'],name:str|None=None,window_
|
|
|
53
54
|
|
|
54
55
|
@mcp.tool(
|
|
55
56
|
name='Powershell-Tool',
|
|
56
|
-
description='Execute PowerShell commands and return
|
|
57
|
+
description='Execute PowerShell commands directly on the Windows system and return output with status code. Supports all PowerShell cmdlets, scripts, and system commands. Use for file operations, system queries, and administrative tasks.',
|
|
57
58
|
annotations=ToolAnnotations(
|
|
58
59
|
title="Powershell Tool",
|
|
59
60
|
readOnlyHint=False,
|
|
@@ -68,15 +69,23 @@ def powershell_tool(command: str) -> str:
|
|
|
68
69
|
|
|
69
70
|
@mcp.tool(
|
|
70
71
|
name='State-Tool',
|
|
71
|
-
description='
|
|
72
|
+
description='Captures complete desktop state including: system language, focused/opened apps, interactive elements (buttons, text fields, links, menus with coordinates), and scrollable areas. Set use_vision=True to include screenshot. Set use_dom=True for browser content to get web page elements instead of browser UI. Always call this first to understand the current desktop state before taking actions.',
|
|
72
73
|
annotations=ToolAnnotations(
|
|
73
74
|
title="State Tool",
|
|
74
75
|
readOnlyHint=True,
|
|
76
|
+
destructiveHint=False,
|
|
77
|
+
idempotentHint=True,
|
|
75
78
|
openWorldHint=False
|
|
76
79
|
)
|
|
77
80
|
)
|
|
78
|
-
def state_tool(use_vision:bool=False):
|
|
79
|
-
|
|
81
|
+
def state_tool(use_vision:bool=False,use_dom:bool=False):
|
|
82
|
+
# Calculate scale factor to cap resolution at 1080p (1920x1080)
|
|
83
|
+
max_width, max_height = 1920, 1080
|
|
84
|
+
scale_width = max_width / screen_width if screen_width > max_width else 1.0
|
|
85
|
+
scale_height = max_height / screen_height if screen_height > max_height else 1.0
|
|
86
|
+
scale = min(scale_width, scale_height) # Use the smaller scale to ensure both dimensions fit
|
|
87
|
+
|
|
88
|
+
desktop_state=desktop.get_state(use_vision=use_vision,use_dom=use_dom,as_bytes=True,scale=scale)
|
|
80
89
|
interactive_elements=desktop_state.tree_state.interactive_elements_to_string()
|
|
81
90
|
scrollable_elements=desktop_state.tree_state.scrollable_elements_to_string()
|
|
82
91
|
apps=desktop_state.apps_to_string()
|
|
@@ -100,7 +109,7 @@ def state_tool(use_vision:bool=False):
|
|
|
100
109
|
|
|
101
110
|
@mcp.tool(
|
|
102
111
|
name='Click-Tool',
|
|
103
|
-
description='
|
|
112
|
+
description='Performs mouse clicks at specified coordinates [x, y]. Supports button types: left (default), right (context menu), middle. Supports clicks: 1 (single), 2 (double), 3 (triple). Always use coordinates from State-Tool output to ensure accuracy.',
|
|
104
113
|
annotations=ToolAnnotations(
|
|
105
114
|
title="Click Tool",
|
|
106
115
|
readOnlyHint=False,
|
|
@@ -119,7 +128,7 @@ def click_tool(loc:list[int],button:Literal['left','right','middle']='left',clic
|
|
|
119
128
|
|
|
120
129
|
@mcp.tool(
|
|
121
130
|
name='Type-Tool',
|
|
122
|
-
description='
|
|
131
|
+
description='Types text at specified coordinates [x, y]. Set clear=True to clear existing text first (Ctrl+A then type), clear=False to append. Set press_enter=True to submit after typing. Always click on the target input field first to ensure focus.',
|
|
123
132
|
annotations=ToolAnnotations(
|
|
124
133
|
title="Type Tool",
|
|
125
134
|
readOnlyHint=False,
|
|
@@ -137,7 +146,7 @@ def type_tool(loc:list[int],text:str,clear:bool=False,press_enter:bool=False)->s
|
|
|
137
146
|
|
|
138
147
|
@mcp.tool(
|
|
139
148
|
name='Scroll-Tool',
|
|
140
|
-
description='
|
|
149
|
+
description='Scrolls at coordinates [x, y] or current mouse position if loc=None. Type: vertical (default) or horizontal. Direction: up/down for vertical, left/right for horizontal. wheel_times controls amount (1 wheel ≈ 3-5 lines). Use for navigating long content, lists, and web pages.',
|
|
141
150
|
annotations=ToolAnnotations(
|
|
142
151
|
title="Scroll Tool",
|
|
143
152
|
readOnlyHint=False,
|
|
@@ -156,7 +165,7 @@ def scroll_tool(loc:list[int]=None,type:Literal['horizontal','vertical']='vertic
|
|
|
156
165
|
|
|
157
166
|
@mcp.tool(
|
|
158
167
|
name='Drag-Tool',
|
|
159
|
-
description='
|
|
168
|
+
description='Performs drag-and-drop from current mouse position to destination coordinates [x, y]. Click or move to source position first, then call this tool with target coordinates. Use for moving files, reordering items, resizing windows, or any drag-drop UI interactions.',
|
|
160
169
|
annotations=ToolAnnotations(
|
|
161
170
|
title="Drag Tool",
|
|
162
171
|
readOnlyHint=False,
|
|
@@ -174,7 +183,7 @@ def drag_tool(to_loc:list[int])->str:
|
|
|
174
183
|
|
|
175
184
|
@mcp.tool(
|
|
176
185
|
name='Move-Tool',
|
|
177
|
-
description='
|
|
186
|
+
description='Moves mouse cursor to coordinates [x, y] without clicking. Use for hovering to reveal tooltips/menus, positioning cursor before drag operations, or triggering hover-based UI changes. Does not interact with elements.',
|
|
178
187
|
annotations=ToolAnnotations(
|
|
179
188
|
title="Move Tool",
|
|
180
189
|
readOnlyHint=False,
|
|
@@ -192,7 +201,7 @@ def move_tool(to_loc:list[int])->str:
|
|
|
192
201
|
|
|
193
202
|
@mcp.tool(
|
|
194
203
|
name='Shortcut-Tool',
|
|
195
|
-
description='
|
|
204
|
+
description='Executes keyboard shortcuts using key combinations separated by +. Examples: "ctrl+c" (copy), "ctrl+v" (paste), "alt+tab" (switch apps), "win+r" (Run dialog), "win" (Start menu), "ctrl+shift+esc" (Task Manager). Use for quick actions and system commands.',
|
|
196
205
|
annotations=ToolAnnotations(
|
|
197
206
|
title="Shortcut Tool",
|
|
198
207
|
readOnlyHint=False,
|
|
@@ -207,10 +216,12 @@ def shortcut_tool(shortcut:str):
|
|
|
207
216
|
|
|
208
217
|
@mcp.tool(
|
|
209
218
|
name='Wait-Tool',
|
|
210
|
-
description='
|
|
219
|
+
description='Pauses execution for specified duration in seconds. Use when waiting for: applications to launch/load, UI animations to complete, page content to render, dialogs to appear, or between rapid actions. Helps ensure UI is ready before next interaction.',
|
|
211
220
|
annotations=ToolAnnotations(
|
|
212
221
|
title="Wait Tool",
|
|
213
222
|
readOnlyHint=True,
|
|
223
|
+
destructiveHint=False,
|
|
224
|
+
idempotentHint=True,
|
|
214
225
|
openWorldHint=False
|
|
215
226
|
)
|
|
216
227
|
)
|
|
@@ -220,16 +231,26 @@ def wait_tool(duration:int)->str:
|
|
|
220
231
|
|
|
221
232
|
@mcp.tool(
|
|
222
233
|
name='Scrape-Tool',
|
|
223
|
-
description='
|
|
234
|
+
description='Extracts visible text content from the currently focused browser tab. Returns content in plain text format with scroll status indicators (top/bottom reached or more content available). Only works when a browser with DOM is active. Use State-Tool with use_dom=True first to ensure browser is ready.',
|
|
224
235
|
annotations=ToolAnnotations(
|
|
225
236
|
title="Scrape Tool",
|
|
226
237
|
readOnlyHint=True,
|
|
238
|
+
destructiveHint=False,
|
|
239
|
+
idempotentHint=True,
|
|
227
240
|
openWorldHint=True
|
|
228
241
|
)
|
|
229
242
|
)
|
|
230
243
|
def scrape_tool(url:str)->str:
    """Return the text content of the browser page captured by the last desktop snapshot.

    The function does not navigate anywhere: ``url`` is only echoed back in the
    result. The content comes from ``desktop.desktop_state``, i.e. whatever the
    most recent state capture stored, so a State-Tool call must have happened
    first for a DOM to be available.

    Args:
        url: Label for the page being scraped; included verbatim in the output.

    Returns:
        A string with the URL, a top-of-page status line, the text of every
        ``dom_informative_nodes`` entry joined by newlines, and a
        bottom-of-page status line. When no snapshot or no DOM is available,
        an explanatory "Unable to scrape" message is returned instead.
    """
    desktop_state=desktop.desktop_state
    # desktop_state is None until a state snapshot has been captured.
    tree_state=desktop_state.tree_state if desktop_state else None
    # TreeState stores the DOM scroll node under `dom`; it has no `dom_node`
    # attribute, so the guard has to look at `dom`.
    dom_node=tree_state.dom if tree_state else None
    if not dom_node:
        return f'Unable to scrape URL: {url}. No DOM node found.'
    vertical_scroll_percent=dom_node.vertical_scroll_percent
    content='\n'.join(node.text for node in tree_state.dom_informative_nodes)
    # Scroll percent tells the caller whether more content exists above/below.
    header_status = "Reached top" if vertical_scroll_percent <= 0 else "Scroll up to see more"
    footer_status = "Reached bottom" if vertical_scroll_percent >= 100 else "Scroll down to see more"
    return f'URL:{url}\nContent:\n{header_status}\n{content}\n{footer_status}'
|
|
233
254
|
|
|
234
255
|
|
|
235
256
|
@click.command()
|
|
@@ -1,10 +1,10 @@
|
|
|
1
|
-
from
|
|
2
|
-
from
|
|
1
|
+
from windows_mcp.desktop.config import BROWSER_NAMES, PROCESS_PER_MONITOR_DPI_AWARE
|
|
2
|
+
from windows_mcp.desktop.views import DesktopState, App, Size, Status
|
|
3
3
|
from locale import getpreferredencoding
|
|
4
4
|
from contextlib import contextmanager
|
|
5
5
|
from typing import Optional,Literal
|
|
6
6
|
from markdownify import markdownify
|
|
7
|
-
from
|
|
7
|
+
from windows_mcp.tree.service import Tree
|
|
8
8
|
from fuzzywuzzy import process
|
|
9
9
|
from psutil import Process
|
|
10
10
|
from time import sleep
|
|
@@ -46,7 +46,10 @@ class Desktop:
|
|
|
46
46
|
self.tree=Tree(self)
|
|
47
47
|
self.desktop_state=None
|
|
48
48
|
|
|
49
|
-
def
|
|
49
|
+
def get_resolution(self)->tuple[int,int]:
|
|
50
|
+
return pg.size()
|
|
51
|
+
|
|
52
|
+
def get_state(self,use_vision:bool=False,use_dom:bool=False,as_bytes:bool=False,scale:float=1.0)->DesktopState:
|
|
50
53
|
sleep(0.1)
|
|
51
54
|
apps=self.get_apps()
|
|
52
55
|
active_app=self.get_active_app()
|
|
@@ -54,9 +57,9 @@ class Desktop:
|
|
|
54
57
|
apps.remove(active_app)
|
|
55
58
|
logger.debug(f"Active app: {active_app}")
|
|
56
59
|
logger.debug(f"Apps: {apps}")
|
|
57
|
-
tree_state=self.tree.get_state(active_app,apps)
|
|
60
|
+
tree_state=self.tree.get_state(active_app,apps,use_dom=use_dom)
|
|
58
61
|
if use_vision:
|
|
59
|
-
screenshot=self.tree.
|
|
62
|
+
screenshot=self.tree.get_annotated_screenshot(tree_state.interactive_nodes,scale=scale)
|
|
60
63
|
if as_bytes:
|
|
61
64
|
bytes_io=io.BytesIO()
|
|
62
65
|
screenshot.save(bytes_io,format='PNG')
|
|
@@ -1,11 +1,11 @@
|
|
|
1
|
-
from
|
|
1
|
+
from windows_mcp.tree.config import INTERACTIVE_CONTROL_TYPE_NAMES,DOCUMENT_CONTROL_TYPE_NAMES,INFORMATIVE_CONTROL_TYPE_NAMES, DEFAULT_ACTIONS, THREAD_MAX_RETRIES
|
|
2
2
|
from uiautomation import Control,ImageControl,ScrollPattern,WindowControl,Rect,GetRootControl,PatternId
|
|
3
|
-
from
|
|
3
|
+
from windows_mcp.tree.views import TreeElementNode, ScrollElementNode, TextElementNode, Center, BoundingBox, TreeState
|
|
4
4
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
5
|
-
from
|
|
5
|
+
from windows_mcp.tree.utils import random_point_within_bounding_box
|
|
6
6
|
from PIL import Image, ImageFont, ImageDraw
|
|
7
|
-
from
|
|
8
|
-
from
|
|
7
|
+
from typing import TYPE_CHECKING,Optional
|
|
8
|
+
from windows_mcp.desktop.views import App
|
|
9
9
|
from time import sleep
|
|
10
10
|
import logging
|
|
11
11
|
import random
|
|
@@ -18,36 +18,74 @@ handler.setFormatter(formatter)
|
|
|
18
18
|
logger.addHandler(handler)
|
|
19
19
|
|
|
20
20
|
if TYPE_CHECKING:
|
|
21
|
-
from
|
|
21
|
+
from windows_mcp.desktop.service import Desktop
|
|
22
22
|
|
|
23
23
|
class Tree:
|
|
24
24
|
def __init__(self,desktop:'Desktop'):
|
|
25
25
|
self.desktop=desktop
|
|
26
|
-
screen_size=self.desktop.get_screen_size()
|
|
26
|
+
self.screen_size=self.desktop.get_screen_size()
|
|
27
|
+
self.dom:Optional[Control]=None
|
|
27
28
|
self.dom_bounding_box:BoundingBox=None
|
|
28
29
|
self.screen_box=BoundingBox(
|
|
29
|
-
top=0, left=0, bottom=screen_size.height, right=screen_size.width,
|
|
30
|
-
width=screen_size.width, height=screen_size.height
|
|
30
|
+
top=0, left=0, bottom=self.screen_size.height, right=self.screen_size.width,
|
|
31
|
+
width=self.screen_size.width, height=self.screen_size.height
|
|
31
32
|
)
|
|
33
|
+
self.root:Optional[TreeElementNode]=None
|
|
32
34
|
|
|
33
|
-
def get_state(self,active_app:App,other_apps:list[App])->TreeState:
|
|
34
|
-
root=GetRootControl()
|
|
35
|
+
def get_state(self,active_app:App,other_apps:list[App],use_dom:bool=False)->TreeState:
|
|
36
|
+
self.root=GetRootControl()
|
|
35
37
|
other_apps_handle=set(map(lambda other_app: other_app.handle,other_apps))
|
|
36
|
-
apps=list(filter(lambda app:app.NativeWindowHandle not in other_apps_handle,root.GetChildren()))
|
|
38
|
+
apps=list(filter(lambda app:app.NativeWindowHandle not in other_apps_handle,self.root.GetChildren()))
|
|
37
39
|
del other_apps_handle
|
|
38
40
|
if active_app:
|
|
39
41
|
apps=list(filter(lambda app:app.ClassName!='Progman',apps))
|
|
40
|
-
interactive_nodes,scrollable_nodes=self.get_appwise_nodes(apps=apps)
|
|
41
|
-
|
|
42
|
+
interactive_nodes,scrollable_nodes,dom_informative_nodes=self.get_appwise_nodes(apps=apps,use_dom=use_dom)
|
|
43
|
+
root=TreeElementNode(**{
|
|
44
|
+
'name':'Desktop',
|
|
45
|
+
'control_type':'PaneControl',
|
|
46
|
+
'app_name':'Desktop',
|
|
47
|
+
'value':'',
|
|
48
|
+
'shortcut':'',
|
|
49
|
+
'bounding_box':self.screen_box,
|
|
50
|
+
'center':Center(x=self.screen_box.left+self.screen_box.width//2,y=self.screen_box.top+self.screen_box.height//2),
|
|
51
|
+
'xpath':'',
|
|
52
|
+
'is_focused':False
|
|
53
|
+
})
|
|
54
|
+
dom=None
|
|
55
|
+
if self.dom:
|
|
56
|
+
scroll_pattern=self.dom.GetPattern(PatternId.ScrollPattern)
|
|
57
|
+
bounding_box=self.dom.BoundingRectangle
|
|
58
|
+
dom=ScrollElementNode(**{
|
|
59
|
+
'name':"DOM",
|
|
60
|
+
'control_type':'DocumentControl',
|
|
61
|
+
'app_name':"DOM",
|
|
62
|
+
'bounding_box':BoundingBox(
|
|
63
|
+
left=bounding_box.left,
|
|
64
|
+
top=bounding_box.top,
|
|
65
|
+
right=bounding_box.right,
|
|
66
|
+
bottom=bounding_box.bottom,
|
|
67
|
+
width=bounding_box.width(),
|
|
68
|
+
height=bounding_box.height()
|
|
69
|
+
),
|
|
70
|
+
'center':Center(x=bounding_box.left+bounding_box.width()//2,y=bounding_box.top+bounding_box.height()//2),
|
|
71
|
+
'horizontal_scrollable':scroll_pattern.HorizontallyScrollable,
|
|
72
|
+
'horizontal_scroll_percent':scroll_pattern.HorizontalScrollPercent if scroll_pattern.HorizontallyScrollable else 0,
|
|
73
|
+
'vertical_scrollable':scroll_pattern.VerticallyScrollable,
|
|
74
|
+
'vertical_scroll_percent':scroll_pattern.VerticalScrollPercent if scroll_pattern.VerticallyScrollable else 0,
|
|
75
|
+
'xpath':'',
|
|
76
|
+
'is_focused':False
|
|
77
|
+
})
|
|
78
|
+
return TreeState(root=root,dom=dom,interactive_nodes=interactive_nodes,scrollable_nodes=scrollable_nodes,dom_informative_nodes=dom_informative_nodes)
|
|
42
79
|
|
|
43
|
-
def get_appwise_nodes(self,apps:list[Control])
|
|
44
|
-
interactive_nodes, scrollable_nodes = [], []
|
|
80
|
+
def get_appwise_nodes(self,apps:list[Control],use_dom:bool=False)-> tuple[list[TreeElementNode],list[ScrollElementNode],list[TextElementNode]]:
|
|
81
|
+
interactive_nodes, scrollable_nodes,dom_informative_nodes = [], [], []
|
|
45
82
|
with ThreadPoolExecutor() as executor:
|
|
46
83
|
retry_counts = {app: 0 for app in apps}
|
|
47
84
|
future_to_app = {
|
|
48
85
|
executor.submit(
|
|
49
86
|
self.get_nodes, app,
|
|
50
|
-
self.desktop.is_app_browser(app)
|
|
87
|
+
self.desktop.is_app_browser(app),
|
|
88
|
+
use_dom
|
|
51
89
|
): app
|
|
52
90
|
for app in apps
|
|
53
91
|
}
|
|
@@ -57,18 +95,20 @@ class Tree:
|
|
|
57
95
|
try:
|
|
58
96
|
result = future.result()
|
|
59
97
|
if result:
|
|
60
|
-
element_nodes, scroll_nodes = result
|
|
98
|
+
element_nodes, scroll_nodes,informative_nodes = result
|
|
61
99
|
interactive_nodes.extend(element_nodes)
|
|
62
100
|
scrollable_nodes.extend(scroll_nodes)
|
|
101
|
+
dom_informative_nodes.extend(informative_nodes)
|
|
63
102
|
except Exception as e:
|
|
64
103
|
retry_counts[app] += 1
|
|
65
104
|
logger.debug(f"Error in processing node {app.Name}, retry attempt {retry_counts[app]}\nError: {e}")
|
|
66
105
|
if retry_counts[app] < THREAD_MAX_RETRIES:
|
|
67
|
-
|
|
106
|
+
logger.debug(f"Retrying {app.Name} for the {retry_counts[app]}th time")
|
|
107
|
+
new_future = executor.submit(self.get_nodes, app, self.desktop.is_app_browser(app),use_dom)
|
|
68
108
|
future_to_app[new_future] = app
|
|
69
109
|
else:
|
|
70
110
|
logger.error(f"Task failed completely for {app.Name} after {THREAD_MAX_RETRIES} retries")
|
|
71
|
-
return interactive_nodes,scrollable_nodes
|
|
111
|
+
return interactive_nodes,scrollable_nodes,dom_informative_nodes
|
|
72
112
|
|
|
73
113
|
def iou_bounding_box(self,window_box: Rect,element_box: Rect,) -> BoundingBox:
|
|
74
114
|
# Step 1: Intersection of element and window (existing logic)
|
|
@@ -105,7 +145,7 @@ class Tree:
|
|
|
105
145
|
)
|
|
106
146
|
return bounding_box
|
|
107
147
|
|
|
108
|
-
def get_nodes(self, node: Control, is_browser:bool=False) -> tuple[list[TreeElementNode],list[ScrollElementNode]]:
|
|
148
|
+
def get_nodes(self, node: Control, is_browser:bool=False,use_dom:bool=False) -> tuple[list[TreeElementNode],list[ScrollElementNode]]:
|
|
109
149
|
window_bounding_box=node.BoundingRectangle
|
|
110
150
|
|
|
111
151
|
def is_element_visible(node:Control,threshold:int=0):
|
|
@@ -331,11 +371,10 @@ class Tree:
|
|
|
331
371
|
'is_focused':is_focused
|
|
332
372
|
})
|
|
333
373
|
interactive_nodes.append(tree_node)
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
# ))
|
|
374
|
+
elif is_element_text(node):
|
|
375
|
+
dom_informative_nodes.append(TextElementNode(
|
|
376
|
+
text=node.Name.strip(),
|
|
377
|
+
))
|
|
339
378
|
|
|
340
379
|
children=node.GetChildren()
|
|
341
380
|
|
|
@@ -344,11 +383,12 @@ class Tree:
|
|
|
344
383
|
# Incrementally building the xpath
|
|
345
384
|
|
|
346
385
|
# Check if the child is a DOM element
|
|
347
|
-
if is_browser and child.
|
|
386
|
+
if is_browser and child.AutomationId == "RootWebArea":
|
|
348
387
|
bounding_box=child.BoundingRectangle
|
|
349
388
|
self.dom_bounding_box=BoundingBox(left=bounding_box.left,top=bounding_box.top,
|
|
350
389
|
right=bounding_box.right,bottom=bounding_box.bottom,width=bounding_box.width(),
|
|
351
390
|
height=bounding_box.height())
|
|
391
|
+
self.dom=child
|
|
352
392
|
# enter DOM subtree
|
|
353
393
|
tree_traversal(child, is_dom=True, is_dialog=is_dialog)
|
|
354
394
|
# Check if the child is a dialog
|
|
@@ -369,7 +409,7 @@ class Tree:
|
|
|
369
409
|
# normal non-dialog children
|
|
370
410
|
tree_traversal(child, is_dom=is_dom, is_dialog=is_dialog)
|
|
371
411
|
|
|
372
|
-
interactive_nodes, dom_interactive_nodes, scrollable_nodes = [], [], []
|
|
412
|
+
interactive_nodes, dom_interactive_nodes, scrollable_nodes, dom_informative_nodes = [], [], [], []
|
|
373
413
|
app_name=node.Name.strip()
|
|
374
414
|
match node.ClassName:
|
|
375
415
|
case "Progman":
|
|
@@ -386,12 +426,25 @@ class Tree:
|
|
|
386
426
|
logger.debug(f'DOM interactive nodes:{len(dom_interactive_nodes)}')
|
|
387
427
|
logger.debug(f'Scrollable nodes:{len(scrollable_nodes)}')
|
|
388
428
|
|
|
389
|
-
|
|
390
|
-
|
|
429
|
+
if use_dom:
|
|
430
|
+
if is_browser:
|
|
431
|
+
return (dom_interactive_nodes,scrollable_nodes,dom_informative_nodes)
|
|
432
|
+
else:
|
|
433
|
+
return ([],[],[])
|
|
434
|
+
else:
|
|
435
|
+
return (interactive_nodes+dom_interactive_nodes,scrollable_nodes,dom_informative_nodes)
|
|
391
436
|
|
|
392
|
-
def
|
|
437
|
+
def get_annotated_screenshot(self, nodes: list[TreeElementNode],scale:float=1.0) -> Image.Image:
|
|
393
438
|
screenshot = self.desktop.get_screenshot()
|
|
394
439
|
sleep(0.10)
|
|
440
|
+
|
|
441
|
+
original_width = screenshot.width
|
|
442
|
+
original_height = screenshot.height
|
|
443
|
+
|
|
444
|
+
scaled_width = int(original_width * scale)
|
|
445
|
+
scaled_height = int(original_height * scale)
|
|
446
|
+
screenshot = screenshot.resize((scaled_width, scaled_height), Image.Resampling.LANCZOS)
|
|
447
|
+
|
|
395
448
|
# Add padding
|
|
396
449
|
padding = 5
|
|
397
450
|
width = int(screenshot.width + (1.5 * padding))
|
|
@@ -413,12 +466,12 @@ class Tree:
|
|
|
413
466
|
box = node.bounding_box
|
|
414
467
|
color = get_random_color()
|
|
415
468
|
|
|
416
|
-
# Scale and pad the bounding box
|
|
469
|
+
# Scale and pad the bounding box coordinates
|
|
417
470
|
adjusted_box = (
|
|
418
|
-
int(box.left) + padding,
|
|
419
|
-
int(box.top) + padding,
|
|
420
|
-
int(box.right) + padding,
|
|
421
|
-
int(box.bottom) + padding
|
|
471
|
+
int(box.left * scale) + padding,
|
|
472
|
+
int(box.top * scale) + padding,
|
|
473
|
+
int(box.right * scale) + padding,
|
|
474
|
+
int(box.bottom * scale) + padding
|
|
422
475
|
)
|
|
423
476
|
# Draw bounding box
|
|
424
477
|
draw.rectangle(adjusted_box, outline=color, width=2)
|
|
@@ -1,10 +1,14 @@
|
|
|
1
1
|
from dataclasses import dataclass,field
|
|
2
2
|
from tabulate import tabulate
|
|
3
|
+
from typing import Optional
|
|
3
4
|
|
|
4
5
|
@dataclass
|
|
5
6
|
class TreeState:
|
|
7
|
+
root:Optional['TreeElementNode']=None
|
|
8
|
+
dom:Optional['ScrollElementNode']=None
|
|
6
9
|
interactive_nodes:list['TreeElementNode']=field(default_factory=list)
|
|
7
10
|
scrollable_nodes:list['ScrollElementNode']=field(default_factory=list)
|
|
11
|
+
dom_informative_nodes:list['TextElementNode']=field(default_factory=list)
|
|
8
12
|
|
|
9
13
|
def interactive_elements_to_string(self) -> str:
|
|
10
14
|
if not self.interactive_nodes:
|
|
@@ -99,4 +103,8 @@ class ScrollElementNode:
|
|
|
99
103
|
self.is_focused
|
|
100
104
|
]
|
|
101
105
|
|
|
106
|
+
@dataclass
|
|
107
|
+
class TextElementNode:
|
|
108
|
+
text:str
|
|
109
|
+
|
|
102
110
|
ElementNode=TreeElementNode|ScrollElementNode
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: windows-mcp
|
|
3
|
-
Version: 0.5.
|
|
3
|
+
Version: 0.5.3
|
|
4
4
|
Summary: Lightweight MCP Server for interacting with Windows Operating System.
|
|
5
5
|
Project-URL: homepage, https://github.com/CursorTouch
|
|
6
6
|
Author-email: Jeomon George <jeogeoalukka@gmail.com>
|
|
@@ -44,7 +44,6 @@ Requires-Dist: python-levenshtein>=0.27.1
|
|
|
44
44
|
Requires-Dist: pywinauto>=0.6.9
|
|
45
45
|
Requires-Dist: requests>=2.32.3
|
|
46
46
|
Requires-Dist: tabulate>=0.9.0
|
|
47
|
-
Requires-Dist: twine>=6.2.0
|
|
48
47
|
Requires-Dist: uiautomation>=2.0.24
|
|
49
48
|
Description-Content-Type: text/markdown
|
|
50
49
|
|
|
@@ -114,6 +113,58 @@ mcp-name: io.github.CursorTouch/Windows-MCP
|
|
|
114
113
|
- **Real-Time Interaction**
|
|
115
114
|
Typical latency between actions (e.g., from one mouse click to the next) ranges from **0.7 to 2.5 seconds** and may vary slightly with the number of active applications, the system load, and the inference speed of the LLM.
|
|
116
115
|
|
|
116
|
+
- **DOM Mode for Browser Automation**
|
|
117
|
+
Special `use_dom=True` mode for State-Tool that focuses exclusively on web page content, filtering out browser UI elements for cleaner, more efficient web automation.
|
|
118
|
+
|
|
119
|
+
## 🌐 DOM Mode for Browser Automation
|
|
120
|
+
|
|
121
|
+
Windows-MCP includes a powerful **DOM Mode** feature that enhances browser automation by focusing on web page content rather than browser UI elements.
|
|
122
|
+
|
|
123
|
+
### What is DOM Mode?
|
|
124
|
+
|
|
125
|
+
When `use_dom=True` is set in the State-Tool, the MCP server:
|
|
126
|
+
- **Filters out browser UI**: Removes address bars, tabs, toolbars, and other browser chrome elements
|
|
127
|
+
- **Returns only web content**: Provides interactive elements (links, buttons, forms) from the actual web page
|
|
128
|
+
- **Reduces token usage**: Cleaner output means fewer tokens sent to the LLM
|
|
129
|
+
- **Improves accuracy**: LLM focuses only on relevant web page elements
|
|
130
|
+
|
|
131
|
+
### When to Use DOM Mode
|
|
132
|
+
|
|
133
|
+
✅ **Use `use_dom=True` when:**
|
|
134
|
+
- Automating web applications or websites
|
|
135
|
+
- Scraping web content
|
|
136
|
+
- Filling out web forms
|
|
137
|
+
- Clicking links or buttons on web pages
|
|
138
|
+
- Testing web interfaces
|
|
139
|
+
- You want to ignore browser UI and focus on page content
|
|
140
|
+
|
|
141
|
+
❌ **Use `use_dom=False` (default) when:**
|
|
142
|
+
- Interacting with browser controls (address bar, tabs, bookmarks)
|
|
143
|
+
- Working with desktop applications
|
|
144
|
+
- Need to see all UI elements including browser chrome
|
|
145
|
+
- Managing browser settings or extensions
|
|
146
|
+
|
|
147
|
+
### Example Usage
|
|
148
|
+
|
|
149
|
+
```python
|
|
150
|
+
# Get web page content only (no browser UI)
|
|
151
|
+
state_tool(use_vision=False, use_dom=True)
|
|
152
|
+
|
|
153
|
+
# Get full desktop state including browser UI
|
|
154
|
+
state_tool(use_vision=False, use_dom=False)
|
|
155
|
+
|
|
156
|
+
# Get web page content with screenshot
|
|
157
|
+
state_tool(use_vision=True, use_dom=True)
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
### Benefits
|
|
161
|
+
|
|
162
|
+
1. **Token Efficiency**: Reduces the amount of data sent to LLM by filtering irrelevant browser UI
|
|
163
|
+
2. **Better Focus**: LLM concentrates on actionable web page elements
|
|
164
|
+
3. **Cleaner Output**: Only relevant interactive elements from the DOM are returned
|
|
165
|
+
4. **Faster Processing**: Less data means faster LLM inference
|
|
166
|
+
5. **Cost Savings**: Fewer tokens = lower API costs for cloud LLMs
|
|
167
|
+
|
|
117
168
|
## 🛠️Installation
|
|
118
169
|
|
|
119
170
|
### Prerequisites
|
|
@@ -317,7 +368,7 @@ MCP Client can access the following tools to interact with Windows:
|
|
|
317
368
|
- `Move-Tool`: Move mouse pointer.
|
|
318
369
|
- `Shortcut-Tool`: Press keyboard shortcuts (`Ctrl+c`, `Alt+Tab`, etc).
|
|
319
370
|
- `Wait-Tool`: Pause for a defined duration.
|
|
320
|
-
- `State-Tool`: Combined snapshot of default language, browser, active apps and interactive, textual and scrollable elements along with screenshot of the desktop
|
|
371
|
+
- `State-Tool`: Combined snapshot of default language, browser, active apps and interactive, textual and scrollable elements along with screenshot of the desktop. Supports `use_dom=True` for browser content extraction (web page elements only) and `use_vision=True` for including screenshots.
|
|
321
372
|
- `App-Tool`: To launch an application from the start menu, resize or move the window and switch between apps.
|
|
322
373
|
- `Powershell-Tool`: To execute PowerShell commands.
|
|
323
374
|
- `Scrape-Tool`: To extract the visible text content of the web page in the currently focused browser tab (call `State-Tool` with `use_dom=True` first).
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
windows_mcp/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
windows_mcp/__main__.py,sha256=kCgkB5ckRlb7hgjg_Gpj_OQWiWJdgEOEMcBFJ7Kqmy8,11920
|
|
3
|
+
windows_mcp/desktop/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
|
+
windows_mcp/desktop/config.py,sha256=7rAb64pmC275PpNRXVOyOf0Psu089AOosRC8T5kVGWA,384
|
|
5
|
+
windows_mcp/desktop/service.py,sha256=97e2E4TdMs3TwW6CtupVxnwhWqdBKU5eH4MDz6_5Hmk,18469
|
|
6
|
+
windows_mcp/desktop/views.py,sha256=_hZ5sfY1uWVi5mpaysVd-plwP_DT6SXpKa33Z8WT6gI,1523
|
|
7
|
+
windows_mcp/tree/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
8
|
+
windows_mcp/tree/config.py,sha256=k-Mjo_yIn0d1AzcEW_bxiaXyBFxBZZSyy7hCNQ3XVp0,1010
|
|
9
|
+
windows_mcp/tree/service.py,sha256=KWdOHY5Q1HU-PJV6vMv_h9KVb5oL_E5vnAW-KICEzfw,24786
|
|
10
|
+
windows_mcp/tree/utils.py,sha256=6hbxdIQPrAY-I3jcHsRqodHlxboTQj2GnLA71bf1lqY,911
|
|
11
|
+
windows_mcp/tree/views.py,sha256=6A1bLGVt_MHPTvQt9kbUFoPpIqMI43JZjOSg-_o3ajk,3479
|
|
12
|
+
windows_mcp-0.5.3.dist-info/METADATA,sha256=aMbmdQu1I-6gK58_Lyz3lA5ZfbzI9XXRUNwNwOaey7o,14541
|
|
13
|
+
windows_mcp-0.5.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
14
|
+
windows_mcp-0.5.3.dist-info/entry_points.txt,sha256=wW8NcVQ_OJK5e5GemZSE_nOKyxfUtBPq2acFLszRwaw,58
|
|
15
|
+
windows_mcp-0.5.3.dist-info/licenses/LICENSE.md,sha256=U1UM4Xi_IX-jHnHjGT0rETNia-Ck8gd92iSQMqQ6a8Y,1089
|
|
16
|
+
windows_mcp-0.5.3.dist-info/RECORD,,
|
|
@@ -1,16 +0,0 @@
|
|
|
1
|
-
main.py,sha256=Bg_iHXmNxIE1uUioBf0OMEolNkYisGCManA9tpLzv5w,9630
|
|
2
|
-
src/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
|
-
src/desktop/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
|
-
src/desktop/config.py,sha256=7rAb64pmC275PpNRXVOyOf0Psu089AOosRC8T5kVGWA,384
|
|
5
|
-
src/desktop/service.py,sha256=yzB1SFS2h1fSxMHsYOwa0mJLTOSdIyDWAmfex-DX3dM,18295
|
|
6
|
-
src/desktop/views.py,sha256=vDPPUfD8vNkCS_4-vc-bA4tqG-klqDtznypAQJCN4TA,1515
|
|
7
|
-
src/tree/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
8
|
-
src/tree/config.py,sha256=k-Mjo_yIn0d1AzcEW_bxiaXyBFxBZZSyy7hCNQ3XVp0,1010
|
|
9
|
-
src/tree/service.py,sha256=5RIaabVBwmdKSsmaxTV8UW2f6VFwmyeJTvNWhoudTeM,21864
|
|
10
|
-
src/tree/utils.py,sha256=6hbxdIQPrAY-I3jcHsRqodHlxboTQj2GnLA71bf1lqY,911
|
|
11
|
-
src/tree/views.py,sha256=DVgB8x7Mg9NaZL5xZzhOAzgLuwFw6DWFTLK5hIxWsvk,3232
|
|
12
|
-
windows_mcp-0.5.2.dist-info/METADATA,sha256=Vp5YyAirr8qtj7SMekByPhW8Fx9PbpamHVFsD2X1xlY,12380
|
|
13
|
-
windows_mcp-0.5.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
14
|
-
windows_mcp-0.5.2.dist-info/entry_points.txt,sha256=NMSKckn68nbiSSmQ9eFiP8cmPrDSR_vzeYE-Zqmhn_o,42
|
|
15
|
-
windows_mcp-0.5.2.dist-info/licenses/LICENSE.md,sha256=U1UM4Xi_IX-jHnHjGT0rETNia-Ck8gd92iSQMqQ6a8Y,1089
|
|
16
|
-
windows_mcp-0.5.2.dist-info/RECORD,,
|
{src → windows_mcp}/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|