computeruseprotocol 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- computeruseprotocol-0.1.0.dist-info/METADATA +225 -0
- computeruseprotocol-0.1.0.dist-info/RECORD +27 -0
- computeruseprotocol-0.1.0.dist-info/WHEEL +4 -0
- computeruseprotocol-0.1.0.dist-info/entry_points.txt +3 -0
- computeruseprotocol-0.1.0.dist-info/licenses/LICENSE +21 -0
- cup/__init__.py +548 -0
- cup/__main__.py +222 -0
- cup/_base.py +123 -0
- cup/_router.py +63 -0
- cup/actions/__init__.py +9 -0
- cup/actions/_handler.py +62 -0
- cup/actions/_keys.py +56 -0
- cup/actions/_linux.py +1008 -0
- cup/actions/_macos.py +1090 -0
- cup/actions/_web.py +555 -0
- cup/actions/_windows.py +984 -0
- cup/actions/executor.py +162 -0
- cup/format.py +653 -0
- cup/mcp/__init__.py +1 -0
- cup/mcp/__main__.py +11 -0
- cup/mcp/server.py +418 -0
- cup/platforms/__init__.py +0 -0
- cup/platforms/linux.py +1060 -0
- cup/platforms/macos.py +1005 -0
- cup/platforms/web.py +1009 -0
- cup/platforms/windows.py +935 -0
- cup/search.py +583 -0
cup/mcp/server.py
ADDED
|
@@ -0,0 +1,418 @@
|
|
|
1
|
+
"""CUP MCP Server — Computer Use Protocol tools for AI agents.
|
|
2
|
+
|
|
3
|
+
Exposes simple, focused tools for UI tree snapshot, element search,
|
|
4
|
+
action execution, and screenshots.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import json
|
|
10
|
+
|
|
11
|
+
from mcp.server.fastmcp import FastMCP
|
|
12
|
+
from mcp.server.fastmcp.utilities.types import Image
|
|
13
|
+
|
|
14
|
+
import cup
|
|
15
|
+
from cup.format import _format_line
|
|
16
|
+
|
|
17
|
+
# The server-level instructions are sent to the connecting agent; the text is
# assembled from logical sections so each one can be read (and edited) alone.
mcp = FastMCP(
    name="cup",
    instructions="".join(
        (
            # What the server is.
            "CUP (Computer Use Protocol) gives you access to the UI accessibility "
            "tree of the user's computer.\n\n",
            # Recommended call sequence.
            "WORKFLOW — follow this pattern:\n"
            "1. snapshot to capture the active window's UI\n"
            "2. find to locate specific elements (PREFERRED over re-capturing)\n"
            "3. action to interact (click, type, press, etc.)\n"
            "4. Re-capture ONLY after actions change the UI\n\n",
            # One-line summary of every registered tool.
            "TOOLS:\n"
            "- snapshot() — active window tree + window list (most common)\n"
            "- snapshot_app(app) — specific app by title (when not in foreground)\n"
            "- overview() — just the window list, near-instant\n"
            "- snapshot_desktop() — desktop icons and widgets\n"
            "- find(role/name/state) — search last tree without re-capturing\n"
            "- action(action, ...) — interact with elements or press keys\n"
            "- open_app(name) — open an app by name with fuzzy matching\n"
            "- screenshot(region) — visual context when tree isn't enough\n\n",
            # Guidance on keeping responses small.
            "IMPORTANT — minimize token usage:\n"
            "- Use find(name=...) to locate elements — NOT repeated tree captures\n"
            "- Use overview() to discover what apps are open\n"
            "- Use snapshot_app(app='...') to target a specific app\n"
            "- snapshot() is your default starting point\n\n",
            # Lifetime of element IDs.
            "Element IDs (e.g., 'e14') are ephemeral — only valid for the most "
            "recent tree snapshot. After any action, re-capture before using IDs.\n\n",
            # Keyboard shortcuts and screenshots.
            "Use action(action='press', keys='ctrl+s') for keyboard shortcuts.\n\n",
            "Use screenshot when you need visual context (colors, images, layout).",
        )
    ),
)
|
|
47
|
+
|
|
48
|
+
# ---------------------------------------------------------------------------
|
|
49
|
+
# Session state (one per MCP server process)
|
|
50
|
+
# ---------------------------------------------------------------------------
|
|
51
|
+
|
|
52
|
+
# Lazily created session shared by every tool in this server process.
_session: cup.Session | None = None


def _get_session() -> cup.Session:
    """Return the module-wide ``cup.Session``, creating it on first use.

    The instance is cached in the module-level ``_session`` variable, so all
    tool calls in this process operate on the same session object.
    """
    global _session
    if _session is not None:
        return _session
    _session = cup.Session()
    return _session
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
# ---------------------------------------------------------------------------
|
|
63
|
+
# Tree capture tools
|
|
64
|
+
# ---------------------------------------------------------------------------
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
@mcp.tool()
def snapshot() -> str:
    """Capture the foreground (active) window's accessibility tree.

    Returns a structured text representation where each UI element has an ID
    (e.g., 'e14') that can be used with the action tool. The format shows:

        [id] role "name" @x,y wxh {states} [actions] val="value"

    Indentation shows the element hierarchy.

    Also includes a window list in the header showing all open apps.
    This is the primary tool for interacting with the current app's UI.

    Element IDs are ephemeral — they are only valid for THIS snapshot.
    After executing any action, you MUST call this again for fresh IDs.
    """
    # Foreground-only capture, rendered as compact text; max_depth=999 is
    # passed as the depth limit.
    capture_options = {
        "scope": "foreground",
        "max_depth": 999,
        "compact": True,
        "detail": "compact",
    }
    return _get_session().snapshot(**capture_options)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
@mcp.tool()
def snapshot_app(app: str) -> str:
    """Capture a specific app's window accessibility tree by title.

    Use this when you need to interact with a window that is NOT in the
    foreground, or when you know the exact app you want by name.

    The 'app' parameter is a case-insensitive substring match against
    window titles (e.g., "Spotify", "Firefox", "VS Code").

    Returns the same compact format as snapshot, with element IDs
    that can be used with the action tool.

    Element IDs are ephemeral — only valid for THIS snapshot.

    Args:
        app: Target app by window title (case-insensitive substring match).
    """
    # "full" scope combined with the ``app`` filter selects the requested
    # application's window(s) rather than whatever is in the foreground.
    capture_options = {
        "scope": "full",
        "app": app,
        "max_depth": 999,
        "compact": True,
        "detail": "compact",
    }
    return _get_session().snapshot(**capture_options)
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
@mcp.tool()
def snapshot_desktop() -> str:
    """Capture the desktop surface (icons, widgets, shortcuts).

    Use this to see and interact with desktop items. Falls back to a
    window overview if the platform has no desktop concept.

    Returns the same compact format with element IDs for the action tool.

    Element IDs are ephemeral — only valid for THIS snapshot.
    """
    # Same rendering options as the other snapshot tools; only the scope differs.
    capture_options = {
        "scope": "desktop",
        "max_depth": 999,
        "compact": True,
        "detail": "compact",
    }
    return _get_session().snapshot(**capture_options)
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
@mcp.tool()
def overview() -> str:
    """List all open windows. Near-instant, no tree walking.

    Returns a lightweight window list showing app names, PIDs, and bounds.
    No element IDs are returned (no tree walking is performed).

    Use this to quickly discover what apps are open before targeting
    a specific one with snapshot_app(app='...').
    """
    # The "overview" scope is requested without max_depth/detail options,
    # unlike the tree-capturing snapshot tools.
    session = _get_session()
    return session.snapshot(scope="overview", compact=True)
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
# ---------------------------------------------------------------------------
|
|
156
|
+
# Action tools
|
|
157
|
+
# ---------------------------------------------------------------------------
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def _action_reply(success, message, error) -> str:
    """Serialize an action outcome as the JSON envelope returned by ``action``."""
    return json.dumps({"success": success, "message": message, "error": error})


@mcp.tool()
def action(
    action: str,
    element_id: str | None = None,
    value: str | None = None,
    direction: str | None = None,
    keys: str | None = None,
) -> str:
    """Perform an action on a UI element or send a keyboard shortcut.

    IMPORTANT: Element IDs are only valid from the most recent tree snapshot
    (snapshot, snapshot_app, etc.). After performing any action, re-capture
    for fresh IDs.

    Element actions (require element_id):
        click      — Click/invoke the element
        rightclick — Right-click to open context menu
        doubleclick— Double-click the element
        toggle     — Toggle a checkbox or switch
        type       — Type text into a text field (pass text in 'value')
        setvalue   — Set element value programmatically (pass in 'value')
        select     — Select an item in a list/tree/tab
        expand     — Expand a collapsed element
        collapse   — Collapse an expanded element
        scroll     — Scroll a container (pass direction: up/down/left/right)
        increment  — Increment a slider/spinbutton
        decrement  — Decrement a slider/spinbutton
        focus      — Move keyboard focus to the element

    Keyboard shortcut (no element_id needed):
        press      — Send a keyboard shortcut (pass combo in 'keys')
                     Examples: "enter", "ctrl+s", "ctrl+shift+p", "alt+f4"

    Args:
        action: The action to perform.
        element_id: Element ID from the tree (e.g., "e14"). Required for
            all actions except press.
        value: Text for 'type' or 'setvalue' actions.
        direction: Direction for 'scroll' action (up/down/left/right).
        keys: Key combination for 'press' action (e.g., "ctrl+s").
    """
    # The session is obtained before any validation, so it is created on the
    # first call even when that call turns out to be invalid.
    session = _get_session()

    # Keyboard shortcuts go through session.press and ignore element_id.
    if action == "press":
        if not keys:
            return _action_reply(
                False,
                "",
                "press action requires the 'keys' parameter (e.g., keys='ctrl+s').",
            )
        outcome = session.press(keys)
        return _action_reply(outcome.success, outcome.message, outcome.error)

    # Every remaining action targets a specific element.
    if not element_id:
        return _action_reply(
            False,
            "",
            f"Action '{action}' requires the 'element_id' parameter.",
        )

    # Forward only the optional arguments the caller actually supplied.
    supplied = (("value", value), ("direction", direction))
    extras: dict = {key: val for key, val in supplied if val is not None}

    outcome = session.action(element_id, action, **extras)
    return _action_reply(outcome.success, outcome.message, outcome.error)
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
# ---------------------------------------------------------------------------
|
|
251
|
+
# Open app tool
|
|
252
|
+
# ---------------------------------------------------------------------------
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
@mcp.tool()
def open_app(name: str) -> str:
    """Open an application by name.

    Fuzzy-matches the name against installed apps on the system.
    Examples: "chrome" → Google Chrome, "code" → Visual Studio Code,
    "notepad" → Notepad, "slack" → Slack.

    Waits for the app window to appear before returning success.

    After opening, use snapshot() to capture the new app's UI tree.

    Args:
        name: Application name to open (fuzzy matched against installed apps).
    """
    outcome = _get_session().open_app(name)
    # Same success/message/error JSON envelope the action tool returns.
    payload = dict(
        success=outcome.success,
        message=outcome.message,
        error=outcome.error,
    )
    return json.dumps(payload)
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
# ---------------------------------------------------------------------------
|
|
282
|
+
# Search tool
|
|
283
|
+
# ---------------------------------------------------------------------------
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
@mcp.tool()
def find(
    query: str | None = None,
    role: str | None = None,
    name: str | None = None,
    state: str | None = None,
) -> str:
    """Search the last captured tree for elements matching criteria.

    Searches the FULL tree (including elements not shown in compact output)
    with semantic matching and relevance ranking. Results are sorted by
    relevance — best matches first.

    If no tree has been captured yet in this session, auto-captures the
    foreground window.

    QUERY MODE (recommended):
        Pass a freeform ``query`` describing what you're looking for.
        The query is automatically parsed into role and name signals.

        Examples:
            query="the play button"   -> finds buttons with "play" in the name
            query="search input"      -> finds textbox/combobox/searchbox elements
            query="volume slider"     -> finds sliders with "volume" in the name
            query="Submit"            -> finds elements named "Submit"

    STRUCTURED MODE (backward compatible):
        Pass explicit role, name, and/or state filters.

        role  — CUP role or natural language (e.g., "button", "search bar", "input")
        name  — Fuzzy name match (token overlap, not just substring)
        state — Exact state match (e.g., "focused", "disabled", "checked")

    Both modes can be combined: query + state="focused" narrows to focused elements.

    Args:
        query: Freeform semantic query (e.g., "play button", "search input").
        role: Filter by role (exact CUP role or natural language synonym).
        name: Filter by name (fuzzy token matching).
        state: Filter by state (exact match).
    """
    filters = {"query": query, "role": role, "name": name, "state": state}

    # Reject the call outright when no filter was supplied (None for all four).
    if all(criterion is None for criterion in filters.values()):
        return json.dumps(
            {
                "success": False,
                "message": "",
                "error": "At least one search parameter (query, role, name, or state) must be provided.",
            }
        )

    hits = _get_session().find(**filters)

    # An empty result is reported as a successful JSON reply, not an error.
    if not hits:
        return json.dumps(
            {
                "success": True,
                "message": "No matching elements found.",
                "matches": 0,
            }
        )

    # Text report: a "# N match(es) found" header, a blank line, then one
    # formatted line per matching node, terminated by a newline.
    rendered = [_format_line(node) for node in hits]
    total = len(hits)
    plural = "" if total == 1 else "es"
    report = [f"# {total} match{plural} found", ""]
    report.extend(rendered)
    return "\n".join(report) + "\n"
|
|
359
|
+
|
|
360
|
+
|
|
361
|
+
# ---------------------------------------------------------------------------
|
|
362
|
+
# Screenshot
|
|
363
|
+
# ---------------------------------------------------------------------------
|
|
364
|
+
|
|
365
|
+
|
|
366
|
+
@mcp.tool()
def screenshot(
    region_x: int | None = None,
    region_y: int | None = None,
    region_w: int | None = None,
    region_h: int | None = None,
) -> Image:
    """Capture a screenshot of the screen and return it as a PNG image.

    By default captures the full primary monitor. Optionally specify a
    region to capture only part of the screen. The region is all-or-nothing:
    pass all four region_* values, or none of them.

    Use this alongside tree capture tools when you need visual context
    (e.g., to see colors, images, or layout that the tree doesn't capture).

    Args:
        region_x: Left edge of capture region in pixels.
        region_y: Top edge of capture region in pixels.
        region_w: Width of capture region in pixels.
        region_h: Height of capture region in pixels.
    """
    # NOTE(review): both error paths below return a JSON *string* although the
    # signature is annotated ``-> Image``. The behaviour is kept as-is so
    # existing clients keep receiving the same success/message/error envelope.
    region_params = (region_x, region_y, region_w, region_h)
    supplied = [v is not None for v in region_params]
    has_any = any(supplied)
    has_all = all(supplied)

    # A partially specified region is ambiguous; refuse it instead of guessing.
    if has_any and not has_all:
        return json.dumps(
            {
                "success": False,
                "message": "",
                "error": "All region parameters (region_x, region_y, region_w, region_h) "
                "must be provided together, or none at all.",
            }
        )

    # ``None`` is passed through when no region was requested.
    region = None
    if has_all:
        region = {"x": region_x, "y": region_y, "w": region_w, "h": region_h}

    session = _get_session()
    try:
        png_bytes = session.screenshot(region=region)
    except ImportError:
        # The install hint names this project's distribution
        # ("computeruseprotocol"); "cup" is only the import name, so
        # "pip install cup[...]" would fetch a different project.
        return json.dumps(
            {
                "success": False,
                "message": "",
                "error": "Screenshot support requires the 'mss' package. "
                "Install with: pip install computeruseprotocol[screenshot]",
            }
        )

    return Image(data=png_bytes, format="png")
|
|
File without changes
|