computeruseprotocol 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cup/mcp/server.py ADDED
@@ -0,0 +1,418 @@
1
+ """CUP MCP Server — Computer Use Protocol tools for AI agents.
2
+
3
+ Exposes simple, focused tools for UI tree snapshot, element search,
4
+ action execution, and screenshots.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import json
10
+
11
+ from mcp.server.fastmcp import FastMCP
12
+ from mcp.server.fastmcp.utilities.types import Image
13
+
14
+ import cup
15
+ from cup.format import _format_line
16
+
17
# Usage guidance handed to FastMCP as the server's ``instructions`` text.
# Kept as adjacent string literals so each line of guidance stays readable.
_SERVER_INSTRUCTIONS = (
    "CUP (Computer Use Protocol) gives you access to the UI accessibility "
    "tree of the user's computer.\n\n"
    "WORKFLOW — follow this pattern:\n"
    "1. snapshot to capture the active window's UI\n"
    "2. find to locate specific elements (PREFERRED over re-capturing)\n"
    "3. action to interact (click, type, press, etc.)\n"
    "4. Re-capture ONLY after actions change the UI\n\n"
    "TOOLS:\n"
    "- snapshot() — active window tree + window list (most common)\n"
    "- snapshot_app(app) — specific app by title (when not in foreground)\n"
    "- overview() — just the window list, near-instant\n"
    "- snapshot_desktop() — desktop icons and widgets\n"
    "- find(role/name/state) — search last tree without re-capturing\n"
    "- action(action, ...) — interact with elements or press keys\n"
    "- open_app(name) — open an app by name with fuzzy matching\n"
    "- screenshot(region) — visual context when tree isn't enough\n\n"
    "IMPORTANT — minimize token usage:\n"
    "- Use find(name=...) to locate elements — NOT repeated tree captures\n"
    "- Use overview() to discover what apps are open\n"
    "- Use snapshot_app(app='...') to target a specific app\n"
    "- snapshot() is your default starting point\n\n"
    "Element IDs (e.g., 'e14') are ephemeral — only valid for the most "
    "recent tree snapshot. After any action, re-capture before using IDs.\n\n"
    "Use action(action='press', keys='ctrl+s') for keyboard shortcuts.\n\n"
    "Use screenshot when you need visual context (colors, images, layout)."
)

# Server object on which every ``@mcp.tool()`` function below registers itself.
mcp = FastMCP(name="cup", instructions=_SERVER_INSTRUCTIONS)
47
+
48
+ # ---------------------------------------------------------------------------
49
+ # Session state (one per MCP server process)
50
+ # ---------------------------------------------------------------------------
51
+
52
# Module-wide session; stays None until the first tool call needs it.
_session: cup.Session | None = None


def _get_session() -> cup.Session:
    """Return the shared ``cup.Session``, creating it on first use.

    The instance is cached in the module-level ``_session`` variable, so
    every later call returns the same object.
    """
    global _session
    if _session is not None:
        return _session
    _session = cup.Session()
    return _session
60
+
61
+
62
+ # ---------------------------------------------------------------------------
63
+ # Tree capture tools
64
+ # ---------------------------------------------------------------------------
65
+
66
+
67
@mcp.tool()
def snapshot() -> str:
    """Capture the foreground (active) window's accessibility tree.

    Returns a structured text representation where each UI element has an ID
    (e.g., 'e14') that can be used with the action tool. The format shows:

        [id] role "name" @x,y wxh {states} [actions] val="value"

    Indentation shows the element hierarchy.

    Also includes a window list in the header showing all open apps.
    This is the primary tool for interacting with the current app's UI.

    Element IDs are ephemeral — they are only valid for THIS snapshot.
    After executing any action, you MUST call this again for fresh IDs.
    """
    # Capture settings for the foreground window, gathered in one place.
    capture_options = {
        "scope": "foreground",
        "max_depth": 999,
        "compact": True,
        "detail": "compact",
    }
    return _get_session().snapshot(**capture_options)
91
+
92
+
93
@mcp.tool()
def snapshot_app(app: str) -> str:
    """Capture a specific app's window accessibility tree by title.

    Use this when you need to interact with a window that is NOT in the
    foreground, or when you know the exact app you want by name.

    The 'app' parameter is a case-insensitive substring match against
    window titles (e.g., "Spotify", "Firefox", "VS Code").

    Returns the same compact format as snapshot, with element IDs
    that can be used with the action tool.

    Element IDs are ephemeral — only valid for THIS snapshot.

    Args:
        app: Target app by window title (case-insensitive substring match).
    """
    # Full-scope capture, narrowed to the requested app by the session.
    capture_options = {
        "scope": "full",
        "app": app,
        "max_depth": 999,
        "compact": True,
        "detail": "compact",
    }
    return _get_session().snapshot(**capture_options)
119
+
120
+
121
@mcp.tool()
def snapshot_desktop() -> str:
    """Capture the desktop surface (icons, widgets, shortcuts).

    Use this to see and interact with desktop items. Falls back to a
    window overview if the platform has no desktop concept.

    Returns the same compact format with element IDs for the action tool.

    Element IDs are ephemeral — only valid for THIS snapshot.
    """
    # Same compact settings as the other capture tools, scoped to the desktop.
    capture_options = {
        "scope": "desktop",
        "max_depth": 999,
        "compact": True,
        "detail": "compact",
    }
    return _get_session().snapshot(**capture_options)
139
+
140
+
141
@mcp.tool()
def overview() -> str:
    """List all open windows. Near-instant, no tree walking.

    Returns a lightweight window list showing app names, PIDs, and bounds.
    No element IDs are returned (no tree walking is performed).

    Use this to quickly discover what apps are open before targeting
    a specific one with snapshot_app(app='...').
    """
    # Only the "overview" scope is requested; no depth or detail settings
    # are passed because no element tree is captured for this call.
    session = _get_session()
    return session.snapshot(scope="overview", compact=True)
153
+
154
+
155
+ # ---------------------------------------------------------------------------
156
+ # Action tools
157
+ # ---------------------------------------------------------------------------
158
+
159
+
160
@mcp.tool()
def action(
    action: str,
    element_id: str | None = None,
    value: str | None = None,
    direction: str | None = None,
    keys: str | None = None,
) -> str:
    """Perform an action on a UI element or send a keyboard shortcut.

    IMPORTANT: Element IDs are only valid from the most recent tree snapshot
    (snapshot, snapshot_app, etc.). After performing any action, re-capture
    for fresh IDs.

    Element actions (require element_id):
        click      — Click/invoke the element
        rightclick — Right-click to open context menu
        doubleclick— Double-click the element
        toggle     — Toggle a checkbox or switch
        type       — Type text into a text field (pass text in 'value')
        setvalue   — Set element value programmatically (pass in 'value')
        select     — Select an item in a list/tree/tab
        expand     — Expand a collapsed element
        collapse   — Collapse an expanded element
        scroll     — Scroll a container (pass direction: up/down/left/right)
        increment  — Increment a slider/spinbutton
        decrement  — Decrement a slider/spinbutton
        focus      — Move keyboard focus to the element

    Keyboard shortcut (no element_id needed):
        press      — Send a keyboard shortcut (pass combo in 'keys')
                     Examples: "enter", "ctrl+s", "ctrl+shift+p", "alt+f4"

    Args:
        action: The action to perform.
        element_id: Element ID from the tree (e.g., "e14"). Required for
            all actions except press.
        value: Text for 'type' or 'setvalue' actions.
        direction: Direction for 'scroll' action (up/down/left/right).
        keys: Key combination for 'press' action (e.g., "ctrl+s").
    """

    def _envelope(success, message, error) -> str:
        # Every outcome is reported with the same three-key JSON object.
        return json.dumps({"success": success, "message": message, "error": error})

    def _rejected(reason: str) -> str:
        # Validation failures never reach the session, so there is no message.
        return _envelope(False, "", reason)

    # The session is obtained before validation, so it is created on the
    # first call even when the arguments turn out to be invalid.
    session = _get_session()

    if action == "press":
        # Keyboard shortcuts are global: they need 'keys', not an element.
        if not keys:
            return _rejected(
                "press action requires the 'keys' parameter (e.g., keys='ctrl+s')."
            )
        outcome = session.press(keys)
    elif not element_id:
        return _rejected(f"Action '{action}' requires the 'element_id' parameter.")
    else:
        # Forward only the optional arguments the caller actually supplied.
        optional_args = (("value", value), ("direction", direction))
        extras: dict = {key: val for key, val in optional_args if val is not None}
        outcome = session.action(element_id, action, **extras)

    return _envelope(outcome.success, outcome.message, outcome.error)
248
+
249
+
250
+ # ---------------------------------------------------------------------------
251
+ # Open app tool
252
+ # ---------------------------------------------------------------------------
253
+
254
+
255
@mcp.tool()
def open_app(name: str) -> str:
    """Open an application by name.

    Fuzzy-matches the name against installed apps on the system.
    Examples: "chrome" → Google Chrome, "code" → Visual Studio Code,
    "notepad" → Notepad, "slack" → Slack.

    Waits for the app window to appear before returning success.

    After opening, use snapshot() to capture the new app's UI tree.

    Args:
        name: Application name to open (fuzzy matched against installed apps).
    """
    outcome = _get_session().open_app(name)
    # Report the outcome as a JSON object with success, message and error keys.
    payload = {
        "success": outcome.success,
        "message": outcome.message,
        "error": outcome.error,
    }
    return json.dumps(payload)
279
+
280
+
281
+ # ---------------------------------------------------------------------------
282
+ # Search tool
283
+ # ---------------------------------------------------------------------------
284
+
285
+
286
@mcp.tool()
def find(
    query: str | None = None,
    role: str | None = None,
    name: str | None = None,
    state: str | None = None,
) -> str:
    """Search the last captured tree for elements matching criteria.

    Searches the FULL tree (including elements not shown in compact output)
    with semantic matching and relevance ranking. Results are sorted by
    relevance — best matches first.

    If no tree has been captured yet in this session, auto-captures the
    foreground window.

    QUERY MODE (recommended):
        Pass a freeform ``query`` describing what you're looking for.
        The query is automatically parsed into role and name signals.

        Examples:
            query="the play button"  -> finds buttons with "play" in the name
            query="search input"     -> finds textbox/combobox/searchbox elements
            query="volume slider"    -> finds sliders with "volume" in the name
            query="Submit"           -> finds elements named "Submit"

    STRUCTURED MODE (backward compatible):
        Pass explicit role, name, and/or state filters.

        role  — CUP role or natural language (e.g., "button", "search bar", "input")
        name  — Fuzzy name match (token overlap, not just substring)
        state — Exact state match (e.g., "focused", "disabled", "checked")

    Both modes can be combined: query + state="focused" narrows to focused elements.

    Args:
        query: Freeform semantic query (e.g., "play button", "search input").
        role: Filter by role (exact CUP role or natural language synonym).
        name: Filter by name (fuzzy token matching).
        state: Filter by state (exact match).
    """
    criteria = {"query": query, "role": role, "name": name, "state": state}

    # Reject a search with no criteria before touching the session at all.
    if all(given is None for given in criteria.values()):
        return json.dumps(
            {
                "success": False,
                "message": "",
                "error": "At least one search parameter (query, role, name, or state) must be provided.",
            }
        )

    matches = _get_session().find(**criteria)

    # An empty result is a successful search, reported as JSON.
    if not matches:
        return json.dumps(
            {
                "success": True,
                "message": "No matching elements found.",
                "matches": 0,
            }
        )

    # Hits are returned as text: a count header, a blank line, then one
    # formatted line per matching node, ending with a trailing newline.
    formatted = [_format_line(node) for node in matches]
    total = len(matches)
    plural = "" if total == 1 else "es"
    report = [f"# {total} match{plural} found", ""]
    report.extend(formatted)
    return "\n".join(report) + "\n"
359
+
360
+
361
+ # ---------------------------------------------------------------------------
362
+ # Screenshot
363
+ # ---------------------------------------------------------------------------
364
+
365
+
366
@mcp.tool()
def screenshot(
    region_x: int | None = None,
    region_y: int | None = None,
    region_w: int | None = None,
    region_h: int | None = None,
) -> Image:
    """Capture a screenshot of the screen and return it as a PNG image.

    By default captures the full primary monitor. Optionally specify a
    region to capture only part of the screen.

    Use this alongside tree capture tools when you need visual context
    (e.g., to see colors, images, or layout that the tree doesn't capture).

    If the arguments are invalid or screenshot support is not installed,
    a JSON error object (success/message/error) is returned instead of
    an image.

    Args:
        region_x: Left edge of capture region in pixels.
        region_y: Top edge of capture region in pixels.
        region_w: Width of capture region in pixels (must be positive).
        region_h: Height of capture region in pixels (must be positive).
    """

    def _rejected(reason: str) -> str:
        # Same three-key error envelope the other tools in this module use.
        return json.dumps({"success": False, "message": "", "error": reason})

    region_params = [region_x, region_y, region_w, region_h]
    has_any = any(v is not None for v in region_params)
    has_all = all(v is not None for v in region_params)

    # A region is all-or-nothing: a partial rectangle cannot be captured.
    if has_any and not has_all:
        return _rejected(
            "All region parameters (region_x, region_y, region_w, region_h) "
            "must be provided together, or none at all."
        )

    region = None
    if has_all:
        # An empty or inverted rectangle is rejected here rather than being
        # handed to the capture backend. Negative x/y are left alone.
        if region_w <= 0 or region_h <= 0:
            return _rejected(
                "region_w and region_h must be positive "
                f"(got region_w={region_w}, region_h={region_h})."
            )
        region = {"x": region_x, "y": region_y, "w": region_w, "h": region_h}

    session = _get_session()
    try:
        png_bytes = session.screenshot(region=region)
    except ImportError:
        # The package is distributed as 'computeruseprotocol'; 'cup' is only
        # the import name, so the install hint must name the distribution.
        return _rejected(
            "Screenshot support requires the 'mss' package. "
            "Install with: pip install computeruseprotocol[screenshot]"
        )

    return Image(data=png_bytes, format="png")
File without changes