desk-mcp 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,10 @@
1
+ Metadata-Version: 2.3
2
+ Name: desk-mcp
3
+ Version: 0.2.0
4
+ Summary: Desktop automation MCP — screenshot (calibrated), mouse/keyboard via xdotool, window inspection
5
+ Requires-Dist: fastmcp>=2.0
6
+ Requires-Dist: pillow>=10.0
7
+ Requires-Dist: dbus-python>=1.3
8
+ Requires-Dist: typer>=0.12
9
+ Requires-Dist: rich>=13.0
10
+ Requires-Python: >=3.12
@@ -0,0 +1,21 @@
1
+ [project]
2
+ name = "desk-mcp"
3
+ version = "0.2.0"
4
+ description = "Desktop automation MCP — screenshot (calibrated), mouse/keyboard via xdotool, window inspection"
5
+ requires-python = ">=3.12"
6
+ dependencies = [
7
+ "fastmcp>=2.0",
8
+ "Pillow>=10.0",
9
+ "dbus-python>=1.3",
10
+ "typer>=0.12",
11
+ "rich>=13.0",
12
+ ]
13
+
14
+ [project.scripts]
15
+ desk-mcp = "desk_mcp.cli:app"
16
+
17
+ [build-system]
18
+ requires = ["uv_build"]
19
+ build-backend = "uv_build"
20
+
21
+ [tool.uv.sources]
@@ -0,0 +1,2 @@
1
"""desk-mcp — Desktop automation MCP for Claude Code."""

# Must match the `version` declared in pyproject.toml / package metadata.
__version__ = "0.2.0"
@@ -0,0 +1,134 @@
1
+ """desk-mcp CLI — admin and diagnostic commands."""
2
+
3
+ import os
4
+ import shutil
5
+ import subprocess
6
+
7
+ import typer
8
+ from rich.console import Console
9
+ from rich.table import Table
10
+ from rich import print as rprint
11
+
12
# Root Typer application. Its name matches the `desk-mcp` console script
# declared under [project.scripts], so help output shows the real command.
app = typer.Typer(name="desk-mcp", help="desk-mcp — Desktop automation MCP")
# Shared Rich console used by the commands for styled terminal output.
console = Console()
14
+
15
+
16
@app.command()
def serve():
    """Start the MCP server (stdio transport for Claude Code)."""
    # Imported inside the command rather than at module import time.
    from desk_mcp import server

    server.serve()
21
+
22
+
23
@app.command()
def status():
    """Show environment and tool availability."""
    console.rule("[bold blue]desk-mcp status[/]")

    table = Table(show_header=True, header_style="bold cyan")
    table.add_column("Component", style="white")
    table.add_column("Status")
    table.add_column("Details")

    # Session type, as reported by this process's environment.
    session = os.environ.get("XDG_SESSION_TYPE", "unknown")
    display = os.environ.get("DISPLAY", "")
    wayland = os.environ.get("WAYLAND_DISPLAY", "")
    table.add_row(
        "Session",
        f"[green]{session}[/]",
        f"DISPLAY={display} WAYLAND={wayland}",
    )

    # xdotool: when installed, also ask it for the display resolution.
    if shutil.which("xdotool"):
        try:
            probe = subprocess.run(
                ["xdotool", "getdisplaygeometry"],
                capture_output=True, text=True, timeout=10,
            )
            res = probe.stdout.strip() if probe.returncode == 0 else "error"
        except (OSError, subprocess.TimeoutExpired):
            # A diagnostic command should report the problem, not crash on it.
            res = "error"
        table.add_row("xdotool", "[green]✓ installed[/]",
                      f"Resolution: {res}")
    else:
        table.add_row("xdotool", "[red]✗ missing[/]",
                      "sudo apt install xdotool")

    # python3-dbus + python3-gi (XDG portal screenshot backend).
    # The probe runs in the system interpreter; a missing /usr/bin/python3 or
    # a hung import is reported as "missing" instead of raising.
    try:
        probe = subprocess.run(
            ["/usr/bin/python3", "-c",
             "import dbus, dbus.mainloop.glib; from gi.repository import GLib; print('ok')"],
            capture_output=True, text=True, timeout=10,
        )
        portal_ok = probe.returncode == 0
    except (OSError, subprocess.TimeoutExpired):
        portal_ok = False
    table.add_row(
        "python3-gi / dbus",
        "[green]✓ available[/]" if portal_ok else "[red]✗ missing[/]",
        "XDG portal screenshot backend" if portal_ok else "sudo apt install python3-gi python3-dbus",
    )

    # ydotool (Wayland-native input)
    yd = shutil.which("ydotool")
    table.add_row(
        "ydotool",
        "[green]✓ installed[/]" if yd else "[dim]not installed[/]",
        "Wayland-native input (optional)" if not yd else "Available",
    )

    # wmctrl
    wm = shutil.which("wmctrl")
    table.add_row(
        "wmctrl",
        "[green]✓ installed[/]" if wm else "[dim]not installed[/]",
        "Window management (optional)",
    )

    # Pillow
    try:
        import PIL
        table.add_row("Pillow", f"[green]✓ {PIL.__version__}[/]",
                      "Image cropping for calibrated shots")
    except ImportError:
        table.add_row("Pillow", "[red]✗ missing[/]", "pip install Pillow")

    console.print(table)

    rprint("\n[bold]Screenshot backend:[/]")
    rprint(" [green]XDG Desktop Portal[/] via /usr/bin/python3 + python3-gi + dbus-python")
    rprint(" Requires: [cyan]sudo apt install python3-gi python3-dbus[/] (standard on Ubuntu)")
    rprint("\n[bold]Input simulation:[/] xdotool (XWayland — works for most apps)")
    rprint("[dim]For Wayland-native windows: coordinates work, window auto-detect may not.[/]")
98
+
99
+
100
@app.command()
def screenshot(
    output: str = typer.Option("/tmp/k_desktop/test.png", help="Output path"),
    window: str = typer.Option(None, help="Window name to capture"),
):
    """Test screenshot from the CLI."""
    from pathlib import Path

    from desk_mcp.server import _crop_image, _get_window_geometry, _take_screenshot

    dest = Path(output)
    dest.parent.mkdir(parents=True, exist_ok=True)

    console.print(f"[cyan]Taking screenshot → {dest}[/]")
    if not _take_screenshot(dest):
        # The capture goes through the XDG Desktop Portal helper, so point the
        # user at the packages that helper needs.
        rprint(
            "[red]Screenshot failed. Install the XDG portal backend:[/] "
            "sudo apt install python3-gi python3-dbus"
        )
        raise typer.Exit(1)

    if not window:
        rprint(f"[green]✓ Full screenshot → {dest}[/]")
        return

    geom = _get_window_geometry(window)
    if not geom:
        rprint(f"[yellow]Window '{window}' not found via xdotool — full screenshot saved.[/]")
        rprint(f"[green]✓ Full screenshot → {dest}[/]")
        return

    cropped = dest.with_stem(dest.stem + "_cropped")
    # _crop_image returns the source path, writing nothing, when the region
    # falls outside the image. Keep the full screenshot in that case instead
    # of deleting it and reporting a file that was never created.
    if _crop_image(dest, geom, cropped) != cropped:
        rprint(f"[yellow]Window '{window}' is outside the captured screen — full screenshot saved.[/]")
        rprint(f"[green]✓ Full screenshot → {dest}[/]")
        return

    dest.unlink()
    rprint(f"[green]✓ Window '{window}' captured → {cropped}[/]")
    rprint(f" Geometry: x={geom['x']} y={geom['y']} {geom['w']}×{geom['h']}")
131
+
132
+
133
# Allow running this module directly as a script.
if __name__ == "__main__":
    app()
@@ -0,0 +1,422 @@
1
+ """
2
+ desk-mcp server — Desktop automation tools for Claude Code.
3
+
4
+ Tools:
5
+ screenshot — Take a calibrated screenshot (full screen, window, or region)
6
+ get_windows — List all visible windows with geometry
7
+ get_screen — Screen info (resolution, session type)
8
+ click — Left/right/middle click at coordinates
9
+ double_click — Double-click at coordinates
10
+ move_mouse — Move mouse without clicking
11
+ type_text — Type text at current focus
12
+ key — Press a key combo (e.g. "ctrl+c", "Return", "super")
13
+ scroll — Scroll at coordinates
14
+
15
+ Screenshot backend:
16
+ XDG Desktop Portal via system python3 + dbus-python + GLib event loop.
17
+ Works natively on GNOME Wayland — no dialog, no user interaction required.
18
+ Requires: /usr/bin/python3 with python3-gi and python3-dbus (standard on Ubuntu).
19
+
20
+ Input simulation: xdotool (XWayland — covers X11 and XWayland apps).
21
+ For pure Wayland-native apps (e.g. WaveTerm): mouse position is still correct,
22
+ keyboard works, but window auto-detection via get_windows() may not see them.
23
+ """
24
+
25
+ import os
26
+ import shutil
27
+ import subprocess
28
+ import time
29
+ from datetime import datetime
30
+ from pathlib import Path
31
+ from typing import Optional
32
+
33
+ from fastmcp import FastMCP
34
+
35
+ # ── Init ──────────────────────────────────────────────────────────────────────
36
+
37
# Usage guidance passed to FastMCP as the server's `instructions`.
_SERVER_INSTRUCTIONS = (
    "Desktop automation MCP. "
    "Use screenshot() to see the screen — pass window_name to auto-crop to a "
    "specific window, or region dict {x,y,w,h} for a precise area. "
    "Use get_windows() to discover window coordinates. "
    "Use click/type_text/key for input simulation via xdotool."
)

# Server instance; the @mcp.tool() functions below register on it.
mcp = FastMCP(name="desk-mcp", instructions=_SERVER_INSTRUCTIONS)

# Directory that receives screenshots; override with K_DESKTOP_SHOT_DIR.
# Created at import time so the tools can write into it immediately.
SHOT_DIR = Path(os.getenv("K_DESKTOP_SHOT_DIR", "/tmp/k_desktop"))
SHOT_DIR.mkdir(parents=True, exist_ok=True)
49
+
50
+
51
+ # ── Helpers ───────────────────────────────────────────────────────────────────
52
+
53
def _run(cmd: list[str], timeout: int = 10) -> subprocess.CompletedProcess:
    """Run *cmd* without a shell and return the completed process.

    Standard output and standard error are captured and decoded as text.
    ``subprocess.TimeoutExpired`` propagates when the command runs longer
    than *timeout* seconds.
    """
    completed = subprocess.run(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        timeout=timeout,
    )
    return completed
55
+
56
+
57
def _ts() -> str:
    """Return the current local time as ``HHMMSS_mmm`` (millisecond precision)."""
    now = datetime.now()
    millis = now.microsecond // 1000
    return f"{now:%H%M%S}_{millis:03d}"
59
+
60
+
61
def _crop_image(src: Path, region: dict, dest: Path) -> Path:
    """Crop image to region {x, y, w, h} using Pillow.

    The region is clamped to the image bounds before cropping. Returns *dest*
    after saving the cropped image, or *src* (nothing written) when the
    clamped region is empty.
    """
    from PIL import Image as PILImage

    with PILImage.open(src) as image:
        left, top = region["x"], region["y"]
        right, bottom = left + region["w"], top + region["h"]
        # Clamp to image bounds
        left, top = max(0, left), max(0, top)
        right, bottom = min(image.width, right), min(image.height, bottom)
        if right <= left or bottom <= top:
            # Region is outside image bounds — return uncropped
            return src
        image.crop((left, top, right, bottom)).save(dest)
    return dest
77
+
78
+
79
def _get_window_geometry(window_name: str) -> Optional[dict]:
    """Find a window by name and return its geometry via xdotool.

    Searches by window name first and falls back to a class search. Returns
    ``{"x", "y", "w", "h"}`` for the last window id reported, or None when no
    window matches or its geometry cannot be read.
    """
    ids = ""
    for flag in ("--name", "--class"):
        found = _run(["xdotool", "search", flag, window_name])
        ids = found.stdout.strip()
        if found.returncode == 0 and ids:
            break
    else:
        # Neither search produced a window id.
        return None

    window_id = ids.splitlines()[-1]  # Take last (most recent) match
    geometry = _run(["xdotool", "getwindowgeometry", "--shell", window_id])
    if geometry.returncode != 0:
        return None

    # --shell output is KEY=value lines; every value is parsed as an integer.
    fields = {}
    for line in geometry.stdout.splitlines():
        if "=" not in line:
            continue
        key_part, _, value_part = line.partition("=")
        fields[key_part.strip()] = int(value_part.strip())

    try:
        return {
            "x": fields["X"],
            "y": fields["Y"],
            "w": fields["WIDTH"],
            "h": fields["HEIGHT"],
        }
    except KeyError:
        return None
100
+
101
+
102
# Helper program handed to `/usr/bin/python3 -c` by _take_screenshot().
# It asks org.freedesktop.portal.Screenshot for a non-interactive capture,
# waits up to 15 seconds for the Response signal, and prints the resulting
# URI on stdout (empty on failure). Errors go to stderr so stdout stays a
# bare URI while failures are still diagnosable.
_PORTAL_SCRIPT = """\
import sys, dbus, dbus.mainloop.glib
from gi.repository import GLib

dbus.mainloop.glib.DBusGMainLoop(set_as_default=True)
bus = dbus.SessionBus()
loop = GLib.MainLoop()
result = {}

def on_response(response, results, **kwargs):
    if response == 0:
        result['uri'] = str(results.get('uri', ''))
    loop.quit()

def on_timeout():
    loop.quit()
    return False

try:
    portal = bus.get_object('org.freedesktop.portal.Desktop', '/org/freedesktop/portal/desktop')
    portal_iface = dbus.Interface(portal, 'org.freedesktop.portal.Screenshot')
    options = dbus.Dictionary({'interactive': dbus.Boolean(False)}, signature='sv')
    request_path = str(portal_iface.Screenshot('', options))
    request_obj = bus.get_object('org.freedesktop.portal.Desktop', request_path)
    request_iface = dbus.Interface(request_obj, 'org.freedesktop.portal.Request')
    request_iface.connect_to_signal('Response', on_response)
    GLib.timeout_add_seconds(15, on_timeout)
    loop.run()
except Exception as exc:
    print(f'portal screenshot failed: {exc}', file=sys.stderr)

print(result.get('uri', ''), end='')
"""
135
+
136
+
137
def _file_uri_to_path(uri: str) -> Optional[Path]:
    """Convert a ``file://`` URI to a local path; return None for anything else."""
    from urllib.parse import unquote, urlparse

    if not uri.startswith("file://"):
        return None
    raw_path = urlparse(uri).path
    if not raw_path:
        return None
    # URIs percent-encode spaces and non-ASCII characters; decode them so the
    # result names the real file on disk.
    return Path(unquote(raw_path))


def _take_screenshot(dest: Path) -> bool:
    """Take a screenshot via XDG Desktop Portal (system python3 + dbus + GLib).

    Runs _PORTAL_SCRIPT under /usr/bin/python3, reads the screenshot URI it
    prints and copies that file to *dest*.

    Returns:
        True when the screenshot file was copied to *dest*. False when the
        helper could not be started, timed out, or did not yield an existing
        ``file://`` path.
    """
    try:
        r = subprocess.run(
            ["/usr/bin/python3", "-c", _PORTAL_SCRIPT],
            capture_output=True, text=True, timeout=20,
        )
    except (OSError, subprocess.TimeoutExpired):
        # Callers treat False as "screenshot failed"; keep that contract
        # instead of leaking a process-level exception.
        return False

    src = _file_uri_to_path(r.stdout.strip())
    if src is not None and src.exists():
        shutil.copy2(src, dest)
        return True
    return False
150
+
151
+
152
+ # ── Tools ─────────────────────────────────────────────────────────────────────
153
+
154
@mcp.tool()
def screenshot(
    window_name: Optional[str] = None,
    region: Optional[dict] = None,
) -> dict:
    """
    Take a screenshot and return the image.

    Args:
        window_name: Name (or partial name) of the window to capture.
            Automatically finds the window and crops to its bounds.
            Works for XWayland apps (Chromium, TickTick, Bitwarden, etc.).
            For Wayland-native apps, use region instead.
        region: Explicit crop region: {"x": int, "y": int, "w": int, "h": int}.
            Takes priority over window_name if both provided.

    Returns:
        Dict with "path" (absolute path to PNG file) and "geometry" info.
        A "warning" entry is added when the full screenshot is returned
        instead of the requested crop.
        Use the Read tool on the returned path to view the image.

    Raises:
        ValueError: If region is given but lacks one of x, y, w, h.
        RuntimeError: If the screenshot could not be taken.

    Notes:
        Uses XDG Desktop Portal via /usr/bin/python3 + dbus-python + GLib.
        Works natively on GNOME Wayland. No extra tools needed.
    """
    # Reject a malformed region before capturing, so a bad request does not
    # leave a stray full-screen file behind.
    if region:
        missing = [k for k in ("x", "y", "w", "h") if k not in region]
        if missing:
            raise ValueError(
                f"region is missing required key(s): {', '.join(missing)}"
            )

    ts = _ts()
    full_path = SHOT_DIR / f"full_{ts}.png"
    final_path = SHOT_DIR / f"shot_{ts}.png"

    if not _take_screenshot(full_path):
        raise RuntimeError(
            "Screenshot failed via XDG Desktop Portal.\n"
            "Ensure /usr/bin/python3 can import gi and dbus: "
            "sudo apt install python3-gi python3-dbus"
        )

    # Resolve crop region
    crop = None
    geom_used = None
    if region:
        crop = region
        geom_used = region
    elif window_name:
        geom = _get_window_geometry(window_name)
        if geom:
            pad = 4  # small margin around the window bounds
            crop = {
                "x": max(0, geom["x"] - pad),
                "y": max(0, geom["y"] - pad),
                "w": geom["w"] + pad * 2,
                "h": geom["h"] + pad * 2,
            }
            geom_used = geom
        # If window not found, return full screenshot (don't error)

    warning = None
    if crop:
        # _crop_image returns the source path, writing nothing, when the
        # region lies outside the image. Only drop the full capture once the
        # cropped file really exists.
        written = _crop_image(full_path, crop, final_path)
        if written == final_path:
            full_path.unlink(missing_ok=True)
        else:
            final_path = full_path
            warning = "Crop region is outside the screenshot — full screenshot returned"
    else:
        final_path = full_path

    result = {
        "path": str(final_path),
        "note": "Use the Read tool on 'path' to view this image",
    }
    if geom_used:
        result["geometry"] = geom_used
    if window_name and not geom_used:
        warning = f"Window '{window_name}' not found via xdotool — full screenshot returned"
    if warning:
        result["warning"] = warning
    return result
222
+
223
+
224
@mcp.tool()
def get_windows() -> list[dict]:
    """
    List all visible windows with their IDs, names, and screen geometry.

    Note: Only shows XWayland-accessible windows. Pure Wayland-native apps
    (e.g. WaveTerm running in native Wayland mode) may not appear here.
    Use screenshot() with a known region for those.

    Returns:
        List of dicts: {id, name, x, y, width, height}
    """
    listing = _run(["xdotool", "search", "--name", ""])
    if listing.returncode != 0:
        return []

    found = []
    for wid in listing.stdout.strip().splitlines():
        title_proc = _run(["xdotool", "getwindowname", wid])
        geom_proc = _run(["xdotool", "getwindowgeometry", "--shell", wid])
        if title_proc.returncode != 0:
            continue
        title = title_proc.stdout.strip()
        # Skip unnamed windows and mutter's guard window.
        if not title or title == "mutter guard window":
            continue

        # --shell output is KEY=value lines; absent keys default to 0 below.
        fields = {}
        for line in geom_proc.stdout.splitlines():
            if "=" not in line:
                continue
            key_part, _, value_part = line.partition("=")
            fields[key_part.strip()] = value_part.strip()

        entry = {"id": int(wid), "name": title}
        for out_key, shell_key in (
            ("x", "X"),
            ("y", "Y"),
            ("width", "WIDTH"),
            ("height", "HEIGHT"),
        ):
            entry[out_key] = int(fields.get(shell_key, 0))
        found.append(entry)
    return found
263
+
264
+
265
@mcp.tool()
def get_screen() -> dict:
    """
    Return screen information: resolution, session type, display.

    Returns:
        {width, height, session_type, display, wayland_display,
         gnome_screenshot_available}
        width and height are 0 when xdotool does not report a usable geometry.
    """
    w, h = 0, 0
    r = _run(["xdotool", "getdisplaygeometry"])
    if r.returncode == 0:
        parts = r.stdout.strip().split()
        if len(parts) == 2:
            try:
                w, h = int(parts[0]), int(parts[1])
            except ValueError:
                # Unexpected output: report an unknown size instead of failing.
                w, h = 0, 0
    return {
        "width": w,
        "height": h,
        "session_type": os.environ.get("XDG_SESSION_TYPE", "unknown"),
        "display": os.environ.get("DISPLAY", ""),
        "wayland_display": os.environ.get("WAYLAND_DISPLAY", ""),
        "gnome_screenshot_available": bool(shutil.which("gnome-screenshot")),
    }
287
+
288
+
289
@mcp.tool()
def click(x: int, y: int, button: str = "left") -> str:
    """
    Click at screen coordinates.

    Args:
        x: X coordinate in pixels
        y: Y coordinate in pixels
        button: "left" (default), "right", or "middle"

    Returns:
        Confirmation string naming the button that was actually pressed.

    Raises:
        RuntimeError: If xdotool exits with a non-zero status.
    """
    btn_map = {"left": "1", "middle": "2", "right": "3"}
    pressed = button.lower()
    if pressed not in btn_map:
        # Unknown names fall back to the left button; the confirmation below
        # reports that rather than echoing the unrecognised name.
        pressed = "left"

    steps = (
        ["xdotool", "mousemove", "--sync", str(x), str(y)],
        ["xdotool", "click", btn_map[pressed]],
    )
    for cmd in steps:
        done = _run(cmd)
        if done.returncode != 0:
            # Stop at the first failure so a click never lands at a stale
            # pointer position.
            raise RuntimeError(f"xdotool {cmd[1]} failed: {done.stderr.strip()}")
    return f"Clicked {pressed} at ({x}, {y})"
307
+
308
+
309
@mcp.tool()
def double_click(x: int, y: int) -> str:
    """
    Double-click at screen coordinates.

    Args:
        x: X coordinate in pixels
        y: Y coordinate in pixels

    Raises:
        RuntimeError: If xdotool exits with a non-zero status.
    """
    steps = (
        ["xdotool", "mousemove", "--sync", str(x), str(y)],
        ["xdotool", "click", "--repeat", "2", "--delay", "100", "1"],
    )
    for cmd in steps:
        done = _run(cmd)
        if done.returncode != 0:
            # Stop at the first failure so the clicks never land at a stale
            # pointer position.
            raise RuntimeError(f"xdotool {cmd[1]} failed: {done.stderr.strip()}")
    return f"Double-clicked at ({x}, {y})"
321
+
322
+
323
@mcp.tool()
def right_click(x: int, y: int) -> str:
    """
    Right-click at screen coordinates (opens context menu).

    Args:
        x: X coordinate in pixels
        y: Y coordinate in pixels

    Raises:
        RuntimeError: If xdotool exits with a non-zero status.
    """
    steps = (
        ["xdotool", "mousemove", "--sync", str(x), str(y)],
        ["xdotool", "click", "3"],
    )
    for cmd in steps:
        done = _run(cmd)
        if done.returncode != 0:
            # Stop at the first failure so the click never lands at a stale
            # pointer position.
            raise RuntimeError(f"xdotool {cmd[1]} failed: {done.stderr.strip()}")
    return f"Right-clicked at ({x}, {y})"
335
+
336
+
337
@mcp.tool()
def move_mouse(x: int, y: int) -> str:
    """
    Move mouse to coordinates without clicking.

    Args:
        x: X coordinate in pixels
        y: Y coordinate in pixels

    Raises:
        RuntimeError: If xdotool exits with a non-zero status.
    """
    done = _run(["xdotool", "mousemove", str(x), str(y)])
    if done.returncode != 0:
        # Do not confirm a move that xdotool reported as failed.
        raise RuntimeError(f"xdotool mousemove failed: {done.stderr.strip()}")
    return f"Mouse moved to ({x}, {y})"
348
+
349
+
350
@mcp.tool()
def type_text(text: str, delay_ms: int = 12) -> str:
    """
    Type text at the current keyboard focus.

    Args:
        text: Text to type
        delay_ms: Delay between keystrokes in ms (default 12 — natural speed)

    Returns:
        Confirmation string.

    Raises:
        RuntimeError: If xdotool exits with a non-zero status.

    Note:
        For special characters or passwords, prefer key() with individual keys.
        For apps running natively on Wayland, focus the window first with a click().
    """
    # Typing takes roughly len(text) * delay_ms, which can exceed _run's
    # default 10 s limit for long inputs. Allow twice the expected duration
    # on top of the usual 10 seconds.
    budget = 10 + (len(text) * max(delay_ms, 0)) // 500
    done = _run(
        ["xdotool", "type", "--delay", str(delay_ms), "--", text],
        timeout=budget,
    )
    if done.returncode != 0:
        raise RuntimeError(f"xdotool type failed: {done.stderr.strip()}")
    preview = text[:40] + ("..." if len(text) > 40 else "")
    return f"Typed: {repr(preview)}"
369
+
370
+
371
@mcp.tool()
def key(combo: str) -> str:
    """
    Press a key or key combination.

    Args:
        combo: Key combo string. Examples:
            "Return" — Enter key
            "ctrl+c" — Copy
            "ctrl+v" — Paste
            "ctrl+shift+t" — New tab (in many apps)
            "super" — Super/Windows key
            "alt+F4" — Close window
            "ctrl+alt+t" — Open terminal (GNOME default)
            "Escape" — Escape
            "Tab" — Tab
            "BackSpace" — Backspace
            "ctrl+a" — Select all

    Returns:
        Confirmation string.

    Raises:
        RuntimeError: If xdotool exits with a non-zero status.
    """
    done = _run(["xdotool", "key", "--clearmodifiers", combo])
    if done.returncode != 0:
        # Do not confirm a key press that xdotool reported as failed.
        raise RuntimeError(f"xdotool key failed: {done.stderr.strip()}")
    return f"Key pressed: {combo}"
395
+
396
+
397
@mcp.tool()
def scroll(x: int, y: int, direction: str = "down", clicks: int = 3) -> str:
    """
    Scroll at screen coordinates.

    Args:
        x: X coordinate
        y: Y coordinate
        direction: "up", "down", "left", or "right"
        clicks: Number of scroll ticks (default 3)

    Returns:
        Confirmation string naming the direction that was actually scrolled.

    Raises:
        RuntimeError: If xdotool exits with a non-zero status.
    """
    # Scrolling is sent as clicks of mouse buttons 4-7.
    btn_map = {"up": "4", "down": "5", "left": "6", "right": "7"}
    scrolled = direction.lower()
    if scrolled not in btn_map:
        # Unknown names fall back to scrolling down; the confirmation below
        # reports that rather than echoing the unrecognised name.
        scrolled = "down"

    moved = _run(["xdotool", "mousemove", "--sync", str(x), str(y)])
    if moved.returncode != 0:
        # Never scroll at a stale pointer position.
        raise RuntimeError(f"xdotool mousemove failed: {moved.stderr.strip()}")

    for _ in range(clicks):
        tick = _run(["xdotool", "click", btn_map[scrolled]])
        if tick.returncode != 0:
            raise RuntimeError(f"xdotool click failed: {tick.stderr.strip()}")
    return f"Scrolled {scrolled} {clicks}x at ({x}, {y})"
417
+
418
+
419
+ # ── Entry point ───────────────────────────────────────────────────────────────
420
+
421
def serve():
    """Run the MCP server over the stdio transport."""
    mcp.run(transport="stdio")