realhands 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- realhands-0.1.0/LICENSE +21 -0
- realhands-0.1.0/PKG-INFO +233 -0
- realhands-0.1.0/README.md +201 -0
- realhands-0.1.0/pyproject.toml +45 -0
- realhands-0.1.0/setup.cfg +4 -0
- realhands-0.1.0/src/realhands/__init__.py +3 -0
- realhands-0.1.0/src/realhands/config.py +44 -0
- realhands-0.1.0/src/realhands/input.py +237 -0
- realhands-0.1.0/src/realhands/safety.py +174 -0
- realhands-0.1.0/src/realhands/screen.py +151 -0
- realhands-0.1.0/src/realhands/server.py +219 -0
- realhands-0.1.0/src/realhands.egg-info/PKG-INFO +233 -0
- realhands-0.1.0/src/realhands.egg-info/SOURCES.txt +15 -0
- realhands-0.1.0/src/realhands.egg-info/dependency_links.txt +1 -0
- realhands-0.1.0/src/realhands.egg-info/entry_points.txt +2 -0
- realhands-0.1.0/src/realhands.egg-info/requires.txt +12 -0
- realhands-0.1.0/src/realhands.egg-info/top_level.txt +1 -0
realhands-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Kanishka Gunawardana
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
realhands-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: realhands
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: MCP server that lets Claude operate the REAL desktop (mouse, keyboard, screen) like a human — works in your own logged-in Chrome and any app
|
|
5
|
+
Author: kanishka089
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/kanishka089/computer-use-mcp
|
|
8
|
+
Project-URL: Issues, https://github.com/kanishka089/computer-use-mcp/issues
|
|
9
|
+
Keywords: mcp,claude,computer-use,automation,desktop,rpa,realhands
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Operating System :: Microsoft :: Windows
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Requires-Python: >=3.10
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
License-File: LICENSE
|
|
21
|
+
Requires-Dist: mcp>=1.2
|
|
22
|
+
Requires-Dist: pyautogui>=0.9.54
|
|
23
|
+
Requires-Dist: mss>=9.0
|
|
24
|
+
Requires-Dist: pillow>=10.0
|
|
25
|
+
Requires-Dist: pynput>=1.7
|
|
26
|
+
Requires-Dist: keyboard>=0.13.5
|
|
27
|
+
Requires-Dist: pyperclip>=1.8
|
|
28
|
+
Requires-Dist: python-dotenv>=1.0
|
|
29
|
+
Requires-Dist: pygetwindow>=0.0.9
|
|
30
|
+
Requires-Dist: pywin32>=306; sys_platform == "win32"
|
|
31
|
+
Dynamic: license-file
|
|
32
|
+
|
|
33
|
+
<!-- mcp-name: io.github.kanishka089/realhands -->
|
|
34
|
+
|
|
35
|
+
# computer-use-mcp ("realhands")
|
|
36
|
+
|
|
37
|
+
An MCP server that lets **Claude operate your real computer** the way a human does —
|
|
38
|
+
moving the **actual mouse**, clicking, typing, and reading the **actual screen**.
|
|
39
|
+
|
|
40
|
+
Unlike OpenAI Operator, browser-use, or Playwright agents (which spin up a separate,
|
|
41
|
+
isolated, logged-out Chrome), this drives the **physical OS cursor and keyboard**. So it
|
|
42
|
+
works in **your own Chrome with your own logged-in sessions** — and in every other app —
|
|
43
|
+
because it's just a human at the keyboard, as far as any website can tell.
|
|
44
|
+
|
|
45
|
+
**Status: LIVE and battle-tested.** Registered with Claude Code as the user-scope MCP
|
|
46
|
+
**`realhands`** (tool `mcp__realhands__computer`) and ✓Connected since 2026-06-02.
|
|
47
|
+
On 2026-06-09 it drove the user's real, logged-in Chrome through a **complete Google
|
|
48
|
+
Play Console deployment** (app upload, release notes, submission) end-to-end.
|
|
49
|
+
|
|
50
|
+
> The server is registered as `realhands` rather than `computer-use` because the name
|
|
51
|
+
> "computer-use" is reserved in Claude Code.
|
|
52
|
+
|
|
53
|
+
## How it works
|
|
54
|
+
|
|
55
|
+
Claude (Desktop or Code) is the agent loop. You type a task; Claude calls the single
|
|
56
|
+
`computer` tool in a see → think → act cycle:
|
|
57
|
+
|
|
58
|
+
> **See** — `screenshot` returns the real screen (downscaled to ~1280px for grounding accuracy)
|
|
59
|
+
> → **Think** — Claude picks the next action + pixel coordinates
|
|
60
|
+
> → **Act** — the server moves the real mouse / types on the real keyboard
|
|
61
|
+
> → a fresh screenshot comes back automatically after every action, and it repeats.
|
|
62
|
+
|
|
63
|
+
Two Windows-specific details make clicks land accurately (`src/realhands/screen.py`):
|
|
64
|
+
|
|
65
|
+
- **DPI awareness** — `SetProcessDpiAwareness(2)` is set at import time so screenshot
|
|
66
|
+
pixels == pyautogui cursor coordinates even under display scaling (125% / 150% / …).
|
|
67
|
+
- **Stateless coordinate scaling** — screenshots are downscaled (LANCZOS) to at most
|
|
68
|
+
`COMPUTER_USE_MAX_DIM` on the longest side before sending; incoming click coordinates
|
|
69
|
+
are scaled back up to real pixels. The scale factor is a pure function of monitor
|
|
70
|
+
geometry + `MAX_DIM`, so mapping never depends on which screenshot ran last.
|
|
71
|
+
Coordinates are clamped inside the target monitor so a stray click can't fly off-screen.
|
|
72
|
+
|
|
73
|
+
**Multi-monitor:** every call takes an optional `monitor` index (1 = primary, 2.. =
|
|
74
|
+
others, 0 = the whole virtual desktop). `action="monitors"` enumerates the setup.
|
|
75
|
+
Origins may be negative for screens left/above the primary — `to_real()` handles the
|
|
76
|
+
offset. Use the **same** monitor for a click as for the screenshot you're clicking on.
|
|
77
|
+
|
|
78
|
+
## Architecture
|
|
79
|
+
|
|
80
|
+
```
|
|
81
|
+
src/realhands/
|
|
82
|
+
server.py FastMCP server "computer-use"; the single `computer` tool (action enum
|
|
83
|
+
modeled on Anthropic's reference computer_20250124 tool); returns
|
|
84
|
+
status text + a fresh screenshot after every action
|
|
85
|
+
screen.py DPI awareness, mss capture, downscale, model-space -> real-pixel mapping
|
|
86
|
+
input.py pyautogui mouse/keyboard execution; xdotool-style key-name translation
|
|
87
|
+
(Return, Page_Down, ctrl+a, super, ...); clipboard-paste fast path for
|
|
88
|
+
long/Unicode/multiline typing (preserves your existing clipboard);
|
|
89
|
+
activate_window via win32 AttachThreadInput
|
|
90
|
+
safety.py kill switches + lazy arm / stand-down lifecycle
|
|
91
|
+
config.py .env-driven configuration (all defaults are sensible; .env is optional)
|
|
92
|
+
install.py one-shot installer (lives at the repo root, not inside the package): venv, deps, .env, Claude Desktop registration
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
Stack: Python 3.10/3.11 · `mcp` (FastMCP, stdio) · `pyautogui` · `mss` · `pillow` ·
|
|
96
|
+
`pynput` · `keyboard` · `pyperclip` · `python-dotenv` — plus `pygetwindow` and `pywin32`
|
|
97
|
+
for `activate_window`.
|
|
98
|
+
|
|
99
|
+
## The `computer` tool
|
|
100
|
+
|
|
101
|
+
A single tool with an `action` parameter:
|
|
102
|
+
|
|
103
|
+
| Action | What it does |
|
|
104
|
+
|---|---|
|
|
105
|
+
| `screenshot` | Capture the screen (always start a task with this) |
|
|
106
|
+
| `cursor_position` | Report the real mouse position |
|
|
107
|
+
| `monitors` | List detected monitors (for multi-screen setups) |
|
|
108
|
+
| `mouse_move` | Glide the cursor to `coordinate` |
|
|
109
|
+
| `left_click` / `right_click` / `middle_click` / `double_click` / `triple_click` | Click at `coordinate` (or current position) |
|
|
110
|
+
| `left_click_drag` | Drag from `text="x1,y1"` to `coordinate=[x2,y2]` |
|
|
111
|
+
| `left_mouse_down` / `left_mouse_up` | Press / release the left button |
|
|
112
|
+
| `scroll` | Scroll at `coordinate` (`scroll_direction` + `scroll_amount` notches) |
|
|
113
|
+
| `type` | Type `text` (clipboard-paste path for long/Unicode/multiline) |
|
|
114
|
+
| `key` | Press a key or chord — `"Return"`, `"ctrl+s"`, `"alt+Tab"` |
|
|
115
|
+
| `hold_key` | Hold keys for `duration` seconds |
|
|
116
|
+
| `activate_window` | Bring an app to the front by title substring (beats Windows' foreground-lock; far more reliable than clicking the taskbar) |
|
|
117
|
+
| `wait` | Sleep `duration` seconds, then screenshot |
|
|
118
|
+
| `stop` | Stand down: close the STOP overlay + release the panic hotkey (call as the final action) |
|
|
119
|
+
|
|
120
|
+
Coordinates are in the pixel space of the most recent screenshot; its size is reported
|
|
121
|
+
with every capture. After every non-screenshot action the tool waits ~0.4s for the UI
|
|
122
|
+
to settle and returns a fresh screenshot.
|
|
123
|
+
|
|
124
|
+
## Safety — it controls your REAL machine
|
|
125
|
+
|
|
126
|
+
This is **fully autonomous**: it does not ask before each action. Three independent
|
|
127
|
+
kill switches (`src/realhands/safety.py`):
|
|
128
|
+
|
|
129
|
+
1. **Fail-safe corner** — slam the mouse into the **top-left corner** → pyautogui raises
|
|
130
|
+
`FailSafeException` and the action aborts instantly.
|
|
131
|
+
2. **Panic hotkey** — **Ctrl+Alt+Q** (configurable) → hard-kills the server process
|
|
132
|
+
(`os._exit(1)`).
|
|
133
|
+
3. **STOP overlay** — an always-on-top window (top-right) showing the current action,
|
|
134
|
+
with a big red **■ STOP AGENT** button that also hard-kills the process.
|
|
135
|
+
|
|
136
|
+
**Lazy arm / stand-down:** the overlay and the global panic hotkey are armed lazily on
|
|
137
|
+
the **first action** of a task, not at server startup — idle sessions show nothing and
|
|
138
|
+
grab no hotkeys. They stand down again when the agent calls `action="stop"`, or
|
|
139
|
+
automatically after `COMPUTER_USE_IDLE_STOP` seconds (default 30) of inactivity. The
|
|
140
|
+
lightweight stdio process stays connected so the next task is instant. Everything
|
|
141
|
+
re-arms automatically on the next action.
|
|
142
|
+
|
|
143
|
+
Pacing also helps you stay in control: every action is followed by a configurable pause
|
|
144
|
+
(`COMPUTER_USE_PAUSE`) and the cursor glides rather than teleports
|
|
145
|
+
(`COMPUTER_USE_MOVE_DURATION`), so you can watch and interrupt.
|
|
146
|
+
|
|
147
|
+
**Don't leave it unsupervised on anything that can spend money, send messages, or
|
|
148
|
+
delete data.**
|
|
149
|
+
|
|
150
|
+
## Install
|
|
151
|
+
|
|
152
|
+
Requires **Python 3.10 or 3.11** (3.13+ untested; avoid the 3.14 beta).
|
|
153
|
+
|
|
154
|
+
### From PyPI
|
|
155
|
+
|
|
156
|
+
```powershell
|
|
157
|
+
pip install realhands
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
This installs the `realhands` console script and the importable `realhands`
|
|
161
|
+
package. Run the server with either `realhands` or `python -m realhands.server`.
|
|
162
|
+
|
|
163
|
+
### From source (with Claude Desktop registration)
|
|
164
|
+
|
|
165
|
+
```powershell
|
|
166
|
+
git clone https://github.com/kanishka089/computer-use-mcp
|
|
167
|
+
cd computer-use-mcp
|
|
168
|
+
py -3.10 install.py
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
This creates `.venv/`, installs the package + deps (editable), copies `.env.example` to
|
|
172
|
+
`.env` if missing, and registers the server in Claude Desktop's config (backing up any
|
|
173
|
+
existing config). **Restart Claude Desktop**, then look for the `computer-use` tool.
|
|
174
|
+
|
|
175
|
+
### Claude Code
|
|
176
|
+
|
|
177
|
+
Register it as a **user-scope** stdio server named `realhands` (pointing at the Python
|
|
178
|
+
that has the package installed):
|
|
179
|
+
|
|
180
|
+
```powershell
|
|
181
|
+
claude mcp add realhands --scope user -- python -m realhands.server
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
The tool then appears as `mcp__realhands__computer` in every project.
|
|
185
|
+
|
|
186
|
+
## Use
|
|
187
|
+
|
|
188
|
+
Just ask. For example:
|
|
189
|
+
|
|
190
|
+
> *Take a screenshot, open Chrome, go to YouTube, and search for "lofi".*
|
|
191
|
+
|
|
192
|
+
Watch your real cursor move and your logged-in Chrome respond. Real-world proof: it has
|
|
193
|
+
autonomously completed a full Google Play Console release flow in the user's own
|
|
194
|
+
signed-in Chrome session.
|
|
195
|
+
|
|
196
|
+
## Configuration (`.env`, optional — defaults are fine)
|
|
197
|
+
|
|
198
|
+
| Var | Default | Meaning |
|
|
199
|
+
|-----|---------|---------|
|
|
200
|
+
| `COMPUTER_USE_MAX_DIM` | `1280` | Longest screenshot side sent to Claude (sweet spot for accuracy + token cost) |
|
|
201
|
+
| `COMPUTER_USE_MONITOR` | `1` | Default monitor (1 = primary, 2.. = others, 0 = all screens); overridable per call |
|
|
202
|
+
| `COMPUTER_USE_IMAGE_FORMAT` | `png` | `png` (crisp text) or `jpeg` (cheaper tokens) |
|
|
203
|
+
| `COMPUTER_USE_PAUSE` | `0.15` | Delay after each pyautogui action (interruptibility) |
|
|
204
|
+
| `COMPUTER_USE_PANIC_HOTKEY` | `ctrl+alt+q` | Global hard-stop hotkey |
|
|
205
|
+
| `COMPUTER_USE_OVERLAY` | `1` | Show the STOP overlay window |
|
|
206
|
+
| `COMPUTER_USE_MOVE_DURATION` | `0.4` | Cursor glide time (human-like movement) |
|
|
207
|
+
| `COMPUTER_USE_IDLE_STOP` | `30` | Auto stand-down after this many idle seconds (0 = never) |
|
|
208
|
+
|
|
209
|
+
## Known gotchas
|
|
210
|
+
|
|
211
|
+
- **MCP connection drops when the agent idles between turns.** The stdio connection to
|
|
212
|
+
`realhands` can silently die while Claude is thinking/waiting between turns. Fix: issue
|
|
213
|
+
a `screenshot` action — it silently reconnects. Importantly, an action that "failed"
|
|
214
|
+
with *Connection closed* **often still executed** on the real machine — take a
|
|
215
|
+
screenshot and check the actual screen state before retrying, or you may double-click /
|
|
216
|
+
double-submit.
|
|
217
|
+
- **Click coordinates must match the screenshot's monitor.** If you screenshot
|
|
218
|
+
`monitor=2` and then click without passing `monitor=2`, the click lands on the default monitor (the primary, unless `COMPUTER_USE_MONITOR` says otherwise).
|
|
219
|
+
- **`activate_window` beats the taskbar.** Windows' foreground-lock makes taskbar clicks
|
|
220
|
+
unreliable (the icon just flashes). `activate_window` uses `AttachThreadInput` +
|
|
221
|
+
z-order toggling + a minimize/restore fallback, so prefer it for app switching.
|
|
222
|
+
- **Don't run with Python 3.13/3.14.** Tested on 3.10/3.11 only; the installer warns.
|
|
223
|
+
- **Typing long/Unicode text uses the clipboard.** Your clipboard is saved and restored,
|
|
224
|
+
but anything watching the clipboard will see the pasted text momentarily.
|
|
225
|
+
|
|
226
|
+
## Self-test
|
|
227
|
+
|
|
228
|
+
```powershell
|
|
229
|
+
python -m realhands.screen
|
|
230
|
+
```
|
|
231
|
+
|
|
232
|
+
Captures the screen, prints real vs. sent dimensions and the scale factor, writes
|
|
233
|
+
`test_capture.png`, and runs a coordinate round-trip check (center + both corners).
|
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
<!-- mcp-name: io.github.kanishka089/realhands -->
|
|
2
|
+
|
|
3
|
+
# computer-use-mcp ("realhands")
|
|
4
|
+
|
|
5
|
+
An MCP server that lets **Claude operate your real computer** the way a human does —
|
|
6
|
+
moving the **actual mouse**, clicking, typing, and reading the **actual screen**.
|
|
7
|
+
|
|
8
|
+
Unlike OpenAI Operator, browser-use, or Playwright agents (which spin up a separate,
|
|
9
|
+
isolated, logged-out Chrome), this drives the **physical OS cursor and keyboard**. So it
|
|
10
|
+
works in **your own Chrome with your own logged-in sessions** — and in every other app —
|
|
11
|
+
because it's just a human at the keyboard, as far as any website can tell.
|
|
12
|
+
|
|
13
|
+
**Status: LIVE and battle-tested.** Registered with Claude Code as the user-scope MCP
|
|
14
|
+
**`realhands`** (tool `mcp__realhands__computer`) and ✓Connected since 2026-06-02.
|
|
15
|
+
On 2026-06-09 it drove the user's real, logged-in Chrome through a **complete Google
|
|
16
|
+
Play Console deployment** (app upload, release notes, submission) end-to-end.
|
|
17
|
+
|
|
18
|
+
> The server is registered as `realhands` rather than `computer-use` because the name
|
|
19
|
+
> "computer-use" is reserved in Claude Code.
|
|
20
|
+
|
|
21
|
+
## How it works
|
|
22
|
+
|
|
23
|
+
Claude (Desktop or Code) is the agent loop. You type a task; Claude calls the single
|
|
24
|
+
`computer` tool in a see → think → act cycle:
|
|
25
|
+
|
|
26
|
+
> **See** — `screenshot` returns the real screen (downscaled to ~1280px for grounding accuracy)
|
|
27
|
+
> → **Think** — Claude picks the next action + pixel coordinates
|
|
28
|
+
> → **Act** — the server moves the real mouse / types on the real keyboard
|
|
29
|
+
> → a fresh screenshot comes back automatically after every action, and it repeats.
|
|
30
|
+
|
|
31
|
+
Two Windows-specific details make clicks land accurately (`src/realhands/screen.py`):
|
|
32
|
+
|
|
33
|
+
- **DPI awareness** — `SetProcessDpiAwareness(2)` is set at import time so screenshot
|
|
34
|
+
pixels == pyautogui cursor coordinates even under display scaling (125% / 150% / …).
|
|
35
|
+
- **Stateless coordinate scaling** — screenshots are downscaled (LANCZOS) to at most
|
|
36
|
+
`COMPUTER_USE_MAX_DIM` on the longest side before sending; incoming click coordinates
|
|
37
|
+
are scaled back up to real pixels. The scale factor is a pure function of monitor
|
|
38
|
+
geometry + `MAX_DIM`, so mapping never depends on which screenshot ran last.
|
|
39
|
+
Coordinates are clamped inside the target monitor so a stray click can't fly off-screen.
|
|
40
|
+
|
|
41
|
+
**Multi-monitor:** every call takes an optional `monitor` index (1 = primary, 2.. =
|
|
42
|
+
others, 0 = the whole virtual desktop). `action="monitors"` enumerates the setup.
|
|
43
|
+
Origins may be negative for screens left/above the primary — `to_real()` handles the
|
|
44
|
+
offset. Use the **same** monitor for a click as for the screenshot you're clicking on.
|
|
45
|
+
|
|
46
|
+
## Architecture
|
|
47
|
+
|
|
48
|
+
```
|
|
49
|
+
src/realhands/
|
|
50
|
+
server.py FastMCP server "computer-use"; the single `computer` tool (action enum
|
|
51
|
+
modeled on Anthropic's reference computer_20250124 tool); returns
|
|
52
|
+
status text + a fresh screenshot after every action
|
|
53
|
+
screen.py DPI awareness, mss capture, downscale, model-space -> real-pixel mapping
|
|
54
|
+
input.py pyautogui mouse/keyboard execution; xdotool-style key-name translation
|
|
55
|
+
(Return, Page_Down, ctrl+a, super, ...); clipboard-paste fast path for
|
|
56
|
+
long/Unicode/multiline typing (preserves your existing clipboard);
|
|
57
|
+
activate_window via win32 AttachThreadInput
|
|
58
|
+
safety.py kill switches + lazy arm / stand-down lifecycle
|
|
59
|
+
config.py .env-driven configuration (all defaults are sensible; .env is optional)
|
|
60
|
+
install.py one-shot installer (lives at the repo root, not inside the package): venv, deps, .env, Claude Desktop registration
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
Stack: Python 3.10/3.11 · `mcp` (FastMCP, stdio) · `pyautogui` · `mss` · `pillow` ·
|
|
64
|
+
`pynput` · `keyboard` · `pyperclip` · `python-dotenv` — plus `pygetwindow` and `pywin32`
|
|
65
|
+
for `activate_window`.
|
|
66
|
+
|
|
67
|
+
## The `computer` tool
|
|
68
|
+
|
|
69
|
+
A single tool with an `action` parameter:
|
|
70
|
+
|
|
71
|
+
| Action | What it does |
|
|
72
|
+
|---|---|
|
|
73
|
+
| `screenshot` | Capture the screen (always start a task with this) |
|
|
74
|
+
| `cursor_position` | Report the real mouse position |
|
|
75
|
+
| `monitors` | List detected monitors (for multi-screen setups) |
|
|
76
|
+
| `mouse_move` | Glide the cursor to `coordinate` |
|
|
77
|
+
| `left_click` / `right_click` / `middle_click` / `double_click` / `triple_click` | Click at `coordinate` (or current position) |
|
|
78
|
+
| `left_click_drag` | Drag from `text="x1,y1"` to `coordinate=[x2,y2]` |
|
|
79
|
+
| `left_mouse_down` / `left_mouse_up` | Press / release the left button |
|
|
80
|
+
| `scroll` | Scroll at `coordinate` (`scroll_direction` + `scroll_amount` notches) |
|
|
81
|
+
| `type` | Type `text` (clipboard-paste path for long/Unicode/multiline) |
|
|
82
|
+
| `key` | Press a key or chord — `"Return"`, `"ctrl+s"`, `"alt+Tab"` |
|
|
83
|
+
| `hold_key` | Hold keys for `duration` seconds |
|
|
84
|
+
| `activate_window` | Bring an app to the front by title substring (beats Windows' foreground-lock; far more reliable than clicking the taskbar) |
|
|
85
|
+
| `wait` | Sleep `duration` seconds, then screenshot |
|
|
86
|
+
| `stop` | Stand down: close the STOP overlay + release the panic hotkey (call as the final action) |
|
|
87
|
+
|
|
88
|
+
Coordinates are in the pixel space of the most recent screenshot; its size is reported
|
|
89
|
+
with every capture. After every non-screenshot action the tool waits ~0.4s for the UI
|
|
90
|
+
to settle and returns a fresh screenshot.
|
|
91
|
+
|
|
92
|
+
## Safety — it controls your REAL machine
|
|
93
|
+
|
|
94
|
+
This is **fully autonomous**: it does not ask before each action. Three independent
|
|
95
|
+
kill switches (`src/realhands/safety.py`):
|
|
96
|
+
|
|
97
|
+
1. **Fail-safe corner** — slam the mouse into the **top-left corner** → pyautogui raises
|
|
98
|
+
`FailSafeException` and the action aborts instantly.
|
|
99
|
+
2. **Panic hotkey** — **Ctrl+Alt+Q** (configurable) → hard-kills the server process
|
|
100
|
+
(`os._exit(1)`).
|
|
101
|
+
3. **STOP overlay** — an always-on-top window (top-right) showing the current action,
|
|
102
|
+
with a big red **■ STOP AGENT** button that also hard-kills the process.
|
|
103
|
+
|
|
104
|
+
**Lazy arm / stand-down:** the overlay and the global panic hotkey are armed lazily on
|
|
105
|
+
the **first action** of a task, not at server startup — idle sessions show nothing and
|
|
106
|
+
grab no hotkeys. They stand down again when the agent calls `action="stop"`, or
|
|
107
|
+
automatically after `COMPUTER_USE_IDLE_STOP` seconds (default 30) of inactivity. The
|
|
108
|
+
lightweight stdio process stays connected so the next task is instant. Everything
|
|
109
|
+
re-arms automatically on the next action.
|
|
110
|
+
|
|
111
|
+
Pacing also helps you stay in control: every action is followed by a configurable pause
|
|
112
|
+
(`COMPUTER_USE_PAUSE`) and the cursor glides rather than teleports
|
|
113
|
+
(`COMPUTER_USE_MOVE_DURATION`), so you can watch and interrupt.
|
|
114
|
+
|
|
115
|
+
**Don't leave it unsupervised on anything that can spend money, send messages, or
|
|
116
|
+
delete data.**
|
|
117
|
+
|
|
118
|
+
## Install
|
|
119
|
+
|
|
120
|
+
Requires **Python 3.10 or 3.11** (3.13+ untested; avoid the 3.14 beta).
|
|
121
|
+
|
|
122
|
+
### From PyPI
|
|
123
|
+
|
|
124
|
+
```powershell
|
|
125
|
+
pip install realhands
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
This installs the `realhands` console script and the importable `realhands`
|
|
129
|
+
package. Run the server with either `realhands` or `python -m realhands.server`.
|
|
130
|
+
|
|
131
|
+
### From source (with Claude Desktop registration)
|
|
132
|
+
|
|
133
|
+
```powershell
|
|
134
|
+
git clone https://github.com/kanishka089/computer-use-mcp
|
|
135
|
+
cd computer-use-mcp
|
|
136
|
+
py -3.10 install.py
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
This creates `.venv/`, installs the package + deps (editable), copies `.env.example` to
|
|
140
|
+
`.env` if missing, and registers the server in Claude Desktop's config (backing up any
|
|
141
|
+
existing config). **Restart Claude Desktop**, then look for the `computer-use` tool.
|
|
142
|
+
|
|
143
|
+
### Claude Code
|
|
144
|
+
|
|
145
|
+
Register it as a **user-scope** stdio server named `realhands` (pointing at the Python
|
|
146
|
+
that has the package installed):
|
|
147
|
+
|
|
148
|
+
```powershell
|
|
149
|
+
claude mcp add realhands --scope user -- python -m realhands.server
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
The tool then appears as `mcp__realhands__computer` in every project.
|
|
153
|
+
|
|
154
|
+
## Use
|
|
155
|
+
|
|
156
|
+
Just ask. For example:
|
|
157
|
+
|
|
158
|
+
> *Take a screenshot, open Chrome, go to YouTube, and search for "lofi".*
|
|
159
|
+
|
|
160
|
+
Watch your real cursor move and your logged-in Chrome respond. Real-world proof: it has
|
|
161
|
+
autonomously completed a full Google Play Console release flow in the user's own
|
|
162
|
+
signed-in Chrome session.
|
|
163
|
+
|
|
164
|
+
## Configuration (`.env`, optional — defaults are fine)
|
|
165
|
+
|
|
166
|
+
| Var | Default | Meaning |
|
|
167
|
+
|-----|---------|---------|
|
|
168
|
+
| `COMPUTER_USE_MAX_DIM` | `1280` | Longest screenshot side sent to Claude (sweet spot for accuracy + token cost) |
|
|
169
|
+
| `COMPUTER_USE_MONITOR` | `1` | Default monitor (1 = primary, 2.. = others, 0 = all screens); overridable per call |
|
|
170
|
+
| `COMPUTER_USE_IMAGE_FORMAT` | `png` | `png` (crisp text) or `jpeg` (cheaper tokens) |
|
|
171
|
+
| `COMPUTER_USE_PAUSE` | `0.15` | Delay after each pyautogui action (interruptibility) |
|
|
172
|
+
| `COMPUTER_USE_PANIC_HOTKEY` | `ctrl+alt+q` | Global hard-stop hotkey |
|
|
173
|
+
| `COMPUTER_USE_OVERLAY` | `1` | Show the STOP overlay window |
|
|
174
|
+
| `COMPUTER_USE_MOVE_DURATION` | `0.4` | Cursor glide time (human-like movement) |
|
|
175
|
+
| `COMPUTER_USE_IDLE_STOP` | `30` | Auto stand-down after this many idle seconds (0 = never) |
|
|
176
|
+
|
|
177
|
+
## Known gotchas
|
|
178
|
+
|
|
179
|
+
- **MCP connection drops when the agent idles between turns.** The stdio connection to
|
|
180
|
+
`realhands` can silently die while Claude is thinking/waiting between turns. Fix: issue
|
|
181
|
+
a `screenshot` action — it silently reconnects. Importantly, an action that "failed"
|
|
182
|
+
with *Connection closed* **often still executed** on the real machine — take a
|
|
183
|
+
screenshot and check the actual screen state before retrying, or you may double-click /
|
|
184
|
+
double-submit.
|
|
185
|
+
- **Click coordinates must match the screenshot's monitor.** If you screenshot
|
|
186
|
+
`monitor=2` and then click without passing `monitor=2`, the click lands on the default monitor (the primary, unless `COMPUTER_USE_MONITOR` says otherwise).
|
|
187
|
+
- **`activate_window` beats the taskbar.** Windows' foreground-lock makes taskbar clicks
|
|
188
|
+
unreliable (the icon just flashes). `activate_window` uses `AttachThreadInput` +
|
|
189
|
+
z-order toggling + a minimize/restore fallback, so prefer it for app switching.
|
|
190
|
+
- **Don't run with Python 3.13/3.14.** Tested on 3.10/3.11 only; the installer warns.
|
|
191
|
+
- **Typing long/Unicode text uses the clipboard.** Your clipboard is saved and restored,
|
|
192
|
+
but anything watching the clipboard will see the pasted text momentarily.
|
|
193
|
+
|
|
194
|
+
## Self-test
|
|
195
|
+
|
|
196
|
+
```powershell
|
|
197
|
+
python -m realhands.screen
|
|
198
|
+
```
|
|
199
|
+
|
|
200
|
+
Captures the screen, prints real vs. sent dimensions and the scale factor, writes
|
|
201
|
+
`test_capture.png`, and runs a coordinate round-trip check (center + both corners).
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "realhands"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "MCP server that lets Claude operate the REAL desktop (mouse, keyboard, screen) like a human — works in your own logged-in Chrome and any app"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [{ name = "kanishka089" }]
|
|
13
|
+
keywords = ["mcp", "claude", "computer-use", "automation", "desktop", "rpa", "realhands"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 4 - Beta",
|
|
16
|
+
"Intended Audience :: Developers",
|
|
17
|
+
"License :: OSI Approved :: MIT License",
|
|
18
|
+
"Operating System :: Microsoft :: Windows",
|
|
19
|
+
"Programming Language :: Python :: 3",
|
|
20
|
+
"Programming Language :: Python :: 3.10",
|
|
21
|
+
"Programming Language :: Python :: 3.11",
|
|
22
|
+
"Programming Language :: Python :: 3.12",
|
|
23
|
+
]
|
|
24
|
+
dependencies = [
|
|
25
|
+
"mcp>=1.2",
|
|
26
|
+
"pyautogui>=0.9.54",
|
|
27
|
+
"mss>=9.0",
|
|
28
|
+
"pillow>=10.0",
|
|
29
|
+
"pynput>=1.7",
|
|
30
|
+
"keyboard>=0.13.5",
|
|
31
|
+
"pyperclip>=1.8",
|
|
32
|
+
"python-dotenv>=1.0",
|
|
33
|
+
"pygetwindow>=0.0.9",
|
|
34
|
+
"pywin32>=306; sys_platform == 'win32'",
|
|
35
|
+
]
|
|
36
|
+
|
|
37
|
+
[project.scripts]
|
|
38
|
+
realhands = "realhands.server:main"
|
|
39
|
+
|
|
40
|
+
[project.urls]
|
|
41
|
+
Homepage = "https://github.com/kanishka089/computer-use-mcp"
|
|
42
|
+
Issues = "https://github.com/kanishka089/computer-use-mcp/issues"
|
|
43
|
+
|
|
44
|
+
[tool.setuptools.packages.find]
|
|
45
|
+
where = ["src"]
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""Environment-driven configuration for the computer-use-mcp server."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import os
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from dotenv import load_dotenv
|
|
8
|
+
|
|
9
|
+
# Directory that contains this module: <project>/src/realhands/ in a source
# checkout, or site-packages/realhands/ when installed from a wheel.
_HERE = Path(__file__).parent.resolve()
# Parent of the package directory (<project>/src/ in a checkout). The value is
# kept as-is in case other modules read it.
_ROOT = _HERE.parent
# The package sits one level deeper than the project root, so the project's
# .env has to be looked up one directory above _ROOT as well. load_dotenv()
# ignores a missing file and, by default, does not override variables that
# are already set, so the original location keeps precedence and the
# project-root file only fills in what is still unset.
load_dotenv(_ROOT / ".env")
load_dotenv(_ROOT.parent / ".env")
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _int(name: str, default: int) -> int:
    """Return environment variable *name* parsed as an int.

    Args:
        name: Environment variable to read.
        default: Value returned when the variable is unset or blank
            (empty or whitespace-only).

    Returns:
        The parsed integer, or *default*.

    Raises:
        ValueError: If the variable holds text that ``int()`` rejects. The
            message names the variable, because the bare ``int()`` error only
            shows the offending value and this runs at import time.
    """
    raw = os.environ.get(name, "").strip()
    if not raw:
        return default
    try:
        return int(raw)
    except ValueError as err:
        raise ValueError(f"{name} must be an integer, got {raw!r}") from err
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _float(name: str, default: float) -> float:
    """Return environment variable *name* parsed as a float.

    Args:
        name: Environment variable to read.
        default: Value returned when the variable is unset or blank
            (empty or whitespace-only).

    Returns:
        The parsed float, or *default*.

    Raises:
        ValueError: If the variable holds text that ``float()`` rejects. The
            message names the variable, because the bare ``float()`` error
            only shows the offending value and this runs at import time.
    """
    raw = os.environ.get(name, "").strip()
    if not raw:
        return default
    try:
        return float(raw)
    except ValueError as err:
        raise ValueError(f"{name} must be a number, got {raw!r}") from err
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _bool(name: str, default: bool) -> bool:
    """Read environment variable *name* as an on/off flag.

    An unset or blank variable yields *default*. Any other value is compared
    case-insensitively, with surrounding whitespace ignored, and counts as
    true only when it is "1", "true", "yes" or "on"; everything else is false.
    """
    normalized = os.environ.get(name, "").strip().lower()
    if normalized:
        return normalized in ("1", "true", "yes", "on")
    return default
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
# --- Screen / coordinate grounding ---
|
|
32
|
+
MAX_DIM: int = _int("COMPUTER_USE_MAX_DIM", 1280)  # longest side of the screenshot sent to the model
|
|
33
|
+
MONITOR: int = _int("COMPUTER_USE_MONITOR", 1)  # default monitor index: 1 = primary, 2.. = others, 0 = all screens
|
|
34
|
+
# Screenshot encoding ("png" or "jpeg"), normalised to lower case. A variable
# that is set but blank counts as unset, matching _int/_float/_bool, so an
# empty .env entry still yields "png" instead of an empty format string.
IMAGE_FORMAT: str = os.environ.get("COMPUTER_USE_IMAGE_FORMAT", "").strip().lower() or "png"
|
|
35
|
+
|
|
36
|
+
# --- Pacing / safety ---
|
|
37
|
+
PAUSE: float = _float("COMPUTER_USE_PAUSE", 0.15)  # delay after each pyautogui action (presumably seconds — confirm)
|
|
38
|
+
# Key chord for the global hard-stop hotkey, whitespace-trimmed.
# NOTE(review): a variable that is set but blank yields "" here rather than
# the default — confirm whether consumers treat "" as "hotkey disabled".
PANIC_HOTKEY: str = os.environ.get("COMPUTER_USE_PANIC_HOTKEY", "ctrl+alt+q").strip()
|
|
39
|
+
OVERLAY: bool = _bool("COMPUTER_USE_OVERLAY", True)  # whether to show the STOP overlay window
|
|
40
|
+
MOVE_DURATION: float = _float("COMPUTER_USE_MOVE_DURATION", 0.4)  # cursor glide time (presumably seconds — confirm)
|
|
41
|
+
|
|
42
|
+
# Stand down (close overlay + release panic hotkey) after this many seconds with
|
|
43
|
+
# no action. 0 disables the idle watchdog. The agent can also call action="stop".
|
|
44
|
+
IDLE_STOP: float = _float("COMPUTER_USE_IDLE_STOP", 30.0)  # idle timeout in seconds; parsed as a float
|