puppet-agent 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- puppet_agent-0.1.0/.gitignore +21 -0
- puppet_agent-0.1.0/CHANGELOG.md +19 -0
- puppet_agent-0.1.0/LICENSE +21 -0
- puppet_agent-0.1.0/PKG-INFO +409 -0
- puppet_agent-0.1.0/README.md +370 -0
- puppet_agent-0.1.0/benchmarks/expected/.gitkeep +0 -0
- puppet_agent-0.1.0/benchmarks/run_benchmark.py +71 -0
- puppet_agent-0.1.0/benchmarks/screenshots/.gitkeep +0 -0
- puppet_agent-0.1.0/examples/browser_automation.py +39 -0
- puppet_agent-0.1.0/examples/desktop_control.py +38 -0
- puppet_agent-0.1.0/examples/read_screen.py +28 -0
- puppet_agent-0.1.0/integrations/antigravity/README.md +50 -0
- puppet_agent-0.1.0/integrations/claude-code/README.md +44 -0
- puppet_agent-0.1.0/integrations/cline/README.md +42 -0
- puppet_agent-0.1.0/integrations/codex/README.md +47 -0
- puppet_agent-0.1.0/integrations/cursor/README.md +52 -0
- puppet_agent-0.1.0/integrations/gemini/README.md +46 -0
- puppet_agent-0.1.0/integrations/openclaw/README.md +49 -0
- puppet_agent-0.1.0/integrations/openclaw/SOUL_VISION.md +39 -0
- puppet_agent-0.1.0/integrations/windsurf/README.md +50 -0
- puppet_agent-0.1.0/integrations/zed/README.md +46 -0
- puppet_agent-0.1.0/puppet-ai.yaml.example +13 -0
- puppet_agent-0.1.0/pyproject.toml +51 -0
- puppet_agent-0.1.0/src/puppet_ai/__init__.py +4 -0
- puppet_agent-0.1.0/src/puppet_ai/cli.py +189 -0
- puppet_agent-0.1.0/src/puppet_ai/config/__init__.py +0 -0
- puppet_agent-0.1.0/src/puppet_ai/config/presets.py +30 -0
- puppet_agent-0.1.0/src/puppet_ai/config/schema.py +31 -0
- puppet_agent-0.1.0/src/puppet_ai/core/__init__.py +0 -0
- puppet_agent-0.1.0/src/puppet_ai/core/accessibility.py +119 -0
- puppet_agent-0.1.0/src/puppet_ai/core/actions.py +81 -0
- puppet_agent-0.1.0/src/puppet_ai/core/capture.py +128 -0
- puppet_agent-0.1.0/src/puppet_ai/core/detector.py +84 -0
- puppet_agent-0.1.0/src/puppet_ai/core/foveal.py +23 -0
- puppet_agent-0.1.0/src/puppet_ai/core/ocr.py +136 -0
- puppet_agent-0.1.0/src/puppet_ai/core/ocr_cache.py +40 -0
- puppet_agent-0.1.0/src/puppet_ai/core/peripheral.py +24 -0
- puppet_agent-0.1.0/src/puppet_ai/core/permissions.py +43 -0
- puppet_agent-0.1.0/src/puppet_ai/core/pii_filter.py +97 -0
- puppet_agent-0.1.0/src/puppet_ai/core/wait.py +50 -0
- puppet_agent-0.1.0/src/puppet_ai/core/world_model.py +69 -0
- puppet_agent-0.1.0/src/puppet_ai/instructions.py +165 -0
- puppet_agent-0.1.0/src/puppet_ai/providers/__init__.py +48 -0
- puppet_agent-0.1.0/src/puppet_ai/providers/anthropic.py +156 -0
- puppet_agent-0.1.0/src/puppet_ai/providers/base.py +9 -0
- puppet_agent-0.1.0/src/puppet_ai/providers/gemini.py +40 -0
- puppet_agent-0.1.0/src/puppet_ai/providers/ollama.py +108 -0
- puppet_agent-0.1.0/src/puppet_ai/providers/openai.py +42 -0
- puppet_agent-0.1.0/src/puppet_ai/providers/sampling.py +45 -0
- puppet_agent-0.1.0/src/puppet_ai/server/__init__.py +0 -0
- puppet_agent-0.1.0/src/puppet_ai/server/mcp.py +399 -0
- puppet_agent-0.1.0/src/puppet_ai/types.py +61 -0
- puppet_agent-0.1.0/tests/conftest.py +60 -0
- puppet_agent-0.1.0/tests/test_accessibility.py +51 -0
- puppet_agent-0.1.0/tests/test_actions.py +97 -0
- puppet_agent-0.1.0/tests/test_cli.py +37 -0
- puppet_agent-0.1.0/tests/test_config.py +66 -0
- puppet_agent-0.1.0/tests/test_detector.py +64 -0
- puppet_agent-0.1.0/tests/test_foveal.py +65 -0
- puppet_agent-0.1.0/tests/test_instructions.py +22 -0
- puppet_agent-0.1.0/tests/test_mcp_v2.py +116 -0
- puppet_agent-0.1.0/tests/test_ocr_bounds.py +54 -0
- puppet_agent-0.1.0/tests/test_ocr_cache.py +37 -0
- puppet_agent-0.1.0/tests/test_peripheral.py +49 -0
- puppet_agent-0.1.0/tests/test_permissions.py +12 -0
- puppet_agent-0.1.0/tests/test_pii_filter.py +88 -0
- puppet_agent-0.1.0/tests/test_providers/__init__.py +0 -0
- puppet_agent-0.1.0/tests/test_providers/test_registry.py +48 -0
- puppet_agent-0.1.0/tests/test_types.py +75 -0
- puppet_agent-0.1.0/tests/test_wait.py +37 -0
- puppet_agent-0.1.0/tests/test_world_model.py +84 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
__pycache__/
|
|
2
|
+
*.pyc
|
|
3
|
+
*.egg-info/
|
|
4
|
+
dist/
|
|
5
|
+
build/
|
|
6
|
+
.venv/
|
|
7
|
+
.pytest_cache/
|
|
8
|
+
.ruff_cache/
|
|
9
|
+
.superpowers/
|
|
10
|
+
.DS_Store
|
|
11
|
+
*.DS_Store
|
|
12
|
+
|
|
13
|
+
# Test artifacts
|
|
14
|
+
TEST_RESULTS.md
|
|
15
|
+
TEST_EXECUTION_SUMMARY.txt
|
|
16
|
+
COMPREHENSIVE_TEST_REPORT.md
|
|
17
|
+
00_TEST_INDEX.md
|
|
18
|
+
test_results.log
|
|
19
|
+
test_comprehensive.py
|
|
20
|
+
twitter_news_*.py
|
|
21
|
+
TWITTER_NEWS_REPORT.md
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
## 0.1.0 (2026-04-03)
|
|
4
|
+
|
|
5
|
+
Initial release.
|
|
6
|
+
|
|
7
|
+
### Features
|
|
8
|
+
|
|
9
|
+
- **27 MCP tools** for full computer control — vision, actions, system
|
|
10
|
+
- **Native macOS OCR** via Apple Vision Framework (Russian + English)
|
|
11
|
+
- **Per-window capture** via CGWindowList — no app switching needed
|
|
12
|
+
- **Accessibility API** — detect buttons, links, checkboxes, text fields
|
|
13
|
+
- **Computer control** — click, type, scroll, drag, hotkeys, clipboard
|
|
14
|
+
- **Smart tools** — `action_click_text`, `action_click_and_wait`, `action_open_url`
|
|
15
|
+
- **PII protection** — auto-mask API keys, passwords, credit cards in OCR output
|
|
16
|
+
- **OCR cache** — 7x speedup on repeated reads, auto-invalidation after actions
|
|
17
|
+
- **Retina display** support — automatic coordinate scaling
|
|
18
|
+
- **Agent-agnostic** — works with Claude Code, OpenClaw, any MCP client
|
|
19
|
+
- **System prompt** — built-in instructions teach agents how to use the tools
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 transoff
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,409 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: puppet-agent
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Full computer control for AI agents — see, click, type, scroll via MCP
|
|
5
|
+
Project-URL: Homepage, https://github.com/transoff/puppet-ai
|
|
6
|
+
Project-URL: Repository, https://github.com/transoff/puppet-ai
|
|
7
|
+
Project-URL: Issues, https://github.com/transoff/puppet-ai/issues
|
|
8
|
+
Author: Daniel Starkov
|
|
9
|
+
License-Expression: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Requires-Python: >=3.11
|
|
12
|
+
Requires-Dist: click>=8.0
|
|
13
|
+
Requires-Dist: mcp>=1.0.0
|
|
14
|
+
Requires-Dist: numpy>=1.24
|
|
15
|
+
Requires-Dist: pillow>=10.0
|
|
16
|
+
Requires-Dist: pyautogui>=0.9
|
|
17
|
+
Requires-Dist: pydantic>=2.0
|
|
18
|
+
Requires-Dist: pyperclip>=1.8
|
|
19
|
+
Requires-Dist: pyyaml>=6.0
|
|
20
|
+
Provides-Extra: all
|
|
21
|
+
Requires-Dist: puppet-agent[anthropic,apple-vision,gemini,mlx,openai]; extra == 'all'
|
|
22
|
+
Provides-Extra: anthropic
|
|
23
|
+
Requires-Dist: anthropic>=0.40; extra == 'anthropic'
|
|
24
|
+
Provides-Extra: apple-vision
|
|
25
|
+
Requires-Dist: pyobjc-framework-screencapturekit>=10.0; extra == 'apple-vision'
|
|
26
|
+
Requires-Dist: pyobjc-framework-vision>=10.0; extra == 'apple-vision'
|
|
27
|
+
Provides-Extra: dev
|
|
28
|
+
Requires-Dist: pytest-asyncio>=0.24; extra == 'dev'
|
|
29
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
30
|
+
Requires-Dist: ruff>=0.5; extra == 'dev'
|
|
31
|
+
Provides-Extra: gemini
|
|
32
|
+
Requires-Dist: google-genai>=1.0; extra == 'gemini'
|
|
33
|
+
Provides-Extra: mlx
|
|
34
|
+
Requires-Dist: mlx-vlm>=0.1; extra == 'mlx'
|
|
35
|
+
Requires-Dist: mlx>=0.20; extra == 'mlx'
|
|
36
|
+
Provides-Extra: openai
|
|
37
|
+
Requires-Dist: openai>=1.50; extra == 'openai'
|
|
38
|
+
Description-Content-Type: text/markdown
|
|
39
|
+
|
|
40
|
+
# puppet-ai
|
|
41
|
+
|
|
42
|
+
Full computer control for AI agents — see, click, type, scroll via MCP.
|
|
43
|
+
|
|
44
|
+
Give any AI agent eyes and hands. puppet-ai captures the screen, reads text via native OCR, detects UI elements, and controls mouse + keyboard. Works with any app on macOS — browsers, desktop apps, games, terminals.
|
|
45
|
+
|
|
46
|
+
## Why puppet-ai?
|
|
47
|
+
|
|
48
|
+
- **Fast** — native macOS OCR in ~0.5s, 7x faster with caching
|
|
49
|
+
- **Universal** — works with ANY app, not just browsers
|
|
50
|
+
- **Agent-agnostic** — MCP standard, plug into any AI agent
|
|
51
|
+
- **Secure** — auto-masks API keys, passwords, credit cards, emails in OCR output
|
|
52
|
+
- **Complete** — 27 tools: vision + actions + system
|
|
53
|
+
- **Private** — all processing on-device, no data leaves your Mac
|
|
54
|
+
|
|
55
|
+
## Quick Start
|
|
56
|
+
|
|
57
|
+
### 1. Install
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
pip install puppet-agent
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
### 2. Enable Accessibility
|
|
64
|
+
|
|
65
|
+
System Settings → Privacy & Security → Accessibility → enable your terminal/IDE app.
|
|
66
|
+
|
|
67
|
+
### 3. Connect to your AI agent
|
|
68
|
+
|
|
69
|
+
puppet-ai is an MCP server. Connect it to any MCP-compatible agent below, then ask the agent to interact with your computer.
|
|
70
|
+
|
|
71
|
+
---
|
|
72
|
+
|
|
73
|
+
## Integrations
|
|
74
|
+
|
|
75
|
+
### Claude Code
|
|
76
|
+
|
|
77
|
+
Add to `~/.claude/settings.json`:
|
|
78
|
+
|
|
79
|
+
```json
|
|
80
|
+
{
|
|
81
|
+
"mcpServers": {
|
|
82
|
+
"puppet-ai": {
|
|
83
|
+
"command": "puppet-ai",
|
|
84
|
+
"args": ["serve"]
|
|
85
|
+
}
|
|
86
|
+
}
|
|
87
|
+
}
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
### OpenAI Codex CLI
|
|
91
|
+
|
|
92
|
+
Add to `~/.codex/config.toml`:
|
|
93
|
+
|
|
94
|
+
```toml
|
|
95
|
+
[mcp_servers.puppet-ai]
|
|
96
|
+
command = "puppet-ai"
|
|
97
|
+
args = ["serve"]
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
Or via CLI:
|
|
101
|
+
|
|
102
|
+
```bash
|
|
103
|
+
codex mcp add puppet-ai -- puppet-ai serve
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
### Google Gemini CLI
|
|
107
|
+
|
|
108
|
+
Add to `~/.gemini/settings.json`:
|
|
109
|
+
|
|
110
|
+
```json
|
|
111
|
+
{
|
|
112
|
+
"mcpServers": {
|
|
113
|
+
"puppet-ai": {
|
|
114
|
+
"command": "puppet-ai",
|
|
115
|
+
"args": ["serve"]
|
|
116
|
+
}
|
|
117
|
+
}
|
|
118
|
+
}
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
Verify: launch `gemini` and run `/mcp` to check the connection.
|
|
122
|
+
|
|
123
|
+
### Google Antigravity
|
|
124
|
+
|
|
125
|
+
Via MCP settings in Antigravity, or add to your project's `.antigravity/settings.json`:
|
|
126
|
+
|
|
127
|
+
```json
|
|
128
|
+
{
|
|
129
|
+
"mcpServers": {
|
|
130
|
+
"puppet-ai": {
|
|
131
|
+
"command": "puppet-ai",
|
|
132
|
+
"args": ["serve"]
|
|
133
|
+
}
|
|
134
|
+
}
|
|
135
|
+
}
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
### Cursor
|
|
139
|
+
|
|
140
|
+
Add to `~/.cursor/mcp.json`:
|
|
141
|
+
|
|
142
|
+
```json
|
|
143
|
+
{
|
|
144
|
+
"mcpServers": {
|
|
145
|
+
"puppet-ai": {
|
|
146
|
+
"command": "puppet-ai",
|
|
147
|
+
"args": ["serve"]
|
|
148
|
+
}
|
|
149
|
+
}
|
|
150
|
+
}
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
Or: Cursor Settings → Tools & MCP → Add Server.
|
|
154
|
+
|
|
155
|
+
### Windsurf
|
|
156
|
+
|
|
157
|
+
Add to `~/.codeium/windsurf/mcp_config.json`:
|
|
158
|
+
|
|
159
|
+
```json
|
|
160
|
+
{
|
|
161
|
+
"mcpServers": {
|
|
162
|
+
"puppet-ai": {
|
|
163
|
+
"command": "puppet-ai",
|
|
164
|
+
"args": ["serve"]
|
|
165
|
+
}
|
|
166
|
+
}
|
|
167
|
+
}
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
### Cline (VS Code)
|
|
171
|
+
|
|
172
|
+
In VS Code, open Cline settings → MCP Servers → Add:
|
|
173
|
+
|
|
174
|
+
```json
|
|
175
|
+
{
|
|
176
|
+
"puppet-ai": {
|
|
177
|
+
"command": "puppet-ai",
|
|
178
|
+
"args": ["serve"]
|
|
179
|
+
}
|
|
180
|
+
}
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
### Zed
|
|
184
|
+
|
|
185
|
+
Add to Zed settings (`~/.config/zed/settings.json`):
|
|
186
|
+
|
|
187
|
+
```json
|
|
188
|
+
{
|
|
189
|
+
"context_servers": {
|
|
190
|
+
"puppet-ai": {
|
|
191
|
+
"command": {
|
|
192
|
+
"path": "puppet-ai",
|
|
193
|
+
"args": ["serve"]
|
|
194
|
+
}
|
|
195
|
+
}
|
|
196
|
+
}
|
|
197
|
+
}
|
|
198
|
+
```
|
|
199
|
+
|
|
200
|
+
### OpenClaw
|
|
201
|
+
|
|
202
|
+
Add to your agent's MCP config:
|
|
203
|
+
|
|
204
|
+
```yaml
|
|
205
|
+
mcp_servers:
|
|
206
|
+
puppet-ai:
|
|
207
|
+
command: puppet-ai
|
|
208
|
+
args: [serve]
|
|
209
|
+
```
|
|
210
|
+
|
|
211
|
+
### Any MCP Client
|
|
212
|
+
|
|
213
|
+
puppet-ai speaks MCP over stdio. Spawn it as a subprocess:
|
|
214
|
+
|
|
215
|
+
```python
|
|
216
|
+
import subprocess
|
|
217
|
+
proc = subprocess.Popen(
|
|
218
|
+
["puppet-ai", "serve"],
|
|
219
|
+
stdin=subprocess.PIPE,
|
|
220
|
+
stdout=subprocess.PIPE,
|
|
221
|
+
)
|
|
222
|
+
# Communicate via MCP JSON-RPC over stdin/stdout
|
|
223
|
+
```
|
|
224
|
+
|
|
225
|
+
Works with any agent that supports the [Model Context Protocol](https://modelcontextprotocol.io).
|
|
226
|
+
|
|
227
|
+
Detailed setup guides: [`integrations/`](integrations/)
|
|
228
|
+
|
|
229
|
+
---
|
|
230
|
+
|
|
231
|
+
## Tools
|
|
232
|
+
|
|
233
|
+
### Vision (see the screen)
|
|
234
|
+
|
|
235
|
+
| Tool | Description |
|
|
236
|
+
|------|-------------|
|
|
237
|
+
| `vision_list_windows` | List all open windows (app, title, size) |
|
|
238
|
+
| `vision_read_window(app)` | Read text via OCR with bounding boxes for clicking |
|
|
239
|
+
| `vision_screenshot(app)` | Capture screenshot as base64 JPEG |
|
|
240
|
+
| `vision_get_state` | Full screen state: all windows + active window text |
|
|
241
|
+
| `vision_ui_elements(app)` | Get UI elements via Accessibility API (buttons, links, checkboxes) |
|
|
242
|
+
|
|
243
|
+
### Actions (control the computer)
|
|
244
|
+
|
|
245
|
+
| Tool | Description |
|
|
246
|
+
|------|-------------|
|
|
247
|
+
| `action_click(x, y)` | Click at coordinates |
|
|
248
|
+
| `action_click_text(text, app)` | Find text on screen and click it — no coordinates needed |
|
|
249
|
+
| `action_click_and_wait(text, app)` | Click text, wait for screen to stabilize, return new state |
|
|
250
|
+
| `action_type_safe(text)` | Type text via clipboard paste (works with any keyboard layout) |
|
|
251
|
+
| `action_open_url(url)` | Open URL in browser (http/https only) |
|
|
252
|
+
| `action_scroll(amount, app)` | Scroll up/down in an app |
|
|
253
|
+
| `action_hotkey(keys)` | Keyboard shortcut (e.g. `["cmd", "c"]`) |
|
|
254
|
+
| `action_press(key)` | Press a key (enter, tab, escape, etc.) |
|
|
255
|
+
| `action_drag(...)` | Drag and drop |
|
|
256
|
+
| `action_activate_window(app)` | Bring app to front |
|
|
257
|
+
| `action_clipboard_copy(text)` | Copy to clipboard |
|
|
258
|
+
| `action_clipboard_paste()` | Paste from clipboard |
|
|
259
|
+
|
|
260
|
+
### System
|
|
261
|
+
|
|
262
|
+
| Tool | Description |
|
|
263
|
+
|------|-------------|
|
|
264
|
+
| `system_check_permissions` | Check accessibility access |
|
|
265
|
+
| `system_get_screen_size` | Screen dimensions |
|
|
266
|
+
| `system_get_mouse_position` | Current cursor position |
|
|
267
|
+
| `system_unmask(reason)` | Temporarily disable PII masking |
|
|
268
|
+
| `system_mask()` | Re-enable PII masking |
|
|
269
|
+
|
|
270
|
+
## How It Works
|
|
271
|
+
|
|
272
|
+
```
|
|
273
|
+
AI Agent (Claude, Codex, Gemini, Cursor, Windsurf, ...)
|
|
274
|
+
↕ MCP protocol (stdio)
|
|
275
|
+
puppet-ai server
|
|
276
|
+
├── Vision: Apple Vision OCR + CGWindowList capture
|
|
277
|
+
├── Accessibility: AXUIElement tree (buttons, links, fields)
|
|
278
|
+
├── Actions: pyautogui (mouse, keyboard, scroll)
|
|
279
|
+
└── Security: PII regex filter (API keys, cards, passwords, emails)
|
|
280
|
+
```
|
|
281
|
+
|
|
282
|
+
**The loop:**
|
|
283
|
+
1. **Look** — `vision_read_window("Safari")` → text + coordinates
|
|
284
|
+
2. **Decide** — agent plans next action
|
|
285
|
+
3. **Act** — `action_click_text("Sign In")` → clicks center of text
|
|
286
|
+
4. **Verify** — `vision_read_window` again → confirm it worked
|
|
287
|
+
5. **Repeat**
|
|
288
|
+
|
|
289
|
+
## Features
|
|
290
|
+
|
|
291
|
+
### Native macOS OCR
|
|
292
|
+
|
|
293
|
+
Uses Apple Vision Framework — no external API, no GPU needed, works offline. Supports Russian and English.
|
|
294
|
+
|
|
295
|
+
### Per-Window Capture
|
|
296
|
+
|
|
297
|
+
Captures specific windows via CGWindowList without switching apps or stealing focus.
|
|
298
|
+
|
|
299
|
+
### Smart Coordinates
|
|
300
|
+
|
|
301
|
+
OCR returns absolute screen coordinates with Retina scaling handled automatically.
|
|
302
|
+
|
|
303
|
+
### OCR Cache
|
|
304
|
+
|
|
305
|
+
Repeated reads of unchanged windows are 7x faster. Cache auto-invalidates after any action.
|
|
306
|
+
|
|
307
|
+
### PII Protection
|
|
308
|
+
|
|
309
|
+
Sensitive data is automatically masked in OCR output:
|
|
310
|
+
- API keys: `sk-1***ef`
|
|
311
|
+
- Credit cards: `4111***1111`
|
|
312
|
+
- Emails: `user***com`
|
|
313
|
+
- Passwords in forms
|
|
314
|
+
- Crypto keys
|
|
315
|
+
|
|
316
|
+
### Accessibility API
|
|
317
|
+
|
|
318
|
+
Detect interactive UI elements — buttons, checkboxes, links, text fields — with exact clickable coordinates.
|
|
319
|
+
|
|
320
|
+
### Built-in Agent Instructions
|
|
321
|
+
|
|
322
|
+
The MCP server includes a system prompt that teaches agents how to use all 27 tools, macOS keyboard shortcuts, and the look-decide-act-verify loop.
|
|
323
|
+
|
|
324
|
+
## Security
|
|
325
|
+
|
|
326
|
+
- **All data stays on your Mac** — no telemetry, no analytics, no external calls
|
|
327
|
+
- **PII auto-masking** — API keys, credit cards, emails, passwords masked before reaching the agent
|
|
328
|
+
- **URL validation** — only `http://` and `https://` allowed, `file://` blocked
|
|
329
|
+
- **Input sanitization** — app names validated to prevent injection
|
|
330
|
+
- **Browser allowlist** — only known browsers accepted (Chrome, Safari, Firefox, Arc, etc.)
|
|
331
|
+
- **Failsafe** — pyautogui failsafe enabled by default (move mouse to corner to abort)
|
|
332
|
+
|
|
333
|
+
## Examples
|
|
334
|
+
|
|
335
|
+
```python
|
|
336
|
+
import asyncio
|
|
337
|
+
from puppet_ai.core.capture import ScreenCapture
|
|
338
|
+
from puppet_ai.core.actions import DesktopActions
|
|
339
|
+
from puppet_ai.server.mcp import VisionPipeContext, create_all_tools
|
|
340
|
+
|
|
341
|
+
async def main():
|
|
342
|
+
ctx = VisionPipeContext(
|
|
343
|
+
capture=ScreenCapture(),
|
|
344
|
+
actions=DesktopActions(failsafe=True),
|
|
345
|
+
)
|
|
346
|
+
tools = create_all_tools(ctx)
|
|
347
|
+
|
|
348
|
+
# See what's on screen
|
|
349
|
+
windows = await tools["vision_list_windows"]()
|
|
350
|
+
for w in windows:
|
|
351
|
+
print(f"{w['app']:20s} — {w['title'][:50]}")
|
|
352
|
+
|
|
353
|
+
# Read a window
|
|
354
|
+
page = await tools["vision_read_window"](app="Safari")
|
|
355
|
+
print(page["text"][:500])
|
|
356
|
+
|
|
357
|
+
# Click text on screen
|
|
358
|
+
await tools["action_click_text"](text="Sign In", app="Safari")
|
|
359
|
+
|
|
360
|
+
# Open a URL
|
|
361
|
+
await tools["action_open_url"](url="https://example.com", browser="Safari")
|
|
362
|
+
|
|
363
|
+
asyncio.run(main())
|
|
364
|
+
```
|
|
365
|
+
|
|
366
|
+
More examples in [`examples/`](examples/).
|
|
367
|
+
|
|
368
|
+
## Configuration
|
|
369
|
+
|
|
370
|
+
```yaml
|
|
371
|
+
# puppet-ai.yaml
|
|
372
|
+
ocr:
|
|
373
|
+
languages: ["en", "ru"]
|
|
374
|
+
mode: accurate # or "fast"
|
|
375
|
+
|
|
376
|
+
pii:
|
|
377
|
+
enabled: true
|
|
378
|
+
categories: [api_keys, credit_cards, crypto_keys, emails, passwords]
|
|
379
|
+
|
|
380
|
+
capture:
|
|
381
|
+
max_width: 800
|
|
382
|
+
format: jpeg
|
|
383
|
+
quality: 75
|
|
384
|
+
```
|
|
385
|
+
|
|
386
|
+
Presets:
|
|
387
|
+
|
|
388
|
+
```bash
|
|
389
|
+
puppet-ai serve --preset fast # speed over accuracy
|
|
390
|
+
puppet-ai serve --preset balanced # default
|
|
391
|
+
puppet-ai serve --preset quality # max accuracy
|
|
392
|
+
```
|
|
393
|
+
|
|
394
|
+
## Requirements
|
|
395
|
+
|
|
396
|
+
- macOS 13+ (Ventura or later)
|
|
397
|
+
- Python 3.11+
|
|
398
|
+
- Accessibility permissions enabled
|
|
399
|
+
|
|
400
|
+
## Author
|
|
401
|
+
|
|
402
|
+
**Daniel Starkov**
|
|
403
|
+
|
|
404
|
+
- Twitter: [@retardTransoff](https://x.com/retardTransoff)
|
|
405
|
+
- LinkedIn: [Daniel Starkov](https://www.linkedin.com/in/daniel-starkov-568baa39b/)
|
|
406
|
+
|
|
407
|
+
## License
|
|
408
|
+
|
|
409
|
+
MIT
|